OK, back to the game.
Thanks to TotoOnTheMoon (forgive the case), and following TascoDLX, I've used compression for my tiles: each pixel is a 4-bit index into one of 16 palettes of 16 colors. Thus, I load half as much data (memory access is bandwidth hungry), and my CPUs spend their time computing rather than on memory operations. I get 278k cycles/layer for 1 CPU, i.e. 82 layers/s, or 2.73 layers @ 30 fps.
Ain't sure a forum is a Continuus tool, anyway ...
Code: Select all
* ---------------------------------------------------------------------
* Init: load the base pointers, then move the plane pointer forward by
* SLAVE_OFFSET when the byte at VBR + CPU_OFFSET is 'S'.
*   Out:     R1 = frame buffer pointer
*            R2 = tile data base
*            R3 = plane data pointer (offset on the 'S' path)
*   Scratch: R0, R4 (R4 is reloaded at MAIN before it is used)
* ---------------------------------------------------------------------
        MOV.L   FB,R1               ; R1 = FrameBuffer
        MOV.L   TILES,R2            ; R2 = Tiles data
        MOV.L   PLANE_A,R3          ; R3 = Plane data
        STC     VBR,R0              ; R0 = vector base register
        MOV.W   CPU_OFFSET,R4       ; R4 = offset of the id byte (R4, not R1: R1 must keep the frame buffer pointer)
        MOV.B   @(R0,R4),R0         ; R0 = id byte; CMP/EQ #imm can only compare against R0
        CMP/EQ  #'S',R0             ; T = 1 when the id byte is 'S'
        BF      CPUSlaveInitSkip    ; not 'S': keep the plane pointer as is
        MOV.W   SLAVE_OFFSET,R0
        ADD     R0,R3               ; R3 = PLANE_A + SLAVE_OFFSET
* NOTE(review): only the plane pointer is offset here; R1 is the same on
* both paths, so both CPUs would write the same frame buffer area.
* Confirm whether the 'S' path also needs a frame buffer offset.
CPUSlaveInitSkip:
* ---------------------------------------------------------------------
* Main loop: expand NB_TILES plane entries into the frame buffer, then
* start over from the same start pointers.
*   In:   R1 = frame buffer pointer, R2 = tile data base,
*         R3 = plane data pointer
*   Regs: R0  = scratch / current mask
*         R4  = tiles left in this pass
*         R5  = plane entry, then tile data read pointer
*         R6  = palette number replicated in the high nibble of each byte
*         R7  = packed pixels, then the 4 output bytes
*         R8  = scratch
*         R9  = 16-bit words left in the current tile
*         R10 = saved frame buffer start, R11 = saved plane start
* In the diagrams below each letter is one hex digit of the register:
* P = palette number, a/b/c/d = pixel0..pixel3.
* ---------------------------------------------------------------------
        MOV     R1,R10              ; R10 = frame buffer start
        MOV     R3,R11              ; R11 = plane start
MAIN:
        MOV     R10,R1              ; every pass restarts from the same pointers,
        MOV     R11,R3              ; otherwise R1/R3 keep advancing pass after pass
        MOV.W   NB_TILES,R4         ; R4 = Number of tiles
LOOP_PLANE:
        MOV.W   @R3+,R5             ; R5 = palette number [15:12] - tileNumber [11:0] (MOV.W sign-extends)
        MOV     R5,R6               ; R6 = same entry, used for the palette number
        MOV.L   MASK_TILE0,R0       ; R0 = 0[31:12] - 1[11:0]
        AND     R0,R5               ; R5 = Tile number
* Let's extract Tile Address
        SHLL8   R5
        SHLR2   R5                  ; R5 = tileOffset = tileNumber * 64
        ADD     R2,R5               ; R5 = tileAddress = TILES + tileOffset
* NOTE(review): tiles are addressed 64 bytes apart, but only 16 words
* (32 bytes) are read per tile below. Confirm the stride of the packed
* tile data.
* Let's replicate the palette number once per tile. It does not change
* inside LOOP_TILE, so it is built here instead of for every word.
        MOV.L   MASK_COLOR,R0       ; R0 = 0[31:16] 1[15:12] 0[11:0]
        AND     R0,R6               ; R6 = 0000P000
        MOV     R6,R8
        SWAP.B  R8,R8               ; R8 = 000000P0
        OR      R8,R6               ; R6 = 0000P0P0
        MOV     R6,R8
        SWAP.W  R8,R8               ; R8 = P0P00000
        OR      R8,R6               ; R6 = P0P0P0P0
        MOV.L   MASK_TILE1,R0       ; R0 = $00000F0F, left untouched for the whole tile
        MOV     #16,R9              ; R9 = number of 16-bits word in a tile. 1 tile = 64 pixels = 16 x 4 pixels
LOOP_TILE:
        MOV.W   @R5+,R7             ; R7 = pixel0[15:12] pixel1[11:8] pixel2[7:4] pixel3[3:0]
* Let's unpack the 4-bit pixels, one pixel per byte
        MOV     R7,R8
        SHLR2   R8
        SHLR2   R8                  ; R8 = R7 >> 4 : pixel0[11:8] pixel1[7:4] pixel2[3:0]
        AND     R0,R7               ; R7 = 00000b0d
        AND     R0,R8               ; R8 = 00000a0c
        SHLL8   R8                  ; R8 = 000a0c00
        SWAP.W  R8,R8               ; R8 = 0c00000a
        SWAP.B  R8,R8               ; R8 = 0c000a00
        SWAP.W  R8,R8               ; R8 = 0a000c00
        SHLL8   R7                  ; R7 = 000b0d00
        SWAP.B  R7,R7               ; R7 = 000b000d
        OR      R8,R7               ; R7 = 0a0b0c0d
* Let's put the palette number into the pixels
        OR      R6,R7               ; R7 = PaPbPcPd
* We can now draw 4 pixels.
        MOV.L   R7,@R1
        ADD     #4,R1
        DT      R9                  ; T = 1 once the 16 words are done
        BF      LOOP_TILE
        DT      R4                  ; T = 1 once all tiles are done
        BF      LOOP_PLANE
        BRA     MAIN
        NOP                         ; BRA delay slot
* Constant pool, read by the code with PC-relative MOV.L / MOV.W loads.
* NOTE(review): MOV.L @(disp,PC) needs its longword on a 4-byte boundary;
* confirm that the assembler aligns the dc.l entries below.
SDRAM dc.l $06000000 ; not referenced by the code shown here
FB dc.l $24000000 ; frame buffer base, loaded into R1
TILES dc.l $06006000 ; tile data base, loaded into R2
PLANE_A dc.l $06004000 ; plane data base, loaded into R3
MASK_TILE0 dc.l $00000FFF ; keeps the tile number, bits [11:0] of a plane entry
MASK_TILE1 dc.l $00000F0F ; keeps one 4-bit pixel in the low nibble of each of the two low bytes
MASK_COLOR dc.l $0000F000 ; keeps the palette number, bits [15:12] of a plane entry
* NOTE(review): $04B0 = 1200 bytes = 600 plane entries, while NB_TILES is
* 560 entries (1120 bytes). Confirm which of the two is intended.
SLAVE_OFFSET dc.w $04B0 ; byte offset added to the plane pointer on the 'S' path
LINE_OFFSET dc.w 328 ; not referenced by the code shown here
NB_TILES dc.w 560 ; number of plane entries processed per pass of MAIN
CPU_OFFSET dc.w $0140 ; offset from VBR of the byte compared with 'S'