Sorry about not being able to find the time recently.
I've modified Gens/GS r7 and added profiling support — hopefully without introducing any problems; it was a quick & dirty hack.
I compiled the library and the cube demo with "-O2 -pg", and ran the program for about a minute (which should have gathered enough stats); here's the top 10
(Edit: updated the results with a new run):
Code: Select all
Flat profile:
Each sample counts as 7.8213e-06 seconds.
% cumulative self self total
time seconds seconds calls ms/call ms/call name
23.68 27.71 27.71 4190 6.61 6.76 doBlitBlankExt
15.18 45.47 17.76 700 25.37 25.37 M3D_transform3D
11.96 59.46 13.99 8400 1.67 1.67 BMP_drawLine
3.88 64.00 4.55 700 6.49 26.48 drawPoints
3.29 67.85 3.85 702 5.48 6.29 BMP_clear
2.95 71.30 3.45 4197 0.82 0.83 JOY_update
2.85 74.64 3.34 BMP_clipLine
2.33 77.36 2.72 12570 0.22 4.31 BMP_doBlankProcess
2.23 79.97 2.61 698 3.73 4.31 VDP_fillTileMapRect
2.06 82.38 2.41 698 3.46 4.04 VDP_setTileMapRect
(and here's the
gprof2dot output.) I'll polish & publish the hack, along with the MD side of the code soon (probably on GitHub).
Edit: You can get the fork
here.
The code output by gcc4 for doBlitBlankExt and M3D_transform3D is as follows:
Code: Select all
# doBlitBlankExt -- gcc 4 output for m68k (GAS syntax), built with -O2 -pg.
#
# Streams the buffer at bmp_buffer_read to the longword port at 12582912
# (0xC00000). Data is sent in groups of eight longwords taken 128 bytes
# apart (source offsets 0,128,...,896); 32 groups make one 1024-byte row and
# a full transfer is 20 rows. Before each group the byte at 12582920
# (0xC00008) is polled; when it equals a limit derived from
# VDP_getScreenHeight() the routine stores its position in save_i (rows left,
# counting the current one) and save_j (groups left in the current row) and
# exits with d0 = 0. Bit 1 of bmp_state marks such a suspended transfer: the
# next call resumes from the saved position. When the last row has been sent
# the bit is cleared and d0 = 1.
# NOTE(review): 0xC00000 / 0xC00004 / 0xC00008 match the Mega Drive VDP
# data port, control port and HV counter -- confirm against the SGDK headers.
doBlitBlankExt:
# Reserve 4 bytes of locals, then push d2-d7/a2-a5 (mask 16188 = 0x3F3C in
# predecrement bit order). That is 40 bytes, so the save area is at -44(%fp).
link.w %fp,#-4
movem.l #16188,-(%sp)
# Profiling hook emitted for -pg: a0 -> .LP19, a zero-initialised longword
# in .data (presumably mcount's per-function slot), then call mcount.
.data
.align 2
.LP19:
.long 0
.text
lea .LP19,%a0
jsr mcount
# VDP_setAutoInc(2): the argument is pushed here and popped by the addq below.
pea 2.w
jsr VDP_setAutoInc
# a2 = source pointer for the whole routine.
move.l bmp_buffer_read,%a2
addq.l #4,%sp
# d0 = base index into vramwrite_tab: 20992 (0x5200) by default, replaced by
# 512 at .L61 when the read buffer is bmp_buffer_0.
move.l #20992,%d0
cmp.l bmp_buffer_0.l,%a2
jeq .L61
.L38:
# Bit 1 of bmp_state clear -> no suspended transfer, start fresh at .L39.
move.w bmp_state,%d1
btst #1,%d1
jeq .L39
# ---- Resume path --------------------------------------------------------
# d2 = 20 - save_i (row index), d3 = 32 - save_j (group index in the row),
# both zero-extended from 16 bits.
move.w save_i.2002,%d6
moveq #20,%d2
sub.w %d6,%d2
move.w save_j.2003,%d1
moveq #32,%d3
sub.w %d1,%d3
and.l #65535,%d3
and.l #65535,%d2
# a2 += (row*256 + group) * 4, i.e. row*1024 + group*4 bytes into the buffer.
move.l %d2,%d4
lsl.l #8,%d4
move.l %d3,%a0
add.l %d4,%a0
add.l %a0,%a0
add.l %a0,%a0
add.l %a0,%a2
# Table index = (row*32 + group)*32 + base. The two-step address arithmetic
# below ends up reading the longword at vramwrite_tab + 4*index, which is
# then written to the port at 12582916 (0xC00004).
lsl.l #5,%d2
add.l %d3,%d2
lsl.l #5,%d2
move.l %d2,%a0
add.l %d0,%a0
add.l %a0,%a0
move.l %a0,%a1
add.l #vramwrite_tab,%a1
move.l (%a1,%a0.l),%d0
move.l %d0,12582916
# a1 = save_j - 1; nothing to finish in this row when save_j is 0.
move.w %d1,%a1
subq.w #1,%a1
tst.w %d1
jeq .L40
move.w %a1,%d0
move.l %a2,%a0
.L41:
# Finish the interrupted row: save_j groups are sent here WITHOUT polling
# the byte at 12582920 (dbra counts save_j-1 down to -1).
move.l (%a0),12582912
move.l 128(%a0),12582912
move.l 256(%a0),12582912
move.l 384(%a0),12582912
move.l 512(%a0),12582912
move.l 640(%a0),12582912
move.l 768(%a0),12582912
move.l 896(%a0),12582912
addq.l #4,%a0
dbra %d0,.L41
# a2 += save_j * 4 (recomputed from a1 because dbra consumed d0).
moveq #0,%d0
move.w %a1,%d0
addq.l #1,%d0
add.l %d0,%d0
add.l %d0,%d0
add.l %d0,%a2
.L40:
# Skip the remaining 896 bytes of the row so a2 points at the next row.
lea (896,%a2),%a2
# d5 = rows still to send after the one just finished.
move.w %d6,%d5
subq.w #1,%d5
# d4 = poll limit: ((VDP_getScreenHeight() & 0xFFFF) - 160) >> 1, then the
# low byte is reduced by 2. Only the low byte is compared later.
jsr VDP_getScreenHeight
and.l #65535,%d0
move.l %d0,%d4
add.l #-160,%d4
asr.l #1,%d4
subq.b #2,%d4
# d6 always runs one below d5 (see .L62).
subq.w #2,%d6
tst.w %d5
jeq .L43
# ---- Start of one row ---------------------------------------------------
.L57:
# Poll before the first group of the row; on a match suspend with a full
# row outstanding (.L51 sets the group count to 32).
move.b 12582920,%d0
cmp.b %d4,%d0
jeq .L51
# Eight source cursors, one per 128-byte stride. Two live in data registers
# (d2, d1) and the eighth is spilled to -4(%fp) because no address register
# is left over.
move.l %a2,%d2
add.l #128,%d2
move.l %a2,%d1
add.l #256,%d1
lea (384,%a2),%a5
lea (512,%a2),%a4
lea (640,%a2),%a3
lea (768,%a2),%a1
lea (896,%a2),%a0
move.l %a0,-4(%fp)
# d3 = groups left in this row once the group below has been sent.
moveq #31,%d3
# First group of the row (peeled copy of the .L47 loop body).
move.l (%a2)+,%d0
move.l %d0,12582912
move.l %d2,%a0
addq.l #4,%d2
move.l (%a0)+,%d0
move.l %d0,12582912
move.l %d1,%a0
addq.l #4,%d1
move.l (%a0)+,%d0
move.l %d0,12582912
move.l (%a5)+,%d0
move.l %d0,12582912
move.l (%a4)+,%d0
move.l %d0,12582912
move.l (%a3)+,%d0
move.l %d0,12582912
move.l (%a1)+,%d0
move.l %d0,12582912
move.l -4(%fp),%a0
addq.l #4,-4(%fp)
move.l (%a0)+,%d0
move.l %d0,12582912
move.w %d3,%d0
subq.w #1,%d0
cmp.w #-1,%d0
jeq .L62
.L47:
# Poll again before every further group; on a match suspend at .L48 with
# d3 = groups still outstanding in this row.
move.b 12582920,%d7
cmp.b %d7,%d4
jeq .L48
move.w %d0,%d3
move.l (%a2)+,%d0
move.l %d0,12582912
move.l %d2,%a0
addq.l #4,%d2
move.l (%a0)+,%d0
move.l %d0,12582912
move.l %d1,%a0
addq.l #4,%d1
move.l (%a0)+,%d0
move.l %d0,12582912
move.l (%a5)+,%d0
move.l %d0,12582912
move.l (%a4)+,%d0
move.l %d0,12582912
move.l (%a3)+,%d0
move.l %d0,12582912
move.l (%a1)+,%d0
move.l %d0,12582912
move.l -4(%fp),%a0
addq.l #4,-4(%fp)
move.l (%a0)+,%d0
move.l %d0,12582912
# Loop until the counter wraps from 0 to -1 (32 groups per row in total).
move.w %d3,%d0
subq.w #1,%d0
cmp.w #-1,%d0
jne .L47
.L62:
# Row finished: a2 advanced 128 bytes during the row, add 896 for the next.
lea (896,%a2),%a2
# d5 = d6 (rows left); the jeq tests the flags of that move, so the loop
# ends when d6 was 0. Otherwise d6 is decremented and the next row starts.
move.w %d6,%d0
subq.w #1,%d0
move.w %d6,%d5
jeq .L43
move.w %d0,%d6
jra .L57
.L51:
# Suspended at the start of a row: the whole row (32 groups) is outstanding.
moveq #32,%d3
.L48:
# Suspend: remember rows/groups left, return 0 in d0. Mask 15612 = 0x3CFC
# restores d2-d7/a2-a5 from the save area at -44(%fp).
move.w %d5,save_i.2002
move.w %d3,save_j.2003
clr.w %d0
movem.l -44(%fp),#15612
unlk %fp
rts
# ---- Fresh start --------------------------------------------------------
.L39:
# Set bit 1 of bmp_state (transfer in progress).
or.w #2,%d1
move.w %d1,bmp_state
# Write the longword at vramwrite_tab + 4*d0 to the port at 12582916.
move.l %d0,%a0
add.l %d0,%a0
move.l %a0,%a1
add.l #vramwrite_tab,%a1
move.l (%a1,%a0.l),%d0
move.l %d0,12582916
# Same poll-limit computation as after .L40.
jsr VDP_getScreenHeight
and.l #65535,%d0
move.l %d0,%d4
add.l #-160,%d4
asr.l #1,%d4
subq.b #2,%d4
# Twenty rows to send: d5 = 20, d6 = d5 - 1.
moveq #19,%d6
moveq #20,%d5
jra .L57
.L61:
# Read buffer is bmp_buffer_0: use table base 512. Only the low word is
# written; the high word of d0 is already 0 from the 20992 load.
move.w #512,%d0
jra .L38
.L43:
# Transfer complete: clear bit 1 of bmp_state and return 1 in d0.
and.w #-3,bmp_state
moveq #1,%d0
movem.l -44(%fp),#15612
unlk %fp
rts
Code: Select all
# M3D_transform3D -- gcc 4 output for m68k (GAS syntax), built with -O2 -pg.
#
# Stack arguments as used below: 8(%fp) = source pointer, 12(%fp) =
# destination pointer, 18(%fp) = 16-bit element count. Each element is three
# signed words (6 bytes).
#
# Flow:
#   - If rebuildMat is non-zero, .L29 rebuilds the 3x3 word matrix `mat`
#     (row-major, mat+0 .. mat+16) from Rx/Ry/Rz via sintab16, stores its
#     transpose in `matInv`, and clears rebuildMat.
#   - Each source element is multiplied by `mat` and offset by Tx/Ty/Tz.
#   - If light_enabled is non-zero, `light` is multiplied by `matInv` into
#     light_trans.
# Every 16x16 product is shifted right by 6 before being summed as a word
# (presumably fixed point with 6 fractional bits -- confirm in the C source).
M3D_transform3D:
# Reserve 12 bytes of locals, then push d2-d7/a2-a5 (mask 16188 = 0x3F3C in
# predecrement bit order). That is 40 bytes, so the save area is at -52(%fp).
link.w %fp,#-12
movem.l #16188,-(%sp)
# Profiling hook emitted for -pg: a0 -> .LP19, a zero-initialised longword
# in .data (presumably mcount's per-function slot), then call mcount.
.data
.align 2
.LP19:
.long 0
.text
lea .LP19,%a0
jsr mcount
# d0 = element count (low word of the third argument); .L29 leaves it intact.
move.w 18(%fp),%d0
tst.w rebuildMat
jne .L29
.L20:
# Skip the transform loop entirely when the count is 0.
move.w %d0,%d1
subq.w #1,%d1
tst.w %d0
jeq .L21
# Cache the matrix and translation. Registers ran out, so five coefficients
# are spilled to the frame:
#   d5 = mat+0    -2(%fp) = mat+2    -4(%fp) = mat+4    a2 = Tx
#   d7 = mat+6    d6      = mat+8    -6(%fp) = mat+10   a5 = Ty
#   -8(%fp) = mat+12   d4 = mat+14   -10(%fp) = mat+16  a4 = Tz
move.w mat,%d5
move.w mat+2,-2(%fp)
move.w mat+4,-4(%fp)
move.w Tx,%a2
move.w mat+6,%d7
move.w mat+8,%d6
move.w mat+10,-6(%fp)
move.w Ty,%a5
move.w mat+12,-8(%fp)
move.w mat+14,%d4
move.w mat+16,-10(%fp)
move.w Tz,%a4
# a3 = source end pointer = source + count*6 (count rebuilt from d1 + 1).
moveq #0,%d0
move.w %d1,%d0
addq.l #1,%d0
move.l %d0,%d1
add.l %d0,%d1
add.l %d1,%d0
add.l %d0,%d0
move.l 8(%fp),%a3
add.l %d0,%a3
# a1 = destination cursor, a0 = source cursor.
move.l 12(%fp),%a1
move.l 8(%fp),%a0
.L22:
# d2 = second source word, d0 = third source word.
move.w 2(%a0),%d2
move.w 4(%a0),%d0
# First output word = (in0*mat+0 >> 6) + (in1*mat+2 >> 6) + Tx + (in2*mat+4 >> 6).
move.w (%a0),%d3
muls.w %d5,%d3
asr.l #6,%d3
move.w -2(%fp),%d1
muls.w %d2,%d1
asr.l #6,%d1
add.w %d1,%d3
add.w %a2,%d3
move.w -4(%fp),%d1
muls.w %d0,%d1
asr.l #6,%d1
add.w %d1,%d3
move.w %d3,(%a1)
# Second output word = (in0*mat+6 >> 6) + (in1*mat+8 >> 6) + Ty + (in2*mat+10 >> 6).
# The first source word is reloaded into d3 and kept for the third output.
move.w (%a0),%d3
move.w %d3,%d1
muls.w %d7,%d1
asr.l #6,%d1
muls.w %d6,%d2
asr.l #6,%d2
add.w %d1,%d2
add.w %a5,%d2
move.w -6(%fp),%d1
muls.w %d0,%d1
asr.l #6,%d1
add.w %d1,%d2
move.w %d2,2(%a1)
# Third output word = (in0*mat+12 >> 6) + (in1*mat+14 >> 6) + Tz + (in2*mat+16 >> 6).
# The second source word is reloaded because d2 was overwritten above.
muls.w -8(%fp),%d3
asr.l #6,%d3
move.w 2(%a0),%d1
muls.w %d4,%d1
asr.l #6,%d1
add.w %d1,%d3
add.w %a4,%d3
muls.w -10(%fp),%d0
asr.l #6,%d0
add.w %d0,%d3
move.w %d3,4(%a1)
# Advance both cursors by one 6-byte element; stop at the end pointer.
addq.l #6,%a0
addq.l #6,%a1
cmp.l %a0,%a3
jne .L22
.L21:
# Optional light transform; skipped when light_enabled is 0.
tst.w light_enabled
jeq .L19
# d1 = light+0, d2 = light+2, d0 = light+4.
move.w light,%d1
move.w light+2,%d2
move.w light+4,%d0
# light_trans+0 = (l1*matInv+2 >> 6) + (l0*matInv+0 >> 6) + (l2*matInv+4 >> 6).
move.w %d2,%d4
muls.w matInv+2,%d4
asr.l #6,%d4
move.w %d1,%d3
muls.w matInv,%d3
asr.l #6,%d3
add.w %d3,%d4
move.w %d0,%d3
muls.w matInv+4,%d3
asr.l #6,%d3
add.w %d3,%d4
move.w %d4,light_trans
# light_trans+2 = (l1*matInv+8 >> 6) + (l0*matInv+6 >> 6) + (l2*matInv+10 >> 6).
move.w %d2,%d4
muls.w matInv+8,%d4
asr.l #6,%d4
move.w %d1,%d3
muls.w matInv+6,%d3
asr.l #6,%d3
add.w %d3,%d4
move.w %d0,%d3
muls.w matInv+10,%d3
asr.l #6,%d3
add.w %d3,%d4
move.w %d4,light_trans+2
# light_trans+4 = (l1*matInv+14 >> 6) + (l0*matInv+12 >> 6) + (l2*matInv+16 >> 6).
muls.w matInv+14,%d2
asr.l #6,%d2
muls.w matInv+12,%d1
asr.l #6,%d1
add.w %d2,%d1
muls.w matInv+16,%d0
asr.l #6,%d0
add.w %d0,%d1
move.w %d1,light_trans+4
.L19:
# Restore d2-d7/a2-a5 (mask 15612 = 0x3CFC) from the save area and return.
movem.l -52(%fp),#15612
unlk %fp
rts
# ---- Matrix rebuild (entered only when rebuildMat != 0) -----------------
# Shorthand used in the comments below (the names are the reviewer's, not
# the compiler's):
#   sx/sy/sz = sintab16[(R >> 6) & 1023]          for Rx/Ry/Rz
#   cx/cy/cz = sintab16[((R >> 6) + 256) & 1023]  (presumably the cosines: a
#              quarter-period offset into a 1024-entry table -- confirm)
.L29:
# d6 = sx. The move.l copies a stale upper word of d4, which the mask removes.
move.w Rx,%d4
asr.w #6,%d4
move.l %d4,%d1
and.l #1023,%d1
lea sintab16,%a0
add.l %d1,%d1
move.w (%a0,%d1.l),%d6
# sy goes straight to matInv+4, which doubles as its temporary home.
move.w Ry,%d3
asr.w #6,%d3
move.l %d3,%d1
and.l #1023,%d1
add.l %d1,%d1
move.w (%a0,%d1.l),matInv+4
# d2 = sz.
move.w Rz,%d1
asr.w #6,%d1
move.l %d1,%d2
and.l #1023,%d2
add.l %d2,%d2
move.w (%a0,%d2.l),%d2
# d5 = cx.
add.l #256,%d4
and.l #1023,%d4
add.l %d4,%d4
move.w (%a0,%d4.l),%d5
# d4 = cy.
add.l #256,%d3
and.l #1023,%d3
add.l %d3,%d3
move.w (%a0,%d3.l),%d4
# d1 = cz.
add.l #256,%d1
and.l #1023,%d1
add.l %d1,%d1
move.w (%a0,%d1.l),%d1
# a1 = sx*sy >> 6 and a5 = cx*sy >> 6.
# NOTE(review): these two use lsr.l where every other product uses asr.l.
# Only the low word of a1/a5 is read later (move.w), and bits 0-15 of the
# result are the same for both shifts, so the stored values are unaffected.
move.w matInv+4,%d7
muls.w %d6,%d7
lsr.l #6,%d7
move.l %d7,%a1
move.w matInv+4,%d3
muls.w %d5,%d3
lsr.l #6,%d3
move.l %d3,%a5
# mat+0 = cy*cz >> 6 (kept in d3 for matInv+0).
move.w %d4,%d3
muls.w %d1,%d3
asr.l #6,%d3
move.w %d3,mat
# mat+6 = -(cy*sz >> 6) (kept in a3).
move.w %d4,%d7
muls.w %d2,%d7
asr.l #6,%d7
neg.w %d7
move.w %d7,%a3
move.w %d7,mat+6
# mat+12 = sy.
move.w matInv+4,mat+12
# mat+2 = ((sx*sy)*cz >> 6) + (cx*sz >> 6) (kept in a4).
move.w %a1,%d7
muls.w %d1,%d7
asr.l #6,%d7
move.l %d7,%a4
move.w %d5,%d7
muls.w %d2,%d7
asr.l #6,%d7
add.w %d7,%a4
move.w %a4,mat+2
# mat+8 = (cx*cz >> 6) - ((sx*sy)*sz >> 6) (kept in a2).
move.w %d5,%d7
muls.w %d1,%d7
asr.l #6,%d7
move.l %d7,%a2
move.w %a1,%d7
muls.w %d2,%d7
asr.l #6,%d7
sub.w %d7,%a2
move.w %a2,mat+8
# mat+14 = -(sx*cy >> 6) (kept in a1, whose sx*sy value is no longer needed).
move.w %d6,%d7
muls.w %d4,%d7
asr.l #6,%d7
neg.w %d7
move.w %d7,%a1
move.w %d7,mat+14
# mat+4 = (sx*sz >> 6) - ((cx*sy)*cz >> 6) (kept in a0).
move.w %d6,%d7
muls.w %d2,%d7
asr.l #6,%d7
move.l %d7,%a0
move.w %a5,%d7
muls.w %d1,%d7
asr.l #6,%d7
sub.w %d7,%a0
move.w %a0,mat+4
# mat+10 = ((cx*sy)*sz >> 6) + (sx*cz >> 6) (kept in d2).
move.w %a5,%d7
muls.w %d7,%d2
asr.l #6,%d2
muls.w %d6,%d1
asr.l #6,%d1
add.w %d1,%d2
move.w %d2,mat+10
# mat+16 = cx*cy >> 6 (kept in d4).
muls.w %d5,%d4
asr.l #6,%d4
move.w %d4,mat+16
# matInv = transpose of mat. matInv+4 already holds sy, which equals mat+12.
move.w %d3,matInv
move.w %a4,matInv+6
move.w %a0,matInv+12
move.w %a3,matInv+2
move.w %a2,matInv+8
move.w %d2,matInv+14
move.w %a1,matInv+10
move.w %d4,matInv+16
# Matrix is current again; continue with the transform.
clr.w rebuildMat
jra .L20
Pretty lengthy, but we may get an idea by direct comparison.