Sorry about not being able to find the time recently.
I've modified Gens/GS r7 and added profiling support — hopefully without introducing any problems, since it was a quick & dirty hack.
I compiled the library and the cube demo with "-O2 -pg", and ran the program for about a minute (which should have gathered enough stats); here's the top 10
(Edit: updated the results with a new run):
Code: Select all
Flat profile:
Each sample counts as 7.8213e-06 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 23.68     27.71    27.71     4190     6.61     6.76  doBlitBlankExt
 15.18     45.47    17.76      700    25.37    25.37  M3D_transform3D
 11.96     59.46    13.99     8400     1.67     1.67  BMP_drawLine
  3.88     64.00     4.55      700     6.49    26.48  drawPoints
  3.29     67.85     3.85      702     5.48     6.29  BMP_clear
  2.95     71.30     3.45     4197     0.82     0.83  JOY_update
  2.85     74.64     3.34                             BMP_clipLine
  2.33     77.36     2.72    12570     0.22     4.31  BMP_doBlankProcess
  2.23     79.97     2.61      698     3.73     4.31  VDP_fillTileMapRect
  2.06     82.38     2.41      698     3.46     4.04  VDP_setTileMapRect
(And here's the gprof2dot output.)
I'll polish and publish the hack, along with the MD side of the code, soon (probably on GitHub).
Edit: You can get the fork here.
The code output by gcc4 for doBlitBlankExt and M3D_transform3D is as follows:
Code: Select all
| ---------------------------------------------------------------------------
| doBlitBlankExt -- gcc4 output (-O2 -pg), Motorola 68000, GNU as syntax.
|
| Streams longwords from the buffer pointed to by bmp_buffer_read to the
| fixed address 12582912 (0xC00000), after writing one vramwrite_tab entry
| to 12582916 (0xC00004).  The data is walked as up to 20 rows of 32
| columns; one column step writes 8 longwords that lie 128 bytes apart in
| the source buffer.
|
| Before each column step in the main loop the byte at 12582920 (0xC00008)
| is compared with a limit derived from VDP_getScreenHeight().  When they
| match, the remaining row/column counts are stored in save_i.2002 and
| save_j.2003 and the function returns early; bit 1 of bmp_state marks that
| a later call must resume from those saved counts.
|
| NOTE(review): 0xC00000 / 0xC00004 / 0xC00008 are presumably the Mega Drive
| VDP data port, control port and HV counter -- confirm against the
| hardware documentation.
| NOTE(review): the ".2002"/".2003" suffixes look like gcc's naming for
| function-local statics -- confirm against the C source.
|
| In:    no stack arguments are read.
| Out:   low word of d0 = 0 when interrupted (state saved),
|        d0 = 1 when the whole transfer has finished.
| Saves: d2-d7/a2-a5 (10 registers, 40 bytes) below a 4-byte local at
|        -4(%fp); restored from -44(%fp) on both exits.
|
| Main-loop register roles:
|   a2          source pointer, line 0 of the current column
|   d2,d1,a5,a4,a3,a1,-4(%fp)
|               source pointers for lines 1..7 (a2+128 .. a2+896)
|   d3          columns still to write in the current row (saved to save_j)
|   d4          limit byte compared against the byte at 12582920
|   d5          rows still to write (saved to save_i); d6 = d5 - 1
| ---------------------------------------------------------------------------
doBlitBlankExt:
	link.w %fp,#-4
	movem.l #16188,-(%sp)
	| Profiling hook emitted for -pg: per-function counter word in .data,
	| its address handed to mcount in a0.
	.data
	.align	2
.LP19:
	.long	0
	.text
	lea .LP19,%a0
	jsr mcount
	| VDP_setAutoInc(2); the argument is popped by the addq below.
	pea 2.w
	jsr VDP_setAutoInc
	move.l bmp_buffer_read,%a2
	addq.l #4,%sp
	| d0 = base offset used to index vramwrite_tab: 20992 by default,
	| 512 (set at .L61) when bmp_buffer_read equals bmp_buffer_0.
	move.l #20992,%d0
	cmp.l bmp_buffer_0.l,%a2
	jeq .L61
.L38:
	| Bit 1 of bmp_state clear -> no transfer in progress, start fresh at .L39.
	move.w bmp_state,%d1
	btst #1,%d1
	jeq .L39
	| ---- Resume path: d6 = saved rows remaining, d1 = saved columns remaining.
	| d2 = 20 - save_i (rows already done), d3 = 32 - save_j (columns already
	| done in the interrupted row), both zero-extended to 32 bits.
	move.w save_i.2002,%d6
	moveq #20,%d2
	sub.w %d6,%d2
	move.w save_j.2003,%d1
	moveq #32,%d3
	sub.w %d1,%d3
	and.l #65535,%d3
	and.l #65535,%d2
	| a2 += (rowsDone * 256 + colsDone) * 4  -- skip the source data already sent.
	move.l %d2,%d4
	lsl.l #8,%d4
	move.l %d3,%a0
	add.l %d4,%a0
	add.l %a0,%a0
	add.l %a0,%a0
	add.l %a0,%a2
	| d2 = (rowsDone * 32 + colsDone) * 32; the longword at
	| vramwrite_tab + 4 * (d2 + d0) is fetched and written to 12582916.
	lsl.l #5,%d2
	add.l %d3,%d2
	lsl.l #5,%d2
	move.l %d2,%a0
	add.l %d0,%a0
	add.l %a0,%a0
	move.l %a0,%a1
	add.l #vramwrite_tab,%a1
	move.l (%a1,%a0.l),%d0
	move.l %d0,12582916
	| Finish the interrupted row: write the save_j remaining columns.
	| Skipped entirely when save_j is 0.  This loop does not poll 12582920.
	move.w %d1,%a1
	subq.w #1,%a1
	tst.w %d1
	jeq .L40
	move.w %a1,%d0
	move.l %a2,%a0
.L41:
	| One column: 8 longwords, 128 bytes apart, then advance 4 bytes.
	move.l (%a0),12582912
	move.l 128(%a0),12582912
	move.l 256(%a0),12582912
	move.l 384(%a0),12582912
	move.l 512(%a0),12582912
	move.l 640(%a0),12582912
	move.l 768(%a0),12582912
	move.l 896(%a0),12582912
	addq.l #4,%a0
	dbra %d0,.L41
	| a2 += save_j * 4 (the columns just written).
	moveq #0,%d0
	move.w %a1,%d0
	addq.l #1,%d0
	add.l %d0,%d0
	add.l %d0,%d0
	add.l %d0,%a2
.L40:
	| Step a2 over the other 7 lines of the row just completed.
	lea (896,%a2),%a2
	| d5 = save_i - 1 rows remaining.
	move.w %d6,%d5
	subq.w #1,%d5
	| d4 = ((VDP_getScreenHeight() & 0xFFFF) - 160) >> 1, then the low byte
	| is decremented by 2; only that byte is compared later.
	jsr VDP_getScreenHeight
	and.l #65535,%d0
	move.l %d0,%d4
	add.l #-160,%d4
	asr.l #1,%d4
	subq.b #2,%d4
	| d6 = save_i - 2 (kept one behind d5); nothing left -> finished.
	subq.w #2,%d6
	tst.w %d5
	jeq .L43
.L57:
	| ---- Start of a row.  Poll first: on a match leave with a full row
	| (32 columns) still pending.
	move.b 12582920,%d0
	cmp.b %d4,%d0
	jeq .L51
	| Set up the pointers for lines 1..7; the eighth does not fit in a
	| register and lives in the local at -4(%fp).
	move.l %a2,%d2
	add.l #128,%d2
	move.l %a2,%d1
	add.l #256,%d1
	lea (384,%a2),%a5
	lea (512,%a2),%a4
	lea (640,%a2),%a3
	lea (768,%a2),%a1
	lea (896,%a2),%a0
	move.l %a0,-4(%fp)
	| First column of the row (peeled copy of the .L47 body); d3 = 31
	| columns remain once it has been written.
	moveq #31,%d3
	move.l (%a2)+,%d0
	move.l %d0,12582912
	move.l %d2,%a0
	addq.l #4,%d2
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.l %d1,%a0
	addq.l #4,%d1
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.l (%a5)+,%d0
	move.l %d0,12582912
	move.l (%a4)+,%d0
	move.l %d0,12582912
	move.l (%a3)+,%d0
	move.l %d0,12582912
	move.l (%a1)+,%d0
	move.l %d0,12582912
	move.l -4(%fp),%a0
	addq.l #4,-4(%fp)
	move.l (%a0)+,%d0
	move.l %d0,12582912
	| d0 = d3 - 1 is the candidate new column count; -1 means row done.
	move.w %d3,%d0
	subq.w #1,%d0
	cmp.w #-1,%d0
	jeq .L62
.L47:
	| ---- Remaining columns of the row.  Poll before each one; on a match
	| exit via .L48 with d3 = columns still pending.
	move.b 12582920,%d7
	cmp.b %d7,%d4
	jeq .L48
	move.w %d0,%d3
	move.l (%a2)+,%d0
	move.l %d0,12582912
	move.l %d2,%a0
	addq.l #4,%d2
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.l %d1,%a0
	addq.l #4,%d1
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.l (%a5)+,%d0
	move.l %d0,12582912
	move.l (%a4)+,%d0
	move.l %d0,12582912
	move.l (%a3)+,%d0
	move.l %d0,12582912
	move.l (%a1)+,%d0
	move.l %d0,12582912
	move.l -4(%fp),%a0
	addq.l #4,-4(%fp)
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.w %d3,%d0
	subq.w #1,%d0
	cmp.w #-1,%d0
	jne .L47
.L62:
	| ---- Row finished: skip the other 7 lines, then d5 = d6 (rows left).
	| The jeq tests the flags set by "move.w %d6,%d5", i.e. d6 == 0.
	lea (896,%a2),%a2
	move.w %d6,%d0
	subq.w #1,%d0
	move.w %d6,%d5
	jeq .L43
	move.w %d0,%d6
	jra .L57
.L51:
	| Interrupted at the start of a row: all 32 columns are still pending.
	moveq #32,%d3
.L48:
	| ---- Early exit: remember progress and return 0 in the low word of d0.
	| Bit 1 of bmp_state stays set, so the next call takes the resume path.
	move.w %d5,save_i.2002
	move.w %d3,save_j.2003
	clr.w %d0
	movem.l -44(%fp),#15612
	unlk %fp
	rts
.L39:
	| ---- Fresh start: set bit 1 of bmp_state (transfer in progress).
	or.w #2,%d1
	move.w %d1,bmp_state
	| Write the longword at vramwrite_tab + 4 * d0 to 12582916.
	move.l %d0,%a0
	add.l %d0,%a0
	move.l %a0,%a1
	add.l #vramwrite_tab,%a1
	move.l (%a1,%a0.l),%d0
	move.l %d0,12582916
	| Same limit computation as after .L40.
	jsr VDP_getScreenHeight
	and.l #65535,%d0
	move.l %d0,%d4
	add.l #-160,%d4
	asr.l #1,%d4
	subq.b #2,%d4
	| 20 rows to write: d5 = 20, d6 = d5 - 1.
	moveq #19,%d6
	moveq #20,%d5
	jra .L57
.L61:
	| bmp_buffer_read == bmp_buffer_0: use base offset 512 instead of 20992
	| (word move is enough, the upper word of d0 is already zero).
	move.w #512,%d0
	jra .L38
.L43:
	| ---- Finished: clear bit 1 of bmp_state and return 1.
	and.w #-3,bmp_state
	moveq #1,%d0
	movem.l -44(%fp),#15612
	unlk %fp
	rts
 
Code: Select all
| ---------------------------------------------------------------------------
| M3D_transform3D -- gcc4 output (-O2 -pg), Motorola 68000, GNU as syntax.
|
| Transforms an array of 3-word (6-byte) vectors by the 3x3 word matrix at
| "mat" plus the translation words Tx/Ty/Tz, writing the results to a second
| array.  Every 16x16 product is shifted right by 6 before being summed in
| 16 bits (presumably fixed point with 6 fractional bits -- confirm against
| the C source).
|
| If rebuildMat is non-zero, "mat" and "matInv" are first recomputed from
| Rx/Ry/Rz via sintab16 (see .L29) and rebuildMat is cleared.
| If light_enabled is non-zero, the vector at "light" is multiplied by
| "matInv" and stored in "light_trans".
|
| In:    8(%fp)  = source array pointer
|        12(%fp) = destination array pointer
|        18(%fp) = element count (word; low half of the third stack slot)
| Out:   nothing is placed in d0 deliberately.
| Saves: d2-d7/a2-a5 (40 bytes) below 12 bytes of locals; restored from
|        -52(%fp).
|
| Below, m0..m8 denote the words at mat+0, mat+2, ... mat+16.
| ---------------------------------------------------------------------------
M3D_transform3D:
	link.w %fp,#-12
	movem.l #16188,-(%sp)
	| Profiling hook emitted for -pg: counter word in .data, address passed
	| to mcount in a0.
	.data
	.align	2
.LP19:
	.long	0
	.text
	lea .LP19,%a0
	jsr mcount
	| d0 = element count.  A pending matrix rebuild is done first at .L29,
	| which does not touch d0 and jumps back to .L20.
	move.w 18(%fp),%d0
	tst.w rebuildMat
	jne .L29
.L20:
	| Skip the vertex loop when the count is zero.
	move.w %d0,%d1
	subq.w #1,%d1
	tst.w %d0
	jeq .L21
	| Cache the matrix and translation for the loop.  Not everything fits
	| in registers, so m1, m2, m5, m6 and m8 are spilled to the frame:
	|   d5=m0  -2(fp)=m1  -4(fp)=m2   a2=Tx
	|   d7=m3  d6=m4      -6(fp)=m5   a5=Ty
	|   -8(fp)=m6  d4=m7  -10(fp)=m8  a4=Tz
	move.w mat,%d5
	move.w mat+2,-2(%fp)
	move.w mat+4,-4(%fp)
	move.w Tx,%a2
	move.w mat+6,%d7
	move.w mat+8,%d6
	move.w mat+10,-6(%fp)
	move.w Ty,%a5
	move.w mat+12,-8(%fp)
	move.w mat+14,%d4
	move.w mat+16,-10(%fp)
	move.w Tz,%a4
	| a3 = source + count * 6 (end pointer); a1 = destination; a0 = source.
	moveq #0,%d0
	move.w %d1,%d0
	addq.l #1,%d0
	move.l %d0,%d1
	add.l %d0,%d1
	add.l %d1,%d0
	add.l %d0,%d0
	move.l 8(%fp),%a3
	add.l %d0,%a3
	move.l 12(%fp),%a1
	move.l 8(%fp),%a0
.L22:
	| ---- Per-element loop.  d2 = y, d0 = z, d3 = x of the source vector.
	move.w 2(%a0),%d2
	move.w 4(%a0),%d0
	move.w (%a0),%d3
	| dst.x = (x*m0 >> 6) + (y*m1 >> 6) + Tx + (z*m2 >> 6)
	muls.w %d5,%d3
	asr.l #6,%d3
	move.w -2(%fp),%d1
	muls.w %d2,%d1
	asr.l #6,%d1
	add.w %d1,%d3
	add.w %a2,%d3
	move.w -4(%fp),%d1
	muls.w %d0,%d1
	asr.l #6,%d1
	add.w %d1,%d3
	move.w %d3,(%a1)
	| dst.y = (x*m3 >> 6) + (y*m4 >> 6) + Ty + (z*m5 >> 6)
	| x is reloaded into d3 and kept for the third row.
	move.w (%a0),%d3
	move.w %d3,%d1
	muls.w %d7,%d1
	asr.l #6,%d1
	muls.w %d6,%d2
	asr.l #6,%d2
	add.w %d1,%d2
	add.w %a5,%d2
	move.w -6(%fp),%d1
	muls.w %d0,%d1
	asr.l #6,%d1
	add.w %d1,%d2
	move.w %d2,2(%a1)
	| dst.z = (x*m6 >> 6) + (y*m7 >> 6) + Tz + (z*m8 >> 6)
	| y is reloaded from memory because d2 was consumed above.
	muls.w -8(%fp),%d3
	asr.l #6,%d3
	move.w 2(%a0),%d1
	muls.w %d4,%d1
	asr.l #6,%d1
	add.w %d1,%d3
	add.w %a4,%d3
	muls.w -10(%fp),%d0
	asr.l #6,%d0
	add.w %d0,%d3
	move.w %d3,4(%a1)
	| Advance both arrays by one 6-byte element until the end pointer.
	addq.l #6,%a0
	addq.l #6,%a1
	cmp.l %a0,%a3
	jne .L22
.L21:
	| ---- Optional light transform: light_trans = matInv * light,
	| each product shifted right by 6.  d1/d2/d0 = light words 0/1/2.
	tst.w light_enabled
	jeq .L19
	move.w light,%d1
	move.w light+2,%d2
	move.w light+4,%d0
	| light_trans+0 uses matInv+0, +2, +4.
	move.w %d2,%d4
	muls.w matInv+2,%d4
	asr.l #6,%d4
	move.w %d1,%d3
	muls.w matInv,%d3
	asr.l #6,%d3
	add.w %d3,%d4
	move.w %d0,%d3
	muls.w matInv+4,%d3
	asr.l #6,%d3
	add.w %d3,%d4
	move.w %d4,light_trans
	| light_trans+2 uses matInv+6, +8, +10.
	move.w %d2,%d4
	muls.w matInv+8,%d4
	asr.l #6,%d4
	move.w %d1,%d3
	muls.w matInv+6,%d3
	asr.l #6,%d3
	add.w %d3,%d4
	move.w %d0,%d3
	muls.w matInv+10,%d3
	asr.l #6,%d3
	add.w %d3,%d4
	move.w %d4,light_trans+2
	| light_trans+4 uses matInv+12, +14, +16 (light registers consumed in place).
	muls.w matInv+14,%d2
	asr.l #6,%d2
	muls.w matInv+12,%d1
	asr.l #6,%d1
	add.w %d2,%d1
	muls.w matInv+16,%d0
	asr.l #6,%d0
	add.w %d0,%d1
	move.w %d1,light_trans+4
.L19:
	| Restore d2-d7/a2-a5 and return.
	movem.l -52(%fp),#15612
	unlk %fp
	rts
.L29:
	| ---- Rebuild mat / matInv from Rx, Ry, Rz.
	| Each angle word is shifted right by 6 and masked to 0..1023 to index
	| the word table sintab16.  A second lookup 256 entries further on is
	| taken for each angle (presumably sine and cosine from one 1024-entry
	| table -- confirm).  Naming used below:
	|   sA = tab[Rx>>6]       (d6)        cA = tab[(Rx>>6)+256] (d5)
	|   sB = tab[Ry>>6]       (matInv+4)  cB = tab[(Ry>>6)+256] (d4)
	|   sC = tab[Rz>>6]       (d2)        cC = tab[(Rz>>6)+256] (d1)
	| The move.l copies carry a stale upper word; the and.l #1023 discards it.
	move.w Rx,%d4
	asr.w #6,%d4
	move.l %d4,%d1
	and.l #1023,%d1
	lea sintab16,%a0
	add.l %d1,%d1
	move.w (%a0,%d1.l),%d6
	move.w Ry,%d3
	asr.w #6,%d3
	move.l %d3,%d1
	and.l #1023,%d1
	add.l %d1,%d1
	| sB is parked directly in matInv+4, which is also its final location.
	move.w (%a0,%d1.l),matInv+4
	move.w Rz,%d1
	asr.w #6,%d1
	move.l %d1,%d2
	and.l #1023,%d2
	add.l %d2,%d2
	move.w (%a0,%d2.l),%d2
	add.l #256,%d4
	and.l #1023,%d4
	add.l %d4,%d4
	move.w (%a0,%d4.l),%d5
	add.l #256,%d3
	and.l #1023,%d3
	add.l %d3,%d3
	move.w (%a0,%d3.l),%d4
	add.l #256,%d1
	and.l #1023,%d1
	add.l %d1,%d1
	move.w (%a0,%d1.l),%d1
	| a1 = sB*sA >> 6 and a5 = sB*cA >> 6.  These two use lsr.l where the
	| rest use asr.l; only their low words are read back later
	| (move.w %a1,%d7 / move.w %a5,%d7), and bits 6..21 of the product are
	| the same for either shift.
	move.w matInv+4,%d7
	muls.w %d6,%d7
	lsr.l #6,%d7
	move.l %d7,%a1
	move.w matInv+4,%d3
	muls.w %d5,%d3
	lsr.l #6,%d3
	move.l %d3,%a5
	| m0 = cB*cC >> 6                       (kept in d3)
	move.w %d4,%d3
	muls.w %d1,%d3
	asr.l #6,%d3
	move.w %d3,mat
	| m3 = -(cB*sC >> 6)                    (kept in a3)
	move.w %d4,%d7
	muls.w %d2,%d7
	asr.l #6,%d7
	neg.w %d7
	move.w %d7,%a3
	move.w %d7,mat+6
	| m6 = sB
	move.w matInv+4,mat+12
	| m1 = (a1*cC >> 6) + (cA*sC >> 6)      (kept in a4)
	move.w %a1,%d7
	muls.w %d1,%d7
	asr.l #6,%d7
	move.l %d7,%a4
	move.w %d5,%d7
	muls.w %d2,%d7
	asr.l #6,%d7
	add.w %d7,%a4
	move.w %a4,mat+2
	| m4 = (cA*cC >> 6) - (a1*sC >> 6)      (kept in a2)
	move.w %d5,%d7
	muls.w %d1,%d7
	asr.l #6,%d7
	move.l %d7,%a2
	move.w %a1,%d7
	muls.w %d2,%d7
	asr.l #6,%d7
	sub.w %d7,%a2
	move.w %a2,mat+8
	| m7 = -(sA*cB >> 6)                    (a1 is reused to hold it)
	move.w %d6,%d7
	muls.w %d4,%d7
	asr.l #6,%d7
	neg.w %d7
	move.w %d7,%a1
	move.w %d7,mat+14
	| m2 = (sA*sC >> 6) - (a5*cC >> 6)      (kept in a0)
	move.w %d6,%d7
	muls.w %d2,%d7
	asr.l #6,%d7
	move.l %d7,%a0
	move.w %a5,%d7
	muls.w %d1,%d7
	asr.l #6,%d7
	sub.w %d7,%a0
	move.w %a0,mat+4
	| m5 = (a5*sC >> 6) + (sA*cC >> 6)      (kept in d2; sC and cC consumed)
	move.w %a5,%d7
	muls.w %d7,%d2
	asr.l #6,%d2
	muls.w %d6,%d1
	asr.l #6,%d1
	add.w %d1,%d2
	move.w %d2,mat+10
	| m8 = cA*cB >> 6                       (kept in d4)
	muls.w %d5,%d4
	asr.l #6,%d4
	move.w %d4,mat+16
	| matInv = transpose of mat: word (r,c) of matInv receives word (c,r)
	| of mat.  matInv+4 (= m6 = sB) was already stored above.
	move.w %d3,matInv
	move.w %a4,matInv+6
	move.w %a0,matInv+12
	move.w %a3,matInv+2
	move.w %a2,matInv+8
	move.w %d2,matInv+14
	move.w %a1,matInv+10
	move.w %d4,matInv+16
	| Matrices are up to date; continue with the vertex loop.
	clr.w rebuildMat
	jra .L20
Pretty lengthy, but we may get an idea by direct comparison.