Sega Genesis Dev Kit (SGDK)

fdarkangel · Post by **fdarkangel** » Sat Mar 31, 2012 9:11 pm

Stef wrote:I will test that and post the result but what is the point of having this information ?

Edit :
Ok, same awful performance in GCC 3.4.6 and GCC 4.1.1 when optimizer is disabled.

Thanks!
I just wanted to rule out the possibility that this has to do with anything other than the optimizer. It was a remote possibility anyway.

Stef wrote:Yeah, the cube sample relies heavily on the library, and that is less the case for the Particle sample when you have many particles on the screen. That is why I chose this sample to compare code. Actually we only have to focus on 2 simple methods which eat 99% of the CPU time: updatePartic() & drawPartic(). I will modify my previous assembly output to keep only these methods.

It's just too bad that Mega Drive emulators don't support profiling, so we have to resort to such methods. My guess is that porting VBA's profiler code to Gens and profiling the ROM would be faster and would solve the problem for everyone in the future (on the GBA the profiler is controlled through unused SWIs; we can use VDP regs). In fact, this might be a good excuse to pull gdb and ELF support, and the KMod debugging functions, into upstream as well.
And I'm just lucky that this happened to come up while discussing with the author of Gens

fdarkangel · Post by **fdarkangel** » Thu Apr 05, 2012 6:45 am

Sorry about not being able to find the time recently.
I've modified Gens/GS r7 and added profiling support ---hopefully without any problems, it was a quick & dirty hack.

I compiled the library and the cube demo with "-O2 -pg", and ran the program for about a minute (which should've gathered enough stats); here's the top10 (Edit: updated the results with a new run)

Code: Select all

Flat profile:

Each sample counts as 7.8213e-06 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 23.68     27.71    27.71     4190     6.61     6.76  doBlitBlankExt
 15.18     45.47    17.76      700    25.37    25.37  M3D_transform3D
 11.96     59.46    13.99     8400     1.67     1.67  BMP_drawLine
  3.88     64.00     4.55      700     6.49    26.48  drawPoints
  3.29     67.85     3.85      702     5.48     6.29  BMP_clear
  2.95     71.30     3.45     4197     0.82     0.83  JOY_update
  2.85     74.64     3.34                             BMP_clipLine
  2.33     77.36     2.72    12570     0.22     4.31  BMP_doBlankProcess
  2.23     79.97     2.61      698     3.73     4.31  VDP_fillTileMapRect
  2.06     82.38     2.41      698     3.46     4.04  VDP_setTileMapRect

(and here's the gprof2dot output.) I'll polish & publish the hack, along with the MD side of the code soon (probably on GitHub). Edit: You can get the fork here.

The code output by gcc4 for doBlitBlankExt and M3D_transform3D is as follows

Code: Select all

| doBlitBlankExt -- compiler output quoted in the post (gcc4, "-O2 -pg").
| Flow visible in this listing:
|   * calls VDP_setAutoInc(2), then selects a base value in d0: 512 when
|     bmp_buffer_read equals bmp_buffer_0, otherwise 20992;
|   * if bit 1 of bmp_state is set, resumes from save_i/save_j; otherwise sets
|     that bit (.L39) and starts with d5 = 20, d6 = 19;
|   * looks up a longword in vramwrite_tab and writes it to 12582916 (0xC00004),
|     then streams longwords to 12582912 (0xC00000), eight source rows
|     (128-byte stride) per step, 32 steps per outer iteration;
|   * before each step the byte at 12582920 (0xC00008) is compared with the
|     limit ((VDP_getScreenHeight() - 160) >> 1) - 2; on a match the current
|     i/j are stored in save_i/save_j and the function returns d0 = 0;
|   * when every row has been sent, bit 1 of bmp_state is cleared and the
|     function returns d0 = 1.
| NOTE(review): 0xC00000 / 0xC00004 / 0xC00008 are presumably the Mega Drive
| VDP data port, control port and HV counter -- confirm against the hardware docs.
doBlitBlankExt:
	link.w %fp,#-4
	movem.l #16188,-(%sp)
| -pg instrumentation: .LP19 is a zero-initialised longword in .data whose
| address is handed to mcount in a0.
	.data
	.align	2
.LP19:
	.long	0
	.text
	lea .LP19,%a0
	jsr mcount
	pea 2.w
	jsr VDP_setAutoInc
	move.l bmp_buffer_read,%a2
	addq.l #4,%sp
	move.l #20992,%d0
	cmp.l bmp_buffer_0.l,%a2
	jeq .L61
.L38:
	move.w bmp_state,%d1
	btst #1,%d1
	jeq .L39
| Resume path: d2 = 20 - save_i, d3 = 32 - save_j; both are used to advance
| the source pointer a2 and to compute the vramwrite_tab index.
	move.w save_i.2002,%d6
	moveq #20,%d2
	sub.w %d6,%d2
	move.w save_j.2003,%d1
	moveq #32,%d3
	sub.w %d1,%d3
	and.l #65535,%d3
	and.l #65535,%d2
	move.l %d2,%d4
	lsl.l #8,%d4
	move.l %d3,%a0
	add.l %d4,%a0
	add.l %a0,%a0
	add.l %a0,%a0
	add.l %a0,%a2
	lsl.l #5,%d2
	add.l %d3,%d2
	lsl.l #5,%d2
	move.l %d2,%a0
	add.l %d0,%a0
	add.l %a0,%a0
	move.l %a0,%a1
	add.l #vramwrite_tab,%a1
	move.l (%a1,%a0.l),%d0
	move.l %d0,12582916
	move.w %d1,%a1
	subq.w #1,%a1
	tst.w %d1
	jeq .L40
	move.w %a1,%d0
	move.l %a2,%a0
| First copy loop: runs save_j times (dbra on d0 = save_j - 1), no polling.
.L41:
	move.l (%a0),12582912
	move.l 128(%a0),12582912
	move.l 256(%a0),12582912
	move.l 384(%a0),12582912
	move.l 512(%a0),12582912
	move.l 640(%a0),12582912
	move.l 768(%a0),12582912
	move.l 896(%a0),12582912
	addq.l #4,%a0
	dbra %d0,.L41
| The loop advanced a copy (a0); a2 is now recomputed as a2 + 4 * save_j.
	moveq #0,%d0
	move.w %a1,%d0
	addq.l #1,%d0
	add.l %d0,%d0
	add.l %d0,%d0
	add.l %d0,%a2
.L40:
	lea (896,%a2),%a2
	move.w %d6,%d5
	subq.w #1,%d5
	jsr VDP_getScreenHeight
	and.l #65535,%d0
	move.l %d0,%d4
	add.l #-160,%d4
	asr.l #1,%d4
	subq.b #2,%d4
	subq.w #2,%d6
	tst.w %d5
	jeq .L43
| Second copy loop, outer iteration. The first of the 32 inner steps is peeled
| here; eight separate row pointers are kept (d2, d1, a5, a4, a3, a1 and a
| stack slot at -4(%fp)) instead of using displacements off a2.
.L57:
	move.b 12582920,%d0
	cmp.b %d4,%d0
	jeq .L51
	move.l %a2,%d2
	add.l #128,%d2
	move.l %a2,%d1
	add.l #256,%d1
	lea (384,%a2),%a5
	lea (512,%a2),%a4
	lea (640,%a2),%a3
	lea (768,%a2),%a1
	lea (896,%a2),%a0
	move.l %a0,-4(%fp)
	moveq #31,%d3
	move.l (%a2)+,%d0
	move.l %d0,12582912
	move.l %d2,%a0
	addq.l #4,%d2
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.l %d1,%a0
	addq.l #4,%d1
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.l (%a5)+,%d0
	move.l %d0,12582912
	move.l (%a4)+,%d0
	move.l %d0,12582912
	move.l (%a3)+,%d0
	move.l %d0,12582912
	move.l (%a1)+,%d0
	move.l %d0,12582912
	move.l -4(%fp),%a0
	addq.l #4,-4(%fp)
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.w %d3,%d0
	subq.w #1,%d0
	cmp.w #-1,%d0
	jeq .L62
| Remaining inner steps (d0 counts 30 down to 0), each preceded by the poll.
.L47:
	move.b 12582920,%d7
	cmp.b %d7,%d4
	jeq .L48
	move.w %d0,%d3
	move.l (%a2)+,%d0
	move.l %d0,12582912
	move.l %d2,%a0
	addq.l #4,%d2
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.l %d1,%a0
	addq.l #4,%d1
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.l (%a5)+,%d0
	move.l %d0,12582912
	move.l (%a4)+,%d0
	move.l %d0,12582912
	move.l (%a3)+,%d0
	move.l %d0,12582912
	move.l (%a1)+,%d0
	move.l %d0,12582912
	move.l -4(%fp),%a0
	addq.l #4,-4(%fp)
	move.l (%a0)+,%d0
	move.l %d0,12582912
	move.w %d3,%d0
	subq.w #1,%d0
	cmp.w #-1,%d0
	jne .L47
| End of one outer iteration: skip the seven rows already sent and loop.
.L62:
	lea (896,%a2),%a2
	move.w %d6,%d0
	subq.w #1,%d0
	move.w %d6,%d5
	jeq .L43
	move.w %d0,%d6
	jra .L57
| Early exit: poll matched. .L51 is the entry used when the match happened
| before the first inner step (j = 32).
.L51:
	moveq #32,%d3
.L48:
	move.w %d5,save_i.2002
	move.w %d3,save_j.2003
	clr.w %d0
	movem.l -44(%fp),#15612
	unlk %fp
	rts
| Fresh start: set bit 1 of bmp_state, write the control longword, compute the
| poll limit and enter the copy loop with 20 outer iterations.
.L39:
	or.w #2,%d1
	move.w %d1,bmp_state
	move.l %d0,%a0
	add.l %d0,%a0
	move.l %a0,%a1
	add.l #vramwrite_tab,%a1
	move.l (%a1,%a0.l),%d0
	move.l %d0,12582916
	jsr VDP_getScreenHeight
	and.l #65535,%d0
	move.l %d0,%d4
	add.l #-160,%d4
	asr.l #1,%d4
	subq.b #2,%d4
	moveq #19,%d6
	moveq #20,%d5
	jra .L57
.L61:
	move.w #512,%d0
	jra .L38
| Completed: clear bit 1 of bmp_state and return 1.
.L43:
	and.w #-3,bmp_state
	moveq #1,%d0
	movem.l -44(%fp),#15612
	unlk %fp
	rts

Code: Select all

| M3D_transform3D -- compiler output quoted in the post (gcc4, "-O2 -pg").
| Stack arguments as used below: 8(%fp) = source pointer, 12(%fp) = destination
| pointer, 18(%fp) = 16-bit element count.
| Flow visible in this listing:
|   * if rebuildMat is non-zero, .L29 rebuilds mat and matInv from Rx, Ry, Rz
|     through sintab16 lookups and clears rebuildMat;
|   * .L22 processes one 3-word element per pass: each output word is the sum
|     of three 16x16 products (each shifted right by 6) plus Tx, Ty or Tz;
|   * if light_enabled is non-zero, light_trans is computed the same way from
|     light and matInv (without a translation term).
M3D_transform3D:
	link.w %fp,#-12
	movem.l #16188,-(%sp)
| -pg instrumentation: address of the counter word .LP19 is passed to mcount in a0.
	.data
	.align	2
.LP19:
	.long	0
	.text
	lea .LP19,%a0
	jsr mcount
	move.w 18(%fp),%d0
	tst.w rebuildMat
	jne .L29
.L20:
	move.w %d0,%d1
	subq.w #1,%d1
	tst.w %d0
	jeq .L21
| Cache the nine mat entries and Tx/Ty/Tz; five of the mat entries do not fit in
| registers and live in the frame at -2..-10(%fp).
	move.w mat,%d5
	move.w mat+2,-2(%fp)
	move.w mat+4,-4(%fp)
	move.w Tx,%a2
	move.w mat+6,%d7
	move.w mat+8,%d6
	move.w mat+10,-6(%fp)
	move.w Ty,%a5
	move.w mat+12,-8(%fp)
	move.w mat+14,%d4
	move.w mat+16,-10(%fp)
	move.w Tz,%a4
| a3 = source + count * 6 (end pointer used as the loop bound).
	moveq #0,%d0
	move.w %d1,%d0
	addq.l #1,%d0
	move.l %d0,%d1
	add.l %d0,%d1
	add.l %d1,%d0
	add.l %d0,%d0
	move.l 8(%fp),%a3
	add.l %d0,%a3
	move.l 12(%fp),%a1
	move.l 8(%fp),%a0
.L22:
	move.w 2(%a0),%d2
	move.w 4(%a0),%d0
	move.w (%a0),%d3
	muls.w %d5,%d3
	asr.l #6,%d3
	move.w -2(%fp),%d1
	muls.w %d2,%d1
	asr.l #6,%d1
	add.w %d1,%d3
	add.w %a2,%d3
	move.w -4(%fp),%d1
	muls.w %d0,%d1
	asr.l #6,%d1
	add.w %d1,%d3
	move.w %d3,(%a1)
	move.w (%a0),%d3
	move.w %d3,%d1
	muls.w %d7,%d1
	asr.l #6,%d1
	muls.w %d6,%d2
	asr.l #6,%d2
	add.w %d1,%d2
	add.w %a5,%d2
	move.w -6(%fp),%d1
	muls.w %d0,%d1
	asr.l #6,%d1
	add.w %d1,%d2
	move.w %d2,2(%a1)
	muls.w -8(%fp),%d3
	asr.l #6,%d3
	move.w 2(%a0),%d1
	muls.w %d4,%d1
	asr.l #6,%d1
	add.w %d1,%d3
	add.w %a4,%d3
	muls.w -10(%fp),%d0
	asr.l #6,%d0
	add.w %d0,%d3
	move.w %d3,4(%a1)
	addq.l #6,%a0
	addq.l #6,%a1
	cmp.l %a0,%a3
	jne .L22
| Optional light transform: light_trans[k] = sum of light[n] * matInv[3k+n] >> 6.
.L21:
	tst.w light_enabled
	jeq .L19
	move.w light,%d1
	move.w light+2,%d2
	move.w light+4,%d0
	move.w %d2,%d4
	muls.w matInv+2,%d4
	asr.l #6,%d4
	move.w %d1,%d3
	muls.w matInv,%d3
	asr.l #6,%d3
	add.w %d3,%d4
	move.w %d0,%d3
	muls.w matInv+4,%d3
	asr.l #6,%d3
	add.w %d3,%d4
	move.w %d4,light_trans
	move.w %d2,%d4
	muls.w matInv+8,%d4
	asr.l #6,%d4
	move.w %d1,%d3
	muls.w matInv+6,%d3
	asr.l #6,%d3
	add.w %d3,%d4
	move.w %d0,%d3
	muls.w matInv+10,%d3
	asr.l #6,%d3
	add.w %d3,%d4
	move.w %d4,light_trans+2
	muls.w matInv+14,%d2
	asr.l #6,%d2
	muls.w matInv+12,%d1
	asr.l #6,%d1
	add.w %d2,%d1
	muls.w matInv+16,%d0
	asr.l #6,%d0
	add.w %d0,%d1
	move.w %d1,light_trans+4
.L19:
	movem.l -52(%fp),#15612
	unlk %fp
	rts
| Matrix rebuild. Each angle is shifted right by 6 and masked to 0..1023 to
| index sintab16 (word entries); a second lookup is made 256 entries further on.
| NOTE(review): the +256 lookups are presumably the cosines -- confirm in the C source.
.L29:
	move.w Rx,%d4
	asr.w #6,%d4
	move.l %d4,%d1
	and.l #1023,%d1
	lea sintab16,%a0
	add.l %d1,%d1
	move.w (%a0,%d1.l),%d6
	move.w Ry,%d3
	asr.w #6,%d3
	move.l %d3,%d1
	and.l #1023,%d1
	add.l %d1,%d1
	move.w (%a0,%d1.l),matInv+4
	move.w Rz,%d1
	asr.w #6,%d1
	move.l %d1,%d2
	and.l #1023,%d2
	add.l %d2,%d2
	move.w (%a0,%d2.l),%d2
	add.l #256,%d4
	and.l #1023,%d4
	add.l %d4,%d4
	move.w (%a0,%d4.l),%d5
	add.l #256,%d3
	and.l #1023,%d3
	add.l %d3,%d3
	move.w (%a0,%d3.l),%d4
	add.l #256,%d1
	and.l #1023,%d1
	add.l %d1,%d1
	move.w (%a0,%d1.l),%d1
	move.w matInv+4,%d7
	muls.w %d6,%d7
	lsr.l #6,%d7
	move.l %d7,%a1
	move.w matInv+4,%d3
	muls.w %d5,%d3
	lsr.l #6,%d3
	move.l %d3,%a5
	move.w %d4,%d3
	muls.w %d1,%d3
	asr.l #6,%d3
	move.w %d3,mat
	move.w %d4,%d7
	muls.w %d2,%d7
	asr.l #6,%d7
	neg.w %d7
	move.w %d7,%a3
	move.w %d7,mat+6
	move.w matInv+4,mat+12
	move.w %a1,%d7
	muls.w %d1,%d7
	asr.l #6,%d7
	move.l %d7,%a4
	move.w %d5,%d7
	muls.w %d2,%d7
	asr.l #6,%d7
	add.w %d7,%a4
	move.w %a4,mat+2
	move.w %d5,%d7
	muls.w %d1,%d7
	asr.l #6,%d7
	move.l %d7,%a2
	move.w %a1,%d7
	muls.w %d2,%d7
	asr.l #6,%d7
	sub.w %d7,%a2
	move.w %a2,mat+8
	move.w %d6,%d7
	muls.w %d4,%d7
	asr.l #6,%d7
	neg.w %d7
	move.w %d7,%a1
	move.w %d7,mat+14
	move.w %d6,%d7
	muls.w %d2,%d7
	asr.l #6,%d7
	move.l %d7,%a0
	move.w %a5,%d7
	muls.w %d1,%d7
	asr.l #6,%d7
	sub.w %d7,%a0
	move.w %a0,mat+4
	move.w %a5,%d7
	muls.w %d7,%d2
	asr.l #6,%d2
	muls.w %d6,%d1
	asr.l #6,%d1
	add.w %d1,%d2
	move.w %d2,mat+10
	muls.w %d5,%d4
	asr.l #6,%d4
	move.w %d4,mat+16
| matInv receives the same nine values as mat with rows and columns swapped.
	move.w %d3,matInv
	move.w %a4,matInv+6
	move.w %a0,matInv+12
	move.w %a3,matInv+2
	move.w %a2,matInv+8
	move.w %d2,matInv+14
	move.w %a1,matInv+10
	move.w %d4,matInv+16
	clr.w rebuildMat
	jra .L20

Pretty lengthy, but we may get an idea by direct comparison.

Stef · Post by **Stef** » Fri Apr 06, 2012 9:50 pm

I was quite busy lately too, i am really impressed you directly patched Gens to add profiling tools

The code you are providing for comparison is very long also you used -O2 flags which do not give best results.

Here is the code obtained with flags : "-O3 -fno-web -fno-gcse -fno-unit-at-a-time -fomit-frame-pointer"

Code: Select all

| doBlitBlankExt -- compiler output quoted in the post, built with
| "-O3 -fno-web -fno-gcse -fno-unit-at-a-time -fomit-frame-pointer".
| Flow visible in this listing:
|   * calls VDP_setAutoInc(2); d4 = 512 when bmp_buffer_read equals
|     bmp_buffer_0, otherwise 20992;
|   * a1 = 12582916 (0xC00004), a3 = 12582912 (0xC00000), a4 = 12582920
|     (0xC00008) are kept in registers for the whole function;
|   * if bit 1 of bmp_state is set the copy resumes from save_i/save_j,
|     otherwise the bit is set (.L142) and 20 outer iterations are started;
|   * a longword built from d4 is written to (a1), then both copy loops send
|     eight rows (128-byte stride) per step straight from a2 to (a3);
|   * .L152 polls (a4) before every step and leaves through .L160 (save
|     progress, return 0) when the byte equals the limit in d2;
|   * .L157 clears bit 1 of bmp_state and returns 1.
| NOTE(review): the three addresses are presumably the Mega Drive VDP data
| port, control port and HV counter -- confirm against the hardware docs.
doBlitBlankExt:
	movm.l #0x3838,-(%sp)
	pea 2.w
	jbsr VDP_setAutoInc
	move.l bmp_buffer_read,%a2
	addq.l #4,%sp
	move.l #512,%d4
	cmp.l bmp_buffer_0.l,%a2
	jbeq .L141
	move.w #20992,%d4
.L141:
	move.l #12582916,%a1
	move.l #12582912,%a3
	move.w bmp_state,%d0
	btst #1,%d0
	jbeq .L142
| Resume path: d3 = 20 - save_i and a4 = 32 - save_j feed both the offset added
| to d4 and the offset added to the source pointer a2.
	moveq #20,%d3
	sub.w (save_i.2),%d3
	move.w (save_j.3),%a0
	move.w #32,%a4
	sub.w %a0,%a4
	move.l %d3,%d1
	lsl.l #5,%d1
	and.l #2097120,%d1
	clr.l %d0
	move.w %a4,%d0
	add.l %d1,%d0
	lsl.l #5,%d0
	add.l %d0,%d4
	lsl.l #8,%d3
	and.l #16776960,%d3
	clr.l %d2
	move.w %a4,%d2
	add.l %d3,%d2
	lsl.l #2,%d2
	add.l %d2,%a2
| Control longword: ((d4 & 16383) + 16384) in the high word, d4 >> 14 in the low word.
	move.l %d4,%d0
	and.l #16383,%d0
	add.l #16384,%d0
	swap %d0
	clr.w %d0
	moveq #14,%d1
	lsr.l %d1,%d4
	add.l %d4,%d0
	move.l %d0,(%a1)
	move.w %a0,%d1
	subq.w #1,%d1
	cmp.w #-1,%d1
	jbeq .L155
	.align	2
| First copy loop: save_j steps, no polling.
.L161:
	move.l (%a2),(%a3)
	move.l 128(%a2),(%a3)
	move.l 256(%a2),(%a3)
	move.l 384(%a2),(%a3)
	move.l 512(%a2),(%a3)
	move.l 640(%a2),(%a3)
	move.l 768(%a2),(%a3)
	move.l 896(%a2),(%a3)
	addq.l #4,%a2
	dbra %d1,.L161
.L155:
	lea (896,%a2),%a2
	move.w (save_i.2),%d3
	subq.w #1,%d3
	move.l #12582920,%a4
	jbsr VDP_getScreenHeight
| Poll limit: d2 = (((screen height & 0xFFFF) - 160) >> 1) - 2, compared as a byte.
	clr.l %d4
	move.w %d0,%d4
	add.l #-160,%d4
	asr.l #1,%d4
	move.b %d4,%d2
	subq.b #2,%d2
	subq.w #1,%d3
	cmp.w #-1,%d3
	jbeq .L157
| Second copy loop: d3 counts outer iterations, d1 counts 32 inner steps.
.L162:
	moveq #31,%d1
	.align	2
.L152:
	move.b (%a4),%d0
	cmp.b %d0,%d2
	jbeq .L160
	move.l (%a2),(%a3)
	move.l 128(%a2),(%a3)
	move.l 256(%a2),(%a3)
	move.l 384(%a2),(%a3)
	move.l 512(%a2),(%a3)
	move.l 640(%a2),(%a3)
	move.l 768(%a2),(%a3)
	move.l 896(%a2),(%a3)
	addq.l #4,%a2
	dbra %d1,.L152
	lea (896,%a2),%a2
	dbra %d3,.L162
| Completed: clear bit 1 of bmp_state and return 1.
.L157:
	and.w #-3,bmp_state
	moveq #1,%d0
	jbra .L139
	.align	2
| Fresh start: set bit 1 of bmp_state, write the control longword, 20 outer iterations.
.L142:
	or.w #2,%d0
	move.w %d0,bmp_state
	moveq #20,%d3
	move.l %d4,%d2
	and.l #16383,%d2
	add.l #16384,%d2
	swap %d2
	clr.w %d2
	moveq #14,%d0
	lsr.l %d0,%d4
	add.l %d4,%d2
	move.l %d2,(%a1)
	move.l #12582920,%a4
	jbsr VDP_getScreenHeight
	clr.l %d4
	move.w %d0,%d4
	add.l #-160,%d4
	asr.l #1,%d4
	move.b %d4,%d2
	subq.b #2,%d2
	subq.w #1,%d3
	cmp.w #-1,%d3
	jbne .L162
	jbra .L157
	.align	2
| Early exit: convert the dbra counters back to remaining counts (+1 each),
| store them in save_i/save_j and return 0.
.L160:
	addq.w #1,%d3
	move.w %d3,(save_i.2)
	addq.w #1,%d1
	move.w %d1,(save_j.3)
	moveq #0,%d0
.L139:
	movm.l (%sp)+,#0x1c1c
	rts

Code: Select all

| M3D_transform3D -- compiler output quoted in the post, built with
| "-O3 -fno-web -fno-gcse -fno-unit-at-a-time -fomit-frame-pointer".
| Stack arguments after the 11-register save: 48(%sp) = source pointer,
| 52(%sp) = destination pointer, 56(%sp) = element count (low word used).
| Flow visible in this listing:
|   * if rebuildMat is non-zero, mat and matInv are rebuilt from Rx, Ry, Rz
|     through sintab16 lookups and rebuildMat is cleared;
|   * .L24 processes one 3-word element per pass: each output word is the sum
|     of three 16x16 products (each shifted right by 6) plus Tx, Ty or Tz;
|     mat, mat+2 and mat+4 are read through a5/a4/a3, the other six entries
|     by absolute address;
|   * if light_enabled is non-zero, light_trans is computed the same way from
|     light and matInv (without a translation term).
M3D_transform3D:
	movm.l #0x3f3e,-(%sp)
	move.l 56(%sp),%a6
	tst.w rebuildMat
	jbeq .L20
| Matrix rebuild. Each angle is shifted right by 6 and masked to 0..1023 to
| index sintab16 (word entries); a second lookup is made 256 entries further on.
| NOTE(review): the +256 lookups are presumably the cosines -- confirm in the C source.
	move.w Rx,%d5
	asr.w #6,%d5
	ext.l %d5
	move.l %d5,%d0
	and.l #1023,%d0
	lea sintab16,%a0
	add.l %d0,%d0
	move.w (%a0,%d0.l),%d7
	move.w Ry,%d2
	asr.w #6,%d2
	ext.l %d2
	move.l %d2,%d6
	and.l #1023,%d6
	add.l %d6,%d6
	move.w (%a0,%d6.l),%a5
	move.w Rz,%d1
	asr.w #6,%d1
	ext.l %d1
	move.l %d1,%d4
	and.l #1023,%d4
	add.l %d4,%d4
	move.w (%a0,%d4.l),%a2
	add.l #256,%d5
	and.l #1023,%d5
	add.l %d5,%d5
	move.w (%a0,%d5.l),%d4
	add.l #256,%d2
	and.l #1023,%d2
	add.l %d2,%d2
	move.w (%a0,%d2.l),%a4
	add.l #256,%d1
	and.l #1023,%d1
	add.l %d1,%d1
	move.w (%a0,%d1.l),%a1
	move.w %a5,%d1
	muls.w %d7,%d1
	lsr.l #6,%d1
	move.w %a5,%d2
	muls.w %d4,%d2
	lsr.l #6,%d2
	move.w %a4,%d0
	move.w %a1,%d3
	muls.w %d3,%d0
	asr.l #6,%d0
	move.l %d0,%a3
	move.w %d0,mat
	move.w %a4,%d6
	move.w %a2,%d3
	muls.w %d3,%d6
	asr.l #6,%d6
	neg.w %d6
	move.w %d6,%a0
	move.w %d6,mat+6
	move.w %a5,mat+12
	move.w %a1,%d6
	muls.w %d1,%d6
	asr.l #6,%d6
	move.w %a2,%d5
	muls.w %d4,%d5
	asr.l #6,%d5
	add.w %d5,%d6
	move.w %d6,mat+2
	move.w %a1,%d5
	muls.w %d4,%d5
	asr.l #6,%d5
	muls.w %d3,%d1
	asr.l #6,%d1
	sub.w %d1,%d5
	move.w %d5,mat+8
	move.w %a4,%d3
	muls.w %d7,%d3
	asr.l #6,%d3
	neg.w %d3
	move.w %d3,mat+14
	move.w %a2,%d1
	muls.w %d7,%d1
	asr.l #6,%d1
	move.w %a1,%d0
	muls.w %d2,%d0
	asr.l #6,%d0
	sub.w %d0,%d1
	move.w %d1,mat+4
	move.w %a2,%d0
	muls.w %d0,%d2
	asr.l #6,%d2
	move.w %a1,%d0
	muls.w %d0,%d7
	asr.l #6,%d7
	add.w %d7,%d2
	move.w %d2,mat+10
	move.w %a4,%d0
	muls.w %d0,%d4
	asr.l #6,%d4
	move.w %d4,mat+16
| matInv receives the same nine values as mat with rows and columns swapped.
	move.w %a3,matInv
	move.w %d6,matInv+6
	move.w %d1,matInv+12
	move.w %a0,matInv+2
	move.w %d5,matInv+8
	move.w %d2,matInv+14
	move.w %a5,matInv+4
	move.w %d3,matInv+10
	move.w %d4,matInv+16
	clr.w rebuildMat
.L20:
	move.l 48(%sp),%a0
	move.l 52(%sp),%a1
| d5 = count - 1 for dbra; a count of 0 skips the loop.
	move.w %a6,%d5
	subq.w #1,%d5
	cmp.w #-1,%d5
	jbeq .L27
	lea mat,%a5
	move.w Tx,%a2
	move.w Ty,%d7
	move.w Tz,%d6
	lea mat+2,%a4
	lea mat+4,%a3
	.align	2
.L24:
	move.w (%a0),%d2
	muls.w (%a5),%d2
	asr.l #6,%d2
	move.w 2(%a0),%d4
	move.w %d4,%d3
	muls.w (%a4),%d3
	asr.l #6,%d3
	add.w %d3,%d2
	move.w 4(%a0),%d3
	move.w %d3,%d0
	muls.w (%a3),%d0
	asr.l #6,%d0
	add.w %d0,%d2
	add.w %a2,%d2
	move.w %d2,(%a1)
	move.w (%a0),%d2
	move.w %d2,%d1
	muls.w mat+6,%d1
	asr.l #6,%d1
	muls.w mat+8,%d4
	asr.l #6,%d4
	add.w %d4,%d1
	move.w %d3,%d4
	muls.w mat+10,%d4
	asr.l #6,%d4
	add.w %d4,%d1
	add.w %d7,%d1
	move.w %d1,2(%a1)
	muls.w mat+12,%d2
	asr.l #6,%d2
	move.w 2(%a0),%d1
	muls.w mat+14,%d1
	asr.l #6,%d1
	add.w %d1,%d2
	muls.w mat+16,%d3
	asr.l #6,%d3
	add.w %d3,%d2
	add.w %d6,%d2
	move.w %d2,4(%a1)
	addq.l #6,%a0
	addq.l #6,%a1
	dbra %d5,.L24
| Optional light transform: light_trans[k] = sum of light[n] * matInv[3k+n] >> 6.
.L27:
	tst.w light_enabled
	jbeq .L19
	move.w light,%d7
	move.w %d7,%d3
	muls.w matInv,%d3
	asr.l #6,%d3
	move.w light+2,%d6
	move.w %d6,%d5
	muls.w matInv+2,%d5
	asr.l #6,%d5
	add.w %d5,%d3
	move.w light+4,%d5
	move.w %d5,%d0
	muls.w matInv+4,%d0
	asr.l #6,%d0
	add.w %d0,%d3
	move.w %d3,light_trans
	move.w %d7,%d1
	muls.w matInv+6,%d1
	asr.l #6,%d1
	move.w %d6,%d2
	muls.w matInv+8,%d2
	asr.l #6,%d2
	add.w %d2,%d1
	move.w %d5,%d4
	muls.w matInv+10,%d4
	asr.l #6,%d4
	add.w %d4,%d1
	move.w %d1,light_trans+2
	muls.w matInv+12,%d7
	asr.l #6,%d7
	muls.w matInv+14,%d6
	asr.l #6,%d6
	add.w %d6,%d7
	muls.w matInv+16,%d5
	asr.l #6,%d5
	add.w %d5,%d7
	move.w %d7,light_trans+4
.L19:
	movm.l (%sp)+,#0x7cfc
	rts

Stef · Post by **Stef** » Fri Apr 06, 2012 9:55 pm

Same but with "-O2 -fomit-frame-pointer" flag only now :

Code: Select all

| doBlitBlankExt -- compiler output quoted in the post, built with
| "-O2 -fomit-frame-pointer".
| Flow visible in this listing:
|   * calls VDP_setAutoInc(2); d4 = 512 when bmp_buffer_read equals
|     bmp_buffer_0, otherwise 20992;
|   * if bit 1 of bmp_state is set the copy resumes from save_i/save_j,
|     otherwise the bit is set (.L126) and 20 outer iterations are started;
|   * a longword built from d4 is written to 12582916 (0xC00004), then both
|     copy loops send eight rows (128-byte stride) per step from a2 to the
|     absolute address 12582912 (0xC00000);
|   * .L136 polls the byte at 12582920 (0xC00008) before every step and leaves
|     through .L144 (save progress, return 0) when it equals the limit in d2;
|   * .L141 clears bit 1 of bmp_state and returns 1.
| NOTE(review): the three addresses are presumably the Mega Drive VDP data
| port, control port and HV counter -- confirm against the hardware docs.
doBlitBlankExt:
	movm.l #0x3820,-(%sp)
	pea 2.w
	jbsr VDP_setAutoInc
	move.l bmp_buffer_read,%a2
	addq.l #4,%sp
	move.l #512,%d4
	cmp.l bmp_buffer_0.l,%a2
	jbeq .L125
	move.w #20992,%d4
.L125:
	move.w bmp_state,%d0
	btst #1,%d0
	jbeq .L126
| Resume path: d3 = 20 - save_i and d2 = 32 - save_j feed both the offset added
| to d4 and the offset added to the source pointer a2.
	move.w (save_i.2),%a1
	moveq #20,%d3
	sub.w %a1,%d3
	move.w (save_j.3),%a0
	moveq #32,%d2
	sub.w %a0,%d2
	move.l %d3,%d1
	lsl.l #5,%d1
	and.l #2097120,%d1
	clr.l %d0
	move.w %d2,%d0
	add.l %d1,%d0
	lsl.l #5,%d0
	add.l %d0,%d4
	lsl.l #8,%d3
	and.l #16776960,%d3
	and.l #0xFFFF,%d2
	add.l %d3,%d2
	lsl.l #2,%d2
	add.l %d2,%a2
| Control longword: ((d4 & 16383) + 16384) in the high word, d4 >> 14 in the low word.
	move.l %d4,%d0
	and.l #16383,%d0
	add.l #16384,%d0
	swap %d0
	clr.w %d0
	moveq #14,%d1
	lsr.l %d1,%d4
	add.l %d4,%d0
	move.l %d0,12582916
	move.w %a0,%d1
	subq.w #1,%d1
	cmp.w #-1,%d1
	jbeq .L139
	.align	2
| First copy loop: save_j steps, no polling.
.L129:
	move.l (%a2),12582912
	move.l 128(%a2),12582912
	move.l 256(%a2),12582912
	move.l 384(%a2),12582912
	move.l 512(%a2),12582912
	move.l 640(%a2),12582912
	move.l 768(%a2),12582912
	move.l 896(%a2),12582912
	addq.l #4,%a2
	dbra %d1,.L129
.L139:
	lea (896,%a2),%a2
	move.w %a1,%d3
	subq.w #1,%d3
	jbsr VDP_getScreenHeight
| Poll limit: d2 = (((screen height & 0xFFFF) - 160) >> 1) - 2, compared as a byte.
	and.l #0xFFFF,%d0
	add.l #-160,%d0
	asr.l #1,%d0
	move.b %d0,%d2
	subq.b #2,%d2
	subq.w #1,%d3
	cmp.w #-1,%d3
	jbeq .L141
| Second copy loop: d3 counts outer iterations, d1 counts 32 inner steps.
.L137:
	moveq #31,%d1
	.align	2
.L136:
	move.b 12582920,%d0
	cmp.b %d0,%d2
	jbeq .L144
	move.l (%a2),12582912
	move.l 128(%a2),12582912
	move.l 256(%a2),12582912
	move.l 384(%a2),12582912
	move.l 512(%a2),12582912
	move.l 640(%a2),12582912
	move.l 768(%a2),12582912
	move.l 896(%a2),12582912
	addq.l #4,%a2
	dbra %d1,.L136
	lea (896,%a2),%a2
	dbra %d3,.L137
| Completed: clear bit 1 of bmp_state and return 1.
.L141:
	and.w #-3,bmp_state
	moveq #1,%d0
	jbra .L123
	.align	2
| Fresh start: set bit 1 of bmp_state, write the control longword, 20 outer iterations.
.L126:
	or.w #2,%d0
	move.w %d0,bmp_state
	moveq #20,%d3
	move.l %d4,%d0
	and.l #16383,%d0
	add.l #16384,%d0
	swap %d0
	clr.w %d0
	moveq #14,%d1
	lsr.l %d1,%d4
	add.l %d4,%d0
	move.l %d0,12582916
	jbsr VDP_getScreenHeight
	and.l #0xFFFF,%d0
	add.l #-160,%d0
	asr.l #1,%d0
	move.b %d0,%d2
	subq.b #2,%d2
	subq.w #1,%d3
	cmp.w #-1,%d3
	jbne .L137
	jbra .L141
	.align	2
| Early exit: convert the dbra counters back to remaining counts (+1 each),
| store them in save_i/save_j and return 0.
.L144:
	addq.w #1,%d3
	move.w %d3,(save_i.2)
	addq.w #1,%d1
	move.w %d1,(save_j.3)
	moveq #0,%d0
.L123:
	movm.l (%sp)+,#0x41c
	rts

Code: Select all

| M3D_transform3D -- compiler output quoted in the post, built with
| "-O2 -fomit-frame-pointer".
| Stack arguments after the 11-register save: 48(%sp) = source pointer,
| 52(%sp) = destination pointer, 56(%sp) = element count (low word used).
| Flow visible in this listing:
|   * if rebuildMat is non-zero, mat and matInv are rebuilt from Rx, Ry, Rz
|     through sintab16 lookups and rebuildMat is cleared;
|   * .L24 processes one 3-word element per pass: each output word is the sum
|     of three 16x16 products (each shifted right by 6) plus Tx, Ty or Tz;
|     all nine mat entries are read by absolute address on every pass;
|   * if light_enabled is non-zero, light_trans is computed the same way from
|     light and matInv (without a translation term).
M3D_transform3D:
	movm.l #0x3f3e,-(%sp)
	move.l 56(%sp),%a6
	tst.w rebuildMat
	jbeq .L20
| Matrix rebuild. Each angle is shifted right by 6 and masked to 0..1023 to
| index sintab16 (word entries); a second lookup is made 256 entries further on.
| NOTE(review): the +256 lookups are presumably the cosines -- confirm in the C source.
	move.w Rx,%d3
	asr.w #6,%d3
	ext.l %d3
	move.l %d3,%d0
	and.l #1023,%d0
	lea sintab16,%a0
	add.l %d0,%d0
	move.w (%a0,%d0.l),%d7
	move.w Ry,%d2
	asr.w #6,%d2
	ext.l %d2
	move.l %d2,%d0
	and.l #1023,%d0
	add.l %d0,%d0
	move.w (%a0,%d0.l),%a5
	move.w Rz,%d1
	asr.w #6,%d1
	ext.l %d1
	move.l %d1,%d0
	and.l #1023,%d0
	add.l %d0,%d0
	move.w (%a0,%d0.l),%a2
	add.l #256,%d3
	and.l #1023,%d3
	add.l %d3,%d3
	move.w (%a0,%d3.l),%d4
	add.l #256,%d2
	and.l #1023,%d2
	add.l %d2,%d2
	move.w (%a0,%d2.l),%a4
	add.l #256,%d1
	and.l #1023,%d1
	add.l %d1,%d1
	move.w (%a0,%d1.l),%a1
	move.w %a5,%d1
	muls.w %d7,%d1
	lsr.l #6,%d1
	move.w %a5,%d2
	muls.w %d4,%d2
	lsr.l #6,%d2
	move.w %a4,%d0
	move.w %a1,%d3
	muls.w %d3,%d0
	asr.l #6,%d0
	move.l %d0,%a3
	move.w %d0,mat
	move.w %a4,%d0
	move.w %a2,%d3
	muls.w %d3,%d0
	asr.l #6,%d0
	neg.w %d0
	move.w %d0,%a0
	move.w %d0,mat+6
	move.w %a5,mat+12
	move.w %a1,%d6
	muls.w %d1,%d6
	asr.l #6,%d6
	move.w %a2,%d0
	muls.w %d4,%d0
	asr.l #6,%d0
	add.w %d0,%d6
	move.w %d6,mat+2
	move.w %a1,%d5
	muls.w %d4,%d5
	asr.l #6,%d5
	muls.w %d3,%d1
	asr.l #6,%d1
	sub.w %d1,%d5
	move.w %d5,mat+8
	move.w %a4,%d3
	muls.w %d7,%d3
	asr.l #6,%d3
	neg.w %d3
	move.w %d3,mat+14
	move.w %a2,%d1
	muls.w %d7,%d1
	asr.l #6,%d1
	move.w %a1,%d0
	muls.w %d2,%d0
	asr.l #6,%d0
	sub.w %d0,%d1
	move.w %d1,mat+4
	move.w %a2,%d0
	muls.w %d0,%d2
	asr.l #6,%d2
	move.w %a1,%d0
	muls.w %d0,%d7
	asr.l #6,%d7
	add.w %d7,%d2
	move.w %d2,mat+10
	move.w %a4,%d0
	muls.w %d0,%d4
	asr.l #6,%d4
	move.w %d4,mat+16
| matInv receives the same nine values as mat with rows and columns swapped.
	move.w %a3,matInv
	move.w %d6,matInv+6
	move.w %d1,matInv+12
	move.w %a0,matInv+2
	move.w %d5,matInv+8
	move.w %d2,matInv+14
	move.w %a5,matInv+4
	move.w %d3,matInv+10
	move.w %d4,matInv+16
	clr.w rebuildMat
.L20:
	move.l 48(%sp),%a0
	move.l 52(%sp),%a1
| d5 = count - 1 for dbra; a count of 0 skips the loop.
	move.w %a6,%d5
	subq.w #1,%d5
	cmp.w #-1,%d5
	jbeq .L27
	move.w Tx,%a2
	move.w Ty,%d7
	move.w Tz,%d6
	.align	2
.L24:
	move.w (%a0),%d1
	muls.w mat,%d1
	asr.l #6,%d1
	move.w 2(%a0),%d4
	move.w %d4,%d0
	muls.w mat+2,%d0
	asr.l #6,%d0
	add.w %d0,%d1
	move.w 4(%a0),%d3
	move.w %d3,%d0
	muls.w mat+4,%d0
	asr.l #6,%d0
	add.w %d0,%d1
	add.w %a2,%d1
	move.w %d1,(%a1)
	move.w (%a0),%d2
	move.w %d2,%d1
	muls.w mat+6,%d1
	asr.l #6,%d1
	muls.w mat+8,%d4
	asr.l #6,%d4
	add.w %d4,%d1
	move.w %d3,%d0
	muls.w mat+10,%d0
	asr.l #6,%d0
	add.w %d0,%d1
	add.w %d7,%d1
	move.w %d1,2(%a1)
	muls.w mat+12,%d2
	asr.l #6,%d2
	move.w 2(%a0),%d0
	muls.w mat+14,%d0
	asr.l #6,%d0
	add.w %d0,%d2
	muls.w mat+16,%d3
	asr.l #6,%d3
	add.w %d3,%d2
	add.w %d6,%d2
	move.w %d2,4(%a1)
	addq.l #6,%a0
	addq.l #6,%a1
	dbra %d5,.L24
| Optional light transform: light_trans[k] = sum of light[n] * matInv[3k+n] >> 6.
.L27:
	tst.w light_enabled
	jbeq .L19
	move.w light,%d4
	move.w %d4,%d1
	muls.w matInv,%d1
	asr.l #6,%d1
	move.w light+2,%d3
	move.w %d3,%d0
	muls.w matInv+2,%d0
	asr.l #6,%d0
	add.w %d0,%d1
	move.w light+4,%d2
	move.w %d2,%d0
	muls.w matInv+4,%d0
	asr.l #6,%d0
	add.w %d0,%d1
	move.w %d1,light_trans
	move.w %d4,%d1
	muls.w matInv+6,%d1
	asr.l #6,%d1
	move.w %d3,%d0
	muls.w matInv+8,%d0
	asr.l #6,%d0
	add.w %d0,%d1
	move.w %d2,%d0
	muls.w matInv+10,%d0
	asr.l #6,%d0
	add.w %d0,%d1
	move.w %d1,light_trans+2
	muls.w matInv+12,%d4
	asr.l #6,%d4
	muls.w matInv+14,%d3
	asr.l #6,%d3
	add.w %d3,%d4
	muls.w matInv+16,%d2
	asr.l #6,%d2
	add.w %d2,%d4
	move.w %d4,light_trans+4
.L19:
	movm.l (%sp)+,#0x7cfc
	rts

fdarkangel · Post by **fdarkangel** » Fri Apr 06, 2012 11:13 pm

Stef wrote:I was quite busy lately too, i am really impressed you directly patched Gens to add profiling tools

The code you are providing for comparison is very long also you used -O2 flags which do not give best results.

Thanks a lot!

It'd be even nicer if we can use gcov though

Indeed, I was using -O2 to be on the safe side. Here's my -O3 -fno-web -fno-gcse -fno-unit-at-a-time -fomit-frame-pointer

Code: Select all

| doBlitBlankExt -- compiler output quoted in the post (gcc4,
| "-O3 -fno-web -fno-gcse -fno-unit-at-a-time -fomit-frame-pointer").
| Flow visible in this listing:
|   * calls VDP_setAutoInc(2), then selects a base value in d0: 512 when
|     bmp_buffer_read equals bmp_buffer_0, otherwise 20992;
|   * if bit 1 of bmp_state is set, resumes from save_i/save_j; otherwise sets
|     that bit (.L272) and starts with d5 = 20, d6 = 19;
|   * looks up a longword in vramwrite_tab and writes it to 12582916 (0xC00004),
|     then streams longwords to 12582912 (0xC00000), eight source rows
|     (128-byte stride) per step, 32 steps per outer iteration;
|   * before each step the byte at 12582920 (0xC00008) is compared with the
|     limit ((VDP_getScreenHeight() - 160) >> 1) - 2; on a match the current
|     i/j are stored in save_i/save_j and the function returns d0 = 0;
|   * when every row has been sent, bit 1 of bmp_state is cleared and the
|     function returns d0 = 1.
| A 4-byte stack slot, addressed as 44(%sp) after the register save, holds the
| eighth row pointer in the second copy loop.
| NOTE(review): 0xC00000 / 0xC00004 / 0xC00008 are presumably the Mega Drive
| VDP data port, control port and HV counter -- confirm against the hardware docs.
doBlitBlankExt:
	subq.l #4,%sp
	movem.l #16190,-(%sp)
	pea 2.w
	jsr VDP_setAutoInc
	move.l bmp_buffer_read,%a2
	addq.l #4,%sp
	move.l #20992,%d0
	cmp.l bmp_buffer_0.l,%a2
	jeq .L288
.L271:
	move.w bmp_state,%d1
	btst #1,%d1
	jeq .L272
| Resume path: d2 = 20 - save_i, d3 = 32 - save_j; both are used to advance
| the source pointer a2 and to compute the vramwrite_tab index.
	move.w save_i.2004,%d6
	moveq #20,%d2
	sub.w %d6,%d2
	move.w save_j.2005,%d1
	moveq #32,%d3
	sub.w %d1,%d3
	and.l #65535,%d3
	and.l #65535,%d2
	move.l %d2,%d4
	lsl.l #8,%d4
	move.l %d3,%a0
	add.l %d4,%a0
	add.l %a0,%a0
	add.l %a0,%a0
	add.l %a0,%a2
	lsl.l #5,%d2
	add.l %d3,%d2
	lsl.l #5,%d2
	move.l %d2,%a0
	add.l %d0,%a0
	add.l %a0,%a0
	move.l %a0,%a1
	add.l #vramwrite_tab,%a1
	move.l (%a1,%a0.l),%d0
	move.l %d0,12582916
	move.w %d1,%d2
	subq.w #1,%d2
	tst.w %d1
	jeq .L273
	move.w %d2,%d0
	move.l %a2,%a0
| First copy loop: runs save_j times (dbra on d0 = save_j - 1), no polling.
| The destination address is reloaded into a1 on every pass.
.L274:
	move.l #12582912,%a1
	move.l (%a0),(%a1)
	move.l 128(%a0),(%a1)
	move.l 256(%a0),(%a1)
	move.l 384(%a0),(%a1)
	move.l 512(%a0),(%a1)
	move.l 640(%a0),(%a1)
	move.l 768(%a0),(%a1)
	move.l 896(%a0),(%a1)
	addq.l #4,%a0
	dbra %d0,.L274
| The loop advanced a copy (a0); a2 is now recomputed as a2 + 4 * save_j.
	moveq #0,%d0
	move.w %d2,%d0
	addq.l #1,%d0
	add.l %d0,%d0
	add.l %d0,%d0
	add.l %d0,%a2
.L273:
	lea (896,%a2),%a2
	move.w %d6,%d5
	subq.w #1,%d5
	jsr VDP_getScreenHeight
	and.l #65535,%d0
	move.l %d0,%d4
	add.l #-160,%d4
	asr.l #1,%d4
	subq.b #2,%d4
	subq.w #2,%d6
	tst.w %d5
	jeq .L276
| Second copy loop, outer iteration. The first of the 32 inner steps is peeled
| here; eight separate row pointers are kept (d2, d1, a6, a5, a4, a3 and the
| stack slot at 44(%sp)) instead of using displacements off a2.
.L286:
	move.b 12582920,%d0
	cmp.b %d4,%d0
	jeq .L284
	move.l %a2,%d2
	add.l #128,%d2
	move.l %a2,%d1
	add.l #256,%d1
	lea (384,%a2),%a6
	lea (512,%a2),%a5
	lea (640,%a2),%a4
	lea (768,%a2),%a3
	lea (896,%a2),%a0
	move.l %a0,44(%sp)
	moveq #31,%d3
	move.l (%a2)+,%d0
	move.l #12582912,%a0
	move.l %d0,(%a0)
	move.l %d2,%a1
	addq.l #4,%d2
	move.l (%a1)+,%d0
	move.l %d0,(%a0)
	move.l %d1,%a1
	addq.l #4,%d1
	move.l (%a1)+,%d0
	move.l %d0,(%a0)
	move.l (%a6)+,%d0
	move.l %d0,(%a0)
	move.l (%a5)+,%d0
	move.l %d0,(%a0)
	move.l (%a4)+,%d0
	move.l %d0,(%a0)
	move.l (%a3)+,%d0
	move.l %d0,(%a0)
	move.l 44(%sp),%a1
	addq.l #4,44(%sp)
	move.l (%a1)+,%d0
	move.l %d0,(%a0)
	move.w %d3,%d0
	subq.w #1,%d0
	cmp.w #-1,%d0
	jeq .L289
| Remaining inner steps (d0 counts 30 down to 0), each preceded by the poll.
.L280:
	move.b 12582920,%d7
	cmp.b %d7,%d4
	jeq .L281
	move.w %d0,%d3
	move.l (%a2)+,%d0
	move.l #12582912,%a0
	move.l %d0,(%a0)
	move.l %d2,%a1
	addq.l #4,%d2
	move.l (%a1)+,%d0
	move.l %d0,(%a0)
	move.l %d1,%a1
	addq.l #4,%d1
	move.l (%a1)+,%d0
	move.l %d0,(%a0)
	move.l (%a6)+,%d0
	move.l %d0,(%a0)
	move.l (%a5)+,%d0
	move.l %d0,(%a0)
	move.l (%a4)+,%d0
	move.l %d0,(%a0)
	move.l (%a3)+,%d0
	move.l %d0,(%a0)
	move.l 44(%sp),%a1
	addq.l #4,44(%sp)
	move.l (%a1)+,%d0
	move.l %d0,(%a0)
	move.w %d3,%d0
	subq.w #1,%d0
	cmp.w #-1,%d0
	jne .L280
| End of one outer iteration: skip the seven rows already sent and loop.
.L289:
	lea (896,%a2),%a2
	move.w %d6,%d0
	subq.w #1,%d0
	move.w %d6,%d5
	jeq .L276
	move.w %d0,%d6
	jra .L286
| Early exit: poll matched. .L284 is the entry used when the match happened
| before the first inner step (j = 32).
.L284:
	moveq #32,%d3
.L281:
	move.w %d5,save_i.2004
	move.w %d3,save_j.2005
	clr.w %d0
	movem.l (%sp)+,#31996
	addq.l #4,%sp
	rts
| Fresh start: set bit 1 of bmp_state, write the control longword, compute the
| poll limit and enter the copy loop with 20 outer iterations.
.L272:
	or.w #2,%d1
	move.w %d1,bmp_state
	move.l %d0,%a0
	add.l %d0,%a0
	move.l %a0,%a1
	add.l #vramwrite_tab,%a1
	move.l (%a1,%a0.l),%d0
	move.l %d0,12582916
	jsr VDP_getScreenHeight
	and.l #65535,%d0
	move.l %d0,%d4
	add.l #-160,%d4
	asr.l #1,%d4
	subq.b #2,%d4
	moveq #19,%d6
	moveq #20,%d5
	jra .L286
.L288:
	move.w #512,%d0
	jra .L271
| Completed: clear bit 1 of bmp_state and return 1.
.L276:
	and.w #-3,bmp_state
	moveq #1,%d0
	movem.l (%sp)+,#31996
	addq.l #4,%sp
	rts

O2 or O3, the code generated by gcc4 is longer.
However, I noticed an awkwardness for the code generated for this loop

Code: Select all

        // Runs j passes. Each pass reads the eight elements of src at indices
        // (BMP_PITCH * k) / 4 for k = 0..7, stores each one through pldata,
        // and then advances src by one element.
        // NOTE(review): presumably src points to 32-bit words and BMP_PITCH is
        // a row pitch in bytes (the listings use move.l with 128-byte strides)
        // -- confirm in bmp.c.
        while(j--)
        {
            // send it to VRAM
            *pldata = src[(BMP_PITCH * 0) / 4];
            *pldata = src[(BMP_PITCH * 1) / 4];
            *pldata = src[(BMP_PITCH * 2) / 4];
            *pldata = src[(BMP_PITCH * 3) / 4];
            *pldata = src[(BMP_PITCH * 4) / 4];
            *pldata = src[(BMP_PITCH * 5) / 4];
            *pldata = src[(BMP_PITCH * 6) / 4];
            *pldata = src[(BMP_PITCH * 7) / 4];

            src++;
        }

There are two copies of this loop throughout the code. gcc3 gives this at both optimization levels

Code: Select all

| Excerpt of the copy loop: a2 = source, a3 = destination (loaded once outside
| the loop), d1 = pass count - 1 for dbra. Eight longwords at 128-byte strides
| are written per pass, then a2 advances by 4.
.L161: 
   move.l (%a2),(%a3) 
   move.l 128(%a2),(%a3) 
   move.l 256(%a2),(%a3) 
   move.l 384(%a2),(%a3) 
   move.l 512(%a2),(%a3) 
   move.l 640(%a2),(%a3) 
   move.l 768(%a2),(%a3) 
   move.l 896(%a2),(%a3) 
   addq.l #4,%a2 
   dbra %d1,.L161

The second loop is identical to this one.
gcc4 on the other hand gives this for the first one

Code: Select all

| Excerpt of the same copy loop: a0 = source, d0 = pass count - 1 for dbra.
| The destination address 12582912 (0xC00000) is reloaded into a1 at the top
| of every pass even though it never changes inside the loop.
.L274: 
   move.l #12582912,%a1 
   move.l (%a0),(%a1) 
   move.l 128(%a0),(%a1) 
   move.l 256(%a0),(%a1) 
   move.l 384(%a0),(%a1) 
   move.l 512(%a0),(%a1) 
   move.l 640(%a0),(%a1) 
   move.l 768(%a0),(%a1) 
   move.l 896(%a0),(%a1) 
   addq.l #4,%a0 
   dbra %d0,.L274

whereas the second one is a different and much longer version of it! (See .L286 above in this post.) There is also a pointless "move.l #12582912,%a1" within the loop (because of -fno-gcse). I'll try to isolate this "weirdness" and file another bug report.

Meanwhile, I tried cross-compiling gcc 3.4.6 (with binutils-2.22) and gosh, I bumped into a gcc-3.4.6 bug!

Code: Select all

sgdk1/buildscripts/m68k-elf/gcc/gcc/xgcc -Bsgdk1/buildscripts/m68k-elf/gcc/gcc/ -B/opt/toolchains/sega/m68k-elf-old/m68k-elf/bin/ -B/opt/toolchains/sega/m68k-elf-old/m68k-elf/lib/ -isystem /opt/toolchains/sega/m68k-elf-old/m68k-elf/include -isystem /opt/toolchains/sega/m68k-elf-old/m68k-elf/sys-include -O2  -DIN_GCC -DCROSS_COMPILE   -W -Wall -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes -Wold-style-definition  -isystem ./include   -g  -DIN_LIBGCC2 -D__GCC_FLOAT_NOT_NEEDED -Dinhibit_libc -I. -I. -Isgdk1/buildscripts/gcc-3.4.6/gcc -Isgdk1/buildscripts/gcc-3.4.6/gcc/. -Isgdk1/buildscripts/gcc-3.4.6/gcc/../include   -m68000 -DL_fixdfdi -c sgdk1/buildscripts/gcc-3.4.6/gcc/libgcc2.c -o libgcc/m68000/_fixdfdi.o
sgdk1/buildscripts/gcc-3.4.6/gcc/libgcc2.c: In function `__fixdfdi':
sgdk1/buildscripts/gcc-3.4.6/gcc/libgcc2.c:1277: internal compiler error: in do_SUBST, at combine.c:447
Please submit a full bug report,
with preprocessed source if appropriate.

I just submitted another bug report. Edit: Since gcc3 is no longer supported, the bug was marked as "won't fix".

Edit2: I tried compiling bmp.c with gcc3 and the rest of the library with gcc4, and I got 15 FPS on Gens/GS. gcc3 libgendev.a (with the same options: -O3 ...) gives 15 FPS again so it seems the real problem is in doBlitBlankExt.

Edit3: I isolated the code and filed a new bug.

Stef · Post by **Stef** » Sat Apr 07, 2012 9:34 pm

fdarkangel wrote: Indeed, I was using -O2 to be on the safe side. Here's my -O3 -fno-web -fno-gcse -fno-unit-at-a-time -fomit-frame-pointer

...

Meanwhile, I tried cross-compiling gcc 3.4.6 (with binutils-2.22) and gosh, I bumped into a gcc-3.4.6 bug!

...

I just submitted another bug report. Edit: Since gcc3 is no longer supported, the bug was marked as "won't fix".

Edit2: I tried compiling bmp.c with gcc3 and the rest of the library with gcc4, and I got 15 FPS on Gens/GS. gcc3 libgendev.a (with the same options: -O3 ...) gives 15 FPS again so it seems the real problem is in doBlitBlankExt.

Edit3: I isolated the code and filed a new bug.

Well unfortunately i can say that doBlitBlankExt is just an example of the problem i have with GCC 4, i tried many demos / piece of code and it was always slower with GCC4 than GCC3.
The loop I used in the blit code is nothing special, and if a simple case like this one produces inefficient code, we can be sure we will get inefficient code in a lot of other cases. I'm not sure GCC developers can easily fix these optimization problems, which are probably absent on newer CPU targets... that would be nice, as we could then use the inline keyword again.

About your GCC 3.4.6 compilation bug, maybe this is due to some libraries you tried to compile... i remember i met some problems also so i only built libc.

fdarkangel · Post by **fdarkangel** » Sun Apr 08, 2012 1:10 am

Stef wrote:Well unfortunately i can say that doBlitBlankExt is just an example of the problem i have with GCC 4, i tried many demos / piece of code and it was always slower with GCC4 than GCC3.
The loop I used in the blit code is nothing special, and if a simple case like this one produces inefficient code, we can be sure we will get inefficient code in a lot of other cases. I'm not sure GCC developers can easily fix these optimization problems, which are probably absent on newer CPU targets... that would be nice, as we could then use the inline keyword again.

About your GCC 3.4.6 compilation bug, maybe this is due to some libraries you tried to compile... i remember i met some problems also so i only built libc.

Are the sources for these other demos available? It'd be great if we analyse them as well.
However, this was a very critical optimization bug that we found; it apparently affects nested loops, which is almost guaranteed to affect the observable performance of a program. The same cannot be said for just any optimizer bug. And it may be possible that gcc4 will perform better than gcc3 when this is fixed. Let's wait for the patches and see the results.

It's not because GCC crew doesn't care about these bugs. This is what happens when users don't care about the bugs of a new version, and keep using an unmaintained ancient version. If 68k people abandoned gcc3 when their developers abandoned it and switched to gcc4, somebody would have noticed this target-dependent optimization bug years ago. Reported bugs on a maintained software can be fixed, so please, there's no need to be pessimistic about gcc4.

There is nothing wrong with the file "gcc-3.4.6/gcc/libgcc2.c" in the sense that it should crash a compiler. gcc3, like any other software, has bugs. It's a gcc3 bug, one of many (you can search the GCC bugzilla for an exact list), but the crucial difference is it won't be fixed: GCC developers no longer care about gcc3. So if we're not willing to fix gcc3 bugs and improve it in time, I think we should rather focus on what makes gcc4 worse in some cases. In the long term, we all will benefit from this.

Mixail · Post by **Mixail** » Fri Apr 13, 2012 2:28 pm

How to use PSG?
psg.h

Code: Select all

void PSG_init();

void PSG_write(u8 data);

void PSG_setEnvelope(u8 channel, u8 value);
void PSG_setTone(u8 channel, u16 value);
void PSG_setFrequency(u8 channel, u16 value);

Please give an example of how it works.

What format should the file be? *.psg?

Stef · Post by **Stef** » Fri Apr 13, 2012 10:35 pm

fdarkangel wrote: Are the sources for these other demos available? It'd be great if we analyse them as well.
However, this was a very critical optimization bug that we found; it apparently affects nested loops, which is almost guaranteed to affect the observable performance of a program. The same cannot be said for any optimizer bug. And it may be possible that gcc4 will perform better than gcc3 when this is fixed. Let's wait for the patches and see the results.

It's not because GCC crew doesn't care about these bugs. This is what happens when users don't care about the bugs of a new version, and keep using an unmaintained ancient version. If 68k people abandoned gcc3 when their developers abandoned it and switched to gcc4, somebody would have noticed this target-dependent optimization bug years ago. Reported bugs on a maintained software can be fixed, so please, there's no need to be pessimistic about gcc4.

There is nothing wrong with the file "gcc-3.4.6/gcc/libgcc2.c" in the sense that it should crash a compiler. gcc3, like any other software, has bugs. It's a gcc3 bug, one of many (you can search the GCC bugzilla for an exact list), but the crucial difference is it won't be fixed: GCC developers no longer care about gcc3. So if we're not willing to fix gcc3 bugs and improve it in time, I think we should rather focus on what makes gcc4 worse in some cases. In the long term, we all will benefit from this.

Oh sorry, I didn't see your response. Well, some of the other demos are the samples provided in SGDK. Honestly, I would prefer to use GCC 4, but I don't think there is only one regression in the optimizer that makes it that slow compared to GCC 3 for the m68k. But you are right, GCC 4 is maintained and we can expect more from it than from GCC 3.

Moon-Watcher · Post by **Moon-Watcher** » Mon May 07, 2012 8:17 pm

I ran into some weird behaviour today using fix16 and fix32. They don't seem to work the way they should.

Code: Select all

    char str[8];
    fix16 value = FIX16(123.45);

    fix16ToStr ( value, str, 5 );
    VDP_drawText ( str, 0, 0 );

    fix16ToStr( fix16Int ( value ), str, 5 );
    VDP_drawText ( str, 0, 1 );

    fix16ToStr( fix16Frac ( value ), str, 5 );
    VDP_drawText ( str, 0, 2 );

This code returns 123.437 (or so, but not the exact number). Also, the fractional part is 0.347.

Any clue?

Chilly Willy · Post by **Chilly Willy** » Mon May 07, 2012 10:18 pm

Moon-Watcher wrote:I suffered today an weird behaviour using fix16 and fix32. Seems doesn't work in the way the should.
Code: Select all
    char str[8];
    fix16 value = FIX16(123.45);

    fix16ToStr ( value, str, 5 );
    VDP_drawText ( str, 0, 0 );

    fix16ToStr( fix16Int ( value ), str, 5 );
    VDP_drawText ( str, 0, 1 );

    fix16ToStr( fix16Frac ( value ), str, 5 );
    VDP_drawText ( str, 0, 2 );
This code returns 123.437 (or so, but not the exact number) Also fractional part is 0.347

Any clue?

It's correct - FIX16 only has 6 bits for the fractional part. .45*64 = 28.8. Truncate that to 28 and divide by 64 - 28/64 = .4375.

This is an inherent problem for ANY float using binary representation. You have the exact same problem with float or double, albeit to a smaller fraction due to the greater number of fractional digits. FIX16 will not be very accurate, and is meant for places where speed is more necessary than accuracy. Use FIX32 if you need better accuracy.

EDIT: You clearly transposed the first two digits of the fraction... .347 instead of .437. Looking at the code, there's no way it would have printed .347 for the number you gave.

Moon-Watcher · Post by **Moon-Watcher** » Mon May 07, 2012 10:56 pm

Chilly Willy wrote:
Moon-Watcher wrote:I suffered today an weird behaviour using fix16 and fix32. Seems doesn't work in the way the should.
Code: Select all
    char str[8];
    fix16 value = FIX16(123.45);

    fix16ToStr ( value, str, 5 );
    VDP_drawText ( str, 0, 0 );

    fix16ToStr( fix16Int ( value ), str, 5 );
    VDP_drawText ( str, 0, 1 );

    fix16ToStr( fix16Frac ( value ), str, 5 );
    VDP_drawText ( str, 0, 2 );
This code returns 123.437 (or so, but not the exact number) Also fractional part is 0.347

Any clue?
It's correct - FIX16 only has 6 bits for the fractional part. .45*64 = 28.8. Truncate that to 28 and divide by 64 - 28/64 = .4375.

This is an inherent problem for ANY float using binary representation. You have the exact same problem with float or double, albeit to a smaller fraction due to the greater number of fractional digits. FIX16 will not be very accurate, and is meant for places where speed is more necessary than accuracy. Use FIX32 if you need better accuracy.

EDIT: You clearly transposed the first two digits of the fraction... .347 instead of .437. Looking at the code, there's no way it would have printed .347 for the number you gave.

Right. The correct value is 0.437. Just to add more info: FIX32 returns a fractional part of 0.449.

Oh! So this is how floats work; I understand the problem now. Thanks for the info, Chilly. Your knowledge is always welcome.

Chilly Willy · Post by **Chilly Willy** » Tue May 08, 2012 12:58 am

No problem - I've just dealt with fixed point ints quite a bit. They're not that tough once you figure them out, and handy for quite a few things. If the CPU doesn't have hardware support for floating point, fixed point numbers can be a tremendous boost as long as you keep in mind the limitations.

16.16 fixed point numbers are very common in many mid-90s games, like Doom. 16.16 means you have a 32 bit signed int where the most significant 16 bits are the integer part, and the least significant 16 bits are the fractional part. It's by no means the only way to split it up - FIX32 in sgdk uses 22.10, and my MOD playing code for the 32X uses 18.14 for the voices (you need a bit more precision for the voices since the scaling affects the pitch of the instrument); my MOD playing code fixed point numbers are also unsigned rather than signed since you can't have negative values on sample playback.

So you can see the flexibility inherent in fixed point numbers - the ability to adjust the precision using different splits in the bits allocated to the int vs the fraction; being able to use signed or unsigned numbers; and the speed that goes with integer operations on these older CPUs.

Stef · Post by **Stef** » Tue May 08, 2012 8:44 pm

New version of SGDK released

Not much new stuff, but at least I completed the method documentation, which took me some time, so I hope it will be useful.

Check changes here.

sega16 · Post by **sega16** » Tue May 08, 2012 11:16 pm

Thank you! My ROM size went down from 1.1 MB to 896 KB. Nice update; I already like it and I have been using it for only 5 minutes.