Here's the GH font, decompressed from raw rom bytes with some custom C# code:
Compressed data is 990 bytes, which decompresses to 2400 bytes (a ratio of 2.4, which is pretty good!).
I know, it's not super impressive considering you could just rip it with an emulator, but as far as I can tell it's a new compression format (none of the usual culprits like Nemesis, Kosinski, RLE fit). It seems to use this one format a lot, mostly for background pattern loading. Hopefully I'll now be able to decompress big chunks of the rom and see what's actually stored where.
Algorithm-wise, its kinda an RLE format. It has a variable bit-length encoding that decodes and write sparse values into a buffer, sometimes with an additional flag set. Once it's done that, it loops over subsections and smears values across memory depending on the last value written and whether the additional flag is set.
The other neat thing about it is that it doesn't seem to have any restrictions on length and just operates on a bit stream. GH seems to let it decode a pattern at a time and then send that to VRAM, and the decompressor can (and often does) stop mid-byte, only needing two 16-bit data values and an address to save the entire decompression state able to resume later.
Annotated asm:
Code: Select all
; Custom decompression routine
; Called from cmd2 and cmd3 (so data stream already in A1)
;
; Decompresses into ram b400, where it is ready for compacting and sending to ram/vram
'
; inputs:
; A1 - data stream
; input+output:
; F730 - number of bits remaining in data to process (saved version of D2)
; F732 - outstanding data to process (saved version of D3)
; outputs:
; 128 bytes written to B400..B480
;
; register usage:
; A5 - pointer to output buffer write position
; A4 - pointer to end of output buffer
;
; D2 - how many unprocessed bits currently in registers
; D3 - current input word
; D4 - holds data before being written to output
; D6 - write loop counter (also offset to output buffer)
; D7 - general purpose working bits
;
decompress_sub_loc_00002712:
MOVEM.l A5/A4/D7/D6/D5/D4/D3/D2, -(SP) ; save register state
MOVE.w $FFFFF730.w, D2 ; restore any saved decompression state
MOVE.w $FFFFF732.w, D3
LEA $FFFFB400.w, A5 ; pointer to start of output buffer
LEA $80(A5), A4 ; A4 holds pointer to end of output buffer (ie. B400 + 128 bytes, B480).
; Used to check if we filled the buffer yet.
@repeat_all_00002726:
; top prep
SUBQ.w #5, D2
BGT.w @top_prep_A_00002754
BEQ.w @top_prep_B_00002748
; fall through to top prep C
MOVE.w D2, D7
ADDQ.w #5, D2
LSL.l D2, D3
MOVE.w (A1)+, D3 ; read data stream
NEG.w D7
LSL.l D7, D3
ADDI.w #$000B, D2
MOVE.l D3, D7
SWAP D7
BRA.w @common_top_prep_00002758
@top_prep_B_00002748:
MOVEQ #$10, D2
ROL.w #5, D3
MOVE.w D3, D7
MOVE.w (A1)+, D3 ; read data stream
BRA.w @common_top_prep_00002758
@top_prep_A_00002754:
ROL.w #5, D3
MOVE.w D3, D7
@common_top_prep_00002758:
ANDI.w #$001F, D7 ; mask off bottom 5 bits
LSR.w #1, D7 ; shift right 1
BCS.w @pre_inner_loop_00002770 ; branch carry set? carry set if bit shifted out was 1
; ie. Did we shunt a 1 out?
; Direct Write Nibble
; we're skipping the inner loop
; so do inner loop replacement code
@_DirectWrite_00002762:
MOVE.w D7, D4
MOVE.w D4, (A5)+ ; WRITE output word and advance
MOVE.w D4, D5
ORI.w #$8000, D5 ; set 15th bit
BRA.w @prep_for_shunt_loop_000027F6 ; skip inner loop and go straight to shunting
@pre_inner_loop_00002770:
MOVE.w D7, D4
MOVE.w D4, (A5)+ ; WRITE output and advance
MOVE.w D4, D5
BSET.l #$0F, D5
MOVEQ #0, D6
; inner decode loop?
@innerLoop:
SUBQ.w #2, D2
BGT.w @inner_branch_A_000027A2
BEQ.w @inner_branch_B_00002796
; fall through to inner branch C
; need 2 bits, but only have 1 in register D3
MOVEQ #$F, D2 ; valid bits will becomes 16
ADD.w D3, D3 ;
MOVE.w (A1)+, D3 ; read data stream
ADDX.w D3, D3 ; shift top bit off the end of the word
MOVE.w D3, D7
ADDX.w D7, D7
BRA.w @after_inner_branch_000027A6
@inner_branch_B_00002796: ; inner branch B
MOVEQ #$10, D2
ROL.w #2, D3
MOVE.w D3, D7
MOVE.w (A1)+, D3 ; read data stream
BRA.w @after_inner_branch_000027A6
@inner_branch_A_000027A2: ; inner branch A
ROL.w #2, D3
MOVE.w D3, D7
@after_inner_branch_000027A6:
ANDI.w #3, D7
BEQ.w @inner_continues_000027BA
; write ahead method 1
@_write_ahead_1_marker_000027AE ; just a dummy for a breakpoint
ADDQ.w #6, D7
ADD.w D7, D7
ADD.w D7, D6 ; add to write offset
MOVE.w D5, -$2(A5,D6.w) ; WRITE D5 to write head with offset
BRA.b @innerLoop
@inner_continues_000027BA: ; jmp here if D7 is zero
SUBQ.w #1, D2
BNE.w @unknown_skip_000027C8
; inner reinit
MOVEQ #$10, D2
ADD.w D3, D3
MOVE.w (A1)+, D3 ; read data stream
ROXR.w #1, D3
@unknown_skip_000027C8:
ADDX.w D3, D3
BCC.w @prep_for_shunt_loop_000027F6 ; exit inner loop
SUBQ.w #1, D2
BNE.w @unknown_skip_000027DC
; re-read before write ahead
MOVEQ #$10, D2
ADD.w D3, D3
MOVE.w (A1)+, D3 ; read data stream
ROXR.w #1, D3 ; now shunt Xtend flag back in
@unknown_skip_000027DC:
ADDX.w D3, D3
BCS.w @write_ahead_3_000027EC
; write ahead method 2
@_write_ahead_2_marker_000027E2 ; just a dummy for a breakpoint
ADDI.w #$C, D6
MOVE.w D5, -$2(A5,D6.w) ; WRITE D5 to write head with offset
BRA.b @innerLoop
; write ahead method 3
@write_ahead_3_000027EC:
ADDI.w #$14, D6
MOVE.w D5, -$2(A5,D6.w) ; WRITE D5 to write head with offset
BRA.b @innerLoop ; end of inner loop
@prep_for_shunt_loop_000027F6: ; 1st arrives here
MOVEQ #0, D7
MOVEQ #1, D6
; -------- shunt loop --------
; look for runs of bits set ie 111100 loops 4 times
; and outputs D7+4, D6=1,2,4,8,16,32,64,etc.
@shunt_loop_000027FA:
ADDQ.w #1, D7
ADD.w D6, D6
SUBQ.w #1, D2
BNE.w @has_more_bits_00002816 ; is counter >0?
; ie. are there still bits left?
; branch-not-equal (branch if Zero flag clear)
; fall through here when D2 counter just decremented to 0
MOVEQ #$10, D2
ADD.w D3, D3 ; shunt D3
BCC.w @read_input_then_exit_shunt_00002810 ; did we just shunt 0?
; branch-if-carry-clear
; ie. branch if we shunted 0
MOVE.w (A1)+, D3 ; READ data stream
BRA.b @shunt_loop_000027FA
@read_input_then_exit_shunt_00002810:
MOVE.w (A1)+, D3 ; READ data stream
BRA.w @after_shunt_loop_0000281A ; EXIT SHUNT LOOP
@has_more_bits_00002816: ; more bits to process
ADD.w D3, D3 ; shunt D3
BCS.b @shunt_loop_000027FA ; Did we just shunt out a 1?
; -------- end shunt loop --------
; Prep for write
; has three paths, A, B, C, plus common finish
@after_shunt_loop_0000281A:
SUB.w D7, D2
BGT.w @write_prep_A_00002850 ; is D2>D7?
BEQ.w @write_prep_B_00002840
; fall through to write prep C
SWAP D3 ; clear upper word of D3
CLR.w D3
SWAP D3
ADD.w D7, D2
LSL.l D2, D3
MOVE.w (A1)+, D3 ; read data stream
SUB.w D2, D7
LSL.l D7, D3
MOVEQ #$00000010, D2
SUB.w D7, D2
MOVE.l D3, D7
SWAP D7
BRA.w @unknown_skip_0000285A
@write_prep_B_00002840:
MOVEQ #$00000010, D2
SWAP D3
CLR.w D3
ROL.l D7, D3
MOVE.w D3, D7
MOVE.w (A1)+, D3 ; read data stream
BRA.w @unknown_skip_0000285A
@write_prep_A_00002850: ;
SWAP D3
CLR.w D3
ROL.l D7, D3
MOVE.w D3, D7
SWAP D3
@unknown_skip_0000285A: ; finish write prep
ADD.w D7, D6
SUBQ.w #3, D6
@_dummy_check_if_should_write_loop_0000285E
BCS.w @check_if_buffer_full_00002872 ; ie. branch if D6>0
; -------- write loop --------
; takes the last-written word of the output buffer
; and repeats that D6 times
; if bit 15 of current output set, then start writing that instead
; and clear bit 15
@write_loop_00002862: ; when we reach write loop, D6 tells us how many times to repeat
;
MOVE.w (A5), D7 ; peek head of output buffer
BPL.w @unknown_skip_0000286C ; if bit 15 set, then skip ahead
MOVE.w D7, D5
MOVE.b D5, D4 ; write lower byte of output to D4 before full D4.w is output
@unknown_skip_0000286C:
MOVE.w D4, (A5)+ ; WRITE D4 to output buffer
DBF D6, @write_loop_00002862 ; and repeat write loop
; fall out here when write loop done
; -------- end write loop --------
@check_if_buffer_full_00002872:
CMPA.l A4, A5 ; check current write pointer with address of end of buffer
BCS.w @repeat_all_00002726 ; if not filled buffer, repeat
; for first pattern, buffer is full first time around
MOVE.w D2, $FFFFF730.w ; save decompression state
MOVE.w D3, $FFFFF732.w ; save decompression state
MOVEM.l (SP)+, D2/D3/D4/D5/D6/D7/A4/A5 ; restore register state
RTS
Code: Select all
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace GunstarCompression
{
class Decompressor
{
private byte[] decompressionBuffer;
private int decompressionBufferCursor;
private List<byte> outputData;
private byte[] inputData;
private int inputCursor;
private ushort bitsToProcess;
private int numValidBits;
private ushort d4;
private ushort d5;
private ushort d6;
private int iterationCount;
public Decompressor()
{
decompressionBuffer = new byte[256];
outputData = new List<byte>();
}
public byte[] GetOutput()
{
return outputData.ToArray();
}
public void Decompress(byte[] inputData)
{
this.inputData = inputData;
inputCursor = 0;
decompressionBufferCursor = 0;
d4 = 0;
d5 = 0;
outputData.Clear();
// Initial prep
int numDecompressSteps = inputData[0];
bitsToProcess = (ushort)(((ushort)inputData[1]) << 8);
numValidBits = 8;
inputCursor += 2;
for (int i=0; i<numDecompressSteps; i++)
{
decompressionBufferCursor = 0;
do
{
DecompressStep();
}
while (decompressionBufferCursor < 128);
PrintDecompressionBuffer(i);
CompactToOutput();
}
}
private void DecompressStep()
{
// ---- Top Prep ----
// Read the next data chunk of 5 bits
ushort dataChunk = ReadBits(5);
bool isLsbSet = (dataChunk & 0x1) > 0;
dataChunk = (ushort)((dataChunk >> 1) & 0xF);
if (isLsbSet)
{
// lower bit is set
// Write nibble then do inner loop
byte output = (byte)dataChunk;
WriteAndIncrement(output);
d4 = output;
d5 = (ushort)(output | 0x8000);
d6 = 0; // write offset
// Inner Loop
while (true)
{
// Process next two bits from the stream
/*
ushort bitPair = 0; // D7
if (numValidBits > 2)
{
// inner branch A
bitPair = ReadBits(2);
}
else if (numValidBits == 2)
{
// inner branch B
bitPair = ReadBits(2);
}
else
{
// inner branch C
bitPair = ReadBits(2);
}
*/
ushort bitPair = ReadBits(2);
// After inner branch
if (bitPair == 0)
{
// inner continues
ushort tst = ReadBits(1);
if (tst == 0)
{
break;
}
tst = ReadBits(1);
if (tst == 1)
{
// write ahead method 3
d6 += 20;
WriteWordAtOffset(d5, d6 - 1);
}
else
{
// write ahead method 2
d6 += 12;
WriteWordAtOffset(d5, d6-2);
}
}
else
{
// write ahead method 1
ushort inc = (ushort)((bitPair + 6) * 2);
d6 += inc;
// D5 probably has hi bit set at this point
WriteWordAtOffset(d5, d6-2);
}
}
}
else
{
// lower bit clear
// Direct byte write
byte output = (byte)dataChunk;
WriteAndIncrement(output);
d4 = output;
d5 = (ushort)(output | 0x8000);
}
// ---- Shunt Loop ----
int runLength = 0;
int runMultiplier = 1;
while (true)
{
runLength++;
runMultiplier *= 2;
ushort data = ReadBits(1);
if (data == 0)
{
// Found end of run
break;
}
}
// ---- Prep For Write (After Shunt Loop) ----
/*
// Really this can just be compressed into a single ReadBits call
// But keeping it split into three for now so it can be traced along with the asm version
ushort workingBits = 0;
if (numValidBits > runLength)
{
// Write prep A
workingBits = ReadBits(runLength);
}
else if (numValidBits == runLength)
{
// Write prep B
workingBits = ReadBits(runLength);
}
else
{
// Write prep C
workingBits = ReadBits(runLength);
}
*/
ushort workingBits = ReadBits(runLength);
int loopAmount = (runMultiplier + workingBits) - 3;
loopAmount += 1;
// ---- Write Loop ----
if (loopAmount > 0)
{
// todo: do we need to add one to the runMultiplier because of how the asm handles it's loop counter check at the end of the loop?
for (int i = 0; i < loopAmount; i++)
{
ushort d7 = (ushort)((decompressionBuffer[decompressionBufferCursor] << 8)
| decompressionBuffer[decompressionBufferCursor+1]);
if ((d7 & 0x8000) > 0)
{
d5 = d7;
d4 = (ushort)(d5 & 0xF); // just take data nibble, clear bit 15
}
WriteAndIncrement((byte)d4);
}
}
iterationCount++;
}
private ushort ReadBits(int numRequiredBits)
{
ushort mask = CreateBitMask(numRequiredBits);
ushort output;
if (numValidBits > numRequiredBits)
{
output = (ushort)((bitsToProcess >> (16-numRequiredBits)) & mask);
bitsToProcess = (ushort)(bitsToProcess << numRequiredBits);
numValidBits -= numRequiredBits;
}
else if (numValidBits == numRequiredBits)
{
// Extract required bits
output = (ushort)((bitsToProcess >> (16 - numRequiredBits)) & mask);
// Read in next word to process
bitsToProcess = (ushort)((inputData[inputCursor] << 8) | inputData[inputCursor + 1]);
inputCursor += 2;
numValidBits = 16;
}
else // less that X bits to proccess
{
// TODO: Do we need to create upper and lower masks?
// upper is something like 00000011111100
// lower is something like 00000000000011
// otherwise we might get wrong data mixed in with our output because it's not masked properly
ushort upperMask = CreateBitMask(numValidBits);
// First gather remaining bits
output = (ushort)((bitsToProcess >> (16 - numRequiredBits)) & mask);
int numOustandingBits = numRequiredBits - numValidBits;
// Read in next word to process
bitsToProcess = (ushort)((inputData[inputCursor] << 8) | inputData[inputCursor + 1]);
inputCursor += 2;
numValidBits = 16;
// And extract the outstanding bits
output |= (ushort)((bitsToProcess >> (16 - numOustandingBits)) & mask);
bitsToProcess = (ushort)(bitsToProcess << numOustandingBits);
numValidBits -= numOustandingBits;
}
return output;
}
private static ushort CreateBitMask(int numBits)
{
ushort mask = 0;
for (int i = 0; i < numBits; i++)
mask |= (ushort)(1 << i);
return mask;
}
private void WriteAndIncrement(ushort value)
{
this.decompressionBuffer[decompressionBufferCursor++] = (byte)((value >> 8) & 0xFF);
this.decompressionBuffer[decompressionBufferCursor++] = (byte)((value & 0xFF));
}
private void WriteWordAtOffset(ushort value, int offset)
{
byte lo = (byte)(value & 0xFF);
byte hi = (byte)((value >> 8) & 0xFF);
this.decompressionBuffer[decompressionBufferCursor + offset] = hi;
this.decompressionBuffer[decompressionBufferCursor + offset + 1] = lo;
}
private void CompactToOutput()
{
int cursorPos = 0;
for (int i=0; i<32; i++)
{
byte hi = (byte)((ReadDecompressedWord(ref cursorPos) & 0xF) << 4);
byte lo = (byte)((ReadDecompressedWord(ref cursorPos) & 0xF) << 0);
byte value = (byte)(hi | lo);
outputData.Add(value);
}
}
private ushort ReadDecompressedWord(ref int cursorPos)
{
byte hi = decompressionBuffer[cursorPos];
byte lo = decompressionBuffer[cursorPos + 1];
cursorPos += 2;
return (ushort)((hi << 8) | lo);
}
private void PrintDecompressionBuffer(int stepNum)
{
Console.Out.WriteLine(string.Format("Decompression buffer: (step {0})", stepNum));
int pos = 0;
for (int y=0; y<8; y++)
{
string rawBytes = string.Format(" {0}{1} {2}{3} {4}{5} {6}{7} {8}{9} {10}{11} {12}{13} {14}{15}",
decompressionBuffer[pos] .ToString("X2"),
decompressionBuffer[pos + 1].ToString("X2"),
decompressionBuffer[pos + 2].ToString("X2"),
decompressionBuffer[pos + 3].ToString("X2"),
decompressionBuffer[pos + 4].ToString("X2"),
decompressionBuffer[pos + 5].ToString("X2"),
decompressionBuffer[pos + 6].ToString("X2"),
decompressionBuffer[pos + 7].ToString("X2"),
decompressionBuffer[pos + 8].ToString("X2"),
decompressionBuffer[pos + 9].ToString("X2"),
decompressionBuffer[pos +10].ToString("X2"),
decompressionBuffer[pos +11].ToString("X2"),
decompressionBuffer[pos +12].ToString("X2"),
decompressionBuffer[pos +13].ToString("X2"),
decompressionBuffer[pos +14].ToString("X2"),
decompressionBuffer[pos +15].ToString("X2"));
string nibbles = string.Format("{0} {1} {2} {3} {4} {5} {6} {7}",
PrettyFormatNibble(decompressionBuffer[pos + 1]),
PrettyFormatNibble(decompressionBuffer[pos + 3]),
PrettyFormatNibble(decompressionBuffer[pos + 5]),
PrettyFormatNibble(decompressionBuffer[pos + 7]),
PrettyFormatNibble(decompressionBuffer[pos + 9]),
PrettyFormatNibble(decompressionBuffer[pos +11]),
PrettyFormatNibble(decompressionBuffer[pos +13]),
PrettyFormatNibble(decompressionBuffer[pos +15]) );
Console.Out.WriteLine(rawBytes + " " + nibbles);
pos += 16;
}
}
private static string PrettyFormatNibble(byte value)
{
if (value == 0)
return ".";
return value.ToString("X1");
}
}
}