Unicode is gigantic so displaying UTF-8 text is probably out of the question. The trick here is that you have your own charset as usual, and then you convert Unicode codepoints to your own charset on the fly as you're rendering the text (and replace anything unavailable with a placeholder, e.g. a question mark). This helps keep resources usage low (just using enough for whatever you may need) while still being able to use UTF-8.
Anyway, here's how to do it in C:
Code: Select all
/*
 * Decode the single UTF-8 sequence that starts at `text` and return its
 * Unicode codepoint.
 *
 * The lead byte alone selects the sequence length:
 *   below 0x80 -> 1 byte, below 0xE0 -> 2 bytes,
 *   below 0xF0 -> 3 bytes, anything else -> 4 bytes.
 *
 * No validation is performed: continuation bytes are masked with 0x3F
 * without being checked, and every byte the selected length calls for is
 * read unconditionally, so the caller must pass well-formed input.
 */
unsigned decode_utf8(const char *text)
{
    /* Work on unsigned bytes so values >= 0x80 never appear negative. */
    const unsigned char *bytes = (const unsigned char *)text;
    unsigned lead = bytes[0];
    unsigned trailing;   /* number of continuation bytes that follow */
    unsigned codepoint;  /* accumulates the payload bits */

    if (lead < 0x80) {
        /* Plain ASCII: the byte is the codepoint. */
        return lead;
    } else if (lead < 0xE0) {
        trailing = 1;
        codepoint = lead & 0x1F;
    } else if (lead < 0xF0) {
        trailing = 2;
        codepoint = lead & 0x0F;
    } else {
        trailing = 3;
        codepoint = lead & 0x07;
    }

    /* Each continuation byte contributes its low six bits. */
    for (unsigned i = 1; i <= trailing; i++) {
        codepoint = (codepoint << 6) | (bytes[i] & 0x3Fu);
    }

    return codepoint;
}
/*
 * Return how many bytes the UTF-8 encoding of `codepoint` occupies:
 * 1 below 0x80, 2 below 0x800, 3 below 0x10000, and 4 otherwise.
 */
unsigned codepoint_size(unsigned codepoint)
{
    unsigned size = 1;

    /* Each threshold crossed costs one more byte. */
    if (codepoint >= 0x80) {
        size++;
    }
    if (codepoint >= 0x800) {
        size++;
    }
    if (codepoint >= 0x10000) {
        size++;
    }

    return size;
}
Code: Select all
; Decodes one UTF-8 sequence and advances the pointer past it.
; The lead byte alone selects the length (1 to 4 bytes). Nothing is
; validated: continuation bytes are consumed unconditionally and only
; their low six bits are kept.
;
; input a6.l .... Pointer to character
; output a6.l ... Pointer to next character
; output d7.l ... Unicode codepoint
DecodeUTF8:
    moveq   #0, d7              ; clear all 32 bits of the result
    move.b  (a6)+, d7           ; fetch lead byte (sets N from bit 7)
    bmi.s   @TwoBytes           ; bit 7 set -> multi-byte sequence
    rts                         ; ASCII: the byte is the codepoint

@TwoBytes:
    cmp.b   #$E0, d7
    bhs.s   @ThreeBytes         ; lead >= $E0 -> three bytes or more
    and.b   #$1F, d7            ; keep the 5 payload bits of the lead
    lsl.w   #8, d7              ; move them to the high byte of the word
    move.b  (a6)+, d7           ; continuation byte into the low byte
    lsl.b   #2, d7              ; shift its two marker bits out of the byte
    lsr.w   #2, d7              ; close the gap: lead<<6 | payload
    rts

@ThreeBytes:
    cmp.b   #$F0, d7
    bhs.s   @FourBytes          ; lead >= $F0 -> four bytes
    and.b   #$0F, d7            ; keep the 4 payload bits of the lead
    swap    d7                  ; park them in the upper word
    move.b  (a6)+, d7           ; first continuation byte
    lsl.b   #2, d7              ; drop its marker bits
    lsl.w   #6, d7              ; payload now in the high byte of the low word
    move.b  (a6)+, d7           ; second continuation byte
    lsl.b   #2, d7              ; drop its marker bits
    lsl.w   #2, d7              ; pack both payloads against bit 16
    lsr.l   #4, d7              ; right-align: lead<<12 | c1<<6 | c2
    rts

@FourBytes:
    and.b   #$07, d7            ; keep the 3 payload bits of the lead
    lsl.w   #8, d7
    move.b  (a6)+, d7           ; first continuation byte
    lsl.b   #2, d7              ; drop its marker bits
    lsr.w   #2, d7              ; low word = lead<<6 | c1
    swap    d7                  ; park that in the upper word (operand was missing)
    move.b  (a6)+, d7           ; second continuation byte
    lsl.b   #2, d7              ; drop its marker bits
    lsl.w   #6, d7              ; payload now in the high byte of the low word
    move.b  (a6)+, d7           ; third continuation byte
    lsl.b   #2, d7              ; drop its marker bits
    lsl.w   #2, d7              ; pack both payloads against bit 16
    lsr.l   #4, d7              ; right-align: lead<<18 | c1<<12 | c2<<6 | c3
    rts