News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Unicode to UTF-8

Started by Larry Hammick, December 18, 2009, 08:20:56 PM

Previous topic - Next topic

Larry Hammick

Here's a start on it:
;-----------------------------------------------------------------------
; UNI_to_UTF8 (dest, src, charcount)
; Converts UTF-16 ("wide") text to UTF-8.
;
; In:    [ebp+8]  dest      - output buffer; sized by the caller (the
;                             worst case is 3 output bytes per input word)
;        [ebp+12] src       - input, an array of 16-bit code units
;        [ebp+16] charcount - number of input words, or -1 when the input
;                             is nul-terminated (the nul is not converted)
; Out:   eax = number of bytes written to dest (no terminator is written)
; Stack: the three arguments are removed by the callee (ret 12)
; Saves: ebx, esi, edi, ebp.   Clobbers: eax, ecx, flags.
; Note:  scasw/lodsw/stosb/stosw are used, so the direction flag must be
;        clear on entry.
;
; A high surrogate (D800h-DBFFh) immediately followed by a low surrogate
; (DC00h-DFFFh) is combined into one code point and written as a single
; 4-byte sequence.  An unpaired surrogate is written as a 3-byte sequence,
; exactly like any other word >= 0800h.
;-----------------------------------------------------------------------
UNI_to_UTF8:
    push ebp
    mov  ebp,esp
    push ebx
    push esi
    push edi
    mov  esi,[ebp+12]       ;source
    mov  ebx,[ebp+16]       ;count of wide characters
    cmp  ebx,-1
    jne  short @F
    ; charcount = -1: measure the nul-terminated input
    mov  ecx,ebx            ;-1 = scan without a limit
    mov  edi,esi            ;source
    xor  eax,eax
    repne scasw             ;stops after the nul; ecx = -(n+2)
    dec  ebx                ;to -2
    sub  ebx,ecx            ;ebx = n, the input size in wide characters
@@: mov  edi,[ebp+8]        ;destination

UU_loop:
    ; invariant: ebx = words not yet read, esi -> next word, edi -> next byte
    dec  ebx
    js   UU_done
    xor  eax,eax
    lodsw                   ;eax = next word, zero-extended
    test ah,11111000b
    jnz  short UU_3         ;bits 11-15 in use: 0800h or above
    test ax,0000011110000000b
    jnz  short UU_2         ;bits 7-10 in use: 0080h-07FFh
    stosb                   ;0000h-007Fh is copied as one byte
    jmp  short UU_loop

UU_2:                       ;110xxxxx 10xxxxxx
    shl  eax,2              ;ah = bits 6-10, al = bits 0-5 shifted up by 2
    shr  al,2               ;al = bits 0-5
    xchg al,ah              ;the six low bits go in the latter output byte
    or   ax,1000000011000000b
    stosw                   ;stores al (lead byte) first, then ah
    jmp  short UU_loop

UU_3:
    ; Is this word a high surrogate with a low surrogate right after it?
    mov  ecx,eax
    and  ecx,0FC00h
    cmp  ecx,0D800h
    jne  short UU_3bmp      ;not a high surrogate
    test ebx,ebx
    jz   short UU_3bmp      ;no input left, so it cannot be paired
    movzx ecx,word ptr [esi]
    sub  ecx,0DC00h
    cmp  ecx,03FFh
    ja   short UU_3bmp      ;next word is not a low surrogate
    ; code point = 10000h + (high AND 3FFh)*400h + (low AND 3FFh)
    and  eax,03FFh
    shl  eax,10
    lea  eax,[eax+ecx+10000h]
    add  esi,2              ;the low surrogate has been consumed
    dec  ebx
    ; 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    mov  ecx,eax
    shr  ecx,18             ;bits 18-20
    or   cl,11110000b
    mov  [edi],cl
    mov  ecx,eax
    shr  ecx,12             ;bits 12-17
    and  cl,00111111b
    or   cl,10000000b
    mov  [edi+1],cl
    mov  ecx,eax
    shr  ecx,6              ;bits 6-11
    and  cl,00111111b
    or   cl,10000000b
    mov  [edi+2],cl
    and  al,00111111b       ;bits 0-5
    or   al,10000000b
    mov  [edi+3],al
    add  edi,4
    jmp  UU_loop

UU_3bmp:                    ;1110xxxx 10xxxxxx 10xxxxxx
    mov  ch,al              ;ch will hold bits 0-5  (last output byte)
    shr  eax,6
    mov  cl,al              ;cl will hold bits 6-11 (middle output byte)
    shr  eax,6              ;al = bits 12-15
    and  cx,0011111100111111b
    or   cx,1000000010000000b
    or   al,11100000b
    stosb                   ;lead byte
    xchg eax,ecx
    stosw                   ;stores cl first, then ch
    jmp  UU_loop

UU_done:
    xchg eax,edi            ;eax = end of output
    sub  eax,[ebp+8]        ;eax = bytes written
    pop  edi
    pop  esi
    pop  ebx
    pop  ebp
    ret  12

That's the easy part. The reverse, expanding UTF-8 to Unicode, will require quite a few tests for invalid input.

I tested the above with some Russian text (400h-4FFh in Unicode) and Hindi text (900h-9FFh), and so far so good.

Larry Hammick

Same thing but with dword input characters from the 31-bit Universal Character Set (UCS-4):
;-----------------------------------------------------------------------
; USC_to_UTF8 (dest, src, charcount)
; Converts 31-bit UCS-4 characters (one per dword) to UTF-8, using
; sequences of 1 to 6 bytes.
;
; In:    dest      - output buffer; sized by the caller (the worst case
;                    is 6 output bytes per input dword)
;        src       - input, an array of dwords
;        charcount - number of input dwords, or -1 when the input is
;                    nul-terminated (the nul is not converted)
; Out:   eax = number of bytes written to dest, or
;        eax = -1 if the input includes an invalid dword >= 80000000h
;              (everything before that dword has already been written)
; Saves: ebx, esi, edi (via USES).   Clobbers: eax, ecx, edx, flags.
; Note:  scasd/lodsd/stos* are used, so the direction flag must be clear
;        on entry.  Calls fan_eax_to_edx_3.
;-----------------------------------------------------------------------
USC_to_UTF8 proc uses ebx esi edi dest:ptr,src:ptr,charcount:dword
    mov  esi,src
    mov  ebx,charcount
    cmp  ebx,-1
    jne  short @F
    ; charcount = -1: measure the nul-terminated input
    mov  ecx,ebx       ;-1 = scan without a limit
    mov  edi,esi       ;source
    xor  eax,eax
    repne scasd        ;stops after the nul; ecx = -(n+2)
    dec  ebx           ;to -2
    sub  ebx,ecx       ;ebx is now the input size in dwords
@@: mov  edi,dest
UCS_loop:
    ; invariant: ebx = dwords not yet read, esi -> next dword, edi -> next byte
    dec  ebx
    js   UCS_done
    lodsd
    bsr ecx,eax        ;cl = index of the highest set bit
    jz  UCS_1          ;eax = 0: bsr leaves ecx undefined, so do not
                       ; dispatch on it; a zero is a single 00h byte
    cmp cl,6
    jbe short UCS_1    ;up to  7 bits
    cmp cl,10
    jbe short UCS_2    ;up to 11 bits
    cmp cl,15
    jbe short UCS_3    ;up to 16 bits
    cmp cl,20
    jbe short UCS_4    ;up to 21 bits
    cmp cl,25
    jbe short UCS_5    ;up to 26 bits
    cmp cl,30
    jbe short UCS_6    ;up to 31 bits
    or eax,-1          ;bit 31 is set: invalid input
    jmp UCS_ret
UCS_6:        ;output will be:
              ; 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    call fan_eax_to_edx_3
      ;18 bits done, 13 remaining
    shl eax,2
    shr al,2           ;al = bits 18-23
    or  al,10000000b
    mov dl,al          ;edx now holds the last four output bytes
      ;7 bits remaining
    shr eax,6          ;ah = bit 30, al = bits 24-29 shifted up by 2
    shr al,2           ;al = bits 24-29
    or ax,1111110010000000b
    xchg al,ah         ;lead byte goes first
    stosw
    xchg eax,edx
    stosd
    jmp UCS_loop
UCS_1:                 ;0xxxxxxx
    stosb
    jmp UCS_loop
UCS_2:                 ;110xxxxx 10xxxxxx
    shl  eax,2         ;ah = bits 6-10, al = bits 0-5 shifted up by 2
    shr  al,2          ;al = bits 0-5
    xchg al,ah         ;lead byte goes first
    or   ax,1000000011000000b
    stosw
    jmp UCS_loop
UCS_3:         ;1110xxxx 10xxxxxx 10xxxxxx
    mov  ch,al         ;ch will hold bits 0-5  (last output byte)
    shr  eax,6
    mov  cl,al         ;cl will hold bits 6-11 (middle output byte)
    shr  eax,6         ;al = bits 12-15
    and  cx,0011111100111111b
    or   cx,1000000010000000b
    or   al,11100000b
    stosb              ;lead byte
    xchg eax,ecx
    stosw              ;stores cl first, then ch
    jmp UCS_loop
UCS_4:          ;11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    call fan_eax_to_edx_3
    mov dl,al          ;al = bits 18-20
    and dl,00000111b
    or  dl,11110000b   ;lead byte in dl, continuation bytes above it
    xchg eax,edx
    stosd
    jmp UCS_loop
UCS_5:          ;111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    call fan_eax_to_edx_3
    shl eax,2
    shr al,2           ;al = bits 18-23, ah = bits 24-25
    or  ax,1111100010000000b
    mov dl,al          ;edx now holds the last four output bytes
    mov al,ah
    stosb              ;lead byte
    xchg eax,edx
    stosd
    jmp  UCS_loop
UCS_done:
    xchg eax,edi       ;eax = end of output
    sub  eax,dest      ;eax = bytes written
UCS_ret:
    ret
USC_to_UTF8 endp

;-----------------------------------------------------------------------
; fan_eax_to_edx_3
; Peels the 18 low bits of eax off as three UTF-8 continuation bytes
; (10xxxxxx) and packs them into bytes 1-3 of edx, ready for stosd:
;
;   edx byte 3 = 80h | bits 0-5     (stored last)
;   edx byte 2 = 80h | bits 6-11
;   edx byte 1 = 80h | bits 12-17
;   edx byte 0 = 0                  (left free for the caller)
;
; In:    eax = value to split
; Out:   eax = input shifted right by 18, edx as above, ecx = 0
; Clobbers: flags
;-----------------------------------------------------------------------
fan_eax_to_edx_3:
    xor  ecx,ecx            ;ecx is returned as zero
    REPEAT 3                ;expanded three times at assembly time
    mov  dl,al
    and  dl,00111111b       ;keep the six low bits
    or   dl,10000000b       ;mark as a continuation byte
    shr  eax,6              ;drop the bits just taken
    shl  edx,8              ;make room in dl for the next byte
    ENDM
    ret