The MASM Forum Archive 2004 to 2012

General Forums => The Laboratory => Topic started by: jj2007 on September 08, 2008, 08:09:53 AM

Title: qwtoa using xmm registers?
Post by: jj2007 on September 08, 2008, 08:09:53 AM
Dear all,
I have had a look at the dwtoa code, which is very simple and compact and reasonably fast. However, it can't deal with 64-bit integers. I wonder whether it would be possible to convert it to SSE, using xmm registers only? I post a test bed below. Any ideas?

P.S.: I am aware of the 2005 qwtoa thread (http://www.masm32.com/board/index.php?topic=3051.msg23461#msg23461).

include \masm32\include\masm32rt.inc
; .686
; include \masm32\macros\timers.asm
.xmm


.data?
MyInt64 dq ?
NumBuffer dd 20 dup(?)

.data
MyReal10 Real10 12345678901234567890.0
lpBuffer dd NumBuffer

.code
start:

; following 4 lines needed for new xmm loop
  mov edx, offset MyReal10
  fld REAL10 ptr [edx] ; load from mem, 6 cycles
  mov edx, offset MyInt64
  fistp qword ptr [edx] ; the 64-bit integer is now in MyInt64...

  mov eax, 1234567890 ; test value we want to convert to ASCII
  mov edi, lpBuffer ; pointer to output buffer

; This is the core loop of dwtoa:
; -------------- can this loop be translated to SSE2, using only xmm registers?? --------------
  mov ecx, 3435973837 ; 0CCCCCCCDh, magic multiplier
  .While eax > 0
mov ebx, eax
mul ecx
shr edx, 3
mov eax, edx
lea edx, [edx*4+edx]
add edx, edx
sub ebx, edx
add bl, '0'
mov [edi], bl
add edi, 1
  .Endw
; -------------------------------------------------------------------------------------------------------------------------------------

  mov byte ptr [edi], 0       ; terminate the string
; We now have all the digits, but in reverse order
    mov esi, lpBuffer ; pointer to output buffer
    .while esi < edi
      sub edi, 1
      mov al, [esi]
      mov ah, [edi]
      mov [edi], al
      mov [esi], ah
      add esi, 1
    .endw

  invoke MessageBox, 0, lpBuffer, chr$("Our number:"), MB_OK

end start
Title: Re: qwtoa using xmm registers?
Post by: qWord on September 08, 2008, 12:35:47 PM
hi,

I've written an SSE2-implementation of dw2a (just for fun  :dance:) and I think it could be addapt to convert 64bit values. Hope this give you a hint.

regard, qWord


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Convert unsigned DWORD to ASC-string by using      ;
;  SSE2 extension                                     ;
;                                                     ;
; IN:  eax = unsigned dword-value                     ;
;      edx = pointer to buffer, must be algined to 16 ;
;                                                     ;
; OUT: eax = pointer to string in buffer              ;
;                                                     ;
;idea and implementation                              ;
;by qWord, September 2008                             ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
d2a proc
   ;eax == dwNum
   ;edx == lpBuffer
   
    .data
        align 16          ; shr 13    shr 26
        d2a_msk_0   QWORD 3518437209, 2882303762 ;magic-number
        d2a_msk_4   QWORD 10000     , 100000000
        d2a_msk_1   REAL4 1.000001E-4, 1.000001E-3, 1.000001E-2, 1.000001E-1
        d2a_msk_2   REAL4 4 dup(10.0000000)
        d2a_msk_3   BYTE 16 dup(030h)
    .code

    test eax,eax
    .if !ZERO?

        movdqa xmm4,OWORD ptr d2a_msk_0              ;split dword into 3 parts
        movdqa xmm2,OWORD ptr d2a_msk_4              ; e.g.:  1234567890
                                                     ;    ________|________
        movd xmm0,eax                                ;   |        |        |
        pshufd xmm0,xmm0,011001100y                  ;  xmm2     xmm1    xmm0
        pshufd xmm3,xmm0,011001100y                  ;
        pmuludq xmm0,xmm4                            ;   12      3456    7890
        movdqa xmm1,xmm0                             ;
        psrlq xmm0,13+32                             ;
        psrlq xmm1,26+32                             ;
                                                     ;
        pshufd xmm0,xmm0,01001110y                   ;
        punpckhqdq xmm0,xmm1                         ;
        ;xmm0 = xl/10000 | xh/100000000              ;
        ;xmm3 = xl       | xh                        ;
                                                     ;
                                                     ;
        pmuludq xmm2,xmm0                            ;
        psubd xmm3,xmm2                              ;
        ; xmm3 = xl mod 10000 | xh mod 100000000     ;
                                                     ;
        pshufd xmm2,xmm3,01001110y                   ;
        pmuludq xmm2,xmm4                            ;
        psrlq xmm2,13+32                             ;
                                                     ;
        pshufd xmm0,xmm0,010101010y                  ;
        pshufd xmm1,xmm2,000000000y                  ;
        pshufd xmm2,xmm3,000000000y                  ;

        cvtdq2ps xmm0,xmm0                    ;;  extract the numerics
        cvtdq2ps xmm1,xmm1                    ;;
        cvtdq2ps xmm2,xmm2                    ;;  xmm-reg:
                                              ;;
        movdqa xmm3,OWORD ptr d2a_msk_1       ;;   2345.0     2345.0     2345.0     2345.0
        movdqa xmm6,OWORD ptr d2a_msk_2       ;;     |___________|_________|___________|
        movdqa xmm7,OWORD ptr d2a_msk_3       ;;                      |
                                              ;;                   div. by
        movdqa xmm4,xmm3                      ;;                      |
        movdqa xmm5,xmm3                      ;;     |-----------|----|----|-----------|
                                              ;;   10000.0    1000.0     100.0       10.0
        mulps xmm3,xmm2                       ;;
        mulps xmm4,xmm1                       ;;                      |
        mulps xmm5,xmm0                       ;;                      |
                                              ;;                      v
        cvttps2dq xmm2,xmm3                   ;;
        cvttps2dq xmm1,xmm4                   ;;   0.2345     2.345      23.45       234.5
        cvttps2dq xmm0,xmm5                   ;;                      |
                                              ;;                      |
        cvtdq2ps xmm2,xmm2                    ;;             truncate each value
        cvtdq2ps xmm1,xmm1                    ;;    and subtract from non-truncated values
        cvtdq2ps xmm0,xmm0                    ;;                      |
                                              ;;                      V
        subps xmm3,xmm2                       ;;   0.2345     0.345       0.45       0.5
        subps xmm4,xmm1                       ;;                      |
        subps xmm5,xmm0                       ;;                      |
                                              ;;                 mul. by 10.0
        mulps xmm3,xmm6                       ;;                and truncate
        mulps xmm4,xmm6                       ;;                      |
        mulps xmm5,xmm6                       ;;                      V
                                              ;;    2         3           4          5
        cvttps2dq xmm3,xmm3                   ;;
        cvttps2dq xmm4,xmm4                   ;;
        cvttps2dq xmm5,xmm5                   ;;
                                              ;;
        PACKSSDW xmm5,xmm4                    ;;
        PACKSSDW xmm3,xmm3                    ;;
        PACKSSWB xmm5,xmm3                    ;;
                                              ;;
        paddb xmm5,xmm7                       ;;

        pcmpeqb xmm7,xmm5                     ;;;  skip preceding zeros
        xor eax,eax                           ;;;  and return addr. in eax
        pinsrw xmm5,eax,6                     ;;;
        pmovmskb eax,xmm7                     ;;;
        movdqa OWORD ptr [edx],xmm5           ;;;
        not eax                               ;;;
        bsf eax,eax                           ;;;
                                              ;;;
        test eax,32                           ;;;
        .if !ZERO?                            ;;;
            dec eax                           ;;;
        .endif                                ;;;
        add eax,edx                           ;;;
        ret                                   ;;;
        align 16                              ;;;
    .else
        mov eax,030h
        mov DWORD ptr [edx],eax
        mov eax,edx
        ret
    .endif

d2a endp
Title: Re: qwtoa using xmm registers?
Post by: hutch-- on September 08, 2008, 12:43:18 PM
The trick is instead of converting an existing procedure that is "known", simply write a new one that has the extra capacity.
Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 08, 2008, 12:47:24 PM
Quote from: qWord on September 08, 2008, 12:35:47 PM
I've written an SSE2-implementation of dw2a (just for fun  :dance:) and I think it could be addapt to convert 64bit values. Hope this give you a hint.

Thanks, qWord. I hope I'll understand it :dazzled:
Did you time it against good ol' dwtoa?
Title: Re: qwtoa using xmm registers?
Post by: qWord on September 08, 2008, 01:00:38 PM
Quote from: jj2007 on September 08, 2008, 12:47:24 PM
Did you time it against good ol' dwtoa?

yes, on my core2duo it takes ca. 48 clocks.
it's slower than dwtoa(masmlib) on values < 10000,but faster on greater values
Title: Re: qwtoa using xmm registers?
Post by: BlackVortex on September 08, 2008, 02:31:45 PM
Quote from: qWord
it's slower than dwtoa(masmlib) on values > 10000, but faster on greater values
that didn't really make sense  :-D
Title: Re: qwtoa using xmm registers?
Post by: hutch-- on September 08, 2008, 02:33:37 PM
He probably inverted to > instead of < .
Title: Re: qwtoa using xmm registers?
Post by: BlackVortex on September 08, 2008, 02:41:41 PM
Quote from: hutch-- on September 08, 2008, 02:33:37 PM
He probably inverted to > instead of < .
Just teasing him   :cheekygreen:
Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 08, 2008, 03:52:29 PM
this runs in under 1 cycles on my core 2 duo.  It uses a 64k lookup table to do two 16-bit values at the same time.  You could do an 8-bit lookup table, use less memory, and still PROBABLY have it run under 1 cycle.  or close to it.  the lookup table is made of all dword values.  It returns a 16-bit value, which is equivalent to the ascii value for that word.  so offset 0 in the table would have '00'.  Make sense?


mov eax,10000
mov edi,offset dst

mov ebx,eax

and eax,0ffffh
shr ebx,16

mov ecx,[lookup_table+eax*4]
mov edx,[lookup_table+ebx*4]

mov word [edi],cx
mov word [edi+2],dx


EDIT: my bad, that's actually my HEX to ascii routine.  I cut and pasted the wrong one.
Title: Re: qwtoa using xmm registers?
Post by: drizz on September 08, 2008, 06:36:28 PM
here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!

[attachment deleted by admin]
Title: Re: qwtoa using xmm registers?
Post by: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post  ::)
Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 08, 2008, 09:48:06 PM
Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post  ::)

thanks!

Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 09, 2008, 09:16:49 AM
Quote from: drizz on September 08, 2008, 06:36:28 PM
here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!

Looks promising, thanks. It seems that SSEx is not an option, despite of the simplicity of the algo.
Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 09, 2008, 01:03:40 PM
Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post  ::)

what kind of processor do you have?

EDIT:  I am guessing an athlon.

EDIT2: my timings for your unsigned 64-bit to ascii code are 158.685983 cycles for a FULL 64-bit number.  So there are no skipping of the loops.  I have a core 2 duo.
Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 09, 2008, 02:27:04 PM
I ran qwords code.  It runs in 48.504735 cycles on my core 2 duo.  But it's only doing a dword.
Title: Re: qwtoa using xmm registers?
Post by: drizz on September 09, 2008, 04:17:53 PM
ups i found a small bug for 0 qword, comment the first jump to fix
; test eax,eax
mov edi,ecx
mov ebp,ecx
; jz @F


Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)
Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 09, 2008, 08:46:50 PM
Quote from: drizz on September 09, 2008, 04:17:53 PM
ups i found a small bug for 0 qword, comment the first jump to fix
; test eax,eax
mov edi,ecx
mov ebp,ecx
; jz @F


Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)

I tested with a non-zero qword, but I'll fix that as well.
Title: Re: qwtoa using xmm registers?
Post by: drizz on September 09, 2008, 09:07:37 PM
New & improved version! (removed division completely) :wink

Edit: Forgot to mention the timing : 596 clocks

Edit2: buggy code removed
Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 11, 2008, 04:51:16 PM
Hi drizz,

Compliments again, this is likely to become a replacement for dwtoa.

First the good news: It's 3 cycles faster than dwtoa for a dword!

Run *** "qwtoaDrizz.exe" ***********************

512 cycles for U64ToStr 12345678901234567890
512 cycles for U64ToStr 10987654320:98765432
119 cycles for U64ToStr 1234567890
122 cycles for dwtoa    1234567890

Now the bad news: It seems I stumbled over a number that does not translate correctly...

.data
My64a QWORD 12345678901234567890
My64b QWORD 10987654321098765432
My64c QWORD 1234567890
My32c DWORD 1234567890


File with timing code attached.

[attachment deleted by admin]
Title: Re: qwtoa using xmm registers?
Post by: drizz on September 11, 2008, 07:05:32 PM
Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh
U64ToStr proc QwVal:QWORD, pszString:DWORD
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
push ebp
push esi
push edi
push ebx
mov esi,[esp+1*4][4*4]; a0
mov edi,[esp+2*4][4*4]; a1
mov ebp,[esp+3*4][4*4]; buffer pointer
test edi,edi
jz @F
;; QWORD conversion loop
@@U64Cvt:
mov eax,0CCCCCCCDh; = b0
mul esi; get a0*b0 = d1:d0
mov ecx,edx;d1
mov eax,0CCCCCCCDh; = b0
xor ebx,ebx
mul edi; get a1*b0 = e1:e0
add ecx,eax;e0
adc ebx,edx;e1
mov eax,0CCCCCCCCh; =b1
mul esi; get a0*b1 = f1:f0
add ecx,eax;f0
adc ebx,edx;f1
mov ecx,0
mov eax,0CCCCCCCCh; =b1
adc ecx,ecx
mul edi; get a1*b1 = g1:g0
add eax,ebx;g0
adc edx,ecx;g1
shrd eax,edx,3
shr edx,3;;------ quotient in edx::eax
; upper dwords will be the same after multiplication
lea ecx,[eax*4+eax]
mov ebx,esi
lea ecx,[ecx+ecx-'0']
sub ebx,ecx
mov esi,eax
mov [ebp],bl
mov edi,edx
add ebp,1
test edi,edi
jnz @@U64Cvt
@@:
;; we are here if HI-DWORD is 0
mov eax,esi
mov ebx,esi
mov edi,0CCCCCCCDh
;; Lower Dword conversion loop
@@U32Cvt:
mul edi
shr edx,3
lea ecx,[edx*4+edx]
lea ecx,[ecx+ecx-'0']
sub ebx,ecx
mov eax,edx
mov [ebp],bl
mov ebx,edx
add ebp,1
test eax,eax
jnz @@U32Cvt
;; Calculate output string length and reverse it
mov edi,[esp+3*4][4*4]
mov [ebp],al
mov eax,ebp
sub eax,edi
;; reverse buffer digits
@@: sub ebp,1
mov bl,[edi]
mov cl,[ebp]
mov [ebp],bl
mov [edi],cl
add edi,1
cmp edi,ebp
jb @B
;; string length in eax
pop ebx
pop edi
pop esi
pop ebp
ret 3*4
OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
U64ToStr endp

Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 12, 2008, 05:12:42 AM
Quote from: drizz on September 11, 2008, 07:05:32 PM
Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh
Yep, you are making good progress. Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.
invoke GetFileSize, hFile, lpFileSizeHigh ; address of high-order word for file size

Some code rearrangements make it a bit faster, on a Core 2 Celeron M:
386 cycles for qw2Str   12345678901234567890
401 cycles for U64ToStr 12345678901234567890
386 cycles for qw2Str   18446744073709551615
402 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str   18446744073709551615
403 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str   10987654321098765432
402 cycles for U64ToStr 10987654321098765432
110 cycles for qw2Str   1234567890
117 cycles for U64ToStr 1234567890
124 cycles dwtoa, dw    1234567890
73 cycles qw SSE, dw    001234567890


Full code attached.

qw2Str proc ; ptr qword in eax, ptr buffer in edx
push ebp ; credits to  drizz
push esi
push edi
push ebx
push edx ; will be popped as edi
mov esi, [eax]
if 0
mov edi, [eax+4] ; 2-3 cycles slower
else
add eax, 4 ; 2 bytes longer
mov edi, [eax]
endif
mov ebp, edx ; buffer pointer, also on stack
test edi, edi
jz @F

  @@U64Cvt: ; QWORD conversion loop
mov eax, 0CCCCCCCDh ; = b0
mul esi ; get a0*b0 = d1:d0
mov eax, 0CCCCCCCDh ; = b0 (mov eax up saves 1 cycle)
mov ecx, edx ; d1
xor ebx, ebx
mul edi ; get a1*b0 = e1:e0
add ecx, eax ; e0
adc ebx, edx ; e1
mov eax, 0CCCCCCCCh ; =b1
mul esi ; get a0*b1 = f1:f0
add ecx, eax ; f0
adc ebx, edx ; f1
push 0 ; 5 cycles faster, 3 bytes
pop ecx ; shorter than mov ecx, 0
mov eax, 0CCCCCCCCh ; =b1
adc ecx, ecx
mul edi ; get a1*b1 = g1:g0
add eax, ebx ; g0
adc edx, ecx ; g1
shrd eax, edx, 3
shr edx, 3 ; ------ quotient in edx::eax

; upper dwords will be the same after multiplication
lea ecx, [eax*4+eax] ; ecx=5*eax
if 1
neg ecx ; same speed, 2 bytes shorter
lea ebx, [esi+2*ecx+"0"]
else
mov ebx, esi
lea ecx, [ecx+ecx-'0']
sub ebx, ecx
endif
mov esi, eax
mov [ebp], bl
mov edi, edx
add ebp, 1 ; inc ebp costs 1-2 cycles here
test edi, edi
jnz @@U64Cvt

  @@: ; we are here if HI-DWORD is 0
mov eax, esi
mov ebx, esi
mov edi, 0CCCCCCCDh

; Lower Dword conversion loop
  @@U32Cvt: mul edi
shr edx, 3
lea ecx, [edx*4+edx] ; ecx=edx*5
if 1
neg ecx ; 3 cycles faster
lea ebx, [ebx+2*ecx+"0"]
else
lea ecx, [ecx+ecx-'0'] ; ecx=edx*10-48
sub ebx, ecx
endif
mov eax, edx ; sub ebx, mov eax 2 cycles faster than inverse sequence
mov [ebp], bl
mov ebx, edx
inc ebp ; add ebp, 1 here a bit slower and 2 bytes more
test eax, eax
jnz @@U32Cvt

; Calculate output string length and reverse it
pop edi ; ex mov edi, [esp+3*4][4*4]
mov [ebp], al
mov eax, ebp
sub eax, edi

; reverse buffer digits (ca. 30 cycles for 20 digits)
  @@: sub ebp, 1 ; dec ebp ca. 10 cycles slower here!
mov bl, [edi]
mov cl, [ebp]
mov [ebp], bl
mov [edi], cl
add edi, 1 ; inc edi ca. 10 cycles slower here!
cmp edi, ebp
jb @B

; string length in eax
pop ebx
pop edi
pop esi
pop ebp
ret
qw2Str endp

[attachment deleted by admin]
Title: Re: qwtoa using xmm registers?
Post by: drizz on September 13, 2008, 01:52:23 AM
Quote from: jj2007 on September 12, 2008, 05:12:42 AM
...Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.
Of course, feel free (anyone) to change it as you like and put it to good use  :U
Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 13, 2008, 06:29:34 AM
Quote from: drizz on September 13, 2008, 01:52:23 AM
Of course, feel free (anyone) to change it as you like and put it to good use  :U

Done, see the float$ thread (http://www.masm32.com/board/index.php?topic=9756.msg72501#msg72501) :U