News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

qwtoa using xmm registers?

Started by jj2007, September 08, 2008, 08:09:53 AM

Previous topic - Next topic

drizz

Oops, I found a small bug for a zero qword; comment out the first jump to fix it:
; test eax,eax
mov edi,ecx
mov ebp,ecx
; jz @F


Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)
The truth cannot be learned ... it can only be recognized.

Mark_Larson

Quote from: drizz on September 09, 2008, 04:17:53 PM
Oops, I found a small bug for a zero qword; comment out the first jump to fix it:
; test eax,eax
mov edi,ecx
mov ebp,ecx
; jz @F


Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)

I tested with a non-zero qword, but I'll fix that as well.
BIOS programmers do it fastest, hehe.  ;)

My Optimization webpage
http://www.website.masmforum.com/mark/index.htm

drizz

#17
New & improved version! (removed division completely) :wink

Edit: Forgot to mention the timing : 596 clocks

Edit2: buggy code removed
The truth cannot be learned ... it can only be recognized.

jj2007

Hi drizz,

Compliments again, this is likely to become a replacement for dwtoa.

First the good news: It's 3 cycles faster than dwtoa for a dword!

Run *** "qwtoaDrizz.exe" ***********************

512 cycles for U64ToStr 12345678901234567890
512 cycles for U64ToStr 10987654320:98765432
119 cycles for U64ToStr 1234567890
122 cycles for dwtoa    1234567890

Now the bad news: It seems I stumbled over a number that does not translate correctly (the second line above prints 10987654320:98765432, but My64b below is 10987654321098765432)...

.data
; Test operands matching the values shown in the timing run above.
My64a QWORD 12345678901234567890 ; 20 digits, high dword is non-zero
My64b QWORD 10987654321098765432 ; the run above prints this one as 10987654320:98765432
My64c QWORD 1234567890 ; fits in the low dword (high dword is zero)
My32c DWORD 1234567890 ; same value as a DWORD (presumably the dwtoa input - confirm against the attached test code)


File with timing code attached.

[attachment deleted by admin]

drizz

Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh
;-----------------------------------------------------------------------
; U64ToStr - convert an unsigned 64-bit value to a decimal ASCII string.
;
; In:    QwVal     (stack) 64-bit value; low dword first, high dword second
;        pszString (stack) address of the output buffer
; Out:   eax = number of digits written (the zero terminator is not counted)
;        buffer holds the digits, most significant first, followed by a 0 byte
; Saves: ebp, esi, edi, ebx (pushed on entry, popped on exit)
; Uses:  eax, ecx, edx are overwritten
; Stack: the 12 bytes of arguments are removed on return (ret 3*4)
;
; No bounds check is done on the buffer. The largest 64-bit value has
; 20 decimal digits, so the caller needs room for 21 bytes.
;
; Method: each step divides the value by 10 by multiplying with
; 0CCCCCCCC_CCCCCCCDh and shifting the upper 64 bits of the 128-bit product
; right by 3. Digits are produced least significant first and the buffer is
; reversed in place at the end. Once the high dword is zero, a shorter
; 32-bit loop finishes the conversion.
;
; The assembler-generated prologue/epilogue is switched off: the procedure
; saves registers itself and reads its arguments relative to esp.
;-----------------------------------------------------------------------
U64ToStr proc QwVal:QWORD, pszString:DWORD
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
push ebp
push esi
push edi
push ebx
; Argument offsets: [4*4] skips the four registers pushed above, and the
; n*4 term skips the return address (n=1) and the preceding argument dwords.
mov esi,[esp+1*4][4*4]; a0 = low dword of QwVal
mov edi,[esp+2*4][4*4]; a1 = high dword of QwVal
mov ebp,[esp+3*4][4*4]; buffer pointer (write cursor)
test edi,edi
jz @F ; high dword already zero: go straight to the 32-bit loop
;; QWORD conversion loop
;; a = a1:a0 (edi:esi), b = b1:b0 = 0CCCCCCCCh:0CCCCCCCDh
;; Only the upper 64 bits of a*b are needed; ecx collects bits 32..63 of
;; the product purely to obtain the carries into the upper half.
@@U64Cvt:
mov eax,0CCCCCCCDh; = b0
mul esi; get a0*b0 = d1:d0 (d0 is discarded)
mov ecx,edx;d1
mov eax,0CCCCCCCDh; = b0
xor ebx,ebx ; ebx will accumulate bits 64..95
mul edi; get a1*b0 = e1:e0
add ecx,eax;e0
adc ebx,edx;e1 plus carry from d1+e0
mov eax,0CCCCCCCCh; =b1
mul esi; get a0*b1 = f1:f0
add ecx,eax;f0
adc ebx,edx;f1 plus carry; may itself carry out
mov ecx,0 ; mov leaves the flags alone, so the carry from the adc above survives
mov eax,0CCCCCCCCh; =b1
adc ecx,ecx ; ecx = carry out of the bits 64..95 sum
mul edi; get a1*b1 = g1:g0
add eax,ebx;g0
adc edx,ecx;g1 -> edx:eax = upper 64 bits of a*b
shrd eax,edx,3
shr edx,3;;------ quotient in edx::eax
; upper dwords will be the same after multiplication
; (the digit a - 10*quotient is less than 10, so working on the low dwords
; and storing the low byte is sufficient)
lea ecx,[eax*4+eax] ; ecx = 5 * low dword of quotient
mov ebx,esi
lea ecx,[ecx+ecx-'0'] ; ecx = 10*quotient - '0'
sub ebx,ecx ; bl = remainder + '0' = ASCII digit
mov esi,eax ; quotient becomes the new value (low dword)
mov [ebp],bl
mov edi,edx ; quotient becomes the new value (high dword)
add ebp,1
test edi,edi
jnz @@U64Cvt ; repeat while the high dword is non-zero
@@:
;; we are here if HI-DWORD is 0
mov eax,esi
mov ebx,esi ; ebx keeps the current value for the remainder calculation
mov edi,0CCCCCCCDh ; multiplier used for the 32-bit divide by 10
;; Lower Dword conversion loop
;; Runs at least once, so a value of zero yields the single digit '0'.
@@U32Cvt:
mul edi
shr edx,3 ; edx = value / 10
lea ecx,[edx*4+edx] ; ecx = 5 * quotient
lea ecx,[ecx+ecx-'0'] ; ecx = 10*quotient - '0'
sub ebx,ecx ; bl = remainder + '0' = ASCII digit
mov eax,edx ; next value for mul
mov [ebp],bl
mov ebx,edx ; next value for the remainder calculation
add ebp,1
test eax,eax
jnz @@U32Cvt ; repeat until the quotient is zero
;; Calculate output string length and reverse it
mov edi,[esp+3*4][4*4] ; edi = start of buffer
mov [ebp],al ; eax is zero here (loop exit condition): write the terminator
mov eax,ebp
sub eax,edi ; eax = number of digits
;; reverse buffer digits
;; edi walks forward from the first digit, ebp walks back from the last;
;; bytes are swapped until the two pointers meet or cross.
@@: sub ebp,1
mov bl,[edi]
mov cl,[ebp]
mov [ebp],bl
mov [edi],cl
add edi,1
cmp edi,ebp
jb @B
;; string length in eax
pop ebx
pop edi
pop esi
pop ebp
ret 3*4 ; discard the three argument dwords
OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
U64ToStr endp

The truth cannot be learned ... it can only be recognized.

jj2007

Quote from: drizz on September 11, 2008, 07:05:32 PM
Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh
Yep, you are making good progress. Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.
invoke GetFileSize, hFile, lpFileSizeHigh ; address of high-order word for file size

Some code rearrangements make it a bit faster, on a Core 2 Celeron M:
386 cycles for qw2Str   12345678901234567890
401 cycles for U64ToStr 12345678901234567890
386 cycles for qw2Str   18446744073709551615
402 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str   18446744073709551615
403 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str   10987654321098765432
402 cycles for U64ToStr 10987654321098765432
110 cycles for qw2Str   1234567890
117 cycles for U64ToStr 1234567890
124 cycles dwtoa, dw    1234567890
73 cycles qw SSE, dw    001234567890


Full code attached.

;-----------------------------------------------------------------------
; qw2Str - convert an unsigned 64-bit value to a decimal ASCII string.
;
; In:    eax = address of the qword (low dword at [eax], high dword at [eax+4])
;        edx = address of the output buffer
; Out:   eax = number of digits written (the zero terminator is not counted)
;        buffer holds the digits, most significant first, followed by a 0 byte
; Saves: ebp, esi, edi, ebx (pushed on entry, popped on exit)
; Uses:  eax, ecx, edx are overwritten
; Stack: no stack arguments; plain ret
;
; No bounds check is done on the buffer. The largest 64-bit value has
; 20 decimal digits, so the caller needs room for 21 bytes.
;
; Method: each step divides by 10 by multiplying with 0CCCCCCCC_CCCCCCCDh
; and shifting the upper 64 bits of the 128-bit product right by 3. Digits
; are produced least significant first and the buffer is reversed in place
; at the end.
;
; The if 0 / if 1 blocks choose between alternative instruction sequences;
; the inactive branches are kept next to the author's timing notes.
;-----------------------------------------------------------------------
qw2Str proc ; ptr qword in eax, ptr buffer in edx
push ebp ; credits to  drizz
push esi
push edi
push ebx
push edx ; will be popped as edi
mov esi, [eax] ; a0 = low dword of the value
if 0
mov edi, [eax+4] ; 2-3 cycles slower
else
add eax, 4 ; 2 bytes longer
mov edi, [eax] ; a1 = high dword of the value
endif
mov ebp, edx ; buffer pointer, also on stack
test edi, edi
jz @F ; high dword already zero: go straight to the 32-bit loop

  @@U64Cvt: ; QWORD conversion loop
; a = a1:a0 (edi:esi), b = b1:b0 = 0CCCCCCCCh:0CCCCCCCDh
; Only the upper 64 bits of a*b are needed; ecx collects bits 32..63 of
; the product purely to obtain the carries into the upper half.
mov eax, 0CCCCCCCDh ; = b0
mul esi ; get a0*b0 = d1:d0
mov eax, 0CCCCCCCDh ; = b0 (mov eax up saves 1 cycle)
mov ecx, edx ; d1
xor ebx, ebx ; ebx will accumulate bits 64..95
mul edi ; get a1*b0 = e1:e0
add ecx, eax ; e0
adc ebx, edx ; e1
mov eax, 0CCCCCCCCh ; =b1
mul esi ; get a0*b1 = f1:f0
add ecx, eax ; f0
adc ebx, edx ; f1
; push/pop zero ecx without touching the flags, so the carry from the
; adc above is still available to the adc ecx, ecx below.
push 0 ; 5 cycles faster, 3 bytes
pop ecx ; shorter than mov ecx, 0
mov eax, 0CCCCCCCCh ; =b1
adc ecx, ecx ; ecx = carry out of the bits 64..95 sum
mul edi ; get a1*b1 = g1:g0
add eax, ebx ; g0
adc edx, ecx ; g1
shrd eax, edx, 3
shr edx, 3 ; ------ quotient in edx::eax

; upper dwords will be the same after multiplication
lea ecx, [eax*4+eax] ; ecx=5*eax
if 1
neg ecx ; same speed, 2 bytes shorter
lea ebx, [esi+2*ecx+"0"] ; bl = a0 - 10*quotient + "0" = ASCII digit
else
mov ebx, esi
lea ecx, [ecx+ecx-'0']
sub ebx, ecx
endif
mov esi, eax ; quotient becomes the new value (low dword)
mov [ebp], bl
mov edi, edx ; quotient becomes the new value (high dword)
add ebp, 1 ; inc ebp costs 1-2 cycles here
test edi, edi
jnz @@U64Cvt ; repeat while the high dword is non-zero

  @@: ; we are here if HI-DWORD is 0
mov eax, esi
mov ebx, esi ; ebx keeps the current value for the remainder calculation
mov edi, 0CCCCCCCDh ; multiplier used for the 32-bit divide by 10

; Lower Dword conversion loop
; Runs at least once, so a value of zero yields the single digit "0".
  @@U32Cvt: mul edi
shr edx, 3 ; edx = value / 10
lea ecx, [edx*4+edx] ; ecx=edx*5
if 1
neg ecx ; 3 cycles faster
lea ebx, [ebx+2*ecx+"0"] ; bl = value - 10*quotient + "0" = ASCII digit
else
lea ecx, [ecx+ecx-'0'] ; ecx=edx*10-48
sub ebx, ecx
endif
mov eax, edx ; sub ebx, mov eax 2 cycles faster than inverse sequence
mov [ebp], bl
mov ebx, edx ; next value for the remainder calculation
inc ebp ; add ebp, 1 here a bit slower and 2 bytes more
test eax, eax
jnz @@U32Cvt ; repeat until the quotient is zero

; Calculate output string length and reverse it
pop edi ; ex mov edi, [esp+3*4][4*4] -- start of buffer, pushed as edx on entry
mov [ebp], al ; eax is zero here (loop exit condition): write the terminator
mov eax, ebp
sub eax, edi ; eax = number of digits

; reverse buffer digits (ca. 30 cycles for 20 digits)
; edi walks forward from the first digit, ebp walks back from the last;
; bytes are swapped until the two pointers meet or cross.
  @@: sub ebp, 1 ; dec ebp ca. 10 cycles slower here!
mov bl, [edi]
mov cl, [ebp]
mov [ebp], bl
mov [edi], cl
add edi, 1 ; inc edi ca. 10 cycles slower here!
cmp edi, ebp
jb @B

; string length in eax
pop ebx
pop edi
pop esi
pop ebp
ret
qw2Str endp

[attachment deleted by admin]

drizz

Quote from: jj2007 on September 12, 2008, 05:12:42 AM
...Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.
Of course, feel free (anyone) to change it as you like and put it to good use  :U
The truth cannot be learned ... it can only be recognized.

jj2007

Quote from: drizz on September 13, 2008, 01:52:23 AM
Of course, feel free (anyone) to change it as you like and put it to good use  :U

Done, see the float$ thread :U