Dear all,
I have had a look at the dwtoa code, which is very simple and compact and reasonably fast. However, it can't deal with 64-bit integers. I wonder whether it would be possible to convert it to SSE, using xmm registers only? I post a test bed below. Any ideas?
P.S.: I am aware of the 2005 qwtoa thread (http://www.masm32.com/board/index.php?topic=3051.msg23461#msg23461).
include \masm32\include\masm32rt.inc
; .686
; include \masm32\macros\timers.asm
.xmm
.data?
MyInt64 dq ?
NumBuffer dd 20 dup(?)
.data
MyReal10 Real10 12345678901234567890.0
lpBuffer dd NumBuffer
.code
start:
; following 4 lines needed for new xmm loop
mov edx, offset MyReal10
fld REAL10 ptr [edx] ; load from mem, 6 cycles
mov edx, offset MyInt64
fistp qword ptr [edx] ; the 64-bit integer is now in MyInt64...
mov eax, 1234567890 ; test value we want to convert to ASCII
mov edi, lpBuffer ; pointer to output buffer
; This is the core loop of dwtoa:
; -------------- can this loop be translated to SSE2, using only xmm registers?? --------------
mov ecx, 3435973837 ; 0CCCCCCCDh, magic multiplier
.While eax > 0
mov ebx, eax
mul ecx
shr edx, 3
mov eax, edx
lea edx, [edx*4+edx]
add edx, edx
sub ebx, edx
add bl, '0'
mov [edi], bl
add edi, 1
.Endw
; -------------------------------------------------------------------------------------------------------------------------------------
mov byte ptr [edi], 0 ; terminate the string
; We now have all the digits, but in reverse order
mov esi, lpBuffer ; pointer to output buffer
.while esi < edi
sub edi, 1
mov al, [esi]
mov ah, [edi]
mov [edi], al
mov [esi], ah
add esi, 1
.endw
invoke MessageBox, 0, lpBuffer, chr$("Our number:"), MB_OK
end start
hi,
I've written an SSE2-implementation of dw2a (just for fun :dance:) and I think it could be addapt to convert 64bit values. Hope this give you a hint.
regard, qWord
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Convert unsigned DWORD to ASC-string by using ;
; SSE2 extension ;
; ;
; IN: eax = unsigned dword-value ;
; edx = pointer to buffer, must be algined to 16 ;
; ;
; OUT: eax = pointer to string in buffer ;
; ;
;idea and implementation ;
;by qWord, September 2008 ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
d2a proc
;eax == dwNum
;edx == lpBuffer
.data
align 16 ; shr 13 shr 26
d2a_msk_0 QWORD 3518437209, 2882303762 ;magic-number
d2a_msk_4 QWORD 10000 , 100000000
d2a_msk_1 REAL4 1.000001E-4, 1.000001E-3, 1.000001E-2, 1.000001E-1
d2a_msk_2 REAL4 4 dup(10.0000000)
d2a_msk_3 BYTE 16 dup(030h)
.code
test eax,eax
.if !ZERO?
movdqa xmm4,OWORD ptr d2a_msk_0 ;split dword into 3 parts
movdqa xmm2,OWORD ptr d2a_msk_4 ; e.g.: 1234567890
; ________|________
movd xmm0,eax ; | | |
pshufd xmm0,xmm0,011001100y ; xmm2 xmm1 xmm0
pshufd xmm3,xmm0,011001100y ;
pmuludq xmm0,xmm4 ; 12 3456 7890
movdqa xmm1,xmm0 ;
psrlq xmm0,13+32 ;
psrlq xmm1,26+32 ;
;
pshufd xmm0,xmm0,01001110y ;
punpckhqdq xmm0,xmm1 ;
;xmm0 = xl/10000 | xh/100000000 ;
;xmm3 = xl | xh ;
;
;
pmuludq xmm2,xmm0 ;
psubd xmm3,xmm2 ;
; xmm3 = xl mod 10000 | xh mod 100000000 ;
;
pshufd xmm2,xmm3,01001110y ;
pmuludq xmm2,xmm4 ;
psrlq xmm2,13+32 ;
;
pshufd xmm0,xmm0,010101010y ;
pshufd xmm1,xmm2,000000000y ;
pshufd xmm2,xmm3,000000000y ;
cvtdq2ps xmm0,xmm0 ;; extract the numerics
cvtdq2ps xmm1,xmm1 ;;
cvtdq2ps xmm2,xmm2 ;; xmm-reg:
;;
movdqa xmm3,OWORD ptr d2a_msk_1 ;; 2345.0 2345.0 2345.0 2345.0
movdqa xmm6,OWORD ptr d2a_msk_2 ;; |___________|_________|___________|
movdqa xmm7,OWORD ptr d2a_msk_3 ;; |
;; div. by
movdqa xmm4,xmm3 ;; |
movdqa xmm5,xmm3 ;; |-----------|----|----|-----------|
;; 10000.0 1000.0 100.0 10.0
mulps xmm3,xmm2 ;;
mulps xmm4,xmm1 ;; |
mulps xmm5,xmm0 ;; |
;; v
cvttps2dq xmm2,xmm3 ;;
cvttps2dq xmm1,xmm4 ;; 0.2345 2.345 23.45 234.5
cvttps2dq xmm0,xmm5 ;; |
;; |
cvtdq2ps xmm2,xmm2 ;; truncate each value
cvtdq2ps xmm1,xmm1 ;; and subtract from non-truncated values
cvtdq2ps xmm0,xmm0 ;; |
;; V
subps xmm3,xmm2 ;; 0.2345 0.345 0.45 0.5
subps xmm4,xmm1 ;; |
subps xmm5,xmm0 ;; |
;; mul. by 10.0
mulps xmm3,xmm6 ;; and truncate
mulps xmm4,xmm6 ;; |
mulps xmm5,xmm6 ;; V
;; 2 3 4 5
cvttps2dq xmm3,xmm3 ;;
cvttps2dq xmm4,xmm4 ;;
cvttps2dq xmm5,xmm5 ;;
;;
PACKSSDW xmm5,xmm4 ;;
PACKSSDW xmm3,xmm3 ;;
PACKSSWB xmm5,xmm3 ;;
;;
paddb xmm5,xmm7 ;;
pcmpeqb xmm7,xmm5 ;;; skip preceding zeros
xor eax,eax ;;; and return addr. in eax
pinsrw xmm5,eax,6 ;;;
pmovmskb eax,xmm7 ;;;
movdqa OWORD ptr [edx],xmm5 ;;;
not eax ;;;
bsf eax,eax ;;;
;;;
test eax,32 ;;;
.if !ZERO? ;;;
dec eax ;;;
.endif ;;;
add eax,edx ;;;
ret ;;;
align 16 ;;;
.else
mov eax,030h
mov DWORD ptr [edx],eax
mov eax,edx
ret
.endif
d2a endp
The trick is instead of converting an existing procedure that is "known", simply write a new one that has the extra capacity.
Quote from: qWord on September 08, 2008, 12:35:47 PM
I've written an SSE2-implementation of dw2a (just for fun :dance:) and I think it could be addapt to convert 64bit values. Hope this give you a hint.
Thanks, qWord. I hope I'll understand it :dazzled:
Did you time it against good ol' dwtoa?
Quote from: jj2007 on September 08, 2008, 12:47:24 PM
Did you time it against good ol' dwtoa?
yes, on my core2duo it takes ca. 48 clocks.
it's slower than dwtoa(masmlib) on values
< 10000,but faster on greater values
Quote from: qWord
it's slower than dwtoa(masmlib) on values > 10000, but faster on greater values
that didn't really make sense :-D
He probably inverted to > instead of < .
Quote from: hutch-- on September 08, 2008, 02:33:37 PM
He probably inverted to > instead of < .
Just teasing him :cheekygreen:
this runs in under 1 cycles on my core 2 duo. It uses a 64k lookup table to do two 16-bit values at the same time. You could do an 8-bit lookup table, use less memory, and still PROBABLY have it run under 1 cycle. or close to it. the lookup table is made of all dword values. It returns a 16-bit value, which is equivalent to the ascii value for that word. so offset 0 in the table would have '00'. Make sense?
mov eax,10000
mov edi,offset dst
mov ebx,eax
and eax,0ffffh
shr ebx,16
mov ecx,[lookup_table+eax*4]
mov edx,[lookup_table+ebx*4]
mov word [edi],cx
mov word [edi+2],dx
EDIT: my bad, that's actually my HEX to ascii routine. I cut and pasted the wrong one.
here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!
[attachment deleted by admin]
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0
Edit: i'm reffering to the twice edited & deleted post ::)
Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0
Edit: i'm reffering to the twice edited & deleted post ::)
thanks!
Quote from: drizz on September 08, 2008, 06:36:28 PM
here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!
Looks promising, thanks. It seems that SSEx is not an option, despite of the simplicity of the algo.
Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0
Edit: i'm reffering to the twice edited & deleted post ::)
what kind of processor do you have?
EDIT: I am guessing an athlon.
EDIT2: my timings for your unsigned 64-bit to ascii code are 158.685983 cycles for a FULL 64-bit number. So there are no skipping of the loops. I have a core 2 duo.
I ran qwords code. It runs in 48.504735 cycles on my core 2 duo. But it's only doing a dword.
ups i found a small bug for 0 qword, comment the first jump to fix
; test eax,eax
mov edi,ecx
mov ebp,ecx
; jz @F
Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)
Quote from: drizz on September 09, 2008, 04:17:53 PM
ups i found a small bug for 0 qword, comment the first jump to fix
; test eax,eax
mov edi,ecx
mov ebp,ecx
; jz @F
Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)
I tested with a non-zero qword, but I'll fix that as well.
New & improved version! (removed division completely) :wink
Edit: Forgot to mention the timing : 596 clocks
Edit2: buggy code removed
Hi drizz,
Compliments again, this is likely to become a replacement for dwtoa.
First the good news: It's 3 cycles faster than dwtoa for a dword!
Run *** "qwtoaDrizz.exe" ***********************
512 cycles for U64ToStr 12345678901234567890
512 cycles for U64ToStr 10987654320:98765432
119 cycles for U64ToStr 1234567890
122 cycles for dwtoa 1234567890
Now the bad news: It seems I stumbled over a number that does not translate correctly...
.data
My64a QWORD 12345678901234567890
My64b QWORD 10987654321098765432
My64c QWORD 1234567890
My32c DWORD 1234567890
File with timing code attached.
[attachment deleted by admin]
Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh
U64ToStr proc QwVal:QWORD, pszString:DWORD
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
push ebp
push esi
push edi
push ebx
mov esi,[esp+1*4][4*4]; a0
mov edi,[esp+2*4][4*4]; a1
mov ebp,[esp+3*4][4*4]; buffer pointer
test edi,edi
jz @F
;; QWORD conversion loop
@@U64Cvt:
mov eax,0CCCCCCCDh; = b0
mul esi; get a0*b0 = d1:d0
mov ecx,edx;d1
mov eax,0CCCCCCCDh; = b0
xor ebx,ebx
mul edi; get a1*b0 = e1:e0
add ecx,eax;e0
adc ebx,edx;e1
mov eax,0CCCCCCCCh; =b1
mul esi; get a0*b1 = f1:f0
add ecx,eax;f0
adc ebx,edx;f1
mov ecx,0
mov eax,0CCCCCCCCh; =b1
adc ecx,ecx
mul edi; get a1*b1 = g1:g0
add eax,ebx;g0
adc edx,ecx;g1
shrd eax,edx,3
shr edx,3;;------ quotient in edx::eax
; upper dwords will be the same after multiplication
lea ecx,[eax*4+eax]
mov ebx,esi
lea ecx,[ecx+ecx-'0']
sub ebx,ecx
mov esi,eax
mov [ebp],bl
mov edi,edx
add ebp,1
test edi,edi
jnz @@U64Cvt
@@:
;; we are here if HI-DWORD is 0
mov eax,esi
mov ebx,esi
mov edi,0CCCCCCCDh
;; Lower Dword conversion loop
@@U32Cvt:
mul edi
shr edx,3
lea ecx,[edx*4+edx]
lea ecx,[ecx+ecx-'0']
sub ebx,ecx
mov eax,edx
mov [ebp],bl
mov ebx,edx
add ebp,1
test eax,eax
jnz @@U32Cvt
;; Calculate output string length and reverse it
mov edi,[esp+3*4][4*4]
mov [ebp],al
mov eax,ebp
sub eax,edi
;; reverse buffer digits
@@: sub ebp,1
mov bl,[edi]
mov cl,[ebp]
mov [ebp],bl
mov [edi],cl
add edi,1
cmp edi,ebp
jb @B
;; string length in eax
pop ebx
pop edi
pop esi
pop ebp
ret 3*4
OPTION PROLOGUE:PROLOGUEDEF
OPTION EPILOGUE:EPILOGUEDEF
U64ToStr endp
Quote from: drizz on September 11, 2008, 07:05:32 PM
Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh
Yep, you are making good progress. Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.
invoke GetFileSize, hFile, lpFileSizeHigh ; address of high-order word for file size
Some code rearrangements make it a bit faster, on a Core 2 Celeron M:
386 cycles for qw2Str 12345678901234567890
401 cycles for U64ToStr 12345678901234567890
386 cycles for qw2Str 18446744073709551615
402 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str 18446744073709551615
403 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str 10987654321098765432
402 cycles for U64ToStr 10987654321098765432
110 cycles for qw2Str 1234567890
117 cycles for U64ToStr 1234567890
124 cycles dwtoa, dw 1234567890
73 cycles qw SSE, dw 001234567890
Full code attached.
qw2Str proc ; ptr qword in eax, ptr buffer in edx
push ebp ; credits to drizz
push esi
push edi
push ebx
push edx ; will be popped as edi
mov esi, [eax]
if 0
mov edi, [eax+4] ; 2-3 cycles slower
else
add eax, 4 ; 2 bytes longer
mov edi, [eax]
endif
mov ebp, edx ; buffer pointer, also on stack
test edi, edi
jz @F
@@U64Cvt: ; QWORD conversion loop
mov eax, 0CCCCCCCDh ; = b0
mul esi ; get a0*b0 = d1:d0
mov eax, 0CCCCCCCDh ; = b0 (mov eax up saves 1 cycle)
mov ecx, edx ; d1
xor ebx, ebx
mul edi ; get a1*b0 = e1:e0
add ecx, eax ; e0
adc ebx, edx ; e1
mov eax, 0CCCCCCCCh ; =b1
mul esi ; get a0*b1 = f1:f0
add ecx, eax ; f0
adc ebx, edx ; f1
push 0 ; 5 cycles faster, 3 bytes
pop ecx ; shorter than mov ecx, 0
mov eax, 0CCCCCCCCh ; =b1
adc ecx, ecx
mul edi ; get a1*b1 = g1:g0
add eax, ebx ; g0
adc edx, ecx ; g1
shrd eax, edx, 3
shr edx, 3 ; ------ quotient in edx::eax
; upper dwords will be the same after multiplication
lea ecx, [eax*4+eax] ; ecx=5*eax
if 1
neg ecx ; same speed, 2 bytes shorter
lea ebx, [esi+2*ecx+"0"]
else
mov ebx, esi
lea ecx, [ecx+ecx-'0']
sub ebx, ecx
endif
mov esi, eax
mov [ebp], bl
mov edi, edx
add ebp, 1 ; inc ebp costs 1-2 cycles here
test edi, edi
jnz @@U64Cvt
@@: ; we are here if HI-DWORD is 0
mov eax, esi
mov ebx, esi
mov edi, 0CCCCCCCDh
; Lower Dword conversion loop
@@U32Cvt: mul edi
shr edx, 3
lea ecx, [edx*4+edx] ; ecx=edx*5
if 1
neg ecx ; 3 cycles faster
lea ebx, [ebx+2*ecx+"0"]
else
lea ecx, [ecx+ecx-'0'] ; ecx=edx*10-48
sub ebx, ecx
endif
mov eax, edx ; sub ebx, mov eax 2 cycles faster than inverse sequence
mov [ebp], bl
mov ebx, edx
inc ebp ; add ebp, 1 here a bit slower and 2 bytes more
test eax, eax
jnz @@U32Cvt
; Calculate output string length and reverse it
pop edi ; ex mov edi, [esp+3*4][4*4]
mov [ebp], al
mov eax, ebp
sub eax, edi
; reverse buffer digits (ca. 30 cycles for 20 digits)
@@: sub ebp, 1 ; dec ebp ca. 10 cycles slower here!
mov bl, [edi]
mov cl, [ebp]
mov [ebp], bl
mov [edi], cl
add edi, 1 ; inc edi ca. 10 cycles slower here!
cmp edi, ebp
jb @B
; string length in eax
pop ebx
pop edi
pop esi
pop ebp
ret
qw2Str endp
[attachment deleted by admin]
Quote from: jj2007 on September 12, 2008, 05:12:42 AM
...Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.
Of course, feel free (anyone) to change it as you like and put it to good use :U
Quote from: drizz on September 13, 2008, 01:52:23 AM
Of course, feel free (anyone) to change it as you like and put it to good use :U
Done, see the float$ thread (http://www.masm32.com/board/index.php?topic=9756.msg72501#msg72501) :U