qwtoa using xmm registers?

drizz · September 09, 2008, 04:17:53 PM

Oops, I found a small bug for a zero QWORD; comment out the first jump to fix it:

;	test eax,eax
	mov edi,ecx
	mov ebp,ecx
;	jz @F

Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)

Mark_Larson · September 09, 2008, 08:46:50 PM

Quote from: drizz on September 09, 2008, 04:17:53 PM
ups i found a small bug for 0 qword, comment the first jump to fix
Code Select Expand
;	test eax,eax
	mov edi,ecx
	mov ebp,ecx
;	jz @F

Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)

I tested with a non-zero qword, but I'll fix that as well.

drizz · September 09, 2008, 09:07:37 PM

New & improved version! (removed division completely) :wink

Edit: Forgot to mention the timing : 596 clocks

Edit2: buggy code removed

jj2007 · September 11, 2008, 04:51:16 PM

Hi drizz,

Compliments again, this is likely to become a replacement for dwtoa.

First the good news: It's 3 cycles faster than dwtoa for a dword!

Run *** "qwtoaDrizz.exe" ***********************

512 cycles for U64ToStr 12345678901234567890
512 cycles for U64ToStr 10987654320:98765432
119 cycles for U64ToStr 1234567890
122 cycles for dwtoa 1234567890

Now the bad news: It seems I stumbled over a number that does not translate correctly...

Code Select

.data
My64a	QWORD	12345678901234567890
My64b	QWORD	10987654321098765432
My64c	QWORD	1234567890
My32c	DWORD	1234567890

File with timing code attached.

[attachment deleted by admin]

drizz · September 11, 2008, 07:05:32 PM

Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh

Code Select

; ---------------------------------------------------------------------------
; U64ToStr - convert an unsigned 64-bit integer to a decimal ASCIIZ string.
;
; Stack arguments (the callee removes all 12 bytes, see the final RET):
;   QwVal      value to convert, low dword at the lower address
;   pszString  destination buffer; receives the digits followed by a 0 byte
;              (the largest value, 18446744073709551615, needs 20 + 1 bytes)
; Returns:     eax = number of digits written (terminator not counted)
; Registers:   ebx, esi, edi, ebp are saved and restored; ecx, edx are not
;
; Method: digits are generated least-significant first and the buffer is
; reversed in place afterwards. Each division by 10 is done as a multiply
; by the reciprocal 0CCCCCCCCCCCCCCCDh, keeping the upper half of the
; product and shifting it right by 3.
; ---------------------------------------------------------------------------
U64_RECIP10_LO	equ 0CCCCCCCDh		; b0: low  dword of the reciprocal
U64_RECIP10_HI	equ 0CCCCCCCCh		; b1: high dword of the reciprocal
U64_ARG_OFS	equ 4*4 + 4		; 4 saved registers + return address

U64ToStr proc QwVal:QWORD, pszString:DWORD
	OPTION PROLOGUE:NONE
	OPTION EPILOGUE:NONE
	push	ebp
	push	esi
	push	edi
	push	ebx
	mov	esi, [esp + U64_ARG_OFS + 0]	; esi = a0 (low dword of value)
	mov	edi, [esp + U64_ARG_OFS + 4]	; edi = a1 (high dword of value)
	mov	ebp, [esp + U64_ARG_OFS + 8]	; ebp = write cursor in the buffer
	test	edi, edi
	jz	u64_low_only			; value fits in 32 bits

	; ---- 64-bit phase: one digit per pass while the high dword is non-zero.
	; The 128-bit product a1:a0 * b1:b0 is built from four 32x32 multiplies;
	; only its upper 64 bits are needed, so d0 (lowest dword) is dropped.
u64_wide_digit:
	mov	eax, U64_RECIP10_LO
	mul	esi				; a0*b0 = d1:d0
	mov	ecx, edx			; ecx = d1
	mov	eax, U64_RECIP10_LO
	xor	ebx, ebx
	mul	edi				; a1*b0 = e1:e0
	add	ecx, eax			; ecx = d1 + e0
	adc	ebx, edx			; ebx = e1 + carry
	mov	eax, U64_RECIP10_HI
	mul	esi				; a0*b1 = f1:f0
	add	ecx, eax			; ecx += f0 (only the carry matters now)
	adc	ebx, edx			; ebx += f1 + carry
	mov	ecx, 0				; MOV, not XOR: CF from the adc must survive
	mov	eax, U64_RECIP10_HI
	adc	ecx, ecx			; ecx = carry out of ebx
	mul	edi				; a1*b1 = g1:g0
	add	eax, ebx			; low  dword of the upper half
	adc	edx, ecx			; high dword of the upper half
	shrd	eax, edx, 3
	shr	edx, 3				; edx:eax = value / 10

	; digit = value - 10*quotient; the low dwords alone are enough
	; because the result is known to be in 0..9
	lea	ecx, [eax*4+eax]		; ecx = 5 * quotient_lo
	mov	ebx, esi
	lea	ecx, [ecx+ecx-'0']		; ecx = 10 * quotient_lo - '0'
	sub	ebx, ecx			; bl  = digit + '0'
	mov	esi, eax			; value = quotient
	mov	[ebp], bl
	mov	edi, edx
	add	ebp, 1
	test	edi, edi
	jnz	u64_wide_digit

	; ---- 32-bit phase: the high dword is zero, a single MUL per digit
u64_low_only:
	mov	eax, esi
	mov	ebx, esi			; ebx = value being reduced
	mov	edi, U64_RECIP10_LO
u64_narrow_digit:
	mul	edi
	shr	edx, 3				; edx = value / 10
	lea	ecx, [edx*4+edx]
	lea	ecx, [ecx+ecx-'0']		; ecx = 10 * quotient - '0'
	sub	ebx, ecx			; bl  = digit + '0'
	mov	eax, edx
	mov	[ebp], bl
	mov	ebx, edx
	add	ebp, 1
	test	eax, eax
	jnz	u64_narrow_digit		; always emits at least one digit ("0")

	; ---- terminate, compute the length, then reverse the digits in place
	mov	edi, [esp + U64_ARG_OFS + 8]	; edi = start of buffer
	mov	[ebp], al			; eax is 0 here -> string terminator
	mov	eax, ebp
	sub	eax, edi			; eax = digit count (return value)
u64_swap_pair:
	sub	ebp, 1				; ebp walks down from the last digit
	mov	bl, [edi]
	mov	cl, [ebp]
	mov	[ebp], bl
	mov	[edi], cl
	add	edi, 1				; edi walks up from the first digit
	cmp	edi, ebp
	jb	u64_swap_pair

	pop	ebx
	pop	edi
	pop	esi
	pop	ebp
	ret	12				; drop QwVal (8) + pszString (4)
	OPTION PROLOGUE:PROLOGUEDEF
	OPTION EPILOGUE:EPILOGUEDEF
U64ToStr endp

jj2007 · September 12, 2008, 05:12:42 AM

Quote from: drizz on September 11, 2008, 07:05:32 PM
Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh

Yep, you are making good progress. Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.

Code Select

invoke GetFileSize, hFile, lpFileSizeHigh ; address of high-order word for file size

Some code rearrangements make it a bit faster, on a Core 2 Celeron M:

Code Select

386 cycles for qw2Str   12345678901234567890
401 cycles for U64ToStr 12345678901234567890
386 cycles for qw2Str   18446744073709551615
402 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str   18446744073709551615
403 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str   10987654321098765432
402 cycles for U64ToStr 10987654321098765432
110 cycles for qw2Str   1234567890
117 cycles for U64ToStr 1234567890
124 cycles dwtoa, dw    1234567890
73 cycles qw SSE, dw    001234567890

Full code attached.

Code Select

; ---------------------------------------------------------------------------
; qw2Str - convert an unsigned 64-bit integer to a decimal ASCIIZ string.
;
; Register calling convention (no stack arguments, plain RET):
;   In:   eax = pointer to the QWORD to convert
;         edx = pointer to the destination buffer (digits + 0 terminator)
;   Out:  eax = number of digits written (terminator not counted)
;   ebx, esi, edi, ebp are saved and restored; ecx and edx are clobbered.
;
; Digits are produced least-significant first and the buffer is reversed in
; place at the end. Division by 10 is a multiply by 0CCCCCCCCCCCCCCCDh,
; keeping the upper half of the product and shifting it right by 3.
; The "if 0 / if 1" blocks are conditional assembly: only the branch whose
; condition is non-zero is assembled; the other one records the alternative
; that was timed.
; ---------------------------------------------------------------------------
qw2Str proc	; ptr qword in eax, ptr buffer in edx
	push ebp			; credits to  drizz 
	push esi
	push edi
	push ebx
	push edx			; will be popped as edi
	mov esi, [eax]
	; esi = low dword of the value; the assembled branch below loads the
	; high dword into edi
	if 0
		mov edi, [eax+4]	; 2-3 cycles slower
	else
		add eax, 4	; 2 bytes longer
		mov edi, [eax]
	endif
	mov ebp, edx		; buffer pointer, also on stack
	test edi, edi
	; high dword zero -> skip straight to the 32-bit loop
	jz @F

	; Each pass builds the upper 64 bits of (edi:esi) * 0CCCCCCCCCCCCCCCDh
	; from four 32x32 multiplies; the lowest product dword (d0) is not needed.
  @@U64Cvt:	; QWORD conversion loop
	mov eax, 0CCCCCCCDh	; = b0
	mul esi			; get a0*b0 = d1:d0
	mov eax, 0CCCCCCCDh	; = b0 (mov eax up saves 1 cycle)
	mov ecx, edx		; d1
	xor ebx, ebx
	mul edi			; get a1*b0 = e1:e0
	add ecx, eax		; e0
	adc ebx, edx		; e1
	mov eax, 0CCCCCCCCh	; =b1
	mul esi			; get a0*b1 = f1:f0
	add ecx, eax		; f0
	adc ebx, edx		; f1
	; ecx has to be cleared WITHOUT touching CF: the carry out of the adc
	; above is picked up by "adc ecx, ecx" below (push/pop/mov leave the
	; flags alone, xor/sub would destroy the carry)
	push 0			; 5 cycles faster, 3 bytes
	pop ecx			; shorter than mov ecx, 0
	mov eax, 0CCCCCCCCh	; =b1
	adc ecx, ecx
	mul edi			; get a1*b1 = g1:g0
	add eax, ebx		; g0
	adc edx, ecx		; g1
	shrd eax, edx, 3
	shr edx, 3			; ------ quotient in edx::eax

	; upper dwords will be the same after multiplication
	; (so the digit is computed from the low dwords only)
	lea ecx, [eax*4+eax]	; ecx=5*eax
	; both branches yield ebx = esi - 10*eax + '0', i.e. bl = ASCII digit
	if 1
		neg ecx		; same speed, 2 bytes shorter
		lea ebx, [esi+2*ecx+"0"]
	else
		mov ebx, esi
		lea ecx, [ecx+ecx-'0']
		sub ebx, ecx
	endif
	mov esi, eax
	mov [ebp], bl
	mov edi, edx
	add ebp, 1		; inc ebp costs 1-2 cycles here
	test edi, edi
	jnz @@U64Cvt

  @@:	; we are here if HI-DWORD is 0
	mov eax, esi
	mov ebx, esi
	mov edi, 0CCCCCCCDh

	; Lower Dword conversion loop
	; eax = value to divide, ebx = copy used to derive the digit,
	; edi = reciprocal constant; runs at least once, so 0 yields "0"
  @@U32Cvt:	mul edi
	shr edx, 3
	lea ecx, [edx*4+edx]	; ecx=edx*5
	; both branches yield ebx = ebx - 10*edx + '0', i.e. bl = ASCII digit
	if 1
		neg ecx		; 3 cycles faster
		lea ebx, [ebx+2*ecx+"0"]
	else
		lea ecx, [ecx+ecx-'0']	; ecx=edx*10-48
		sub ebx, ecx
	endif
	mov eax, edx		; sub ebx, mov eax 2 cycles faster than inverse sequence
	mov [ebp], bl
	mov ebx, edx
	inc ebp			; add ebp, 1 here a bit slower and 2 bytes more
	test eax, eax
	jnz @@U32Cvt

	; Calculate output string length and reverse it
	; edi = start of buffer (the edx pushed on entry); eax is 0 on loop
	; exit, so the byte store below writes the string terminator
	pop edi 			; ex mov edi, [esp+3*4][4*4]
	mov [ebp], al
	mov eax, ebp
	sub eax, edi

	; reverse buffer digits (ca. 30 cycles for 20 digits)
	; edi walks up from the first digit, ebp down from the last; swap
	; until the two pointers meet or cross
  @@:	sub ebp, 1		; dec ebp ca. 10 cycles slower here!
	mov bl, [edi]
	mov cl, [ebp]
	mov [ebp], bl
	mov [edi], cl
	add edi, 1			; inc edi ca. 10 cycles slower here!
	cmp edi, ebp
	jb @B

	; string length in eax
	pop ebx
	pop edi
	pop esi
	pop ebp
	ret
qw2Str endp

[attachment deleted by admin]

drizz · September 13, 2008, 01:52:23 AM

Quote from: jj2007 on September 12, 2008, 05:12:42 AM
...Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.

Of course, feel free (anyone) to change it as you like and put it to good use :U

jj2007 · September 13, 2008, 06:29:34 AM

Quote from: drizz on September 13, 2008, 01:52:23 AM
Of course, feel free (anyone) to change it as you like and put it to good use :U

Done, see the float$ thread :U

News:

qwtoa using xmm registers?