Print Page - qwtoa using xmm registers?

Title: qwtoa using xmm registers?
Post by: jj2007 on September 08, 2008, 08:09:53 AM

Dear all,
I have had a look at the dwtoa code, which is very simple and compact and reasonably fast. However, it can't deal with 64-bit integers. I wonder whether it would be possible to convert it to SSE, using xmm registers only? I post a test bed below. Any ideas?

P.S.: I am aware of the 2005 qwtoa thread (http://www.masm32.com/board/index.php?topic=3051.msg23461#msg23461).

Code Select

include \masm32\include\masm32rt.inc
; .686
; include \masm32\macros\timers.asm
.xmm


.data?
MyInt64	dq	?
NumBuffer		dd 20 dup(?)

.data
MyReal10	Real10	12345678901234567890.0
lpBuffer	dd	NumBuffer

.code
start:

; following 4 lines needed for new xmm loop
  mov edx, offset MyReal10
  fld REAL10 ptr [edx]		; load from mem, 6 cycles
  mov edx, offset MyInt64
  fistp qword ptr [edx]		; the 64-bit integer is now in MyInt64...

  mov eax, 1234567890	; test value we want to convert to ASCII
  mov edi, lpBuffer	; pointer to output buffer

; This is the core loop of dwtoa: 
; -------------- can this loop be translated to SSE2, using only xmm registers?? --------------
  mov ecx, 3435973837	; 0CCCCCCCDh, magic multiplier
  .While eax > 0
	mov ebx, eax
	mul ecx
	shr edx, 3
	mov eax, edx
	lea edx, [edx*4+edx]
	add edx, edx
	sub ebx, edx
	add bl, '0'
	mov [edi], bl
	add edi, 1
  .Endw
; -------------------------------------------------------------------------------------------------------------------------------------

  mov byte ptr [edi], 0       ; terminate the string
; We now have all the digits, but in reverse order
    mov esi, lpBuffer	; pointer to output buffer
    .while esi < edi
      sub edi, 1
      mov al, [esi]
      mov ah, [edi]
      mov [edi], al
      mov [esi], ah
      add esi, 1
    .endw

  invoke MessageBox, 0, lpBuffer, chr$("Our number:"), MB_OK

end start

Title: Re: qwtoa using xmm registers?
Post by: qWord on September 08, 2008, 12:35:47 PM

hi,

I've written an SSE2-implementation of dw2a (just for fun :dance:) and I think it could be addapt to convert 64bit values. Hope this give you a hint.

regard, qWord

Code Select


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Convert unsigned DWORD to ASC-string by using      ;
;  SSE2 extension                                     ;
;                                                     ;
; IN:  eax = unsigned dword-value                     ;
;      edx = pointer to buffer, must be algined to 16 ;
;                                                     ;
; OUT: eax = pointer to string in buffer              ;
;                                                     ;
;idea and implementation                              ;
;by qWord, September 2008                             ;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
align 16
d2a proc 
   ;eax == dwNum
   ;edx == lpBuffer
   
    .data
        align 16          ; shr 13    shr 26
        d2a_msk_0   QWORD 3518437209, 2882303762 ;magic-number
        d2a_msk_4   QWORD 10000     , 100000000
        d2a_msk_1   REAL4 1.000001E-4, 1.000001E-3, 1.000001E-2, 1.000001E-1
        d2a_msk_2   REAL4 4 dup(10.0000000)
        d2a_msk_3   BYTE 16 dup(030h)
    .code

    test eax,eax
    .if !ZERO?

        movdqa xmm4,OWORD ptr d2a_msk_0              ;split dword into 3 parts
        movdqa xmm2,OWORD ptr d2a_msk_4              ; e.g.:  1234567890
                                                     ;    ________|________
        movd xmm0,eax                                ;   |        |        |
        pshufd xmm0,xmm0,011001100y                  ;  xmm2     xmm1    xmm0
        pshufd xmm3,xmm0,011001100y                  ;
        pmuludq xmm0,xmm4                            ;   12      3456    7890
        movdqa xmm1,xmm0                             ;
        psrlq xmm0,13+32                             ;
        psrlq xmm1,26+32                             ;
                                                     ;
        pshufd xmm0,xmm0,01001110y                   ;
        punpckhqdq xmm0,xmm1                         ;
        ;xmm0 = xl/10000 | xh/100000000              ;
        ;xmm3 = xl       | xh                        ;
                                                     ;
                                                     ;
        pmuludq xmm2,xmm0                            ;
        psubd xmm3,xmm2                              ;
        ; xmm3 = xl mod 10000 | xh mod 100000000     ;
                                                     ;
        pshufd xmm2,xmm3,01001110y                   ;
        pmuludq xmm2,xmm4                            ;
        psrlq xmm2,13+32                             ;
                                                     ;
        pshufd xmm0,xmm0,010101010y                  ;
        pshufd xmm1,xmm2,000000000y                  ;
        pshufd xmm2,xmm3,000000000y                  ;

        cvtdq2ps xmm0,xmm0                    ;;  extract the numerics
        cvtdq2ps xmm1,xmm1                    ;;
        cvtdq2ps xmm2,xmm2                    ;;  xmm-reg:
                                              ;;
        movdqa xmm3,OWORD ptr d2a_msk_1       ;;   2345.0     2345.0     2345.0     2345.0
        movdqa xmm6,OWORD ptr d2a_msk_2       ;;     |___________|_________|___________|
        movdqa xmm7,OWORD ptr d2a_msk_3       ;;                      |
                                              ;;                   div. by
        movdqa xmm4,xmm3                      ;;                      |
        movdqa xmm5,xmm3                      ;;     |-----------|----|----|-----------|
                                              ;;   10000.0    1000.0     100.0       10.0
        mulps xmm3,xmm2                       ;;
        mulps xmm4,xmm1                       ;;                      |
        mulps xmm5,xmm0                       ;;                      |
                                              ;;                      v
        cvttps2dq xmm2,xmm3                   ;;
        cvttps2dq xmm1,xmm4                   ;;   0.2345     2.345      23.45       234.5
        cvttps2dq xmm0,xmm5                   ;;                      |
                                              ;;                      |
        cvtdq2ps xmm2,xmm2                    ;;             truncate each value
        cvtdq2ps xmm1,xmm1                    ;;    and subtract from non-truncated values
        cvtdq2ps xmm0,xmm0                    ;;                      |
                                              ;;                      V
        subps xmm3,xmm2                       ;;   0.2345     0.345       0.45       0.5
        subps xmm4,xmm1                       ;;                      |
        subps xmm5,xmm0                       ;;                      |
                                              ;;                 mul. by 10.0
        mulps xmm3,xmm6                       ;;                and truncate
        mulps xmm4,xmm6                       ;;                      |
        mulps xmm5,xmm6                       ;;                      V
                                              ;;    2         3           4          5
        cvttps2dq xmm3,xmm3                   ;;
        cvttps2dq xmm4,xmm4                   ;;
        cvttps2dq xmm5,xmm5                   ;;
                                              ;;
        PACKSSDW xmm5,xmm4                    ;;
        PACKSSDW xmm3,xmm3                    ;;
        PACKSSWB xmm5,xmm3                    ;;
                                              ;;
        paddb xmm5,xmm7                       ;;

        pcmpeqb xmm7,xmm5                     ;;;  skip preceding zeros
        xor eax,eax                           ;;;  and return addr. in eax
        pinsrw xmm5,eax,6                     ;;;
        pmovmskb eax,xmm7                     ;;;
        movdqa OWORD ptr [edx],xmm5           ;;;
        not eax                               ;;;
        bsf eax,eax                           ;;;
                                              ;;;
        test eax,32                           ;;;
        .if !ZERO?                            ;;;
            dec eax                           ;;;
        .endif                                ;;;
        add eax,edx                           ;;;
        ret                                   ;;;
        align 16                              ;;;
    .else
        mov eax,030h
        mov DWORD ptr [edx],eax
        mov eax,edx
        ret
    .endif

d2a endp

Title: Re: qwtoa using xmm registers?
Post by: hutch-- on September 08, 2008, 12:43:18 PM

The trick is instead of converting an existing procedure that is "known", simply write a new one that has the extra capacity.

Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 08, 2008, 12:47:24 PM

Quote from: qWord on September 08, 2008, 12:35:47 PM
I've written an SSE2-implementation of dw2a (just for fun :dance:) and I think it could be addapt to convert 64bit values. Hope this give you a hint.

Thanks, qWord. I hope I'll understand it :dazzled:
Did you time it against good ol' dwtoa?

Title: Re: qwtoa using xmm registers?
Post by: qWord on September 08, 2008, 01:00:38 PM

Quote from: jj2007 on September 08, 2008, 12:47:24 PM
Did you time it against good ol' dwtoa?

yes, on my core2duo it takes ca. 48 clocks.
it's slower than dwtoa(masmlib) on values < 10000,but faster on greater values

Title: Re: qwtoa using xmm registers?
Post by: BlackVortex on September 08, 2008, 02:31:45 PM

Quote from: qWord
it's slower than dwtoa(masmlib) on values > 10000, but faster on greater values

that didn't really make sense :-D

Title: Re: qwtoa using xmm registers?
Post by: hutch-- on September 08, 2008, 02:33:37 PM

He probably inverted to > instead of < .

Title: Re: qwtoa using xmm registers?
Post by: BlackVortex on September 08, 2008, 02:41:41 PM

Quote from: hutch-- on September 08, 2008, 02:33:37 PM
He probably inverted to > instead of < .

Just teasing him :cheekygreen:

Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 08, 2008, 03:52:29 PM

this runs in under 1 cycles on my core 2 duo. It uses a 64k lookup table to do two 16-bit values at the same time. You could do an 8-bit lookup table, use less memory, and still PROBABLY have it run under 1 cycle. or close to it. the lookup table is made of all dword values. It returns a 16-bit value, which is equivalent to the ascii value for that word. so offset 0 in the table would have '00'. Make sense?

Code Select


	mov	eax,10000
	mov	edi,offset dst

	mov	ebx,eax

	and	eax,0ffffh
	shr	ebx,16
	
	mov	ecx,[lookup_table+eax*4]
	mov	edx,[lookup_table+ebx*4]
	
	mov	word [edi],cx
	mov	word [edi+2],dx

EDIT: my bad, that's actually my HEX to ascii routine. I cut and pasted the wrong one.

Title: Re: qwtoa using xmm registers?
Post by: drizz on September 08, 2008, 06:36:28 PM

here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!

[attachment deleted by admin]

Title: Re: qwtoa using xmm registers?
Post by: drizz on September 08, 2008, 09:22:14 PM

Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post ::)

Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 08, 2008, 09:48:06 PM

Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post ::)

thanks!

Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 09, 2008, 09:16:49 AM

Quote from: drizz on September 08, 2008, 06:36:28 PM
here are my 63bit (signed) and 64bit (unsigned) conversion routines ( to and from ascii ).
edit: no mmx sse!

Looks promising, thanks. It seems that SSEx is not an option, despite of the simplicity of the algo.

Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 09, 2008, 01:03:40 PM

Quote from: drizz on September 08, 2008, 09:22:14 PM
Mark, if you are interested in some fine Qword & Dword-To-Hex string algos, take a look here:
http://www.asmcommunity.net/board/index.php?topic=22187.0

Edit: i'm reffering to the twice edited & deleted post ::)

what kind of processor do you have?

EDIT: I am guessing an athlon.

EDIT2: my timings for your unsigned 64-bit to ascii code are 158.685983 cycles for a FULL 64-bit number. So there are no skipping of the loops. I have a core 2 duo.

Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 09, 2008, 02:27:04 PM

I ran qwords code. It runs in 48.504735 cycles on my core 2 duo. But it's only doing a dword.

Title: Re: qwtoa using xmm registers?
Post by: drizz on September 09, 2008, 04:17:53 PM

ups i found a small bug for 0 qword, comment the first jump to fix

Code Select

;	test eax,eax
	mov edi,ecx
	mov ebp,ecx
;	jz @F

Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)

Title: Re: qwtoa using xmm registers?
Post by: Mark_Larson on September 09, 2008, 08:46:50 PM

Quote from: drizz on September 09, 2008, 04:17:53 PM
ups i found a small bug for 0 qword, comment the first jump to fix
Code Select Expand
; test eax,eax mov edi,ecx mov ebp,ecx ; jz @F

Mark, I get 643 clocks for QWORD(-1,-1) on PIII 1.3, XPSP2
(800 clocks for _i64toa)

I tested with a non-zero qword, but I'll fix that as well.

Title: Re: qwtoa using xmm registers?
Post by: drizz on September 09, 2008, 09:07:37 PM

New & improved version! (removed division completely) :wink

Edit: Forgot to mention the timing : 596 clocks

Edit2: buggy code removed

Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 11, 2008, 04:51:16 PM

Hi drizz,

Compliments again, this is likely to become a replacement for dwtoa.

First the good news: It's 3 cycles faster than dwtoa for a dword!

Run *** "qwtoaDrizz.exe" ***********************

512 cycles for U64ToStr 12345678901234567890
512 cycles for U64ToStr 10987654320:98765432
119 cycles for U64ToStr 1234567890
122 cycles for dwtoa 1234567890

Now the bad news: It seems I stumbled over a number that does not translate correctly...

Code Select

.data
My64a	QWORD	12345678901234567890
My64b	QWORD	10987654321098765432
My64c	QWORD	1234567890
My32c	DWORD	1234567890

File with timing code attached.

[attachment deleted by admin]

Title: Re: qwtoa using xmm registers?
Post by: drizz on September 11, 2008, 07:05:32 PM

Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh

Code Select

U64ToStr proc QwVal:QWORD, pszString:DWORD
	OPTION PROLOGUE:NONE
	OPTION EPILOGUE:NONE
	push ebp
	push esi
	push edi
	push ebx
	mov esi,[esp+1*4][4*4]; a0
	mov edi,[esp+2*4][4*4]; a1
	mov ebp,[esp+3*4][4*4]; buffer pointer
	test edi,edi
	jz @F
	;; QWORD conversion loop
@@U64Cvt:
	mov eax,0CCCCCCCDh; = b0
	mul esi; get a0*b0 = d1:d0
	mov ecx,edx;d1
	mov eax,0CCCCCCCDh; = b0
	xor ebx,ebx
	mul edi; get a1*b0 = e1:e0
	add ecx,eax;e0
	adc ebx,edx;e1
	mov eax,0CCCCCCCCh; =b1
	mul esi; get a0*b1 = f1:f0
	add ecx,eax;f0
	adc ebx,edx;f1
	mov ecx,0
	mov eax,0CCCCCCCCh; =b1
	adc ecx,ecx
	mul edi; get a1*b1 = g1:g0
	add eax,ebx;g0
	adc edx,ecx;g1
	shrd eax,edx,3
	shr edx,3;;------ quotient in edx::eax
	; upper dwords will be the same after multiplication
	lea ecx,[eax*4+eax]
	mov ebx,esi
	lea ecx,[ecx+ecx-'0']
	sub ebx,ecx
	mov esi,eax
	mov [ebp],bl
	mov edi,edx
	add ebp,1
	test edi,edi
	jnz @@U64Cvt
@@:
	;; we are here if HI-DWORD is 0
	mov eax,esi
	mov ebx,esi
	mov edi,0CCCCCCCDh
	;; Lower Dword conversion loop 
@@U32Cvt:
	mul edi
	shr edx,3
	lea ecx,[edx*4+edx]
	lea ecx,[ecx+ecx-'0']
	sub ebx,ecx
	mov eax,edx
	mov [ebp],bl
	mov ebx,edx
	add ebp,1
	test eax,eax
	jnz @@U32Cvt
	;; Calculate output string length and reverse it
	mov edi,[esp+3*4][4*4]
	mov [ebp],al
	mov eax,ebp
	sub eax,edi
	;; reverse buffer digits
@@:	sub ebp,1
	mov bl,[edi]
	mov cl,[ebp]
	mov [ebp],bl
	mov [edi],cl
	add edi,1
	cmp edi,ebp
	jb @B
	;; string length in eax
	pop ebx
	pop edi
	pop esi
	pop ebp
	ret 3*4
	OPTION PROLOGUE:PROLOGUEDEF
	OPTION EPILOGUE:EPILOGUEDEF	
U64ToStr endp

Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 12, 2008, 05:12:42 AM

Quote from: drizz on September 11, 2008, 07:05:32 PM
Bug fixed and even more optimised :)
474 clocks for Full qword 0FFFFFFFFFFFFFFFFh

Yep, you are making good progress. Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.

Code Select

invoke GetFileSize, hFile, lpFileSizeHigh ; address of high-order word for file size

Some code rearrangements make it a bit faster, on a Core 2 Celeron M:

Code Select

386 cycles for qw2Str   12345678901234567890
401 cycles for U64ToStr 12345678901234567890
386 cycles for qw2Str   18446744073709551615
402 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str   18446744073709551615
403 cycles for U64ToStr 18446744073709551615
386 cycles for qw2Str   10987654321098765432
402 cycles for U64ToStr 10987654321098765432
110 cycles for qw2Str   1234567890
117 cycles for U64ToStr 1234567890
124 cycles dwtoa, dw    1234567890
73 cycles qw SSE, dw    001234567890

Full code attached.

Code Select

qw2Str proc	; ptr qword in eax, ptr buffer in edx
	push ebp			; credits to  drizz 
	push esi
	push edi
	push ebx
	push edx			; will be popped as edi
	mov esi, [eax]
	if 0
		mov edi, [eax+4]	; 2-3 cycles slower
	else
		add eax, 4	; 2 bytes longer
		mov edi, [eax]
	endif
	mov ebp, edx		; buffer pointer, also on stack
	test edi, edi
	jz @F

  @@U64Cvt:	; QWORD conversion loop
	mov eax, 0CCCCCCCDh	; = b0
	mul esi			; get a0*b0 = d1:d0
	mov eax, 0CCCCCCCDh	; = b0 (mov eax up saves 1 cycle)
	mov ecx, edx		; d1
	xor ebx, ebx
	mul edi			; get a1*b0 = e1:e0
	add ecx, eax		; e0
	adc ebx, edx		; e1
	mov eax, 0CCCCCCCCh	; =b1
	mul esi			; get a0*b1 = f1:f0
	add ecx, eax		; f0
	adc ebx, edx		; f1
	push 0			; 5 cycles faster, 3 bytes
	pop ecx			; shorter than mov ecx, 0
	mov eax, 0CCCCCCCCh	; =b1
	adc ecx, ecx
	mul edi			; get a1*b1 = g1:g0
	add eax, ebx		; g0
	adc edx, ecx		; g1
	shrd eax, edx, 3
	shr edx, 3			; ------ quotient in edx::eax

	; upper dwords will be the same after multiplication
	lea ecx, [eax*4+eax]	; ecx=5*eax
	if 1
		neg ecx		; same speed, 2 bytes shorter
		lea ebx, [esi+2*ecx+"0"]
	else
		mov ebx, esi
		lea ecx, [ecx+ecx-'0']
		sub ebx, ecx
	endif
	mov esi, eax
	mov [ebp], bl
	mov edi, edx
	add ebp, 1		; inc ebp costs 1-2 cycles here
	test edi, edi
	jnz @@U64Cvt

  @@:	; we are here if HI-DWORD is 0
	mov eax, esi
	mov ebx, esi
	mov edi, 0CCCCCCCDh

	; Lower Dword conversion loop
  @@U32Cvt:	mul edi
	shr edx, 3
	lea ecx, [edx*4+edx]	; ecx=edx*5
	if 1
		neg ecx		; 3 cycles faster
		lea ebx, [ebx+2*ecx+"0"]
	else
		lea ecx, [ecx+ecx-'0']	; ecx=edx*10-48
		sub ebx, ecx
	endif
	mov eax, edx		; sub ebx, mov eax 2 cycles faster than inverse sequence
	mov [ebp], bl
	mov ebx, edx
	inc ebp			; add ebp, 1 here a bit slower and 2 bytes more
	test eax, eax
	jnz @@U32Cvt

	; Calculate output string length and reverse it
	pop edi 			; ex mov edi, [esp+3*4][4*4]
	mov [ebp], al
	mov eax, ebp
	sub eax, edi

	; reverse buffer digits (ca. 30 cycles for 20 digits)
  @@:	sub ebp, 1		; dec ebp ca. 10 cycles slower here!
	mov bl, [edi]
	mov cl, [ebp]
	mov [ebp], bl
	mov [edi], cl
	add edi, 1			; inc edi ca. 10 cycles slower here!
	cmp edi, ebp
	jb @B

	; string length in eax
	pop ebx
	pop edi
	pop esi
	pop ebp
	ret
qw2Str endp

[attachment deleted by admin]

Title: Re: qwtoa using xmm registers?
Post by: drizz on September 13, 2008, 01:52:23 AM

Quote from: jj2007 on September 12, 2008, 05:12:42 AM
...Below some small changes. I decided to pass a pointer to the qword in eax, and the pointer to the buffer in edx. Speedwise the same, but a bit more practical for e.g.

Of course, feel free (anyone) to change it as you like and put it to good use :U

Title: Re: qwtoa using xmm registers?
Post by: jj2007 on September 13, 2008, 06:29:34 AM

Quote from: drizz on September 13, 2008, 01:52:23 AM
Of course, feel free (anyone) to change it as you like and put it to good use :U

Done, see the float$ thread (http://www.masm32.com/board/index.php?topic=9756.msg72501#msg72501) :U

The MASM Forum Archive 2004 to 2012

General Forums => The Laboratory => Topic started by: jj2007 on September 08, 2008, 08:09:53 AM