The MASM Forum Archive 2004 to 2012

General Forums => The Workshop => Topic started by: jj2007 on November 17, 2009, 04:52:55 PM



Title: Fast bitwise swap?
Post by: jj2007 on November 17, 2009, 04:52:55 PM
Anybody aware of an algo or an instruction, even SSEx, that allows a fast bitwise swap? BSWAP does it bytewise.
XLAT or MOV AL,[BX+AL] is of course an option, but it would require a 256*256 table for a word register...


Title: Re: Fast bitwise swap?
Post by: dedndave on November 17, 2009, 05:18:48 PM
i have wanted the same thing a few times, Jochen
for a word value, you could use a byte table twice and swap bytes
the only thought i have ever had (that would actually work - lol) was to shift out of one register into another
i have never implemented such an algo because i new it would be too slow
there may be some math trick, but it eludes me - lol


Title: Re: Fast bitwise swap?
Post by: qWord on November 17, 2009, 05:54:27 PM
Quote from: jj2007 on November 17, 2009, 04:52:55 PM
... even SSEx ...
that's an job for sse2  :8)
Code:
.data
align 16
msk db 1,1,2,2,4,4,8,8,16,16,32,32,64,64,128,128
.code

mov eax, what ever; e.g.: 0110010001011010011001001y ==> 1001001100101101000100110y

movd xmm0,eax
movdqa xmm4,OWORD ptr msk
pxor xmm5,xmm5
punpcklbw xmm0,xmm0
punpcklwd xmm0,xmm0
movdqa xmm1,xmm0
movdqa xmm2,xmm0
movdqa xmm3,xmm0
psrldq xmm0,12
psrldq xmm1,8
psrldq xmm2,4

pshufd xmm0,xmm0,0 ; xmm0 = byte[3] of eax
pshufd xmm1,xmm1,0 ; xmm1 = byte[2] "  "
pshufd xmm2,xmm2,0 ; xmm2 = byte[1] "  "
pshufd xmm3,xmm3,0 ; xmm3 = byte
  • "  "

    pandn xmm0,xmm4 ; mask bits
    pandn xmm1,xmm4 ;
    pandn xmm2,xmm4 ;
    pandn xmm3,xmm4 ;

    pcmpeqb xmm0,xmm5
    pcmpeqb xmm1,xmm5
    pcmpeqb xmm2,xmm5
    pcmpeqb xmm3,xmm5

    psrlw xmm0,8 ;
    psrlw xmm1,8 ;
    psrlw xmm2,8 ;
    psrlw xmm3,8 ;

    pshufhw xmm0,xmm0,00011011y
    pshufhw xmm1,xmm1,00011011y
    pshufhw xmm2,xmm2,00011011y
    pshufhw xmm3,xmm3,00011011y

    pshuflw xmm0,xmm0,00011011y
    pshuflw xmm1,xmm1,00011011y
    pshuflw xmm2,xmm2,00011011y
    pshuflw xmm3,xmm3,00011011y

    pshufd xmm0,xmm0,01001110y
    pshufd xmm1,xmm1,01001110y
    pshufd xmm2,xmm2,01001110y
    pshufd xmm3,xmm3,01001110y

    packuswb xmm0,xmm1
    packuswb xmm2,xmm3

    pmovmskb eax,xmm0
    pmovmskb edx,xmm2

    shl edx,16
    or eax,edx


  • Title: Re: Fast bitwise swap?
    Post by: qWord on November 17, 2009, 06:05:59 PM
    Quote from: jj2007 on November 17, 2009, 04:52:55 PM
    XLAT or MOV AL,[BX+AL] is of course an option, but it would require a 256*256 table for a word register...

    no, you need only one table (256 byte). For an word you have to make two look ups - when storing the result, you must only swap this two bytes.

    EDIT:
    here is some code (for 32 bit values):
    Code:
    .data

    align 16
    lut LABEL BYTE

    cntr = 0
    REPEAT 256
    .radix 2
    txt TEXTEQU %(cntr)
    .radix 10
    txt2 TEXTEQU <>

    cnt2=@SizeStr(%txt)
    % WHILE cnt2 NE 8
    txt CATSTR <0>,txt
    cnt2=@SizeStr(%txt)
    ENDM

    % FORC chr,<txt>
    txt2 CATSTR <&chr>,txt2
    ENDM
    txt2 TEXTEQU <0>,txt2,<y>
    db txt2
    cntr  = cntr + 1
    ENDM


    .code

    mov eax,123456 ; 00000000000000011110001001000000 ==> 00000010010001111000000000000000
                     

    xor edx,edx
    movzx ecx,al
    mov dl,[lut+ecx]
    shl edx,8
    movzx ecx,ah
    shr eax,16
    mov dl,[lut+ecx]
    shl edx,8
    movzx ecx,al
    mov dl,[lut+ecx]
    shl edx,8
    movzx ecx,ah
    mov dl,[lut+ecx]
    ; result in edx


    Title: Re: Fast bitwise swap?
    Post by: jj2007 on November 17, 2009, 11:14:46 PM
    Quote from: dedndave on November 17, 2009, 05:18:48 PM
    i have wanted the same thing a few times, Jochen
    for a word value, you could use a byte table twice and swap bytes
    the only thought i have ever had (that would actually work - lol) was to shift out of one register into another
    i have never implemented such an algo because i new it would be too slow
    there may be some math trick, but it eludes me - lol


    Dave,
    Speed is not an issue, because you need to create the table only once. For a word size table, that's around 1 millisecond. Here is my first attempt, 20 bytes long:

    Code:
    include \masm32\include\masm32rt.inc

    .data?
    bufSrc db 36 dup(?)
    bufDest db 36 dup(?)

    .code
    start:
    xor ebx, ebx ; clear outer counter

    L0: m2m esi, 16 ; set inner counter
    mov eax, ebx ; the source word

    L1: rol ax, 1 ; get a carry flag in case the high order bit is set
    rcr dx, 1 ; and pull it in from the left
    dec esi
    jne L1

    .if ebx>65515 || ebx<20 ; show some nice results
    mov edi, edx
    mov esi, eax
    print str$(ebx), 9
    invoke dw2bin_ex, esi, offset bufSrc
    invoke dw2bin_ex, edi, offset bufDest
    print offset bufSrc+16, 32
    print offset bufDest+16, 13, 10
    .endif

    inc bx
    jne L0

    L2: print "Size="
    mov eax, L2
    sub eax, start
    sub eax, 110 ; correct size for print routine
    print str$(eax)
    exit
    end start

    Output:
    Code:
    0       0000000000000000 0000000000000000
    1       0000000000000001 1000000000000000
    2       0000000000000010 0100000000000000
    3       0000000000000011 1100000000000000
    4       0000000000000100 0010000000000000
    5       0000000000000101 1010000000000000
    6       0000000000000110 0110000000000000
    7       0000000000000111 1110000000000000
    8       0000000000001000 0001000000000000
    9       0000000000001001 1001000000000000
    10      0000000000001010 0101000000000000
    11      0000000000001011 1101000000000000
    12      0000000000001100 0011000000000000
    13      0000000000001101 1011000000000000
    14      0000000000001110 0111000000000000
    15      0000000000001111 1111000000000000
    16      0000000000010000 0000100000000000
    17      0000000000010001 1000100000000000
    18      0000000000010010 0100100000000000
    19      0000000000010011 1100100000000000
    65516   1111111111101100 0011011111111111
    65517   1111111111101101 1011011111111111
    65518   1111111111101110 0111011111111111
    65519   1111111111101111 1111011111111111
    65520   1111111111110000 0000111111111111
    65521   1111111111110001 1000111111111111
    65522   1111111111110010 0100111111111111
    65523   1111111111110011 1100111111111111
    65524   1111111111110100 0010111111111111
    65525   1111111111110101 1010111111111111
    65526   1111111111110110 0110111111111111
    65527   1111111111110111 1110111111111111
    65528   1111111111111000 0001111111111111
    65529   1111111111111001 1001111111111111
    65530   1111111111111010 0101111111111111
    65531   1111111111111011 1101111111111111
    65532   1111111111111100 0011111111111111
    65533   1111111111111101 1011111111111111
    65534   1111111111111110 0111111111111111
    65535   1111111111111111 1111111111111111


    Title: Re: Fast bitwise swap?
    Post by: dedndave on November 17, 2009, 11:56:04 PM
    oh - lol
    i didn't know this was for table creation - that makes it nice
    you are shifting out of and into word size registers
    i would go backwards, so you could use dword size registers - no size override bytes

    L1:    shr    eax,1
            rcl    edx,1
            dec    esi
            jnz    L1


    Title: Re: Fast bitwise swap?
    Post by: jj2007 on November 18, 2009, 12:17:29 AM
    Quote from: dedndave on November 17, 2009, 11:56:04 PM
    i would go backwards, so you could use dword size registers - no size override bytes


    Hmmm... did you test it? Doesn't work for me...


    Title: Re: Fast bitwise swap?
    Post by: dedndave on November 18, 2009, 12:46:13 AM
    huh ?
    it should work fine - try it again


    Title: Re: Fast bitwise swap?
    Post by: dedndave on November 18, 2009, 12:50:52 AM
    i get nothing out of your routine at all, my friend
    ok - now i got it - lol
    it can't be named "shift.exe" i guess i have that program someplace - lol
    the pattern looks cooler on a black console window than it does in here

    my code works fine, as long as you do not need the original value
    try this...

            push    eax
    L1:    shr    eax,1
            rcl    edx,1
            dec    esi
            jnz    L1
            pop    eax


    Title: Re: Fast bitwise swap?
    Post by: jj2007 on November 18, 2009, 01:04:31 AM
    Can you post a complete example? Mine is implemented in the bin2byte thread (http://www.masm32.com/board/index.php?topic=12719.msg98155#msg98155) now.


    Title: Re: Fast bitwise swap?
    Post by: dedndave on November 18, 2009, 01:16:20 AM

    wtCreate: m2m esi, 16 ; set inner counter
    mov edx, ebx ; the source word

    @@:
      shr edx,1  ;was  rol dx, 1 ; get a carry flag in case the high order bit is set
      rcl eax,1  ;was  rcr ax, 1 ; and pull it in from the left
          dec esi
          jne @B
    mov edx,ebx  ;added

    i am not sure if you need the original copy in edx


    Title: Re: Fast bitwise swap?
    Post by: dedndave on November 18, 2009, 01:34:32 AM
    i think i see why it may not have worked for you
    in the post above, you have

    mov eax, ebx ; the source word

    L1: rol ax, 1 ; get a carry flag in case the high order bit is set
    rcr dx, 1 ; and pull it in from the left
    dec esi
    jne L1

    so my code was

    L1:    shr    eax,1
            rcl    edx,1
            dec    esi
            jnz    L1

    in Bin2Byte, you swapped the AX/DX registers to

    @@:  rol dx, 1      ; get a carry flag in case the high order bit is set
          rcr ax, 1      ; and pull it in from the left
          dec esi
          jne @B


    Title: Re: Fast bitwise swap?
    Post by: MichaelW on November 18, 2009, 02:30:02 AM
    In my tests of a 32-bit operation the code was fastest (65 cycles versus 101) if I unrolled the loop completely:
    Code:
    REPEAT 32
        shr edx, 1
        rcl eax, 1
    ENDM
    And the P3 cycle counts were 65 versus 30 for an 8-bit table lookup.


    Title: Re: Fast bitwise swap?
    Post by: sinsi on November 18, 2009, 06:13:32 AM
    qWord, I get 7 clocks for your code, this is 4
    Code:
    movzx ecx,al
    mov dh,[lut2+ecx]
    movzx ecx,ah
    shr eax,16
    mov dl,[lut2+ecx]
    shl edx,16
    movzx ecx,al
    mov dh,[lut2+ecx]
    movzx ecx,ah
    mov dl,[lut2+ecx]
    ; result in edx
    Still your code though.

    How about a 4-bit look-up? 16 bytes vs 256 - I am thinking of size here, since 4 clocks is fast enough.


    Title: Re: Fast bitwise swap?
    Post by: jj2007 on November 18, 2009, 09:11:00 AM
    Quote from: dedndave on November 18, 2009, 01:34:32 AM
    i think i see why it may not have worked for you


    Yeah, I know. I had tested that, and tried all kinds of combinations, but probably I was just too tired. Now it works, thanxalot :thumbu


    Title: Re: Fast bitwise swap?
    Post by: dedndave on November 18, 2009, 02:47:12 PM
    i would think a 256 byte LUT would be a good comprimise between size and speed
    really, how fast does this thing need to be ?
    just my 2 cents - lol
    now, you are 2 cents richer, i guess  :bg
    don't spend it all at once


    Title: Re: Fast bitwise swap?
    Post by: drizz on November 19, 2009, 01:02:31 AM
    Hello everyone,
    :8)
    Code:
    MirrorEAX:
    bswap eax
    mov edx,00001111000011110000111100001111b
    and edx,eax
    and eax,11110000111100001111000011110000b
    shl edx,4
    shr eax,4
    or eax,edx
    mov edx,00110011001100110011001100110011b
    and edx,eax
    and eax,11001100110011001100110011001100b
    shr eax,2
    lea eax,[edx*4+eax]; shl edx,2; or eax,edx
    mov edx,01010101010101010101010101010101b
    and edx,eax
    and eax,10101010101010101010101010101010b
    shr eax,1
    lea eax,[edx*2+eax]; shl edx,1; or eax,edx


    Title: Re: Fast bitwise swap?
    Post by: dedndave on November 19, 2009, 01:55:54 AM
    very cool, Drizz  :U
    i will have to remember that one - lol
    did you just make that up ? - or have you used it before ?


    Title: Re: Fast bitwise swap?
    Post by: MichaelW on November 19, 2009, 03:57:06 AM
    The attachment tests everything except the SSE2 code, all for 32-bit values and encapsulated in procedures without a stack frame.
    Cycle counts on a P3:
    Code:
    27 cycles, qword 256-byte-lut, 313 bytes
    101 cycles, dave shr-rcl-loop, 19 bytes
    12 cycles, drizz stir-fry, 64 bytes
    13 cycles, drizz/lingo, 68 bytes


    Title: Re: Fast bitwise swap?
    Post by: sinsi on November 19, 2009, 04:06:31 AM
    Q6600
    Code:
    7 cycles, qword 256-byte-lut, 313 bytes
    63 cycles, dave shr-rcl-loop, 19 bytes
    6 cycles, drizz stir-fry, 64 bytes

    One day drizz you will comment your code so we know what in the hell you are on about  :bg very cool.


    Title: Re: Fast bitwise swap?
    Post by: lingo on November 19, 2009, 07:08:51 AM
    MichaelW,
    Please include this in your test as a bitswap_drizz_1:(not tested) :wink
    Code:
    align 16
    bitswap_drizz proc ddval:DWORD
    pop ecx
    pop eax
    bswap        eax
    mov edx,eax
    and eax,11110000111100001111000011110000b
    shr eax,4
    and edx,00001111000011110000111100001111b
    shl edx,4
    mov ecx, 00110011001100110011001100110011b
    or eax, edx
    mov edx, eax
    and eax,11001100110011001100110011001100b
    shr eax,2
    and edx,ecx
    lea eax,[edx*4+eax]; shl edx,2; or eax,edx
    mov ecx,01010101010101010101010101010101b
            mov             edx,eax
            and eax,10101010101010101010101010101010b
            shr eax,1
    and edx,ecx
    lea eax,[edx*2+eax]; shl edx,1; or eax,edx
    jmp dword ptr [esp-2*4]


    Title: Re: Fast bitwise swap?
    Post by: sinsi on November 19, 2009, 07:26:09 AM
    Code:
    7 cycles, qword 256-byte-lut, 313 bytes
    63 cycles, dave shr-rcl-loop, 19 bytes
    6 cycles, drizz stir-fry, 68 bytes
    Replaced. 4 bytes more, jj will not be happy, jan.

    edit:replaced into the drizz stir-fry, sorry lingo. :red


    Title: Re: Fast bitwise swap?
    Post by: lingo on November 19, 2009, 02:30:08 PM
    sinsi,
    Please test it for me too..:
    The code is by bitRake and Nexo :wink
    Code:

    align 16
    bitNexo proc ddval:DWORD
    pop  edx
    pop  eax
    mov  edx,eax
    shr  eax,1
    and  edx,055555555h
    and  eax,055555555h
    lea  eax,[2*edx+eax]
    mov  edx,eax
    shr  eax,2
    and  edx,033333333h
    and  eax,033333333h
    lea  eax,[4*edx+eax]
    mov  edx,eax
    shr  eax,4
    and  edx,0F0F0F0Fh
    and  eax,0F0F0F0Fh
    shl  edx,4
    add  eax,edx
    bswap eax
    jmp  dword ptr [esp-2*4]


    Title: Re: Fast bitwise swap?
    Post by: drizz on November 19, 2009, 02:37:56 PM
    Quote from: dedndave on November 19, 2009, 01:55:54 AM
    did you just make that up ? - or have you used it before ?
    I remembered that i used similar technique for bit count (popcnt), so I just applied that for mirroring.
    Quote from: sinsi on November 19, 2009, 04:06:31 AM
    One day drizz you will comment your code so we know what in the hell you are on about  :bg very cool.
    Code:
    0123 4567 89AB CDEF | GHIJ KLMN OPQR STUV
    ------------------------------------------- swap 16 bits _
    GHIJ KLMN OPQR STUV | 0123 4567 89AB CDEF                 \ bswap
    ------------------------------------------- swap 8 bits  _/
    OPQR STUV GHIJ KLMN | 89AB CDEF 0123 4567
    ------------------------------------------- swap 4 bits
    STUV OPQR KLMN GHIJ | CDEF 89AB 4567 0123
    ------------------------------------------- swap 2 bits
    UVST QROP MNKL IJGH | EFCD AB89 6745 2301
    ------------------------------------------- swap 1 bit
    VUTS RQPO NMLK JIHG | FEDC BA98 7654 3210


    Title: Re: Fast bitwise swap?
    Post by: dedndave on November 19, 2009, 03:09:36 PM
            lea    [reg32+n*reg32]

    for n <> 1, it isn't a very fast instruction on many processors


    Title: Re: Fast bitwise swap?
    Post by: lingo on November 19, 2009, 03:51:07 PM
    You can try without lea..:
    Code:
    align 16
    swapbits proc ddval:DWORD
    pop  ecx
    pop  eax
            bswap   eax           
    mov     edx,eax       
    and     eax,00F0F0F0Fh
    xor     edx,eax       
    rol     eax,8           
    or      eax,edx
    mov     edx,eax
    and     eax,033333333h
    xor     edx,eax
    rol     eax,4
    or      eax,edx       
    mov     edx,eax       
    and     eax,055555555h
    xor     edx,eax
    rol     eax,2
    or      eax,edx
    ror     eax,7           
    jmp     ecx


    Title: Re: Fast bitwise swap?
    Post by: dedndave on November 19, 2009, 05:35:59 PM
    i like how you TRIED to sneak those XOR's in there on us - lol
    that is the best non-simd code yet, i think


    Title: Re: Fast bitwise swap?
    Post by: sinsi on November 20, 2009, 07:21:40 AM
    Quote from: lingo on November 19, 2009, 03:51:07 PM
    Code:
    align 16
    swapbits proc ddval:DWORD
    pop ecx
    pop eax
    ...
    jmp ecx

    Just a question, aren't cpu's optimised for call/ret?
    Wouldn't a simple "mov eax,[esp+4]" with a "ret 4" be quicker?


    Title: Re: Fast bitwise swap?
    Post by: jj2007 on November 20, 2009, 08:00:42 AM
    Quote from: MichaelW on November 19, 2009, 03:57:06 AM
    The attachment tests everything except the SSE2 code, all for 32-bit values and encapsulated in procedures without a stack frame.
    Cycle counts on a P3:
    Code:
    27 cycles, qword 256-byte-lut, 313 bytes
    101 cycles, dave shr-rcl-loop, 19 bytes
    12 cycles, drizz stir-fry, 64 bytes
    13 cycles, drizz/lingo, 68 bytes



    Code:
    bitswap_qword endp
    size_qword = $-bitswap_qword+256
    Interesting technique to get the code size, thanks Michael. It fails for ml 9.0 but JWasm and ml 6.14 are fine.
    My timings on Celeron M:
    Code:
    9 cycles, qword 256-byte-lut, 313 bytes
    114 cycles, dave shr-rcl-loop, 19 bytes
    9 cycles, drizz stir-fry, 64 bytes
    9 cycles, drizz/lingo, 68 bytes
    New algos proposed by Lingo:
    Code:
    9      cycles for swapbits
    7      cycles for bitNexo
    9      cycles for bitswap_drizz


    Title: Re: Fast bitwise swap?
    Post by: lingo on November 21, 2009, 03:37:46 PM
    "Wouldn't a simple "mov eax,[esp+4]" with a "ret 4" be quicker?"

    With my CPU NO...I like drizz's algo so, just copied and changed "mov eax,[esp+4]"&"ret 4"
    with "pop ecx"&"jmp ecx" and you can see the difference:
    Code:

    C:\My Documents\ASM\bitswap>test
    00010010 00110100 01010110 01111000
    00011110 01101010 00101100 01001000
    00011110 01101010 00101100 01001000
    00011110 01101010 00101100 01001000

    8 cycles, qword 256-byte-lut, 313 bytes
    6 cycles, drizz stir-fry, 64 bytes
    3 cycles, drizzNew stir-fry, 61 bytes

    Press any key to exit...

    3 cycles and 3 bytes less. :wink


    Title: Re: Fast bitwise swap?
    Post by: jj2007 on November 21, 2009, 05:59:40 PM
    Celeron M:
    Code:
    9 cycles, qword 256-byte-lut, 313 bytes
    9 cycles, drizz stir-fry, 64 bytes
    8 cycles, drizzNew stir-fry, 61 bytes


    Title: Re: Fast bitwise swap?
    Post by: drizz on November 21, 2009, 08:50:20 PM
    AMD Opteron(tm) Processor 148    (SSE3)
    Code:
    5 cycles, qword 256-byte-lut, 313 bytes
    4 cycles, drizz stir-fry, 64 bytes
    4 cycles, drizzNew stir-fry, 61 bytes


    Title: Re: Fast bitwise swap?
    Post by: WryBugz on November 25, 2009, 10:14:34 PM
    10 cycles, qword 256 byte lut
    8 cycles, drizz stir fry
    5 cycles drizznew stir fry

    I lose!

    So much for Intel Core I-7
    Anyone got an xt I can have?


    The MASM Forum Archive 2004 to 2012 | Powered by SMF 1.0.12.
    © 2001-2005, Lewis Media. All Rights Reserved.