News:

MASM32 SDK Description, downloads and other helpful links
MASM32.com New Forum Link
masmforum WebSite

Software based Bilinear filtering

Started by johnsa, August 19, 2009, 02:49:36 PM

Previous topic - Next topic

johnsa

So I've been playing around again with a software texture mapper implementation. I started writing it 100% in C# just to prototype it and get it looking accurate before worrying about moving blocks of code back to native (eventually the whole shader in asm).
So to start with I moved the bilinear texel lookup into asm/SSE. Granted, that made a huge difference already, but it's still not as fast as I think it could be. I tried reorganizing the texture data in a few different formats, including float, and eventually settled on rearranging the data so that the quads (the 4 texels in question) are linear in memory and aligned, so that only a single movdqa is required to load all 4 dword texels. That seemed like the fastest way with the least memory access. Anyhow, below is the lookup code I'm using; can anyone see any improvements or have any thoughts? (It supports tiling too, by mod'ing the texture coords after scaling them to texture size. This code also combines a flat Lambert shade factor; this will fall away later, when it modulates the full R/G/B diffuse component.)


; Bilinear texel lookup with a flat shade factor (MASM syntax, 32-bit, SSE2 + SSSE3 pshufb).
; Reads  : u, v, s, textureBufferPtr, textureWidth / textureHeight (as floats),
;          textureWidthI / textureHeightI, textureWidth16, SingleOne,
;          texelMask1..4, pixelAndMask, pixelMask2 (all defined elsewhere).
; Result : eax = low dword of xmm0 after the final pixelMask2 shuffle.
; Touches: eax, ebx, esi, xmm0-xmm7, flags.
; Lane diagrams below are written | dword3 | dword2 | dword1 | dword0 |.

        ; ---- U axis: scale, split into integer and fractional parts ----
        movss     xmm0, u
        mulss     xmm0, dword ptr textureWidth      ; xmm0 = u * maxU
        movaps    xmm2, xmm0                        ; keep the scaled value
        cvttss2si eax, xmm0                         ; eax = integer part (truncated toward zero)
        cvtsi2ss  xmm0, eax
        subss     xmm2, xmm0                        ; xmm2 = fracu = scaled u - integer part

        ; ---- V axis: same steps ----
        movss     xmm1, v
        mulss     xmm1, dword ptr textureHeight     ; xmm1 = v * maxV
        movaps    xmm3, xmm1
        cvttss2si ebx, xmm1                         ; ebx = integer part (truncated toward zero)
        cvtsi2ss  xmm1, ebx
        subss     xmm3, xmm1                        ; xmm3 = fracv = scaled v - integer part

        ; ---- Address of the 16-byte quad: base + (x AND maskX)*16 + (y AND maskY)*textureWidth16 ----
        mov       esi, textureBufferPtr
        and       eax, textureWidthI                ; presumably a width-1 wrap mask (tiling) - confirm
        shl       eax, 4                            ; 16 bytes per quad
        add       esi, eax
        and       ebx, textureHeightI               ; presumably a height-1 wrap mask (tiling) - confirm
        imul      ebx, textureWidth16
        add       esi, ebx

        ; ---- Scalar complements ----
        ;   w1 = (1-fracu)*(1-fracv)    w2 = fracu*(1-fracv)
        ;   w3 = (1-fracu)*fracv        w4 = fracu*fracv
        movss     xmm0, SingleOne
        movaps    xmm1, xmm0
        subss     xmm0, xmm2                        ; xmm0 = 1.0 - fracu
        subss     xmm1, xmm3                        ; xmm1 = 1.0 - fracv

        ; ---- U factors, alternating per lane ----
        pshufd    xmm4, xmm0, 0                     ; xmm4 = | 1-fracu | 1-fracu | 1-fracu | 1-fracu |
        pshufd    xmm5, xmm2, 0                     ; xmm5 = | fracu   | fracu   | fracu   | fracu   |
        shufps    xmm4, xmm5, 0                     ; xmm4 = | fracu   | fracu   | 1-fracu | 1-fracu |
        pshufd    xmm4, xmm4, 033h                  ; xmm4 = | 1-fracu | fracu   | 1-fracu | fracu   |

        ; ---- V factors, split high/low, then combine ----
        pshufd    xmm5, xmm1, 0                     ; xmm5 = | 1-fracv | 1-fracv | 1-fracv | 1-fracv |
        pshufd    xmm6, xmm3, 0                     ; xmm6 = | fracv   | fracv   | fracv   | fracv   |
        shufps    xmm6, xmm5, 0                     ; xmm6 = | 1-fracv | 1-fracv | fracv   | fracv   |
        mulps     xmm4, xmm6                        ; xmm4 = | w1      | w2      | w3      | w4      |

        ; ---- Broadcast each weight across a full register ----
        pshufd    xmm0, xmm4, 0FFh                  ; xmm0 = w1 in every lane
        pshufd    xmm1, xmm4, 0AAh                  ; xmm1 = w2 in every lane
        pshufd    xmm2, xmm4, 055h                  ; xmm2 = w3 in every lane
        pshufd    xmm3, xmm4, 0                     ; xmm3 = w4 in every lane

        ; ---- Fetch the quad once and make one copy per texel ----
        prefetchnta [esi+128]
        movdqa    xmm4, oword ptr [esi]             ; aligned 16-byte load of the 4 texels
        movaps    xmm5, xmm4
        movaps    xmm6, xmm4
        movaps    xmm7, xmm4

        ; ---- Byte-shuffle each copy with its own mask (masks defined elsewhere;
        ;      presumably each spreads one texel's bytes into four dwords) ----
        pshufb    xmm4, oword ptr texelMask1
        pshufb    xmm5, oword ptr texelMask2
        pshufb    xmm6, oword ptr texelMask3
        pshufb    xmm7, oword ptr texelMask4

        pand      xmm4, oword ptr pixelAndMask
        pand      xmm5, oword ptr pixelAndMask
        pand      xmm6, oword ptr pixelAndMask
        pand      xmm7, oword ptr pixelAndMask

        ; ---- Integer channels -> float ----
        cvtdq2ps  xmm4, xmm4
        cvtdq2ps  xmm5, xmm5
        cvtdq2ps  xmm6, xmm6
        cvtdq2ps  xmm7, xmm7

        ; ---- Weight each texel and accumulate into xmm4 ----
        mulps     xmm4, xmm0
        mulps     xmm5, xmm1
        mulps     xmm6, xmm2
        mulps     xmm7, xmm3
        addps     xmm4, xmm5
        addps     xmm4, xmm6
        addps     xmm4, xmm7

        ; ---- Scale every channel by the flat shade coefficient s ----
        movss     xmm1, s
        pshufd    xmm1, xmm1, 0
        mulps     xmm4, xmm1

        ; ---- Back to integers (truncating), repack via pixelMask2, return in eax ----
        cvttps2dq xmm0, xmm4
        pshufb    xmm0, oword ptr pixelMask2
        movd      eax, xmm0


NightWare

; Bilinear texel lookup with flat shade factor - reordered variant (MASM, 32-bit, SSE2 + SSSE3 pshufb).
; Reads  : u, v, s, textureBufferPtr, textureWidth / textureHeight (as floats),
;          textureWidthI / textureHeightI, textureWidth16, SingleOne,
;          texelMask1..4, pixelAndMask, pixelMask2 (all defined elsewhere).
; Result : eax = low dword of xmm0 after the final pixelMask2 shuffle.
; Touches: eax, ebx, esi, xmm0-xmm7, flags.
; NOTE(review): ebx/esi are callee-saved in the usual Win32 conventions - confirm the
;               enclosing procedure preserves them.
; Lane diagrams are written | dword3 | dword2 | dword1 | dword0 |.
mov esi,textureBufferPtr

movss xmm0,u
movss xmm1,v

mulss xmm0,dword ptr textureWidth ; XMM0 = (u * maxU)
mulss xmm1,dword ptr textureHeight ; XMM1 = (v * maxV)

movaps xmm2,xmm0
movaps xmm3,xmm1

; XMM0/1 become the integral portion of U/V.
; cvttss2si truncates toward zero, so this matches floor() only for non-negative coordinates.
cvttss2si ebx,xmm1
cvttss2si eax,xmm0
cvtsi2ss xmm1,ebx
cvtsi2ss xmm0,eax


; Get fractional portion of final U/V.
subss xmm3,xmm1 ; XMM3 = fracv = (v*maxV)-trunc(v*maxV)
subss xmm2,xmm0 ; XMM2 = fracu = (u*maxU)-trunc(u*maxU)

; Get Final Texture Offset.
and eax,textureWidthI
and ebx,textureHeightI
shl eax,4 ; 16 bytes per quad (one movdqa load)
imul ebx,textureWidth16
add esi,eax
add esi,ebx

; Calculate Single Weights.
; w1 = (1.0f-fracu)*(1.0f-fracv)
; w2 = fracu * (1.0f-fracv)
; w3 = (1.0f-fracu)*fracv
; w4 = fracu*fracv
movss xmm1,SingleOne
movaps xmm0,xmm1
subss xmm0,xmm2 ; XMM0 = 1.0f - fracu
subss xmm1,xmm3 ; XMM1 = 1.0f - fracv

; Build the U factors in xmm6, NOT xmm3: xmm3 still holds fracv and is read a few
; lines below. (Writing them to xmm3 made the V step broadcast fracu instead of
; fracv, giving wrong weights.) xmm6 is free here; it is reloaded with texel data later.
pshufd xmm5,xmm0,0 ; XMM5 = | 1-fracu | 1-fracu | 1-fracu | 1-fracu |
pshufd xmm6,xmm2,0 ; XMM6 = | fracu   | fracu   | fracu   | fracu   |
punpckldq xmm6,xmm5 ; XMM6 = | 1-fracu | fracu   | 1-fracu | fracu   |
; sse3 :
; movss xmm6,xmm2
; punpckldq xmm6,xmm0
; movddup xmm6,xmm6

pshufd xmm5,xmm1,0 ; XMM5 = | 1-fracv | 1-fracv | 1-fracv | 1-fracv |
pshufd xmm4,xmm3,0 ; XMM4 = | fracv   | fracv   | fracv   | fracv   |
movlhps xmm4,xmm5 ; XMM4 = | 1-fracv | 1-fracv | fracv   | fracv   |
mulps xmm4,xmm6 ; XMM4 = | w1      | w2      | w3      | w4      |
; sse3 :
; movss xmm4,xmm3
; movlhps xmm4,xmm1
; movsldup xmm4,xmm4
; mulps xmm4,xmm6

; Shuffle Weights into all entries.
pshufd xmm0,xmm4,11111111b ; xmm0 = | w1 | w1 | w1 | w1 |
pshufd xmm1,xmm4,10101010b ; xmm1 = | w2 | w2 | w2 | w2 |
pshufd xmm2,xmm4,01010101b ; xmm2 = | w3 | w3 | w3 | w3 |
pshufd xmm3,xmm4,00000000b ; xmm3 = | w4 | w4 | w4 | w4 |

; Load the 4 quad texels (one aligned 16-byte load, then one copy per texel).
prefetchnta [esi+128]
movdqa xmm4,oword ptr [esi]
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4

; Same operation issued on four independent registers back to back.
pshufb xmm4,oword ptr texelMask1
pshufb xmm5,oword ptr texelMask2
pshufb xmm6,oword ptr texelMask3
pshufb xmm7,oword ptr texelMask4

pand xmm4,oword ptr pixelAndMask
pand xmm5,oword ptr pixelAndMask
pand xmm6,oword ptr pixelAndMask
pand xmm7,oword ptr pixelAndMask

cvtdq2ps xmm4,xmm4
cvtdq2ps xmm5,xmm5
cvtdq2ps xmm6,xmm6
cvtdq2ps xmm7,xmm7

; Multiply Texel ARGB components By Weights.
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3

;Sum them.
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7

; Apply flat shade coefficient (product lands in xmm1; multiplication is commutative).
movss xmm1,s
pshufd xmm1,xmm1,0
mulps xmm1,xmm4

; Convert ARGBs back to DWORDS (truncating) and get final texel colour in eax.
cvttps2dq xmm0,xmm1
pshufb xmm0,oword ptr pixelMask2
movd eax,xmm0


hi johnsa,
i haven't tested it of course; i just made very minor changes to the code you posted. it's up to you to test it.

for simd, in general :

- try to transform your code so that an instruction uses the same source register as the previous line, when possible (it's faster; it also works for the destination register, but not always...). using the previous source register as the current destination register is also a good choice.

- the work is always done from dword 0 (right) to dword 3 (left) (you must picture for yourself the work done by the cpu, to "see" how the registers should be ordered/used).

- try to pair/triple/quad instructions to avoid register stalls. plus, pairing/quadding instructions will align your code a bit (whatever the size of the instructions, your code will never be totally misaligned). here you should pair them and repeat the operations to free some registers; that will allow you to keep useful values (for example here: pixelAndMask) in registers and load them outside of the algo/loop.

- mul always gives the same result in the destination register (whatever the order of the registers), so don't hesitate to re-use the destination register of the previous instruction:
by the time the source register has been defined/loaded by the cpu, the previous operation on the destination register will have finished (or the stall will be reduced).

- also don't hesitate to use the same register (as both dest and src) for an operation if you don't need to keep the values. example: cvttps2dq xmm0,xmm0 to convert the ARGBs back. there are not enough simd registers, so you must not waste them.

- last general comment (here, for those who want to start with simd): simd was developed for multimedia/video games, so if you need a specific operation on the registers, the corresponding instruction almost certainly exists. if it doesn't, then either you're not doing things the way you should, or you're ahead of everyone else  :toothy