Blending pixel function

Farabi · February 24, 2008, 03:06:05 AM

I made a blending pixel function, the purpose of this function is to made a transparent pixel from 2 pixel.
This function is slow, if you can enhance it, I will be very happy.
If you found this function is usefull without modifying it, your welcome to take it.

Quote
BlendPixel proc uses esi edi pixel:dword, pixel2:dword,percent:dword
   LOCAL r,g,b:dword
   LOCAL r2,g2,b2:dword
   LOCAL c1,c2:dword
   LOCAL rat:Dword

   mov edx,pixel
   mov eax,pixel2

   bswap edx
   bswap eax

   mov c1,edx
   mov c2,eax

   lea esi,c1
   lea edi,c2

   movzx eax,byte ptr[esi+3]
   mov r,eax
   movzx eax,byte ptr[esi+2]
   mov g,eax
   movzx eax,byte ptr[esi+1]
   mov b,eax

   movzx eax,byte ptr[edi+3]
   mov r2,eax
   movzx eax,byte ptr[edi+2]
   mov g2,eax
   movzx eax,byte ptr[edi+1]
   mov b2,eax

   mov ecx,100
   sub ecx,percent
   mov rat,ecx

   invoke fGrad,rat,100,r
   mov r,eax
   invoke fGrad,rat,100,g
   mov g,eax
   invoke fGrad,rat,100,b
   mov b,eax

   invoke fGrad,percent,100,r2
   mov r2,eax
   invoke fGrad,percent,100,g2
   mov g2,eax
   invoke fGrad,percent,100,b2
   mov b2,eax


   mov eax,r
   add r2,eax
   mov eax,g
   add g2,eax
   mov eax,b
   add b2,eax

;   shr r2,1
;   shr g2,1
;   shr b2,1

   mov eax,b2
   shl eax,8
   or eax,g2
   shl eax,8
   or eax,r2


   ret
BlendPixel endp

NightWare · February 24, 2008, 03:35:05 AM

Quote from: Farabi on February 24, 2008, 03:06:05 AM
a transparent pixel from 2 pixel.

i don't know what you really mean here, but if you want a pixel being more or less transparent on a background pixel, then (here MMX) :

Code Select


.DATA
ALIGN 16
Sprite_Simd_Mask_RVB_unpck	DWORD 000FF00FFh,0000000FFh

.CODE
; outside loop :
	pxor MM7,MM7
	movq MM6,QWORD PTR Sprite_Simd_Mask_RVB_unpck	; MM6 = 0,M,0,M , 0,M,0,M
...

; inside loop, if loop (where, here, the level of transparency is determined by the alpha of the color to place, 0 to 255):
		movd MM1,edx		; MM1 = _,_,_,_ , A,R,V,B (color to place)
		movd MM0,DWORD PTR [edi]		; MM0 = _,_,_,_ , A,R,V,B (backbround pixel location)
		punpcklbw MM1,MM7		; MM1 = 0,A,0,R , 0,V,0,B
		punpcklbw MM0,MM7		; MM0 = 0,A,0,R , 0,V,0,B
		movq MM2,MM1		; MM2 = 0,A,0,R , 0,V,0,B
		pshufw MM2,MM2,0FFh	; MM2 = 0,A,0,A , 0,A,0,A
		movq MM3,MM2		; MM3 = 0,A,0,A , 0,A,0,A
		pxor MM3,MM6		; MM3 = 0,~A,0,~A , 0,~A,0,~A
		pmullw MM1,MM2		; MM1 = A*A,R*A , V*A,B*A
		pmullw MM0,MM3		; MM0 = A*0,R*A , V*A,B*A
		paddw MM0,MM1		; MM0 = A*0+A*0,R*A+R*A , V*A+V*A,B*A+B*A
		psrlw MM0,8		; MM0 = 0,A,0,R , 0,V,0,B
		packuswb MM0,MM7		; MM0 = _,_,_,_ , A,R,V,B
		movd DWORD PTR [edi],MM0 ; replace the new color

even if it's not exactly what your looking for, you will certainly find it usefull, and fast...

Farabi · February 24, 2008, 10:12:26 AM

Hai nightware,
Im sure your code is faster but how to implement it to my project and run it? Can you made it as a procedure where the parameter is pixel1, pixel2, percent?
Sorry for my lack of english.

NightWare · February 25, 2008, 03:06:06 AM

untested, but i suppose it should look like :

Code Select

.DATA
ALIGN 16
Sprite_Simd_Mask_RVB_unpck	DWORD 000FF00FFh,0000000FFh

.CODE
BlendPixel proc pixel:dword,pixel2:dword,percent:dword

	mov eax,percent	; 0 to 100
	mov edx,0FFh
	mul edx
	mov edx,42949673
	mul edx			; become 0 to 255 in edx

	shl edx,24		; A,_,_,_

	mov eax,pixel		; _,R,G,B read pixel
	or eax,edx		; A,R,G,B add alpha
	mov edx,pixel2		; _,R,G,B read pixel2

	pxor MM7,MM7		; MM7 = 0,0,0,0 , 0,0,0,0
	movq MM6,QWORD PTR Sprite_Simd_Mask_RVB_unpck	; MM6 = 0,0,0,M , 0,M,0,M
	movd MM1,eax		; MM1 = _,_,_,_ , A,R,G,B (pixel)
	movd MM0,edx		; MM0 = _,_,_,_ , A,R,G,B (pixel2)
	punpcklbw MM1,MM7	; MM1 = 0,A,0,R , 0,G,0,B
	punpcklbw MM0,MM7	; MM0 = 0,A,0,R , 0,G,0,B
	movq MM2,MM1		; MM2 = 0,A,0,R , 0,G,0,B
	pshufw MM2,MM2,0FFh	; MM2 = 0,A,0,A , 0,A,0,A
	movq MM3,MM2		; MM3 = 0,A,0,A , 0,A,0,A
	pxor MM3,MM6		; MM3 = 0,~A,0,~A , 0,~A,0,~A
	pmullw MM1,MM2		; MM1 = A*A,R*A , G*A,B*A
	pmullw MM0,MM3		; MM0 = A*0,R*A , G*A,B*A
	paddw MM0,MM1		; MM0 = A*0+A*0,R*A+R*A , G*A+G*A,B*A+B*A
	psrlw MM0,8		; MM0 = 0,A,0,R , 0,G,0,B
	packuswb MM0,MM7	; MM0 = _,_,_,_ , A,R,G,B
	movd eax,MM0		; A,R,G,B new

	and eax,000FFFFFFh	; _,R,G,B only keep RGB

	ret
BlendPixel endp

asmfan · February 25, 2008, 07:19:26 AM

Have a look on my old experiments.
There factor is 4-word constant n words of wich is used (n - number of color channels, ARGB -4, RGB - 3). Each used word must be [0; 256] to represent all variety 0-255 of Src and Dst color channel.
Blending with factor:

result_channel = (chan1*(256 - factor) + chan2*factor) / 256

Use Olly to understand all weird places of algo.

[attachment deleted by admin]

NightWare · February 25, 2008, 10:19:14 PM

Quote from: asmfan on February 25, 2008, 07:19:26 AM
Each used word must be [0; 256] to represent all variety 0-255 of Src and Dst color channel.

you're right, here corrected :

Code Select

.DATA
ALIGN 16
Sprite_Simd_Mask_RVB_unpck_1		DWORD 000010001h,000000001h
Sprite_Simd_Mask_RVB_unpck_255	DWORD 000FF00FFh,0000000FFh

.CODE
BlendPixel proc pixel:dword,pixel2:dword,percent:dword

	mov eax,percent	; 0 to 100
	mov edx,0FFh
	mul edx
	mov edx,42949673
	mul edx			; become 0 to 255 in edx

	shl edx,24		; A,_,_,_

	mov eax,pixel		; _,R,G,B read pixel
	or eax,edx		; A,R,G,B add alpha
	mov edx,pixel2		; _,R,G,B read pixel2

	pxor MM7,MM7		; MM7 = 0,0,0,0 , 0,0,0,0
	movq MM6,QWORD PTR Sprite_Simd_Mask_RVB_unpck_1	; MM6 = 0,0,0,1 , 0,1,0,1
	movd MM1,eax		; MM1 = _,_,_,_ , A,R,G,B (pixel)
	movd MM0,edx		; MM0 = _,_,_,_ , A,R,G,B (pixel2)
	punpcklbw MM1,MM7	; MM1 = 0,A,0,R , 0,G,0,B
	punpcklbw MM0,MM7	; MM0 = 0,A,0,R , 0,G,0,B
	movq MM2,MM1		; MM2 = 0,A,0,R , 0,G,0,B
	pshufw MM2,MM2,0FFh	; MM2 = 0,A,0,A , 0,A,0,A
	movq MM3,QWORD PTR Sprite_Simd_Mask_RVB_unpck_255	; MM3 = 0,0,0,255 , 0,255,0,255
	psubw MM3,MM2		; MM3 = 0,~A,0,~A , 0,~A,0,~A
	paddw MM2,MM6		; MM2 = 0,A+1,0,A+1 , 0,A+1,0,A+1
	paddw MM3,MM6		; MM3 = 0,~A+1,0,~A+1 , 0,~A+1,0,~A+1
	pmullw MM1,MM2		; MM1 = A*A,R*A , G*A,B*A
	pmullw MM0,MM3		; MM0 = A*0,R*A , G*A,B*A
	paddw MM0,MM1		; MM0 = A*0+A*0,R*A+R*A , G*A+G*A,B*A+B*A
	psrlw MM0,8		; MM0 = 0,A,0,R , 0,G,0,B
	packuswb MM0,MM7	; MM0 = _,_,_,_ , A,R,G,B
	movd eax,MM0		; A,R,G,B new

	and eax,000FFFFFFh	; _,R,G,B only keep RGB

	ret
BlendPixel endp

Mark_Larson · February 26, 2008, 12:24:59 PM

What kind of processor do you have Farabi? If you have an Intel processor, I recommend converting NighttWare's code to doing SSE2.

I have a alphablending function around somewhere that I can try digging up. It has been optimizsed for speed.

Farabi · February 27, 2008, 05:06:10 AM

Quote from: Mark_Larson on February 26, 2008, 12:24:59 PM
What kind of processor do you have Farabi? If you have an Intel processor, I recommend converting NighttWare's code to doing SSE2.

I have a alphablending function around somewhere that I can try digging up. It has been optimizsed for speed.

Intel celleron 1.7 Ghz.

Farabi · February 27, 2008, 05:10:12 AM

I forget to write the fGrad function.

Code Select


fGrad proc x:dword,nMaxx:dword,nNum:dword
	
	xor edx,edx		; 1
	mov eax,nNum	; 2
	mul x			; 42
					
	div nMaxx		; 40
	; 85 clock cycle
	
	ret
fGrad endp

daydreamer · February 27, 2008, 08:14:46 AM

Quote from: Mark_Larson on February 26, 2008, 12:24:59 PM
What kind of processor do you have Farabi? If you have an Intel processor, I recommend converting NighttWare's code to doing SSE2.

I have a alphablending function around somewhere that I can try digging up. It has been optimizsed for speed.

in case of all pixels use the same alphablend value you can unroll it to make use of a pixels only RGBRGBRGBRGB, is faster than work on ARGBARGBARGB pixels

Quote from: Farabi on February 27, 2008, 05:10:12 AM
I forget to write the fGrad function.
Code Select Expand
fGrad proc x:dword,nMaxx:dword,nNum:dword xor edx,edx ; 1 mov eax,nNum ; 2 mul x ; 42 div nMaxx ; 40 ; 85 clock cycle ret fGrad endp

again if you have SSE2 caps and know some math, you can use a series of RPSD and newton-raphson method to get enough precision to your liking and use Doubleprecisionfloat Reciprocal you have 63 bits enough precision instead of 32bit integer div

Mark_Larson · February 27, 2008, 03:28:27 PM

Quote from: daydreamer on February 27, 2008, 08:14:46 AM
Quote from: Mark_Larson on February 26, 2008, 12:24:59 PM
What kind of processor do you have Farabi? If you have an Intel processor, I recommend converting NighttWare's code to doing SSE2.

I have a alphablending function around somewhere that I can try digging up. It has been optimizsed for speed.
in case of all pixels use the same alphablend value you can unroll it to make use of a pixels only RGBRGBRGBRGB, is faster than work on ARGBARGBARGB pixels
Quote from: Farabi on February 27, 2008, 05:10:12 AM
I forget to write the fGrad function.
Code Select Expand
fGrad proc x:dword,nMaxx:dword,nNum:dword xor edx,edx ; 1 mov eax,nNum ; 2 mul x ; 42 div nMaxx ; 40 ; 85 clock cycle ret fGrad endp
again if you have SSE2 caps and know some math, you can use a series of RPSD and newton-raphson method to get enough precision to your liking and use Doubleprecisionfloat Reciprocal you have 63 bits enough precision instead of 32bit integer div

daydreamer can you go into more detail how that would work?

Farabi · March 01, 2008, 01:34:25 PM

Nightware:
Good job, your function is 5 times or more faster than mine. Now I can draw a transparent rectangle in realtime with CPU usage below 5%.

NightWare · March 01, 2008, 09:57:15 PM

Quote from: Farabi on March 01, 2008, 01:34:25 PM
Good job

:wink here i've removed all the useless instructions (-7) of my previous code :

Code Select

.DATA
ALIGN 16
Sprite_Simd_Mask_RVB_unpck_1		DWORD 000010001h,000000001h
Sprite_Simd_Mask_RVB_unpck_256	DWORD 001000100h,000000100h

.CODE
BlendPixel proc pixel:dword,pixel2:dword,percent:dword

	mov eax,percent	; 0 to 100
	mov edx,0FFh
	mul edx
	mov edx,42949673
	mul edx			; become 0 to 255 in edx

	pxor MM7,MM7		; MM7 = 0,0,0,0 , 0,0,0,0
	movd MM2,edx		; MM2 = _,_,_,_ , 0,0,0,A (alpha)
	movd MM1,pixel		; MM1 = _,_,_,_ , _,R,G,B (pixel)
	movd MM0,pixel2	; MM0 = _,_,_,_ , _,R,G,B (pixel2)
	movq MM3,QWORD PTR Sprite_Simd_Mask_RVB_unpck_256	; MM3 = 0,0,0,256 , 0,256,0,256
	pshufw MM2,MM2,040h	; MM2 = 0,0,0,A , 0,A,0,A
	punpcklbw MM1,MM7	; MM1 = 0,_,0,R , 0,G,0,B
	punpcklbw MM0,MM7	; MM0 = 0,_,0,R , 0,G,0,B
	psubw MM3,MM2		; MM3 = 0,0,0,~A+1 , 0,~A+1,0,~A+1
	paddw MM2,QWORD PTR Sprite_Simd_Mask_RVB_unpck_1	; MM2 = 0,0,0,A+1 , 0,A+1,0,A+1
	pmullw MM0,MM3		; MM0 = _*0,R*A , G*A,B*A
	pmullw MM1,MM2		; MM1 = _*0,R*A , G*A,B*A
	paddw MM1,MM0		; MM1 = _*0+_*0,R*A+R*A , G*A+G*A,B*A+B*A
	psrlw MM1,8		; MM1 = 0,0,0,R , 0,G,0,B
	packuswb MM1,MM7	; MM0 = _,_,_,_ , 0,R,G,B
	movd eax,MM1		; 0,R,G,B new

	ret
BlendPixel endp

Quote from: daydreamer on February 27, 2008, 08:14:46 AM
in case of all pixels use the same alphablend value you can unroll it to make use of a pixels only RGBRGBRGBRGB, is faster than work on ARGBARGBARGB pixels

yep, RGBR,GBRG,BRGB is faster, but only on 24 bits screens, on 32 bits you will loose the speed gain when shuffling the bytes. Now, if you use a 24 bits screen, you MUST use this technic to balance the slowdown generated by the misalignment on others algos (sprites/scrolling/etc...).

Quote from: daydreamer on February 27, 2008, 08:14:46 AM
you can use a series of RPSD and newton-raphson method to get enough precision

:wink anyway it's not a good idea, newton-raphson increase the precision, yes, but when you have a math problem wich generate a gpf due to lack of precision, it's just harder/longer to find where is the problem, so it's better to avoid newton-raphson and solve the problem in a different way...

daydreamer · March 03, 2008, 02:29:29 PM

Quote from: NightWare on March 01, 2008, 09:57:15 PM
yep, RGBR,GBRG,BRGB is faster, but only on 24 bits screens, on 32 bits you will loose the speed gain when shuffling the bytes. Now, if you use a 24 bits screen, you MUST use this technic to balance the slowdown generated by the misalignment on others algos (sprites/scrolling/etc...).

Quote from: daydreamer on February 27, 2008, 08:14:46 AM
you can use a series of RPSD and newton-raphson method to get enough precision
:wink anyway it's not a good idea, newton-raphson increase the precision, yes, but when you have a math problem wich generate a gpf due to lack of precision, it's just harder/longer to find where is the problem, so it's better to avoid newton-raphson and solve the problem in a different way...

its pointless to have 32bit in systemram if you are not into heavy usage of load hires transmaps into alphachannel, systemram is the only thing you gonna blend things in if doing it in software and memorybandwidth will have impact on your code more than extra shuffles when doing final transfer from 24bit systemram->32bit vram, due to vram is deadslow to read from

I found another method for division you can use if you have SSE2 you have access to MMX/SSE2 can make use of 64bit mul's you could use that for a fixed point 32:32 bit 1/x mul instead of slow div

u · March 03, 2008, 03:45:48 PM

Actually for 4 years already every videocard supports BitBlt (including the final blit onscreen) in hardware, rendering from sysram-based textures. (letting the cpu do computations while DMA is running). But the only depth any of these cards is the current depth of the screen. Do a benchmark - see how many FPS you can get with a 24bit DIB (fill it with noise on every frame, and then BitBlt it on the main window of your app, just make sure it's a maximized window) versus 32bpp (which is usually the standard since 4 years afaik). Though, I prefer 32bpp because anyway I'm doing lots of alphablending and custom types of blending.

News:

Blending pixel function