; mini2dgl / avx2.s -- overkill 32-bit AVX2 assembly helpers
; (blit, clear-color, min/max, alpha blend)
; vim: set filetype=fasm foldmethod=marker commentstring=;%s colorcolumn=101 :
; 32-bit ELF object assembled with fasmg; hand-written AVX2 blit/clear/blend routines.
include 'x64.inc'
include 'format/elf32.inc'
include 'ext/avx2.inc'
use32
; NOTE(review): the code section is writeable as well as executable -- confirm
; W^X is not required in the target environment.
section '.text' executable writeable align 64
;***************************************************************************************************
; glTexXImage2D_avx2(input *) -- cdecl, single stack argument pointing at the input block below.
; Copies a (dst.width.pixs_n x height.lines_n) rectangle of 32-bit pixels from src into dst
; at (dst.x_offset, dst.y_offset), line by line, with aggressively unrolled AVX2 copies.
; Clobbers: eax, ecx, edx, ymm0-ymm7, flags. ebx/edi/esi/ebp are saved and restored.
public glTexXImage2D_avx2
align 16
glTexXImage2D_avx2: namespace glTexXImage2D_avx2
;---------------------------------------------------------------------------------------------------
; input block offsets (all pixels are 4 bytes)
dst := 0                    ; destination base pointer
dst.x_offset := 4
dst.y_offset := 8
dst.width.pixs_n := 12      ; the real width of the copied rectangle
dst.line.pixs_n := 16       ; destination stride, in pixels
src := 20                   ; source base pointer
src.line.pixs_n := 24       ; source stride, in pixels (the src width is the dst width)
height.lines_n := 28
;---------------------------------------------------------------------------------------------------
; stack
; NOTE(review): locals and register saves live BELOW esp without adjusting esp;
; a signal delivered on this thread could clobber them -- confirm this is acceptable.
;---- frame
define dst.adjust esp + 4 * -6
define src.adjust esp + 4 * -5
define ebx_save esp + 4 * -4
define edi_save esp + 4 * -3
define esi_save esp + 4 * -2
define ebp_save esp + 4 * -1
;--- caller
define ret_addr esp + 4 * 0
define input esp + 4 * 1
;---------------------------------------------------------------------------------------------------
	mov [ebx_save], ebx
	mov [edi_save], edi
	mov [esi_save], esi
	mov [ebp_save], ebp
; input (right after dst adjust, src adjust, ebx, edi, esi, ebp, "return addr")
	mov ebx, dword [input]
	mov edi, dword [ebx + dst]
	mov esi, dword [ebx + src]
; dst --------------------------------------------------------------------------------------
	mov eax, dword [ebx + dst.line.pixs_n]
	shl eax, 2                          ; pixels -> bytes
	mov ecx, eax                        ; ecx = dst.line.bytes_n
	mul dword [ebx + dst.y_offset]      ; edx:eax = byte offset of the first dst rect line
	add edi, eax                        ; edi -> first pixel of that line
	mov eax, dword [ebx + dst.x_offset]
	shl eax, 2                          ; = dst.x_offset.bytes_n
	add edi, eax                        ; edi -> first pixel of the dst rect
; adjust value: from past-the-end of a dst rect row to the start of the row on the next line
	mov ebp, [ebx + dst.width.pixs_n]
	shl ebp, 2                          ; ebp = dst.width.bytes_n (kept live for the whole copy)
	sub ecx, ebp                        ; = dst.line.bytes_n - dst.width.bytes_n
	mov dword [dst.adjust], ecx
; src --------------------------------------------------------------------------------------
	mov edx, dword [ebx + src.line.pixs_n]
	shl edx, 2                          ; = src.line.bytes_n
; adjust value: from past-the-end of a src rect row to the start of the row on the next line
	sub edx, ebp                        ; = src.line.bytes_n - dst.width.bytes_n
	mov dword [src.adjust], edx
; ------------------------------------------------------------------------------------------
	mov ebx, [ebx + height.lines_n]     ; ebx = remaining line count
	align_nops
next_line:
	mov ecx, ebp                        ; ecx = dst.width.bytes_n left to copy on this line
	cmp ecx, 32 * 8
	jb cpy_32_7
; aggressive unrolled line cpy (useless from AMD zen3 with optimized REP instructions)
	align_nops
cpy_32_8:
	vmovdqu ymm0, yword [esi + 32 * 0]
	vmovdqu ymm1, yword [esi + 32 * 1]
	vmovdqu ymm2, yword [esi + 32 * 2]
	vmovdqu ymm3, yword [esi + 32 * 3]
	vmovdqu ymm4, yword [esi + 32 * 4]
	vmovdqu ymm5, yword [esi + 32 * 5]
	vmovdqu ymm6, yword [esi + 32 * 6]
	vmovdqu ymm7, yword [esi + 32 * 7]
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	vmovdqu yword [edi + 32 * 4], ymm4
	vmovdqu yword [edi + 32 * 5], ymm5
	vmovdqu yword [edi + 32 * 6], ymm6
	vmovdqu yword [edi + 32 * 7], ymm7
	sub ecx, 32 * 8
	add edi, 32 * 8
	add esi, 32 * 8
	cmp ecx, 32 * 8
	jae cpy_32_8
	align_nops
; below: one-shot tail copies in decreasing sizes; each label handles at most one chunk
cpy_32_7:
	cmp ecx, 32 * 7
	jb cpy_32_6
	vmovdqu ymm0, yword [esi + 32 * 0]
	vmovdqu ymm1, yword [esi + 32 * 1]
	vmovdqu ymm2, yword [esi + 32 * 2]
	vmovdqu ymm3, yword [esi + 32 * 3]
	vmovdqu ymm4, yword [esi + 32 * 4]
	vmovdqu ymm5, yword [esi + 32 * 5]
	vmovdqu ymm6, yword [esi + 32 * 6]
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	vmovdqu yword [edi + 32 * 4], ymm4
	vmovdqu yword [edi + 32 * 5], ymm5
	vmovdqu yword [edi + 32 * 6], ymm6
	sub ecx, 32 * 7
	add edi, 32 * 7
	add esi, 32 * 7
	align_nops
cpy_32_6:
	cmp ecx, 32 * 6
	jb cpy_32_5
	vmovdqu ymm0, yword [esi + 32 * 0]
	vmovdqu ymm1, yword [esi + 32 * 1]
	vmovdqu ymm2, yword [esi + 32 * 2]
	vmovdqu ymm3, yword [esi + 32 * 3]
	vmovdqu ymm4, yword [esi + 32 * 4]
	vmovdqu ymm5, yword [esi + 32 * 5]
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	vmovdqu yword [edi + 32 * 4], ymm4
	vmovdqu yword [edi + 32 * 5], ymm5
	sub ecx, 32 * 6
	add edi, 32 * 6
	add esi, 32 * 6
	align_nops
cpy_32_5:
	cmp ecx, 32 * 5
	jb cpy_32_4
	vmovdqu ymm0, yword [esi + 32 * 0]
	vmovdqu ymm1, yword [esi + 32 * 1]
	vmovdqu ymm2, yword [esi + 32 * 2]
	vmovdqu ymm3, yword [esi + 32 * 3]
	vmovdqu ymm4, yword [esi + 32 * 4]
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	vmovdqu yword [edi + 32 * 4], ymm4
	sub ecx, 32 * 5
	add edi, 32 * 5
	add esi, 32 * 5
	align_nops
cpy_32_4:
	cmp ecx, 32 * 4
	jb cpy_32_3
	vmovdqu ymm0, yword [esi + 32 * 0]
	vmovdqu ymm1, yword [esi + 32 * 1]
	vmovdqu ymm2, yword [esi + 32 * 2]
	vmovdqu ymm3, yword [esi + 32 * 3]
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	sub ecx, 32 * 4
	add edi, 32 * 4
	add esi, 32 * 4
	align_nops
cpy_32_3:
	cmp ecx, 32 * 3
	jb cpy_32_2
	vmovdqu ymm0, yword [esi + 32 * 0]
	vmovdqu ymm1, yword [esi + 32 * 1]
	vmovdqu ymm2, yword [esi + 32 * 2]
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	sub ecx, 32 * 3
	add edi, 32 * 3
	add esi, 32 * 3
	align_nops
cpy_32_2:
	cmp ecx, 32 * 2
	jb cpy_32
	vmovdqu ymm0, yword [esi + 32 * 0]
	vmovdqu ymm1, yword [esi + 32 * 1]
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	sub ecx, 32 * 2
	add edi, 32 * 2
	add esi, 32 * 2
	align_nops
cpy_32:
	cmp ecx, 32
	jb cpy_16
	vmovdqu ymm0, yword [esi + 32 * 0]
	vmovdqu yword [edi + 32 * 0], ymm0
	sub ecx, 32
	add edi, 32
	add esi, 32
	align_nops
cpy_16:
	cmp ecx, 16
	jb cpy_8
	vmovdqu xmm0, xword [esi]
	vmovdqu xword [edi], xmm0
	sub ecx, 16
	add edi, 16
	add esi, 16
	align_nops
cpy_8:
	cmp ecx, 8
	jb cpy_4
	mov eax, dword [esi + 4 * 0]
	mov edx, dword [esi + 4 * 1]
	mov dword [edi + 4 * 0], eax
	mov dword [edi + 4 * 1], edx
	sub ecx, 8
	add edi, 8
	add esi, 8
	align_nops
cpy_4:
	cmp ecx, 4
	jb cpy_2
	mov eax, dword [esi]
	mov dword [edi], eax
	sub ecx, 4
	add edi, 4
	add esi, 4
	align_nops
; NOTE(review): widths are whole 4-byte pixels, so the 2- and 1-byte tails below
; look unreachable -- kept for safety / future sub-pixel use.
cpy_2:
	cmp ecx, 2
	jb cpy_1
	mov ax, word [esi]
	mov word [edi], ax
	sub ecx, 2
	add edi, 2
	add esi, 2
	align_nops
cpy_1:
	test ecx, ecx
	jz prepare_next_line
	mov al, byte [esi]
	mov byte [edi], al
	inc edi
	inc esi
	align_nops
prepare_next_line:
	dec ebx                             ; height--
	jz epilog
	add edi, dword [dst.adjust]
	add esi, dword [src.adjust]
	jmp next_line
	align_nops
epilog:
	mov ebx, [ebx_save]
	mov edi, [edi_save]
	mov esi, [esi_save]
	mov ebp, [ebp_save]
	vzeroupper ; end of AVX2 code
	ret ; FIX: was missing -- without it execution falls through into the next routine
end namespace ; glTexXImage2D_avx2
;***************************************************************************************************
; clearcolor_avx2(input *) -- cdecl, single stack argument pointing at the input block below.
; Fills a (dst.width.pixs_n x dst.height.lines_n) rectangle of 32-bit pixels with the
; hard-coded clear color, line by line, with aggressively unrolled AVX2 stores.
; Clobbers: eax, ecx, ymm0-ymm7, flags. ebx/edi/esi/ebp are saved and restored.
public clearcolor_avx2
align 16
clearcolor_avx2: namespace clearcolor_avx2
;---------------------------------------------------------------------------------------------------
; input block offsets
dst := 0                    ; destination base pointer
dst.width.pixs_n := 4
dst.line.bytes_n := 8       ; destination stride, in bytes
dst.height.lines_n := 12
; the fill color, named instead of repeated as a magic number (0xAARRGGBB-style dword)
clear_color := 0xff252525
;---------------------------------------------------------------------------------------------------
; stack
; NOTE(review): register saves live BELOW esp without adjusting esp -- see glTexXImage2D_avx2.
;---- frame
define ebx_save esp + 4 * -4
define edi_save esp + 4 * -3
define esi_save esp + 4 * -2
define ebp_save esp + 4 * -1
;--- caller
define ret_addr esp + 4 * 0
define input esp + 4 * 1
;---------------------------------------------------------------------------------------------------
; broadcast the clear color into ymm0-ymm7 for the unrolled stores
	mov eax, clear_color
	vmovd xmm0, eax
	vpbroadcastd ymm0, xmm0
	vmovdqu ymm1, ymm0
	vmovdqu ymm2, ymm0
	vmovdqu ymm3, ymm0
	vmovdqu ymm4, ymm0
	vmovdqu ymm5, ymm0
	vmovdqu ymm6, ymm0
	vmovdqu ymm7, ymm0
	mov [ebx_save], ebx
	mov [edi_save], edi
	mov [esi_save], esi
	mov [ebp_save], ebp
; input (right after ebx, edi, esi, ebp, "return addr")
	mov ebx, dword [input]
	mov edi, dword [ebx + dst]
; dst --------------------------------------------------------------------------------------
	mov ebp, [ebx + dst.width.pixs_n]
	mov esi, [ebx + dst.line.bytes_n]
	shl ebp, 2                          ; ebp = dst.width.bytes_n
; adjust value: from past-the-end of a dst rect row to the start of the row on the next line
	sub esi, ebp                        ; esi = dst.line.bytes_n - dst.width.bytes_n
; ------------------------------------------------------------------------------------------
	mov ebx, [ebx + dst.height.lines_n] ; ebx = remaining line count
	align_nops
next_line:
	mov ecx, ebp                        ; ecx = dst.width.bytes_n left to fill on this line
	cmp ecx, 32 * 8
	jb cpy_32_7
; aggressive unrolled line cpy (useless from AMD zen3 with optimized REP instructions)
	align_nops
cpy_32_8:
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	vmovdqu yword [edi + 32 * 4], ymm4
	vmovdqu yword [edi + 32 * 5], ymm5
	vmovdqu yword [edi + 32 * 6], ymm6
	vmovdqu yword [edi + 32 * 7], ymm7
	sub ecx, 32 * 8
	add edi, 32 * 8
	cmp ecx, 32 * 8
	jae cpy_32_8
	align_nops
; below: one-shot tail fills in decreasing sizes; each label handles at most one chunk
cpy_32_7:
	cmp ecx, 32 * 7
	jb cpy_32_6
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	vmovdqu yword [edi + 32 * 4], ymm4
	vmovdqu yword [edi + 32 * 5], ymm5
	vmovdqu yword [edi + 32 * 6], ymm6
	sub ecx, 32 * 7
	add edi, 32 * 7
	align_nops
cpy_32_6:
	cmp ecx, 32 * 6
	jb cpy_32_5
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	vmovdqu yword [edi + 32 * 4], ymm4
	vmovdqu yword [edi + 32 * 5], ymm5
	sub ecx, 32 * 6
	add edi, 32 * 6
	align_nops
cpy_32_5:
	cmp ecx, 32 * 5
	jb cpy_32_4
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	vmovdqu yword [edi + 32 * 4], ymm4
	sub ecx, 32 * 5
	add edi, 32 * 5
	align_nops
cpy_32_4:
	cmp ecx, 32 * 4
	jb cpy_32_3
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	vmovdqu yword [edi + 32 * 3], ymm3
	sub ecx, 32 * 4
	add edi, 32 * 4
	align_nops
cpy_32_3:
	cmp ecx, 32 * 3
	jb cpy_32_2
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	vmovdqu yword [edi + 32 * 2], ymm2
	sub ecx, 32 * 3
	add edi, 32 * 3
	align_nops
cpy_32_2:
	cmp ecx, 32 * 2
	jb cpy_32
	vmovdqu yword [edi + 32 * 0], ymm0
	vmovdqu yword [edi + 32 * 1], ymm1
	sub ecx, 32 * 2
	add edi, 32 * 2
	align_nops
cpy_32:
	cmp ecx, 32
	jb cpy_16
	vmovdqu yword [edi + 32 * 0], ymm0
	sub ecx, 32
	add edi, 32
	align_nops
cpy_16:
	cmp ecx, 16
	jb cpy_8
	vmovdqu xword [edi], xmm0
	sub ecx, 16
	add edi, 16
	align_nops
cpy_8:
	cmp ecx, 8
	jb cpy_4
	mov dword [edi + 4 * 0], clear_color
	mov dword [edi + 4 * 1], clear_color
	sub ecx, 8
	add edi, 8
	align_nops
cpy_4:
	test ecx, ecx                       ; widths are whole pixels: remainder is 0 or 4 here
	jz prepare_next_line
	mov dword [edi], clear_color
	add edi, 4
	align_nops
prepare_next_line:
	dec ebx                             ; height--
	jz epilog
	add edi, esi                        ; skip to the start of the rect on the next line
	jmp next_line
	align_nops
epilog:
	mov ebx, [ebx_save]
	mov edi, [edi_save]
	mov esi, [esi_save]
	mov ebp, [ebp_save]
	vzeroupper ; end of AVX2 code
	ret ; FIX: was missing -- without it execution falls through into the next routine
end namespace ; clearcolor_avx2
;***************************************************************************************************
; minmax_avx2(ctx *) -- cdecl, single stack argument pointing at the ctx block below.
; Computes the component-wise min and max of the four f32 vertices v0..v3, scales each
; component by the (i32, converted to f32) minmax_scale vector, converts to i32, and
; stores the results over the v0 slot (min) and the v1 slot (max).
; NOTE: the outputs alias the inputs -- all four vertices are loaded before either store.
; Clobbers: eax, xmm0-xmm6, flags.
public minmax_avx2
align 16
minmax_avx2: namespace minmax_avx2
;---------------------------------------------------------------------------------------------------
; ctx block offsets: 4 vertices of 4 f32 components (plane x/y, tex s/t), then a scale vector
min := 0 ; out (i32 x4, aliases v0)
v0 := 0
v0_plane_x := 0
v0_plane_y := 1 * 4
v0_tex_s := 2 * 4
v0_tex_t := 3 * 4
max := 4 * 4 ; out (i32 x4, aliases v1)
v1 := 4 * 4 ; in
v1_plane_x := 4 * 4
v1_plane_y := 5 * 4
v1_tex_s := 6 * 4
v1_tex_t := 7 * 4
v2 := 8 * 4
v2_plane_x := 8 * 4
v2_plane_y := 9 * 4
v2_tex_s := 10 * 4
v2_tex_t := 11 * 4
v3 := 12 * 4
v3_plane_x := 12 * 4
v3_plane_y := 13 * 4
v3_tex_s := 14 * 4
v3_tex_t := 15 * 4
minmax_scale := 16 * 4 ; i32 x4
minmax_scale_x := 16 * 4
minmax_scale_y := 17 * 4
minmax_scale_s := 18 * 4
minmax_scale_t := 19 * 4
;---------------------------------------------------------------------------------------------------
; stack
;--- caller
define ret_addr esp + 4 * 0
define ctx esp + 4 * 1
;---------------------------------------------------------------------------------------------------
	mov eax, dword [ctx]
; load everything before storing: min/max outputs overwrite v0/v1
	vmovups xmm0, [eax + v0]
	vmovups xmm1, [eax + v1]
	vmovups xmm2, [eax + v2]
	vmovups xmm3, [eax + v3]
	vmovups xmm6, [eax + minmax_scale]
; min = round(min(v0..v3) * scale)
	vminps xmm5, xmm0, xmm1
	vminps xmm5, xmm5, xmm2
	vminps xmm5, xmm5, xmm3
	vcvtdq2ps xmm6, xmm6                ; scale: i32 -> f32
	vmulps xmm5, xmm5, xmm6
	vcvtps2dq xmm5, xmm5                ; f32 -> i32
	vmovups [eax + min], xmm5
; max = round(max(v0..v3) * scale)
	vmaxps xmm4, xmm0, xmm1
	vmaxps xmm4, xmm4, xmm2
	vmaxps xmm4, xmm4, xmm3
	vmulps xmm4, xmm4, xmm6
	vcvtps2dq xmm4, xmm4
	vmovups [eax + max], xmm4
	vzeroupper ; end of AVX2 code
	ret ; FIX: was missing -- without it execution falls through into the next routine
end namespace ; minmax_avx2
;***************************************************************************************************
; TODO: test if the steam client is actually using argb (the blue seems to go away)
; alphablend_rgba_avx2(input *) -- cdecl, single stack argument pointing at the input block below.
; Alpha-blends an RGBA src rectangle over a dst framebuffer, converting to BGRA byte order,
; forcing the output alpha to 0xff, 4 / 2 / 1 pixels at a time with AVX2.
; Blend (per channel): F = (F * (0xff - A) + T * A + rounding) >> 8, done on values
; translated to signed range (v - 0x80) so VPMADDUBSW can be used -- see the math note below.
; Clobbers: eax, ecx, edx, ymm0-ymm7, flags. ebx/edi/esi/ebp are saved and restored.
public alphablend_rgba_avx2
align 16
alphablend_rgba_avx2: namespace alphablend_rgba_avx2
;---------------------------------------------------------------------------------------------------
; input block offsets
dst := 0
dst_adjust_bytes_n := 4     ; end of dst row -> start of next dst row
src := 8
src_adjust_bytes_n := 12    ; end of src row -> start of next src row
width_pixs_n := 16
height_lines_n := 20
;---------------------------------------------------------------------------------------------------
; stack
; NOTE(review): register saves live BELOW esp without adjusting esp -- see glTexXImage2D_avx2.
;---- frame
define ebx_save esp + 4 * -4
define edi_save esp + 4 * -3
define esi_save esp + 4 * -2
define ebp_save esp + 4 * -1
;--- caller
define ret_addr esp + 4 * 0
define input esp + 4 * 1
;---------------------------------------------------------------------------------------------------
	mov [ebx_save], ebx
	mov [edi_save], edi
	mov [esi_save], esi
	mov [ebp_save], ebp
	mov ebx, dword [input]
	mov edi, dword [ebx + dst]
	mov esi, dword [ebx + src]
	mov edx, dword [ebx + height_lines_n]
; CONSTANTS -- START -----------------------------------------------------------------------
; ymm7: 0x00ff in every word (mask to compute 255 - A via xor)
	vpcmpeqb ymm7, ymm7, ymm7
	vpsrlw ymm7, ymm7, 8
; ymm6: 0xff000000 in every dword (forces output alpha byte to 0xff)
	vpcmpeqb ymm6, ymm6, ymm6
	vpsrld ymm6, ymm6, 24
	vpslld ymm6, ymm6, 24
	mov eax, 0x808080                   ; translation bias for b/g/r bytes, alpha untouched
	mov ecx, 0x807f807f                 ; per-word rounding/un-translation term (see math note)
	vmovd xmm5, eax
; ymm5: bytes 80 80 80 00 per pixel (subtract biases color channels, not alpha)
	vpbroadcastd ymm5, xmm5
	vmovd xmm4, ecx
; ymm4: 0x807f in every word
	vpbroadcastd ymm4, xmm4
; CONSTANTS -- END -------------------------------------------------------------------------
	align_nops
next_line:
	mov ecx, dword [ebx + width_pixs_n] ; ecx = pixels left on this line
	cmp ecx, 4
	jb blend_2pixs
	align_nops
blend_4pixs:
; load 4 framebuffer (f) and 4 texture (t) pixels; xmm load zeroes the high ymm lane
	vmovdqu xmm0, xword [edi]
	vmovdqu xmm1, xword [esi]
; translate to signed: byte = B - 0x80 for b/g/r, alpha left unsigned
; (needed so VPMADDUBSW's signed operand can hold the color values)
	vpsubb xmm0, xmm0, xmm5
	vpsubb xmm1, xmm1, xmm5
; spread pixels 0..1 into the low 128-bit lane and 2..3 into the high lane
	vpermq ymm0, ymm0, 10011000b
	vpermq ymm1, ymm1, 10011000b
; interleave f and t bytes: per pixel -> r_f r_t g_f g_t b_f b_t A_f A_t
	vpunpcklbw ymm0, ymm0, ymm1
; rgba -> bgra: swap the r and b word pairs in each pixel (low and high qwords of each lane)
	vpshuflw ymm0, ymm0, 11000110b
	vpshufhw ymm0, ymm0, 11000110b
; broadcast each pixel's texture alpha A_t to all 8 bytes of its qword:
; byte7 of each qword is A_t -> shift down, duplicate, then word-broadcast
	vpsrlq ymm1, ymm0, 56
	vpsllq ymm2, ymm1, 8
	vpor ymm1, ymm2, ymm1
	vpshuflw ymm1, ymm1, 0
	vpshufhw ymm1, ymm1, 0
; per word: (0xff - A_t, A_t) = bytes xor 0xff on the low byte only (ymm7 mask)
	vpxor ymm1, ymm1, ymm7
; integer alpha blend with 2^8 divisor instead of 0xff, on a 16-bit scale:
;   F = (F * (0xff - A) + T * A + rounding) >> 8
; with f = F - 0x80 and t = T - 0x80 the +0x807f per-word term (ymm4) folds the
; translation back in -- per word: (0xff-A)*f + A*t, unsigned(ymm1) * signed(ymm0)
	vpmaddubsw ymm0, ymm1, ymm0
	vpaddw ymm0, ymm0, ymm4
	vpsrlw ymm0, ymm0, 8
; pack the two 128-bit lanes of words back to 16 bytes (vpackuswb works per lane)
	vextracti128 xmm1, ymm0, 1
	vpackuswb xmm0, xmm0, xmm1
; force alpha bytes to 0xff
	vpor xmm0, xmm0, xmm6
	vmovdqu [edi], xmm0
	sub ecx, 4 ; pixs
	add edi, 4 * 4 ; bytes
	add esi, 4 * 4 ; bytes
	cmp ecx, 4
	jae blend_4pixs
	align_nops
blend_2pixs:
	cmp ecx, 2
	jb blend_1pixs
; same pipeline as blend_4pixs, on 2 pixels in one xmm (pixel1 in the high qword)
	vmovq xmm0, qword [edi] ; zero extended
	vmovq xmm1, qword [esi] ; zero extended
	vpsubb xmm0, xmm0, xmm5
	vpsubb xmm1, xmm1, xmm5
	vpunpcklbw xmm0, xmm0, xmm1
	vpshuflw xmm0, xmm0, 11000110b
	vpshufhw xmm0, xmm0, 11000110b ; FIX: was missing -- pixel 1 (high qword) never got its r/b swap
	vpsrlq xmm1, xmm0, 56
	vpsllq xmm2, xmm1, 8
	vpor xmm1, xmm2, xmm1
	vpshuflw xmm1, xmm1, 0
	vpshufhw xmm1, xmm1, 0
	vpxor xmm1, xmm1, xmm7
	vpmaddubsw xmm0, xmm1, xmm0 ; xmm1 is unsigned, xmm0 is signed
	vpaddw xmm0, xmm0, xmm4
	vpsrlw xmm0, xmm0, 8
	vpackuswb xmm0, xmm0, xmm0
	vpor xmm0, xmm0, xmm6
	vmovq qword [edi], xmm0
	sub ecx, 2 ; pixs
	add edi, 2 * 4 ; bytes
	add esi, 2 * 4 ; bytes
	align_nops
blend_1pixs:
	test ecx, ecx
	jz prepare_next_line
; same pipeline on a single pixel (low dword only, so vpshuflw alone suffices)
	vmovd xmm0, dword [edi] ; zero extended
	vmovd xmm1, dword [esi] ; zero extended
	vpsubb xmm0, xmm0, xmm5
	vpsubb xmm1, xmm1, xmm5
	vpunpcklbw xmm0, xmm0, xmm1
	vpshuflw xmm0, xmm0, 11000110b
	vpsrlq xmm1, xmm0, 56
	vpsllq xmm2, xmm1, 8
	vpor xmm1, xmm2, xmm1
	vpshuflw xmm1, xmm1, 0
	vpshufhw xmm1, xmm1, 0
	vpxor xmm1, xmm1, xmm7
	vpmaddubsw xmm0, xmm1, xmm0 ; xmm1 is unsigned, xmm0 is signed
	vpaddw xmm0, xmm0, xmm4
	vpsrlw xmm0, xmm0, 8
	vpackuswb xmm0, xmm0, xmm0
	vpor xmm0, xmm0, xmm6
	vmovd dword [edi], xmm0
	add edi, 4 ; bytes
	add esi, 4 ; bytes
	align_nops
prepare_next_line:
	add edi, dword [ebx + dst_adjust_bytes_n]
	add esi, dword [ebx + src_adjust_bytes_n]
	dec edx ; lines--
	jnz next_line
	align_nops
epilog:
	mov ebx, [ebx_save]
	mov edi, [edi_save]
	mov esi, [esi_save]
	mov ebp, [ebp_save]
	vzeroupper ; end of AVX2 code
	ret ; FIX: was missing -- without it execution falls through into the align_nops macro area
end namespace ; alphablend_rgba_avx2
;***************************************************************************************************
; align_nops: pad the current position to the next 16-byte boundary using the
; recommended multi-byte NOP encodings (0F 1F /0 forms), instead of plain 0x90 runs.
; The needed padding is measured inside a throw-away virtual block, then the matching
; byte sequence is emitted: a 9-byte NOP first for pads of 10..15, then a short tail.
macro align_nops
	local pad
	virtual
		align 16
		pad = $ - $$            ; bytes required to reach the boundary
	end virtual
	if pad = 1
		db 0x90
	else if pad = 2
		db 0x66, 0x90
	else if pad = 3
		db 0x0f, 0x1f, 0x00
	else if pad = 4
		db 0x0f, 0x1f, 0x40, 0x00
	else if pad = 5
		db 0x0f, 0x1f, 0x44, 0x00, 0x00
	else if pad = 6
		db 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00
	else if pad = 7
		db 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00
	else if pad = 8
		db 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
	else if pad = 9
		db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
	else if pad = 10
		db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
		db 0x90
	else if pad = 11
		db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
		db 0x66, 0x90
	else if pad = 12
		db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
		db 0x0f, 0x1f, 0x00
	else if pad = 13
		db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
		db 0x0f, 0x1f, 0x40, 0x00
	else if pad = 14
		db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
		db 0x0f, 0x1f, 0x44, 0x00, 0x00
	else if pad = 15
		db 0x66, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00
		db 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00
	end if
end macro