1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
5 ;* Nabajit Deka <nabajit@multicorewareinc.com>
6 ;*
7 ;* This program is free software; you can redistribute it and/or modify
8 ;* it under the terms of the GNU General Public License as published by
9 ;* the Free Software Foundation; either version 2 of the License, or
10 ;* (at your option) any later version.
11 ;*
12 ;* This program is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;* GNU General Public License for more details.
16 ;*
17 ;* You should have received a copy of the GNU General Public License
18 ;* along with this program; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 ;*
21 ;* This program is also available under a commercial proprietary license.
22 ;* For more information, contact us at license @ x265.com.
23 ;*****************************************************************************/
24
25 %include "x86inc.asm"
26 %include "x86util.asm"
27
28 SECTION_RODATA 32
29
30 %if BIT_DEPTH == 10
31 ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
32 ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
33 pf_64: times 4 dd 64.0
34 pf_128: times 4 dd 128.0
35 %elif BIT_DEPTH == 9
36 ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64
37 ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63
38 %else ; 8-bit
39 ssim_c1: times 4 dd 416 ; .01*.01*255*255*64
40 ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63
41 %endif
42 mask_ff: times 16 db 0xff
43 times 16 db 0
44 deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
45 deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
46 hmul_16p: times 16 db 1
47 times 8 db 1, -1
48 hmulw_16p: times 8 dw 1
49 times 4 dw 1, -1
50
51 trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
52
53 SECTION .text
54
55 cextern pw_1
56 cextern pb_1
57 cextern pw_00ff
58 cextern pw_2000
59 cextern pw_pixel_max
60 cextern pd_1
61 cextern pd_32767
62 cextern pd_n32768
63
64
65 ;-----------------------------------------------------------------------------
66 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
67 ;-----------------------------------------------------------------------------
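; A minimal C reference sketch of what the getResidual4/8/16/32 kernels below
; compute (not part of the build). "blockSize" is a hypothetical parameter that
; stands in for the 4/8/16/32 specializations, and pixel is assumed to be
; uint8_t (uint16_t for HIGH_BIT_DEPTH builds):
;
;   #include <stdint.h>
;   typedef uint8_t pixel;   /* uint16_t for HIGH_BIT_DEPTH builds */
;
;   static void getResidual_c(const pixel* fenc, const pixel* pred,
;                             int16_t* residual, intptr_t stride, int blockSize)
;   {
;       for (int y = 0; y < blockSize; y++)
;       {
;           for (int x = 0; x < blockSize; x++)
;               residual[x] = (int16_t)(fenc[x] - pred[x]);
;           fenc     += stride;
;           pred     += stride;
;           residual += stride;
;       }
;   }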
68 INIT_XMM sse2
69 %if HIGH_BIT_DEPTH
70 cglobal getResidual4, 4,4,4
71 add r3, r3
72
73 ; row 0-1
74 movh m0, [r0]
75 movh m1, [r0 + r3]
76 movh m2, [r1]
77 movh m3, [r1 + r3]
78 punpcklqdq m0, m1
79 punpcklqdq m2, m3
80 psubw m0, m2
81
82 movh [r2], m0
83 movhps [r2 + r3], m0
84 lea r0, [r0 + r3 * 2]
85 lea r1, [r1 + r3 * 2]
86 lea r2, [r2 + r3 * 2]
87
88 ; row 2-3
89 movh m0, [r0]
90 movh m1, [r0 + r3]
91 movh m2, [r1]
92 movh m3, [r1 + r3]
93 punpcklqdq m0, m1
94 punpcklqdq m2, m3
95 psubw m0, m2
96
97 movh [r2], m0
98 movhps [r2 + r3], m0
99 %else
100 cglobal getResidual4, 4,4,5
101 pxor m0, m0
102
103 ; row 0-1
104 movd m1, [r0]
105 movd m2, [r0 + r3]
106 movd m3, [r1]
107 movd m4, [r1 + r3]
108 punpckldq m1, m2
109 punpcklbw m1, m0
110 punpckldq m3, m4
111 punpcklbw m3, m0
112 psubw m1, m3
113 movh [r2], m1
114 movhps [r2 + r3 * 2], m1
115 lea r0, [r0 + r3 * 2]
116 lea r1, [r1 + r3 * 2]
117 lea r2, [r2 + r3 * 4]
118
119 ; row 2-3
120 movd m1, [r0]
121 movd m2, [r0 + r3]
122 movd m3, [r1]
123 movd m4, [r1 + r3]
124 punpckldq m1, m2
125 punpcklbw m1, m0
126 punpckldq m3, m4
127 punpcklbw m3, m0
128 psubw m1, m3
129 movh [r2], m1
130 movhps [r2 + r3 * 2], m1
131 %endif
132 RET
133
134
135 INIT_XMM sse2
136 %if HIGH_BIT_DEPTH
137 cglobal getResidual8, 4,4,4
138 add r3, r3
139
140 %assign x 0
141 %rep 8/2
142 ; row 0-1
143 movu m1, [r0]
144 movu m2, [r0 + r3]
145 movu m3, [r1]
146 movu m4, [r1 + r3]
147 psubw m1, m3
148 psubw m2, m4
149 movu [r2], m1
150 movu [r2 + r3], m2
151 %assign x x+1
152 %if (x != 4)
153 lea r0, [r0 + r3 * 2]
154 lea r1, [r1 + r3 * 2]
155 lea r2, [r2 + r3 * 2]
156 %endif
157 %endrep
158 %else
159 cglobal getResidual8, 4,4,5
160 pxor m0, m0
161
162 %assign x 0
163 %rep 8/2
164 ; row 0-1
165 movh m1, [r0]
166 movh m2, [r0 + r3]
167 movh m3, [r1]
168 movh m4, [r1 + r3]
169 punpcklbw m1, m0
170 punpcklbw m2, m0
171 punpcklbw m3, m0
172 punpcklbw m4, m0
173 psubw m1, m3
174 psubw m2, m4
175 movu [r2], m1
176 movu [r2 + r3 * 2], m2
177 %assign x x+1
178 %if (x != 4)
179 lea r0, [r0 + r3 * 2]
180 lea r1, [r1 + r3 * 2]
181 lea r2, [r2 + r3 * 4]
182 %endif
183 %endrep
184 %endif
185 RET
186
187 %if HIGH_BIT_DEPTH
188 INIT_XMM sse2
189 cglobal getResidual16, 4,5,6
190 add r3, r3
191 mov r4d, 16/4
192 .loop:
193 ; row 0-1
194 movu m0, [r0]
195 movu m1, [r0 + 16]
196 movu m2, [r0 + r3]
197 movu m3, [r0 + r3 + 16]
198 movu m4, [r1]
199 movu m5, [r1 + 16]
200 psubw m0, m4
201 psubw m1, m5
202 movu m4, [r1 + r3]
203 movu m5, [r1 + r3 + 16]
204 psubw m2, m4
205 psubw m3, m5
206 lea r0, [r0 + r3 * 2]
207 lea r1, [r1 + r3 * 2]
208
209 movu [r2], m0
210 movu [r2 + 16], m1
211 movu [r2 + r3], m2
212 movu [r2 + r3 + 16], m3
213 lea r2, [r2 + r3 * 2]
214
215 ; row 2-3
216 movu m0, [r0]
217 movu m1, [r0 + 16]
218 movu m2, [r0 + r3]
219 movu m3, [r0 + r3 + 16]
220 movu m4, [r1]
221 movu m5, [r1 + 16]
222 psubw m0, m4
223 psubw m1, m5
224 movu m4, [r1 + r3]
225 movu m5, [r1 + r3 + 16]
226 psubw m2, m4
227 psubw m3, m5
228
229 movu [r2], m0
230 movu [r2 + 16], m1
231 movu [r2 + r3], m2
232 movu [r2 + r3 + 16], m3
233
234 dec r4d
235
236 lea r0, [r0 + r3 * 2]
237 lea r1, [r1 + r3 * 2]
238 lea r2, [r2 + r3 * 2]
239
240 jnz .loop
241 %else
242
243 INIT_XMM sse4
244 cglobal getResidual16, 4,5,8
245 mov r4d, 16/4
246 pxor m0, m0
247 .loop:
248 ; row 0-1
249 movu m1, [r0]
250 movu m2, [r0 + r3]
251 movu m3, [r1]
252 movu m4, [r1 + r3]
253 pmovzxbw m5, m1
254 punpckhbw m1, m0
255 pmovzxbw m6, m2
256 punpckhbw m2, m0
257 pmovzxbw m7, m3
258 punpckhbw m3, m0
259 psubw m5, m7
260 psubw m1, m3
261 pmovzxbw m7, m4
262 punpckhbw m4, m0
263 psubw m6, m7
264 psubw m2, m4
265
266 movu [r2], m5
267 movu [r2 + 16], m1
268 movu [r2 + r3 * 2], m6
269 movu [r2 + r3 * 2 + 16], m2
270
271 lea r0, [r0 + r3 * 2]
272 lea r1, [r1 + r3 * 2]
273 lea r2, [r2 + r3 * 4]
274
275 ; row 2-3
276 movu m1, [r0]
277 movu m2, [r0 + r3]
278 movu m3, [r1]
279 movu m4, [r1 + r3]
280 pmovzxbw m5, m1
281 punpckhbw m1, m0
282 pmovzxbw m6, m2
283 punpckhbw m2, m0
284 pmovzxbw m7, m3
285 punpckhbw m3, m0
286 psubw m5, m7
287 psubw m1, m3
288 pmovzxbw m7, m4
289 punpckhbw m4, m0
290 psubw m6, m7
291 psubw m2, m4
292
293 movu [r2], m5
294 movu [r2 + 16], m1
295 movu [r2 + r3 * 2], m6
296 movu [r2 + r3 * 2 + 16], m2
297
298 dec r4d
299
300 lea r0, [r0 + r3 * 2]
301 lea r1, [r1 + r3 * 2]
302 lea r2, [r2 + r3 * 4]
303
304 jnz .loop
305 %endif
306
307 RET
308
309 %if HIGH_BIT_DEPTH
310 INIT_XMM sse2
311 cglobal getResidual32, 4,5,6
312 add r3, r3
313 mov r4d, 32/2
314 .loop:
315 ; row 0
316 movu m0, [r0]
317 movu m1, [r0 + 16]
318 movu m2, [r0 + 32]
319 movu m3, [r0 + 48]
320 movu m4, [r1]
321 movu m5, [r1 + 16]
322 psubw m0, m4
323 psubw m1, m5
324 movu m4, [r1 + 32]
325 movu m5, [r1 + 48]
326 psubw m2, m4
327 psubw m3, m5
328
329 movu [r2], m0
330 movu [r2 + 16], m1
331 movu [r2 + 32], m2
332 movu [r2 + 48], m3
333
334 ; row 1
335 movu m0, [r0 + r3]
336 movu m1, [r0 + r3 + 16]
337 movu m2, [r0 + r3 + 32]
338 movu m3, [r0 + r3 + 48]
339 movu m4, [r1 + r3]
340 movu m5, [r1 + r3 + 16]
341 psubw m0, m4
342 psubw m1, m5
343 movu m4, [r1 + r3 + 32]
344 movu m5, [r1 + r3 + 48]
345 psubw m2, m4
346 psubw m3, m5
347
348 movu [r2 + r3], m0
349 movu [r2 + r3 + 16], m1
350 movu [r2 + r3 + 32], m2
351 movu [r2 + r3 + 48], m3
352
353 dec r4d
354
355 lea r0, [r0 + r3 * 2]
356 lea r1, [r1 + r3 * 2]
357 lea r2, [r2 + r3 * 2]
358
359 jnz .loop
360
361 %else
362 INIT_XMM sse4
363 cglobal getResidual32, 4,5,7
364 mov r4d, 32/2
365 pxor m0, m0
366 .loop:
367 movu m1, [r0]
368 movu m2, [r0 + 16]
369 movu m3, [r1]
370 movu m4, [r1 + 16]
371 pmovzxbw m5, m1
372 punpckhbw m1, m0
373 pmovzxbw m6, m3
374 punpckhbw m3, m0
375 psubw m5, m6
376 psubw m1, m3
377 movu [r2 + 0 * 16], m5
378 movu [r2 + 1 * 16], m1
379
380 pmovzxbw m5, m2
381 punpckhbw m2, m0
382 pmovzxbw m6, m4
383 punpckhbw m4, m0
384 psubw m5, m6
385 psubw m2, m4
386 movu [r2 + 2 * 16], m5
387 movu [r2 + 3 * 16], m2
388
389 movu m1, [r0 + r3]
390 movu m2, [r0 + r3 + 16]
391 movu m3, [r1 + r3]
392 movu m4, [r1 + r3 + 16]
393 pmovzxbw m5, m1
394 punpckhbw m1, m0
395 pmovzxbw m6, m3
396 punpckhbw m3, m0
397 psubw m5, m6
398 psubw m1, m3
399 movu [r2 + r3 * 2 + 0 * 16], m5
400 movu [r2 + r3 * 2 + 1 * 16], m1
401
402 pmovzxbw m5, m2
403 punpckhbw m2, m0
404 pmovzxbw m6, m4
405 punpckhbw m4, m0
406 psubw m5, m6
407 psubw m2, m4
408 movu [r2 + r3 * 2 + 2 * 16], m5
409 movu [r2 + r3 * 2 + 3 * 16], m2
410
411 dec r4d
412
413 lea r0, [r0 + r3 * 2]
414 lea r1, [r1 + r3 * 2]
415 lea r2, [r2 + r3 * 4]
416
417 jnz .loop
418 %endif
419 RET
420
421
422 ;-----------------------------------------------------------------------------
423 ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
424 ;-----------------------------------------------------------------------------
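; A minimal C reference sketch of the quant kernels below (not part of the
; build). It mirrors the arithmetic of the SSE4/AVX2 code, including the deltaU
; term and the int16_t saturation of qCoef; the clamp is written out inline
; rather than using x265's Clip3 helper:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff,
;                           int32_t* deltaU, int16_t* qCoef,
;                           int qBits, int add, int numCoeff)
;   {
;       const int qBits8 = qBits - 8;
;       uint32_t numSig = 0;
;       for (int i = 0; i < numCoeff; i++)
;       {
;           int level    = coef[i];
;           int tmpLevel = abs(level) * quantCoeff[i];
;           int level1   = (tmpLevel + add) >> qBits;
;           deltaU[i]    = (tmpLevel >> qBits8) - (level1 << 8);
;           numSig      += (level1 != 0);
;           level1       = (level < 0) ? -level1 : level1;
;           if (level1 >  32767) level1 =  32767;
;           if (level1 < -32768) level1 = -32768;
;           qCoef[i]     = (int16_t)level1;
;       }
;       return numSig;
;   }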
425 INIT_XMM sse4
426 cglobal quant, 5,6,8
427 ; fill qbits
428 movd m4, r4d ; m4 = qbits
429
430 ; fill qbits-8
431 sub r4d, 8
432 movd m6, r4d ; m6 = qbits8
433
434 ; fill offset
435 movd m5, r5m
436 pshufd m5, m5, 0 ; m5 = add
437
438 lea r5, [pd_1]
439
440 mov r4d, r6m
441 shr r4d, 3
442 pxor m7, m7 ; m7 = numZero
443 .loop:
444 ; 4 coeff
445 pmovsxwd m0, [r0] ; m0 = level
446 pabsd m1, m0
447     pmulld      m1, [r1]                ; m1 = tmpLevel1
448 paddd m2, m1, m5
449 psrad m2, m4 ; m2 = level1
450
451 pslld m3, m2, 8
452 psrad m1, m6
453 psubd m1, m3 ; m1 = deltaU1
454
455 movu [r2], m1
456 psignd m3, m2, m0
457 pminud m2, [r5]
458 paddd m7, m2
459 packssdw m3, m3
460 movh [r3], m3
461
462 ; 4 coeff
463 pmovsxwd m0, [r0 + 8] ; m0 = level
464 pabsd m1, m0
465     pmulld      m1, [r1 + 16]           ; m1 = tmpLevel1
466 paddd m2, m1, m5
467 psrad m2, m4 ; m2 = level1
468 pslld m3, m2, 8
469 psrad m1, m6
470 psubd m1, m3 ; m1 = deltaU1
471 movu [r2 + 16], m1
472 psignd m3, m2, m0
473 pminud m2, [r5]
474 paddd m7, m2
475 packssdw m3, m3
476 movh [r3 + 8], m3
477
478 add r0, 16
479 add r1, 32
480 add r2, 32
481 add r3, 16
482
483 dec r4d
484 jnz .loop
485
486 pxor m0, m0
487 psadbw m7, m0
488 movhlps m0, m7
489 paddd m7, m0
490 movd eax, m7
491 RET
492
493
494 IACA_START
495 %if ARCH_X86_64 == 1
496 INIT_YMM avx2
497 cglobal quant, 5,5,10
498 ; fill qbits
499 movd xm4, r4d ; m4 = qbits
500
501 ; fill qbits-8
502 sub r4d, 8
503 movd xm6, r4d ; m6 = qbits8
504
505 ; fill offset
506 vpbroadcastd m5, r5m ; m5 = add
507
508 vpbroadcastw m9, [pw_1] ; m9 = word [1]
509
510 mov r4d, r6m
511 shr r4d, 4
512 pxor m7, m7 ; m7 = numZero
513 .loop:
514 ; 8 coeff
515 pmovsxwd m0, [r0] ; m0 = level
516 pabsd m1, m0
517     pmulld      m1, [r1]                ; m1 = tmpLevel1
518 paddd m2, m1, m5
519 psrad m2, xm4 ; m2 = level1
520
521 pslld m3, m2, 8
522 psrad m1, xm6
523 psubd m1, m3 ; m1 = deltaU1
524 movu [r2], m1
525 psignd m2, m0
526
527 ; 8 coeff
528 pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
529 pabsd m1, m0
530     pmulld      m1, [r1 + mmsize]       ; m1 = tmpLevel1
531     paddd       m3, m1, m5
532     psrad       m3, xm4                 ; m3 = level1
533
534 pslld m8, m3, 8
535 psrad m1, xm6
536 psubd m1, m8 ; m1 = deltaU1
537 movu [r2 + mmsize], m1
538 psignd m3, m0
539
540 packssdw m2, m3
541 vpermq m2, m2, q3120
542 movu [r3], m2
543
544 ; count non-zero coeff
545     ; TODO: popcnt is faster, but some CPUs don't support it
546 pminuw m2, m9
547 paddw m7, m2
548
549 add r0, mmsize
550 add r1, mmsize*2
551 add r2, mmsize*2
552 add r3, mmsize
553
554 dec r4d
555 jnz .loop
556
557 ; sum count
558 xorpd m0, m0
559 psadbw m7, m0
560 vextracti128 xm1, m7, 1
561 paddd xm7, xm1
562 movhlps xm0, xm7
563 paddd xm7, xm0
564 movd eax, xm7
565 RET
566
567 %else ; ARCH_X86_64 == 1
568 INIT_YMM avx2
569 cglobal quant, 5,6,8
570 ; fill qbits
571 movd xm4, r4d ; m4 = qbits
572
573 ; fill qbits-8
574 sub r4d, 8
575 movd xm6, r4d ; m6 = qbits8
576
577 ; fill offset
578     vpbroadcastd m5, r5m                ; m5 = add
579
580 lea r5, [pd_1]
581
582 mov r4d, r6m
583 shr r4d, 4
584 pxor m7, m7 ; m7 = numZero
585 .loop:
586 ; 8 coeff
587 pmovsxwd m0, [r0] ; m0 = level
588 pabsd m1, m0
589     pmulld      m1, [r1]                ; m1 = tmpLevel1
590 paddd m2, m1, m5
591 psrad m2, xm4 ; m2 = level1
592
593 pslld m3, m2, 8
594 psrad m1, xm6
595 psubd m1, m3 ; m1 = deltaU1
596
597 movu [r2], m1
598 psignd m3, m2, m0
599 pminud m2, [r5]
600 paddd m7, m2
601 packssdw m3, m3
602 vpermq m3, m3, q0020
603 movu [r3], xm3
604
605 ; 8 coeff
606 pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
607 pabsd m1, m0
608     pmulld      m1, [r1 + mmsize]       ; m1 = tmpLevel1
609 paddd m2, m1, m5
610 psrad m2, xm4 ; m2 = level1
611
612 pslld m3, m2, 8
613 psrad m1, xm6
614 psubd m1, m3 ; m1 = deltaU1
615
616 movu [r2 + mmsize], m1
617 psignd m3, m2, m0
618 pminud m2, [r5]
619 paddd m7, m2
620 packssdw m3, m3
621 vpermq m3, m3, q0020
622 movu [r3 + mmsize/2], xm3
623
624 add r0, mmsize
625 add r1, mmsize*2
626 add r2, mmsize*2
627 add r3, mmsize
628
629 dec r4d
630 jnz .loop
631
632 xorpd m0, m0
633 psadbw m7, m0
634 vextracti128 xm1, m7, 1
635 paddd xm7, xm1
636 movhlps xm0, xm7
637 paddd xm7, xm0
638 movd eax, xm7
639 RET
640 %endif ; ARCH_X86_64 == 1
641 IACA_END
642
643
644 ;-----------------------------------------------------------------------------
645 ; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
646 ;-----------------------------------------------------------------------------
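; A minimal C reference sketch of the nquant kernels below (not part of the
; build). nquant is the "no deltaU" variant of quant above; it counts the
; coefficients that remain non-zero after the int16_t saturation:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;
;   static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff,
;                            int16_t* qCoef, int qBits, int add, int numCoeff)
;   {
;       uint32_t numSig = 0;
;       for (int i = 0; i < numCoeff; i++)
;       {
;           int level = (abs(coef[i]) * quantCoeff[i] + add) >> qBits;
;           level     = (coef[i] < 0) ? -level : level;
;           if (level >  32767) level =  32767;
;           if (level < -32768) level = -32768;
;           qCoef[i]  = (int16_t)level;
;           numSig   += (qCoef[i] != 0);
;       }
;       return numSig;
;   }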
647 INIT_XMM sse4
648 cglobal nquant, 3,5,8
649 movd m6, r4m
650 mov r4d, r5m
651 pxor m7, m7 ; m7 = numZero
652 movd m5, r3m ; m5 = qbits
653 pshufd m6, m6, 0 ; m6 = add
654 mov r3d, r4d ; r3 = numCoeff
655 shr r4d, 3
656
657 .loop:
658 pmovsxwd m0, [r0] ; m0 = level
659 pmovsxwd m1, [r0 + 8] ; m1 = level
660
661 pabsd m2, m0
662     pmulld      m2, [r1]                ; m2 = tmpLevel1 * qcoeff
663 paddd m2, m6
664     psrad       m2, m5                  ; m2 = level1
665 psignd m2, m0
666
667 pabsd m3, m1
668     pmulld      m3, [r1 + 16]           ; m3 = tmpLevel1 * qcoeff
669 paddd m3, m6
670     psrad       m3, m5                  ; m3 = level1
671 psignd m3, m1
672
673 packssdw m2, m3
674
675 movu [r2], m2
676 add r0, 16
677 add r1, 32
678 add r2, 16
679
680 pxor m4, m4
681 pcmpeqw m2, m4
682 psubw m7, m2
683
684 dec r4d
685 jnz .loop
686
687 packuswb m7, m7
688 psadbw m7, m4
689 mov eax, r3d
690 movd r4d, m7
691 sub eax, r4d ; numSig
692 RET
693
694
695 INIT_YMM avx2
696 cglobal nquant, 3,5,7
697 vpbroadcastd m4, r4m
698 vpbroadcastd m6, [pw_1]
699 mov r4d, r5m
700     pxor        m5, m5                  ; m5 = numZero
701     movd        xm3, r3m                ; m3 = qbits
702 mov r3d, r4d ; r3 = numCoeff
703 shr r4d, 4
704
705 .loop:
706 pmovsxwd m0, [r0] ; m0 = level
707 pabsd m1, m0
708     pmulld      m1, [r1]                ; m1 = tmpLevel1 * qcoeff
709     paddd       m1, m4
710     psrad       m1, xm3                 ; m1 = level1
711 psignd m1, m0
712
713 pmovsxwd m0, [r0 + mmsize/2] ; m0 = level
714 pabsd m2, m0
715     pmulld      m2, [r1 + mmsize]       ; m2 = tmpLevel1 * qcoeff
716     paddd       m2, m4
717     psrad       m2, xm3                 ; m2 = level1
718 psignd m2, m0
719
720 packssdw m1, m2
721 vpermq m2, m1, q3120
722
723 movu [r2], m2
724 add r0, mmsize
725 add r1, mmsize * 2
726 add r2, mmsize
727
728 pminuw m1, m6
729 paddw m5, m1
730
731 dec r4d
732 jnz .loop
733
734 pxor m0, m0
735 psadbw m5, m0
736 vextracti128 xm0, m5, 1
737 paddd xm5, xm0
738 pshufd xm0, xm5, 2
739 paddd xm5, xm0
740 movd eax, xm5
741 RET
742
743
744 ;-----------------------------------------------------------------------------
745 ; void dequant_normal(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
746 ;-----------------------------------------------------------------------------
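; A minimal C reference sketch of dequant_normal (not part of the build). The
; assembly folds add = 1 << (shift - 1) into the high word of the pmaddwd
; multiplier via the bts trick below, and the HIGH_BIT_DEPTH path pre-scales
; (scale >>= 2, shift -= 2) when scale would not fit in a signed 16-bit word:
;
;   #include <stdint.h>
;
;   static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef,
;                                int num, int scale, int shift)
;   {
;       const int add = 1 << (shift - 1);
;       for (int n = 0; n < num; n++)
;       {
;           int c = (quantCoef[n] * scale + add) >> shift;
;           if (c >  32767) c =  32767;
;           if (c < -32768) c = -32768;
;           coef[n] = (int16_t)c;
;       }
;   }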
747 INIT_XMM sse4
748 cglobal dequant_normal, 5,5,5
749 mova m2, [pw_1]
750 %if HIGH_BIT_DEPTH
751 cmp r3d, 32767
752 jle .skip
753 shr r3d, 2
754 sub r4d, 2
755 .skip:
756 %endif
757 movd m0, r4d ; m0 = shift
758 add r4d, 15
759 bts r3d, r4d
760 movd m1, r3d
761 pshufd m1, m1, 0 ; m1 = dword [add scale]
762 ; m0 = shift
763 ; m1 = scale
764 ; m2 = word [1]
765 .loop:
766 movu m3, [r0]
767 punpckhwd m4, m3, m2
768 punpcklwd m3, m2
769 pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
770 pmaddwd m4, m1
771 psrad m3, m0
772 psrad m4, m0
773 packssdw m3, m4
774 mova [r1], m3
775
776 add r0, 16
777 add r1, 16
778
779 sub r2d, 8
780 jnz .loop
781 RET
782
783
784 INIT_YMM avx2
785 cglobal dequant_normal, 5,5,7
786 vpbroadcastd m2, [pw_1] ; m2 = word [1]
787 vpbroadcastd m5, [pd_32767] ; m5 = dword [32767]
788 vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768]
789 %if HIGH_BIT_DEPTH
790 cmp r3d, 32767
791 jle .skip
792 shr r3d, 2
793 sub r4d, 2
794 .skip:
795 %endif
796 movd xm0, r4d ; m0 = shift
797 add r4d, -1+16
798 bts r3d, r4d
799 vpbroadcastd m1, r3d ; m1 = dword [add scale]
800
801 ; m0 = shift
802 ; m1 = scale
803 ; m2 = word [1]
804 shr r2d, 4
805 .loop:
806 movu m3, [r0]
807 punpckhwd m4, m3, m2
808 punpcklwd m3, m2
809 pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
810 pmaddwd m4, m1
811 psrad m3, xm0
812 psrad m4, xm0
813 pminsd m3, m5
814 pmaxsd m3, m6
815 pminsd m4, m5
816 pmaxsd m4, m6
817 packssdw m3, m4
818 mova [r1 + 0 * mmsize/2], xm3
819 vextracti128 [r1 + 1 * mmsize/2], m3, 1
820
821 add r0, mmsize
822 add r1, mmsize
823
824 dec r2d
825 jnz .loop
826 RET
827
828
829 ;-----------------------------------------------------------------------------
830 ; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
831 ;-----------------------------------------------------------------------------
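; A minimal C reference sketch of count_nonzero (not part of the build). The
; SSSE3 code below reaches the same result by seeding every byte lane with
; numCoeff/16 and subtracting one for each zero coefficient that lane sees,
; then summing the lanes with psadbw:
;
;   #include <stdint.h>
;
;   static int count_nonzero_c(const int16_t* quantCoeff, int numCoeff)
;   {
;       int count = 0;
;       for (int i = 0; i < numCoeff; i++)
;           count += (quantCoeff[i] != 0);
;       return count;
;   }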
832 INIT_XMM ssse3
833 cglobal count_nonzero, 2,2,3
834 pxor m0, m0
835 shr r1d, 4
836 movd m1, r1d
837 pshufb m1, m0
838
839 .loop:
840 mova m2, [r0 + 0]
841 packsswb m2, [r0 + 16]
842 add r0, 32
843 pcmpeqb m2, m0
844 paddb m1, m2
845 dec r1d
846 jnz .loop
847
848 psadbw m1, m0
849 pshufd m0, m1, 2
850 paddd m0, m1
851 movd eax, m0
852 RET
853
854
855 ;-----------------------------------------------------------------------------------------------------------------------------------------------
856 ;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
857 ;-----------------------------------------------------------------------------------------------------------------------------------------------
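; A minimal C reference sketch of the 8-bit weight_pp path below (not part of
; the build). The (w0 << 6) mirrors the asm, which pre-scales the weight by 64
; (equivalent to upshifting the 8-bit source by 6 bits before weighting);
; stride applies to both src and dst, and width is assumed to be a multiple of
; 16, as the vector loop requires:
;
;   #include <stdint.h>
;   typedef uint8_t pixel;
;
;   static void weight_pp_c(const pixel* src, pixel* dst, intptr_t stride,
;                           int width, int height, int w0, int round,
;                           int shift, int offset)
;   {
;       for (int y = 0; y < height; y++)
;       {
;           for (int x = 0; x < width; x++)
;           {
;               int v = ((src[x] * (w0 << 6) + round) >> shift) + offset;
;               dst[x] = (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
;           }
;           src += stride;
;           dst += stride;
;       }
;   }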
858 INIT_XMM sse4
859 cglobal weight_pp, 6, 7, 6
860
861 shl r5d, 6 ; m0 = [w0<<6]
862 mov r6d, r6m
863 shl r6d, 16
864     or          r6d, r5d                ; assumes both (w0<<6) and round fit in 16 bits each
865 movd m0, r6d
866 pshufd m0, m0, 0 ; m0 = [w0<<6, round]
867 movd m1, r7m
868 movd m2, r8m
869 pshufd m2, m2, 0
870 mova m5, [pw_1]
871 sub r2d, r3d
872 shr r3d, 4
873
874 .loopH:
875 mov r5d, r3d
876
877 .loopW:
878 pmovzxbw m4, [r0]
879 punpcklwd m3, m4, m5
880 pmaddwd m3, m0
881 psrad m3, m1
882 paddd m3, m2
883
884 punpckhwd m4, m5
885 pmaddwd m4, m0
886 psrad m4, m1
887 paddd m4, m2
888
889 packssdw m3, m4
890 packuswb m3, m3
891 movh [r1], m3
892
893 pmovzxbw m4, [r0 + 8]
894 punpcklwd m3, m4, m5
895 pmaddwd m3, m0
896 psrad m3, m1
897 paddd m3, m2
898
899 punpckhwd m4, m5
900 pmaddwd m4, m0
901 psrad m4, m1
902 paddd m4, m2
903
904 packssdw m3, m4
905 packuswb m3, m3
906 movh [r1 + 8], m3
907
908 add r0, 16
909 add r1, 16
910
911 dec r5d
912 jnz .loopW
913
914 lea r0, [r0 + r2]
915 lea r1, [r1 + r2]
916
917 dec r4d
918 jnz .loopH
919 RET
920
921
922 INIT_YMM avx2
923 cglobal weight_pp, 6, 7, 6
924
925 shl r5d, 6 ; m0 = [w0<<6]
926 mov r6d, r6m
927 shl r6d, 16
928     or          r6d, r5d                ; assumes both (w0<<6) and round fit in 16 bits each
929 movd xm0, r6d
930 pshufd xm0, xm0, 0 ; m0 = [w0<<6, round]
931     vinserti128  m0, m0, xm0, 1         ; the docs say (pshufd + vinserti128) can be replaced by vpbroadcastd m0, xm0, but that form currently fails to build; needs investigation
932
933 movd xm1, r7m
934 vpbroadcastd m2, r8m
935 mova m5, [pw_1]
936 sub r2d, r3d
937 shr r3d, 4
938
939 .loopH:
940 mov r5d, r3d
941
942 .loopW:
943 pmovzxbw m4, [r0]
944 punpcklwd m3, m4, m5
945 pmaddwd m3, m0
946 psrad m3, xm1
947 paddd m3, m2
948
949 punpckhwd m4, m5
950 pmaddwd m4, m0
951 psrad m4, xm1
952 paddd m4, m2
953
954 packssdw m3, m4
955 vextracti128 xm4, m3, 1
956 packuswb xm3, xm4
957 movu [r1], xm3
958
959 add r0, 16
960 add r1, 16
961
962 dec r5d
963 jnz .loopW
964
965 lea r0, [r0 + r2]
966 lea r1, [r1 + r2]
967
968 dec r4d
969 jnz .loopH
970 RET
971
972 ;-------------------------------------------------------------------------------------------------------------------------------------------------
973 ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
974 ;-------------------------------------------------------------------------------------------------------------------------------------------------
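; A minimal C reference sketch of the 8-bit weight_sp path below (not part of
; the build). src holds int16_t intermediate samples which the asm biases by
; pw_2000 (0x2000 = 8192) before weighting; this scalar loop handles any width,
; while the asm stores 8-pixel groups plus 4/2-pixel tails (.width4/.width2):
;
;   #include <stdint.h>
;   typedef uint8_t pixel;
;
;   static void weight_sp_c(const int16_t* src, pixel* dst, intptr_t srcStride,
;                           intptr_t dstStride, int width, int height,
;                           int w0, int round, int shift, int offset)
;   {
;       for (int y = 0; y < height; y++)
;       {
;           for (int x = 0; x < width; x++)
;           {
;               int v = (((src[x] + 8192) * w0 + round) >> shift) + offset;
;               dst[x] = (pixel)(v < 0 ? 0 : (v > 255 ? 255 : v));
;           }
;           src += srcStride;
;           dst += dstStride;
;       }
;   }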
975 INIT_XMM sse4
976 %if ARCH_X86_64
977 cglobal weight_sp, 6, 7+2, 7
978 %define tmp_r0 r7
979 %define tmp_r1 r8
980 %else ; ARCH_X86_64 = 0
981 cglobal weight_sp, 6, 7, 7, 0-(2*4)
982 %define tmp_r0 [(rsp + 0 * 4)]
983 %define tmp_r1 [(rsp + 1 * 4)]
984 %endif ; ARCH_X86_64
985
986 movd m0, r6m ; m0 = [w0]
987
988 movd m1, r7m ; m1 = [round]
989 punpcklwd m0, m1
990 pshufd m0, m0, 0 ; m0 = [w0 round]
991
992 movd m1, r8m ; m1 = [shift]
993
994 movd m2, r9m
995 pshufd m2, m2, 0 ; m2 =[offset]
996
997 mova m3, [pw_1]
998 mova m4, [pw_2000]
999
1000 add r2d, r2d
1001
1002 .loopH:
1003 mov r6d, r4d
1004
1005 ; save old src and dst
1006 mov tmp_r0, r0
1007 mov tmp_r1, r1
1008 .loopW:
1009 movu m5, [r0]
1010 paddw m5, m4
1011
1012 punpcklwd m6,m5, m3
1013 pmaddwd m6, m0
1014 psrad m6, m1
1015 paddd m6, m2
1016
1017 punpckhwd m5, m3
1018 pmaddwd m5, m0
1019 psrad m5, m1
1020 paddd m5, m2
1021
1022 packssdw m6, m5
1023 packuswb m6, m6
1024
1025 sub r6d, 8
1026 jl .width4
1027 movh [r1], m6
1028 je .nextH
1029 add r0, 16
1030 add r1, 8
1031
1032 jmp .loopW
1033
1034 .width4:
1035 cmp r6d, -4
1036 jl .width2
1037 movd [r1], m6
1038 je .nextH
1039 add r1, 4
1040 pshufd m6, m6, 1
1041
1042 .width2:
1043 pextrw [r1], m6, 0
1044
1045 .nextH:
1046 mov r0, tmp_r0
1047 mov r1, tmp_r1
1048 lea r0, [r0 + r2]
1049 lea r1, [r1 + r3]
1050
1051 dec r5d
1052 jnz .loopH
1053
1054 RET
1055
1056 ;-----------------------------------------------------------------
1057 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
1058 ;-----------------------------------------------------------------
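; A minimal C reference sketch shared by all of the transposeN kernels below
; (not part of the build); dst is written as a dense blockSize x blockSize
; array while src is read with the given stride, and "blockSize" is a
; hypothetical parameter standing in for the 4/8/16/32/64 specializations:
;
;   #include <stdint.h>
;   typedef uint8_t pixel;   /* uint16_t for HIGH_BIT_DEPTH builds */
;
;   static void transpose_c(pixel* dst, const pixel* src,
;                           intptr_t stride, int blockSize)
;   {
;       for (int i = 0; i < blockSize; i++)
;           for (int j = 0; j < blockSize; j++)
;               dst[j * blockSize + i] = src[i * stride + j];
;   }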
1059 INIT_XMM sse2
1060 cglobal transpose4, 3, 3, 4, dest, src, stride
1061 %if HIGH_BIT_DEPTH == 1
1062 add r2, r2
1063 movh m0, [r1]
1064 movh m1, [r1 + r2]
1065 movh m2, [r1 + 2 * r2]
1066 lea r1, [r1 + 2 * r2]
1067 movh m3, [r1 + r2]
1068 punpcklwd m0, m1
1069 punpcklwd m2, m3
1070 punpckhdq m1, m0, m2
1071 punpckldq m0, m2
1072 movu [r0], m0
1073 movu [r0 + 16], m1
1074 %else ;HIGH_BIT_DEPTH == 0
1075 movd m0, [r1]
1076 movd m1, [r1 + r2]
1077 movd m2, [r1 + 2 * r2]
1078 lea r1, [r1 + 2 * r2]
1079 movd m3, [r1 + r2]
1080
1081 punpcklbw m0, m1
1082 punpcklbw m2, m3
1083 punpcklwd m0, m2
1084 movu [r0], m0
1085 %endif
1086 RET
1087
1088 ;-----------------------------------------------------------------
1089 ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
1090 ;-----------------------------------------------------------------
1091 %if HIGH_BIT_DEPTH == 1
1092 %if ARCH_X86_64 == 1
1093 INIT_YMM avx2
1094 cglobal transpose8, 3, 5, 5
1095 add r2, r2
1096 lea r3, [3 * r2]
1097 lea r4, [r1 + 4 * r2]
1098 movu xm0, [r1]
1099 vinserti128 m0, m0, [r4], 1
1100 movu xm1, [r1 + r2]
1101 vinserti128 m1, m1, [r4 + r2], 1
1102 movu xm2, [r1 + 2 * r2]
1103 vinserti128 m2, m2, [r4 + 2 * r2], 1
1104 movu xm3, [r1 + r3]
1105 vinserti128 m3, m3, [r4 + r3], 1
1106
1107 punpcklwd m4, m0, m1 ;[1 - 4][row1row2;row5row6]
1108 punpckhwd m0, m1 ;[5 - 8][row1row2;row5row6]
1109
1110 punpcklwd m1, m2, m3 ;[1 - 4][row3row4;row7row8]
1111 punpckhwd m2, m3 ;[5 - 8][row3row4;row7row8]
1112
1113 punpckldq m3, m4, m1 ;[1 - 2][row1row2row3row4;row5row6row7row8]
1114 punpckhdq m4, m1 ;[3 - 4][row1row2row3row4;row5row6row7row8]
1115
1116 punpckldq m1, m0, m2 ;[5 - 6][row1row2row3row4;row5row6row7row8]
1117 punpckhdq m0, m2 ;[7 - 8][row1row2row3row4;row5row6row7row8]
1118
1119 vpermq m3, m3, 0xD8 ;[1 ; 2][row1row2row3row4row5row6row7row8]
1120 vpermq m4, m4, 0xD8 ;[3 ; 4][row1row2row3row4row5row6row7row8]
1121 vpermq m1, m1, 0xD8 ;[5 ; 6][row1row2row3row4row5row6row7row8]
1122 vpermq m0, m0, 0xD8 ;[7 ; 8][row1row2row3row4row5row6row7row8]
1123
1124 movu [r0 + 0 * 32], m3
1125 movu [r0 + 1 * 32], m4
1126 movu [r0 + 2 * 32], m1
1127 movu [r0 + 3 * 32], m0
1128 RET
1129 %endif
1130
1131 INIT_XMM sse2
1132 %macro TRANSPOSE_4x4 1
1133 movh m0, [r1]
1134 movh m1, [r1 + r2]
1135 movh m2, [r1 + 2 * r2]
1136 lea r1, [r1 + 2 * r2]
1137 movh m3, [r1 + r2]
1138 punpcklwd m0, m1
1139 punpcklwd m2, m3
1140 punpckhdq m1, m0, m2
1141 punpckldq m0, m2
1142 movh [r0], m0
1143 movhps [r0 + %1], m0
1144 movh [r0 + 2 * %1], m1
1145 lea r0, [r0 + 2 * %1]
1146 movhps [r0 + %1], m1
1147 %endmacro
1148 cglobal transpose8_internal
1149 TRANSPOSE_4x4 r5
1150 lea r1, [r1 + 2 * r2]
1151 lea r0, [r3 + 8]
1152 TRANSPOSE_4x4 r5
1153 lea r1, [r1 + 2 * r2]
1154 neg r2
1155 lea r1, [r1 + r2 * 8 + 8]
1156 neg r2
1157 lea r0, [r3 + 4 * r5]
1158 TRANSPOSE_4x4 r5
1159 lea r1, [r1 + 2 * r2]
1160 lea r0, [r3 + 8 + 4 * r5]
1161 TRANSPOSE_4x4 r5
1162 ret
1163 cglobal transpose8, 3, 6, 4, dest, src, stride
1164 add r2, r2
1165 mov r3, r0
1166 mov r5, 16
1167 call transpose8_internal
1168 RET
1169 %else ;HIGH_BIT_DEPTH == 0
1170 %if ARCH_X86_64 == 1
1171 INIT_YMM avx2
1172 cglobal transpose8, 3, 4, 4
1173 lea r3, [r2 * 3]
1174 movq xm0, [r1]
1175 movhps xm0, [r1 + 2 * r2]
1176 movq xm1, [r1 + r2]
1177 movhps xm1, [r1 + r3]
1178 lea r1, [r1 + 4 * r2]
1179 movq xm2, [r1]
1180 movhps xm2, [r1 + 2 * r2]
1181 movq xm3, [r1 + r2]
1182 movhps xm3, [r1 + r3]
1183
1184 vinserti128 m0, m0, xm2, 1 ;[row1 row3 row5 row7]
1185 vinserti128 m1, m1, xm3, 1 ;[row2 row4 row6 row8]
1186
1187 punpcklbw m2, m0, m1 ;[1 - 8; 1 - 8][row1row2; row5row6]
1188 punpckhbw m0, m1 ;[1 - 8; 1 - 8][row3row4; row7row8]
1189
1190 punpcklwd m1, m2, m0 ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8]
1191 punpckhwd m2, m0 ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8]
1192
1193 mova m0, [trans8_shuf]
1194
1195 vpermd m1, m0, m1 ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8]
1196     vpermd       m2, m0, m2            ;[5 - 6; 7 - 8][row1row2row3row4row5row6row7row8]
1197
1198 movu [r0], m1
1199 movu [r0 + 32], m2
1200 RET
1201 %endif
1202
1203 INIT_XMM sse2
1204 cglobal transpose8, 3, 5, 8, dest, src, stride
1205 lea r3, [2 * r2]
1206 lea r4, [3 * r2]
1207 movh m0, [r1]
1208 movh m1, [r1 + r2]
1209 movh m2, [r1 + r3]
1210 movh m3, [r1 + r4]
1211 movh m4, [r1 + 4 * r2]
1212 lea r1, [r1 + 4 * r2]
1213 movh m5, [r1 + r2]
1214 movh m6, [r1 + r3]
1215 movh m7, [r1 + r4]
1216
1217 punpcklbw m0, m1
1218 punpcklbw m2, m3
1219 punpcklbw m4, m5
1220 punpcklbw m6, m7
1221
1222 punpckhwd m1, m0, m2
1223 punpcklwd m0, m2
1224 punpckhwd m5, m4, m6
1225 punpcklwd m4, m6
1226 punpckhdq m2, m0, m4
1227 punpckldq m0, m4
1228 punpckhdq m3, m1, m5
1229 punpckldq m1, m5
1230
1231 movu [r0], m0
1232 movu [r0 + 16], m2
1233 movu [r0 + 32], m1
1234 movu [r0 + 48], m3
1235 RET
1236 %endif
1237
1238 %macro TRANSPOSE_8x8 1
1239
1240 movh m0, [r1]
1241 movh m1, [r1 + r2]
1242 movh m2, [r1 + 2 * r2]
1243 lea r1, [r1 + 2 * r2]
1244 movh m3, [r1 + r2]
1245 movh m4, [r1 + 2 * r2]
1246 lea r1, [r1 + 2 * r2]
1247 movh m5, [r1 + r2]
1248 movh m6, [r1 + 2 * r2]
1249 lea r1, [r1 + 2 * r2]
1250 movh m7, [r1 + r2]
1251
1252 punpcklbw m0, m1
1253 punpcklbw m2, m3
1254 punpcklbw m4, m5
1255 punpcklbw m6, m7
1256
1257 punpckhwd m1, m0, m2
1258 punpcklwd m0, m2
1259 punpckhwd m5, m4, m6
1260 punpcklwd m4, m6
1261 punpckhdq m2, m0, m4
1262 punpckldq m0, m4
1263 punpckhdq m3, m1, m5
1264 punpckldq m1, m5
1265
1266 movh [r0], m0
1267 movhps [r0 + %1], m0
1268 movh [r0 + 2 * %1], m2
1269 lea r0, [r0 + 2 * %1]
1270 movhps [r0 + %1], m2
1271 movh [r0 + 2 * %1], m1
1272 lea r0, [r0 + 2 * %1]
1273 movhps [r0 + %1], m1
1274 movh [r0 + 2 * %1], m3
1275 lea r0, [r0 + 2 * %1]
1276 movhps [r0 + %1], m3
1277
1278 %endmacro
1279
1280
1281 ;-----------------------------------------------------------------
1282 ; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
1283 ;-----------------------------------------------------------------
1284 %if HIGH_BIT_DEPTH == 1
1285 %if ARCH_X86_64 == 1
1286 INIT_YMM avx2
1287 cglobal transpose16x8_internal
1288 movu m0, [r1]
1289 movu m1, [r1 + r2]
1290 movu m2, [r1 + 2 * r2]
1291 movu m3, [r1 + r3]
1292 lea r1, [r1 + 4 * r2]
1293
1294 movu m4, [r1]
1295 movu m5, [r1 + r2]
1296 movu m6, [r1 + 2 * r2]
1297 movu m7, [r1 + r3]
1298
1299 punpcklwd m8, m0, m1 ;[1 - 4; 9 - 12][1 2]
1300 punpckhwd m0, m1 ;[5 - 8; 13 -16][1 2]
1301
1302 punpcklwd m1, m2, m3 ;[1 - 4; 9 - 12][3 4]
1303 punpckhwd m2, m3 ;[5 - 8; 13 -16][3 4]
1304
1305 punpcklwd m3, m4, m5 ;[1 - 4; 9 - 12][5 6]
1306 punpckhwd m4, m5 ;[5 - 8; 13 -16][5 6]
1307
1308 punpcklwd m5, m6, m7 ;[1 - 4; 9 - 12][7 8]
1309 punpckhwd m6, m7 ;[5 - 8; 13 -16][7 8]
1310
1311 punpckldq m7, m8, m1 ;[1 - 2; 9 - 10][1 2 3 4]
1312 punpckhdq m8, m1 ;[3 - 4; 11 - 12][1 2 3 4]
1313
1314 punpckldq m1, m3, m5 ;[1 - 2; 9 - 10][5 6 7 8]
1315 punpckhdq m3, m5 ;[3 - 4; 11 - 12][5 6 7 8]
1316
1317 punpckldq m5, m0, m2 ;[5 - 6; 13 - 14][1 2 3 4]
1318 punpckhdq m0, m2 ;[7 - 8; 15 - 16][1 2 3 4]
1319
1320 punpckldq m2, m4, m6 ;[5 - 6; 13 - 14][5 6 7 8]
1321 punpckhdq m4, m6 ;[7 - 8; 15 - 16][5 6 7 8]
1322
1323 punpcklqdq m6, m7, m1 ;[1 ; 9 ][1 2 3 4 5 6 7 8]
1324 punpckhqdq m7, m1 ;[2 ; 10][1 2 3 4 5 6 7 8]
1325
1326 punpcklqdq m1, m8, m3 ;[3 ; 11][1 2 3 4 5 6 7 8]
1327 punpckhqdq m8, m3 ;[4 ; 12][1 2 3 4 5 6 7 8]
1328
1329 punpcklqdq m3, m5, m2 ;[5 ; 13][1 2 3 4 5 6 7 8]
1330 punpckhqdq m5, m2 ;[6 ; 14][1 2 3 4 5 6 7 8]
1331
1332 punpcklqdq m2, m0, m4 ;[7 ; 15][1 2 3 4 5 6 7 8]
1333 punpckhqdq m0, m4 ;[8 ; 16][1 2 3 4 5 6 7 8]
1334
1335 movu [r0 + 0 * 32], xm6
1336 vextracti128 [r0 + 8 * 32], m6, 1
1337 movu [r0 + 1 * 32], xm7
1338 vextracti128 [r0 + 9 * 32], m7, 1
1339 movu [r0 + 2 * 32], xm1
1340 vextracti128 [r0 + 10 * 32], m1, 1
1341 movu [r0 + 3 * 32], xm8
1342 vextracti128 [r0 + 11 * 32], m8, 1
1343 movu [r0 + 4 * 32], xm3
1344 vextracti128 [r0 + 12 * 32], m3, 1
1345 movu [r0 + 5 * 32], xm5
1346 vextracti128 [r0 + 13 * 32], m5, 1
1347 movu [r0 + 6 * 32], xm2
1348 vextracti128 [r0 + 14 * 32], m2, 1
1349 movu [r0 + 7 * 32], xm0
1350 vextracti128 [r0 + 15 * 32], m0, 1
1351 ret
1352
1353 cglobal transpose16, 3, 4, 9
1354 add r2, r2
1355 lea r3, [r2 * 3]
1356 call transpose16x8_internal
1357 lea r1, [r1 + 4 * r2]
1358 add r0, 16
1359 call transpose16x8_internal
1360 RET
1361 %endif
1362 INIT_XMM sse2
1363 cglobal transpose16, 3, 7, 4, dest, src, stride
1364 add r2, r2
1365 mov r3, r0
1366 mov r4, r1
1367 mov r5, 32
1368 mov r6, r0
1369 call transpose8_internal
1370 lea r1, [r1 - 8 + 2 * r2]
1371 lea r0, [r6 + 16]
1372 mov r3, r0
1373 call transpose8_internal
1374 lea r1, [r4 + 16]
1375 lea r0, [r6 + 8 * r5]
1376 mov r3, r0
1377 call transpose8_internal
1378 lea r1, [r1 - 8 + 2 * r2]
1379 lea r0, [r6 + 8 * r5 + 16]
1380 mov r3, r0
1381 call transpose8_internal
1382 RET
1383 %else ;HIGH_BIT_DEPTH == 0
1384 %if ARCH_X86_64 == 1
1385 INIT_YMM avx2
1386 cglobal transpose16, 3, 5, 9
1387 lea r3, [r2 * 3]
1388 lea r4, [r1 + 8 * r2]
1389
1390 movu xm0, [r1]
1391 movu xm1, [r1 + r2]
1392 movu xm2, [r1 + 2 * r2]
1393 movu xm3, [r1 + r3]
1394 vinserti128 m0, m0, [r4], 1
1395 vinserti128 m1, m1, [r4 + r2], 1
1396 vinserti128 m2, m2, [r4 + 2 * r2], 1
1397 vinserti128 m3, m3, [r4 + r3], 1
1398 lea r1, [r1 + 4 * r2]
1399 lea r4, [r4 + 4 * r2]
1400
1401 movu xm4, [r1]
1402 movu xm5, [r1 + r2]
1403 movu xm6, [r1 + 2 * r2]
1404 movu xm7, [r1 + r3]
1405 vinserti128 m4, m4, [r4], 1
1406 vinserti128 m5, m5, [r4 + r2], 1
1407 vinserti128 m6, m6, [r4 + 2 * r2], 1
1408 vinserti128 m7, m7, [r4 + r3], 1
1409
1410 punpcklbw m8, m0, m1 ;[1 - 8 ; 1 - 8 ][1 2 9 10]
1411 punpckhbw m0, m1 ;[9 - 16; 9 - 16][1 2 9 10]
1412
1413 punpcklbw m1, m2, m3 ;[1 - 8 ; 1 - 8 ][3 4 11 12]
1414 punpckhbw m2, m3 ;[9 - 16; 9 - 16][3 4 11 12]
1415
1416 punpcklbw m3, m4, m5 ;[1 - 8 ; 1 - 8 ][5 6 13 14]
1417 punpckhbw m4, m5 ;[9 - 16; 9 - 16][5 6 13 14]
1418
1419 punpcklbw m5, m6, m7 ;[1 - 8 ; 1 - 8 ][7 8 15 16]
1420 punpckhbw m6, m7 ;[9 - 16; 9 - 16][7 8 15 16]
1421
1422 punpcklwd m7, m8, m1 ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12]
1423 punpckhwd m8, m1 ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12]
1424
1425 punpcklwd m1, m3, m5 ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16]
1426 punpckhwd m3, m5 ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16]
1427
1428 punpcklwd m5, m0, m2 ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12]
1429 punpckhwd m0, m2 ;[13- 16; 13 - 16][1 2 3 4 9 10 11 12]
1430
1431 punpcklwd m2, m4, m6 ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16]
1432 punpckhwd m4, m6 ;[13- 16; 13 - 16][5 6 7 8 13 14 15 16]
1433
1434 punpckldq m6, m7, m1 ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1435 punpckhdq m7, m1 ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1436
1437 punpckldq m1, m8, m3 ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1438 punpckhdq m8, m3 ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1439
1440 punpckldq m3, m5, m2 ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1441 punpckhdq m5, m2 ;[11- 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1442
1443 punpckldq m2, m0, m4 ;[13- 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1444 punpckhdq m0, m4 ;[15- 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1445
1446 vpermq m6, m6, 0xD8
1447 vpermq m7, m7, 0xD8
1448 vpermq m1, m1, 0xD8
1449 vpermq m8, m8, 0xD8
1450 vpermq m3, m3, 0xD8
1451 vpermq m5, m5, 0xD8
1452 vpermq m2, m2, 0xD8
1453 vpermq m0, m0, 0xD8
1454
1455 movu [r0 + 0 * 16], m6
1456 movu [r0 + 2 * 16], m7
1457 movu [r0 + 4 * 16], m1
1458 movu [r0 + 6 * 16], m8
1459 movu [r0 + 8 * 16], m3
1460 movu [r0 + 10 * 16], m5
1461 movu [r0 + 12 * 16], m2
1462 movu [r0 + 14 * 16], m0
1463 RET
1464 %endif
1465 INIT_XMM sse2
1466 cglobal transpose16, 3, 5, 8, dest, src, stride
1467 mov r3, r0
1468 mov r4, r1
1469 TRANSPOSE_8x8 16
1470 lea r1, [r1 + 2 * r2]
1471 lea r0, [r3 + 8]
1472 TRANSPOSE_8x8 16
1473 lea r1, [r4 + 8]
1474 lea r0, [r3 + 8 * 16]
1475 TRANSPOSE_8x8 16
1476 lea r1, [r1 + 2 * r2]
1477 lea r0, [r3 + 8 * 16 + 8]
1478 TRANSPOSE_8x8 16
1479 RET
1480 %endif
1481
1482 cglobal transpose16_internal
1483 TRANSPOSE_8x8 r6
1484 lea r1, [r1 + 2 * r2]
1485 lea r0, [r5 + 8]
1486 TRANSPOSE_8x8 r6
1487 lea r1, [r1 + 2 * r2]
1488 neg r2
1489 lea r1, [r1 + r2 * 8]
1490 lea r1, [r1 + r2 * 8 + 8]
1491 neg r2
1492 lea r0, [r5 + 8 * r6]
1493 TRANSPOSE_8x8 r6
1494 lea r1, [r1 + 2 * r2]
1495 lea r0, [r5 + 8 * r6 + 8]
1496 TRANSPOSE_8x8 r6
1497 ret
1498
1499 ;-----------------------------------------------------------------
1500 ; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
1501 ;-----------------------------------------------------------------
1502 %if HIGH_BIT_DEPTH == 1
1503 %if ARCH_X86_64 == 1
1504 INIT_YMM avx2
1505 cglobal transpose8x32_internal
1506 movu m0, [r1]
1507 movu m1, [r1 + 32]
1508 movu m2, [r1 + r2]
1509 movu m3, [r1 + r2 + 32]
1510 movu m4, [r1 + 2 * r2]
1511 movu m5, [r1 + 2 * r2 + 32]
1512 movu m6, [r1 + r3]
1513 movu m7, [r1 + r3 + 32]
1514 lea r1, [r1 + 4 * r2]
1515
1516 punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2]
1517 punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2]
1518
1519 punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4]
1520 punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4]
1521
1522 punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2]
1523 punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2]
1524
1525 punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4]
1526 punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4]
1527
1528 punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4]
1529 punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4]
1530
1531 punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4]
1532 punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4]
1533
1534 punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4]
1535 punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4]
1536
1537 punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4]
1538 punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4]
1539
1540 movq [r0 + 0 * 64], xm7
1541 movhps [r0 + 1 * 64], xm7
1542 vextracti128 xm5, m7, 1
1543 movq [r0 + 8 * 64], xm5
1544 movhps [r0 + 9 * 64], xm5
1545
1546 movu m7, [r1]
1547 movu m9, [r1 + 32]
1548 movu m10, [r1 + r2]
1549 movu m11, [r1 + r2 + 32]
1550 movu m12, [r1 + 2 * r2]
1551 movu m13, [r1 + 2 * r2 + 32]
1552 movu m14, [r1 + r3]
1553 movu m15, [r1 + r3 + 32]
1554
1555 punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6]
1556 punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6]
1557
1558 punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8]
1559 punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8]
1560
1561 punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6]
1562 punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6]
1563
1564 punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8]
1565 punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8]
1566
1567 punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8]
1568 punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8]
1569
1570 punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8]
1571 punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8]
1572
1573 punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8]
1574 punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8]
1575
1576 punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8]
1577 punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8]
1578
1579 movq [r0 + 0 * 64 + 8], xm15
1580 movhps [r0 + 1 * 64 + 8], xm15
1581 vextracti128 xm13, m15, 1
1582 movq [r0 + 8 * 64 + 8], xm13
1583 movhps [r0 + 9 * 64 + 8], xm13
1584
1585 punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8]
1586 punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8]
1587
1588 punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8]
1589 punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8]
1590
1591 punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8]
1592 punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8]
1593
1594 punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8]
1595 punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8]
1596
1597 punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8]
1598 punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8]
1599
1600 punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8]
1601 punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8]
1602
1603 punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8]
1604 punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8]
1605
1606 movu [r0 + 2 * 64], xm13
1607 vextracti128 [r0 + 10 * 64], m13, 1
1608
1609 movu [r0 + 3 * 64], xm8
1610 vextracti128 [r0 + 11 * 64], m8, 1
1611
1612 movu [r0 + 4 * 64], xm5
1613 vextracti128 [r0 + 12 * 64], m5, 1
1614
1615 movu [r0 + 5 * 64], xm2
1616 vextracti128 [r0 + 13 * 64], m2, 1
1617
1618 movu [r0 + 6 * 64], xm10
1619 vextracti128 [r0 + 14 * 64], m10, 1
1620
1621 movu [r0 + 7 * 64], xm0
1622 vextracti128 [r0 + 15 * 64], m0, 1
1623
1624 movu [r0 + 16 * 64], xm7
1625 vextracti128 [r0 + 24 * 64], m7, 1
1626
1627 movu [r0 + 17 * 64], xm4
1628 vextracti128 [r0 + 25 * 64], m4, 1
1629
1630 movu [r0 + 18 * 64], xm12
1631 vextracti128 [r0 + 26 * 64], m12, 1
1632
1633 movu [r0 + 19 * 64], xm6
1634 vextracti128 [r0 + 27 * 64], m6, 1
1635
1636 movu [r0 + 20 * 64], xm14
1637 vextracti128 [r0 + 28 * 64], m14, 1
1638
1639 movu [r0 + 21 * 64], xm3
1640 vextracti128 [r0 + 29 * 64], m3, 1
1641
1642 movu [r0 + 22 * 64], xm11
1643 vextracti128 [r0 + 30 * 64], m11, 1
1644
1645 movu [r0 + 23 * 64], xm1
1646 vextracti128 [r0 + 31 * 64], m1, 1
1647 ret
1648
1649 cglobal transpose32, 3, 4, 16
1650 add r2, r2
1651 lea r3, [r2 * 3]
1652 call transpose8x32_internal
1653 add r0, 16
1654 lea r1, [r1 + 4 * r2]
1655 call transpose8x32_internal
1656 add r0, 16
1657 lea r1, [r1 + 4 * r2]
1658 call transpose8x32_internal
1659 add r0, 16
1660 lea r1, [r1 + 4 * r2]
1661 call transpose8x32_internal
1662 RET
1663 %endif
1664 INIT_XMM sse2
1665 cglobal transpose32, 3, 7, 4, dest, src, stride
1666 add r2, r2
1667 mov r3, r0
1668 mov r4, r1
1669 mov r5, 64
1670 mov r6, r0
1671 call transpose8_internal
1672 lea r1, [r1 - 8 + 2 * r2]
1673 lea r0, [r6 + 16]
1674 mov r3, r0
1675 call transpose8_internal
1676 lea r1, [r1 - 8 + 2 * r2]
1677 lea r0, [r6 + 32]
1678 mov r3, r0
1679 call transpose8_internal
1680 lea r1, [r1 - 8 + 2 * r2]
1681 lea r0, [r6 + 48]
1682 mov r3, r0
1683 call transpose8_internal
1684 lea r1, [r4 + 16]
1685 lea r0, [r6 + 8 * 64]
1686 mov r3, r0
1687 call transpose8_internal
1688 lea r1, [r1 - 8 + 2 * r2]
1689 lea r0, [r6 + 8 * 64 + 16]
1690 mov r3, r0
1691 call transpose8_internal
1692 lea r1, [r1 - 8 + 2 * r2]
1693 lea r0, [r6 + 8 * 64 + 32]
1694 mov r3, r0
1695 call transpose8_internal
1696 lea r1, [r1 - 8 + 2 * r2]
1697 lea r0, [r6 + 8 * 64 + 48]
1698 mov r3, r0
1699 call transpose8_internal
1700 lea r1, [r4 + 32]
1701 lea r0, [r6 + 16 * 64]
1702 mov r3, r0
1703 call transpose8_internal
1704 lea r1, [r1 - 8 + 2 * r2]
1705 lea r0, [r6 + 16 * 64 + 16]
1706 mov r3, r0
1707 call transpose8_internal
1708 lea r1, [r1 - 8 + 2 * r2]
1709 lea r0, [r6 + 16 * 64 + 32]
1710 mov r3, r0
1711 call transpose8_internal
1712 lea r1, [r1 - 8 + 2 * r2]
1713 lea r0, [r6 + 16 * 64 + 48]
1714 mov r3, r0
1715 call transpose8_internal
1716 lea r1, [r4 + 48]
1717 lea r0, [r6 + 24 * 64]
1718 mov r3, r0
1719 call transpose8_internal
1720 lea r1, [r1 - 8 + 2 * r2]
1721 lea r0, [r6 + 24 * 64 + 16]
1722 mov r3, r0
1723 call transpose8_internal
1724 lea r1, [r1 - 8 + 2 * r2]
1725 lea r0, [r6 + 24 * 64 + 32]
1726 mov r3, r0
1727 call transpose8_internal
1728 lea r1, [r1 - 8 + 2 * r2]
1729 lea r0, [r6 + 24 * 64 + 48]
1730 mov r3, r0
1731 call transpose8_internal
1732 RET
1733 %else ;HIGH_BIT_DEPTH == 0
1734 INIT_XMM sse2
1735 cglobal transpose32, 3, 7, 8, dest, src, stride
1736 mov r3, r0
1737 mov r4, r1
1738 mov r5, r0
1739 mov r6, 32
1740 call transpose16_internal
1741 lea r1, [r1 - 8 + 2 * r2]
1742 lea r0, [r3 + 16]
1743 mov r5, r0
1744 call transpose16_internal
1745 lea r1, [r4 + 16]
1746 lea r0, [r3 + 16 * 32]
1747 mov r5, r0
1748 call transpose16_internal
1749 lea r1, [r1 - 8 + 2 * r2]
1750 lea r0, [r3 + 16 * 32 + 16]
1751 mov r5, r0
1752 call transpose16_internal
1753 RET
1754
1755 %if ARCH_X86_64 == 1
1756 INIT_YMM avx2
1757 cglobal transpose32, 3, 5, 16
1758 lea r3, [r2 * 3]
1759 mov r4d, 2
1760
1761 .loop:
1762 movu m0, [r1]
1763 movu m1, [r1 + r2]
1764 movu m2, [r1 + 2 * r2]
1765 movu m3, [r1 + r3]
1766 lea r1, [r1 + 4 * r2]
1767
1768 movu m4, [r1]
1769 movu m5, [r1 + r2]
1770 movu m6, [r1 + 2 * r2]
1771 movu m7, [r1 + r3]
1772
1773 punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
1774 punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
1775
1776 punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
1777 punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
1778
1779 punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
1780 punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
1781
1782 punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
1783 punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
1784
1785 punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
1786     punpckhwd    m8, m1                ;[5 - 8 ; 21 - 24][1 2 3 4]
1787
1788 punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
1789     punpckhwd    m3, m5                ;[5 - 8 ; 21 - 24][5 6 7 8]
1790
1791 punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
1792     punpckhwd    m0, m2                ;[13 - 16; 29 - 32][1 2 3 4]
1793
1794 punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
1795     punpckhwd    m4, m6                ;[13 - 16; 29 - 32][5 6 7 8]
1796
1797 punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
1798 punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
1799
1800 punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
1801 punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
1802
1803 punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
1804 punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
1805
1806 punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
1807 punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
1808
1809 movq [r0 + 0 * 32], xm6
1810 movhps [r0 + 1 * 32], xm6
1811 vextracti128 xm4, m6, 1
1812 movq [r0 + 16 * 32], xm4
1813 movhps [r0 + 17 * 32], xm4
1814
1815 lea r1, [r1 + 4 * r2]
1816 movu m9, [r1]
1817 movu m10, [r1 + r2]
1818 movu m11, [r1 + 2 * r2]
1819 movu m12, [r1 + r3]
1820 lea r1, [r1 + 4 * r2]
1821
1822 movu m13, [r1]
1823 movu m14, [r1 + r2]
1824 movu m15, [r1 + 2 * r2]
1825 movu m6, [r1 + r3]
1826
1827 punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
1828 punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
1829
1830 punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
1831 punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
1832
1833 punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
1834 punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
1835
1836 punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
1837 punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
1838
1839 punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
1840     punpckhwd    m4, m10               ;[5 - 8 ; 21 - 24][9 10 11 12]
1841
1842 punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
1843     punpckhwd    m12, m14              ;[5 - 8 ; 21 - 24][13 14 15 16]
1844
1845 punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
1846 punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
1847
1848 punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
1849 punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
1850
1851 punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
1852 punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
1853
1854 punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
1855 punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
1856
1857 punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
1858 punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
1859
1860 punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
1861 punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
1862
1863
1864 punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1865 punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1866
1867 punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1868 punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1869
1870 punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1871 punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1872
1873 punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1874 punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1875
1876 punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1877 punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1878
1879 punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1880 punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1881
1882 punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1883 punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
1884
1885 movq [r0 + 0 * 32 + 8], xm15
1886 movhps [r0 + 1 * 32 + 8], xm15
1887 vextracti128 xm9, m15, 1
1888 movq [r0 + 16 * 32 + 8], xm9
1889 movhps [r0 + 17 * 32 + 8], xm9
1890
1891 movu [r0 + 2 * 32], xm13
1892 vextracti128 [r0 + 18 * 32], m13, 1
1893
1894 movu [r0 + 3 * 32], xm7
1895 vextracti128 [r0 + 19 * 32], m7, 1
1896
1897 movu [r0 + 4 * 32], xm6
1898 vextracti128 [r0 + 20 * 32], m6, 1
1899
1900 movu [r0 + 5 * 32], xm1
1901 vextracti128 [r0 + 21 * 32], m1, 1
1902
1903 movu [r0 + 6 * 32], xm10
1904 vextracti128 [r0 + 22 * 32], m10, 1
1905
1906 movu [r0 + 7 * 32], xm8
1907 vextracti128 [r0 + 23 * 32], m8, 1
1908
1909 movu [r0 + 8 * 32], xm4
1910 vextracti128 [r0 + 24 * 32], m4, 1
1911
1912 movu [r0 + 9 * 32], xm3
1913 vextracti128 [r0 + 25 * 32], m3, 1
1914
1915 movu [r0 + 10 * 32], xm12
1916 vextracti128 [r0 + 26 * 32], m12, 1
1917
1918 movu [r0 + 11 * 32], xm5
1919 vextracti128 [r0 + 27 * 32], m5, 1
1920
1921 movu [r0 + 12 * 32], xm14
1922 vextracti128 [r0 + 28 * 32], m14, 1
1923
1924 movu [r0 + 13 * 32], xm2
1925 vextracti128 [r0 + 29 * 32], m2, 1
1926
1927 movu [r0 + 14 * 32], xm11
1928 vextracti128 [r0 + 30 * 32], m11, 1
1929
1930 movu [r0 + 15 * 32], xm0
1931 vextracti128 [r0 + 31 * 32], m0, 1
1932
1933 add r0, 16
1934 lea r1, [r1 + 4 * r2]
1935 dec r4d
1936 jnz .loop
1937 RET
1938 %endif
1939 %endif
1940
1941 ;-----------------------------------------------------------------
1942 ; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride)
1943 ;-----------------------------------------------------------------
1944 %if HIGH_BIT_DEPTH == 1
1945 %if ARCH_X86_64 == 1
1946 INIT_YMM avx2
1947 cglobal transpose8x32_64_internal
1948 movu m0, [r1]
1949 movu m1, [r1 + 32]
1950 movu m2, [r1 + r2]
1951 movu m3, [r1 + r2 + 32]
1952 movu m4, [r1 + 2 * r2]
1953 movu m5, [r1 + 2 * r2 + 32]
1954 movu m6, [r1 + r3]
1955 movu m7, [r1 + r3 + 32]
1956 lea r1, [r1 + 4 * r2]
1957
1958 punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2]
1959 punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2]
1960
1961 punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4]
1962 punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4]
1963
1964 punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2]
1965 punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2]
1966
1967 punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4]
1968 punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4]
1969
1970 punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4]
1971 punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4]
1972
1973 punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4]
1974 punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4]
1975
1976 punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4]
1977 punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4]
1978
1979 punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4]
1980 punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4]
1981
1982 movq [r0 + 0 * 128], xm7
1983 movhps [r0 + 1 * 128], xm7
1984 vextracti128 xm5, m7, 1
1985 movq [r0 + 8 * 128], xm5
1986 movhps [r0 + 9 * 128], xm5
1987
1988 movu m7, [r1]
1989 movu m9, [r1 + 32]
1990 movu m10, [r1 + r2]
1991 movu m11, [r1 + r2 + 32]
1992 movu m12, [r1 + 2 * r2]
1993 movu m13, [r1 + 2 * r2 + 32]
1994 movu m14, [r1 + r3]
1995 movu m15, [r1 + r3 + 32]
1996
1997 punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6]
1998 punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6]
1999
2000 punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8]
2001 punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8]
2002
2003 punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6]
2004 punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6]
2005
2006 punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8]
2007 punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8]
2008
2009 punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8]
2010 punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8]
2011
2012 punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8]
2013 punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8]
2014
2015 punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8]
2016 punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8]
2017
2018 punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8]
2019 punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8]
2020
2021 movq [r0 + 0 * 128 + 8], xm15
2022 movhps [r0 + 1 * 128 + 8], xm15
2023 vextracti128 xm13, m15, 1
2024 movq [r0 + 8 * 128 + 8], xm13
2025 movhps [r0 + 9 * 128 + 8], xm13
2026
2027 punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8]
2028 punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8]
2029
2030 punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8]
2031 punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8]
2032
2033 punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8]
2034 punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8]
2035
2036 punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8]
2037 punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8]
2038
2039 punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8]
2040 punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8]
2041
2042 punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8]
2043 punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8]
2044
2045 punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8]
2046 punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8]
2047
2048 movu [r0 + 2 * 128], xm13
2049 vextracti128 [r0 + 10 * 128], m13, 1
2050
2051 movu [r0 + 3 * 128], xm8
2052 vextracti128 [r0 + 11 * 128], m8, 1
2053
2054 movu [r0 + 4 * 128], xm5
2055 vextracti128 [r0 + 12 * 128], m5, 1
2056
2057 movu [r0 + 5 * 128], xm2
2058 vextracti128 [r0 + 13 * 128], m2, 1
2059
2060 movu [r0 + 6 * 128], xm10
2061 vextracti128 [r0 + 14 * 128], m10, 1
2062
2063 movu [r0 + 7 * 128], xm0
2064 vextracti128 [r0 + 15 * 128], m0, 1
2065
2066 movu [r0 + 16 * 128], xm7
2067 vextracti128 [r0 + 24 * 128], m7, 1
2068
2069 movu [r0 + 17 * 128], xm4
2070 vextracti128 [r0 + 25 * 128], m4, 1
2071
2072 movu [r0 + 18 * 128], xm12
2073 vextracti128 [r0 + 26 * 128], m12, 1
2074
2075 movu [r0 + 19 * 128], xm6
2076 vextracti128 [r0 + 27 * 128], m6, 1
2077
2078 movu [r0 + 20 * 128], xm14
2079 vextracti128 [r0 + 28 * 128], m14, 1
2080
2081 movu [r0 + 21 * 128], xm3
2082 vextracti128 [r0 + 29 * 128], m3, 1
2083
2084 movu [r0 + 22 * 128], xm11
2085 vextracti128 [r0 + 30 * 128], m11, 1
2086
2087 movu [r0 + 23 * 128], xm1
2088 vextracti128 [r0 + 31 * 128], m1, 1
2089 ret
2090
2091 cglobal transpose64, 3, 6, 16
2092 add r2, r2
2093 lea r3, [3 * r2]
2094 lea r4, [r1 + 64]
2095 lea r5, [r0 + 16]
2096
2097 call transpose8x32_64_internal
2098 mov r1, r4
2099 lea r0, [r0 + 32 * 128]
2100 call transpose8x32_64_internal
2101 mov r0, r5
2102 lea r5, [r0 + 16]
2103 lea r4, [r1 + 4 * r2]
2104 lea r1, [r4 - 64]
2105 call transpose8x32_64_internal
2106 mov r1, r4
2107 lea r0, [r0 + 32 * 128]
2108 call transpose8x32_64_internal
2109 mov r0, r5
2110 lea r5, [r0 + 16]
2111 lea r4, [r1 + 4 * r2]
2112 lea r1, [r4 - 64]
2113 call transpose8x32_64_internal
2114 mov r1, r4
2115 lea r0, [r0 + 32 * 128]
2116 call transpose8x32_64_internal
2117 mov r0, r5
2118 lea r5, [r0 + 16]
2119 lea r4, [r1 + 4 * r2]
2120 lea r1, [r4 - 64]
2121 call transpose8x32_64_internal
2122 mov r1, r4
2123 lea r0, [r0 + 32 * 128]
2124 call transpose8x32_64_internal
2125 mov r0, r5
2126 lea r5, [r0 + 16]
2127 lea r4, [r1 + 4 * r2]
2128 lea r1, [r4 - 64]
2129 call transpose8x32_64_internal
2130 mov r1, r4
2131 lea r0, [r0 + 32 * 128]
2132 call transpose8x32_64_internal
2133 mov r0, r5
2134 lea r5, [r0 + 16]
2135 lea r4, [r1 + 4 * r2]
2136 lea r1, [r4 - 64]
2137 call transpose8x32_64_internal
2138 mov r1, r4
2139 lea r0, [r0 + 32 * 128]
2140 call transpose8x32_64_internal
2141 mov r0, r5
2142 lea r5, [r0 + 16]
2143 lea r4, [r1 + 4 * r2]
2144 lea r1, [r4 - 64]
2145 call transpose8x32_64_internal
2146 mov r1, r4
2147 lea r0, [r0 + 32 * 128]
2148 call transpose8x32_64_internal
2149 mov r0, r5
2150 lea r4, [r1 + 4 * r2]
2151 lea r1, [r4 - 64]
2152 call transpose8x32_64_internal
2153 mov r1, r4
2154 lea r0, [r0 + 32 * 128]
2155 call transpose8x32_64_internal
2156 RET
2157 %endif
2158 INIT_XMM sse2
2159 cglobal transpose64, 3, 7, 4, dest, src, stride
2160 add r2, r2
2161 mov r3, r0
2162 mov r4, r1
2163 mov r5, 128
2164 mov r6, r0
2165 call transpose8_internal
2166 lea r1, [r1 - 8 + 2 * r2]
2167 lea r0, [r6 + 16]
2168 mov r3, r0
2169 call transpose8_internal
2170 lea r1, [r1 - 8 + 2 * r2]
2171 lea r0, [r6 + 32]
2172 mov r3, r0
2173 call transpose8_internal
2174 lea r1, [r1 - 8 + 2 * r2]
2175 lea r0, [r6 + 48]
2176 mov r3, r0
2177 call transpose8_internal
2178 lea r1, [r1 - 8 + 2 * r2]
2179 lea r0, [r6 + 64]
2180 mov r3, r0
2181 call transpose8_internal
2182 lea r1, [r1 - 8 + 2 * r2]
2183 lea r0, [r6 + 80]
2184 mov r3, r0
2185 call transpose8_internal
2186 lea r1, [r1 - 8 + 2 * r2]
2187 lea r0, [r6 + 96]
2188 mov r3, r0
2189 call transpose8_internal
2190 lea r1, [r1 - 8 + 2 * r2]
2191 lea r0, [r6 + 112]
2192 mov r3, r0
2193 call transpose8_internal
2194
2195 lea r1, [r4 + 16]
2196 lea r0, [r6 + 8 * 128]
2197 mov r3, r0
2198 call transpose8_internal
2199 lea r1, [r1 - 8 + 2 * r2]
2200 lea r0, [r6 + 8 * 128 + 16]
2201 mov r3, r0
2202 call transpose8_internal
2203 lea r1, [r1 - 8 + 2 * r2]
2204 lea r0, [r6 + 8 * 128 + 32]
2205 mov r3, r0
2206 call transpose8_internal
2207 lea r1, [r1 - 8 + 2 * r2]
2208 lea r0, [r6 + 8 * 128 + 48]
2209 mov r3, r0
2210 call transpose8_internal
2211 lea r1, [r1 - 8 + 2 * r2]
2212 lea r0, [r6 + 8 * 128 + 64]
2213 mov r3, r0
2214 call transpose8_internal
2215 lea r1, [r1 - 8 + 2 * r2]
2216 lea r0, [r6 + 8 * 128 + 80]
2217 mov r3, r0
2218 call transpose8_internal
2219 lea r1, [r1 - 8 + 2 * r2]
2220 lea r0, [r6 + 8 * 128 + 96]
2221 mov r3, r0
2222 call transpose8_internal
2223 lea r1, [r1 - 8 + 2 * r2]
2224 lea r0, [r6 + 8 * 128 + 112]
2225 mov r3, r0
2226 call transpose8_internal
2227
2228 lea r1, [r4 + 32]
2229 lea r0, [r6 + 16 * 128]
2230 mov r3, r0
2231 call transpose8_internal
2232 lea r1, [r1 - 8 + 2 * r2]
2233 lea r0, [r6 + 16 * 128 + 16]
2234 mov r3, r0
2235 call transpose8_internal
2236 lea r1, [r1 - 8 + 2 * r2]
2237 lea r0, [r6 + 16 * 128 + 32]
2238 mov r3, r0
2239 call transpose8_internal
2240 lea r1, [r1 - 8 + 2 * r2]
2241 lea r0, [r6 + 16 * 128 + 48]
2242 mov r3, r0
2243 call transpose8_internal
2244 lea r1, [r1 - 8 + 2 * r2]
2245 lea r0, [r6 + 16 * 128 + 64]
2246 mov r3, r0
2247 call transpose8_internal
2248 lea r1, [r1 - 8 + 2 * r2]
2249 lea r0, [r6 + 16 * 128 + 80]
2250 mov r3, r0
2251 call transpose8_internal
2252 lea r1, [r1 - 8 + 2 * r2]
2253 lea r0, [r6 + 16 * 128 + 96]
2254 mov r3, r0
2255 call transpose8_internal
2256 lea r1, [r1 - 8 + 2 * r2]
2257 lea r0, [r6 + 16 * 128 + 112]
2258 mov r3, r0
2259 call transpose8_internal
2260
2261 lea r1, [r4 + 48]
2262 lea r0, [r6 + 24 * 128]
2263 mov r3, r0
2264 call transpose8_internal
2265 lea r1, [r1 - 8 + 2 * r2]
2266 lea r0, [r6 + 24 * 128 + 16]
2267 mov r3, r0
2268 call transpose8_internal
2269 lea r1, [r1 - 8 + 2 * r2]
2270 lea r0, [r6 + 24 * 128 + 32]
2271 mov r3, r0
2272 call transpose8_internal
2273 lea r1, [r1 - 8 + 2 * r2]
2274 lea r0, [r6 + 24 * 128 + 48]
2275 mov r3, r0
2276 call transpose8_internal
2277 lea r1, [r1 - 8 + 2 * r2]
2278 lea r0, [r6 + 24 * 128 + 64]
2279 mov r3, r0
2280 call transpose8_internal
2281 lea r1, [r1 - 8 + 2 * r2]
2282 lea r0, [r6 + 24 * 128 + 80]
2283 mov r3, r0
2284 call transpose8_internal
2285 lea r1, [r1 - 8 + 2 * r2]
2286 lea r0, [r6 + 24 * 128 + 96]
2287 mov r3, r0
2288 call transpose8_internal
2289 lea r1, [r1 - 8 + 2 * r2]
2290 lea r0, [r6 + 24 * 128 + 112]
2291 mov r3, r0
2292 call transpose8_internal
2293
2294 lea r1, [r4 + 64]
2295 lea r0, [r6 + 32 * 128]
2296 mov r3, r0
2297 call transpose8_internal
2298 lea r1, [r1 - 8 + 2 * r2]
2299 lea r0, [r6 + 32 * 128 + 16]
2300 mov r3, r0
2301 call transpose8_internal
2302 lea r1, [r1 - 8 + 2 * r2]
2303 lea r0, [r6 + 32 * 128 + 32]
2304 mov r3, r0
2305 call transpose8_internal
2306 lea r1, [r1 - 8 + 2 * r2]
2307 lea r0, [r6 + 32 * 128 + 48]
2308 mov r3, r0
2309 call transpose8_internal
2310 lea r1, [r1 - 8 + 2 * r2]
2311 lea r0, [r6 + 32 * 128 + 64]
2312 mov r3, r0
2313 call transpose8_internal
2314 lea r1, [r1 - 8 + 2 * r2]
2315 lea r0, [r6 + 32 * 128 + 80]
2316 mov r3, r0
2317 call transpose8_internal
2318 lea r1, [r1 - 8 + 2 * r2]
2319 lea r0, [r6 + 32 * 128 + 96]
2320 mov r3, r0
2321 call transpose8_internal
2322 lea r1, [r1 - 8 + 2 * r2]
2323 lea r0, [r6 + 32 * 128 + 112]
2324 mov r3, r0
2325 call transpose8_internal
2326
2327 lea r1, [r4 + 80]
2328 lea r0, [r6 + 40 * 128]
2329 mov r3, r0
2330 call transpose8_internal
2331 lea r1, [r1 - 8 + 2 * r2]
2332 lea r0, [r6 + 40 * 128 + 16]
2333 mov r3, r0
2334 call transpose8_internal
2335 lea r1, [r1 - 8 + 2 * r2]
2336 lea r0, [r6 + 40 * 128 + 32]
2337 mov r3, r0
2338 call transpose8_internal
2339 lea r1, [r1 - 8 + 2 * r2]
2340 lea r0, [r6 + 40 * 128 + 48]
2341 mov r3, r0
2342 call transpose8_internal
2343 lea r1, [r1 - 8 + 2 * r2]
2344 lea r0, [r6 + 40 * 128 + 64]
2345 mov r3, r0
2346 call transpose8_internal
2347 lea r1, [r1 - 8 + 2 * r2]
2348 lea r0, [r6 + 40 * 128 + 80]
2349 mov r3, r0
2350 call transpose8_internal
2351 lea r1, [r1 - 8 + 2 * r2]
2352 lea r0, [r6 + 40 * 128 + 96]
2353 mov r3, r0
2354 call transpose8_internal
2355 lea r1, [r1 - 8 + 2 * r2]
2356 lea r0, [r6 + 40 * 128 + 112]
2357 mov r3, r0
2358 call transpose8_internal
2359
2360 lea r1, [r4 + 96]
2361 lea r0, [r6 + 48 * 128]
2362 mov r3, r0
2363 call transpose8_internal
2364 lea r1, [r1 - 8 + 2 * r2]
2365 lea r0, [r6 + 48 * 128 + 16]
2366 mov r3, r0
2367 call transpose8_internal
2368 lea r1, [r1 - 8 + 2 * r2]
2369 lea r0, [r6 + 48 * 128 + 32]
2370 mov r3, r0
2371 call transpose8_internal
2372 lea r1, [r1 - 8 + 2 * r2]
2373 lea r0, [r6 + 48 * 128 + 48]
2374 mov r3, r0
2375 call transpose8_internal
2376 lea r1, [r1 - 8 + 2 * r2]
2377 lea r0, [r6 + 48 * 128 + 64]
2378 mov r3, r0
2379 call transpose8_internal
2380 lea r1, [r1 - 8 + 2 * r2]
2381 lea r0, [r6 + 48 * 128 + 80]
2382 mov r3, r0
2383 call transpose8_internal
2384 lea r1, [r1 - 8 + 2 * r2]
2385 lea r0, [r6 + 48 * 128 + 96]
2386 mov r3, r0
2387 call transpose8_internal
2388 lea r1, [r1 - 8 + 2 * r2]
2389 lea r0, [r6 + 48 * 128 + 112]
2390 mov r3, r0
2391 call transpose8_internal
2392
2393 lea r1, [r4 + 112]
2394 lea r0, [r6 + 56 * 128]
2395 mov r3, r0
2396 call transpose8_internal
2397 lea r1, [r1 - 8 + 2 * r2]
2398 lea r0, [r6 + 56 * 128 + 16]
2399 mov r3, r0
2400 call transpose8_internal
2401 lea r1, [r1 - 8 + 2 * r2]
2402 lea r0, [r6 + 56 * 128 + 32]
2403 mov r3, r0
2404 call transpose8_internal
2405 lea r1, [r1 - 8 + 2 * r2]
2406 lea r0, [r6 + 56 * 128 + 48]
2407 mov r3, r0
2408 call transpose8_internal
2409 lea r1, [r1 - 8 + 2 * r2]
2410 lea r0, [r6 + 56 * 128 + 64]
2411 mov r3, r0
2412 call transpose8_internal
2413 lea r1, [r1 - 8 + 2 * r2]
2414 lea r0, [r6 + 56 * 128 + 80]
2415 mov r3, r0
2416 call transpose8_internal
2417 lea r1, [r1 - 8 + 2 * r2]
2418 lea r0, [r6 + 56 * 128 + 96]
2419 mov r3, r0
2420 call transpose8_internal
2421 lea r1, [r1 - 8 + 2 * r2]
2422 lea r0, [r6 + 56 * 128 + 112]
2423 mov r3, r0
2424 call transpose8_internal
2425 RET
2426 %else ;HIGH_BIT_DEPTH == 0
2427 %if ARCH_X86_64 == 1
2428 INIT_YMM avx2
2429
2430 cglobal transpose16x32_avx2
2431 movu m0, [r1]
2432 movu m1, [r1 + r2]
2433 movu m2, [r1 + 2 * r2]
2434 movu m3, [r1 + r3]
2435 lea r1, [r1 + 4 * r2]
2436
2437 movu m4, [r1]
2438 movu m5, [r1 + r2]
2439 movu m6, [r1 + 2 * r2]
2440 movu m7, [r1 + r3]
2441
2442 punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
2443 punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
2444
2445 punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
2446 punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
2447
2448 punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
2449 punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
2450
2451 punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
2452 punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
2453
2454 punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
2455     punpckhwd    m0,        m1                 ;[5 - 8 ; 21 - 24][1 2 3 4]
2456
2457 punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
2458     punpckhwd    m3,        m5                 ;[5 - 8 ; 21 - 24][5 6 7 8]
2459
2460 punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
2461     punpckhwd    m0,        m2                 ;[13- 16; 29 - 32][1 2 3 4]
2462
2463 punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
2464     punpckhwd    m4,        m6                 ;[13- 16; 29 - 32][5 6 7 8]
2465
2466 punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
2467 punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
2468
2469 punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
2470 punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
2471
2472 punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
2473 punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
2474
2475 punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
2476 punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
2477
2478 movq [r0 + 0 * 64], xm6
2479 movhps [r0 + 1 * 64], xm6
2480 vextracti128 xm4, m6, 1
2481 movq [r0 + 16 * 64], xm4
2482 movhps [r0 + 17 * 64], xm4
2483
2484 lea r1, [r1 + 4 * r2]
2485 movu m9, [r1]
2486 movu m10, [r1 + r2]
2487 movu m11, [r1 + 2 * r2]
2488 movu m12, [r1 + r3]
2489 lea r1, [r1 + 4 * r2]
2490
2491 movu m13, [r1]
2492 movu m14, [r1 + r2]
2493 movu m15, [r1 + 2 * r2]
2494 movu m6, [r1 + r3]
2495
2496 punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
2497 punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
2498
2499 punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
2500 punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
2501
2502 punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
2503 punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
2504
2505 punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
2506 punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
2507
2508 punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
2509     punpckhwd    m4,        m10                ;[5 - 8 ; 21 - 24][9 10 11 12]
2510
2511 punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
2512     punpckhwd    m12,       m14                ;[5 - 8 ; 21 - 24][13 14 15 16]
2513
2514 punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
2515 punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
2516
2517 punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
2518 punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
2519
2520 punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
2521 punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
2522
2523 punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
2524 punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
2525
2526 punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
2527 punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
2528
2529 punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
2530 punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
2531
2532
2533 punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2534 punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2535
2536 punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2537 punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2538
2539 punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2540 punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2541
2542 punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2543 punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2544
2545 punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2546 punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2547
2548 punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2549 punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2550
2551 punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2552 punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2553
2554 movq [r0 + 0 * 64 + 8], xm15
2555 movhps [r0 + 1 * 64 + 8], xm15
2556 vextracti128 xm9, m15, 1
2557 movq [r0 + 16 * 64 + 8], xm9
2558 movhps [r0 + 17 * 64 + 8], xm9
2559
2560 movu [r0 + 2 * 64], xm13
2561 vextracti128 [r0 + 18 * 64], m13, 1
2562
2563 movu [r0 + 3 * 64], xm7
2564 vextracti128 [r0 + 19 * 64], m7, 1
2565
2566 movu [r0 + 4 * 64], xm6
2567 vextracti128 [r0 + 20 * 64], m6, 1
2568
2569 movu [r0 + 5 * 64], xm1
2570 vextracti128 [r0 + 21 * 64], m1, 1
2571
2572 movu [r0 + 6 * 64], xm10
2573 vextracti128 [r0 + 22 * 64], m10, 1
2574
2575 movu [r0 + 7 * 64], xm8
2576 vextracti128 [r0 + 23 * 64], m8, 1
2577
2578 movu [r0 + 8 * 64], xm4
2579 vextracti128 [r0 + 24 * 64], m4, 1
2580
2581 movu [r0 + 9 * 64], xm3
2582 vextracti128 [r0 + 25 * 64], m3, 1
2583
2584 movu [r0 + 10 * 64], xm12
2585 vextracti128 [r0 + 26 * 64], m12, 1
2586
2587 movu [r0 + 11 * 64], xm5
2588 vextracti128 [r0 + 27 * 64], m5, 1
2589
2590 movu [r0 + 12 * 64], xm14
2591 vextracti128 [r0 + 28 * 64], m14, 1
2592
2593 movu [r0 + 13 * 64], xm2
2594 vextracti128 [r0 + 29 * 64], m2, 1
2595
2596 movu [r0 + 14 * 64], xm11
2597 vextracti128 [r0 + 30 * 64], m11, 1
2598
2599 movu [r0 + 15 * 64], xm0
2600 vextracti128 [r0 + 31 * 64], m0, 1
2601 ret
2602
2603 cglobal transpose64, 3, 6, 16
2604
2605 lea r3, [r2 * 3]
2606 lea r4, [r0 + 16]
2607
2608 lea r5, [r1 + 32]
2609 call transpose16x32_avx2
2610 lea r0, [r0 + 32 * 64]
2611 mov r1, r5
2612 call transpose16x32_avx2
2613
2614 mov r0, r4
2615 lea r5, [r1 + 4 * r2]
2616
2617 lea r1, [r5 - 32]
2618 call transpose16x32_avx2
2619 lea r0, [r0 + 32 * 64]
2620 mov r1, r5
2621 call transpose16x32_avx2
2622
2623 lea r0, [r4 + 16]
2624 lea r5, [r1 + 4 * r2]
2625
2626 lea r1, [r5 - 32]
2627 call transpose16x32_avx2
2628 lea r0, [r0 + 32 * 64]
2629 mov r1, r5
2630 call transpose16x32_avx2
2631
2632 lea r5, [r1 + 4 * r2]
2633 lea r0, [r4 + 32]
2634
2635 lea r1, [r5 - 32]
2636 call transpose16x32_avx2
2637 lea r0, [r0 + 32 * 64]
2638 mov r1, r5
2639 call transpose16x32_avx2
2640 RET
2641 %endif
2642
2643 INIT_XMM sse2
2644 cglobal transpose64, 3, 7, 8, dest, src, stride
2645 mov r3, r0
2646 mov r4, r1
2647 mov r5, r0
2648 mov r6, 64
2649 call transpose16_internal
2650 lea r1, [r1 - 8 + 2 * r2]
2651 lea r0, [r3 + 16]
2652 mov r5, r0
2653 call transpose16_internal
2654 lea r1, [r1 - 8 + 2 * r2]
2655 lea r0, [r3 + 32]
2656 mov r5, r0
2657 call transpose16_internal
2658 lea r1, [r1 - 8 + 2 * r2]
2659 lea r0, [r3 + 48]
2660 mov r5, r0
2661 call transpose16_internal
2662
2663 lea r1, [r4 + 16]
2664 lea r0, [r3 + 16 * 64]
2665 mov r5, r0
2666 call transpose16_internal
2667 lea r1, [r1 - 8 + 2 * r2]
2668 lea r0, [r3 + 16 * 64 + 16]
2669 mov r5, r0
2670 call transpose16_internal
2671 lea r1, [r1 - 8 + 2 * r2]
2672 lea r0, [r3 + 16 * 64 + 32]
2673 mov r5, r0
2674 call transpose16_internal
2675 lea r1, [r1 - 8 + 2 * r2]
2676 lea r0, [r3 + 16 * 64 + 48]
2677 mov r5, r0
2678 call transpose16_internal
2679
2680 lea r1, [r4 + 32]
2681 lea r0, [r3 + 32 * 64]
2682 mov r5, r0
2683 call transpose16_internal
2684 lea r1, [r1 - 8 + 2 * r2]
2685 lea r0, [r3 + 32 * 64 + 16]
2686 mov r5, r0
2687 call transpose16_internal
2688 lea r1, [r1 - 8 + 2 * r2]
2689 lea r0, [r3 + 32 * 64 + 32]
2690 mov r5, r0
2691 call transpose16_internal
2692 lea r1, [r1 - 8 + 2 * r2]
2693 lea r0, [r3 + 32 * 64 + 48]
2694 mov r5, r0
2695 call transpose16_internal
2696
2697 lea r1, [r4 + 48]
2698 lea r0, [r3 + 48 * 64]
2699 mov r5, r0
2700 call transpose16_internal
2701 lea r1, [r1 - 8 + 2 * r2]
2702 lea r0, [r3 + 48 * 64 + 16]
2703 mov r5, r0
2704 call transpose16_internal
2705 lea r1, [r1 - 8 + 2 * r2]
2706 lea r0, [r3 + 48 * 64 + 32]
2707 mov r5, r0
2708 call transpose16_internal
2709 lea r1, [r1 - 8 + 2 * r2]
2710 lea r0, [r3 + 48 * 64 + 48]
2711 mov r5, r0
2712 call transpose16_internal
2713 RET
2714 %endif
2715
2716
2717 ;=============================================================================
2718 ; SSIM
2719 ;=============================================================================
2720
2721 ;-----------------------------------------------------------------------------
2722 ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
2723 ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
2724 ;-----------------------------------------------------------------------------
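; Rough reference sketch (not part of the build) of what the core below is
; assumed to compute: per-4x4-block sums for two horizontally adjacent blocks
; at once, written to sums[2][4] as {s1, s2, ss, s12}:
;
;   static void ssim_4x4x2_core_c(const pixel *pix1, intptr_t stride1,
;                                 const pixel *pix2, intptr_t stride2,
;                                 int sums[2][4])
;   {
;       for (int z = 0; z < 2; z++, pix1 += 4, pix2 += 4)
;       {
;           int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;           for (int y = 0; y < 4; y++)
;               for (int x = 0; x < 4; x++)
;               {
;                   int a = pix1[x + y * stride1];
;                   int b = pix2[x + y * stride2];
;                   s1 += a; s2 += b;
;                   ss += a * a + b * b;
;                   s12 += a * b;
;               }
;           sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
;       }
;   }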
2725 %macro SSIM_ITER 1
2726 %if HIGH_BIT_DEPTH
2727 movdqu m5, [r0+(%1&1)*r1]
2728 movdqu m6, [r2+(%1&1)*r3]
2729 %else
2730 movq m5, [r0+(%1&1)*r1]
2731 movq m6, [r2+(%1&1)*r3]
2732 punpcklbw m5, m0
2733 punpcklbw m6, m0
2734 %endif
2735 %if %1==1
2736 lea r0, [r0+r1*2]
2737 lea r2, [r2+r3*2]
2738 %endif
2739 %if %1==0
2740 movdqa m1, m5
2741 movdqa m2, m6
2742 %else
2743 paddw m1, m5
2744 paddw m2, m6
2745 %endif
2746 pmaddwd m7, m5, m6
2747 pmaddwd m5, m5
2748 pmaddwd m6, m6
2749 ACCUM paddd, 3, 5, %1
2750 ACCUM paddd, 4, 7, %1
2751 paddd m3, m6
2752 %endmacro
2753
2754 %macro SSIM 0
2755 cglobal pixel_ssim_4x4x2_core, 4,4,8
2756 FIX_STRIDES r1, r3
2757 pxor m0, m0
2758 SSIM_ITER 0
2759 SSIM_ITER 1
2760 SSIM_ITER 2
2761 SSIM_ITER 3
2762 ; PHADDW m1, m2
2763 ; PHADDD m3, m4
2764 movdqa m7, [pw_1]
2765 pshufd m5, m3, q2301
2766 pmaddwd m1, m7
2767 pmaddwd m2, m7
2768 pshufd m6, m4, q2301
2769 packssdw m1, m2
2770 paddd m3, m5
2771 pshufd m1, m1, q3120
2772 paddd m4, m6
2773 pmaddwd m1, m7
2774 punpckhdq m5, m3, m4
2775 punpckldq m3, m4
2776
2777 %if UNIX64
2778 %define t0 r4
2779 %else
2780 %define t0 rax
2781 mov t0, r4mp
2782 %endif
2783
2784 movq [t0+ 0], m1
2785 movq [t0+ 8], m3
2786 movhps [t0+16], m1
2787 movq [t0+24], m5
2788 RET
2789
2790 ;-----------------------------------------------------------------------------
2791 ; float pixel_ssim_end4( int sum0[5][4], int sum1[5][4], int width )
2792 ;-----------------------------------------------------------------------------
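; Math sketch for the sequence below (s1, s2, ss, s12 are the per-block sums
; from the core above; ssim_c1/ssim_c2 are the constants at the top of this
; file), matching the in-line comments on the shifts and mulps/divps:
;   vars  = 64*ss  - s1*s1 - s2*s2
;   covar = 64*s12 - s1*s2
;   ssim  = (2*s1*s2 + ssim_c1) * (2*covar + ssim_c2)
;         / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
; evaluated for four blocks at a time, then summed horizontally.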
2793 cglobal pixel_ssim_end4, 2,3
2794 mov r2d, r2m
2795 mova m0, [r0+ 0]
2796 mova m1, [r0+16]
2797 mova m2, [r0+32]
2798 mova m3, [r0+48]
2799 mova m4, [r0+64]
2800 paddd m0, [r1+ 0]
2801 paddd m1, [r1+16]
2802 paddd m2, [r1+32]
2803 paddd m3, [r1+48]
2804 paddd m4, [r1+64]
2805 paddd m0, m1
2806 paddd m1, m2
2807 paddd m2, m3
2808 paddd m3, m4
2809 TRANSPOSE4x4D 0, 1, 2, 3, 4
2810
2811 ; s1=m0, s2=m1, ss=m2, s12=m3
2812 %if BIT_DEPTH == 10
2813 cvtdq2ps m0, m0
2814 cvtdq2ps m1, m1
2815 cvtdq2ps m2, m2
2816 cvtdq2ps m3, m3
2817 mulps m4, m0, m1 ; s1*s2
2818 mulps m0, m0 ; s1*s1
2819 mulps m1, m1 ; s2*s2
2820 mulps m2, [pf_64] ; ss*64
2821 mulps m3, [pf_128] ; s12*128
2822 addps m4, m4 ; s1*s2*2
2823 addps m0, m1 ; s1*s1 + s2*s2
2824 subps m2, m0 ; vars
2825 subps m3, m4 ; covar*2
2826 movaps m1, [ssim_c1]
2827 addps m4, m1 ; s1*s2*2 + ssim_c1
2828 addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
2829 movaps m1, [ssim_c2]
2830 addps m2, m1 ; vars + ssim_c2
2831 addps m3, m1 ; covar*2 + ssim_c2
2832 %else
2833 pmaddwd m4, m1, m0 ; s1*s2
2834 pslld m1, 16
2835 por m0, m1
2836 pmaddwd m0, m0 ; s1*s1 + s2*s2
2837 pslld m4, 1
2838 pslld m3, 7
2839 pslld m2, 6
2840 psubd m3, m4 ; covar*2
2841 psubd m2, m0 ; vars
2842 mova m1, [ssim_c1]
2843 paddd m0, m1
2844 paddd m4, m1
2845 mova m1, [ssim_c2]
2846 paddd m3, m1
2847 paddd m2, m1
2848 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
2849 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
2850 cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
2851 cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
2852 %endif
2853 mulps m4, m3
2854 mulps m0, m2
2855 divps m4, m0 ; ssim
2856
2857 cmp r2d, 4
2858 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
2859 neg r2
2860
2861 %ifdef PIC
2862 lea r3, [mask_ff + 16]
2863 %xdefine %%mask r3
2864 %else
2865 %xdefine %%mask mask_ff + 16
2866 %endif
2867 %if cpuflag(avx)
2868 andps m4, [%%mask + r2*4]
2869 %else
2870 movups m0, [%%mask + r2*4]
2871 andps m4, m0
2872 %endif
2873
2874 .skip:
2875 movhlps m0, m4
2876 addps m0, m4
2877 %if cpuflag(ssse3)
2878 movshdup m4, m0
2879 %else
2880 pshuflw m4, m0, q0032
2881 %endif
2882 addss m0, m4
2883 %if ARCH_X86_64 == 0
2884 movss r0m, m0
2885 fld dword r0m
2886 %endif
2887 RET
2888 %endmacro ; SSIM
2889
2890 INIT_XMM sse2
2891 SSIM
2892 INIT_XMM avx
2893 SSIM
2894
2895 ;-----------------------------------------------------------------
2896 ; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
2897 ;-----------------------------------------------------------------
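; Rough C reference for the scale1D_128to64 variants below (a sketch of the
; assumed semantics; the stride argument is unused):
;
;   static void scale1D_128to64_c(pixel *dst, const pixel *src, intptr_t stride)
;   {
;       (void)stride;
;       for (int x = 0; x < 64; x++)            // 128 input pixels -> 64 outputs
;           dst[x] = (pixel)((src[2 * x] + src[2 * x + 1] + 1) >> 1);
;   }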
2898 INIT_XMM ssse3
2899 cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
2900 %if HIGH_BIT_DEPTH
2901 mova m7, [deinterleave_word_shuf]
2902
2903 movu m0, [r1]
2904 palignr m1, m0, 2
2905 movu m2, [r1 + 16]
2906 palignr m3, m2, 2
2907 movu m4, [r1 + 32]
2908 palignr m5, m4, 2
2909 movu m6, [r1 + 48]
2910 pavgw m0, m1
2911 palignr m1, m6, 2
2912 pavgw m2, m3
2913 pavgw m4, m5
2914 pavgw m6, m1
2915 pshufb m0, m0, m7
2916 pshufb m2, m2, m7
2917 pshufb m4, m4, m7
2918 pshufb m6, m6, m7
2919 punpcklqdq m0, m2
2920 movu [r0], m0
2921 punpcklqdq m4, m6
2922 movu [r0 + 16], m4
2923
2924
2925
2926 movu m0, [r1 + 64]
2927 palignr m1, m0, 2
2928 movu m2, [r1 + 80]
2929 palignr m3, m2, 2
2930 movu m4, [r1 + 96]
2931 palignr m5, m4, 2
2932 movu m6, [r1 + 112]
2933 pavgw m0, m1
2934 palignr m1, m6, 2
2935 pavgw m2, m3
2936 pavgw m4, m5
2937 pavgw m6, m1
2938 pshufb m0, m0, m7
2939 pshufb m2, m2, m7
2940 pshufb m4, m4, m7
2941 pshufb m6, m6, m7
2942 punpcklqdq m0, m2
2943 movu [r0 + 32], m0
2944 punpcklqdq m4, m6
2945 movu [r0 + 48], m4
2946
2947 movu m0, [r1 + 128]
2948 palignr m1, m0, 2
2949 movu m2, [r1 + 144]
2950 palignr m3, m2, 2
2951 movu m4, [r1 + 160]
2952 palignr m5, m4, 2
2953 movu m6, [r1 + 176]
2954 pavgw m0, m1
2955 palignr m1, m6, 2
2956 pavgw m2, m3
2957 pavgw m4, m5
2958 pavgw m6, m1
2959 pshufb m0, m0, m7
2960 pshufb m2, m2, m7
2961 pshufb m4, m4, m7
2962 pshufb m6, m6, m7
2963
2964 punpcklqdq m0, m2
2965 movu [r0 + 64], m0
2966 punpcklqdq m4, m6
2967 movu [r0 + 80], m4
2968
2969 movu m0, [r1 + 192]
2970 palignr m1, m0, 2
2971 movu m2, [r1 + 208]
2972 palignr m3, m2, 2
2973 movu m4, [r1 + 224]
2974 palignr m5, m4, 2
2975 movu m6, [r1 + 240]
2976 pavgw m0, m1
2977 palignr m1, m6, 2
2978 pavgw m2, m3
2979 pavgw m4, m5
2980 pavgw m6, m1
2981 pshufb m0, m0, m7
2982 pshufb m2, m2, m7
2983 pshufb m4, m4, m7
2984 pshufb m6, m6, m7
2985
2986 punpcklqdq m0, m2
2987 movu [r0 + 96], m0
2988 punpcklqdq m4, m6
2989 movu [r0 + 112], m4
2990
2991 %else
2992 mova m7, [deinterleave_shuf]
2993
2994 movu m0, [r1]
2995 palignr m1, m0, 1
2996 movu m2, [r1 + 16]
2997 palignr m3, m2, 1
2998 movu m4, [r1 + 32]
2999 palignr m5, m4, 1
3000 movu m6, [r1 + 48]
3001
3002 pavgb m0, m1
3003
3004 palignr m1, m6, 1
3005
3006 pavgb m2, m3
3007 pavgb m4, m5
3008 pavgb m6, m1
3009
3010 pshufb m0, m0, m7
3011 pshufb m2, m2, m7
3012 pshufb m4, m4, m7
3013 pshufb m6, m6, m7
3014
3015 punpcklqdq m0, m2
3016 movu [r0], m0
3017 punpcklqdq m4, m6
3018 movu [r0 + 16], m4
3019
3020 movu m0, [r1 + 64]
3021 palignr m1, m0, 1
3022 movu m2, [r1 + 80]
3023 palignr m3, m2, 1
3024 movu m4, [r1 + 96]
3025 palignr m5, m4, 1
3026 movu m6, [r1 + 112]
3027
3028 pavgb m0, m1
3029
3030 palignr m1, m6, 1
3031
3032 pavgb m2, m3
3033 pavgb m4, m5
3034 pavgb m6, m1
3035
3036 pshufb m0, m0, m7
3037 pshufb m2, m2, m7
3038 pshufb m4, m4, m7
3039 pshufb m6, m6, m7
3040
3041 punpcklqdq m0, m2
3042 movu [r0 + 32], m0
3043 punpcklqdq m4, m6
3044 movu [r0 + 48], m4
3045 %endif
3046 RET
3047
3048 %if HIGH_BIT_DEPTH == 1
3049 INIT_YMM avx2
3050 cglobal scale1D_128to64, 2, 2, 3
3051 pxor m2, m2
3052
3053 movu m0, [r1]
3054 movu m1, [r1 + 32]
3055 phaddw m0, m1
3056 pavgw m0, m2
3057 vpermq m0, m0, 0xD8
3058 movu [r0], m0
3059
3060 movu m0, [r1 + 64]
3061 movu m1, [r1 + 96]
3062 phaddw m0, m1
3063 pavgw m0, m2
3064 vpermq m0, m0, 0xD8
3065 movu [r0 + 32], m0
3066
3067 movu m0, [r1 + 128]
3068 movu m1, [r1 + 160]
3069 phaddw m0, m1
3070 pavgw m0, m2
3071 vpermq m0, m0, 0xD8
3072 movu [r0 + 64], m0
3073
3074 movu m0, [r1 + 192]
3075 movu m1, [r1 + 224]
3076 phaddw m0, m1
3077 pavgw m0, m2
3078 vpermq m0, m0, 0xD8
3079 movu [r0 + 96], m0
3080 RET
3081 %else ; HIGH_BIT_DEPTH == 0
3082 INIT_YMM avx2
3083 cglobal scale1D_128to64, 2, 2, 4
3084 pxor m2, m2
3085 mova m3, [pb_1]
3086
3087 movu m0, [r1]
3088 pmaddubsw m0, m0, m3
3089 pavgw m0, m2
3090 movu m1, [r1 + 32]
3091 pmaddubsw m1, m1, m3
3092 pavgw m1, m2
3093 packuswb m0, m1
3094 vpermq m0, m0, 0xD8
3095 movu [r0], m0
3096
3097 movu m0, [r1 + 64]
3098 pmaddubsw m0, m0, m3
3099 pavgw m0, m2
3100 movu m1, [r1 + 96]
3101 pmaddubsw m1, m1, m3
3102 pavgw m1, m2
3103 packuswb m0, m1
3104 vpermq m0, m0, 0xD8
3105 movu [r0 + 32], m0
3106 RET
3107 %endif
3108
3109 ;-----------------------------------------------------------------
3110 ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
3111 ;-----------------------------------------------------------------
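; The loops below reduce each 2x2 block of the 64x64 source to one pixel of the
; 32x32 destination. To stay within 8/16-bit lanes they use the pavg-based
; identity (per output pixel; a sketch, not the build code):
;   s   = (a + b + 1) >> 1;   t = (c + d + 1) >> 1;
;   out = ((s + t + 1) >> 1) - ((((a ^ b) | (c ^ d)) & (s ^ t)) & 1);
; which matches the straightforward C form:
;
;   static void scale2D_64to32_c(pixel *dst, const pixel *src, intptr_t stride)
;   {
;       for (int y = 0; y < 32; y++, src += 2 * stride, dst += 32)
;           for (int x = 0; x < 32; x++)
;               dst[x] = (pixel)((src[2 * x] + src[2 * x + 1] +
;                                 src[2 * x + stride] + src[2 * x + stride + 1] + 2) >> 2);
;   }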
3112 %if HIGH_BIT_DEPTH
3113 INIT_XMM ssse3
3114 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
3115 mov r3d, 32
3116 mova m7, [deinterleave_word_shuf]
3117 add r2, r2
3118 .loop:
3119 movu m0, [r1] ;i
3120 psrld m1, m0, 16 ;j
3121 movu m2, [r1 + r2] ;k
3122 psrld m3, m2, 16 ;l
3123 movu m4, m0
3124 movu m5, m2
3125 pxor m4, m1 ;i^j
3126 pxor m5, m3 ;k^l
3127 por m4, m5 ;ij|kl
3128 pavgw m0, m1 ;s
3129 pavgw m2, m3 ;t
3130 movu m5, m0
3131 pavgw m0, m2 ;(s+t+1)/2
3132 pxor m5, m2 ;s^t
3133 pand m4, m5 ;(ij|kl)&st
3134 pand m4, [hmulw_16p]
3135 psubw m0, m4 ;Result
3136 movu m1, [r1 + 16] ;i
3137 psrld m2, m1, 16 ;j
3138 movu m3, [r1 + r2 + 16] ;k
3139 psrld m4, m3, 16 ;l
3140 movu m5, m1
3141 movu m6, m3
3142 pxor m5, m2 ;i^j
3143 pxor m6, m4 ;k^l
3144 por m5, m6 ;ij|kl
3145 pavgw m1, m2 ;s
3146 pavgw m3, m4 ;t
3147 movu m6, m1
3148 pavgw m1, m3 ;(s+t+1)/2
3149 pxor m6, m3 ;s^t
3150 pand m5, m6 ;(ij|kl)&st
3151 pand m5, [hmulw_16p]
3152 psubw m1, m5 ;Result
3153 pshufb m0, m7
3154 pshufb m1, m7
3155
3156 punpcklqdq m0, m1
3157 movu [r0], m0
3158
3159 movu m0, [r1 + 32] ;i
3160 psrld m1, m0, 16 ;j
3161 movu m2, [r1 + r2 + 32] ;k
3162 psrld m3, m2, 16 ;l
3163 movu m4, m0
3164 movu m5, m2
3165 pxor m4, m1 ;i^j
3166 pxor m5, m3 ;k^l
3167 por m4, m5 ;ij|kl
3168 pavgw m0, m1 ;s
3169 pavgw m2, m3 ;t
3170 movu m5, m0
3171 pavgw m0, m2 ;(s+t+1)/2
3172 pxor m5, m2 ;s^t
3173 pand m4, m5 ;(ij|kl)&st
3174 pand m4, [hmulw_16p]
3175 psubw m0, m4 ;Result
3176 movu m1, [r1 + 48] ;i
3177 psrld m2, m1, 16 ;j
3178 movu m3, [r1 + r2 + 48] ;k
3179 psrld m4, m3, 16 ;l
3180 movu m5, m1
3181 movu m6, m3
3182 pxor m5, m2 ;i^j
3183 pxor m6, m4 ;k^l
3184 por m5, m6 ;ij|kl
3185 pavgw m1, m2 ;s
3186 pavgw m3, m4 ;t
3187 movu m6, m1
3188 pavgw m1, m3 ;(s+t+1)/2
3189 pxor m6, m3 ;s^t
3190 pand m5, m6 ;(ij|kl)&st
3191 pand m5, [hmulw_16p]
3192 psubw m1, m5 ;Result
3193 pshufb m0, m7
3194 pshufb m1, m7
3195
3196 punpcklqdq m0, m1
3197 movu [r0 + 16], m0
3198
3199 movu m0, [r1 + 64] ;i
3200 psrld m1, m0, 16 ;j
3201 movu m2, [r1 + r2 + 64] ;k
3202 psrld m3, m2, 16 ;l
3203 movu m4, m0
3204 movu m5, m2
3205 pxor m4, m1 ;i^j
3206 pxor m5, m3 ;k^l
3207 por m4, m5 ;ij|kl
3208 pavgw m0, m1 ;s
3209 pavgw m2, m3 ;t
3210 movu m5, m0
3211 pavgw m0, m2 ;(s+t+1)/2
3212 pxor m5, m2 ;s^t
3213 pand m4, m5 ;(ij|kl)&st
3214 pand m4, [hmulw_16p]
3215 psubw m0, m4 ;Result
3216 movu m1, [r1 + 80] ;i
3217 psrld m2, m1, 16 ;j
3218 movu m3, [r1 + r2 + 80] ;k
3219 psrld m4, m3, 16 ;l
3220 movu m5, m1
3221 movu m6, m3
3222 pxor m5, m2 ;i^j
3223 pxor m6, m4 ;k^l
3224 por m5, m6 ;ij|kl
3225 pavgw m1, m2 ;s
3226 pavgw m3, m4 ;t
3227 movu m6, m1
3228 pavgw m1, m3 ;(s+t+1)/2
3229 pxor m6, m3 ;s^t
3230 pand m5, m6 ;(ij|kl)&st
3231 pand m5, [hmulw_16p]
3232 psubw m1, m5 ;Result
3233 pshufb m0, m7
3234 pshufb m1, m7
3235
3236 punpcklqdq m0, m1
3237 movu [r0 + 32], m0
3238
3239 movu m0, [r1 + 96] ;i
3240 psrld m1, m0, 16 ;j
3241 movu m2, [r1 + r2 + 96] ;k
3242 psrld m3, m2, 16 ;l
3243 movu m4, m0
3244 movu m5, m2
3245 pxor m4, m1 ;i^j
3246 pxor m5, m3 ;k^l
3247 por m4, m5 ;ij|kl
3248 pavgw m0, m1 ;s
3249 pavgw m2, m3 ;t
3250 movu m5, m0
3251 pavgw m0, m2 ;(s+t+1)/2
3252 pxor m5, m2 ;s^t
3253 pand m4, m5 ;(ij|kl)&st
3254 pand m4, [hmulw_16p]
3255 psubw m0, m4 ;Result
3256 movu m1, [r1 + 112] ;i
3257 psrld m2, m1, 16 ;j
3258 movu m3, [r1 + r2 + 112] ;k
3259 psrld m4, m3, 16 ;l
3260 movu m5, m1
3261 movu m6, m3
3262 pxor m5, m2 ;i^j
3263 pxor m6, m4 ;k^l
3264 por m5, m6 ;ij|kl
3265 pavgw m1, m2 ;s
3266 pavgw m3, m4 ;t
3267 movu m6, m1
3268 pavgw m1, m3 ;(s+t+1)/2
3269 pxor m6, m3 ;s^t
3270 pand m5, m6 ;(ij|kl)&st
3271 pand m5, [hmulw_16p]
3272 psubw m1, m5 ;Result
3273 pshufb m0, m7
3274 pshufb m1, m7
3275
3276 punpcklqdq m0, m1
3277 movu [r0 + 48], m0
3278 lea r0, [r0 + 64]
3279 lea r1, [r1 + 2 * r2]
3280 dec r3d
3281 jnz .loop
3282 RET
3283 %else
3284
3285 INIT_XMM ssse3
3286 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
3287 mov r3d, 32
3288 mova m7, [deinterleave_shuf]
3289 .loop:
3290
3291 movu m0, [r1] ;i
3292 psrlw m1, m0, 8 ;j
3293 movu m2, [r1 + r2] ;k
3294 psrlw m3, m2, 8 ;l
3295 movu m4, m0
3296 movu m5, m2
3297
3298 pxor m4, m1 ;i^j
3299 pxor m5, m3 ;k^l
3300 por m4, m5 ;ij|kl
3301
3302 pavgb m0, m1 ;s
3303 pavgb m2, m3 ;t
3304 movu m5, m0
3305 pavgb m0, m2 ;(s+t+1)/2
3306 pxor m5, m2 ;s^t
3307 pand m4, m5 ;(ij|kl)&st
3308 pand m4, [hmul_16p]
3309 psubb m0, m4 ;Result
3310
3311 movu m1, [r1 + 16] ;i
3312 psrlw m2, m1, 8 ;j
3313 movu m3, [r1 + r2 + 16] ;k
3314 psrlw m4, m3, 8 ;l
3315 movu m5, m1
3316 movu m6, m3
3317
3318 pxor m5, m2 ;i^j
3319 pxor m6, m4 ;k^l
3320 por m5, m6 ;ij|kl
3321
3322 pavgb m1, m2 ;s
3323 pavgb m3, m4 ;t
3324 movu m6, m1
3325 pavgb m1, m3 ;(s+t+1)/2
3326 pxor m6, m3 ;s^t
3327 pand m5, m6 ;(ij|kl)&st
3328 pand m5, [hmul_16p]
3329 psubb m1, m5 ;Result
3330
3331 pshufb m0, m0, m7
3332 pshufb m1, m1, m7
3333
3334 punpcklqdq m0, m1
3335 movu [r0], m0
3336
3337 movu m0, [r1 + 32] ;i
3338 psrlw m1, m0, 8 ;j
3339 movu m2, [r1 + r2 + 32] ;k
3340 psrlw m3, m2, 8 ;l
3341 movu m4, m0
3342 movu m5, m2
3343
3344 pxor m4, m1 ;i^j
3345 pxor m5, m3 ;k^l
3346 por m4, m5 ;ij|kl
3347
3348 pavgb m0, m1 ;s
3349 pavgb m2, m3 ;t
3350 movu m5, m0
3351 pavgb m0, m2 ;(s+t+1)/2
3352 pxor m5, m2 ;s^t
3353 pand m4, m5 ;(ij|kl)&st
3354 pand m4, [hmul_16p]
3355 psubb m0, m4 ;Result
3356
3357 movu m1, [r1 + 48] ;i
3358 psrlw m2, m1, 8 ;j
3359 movu m3, [r1 + r2 + 48] ;k
3360 psrlw m4, m3, 8 ;l
3361 movu m5, m1
3362 movu m6, m3
3363
3364 pxor m5, m2 ;i^j
3365 pxor m6, m4 ;k^l
3366 por m5, m6 ;ij|kl
3367
3368 pavgb m1, m2 ;s
3369 pavgb m3, m4 ;t
3370 movu m6, m1
3371 pavgb m1, m3 ;(s+t+1)/2
3372 pxor m6, m3 ;s^t
3373 pand m5, m6 ;(ij|kl)&st
3374 pand m5, [hmul_16p]
3375 psubb m1, m5 ;Result
3376
3377 pshufb m0, m0, m7
3378 pshufb m1, m1, m7
3379
3380 punpcklqdq m0, m1
3381 movu [r0 + 16], m0
3382
3383 lea r0, [r0 + 32]
3384 lea r1, [r1 + 2 * r2]
3385 dec r3d
3386 jnz .loop
3387 RET
3388 %endif
3389
3390
3391 ;-----------------------------------------------------------------------------
3392 ; void pixel_sub_ps_4x4(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3393 ;-----------------------------------------------------------------------------
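; Rough C reference for the whole pixel_sub_ps_WxH family in this file (a
; sketch of the assumed semantics; bx/by stand in for the block width/height):
;
;   static void pixel_sub_ps_c(int16_t *dst, intptr_t dstride,
;                              const pixel *src0, const pixel *src1,
;                              intptr_t sstride0, intptr_t sstride1,
;                              int bx, int by)
;   {
;       for (int y = 0; y < by; y++)
;       {
;           for (int x = 0; x < bx; x++)
;               dst[x] = (int16_t)(src0[x] - src1[x]);
;           dst += dstride; src0 += sstride0; src1 += sstride1;
;       }
;   }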
3394 %if HIGH_BIT_DEPTH
3395 INIT_XMM sse2
3396 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3397 add r4, r4
3398 add r5, r5
3399 add r1, r1
3400 movh m0, [r2]
3401 movh m2, [r2 + r4]
3402 movh m1, [r3]
3403 movh m3, [r3 + r5]
3404 lea r2, [r2 + r4 * 2]
3405 lea r3, [r3 + r5 * 2]
3406 movh m4, [r2]
3407 movh m6, [r2 + r4]
3408 movh m5, [r3]
3409 movh m7, [r3 + r5]
3410
3411 psubw m0, m1
3412 psubw m2, m3
3413 psubw m4, m5
3414 psubw m6, m7
3415
3416 movh [r0], m0
3417 movh [r0 + r1], m2
3418 lea r0, [r0 + r1 * 2]
3419 movh [r0], m4
3420 movh [r0 + r1], m6
3421
3422 RET
3423 %else
3424 INIT_XMM sse4
3425 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3426 add r1, r1
3427 movd m0, [r2]
3428 movd m2, [r2 + r4]
3429 movd m1, [r3]
3430 movd m3, [r3 + r5]
3431 lea r2, [r2 + r4 * 2]
3432 lea r3, [r3 + r5 * 2]
3433 movd m4, [r2]
3434 movd m6, [r2 + r4]
3435 movd m5, [r3]
3436 movd m7, [r3 + r5]
3437 punpckldq m0, m2
3438 punpckldq m1, m3
3439 punpckldq m4, m6
3440 punpckldq m5, m7
3441 pmovzxbw m0, m0
3442 pmovzxbw m1, m1
3443 pmovzxbw m4, m4
3444 pmovzxbw m5, m5
3445
3446 psubw m0, m1
3447 psubw m4, m5
3448
3449 movh [r0], m0
3450 movhps [r0 + r1], m0
3451 movh [r0 + r1 * 2], m4
3452 lea r0, [r0 + r1 * 2]
3453 movhps [r0 + r1], m4
3454
3455 RET
3456 %endif
3457
3458
3459 ;-----------------------------------------------------------------------------
3460 ; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3461 ;-----------------------------------------------------------------------------
3462 %macro PIXELSUB_PS_W4_H4 2
3463 %if HIGH_BIT_DEPTH
3464 cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3465 mov r6d, %2/4
3466 add r4, r4
3467 add r5, r5
3468 add r1, r1
3469 .loop:
3470 movh m0, [r2]
3471 movh m2, [r2 + r4]
3472 movh m1, [r3]
3473 movh m3, [r3 + r5]
3474 lea r2, [r2 + r4 * 2]
3475 lea r3, [r3 + r5 * 2]
3476 movh m4, [r2]
3477 movh m6, [r2 + r4]
3478 movh m5, [r3]
3479 movh m7, [r3 + r5]
3480 dec r6d
3481 lea r2, [r2 + r4 * 2]
3482 lea r3, [r3 + r5 * 2]
3483
3484 psubw m0, m1
3485 psubw m2, m3
3486 psubw m4, m5
3487 psubw m6, m7
3488
3489 movh [r0], m0
3490 movh [r0 + r1], m2
3491 movh [r0 + r1 * 2], m4
3492 lea r0, [r0 + r1 * 2]
3493 movh [r0 + r1], m6
3494 lea r0, [r0 + r1 * 2]
3495
3496 jnz .loop
3497 RET
3498 %else
3499 cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3500 mov r6d, %2/4
3501 add r1, r1
3502 .loop:
3503 movd m0, [r2]
3504 movd m2, [r2 + r4]
3505 movd m1, [r3]
3506 movd m3, [r3 + r5]
3507 lea r2, [r2 + r4 * 2]
3508 lea r3, [r3 + r5 * 2]
3509 movd m4, [r2]
3510 movd m6, [r2 + r4]
3511 movd m5, [r3]
3512 movd m7, [r3 + r5]
3513 dec r6d
3514 lea r2, [r2 + r4 * 2]
3515 lea r3, [r3 + r5 * 2]
3516 punpckldq m0, m2
3517 punpckldq m1, m3
3518 punpckldq m4, m6
3519 punpckldq m5, m7
3520 pmovzxbw m0, m0
3521 pmovzxbw m1, m1
3522 pmovzxbw m4, m4
3523 pmovzxbw m5, m5
3524
3525 psubw m0, m1
3526 psubw m4, m5
3527
3528 movh [r0], m0
3529 movhps [r0 + r1], m0
3530 movh [r0 + r1 * 2], m4
3531 lea r0, [r0 + r1 * 2]
3532 movhps [r0 + r1], m4
3533 lea r0, [r0 + r1 * 2]
3534
3535 jnz .loop
3536 RET
3537 %endif
3538 %endmacro
3539
3540 %if HIGH_BIT_DEPTH
3541 INIT_XMM sse2
3542 PIXELSUB_PS_W4_H4 4, 8
3543 %else
3544 INIT_XMM sse4
3545 PIXELSUB_PS_W4_H4 4, 8
3546 %endif
3547
3548
3549 ;-----------------------------------------------------------------------------
3550 ; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3551 ;-----------------------------------------------------------------------------
3552 %macro PIXELSUB_PS_W8_H4 2
3553 %if HIGH_BIT_DEPTH
3554 cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3555 mov r6d, %2/4
3556 add r4, r4
3557 add r5, r5
3558 add r1, r1
3559 .loop:
3560 movu m0, [r2]
3561 movu m2, [r2 + r4]
3562 movu m1, [r3]
3563 movu m3, [r3 + r5]
3564 lea r2, [r2 + r4 * 2]
3565 lea r3, [r3 + r5 * 2]
3566 movu m4, [r2]
3567 movu m6, [r2 + r4]
3568 movu m5, [r3]
3569 movu m7, [r3 + r5]
3570 dec r6d
3571 lea r2, [r2 + r4 * 2]
3572 lea r3, [r3 + r5 * 2]
3573
3574 psubw m0, m1
3575 psubw m2, m3
3576 psubw m4, m5
3577 psubw m6, m7
3578
3579 movu [r0], m0
3580 movu [r0 + r1], m2
3581 movu [r0 + r1 * 2], m4
3582 lea r0, [r0 + r1 * 2]
3583 movu [r0 + r1], m6
3584 lea r0, [r0 + r1 * 2]
3585
3586 jnz .loop
3587 RET
3588 %else
3589 cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3590 mov r6d, %2/4
3591 add r1, r1
3592 .loop:
3593 movh m0, [r2]
3594 movh m2, [r2 + r4]
3595 movh m1, [r3]
3596 movh m3, [r3 + r5]
3597 lea r2, [r2 + r4 * 2]
3598 lea r3, [r3 + r5 * 2]
3599 movh m4, [r2]
3600 movh m6, [r2 + r4]
3601 movh m5, [r3]
3602 movh m7, [r3 + r5]
3603 dec r6d
3604 lea r2, [r2 + r4 * 2]
3605 lea r3, [r3 + r5 * 2]
3606 pmovzxbw m0, m0
3607 pmovzxbw m1, m1
3608 pmovzxbw m2, m2
3609 pmovzxbw m3, m3
3610 pmovzxbw m4, m4
3611 pmovzxbw m5, m5
3612 pmovzxbw m6, m6
3613 pmovzxbw m7, m7
3614
3615 psubw m0, m1
3616 psubw m2, m3
3617 psubw m4, m5
3618 psubw m6, m7
3619
3620 movu [r0], m0
3621 movu [r0 + r1], m2
3622 movu [r0 + r1 * 2], m4
3623 lea r0, [r0 + r1 * 2]
3624 movu [r0 + r1], m6
3625 lea r0, [r0 + r1 * 2]
3626
3627 jnz .loop
3628 RET
3629 %endif
3630 %endmacro
3631
3632 %if HIGH_BIT_DEPTH
3633 INIT_XMM sse2
3634 PIXELSUB_PS_W8_H4 8, 8
3635 PIXELSUB_PS_W8_H4 8, 16
3636 %else
3637 INIT_XMM sse4
3638 PIXELSUB_PS_W8_H4 8, 8
3639 PIXELSUB_PS_W8_H4 8, 16
3640 %endif
3641
3642
3643 ;-----------------------------------------------------------------------------
3644 ; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3645 ;-----------------------------------------------------------------------------
3646 %macro PIXELSUB_PS_W16_H4 2
3647 %if HIGH_BIT_DEPTH
3648 cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3649 mov r6d, %2/4
3650 add r4, r4
3651 add r5, r5
3652 add r1, r1
3653 .loop:
3654 movu m0, [r2]
3655 movu m2, [r2 + 16]
3656 movu m1, [r3]
3657 movu m3, [r3 + 16]
3658 movu m4, [r2 + r4]
3659 movu m6, [r2 + r4 + 16]
3660 movu m5, [r3 + r5]
3661 movu m7, [r3 + r5 + 16]
3662 dec r6d
3663 lea r2, [r2 + r4 * 2]
3664 lea r3, [r3 + r5 * 2]
3665
3666 psubw m0, m1
3667 psubw m2, m3
3668 psubw m4, m5
3669 psubw m6, m7
3670
3671 movu [r0], m0
3672 movu [r0 + 16], m2
3673 movu [r0 + r1], m4
3674 movu [r0 + r1 + 16], m6
3675
3676 movu m0, [r2]
3677 movu m2, [r2 + 16]
3678 movu m1, [r3]
3679 movu m3, [r3 + 16]
3680 movu m4, [r2 + r4]
3681 movu m5, [r3 + r5]
3682 movu m6, [r2 + r4 + 16]
3683 movu m7, [r3 + r5 + 16]
3684 lea r0, [r0 + r1 * 2]
3685 lea r2, [r2 + r4 * 2]
3686 lea r3, [r3 + r5 * 2]
3687
3688 psubw m0, m1
3689 psubw m2, m3
3690 psubw m4, m5
3691 psubw m6, m7
3692
3693 movu [r0], m0
3694 movu [r0 + 16], m2
3695 movu [r0 + r1], m4
3696 movu [r0 + r1 + 16], m6
3697 lea r0, [r0 + r1 * 2]
3698
3699 jnz .loop
3700 RET
3701 %else
3702 cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
3703 mov r6d, %2/4
3704 pxor m6, m6
3705 add r1, r1
3706 .loop:
3707 movu m1, [r2]
3708 movu m3, [r3]
3709 pmovzxbw m0, m1
3710 pmovzxbw m2, m3
3711 punpckhbw m1, m6
3712 punpckhbw m3, m6
3713
3714 psubw m0, m2
3715 psubw m1, m3
3716
3717 movu m5, [r2 + r4]
3718 movu m3, [r3 + r5]
3719 lea r2, [r2 + r4 * 2]
3720 lea r3, [r3 + r5 * 2]
3721 pmovzxbw m4, m5
3722 pmovzxbw m2, m3
3723 punpckhbw m5, m6
3724 punpckhbw m3, m6
3725
3726 psubw m4, m2
3727 psubw m5, m3
3728
3729 movu [r0], m0
3730 movu [r0 + 16], m1
3731 movu [r0 + r1], m4
3732 movu [r0 + r1 + 16], m5
3733
3734 movu m1, [r2]
3735 movu m3, [r3]
3736 pmovzxbw m0, m1
3737 pmovzxbw m2, m3
3738 punpckhbw m1, m6
3739 punpckhbw m3, m6
3740
3741 psubw m0, m2
3742 psubw m1, m3
3743
3744 movu m5, [r2 + r4]
3745 movu m3, [r3 + r5]
3746 dec r6d
3747 lea r2, [r2 + r4 * 2]
3748 lea r3, [r3 + r5 * 2]
3749 lea r0, [r0 + r1 * 2]
3750 pmovzxbw m4, m5
3751 pmovzxbw m2, m3
3752 punpckhbw m5, m6
3753 punpckhbw m3, m6
3754
3755 psubw m4, m2
3756 psubw m5, m3
3757
3758 movu [r0], m0
3759 movu [r0 + 16], m1
3760 movu [r0 + r1], m4
3761 movu [r0 + r1 + 16], m5
3762 lea r0, [r0 + r1 * 2]
3763
3764 jnz .loop
3765 RET
3766 %endif
3767 %endmacro
3768
3769 %if HIGH_BIT_DEPTH
3770 INIT_XMM sse2
3771 PIXELSUB_PS_W16_H4 16, 16
3772 PIXELSUB_PS_W16_H4 16, 32
3773 %else
3774 INIT_XMM sse4
3775 PIXELSUB_PS_W16_H4 16, 16
3776 PIXELSUB_PS_W16_H4 16, 32
3777 %endif
3778
3779
3780 ;-----------------------------------------------------------------------------
3781 ; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3782 ;-----------------------------------------------------------------------------
3783 %macro PIXELSUB_PS_W32_H2 2
3784 %if HIGH_BIT_DEPTH
3785 cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3786 mov r6d, %2/2
3787 add r4, r4
3788 add r5, r5
3789 add r1, r1
3790 .loop:
3791 movu m0, [r2]
3792 movu m2, [r2 + 16]
3793 movu m4, [r2 + 32]
3794 movu m6, [r2 + 48]
3795 movu m1, [r3]
3796 movu m3, [r3 + 16]
3797 movu m5, [r3 + 32]
3798 movu m7, [r3 + 48]
3799 dec r6d
3800
3801 psubw m0, m1
3802 psubw m2, m3
3803 psubw m4, m5
3804 psubw m6, m7
3805
3806 movu [r0], m0
3807 movu [r0 + 16], m2
3808 movu [r0 + 32], m4
3809 movu [r0 + 48], m6
3810
3811 movu m0, [r2 + r4]
3812 movu m2, [r2 + r4 + 16]
3813 movu m4, [r2 + r4 + 32]
3814 movu m6, [r2 + r4 + 48]
3815 movu m1, [r3 + r5]
3816 movu m3, [r3 + r5 + 16]
3817 movu m5, [r3 + r5 + 32]
3818 movu m7, [r3 + r5 + 48]
3819 lea r2, [r2 + r4 * 2]
3820 lea r3, [r3 + r5 * 2]
3821
3822 psubw m0, m1
3823 psubw m2, m3
3824 psubw m4, m5
3825 psubw m6, m7
3826
3827 movu [r0 + r1], m0
3828 movu [r0 + r1 + 16], m2
3829 movu [r0 + r1 + 32], m4
3830 movu [r0 + r1 + 48], m6
3831 lea r0, [r0 + r1 * 2]
3832
3833 jnz .loop
3834 RET
3835 %else
3836 cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3837 mov r6d, %2/2
3838 add r1, r1
3839 .loop:
3840 movh m0, [r2]
3841 movh m1, [r2 + 8]
3842 movh m2, [r2 + 16]
3843 movh m6, [r2 + 24]
3844 movh m3, [r3]
3845 movh m4, [r3 + 8]
3846 movh m5, [r3 + 16]
3847 movh m7, [r3 + 24]
3848 dec r6d
3849 pmovzxbw m0, m0
3850 pmovzxbw m1, m1
3851 pmovzxbw m2, m2
3852 pmovzxbw m6, m6
3853 pmovzxbw m3, m3
3854 pmovzxbw m4, m4
3855 pmovzxbw m5, m5
3856 pmovzxbw m7, m7
3857
3858 psubw m0, m3
3859 psubw m1, m4
3860 psubw m2, m5
3861 psubw m6, m7
3862
3863 movu [r0], m0
3864 movu [r0 + 16], m1
3865 movu [r0 + 32], m2
3866 movu [r0 + 48], m6
3867
3868 movh m0, [r2 + r4]
3869 movh m1, [r2 + r4 + 8]
3870 movh m2, [r2 + r4 + 16]
3871 movh m6, [r2 + r4 + 24]
3872 movh m3, [r3 + r5]
3873 movh m4, [r3 + r5 + 8]
3874 movh m5, [r3 + r5 + 16]
3875 movh m7, [r3 + r5 + 24]
3876 lea r2, [r2 + r4 * 2]
3877 lea r3, [r3 + r5 * 2]
3878 pmovzxbw m0, m0
3879 pmovzxbw m1, m1
3880 pmovzxbw m2, m2
3881 pmovzxbw m6, m6
3882 pmovzxbw m3, m3
3883 pmovzxbw m4, m4
3884 pmovzxbw m5, m5
3885 pmovzxbw m7, m7
3886
3887 psubw m0, m3
3888 psubw m1, m4
3889 psubw m2, m5
3890 psubw m6, m7
3891
3892 movu [r0 + r1], m0
3893 movu [r0 + r1 + 16], m1
3894 movu [r0 + r1 + 32], m2
3895 movu [r0 + r1 + 48], m6
3896 lea r0, [r0 + r1 * 2]
3897
3898 jnz .loop
3899 RET
3900 %endif
3901 %endmacro
3902
3903 %if HIGH_BIT_DEPTH
3904 INIT_XMM sse2
3905 PIXELSUB_PS_W32_H2 32, 32
3906 PIXELSUB_PS_W32_H2 32, 64
3907 %else
3908 INIT_XMM sse4
3909 PIXELSUB_PS_W32_H2 32, 32
3910 PIXELSUB_PS_W32_H2 32, 64
3911 %endif
3912
3913
3914 ;-----------------------------------------------------------------------------
3915 ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3916 ;-----------------------------------------------------------------------------
3917 %macro PIXELSUB_PS_W64_H2 2
3918 %if HIGH_BIT_DEPTH
3919 cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3920 mov r6d, %2/2
3921 add r4, r4
3922 add r5, r5
3923 add r1, r1
3924 .loop:
3925 movu m0, [r2]
3926 movu m2, [r2 + 16]
3927 movu m4, [r2 + 32]
3928 movu m6, [r2 + 48]
3929 movu m1, [r3]
3930 movu m3, [r3 + 16]
3931 movu m5, [r3 + 32]
3932 movu m7, [r3 + 48]
3933
3934 psubw m0, m1
3935 psubw m2, m3
3936 psubw m4, m5
3937 psubw m6, m7
3938
3939 movu [r0], m0
3940 movu [r0 + 16], m2
3941 movu [r0 + 32], m4
3942 movu [r0 + 48], m6
3943
3944 movu m0, [r2 + 64]
3945 movu m2, [r2 + 80]
3946 movu m4, [r2 + 96]
3947 movu m6, [r2 + 112]
3948 movu m1, [r3 + 64]
3949 movu m3, [r3 + 80]
3950 movu m5, [r3 + 96]
3951 movu m7, [r3 + 112]
3952
3953 psubw m0, m1
3954 psubw m2, m3
3955 psubw m4, m5
3956 psubw m6, m7
3957
3958 movu [r0 + 64], m0
3959 movu [r0 + 80], m2
3960 movu [r0 + 96], m4
3961 movu [r0 + 112], m6
3962
3963 movu m0, [r2 + r4]
3964 movu m2, [r2 + r4 + 16]
3965 movu m4, [r2 + r4 + 32]
3966 movu m6, [r2 + r4 + 48]
3967 movu m1, [r3 + r5]
3968 movu m3, [r3 + r5 + 16]
3969 movu m5, [r3 + r5 + 32]
3970 movu m7, [r3 + r5 + 48]
3971
3972 psubw m0, m1
3973 psubw m2, m3
3974 psubw m4, m5
3975 psubw m6, m7
3976
3977 movu [r0 + r1], m0
3978 movu [r0 + r1 + 16], m2
3979 movu [r0 + r1 + 32], m4
3980 movu [r0 + r1 + 48], m6
3981
3982 movu m0, [r2 + r4 + 64]
3983 movu m2, [r2 + r4 + 80]
3984 movu m4, [r2 + r4 + 96]
3985 movu m6, [r2 + r4 + 112]
3986 movu m1, [r3 + r5 + 64]
3987 movu m3, [r3 + r5 + 80]
3988 movu m5, [r3 + r5 + 96]
3989 movu m7, [r3 + r5 + 112]
3990 dec r6d
3991 lea r2, [r2 + r4 * 2]
3992 lea r3, [r3 + r5 * 2]
3993
3994 psubw m0, m1
3995 psubw m2, m3
3996 psubw m4, m5
3997 psubw m6, m7
3998
3999 movu [r0 + r1 + 64], m0
4000 movu [r0 + r1 + 80], m2
4001 movu [r0 + r1 + 96], m4
4002 movu [r0 + r1 + 112], m6
4003 lea r0, [r0 + r1 * 2]
4004
4005 jnz .loop
4006 RET
4007 %else
4008 cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4009 mov r6d, %2/2
4010 pxor m6, m6
4011 add r1, r1
4012 .loop:
4013 movu m1, [r2]
4014 movu m5, [r2 + 16]
4015 movu m3, [r3]
4016 movu m7, [r3 + 16]
4017
4018 pmovzxbw m0, m1
4019 pmovzxbw m4, m5
4020 pmovzxbw m2, m3
4021 punpckhbw m1, m6
4022 punpckhbw m3, m6
4023 punpckhbw m5, m6
4024
4025 psubw m0, m2
4026 psubw m1, m3
4027 pmovzxbw m2, m7
4028 punpckhbw m7, m6
4029 psubw m4, m2
4030 psubw m5, m7
4031
4032 movu m3, [r2 + 32]
4033 movu m7, [r3 + 32]
4034 pmovzxbw m2, m3
4035 punpckhbw m3, m6
4036
4037 movu [r0], m0
4038 movu [r0 + 16], m1
4039 movu [r0 + 32], m4
4040 movu [r0 + 48], m5
4041
4042 movu m1, [r2 + 48]
4043 movu m5, [r3 + 48]
4044 pmovzxbw m0, m1
4045 pmovzxbw m4, m7
4046 punpckhbw m1, m6
4047 punpckhbw m7, m6
4048
4049 psubw m2, m4
4050 psubw m3, m7
4051
4052 movu [r0 + 64], m2
4053 movu [r0 + 80], m3
4054
4055 movu m7, [r2 + r4]
4056 movu m3, [r3 + r5]
4057 pmovzxbw m2, m5
4058 pmovzxbw m4, m7
4059 punpckhbw m5, m6
4060 punpckhbw m7, m6
4061
4062 psubw m0, m2
4063 psubw m1, m5
4064
4065 movu [r0 + 96], m0
4066 movu [r0 + 112], m1
4067
4068 movu m2, [r2 + r4 + 16]
4069 movu m5, [r3 + r5 + 16]
4070 pmovzxbw m0, m3
4071 pmovzxbw m1, m2
4072 punpckhbw m3, m6
4073 punpckhbw m2, m6
4074
4075 psubw m4, m0
4076 psubw m7, m3
4077
4078 movu [r0 + r1], m4
4079 movu [r0 + r1 + 16], m7
4080
4081 movu m0, [r2 + r4 + 32]
4082 movu m3, [r3 + r5 + 32]
4083 dec r6d
4084 pmovzxbw m4, m5
4085 pmovzxbw m7, m0
4086 punpckhbw m5, m6
4087 punpckhbw m0, m6
4088
4089 psubw m1, m4
4090 psubw m2, m5
4091
4092 movu [r0 + r1 + 32], m1
4093 movu [r0 + r1 + 48], m2
4094
4095 movu m4, [r2 + r4 + 48]
4096 movu m5, [r3 + r5 + 48]
4097 lea r2, [r2 + r4 * 2]
4098 lea r3, [r3 + r5 * 2]
4099 pmovzxbw m1, m3
4100 pmovzxbw m2, m4
4101 punpckhbw m3, m6
4102 punpckhbw m4, m6
4103
4104 psubw m7, m1
4105 psubw m0, m3
4106
4107 movu [r0 + r1 + 64], m7
4108 movu [r0 + r1 + 80], m0
4109
4110 pmovzxbw m7, m5
4111 punpckhbw m5, m6
4112 psubw m2, m7
4113 psubw m4, m5
4114
4115 movu [r0 + r1 + 96], m2
4116 movu [r0 + r1 + 112], m4
4117 lea r0, [r0 + r1 * 2]
4118
4119 jnz .loop
4120 RET
4121 %endif
4122 %endmacro
4123
4124 %if HIGH_BIT_DEPTH
4125 INIT_XMM sse2
4126 PIXELSUB_PS_W64_H2 64, 64
4127 %else
4128 INIT_XMM sse4
4129 PIXELSUB_PS_W64_H2 64, 64
4130 %endif
4131
4132
4133 ;=============================================================================
4134 ; variance
4135 ;=============================================================================
4136
4137 %macro VAR_START 1
4138 pxor m5, m5 ; sum
4139 pxor m6, m6 ; sum squared
4140 %if HIGH_BIT_DEPTH == 0
4141 %if %1
4142 mova m7, [pw_00ff]
4143 %elif mmsize < 32
4144 pxor m7, m7 ; zero
4145 %endif
4146 %endif ; !HIGH_BIT_DEPTH
4147 %endmacro
4148
4149 %macro VAR_END 2
4150 %if HIGH_BIT_DEPTH
4151 %if mmsize == 8 && %1*%2 == 256
4152 HADDUW m5, m2
4153 %else
4154 %if %1 >= 32
4155 HADDW m5, m2
4156 movd m7, r4d
4157 paddd m5, m7
4158 %else
4159 HADDW m5, m2
4160 %endif
4161 %endif
4162 %else ; !HIGH_BIT_DEPTH
4163 %if %1 == 64
4164 HADDW m5, m2
4165 movd m7, r4d
4166 paddd m5, m7
4167 %else
4168 HADDW m5, m2
4169 %endif
4170 %endif ; HIGH_BIT_DEPTH
4171 HADDD m6, m1
4172 %if ARCH_X86_64
4173 punpckldq m5, m6
4174 movq rax, m5
4175 %else
4176 movd eax, m5
4177 movd edx, m6
4178 %endif
4179 RET
4180 %endmacro
4181
4182 %macro VAR_CORE 0
4183 paddw m5, m0
4184 paddw m5, m3
4185 paddw m5, m1
4186 paddw m5, m4
4187 pmaddwd m0, m0
4188 pmaddwd m3, m3
4189 pmaddwd m1, m1
4190 pmaddwd m4, m4
4191 paddd m6, m0
4192 paddd m6, m3
4193 paddd m6, m1
4194 paddd m6, m4
4195 %endmacro
4196
4197 %macro VAR_2ROW 3
4198 mov r2d, %2
4199 .loop%3:
4200 %if HIGH_BIT_DEPTH
4201 movu m0, [r0]
4202 movu m1, [r0+mmsize]
4203 movu m3, [r0+%1]
4204 movu m4, [r0+%1+mmsize]
4205 %else ; !HIGH_BIT_DEPTH
4206 mova m0, [r0]
4207 punpckhbw m1, m0, m7
4208 mova m3, [r0+%1]
4209 mova m4, m3
4210 punpcklbw m0, m7
4211 %endif ; HIGH_BIT_DEPTH
4212 %ifidn %1, r1
4213 lea r0, [r0+%1*2]
4214 %else
4215 add r0, r1
4216 %endif
4217 %if HIGH_BIT_DEPTH == 0
4218 punpcklbw m3, m7
4219 punpckhbw m4, m7
4220 %endif ; !HIGH_BIT_DEPTH
4221 VAR_CORE
4222 dec r2d
4223 jg .loop%3
4224 %endmacro
4225
4226 ;-----------------------------------------------------------------------------
4227 ; int pixel_var_wxh( uint8_t *, intptr_t )
4228 ;-----------------------------------------------------------------------------
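; Each pixel_var_WxH below packs the pixel sum into the low 32 bits and the sum
; of squared pixels into the high 32 bits of the return value (eax/edx on
; 32-bit builds). A rough C sketch of that assumed semantics:
;
;   static uint64_t pixel_var_c(const pixel *pix, intptr_t stride, int w, int h)
;   {
;       uint32_t sum = 0, sqr = 0;
;       for (int y = 0; y < h; y++, pix += stride)
;           for (int x = 0; x < w; x++)
;           {
;               sum += pix[x];
;               sqr += pix[x] * pix[x];
;           }
;       return ((uint64_t)sqr << 32) + sum;
;   }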
4229 INIT_MMX mmx2
4230 cglobal pixel_var_16x16, 2,3
4231 FIX_STRIDES r1
4232 VAR_START 0
4233 VAR_2ROW 8*SIZEOF_PIXEL, 16, 1
4234 VAR_END 16, 16
4235
4236 cglobal pixel_var_8x8, 2,3
4237 FIX_STRIDES r1
4238 VAR_START 0
4239 VAR_2ROW r1, 4, 1
4240 VAR_END 8, 8
4241
4242 %if HIGH_BIT_DEPTH
4243 %macro VAR 0
4244 cglobal pixel_var_16x16, 2,3,8
4245 FIX_STRIDES r1
4246 VAR_START 0
4247 VAR_2ROW r1, 8, 1
4248 VAR_END 16, 16
4249
4250 cglobal pixel_var_8x8, 2,3,8
4251 lea r2, [r1*3]
4252 VAR_START 0
4253 movu m0, [r0]
4254 movu m1, [r0+r1*2]
4255 movu m3, [r0+r1*4]
4256 movu m4, [r0+r2*2]
4257 lea r0, [r0+r1*8]
4258 VAR_CORE
4259 movu m0, [r0]
4260 movu m1, [r0+r1*2]
4261 movu m3, [r0+r1*4]
4262 movu m4, [r0+r2*2]
4263 VAR_CORE
4264 VAR_END 8, 8
4265
4266 cglobal pixel_var_32x32, 2,6,8
4267 FIX_STRIDES r1
4268 mov r3, r0
4269 VAR_START 0
4270 VAR_2ROW r1, 8, 1
4271 HADDW m5, m2
4272 movd r4d, m5
4273 pxor m5, m5
4274 VAR_2ROW r1, 8, 2
4275 HADDW m5, m2
4276 movd r5d, m5
4277 add r4, r5
4278 pxor m5, m5
4279 lea r0, [r3 + 32]
4280 VAR_2ROW r1, 8, 3
4281 HADDW m5, m2
4282 movd r5d, m5
4283 add r4, r5
4284 pxor m5, m5
4285 VAR_2ROW r1, 8, 4
4286 VAR_END 32, 32
4287
4288 cglobal pixel_var_64x64, 2,6,8
4289 FIX_STRIDES r1
4290 mov r3, r0
4291 VAR_START 0
4292 VAR_2ROW r1, 8, 1
4293 HADDW m5, m2
4294 movd r4d, m5
4295 pxor m5, m5
4296 VAR_2ROW r1, 8, 2
4297 HADDW m5, m2
4298 movd r5d, m5
4299 add r4, r5
4300 pxor m5, m5
4301 VAR_2ROW r1, 8, 3
4302 HADDW m5, m2
4303 movd r5d, m5
4304 add r4, r5
4305 pxor m5, m5
4306 VAR_2ROW r1, 8, 4
4307 HADDW m5, m2
4308 movd r5d, m5
4309 add r4, r5
4310 pxor m5, m5
4311 lea r0, [r3 + 32]
4312 VAR_2ROW r1, 8, 5
4313 HADDW m5, m2
4314 movd r5d, m5
4315 add r4, r5
4316 pxor m5, m5
4317 VAR_2ROW r1, 8, 6
4318 HADDW m5, m2
4319 movd r5d, m5
4320 add r4, r5
4321 pxor m5, m5
4322 VAR_2ROW r1, 8, 7
4323 HADDW m5, m2
4324 movd r5d, m5
4325 add r4, r5
4326 pxor m5, m5
4327 VAR_2ROW r1, 8, 8
4328 HADDW m5, m2
4329 movd r5d, m5
4330 add r4, r5
4331 pxor m5, m5
4332 lea r0, [r3 + 64]
4333 VAR_2ROW r1, 8, 9
4334 HADDW m5, m2
4335 movd r5d, m5
4336 add r4, r5
4337 pxor m5, m5
4338 VAR_2ROW r1, 8, 10
4339 HADDW m5, m2
4340 movd r5d, m5
4341 add r4, r5
4342 pxor m5, m5
4343 VAR_2ROW r1, 8, 11
4344 HADDW m5, m2
4345 movd r5d, m5
4346 add r4, r5
4347 pxor m5, m5
4348 VAR_2ROW r1, 8, 12
4349 HADDW m5, m2
4350 movd r5d, m5
4351 add r4, r5
4352 pxor m5, m5
4353 lea r0, [r3 + 96]
4354 VAR_2ROW r1, 8, 13
4355 HADDW m5, m2
4356 movd r5d, m5
4357 add r4, r5
4358 pxor m5, m5
4359 VAR_2ROW r1, 8, 14
4360 HADDW m5, m2
4361 movd r5d, m5
4362 add r4, r5
4363 pxor m5, m5
4364 VAR_2ROW r1, 8, 15
4365 HADDW m5, m2
4366 movd r5d, m5
4367 add r4, r5
4368 pxor m5, m5
4369 VAR_2ROW r1, 8, 16
4370 VAR_END 64, 64
4371 %endmacro ; VAR
4372
4373 INIT_XMM sse2
4374 VAR
4375 INIT_XMM avx
4376 VAR
4377 INIT_XMM xop
4378 VAR
4379 %endif ; HIGH_BIT_DEPTH
4380
4381 %if HIGH_BIT_DEPTH == 0
4382 %macro VAR 0
4383 cglobal pixel_var_8x8, 2,3,8
4384 VAR_START 1
4385 lea r2, [r1 * 3]
4386 movh m0, [r0]
4387 movh m3, [r0 + r1]
4388 movhps m0, [r0 + r1 * 2]
4389 movhps m3, [r0 + r2]
4390 DEINTB 1, 0, 4, 3, 7
4391 lea r0, [r0 + r1 * 4]
4392 VAR_CORE
4393 movh m0, [r0]
4394 movh m3, [r0 + r1]
4395 movhps m0, [r0 + r1 * 2]
4396 movhps m3, [r0 + r2]
4397 DEINTB 1, 0, 4, 3, 7
4398 VAR_CORE
4399 VAR_END 8, 8
4400
4401 cglobal pixel_var_16x16_internal
4402 movu m0, [r0]
4403 movu m3, [r0 + r1]
4404 DEINTB 1, 0, 4, 3, 7
4405 VAR_CORE
4406 movu m0, [r0 + 2 * r1]
4407 movu m3, [r0 + r2]
4408 DEINTB 1, 0, 4, 3, 7
4409 lea r0, [r0 + r1 * 4]
4410 VAR_CORE
4411 movu m0, [r0]
4412 movu m3, [r0 + r1]
4413 DEINTB 1, 0, 4, 3, 7
4414 VAR_CORE
4415 movu m0, [r0 + 2 * r1]
4416 movu m3, [r0 + r2]
4417 DEINTB 1, 0, 4, 3, 7
4418 lea r0, [r0 + r1 * 4]
4419 VAR_CORE
4420 movu m0, [r0]
4421 movu m3, [r0 + r1]
4422 DEINTB 1, 0, 4, 3, 7
4423 VAR_CORE
4424 movu m0, [r0 + 2 * r1]
4425 movu m3, [r0 + r2]
4426 DEINTB 1, 0, 4, 3, 7
4427 lea r0, [r0 + r1 * 4]
4428 VAR_CORE
4429 movu m0, [r0]
4430 movu m3, [r0 + r1]
4431 DEINTB 1, 0, 4, 3, 7
4432 VAR_CORE
4433 movu m0, [r0 + 2 * r1]
4434 movu m3, [r0 + r2]
4435 DEINTB 1, 0, 4, 3, 7
4436 VAR_CORE
4437 ret
4438
4439 cglobal pixel_var_16x16, 2,3,8
4440 VAR_START 1
4441 lea r2, [r1 * 3]
4442 call pixel_var_16x16_internal
4443 VAR_END 16, 16
4444
4445 cglobal pixel_var_32x32, 2,4,8
4446 VAR_START 1
4447 lea r2, [r1 * 3]
4448 mov r3, r0
4449 call pixel_var_16x16_internal
4450 lea r0, [r0 + r1 * 4]
4451 call pixel_var_16x16_internal
4452 lea r0, [r3 + 16]
4453 call pixel_var_16x16_internal
4454 lea r0, [r0 + r1 * 4]
4455 call pixel_var_16x16_internal
4456 VAR_END 32, 32
4457
4458 cglobal pixel_var_64x64, 2,6,8
4459 VAR_START 1
4460 lea r2, [r1 * 3]
4461 mov r3, r0
4462 call pixel_var_16x16_internal
4463 lea r0, [r0 + r1 * 4]
4464 call pixel_var_16x16_internal
4465 lea r0, [r0 + r1 * 4]
4466 call pixel_var_16x16_internal
4467 lea r0, [r0 + r1 * 4]
4468 call pixel_var_16x16_internal
4469 HADDW m5, m2
4470 movd r4d, m5
4471 pxor m5, m5
4472 lea r0, [r3 + 16]
4473 call pixel_var_16x16_internal
4474 lea r0, [r0 + r1 * 4]
4475 call pixel_var_16x16_internal
4476 lea r0, [r0 + r1 * 4]
4477 call pixel_var_16x16_internal
4478 lea r0, [r0 + r1 * 4]
4479 call pixel_var_16x16_internal
4480 HADDW m5, m2
4481 movd r5d, m5
4482 add r4, r5
4483 pxor m5, m5
4484 lea r0, [r3 + 32]
4485 call pixel_var_16x16_internal
4486 lea r0, [r0 + r1 * 4]
4487 call pixel_var_16x16_internal
4488 lea r0, [r0 + r1 * 4]
4489 call pixel_var_16x16_internal
4490 lea r0, [r0 + r1 * 4]
4491 call pixel_var_16x16_internal
4492 lea r0, [r3 + 48]
4493 HADDW m5, m2
4494 movd r5d, m5
4495 add r4, r5
4496 pxor m5, m5
4497 call pixel_var_16x16_internal
4498 lea r0, [r0 + r1 * 4]
4499 call pixel_var_16x16_internal
4500 lea r0, [r0 + r1 * 4]
4501 call pixel_var_16x16_internal
4502 lea r0, [r0 + r1 * 4]
4503 call pixel_var_16x16_internal
4504 VAR_END 64, 64
4505 %endmacro ; VAR
4506
4507 INIT_XMM sse2
4508 VAR
4509 INIT_XMM avx
4510 VAR
4511 INIT_XMM xop
4512 VAR
4513
4514 INIT_YMM avx2
4515 cglobal pixel_var_16x16, 2,4,7
4516 VAR_START 0
4517 mov r2d, 4
4518 lea r3, [r1*3]
4519 .loop:
4520 pmovzxbw m0, [r0]
4521 pmovzxbw m3, [r0+r1]
4522 pmovzxbw m1, [r0+r1*2]
4523 pmovzxbw m4, [r0+r3]
4524 lea r0, [r0+r1*4]
4525 VAR_CORE
4526 dec r2d
4527 jg .loop
4528 vextracti128 xm0, m5, 1
4529 vextracti128 xm1, m6, 1
4530 paddw xm5, xm0
4531 paddd xm6, xm1
4532 HADDW xm5, xm2
4533 HADDD xm6, xm1
4534 %if ARCH_X86_64
4535 punpckldq xm5, xm6
4536 movq rax, xm5
4537 %else
4538 movd eax, xm5
4539 movd edx, xm6
4540 %endif
4541 RET
4542 %endif ; !HIGH_BIT_DEPTH
4543
4544 %macro VAR2_END 3
4545 HADDW %2, xm1
4546 movd r1d, %2
4547 imul r1d, r1d
4548 HADDD %3, xm1
4549 shr r1d, %1
4550 movd eax, %3
4551 movd [r4], %3
4552 sub eax, r1d ; sqr - (sum * sum >> shift)
4553 RET
4554 %endmacro
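; VAR2_END math sketch: for a block of N = 1 << %1 pixels, the macro stores the
; raw sum of squares to *r4 and returns sqr - (sum*sum >> %1), i.e. roughly
; sum((x - mean)^2), N times the population variance of the difference block
; (up to the truncation in the integer shift).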
4555