;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;*          Nabajit Deka <nabajit@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

%if BIT_DEPTH == 10
ssim_c1:    times 4 dd 6697.7856     ; .01*.01*1023*1023*64
ssim_c2:    times 4 dd 3797644.4352  ; .03*.03*1023*1023*64*63
pf_64:      times 4 dd 64.0
pf_128:     times 4 dd 128.0
%elif BIT_DEPTH == 9
ssim_c1:    times 4 dd 1671          ; .01*.01*511*511*64
ssim_c2:    times 4 dd 947556        ; .03*.03*511*511*64*63
%else ; 8-bit
ssim_c1:    times 4 dd 416           ; .01*.01*255*255*64
ssim_c2:    times 4 dd 235963        ; .03*.03*255*255*64*63
%endif
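; Arithmetic check for the constants above (illustrative, not from the
; original source): for 8-bit, (.01*255)^2 * 64 = 416.16 -> 416 and
; (.03*255)^2 * 64 * 63 = 235962.72 -> 235963; the 10-bit values
; (10.23^2 * 64 = 6697.7856, etc.) are stored as floats rather than rounded.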
mask_ff:                times 16 db 0xff
                        times 16 db 0
deinterleave_shuf:      db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
hmul_16p:               times 16 db 1
                        times 8 db 1, -1
hmulw_16p:              times 8 dw 1
                        times 4 dw 1, -1

trans8_shuf:            dd 0, 4, 1, 5, 2, 6, 3, 7

SECTION .text

cextern pw_1
cextern pb_1
cextern pw_00ff
cextern pw_2000
cextern pw_pixel_max
cextern pd_1
cextern pd_32767
cextern pd_n32768
;-----------------------------------------------------------------------------
; void calcRecons(pixel* pred, int16_t* residual, int16_t* reconqt, pixel* reconipred, int stride, int strideqt, int strideipred)
;-----------------------------------------------------------------------------
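; Reference sketch (hand-written C equivalent of the kernels below; names and
; Clip3 are illustrative, not the exact x265 prototypes). Each calcReconsN
; computes, for an N x N block:
;     for (int y = 0; y < N; y++) {
;         for (int x = 0; x < N; x++) {
;             pixel v = (pixel)Clip3(0, PIXEL_MAX, pred[x] + residual[x]);
;             reconqt[x]    = (int16_t)v;  // int16 copy for the quad-tree
;             reconipred[x] = v;           // pixel copy for intra prediction
;         }
;         pred += stride; residual += stride;
;         reconqt += strideqt; reconipred += strideipred;
;     }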
INIT_XMM sse2
%if HIGH_BIT_DEPTH
%if ARCH_X86_64 == 1
cglobal calcRecons4, 5,8,4
    %define t7b r7b
%else
cglobal calcRecons4, 5,7,4,0-1
    %define t7b byte [rsp]
%endif
    mov         r4d, r4m
    mov         r5d, r5m
    mov         r6d, r6m
    add         r4d, r4d
    add         r5d, r5d
    add         r6d, r6d

    pxor        m4, m4
    mova        m5, [pw_pixel_max]
    mov         t7b, 4/2
.loop:
    movh        m0, [r0]
    movh        m1, [r0 + r4]
    punpcklqdq  m0, m1
    movh        m2, [r1]
    movh        m3, [r1 + r4]
    punpcklqdq  m2, m3
    paddw       m0, m2
    CLIPW       m0, m4, m5

    ; store recipred[]
    movh        [r3], m0
    movhps      [r3 + r6], m0

    ; store recqt[]
    movh        [r2], m0
    movhps      [r2 + r5], m0

    lea         r0, [r0 + r4 * 2]
    lea         r1, [r1 + r4 * 2]
    lea         r2, [r2 + r5 * 2]
    lea         r3, [r3 + r6 * 2]

    dec         t7b
    jnz         .loop
    RET
%else ;HIGH_BIT_DEPTH

%if ARCH_X86_64 == 1
cglobal calcRecons4, 5,8,4
    %define t7b r7b
%else
cglobal calcRecons4, 5,7,4,0-1
    %define t7b byte [rsp]
%endif
    mov         r4d, r4m
    mov         r5d, r5m
    mov         r6d, r6m
    add         r5d, r5d

    pxor        m0, m0
    mov         t7b, 4/2
.loop:
    movd        m1, [r0]
    movd        m2, [r0 + r4]
    punpckldq   m1, m2
    punpcklbw   m1, m0
    movh        m2, [r1]
    movh        m3, [r1 + r4 * 2]
    punpcklqdq  m2, m3
    paddw       m1, m2
    packuswb    m1, m1

    ; store recon[] and recipred[]
    movd        [r3], m1
    pshufd      m2, m1, 1
    movd        [r3 + r6], m2

    ; store recqt[]
    punpcklbw   m1, m0
    movh        [r2], m1
    movhps      [r2 + r5], m1

    lea         r0, [r0 + r4 * 2]
    lea         r1, [r1 + r4 * 4]
    lea         r2, [r2 + r5 * 2]
    lea         r3, [r3 + r6 * 2]

    dec         t7b
    jnz         .loop
    RET
%endif ;HIGH_BIT_DEPTH


INIT_XMM sse2
%if ARCH_X86_64 == 1
cglobal calcRecons8, 5,8,4
    %define t7b r7b
%else
cglobal calcRecons8, 5,7,4,0-1
    %define t7b byte [rsp]
%endif

%if HIGH_BIT_DEPTH
    mov         r4d, r4m
    mov         r5d, r5m
    mov         r6d, r6m
    add         r4d, r4d
    add         r5d, r5d
    add         r6d, r6d

    pxor        m4, m4
    mova        m5, [pw_pixel_max]
    mov         t7b, 8/2
.loop:
    movu        m0, [r0]
    movu        m1, [r0 + r4]
    movu        m2, [r1]
    movu        m3, [r1 + r4]
    paddw       m0, m2
    paddw       m1, m3
    CLIPW2      m0, m1, m4, m5

    ; store recipred[]
    movu        [r3], m0
    movu        [r3 + r6], m1

    ; store recqt[]
    movu        [r2], m0
    movu        [r2 + r5], m1

    lea         r0, [r0 + r4 * 2]
    lea         r1, [r1 + r4 * 2]
    lea         r2, [r2 + r5 * 2]
    lea         r3, [r3 + r6 * 2]

    dec         t7b
    jnz         .loop
    RET
%else ;HIGH_BIT_DEPTH

    mov         r4d, r4m
    mov         r5d, r5m
    mov         r6d, r6m
    add         r5d, r5d

    pxor        m0, m0
    mov         t7b, 8/2
.loop:
    movh        m1, [r0]
    movh        m2, [r0 + r4]
    punpcklbw   m1, m0
    punpcklbw   m2, m0
    movu        m3, [r1]
    movu        m4, [r1 + r4 * 2]
    paddw       m1, m3
    paddw       m2, m4
    packuswb    m1, m2

    ; store recon[] and recipred[]
    movh        [r3], m1
    movhps      [r3 + r6], m1

    ; store recqt[]
    punpcklbw   m2, m1, m0
    punpckhbw   m1, m0
    movu        [r2], m2
    movu        [r2 + r5], m1

    lea         r0, [r0 + r4 * 2]
    lea         r1, [r1 + r4 * 4]
    lea         r2, [r2 + r5 * 2]
    lea         r3, [r3 + r6 * 2]

    dec         t7b
    jnz         .loop
    RET
%endif ;HIGH_BIT_DEPTH


%if HIGH_BIT_DEPTH
INIT_XMM sse2
%if ARCH_X86_64 == 1
cglobal calcRecons16, 5,8,4
    %define t7b r7b
%else
cglobal calcRecons16, 5,7,4,0-1
    %define t7b byte [rsp]
%endif

    mov         r4d, r4m
    mov         r5d, r5m
    mov         r6d, r6m
    add         r4d, r4d
    add         r5d, r5d
    add         r6d, r6d

    pxor        m4, m4
    mova        m5, [pw_pixel_max]
    mov         t7b, 16/2
.loop:
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    movu        m2, [r1]
    movu        m3, [r1 + 16]
    paddw       m0, m2
    paddw       m1, m3
    CLIPW2      m0, m1, m4, m5

    ; store recipred[]
    movu        [r3], m0
    movu        [r3 + 16], m1

    ; store recqt[]
    movu        [r2], m0
    movu        [r2 + 16], m1

    movu        m0, [r0 + r4]
    movu        m1, [r0 + r4 + 16]
    movu        m2, [r1 + r4]
    movu        m3, [r1 + r4 + 16]
    paddw       m0, m2
    paddw       m1, m3
    CLIPW2      m0, m1, m4, m5

    ; store recon[] and recipred[]
    movu        [r3 + r6], m0
    movu        [r3 + r6 + 16], m1

    ; store recqt[]
    movu        [r2 + r5], m0
    movu        [r2 + r5 + 16], m1

    lea         r0, [r0 + r4 * 2]
    lea         r1, [r1 + r4 * 2]
    lea         r2, [r2 + r5 * 2]
    lea         r3, [r3 + r6 * 2]

    dec         t7b
    jnz         .loop
    RET
%else ;HIGH_BIT_DEPTH

INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal calcRecons16, 5,8,4
    %define t7b r7b
%else
cglobal calcRecons16, 5,7,4,0-1
    %define t7b byte [rsp]
%endif

    mov         r4d, r4m
    mov         r5d, r5m
    mov         r6d, r6m
    add         r5d, r5d

    pxor        m0, m0
    mov         t7b, 16
.loop:
    movu        m2, [r0]
    pmovzxbw    m1, m2
    punpckhbw   m2, m0
    paddw       m1, [r1]
    paddw       m2, [r1 + 16]
    packuswb    m1, m2

    ; store recon[] and recipred[]
    movu        [r3], m1

    ; store recqt[]
    pmovzxbw    m2, m1
    punpckhbw   m1, m0
    movu        [r2], m2
    movu        [r2 + 16], m1

    add         r2, r5
    add         r3, r6
    add         r0, r4
    lea         r1, [r1 + r4 * 2]

    dec         t7b
    jnz         .loop
    RET
%endif ;HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH
INIT_XMM sse2
%if ARCH_X86_64 == 1
cglobal calcRecons32, 5,8,4
    %define t7b r7b
%else
cglobal calcRecons32, 5,7,4,0-1
    %define t7b byte [rsp]
%endif

    mov         r4d, r4m
    mov         r5d, r5m
    mov         r6d, r6m
    add         r4d, r4d
    add         r5d, r5d
    add         r6d, r6d

    pxor        m4, m4
    mova        m5, [pw_pixel_max]
    mov         t7b, 32/2
.loop:

    movu        m0, [r0]
    movu        m1, [r0 + 16]
    movu        m2, [r1]
    movu        m3, [r1 + 16]
    paddw       m0, m2
    paddw       m1, m3
    CLIPW2      m0, m1, m4, m5

    ; store recipred[]
    movu        [r3], m0
    movu        [r3 + 16], m1

    ; store recqt[]
    movu        [r2], m0
    movu        [r2 + 16], m1

    movu        m0, [r0 + 32]
    movu        m1, [r0 + 48]
    movu        m2, [r1 + 32]
    movu        m3, [r1 + 48]
    paddw       m0, m2
    paddw       m1, m3
    CLIPW2      m0, m1, m4, m5

    ; store recon[] and recipred[]
    movu        [r3 + 32], m0
    movu        [r3 + 48], m1

    ; store recqt[]
    movu        [r2 + 32], m0
    movu        [r2 + 48], m1
    add         r2, r5

    movu        m0, [r0 + r4]
    movu        m1, [r0 + r4 + 16]
    movu        m2, [r1 + r4]
    movu        m3, [r1 + r4 + 16]
    paddw       m0, m2
    paddw       m1, m3
    CLIPW2      m0, m1, m4, m5

    ; store recon[] and recipred[]
    movu        [r3 + r6], m0
    movu        [r3 + r6 + 16], m1

    ; store recqt[]
    movu        [r2], m0
    movu        [r2 + 16], m1

    movu        m0, [r0 + r4 + 32]
    movu        m1, [r0 + r4 + 48]
    movu        m2, [r1 + r4 + 32]
    movu        m3, [r1 + r4 + 48]
    paddw       m0, m2
    paddw       m1, m3
    CLIPW2      m0, m1, m4, m5

    ; store recon[] and recipred[]
    movu        [r3 + r6 + 32], m0
    movu        [r3 + r6 + 48], m1
    lea         r3, [r3 + r6 * 2]

    ; store recqt[]
    movu        [r2 + 32], m0
    movu        [r2 + 48], m1
    add         r2, r5

    lea         r0, [r0 + r4 * 2]
    lea         r1, [r1 + r4 * 2]

    dec         t7b
    jnz         .loop
    RET
%else ;HIGH_BIT_DEPTH
INIT_XMM sse4
%if ARCH_X86_64 == 1
cglobal calcRecons32, 5,8,4
    %define t7b r7b
%else
cglobal calcRecons32, 5,7,4,0-1
    %define t7b byte [rsp]
%endif

    mov         r4d, r4m
    mov         r5d, r5m
    mov         r6d, r6m
    add         r5d, r5d

    pxor        m0, m0
    mov         t7b, 32
.loop:
    movu        m2, [r0]
    movu        m4, [r0 + 16]
    pmovzxbw    m1, m2
    punpckhbw   m2, m0
    pmovzxbw    m3, m4
    punpckhbw   m4, m0

    paddw       m1, [r1 + 0 * 16]
    paddw       m2, [r1 + 1 * 16]
    packuswb    m1, m2

    paddw       m3, [r1 + 2 * 16]
    paddw       m4, [r1 + 3 * 16]
    packuswb    m3, m4

    ; store recon[] and recipred[]
    movu        [r3], m1
    movu        [r3 + 16], m3

    ; store recqt[]
    pmovzxbw    m2, m1
    punpckhbw   m1, m0
    movu        [r2 + 0 * 16], m2
    movu        [r2 + 1 * 16], m1
    pmovzxbw    m4, m3
    punpckhbw   m3, m0
    movu        [r2 + 2 * 16], m4
    movu        [r2 + 3 * 16], m3

    add         r2, r5
    add         r3, r6
    add         r0, r4
    lea         r1, [r1 + r4 * 2]

    dec         t7b
    jnz         .loop
    RET
%endif ;HIGH_BIT_DEPTH


;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
;-----------------------------------------------------------------------------
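; Reference sketch (hand-written C equivalent; illustrative). Each
; getResidualN computes an N x N difference block:
;     for (int y = 0; y < N; y++) {
;         for (int x = 0; x < N; x++)
;             residual[x] = (int16_t)(fenc[x] - pred[x]);
;         fenc += stride; pred += stride; residual += stride;
;     }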
INIT_XMM sse2
%if HIGH_BIT_DEPTH
cglobal getResidual4, 4,4,4
    add         r3, r3

    ; row 0-1
    movh        m0, [r0]
    movh        m1, [r0 + r3]
    movh        m2, [r1]
    movh        m3, [r1 + r3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m3
    psubw       m0, m2

    movh        [r2], m0
    movhps      [r2 + r3], m0
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 2]

    ; row 2-3
    movh        m0, [r0]
    movh        m1, [r0 + r3]
    movh        m2, [r1]
    movh        m3, [r1 + r3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m3
    psubw       m0, m2

    movh        [r2], m0
    movhps      [r2 + r3], m0
%else
cglobal getResidual4, 4,4,5
    pxor        m0, m0

    ; row 0-1
    movd        m1, [r0]
    movd        m2, [r0 + r3]
    movd        m3, [r1]
    movd        m4, [r1 + r3]
    punpckldq   m1, m2
    punpcklbw   m1, m0
    punpckldq   m3, m4
    punpcklbw   m3, m0
    psubw       m1, m3
    movh        [r2], m1
    movhps      [r2 + r3 * 2], m1
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]

    ; row 2-3
    movd        m1, [r0]
    movd        m2, [r0 + r3]
    movd        m3, [r1]
    movd        m4, [r1 + r3]
    punpckldq   m1, m2
    punpcklbw   m1, m0
    punpckldq   m3, m4
    punpcklbw   m3, m0
    psubw       m1, m3
    movh        [r2], m1
    movhps      [r2 + r3 * 2], m1
%endif
    RET


INIT_XMM sse2
%if HIGH_BIT_DEPTH
cglobal getResidual8, 4,4,4
    add         r3, r3

%assign x 0
%rep 8/2
    ; row 0-1
    movu        m1, [r0]
    movu        m2, [r0 + r3]
    movu        m3, [r1]
    movu        m4, [r1 + r3]
    psubw       m1, m3
    psubw       m2, m4
    movu        [r2], m1
    movu        [r2 + r3], m2
%assign x x+1
%if (x != 4)
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 2]
%endif
%endrep
%else
cglobal getResidual8, 4,4,5
    pxor        m0, m0

%assign x 0
%rep 8/2
    ; row 0-1
    movh        m1, [r0]
    movh        m2, [r0 + r3]
    movh        m3, [r1]
    movh        m4, [r1 + r3]
    punpcklbw   m1, m0
    punpcklbw   m2, m0
    punpcklbw   m3, m0
    punpcklbw   m4, m0
    psubw       m1, m3
    psubw       m2, m4
    movu        [r2], m1
    movu        [r2 + r3 * 2], m2
%assign x x+1
%if (x != 4)
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]
%endif
%endrep
%endif
    RET

%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal getResidual16, 4,5,6
    add         r3, r3
    mov         r4d, 16/4
.loop:
    ; row 0-1
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    movu        m2, [r0 + r3]
    movu        m3, [r0 + r3 + 16]
    movu        m4, [r1]
    movu        m5, [r1 + 16]
    psubw       m0, m4
    psubw       m1, m5
    movu        m4, [r1 + r3]
    movu        m5, [r1 + r3 + 16]
    psubw       m2, m4
    psubw       m3, m5
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]

    movu        [r2], m0
    movu        [r2 + 16], m1
    movu        [r2 + r3], m2
    movu        [r2 + r3 + 16], m3
    lea         r2, [r2 + r3 * 2]

    ; row 2-3
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    movu        m2, [r0 + r3]
    movu        m3, [r0 + r3 + 16]
    movu        m4, [r1]
    movu        m5, [r1 + 16]
    psubw       m0, m4
    psubw       m1, m5
    movu        m4, [r1 + r3]
    movu        m5, [r1 + r3 + 16]
    psubw       m2, m4
    psubw       m3, m5

    movu        [r2], m0
    movu        [r2 + 16], m1
    movu        [r2 + r3], m2
    movu        [r2 + r3 + 16], m3

    dec         r4d

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 2]

    jnz         .loop
%else

INIT_XMM sse4
cglobal getResidual16, 4,5,8
    mov         r4d, 16/4
    pxor        m0, m0
.loop:
    ; row 0-1
    movu        m1, [r0]
    movu        m2, [r0 + r3]
    movu        m3, [r1]
    movu        m4, [r1 + r3]
    pmovzxbw    m5, m1
    punpckhbw   m1, m0
    pmovzxbw    m6, m2
    punpckhbw   m2, m0
    pmovzxbw    m7, m3
    punpckhbw   m3, m0
    psubw       m5, m7
    psubw       m1, m3
    pmovzxbw    m7, m4
    punpckhbw   m4, m0
    psubw       m6, m7
    psubw       m2, m4

    movu        [r2], m5
    movu        [r2 + 16], m1
    movu        [r2 + r3 * 2], m6
    movu        [r2 + r3 * 2 + 16], m2

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]

    ; row 2-3
    movu        m1, [r0]
    movu        m2, [r0 + r3]
    movu        m3, [r1]
    movu        m4, [r1 + r3]
    pmovzxbw    m5, m1
    punpckhbw   m1, m0
    pmovzxbw    m6, m2
    punpckhbw   m2, m0
    pmovzxbw    m7, m3
    punpckhbw   m3, m0
    psubw       m5, m7
    psubw       m1, m3
    pmovzxbw    m7, m4
    punpckhbw   m4, m0
    psubw       m6, m7
    psubw       m2, m4

    movu        [r2], m5
    movu        [r2 + 16], m1
    movu        [r2 + r3 * 2], m6
    movu        [r2 + r3 * 2 + 16], m2

    dec         r4d

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]

    jnz         .loop
%endif

    RET

%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal getResidual32, 4,5,6
    add         r3, r3
    mov         r4d, 32/2
.loop:
    ; row 0
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    movu        m2, [r0 + 32]
    movu        m3, [r0 + 48]
    movu        m4, [r1]
    movu        m5, [r1 + 16]
    psubw       m0, m4
    psubw       m1, m5
    movu        m4, [r1 + 32]
    movu        m5, [r1 + 48]
    psubw       m2, m4
    psubw       m3, m5

    movu        [r2], m0
    movu        [r2 + 16], m1
    movu        [r2 + 32], m2
    movu        [r2 + 48], m3

    ; row 1
    movu        m0, [r0 + r3]
    movu        m1, [r0 + r3 + 16]
    movu        m2, [r0 + r3 + 32]
    movu        m3, [r0 + r3 + 48]
    movu        m4, [r1 + r3]
    movu        m5, [r1 + r3 + 16]
    psubw       m0, m4
    psubw       m1, m5
    movu        m4, [r1 + r3 + 32]
    movu        m5, [r1 + r3 + 48]
    psubw       m2, m4
    psubw       m3, m5

    movu        [r2 + r3], m0
    movu        [r2 + r3 + 16], m1
    movu        [r2 + r3 + 32], m2
    movu        [r2 + r3 + 48], m3

    dec         r4d

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 2]

    jnz         .loop

%else
INIT_XMM sse4
cglobal getResidual32, 4,5,7
    mov         r4d, 32/2
    pxor        m0, m0
.loop:
    movu        m1, [r0]
    movu        m2, [r0 + 16]
    movu        m3, [r1]
    movu        m4, [r1 + 16]
    pmovzxbw    m5, m1
    punpckhbw   m1, m0
    pmovzxbw    m6, m3
    punpckhbw   m3, m0
    psubw       m5, m6
    psubw       m1, m3
    movu        [r2 + 0 * 16], m5
    movu        [r2 + 1 * 16], m1

    pmovzxbw    m5, m2
    punpckhbw   m2, m0
    pmovzxbw    m6, m4
    punpckhbw   m4, m0
    psubw       m5, m6
    psubw       m2, m4
    movu        [r2 + 2 * 16], m5
    movu        [r2 + 3 * 16], m2

    movu        m1, [r0 + r3]
    movu        m2, [r0 + r3 + 16]
    movu        m3, [r1 + r3]
    movu        m4, [r1 + r3 + 16]
    pmovzxbw    m5, m1
    punpckhbw   m1, m0
    pmovzxbw    m6, m3
    punpckhbw   m3, m0
    psubw       m5, m6
    psubw       m1, m3
    movu        [r2 + r3 * 2 + 0 * 16], m5
    movu        [r2 + r3 * 2 + 1 * 16], m1

    pmovzxbw    m5, m2
    punpckhbw   m2, m0
    pmovzxbw    m6, m4
    punpckhbw   m4, m0
    psubw       m5, m6
    psubw       m2, m4
    movu        [r2 + r3 * 2 + 2 * 16], m5
    movu        [r2 + r3 * 2 + 3 * 16], m2

    dec         r4d

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]

    jnz         .loop
%endif
    RET


;-----------------------------------------------------------------------------
; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
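; Reference sketch (hand-written C equivalent of the loops below, mirroring
; what the asm computes; variable names are illustrative):
;     uint32_t numSig = 0;
;     for (int n = 0; n < numCoeff; n++) {
;         int level  = coef[n];
;         int tmp    = abs(level) * quantCoeff[n];
;         int level1 = (tmp + add) >> qBits;
;         deltaU[n]  = (tmp >> (qBits - 8)) - (level1 << 8);
;         qCoef[n]   = (int16_t)(level < 0 ? -level1 : level1);
;         numSig    += (level1 != 0);
;     }
;     return numSig;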
INIT_XMM sse4
cglobal quant, 5,6,8
    ; fill qbits
    movd        m4, r4d         ; m4 = qbits

    ; fill qbits-8
    sub         r4d, 8
    movd        m6, r4d         ; m6 = qbits8

    ; fill offset
    movd        m5, r5m
    pshufd      m5, m5, 0       ; m5 = add

    lea         r5, [pd_1]

    mov         r4d, r6m
    shr         r4d, 3
    pxor        m7, m7          ; m7 = count of non-zero coeffs
.loop:
    ; 4 coeff
    movu        m0, [r0]        ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1]        ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, m4          ; m2 = level1

    pslld       m3, m2, 8
    psrad       m1, m6
    psubd       m1, m3          ; m1 = deltaU1

    movu        [r2], m1
    psignd      m3, m2, m0
    pminud      m2, [r5]
    paddd       m7, m2
    packssdw    m3, m3
    movh        [r3], m3

    ; 4 coeff
    movu        m0, [r0 + 16]   ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1 + 16]   ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, m4          ; m2 = level1
    pslld       m3, m2, 8
    psrad       m1, m6
    psubd       m1, m3          ; m1 = deltaU1
    movu        [r2 + 16], m1
    psignd      m3, m2, m0
    pminud      m2, [r5]
    paddd       m7, m2
    packssdw    m3, m3
    movh        [r3 + 8], m3

    add         r0, 32
    add         r1, 32
    add         r2, 32
    add         r3, 16

    dec         r4d
    jnz         .loop

    pxor        m0, m0
    psadbw      m7, m0
    movhlps     m0, m7
    paddd       m7, m0
    movd        eax, m7
    RET


IACA_START
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal quant, 5,5,10
    ; fill qbits
    movd        xm4, r4d        ; m4 = qbits

    ; fill qbits-8
    sub         r4d, 8
    movd        xm6, r4d        ; m6 = qbits8

    ; fill offset
    vpbroadcastd m5, r5m        ; m5 = add

    vpbroadcastw m9, [pw_1]     ; m9 = word [1]

    mov         r4d, r6m
    shr         r4d, 4
    pxor        m7, m7          ; m7 = count of non-zero coeffs
.loop:
    ; 8 coeff
    movu        m0, [r0]        ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1]        ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, xm4         ; m2 = level1

    pslld       m3, m2, 8
    psrad       m1, xm6
    psubd       m1, m3          ; m1 = deltaU1
    movu        [r2], m1
    psignd      m2, m0

    ; 8 coeff
    movu        m0, [r0 + mmsize]   ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1 + mmsize]   ; m1 = tmpLevel1
    paddd       m3, m1, m5
    psrad       m3, xm4         ; m3 = level1

    pslld       m8, m3, 8
    psrad       m1, xm6
    psubd       m1, m8          ; m1 = deltaU1
    movu        [r2 + mmsize], m1
    psignd      m3, m0

    packssdw    m2, m3
    vpermq      m2, m2, q3120
    movu        [r3], m2

    ; count non-zero coeff
    ; TODO: popcnt would be faster, but not all target CPUs support it
    pminuw      m2, m9
    paddw       m7, m2

    add         r0, mmsize*2
    add         r1, mmsize*2
    add         r2, mmsize*2
    add         r3, mmsize

    dec         r4d
    jnz         .loop

    ; sum count
    xorpd       m0, m0
    psadbw      m7, m0
    vextracti128 xm1, m7, 1
    paddd       xm7, xm1
    movhlps     xm0, xm7
    paddd       xm7, xm0
    movd        eax, xm7
    RET

%else ; ARCH_X86_64 == 1
INIT_YMM avx2
cglobal quant, 5,6,8
    ; fill qbits
    movd        xm4, r4d        ; m4 = qbits

    ; fill qbits-8
    sub         r4d, 8
    movd        xm6, r4d        ; m6 = qbits8

    ; fill offset
    vpbroadcastd m5, r5m        ; m5 = add

    lea         r5, [pd_1]

    mov         r4d, r6m
    shr         r4d, 4
    pxor        m7, m7          ; m7 = count of non-zero coeffs
.loop:
    ; 8 coeff
    movu        m0, [r0]        ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1]        ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, xm4         ; m2 = level1

    pslld       m3, m2, 8
    psrad       m1, xm6
    psubd       m1, m3          ; m1 = deltaU1

    movu        [r2], m1
    psignd      m3, m2, m0
    pminud      m2, [r5]
    paddd       m7, m2
    packssdw    m3, m3
    vpermq      m3, m3, q0020
    movu        [r3], xm3

    ; 8 coeff
    movu        m0, [r0 + mmsize]   ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1 + mmsize]   ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, xm4         ; m2 = level1

    pslld       m3, m2, 8
    psrad       m1, xm6
    psubd       m1, m3          ; m1 = deltaU1

    movu        [r2 + mmsize], m1
    psignd      m3, m2, m0
    pminud      m2, [r5]
    paddd       m7, m2
    packssdw    m3, m3
    vpermq      m3, m3, q0020
    movu        [r3 + mmsize/2], xm3

    add         r0, mmsize*2
    add         r1, mmsize*2
    add         r2, mmsize*2
    add         r3, mmsize

    dec         r4d
    jnz         .loop

    xorpd       m0, m0
    psadbw      m7, m0
    vextracti128 xm1, m7, 1
    paddd       xm7, xm1
    movhlps     xm0, xm7
    paddd       xm7, xm0
    movd        eax, xm7
    RET
%endif ; ARCH_X86_64 == 1
IACA_END


;-----------------------------------------------------------------------------
; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
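; Reference sketch (hand-written C equivalent of the loops below; illustrative):
;     uint32_t numSig = 0;
;     for (int n = 0; n < numCoeff; n++) {
;         int level  = coef[n];
;         int level1 = (abs(level) * quantCoeff[n] + add) >> qBits;
;         qCoef[n]   = (int16_t)(level < 0 ? -level1 : level1);
;         numSig    += (level1 != 0);
;     }
;     return numSig;  // the sse4 path derives this as numCoeff minus the zero count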
INIT_XMM sse4
cglobal nquant, 3,5,8
    movd        m6, r4m
    mov         r4d, r5m
    pxor        m7, m7          ; m7 = numZero
    movd        m5, r3m         ; m5 = qbits
    pshufd      m6, m6, 0       ; m6 = add
    mov         r3d, r4d        ; r3 = numCoeff
    shr         r4d, 3

.loop:
    movu        m0, [r0]        ; m0 = level
    movu        m1, [r0 + 16]   ; m1 = level

    pabsd       m2, m0
    pmulld      m2, [r1]        ; m2 = tmpLevel1 * qcoeff
    paddd       m2, m6
    psrad       m2, m5          ; m2 = level1
    psignd      m2, m0

    pabsd       m3, m1
    pmulld      m3, [r1 + 16]   ; m3 = tmpLevel1 * qcoeff
    paddd       m3, m6
    psrad       m3, m5          ; m3 = level1
    psignd      m3, m1

    packssdw    m2, m3

    movu        [r2], m2
    add         r0, 32
    add         r1, 32
    add         r2, 16

    pxor        m4, m4
    pcmpeqw     m2, m4
    psubw       m7, m2

    dec         r4d
    jnz         .loop

    packuswb    m7, m7
    psadbw      m7, m4
    mov         eax, r3d
    movd        r4d, m7
    sub         eax, r4d        ; numSig
    RET


INIT_YMM avx2
cglobal nquant, 3,5,7
    vpbroadcastd m4, r4m
    vpbroadcastd m6, [pw_1]
    mov         r4d, r5m
    pxor        m5, m5          ; m5 = count of non-zero coeffs
    movd        xm3, r3m        ; m3 = qbits
    mov         r3d, r4d        ; r3 = numCoeff
    shr         r4d, 4

.loop:
    movu        m0, [r0]        ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1]        ; m1 = tmpLevel1 * qcoeff
    paddd       m1, m4
    psrad       m1, xm3         ; m1 = level1
    psignd      m1, m0

    movu        m0, [r0 + mmsize]   ; m0 = level
    pabsd       m2, m0
    pmulld      m2, [r1 + mmsize]   ; m2 = tmpLevel1 * qcoeff
    paddd       m2, m4
    psrad       m2, xm3         ; m2 = level1
    psignd      m2, m0

    packssdw    m1, m2
    vpermq      m2, m1, q3120

    movu        [r2], m2
    add         r0, mmsize * 2
    add         r1, mmsize * 2
    add         r2, mmsize

    pminuw      m1, m6
    paddw       m5, m1

    dec         r4d
    jnz         .loop

    pxor        m0, m0
    psadbw      m5, m0
    vextracti128 xm0, m5, 1
    paddd       xm5, xm0
    pshufd      xm0, xm5, 2
    paddd       xm5, xm0
    movd        eax, xm5
    RET


;-----------------------------------------------------------------------------
; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
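; Reference sketch (hand-written C equivalent; illustrative):
;     int add = 1 << (shift - 1);
;     for (int n = 0; n < num; n++) {
;         int c = (quantCoef[n] * scale + add) >> shift;
;         coef[n] = Clip3(-32768, 32767, c);
;     }
; For HIGH_BIT_DEPTH, scale/shift are pre-adjusted below when scale would not
; fit in 16 bits (scale >>= 2, shift -= 2); the bts trick packs 'add' into the
; high word next to 'scale' so pmaddwd computes q*scale + add in one step.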
INIT_XMM sse4
cglobal dequant_normal, 5,5,5
    mova        m2, [pw_1]
%if HIGH_BIT_DEPTH
    cmp         r3d, 32767
    jle         .skip
    shr         r3d, 2
    sub         r4d, 2
.skip:
%endif
    movd        m0, r4d         ; m0 = shift
    add         r4d, 15
    bts         r3d, r4d
    movd        m1, r3d
    pshufd      m1, m1, 0       ; m1 = dword [add scale]
    ; m0 = shift
    ; m1 = scale
    ; m2 = word [1]
.loop:
    movu        m3, [r0]
    punpckhwd   m4, m3, m2
    punpcklwd   m3, m2
    pmaddwd     m3, m1          ; m3 = dword (clipQCoef * scale + add)
    pmaddwd     m4, m1
    psrad       m3, m0
    psrad       m4, m0
    packssdw    m3, m3          ; OPT_ME: store must be 32 bits
    pmovsxwd    m3, m3
    packssdw    m4, m4
    pmovsxwd    m4, m4
    mova        [r1], m3
    mova        [r1 + 16], m4

    add         r0, 16
    add         r1, 32

    sub         r2d, 8
    jnz         .loop
    RET


INIT_YMM avx2
cglobal dequant_normal, 5,5,7
    vpbroadcastd m2, [pw_1]      ; m2 = word [1]
    vpbroadcastd m5, [pd_32767]  ; m5 = dword [32767]
    vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768]
%if HIGH_BIT_DEPTH
    cmp         r3d, 32767
    jle         .skip
    shr         r3d, 2
    sub         r4d, 2
.skip:
%endif
    movd        xm0, r4d        ; m0 = shift
    add         r4d, -1+16
    bts         r3d, r4d
    vpbroadcastd m1, r3d        ; m1 = dword [add scale]

    ; m0 = shift
    ; m1 = scale
    ; m2 = word [1]
    shr         r2d, 4
.loop:
    movu        m3, [r0]
    punpckhwd   m4, m3, m2
    punpcklwd   m3, m2
    pmaddwd     m3, m1          ; m3 = dword (clipQCoef * scale + add)
    pmaddwd     m4, m1
    psrad       m3, xm0
    psrad       m4, xm0
    pminsd      m3, m5
    pmaxsd      m3, m6
    pminsd      m4, m5
    pmaxsd      m4, m6
    mova        [r1 + 0 * mmsize/2], xm3
    mova        [r1 + 1 * mmsize/2], xm4
    vextracti128 [r1 + 2 * mmsize/2], m3, 1
    vextracti128 [r1 + 3 * mmsize/2], m4, 1

    add         r0, mmsize
    add         r1, mmsize * 2

    dec         r2d
    jnz         .loop
    RET


;-----------------------------------------------------------------------------
; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
;-----------------------------------------------------------------------------
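; Reference sketch (hand-written C equivalent; illustrative):
;     int count = 0;
;     for (int n = 0; n < numCoeff; n++)
;         count += (quantCoeff[n] != 0);
;     return count;
; The asm below starts each byte lane at numCoeff/16 and subtracts one per
; zero coefficient (pcmpeqb yields -1 for zeros), so the final psadbw sum
; over the 16 lanes is numCoeff minus the zero count.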
INIT_XMM ssse3
cglobal count_nonzero, 2,2,3
    pxor        m0, m0
    shr         r1d, 4
    movd        m1, r1d
    pshufb      m1, m0

.loop:
    mova        m2, [r0 + 0]
    packsswb    m2, [r0 + 16]
    add         r0, 32
    pcmpeqb     m2, m0
    paddb       m1, m2
    dec         r1d
    jnz         .loop

    psadbw      m1, m0
    pshufd      m0, m1, 2
    paddd       m0, m1
    movd        eax, m0
    RET


;-----------------------------------------------------------------------------------------------------------------------------------------------
;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
;-----------------------------------------------------------------------------------------------------------------------------------------------
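; Reference sketch (hand-written C equivalent of the inner loop; illustrative,
; assuming the 8-bit pixel path this kernel implements):
;     for (int y = 0; y < height; y++, src += stride, dst += stride)
;         for (int x = 0; x < width; x++) {
;             int v = ((src[x] * (w0 << 6) + round) >> shift) + offset;
;             dst[x] = (uint8_t)Clip3(0, 255, v);
;         }
; The kernel pre-scales w0 by 64 and packs (w0<<6, round) into one dword so a
; single pmaddwd on word pairs [s, 1] computes s*(w0<<6) + round per pixel.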
INIT_XMM sse4
cglobal weight_pp, 6, 7, 6

    shl         r5d, 6          ; r5d = w0 << 6
    mov         r6d, r6m
    shl         r6d, 16
    or          r6d, r5d        ; assumes both (w0<<6) and round fit in 16 bits each
    movd        m0, r6d
    pshufd      m0, m0, 0       ; m0 = [w0<<6, round]
    movd        m1, r7m
    movd        m2, r8m
    pshufd      m2, m2, 0
    mova        m5, [pw_1]
    sub         r2d, r3d
    shr         r3d, 4

.loopH:
    mov         r5d, r3d

.loopW:
    pmovzxbw    m4, [r0]
    punpcklwd   m3, m4, m5
    pmaddwd     m3, m0
    psrad       m3, m1
    paddd       m3, m2

    punpckhwd   m4, m5
    pmaddwd     m4, m0
    psrad       m4, m1
    paddd       m4, m2

    packssdw    m3, m4
    packuswb    m3, m3
    movh        [r1], m3

    pmovzxbw    m4, [r0 + 8]
    punpcklwd   m3, m4, m5
    pmaddwd     m3, m0
    psrad       m3, m1
    paddd       m3, m2

    punpckhwd   m4, m5
    pmaddwd     m4, m0
    psrad       m4, m1
    paddd       m4, m2

    packssdw    m3, m4
    packuswb    m3, m3
    movh        [r1 + 8], m3

    add         r0, 16
    add         r1, 16

    dec         r5d
    jnz         .loopW

    lea         r0, [r0 + r2]
    lea         r1, [r1 + r2]

    dec         r4d
    jnz         .loopH
    RET


INIT_YMM avx2
cglobal weight_pp, 6, 7, 6

    shl         r5d, 6          ; r5d = w0 << 6
    mov         r6d, r6m
    shl         r6d, 16
    or          r6d, r5d        ; assumes both (w0<<6) and round fit in 16 bits each
    movd        xm0, r6d
    pshufd      xm0, xm0, 0     ; m0 = [w0<<6, round]
    vinserti128 m0, m0, xm0, 1  ; the docs say (pshufd + vinserti128) can be replaced by vpbroadcastd m0, xm0, but that hits a build problem; needs investigation

    movd        xm1, r7m
    vpbroadcastd m2, r8m
    mova        m5, [pw_1]
    sub         r2d, r3d
    shr         r3d, 4

.loopH:
    mov         r5d, r3d

.loopW:
    pmovzxbw    m4, [r0]
    punpcklwd   m3, m4, m5
    pmaddwd     m3, m0
    psrad       m3, xm1
    paddd       m3, m2

    punpckhwd   m4, m5
    pmaddwd     m4, m0
    psrad       m4, xm1
    paddd       m4, m2

    packssdw    m3, m4
    vextracti128 xm4, m3, 1
    packuswb    xm3, xm4
    movu        [r1], xm3

    add         r0, 16
    add         r1, 16

    dec         r5d
    jnz         .loopW

    lea         r0, [r0 + r2]
    lea         r1, [r1 + r2]

    dec         r4d
    jnz         .loopH
    RET

;-------------------------------------------------------------------------------------------------------------------------------------------------
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
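; Reference sketch (hand-written C equivalent of the inner loop; illustrative):
;     for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
;         for (int x = 0; x < width; x++) {
;             int v = (((src[x] + 0x2000) * w0 + round) >> shift) + offset;
;             dst[x] = (uint8_t)Clip3(0, 255, v);
;         }
; pw_2000 (0x2000 = 8192) is the bias added to the int16 intermediate source
; before weighting; w0 and round again share a dword for a single pmaddwd.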
INIT_XMM sse4
%if ARCH_X86_64
cglobal weight_sp, 6, 7+2, 7
    %define tmp_r0 r7
    %define tmp_r1 r8
%else ; ARCH_X86_64 = 0
cglobal weight_sp, 6, 7, 7, 0-(2*4)
    %define tmp_r0 [(rsp + 0 * 4)]
    %define tmp_r1 [(rsp + 1 * 4)]
%endif ; ARCH_X86_64

    movd        m0, r6m         ; m0 = [w0]

    movd        m1, r7m         ; m1 = [round]
    punpcklwd   m0, m1
    pshufd      m0, m0, 0       ; m0 = [w0 round]

    movd        m1, r8m         ; m1 = [shift]

    movd        m2, r9m
    pshufd      m2, m2, 0       ; m2 = [offset]

    mova        m3, [pw_1]
    mova        m4, [pw_2000]

    add         r2d, r2d

.loopH:
    mov         r6d, r4d

    ; save old src and dst
    mov         tmp_r0, r0
    mov         tmp_r1, r1
.loopW:
    movu        m5, [r0]
    paddw       m5, m4

    punpcklwd   m6, m5, m3
    pmaddwd     m6, m0
    psrad       m6, m1
    paddd       m6, m2

    punpckhwd   m5, m3
    pmaddwd     m5, m0
    psrad       m5, m1
    paddd       m5, m2

    packssdw    m6, m5
    packuswb    m6, m6

    sub         r6d, 8
    jl          .width4
    movh        [r1], m6
    je          .nextH
    add         r0, 16
    add         r1, 8

    jmp         .loopW

.width4:
    cmp         r6d, -4
    jl          .width2
    movd        [r1], m6
    je          .nextH
    add         r1, 4
    pshufd      m6, m6, 1

.width2:
    pextrw      [r1], m6, 0

.nextH:
    mov         r0, tmp_r0
    mov         r1, tmp_r1
    lea         r0, [r0 + r2]
    lea         r1, [r1 + r3]

    dec         r5d
    jnz         .loopH

    RET

;-----------------------------------------------------------------
; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
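; Reference sketch (hand-written C equivalent; illustrative). Every
; transposeN below computes an N x N pixel transpose into a packed dst
; whose row pitch is N:
;     for (int i = 0; i < N; i++)
;         for (int j = 0; j < N; j++)
;             dst[j * N + i] = src[i * stride + j];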
INIT_XMM sse2
cglobal transpose4, 3, 3, 4, dest, src, stride
%if HIGH_BIT_DEPTH == 1
    add         r2, r2
    movh        m0, [r1]
    movh        m1, [r1 + r2]
    movh        m2, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m3, [r1 + r2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpckhdq   m1, m0, m2
    punpckldq   m0, m2
    movu        [r0], m0
    movu        [r0 + 16], m1
%else ;HIGH_BIT_DEPTH == 0
    movd        m0, [r1]
    movd        m1, [r1 + r2]
    movd        m2, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movd        m3, [r1 + r2]

    punpcklbw   m0, m1
    punpcklbw   m2, m3
    punpcklwd   m0, m2
    movu        [r0], m0
%endif
    RET

;-----------------------------------------------------------------
; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8, 3, 5, 5
    add         r2, r2
    lea         r3, [3 * r2]
    lea         r4, [r1 + 4 * r2]
    movu        xm0, [r1]
    vinserti128 m0, m0, [r4], 1
    movu        xm1, [r1 + r2]
    vinserti128 m1, m1, [r4 + r2], 1
    movu        xm2, [r1 + 2 * r2]
    vinserti128 m2, m2, [r4 + 2 * r2], 1
    movu        xm3, [r1 + r3]
    vinserti128 m3, m3, [r4 + r3], 1

    punpcklwd   m4, m0, m1      ;[1 - 4][row1row2;row5row6]
    punpckhwd   m0, m1          ;[5 - 8][row1row2;row5row6]

    punpcklwd   m1, m2, m3      ;[1 - 4][row3row4;row7row8]
    punpckhwd   m2, m3          ;[5 - 8][row3row4;row7row8]

    punpckldq   m3, m4, m1      ;[1 - 2][row1row2row3row4;row5row6row7row8]
    punpckhdq   m4, m1          ;[3 - 4][row1row2row3row4;row5row6row7row8]

    punpckldq   m1, m0, m2      ;[5 - 6][row1row2row3row4;row5row6row7row8]
    punpckhdq   m0, m2          ;[7 - 8][row1row2row3row4;row5row6row7row8]

    vpermq      m3, m3, 0xD8    ;[1 ; 2][row1row2row3row4row5row6row7row8]
    vpermq      m4, m4, 0xD8    ;[3 ; 4][row1row2row3row4row5row6row7row8]
    vpermq      m1, m1, 0xD8    ;[5 ; 6][row1row2row3row4row5row6row7row8]
    vpermq      m0, m0, 0xD8    ;[7 ; 8][row1row2row3row4row5row6row7row8]

    movu        [r0 + 0 * 32], m3
    movu        [r0 + 1 * 32], m4
    movu        [r0 + 2 * 32], m1
    movu        [r0 + 3 * 32], m0
    RET
%endif

INIT_XMM sse2
%macro TRANSPOSE_4x4 1
    movh        m0, [r1]
    movh        m1, [r1 + r2]
    movh        m2, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m3, [r1 + r2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpckhdq   m1, m0, m2
    punpckldq   m0, m2
    movh        [r0], m0
    movhps      [r0 + %1], m0
    movh        [r0 + 2 * %1], m1
    lea         r0, [r0 + 2 * %1]
    movhps      [r0 + %1], m1
%endmacro
cglobal transpose8_internal
    TRANSPOSE_4x4 r5
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r3 + 8]
    TRANSPOSE_4x4 r5
    lea         r1, [r1 + 2 * r2]
    neg         r2
    lea         r1, [r1 + r2 * 8 + 8]
    neg         r2
    lea         r0, [r3 + 4 * r5]
    TRANSPOSE_4x4 r5
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r3 + 8 + 4 * r5]
    TRANSPOSE_4x4 r5
    ret
cglobal transpose8, 3, 6, 4, dest, src, stride
    add         r2, r2
    mov         r3, r0
    mov         r5, 16
    call        transpose8_internal
    RET
%else ;HIGH_BIT_DEPTH == 0
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8, 3, 4, 4
    lea         r3, [r2 * 3]
    movq        xm0, [r1]
    movhps      xm0, [r1 + 2 * r2]
    movq        xm1, [r1 + r2]
    movhps      xm1, [r1 + r3]
    lea         r1, [r1 + 4 * r2]
    movq        xm2, [r1]
    movhps      xm2, [r1 + 2 * r2]
    movq        xm3, [r1 + r2]
    movhps      xm3, [r1 + r3]

    vinserti128 m0, m0, xm2, 1  ;[row1 row3 row5 row7]
    vinserti128 m1, m1, xm3, 1  ;[row2 row4 row6 row8]

    punpcklbw   m2, m0, m1      ;[1 - 8; 1 - 8][row1row2; row5row6]
    punpckhbw   m0, m1          ;[1 - 8; 1 - 8][row3row4; row7row8]

    punpcklwd   m1, m2, m0      ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8]
    punpckhwd   m2, m0          ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8]

    mova        m0, [trans8_shuf]

    vpermd      m1, m0, m1      ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8]
    vpermd      m2, m0, m2      ;[5 - 6; 7 - 8][row1row2row3row4row5row6row7row8]

    movu        [r0], m1
    movu        [r0 + 32], m2
    RET
%endif

INIT_XMM sse2
cglobal transpose8, 3, 5, 8, dest, src, stride
    lea         r3, [2 * r2]
    lea         r4, [3 * r2]
    movh        m0, [r1]
    movh        m1, [r1 + r2]
    movh        m2, [r1 + r3]
    movh        m3, [r1 + r4]
    movh        m4, [r1 + 4 * r2]
    lea         r1, [r1 + 4 * r2]
    movh        m5, [r1 + r2]
    movh        m6, [r1 + r3]
    movh        m7, [r1 + r4]

    punpcklbw   m0, m1
    punpcklbw   m2, m3
    punpcklbw   m4, m5
    punpcklbw   m6, m7

    punpckhwd   m1, m0, m2
    punpcklwd   m0, m2
    punpckhwd   m5, m4, m6
    punpcklwd   m4, m6
    punpckhdq   m2, m0, m4
    punpckldq   m0, m4
    punpckhdq   m3, m1, m5
    punpckldq   m1, m5

    movu        [r0], m0
    movu        [r0 + 16], m2
    movu        [r0 + 32], m1
    movu        [r0 + 48], m3
    RET
%endif

%macro TRANSPOSE_8x8 1

    movh        m0, [r1]
    movh        m1, [r1 + r2]
    movh        m2, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m3, [r1 + r2]
    movh        m4, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m5, [r1 + r2]
    movh        m6, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m7, [r1 + r2]

    punpcklbw   m0, m1
    punpcklbw   m2, m3
    punpcklbw   m4, m5
    punpcklbw   m6, m7

    punpckhwd   m1, m0, m2
    punpcklwd   m0, m2
    punpckhwd   m5, m4, m6
    punpcklwd   m4, m6
    punpckhdq   m2, m0, m4
    punpckldq   m0, m4
    punpckhdq   m3, m1, m5
    punpckldq   m1, m5

    movh        [r0], m0
    movhps      [r0 + %1], m0
    movh        [r0 + 2 * %1], m2
    lea         r0, [r0 + 2 * %1]
    movhps      [r0 + %1], m2
    movh        [r0 + 2 * %1], m1
    lea         r0, [r0 + 2 * %1]
    movhps      [r0 + %1], m1
    movh        [r0 + 2 * %1], m3
    lea         r0, [r0 + 2 * %1]
    movhps      [r0 + %1], m3

%endmacro


;-----------------------------------------------------------------
; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose16x8_internal
    movu        m0, [r1]
    movu        m1, [r1 + r2]
    movu        m2, [r1 + 2 * r2]
    movu        m3, [r1 + r3]
    lea         r1, [r1 + 4 * r2]

    movu        m4, [r1]
    movu        m5, [r1 + r2]
    movu        m6, [r1 + 2 * r2]
    movu        m7, [r1 + r3]

    punpcklwd   m8, m0, m1      ;[1 - 4; 9 - 12][1 2]
    punpckhwd   m0, m1          ;[5 - 8; 13 - 16][1 2]

    punpcklwd   m1, m2, m3      ;[1 - 4; 9 - 12][3 4]
    punpckhwd   m2, m3          ;[5 - 8; 13 - 16][3 4]

    punpcklwd   m3, m4, m5      ;[1 - 4; 9 - 12][5 6]
    punpckhwd   m4, m5          ;[5 - 8; 13 - 16][5 6]

    punpcklwd   m5, m6, m7      ;[1 - 4; 9 - 12][7 8]
    punpckhwd   m6, m7          ;[5 - 8; 13 - 16][7 8]

    punpckldq   m7, m8, m1      ;[1 - 2; 9 - 10][1 2 3 4]
    punpckhdq   m8, m1          ;[3 - 4; 11 - 12][1 2 3 4]

    punpckldq   m1, m3, m5      ;[1 - 2; 9 - 10][5 6 7 8]
    punpckhdq   m3, m5          ;[3 - 4; 11 - 12][5 6 7 8]

    punpckldq   m5, m0, m2      ;[5 - 6; 13 - 14][1 2 3 4]
    punpckhdq   m0, m2          ;[7 - 8; 15 - 16][1 2 3 4]

    punpckldq   m2, m4, m6      ;[5 - 6; 13 - 14][5 6 7 8]
    punpckhdq   m4, m6          ;[7 - 8; 15 - 16][5 6 7 8]

    punpcklqdq  m6, m7, m1      ;[1 ; 9 ][1 2 3 4 5 6 7 8]
    punpckhqdq  m7, m1          ;[2 ; 10][1 2 3 4 5 6 7 8]

    punpcklqdq  m1, m8, m3      ;[3 ; 11][1 2 3 4 5 6 7 8]
    punpckhqdq  m8, m3          ;[4 ; 12][1 2 3 4 5 6 7 8]

    punpcklqdq  m3, m5, m2      ;[5 ; 13][1 2 3 4 5 6 7 8]
    punpckhqdq  m5, m2          ;[6 ; 14][1 2 3 4 5 6 7 8]

    punpcklqdq  m2, m0, m4      ;[7 ; 15][1 2 3 4 5 6 7 8]
    punpckhqdq  m0, m4          ;[8 ; 16][1 2 3 4 5 6 7 8]

    movu        [r0 + 0 * 32], xm6
    vextracti128 [r0 + 8 * 32], m6, 1
    movu        [r0 + 1 * 32], xm7
    vextracti128 [r0 + 9 * 32], m7, 1
    movu        [r0 + 2 * 32], xm1
    vextracti128 [r0 + 10 * 32], m1, 1
    movu        [r0 + 3 * 32], xm8
    vextracti128 [r0 + 11 * 32], m8, 1
    movu        [r0 + 4 * 32], xm3
    vextracti128 [r0 + 12 * 32], m3, 1
    movu        [r0 + 5 * 32], xm5
    vextracti128 [r0 + 13 * 32], m5, 1
    movu        [r0 + 6 * 32], xm2
    vextracti128 [r0 + 14 * 32], m2, 1
    movu        [r0 + 7 * 32], xm0
    vextracti128 [r0 + 15 * 32], m0, 1
    ret

cglobal transpose16, 3, 4, 9
    add         r2, r2
    lea         r3, [r2 * 3]
    call        transpose16x8_internal
    lea         r1, [r1 + 4 * r2]
    add         r0, 16
    call        transpose16x8_internal
    RET
%endif
INIT_XMM sse2
cglobal transpose16, 3, 7, 4, dest, src, stride
    add         r2, r2
    mov         r3, r0
    mov         r4, r1
    mov         r5, 32
    mov         r6, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r4 + 16]
    lea         r0, [r6 + 8 * r5]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 8 * r5 + 16]
    mov         r3, r0
    call        transpose8_internal
    RET
%else ;HIGH_BIT_DEPTH == 0
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose16, 3, 5, 9
    lea         r3, [r2 * 3]
    lea         r4, [r1 + 8 * r2]

    movu        xm0, [r1]
    movu        xm1, [r1 + r2]
    movu        xm2, [r1 + 2 * r2]
    movu        xm3, [r1 + r3]
    vinserti128 m0, m0, [r4], 1
    vinserti128 m1, m1, [r4 + r2], 1
    vinserti128 m2, m2, [r4 + 2 * r2], 1
    vinserti128 m3, m3, [r4 + r3], 1
    lea         r1, [r1 + 4 * r2]
    lea         r4, [r4 + 4 * r2]

    movu        xm4, [r1]
    movu        xm5, [r1 + r2]
    movu        xm6, [r1 + 2 * r2]
    movu        xm7, [r1 + r3]
    vinserti128 m4, m4, [r4], 1
    vinserti128 m5, m5, [r4 + r2], 1
    vinserti128 m6, m6, [r4 + 2 * r2], 1
    vinserti128 m7, m7, [r4 + r3], 1

    punpcklbw   m8, m0, m1      ;[1 - 8 ; 1 - 8 ][1 2 9 10]
    punpckhbw   m0, m1          ;[9 - 16; 9 - 16][1 2 9 10]

    punpcklbw   m1, m2, m3      ;[1 - 8 ; 1 - 8 ][3 4 11 12]
    punpckhbw   m2, m3          ;[9 - 16; 9 - 16][3 4 11 12]

    punpcklbw   m3, m4, m5      ;[1 - 8 ; 1 - 8 ][5 6 13 14]
    punpckhbw   m4, m5          ;[9 - 16; 9 - 16][5 6 13 14]

    punpcklbw   m5, m6, m7      ;[1 - 8 ; 1 - 8 ][7 8 15 16]
    punpckhbw   m6, m7          ;[9 - 16; 9 - 16][7 8 15 16]

    punpcklwd   m7, m8, m1      ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12]
    punpckhwd   m8, m1          ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12]

    punpcklwd   m1, m3, m5      ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16]
    punpckhwd   m3, m5          ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16]

    punpcklwd   m5, m0, m2      ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12]
    punpckhwd   m0, m2          ;[13 - 16; 13 - 16][1 2 3 4 9 10 11 12]

    punpcklwd   m2, m4, m6      ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16]
    punpckhwd   m4, m6          ;[13 - 16; 13 - 16][5 6 7 8 13 14 15 16]

    punpckldq   m6, m7, m1      ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhdq   m7, m1          ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpckldq   m1, m8, m3      ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhdq   m8, m3          ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpckldq   m3, m5, m2      ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhdq   m5, m2          ;[11 - 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpckldq   m2, m0, m4      ;[13 - 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhdq   m0, m4          ;[15 - 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    vpermq      m6, m6, 0xD8
    vpermq      m7, m7, 0xD8
    vpermq      m1, m1, 0xD8
    vpermq      m8, m8, 0xD8
    vpermq      m3, m3, 0xD8
    vpermq      m5, m5, 0xD8
    vpermq      m2, m2, 0xD8
    vpermq      m0, m0, 0xD8

    movu        [r0 + 0 * 16], m6
    movu        [r0 + 2 * 16], m7
    movu        [r0 + 4 * 16], m1
    movu        [r0 + 6 * 16], m8
    movu        [r0 + 8 * 16], m3
    movu        [r0 + 10 * 16], m5
    movu        [r0 + 12 * 16], m2
    movu        [r0 + 14 * 16], m0
    RET
%endif
INIT_XMM sse2
cglobal transpose16, 3, 5, 8, dest, src, stride
    mov         r3, r0
    mov         r4, r1
    TRANSPOSE_8x8 16
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r3 + 8]
    TRANSPOSE_8x8 16
    lea         r1, [r4 + 8]
    lea         r0, [r3 + 8 * 16]
    TRANSPOSE_8x8 16
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r3 + 8 * 16 + 8]
    TRANSPOSE_8x8 16
    RET
%endif

cglobal transpose16_internal
    TRANSPOSE_8x8 r6
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r5 + 8]
    TRANSPOSE_8x8 r6
    lea         r1, [r1 + 2 * r2]
    neg         r2
    lea         r1, [r1 + r2 * 8]
    lea         r1, [r1 + r2 * 8 + 8]
    neg         r2
    lea         r0, [r5 + 8 * r6]
    TRANSPOSE_8x8 r6
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r5 + 8 * r6 + 8]
    TRANSPOSE_8x8 r6
    ret

;-----------------------------------------------------------------
; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8x32_internal
    movu        m0, [r1]
    movu        m1, [r1 + 32]
    movu        m2, [r1 + r2]
    movu        m3, [r1 + r2 + 32]
    movu        m4, [r1 + 2 * r2]
    movu        m5, [r1 + 2 * r2 + 32]
    movu        m6, [r1 + r3]
    movu        m7, [r1 + r3 + 32]
    lea         r1, [r1 + 4 * r2]

    punpcklwd   m8, m0, m2      ;[1 - 4; 9 - 12][1 2]
    punpckhwd   m0, m2          ;[5 - 8; 13 - 16][1 2]

    punpcklwd   m2, m4, m6      ;[1 - 4; 9 - 12][3 4]
    punpckhwd   m4, m6          ;[5 - 8; 13 - 16][3 4]

    punpcklwd   m6, m1, m3      ;[17 - 20; 25 - 28][1 2]
    punpckhwd   m1, m3          ;[21 - 24; 29 - 32][1 2]

    punpcklwd   m3, m5, m7      ;[17 - 20; 25 - 28][3 4]
    punpckhwd   m5, m7          ;[21 - 24; 29 - 32][3 4]

    punpckldq   m7, m8, m2      ;[1 - 2; 9 - 10][1 2 3 4]
    punpckhdq   m8, m2          ;[3 - 4; 11 - 12][1 2 3 4]

    punpckldq   m2, m0, m4      ;[5 - 6; 13 - 14][1 2 3 4]
    punpckhdq   m0, m4          ;[7 - 8; 15 - 16][1 2 3 4]

    punpckldq   m4, m6, m3      ;[17 - 18; 25 - 26][1 2 3 4]
    punpckhdq   m6, m3          ;[19 - 20; 27 - 28][1 2 3 4]

    punpckldq   m3, m1, m5      ;[21 - 22; 29 - 30][1 2 3 4]
    punpckhdq   m1, m5          ;[23 - 24; 31 - 32][1 2 3 4]

    movq        [r0 + 0 * 64], xm7
    movhps      [r0 + 1 * 64], xm7
    vextracti128 xm5, m7, 1
    movq        [r0 + 8 * 64], xm5
    movhps      [r0 + 9 * 64], xm5

    movu        m7, [r1]
    movu        m9, [r1 + 32]
    movu        m10, [r1 + r2]
    movu        m11, [r1 + r2 + 32]
    movu        m12, [r1 + 2 * r2]
    movu        m13, [r1 + 2 * r2 + 32]
    movu        m14, [r1 + r3]
    movu        m15, [r1 + r3 + 32]

    punpcklwd   m5, m7, m10     ;[1 - 4; 9 - 12][5 6]
    punpckhwd   m7, m10         ;[5 - 8; 13 - 16][5 6]

    punpcklwd   m10, m12, m14   ;[1 - 4; 9 - 12][7 8]
    punpckhwd   m12, m14        ;[5 - 8; 13 - 16][7 8]

    punpcklwd   m14, m9, m11    ;[17 - 20; 25 - 28][5 6]
    punpckhwd   m9, m11         ;[21 - 24; 29 - 32][5 6]

    punpcklwd   m11, m13, m15   ;[17 - 20; 25 - 28][7 8]
    punpckhwd   m13, m15        ;[21 - 24; 29 - 32][7 8]

    punpckldq   m15, m5, m10    ;[1 - 2; 9 - 10][5 6 7 8]
    punpckhdq   m5, m10         ;[3 - 4; 11 - 12][5 6 7 8]

    punpckldq   m10, m7, m12    ;[5 - 6; 13 - 14][5 6 7 8]
    punpckhdq   m7, m12         ;[7 - 8; 15 - 16][5 6 7 8]

    punpckldq   m12, m14, m11   ;[17 - 18; 25 - 26][5 6 7 8]
    punpckhdq   m14, m11        ;[19 - 20; 27 - 28][5 6 7 8]

    punpckldq   m11, m9, m13    ;[21 - 22; 29 - 30][5 6 7 8]
    punpckhdq   m9, m13         ;[23 - 24; 31 - 32][5 6 7 8]

    movq        [r0 + 0 * 64 + 8], xm15
    movhps      [r0 + 1 * 64 + 8], xm15
    vextracti128 xm13, m15, 1
    movq        [r0 + 8 * 64 + 8], xm13
    movhps      [r0 + 9 * 64 + 8], xm13

    punpcklqdq  m13, m8, m5     ;[3 ; 11][1 2 3 4 5 6 7 8]
    punpckhqdq  m8, m5          ;[4 ; 12][1 2 3 4 5 6 7 8]

    punpcklqdq  m5, m2, m10     ;[5 ; 13][1 2 3 4 5 6 7 8]
    punpckhqdq  m2, m10         ;[6 ; 14][1 2 3 4 5 6 7 8]

    punpcklqdq  m10, m0, m7     ;[7 ; 15][1 2 3 4 5 6 7 8]
    punpckhqdq  m0, m7          ;[8 ; 16][1 2 3 4 5 6 7 8]

    punpcklqdq  m7, m4, m12     ;[17 ; 25][1 2 3 4 5 6 7 8]
    punpckhqdq  m4, m12         ;[18 ; 26][1 2 3 4 5 6 7 8]

    punpcklqdq  m12, m6, m14    ;[19 ; 27][1 2 3 4 5 6 7 8]
    punpckhqdq  m6, m14         ;[20 ; 28][1 2 3 4 5 6 7 8]

    punpcklqdq  m14, m3, m11    ;[21 ; 29][1 2 3 4 5 6 7 8]
    punpckhqdq  m3, m11         ;[22 ; 30][1 2 3 4 5 6 7 8]

    punpcklqdq  m11, m1, m9     ;[23 ; 31][1 2 3 4 5 6 7 8]
    punpckhqdq  m1, m9          ;[24 ; 32][1 2 3 4 5 6 7 8]

    movu        [r0 + 2 * 64], xm13
    vextracti128 [r0 + 10 * 64], m13, 1

    movu        [r0 + 3 * 64], xm8
    vextracti128 [r0 + 11 * 64], m8, 1

    movu        [r0 + 4 * 64], xm5
    vextracti128 [r0 + 12 * 64], m5, 1

    movu        [r0 + 5 * 64], xm2
    vextracti128 [r0 + 13 * 64], m2, 1

    movu        [r0 + 6 * 64], xm10
    vextracti128 [r0 + 14 * 64], m10, 1

    movu        [r0 + 7 * 64], xm0
    vextracti128 [r0 + 15 * 64], m0, 1

    movu        [r0 + 16 * 64], xm7
    vextracti128 [r0 + 24 * 64], m7, 1

    movu        [r0 + 17 * 64], xm4
    vextracti128 [r0 + 25 * 64], m4, 1

    movu        [r0 + 18 * 64], xm12
    vextracti128 [r0 + 26 * 64], m12, 1

    movu        [r0 + 19 * 64], xm6
    vextracti128 [r0 + 27 * 64], m6, 1

    movu        [r0 + 20 * 64], xm14
    vextracti128 [r0 + 28 * 64], m14, 1

    movu        [r0 + 21 * 64], xm3
    vextracti128 [r0 + 29 * 64], m3, 1

    movu        [r0 + 22 * 64], xm11
    vextracti128 [r0 + 30 * 64], m11, 1

    movu        [r0 + 23 * 64], xm1
    vextracti128 [r0 + 31 * 64], m1, 1
    ret

cglobal transpose32, 3, 4, 16
    add         r2, r2
    lea         r3, [r2 * 3]
    call        transpose8x32_internal
    add         r0, 16
    lea         r1, [r1 + 4 * r2]
    call        transpose8x32_internal
    add         r0, 16
    lea         r1, [r1 + 4 * r2]
    call        transpose8x32_internal
    add         r0, 16
    lea         r1, [r1 + 4 * r2]
    call        transpose8x32_internal
    RET
%endif
INIT_XMM sse2
cglobal transpose32, 3, 7, 4, dest, src, stride
    add         r2, r2
    mov         r3, r0
    mov         r4, r1
    mov         r5, 64
    mov         r6, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 32]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 48]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r4 + 16]
    lea         r0, [r6 + 8 * 64]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 8 * 64 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 8 * 64 + 32]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 8 * 64 + 48]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r4 + 32]
    lea         r0, [r6 + 16 * 64]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16 * 64 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16 * 64 + 32]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16 * 64 + 48]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r4 + 48]
    lea         r0, [r6 + 24 * 64]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 24 * 64 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 24 * 64 + 32]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 24 * 64 + 48]
    mov         r3, r0
    call        transpose8_internal
    RET
%else ;HIGH_BIT_DEPTH == 0
INIT_XMM sse2
cglobal transpose32, 3, 7, 8, dest, src, stride
    mov         r3, r0
    mov         r4, r1
    mov         r5, r0
    mov         r6, 32
    call        transpose16_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r3 + 16]
    mov         r5, r0
    call        transpose16_internal
    lea         r1, [r4 + 16]
    lea         r0, [r3 + 16 * 32]
    mov         r5, r0
    call        transpose16_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r3 + 16 * 32 + 16]
    mov         r5, r0
    call        transpose16_internal
    RET

%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose32, 3, 5, 16
    lea         r3, [r2 * 3]
    mov         r4d, 2

.loop:
    movu        m0, [r1]
    movu        m1, [r1 + r2]
    movu        m2, [r1 + 2 * r2]
    movu        m3, [r1 + r3]
    lea         r1, [r1 + 4 * r2]

    movu        m4, [r1]
    movu        m5, [r1 + r2]
    movu        m6, [r1 + 2 * r2]
    movu        m7, [r1 + r3]

    punpcklbw   m8, m0, m1      ;[1 - 8 ; 17 - 24][1 2]
    punpckhbw   m0, m1          ;[9 - 16; 25 - 32][1 2]

    punpcklbw   m1, m2, m3      ;[1 - 8 ; 17 - 24][3 4]
    punpckhbw   m2, m3          ;[9 - 16; 25 - 32][3 4]

    punpcklbw   m3, m4, m5      ;[1 - 8 ; 17 - 24][5 6]
    punpckhbw   m4, m5          ;[9 - 16; 25 - 32][5 6]

    punpcklbw   m5, m6, m7      ;[1 - 8 ; 17 - 24][7 8]
    punpckhbw   m6, m7          ;[9 - 16; 25 - 32][7 8]

    punpcklwd   m7, m8, m1      ;[1 - 4 ; 17 - 20][1 2 3 4]
    punpckhwd   m8, m1          ;[5 - 8 ; 21 - 24][1 2 3 4]

    punpcklwd   m1, m3, m5      ;[1 - 4 ; 17 - 20][5 6 7 8]
    punpckhwd   m3, m5          ;[5 - 8 ; 21 - 24][5 6 7 8]

    punpcklwd   m5, m0, m2      ;[9 - 12; 25 - 28][1 2 3 4]
    punpckhwd   m0, m2          ;[13 - 16; 29 - 32][1 2 3 4]

    punpcklwd   m2, m4, m6      ;[9 - 12; 25 - 28][5 6 7 8]
    punpckhwd   m4, m6          ;[13 - 16; 29 - 32][5 6 7 8]

    punpckldq   m6, m7, m1      ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
    punpckhdq   m7, m1          ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]

    punpckldq   m1, m8, m3      ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
    punpckhdq   m8, m3          ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]

    punpckldq   m3, m5, m2      ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
    punpckhdq   m5, m2          ;[11 - 12; 27 - 28][1 2 3 4 5 6 7 8]

    punpckldq   m2, m0, m4      ;[13 - 14; 29 - 30][1 2 3 4 5 6 7 8]
    punpckhdq   m0, m4          ;[15 - 16; 31 - 32][1 2 3 4 5 6 7 8]

    movq        [r0 + 0 * 32], xm6
    movhps      [r0 + 1 * 32], xm6
    vextracti128 xm4, m6, 1
    movq        [r0 + 16 * 32], xm4
    movhps      [r0 + 17 * 32], xm4

    lea         r1, [r1 + 4 * r2]
    movu        m9, [r1]
    movu        m10, [r1 + r2]
    movu        m11, [r1 + 2 * r2]
    movu        m12, [r1 + r3]
    lea         r1, [r1 + 4 * r2]

    movu        m13, [r1]
    movu        m14, [r1 + r2]
    movu        m15, [r1 + 2 * r2]
    movu        m6, [r1 + r3]

    punpcklbw   m4, m9, m10     ;[1 - 8 ; 17 - 24][9 10]
    punpckhbw   m9, m10         ;[9 - 16; 25 - 32][9 10]

    punpcklbw   m10, m11, m12   ;[1 - 8 ; 17 - 24][11 12]
    punpckhbw   m11, m12        ;[9 - 16; 25 - 32][11 12]

    punpcklbw   m12, m13, m14   ;[1 - 8 ; 17 - 24][13 14]
    punpckhbw   m13, m14        ;[9 - 16; 25 - 32][13 14]

    punpcklbw   m14, m15, m6    ;[1 - 8 ; 17 - 24][15 16]
    punpckhbw   m15, m6         ;[9 - 16; 25 - 32][15 16]

    punpcklwd   m6, m4, m10     ;[1 - 4 ; 17 - 20][9 10 11 12]
    punpckhwd   m4, m10         ;[5 - 8 ; 21 - 24][9 10 11 12]

    punpcklwd   m10, m12, m14   ;[1 - 4 ; 17 - 20][13 14 15 16]
    punpckhwd   m12, m14        ;[5 - 8 ; 21 - 24][13 14 15 16]

    punpcklwd   m14, m9, m11    ;[9 - 12; 25 - 28][9 10 11 12]
    punpckhwd   m9, m11         ;[13 - 16; 29 - 32][9 10 11 12]

    punpcklwd   m11, m13, m15   ;[9 - 12; 25 - 28][13 14 15 16]
    punpckhwd   m13, m15        ;[13 - 16; 29 - 32][13 14 15 16]

    punpckldq   m15, m6, m10    ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
    punpckhdq   m6, m10         ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]

    punpckldq   m10, m4, m12    ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
    punpckhdq   m4, m12         ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]

    punpckldq   m12, m14, m11   ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
    punpckhdq   m14, m11        ;[11 - 12; 27 - 28][9 10 11 12 13 14 15 16]

    punpckldq   m11, m9, m13    ;[13 - 14; 29 - 30][9 10 11 12 13 14 15 16]
    punpckhdq   m9, m13         ;[15 - 16; 31 - 32][9 10 11 12 13 14 15 16]


    punpcklqdq  m13, m7, m6     ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m7, m6          ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m6, m1, m10     ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m1, m10         ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m10, m8, m4     ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m8, m4          ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m4, m3, m12     ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m3, m12         ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m12, m5, m14    ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m5, m14         ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m14, m2, m11    ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m2, m11         ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m11, m0, m9     ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m0, m9          ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    movq        [r0 + 0 * 32 + 8], xm15
    movhps      [r0 + 1 * 32 + 8], xm15
    vextracti128 xm9, m15, 1
    movq        [r0 + 16 * 32 + 8], xm9
    movhps      [r0 + 17 * 32 + 8], xm9

    movu        [r0 + 2 * 32], xm13
    vextracti128 [r0 + 18 * 32], m13, 1

    movu        [r0 + 3 * 32], xm7
    vextracti128 [r0 + 19 * 32], m7, 1

    movu        [r0 + 4 * 32], xm6
    vextracti128 [r0 + 20 * 32], m6, 1

    movu        [r0 + 5 * 32], xm1
    vextracti128 [r0 + 21 * 32], m1, 1

    movu        [r0 + 6 * 32], xm10
    vextracti128 [r0 + 22 * 32], m10, 1

    movu        [r0 + 7 * 32], xm8
    vextracti128 [r0 + 23 * 32], m8, 1

    movu        [r0 + 8 * 32], xm4
    vextracti128 [r0 + 24 * 32], m4, 1

    movu        [r0 + 9 * 32], xm3
    vextracti128 [r0 + 25 * 32], m3, 1

    movu        [r0 + 10 * 32], xm12
    vextracti128 [r0 + 26 * 32], m12, 1

    movu        [r0 + 11 * 32], xm5
    vextracti128 [r0 + 27 * 32], m5, 1

    movu        [r0 + 12 * 32], xm14
    vextracti128 [r0 + 28 * 32], m14, 1

    movu        [r0 + 13 * 32], xm2
    vextracti128 [r0 + 29 * 32], m2, 1

    movu        [r0 + 14 * 32], xm11
    vextracti128 [r0 + 30 * 32], m11, 1

    movu        [r0 + 15 * 32], xm0
    vextracti128 [r0 + 31 * 32], m0, 1

    add         r0, 16
    lea         r1, [r1 + 4 * r2]
    dec         r4d
    jnz         .loop
    RET
%endif
%endif
2386
2387 ;-----------------------------------------------------------------
2388 ; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride)
2389 ;-----------------------------------------------------------------
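; Each internal call below performs a classic SIMD butterfly transpose:
; rows are interleaved pairwise with punpcklwd/punpckhwd, then regrouped
; with punpckldq/punpckhdq and punpcklqdq/punpckhqdq until every output
; row holds one source column. Scalar reference of the overall effect
; (an illustrative sketch only, not part of the build):
;
;   void transpose64_c(pixel *dst, const pixel *src, intptr_t stride)
;   {
;       for (int i = 0; i < 64; i++)
;           for (int j = 0; j < 64; j++)
;               dst[j * 64 + i] = src[i * stride + j];
;   }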
2390 %if HIGH_BIT_DEPTH == 1
2391 %if ARCH_X86_64 == 1
2392 INIT_YMM avx2
2393 cglobal transpose8x32_64_internal
2394 movu m0, [r1]
2395 movu m1, [r1 + 32]
2396 movu m2, [r1 + r2]
2397 movu m3, [r1 + r2 + 32]
2398 movu m4, [r1 + 2 * r2]
2399 movu m5, [r1 + 2 * r2 + 32]
2400 movu m6, [r1 + r3]
2401 movu m7, [r1 + r3 + 32]
2402 lea r1, [r1 + 4 * r2]
2403
2404 punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2]
2405 punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2]
2406
2407 punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4]
2408 punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4]
2409
2410 punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2]
2411 punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2]
2412
2413 punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4]
2414 punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4]
2415
2416 punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4]
2417 punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4]
2418
2419 punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4]
2420 punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4]
2421
2422 punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4]
2423 punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4]
2424
2425 punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4]
2426 punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4]
2427
2428 movq [r0 + 0 * 128], xm7
2429 movhps [r0 + 1 * 128], xm7
2430 vextracti128 xm5, m7, 1
2431 movq [r0 + 8 * 128], xm5
2432 movhps [r0 + 9 * 128], xm5
2433
2434 movu m7, [r1]
2435 movu m9, [r1 + 32]
2436 movu m10, [r1 + r2]
2437 movu m11, [r1 + r2 + 32]
2438 movu m12, [r1 + 2 * r2]
2439 movu m13, [r1 + 2 * r2 + 32]
2440 movu m14, [r1 + r3]
2441 movu m15, [r1 + r3 + 32]
2442
2443 punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6]
2444 punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6]
2445
2446 punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8]
2447 punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8]
2448
2449 punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6]
2450 punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6]
2451
2452 punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8]
2453 punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8]
2454
2455 punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8]
2456 punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8]
2457
2458 punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8]
2459 punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8]
2460
2461 punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8]
2462 punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8]
2463
2464 punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8]
2465 punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8]
2466
2467 movq [r0 + 0 * 128 + 8], xm15
2468 movhps [r0 + 1 * 128 + 8], xm15
2469 vextracti128 xm13, m15, 1
2470 movq [r0 + 8 * 128 + 8], xm13
2471 movhps [r0 + 9 * 128 + 8], xm13
2472
2473 punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8]
2474 punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8]
2475
2476 punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8]
2477 punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8]
2478
2479 punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8]
2480 punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8]
2481
2482 punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8]
2483 punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8]
2484
2485 punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8]
2486 punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8]
2487
2488 punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8]
2489 punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8]
2490
2491 punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8]
2492 punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8]
2493
2494 movu [r0 + 2 * 128], xm13
2495 vextracti128 [r0 + 10 * 128], m13, 1
2496
2497 movu [r0 + 3 * 128], xm8
2498 vextracti128 [r0 + 11 * 128], m8, 1
2499
2500 movu [r0 + 4 * 128], xm5
2501 vextracti128 [r0 + 12 * 128], m5, 1
2502
2503 movu [r0 + 5 * 128], xm2
2504 vextracti128 [r0 + 13 * 128], m2, 1
2505
2506 movu [r0 + 6 * 128], xm10
2507 vextracti128 [r0 + 14 * 128], m10, 1
2508
2509 movu [r0 + 7 * 128], xm0
2510 vextracti128 [r0 + 15 * 128], m0, 1
2511
2512 movu [r0 + 16 * 128], xm7
2513 vextracti128 [r0 + 24 * 128], m7, 1
2514
2515 movu [r0 + 17 * 128], xm4
2516 vextracti128 [r0 + 25 * 128], m4, 1
2517
2518 movu [r0 + 18 * 128], xm12
2519 vextracti128 [r0 + 26 * 128], m12, 1
2520
2521 movu [r0 + 19 * 128], xm6
2522 vextracti128 [r0 + 27 * 128], m6, 1
2523
2524 movu [r0 + 20 * 128], xm14
2525 vextracti128 [r0 + 28 * 128], m14, 1
2526
2527 movu [r0 + 21 * 128], xm3
2528 vextracti128 [r0 + 29 * 128], m3, 1
2529
2530 movu [r0 + 22 * 128], xm11
2531 vextracti128 [r0 + 30 * 128], m11, 1
2532
2533 movu [r0 + 23 * 128], xm1
2534 vextracti128 [r0 + 31 * 128], m1, 1
2535 ret
2536
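; Driver: each internal call transposes one 8-row x 32-column source tile
; into a 32-row x 8-column destination tile, so 16 calls (8 row groups x 2
; column halves) cover the whole 64x64 block; r4 bookmarks the next source
; position and r5 the next destination column group between calls.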
2537 cglobal transpose64, 3, 6, 16
2538 add r2, r2
2539 lea r3, [3 * r2]
2540 lea r4, [r1 + 64]
2541 lea r5, [r0 + 16]
2542
2543 call transpose8x32_64_internal
2544 mov r1, r4
2545 lea r0, [r0 + 32 * 128]
2546 call transpose8x32_64_internal
2547 mov r0, r5
2548 lea r5, [r0 + 16]
2549 lea r4, [r1 + 4 * r2]
2550 lea r1, [r4 - 64]
2551 call transpose8x32_64_internal
2552 mov r1, r4
2553 lea r0, [r0 + 32 * 128]
2554 call transpose8x32_64_internal
2555 mov r0, r5
2556 lea r5, [r0 + 16]
2557 lea r4, [r1 + 4 * r2]
2558 lea r1, [r4 - 64]
2559 call transpose8x32_64_internal
2560 mov r1, r4
2561 lea r0, [r0 + 32 * 128]
2562 call transpose8x32_64_internal
2563 mov r0, r5
2564 lea r5, [r0 + 16]
2565 lea r4, [r1 + 4 * r2]
2566 lea r1, [r4 - 64]
2567 call transpose8x32_64_internal
2568 mov r1, r4
2569 lea r0, [r0 + 32 * 128]
2570 call transpose8x32_64_internal
2571 mov r0, r5
2572 lea r5, [r0 + 16]
2573 lea r4, [r1 + 4 * r2]
2574 lea r1, [r4 - 64]
2575 call transpose8x32_64_internal
2576 mov r1, r4
2577 lea r0, [r0 + 32 * 128]
2578 call transpose8x32_64_internal
2579 mov r0, r5
2580 lea r5, [r0 + 16]
2581 lea r4, [r1 + 4 * r2]
2582 lea r1, [r4 - 64]
2583 call transpose8x32_64_internal
2584 mov r1, r4
2585 lea r0, [r0 + 32 * 128]
2586 call transpose8x32_64_internal
2587 mov r0, r5
2588 lea r5, [r0 + 16]
2589 lea r4, [r1 + 4 * r2]
2590 lea r1, [r4 - 64]
2591 call transpose8x32_64_internal
2592 mov r1, r4
2593 lea r0, [r0 + 32 * 128]
2594 call transpose8x32_64_internal
2595 mov r0, r5
2596 lea r4, [r1 + 4 * r2]
2597 lea r1, [r4 - 64]
2598 call transpose8x32_64_internal
2599 mov r1, r4
2600 lea r0, [r0 + 32 * 128]
2601 call transpose8x32_64_internal
2602 RET
2603 %endif
2604 INIT_XMM sse2
2605 cglobal transpose64, 3, 7, 4, dest, src, stride
2606 add r2, r2
2607 mov r3, r0
2608 mov r4, r1
2609 mov r5, 128
2610 mov r6, r0
2611 call transpose8_internal
2612 lea r1, [r1 - 8 + 2 * r2]
2613 lea r0, [r6 + 16]
2614 mov r3, r0
2615 call transpose8_internal
2616 lea r1, [r1 - 8 + 2 * r2]
2617 lea r0, [r6 + 32]
2618 mov r3, r0
2619 call transpose8_internal
2620 lea r1, [r1 - 8 + 2 * r2]
2621 lea r0, [r6 + 48]
2622 mov r3, r0
2623 call transpose8_internal
2624 lea r1, [r1 - 8 + 2 * r2]
2625 lea r0, [r6 + 64]
2626 mov r3, r0
2627 call transpose8_internal
2628 lea r1, [r1 - 8 + 2 * r2]
2629 lea r0, [r6 + 80]
2630 mov r3, r0
2631 call transpose8_internal
2632 lea r1, [r1 - 8 + 2 * r2]
2633 lea r0, [r6 + 96]
2634 mov r3, r0
2635 call transpose8_internal
2636 lea r1, [r1 - 8 + 2 * r2]
2637 lea r0, [r6 + 112]
2638 mov r3, r0
2639 call transpose8_internal
2640
2641 lea r1, [r4 + 16]
2642 lea r0, [r6 + 8 * 128]
2643 mov r3, r0
2644 call transpose8_internal
2645 lea r1, [r1 - 8 + 2 * r2]
2646 lea r0, [r6 + 8 * 128 + 16]
2647 mov r3, r0
2648 call transpose8_internal
2649 lea r1, [r1 - 8 + 2 * r2]
2650 lea r0, [r6 + 8 * 128 + 32]
2651 mov r3, r0
2652 call transpose8_internal
2653 lea r1, [r1 - 8 + 2 * r2]
2654 lea r0, [r6 + 8 * 128 + 48]
2655 mov r3, r0
2656 call transpose8_internal
2657 lea r1, [r1 - 8 + 2 * r2]
2658 lea r0, [r6 + 8 * 128 + 64]
2659 mov r3, r0
2660 call transpose8_internal
2661 lea r1, [r1 - 8 + 2 * r2]
2662 lea r0, [r6 + 8 * 128 + 80]
2663 mov r3, r0
2664 call transpose8_internal
2665 lea r1, [r1 - 8 + 2 * r2]
2666 lea r0, [r6 + 8 * 128 + 96]
2667 mov r3, r0
2668 call transpose8_internal
2669 lea r1, [r1 - 8 + 2 * r2]
2670 lea r0, [r6 + 8 * 128 + 112]
2671 mov r3, r0
2672 call transpose8_internal
2673
2674 lea r1, [r4 + 32]
2675 lea r0, [r6 + 16 * 128]
2676 mov r3, r0
2677 call transpose8_internal
2678 lea r1, [r1 - 8 + 2 * r2]
2679 lea r0, [r6 + 16 * 128 + 16]
2680 mov r3, r0
2681 call transpose8_internal
2682 lea r1, [r1 - 8 + 2 * r2]
2683 lea r0, [r6 + 16 * 128 + 32]
2684 mov r3, r0
2685 call transpose8_internal
2686 lea r1, [r1 - 8 + 2 * r2]
2687 lea r0, [r6 + 16 * 128 + 48]
2688 mov r3, r0
2689 call transpose8_internal
2690 lea r1, [r1 - 8 + 2 * r2]
2691 lea r0, [r6 + 16 * 128 + 64]
2692 mov r3, r0
2693 call transpose8_internal
2694 lea r1, [r1 - 8 + 2 * r2]
2695 lea r0, [r6 + 16 * 128 + 80]
2696 mov r3, r0
2697 call transpose8_internal
2698 lea r1, [r1 - 8 + 2 * r2]
2699 lea r0, [r6 + 16 * 128 + 96]
2700 mov r3, r0
2701 call transpose8_internal
2702 lea r1, [r1 - 8 + 2 * r2]
2703 lea r0, [r6 + 16 * 128 + 112]
2704 mov r3, r0
2705 call transpose8_internal
2706
2707 lea r1, [r4 + 48]
2708 lea r0, [r6 + 24 * 128]
2709 mov r3, r0
2710 call transpose8_internal
2711 lea r1, [r1 - 8 + 2 * r2]
2712 lea r0, [r6 + 24 * 128 + 16]
2713 mov r3, r0
2714 call transpose8_internal
2715 lea r1, [r1 - 8 + 2 * r2]
2716 lea r0, [r6 + 24 * 128 + 32]
2717 mov r3, r0
2718 call transpose8_internal
2719 lea r1, [r1 - 8 + 2 * r2]
2720 lea r0, [r6 + 24 * 128 + 48]
2721 mov r3, r0
2722 call transpose8_internal
2723 lea r1, [r1 - 8 + 2 * r2]
2724 lea r0, [r6 + 24 * 128 + 64]
2725 mov r3, r0
2726 call transpose8_internal
2727 lea r1, [r1 - 8 + 2 * r2]
2728 lea r0, [r6 + 24 * 128 + 80]
2729 mov r3, r0
2730 call transpose8_internal
2731 lea r1, [r1 - 8 + 2 * r2]
2732 lea r0, [r6 + 24 * 128 + 96]
2733 mov r3, r0
2734 call transpose8_internal
2735 lea r1, [r1 - 8 + 2 * r2]
2736 lea r0, [r6 + 24 * 128 + 112]
2737 mov r3, r0
2738 call transpose8_internal
2739
2740 lea r1, [r4 + 64]
2741 lea r0, [r6 + 32 * 128]
2742 mov r3, r0
2743 call transpose8_internal
2744 lea r1, [r1 - 8 + 2 * r2]
2745 lea r0, [r6 + 32 * 128 + 16]
2746 mov r3, r0
2747 call transpose8_internal
2748 lea r1, [r1 - 8 + 2 * r2]
2749 lea r0, [r6 + 32 * 128 + 32]
2750 mov r3, r0
2751 call transpose8_internal
2752 lea r1, [r1 - 8 + 2 * r2]
2753 lea r0, [r6 + 32 * 128 + 48]
2754 mov r3, r0
2755 call transpose8_internal
2756 lea r1, [r1 - 8 + 2 * r2]
2757 lea r0, [r6 + 32 * 128 + 64]
2758 mov r3, r0
2759 call transpose8_internal
2760 lea r1, [r1 - 8 + 2 * r2]
2761 lea r0, [r6 + 32 * 128 + 80]
2762 mov r3, r0
2763 call transpose8_internal
2764 lea r1, [r1 - 8 + 2 * r2]
2765 lea r0, [r6 + 32 * 128 + 96]
2766 mov r3, r0
2767 call transpose8_internal
2768 lea r1, [r1 - 8 + 2 * r2]
2769 lea r0, [r6 + 32 * 128 + 112]
2770 mov r3, r0
2771 call transpose8_internal
2772
2773 lea r1, [r4 + 80]
2774 lea r0, [r6 + 40 * 128]
2775 mov r3, r0
2776 call transpose8_internal
2777 lea r1, [r1 - 8 + 2 * r2]
2778 lea r0, [r6 + 40 * 128 + 16]
2779 mov r3, r0
2780 call transpose8_internal
2781 lea r1, [r1 - 8 + 2 * r2]
2782 lea r0, [r6 + 40 * 128 + 32]
2783 mov r3, r0
2784 call transpose8_internal
2785 lea r1, [r1 - 8 + 2 * r2]
2786 lea r0, [r6 + 40 * 128 + 48]
2787 mov r3, r0
2788 call transpose8_internal
2789 lea r1, [r1 - 8 + 2 * r2]
2790 lea r0, [r6 + 40 * 128 + 64]
2791 mov r3, r0
2792 call transpose8_internal
2793 lea r1, [r1 - 8 + 2 * r2]
2794 lea r0, [r6 + 40 * 128 + 80]
2795 mov r3, r0
2796 call transpose8_internal
2797 lea r1, [r1 - 8 + 2 * r2]
2798 lea r0, [r6 + 40 * 128 + 96]
2799 mov r3, r0
2800 call transpose8_internal
2801 lea r1, [r1 - 8 + 2 * r2]
2802 lea r0, [r6 + 40 * 128 + 112]
2803 mov r3, r0
2804 call transpose8_internal
2805
2806 lea r1, [r4 + 96]
2807 lea r0, [r6 + 48 * 128]
2808 mov r3, r0
2809 call transpose8_internal
2810 lea r1, [r1 - 8 + 2 * r2]
2811 lea r0, [r6 + 48 * 128 + 16]
2812 mov r3, r0
2813 call transpose8_internal
2814 lea r1, [r1 - 8 + 2 * r2]
2815 lea r0, [r6 + 48 * 128 + 32]
2816 mov r3, r0
2817 call transpose8_internal
2818 lea r1, [r1 - 8 + 2 * r2]
2819 lea r0, [r6 + 48 * 128 + 48]
2820 mov r3, r0
2821 call transpose8_internal
2822 lea r1, [r1 - 8 + 2 * r2]
2823 lea r0, [r6 + 48 * 128 + 64]
2824 mov r3, r0
2825 call transpose8_internal
2826 lea r1, [r1 - 8 + 2 * r2]
2827 lea r0, [r6 + 48 * 128 + 80]
2828 mov r3, r0
2829 call transpose8_internal
2830 lea r1, [r1 - 8 + 2 * r2]
2831 lea r0, [r6 + 48 * 128 + 96]
2832 mov r3, r0
2833 call transpose8_internal
2834 lea r1, [r1 - 8 + 2 * r2]
2835 lea r0, [r6 + 48 * 128 + 112]
2836 mov r3, r0
2837 call transpose8_internal
2838
2839 lea r1, [r4 + 112]
2840 lea r0, [r6 + 56 * 128]
2841 mov r3, r0
2842 call transpose8_internal
2843 lea r1, [r1 - 8 + 2 * r2]
2844 lea r0, [r6 + 56 * 128 + 16]
2845 mov r3, r0
2846 call transpose8_internal
2847 lea r1, [r1 - 8 + 2 * r2]
2848 lea r0, [r6 + 56 * 128 + 32]
2849 mov r3, r0
2850 call transpose8_internal
2851 lea r1, [r1 - 8 + 2 * r2]
2852 lea r0, [r6 + 56 * 128 + 48]
2853 mov r3, r0
2854 call transpose8_internal
2855 lea r1, [r1 - 8 + 2 * r2]
2856 lea r0, [r6 + 56 * 128 + 64]
2857 mov r3, r0
2858 call transpose8_internal
2859 lea r1, [r1 - 8 + 2 * r2]
2860 lea r0, [r6 + 56 * 128 + 80]
2861 mov r3, r0
2862 call transpose8_internal
2863 lea r1, [r1 - 8 + 2 * r2]
2864 lea r0, [r6 + 56 * 128 + 96]
2865 mov r3, r0
2866 call transpose8_internal
2867 lea r1, [r1 - 8 + 2 * r2]
2868 lea r0, [r6 + 56 * 128 + 112]
2869 mov r3, r0
2870 call transpose8_internal
2871 RET
2872 %else ;HIGH_BIT_DEPTH == 0
2873 %if ARCH_X86_64 == 1
2874 INIT_YMM avx2
2875
2876 cglobal transpose16x32_avx2
2877 movu m0, [r1]
2878 movu m1, [r1 + r2]
2879 movu m2, [r1 + 2 * r2]
2880 movu m3, [r1 + r3]
2881 lea r1, [r1 + 4 * r2]
2882
2883 movu m4, [r1]
2884 movu m5, [r1 + r2]
2885 movu m6, [r1 + 2 * r2]
2886 movu m7, [r1 + r3]
2887
2888 punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
2889 punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
2890
2891 punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
2892 punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
2893
2894 punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
2895 punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
2896
2897 punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
2898 punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
2899
2900 punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
2901    punpckhwd    m8,          m1                 ;[5 - 8 ; 21 - 24][1 2 3 4]
2902
2903 punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
2904    punpckhwd    m3,          m5                 ;[5 - 8 ; 21 - 24][5 6 7 8]
2905
2906 punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
2907    punpckhwd    m0,          m2                 ;[13- 16; 29 - 32][1 2 3 4]
2908
2909 punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
2910    punpckhwd    m4,          m6                 ;[13- 16; 29 - 32][5 6 7 8]
2911
2912 punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
2913 punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
2914
2915 punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
2916 punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
2917
2918 punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
2919 punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
2920
2921 punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
2922 punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
2923
2924 movq [r0 + 0 * 64], xm6
2925 movhps [r0 + 1 * 64], xm6
2926 vextracti128 xm4, m6, 1
2927 movq [r0 + 16 * 64], xm4
2928 movhps [r0 + 17 * 64], xm4
2929
2930 lea r1, [r1 + 4 * r2]
2931 movu m9, [r1]
2932 movu m10, [r1 + r2]
2933 movu m11, [r1 + 2 * r2]
2934 movu m12, [r1 + r3]
2935 lea r1, [r1 + 4 * r2]
2936
2937 movu m13, [r1]
2938 movu m14, [r1 + r2]
2939 movu m15, [r1 + 2 * r2]
2940 movu m6, [r1 + r3]
2941
2942 punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
2943 punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
2944
2945 punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
2946 punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
2947
2948 punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
2949 punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
2950
2951 punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
2952 punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
2953
2954 punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
2955    punpckhwd    m4,          m10                ;[5 - 8 ; 21 - 24][9 10 11 12]
2956
2957 punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
2958    punpckhwd    m12,         m14                ;[5 - 8 ; 21 - 24][13 14 15 16]
2959
2960 punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
2961 punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
2962
2963 punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
2964 punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
2965
2966 punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
2967 punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
2968
2969 punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
2970 punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
2971
2972 punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
2973 punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
2974
2975 punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
2976 punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
2977
2978
2979 punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2980 punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2981
2982 punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2983 punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2984
2985 punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2986 punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2987
2988 punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2989 punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2990
2991 punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2992 punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2993
2994 punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2995 punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2996
2997 punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2998 punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
2999
3000 movq [r0 + 0 * 64 + 8], xm15
3001 movhps [r0 + 1 * 64 + 8], xm15
3002 vextracti128 xm9, m15, 1
3003 movq [r0 + 16 * 64 + 8], xm9
3004 movhps [r0 + 17 * 64 + 8], xm9
3005
3006 movu [r0 + 2 * 64], xm13
3007 vextracti128 [r0 + 18 * 64], m13, 1
3008
3009 movu [r0 + 3 * 64], xm7
3010 vextracti128 [r0 + 19 * 64], m7, 1
3011
3012 movu [r0 + 4 * 64], xm6
3013 vextracti128 [r0 + 20 * 64], m6, 1
3014
3015 movu [r0 + 5 * 64], xm1
3016 vextracti128 [r0 + 21 * 64], m1, 1
3017
3018 movu [r0 + 6 * 64], xm10
3019 vextracti128 [r0 + 22 * 64], m10, 1
3020
3021 movu [r0 + 7 * 64], xm8
3022 vextracti128 [r0 + 23 * 64], m8, 1
3023
3024 movu [r0 + 8 * 64], xm4
3025 vextracti128 [r0 + 24 * 64], m4, 1
3026
3027 movu [r0 + 9 * 64], xm3
3028 vextracti128 [r0 + 25 * 64], m3, 1
3029
3030 movu [r0 + 10 * 64], xm12
3031 vextracti128 [r0 + 26 * 64], m12, 1
3032
3033 movu [r0 + 11 * 64], xm5
3034 vextracti128 [r0 + 27 * 64], m5, 1
3035
3036 movu [r0 + 12 * 64], xm14
3037 vextracti128 [r0 + 28 * 64], m14, 1
3038
3039 movu [r0 + 13 * 64], xm2
3040 vextracti128 [r0 + 29 * 64], m2, 1
3041
3042 movu [r0 + 14 * 64], xm11
3043 vextracti128 [r0 + 30 * 64], m11, 1
3044
3045 movu [r0 + 15 * 64], xm0
3046 vextracti128 [r0 + 31 * 64], m0, 1
3047 ret
3048
3049 cglobal transpose64, 3, 6, 16
3050
3051 lea r3, [r2 * 3]
3052 lea r4, [r0 + 16]
3053
3054 lea r5, [r1 + 32]
3055 call transpose16x32_avx2
3056 lea r0, [r0 + 32 * 64]
3057 mov r1, r5
3058 call transpose16x32_avx2
3059
3060 mov r0, r4
3061 lea r5, [r1 + 4 * r2]
3062
3063 lea r1, [r5 - 32]
3064 call transpose16x32_avx2
3065 lea r0, [r0 + 32 * 64]
3066 mov r1, r5
3067 call transpose16x32_avx2
3068
3069 lea r0, [r4 + 16]
3070 lea r5, [r1 + 4 * r2]
3071
3072 lea r1, [r5 - 32]
3073 call transpose16x32_avx2
3074 lea r0, [r0 + 32 * 64]
3075 mov r1, r5
3076 call transpose16x32_avx2
3077
3078 lea r5, [r1 + 4 * r2]
3079 lea r0, [r4 + 32]
3080
3081 lea r1, [r5 - 32]
3082 call transpose16x32_avx2
3083 lea r0, [r0 + 32 * 64]
3084 mov r1, r5
3085 call transpose16x32_avx2
3086 RET
3087 %endif
3088
3089 INIT_XMM sse2
3090 cglobal transpose64, 3, 7, 8, dest, src, stride
3091 mov r3, r0
3092 mov r4, r1
3093 mov r5, r0
3094 mov r6, 64
3095 call transpose16_internal
3096 lea r1, [r1 - 8 + 2 * r2]
3097 lea r0, [r3 + 16]
3098 mov r5, r0
3099 call transpose16_internal
3100 lea r1, [r1 - 8 + 2 * r2]
3101 lea r0, [r3 + 32]
3102 mov r5, r0
3103 call transpose16_internal
3104 lea r1, [r1 - 8 + 2 * r2]
3105 lea r0, [r3 + 48]
3106 mov r5, r0
3107 call transpose16_internal
3108
3109 lea r1, [r4 + 16]
3110 lea r0, [r3 + 16 * 64]
3111 mov r5, r0
3112 call transpose16_internal
3113 lea r1, [r1 - 8 + 2 * r2]
3114 lea r0, [r3 + 16 * 64 + 16]
3115 mov r5, r0
3116 call transpose16_internal
3117 lea r1, [r1 - 8 + 2 * r2]
3118 lea r0, [r3 + 16 * 64 + 32]
3119 mov r5, r0
3120 call transpose16_internal
3121 lea r1, [r1 - 8 + 2 * r2]
3122 lea r0, [r3 + 16 * 64 + 48]
3123 mov r5, r0
3124 call transpose16_internal
3125
3126 lea r1, [r4 + 32]
3127 lea r0, [r3 + 32 * 64]
3128 mov r5, r0
3129 call transpose16_internal
3130 lea r1, [r1 - 8 + 2 * r2]
3131 lea r0, [r3 + 32 * 64 + 16]
3132 mov r5, r0
3133 call transpose16_internal
3134 lea r1, [r1 - 8 + 2 * r2]
3135 lea r0, [r3 + 32 * 64 + 32]
3136 mov r5, r0
3137 call transpose16_internal
3138 lea r1, [r1 - 8 + 2 * r2]
3139 lea r0, [r3 + 32 * 64 + 48]
3140 mov r5, r0
3141 call transpose16_internal
3142
3143 lea r1, [r4 + 48]
3144 lea r0, [r3 + 48 * 64]
3145 mov r5, r0
3146 call transpose16_internal
3147 lea r1, [r1 - 8 + 2 * r2]
3148 lea r0, [r3 + 48 * 64 + 16]
3149 mov r5, r0
3150 call transpose16_internal
3151 lea r1, [r1 - 8 + 2 * r2]
3152 lea r0, [r3 + 48 * 64 + 32]
3153 mov r5, r0
3154 call transpose16_internal
3155 lea r1, [r1 - 8 + 2 * r2]
3156 lea r0, [r3 + 48 * 64 + 48]
3157 mov r5, r0
3158 call transpose16_internal
3159 RET
3160 %endif
3161
3162
3163 ;=============================================================================
3164 ; SSIM
3165 ;=============================================================================
3166
3167 ;-----------------------------------------------------------------------------
3168 ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
3169 ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
3170 ;-----------------------------------------------------------------------------
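; Accumulates the raw SSIM sums for two horizontally adjacent 4x4 blocks.
; C reference of the sums[2][4] layout produced here (an illustrative
; sketch, matching the scalar reference used by x264/x265):
;
;   for (int z = 0; z < 2; z++)                 // two 4x4 blocks, side by side
;       for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++)
;           {
;               int a = pix1[y * stride1 + 4 * z + x];
;               int b = pix2[y * stride2 + 4 * z + x];
;               sums[z][0] += a;                // s1
;               sums[z][1] += b;                // s2
;               sums[z][2] += a * a + b * b;    // ss
;               sums[z][3] += a * b;            // s12
;           }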
3171 %macro SSIM_ITER 1
3172 %if HIGH_BIT_DEPTH
3173 movdqu m5, [r0+(%1&1)*r1]
3174 movdqu m6, [r2+(%1&1)*r3]
3175 %else
3176 movq m5, [r0+(%1&1)*r1]
3177 movq m6, [r2+(%1&1)*r3]
3178 punpcklbw m5, m0
3179 punpcklbw m6, m0
3180 %endif
3181 %if %1==1
3182 lea r0, [r0+r1*2]
3183 lea r2, [r2+r3*2]
3184 %endif
3185 %if %1==0
3186 movdqa m1, m5
3187 movdqa m2, m6
3188 %else
3189 paddw m1, m5
3190 paddw m2, m6
3191 %endif
3192 pmaddwd m7, m5, m6
3193 pmaddwd m5, m5
3194 pmaddwd m6, m6
3195 ACCUM paddd, 3, 5, %1
3196 ACCUM paddd, 4, 7, %1
3197 paddd m3, m6
3198 %endmacro
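; After the four SSIM_ITER invocations: m1/m2 hold per-column word sums of
; pix1 and pix2, m3 holds the dword sums of squares (ss) and m4 the dword
; cross products (s12); the shuffle sequence below folds these into the
; sums[2][4] output layout.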
3199
3200 %macro SSIM 0
3201 cglobal pixel_ssim_4x4x2_core, 4,4,8
3202 FIX_STRIDES r1, r3
3203 pxor m0, m0
3204 SSIM_ITER 0
3205 SSIM_ITER 1
3206 SSIM_ITER 2
3207 SSIM_ITER 3
3208 ; PHADDW m1, m2
3209 ; PHADDD m3, m4
3210 movdqa m7, [pw_1]
3211 pshufd m5, m3, q2301
3212 pmaddwd m1, m7
3213 pmaddwd m2, m7
3214 pshufd m6, m4, q2301
3215 packssdw m1, m2
3216 paddd m3, m5
3217 pshufd m1, m1, q3120
3218 paddd m4, m6
3219 pmaddwd m1, m7
3220 punpckhdq m5, m3, m4
3221 punpckldq m3, m4
3222
3223 %if UNIX64
3224 %define t0 r4
3225 %else
3226 %define t0 rax
3227 mov t0, r4mp
3228 %endif
3229
3230 movq [t0+ 0], m1
3231 movq [t0+ 8], m3
3232 movhps [t0+16], m1
3233 movq [t0+24], m5
3234 RET
3235
3236 ;-----------------------------------------------------------------------------
3237 ; float pixel_ssim_end4( int sum0[5][4], int sum1[5][4], int width )
3238 ;-----------------------------------------------------------------------------
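; Reduces the accumulated sums to one SSIM term per overlapped 4x4 pair
; (four pairs per call). C reference of the per-pair math, with ssim_c1
; and ssim_c2 as defined in the data section (an illustrative sketch):
;
;   float ssim_end1(int s1, int s2, int ss, int s12)
;   {
;       int vars  = ss * 64 - s1 * s1 - s2 * s2;
;       int covar = s12 * 64 - s1 * s2;
;       return (float)(2 * s1 * s2 + ssim_c1) * (float)(2 * covar + ssim_c2)
;            / ((float)(s1 * s1 + s2 * s2 + ssim_c1) * (float)(vars + ssim_c2));
;   }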
3239 cglobal pixel_ssim_end4, 2,3
3240 mov r2d, r2m
3241 mova m0, [r0+ 0]
3242 mova m1, [r0+16]
3243 mova m2, [r0+32]
3244 mova m3, [r0+48]
3245 mova m4, [r0+64]
3246 paddd m0, [r1+ 0]
3247 paddd m1, [r1+16]
3248 paddd m2, [r1+32]
3249 paddd m3, [r1+48]
3250 paddd m4, [r1+64]
3251 paddd m0, m1
3252 paddd m1, m2
3253 paddd m2, m3
3254 paddd m3, m4
3255 TRANSPOSE4x4D 0, 1, 2, 3, 4
3256
3257 ; s1=m0, s2=m1, ss=m2, s12=m3
3258 %if BIT_DEPTH == 10
3259 cvtdq2ps m0, m0
3260 cvtdq2ps m1, m1
3261 cvtdq2ps m2, m2
3262 cvtdq2ps m3, m3
3263 mulps m4, m0, m1 ; s1*s2
3264 mulps m0, m0 ; s1*s1
3265 mulps m1, m1 ; s2*s2
3266 mulps m2, [pf_64] ; ss*64
3267 mulps m3, [pf_128] ; s12*128
3268 addps m4, m4 ; s1*s2*2
3269 addps m0, m1 ; s1*s1 + s2*s2
3270 subps m2, m0 ; vars
3271 subps m3, m4 ; covar*2
3272 movaps m1, [ssim_c1]
3273 addps m4, m1 ; s1*s2*2 + ssim_c1
3274 addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1
3275 movaps m1, [ssim_c2]
3276 addps m2, m1 ; vars + ssim_c2
3277 addps m3, m1 ; covar*2 + ssim_c2
3278 %else
3279 pmaddwd m4, m1, m0 ; s1*s2
3280 pslld m1, 16
3281 por m0, m1
3282 pmaddwd m0, m0 ; s1*s1 + s2*s2
3283 pslld m4, 1
3284 pslld m3, 7
3285 pslld m2, 6
3286 psubd m3, m4 ; covar*2
3287 psubd m2, m0 ; vars
3288 mova m1, [ssim_c1]
3289 paddd m0, m1
3290 paddd m4, m1
3291 mova m1, [ssim_c2]
3292 paddd m3, m1
3293 paddd m2, m1
3294 cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1)
3295 cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1)
3296 cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2)
3297 cvtdq2ps m2, m2 ; (float)(vars + ssim_c2)
3298 %endif
3299 mulps m4, m3
3300 mulps m0, m2
3301 divps m4, m0 ; ssim
3302
3303 cmp r2d, 4
3304 je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level
3305 neg r2
3306
3307 %ifdef PIC
3308 lea r3, [mask_ff + 16]
3309 %xdefine %%mask r3
3310 %else
3311 %xdefine %%mask mask_ff + 16
3312 %endif
3313 %if cpuflag(avx)
3314 andps m4, [%%mask + r2*4]
3315 %else
3316 movups m0, [%%mask + r2*4]
3317 andps m4, m0
3318 %endif
3319
3320 .skip:
3321 movhlps m0, m4
3322 addps m0, m4
3323 %if cpuflag(ssse3)
3324 movshdup m4, m0
3325 %else
3326 pshuflw m4, m0, q0032
3327 %endif
3328 addss m0, m4
3329 %if ARCH_X86_64 == 0
3330 movss r0m, m0
3331 fld dword r0m
3332 %endif
3333 RET
3334 %endmacro ; SSIM
3335
3336 INIT_XMM sse2
3337 SSIM
3338 INIT_XMM avx
3339 SSIM
3340
3341 ;-----------------------------------------------------------------
3342 ; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
3343 ;-----------------------------------------------------------------
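; Horizontal 2:1 decimation of a 128-pixel row: every output pixel is the
; rounded average of one source pair. C reference (an illustrative sketch
; only; the stride argument is unused):
;
;   void scale1D_128to64_c(pixel *dst, const pixel *src, intptr_t stride)
;   {
;       for (int x = 0; x < 64; x++)
;           dst[x] = (src[2 * x] + src[2 * x + 1] + 1) >> 1;
;   }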
3344 INIT_XMM ssse3
3345 cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride
3346 %if HIGH_BIT_DEPTH
3347 mova m7, [deinterleave_word_shuf]
3348
3349 movu m0, [r1]
3350 palignr m1, m0, 2
3351 movu m2, [r1 + 16]
3352 palignr m3, m2, 2
3353 movu m4, [r1 + 32]
3354 palignr m5, m4, 2
3355 movu m6, [r1 + 48]
3356 pavgw m0, m1
3357 palignr m1, m6, 2
3358 pavgw m2, m3
3359 pavgw m4, m5
3360 pavgw m6, m1
3361 pshufb m0, m0, m7
3362 pshufb m2, m2, m7
3363 pshufb m4, m4, m7
3364 pshufb m6, m6, m7
3365 punpcklqdq m0, m2
3366 movu [r0], m0
3367 punpcklqdq m4, m6
3368 movu [r0 + 16], m4
3369
3370
3371
3372 movu m0, [r1 + 64]
3373 palignr m1, m0, 2
3374 movu m2, [r1 + 80]
3375 palignr m3, m2, 2
3376 movu m4, [r1 + 96]
3377 palignr m5, m4, 2
3378 movu m6, [r1 + 112]
3379 pavgw m0, m1
3380 palignr m1, m6, 2
3381 pavgw m2, m3
3382 pavgw m4, m5
3383 pavgw m6, m1
3384 pshufb m0, m0, m7
3385 pshufb m2, m2, m7
3386 pshufb m4, m4, m7
3387 pshufb m6, m6, m7
3388 punpcklqdq m0, m2
3389 movu [r0 + 32], m0
3390 punpcklqdq m4, m6
3391 movu [r0 + 48], m4
3392
3393 movu m0, [r1 + 128]
3394 palignr m1, m0, 2
3395 movu m2, [r1 + 144]
3396 palignr m3, m2, 2
3397 movu m4, [r1 + 160]
3398 palignr m5, m4, 2
3399 movu m6, [r1 + 176]
3400 pavgw m0, m1
3401 palignr m1, m6, 2
3402 pavgw m2, m3
3403 pavgw m4, m5
3404 pavgw m6, m1
3405 pshufb m0, m0, m7
3406 pshufb m2, m2, m7
3407 pshufb m4, m4, m7
3408 pshufb m6, m6, m7
3409
3410 punpcklqdq m0, m2
3411 movu [r0 + 64], m0
3412 punpcklqdq m4, m6
3413 movu [r0 + 80], m4
3414
3415 movu m0, [r1 + 192]
3416 palignr m1, m0, 2
3417 movu m2, [r1 + 208]
3418 palignr m3, m2, 2
3419 movu m4, [r1 + 224]
3420 palignr m5, m4, 2
3421 movu m6, [r1 + 240]
3422 pavgw m0, m1
3423 palignr m1, m6, 2
3424 pavgw m2, m3
3425 pavgw m4, m5
3426 pavgw m6, m1
3427 pshufb m0, m0, m7
3428 pshufb m2, m2, m7
3429 pshufb m4, m4, m7
3430 pshufb m6, m6, m7
3431
3432 punpcklqdq m0, m2
3433 movu [r0 + 96], m0
3434 punpcklqdq m4, m6
3435 movu [r0 + 112], m4
3436
3437 %else
3438 mova m7, [deinterleave_shuf]
3439
3440 movu m0, [r1]
3441 palignr m1, m0, 1
3442 movu m2, [r1 + 16]
3443 palignr m3, m2, 1
3444 movu m4, [r1 + 32]
3445 palignr m5, m4, 1
3446 movu m6, [r1 + 48]
3447
3448 pavgb m0, m1
3449
3450 palignr m1, m6, 1
3451
3452 pavgb m2, m3
3453 pavgb m4, m5
3454 pavgb m6, m1
3455
3456 pshufb m0, m0, m7
3457 pshufb m2, m2, m7
3458 pshufb m4, m4, m7
3459 pshufb m6, m6, m7
3460
3461 punpcklqdq m0, m2
3462 movu [r0], m0
3463 punpcklqdq m4, m6
3464 movu [r0 + 16], m4
3465
3466 movu m0, [r1 + 64]
3467 palignr m1, m0, 1
3468 movu m2, [r1 + 80]
3469 palignr m3, m2, 1
3470 movu m4, [r1 + 96]
3471 palignr m5, m4, 1
3472 movu m6, [r1 + 112]
3473
3474 pavgb m0, m1
3475
3476 palignr m1, m6, 1
3477
3478 pavgb m2, m3
3479 pavgb m4, m5
3480 pavgb m6, m1
3481
3482 pshufb m0, m0, m7
3483 pshufb m2, m2, m7
3484 pshufb m4, m4, m7
3485 pshufb m6, m6, m7
3486
3487 punpcklqdq m0, m2
3488 movu [r0 + 32], m0
3489 punpcklqdq m4, m6
3490 movu [r0 + 48], m4
3491 %endif
3492 RET
3493
3494 %if HIGH_BIT_DEPTH == 1
3495 INIT_YMM avx2
3496 cglobal scale1D_128to64, 2, 2, 3
3497 pxor m2, m2
3498
3499 movu m0, [r1]
3500 movu m1, [r1 + 32]
3501 phaddw m0, m1
3502 pavgw m0, m2
3503 vpermq m0, m0, 0xD8
3504 movu [r0], m0
3505
3506 movu m0, [r1 + 64]
3507 movu m1, [r1 + 96]
3508 phaddw m0, m1
3509 pavgw m0, m2
3510 vpermq m0, m0, 0xD8
3511 movu [r0 + 32], m0
3512
3513 movu m0, [r1 + 128]
3514 movu m1, [r1 + 160]
3515 phaddw m0, m1
3516 pavgw m0, m2
3517 vpermq m0, m0, 0xD8
3518 movu [r0 + 64], m0
3519
3520 movu m0, [r1 + 192]
3521 movu m1, [r1 + 224]
3522 phaddw m0, m1
3523 pavgw m0, m2
3524 vpermq m0, m0, 0xD8
3525 movu [r0 + 96], m0
3526 RET
3527 %else ; HIGH_BIT_DEPTH == 0
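; In the 8-bit AVX2 path below, pmaddubsw against pb_1 sums adjacent byte
; pairs into words, pavgw against zero applies the (x + 1) >> 1 rounding,
; and packuswb plus vpermq restore byte order, so each pair again becomes
; (a + b + 1) >> 1 as in the SSSE3 path above.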
3528 INIT_YMM avx2
3529 cglobal scale1D_128to64, 2, 2, 4
3530 pxor m2, m2
3531 mova m3, [pb_1]
3532
3533 movu m0, [r1]
3534 pmaddubsw m0, m0, m3
3535 pavgw m0, m2
3536 movu m1, [r1 + 32]
3537 pmaddubsw m1, m1, m3
3538 pavgw m1, m2
3539 packuswb m0, m1
3540 vpermq m0, m0, 0xD8
3541 movu [r0], m0
3542
3543 movu m0, [r1 + 64]
3544 pmaddubsw m0, m0, m3
3545 pavgw m0, m2
3546 movu m1, [r1 + 96]
3547 pmaddubsw m1, m1, m3
3548 pavgw m1, m2
3549 packuswb m0, m1
3550 vpermq m0, m0, 0xD8
3551 movu [r0 + 32], m0
3552 RET
3553 %endif
3554
3555 ;-----------------------------------------------------------------
3556 ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
3557 ;-----------------------------------------------------------------
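; 2:1 downscale in both directions: each output pixel is the truncating
; average (i + j + k + l + 2) >> 2 of a 2x2 source quad. pavg rounds up,
; so cascading it twice can overshoot by one; the xor/and sequence in the
; loop subtracts that bias: with s = avg(i,j) and t = avg(k,l),
;     result = avg(s,t) - (((i ^ j) | (k ^ l)) & (s ^ t) & 1)
; C reference (an illustrative sketch only):
;
;   void scale2D_64to32_c(pixel *dst, const pixel *src, intptr_t stride)
;   {
;       for (int y = 0; y < 32; y++)
;           for (int x = 0; x < 32; x++)
;               dst[y * 32 + x] = (src[(2 * y)     * stride + 2 * x]
;                                + src[(2 * y)     * stride + 2 * x + 1]
;                                + src[(2 * y + 1) * stride + 2 * x]
;                                + src[(2 * y + 1) * stride + 2 * x + 1]
;                                + 2) >> 2;
;   }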
3558 %if HIGH_BIT_DEPTH
3559 INIT_XMM ssse3
3560 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
3561 mov r3d, 32
3562 mova m7, [deinterleave_word_shuf]
3563 add r2, r2
3564 .loop:
3565 movu m0, [r1] ;i
3566 psrld m1, m0, 16 ;j
3567 movu m2, [r1 + r2] ;k
3568 psrld m3, m2, 16 ;l
3569 movu m4, m0
3570 movu m5, m2
3571 pxor m4, m1 ;i^j
3572 pxor m5, m3 ;k^l
3573 por m4, m5 ;ij|kl
3574 pavgw m0, m1 ;s
3575 pavgw m2, m3 ;t
3576 movu m5, m0
3577 pavgw m0, m2 ;(s+t+1)/2
3578 pxor m5, m2 ;s^t
3579 pand m4, m5 ;(ij|kl)&st
3580 pand m4, [hmulw_16p]
3581 psubw m0, m4 ;Result
3582 movu m1, [r1 + 16] ;i
3583 psrld m2, m1, 16 ;j
3584 movu m3, [r1 + r2 + 16] ;k
3585 psrld m4, m3, 16 ;l
3586 movu m5, m1
3587 movu m6, m3
3588 pxor m5, m2 ;i^j
3589 pxor m6, m4 ;k^l
3590 por m5, m6 ;ij|kl
3591 pavgw m1, m2 ;s
3592 pavgw m3, m4 ;t
3593 movu m6, m1
3594 pavgw m1, m3 ;(s+t+1)/2
3595 pxor m6, m3 ;s^t
3596 pand m5, m6 ;(ij|kl)&st
3597 pand m5, [hmulw_16p]
3598 psubw m1, m5 ;Result
3599 pshufb m0, m7
3600 pshufb m1, m7
3601
3602 punpcklqdq m0, m1
3603 movu [r0], m0
3604
3605 movu m0, [r1 + 32] ;i
3606 psrld m1, m0, 16 ;j
3607 movu m2, [r1 + r2 + 32] ;k
3608 psrld m3, m2, 16 ;l
3609 movu m4, m0
3610 movu m5, m2
3611 pxor m4, m1 ;i^j
3612 pxor m5, m3 ;k^l
3613 por m4, m5 ;ij|kl
3614 pavgw m0, m1 ;s
3615 pavgw m2, m3 ;t
3616 movu m5, m0
3617 pavgw m0, m2 ;(s+t+1)/2
3618 pxor m5, m2 ;s^t
3619 pand m4, m5 ;(ij|kl)&st
3620 pand m4, [hmulw_16p]
3621 psubw m0, m4 ;Result
3622 movu m1, [r1 + 48] ;i
3623 psrld m2, m1, 16 ;j
3624 movu m3, [r1 + r2 + 48] ;k
3625 psrld m4, m3, 16 ;l
3626 movu m5, m1
3627 movu m6, m3
3628 pxor m5, m2 ;i^j
3629 pxor m6, m4 ;k^l
3630 por m5, m6 ;ij|kl
3631 pavgw m1, m2 ;s
3632 pavgw m3, m4 ;t
3633 movu m6, m1
3634 pavgw m1, m3 ;(s+t+1)/2
3635 pxor m6, m3 ;s^t
3636 pand m5, m6 ;(ij|kl)&st
3637 pand m5, [hmulw_16p]
3638 psubw m1, m5 ;Result
3639 pshufb m0, m7
3640 pshufb m1, m7
3641
3642 punpcklqdq m0, m1
3643 movu [r0 + 16], m0
3644
3645 movu m0, [r1 + 64] ;i
3646 psrld m1, m0, 16 ;j
3647 movu m2, [r1 + r2 + 64] ;k
3648 psrld m3, m2, 16 ;l
3649 movu m4, m0
3650 movu m5, m2
3651 pxor m4, m1 ;i^j
3652 pxor m5, m3 ;k^l
3653 por m4, m5 ;ij|kl
3654 pavgw m0, m1 ;s
3655 pavgw m2, m3 ;t
3656 movu m5, m0
3657 pavgw m0, m2 ;(s+t+1)/2
3658 pxor m5, m2 ;s^t
3659 pand m4, m5 ;(ij|kl)&st
3660 pand m4, [hmulw_16p]
3661 psubw m0, m4 ;Result
3662 movu m1, [r1 + 80] ;i
3663 psrld m2, m1, 16 ;j
3664 movu m3, [r1 + r2 + 80] ;k
3665 psrld m4, m3, 16 ;l
3666 movu m5, m1
3667 movu m6, m3
3668 pxor m5, m2 ;i^j
3669 pxor m6, m4 ;k^l
3670 por m5, m6 ;ij|kl
3671 pavgw m1, m2 ;s
3672 pavgw m3, m4 ;t
3673 movu m6, m1
3674 pavgw m1, m3 ;(s+t+1)/2
3675 pxor m6, m3 ;s^t
3676 pand m5, m6 ;(ij|kl)&st
3677 pand m5, [hmulw_16p]
3678 psubw m1, m5 ;Result
3679 pshufb m0, m7
3680 pshufb m1, m7
3681
3682 punpcklqdq m0, m1
3683 movu [r0 + 32], m0
3684
3685 movu m0, [r1 + 96] ;i
3686 psrld m1, m0, 16 ;j
3687 movu m2, [r1 + r2 + 96] ;k
3688 psrld m3, m2, 16 ;l
3689 movu m4, m0
3690 movu m5, m2
3691 pxor m4, m1 ;i^j
3692 pxor m5, m3 ;k^l
3693 por m4, m5 ;ij|kl
3694 pavgw m0, m1 ;s
3695 pavgw m2, m3 ;t
3696 movu m5, m0
3697 pavgw m0, m2 ;(s+t+1)/2
3698 pxor m5, m2 ;s^t
3699 pand m4, m5 ;(ij|kl)&st
3700 pand m4, [hmulw_16p]
3701 psubw m0, m4 ;Result
3702 movu m1, [r1 + 112] ;i
3703 psrld m2, m1, 16 ;j
3704 movu m3, [r1 + r2 + 112] ;k
3705 psrld m4, m3, 16 ;l
3706 movu m5, m1
3707 movu m6, m3
3708 pxor m5, m2 ;i^j
3709 pxor m6, m4 ;k^l
3710 por m5, m6 ;ij|kl
3711 pavgw m1, m2 ;s
3712 pavgw m3, m4 ;t
3713 movu m6, m1
3714 pavgw m1, m3 ;(s+t+1)/2
3715 pxor m6, m3 ;s^t
3716 pand m5, m6 ;(ij|kl)&st
3717 pand m5, [hmulw_16p]
3718 psubw m1, m5 ;Result
3719 pshufb m0, m7
3720 pshufb m1, m7
3721
3722 punpcklqdq m0, m1
3723 movu [r0 + 48], m0
3724 lea r0, [r0 + 64]
3725 lea r1, [r1 + 2 * r2]
3726 dec r3d
3727 jnz .loop
3728 RET
3729 %else
3730
3731 INIT_XMM ssse3
3732 cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
3733 mov r3d, 32
3734 mova m7, [deinterleave_shuf]
3735 .loop:
3736
3737 movu m0, [r1] ;i
3738 psrlw m1, m0, 8 ;j
3739 movu m2, [r1 + r2] ;k
3740 psrlw m3, m2, 8 ;l
3741 movu m4, m0
3742 movu m5, m2
3743
3744 pxor m4, m1 ;i^j
3745 pxor m5, m3 ;k^l
3746 por m4, m5 ;ij|kl
3747
3748 pavgb m0, m1 ;s
3749 pavgb m2, m3 ;t
3750 movu m5, m0
3751 pavgb m0, m2 ;(s+t+1)/2
3752 pxor m5, m2 ;s^t
3753 pand m4, m5 ;(ij|kl)&st
3754 pand m4, [hmul_16p]
3755 psubb m0, m4 ;Result
3756
3757 movu m1, [r1 + 16] ;i
3758 psrlw m2, m1, 8 ;j
3759 movu m3, [r1 + r2 + 16] ;k
3760 psrlw m4, m3, 8 ;l
3761 movu m5, m1
3762 movu m6, m3
3763
3764 pxor m5, m2 ;i^j
3765 pxor m6, m4 ;k^l
3766 por m5, m6 ;ij|kl
3767
3768 pavgb m1, m2 ;s
3769 pavgb m3, m4 ;t
3770 movu m6, m1
3771 pavgb m1, m3 ;(s+t+1)/2
3772 pxor m6, m3 ;s^t
3773 pand m5, m6 ;(ij|kl)&st
3774 pand m5, [hmul_16p]
3775 psubb m1, m5 ;Result
3776
3777 pshufb m0, m0, m7
3778 pshufb m1, m1, m7
3779
3780 punpcklqdq m0, m1
3781 movu [r0], m0
3782
3783 movu m0, [r1 + 32] ;i
3784 psrlw m1, m0, 8 ;j
3785 movu m2, [r1 + r2 + 32] ;k
3786 psrlw m3, m2, 8 ;l
3787 movu m4, m0
3788 movu m5, m2
3789
3790 pxor m4, m1 ;i^j
3791 pxor m5, m3 ;k^l
3792 por m4, m5 ;ij|kl
3793
3794 pavgb m0, m1 ;s
3795 pavgb m2, m3 ;t
3796 movu m5, m0
3797 pavgb m0, m2 ;(s+t+1)/2
3798 pxor m5, m2 ;s^t
3799 pand m4, m5 ;(ij|kl)&st
3800 pand m4, [hmul_16p]
3801 psubb m0, m4 ;Result
3802
3803 movu m1, [r1 + 48] ;i
3804 psrlw m2, m1, 8 ;j
3805 movu m3, [r1 + r2 + 48] ;k
3806 psrlw m4, m3, 8 ;l
3807 movu m5, m1
3808 movu m6, m3
3809
3810 pxor m5, m2 ;i^j
3811 pxor m6, m4 ;k^l
3812 por m5, m6 ;ij|kl
3813
3814 pavgb m1, m2 ;s
3815 pavgb m3, m4 ;t
3816 movu m6, m1
3817 pavgb m1, m3 ;(s+t+1)/2
3818 pxor m6, m3 ;s^t
3819 pand m5, m6 ;(ij|kl)&st
3820 pand m5, [hmul_16p]
3821 psubb m1, m5 ;Result
3822
3823 pshufb m0, m0, m7
3824 pshufb m1, m1, m7
3825
3826 punpcklqdq m0, m1
3827 movu [r0 + 16], m0
3828
3829 lea r0, [r0 + 32]
3830 lea r1, [r1 + 2 * r2]
3831 dec r3d
3832 jnz .loop
3833 RET
3834 %endif
3835
3836
3837 ;-----------------------------------------------------------------------------
3838 ; void pixel_sub_ps_4x4(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3839 ;-----------------------------------------------------------------------------
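; Widened residual: dest = src0 - src1 stored as int16_t, with all strides
; given in elements. C reference shared by every pixel_sub_ps_WxH variant
; below (an illustrative sketch only):
;
;   for (int y = 0; y < H; y++)
;   {
;       for (int x = 0; x < W; x++)
;           dest[x] = (int16_t)(src0[x] - src1[x]);
;       dest += deststride;
;       src0 += srcstride0;
;       src1 += srcstride1;
;   }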
3840 %if HIGH_BIT_DEPTH
3841 INIT_XMM sse2
3842 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3843 add r4, r4
3844 add r5, r5
3845 add r1, r1
3846 movh m0, [r2]
3847 movh m2, [r2 + r4]
3848 movh m1, [r3]
3849 movh m3, [r3 + r5]
3850 lea r2, [r2 + r4 * 2]
3851 lea r3, [r3 + r5 * 2]
3852 movh m4, [r2]
3853 movh m6, [r2 + r4]
3854 movh m5, [r3]
3855 movh m7, [r3 + r5]
3856
3857 psubw m0, m1
3858 psubw m2, m3
3859 psubw m4, m5
3860 psubw m6, m7
3861
3862 movh [r0], m0
3863 movh [r0 + r1], m2
3864 lea r0, [r0 + r1 * 2]
3865 movh [r0], m4
3866 movh [r0 + r1], m6
3867
3868 RET
3869 %else
3870 INIT_XMM sse4
3871 cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3872 add r1, r1
3873 movd m0, [r2]
3874 movd m2, [r2 + r4]
3875 movd m1, [r3]
3876 movd m3, [r3 + r5]
3877 lea r2, [r2 + r4 * 2]
3878 lea r3, [r3 + r5 * 2]
3879 movd m4, [r2]
3880 movd m6, [r2 + r4]
3881 movd m5, [r3]
3882 movd m7, [r3 + r5]
3883 punpckldq m0, m2
3884 punpckldq m1, m3
3885 punpckldq m4, m6
3886 punpckldq m5, m7
3887 pmovzxbw m0, m0
3888 pmovzxbw m1, m1
3889 pmovzxbw m4, m4
3890 pmovzxbw m5, m5
3891
3892 psubw m0, m1
3893 psubw m4, m5
3894
3895 movh [r0], m0
3896 movhps [r0 + r1], m0
3897 movh [r0 + r1 * 2], m4
3898 lea r0, [r0 + r1 * 2]
3899 movhps [r0 + r1], m4
3900
3901 RET
3902 %endif
3903
3904
3905 ;-----------------------------------------------------------------------------
3906 ; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3907 ;-----------------------------------------------------------------------------
3908 %macro PIXELSUB_PS_W4_H4 2
3909 %if HIGH_BIT_DEPTH
3910 cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3911 mov r6d, %2/4
3912 add r4, r4
3913 add r5, r5
3914 add r1, r1
3915 .loop:
3916 movh m0, [r2]
3917 movh m2, [r2 + r4]
3918 movh m1, [r3]
3919 movh m3, [r3 + r5]
3920 lea r2, [r2 + r4 * 2]
3921 lea r3, [r3 + r5 * 2]
3922 movh m4, [r2]
3923 movh m6, [r2 + r4]
3924 movh m5, [r3]
3925 movh m7, [r3 + r5]
3926 dec r6d
3927 lea r2, [r2 + r4 * 2]
3928 lea r3, [r3 + r5 * 2]
3929
3930 psubw m0, m1
3931 psubw m2, m3
3932 psubw m4, m5
3933 psubw m6, m7
3934
3935 movh [r0], m0
3936 movh [r0 + r1], m2
3937 movh [r0 + r1 * 2], m4
3938 lea r0, [r0 + r1 * 2]
3939 movh [r0 + r1], m6
3940 lea r0, [r0 + r1 * 2]
3941
3942 jnz .loop
3943 RET
3944 %else
3945 cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
3946 mov r6d, %2/4
3947 add r1, r1
3948 .loop:
3949 movd m0, [r2]
3950 movd m2, [r2 + r4]
3951 movd m1, [r3]
3952 movd m3, [r3 + r5]
3953 lea r2, [r2 + r4 * 2]
3954 lea r3, [r3 + r5 * 2]
3955 movd m4, [r2]
3956 movd m6, [r2 + r4]
3957 movd m5, [r3]
3958 movd m7, [r3 + r5]
3959 dec r6d
3960 lea r2, [r2 + r4 * 2]
3961 lea r3, [r3 + r5 * 2]
3962 punpckldq m0, m2
3963 punpckldq m1, m3
3964 punpckldq m4, m6
3965 punpckldq m5, m7
3966 pmovzxbw m0, m0
3967 pmovzxbw m1, m1
3968 pmovzxbw m4, m4
3969 pmovzxbw m5, m5
3970
3971 psubw m0, m1
3972 psubw m4, m5
3973
3974 movh [r0], m0
3975 movhps [r0 + r1], m0
3976 movh [r0 + r1 * 2], m4
3977 lea r0, [r0 + r1 * 2]
3978 movhps [r0 + r1], m4
3979 lea r0, [r0 + r1 * 2]
3980
3981 jnz .loop
3982 RET
3983 %endif
3984 %endmacro
3985
3986 %if HIGH_BIT_DEPTH
3987 INIT_XMM sse2
3988 PIXELSUB_PS_W4_H4 4, 8
3989 %else
3990 INIT_XMM sse4
3991 PIXELSUB_PS_W4_H4 4, 8
3992 %endif
3993
3994
3995 ;-----------------------------------------------------------------------------
3996 ; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
3997 ;-----------------------------------------------------------------------------
3998 %macro PIXELSUB_PS_W8_H4 2
3999 %if HIGH_BIT_DEPTH
4000 cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4001 mov r6d, %2/4
4002 add r4, r4
4003 add r5, r5
4004 add r1, r1
4005 .loop:
4006 movu m0, [r2]
4007 movu m2, [r2 + r4]
4008 movu m1, [r3]
4009 movu m3, [r3 + r5]
4010 lea r2, [r2 + r4 * 2]
4011 lea r3, [r3 + r5 * 2]
4012 movu m4, [r2]
4013 movu m6, [r2 + r4]
4014 movu m5, [r3]
4015 movu m7, [r3 + r5]
4016 dec r6d
4017 lea r2, [r2 + r4 * 2]
4018 lea r3, [r3 + r5 * 2]
4019
4020 psubw m0, m1
4021 psubw m2, m3
4022 psubw m4, m5
4023 psubw m6, m7
4024
4025 movu [r0], m0
4026 movu [r0 + r1], m2
4027 movu [r0 + r1 * 2], m4
4028 lea r0, [r0 + r1 * 2]
4029 movu [r0 + r1], m6
4030 lea r0, [r0 + r1 * 2]
4031
4032 jnz .loop
4033 RET
4034 %else
4035 cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4036 mov r6d, %2/4
4037 add r1, r1
4038 .loop:
4039 movh m0, [r2]
4040 movh m2, [r2 + r4]
4041 movh m1, [r3]
4042 movh m3, [r3 + r5]
4043 lea r2, [r2 + r4 * 2]
4044 lea r3, [r3 + r5 * 2]
4045 movh m4, [r2]
4046 movh m6, [r2 + r4]
4047 movh m5, [r3]
4048 movh m7, [r3 + r5]
4049 dec r6d
4050 lea r2, [r2 + r4 * 2]
4051 lea r3, [r3 + r5 * 2]
4052 pmovzxbw m0, m0
4053 pmovzxbw m1, m1
4054 pmovzxbw m2, m2
4055 pmovzxbw m3, m3
4056 pmovzxbw m4, m4
4057 pmovzxbw m5, m5
4058 pmovzxbw m6, m6
4059 pmovzxbw m7, m7
4060
4061 psubw m0, m1
4062 psubw m2, m3
4063 psubw m4, m5
4064 psubw m6, m7
4065
4066 movu [r0], m0
4067 movu [r0 + r1], m2
4068 movu [r0 + r1 * 2], m4
4069 lea r0, [r0 + r1 * 2]
4070 movu [r0 + r1], m6
4071 lea r0, [r0 + r1 * 2]
4072
4073 jnz .loop
4074 RET
4075 %endif
4076 %endmacro
4077
4078 %if HIGH_BIT_DEPTH
4079 INIT_XMM sse2
4080 PIXELSUB_PS_W8_H4 8, 8
4081 PIXELSUB_PS_W8_H4 8, 16
4082 %else
4083 INIT_XMM sse4
4084 PIXELSUB_PS_W8_H4 8, 8
4085 PIXELSUB_PS_W8_H4 8, 16
4086 %endif
4087
4088
4089 ;-----------------------------------------------------------------------------
4090 ; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
4091 ;-----------------------------------------------------------------------------
4092 %macro PIXELSUB_PS_W16_H4 2
4093 %if HIGH_BIT_DEPTH
4094 cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4095 mov r6d, %2/4
4096 add r4, r4
4097 add r5, r5
4098 add r1, r1
4099 .loop:
4100 movu m0, [r2]
4101 movu m2, [r2 + 16]
4102 movu m1, [r3]
4103 movu m3, [r3 + 16]
4104 movu m4, [r2 + r4]
4105 movu m6, [r2 + r4 + 16]
4106 movu m5, [r3 + r5]
4107 movu m7, [r3 + r5 + 16]
4108 dec r6d
4109 lea r2, [r2 + r4 * 2]
4110 lea r3, [r3 + r5 * 2]
4111
4112 psubw m0, m1
4113 psubw m2, m3
4114 psubw m4, m5
4115 psubw m6, m7
4116
4117 movu [r0], m0
4118 movu [r0 + 16], m2
4119 movu [r0 + r1], m4
4120 movu [r0 + r1 + 16], m6
4121
4122 movu m0, [r2]
4123 movu m2, [r2 + 16]
4124 movu m1, [r3]
4125 movu m3, [r3 + 16]
4126 movu m4, [r2 + r4]
4127 movu m5, [r3 + r5]
4128 movu m6, [r2 + r4 + 16]
4129 movu m7, [r3 + r5 + 16]
4130 lea r0, [r0 + r1 * 2]
4131 lea r2, [r2 + r4 * 2]
4132 lea r3, [r3 + r5 * 2]
4133
4134 psubw m0, m1
4135 psubw m2, m3
4136 psubw m4, m5
4137 psubw m6, m7
4138
4139 movu [r0], m0
4140 movu [r0 + 16], m2
4141 movu [r0 + r1], m4
4142 movu [r0 + r1 + 16], m6
4143 lea r0, [r0 + r1 * 2]
4144
4145 jnz .loop
4146 RET
4147 %else
4148 cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
4149 mov r6d, %2/4
4150 pxor m6, m6
4151 add r1, r1
4152 .loop:
4153 movu m1, [r2]
4154 movu m3, [r3]
4155 pmovzxbw m0, m1
4156 pmovzxbw m2, m3
4157 punpckhbw m1, m6
4158 punpckhbw m3, m6
4159
4160 psubw m0, m2
4161 psubw m1, m3
4162
4163 movu m5, [r2 + r4]
4164 movu m3, [r3 + r5]
4165 lea r2, [r2 + r4 * 2]
4166 lea r3, [r3 + r5 * 2]
4167 pmovzxbw m4, m5
4168 pmovzxbw m2, m3
4169 punpckhbw m5, m6
4170 punpckhbw m3, m6
4171
4172 psubw m4, m2
4173 psubw m5, m3
4174
4175 movu [r0], m0
4176 movu [r0 + 16], m1
4177 movu [r0 + r1], m4
4178 movu [r0 + r1 + 16], m5
4179
4180 movu m1, [r2]
4181 movu m3, [r3]
4182 pmovzxbw m0, m1
4183 pmovzxbw m2, m3
4184 punpckhbw m1, m6
4185 punpckhbw m3, m6
4186
4187 psubw m0, m2
4188 psubw m1, m3
4189
4190 movu m5, [r2 + r4]
4191 movu m3, [r3 + r5]
4192 dec r6d
4193 lea r2, [r2 + r4 * 2]
4194 lea r3, [r3 + r5 * 2]
4195 lea r0, [r0 + r1 * 2]
4196 pmovzxbw m4, m5
4197 pmovzxbw m2, m3
4198 punpckhbw m5, m6
4199 punpckhbw m3, m6
4200
4201 psubw m4, m2
4202 psubw m5, m3
4203
4204 movu [r0], m0
4205 movu [r0 + 16], m1
4206 movu [r0 + r1], m4
4207 movu [r0 + r1 + 16], m5
4208 lea r0, [r0 + r1 * 2]
4209
4210 jnz .loop
4211 RET
4212 %endif
4213 %endmacro
4214
4215 %if HIGH_BIT_DEPTH
4216 INIT_XMM sse2
4217 PIXELSUB_PS_W16_H4 16, 16
4218 PIXELSUB_PS_W16_H4 16, 32
4219 %else
4220 INIT_XMM sse4
4221 PIXELSUB_PS_W16_H4 16, 16
4222 PIXELSUB_PS_W16_H4 16, 32
4223 %endif
4224
4225
4226 ;-----------------------------------------------------------------------------
4227 ; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
4228 ;-----------------------------------------------------------------------------
4229 %macro PIXELSUB_PS_W32_H2 2
4230 %if HIGH_BIT_DEPTH
4231 cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4232 mov r6d, %2/2
4233 add r4, r4
4234 add r5, r5
4235 add r1, r1
4236 .loop:
4237 movu m0, [r2]
4238 movu m2, [r2 + 16]
4239 movu m4, [r2 + 32]
4240 movu m6, [r2 + 48]
4241 movu m1, [r3]
4242 movu m3, [r3 + 16]
4243 movu m5, [r3 + 32]
4244 movu m7, [r3 + 48]
4245 dec r6d
4246
4247 psubw m0, m1
4248 psubw m2, m3
4249 psubw m4, m5
4250 psubw m6, m7
4251
4252 movu [r0], m0
4253 movu [r0 + 16], m2
4254 movu [r0 + 32], m4
4255 movu [r0 + 48], m6
4256
4257 movu m0, [r2 + r4]
4258 movu m2, [r2 + r4 + 16]
4259 movu m4, [r2 + r4 + 32]
4260 movu m6, [r2 + r4 + 48]
4261 movu m1, [r3 + r5]
4262 movu m3, [r3 + r5 + 16]
4263 movu m5, [r3 + r5 + 32]
4264 movu m7, [r3 + r5 + 48]
4265 lea r2, [r2 + r4 * 2]
4266 lea r3, [r3 + r5 * 2]
4267
4268 psubw m0, m1
4269 psubw m2, m3
4270 psubw m4, m5
4271 psubw m6, m7
4272
4273 movu [r0 + r1], m0
4274 movu [r0 + r1 + 16], m2
4275 movu [r0 + r1 + 32], m4
4276 movu [r0 + r1 + 48], m6
4277 lea r0, [r0 + r1 * 2]
4278
4279 jnz .loop
4280 RET
4281 %else
4282 cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4283 mov r6d, %2/2
4284 add r1, r1
4285 .loop:
4286 movh m0, [r2]
4287 movh m1, [r2 + 8]
4288 movh m2, [r2 + 16]
4289 movh m6, [r2 + 24]
4290 movh m3, [r3]
4291 movh m4, [r3 + 8]
4292 movh m5, [r3 + 16]
4293 movh m7, [r3 + 24]
4294 dec r6d
4295 pmovzxbw m0, m0
4296 pmovzxbw m1, m1
4297 pmovzxbw m2, m2
4298 pmovzxbw m6, m6
4299 pmovzxbw m3, m3
4300 pmovzxbw m4, m4
4301 pmovzxbw m5, m5
4302 pmovzxbw m7, m7
4303
4304 psubw m0, m3
4305 psubw m1, m4
4306 psubw m2, m5
4307 psubw m6, m7
4308
4309 movu [r0], m0
4310 movu [r0 + 16], m1
4311 movu [r0 + 32], m2
4312 movu [r0 + 48], m6
4313
4314 movh m0, [r2 + r4]
4315 movh m1, [r2 + r4 + 8]
4316 movh m2, [r2 + r4 + 16]
4317 movh m6, [r2 + r4 + 24]
4318 movh m3, [r3 + r5]
4319 movh m4, [r3 + r5 + 8]
4320 movh m5, [r3 + r5 + 16]
4321 movh m7, [r3 + r5 + 24]
4322 lea r2, [r2 + r4 * 2]
4323 lea r3, [r3 + r5 * 2]
4324 pmovzxbw m0, m0
4325 pmovzxbw m1, m1
4326 pmovzxbw m2, m2
4327 pmovzxbw m6, m6
4328 pmovzxbw m3, m3
4329 pmovzxbw m4, m4
4330 pmovzxbw m5, m5
4331 pmovzxbw m7, m7
4332
4333 psubw m0, m3
4334 psubw m1, m4
4335 psubw m2, m5
4336 psubw m6, m7
4337
4338 movu [r0 + r1], m0
4339 movu [r0 + r1 + 16], m1
4340 movu [r0 + r1 + 32], m2
4341 movu [r0 + r1 + 48], m6
4342 lea r0, [r0 + r1 * 2]
4343
4344 jnz .loop
4345 RET
4346 %endif
4347 %endmacro
4348
4349 %if HIGH_BIT_DEPTH
4350 INIT_XMM sse2
4351 PIXELSUB_PS_W32_H2 32, 32
4352 PIXELSUB_PS_W32_H2 32, 64
4353 %else
4354 INIT_XMM sse4
4355 PIXELSUB_PS_W32_H2 32, 32
4356 PIXELSUB_PS_W32_H2 32, 64
4357 %endif
4358
4359
4360 ;-----------------------------------------------------------------------------
4361 ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
4362 ;-----------------------------------------------------------------------------
4363 %macro PIXELSUB_PS_W64_H2 2
4364 %if HIGH_BIT_DEPTH
4365 cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4366 mov r6d, %2/2
4367 add r4, r4
4368 add r5, r5
4369 add r1, r1
4370 .loop:
4371 movu m0, [r2]
4372 movu m2, [r2 + 16]
4373 movu m4, [r2 + 32]
4374 movu m6, [r2 + 48]
4375 movu m1, [r3]
4376 movu m3, [r3 + 16]
4377 movu m5, [r3 + 32]
4378 movu m7, [r3 + 48]
4379
4380 psubw m0, m1
4381 psubw m2, m3
4382 psubw m4, m5
4383 psubw m6, m7
4384
4385 movu [r0], m0
4386 movu [r0 + 16], m2
4387 movu [r0 + 32], m4
4388 movu [r0 + 48], m6
4389
4390 movu m0, [r2 + 64]
4391 movu m2, [r2 + 80]
4392 movu m4, [r2 + 96]
4393 movu m6, [r2 + 112]
4394 movu m1, [r3 + 64]
4395 movu m3, [r3 + 80]
4396 movu m5, [r3 + 96]
4397 movu m7, [r3 + 112]
4398
4399 psubw m0, m1
4400 psubw m2, m3
4401 psubw m4, m5
4402 psubw m6, m7
4403
4404 movu [r0 + 64], m0
4405 movu [r0 + 80], m2
4406 movu [r0 + 96], m4
4407 movu [r0 + 112], m6
4408
4409 movu m0, [r2 + r4]
4410 movu m2, [r2 + r4 + 16]
4411 movu m4, [r2 + r4 + 32]
4412 movu m6, [r2 + r4 + 48]
4413 movu m1, [r3 + r5]
4414 movu m3, [r3 + r5 + 16]
4415 movu m5, [r3 + r5 + 32]
4416 movu m7, [r3 + r5 + 48]
4417
4418 psubw m0, m1
4419 psubw m2, m3
4420 psubw m4, m5
4421 psubw m6, m7
4422
4423 movu [r0 + r1], m0
4424 movu [r0 + r1 + 16], m2
4425 movu [r0 + r1 + 32], m4
4426 movu [r0 + r1 + 48], m6
4427
4428 movu m0, [r2 + r4 + 64]
4429 movu m2, [r2 + r4 + 80]
4430 movu m4, [r2 + r4 + 96]
4431 movu m6, [r2 + r4 + 112]
4432 movu m1, [r3 + r5 + 64]
4433 movu m3, [r3 + r5 + 80]
4434 movu m5, [r3 + r5 + 96]
4435 movu m7, [r3 + r5 + 112]
4436 dec r6d
4437 lea r2, [r2 + r4 * 2]
4438 lea r3, [r3 + r5 * 2]
4439
4440 psubw m0, m1
4441 psubw m2, m3
4442 psubw m4, m5
4443 psubw m6, m7
4444
4445 movu [r0 + r1 + 64], m0
4446 movu [r0 + r1 + 80], m2
4447 movu [r0 + r1 + 96], m4
4448 movu [r0 + r1 + 112], m6
4449 lea r0, [r0 + r1 * 2]
4450
4451 jnz .loop
4452 RET
4453 %else
4454 cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
4455 mov r6d, %2/2
4456 pxor m6, m6
4457 add r1, r1
4458 .loop:
4459 movu m1, [r2]
4460 movu m5, [r2 + 16]
4461 movu m3, [r3]
4462 movu m7, [r3 + 16]
4463
4464 pmovzxbw m0, m1
4465 pmovzxbw m4, m5
4466 pmovzxbw m2, m3
4467 punpckhbw m1, m6
4468 punpckhbw m3, m6
4469 punpckhbw m5, m6
4470
4471 psubw m0, m2
4472 psubw m1, m3
4473 pmovzxbw m2, m7
4474 punpckhbw m7, m6
4475 psubw m4, m2
4476 psubw m5, m7
4477
4478 movu m3, [r2 + 32]
4479 movu m7, [r3 + 32]
4480 pmovzxbw m2, m3
4481 punpckhbw m3, m6
4482
4483 movu [r0], m0
4484 movu [r0 + 16], m1
4485 movu [r0 + 32], m4
4486 movu [r0 + 48], m5
4487
4488 movu m1, [r2 + 48]
4489 movu m5, [r3 + 48]
4490 pmovzxbw m0, m1
4491 pmovzxbw m4, m7
4492 punpckhbw m1, m6
4493 punpckhbw m7, m6
4494
4495 psubw m2, m4
4496 psubw m3, m7
4497
4498 movu [r0 + 64], m2
4499 movu [r0 + 80], m3
4500
4501 movu m7, [r2 + r4]
4502 movu m3, [r3 + r5]
4503 pmovzxbw m2, m5
4504 pmovzxbw m4, m7
4505 punpckhbw m5, m6
4506 punpckhbw m7, m6
4507
4508 psubw m0, m2
4509 psubw m1, m5
4510
4511 movu [r0 + 96], m0
4512 movu [r0 + 112], m1
4513
4514 movu m2, [r2 + r4 + 16]
4515 movu m5, [r3 + r5 + 16]
4516 pmovzxbw m0, m3
4517 pmovzxbw m1, m2
4518 punpckhbw m3, m6
4519 punpckhbw m2, m6
4520
4521 psubw m4, m0
4522 psubw m7, m3
4523
4524 movu [r0 + r1], m4
4525 movu [r0 + r1 + 16], m7
4526
4527 movu m0, [r2 + r4 + 32]
4528 movu m3, [r3 + r5 + 32]
4529 dec r6d
4530 pmovzxbw m4, m5
4531 pmovzxbw m7, m0
4532 punpckhbw m5, m6
4533 punpckhbw m0, m6
4534
4535 psubw m1, m4
4536 psubw m2, m5
4537
4538 movu [r0 + r1 + 32], m1
4539 movu [r0 + r1 + 48], m2
4540
4541 movu m4, [r2 + r4 + 48]
4542 movu m5, [r3 + r5 + 48]
4543 lea r2, [r2 + r4 * 2]
4544 lea r3, [r3 + r5 * 2]
4545 pmovzxbw m1, m3
4546 pmovzxbw m2, m4
4547 punpckhbw m3, m6
4548 punpckhbw m4, m6
4549
4550 psubw m7, m1
4551 psubw m0, m3
4552
4553 movu [r0 + r1 + 64], m7
4554 movu [r0 + r1 + 80], m0
4555
4556 pmovzxbw m7, m5
4557 punpckhbw m5, m6
4558 psubw m2, m7
4559 psubw m4, m5
4560
4561 movu [r0 + r1 + 96], m2
4562 movu [r0 + r1 + 112], m4
4563 lea r0, [r0 + r1 * 2]
4564
4565 jnz .loop
4566 RET
4567 %endif
4568 %endmacro
4569
4570 %if HIGH_BIT_DEPTH
4571 INIT_XMM sse2
4572 PIXELSUB_PS_W64_H2 64, 64
4573 %else
4574 INIT_XMM sse4
4575 PIXELSUB_PS_W64_H2 64, 64
4576 %endif
4577
4578
4579 ;=============================================================================
4580 ; variance
4581 ;=============================================================================
4582
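; pixel_var_WxH returns the block's pixel sum in the low 32 bits and the
; sum of squares in the high 32 bits (eax/edx on x86-32). C reference (an
; illustrative sketch only):
;
;   uint64_t pixel_var_WxH(const pixel *pix, intptr_t stride)
;   {
;       uint32_t sum = 0, sqr = 0;
;       for (int y = 0; y < H; y++, pix += stride)
;           for (int x = 0; x < W; x++)
;           {
;               sum += pix[x];
;               sqr += pix[x] * pix[x];
;           }
;       return sum + ((uint64_t)sqr << 32);
;   }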
4583 %macro VAR_START 1
4584 pxor m5, m5 ; sum
4585 pxor m6, m6 ; sum squared
4586 %if HIGH_BIT_DEPTH == 0
4587 %if %1
4588 mova m7, [pw_00ff]
4589 %elif mmsize < 32
4590 pxor m7, m7 ; zero
4591 %endif
4592 %endif ; !HIGH_BIT_DEPTH
4593 %endmacro
4594
4595 %macro VAR_END 2
4596 %if HIGH_BIT_DEPTH
4597 %if mmsize == 8 && %1*%2 == 256
4598 HADDUW m5, m2
4599 %else
4600 %if %1 >= 32
4601 HADDW m5, m2
4602 movd m7, r4d
4603 paddd m5, m7
4604 %else
4605 HADDW m5, m2
4606 %endif
4607 %endif
4608 %else ; !HIGH_BIT_DEPTH
4609 %if %1 == 64
4610 HADDW m5, m2
4611 movd m7, r4d
4612 paddd m5, m7
4613 %else
4614 HADDW m5, m2
4615 %endif
4616 %endif ; HIGH_BIT_DEPTH
4617 HADDD m6, m1
4618 %if ARCH_X86_64
4619 punpckldq m5, m6
4620 movq rax, m5
4621 %else
4622 movd eax, m5
4623 movd edx, m6
4624 %endif
4625 RET
4626 %endmacro
4627
4628 %macro VAR_CORE 0
4629 paddw m5, m0
4630 paddw m5, m3
4631 paddw m5, m1
4632 paddw m5, m4
4633 pmaddwd m0, m0
4634 pmaddwd m3, m3
4635 pmaddwd m1, m1
4636 pmaddwd m4, m4
4637 paddd m6, m0
4638 paddd m6, m3
4639 paddd m6, m1
4640 paddd m6, m4
4641 %endmacro
4642
4643 %macro VAR_2ROW 3
4644 mov r2d, %2
4645 .loop%3:
4646 %if HIGH_BIT_DEPTH
4647 movu m0, [r0]
4648 movu m1, [r0+mmsize]
4649 movu m3, [r0+%1]
4650 movu m4, [r0+%1+mmsize]
4651 %else ; !HIGH_BIT_DEPTH
4652 mova m0, [r0]
4653 punpckhbw m1, m0, m7
4654 mova m3, [r0+%1]
4655 mova m4, m3
4656 punpcklbw m0, m7
4657 %endif ; HIGH_BIT_DEPTH
4658 %ifidn %1, r1
4659 lea r0, [r0+%1*2]
4660 %else
4661 add r0, r1
4662 %endif
4663 %if HIGH_BIT_DEPTH == 0
4664 punpcklbw m3, m7
4665 punpckhbw m4, m7
4666 %endif ; !HIGH_BIT_DEPTH
4667 VAR_CORE
4668 dec r2d
4669 jg .loop%3
4670 %endmacro
4671
4672 ;-----------------------------------------------------------------------------
4673 ; int pixel_var_wxh( uint8_t *, intptr_t )
4674 ;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
    FIX_STRIDES r1
    VAR_START 0
    VAR_2ROW 8*SIZEOF_PIXEL, 16, 1
    VAR_END 16, 16

cglobal pixel_var_8x8, 2,3
    FIX_STRIDES r1
    VAR_START 0
    VAR_2ROW r1, 4, 1
    VAR_END 8, 8

%if HIGH_BIT_DEPTH
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
    FIX_STRIDES r1
    VAR_START 0
    VAR_2ROW r1, 8, 1
    VAR_END 16, 16

cglobal pixel_var_8x8, 2,3,8
    lea         r2, [r1*3]
    VAR_START 0
    movu        m0, [r0]
    movu        m1, [r0+r1*2]
    movu        m3, [r0+r1*4]
    movu        m4, [r0+r2*2]
    lea         r0, [r0+r1*8]
    VAR_CORE
    movu        m0, [r0]
    movu        m1, [r0+r1*2]
    movu        m3, [r0+r1*4]
    movu        m4, [r0+r2*2]
    VAR_CORE
    VAR_END 8, 8

cglobal pixel_var_32x32, 2,6,8
    FIX_STRIDES r1
    mov         r3, r0
    VAR_START 0
    VAR_2ROW r1, 8, 1
    HADDW       m5, m2      ; flush the word sums into a GPR before they overflow
    movd        r4d, m5
    pxor        m5, m5
    VAR_2ROW r1, 8, 2
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    lea         r0, [r3 + 32]   ; second 16-pixel column strip
    VAR_2ROW r1, 8, 3
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 4
    VAR_END 32, 32

cglobal pixel_var_64x64, 2,6,8
    FIX_STRIDES r1
    mov         r3, r0
    VAR_START 0
    VAR_2ROW r1, 8, 1
    HADDW       m5, m2      ; flush the word sums into a GPR before they overflow
    movd        r4d, m5
    pxor        m5, m5
    VAR_2ROW r1, 8, 2
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 3
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 4
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    lea         r0, [r3 + 32]   ; second 16-pixel column strip
    VAR_2ROW r1, 8, 5
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 6
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 7
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 8
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    lea         r0, [r3 + 64]   ; third column strip
    VAR_2ROW r1, 8, 9
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 10
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 11
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 12
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    lea         r0, [r3 + 96]   ; fourth column strip
    VAR_2ROW r1, 8, 13
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 14
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 15
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    VAR_2ROW r1, 8, 16
    VAR_END 64, 64
%endmacro ; VAR
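
; The 32x32/64x64 variants above flush the packed word sums into a GPR every
; 16 rows because m5 accumulates in 16-bit lanes: with 10-bit input each lane
; gathers at most 32 samples between flushes (8 VAR_2ROW iterations x 4 paddw
; per lane), and 32 * 1023 = 32736 only just fits in a signed 16-bit lane.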

INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
%macro VAR 0
cglobal pixel_var_8x8, 2,3,8
    VAR_START 1
    lea         r2, [r1 * 3]
    movh        m0, [r0]
    movh        m3, [r0 + r1]
    movhps      m0, [r0 + r1 * 2]
    movhps      m3, [r0 + r2]
    DEINTB 1, 0, 4, 3, 7
    lea         r0, [r0 + r1 * 4]
    VAR_CORE
    movh        m0, [r0]
    movh        m3, [r0 + r1]
    movhps      m0, [r0 + r1 * 2]
    movhps      m3, [r0 + r2]
    DEINTB 1, 0, 4, 3, 7
    VAR_CORE
    VAR_END 8, 8

; accumulates one 16x16 block into m5/m6; expects r2 = 3 * r1
cglobal pixel_var_16x16_internal
    movu        m0, [r0]
    movu        m3, [r0 + r1]
    DEINTB 1, 0, 4, 3, 7
    VAR_CORE
    movu        m0, [r0 + 2 * r1]
    movu        m3, [r0 + r2]
    DEINTB 1, 0, 4, 3, 7
    lea         r0, [r0 + r1 * 4]
    VAR_CORE
    movu        m0, [r0]
    movu        m3, [r0 + r1]
    DEINTB 1, 0, 4, 3, 7
    VAR_CORE
    movu        m0, [r0 + 2 * r1]
    movu        m3, [r0 + r2]
    DEINTB 1, 0, 4, 3, 7
    lea         r0, [r0 + r1 * 4]
    VAR_CORE
    movu        m0, [r0]
    movu        m3, [r0 + r1]
    DEINTB 1, 0, 4, 3, 7
    VAR_CORE
    movu        m0, [r0 + 2 * r1]
    movu        m3, [r0 + r2]
    DEINTB 1, 0, 4, 3, 7
    lea         r0, [r0 + r1 * 4]
    VAR_CORE
    movu        m0, [r0]
    movu        m3, [r0 + r1]
    DEINTB 1, 0, 4, 3, 7
    VAR_CORE
    movu        m0, [r0 + 2 * r1]
    movu        m3, [r0 + r2]
    DEINTB 1, 0, 4, 3, 7
    VAR_CORE
    ret

cglobal pixel_var_16x16, 2,3,8
    VAR_START 1
    lea         r2, [r1 * 3]
    call        pixel_var_16x16_internal
    VAR_END 16, 16

cglobal pixel_var_32x32, 2,4,8
    VAR_START 1
    lea         r2, [r1 * 3]
    mov         r3, r0
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r3 + 16]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    VAR_END 32, 32

cglobal pixel_var_64x64, 2,6,8
    VAR_START 1
    lea         r2, [r1 * 3]
    mov         r3, r0
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    HADDW       m5, m2      ; flush the word sums into a GPR before they overflow
    movd        r4d, m5
    pxor        m5, m5
    lea         r0, [r3 + 16]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    lea         r0, [r3 + 32]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r3 + 48]
    HADDW       m5, m2
    movd        r5d, m5
    add         r4, r5
    pxor        m5, m5
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    lea         r0, [r0 + r1 * 4]
    call        pixel_var_16x16_internal
    VAR_END 64, 64
%endmacro ; VAR

INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR

INIT_YMM avx2
cglobal pixel_var_16x16, 2,4,7
    VAR_START 0
    mov         r2d, 4
    lea         r3, [r1*3]
.loop:
    pmovzxbw    m0, [r0]        ; widen 16 pixels per row to words
    pmovzxbw    m3, [r0+r1]
    pmovzxbw    m1, [r0+r1*2]
    pmovzxbw    m4, [r0+r3]
    lea         r0, [r0+r1*4]
    VAR_CORE
    dec         r2d
    jg .loop
    vextracti128 xm0, m5, 1     ; fold the upper 128-bit lane into the lower one
    vextracti128 xm1, m6, 1
    paddw       xm5, xm0
    paddd       xm6, xm1
    HADDW       xm5, xm2
    HADDD       xm6, xm1
%if ARCH_X86_64
    punpckldq   xm5, xm6
    movq        rax, xm5        ; return (sum of squares << 32) | sum
%else
    movd        eax, xm5
    movd        edx, xm6
%endif
    RET
%endif ; !HIGH_BIT_DEPTH

%macro VAR2_END 3
    HADDW       %2, xm1
    movd        r1d, %2
    imul        r1d, r1d
    HADDD       %3, xm1
    shr         r1d, %1
    movd        eax, %3
    movd        [r4], %3
    sub         eax, r1d    ; sqr - (sum * sum >> shift)
    RET
%endmacro
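
; Note on VAR2_END (an inference from the macro above, not original
; commentary): with %1 = log2(N) for a block of N samples, the result is
; sum(x^2) - sum(x)^2/N, i.e. N times the block variance; the raw sum of
; squares is also stored through the pointer in r4. A scalar sketch matching
; the asm's 32-bit wrap-around and logical shift:
;
;   int var2_end_c(int32_t sum, uint32_t sqr, int shift, int *sqr_out)
;   {
;       *sqr_out = (int)sqr;                  /* movd [r4], %3 */
;       uint32_t s2 = (uint32_t)(sum * sum);  /* imul r1d, r1d */
;       return (int)(sqr - (s2 >> shift));    /* shr + sub     */
;   }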