1 | ;***************************************************************************** |
2 | ;* Copyright (C) 2013 x265 project | |
3 | ;* | |
4 | ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com> | |
5 | ;* Nabajit Deka <nabajit@multicorewareinc.com> | |
6 | ;* | |
7 | ;* This program is free software; you can redistribute it and/or modify | |
8 | ;* it under the terms of the GNU General Public License as published by | |
9 | ;* the Free Software Foundation; either version 2 of the License, or | |
10 | ;* (at your option) any later version. | |
11 | ;* | |
12 | ;* This program is distributed in the hope that it will be useful, | |
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | ;* GNU General Public License for more details. | |
16 | ;* | |
17 | ;* You should have received a copy of the GNU General Public License | |
18 | ;* along with this program; if not, write to the Free Software | |
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
20 | ;* | |
21 | ;* This program is also available under a commercial proprietary license. | |
22 | ;* For more information, contact us at license @ x265.com. | |
23 | ;*****************************************************************************/ | |
24 | ||
25 | %include "x86inc.asm" | |
26 | %include "x86util.asm" | |
27 | ||
28 | SECTION_RODATA 32 | |
29 | ||
30 | %if BIT_DEPTH == 10 | |
31 | ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64 | |
32 | ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63 | |
33 | pf_64: times 4 dd 64.0 | |
34 | pf_128: times 4 dd 128.0 | |
35 | %elif BIT_DEPTH == 9 | |
36 | ssim_c1: times 4 dd 1671 ; .01*.01*511*511*64 | |
37 | ssim_c2: times 4 dd 947556 ; .03*.03*511*511*64*63 | |
38 | %else ; 8-bit | |
39 | ssim_c1: times 4 dd 416 ; .01*.01*255*255*64 | |
40 | ssim_c2: times 4 dd 235963 ; .03*.03*255*255*64*63 | |
41 | %endif | |
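; Note (added): the values above are the usual SSIM stability constants
; C1 = (K1*L)^2 and C2 = (K2*L)^2 (K1 = 0.01, K2 = 0.03, L = max pixel value),
; pre-multiplied by the 64 (and 64*63) scale factors used by the SSIM block
; accumulation. Worked out for 8-bit:
;   C1 = 0.01*0.01*255*255*64    = 416.16    -> 416
;   C2 = 0.03*0.03*255*255*64*63 = 235962.72 -> 235963
; The 10-bit values are kept as floats because that path uses the float SSIM
; kernel (see pf_64 / pf_128 above).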
42 | mask_ff: times 16 db 0xff | |
43 | times 16 db 0 | |
44 | deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 | |
45 | deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 | |
46 | hmul_16p: times 16 db 1 | |
47 | times 8 db 1, -1 | |
48 | hmulw_16p: times 8 dw 1 | |
49 | times 4 dw 1, -1 | |
50 | ||
51 | trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 | |
52 | ||
53 | SECTION .text | |
54 | ||
55 | cextern pw_1 | |
56 | cextern pb_1 | |
57 | cextern pw_00ff | |
58 | cextern pw_2000 | |
59 | cextern pw_pixel_max | |
60 | cextern pd_1 | |
61 | cextern pd_32767 | |
62 | cextern pd_n32768 | |
63 | ||
64 | ;----------------------------------------------------------------------------- | |
65 | ; void calcRecons(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred) | |
66 | ;----------------------------------------------------------------------------- | |
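; Scalar reference for the calcRecons4/8/16/32 kernels below (a readability
; sketch only, not assembled; Clip() is shorthand for clamping to
; [0, pixel_max], and blockSize is 4/8/16/32 respectively):
;
;   for (int y = 0; y < blockSize; y++)
;   {
;       for (int x = 0; x < blockSize; x++)
;       {
;           pixel val     = Clip(pred[x] + residual[x]);
;           reconqt[x]    = (int16_t)val;
;           reconipred[x] = val;
;       }
;       pred += stride;       residual   += stride;
;       reconqt += strideqt;  reconipred += strideipred;
;   }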
67 | INIT_XMM sse2 | |
68 | %if HIGH_BIT_DEPTH | |
69 | %if ARCH_X86_64 == 1 | |
70 | cglobal calcRecons4, 5,8,4 | |
71 | %define t7b r7b | |
72 | %else | |
73 | cglobal calcRecons4, 5,7,4,0-1 | |
74 | %define t7b byte [rsp] | |
75 | %endif | |
76 | mov r4d, r4m | |
77 | mov r5d, r5m | |
78 | mov r6d, r6m | |
79 | add r4d, r4d | |
80 | add r5d, r5d | |
81 | add r6d, r6d | |
82 | ||
83 | pxor m4, m4 | |
84 | mova m5, [pw_pixel_max] | |
85 | mov t7b, 4/2 | |
86 | .loop: | |
87 | movh m0, [r0] | |
88 | movh m1, [r0 + r4] | |
89 | punpcklqdq m0, m1 | |
90 | movh m2, [r1] | |
91 | movh m3, [r1 + r4] | |
92 | punpcklqdq m2, m3 | |
93 | paddw m0, m2 | |
94 | CLIPW m0, m4, m5 | |
95 | ||
96 | ; store recipred[] | |
97 | movh [r3], m0 | |
98 | movhps [r3 + r6], m0 | |
99 | ||
100 | ; store recqt[] | |
101 | movh [r2], m0 | |
102 | movhps [r2 + r5], m0 | |
103 | ||
104 | lea r0, [r0 + r4 * 2] | |
105 | lea r1, [r1 + r4 * 2] | |
106 | lea r2, [r2 + r5 * 2] | |
107 | lea r3, [r3 + r6 * 2] | |
108 | ||
109 | dec t7b | |
110 | jnz .loop | |
111 | RET | |
112 | %else ;HIGH_BIT_DEPTH | |
113 | ||
114 | %if ARCH_X86_64 == 1 | |
115 | cglobal calcRecons4, 5,8,4 | |
116 | %define t7b r7b | |
117 | %else | |
118 | cglobal calcRecons4, 5,7,4,0-1 | |
119 | %define t7b byte [rsp] | |
120 | %endif | |
121 | mov r4d, r4m | |
122 | mov r5d, r5m | |
123 | mov r6d, r6m | |
124 | add r5d, r5d | |
125 | ||
126 | pxor m0, m0 | |
127 | mov t7b, 4/2 | |
128 | .loop: | |
129 | movd m1, [r0] | |
130 | movd m2, [r0 + r4] | |
131 | punpckldq m1, m2 | |
132 | punpcklbw m1, m0 | |
133 | movh m2, [r1] | |
134 | movh m3, [r1 + r4 * 2] | |
135 | punpcklqdq m2, m3 | |
136 | paddw m1, m2 | |
137 | packuswb m1, m1 | |
138 | ||
139 | ; store recon[] and recipred[] | |
140 | movd [r3], m1 | |
141 | pshufd m2, m1, 1 | |
142 | movd [r3 + r6], m2 | |
143 | ||
144 | ; store recqt[] | |
145 | punpcklbw m1, m0 | |
146 | movh [r2], m1 | |
147 | movhps [r2 + r5], m1 | |
148 | ||
149 | lea r0, [r0 + r4 * 2] | |
150 | lea r1, [r1 + r4 * 4] | |
151 | lea r2, [r2 + r5 * 2] | |
152 | lea r3, [r3 + r6 * 2] | |
153 | ||
154 | dec t7b | |
155 | jnz .loop | |
156 | RET | |
157 | %endif ;HIGH_BIT_DEPTH | |
158 | ||
159 | ||
160 | INIT_XMM sse2 | |
161 | %if ARCH_X86_64 == 1 | |
162 | cglobal calcRecons8, 5,8,4 | |
163 | %define t7b r7b | |
164 | %else | |
165 | cglobal calcRecons8, 5,7,4,0-1 | |
166 | %define t7b byte [rsp] | |
167 | %endif | |
168 | ||
169 | %if HIGH_BIT_DEPTH | |
170 | mov r4d, r4m | |
171 | mov r5d, r5m | |
172 | mov r6d, r6m | |
173 | add r4d, r4d | |
174 | add r5d, r5d | |
175 | add r6d, r6d | |
176 | ||
177 | pxor m4, m4 | |
178 | mova m5, [pw_pixel_max] | |
179 | mov t7b, 8/2 | |
180 | .loop: | |
181 | movu m0, [r0] | |
182 | movu m1, [r0 + r4] | |
183 | movu m2, [r1] | |
184 | movu m3, [r1 + r4] | |
185 | paddw m0, m2 | |
186 | paddw m1, m3 | |
187 | CLIPW2 m0, m1, m4, m5 | |
188 | ||
189 | ; store recipred[] | |
190 | movu [r3], m0 | |
191 | movu [r3 + r6], m1 | |
192 | ||
193 | ; store recqt[] | |
194 | movu [r2], m0 | |
195 | movu [r2 + r5], m1 | |
196 | ||
197 | lea r0, [r0 + r4 * 2] | |
198 | lea r1, [r1 + r4 * 2] | |
199 | lea r2, [r2 + r5 * 2] | |
200 | lea r3, [r3 + r6 * 2] | |
201 | ||
202 | dec t7b | |
203 | jnz .loop | |
204 | RET | |
205 | %else ;HIGH_BIT_DEPTH | |
206 | ||
207 | mov r4d, r4m | |
208 | mov r5d, r5m | |
209 | mov r6d, r6m | |
210 | add r5d, r5d | |
211 | ||
212 | pxor m0, m0 | |
213 | mov t7b, 8/2 | |
214 | .loop: | |
215 | movh m1, [r0] | |
216 | movh m2, [r0 + r4] | |
217 | punpcklbw m1, m0 | |
218 | punpcklbw m2, m0 | |
219 | movu m3, [r1] | |
220 | movu m4, [r1 + r4 * 2] | |
221 | paddw m1, m3 | |
222 | paddw m2, m4 | |
223 | packuswb m1, m2 | |
224 | ||
225 | ; store recon[] and recipred[] | |
226 | movh [r3], m1 | |
227 | movhps [r3 + r6], m1 | |
228 | ||
229 | ; store recqt[] | |
230 | punpcklbw m2, m1, m0 | |
231 | punpckhbw m1, m0 | |
232 | movu [r2], m2 | |
233 | movu [r2 + r5], m1 | |
234 | ||
235 | lea r0, [r0 + r4 * 2] | |
236 | lea r1, [r1 + r4 * 4] | |
237 | lea r2, [r2 + r5 * 2] | |
238 | lea r3, [r3 + r6 * 2] | |
239 | ||
240 | dec t7b | |
241 | jnz .loop | |
242 | RET | |
243 | %endif ;HIGH_BIT_DEPTH | |
244 | ||
245 | ||
246 | ||
247 | %if HIGH_BIT_DEPTH | |
248 | INIT_XMM sse2 | |
249 | %if ARCH_X86_64 == 1 | |
250 | cglobal calcRecons16, 5,8,4 | |
251 | %define t7b r7b | |
252 | %else | |
253 | cglobal calcRecons16, 5,7,4,0-1 | |
254 | %define t7b byte [rsp] | |
255 | %endif | |
256 | ||
257 | mov r4d, r4m | |
258 | mov r5d, r5m | |
259 | mov r6d, r6m | |
260 | add r4d, r4d | |
261 | add r5d, r5d | |
262 | add r6d, r6d | |
263 | ||
264 | pxor m4, m4 | |
265 | mova m5, [pw_pixel_max] | |
266 | mov t7b, 16/2 | |
267 | .loop: | |
268 | movu m0, [r0] | |
269 | movu m1, [r0 + 16] | |
270 | movu m2, [r1] | |
271 | movu m3, [r1 + 16] | |
272 | paddw m0, m2 | |
273 | paddw m1, m3 | |
274 | CLIPW2 m0, m1, m4, m5 | |
275 | ||
276 | ; store recipred[] | |
277 | movu [r3], m0 | |
278 | movu [r3 + 16], m1 | |
279 | ||
280 | ; store recqt[] | |
281 | movu [r2], m0 | |
282 | movu [r2 + 16], m1 | |
283 | ||
284 | movu m0, [r0 + r4] | |
285 | movu m1, [r0 + r4 + 16] | |
286 | movu m2, [r1 + r4] | |
287 | movu m3, [r1 + r4 + 16] | |
288 | paddw m0, m2 | |
289 | paddw m1, m3 | |
290 | CLIPW2 m0, m1, m4, m5 | |
291 | ||
292 | ; store recon[] and recipred[] | |
293 | movu [r3 + r6], m0 | |
294 | movu [r3 + r6 + 16], m1 | |
295 | ||
296 | ; store recqt[] | |
297 | movu [r2 + r5], m0 | |
298 | movu [r2 + r5 + 16], m1 | |
299 | ||
300 | lea r0, [r0 + r4 * 2] | |
301 | lea r1, [r1 + r4 * 2] | |
302 | lea r2, [r2 + r5 * 2] | |
303 | lea r3, [r3 + r6 * 2] | |
304 | ||
305 | dec t7b | |
306 | jnz .loop | |
307 | RET | |
308 | %else ;HIGH_BIT_DEPTH | |
309 | ||
310 | INIT_XMM sse4 | |
311 | %if ARCH_X86_64 == 1 | |
312 | cglobal calcRecons16, 5,8,4 | |
313 | %define t7b r7b | |
314 | %else | |
315 | cglobal calcRecons16, 5,7,4,0-1 | |
316 | %define t7b byte [rsp] | |
317 | %endif | |
318 | ||
319 | mov r4d, r4m | |
320 | mov r5d, r5m | |
321 | mov r6d, r6m | |
322 | add r5d, r5d | |
323 | ||
324 | pxor m0, m0 | |
325 | mov t7b, 16 | |
326 | .loop: | |
327 | movu m2, [r0] | |
328 | pmovzxbw m1, m2 | |
329 | punpckhbw m2, m0 | |
330 | paddw m1, [r1] | |
331 | paddw m2, [r1 + 16] | |
332 | packuswb m1, m2 | |
333 | ||
334 | ; store recon[] and recipred[] | |
335 | movu [r3], m1 | |
336 | ||
337 | ; store recqt[] | |
338 | pmovzxbw m2, m1 | |
339 | punpckhbw m1, m0 | |
340 | movu [r2], m2 | |
341 | movu [r2 + 16], m1 | |
342 | ||
343 | add r2, r5 | |
344 | add r3, r6 | |
345 | add r0, r4 | |
346 | lea r1, [r1 + r4 * 2] | |
347 | ||
348 | dec t7b | |
349 | jnz .loop | |
350 | RET | |
351 | %endif ;HIGH_BIT_DEPTH | |
352 | ||
353 | %if HIGH_BIT_DEPTH | |
354 | INIT_XMM sse2 | |
355 | %if ARCH_X86_64 == 1 | |
356 | cglobal calcRecons32, 5,8,4 | |
357 | %define t7b r7b | |
358 | %else | |
359 | cglobal calcRecons32, 5,7,4,0-1 | |
360 | %define t7b byte [rsp] | |
361 | %endif | |
362 | ||
363 | mov r4d, r4m | |
364 | mov r5d, r5m | |
365 | mov r6d, r6m | |
366 | add r4d, r4d | |
367 | add r5d, r5d | |
368 | add r6d, r6d | |
369 | ||
370 | pxor m4, m4 | |
371 | mova m5, [pw_pixel_max] | |
372 | mov t7b, 32/2 | |
373 | .loop: | |
374 | ||
375 | movu m0, [r0] | |
376 | movu m1, [r0 + 16] | |
377 | movu m2, [r1] | |
378 | movu m3, [r1 + 16] | |
379 | paddw m0, m2 | |
380 | paddw m1, m3 | |
381 | CLIPW2 m0, m1, m4, m5 | |
382 | ||
383 | ; store recipred[] | |
384 | movu [r3], m0 | |
385 | movu [r3 + 16], m1 | |
386 | ||
387 | ; store recqt[] | |
388 | movu [r2], m0 | |
389 | movu [r2 + 16], m1 | |
390 | ||
391 | movu m0, [r0 + 32] | |
392 | movu m1, [r0 + 48] | |
393 | movu m2, [r1 + 32] | |
394 | movu m3, [r1 + 48] | |
395 | paddw m0, m2 | |
396 | paddw m1, m3 | |
397 | CLIPW2 m0, m1, m4, m5 | |
398 | ||
399 | ; store recon[] and recipred[] | |
400 | movu [r3 + 32], m0 | |
401 | movu [r3 + 48], m1 | |
402 | ||
403 | ; store recqt[] | |
404 | movu [r2 + 32], m0 | |
405 | movu [r2 + 48], m1 | |
406 | add r2, r5 | |
407 | ||
408 | movu m0, [r0 + r4] | |
409 | movu m1, [r0 + r4 + 16] | |
410 | movu m2, [r1 + r4] | |
411 | movu m3, [r1 + r4 + 16] | |
412 | paddw m0, m2 | |
413 | paddw m1, m3 | |
414 | CLIPW2 m0, m1, m4, m5 | |
415 | ||
416 | ; store recon[] and recipred[] | |
417 | movu [r3 + r6], m0 | |
418 | movu [r3 + r6 + 16], m1 | |
419 | ||
420 | ; store recqt[] | |
421 | movu [r2], m0 | |
422 | movu [r2 + 16], m1 | |
423 | ||
424 | movu m0, [r0 + r4 + 32] | |
425 | movu m1, [r0 + r4 + 48] | |
426 | movu m2, [r1 + r4 + 32] | |
427 | movu m3, [r1 + r4 + 48] | |
428 | paddw m0, m2 | |
429 | paddw m1, m3 | |
430 | CLIPW2 m0, m1, m4, m5 | |
431 | ||
432 | ; store recon[] and recipred[] | |
433 | movu [r3 + r6 + 32], m0 | |
434 | movu [r3 + r6 + 48], m1 | |
435 | lea r3, [r3 + r6 * 2] | |
436 | ||
437 | ; store recqt[] | |
438 | movu [r2 + 32], m0 | |
439 | movu [r2 + 48], m1 | |
440 | add r2, r5 | |
441 | ||
442 | lea r0, [r0 + r4 * 2] | |
443 | lea r1, [r1 + r4 * 2] | |
444 | ||
445 | dec t7b | |
446 | jnz .loop | |
447 | RET | |
448 | %else ;HIGH_BIT_DEPTH | |
449 | INIT_XMM sse4 | |
450 | %if ARCH_X86_64 == 1 | |
451 | cglobal calcRecons32, 5,8,4 | |
452 | %define t7b r7b | |
453 | %else | |
454 | cglobal calcRecons32, 5,7,4,0-1 | |
455 | %define t7b byte [rsp] | |
456 | %endif | |
457 | ||
458 | mov r4d, r4m | |
459 | mov r5d, r5m | |
460 | mov r6d, r6m | |
461 | add r5d, r5d | |
462 | ||
463 | pxor m0, m0 | |
464 | mov t7b, 32 | |
465 | .loop: | |
466 | movu m2, [r0] | |
467 | movu m4, [r0 + 16] | |
468 | pmovzxbw m1, m2 | |
469 | punpckhbw m2, m0 | |
470 | pmovzxbw m3, m4 | |
471 | punpckhbw m4, m0 | |
472 | ||
473 | paddw m1, [r1 + 0 * 16] | |
474 | paddw m2, [r1 + 1 * 16] | |
475 | packuswb m1, m2 | |
476 | ||
477 | paddw m3, [r1 + 2 * 16] | |
478 | paddw m4, [r1 + 3 * 16] | |
479 | packuswb m3, m4 | |
480 | ||
481 | ; store recon[] and recipred[] | |
482 | movu [r3], m1 | |
483 | movu [r3 + 16], m3 | |
484 | ||
485 | ; store recqt[] | |
486 | pmovzxbw m2, m1 | |
487 | punpckhbw m1, m0 | |
488 | movu [r2 + 0 * 16], m2 | |
489 | movu [r2 + 1 * 16], m1 | |
490 | pmovzxbw m4, m3 | |
491 | punpckhbw m3, m0 | |
492 | movu [r2 + 2 * 16], m4 | |
493 | movu [r2 + 3 * 16], m3 | |
494 | ||
495 | add r2, r5 | |
496 | add r3, r6 | |
497 | add r0, r4 | |
498 | lea r1, [r1 + r4 * 2] | |
499 | ||
500 | dec t7b | |
501 | jnz .loop | |
502 | RET | |
503 | %endif ;HIGH_BIT_DEPTH | |
504 | ||
505 | ||
506 | ;----------------------------------------------------------------------------- | |
507 | ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) | |
508 | ;----------------------------------------------------------------------------- | |
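; Scalar equivalent of the getResidual4/8/16/32 kernels below (sketch only,
; not assembled): for every pixel of the NxN block,
;   residual[x] = (int16_t)fenc[x] - (int16_t)pred[x];
; the SIMD code merely widens bytes to words (8-bit builds) and subtracts one
; or more full rows of the block per loop iteration.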
509 | INIT_XMM sse2 | |
510 | %if HIGH_BIT_DEPTH | |
511 | cglobal getResidual4, 4,4,4 | |
512 | add r3, r3 | |
513 | ||
514 | ; row 0-1 | |
515 | movh m0, [r0] | |
516 | movh m1, [r0 + r3] | |
517 | movh m2, [r1] | |
518 | movh m3, [r1 + r3] | |
519 | punpcklqdq m0, m1 | |
520 | punpcklqdq m2, m3 | |
521 | psubw m0, m2 | |
522 | ||
523 | movh [r2], m0 | |
524 | movhps [r2 + r3], m0 | |
525 | lea r0, [r0 + r3 * 2] | |
526 | lea r1, [r1 + r3 * 2] | |
527 | lea r2, [r2 + r3 * 2] | |
528 | ||
529 | ; row 2-3 | |
530 | movh m0, [r0] | |
531 | movh m1, [r0 + r3] | |
532 | movh m2, [r1] | |
533 | movh m3, [r1 + r3] | |
534 | punpcklqdq m0, m1 | |
535 | punpcklqdq m2, m3 | |
536 | psubw m0, m2 | |
537 | ||
538 | movh [r2], m0 | |
539 | movhps [r2 + r3], m0 | |
540 | %else | |
541 | cglobal getResidual4, 4,4,5 | |
542 | pxor m0, m0 | |
543 | ||
544 | ; row 0-1 | |
545 | movd m1, [r0] | |
546 | movd m2, [r0 + r3] | |
547 | movd m3, [r1] | |
548 | movd m4, [r1 + r3] | |
549 | punpckldq m1, m2 | |
550 | punpcklbw m1, m0 | |
551 | punpckldq m3, m4 | |
552 | punpcklbw m3, m0 | |
553 | psubw m1, m3 | |
554 | movh [r2], m1 | |
555 | movhps [r2 + r3 * 2], m1 | |
556 | lea r0, [r0 + r3 * 2] | |
557 | lea r1, [r1 + r3 * 2] | |
558 | lea r2, [r2 + r3 * 4] | |
559 | ||
560 | ; row 2-3 | |
561 | movd m1, [r0] | |
562 | movd m2, [r0 + r3] | |
563 | movd m3, [r1] | |
564 | movd m4, [r1 + r3] | |
565 | punpckldq m1, m2 | |
566 | punpcklbw m1, m0 | |
567 | punpckldq m3, m4 | |
568 | punpcklbw m3, m0 | |
569 | psubw m1, m3 | |
570 | movh [r2], m1 | |
571 | movhps [r2 + r3 * 2], m1 | |
572 | %endif | |
573 | RET | |
574 | ||
575 | ||
576 | INIT_XMM sse2 | |
577 | %if HIGH_BIT_DEPTH | |
578 | cglobal getResidual8, 4,4,4 | |
579 | add r3, r3 | |
580 | ||
581 | %assign x 0 | |
582 | %rep 8/2 | |
583 | ; row 0-1 | |
584 | movu m1, [r0] | |
585 | movu m2, [r0 + r3] | |
586 | movu m3, [r1] | |
587 | movu m4, [r1 + r3] | |
588 | psubw m1, m3 | |
589 | psubw m2, m4 | |
590 | movu [r2], m1 | |
591 | movu [r2 + r3], m2 | |
592 | %assign x x+1 | |
593 | %if (x != 4) | |
594 | lea r0, [r0 + r3 * 2] | |
595 | lea r1, [r1 + r3 * 2] | |
596 | lea r2, [r2 + r3 * 2] | |
597 | %endif | |
598 | %endrep | |
599 | %else | |
600 | cglobal getResidual8, 4,4,5 | |
601 | pxor m0, m0 | |
602 | ||
603 | %assign x 0 | |
604 | %rep 8/2 | |
605 | ; row 0-1 | |
606 | movh m1, [r0] | |
607 | movh m2, [r0 + r3] | |
608 | movh m3, [r1] | |
609 | movh m4, [r1 + r3] | |
610 | punpcklbw m1, m0 | |
611 | punpcklbw m2, m0 | |
612 | punpcklbw m3, m0 | |
613 | punpcklbw m4, m0 | |
614 | psubw m1, m3 | |
615 | psubw m2, m4 | |
616 | movu [r2], m1 | |
617 | movu [r2 + r3 * 2], m2 | |
618 | %assign x x+1 | |
619 | %if (x != 4) | |
620 | lea r0, [r0 + r3 * 2] | |
621 | lea r1, [r1 + r3 * 2] | |
622 | lea r2, [r2 + r3 * 4] | |
623 | %endif | |
624 | %endrep | |
625 | %endif | |
626 | RET | |
627 | ||
628 | %if HIGH_BIT_DEPTH | |
629 | INIT_XMM sse2 | |
630 | cglobal getResidual16, 4,5,6 | |
631 | add r3, r3 | |
632 | mov r4d, 16/4 | |
633 | .loop: | |
634 | ; row 0-1 | |
635 | movu m0, [r0] | |
636 | movu m1, [r0 + 16] | |
637 | movu m2, [r0 + r3] | |
638 | movu m3, [r0 + r3 + 16] | |
639 | movu m4, [r1] | |
640 | movu m5, [r1 + 16] | |
641 | psubw m0, m4 | |
642 | psubw m1, m5 | |
643 | movu m4, [r1 + r3] | |
644 | movu m5, [r1 + r3 + 16] | |
645 | psubw m2, m4 | |
646 | psubw m3, m5 | |
647 | lea r0, [r0 + r3 * 2] | |
648 | lea r1, [r1 + r3 * 2] | |
649 | ||
650 | movu [r2], m0 | |
651 | movu [r2 + 16], m1 | |
652 | movu [r2 + r3], m2 | |
653 | movu [r2 + r3 + 16], m3 | |
654 | lea r2, [r2 + r3 * 2] | |
655 | ||
656 | ; row 2-3 | |
657 | movu m0, [r0] | |
658 | movu m1, [r0 + 16] | |
659 | movu m2, [r0 + r3] | |
660 | movu m3, [r0 + r3 + 16] | |
661 | movu m4, [r1] | |
662 | movu m5, [r1 + 16] | |
663 | psubw m0, m4 | |
664 | psubw m1, m5 | |
665 | movu m4, [r1 + r3] | |
666 | movu m5, [r1 + r3 + 16] | |
667 | psubw m2, m4 | |
668 | psubw m3, m5 | |
669 | ||
670 | movu [r2], m0 | |
671 | movu [r2 + 16], m1 | |
672 | movu [r2 + r3], m2 | |
673 | movu [r2 + r3 + 16], m3 | |
674 | ||
675 | dec r4d | |
676 | ||
677 | lea r0, [r0 + r3 * 2] | |
678 | lea r1, [r1 + r3 * 2] | |
679 | lea r2, [r2 + r3 * 2] | |
680 | ||
681 | jnz .loop | |
682 | %else | |
683 | ||
684 | INIT_XMM sse4 | |
685 | cglobal getResidual16, 4,5,8 | |
686 | mov r4d, 16/4 | |
687 | pxor m0, m0 | |
688 | .loop: | |
689 | ; row 0-1 | |
690 | movu m1, [r0] | |
691 | movu m2, [r0 + r3] | |
692 | movu m3, [r1] | |
693 | movu m4, [r1 + r3] | |
694 | pmovzxbw m5, m1 | |
695 | punpckhbw m1, m0 | |
696 | pmovzxbw m6, m2 | |
697 | punpckhbw m2, m0 | |
698 | pmovzxbw m7, m3 | |
699 | punpckhbw m3, m0 | |
700 | psubw m5, m7 | |
701 | psubw m1, m3 | |
702 | pmovzxbw m7, m4 | |
703 | punpckhbw m4, m0 | |
704 | psubw m6, m7 | |
705 | psubw m2, m4 | |
706 | ||
707 | movu [r2], m5 | |
708 | movu [r2 + 16], m1 | |
709 | movu [r2 + r3 * 2], m6 | |
710 | movu [r2 + r3 * 2 + 16], m2 | |
711 | ||
712 | lea r0, [r0 + r3 * 2] | |
713 | lea r1, [r1 + r3 * 2] | |
714 | lea r2, [r2 + r3 * 4] | |
715 | ||
716 | ; row 2-3 | |
717 | movu m1, [r0] | |
718 | movu m2, [r0 + r3] | |
719 | movu m3, [r1] | |
720 | movu m4, [r1 + r3] | |
721 | pmovzxbw m5, m1 | |
722 | punpckhbw m1, m0 | |
723 | pmovzxbw m6, m2 | |
724 | punpckhbw m2, m0 | |
725 | pmovzxbw m7, m3 | |
726 | punpckhbw m3, m0 | |
727 | psubw m5, m7 | |
728 | psubw m1, m3 | |
729 | pmovzxbw m7, m4 | |
730 | punpckhbw m4, m0 | |
731 | psubw m6, m7 | |
732 | psubw m2, m4 | |
733 | ||
734 | movu [r2], m5 | |
735 | movu [r2 + 16], m1 | |
736 | movu [r2 + r3 * 2], m6 | |
737 | movu [r2 + r3 * 2 + 16], m2 | |
738 | ||
739 | dec r4d | |
740 | ||
741 | lea r0, [r0 + r3 * 2] | |
742 | lea r1, [r1 + r3 * 2] | |
743 | lea r2, [r2 + r3 * 4] | |
744 | ||
745 | jnz .loop | |
746 | %endif | |
747 | ||
748 | RET | |
749 | ||
750 | %if HIGH_BIT_DEPTH | |
751 | INIT_XMM sse2 | |
752 | cglobal getResidual32, 4,5,6 | |
753 | add r3, r3 | |
754 | mov r4d, 32/2 | |
755 | .loop: | |
756 | ; row 0 | |
757 | movu m0, [r0] | |
758 | movu m1, [r0 + 16] | |
759 | movu m2, [r0 + 32] | |
760 | movu m3, [r0 + 48] | |
761 | movu m4, [r1] | |
762 | movu m5, [r1 + 16] | |
763 | psubw m0, m4 | |
764 | psubw m1, m5 | |
765 | movu m4, [r1 + 32] | |
766 | movu m5, [r1 + 48] | |
767 | psubw m2, m4 | |
768 | psubw m3, m5 | |
769 | ||
770 | movu [r2], m0 | |
771 | movu [r2 + 16], m1 | |
772 | movu [r2 + 32], m2 | |
773 | movu [r2 + 48], m3 | |
774 | ||
775 | ; row 1 | |
776 | movu m0, [r0 + r3] | |
777 | movu m1, [r0 + r3 + 16] | |
778 | movu m2, [r0 + r3 + 32] | |
779 | movu m3, [r0 + r3 + 48] | |
780 | movu m4, [r1 + r3] | |
781 | movu m5, [r1 + r3 + 16] | |
782 | psubw m0, m4 | |
783 | psubw m1, m5 | |
784 | movu m4, [r1 + r3 + 32] | |
785 | movu m5, [r1 + r3 + 48] | |
786 | psubw m2, m4 | |
787 | psubw m3, m5 | |
788 | ||
789 | movu [r2 + r3], m0 | |
790 | movu [r2 + r3 + 16], m1 | |
791 | movu [r2 + r3 + 32], m2 | |
792 | movu [r2 + r3 + 48], m3 | |
793 | ||
794 | dec r4d | |
795 | ||
796 | lea r0, [r0 + r3 * 2] | |
797 | lea r1, [r1 + r3 * 2] | |
798 | lea r2, [r2 + r3 * 2] | |
799 | ||
800 | jnz .loop | |
801 | ||
802 | %else | |
803 | INIT_XMM sse4 | |
804 | cglobal getResidual32, 4,5,7 | |
805 | mov r4d, 32/2 | |
806 | pxor m0, m0 | |
807 | .loop: | |
808 | movu m1, [r0] | |
809 | movu m2, [r0 + 16] | |
810 | movu m3, [r1] | |
811 | movu m4, [r1 + 16] | |
812 | pmovzxbw m5, m1 | |
813 | punpckhbw m1, m0 | |
814 | pmovzxbw m6, m3 | |
815 | punpckhbw m3, m0 | |
816 | psubw m5, m6 | |
817 | psubw m1, m3 | |
818 | movu [r2 + 0 * 16], m5 | |
819 | movu [r2 + 1 * 16], m1 | |
820 | ||
821 | pmovzxbw m5, m2 | |
822 | punpckhbw m2, m0 | |
823 | pmovzxbw m6, m4 | |
824 | punpckhbw m4, m0 | |
825 | psubw m5, m6 | |
826 | psubw m2, m4 | |
827 | movu [r2 + 2 * 16], m5 | |
828 | movu [r2 + 3 * 16], m2 | |
829 | ||
830 | movu m1, [r0 + r3] | |
831 | movu m2, [r0 + r3 + 16] | |
832 | movu m3, [r1 + r3] | |
833 | movu m4, [r1 + r3 + 16] | |
834 | pmovzxbw m5, m1 | |
835 | punpckhbw m1, m0 | |
836 | pmovzxbw m6, m3 | |
837 | punpckhbw m3, m0 | |
838 | psubw m5, m6 | |
839 | psubw m1, m3 | |
840 | movu [r2 + r3 * 2 + 0 * 16], m5 | |
841 | movu [r2 + r3 * 2 + 1 * 16], m1 | |
842 | ||
843 | pmovzxbw m5, m2 | |
844 | punpckhbw m2, m0 | |
845 | pmovzxbw m6, m4 | |
846 | punpckhbw m4, m0 | |
847 | psubw m5, m6 | |
848 | psubw m2, m4 | |
849 | movu [r2 + r3 * 2 + 2 * 16], m5 | |
850 | movu [r2 + r3 * 2 + 3 * 16], m2 | |
851 | ||
852 | dec r4d | |
853 | ||
854 | lea r0, [r0 + r3 * 2] | |
855 | lea r1, [r1 + r3 * 2] | |
856 | lea r2, [r2 + r3 * 4] | |
857 | ||
858 | jnz .loop | |
859 | %endif | |
860 | RET | |
861 | ||
862 | ||
863 | ;----------------------------------------------------------------------------- | |
864 | ; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); | |
865 | ;----------------------------------------------------------------------------- | |
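; Per-coefficient reference for quant() (a hedged scalar sketch, not
; assembled; it mirrors what the SSE4/AVX2 loops below compute, and
; 'sign'/'tmp' are just local names for illustration):
;
;   int sign  = (coef[i] < 0) ? -1 : 1;
;   int tmp   = abs(coef[i]) * quantCoeff[i];
;   int level = (tmp + add) >> qBits;
;   deltaU[i] = (tmp >> (qBits - 8)) - (level << 8);
;   qCoef[i]  = (int16_t)(level * sign);
;   numSig   += (level != 0);       // the return value is this non-zero count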
866 | INIT_XMM sse4 | |
867 | cglobal quant, 5,6,8 | |
868 | ; fill qbits | |
869 | movd m4, r4d ; m4 = qbits | |
870 | ||
871 | ; fill qbits-8 | |
872 | sub r4d, 8 | |
873 | movd m6, r4d ; m6 = qbits8 | |
874 | ||
875 | ; fill offset | |
876 | movd m5, r5m | |
877 | pshufd m5, m5, 0 ; m5 = add | |
878 | ||
879 | lea r5, [pd_1] | |
880 | ||
881 | mov r4d, r6m | |
882 | shr r4d, 3 | |
883 | pxor m7, m7 ; m7 = count of non-zero coefficients | |
884 | .loop: | |
885 | ; 4 coeff | |
886 | movu m0, [r0] ; m0 = level | |
887 | pabsd m1, m0 | |
888 | pmulld m1, [r1] ; m1 = tmpLevel1 | |
889 | paddd m2, m1, m5 | |
890 | psrad m2, m4 ; m2 = level1 | |
891 | ||
892 | pslld m3, m2, 8 | |
893 | psrad m1, m6 | |
894 | psubd m1, m3 ; m1 = deltaU1 | |
895 | ||
896 | movu [r2], m1 | |
897 | psignd m3, m2, m0 | |
898 | pminud m2, [r5] | |
899 | paddd m7, m2 | |
900 | packssdw m3, m3 | |
901 | movh [r3], m3 | |
902 | ||
903 | ; 4 coeff | |
904 | movu m0, [r0 + 16] ; m0 = level | |
905 | pabsd m1, m0 | |
906 | pmulld m1, [r1 + 16] ; m1 = tmpLevel1 | |
907 | paddd m2, m1, m5 | |
908 | psrad m2, m4 ; m2 = level1 | |
909 | pslld m3, m2, 8 | |
910 | psrad m1, m6 | |
911 | psubd m1, m3 ; m1 = deltaU1 | |
912 | movu [r2 + 16], m1 | |
913 | psignd m3, m2, m0 | |
914 | pminud m2, [r5] | |
915 | paddd m7, m2 | |
916 | packssdw m3, m3 | |
917 | movh [r3 + 8], m3 | |
918 | ||
919 | add r0, 32 | |
920 | add r1, 32 | |
921 | add r2, 32 | |
922 | add r3, 16 | |
923 | ||
924 | dec r4d | |
925 | jnz .loop | |
926 | ||
927 | pxor m0, m0 | |
928 | psadbw m7, m0 | |
929 | movhlps m0, m7 | |
930 | paddd m7, m0 | |
931 | movd eax, m7 | |
932 | RET | |
933 | ||
934 | ||
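; IACA_START / IACA_END below are presumably the Intel Architecture Code
; Analyzer markers, bracketing the AVX2 quant kernel so it can be analysed in
; isolation by that tool.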
935 | IACA_START | |
936 | %if ARCH_X86_64 == 1 | |
937 | INIT_YMM avx2 | |
938 | cglobal quant, 5,5,10 | |
939 | ; fill qbits | |
940 | movd xm4, r4d ; m4 = qbits | |
941 | ||
942 | ; fill qbits-8 | |
943 | sub r4d, 8 | |
944 | movd xm6, r4d ; m6 = qbits8 | |
945 | ||
946 | ; fill offset | |
947 | vpbroadcastd m5, r5m ; m5 = add | |
948 | ||
949 | vpbroadcastw m9, [pw_1] ; m9 = word [1] | |
950 | ||
951 | mov r4d, r6m | |
952 | shr r4d, 4 | |
953 | pxor m7, m7 ; m7 = count of non-zero coefficients | |
954 | .loop: | |
955 | ; 8 coeff | |
956 | movu m0, [r0] ; m0 = level | |
957 | pabsd m1, m0 | |
958 | pmulld m1, [r1] ; m1 = tmpLevel1 | |
959 | paddd m2, m1, m5 | |
960 | psrad m2, xm4 ; m2 = level1 | |
961 | ||
962 | pslld m3, m2, 8 | |
963 | psrad m1, xm6 | |
964 | psubd m1, m3 ; m1 = deltaU1 | |
965 | movu [r2], m1 | |
966 | psignd m2, m0 | |
967 | ||
968 | ; 8 coeff | |
969 | movu m0, [r0 + mmsize] ; m0 = level | |
970 | pabsd m1, m0 | |
971 | pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1 | |
972 | paddd m3, m1, m5 | |
973 | psrad m3, xm4 ; m3 = level1 | |
974 | ||
975 | pslld m8, m3, 8 | |
976 | psrad m1, xm6 | |
977 | psubd m1, m8 ; m1 = deltaU1 | |
978 | movu [r2 + mmsize], m1 | |
979 | psignd m3, m0 | |
980 | ||
981 | packssdw m2, m3 | |
982 | vpermq m2, m2, q3120 | |
983 | movu [r3], m2 | |
984 | ||
985 | ; count non-zero coeff | |
986 | ; TODO: popcnt is faster, but not all target CPUs support it | |
987 | pminuw m2, m9 | |
988 | paddw m7, m2 | |
989 | ||
990 | add r0, mmsize*2 | |
991 | add r1, mmsize*2 | |
992 | add r2, mmsize*2 | |
993 | add r3, mmsize | |
994 | ||
995 | dec r4d | |
996 | jnz .loop | |
997 | ||
998 | ; sum count | |
999 | xorpd m0, m0 | |
1000 | psadbw m7, m0 | |
1001 | vextracti128 xm1, m7, 1 | |
1002 | paddd xm7, xm1 | |
1003 | movhlps xm0, xm7 | |
1004 | paddd xm7, xm0 | |
1005 | movd eax, xm7 | |
1006 | RET | |
1007 | ||
1008 | %else ; ARCH_X86_64 == 1 | |
1009 | INIT_YMM avx2 | |
1010 | cglobal quant, 5,6,8 | |
1011 | ; fill qbits | |
1012 | movd xm4, r4d ; m4 = qbits | |
1013 | ||
1014 | ; fill qbits-8 | |
1015 | sub r4d, 8 | |
1016 | movd xm6, r4d ; m6 = qbits8 | |
1017 | ||
1018 | ; fill offset | |
1019 | vpbroadcastd m5, r5m ; m5 = add | |
1020 | ||
1021 | lea r5, [pd_1] | |
1022 | ||
1023 | mov r4d, r6m | |
1024 | shr r4d, 4 | |
1025 | pxor m7, m7 ; m7 = count of non-zero coefficients | |
1026 | .loop: | |
1027 | ; 8 coeff | |
1028 | movu m0, [r0] ; m0 = level | |
1029 | pabsd m1, m0 | |
1030 | pmulld m1, [r1] ; m1 = tmpLevel1 | |
1031 | paddd m2, m1, m5 | |
1032 | psrad m2, xm4 ; m2 = level1 | |
1033 | ||
1034 | pslld m3, m2, 8 | |
1035 | psrad m1, xm6 | |
1036 | psubd m1, m3 ; m1 = deltaU1 | |
1037 | ||
1038 | movu [r2], m1 | |
1039 | psignd m3, m2, m0 | |
1040 | pminud m2, [r5] | |
1041 | paddd m7, m2 | |
1042 | packssdw m3, m3 | |
1043 | vpermq m3, m3, q0020 | |
1044 | movu [r3], xm3 | |
1045 | ||
1046 | ; 8 coeff | |
1047 | movu m0, [r0 + mmsize] ; m0 = level | |
1048 | pabsd m1, m0 | |
1049 | pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1 | |
1050 | paddd m2, m1, m5 | |
1051 | psrad m2, xm4 ; m2 = level1 | |
1052 | ||
1053 | pslld m3, m2, 8 | |
1054 | psrad m1, xm6 | |
1055 | psubd m1, m3 ; m1 = deltaU1 | |
1056 | ||
1057 | movu [r2 + mmsize], m1 | |
1058 | psignd m3, m2, m0 | |
1059 | pminud m2, [r5] | |
1060 | paddd m7, m2 | |
1061 | packssdw m3, m3 | |
1062 | vpermq m3, m3, q0020 | |
1063 | movu [r3 + mmsize/2], xm3 | |
1064 | ||
1065 | add r0, mmsize*2 | |
1066 | add r1, mmsize*2 | |
1067 | add r2, mmsize*2 | |
1068 | add r3, mmsize | |
1069 | ||
1070 | dec r4d | |
1071 | jnz .loop | |
1072 | ||
1073 | xorpd m0, m0 | |
1074 | psadbw m7, m0 | |
1075 | vextracti128 xm1, m7, 1 | |
1076 | paddd xm7, xm1 | |
1077 | movhlps xm0, xm7 | |
1078 | paddd xm7, xm0 | |
1079 | movd eax, xm7 | |
1080 | RET | |
1081 | %endif ; ARCH_X86_64 == 1 | |
1082 | IACA_END | |
1083 | ||
1084 | ||
1085 | ;----------------------------------------------------------------------------- | |
1086 | ; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff); | |
1087 | ;----------------------------------------------------------------------------- | |
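; nquant() is the same quantisation step without the deltaU side output
; (scalar sketch, not assembled):
;
;   int level = (abs(coef[i]) * quantCoeff[i] + add) >> qBits;
;   qCoef[i]  = (int16_t)((coef[i] < 0) ? -level : level);
;
; and the return value is the number of non-zero qCoef[] entries.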
1088 | INIT_XMM sse4 | |
1089 | cglobal nquant, 3,5,8 | |
1090 | movd m6, r4m | |
1091 | mov r4d, r5m | |
1092 | pxor m7, m7 ; m7 = numZero | |
1093 | movd m5, r3m ; m5 = qbits | |
1094 | pshufd m6, m6, 0 ; m6 = add | |
1095 | mov r3d, r4d ; r3 = numCoeff | |
1096 | shr r4d, 3 | |
1097 | ||
1098 | .loop: | |
1099 | movu m0, [r0] ; m0 = level | |
1100 | movu m1, [r0 + 16] ; m1 = level | |
1101 | ||
1102 | pabsd m2, m0 | |
1103 | pmulld m2, [r1] ; m2 = tmpLevel1 * qcoeff | |
1104 | paddd m2, m6 | |
1105 | psrad m2, m5 ; m2 = level1 | |
1106 | psignd m2, m0 | |
1107 | ||
1108 | pabsd m3, m1 | |
1109 | pmulld m3, [r1 + 16] ; m3 = tmpLevel1 * qcoeff | |
1110 | paddd m3, m6 | |
1111 | psrad m3, m5 ; m3 = level1 | |
1112 | psignd m3, m1 | |
1113 | ||
1114 | packssdw m2, m3 | |
1115 | ||
1116 | movu [r2], m2 | |
1117 | add r0, 32 | |
1118 | add r1, 32 | |
1119 | add r2, 16 | |
1120 | ||
1121 | pxor m4, m4 | |
1122 | pcmpeqw m2, m4 | |
1123 | psubw m7, m2 | |
1124 | ||
1125 | dec r4d | |
1126 | jnz .loop | |
1127 | ||
1128 | packuswb m7, m7 | |
1129 | psadbw m7, m4 | |
1130 | mov eax, r3d | |
1131 | movd r4d, m7 | |
1132 | sub eax, r4d ; numSig | |
1133 | RET | |
1134 | ||
1135 | ||
1136 | INIT_YMM avx2 | |
1137 | cglobal nquant, 3,5,7 | |
1138 | vpbroadcastd m4, r4m | |
1139 | vpbroadcastd m6, [pw_1] | |
1140 | mov r4d, r5m | |
1141 | pxor m5, m5 ; m5 = count of non-zero coefficients | |
1142 | movd xm3, r3m ; m3 = qbits | |
1143 | mov r3d, r4d ; r3 = numCoeff | |
1144 | shr r4d, 4 | |
1145 | ||
1146 | .loop: | |
1147 | movu m0, [r0] ; m0 = level | |
1148 | pabsd m1, m0 | |
1149 | pmulld m1, [r1] ; m1 = tmpLevel1 * qcoeff | |
1150 | paddd m1, m4 | |
1151 | psrad m1, xm3 ; m1 = level1 | |
1152 | psignd m1, m0 | |
1153 | ||
1154 | movu m0, [r0 + mmsize] ; m0 = level | |
1155 | pabsd m2, m0 | |
1156 | pmulld m2, [r1 + mmsize] ; m2 = tmpLevel1 * qcoeff | |
1157 | paddd m2, m4 | |
1158 | psrad m2, xm3 ; m2 = level1 | |
1159 | psignd m2, m0 | |
1160 | ||
1161 | packssdw m1, m2 | |
1162 | vpermq m2, m1, q3120 | |
1163 | ||
1164 | movu [r2], m2 | |
1165 | add r0, mmsize * 2 | |
1166 | add r1, mmsize * 2 | |
1167 | add r2, mmsize | |
1168 | ||
1169 | pminuw m1, m6 | |
1170 | paddw m5, m1 | |
1171 | ||
1172 | dec r4d | |
1173 | jnz .loop | |
1174 | ||
1175 | pxor m0, m0 | |
1176 | psadbw m5, m0 | |
1177 | vextracti128 xm0, m5, 1 | |
1178 | paddd xm5, xm0 | |
1179 | pshufd xm0, xm5, 2 | |
1180 | paddd xm5, xm0 | |
1181 | movd eax, xm5 | |
1182 | RET | |
1183 | ||
1184 | ||
1185 | ;----------------------------------------------------------------------------- | |
1186 | ; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift) | |
1187 | ;----------------------------------------------------------------------------- | |
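; Scalar form of dequant_normal() (sketch only, not assembled; clip_int16()
; is shorthand for clamping to [-32768, 32767]):
;
;   int add = 1 << (shift - 1);
;   coef[i] = clip_int16((quantCoef[i] * scale + add) >> shift);
;
; Both versions below fold 'add' into the multiplier: each quantCoef word is
; interleaved with the constant 1 and multiplied against the dword
; {scale, add} (the bts sets bit shift-1 of the high word of r3d), so one
; pmaddwd yields quantCoef*scale + add. In HIGH_BIT_DEPTH builds a scale above
; 32767 is pre-shifted right by 2 (and shift reduced by 2) so it still fits
; the signed 16-bit pmaddwd multiplier.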
1188 | INIT_XMM sse4 | |
1189 | cglobal dequant_normal, 5,5,5 | |
1190 | mova m2, [pw_1] | |
1191 | %if HIGH_BIT_DEPTH | |
1192 | cmp r3d, 32767 | |
1193 | jle .skip | |
1194 | shr r3d, 2 | |
1195 | sub r4d, 2 | |
1196 | .skip: | |
1197 | %endif | |
1198 | movd m0, r4d ; m0 = shift | |
1199 | add r4d, 15 | |
1200 | bts r3d, r4d | |
1201 | movd m1, r3d | |
1202 | pshufd m1, m1, 0 ; m1 = dword [add scale] | |
1203 | ; m0 = shift | |
1204 | ; m1 = scale | |
1205 | ; m2 = word [1] | |
1206 | .loop: | |
1207 | movu m3, [r0] | |
1208 | punpckhwd m4, m3, m2 | |
1209 | punpcklwd m3, m2 | |
1210 | pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add) | |
1211 | pmaddwd m4, m1 | |
1212 | psrad m3, m0 | |
1213 | psrad m4, m0 | |
1214 | packssdw m3, m3 ; OPT_ME: store must be 32 bits | |
1215 | pmovsxwd m3, m3 | |
1216 | packssdw m4, m4 | |
1217 | pmovsxwd m4, m4 | |
1218 | mova [r1], m3 | |
1219 | mova [r1 + 16], m4 | |
1220 | ||
1221 | add r0, 16 | |
1222 | add r1, 32 | |
1223 | ||
1224 | sub r2d, 8 | |
1225 | jnz .loop | |
1226 | RET | |
1227 | ||
1228 | ||
1229 | INIT_YMM avx2 | |
1230 | cglobal dequant_normal, 5,5,7 | |
1231 | vpbroadcastd m2, [pw_1] ; m2 = word [1] | |
1232 | vpbroadcastd m5, [pd_32767] ; m5 = dword [32767] | |
1233 | vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768] | |
1234 | %if HIGH_BIT_DEPTH | |
1235 | cmp r3d, 32767 | |
1236 | jle .skip | |
1237 | shr r3d, 2 | |
1238 | sub r4d, 2 | |
1239 | .skip: | |
1240 | %endif | |
1241 | movd xm0, r4d ; m0 = shift | |
1242 | add r4d, -1+16 | |
1243 | bts r3d, r4d | |
1244 | vpbroadcastd m1, r3d ; m1 = dword [add scale] | |
1245 | ||
1246 | ; m0 = shift | |
1247 | ; m1 = scale | |
1248 | ; m2 = word [1] | |
1249 | shr r2d, 4 | |
1250 | .loop: | |
1251 | movu m3, [r0] | |
1252 | punpckhwd m4, m3, m2 | |
1253 | punpcklwd m3, m2 | |
1254 | pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add) | |
1255 | pmaddwd m4, m1 | |
1256 | psrad m3, xm0 | |
1257 | psrad m4, xm0 | |
1258 | pminsd m3, m5 | |
1259 | pmaxsd m3, m6 | |
1260 | pminsd m4, m5 | |
1261 | pmaxsd m4, m6 | |
1262 | mova [r1 + 0 * mmsize/2], xm3 | |
1263 | mova [r1 + 1 * mmsize/2], xm4 | |
1264 | vextracti128 [r1 + 2 * mmsize/2], m3, 1 | |
1265 | vextracti128 [r1 + 3 * mmsize/2], m4, 1 | |
1266 | ||
1267 | add r0, mmsize | |
1268 | add r1, mmsize * 2 | |
1269 | ||
1270 | dec r2d | |
1271 | jnz .loop | |
1272 | RET | |
1273 | ||
1274 | ||
1275 | ;----------------------------------------------------------------------------- | |
1276 | ; int count_nonzero(const int16_t *quantCoeff, int numCoeff); | |
1277 | ;----------------------------------------------------------------------------- | |
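; The SSSE3 routine below counts without branches: every byte of an
; accumulator is seeded with numCoeff/16, each group of 16 coefficients is
; packed to bytes and compared against zero (giving -1 per zero coefficient),
; that mask is added to the accumulator, and a final psadbw reduces the bytes.
; The result is numCoeff - numZero, i.e. the number of non-zero coefficients.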
1278 | INIT_XMM ssse3 | |
1279 | cglobal count_nonzero, 2,2,3 | |
1280 | pxor m0, m0 | |
1281 | shr r1d, 4 | |
1282 | movd m1, r1d | |
1283 | pshufb m1, m0 | |
1284 | ||
1285 | .loop: | |
1286 | mova m2, [r0 + 0] | |
1287 | packsswb m2, [r0 + 16] | |
1288 | add r0, 32 | |
1289 | pcmpeqb m2, m0 | |
1290 | paddb m1, m2 | |
1291 | dec r1d | |
1292 | jnz .loop | |
1293 | ||
1294 | psadbw m1, m0 | |
1295 | pshufd m0, m1, 2 | |
1296 | paddd m0, m1 | |
1297 | movd eax, m0 | |
1298 | RET | |
1299 | ||
1300 | ||
1301 | ;----------------------------------------------------------------------------------------------------------------------------------------------- | |
1302 | ;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset) | |
1303 | ;----------------------------------------------------------------------------------------------------------------------------------------------- | |
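; Scalar reference for weight_pp (sketch only, not assembled; ClipByte() is
; shorthand for clamping to [0, 255]):
;
;   dst[x] = ClipByte((((src[x] * (w0 << 6)) + round) >> shift) + offset);
;
; Each source pixel is zero-extended and interleaved with the constant 1, so a
; single pmaddwd against the packed pair {w0 << 6, round} performs the
; multiply and the rounding add together; that is why w0 and round are packed
; into one dword.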
1304 | INIT_XMM sse4 | |
1305 | cglobal weight_pp, 6, 7, 6 | |
1306 | ||
1307 | shl r5d, 6 ; m0 = [w0<<6] | |
1308 | mov r6d, r6m | |
1309 | shl r6d, 16 | |
1310 | or r6d, r5d ; assumes both (w0<<6) and round fit in 16 bits each. | |
1311 | movd m0, r6d | |
1312 | pshufd m0, m0, 0 ; m0 = [w0<<6, round] | |
1313 | movd m1, r7m | |
1314 | movd m2, r8m | |
1315 | pshufd m2, m2, 0 | |
1316 | mova m5, [pw_1] | |
1317 | sub r2d, r3d | |
1318 | shr r3d, 4 | |
1319 | ||
1320 | .loopH: | |
1321 | mov r5d, r3d | |
1322 | ||
1323 | .loopW: | |
1324 | pmovzxbw m4, [r0] | |
1325 | punpcklwd m3, m4, m5 | |
1326 | pmaddwd m3, m0 | |
1327 | psrad m3, m1 | |
1328 | paddd m3, m2 | |
1329 | ||
1330 | punpckhwd m4, m5 | |
1331 | pmaddwd m4, m0 | |
1332 | psrad m4, m1 | |
1333 | paddd m4, m2 | |
1334 | ||
1335 | packssdw m3, m4 | |
1336 | packuswb m3, m3 | |
1337 | movh [r1], m3 | |
1338 | ||
1339 | pmovzxbw m4, [r0 + 8] | |
1340 | punpcklwd m3, m4, m5 | |
1341 | pmaddwd m3, m0 | |
1342 | psrad m3, m1 | |
1343 | paddd m3, m2 | |
1344 | ||
1345 | punpckhwd m4, m5 | |
1346 | pmaddwd m4, m0 | |
1347 | psrad m4, m1 | |
1348 | paddd m4, m2 | |
1349 | ||
1350 | packssdw m3, m4 | |
1351 | packuswb m3, m3 | |
1352 | movh [r1 + 8], m3 | |
1353 | ||
1354 | add r0, 16 | |
1355 | add r1, 16 | |
1356 | ||
1357 | dec r5d | |
1358 | jnz .loopW | |
1359 | ||
1360 | lea r0, [r0 + r2] | |
1361 | lea r1, [r1 + r2] | |
1362 | ||
1363 | dec r4d | |
1364 | jnz .loopH | |
1365 | RET | |
1366 | ||
1367 | ||
1368 | INIT_YMM avx2 | |
1369 | cglobal weight_pp, 6, 7, 6 | |
1370 | ||
1371 | shl r5d, 6 ; m0 = [w0<<6] | |
1372 | mov r6d, r6m | |
1373 | shl r6d, 16 | |
1374 | or r6d, r5d ; assumes both (w0<<6) and round fit in 16 bits each. | |
1375 | movd xm0, r6d | |
1376 | pshufd xm0, xm0, 0 ; m0 = [w0<<6, round] | |
1377 | vinserti128 m0, m0, xm0, 1 ; the documentation says (pshufd + vinserti128) can be replaced by vpbroadcastd m0, xm0, but that currently breaks the build; needs investigation | |
1378 | ||
1379 | movd xm1, r7m | |
1380 | vpbroadcastd m2, r8m | |
1381 | mova m5, [pw_1] | |
1382 | sub r2d, r3d | |
1383 | shr r3d, 4 | |
1384 | ||
1385 | .loopH: | |
1386 | mov r5d, r3d | |
1387 | ||
1388 | .loopW: | |
1389 | pmovzxbw m4, [r0] | |
1390 | punpcklwd m3, m4, m5 | |
1391 | pmaddwd m3, m0 | |
1392 | psrad m3, xm1 | |
1393 | paddd m3, m2 | |
1394 | ||
1395 | punpckhwd m4, m5 | |
1396 | pmaddwd m4, m0 | |
1397 | psrad m4, xm1 | |
1398 | paddd m4, m2 | |
1399 | ||
1400 | packssdw m3, m4 | |
1401 | vextracti128 xm4, m3, 1 | |
1402 | packuswb xm3, xm4 | |
1403 | movu [r1], xm3 | |
1404 | ||
1405 | add r0, 16 | |
1406 | add r1, 16 | |
1407 | ||
1408 | dec r5d | |
1409 | jnz .loopW | |
1410 | ||
1411 | lea r0, [r0 + r2] | |
1412 | lea r1, [r1 + r2] | |
1413 | ||
1414 | dec r4d | |
1415 | jnz .loopH | |
1416 | RET | |
1417 | ||
1418 | ;------------------------------------------------------------------------------------------------------------------------------------------------- | |
1419 | ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) | |
1420 | ;------------------------------------------------------------------------------------------------------------------------------------------------- | |
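; Scalar reference for weight_sp (sketch only, not assembled; ClipByte() as in
; weight_pp above):
;
;   int val = src[x] + 0x2000;                  // pw_2000 bias added by paddw
;   dst[x]  = ClipByte((((val * w0) + round) >> shift) + offset);
;
; As in weight_pp, {w0, round} are packed per dword so one pmaddwd per four
; samples performs the multiply and the rounding add in a single step.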
1421 | INIT_XMM sse4 | |
1422 | %if ARCH_X86_64 | |
1423 | cglobal weight_sp, 6, 7+2, 7 | |
1424 | %define tmp_r0 r7 | |
1425 | %define tmp_r1 r8 | |
1426 | %else ; ARCH_X86_64 = 0 | |
1427 | cglobal weight_sp, 6, 7, 7, 0-(2*4) | |
1428 | %define tmp_r0 [(rsp + 0 * 4)] | |
1429 | %define tmp_r1 [(rsp + 1 * 4)] | |
1430 | %endif ; ARCH_X86_64 | |
1431 | ||
1432 | movd m0, r6m ; m0 = [w0] | |
1433 | ||
1434 | movd m1, r7m ; m1 = [round] | |
1435 | punpcklwd m0, m1 | |
1436 | pshufd m0, m0, 0 ; m0 = [w0 round] | |
1437 | ||
1438 | movd m1, r8m ; m1 = [shift] | |
1439 | ||
1440 | movd m2, r9m | |
1441 | pshufd m2, m2, 0 ; m2 =[offset] | |
1442 | ||
1443 | mova m3, [pw_1] | |
1444 | mova m4, [pw_2000] | |
1445 | ||
1446 | add r2d, r2d | |
1447 | ||
1448 | .loopH: | |
1449 | mov r6d, r4d | |
1450 | ||
1451 | ; save old src and dst | |
1452 | mov tmp_r0, r0 | |
1453 | mov tmp_r1, r1 | |
1454 | .loopW: | |
1455 | movu m5, [r0] | |
1456 | paddw m5, m4 | |
1457 | ||
1458 | punpcklwd m6,m5, m3 | |
1459 | pmaddwd m6, m0 | |
1460 | psrad m6, m1 | |
1461 | paddd m6, m2 | |
1462 | ||
1463 | punpckhwd m5, m3 | |
1464 | pmaddwd m5, m0 | |
1465 | psrad m5, m1 | |
1466 | paddd m5, m2 | |
1467 | ||
1468 | packssdw m6, m5 | |
1469 | packuswb m6, m6 | |
1470 | ||
1471 | sub r6d, 8 | |
1472 | jl .width4 | |
1473 | movh [r1], m6 | |
1474 | je .nextH | |
1475 | add r0, 16 | |
1476 | add r1, 8 | |
1477 | ||
1478 | jmp .loopW | |
1479 | ||
1480 | .width4: | |
1481 | cmp r6d, -4 | |
1482 | jl .width2 | |
1483 | movd [r1], m6 | |
1484 | je .nextH | |
1485 | add r1, 4 | |
1486 | pshufd m6, m6, 1 | |
1487 | ||
1488 | .width2: | |
1489 | pextrw [r1], m6, 0 | |
1490 | ||
1491 | .nextH: | |
1492 | mov r0, tmp_r0 | |
1493 | mov r1, tmp_r1 | |
1494 | lea r0, [r0 + r2] | |
1495 | lea r1, [r1 + r3] | |
1496 | ||
1497 | dec r5d | |
1498 | jnz .loopH | |
1499 | ||
1500 | RET | |
1501 | ||
1502 | ;----------------------------------------------------------------- | |
1503 | ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride) | |
1504 | ;----------------------------------------------------------------- | |
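; All transpose kernels in this file use interleave instructions rather than
; scalar element moves: punpckl/h{bw,wd,dq,qdq} merge pairs of rows, and after
; log2(N) such passes each register holds transposed rows in column-major
; order, ready for straight stores. The 4x4 case below needs just two passes
; (byte level, then word level) before a single 16-byte store.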
1505 | INIT_XMM sse2 | |
1506 | cglobal transpose4, 3, 3, 4, dest, src, stride | |
1507 | %if HIGH_BIT_DEPTH == 1 | |
1508 | add r2, r2 | |
1509 | movh m0, [r1] | |
1510 | movh m1, [r1 + r2] | |
1511 | movh m2, [r1 + 2 * r2] | |
1512 | lea r1, [r1 + 2 * r2] | |
1513 | movh m3, [r1 + r2] | |
1514 | punpcklwd m0, m1 | |
1515 | punpcklwd m2, m3 | |
1516 | punpckhdq m1, m0, m2 | |
1517 | punpckldq m0, m2 | |
1518 | movu [r0], m0 | |
1519 | movu [r0 + 16], m1 | |
1520 | %else ;HIGH_BIT_DEPTH == 0 | |
1521 | movd m0, [r1] | |
1522 | movd m1, [r1 + r2] | |
1523 | movd m2, [r1 + 2 * r2] | |
1524 | lea r1, [r1 + 2 * r2] | |
1525 | movd m3, [r1 + r2] | |
1526 | ||
1527 | punpcklbw m0, m1 | |
1528 | punpcklbw m2, m3 | |
1529 | punpcklwd m0, m2 | |
1530 | movu [r0], m0 | |
1531 | %endif | |
1532 | RET | |
1533 | ||
1534 | ;----------------------------------------------------------------- | |
1535 | ; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride) | |
1536 | ;----------------------------------------------------------------- | |
1537 | %if HIGH_BIT_DEPTH == 1 | |
1538 | %if ARCH_X86_64 == 1 | |
1539 | INIT_YMM avx2 | |
1540 | cglobal transpose8, 3, 5, 5 | |
1541 | add r2, r2 | |
1542 | lea r3, [3 * r2] | |
1543 | lea r4, [r1 + 4 * r2] | |
1544 | movu xm0, [r1] | |
1545 | vinserti128 m0, m0, [r4], 1 | |
1546 | movu xm1, [r1 + r2] | |
1547 | vinserti128 m1, m1, [r4 + r2], 1 | |
1548 | movu xm2, [r1 + 2 * r2] | |
1549 | vinserti128 m2, m2, [r4 + 2 * r2], 1 | |
1550 | movu xm3, [r1 + r3] | |
1551 | vinserti128 m3, m3, [r4 + r3], 1 | |
1552 | ||
1553 | punpcklwd m4, m0, m1 ;[1 - 4][row1row2;row5row6] | |
1554 | punpckhwd m0, m1 ;[5 - 8][row1row2;row5row6] | |
1555 | ||
1556 | punpcklwd m1, m2, m3 ;[1 - 4][row3row4;row7row8] | |
1557 | punpckhwd m2, m3 ;[5 - 8][row3row4;row7row8] | |
1558 | ||
1559 | punpckldq m3, m4, m1 ;[1 - 2][row1row2row3row4;row5row6row7row8] | |
1560 | punpckhdq m4, m1 ;[3 - 4][row1row2row3row4;row5row6row7row8] | |
1561 | ||
1562 | punpckldq m1, m0, m2 ;[5 - 6][row1row2row3row4;row5row6row7row8] | |
1563 | punpckhdq m0, m2 ;[7 - 8][row1row2row3row4;row5row6row7row8] | |
1564 | ||
1565 | vpermq m3, m3, 0xD8 ;[1 ; 2][row1row2row3row4row5row6row7row8] | |
1566 | vpermq m4, m4, 0xD8 ;[3 ; 4][row1row2row3row4row5row6row7row8] | |
1567 | vpermq m1, m1, 0xD8 ;[5 ; 6][row1row2row3row4row5row6row7row8] | |
1568 | vpermq m0, m0, 0xD8 ;[7 ; 8][row1row2row3row4row5row6row7row8] | |
1569 | ||
1570 | movu [r0 + 0 * 32], m3 | |
1571 | movu [r0 + 1 * 32], m4 | |
1572 | movu [r0 + 2 * 32], m1 | |
1573 | movu [r0 + 3 * 32], m0 | |
1574 | RET | |
1575 | %endif | |
1576 | ||
1577 | INIT_XMM sse2 | |
1578 | %macro TRANSPOSE_4x4 1 | |
1579 | movh m0, [r1] | |
1580 | movh m1, [r1 + r2] | |
1581 | movh m2, [r1 + 2 * r2] | |
1582 | lea r1, [r1 + 2 * r2] | |
1583 | movh m3, [r1 + r2] | |
1584 | punpcklwd m0, m1 | |
1585 | punpcklwd m2, m3 | |
1586 | punpckhdq m1, m0, m2 | |
1587 | punpckldq m0, m2 | |
1588 | movh [r0], m0 | |
1589 | movhps [r0 + %1], m0 | |
1590 | movh [r0 + 2 * %1], m1 | |
1591 | lea r0, [r0 + 2 * %1] | |
1592 | movhps [r0 + %1], m1 | |
1593 | %endmacro | |
1594 | cglobal transpose8_internal | |
1595 | TRANSPOSE_4x4 r5 | |
1596 | lea r1, [r1 + 2 * r2] | |
1597 | lea r0, [r3 + 8] | |
1598 | TRANSPOSE_4x4 r5 | |
1599 | lea r1, [r1 + 2 * r2] | |
1600 | neg r2 | |
1601 | lea r1, [r1 + r2 * 8 + 8] | |
1602 | neg r2 | |
1603 | lea r0, [r3 + 4 * r5] | |
1604 | TRANSPOSE_4x4 r5 | |
1605 | lea r1, [r1 + 2 * r2] | |
1606 | lea r0, [r3 + 8 + 4 * r5] | |
1607 | TRANSPOSE_4x4 r5 | |
1608 | ret | |
1609 | cglobal transpose8, 3, 6, 4, dest, src, stride | |
1610 | add r2, r2 | |
1611 | mov r3, r0 | |
1612 | mov r5, 16 | |
1613 | call transpose8_internal | |
1614 | RET | |
1615 | %else ;HIGH_BIT_DEPTH == 0 | |
1616 | %if ARCH_X86_64 == 1 | |
1617 | INIT_YMM avx2 | |
1618 | cglobal transpose8, 3, 4, 4 | |
1619 | lea r3, [r2 * 3] | |
1620 | movq xm0, [r1] | |
1621 | movhps xm0, [r1 + 2 * r2] | |
1622 | movq xm1, [r1 + r2] | |
1623 | movhps xm1, [r1 + r3] | |
1624 | lea r1, [r1 + 4 * r2] | |
1625 | movq xm2, [r1] | |
1626 | movhps xm2, [r1 + 2 * r2] | |
1627 | movq xm3, [r1 + r2] | |
1628 | movhps xm3, [r1 + r3] | |
1629 | ||
1630 | vinserti128 m0, m0, xm2, 1 ;[row1 row3 row5 row7] | |
1631 | vinserti128 m1, m1, xm3, 1 ;[row2 row4 row6 row8] | |
1632 | ||
1633 | punpcklbw m2, m0, m1 ;[1 - 8; 1 - 8][row1row2; row5row6] | |
1634 | punpckhbw m0, m1 ;[1 - 8; 1 - 8][row3row4; row7row8] | |
1635 | ||
1636 | punpcklwd m1, m2, m0 ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8] | |
1637 | punpckhwd m2, m0 ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8] | |
1638 | ||
1639 | mova m0, [trans8_shuf] | |
1640 | ||
1641 | vpermd m1, m0, m1 ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8] | |
1642 | vpermd m2, m0, m2 ;[5 - 6; 7 - 8][row1row2row3row4row5row6row7row8] | |
1643 | ||
1644 | movu [r0], m1 | |
1645 | movu [r0 + 32], m2 | |
1646 | RET | |
1647 | %endif | |
1648 | ||
1649 | INIT_XMM sse2 | |
1650 | cglobal transpose8, 3, 5, 8, dest, src, stride | |
1651 | lea r3, [2 * r2] | |
1652 | lea r4, [3 * r2] | |
1653 | movh m0, [r1] | |
1654 | movh m1, [r1 + r2] | |
1655 | movh m2, [r1 + r3] | |
1656 | movh m3, [r1 + r4] | |
1657 | movh m4, [r1 + 4 * r2] | |
1658 | lea r1, [r1 + 4 * r2] | |
1659 | movh m5, [r1 + r2] | |
1660 | movh m6, [r1 + r3] | |
1661 | movh m7, [r1 + r4] | |
1662 | ||
1663 | punpcklbw m0, m1 | |
1664 | punpcklbw m2, m3 | |
1665 | punpcklbw m4, m5 | |
1666 | punpcklbw m6, m7 | |
1667 | ||
1668 | punpckhwd m1, m0, m2 | |
1669 | punpcklwd m0, m2 | |
1670 | punpckhwd m5, m4, m6 | |
1671 | punpcklwd m4, m6 | |
1672 | punpckhdq m2, m0, m4 | |
1673 | punpckldq m0, m4 | |
1674 | punpckhdq m3, m1, m5 | |
1675 | punpckldq m1, m5 | |
1676 | ||
1677 | movu [r0], m0 | |
1678 | movu [r0 + 16], m2 | |
1679 | movu [r0 + 32], m1 | |
1680 | movu [r0 + 48], m3 | |
1681 | RET | |
1682 | %endif | |
1683 | ||
1684 | %macro TRANSPOSE_8x8 1 | |
1685 | ||
1686 | movh m0, [r1] | |
1687 | movh m1, [r1 + r2] | |
1688 | movh m2, [r1 + 2 * r2] | |
1689 | lea r1, [r1 + 2 * r2] | |
1690 | movh m3, [r1 + r2] | |
1691 | movh m4, [r1 + 2 * r2] | |
1692 | lea r1, [r1 + 2 * r2] | |
1693 | movh m5, [r1 + r2] | |
1694 | movh m6, [r1 + 2 * r2] | |
1695 | lea r1, [r1 + 2 * r2] | |
1696 | movh m7, [r1 + r2] | |
1697 | ||
1698 | punpcklbw m0, m1 | |
1699 | punpcklbw m2, m3 | |
1700 | punpcklbw m4, m5 | |
1701 | punpcklbw m6, m7 | |
1702 | ||
1703 | punpckhwd m1, m0, m2 | |
1704 | punpcklwd m0, m2 | |
1705 | punpckhwd m5, m4, m6 | |
1706 | punpcklwd m4, m6 | |
1707 | punpckhdq m2, m0, m4 | |
1708 | punpckldq m0, m4 | |
1709 | punpckhdq m3, m1, m5 | |
1710 | punpckldq m1, m5 | |
1711 | ||
1712 | movh [r0], m0 | |
1713 | movhps [r0 + %1], m0 | |
1714 | movh [r0 + 2 * %1], m2 | |
1715 | lea r0, [r0 + 2 * %1] | |
1716 | movhps [r0 + %1], m2 | |
1717 | movh [r0 + 2 * %1], m1 | |
1718 | lea r0, [r0 + 2 * %1] | |
1719 | movhps [r0 + %1], m1 | |
1720 | movh [r0 + 2 * %1], m3 | |
1721 | lea r0, [r0 + 2 * %1] | |
1722 | movhps [r0 + %1], m3 | |
1723 | ||
1724 | %endmacro | |
1725 | ||
1726 | ||
1727 | ;----------------------------------------------------------------- | |
1728 | ; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride) | |
1729 | ;----------------------------------------------------------------- | |
1730 | %if HIGH_BIT_DEPTH == 1 | |
1731 | %if ARCH_X86_64 == 1 | |
1732 | INIT_YMM avx2 | |
1733 | cglobal transpose16x8_internal | |
1734 | movu m0, [r1] | |
1735 | movu m1, [r1 + r2] | |
1736 | movu m2, [r1 + 2 * r2] | |
1737 | movu m3, [r1 + r3] | |
1738 | lea r1, [r1 + 4 * r2] | |
1739 | ||
1740 | movu m4, [r1] | |
1741 | movu m5, [r1 + r2] | |
1742 | movu m6, [r1 + 2 * r2] | |
1743 | movu m7, [r1 + r3] | |
1744 | ||
1745 | punpcklwd m8, m0, m1 ;[1 - 4; 9 - 12][1 2] | |
1746 | punpckhwd m0, m1 ;[5 - 8; 13 -16][1 2] | |
1747 | ||
1748 | punpcklwd m1, m2, m3 ;[1 - 4; 9 - 12][3 4] | |
1749 | punpckhwd m2, m3 ;[5 - 8; 13 -16][3 4] | |
1750 | ||
1751 | punpcklwd m3, m4, m5 ;[1 - 4; 9 - 12][5 6] | |
1752 | punpckhwd m4, m5 ;[5 - 8; 13 -16][5 6] | |
1753 | ||
1754 | punpcklwd m5, m6, m7 ;[1 - 4; 9 - 12][7 8] | |
1755 | punpckhwd m6, m7 ;[5 - 8; 13 -16][7 8] | |
1756 | ||
1757 | punpckldq m7, m8, m1 ;[1 - 2; 9 - 10][1 2 3 4] | |
1758 | punpckhdq m8, m1 ;[3 - 4; 11 - 12][1 2 3 4] | |
1759 | ||
1760 | punpckldq m1, m3, m5 ;[1 - 2; 9 - 10][5 6 7 8] | |
1761 | punpckhdq m3, m5 ;[3 - 4; 11 - 12][5 6 7 8] | |
1762 | ||
1763 | punpckldq m5, m0, m2 ;[5 - 6; 13 - 14][1 2 3 4] | |
1764 | punpckhdq m0, m2 ;[7 - 8; 15 - 16][1 2 3 4] | |
1765 | ||
1766 | punpckldq m2, m4, m6 ;[5 - 6; 13 - 14][5 6 7 8] | |
1767 | punpckhdq m4, m6 ;[7 - 8; 15 - 16][5 6 7 8] | |
1768 | ||
1769 | punpcklqdq m6, m7, m1 ;[1 ; 9 ][1 2 3 4 5 6 7 8] | |
1770 | punpckhqdq m7, m1 ;[2 ; 10][1 2 3 4 5 6 7 8] | |
1771 | ||
1772 | punpcklqdq m1, m8, m3 ;[3 ; 11][1 2 3 4 5 6 7 8] | |
1773 | punpckhqdq m8, m3 ;[4 ; 12][1 2 3 4 5 6 7 8] | |
1774 | ||
1775 | punpcklqdq m3, m5, m2 ;[5 ; 13][1 2 3 4 5 6 7 8] | |
1776 | punpckhqdq m5, m2 ;[6 ; 14][1 2 3 4 5 6 7 8] | |
1777 | ||
1778 | punpcklqdq m2, m0, m4 ;[7 ; 15][1 2 3 4 5 6 7 8] | |
1779 | punpckhqdq m0, m4 ;[8 ; 16][1 2 3 4 5 6 7 8] | |
1780 | ||
1781 | movu [r0 + 0 * 32], xm6 | |
1782 | vextracti128 [r0 + 8 * 32], m6, 1 | |
1783 | movu [r0 + 1 * 32], xm7 | |
1784 | vextracti128 [r0 + 9 * 32], m7, 1 | |
1785 | movu [r0 + 2 * 32], xm1 | |
1786 | vextracti128 [r0 + 10 * 32], m1, 1 | |
1787 | movu [r0 + 3 * 32], xm8 | |
1788 | vextracti128 [r0 + 11 * 32], m8, 1 | |
1789 | movu [r0 + 4 * 32], xm3 | |
1790 | vextracti128 [r0 + 12 * 32], m3, 1 | |
1791 | movu [r0 + 5 * 32], xm5 | |
1792 | vextracti128 [r0 + 13 * 32], m5, 1 | |
1793 | movu [r0 + 6 * 32], xm2 | |
1794 | vextracti128 [r0 + 14 * 32], m2, 1 | |
1795 | movu [r0 + 7 * 32], xm0 | |
1796 | vextracti128 [r0 + 15 * 32], m0, 1 | |
1797 | ret | |
1798 | ||
1799 | cglobal transpose16, 3, 4, 9 | |
1800 | add r2, r2 | |
1801 | lea r3, [r2 * 3] | |
1802 | call transpose16x8_internal | |
1803 | lea r1, [r1 + 4 * r2] | |
1804 | add r0, 16 | |
1805 | call transpose16x8_internal | |
1806 | RET | |
1807 | %endif | |
1808 | INIT_XMM sse2 | |
1809 | cglobal transpose16, 3, 7, 4, dest, src, stride | |
1810 | add r2, r2 | |
1811 | mov r3, r0 | |
1812 | mov r4, r1 | |
1813 | mov r5, 32 | |
1814 | mov r6, r0 | |
1815 | call transpose8_internal | |
1816 | lea r1, [r1 - 8 + 2 * r2] | |
1817 | lea r0, [r6 + 16] | |
1818 | mov r3, r0 | |
1819 | call transpose8_internal | |
1820 | lea r1, [r4 + 16] | |
1821 | lea r0, [r6 + 8 * r5] | |
1822 | mov r3, r0 | |
1823 | call transpose8_internal | |
1824 | lea r1, [r1 - 8 + 2 * r2] | |
1825 | lea r0, [r6 + 8 * r5 + 16] | |
1826 | mov r3, r0 | |
1827 | call transpose8_internal | |
1828 | RET | |
1829 | %else ;HIGH_BIT_DEPTH == 0 | |
1830 | %if ARCH_X86_64 == 1 | |
1831 | INIT_YMM avx2 | |
1832 | cglobal transpose16, 3, 5, 9 | |
1833 | lea r3, [r2 * 3] | |
1834 | lea r4, [r1 + 8 * r2] | |
1835 | ||
1836 | movu xm0, [r1] | |
1837 | movu xm1, [r1 + r2] | |
1838 | movu xm2, [r1 + 2 * r2] | |
1839 | movu xm3, [r1 + r3] | |
1840 | vinserti128 m0, m0, [r4], 1 | |
1841 | vinserti128 m1, m1, [r4 + r2], 1 | |
1842 | vinserti128 m2, m2, [r4 + 2 * r2], 1 | |
1843 | vinserti128 m3, m3, [r4 + r3], 1 | |
1844 | lea r1, [r1 + 4 * r2] | |
1845 | lea r4, [r4 + 4 * r2] | |
1846 | ||
1847 | movu xm4, [r1] | |
1848 | movu xm5, [r1 + r2] | |
1849 | movu xm6, [r1 + 2 * r2] | |
1850 | movu xm7, [r1 + r3] | |
1851 | vinserti128 m4, m4, [r4], 1 | |
1852 | vinserti128 m5, m5, [r4 + r2], 1 | |
1853 | vinserti128 m6, m6, [r4 + 2 * r2], 1 | |
1854 | vinserti128 m7, m7, [r4 + r3], 1 | |
1855 | ||
1856 | punpcklbw m8, m0, m1 ;[1 - 8 ; 1 - 8 ][1 2 9 10] | |
1857 | punpckhbw m0, m1 ;[9 - 16; 9 - 16][1 2 9 10] | |
1858 | ||
1859 | punpcklbw m1, m2, m3 ;[1 - 8 ; 1 - 8 ][3 4 11 12] | |
1860 | punpckhbw m2, m3 ;[9 - 16; 9 - 16][3 4 11 12] | |
1861 | ||
1862 | punpcklbw m3, m4, m5 ;[1 - 8 ; 1 - 8 ][5 6 13 14] | |
1863 | punpckhbw m4, m5 ;[9 - 16; 9 - 16][5 6 13 14] | |
1864 | ||
1865 | punpcklbw m5, m6, m7 ;[1 - 8 ; 1 - 8 ][7 8 15 16] | |
1866 | punpckhbw m6, m7 ;[9 - 16; 9 - 16][7 8 15 16] | |
1867 | ||
1868 | punpcklwd m7, m8, m1 ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12] | |
1869 | punpckhwd m8, m1 ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12] | |
1870 | ||
1871 | punpcklwd m1, m3, m5 ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16] | |
1872 | punpckhwd m3, m5 ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16] | |
1873 | ||
1874 | punpcklwd m5, m0, m2 ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12] | |
1875 | punpckhwd m0, m2 ;[13- 16; 13 - 16][1 2 3 4 9 10 11 12] | |
1876 | ||
1877 | punpcklwd m2, m4, m6 ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16] | |
1878 | punpckhwd m4, m6 ;[13- 16; 13 - 16][5 6 7 8 13 14 15 16] | |
1879 | ||
1880 | punpckldq m6, m7, m1 ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
1881 | punpckhdq m7, m1 ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
1882 | ||
1883 | punpckldq m1, m8, m3 ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
1884 | punpckhdq m8, m3 ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
1885 | ||
1886 | punpckldq m3, m5, m2 ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
1887 | punpckhdq m5, m2 ;[11- 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
1888 | ||
1889 | punpckldq m2, m0, m4 ;[13- 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
1890 | punpckhdq m0, m4 ;[15- 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
1891 | ||
1892 | vpermq m6, m6, 0xD8 | |
1893 | vpermq m7, m7, 0xD8 | |
1894 | vpermq m1, m1, 0xD8 | |
1895 | vpermq m8, m8, 0xD8 | |
1896 | vpermq m3, m3, 0xD8 | |
1897 | vpermq m5, m5, 0xD8 | |
1898 | vpermq m2, m2, 0xD8 | |
1899 | vpermq m0, m0, 0xD8 | |
1900 | ||
1901 | movu [r0 + 0 * 16], m6 | |
1902 | movu [r0 + 2 * 16], m7 | |
1903 | movu [r0 + 4 * 16], m1 | |
1904 | movu [r0 + 6 * 16], m8 | |
1905 | movu [r0 + 8 * 16], m3 | |
1906 | movu [r0 + 10 * 16], m5 | |
1907 | movu [r0 + 12 * 16], m2 | |
1908 | movu [r0 + 14 * 16], m0 | |
1909 | RET | |
1910 | %endif | |
1911 | INIT_XMM sse2 | |
1912 | cglobal transpose16, 3, 5, 8, dest, src, stride | |
1913 | mov r3, r0 | |
1914 | mov r4, r1 | |
1915 | TRANSPOSE_8x8 16 | |
1916 | lea r1, [r1 + 2 * r2] | |
1917 | lea r0, [r3 + 8] | |
1918 | TRANSPOSE_8x8 16 | |
1919 | lea r1, [r4 + 8] | |
1920 | lea r0, [r3 + 8 * 16] | |
1921 | TRANSPOSE_8x8 16 | |
1922 | lea r1, [r1 + 2 * r2] | |
1923 | lea r0, [r3 + 8 * 16 + 8] | |
1924 | TRANSPOSE_8x8 16 | |
1925 | RET | |
1926 | %endif | |
1927 | ||
1928 | cglobal transpose16_internal | |
1929 | TRANSPOSE_8x8 r6 | |
1930 | lea r1, [r1 + 2 * r2] | |
1931 | lea r0, [r5 + 8] | |
1932 | TRANSPOSE_8x8 r6 | |
1933 | lea r1, [r1 + 2 * r2] | |
1934 | neg r2 | |
1935 | lea r1, [r1 + r2 * 8] | |
1936 | lea r1, [r1 + r2 * 8 + 8] | |
1937 | neg r2 | |
1938 | lea r0, [r5 + 8 * r6] | |
1939 | TRANSPOSE_8x8 r6 | |
1940 | lea r1, [r1 + 2 * r2] | |
1941 | lea r0, [r5 + 8 * r6 + 8] | |
1942 | TRANSPOSE_8x8 r6 | |
1943 | ret | |
1944 | ||
1945 | ;----------------------------------------------------------------- | |
1946 | ; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride) | |
1947 | ;----------------------------------------------------------------- | |
1948 | %if HIGH_BIT_DEPTH == 1 | |
1949 | %if ARCH_X86_64 == 1 | |
1950 | INIT_YMM avx2 | |
1951 | cglobal transpose8x32_internal | |
1952 | movu m0, [r1] | |
1953 | movu m1, [r1 + 32] | |
1954 | movu m2, [r1 + r2] | |
1955 | movu m3, [r1 + r2 + 32] | |
1956 | movu m4, [r1 + 2 * r2] | |
1957 | movu m5, [r1 + 2 * r2 + 32] | |
1958 | movu m6, [r1 + r3] | |
1959 | movu m7, [r1 + r3 + 32] | |
1960 | lea r1, [r1 + 4 * r2] | |
1961 | ||
1962 | punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2] | |
1963 | punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2] | |
1964 | ||
1965 | punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4] | |
1966 | punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4] | |
1967 | ||
1968 | punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2] | |
1969 | punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2] | |
1970 | ||
1971 | punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4] | |
1972 | punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4] | |
1973 | ||
1974 | punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4] | |
1975 | punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4] | |
1976 | ||
1977 | punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4] | |
1978 | punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4] | |
1979 | ||
1980 | punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4] | |
1981 | punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4] | |
1982 | ||
1983 | punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4] | |
1984 | punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4] | |
1985 | ||
1986 | movq [r0 + 0 * 64], xm7 | |
1987 | movhps [r0 + 1 * 64], xm7 | |
1988 | vextracti128 xm5, m7, 1 | |
1989 | movq [r0 + 8 * 64], xm5 | |
1990 | movhps [r0 + 9 * 64], xm5 | |
1991 | ||
1992 | movu m7, [r1] | |
1993 | movu m9, [r1 + 32] | |
1994 | movu m10, [r1 + r2] | |
1995 | movu m11, [r1 + r2 + 32] | |
1996 | movu m12, [r1 + 2 * r2] | |
1997 | movu m13, [r1 + 2 * r2 + 32] | |
1998 | movu m14, [r1 + r3] | |
1999 | movu m15, [r1 + r3 + 32] | |
2000 | ||
2001 | punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6] | |
2002 | punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6] | |
2003 | ||
2004 | punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8] | |
2005 | punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8] | |
2006 | ||
2007 | punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6] | |
2008 | punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6] | |
2009 | ||
2010 | punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8] | |
2011 | punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8] | |
2012 | ||
2013 | punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8] | |
2014 | punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8] | |
2015 | ||
2016 | punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8] | |
2017 | punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8] | |
2018 | ||
2019 | punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8] | |
2020 | punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8] | |
2021 | ||
2022 | punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8] | |
2023 | punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8] | |
2024 | ||
2025 | movq [r0 + 0 * 64 + 8], xm15 | |
2026 | movhps [r0 + 1 * 64 + 8], xm15 | |
2027 | vextracti128 xm13, m15, 1 | |
2028 | movq [r0 + 8 * 64 + 8], xm13 | |
2029 | movhps [r0 + 9 * 64 + 8], xm13 | |
2030 | ||
2031 | punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8] | |
2032 | punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8] | |
2033 | ||
2034 | punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8] | |
2035 | punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8] | |
2036 | ||
2037 | punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8] | |
2038 | punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8] | |
2039 | ||
2040 | punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8] | |
2041 | punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8] | |
2042 | ||
2043 | punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8] | |
2044 | punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8] | |
2045 | ||
2046 | punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8] | |
2047 | punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8] | |
2048 | ||
2049 | punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8] | |
2050 | punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8] | |
2051 | ||
2052 | movu [r0 + 2 * 64], xm13 | |
2053 | vextracti128 [r0 + 10 * 64], m13, 1 | |
2054 | ||
2055 | movu [r0 + 3 * 64], xm8 | |
2056 | vextracti128 [r0 + 11 * 64], m8, 1 | |
2057 | ||
2058 | movu [r0 + 4 * 64], xm5 | |
2059 | vextracti128 [r0 + 12 * 64], m5, 1 | |
2060 | ||
2061 | movu [r0 + 5 * 64], xm2 | |
2062 | vextracti128 [r0 + 13 * 64], m2, 1 | |
2063 | ||
2064 | movu [r0 + 6 * 64], xm10 | |
2065 | vextracti128 [r0 + 14 * 64], m10, 1 | |
2066 | ||
2067 | movu [r0 + 7 * 64], xm0 | |
2068 | vextracti128 [r0 + 15 * 64], m0, 1 | |
2069 | ||
2070 | movu [r0 + 16 * 64], xm7 | |
2071 | vextracti128 [r0 + 24 * 64], m7, 1 | |
2072 | ||
2073 | movu [r0 + 17 * 64], xm4 | |
2074 | vextracti128 [r0 + 25 * 64], m4, 1 | |
2075 | ||
2076 | movu [r0 + 18 * 64], xm12 | |
2077 | vextracti128 [r0 + 26 * 64], m12, 1 | |
2078 | ||
2079 | movu [r0 + 19 * 64], xm6 | |
2080 | vextracti128 [r0 + 27 * 64], m6, 1 | |
2081 | ||
2082 | movu [r0 + 20 * 64], xm14 | |
2083 | vextracti128 [r0 + 28 * 64], m14, 1 | |
2084 | ||
2085 | movu [r0 + 21 * 64], xm3 | |
2086 | vextracti128 [r0 + 29 * 64], m3, 1 | |
2087 | ||
2088 | movu [r0 + 22 * 64], xm11 | |
2089 | vextracti128 [r0 + 30 * 64], m11, 1 | |
2090 | ||
2091 | movu [r0 + 23 * 64], xm1 | |
2092 | vextracti128 [r0 + 31 * 64], m1, 1 | |
2093 | ret | |
2094 | ||
2095 | cglobal transpose32, 3, 4, 16 | |
2096 | add r2, r2 | |
2097 | lea r3, [r2 * 3] | |
2098 | call transpose8x32_internal | |
2099 | add r0, 16 | |
2100 | lea r1, [r1 + 4 * r2] | |
2101 | call transpose8x32_internal | |
2102 | add r0, 16 | |
2103 | lea r1, [r1 + 4 * r2] | |
2104 | call transpose8x32_internal | |
2105 | add r0, 16 | |
2106 | lea r1, [r1 + 4 * r2] | |
2107 | call transpose8x32_internal | |
2108 | RET | |
2109 | %endif | |
2110 | INIT_XMM sse2 | |
2111 | cglobal transpose32, 3, 7, 4, dest, src, stride | |
2112 | add r2, r2 | |
2113 | mov r3, r0 | |
2114 | mov r4, r1 | |
2115 | mov r5, 64 | |
2116 | mov r6, r0 | |
2117 | call transpose8_internal | |
2118 | lea r1, [r1 - 8 + 2 * r2] | |
2119 | lea r0, [r6 + 16] | |
2120 | mov r3, r0 | |
2121 | call transpose8_internal | |
2122 | lea r1, [r1 - 8 + 2 * r2] | |
2123 | lea r0, [r6 + 32] | |
2124 | mov r3, r0 | |
2125 | call transpose8_internal | |
2126 | lea r1, [r1 - 8 + 2 * r2] | |
2127 | lea r0, [r6 + 48] | |
2128 | mov r3, r0 | |
2129 | call transpose8_internal | |
2130 | lea r1, [r4 + 16] | |
2131 | lea r0, [r6 + 8 * 64] | |
2132 | mov r3, r0 | |
2133 | call transpose8_internal | |
2134 | lea r1, [r1 - 8 + 2 * r2] | |
2135 | lea r0, [r6 + 8 * 64 + 16] | |
2136 | mov r3, r0 | |
2137 | call transpose8_internal | |
2138 | lea r1, [r1 - 8 + 2 * r2] | |
2139 | lea r0, [r6 + 8 * 64 + 32] | |
2140 | mov r3, r0 | |
2141 | call transpose8_internal | |
2142 | lea r1, [r1 - 8 + 2 * r2] | |
2143 | lea r0, [r6 + 8 * 64 + 48] | |
2144 | mov r3, r0 | |
2145 | call transpose8_internal | |
2146 | lea r1, [r4 + 32] | |
2147 | lea r0, [r6 + 16 * 64] | |
2148 | mov r3, r0 | |
2149 | call transpose8_internal | |
2150 | lea r1, [r1 - 8 + 2 * r2] | |
2151 | lea r0, [r6 + 16 * 64 + 16] | |
2152 | mov r3, r0 | |
2153 | call transpose8_internal | |
2154 | lea r1, [r1 - 8 + 2 * r2] | |
2155 | lea r0, [r6 + 16 * 64 + 32] | |
2156 | mov r3, r0 | |
2157 | call transpose8_internal | |
2158 | lea r1, [r1 - 8 + 2 * r2] | |
2159 | lea r0, [r6 + 16 * 64 + 48] | |
2160 | mov r3, r0 | |
2161 | call transpose8_internal | |
2162 | lea r1, [r4 + 48] | |
2163 | lea r0, [r6 + 24 * 64] | |
2164 | mov r3, r0 | |
2165 | call transpose8_internal | |
2166 | lea r1, [r1 - 8 + 2 * r2] | |
2167 | lea r0, [r6 + 24 * 64 + 16] | |
2168 | mov r3, r0 | |
2169 | call transpose8_internal | |
2170 | lea r1, [r1 - 8 + 2 * r2] | |
2171 | lea r0, [r6 + 24 * 64 + 32] | |
2172 | mov r3, r0 | |
2173 | call transpose8_internal | |
2174 | lea r1, [r1 - 8 + 2 * r2] | |
2175 | lea r0, [r6 + 24 * 64 + 48] | |
2176 | mov r3, r0 | |
2177 | call transpose8_internal | |
2178 | RET | |
2179 | %else ;HIGH_BIT_DEPTH == 0 | |
2180 | INIT_XMM sse2 | |
2181 | cglobal transpose32, 3, 7, 8, dest, src, stride | |
2182 | mov r3, r0 | |
2183 | mov r4, r1 | |
2184 | mov r5, r0 | |
2185 | mov r6, 32 | |
2186 | call transpose16_internal | |
2187 | lea r1, [r1 - 8 + 2 * r2] | |
2188 | lea r0, [r3 + 16] | |
2189 | mov r5, r0 | |
2190 | call transpose16_internal | |
2191 | lea r1, [r4 + 16] | |
2192 | lea r0, [r3 + 16 * 32] | |
2193 | mov r5, r0 | |
2194 | call transpose16_internal | |
2195 | lea r1, [r1 - 8 + 2 * r2] | |
2196 | lea r0, [r3 + 16 * 32 + 16] | |
2197 | mov r5, r0 | |
2198 | call transpose16_internal | |
2199 | RET | |
2200 | ||
2201 | %if ARCH_X86_64 == 1 | |
2202 | INIT_YMM avx2 | |
2203 | cglobal transpose32, 3, 5, 16 | |
2204 | lea r3, [r2 * 3] | |
2205 | mov r4d, 2 | |
2206 | ||
2207 | .loop: | |
2208 | movu m0, [r1] | |
2209 | movu m1, [r1 + r2] | |
2210 | movu m2, [r1 + 2 * r2] | |
2211 | movu m3, [r1 + r3] | |
2212 | lea r1, [r1 + 4 * r2] | |
2213 | ||
2214 | movu m4, [r1] | |
2215 | movu m5, [r1 + r2] | |
2216 | movu m6, [r1 + 2 * r2] | |
2217 | movu m7, [r1 + r3] | |
2218 | ||
2219 | punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2] | |
2220 | punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2] | |
2221 | ||
2222 | punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4] | |
2223 | punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4] | |
2224 | ||
2225 | punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6] | |
2226 | punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6] | |
2227 | ||
2228 | punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8] | |
2229 | punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8] | |
2230 | ||
2231 | punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4] | |
2232 | punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4] | |
2233 | ||
2234 | punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8] | |
2235 | punpckhwd m3, m5 ;[5 - 8 ; 21 - 24][5 6 7 8] | |
2236 | ||
2237 | punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4] | |
2238 | punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4] | |
2239 | ||
2240 | punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8] | |
2241 | punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8] | |
2242 | ||
2243 | punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8] | |
2244 | punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8] | |
2245 | ||
2246 | punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8] | |
2247 | punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8] | |
2248 | ||
2249 | punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8] | |
2250 | punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8] | |
2251 | ||
2252 | punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8] | |
2253 | punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8] | |
2254 | ||
2255 | movq [r0 + 0 * 32], xm6 | |
2256 | movhps [r0 + 1 * 32], xm6 | |
2257 | vextracti128 xm4, m6, 1 | |
2258 | movq [r0 + 16 * 32], xm4 | |
2259 | movhps [r0 + 17 * 32], xm4 | |
2260 | ||
2261 | lea r1, [r1 + 4 * r2] | |
2262 | movu m9, [r1] | |
2263 | movu m10, [r1 + r2] | |
2264 | movu m11, [r1 + 2 * r2] | |
2265 | movu m12, [r1 + r3] | |
2266 | lea r1, [r1 + 4 * r2] | |
2267 | ||
2268 | movu m13, [r1] | |
2269 | movu m14, [r1 + r2] | |
2270 | movu m15, [r1 + 2 * r2] | |
2271 | movu m6, [r1 + r3] | |
2272 | ||
2273 | punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10] | |
2274 | punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10] | |
2275 | ||
2276 | punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12] | |
2277 | punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12] | |
2278 | ||
2279 | punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14] | |
2280 | punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14] | |
2281 | ||
2282 | punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16] | |
2283 | punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16] | |
2284 | ||
2285 | punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12] | |
2286 | punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12] | |
2287 | ||
2288 | punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16] | |
2289 | punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16] | |
2290 | ||
2291 | punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12] | |
2292 | punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12] | |
2293 | ||
2294 | punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16] | |
2295 | punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16] | |
2296 | ||
2297 | punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16] | |
2298 | punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16] | |
2299 | ||
2300 | punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16] | |
2301 | punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16] | |
2302 | ||
2303 | punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16] | |
2304 | punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16] | |
2305 | ||
2306 | punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16] | |
2307 | punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16] | |
2308 | ||
2309 | ||
2310 | punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2311 | punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2312 | ||
2313 | punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2314 | punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2315 | ||
2316 | punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2317 | punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2318 | ||
2319 | punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2320 | punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2321 | ||
2322 | punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2323 | punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2324 | ||
2325 | punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2326 | punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2327 | ||
2328 | punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2329 | punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2330 | ||
2331 | movq [r0 + 0 * 32 + 8], xm15 | |
2332 | movhps [r0 + 1 * 32 + 8], xm15 | |
2333 | vextracti128 xm9, m15, 1 | |
2334 | movq [r0 + 16 * 32 + 8], xm9 | |
2335 | movhps [r0 + 17 * 32 + 8], xm9 | |
2336 | ||
2337 | movu [r0 + 2 * 32], xm13 | |
2338 | vextracti128 [r0 + 18 * 32], m13, 1 | |
2339 | ||
2340 | movu [r0 + 3 * 32], xm7 | |
2341 | vextracti128 [r0 + 19 * 32], m7, 1 | |
2342 | ||
2343 | movu [r0 + 4 * 32], xm6 | |
2344 | vextracti128 [r0 + 20 * 32], m6, 1 | |
2345 | ||
2346 | movu [r0 + 5 * 32], xm1 | |
2347 | vextracti128 [r0 + 21 * 32], m1, 1 | |
2348 | ||
2349 | movu [r0 + 6 * 32], xm10 | |
2350 | vextracti128 [r0 + 22 * 32], m10, 1 | |
2351 | ||
2352 | movu [r0 + 7 * 32], xm8 | |
2353 | vextracti128 [r0 + 23 * 32], m8, 1 | |
2354 | ||
2355 | movu [r0 + 8 * 32], xm4 | |
2356 | vextracti128 [r0 + 24 * 32], m4, 1 | |
2357 | ||
2358 | movu [r0 + 9 * 32], xm3 | |
2359 | vextracti128 [r0 + 25 * 32], m3, 1 | |
2360 | ||
2361 | movu [r0 + 10 * 32], xm12 | |
2362 | vextracti128 [r0 + 26 * 32], m12, 1 | |
2363 | ||
2364 | movu [r0 + 11 * 32], xm5 | |
2365 | vextracti128 [r0 + 27 * 32], m5, 1 | |
2366 | ||
2367 | movu [r0 + 12 * 32], xm14 | |
2368 | vextracti128 [r0 + 28 * 32], m14, 1 | |
2369 | ||
2370 | movu [r0 + 13 * 32], xm2 | |
2371 | vextracti128 [r0 + 29 * 32], m2, 1 | |
2372 | ||
2373 | movu [r0 + 14 * 32], xm11 | |
2374 | vextracti128 [r0 + 30 * 32], m11, 1 | |
2375 | ||
2376 | movu [r0 + 15 * 32], xm0 | |
2377 | vextracti128 [r0 + 31 * 32], m0, 1 | |
2378 | ||
2379 | add r0, 16 | |
2380 | lea r1, [r1 + 4 * r2] | |
2381 | dec r4d | |
2382 | jnz .loop | |
2383 | RET | |
2384 | %endif | |
2385 | %endif | |
2386 | ||
2387 | ;----------------------------------------------------------------- | |
2388 | ; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride) | |
2389 | ;----------------------------------------------------------------- | |
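; Same idea at 64x64: the callers below stitch the result together from 8x32
; (high bit depth) or 16x32 (8-bit) column tiles. A scalar sketch, with dst
; packed at a 64-pixel row stride:
;
;    for (int i = 0; i < 64; i++)
;        for (int j = 0; j < 64; j++)
;            dst[j * 64 + i] = src[i * stride + j];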
2390 | %if HIGH_BIT_DEPTH == 1 | |
2391 | %if ARCH_X86_64 == 1 | |
2392 | INIT_YMM avx2 | |
2393 | cglobal transpose8x32_64_internal | |
2394 | movu m0, [r1] | |
2395 | movu m1, [r1 + 32] | |
2396 | movu m2, [r1 + r2] | |
2397 | movu m3, [r1 + r2 + 32] | |
2398 | movu m4, [r1 + 2 * r2] | |
2399 | movu m5, [r1 + 2 * r2 + 32] | |
2400 | movu m6, [r1 + r3] | |
2401 | movu m7, [r1 + r3 + 32] | |
2402 | lea r1, [r1 + 4 * r2] | |
2403 | ||
2404 | punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2] | |
2405 | punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2] | |
2406 | ||
2407 | punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4] | |
2408 | punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4] | |
2409 | ||
2410 | punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2] | |
2411 | punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2] | |
2412 | ||
2413 | punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4] | |
2414 | punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4] | |
2415 | ||
2416 | punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4] | |
2417 | punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4] | |
2418 | ||
2419 | punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4] | |
2420 | punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4] | |
2421 | ||
2422 | punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4] | |
2423 | punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4] | |
2424 | ||
2425 | punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4] | |
2426 | punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4] | |
2427 | ||
2428 | movq [r0 + 0 * 128], xm7 | |
2429 | movhps [r0 + 1 * 128], xm7 | |
2430 | vextracti128 xm5, m7, 1 | |
2431 | movq [r0 + 8 * 128], xm5 | |
2432 | movhps [r0 + 9 * 128], xm5 | |
2433 | ||
2434 | movu m7, [r1] | |
2435 | movu m9, [r1 + 32] | |
2436 | movu m10, [r1 + r2] | |
2437 | movu m11, [r1 + r2 + 32] | |
2438 | movu m12, [r1 + 2 * r2] | |
2439 | movu m13, [r1 + 2 * r2 + 32] | |
2440 | movu m14, [r1 + r3] | |
2441 | movu m15, [r1 + r3 + 32] | |
2442 | ||
2443 | punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6] | |
2444 | punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6] | |
2445 | ||
2446 | punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8] | |
2447 | punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8] | |
2448 | ||
2449 | punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6] | |
2450 | punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6] | |
2451 | ||
2452 | punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8] | |
2453 | punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8] | |
2454 | ||
2455 | punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8] | |
2456 | punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8] | |
2457 | ||
2458 | punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8] | |
2459 | punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8] | |
2460 | ||
2461 | punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8] | |
2462 | punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8] | |
2463 | ||
2464 | punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8] | |
2465 | punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8] | |
2466 | ||
2467 | movq [r0 + 0 * 128 + 8], xm15 | |
2468 | movhps [r0 + 1 * 128 + 8], xm15 | |
2469 | vextracti128 xm13, m15, 1 | |
2470 | movq [r0 + 8 * 128 + 8], xm13 | |
2471 | movhps [r0 + 9 * 128 + 8], xm13 | |
2472 | ||
2473 | punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8] | |
2474 | punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8] | |
2475 | ||
2476 | punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8] | |
2477 | punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8] | |
2478 | ||
2479 | punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8] | |
2480 | punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8] | |
2481 | ||
2482 | punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8] | |
2483 | punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8] | |
2484 | ||
2485 | punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8] | |
2486 | punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8] | |
2487 | ||
2488 | punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8] | |
2489 | punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8] | |
2490 | ||
2491 | punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8] | |
2492 | punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8] | |
2493 | ||
2494 | movu [r0 + 2 * 128], xm13 | |
2495 | vextracti128 [r0 + 10 * 128], m13, 1 | |
2496 | ||
2497 | movu [r0 + 3 * 128], xm8 | |
2498 | vextracti128 [r0 + 11 * 128], m8, 1 | |
2499 | ||
2500 | movu [r0 + 4 * 128], xm5 | |
2501 | vextracti128 [r0 + 12 * 128], m5, 1 | |
2502 | ||
2503 | movu [r0 + 5 * 128], xm2 | |
2504 | vextracti128 [r0 + 13 * 128], m2, 1 | |
2505 | ||
2506 | movu [r0 + 6 * 128], xm10 | |
2507 | vextracti128 [r0 + 14 * 128], m10, 1 | |
2508 | ||
2509 | movu [r0 + 7 * 128], xm0 | |
2510 | vextracti128 [r0 + 15 * 128], m0, 1 | |
2511 | ||
2512 | movu [r0 + 16 * 128], xm7 | |
2513 | vextracti128 [r0 + 24 * 128], m7, 1 | |
2514 | ||
2515 | movu [r0 + 17 * 128], xm4 | |
2516 | vextracti128 [r0 + 25 * 128], m4, 1 | |
2517 | ||
2518 | movu [r0 + 18 * 128], xm12 | |
2519 | vextracti128 [r0 + 26 * 128], m12, 1 | |
2520 | ||
2521 | movu [r0 + 19 * 128], xm6 | |
2522 | vextracti128 [r0 + 27 * 128], m6, 1 | |
2523 | ||
2524 | movu [r0 + 20 * 128], xm14 | |
2525 | vextracti128 [r0 + 28 * 128], m14, 1 | |
2526 | ||
2527 | movu [r0 + 21 * 128], xm3 | |
2528 | vextracti128 [r0 + 29 * 128], m3, 1 | |
2529 | ||
2530 | movu [r0 + 22 * 128], xm11 | |
2531 | vextracti128 [r0 + 30 * 128], m11, 1 | |
2532 | ||
2533 | movu [r0 + 23 * 128], xm1 | |
2534 | vextracti128 [r0 + 31 * 128], m1, 1 | |
2535 | ret | |
2536 | ||
2537 | cglobal transpose64, 3, 6, 16 | |
2538 | add r2, r2 | |
2539 | lea r3, [3 * r2] | |
2540 | lea r4, [r1 + 64] | |
2541 | lea r5, [r0 + 16] | |
2542 | ||
2543 | call transpose8x32_64_internal | |
2544 | mov r1, r4 | |
2545 | lea r0, [r0 + 32 * 128] | |
2546 | call transpose8x32_64_internal | |
2547 | mov r0, r5 | |
2548 | lea r5, [r0 + 16] | |
2549 | lea r4, [r1 + 4 * r2] | |
2550 | lea r1, [r4 - 64] | |
2551 | call transpose8x32_64_internal | |
2552 | mov r1, r4 | |
2553 | lea r0, [r0 + 32 * 128] | |
2554 | call transpose8x32_64_internal | |
2555 | mov r0, r5 | |
2556 | lea r5, [r0 + 16] | |
2557 | lea r4, [r1 + 4 * r2] | |
2558 | lea r1, [r4 - 64] | |
2559 | call transpose8x32_64_internal | |
2560 | mov r1, r4 | |
2561 | lea r0, [r0 + 32 * 128] | |
2562 | call transpose8x32_64_internal | |
2563 | mov r0, r5 | |
2564 | lea r5, [r0 + 16] | |
2565 | lea r4, [r1 + 4 * r2] | |
2566 | lea r1, [r4 - 64] | |
2567 | call transpose8x32_64_internal | |
2568 | mov r1, r4 | |
2569 | lea r0, [r0 + 32 * 128] | |
2570 | call transpose8x32_64_internal | |
2571 | mov r0, r5 | |
2572 | lea r5, [r0 + 16] | |
2573 | lea r4, [r1 + 4 * r2] | |
2574 | lea r1, [r4 - 64] | |
2575 | call transpose8x32_64_internal | |
2576 | mov r1, r4 | |
2577 | lea r0, [r0 + 32 * 128] | |
2578 | call transpose8x32_64_internal | |
2579 | mov r0, r5 | |
2580 | lea r5, [r0 + 16] | |
2581 | lea r4, [r1 + 4 * r2] | |
2582 | lea r1, [r4 - 64] | |
2583 | call transpose8x32_64_internal | |
2584 | mov r1, r4 | |
2585 | lea r0, [r0 + 32 * 128] | |
2586 | call transpose8x32_64_internal | |
2587 | mov r0, r5 | |
2588 | lea r5, [r0 + 16] | |
2589 | lea r4, [r1 + 4 * r2] | |
2590 | lea r1, [r4 - 64] | |
2591 | call transpose8x32_64_internal | |
2592 | mov r1, r4 | |
2593 | lea r0, [r0 + 32 * 128] | |
2594 | call transpose8x32_64_internal | |
2595 | mov r0, r5 | |
2596 | lea r4, [r1 + 4 * r2] | |
2597 | lea r1, [r4 - 64] | |
2598 | call transpose8x32_64_internal | |
2599 | mov r1, r4 | |
2600 | lea r0, [r0 + 32 * 128] | |
2601 | call transpose8x32_64_internal | |
2602 | RET | |
2603 | %endif | |
2604 | INIT_XMM sse2 | |
2605 | cglobal transpose64, 3, 7, 4, dest, src, stride | |
2606 | add r2, r2 | |
2607 | mov r3, r0 | |
2608 | mov r4, r1 | |
2609 | mov r5, 128 | |
2610 | mov r6, r0 | |
2611 | call transpose8_internal | |
2612 | lea r1, [r1 - 8 + 2 * r2] | |
2613 | lea r0, [r6 + 16] | |
2614 | mov r3, r0 | |
2615 | call transpose8_internal | |
2616 | lea r1, [r1 - 8 + 2 * r2] | |
2617 | lea r0, [r6 + 32] | |
2618 | mov r3, r0 | |
2619 | call transpose8_internal | |
2620 | lea r1, [r1 - 8 + 2 * r2] | |
2621 | lea r0, [r6 + 48] | |
2622 | mov r3, r0 | |
2623 | call transpose8_internal | |
2624 | lea r1, [r1 - 8 + 2 * r2] | |
2625 | lea r0, [r6 + 64] | |
2626 | mov r3, r0 | |
2627 | call transpose8_internal | |
2628 | lea r1, [r1 - 8 + 2 * r2] | |
2629 | lea r0, [r6 + 80] | |
2630 | mov r3, r0 | |
2631 | call transpose8_internal | |
2632 | lea r1, [r1 - 8 + 2 * r2] | |
2633 | lea r0, [r6 + 96] | |
2634 | mov r3, r0 | |
2635 | call transpose8_internal | |
2636 | lea r1, [r1 - 8 + 2 * r2] | |
2637 | lea r0, [r6 + 112] | |
2638 | mov r3, r0 | |
2639 | call transpose8_internal | |
2640 | ||
2641 | lea r1, [r4 + 16] | |
2642 | lea r0, [r6 + 8 * 128] | |
2643 | mov r3, r0 | |
2644 | call transpose8_internal | |
2645 | lea r1, [r1 - 8 + 2 * r2] | |
2646 | lea r0, [r6 + 8 * 128 + 16] | |
2647 | mov r3, r0 | |
2648 | call transpose8_internal | |
2649 | lea r1, [r1 - 8 + 2 * r2] | |
2650 | lea r0, [r6 + 8 * 128 + 32] | |
2651 | mov r3, r0 | |
2652 | call transpose8_internal | |
2653 | lea r1, [r1 - 8 + 2 * r2] | |
2654 | lea r0, [r6 + 8 * 128 + 48] | |
2655 | mov r3, r0 | |
2656 | call transpose8_internal | |
2657 | lea r1, [r1 - 8 + 2 * r2] | |
2658 | lea r0, [r6 + 8 * 128 + 64] | |
2659 | mov r3, r0 | |
2660 | call transpose8_internal | |
2661 | lea r1, [r1 - 8 + 2 * r2] | |
2662 | lea r0, [r6 + 8 * 128 + 80] | |
2663 | mov r3, r0 | |
2664 | call transpose8_internal | |
2665 | lea r1, [r1 - 8 + 2 * r2] | |
2666 | lea r0, [r6 + 8 * 128 + 96] | |
2667 | mov r3, r0 | |
2668 | call transpose8_internal | |
2669 | lea r1, [r1 - 8 + 2 * r2] | |
2670 | lea r0, [r6 + 8 * 128 + 112] | |
2671 | mov r3, r0 | |
2672 | call transpose8_internal | |
2673 | ||
2674 | lea r1, [r4 + 32] | |
2675 | lea r0, [r6 + 16 * 128] | |
2676 | mov r3, r0 | |
2677 | call transpose8_internal | |
2678 | lea r1, [r1 - 8 + 2 * r2] | |
2679 | lea r0, [r6 + 16 * 128 + 16] | |
2680 | mov r3, r0 | |
2681 | call transpose8_internal | |
2682 | lea r1, [r1 - 8 + 2 * r2] | |
2683 | lea r0, [r6 + 16 * 128 + 32] | |
2684 | mov r3, r0 | |
2685 | call transpose8_internal | |
2686 | lea r1, [r1 - 8 + 2 * r2] | |
2687 | lea r0, [r6 + 16 * 128 + 48] | |
2688 | mov r3, r0 | |
2689 | call transpose8_internal | |
2690 | lea r1, [r1 - 8 + 2 * r2] | |
2691 | lea r0, [r6 + 16 * 128 + 64] | |
2692 | mov r3, r0 | |
2693 | call transpose8_internal | |
2694 | lea r1, [r1 - 8 + 2 * r2] | |
2695 | lea r0, [r6 + 16 * 128 + 80] | |
2696 | mov r3, r0 | |
2697 | call transpose8_internal | |
2698 | lea r1, [r1 - 8 + 2 * r2] | |
2699 | lea r0, [r6 + 16 * 128 + 96] | |
2700 | mov r3, r0 | |
2701 | call transpose8_internal | |
2702 | lea r1, [r1 - 8 + 2 * r2] | |
2703 | lea r0, [r6 + 16 * 128 + 112] | |
2704 | mov r3, r0 | |
2705 | call transpose8_internal | |
2706 | ||
2707 | lea r1, [r4 + 48] | |
2708 | lea r0, [r6 + 24 * 128] | |
2709 | mov r3, r0 | |
2710 | call transpose8_internal | |
2711 | lea r1, [r1 - 8 + 2 * r2] | |
2712 | lea r0, [r6 + 24 * 128 + 16] | |
2713 | mov r3, r0 | |
2714 | call transpose8_internal | |
2715 | lea r1, [r1 - 8 + 2 * r2] | |
2716 | lea r0, [r6 + 24 * 128 + 32] | |
2717 | mov r3, r0 | |
2718 | call transpose8_internal | |
2719 | lea r1, [r1 - 8 + 2 * r2] | |
2720 | lea r0, [r6 + 24 * 128 + 48] | |
2721 | mov r3, r0 | |
2722 | call transpose8_internal | |
2723 | lea r1, [r1 - 8 + 2 * r2] | |
2724 | lea r0, [r6 + 24 * 128 + 64] | |
2725 | mov r3, r0 | |
2726 | call transpose8_internal | |
2727 | lea r1, [r1 - 8 + 2 * r2] | |
2728 | lea r0, [r6 + 24 * 128 + 80] | |
2729 | mov r3, r0 | |
2730 | call transpose8_internal | |
2731 | lea r1, [r1 - 8 + 2 * r2] | |
2732 | lea r0, [r6 + 24 * 128 + 96] | |
2733 | mov r3, r0 | |
2734 | call transpose8_internal | |
2735 | lea r1, [r1 - 8 + 2 * r2] | |
2736 | lea r0, [r6 + 24 * 128 + 112] | |
2737 | mov r3, r0 | |
2738 | call transpose8_internal | |
2739 | ||
2740 | lea r1, [r4 + 64] | |
2741 | lea r0, [r6 + 32 * 128] | |
2742 | mov r3, r0 | |
2743 | call transpose8_internal | |
2744 | lea r1, [r1 - 8 + 2 * r2] | |
2745 | lea r0, [r6 + 32 * 128 + 16] | |
2746 | mov r3, r0 | |
2747 | call transpose8_internal | |
2748 | lea r1, [r1 - 8 + 2 * r2] | |
2749 | lea r0, [r6 + 32 * 128 + 32] | |
2750 | mov r3, r0 | |
2751 | call transpose8_internal | |
2752 | lea r1, [r1 - 8 + 2 * r2] | |
2753 | lea r0, [r6 + 32 * 128 + 48] | |
2754 | mov r3, r0 | |
2755 | call transpose8_internal | |
2756 | lea r1, [r1 - 8 + 2 * r2] | |
2757 | lea r0, [r6 + 32 * 128 + 64] | |
2758 | mov r3, r0 | |
2759 | call transpose8_internal | |
2760 | lea r1, [r1 - 8 + 2 * r2] | |
2761 | lea r0, [r6 + 32 * 128 + 80] | |
2762 | mov r3, r0 | |
2763 | call transpose8_internal | |
2764 | lea r1, [r1 - 8 + 2 * r2] | |
2765 | lea r0, [r6 + 32 * 128 + 96] | |
2766 | mov r3, r0 | |
2767 | call transpose8_internal | |
2768 | lea r1, [r1 - 8 + 2 * r2] | |
2769 | lea r0, [r6 + 32 * 128 + 112] | |
2770 | mov r3, r0 | |
2771 | call transpose8_internal | |
2772 | ||
2773 | lea r1, [r4 + 80] | |
2774 | lea r0, [r6 + 40 * 128] | |
2775 | mov r3, r0 | |
2776 | call transpose8_internal | |
2777 | lea r1, [r1 - 8 + 2 * r2] | |
2778 | lea r0, [r6 + 40 * 128 + 16] | |
2779 | mov r3, r0 | |
2780 | call transpose8_internal | |
2781 | lea r1, [r1 - 8 + 2 * r2] | |
2782 | lea r0, [r6 + 40 * 128 + 32] | |
2783 | mov r3, r0 | |
2784 | call transpose8_internal | |
2785 | lea r1, [r1 - 8 + 2 * r2] | |
2786 | lea r0, [r6 + 40 * 128 + 48] | |
2787 | mov r3, r0 | |
2788 | call transpose8_internal | |
2789 | lea r1, [r1 - 8 + 2 * r2] | |
2790 | lea r0, [r6 + 40 * 128 + 64] | |
2791 | mov r3, r0 | |
2792 | call transpose8_internal | |
2793 | lea r1, [r1 - 8 + 2 * r2] | |
2794 | lea r0, [r6 + 40 * 128 + 80] | |
2795 | mov r3, r0 | |
2796 | call transpose8_internal | |
2797 | lea r1, [r1 - 8 + 2 * r2] | |
2798 | lea r0, [r6 + 40 * 128 + 96] | |
2799 | mov r3, r0 | |
2800 | call transpose8_internal | |
2801 | lea r1, [r1 - 8 + 2 * r2] | |
2802 | lea r0, [r6 + 40 * 128 + 112] | |
2803 | mov r3, r0 | |
2804 | call transpose8_internal | |
2805 | ||
2806 | lea r1, [r4 + 96] | |
2807 | lea r0, [r6 + 48 * 128] | |
2808 | mov r3, r0 | |
2809 | call transpose8_internal | |
2810 | lea r1, [r1 - 8 + 2 * r2] | |
2811 | lea r0, [r6 + 48 * 128 + 16] | |
2812 | mov r3, r0 | |
2813 | call transpose8_internal | |
2814 | lea r1, [r1 - 8 + 2 * r2] | |
2815 | lea r0, [r6 + 48 * 128 + 32] | |
2816 | mov r3, r0 | |
2817 | call transpose8_internal | |
2818 | lea r1, [r1 - 8 + 2 * r2] | |
2819 | lea r0, [r6 + 48 * 128 + 48] | |
2820 | mov r3, r0 | |
2821 | call transpose8_internal | |
2822 | lea r1, [r1 - 8 + 2 * r2] | |
2823 | lea r0, [r6 + 48 * 128 + 64] | |
2824 | mov r3, r0 | |
2825 | call transpose8_internal | |
2826 | lea r1, [r1 - 8 + 2 * r2] | |
2827 | lea r0, [r6 + 48 * 128 + 80] | |
2828 | mov r3, r0 | |
2829 | call transpose8_internal | |
2830 | lea r1, [r1 - 8 + 2 * r2] | |
2831 | lea r0, [r6 + 48 * 128 + 96] | |
2832 | mov r3, r0 | |
2833 | call transpose8_internal | |
2834 | lea r1, [r1 - 8 + 2 * r2] | |
2835 | lea r0, [r6 + 48 * 128 + 112] | |
2836 | mov r3, r0 | |
2837 | call transpose8_internal | |
2838 | ||
2839 | lea r1, [r4 + 112] | |
2840 | lea r0, [r6 + 56 * 128] | |
2841 | mov r3, r0 | |
2842 | call transpose8_internal | |
2843 | lea r1, [r1 - 8 + 2 * r2] | |
2844 | lea r0, [r6 + 56 * 128 + 16] | |
2845 | mov r3, r0 | |
2846 | call transpose8_internal | |
2847 | lea r1, [r1 - 8 + 2 * r2] | |
2848 | lea r0, [r6 + 56 * 128 + 32] | |
2849 | mov r3, r0 | |
2850 | call transpose8_internal | |
2851 | lea r1, [r1 - 8 + 2 * r2] | |
2852 | lea r0, [r6 + 56 * 128 + 48] | |
2853 | mov r3, r0 | |
2854 | call transpose8_internal | |
2855 | lea r1, [r1 - 8 + 2 * r2] | |
2856 | lea r0, [r6 + 56 * 128 + 64] | |
2857 | mov r3, r0 | |
2858 | call transpose8_internal | |
2859 | lea r1, [r1 - 8 + 2 * r2] | |
2860 | lea r0, [r6 + 56 * 128 + 80] | |
2861 | mov r3, r0 | |
2862 | call transpose8_internal | |
2863 | lea r1, [r1 - 8 + 2 * r2] | |
2864 | lea r0, [r6 + 56 * 128 + 96] | |
2865 | mov r3, r0 | |
2866 | call transpose8_internal | |
2867 | lea r1, [r1 - 8 + 2 * r2] | |
2868 | lea r0, [r6 + 56 * 128 + 112] | |
2869 | mov r3, r0 | |
2870 | call transpose8_internal | |
2871 | RET | |
2872 | %else ;HIGH_BIT_DEPTH == 0 | |
2873 | %if ARCH_X86_64 == 1 | |
2874 | INIT_YMM avx2 | |
2875 | ||
2876 | cglobal transpose16x32_avx2 | |
2877 | movu m0, [r1] | |
2878 | movu m1, [r1 + r2] | |
2879 | movu m2, [r1 + 2 * r2] | |
2880 | movu m3, [r1 + r3] | |
2881 | lea r1, [r1 + 4 * r2] | |
2882 | ||
2883 | movu m4, [r1] | |
2884 | movu m5, [r1 + r2] | |
2885 | movu m6, [r1 + 2 * r2] | |
2886 | movu m7, [r1 + r3] | |
2887 | ||
2888 | punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2] | |
2889 | punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2] | |
2890 | ||
2891 | punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4] | |
2892 | punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4] | |
2893 | ||
2894 | punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6] | |
2895 | punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6] | |
2896 | ||
2897 | punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8] | |
2898 | punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8] | |
2899 | ||
2900 | punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4] | |
2901 | punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4] | |
2902 | ||
2903 | punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8] | |
2904 | punpckhwd m3, m5 ;[5 - 8 ; 21 - 24][5 6 7 8] | |
2905 | ||
2906 | punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4] | |
2907 | punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4] | |
2908 | ||
2909 | punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8] | |
2910 | punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8] | |
2911 | ||
2912 | punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8] | |
2913 | punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8] | |
2914 | ||
2915 | punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8] | |
2916 | punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8] | |
2917 | ||
2918 | punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8] | |
2919 | punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8] | |
2920 | ||
2921 | punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8] | |
2922 | punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8] | |
2923 | ||
2924 | movq [r0 + 0 * 64], xm6 | |
2925 | movhps [r0 + 1 * 64], xm6 | |
2926 | vextracti128 xm4, m6, 1 | |
2927 | movq [r0 + 16 * 64], xm4 | |
2928 | movhps [r0 + 17 * 64], xm4 | |
2929 | ||
2930 | lea r1, [r1 + 4 * r2] | |
2931 | movu m9, [r1] | |
2932 | movu m10, [r1 + r2] | |
2933 | movu m11, [r1 + 2 * r2] | |
2934 | movu m12, [r1 + r3] | |
2935 | lea r1, [r1 + 4 * r2] | |
2936 | ||
2937 | movu m13, [r1] | |
2938 | movu m14, [r1 + r2] | |
2939 | movu m15, [r1 + 2 * r2] | |
2940 | movu m6, [r1 + r3] | |
2941 | ||
2942 | punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10] | |
2943 | punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10] | |
2944 | ||
2945 | punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12] | |
2946 | punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12] | |
2947 | ||
2948 | punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14] | |
2949 | punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14] | |
2950 | ||
2951 | punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16] | |
2952 | punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16] | |
2953 | ||
2954 | punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12] | |
2955 | punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12] | |
2956 | ||
2957 | punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16] | |
2958 | punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16] | |
2959 | ||
2960 | punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12] | |
2961 | punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12] | |
2962 | ||
2963 | punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16] | |
2964 | punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16] | |
2965 | ||
2966 | punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16] | |
2967 | punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16] | |
2968 | ||
2969 | punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16] | |
2970 | punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16] | |
2971 | ||
2972 | punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16] | |
2973 | punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16] | |
2974 | ||
2975 | punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16] | |
2976 | punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16] | |
2977 | ||
2978 | ||
2979 | punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2980 | punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2981 | ||
2982 | punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2983 | punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2984 | ||
2985 | punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2986 | punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2987 | ||
2988 | punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2989 | punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2990 | ||
2991 | punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2992 | punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2993 | ||
2994 | punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2995 | punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2996 | ||
2997 | punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2998 | punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2999 | ||
3000 | movq [r0 + 0 * 64 + 8], xm15 | |
3001 | movhps [r0 + 1 * 64 + 8], xm15 | |
3002 | vextracti128 xm9, m15, 1 | |
3003 | movq [r0 + 16 * 64 + 8], xm9 | |
3004 | movhps [r0 + 17 * 64 + 8], xm9 | |
3005 | ||
3006 | movu [r0 + 2 * 64], xm13 | |
3007 | vextracti128 [r0 + 18 * 64], m13, 1 | |
3008 | ||
3009 | movu [r0 + 3 * 64], xm7 | |
3010 | vextracti128 [r0 + 19 * 64], m7, 1 | |
3011 | ||
3012 | movu [r0 + 4 * 64], xm6 | |
3013 | vextracti128 [r0 + 20 * 64], m6, 1 | |
3014 | ||
3015 | movu [r0 + 5 * 64], xm1 | |
3016 | vextracti128 [r0 + 21 * 64], m1, 1 | |
3017 | ||
3018 | movu [r0 + 6 * 64], xm10 | |
3019 | vextracti128 [r0 + 22 * 64], m10, 1 | |
3020 | ||
3021 | movu [r0 + 7 * 64], xm8 | |
3022 | vextracti128 [r0 + 23 * 64], m8, 1 | |
3023 | ||
3024 | movu [r0 + 8 * 64], xm4 | |
3025 | vextracti128 [r0 + 24 * 64], m4, 1 | |
3026 | ||
3027 | movu [r0 + 9 * 64], xm3 | |
3028 | vextracti128 [r0 + 25 * 64], m3, 1 | |
3029 | ||
3030 | movu [r0 + 10 * 64], xm12 | |
3031 | vextracti128 [r0 + 26 * 64], m12, 1 | |
3032 | ||
3033 | movu [r0 + 11 * 64], xm5 | |
3034 | vextracti128 [r0 + 27 * 64], m5, 1 | |
3035 | ||
3036 | movu [r0 + 12 * 64], xm14 | |
3037 | vextracti128 [r0 + 28 * 64], m14, 1 | |
3038 | ||
3039 | movu [r0 + 13 * 64], xm2 | |
3040 | vextracti128 [r0 + 29 * 64], m2, 1 | |
3041 | ||
3042 | movu [r0 + 14 * 64], xm11 | |
3043 | vextracti128 [r0 + 30 * 64], m11, 1 | |
3044 | ||
3045 | movu [r0 + 15 * 64], xm0 | |
3046 | vextracti128 [r0 + 31 * 64], m0, 1 | |
3047 | ret | |
3048 | ||
3049 | cglobal transpose64, 3, 6, 16 | |
3050 | ||
3051 | lea r3, [r2 * 3] | |
3052 | lea r4, [r0 + 16] | |
3053 | ||
3054 | lea r5, [r1 + 32] | |
3055 | call transpose16x32_avx2 | |
3056 | lea r0, [r0 + 32 * 64] | |
3057 | mov r1, r5 | |
3058 | call transpose16x32_avx2 | |
3059 | ||
3060 | mov r0, r4 | |
3061 | lea r5, [r1 + 4 * r2] | |
3062 | ||
3063 | lea r1, [r5 - 32] | |
3064 | call transpose16x32_avx2 | |
3065 | lea r0, [r0 + 32 * 64] | |
3066 | mov r1, r5 | |
3067 | call transpose16x32_avx2 | |
3068 | ||
3069 | lea r0, [r4 + 16] | |
3070 | lea r5, [r1 + 4 * r2] | |
3071 | ||
3072 | lea r1, [r5 - 32] | |
3073 | call transpose16x32_avx2 | |
3074 | lea r0, [r0 + 32 * 64] | |
3075 | mov r1, r5 | |
3076 | call transpose16x32_avx2 | |
3077 | ||
3078 | lea r5, [r1 + 4 * r2] | |
3079 | lea r0, [r4 + 32] | |
3080 | ||
3081 | lea r1, [r5 - 32] | |
3082 | call transpose16x32_avx2 | |
3083 | lea r0, [r0 + 32 * 64] | |
3084 | mov r1, r5 | |
3085 | call transpose16x32_avx2 | |
3086 | RET | |
3087 | %endif | |
3088 | ||
3089 | INIT_XMM sse2 | |
3090 | cglobal transpose64, 3, 7, 8, dest, src, stride | |
3091 | mov r3, r0 | |
3092 | mov r4, r1 | |
3093 | mov r5, r0 | |
3094 | mov r6, 64 | |
3095 | call transpose16_internal | |
3096 | lea r1, [r1 - 8 + 2 * r2] | |
3097 | lea r0, [r3 + 16] | |
3098 | mov r5, r0 | |
3099 | call transpose16_internal | |
3100 | lea r1, [r1 - 8 + 2 * r2] | |
3101 | lea r0, [r3 + 32] | |
3102 | mov r5, r0 | |
3103 | call transpose16_internal | |
3104 | lea r1, [r1 - 8 + 2 * r2] | |
3105 | lea r0, [r3 + 48] | |
3106 | mov r5, r0 | |
3107 | call transpose16_internal | |
3108 | ||
3109 | lea r1, [r4 + 16] | |
3110 | lea r0, [r3 + 16 * 64] | |
3111 | mov r5, r0 | |
3112 | call transpose16_internal | |
3113 | lea r1, [r1 - 8 + 2 * r2] | |
3114 | lea r0, [r3 + 16 * 64 + 16] | |
3115 | mov r5, r0 | |
3116 | call transpose16_internal | |
3117 | lea r1, [r1 - 8 + 2 * r2] | |
3118 | lea r0, [r3 + 16 * 64 + 32] | |
3119 | mov r5, r0 | |
3120 | call transpose16_internal | |
3121 | lea r1, [r1 - 8 + 2 * r2] | |
3122 | lea r0, [r3 + 16 * 64 + 48] | |
3123 | mov r5, r0 | |
3124 | call transpose16_internal | |
3125 | ||
3126 | lea r1, [r4 + 32] | |
3127 | lea r0, [r3 + 32 * 64] | |
3128 | mov r5, r0 | |
3129 | call transpose16_internal | |
3130 | lea r1, [r1 - 8 + 2 * r2] | |
3131 | lea r0, [r3 + 32 * 64 + 16] | |
3132 | mov r5, r0 | |
3133 | call transpose16_internal | |
3134 | lea r1, [r1 - 8 + 2 * r2] | |
3135 | lea r0, [r3 + 32 * 64 + 32] | |
3136 | mov r5, r0 | |
3137 | call transpose16_internal | |
3138 | lea r1, [r1 - 8 + 2 * r2] | |
3139 | lea r0, [r3 + 32 * 64 + 48] | |
3140 | mov r5, r0 | |
3141 | call transpose16_internal | |
3142 | ||
3143 | lea r1, [r4 + 48] | |
3144 | lea r0, [r3 + 48 * 64] | |
3145 | mov r5, r0 | |
3146 | call transpose16_internal | |
3147 | lea r1, [r1 - 8 + 2 * r2] | |
3148 | lea r0, [r3 + 48 * 64 + 16] | |
3149 | mov r5, r0 | |
3150 | call transpose16_internal | |
3151 | lea r1, [r1 - 8 + 2 * r2] | |
3152 | lea r0, [r3 + 48 * 64 + 32] | |
3153 | mov r5, r0 | |
3154 | call transpose16_internal | |
3155 | lea r1, [r1 - 8 + 2 * r2] | |
3156 | lea r0, [r3 + 48 * 64 + 48] | |
3157 | mov r5, r0 | |
3158 | call transpose16_internal | |
3159 | RET | |
3160 | %endif | |
3161 | ||
3162 | ||
3163 | ;============================================================================= | |
3164 | ; SSIM | |
3165 | ;============================================================================= | |
3166 | ||
3167 | ;----------------------------------------------------------------------------- | |
3168 | ; void pixel_ssim_4x4x2_core( const pixel *pix1, intptr_t stride1, | |
3169 | ;                              const pixel *pix2, intptr_t stride2, int sums[2][4] ) | |
3170 | ;----------------------------------------------------------------------------- | |
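; A scalar sketch of the accumulation performed below for two adjacent 4x4
; blocks (b = 0, 1); the four per-block sums land in sums[b][0..3]:
;
;    for (int b = 0; b < 2; b++) {
;        int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;        for (int y = 0; y < 4; y++)
;            for (int x = 0; x < 4; x++) {
;                int a = pix1[y * stride1 + 4 * b + x];
;                int c = pix2[y * stride2 + 4 * b + x];
;                s1 += a; s2 += c; ss += a * a + c * c; s12 += a * c;
;            }
;        sums[b][0] = s1; sums[b][1] = s2; sums[b][2] = ss; sums[b][3] = s12;
;    }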
3171 | %macro SSIM_ITER 1 | |
3172 | %if HIGH_BIT_DEPTH | |
3173 | movdqu m5, [r0+(%1&1)*r1] | |
3174 | movdqu m6, [r2+(%1&1)*r3] | |
3175 | %else | |
3176 | movq m5, [r0+(%1&1)*r1] | |
3177 | movq m6, [r2+(%1&1)*r3] | |
3178 | punpcklbw m5, m0 | |
3179 | punpcklbw m6, m0 | |
3180 | %endif | |
3181 | %if %1==1 | |
3182 | lea r0, [r0+r1*2] | |
3183 | lea r2, [r2+r3*2] | |
3184 | %endif | |
3185 | %if %1==0 | |
3186 | movdqa m1, m5 | |
3187 | movdqa m2, m6 | |
3188 | %else | |
3189 | paddw m1, m5 | |
3190 | paddw m2, m6 | |
3191 | %endif | |
3192 | pmaddwd m7, m5, m6 | |
3193 | pmaddwd m5, m5 | |
3194 | pmaddwd m6, m6 | |
3195 | ACCUM paddd, 3, 5, %1 | |
3196 | ACCUM paddd, 4, 7, %1 | |
3197 | paddd m3, m6 | |
3198 | %endmacro | |
3199 | ||
3200 | %macro SSIM 0 | |
3201 | cglobal pixel_ssim_4x4x2_core, 4,4,8 | |
3202 | FIX_STRIDES r1, r3 | |
3203 | pxor m0, m0 | |
3204 | SSIM_ITER 0 | |
3205 | SSIM_ITER 1 | |
3206 | SSIM_ITER 2 | |
3207 | SSIM_ITER 3 | |
3208 | ; PHADDW m1, m2 | |
3209 | ; PHADDD m3, m4 | |
3210 | movdqa m7, [pw_1] | |
3211 | pshufd m5, m3, q2301 | |
3212 | pmaddwd m1, m7 | |
3213 | pmaddwd m2, m7 | |
3214 | pshufd m6, m4, q2301 | |
3215 | packssdw m1, m2 | |
3216 | paddd m3, m5 | |
3217 | pshufd m1, m1, q3120 | |
3218 | paddd m4, m6 | |
3219 | pmaddwd m1, m7 | |
3220 | punpckhdq m5, m3, m4 | |
3221 | punpckldq m3, m4 | |
3222 | ||
3223 | %if UNIX64 | |
3224 | %define t0 r4 | |
3225 | %else | |
3226 | %define t0 rax | |
3227 | mov t0, r4mp | |
3228 | %endif | |
3229 | ||
3230 | movq [t0+ 0], m1 | |
3231 | movq [t0+ 8], m3 | |
3232 | movhps [t0+16], m1 | |
3233 | movq [t0+24], m5 | |
3234 | RET | |
3235 | ||
3236 | ;----------------------------------------------------------------------------- | |
3237 | ; float pixel_ssim_end4( int sum0[5][4], int sum1[5][4], int width ) | |
3238 | ;----------------------------------------------------------------------------- | |
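; Per column group, the kernel below adds the corresponding entries of sum0 and
; sum1 (for the group and its right-hand neighbour) to get s1, s2, ss, s12, and
; then evaluates the usual SSIM term; roughly, in scalar form:
;
;    vars   = ss  * 64  - (s1 * s1 + s2 * s2);
;    covar2 = s12 * 128 - 2 * s1 * s2;
;    ssim   = (2.0f * s1 * s2 + ssim_c1) * (covar2 + ssim_c2)
;           / ((s1 * s1 + s2 * s2 + ssim_c1) * (vars + ssim_c2));
;
; The per-group terms are masked down to 'width' entries and summed into the
; returned float.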
3239 | cglobal pixel_ssim_end4, 2,3 | |
3240 | mov r2d, r2m | |
3241 | mova m0, [r0+ 0] | |
3242 | mova m1, [r0+16] | |
3243 | mova m2, [r0+32] | |
3244 | mova m3, [r0+48] | |
3245 | mova m4, [r0+64] | |
3246 | paddd m0, [r1+ 0] | |
3247 | paddd m1, [r1+16] | |
3248 | paddd m2, [r1+32] | |
3249 | paddd m3, [r1+48] | |
3250 | paddd m4, [r1+64] | |
3251 | paddd m0, m1 | |
3252 | paddd m1, m2 | |
3253 | paddd m2, m3 | |
3254 | paddd m3, m4 | |
3255 | TRANSPOSE4x4D 0, 1, 2, 3, 4 | |
3256 | ||
3257 | ; s1=m0, s2=m1, ss=m2, s12=m3 | |
3258 | %if BIT_DEPTH == 10 | |
3259 | cvtdq2ps m0, m0 | |
3260 | cvtdq2ps m1, m1 | |
3261 | cvtdq2ps m2, m2 | |
3262 | cvtdq2ps m3, m3 | |
3263 | mulps m4, m0, m1 ; s1*s2 | |
3264 | mulps m0, m0 ; s1*s1 | |
3265 | mulps m1, m1 ; s2*s2 | |
3266 | mulps m2, [pf_64] ; ss*64 | |
3267 | mulps m3, [pf_128] ; s12*128 | |
3268 | addps m4, m4 ; s1*s2*2 | |
3269 | addps m0, m1 ; s1*s1 + s2*s2 | |
3270 | subps m2, m0 ; vars | |
3271 | subps m3, m4 ; covar*2 | |
3272 | movaps m1, [ssim_c1] | |
3273 | addps m4, m1 ; s1*s2*2 + ssim_c1 | |
3274 | addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1 | |
3275 | movaps m1, [ssim_c2] | |
3276 | addps m2, m1 ; vars + ssim_c2 | |
3277 | addps m3, m1 ; covar*2 + ssim_c2 | |
3278 | %else | |
3279 | pmaddwd m4, m1, m0 ; s1*s2 | |
3280 | pslld m1, 16 | |
3281 | por m0, m1 | |
3282 | pmaddwd m0, m0 ; s1*s1 + s2*s2 | |
3283 | pslld m4, 1 | |
3284 | pslld m3, 7 | |
3285 | pslld m2, 6 | |
3286 | psubd m3, m4 ; covar*2 | |
3287 | psubd m2, m0 ; vars | |
3288 | mova m1, [ssim_c1] | |
3289 | paddd m0, m1 | |
3290 | paddd m4, m1 | |
3291 | mova m1, [ssim_c2] | |
3292 | paddd m3, m1 | |
3293 | paddd m2, m1 | |
3294 | cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1) | |
3295 | cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) | |
3296 | cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) | |
3297 | cvtdq2ps m2, m2 ; (float)(vars + ssim_c2) | |
3298 | %endif | |
3299 | mulps m4, m3 | |
3300 | mulps m0, m2 | |
3301 | divps m4, m0 ; ssim | |
3302 | ||
3303 | cmp r2d, 4 | |
3304 | je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level | |
3305 | neg r2 | |
3306 | ||
3307 | %ifdef PIC | |
3308 | lea r3, [mask_ff + 16] | |
3309 | %xdefine %%mask r3 | |
3310 | %else | |
3311 | %xdefine %%mask mask_ff + 16 | |
3312 | %endif | |
3313 | %if cpuflag(avx) | |
3314 | andps m4, [%%mask + r2*4] | |
3315 | %else | |
3316 | movups m0, [%%mask + r2*4] | |
3317 | andps m4, m0 | |
3318 | %endif | |
3319 | ||
3320 | .skip: | |
3321 | movhlps m0, m4 | |
3322 | addps m0, m4 | |
3323 | %if cpuflag(ssse3) | |
3324 | movshdup m4, m0 | |
3325 | %else | |
3326 | pshuflw m4, m0, q0032 | |
3327 | %endif | |
3328 | addss m0, m4 | |
3329 | %if ARCH_X86_64 == 0 | |
3330 | movss r0m, m0 | |
3331 | fld dword r0m | |
3332 | %endif | |
3333 | RET | |
3334 | %endmacro ; SSIM | |
3335 | ||
3336 | INIT_XMM sse2 | |
3337 | SSIM | |
3338 | INIT_XMM avx | |
3339 | SSIM | |
3340 | ||
3341 | ;----------------------------------------------------------------- | |
3342 | ; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/) | |
3343 | ;----------------------------------------------------------------- | |
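; Scalar sketch of the 2:1 horizontal downscale done by every path below
; (128 input pixels reduced to 64 by rounded pair averaging; the stride
; argument is unused):
;
;    for (int x = 0; x < 64; x++)
;        dst[x] = (src[2 * x] + src[2 * x + 1] + 1) >> 1;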
3344 | INIT_XMM ssse3 | |
3345 | cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride | |
3346 | %if HIGH_BIT_DEPTH | |
3347 | mova m7, [deinterleave_word_shuf] | |
3348 | ||
3349 | movu m0, [r1] | |
3350 | palignr m1, m0, 2 | |
3351 | movu m2, [r1 + 16] | |
3352 | palignr m3, m2, 2 | |
3353 | movu m4, [r1 + 32] | |
3354 | palignr m5, m4, 2 | |
3355 | movu m6, [r1 + 48] | |
3356 | pavgw m0, m1 | |
3357 | palignr m1, m6, 2 | |
3358 | pavgw m2, m3 | |
3359 | pavgw m4, m5 | |
3360 | pavgw m6, m1 | |
3361 | pshufb m0, m0, m7 | |
3362 | pshufb m2, m2, m7 | |
3363 | pshufb m4, m4, m7 | |
3364 | pshufb m6, m6, m7 | |
3365 | punpcklqdq m0, m2 | |
3366 | movu [r0], m0 | |
3367 | punpcklqdq m4, m6 | |
3368 | movu [r0 + 16], m4 | |
3369 | ||
3370 | ||
3371 | ||
3372 | movu m0, [r1 + 64] | |
3373 | palignr m1, m0, 2 | |
3374 | movu m2, [r1 + 80] | |
3375 | palignr m3, m2, 2 | |
3376 | movu m4, [r1 + 96] | |
3377 | palignr m5, m4, 2 | |
3378 | movu m6, [r1 + 112] | |
3379 | pavgw m0, m1 | |
3380 | palignr m1, m6, 2 | |
3381 | pavgw m2, m3 | |
3382 | pavgw m4, m5 | |
3383 | pavgw m6, m1 | |
3384 | pshufb m0, m0, m7 | |
3385 | pshufb m2, m2, m7 | |
3386 | pshufb m4, m4, m7 | |
3387 | pshufb m6, m6, m7 | |
3388 | punpcklqdq m0, m2 | |
3389 | movu [r0 + 32], m0 | |
3390 | punpcklqdq m4, m6 | |
3391 | movu [r0 + 48], m4 | |
3392 | ||
3393 | movu m0, [r1 + 128] | |
3394 | palignr m1, m0, 2 | |
3395 | movu m2, [r1 + 144] | |
3396 | palignr m3, m2, 2 | |
3397 | movu m4, [r1 + 160] | |
3398 | palignr m5, m4, 2 | |
3399 | movu m6, [r1 + 176] | |
3400 | pavgw m0, m1 | |
3401 | palignr m1, m6, 2 | |
3402 | pavgw m2, m3 | |
3403 | pavgw m4, m5 | |
3404 | pavgw m6, m1 | |
3405 | pshufb m0, m0, m7 | |
3406 | pshufb m2, m2, m7 | |
3407 | pshufb m4, m4, m7 | |
3408 | pshufb m6, m6, m7 | |
3409 | ||
3410 | punpcklqdq m0, m2 | |
3411 | movu [r0 + 64], m0 | |
3412 | punpcklqdq m4, m6 | |
3413 | movu [r0 + 80], m4 | |
3414 | ||
3415 | movu m0, [r1 + 192] | |
3416 | palignr m1, m0, 2 | |
3417 | movu m2, [r1 + 208] | |
3418 | palignr m3, m2, 2 | |
3419 | movu m4, [r1 + 224] | |
3420 | palignr m5, m4, 2 | |
3421 | movu m6, [r1 + 240] | |
3422 | pavgw m0, m1 | |
3423 | palignr m1, m6, 2 | |
3424 | pavgw m2, m3 | |
3425 | pavgw m4, m5 | |
3426 | pavgw m6, m1 | |
3427 | pshufb m0, m0, m7 | |
3428 | pshufb m2, m2, m7 | |
3429 | pshufb m4, m4, m7 | |
3430 | pshufb m6, m6, m7 | |
3431 | ||
3432 | punpcklqdq m0, m2 | |
3433 | movu [r0 + 96], m0 | |
3434 | punpcklqdq m4, m6 | |
3435 | movu [r0 + 112], m4 | |
3436 | ||
3437 | %else | |
3438 | mova m7, [deinterleave_shuf] | |
3439 | ||
3440 | movu m0, [r1] | |
3441 | palignr m1, m0, 1 | |
3442 | movu m2, [r1 + 16] | |
3443 | palignr m3, m2, 1 | |
3444 | movu m4, [r1 + 32] | |
3445 | palignr m5, m4, 1 | |
3446 | movu m6, [r1 + 48] | |
3447 | ||
3448 | pavgb m0, m1 | |
3449 | ||
3450 | palignr m1, m6, 1 | |
3451 | ||
3452 | pavgb m2, m3 | |
3453 | pavgb m4, m5 | |
3454 | pavgb m6, m1 | |
3455 | ||
3456 | pshufb m0, m0, m7 | |
3457 | pshufb m2, m2, m7 | |
3458 | pshufb m4, m4, m7 | |
3459 | pshufb m6, m6, m7 | |
3460 | ||
3461 | punpcklqdq m0, m2 | |
3462 | movu [r0], m0 | |
3463 | punpcklqdq m4, m6 | |
3464 | movu [r0 + 16], m4 | |
3465 | ||
3466 | movu m0, [r1 + 64] | |
3467 | palignr m1, m0, 1 | |
3468 | movu m2, [r1 + 80] | |
3469 | palignr m3, m2, 1 | |
3470 | movu m4, [r1 + 96] | |
3471 | palignr m5, m4, 1 | |
3472 | movu m6, [r1 + 112] | |
3473 | ||
3474 | pavgb m0, m1 | |
3475 | ||
3476 | palignr m1, m6, 1 | |
3477 | ||
3478 | pavgb m2, m3 | |
3479 | pavgb m4, m5 | |
3480 | pavgb m6, m1 | |
3481 | ||
3482 | pshufb m0, m0, m7 | |
3483 | pshufb m2, m2, m7 | |
3484 | pshufb m4, m4, m7 | |
3485 | pshufb m6, m6, m7 | |
3486 | ||
3487 | punpcklqdq m0, m2 | |
3488 | movu [r0 + 32], m0 | |
3489 | punpcklqdq m4, m6 | |
3490 | movu [r0 + 48], m4 | |
3491 | %endif | |
3492 | RET | |
3493 | ||
3494 | %if HIGH_BIT_DEPTH == 1 | |
3495 | INIT_YMM avx2 | |
3496 | cglobal scale1D_128to64, 2, 2, 3 | |
3497 | pxor m2, m2 | |
3498 | ||
3499 | movu m0, [r1] | |
3500 | movu m1, [r1 + 32] | |
3501 | phaddw m0, m1 | |
3502 | pavgw m0, m2 | |
3503 | vpermq m0, m0, 0xD8 | |
3504 | movu [r0], m0 | |
3505 | ||
3506 | movu m0, [r1 + 64] | |
3507 | movu m1, [r1 + 96] | |
3508 | phaddw m0, m1 | |
3509 | pavgw m0, m2 | |
3510 | vpermq m0, m0, 0xD8 | |
3511 | movu [r0 + 32], m0 | |
3512 | ||
3513 | movu m0, [r1 + 128] | |
3514 | movu m1, [r1 + 160] | |
3515 | phaddw m0, m1 | |
3516 | pavgw m0, m2 | |
3517 | vpermq m0, m0, 0xD8 | |
3518 | movu [r0 + 64], m0 | |
3519 | ||
3520 | movu m0, [r1 + 192] | |
3521 | movu m1, [r1 + 224] | |
3522 | phaddw m0, m1 | |
3523 | pavgw m0, m2 | |
3524 | vpermq m0, m0, 0xD8 | |
3525 | movu [r0 + 96], m0 | |
3526 | RET | |
3527 | %else ; HIGH_BIT_DEPTH == 0 | |
3528 | INIT_YMM avx2 | |
3529 | cglobal scale1D_128to64, 2, 2, 4 | |
3530 | pxor m2, m2 | |
3531 | mova m3, [pb_1] | |
3532 | ||
3533 | movu m0, [r1] | |
3534 | pmaddubsw m0, m0, m3 | |
3535 | pavgw m0, m2 | |
3536 | movu m1, [r1 + 32] | |
3537 | pmaddubsw m1, m1, m3 | |
3538 | pavgw m1, m2 | |
3539 | packuswb m0, m1 | |
3540 | vpermq m0, m0, 0xD8 | |
3541 | movu [r0], m0 | |
3542 | ||
3543 | movu m0, [r1 + 64] | |
3544 | pmaddubsw m0, m0, m3 | |
3545 | pavgw m0, m2 | |
3546 | movu m1, [r1 + 96] | |
3547 | pmaddubsw m1, m1, m3 | |
3548 | pavgw m1, m2 | |
3549 | packuswb m0, m1 | |
3550 | vpermq m0, m0, 0xD8 | |
3551 | movu [r0 + 32], m0 | |
3552 | RET | |
3553 | %endif | |
3554 | ||
3555 | ;----------------------------------------------------------------- | |
3556 | ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) | |
3557 | ;----------------------------------------------------------------- | |
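; Scalar sketch of the 2:1 box downscale implemented below; each output pixel
; is the rounded average of a 2x2 input block:
;
;    for (int y = 0; y < 32; y++)
;        for (int x = 0; x < 32; x++) {
;            int i = src[(2 * y) * stride + 2 * x],     j = src[(2 * y) * stride + 2 * x + 1];
;            int k = src[(2 * y + 1) * stride + 2 * x], l = src[(2 * y + 1) * stride + 2 * x + 1];
;            dst[y * 32 + x] = (i + j + k + l + 2) >> 2;
;        }
;
; The vector code reaches the same rounding by averaging pairwise (s = avg(i,j),
; t = avg(k,l), then avg(s,t)) and subtracting the bias bit
; (((i ^ j) | (k ^ l)) & (s ^ t)) & 1, which is what the i/j/k/l/s/t comments in
; the loop refer to.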
3558 | %if HIGH_BIT_DEPTH | |
3559 | INIT_XMM ssse3 | |
3560 | cglobal scale2D_64to32, 3, 4, 8, dest, src, stride | |
3561 | mov r3d, 32 | |
3562 | mova m7, [deinterleave_word_shuf] | |
3563 | add r2, r2 | |
3564 | .loop: | |
3565 | movu m0, [r1] ;i | |
3566 | psrld m1, m0, 16 ;j | |
3567 | movu m2, [r1 + r2] ;k | |
3568 | psrld m3, m2, 16 ;l | |
3569 | movu m4, m0 | |
3570 | movu m5, m2 | |
3571 | pxor m4, m1 ;i^j | |
3572 | pxor m5, m3 ;k^l | |
3573 | por m4, m5 ;ij|kl | |
3574 | pavgw m0, m1 ;s | |
3575 | pavgw m2, m3 ;t | |
3576 | movu m5, m0 | |
3577 | pavgw m0, m2 ;(s+t+1)/2 | |
3578 | pxor m5, m2 ;s^t | |
3579 | pand m4, m5 ;(ij|kl)&st | |
3580 | pand m4, [hmulw_16p] | |
3581 | psubw m0, m4 ;Result | |
3582 | movu m1, [r1 + 16] ;i | |
3583 | psrld m2, m1, 16 ;j | |
3584 | movu m3, [r1 + r2 + 16] ;k | |
3585 | psrld m4, m3, 16 ;l | |
3586 | movu m5, m1 | |
3587 | movu m6, m3 | |
3588 | pxor m5, m2 ;i^j | |
3589 | pxor m6, m4 ;k^l | |
3590 | por m5, m6 ;ij|kl | |
3591 | pavgw m1, m2 ;s | |
3592 | pavgw m3, m4 ;t | |
3593 | movu m6, m1 | |
3594 | pavgw m1, m3 ;(s+t+1)/2 | |
3595 | pxor m6, m3 ;s^t | |
3596 | pand m5, m6 ;(ij|kl)&st | |
3597 | pand m5, [hmulw_16p] | |
3598 | psubw m1, m5 ;Result | |
3599 | pshufb m0, m7 | |
3600 | pshufb m1, m7 | |
3601 | ||
3602 | punpcklqdq m0, m1 | |
3603 | movu [r0], m0 | |
3604 | ||
3605 | movu m0, [r1 + 32] ;i | |
3606 | psrld m1, m0, 16 ;j | |
3607 | movu m2, [r1 + r2 + 32] ;k | |
3608 | psrld m3, m2, 16 ;l | |
3609 | movu m4, m0 | |
3610 | movu m5, m2 | |
3611 | pxor m4, m1 ;i^j | |
3612 | pxor m5, m3 ;k^l | |
3613 | por m4, m5 ;ij|kl | |
3614 | pavgw m0, m1 ;s | |
3615 | pavgw m2, m3 ;t | |
3616 | movu m5, m0 | |
3617 | pavgw m0, m2 ;(s+t+1)/2 | |
3618 | pxor m5, m2 ;s^t | |
3619 | pand m4, m5 ;(ij|kl)&st | |
3620 | pand m4, [hmulw_16p] | |
3621 | psubw m0, m4 ;Result | |
3622 | movu m1, [r1 + 48] ;i | |
3623 | psrld m2, m1, 16 ;j | |
3624 | movu m3, [r1 + r2 + 48] ;k | |
3625 | psrld m4, m3, 16 ;l | |
3626 | movu m5, m1 | |
3627 | movu m6, m3 | |
3628 | pxor m5, m2 ;i^j | |
3629 | pxor m6, m4 ;k^l | |
3630 | por m5, m6 ;ij|kl | |
3631 | pavgw m1, m2 ;s | |
3632 | pavgw m3, m4 ;t | |
3633 | movu m6, m1 | |
3634 | pavgw m1, m3 ;(s+t+1)/2 | |
3635 | pxor m6, m3 ;s^t | |
3636 | pand m5, m6 ;(ij|kl)&st | |
3637 | pand m5, [hmulw_16p] | |
3638 | psubw m1, m5 ;Result | |
3639 | pshufb m0, m7 | |
3640 | pshufb m1, m7 | |
3641 | ||
3642 | punpcklqdq m0, m1 | |
3643 | movu [r0 + 16], m0 | |
3644 | ||
3645 | movu m0, [r1 + 64] ;i | |
3646 | psrld m1, m0, 16 ;j | |
3647 | movu m2, [r1 + r2 + 64] ;k | |
3648 | psrld m3, m2, 16 ;l | |
3649 | movu m4, m0 | |
3650 | movu m5, m2 | |
3651 | pxor m4, m1 ;i^j | |
3652 | pxor m5, m3 ;k^l | |
3653 | por m4, m5 ;ij|kl | |
3654 | pavgw m0, m1 ;s | |
3655 | pavgw m2, m3 ;t | |
3656 | movu m5, m0 | |
3657 | pavgw m0, m2 ;(s+t+1)/2 | |
3658 | pxor m5, m2 ;s^t | |
3659 | pand m4, m5 ;(ij|kl)&st | |
3660 | pand m4, [hmulw_16p] | |
3661 | psubw m0, m4 ;Result | |
3662 | movu m1, [r1 + 80] ;i | |
3663 | psrld m2, m1, 16 ;j | |
3664 | movu m3, [r1 + r2 + 80] ;k | |
3665 | psrld m4, m3, 16 ;l | |
3666 | movu m5, m1 | |
3667 | movu m6, m3 | |
3668 | pxor m5, m2 ;i^j | |
3669 | pxor m6, m4 ;k^l | |
3670 | por m5, m6 ;ij|kl | |
3671 | pavgw m1, m2 ;s | |
3672 | pavgw m3, m4 ;t | |
3673 | movu m6, m1 | |
3674 | pavgw m1, m3 ;(s+t+1)/2 | |
3675 | pxor m6, m3 ;s^t | |
3676 | pand m5, m6 ;(ij|kl)&st | |
3677 | pand m5, [hmulw_16p] | |
3678 | psubw m1, m5 ;Result | |
3679 | pshufb m0, m7 | |
3680 | pshufb m1, m7 | |
3681 | ||
3682 | punpcklqdq m0, m1 | |
3683 | movu [r0 + 32], m0 | |
3684 | ||
3685 | movu m0, [r1 + 96] ;i | |
3686 | psrld m1, m0, 16 ;j | |
3687 | movu m2, [r1 + r2 + 96] ;k | |
3688 | psrld m3, m2, 16 ;l | |
3689 | movu m4, m0 | |
3690 | movu m5, m2 | |
3691 | pxor m4, m1 ;i^j | |
3692 | pxor m5, m3 ;k^l | |
3693 | por m4, m5 ;ij|kl | |
3694 | pavgw m0, m1 ;s | |
3695 | pavgw m2, m3 ;t | |
3696 | movu m5, m0 | |
3697 | pavgw m0, m2 ;(s+t+1)/2 | |
3698 | pxor m5, m2 ;s^t | |
3699 | pand m4, m5 ;(ij|kl)&st | |
3700 | pand m4, [hmulw_16p] | |
3701 | psubw m0, m4 ;Result | |
3702 | movu m1, [r1 + 112] ;i | |
3703 | psrld m2, m1, 16 ;j | |
3704 | movu m3, [r1 + r2 + 112] ;k | |
3705 | psrld m4, m3, 16 ;l | |
3706 | movu m5, m1 | |
3707 | movu m6, m3 | |
3708 | pxor m5, m2 ;i^j | |
3709 | pxor m6, m4 ;k^l | |
3710 | por m5, m6 ;ij|kl | |
3711 | pavgw m1, m2 ;s | |
3712 | pavgw m3, m4 ;t | |
3713 | movu m6, m1 | |
3714 | pavgw m1, m3 ;(s+t+1)/2 | |
3715 | pxor m6, m3 ;s^t | |
3716 | pand m5, m6 ;(ij|kl)&st | |
3717 | pand m5, [hmulw_16p] | |
3718 | psubw m1, m5 ;Result | |
3719 | pshufb m0, m7 | |
3720 | pshufb m1, m7 | |
3721 | ||
3722 | punpcklqdq m0, m1 | |
3723 | movu [r0 + 48], m0 | |
3724 | lea r0, [r0 + 64] | |
3725 | lea r1, [r1 + 2 * r2] | |
3726 | dec r3d | |
3727 | jnz .loop | |
3728 | RET | |
3729 | %else | |
3730 | ||
3731 | INIT_XMM ssse3 | |
3732 | cglobal scale2D_64to32, 3, 4, 8, dest, src, stride | |
3733 | mov r3d, 32 | |
3734 | mova m7, [deinterleave_shuf] | |
3735 | .loop: | |
3736 | ||
3737 | movu m0, [r1] ;i | |
3738 | psrlw m1, m0, 8 ;j | |
3739 | movu m2, [r1 + r2] ;k | |
3740 | psrlw m3, m2, 8 ;l | |
3741 | movu m4, m0 | |
3742 | movu m5, m2 | |
3743 | ||
3744 | pxor m4, m1 ;i^j | |
3745 | pxor m5, m3 ;k^l | |
3746 | por m4, m5 ;ij|kl | |
3747 | ||
3748 | pavgb m0, m1 ;s | |
3749 | pavgb m2, m3 ;t | |
3750 | movu m5, m0 | |
3751 | pavgb m0, m2 ;(s+t+1)/2 | |
3752 | pxor m5, m2 ;s^t | |
3753 | pand m4, m5 ;(ij|kl)&st | |
3754 | pand m4, [hmul_16p] | |
3755 | psubb m0, m4 ;Result | |
3756 | ||
3757 | movu m1, [r1 + 16] ;i | |
3758 | psrlw m2, m1, 8 ;j | |
3759 | movu m3, [r1 + r2 + 16] ;k | |
3760 | psrlw m4, m3, 8 ;l | |
3761 | movu m5, m1 | |
3762 | movu m6, m3 | |
3763 | ||
3764 | pxor m5, m2 ;i^j | |
3765 | pxor m6, m4 ;k^l | |
3766 | por m5, m6 ;ij|kl | |
3767 | ||
3768 | pavgb m1, m2 ;s | |
3769 | pavgb m3, m4 ;t | |
3770 | movu m6, m1 | |
3771 | pavgb m1, m3 ;(s+t+1)/2 | |
3772 | pxor m6, m3 ;s^t | |
3773 | pand m5, m6 ;(ij|kl)&st | |
3774 | pand m5, [hmul_16p] | |
3775 | psubb m1, m5 ;Result | |
3776 | ||
3777 | pshufb m0, m0, m7 | |
3778 | pshufb m1, m1, m7 | |
3779 | ||
3780 | punpcklqdq m0, m1 | |
3781 | movu [r0], m0 | |
3782 | ||
3783 | movu m0, [r1 + 32] ;i | |
3784 | psrlw m1, m0, 8 ;j | |
3785 | movu m2, [r1 + r2 + 32] ;k | |
3786 | psrlw m3, m2, 8 ;l | |
3787 | movu m4, m0 | |
3788 | movu m5, m2 | |
3789 | ||
3790 | pxor m4, m1 ;i^j | |
3791 | pxor m5, m3 ;k^l | |
3792 | por m4, m5 ;ij|kl | |
3793 | ||
3794 | pavgb m0, m1 ;s | |
3795 | pavgb m2, m3 ;t | |
3796 | movu m5, m0 | |
3797 | pavgb m0, m2 ;(s+t+1)/2 | |
3798 | pxor m5, m2 ;s^t | |
3799 | pand m4, m5 ;(ij|kl)&st | |
3800 | pand m4, [hmul_16p] | |
3801 | psubb m0, m4 ;Result | |
3802 | ||
3803 | movu m1, [r1 + 48] ;i | |
3804 | psrlw m2, m1, 8 ;j | |
3805 | movu m3, [r1 + r2 + 48] ;k | |
3806 | psrlw m4, m3, 8 ;l | |
3807 | movu m5, m1 | |
3808 | movu m6, m3 | |
3809 | ||
3810 | pxor m5, m2 ;i^j | |
3811 | pxor m6, m4 ;k^l | |
3812 | por m5, m6 ;ij|kl | |
3813 | ||
3814 | pavgb m1, m2 ;s | |
3815 | pavgb m3, m4 ;t | |
3816 | movu m6, m1 | |
3817 | pavgb m1, m3 ;(s+t+1)/2 | |
3818 | pxor m6, m3 ;s^t | |
3819 | pand m5, m6 ;(ij|kl)&st | |
3820 | pand m5, [hmul_16p] | |
3821 | psubb m1, m5 ;Result | |
3822 | ||
3823 | pshufb m0, m0, m7 | |
3824 | pshufb m1, m1, m7 | |
3825 | ||
3826 | punpcklqdq m0, m1 | |
3827 | movu [r0 + 16], m0 | |
3828 | ||
3829 | lea r0, [r0 + 32] | |
3830 | lea r1, [r1 + 2 * r2] | |
3831 | dec r3d | |
3832 | jnz .loop | |
3833 | RET | |
3834 | %endif | |
3835 | ||
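; scale2D_64to32 computes each output pixel as the rounded average of a 2x2
; block of source pixels.  Chaining two pavg operations would round up twice,
; so the pxor/por/pand sequence subtracts 1 exactly when that double rounding
; overshoots, giving (a + b + c + d + 2) >> 2 precisely.  A minimal C sketch
; of the same operation (function and parameter names here are illustrative):
;
;   static void scale2D_64to32_c(pixel *dst, const pixel *src, intptr_t stride)
;   {
;       for (int y = 0; y < 32; y++)
;       {
;           for (int x = 0; x < 32; x++)
;           {
;               int a = src[2 * x];
;               int b = src[2 * x + 1];
;               int c = src[2 * x + stride];
;               int d = src[2 * x + stride + 1];
;               dst[x] = (pixel)((a + b + c + d + 2) >> 2);
;           }
;           dst += 32;
;           src += 2 * stride;
;       }
;   }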
3836 | ||
3837 | ;----------------------------------------------------------------------------- | |
3838 | ; void pixel_sub_ps_4x4(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
3839 | ;----------------------------------------------------------------------------- | |
3840 | %if HIGH_BIT_DEPTH | |
3841 | INIT_XMM sse2 | |
3842 | cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3843 | add r4, r4 | |
3844 | add r5, r5 | |
3845 | add r1, r1 | |
3846 | movh m0, [r2] | |
3847 | movh m2, [r2 + r4] | |
3848 | movh m1, [r3] | |
3849 | movh m3, [r3 + r5] | |
3850 | lea r2, [r2 + r4 * 2] | |
3851 | lea r3, [r3 + r5 * 2] | |
3852 | movh m4, [r2] | |
3853 | movh m6, [r2 + r4] | |
3854 | movh m5, [r3] | |
3855 | movh m7, [r3 + r5] | |
3856 | ||
3857 | psubw m0, m1 | |
3858 | psubw m2, m3 | |
3859 | psubw m4, m5 | |
3860 | psubw m6, m7 | |
3861 | ||
3862 | movh [r0], m0 | |
3863 | movh [r0 + r1], m2 | |
3864 | lea r0, [r0 + r1 * 2] | |
3865 | movh [r0], m4 | |
3866 | movh [r0 + r1], m6 | |
3867 | ||
3868 | RET | |
3869 | %else | |
3870 | INIT_XMM sse4 | |
3871 | cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3872 | add r1, r1 | |
3873 | movd m0, [r2] | |
3874 | movd m2, [r2 + r4] | |
3875 | movd m1, [r3] | |
3876 | movd m3, [r3 + r5] | |
3877 | lea r2, [r2 + r4 * 2] | |
3878 | lea r3, [r3 + r5 * 2] | |
3879 | movd m4, [r2] | |
3880 | movd m6, [r2 + r4] | |
3881 | movd m5, [r3] | |
3882 | movd m7, [r3 + r5] | |
3883 | punpckldq m0, m2 | |
3884 | punpckldq m1, m3 | |
3885 | punpckldq m4, m6 | |
3886 | punpckldq m5, m7 | |
3887 | pmovzxbw m0, m0 | |
3888 | pmovzxbw m1, m1 | |
3889 | pmovzxbw m4, m4 | |
3890 | pmovzxbw m5, m5 | |
3891 | ||
3892 | psubw m0, m1 | |
3893 | psubw m4, m5 | |
3894 | ||
3895 | movh [r0], m0 | |
3896 | movhps [r0 + r1], m0 | |
3897 | movh [r0 + r1 * 2], m4 | |
3898 | lea r0, [r0 + r1 * 2] | |
3899 | movhps [r0 + r1], m4 | |
3900 | ||
3901 | RET | |
3902 | %endif | |
3903 | ||
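; The pixel_sub_ps_WxH entry points above and below all compute the same
; thing: the signed 16-bit residual dest = src0 - src1 over a WxH block.
; Strides are in elements; the asm doubles the int16 destination stride (and,
; at high bit depth, the source strides) to convert them to byte offsets.
; A minimal C reference sketch (the width/height parameters are illustrative;
; the asm versions are specialised per block size):
;
;   static void pixel_sub_ps_c(int16_t *dest, intptr_t deststride,
;                              const pixel *src0, const pixel *src1,
;                              intptr_t srcstride0, intptr_t srcstride1,
;                              int width, int height)
;   {
;       for (int y = 0; y < height; y++)
;       {
;           for (int x = 0; x < width; x++)
;               dest[x] = (int16_t)(src0[x] - src1[x]);
;           dest += deststride;
;           src0 += srcstride0;
;           src1 += srcstride1;
;       }
;   }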
3904 | ||
3905 | ;----------------------------------------------------------------------------- | |
3906 | ; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
3907 | ;----------------------------------------------------------------------------- | |
3908 | %macro PIXELSUB_PS_W4_H4 2 | |
3909 | %if HIGH_BIT_DEPTH | |
3910 | cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3911 | mov r6d, %2/4 | |
3912 | add r4, r4 | |
3913 | add r5, r5 | |
3914 | add r1, r1 | |
3915 | .loop: | |
3916 | movh m0, [r2] | |
3917 | movh m2, [r2 + r4] | |
3918 | movh m1, [r3] | |
3919 | movh m3, [r3 + r5] | |
3920 | lea r2, [r2 + r4 * 2] | |
3921 | lea r3, [r3 + r5 * 2] | |
3922 | movh m4, [r2] | |
3923 | movh m6, [r2 + r4] | |
3924 | movh m5, [r3] | |
3925 | movh m7, [r3 + r5] | |
3926 | dec r6d | |
3927 | lea r2, [r2 + r4 * 2] | |
3928 | lea r3, [r3 + r5 * 2] | |
3929 | ||
3930 | psubw m0, m1 | |
3931 | psubw m2, m3 | |
3932 | psubw m4, m5 | |
3933 | psubw m6, m7 | |
3934 | ||
3935 | movh [r0], m0 | |
3936 | movh [r0 + r1], m2 | |
3937 | movh [r0 + r1 * 2], m4 | |
3938 | lea r0, [r0 + r1 * 2] | |
3939 | movh [r0 + r1], m6 | |
3940 | lea r0, [r0 + r1 * 2] | |
3941 | ||
3942 | jnz .loop | |
3943 | RET | |
3944 | %else | |
3945 | cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3946 | mov r6d, %2/4 | |
3947 | add r1, r1 | |
3948 | .loop: | |
3949 | movd m0, [r2] | |
3950 | movd m2, [r2 + r4] | |
3951 | movd m1, [r3] | |
3952 | movd m3, [r3 + r5] | |
3953 | lea r2, [r2 + r4 * 2] | |
3954 | lea r3, [r3 + r5 * 2] | |
3955 | movd m4, [r2] | |
3956 | movd m6, [r2 + r4] | |
3957 | movd m5, [r3] | |
3958 | movd m7, [r3 + r5] | |
3959 | dec r6d | |
3960 | lea r2, [r2 + r4 * 2] | |
3961 | lea r3, [r3 + r5 * 2] | |
3962 | punpckldq m0, m2 | |
3963 | punpckldq m1, m3 | |
3964 | punpckldq m4, m6 | |
3965 | punpckldq m5, m7 | |
3966 | pmovzxbw m0, m0 | |
3967 | pmovzxbw m1, m1 | |
3968 | pmovzxbw m4, m4 | |
3969 | pmovzxbw m5, m5 | |
3970 | ||
3971 | psubw m0, m1 | |
3972 | psubw m4, m5 | |
3973 | ||
3974 | movh [r0], m0 | |
3975 | movhps [r0 + r1], m0 | |
3976 | movh [r0 + r1 * 2], m4 | |
3977 | lea r0, [r0 + r1 * 2] | |
3978 | movhps [r0 + r1], m4 | |
3979 | lea r0, [r0 + r1 * 2] | |
3980 | ||
3981 | jnz .loop | |
3982 | RET | |
3983 | %endif | |
3984 | %endmacro | |
3985 | ||
3986 | %if HIGH_BIT_DEPTH | |
3987 | INIT_XMM sse2 | |
3988 | PIXELSUB_PS_W4_H4 4, 8 | |
3989 | %else | |
3990 | INIT_XMM sse4 | |
3991 | PIXELSUB_PS_W4_H4 4, 8 | |
3992 | %endif | |
3993 | ||
3994 | ||
3995 | ;----------------------------------------------------------------------------- | |
3996 | ; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
3997 | ;----------------------------------------------------------------------------- | |
3998 | %macro PIXELSUB_PS_W8_H4 2 | |
3999 | %if HIGH_BIT_DEPTH | |
4000 | cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
4001 | mov r6d, %2/4 | |
4002 | add r4, r4 | |
4003 | add r5, r5 | |
4004 | add r1, r1 | |
4005 | .loop: | |
4006 | movu m0, [r2] | |
4007 | movu m2, [r2 + r4] | |
4008 | movu m1, [r3] | |
4009 | movu m3, [r3 + r5] | |
4010 | lea r2, [r2 + r4 * 2] | |
4011 | lea r3, [r3 + r5 * 2] | |
4012 | movu m4, [r2] | |
4013 | movu m6, [r2 + r4] | |
4014 | movu m5, [r3] | |
4015 | movu m7, [r3 + r5] | |
4016 | dec r6d | |
4017 | lea r2, [r2 + r4 * 2] | |
4018 | lea r3, [r3 + r5 * 2] | |
4019 | ||
4020 | psubw m0, m1 | |
4021 | psubw m2, m3 | |
4022 | psubw m4, m5 | |
4023 | psubw m6, m7 | |
4024 | ||
4025 | movu [r0], m0 | |
4026 | movu [r0 + r1], m2 | |
4027 | movu [r0 + r1 * 2], m4 | |
4028 | lea r0, [r0 + r1 * 2] | |
4029 | movu [r0 + r1], m6 | |
4030 | lea r0, [r0 + r1 * 2] | |
4031 | ||
4032 | jnz .loop | |
4033 | RET | |
4034 | %else | |
4035 | cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
4036 | mov r6d, %2/4 | |
4037 | add r1, r1 | |
4038 | .loop: | |
4039 | movh m0, [r2] | |
4040 | movh m2, [r2 + r4] | |
4041 | movh m1, [r3] | |
4042 | movh m3, [r3 + r5] | |
4043 | lea r2, [r2 + r4 * 2] | |
4044 | lea r3, [r3 + r5 * 2] | |
4045 | movh m4, [r2] | |
4046 | movh m6, [r2 + r4] | |
4047 | movh m5, [r3] | |
4048 | movh m7, [r3 + r5] | |
4049 | dec r6d | |
4050 | lea r2, [r2 + r4 * 2] | |
4051 | lea r3, [r3 + r5 * 2] | |
4052 | pmovzxbw m0, m0 | |
4053 | pmovzxbw m1, m1 | |
4054 | pmovzxbw m2, m2 | |
4055 | pmovzxbw m3, m3 | |
4056 | pmovzxbw m4, m4 | |
4057 | pmovzxbw m5, m5 | |
4058 | pmovzxbw m6, m6 | |
4059 | pmovzxbw m7, m7 | |
4060 | ||
4061 | psubw m0, m1 | |
4062 | psubw m2, m3 | |
4063 | psubw m4, m5 | |
4064 | psubw m6, m7 | |
4065 | ||
4066 | movu [r0], m0 | |
4067 | movu [r0 + r1], m2 | |
4068 | movu [r0 + r1 * 2], m4 | |
4069 | lea r0, [r0 + r1 * 2] | |
4070 | movu [r0 + r1], m6 | |
4071 | lea r0, [r0 + r1 * 2] | |
4072 | ||
4073 | jnz .loop | |
4074 | RET | |
4075 | %endif | |
4076 | %endmacro | |
4077 | ||
4078 | %if HIGH_BIT_DEPTH | |
4079 | INIT_XMM sse2 | |
4080 | PIXELSUB_PS_W8_H4 8, 8 | |
4081 | PIXELSUB_PS_W8_H4 8, 16 | |
4082 | %else | |
4083 | INIT_XMM sse4 | |
4084 | PIXELSUB_PS_W8_H4 8, 8 | |
4085 | PIXELSUB_PS_W8_H4 8, 16 | |
4086 | %endif | |
4087 | ||
4088 | ||
4089 | ;----------------------------------------------------------------------------- | |
4090 | ; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
4091 | ;----------------------------------------------------------------------------- | |
4092 | %macro PIXELSUB_PS_W16_H4 2 | |
4093 | %if HIGH_BIT_DEPTH | |
4094 | cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
4095 | mov r6d, %2/4 | |
4096 | add r4, r4 | |
4097 | add r5, r5 | |
4098 | add r1, r1 | |
4099 | .loop: | |
4100 | movu m0, [r2] | |
4101 | movu m2, [r2 + 16] | |
4102 | movu m1, [r3] | |
4103 | movu m3, [r3 + 16] | |
4104 | movu m4, [r2 + r4] | |
4105 | movu m6, [r2 + r4 + 16] | |
4106 | movu m5, [r3 + r5] | |
4107 | movu m7, [r3 + r5 + 16] | |
4108 | dec r6d | |
4109 | lea r2, [r2 + r4 * 2] | |
4110 | lea r3, [r3 + r5 * 2] | |
4111 | ||
4112 | psubw m0, m1 | |
4113 | psubw m2, m3 | |
4114 | psubw m4, m5 | |
4115 | psubw m6, m7 | |
4116 | ||
4117 | movu [r0], m0 | |
4118 | movu [r0 + 16], m2 | |
4119 | movu [r0 + r1], m4 | |
4120 | movu [r0 + r1 + 16], m6 | |
4121 | ||
4122 | movu m0, [r2] | |
4123 | movu m2, [r2 + 16] | |
4124 | movu m1, [r3] | |
4125 | movu m3, [r3 + 16] | |
4126 | movu m4, [r2 + r4] | |
4127 | movu m5, [r3 + r5] | |
4128 | movu m6, [r2 + r4 + 16] | |
4129 | movu m7, [r3 + r5 + 16] | |
4130 | lea r0, [r0 + r1 * 2] | |
4131 | lea r2, [r2 + r4 * 2] | |
4132 | lea r3, [r3 + r5 * 2] | |
4133 | ||
4134 | psubw m0, m1 | |
4135 | psubw m2, m3 | |
4136 | psubw m4, m5 | |
4137 | psubw m6, m7 | |
4138 | ||
4139 | movu [r0], m0 | |
4140 | movu [r0 + 16], m2 | |
4141 | movu [r0 + r1], m4 | |
4142 | movu [r0 + r1 + 16], m6 | |
4143 | lea r0, [r0 + r1 * 2] | |
4144 | ||
4145 | jnz .loop | |
4146 | RET | |
4147 | %else | |
4148 | cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1 | |
4149 | mov r6d, %2/4 | |
4150 | pxor m6, m6 | |
4151 | add r1, r1 | |
4152 | .loop: | |
4153 | movu m1, [r2] | |
4154 | movu m3, [r3] | |
4155 | pmovzxbw m0, m1 | |
4156 | pmovzxbw m2, m3 | |
4157 | punpckhbw m1, m6 | |
4158 | punpckhbw m3, m6 | |
4159 | ||
4160 | psubw m0, m2 | |
4161 | psubw m1, m3 | |
4162 | ||
4163 | movu m5, [r2 + r4] | |
4164 | movu m3, [r3 + r5] | |
4165 | lea r2, [r2 + r4 * 2] | |
4166 | lea r3, [r3 + r5 * 2] | |
4167 | pmovzxbw m4, m5 | |
4168 | pmovzxbw m2, m3 | |
4169 | punpckhbw m5, m6 | |
4170 | punpckhbw m3, m6 | |
4171 | ||
4172 | psubw m4, m2 | |
4173 | psubw m5, m3 | |
4174 | ||
4175 | movu [r0], m0 | |
4176 | movu [r0 + 16], m1 | |
4177 | movu [r0 + r1], m4 | |
4178 | movu [r0 + r1 + 16], m5 | |
4179 | ||
4180 | movu m1, [r2] | |
4181 | movu m3, [r3] | |
4182 | pmovzxbw m0, m1 | |
4183 | pmovzxbw m2, m3 | |
4184 | punpckhbw m1, m6 | |
4185 | punpckhbw m3, m6 | |
4186 | ||
4187 | psubw m0, m2 | |
4188 | psubw m1, m3 | |
4189 | ||
4190 | movu m5, [r2 + r4] | |
4191 | movu m3, [r3 + r5] | |
4192 | dec r6d | |
4193 | lea r2, [r2 + r4 * 2] | |
4194 | lea r3, [r3 + r5 * 2] | |
4195 | lea r0, [r0 + r1 * 2] | |
4196 | pmovzxbw m4, m5 | |
4197 | pmovzxbw m2, m3 | |
4198 | punpckhbw m5, m6 | |
4199 | punpckhbw m3, m6 | |
4200 | ||
4201 | psubw m4, m2 | |
4202 | psubw m5, m3 | |
4203 | ||
4204 | movu [r0], m0 | |
4205 | movu [r0 + 16], m1 | |
4206 | movu [r0 + r1], m4 | |
4207 | movu [r0 + r1 + 16], m5 | |
4208 | lea r0, [r0 + r1 * 2] | |
4209 | ||
4210 | jnz .loop | |
4211 | RET | |
4212 | %endif | |
4213 | %endmacro | |
4214 | ||
4215 | %if HIGH_BIT_DEPTH | |
4216 | INIT_XMM sse2 | |
4217 | PIXELSUB_PS_W16_H4 16, 16 | |
4218 | PIXELSUB_PS_W16_H4 16, 32 | |
4219 | %else | |
4220 | INIT_XMM sse4 | |
4221 | PIXELSUB_PS_W16_H4 16, 16 | |
4222 | PIXELSUB_PS_W16_H4 16, 32 | |
4223 | %endif | |
4224 | ||
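; In the 8-bit paths of the wider block sizes, each 16-byte source load is
; widened to words in two halves: pmovzxbw zero-extends the low eight bytes
; and punpckhbw against a zeroed register extends the high eight, after which
; the subtraction is done at 16-bit precision.  A rough SSE4.1 intrinsics
; sketch of the same idea for one 16-pixel row (names are illustrative, not
; part of x265):
;
;   #include <smmintrin.h>
;   static void sub_row16(int16_t *dst, const uint8_t *a, const uint8_t *b)
;   {
;       __m128i zero = _mm_setzero_si128();
;       __m128i va   = _mm_loadu_si128((const __m128i *)a);
;       __m128i vb   = _mm_loadu_si128((const __m128i *)b);
;       __m128i lo   = _mm_sub_epi16(_mm_cvtepu8_epi16(va), _mm_cvtepu8_epi16(vb));
;       __m128i hi   = _mm_sub_epi16(_mm_unpackhi_epi8(va, zero),
;                                    _mm_unpackhi_epi8(vb, zero));
;       _mm_storeu_si128((__m128i *)dst, lo);
;       _mm_storeu_si128((__m128i *)(dst + 8), hi);
;   }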
4225 | ||
4226 | ;----------------------------------------------------------------------------- | |
4227 | ; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
4228 | ;----------------------------------------------------------------------------- | |
4229 | %macro PIXELSUB_PS_W32_H2 2 | |
4230 | %if HIGH_BIT_DEPTH | |
4231 | cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
4232 | mov r6d, %2/2 | |
4233 | add r4, r4 | |
4234 | add r5, r5 | |
4235 | add r1, r1 | |
4236 | .loop: | |
4237 | movu m0, [r2] | |
4238 | movu m2, [r2 + 16] | |
4239 | movu m4, [r2 + 32] | |
4240 | movu m6, [r2 + 48] | |
4241 | movu m1, [r3] | |
4242 | movu m3, [r3 + 16] | |
4243 | movu m5, [r3 + 32] | |
4244 | movu m7, [r3 + 48] | |
4245 | dec r6d | |
4246 | ||
4247 | psubw m0, m1 | |
4248 | psubw m2, m3 | |
4249 | psubw m4, m5 | |
4250 | psubw m6, m7 | |
4251 | ||
4252 | movu [r0], m0 | |
4253 | movu [r0 + 16], m2 | |
4254 | movu [r0 + 32], m4 | |
4255 | movu [r0 + 48], m6 | |
4256 | ||
4257 | movu m0, [r2 + r4] | |
4258 | movu m2, [r2 + r4 + 16] | |
4259 | movu m4, [r2 + r4 + 32] | |
4260 | movu m6, [r2 + r4 + 48] | |
4261 | movu m1, [r3 + r5] | |
4262 | movu m3, [r3 + r5 + 16] | |
4263 | movu m5, [r3 + r5 + 32] | |
4264 | movu m7, [r3 + r5 + 48] | |
4265 | lea r2, [r2 + r4 * 2] | |
4266 | lea r3, [r3 + r5 * 2] | |
4267 | ||
4268 | psubw m0, m1 | |
4269 | psubw m2, m3 | |
4270 | psubw m4, m5 | |
4271 | psubw m6, m7 | |
4272 | ||
4273 | movu [r0 + r1], m0 | |
4274 | movu [r0 + r1 + 16], m2 | |
4275 | movu [r0 + r1 + 32], m4 | |
4276 | movu [r0 + r1 + 48], m6 | |
4277 | lea r0, [r0 + r1 * 2] | |
4278 | ||
4279 | jnz .loop | |
4280 | RET | |
4281 | %else | |
4282 | cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
4283 | mov r6d, %2/2 | |
4284 | add r1, r1 | |
4285 | .loop: | |
4286 | movh m0, [r2] | |
4287 | movh m1, [r2 + 8] | |
4288 | movh m2, [r2 + 16] | |
4289 | movh m6, [r2 + 24] | |
4290 | movh m3, [r3] | |
4291 | movh m4, [r3 + 8] | |
4292 | movh m5, [r3 + 16] | |
4293 | movh m7, [r3 + 24] | |
4294 | dec r6d | |
4295 | pmovzxbw m0, m0 | |
4296 | pmovzxbw m1, m1 | |
4297 | pmovzxbw m2, m2 | |
4298 | pmovzxbw m6, m6 | |
4299 | pmovzxbw m3, m3 | |
4300 | pmovzxbw m4, m4 | |
4301 | pmovzxbw m5, m5 | |
4302 | pmovzxbw m7, m7 | |
4303 | ||
4304 | psubw m0, m3 | |
4305 | psubw m1, m4 | |
4306 | psubw m2, m5 | |
4307 | psubw m6, m7 | |
4308 | ||
4309 | movu [r0], m0 | |
4310 | movu [r0 + 16], m1 | |
4311 | movu [r0 + 32], m2 | |
4312 | movu [r0 + 48], m6 | |
4313 | ||
4314 | movh m0, [r2 + r4] | |
4315 | movh m1, [r2 + r4 + 8] | |
4316 | movh m2, [r2 + r4 + 16] | |
4317 | movh m6, [r2 + r4 + 24] | |
4318 | movh m3, [r3 + r5] | |
4319 | movh m4, [r3 + r5 + 8] | |
4320 | movh m5, [r3 + r5 + 16] | |
4321 | movh m7, [r3 + r5 + 24] | |
4322 | lea r2, [r2 + r4 * 2] | |
4323 | lea r3, [r3 + r5 * 2] | |
4324 | pmovzxbw m0, m0 | |
4325 | pmovzxbw m1, m1 | |
4326 | pmovzxbw m2, m2 | |
4327 | pmovzxbw m6, m6 | |
4328 | pmovzxbw m3, m3 | |
4329 | pmovzxbw m4, m4 | |
4330 | pmovzxbw m5, m5 | |
4331 | pmovzxbw m7, m7 | |
4332 | ||
4333 | psubw m0, m3 | |
4334 | psubw m1, m4 | |
4335 | psubw m2, m5 | |
4336 | psubw m6, m7 | |
4337 | ||
4338 | movu [r0 + r1], m0 | |
4339 | movu [r0 + r1 + 16], m1 | |
4340 | movu [r0 + r1 + 32], m2 | |
4341 | movu [r0 + r1 + 48], m6 | |
4342 | lea r0, [r0 + r1 * 2] | |
4343 | ||
4344 | jnz .loop | |
4345 | RET | |
4346 | %endif | |
4347 | %endmacro | |
4348 | ||
4349 | %if HIGH_BIT_DEPTH | |
4350 | INIT_XMM sse2 | |
4351 | PIXELSUB_PS_W32_H2 32, 32 | |
4352 | PIXELSUB_PS_W32_H2 32, 64 | |
4353 | %else | |
4354 | INIT_XMM sse4 | |
4355 | PIXELSUB_PS_W32_H2 32, 32 | |
4356 | PIXELSUB_PS_W32_H2 32, 64 | |
4357 | %endif | |
4358 | ||
4359 | ||
4360 | ;----------------------------------------------------------------------------- | |
4361 | ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
4362 | ;----------------------------------------------------------------------------- | |
4363 | %macro PIXELSUB_PS_W64_H2 2 | |
4364 | %if HIGH_BIT_DEPTH | |
4365 | cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
4366 | mov r6d, %2/2 | |
4367 | add r4, r4 | |
4368 | add r5, r5 | |
4369 | add r1, r1 | |
4370 | .loop: | |
4371 | movu m0, [r2] | |
4372 | movu m2, [r2 + 16] | |
4373 | movu m4, [r2 + 32] | |
4374 | movu m6, [r2 + 48] | |
4375 | movu m1, [r3] | |
4376 | movu m3, [r3 + 16] | |
4377 | movu m5, [r3 + 32] | |
4378 | movu m7, [r3 + 48] | |
4379 | ||
4380 | psubw m0, m1 | |
4381 | psubw m2, m3 | |
4382 | psubw m4, m5 | |
4383 | psubw m6, m7 | |
4384 | ||
4385 | movu [r0], m0 | |
4386 | movu [r0 + 16], m2 | |
4387 | movu [r0 + 32], m4 | |
4388 | movu [r0 + 48], m6 | |
4389 | ||
4390 | movu m0, [r2 + 64] | |
4391 | movu m2, [r2 + 80] | |
4392 | movu m4, [r2 + 96] | |
4393 | movu m6, [r2 + 112] | |
4394 | movu m1, [r3 + 64] | |
4395 | movu m3, [r3 + 80] | |
4396 | movu m5, [r3 + 96] | |
4397 | movu m7, [r3 + 112] | |
4398 | ||
4399 | psubw m0, m1 | |
4400 | psubw m2, m3 | |
4401 | psubw m4, m5 | |
4402 | psubw m6, m7 | |
4403 | ||
4404 | movu [r0 + 64], m0 | |
4405 | movu [r0 + 80], m2 | |
4406 | movu [r0 + 96], m4 | |
4407 | movu [r0 + 112], m6 | |
4408 | ||
4409 | movu m0, [r2 + r4] | |
4410 | movu m2, [r2 + r4 + 16] | |
4411 | movu m4, [r2 + r4 + 32] | |
4412 | movu m6, [r2 + r4 + 48] | |
4413 | movu m1, [r3 + r5] | |
4414 | movu m3, [r3 + r5 + 16] | |
4415 | movu m5, [r3 + r5 + 32] | |
4416 | movu m7, [r3 + r5 + 48] | |
4417 | ||
4418 | psubw m0, m1 | |
4419 | psubw m2, m3 | |
4420 | psubw m4, m5 | |
4421 | psubw m6, m7 | |
4422 | ||
4423 | movu [r0 + r1], m0 | |
4424 | movu [r0 + r1 + 16], m2 | |
4425 | movu [r0 + r1 + 32], m4 | |
4426 | movu [r0 + r1 + 48], m6 | |
4427 | ||
4428 | movu m0, [r2 + r4 + 64] | |
4429 | movu m2, [r2 + r4 + 80] | |
4430 | movu m4, [r2 + r4 + 96] | |
4431 | movu m6, [r2 + r4 + 112] | |
4432 | movu m1, [r3 + r5 + 64] | |
4433 | movu m3, [r3 + r5 + 80] | |
4434 | movu m5, [r3 + r5 + 96] | |
4435 | movu m7, [r3 + r5 + 112] | |
4436 | dec r6d | |
4437 | lea r2, [r2 + r4 * 2] | |
4438 | lea r3, [r3 + r5 * 2] | |
4439 | ||
4440 | psubw m0, m1 | |
4441 | psubw m2, m3 | |
4442 | psubw m4, m5 | |
4443 | psubw m6, m7 | |
4444 | ||
4445 | movu [r0 + r1 + 64], m0 | |
4446 | movu [r0 + r1 + 80], m2 | |
4447 | movu [r0 + r1 + 96], m4 | |
4448 | movu [r0 + r1 + 112], m6 | |
4449 | lea r0, [r0 + r1 * 2] | |
4450 | ||
4451 | jnz .loop | |
4452 | RET | |
4453 | %else | |
4454 | cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
4455 | mov r6d, %2/2 | |
4456 | pxor m6, m6 | |
4457 | add r1, r1 | |
4458 | .loop: | |
4459 | movu m1, [r2] | |
4460 | movu m5, [r2 + 16] | |
4461 | movu m3, [r3] | |
4462 | movu m7, [r3 + 16] | |
4463 | ||
4464 | pmovzxbw m0, m1 | |
4465 | pmovzxbw m4, m5 | |
4466 | pmovzxbw m2, m3 | |
4467 | punpckhbw m1, m6 | |
4468 | punpckhbw m3, m6 | |
4469 | punpckhbw m5, m6 | |
4470 | ||
4471 | psubw m0, m2 | |
4472 | psubw m1, m3 | |
4473 | pmovzxbw m2, m7 | |
4474 | punpckhbw m7, m6 | |
4475 | psubw m4, m2 | |
4476 | psubw m5, m7 | |
4477 | ||
4478 | movu m3, [r2 + 32] | |
4479 | movu m7, [r3 + 32] | |
4480 | pmovzxbw m2, m3 | |
4481 | punpckhbw m3, m6 | |
4482 | ||
4483 | movu [r0], m0 | |
4484 | movu [r0 + 16], m1 | |
4485 | movu [r0 + 32], m4 | |
4486 | movu [r0 + 48], m5 | |
4487 | ||
4488 | movu m1, [r2 + 48] | |
4489 | movu m5, [r3 + 48] | |
4490 | pmovzxbw m0, m1 | |
4491 | pmovzxbw m4, m7 | |
4492 | punpckhbw m1, m6 | |
4493 | punpckhbw m7, m6 | |
4494 | ||
4495 | psubw m2, m4 | |
4496 | psubw m3, m7 | |
4497 | ||
4498 | movu [r0 + 64], m2 | |
4499 | movu [r0 + 80], m3 | |
4500 | ||
4501 | movu m7, [r2 + r4] | |
4502 | movu m3, [r3 + r5] | |
4503 | pmovzxbw m2, m5 | |
4504 | pmovzxbw m4, m7 | |
4505 | punpckhbw m5, m6 | |
4506 | punpckhbw m7, m6 | |
4507 | ||
4508 | psubw m0, m2 | |
4509 | psubw m1, m5 | |
4510 | ||
4511 | movu [r0 + 96], m0 | |
4512 | movu [r0 + 112], m1 | |
4513 | ||
4514 | movu m2, [r2 + r4 + 16] | |
4515 | movu m5, [r3 + r5 + 16] | |
4516 | pmovzxbw m0, m3 | |
4517 | pmovzxbw m1, m2 | |
4518 | punpckhbw m3, m6 | |
4519 | punpckhbw m2, m6 | |
4520 | ||
4521 | psubw m4, m0 | |
4522 | psubw m7, m3 | |
4523 | ||
4524 | movu [r0 + r1], m4 | |
4525 | movu [r0 + r1 + 16], m7 | |
4526 | ||
4527 | movu m0, [r2 + r4 + 32] | |
4528 | movu m3, [r3 + r5 + 32] | |
4529 | dec r6d | |
4530 | pmovzxbw m4, m5 | |
4531 | pmovzxbw m7, m0 | |
4532 | punpckhbw m5, m6 | |
4533 | punpckhbw m0, m6 | |
4534 | ||
4535 | psubw m1, m4 | |
4536 | psubw m2, m5 | |
4537 | ||
4538 | movu [r0 + r1 + 32], m1 | |
4539 | movu [r0 + r1 + 48], m2 | |
4540 | ||
4541 | movu m4, [r2 + r4 + 48] | |
4542 | movu m5, [r3 + r5 + 48] | |
4543 | lea r2, [r2 + r4 * 2] | |
4544 | lea r3, [r3 + r5 * 2] | |
4545 | pmovzxbw m1, m3 | |
4546 | pmovzxbw m2, m4 | |
4547 | punpckhbw m3, m6 | |
4548 | punpckhbw m4, m6 | |
4549 | ||
4550 | psubw m7, m1 | |
4551 | psubw m0, m3 | |
4552 | ||
4553 | movu [r0 + r1 + 64], m7 | |
4554 | movu [r0 + r1 + 80], m0 | |
4555 | ||
4556 | pmovzxbw m7, m5 | |
4557 | punpckhbw m5, m6 | |
4558 | psubw m2, m7 | |
4559 | psubw m4, m5 | |
4560 | ||
4561 | movu [r0 + r1 + 96], m2 | |
4562 | movu [r0 + r1 + 112], m4 | |
4563 | lea r0, [r0 + r1 * 2] | |
4564 | ||
4565 | jnz .loop | |
4566 | RET | |
4567 | %endif | |
4568 | %endmacro | |
4569 | ||
4570 | %if HIGH_BIT_DEPTH | |
4571 | INIT_XMM sse2 | |
4572 | PIXELSUB_PS_W64_H2 64, 64 | |
4573 | %else | |
4574 | INIT_XMM sse4 | |
4575 | PIXELSUB_PS_W64_H2 64, 64 | |
4576 | %endif | |
4577 | ||
4578 | ||
4579 | ;============================================================================= | |
4580 | ; variance | |
4581 | ;============================================================================= | |
4582 | ||
4583 | %macro VAR_START 1 | |
4584 | pxor m5, m5 ; sum | |
4585 | pxor m6, m6 ; sum squared | |
4586 | %if HIGH_BIT_DEPTH == 0 | |
4587 | %if %1 | |
4588 | mova m7, [pw_00ff] | |
4589 | %elif mmsize < 32 | |
4590 | pxor m7, m7 ; zero | |
4591 | %endif | |
4592 | %endif ; !HIGH_BIT_DEPTH | |
4593 | %endmacro | |
4594 | ||
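; VAR_END reduces the accumulators left by VAR_CORE: the packed word sums in
; m5 and the packed dword sums of squares in m6.  For the larger blocks the
; word accumulator could overflow 16 bits, so the callers periodically spill a
; partial sum into r4d; the %1 >= 32 (high bit depth) and %1 == 64 (8-bit)
; paths fold that running total back in before the final horizontal add.  The
; pixel sum is returned in the low 32 bits and the sum of squares in the high
; 32 bits (eax/edx on x86-32).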
4595 | %macro VAR_END 2 | |
4596 | %if HIGH_BIT_DEPTH | |
4597 | %if mmsize == 8 && %1*%2 == 256 | |
4598 | HADDUW m5, m2 | |
4599 | %else | |
4600 | %if %1 >= 32 | |
4601 | HADDW m5, m2 | |
4602 | movd m7, r4d | |
4603 | paddd m5, m7 | |
4604 | %else | |
4605 | HADDW m5, m2 | |
4606 | %endif | |
4607 | %endif | |
4608 | %else ; !HIGH_BIT_DEPTH | |
4609 | %if %1 == 64 | |
4610 | HADDW m5, m2 | |
4611 | movd m7, r4d | |
4612 | paddd m5, m7 | |
4613 | %else | |
4614 | HADDW m5, m2 | |
4615 | %endif | |
4616 | %endif ; HIGH_BIT_DEPTH | |
4617 | HADDD m6, m1 | |
4618 | %if ARCH_X86_64 | |
4619 | punpckldq m5, m6 | |
4620 | movq rax, m5 | |
4621 | %else | |
4622 | movd eax, m5 | |
4623 | movd edx, m6 | |
4624 | %endif | |
4625 | RET | |
4626 | %endmacro | |
4627 | ||
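; VAR_CORE expects four registers of pixels already widened to words (m0, m3,
; m1, m4).  It adds them into the packed word sum in m5 and, by multiplying
; each register with itself via pmaddwd, adds their squares into the packed
; dword sum in m6.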
4628 | %macro VAR_CORE 0 | |
4629 | paddw m5, m0 | |
4630 | paddw m5, m3 | |
4631 | paddw m5, m1 | |
4632 | paddw m5, m4 | |
4633 | pmaddwd m0, m0 | |
4634 | pmaddwd m3, m3 | |
4635 | pmaddwd m1, m1 | |
4636 | pmaddwd m4, m4 | |
4637 | paddd m6, m0 | |
4638 | paddd m6, m3 | |
4639 | paddd m6, m1 | |
4640 | paddd m6, m4 | |
4641 | %endmacro | |
4642 | ||
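; VAR_2ROW runs %2 iterations of two rows each (second-row offset %1): at high
; bit depth it loads the words directly, at 8-bit it unpacks bytes to words
; against the zero kept in m7, then feeds the four registers to VAR_CORE.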
4643 | %macro VAR_2ROW 3 | |
4644 | mov r2d, %2 | |
4645 | .loop%3: | |
4646 | %if HIGH_BIT_DEPTH | |
4647 | movu m0, [r0] | |
4648 | movu m1, [r0+mmsize] | |
4649 | movu m3, [r0+%1] | |
4650 | movu m4, [r0+%1+mmsize] | |
4651 | %else ; !HIGH_BIT_DEPTH | |
4652 | mova m0, [r0] | |
4653 | punpckhbw m1, m0, m7 | |
4654 | mova m3, [r0+%1] | |
4655 | mova m4, m3 | |
4656 | punpcklbw m0, m7 | |
4657 | %endif ; HIGH_BIT_DEPTH | |
4658 | %ifidn %1, r1 | |
4659 | lea r0, [r0+%1*2] | |
4660 | %else | |
4661 | add r0, r1 | |
4662 | %endif | |
4663 | %if HIGH_BIT_DEPTH == 0 | |
4664 | punpcklbw m3, m7 | |
4665 | punpckhbw m4, m7 | |
4666 | %endif ; !HIGH_BIT_DEPTH | |
4667 | VAR_CORE | |
4668 | dec r2d | |
4669 | jg .loop%3 | |
4670 | %endmacro | |
4671 | ||
4672 | ;----------------------------------------------------------------------------- | |
4673 | ; uint64_t pixel_var_wxh( pixel *pix, intptr_t i_stride ) | |
4674 | ;----------------------------------------------------------------------------- | |
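; These return the block's pixel sum and sum of squares packed into one 64-bit
; value; the caller derives the variance from that pair.  A minimal C
; reference sketch for the 16x16 case (illustrative name, not the exact C
; fallback):
;
;   static uint64_t pixel_var_16x16_c(const pixel *pix, intptr_t stride)
;   {
;       uint32_t sum = 0, sqr = 0;
;       for (int y = 0; y < 16; y++, pix += stride)
;           for (int x = 0; x < 16; x++)
;           {
;               sum += pix[x];
;               sqr += pix[x] * pix[x];
;           }
;       return sum + ((uint64_t)sqr << 32);
;   }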
4675 | INIT_MMX mmx2 | |
4676 | cglobal pixel_var_16x16, 2,3 | |
4677 | FIX_STRIDES r1 | |
4678 | VAR_START 0 | |
4679 | VAR_2ROW 8*SIZEOF_PIXEL, 16, 1 | |
4680 | VAR_END 16, 16 | |
4681 | ||
4682 | cglobal pixel_var_8x8, 2,3 | |
4683 | FIX_STRIDES r1 | |
4684 | VAR_START 0 | |
4685 | VAR_2ROW r1, 4, 1 | |
4686 | VAR_END 8, 8 | |
4687 | ||
4688 | %if HIGH_BIT_DEPTH | |
4689 | %macro VAR 0 | |
4690 | cglobal pixel_var_16x16, 2,3,8 | |
4691 | FIX_STRIDES r1 | |
4692 | VAR_START 0 | |
4693 | VAR_2ROW r1, 8, 1 | |
4694 | VAR_END 16, 16 | |
4695 | ||
4696 | cglobal pixel_var_8x8, 2,3,8 | |
4697 | lea r2, [r1*3] | |
4698 | VAR_START 0 | |
4699 | movu m0, [r0] | |
4700 | movu m1, [r0+r1*2] | |
4701 | movu m3, [r0+r1*4] | |
4702 | movu m4, [r0+r2*2] | |
4703 | lea r0, [r0+r1*8] | |
4704 | VAR_CORE | |
4705 | movu m0, [r0] | |
4706 | movu m1, [r0+r1*2] | |
4707 | movu m3, [r0+r1*4] | |
4708 | movu m4, [r0+r2*2] | |
4709 | VAR_CORE | |
4710 | VAR_END 8, 8 | |
4711 | ||
4712 | cglobal pixel_var_32x32, 2,6,8 | |
4713 | FIX_STRIDES r1 | |
4714 | mov r3, r0 | |
4715 | VAR_START 0 | |
4716 | VAR_2ROW r1, 8, 1 | |
4717 | HADDW m5, m2 | |
4718 | movd r4d, m5 | |
4719 | pxor m5, m5 | |
4720 | VAR_2ROW r1, 8, 2 | |
4721 | HADDW m5, m2 | |
4722 | movd r5d, m5 | |
4723 | add r4, r5 | |
4724 | pxor m5, m5 | |
4725 | lea r0, [r3 + 32] | |
4726 | VAR_2ROW r1, 8, 3 | |
4727 | HADDW m5, m2 | |
4728 | movd r5d, m5 | |
4729 | add r4, r5 | |
4730 | pxor m5, m5 | |
4731 | VAR_2ROW r1, 8, 4 | |
4732 | VAR_END 32, 32 | |
4733 | ||
4734 | cglobal pixel_var_64x64, 2,6,8 | |
4735 | FIX_STRIDES r1 | |
4736 | mov r3, r0 | |
4737 | VAR_START 0 | |
4738 | VAR_2ROW r1, 8, 1 | |
4739 | HADDW m5, m2 | |
4740 | movd r4d, m5 | |
4741 | pxor m5, m5 | |
4742 | VAR_2ROW r1, 8, 2 | |
4743 | HADDW m5, m2 | |
4744 | movd r5d, m5 | |
4745 | add r4, r5 | |
4746 | pxor m5, m5 | |
4747 | VAR_2ROW r1, 8, 3 | |
4748 | HADDW m5, m2 | |
4749 | movd r5d, m5 | |
4750 | add r4, r5 | |
4751 | pxor m5, m5 | |
4752 | VAR_2ROW r1, 8, 4 | |
4753 | HADDW m5, m2 | |
4754 | movd r5d, m5 | |
4755 | add r4, r5 | |
4756 | pxor m5, m5 | |
4757 | lea r0, [r3 + 32] | |
4758 | VAR_2ROW r1, 8, 5 | |
4759 | HADDW m5, m2 | |
4760 | movd r5d, m5 | |
4761 | add r4, r5 | |
4762 | pxor m5, m5 | |
4763 | VAR_2ROW r1, 8, 6 | |
4764 | HADDW m5, m2 | |
4765 | movd r5d, m5 | |
4766 | add r4, r5 | |
4767 | pxor m5, m5 | |
4768 | VAR_2ROW r1, 8, 7 | |
4769 | HADDW m5, m2 | |
4770 | movd r5d, m5 | |
4771 | add r4, r5 | |
4772 | pxor m5, m5 | |
4773 | VAR_2ROW r1, 8, 8 | |
4774 | HADDW m5, m2 | |
4775 | movd r5d, m5 | |
4776 | add r4, r5 | |
4777 | pxor m5, m5 | |
4778 | lea r0, [r3 + 64] | |
4779 | VAR_2ROW r1, 8, 9 | |
4780 | HADDW m5, m2 | |
4781 | movd r5d, m5 | |
4782 | add r4, r5 | |
4783 | pxor m5, m5 | |
4784 | VAR_2ROW r1, 8, 10 | |
4785 | HADDW m5, m2 | |
4786 | movd r5d, m5 | |
4787 | add r4, r5 | |
4788 | pxor m5, m5 | |
4789 | VAR_2ROW r1, 8, 11 | |
4790 | HADDW m5, m2 | |
4791 | movd r5d, m5 | |
4792 | add r4, r5 | |
4793 | pxor m5, m5 | |
4794 | VAR_2ROW r1, 8, 12 | |
4795 | HADDW m5, m2 | |
4796 | movd r5d, m5 | |
4797 | add r4, r5 | |
4798 | pxor m5, m5 | |
4799 | lea r0, [r3 + 96] | |
4800 | VAR_2ROW r1, 8, 13 | |
4801 | HADDW m5, m2 | |
4802 | movd r5d, m5 | |
4803 | add r4, r5 | |
4804 | pxor m5, m5 | |
4805 | VAR_2ROW r1, 8, 14 | |
4806 | HADDW m5, m2 | |
4807 | movd r5d, m5 | |
4808 | add r4, r5 | |
4809 | pxor m5, m5 | |
4810 | VAR_2ROW r1, 8, 15 | |
4811 | HADDW m5, m2 | |
4812 | movd r5d, m5 | |
4813 | add r4, r5 | |
4814 | pxor m5, m5 | |
4815 | VAR_2ROW r1, 8, 16 | |
4816 | VAR_END 64, 64 | |
4817 | %endmacro ; VAR | |
4818 | ||
4819 | INIT_XMM sse2 | |
4820 | VAR | |
4821 | INIT_XMM avx | |
4822 | VAR | |
4823 | INIT_XMM xop | |
4824 | VAR | |
4825 | %endif ; HIGH_BIT_DEPTH | |
4826 | ||
4827 | %if HIGH_BIT_DEPTH == 0 | |
4828 | %macro VAR 0 | |
4829 | cglobal pixel_var_8x8, 2,3,8 | |
4830 | VAR_START 1 | |
4831 | lea r2, [r1 * 3] | |
4832 | movh m0, [r0] | |
4833 | movh m3, [r0 + r1] | |
4834 | movhps m0, [r0 + r1 * 2] | |
4835 | movhps m3, [r0 + r2] | |
4836 | DEINTB 1, 0, 4, 3, 7 | |
4837 | lea r0, [r0 + r1 * 4] | |
4838 | VAR_CORE | |
4839 | movh m0, [r0] | |
4840 | movh m3, [r0 + r1] | |
4841 | movhps m0, [r0 + r1 * 2] | |
4842 | movhps m3, [r0 + r2] | |
4843 | DEINTB 1, 0, 4, 3, 7 | |
4844 | VAR_CORE | |
4845 | VAR_END 8, 8 | |
4846 | ||
4847 | cglobal pixel_var_16x16_internal | |
4848 | movu m0, [r0] | |
4849 | movu m3, [r0 + r1] | |
4850 | DEINTB 1, 0, 4, 3, 7 | |
4851 | VAR_CORE | |
4852 | movu m0, [r0 + 2 * r1] | |
4853 | movu m3, [r0 + r2] | |
4854 | DEINTB 1, 0, 4, 3, 7 | |
4855 | lea r0, [r0 + r1 * 4] | |
4856 | VAR_CORE | |
4857 | movu m0, [r0] | |
4858 | movu m3, [r0 + r1] | |
4859 | DEINTB 1, 0, 4, 3, 7 | |
4860 | VAR_CORE | |
4861 | movu m0, [r0 + 2 * r1] | |
4862 | movu m3, [r0 + r2] | |
4863 | DEINTB 1, 0, 4, 3, 7 | |
4864 | lea r0, [r0 + r1 * 4] | |
4865 | VAR_CORE | |
4866 | movu m0, [r0] | |
4867 | movu m3, [r0 + r1] | |
4868 | DEINTB 1, 0, 4, 3, 7 | |
4869 | VAR_CORE | |
4870 | movu m0, [r0 + 2 * r1] | |
4871 | movu m3, [r0 + r2] | |
4872 | DEINTB 1, 0, 4, 3, 7 | |
4873 | lea r0, [r0 + r1 * 4] | |
4874 | VAR_CORE | |
4875 | movu m0, [r0] | |
4876 | movu m3, [r0 + r1] | |
4877 | DEINTB 1, 0, 4, 3, 7 | |
4878 | VAR_CORE | |
4879 | movu m0, [r0 + 2 * r1] | |
4880 | movu m3, [r0 + r2] | |
4881 | DEINTB 1, 0, 4, 3, 7 | |
4882 | VAR_CORE | |
4883 | ret | |
4884 | ||
4885 | cglobal pixel_var_16x16, 2,3,8 | |
4886 | VAR_START 1 | |
4887 | lea r2, [r1 * 3] | |
4888 | call pixel_var_16x16_internal | |
4889 | VAR_END 16, 16 | |
4890 | ||
4891 | cglobal pixel_var_32x32, 2,4,8 | |
4892 | VAR_START 1 | |
4893 | lea r2, [r1 * 3] | |
4894 | mov r3, r0 | |
4895 | call pixel_var_16x16_internal | |
4896 | lea r0, [r0 + r1 * 4] | |
4897 | call pixel_var_16x16_internal | |
4898 | lea r0, [r3 + 16] | |
4899 | call pixel_var_16x16_internal | |
4900 | lea r0, [r0 + r1 * 4] | |
4901 | call pixel_var_16x16_internal | |
4902 | VAR_END 32, 32 | |
4903 | ||
4904 | cglobal pixel_var_64x64, 2,6,8 | |
4905 | VAR_START 1 | |
4906 | lea r2, [r1 * 3] | |
4907 | mov r3, r0 | |
4908 | call pixel_var_16x16_internal | |
4909 | lea r0, [r0 + r1 * 4] | |
4910 | call pixel_var_16x16_internal | |
4911 | lea r0, [r0 + r1 * 4] | |
4912 | call pixel_var_16x16_internal | |
4913 | lea r0, [r0 + r1 * 4] | |
4914 | call pixel_var_16x16_internal | |
4915 | HADDW m5, m2 | |
4916 | movd r4d, m5 | |
4917 | pxor m5, m5 | |
4918 | lea r0, [r3 + 16] | |
4919 | call pixel_var_16x16_internal | |
4920 | lea r0, [r0 + r1 * 4] | |
4921 | call pixel_var_16x16_internal | |
4922 | lea r0, [r0 + r1 * 4] | |
4923 | call pixel_var_16x16_internal | |
4924 | lea r0, [r0 + r1 * 4] | |
4925 | call pixel_var_16x16_internal | |
4926 | HADDW m5, m2 | |
4927 | movd r5d, m5 | |
4928 | add r4, r5 | |
4929 | pxor m5, m5 | |
4930 | lea r0, [r3 + 32] | |
4931 | call pixel_var_16x16_internal | |
4932 | lea r0, [r0 + r1 * 4] | |
4933 | call pixel_var_16x16_internal | |
4934 | lea r0, [r0 + r1 * 4] | |
4935 | call pixel_var_16x16_internal | |
4936 | lea r0, [r0 + r1 * 4] | |
4937 | call pixel_var_16x16_internal | |
4938 | lea r0, [r3 + 48] | |
4939 | HADDW m5, m2 | |
4940 | movd r5d, m5 | |
4941 | add r4, r5 | |
4942 | pxor m5, m5 | |
4943 | call pixel_var_16x16_internal | |
4944 | lea r0, [r0 + r1 * 4] | |
4945 | call pixel_var_16x16_internal | |
4946 | lea r0, [r0 + r1 * 4] | |
4947 | call pixel_var_16x16_internal | |
4948 | lea r0, [r0 + r1 * 4] | |
4949 | call pixel_var_16x16_internal | |
4950 | VAR_END 64, 64 | |
4951 | %endmacro ; VAR | |
4952 | ||
4953 | INIT_XMM sse2 | |
4954 | VAR | |
4955 | INIT_XMM avx | |
4956 | VAR | |
4957 | INIT_XMM xop | |
4958 | VAR | |
4959 | ||
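; The AVX2 variant widens a full 16-byte row per pmovzxbw into a ymm register,
; so each VAR_CORE call consumes four rows and the loop runs only four times;
; the two 128-bit lanes of the accumulators are folded with vextracti128
; before the final horizontal adds.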
4960 | INIT_YMM avx2 | |
4961 | cglobal pixel_var_16x16, 2,4,7 | |
4962 | VAR_START 0 | |
4963 | mov r2d, 4 | |
4964 | lea r3, [r1*3] | |
4965 | .loop: | |
4966 | pmovzxbw m0, [r0] | |
4967 | pmovzxbw m3, [r0+r1] | |
4968 | pmovzxbw m1, [r0+r1*2] | |
4969 | pmovzxbw m4, [r0+r3] | |
4970 | lea r0, [r0+r1*4] | |
4971 | VAR_CORE | |
4972 | dec r2d | |
4973 | jg .loop | |
4974 | vextracti128 xm0, m5, 1 | |
4975 | vextracti128 xm1, m6, 1 | |
4976 | paddw xm5, xm0 | |
4977 | paddd xm6, xm1 | |
4978 | HADDW xm5, xm2 | |
4979 | HADDD xm6, xm1 | |
4980 | %if ARCH_X86_64 | |
4981 | punpckldq xm5, xm6 | |
4982 | movq rax, xm5 | |
4983 | %else | |
4984 | movd eax, xm5 | |
4985 | movd edx, xm6 | |
4986 | %endif | |
4987 | RET | |
4988 | %endif ; !HIGH_BIT_DEPTH | |
4989 | ||
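; VAR2_END folds the packed accumulators: it horizontally adds %2 (sum) and
; %3 (sum of squares), stores the sum-of-squares total to [r4], and returns
; sqr - (sum * sum >> %1) in eax.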
4990 | %macro VAR2_END 3 | |
4991 | HADDW %2, xm1 | |
4992 | movd r1d, %2 | |
4993 | imul r1d, r1d | |
4994 | HADDD %3, xm1 | |
4995 | shr r1d, %1 | |
4996 | movd eax, %3 | |
4997 | movd [r4], %3 | |
4998 | sub eax, r1d ; sqr - (sum * sum >> shift) | |
4999 | RET | |
5000 | %endmacro | |
5001 |