;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;*          Nabajit Deka <nabajit@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

%if BIT_DEPTH == 10
ssim_c1:   times 4 dd 6697.7856    ; .01*.01*1023*1023*64
ssim_c2:   times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
pf_64:     times 4 dd 64.0
pf_128:    times 4 dd 128.0
%elif BIT_DEPTH == 9
ssim_c1:   times 4 dd 1671         ; .01*.01*511*511*64
ssim_c2:   times 4 dd 947556       ; .03*.03*511*511*64*63
%else ; 8-bit
ssim_c1:   times 4 dd 416          ; .01*.01*255*255*64
ssim_c2:   times 4 dd 235963       ; .03*.03*255*255*64*63
%endif
mask_ff:                times 16 db 0xff
                        times 16 db 0
deinterleave_shuf:      db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
hmul_16p:               times 16 db 1
                        times 8 db 1, -1
hmulw_16p:              times 8 dw 1
                        times 4 dw 1, -1

trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7

SECTION .text

cextern pw_1
cextern pb_1
cextern pw_00ff
cextern pw_2000
cextern pw_pixel_max
cextern pd_1
cextern pd_32767
cextern pd_n32768


;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
;-----------------------------------------------------------------------------
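; residual[x] = fenc[x] - pred[x], two rows per step. High-bit-depth builds
; double the stride up front (16-bit pixels); 8-bit builds zero-extend the
; pixels to words before subtracting, so the int16_t destination is
; addressed with a byte stride of 2 * r3.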
INIT_XMM sse2
%if HIGH_BIT_DEPTH
cglobal getResidual4, 4,4,4
    add         r3, r3

    ; row 0-1
    movh        m0, [r0]
    movh        m1, [r0 + r3]
    movh        m2, [r1]
    movh        m3, [r1 + r3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m3
    psubw       m0, m2

    movh        [r2], m0
    movhps      [r2 + r3], m0
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 2]

    ; row 2-3
    movh        m0, [r0]
    movh        m1, [r0 + r3]
    movh        m2, [r1]
    movh        m3, [r1 + r3]
    punpcklqdq  m0, m1
    punpcklqdq  m2, m3
    psubw       m0, m2

    movh        [r2], m0
    movhps      [r2 + r3], m0
%else
cglobal getResidual4, 4,4,5
    pxor        m0, m0

    ; row 0-1
    movd        m1, [r0]
    movd        m2, [r0 + r3]
    movd        m3, [r1]
    movd        m4, [r1 + r3]
    punpckldq   m1, m2
    punpcklbw   m1, m0
    punpckldq   m3, m4
    punpcklbw   m3, m0
    psubw       m1, m3
    movh        [r2], m1
    movhps      [r2 + r3 * 2], m1
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]

    ; row 2-3
    movd        m1, [r0]
    movd        m2, [r0 + r3]
    movd        m3, [r1]
    movd        m4, [r1 + r3]
    punpckldq   m1, m2
    punpcklbw   m1, m0
    punpckldq   m3, m4
    punpcklbw   m3, m0
    psubw       m1, m3
    movh        [r2], m1
    movhps      [r2 + r3 * 2], m1
%endif
    RET


INIT_XMM sse2
%if HIGH_BIT_DEPTH
cglobal getResidual8, 4,4,4
    add         r3, r3

%assign x 0
%rep 8/2
    ; row 0-1
    movu        m1, [r0]
    movu        m2, [r0 + r3]
    movu        m3, [r1]
    movu        m4, [r1 + r3]
    psubw       m1, m3
    psubw       m2, m4
    movu        [r2], m1
    movu        [r2 + r3], m2
%assign x x+1
%if (x != 4)
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 2]
%endif
%endrep
%else
cglobal getResidual8, 4,4,5
    pxor        m0, m0

%assign x 0
%rep 8/2
    ; row 0-1
    movh        m1, [r0]
    movh        m2, [r0 + r3]
    movh        m3, [r1]
    movh        m4, [r1 + r3]
    punpcklbw   m1, m0
    punpcklbw   m2, m0
    punpcklbw   m3, m0
    punpcklbw   m4, m0
    psubw       m1, m3
    psubw       m2, m4
    movu        [r2], m1
    movu        [r2 + r3 * 2], m2
%assign x x+1
%if (x != 4)
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]
%endif
%endrep
%endif
    RET

%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal getResidual16, 4,5,6
    add         r3, r3
    mov         r4d, 16/4
.loop:
    ; row 0-1
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    movu        m2, [r0 + r3]
    movu        m3, [r0 + r3 + 16]
    movu        m4, [r1]
    movu        m5, [r1 + 16]
    psubw       m0, m4
    psubw       m1, m5
    movu        m4, [r1 + r3]
    movu        m5, [r1 + r3 + 16]
    psubw       m2, m4
    psubw       m3, m5
    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]

    movu        [r2], m0
    movu        [r2 + 16], m1
    movu        [r2 + r3], m2
    movu        [r2 + r3 + 16], m3
    lea         r2, [r2 + r3 * 2]

    ; row 2-3
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    movu        m2, [r0 + r3]
    movu        m3, [r0 + r3 + 16]
    movu        m4, [r1]
    movu        m5, [r1 + 16]
    psubw       m0, m4
    psubw       m1, m5
    movu        m4, [r1 + r3]
    movu        m5, [r1 + r3 + 16]
    psubw       m2, m4
    psubw       m3, m5

    movu        [r2], m0
    movu        [r2 + 16], m1
    movu        [r2 + r3], m2
    movu        [r2 + r3 + 16], m3

    dec         r4d

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 2]

    jnz         .loop
%else

INIT_XMM sse4
cglobal getResidual16, 4,5,8
    mov         r4d, 16/4
    pxor        m0, m0
.loop:
    ; row 0-1
    movu        m1, [r0]
    movu        m2, [r0 + r3]
    movu        m3, [r1]
    movu        m4, [r1 + r3]
    pmovzxbw    m5, m1
    punpckhbw   m1, m0
    pmovzxbw    m6, m2
    punpckhbw   m2, m0
    pmovzxbw    m7, m3
    punpckhbw   m3, m0
    psubw       m5, m7
    psubw       m1, m3
    pmovzxbw    m7, m4
    punpckhbw   m4, m0
    psubw       m6, m7
    psubw       m2, m4

    movu        [r2], m5
    movu        [r2 + 16], m1
    movu        [r2 + r3 * 2], m6
    movu        [r2 + r3 * 2 + 16], m2

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]

    ; row 2-3
    movu        m1, [r0]
    movu        m2, [r0 + r3]
    movu        m3, [r1]
    movu        m4, [r1 + r3]
    pmovzxbw    m5, m1
    punpckhbw   m1, m0
    pmovzxbw    m6, m2
    punpckhbw   m2, m0
    pmovzxbw    m7, m3
    punpckhbw   m3, m0
    psubw       m5, m7
    psubw       m1, m3
    pmovzxbw    m7, m4
    punpckhbw   m4, m0
    psubw       m6, m7
    psubw       m2, m4

    movu        [r2], m5
    movu        [r2 + 16], m1
    movu        [r2 + r3 * 2], m6
    movu        [r2 + r3 * 2 + 16], m2

    dec         r4d

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]

    jnz         .loop
%endif

    RET

%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal getResidual32, 4,5,6
    add         r3, r3
    mov         r4d, 32/2
.loop:
    ; row 0
    movu        m0, [r0]
    movu        m1, [r0 + 16]
    movu        m2, [r0 + 32]
    movu        m3, [r0 + 48]
    movu        m4, [r1]
    movu        m5, [r1 + 16]
    psubw       m0, m4
    psubw       m1, m5
    movu        m4, [r1 + 32]
    movu        m5, [r1 + 48]
    psubw       m2, m4
    psubw       m3, m5

    movu        [r2], m0
    movu        [r2 + 16], m1
    movu        [r2 + 32], m2
    movu        [r2 + 48], m3

    ; row 1
    movu        m0, [r0 + r3]
    movu        m1, [r0 + r3 + 16]
    movu        m2, [r0 + r3 + 32]
    movu        m3, [r0 + r3 + 48]
    movu        m4, [r1 + r3]
    movu        m5, [r1 + r3 + 16]
    psubw       m0, m4
    psubw       m1, m5
    movu        m4, [r1 + r3 + 32]
    movu        m5, [r1 + r3 + 48]
    psubw       m2, m4
    psubw       m3, m5

    movu        [r2 + r3], m0
    movu        [r2 + r3 + 16], m1
    movu        [r2 + r3 + 32], m2
    movu        [r2 + r3 + 48], m3

    dec         r4d

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 2]

    jnz         .loop

%else
INIT_XMM sse4
cglobal getResidual32, 4,5,7
    mov         r4d, 32/2
    pxor        m0, m0
.loop:
    movu        m1, [r0]
    movu        m2, [r0 + 16]
    movu        m3, [r1]
    movu        m4, [r1 + 16]
    pmovzxbw    m5, m1
    punpckhbw   m1, m0
    pmovzxbw    m6, m3
    punpckhbw   m3, m0
    psubw       m5, m6
    psubw       m1, m3
    movu        [r2 + 0 * 16], m5
    movu        [r2 + 1 * 16], m1

    pmovzxbw    m5, m2
    punpckhbw   m2, m0
    pmovzxbw    m6, m4
    punpckhbw   m4, m0
    psubw       m5, m6
    psubw       m2, m4
    movu        [r2 + 2 * 16], m5
    movu        [r2 + 3 * 16], m2

    movu        m1, [r0 + r3]
    movu        m2, [r0 + r3 + 16]
    movu        m3, [r1 + r3]
    movu        m4, [r1 + r3 + 16]
    pmovzxbw    m5, m1
    punpckhbw   m1, m0
    pmovzxbw    m6, m3
    punpckhbw   m3, m0
    psubw       m5, m6
    psubw       m1, m3
    movu        [r2 + r3 * 2 + 0 * 16], m5
    movu        [r2 + r3 * 2 + 1 * 16], m1

    pmovzxbw    m5, m2
    punpckhbw   m2, m0
    pmovzxbw    m6, m4
    punpckhbw   m4, m0
    psubw       m5, m6
    psubw       m2, m4
    movu        [r2 + r3 * 2 + 2 * 16], m5
    movu        [r2 + r3 * 2 + 3 * 16], m2

    dec         r4d

    lea         r0, [r0 + r3 * 2]
    lea         r1, [r1 + r3 * 2]
    lea         r2, [r2 + r3 * 4]

    jnz         .loop
%endif
    RET


;-----------------------------------------------------------------------------
; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
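; As implemented below, for each coefficient:
;   level  = (abs(coef) * quantCoeff + add) >> qBits
;   deltaU = ((abs(coef) * quantCoeff) >> (qBits - 8)) - (level << 8)
;   qCoef  = sign(coef) * level, packed to int16 with signed saturation
; The accumulator sums min(level, 1) per lane, so the return value is the
; number of nonzero quantized coefficients.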
INIT_XMM sse4
cglobal quant, 5,6,8
    ; fill qbits
    movd        m4, r4d         ; m4 = qbits

    ; fill qbits-8
    sub         r4d, 8
    movd        m6, r4d         ; m6 = qbits8

    ; fill offset
    movd        m5, r5m
    pshufd      m5, m5, 0       ; m5 = add

    lea         r5, [pd_1]

    mov         r4d, r6m
    shr         r4d, 3
    pxor        m7, m7          ; m7 = nonzero count
.loop:
    ; 4 coeff
    pmovsxwd    m0, [r0]        ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1]        ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, m4          ; m2 = level1

    pslld       m3, m2, 8
    psrad       m1, m6
    psubd       m1, m3          ; m1 = deltaU1

    movu        [r2], m1
    psignd      m3, m2, m0
    pminud      m2, [r5]
    paddd       m7, m2
    packssdw    m3, m3
    movh        [r3], m3

    ; 4 coeff
    pmovsxwd    m0, [r0 + 8]    ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1 + 16]   ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, m4          ; m2 = level1
    pslld       m3, m2, 8
    psrad       m1, m6
    psubd       m1, m3          ; m1 = deltaU1
    movu        [r2 + 16], m1
    psignd      m3, m2, m0
    pminud      m2, [r5]
    paddd       m7, m2
    packssdw    m3, m3
    movh        [r3 + 8], m3

    add         r0, 16
    add         r1, 32
    add         r2, 32
    add         r3, 16

    dec         r4d
    jnz         .loop

    pxor        m0, m0
    psadbw      m7, m0
    movhlps     m0, m7
    paddd       m7, m0
    movd        eax, m7
    RET


IACA_START
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal quant, 5,5,10
    ; fill qbits
    movd        xm4, r4d        ; m4 = qbits

    ; fill qbits-8
    sub         r4d, 8
    movd        xm6, r4d        ; m6 = qbits8

    ; fill offset
    vpbroadcastd m5, r5m        ; m5 = add

    vpbroadcastw m9, [pw_1]     ; m9 = word [1]

    mov         r4d, r6m
    shr         r4d, 4
    pxor        m7, m7          ; m7 = nonzero count
.loop:
    ; 8 coeff
    pmovsxwd    m0, [r0]        ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1]        ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, xm4         ; m2 = level1

    pslld       m3, m2, 8
    psrad       m1, xm6
    psubd       m1, m3          ; m1 = deltaU1
    movu        [r2], m1
    psignd      m2, m0

    ; 8 coeff
    pmovsxwd    m0, [r0 + mmsize/2]  ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1 + mmsize]    ; m1 = tmpLevel1
    paddd       m3, m1, m5
    psrad       m3, xm4         ; m3 = level1

    pslld       m8, m3, 8
    psrad       m1, xm6
    psubd       m1, m8          ; m1 = deltaU1
    movu        [r2 + mmsize], m1
    psignd      m3, m0

    packssdw    m2, m3
    vpermq      m2, m2, q3120
    movu        [r3], m2

    ; count non-zero coeff
    ; TODO: popcnt is faster, but not all target CPUs support it
    pminuw      m2, m9
    paddw       m7, m2

    add         r0, mmsize
    add         r1, mmsize*2
    add         r2, mmsize*2
    add         r3, mmsize

    dec         r4d
    jnz         .loop

    ; sum count
    xorpd       m0, m0
    psadbw      m7, m0
    vextracti128 xm1, m7, 1
    paddd       xm7, xm1
    movhlps     xm0, xm7
    paddd       xm7, xm0
    movd        eax, xm7
    RET

%else ; ARCH_X86_64 == 1
INIT_YMM avx2
cglobal quant, 5,6,8
    ; fill qbits
    movd        xm4, r4d        ; m4 = qbits

    ; fill qbits-8
    sub         r4d, 8
    movd        xm6, r4d        ; m6 = qbits8

    ; fill offset
    vpbroadcastd m5, r5m        ; m5 = add

    lea         r5, [pd_1]

    mov         r4d, r6m
    shr         r4d, 4
    pxor        m7, m7          ; m7 = nonzero count
.loop:
    ; 8 coeff
    pmovsxwd    m0, [r0]        ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1]        ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, xm4         ; m2 = level1

    pslld       m3, m2, 8
    psrad       m1, xm6
    psubd       m1, m3          ; m1 = deltaU1

    movu        [r2], m1
    psignd      m3, m2, m0
    pminud      m2, [r5]
    paddd       m7, m2
    packssdw    m3, m3
    vpermq      m3, m3, q0020
    movu        [r3], xm3

    ; 8 coeff
    pmovsxwd    m0, [r0 + mmsize/2]  ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1 + mmsize]    ; m1 = tmpLevel1
    paddd       m2, m1, m5
    psrad       m2, xm4         ; m2 = level1

    pslld       m3, m2, 8
    psrad       m1, xm6
    psubd       m1, m3          ; m1 = deltaU1

    movu        [r2 + mmsize], m1
    psignd      m3, m2, m0
    pminud      m2, [r5]
    paddd       m7, m2
    packssdw    m3, m3
    vpermq      m3, m3, q0020
    movu        [r3 + mmsize/2], xm3

    add         r0, mmsize
    add         r1, mmsize*2
    add         r2, mmsize*2
    add         r3, mmsize

    dec         r4d
    jnz         .loop

    xorpd       m0, m0
    psadbw      m7, m0
    vextracti128 xm1, m7, 1
    paddd       xm7, xm1
    movhlps     xm0, xm7
    paddd       xm7, xm0
    movd        eax, xm7
    RET
%endif ; ARCH_X86_64 == 1
IACA_END


;-----------------------------------------------------------------------------
; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
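; Same arithmetic as quant above but without the deltaU side output:
;   qCoef = sign(coef) * ((abs(coef) * quantCoeff + add) >> qBits)
; The return value is the number of significant (nonzero) coefficients,
; counted directly (AVX2) or as numCoeff minus the zero count (SSE4).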
INIT_XMM sse4
cglobal nquant, 3,5,8
    movd        m6, r4m
    mov         r4d, r5m
    pxor        m7, m7          ; m7 = numZero
    movd        m5, r3m         ; m5 = qbits
    pshufd      m6, m6, 0       ; m6 = add
    mov         r3d, r4d        ; r3 = numCoeff
    shr         r4d, 3

.loop:
    pmovsxwd    m0, [r0]        ; m0 = level
    pmovsxwd    m1, [r0 + 8]    ; m1 = level

    pabsd       m2, m0
    pmulld      m2, [r1]        ; m2 = tmpLevel1 * qcoeff
    paddd       m2, m6
    psrad       m2, m5          ; m2 = level1
    psignd      m2, m0

    pabsd       m3, m1
    pmulld      m3, [r1 + 16]   ; m3 = tmpLevel1 * qcoeff
    paddd       m3, m6
    psrad       m3, m5          ; m3 = level1
    psignd      m3, m1

    packssdw    m2, m3

    movu        [r2], m2
    add         r0, 16
    add         r1, 32
    add         r2, 16

    pxor        m4, m4
    pcmpeqw     m2, m4
    psubw       m7, m2

    dec         r4d
    jnz         .loop

    packuswb    m7, m7
    psadbw      m7, m4
    mov         eax, r3d
    movd        r4d, m7
    sub         eax, r4d        ; numSig
    RET


INIT_YMM avx2
cglobal nquant, 3,5,7
    vpbroadcastd m4, r4m
    vpbroadcastd m6, [pw_1]
    mov         r4d, r5m
    pxor        m5, m5          ; m5 = nonzero count
    movd        xm3, r3m        ; m3 = qbits
    mov         r3d, r4d        ; r3 = numCoeff
    shr         r4d, 4

.loop:
    pmovsxwd    m0, [r0]        ; m0 = level
    pabsd       m1, m0
    pmulld      m1, [r1]        ; m1 = tmpLevel1 * qcoeff
    paddd       m1, m4
    psrad       m1, xm3         ; m1 = level1
    psignd      m1, m0

    pmovsxwd    m0, [r0 + mmsize/2]  ; m0 = level
    pabsd       m2, m0
    pmulld      m2, [r1 + mmsize]    ; m2 = tmpLevel1 * qcoeff
    paddd       m2, m4
    psrad       m2, xm3         ; m2 = level1
    psignd      m2, m0

    packssdw    m1, m2
    vpermq      m2, m1, q3120

    movu        [r2], m2
    add         r0, mmsize
    add         r1, mmsize * 2
    add         r2, mmsize

    pminuw      m1, m6
    paddw       m5, m1

    dec         r4d
    jnz         .loop

    pxor        m0, m0
    psadbw      m5, m0
    vextracti128 xm0, m5, 1
    paddd       xm5, xm0
    pshufd      xm0, xm5, 2
    paddd       xm5, xm0
    movd        eax, xm5
    RET


;-----------------------------------------------------------------------------
; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
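; coef[i] = (quantCoef[i] * scale + (1 << (shift - 1))) >> shift, packed to
; int16. The scale word and the rounding term share one dword: "bts r3d, r4d"
; with r4d = shift + 15 sets bit (shift - 1) of the high word, so pmaddwd on
; the word pair [quantCoef, 1] yields quantCoef * scale + (1 << (shift - 1))
; in one step. In HIGH_BIT_DEPTH builds an oversized scale is pre-shifted
; down by 2 (and shift reduced to match) to keep the multiplier in 16 bits.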
INIT_XMM sse4
cglobal dequant_normal, 5,5,5
    mova        m2, [pw_1]
%if HIGH_BIT_DEPTH
    cmp         r3d, 32767
    jle         .skip
    shr         r3d, 2
    sub         r4d, 2
.skip:
%endif
    movd        m0, r4d         ; m0 = shift
    add         r4d, 15
    bts         r3d, r4d
    movd        m1, r3d
    pshufd      m1, m1, 0       ; m1 = dword [add scale]
    ; m0 = shift
    ; m1 = scale
    ; m2 = word [1]
.loop:
    movu        m3, [r0]
    punpckhwd   m4, m3, m2
    punpcklwd   m3, m2
    pmaddwd     m3, m1          ; m3 = dword (clipQCoef * scale + add)
    pmaddwd     m4, m1
    psrad       m3, m0
    psrad       m4, m0
    packssdw    m3, m4
    mova        [r1], m3

    add         r0, 16
    add         r1, 16

    sub         r2d, 8
    jnz         .loop
    RET


INIT_YMM avx2
cglobal dequant_normal, 5,5,7
    vpbroadcastd m2, [pw_1]      ; m2 = word [1]
    vpbroadcastd m5, [pd_32767]  ; m5 = dword [32767]
    vpbroadcastd m6, [pd_n32768] ; m6 = dword [-32768]
%if HIGH_BIT_DEPTH
    cmp         r3d, 32767
    jle         .skip
    shr         r3d, 2
    sub         r4d, 2
.skip:
%endif
    movd        xm0, r4d        ; m0 = shift
    add         r4d, -1+16
    bts         r3d, r4d
    vpbroadcastd m1, r3d        ; m1 = dword [add scale]

    ; m0 = shift
    ; m1 = scale
    ; m2 = word [1]
    shr         r2d, 4
.loop:
    movu        m3, [r0]
    punpckhwd   m4, m3, m2
    punpcklwd   m3, m2
    pmaddwd     m3, m1          ; m3 = dword (clipQCoef * scale + add)
    pmaddwd     m4, m1
    psrad       m3, xm0
    psrad       m4, xm0
    pminsd      m3, m5
    pmaxsd      m3, m6
    pminsd      m4, m5
    pmaxsd      m4, m6
    packssdw    m3, m4
    mova        [r1 + 0 * mmsize/2], xm3
    vextracti128 [r1 + 1 * mmsize/2], m3, 1

    add         r0, mmsize
    add         r1, mmsize

    dec         r2d
    jnz         .loop
    RET


;-----------------------------------------------------------------------------
; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
;-----------------------------------------------------------------------------
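; Each byte of m1 starts at numCoeff/16, so the 16 lanes sum to numCoeff.
; Every iteration packs 16 words to bytes with signed saturation (nonzero
; words stay nonzero), then pcmpeqb/paddb subtracts 1 per zero byte. The
; final psadbw reduction therefore yields numCoeff - numZero, the count of
; nonzero coefficients.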
INIT_XMM ssse3
cglobal count_nonzero, 2,2,3
    pxor        m0, m0
    shr         r1d, 4
    movd        m1, r1d
    pshufb      m1, m0

.loop:
    mova        m2, [r0 + 0]
    packsswb    m2, [r0 + 16]
    add         r0, 32
    pcmpeqb     m2, m0
    paddb       m1, m2
    dec         r1d
    jnz         .loop

    psadbw      m1, m0
    pshufd      m0, m1, 2
    paddd       m0, m1
    movd        eax, m0
    RET


;-----------------------------------------------------------------------------------------------------------------------------------------------
; void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
;-----------------------------------------------------------------------------------------------------------------------------------------------
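; Computes dst = clip8(((src * (w0 << 6) + round) >> shift) + offset), 16
; pixels per inner iteration. Packing (w0 << 6) and round into one dword
; lets a single pmaddwd on the word pair [src, 1] produce
; src * (w0 << 6) + round; both halves must fit in 16 bits (see below).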
INIT_XMM sse4
cglobal weight_pp, 6, 7, 6

    shl         r5d, 6          ; r5d = w0<<6
    mov         r6d, r6m
    shl         r6d, 16
    or          r6d, r5d        ; assumes both (w0<<6) and round fit in 16 bits each
    movd        m0, r6d
    pshufd      m0, m0, 0       ; m0 = [w0<<6, round]
    movd        m1, r7m
    movd        m2, r8m
    pshufd      m2, m2, 0
    mova        m5, [pw_1]
    sub         r2d, r3d
    shr         r3d, 4

.loopH:
    mov         r5d, r3d

.loopW:
    pmovzxbw    m4, [r0]
    punpcklwd   m3, m4, m5
    pmaddwd     m3, m0
    psrad       m3, m1
    paddd       m3, m2

    punpckhwd   m4, m5
    pmaddwd     m4, m0
    psrad       m4, m1
    paddd       m4, m2

    packssdw    m3, m4
    packuswb    m3, m3
    movh        [r1], m3

    pmovzxbw    m4, [r0 + 8]
    punpcklwd   m3, m4, m5
    pmaddwd     m3, m0
    psrad       m3, m1
    paddd       m3, m2

    punpckhwd   m4, m5
    pmaddwd     m4, m0
    psrad       m4, m1
    paddd       m4, m2

    packssdw    m3, m4
    packuswb    m3, m3
    movh        [r1 + 8], m3

    add         r0, 16
    add         r1, 16

    dec         r5d
    jnz         .loopW

    lea         r0, [r0 + r2]
    lea         r1, [r1 + r2]

    dec         r4d
    jnz         .loopH
    RET


INIT_YMM avx2
cglobal weight_pp, 6, 7, 6

    shl         r5d, 6          ; r5d = w0<<6
    mov         r6d, r6m
    shl         r6d, 16
    or          r6d, r5d        ; assumes both (w0<<6) and round fit in 16 bits each
    movd        xm0, r6d
    pshufd      xm0, xm0, 0     ; m0 = [w0<<6, round]
    vinserti128 m0, m0, xm0, 1  ; the ISA docs say (pshufd + vinserti128) could be replaced by vpbroadcastd m0, xm0, but that form hit a build problem; needs investigation

    movd        xm1, r7m
    vpbroadcastd m2, r8m
    mova        m5, [pw_1]
    sub         r2d, r3d
    shr         r3d, 4

.loopH:
    mov         r5d, r3d

.loopW:
    pmovzxbw    m4, [r0]
    punpcklwd   m3, m4, m5
    pmaddwd     m3, m0
    psrad       m3, xm1
    paddd       m3, m2

    punpckhwd   m4, m5
    pmaddwd     m4, m0
    psrad       m4, xm1
    paddd       m4, m2

    packssdw    m3, m4
    vextracti128 xm4, m3, 1
    packuswb    xm3, xm4
    movu        [r1], xm3

    add         r0, 16
    add         r1, 16

    dec         r5d
    jnz         .loopW

    lea         r0, [r0 + r2]
    lea         r1, [r1 + r2]

    dec         r4d
    jnz         .loopH
    RET

;-------------------------------------------------------------------------------------------------------------------------------------------------
; void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
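; Same weighting as weight_pp, but on an int16 source: pw_2000 (0x2000) is
; added first to remove the negative bias the int16 intermediates carry
; (presumably the interpolation-filter offset), giving
; dst = clip8((((src + 0x2000) * w0 + round) >> shift) + offset).
; Rows whose width is not a multiple of 8 fall through the .width4/.width2
; tails.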
INIT_XMM sse4
%if ARCH_X86_64
cglobal weight_sp, 6, 7+2, 7
%define tmp_r0 r7
%define tmp_r1 r8
%else ; ARCH_X86_64 = 0
cglobal weight_sp, 6, 7, 7, 0-(2*4)
%define tmp_r0 [(rsp + 0 * 4)]
%define tmp_r1 [(rsp + 1 * 4)]
%endif ; ARCH_X86_64

    movd        m0, r6m         ; m0 = [w0]

    movd        m1, r7m         ; m1 = [round]
    punpcklwd   m0, m1
    pshufd      m0, m0, 0       ; m0 = [w0 round]

    movd        m1, r8m         ; m1 = [shift]

    movd        m2, r9m
    pshufd      m2, m2, 0       ; m2 = [offset]

    mova        m3, [pw_1]
    mova        m4, [pw_2000]

    add         r2d, r2d

.loopH:
    mov         r6d, r4d

    ; save old src and dst
    mov         tmp_r0, r0
    mov         tmp_r1, r1
.loopW:
    movu        m5, [r0]
    paddw       m5, m4

    punpcklwd   m6, m5, m3
    pmaddwd     m6, m0
    psrad       m6, m1
    paddd       m6, m2

    punpckhwd   m5, m3
    pmaddwd     m5, m0
    psrad       m5, m1
    paddd       m5, m2

    packssdw    m6, m5
    packuswb    m6, m6

    sub         r6d, 8
    jl          .width4
    movh        [r1], m6
    je          .nextH
    add         r0, 16
    add         r1, 8

    jmp         .loopW

.width4:
    cmp         r6d, -4
    jl          .width2
    movd        [r1], m6
    je          .nextH
    add         r1, 4
    pshufd      m6, m6, 1

.width2:
    pextrw      [r1], m6, 0

.nextH:
    mov         r0, tmp_r0
    mov         r1, tmp_r1
    lea         r0, [r0 + r2]
    lea         r1, [r1 + r3]

    dec         r5d
    jnz         .loopH

    RET

;-----------------------------------------------------------------
; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
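; Transposes a 4x4 block with unpack interleaves: rows are interleaved
; pairwise so columns end up contiguous, then written out (two XMM stores
; for 16-bit pixels, one for 8-bit).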
INIT_XMM sse2
cglobal transpose4, 3, 3, 4, dest, src, stride
%if HIGH_BIT_DEPTH == 1
    add         r2, r2
    movh        m0, [r1]
    movh        m1, [r1 + r2]
    movh        m2, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m3, [r1 + r2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpckhdq   m1, m0, m2
    punpckldq   m0, m2
    movu        [r0], m0
    movu        [r0 + 16], m1
%else ;HIGH_BIT_DEPTH == 0
    movd        m0, [r1]
    movd        m1, [r1 + r2]
    movd        m2, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movd        m3, [r1 + r2]

    punpcklbw   m0, m1
    punpcklbw   m2, m3
    punpcklwd   m0, m2
    movu        [r0], m0
%endif
    RET

;-----------------------------------------------------------------
; void transpose_8x8(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
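; 8x8 transpose. The AVX2 paths pack two 4-row halves into each YMM register
; and finish with a cross-lane permute (vpermq / vpermd); the SSE2 paths
; either stitch four 4x4 sub-transposes together (transpose8_internal, also
; reused by the larger transposes) or run a full byte-unpack tree.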
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8, 3, 5, 5
    add         r2, r2
    lea         r3, [3 * r2]
    lea         r4, [r1 + 4 * r2]
    movu        xm0, [r1]
    vinserti128 m0, m0, [r4], 1
    movu        xm1, [r1 + r2]
    vinserti128 m1, m1, [r4 + r2], 1
    movu        xm2, [r1 + 2 * r2]
    vinserti128 m2, m2, [r4 + 2 * r2], 1
    movu        xm3, [r1 + r3]
    vinserti128 m3, m3, [r4 + r3], 1

    punpcklwd   m4, m0, m1      ;[1 - 4][row1row2;row5row6]
    punpckhwd   m0, m1          ;[5 - 8][row1row2;row5row6]

    punpcklwd   m1, m2, m3      ;[1 - 4][row3row4;row7row8]
    punpckhwd   m2, m3          ;[5 - 8][row3row4;row7row8]

    punpckldq   m3, m4, m1      ;[1 - 2][row1row2row3row4;row5row6row7row8]
    punpckhdq   m4, m1          ;[3 - 4][row1row2row3row4;row5row6row7row8]

    punpckldq   m1, m0, m2      ;[5 - 6][row1row2row3row4;row5row6row7row8]
    punpckhdq   m0, m2          ;[7 - 8][row1row2row3row4;row5row6row7row8]

    vpermq      m3, m3, 0xD8    ;[1 ; 2][row1row2row3row4row5row6row7row8]
    vpermq      m4, m4, 0xD8    ;[3 ; 4][row1row2row3row4row5row6row7row8]
    vpermq      m1, m1, 0xD8    ;[5 ; 6][row1row2row3row4row5row6row7row8]
    vpermq      m0, m0, 0xD8    ;[7 ; 8][row1row2row3row4row5row6row7row8]

    movu        [r0 + 0 * 32], m3
    movu        [r0 + 1 * 32], m4
    movu        [r0 + 2 * 32], m1
    movu        [r0 + 3 * 32], m0
    RET
%endif

INIT_XMM sse2
%macro TRANSPOSE_4x4 1
    movh        m0, [r1]
    movh        m1, [r1 + r2]
    movh        m2, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m3, [r1 + r2]
    punpcklwd   m0, m1
    punpcklwd   m2, m3
    punpckhdq   m1, m0, m2
    punpckldq   m0, m2
    movh        [r0], m0
    movhps      [r0 + %1], m0
    movh        [r0 + 2 * %1], m1
    lea         r0, [r0 + 2 * %1]
    movhps      [r0 + %1], m1
%endmacro
cglobal transpose8_internal
    TRANSPOSE_4x4 r5
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r3 + 8]
    TRANSPOSE_4x4 r5
    lea         r1, [r1 + 2 * r2]
    neg         r2
    lea         r1, [r1 + r2 * 8 + 8]
    neg         r2
    lea         r0, [r3 + 4 * r5]
    TRANSPOSE_4x4 r5
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r3 + 8 + 4 * r5]
    TRANSPOSE_4x4 r5
    ret
cglobal transpose8, 3, 6, 4, dest, src, stride
    add         r2, r2
    mov         r3, r0
    mov         r5, 16
    call        transpose8_internal
    RET
%else ;HIGH_BIT_DEPTH == 0
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8, 3, 4, 4
    lea         r3, [r2 * 3]
    movq        xm0, [r1]
    movhps      xm0, [r1 + 2 * r2]
    movq        xm1, [r1 + r2]
    movhps      xm1, [r1 + r3]
    lea         r1, [r1 + 4 * r2]
    movq        xm2, [r1]
    movhps      xm2, [r1 + 2 * r2]
    movq        xm3, [r1 + r2]
    movhps      xm3, [r1 + r3]

    vinserti128 m0, m0, xm2, 1  ;[row1 row3 row5 row7]
    vinserti128 m1, m1, xm3, 1  ;[row2 row4 row6 row8]

    punpcklbw   m2, m0, m1      ;[1 - 8; 1 - 8][row1row2; row5row6]
    punpckhbw   m0, m1          ;[1 - 8; 1 - 8][row3row4; row7row8]

    punpcklwd   m1, m2, m0      ;[1 - 4; 1 - 4][row1row2row3row4; row5row6row7row8]
    punpckhwd   m2, m0          ;[5 - 8; 5 - 8][row1row2row3row4; row5row6row7row8]

    mova        m0, [trans8_shuf]

    vpermd      m1, m0, m1      ;[1 - 2; 3 - 4][row1row2row3row4row5row6row7row8]
    vpermd      m2, m0, m2      ;[5 - 6; 7 - 8][row1row2row3row4row5row6row7row8]

    movu        [r0], m1
    movu        [r0 + 32], m2
    RET
%endif

INIT_XMM sse2
cglobal transpose8, 3, 5, 8, dest, src, stride
    lea         r3, [2 * r2]
    lea         r4, [3 * r2]
    movh        m0, [r1]
    movh        m1, [r1 + r2]
    movh        m2, [r1 + r3]
    movh        m3, [r1 + r4]
    movh        m4, [r1 + 4 * r2]
    lea         r1, [r1 + 4 * r2]
    movh        m5, [r1 + r2]
    movh        m6, [r1 + r3]
    movh        m7, [r1 + r4]

    punpcklbw   m0, m1
    punpcklbw   m2, m3
    punpcklbw   m4, m5
    punpcklbw   m6, m7

    punpckhwd   m1, m0, m2
    punpcklwd   m0, m2
    punpckhwd   m5, m4, m6
    punpcklwd   m4, m6
    punpckhdq   m2, m0, m4
    punpckldq   m0, m4
    punpckhdq   m3, m1, m5
    punpckldq   m1, m5

    movu        [r0], m0
    movu        [r0 + 16], m2
    movu        [r0 + 32], m1
    movu        [r0 + 48], m3
    RET
%endif

%macro TRANSPOSE_8x8 1

    movh        m0, [r1]
    movh        m1, [r1 + r2]
    movh        m2, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m3, [r1 + r2]
    movh        m4, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m5, [r1 + r2]
    movh        m6, [r1 + 2 * r2]
    lea         r1, [r1 + 2 * r2]
    movh        m7, [r1 + r2]

    punpcklbw   m0, m1
    punpcklbw   m2, m3
    punpcklbw   m4, m5
    punpcklbw   m6, m7

    punpckhwd   m1, m0, m2
    punpcklwd   m0, m2
    punpckhwd   m5, m4, m6
    punpcklwd   m4, m6
    punpckhdq   m2, m0, m4
    punpckldq   m0, m4
    punpckhdq   m3, m1, m5
    punpckldq   m1, m5

    movh        [r0], m0
    movhps      [r0 + %1], m0
    movh        [r0 + 2 * %1], m2
    lea         r0, [r0 + 2 * %1]
    movhps      [r0 + %1], m2
    movh        [r0 + 2 * %1], m1
    lea         r0, [r0 + 2 * %1]
    movhps      [r0 + %1], m1
    movh        [r0 + 2 * %1], m3
    lea         r0, [r0 + 2 * %1]
    movhps      [r0 + %1], m3

%endmacro


;-----------------------------------------------------------------
; void transpose_16x16(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
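; 16x16 transpose, tiled from the helpers above: the AVX2 paths transpose
; 16x8 halves in-register, while the SSE2 paths apply transpose8_internal /
; TRANSPOSE_8x8 to the four 8x8 quadrants.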
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose16x8_internal
    movu        m0, [r1]
    movu        m1, [r1 + r2]
    movu        m2, [r1 + 2 * r2]
    movu        m3, [r1 + r3]
    lea         r1, [r1 + 4 * r2]

    movu        m4, [r1]
    movu        m5, [r1 + r2]
    movu        m6, [r1 + 2 * r2]
    movu        m7, [r1 + r3]

    punpcklwd   m8, m0, m1      ;[1 - 4; 9 - 12][1 2]
    punpckhwd   m0, m1          ;[5 - 8; 13 - 16][1 2]

    punpcklwd   m1, m2, m3      ;[1 - 4; 9 - 12][3 4]
    punpckhwd   m2, m3          ;[5 - 8; 13 - 16][3 4]

    punpcklwd   m3, m4, m5      ;[1 - 4; 9 - 12][5 6]
    punpckhwd   m4, m5          ;[5 - 8; 13 - 16][5 6]

    punpcklwd   m5, m6, m7      ;[1 - 4; 9 - 12][7 8]
    punpckhwd   m6, m7          ;[5 - 8; 13 - 16][7 8]

    punpckldq   m7, m8, m1      ;[1 - 2; 9 - 10][1 2 3 4]
    punpckhdq   m8, m1          ;[3 - 4; 11 - 12][1 2 3 4]

    punpckldq   m1, m3, m5      ;[1 - 2; 9 - 10][5 6 7 8]
    punpckhdq   m3, m5          ;[3 - 4; 11 - 12][5 6 7 8]

    punpckldq   m5, m0, m2      ;[5 - 6; 13 - 14][1 2 3 4]
    punpckhdq   m0, m2          ;[7 - 8; 15 - 16][1 2 3 4]

    punpckldq   m2, m4, m6      ;[5 - 6; 13 - 14][5 6 7 8]
    punpckhdq   m4, m6          ;[7 - 8; 15 - 16][5 6 7 8]

    punpcklqdq  m6, m7, m1      ;[1 ; 9 ][1 2 3 4 5 6 7 8]
    punpckhqdq  m7, m1          ;[2 ; 10][1 2 3 4 5 6 7 8]

    punpcklqdq  m1, m8, m3      ;[3 ; 11][1 2 3 4 5 6 7 8]
    punpckhqdq  m8, m3          ;[4 ; 12][1 2 3 4 5 6 7 8]

    punpcklqdq  m3, m5, m2      ;[5 ; 13][1 2 3 4 5 6 7 8]
    punpckhqdq  m5, m2          ;[6 ; 14][1 2 3 4 5 6 7 8]

    punpcklqdq  m2, m0, m4      ;[7 ; 15][1 2 3 4 5 6 7 8]
    punpckhqdq  m0, m4          ;[8 ; 16][1 2 3 4 5 6 7 8]

    movu        [r0 + 0 * 32], xm6
    vextracti128 [r0 + 8 * 32], m6, 1
    movu        [r0 + 1 * 32], xm7
    vextracti128 [r0 + 9 * 32], m7, 1
    movu        [r0 + 2 * 32], xm1
    vextracti128 [r0 + 10 * 32], m1, 1
    movu        [r0 + 3 * 32], xm8
    vextracti128 [r0 + 11 * 32], m8, 1
    movu        [r0 + 4 * 32], xm3
    vextracti128 [r0 + 12 * 32], m3, 1
    movu        [r0 + 5 * 32], xm5
    vextracti128 [r0 + 13 * 32], m5, 1
    movu        [r0 + 6 * 32], xm2
    vextracti128 [r0 + 14 * 32], m2, 1
    movu        [r0 + 7 * 32], xm0
    vextracti128 [r0 + 15 * 32], m0, 1
    ret

cglobal transpose16, 3, 4, 9
    add         r2, r2
    lea         r3, [r2 * 3]
    call        transpose16x8_internal
    lea         r1, [r1 + 4 * r2]
    add         r0, 16
    call        transpose16x8_internal
    RET
%endif
INIT_XMM sse2
cglobal transpose16, 3, 7, 4, dest, src, stride
    add         r2, r2
    mov         r3, r0
    mov         r4, r1
    mov         r5, 32
    mov         r6, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r4 + 16]
    lea         r0, [r6 + 8 * r5]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 8 * r5 + 16]
    mov         r3, r0
    call        transpose8_internal
    RET
%else ;HIGH_BIT_DEPTH == 0
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose16, 3, 5, 9
    lea         r3, [r2 * 3]
    lea         r4, [r1 + 8 * r2]

    movu        xm0, [r1]
    movu        xm1, [r1 + r2]
    movu        xm2, [r1 + 2 * r2]
    movu        xm3, [r1 + r3]
    vinserti128 m0, m0, [r4], 1
    vinserti128 m1, m1, [r4 + r2], 1
    vinserti128 m2, m2, [r4 + 2 * r2], 1
    vinserti128 m3, m3, [r4 + r3], 1
    lea         r1, [r1 + 4 * r2]
    lea         r4, [r4 + 4 * r2]

    movu        xm4, [r1]
    movu        xm5, [r1 + r2]
    movu        xm6, [r1 + 2 * r2]
    movu        xm7, [r1 + r3]
    vinserti128 m4, m4, [r4], 1
    vinserti128 m5, m5, [r4 + r2], 1
    vinserti128 m6, m6, [r4 + 2 * r2], 1
    vinserti128 m7, m7, [r4 + r3], 1

    punpcklbw   m8, m0, m1      ;[1 - 8 ; 1 - 8 ][1 2 9 10]
    punpckhbw   m0, m1          ;[9 - 16; 9 - 16][1 2 9 10]

    punpcklbw   m1, m2, m3      ;[1 - 8 ; 1 - 8 ][3 4 11 12]
    punpckhbw   m2, m3          ;[9 - 16; 9 - 16][3 4 11 12]

    punpcklbw   m3, m4, m5      ;[1 - 8 ; 1 - 8 ][5 6 13 14]
    punpckhbw   m4, m5          ;[9 - 16; 9 - 16][5 6 13 14]

    punpcklbw   m5, m6, m7      ;[1 - 8 ; 1 - 8 ][7 8 15 16]
    punpckhbw   m6, m7          ;[9 - 16; 9 - 16][7 8 15 16]

    punpcklwd   m7, m8, m1      ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12]
    punpckhwd   m8, m1          ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12]

    punpcklwd   m1, m3, m5      ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16]
    punpckhwd   m3, m5          ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16]

    punpcklwd   m5, m0, m2      ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12]
    punpckhwd   m0, m2          ;[13 - 16; 13 - 16][1 2 3 4 9 10 11 12]

    punpcklwd   m2, m4, m6      ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16]
    punpckhwd   m4, m6          ;[13 - 16; 13 - 16][5 6 7 8 13 14 15 16]

    punpckldq   m6, m7, m1      ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhdq   m7, m1          ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpckldq   m1, m8, m3      ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhdq   m8, m3          ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpckldq   m3, m5, m2      ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhdq   m5, m2          ;[11 - 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpckldq   m2, m0, m4      ;[13 - 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhdq   m0, m4          ;[15 - 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    vpermq      m6, m6, 0xD8
    vpermq      m7, m7, 0xD8
    vpermq      m1, m1, 0xD8
    vpermq      m8, m8, 0xD8
    vpermq      m3, m3, 0xD8
    vpermq      m5, m5, 0xD8
    vpermq      m2, m2, 0xD8
    vpermq      m0, m0, 0xD8

    movu        [r0 + 0 * 16], m6
    movu        [r0 + 2 * 16], m7
    movu        [r0 + 4 * 16], m1
    movu        [r0 + 6 * 16], m8
    movu        [r0 + 8 * 16], m3
    movu        [r0 + 10 * 16], m5
    movu        [r0 + 12 * 16], m2
    movu        [r0 + 14 * 16], m0
    RET
%endif
INIT_XMM sse2
cglobal transpose16, 3, 5, 8, dest, src, stride
    mov         r3, r0
    mov         r4, r1
    TRANSPOSE_8x8 16
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r3 + 8]
    TRANSPOSE_8x8 16
    lea         r1, [r4 + 8]
    lea         r0, [r3 + 8 * 16]
    TRANSPOSE_8x8 16
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r3 + 8 * 16 + 8]
    TRANSPOSE_8x8 16
    RET
%endif

cglobal transpose16_internal
    TRANSPOSE_8x8 r6
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r5 + 8]
    TRANSPOSE_8x8 r6
    lea         r1, [r1 + 2 * r2]
    neg         r2
    lea         r1, [r1 + r2 * 8]
    lea         r1, [r1 + r2 * 8 + 8]
    neg         r2
    lea         r0, [r5 + 8 * r6]
    TRANSPOSE_8x8 r6
    lea         r1, [r1 + 2 * r2]
    lea         r0, [r5 + 8 * r6 + 8]
    TRANSPOSE_8x8 r6
    ret

;-----------------------------------------------------------------
; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
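; 32x32 transpose. High-bit-depth AVX2 works in 8x32 stripes
; (transpose8x32_internal); 8-bit AVX2 loops twice over 16x32 halves; the
; SSE2 fallbacks tile the block with transpose8_internal /
; transpose16_internal, re-deriving r0/r1 between tiles.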
%if HIGH_BIT_DEPTH == 1
%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose8x32_internal
    movu        m0, [r1]
    movu        m1, [r1 + 32]
    movu        m2, [r1 + r2]
    movu        m3, [r1 + r2 + 32]
    movu        m4, [r1 + 2 * r2]
    movu        m5, [r1 + 2 * r2 + 32]
    movu        m6, [r1 + r3]
    movu        m7, [r1 + r3 + 32]
    lea         r1, [r1 + 4 * r2]

    punpcklwd   m8, m0, m2      ;[1 - 4; 9 - 12][1 2]
    punpckhwd   m0, m2          ;[5 - 8; 13 - 16][1 2]

    punpcklwd   m2, m4, m6      ;[1 - 4; 9 - 12][3 4]
    punpckhwd   m4, m6          ;[5 - 8; 13 - 16][3 4]

    punpcklwd   m6, m1, m3      ;[17 - 20; 25 - 28][1 2]
    punpckhwd   m1, m3          ;[21 - 24; 29 - 32][1 2]

    punpcklwd   m3, m5, m7      ;[17 - 20; 25 - 28][3 4]
    punpckhwd   m5, m7          ;[21 - 24; 29 - 32][3 4]

    punpckldq   m7, m8, m2      ;[1 - 2; 9 - 10][1 2 3 4]
    punpckhdq   m8, m2          ;[3 - 4; 11 - 12][1 2 3 4]

    punpckldq   m2, m0, m4      ;[5 - 6; 13 - 14][1 2 3 4]
    punpckhdq   m0, m4          ;[7 - 8; 15 - 16][1 2 3 4]

    punpckldq   m4, m6, m3      ;[17 - 18; 25 - 26][1 2 3 4]
    punpckhdq   m6, m3          ;[19 - 20; 27 - 28][1 2 3 4]

    punpckldq   m3, m1, m5      ;[21 - 22; 29 - 30][1 2 3 4]
    punpckhdq   m1, m5          ;[23 - 24; 31 - 32][1 2 3 4]

    movq        [r0 + 0 * 64], xm7
    movhps      [r0 + 1 * 64], xm7
    vextracti128 xm5, m7, 1
    movq        [r0 + 8 * 64], xm5
    movhps      [r0 + 9 * 64], xm5

    movu        m7, [r1]
    movu        m9, [r1 + 32]
    movu        m10, [r1 + r2]
    movu        m11, [r1 + r2 + 32]
    movu        m12, [r1 + 2 * r2]
    movu        m13, [r1 + 2 * r2 + 32]
    movu        m14, [r1 + r3]
    movu        m15, [r1 + r3 + 32]

    punpcklwd   m5, m7, m10     ;[1 - 4; 9 - 12][5 6]
    punpckhwd   m7, m10         ;[5 - 8; 13 - 16][5 6]

    punpcklwd   m10, m12, m14   ;[1 - 4; 9 - 12][7 8]
    punpckhwd   m12, m14        ;[5 - 8; 13 - 16][7 8]

    punpcklwd   m14, m9, m11    ;[17 - 20; 25 - 28][5 6]
    punpckhwd   m9, m11         ;[21 - 24; 29 - 32][5 6]

    punpcklwd   m11, m13, m15   ;[17 - 20; 25 - 28][7 8]
    punpckhwd   m13, m15        ;[21 - 24; 29 - 32][7 8]

    punpckldq   m15, m5, m10    ;[1 - 2; 9 - 10][5 6 7 8]
    punpckhdq   m5, m10         ;[3 - 4; 11 - 12][5 6 7 8]

    punpckldq   m10, m7, m12    ;[5 - 6; 13 - 14][5 6 7 8]
    punpckhdq   m7, m12         ;[7 - 8; 15 - 16][5 6 7 8]

    punpckldq   m12, m14, m11   ;[17 - 18; 25 - 26][5 6 7 8]
    punpckhdq   m14, m11        ;[19 - 20; 27 - 28][5 6 7 8]

    punpckldq   m11, m9, m13    ;[21 - 22; 29 - 30][5 6 7 8]
    punpckhdq   m9, m13         ;[23 - 24; 31 - 32][5 6 7 8]

    movq        [r0 + 0 * 64 + 8], xm15
    movhps      [r0 + 1 * 64 + 8], xm15
    vextracti128 xm13, m15, 1
    movq        [r0 + 8 * 64 + 8], xm13
    movhps      [r0 + 9 * 64 + 8], xm13

    punpcklqdq  m13, m8, m5     ;[3 ; 11][1 2 3 4 5 6 7 8]
    punpckhqdq  m8, m5          ;[4 ; 12][1 2 3 4 5 6 7 8]

    punpcklqdq  m5, m2, m10     ;[5 ; 13][1 2 3 4 5 6 7 8]
    punpckhqdq  m2, m10         ;[6 ; 14][1 2 3 4 5 6 7 8]

    punpcklqdq  m10, m0, m7     ;[7 ; 15][1 2 3 4 5 6 7 8]
    punpckhqdq  m0, m7          ;[8 ; 16][1 2 3 4 5 6 7 8]

    punpcklqdq  m7, m4, m12     ;[17 ; 25][1 2 3 4 5 6 7 8]
    punpckhqdq  m4, m12         ;[18 ; 26][1 2 3 4 5 6 7 8]

    punpcklqdq  m12, m6, m14    ;[19 ; 27][1 2 3 4 5 6 7 8]
    punpckhqdq  m6, m14         ;[20 ; 28][1 2 3 4 5 6 7 8]

    punpcklqdq  m14, m3, m11    ;[21 ; 29][1 2 3 4 5 6 7 8]
    punpckhqdq  m3, m11         ;[22 ; 30][1 2 3 4 5 6 7 8]

    punpcklqdq  m11, m1, m9     ;[23 ; 31][1 2 3 4 5 6 7 8]
    punpckhqdq  m1, m9          ;[24 ; 32][1 2 3 4 5 6 7 8]

    movu        [r0 + 2 * 64], xm13
    vextracti128 [r0 + 10 * 64], m13, 1

    movu        [r0 + 3 * 64], xm8
    vextracti128 [r0 + 11 * 64], m8, 1

    movu        [r0 + 4 * 64], xm5
    vextracti128 [r0 + 12 * 64], m5, 1

    movu        [r0 + 5 * 64], xm2
    vextracti128 [r0 + 13 * 64], m2, 1

    movu        [r0 + 6 * 64], xm10
    vextracti128 [r0 + 14 * 64], m10, 1

    movu        [r0 + 7 * 64], xm0
    vextracti128 [r0 + 15 * 64], m0, 1

    movu        [r0 + 16 * 64], xm7
    vextracti128 [r0 + 24 * 64], m7, 1

    movu        [r0 + 17 * 64], xm4
    vextracti128 [r0 + 25 * 64], m4, 1

    movu        [r0 + 18 * 64], xm12
    vextracti128 [r0 + 26 * 64], m12, 1

    movu        [r0 + 19 * 64], xm6
    vextracti128 [r0 + 27 * 64], m6, 1

    movu        [r0 + 20 * 64], xm14
    vextracti128 [r0 + 28 * 64], m14, 1

    movu        [r0 + 21 * 64], xm3
    vextracti128 [r0 + 29 * 64], m3, 1

    movu        [r0 + 22 * 64], xm11
    vextracti128 [r0 + 30 * 64], m11, 1

    movu        [r0 + 23 * 64], xm1
    vextracti128 [r0 + 31 * 64], m1, 1
    ret

cglobal transpose32, 3, 4, 16
    add         r2, r2
    lea         r3, [r2 * 3]
    call        transpose8x32_internal
    add         r0, 16
    lea         r1, [r1 + 4 * r2]
    call        transpose8x32_internal
    add         r0, 16
    lea         r1, [r1 + 4 * r2]
    call        transpose8x32_internal
    add         r0, 16
    lea         r1, [r1 + 4 * r2]
    call        transpose8x32_internal
    RET
%endif
INIT_XMM sse2
cglobal transpose32, 3, 7, 4, dest, src, stride
    add         r2, r2
    mov         r3, r0
    mov         r4, r1
    mov         r5, 64
    mov         r6, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 32]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 48]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r4 + 16]
    lea         r0, [r6 + 8 * 64]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 8 * 64 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 8 * 64 + 32]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 8 * 64 + 48]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r4 + 32]
    lea         r0, [r6 + 16 * 64]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16 * 64 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16 * 64 + 32]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 16 * 64 + 48]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r4 + 48]
    lea         r0, [r6 + 24 * 64]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 24 * 64 + 16]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 24 * 64 + 32]
    mov         r3, r0
    call        transpose8_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r6 + 24 * 64 + 48]
    mov         r3, r0
    call        transpose8_internal
    RET
%else ;HIGH_BIT_DEPTH == 0
INIT_XMM sse2
cglobal transpose32, 3, 7, 8, dest, src, stride
    mov         r3, r0
    mov         r4, r1
    mov         r5, r0
    mov         r6, 32
    call        transpose16_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r3 + 16]
    mov         r5, r0
    call        transpose16_internal
    lea         r1, [r4 + 16]
    lea         r0, [r3 + 16 * 32]
    mov         r5, r0
    call        transpose16_internal
    lea         r1, [r1 - 8 + 2 * r2]
    lea         r0, [r3 + 16 * 32 + 16]
    mov         r5, r0
    call        transpose16_internal
    RET

%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose32, 3, 5, 16
    lea         r3, [r2 * 3]
    mov         r4d, 2

.loop:
    movu        m0, [r1]
    movu        m1, [r1 + r2]
    movu        m2, [r1 + 2 * r2]
    movu        m3, [r1 + r3]
    lea         r1, [r1 + 4 * r2]

    movu        m4, [r1]
    movu        m5, [r1 + r2]
    movu        m6, [r1 + 2 * r2]
    movu        m7, [r1 + r3]

    punpcklbw   m8, m0, m1      ;[1 - 8 ; 17 - 24][1 2]
    punpckhbw   m0, m1          ;[9 - 16; 25 - 32][1 2]

    punpcklbw   m1, m2, m3      ;[1 - 8 ; 17 - 24][3 4]
    punpckhbw   m2, m3          ;[9 - 16; 25 - 32][3 4]

    punpcklbw   m3, m4, m5      ;[1 - 8 ; 17 - 24][5 6]
    punpckhbw   m4, m5          ;[9 - 16; 25 - 32][5 6]

    punpcklbw   m5, m6, m7      ;[1 - 8 ; 17 - 24][7 8]
    punpckhbw   m6, m7          ;[9 - 16; 25 - 32][7 8]

    punpcklwd   m7, m8, m1      ;[1 - 4 ; 17 - 20][1 2 3 4]
    punpckhwd   m8, m1          ;[5 - 8 ; 21 - 24][1 2 3 4]

    punpcklwd   m1, m3, m5      ;[1 - 4 ; 17 - 20][5 6 7 8]
    punpckhwd   m3, m5          ;[5 - 8 ; 21 - 24][5 6 7 8]

    punpcklwd   m5, m0, m2      ;[9 - 12; 25 - 28][1 2 3 4]
    punpckhwd   m0, m2          ;[13 - 16; 29 - 32][1 2 3 4]

    punpcklwd   m2, m4, m6      ;[9 - 12; 25 - 28][5 6 7 8]
    punpckhwd   m4, m6          ;[13 - 16; 29 - 32][5 6 7 8]

    punpckldq   m6, m7, m1      ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
    punpckhdq   m7, m1          ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]

    punpckldq   m1, m8, m3      ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
    punpckhdq   m8, m3          ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]

    punpckldq   m3, m5, m2      ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
    punpckhdq   m5, m2          ;[11 - 12; 27 - 28][1 2 3 4 5 6 7 8]

    punpckldq   m2, m0, m4      ;[13 - 14; 29 - 30][1 2 3 4 5 6 7 8]
    punpckhdq   m0, m4          ;[15 - 16; 31 - 32][1 2 3 4 5 6 7 8]

    movq        [r0 + 0 * 32], xm6
    movhps      [r0 + 1 * 32], xm6
    vextracti128 xm4, m6, 1
    movq        [r0 + 16 * 32], xm4
    movhps      [r0 + 17 * 32], xm4

    lea         r1, [r1 + 4 * r2]
    movu        m9, [r1]
    movu        m10, [r1 + r2]
    movu        m11, [r1 + 2 * r2]
    movu        m12, [r1 + r3]
    lea         r1, [r1 + 4 * r2]

    movu        m13, [r1]
    movu        m14, [r1 + r2]
    movu        m15, [r1 + 2 * r2]
    movu        m6, [r1 + r3]

    punpcklbw   m4, m9, m10     ;[1 - 8 ; 17 - 24][9 10]
    punpckhbw   m9, m10         ;[9 - 16; 25 - 32][9 10]

    punpcklbw   m10, m11, m12   ;[1 - 8 ; 17 - 24][11 12]
    punpckhbw   m11, m12        ;[9 - 16; 25 - 32][11 12]

    punpcklbw   m12, m13, m14   ;[1 - 8 ; 17 - 24][13 14]
    punpckhbw   m13, m14        ;[9 - 16; 25 - 32][13 14]

    punpcklbw   m14, m15, m6    ;[1 - 8 ; 17 - 24][15 16]
    punpckhbw   m15, m6         ;[9 - 16; 25 - 32][15 16]

    punpcklwd   m6, m4, m10     ;[1 - 4 ; 17 - 20][9 10 11 12]
    punpckhwd   m4, m10         ;[5 - 8 ; 21 - 24][9 10 11 12]

    punpcklwd   m10, m12, m14   ;[1 - 4 ; 17 - 20][13 14 15 16]
    punpckhwd   m12, m14        ;[5 - 8 ; 21 - 24][13 14 15 16]

    punpcklwd   m14, m9, m11    ;[9 - 12; 25 - 28][9 10 11 12]
    punpckhwd   m9, m11         ;[13 - 16; 29 - 32][9 10 11 12]

    punpcklwd   m11, m13, m15   ;[9 - 12; 25 - 28][13 14 15 16]
    punpckhwd   m13, m15        ;[13 - 16; 29 - 32][13 14 15 16]

    punpckldq   m15, m6, m10    ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
    punpckhdq   m6, m10         ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]

    punpckldq   m10, m4, m12    ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
    punpckhdq   m4, m12         ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]

    punpckldq   m12, m14, m11   ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
    punpckhdq   m14, m11        ;[11 - 12; 27 - 28][9 10 11 12 13 14 15 16]

    punpckldq   m11, m9, m13    ;[13 - 14; 29 - 30][9 10 11 12 13 14 15 16]
    punpckhdq   m9, m13         ;[15 - 16; 31 - 32][9 10 11 12 13 14 15 16]


    punpcklqdq  m13, m7, m6     ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m7, m6          ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m6, m1, m10     ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m1, m10         ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m10, m8, m4     ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m8, m4          ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m4, m3, m12     ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m3, m12         ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m12, m5, m14    ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m5, m14         ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m14, m2, m11    ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m2, m11         ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    punpcklqdq  m11, m0, m9     ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
    punpckhqdq  m0, m9          ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]

    movq        [r0 + 0 * 32 + 8], xm15
    movhps      [r0 + 1 * 32 + 8], xm15
    vextracti128 xm9, m15, 1
    movq        [r0 + 16 * 32 + 8], xm9
    movhps      [r0 + 17 * 32 + 8], xm9

    movu        [r0 + 2 * 32], xm13
    vextracti128 [r0 + 18 * 32], m13, 1

    movu        [r0 + 3 * 32], xm7
    vextracti128 [r0 + 19 * 32], m7, 1

    movu        [r0 + 4 * 32], xm6
    vextracti128 [r0 + 20 * 32], m6, 1

    movu        [r0 + 5 * 32], xm1
    vextracti128 [r0 + 21 * 32], m1, 1

    movu        [r0 + 6 * 32], xm10
    vextracti128 [r0 + 22 * 32], m10, 1

    movu        [r0 + 7 * 32], xm8
    vextracti128 [r0 + 23 * 32], m8, 1

    movu        [r0 + 8 * 32], xm4
    vextracti128 [r0 + 24 * 32], m4, 1

    movu        [r0 + 9 * 32], xm3
    vextracti128 [r0 + 25 * 32], m3, 1

    movu        [r0 + 10 * 32], xm12
    vextracti128 [r0 + 26 * 32], m12, 1

    movu        [r0 + 11 * 32], xm5
    vextracti128 [r0 + 27 * 32], m5, 1

    movu        [r0 + 12 * 32], xm14
    vextracti128 [r0 + 28 * 32], m14, 1

    movu        [r0 + 13 * 32], xm2
    vextracti128 [r0 + 29 * 32], m2, 1

    movu        [r0 + 14 * 32], xm11
    vextracti128 [r0 + 30 * 32], m11, 1

    movu        [r0 + 15 * 32], xm0
    vextracti128 [r0 + 31 * 32], m0, 1

    add         r0, 16
    lea         r1, [r1 + 4 * r2]
    dec         r4d
    jnz         .loop
    RET
%endif
%endif
1940 | ||
1941 | ;----------------------------------------------------------------- | |
1942 | ; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride) | |
1943 | ;----------------------------------------------------------------- | |
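; The kernels below write the transpose of a 64x64 block: dst[x][y] = src[y][x].
; A minimal C sketch of the intended result (assumption: dst is a packed 64x64
; block with a fixed stride of 64 pixels, which matches the *128-byte 16-bit
; and *64-byte 8-bit destination offsets used below):
;
;   void transpose64_c(pixel *dst, const pixel *src, intptr_t stride)
;   {
;       for (int y = 0; y < 64; y++)
;           for (int x = 0; x < 64; x++)
;               dst[x * 64 + y] = src[y * stride + x];   /* swap row/column */
;   }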
1944 | %if HIGH_BIT_DEPTH == 1 | |
1945 | %if ARCH_X86_64 == 1 | |
1946 | INIT_YMM avx2 | |
1947 | cglobal transpose8x32_64_internal | |
1948 | movu m0, [r1] | |
1949 | movu m1, [r1 + 32] | |
1950 | movu m2, [r1 + r2] | |
1951 | movu m3, [r1 + r2 + 32] | |
1952 | movu m4, [r1 + 2 * r2] | |
1953 | movu m5, [r1 + 2 * r2 + 32] | |
1954 | movu m6, [r1 + r3] | |
1955 | movu m7, [r1 + r3 + 32] | |
1956 | lea r1, [r1 + 4 * r2] | |
1957 | ||
1958 | punpcklwd m8, m0, m2 ;[1 - 4; 9 - 12][1 2] | |
1959 | punpckhwd m0, m2 ;[5 - 8; 13 - 16][1 2] | |
1960 | ||
1961 | punpcklwd m2, m4, m6 ;[1 - 4; 9 - 12][3 4] | |
1962 | punpckhwd m4, m6 ;[5 - 8; 13 - 16][3 4] | |
1963 | ||
1964 | punpcklwd m6, m1, m3 ;[17 - 20; 25 - 28][1 2] | |
1965 | punpckhwd m1, m3 ;[21 - 24; 29 - 32][1 2] | |
1966 | ||
1967 | punpcklwd m3, m5, m7 ;[17 - 20; 25 - 28][3 4] | |
1968 | punpckhwd m5, m7 ;[21 - 24; 29 - 32][3 4] | |
1969 | ||
1970 | punpckldq m7, m8, m2 ;[1 - 2; 9 - 10][1 2 3 4] | |
1971 | punpckhdq m8, m2 ;[3 - 4; 11 - 12][1 2 3 4] | |
1972 | ||
1973 | punpckldq m2, m0, m4 ;[5 - 6; 13 - 14][1 2 3 4] | |
1974 | punpckhdq m0, m4 ;[7 - 8; 15 - 16][1 2 3 4] | |
1975 | ||
1976 | punpckldq m4, m6, m3 ;[17 - 18; 25 - 26][1 2 3 4] | |
1977 | punpckhdq m6, m3 ;[19 - 20; 27 - 28][1 2 3 4] | |
1978 | ||
1979 | punpckldq m3, m1, m5 ;[21 - 22; 29 - 30][1 2 3 4] | |
1980 | punpckhdq m1, m5 ;[23 - 24; 31 - 32][1 2 3 4] | |
1981 | ||
1982 | movq [r0 + 0 * 128], xm7 | |
1983 | movhps [r0 + 1 * 128], xm7 | |
1984 | vextracti128 xm5, m7, 1 | |
1985 | movq [r0 + 8 * 128], xm5 | |
1986 | movhps [r0 + 9 * 128], xm5 | |
1987 | ||
1988 | movu m7, [r1] | |
1989 | movu m9, [r1 + 32] | |
1990 | movu m10, [r1 + r2] | |
1991 | movu m11, [r1 + r2 + 32] | |
1992 | movu m12, [r1 + 2 * r2] | |
1993 | movu m13, [r1 + 2 * r2 + 32] | |
1994 | movu m14, [r1 + r3] | |
1995 | movu m15, [r1 + r3 + 32] | |
1996 | ||
1997 | punpcklwd m5, m7, m10 ;[1 - 4; 9 - 12][5 6] | |
1998 | punpckhwd m7, m10 ;[5 - 8; 13 - 16][5 6] | |
1999 | ||
2000 | punpcklwd m10, m12, m14 ;[1 - 4; 9 - 12][7 8] | |
2001 | punpckhwd m12, m14 ;[5 - 8; 13 - 16][7 8] | |
2002 | ||
2003 | punpcklwd m14, m9, m11 ;[17 - 20; 25 - 28][5 6] | |
2004 | punpckhwd m9, m11 ;[21 - 24; 29 - 32][5 6] | |
2005 | ||
2006 | punpcklwd m11, m13, m15 ;[17 - 20; 25 - 28][7 8] | |
2007 | punpckhwd m13, m15 ;[21 - 24; 29 - 32][7 8] | |
2008 | ||
2009 | punpckldq m15, m5, m10 ;[1 - 2; 9 - 10][5 6 7 8] | |
2010 | punpckhdq m5, m10 ;[3 - 4; 11 - 12][5 6 7 8] | |
2011 | ||
2012 | punpckldq m10, m7, m12 ;[5 - 6; 13 - 14][5 6 7 8] | |
2013 | punpckhdq m7, m12 ;[7 - 8; 15 - 16][5 6 7 8] | |
2014 | ||
2015 | punpckldq m12, m14, m11 ;[17 - 18; 25 - 26][5 6 7 8] | |
2016 | punpckhdq m14, m11 ;[19 - 20; 27 - 28][5 6 7 8] | |
2017 | ||
2018 | punpckldq m11, m9, m13 ;[21 - 22; 29 - 30][5 6 7 8] | |
2019 | punpckhdq m9, m13 ;[23 - 24; 31 - 32][5 6 7 8] | |
2020 | ||
2021 | movq [r0 + 0 * 128 + 8], xm15 | |
2022 | movhps [r0 + 1 * 128 + 8], xm15 | |
2023 | vextracti128 xm13, m15, 1 | |
2024 | movq [r0 + 8 * 128 + 8], xm13 | |
2025 | movhps [r0 + 9 * 128 + 8], xm13 | |
2026 | ||
2027 | punpcklqdq m13, m8, m5 ;[3 ; 11][1 2 3 4 5 6 7 8] | |
2028 | punpckhqdq m8, m5 ;[4 ; 12][1 2 3 4 5 6 7 8] | |
2029 | ||
2030 | punpcklqdq m5, m2, m10 ;[5 ; 13][1 2 3 4 5 6 7 8] | |
2031 | punpckhqdq m2, m10 ;[6 ; 14][1 2 3 4 5 6 7 8] | |
2032 | ||
2033 | punpcklqdq m10, m0, m7 ;[7 ; 15][1 2 3 4 5 6 7 8] | |
2034 | punpckhqdq m0, m7 ;[8 ; 16][1 2 3 4 5 6 7 8] | |
2035 | ||
2036 | punpcklqdq m7, m4, m12 ;[17 ; 25][1 2 3 4 5 6 7 8] | |
2037 | punpckhqdq m4, m12 ;[18 ; 26][1 2 3 4 5 6 7 8] | |
2038 | ||
2039 | punpcklqdq m12, m6, m14 ;[19 ; 27][1 2 3 4 5 6 7 8] | |
2040 | punpckhqdq m6, m14 ;[20 ; 28][1 2 3 4 5 6 7 8] | |
2041 | ||
2042 | punpcklqdq m14, m3, m11 ;[21 ; 29][1 2 3 4 5 6 7 8] | |
2043 | punpckhqdq m3, m11 ;[22 ; 30][1 2 3 4 5 6 7 8] | |
2044 | ||
2045 | punpcklqdq m11, m1, m9 ;[23 ; 31][1 2 3 4 5 6 7 8] | |
2046 | punpckhqdq m1, m9 ;[24 ; 32][1 2 3 4 5 6 7 8] | |
2047 | ||
2048 | movu [r0 + 2 * 128], xm13 | |
2049 | vextracti128 [r0 + 10 * 128], m13, 1 | |
2050 | ||
2051 | movu [r0 + 3 * 128], xm8 | |
2052 | vextracti128 [r0 + 11 * 128], m8, 1 | |
2053 | ||
2054 | movu [r0 + 4 * 128], xm5 | |
2055 | vextracti128 [r0 + 12 * 128], m5, 1 | |
2056 | ||
2057 | movu [r0 + 5 * 128], xm2 | |
2058 | vextracti128 [r0 + 13 * 128], m2, 1 | |
2059 | ||
2060 | movu [r0 + 6 * 128], xm10 | |
2061 | vextracti128 [r0 + 14 * 128], m10, 1 | |
2062 | ||
2063 | movu [r0 + 7 * 128], xm0 | |
2064 | vextracti128 [r0 + 15 * 128], m0, 1 | |
2065 | ||
2066 | movu [r0 + 16 * 128], xm7 | |
2067 | vextracti128 [r0 + 24 * 128], m7, 1 | |
2068 | ||
2069 | movu [r0 + 17 * 128], xm4 | |
2070 | vextracti128 [r0 + 25 * 128], m4, 1 | |
2071 | ||
2072 | movu [r0 + 18 * 128], xm12 | |
2073 | vextracti128 [r0 + 26 * 128], m12, 1 | |
2074 | ||
2075 | movu [r0 + 19 * 128], xm6 | |
2076 | vextracti128 [r0 + 27 * 128], m6, 1 | |
2077 | ||
2078 | movu [r0 + 20 * 128], xm14 | |
2079 | vextracti128 [r0 + 28 * 128], m14, 1 | |
2080 | ||
2081 | movu [r0 + 21 * 128], xm3 | |
2082 | vextracti128 [r0 + 29 * 128], m3, 1 | |
2083 | ||
2084 | movu [r0 + 22 * 128], xm11 | |
2085 | vextracti128 [r0 + 30 * 128], m11, 1 | |
2086 | ||
2087 | movu [r0 + 23 * 128], xm1 | |
2088 | vextracti128 [r0 + 31 * 128], m1, 1 | |
2089 | ret | |
2090 | ||
2091 | cglobal transpose64, 3, 6, 16 | |
2092 | add r2, r2 | |
2093 | lea r3, [3 * r2] | |
2094 | lea r4, [r1 + 64] | |
2095 | lea r5, [r0 + 16] | |
2096 | ||
2097 | call transpose8x32_64_internal | |
2098 | mov r1, r4 | |
2099 | lea r0, [r0 + 32 * 128] | |
2100 | call transpose8x32_64_internal | |
2101 | mov r0, r5 | |
2102 | lea r5, [r0 + 16] | |
2103 | lea r4, [r1 + 4 * r2] | |
2104 | lea r1, [r4 - 64] | |
2105 | call transpose8x32_64_internal | |
2106 | mov r1, r4 | |
2107 | lea r0, [r0 + 32 * 128] | |
2108 | call transpose8x32_64_internal | |
2109 | mov r0, r5 | |
2110 | lea r5, [r0 + 16] | |
2111 | lea r4, [r1 + 4 * r2] | |
2112 | lea r1, [r4 - 64] | |
2113 | call transpose8x32_64_internal | |
2114 | mov r1, r4 | |
2115 | lea r0, [r0 + 32 * 128] | |
2116 | call transpose8x32_64_internal | |
2117 | mov r0, r5 | |
2118 | lea r5, [r0 + 16] | |
2119 | lea r4, [r1 + 4 * r2] | |
2120 | lea r1, [r4 - 64] | |
2121 | call transpose8x32_64_internal | |
2122 | mov r1, r4 | |
2123 | lea r0, [r0 + 32 * 128] | |
2124 | call transpose8x32_64_internal | |
2125 | mov r0, r5 | |
2126 | lea r5, [r0 + 16] | |
2127 | lea r4, [r1 + 4 * r2] | |
2128 | lea r1, [r4 - 64] | |
2129 | call transpose8x32_64_internal | |
2130 | mov r1, r4 | |
2131 | lea r0, [r0 + 32 * 128] | |
2132 | call transpose8x32_64_internal | |
2133 | mov r0, r5 | |
2134 | lea r5, [r0 + 16] | |
2135 | lea r4, [r1 + 4 * r2] | |
2136 | lea r1, [r4 - 64] | |
2137 | call transpose8x32_64_internal | |
2138 | mov r1, r4 | |
2139 | lea r0, [r0 + 32 * 128] | |
2140 | call transpose8x32_64_internal | |
2141 | mov r0, r5 | |
2142 | lea r5, [r0 + 16] | |
2143 | lea r4, [r1 + 4 * r2] | |
2144 | lea r1, [r4 - 64] | |
2145 | call transpose8x32_64_internal | |
2146 | mov r1, r4 | |
2147 | lea r0, [r0 + 32 * 128] | |
2148 | call transpose8x32_64_internal | |
2149 | mov r0, r5 | |
2150 | lea r4, [r1 + 4 * r2] | |
2151 | lea r1, [r4 - 64] | |
2152 | call transpose8x32_64_internal | |
2153 | mov r1, r4 | |
2154 | lea r0, [r0 + 32 * 128] | |
2155 | call transpose8x32_64_internal | |
2156 | RET | |
2157 | %endif | |
2158 | INIT_XMM sse2 | |
2159 | cglobal transpose64, 3, 7, 4, dest, src, stride | |
2160 | add r2, r2 | |
2161 | mov r3, r0 | |
2162 | mov r4, r1 | |
2163 | mov r5, 128 | |
2164 | mov r6, r0 | |
2165 | call transpose8_internal | |
2166 | lea r1, [r1 - 8 + 2 * r2] | |
2167 | lea r0, [r6 + 16] | |
2168 | mov r3, r0 | |
2169 | call transpose8_internal | |
2170 | lea r1, [r1 - 8 + 2 * r2] | |
2171 | lea r0, [r6 + 32] | |
2172 | mov r3, r0 | |
2173 | call transpose8_internal | |
2174 | lea r1, [r1 - 8 + 2 * r2] | |
2175 | lea r0, [r6 + 48] | |
2176 | mov r3, r0 | |
2177 | call transpose8_internal | |
2178 | lea r1, [r1 - 8 + 2 * r2] | |
2179 | lea r0, [r6 + 64] | |
2180 | mov r3, r0 | |
2181 | call transpose8_internal | |
2182 | lea r1, [r1 - 8 + 2 * r2] | |
2183 | lea r0, [r6 + 80] | |
2184 | mov r3, r0 | |
2185 | call transpose8_internal | |
2186 | lea r1, [r1 - 8 + 2 * r2] | |
2187 | lea r0, [r6 + 96] | |
2188 | mov r3, r0 | |
2189 | call transpose8_internal | |
2190 | lea r1, [r1 - 8 + 2 * r2] | |
2191 | lea r0, [r6 + 112] | |
2192 | mov r3, r0 | |
2193 | call transpose8_internal | |
2194 | ||
2195 | lea r1, [r4 + 16] | |
2196 | lea r0, [r6 + 8 * 128] | |
2197 | mov r3, r0 | |
2198 | call transpose8_internal | |
2199 | lea r1, [r1 - 8 + 2 * r2] | |
2200 | lea r0, [r6 + 8 * 128 + 16] | |
2201 | mov r3, r0 | |
2202 | call transpose8_internal | |
2203 | lea r1, [r1 - 8 + 2 * r2] | |
2204 | lea r0, [r6 + 8 * 128 + 32] | |
2205 | mov r3, r0 | |
2206 | call transpose8_internal | |
2207 | lea r1, [r1 - 8 + 2 * r2] | |
2208 | lea r0, [r6 + 8 * 128 + 48] | |
2209 | mov r3, r0 | |
2210 | call transpose8_internal | |
2211 | lea r1, [r1 - 8 + 2 * r2] | |
2212 | lea r0, [r6 + 8 * 128 + 64] | |
2213 | mov r3, r0 | |
2214 | call transpose8_internal | |
2215 | lea r1, [r1 - 8 + 2 * r2] | |
2216 | lea r0, [r6 + 8 * 128 + 80] | |
2217 | mov r3, r0 | |
2218 | call transpose8_internal | |
2219 | lea r1, [r1 - 8 + 2 * r2] | |
2220 | lea r0, [r6 + 8 * 128 + 96] | |
2221 | mov r3, r0 | |
2222 | call transpose8_internal | |
2223 | lea r1, [r1 - 8 + 2 * r2] | |
2224 | lea r0, [r6 + 8 * 128 + 112] | |
2225 | mov r3, r0 | |
2226 | call transpose8_internal | |
2227 | ||
2228 | lea r1, [r4 + 32] | |
2229 | lea r0, [r6 + 16 * 128] | |
2230 | mov r3, r0 | |
2231 | call transpose8_internal | |
2232 | lea r1, [r1 - 8 + 2 * r2] | |
2233 | lea r0, [r6 + 16 * 128 + 16] | |
2234 | mov r3, r0 | |
2235 | call transpose8_internal | |
2236 | lea r1, [r1 - 8 + 2 * r2] | |
2237 | lea r0, [r6 + 16 * 128 + 32] | |
2238 | mov r3, r0 | |
2239 | call transpose8_internal | |
2240 | lea r1, [r1 - 8 + 2 * r2] | |
2241 | lea r0, [r6 + 16 * 128 + 48] | |
2242 | mov r3, r0 | |
2243 | call transpose8_internal | |
2244 | lea r1, [r1 - 8 + 2 * r2] | |
2245 | lea r0, [r6 + 16 * 128 + 64] | |
2246 | mov r3, r0 | |
2247 | call transpose8_internal | |
2248 | lea r1, [r1 - 8 + 2 * r2] | |
2249 | lea r0, [r6 + 16 * 128 + 80] | |
2250 | mov r3, r0 | |
2251 | call transpose8_internal | |
2252 | lea r1, [r1 - 8 + 2 * r2] | |
2253 | lea r0, [r6 + 16 * 128 + 96] | |
2254 | mov r3, r0 | |
2255 | call transpose8_internal | |
2256 | lea r1, [r1 - 8 + 2 * r2] | |
2257 | lea r0, [r6 + 16 * 128 + 112] | |
2258 | mov r3, r0 | |
2259 | call transpose8_internal | |
2260 | ||
2261 | lea r1, [r4 + 48] | |
2262 | lea r0, [r6 + 24 * 128] | |
2263 | mov r3, r0 | |
2264 | call transpose8_internal | |
2265 | lea r1, [r1 - 8 + 2 * r2] | |
2266 | lea r0, [r6 + 24 * 128 + 16] | |
2267 | mov r3, r0 | |
2268 | call transpose8_internal | |
2269 | lea r1, [r1 - 8 + 2 * r2] | |
2270 | lea r0, [r6 + 24 * 128 + 32] | |
2271 | mov r3, r0 | |
2272 | call transpose8_internal | |
2273 | lea r1, [r1 - 8 + 2 * r2] | |
2274 | lea r0, [r6 + 24 * 128 + 48] | |
2275 | mov r3, r0 | |
2276 | call transpose8_internal | |
2277 | lea r1, [r1 - 8 + 2 * r2] | |
2278 | lea r0, [r6 + 24 * 128 + 64] | |
2279 | mov r3, r0 | |
2280 | call transpose8_internal | |
2281 | lea r1, [r1 - 8 + 2 * r2] | |
2282 | lea r0, [r6 + 24 * 128 + 80] | |
2283 | mov r3, r0 | |
2284 | call transpose8_internal | |
2285 | lea r1, [r1 - 8 + 2 * r2] | |
2286 | lea r0, [r6 + 24 * 128 + 96] | |
2287 | mov r3, r0 | |
2288 | call transpose8_internal | |
2289 | lea r1, [r1 - 8 + 2 * r2] | |
2290 | lea r0, [r6 + 24 * 128 + 112] | |
2291 | mov r3, r0 | |
2292 | call transpose8_internal | |
2293 | ||
2294 | lea r1, [r4 + 64] | |
2295 | lea r0, [r6 + 32 * 128] | |
2296 | mov r3, r0 | |
2297 | call transpose8_internal | |
2298 | lea r1, [r1 - 8 + 2 * r2] | |
2299 | lea r0, [r6 + 32 * 128 + 16] | |
2300 | mov r3, r0 | |
2301 | call transpose8_internal | |
2302 | lea r1, [r1 - 8 + 2 * r2] | |
2303 | lea r0, [r6 + 32 * 128 + 32] | |
2304 | mov r3, r0 | |
2305 | call transpose8_internal | |
2306 | lea r1, [r1 - 8 + 2 * r2] | |
2307 | lea r0, [r6 + 32 * 128 + 48] | |
2308 | mov r3, r0 | |
2309 | call transpose8_internal | |
2310 | lea r1, [r1 - 8 + 2 * r2] | |
2311 | lea r0, [r6 + 32 * 128 + 64] | |
2312 | mov r3, r0 | |
2313 | call transpose8_internal | |
2314 | lea r1, [r1 - 8 + 2 * r2] | |
2315 | lea r0, [r6 + 32 * 128 + 80] | |
2316 | mov r3, r0 | |
2317 | call transpose8_internal | |
2318 | lea r1, [r1 - 8 + 2 * r2] | |
2319 | lea r0, [r6 + 32 * 128 + 96] | |
2320 | mov r3, r0 | |
2321 | call transpose8_internal | |
2322 | lea r1, [r1 - 8 + 2 * r2] | |
2323 | lea r0, [r6 + 32 * 128 + 112] | |
2324 | mov r3, r0 | |
2325 | call transpose8_internal | |
2326 | ||
2327 | lea r1, [r4 + 80] | |
2328 | lea r0, [r6 + 40 * 128] | |
2329 | mov r3, r0 | |
2330 | call transpose8_internal | |
2331 | lea r1, [r1 - 8 + 2 * r2] | |
2332 | lea r0, [r6 + 40 * 128 + 16] | |
2333 | mov r3, r0 | |
2334 | call transpose8_internal | |
2335 | lea r1, [r1 - 8 + 2 * r2] | |
2336 | lea r0, [r6 + 40 * 128 + 32] | |
2337 | mov r3, r0 | |
2338 | call transpose8_internal | |
2339 | lea r1, [r1 - 8 + 2 * r2] | |
2340 | lea r0, [r6 + 40 * 128 + 48] | |
2341 | mov r3, r0 | |
2342 | call transpose8_internal | |
2343 | lea r1, [r1 - 8 + 2 * r2] | |
2344 | lea r0, [r6 + 40 * 128 + 64] | |
2345 | mov r3, r0 | |
2346 | call transpose8_internal | |
2347 | lea r1, [r1 - 8 + 2 * r2] | |
2348 | lea r0, [r6 + 40 * 128 + 80] | |
2349 | mov r3, r0 | |
2350 | call transpose8_internal | |
2351 | lea r1, [r1 - 8 + 2 * r2] | |
2352 | lea r0, [r6 + 40 * 128 + 96] | |
2353 | mov r3, r0 | |
2354 | call transpose8_internal | |
2355 | lea r1, [r1 - 8 + 2 * r2] | |
2356 | lea r0, [r6 + 40 * 128 + 112] | |
2357 | mov r3, r0 | |
2358 | call transpose8_internal | |
2359 | ||
2360 | lea r1, [r4 + 96] | |
2361 | lea r0, [r6 + 48 * 128] | |
2362 | mov r3, r0 | |
2363 | call transpose8_internal | |
2364 | lea r1, [r1 - 8 + 2 * r2] | |
2365 | lea r0, [r6 + 48 * 128 + 16] | |
2366 | mov r3, r0 | |
2367 | call transpose8_internal | |
2368 | lea r1, [r1 - 8 + 2 * r2] | |
2369 | lea r0, [r6 + 48 * 128 + 32] | |
2370 | mov r3, r0 | |
2371 | call transpose8_internal | |
2372 | lea r1, [r1 - 8 + 2 * r2] | |
2373 | lea r0, [r6 + 48 * 128 + 48] | |
2374 | mov r3, r0 | |
2375 | call transpose8_internal | |
2376 | lea r1, [r1 - 8 + 2 * r2] | |
2377 | lea r0, [r6 + 48 * 128 + 64] | |
2378 | mov r3, r0 | |
2379 | call transpose8_internal | |
2380 | lea r1, [r1 - 8 + 2 * r2] | |
2381 | lea r0, [r6 + 48 * 128 + 80] | |
2382 | mov r3, r0 | |
2383 | call transpose8_internal | |
2384 | lea r1, [r1 - 8 + 2 * r2] | |
2385 | lea r0, [r6 + 48 * 128 + 96] | |
2386 | mov r3, r0 | |
2387 | call transpose8_internal | |
2388 | lea r1, [r1 - 8 + 2 * r2] | |
2389 | lea r0, [r6 + 48 * 128 + 112] | |
2390 | mov r3, r0 | |
2391 | call transpose8_internal | |
2392 | ||
2393 | lea r1, [r4 + 112] | |
2394 | lea r0, [r6 + 56 * 128] | |
2395 | mov r3, r0 | |
2396 | call transpose8_internal | |
2397 | lea r1, [r1 - 8 + 2 * r2] | |
2398 | lea r0, [r6 + 56 * 128 + 16] | |
2399 | mov r3, r0 | |
2400 | call transpose8_internal | |
2401 | lea r1, [r1 - 8 + 2 * r2] | |
2402 | lea r0, [r6 + 56 * 128 + 32] | |
2403 | mov r3, r0 | |
2404 | call transpose8_internal | |
2405 | lea r1, [r1 - 8 + 2 * r2] | |
2406 | lea r0, [r6 + 56 * 128 + 48] | |
2407 | mov r3, r0 | |
2408 | call transpose8_internal | |
2409 | lea r1, [r1 - 8 + 2 * r2] | |
2410 | lea r0, [r6 + 56 * 128 + 64] | |
2411 | mov r3, r0 | |
2412 | call transpose8_internal | |
2413 | lea r1, [r1 - 8 + 2 * r2] | |
2414 | lea r0, [r6 + 56 * 128 + 80] | |
2415 | mov r3, r0 | |
2416 | call transpose8_internal | |
2417 | lea r1, [r1 - 8 + 2 * r2] | |
2418 | lea r0, [r6 + 56 * 128 + 96] | |
2419 | mov r3, r0 | |
2420 | call transpose8_internal | |
2421 | lea r1, [r1 - 8 + 2 * r2] | |
2422 | lea r0, [r6 + 56 * 128 + 112] | |
2423 | mov r3, r0 | |
2424 | call transpose8_internal | |
2425 | RET | |
2426 | %else ;HIGH_BIT_DEPTH == 0 | |
2427 | %if ARCH_X86_64 == 1 | |
2428 | INIT_YMM avx2 | |
2429 | ||
2430 | cglobal transpose16x32_avx2 | |
2431 | movu m0, [r1] | |
2432 | movu m1, [r1 + r2] | |
2433 | movu m2, [r1 + 2 * r2] | |
2434 | movu m3, [r1 + r3] | |
2435 | lea r1, [r1 + 4 * r2] | |
2436 | ||
2437 | movu m4, [r1] | |
2438 | movu m5, [r1 + r2] | |
2439 | movu m6, [r1 + 2 * r2] | |
2440 | movu m7, [r1 + r3] | |
2441 | ||
2442 | punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2] | |
2443 | punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2] | |
2444 | ||
2445 | punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4] | |
2446 | punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4] | |
2447 | ||
2448 | punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6] | |
2449 | punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6] | |
2450 | ||
2451 | punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8] | |
2452 | punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8] | |
2453 | ||
2454 | punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4] | |
2455 | punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4] | |
2456 | ||
2457 | punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8] | |
2458 | punpckhwd m3, m5 ;[5 - 8 ; 21 - 24][5 6 7 8] | |
2459 | ||
2460 | punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4] | |
2461 | punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4] | |
2462 | ||
2463 | punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8] | |
2464 | punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8] | |
2465 | ||
2466 | punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8] | |
2467 | punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8] | |
2468 | ||
2469 | punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8] | |
2470 | punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8] | |
2471 | ||
2472 | punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8] | |
2473 | punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8] | |
2474 | ||
2475 | punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8] | |
2476 | punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8] | |
2477 | ||
2478 | movq [r0 + 0 * 64], xm6 | |
2479 | movhps [r0 + 1 * 64], xm6 | |
2480 | vextracti128 xm4, m6, 1 | |
2481 | movq [r0 + 16 * 64], xm4 | |
2482 | movhps [r0 + 17 * 64], xm4 | |
2483 | ||
2484 | lea r1, [r1 + 4 * r2] | |
2485 | movu m9, [r1] | |
2486 | movu m10, [r1 + r2] | |
2487 | movu m11, [r1 + 2 * r2] | |
2488 | movu m12, [r1 + r3] | |
2489 | lea r1, [r1 + 4 * r2] | |
2490 | ||
2491 | movu m13, [r1] | |
2492 | movu m14, [r1 + r2] | |
2493 | movu m15, [r1 + 2 * r2] | |
2494 | movu m6, [r1 + r3] | |
2495 | ||
2496 | punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10] | |
2497 | punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10] | |
2498 | ||
2499 | punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12] | |
2500 | punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12] | |
2501 | ||
2502 | punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14] | |
2503 | punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14] | |
2504 | ||
2505 | punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16] | |
2506 | punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16] | |
2507 | ||
2508 | punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12] | |
2509 | punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12] | |
2510 | ||
2511 | punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16] | |
2512 | punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16] | |
2513 | ||
2514 | punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12] | |
2515 | punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12] | |
2516 | ||
2517 | punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16] | |
2518 | punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16] | |
2519 | ||
2520 | punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16] | |
2521 | punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16] | |
2522 | ||
2523 | punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16] | |
2524 | punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16] | |
2525 | ||
2526 | punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16] | |
2527 | punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16] | |
2528 | ||
2529 | punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16] | |
2530 | punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16] | |
2531 | ||
2532 | ||
2533 | punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2534 | punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2535 | ||
2536 | punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2537 | punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2538 | ||
2539 | punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2540 | punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2541 | ||
2542 | punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2543 | punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2544 | ||
2545 | punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2546 | punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2547 | ||
2548 | punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2549 | punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2550 | ||
2551 | punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2552 | punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] | |
2553 | ||
2554 | movq [r0 + 0 * 64 + 8], xm15 | |
2555 | movhps [r0 + 1 * 64 + 8], xm15 | |
2556 | vextracti128 xm9, m15, 1 | |
2557 | movq [r0 + 16 * 64 + 8], xm9 | |
2558 | movhps [r0 + 17 * 64 + 8], xm9 | |
2559 | ||
2560 | movu [r0 + 2 * 64], xm13 | |
2561 | vextracti128 [r0 + 18 * 64], m13, 1 | |
2562 | ||
2563 | movu [r0 + 3 * 64], xm7 | |
2564 | vextracti128 [r0 + 19 * 64], m7, 1 | |
2565 | ||
2566 | movu [r0 + 4 * 64], xm6 | |
2567 | vextracti128 [r0 + 20 * 64], m6, 1 | |
2568 | ||
2569 | movu [r0 + 5 * 64], xm1 | |
2570 | vextracti128 [r0 + 21 * 64], m1, 1 | |
2571 | ||
2572 | movu [r0 + 6 * 64], xm10 | |
2573 | vextracti128 [r0 + 22 * 64], m10, 1 | |
2574 | ||
2575 | movu [r0 + 7 * 64], xm8 | |
2576 | vextracti128 [r0 + 23 * 64], m8, 1 | |
2577 | ||
2578 | movu [r0 + 8 * 64], xm4 | |
2579 | vextracti128 [r0 + 24 * 64], m4, 1 | |
2580 | ||
2581 | movu [r0 + 9 * 64], xm3 | |
2582 | vextracti128 [r0 + 25 * 64], m3, 1 | |
2583 | ||
2584 | movu [r0 + 10 * 64], xm12 | |
2585 | vextracti128 [r0 + 26 * 64], m12, 1 | |
2586 | ||
2587 | movu [r0 + 11 * 64], xm5 | |
2588 | vextracti128 [r0 + 27 * 64], m5, 1 | |
2589 | ||
2590 | movu [r0 + 12 * 64], xm14 | |
2591 | vextracti128 [r0 + 28 * 64], m14, 1 | |
2592 | ||
2593 | movu [r0 + 13 * 64], xm2 | |
2594 | vextracti128 [r0 + 29 * 64], m2, 1 | |
2595 | ||
2596 | movu [r0 + 14 * 64], xm11 | |
2597 | vextracti128 [r0 + 30 * 64], m11, 1 | |
2598 | ||
2599 | movu [r0 + 15 * 64], xm0 | |
2600 | vextracti128 [r0 + 31 * 64], m0, 1 | |
2601 | ret | |
2602 | ||
2603 | cglobal transpose64, 3, 6, 16 | |
2604 | ||
2605 | lea r3, [r2 * 3] | |
2606 | lea r4, [r0 + 16] | |
2607 | ||
2608 | lea r5, [r1 + 32] | |
2609 | call transpose16x32_avx2 | |
2610 | lea r0, [r0 + 32 * 64] | |
2611 | mov r1, r5 | |
2612 | call transpose16x32_avx2 | |
2613 | ||
2614 | mov r0, r4 | |
2615 | lea r5, [r1 + 4 * r2] | |
2616 | ||
2617 | lea r1, [r5 - 32] | |
2618 | call transpose16x32_avx2 | |
2619 | lea r0, [r0 + 32 * 64] | |
2620 | mov r1, r5 | |
2621 | call transpose16x32_avx2 | |
2622 | ||
2623 | lea r0, [r4 + 16] | |
2624 | lea r5, [r1 + 4 * r2] | |
2625 | ||
2626 | lea r1, [r5 - 32] | |
2627 | call transpose16x32_avx2 | |
2628 | lea r0, [r0 + 32 * 64] | |
2629 | mov r1, r5 | |
2630 | call transpose16x32_avx2 | |
2631 | ||
2632 | lea r5, [r1 + 4 * r2] | |
2633 | lea r0, [r4 + 32] | |
2634 | ||
2635 | lea r1, [r5 - 32] | |
2636 | call transpose16x32_avx2 | |
2637 | lea r0, [r0 + 32 * 64] | |
2638 | mov r1, r5 | |
2639 | call transpose16x32_avx2 | |
2640 | RET | |
2641 | %endif | |
2642 | ||
2643 | INIT_XMM sse2 | |
2644 | cglobal transpose64, 3, 7, 8, dest, src, stride | |
2645 | mov r3, r0 | |
2646 | mov r4, r1 | |
2647 | mov r5, r0 | |
2648 | mov r6, 64 | |
2649 | call transpose16_internal | |
2650 | lea r1, [r1 - 8 + 2 * r2] | |
2651 | lea r0, [r3 + 16] | |
2652 | mov r5, r0 | |
2653 | call transpose16_internal | |
2654 | lea r1, [r1 - 8 + 2 * r2] | |
2655 | lea r0, [r3 + 32] | |
2656 | mov r5, r0 | |
2657 | call transpose16_internal | |
2658 | lea r1, [r1 - 8 + 2 * r2] | |
2659 | lea r0, [r3 + 48] | |
2660 | mov r5, r0 | |
2661 | call transpose16_internal | |
2662 | ||
2663 | lea r1, [r4 + 16] | |
2664 | lea r0, [r3 + 16 * 64] | |
2665 | mov r5, r0 | |
2666 | call transpose16_internal | |
2667 | lea r1, [r1 - 8 + 2 * r2] | |
2668 | lea r0, [r3 + 16 * 64 + 16] | |
2669 | mov r5, r0 | |
2670 | call transpose16_internal | |
2671 | lea r1, [r1 - 8 + 2 * r2] | |
2672 | lea r0, [r3 + 16 * 64 + 32] | |
2673 | mov r5, r0 | |
2674 | call transpose16_internal | |
2675 | lea r1, [r1 - 8 + 2 * r2] | |
2676 | lea r0, [r3 + 16 * 64 + 48] | |
2677 | mov r5, r0 | |
2678 | call transpose16_internal | |
2679 | ||
2680 | lea r1, [r4 + 32] | |
2681 | lea r0, [r3 + 32 * 64] | |
2682 | mov r5, r0 | |
2683 | call transpose16_internal | |
2684 | lea r1, [r1 - 8 + 2 * r2] | |
2685 | lea r0, [r3 + 32 * 64 + 16] | |
2686 | mov r5, r0 | |
2687 | call transpose16_internal | |
2688 | lea r1, [r1 - 8 + 2 * r2] | |
2689 | lea r0, [r3 + 32 * 64 + 32] | |
2690 | mov r5, r0 | |
2691 | call transpose16_internal | |
2692 | lea r1, [r1 - 8 + 2 * r2] | |
2693 | lea r0, [r3 + 32 * 64 + 48] | |
2694 | mov r5, r0 | |
2695 | call transpose16_internal | |
2696 | ||
2697 | lea r1, [r4 + 48] | |
2698 | lea r0, [r3 + 48 * 64] | |
2699 | mov r5, r0 | |
2700 | call transpose16_internal | |
2701 | lea r1, [r1 - 8 + 2 * r2] | |
2702 | lea r0, [r3 + 48 * 64 + 16] | |
2703 | mov r5, r0 | |
2704 | call transpose16_internal | |
2705 | lea r1, [r1 - 8 + 2 * r2] | |
2706 | lea r0, [r3 + 48 * 64 + 32] | |
2707 | mov r5, r0 | |
2708 | call transpose16_internal | |
2709 | lea r1, [r1 - 8 + 2 * r2] | |
2710 | lea r0, [r3 + 48 * 64 + 48] | |
2711 | mov r5, r0 | |
2712 | call transpose16_internal | |
2713 | RET | |
2714 | %endif | |
2715 | ||
2716 | ||
2717 | ;============================================================================= | |
2718 | ; SSIM | |
2719 | ;============================================================================= | |
2720 | ||
2721 | ;----------------------------------------------------------------------------- | |
2722 | ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1, | |
2723 | ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] ) | |
2724 | ;----------------------------------------------------------------------------- | |
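; For two horizontally adjacent 4x4 blocks z = 0,1 this accumulates the four
; SSIM partial sums per block. A minimal C sketch of the computation, using
; the x264-style output layout (sums[z][0]=s1, [1]=s2, [2]=ss, [3]=s12):
;
;   void ssim_4x4x2_core_c(const pixel *pix1, intptr_t stride1,
;                          const pixel *pix2, intptr_t stride2, int sums[2][4])
;   {
;       for (int z = 0; z < 2; z++)
;       {
;           int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;           for (int y = 0; y < 4; y++)
;               for (int x = 0; x < 4; x++)
;               {
;                   int a = pix1[x + y * stride1];
;                   int b = pix2[x + y * stride2];
;                   s1  += a;
;                   s2  += b;
;                   ss  += a * a + b * b;
;                   s12 += a * b;
;               }
;           sums[z][0] = s1; sums[z][1] = s2;
;           sums[z][2] = ss; sums[z][3] = s12;
;           pix1 += 4;
;           pix2 += 4;
;       }
;   }
;
; In SSIM_ITER, m1/m2 accumulate the s1/s2 column sums as words while m3/m4
; accumulate ss/s12 as dwords; the horizontal reductions after the four
; iterations fold these into the sums[2][4] layout above.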
2725 | %macro SSIM_ITER 1 | |
2726 | %if HIGH_BIT_DEPTH | |
2727 | movdqu m5, [r0+(%1&1)*r1] | |
2728 | movdqu m6, [r2+(%1&1)*r3] | |
2729 | %else | |
2730 | movq m5, [r0+(%1&1)*r1] | |
2731 | movq m6, [r2+(%1&1)*r3] | |
2732 | punpcklbw m5, m0 | |
2733 | punpcklbw m6, m0 | |
2734 | %endif | |
2735 | %if %1==1 | |
2736 | lea r0, [r0+r1*2] | |
2737 | lea r2, [r2+r3*2] | |
2738 | %endif | |
2739 | %if %1==0 | |
2740 | movdqa m1, m5 | |
2741 | movdqa m2, m6 | |
2742 | %else | |
2743 | paddw m1, m5 | |
2744 | paddw m2, m6 | |
2745 | %endif | |
2746 | pmaddwd m7, m5, m6 | |
2747 | pmaddwd m5, m5 | |
2748 | pmaddwd m6, m6 | |
2749 | ACCUM paddd, 3, 5, %1 | |
2750 | ACCUM paddd, 4, 7, %1 | |
2751 | paddd m3, m6 | |
2752 | %endmacro | |
2753 | ||
2754 | %macro SSIM 0 | |
2755 | cglobal pixel_ssim_4x4x2_core, 4,4,8 | |
2756 | FIX_STRIDES r1, r3 | |
2757 | pxor m0, m0 | |
2758 | SSIM_ITER 0 | |
2759 | SSIM_ITER 1 | |
2760 | SSIM_ITER 2 | |
2761 | SSIM_ITER 3 | |
2762 | ; PHADDW m1, m2 | |
2763 | ; PHADDD m3, m4 | |
2764 | movdqa m7, [pw_1] | |
2765 | pshufd m5, m3, q2301 | |
2766 | pmaddwd m1, m7 | |
2767 | pmaddwd m2, m7 | |
2768 | pshufd m6, m4, q2301 | |
2769 | packssdw m1, m2 | |
2770 | paddd m3, m5 | |
2771 | pshufd m1, m1, q3120 | |
2772 | paddd m4, m6 | |
2773 | pmaddwd m1, m7 | |
2774 | punpckhdq m5, m3, m4 | |
2775 | punpckldq m3, m4 | |
2776 | ||
2777 | %if UNIX64 | |
2778 | %define t0 r4 | |
2779 | %else | |
2780 | %define t0 rax | |
2781 | mov t0, r4mp | |
2782 | %endif | |
2783 | ||
2784 | movq [t0+ 0], m1 | |
2785 | movq [t0+ 8], m3 | |
2786 | movhps [t0+16], m1 | |
2787 | movq [t0+24], m5 | |
2788 | RET | |
2789 | ||
2790 | ;----------------------------------------------------------------------------- | |
2791 | ; float pixel_ssim_end( int sum0[5][4], int sum1[5][4], int width ) | |
2792 | ;----------------------------------------------------------------------------- | |
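; Adds the two input sum arrays, combines each group of four neighbouring
; 4x4 blocks into one 8x8 window (64 samples), and evaluates the SSIM
; formula per window. As the inline comments below trace it:
;
;   vars    = ss*64   - s1*s1 - s2*s2
;   covar*2 = s12*128 - 2*s1*s2
;   ssim    = (2*s1*s2 + ssim_c1) * (covar*2 + ssim_c2)
;           / ((s1*s1 + s2*s2 + ssim_c1) * (vars + ssim_c2))
;
; The four per-window results are then summed into a single float; when
; width < 4, the mask_ff load keeps only the first `width` lanes.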
2793 | cglobal pixel_ssim_end4, 2,3 | |
2794 | mov r2d, r2m | |
2795 | mova m0, [r0+ 0] | |
2796 | mova m1, [r0+16] | |
2797 | mova m2, [r0+32] | |
2798 | mova m3, [r0+48] | |
2799 | mova m4, [r0+64] | |
2800 | paddd m0, [r1+ 0] | |
2801 | paddd m1, [r1+16] | |
2802 | paddd m2, [r1+32] | |
2803 | paddd m3, [r1+48] | |
2804 | paddd m4, [r1+64] | |
2805 | paddd m0, m1 | |
2806 | paddd m1, m2 | |
2807 | paddd m2, m3 | |
2808 | paddd m3, m4 | |
2809 | TRANSPOSE4x4D 0, 1, 2, 3, 4 | |
2810 | ||
2811 | ; s1=m0, s2=m1, ss=m2, s12=m3 | |
2812 | %if BIT_DEPTH == 10 | |
2813 | cvtdq2ps m0, m0 | |
2814 | cvtdq2ps m1, m1 | |
2815 | cvtdq2ps m2, m2 | |
2816 | cvtdq2ps m3, m3 | |
2817 | mulps m4, m0, m1 ; s1*s2 | |
2818 | mulps m0, m0 ; s1*s1 | |
2819 | mulps m1, m1 ; s2*s2 | |
2820 | mulps m2, [pf_64] ; ss*64 | |
2821 | mulps m3, [pf_128] ; s12*128 | |
2822 | addps m4, m4 ; s1*s2*2 | |
2823 | addps m0, m1 ; s1*s1 + s2*s2 | |
2824 | subps m2, m0 ; vars | |
2825 | subps m3, m4 ; covar*2 | |
2826 | movaps m1, [ssim_c1] | |
2827 | addps m4, m1 ; s1*s2*2 + ssim_c1 | |
2828 | addps m0, m1 ; s1*s1 + s2*s2 + ssim_c1 | |
2829 | movaps m1, [ssim_c2] | |
2830 | addps m2, m1 ; vars + ssim_c2 | |
2831 | addps m3, m1 ; covar*2 + ssim_c2 | |
2832 | %else | |
2833 | pmaddwd m4, m1, m0 ; s1*s2 | |
2834 | pslld m1, 16 | |
2835 | por m0, m1 | |
2836 | pmaddwd m0, m0 ; s1*s1 + s2*s2 | |
2837 | pslld m4, 1 | |
2838 | pslld m3, 7 | |
2839 | pslld m2, 6 | |
2840 | psubd m3, m4 ; covar*2 | |
2841 | psubd m2, m0 ; vars | |
2842 | mova m1, [ssim_c1] | |
2843 | paddd m0, m1 | |
2844 | paddd m4, m1 | |
2845 | mova m1, [ssim_c2] | |
2846 | paddd m3, m1 | |
2847 | paddd m2, m1 | |
2848 | cvtdq2ps m0, m0 ; (float)(s1*s1 + s2*s2 + ssim_c1) | |
2849 | cvtdq2ps m4, m4 ; (float)(s1*s2*2 + ssim_c1) | |
2850 | cvtdq2ps m3, m3 ; (float)(covar*2 + ssim_c2) | |
2851 | cvtdq2ps m2, m2 ; (float)(vars + ssim_c2) | |
2852 | %endif | |
2853 | mulps m4, m3 | |
2854 | mulps m0, m2 | |
2855 | divps m4, m0 ; ssim | |
2856 | ||
2857 | cmp r2d, 4 | |
2858 | je .skip ; faster only if this is the common case; remove branch if we use ssim on a macroblock level | |
2859 | neg r2 | |
2860 | ||
2861 | %ifdef PIC | |
2862 | lea r3, [mask_ff + 16] | |
2863 | %xdefine %%mask r3 | |
2864 | %else | |
2865 | %xdefine %%mask mask_ff + 16 | |
2866 | %endif | |
2867 | %if cpuflag(avx) | |
2868 | andps m4, [%%mask + r2*4] | |
2869 | %else | |
2870 | movups m0, [%%mask + r2*4] | |
2871 | andps m4, m0 | |
2872 | %endif | |
2873 | ||
2874 | .skip: | |
2875 | movhlps m0, m4 | |
2876 | addps m0, m4 | |
2877 | %if cpuflag(ssse3) | |
2878 | movshdup m4, m0 | |
2879 | %else | |
2880 | pshuflw m4, m0, q0032 | |
2881 | %endif | |
2882 | addss m0, m4 | |
2883 | %if ARCH_X86_64 == 0 | |
2884 | movss r0m, m0 | |
2885 | fld dword r0m | |
2886 | %endif | |
2887 | RET | |
2888 | %endmacro ; SSIM | |
2889 | ||
2890 | INIT_XMM sse2 | |
2891 | SSIM | |
2892 | INIT_XMM avx | |
2893 | SSIM | |
2894 | ||
2895 | ;----------------------------------------------------------------- | |
2896 | ; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/) | |
2897 | ;----------------------------------------------------------------- | |
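; Horizontal 2:1 downscale of a 128-pixel row: each output pixel is the
; rounded average of two adjacent inputs. A minimal C sketch:
;
;   void scale1D_128to64_c(pixel *dst, const pixel *src, intptr_t /*stride*/)
;   {
;       for (int x = 0; x < 64; x++)
;           dst[x] = (pixel)((src[2 * x] + src[2 * x + 1] + 1) >> 1);
;   }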
2898 | INIT_XMM ssse3 | |
2899 | cglobal scale1D_128to64, 2, 2, 8, dest, src1, stride | |
2900 | %if HIGH_BIT_DEPTH | |
2901 | mova m7, [deinterleave_word_shuf] | |
2902 | ||
2903 | movu m0, [r1] | |
2904 | palignr m1, m0, 2 | |
2905 | movu m2, [r1 + 16] | |
2906 | palignr m3, m2, 2 | |
2907 | movu m4, [r1 + 32] | |
2908 | palignr m5, m4, 2 | |
2909 | movu m6, [r1 + 48] | |
2910 | pavgw m0, m1 | |
2911 | palignr m1, m6, 2 | |
2912 | pavgw m2, m3 | |
2913 | pavgw m4, m5 | |
2914 | pavgw m6, m1 | |
2915 | pshufb m0, m0, m7 | |
2916 | pshufb m2, m2, m7 | |
2917 | pshufb m4, m4, m7 | |
2918 | pshufb m6, m6, m7 | |
2919 | punpcklqdq m0, m2 | |
2920 | movu [r0], m0 | |
2921 | punpcklqdq m4, m6 | |
2922 | movu [r0 + 16], m4 | |
2923 | |
2926 | movu m0, [r1 + 64] | |
2927 | palignr m1, m0, 2 | |
2928 | movu m2, [r1 + 80] | |
2929 | palignr m3, m2, 2 | |
2930 | movu m4, [r1 + 96] | |
2931 | palignr m5, m4, 2 | |
2932 | movu m6, [r1 + 112] | |
2933 | pavgw m0, m1 | |
2934 | palignr m1, m6, 2 | |
2935 | pavgw m2, m3 | |
2936 | pavgw m4, m5 | |
2937 | pavgw m6, m1 | |
2938 | pshufb m0, m0, m7 | |
2939 | pshufb m2, m2, m7 | |
2940 | pshufb m4, m4, m7 | |
2941 | pshufb m6, m6, m7 | |
2942 | punpcklqdq m0, m2 | |
2943 | movu [r0 + 32], m0 | |
2944 | punpcklqdq m4, m6 | |
2945 | movu [r0 + 48], m4 | |
2946 | ||
2947 | movu m0, [r1 + 128] | |
2948 | palignr m1, m0, 2 | |
2949 | movu m2, [r1 + 144] | |
2950 | palignr m3, m2, 2 | |
2951 | movu m4, [r1 + 160] | |
2952 | palignr m5, m4, 2 | |
2953 | movu m6, [r1 + 176] | |
2954 | pavgw m0, m1 | |
2955 | palignr m1, m6, 2 | |
2956 | pavgw m2, m3 | |
2957 | pavgw m4, m5 | |
2958 | pavgw m6, m1 | |
2959 | pshufb m0, m0, m7 | |
2960 | pshufb m2, m2, m7 | |
2961 | pshufb m4, m4, m7 | |
2962 | pshufb m6, m6, m7 | |
2963 | ||
2964 | punpcklqdq m0, m2 | |
2965 | movu [r0 + 64], m0 | |
2966 | punpcklqdq m4, m6 | |
2967 | movu [r0 + 80], m4 | |
2968 | ||
2969 | movu m0, [r1 + 192] | |
2970 | palignr m1, m0, 2 | |
2971 | movu m2, [r1 + 208] | |
2972 | palignr m3, m2, 2 | |
2973 | movu m4, [r1 + 224] | |
2974 | palignr m5, m4, 2 | |
2975 | movu m6, [r1 + 240] | |
2976 | pavgw m0, m1 | |
2977 | palignr m1, m6, 2 | |
2978 | pavgw m2, m3 | |
2979 | pavgw m4, m5 | |
2980 | pavgw m6, m1 | |
2981 | pshufb m0, m0, m7 | |
2982 | pshufb m2, m2, m7 | |
2983 | pshufb m4, m4, m7 | |
2984 | pshufb m6, m6, m7 | |
2985 | ||
2986 | punpcklqdq m0, m2 | |
2987 | movu [r0 + 96], m0 | |
2988 | punpcklqdq m4, m6 | |
2989 | movu [r0 + 112], m4 | |
2990 | ||
2991 | %else | |
2992 | mova m7, [deinterleave_shuf] | |
2993 | ||
2994 | movu m0, [r1] | |
2995 | palignr m1, m0, 1 | |
2996 | movu m2, [r1 + 16] | |
2997 | palignr m3, m2, 1 | |
2998 | movu m4, [r1 + 32] | |
2999 | palignr m5, m4, 1 | |
3000 | movu m6, [r1 + 48] | |
3001 | ||
3002 | pavgb m0, m1 | |
3003 | ||
3004 | palignr m1, m6, 1 | |
3005 | ||
3006 | pavgb m2, m3 | |
3007 | pavgb m4, m5 | |
3008 | pavgb m6, m1 | |
3009 | ||
3010 | pshufb m0, m0, m7 | |
3011 | pshufb m2, m2, m7 | |
3012 | pshufb m4, m4, m7 | |
3013 | pshufb m6, m6, m7 | |
3014 | ||
3015 | punpcklqdq m0, m2 | |
3016 | movu [r0], m0 | |
3017 | punpcklqdq m4, m6 | |
3018 | movu [r0 + 16], m4 | |
3019 | ||
3020 | movu m0, [r1 + 64] | |
3021 | palignr m1, m0, 1 | |
3022 | movu m2, [r1 + 80] | |
3023 | palignr m3, m2, 1 | |
3024 | movu m4, [r1 + 96] | |
3025 | palignr m5, m4, 1 | |
3026 | movu m6, [r1 + 112] | |
3027 | ||
3028 | pavgb m0, m1 | |
3029 | ||
3030 | palignr m1, m6, 1 | |
3031 | ||
3032 | pavgb m2, m3 | |
3033 | pavgb m4, m5 | |
3034 | pavgb m6, m1 | |
3035 | ||
3036 | pshufb m0, m0, m7 | |
3037 | pshufb m2, m2, m7 | |
3038 | pshufb m4, m4, m7 | |
3039 | pshufb m6, m6, m7 | |
3040 | ||
3041 | punpcklqdq m0, m2 | |
3042 | movu [r0 + 32], m0 | |
3043 | punpcklqdq m4, m6 | |
3044 | movu [r0 + 48], m4 | |
3045 | %endif | |
3046 | RET | |
3047 | ||
3048 | %if HIGH_BIT_DEPTH == 1 | |
3049 | INIT_YMM avx2 | |
3050 | cglobal scale1D_128to64, 2, 2, 3 | |
3051 | pxor m2, m2 | |
3052 | ||
3053 | movu m0, [r1] | |
3054 | movu m1, [r1 + 32] | |
3055 | phaddw m0, m1 | |
3056 | pavgw m0, m2 | |
3057 | vpermq m0, m0, 0xD8 | |
3058 | movu [r0], m0 | |
3059 | ||
3060 | movu m0, [r1 + 64] | |
3061 | movu m1, [r1 + 96] | |
3062 | phaddw m0, m1 | |
3063 | pavgw m0, m2 | |
3064 | vpermq m0, m0, 0xD8 | |
3065 | movu [r0 + 32], m0 | |
3066 | ||
3067 | movu m0, [r1 + 128] | |
3068 | movu m1, [r1 + 160] | |
3069 | phaddw m0, m1 | |
3070 | pavgw m0, m2 | |
3071 | vpermq m0, m0, 0xD8 | |
3072 | movu [r0 + 64], m0 | |
3073 | ||
3074 | movu m0, [r1 + 192] | |
3075 | movu m1, [r1 + 224] | |
3076 | phaddw m0, m1 | |
3077 | pavgw m0, m2 | |
3078 | vpermq m0, m0, 0xD8 | |
3079 | movu [r0 + 96], m0 | |
3080 | RET | |
3081 | %else ; HIGH_BIT_DEPTH == 0 | |
3082 | INIT_YMM avx2 | |
3083 | cglobal scale1D_128to64, 2, 2, 4 | |
3084 | pxor m2, m2 | |
3085 | mova m3, [pb_1] | |
3086 | ||
3087 | movu m0, [r1] | |
3088 | pmaddubsw m0, m0, m3 | |
3089 | pavgw m0, m2 | |
3090 | movu m1, [r1 + 32] | |
3091 | pmaddubsw m1, m1, m3 | |
3092 | pavgw m1, m2 | |
3093 | packuswb m0, m1 | |
3094 | vpermq m0, m0, 0xD8 | |
3095 | movu [r0], m0 | |
3096 | ||
3097 | movu m0, [r1 + 64] | |
3098 | pmaddubsw m0, m0, m3 | |
3099 | pavgw m0, m2 | |
3100 | movu m1, [r1 + 96] | |
3101 | pmaddubsw m1, m1, m3 | |
3102 | pavgw m1, m2 | |
3103 | packuswb m0, m1 | |
3104 | vpermq m0, m0, 0xD8 | |
3105 | movu [r0 + 32], m0 | |
3106 | RET | |
3107 | %endif | |
3108 | ||
3109 | ;----------------------------------------------------------------- | |
3110 | ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) | |
3111 | ;----------------------------------------------------------------- | |
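; 2x2 box downscale with round-to-nearest: out = (i + j + k + l + 2) >> 2.
; Averaging twice with pavg alone can round up one too far, so the kernels
; compute the correction term ((i^j)|(k^l)) & (s^t) & 1 (with s = avg(i,j),
; t = avg(k,l)) and subtract it to recover the exact rounded mean. A minimal
; C sketch of the result (assumption: dst is a packed 32x32 block, matching
; the fixed 32-pixel output rows written below):
;
;   void scale2D_64to32_c(pixel *dst, const pixel *src, intptr_t stride)
;   {
;       for (int y = 0; y < 32; y++)
;           for (int x = 0; x < 32; x++)
;           {
;               int i = src[(2 * y)     * stride + 2 * x];
;               int j = src[(2 * y)     * stride + 2 * x + 1];
;               int k = src[(2 * y + 1) * stride + 2 * x];
;               int l = src[(2 * y + 1) * stride + 2 * x + 1];
;               dst[y * 32 + x] = (pixel)((i + j + k + l + 2) >> 2);
;           }
;   }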
3112 | %if HIGH_BIT_DEPTH | |
3113 | INIT_XMM ssse3 | |
3114 | cglobal scale2D_64to32, 3, 4, 8, dest, src, stride | |
3115 | mov r3d, 32 | |
3116 | mova m7, [deinterleave_word_shuf] | |
3117 | add r2, r2 | |
3118 | .loop: | |
3119 | movu m0, [r1] ;i | |
3120 | psrld m1, m0, 16 ;j | |
3121 | movu m2, [r1 + r2] ;k | |
3122 | psrld m3, m2, 16 ;l | |
3123 | movu m4, m0 | |
3124 | movu m5, m2 | |
3125 | pxor m4, m1 ;i^j | |
3126 | pxor m5, m3 ;k^l | |
3127 | por m4, m5 ;ij|kl | |
3128 | pavgw m0, m1 ;s | |
3129 | pavgw m2, m3 ;t | |
3130 | movu m5, m0 | |
3131 | pavgw m0, m2 ;(s+t+1)/2 | |
3132 | pxor m5, m2 ;s^t | |
3133 | pand m4, m5 ;(ij|kl)&st | |
3134 | pand m4, [hmulw_16p] | |
3135 | psubw m0, m4 ;Result | |
3136 | movu m1, [r1 + 16] ;i | |
3137 | psrld m2, m1, 16 ;j | |
3138 | movu m3, [r1 + r2 + 16] ;k | |
3139 | psrld m4, m3, 16 ;l | |
3140 | movu m5, m1 | |
3141 | movu m6, m3 | |
3142 | pxor m5, m2 ;i^j | |
3143 | pxor m6, m4 ;k^l | |
3144 | por m5, m6 ;ij|kl | |
3145 | pavgw m1, m2 ;s | |
3146 | pavgw m3, m4 ;t | |
3147 | movu m6, m1 | |
3148 | pavgw m1, m3 ;(s+t+1)/2 | |
3149 | pxor m6, m3 ;s^t | |
3150 | pand m5, m6 ;(ij|kl)&st | |
3151 | pand m5, [hmulw_16p] | |
3152 | psubw m1, m5 ;Result | |
3153 | pshufb m0, m7 | |
3154 | pshufb m1, m7 | |
3155 | ||
3156 | punpcklqdq m0, m1 | |
3157 | movu [r0], m0 | |
3158 | ||
3159 | movu m0, [r1 + 32] ;i | |
3160 | psrld m1, m0, 16 ;j | |
3161 | movu m2, [r1 + r2 + 32] ;k | |
3162 | psrld m3, m2, 16 ;l | |
3163 | movu m4, m0 | |
3164 | movu m5, m2 | |
3165 | pxor m4, m1 ;i^j | |
3166 | pxor m5, m3 ;k^l | |
3167 | por m4, m5 ;ij|kl | |
3168 | pavgw m0, m1 ;s | |
3169 | pavgw m2, m3 ;t | |
3170 | movu m5, m0 | |
3171 | pavgw m0, m2 ;(s+t+1)/2 | |
3172 | pxor m5, m2 ;s^t | |
3173 | pand m4, m5 ;(ij|kl)&st | |
3174 | pand m4, [hmulw_16p] | |
3175 | psubw m0, m4 ;Result | |
3176 | movu m1, [r1 + 48] ;i | |
3177 | psrld m2, m1, 16 ;j | |
3178 | movu m3, [r1 + r2 + 48] ;k | |
3179 | psrld m4, m3, 16 ;l | |
3180 | movu m5, m1 | |
3181 | movu m6, m3 | |
3182 | pxor m5, m2 ;i^j | |
3183 | pxor m6, m4 ;k^l | |
3184 | por m5, m6 ;ij|kl | |
3185 | pavgw m1, m2 ;s | |
3186 | pavgw m3, m4 ;t | |
3187 | movu m6, m1 | |
3188 | pavgw m1, m3 ;(s+t+1)/2 | |
3189 | pxor m6, m3 ;s^t | |
3190 | pand m5, m6 ;(ij|kl)&st | |
3191 | pand m5, [hmulw_16p] | |
3192 | psubw m1, m5 ;Result | |
3193 | pshufb m0, m7 | |
3194 | pshufb m1, m7 | |
3195 | ||
3196 | punpcklqdq m0, m1 | |
3197 | movu [r0 + 16], m0 | |
3198 | ||
3199 | movu m0, [r1 + 64] ;i | |
3200 | psrld m1, m0, 16 ;j | |
3201 | movu m2, [r1 + r2 + 64] ;k | |
3202 | psrld m3, m2, 16 ;l | |
3203 | movu m4, m0 | |
3204 | movu m5, m2 | |
3205 | pxor m4, m1 ;i^j | |
3206 | pxor m5, m3 ;k^l | |
3207 | por m4, m5 ;ij|kl | |
3208 | pavgw m0, m1 ;s | |
3209 | pavgw m2, m3 ;t | |
3210 | movu m5, m0 | |
3211 | pavgw m0, m2 ;(s+t+1)/2 | |
3212 | pxor m5, m2 ;s^t | |
3213 | pand m4, m5 ;(ij|kl)&st | |
3214 | pand m4, [hmulw_16p] | |
3215 | psubw m0, m4 ;Result | |
3216 | movu m1, [r1 + 80] ;i | |
3217 | psrld m2, m1, 16 ;j | |
3218 | movu m3, [r1 + r2 + 80] ;k | |
3219 | psrld m4, m3, 16 ;l | |
3220 | movu m5, m1 | |
3221 | movu m6, m3 | |
3222 | pxor m5, m2 ;i^j | |
3223 | pxor m6, m4 ;k^l | |
3224 | por m5, m6 ;ij|kl | |
3225 | pavgw m1, m2 ;s | |
3226 | pavgw m3, m4 ;t | |
3227 | movu m6, m1 | |
3228 | pavgw m1, m3 ;(s+t+1)/2 | |
3229 | pxor m6, m3 ;s^t | |
3230 | pand m5, m6 ;(ij|kl)&st | |
3231 | pand m5, [hmulw_16p] | |
3232 | psubw m1, m5 ;Result | |
3233 | pshufb m0, m7 | |
3234 | pshufb m1, m7 | |
3235 | ||
3236 | punpcklqdq m0, m1 | |
3237 | movu [r0 + 32], m0 | |
3238 | ||
3239 | movu m0, [r1 + 96] ;i | |
3240 | psrld m1, m0, 16 ;j | |
3241 | movu m2, [r1 + r2 + 96] ;k | |
3242 | psrld m3, m2, 16 ;l | |
3243 | movu m4, m0 | |
3244 | movu m5, m2 | |
3245 | pxor m4, m1 ;i^j | |
3246 | pxor m5, m3 ;k^l | |
3247 | por m4, m5 ;ij|kl | |
3248 | pavgw m0, m1 ;s | |
3249 | pavgw m2, m3 ;t | |
3250 | movu m5, m0 | |
3251 | pavgw m0, m2 ;(s+t+1)/2 | |
3252 | pxor m5, m2 ;s^t | |
3253 | pand m4, m5 ;(ij|kl)&st | |
3254 | pand m4, [hmulw_16p] | |
3255 | psubw m0, m4 ;Result | |
3256 | movu m1, [r1 + 112] ;i | |
3257 | psrld m2, m1, 16 ;j | |
3258 | movu m3, [r1 + r2 + 112] ;k | |
3259 | psrld m4, m3, 16 ;l | |
3260 | movu m5, m1 | |
3261 | movu m6, m3 | |
3262 | pxor m5, m2 ;i^j | |
3263 | pxor m6, m4 ;k^l | |
3264 | por m5, m6 ;ij|kl | |
3265 | pavgw m1, m2 ;s | |
3266 | pavgw m3, m4 ;t | |
3267 | movu m6, m1 | |
3268 | pavgw m1, m3 ;(s+t+1)/2 | |
3269 | pxor m6, m3 ;s^t | |
3270 | pand m5, m6 ;(ij|kl)&st | |
3271 | pand m5, [hmulw_16p] | |
3272 | psubw m1, m5 ;Result | |
3273 | pshufb m0, m7 | |
3274 | pshufb m1, m7 | |
3275 | ||
3276 | punpcklqdq m0, m1 | |
3277 | movu [r0 + 48], m0 | |
3278 | lea r0, [r0 + 64] | |
3279 | lea r1, [r1 + 2 * r2] | |
3280 | dec r3d | |
3281 | jnz .loop | |
3282 | RET | |
3283 | %else | |
3284 | ||
3285 | INIT_XMM ssse3 | |
3286 | cglobal scale2D_64to32, 3, 4, 8, dest, src, stride | |
3287 | mov r3d, 32 | |
3288 | mova m7, [deinterleave_shuf] | |
3289 | .loop: | |
3290 | ||
3291 | movu m0, [r1] ;i | |
3292 | psrlw m1, m0, 8 ;j | |
3293 | movu m2, [r1 + r2] ;k | |
3294 | psrlw m3, m2, 8 ;l | |
3295 | movu m4, m0 | |
3296 | movu m5, m2 | |
3297 | ||
3298 | pxor m4, m1 ;i^j | |
3299 | pxor m5, m3 ;k^l | |
3300 | por m4, m5 ;ij|kl | |
3301 | ||
3302 | pavgb m0, m1 ;s | |
3303 | pavgb m2, m3 ;t | |
3304 | movu m5, m0 | |
3305 | pavgb m0, m2 ;(s+t+1)/2 | |
3306 | pxor m5, m2 ;s^t | |
3307 | pand m4, m5 ;(ij|kl)&st | |
3308 | pand m4, [hmul_16p] | |
3309 | psubb m0, m4 ;Result | |
3310 | ||
3311 | movu m1, [r1 + 16] ;i | |
3312 | psrlw m2, m1, 8 ;j | |
3313 | movu m3, [r1 + r2 + 16] ;k | |
3314 | psrlw m4, m3, 8 ;l | |
3315 | movu m5, m1 | |
3316 | movu m6, m3 | |
3317 | ||
3318 | pxor m5, m2 ;i^j | |
3319 | pxor m6, m4 ;k^l | |
3320 | por m5, m6 ;ij|kl | |
3321 | ||
3322 | pavgb m1, m2 ;s | |
3323 | pavgb m3, m4 ;t | |
3324 | movu m6, m1 | |
3325 | pavgb m1, m3 ;(s+t+1)/2 | |
3326 | pxor m6, m3 ;s^t | |
3327 | pand m5, m6 ;(ij|kl)&st | |
3328 | pand m5, [hmul_16p] | |
3329 | psubb m1, m5 ;Result | |
3330 | ||
3331 | pshufb m0, m0, m7 | |
3332 | pshufb m1, m1, m7 | |
3333 | ||
3334 | punpcklqdq m0, m1 | |
3335 | movu [r0], m0 | |
3336 | ||
3337 | movu m0, [r1 + 32] ;i | |
3338 | psrlw m1, m0, 8 ;j | |
3339 | movu m2, [r1 + r2 + 32] ;k | |
3340 | psrlw m3, m2, 8 ;l | |
3341 | movu m4, m0 | |
3342 | movu m5, m2 | |
3343 | ||
3344 | pxor m4, m1 ;i^j | |
3345 | pxor m5, m3 ;k^l | |
3346 | por m4, m5 ;ij|kl | |
3347 | ||
3348 | pavgb m0, m1 ;s | |
3349 | pavgb m2, m3 ;t | |
3350 | movu m5, m0 | |
3351 | pavgb m0, m2 ;(s+t+1)/2 | |
3352 | pxor m5, m2 ;s^t | |
3353 | pand m4, m5 ;(ij|kl)&st | |
3354 | pand m4, [hmul_16p] | |
3355 | psubb m0, m4 ;Result | |
3356 | ||
3357 | movu m1, [r1 + 48] ;i | |
3358 | psrlw m2, m1, 8 ;j | |
3359 | movu m3, [r1 + r2 + 48] ;k | |
3360 | psrlw m4, m3, 8 ;l | |
3361 | movu m5, m1 | |
3362 | movu m6, m3 | |
3363 | ||
3364 | pxor m5, m2 ;i^j | |
3365 | pxor m6, m4 ;k^l | |
3366 | por m5, m6 ;ij|kl | |
3367 | ||
3368 | pavgb m1, m2 ;s | |
3369 | pavgb m3, m4 ;t | |
3370 | movu m6, m1 | |
3371 | pavgb m1, m3 ;(s+t+1)/2 | |
3372 | pxor m6, m3 ;s^t | |
3373 | pand m5, m6 ;(ij|kl)&st | |
3374 | pand m5, [hmul_16p] | |
3375 | psubb m1, m5 ;Result | |
3376 | ||
3377 | pshufb m0, m0, m7 | |
3378 | pshufb m1, m1, m7 | |
3379 | ||
3380 | punpcklqdq m0, m1 | |
3381 | movu [r0 + 16], m0 | |
3382 | ||
3383 | lea r0, [r0 + 32] | |
3384 | lea r1, [r1 + 2 * r2] | |
3385 | dec r3d | |
3386 | jnz .loop | |
3387 | RET | |
3388 | %endif | |
3389 | ||
3390 | ||
3391 | ;----------------------------------------------------------------------------- | |
3392 | ; void pixel_sub_ps_4x4(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
3393 | ;----------------------------------------------------------------------------- | |
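; All pixel_sub_ps_* kernels compute a 16-bit residual block, dest = src0 -
; src1, differing only in block size. A minimal C sketch common to every
; variant below (bx/by stand in for the width/height of each variant):
;
;   void pixel_sub_ps_c(int16_t *dest, intptr_t deststride,
;                       const pixel *src0, const pixel *src1,
;                       intptr_t srcstride0, intptr_t srcstride1)
;   {
;       for (int y = 0; y < by; y++)
;       {
;           for (int x = 0; x < bx; x++)
;               dest[x] = (int16_t)(src0[x] - src1[x]);
;           dest += deststride;   /* strides are in elements, not bytes */
;           src0 += srcstride0;
;           src1 += srcstride1;
;       }
;   }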
3394 | %if HIGH_BIT_DEPTH | |
3395 | INIT_XMM sse2 | |
3396 | cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3397 | add r4, r4 | |
3398 | add r5, r5 | |
3399 | add r1, r1 | |
3400 | movh m0, [r2] | |
3401 | movh m2, [r2 + r4] | |
3402 | movh m1, [r3] | |
3403 | movh m3, [r3 + r5] | |
3404 | lea r2, [r2 + r4 * 2] | |
3405 | lea r3, [r3 + r5 * 2] | |
3406 | movh m4, [r2] | |
3407 | movh m6, [r2 + r4] | |
3408 | movh m5, [r3] | |
3409 | movh m7, [r3 + r5] | |
3410 | ||
3411 | psubw m0, m1 | |
3412 | psubw m2, m3 | |
3413 | psubw m4, m5 | |
3414 | psubw m6, m7 | |
3415 | ||
3416 | movh [r0], m0 | |
3417 | movh [r0 + r1], m2 | |
3418 | lea r0, [r0 + r1 * 2] | |
3419 | movh [r0], m4 | |
3420 | movh [r0 + r1], m6 | |
3421 | ||
3422 | RET | |
3423 | %else | |
3424 | INIT_XMM sse4 | |
3425 | cglobal pixel_sub_ps_4x4, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3426 | add r1, r1 | |
3427 | movd m0, [r2] | |
3428 | movd m2, [r2 + r4] | |
3429 | movd m1, [r3] | |
3430 | movd m3, [r3 + r5] | |
3431 | lea r2, [r2 + r4 * 2] | |
3432 | lea r3, [r3 + r5 * 2] | |
3433 | movd m4, [r2] | |
3434 | movd m6, [r2 + r4] | |
3435 | movd m5, [r3] | |
3436 | movd m7, [r3 + r5] | |
3437 | punpckldq m0, m2 | |
3438 | punpckldq m1, m3 | |
3439 | punpckldq m4, m6 | |
3440 | punpckldq m5, m7 | |
3441 | pmovzxbw m0, m0 | |
3442 | pmovzxbw m1, m1 | |
3443 | pmovzxbw m4, m4 | |
3444 | pmovzxbw m5, m5 | |
3445 | ||
3446 | psubw m0, m1 | |
3447 | psubw m4, m5 | |
3448 | ||
3449 | movh [r0], m0 | |
3450 | movhps [r0 + r1], m0 | |
3451 | movh [r0 + r1 * 2], m4 | |
3452 | lea r0, [r0 + r1 * 2] | |
3453 | movhps [r0 + r1], m4 | |
3454 | ||
3455 | RET | |
3456 | %endif | |
3457 | ||
3458 | ||
3459 | ;----------------------------------------------------------------------------- | |
3460 | ; void pixel_sub_ps_4x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
3461 | ;----------------------------------------------------------------------------- | |
3462 | %macro PIXELSUB_PS_W4_H4 2 | |
3463 | %if HIGH_BIT_DEPTH | |
3464 | cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3465 | mov r6d, %2/4 | |
3466 | add r4, r4 | |
3467 | add r5, r5 | |
3468 | add r1, r1 | |
3469 | .loop: | |
3470 | movh m0, [r2] | |
3471 | movh m2, [r2 + r4] | |
3472 | movh m1, [r3] | |
3473 | movh m3, [r3 + r5] | |
3474 | lea r2, [r2 + r4 * 2] | |
3475 | lea r3, [r3 + r5 * 2] | |
3476 | movh m4, [r2] | |
3477 | movh m6, [r2 + r4] | |
3478 | movh m5, [r3] | |
3479 | movh m7, [r3 + r5] | |
3480 | dec r6d | |
3481 | lea r2, [r2 + r4 * 2] | |
3482 | lea r3, [r3 + r5 * 2] | |
3483 | ||
3484 | psubw m0, m1 | |
3485 | psubw m2, m3 | |
3486 | psubw m4, m5 | |
3487 | psubw m6, m7 | |
3488 | ||
3489 | movh [r0], m0 | |
3490 | movh [r0 + r1], m2 | |
3491 | movh [r0 + r1 * 2], m4 | |
3492 | lea r0, [r0 + r1 * 2] | |
3493 | movh [r0 + r1], m6 | |
3494 | lea r0, [r0 + r1 * 2] | |
3495 | ||
3496 | jnz .loop | |
3497 | RET | |
3498 | %else | |
3499 | cglobal pixel_sub_ps_4x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3500 | mov r6d, %2/4 | |
3501 | add r1, r1 | |
3502 | .loop: | |
3503 | movd m0, [r2] | |
3504 | movd m2, [r2 + r4] | |
3505 | movd m1, [r3] | |
3506 | movd m3, [r3 + r5] | |
3507 | lea r2, [r2 + r4 * 2] | |
3508 | lea r3, [r3 + r5 * 2] | |
3509 | movd m4, [r2] | |
3510 | movd m6, [r2 + r4] | |
3511 | movd m5, [r3] | |
3512 | movd m7, [r3 + r5] | |
3513 | dec r6d | |
3514 | lea r2, [r2 + r4 * 2] | |
3515 | lea r3, [r3 + r5 * 2] | |
3516 | punpckldq m0, m2 | |
3517 | punpckldq m1, m3 | |
3518 | punpckldq m4, m6 | |
3519 | punpckldq m5, m7 | |
3520 | pmovzxbw m0, m0 | |
3521 | pmovzxbw m1, m1 | |
3522 | pmovzxbw m4, m4 | |
3523 | pmovzxbw m5, m5 | |
3524 | ||
3525 | psubw m0, m1 | |
3526 | psubw m4, m5 | |
3527 | ||
3528 | movh [r0], m0 | |
3529 | movhps [r0 + r1], m0 | |
3530 | movh [r0 + r1 * 2], m4 | |
3531 | lea r0, [r0 + r1 * 2] | |
3532 | movhps [r0 + r1], m4 | |
3533 | lea r0, [r0 + r1 * 2] | |
3534 | ||
3535 | jnz .loop | |
3536 | RET | |
3537 | %endif | |
3538 | %endmacro | |
3539 | ||
3540 | %if HIGH_BIT_DEPTH | |
3541 | INIT_XMM sse2 | |
3542 | PIXELSUB_PS_W4_H4 4, 8 | |
3543 | %else | |
3544 | INIT_XMM sse4 | |
3545 | PIXELSUB_PS_W4_H4 4, 8 | |
3546 | %endif | |
3547 | ||
3548 | ||
3549 | ;----------------------------------------------------------------------------- | |
3550 | ; void pixel_sub_ps_8x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
3551 | ;----------------------------------------------------------------------------- | |
3552 | %macro PIXELSUB_PS_W8_H4 2 | |
3553 | %if HIGH_BIT_DEPTH | |
3554 | cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3555 | mov r6d, %2/4 | |
3556 | add r4, r4 | |
3557 | add r5, r5 | |
3558 | add r1, r1 | |
3559 | .loop: | |
3560 | movu m0, [r2] | |
3561 | movu m2, [r2 + r4] | |
3562 | movu m1, [r3] | |
3563 | movu m3, [r3 + r5] | |
3564 | lea r2, [r2 + r4 * 2] | |
3565 | lea r3, [r3 + r5 * 2] | |
3566 | movu m4, [r2] | |
3567 | movu m6, [r2 + r4] | |
3568 | movu m5, [r3] | |
3569 | movu m7, [r3 + r5] | |
3570 | dec r6d | |
3571 | lea r2, [r2 + r4 * 2] | |
3572 | lea r3, [r3 + r5 * 2] | |
3573 | ||
3574 | psubw m0, m1 | |
3575 | psubw m2, m3 | |
3576 | psubw m4, m5 | |
3577 | psubw m6, m7 | |
3578 | ||
3579 | movu [r0], m0 | |
3580 | movu [r0 + r1], m2 | |
3581 | movu [r0 + r1 * 2], m4 | |
3582 | lea r0, [r0 + r1 * 2] | |
3583 | movu [r0 + r1], m6 | |
3584 | lea r0, [r0 + r1 * 2] | |
3585 | ||
3586 | jnz .loop | |
3587 | RET | |
3588 | %else | |
3589 | cglobal pixel_sub_ps_8x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 | |
3590 | mov r6d, %2/4 | |
3591 | add r1, r1 | |
3592 | .loop: | |
3593 | movh m0, [r2] | |
3594 | movh m2, [r2 + r4] | |
3595 | movh m1, [r3] | |
3596 | movh m3, [r3 + r5] | |
3597 | lea r2, [r2 + r4 * 2] | |
3598 | lea r3, [r3 + r5 * 2] | |
3599 | movh m4, [r2] | |
3600 | movh m6, [r2 + r4] | |
3601 | movh m5, [r3] | |
3602 | movh m7, [r3 + r5] | |
3603 | dec r6d | |
3604 | lea r2, [r2 + r4 * 2] | |
3605 | lea r3, [r3 + r5 * 2] | |
3606 | pmovzxbw m0, m0 | |
3607 | pmovzxbw m1, m1 | |
3608 | pmovzxbw m2, m2 | |
3609 | pmovzxbw m3, m3 | |
3610 | pmovzxbw m4, m4 | |
3611 | pmovzxbw m5, m5 | |
3612 | pmovzxbw m6, m6 | |
3613 | pmovzxbw m7, m7 | |
3614 | ||
3615 | psubw m0, m1 | |
3616 | psubw m2, m3 | |
3617 | psubw m4, m5 | |
3618 | psubw m6, m7 | |
3619 | ||
3620 | movu [r0], m0 | |
3621 | movu [r0 + r1], m2 | |
3622 | movu [r0 + r1 * 2], m4 | |
3623 | lea r0, [r0 + r1 * 2] | |
3624 | movu [r0 + r1], m6 | |
3625 | lea r0, [r0 + r1 * 2] | |
3626 | ||
3627 | jnz .loop | |
3628 | RET | |
3629 | %endif | |
3630 | %endmacro | |
3631 | ||
3632 | %if HIGH_BIT_DEPTH | |
3633 | INIT_XMM sse2 | |
3634 | PIXELSUB_PS_W8_H4 8, 8 | |
3635 | PIXELSUB_PS_W8_H4 8, 16 | |
3636 | %else | |
3637 | INIT_XMM sse4 | |
3638 | PIXELSUB_PS_W8_H4 8, 8 | |
3639 | PIXELSUB_PS_W8_H4 8, 16 | |
3640 | %endif | |
3641 | ||
3642 | ||
3643 | ;----------------------------------------------------------------------------- | |
3644 | ; void pixel_sub_ps_16x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); | |
3645 | ;----------------------------------------------------------------------------- | |
%macro PIXELSUB_PS_W16_H4 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_16x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
    mov          r6d, %2/4
    add          r4, r4
    add          r5, r5
    add          r1, r1
.loop:
    movu         m0, [r2]
    movu         m2, [r2 + 16]
    movu         m1, [r3]
    movu         m3, [r3 + 16]
    movu         m4, [r2 + r4]
    movu         m6, [r2 + r4 + 16]
    movu         m5, [r3 + r5]
    movu         m7, [r3 + r5 + 16]
    dec          r6d
    lea          r2, [r2 + r4 * 2]
    lea          r3, [r3 + r5 * 2]

    psubw        m0, m1
    psubw        m2, m3
    psubw        m4, m5
    psubw        m6, m7

    movu         [r0], m0
    movu         [r0 + 16], m2
    movu         [r0 + r1], m4
    movu         [r0 + r1 + 16], m6

    movu         m0, [r2]
    movu         m2, [r2 + 16]
    movu         m1, [r3]
    movu         m3, [r3 + 16]
    movu         m4, [r2 + r4]
    movu         m5, [r3 + r5]
    movu         m6, [r2 + r4 + 16]
    movu         m7, [r3 + r5 + 16]
    lea          r0, [r0 + r1 * 2]
    lea          r2, [r2 + r4 * 2]
    lea          r3, [r3 + r5 * 2]

    psubw        m0, m1
    psubw        m2, m3
    psubw        m4, m5
    psubw        m6, m7

    movu         [r0], m0
    movu         [r0 + 16], m2
    movu         [r0 + r1], m4
    movu         [r0 + r1 + 16], m6
    lea          r0, [r0 + r1 * 2]

    jnz          .loop
    RET
%else
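; 8-bit path: m6 is kept zero so each unaligned 16-byte row load can be
; widened to words in two halves, pmovzxbw for the low eight bytes (hence
; the SSE4.1 requirement) and punpckhbw against m6 for the high eight,
; giving two 8-pixel word subtractions per load.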
cglobal pixel_sub_ps_16x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
    mov          r6d, %2/4
    pxor         m6, m6
    add          r1, r1
.loop:
    movu         m1, [r2]
    movu         m3, [r3]
    pmovzxbw     m0, m1
    pmovzxbw     m2, m3
    punpckhbw    m1, m6
    punpckhbw    m3, m6

    psubw        m0, m2
    psubw        m1, m3

    movu         m5, [r2 + r4]
    movu         m3, [r3 + r5]
    lea          r2, [r2 + r4 * 2]
    lea          r3, [r3 + r5 * 2]
    pmovzxbw     m4, m5
    pmovzxbw     m2, m3
    punpckhbw    m5, m6
    punpckhbw    m3, m6

    psubw        m4, m2
    psubw        m5, m3

    movu         [r0], m0
    movu         [r0 + 16], m1
    movu         [r0 + r1], m4
    movu         [r0 + r1 + 16], m5

    movu         m1, [r2]
    movu         m3, [r3]
    pmovzxbw     m0, m1
    pmovzxbw     m2, m3
    punpckhbw    m1, m6
    punpckhbw    m3, m6

    psubw        m0, m2
    psubw        m1, m3

    movu         m5, [r2 + r4]
    movu         m3, [r3 + r5]
    dec          r6d
    lea          r2, [r2 + r4 * 2]
    lea          r3, [r3 + r5 * 2]
    lea          r0, [r0 + r1 * 2]
    pmovzxbw     m4, m5
    pmovzxbw     m2, m3
    punpckhbw    m5, m6
    punpckhbw    m3, m6

    psubw        m4, m2
    psubw        m5, m3

    movu         [r0], m0
    movu         [r0 + 16], m1
    movu         [r0 + r1], m4
    movu         [r0 + r1 + 16], m5
    lea          r0, [r0 + r1 * 2]

    jnz          .loop
    RET
%endif
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W16_H4 16, 16
PIXELSUB_PS_W16_H4 16, 32
%else
INIT_XMM sse4
PIXELSUB_PS_W16_H4 16, 16
PIXELSUB_PS_W16_H4 16, 32
%endif

;-----------------------------------------------------------------------------
; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W32_H2 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
    mov          r6d, %2/2
    add          r4, r4
    add          r5, r5
    add          r1, r1
.loop:
    movu         m0, [r2]
    movu         m2, [r2 + 16]
    movu         m4, [r2 + 32]
    movu         m6, [r2 + 48]
    movu         m1, [r3]
    movu         m3, [r3 + 16]
    movu         m5, [r3 + 32]
    movu         m7, [r3 + 48]
    dec          r6d

    psubw        m0, m1
    psubw        m2, m3
    psubw        m4, m5
    psubw        m6, m7

    movu         [r0], m0
    movu         [r0 + 16], m2
    movu         [r0 + 32], m4
    movu         [r0 + 48], m6

    movu         m0, [r2 + r4]
    movu         m2, [r2 + r4 + 16]
    movu         m4, [r2 + r4 + 32]
    movu         m6, [r2 + r4 + 48]
    movu         m1, [r3 + r5]
    movu         m3, [r3 + r5 + 16]
    movu         m5, [r3 + r5 + 32]
    movu         m7, [r3 + r5 + 48]
    lea          r2, [r2 + r4 * 2]
    lea          r3, [r3 + r5 * 2]

    psubw        m0, m1
    psubw        m2, m3
    psubw        m4, m5
    psubw        m6, m7

    movu         [r0 + r1], m0
    movu         [r0 + r1 + 16], m2
    movu         [r0 + r1 + 32], m4
    movu         [r0 + r1 + 48], m6
    lea          r0, [r0 + r1 * 2]

    jnz          .loop
    RET
%else
cglobal pixel_sub_ps_32x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
    mov          r6d, %2/2
    add          r1, r1
.loop:
    movh         m0, [r2]
    movh         m1, [r2 + 8]
    movh         m2, [r2 + 16]
    movh         m6, [r2 + 24]
    movh         m3, [r3]
    movh         m4, [r3 + 8]
    movh         m5, [r3 + 16]
    movh         m7, [r3 + 24]
    dec          r6d
    pmovzxbw     m0, m0
    pmovzxbw     m1, m1
    pmovzxbw     m2, m2
    pmovzxbw     m6, m6
    pmovzxbw     m3, m3
    pmovzxbw     m4, m4
    pmovzxbw     m5, m5
    pmovzxbw     m7, m7

    psubw        m0, m3
    psubw        m1, m4
    psubw        m2, m5
    psubw        m6, m7

    movu         [r0], m0
    movu         [r0 + 16], m1
    movu         [r0 + 32], m2
    movu         [r0 + 48], m6

    movh         m0, [r2 + r4]
    movh         m1, [r2 + r4 + 8]
    movh         m2, [r2 + r4 + 16]
    movh         m6, [r2 + r4 + 24]
    movh         m3, [r3 + r5]
    movh         m4, [r3 + r5 + 8]
    movh         m5, [r3 + r5 + 16]
    movh         m7, [r3 + r5 + 24]
    lea          r2, [r2 + r4 * 2]
    lea          r3, [r3 + r5 * 2]
    pmovzxbw     m0, m0
    pmovzxbw     m1, m1
    pmovzxbw     m2, m2
    pmovzxbw     m6, m6
    pmovzxbw     m3, m3
    pmovzxbw     m4, m4
    pmovzxbw     m5, m5
    pmovzxbw     m7, m7

    psubw        m0, m3
    psubw        m1, m4
    psubw        m2, m5
    psubw        m6, m7

    movu         [r0 + r1], m0
    movu         [r0 + r1 + 16], m1
    movu         [r0 + r1 + 32], m2
    movu         [r0 + r1 + 48], m6
    lea          r0, [r0 + r1 * 2]

    jnz          .loop
    RET
%endif
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W32_H2 32, 32
PIXELSUB_PS_W32_H2 32, 64
%else
INIT_XMM sse4
PIXELSUB_PS_W32_H2 32, 32
PIXELSUB_PS_W32_H2 32, 64
%endif

;-----------------------------------------------------------------------------
; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t deststride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W64_H2 2
%if HIGH_BIT_DEPTH
cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
    mov          r6d, %2/2
    add          r4, r4
    add          r5, r5
    add          r1, r1
.loop:
    movu         m0, [r2]
    movu         m2, [r2 + 16]
    movu         m4, [r2 + 32]
    movu         m6, [r2 + 48]
    movu         m1, [r3]
    movu         m3, [r3 + 16]
    movu         m5, [r3 + 32]
    movu         m7, [r3 + 48]

    psubw        m0, m1
    psubw        m2, m3
    psubw        m4, m5
    psubw        m6, m7

    movu         [r0], m0
    movu         [r0 + 16], m2
    movu         [r0 + 32], m4
    movu         [r0 + 48], m6

    movu         m0, [r2 + 64]
    movu         m2, [r2 + 80]
    movu         m4, [r2 + 96]
    movu         m6, [r2 + 112]
    movu         m1, [r3 + 64]
    movu         m3, [r3 + 80]
    movu         m5, [r3 + 96]
    movu         m7, [r3 + 112]

    psubw        m0, m1
    psubw        m2, m3
    psubw        m4, m5
    psubw        m6, m7

    movu         [r0 + 64], m0
    movu         [r0 + 80], m2
    movu         [r0 + 96], m4
    movu         [r0 + 112], m6

    movu         m0, [r2 + r4]
    movu         m2, [r2 + r4 + 16]
    movu         m4, [r2 + r4 + 32]
    movu         m6, [r2 + r4 + 48]
    movu         m1, [r3 + r5]
    movu         m3, [r3 + r5 + 16]
    movu         m5, [r3 + r5 + 32]
    movu         m7, [r3 + r5 + 48]

    psubw        m0, m1
    psubw        m2, m3
    psubw        m4, m5
    psubw        m6, m7

    movu         [r0 + r1], m0
    movu         [r0 + r1 + 16], m2
    movu         [r0 + r1 + 32], m4
    movu         [r0 + r1 + 48], m6

    movu         m0, [r2 + r4 + 64]
    movu         m2, [r2 + r4 + 80]
    movu         m4, [r2 + r4 + 96]
    movu         m6, [r2 + r4 + 112]
    movu         m1, [r3 + r5 + 64]
    movu         m3, [r3 + r5 + 80]
    movu         m5, [r3 + r5 + 96]
    movu         m7, [r3 + r5 + 112]
    dec          r6d
    lea          r2, [r2 + r4 * 2]
    lea          r3, [r3 + r5 * 2]

    psubw        m0, m1
    psubw        m2, m3
    psubw        m4, m5
    psubw        m6, m7

    movu         [r0 + r1 + 64], m0
    movu         [r0 + r1 + 80], m2
    movu         [r0 + r1 + 96], m4
    movu         [r0 + r1 + 112], m6
    lea          r0, [r0 + r1 * 2]

    jnz          .loop
    RET
%else
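; 8-bit path: 64 pixels per row do not fit the eight available XMM
; registers, so loads, byte-to-word widening and subtractions are
; interleaved, storing each result as soon as it is ready so its
; registers can be reused for the next 16-pixel chunk.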
cglobal pixel_sub_ps_64x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
    mov          r6d, %2/2
    pxor         m6, m6
    add          r1, r1
.loop:
    movu         m1, [r2]
    movu         m5, [r2 + 16]
    movu         m3, [r3]
    movu         m7, [r3 + 16]

    pmovzxbw     m0, m1
    pmovzxbw     m4, m5
    pmovzxbw     m2, m3
    punpckhbw    m1, m6
    punpckhbw    m3, m6
    punpckhbw    m5, m6

    psubw        m0, m2
    psubw        m1, m3
    pmovzxbw     m2, m7
    punpckhbw    m7, m6
    psubw        m4, m2
    psubw        m5, m7

    movu         m3, [r2 + 32]
    movu         m7, [r3 + 32]
    pmovzxbw     m2, m3
    punpckhbw    m3, m6

    movu         [r0], m0
    movu         [r0 + 16], m1
    movu         [r0 + 32], m4
    movu         [r0 + 48], m5

    movu         m1, [r2 + 48]
    movu         m5, [r3 + 48]
    pmovzxbw     m0, m1
    pmovzxbw     m4, m7
    punpckhbw    m1, m6
    punpckhbw    m7, m6

    psubw        m2, m4
    psubw        m3, m7

    movu         [r0 + 64], m2
    movu         [r0 + 80], m3

    movu         m7, [r2 + r4]
    movu         m3, [r3 + r5]
    pmovzxbw     m2, m5
    pmovzxbw     m4, m7
    punpckhbw    m5, m6
    punpckhbw    m7, m6

    psubw        m0, m2
    psubw        m1, m5

    movu         [r0 + 96], m0
    movu         [r0 + 112], m1

    movu         m2, [r2 + r4 + 16]
    movu         m5, [r3 + r5 + 16]
    pmovzxbw     m0, m3
    pmovzxbw     m1, m2
    punpckhbw    m3, m6
    punpckhbw    m2, m6

    psubw        m4, m0
    psubw        m7, m3

    movu         [r0 + r1], m4
    movu         [r0 + r1 + 16], m7

    movu         m0, [r2 + r4 + 32]
    movu         m3, [r3 + r5 + 32]
    dec          r6d
    pmovzxbw     m4, m5
    pmovzxbw     m7, m0
    punpckhbw    m5, m6
    punpckhbw    m0, m6

    psubw        m1, m4
    psubw        m2, m5

    movu         [r0 + r1 + 32], m1
    movu         [r0 + r1 + 48], m2

    movu         m4, [r2 + r4 + 48]
    movu         m5, [r3 + r5 + 48]
    lea          r2, [r2 + r4 * 2]
    lea          r3, [r3 + r5 * 2]
    pmovzxbw     m1, m3
    pmovzxbw     m2, m4
    punpckhbw    m3, m6
    punpckhbw    m4, m6

    psubw        m7, m1
    psubw        m0, m3

    movu         [r0 + r1 + 64], m7
    movu         [r0 + r1 + 80], m0

    pmovzxbw     m7, m5
    punpckhbw    m5, m6
    psubw        m2, m7
    psubw        m4, m5

    movu         [r0 + r1 + 96], m2
    movu         [r0 + r1 + 112], m4
    lea          r0, [r0 + r1 * 2]

    jnz          .loop
    RET
%endif
%endmacro

%if HIGH_BIT_DEPTH
INIT_XMM sse2
PIXELSUB_PS_W64_H2 64, 64
%else
INIT_XMM sse4
PIXELSUB_PS_W64_H2 64, 64
%endif


;=============================================================================
; variance
;=============================================================================

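; All pixel_var_WxH variants below return the pixel sum in the low 32 bits
; of the result and the sum of squared pixels in the high 32 bits (eax and
; edx respectively on x86-32). A minimal C sketch of that contract,
; assuming 32-bit accumulators are wide enough for the supported block
; sizes (illustrative only; pixel_var_c and its generic width/height
; arguments are not the actual x265 C primitive):
;
;   uint64_t pixel_var_c(pixel *pix, intptr_t stride, int width, int height)
;   {
;       uint32_t sum = 0, sqr = 0;
;       for (int y = 0; y < height; y++, pix += stride)
;           for (int x = 0; x < width; x++)
;           {
;               sum += pix[x];
;               sqr += (uint32_t)pix[x] * pix[x];
;           }
;       return sum + ((uint64_t)sqr << 32);
;   }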
%macro VAR_START 1
    pxor         m5, m5    ; sum
    pxor         m6, m6    ; sum squared
%if HIGH_BIT_DEPTH == 0
%if %1
    mova         m7, [pw_00ff]
%elif mmsize < 32
    pxor         m7, m7    ; zero
%endif
%endif ; !HIGH_BIT_DEPTH
%endmacro

%macro VAR_END 2
%if HIGH_BIT_DEPTH
%if mmsize == 8 && %1*%2 == 256
    HADDUW       m5, m2
%else
%if %1 >= 32
    HADDW        m5, m2
    movd         m7, r4d
    paddd        m5, m7
%else
    HADDW        m5, m2
%endif
%endif
%else ; !HIGH_BIT_DEPTH
%if %1 == 64
    HADDW        m5, m2
    movd         m7, r4d
    paddd        m5, m7
%else
    HADDW        m5, m2
%endif
%endif ; HIGH_BIT_DEPTH
    HADDD        m6, m1
%if ARCH_X86_64
    punpckldq    m5, m6
    movq         rax, m5
%else
    movd         eax, m5
    movd         edx, m6
%endif
    RET
%endmacro

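; VAR_CORE folds four word vectors of pixels into the accumulators:
; plain word adds into m5 for the sum, and pmaddwd (squaring each word,
; then pairwise-adding into dwords) into m6 for the sum of squares.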
%macro VAR_CORE 0
    paddw        m5, m0
    paddw        m5, m3
    paddw        m5, m1
    paddw        m5, m4
    pmaddwd      m0, m0
    pmaddwd      m3, m3
    pmaddwd      m1, m1
    pmaddwd      m4, m4
    paddd        m6, m0
    paddd        m6, m3
    paddd        m6, m1
    paddd        m6, m4
%endmacro

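; VAR_2ROW runs %2 loop iterations, each consuming two loads %1 bytes
; apart: with %1 = r1 that is two whole rows per iteration, otherwise two
; halves of one row, and the %ifidn below advances r0 accordingly.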
%macro VAR_2ROW 3
    mov          r2d, %2
.loop%3:
%if HIGH_BIT_DEPTH
    movu         m0, [r0]
    movu         m1, [r0+mmsize]
    movu         m3, [r0+%1]
    movu         m4, [r0+%1+mmsize]
%else ; !HIGH_BIT_DEPTH
    mova         m0, [r0]
    punpckhbw    m1, m0, m7
    mova         m3, [r0+%1]
    mova         m4, m3
    punpcklbw    m0, m7
%endif ; HIGH_BIT_DEPTH
%ifidn %1, r1
    lea          r0, [r0+%1*2]
%else
    add          r0, r1
%endif
%if HIGH_BIT_DEPTH == 0
    punpcklbw    m3, m7
    punpckhbw    m4, m7
%endif ; !HIGH_BIT_DEPTH
    VAR_CORE
    dec          r2d
    jg           .loop%3
%endmacro

;-----------------------------------------------------------------------------
; uint64_t pixel_var_wxh( pixel *, intptr_t )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
    FIX_STRIDES  r1
    VAR_START    0
    VAR_2ROW     8*SIZEOF_PIXEL, 16, 1
    VAR_END      16, 16

cglobal pixel_var_8x8, 2,3
    FIX_STRIDES  r1
    VAR_START    0
    VAR_2ROW     r1, 4, 1
    VAR_END      8, 8

%if HIGH_BIT_DEPTH
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
    FIX_STRIDES  r1
    VAR_START    0
    VAR_2ROW     r1, 8, 1
    VAR_END      16, 16

cglobal pixel_var_8x8, 2,3,8
    lea          r2, [r1*3]
    VAR_START    0
    movu         m0, [r0]
    movu         m1, [r0+r1*2]
    movu         m3, [r0+r1*4]
    movu         m4, [r0+r2*2]
    lea          r0, [r0+r1*8]
    VAR_CORE
    movu         m0, [r0]
    movu         m1, [r0+r1*2]
    movu         m3, [r0+r1*4]
    movu         m4, [r0+r2*2]
    VAR_CORE
    VAR_END      8, 8

cglobal pixel_var_32x32, 2,6,8
    FIX_STRIDES  r1
    mov          r3, r0
    VAR_START    0
    VAR_2ROW     r1, 8, 1
    HADDW        m5, m2
    movd         r4d, m5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 2
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    lea          r0, [r3 + 32]
    VAR_2ROW     r1, 8, 3
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 4
    VAR_END      32, 32

cglobal pixel_var_64x64, 2,6,8
    FIX_STRIDES  r1
    mov          r3, r0
    VAR_START    0
    VAR_2ROW     r1, 8, 1
    HADDW        m5, m2
    movd         r4d, m5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 2
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 3
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 4
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    lea          r0, [r3 + 32]
    VAR_2ROW     r1, 8, 5
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 6
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 7
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 8
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    lea          r0, [r3 + 64]
    VAR_2ROW     r1, 8, 9
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 10
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 11
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 12
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    lea          r0, [r3 + 96]
    VAR_2ROW     r1, 8, 13
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 14
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 15
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    VAR_2ROW     r1, 8, 16
    VAR_END      64, 64
%endmacro ; VAR

INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
%endif ; HIGH_BIT_DEPTH

%if HIGH_BIT_DEPTH == 0
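; 8-bit variants: VAR_START 1 parks the pw_00ff mask in m7 so that DEINTB
; (from x86util.asm) can split each row into even-byte and odd-byte word
; vectors; the reordering is harmless because the sum and the sum of
; squares are independent of pixel order.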
%macro VAR 0
cglobal pixel_var_8x8, 2,3,8
    VAR_START    1
    lea          r2, [r1 * 3]
    movh         m0, [r0]
    movh         m3, [r0 + r1]
    movhps       m0, [r0 + r1 * 2]
    movhps       m3, [r0 + r2]
    DEINTB       1, 0, 4, 3, 7
    lea          r0, [r0 + r1 * 4]
    VAR_CORE
    movh         m0, [r0]
    movh         m3, [r0 + r1]
    movhps       m0, [r0 + r1 * 2]
    movhps       m3, [r0 + r2]
    DEINTB       1, 0, 4, 3, 7
    VAR_CORE
    VAR_END      8, 8

cglobal pixel_var_16x16_internal
    movu         m0, [r0]
    movu         m3, [r0 + r1]
    DEINTB       1, 0, 4, 3, 7
    VAR_CORE
    movu         m0, [r0 + 2 * r1]
    movu         m3, [r0 + r2]
    DEINTB       1, 0, 4, 3, 7
    lea          r0, [r0 + r1 * 4]
    VAR_CORE
    movu         m0, [r0]
    movu         m3, [r0 + r1]
    DEINTB       1, 0, 4, 3, 7
    VAR_CORE
    movu         m0, [r0 + 2 * r1]
    movu         m3, [r0 + r2]
    DEINTB       1, 0, 4, 3, 7
    lea          r0, [r0 + r1 * 4]
    VAR_CORE
    movu         m0, [r0]
    movu         m3, [r0 + r1]
    DEINTB       1, 0, 4, 3, 7
    VAR_CORE
    movu         m0, [r0 + 2 * r1]
    movu         m3, [r0 + r2]
    DEINTB       1, 0, 4, 3, 7
    lea          r0, [r0 + r1 * 4]
    VAR_CORE
    movu         m0, [r0]
    movu         m3, [r0 + r1]
    DEINTB       1, 0, 4, 3, 7
    VAR_CORE
    movu         m0, [r0 + 2 * r1]
    movu         m3, [r0 + r2]
    DEINTB       1, 0, 4, 3, 7
    VAR_CORE
    ret

cglobal pixel_var_16x16, 2,3,8
    VAR_START    1
    lea          r2, [r1 * 3]
    call         pixel_var_16x16_internal
    VAR_END      16, 16

cglobal pixel_var_32x32, 2,4,8
    VAR_START    1
    lea          r2, [r1 * 3]
    mov          r3, r0
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r3 + 16]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    VAR_END      32, 32

cglobal pixel_var_64x64, 2,6,8
    VAR_START    1
    lea          r2, [r1 * 3]
    mov          r3, r0
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    HADDW        m5, m2
    movd         r4d, m5
    pxor         m5, m5
    lea          r0, [r3 + 16]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    lea          r0, [r3 + 32]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r3 + 48]
    HADDW        m5, m2
    movd         r5d, m5
    add          r4, r5
    pxor         m5, m5
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    lea          r0, [r0 + r1 * 4]
    call         pixel_var_16x16_internal
    VAR_END      64, 64
%endmacro ; VAR

INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR

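; AVX2: pmovzxbw widens a full 16-pixel row straight into one ymm
; register, so each .loop iteration covers four rows; the two 128-bit
; lanes of the accumulators are folded with vextracti128 before the
; horizontal adds.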
INIT_YMM avx2
cglobal pixel_var_16x16, 2,4,7
    VAR_START    0
    mov          r2d, 4
    lea          r3, [r1*3]
.loop:
    pmovzxbw     m0, [r0]
    pmovzxbw     m3, [r0+r1]
    pmovzxbw     m1, [r0+r1*2]
    pmovzxbw     m4, [r0+r3]
    lea          r0, [r0+r1*4]
    VAR_CORE
    dec          r2d
    jg           .loop
    vextracti128 xm0, m5, 1
    vextracti128 xm1, m6, 1
    paddw        xm5, xm0
    paddd        xm6, xm1
    HADDW        xm5, xm2
    HADDD        xm6, xm1
%if ARCH_X86_64
    punpckldq    xm5, xm6
    movq         rax, xm5
%else
    movd         eax, xm5
    movd         edx, xm6
%endif
    RET
%endif ; !HIGH_BIT_DEPTH

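; VAR2_END converts the accumulated sum (%2) and sum of squares (%3) into
; eax = %3 - (%2*%2 >> %1), i.e. sum(x^2) - sum(x)^2/N for a block of
; N = 2^%1 pixels, which is N times the variance; the raw sum of squares
; is also stored through r4 before the subtraction.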
%macro VAR2_END 3
    HADDW        %2, xm1
    movd         r1d, %2
    imul         r1d, r1d
    HADDD        %3, xm1
    shr          r1d, %1
    movd         eax, %3
    movd         [r4], %3
    sub          eax, r1d    ; sqr - (sum * sum >> shift)
    RET
%endmacro
