;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2013 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;*          Henrik Gramner <henrik@gramner.com>
;*          Dnyaneshwar Gorade <dnyaneshwar@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************
27 | ||
28 | %include "x86inc.asm" | |
29 | %include "x86util.asm" | |
30 | ||
31 | SECTION .text | |
32 | ||
33 | cextern pw_1 | |
34 | ||
;=============================================================================
; SAD MMX
;=============================================================================

%macro SAD_INC_1x16P_MMX 0
    movu    m1, [r0+ 0]
    movu    m2, [r0+ 8]
    movu    m3, [r0+16]
    movu    m4, [r0+24]
    psubw   m1, [r2+ 0]
    psubw   m2, [r2+ 8]
    psubw   m3, [r2+16]
    psubw   m4, [r2+24]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x8P_MMX 0
    movu    m1, [r0+0]
    movu    m2, [r0+8]
    movu    m3, [r0+2*r1+0]
    movu    m4, [r0+2*r1+8]
    psubw   m1, [r2+0]
    psubw   m2, [r2+8]
    psubw   m3, [r2+2*r3+0]
    psubw   m4, [r2+2*r3+8]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%endmacro

%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    movu    m2, [r0+2*r1]
    psubw   m1, [r2]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m0, m1
    paddw   m0, m2
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
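; As a rough reference (illustration only, not part of the build), each
; pixel_sad_NxM computes, with 16-bit pixels and strides in pixel units:
;
;     int sad_NxM(const uint16_t *pix1, intptr_t stride1,
;                 const uint16_t *pix2, intptr_t stride2)
;     {
;         int sum = 0;
;         for (int y = 0; y < M; y++, pix1 += stride1, pix2 += stride2)
;             for (int x = 0; x < N; x++)
;                 sum += abs(pix1[x] - pix2[x]);
;         return sum;
;     }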
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov     r4d, %2/%3
.loop:
    SAD_INC_%3x%1P_MMX
    dec     r4d
    jg .loop
%endif
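; presumably 16x16 is split out because its word accumulator can approach
; (and past 10-bit depth exceed) the signed word range, so the final
; horizontal add treats it as unsigned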
%if %1*%2 == 256
    HADDUW  m0, m1
%else
    HADDW   m0, m1
%endif
    movd    eax, m0
    RET
%endmacro
113 | ||
114 | INIT_MMX mmx2 | |
115 | SAD_MMX 16, 16, 1 | |
116 | SAD_MMX 16, 8, 1 | |
117 | SAD_MMX 8, 16, 2 | |
118 | SAD_MMX 8, 8, 2 | |
119 | SAD_MMX 8, 4, 2 | |
120 | SAD_MMX 4, 8, 2 | |
121 | SAD_MMX 4, 4, 2 | |
122 | SAD_MMX 4, 16, 2 | |
123 | INIT_MMX ssse3 | |
124 | SAD_MMX 4, 8, 2 | |
125 | SAD_MMX 4, 4, 2 | |
126 | ||
;=============================================================================
; SAD XMM
;=============================================================================

%macro SAD_1x32 0
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+32]
    movu    m4, [r2+48]
    psubw   m1, [r0+0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+32]
    psubw   m4, [r0+48]
    ABSW2   m1, m2, m1, m2, m5, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    paddd   m1, m2
    paddd   m3, m4
    paddd   m0, m1
    paddd   m0, m3
%endmacro
153 | ||
154 | %macro SAD_1x24 0 | |
155 | movu m1, [r2+ 0] | |
156 | movu m2, [r2+16] | |
157 | movu m3, [r2+32] | |
158 | psubw m1, [r0+0] | |
159 | psubw m2, [r0+16] | |
160 | psubw m3, [r0+32] | |
161 | ABSW2 m1, m2, m1, m2, m4, m6 | |
162 | pmaddwd m1, [pw_1] | |
163 | pmaddwd m2, [pw_1] | |
164 | lea r0, [r0+2*r1] | |
165 | lea r2, [r2+2*r3] | |
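    ; abs of m3 computed in place as max(x, 0-x), saving scratch registers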
    pxor    m4, m4
    psubw   m4, m3
    pmaxsw  m3, m4
    pmaddwd m3, [pw_1]
    paddd   m1, m2
    paddd   m0, m1
    paddd   m0, m3
%endmacro
174 | ||
175 | %macro SAD_1x48 0 | |
176 | movu m1, [r2+ 0] | |
177 | movu m2, [r2+16] | |
178 | movu m3, [r2+32] | |
179 | movu m4, [r2+48] | |
180 | psubw m1, [r0+0] | |
181 | psubw m2, [r0+16] | |
182 | psubw m3, [r0+32] | |
183 | psubw m4, [r0+48] | |
184 | ABSW2 m1, m2, m1, m2, m5, m6 | |
185 | pmaddwd m1, [pw_1] | |
186 | pmaddwd m2, [pw_1] | |
187 | ABSW2 m3, m4, m3, m4, m7, m5 | |
188 | pmaddwd m3, [pw_1] | |
189 | pmaddwd m4, [pw_1] | |
190 | paddd m1, m2 | |
191 | paddd m3, m4 | |
192 | paddd m0, m1 | |
193 | paddd m0, m3 | |
194 | movu m1, [r2+64] | |
195 | movu m2, [r2+80] | |
196 | psubw m1, [r0+64] | |
197 | psubw m2, [r0+80] | |
198 | ABSW2 m1, m2, m1, m2, m3, m4 | |
199 | pmaddwd m1, [pw_1] | |
200 | pmaddwd m2, [pw_1] | |
201 | lea r0, [r0+2*r1] | |
202 | lea r2, [r2+2*r3] | |
203 | paddd m0, m1 | |
204 | paddd m0, m2 | |
205 | %endmacro | |
206 | ||
207 | %macro SAD_1x64 0 | |
208 | movu m1, [r2+ 0] | |
209 | movu m2, [r2+16] | |
210 | movu m3, [r2+32] | |
211 | movu m4, [r2+48] | |
212 | psubw m1, [r0+0] | |
213 | psubw m2, [r0+16] | |
214 | psubw m3, [r0+32] | |
215 | psubw m4, [r0+48] | |
216 | ABSW2 m1, m2, m1, m2, m5, m6 | |
217 | pmaddwd m1, [pw_1] | |
218 | pmaddwd m2, [pw_1] | |
219 | ABSW2 m3, m4, m3, m4, m7, m5 | |
220 | pmaddwd m3, [pw_1] | |
221 | pmaddwd m4, [pw_1] | |
222 | paddd m1, m2 | |
223 | paddd m3, m4 | |
224 | paddd m0, m1 | |
225 | paddd m0, m3 | |
226 | movu m1, [r2+64] | |
227 | movu m2, [r2+80] | |
228 | movu m3, [r2+96] | |
229 | movu m4, [r2+112] | |
230 | psubw m1, [r0+64] | |
231 | psubw m2, [r0+80] | |
232 | psubw m3, [r0+96] | |
233 | psubw m4, [r0+112] | |
234 | ABSW2 m1, m2, m1, m2, m5, m6 | |
235 | pmaddwd m1, [pw_1] | |
236 | pmaddwd m2, [pw_1] | |
237 | ABSW2 m3, m4, m3, m4, m7, m5 | |
238 | pmaddwd m3, [pw_1] | |
239 | pmaddwd m4, [pw_1] | |
240 | paddd m1, m2 | |
241 | paddd m3, m4 | |
242 | paddd m0, m1 | |
243 | paddd m0, m3 | |
244 | lea r0, [r0+2*r1] | |
245 | lea r2, [r2+2*r3] | |
246 | %endmacro | |
247 | ||
%macro SAD_1x12 0
    movu    m1, [r2+0]
    movh    m2, [r2+16]
    psubw   m1, [r0+0]
    movh    m3, [r0+16]
    psubw   m2, m3
    ABSW2   m1, m2, m1, m2, m4, m6
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    paddd   m1, m2
    paddd   m0, m1
%endmacro

%macro SAD_INC_2ROW 1
%if 2*%1 > mmsize
    movu    m1, [r2+ 0]
    movu    m2, [r2+16]
    movu    m3, [r2+2*r3+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m1, [r0+ 0]
    psubw   m2, [r0+16]
    psubw   m3, [r0+2*r1+ 0]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m3, m1
    pmaddwd m3, [pw_1]
    paddd   m0, m3
%else
    movu    m1, [r2]
    movu    m2, [r2+2*r3]
    psubw   m1, [r0]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    paddw   m2, m1
    pmaddwd m2, [pw_1]
    paddd   m0, m2
%endif
%endmacro
295 | ||
296 | ;----------------------------------------------------------------------------- | |
297 | ; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t ) | |
298 | ;----------------------------------------------------------------------------- | |
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
    pxor    m0, m0
%if %2 == 4
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov     r4d, %2/2
.loop:
    SAD_INC_2ROW %1
    dec     r4d
    jg .loop
%endif

    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro
317 | ||
318 | INIT_XMM sse2 | |
319 | SAD 16, 4 | |
320 | SAD 16, 8 | |
321 | SAD 16, 12 | |
322 | SAD 16, 16 | |
323 | SAD 16, 32 | |
324 | SAD 16, 64 | |
325 | ||
326 | INIT_XMM sse2 | |
327 | SAD 8, 4 | |
328 | SAD 8, 8 | |
329 | SAD 8, 16 | |
330 | SAD 8, 32 | |
331 | ||
;------------------------------------------------------------------
; int pixel_sad_32xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_32 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x32
    SAD_1x32
    SAD_1x32
    SAD_1x32
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xm0
    RET
%endmacro
351 | ||
352 | INIT_XMM sse2 | |
353 | SAD_32 32, 8 | |
354 | SAD_32 32, 16 | |
355 | SAD_32 32, 24 | |
356 | SAD_32 32, 32 | |
357 | SAD_32 32, 64 | |
358 | ||
;------------------------------------------------------------------
; int pixel_sad_64xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_64 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x64
    SAD_1x64
    SAD_1x64
    SAD_1x64
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro
378 | ||
379 | INIT_XMM sse2 | |
380 | SAD_64 64, 16 | |
381 | SAD_64 64, 32 | |
382 | SAD_64 64, 48 | |
383 | SAD_64 64, 64 | |
384 | ||
;------------------------------------------------------------------
; int pixel_sad_48xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_48 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x48
    SAD_1x48
    SAD_1x48
    SAD_1x48
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro
404 | ||
405 | INIT_XMM sse2 | |
406 | SAD_48 48, 64 | |
407 | ||
;------------------------------------------------------------------
; int pixel_sad_24xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_24 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x24
    SAD_1x24
    SAD_1x24
    SAD_1x24
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro
427 | ||
428 | INIT_XMM sse2 | |
429 | SAD_24 24, 32 | |
430 | ||
;------------------------------------------------------------------
; int pixel_sad_12xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
;------------------------------------------------------------------
%macro SAD_12 2
cglobal pixel_sad_%1x%2, 4,5,8
    pxor    m0, m0
    mov     r4d, %2/4
.loop:
    SAD_1x12
    SAD_1x12
    SAD_1x12
    SAD_1x12
    dec     r4d
    jnz .loop

    HADDD   m0, m1
    movd    eax, xmm0
    RET
%endmacro
450 | ||
451 | INIT_XMM sse2 | |
452 | SAD_12 12, 16 | |
453 | ||
454 | ||
;=============================================================================
; SAD x3/x4
;=============================================================================

%macro SAD_X3_INC_P 0
    add     r0, 4*FENC_STRIDE
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
%endmacro

%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    movu    m0, [r1]
    movu    m1, [r2]
    movu    m2, [r3]
    psubw   m0, m3
    psubw   m1, m3
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
    pmaddwd m0, [pw_1]
    pmaddwd m1, [pw_1]
    pmaddwd m2, [pw_1]
%endmacro

%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    movu    m4, [r2+%2]
    movu    m5, [r3+%2]
    psubw   m3, m6
    psubw   m4, m6
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    ABSW    m5, m5, m6
    pmaddwd m3, [pw_1]
    pmaddwd m4, [pw_1]
    pmaddwd m5, [pw_1]
    paddd   m0, m3
    paddd   m1, m4
    paddd   m2, m5
%endmacro

%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m3
    HADDUW  m1, m4
    HADDUW  m2, m5
%else
    HADDD   m0, m3
    HADDD   m1, m4
    HADDD   m2, m5
%endif
%if UNIX64
    movd    [r5+0], xm0
    movd    [r5+4], xm1
    movd    [r5+8], xm2
%else
    mov     r0, r5mp
    movd    [r0+0], xm0
    movd    [r0+4], xm1
    movd    [r0+8], xm2
%endif
    RET
%endmacro
521 | ||
522 | %macro SAD_X4_INC_P 0 | |
523 | add r0, 4*FENC_STRIDE | |
524 | lea r1, [r1+4*r5] | |
525 | lea r2, [r2+4*r5] | |
526 | lea r3, [r3+4*r5] | |
527 | lea r4, [r4+4*r5] | |
528 | %endmacro | |
529 | ||
530 | %macro SAD_X4_ONE_START 0 | |
531 | mova m4, [r0] | |
532 | movu m0, [r1] | |
533 | movu m1, [r2] | |
534 | movu m2, [r3] | |
535 | movu m3, [r4] | |
536 | psubw m0, m4 | |
537 | psubw m1, m4 | |
538 | psubw m2, m4 | |
539 | psubw m3, m4 | |
540 | ABSW2 m0, m1, m0, m1, m5, m6 | |
541 | ABSW2 m2, m3, m2, m3, m4, m7 | |
542 | pmaddwd m0, [pw_1] | |
543 | pmaddwd m1, [pw_1] | |
544 | pmaddwd m2, [pw_1] | |
545 | pmaddwd m3, [pw_1] | |
546 | %endmacro | |
547 | ||
548 | %macro SAD_X4_ONE 2 | |
549 | mova m4, [r0+%1] | |
550 | movu m5, [r1+%2] | |
551 | movu m6, [r2+%2] | |
552 | %if num_mmregs > 8 | |
553 | movu m7, [r3+%2] | |
554 | movu m8, [r4+%2] | |
555 | psubw m5, m4 | |
556 | psubw m6, m4 | |
557 | psubw m7, m4 | |
558 | psubw m8, m4 | |
559 | ABSW2 m5, m6, m5, m6, m9, m10 | |
560 | ABSW2 m7, m8, m7, m8, m9, m10 | |
561 | pmaddwd m5, [pw_1] | |
562 | pmaddwd m6, [pw_1] | |
563 | pmaddwd m7, [pw_1] | |
564 | pmaddwd m8, [pw_1] | |
565 | paddd m0, m5 | |
566 | paddd m1, m6 | |
567 | paddd m2, m7 | |
568 | paddd m3, m8 | |
569 | %elif cpuflag(ssse3) | |
570 | movu m7, [r3+%2] | |
571 | psubw m5, m4 | |
572 | psubw m6, m4 | |
573 | psubw m7, m4 | |
574 | movu m4, [r4+%2] | |
575 | pabsw m5, m5 | |
576 | psubw m4, [r0+%1] | |
577 | pabsw m6, m6 | |
578 | pabsw m7, m7 | |
579 | pabsw m4, m4 | |
580 | pmaddwd m5, [pw_1] | |
581 | pmaddwd m6, [pw_1] | |
582 | pmaddwd m7, [pw_1] | |
583 | pmaddwd m4, [pw_1] | |
584 | paddd m0, m5 | |
585 | paddd m1, m6 | |
586 | paddd m2, m7 | |
587 | paddd m3, m4 | |
588 | %else ; num_mmregs == 8 && !ssse3 | |
589 | psubw m5, m4 | |
590 | psubw m6, m4 | |
591 | ABSW m5, m5, m7 | |
592 | ABSW m6, m6, m7 | |
593 | pmaddwd m5, [pw_1] | |
594 | pmaddwd m6, [pw_1] | |
595 | paddd m0, m5 | |
596 | paddd m1, m6 | |
597 | movu m5, [r3+%2] | |
598 | movu m6, [r4+%2] | |
599 | psubw m5, m4 | |
600 | psubw m6, m4 | |
601 | ABSW2 m5, m6, m5, m6, m7, m4 | |
602 | pmaddwd m5, [pw_1] | |
603 | pmaddwd m6, [pw_1] | |
604 | paddd m2, m5 | |
605 | paddd m3, m6 | |
606 | %endif | |
607 | %endmacro | |
608 | ||
609 | %macro SAD_X4_END 2 | |
610 | %if mmsize == 8 && %1*%2 == 256 | |
611 | HADDUW m0, m4 | |
612 | HADDUW m1, m5 | |
613 | HADDUW m2, m6 | |
614 | HADDUW m3, m7 | |
615 | %else | |
616 | HADDD m0, m4 | |
617 | HADDD m1, m5 | |
618 | HADDD m2, m6 | |
619 | HADDD m3, m7 | |
620 | %endif | |
621 | mov r0, r6mp | |
622 | movd [r0+ 0], xm0 | |
623 | movd [r0+ 4], xm1 | |
624 | movd [r0+ 8], xm2 | |
625 | movd [r0+12], xm3 | |
626 | RET | |
627 | %endmacro | |
628 | ||
%macro SAD_X_2xNP 4
    %assign x %3
%rep %4
    SAD_X%1_ONE x*mmsize, x*mmsize
    SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
    %assign x x+1
%endrep
%endmacro

%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
    mova    m0, [r0]
    mova    m1, [r0+16]
    mova    m2, [r0+2*r1]
    mova    m3, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m0, m2
    psubw   m1, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    paddw   m0, m1
    sub     r2d, 2
    je .end
.loop:
    mova    m4, [r0]
    mova    m5, [r0+16]
    mova    m6, [r0+2*r1]
    mova    m7, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m2, m4
    psubw   m3, m5
    psubw   m4, m6
    psubw   m5, m7
    ABSW    m2, m2, m1
    ABSW    m3, m3, m1
    ABSW    m4, m4, m1
    ABSW    m5, m5, m1
    paddw   m0, m2
    paddw   m0, m3
    paddw   m0, m4
    paddw   m0, m5
    mova    m2, m6
    mova    m3, m7
    sub     r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
    HADDUW  m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
    movd    eax, m0
    RET
%endmacro
INIT_XMM sse2
PIXEL_VSAD
INIT_XMM ssse3
PIXEL_VSAD
INIT_XMM xop
PIXEL_VSAD
688 | ||
689 | INIT_YMM avx2 | |
690 | cglobal pixel_vsad, 3,3 | |
691 | mova m0, [r0] | |
692 | mova m1, [r0+2*r1] | |
693 | lea r0, [r0+4*r1] | |
694 | psubw m0, m1 | |
695 | pabsw m0, m0 | |
696 | sub r2d, 2 | |
697 | je .end | |
698 | .loop: | |
699 | mova m2, [r0] | |
700 | mova m3, [r0+2*r1] | |
701 | lea r0, [r0+4*r1] | |
702 | psubw m1, m2 | |
703 | psubw m2, m3 | |
704 | pabsw m1, m1 | |
705 | pabsw m2, m2 | |
706 | paddw m0, m1 | |
707 | paddw m0, m2 | |
708 | mova m1, m3 | |
709 | sub r2d, 2 | |
710 | jg .loop | |
711 | .end: | |
712 | %if BIT_DEPTH == 9 | |
713 | HADDW m0, m1 | |
714 | %else | |
715 | HADDUW m0, m1 | |
716 | %endif | |
717 | movd eax, xm0 | |
718 | RET | |
719 | ||
720 | ;----------------------------------------------------------------------------- | |
721 | ; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1, | |
722 | ; uint16_t *pix2, intptr_t i_stride, int scores[3] ) | |
723 | ;----------------------------------------------------------------------------- | |
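; A rough model of the x3 form in terms of the scalar sad_NxM sketched
; earlier (the x4 form takes a fourth pix pointer and writes scores[4]);
; illustration only, not part of the build:
;
;     void sad_x3_WxH(const uint16_t *fenc, const uint16_t *pix0,
;                     const uint16_t *pix1, const uint16_t *pix2,
;                     intptr_t i_stride, int scores[3])
;     {
;         scores[0] = sad_WxH(fenc, FENC_STRIDE, pix0, i_stride);
;         scores[1] = sad_WxH(fenc, FENC_STRIDE, pix1, i_stride);
;         scores[2] = sad_WxH(fenc, FENC_STRIDE, pix2, i_stride);
;     }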
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
    %assign  regnum %1+1
    %xdefine STRIDE r %+ regnum
    mov     r6, %3/2-1
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg .loop
%if %1 == 4
    mov     r6, r6m
%endif
    SAD_X%1_END %2, %3
%endmacro
742 | ||
743 | INIT_MMX mmx2 | |
744 | %define XMM_REGS 0 | |
745 | SAD_X 3, 16, 16 | |
746 | SAD_X 3, 16, 8 | |
747 | SAD_X 3, 12, 16 | |
748 | SAD_X 3, 8, 16 | |
749 | SAD_X 3, 8, 8 | |
750 | SAD_X 3, 8, 4 | |
751 | SAD_X 3, 4, 16 | |
752 | SAD_X 3, 4, 8 | |
753 | SAD_X 3, 4, 4 | |
754 | SAD_X 4, 16, 16 | |
755 | SAD_X 4, 16, 8 | |
756 | SAD_X 4, 12, 16 | |
757 | SAD_X 4, 8, 16 | |
758 | SAD_X 4, 8, 8 | |
759 | SAD_X 4, 8, 4 | |
760 | SAD_X 4, 4, 16 | |
761 | SAD_X 4, 4, 8 | |
762 | SAD_X 4, 4, 4 | |
763 | INIT_MMX ssse3 | |
764 | SAD_X 3, 4, 8 | |
765 | SAD_X 3, 4, 4 | |
766 | SAD_X 4, 4, 8 | |
767 | SAD_X 4, 4, 4 | |
768 | INIT_XMM ssse3 | |
769 | %define XMM_REGS 7 | |
770 | SAD_X 3, 16, 16 | |
771 | SAD_X 3, 16, 8 | |
772 | SAD_X 3, 8, 16 | |
773 | SAD_X 3, 8, 8 | |
774 | SAD_X 3, 8, 4 | |
775 | %define XMM_REGS 9 | |
776 | SAD_X 4, 16, 16 | |
777 | SAD_X 4, 16, 8 | |
778 | SAD_X 4, 8, 16 | |
779 | SAD_X 4, 8, 8 | |
780 | SAD_X 4, 8, 4 | |
781 | INIT_XMM sse2 | |
782 | %define XMM_REGS 8 | |
783 | SAD_X 3, 64, 64 | |
784 | SAD_X 3, 64, 48 | |
785 | SAD_X 3, 64, 32 | |
786 | SAD_X 3, 64, 16 | |
787 | SAD_X 3, 48, 64 | |
788 | SAD_X 3, 32, 64 | |
789 | SAD_X 3, 32, 32 | |
790 | SAD_X 3, 32, 24 | |
791 | SAD_X 3, 32, 16 | |
792 | SAD_X 3, 32, 8 | |
793 | SAD_X 3, 24, 32 | |
794 | SAD_X 3, 16, 64 | |
795 | SAD_X 3, 16, 32 | |
796 | SAD_X 3, 16, 16 | |
797 | SAD_X 3, 16, 12 | |
798 | SAD_X 3, 16, 8 | |
799 | SAD_X 3, 16, 4 | |
800 | SAD_X 3, 8, 32 | |
801 | SAD_X 3, 8, 16 | |
802 | SAD_X 3, 8, 8 | |
803 | SAD_X 3, 8, 4 | |
804 | %define XMM_REGS 11 | |
805 | SAD_X 4, 64, 64 | |
806 | SAD_X 4, 64, 48 | |
807 | SAD_X 4, 64, 32 | |
808 | SAD_X 4, 64, 16 | |
809 | SAD_X 4, 48, 64 | |
810 | SAD_X 4, 32, 64 | |
811 | SAD_X 4, 32, 32 | |
812 | SAD_X 4, 32, 24 | |
813 | SAD_X 4, 32, 16 | |
814 | SAD_X 4, 32, 8 | |
815 | SAD_X 4, 24, 32 | |
816 | SAD_X 4, 16, 64 | |
817 | SAD_X 4, 16, 32 | |
818 | SAD_X 4, 16, 16 | |
819 | SAD_X 4, 16, 12 | |
820 | SAD_X 4, 16, 8 | |
821 | SAD_X 4, 16, 4 | |
822 | SAD_X 4, 8, 32 | |
823 | SAD_X 4, 8, 16 | |
824 | SAD_X 4, 8, 8 | |
825 | SAD_X 4, 8, 4 | |
826 | INIT_YMM avx2 | |
827 | %define XMM_REGS 7 | |
828 | SAD_X 3, 16, 16 | |
829 | SAD_X 3, 16, 8 | |
830 | %define XMM_REGS 9 | |
831 | SAD_X 4, 16, 16 | |
832 | SAD_X 4, 16, 8 | |
833 |