Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | ;***************************************************************************** |
2 | ;* ssd-a.asm: x86 ssd functions | |
3 | ;***************************************************************************** | |
4 | ;* Copyright (C) 2003-2013 x264 project | |
5 | ;* | |
6 | ;* Authors: Loren Merritt <lorenm@u.washington.edu> | |
7 | ;* Fiona Glaser <fiona@x264.com> | |
8 | ;* Laurent Aimar <fenrir@via.ecp.fr> | |
9 | ;* Alex Izvorski <aizvorksi@gmail.com> | |
10 | ;* | |
11 | ;* This program is free software; you can redistribute it and/or modify | |
12 | ;* it under the terms of the GNU General Public License as published by | |
13 | ;* the Free Software Foundation; either version 2 of the License, or | |
14 | ;* (at your option) any later version. | |
15 | ;* | |
16 | ;* This program is distributed in the hope that it will be useful, | |
17 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | ;* GNU General Public License for more details. | |
20 | ;* | |
21 | ;* You should have received a copy of the GNU General Public License | |
22 | ;* along with this program; if not, write to the Free Software | |
23 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
24 | ;* | |
25 | ;* This program is also available under a commercial proprietary license. | |
26 | ;* For more information, contact us at license @ x265.com. | |
27 | ;***************************************************************************** | |
28 | ||
29 | %include "x86inc.asm" | |
30 | %include "x86util.asm" | |
31 | ||
32 | SECTION_RODATA 32 | |
33 | ||
34 | SECTION .text | |
35 | ||
36 | cextern pw_00ff | |
37 | cextern hsub_mul | |
38 | ||
39 | ;============================================================================= | |
40 | ; SSD | |
41 | ;============================================================================= | |
42 | ||
43 | %if HIGH_BIT_DEPTH | |
44 | ;----------------------------------------------------------------------------- | |
45 | ; int pixel_ssd_ss_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) | |
46 | ;----------------------------------------------------------------------------- | |
; SSD_ONE width, height
; Emits pixel_ssd_ss_<width>x<height> for the register size selected by the
; preceding INIT_* line.  The function sums the squared differences of the
; 16-bit words read from [r0] and [r2] (r1 / r3 are the respective row steps)
; and returns the 32-bit total in eax.
; Each loop iteration always consumes four vectors per source; how those four
; vectors map onto rows depends on how mmsize compares with the row width.
47 | %macro SSD_ONE 2 | |
48 | cglobal pixel_ssd_ss_%1x%2, 4,7,8 | |
; FIX_STRIDES is defined outside this file section; presumably it converts the
; strides from samples to bytes -- TODO confirm against x86util.asm.
49 | FIX_STRIDES r1, r3 | |
; Case 1: one vector covers a whole row -> four rows per iteration,
; r5 / r6 hold 3*stride for the fourth row.
50 | %if mmsize == %1*2 | |
51 | %define offset0_1 r1 | |
52 | %define offset0_2 r1*2 | |
53 | %define offset0_3 r5 | |
54 | %define offset1_1 r3 | |
55 | %define offset1_2 r3*2 | |
56 | %define offset1_3 r6 | |
57 | lea r5, [3*r1] | |
58 | lea r6, [3*r3] | |
; Case 2: two vectors per row -> two rows per iteration.
59 | %elif mmsize == %1 | |
60 | %define offset0_1 mmsize | |
61 | %define offset0_2 r1 | |
62 | %define offset0_3 r1+mmsize | |
63 | %define offset1_1 mmsize | |
64 | %define offset1_2 r3 | |
65 | %define offset1_3 r3+mmsize | |
; Case 3: four vectors per row -> one row per iteration.
66 | %elif mmsize == %1/2 | |
67 | %define offset0_1 mmsize | |
68 | %define offset0_2 mmsize*2 | |
69 | %define offset0_3 mmsize*3 | |
70 | %define offset1_1 mmsize | |
71 | %define offset1_2 mmsize*2 | |
72 | %define offset1_3 mmsize*3 | |
73 | %endif | |
; %%n = number of loop iterations; height/%%n is the rows consumed per pass.
; The counter and the loop branch are omitted when a single pass suffices.
74 | %assign %%n %2/(2*mmsize/%1) | |
75 | %if %%n > 1 | |
76 | mov r4d, %%n | |
77 | %endif | |
; m0 is the running per-dword accumulator.
78 | pxor m0, m0 | |
79 | .loop: | |
80 | movu m1, [r0] | |
81 | movu m2, [r0+offset0_1] | |
82 | movu m3, [r0+offset0_2] | |
83 | movu m4, [r0+offset0_3] | |
; m6 / m7 are reused as scratch for the second source.
84 | movu m6, [r2] | |
85 | movu m7, [r2+offset1_1] | |
86 | psubw m1, m6 | |
87 | psubw m2, m7 | |
88 | movu m6, [r2+offset1_2] | |
89 | movu m7, [r2+offset1_3] | |
90 | psubw m3, m6 | |
91 | psubw m4, m7 | |
; Advance both pointers by the rows just consumed (lea leaves flags alone).
92 | %if %%n > 1 | |
93 | lea r0, [r0+r1*(%2/%%n)] | |
94 | lea r2, [r2+r3*(%2/%%n)] | |
95 | %endif | |
; pmaddwd x,x squares each word and adds adjacent pairs into dwords.
96 | pmaddwd m1, m1 | |
97 | pmaddwd m2, m2 | |
98 | pmaddwd m3, m3 | |
99 | pmaddwd m4, m4 | |
100 | paddd m1, m2 | |
101 | paddd m3, m4 | |
102 | paddd m0, m1 | |
103 | paddd m0, m3 | |
104 | %if %%n > 1 | |
105 | dec r4d | |
106 | jg .loop | |
107 | %endif | |
; HADDD comes from the included utility macros; presumably it folds the dword
; lanes of m0 into the low dword using m5 as scratch -- TODO confirm.
108 | HADDD m0, m5 | |
109 | movd eax, xm0 | |
; In MMX builds movu is an alias of movq; clear the MMX state before returning.
110 | %ifidn movu,movq ; detect MMX | |
111 | EMMS | |
112 | %endif | |
113 | RET | |
114 | %endmacro | |
115 | ||
; m0 += squared 16-bit differences of the 64 bytes at [%1] versus [%2].
; Clobbers m1-m4, m6, m7.
%macro SSD_TWO_DIFF64 2 ; src0 address, src1 address
    movu        m1, [%1]
    movu        m2, [%1 + 16]
    movu        m3, [%1 + 32]
    movu        m4, [%1 + 48]
    movu        m6, [%2]
    movu        m7, [%2 + 16]
    psubw       m1, m6
    psubw       m2, m7
    movu        m6, [%2 + 32]
    movu        m7, [%2 + 48]
    psubw       m3, m6
    psubw       m4, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m0, m1
    paddd       m0, m3
%endmacro

; m0 += squared 16-bit differences of the 32 bytes at [%1 + %3] versus
; [%2 + %3], using m%4 / m%5 as work registers.  Clobbers m6, m7.
%macro SSD_TWO_DIFF32 5 ; src0 address, src1 address, byte offset, tmp a, tmp b
    movu        m%4, [%1 + %3]
    movu        m%5, [%1 + %3 + 16]
    movu        m6, [%2 + %3]
    movu        m7, [%2 + %3 + 16]
    psubw       m%4, m6
    psubw       m%5, m7
    pmaddwd     m%4, m%4
    pmaddwd     m%5, m%5
    paddd       m%4, m%5
    paddd       m0, m%4
%endmacro

; Accumulate one full row: bytes 0..95 always, bytes 96..127 only for the
; 64-wide variant.
%macro SSD_TWO_ROW 3 ; src0 address, src1 address, block width
    SSD_TWO_DIFF64 %1, %2
    SSD_TWO_DIFF32 %1, %2, 64, 1, 2
%if %3 == 64
    SSD_TWO_DIFF32 %1, %2, 96, 3, 4
%endif
%endmacro

; SSD_TWO width, height
; Emits pixel_ssd_ss_<width>x<height> for the wide (48 / 64 sample) blocks.
; Two rows of 16-bit words are processed per loop iteration; the 32-bit sum of
; squared differences is returned in eax.
%macro SSD_TWO 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0                  ; running accumulator
    mov         r4d, %2/2               ; row pairs left
    lea         r5, [r1 * 2]            ; two-row step, source 0
    lea         r6, [r3 * 2]            ; two-row step, source 1
.loop:
    SSD_TWO_ROW r0, r2, %1
    SSD_TWO_ROW r0 + r1, r2 + r3, %1
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD       m0, m5
    movd        eax, xm0
    RET
%endmacro
; Process one 24-sample row of 16-bit words.
; The first two vectors are squared, summed and added to m0; the squared
; differences of the third vector are left in m%3 for the caller to fold in.
; Clobbers m1, m2, m5-m7.
%macro SSD_24_ROW 3 ; src0 address, src1 address, index of the third-vector register
    movu        m1, [%1]
    movu        m5, [%2]
    psubw       m1, m5
    movu        m2, [%1 + 16]
    movu        m6, [%2 + 16]
    psubw       m2, m6
    movu        m%3, [%1 + 32]
    movu        m7, [%2 + 32]
    psubw       m%3, m7
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m%3, m%3
    paddd       m1, m2
    paddd       m0, m1
%endmacro

; SSD_24 width, height
; Emits pixel_ssd_ss_<width>x<height>; two rows are handled per iteration and
; the 32-bit sum of squared differences is returned in eax.
%macro SSD_24 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0                  ; running accumulator
    mov         r4d, %2/2               ; row pairs left
    lea         r5, [r1 * 2]
    lea         r6, [r3 * 2]
.loop:
    SSD_24_ROW  r0, r2, 3
    SSD_24_ROW  r0 + r1, r2 + r3, 4
    paddd       m3, m4                  ; combine the two pending third vectors
    paddd       m0, m3
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD       m0, m5
    movd        eax, xm0
    RET
%endmacro
; SSD_12 width, height
; Emits pixel_ssd_ss_<width>x<height> for 12-sample rows of 16-bit words.
; Four rows are processed per iteration.  For each row pair the first eight
; samples of every row get a vector of their own, while the two 4-sample tails
; share one register (row A in the low half, row B in the high half).
; The 32-bit sum of squared differences is returned in eax.
;
; The high half of the shared register is filled with movhps rather than
; "punpcklqdq reg, [mem]": the legacy-SSE memory form of punpcklqdq requires a
; 16-byte aligned operand and reads 16 bytes (8 of them beyond the row), while
; every other access to these pointers is an unaligned movu.  movhps reads
; exactly the 8 bytes that are needed and yields the same register contents.
%macro SSD_12 2
cglobal pixel_ssd_ss_%1x%2, 4,7,8
    FIX_STRIDES r1, r3
    pxor        m0, m0                  ; running accumulator
    mov         r4d, %2/4               ; groups of four rows left
    lea         r5, [r1 * 2]            ; two-row step, source 0
    lea         r6, [r3 * 2]            ; two-row step, source 1
.loop:
    ; rows 0 and 1
    movu        m1, [r0]
    movh        m2, [r0 + 16]
    movu        m3, [r0 + r1]
    movhps      m2, [r0 + r1 + 16]
    movu        m7, [r2]
    psubw       m1, m7
    movh        m4, [r2 + 16]
    movu        m7, [r2 + r3]
    psubw       m3, m7
    movhps      m4, [r2 + r3 + 16]
    psubw       m2, m4
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3                  ; kept pending, folded in below
    paddd       m1, m2
    paddd       m0, m1

    ; rows 2 and 3
    movu        m1, [r0 + r5]
    movh        m2, [r0 + r5 + 16]
    lea         r0, [r0 + r5]
    movu        m6, [r0 + r1]
    movhps      m2, [r0 + r1 + 16]
    movu        m7, [r2 + r6]
    psubw       m1, m7
    movh        m4, [r2 + r6 + 16]
    lea         r2, [r2 + r6]
    movu        m7, [r2 + r3]
    psubw       m6, m7
    movhps      m4, [r2 + r3 + 16]
    psubw       m2, m4
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m6, m6
    paddd       m1, m2
    paddd       m3, m6
    paddd       m0, m1
    paddd       m0, m3
    lea         r0, [r0 + r5]
    lea         r2, [r2 + r6]
    dec         r4d
    jnz         .loop
    HADDD       m0, m5
    movd        eax, xm0
    RET
%endmacro
; Instantiate the high-bit-depth pixel_ssd_ss_WxH functions.
; Each INIT_* line selects the register width / CPU suffix used by the macro
; invocations that follow it.
; MMX (8-byte registers): block widths 4, 8 and 16.
315 | INIT_MMX mmx2 | |
316 | SSD_ONE 4, 4 | |
317 | SSD_ONE 4, 8 | |
318 | SSD_ONE 4, 16 | |
319 | SSD_ONE 8, 4 | |
320 | SSD_ONE 8, 8 | |
321 | SSD_ONE 8, 16 | |
322 | SSD_ONE 16, 8 | |
323 | SSD_ONE 16, 16 | |
; SSE2 (16-byte registers): widths 8..64; 12, 24, 48 and 64 use dedicated macros.
324 | INIT_XMM sse2 | |
325 | SSD_ONE 8, 4 | |
326 | SSD_ONE 8, 8 | |
327 | SSD_ONE 8, 16 | |
328 | SSD_ONE 8, 32 | |
329 | SSD_12 12, 16 | |
330 | SSD_ONE 16, 4 | |
331 | SSD_ONE 16, 8 | |
332 | SSD_ONE 16, 12 | |
333 | SSD_ONE 16, 16 | |
334 | SSD_ONE 16, 32 | |
335 | SSD_ONE 16, 64 | |
336 | SSD_24 24, 32 | |
337 | SSD_ONE 32, 8 | |
338 | SSD_ONE 32, 16 | |
339 | SSD_ONE 32, 24 | |
340 | SSD_ONE 32, 32 | |
341 | SSD_ONE 32, 64 | |
342 | SSD_TWO 48, 64 | |
343 | SSD_TWO 64, 16 | |
344 | SSD_TWO 64, 32 | |
345 | SSD_TWO 64, 48 | |
346 | SSD_TWO 64, 64 | |
; AVX2 (32-byte registers): only the 16-wide blocks, one row per register.
347 | INIT_YMM avx2 | |
348 | SSD_ONE 16, 8 | |
349 | SSD_ONE 16, 16 | |
350 | %endif ; HIGH_BIT_DEPTH | |
351 | ||
352 | ;----------------------------------------------------------------------------- | |
353 | ; int pixel_ssd_ss_WxH( uint16_t *, intptr_t, uint16_t *, intptr_t ) | |
354 | ;----------------------------------------------------------------------------- | |
355 | %if HIGH_BIT_DEPTH == 0 | |
; m0 += squared 16-bit differences of one vector, loaded with instruction %1
; from [%2] (source 0) and [%3] (source 1).  Clobbers m1, m2.
%macro SSD_SS_ACC 3 ; load instruction, src0 address, src1 address
    %1          m1, [%2]
    %1          m2, [%3]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
%endmacro

; Run SSD_SS_ACC on the four positions described by the offsetN_M defines
; that are in effect at the point of expansion.
%macro SSD_SS_ACC4 1 ; load instruction
    SSD_SS_ACC  %1, r0, r2
    SSD_SS_ACC  %1, r0 + offset0_1, r2 + offset1_1
    SSD_SS_ACC  %1, r0 + offset0_2, r2 + offset1_2
    SSD_SS_ACC  %1, r0 + offset0_3, r2 + offset1_3
%endmacro

; SSD_SS width, height
; Emits pixel_ssd_ss_<width>x<height> for 16-bit inputs with widths 4, 8 and
; 16.  Row steps r1 / r3 are scaled by two in the address expressions below.
; The 32-bit sum of squared differences is returned in eax.
%macro SSD_SS 2
cglobal pixel_ssd_ss_%1x%2, 4,7,6
    FIX_STRIDES r1, r3
%if mmsize == %1*4 || mmsize == %1*2
    ; at most one vector per row: four rows per pass, r5 / r6 = 6 * stride
    %define offset0_1 r1*2
    %define offset0_2 r1*4
    %define offset0_3 r5
    %define offset1_1 r3*2
    %define offset1_2 r3*4
    %define offset1_3 r6
    lea         r5, [4*r1]
    lea         r6, [4*r3]
    lea         r5, [r5 + 2*r1]
    lea         r6, [r6 + 2*r3]
%elif mmsize == %1
    ; two vectors per row: two rows per pass
    %define offset0_1 16
    %define offset0_2 r1*2
    %define offset0_3 r1*2+16
    %define offset1_1 16
    %define offset1_2 r3*2
    %define offset1_3 r3*2+16
%endif
    ; %%n = number of passes over the loop body
%if %1 == 4
    %assign %%n %2/(mmsize/%1)
%else
    %assign %%n %2/(2*mmsize/%1)
%endif
%if %%n > 1
    mov         r4d, %%n
%endif
    pxor        m0, m0                  ; running accumulator
.loop:
%if %1 == 4
    SSD_SS_ACC4 movh                    ; half-register loads for 4-sample rows
%else
    SSD_SS_ACC4 movu
%endif
    lea         r0, [r0+r1*(%2/%%n)*2]
    lea         r2, [r2+r3*(%2/%%n)*2]
%if %%n > 1
    dec         r4d
    jg          .loop
%endif
    ; horizontal reduction; the 4-wide variant only populates two dwords
%if %1 == 4
  %if notcpuflag(ssse3)
    pshufd      m1, m0, 1
    paddd       m0, m1
  %else
    phaddd      m0, m0
  %endif
%else
    HADDD       m0, m1
%endif
    movd        eax, m0
    RET
%endmacro
; SSD_SS_ONE
; Instantiates SSD_SS for every 4-, 8- and 16-wide block size listed below,
; using whichever INIT_* configuration is active at the point of invocation.
450 | %macro SSD_SS_ONE 0 | |
451 | SSD_SS 4, 4 | |
452 | SSD_SS 4, 8 | |
453 | SSD_SS 4, 16 | |
454 | SSD_SS 8, 4 | |
455 | SSD_SS 8, 8 | |
456 | SSD_SS 8, 16 | |
457 | SSD_SS 8, 32 | |
458 | SSD_SS 16, 4 | |
459 | SSD_SS 16, 8 | |
460 | SSD_SS 16, 12 | |
461 | SSD_SS 16, 16 | |
462 | SSD_SS 16, 32 | |
463 | SSD_SS 16, 64 | |
464 | %endmacro | |
465 | ||
; SSD_SS_12x16
; Emits pixel_ssd_ss_12x16 for 16-bit inputs: 8 loop passes of two rows each.
; Every row is covered by one full vector (samples 0..7) plus a half-register
; load (samples 8..11).  The 32-bit sum of squared differences is returned in
; eax.
;
; The tail uses movh, which fetches only the 8 bytes that belong to the row
; and zeroes the upper half of the register.  The zero lanes contribute
; nothing after psubw / pmaddwd, so no masking shifts are needed and nothing
; is read past the 12th sample.
%macro SSD_SS_12x16 0
cglobal pixel_ssd_ss_12x16, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, 8                  ; row pairs left
    pxor        m0, m0                  ; running accumulator
.loop:
%rep 2                                  ; two rows per pass
    movu        m1, [r0]                ; samples 0..7
    movu        m2, [r2]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    movh        m1, [r0 + 16]           ; samples 8..11, upper half zeroed
    movh        m2, [r2 + 16]
    psubw       m1, m2
    pmaddwd     m1, m1
    paddd       m0, m1
    lea         r0, [r0 + 2*r1]         ; next row (2 bytes per sample)
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
506 | ||
; SSD_SS_32 height
; Emits pixel_ssd_ss_32x<height> for 16-bit inputs.  A row is 64 bytes, i.e.
; four 16-byte vectors; two rows are handled per loop pass.  The 32-bit sum of
; squared differences is returned in eax.
%macro SSD_SS_32 1
cglobal pixel_ssd_ss_32x%1, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, %1/2               ; row pairs left
    pxor        m0, m0                  ; running accumulator
.loop:
%rep 2                                  ; two rows per pass
  %assign %%off 0
  %rep 4                                ; four vectors per row
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1                  ; square and pair-sum into dwords
    paddd       m0, m1
    %assign %%off %%off + 16
  %endrep
    lea         r0, [r0 + 2*r1]         ; next row (2 bytes per sample)
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
563 | ||
; SSD_SS_32xN
; Instantiates SSD_SS_32 for each supported height of the 32-wide block.
564 | %macro SSD_SS_32xN 0 | |
565 | SSD_SS_32 8 | |
566 | SSD_SS_32 16 | |
567 | SSD_SS_32 24 | |
568 | SSD_SS_32 32 | |
569 | SSD_SS_32 64 | |
570 | %endmacro | |
571 | ||
; SSD_SS_24
; Emits pixel_ssd_ss_24x32 for 16-bit inputs.  A row is 48 bytes, i.e. three
; 16-byte vectors; 16 loop passes of two rows each cover the 32 rows.  The
; 32-bit sum of squared differences is returned in eax.
%macro SSD_SS_24 0
cglobal pixel_ssd_ss_24x32, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, 16                 ; row pairs left
    pxor        m0, m0                  ; running accumulator
.loop:
%rep 2                                  ; two rows per pass
  %assign %%off 0
  %rep 3                                ; three vectors per row
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1                  ; square and pair-sum into dwords
    paddd       m0, m1
    %assign %%off %%off + 16
  %endrep
    lea         r0, [r0 + 2*r1]         ; next row (2 bytes per sample)
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
618 | ||
; SSD_SS_48
; Emits pixel_ssd_ss_48x64 for 16-bit inputs.  A row is 96 bytes, i.e. six
; 16-byte vectors; 32 loop passes of two rows each cover the 64 rows.  The
; 32-bit sum of squared differences is returned in eax.
%macro SSD_SS_48 0
cglobal pixel_ssd_ss_48x64, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, 32                 ; row pairs left
    pxor        m0, m0                  ; running accumulator
.loop:
%rep 2                                  ; two rows per pass
  %assign %%off 0
  %rep 6                                ; six vectors per row
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1                  ; square and pair-sum into dwords
    paddd       m0, m1
    %assign %%off %%off + 16
  %endrep
    lea         r0, [r0 + 2*r1]         ; next row (2 bytes per sample)
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
695 | ||
; SSD_SS_64 height
; Emits pixel_ssd_ss_64x<height> for 16-bit inputs.  A row is 128 bytes, i.e.
; eight 16-byte vectors; two rows are handled per loop pass.  The 32-bit sum
; of squared differences is returned in eax.
%macro SSD_SS_64 1
cglobal pixel_ssd_ss_64x%1, 4,7,6
    FIX_STRIDES r1, r3
    mov         r4d, %1/2               ; row pairs left
    pxor        m0, m0                  ; running accumulator
.loop:
%rep 2                                  ; two rows per pass
  %assign %%off 0
  %rep 8                                ; eight vectors per row
    movu        m1, [r0 + %%off]
    movu        m2, [r2 + %%off]
    psubw       m1, m2
    pmaddwd     m1, m1                  ; square and pair-sum into dwords
    paddd       m0, m1
    %assign %%off %%off + 16
  %endrep
    lea         r0, [r0 + 2*r1]         ; next row (2 bytes per sample)
    lea         r2, [r2 + 2*r3]
%endrep
    dec         r4d
    jnz         .loop
    HADDD       m0, m1
    movd        eax, m0
    RET
%endmacro
792 | ||
; SSD_SS_64xN
; Instantiates SSD_SS_64 for each supported height of the 64-wide block.
793 | %macro SSD_SS_64xN 0 | |
794 | SSD_SS_64 16 | |
795 | SSD_SS_64 32 | |
796 | SSD_SS_64 48 | |
797 | SSD_SS_64 64 | |
798 | %endmacro | |
799 | ||
; Emit the complete pixel_ssd_ss_WxH family three times, once per CPU suffix
; (sse2, sse4, avx).  All three use 16-byte XMM registers; the same macro
; bodies are expanded under each INIT_XMM configuration.
800 | INIT_XMM sse2 | |
801 | SSD_SS_ONE | |
802 | SSD_SS_12x16 | |
803 | SSD_SS_24 | |
804 | SSD_SS_32xN | |
805 | SSD_SS_48 | |
806 | SSD_SS_64xN | |
807 | INIT_XMM sse4 | |
808 | SSD_SS_ONE | |
809 | SSD_SS_12x16 | |
810 | SSD_SS_24 | |
811 | SSD_SS_32xN | |
812 | SSD_SS_48 | |
813 | SSD_SS_64xN | |
814 | INIT_XMM avx | |
815 | SSD_SS_ONE | |
816 | SSD_SS_12x16 | |
817 | SSD_SS_24 | |
818 | SSD_SS_32xN | |
819 | SSD_SS_48 | |
820 | SSD_SS_64xN | |
821 | %endif ; !HIGH_BIT_DEPTH | |
822 | ||
823 | %if HIGH_BIT_DEPTH == 0 | |
; SSD_LOAD_FULL off0a, off2a, off0b, off2b, advance
; Loads two full vectors from each source with aligned moves:
;   m1 = [t0+%1], m2 = [t2+%2], m3 = [t0+%3], m4 = [t2+%4]
; then steps t0 / t2 forward by one (%5 == 1) or two (%5 == 2) rows, using
; t1 / t3 as the row steps.  Any other %5 leaves the pointers untouched.
824 | %macro SSD_LOAD_FULL 5 | |
825 | mova m1, [t0+%1] | |
826 | mova m2, [t2+%2] | |
827 | mova m3, [t0+%3] | |
828 | mova m4, [t2+%4] | |
829 | %if %5==1 | |
830 | add t0, t1 | |
831 | add t2, t3 | |
832 | %elif %5==2 | |
833 | lea t0, [t0+2*t1] | |
834 | lea t2, [t2+2*t3] | |
835 | %endif | |
836 | %endmacro | |
837 | ||
; LOAD dstA, dstB, memA, memB, advance
; Half-register loads of two source-0 rows: m%1 = %3, m%2 = %4.
; When %5 is non-zero, t0 is then moved forward by two rows (2*t1).
838 | %macro LOAD 5 | |
839 | movh m%1, %3 | |
840 | movh m%2, %4 | |
841 | %if %5 | |
842 | lea t0, [t0+2*t1] | |
843 | %endif | |
844 | %endmacro | |
845 | ||
; JOIN a, b, tmpA, tmpB, memA, memB, advance
; Loads the two source-1 rows %5 / %6 into m%3 / m%4, optionally moves t2
; forward by two rows, and leaves word-sized differences in m%1 and m%2.
; Both operands of each subtraction are byte-interleaved with the same m7, so
; m7's bytes land in the high byte of every word on both sides and cancel in
; psubw; only (src0 - src1) per byte pair remains.
846 | %macro JOIN 7 | |
847 | movh m%3, %5 | |
848 | movh m%4, %6 | |
849 | %if %7 | |
850 | lea t2, [t2+2*t3] | |
851 | %endif | |
852 | punpcklbw m%1, m7 | |
853 | punpcklbw m%3, m7 | |
854 | psubw m%1, m%3 | |
855 | punpcklbw m%2, m7 | |
856 | punpcklbw m%4, m7 | |
857 | psubw m%2, m%4 | |
858 | %endmacro | |
859 | ||
; JOIN_SSE2 a, b, tmpA, tmpB, memA, memB, advance
; SSE2 replacement for JOIN (same parameter list).  Loads the two source-1
; rows, optionally advances t2 by two rows, then packs each pair of half rows
; into one register (m%1 = a:b, m%3 = tmpA:tmpB) before subtracting.
; DEINTB is defined in the included utility macros; presumably it splits the
; packed bytes into two word vectors using the mask in m7 -- TODO confirm.
860 | %macro JOIN_SSE2 7 | |
861 | movh m%3, %5 | |
862 | movh m%4, %6 | |
863 | %if %7 | |
864 | lea t2, [t2+2*t3] | |
865 | %endif | |
866 | punpcklqdq m%1, m%2 | |
867 | punpcklqdq m%3, m%4 | |
868 | DEINTB %2, %1, %4, %3, 7 | |
869 | psubw m%2, m%4 | |
870 | psubw m%1, m%3 | |
871 | %endmacro | |
872 | ||
; JOIN_SSSE3 a, b, tmpA, tmpB, memA, memB, advance
; SSSE3 replacement for JOIN (same parameter list).  Loads the two source-1
; rows, optionally advances t2 by two rows, and byte-interleaves each source-0
; row with its source-1 row.  No subtraction happens here; the interleaved
; bytes are consumed later by pmaddubsw in SSD_CORE_SSSE3.
873 | %macro JOIN_SSSE3 7 | |
874 | movh m%3, %5 | |
875 | movh m%4, %6 | |
876 | %if %7 | |
877 | lea t2, [t2+2*t3] | |
878 | %endif | |
879 | punpcklbw m%1, m%3 | |
880 | punpcklbw m%2, m%4 | |
881 | %endmacro | |
882 | ||
; LOAD_AVX2 dst, unused, memA, memB, advance
; AVX2 replacement for LOAD (same parameter list; %2 is not used).
; Places row %3 in the low 128 bits of m%1 and row %4 in the high 128 bits,
; then, when %5 is non-zero, moves t0 forward by two rows.
883 | %macro LOAD_AVX2 5 | |
884 | mova xm%1, %3 | |
885 | vinserti128 m%1, m%1, %4, 1 | |
886 | %if %5 | |
887 | lea t0, [t0+2*t1] | |
888 | %endif | |
889 | %endmacro | |
890 | ||
; JOIN_AVX2 a, b, tmp, unused, memA, memB, advance
; AVX2 replacement for JOIN (same parameter list; %4 is not used).
; Builds m%2 from the two source-1 rows (low / high 128-bit lanes), optionally
; moves t2 forward by two rows, then runs SBUTTERFLY bw on m%1 / m%2 with m%3
; as scratch.  SBUTTERFLY is defined in the included utility macros;
; presumably it byte-interleaves the two registers -- TODO confirm.
891 | %macro JOIN_AVX2 7 | |
892 | mova xm%2, %5 | |
893 | vinserti128 m%2, m%2, %6, 1 | |
894 | %if %7 | |
895 | lea t2, [t2+2*t3] | |
896 | %endif | |
897 | SBUTTERFLY bw, %1, %2, %3 | |
898 | %endmacro | |
899 | ||
; SSD_LOAD_HALF off0a, off2a, off0b, off2b, advance
; Handles four rows that each fit in half a register: two LOAD/JOIN pairs,
; the first filling m1 / m2 and the second m3 / m4 (m3..m6 serve as temporaries
; along the way).  The first pair always steps both pointers by two rows; the
; second pair steps them only when %5 is non-zero.
; LOAD and JOIN may have been redirected by %define to a CPU-specific variant.
900 | %macro SSD_LOAD_HALF 5 | |
901 | LOAD 1, 2, [t0+%1], [t0+%3], 1 | |
902 | JOIN 1, 2, 3, 4, [t2+%2], [t2+%4], 1 | |
903 | LOAD 3, 4, [t0+%1], [t0+%3], %5 | |
904 | JOIN 3, 4, 5, 6, [t2+%2], [t2+%4], %5 | |
905 | %endmacro | |
906 | ||
; SSD_CORE a, b, c, d, const, tmpA, tmpB [, FULL]
; Baseline squaring step.  On exit m%1..m%4 hold dword sums of squares.
; With the optional 8th argument equal to FULL, m%1/m%2 and m%3/m%4 arrive as
; raw byte vectors (source 0 / source 1):
;   - the absolute difference is formed from two saturating subtractions that
;     are OR'ed together,
;   - the result is widened to words by interleaving with m%5 (the caller is
;     expected to have cleared m%5 for this path -- NOTE(review): confirm).
; Without FULL the inputs are already word differences and are only squared.
907 | %macro SSD_CORE 7-8 | |
908 | %ifidn %8, FULL | |
909 | mova m%6, m%2 | |
910 | mova m%7, m%4 | |
911 | psubusb m%2, m%1 | |
912 | psubusb m%4, m%3 | |
913 | psubusb m%1, m%6 | |
914 | psubusb m%3, m%7 | |
915 | por m%1, m%2 | |
916 | por m%3, m%4 | |
917 | punpcklbw m%2, m%1, m%5 | |
918 | punpckhbw m%1, m%5 | |
919 | punpcklbw m%4, m%3, m%5 | |
920 | punpckhbw m%3, m%5 | |
921 | %endif | |
; pmaddwd x,x squares each word and adds adjacent pairs into dwords.
922 | pmaddwd m%1, m%1 | |
923 | pmaddwd m%2, m%2 | |
924 | pmaddwd m%3, m%3 | |
925 | pmaddwd m%4, m%4 | |
926 | %endmacro | |
927 | ||
; SSD_CORE_SSE2 a, b, c, d, const, tmpA, tmpB [, FULL]
; SSE2 replacement for SSD_CORE (same parameter list).
; With FULL, each source-0 / source-1 byte-vector pair is split by DEINTB
; (defined in the included utility macros, using m%5) into two word vectors
; whose differences are taken with psubw; SWAP then renames the registers so
; that the results end up under the names m%1..m%4 expected below.
; Without FULL the inputs are already word differences and are only squared.
928 | %macro SSD_CORE_SSE2 7-8 | |
929 | %ifidn %8, FULL | |
930 | DEINTB %6, %1, %7, %2, %5 | |
931 | psubw m%6, m%7 | |
932 | psubw m%1, m%2 | |
933 | SWAP %6, %2, %1 | |
934 | DEINTB %6, %3, %7, %4, %5 | |
935 | psubw m%6, m%7 | |
936 | psubw m%3, m%4 | |
937 | SWAP %6, %4, %3 | |
938 | %endif | |
; pmaddwd x,x squares each word and adds adjacent pairs into dwords.
939 | pmaddwd m%1, m%1 | |
940 | pmaddwd m%2, m%2 | |
941 | pmaddwd m%3, m%3 | |
942 | pmaddwd m%4, m%4 | |
943 | %endmacro | |
944 | ||
; SSD_CORE_SSSE3 a, b, c, d, const, tmpA, tmpB [, FULL]
; SSSE3 replacement for SSD_CORE (same parameter list).
; With FULL, the source-0 / source-1 byte vectors are interleaved here (low
; and high halves) and the registers renamed with SWAP; without FULL the
; caller has already interleaved them (see JOIN_SSSE3).
; pmaddubsw against the constant in m%5 then turns each interleaved byte pair
; into one word, and pmaddwd squares those words into dword sums.
; NOTE(review): m%5 is loaded from hsub_mul by the caller; its value is not
; visible here, presumably alternating +1 / -1 bytes so that the word is the
; byte difference -- confirm.
945 | %macro SSD_CORE_SSSE3 7-8 | |
946 | %ifidn %8, FULL | |
947 | punpckhbw m%6, m%1, m%2 | |
948 | punpckhbw m%7, m%3, m%4 | |
949 | punpcklbw m%1, m%2 | |
950 | punpcklbw m%3, m%4 | |
951 | SWAP %6, %2, %3 | |
952 | SWAP %7, %4 | |
953 | %endif | |
954 | pmaddubsw m%1, m%5 | |
955 | pmaddubsw m%2, m%5 | |
956 | pmaddubsw m%3, m%5 | |
957 | pmaddubsw m%4, m%5 | |
958 | pmaddwd m%1, m%1 | |
959 | pmaddwd m%2, m%2 | |
960 | pmaddwd m%3, m%3 | |
961 | pmaddwd m%4, m%4 | |
962 | %endmacro | |
963 | ||
; SSD_ITER kind, off0a, off2a, off0b, off2b, advance
; One loop-body step of the 8-bit SSD: loads with SSD_LOAD_FULL or
; SSD_LOAD_HALF (selected by %1), squares through SSD_CORE (m1..m4 are the
; data, m7 the constant, m5 / m6 scratch) and adds the four dword vectors into
; the accumulator m0.  SSD_CORE may have been redirected by %define to a
; CPU-specific variant.
964 | %macro SSD_ITER 6 | |
965 | SSD_LOAD_%1 %2,%3,%4,%5,%6 | |
966 | SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1 | |
967 | paddd m1, m2 | |
968 | paddd m3, m4 | |
969 | paddd m0, m1 | |
970 | paddd m0, m3 | |
971 | %endmacro | |
972 | ||
973 | ;----------------------------------------------------------------------------- | |
974 | ; int pixel_ssd_WxH( uint8_t *, intptr_t, uint8_t *, intptr_t ) | |
975 | ;----------------------------------------------------------------------------- | |
; SSD width, height
; Emits pixel_ssd_<width>x<height> for 8-bit inputs; the 32-bit sum of squared
; differences is returned in eax.
; Only square sizes carry a loop body.  A non-square size consists of nothing
; but "load the pass count into al, jump to the WxW variant's .startloop", so
; the matching square size must also be instantiated for the same CPU suffix.
976 | %macro SSD 2 | |
; The short non-square stubs use a smaller function alignment.
977 | %if %1 != %2 | |
978 | %assign function_align 8 | |
979 | %else | |
980 | %assign function_align 16 | |
981 | %endif | |
; 0,0,0: no arguments are loaded and no registers reserved at this point; the
; real prologue is issued after .startloop.
982 | cglobal pixel_ssd_%1x%2, 0,0,0 | |
; Pass count: each pass consumes 2*mmsize source bytes.  The counter is 8 bits
; and is tested with a signed branch (jg), so it must stay at or below 127.
983 | mov al, %1*%2/mmsize/2 | |
984 | ||
985 | %if %1 != %2 | |
986 | jmp mangle(x265_pixel_ssd_%1x%1 %+ SUFFIX %+ .startloop) | |
987 | %else | |
988 | ||
; Shared entry point: al already holds the pass count.
989 | .startloop: | |
; t0 / t2 = source pointers, t1 / t3 = their row steps.  On x86-64 they alias
; the argument registers directly; on x86-32 they are loaded from the stack.
990 | %if ARCH_X86_64 | |
991 | DECLARE_REG_TMP 0,1,2,3 | |
992 | PROLOGUE 0,0,8 | |
993 | %else | |
994 | PROLOGUE 0,5 | |
995 | DECLARE_REG_TMP 1,2,3,4 | |
996 | mov t0, r0m | |
997 | mov t1, r1m | |
998 | mov t2, r2m | |
999 | mov t3, r3m | |
1000 | %endif | |
1001 | ||
; m7 is the constant handed to SSD_CORE / JOIN; its contents depend on the CPU
; variant.  It is deliberately left unset for the baseline half-width path,
; where JOIN cancels whatever m7 contains.
1002 | %if cpuflag(ssse3) | |
1003 | mova m7, [hsub_mul] | |
1004 | %elifidn cpuname, sse2 | |
1005 | mova m7, [pw_00ff] | |
1006 | %elif %1 >= mmsize | |
1007 | pxor m7, m7 | |
1008 | %endif | |
; m0 is the running per-dword accumulator.
1009 | pxor m0, m0 | |
1010 | ||
1011 | ALIGN 16 | |
1012 | .loop: | |
; Row wider than a register: one row per pass, two vectors side by side.
1013 | %if %1 > mmsize | |
1014 | SSD_ITER FULL, 0, 0, mmsize, mmsize, 1 | |
; Row exactly one register wide: two rows per pass.
1015 | %elif %1 == mmsize | |
1016 | SSD_ITER FULL, 0, 0, t1, t3, 2 | |
; Row narrower than a register: four rows per pass via the HALF loader.
1017 | %else | |
1018 | SSD_ITER HALF, 0, 0, t1, t3, 2 | |
1019 | %endif | |
1020 | dec al | |
1021 | jg .loop | |
; Horizontal reduction; 32-byte registers first fold the upper lane down.
1022 | %if mmsize==32 | |
1023 | vextracti128 xm1, m0, 1 | |
1024 | paddd xm0, xm1 | |
1025 | HADDD xm0, xm1 | |
1026 | movd eax, xm0 | |
1027 | %else | |
1028 | HADDD m0, m1 | |
1029 | movd eax, m0 | |
1030 | %endif | |
; MMX builds must clear the MMX state before returning.
1031 | %if (mmsize == 8) | |
1032 | emms | |
1033 | %endif | |
1034 | RET | |
1035 | %endif | |
1036 | %endmacro | |
1037 | ||
; HEVC_SSD
; Instantiates SSD for the block sizes listed below under the active INIT_*
; configuration.  Every non-square entry jumps into the square entry of the
; same width, so the list has to contain that square size as well.
1038 | %macro HEVC_SSD 0 | |
1039 | SSD 32, 64 | |
1040 | SSD 16, 64 | |
1041 | SSD 32, 32 | |
1042 | SSD 32, 16 | |
1043 | SSD 16, 32 | |
1044 | SSD 32, 8 | |
1045 | SSD 8, 32 | |
1046 | SSD 32, 24 | |
1047 | SSD 24, 24 ; not used, but resolves x265_pixel_ssd_24x24_sse2.startloop symbol | |
1048 | SSD 8, 4 | |
1049 | SSD 8, 8 | |
1050 | SSD 16, 16 | |
1051 | SSD 16, 12 | |
1052 | SSD 16, 8 | |
1053 | SSD 8, 16 | |
1054 | SSD 16, 4 | |
1055 | %endmacro | |
1056 | ||
; Instantiate the 8-bit pixel_ssd_WxH functions for each CPU variant.
; SSD_CORE, JOIN and LOAD are redirected with %define between groups; a
; redirection stays in effect for every later group until it is replaced.
; Baseline MMX: default SSD_CORE / JOIN / LOAD.
1057 | INIT_MMX mmx | |
1058 | SSD 16, 16 | |
1059 | SSD 16, 8 | |
1060 | SSD 8, 8 | |
1061 | SSD 8, 16 | |
1062 | SSD 4, 4 | |
1063 | SSD 8, 4 | |
1064 | SSD 4, 8 | |
1065 | SSD 4, 16 | |
; sse2slow: XMM registers, still with the default SSD_CORE / JOIN.
1066 | INIT_XMM sse2slow | |
1067 | SSD 16, 16 | |
1068 | SSD 8, 8 | |
1069 | SSD 16, 8 | |
1070 | SSD 8, 16 | |
1071 | SSD 8, 4 | |
; sse2: switch to the SSE2 core and join, then emit the full HEVC size list.
1072 | INIT_XMM sse2 | |
1073 | %define SSD_CORE SSD_CORE_SSE2 | |
1074 | %define JOIN JOIN_SSE2 | |
1075 | HEVC_SSD | |
; ssse3 and avx share the SSSE3 core and join.
1076 | INIT_XMM ssse3 | |
1077 | %define SSD_CORE SSD_CORE_SSSE3 | |
1078 | %define JOIN JOIN_SSSE3 | |
1079 | HEVC_SSD | |
1080 | INIT_XMM avx | |
1081 | HEVC_SSD | |
; 4-wide blocks with MMX registers but the SSSE3 core.
1082 | INIT_MMX ssse3 | |
1083 | SSD 4, 4 | |
1084 | SSD 4, 8 | |
1085 | SSD 4, 16 | |
1086 | INIT_XMM xop | |
1087 | SSD 16, 16 | |
1088 | SSD 8, 8 | |
1089 | SSD 16, 8 | |
1090 | SSD 8, 16 | |
1091 | SSD 8, 4 | |
; avx2: 32-byte registers hold two 16-byte rows, hence the lane-aware loaders.
1092 | %define LOAD LOAD_AVX2 | |
1093 | %define JOIN JOIN_AVX2 | |
1094 | INIT_YMM avx2 | |
1095 | SSD 16, 16 | |
1096 | SSD 16, 8 | |
; Restore the function alignment that SSD may have lowered to 8.
1097 | %assign function_align 16 | |
1098 | %endif ; !HIGH_BIT_DEPTH | |
1099 | ||
1100 | ;----------------------------------------------------------------------------- | |
1101 | ; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) | |
1102 | ;----------------------------------------------------------------------------- | |
; pixel_ssd_12x16 (SSE4), 8-bit inputs.
; r0 / r2 = source pointers, r1 / r3 = their row steps; the 32-bit sum of
; squared differences is returned in eax.
; Four loop passes of four rows each.  For every row pair the first eight
; bytes of each row are widened separately, while bytes 8..11 of both rows are
; gathered into one register with punpckhdq and widened together.
INIT_XMM sse4
cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2
    mov         r4d, 4                  ; groups of four rows left
    pxor        m6, m6                  ; running accumulator
.loop:
%rep 2                                  ; two row pairs per pass
    movu        m0, [r0]
    movu        m2, [r0 + r1]
    movu        m1, [r2]
    movu        m3, [r2 + r3]

    ; low 8 bytes of m4 / m5 = bytes 8..11 of the upper row, then of the lower row
    punpckhdq   m4, m0, m2
    punpckhdq   m5, m1, m3

    ; widen to words and subtract, one source pair at a time
    pmovzxbw    m0, m0
    pmovzxbw    m1, m1
    psubw       m0, m1
    pmovzxbw    m2, m2
    pmovzxbw    m3, m3
    psubw       m2, m3
    pmovzxbw    m4, m4
    pmovzxbw    m5, m5
    psubw       m4, m5

    ; square, pair-sum into dwords and accumulate
    pmaddwd     m0, m0
    pmaddwd     m2, m2
    pmaddwd     m4, m4
    paddd       m0, m2
    paddd       m6, m4
    paddd       m6, m0

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endrep
    dec         r4d
    jnz         .loop

    HADDD       m6, m1
    movd        eax, m6
    RET
1175 | ||
1176 | ;----------------------------------------------------------------------------- | |
1177 | ; int pixel_ssd_24x32( uint8_t *, intptr_t, uint8_t *, intptr_t ) | |
1178 | ;----------------------------------------------------------------------------- | |
; Accumulate into m7 the sum of squared differences of one 24-byte row.
; m6 must be zero (it widens the upper eight bytes of the 16-byte load).
; Clobbers m0-m5.
%macro SSD_24x32_ROW 2 ; src0 address, src1 address
    movu        m1, [%1]
    pmovzxbw    m0, m1                  ; bytes 0..7  -> words
    punpckhbw   m1, m6                  ; bytes 8..15 -> words
    movu        m4, [%2]
    pmovzxbw    m3, m4
    punpckhbw   m4, m6
    psubw       m0, m3
    psubw       m1, m4
    pmovzxbw    m2, [%1 + 16]           ; bytes 16..23 -> words
    pmovzxbw    m5, [%2 + 16]
    psubw       m2, m5
    pmaddwd     m0, m0
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    paddd       m0, m1
    paddd       m7, m2
    paddd       m7, m0
%endmacro

; pixel_ssd_24x32 (SSE4), 8-bit inputs.
; r0 / r2 = source pointers, r1 / r3 = their row steps; the 32-bit sum of
; squared differences is returned in eax.  16 loop passes of two rows each.
INIT_XMM sse4
cglobal pixel_ssd_24x32, 4, 5, 8, src1, stride1, src2, stride2
    pxor        m6, m6                  ; zero, used for byte widening
    pxor        m7, m7                  ; running accumulator
    mov         r4d, 16                 ; row pairs left
.loop:
    SSD_24x32_ROW r0, r2
    SSD_24x32_ROW r0 + r1, r2 + r3
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    dec         r4d
    jnz         .loop

    HADDD       m7, m1
    movd        eax, m7
    RET
1238 | ||
; PIXEL_SSD_16x4: add to m7 the squared differences of a 16-byte-wide, 4-row block of
; 8-bit samples read from r0 (row stride r1) and r2 (row stride r3).
; Expected on entry: m6 = 0 (used by punpckhbw to widen the upper 8 bytes) and
; r6 = 2 * r1 (the callers in this file set it with "lea r6, [r1 * 2]").
; Side effects: r0 and r2 are left two rows below their entry value; m0-m5 are overwritten.
1239 | %macro PIXEL_SSD_16x4 0 | |
; Row 0: src1 low/high halves -> m0/m1, src2 low/high halves -> m2/m3.
1240 | movu m1, [r0] | |
1241 | pmovzxbw m0, m1 | |
1242 | punpckhbw m1, m6 | |
1243 | movu m3, [r2] | |
1244 | pmovzxbw m2, m3 | |
1245 | punpckhbw m3, m6 | |
1246 | ||
1247 | psubw m0, m2 | |
1248 | psubw m1, m3 | |
1249 | ||
; Row 1: src1 halves -> m4/m5; m2/m3 are reused for src2.
1250 | movu m5, [r0 + r1] | |
1251 | pmovzxbw m4, m5 | |
1252 | punpckhbw m5, m6 | |
1253 | movu m3, [r2 + r3] | |
1254 | pmovzxbw m2, m3 | |
1255 | punpckhbw m3, m6 | |
1256 | ||
1257 | psubw m4, m2 | |
1258 | psubw m5, m3 | |
1259 | ||
; Square the word differences (pmaddwd with itself also adds adjacent pairs) and accumulate.
1260 | pmaddwd m0, m0 | |
1261 | pmaddwd m1, m1 | |
1262 | pmaddwd m4, m4 | |
1263 | pmaddwd m5, m5 | |
1264 | ||
1265 | paddd m0, m1 | |
1266 | paddd m4, m5 | |
1267 | paddd m4, m0 | |
1268 | paddd m7, m4 | |
1269 | ||
; Row 2, addressed from the entry pointers via r6 and 2 * r3.
1270 | movu m1, [r0 + r6] | |
1271 | pmovzxbw m0, m1 | |
1272 | punpckhbw m1, m6 | |
1273 | movu m3, [r2 + 2 * r3] | |
1274 | pmovzxbw m2, m3 | |
1275 | punpckhbw m3, m6 | |
1276 | ||
1277 | psubw m0, m2 | |
1278 | psubw m1, m3 | |
1279 | ||
; Move both pointers down two rows so row 3 is reachable as [ptr + stride].
1280 | lea r0, [r0 + r6] | |
1281 | lea r2, [r2 + 2 * r3] | |
1282 | movu m5, [r0 + r1] | |
1283 | pmovzxbw m4, m5 | |
1284 | punpckhbw m5, m6 | |
1285 | movu m3, [r2 + r3] | |
1286 | pmovzxbw m2, m3 | |
1287 | punpckhbw m3, m6 | |
1288 | ||
1289 | psubw m4, m2 | |
1290 | psubw m5, m3 | |
1291 | ||
1292 | pmaddwd m0, m0 | |
1293 | pmaddwd m1, m1 | |
1294 | pmaddwd m4, m4 | |
1295 | pmaddwd m5, m5 | |
1296 | ||
1297 | paddd m0, m1 | |
1298 | paddd m4, m5 | |
1299 | paddd m4, m0 | |
1300 | paddd m7, m4 | |
1301 | %endmacro | |
1302 | ||
; pixel_ssd_16x16_internal: shared subroutine that accumulates a 16x16 block into m7 by
; expanding PIXEL_SSD_16x4 four times. It takes its inputs in registers set up by the
; caller: r0/r1 and r2/r3 as pointer/stride pairs, r6 = 2 * r1, m6 = 0, m7 = running sum.
; Each macro expansion moves r0/r2 down two rows and each lea pair adds two more, so on
; return the pointers sit 14 rows below their entry value; callers add the last two rows.
1303 | cglobal pixel_ssd_16x16_internal | |
1304 | PIXEL_SSD_16x4 | |
1305 | lea r0, [r0 + r6] | |
1306 | lea r2, [r2 + 2 * r3] | |
1307 | PIXEL_SSD_16x4 | |
1308 | lea r0, [r0 + r6] | |
1309 | lea r2, [r2 + 2 * r3] | |
1310 | PIXEL_SSD_16x4 | |
1311 | lea r0, [r0 + r6] | |
1312 | lea r2, [r2 + 2 * r3] | |
1313 | PIXEL_SSD_16x4 | |
1314 | ret | |
1315 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_48x64( uint8_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD of a 48x64 block of 8-bit samples, walked as three 16-byte-wide columns
; of four pixel_ssd_16x16_internal calls each.
;   r4 / r5 : entry values of src1 / src2, used to start every later column
;   r6      : 2 * stride1, read by the helper and by the row advance below
;   m6 / m7 : zero / running sum, as the helper expects
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_48x64, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6
    pxor    m7, m7
    lea     r6, [r1 * 2]
    mov     r4, r0
    mov     r5, r2

%assign ssd_col 0                           ; byte offset of the current column
%rep 3
  %if ssd_col > 0
    lea     r0, [r4 + ssd_col]              ; rewind to the top of the next column
    lea     r2, [r5 + ssd_col]
  %endif
  %rep 3
    call    pixel_ssd_16x16_internal
    lea     r0, [r0 + r6]                   ; helper stops 14 rows down; add the last 2
    lea     r2, [r2 + 2 * r3]
  %endrep
    call    pixel_ssd_16x16_internal
  %assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1                          ; fold the dword lanes, m1 is scratch
    movd    eax, m7
    RET
1367 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_64x16( uint8_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD of a 64x16 block of 8-bit samples: one pixel_ssd_16x16_internal call for
; each of the four 16-byte-wide columns.
;   r4 / r5 : entry values of src1 / src2
;   r6      : 2 * stride1 (helper input)
;   m6 / m7 : zero / running sum
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x16, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6
    pxor    m7, m7
    lea     r6, [r1 * 2]
    mov     r4, r0
    mov     r5, r2

    call    pixel_ssd_16x16_internal        ; column at byte offset 0
%assign ssd_col 16
%rep 3
    lea     r0, [r4 + ssd_col]              ; columns at byte offsets 16, 32, 48
    lea     r2, [r5 + ssd_col]
    call    pixel_ssd_16x16_internal
  %assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1
    movd    eax, m7
    RET
1395 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_64x32( uint8_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD of a 64x32 block of 8-bit samples: four 16-byte-wide columns, two
; pixel_ssd_16x16_internal calls per column.
;   r4 / r5 : entry values of src1 / src2
;   r6      : 2 * stride1 (helper input and row advance)
;   m6 / m7 : zero / running sum
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x32, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6
    pxor    m7, m7
    lea     r6, [r1 * 2]
    mov     r4, r0
    mov     r5, r2

%assign ssd_col 0                           ; byte offset of the current column
%rep 4
  %if ssd_col > 0
    lea     r0, [r4 + ssd_col]
    lea     r2, [r5 + ssd_col]
  %endif
    call    pixel_ssd_16x16_internal        ; rows 0-15
    lea     r0, [r0 + r6]                   ; helper stops 14 rows down; add the last 2
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_16x16_internal        ; rows 16-31
  %assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1
    movd    eax, m7
    RET
1435 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_64x48( uint8_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD of a 64x48 block of 8-bit samples: four 16-byte-wide columns, three
; pixel_ssd_16x16_internal calls per column.
;   r4 / r5 : entry values of src1 / src2
;   r6      : 2 * stride1 (helper input and row advance)
;   m6 / m7 : zero / running sum
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x48, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6
    pxor    m7, m7
    lea     r6, [r1 * 2]
    mov     r4, r0
    mov     r5, r2

%assign ssd_col 0                           ; byte offset of the current column
%rep 4
  %if ssd_col > 0
    lea     r0, [r4 + ssd_col]
    lea     r2, [r5 + ssd_col]
  %endif
  %rep 2
    call    pixel_ssd_16x16_internal
    lea     r0, [r0 + r6]                   ; helper stops 14 rows down; add the last 2
    lea     r2, [r2 + 2 * r3]
  %endrep
    call    pixel_ssd_16x16_internal
  %assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1
    movd    eax, m7
    RET
1487 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_64x64( uint8_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD of a 64x64 block of 8-bit samples: four 16-byte-wide columns, four
; pixel_ssd_16x16_internal calls per column.
;   r4 / r5 : entry values of src1 / src2
;   r6      : 2 * stride1 (helper input and row advance)
;   m6 / m7 : zero / running sum
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_64x64, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6
    pxor    m7, m7
    lea     r6, [r1 * 2]
    mov     r4, r0
    mov     r5, r2

%assign ssd_col 0                           ; byte offset of the current column
%rep 4
  %if ssd_col > 0
    lea     r0, [r4 + ssd_col]
    lea     r2, [r5 + ssd_col]
  %endif
  %rep 3
    call    pixel_ssd_16x16_internal
    lea     r0, [r0 + r6]                   ; helper stops 14 rows down; add the last 2
    lea     r2, [r2 + 2 * r3]
  %endrep
    call    pixel_ssd_16x16_internal
  %assign ssd_col ssd_col + 16
%endrep
%undef ssd_col

    HADDD   m7, m1
    movd    eax, m7
    RET
1551 | ||
1552 | ;----------------------------------------------------------------------------- | |
1553 | ; int pixel_ssd_sp ( int16_t *, intptr_t, uint8_t *, intptr_t ) | |
1554 | ;----------------------------------------------------------------------------- | |
1555 | ||
; pixel_ssd_sp_4x4_internal: shared subroutine that adds to m7 the squared differences
; between a 4x4 block of 16-bit samples at r0 and a 4x4 block of 8-bit samples at r2.
; Register inputs prepared by the callers in this file: r1 = src1 row stride in bytes
; (already doubled), r4 = 3 * r1, r3 = src2 row stride, m7 = running sum.
; Side effects: r2 is advanced by 2 * r3 while r0 is left unchanged; m0-m6 are overwritten
; (m6 is used as a data register here, not as a zero constant).
1556 | cglobal pixel_ssd_sp_4x4_internal | |
; Rows 0-1: two 8-byte src1 rows packed into m0; two 4-byte src2 rows packed and widened in m2.
1557 | movh m0, [r0] | |
1558 | movh m1, [r0 + r1] | |
1559 | punpcklqdq m0, m1 | |
1560 | movd m2, [r2] | |
1561 | movd m3, [r2 + r3] | |
1562 | punpckldq m2, m3 | |
1563 | pmovzxbw m2, m2 | |
1564 | psubw m0, m2 | |
; Rows 2-3: same packing into m4 (src1) and m6 (src2); r2 moves down two rows on the way.
1565 | movh m4, [r0 + 2 * r1] | |
1566 | movh m5, [r0 + r4] | |
1567 | punpcklqdq m4, m5 | |
1568 | movd m6, [r2 + 2 * r3] | |
1569 | lea r2, [r2 + 2 * r3] | |
1570 | movd m1, [r2 + r3] | |
1571 | punpckldq m6, m1 | |
1572 | pmovzxbw m6, m6 | |
1573 | psubw m4, m6 | |
; Square the word differences (pmaddwd with itself adds adjacent pairs) and accumulate.
1574 | pmaddwd m0, m0 | |
1575 | pmaddwd m4, m4 | |
1576 | paddd m0, m4 | |
1577 | paddd m7, m0 | |
1578 | ret | |
1579 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_4x4( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 4x4 block of 16-bit samples (src1) and a 4x4 block of 8-bit
; samples (src2). The arithmetic is done by pixel_ssd_sp_4x4_internal; this
; entry point prepares its register inputs and reduces the result.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x4, 4, 5, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 3]                    ; r4 = offset of row 3 in src1
    pxor    m7, m7                          ; m7 = running sum
    call    pixel_ssd_sp_4x4_internal
    HADDD   m7, m1                          ; fold the dword lanes, m1 is scratch
    movd    eax, m7
    RET
1592 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_4x8( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 4x8 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as two stacked pixel_ssd_sp_4x4_internal calls.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x8, 4, 5, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 3]                    ; r4 = offset of row 3 in src1
    pxor    m7, m7                          ; m7 = running sum

    call    pixel_ssd_sp_4x4_internal       ; rows 0-3
    lea     r0, [r0 + 4 * r1]               ; helper leaves r0 untouched: skip 4 rows
    lea     r2, [r2 + 2 * r3]               ; helper already moved r2 by 2 rows: add 2 more
    call    pixel_ssd_sp_4x4_internal       ; rows 4-7

    HADDD   m7, m1
    movd    eax, m7
    RET
1608 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_4x16( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 4x16 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as four stacked pixel_ssd_sp_4x4_internal calls.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_4x16, 4, 5, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 3]                    ; r4 = offset of row 3 in src1
    pxor    m7, m7                          ; m7 = running sum

%rep 3
    call    pixel_ssd_sp_4x4_internal
    lea     r0, [r0 + 4 * r1]               ; helper leaves r0 untouched: skip 4 rows
    lea     r2, [r2 + 2 * r3]               ; helper already moved r2 by 2 rows: add 2 more
%endrep
    call    pixel_ssd_sp_4x4_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
1630 | ||
; pixel_ssd_sp_8x4_internal: shared subroutine that adds to m7 the squared differences
; between an 8-wide, 4-row block of 16-bit samples at r0 and 8-bit samples at r2.
; Register inputs prepared by the callers in this file: r1 = src1 row stride in bytes
; (already doubled), r4 = 3 * r1, r3 = src2 row stride, r5 = 3 * r3, m7 = running sum.
; r0 and r2 are not modified; m0-m5 are overwritten.
1631 | cglobal pixel_ssd_sp_8x4_internal | |
; Rows 0-1: 16-byte src1 rows in m0/m1; 8-byte src2 rows widened to words in m2/m3.
1632 | movu m0, [r0] | |
1633 | movu m1, [r0 + r1] | |
1634 | movh m2, [r2] | |
1635 | movh m3, [r2 + r3] | |
1636 | pmovzxbw m2, m2 | |
1637 | pmovzxbw m3, m3 | |
1638 | ||
1639 | psubw m0, m2 | |
1640 | psubw m1, m3 | |
1641 | ||
; Rows 2-3: src1 rows in m4/m5; m2/m3 are reused for src2.
1642 | movu m4, [r0 + 2 * r1] | |
1643 | movu m5, [r0 + r4] | |
1644 | movh m2, [r2 + 2 * r3] | |
1645 | movh m3, [r2 + r5] | |
1646 | pmovzxbw m2, m2 | |
1647 | pmovzxbw m3, m3 | |
1648 | ||
1649 | psubw m4, m2 | |
1650 | psubw m5, m3 | |
1651 | ||
; Square the word differences (pmaddwd with itself adds adjacent pairs) and accumulate.
1652 | pmaddwd m0, m0 | |
1653 | pmaddwd m1, m1 | |
1654 | pmaddwd m4, m4 | |
1655 | pmaddwd m5, m5 | |
1656 | ||
1657 | paddd m0, m1 | |
1658 | paddd m4, m5 | |
1659 | paddd m4, m0 | |
1660 | paddd m7, m4 | |
1661 | ret | |
1662 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x4( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between an 8x4 block of 16-bit samples (src1) and 8-bit samples (src2):
; a single pixel_ssd_sp_8x4_internal call plus the final reduction.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x4, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 3]                    ; r4 = offset of row 3 in src1
    lea     r5, [r3 * 3]                    ; r5 = offset of row 3 in src2
    pxor    m7, m7                          ; m7 = running sum
    call    pixel_ssd_sp_8x4_internal
    HADDD   m7, m1                          ; fold the dword lanes, m1 is scratch
    movd    eax, m7
    RET
1676 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x8( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between an 8x8 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as two stacked pixel_ssd_sp_8x4_internal calls.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x8, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 3]                    ; r4 = offset of row 3 in src1
    lea     r5, [r3 * 3]                    ; r5 = offset of row 3 in src2
    pxor    m7, m7                          ; m7 = running sum

    call    pixel_ssd_sp_8x4_internal       ; rows 0-3
    lea     r0, [r0 + 4 * r1]               ; helper moves neither pointer: skip 4 rows
    lea     r2, [r2 + 4 * r3]
    call    pixel_ssd_sp_8x4_internal       ; rows 4-7

    HADDD   m7, m1
    movd    eax, m7
    RET
1693 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x16( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between an 8x16 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as four stacked pixel_ssd_sp_8x4_internal calls.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x16, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 3]                    ; r4 = offset of row 3 in src1
    lea     r5, [r3 * 3]                    ; r5 = offset of row 3 in src2
    pxor    m7, m7                          ; m7 = running sum

%rep 3
    call    pixel_ssd_sp_8x4_internal
    lea     r0, [r0 + 4 * r1]               ; helper moves neither pointer: skip 4 rows
    lea     r2, [r2 + 4 * r3]
%endrep
    call    pixel_ssd_sp_8x4_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
1716 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_8x32( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between an 8x32 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as eight stacked pixel_ssd_sp_8x4_internal calls.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_8x32, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 3]                    ; r4 = offset of row 3 in src1
    lea     r5, [r3 * 3]                    ; r5 = offset of row 3 in src2
    pxor    m7, m7                          ; m7 = running sum

%rep 7
    call    pixel_ssd_sp_8x4_internal
    lea     r0, [r0 + 4 * r1]               ; helper moves neither pointer: skip 4 rows
    lea     r2, [r2 + 4 * r3]
%endrep
    call    pixel_ssd_sp_8x4_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
1751 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_12x16( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 12x16 block of 16-bit samples (src1) and 8-bit samples (src2).
; The block is split into a 4-wide column (pixel_ssd_sp_4x4_internal x 4)
; followed by an 8-wide column (pixel_ssd_sp_8x4_internal x 4).
;   r5 / r6 : entry values of src1 / src2 while the 4-wide column is processed;
;             r5 is then reused as 3 * stride2 for the 8-wide helper
;   r4      : 3 * stride1 (in bytes), m7 : running sum
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_12x16, 4, 7, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 3]
    mov     r5, r0
    mov     r6, r2
    pxor    m7, m7

    ; samples 0-3 of every row
%rep 3
    call    pixel_ssd_sp_4x4_internal
    lea     r0, [r0 + 4 * r1]               ; helper leaves r0 untouched: skip 4 rows
    lea     r2, [r2 + 2 * r3]               ; helper already moved r2 by 2 rows: add 2 more
%endrep
    call    pixel_ssd_sp_4x4_internal

    ; samples 4-11 of every row: 4 samples in = 8 bytes of src1, 4 bytes of src2
    lea     r0, [r5 + 8]
    lea     r2, [r6 + 4]
    lea     r5, [r3 * 3]                    ; r5 is free now; the 8x4 helper needs 3 * stride2
%rep 3
    call    pixel_ssd_sp_8x4_internal
    lea     r0, [r0 + 4 * r1]               ; this helper moves neither pointer
    lea     r2, [r2 + 4 * r3]
%endrep
    call    pixel_ssd_sp_8x4_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
1788 | ||
; PIXEL_SSD_SP_16x4: add to m7 the squared differences between a 16-wide, 4-row block of
; 16-bit samples at r0 (two 16-byte loads per row) and 8-bit samples at r2 (one 16-byte
; load per row, widened to words).
; Expected on entry: m6 = 0 (used by punpckhbw to widen the upper 8 bytes), r1 = src1 row
; stride in bytes (the callers in this file double it first), r3 = src2 row stride.
; Side effects: r0 is advanced by 2 * r1 and r2 by 2 * r3; m0-m5 are overwritten.
1789 | %macro PIXEL_SSD_SP_16x4 0 | |
; Row 0: src1 samples 0-7 / 8-15 in m0/m1; src2 widened into m2/m3.
1790 | movu m0, [r0] | |
1791 | movu m1, [r0 + 16] | |
1792 | movu m3, [r2] | |
1793 | pmovzxbw m2, m3 | |
1794 | punpckhbw m3, m6 | |
1795 | ||
1796 | psubw m0, m2 | |
1797 | psubw m1, m3 | |
1798 | ||
; Row 1: src1 in m4/m5; m2/m3 are reused for src2.
1799 | movu m4, [r0 + r1] | |
1800 | movu m5, [r0 + r1 +16] | |
1801 | movu m3, [r2 + r3] | |
1802 | pmovzxbw m2, m3 | |
1803 | punpckhbw m3, m6 | |
1804 | ||
1805 | psubw m4, m2 | |
1806 | psubw m5, m3 | |
1807 | ||
; Square the word differences (pmaddwd with itself adds adjacent pairs) and accumulate.
1808 | pmaddwd m0, m0 | |
1809 | pmaddwd m1, m1 | |
1810 | pmaddwd m4, m4 | |
1811 | pmaddwd m5, m5 | |
1812 | ||
1813 | paddd m0, m1 | |
1814 | paddd m4, m5 | |
1815 | paddd m4, m0 | |
1816 | paddd m7, m4 | |
1817 | ||
; Row 2, addressed from the entry pointers.
1818 | movu m0, [r0 + 2 * r1] | |
1819 | movu m1, [r0 + 2 * r1 + 16] | |
1820 | movu m3, [r2 + 2 * r3] | |
1821 | pmovzxbw m2, m3 | |
1822 | punpckhbw m3, m6 | |
1823 | ||
1824 | psubw m0, m2 | |
1825 | psubw m1, m3 | |
1826 | ||
; Move both pointers down two rows so row 3 is reachable as [ptr + stride].
1827 | lea r0, [r0 + 2 * r1] | |
1828 | lea r2, [r2 + 2 * r3] | |
1829 | movu m4, [r0 + r1] | |
1830 | movu m5, [r0 + r1 + 16] | |
1831 | movu m3, [r2 + r3] | |
1832 | pmovzxbw m2, m3 | |
1833 | punpckhbw m3, m6 | |
1834 | ||
1835 | psubw m4, m2 | |
1836 | psubw m5, m3 | |
1837 | ||
1838 | pmaddwd m0, m0 | |
1839 | pmaddwd m1, m1 | |
1840 | pmaddwd m4, m4 | |
1841 | pmaddwd m5, m5 | |
1842 | ||
1843 | paddd m0, m1 | |
1844 | paddd m4, m5 | |
1845 | paddd m4, m0 | |
1846 | paddd m7, m4 | |
1847 | %endmacro | |
1848 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x4( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 16x4 block of 16-bit samples (src1) and 8-bit samples (src2):
; one PIXEL_SSD_SP_16x4 expansion plus the final reduction.
;
; Only r0-r3 are referenced (the macro addresses rows with r1 / r3 directly),
; so cglobal is asked for 4 general-purpose registers, the same as
; pixel_ssd_sp_16x8. Asking for more only makes the prologue save registers
; that are never used.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x4, 4, 4, 8, src1, stride1, src2, stride2
    pxor    m6, m6                          ; m6 = 0, required by the macro's punpckhbw
    pxor    m7, m7                          ; m7 = running sum
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    PIXEL_SSD_SP_16x4
    HADDD   m7, m1                          ; fold the dword lanes, m1 is scratch
    movd    eax, m7
    RET
1863 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x8( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 16x8 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as two stacked PIXEL_SSD_SP_16x4 expansions.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x8, 4, 4, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    pxor    m7, m7                          ; m7 = running sum
    pxor    m6, m6                          ; m6 = 0, required by the macro's punpckhbw

    PIXEL_SSD_SP_16x4                       ; rows 0-3, leaves r0/r2 two rows down
    lea     r0, [r0 + 2 * r1]               ; add the remaining two rows
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4                       ; rows 4-7

    HADDD   m7, m1
    movd    eax, m7
    RET
1880 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x12( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 16x12 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as three stacked PIXEL_SSD_SP_16x4 expansions.
;   r4 / r5 : 2 * stride1 (bytes) / 2 * stride2, the step between expansions
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x12, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]
    lea     r5, [r3 * 2]
    pxor    m7, m7                          ; m7 = running sum
    pxor    m6, m6                          ; m6 = 0, required by the macro's punpckhbw

%rep 2
    PIXEL_SSD_SP_16x4                       ; leaves r0/r2 two rows down
    lea     r0, [r0 + r4]                   ; add the remaining two rows
    lea     r2, [r2 + r5]
%endrep
    PIXEL_SSD_SP_16x4

    HADDD   m7, m1
    movd    eax, m7
    RET
1902 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x16( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 16x16 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as four stacked PIXEL_SSD_SP_16x4 expansions.
;   r4 / r5 : 2 * stride1 (bytes) / 2 * stride2, the step between expansions
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x16, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]
    lea     r5, [r3 * 2]
    pxor    m7, m7                          ; m7 = running sum
    pxor    m6, m6                          ; m6 = 0, required by the macro's punpckhbw

%rep 3
    PIXEL_SSD_SP_16x4                       ; leaves r0/r2 two rows down
    lea     r0, [r0 + r4]                   ; add the remaining two rows
    lea     r2, [r2 + r5]
%endrep
    PIXEL_SSD_SP_16x4

    HADDD   m7, m1
    movd    eax, m7
    RET
1927 | ||
; pixel_ssd_sp_16x16_internal: shared subroutine that accumulates a 16x16 block into m7 by
; expanding PIXEL_SSD_SP_16x4 four times. It takes its inputs in registers set up by the
; caller: r0/r1 and r2/r3 as pointer/stride pairs (r1 already in bytes), r4 = 2 * r1,
; m6 = 0, m7 = running sum.
; Each macro expansion moves r0/r2 down two rows and each lea pair adds two more, so on
; return the pointers sit 14 rows below their entry value; callers add the last two rows.
1928 | cglobal pixel_ssd_sp_16x16_internal | |
1929 | PIXEL_SSD_SP_16x4 | |
1930 | lea r0, [r0 + r4] | |
1931 | lea r2, [r2 + 2 * r3] | |
1932 | PIXEL_SSD_SP_16x4 | |
1933 | lea r0, [r0 + r4] | |
1934 | lea r2, [r2 + 2 * r3] | |
1935 | PIXEL_SSD_SP_16x4 | |
1936 | lea r0, [r0 + r4] | |
1937 | lea r2, [r2 + 2 * r3] | |
1938 | PIXEL_SSD_SP_16x4 | |
1939 | ret | |
1940 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x32( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 16x32 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as two stacked pixel_ssd_sp_16x16_internal calls.
;   r4 : 2 * stride1 in bytes (helper input and row advance)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x32, 4, 5, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]
    pxor    m7, m7                          ; m7 = running sum
    pxor    m6, m6                          ; m6 = 0, required by the helper

    call    pixel_ssd_sp_16x16_internal     ; rows 0-15
    lea     r0, [r0 + r4]                   ; helper stops 14 rows down; add the last 2
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal     ; rows 16-31

    HADDD   m7, m1
    movd    eax, m7
    RET
1958 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_16x64( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 16x64 block of 16-bit samples (src1) and 8-bit samples (src2),
; computed as four stacked pixel_ssd_sp_16x16_internal calls.
;   r4 / r5 : 2 * stride1 (bytes) / 2 * stride2, the step between calls
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_16x64, 4, 6, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]
    lea     r5, [r3 * 2]
    pxor    m7, m7                          ; m7 = running sum
    pxor    m6, m6                          ; m6 = 0, required by the helper

%rep 3
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]                   ; helper stops 14 rows down; add the last 2
    lea     r2, [r2 + r5]
%endrep
    call    pixel_ssd_sp_16x16_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
1984 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_24x32( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 24x32 block of 16-bit samples (src1) and 8-bit samples (src2).
; The block is split into a 16-wide column (pixel_ssd_sp_16x16_internal x 2)
; followed by an 8-wide column (pixel_ssd_sp_8x4_internal x 8).
;   r5 / r6 : entry values of src1 / src2 while the 16-wide column is processed
;   r4      : 2 * stride1 for the 16-wide helper, then 3 * stride1 for the 8-wide one
;   r5      : reused as 3 * stride2 for the 8-wide helper
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_24x32, 4, 7, 8, src1, stride1, src2, stride2
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]
    mov     r5, r0
    mov     r6, r2
    pxor    m7, m7                          ; m7 = running sum
    pxor    m6, m6                          ; m6 = 0, required by the 16-wide helper

    ; samples 0-15 of every row
    call    pixel_ssd_sp_16x16_internal     ; rows 0-15
    lea     r0, [r0 + r4]                   ; helper stops 14 rows down; add the last 2
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal     ; rows 16-31

    ; samples 16-23 of every row: 16 samples in = 32 bytes of src1, 16 bytes of src2
    lea     r0, [r5 + 32]
    lea     r2, [r6 + 16]
    lea     r4, [r1 * 3]                    ; row-3 offsets expected by the 8x4 helper
    lea     r5, [r3 * 3]
%rep 7
    call    pixel_ssd_sp_8x4_internal
    lea     r0, [r0 + 4 * r1]               ; this helper moves neither pointer
    lea     r2, [r2 + 4 * r3]
%endrep
    call    pixel_ssd_sp_8x4_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
2029 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x8( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 32x8 block of 16-bit samples (src1) and 8-bit samples (src2):
; two 16-wide columns, each built from two PIXEL_SSD_SP_16x4 expansions.
;   r5 / r6 : entry values of src1 / src2, used to start the second column
;   r4      : 2 * stride1 in bytes, the src1 step between expansions
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x8, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6                          ; m6 = 0, required by the macro's punpckhbw
    pxor    m7, m7                          ; m7 = running sum
    mov     r6, r2
    mov     r5, r0
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]

    ; samples 0-15 of every row
    PIXEL_SSD_SP_16x4                       ; leaves r0/r2 two rows down
    lea     r0, [r0 + r4]                   ; add the remaining two rows
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4

    ; samples 16-31 of every row: 32 bytes into src1, 16 bytes into src2
    lea     r0, [r5 + 32]
    lea     r2, [r6 + 16]
    PIXEL_SSD_SP_16x4
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4

    HADDD   m7, m1
    movd    eax, m7
    RET
2055 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x16( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 32x16 block of 16-bit samples (src1) and 8-bit samples (src2):
; one pixel_ssd_sp_16x16_internal call for each of the two 16-wide columns.
;   r5 / r6 : entry values of src1 / src2, used to start the second column
;   r4      : 2 * stride1 in bytes (helper input)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x16, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6                          ; m6 = 0, required by the helper
    pxor    m7, m7                          ; m7 = running sum
    mov     r6, r2
    mov     r5, r0
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]

    call    pixel_ssd_sp_16x16_internal     ; samples 0-15
    lea     r0, [r5 + 32]                   ; 16 samples in: 32 bytes of src1 ...
    lea     r2, [r6 + 16]                   ; ... and 16 bytes of src2
    call    pixel_ssd_sp_16x16_internal     ; samples 16-31

    HADDD   m7, m1
    movd    eax, m7
    RET
2075 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x24( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 32x24 block of 16-bit samples (src1) and 8-bit samples (src2):
; two 16-wide columns, each covered by one pixel_ssd_sp_16x16_internal call
; (16 rows) followed by two PIXEL_SSD_SP_16x4 expansions (8 rows).
;   r5 / r6 : entry values of src1 / src2, used to start the second column
;   r4      : 2 * stride1 in bytes (helper input and src1 row advance)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x24, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6                          ; m6 = 0, required by helper and macro
    pxor    m7, m7                          ; m7 = running sum
    mov     r6, r2
    mov     r5, r0
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]

    ; samples 0-15 of every row
    call    pixel_ssd_sp_16x16_internal
%rep 2
    lea     r0, [r0 + r4]                   ; previous step stopped 2 rows short
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%endrep

    ; samples 16-31 of every row: 32 bytes into src1, 16 bytes into src2
    lea     r0, [r5 + 32]
    lea     r2, [r6 + 16]
    call    pixel_ssd_sp_16x16_internal
%rep 2
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    PIXEL_SSD_SP_16x4
%endrep

    HADDD   m7, m1
    movd    eax, m7
    RET
2107 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x32( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 32x32 block of 16-bit samples (src1) and 8-bit samples (src2):
; two 16-wide columns, two pixel_ssd_sp_16x16_internal calls per column.
;   r5 / r6 : entry values of src1 / src2, used to start the second column
;   r4      : 2 * stride1 in bytes (helper input and src1 row advance)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x32, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6                          ; m6 = 0, required by the helper
    pxor    m7, m7                          ; m7 = running sum
    mov     r6, r2
    mov     r5, r0
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]

    ; samples 0-15 of every row
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]                   ; helper stops 14 rows down; add the last 2
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal

    ; samples 16-31 of every row: 32 bytes into src1, 16 bytes into src2
    lea     r0, [r5 + 32]
    lea     r2, [r6 + 16]
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
    call    pixel_ssd_sp_16x16_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
2133 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_32x64( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; SSD between a 32x64 block of 16-bit samples (src1) and 8-bit samples (src2):
; two 16-wide columns, four pixel_ssd_sp_16x16_internal calls per column.
;   r5 / r6 : entry values of src1 / src2, used to start the second column
;   r4      : 2 * stride1 in bytes (helper input and src1 row advance)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_32x64, 4, 7, 8, src1, stride1, src2, stride2
    pxor    m6, m6                          ; m6 = 0, required by the helper
    pxor    m7, m7                          ; m7 = running sum
    mov     r6, r2
    mov     r5, r0
    add     r1, r1                          ; double stride1: src1 holds 2-byte samples
    lea     r4, [r1 * 2]

    ; samples 0-15 of every row
%rep 3
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]                   ; helper stops 14 rows down; add the last 2
    lea     r2, [r2 + 2 * r3]
%endrep
    call    pixel_ssd_sp_16x16_internal

    ; samples 16-31 of every row: 32 bytes into src1, 16 bytes into src2
    lea     r0, [r5 + 32]
    lea     r2, [r6 + 16]
%rep 3
    call    pixel_ssd_sp_16x16_internal
    lea     r0, [r0 + r4]
    lea     r2, [r2 + 2 * r3]
%endrep
    call    pixel_ssd_sp_16x16_internal

    HADDD   m7, m1
    movd    eax, m7
    RET
2171 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_48x64( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; Walks the 48x64 block as 3 columns x 4 bands of 16x16 tiles and runs
; pixel_ssd_sp_16x16_internal on each; that helper lives outside this block.
;   r5/r6  block origins of src1/src2, used to start every column
;   r1     stride1 doubled; r4 = 2 * r1 is the src1 step added between bands
;   m7     cleared here and reduced by HADDD at the end; the helper is
;          presumably what accumulates into it -- confirm in the helper
;   m6     cleared for the helper (presumably its zero register -- confirm)
; Columns advance src1 by 32 bytes and src2 by 16 bytes, i.e. src1 is stepped
; as 2-byte samples and src2 as 1-byte samples (signedness is not visible here).
; NOTE(review): the band step assumes the helper returns with r0/r2 already
; advanced part of the way; verify against pixel_ssd_sp_16x16_internal.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_48x64, 4, 7, 8, src1, stride1, src2, stride2
    pxor        m7, m7
    pxor        m6, m6
    mov         r5, r0
    mov         r6, r2
    add         r1, r1
    lea         r4, [r1 * 2]

%assign SSD_SP_COL 0
%rep 3                                          ; one pass per 16-wide column
%if SSD_SP_COL > 0
    lea         r0, [r5 + 32 * SSD_SP_COL]      ; top of this column in src1
    lea         r2, [r6 + 16 * SSD_SP_COL]      ; top of this column in src2
%endif
    call        pixel_ssd_sp_16x16_internal     ; first band
%rep 3                                          ; remaining bands of the column
    lea         r0, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_sp_16x16_internal
%endrep
%assign SSD_SP_COL SSD_SP_COL + 1
%endrep
%undef SSD_SP_COL

    HADDD       m7, m1                          ; fold the dword lanes of m7
    movd        eax, m7
    RET
2221 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x16( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; Walks the 64x16 block as 4 side-by-side 16x16 tiles and runs
; pixel_ssd_sp_16x16_internal on each; that helper lives outside this block.
;   r5/r6  block origins of src1/src2, used to start every column
;   r1     stride1 doubled; r4 = 2 * r1 is set up as in the taller variants
;          (not used by this function itself; whether the helper reads it is
;          not visible here)
;   m7     cleared here and reduced by HADDD at the end; the helper is
;          presumably what accumulates into it -- confirm in the helper
;   m6     cleared for the helper (presumably its zero register -- confirm)
; Columns advance src1 by 32 bytes and src2 by 16 bytes, i.e. src1 is stepped
; as 2-byte samples and src2 as 1-byte samples (signedness is not visible here).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x16, 4, 7, 8, src1, stride1, src2, stride2
    pxor        m7, m7
    pxor        m6, m6
    mov         r5, r0
    mov         r6, r2
    add         r1, r1
    lea         r4, [r1 * 2]

%assign SSD_SP_COL 0
%rep 4                                          ; one pass per 16-wide column
%if SSD_SP_COL > 0
    lea         r0, [r5 + 32 * SSD_SP_COL]      ; top of this column in src1
    lea         r2, [r6 + 16 * SSD_SP_COL]      ; top of this column in src2
%endif
    call        pixel_ssd_sp_16x16_internal
%assign SSD_SP_COL SSD_SP_COL + 1
%endrep
%undef SSD_SP_COL

    HADDD       m7, m1                          ; fold the dword lanes of m7
    movd        eax, m7
    RET
2247 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x32( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; Walks the 64x32 block as 4 columns x 2 bands of 16x16 tiles and runs
; pixel_ssd_sp_16x16_internal on each; that helper lives outside this block.
;   r5/r6  block origins of src1/src2, used to start every column
;   r1     stride1 doubled; r4 = 2 * r1 is the src1 step added between bands
;   m7     cleared here and reduced by HADDD at the end; the helper is
;          presumably what accumulates into it -- confirm in the helper
;   m6     cleared for the helper (presumably its zero register -- confirm)
; Columns advance src1 by 32 bytes and src2 by 16 bytes, i.e. src1 is stepped
; as 2-byte samples and src2 as 1-byte samples (signedness is not visible here).
; NOTE(review): the band step assumes the helper returns with r0/r2 already
; advanced part of the way; verify against pixel_ssd_sp_16x16_internal.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x32, 4, 7, 8, src1, stride1, src2, stride2
    pxor        m7, m7
    pxor        m6, m6
    mov         r5, r0
    mov         r6, r2
    add         r1, r1
    lea         r4, [r1 * 2]

%assign SSD_SP_COL 0
%rep 4                                          ; one pass per 16-wide column
%if SSD_SP_COL > 0
    lea         r0, [r5 + 32 * SSD_SP_COL]      ; top of this column in src1
    lea         r2, [r6 + 16 * SSD_SP_COL]      ; top of this column in src2
%endif
    call        pixel_ssd_sp_16x16_internal     ; upper tile
    lea         r0, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_sp_16x16_internal     ; lower tile
%assign SSD_SP_COL SSD_SP_COL + 1
%endrep
%undef SSD_SP_COL

    HADDD       m7, m1                          ; fold the dword lanes of m7
    movd        eax, m7
    RET
2285 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x48( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; Walks the 64x48 block as 4 columns x 3 bands of 16x16 tiles and runs
; pixel_ssd_sp_16x16_internal on each; that helper lives outside this block.
;   r5/r6  block origins of src1/src2, used to start every column
;   r1     stride1 doubled; r4 = 2 * r1 is the src1 step added between bands
;   m7     cleared here and reduced by HADDD at the end; the helper is
;          presumably what accumulates into it -- confirm in the helper
;   m6     cleared for the helper (presumably its zero register -- confirm)
; Columns advance src1 by 32 bytes and src2 by 16 bytes, i.e. src1 is stepped
; as 2-byte samples and src2 as 1-byte samples (signedness is not visible here).
; NOTE(review): the band step assumes the helper returns with r0/r2 already
; advanced part of the way; verify against pixel_ssd_sp_16x16_internal.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x48, 4, 7, 8, src1, stride1, src2, stride2
    pxor        m7, m7
    pxor        m6, m6
    mov         r5, r0
    mov         r6, r2
    add         r1, r1
    lea         r4, [r1 * 2]

%assign SSD_SP_COL 0
%rep 4                                          ; one pass per 16-wide column
%if SSD_SP_COL > 0
    lea         r0, [r5 + 32 * SSD_SP_COL]      ; top of this column in src1
    lea         r2, [r6 + 16 * SSD_SP_COL]      ; top of this column in src2
%endif
    call        pixel_ssd_sp_16x16_internal     ; first band
%rep 2                                          ; remaining bands of the column
    lea         r0, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_sp_16x16_internal
%endrep
%assign SSD_SP_COL SSD_SP_COL + 1
%endrep
%undef SSD_SP_COL

    HADDD       m7, m1                          ; fold the dword lanes of m7
    movd        eax, m7
    RET
2335 | ||
;-----------------------------------------------------------------------------
; int pixel_ssd_sp_64x64( int16_t *src1, intptr_t stride1, uint8_t *src2, intptr_t stride2 )
;
; Walks the 64x64 block as 4 columns x 4 bands of 16x16 tiles and runs
; pixel_ssd_sp_16x16_internal on each; that helper lives outside this block.
;   r5/r6  block origins of src1/src2, used to start every column
;   r1     stride1 doubled; r4 = 2 * r1 is the src1 step added between bands
;   m7     cleared here and reduced by HADDD at the end; the helper is
;          presumably what accumulates into it -- confirm in the helper
;   m6     cleared for the helper (presumably its zero register -- confirm)
; Columns advance src1 by 32 bytes and src2 by 16 bytes, i.e. src1 is stepped
; as 2-byte samples and src2 as 1-byte samples (signedness is not visible here).
; NOTE(review): the band step assumes the helper returns with r0/r2 already
; advanced part of the way; verify against pixel_ssd_sp_16x16_internal.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal pixel_ssd_sp_64x64, 4, 7, 8, src1, stride1, src2, stride2
    pxor        m7, m7
    pxor        m6, m6
    mov         r5, r0
    mov         r6, r2
    add         r1, r1
    lea         r4, [r1 * 2]

%assign SSD_SP_COL 0
%rep 4                                          ; one pass per 16-wide column
%if SSD_SP_COL > 0
    lea         r0, [r5 + 32 * SSD_SP_COL]      ; top of this column in src1
    lea         r2, [r6 + 16 * SSD_SP_COL]      ; top of this column in src2
%endif
    call        pixel_ssd_sp_16x16_internal     ; first band
%rep 3                                          ; remaining bands of the column
    lea         r0, [r0 + r4]
    lea         r2, [r2 + 2 * r3]
    call        pixel_ssd_sp_16x16_internal
%endrep
%assign SSD_SP_COL SSD_SP_COL + 1
%endrep
%undef SSD_SP_COL

    HADDD       m7, m1                          ; fold the dword lanes of m7
    movd        eax, m7
    RET
2397 | ||
2398 | ||
2399 | ;----------------------------------------------------------------------------- | |
2400 | ; int pixel_ssd_s( int16_t *ref, intptr_t i_stride ) | |
2401 | ;----------------------------------------------------------------------------- | |
INIT_XMM sse2
cglobal pixel_ssd_s_4, 2,2,2
    ; Sum of squares over four rows of four 16-bit samples.
    ;   r0 = first row, r1 = stride in samples (doubled to bytes below)
    ; Returns the total in eax.
    add         r1, r1                      ; stride: samples -> bytes

    ; rows 0 and 1 share m0 (low half / high half); square and pair-sum them
    movh        m0, [r0]
    movhps      m0, [r0 + r1]
    pmaddwd     m0, m0

    ; rows 2 and 3 share m1
    lea         r0, [r0 + 2 * r1]
    movh        m1, [r0]
    movhps      m1, [r0 + r1]
    pmaddwd     m1, m1

    paddd       m0, m1                      ; four dword partial sums

    HADDD       m0, m1                      ; reduce the lanes; m1 is the macro's temporary
    movd        eax, m0
    RET
2420 | ||
2421 | ||
INIT_XMM sse2
cglobal pixel_ssd_s_8, 2,3,5
    ; Sum of squares over eight rows of eight 16-bit samples (one 16-byte
    ; load per row).
    ;   r0 = first row, r1 = stride in samples (doubled to bytes below)
    ;   r2 = 3 * byte stride, for reaching the fourth row of each group
    ; Returns the total in eax.
    add         r1, r1
    lea         r2, [r1 * 3]

    ; rows 0-3: square + pair-sum each row, gather into m0
    movu        m0, [r0]
    pmaddwd     m0, m0
    movu        m1, [r0 + r1]
    pmaddwd     m1, m1
    paddd       m0, m1
    movu        m2, [r0 + 2 * r1]
    pmaddwd     m2, m2
    movu        m3, [r0 + r2]
    pmaddwd     m3, m3
    paddd       m2, m3
    paddd       m0, m2

    ; rows 4-7: same pattern, gathered into m4 and then merged into m0
    lea         r0, [r0 + 4 * r1]
    movu        m4, [r0]
    pmaddwd     m4, m4
    movu        m1, [r0 + r1]
    pmaddwd     m1, m1
    paddd       m4, m1
    movu        m2, [r0 + 2 * r1]
    pmaddwd     m2, m2
    movu        m3, [r0 + r2]
    pmaddwd     m3, m3
    paddd       m2, m3
    paddd       m4, m2
    paddd       m0, m4

    HADDD       m0, m1                      ; reduce the dword lanes of m0
    movd        eax, m0
    RET
2458 | ||
2459 | ||
INIT_XMM sse2
cglobal pixel_ssd_s_16, 2,3,5
    ; Sum of squares over sixteen rows of sixteen 16-bit samples.
    ;   r0 = current row, r1 = stride in samples (doubled to bytes below)
    ;   r2d = loop counter, m0 = running dword partial sums
    ; Returns the total in eax.
    add         r1, r1

    mov         r2d, 4                      ; 4 iterations x 4 rows
    pxor        m0, m0
.loop:
%rep 2                                      ; two rows per pass, emitted twice
    movu        m1, [r0]                    ; row n,   samples 0-7
    movu        m2, [r0 + mmsize]           ; row n,   samples 8-15
    movu        m3, [r0 + r1]               ; row n+1, samples 0-7
    movu        m4, [r0 + r1 + mmsize]      ; row n+1, samples 8-15
    lea         r0, [r0 + 2 * r1]

    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1
%endrep

    dec         r2d
    jnz         .loop

    HADDD       m0, m1                      ; reduce the dword lanes of m0
    movd        eax, m0
    RET
2504 | ||
2505 | ||
INIT_XMM sse2
cglobal pixel_ssd_s_32, 2,3,5
    ; Sum of squares over thirty-two rows of thirty-two 16-bit samples
    ; (four 16-byte loads per row).
    ;   r0 = current row, r1 = stride in samples (doubled to bytes below)
    ;   r2d = loop counter, m0 = running dword partial sums
    ; Returns the total in eax.
    add         r1, r1

    mov         r2d, 16                     ; 16 iterations x 2 rows
    pxor        m0, m0
.loop:
%rep 2                                      ; one full row per pass, emitted twice
    movu        m1, [r0 + 0 * mmsize]
    movu        m2, [r0 + 1 * mmsize]
    movu        m3, [r0 + 2 * mmsize]
    movu        m4, [r0 + 3 * mmsize]
    add         r0, r1                      ; next row

    pmaddwd     m1, m1
    pmaddwd     m2, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m1, m2
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1
%endrep

    dec         r2d
    jnz         .loop

    HADDD       m0, m1                      ; reduce the dword lanes of m0
    movd        eax, m0
    RET
2550 | ||
2551 | ||
INIT_YMM avx2
cglobal pixel_ssd_s_32, 2,4,5
    ; AVX2 flavour: sum of squares over thirty-two rows of thirty-two 16-bit
    ; samples (two 32-byte loads per row, four rows per iteration).
    ;   r0 = current row, r1 = stride in samples (doubled to bytes below)
    ;   r3 = 3 * byte stride, r2d = loop counter, m0 = running dword sums
    ; Returns the total in eax.
    add         r1, r1
    lea         r3, [r1 * 3]

    mov         r2d, 8                      ; 8 iterations x 4 rows
    pxor        m0, m0
.loop:
    ; rows n and n+1
    movu        m1, [r0 + 0 * mmsize]
    pmaddwd     m1, m1
    movu        m2, [r0 + 1 * mmsize]
    pmaddwd     m2, m2
    paddd       m1, m2
    movu        m3, [r0 + r1 + 0 * mmsize]
    pmaddwd     m3, m3
    movu        m4, [r0 + r1 + 1 * mmsize]
    pmaddwd     m4, m4
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1

    ; rows n+2 and n+3; r0 moves on only after the last load that uses it
    movu        m1, [r0 + 2 * r1 + 0 * mmsize]
    movu        m2, [r0 + 2 * r1 + 1 * mmsize]
    movu        m3, [r0 + r3 + 0 * mmsize]
    movu        m4, [r0 + r3 + 1 * mmsize]
    lea         r0, [r0 + 4 * r1]
    pmaddwd     m1, m1
    pmaddwd     m2, m2
    paddd       m1, m2
    pmaddwd     m3, m3
    pmaddwd     m4, m4
    paddd       m3, m4
    paddd       m1, m3
    paddd       m0, m1

    dec         r2d
    jnz         .loop

    HADDD       m0, m1                      ; reduce the dword lanes of m0
    movd        eax, xm0
    RET