;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
22 | ||
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; RV40-specific rounding constants for the bilinear (2-D) filter.
; Both RV40 tables are indexed by rnd_bias = ((my & 6) * 4 + mx) >> 1
; (computed in the macros below) and read at [table + rnd_bias*8];
; each constant is replicated into 4 words so the entry can be loaded
; straight into an MMX register.
rnd_rv40_2d_tbl: times 4 dw 0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
; Same layout for the 1-D (horizontal-only or vertical-only) filter.
rnd_rv40_1d_tbl: times 4 dw 0
                 times 4 dw 2
                 times 4 dw 4
                 times 4 dw 2
                 times 4 dw 4
                 times 4 dw 3
                 times 4 dw 4
                 times 4 dw 3
                 times 4 dw 0
                 times 4 dw 4
                 times 4 dw 2
                 times 4 dw 4
                 times 4 dw 4
                 times 4 dw 3
                 times 4 dw 4
                 times 4 dw 3

; Shared word constants; all but pw_28 are provided by libavutil.
cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64
66 | ||
SECTION .text

; Straight copy (or average, when CHROMAMC_AVG expands to DIRECT_AVG) of an
; 8-pixel-wide block from src (r1) to dst (r0), stride r2, row count r3d.
; Used for the mx == 0 && my == 0 case where no filtering is needed.
; Processes 4 rows per iteration; r3d is assumed to be a multiple of 4.
; Clobbers r4 (holds stride*2) and mm0/mm1.
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro
91 | ||
; 8xH chroma MC, MMX version.  %1 = put/avg, %2 = codec (h264/vc1/rv40),
; optional %3 = name suffix (_rnd/_nornd).  Dispatches to a copy path
; (mx == my == 0), a 1-D filter path (exactly one of mx/my nonzero) or
; the general 2-D bilinear path.  For rv40 the rounding constant is
; looked up in the tables above via rnd_bias; on x86-32 rv40 the dst
; pointer is moved into r5 so r0 can hold rnd_bias.
%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y (one of them is 0, so this is the
                                ; nonzero coefficient whichever axis it is on)

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m      ; reload dst: r0 was overwritten by rnd_bias
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8]  ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]  ; mm0 = src[0..7]
    movq          m2, [r1+r6]  ; mm2 = src[1..8] (r6 = 1 or stride)

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7       ; unpack bytes to words against zero (m7)
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4       ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5       ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0       ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d      ; x
    movd          m6, r5d      ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m     ; reload dst (see .my_is_zero note above)
%endif
%endif
    ; A and D are kept in 16 aligned bytes of scratch stack:
    ; [rsp] = A words, [rsp+8] = D words.
    mov           r6, rsp      ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16       ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4       ; mm4 = x words
    punpckldq     m6, m6       ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6       ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4       ; DD = x * y
    psubw         m5, m4       ; mm5 = B = 8x - xy
    psubw         m6, m4       ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7       ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4

    movq          m0, [r1  ]   ; mm0 = src[0..7]
    movq          m1, [r1+1]   ; mm1 = src[1..8]
.next2drow:
    add           r1, r2       ; r1 now points at the NEXT row (C/D taps)

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1       ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0       ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1       ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4       ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]     ; preload next iteration's src[0..7]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2       ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6       ; restore stack pointer
    RET
%endmacro
285 | ||
; 4xH chroma MC, MMX version.  Same argument layout as the mc8 macro above.
; The horizontally filtered previous row is carried across loop halves in
; m6/m0, so each source row is filtered horizontally only once; the loop
; emits 2 output rows per iteration (r3d assumed even).
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    pxor          m7, m7
    movd          m2, r4d       ; x
    movd          m3, r5d       ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2        ; m4 = 8-x
    psubw         m5, m3        ; m5 = 8-y

%ifidn %2, rv40
%ifdef PIC
    lea           r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    ; prime the pipeline: horizontal filter of row 0 into m6
    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0        ; m6 = (8-x)*src[0..3] + x*src[1..4]

.next2rows:
    ; horizontal filter of the next row into m1, keep a copy in m0
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1        ; m0 = this row's horizontal result, reused below

    ; vertical blend of previous (m6) and current (m1) rows
    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    ; second output row: same pattern with roles of m0/m6 swapped
    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro
376 | ||
; 2xH chroma MC (instantiated for h264 only).  The four bilinear weights
; are packed as word pairs {A,B,A,B} / {C,D,C,D} so a single pmaddwd
; produces both output pixels at once; only the low 2 result bytes are
; stored, via r5w.  Requires pshufw, hence mmxext.
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif

    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d       ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d       ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5        ; mm5 = {A,B,A,B}
    punpckldq     m6, m6        ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94  ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5        ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94  ; mm0 = src[0,1,1,2]
    movq          m2, m0        ; keep next row's pixels for the A/B pass
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0        ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w       ; store exactly 2 pixels
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro
422 | ||
; Per-codec rounding constants used as rnd_1d_%2 / rnd_2d_%2 in the macros
; above (rv40 resolves to the RODATA tables instead, see the mc8 macro).
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

; CHROMAMC_AVG / CHROMAMC_AVG4 plug-ins: NOTHING for the "put" variants,
; the PAVGB-based versions for the "avg" variants.  COPY_AVG first loads
; the dst operand into a scratch register because the mc4/mc2 call sites
; pass a memory operand that PAVGB cannot take directly there.
%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro
437 | ||
; Instantiate the exported functions.  "put" variants use the no-op
; averaging macros; "avg" variants average with existing dst via PAVGB.
INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

; 3dnow builds of the avg functions (PAVGB maps to the 3dnow average op).
INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
465 | ||
; 8xH chroma MC, SSSE3 version (h264/vc1 only; no rv40 bias handling).
; Same argument layout as the MMX mc8 above.  Uses pmaddubsw on byte-
; interleaved source pairs so each multiply-add consumes the packed
; "(coef_hi<<8 | coef_lo)" weights built below; 2 rows per iteration.
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8          ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d        ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d        ; m7 = top-row weights (B<<8 | A per byte pair)
    movd          m6, r4d        ; m6 = bottom-row weights (D<<8 | C)
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1         ; m0 = interleaved src[0..7]/src[1..8]
    movlhps       m7, m7
    movlhps       m6, m6

.next2rows:
    movq          m1, [r1+r2*1  ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3         ; m4 carries row 2 into the next iteration
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    ; horizontal-only 1-D filter
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5         ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    ; vertical-only 1-D filter
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4         ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro
613 | ||
; 4xH chroma MC, SSSE3 version (instantiated for h264 only, so the
; rounding constant pw_32 is loaded directly).  Always takes the
; bilinear path; weights are packed for pmaddubsw as in the mc8 SSSE3
; macro above.  2 output rows per iteration.
%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8          ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul         r6d, r4d        ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d        ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

    movd          m7, r6d        ; m7 = top-row packed weights
    movd          m6, r4d        ; m6 = bottom-row packed weights
    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]     ; m0 = interleaved src[0..3]/src[1..4]
    pshufw        m6, m6, 0

.next2rows:
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3         ; m4 carries row 2 into the next iteration
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0   ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro
666 | ||
; SSSE3 instantiations: XMM for the 8-wide functions, MMX regs for the
; 4-wide one (it only needs 64-bit lanes).
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264