; (blame metadata: commit 2ba45a60, DM)
;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
25 | ||
26 | %include "libavutil/x86/x86util.asm" | |
27 | ||
28 | SECTION_RODATA 32 | |
29 | ||
30 | cextern pw_16 | |
31 | cextern pw_5 | |
32 | cextern pb_0 | |
33 | ||
34 | SECTION .text | |
35 | ||
36 | ||
37 | %macro op_avgh 3 | |
38 | movh %3, %2 | |
39 | pavgb %1, %3 | |
40 | movh %2, %1 | |
41 | %endmacro | |
42 | ||
43 | %macro op_avg 2-3 | |
44 | pavgb %1, %2 | |
45 | mova %2, %1 | |
46 | %endmacro | |
47 | ||
48 | %macro op_puth 2-3 | |
49 | movh %2, %1 | |
50 | %endmacro | |
51 | ||
52 | %macro op_put 2-3 | |
53 | mova %2, %1 | |
54 | %endmacro | |
55 | ||
56 | %macro QPEL4_H_LOWPASS_OP 1 | |
57 | cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride | |
58 | movsxdifnidn r2, r2d | |
59 | movsxdifnidn r3, r3d | |
60 | pxor m7, m7 | |
61 | mova m4, [pw_5] | |
62 | mova m5, [pw_16] | |
63 | mov r4d, 4 | |
64 | .loop: | |
65 | movh m1, [r1-1] | |
66 | movh m2, [r1+0] | |
67 | movh m3, [r1+1] | |
68 | movh m0, [r1+2] | |
69 | punpcklbw m1, m7 | |
70 | punpcklbw m2, m7 | |
71 | punpcklbw m3, m7 | |
72 | punpcklbw m0, m7 | |
73 | paddw m1, m0 | |
74 | paddw m2, m3 | |
75 | movh m0, [r1-2] | |
76 | movh m3, [r1+3] | |
77 | punpcklbw m0, m7 | |
78 | punpcklbw m3, m7 | |
79 | paddw m0, m3 | |
80 | psllw m2, 2 | |
81 | psubw m2, m1 | |
82 | pmullw m2, m4 | |
83 | paddw m0, m5 | |
84 | paddw m0, m2 | |
85 | psraw m0, 5 | |
86 | packuswb m0, m0 | |
87 | op_%1h m0, [r0], m6 | |
88 | add r0, r2 | |
89 | add r1, r3 | |
90 | dec r4d | |
91 | jg .loop | |
92 | REP_RET | |
93 | %endmacro | |
94 | ||
95 | INIT_MMX mmxext | |
96 | QPEL4_H_LOWPASS_OP put | |
97 | QPEL4_H_LOWPASS_OP avg | |
98 | ||
99 | %macro QPEL8_H_LOWPASS_OP 1 | |
100 | cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride | |
101 | movsxdifnidn r2, r2d | |
102 | movsxdifnidn r3, r3d | |
103 | mov r4d, 8 | |
104 | pxor m7, m7 | |
105 | mova m6, [pw_5] | |
106 | .loop: | |
107 | mova m0, [r1] | |
108 | mova m2, [r1+1] | |
109 | mova m1, m0 | |
110 | mova m3, m2 | |
111 | punpcklbw m0, m7 | |
112 | punpckhbw m1, m7 | |
113 | punpcklbw m2, m7 | |
114 | punpckhbw m3, m7 | |
115 | paddw m0, m2 | |
116 | paddw m1, m3 | |
117 | psllw m0, 2 | |
118 | psllw m1, 2 | |
119 | mova m2, [r1-1] | |
120 | mova m4, [r1+2] | |
121 | mova m3, m2 | |
122 | mova m5, m4 | |
123 | punpcklbw m2, m7 | |
124 | punpckhbw m3, m7 | |
125 | punpcklbw m4, m7 | |
126 | punpckhbw m5, m7 | |
127 | paddw m2, m4 | |
128 | paddw m5, m3 | |
129 | psubw m0, m2 | |
130 | psubw m1, m5 | |
131 | pmullw m0, m6 | |
132 | pmullw m1, m6 | |
133 | movd m2, [r1-2] | |
134 | movd m5, [r1+7] | |
135 | punpcklbw m2, m7 | |
136 | punpcklbw m5, m7 | |
137 | paddw m2, m3 | |
138 | paddw m4, m5 | |
139 | mova m5, [pw_16] | |
140 | paddw m2, m5 | |
141 | paddw m4, m5 | |
142 | paddw m0, m2 | |
143 | paddw m1, m4 | |
144 | psraw m0, 5 | |
145 | psraw m1, 5 | |
146 | packuswb m0, m1 | |
147 | op_%1 m0, [r0], m4 | |
148 | add r0, r2 | |
149 | add r1, r3 | |
150 | dec r4d | |
151 | jg .loop | |
152 | REP_RET | |
153 | %endmacro | |
154 | ||
155 | INIT_MMX mmxext | |
156 | QPEL8_H_LOWPASS_OP put | |
157 | QPEL8_H_LOWPASS_OP avg | |
158 | ||
159 | %macro QPEL8_H_LOWPASS_OP_XMM 1 | |
160 | cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride | |
161 | movsxdifnidn r2, r2d | |
162 | movsxdifnidn r3, r3d | |
163 | mov r4d, 8 | |
164 | pxor m7, m7 | |
165 | mova m6, [pw_5] | |
166 | .loop: | |
167 | movu m1, [r1-2] | |
168 | mova m0, m1 | |
169 | punpckhbw m1, m7 | |
170 | punpcklbw m0, m7 | |
171 | mova m2, m1 | |
172 | mova m3, m1 | |
173 | mova m4, m1 | |
174 | mova m5, m1 | |
175 | palignr m4, m0, 2 | |
176 | palignr m3, m0, 4 | |
177 | palignr m2, m0, 6 | |
178 | palignr m1, m0, 8 | |
179 | palignr m5, m0, 10 | |
180 | paddw m0, m5 | |
181 | paddw m2, m3 | |
182 | paddw m1, m4 | |
183 | psllw m2, 2 | |
184 | psubw m2, m1 | |
185 | paddw m0, [pw_16] | |
186 | pmullw m2, m6 | |
187 | paddw m2, m0 | |
188 | psraw m2, 5 | |
189 | packuswb m2, m2 | |
190 | op_%1h m2, [r0], m4 | |
191 | add r1, r3 | |
192 | add r0, r2 | |
193 | dec r4d | |
194 | jne .loop | |
195 | REP_RET | |
196 | %endmacro | |
197 | ||
198 | INIT_XMM ssse3 | |
199 | QPEL8_H_LOWPASS_OP_XMM put | |
200 | QPEL8_H_LOWPASS_OP_XMM avg | |
201 | ||
202 | ||
203 | %macro QPEL4_H_LOWPASS_L2_OP 1 | |
204 | cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride | |
205 | movsxdifnidn r3, r3d | |
206 | movsxdifnidn r4, r4d | |
207 | pxor m7, m7 | |
208 | mova m4, [pw_5] | |
209 | mova m5, [pw_16] | |
210 | mov r5d, 4 | |
211 | .loop: | |
212 | movh m1, [r1-1] | |
213 | movh m2, [r1+0] | |
214 | movh m3, [r1+1] | |
215 | movh m0, [r1+2] | |
216 | punpcklbw m1, m7 | |
217 | punpcklbw m2, m7 | |
218 | punpcklbw m3, m7 | |
219 | punpcklbw m0, m7 | |
220 | paddw m1, m0 | |
221 | paddw m2, m3 | |
222 | movh m0, [r1-2] | |
223 | movh m3, [r1+3] | |
224 | punpcklbw m0, m7 | |
225 | punpcklbw m3, m7 | |
226 | paddw m0, m3 | |
227 | psllw m2, 2 | |
228 | psubw m2, m1 | |
229 | pmullw m2, m4 | |
230 | paddw m0, m5 | |
231 | paddw m0, m2 | |
232 | movh m3, [r2] | |
233 | psraw m0, 5 | |
234 | packuswb m0, m0 | |
235 | pavgb m0, m3 | |
236 | op_%1h m0, [r0], m6 | |
237 | add r0, r3 | |
238 | add r1, r3 | |
239 | add r2, r4 | |
240 | dec r5d | |
241 | jg .loop | |
242 | REP_RET | |
243 | %endmacro | |
244 | ||
245 | INIT_MMX mmxext | |
246 | QPEL4_H_LOWPASS_L2_OP put | |
247 | QPEL4_H_LOWPASS_L2_OP avg | |
248 | ||
249 | ||
250 | %macro QPEL8_H_LOWPASS_L2_OP 1 | |
251 | cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride | |
252 | movsxdifnidn r3, r3d | |
253 | movsxdifnidn r4, r4d | |
254 | mov r5d, 8 | |
255 | pxor m7, m7 | |
256 | mova m6, [pw_5] | |
257 | .loop: | |
258 | mova m0, [r1] | |
259 | mova m2, [r1+1] | |
260 | mova m1, m0 | |
261 | mova m3, m2 | |
262 | punpcklbw m0, m7 | |
263 | punpckhbw m1, m7 | |
264 | punpcklbw m2, m7 | |
265 | punpckhbw m3, m7 | |
266 | paddw m0, m2 | |
267 | paddw m1, m3 | |
268 | psllw m0, 2 | |
269 | psllw m1, 2 | |
270 | mova m2, [r1-1] | |
271 | mova m4, [r1+2] | |
272 | mova m3, m2 | |
273 | mova m5, m4 | |
274 | punpcklbw m2, m7 | |
275 | punpckhbw m3, m7 | |
276 | punpcklbw m4, m7 | |
277 | punpckhbw m5, m7 | |
278 | paddw m2, m4 | |
279 | paddw m5, m3 | |
280 | psubw m0, m2 | |
281 | psubw m1, m5 | |
282 | pmullw m0, m6 | |
283 | pmullw m1, m6 | |
284 | movd m2, [r1-2] | |
285 | movd m5, [r1+7] | |
286 | punpcklbw m2, m7 | |
287 | punpcklbw m5, m7 | |
288 | paddw m2, m3 | |
289 | paddw m4, m5 | |
290 | mova m5, [pw_16] | |
291 | paddw m2, m5 | |
292 | paddw m4, m5 | |
293 | paddw m0, m2 | |
294 | paddw m1, m4 | |
295 | psraw m0, 5 | |
296 | psraw m1, 5 | |
297 | mova m4, [r2] | |
298 | packuswb m0, m1 | |
299 | pavgb m0, m4 | |
300 | op_%1 m0, [r0], m4 | |
301 | add r0, r3 | |
302 | add r1, r3 | |
303 | add r2, r4 | |
304 | dec r5d | |
305 | jg .loop | |
306 | REP_RET | |
307 | %endmacro | |
308 | ||
309 | INIT_MMX mmxext | |
310 | QPEL8_H_LOWPASS_L2_OP put | |
311 | QPEL8_H_LOWPASS_L2_OP avg | |
312 | ||
313 | ||
314 | %macro QPEL8_H_LOWPASS_L2_OP_XMM 1 | |
315 | cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride | |
316 | movsxdifnidn r3, r3d | |
317 | movsxdifnidn r4, r4d | |
318 | mov r5d, 8 | |
319 | pxor m7, m7 | |
320 | mova m6, [pw_5] | |
321 | .loop: | |
322 | lddqu m1, [r1-2] | |
323 | mova m0, m1 | |
324 | punpckhbw m1, m7 | |
325 | punpcklbw m0, m7 | |
326 | mova m2, m1 | |
327 | mova m3, m1 | |
328 | mova m4, m1 | |
329 | mova m5, m1 | |
330 | palignr m4, m0, 2 | |
331 | palignr m3, m0, 4 | |
332 | palignr m2, m0, 6 | |
333 | palignr m1, m0, 8 | |
334 | palignr m5, m0, 10 | |
335 | paddw m0, m5 | |
336 | paddw m2, m3 | |
337 | paddw m1, m4 | |
338 | psllw m2, 2 | |
339 | movh m3, [r2] | |
340 | psubw m2, m1 | |
341 | paddw m0, [pw_16] | |
342 | pmullw m2, m6 | |
343 | paddw m2, m0 | |
344 | psraw m2, 5 | |
345 | packuswb m2, m2 | |
346 | pavgb m2, m3 | |
347 | op_%1h m2, [r0], m4 | |
348 | add r1, r3 | |
349 | add r0, r3 | |
350 | add r2, r4 | |
351 | dec r5d | |
352 | jg .loop | |
353 | REP_RET | |
354 | %endmacro | |
355 | ||
356 | INIT_XMM ssse3 | |
357 | QPEL8_H_LOWPASS_L2_OP_XMM put | |
358 | QPEL8_H_LOWPASS_L2_OP_XMM avg | |
359 | ||
360 | ||
361 | ; All functions that call this are required to have function arguments of | |
362 | ; dst, src, dstStride, srcStride | |
363 | %macro FILT_V 1 | |
364 | mova m6, m2 | |
365 | movh m5, [r1] | |
366 | paddw m6, m3 | |
367 | psllw m6, 2 | |
368 | psubw m6, m1 | |
369 | psubw m6, m4 | |
370 | punpcklbw m5, m7 | |
371 | pmullw m6, [pw_5] | |
372 | paddw m0, [pw_16] | |
373 | add r1, r3 | |
374 | paddw m0, m5 | |
375 | paddw m6, m0 | |
376 | psraw m6, 5 | |
377 | packuswb m6, m6 | |
378 | op_%1h m6, [r0], m0 ; 1 | |
379 | add r0, r2 | |
380 | SWAP 0, 1, 2, 3, 4, 5 | |
381 | %endmacro | |
382 | ||
383 | %macro QPEL4_V_LOWPASS_OP 1 | |
384 | cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride | |
385 | movsxdifnidn r2, r2d | |
386 | movsxdifnidn r3, r3d | |
387 | sub r1, r3 | |
388 | sub r1, r3 | |
389 | pxor m7, m7 | |
390 | movh m0, [r1] | |
391 | movh m1, [r1+r3] | |
392 | lea r1, [r1+2*r3] | |
393 | movh m2, [r1] | |
394 | movh m3, [r1+r3] | |
395 | lea r1, [r1+2*r3] | |
396 | movh m4, [r1] | |
397 | add r1, r3 | |
398 | punpcklbw m0, m7 | |
399 | punpcklbw m1, m7 | |
400 | punpcklbw m2, m7 | |
401 | punpcklbw m3, m7 | |
402 | punpcklbw m4, m7 | |
403 | FILT_V %1 | |
404 | FILT_V %1 | |
405 | FILT_V %1 | |
406 | FILT_V %1 | |
407 | RET | |
408 | %endmacro | |
409 | ||
410 | INIT_MMX mmxext | |
411 | QPEL4_V_LOWPASS_OP put | |
412 | QPEL4_V_LOWPASS_OP avg | |
413 | ||
414 | ||
415 | ||
416 | %macro QPEL8OR16_V_LOWPASS_OP 1 | |
417 | %if cpuflag(sse2) | |
418 | cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h | |
419 | movsxdifnidn r2, r2d | |
420 | movsxdifnidn r3, r3d | |
421 | sub r1, r3 | |
422 | sub r1, r3 | |
423 | %else | |
424 | cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h | |
425 | movsxdifnidn r2, r2d | |
426 | movsxdifnidn r3, r3d | |
427 | %endif | |
428 | pxor m7, m7 | |
429 | movh m0, [r1] | |
430 | movh m1, [r1+r3] | |
431 | lea r1, [r1+2*r3] | |
432 | movh m2, [r1] | |
433 | movh m3, [r1+r3] | |
434 | lea r1, [r1+2*r3] | |
435 | movh m4, [r1] | |
436 | add r1, r3 | |
437 | punpcklbw m0, m7 | |
438 | punpcklbw m1, m7 | |
439 | punpcklbw m2, m7 | |
440 | punpcklbw m3, m7 | |
441 | punpcklbw m4, m7 | |
442 | FILT_V %1 | |
443 | FILT_V %1 | |
444 | FILT_V %1 | |
445 | FILT_V %1 | |
446 | FILT_V %1 | |
447 | FILT_V %1 | |
448 | FILT_V %1 | |
449 | FILT_V %1 | |
450 | cmp r4d, 16 | |
451 | jne .end | |
452 | FILT_V %1 | |
453 | FILT_V %1 | |
454 | FILT_V %1 | |
455 | FILT_V %1 | |
456 | FILT_V %1 | |
457 | FILT_V %1 | |
458 | FILT_V %1 | |
459 | FILT_V %1 | |
460 | .end: | |
461 | REP_RET | |
462 | %endmacro | |
463 | ||
464 | INIT_MMX mmxext | |
465 | QPEL8OR16_V_LOWPASS_OP put | |
466 | QPEL8OR16_V_LOWPASS_OP avg | |
467 | ||
468 | INIT_XMM sse2 | |
469 | QPEL8OR16_V_LOWPASS_OP put | |
470 | QPEL8OR16_V_LOWPASS_OP avg | |
471 | ||
472 | ||
473 | ; All functions that use this are required to have args: | |
474 | ; src, tmp, srcSize | |
475 | %macro FILT_HV 1 ; offset | |
476 | mova m6, m2 | |
477 | movh m5, [r0] | |
478 | paddw m6, m3 | |
479 | psllw m6, 2 | |
480 | paddw m0, [pw_16] | |
481 | psubw m6, m1 | |
482 | psubw m6, m4 | |
483 | punpcklbw m5, m7 | |
484 | pmullw m6, [pw_5] | |
485 | paddw m0, m5 | |
486 | add r0, r2 | |
487 | paddw m6, m0 | |
488 | mova [r1+%1], m6 | |
489 | SWAP 0, 1, 2, 3, 4, 5 | |
490 | %endmacro | |
491 | ||
492 | %macro QPEL4_HV1_LOWPASS_OP 1 | |
493 | cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride | |
494 | movsxdifnidn r2, r2d | |
495 | pxor m7, m7 | |
496 | movh m0, [r0] | |
497 | movh m1, [r0+r2] | |
498 | lea r0, [r0+2*r2] | |
499 | movh m2, [r0] | |
500 | movh m3, [r0+r2] | |
501 | lea r0, [r0+2*r2] | |
502 | movh m4, [r0] | |
503 | add r0, r2 | |
504 | punpcklbw m0, m7 | |
505 | punpcklbw m1, m7 | |
506 | punpcklbw m2, m7 | |
507 | punpcklbw m3, m7 | |
508 | punpcklbw m4, m7 | |
509 | FILT_HV 0*24 | |
510 | FILT_HV 1*24 | |
511 | FILT_HV 2*24 | |
512 | FILT_HV 3*24 | |
513 | RET | |
514 | ||
515 | cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride | |
516 | movsxdifnidn r2, r2d | |
517 | mov r3d, 4 | |
518 | .loop: | |
519 | mova m0, [r0] | |
520 | paddw m0, [r0+10] | |
521 | mova m1, [r0+2] | |
522 | paddw m1, [r0+8] | |
523 | mova m2, [r0+4] | |
524 | paddw m2, [r0+6] | |
525 | psubw m0, m1 | |
526 | psraw m0, 2 | |
527 | psubw m0, m1 | |
528 | paddsw m0, m2 | |
529 | psraw m0, 2 | |
530 | paddw m0, m2 | |
531 | psraw m0, 6 | |
532 | packuswb m0, m0 | |
533 | op_%1h m0, [r1], m7 | |
534 | add r0, 24 | |
535 | add r1, r2 | |
536 | dec r3d | |
537 | jnz .loop | |
538 | REP_RET | |
539 | %endmacro | |
540 | ||
541 | INIT_MMX mmxext | |
542 | QPEL4_HV1_LOWPASS_OP put | |
543 | QPEL4_HV1_LOWPASS_OP avg | |
544 | ||
545 | %macro QPEL8OR16_HV1_LOWPASS_OP 1 | |
546 | cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size | |
547 | movsxdifnidn r2, r2d | |
548 | pxor m7, m7 | |
549 | movh m0, [r0] | |
550 | movh m1, [r0+r2] | |
551 | lea r0, [r0+2*r2] | |
552 | movh m2, [r0] | |
553 | movh m3, [r0+r2] | |
554 | lea r0, [r0+2*r2] | |
555 | movh m4, [r0] | |
556 | add r0, r2 | |
557 | punpcklbw m0, m7 | |
558 | punpcklbw m1, m7 | |
559 | punpcklbw m2, m7 | |
560 | punpcklbw m3, m7 | |
561 | punpcklbw m4, m7 | |
562 | FILT_HV 0*48 | |
563 | FILT_HV 1*48 | |
564 | FILT_HV 2*48 | |
565 | FILT_HV 3*48 | |
566 | FILT_HV 4*48 | |
567 | FILT_HV 5*48 | |
568 | FILT_HV 6*48 | |
569 | FILT_HV 7*48 | |
570 | cmp r3d, 16 | |
571 | jne .end | |
572 | FILT_HV 8*48 | |
573 | FILT_HV 9*48 | |
574 | FILT_HV 10*48 | |
575 | FILT_HV 11*48 | |
576 | FILT_HV 12*48 | |
577 | FILT_HV 13*48 | |
578 | FILT_HV 14*48 | |
579 | FILT_HV 15*48 | |
580 | .end: | |
581 | REP_RET | |
582 | %endmacro | |
583 | ||
584 | INIT_MMX mmxext | |
585 | QPEL8OR16_HV1_LOWPASS_OP put | |
586 | QPEL8OR16_HV1_LOWPASS_OP avg | |
587 | ||
588 | INIT_XMM sse2 | |
589 | QPEL8OR16_HV1_LOWPASS_OP put | |
590 | ||
591 | ||
592 | ||
593 | %macro QPEL8OR16_HV2_LOWPASS_OP 1 | |
594 | ; unused is to match ssse3 and mmxext args | |
595 | cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h | |
596 | movsxdifnidn r2, r2d | |
597 | .loop: | |
598 | mova m0, [r1] | |
599 | mova m3, [r1+8] | |
600 | mova m1, [r1+2] | |
601 | mova m4, [r1+10] | |
602 | paddw m0, m4 | |
603 | paddw m1, m3 | |
604 | paddw m3, [r1+18] | |
605 | paddw m4, [r1+16] | |
606 | mova m2, [r1+4] | |
607 | mova m5, [r1+12] | |
608 | paddw m2, [r1+6] | |
609 | paddw m5, [r1+14] | |
610 | psubw m0, m1 | |
611 | psubw m3, m4 | |
612 | psraw m0, 2 | |
613 | psraw m3, 2 | |
614 | psubw m0, m1 | |
615 | psubw m3, m4 | |
616 | paddsw m0, m2 | |
617 | paddsw m3, m5 | |
618 | psraw m0, 2 | |
619 | psraw m3, 2 | |
620 | paddw m0, m2 | |
621 | paddw m3, m5 | |
622 | psraw m0, 6 | |
623 | psraw m3, 6 | |
624 | packuswb m0, m3 | |
625 | op_%1 m0, [r0], m7 | |
626 | add r1, 48 | |
627 | add r0, r2 | |
628 | dec r4d | |
629 | jne .loop | |
630 | REP_RET | |
631 | %endmacro | |
632 | ||
633 | INIT_MMX mmxext | |
634 | QPEL8OR16_HV2_LOWPASS_OP put | |
635 | QPEL8OR16_HV2_LOWPASS_OP avg | |
636 | ||
637 | %macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1 | |
638 | cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size | |
639 | movsxdifnidn r2, r2d | |
640 | movsxdifnidn r3, r3d | |
641 | cmp r4d, 16 | |
642 | je .op16 | |
643 | .loop8: | |
644 | mova m1, [r1+16] | |
645 | mova m0, [r1] | |
646 | mova m2, m1 | |
647 | mova m3, m1 | |
648 | mova m4, m1 | |
649 | mova m5, m1 | |
650 | palignr m5, m0, 10 | |
651 | palignr m4, m0, 8 | |
652 | palignr m3, m0, 6 | |
653 | palignr m2, m0, 4 | |
654 | palignr m1, m0, 2 | |
655 | paddw m0, m5 | |
656 | paddw m1, m4 | |
657 | paddw m2, m3 | |
658 | psubw m0, m1 | |
659 | psraw m0, 2 | |
660 | psubw m0, m1 | |
661 | paddw m0, m2 | |
662 | psraw m0, 2 | |
663 | paddw m0, m2 | |
664 | psraw m0, 6 | |
665 | packuswb m0, m0 | |
666 | op_%1h m0, [r0], m7 | |
667 | add r1, 48 | |
668 | add r0, r2 | |
669 | dec r4d | |
670 | jne .loop8 | |
671 | jmp .done | |
672 | .op16: | |
673 | mova m4, [r1+32] | |
674 | mova m5, [r1+16] | |
675 | mova m7, [r1] | |
676 | mova m3, m4 | |
677 | mova m2, m4 | |
678 | mova m1, m4 | |
679 | mova m0, m4 | |
680 | palignr m0, m5, 10 | |
681 | palignr m1, m5, 8 | |
682 | palignr m2, m5, 6 | |
683 | palignr m3, m5, 4 | |
684 | palignr m4, m5, 2 | |
685 | paddw m0, m5 | |
686 | paddw m1, m4 | |
687 | paddw m2, m3 | |
688 | mova m6, m5 | |
689 | mova m4, m5 | |
690 | mova m3, m5 | |
691 | palignr m4, m7, 8 | |
692 | palignr m6, m7, 2 | |
693 | palignr m3, m7, 10 | |
694 | paddw m4, m6 | |
695 | mova m6, m5 | |
696 | palignr m5, m7, 6 | |
697 | palignr m6, m7, 4 | |
698 | paddw m3, m7 | |
699 | paddw m5, m6 | |
700 | psubw m0, m1 | |
701 | psubw m3, m4 | |
702 | psraw m0, 2 | |
703 | psraw m3, 2 | |
704 | psubw m0, m1 | |
705 | psubw m3, m4 | |
706 | paddw m0, m2 | |
707 | paddw m3, m5 | |
708 | psraw m0, 2 | |
709 | psraw m3, 2 | |
710 | paddw m0, m2 | |
711 | paddw m3, m5 | |
712 | psraw m0, 6 | |
713 | psraw m3, 6 | |
714 | packuswb m3, m0 | |
715 | op_%1 m3, [r0], m7 | |
716 | add r1, 48 | |
717 | add r0, r2 | |
718 | dec r4d | |
719 | jne .op16 | |
720 | .done: | |
721 | REP_RET | |
722 | %endmacro | |
723 | ||
724 | INIT_XMM ssse3 | |
725 | QPEL8OR16_HV2_LOWPASS_OP_XMM put | |
726 | QPEL8OR16_HV2_LOWPASS_OP_XMM avg | |
727 | ||
728 | ||
729 | %macro PIXELS4_L2_SHIFT5 1 | |
730 | cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h | |
731 | movsxdifnidn r3, r3d | |
732 | movsxdifnidn r4, r4d | |
733 | mova m0, [r1] | |
734 | mova m1, [r1+24] | |
735 | psraw m0, 5 | |
736 | psraw m1, 5 | |
737 | packuswb m0, m0 | |
738 | packuswb m1, m1 | |
739 | pavgb m0, [r2] | |
740 | pavgb m1, [r2+r4] | |
741 | op_%1h m0, [r0], m4 | |
742 | op_%1h m1, [r0+r3], m5 | |
743 | lea r2, [r2+r4*2] | |
744 | lea r0, [r0+r3*2] | |
745 | mova m0, [r1+48] | |
746 | mova m1, [r1+72] | |
747 | psraw m0, 5 | |
748 | psraw m1, 5 | |
749 | packuswb m0, m0 | |
750 | packuswb m1, m1 | |
751 | pavgb m0, [r2] | |
752 | pavgb m1, [r2+r4] | |
753 | op_%1h m0, [r0], m4 | |
754 | op_%1h m1, [r0+r3], m5 | |
755 | RET | |
756 | %endmacro | |
757 | ||
758 | INIT_MMX mmxext | |
759 | PIXELS4_L2_SHIFT5 put | |
760 | PIXELS4_L2_SHIFT5 avg | |
761 | ||
762 | ||
763 | %macro PIXELS8_L2_SHIFT5 1 | |
764 | cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h | |
765 | movsxdifnidn r3, r3d | |
766 | movsxdifnidn r4, r4d | |
767 | .loop: | |
768 | mova m0, [r1] | |
769 | mova m1, [r1+8] | |
770 | mova m2, [r1+48] | |
771 | mova m3, [r1+48+8] | |
772 | psraw m0, 5 | |
773 | psraw m1, 5 | |
774 | psraw m2, 5 | |
775 | psraw m3, 5 | |
776 | packuswb m0, m1 | |
777 | packuswb m2, m3 | |
778 | pavgb m0, [r2] | |
779 | pavgb m2, [r2+r4] | |
780 | op_%1 m0, [r0], m4 | |
781 | op_%1 m2, [r0+r3], m5 | |
782 | lea r2, [r2+2*r4] | |
783 | add r1, 48*2 | |
784 | lea r0, [r0+2*r3] | |
785 | sub r5d, 2 | |
786 | jne .loop | |
787 | REP_RET | |
788 | %endmacro | |
789 | ||
790 | INIT_MMX mmxext | |
791 | PIXELS8_L2_SHIFT5 put | |
792 | PIXELS8_L2_SHIFT5 avg | |
793 | ||
794 | ||
795 | %if ARCH_X86_64 | |
796 | %macro QPEL16_H_LOWPASS_L2_OP 1 | |
797 | cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride | |
798 | movsxdifnidn r3, r3d | |
799 | movsxdifnidn r4, r4d | |
800 | mov r5d, 16 | |
801 | pxor m15, m15 | |
802 | mova m14, [pw_5] | |
803 | mova m13, [pw_16] | |
804 | .loop: | |
805 | lddqu m1, [r1+6] | |
806 | lddqu m7, [r1-2] | |
807 | mova m0, m1 | |
808 | punpckhbw m1, m15 | |
809 | punpcklbw m0, m15 | |
810 | punpcklbw m7, m15 | |
811 | mova m2, m1 | |
812 | mova m6, m0 | |
813 | mova m3, m1 | |
814 | mova m8, m0 | |
815 | mova m4, m1 | |
816 | mova m9, m0 | |
817 | mova m12, m0 | |
818 | mova m11, m1 | |
819 | palignr m11, m0, 10 | |
820 | palignr m12, m7, 10 | |
821 | palignr m4, m0, 2 | |
822 | palignr m9, m7, 2 | |
823 | palignr m3, m0, 4 | |
824 | palignr m8, m7, 4 | |
825 | palignr m2, m0, 6 | |
826 | palignr m6, m7, 6 | |
827 | paddw m11, m0 | |
828 | palignr m1, m0, 8 | |
829 | palignr m0, m7, 8 | |
830 | paddw m7, m12 | |
831 | paddw m2, m3 | |
832 | paddw m6, m8 | |
833 | paddw m1, m4 | |
834 | paddw m0, m9 | |
835 | psllw m2, 2 | |
836 | psllw m6, 2 | |
837 | psubw m2, m1 | |
838 | psubw m6, m0 | |
839 | paddw m11, m13 | |
840 | paddw m7, m13 | |
841 | pmullw m2, m14 | |
842 | pmullw m6, m14 | |
843 | lddqu m3, [r2] | |
844 | paddw m2, m11 | |
845 | paddw m6, m7 | |
846 | psraw m2, 5 | |
847 | psraw m6, 5 | |
848 | packuswb m6, m2 | |
849 | pavgb m6, m3 | |
850 | op_%1 m6, [r0], m11 | |
851 | add r1, r3 | |
852 | add r0, r3 | |
853 | add r2, r4 | |
854 | dec r5d | |
855 | jg .loop | |
856 | REP_RET | |
857 | %endmacro | |
858 | ||
859 | INIT_XMM ssse3 | |
860 | QPEL16_H_LOWPASS_L2_OP put | |
861 | QPEL16_H_LOWPASS_L2_OP avg | |
862 | %endif |