; Imported Debian version 2.4.3~trusty1
; [deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / qpeldsp.asm
; CommitLineData 2ba45a60 DM
;******************************************************************************
;* mpeg4 qpel
;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; Syntax: NASM/Yasm with the x86inc/x86util macro layer (cglobal, mova, rN/mN
; register aliases, INIT_MMX, ...), all of which come from the include below.
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
; Constants defined in another object file; only referenced here.
; pw_3 / pw_20 are used as multiplier operands and pw_15 / pw_16 as the
; PW_ROUND bias in the low-pass macros further down.
; NOTE(review): pb_1 is not referenced in the visible part of this file.
cextern pb_1
cextern pw_3
cextern pw_15
cextern pw_16
cextern pw_20


SECTION_TEXT

; void ff_put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                               int dstStride, int src1Stride, int h)
;
; Writes, for each of h rows, the byte-wise average of 8 bytes from src1 and
; 8 bytes from src2 to dst.
;
; Register roles as used below (r0..r5 = the six arguments, in order):
;   r0 = dst   (advanced by r3 = dstStride per row)
;   r1 = src1  (advanced by r4 = src1Stride per row)
;   r2 = src2  (read as tightly packed rows: 8 bytes per row)
;   r5 = h     (remaining row count)
;   m6 = all-ones mask
;
; Both inputs and the result are XORed with the all-ones mask around PAVGB.
; With a round-up PAVGB (macro from x86util.asm; presumably pavgb/pavgusb)
; this yields the round-down average that "no_rnd" in the name refers to.
;
; NOTE(review): an odd h is handled by one leading single-row step; the main
; loop then consumes 4 rows per iteration and exits only when the counter hits
; exactly zero, so the remaining count must be a positive multiple of 4.
; Confirm that all callers respect this.
%macro PUT_NO_RND_PIXELS8_L2 0
cglobal put_no_rnd_pixels8_l2, 6,6
    movsxdifnidn r4, r4d
    movsxdifnidn r3, r3d
    pcmpeqb     m6, m6                  ; m6 = 0xFF in every byte
    test        r5d, 1
    je          .loop                   ; even h: go straight to the 4-row loop

    ; odd h: process a single leading row
    mova        m0, [r1]
    mova        m1, [r2]
    add         r1, r4
    add         r2, 8
    pxor        m0, m6
    pxor        m1, m6
    PAVGB       m0, m1
    pxor        m0, m6
    mova        [r0], m0
    add         r0, r3
    dec         r5d

.loop:
    ; rows 0 and 1 of this group of four
    mova        m0, [r1]
    add         r1, r4
    mova        m1, [r1]
    add         r1, r4
    mova        m2, [r2]
    mova        m3, [r2+8]
    pxor        m0, m6
    pxor        m1, m6
    pxor        m2, m6
    pxor        m3, m6
    PAVGB       m0, m2
    PAVGB       m1, m3
    pxor        m0, m6
    pxor        m1, m6
    mova        [r0], m0
    add         r0, r3
    mova        [r0], m1
    add         r0, r3

    ; rows 2 and 3 of this group of four
    mova        m0, [r1]
    add         r1, r4
    mova        m1, [r1]
    add         r1, r4
    mova        m2, [r2+16]
    mova        m3, [r2+24]
    pxor        m0, m6
    pxor        m1, m6
    pxor        m2, m6
    pxor        m3, m6
    PAVGB       m0, m2
    PAVGB       m1, m3
    pxor        m0, m6
    pxor        m1, m6
    mova        [r0], m0
    add         r0, r3
    mova        [r0], m1
    add         r0, r3

    add         r2, 32                  ; 4 packed src2 rows consumed
    sub         r5d, 4
    jne         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PUT_NO_RND_PIXELS8_L2


; void ff_put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2,
;                                int dstStride, int src1Stride, int h)
;
; 16-byte-wide counterpart of the 8-byte l2 routine: each output row is the
; byte-wise average of 16 bytes from src1 and 16 bytes from src2, handled as
; two 8-byte halves.
;
; Register roles as used below (r0..r5 = the six arguments, in order):
;   r0 = dst   (advanced by r3 = dstStride per row)
;   r1 = src1  (advanced by r4 = src1Stride per row)
;   r2 = src2  (read as tightly packed rows: 16 bytes per row)
;   r5 = h     (remaining row count)
;   m6 = all-ones mask; XORing inputs and result with it around PAVGB gives a
;        round-down average, assuming PAVGB (x86util.asm) rounds up.
;
; NOTE(review): an odd h is handled by one leading single-row step; the main
; loop then consumes 2 rows per iteration and exits only when the counter hits
; exactly zero, so h == 0 or h == 1 would not terminate as intended. Confirm
; that callers never pass such values.
%macro PUT_NO_RND_PIXELS16_l2 0
cglobal put_no_rnd_pixels16_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    pcmpeqb     m6, m6                  ; m6 = 0xFF in every byte
    test        r5d, 1
    je          .loop                   ; even h: go straight to the 2-row loop

    ; odd h: process a single leading row
    mova        m0, [r1]
    mova        m1, [r1+8]
    mova        m2, [r2]
    mova        m3, [r2+8]
    pxor        m0, m6
    pxor        m1, m6
    pxor        m2, m6
    pxor        m3, m6
    PAVGB       m0, m2
    PAVGB       m1, m3
    pxor        m0, m6
    pxor        m1, m6
    add         r1, r4
    add         r2, 16
    mova        [r0], m0
    mova        [r0+8], m1
    add         r0, r3
    dec         r5d

.loop:
    ; first row of the pair
    mova        m0, [r1]
    mova        m1, [r1+8]
    add         r1, r4
    mova        m2, [r2]
    mova        m3, [r2+8]
    pxor        m0, m6
    pxor        m1, m6
    pxor        m2, m6
    pxor        m3, m6
    PAVGB       m0, m2
    PAVGB       m1, m3
    pxor        m0, m6
    pxor        m1, m6
    mova        [r0], m0
    mova        [r0+8], m1
    add         r0, r3

    ; second row of the pair
    mova        m0, [r1]
    mova        m1, [r1+8]
    add         r1, r4
    mova        m2, [r2+16]
    mova        m3, [r2+24]
    pxor        m0, m6
    pxor        m1, m6
    pxor        m2, m6
    pxor        m3, m6
    PAVGB       m0, m2
    PAVGB       m1, m3
    pxor        m0, m6
    pxor        m1, m6
    mova        [r0], m0
    mova        [r0+8], m1
    add         r0, r3

    add         r2, 32                  ; 2 packed src2 rows consumed
    sub         r5d, 2
    jne         .loop
    REP_RET
%endmacro

; The same body is emitted once per instruction-set flavour.
INIT_MMX mmxext
PUT_NO_RND_PIXELS16_l2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS16_l2

; MPEG4_QPEL16_H_LOWPASS <prefix>
;
; Emits <prefix>_mpeg4_qpel16_h_lowpass: a horizontal 8-tap low-pass filter
; producing 16 output bytes per row.
;
; Register roles as used below:
;   r0 = dst (advanced by r2 per row)      r1 = src (advanced by r3 per row)
;   r4 = number of rows (loop exits when it reaches zero)
;   m7 = zero, used to widen bytes to words
;   [rsp+8] = 8-byte scratch holding outputs 0..3 until outputs 4..7 are ready
;
; For output i, with x[] the source row:
;   out[i] = sat_u8(( 20*(x[i]  +x[i+1]) - 6*(x[i-1]+x[i+2])
;                    + 3*(x[i-2]+x[i+3]) -   (x[i-3]+x[i+4]) + PW_ROUND) >> 5)
; The -6 and +3 weights are formed as 3*((x[i-2]+x[i+3]) - 2*(x[i-1]+x[i+2])).
; Taps falling left of x[0] or right of x[16] are replaced by reflected
; in-row samples via the pshufw patterns. Each row reads x[0]..x[16].
;
; Requires PW_ROUND (rounding constant) and OP_MOV (store macro taking
; dst, value, scratch) to be %defined before the macro is expanded.
%macro MPEG4_QPEL16_H_LOWPASS 1
cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor        m7, m7
.loop:
    ; ---- outputs 0..3 (left edge reflected) ----
    mova        m0, [r1]
    mova        m1, m0
    mova        m2, m0
    punpcklbw   m0, m7                  ; m0 = x[0..3]
    punpckhbw   m1, m7                  ; m1 = x[4..7]
    pshufw      m5, m0, 0x90            ; m5 = x[0],x[0],x[1],x[2]
    pshufw      m6, m0, 0x41            ; m6 = x[1],x[0],x[0],x[1]
    mova        m3, m2
    mova        m4, m2
    psllq       m2, 8
    psllq       m3, 16
    psllq       m4, 24
    punpckhbw   m2, m7                  ; m2 = x[3..6]
    punpckhbw   m3, m7                  ; m3 = x[2..5]
    punpckhbw   m4, m7                  ; m4 = x[1..4]
    paddw       m5, m3
    paddw       m6, m2
    paddw       m5, m5
    psubw       m6, m5
    pshufw      m5, m0, 6               ; m5 = x[2],x[1],x[0],x[0]
    pmullw      m6, [pw_3]
    paddw       m0, m4
    paddw       m5, m1
    pmullw      m0, [pw_20]
    psubw       m0, m5
    paddw       m6, [PW_ROUND]
    paddw       m0, m6
    psraw       m0, 5
    mova        [rsp+8], m0             ; park outputs 0..3 (as words)

    ; ---- outputs 4..7 ----
    mova        m0, [r1+5]
    mova        m5, m0
    mova        m6, m0
    psrlq       m0, 8
    psrlq       m5, 16
    punpcklbw   m0, m7                  ; m0 = x[6..9]
    punpcklbw   m5, m7                  ; m5 = x[7..10]
    paddw       m2, m0
    paddw       m3, m5
    paddw       m2, m2
    psubw       m3, m2
    mova        m2, m6
    psrlq       m6, 24
    punpcklbw   m2, m7                  ; m2 = x[5..8]
    punpcklbw   m6, m7                  ; m6 = x[8..11]
    pmullw      m3, [pw_3]
    paddw       m1, m2
    paddw       m4, m6
    pmullw      m1, [pw_20]
    psubw       m3, m4
    paddw       m1, [PW_ROUND]
    paddw       m3, m1
    psraw       m3, 5
    mova        m1, [rsp+8]
    packuswb    m1, m3                  ; outputs 0..7, saturated to bytes
    OP_MOV      [r0], m1, m4

    ; ---- outputs 8..11 ----
    mova        m1, [r1+9]
    mova        m4, m1
    mova        m3, m1
    psrlq       m1, 8
    psrlq       m4, 16
    punpcklbw   m1, m7                  ; m1 = x[10..13]
    punpcklbw   m4, m7                  ; m4 = x[11..14]
    paddw       m5, m1
    paddw       m0, m4
    paddw       m5, m5
    psubw       m0, m5
    mova        m5, m3
    psrlq       m3, 24
    pmullw      m0, [pw_3]
    punpcklbw   m3, m7                  ; m3 = x[12..15]
    paddw       m2, m3
    psubw       m0, m2
    mova        m2, m5
    punpcklbw   m2, m7                  ; m2 = x[9..12]
    punpckhbw   m5, m7                  ; m5 = x[13..16]
    paddw       m6, m2
    pmullw      m6, [pw_20]
    paddw       m0, [PW_ROUND]
    paddw       m0, m6
    psraw       m0, 5

    ; ---- outputs 12..15 (right edge reflected) ----
    paddw       m3, m5
    pshufw      m6, m5, 0xf9            ; m6 = x[14],x[15],x[16],x[16]
    paddw       m6, m4
    pshufw      m4, m5, 0xbe            ; m4 = x[15],x[16],x[16],x[15]
    pshufw      m5, m5, 0x6f            ; m5 = x[16],x[16],x[15],x[14]
    paddw       m4, m1
    paddw       m5, m2
    paddw       m6, m6
    psubw       m4, m6
    pmullw      m3, [pw_20]
    pmullw      m4, [pw_3]
    psubw       m3, m5
    paddw       m4, [PW_ROUND]
    paddw       m4, m3
    psraw       m4, 5
    packuswb    m0, m4                  ; outputs 8..15, saturated to bytes
    OP_MOV      [r0+8], m0, m4

    add         r1, r3
    add         r0, r2
    dec         r4d
    jne         .loop
    REP_RET
%endmacro

; PUT_OP dst, src [, scratch]
; Plain store: dst = src. The optional third argument is accepted only so the
; macro is call-compatible with AVG_OP; it is not used.
%macro PUT_OP 2-3
    mova        %1, %2
%endmacro

; AVG_OP dst, src, scratch
; Averaging store: dst = pavgb(src, dst).
; Clobbers both src (%2) and scratch (%3).
; NOTE(review): the macro is declared as taking 2-3 arguments, but the body
; always uses %3, so the scratch register is effectively mandatory.
%macro AVG_OP 2-3
    mova        %3, %1                  ; scratch = current dst contents
    pavgb       %2, %3
    mova        %1, %2
%endmacro

; Three expansions of the 16-wide horizontal filter. PW_ROUND and OP_MOV are
; re-%defined before each expansion to select the rounding constant and the
; store flavour:
;   put        : pw_16, plain store
;   avg        : pw_16, store averaged with the existing destination
;   put_no_rnd : pw_15, plain store
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL16_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put_no_rnd



; MPEG4_QPEL8_H_LOWPASS <prefix>
;
; Emits <prefix>_mpeg4_qpel8_h_lowpass: a horizontal 8-tap low-pass filter
; producing 8 output bytes per row.
;
; Register roles as used below:
;   r0 = dst (advanced by r2 per row)      r1 = src (advanced by r3 per row)
;   r4 = number of rows (loop exits when it reaches zero)
;   m7 = zero, used to widen bytes to words
;
; For output i, with x[] the source row:
;   out[i] = sat_u8(( 20*(x[i]  +x[i+1]) - 6*(x[i-1]+x[i+2])
;                    + 3*(x[i-2]+x[i+3]) -   (x[i-3]+x[i+4]) + PW_ROUND) >> 5)
; Taps falling left of x[0] or right of x[8] are replaced by reflected in-row
; samples via the pshufw patterns.
;
; Requires PW_ROUND and OP_MOV to be %defined before expansion.
; NOTE(review): cglobal requests 8 bytes of stack, but no rsp-relative access
; appears in the body - confirm whether the reservation is still needed.
%macro MPEG4_QPEL8_H_LOWPASS 1
cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d
    pxor        m7, m7
.loop:
    ; ---- outputs 0..3 (left edge reflected) ----
    mova        m0, [r1]
    mova        m1, m0
    mova        m2, m0
    punpcklbw   m0, m7                  ; m0 = x[0..3]
    punpckhbw   m1, m7                  ; m1 = x[4..7]
    pshufw      m5, m0, 0x90            ; m5 = x[0],x[0],x[1],x[2]
    pshufw      m6, m0, 0x41            ; m6 = x[1],x[0],x[0],x[1]
    mova        m3, m2
    mova        m4, m2
    psllq       m2, 8
    psllq       m3, 16
    psllq       m4, 24
    punpckhbw   m2, m7                  ; m2 = x[3..6]
    punpckhbw   m3, m7                  ; m3 = x[2..5]
    punpckhbw   m4, m7                  ; m4 = x[1..4]
    paddw       m5, m3
    paddw       m6, m2
    paddw       m5, m5
    psubw       m6, m5
    pshufw      m5, m0, 0x6             ; m5 = x[2],x[1],x[0],x[0]
    pmullw      m6, [pw_3]
    paddw       m0, m4
    paddw       m5, m1
    pmullw      m0, [pw_20]
    psubw       m0, m5
    paddw       m6, [PW_ROUND]
    paddw       m0, m6
    psraw       m0, 5

    ; ---- outputs 4..7 (right edge reflected) ----
    movh        m5, [r1+5]
    punpcklbw   m5, m7                  ; m5 = x[5..8]
    pshufw      m6, m5, 0xf9            ; m6 = x[6],x[7],x[8],x[8]
    paddw       m1, m5
    paddw       m2, m6
    pshufw      m6, m5, 0xbe            ; m6 = x[7],x[8],x[8],x[7]
    pshufw      m5, m5, 0x6f            ; m5 = x[8],x[8],x[7],x[6]
    paddw       m3, m6
    paddw       m4, m5
    paddw       m2, m2
    psubw       m3, m2
    pmullw      m1, [pw_20]
    pmullw      m3, [pw_3]
    psubw       m3, m4
    paddw       m1, [PW_ROUND]
    paddw       m3, m1
    psraw       m3, 5
    packuswb    m0, m3                  ; outputs 0..7, saturated to bytes
    OP_MOV      [r0], m0, m4

    add         r1, r3
    add         r0, r2
    dec         r4d
    jne         .loop
    REP_RET
%endmacro

; Three expansions of the 8-wide horizontal filter. PW_ROUND and OP_MOV are
; re-%defined before each expansion to select the rounding constant and the
; store flavour:
;   put        : pw_16, plain store
;   avg        : pw_16, store averaged with the existing destination
;   put_no_rnd : pw_15, plain store
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL8_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put_no_rnd



; QPEL_V_LOW d_row, c_row, b_row, next_row, dst
;
; One output row of the vertical 8-tap filter, for four 16-bit lanes.
;
; On entry m0..m3 hold four consecutive input rows (call them R0..R3) as words.
;   %1 = memory operand paired with %4   (weight -1)
;   %2 = memory operand paired with m3   (weight +3)
;   %3 = memory operand paired with m2   (weight -6)
;   %4 = memory operand of the row following R3; it is loaded into m0
;   %5 = destination operand for the 4 result bytes
;
; Computes
;   sat_u8((20*(m0+m1) - 6*(%3+m2) + 3*(%2+m3) - (%1+%4) + PW_ROUND) >> 5)
; and stores it through OP_MOV.
;
; Clobbers m4, m5, m6; m7 is handed to OP_MOV as scratch. The trailing SWAP
; renames m0..m3 (presumably so that the window becomes R1, R2, R3, %4 for the
; next call - see x86inc's SWAP for the exact permutation).
; Requires PW_ROUND and OP_MOV to be %defined before expansion.
%macro QPEL_V_LOW 5
    paddw       m0, m1
    mova        m4, [pw_20]
    pmullw      m4, m0                  ; m4 = 20 * (R0 + R1)
    mova        m0, %4                  ; m0 is free now: load the next row
    mova        m5, %1
    paddw       m5, m0
    psubw       m4, m5                  ; m4 -= (%1 + %4)
    mova        m5, %2
    mova        m6, %3
    paddw       m5, m3
    paddw       m6, m2
    paddw       m6, m6
    psubw       m5, m6                  ; m5 = (%2 + R3) - 2*(%3 + R2)
    pmullw      m5, [pw_3]
    paddw       m4, [PW_ROUND]
    paddw       m5, m4
    psraw       m5, 5
    packuswb    m5, m5
    OP_MOV      %5, m5, m7
    SWAP 0,1,2,3
%endmacro

; MPEG4_QPEL16_V_LOWPASS <prefix>
;
; Emits <prefix>_mpeg4_qpel16_v_lowpass: the vertical 8-tap low-pass filter for
; a 16-byte-wide, 16-row output block.
;
; Register roles as used below:
;   r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride
;   r4 = loop counter, r5 = cursor into the stack scratch area
;
; Pass 1 (.looph): 17 source rows of 16 bytes are widened to words and stored
; in four planes of 4 columns each; row y of plane g is at [rsp + g*0x88 + y*8]
; (0x88 = 17 rows * 8 bytes; 4 planes = 544 bytes, the amount requested from
; cglobal).
; Pass 2 (.loopv): for each plane, 16 QPEL_V_LOW expansions emit 16 output rows
; of 4 bytes each. The QPEL_V_LOW operands at the top and bottom of the block
; reference reflected rows instead of rows outside the 17 stored ones.
;
; Requires PW_ROUND and OP_MOV to be %defined before expansion.
%macro MPEG4_QPEL16_V_LOWPASS 1
cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d

    mov         r4d, 17
    mov         r5, rsp
    pxor        m7, m7
.looph:
    mova        m0, [r1]
    mova        m1, [r1]
    mova        m2, [r1+8]
    mova        m3, [r1+8]
    punpcklbw   m0, m7                  ; columns  0..3
    punpckhbw   m1, m7                  ; columns  4..7
    punpcklbw   m2, m7                  ; columns  8..11
    punpckhbw   m3, m7                  ; columns 12..15
    mova        [r5], m0
    mova        [r5+0x88], m1
    mova        [r5+0x110], m2
    mova        [r5+0x198], m3
    add         r5, 8
    add         r1, r3
    dec         r4d
    jne         .looph


    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
    ; Each .loopv pass advances r0 by 14*dstStride (seven "lea r0, [r0+r2*2]"),
    ; so adding r1 afterwards rewinds to the top row, 4 bytes further right.
    mov         r4d, 4
    mov         r1, 4
    neg         r2
    lea         r1, [r1+r2*8]
    lea         r1, [r1+r2*4]
    lea         r1, [r1+r2*2]
    neg         r2
    mov         r5, rsp
.loopv:
    pxor        m7, m7
    mova        m0, [r5+ 0x0]
    mova        m1, [r5+ 0x8]
    mova        m2, [r5+0x10]
    mova        m3, [r5+0x18]
    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
    QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
    QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
    QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
    QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]

    add         r5, 0x88                ; next 4-column plane
    add         r0, r1                  ; back to row 0, 4 bytes to the right
    dec         r4d
    jne         .loopv
    REP_RET
%endmacro

; PUT_OPH dst, src [, scratch]
; Plain store using movh instead of mova (QPEL_V_LOW produces only 4 result
; bytes per call). The optional third argument is accepted only so the macro
; is call-compatible with AVG_OPH; it is not used.
%macro PUT_OPH 2-3
    movh        %1, %2
%endmacro

; AVG_OPH dst, src, scratch
; Averaging store using movh: dst = pavgb(src, dst).
; Clobbers both src (%2) and scratch (%3).
; NOTE(review): the macro is declared as taking 2-3 arguments, but the body
; always uses %3, so the scratch register is effectively mandatory.
%macro AVG_OPH 2-3
    movh        %3, %1                  ; scratch = current dst contents
    pavgb       %2, %3
    movh        %1, %2
%endmacro

; Three expansions of the 16-wide vertical filter. PW_ROUND and OP_MOV are
; re-%defined before each expansion to select the rounding constant and the
; store flavour:
;   put        : pw_16, plain store
;   avg        : pw_16, store averaged with the existing destination
;   put_no_rnd : pw_15, plain store
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL16_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put_no_rnd



; MPEG4_QPEL8_V_LOWPASS <prefix>
;
; Emits <prefix>_mpeg4_qpel8_v_lowpass: the vertical 8-tap low-pass filter for
; an 8-byte-wide, 8-row output block.
;
; Register roles as used below:
;   r0 = dst, r1 = src, r2 = dstStride, r3 = srcStride
;   r4 = loop counter, r5 = cursor into the stack scratch area
;
; Pass 1 (.looph): 9 source rows of 8 bytes are widened to words and stored in
; two planes of 4 columns each; row y of plane g is at [rsp + g*0x48 + y*8]
; (0x48 = 9 rows * 8 bytes).
; Pass 2 (.loopv): for each plane, 8 QPEL_V_LOW expansions emit 8 output rows
; of 4 bytes each. The QPEL_V_LOW operands at the top and bottom of the block
; reference reflected rows instead of rows outside the 9 stored ones.
;
; NOTE(review): cglobal requests 288 bytes of stack, while the code below only
; addresses the first 2*0x48 = 144 bytes.
;
; Requires PW_ROUND and OP_MOV to be %defined before expansion.
%macro MPEG4_QPEL8_V_LOWPASS 1
cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
    movsxdifnidn r2, r2d
    movsxdifnidn r3, r3d

    mov         r4d, 9
    mov         r5, rsp
    pxor        m7, m7
.looph:
    mova        m0, [r1]
    mova        m1, [r1]
    punpcklbw   m0, m7                  ; columns 0..3
    punpckhbw   m1, m7                  ; columns 4..7
    mova        [r5], m0
    mova        [r5+0x48], m1
    add         r5, 8
    add         r1, r3
    dec         r4d
    jne         .looph


    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
    ; Each .loopv pass advances r0 by 6*dstStride (three "lea r0, [r0+r2*2]"),
    ; so adding r1 afterwards rewinds to the top row, 4 bytes further right.
    mov         r4d, 2
    mov         r1, 4
    neg         r2
    lea         r1, [r1+r2*4]
    lea         r1, [r1+r2*2]
    neg         r2
    mov         r5, rsp
.loopv:
    pxor        m7, m7
    mova        m0, [r5+ 0x0]
    mova        m1, [r5+ 0x8]
    mova        m2, [r5+0x10]
    mova        m3, [r5+0x18]
    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
    lea         r0, [r0+r2*2]
    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]

    add         r5, 0x48                ; next 4-column plane
    add         r0, r1                  ; back to row 0, 4 bytes to the right
    dec         r4d
    jne         .loopv
    REP_RET
%endmacro

; Three expansions of the 8-wide vertical filter. PW_ROUND and OP_MOV are
; re-%defined before each expansion to select the rounding constant and the
; store flavour:
;   put        : pw_16, plain store
;   avg        : pw_16, store averaged with the existing destination
;   put_no_rnd : pw_15, plain store
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL8_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put_no_rnd