;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* SIMD-optimized halfpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

28 | %include "libavutil/x86/x86util.asm" | |
29 | ||
30 | SECTION_RODATA | |
31 | cextern pb_1 | |
32 | cextern pw_2 | |
33 | pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 | |
34 | pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7 | |
35 | ||
36 | cextern pw_8192 | |
37 | ||
38 | SECTION_TEXT | |
39 | ||
40 | ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
41 | %macro PUT_PIXELS8_X2 0 | |
42 | %if cpuflag(sse2) | |
43 | cglobal put_pixels16_x2, 4,5,4 | |
44 | %else | |
45 | cglobal put_pixels8_x2, 4,5 | |
46 | %endif | |
47 | lea r4, [r2*2] | |
48 | .loop: | |
49 | movu m0, [r1+1] | |
50 | movu m1, [r1+r2+1] | |
51 | %if cpuflag(sse2) | |
52 | movu m2, [r1] | |
53 | movu m3, [r1+r2] | |
54 | pavgb m0, m2 | |
55 | pavgb m1, m3 | |
56 | %else | |
57 | PAVGB m0, [r1] | |
58 | PAVGB m1, [r1+r2] | |
59 | %endif | |
60 | mova [r0], m0 | |
61 | mova [r0+r2], m1 | |
62 | add r1, r4 | |
63 | add r0, r4 | |
64 | movu m0, [r1+1] | |
65 | movu m1, [r1+r2+1] | |
66 | %if cpuflag(sse2) | |
67 | movu m2, [r1] | |
68 | movu m3, [r1+r2] | |
69 | pavgb m0, m2 | |
70 | pavgb m1, m3 | |
71 | %else | |
72 | PAVGB m0, [r1] | |
73 | PAVGB m1, [r1+r2] | |
74 | %endif | |
75 | add r1, r4 | |
76 | mova [r0], m0 | |
77 | mova [r0+r2], m1 | |
78 | add r0, r4 | |
79 | sub r3d, 4 | |
80 | jne .loop | |
81 | REP_RET | |
82 | %endmacro | |
83 | ||
84 | INIT_MMX mmxext | |
85 | PUT_PIXELS8_X2 | |
86 | INIT_MMX 3dnow | |
87 | PUT_PIXELS8_X2 | |
88 | ||
89 | ||
90 | ; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
91 | %macro PUT_PIXELS_16 0 | |
92 | cglobal put_pixels16_x2, 4,5 | |
93 | lea r4, [r2*2] | |
94 | .loop: | |
95 | mova m0, [r1] | |
96 | mova m1, [r1+r2] | |
97 | mova m2, [r1+8] | |
98 | mova m3, [r1+r2+8] | |
99 | PAVGB m0, [r1+1] | |
100 | PAVGB m1, [r1+r2+1] | |
101 | PAVGB m2, [r1+9] | |
102 | PAVGB m3, [r1+r2+9] | |
103 | mova [r0], m0 | |
104 | mova [r0+r2], m1 | |
105 | mova [r0+8], m2 | |
106 | mova [r0+r2+8], m3 | |
107 | add r1, r4 | |
108 | add r0, r4 | |
109 | mova m0, [r1] | |
110 | mova m1, [r1+r2] | |
111 | mova m2, [r1+8] | |
112 | mova m3, [r1+r2+8] | |
113 | PAVGB m0, [r1+1] | |
114 | PAVGB m1, [r1+r2+1] | |
115 | PAVGB m2, [r1+9] | |
116 | PAVGB m3, [r1+r2+9] | |
117 | add r1, r4 | |
118 | mova [r0], m0 | |
119 | mova [r0+r2], m1 | |
120 | mova [r0+8], m2 | |
121 | mova [r0+r2+8], m3 | |
122 | add r0, r4 | |
123 | sub r3d, 4 | |
124 | jne .loop | |
125 | REP_RET | |
126 | %endmacro | |
127 | ||
128 | INIT_MMX mmxext | |
129 | PUT_PIXELS_16 | |
130 | INIT_MMX 3dnow | |
131 | PUT_PIXELS_16 | |
132 | ; The 8_X2 macro can easily be used here | |
133 | INIT_XMM sse2 | |
134 | PUT_PIXELS8_X2 | |
135 | ||
136 | ||
137 | ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
138 | %macro PUT_NO_RND_PIXELS8_X2 0 | |
139 | cglobal put_no_rnd_pixels8_x2, 4,5 | |
140 | mova m6, [pb_1] | |
141 | lea r4, [r2*2] | |
142 | .loop: | |
143 | mova m0, [r1] | |
144 | mova m2, [r1+r2] | |
145 | mova m1, [r1+1] | |
146 | mova m3, [r1+r2+1] | |
147 | add r1, r4 | |
148 | psubusb m0, m6 | |
149 | psubusb m2, m6 | |
150 | PAVGB m0, m1 | |
151 | PAVGB m2, m3 | |
152 | mova [r0], m0 | |
153 | mova [r0+r2], m2 | |
154 | mova m0, [r1] | |
155 | mova m1, [r1+1] | |
156 | mova m2, [r1+r2] | |
157 | mova m3, [r1+r2+1] | |
158 | add r0, r4 | |
159 | add r1, r4 | |
160 | psubusb m0, m6 | |
161 | psubusb m2, m6 | |
162 | PAVGB m0, m1 | |
163 | PAVGB m2, m3 | |
164 | mova [r0], m0 | |
165 | mova [r0+r2], m2 | |
166 | add r0, r4 | |
167 | sub r3d, 4 | |
168 | jne .loop | |
169 | REP_RET | |
170 | %endmacro | |
171 | ||
172 | INIT_MMX mmxext | |
173 | PUT_NO_RND_PIXELS8_X2 | |
174 | INIT_MMX 3dnow | |
175 | PUT_NO_RND_PIXELS8_X2 | |
176 | ||
177 | ||
178 | ; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
179 | %macro PUT_NO_RND_PIXELS8_X2_EXACT 0 | |
180 | cglobal put_no_rnd_pixels8_x2_exact, 4,5 | |
181 | lea r4, [r2*3] | |
182 | pcmpeqb m6, m6 | |
183 | .loop: | |
184 | mova m0, [r1] | |
185 | mova m2, [r1+r2] | |
186 | mova m1, [r1+1] | |
187 | mova m3, [r1+r2+1] | |
188 | pxor m0, m6 | |
189 | pxor m2, m6 | |
190 | pxor m1, m6 | |
191 | pxor m3, m6 | |
192 | PAVGB m0, m1 | |
193 | PAVGB m2, m3 | |
194 | pxor m0, m6 | |
195 | pxor m2, m6 | |
196 | mova [r0], m0 | |
197 | mova [r0+r2], m2 | |
198 | mova m0, [r1+r2*2] | |
199 | mova m1, [r1+r2*2+1] | |
200 | mova m2, [r1+r4] | |
201 | mova m3, [r1+r4+1] | |
202 | pxor m0, m6 | |
203 | pxor m1, m6 | |
204 | pxor m2, m6 | |
205 | pxor m3, m6 | |
206 | PAVGB m0, m1 | |
207 | PAVGB m2, m3 | |
208 | pxor m0, m6 | |
209 | pxor m2, m6 | |
210 | mova [r0+r2*2], m0 | |
211 | mova [r0+r4], m2 | |
212 | lea r1, [r1+r2*4] | |
213 | lea r0, [r0+r2*4] | |
214 | sub r3d, 4 | |
215 | jg .loop | |
216 | REP_RET | |
217 | %endmacro | |
218 | ||
219 | INIT_MMX mmxext | |
220 | PUT_NO_RND_PIXELS8_X2_EXACT | |
221 | INIT_MMX 3dnow | |
222 | PUT_NO_RND_PIXELS8_X2_EXACT | |
223 | ||
224 | ||
225 | ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
226 | %macro PUT_PIXELS8_Y2 0 | |
227 | %if cpuflag(sse2) | |
228 | cglobal put_pixels16_y2, 4,5,3 | |
229 | %else | |
230 | cglobal put_pixels8_y2, 4,5 | |
231 | %endif | |
232 | lea r4, [r2*2] | |
233 | movu m0, [r1] | |
234 | sub r0, r2 | |
235 | .loop: | |
236 | movu m1, [r1+r2] | |
237 | movu m2, [r1+r4] | |
238 | add r1, r4 | |
239 | PAVGB m0, m1 | |
240 | PAVGB m1, m2 | |
241 | mova [r0+r2], m0 | |
242 | mova [r0+r4], m1 | |
243 | movu m1, [r1+r2] | |
244 | movu m0, [r1+r4] | |
245 | add r0, r4 | |
246 | add r1, r4 | |
247 | PAVGB m2, m1 | |
248 | PAVGB m1, m0 | |
249 | mova [r0+r2], m2 | |
250 | mova [r0+r4], m1 | |
251 | add r0, r4 | |
252 | sub r3d, 4 | |
253 | jne .loop | |
254 | REP_RET | |
255 | %endmacro | |
256 | ||
257 | INIT_MMX mmxext | |
258 | PUT_PIXELS8_Y2 | |
259 | INIT_MMX 3dnow | |
260 | PUT_PIXELS8_Y2 | |
261 | ; actually, put_pixels16_y2_sse2 | |
262 | INIT_XMM sse2 | |
263 | PUT_PIXELS8_Y2 | |
264 | ||
265 | ||
266 | ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
267 | %macro PUT_NO_RND_PIXELS8_Y2 0 | |
268 | cglobal put_no_rnd_pixels8_y2, 4,5 | |
269 | mova m6, [pb_1] | |
270 | lea r4, [r2+r2] | |
271 | mova m0, [r1] | |
272 | sub r0, r2 | |
273 | .loop: | |
274 | mova m1, [r1+r2] | |
275 | mova m2, [r1+r4] | |
276 | add r1, r4 | |
277 | psubusb m1, m6 | |
278 | PAVGB m0, m1 | |
279 | PAVGB m1, m2 | |
280 | mova [r0+r2], m0 | |
281 | mova [r0+r4], m1 | |
282 | mova m1, [r1+r2] | |
283 | mova m0, [r1+r4] | |
284 | add r0, r4 | |
285 | add r1, r4 | |
286 | psubusb m1, m6 | |
287 | PAVGB m2, m1 | |
288 | PAVGB m1, m0 | |
289 | mova [r0+r2], m2 | |
290 | mova [r0+r4], m1 | |
291 | add r0, r4 | |
292 | sub r3d, 4 | |
293 | jne .loop | |
294 | REP_RET | |
295 | %endmacro | |
296 | ||
297 | INIT_MMX mmxext | |
298 | PUT_NO_RND_PIXELS8_Y2 | |
299 | INIT_MMX 3dnow | |
300 | PUT_NO_RND_PIXELS8_Y2 | |
301 | ||
302 | ||
303 | ; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
304 | %macro PUT_NO_RND_PIXELS8_Y2_EXACT 0 | |
305 | cglobal put_no_rnd_pixels8_y2_exact, 4,5 | |
306 | lea r4, [r2*3] | |
307 | mova m0, [r1] | |
308 | pcmpeqb m6, m6 | |
309 | add r1, r2 | |
310 | pxor m0, m6 | |
311 | .loop: | |
312 | mova m1, [r1] | |
313 | mova m2, [r1+r2] | |
314 | pxor m1, m6 | |
315 | pxor m2, m6 | |
316 | PAVGB m0, m1 | |
317 | PAVGB m1, m2 | |
318 | pxor m0, m6 | |
319 | pxor m1, m6 | |
320 | mova [r0], m0 | |
321 | mova [r0+r2], m1 | |
322 | mova m1, [r1+r2*2] | |
323 | mova m0, [r1+r4] | |
324 | pxor m1, m6 | |
325 | pxor m0, m6 | |
326 | PAVGB m2, m1 | |
327 | PAVGB m1, m0 | |
328 | pxor m2, m6 | |
329 | pxor m1, m6 | |
330 | mova [r0+r2*2], m2 | |
331 | mova [r0+r4], m1 | |
332 | lea r1, [r1+r2*4] | |
333 | lea r0, [r0+r2*4] | |
334 | sub r3d, 4 | |
335 | jg .loop | |
336 | REP_RET | |
337 | %endmacro | |
338 | ||
339 | INIT_MMX mmxext | |
340 | PUT_NO_RND_PIXELS8_Y2_EXACT | |
341 | INIT_MMX 3dnow | |
342 | PUT_NO_RND_PIXELS8_Y2_EXACT | |
343 | ||
344 | ||
345 | ; void ff_avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
346 | %macro AVG_PIXELS8 0 | |
347 | cglobal avg_pixels8, 4,5 | |
348 | lea r4, [r2*2] | |
349 | .loop: | |
350 | mova m0, [r0] | |
351 | mova m1, [r0+r2] | |
352 | PAVGB m0, [r1] | |
353 | PAVGB m1, [r1+r2] | |
354 | mova [r0], m0 | |
355 | mova [r0+r2], m1 | |
356 | add r1, r4 | |
357 | add r0, r4 | |
358 | mova m0, [r0] | |
359 | mova m1, [r0+r2] | |
360 | PAVGB m0, [r1] | |
361 | PAVGB m1, [r1+r2] | |
362 | add r1, r4 | |
363 | mova [r0], m0 | |
364 | mova [r0+r2], m1 | |
365 | add r0, r4 | |
366 | sub r3d, 4 | |
367 | jne .loop | |
368 | REP_RET | |
369 | %endmacro | |
370 | ||
371 | INIT_MMX 3dnow | |
372 | AVG_PIXELS8 | |
373 | ||
374 | ||
375 | ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
376 | %macro AVG_PIXELS8_X2 0 | |
377 | %if cpuflag(sse2) | |
378 | cglobal avg_pixels16_x2, 4,5,4 | |
379 | %else | |
380 | cglobal avg_pixels8_x2, 4,5 | |
381 | %endif | |
382 | lea r4, [r2*2] | |
383 | %if notcpuflag(mmxext) | |
384 | pcmpeqd m5, m5 | |
385 | paddb m5, m5 | |
386 | %endif | |
387 | .loop: | |
388 | movu m0, [r1] | |
389 | movu m2, [r1+r2] | |
390 | %if cpuflag(sse2) | |
391 | movu m1, [r1+1] | |
392 | movu m3, [r1+r2+1] | |
393 | pavgb m0, m1 | |
394 | pavgb m2, m3 | |
395 | %else | |
396 | PAVGB m0, [r1+1], m3, m5 | |
397 | PAVGB m2, [r1+r2+1], m4, m5 | |
398 | %endif | |
399 | PAVGB m0, [r0], m3, m5 | |
400 | PAVGB m2, [r0+r2], m4, m5 | |
401 | add r1, r4 | |
402 | mova [r0], m0 | |
403 | mova [r0+r2], m2 | |
404 | movu m0, [r1] | |
405 | movu m2, [r1+r2] | |
406 | %if cpuflag(sse2) | |
407 | movu m1, [r1+1] | |
408 | movu m3, [r1+r2+1] | |
409 | pavgb m0, m1 | |
410 | pavgb m2, m3 | |
411 | %else | |
412 | PAVGB m0, [r1+1], m3, m5 | |
413 | PAVGB m2, [r1+r2+1], m4, m5 | |
414 | %endif | |
415 | add r0, r4 | |
416 | add r1, r4 | |
417 | PAVGB m0, [r0], m3, m5 | |
418 | PAVGB m2, [r0+r2], m4, m5 | |
419 | mova [r0], m0 | |
420 | mova [r0+r2], m2 | |
421 | add r0, r4 | |
422 | sub r3d, 4 | |
423 | jne .loop | |
424 | REP_RET | |
425 | %endmacro | |
426 | ||
427 | INIT_MMX mmx | |
428 | AVG_PIXELS8_X2 | |
429 | INIT_MMX mmxext | |
430 | AVG_PIXELS8_X2 | |
431 | INIT_MMX 3dnow | |
432 | AVG_PIXELS8_X2 | |
433 | ; actually avg_pixels16_x2 | |
434 | INIT_XMM sse2 | |
435 | AVG_PIXELS8_X2 | |
436 | ||
437 | ||
438 | ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
439 | %macro AVG_PIXELS8_Y2 0 | |
440 | %if cpuflag(sse2) | |
441 | cglobal avg_pixels16_y2, 4,5,3 | |
442 | %else | |
443 | cglobal avg_pixels8_y2, 4,5 | |
444 | %endif | |
445 | lea r4, [r2*2] | |
446 | movu m0, [r1] | |
447 | sub r0, r2 | |
448 | .loop: | |
449 | movu m1, [r1+r2] | |
450 | movu m2, [r1+r4] | |
451 | add r1, r4 | |
452 | PAVGB m0, m1 | |
453 | PAVGB m1, m2 | |
454 | PAVGB m0, [r0+r2] | |
455 | PAVGB m1, [r0+r4] | |
456 | mova [r0+r2], m0 | |
457 | mova [r0+r4], m1 | |
458 | movu m1, [r1+r2] | |
459 | movu m0, [r1+r4] | |
460 | PAVGB m2, m1 | |
461 | PAVGB m1, m0 | |
462 | add r0, r4 | |
463 | add r1, r4 | |
464 | PAVGB m2, [r0+r2] | |
465 | PAVGB m1, [r0+r4] | |
466 | mova [r0+r2], m2 | |
467 | mova [r0+r4], m1 | |
468 | add r0, r4 | |
469 | sub r3d, 4 | |
470 | jne .loop | |
471 | REP_RET | |
472 | %endmacro | |
473 | ||
474 | INIT_MMX mmxext | |
475 | AVG_PIXELS8_Y2 | |
476 | INIT_MMX 3dnow | |
477 | AVG_PIXELS8_Y2 | |
478 | ; actually avg_pixels16_y2 | |
479 | INIT_XMM sse2 | |
480 | AVG_PIXELS8_Y2 | |
481 | ||
482 | ||
483 | ; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
484 | ; Note this is not correctly rounded, and is therefore used for | |
485 | ; not-bitexact output | |
486 | %macro AVG_APPROX_PIXELS8_XY2 0 | |
487 | cglobal avg_approx_pixels8_xy2, 4,5 | |
488 | mova m6, [pb_1] | |
489 | lea r4, [r2*2] | |
490 | mova m0, [r1] | |
491 | PAVGB m0, [r1+1] | |
492 | .loop: | |
493 | mova m2, [r1+r4] | |
494 | mova m1, [r1+r2] | |
495 | psubusb m2, m6 | |
496 | PAVGB m1, [r1+r2+1] | |
497 | PAVGB m2, [r1+r4+1] | |
498 | add r1, r4 | |
499 | PAVGB m0, m1 | |
500 | PAVGB m1, m2 | |
501 | PAVGB m0, [r0] | |
502 | PAVGB m1, [r0+r2] | |
503 | mova [r0], m0 | |
504 | mova [r0+r2], m1 | |
505 | mova m1, [r1+r2] | |
506 | mova m0, [r1+r4] | |
507 | PAVGB m1, [r1+r2+1] | |
508 | PAVGB m0, [r1+r4+1] | |
509 | add r0, r4 | |
510 | add r1, r4 | |
511 | PAVGB m2, m1 | |
512 | PAVGB m1, m0 | |
513 | PAVGB m2, [r0] | |
514 | PAVGB m1, [r0+r2] | |
515 | mova [r0], m2 | |
516 | mova [r0+r2], m1 | |
517 | add r0, r4 | |
518 | sub r3d, 4 | |
519 | jne .loop | |
520 | REP_RET | |
521 | %endmacro | |
522 | ||
523 | INIT_MMX mmxext | |
524 | AVG_APPROX_PIXELS8_XY2 | |
525 | INIT_MMX 3dnow | |
526 | AVG_APPROX_PIXELS8_XY2 | |
527 | ||
528 | ||
529 | ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |
530 | %macro SET_PIXELS_XY2 1 | |
531 | %if cpuflag(sse2) | |
532 | cglobal %1_pixels16_xy2, 4,5,8 | |
533 | %else | |
534 | cglobal %1_pixels8_xy2, 4,5 | |
535 | %endif | |
536 | pxor m7, m7 | |
537 | mova m6, [pw_2] | |
538 | movu m0, [r1] | |
539 | movu m4, [r1+1] | |
540 | mova m1, m0 | |
541 | mova m5, m4 | |
542 | punpcklbw m0, m7 | |
543 | punpcklbw m4, m7 | |
544 | punpckhbw m1, m7 | |
545 | punpckhbw m5, m7 | |
546 | paddusw m4, m0 | |
547 | paddusw m5, m1 | |
548 | xor r4, r4 | |
549 | add r1, r2 | |
550 | .loop: | |
551 | movu m0, [r1+r4] | |
552 | movu m2, [r1+r4+1] | |
553 | mova m1, m0 | |
554 | mova m3, m2 | |
555 | punpcklbw m0, m7 | |
556 | punpcklbw m2, m7 | |
557 | punpckhbw m1, m7 | |
558 | punpckhbw m3, m7 | |
559 | paddusw m0, m2 | |
560 | paddusw m1, m3 | |
561 | paddusw m4, m6 | |
562 | paddusw m5, m6 | |
563 | paddusw m4, m0 | |
564 | paddusw m5, m1 | |
565 | psrlw m4, 2 | |
566 | psrlw m5, 2 | |
567 | %ifidn %1, avg | |
568 | mova m3, [r0+r4] | |
569 | packuswb m4, m5 | |
570 | PAVGB m4, m3 | |
571 | %else | |
572 | packuswb m4, m5 | |
573 | %endif | |
574 | mova [r0+r4], m4 | |
575 | add r4, r2 | |
576 | ||
577 | movu m2, [r1+r4] | |
578 | movu m4, [r1+r4+1] | |
579 | mova m3, m2 | |
580 | mova m5, m4 | |
581 | punpcklbw m2, m7 | |
582 | punpcklbw m4, m7 | |
583 | punpckhbw m3, m7 | |
584 | punpckhbw m5, m7 | |
585 | paddusw m4, m2 | |
586 | paddusw m5, m3 | |
587 | paddusw m0, m6 | |
588 | paddusw m1, m6 | |
589 | paddusw m0, m4 | |
590 | paddusw m1, m5 | |
591 | psrlw m0, 2 | |
592 | psrlw m1, 2 | |
593 | %ifidn %1, avg | |
594 | mova m3, [r0+r4] | |
595 | packuswb m0, m1 | |
596 | PAVGB m0, m3 | |
597 | %else | |
598 | packuswb m0, m1 | |
599 | %endif | |
600 | mova [r0+r4], m0 | |
601 | add r4, r2 | |
602 | sub r3d, 2 | |
603 | jnz .loop | |
604 | REP_RET | |
605 | %endmacro | |
606 | ||
607 | INIT_MMX mmxext | |
608 | SET_PIXELS_XY2 avg | |
609 | INIT_MMX 3dnow | |
610 | SET_PIXELS_XY2 avg | |
611 | INIT_XMM sse2 | |
612 | SET_PIXELS_XY2 put | |
613 | SET_PIXELS_XY2 avg | |
614 | ||
615 | %macro SSSE3_PIXELS_XY2 1-2 | |
616 | %if %0 == 2 ; sse2 | |
617 | cglobal %1_pixels16_xy2, 4,5,%2 | |
618 | mova m4, [pb_interleave16] | |
619 | %else | |
620 | cglobal %1_pixels8_xy2, 4,5 | |
621 | mova m4, [pb_interleave8] | |
622 | %endif | |
623 | mova m5, [pb_1] | |
624 | movu m0, [r1] | |
625 | movu m1, [r1+1] | |
626 | pmaddubsw m0, m5 | |
627 | pmaddubsw m1, m5 | |
628 | xor r4, r4 | |
629 | add r1, r2 | |
630 | .loop: | |
631 | movu m2, [r1+r4] | |
632 | movu m3, [r1+r4+1] | |
633 | pmaddubsw m2, m5 | |
634 | pmaddubsw m3, m5 | |
635 | paddusw m0, m2 | |
636 | paddusw m1, m3 | |
637 | pmulhrsw m0, [pw_8192] | |
638 | pmulhrsw m1, [pw_8192] | |
639 | %ifidn %1, avg | |
640 | mova m6, [r0+r4] | |
641 | packuswb m0, m1 | |
642 | pshufb m0, m4 | |
643 | pavgb m0, m6 | |
644 | %else | |
645 | packuswb m0, m1 | |
646 | pshufb m0, m4 | |
647 | %endif | |
648 | mova [r0+r4], m0 | |
649 | add r4, r2 | |
650 | ||
651 | movu m0, [r1+r4] | |
652 | movu m1, [r1+r4+1] | |
653 | pmaddubsw m0, m5 | |
654 | pmaddubsw m1, m5 | |
655 | paddusw m2, m0 | |
656 | paddusw m3, m1 | |
657 | pmulhrsw m2, [pw_8192] | |
658 | pmulhrsw m3, [pw_8192] | |
659 | %ifidn %1, avg | |
660 | mova m6, [r0+r4] | |
661 | packuswb m2, m3 | |
662 | pshufb m2, m4 | |
663 | pavgb m2, m6 | |
664 | %else | |
665 | packuswb m2, m3 | |
666 | pshufb m2, m4 | |
667 | %endif | |
668 | mova [r0+r4], m2 | |
669 | add r4, r2 | |
670 | sub r3d, 2 | |
671 | jnz .loop | |
672 | REP_RET | |
673 | %endmacro | |
674 | ||
675 | INIT_MMX ssse3 | |
676 | SSSE3_PIXELS_XY2 put | |
677 | SSSE3_PIXELS_XY2 avg | |
678 | INIT_XMM ssse3 | |
679 | SSSE3_PIXELS_XY2 put, 6 | |
680 | SSSE3_PIXELS_XY2 avg, 7 |