;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/
23 | ||
%include "x86inc.asm"                   ; x264-style ABI helpers (cglobal, INIT_XMM, RET)
%include "x86util.asm"                  ; SIMD utility macros (CLIPW2, ...)

SECTION_RODATA 32

SECTION .text

; per-bit-depth maximum pixel value, broadcast as words; defined in another
; object file (used as the upper clip bound in the HIGH_BIT_DEPTH paths)
cextern pw_pixel_max
;-----------------------------------------------------------------------------
; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0,
;                       int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; dest = clip(src0 + src1) over a 4x4 block.
; r0=dest r1=destride r2=src0 r3=src1 r4=srcStride0 r5=srcStride1
; (cglobal arg name "scr1" is a historical typo for src1)
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova      m1, [pw_pixel_max]        ; m1 = clip ceiling
    pxor      m0, m0                    ; m0 = 0 = clip floor
    add       r4, r4                    ; element size is 2 bytes: strides -> byte strides
    add       r5, r5
    add       r1, r1
    movh      m2, [r2]                  ; src0 row 0 (4 words) in low half...
    movhps    m2, [r2 + r4]             ; ...row 1 in high half
    movh      m3, [r3]                  ; src1 rows 0/1 likewise
    movhps    m3, [r3 + r5]
    lea       r2, [r2 + r4 * 2]         ; advance both sources two rows
    lea       r3, [r3 + r5 * 2]
    movh      m4, [r2]                  ; src0 rows 2/3
    movhps    m4, [r2 + r4]
    movh      m5, [r3]                  ; src1 rows 2/3
    movhps    m5, [r3 + r5]

    paddw     m2, m3                    ; word-wise sums
    paddw     m4, m5
    CLIPW2    m2, m4, m0, m1            ; clamp both to [0, pw_pixel_max]

    movh      [r0], m2                  ; store rows 0..3
    movhps    [r0 + r1], m2
    lea       r0, [r0 + r1 * 2]
    movh      [r0], m4
    movhps    [r0 + r1], m4

    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    add       r5, r5                    ; src1 is int16_t: stride -> byte stride
    pmovzxbw  m0, [r2]                  ; src0 row 0: u8 -> u16 (needs SSE4.1)
    pmovzxbw  m2, [r2 + r4]             ; src0 row 1
    movh      m1, [r3]                  ; src1 row 0 (4 words)
    movh      m3, [r3 + r5]             ; src1 row 1
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]
    pmovzxbw  m4, [r2]                  ; src0 rows 2/3
    pmovzxbw  m6, [r2 + r4]
    movh      m5, [r3]                  ; src1 rows 2/3
    movh      m7, [r3 + r5]

    paddw     m0, m1                    ; add in 16-bit precision
    paddw     m2, m3
    paddw     m4, m5
    paddw     m6, m7
    packuswb  m0, m0                    ; saturate words back to unsigned bytes
    packuswb  m2, m2
    packuswb  m4, m4
    packuswb  m6, m6

    movd      [r0], m0                  ; 4 bytes per row
    movd      [r0 + r1], m2
    lea       r0, [r0 + r1 * 2]
    movd      [r0], m4
    movd      [r0 + r1], m6

    RET
%endif
98 | ||
99 | ||
;-----------------------------------------------------------------------------
; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0,
;                        int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-4 add_ps, looped: %1 = width (fixed at 4 in the symbol name, unused),
; %2 = height; each loop iteration handles 4 rows, r6d counts %2/4 iterations.
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W4_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova      m1, [pw_pixel_max]        ; clip ceiling
    pxor      m0, m0                    ; clip floor
    mov       r6d, %2/4                 ; iteration count (4 rows / iteration)
    add       r4, r4                    ; 16-bit elements: strides -> byte strides
    add       r5, r5
    add       r1, r1
.loop:
    movh      m2, [r2]                  ; src0 rows 0/1 packed low/high
    movhps    m2, [r2 + r4]
    movh      m3, [r3]                  ; src1 rows 0/1
    movhps    m3, [r3 + r5]
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]
    movh      m4, [r2]                  ; src0 rows 2/3
    movhps    m4, [r2 + r4]
    movh      m5, [r3]                  ; src1 rows 2/3
    movhps    m5, [r3 + r5]
    dec       r6d                       ; decrement early; lea/SSE below leave flags intact
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m2, m3
    paddw     m4, m5
    CLIPW2    m2, m4, m0, m1            ; clamp to [0, pw_pixel_max]

    movh      [r0], m2                  ; write 4 rows of dest
    movhps    [r0 + r1], m2
    lea       r0, [r0 + r1 * 2]
    movh      [r0], m4
    movhps    [r0 + r1], m4
    lea       r0, [r0 + r1 * 2]

    jnz       .loop                     ; ZF from the dec above
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov       r6d, %2/4                 ; 4 rows per iteration
    add       r5, r5                    ; int16_t src1: stride -> byte stride
.loop:
    pmovzxbw  m0, [r2]                  ; src0 rows 0/1: u8 -> u16
    pmovzxbw  m2, [r2 + r4]
    movh      m1, [r3]                  ; src1 rows 0/1 (4 words each)
    movh      m3, [r3 + r5]
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]
    pmovzxbw  m4, [r2]                  ; rows 2/3
    pmovzxbw  m6, [r2 + r4]
    movh      m5, [r3]
    movh      m7, [r3 + r5]
    dec       r6d                       ; flags preserved through to jnz
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m1                    ; 16-bit adds
    paddw     m2, m3
    paddw     m4, m5
    paddw     m6, m7
    packuswb  m0, m0                    ; unsigned-saturate back to bytes
    packuswb  m2, m2
    packuswb  m4, m4
    packuswb  m6, m6

    movd      [r0], m0                  ; 4 bytes per output row
    movd      [r0 + r1], m2
    lea       r0, [r0 + r1 * 2]
    movd      [r0], m4
    movd      [r0 + r1], m6
    lea       r0, [r0 + r1 * 2]

    jnz       .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W4_H4 4, 8
183 | ||
184 | ||
;-----------------------------------------------------------------------------
; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0,
;                        int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-8 add_ps: one full row fits an XMM register of words.
; %2 = height; 4 rows per loop iteration (r6d = %2/4).
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W8_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova      m5, [pw_pixel_max]        ; clip ceiling
    pxor      m4, m4                    ; clip floor
    mov       r6d, %2/4
    add       r4, r4                    ; 16-bit elements: strides -> byte strides
    add       r5, r5
    add       r1, r1
.loop:
    movu      m0, [r2]                  ; rows 0/1 of src0 and src1
    movu      m2, [r2 + r4]
    movu      m1, [r3]
    movu      m3, [r3 + r5]
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5            ; clamp to [0, pw_pixel_max]

    movu      [r0], m0
    movu      [r0 + r1], m2

    movu      m0, [r2]                  ; rows 2/3
    movu      m2, [r2 + r4]
    movu      m1, [r3]
    movu      m3, [r3 + r5]
    dec       r6d                       ; flags preserved by lea/SSE until jnz
    lea       r0, [r0 + r1 * 2]
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0], m0
    movu      [r0 + r1], m2
    lea       r0, [r0 + r1 * 2]

    jnz       .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov       r6d, %2/4
    add       r5, r5                    ; int16_t src1: stride -> byte stride
.loop:
    pmovzxbw  m0, [r2]                  ; src0 rows 0/1: u8 -> u16 (8 pixels/row)
    pmovzxbw  m2, [r2 + r4]
    movu      m1, [r3]                  ; src1 rows 0/1 (8 words each)
    movu      m3, [r3 + r5]
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]
    pmovzxbw  m4, [r2]                  ; rows 2/3
    pmovzxbw  m6, [r2 + r4]
    movu      m5, [r3]
    movu      m7, [r3 + r5]
    dec       r6d
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m1
    paddw     m2, m3
    paddw     m4, m5
    paddw     m6, m7
    packuswb  m0, m0                    ; saturate to bytes; low 8 bytes valid
    packuswb  m2, m2
    packuswb  m4, m4
    packuswb  m6, m6

    movh      [r0], m0                  ; 8 bytes per output row
    movh      [r0 + r1], m2
    lea       r0, [r0 + r1 * 2]
    movh      [r0], m4
    movh      [r0 + r1], m6
    lea       r0, [r0 + r1 * 2]

    jnz       .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W8_H4 8, 8
PIXEL_ADD_PS_W8_H4 8, 16
275 | ||
276 | ||
;-----------------------------------------------------------------------------
; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0,
;                         int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-16 add_ps. %2 = height; 4 rows per loop iteration (r6d = %2/4).
; HBD rows are 32 bytes (two XMM loads at +0/+16 per source).
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W16_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova      m5, [pw_pixel_max]        ; clip ceiling
    pxor      m4, m4                    ; clip floor
    mov       r6d, %2/4
    add       r4, r4                    ; 16-bit elements: strides -> byte strides
    add       r5, r5
    add       r1, r1
.loop:
    movu      m0, [r2]                  ; row 0, both 16-pixel halves
    movu      m2, [r2 + 16]
    movu      m1, [r3]
    movu      m3, [r3 + 16]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0], m0
    movu      [r0 + 16], m2

    movu      m0, [r2 + r4]             ; row 1
    movu      m2, [r2 + r4 + 16]
    movu      m1, [r3 + r5]
    movu      m3, [r3 + r5 + 16]
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + r1], m0
    movu      [r0 + r1 + 16], m2

    movu      m0, [r2]                  ; row 2 (sources already advanced)
    movu      m2, [r2 + 16]
    movu      m1, [r3]
    movu      m3, [r3 + 16]
    lea       r0, [r0 + r1 * 2]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0], m0
    movu      [r0 + 16], m2

    movu      m0, [r2 + r4]             ; row 3
    movu      m2, [r2 + r4 + 16]
    movu      m1, [r3 + r5]
    movu      m3, [r3 + r5 + 16]
    dec       r6d                       ; flags preserved through to jnz
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + r1], m0
    movu      [r0 + r1 + 16], m2
    lea       r0, [r0 + r1 * 2]

    jnz       .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov       r6d, %2/4                 ; two row-pairs per iteration = 4 rows
    add       r5, r5                    ; int16_t src1: stride -> byte stride
.loop:
    pmovzxbw  m0, [r2]                  ; row 0: 8+8 pixels widened to words
    pmovzxbw  m1, [r2 + 8]
    pmovzxbw  m4, [r2 + r4]             ; row 1
    pmovzxbw  m5, [r2 + r4 + 8]
    movu      m2, [r3]                  ; src1 row 0 (16 words = 32 bytes)
    movu      m3, [r3 + 16]
    movu      m6, [r3 + r5]             ; src1 row 1
    movu      m7, [r3 + r5 + 16]
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m2
    paddw     m1, m3
    paddw     m4, m6
    paddw     m5, m7
    packuswb  m0, m1                    ; merge halves, saturate to 16 bytes
    packuswb  m4, m5

    movu      [r0], m0
    movu      [r0 + r1], m4

    pmovzxbw  m0, [r2]                  ; rows 2/3, same pattern
    pmovzxbw  m1, [r2 + 8]
    pmovzxbw  m4, [r2 + r4]
    pmovzxbw  m5, [r2 + r4 + 8]
    movu      m2, [r3]
    movu      m3, [r3 + 16]
    movu      m6, [r3 + r5]
    movu      m7, [r3 + r5 + 16]
    dec       r6d
    lea       r0, [r0 + r1 * 2]
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m2
    paddw     m1, m3
    paddw     m4, m6
    paddw     m5, m7
    packuswb  m0, m1
    packuswb  m4, m5

    movu      [r0], m0
    movu      [r0 + r1], m4
    lea       r0, [r0 + r1 * 2]

    jnz       .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W16_H4 16, 16
PIXEL_ADD_PS_W16_H4 16, 32
406 | ||
407 | ||
;-----------------------------------------------------------------------------
; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0,
;                         int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-32 add_ps. %2 = height; 2 rows per loop iteration (r6d = %2/2).
; HBD rows are 64 bytes (four XMM loads at +0/+16/+32/+48 per source).
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W32_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova      m5, [pw_pixel_max]        ; clip ceiling
    pxor      m4, m4                    ; clip floor
    mov       r6d, %2/2
    add       r4, r4                    ; 16-bit elements: strides -> byte strides
    add       r5, r5
    add       r1, r1
.loop:
    movu      m0, [r2]                  ; row 0, pixels 0..15
    movu      m2, [r2 + 16]
    movu      m1, [r3]
    movu      m3, [r3 + 16]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0], m0
    movu      [r0 + 16], m2

    movu      m0, [r2 + 32]             ; row 0, pixels 16..31
    movu      m2, [r2 + 48]
    movu      m1, [r3 + 32]
    movu      m3, [r3 + 48]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + 32], m0
    movu      [r0 + 48], m2

    movu      m0, [r2 + r4]             ; row 1, pixels 0..15
    movu      m2, [r2 + r4 + 16]
    movu      m1, [r3 + r5]
    movu      m3, [r3 + r5 + 16]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + r1], m0
    movu      [r0 + r1 + 16], m2

    movu      m0, [r2 + r4 + 32]        ; row 1, pixels 16..31
    movu      m2, [r2 + r4 + 48]
    movu      m1, [r3 + r5 + 32]
    movu      m3, [r3 + r5 + 48]
    dec       r6d                       ; flags preserved through to jnz
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + r1 + 32], m0
    movu      [r0 + r1 + 48], m2
    lea       r0, [r0 + r1 * 2]

    jnz       .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov       r6d, %2/2                 ; 2 rows per iteration
    add       r5, r5                    ; int16_t src1: stride -> byte stride
.loop:
    pmovzxbw  m0, [r2]                  ; row 0: 4x8 src0 pixels widened to words
    pmovzxbw  m1, [r2 + 8]
    pmovzxbw  m2, [r2 + 16]
    pmovzxbw  m3, [r2 + 24]
    movu      m4, [r3]                  ; src1 row 0: 32 words = 64 bytes
    movu      m5, [r3 + 16]
    movu      m6, [r3 + 32]
    movu      m7, [r3 + 48]

    paddw     m0, m4
    paddw     m1, m5
    paddw     m2, m6
    paddw     m3, m7
    packuswb  m0, m1                    ; saturate back to 2x16 bytes
    packuswb  m2, m3

    movu      [r0], m0
    movu      [r0 + 16], m2

    pmovzxbw  m0, [r2 + r4]             ; row 1, same pattern
    pmovzxbw  m1, [r2 + r4 + 8]
    pmovzxbw  m2, [r2 + r4 + 16]
    pmovzxbw  m3, [r2 + r4 + 24]
    movu      m4, [r3 + r5]
    movu      m5, [r3 + r5 + 16]
    movu      m6, [r3 + r5 + 32]
    movu      m7, [r3 + r5 + 48]
    dec       r6d
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m4
    paddw     m1, m5
    paddw     m2, m6
    paddw     m3, m7
    packuswb  m0, m1
    packuswb  m2, m3

    movu      [r0 + r1], m0
    movu      [r0 + r1 + 16], m2
    lea       r0, [r0 + r1 * 2]

    jnz       .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W32_H2 32, 32
PIXEL_ADD_PS_W32_H2 32, 64
531 | ||
532 | ||
;-----------------------------------------------------------------------------
; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0,
;                         int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; Width-64 add_ps. %2 = height; 2 rows per loop iteration (r6d = %2/2).
; HBD rows are 128 bytes (eight XMM loads at +0..+112 per source).
; 8-bit rows: src0 byte offset k maps to src1 byte offset 2k (words).
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W64_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
    mova      m5, [pw_pixel_max]        ; clip ceiling
    pxor      m4, m4                    ; clip floor
    mov       r6d, %2/2
    add       r4, r4                    ; 16-bit elements: strides -> byte strides
    add       r5, r5
    add       r1, r1
.loop:
    movu      m0, [r2]                  ; row 0, pixels 0..15
    movu      m2, [r2 + 16]
    movu      m1, [r3]
    movu      m3, [r3 + 16]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0], m0
    movu      [r0 + 16], m2

    movu      m0, [r2 + 32]             ; row 0, pixels 16..31
    movu      m2, [r2 + 48]
    movu      m1, [r3 + 32]
    movu      m3, [r3 + 48]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + 32], m0
    movu      [r0 + 48], m2

    movu      m0, [r2 + 64]             ; row 0, pixels 32..47
    movu      m2, [r2 + 80]
    movu      m1, [r3 + 64]
    movu      m3, [r3 + 80]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + 64], m0
    movu      [r0 + 80], m2

    movu      m0, [r2 + 96]             ; row 0, pixels 48..63
    movu      m2, [r2 + 112]
    movu      m1, [r3 + 96]
    movu      m3, [r3 + 112]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + 96], m0
    movu      [r0 + 112], m2

    movu      m0, [r2 + r4]             ; row 1, pixels 0..15
    movu      m2, [r2 + r4 + 16]
    movu      m1, [r3 + r5]
    movu      m3, [r3 + r5 + 16]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + r1], m0
    movu      [r0 + r1 + 16], m2

    movu      m0, [r2 + r4 + 32]        ; row 1, pixels 16..31
    movu      m2, [r2 + r4 + 48]
    movu      m1, [r3 + r5 + 32]
    movu      m3, [r3 + r5 + 48]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + r1 + 32], m0
    movu      [r0 + r1 + 48], m2

    movu      m0, [r2 + r4 + 64]        ; row 1, pixels 32..47
    movu      m2, [r2 + r4 + 80]
    movu      m1, [r3 + r5 + 64]
    movu      m3, [r3 + r5 + 80]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + r1 + 64], m0
    movu      [r0 + r1 + 80], m2

    movu      m0, [r2 + r4 + 96]        ; row 1, pixels 48..63
    movu      m2, [r2 + r4 + 112]
    movu      m1, [r3 + r5 + 96]
    movu      m3, [r3 + r5 + 112]
    dec       r6d                       ; flags preserved through to jnz
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m1
    paddw     m2, m3
    CLIPW2    m0, m2, m4, m5

    movu      [r0 + r1 + 96], m0
    movu      [r0 + r1 + 112], m2
    lea       r0, [r0 + r1 * 2]

    jnz       .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
    mov       r6d, %2/2                 ; 2 rows per iteration
    add       r5, r5                    ; int16_t src1: stride -> byte stride
.loop:
    pmovzxbw  m0, [r2]                  ; row 0, pixels 0..31 of src0 (u8 -> u16)
    pmovzxbw  m1, [r2 + 8]
    pmovzxbw  m2, [r2 + 16]
    pmovzxbw  m3, [r2 + 24]
    movu      m4, [r3]                  ; matching src1 words (byte offsets 0..48)
    movu      m5, [r3 + 16]
    movu      m6, [r3 + 32]
    movu      m7, [r3 + 48]

    paddw     m0, m4
    paddw     m1, m5
    paddw     m2, m6
    paddw     m3, m7
    packuswb  m0, m1                    ; unsigned-saturate back to bytes
    packuswb  m2, m3

    movu      [r0], m0
    movu      [r0 + 16], m2

    pmovzxbw  m0, [r2 + 32]             ; row 0, pixels 32..63
    pmovzxbw  m1, [r2 + 40]
    pmovzxbw  m2, [r2 + 48]
    pmovzxbw  m3, [r2 + 56]
    movu      m4, [r3 + 64]             ; src1 at doubled byte offsets 64..112
    movu      m5, [r3 + 80]
    movu      m6, [r3 + 96]
    movu      m7, [r3 + 112]

    paddw     m0, m4
    paddw     m1, m5
    paddw     m2, m6
    paddw     m3, m7
    packuswb  m0, m1
    packuswb  m2, m3

    movu      [r0 + 32], m0
    movu      [r0 + 48], m2

    pmovzxbw  m0, [r2 + r4]             ; row 1, pixels 0..31
    pmovzxbw  m1, [r2 + r4 + 8]
    pmovzxbw  m2, [r2 + r4 + 16]
    pmovzxbw  m3, [r2 + r4 + 24]
    movu      m4, [r3 + r5]
    movu      m5, [r3 + r5 + 16]
    movu      m6, [r3 + r5 + 32]
    movu      m7, [r3 + r5 + 48]

    paddw     m0, m4
    paddw     m1, m5
    paddw     m2, m6
    paddw     m3, m7
    packuswb  m0, m1
    packuswb  m2, m3

    movu      [r0 + r1], m0
    movu      [r0 + r1 + 16], m2

    pmovzxbw  m0, [r2 + r4 + 32]        ; row 1, pixels 32..63
    pmovzxbw  m1, [r2 + r4 + 40]
    pmovzxbw  m2, [r2 + r4 + 48]
    pmovzxbw  m3, [r2 + r4 + 56]
    movu      m4, [r3 + r5 + 64]
    movu      m5, [r3 + r5 + 80]
    movu      m6, [r3 + r5 + 96]
    movu      m7, [r3 + r5 + 112]
    dec       r6d
    lea       r2, [r2 + r4 * 2]
    lea       r3, [r3 + r5 * 2]

    paddw     m0, m4
    paddw     m1, m5
    paddw     m2, m6
    paddw     m3, m7
    packuswb  m0, m1
    packuswb  m2, m3

    movu      [r0 + r1 + 32], m0
    movu      [r0 + r1 + 48], m2
    lea       r0, [r0 + r1 * 2]

    jnz       .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W64_H2 64, 64