; Imported Upstream version 1.4
; [deb_x265.git] / source / common / x86 / pixeladd8.asm
; (git web-viewer metadata: CommitLineData 72b9787e JB)
;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

SECTION .text

; 16-bit clamp ceiling ((1 << BIT_DEPTH) - 1), defined in const-a.asm
cextern pw_pixel_max
;-----------------------------------------------------------------------------
; void pixel_add_ps_4x4(pixel *dest, intptr_t destride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; dest[x] = clip(src0[x] + src1[x]) over a 4x4 block: add the 16-bit
; residual (src1) to the prediction (src0) and clamp to the pixel range.
; Registers: r0=dest r1=destride r2=src0 r3=src1 r4=srcStride0 r5=srcStride1
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_4x4, 6, 6, 6, dest, destride, src0, src1, srcStride0, srcStride1
    mova    m1, [pw_pixel_max]      ; clamp ceiling
    pxor    m0, m0                  ; clamp floor (zero)
    ; strides are in elements; scale to bytes (2 bytes per sample)
    add     r4, r4
    add     r5, r5
    add     r1, r1
    ; rows 0-1 (4 words each, packed low/high halves of one xmm)
    movh    m2, [r2]
    movhps  m2, [r2 + r4]
    movh    m3, [r3]
    movhps  m3, [r3 + r5]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]
    ; rows 2-3
    movh    m4, [r2]
    movhps  m4, [r2 + r4]
    movh    m5, [r3]
    movhps  m5, [r3 + r5]

    paddw   m2, m3
    paddw   m4, m5
    CLIPW2  m2, m4, m0, m1          ; clamp both registers to [0, pixel_max]

    movh    [r0], m2
    movhps  [r0 + r1], m2
    lea     r0, [r0 + r1 * 2]
    movh    [r0], m4
    movhps  [r0 + r1], m4

    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_4x4, 6, 6, 8, dest, destride, src0, src1, srcStride0, srcStride1
    add     r5, r5                  ; src1 is int16_t: stride -> bytes
    pmovzxbw m0, [r2]               ; widen 8-bit prediction rows to words
    pmovzxbw m2, [r2 + r4]
    movh    m1, [r3]
    movh    m3, [r3 + r5]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]
    pmovzxbw m4, [r2]
    pmovzxbw m6, [r2 + r4]
    movh    m5, [r3]
    movh    m7, [r3 + r5]

    paddw   m0, m1
    paddw   m2, m3
    paddw   m4, m5
    paddw   m6, m7
    ; unsigned-saturating pack: clamps each word to [0,255] and narrows
    packuswb m0, m0
    packuswb m2, m2
    packuswb m4, m4
    packuswb m6, m6

    movd    [r0], m0
    movd    [r0 + r1], m2
    lea     r0, [r0 + r1 * 2]
    movd    [r0], m4
    movd    [r0 + r1], m6

    RET
%endif
;-----------------------------------------------------------------------------
; void pixel_add_ps_4x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; dest = clip(src0 + src1) for a 4-wide block of %2 rows.  The loop body
; handles four rows per iteration, so %2 must be a multiple of 4.
; Registers: r0=dest r1=destride r2=src0 r3=src1 r4=srcStride0 r5=srcStride1
;            r6d = remaining iterations
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W4_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_4x%2, 6, 7, 6, dest, destride, src0, src1, srcStride0, srcStride1
    mova    m1, [pw_pixel_max]      ; clamp ceiling
    pxor    m0, m0                  ; clamp floor (zero)
    mov     r6d, %2/4               ; iteration count, 4 rows each
    ; strides are in elements; scale to bytes (2 bytes per sample)
    add     r4, r4
    add     r5, r5
    add     r1, r1
.loop:
    ; rows 0-1
    movh    m2, [r2]
    movhps  m2, [r2 + r4]
    movh    m3, [r3]
    movhps  m3, [r3 + r5]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]
    ; rows 2-3
    movh    m4, [r2]
    movhps  m4, [r2 + r4]
    movh    m5, [r3]
    movhps  m5, [r3 + r5]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m2, m3
    paddw   m4, m5
    CLIPW2  m2, m4, m0, m1          ; clamp both registers to [0, pixel_max]

    movh    [r0], m2
    movhps  [r0 + r1], m2
    lea     r0, [r0 + r1 * 2]
    movh    [r0], m4
    movhps  [r0 + r1], m4
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_4x%2, 6, 7, 8, dest, destride, src0, src1, srcStride0, srcStride1
    mov     r6d, %2/4               ; iteration count, 4 rows each
    add     r5, r5                  ; src1 is int16_t: stride -> bytes
.loop:
    pmovzxbw m0, [r2]               ; widen 8-bit prediction rows to words
    pmovzxbw m2, [r2 + r4]
    movh    m1, [r3]
    movh    m3, [r3 + r5]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]
    pmovzxbw m4, [r2]
    pmovzxbw m6, [r2 + r4]
    movh    m5, [r3]
    movh    m7, [r3 + r5]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    paddw   m4, m5
    paddw   m6, m7
    ; unsigned-saturating pack: clamps each word to [0,255] and narrows
    packuswb m0, m0
    packuswb m2, m2
    packuswb m4, m4
    packuswb m6, m6

    movd    [r0], m0
    movd    [r0 + r1], m2
    lea     r0, [r0 + r1 * 2]
    movd    [r0], m4
    movd    [r0 + r1], m6
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W4_H4 4, 8


;-----------------------------------------------------------------------------
; void pixel_add_ps_8x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; dest = clip(src0 + src1) for an 8-wide block of %2 rows (one full xmm of
; words per row).  Four rows per loop iteration; %2 must be a multiple of 4.
; Registers: r0=dest r1=destride r2=src0 r3=src1 r4=srcStride0 r5=srcStride1
;            r6d = remaining iterations
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W8_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, src1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp ceiling
    pxor    m4, m4                  ; clamp floor (zero)
    mov     r6d, %2/4               ; iteration count, 4 rows each
    ; strides are in elements; scale to bytes (2 bytes per sample)
    add     r4, r4
    add     r5, r5
    add     r1, r1
.loop:
    ; rows 0-1
    movu    m0, [r2]
    movu    m2, [r2 + r4]
    movu    m1, [r3]
    movu    m3, [r3 + r5]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5          ; clamp both registers to [0, pixel_max]

    movu    [r0], m0
    movu    [r0 + r1], m2

    ; rows 2-3
    movu    m0, [r2]
    movu    m2, [r2 + r4]
    movu    m1, [r3]
    movu    m3, [r3 + r5]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r0, [r0 + r1 * 2]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0], m0
    movu    [r0 + r1], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, src1, srcStride0, srcStride1
    mov     r6d, %2/4               ; iteration count, 4 rows each
    add     r5, r5                  ; src1 is int16_t: stride -> bytes
.loop:
    pmovzxbw m0, [r2]               ; widen 8-bit prediction rows to words
    pmovzxbw m2, [r2 + r4]
    movu    m1, [r3]
    movu    m3, [r3 + r5]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]
    pmovzxbw m4, [r2]
    pmovzxbw m6, [r2 + r4]
    movu    m5, [r3]
    movu    m7, [r3 + r5]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    paddw   m4, m5
    paddw   m6, m7
    ; unsigned-saturating pack: clamps each word to [0,255] and narrows
    packuswb m0, m0
    packuswb m2, m2
    packuswb m4, m4
    packuswb m6, m6

    movh    [r0], m0
    movh    [r0 + r1], m2
    lea     r0, [r0 + r1 * 2]
    movh    [r0], m4
    movh    [r0 + r1], m6
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W8_H4 8, 8
PIXEL_ADD_PS_W8_H4 8, 16


;-----------------------------------------------------------------------------
; void pixel_add_ps_16x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; dest = clip(src0 + src1) for a 16-wide block of %2 rows.  Each row of
; words spans two xmm registers; four rows per iteration (%2 multiple of 4).
; Registers: r0=dest r1=destride r2=src0 r3=src1 r4=srcStride0 r5=srcStride1
;            r6d = remaining iterations
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W16_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_16x%2, 6, 7, 6, dest, destride, src0, src1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp ceiling
    pxor    m4, m4                  ; clamp floor (zero)
    mov     r6d, %2/4               ; iteration count, 4 rows each
    ; strides are in elements; scale to bytes (2 bytes per sample)
    add     r4, r4
    add     r5, r5
    add     r1, r1
.loop:
    ; row 0: two 8-word halves at +0 and +16 bytes
    movu    m0, [r2]
    movu    m2, [r2 + 16]
    movu    m1, [r3]
    movu    m3, [r3 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5          ; clamp both registers to [0, pixel_max]

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 1
    movu    m0, [r2 + r4]
    movu    m2, [r2 + r4 + 16]
    movu    m1, [r3 + r5]
    movu    m3, [r3 + r5 + 16]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2

    ; row 2
    movu    m0, [r2]
    movu    m2, [r2 + 16]
    movu    m1, [r3]
    movu    m3, [r3 + 16]
    lea     r0, [r0 + r1 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 3
    movu    m0, [r2 + r4]
    movu    m2, [r2 + r4 + 16]
    movu    m1, [r3 + r5]
    movu    m3, [r3 + r5 + 16]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, src1, srcStride0, srcStride1
    mov     r6d, %2/4               ; iteration count, 4 rows each
    add     r5, r5                  ; src1 is int16_t: stride -> bytes
.loop:
    ; rows 0-1: widen 8-bit prediction (8 pixels per pmovzxbw)
    pmovzxbw m0, [r2]
    pmovzxbw m1, [r2 + 8]
    pmovzxbw m4, [r2 + r4]
    pmovzxbw m5, [r2 + r4 + 8]
    movu    m2, [r3]
    movu    m3, [r3 + 16]
    movu    m6, [r3 + r5]
    movu    m7, [r3 + r5 + 16]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m2
    paddw   m1, m3
    paddw   m4, m6
    paddw   m5, m7
    ; unsigned-saturating pack: 16 words -> 16 clamped bytes per row
    packuswb m0, m1
    packuswb m4, m5

    movu    [r0], m0
    movu    [r0 + r1], m4

    ; rows 2-3
    pmovzxbw m0, [r2]
    pmovzxbw m1, [r2 + 8]
    pmovzxbw m4, [r2 + r4]
    pmovzxbw m5, [r2 + r4 + 8]
    movu    m2, [r3]
    movu    m3, [r3 + 16]
    movu    m6, [r3 + r5]
    movu    m7, [r3 + r5 + 16]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r0, [r0 + r1 * 2]
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m2
    paddw   m1, m3
    paddw   m4, m6
    paddw   m5, m7
    packuswb m0, m1
    packuswb m4, m5

    movu    [r0], m0
    movu    [r0 + r1], m4
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W16_H4 16, 16
PIXEL_ADD_PS_W16_H4 16, 32


;-----------------------------------------------------------------------------
; void pixel_add_ps_32x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; dest = clip(src0 + src1) for a 32-wide block of %2 rows.  Each row of
; words spans four xmm registers; two rows per iteration (%2 multiple of 2).
; Registers: r0=dest r1=destride r2=src0 r3=src1 r4=srcStride0 r5=srcStride1
;            r6d = remaining iterations
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W32_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_32x%2, 6, 7, 6, dest, destride, src0, src1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp ceiling
    pxor    m4, m4                  ; clamp floor (zero)
    mov     r6d, %2/2               ; iteration count, 2 rows each
    ; strides are in elements; scale to bytes (2 bytes per sample)
    add     r4, r4
    add     r5, r5
    add     r1, r1
.loop:
    ; row 0, samples 0-15
    movu    m0, [r2]
    movu    m2, [r2 + 16]
    movu    m1, [r3]
    movu    m3, [r3 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5          ; clamp both registers to [0, pixel_max]

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 0, samples 16-31
    movu    m0, [r2 + 32]
    movu    m2, [r2 + 48]
    movu    m1, [r3 + 32]
    movu    m3, [r3 + 48]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + 32], m0
    movu    [r0 + 48], m2

    ; row 1, samples 0-15
    movu    m0, [r2 + r4]
    movu    m2, [r2 + r4 + 16]
    movu    m1, [r3 + r5]
    movu    m3, [r3 + r5 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2

    ; row 1, samples 16-31
    movu    m0, [r2 + r4 + 32]
    movu    m2, [r2 + r4 + 48]
    movu    m1, [r3 + r5 + 32]
    movu    m3, [r3 + r5 + 48]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1 + 32], m0
    movu    [r0 + r1 + 48], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, src1, srcStride0, srcStride1
    mov     r6d, %2/2               ; iteration count, 2 rows each
    add     r5, r5                  ; src1 is int16_t: stride -> bytes
.loop:
    ; row 0: widen 32 prediction bytes to words (8 per pmovzxbw)
    pmovzxbw m0, [r2]
    pmovzxbw m1, [r2 + 8]
    pmovzxbw m2, [r2 + 16]
    pmovzxbw m3, [r2 + 24]
    movu    m4, [r3]
    movu    m5, [r3 + 16]
    movu    m6, [r3 + 32]
    movu    m7, [r3 + 48]

    paddw   m0, m4
    paddw   m1, m5
    paddw   m2, m6
    paddw   m3, m7
    ; unsigned-saturating pack: 32 words -> 32 clamped bytes
    packuswb m0, m1
    packuswb m2, m3

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 1
    pmovzxbw m0, [r2 + r4]
    pmovzxbw m1, [r2 + r4 + 8]
    pmovzxbw m2, [r2 + r4 + 16]
    pmovzxbw m3, [r2 + r4 + 24]
    movu    m4, [r3 + r5]
    movu    m5, [r3 + r5 + 16]
    movu    m6, [r3 + r5 + 32]
    movu    m7, [r3 + r5 + 48]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m4
    paddw   m1, m5
    paddw   m2, m6
    paddw   m3, m7
    packuswb m0, m1
    packuswb m2, m3

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W32_H2 32, 32
PIXEL_ADD_PS_W32_H2 32, 64


;-----------------------------------------------------------------------------
; void pixel_add_ps_64x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *src1, intptr_t srcStride0, intptr_t srcStride1)
;
; dest = clip(src0 + src1) for a 64-wide block of %2 rows.  Each row of
; words spans eight xmm registers; two rows per iteration (%2 multiple of 2).
; Registers: r0=dest r1=destride r2=src0 r3=src1 r4=srcStride0 r5=srcStride1
;            r6d = remaining iterations
;-----------------------------------------------------------------------------
%macro PIXEL_ADD_PS_W64_H2 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
cglobal pixel_add_ps_64x%2, 6, 7, 6, dest, destride, src0, src1, srcStride0, srcStride1
    mova    m5, [pw_pixel_max]      ; clamp ceiling
    pxor    m4, m4                  ; clamp floor (zero)
    mov     r6d, %2/2               ; iteration count, 2 rows each
    ; strides are in elements; scale to bytes (2 bytes per sample)
    add     r4, r4
    add     r5, r5
    add     r1, r1
.loop:
    ; row 0, samples 0-15
    movu    m0, [r2]
    movu    m2, [r2 + 16]
    movu    m1, [r3]
    movu    m3, [r3 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5          ; clamp both registers to [0, pixel_max]

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 0, samples 16-31
    movu    m0, [r2 + 32]
    movu    m2, [r2 + 48]
    movu    m1, [r3 + 32]
    movu    m3, [r3 + 48]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + 32], m0
    movu    [r0 + 48], m2

    ; row 0, samples 32-47
    movu    m0, [r2 + 64]
    movu    m2, [r2 + 80]
    movu    m1, [r3 + 64]
    movu    m3, [r3 + 80]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + 64], m0
    movu    [r0 + 80], m2

    ; row 0, samples 48-63
    movu    m0, [r2 + 96]
    movu    m2, [r2 + 112]
    movu    m1, [r3 + 96]
    movu    m3, [r3 + 112]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + 96], m0
    movu    [r0 + 112], m2

    ; row 1, samples 0-15
    movu    m0, [r2 + r4]
    movu    m2, [r2 + r4 + 16]
    movu    m1, [r3 + r5]
    movu    m3, [r3 + r5 + 16]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2

    ; row 1, samples 16-31
    movu    m0, [r2 + r4 + 32]
    movu    m2, [r2 + r4 + 48]
    movu    m1, [r3 + r5 + 32]
    movu    m3, [r3 + r5 + 48]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1 + 32], m0
    movu    [r0 + r1 + 48], m2

    ; row 1, samples 32-47
    movu    m0, [r2 + r4 + 64]
    movu    m2, [r2 + r4 + 80]
    movu    m1, [r3 + r5 + 64]
    movu    m3, [r3 + r5 + 80]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1 + 64], m0
    movu    [r0 + r1 + 80], m2

    ; row 1, samples 48-63
    movu    m0, [r2 + r4 + 96]
    movu    m2, [r2 + r4 + 112]
    movu    m1, [r3 + r5 + 96]
    movu    m3, [r3 + r5 + 112]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m1
    paddw   m2, m3
    CLIPW2  m0, m2, m4, m5

    movu    [r0 + r1 + 96], m0
    movu    [r0 + r1 + 112], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, src1, srcStride0, srcStride1
    mov     r6d, %2/2               ; iteration count, 2 rows each
    add     r5, r5                  ; src1 is int16_t: stride -> bytes
.loop:
    ; row 0, pixels 0-31: widen 8-bit prediction to words
    pmovzxbw m0, [r2]
    pmovzxbw m1, [r2 + 8]
    pmovzxbw m2, [r2 + 16]
    pmovzxbw m3, [r2 + 24]
    movu    m4, [r3]
    movu    m5, [r3 + 16]
    movu    m6, [r3 + 32]
    movu    m7, [r3 + 48]

    paddw   m0, m4
    paddw   m1, m5
    paddw   m2, m6
    paddw   m3, m7
    ; unsigned-saturating pack: 32 words -> 32 clamped bytes
    packuswb m0, m1
    packuswb m2, m3

    movu    [r0], m0
    movu    [r0 + 16], m2

    ; row 0, pixels 32-63 (src1 offsets doubled: 16-bit samples)
    pmovzxbw m0, [r2 + 32]
    pmovzxbw m1, [r2 + 40]
    pmovzxbw m2, [r2 + 48]
    pmovzxbw m3, [r2 + 56]
    movu    m4, [r3 + 64]
    movu    m5, [r3 + 80]
    movu    m6, [r3 + 96]
    movu    m7, [r3 + 112]

    paddw   m0, m4
    paddw   m1, m5
    paddw   m2, m6
    paddw   m3, m7
    packuswb m0, m1
    packuswb m2, m3

    movu    [r0 + 32], m0
    movu    [r0 + 48], m2

    ; row 1, pixels 0-31
    pmovzxbw m0, [r2 + r4]
    pmovzxbw m1, [r2 + r4 + 8]
    pmovzxbw m2, [r2 + r4 + 16]
    pmovzxbw m3, [r2 + r4 + 24]
    movu    m4, [r3 + r5]
    movu    m5, [r3 + r5 + 16]
    movu    m6, [r3 + r5 + 32]
    movu    m7, [r3 + r5 + 48]

    paddw   m0, m4
    paddw   m1, m5
    paddw   m2, m6
    paddw   m3, m7
    packuswb m0, m1
    packuswb m2, m3

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m2

    ; row 1, pixels 32-63
    pmovzxbw m0, [r2 + r4 + 32]
    pmovzxbw m1, [r2 + r4 + 40]
    pmovzxbw m2, [r2 + r4 + 48]
    pmovzxbw m3, [r2 + r4 + 56]
    movu    m4, [r3 + r5 + 64]
    movu    m5, [r3 + r5 + 80]
    movu    m6, [r3 + r5 + 96]
    movu    m7, [r3 + r5 + 112]
    dec     r6d                     ; counted early; lea/mov preserve flags
    lea     r2, [r2 + r4 * 2]
    lea     r3, [r3 + r5 * 2]

    paddw   m0, m4
    paddw   m1, m5
    paddw   m2, m6
    paddw   m3, m7
    packuswb m0, m1
    packuswb m2, m3

    movu    [r0 + r1 + 32], m0
    movu    [r0 + r1 + 48], m2
    lea     r0, [r0 + r1 * 2]

    jnz .loop
    RET
%endif
%endmacro

PIXEL_ADD_PS_W64_H2 64, 64