;*****************************************************************************
;* Copyright (C) 2013 x265 project
;*
;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
;*          Murugan Vairavel <murugan@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

tab_Vm: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0

cextern pb_4
cextern pb_1
cextern pb_16
cextern pb_64
cextern pw_4
cextern pb_8
cextern pb_32
cextern pb_128

SECTION .text
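
; Naming convention used throughout this file (informational comment, not part
; of the original source):
;   blockcopy_pp  copies pixel   -> pixel   (plain byte copy)
;   blockcopy_sp  copies int16_t -> pixel   (packs with unsigned saturation)
;   blockcopy_ps  copies pixel   -> int16_t (zero-extends each byte)
;   blockfill_s   fills an int16_t block with a constant value
; Strides are in units of the element type, so the int16_t side doubles its
; stride once on entry (add r1, r1 / add r3, r3) to convert it to bytes.
; A minimal C-style sketch of the behaviour, assuming 8-bit pixels
; (illustrative only):
;   for (int y = 0; y < H; y++)
;       for (int x = 0; x < W; x++)
;           dest[y * deststride + x] = src[y * srcstride + x];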

;-----------------------------------------------------------------------------
; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x4, 4, 7, 0
    mov r4w, [r2]
    mov r5w, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    mov r6w, [r2]
    mov r3w, [r2 + r3]

    mov [r0], r4w
    mov [r0 + r1], r5w
    lea r0, [r0 + 2 * r1]
    mov [r0], r6w
    mov [r0 + r1], r3w
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x8, 4, 7, 0
    mov r4w, [r2]
    mov r5w, [r2 + r3]
    mov r6w, [r2 + 2 * r3]

    mov [r0], r4w
    mov [r0 + r1], r5w
    mov [r0 + 2 * r1], r6w

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    mov r4w, [r2 + r3]
    mov r5w, [r2 + 2 * r3]

    mov [r0 + r1], r4w
    mov [r0 + 2 * r1], r5w

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    mov r4w, [r2 + r3]
    mov r5w, [r2 + 2 * r3]

    mov [r0 + r1], r4w
    mov [r0 + 2 * r1], r5w

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    mov r4w, [r2 + r3]
    mov [r0 + r1], r4w
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_2x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x16, 4, 7, 0
    mov r6d, 16/2
.loop:
    mov r4w, [r2]
    mov r5w, [r2 + r3]
    dec r6d
    lea r2, [r2 + r3 * 2]
    mov [r0], r4w
    mov [r0 + r1], r5w
    lea r0, [r0 + r1 * 2]
    jnz .loop
    RET


;-----------------------------------------------------------------------------
; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x2, 4, 6, 0
    mov r4d, [r2]
    mov r5d, [r2 + r3]

    mov [r0], r4d
    mov [r0 + r1], r5d
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x4, 4, 4, 4
    movd m0, [r2]
    movd m1, [r2 + r3]
    movd m2, [r2 + 2 * r3]
    lea r3, [r3 + r3 * 2]
    movd m3, [r2 + r3]

    movd [r0], m0
    movd [r0 + r1], m1
    movd [r0 + 2 * r1], m2
    lea r1, [r1 + 2 * r1]
    movd [r0 + r1], m3
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
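; %1 = block width, %2 = block height; each instantiation below emits a
; dedicated function that copies eight rows per loop iteration, so %2 must be
; a multiple of 8 (informational comment).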
%macro BLOCKCOPY_PP_W4_H8 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    mov r4d, %2/8
.loop:
    movd m0, [r2]
    movd m1, [r2 + r3]
    lea r2, [r2 + 2 * r3]
    movd m2, [r2]
    movd m3, [r2 + r3]

    movd [r0], m0
    movd [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movd [r0], m2
    movd [r0 + r1], m3

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    movd m0, [r2]
    movd m1, [r2 + r3]
    lea r2, [r2 + 2 * r3]
    movd m2, [r2]
    movd m3, [r2 + r3]

    movd [r0], m0
    movd [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movd [r0], m2
    movd [r0 + r1], m3

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PP_W4_H8 4, 8
BLOCKCOPY_PP_W4_H8 4, 16

BLOCKCOPY_PP_W4_H8 4, 32

;-----------------------------------------------------------------------------
; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x8, 4, 7, 8

    movd m0, [r2]
    movd m1, [r2 + r3]
    movd m2, [r2 + 2 * r3]
    lea r5, [r2 + 2 * r3]
    movd m3, [r5 + r3]

    movd m4, [r5 + 2 * r3]
    lea r5, [r5 + 2 * r3]
    movd m5, [r5 + r3]
    movd m6, [r5 + 2 * r3]
    lea r5, [r5 + 2 * r3]
    movd m7, [r5 + r3]

    movd [r0], m0
    movd [r0 + r1], m1
    movd [r0 + 2 * r1], m2
    lea r6, [r0 + 2 * r1]
    movd [r6 + r1], m3

    movd [r6 + 2 * r1], m4
    lea r6, [r6 + 2 * r1]
    movd [r6 + r1], m5
    movd [r6 + 2 * r1], m6
    lea r6, [r6 + 2 * r1]
    movd [r6 + r1], m7

    mov r4w, [r2 + 4]
    mov r5w, [r2 + r3 + 4]
    mov r6w, [r2 + 2 * r3 + 4]

    mov [r0 + 4], r4w
    mov [r0 + r1 + 4], r5w
    mov [r0 + 2 * r1 + 4], r6w

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    mov r4w, [r2 + r3 + 4]
    mov r5w, [r2 + 2 * r3 + 4]

    mov [r0 + r1 + 4], r4w
    mov [r0 + 2 * r1 + 4], r5w

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    mov r4w, [r2 + r3 + 4]
    mov r5w, [r2 + 2 * r3 + 4]

    mov [r0 + r1 + 4], r4w
    mov [r0 + 2 * r1 + 4], r5w

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    mov r4w, [r2 + r3 + 4]
    mov [r0 + r1 + 4], r4w
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_6x16(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x16, 4, 7, 2
    mov r6d, 16/2
.loop:
    movd m0, [r2]
    mov r4w, [r2 + 4]
    movd m1, [r2 + r3]
    mov r5w, [r2 + r3 + 4]
    lea r2, [r2 + r3 * 2]
    movd [r0], m0
    mov [r0 + 4], r4w
    movd [r0 + r1], m1
    mov [r0 + r1 + 4], r5w
    lea r0, [r0 + r1 * 2]
    dec r6d
    jnz .loop
    RET


;-----------------------------------------------------------------------------
; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x2, 4, 4, 2
    movh m0, [r2]
    movh m1, [r2 + r3]

    movh [r0], m0
    movh [r0 + r1], m1
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x4, 4, 4, 4
    movh m0, [r2]
    movh m1, [r2 + r3]
    movh m2, [r2 + 2 * r3]
    lea r3, [r3 + r3 * 2]
    movh m3, [r2 + r3]

    movh [r0], m0
    movh [r0 + r1], m1
    movh [r0 + 2 * r1], m2
    lea r1, [r1 + 2 * r1]
    movh [r0 + r1], m3
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x6, 4, 7, 6
    movh m0, [r2]
    movh m1, [r2 + r3]
    movh m2, [r2 + 2 * r3]
    lea r5, [r2 + 2 * r3]
    movh m3, [r5 + r3]
    movh m4, [r5 + 2 * r3]
    lea r5, [r5 + 2 * r3]
    movh m5, [r5 + r3]

    movh [r0], m0
    movh [r0 + r1], m1
    movh [r0 + 2 * r1], m2
    lea r6, [r0 + 2 * r1]
    movh [r6 + r1], m3
    movh [r6 + 2 * r1], m4
    lea r6, [r6 + 2 * r1]
    movh [r6 + r1], m5
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_8x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x12, 4, 5, 2
    mov r4d, 12/2
.loop:
    movh m0, [r2]
    movh m1, [r2 + r3]
    movh [r0], m0
    movh [r0 + r1], m1
    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W8_H8 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    mov r4d, %2/8

.loop:
    movh m0, [r2]
    movh m1, [r2 + r3]
    lea r2, [r2 + 2 * r3]
    movh m2, [r2]
    movh m3, [r2 + r3]
    lea r2, [r2 + 2 * r3]
    movh m4, [r2]
    movh m5, [r2 + r3]

    movh [r0], m0
    movh [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movh [r0], m2
    movh [r0 + r1], m3
    lea r0, [r0 + 2 * r1]
    movh [r0], m4
    movh [r0 + r1], m5

    lea r2, [r2 + 2 * r3]
    movh m4, [r2]
    movh m5, [r2 + r3]
    lea r0, [r0 + 2 * r1]
    movh [r0], m4
    movh [r0 + r1], m5

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PP_W8_H8 8, 8
BLOCKCOPY_PP_W8_H8 8, 16
BLOCKCOPY_PP_W8_H8 8, 32

BLOCKCOPY_PP_W8_H8 8, 64

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
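; 12-byte rows are split into an 8-byte movh plus a 4-byte movd at offset 8;
; four rows are copied per loop iteration (informational comment).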
%macro BLOCKCOPY_PP_W12_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    mov r4d, %2/4

.loop:
    movh m0, [r2]
    movd m1, [r2 + 8]
    movh m2, [r2 + r3]
    movd m3, [r2 + r3 + 8]
    lea r2, [r2 + 2 * r3]

    movh [r0], m0
    movd [r0 + 8], m1
    movh [r0 + r1], m2
    movd [r0 + r1 + 8], m3
    lea r0, [r0 + 2 * r1]

    movh m0, [r2]
    movd m1, [r2 + 8]
    movh m2, [r2 + r3]
    movd m3, [r2 + r3 + 8]

    movh [r0], m0
    movd [r0 + 8], m1
    movh [r0 + r1], m2
    movd [r0 + r1 + 8], m3

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PP_W12_H4 12, 16

BLOCKCOPY_PP_W12_H4 12, 32

;-----------------------------------------------------------------------------
; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    mov r4d, %2/4

.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + 2 * r3]
    movu m2, [r2]
    movu m3, [r2 + r3]

    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + r1], m3

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop

    RET
%endmacro

BLOCKCOPY_PP_W16_H4 16, 4
BLOCKCOPY_PP_W16_H4 16, 12

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H8 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    mov r4d, %2/8

.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + 2 * r3]
    movu m2, [r2]
    movu m3, [r2 + r3]
    lea r2, [r2 + 2 * r3]
    movu m4, [r2]
    movu m5, [r2 + r3]
    lea r2, [r2 + 2 * r3]

    movu [r0], m0
    movu [r0 + r1], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + r1], m3
    lea r0, [r0 + 2 * r1]
    movu [r0], m4
    movu [r0 + r1], m5
    lea r0, [r0 + 2 * r1]

    movu m0, [r2]
    movu m1, [r2 + r3]
    movu [r0], m0
    movu [r0 + r1], m1

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PP_W16_H8 16, 8
BLOCKCOPY_PP_W16_H8 16, 16
BLOCKCOPY_PP_W16_H8 16, 32
BLOCKCOPY_PP_W16_H8 16, 64

BLOCKCOPY_PP_W16_H8 16, 24

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
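; 24-byte rows are split into a 16-byte movu plus an 8-byte movh at offset 16
; (informational comment).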
%macro BLOCKCOPY_PP_W24_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    mov r4d, %2/4

.loop:
    movu m0, [r2]
    movh m1, [r2 + 16]
    movu m2, [r2 + r3]
    movh m3, [r2 + r3 + 16]
    lea r2, [r2 + 2 * r3]
    movu m4, [r2]
    movh m5, [r2 + 16]

    movu [r0], m0
    movh [r0 + 16], m1
    movu [r0 + r1], m2
    movh [r0 + r1 + 16], m3
    lea r0, [r0 + 2 * r1]
    movu [r0], m4
    movh [r0 + 16], m5

    movu m0, [r2 + r3]
    movh m1, [r2 + r3 + 16]
    movu [r0 + r1], m0
    movh [r0 + r1 + 16], m1

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PP_W24_H4 24, 32

BLOCKCOPY_PP_W24_H4 24, 64

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    mov r4d, %2/4

.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]
    lea r2, [r2 + 2 * r3]

    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3
    lea r0, [r0 + 2 * r1]

    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]

    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + r1], m2
    movu [r0 + r1 + 16], m3

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PP_W32_H4 32, 8
BLOCKCOPY_PP_W32_H4 32, 16
BLOCKCOPY_PP_W32_H4 32, 24
BLOCKCOPY_PP_W32_H4 32, 32
BLOCKCOPY_PP_W32_H4 32, 64

BLOCKCOPY_PP_W32_H4 32, 48

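; AVX versions: a full 32-pixel row fits in one ymm register, so each row is a
; single 32-byte unaligned load/store pair (informational comment).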
INIT_YMM avx
cglobal blockcopy_pp_32x8, 4, 6, 6
    lea r4, [3 * r1]
    lea r5, [3 * r3]

    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    movu m3, [r2 + r5]
    lea r2, [r2 + 4 * r3]
    movu m4, [r2]
    movu m5, [r2 + r3]

    movu [r0], m0
    movu [r0 + r1], m1
    movu [r0 + 2 * r1], m2
    movu [r0 + r4], m3
    lea r0, [r0 + 4 * r1]
    movu [r0], m4
    movu [r0 + r1], m5

    movu m0, [r2 + 2 * r3]
    movu m1, [r2 + r5]

    movu [r0 + 2 * r1], m0
    movu [r0 + r4], m1
    RET

INIT_YMM avx
cglobal blockcopy_pp_32x16, 4, 6, 6
    lea r4, [3 * r1]
    lea r5, [3 * r3]

    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    movu m3, [r2 + r5]
    lea r2, [r2 + 4 * r3]
    movu m4, [r2]
    movu m5, [r2 + r3]

    movu [r0], m0
    movu [r0 + r1], m1
    movu [r0 + 2 * r1], m2
    movu [r0 + r4], m3
    lea r0, [r0 + 4 * r1]
    movu [r0], m4
    movu [r0 + r1], m5

    movu m0, [r2 + 2 * r3]
    movu m1, [r2 + r5]
    lea r2, [r2 + 4 * r3]
    movu m2, [r2]
    movu m3, [r2 + r3]
    movu m4, [r2 + 2 * r3]
    movu m5, [r2 + r5]

    movu [r0 + 2 * r1], m0
    movu [r0 + r4], m1
    lea r0, [r0 + 4 * r1]
    movu [r0], m2
    movu [r0 + r1], m3
    movu [r0 + 2 * r1], m4
    movu [r0 + r4], m5

    lea r2, [r2 + 4 * r3]
    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    movu m3, [r2 + r5]

    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + r1], m1
    movu [r0 + 2 * r1], m2
    movu [r0 + r4], m3
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_32x24(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_YMM avx
cglobal blockcopy_pp_32x24, 4, 7, 6
    lea r4, [3 * r1]
    lea r5, [3 * r3]
    mov r6d, 24/8

.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    movu m3, [r2 + r5]
    lea r2, [r2 + 4 * r3]
    movu m4, [r2]
    movu m5, [r2 + r3]

    movu [r0], m0
    movu [r0 + r1], m1
    movu [r0 + 2 * r1], m2
    movu [r0 + r4], m3
    lea r0, [r0 + 4 * r1]
    movu [r0], m4
    movu [r0 + r1], m5

    movu m0, [r2 + 2 * r3]
    movu m1, [r2 + r5]

    movu [r0 + 2 * r1], m0
    movu [r0 + r4], m1

    lea r2, [r2 + 4 * r3]
    lea r0, [r0 + 4 * r1]
    dec r6d
    jnz .loop
    RET

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H16_avx 2
INIT_YMM avx
cglobal blockcopy_pp_%1x%2, 4, 7, 6
    lea r4, [3 * r1]
    lea r5, [3 * r3]
    mov r6d, %2/16

.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    movu m3, [r2 + r5]
    lea r2, [r2 + 4 * r3]
    movu m4, [r2]
    movu m5, [r2 + r3]

    movu [r0], m0
    movu [r0 + r1], m1
    movu [r0 + 2 * r1], m2
    movu [r0 + r4], m3
    lea r0, [r0 + 4 * r1]
    movu [r0], m4
    movu [r0 + r1], m5

    movu m0, [r2 + 2 * r3]
    movu m1, [r2 + r5]
    lea r2, [r2 + 4 * r3]
    movu m2, [r2]
    movu m3, [r2 + r3]
    movu m4, [r2 + 2 * r3]
    movu m5, [r2 + r5]

    movu [r0 + 2 * r1], m0
    movu [r0 + r4], m1
    lea r0, [r0 + 4 * r1]
    movu [r0], m2
    movu [r0 + r1], m3
    movu [r0 + 2 * r1], m4
    movu [r0 + r4], m5

    lea r2, [r2 + 4 * r3]
    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    movu m3, [r2 + r5]

    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + r1], m1
    movu [r0 + 2 * r1], m2
    movu [r0 + r4], m3

    lea r2, [r2 + 4 * r3]
    lea r0, [r0 + 4 * r1]
    dec r6d
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PP_W32_H16_avx 32, 32
BLOCKCOPY_PP_W32_H16_avx 32, 48
BLOCKCOPY_PP_W32_H16_avx 32, 64

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W48_H2 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    mov r4d, %2/4

.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + r3]
    movu m4, [r2 + r3 + 16]
    movu m5, [r2 + r3 + 32]
    lea r2, [r2 + 2 * r3]

    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + r1], m3
    movu [r0 + r1 + 16], m4
    movu [r0 + r1 + 32], m5
    lea r0, [r0 + 2 * r1]

    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + r3]
    movu m4, [r2 + r3 + 16]
    movu m5, [r2 + r3 + 32]

    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + r1], m3
    movu [r0 + r1 + 16], m4
    movu [r0 + r1 + 32], m5

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PP_W48_H2 48, 64

;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4 2
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    mov r4d, %2/4

.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu m4, [r2 + r3]
    movu m5, [r2 + r3 + 16]

    movu [r0], m0
    movu [r0 + 16], m1
    movu [r0 + 32], m2
    movu [r0 + 48], m3
    movu [r0 + r1], m4
    movu [r0 + r1 + 16], m5

    movu m0, [r2 + r3 + 32]
    movu m1, [r2 + r3 + 48]
    lea r2, [r2 + 2 * r3]
    movu m2, [r2]
    movu m3, [r2 + 16]
    movu m4, [r2 + 32]
    movu m5, [r2 + 48]

    movu [r0 + r1 + 32], m0
    movu [r0 + r1 + 48], m1
    lea r0, [r0 + 2 * r1]
    movu [r0], m2
    movu [r0 + 16], m3
    movu [r0 + 32], m4
    movu [r0 + 48], m5

    movu m0, [r2 + r3]
    movu m1, [r2 + r3 + 16]
    movu m2, [r2 + r3 + 32]
    movu m3, [r2 + r3 + 48]

    movu [r0 + r1], m0
    movu [r0 + r1 + 16], m1
    movu [r0 + r1 + 32], m2
    movu [r0 + r1 + 48], m3

    dec r4d
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_PP_W64_H4 64, 16
BLOCKCOPY_PP_W64_H4 64, 32
BLOCKCOPY_PP_W64_H4 64, 48
BLOCKCOPY_PP_W64_H4 64, 64

;-----------------------------------------------------------------------------
; void blockcopy_sp_2x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
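; sp routines read int16_t coefficients and pack them down to pixels with
; unsigned saturation (packuswb clamps each value to [0, 255]). srcStride is
; in int16_t units, hence the 'add r3, r3' on entry. A hedged C-style sketch
; of one element, assuming 8-bit pixels (illustrative only):
;   dest[x] = (pixel)(src[x] < 0 ? 0 : (src[x] > 255 ? 255 : src[x]));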
INIT_XMM sse4
cglobal blockcopy_sp_2x4, 4, 5, 2

    add r3, r3

    ;Row 0-1
    movd m0, [r2]
    movd m1, [r2 + r3]
    packuswb m0, m1
    movd r4d, m0
    mov [r0], r4w
    pextrw [r0 + r1], m0, 4

    ;Row 2-3
    movd m0, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movd m1, [r2 + r3]
    packuswb m0, m1
    movd r4d, m0
    mov [r0 + 2 * r1], r4w
    lea r0, [r0 + 2 * r1]
    pextrw [r0 + r1], m0, 4

    RET


;-----------------------------------------------------------------------------
; void blockcopy_sp_2x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x8, 4, 5, 2

    add r3, r3

    ;Row 0-1
    movd m0, [r2]
    movd m1, [r2 + r3]
    packuswb m0, m1
    movd r4d, m0
    mov [r0], r4w
    pextrw [r0 + r1], m0, 4

    ;Row 2-3
    movd m0, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movd m1, [r2 + r3]
    packuswb m0, m1
    movd r4d, m0
    mov [r0 + 2 * r1], r4w
    lea r0, [r0 + 2 * r1]
    pextrw [r0 + r1], m0, 4

    ;Row 4-5
    movd m0, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movd m1, [r2 + r3]
    packuswb m0, m1
    movd r4d, m0
    mov [r0 + 2 * r1], r4w
    lea r0, [r0 + 2 * r1]
    pextrw [r0 + r1], m0, 4

    ;Row 6-7
    movd m0, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movd m1, [r2 + r3]
    packuswb m0, m1
    movd r4d, m0
    mov [r0 + 2 * r1], r4w
    lea r0, [r0 + 2 * r1]
    pextrw [r0 + r1], m0, 4

    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W2_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 7, 2, dest, destStride, src, srcStride
    add r3, r3
    mov r6d, %2/2
.loop:
    movd m0, [r2]
    movd m1, [r2 + r3]
    dec r6d
    lea r2, [r2 + r3 * 2]
    packuswb m0, m0
    packuswb m1, m1
    movd r4d, m0
    movd r5d, m1
    mov [r0], r4w
    mov [r0 + r1], r5w
    lea r0, [r0 + r1 * 2]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SP_W2_H2 2, 4
BLOCKCOPY_SP_W2_H2 2, 8

BLOCKCOPY_SP_W2_H2 2, 16

;-----------------------------------------------------------------------------
; void blockcopy_sp_4x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_4x2, 4, 4, 2, dest, destStride, src, srcStride

    add r3, r3

    movh m0, [r2]
    movh m1, [r2 + r3]

    packuswb m0, m1

    movd [r0], m0
    pshufd m0, m0, 2
    movd [r0 + r1], m0

    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_4x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_4x4, 4, 4, 4, dest, destStride, src, srcStride

    add r3, r3

    movh m0, [r2]
    movh m1, [r2 + r3]
    movh m2, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movh m3, [r2 + r3]

    packuswb m0, m1
    packuswb m2, m3

    movd [r0], m0
    pshufd m0, m0, 2
    movd [r0 + r1], m0
    movd [r0 + 2 * r1], m2
    lea r0, [r0 + 2 * r1]
    pshufd m2, m2, 2
    movd [r0 + r1], m2

    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_4x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_4x8, 4, 4, 8, dest, destStride, src, srcStride

    add r3, r3

    movh m0, [r2]
    movh m1, [r2 + r3]
    movh m2, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movh m3, [r2 + r3]
    movh m4, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movh m5, [r2 + r3]
    movh m6, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movh m7, [r2 + r3]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7

    movd [r0], m0
    pshufd m0, m0, 2
    movd [r0 + r1], m0
    movd [r0 + 2 * r1], m2
    lea r0, [r0 + 2 * r1]
    pshufd m2, m2, 2
    movd [r0 + r1], m2
    movd [r0 + 2 * r1], m4
    lea r0, [r0 + 2 * r1]
    pshufd m4, m4, 2
    movd [r0 + r1], m4
    movd [r0 + 2 * r1], m6
    lea r0, [r0 + 2 * r1]
    pshufd m6, m6, 2
    movd [r0 + r1], m6

    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W4_H8 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride

    mov r4d, %2/8

    add r3, r3

.loop:
    movh m0, [r2]
    movh m1, [r2 + r3]
    movh m2, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movh m3, [r2 + r3]
    movh m4, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movh m5, [r2 + r3]
    movh m6, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movh m7, [r2 + r3]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7

    movd [r0], m0
    pshufd m0, m0, 2
    movd [r0 + r1], m0
    movd [r0 + 2 * r1], m2
    lea r0, [r0 + 2 * r1]
    pshufd m2, m2, 2
    movd [r0 + r1], m2
    movd [r0 + 2 * r1], m4
    lea r0, [r0 + 2 * r1]
    pshufd m4, m4, 2
    movd [r0 + r1], m4
    movd [r0 + 2 * r1], m6
    lea r0, [r0 + 2 * r1]
    pshufd m6, m6, 2
    movd [r0 + r1], m6

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_SP_W4_H8 4, 16

BLOCKCOPY_SP_W4_H8 4, 32

;-----------------------------------------------------------------------------
; void blockcopy_sp_6x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_6x8, 4, 4, 2

    add r3, r3

    movu m0, [r2]
    movu m1, [r2 + r3]
    packuswb m0, m1

    movd [r0], m0
    pextrw [r0 + 4], m0, 2

    movhlps m0, m0
    movd [r0 + r1], m0
    pextrw [r0 + r1 + 4], m0, 2

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    movu m0, [r2]
    movu m1, [r2 + r3]
    packuswb m0, m1

    movd [r0], m0
    pextrw [r0 + 4], m0, 2

    movhlps m0, m0
    movd [r0 + r1], m0
    pextrw [r0 + r1 + 4], m0, 2

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    movu m0, [r2]
    movu m1, [r2 + r3]
    packuswb m0, m1

    movd [r0], m0
    pextrw [r0 + 4], m0, 2

    movhlps m0, m0
    movd [r0 + r1], m0
    pextrw [r0 + r1 + 4], m0, 2

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    movu m0, [r2]
    movu m1, [r2 + r3]
    packuswb m0, m1

    movd [r0], m0
    pextrw [r0 + 4], m0, 2

    movhlps m0, m0
    movd [r0 + r1], m0
    pextrw [r0 + r1 + 4], m0, 2

    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W6_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 7, 4, dest, destStride, src, srcStride
    add r3, r3
    mov r6d, %2/2
.loop:
    movh m0, [r2]
    movd m2, [r2 + 8]
    movh m1, [r2 + r3]
    movd m3, [r2 + r3 + 8]
    dec r6d
    lea r2, [r2 + r3 * 2]
    packuswb m0, m0
    packuswb m2, m2
    packuswb m1, m1
    packuswb m3, m3
    movd r4d, m2
    movd r5d, m3
    movd [r0], m0
    mov [r0 + 4], r4w
    movd [r0 + r1], m1
    mov [r0 + r1 + 4], r5w
    lea r0, [r0 + r1 * 2]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SP_W6_H2 6, 8

BLOCKCOPY_SP_W6_H2 6, 16

;-----------------------------------------------------------------------------
; void blockcopy_sp_8x2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x2, 4, 4, 2, dest, destStride, src, srcStride

    add r3, r3

    movu m0, [r2]
    movu m1, [r2 + r3]

    packuswb m0, m1

    movlps [r0], m0
    movhps [r0 + r1], m0

    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_8x4(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x4, 4, 4, 4, dest, destStride, src, srcStride

    add r3, r3

    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movu m3, [r2 + r3]

    packuswb m0, m1
    packuswb m2, m3

    movlps [r0], m0
    movhps [r0 + r1], m0
    movlps [r0 + 2 * r1], m2
    lea r0, [r0 + 2 * r1]
    movhps [r0 + r1], m2

    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_8x6(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x6, 4, 4, 6, dest, destStride, src, srcStride

    add r3, r3

    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movu m3, [r2 + r3]
    movu m4, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movu m5, [r2 + r3]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5

    movlps [r0], m0
    movhps [r0 + r1], m0
    movlps [r0 + 2 * r1], m2
    lea r0, [r0 + 2 * r1]
    movhps [r0 + r1], m2
    movlps [r0 + 2 * r1], m4
    lea r0, [r0 + 2 * r1]
    movhps [r0 + r1], m4

    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_8x8(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x8, 4, 4, 8, dest, destStride, src, srcStride

    add r3, r3

    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movu m3, [r2 + r3]
    movu m4, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movu m5, [r2 + r3]
    movu m6, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movu m7, [r2 + r3]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7

    movlps [r0], m0
    movhps [r0 + r1], m0
    movlps [r0 + 2 * r1], m2
    lea r0, [r0 + 2 * r1]
    movhps [r0 + r1], m2
    movlps [r0 + 2 * r1], m4
    lea r0, [r0 + 2 * r1]
    movhps [r0 + r1], m4
    movlps [r0 + 2 * r1], m6
    lea r0, [r0 + 2 * r1]
    movhps [r0 + r1], m6

    RET

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H4 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 4, dest, destStride, src, srcStride
    add r3, r3
    mov r4d, %2/4
.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    lea r2, [r2 + r3 * 2]
    movu m2, [r2]
    movu m3, [r2 + r3]
    dec r4d
    lea r2, [r2 + r3 * 2]
    packuswb m0, m1
    packuswb m2, m3
    movlps [r0], m0
    movhps [r0 + r1], m0
    lea r0, [r0 + r1 * 2]
    movlps [r0], m2
    movhps [r0 + r1], m2
    lea r0, [r0 + r1 * 2]
    jnz .loop
    RET
%endmacro

BLOCKCOPY_SP_W8_H4 8, 12

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H8 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride

    mov r4d, %2/8

    add r3, r3

.loop:
    movu m0, [r2]
    movu m1, [r2 + r3]
    movu m2, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movu m3, [r2 + r3]
    movu m4, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movu m5, [r2 + r3]
    movu m6, [r2 + 2 * r3]
    lea r2, [r2 + 2 * r3]
    movu m7, [r2 + r3]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7

    movlps [r0], m0
    movhps [r0 + r1], m0
    movlps [r0 + 2 * r1], m2
    lea r0, [r0 + 2 * r1]
    movhps [r0 + r1], m2
    movlps [r0 + 2 * r1], m4
    lea r0, [r0 + 2 * r1]
    movhps [r0 + r1], m4
    movlps [r0 + 2 * r1], m6
    lea r0, [r0 + 2 * r1]
    movhps [r0 + r1], m6

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_SP_W8_H8 8, 16
BLOCKCOPY_SP_W8_H8 8, 32

BLOCKCOPY_SP_W8_H8 8, 64

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W12_H4 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride

    mov r4d, %2/4

    add r3, r3

.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]
    movu m4, [r2 + 2 * r3]
    movu m5, [r2 + 2 * r3 + 16]
    lea r2, [r2 + 2 * r3]
    movu m6, [r2 + r3]
    movu m7, [r2 + r3 + 16]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7

    movh [r0], m0
    pshufd m0, m0, 2
    movd [r0 + 8], m0

    movh [r0 + r1], m2
    pshufd m2, m2, 2
    movd [r0 + r1 + 8], m2

    movh [r0 + 2 * r1], m4
    pshufd m4, m4, 2
    movd [r0 + 2 * r1 + 8], m4

    lea r0, [r0 + 2 * r1]
    movh [r0 + r1], m6
    pshufd m6, m6, 2
    movd [r0 + r1 + 8], m6

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_SP_W12_H4 12, 16

BLOCKCOPY_SP_W12_H4 12, 32

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W16_H4 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride

    mov r4d, %2/4

    add r3, r3

.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + r3]
    movu m3, [r2 + r3 + 16]
    movu m4, [r2 + 2 * r3]
    movu m5, [r2 + 2 * r3 + 16]
    lea r2, [r2 + 2 * r3]
    movu m6, [r2 + r3]
    movu m7, [r2 + r3 + 16]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7

    movu [r0], m0
    movu [r0 + r1], m2
    movu [r0 + 2 * r1], m4
    lea r0, [r0 + 2 * r1]
    movu [r0 + r1], m6

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_SP_W16_H4 16, 4
BLOCKCOPY_SP_W16_H4 16, 8
BLOCKCOPY_SP_W16_H4 16, 12
BLOCKCOPY_SP_W16_H4 16, 16
BLOCKCOPY_SP_W16_H4 16, 32
BLOCKCOPY_SP_W16_H4 16, 64

BLOCKCOPY_SP_W16_H4 16, 24

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W24_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride

    mov r4d, %2/2

    add r3, r3

.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + r3]
    movu m4, [r2 + r3 + 16]
    movu m5, [r2 + r3 + 32]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5

    movu [r0], m0
    movlps [r0 + 16], m2
    movhps [r0 + r1], m2
    movu [r0 + r1 + 8], m4

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_SP_W24_H2 24, 32

BLOCKCOPY_SP_W24_H2 24, 64

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W32_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride

    mov r4d, %2/2

    add r3, r3

.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu m4, [r2 + r3]
    movu m5, [r2 + r3 + 16]
    movu m6, [r2 + r3 + 32]
    movu m7, [r2 + r3 + 48]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7

    movu [r0], m0
    movu [r0 + 16], m2
    movu [r0 + r1], m4
    movu [r0 + r1 + 16], m6

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_SP_W32_H2 32, 8
BLOCKCOPY_SP_W32_H2 32, 16
BLOCKCOPY_SP_W32_H2 32, 24
BLOCKCOPY_SP_W32_H2 32, 32
BLOCKCOPY_SP_W32_H2 32, 64

BLOCKCOPY_SP_W32_H2 32, 48

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W48_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 6, dest, destStride, src, srcStride

    mov r4d, %2

    add r3, r3

.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu m4, [r2 + 64]
    movu m5, [r2 + 80]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5

    movu [r0], m0
    movu [r0 + 16], m2
    movu [r0 + 32], m4

    lea r0, [r0 + r1]
    lea r2, [r2 + r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_SP_W48_H2 48, 64

;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W64_H1 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dest, destStride, src, srcStride

    mov r4d, %2

    add r3, r3

.loop:
    movu m0, [r2]
    movu m1, [r2 + 16]
    movu m2, [r2 + 32]
    movu m3, [r2 + 48]
    movu m4, [r2 + 64]
    movu m5, [r2 + 80]
    movu m6, [r2 + 96]
    movu m7, [r2 + 112]

    packuswb m0, m1
    packuswb m2, m3
    packuswb m4, m5
    packuswb m6, m7

    movu [r0], m0
    movu [r0 + 16], m2
    movu [r0 + 32], m4
    movu [r0 + 48], m6

    lea r0, [r0 + r1]
    lea r2, [r2 + r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_SP_W64_H1 64, 16
BLOCKCOPY_SP_W64_H1 64, 32
BLOCKCOPY_SP_W64_H1 64, 48
BLOCKCOPY_SP_W64_H1 64, 64

;-----------------------------------------------------------------------------
; void blockfill_s_4x4(int16_t *dest, intptr_t destStride, int16_t val)
;-----------------------------------------------------------------------------
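; The 16-bit fill value arrives in r2d; pshuflw+pshufd (SSE2) or vpbroadcastw
; (AVX2) replicate it across every lane before the row stores (informational
; comment).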
INIT_XMM sse2
cglobal blockfill_s_4x4, 3, 3, 1, dest, destStride, val

    add r1, r1

    movd m0, r2d
    pshuflw m0, m0, 0

    movh [r0], m0
    movh [r0 + r1], m0
    movh [r0 + 2 * r1], m0
    lea r0, [r0 + 2 * r1]
    movh [r0 + r1], m0

    RET

;-----------------------------------------------------------------------------
; void blockfill_s_8x8(int16_t *dest, intptr_t destStride, int16_t val)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockfill_s_8x8, 3, 3, 1, dest, destStride, val

    add r1, r1

    movd m0, r2d
    pshuflw m0, m0, 0
    pshufd m0, m0, 0

    movu [r0], m0
    movu [r0 + r1], m0
    movu [r0 + 2 * r1], m0

    lea r0, [r0 + 2 * r1]
    movu [r0 + r1], m0
    movu [r0 + 2 * r1], m0

    lea r0, [r0 + 2 * r1]
    movu [r0 + r1], m0
    movu [r0 + 2 * r1], m0

    lea r0, [r0 + 2 * r1]
    movu [r0 + r1], m0

    RET

;-----------------------------------------------------------------------------
; void blockfill_s_%1x%2(int16_t *dest, intptr_t destStride, int16_t val)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W16_H8 2
INIT_XMM sse2
cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val

    mov r3d, %2/8

    add r1, r1

    movd m0, r2d
    pshuflw m0, m0, 0
    pshufd m0, m0, 0

.loop:
    movu [r0], m0
    movu [r0 + 16], m0

    movu [r0 + r1], m0
    movu [r0 + r1 + 16], m0

    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 16], m0

    lea r4, [r0 + 2 * r1]
    movu [r4 + r1], m0
    movu [r4 + r1 + 16], m0

    movu [r0 + 4 * r1], m0
    movu [r0 + 4 * r1 + 16], m0

    lea r4, [r0 + 4 * r1]
    movu [r4 + r1], m0
    movu [r4 + r1 + 16], m0

    movu [r4 + 2 * r1], m0
    movu [r4 + 2 * r1 + 16], m0

    lea r4, [r4 + 2 * r1]
    movu [r4 + r1], m0
    movu [r4 + r1 + 16], m0

    lea r0, [r0 + 8 * r1]

    dec r3d
    jnz .loop

    RET
%endmacro

BLOCKFILL_S_W16_H8 16, 16

INIT_YMM avx2
cglobal blockfill_s_16x16, 3, 4, 1
    add r1, r1
    lea r3, [3 * r1]
    movd xm0, r2d
    vpbroadcastw m0, xm0

    movu [r0], m0
    movu [r0 + r1], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + r3], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + r1], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + r3], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + r1], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + r3], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + r1], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + r3], m0
    RET

;-----------------------------------------------------------------------------
; void blockfill_s_%1x%2(int16_t *dest, intptr_t destStride, int16_t val)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W32_H4 2
INIT_XMM sse2
cglobal blockfill_s_%1x%2, 3, 5, 1, dest, destStride, val

    mov r3d, %2/4

    add r1, r1

    movd m0, r2d
    pshuflw m0, m0, 0
    pshufd m0, m0, 0

.loop:
    movu [r0], m0
    movu [r0 + 16], m0
    movu [r0 + 32], m0
    movu [r0 + 48], m0

    movu [r0 + r1], m0
    movu [r0 + r1 + 16], m0
    movu [r0 + r1 + 32], m0
    movu [r0 + r1 + 48], m0

    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 16], m0
    movu [r0 + 2 * r1 + 32], m0
    movu [r0 + 2 * r1 + 48], m0

    lea r4, [r0 + 2 * r1]

    movu [r4 + r1], m0
    movu [r4 + r1 + 16], m0
    movu [r4 + r1 + 32], m0
    movu [r4 + r1 + 48], m0

    lea r0, [r0 + 4 * r1]

    dec r3d
    jnz .loop

    RET
%endmacro

BLOCKFILL_S_W32_H4 32, 32

INIT_YMM avx2
cglobal blockfill_s_32x32, 3, 4, 1
    add r1, r1
    lea r3, [3 * r1]
    movd xm0, r2d
    vpbroadcastw m0, xm0

    movu [r0], m0
    movu [r0 + 32], m0
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m0
    movu [r0 + r3], m0
    movu [r0 + r3 + 32], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + 32], m0
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m0
    movu [r0 + r3], m0
    movu [r0 + r3 + 32], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + 32], m0
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m0
    movu [r0 + r3], m0
    movu [r0 + r3 + 32], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + 32], m0
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m0
    movu [r0 + r3], m0
    movu [r0 + r3 + 32], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + 32], m0
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m0
    movu [r0 + r3], m0
    movu [r0 + r3 + 32], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + 32], m0
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m0
    movu [r0 + r3], m0
    movu [r0 + r3 + 32], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + 32], m0
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m0
    movu [r0 + r3], m0
    movu [r0 + r3 + 32], m0
    lea r0, [r0 + 4 * r1]
    movu [r0], m0
    movu [r0 + 32], m0
    movu [r0 + r1], m0
    movu [r0 + r1 + 32], m0
    movu [r0 + 2 * r1], m0
    movu [r0 + 2 * r1 + 32], m0
    movu [r0 + r3], m0
    movu [r0 + r3 + 32], m0
    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_2x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
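; ps routines widen pixels to int16_t with pmovzxbw (zero extension), so
; destStride is in int16_t units and is doubled on entry via 'add r1, r1'
; (informational comment).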
INIT_XMM sse4
cglobal blockcopy_ps_2x4, 4, 4, 1, dest, destStride, src, srcStride

    add r1, r1

    movd m0, [r2]
    pmovzxbw m0, m0
    movd [r0], m0

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movd [r0 + r1], m0

    movd m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movd [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movd [r0 + r1], m0

    RET


;-----------------------------------------------------------------------------
; void blockcopy_ps_2x8(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_2x8, 4, 4, 1, dest, destStride, src, srcStride

    add r1, r1

    movd m0, [r2]
    pmovzxbw m0, m0
    movd [r0], m0

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movd [r0 + r1], m0

    movd m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movd [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movd [r0 + r1], m0

    movd m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movd [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movd [r0 + r1], m0

    movd m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movd [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movd [r0 + r1], m0

    RET


;-----------------------------------------------------------------------------
; void blockcopy_ps_2x16(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_2x16, 4, 5, 2, dest, destStride, src, srcStride
    add r1, r1
    mov r4d, 16/2
.loop:
    movd m0, [r2]
    movd m1, [r2 + r3]
    dec r4d
    lea r2, [r2 + r3 * 2]
    pmovzxbw m0, m0
    pmovzxbw m1, m1
    movd [r0], m0
    movd [r0 + r1], m1
    lea r0, [r0 + r1 * 2]
    jnz .loop
    RET


;-----------------------------------------------------------------------------
; void blockcopy_ps_4x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_4x2, 4, 4, 1, dest, destStride, src, srcStride

    add r1, r1

    movd m0, [r2]
    pmovzxbw m0, m0
    movh [r0], m0

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movh [r0 + r1], m0

    RET


;-----------------------------------------------------------------------------
; void blockcopy_ps_4x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_4x4, 4, 4, 1, dest, destStride, src, srcStride

    add r1, r1

    movd m0, [r2]
    pmovzxbw m0, m0
    movh [r0], m0

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movh [r0 + r1], m0

    movd m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movh [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movh [r0 + r1], m0

    RET


;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W4_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride

    add r1, r1
    mov r4d, %2/4

.loop:
    movd m0, [r2]
    pmovzxbw m0, m0
    movh [r0], m0

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movh [r0 + r1], m0

    movd m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movh [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movd m0, [r2 + r3]
    pmovzxbw m0, m0
    movh [r0 + r1], m0

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_PS_W4_H4 4, 8
BLOCKCOPY_PS_W4_H4 4, 16

BLOCKCOPY_PS_W4_H4 4, 32


;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W6_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride

    add r1, r1
    mov r4d, %2/4

.loop:
    movh m0, [r2]
    pmovzxbw m0, m0
    movh [r0], m0
    pextrd [r0 + 8], m0, 2

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movh [r0 + r1], m0
    pextrd [r0 + r1 + 8], m0, 2

    movh m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movh [r0 + 2 * r1], m0
    pextrd [r0 + 2 * r1 + 8], m0, 2

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movh [r0 + r1], m0
    pextrd [r0 + r1 + 8], m0, 2

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_PS_W6_H4 6, 8

BLOCKCOPY_PS_W6_H4 6, 16

;-----------------------------------------------------------------------------
; void blockcopy_ps_8x2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_8x2, 4, 4, 1, dest, destStride, src, srcStride

    add r1, r1

    movh m0, [r2]
    pmovzxbw m0, m0
    movu [r0], m0

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0

    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_8x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_8x4, 4, 4, 1, dest, destStride, src, srcStride

    add r1, r1

    movh m0, [r2]
    pmovzxbw m0, m0
    movu [r0], m0

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0

    movh m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movu [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0

    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_8x6(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_8x6, 4, 4, 1, dest, destStride, src, srcStride

    add r1, r1

    movh m0, [r2]
    pmovzxbw m0, m0
    movu [r0], m0

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0

    movh m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movu [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0

    movh m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movu [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0

    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W8_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dest, destStride, src, srcStride

    add r1, r1
    mov r4d, %2/4

.loop:
    movh m0, [r2]
    pmovzxbw m0, m0
    movu [r0], m0

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0

    movh m0, [r2 + 2 * r3]
    pmovzxbw m0, m0
    movu [r0 + 2 * r1], m0

    lea r2, [r2 + 2 * r3]
    lea r0, [r0 + 2 * r1]

    movh m0, [r2 + r3]
    pmovzxbw m0, m0
    movu [r0 + r1], m0

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_PS_W8_H4 8, 8
BLOCKCOPY_PS_W8_H4 8, 16
BLOCKCOPY_PS_W8_H4 8, 32

BLOCKCOPY_PS_W8_H4 8, 12
BLOCKCOPY_PS_W8_H4 8, 64


;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W12_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride

    add r1, r1
    mov r4d, %2/2
    pxor m0, m0

.loop:
    movu m1, [r2]
    pmovzxbw m2, m1
    movu [r0], m2
    punpckhbw m1, m0
    movh [r0 + 16], m1

    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movh [r0 + r1 + 16], m1

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    dec r4d
    jnz .loop

    RET
%endmacro

BLOCKCOPY_PS_W12_H2 12, 16

BLOCKCOPY_PS_W12_H2 12, 32

;-----------------------------------------------------------------------------
; void blockcopy_ps_16x4(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_16x4, 4, 4, 3, dest, destStride, src, srcStride

    add r1, r1
    pxor m0, m0

    movu m1, [r2]
    pmovzxbw m2, m1
    movu [r0], m2
    punpckhbw m1, m0
    movu [r0 + 16], m1

    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1

    movu m1, [r2 + 2 * r3]
    pmovzxbw m2, m1
    movu [r0 + 2 * r1], m2
    punpckhbw m1, m0
    movu [r0 + 2 * r1 + 16], m1

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]

    movu m1, [r2 + r3]
    pmovzxbw m2, m1
    movu [r0 + r1], m2
    punpckhbw m1, m0
    movu [r0 + r1 + 16], m1

    RET

;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W16_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
2444
2445add r1, r1
2446mov r4d, %2/4
2447pxor m0, m0
2448
2449.loop:
2450 movu m1, [r2]
2451 pmovzxbw m2, m1
2452 movu [r0], m2
2453 punpckhbw m1, m0
2454 movu [r0 + 16], m1
2455
2456 movu m1, [r2 + r3]
2457 pmovzxbw m2, m1
2458 movu [r0 + r1], m2
2459 punpckhbw m1, m0
2460 movu [r0 + r1 + 16], m1
2461
2462 movu m1, [r2 + 2 * r3]
2463 pmovzxbw m2, m1
2464 movu [r0 + 2 * r1], m2
2465 punpckhbw m1, m0
2466 movu [r0 + 2 * r1 + 16], m1
2467
2468 lea r0, [r0 + 2 * r1]
2469 lea r2, [r2 + 2 * r3]
2470
2471 movu m1, [r2 + r3]
2472 pmovzxbw m2, m1
2473 movu [r0 + r1], m2
2474 punpckhbw m1, m0
2475 movu [r0 + r1 + 16], m1
2476
2477 lea r0, [r0 + 2 * r1]
2478 lea r2, [r2 + 2 * r3]
2479
2480 dec r4d
2481 jnz .loop
2482
2483RET
2484%endmacro
2485
2486BLOCKCOPY_PS_W16_H4 16, 8
2487BLOCKCOPY_PS_W16_H4 16, 12
2488BLOCKCOPY_PS_W16_H4 16, 16
2489BLOCKCOPY_PS_W16_H4 16, 32
2490BLOCKCOPY_PS_W16_H4 16, 64
2491
2492BLOCKCOPY_PS_W16_H4 16, 24
2493
2494;-----------------------------------------------------------------------------
2495; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
2496;-----------------------------------------------------------------------------
2497%macro BLOCKCOPY_PS_W24_H2 2
2498INIT_XMM sse4
2499cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
2500
2501add r1, r1
2502mov r4d, %2/2
2503pxor m0, m0
2504
2505.loop:
2506 movu m1, [r2]
2507 pmovzxbw m2, m1
2508 movu [r0], m2
2509 punpckhbw m1, m0
2510 movu [r0 + 16], m1
2511
2512 movh m1, [r2 + 16]
2513 pmovzxbw m1, m1
2514 movu [r0 + 32], m1
2515
2516 movu m1, [r2 + r3]
2517 pmovzxbw m2, m1
2518 movu [r0 + r1], m2
2519 punpckhbw m1, m0
2520 movu [r0 + r1 + 16], m1
2521
2522 movh m1, [r2 + r3 + 16]
2523 pmovzxbw m1, m1
2524 movu [r0 + r1 + 32], m1
2525
2526 lea r0, [r0 + 2 * r1]
2527 lea r2, [r2 + 2 * r3]
2528
2529 dec r4d
2530 jnz .loop
2531
2532RET
2533%endmacro
2534
2535BLOCKCOPY_PS_W24_H2 24, 32
2536
2537BLOCKCOPY_PS_W24_H2 24, 64
2538
2539;-----------------------------------------------------------------------------
2540; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
2541;-----------------------------------------------------------------------------
2542%macro BLOCKCOPY_PS_W32_H2 2
2543INIT_XMM sse4
2544cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
2545
2546add r1, r1
2547mov r4d, %2/2
2548pxor m0, m0
2549
2550.loop:
2551 movu m1, [r2]
2552 pmovzxbw m2, m1
2553 movu [r0], m2
2554 punpckhbw m1, m0
2555 movu [r0 + 16], m1
2556
2557 movu m1, [r2 + 16]
2558 pmovzxbw m2, m1
2559 movu [r0 + 32], m2
2560 punpckhbw m1, m0
2561 movu [r0 + 48], m1
2562
2563 movu m1, [r2 + r3]
2564 pmovzxbw m2, m1
2565 movu [r0 + r1], m2
2566 punpckhbw m1, m0
2567 movu [r0 + r1 + 16], m1
2568
2569 movu m1, [r2 + r3 + 16]
2570 pmovzxbw m2, m1
2571 movu [r0 + r1 + 32], m2
2572 punpckhbw m1, m0
2573 movu [r0 + r1 + 48], m1
2574
2575 lea r0, [r0 + 2 * r1]
2576 lea r2, [r2 + 2 * r3]
2577
2578 dec r4d
2579 jnz .loop
2580
2581RET
2582%endmacro
2583
2584BLOCKCOPY_PS_W32_H2 32, 8
2585BLOCKCOPY_PS_W32_H2 32, 16
2586BLOCKCOPY_PS_W32_H2 32, 24
2587BLOCKCOPY_PS_W32_H2 32, 32
2588BLOCKCOPY_PS_W32_H2 32, 64
2589
2590BLOCKCOPY_PS_W32_H2 32, 48
2591
2592;-----------------------------------------------------------------------------
2593; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
2594;-----------------------------------------------------------------------------
2595%macro BLOCKCOPY_PS_W48_H2 2
2596INIT_XMM sse4
2597cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
2598
2599add r1, r1
2600mov r4d, %2/2
2601pxor m0, m0
2602
2603.loop:
2604 movu m1, [r2]
2605 pmovzxbw m2, m1
2606 movu [r0], m2
2607 punpckhbw m1, m0
2608 movu [r0 + 16], m1
2609
2610 movu m1, [r2 + 16]
2611 pmovzxbw m2, m1
2612 movu [r0 + 32], m2
2613 punpckhbw m1, m0
2614 movu [r0 + 48], m1
2615
2616 movu m1, [r2 + 32]
2617 pmovzxbw m2, m1
2618 movu [r0 + 64], m2
2619 punpckhbw m1, m0
2620 movu [r0 + 80], m1
2621
2622 movu m1, [r2 + r3]
2623 pmovzxbw m2, m1
2624 movu [r0 + r1], m2
2625 punpckhbw m1, m0
2626 movu [r0 + r1 + 16], m1
2627
2628 movu m1, [r2 + r3 + 16]
2629 pmovzxbw m2, m1
2630 movu [r0 + r1 + 32], m2
2631 punpckhbw m1, m0
2632 movu [r0 + r1 + 48], m1
2633
2634 movu m1, [r2 + r3 + 32]
2635 pmovzxbw m2, m1
2636 movu [r0 + r1 + 64], m2
2637 punpckhbw m1, m0
2638 movu [r0 + r1 + 80], m1
2639
2640 lea r0, [r0 + 2 * r1]
2641 lea r2, [r2 + 2 * r3]
2642
2643 dec r4d
2644 jnz .loop
2645
2646RET
2647%endmacro
2648
2649BLOCKCOPY_PS_W48_H2 48, 64
2650
2651;-----------------------------------------------------------------------------
2652; void blockcopy_ps_%1x%2(int16_t *dest, intptr_t destStride, pixel *src, intptr_t srcStride);
2653;-----------------------------------------------------------------------------
2654%macro BLOCKCOPY_PS_W64_H2 2
2655INIT_XMM sse4
2656cglobal blockcopy_ps_%1x%2, 4, 5, 3, dest, destStride, src, srcStride
2657
2658add r1, r1
2659mov r4d, %2/2
2660pxor m0, m0
2661
2662.loop:
2663 movu m1, [r2]
2664 pmovzxbw m2, m1
2665 movu [r0], m2
2666 punpckhbw m1, m0
2667 movu [r0 + 16], m1
2668
2669 movu m1, [r2 + 16]
2670 pmovzxbw m2, m1
2671 movu [r0 + 32], m2
2672 punpckhbw m1, m0
2673 movu [r0 + 48], m1
2674
2675 movu m1, [r2 + 32]
2676 pmovzxbw m2, m1
2677 movu [r0 + 64], m2
2678 punpckhbw m1, m0
2679 movu [r0 + 80], m1
2680
2681 movu m1, [r2 + 48]
2682 pmovzxbw m2, m1
2683 movu [r0 + 96], m2
2684 punpckhbw m1, m0
2685 movu [r0 + 112], m1
2686
2687 movu m1, [r2 + r3]
2688 pmovzxbw m2, m1
2689 movu [r0 + r1], m2
2690 punpckhbw m1, m0
2691 movu [r0 + r1 + 16], m1
2692
2693 movu m1, [r2 + r3 + 16]
2694 pmovzxbw m2, m1
2695 movu [r0 + r1 + 32], m2
2696 punpckhbw m1, m0
2697 movu [r0 + r1 + 48], m1
2698
2699 movu m1, [r2 + r3 + 32]
2700 pmovzxbw m2, m1
2701 movu [r0 + r1 + 64], m2
2702 punpckhbw m1, m0
2703 movu [r0 + r1 + 80], m1
2704
2705 movu m1, [r2 + r3 + 48]
2706 pmovzxbw m2, m1
2707 movu [r0 + r1 + 96], m2
2708 punpckhbw m1, m0
2709 movu [r0 + r1 + 112], m1
2710
2711 lea r0, [r0 + 2 * r1]
2712 lea r2, [r2 + 2 * r3]
2713
2714 dec r4d
2715 jnz .loop
2716
2717RET
2718%endmacro
2719
2720BLOCKCOPY_PS_W64_H2 64, 16
2721BLOCKCOPY_PS_W64_H2 64, 32
2722BLOCKCOPY_PS_W64_H2 64, 48
2723BLOCKCOPY_PS_W64_H2 64, 64
2724
2725;-----------------------------------------------------------------------------
2726; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2727;-----------------------------------------------------------------------------
2728INIT_XMM sse2
2729cglobal blockcopy_ss_2x4, 4, 6, 0
2730 add r1, r1
2731 add r3, r3
2732
2733 mov r4d, [r2]
2734 mov r5d, [r2 + r3]
2735 mov [r0], r4d
2736 mov [r0 + r1], r5d
2737
2738 lea r2, [r2 + r3 * 2]
2739 lea r0, [r0 + 2 * r1]
2740
2741 mov r4d, [r2]
2742 mov r5d, [r2 + r3]
2743 mov [r0], r4d
2744 mov [r0 + r1], r5d
2745
2746 RET
2747
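; The blockcopy_ss kernels are plain strided int16_t block copies. Both
; strides are counted in elements, hence the add r1, r1 / add r3, r3 pair
; that converts them to bytes at the top of every kernel. A C sketch
; (illustrative only; bx/by stand for the block size in each kernel name):
;
;     void blockcopy_ss_c(int16_t *dest, intptr_t deststride,
;                         const int16_t *src, intptr_t srcstride)
;     {
;         for (int y = 0; y < by; y++)
;         {
;             for (int x = 0; x < bx; x++)
;                 dest[x] = src[x];
;             dest += deststride;
;             src += srcstride;
;         }
;     }
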
2748;-----------------------------------------------------------------------------
2749; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2750;-----------------------------------------------------------------------------
2751INIT_XMM sse2
2752cglobal blockcopy_ss_2x8, 4, 6, 0
2753 add r1, r1
2754 add r3, r3
2755
2756 mov r4d, [r2]
2757 mov r5d, [r2 + r3]
2758 mov [r0], r4d
2759 mov [r0 + r1], r5d
2760
2761 lea r2, [r2 + r3 * 2]
2762 lea r0, [r0 + 2 * r1]
2763
2764 mov r4d, [r2]
2765 mov r5d, [r2 + r3]
2766 mov [r0], r4d
2767 mov [r0 + r1], r5d
2768
2769 lea r2, [r2 + r3 * 2]
2770 lea r0, [r0 + 2 * r1]
2771
2772 mov r4d, [r2]
2773 mov r5d, [r2 + r3]
2774 mov [r0], r4d
2775 mov [r0 + r1], r5d
2776
2777 lea r2, [r2 + r3 * 2]
2778 lea r0, [r0 + 2 * r1]
2779
2780 mov r4d, [r2]
2781 mov r5d, [r2 + r3]
2782 mov [r0], r4d
2783 mov [r0 + r1], r5d
2784
2785 RET
2786
2787;-----------------------------------------------------------------------------
2788; void blockcopy_ss_2x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2789;-----------------------------------------------------------------------------
2790INIT_XMM sse2
2791cglobal blockcopy_ss_2x16, 4, 7, 0
2792 add r1, r1
2793 add r3, r3
2794 mov r6d, 16/2
2795.loop:
2796 mov r4d, [r2]
2797 mov r5d, [r2 + r3]
2798 dec r6d
2799 lea r2, [r2 + r3 * 2]
2800 mov [r0], r4d
2801 mov [r0 + r1], r5d
2802 lea r0, [r0 + r1 * 2]
2803 jnz .loop
2804 RET
2805
2806
2807;-----------------------------------------------------------------------------
2808; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2809;-----------------------------------------------------------------------------
2810INIT_XMM sse2
2811cglobal blockcopy_ss_4x2, 4, 4, 2
2812 add r1, r1
2813 add r3, r3
2814
2815 movh m0, [r2]
2816 movh m1, [r2 + r3]
2817
2818 movh [r0], m0
2819 movh [r0 + r1], m1
2820
2821 RET
2822
2823;-----------------------------------------------------------------------------
2824; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2825;-----------------------------------------------------------------------------
2826INIT_XMM sse2
2827cglobal blockcopy_ss_4x4, 4, 4, 4
2828 add r1, r1
2829 add r3, r3
2830 movh m0, [r2]
2831 movh m1, [r2 + r3]
2832 lea r2, [r2 + r3 * 2]
2833 movh m2, [r2]
2834 movh m3, [r2 + r3]
2835
2836 movh [r0], m0
2837 movh [r0 + r1], m1
2838 lea r0, [r0 + 2 * r1]
2839 movh [r0], m2
2840 movh [r0 + r1], m3
2841 RET
2842
2843;-----------------------------------------------------------------------------
2844; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2845;-----------------------------------------------------------------------------
2846%macro BLOCKCOPY_SS_W4_H8 2
2847INIT_XMM sse2
2848cglobal blockcopy_ss_%1x%2, 4, 5, 4
2849 mov r4d, %2/8
2850 add r1, r1
2851 add r3, r3
2852.loop:
2853 movh m0, [r2]
2854 movh m1, [r2 + r3]
2855 lea r2, [r2 + r3 * 2]
2856 movh m2, [r2]
2857 movh m3, [r2 + r3]
2858
2859 movh [r0], m0
2860 movh [r0 + r1], m1
2861 lea r0, [r0 + 2 * r1]
2862 movh [r0], m2
2863 movh [r0 + r1], m3
2864
2865 lea r0, [r0 + 2 * r1]
2866 lea r2, [r2 + 2 * r3]
2867 movh m0, [r2]
2868 movh m1, [r2 + r3]
2869 lea r2, [r2 + r3 * 2]
2870 movh m2, [r2]
2871 movh m3, [r2 + r3]
2872
2873 movh [r0], m0
2874 movh [r0 + r1], m1
2875 lea r0, [r0 + 2 * r1]
2876 movh [r0], m2
2877 movh [r0 + r1], m3
2878 lea r0, [r0 + 2 * r1]
2879 lea r2, [r2 + 2 * r3]
2880
2881 dec r4d
2882 jnz .loop
2883 RET
2884%endmacro
2885
2886BLOCKCOPY_SS_W4_H8 4, 8
2887BLOCKCOPY_SS_W4_H8 4, 16
2888
2889BLOCKCOPY_SS_W4_H8 4, 32
2890
2891;-----------------------------------------------------------------------------
2892; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2893;-----------------------------------------------------------------------------
2894INIT_XMM sse2
2895cglobal blockcopy_ss_6x8, 4, 4, 4
2896 add r1, r1
2897 add r3, r3
2898
2899 movu m0, [r2]
2900 movu m1, [r2 + r3]
2901 pshufd m2, m0, 2
2902 pshufd m3, m1, 2
2903 movh [r0], m0
2904 movd [r0 + 8], m2
2905 movh [r0 + r1], m1
2906 movd [r0 + r1 + 8], m3
2907
2908 lea r0, [r0 + 2 * r1]
2909 lea r2, [r2 + 2 * r3]
2910
2911 movu m0, [r2]
2912 movu m1, [r2 + r3]
2913 pshufd m2, m0, 2
2914 pshufd m3, m1, 2
2915 movh [r0], m0
2916 movd [r0 + 8], m2
2917 movh [r0 + r1], m1
2918 movd [r0 + r1 + 8], m3
2919
2920 lea r0, [r0 + 2 * r1]
2921 lea r2, [r2 + 2 * r3]
2922
2923 movu m0, [r2]
2924 movu m1, [r2 + r3]
2925 pshufd m2, m0, 2
2926 pshufd m3, m1, 2
2927 movh [r0], m0
2928 movd [r0 + 8], m2
2929 movh [r0 + r1], m1
2930 movd [r0 + r1 + 8], m3
2931
2932 lea r0, [r0 + 2 * r1]
2933 lea r2, [r2 + 2 * r3]
2934
2935 movu m0, [r2]
2936 movu m1, [r2 + r3]
2937 pshufd m2, m0, 2
2938 pshufd m3, m1, 2
2939 movh [r0], m0
2940 movd [r0 + 8], m2
2941 movh [r0 + r1], m1
2942 movd [r0 + r1 + 8], m3
2943
2944 RET
2945
2946;-----------------------------------------------------------------------------
2947; void blockcopy_ss_6x16(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2948;-----------------------------------------------------------------------------
2949INIT_XMM sse2
2950cglobal blockcopy_ss_6x16, 4, 5, 4
2951 add r1, r1
2952 add r3, r3
2953 mov r4d, 16/2
2954.loop:
2955 movh m0, [r2]
2956 movd m2, [r2 + 8]
2957 movh m1, [r2 + r3]
2958 movd m3, [r2 + r3 + 8]
2959 dec r4d
2960 lea r2, [r2 + r3 * 2]
2961 movh [r0], m0
2962 movd [r0 + 8], m2
2963 movh [r0 + r1], m1
2964 movd [r0 + r1 + 8], m3
2965 lea r0, [r0 + r1 * 2]
2966 jnz .loop
2967 RET
2968
2969
2970;-----------------------------------------------------------------------------
2971; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2972;-----------------------------------------------------------------------------
2973INIT_XMM sse2
2974cglobal blockcopy_ss_8x2, 4, 4, 2
2975 add r1, r1
2976 add r3, r3
2977
2978 movu m0, [r2]
2979 movu m1, [r2 + r3]
2980
2981 movu [r0], m0
2982 movu [r0 + r1], m1
2983
2984 RET
2985
2986;-----------------------------------------------------------------------------
2987; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
2988;-----------------------------------------------------------------------------
2989INIT_XMM sse2
2990cglobal blockcopy_ss_8x4, 4, 4, 4
2991 add r1, r1
2992 add r3, r3
2993
2994 movu m0, [r2]
2995 movu m1, [r2 + r3]
2996 lea r2, [r2 + r3 * 2]
2997 movu m2, [r2]
2998 movu m3, [r2 + r3]
2999
3000 movu [r0], m0
3001 movu [r0 + r1], m1
3002 lea r0, [r0 + 2 * r1]
3003 movu [r0], m2
3004 movu [r0 + r1], m3
3005 RET
3006
3007;-----------------------------------------------------------------------------
3008; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3009;-----------------------------------------------------------------------------
3010INIT_XMM sse2
3011cglobal blockcopy_ss_8x6, 4, 4, 4
3012
3013 add r1, r1
3014 add r3, r3
3015 movu m0, [r2]
3016 movu m1, [r2 + r3]
3017 lea r2, [r2 + r3 * 2]
3018 movu m2, [r2]
3019 movu m3, [r2 + r3]
3020
3021 movu [r0], m0
3022 movu [r0 + r1], m1
3023 lea r0, [r0 + 2 * r1]
3024 movu [r0], m2
3025 movu [r0 + r1], m3
3026
3027 lea r2, [r2 + r3 * 2]
3028 lea r0, [r0 + 2 * r1]
3029
3030 movu m0, [r2]
3031 movu m1, [r2 + r3]
3032 movu [r0], m0
3033 movu [r0 + r1], m1
3034 RET
3035
3036;-----------------------------------------------------------------------------
3037; void blockcopy_ss_8x12(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3038;-----------------------------------------------------------------------------
3039INIT_XMM sse2
3040cglobal blockcopy_ss_8x12, 4, 5, 2
3041 add r1, r1
3042 add r3, r3
3043 mov r4d, 12/2
3044.loop:
3045 movu m0, [r2]
3046 movu m1, [r2 + r3]
3047 lea r2, [r2 + 2 * r3]
3048 dec r4d
3049 movu [r0], m0
3050 movu [r0 + r1], m1
3051 lea r0, [r0 + 2 * r1]
3052 jnz .loop
3053 RET
3054
3055
3056;-----------------------------------------------------------------------------
3057; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3058;-----------------------------------------------------------------------------
3059%macro BLOCKCOPY_SS_W8_H8 2
3060INIT_XMM sse2
3061cglobal blockcopy_ss_%1x%2, 4, 5, 4
3062 mov r4d, %2/8
3063 add r1, r1
3064 add r3, r3
3065.loop:
3066 movu m0, [r2]
3067 movu m1, [r2 + r3]
3068 lea r2, [r2 + r3 * 2]
3069 movu m2, [r2]
3070 movu m3, [r2 + r3]
3071
3072 movu [r0], m0
3073 movu [r0 + r1], m1
3074 lea r0, [r0 + 2 * r1]
3075 movu [r0], m2
3076 movu [r0 + r1], m3
3077
3079 lea r2, [r2 + 2 * r3]
3080 lea r0, [r0 + 2 * r1]
3081
3082 movu m0, [r2]
3083 movu m1, [r2 + r3]
3084 lea r2, [r2 + r3 * 2]
3085 movu m2, [r2]
3086 movu m3, [r2 + r3]
3087
3088 movu [r0], m0
3089 movu [r0 + r1], m1
3090 lea r0, [r0 + 2 * r1]
3091 movu [r0], m2
3092 movu [r0 + r1], m3
3093
3094 dec r4d
3095 lea r0, [r0 + 2 * r1]
3096 lea r2, [r2 + 2 * r3]
3097 jnz .loop
3098 RET
3099%endmacro
3100
3101BLOCKCOPY_SS_W8_H8 8, 8
3102BLOCKCOPY_SS_W8_H8 8, 16
3103BLOCKCOPY_SS_W8_H8 8, 32
3104
3105BLOCKCOPY_SS_W8_H8 8, 64
3106
3107;-----------------------------------------------------------------------------
3108; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3109;-----------------------------------------------------------------------------
3110%macro BLOCKCOPY_SS_W12_H4 2
3111INIT_XMM sse2
3112cglobal blockcopy_ss_%1x%2, 4, 5, 4
3113
3114 mov r4d, %2/4
3115 add r1, r1
3116 add r3, r3
3117.loop:
3118 movu m0, [r2]
3119 movh m1, [r2 + 16]
3120 movu m2, [r2 + r3]
3121 movh m3, [r2 + r3 + 16]
3122 lea r2, [r2 + 2 * r3]
3123
3124 movu [r0], m0
3125 movh [r0 + 16], m1
3126 movu [r0 + r1], m2
3127 movh [r0 + r1 + 16], m3
3128
3129 lea r0, [r0 + 2 * r1]
3130 movu m0, [r2]
3131 movh m1, [r2 + 16]
3132 movu m2, [r2 + r3]
3133 movh m3, [r2 + r3 + 16]
3134
3135 movu [r0], m0
3136 movh [r0 + 16], m1
3137 movu [r0 + r1], m2
3138 movh [r0 + r1 + 16], m3
3139
3140 dec r4d
3141 lea r0, [r0 + 2 * r1]
3142 lea r2, [r2 + 2 * r3]
3143 jnz .loop
3144 RET
3145%endmacro
3146
3147BLOCKCOPY_SS_W12_H4 12, 16
3148
3149BLOCKCOPY_SS_W12_H4 12, 32
3150
3151;-----------------------------------------------------------------------------
3152; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3153;-----------------------------------------------------------------------------
3154%macro BLOCKCOPY_SS_W16_H4 2
3155INIT_XMM sse2
3156cglobal blockcopy_ss_%1x%2, 4, 5, 4
3157 mov r4d, %2/4
3158 add r1, r1
3159 add r3, r3
3160.loop:
3161 movu m0, [r2]
3162 movu m1, [r2 + 16]
3163 movu m2, [r2 + r3]
3164 movu m3, [r2 + r3 + 16]
3165
3166 movu [r0], m0
3167 movu [r0 + 16], m1
3168 movu [r0 + r1], m2
3169 movu [r0 + r1 + 16], m3
3170
3171 lea r2, [r2 + 2 * r3]
3172 lea r0, [r0 + 2 * r1]
3173
3174 movu m0, [r2]
3175 movu m1, [r2 + 16]
3176 movu m2, [r2 + r3]
3177 movu m3, [r2 + r3 + 16]
3178
3179 movu [r0], m0
3180 movu [r0 + 16], m1
3181 movu [r0 + r1], m2
3182 movu [r0 + r1 + 16], m3
3183
3184 dec r4d
3185 lea r0, [r0 + 2 * r1]
3186 lea r2, [r2 + 2 * r3]
3187 jnz .loop
3188 RET
3189%endmacro
3190
3191BLOCKCOPY_SS_W16_H4 16, 4
3192BLOCKCOPY_SS_W16_H4 16, 12
3193
3194;-----------------------------------------------------------------------------
3195; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3196;-----------------------------------------------------------------------------
3197%macro BLOCKCOPY_SS_W16_H4_avx 2
3198INIT_YMM avx
3199cglobal blockcopy_ss_%1x%2, 4, 7, 4
3200 mov r4d, %2/4
3201 add r1, r1
3202 add r3, r3
3203 lea r5, [3 * r3]
3204 lea r6, [3 * r1]
3205.loop:
3206 movu m0, [r2]
3207 movu m1, [r2 + r3]
3208 movu m2, [r2 + 2 * r3]
3209 movu m3, [r2 + r5]
3210
3211 movu [r0], m0
3212 movu [r0 + r1], m1
3213 movu [r0 + 2 * r1], m2
3214 movu [r0 + r6], m3
3215
3216 lea r0, [r0 + 4 * r1]
3217 lea r2, [r2 + 4 * r3]
3218 dec r4d
3219 jnz .loop
3220 RET
3221%endmacro
3222
3223BLOCKCOPY_SS_W16_H4_avx 16, 4
3224BLOCKCOPY_SS_W16_H4_avx 16, 12
3225BLOCKCOPY_SS_W16_H4_avx 16, 8
3226BLOCKCOPY_SS_W16_H4_avx 16, 16
3227BLOCKCOPY_SS_W16_H4_avx 16, 24
3228BLOCKCOPY_SS_W16_H4_avx 16, 32
3229BLOCKCOPY_SS_W16_H4_avx 16, 64
3230
3231;-----------------------------------------------------------------------------
3232; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3233;-----------------------------------------------------------------------------
3234%macro BLOCKCOPY_SS_W16_H8 2
3235INIT_XMM sse2
3236cglobal blockcopy_ss_%1x%2, 4, 5, 4
3237 mov r4d, %2/8
3238 add r1, r1
3239 add r3, r3
3240.loop:
3241 movu m0, [r2]
3242 movu m1, [r2 + 16]
3243 movu m2, [r2 + r3]
3244 movu m3, [r2 + r3 + 16]
3245
3246 movu [r0], m0
3247 movu [r0 + 16], m1
3248 movu [r0 + r1], m2
3249 movu [r0 + r1 + 16], m3
3250
3251 lea r2, [r2 + 2 * r3]
3252 lea r0, [r0 + 2 * r1]
3253
3254 movu m0, [r2]
3255 movu m1, [r2 + 16]
3256 movu m2, [r2 + r3]
3257 movu m3, [r2 + r3 + 16]
3258
3259 movu [r0], m0
3260 movu [r0 + 16], m1
3261 movu [r0 + r1], m2
3262 movu [r0 + r1 + 16], m3
3263
3264 lea r2, [r2 + 2 * r3]
3265 lea r0, [r0 + 2 * r1]
3266
3267 movu m0, [r2]
3268 movu m1, [r2 + 16]
3269 movu m2, [r2 + r3]
3270 movu m3, [r2 + r3 + 16]
3271
3272 movu [r0], m0
3273 movu [r0 + 16], m1
3274 movu [r0 + r1], m2
3275 movu [r0 + r1 + 16], m3
3276
3277 lea r2, [r2 + 2 * r3]
3278 lea r0, [r0 + 2 * r1]
3279
3280 movu m0, [r2]
3281 movu m1, [r2 + 16]
3282 movu m2, [r2 + r3]
3283 movu m3, [r2 + r3 + 16]
3284
3285 movu [r0], m0
3286 movu [r0 + 16], m1
3287 movu [r0 + r1], m2
3288 movu [r0 + r1 + 16], m3
3289
3290 dec r4d
3291 lea r2, [r2 + 2 * r3]
3292 lea r0, [r0 + 2 * r1]
3293 jnz .loop
3294 RET
3295%endmacro
3296
3297BLOCKCOPY_SS_W16_H8 16, 8
3298BLOCKCOPY_SS_W16_H8 16, 16
3299BLOCKCOPY_SS_W16_H8 16, 32
3300BLOCKCOPY_SS_W16_H8 16, 64
3301
3302BLOCKCOPY_SS_W16_H8 16, 24
3303
3304;-----------------------------------------------------------------------------
3305; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3306;-----------------------------------------------------------------------------
3307%macro BLOCKCOPY_SS_W24_H4 2
3308INIT_XMM sse2
3309cglobal blockcopy_ss_%1x%2, 4, 5, 6
3310 mov r4d, %2/4
3311 add r1, r1
3312 add r3, r3
3313.loop:
3314 movu m0, [r2]
3315 movu m1, [r2 + 16]
3316 movu m2, [r2 + 32]
3317 movu m3, [r2 + r3]
3318 movu m4, [r2 + r3 + 16]
3319 movu m5, [r2 + r3 + 32]
3320
3321 movu [r0], m0
3322 movu [r0 + 16], m1
3323 movu [r0 + 32], m2
3324 movu [r0 + r1], m3
3325 movu [r0 + r1 + 16], m4
3326 movu [r0 + r1 + 32], m5
3327
3328 lea r2, [r2 + 2 * r3]
3329 lea r0, [r0 + 2 * r1]
3330
3331 movu m0, [r2]
3332 movu m1, [r2 + 16]
3333 movu m2, [r2 + 32]
3334 movu m3, [r2 + r3]
3335 movu m4, [r2 + r3 + 16]
3336 movu m5, [r2 + r3 + 32]
3337
3338 movu [r0], m0
3339 movu [r0 + 16], m1
3340 movu [r0 + 32], m2
3341 movu [r0 + r1], m3
3342 movu [r0 + r1 + 16], m4
3343 movu [r0 + r1 + 32], m5
3344
3345 dec r4d
3346 lea r2, [r2 + 2 * r3]
3347 lea r0, [r0 + 2 * r1]
3348 jnz .loop
3349 RET
3350%endmacro
3351
3352BLOCKCOPY_SS_W24_H4 24, 32
3353
3354BLOCKCOPY_SS_W24_H4 24, 64
3355
3356;-----------------------------------------------------------------------------
3357; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3358;-----------------------------------------------------------------------------
3359%macro BLOCKCOPY_SS_W32_H4 2
3360INIT_XMM sse2
3361cglobal blockcopy_ss_%1x%2, 4, 5, 4
3362 mov r4d, %2/4
3363 add r1, r1
3364 add r3, r3
3365.loop:
3366 movu m0, [r2]
3367 movu m1, [r2 + 16]
3368 movu m2, [r2 + 32]
3369 movu m3, [r2 + 48]
3370
3371 movu [r0], m0
3372 movu [r0 + 16], m1
3373 movu [r0 + 32], m2
3374 movu [r0 + 48], m3
3375
3376 movu m0, [r2 + r3]
3377 movu m1, [r2 + r3 + 16]
3378 movu m2, [r2 + r3 + 32]
3379 movu m3, [r2 + r3 + 48]
3380
3381 movu [r0 + r1], m0
3382 movu [r0 + r1 + 16], m1
3383 movu [r0 + r1 + 32], m2
3384 movu [r0 + r1 + 48], m3
3385
3386 lea r2, [r2 + 2 * r3]
3387 lea r0, [r0 + 2 * r1]
3388
3389 movu m0, [r2]
3390 movu m1, [r2 + 16]
3391 movu m2, [r2 + 32]
3392 movu m3, [r2 + 48]
3393
3394 movu [r0], m0
3395 movu [r0 + 16], m1
3396 movu [r0 + 32], m2
3397 movu [r0 + 48], m3
3398
3399 movu m0, [r2 + r3]
3400 movu m1, [r2 + r3 + 16]
3401 movu m2, [r2 + r3 + 32]
3402 movu m3, [r2 + r3 + 48]
3403
3404 movu [r0 + r1], m0
3405 movu [r0 + r1 + 16], m1
3406 movu [r0 + r1 + 32], m2
3407 movu [r0 + r1 + 48], m3
3408
3409 dec r4d
3410 lea r2, [r2 + 2 * r3]
3411 lea r0, [r0 + 2 * r1]
3412 jnz .loop
3413 RET
3414%endmacro
3415
3416BLOCKCOPY_SS_W32_H4 32, 8
3417BLOCKCOPY_SS_W32_H4 32, 16
3418BLOCKCOPY_SS_W32_H4 32, 24
3419BLOCKCOPY_SS_W32_H4 32, 32
3420BLOCKCOPY_SS_W32_H4 32, 64
3421
3422BLOCKCOPY_SS_W32_H4 32, 48
3423
3424;-----------------------------------------------------------------------------
3425; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3426;-----------------------------------------------------------------------------
3427%macro BLOCKCOPY_SS_W48_H2 2
3428INIT_XMM sse2
3429cglobal blockcopy_ss_%1x%2, 4, 5, 6
3430 mov r4d, %2/4
3431 add r1, r1
3432 add r3, r3
3433.loop:
3434 movu m0, [r2]
3435 movu m1, [r2 + 16]
3436 movu m2, [r2 + 32]
3437 movu m3, [r2 + 48]
3438 movu m4, [r2 + 64]
3439 movu m5, [r2 + 80]
3440
3441 movu [r0], m0
3442 movu [r0 + 16], m1
3443 movu [r0 + 32], m2
3444 movu [r0 + 48], m3
3445 movu [r0 + 64], m4
3446 movu [r0 + 80], m5
3447
3448 movu m0, [r2 + r3]
3449 movu m1, [r2 + r3 + 16]
3450 movu m2, [r2 + r3 + 32]
3451 movu m3, [r2 + r3 + 48]
3452 movu m4, [r2 + r3 + 64]
3453 movu m5, [r2 + r3 + 80]
3454
3455 movu [r0 + r1], m0
3456 movu [r0 + r1 + 16], m1
3457 movu [r0 + r1 + 32], m2
3458 movu [r0 + r1 + 48], m3
3459 movu [r0 + r1 + 64], m4
3460 movu [r0 + r1 + 80], m5
3461
3462 lea r2, [r2 + 2 * r3]
3463 lea r0, [r0 + 2 * r1]
3464
3465 movu m0, [r2]
3466 movu m1, [r2 + 16]
3467 movu m2, [r2 + 32]
3468 movu m3, [r2 + 48]
3469 movu m4, [r2 + 64]
3470 movu m5, [r2 + 80]
3471
3472 movu [r0], m0
3473 movu [r0 + 16], m1
3474 movu [r0 + 32], m2
3475 movu [r0 + 48], m3
3476 movu [r0 + 64], m4
3477 movu [r0 + 80], m5
3478
3479 movu m0, [r2 + r3]
3480 movu m1, [r2 + r3 + 16]
3481 movu m2, [r2 + r3 + 32]
3482 movu m3, [r2 + r3 + 48]
3483 movu m4, [r2 + r3 + 64]
3484 movu m5, [r2 + r3 + 80]
3485
3486 movu [r0 + r1], m0
3487 movu [r0 + r1 + 16], m1
3488 movu [r0 + r1 + 32], m2
3489 movu [r0 + r1 + 48], m3
3490 movu [r0 + r1 + 64], m4
3491 movu [r0 + r1 + 80], m5
3492
3493 dec r4d
3494 lea r2, [r2 + 2 * r3]
3495 lea r0, [r0 + 2 * r1]
3496 jnz .loop
3497 RET
3498%endmacro
3499
3500BLOCKCOPY_SS_W48_H2 48, 64
3501
3502;-----------------------------------------------------------------------------
3503; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3504;-----------------------------------------------------------------------------
3505%macro BLOCKCOPY_SS_W64_H4 2
3506INIT_XMM sse2
3507cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
3508 mov r4d, %2/4
3509 add r1, r1
3510 add r3, r3
3511.loop:
3512 movu m0, [r2]
3513 movu m1, [r2 + 16]
3514 movu m2, [r2 + 32]
3515 movu m3, [r2 + 48]
3516
3517 movu [r0], m0
3518 movu [r0 + 16], m1
3519 movu [r0 + 32], m2
3520 movu [r0 + 48], m3
3521
3522 movu m0, [r2 + 64]
3523 movu m1, [r2 + 80]
3524 movu m2, [r2 + 96]
3525 movu m3, [r2 + 112]
3526
3527 movu [r0 + 64], m0
3528 movu [r0 + 80], m1
3529 movu [r0 + 96], m2
3530 movu [r0 + 112], m3
3531
3532 movu m0, [r2 + r3]
3533 movu m1, [r2 + r3 + 16]
3534 movu m2, [r2 + r3 + 32]
3535 movu m3, [r2 + r3 + 48]
3536
3537 movu [r0 + r1], m0
3538 movu [r0 + r1 + 16], m1
3539 movu [r0 + r1 + 32], m2
3540 movu [r0 + r1 + 48], m3
3541
3542 movu m0, [r2 + r3 + 64]
3543 movu m1, [r2 + r3 + 80]
3544 movu m2, [r2 + r3 + 96]
3545 movu m3, [r2 + r3 + 112]
3546
3547 movu [r0 + r1 + 64], m0
3548 movu [r0 + r1 + 80], m1
3549 movu [r0 + r1 + 96], m2
3550 movu [r0 + r1 + 112], m3
3551
3552 lea r2, [r2 + 2 * r3]
3553 lea r0, [r0 + 2 * r1]
3554
3555 movu m0, [r2]
3556 movu m1, [r2 + 16]
3557 movu m2, [r2 + 32]
3558 movu m3, [r2 + 48]
3559
3560 movu [r0], m0
3561 movu [r0 + 16], m1
3562 movu [r0 + 32], m2
3563 movu [r0 + 48], m3
3564
3565 movu m0, [r2 + 64]
3566 movu m1, [r2 + 80]
3567 movu m2, [r2 + 96]
3568 movu m3, [r2 + 112]
3569
3570 movu [r0 + 64], m0
3571 movu [r0 + 80], m1
3572 movu [r0 + 96], m2
3573 movu [r0 + 112], m3
3574
3575 movu m0, [r2 + r3]
3576 movu m1, [r2 + r3 + 16]
3577 movu m2, [r2 + r3 + 32]
3578 movu m3, [r2 + r3 + 48]
3579
3580 movu [r0 + r1], m0
3581 movu [r0 + r1 + 16], m1
3582 movu [r0 + r1 + 32], m2
3583 movu [r0 + r1 + 48], m3
3584
3585 movu m0, [r2 + r3 + 64]
3586 movu m1, [r2 + r3 + 80]
3587 movu m2, [r2 + r3 + 96]
3588 movu m3, [r2 + r3 + 112]
3589
3590 movu [r0 + r1 + 64], m0
3591 movu [r0 + r1 + 80], m1
3592 movu [r0 + r1 + 96], m2
3593 movu [r0 + r1 + 112], m3
3594
3595 dec r4d
3596 lea r2, [r2 + 2 * r3]
3597 lea r0, [r0 + 2 * r1]
3598 jnz .loop
3599
3600 RET
3601%endmacro
3602
3603BLOCKCOPY_SS_W64_H4 64, 16
3604BLOCKCOPY_SS_W64_H4 64, 32
3605BLOCKCOPY_SS_W64_H4 64, 48
3606BLOCKCOPY_SS_W64_H4 64, 64
3607
3608;-----------------------------------------------------------------------------
3609; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
3610;-----------------------------------------------------------------------------
3611%macro BLOCKCOPY_SS_W64_H4_avx 2
3612INIT_YMM avx
3613cglobal blockcopy_ss_%1x%2, 4, 7, 4, dest, deststride, src, srcstride
3614 mov r4d, %2/4
3615 add r1, r1
3616 add r3, r3
3617 lea r5, [3 * r1]
3618 lea r6, [3 * r3]
3619.loop:
3620 movu m0, [r2]
3621 movu m1, [r2 + 32]
3622 movu m2, [r2 + 64]
3623 movu m3, [r2 + 96]
3624
3625 movu [r0], m0
3626 movu [r0 + 32], m1
3627 movu [r0 + 64], m2
3628 movu [r0 + 96], m3
3629
3630 movu m0, [r2 + r3]
3631 movu m1, [r2 + r3 + 32]
3632 movu m2, [r2 + r3 + 64]
3633 movu m3, [r2 + r3 + 96]
3634
3635 movu [r0 + r1], m0
3636 movu [r0 + r1 + 32], m1
3637 movu [r0 + r1 + 64], m2
3638 movu [r0 + r1 + 96], m3
3639
3640 movu m0, [r2 + 2 * r3]
3641 movu m1, [r2 + 2 * r3 + 32]
3642 movu m2, [r2 + 2 * r3 + 64]
3643 movu m3, [r2 + 2 * r3 + 96]
3644
3645 movu [r0 + 2 * r1], m0
3646 movu [r0 + 2 * r1 + 32], m1
3647 movu [r0 + 2 * r1 + 64], m2
3648 movu [r0 + 2 * r1 + 96], m3
3649
3650 movu m0, [r2 + r6]
3651 movu m1, [r2 + r6 + 32]
3652 movu m2, [r2 + r6 + 64]
3653 movu m3, [r2 + r6 + 96]
3654 lea r2, [r2 + 4 * r3]
3655
3656 movu [r0 + r5], m0
3657 movu [r0 + r5 + 32], m1
3658 movu [r0 + r5 + 64], m2
3659 movu [r0 + r5 + 96], m3
3660 lea r0, [r0 + 4 * r1]
3661
3662 dec r4d
3663 jnz .loop
3664 RET
3665%endmacro
3666
3667BLOCKCOPY_SS_W64_H4_avx 64, 16
3668BLOCKCOPY_SS_W64_H4_avx 64, 32
3669BLOCKCOPY_SS_W64_H4_avx 64, 48
3670BLOCKCOPY_SS_W64_H4_avx 64, 64
3671
3672;-----------------------------------------------------------------------------
3673; void cvt32to16_shr(int16_t *dst, int32_t *src, intptr_t stride, int shift, int size)
3674;-----------------------------------------------------------------------------
3675INIT_XMM sse2
3676cglobal cvt32to16_shr, 4, 7, 3, dst, src, stride
3677%define rnd m2
3678%define shift m1
3679
3680 ; make shift
3681 mov r5d, r3m
3682 movd shift, r5d
3683
3684 ; make round
3685 dec r5
3686 xor r6, r6
3687 bts r6, r5
3688
3689 movd rnd, r6d
3690 pshufd rnd, rnd, 0
3691
3692 ; register alloc
3693 ; r0 - dst
3694 ; r1 - src
3695 ; r2 - stride * 2 (short*)
3696 ; r3 - lx
3697 ; r4 - size
3698 ; r5 - ly
3699 ; r6 - diff
3700 add r2d, r2d
3701
3702 mov r4d, r4m
3703 mov r5, r4
3704 mov r6, r2
3705 sub r6, r4
3706 add r6, r6
3707
3708 shr r5, 1
3709.loop_row:
3710
3711 mov r3, r4
3712 shr r3, 2
3713.loop_col:
3714 ; row 0
3715 movu m0, [r1]
3716 paddd m0, rnd
3717 psrad m0, shift
3718 packssdw m0, m0
3719 movh [r0], m0
3720
3721 ; row 1
3722 movu m0, [r1 + r4 * 4]
3723 paddd m0, rnd
3724 psrad m0, shift
3725 packssdw m0, m0
3726 movh [r0 + r2], m0
3727
3728 ; move col pointer
3729 add r1, 16
3730 add r0, 8
3731
3732 dec r3
3733 jg .loop_col
3734
3735 ; update pointer
3736 lea r1, [r1 + r4 * 4]
3737 add r0, r6
3738
3739 ; end of loop_row
3740 dec r5
3741 jg .loop_row
3742
3743 RET
3744
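; A C sketch of cvt32to16_shr above (illustrative only): round-shift each
; 32-bit coefficient down to 16 bits. The source block is packed size x size;
; packssdw additionally saturates each result to [-32768, 32767].
;
;     void cvt32to16_shr_c(int16_t *dst, const int32_t *src, intptr_t stride,
;                          int shift, int size)
;     {
;         int32_t round = 1 << (shift - 1);
;         for (int y = 0; y < size; y++)
;         {
;             for (int x = 0; x < size; x++)
;                 dst[x] = (int16_t)((src[x] + round) >> shift);
;             src += size;    /* packed source rows */
;             dst += stride;
;         }
;     }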
3745
3746;--------------------------------------------------------------------------------------
3747; void cvt16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size);
3748;--------------------------------------------------------------------------------------
3749INIT_XMM sse4
3750cglobal cvt16to32_shl, 5, 7, 2, dst, src, stride, shift, size
3751%define shift m1
3752
3753 ; make shift
3754 mov r5d, r3m
3755 movd shift, r5d
3756
3757 ; register alloc
3758 ; r0 - dst
3759 ; r1 - src
3760 ; r2 - stride
3761 ; r3 - shift
3762 ; r4 - size
3763
3764 sub r2d, r4d
3765 add r2d, r2d
3766 mov r5d, r4d
3767 shr r4d, 2
3768.loop_row:
3769 mov r6d, r4d
3770
3771.loop_col:
3772 pmovsxwd m0, [r1]
3773 pslld m0, shift
3774 movu [r0], m0
3775
3776 add r1, 8
3777 add r0, 16
3778
3779 dec r6d
3780 jnz .loop_col
3781
3782 add r1, r2
3783 dec r5d
3784 jnz .loop_row
3785 RET
3786
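; A C sketch of cvt16to32_shl above (illustrative only): sign-extend each
; 16-bit coefficient and shift it up, writing a packed size x size block.
;
;     void cvt16to32_shl_c(int32_t *dst, const int16_t *src, intptr_t stride,
;                          int shift, int size)
;     {
;         for (int y = 0; y < size; y++)
;         {
;             for (int x = 0; x < size; x++)
;                 dst[x] = (int32_t)src[x] << shift;  /* pmovsxwd + pslld */
;             dst += size;    /* packed destination rows */
;             src += stride;
;         }
;     }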
3787
3788;--------------------------------------------------------------------------------------
3789; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
3790;--------------------------------------------------------------------------------------
3791INIT_XMM sse4
3792cglobal cvt16to32_shr_4, 3,3,3
3793 add r2d, r2d
3794 movd m0, r3m
3795 movd m1, r4m
3796 pshufd m1, m1, 0
3797
3798 ; register alloc
3799 ; r0 - dst
3800 ; r1 - src
3801 ; r2 - stride
3802 ; m0 - shift
3803 ; m1 - dword [offset]
3804
3805 ; Row 0
3806 pmovsxwd m2, [r1]
3807 paddd m2, m1
3808 psrad m2, m0
3809 movu [r0 + 0 * mmsize], m2
3810
3811 ; Row 1
3812 pmovsxwd m2, [r1 + r2]
3813 paddd m2, m1
3814 psrad m2, m0
3815 movu [r0 + 1 * mmsize], m2
3816
3817 ; Row 2
3818 lea r1, [r1 + r2 * 2]
3819 pmovsxwd m2, [r1]
3820 paddd m2, m1
3821 psrad m2, m0
3822 movu [r0 + 2 * mmsize], m2
3823
3824 ; Row 3
3825 pmovsxwd m2, [r1 + r2]
3826 paddd m2, m1
3827 psrad m2, m0
3828 movu [r0 + 3 * mmsize], m2
3829 RET
3830
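; A shared C sketch for the cvt16to32_shr_{4,8,16,32} kernels (illustrative
; only; N stands for the block size in the kernel name):
;
;     void cvt16to32_shr_c(int32_t *dst, const int16_t *src, intptr_t stride,
;                          int shift, int offset)
;     {
;         for (int y = 0; y < N; y++)
;         {
;             for (int x = 0; x < N; x++)
;                 dst[x] = ((int32_t)src[x] + offset) >> shift;
;             dst += N;       /* packed N x N destination */
;             src += stride;
;         }
;     }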
3831
3832;--------------------------------------------------------------------------------------
3833; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
3834;--------------------------------------------------------------------------------------
3835INIT_XMM sse4
3836cglobal cvt16to32_shr_8, 3,5,4
3837 add r2d, r2d
3838 movd m0, r3m
3839 movd m1, r4m
3840 pshufd m1, m1, 0
3841 mov r3d, 8/4
3842 lea r4, [r2 * 3]
3843
3844 ; register alloc
3845 ; r0 - dst
3846 ; r1 - src
3847 ; r2 - stride
3848 ; r3 - loop counter
3849 ; r4 - stride * 3
3850 ; m0 - shift
3851 ; m1 - dword [offset]
3852
3853.loop:
3854 ; Row 0
3855 pmovsxwd m2, [r1]
3856 pmovsxwd m3, [r1 + mmsize/2]
3857 paddd m2, m1
3858 paddd m3, m1
3859 psrad m2, m0
3860 psrad m3, m0
3861 movu [r0 + 0 * mmsize], m2
3862 movu [r0 + 1 * mmsize], m3
3863
3864 ; Row 1
3865 pmovsxwd m2, [r1 + r2]
3866 pmovsxwd m3, [r1 + r2 + mmsize/2]
3867 paddd m2, m1
3868 paddd m3, m1
3869 psrad m2, m0
3870 psrad m3, m0
3871 movu [r0 + 2 * mmsize], m2
3872 movu [r0 + 3 * mmsize], m3
3873
3874 ; Row 2
3875 pmovsxwd m2, [r1 + r2 * 2]
3876 pmovsxwd m3, [r1 + r2 * 2 + mmsize/2]
3877 paddd m2, m1
3878 paddd m3, m1
3879 psrad m2, m0
3880 psrad m3, m0
3881 movu [r0 + 4 * mmsize], m2
3882 movu [r0 + 5 * mmsize], m3
3883
3884 ; Row 3
3885 pmovsxwd m2, [r1 + r4]
3886 pmovsxwd m3, [r1 + r4 + mmsize/2]
3887 paddd m2, m1
3888 paddd m3, m1
3889 psrad m2, m0
3890 psrad m3, m0
3891 movu [r0 + 6 * mmsize], m2
3892 movu [r0 + 7 * mmsize], m3
3893
3894 add r0, 8 * mmsize
3895 lea r1, [r1 + r2 * 4]
3896 dec r3d
3897 jnz .loop
3898 RET
3899
3900
3901;--------------------------------------------------------------------------------------
3902; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
3903;--------------------------------------------------------------------------------------
3904INIT_XMM sse4
3905cglobal cvt16to32_shr_16, 3,4,6
3906 add r2d, r2d
3907 movd m0, r3m
3908 movd m1, r4m
3909 pshufd m1, m1, 0
3910 mov r3d, 16/2
3911
3912 ; register alloc
3913 ; r0 - dst
3914 ; r1 - src
3915 ; r2 - stride
3916 ; r3 - loop counter
3917 ; m0 - shift
3918 ; m1 - dword [offset]
3919
3920.loop:
3921 ; Row 0
3922 pmovsxwd m2, [r1 + 0 * mmsize/2]
3923 pmovsxwd m3, [r1 + 1 * mmsize/2]
3924 pmovsxwd m4, [r1 + 2 * mmsize/2]
3925 pmovsxwd m5, [r1 + 3 * mmsize/2]
3926 paddd m2, m1
3927 paddd m3, m1
3928 paddd m4, m1
3929 paddd m5, m1
3930 psrad m2, m0
3931 psrad m3, m0
3932 psrad m4, m0
3933 psrad m5, m0
3934 movu [r0 + 0 * mmsize], m2
3935 movu [r0 + 1 * mmsize], m3
3936 movu [r0 + 2 * mmsize], m4
3937 movu [r0 + 3 * mmsize], m5
3938
3939 ; Row 1
3940 pmovsxwd m2, [r1 + r2 + 0 * mmsize/2]
3941 pmovsxwd m3, [r1 + r2 + 1 * mmsize/2]
3942 pmovsxwd m4, [r1 + r2 + 2 * mmsize/2]
3943 pmovsxwd m5, [r1 + r2 + 3 * mmsize/2]
3944 paddd m2, m1
3945 paddd m3, m1
3946 paddd m4, m1
3947 paddd m5, m1
3948 psrad m2, m0
3949 psrad m3, m0
3950 psrad m4, m0
3951 psrad m5, m0
3952 movu [r0 + 4 * mmsize], m2
3953 movu [r0 + 5 * mmsize], m3
3954 movu [r0 + 6 * mmsize], m4
3955 movu [r0 + 7 * mmsize], m5
3956
3957 add r0, 8 * mmsize
3958 lea r1, [r1 + r2 * 2]
3959 dec r3d
3960 jnz .loop
3961 RET
3962
3963
3964;--------------------------------------------------------------------------------------
3965; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
3966;--------------------------------------------------------------------------------------
3967INIT_XMM sse4
3968cglobal cvt16to32_shr_32, 3,4,6
3969 add r2d, r2d
3970 movd m0, r3m
3971 movd m1, r4m
3972 pshufd m1, m1, 0
3973 mov r3d, 32/1
3974
3975 ; register alloc
3976 ; r0 - dst
3977 ; r1 - src
3978 ; r2 - stride
3979 ; r3 - loop counter
3980 ; m0 - shift
3981 ; m1 - dword [offset]
3982
3983.loop:
3984 ; Row 0
3985 pmovsxwd m2, [r1 + 0 * mmsize/2]
3986 pmovsxwd m3, [r1 + 1 * mmsize/2]
3987 pmovsxwd m4, [r1 + 2 * mmsize/2]
3988 pmovsxwd m5, [r1 + 3 * mmsize/2]
3989 paddd m2, m1
3990 paddd m3, m1
3991 paddd m4, m1
3992 paddd m5, m1
3993 psrad m2, m0
3994 psrad m3, m0
3995 psrad m4, m0
3996 psrad m5, m0
3997 movu [r0 + 0 * mmsize], m2
3998 movu [r0 + 1 * mmsize], m3
3999 movu [r0 + 2 * mmsize], m4
4000 movu [r0 + 3 * mmsize], m5
4001
4002 pmovsxwd m2, [r1 + 4 * mmsize/2]
4003 pmovsxwd m3, [r1 + 5 * mmsize/2]
4004 pmovsxwd m4, [r1 + 6 * mmsize/2]
4005 pmovsxwd m5, [r1 + 7 * mmsize/2]
4006 paddd m2, m1
4007 paddd m3, m1
4008 paddd m4, m1
4009 paddd m5, m1
4010 psrad m2, m0
4011 psrad m3, m0
4012 psrad m4, m0
4013 psrad m5, m0
4014 movu [r0 + 4 * mmsize], m2
4015 movu [r0 + 5 * mmsize], m3
4016 movu [r0 + 6 * mmsize], m4
4017 movu [r0 + 7 * mmsize], m5
4018
4019 add r0, 8 * mmsize
4020 add r1, r2
4021 dec r3d
4022 jnz .loop
4023 RET
4024
4025
4026;--------------------------------------------------------------------------------------
4027; void cvt32to16_shl_4(int16_t *dst, int32_t *src, intptr_t stride, int shift)
4028;--------------------------------------------------------------------------------------
4029INIT_XMM sse2
4030cglobal cvt32to16_shl_4, 3,3,5
4031 add r2d, r2d
4032 movd m0, r3m
4033
4034 ; Row 0-3
4035 movu m1, [r1 + 0 * mmsize]
4036 movu m2, [r1 + 1 * mmsize]
4037 movu m3, [r1 + 2 * mmsize]
4038 movu m4, [r1 + 3 * mmsize]
4039 packssdw m1, m2
4040 packssdw m3, m4
4041 psllw m1, m0
4042 psllw m3, m0
4043 movh [r0], m1
4044 movhps [r0 + r2], m1
4045 movh [r0 + r2 * 2], m3
4046 lea r2, [r2 * 3]
4047 movhps [r0 + r2], m3
4048 RET
4049
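; A shared C sketch for the cvt32to16_shl_{4,8,16,32} kernels (illustrative
; only; N stands for the block size). Note the operation order: packssdw
; saturates each 32-bit value to int16 first, then psllw shifts the words.
;
;     static int16_t sat16(int32_t v)
;     {
;         return (int16_t)(v < -32768 ? -32768 : (v > 32767 ? 32767 : v));
;     }
;
;     void cvt32to16_shl_c(int16_t *dst, const int32_t *src, intptr_t stride,
;                          int shift)
;     {
;         for (int y = 0; y < N; y++)
;         {
;             for (int x = 0; x < N; x++)
;                 dst[x] = (int16_t)(sat16(src[x]) << shift);
;             src += N;       /* packed N x N source */
;             dst += stride;
;         }
;     }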
4050
4051INIT_YMM avx2
4052cglobal cvt32to16_shl_4, 3,3,3
4053 add r2d, r2d
4054 movd xm0, r3m
4055
4056 ; Row 0-3
4057 movu m1, [r1 + 0 * mmsize]
4058 movu m2, [r1 + 1 * mmsize]
4059 packssdw m1, m2
4060 psllw m1, xm0
4061 vextracti128 xm0, m1, 1
4062 movq [r0], xm1
4063 movq [r0 + r2], xm0
4064 lea r0, [r0 + r2 * 2]
4065 movhps [r0], xm1
4066 movhps [r0 + r2], xm0
4067 RET
4068
4069
4070;--------------------------------------------------------------------------------------
4071; void cvt32to16_shl_8(int16_t *dst, int32_t *src, intptr_t stride, int shift)
4072;--------------------------------------------------------------------------------------
4073INIT_XMM sse2
4074cglobal cvt32to16_shl_8, 3,5,5
4075 add r2d, r2d
4076 movd m0, r3m
4077 mov r3d, 8/4
4078 lea r4, [r2 * 3]
4079
4080.loop:
4081 ; Row 0-1
4082 movu m1, [r1 + 0 * mmsize]
4083 movu m2, [r1 + 1 * mmsize]
4084 movu m3, [r1 + 2 * mmsize]
4085 movu m4, [r1 + 3 * mmsize]
4086 packssdw m1, m2
4087 packssdw m3, m4
4088 psllw m1, m0
4089 psllw m3, m0
4090 movu [r0], m1
4091 movu [r0 + r2], m3
4092
4093 ; Row 2-3
4094 movu m1, [r1 + 4 * mmsize]
4095 movu m2, [r1 + 5 * mmsize]
4096 movu m3, [r1 + 6 * mmsize]
4097 movu m4, [r1 + 7 * mmsize]
4098 packssdw m1, m2
4099 packssdw m3, m4
4100 psllw m1, m0
4101 psllw m3, m0
4102 movu [r0 + r2 * 2], m1
4103 movu [r0 + r4], m3
4104
4105 add r1, 8 * mmsize
4106 lea r0, [r0 + r2 * 4]
4107 dec r3d
4108 jnz .loop
4109 RET
4110
4111
4112INIT_YMM avx2
4113cglobal cvt32to16_shl_8, 3,4,3
4114 add r2d, r2d
4115 movd xm0, r3m
4116 lea r3, [r2 * 3]
4117
4118 ; Row 0-1
4119 movu xm1, [r1 + 0 * mmsize]
4120 vinserti128 m1, m1, [r1 + 1 * mmsize], 1
4121 movu xm2, [r1 + 0 * mmsize + mmsize/2]
4122 vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
4123 packssdw m1, m2
4124 psllw m1, xm0
4125 movu [r0], xm1
4126 vextracti128 [r0 + r2], m1, 1
4127
4128 ; Row 2-3
4129 movu xm1, [r1 + 2 * mmsize]
4130 vinserti128 m1, m1, [r1 + 3 * mmsize], 1
4131 movu xm2, [r1 + 2 * mmsize + mmsize/2]
4132 vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
4133 packssdw m1, m2
4134 psllw m1, xm0
4135 movu [r0 + r2 * 2], xm1
4136 vextracti128 [r0 + r3], m1, 1
4137
4138 add r1, 4 * mmsize
4139 lea r0, [r0 + r2 * 4]
4140
4141 ; Row 4-5
4142 movu m1, [r1 + 0 * mmsize]
4143 movu m2, [r1 + 1 * mmsize]
4144 packssdw m1, m2
4145 vpermq m1, m1, 11011000b
4146 psllw m1, xm0
4147 movu [r0], xm1
4148 vextracti128 [r0 + r2], m1, 1
4149
4150 ; Row 6-7
4151 movu m1, [r1 + 2 * mmsize]
4152 movu m2, [r1 + 3 * mmsize]
4153 packssdw m1, m2
4154 vpermq m1, m1, 11011000b
4155 psllw m1, xm0
4156 movu [r0 + r2 * 2], xm1
4157 vextracti128 [r0 + r3], m1, 1
4158 RET
4159
4160;--------------------------------------------------------------------------------------
4161; void cvt32to16_shl_16(int16_t *dst, int32_t *src, intptr_t stride, int shift)
4162;--------------------------------------------------------------------------------------
4163INIT_XMM sse2
4164cglobal cvt32to16_shl_16, 3,4,5
4165 add r2d, r2d
4166 movd m0, r3m
4167 mov r3d, 16/2
4168
4169.loop:
4170 ; Row 0
4171 movu m1, [r1 + 0 * mmsize]
4172 movu m2, [r1 + 1 * mmsize]
4173 movu m3, [r1 + 2 * mmsize]
4174 movu m4, [r1 + 3 * mmsize]
4175 packssdw m1, m2
4176 packssdw m3, m4
4177 psllw m1, m0
4178 psllw m3, m0
4179 movu [r0], m1
4180 movu [r0 + mmsize], m3
4181
4182 ; Row 1
4183 movu m1, [r1 + 4 * mmsize]
4184 movu m2, [r1 + 5 * mmsize]
4185 movu m3, [r1 + 6 * mmsize]
4186 movu m4, [r1 + 7 * mmsize]
4187 packssdw m1, m2
4188 packssdw m3, m4
4189 psllw m1, m0
4190 psllw m3, m0
4191 movu [r0 + r2], m1
4192 movu [r0 + r2 + mmsize], m3
4193
4194 add r1, 8 * mmsize
4195 lea r0, [r0 + r2 * 2]
4196 dec r3d
4197 jnz .loop
4198 RET
4199
4200
4201INIT_YMM avx2
4202cglobal cvt32to16_shl_16, 3,5,3
4203 add r2d, r2d
4204 movd xm0, r3m
4205 mov r3d, 16/4
4206 lea r4, [r2 * 3]
4207
4208.loop:
4209 ; Row 0
4210 movu xm1, [r1 + 0 * mmsize]
4211 vinserti128 m1, m1, [r1 + 1 * mmsize], 1
4212 movu xm2, [r1 + 0 * mmsize + mmsize/2]
4213 vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
4214 packssdw m1, m2
4215 psllw m1, xm0
4216 movu [r0], m1
4217
4218 ; Row 1
4219 movu xm1, [r1 + 2 * mmsize]
4220 vinserti128 m1, m1, [r1 + 3 * mmsize], 1
4221 movu xm2, [r1 + 2 * mmsize + mmsize/2]
4222 vinserti128 m2, m2, [r1 + 3 * mmsize + mmsize/2], 1
4223 packssdw m1, m2
4224 psllw m1, xm0
4225 movu [r0 + r2], m1
4226
4227 add r1, 4 * mmsize
4228
4229 ; Row 2
4230 movu xm1, [r1 + 0 * mmsize]
4231 vinserti128 m1, m1, [r1 + 1 * mmsize], 1
4232 movu xm2, [r1 + 0 * mmsize + mmsize/2]
4233 vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
4234 packssdw m1, m2
4235 psllw m1, xm0
4236 movu [r0 + r2 * 2], m1
4237
4238 ; Row 3
4239 movu m1, [r1 + 2 * mmsize]
4240 movu m2, [r1 + 3 * mmsize]
4241 packssdw m1, m2
4242 psllw m1, xm0
4243 vpermq m1, m1, 11011000b
4244 movu [r0 + r4], m1
4245
4246 add r1, 4 * mmsize
4247 lea r0, [r0 + r2 * 4]
4248 dec r3d
4249 jnz .loop
4250 RET
4251
4252
4253;--------------------------------------------------------------------------------------
4254; void cvt32to16_shl_32(int16_t *dst, int32_t *src, intptr_t stride, int shift)
4255;--------------------------------------------------------------------------------------
4256INIT_XMM sse2
4257cglobal cvt32to16_shl_32, 3,4,5
4258 add r2d, r2d
4259 movd m0, r3m
4260 mov r3d, 32/1
4261
4262.loop:
4263 ; Row 0
4264 movu m1, [r1 + 0 * mmsize]
4265 movu m2, [r1 + 1 * mmsize]
4266 movu m3, [r1 + 2 * mmsize]
4267 movu m4, [r1 + 3 * mmsize]
4268 packssdw m1, m2
4269 packssdw m3, m4
4270 psllw m1, m0
4271 psllw m3, m0
4272 movu [r0 + 0 * mmsize], m1
4273 movu [r0 + 1 * mmsize], m3
4274
4275 movu m1, [r1 + 4 * mmsize]
4276 movu m2, [r1 + 5 * mmsize]
4277 movu m3, [r1 + 6 * mmsize]
4278 movu m4, [r1 + 7 * mmsize]
4279 packssdw m1, m2
4280 packssdw m3, m4
4281 psllw m1, m0
4282 psllw m3, m0
4283 movu [r0 + 2 * mmsize], m1
4284 movu [r0 + 3 * mmsize], m3
4285
4286 add r1, 8 * mmsize
4287 add r0, r2
4288 dec r3d
4289 jnz .loop
4290 RET
4291
4292
4293INIT_YMM avx2
4294cglobal cvt32to16_shl_32, 3,4,5
4295 add r2d, r2d
4296 movd xm0, r3m
4297 mov r3d, 32/2
4298
4299.loop:
4300 ; Row 0
4301 movu xm1, [r1 + 0 * mmsize]
4302 vinserti128 m1, m1, [r1 + 1 * mmsize], 1
4303 movu xm2, [r1 + 0 * mmsize + mmsize/2]
4304 vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
4305 movu xm3, [r1 + 2 * mmsize]
4306 vinserti128 m3, m3, [r1 + 3 * mmsize], 1
4307 movu xm4, [r1 + 2 * mmsize + mmsize/2]
4308 vinserti128 m4, m4, [r1 + 3 * mmsize + mmsize/2], 1
4309 packssdw m1, m2
4310 packssdw m3, m4
4311 psllw m1, xm0
4312 psllw m3, xm0
4313 movu [r0], m1
4314 movu [r0 + mmsize], m3
4315
4316 add r1, 4 * mmsize
4317
4318 ; Row 1
4319 movu xm1, [r1 + 0 * mmsize]
4320 vinserti128 m1, m1, [r1 + 1 * mmsize], 1
4321 movu xm2, [r1 + 0 * mmsize + mmsize/2]
4322 vinserti128 m2, m2, [r1 + 1 * mmsize + mmsize/2], 1
4323 movu m3, [r1 + 2 * mmsize]
4324 movu m4, [r1 + 3 * mmsize]
4325 packssdw m1, m2
4326 packssdw m3, m4
4327 psllw m1, xm0
4328 psllw m3, xm0
4329 vpermq m3, m3, 11011000b
4330 movu [r0 + r2], m1
4331 movu [r0 + r2 + mmsize], m3
4332
4333 add r1, 4 * mmsize
4334 lea r0, [r0 + r2 * 2]
4335 dec r3d
4336 jnz .loop
4337 RET
4338
4339
4340;--------------------------------------------------------------------------------------
4341; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
4342;--------------------------------------------------------------------------------------
4343INIT_XMM sse4
4344cglobal copy_cnt_4, 3,3,3
4345 add r2d, r2d
4346 pxor m2, m2
4347
4348 ; row 0 & 1
4349 movh m0, [r1]
4350 movhps m0, [r1 + r2]
4351 mova [r0], m0
4352
4353 ; row 2 & 3
4354 movh m1, [r1 + r2 * 2]
4355 lea r2, [r2 * 3]
4356 movhps m1, [r1 + r2]
4357 mova [r0 + 16], m1
4358
4359 packsswb m0, m1
4360 pcmpeqb m0, m2
4361
4362 ; get count
4363 ; CHECK_ME: Intel documents POPCNT as an SSE4.2 instruction, but it is only implemented from Nehalem onward
4364%if 0
4365 pmovmskb eax, m0
4366 not ax
4367 popcnt ax, ax
4368%else
4369 mova m1, [pb_1]
4370 paddb m0, m1
4371 psadbw m0, m2
4372 pshufd m1, m0, 2
4373 paddw m0, m1
4374 movd eax, m0
4375%endif
4376 RET
4377
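; The copy_cnt kernels copy a strided coefficient block into a packed one and
; return the number of nonzero coefficients. A C sketch (illustrative only;
; N stands for the block size in the kernel name):
;
;     uint32_t copy_cnt_c(int16_t *dst, const int16_t *src, intptr_t stride)
;     {
;         uint32_t cnt = 0;
;         for (int y = 0; y < N; y++)
;         {
;             for (int x = 0; x < N; x++)
;             {
;                 dst[x] = src[x];
;                 cnt += (src[x] != 0);
;             }
;             dst += N;       /* packed N x N destination */
;             src += stride;
;         }
;         return cnt;
;     }
;
; The SIMD versions avoid POPCNT: packsswb never maps a nonzero int16 to a
; zero byte, so the SSE4 paths count zero bytes with pcmpeqb and correct by
; the iteration count, while the AVX2 paths clamp each byte to 0/1 with
; pminub; psadbw then sums the per-byte flags horizontally.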
4378
4379;--------------------------------------------------------------------------------------
4380; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
4381;--------------------------------------------------------------------------------------
4382INIT_XMM sse4
4383cglobal copy_cnt_8, 3,3,6
4384 add r2d, r2d
4385 pxor m4, m4
4386 pxor m5, m5
4387
4388 ; row 0 & 1
4389 movu m0, [r1]
4390 movu m1, [r1 + r2]
4391 movu [r0], m0
4392 movu [r0 + 16], m1
4393
4394 packsswb m0, m1
4395 pcmpeqb m0, m4
4396 paddb m5, m0
4397
4398 ; row 2 & 3
4399 lea r1, [r1 + 2 * r2]
4400 movu m0, [r1]
4401 movu m1, [r1 + r2]
4402 movu [r0 + 32], m0
4403 movu [r0 + 48], m1
4404
4405 packsswb m0, m1
4406 pcmpeqb m0, m4
4407 paddb m5, m0
4408
4409 ; row 4 & 5
4410 lea r1, [r1 + 2 * r2]
4411 movu m0, [r1]
4412 movu m1, [r1 + r2]
4413 movu [r0 + 64], m0
4414 movu [r0 + 80], m1
4415
4416 packsswb m0, m1
4417 pcmpeqb m0, m4
4418 paddb m5, m0
4419
4420 ; row 6 & 7
4421 lea r1, [r1 + 2 * r2]
4422 movu m0, [r1]
4423 movu m1, [r1 + r2]
4424 movu [r0 + 96], m0
4425 movu [r0 + 112], m1
4426
4427 packsswb m0, m1
4428 pcmpeqb m0, m4
4429 paddb m5, m0
4430
4431 ; get count
4432 mova m0, [pb_4]
4433 paddb m5, m0
4434 psadbw m5, m4
4435 pshufd m0, m5, 2
4436 paddw m5, m0
4437 movd eax, m5
4438 RET
4439
4440
4441INIT_YMM avx2
4442cglobal copy_cnt_8, 3,4,5
4443 add r2d, r2d
4444 lea r3, [r2 * 3]
4445
4446 ; row 0 - 1
4447 movu xm0, [r1]
4448 vinserti128 m0, m0, [r1 + r2], 1
4449 movu [r0], m0
4450
4451 ; row 2 - 3
4452 movu xm1, [r1 + r2 * 2]
4453 vinserti128 m1, m1, [r1 + r3], 1
4454 movu [r0 + 32], m1
4455 lea r1, [r1 + r2 * 4]
4456
4457 ; row 4 - 5
4458 movu xm2, [r1]
4459 vinserti128 m2, m2, [r1 + r2], 1
4460 movu [r0 + 64], m2
4461
4462 ; row 6 - 7
4463 movu xm3, [r1 + r2 * 2]
4464 vinserti128 m3, m3, [r1 + r3], 1
4465 movu [r0 + 96], m3
4466
4467 ; get count
4468 xorpd m4, m4
4469 vpacksswb m0, m1
4470 vpacksswb m2, m3
4471 pminub m0, [pb_1]
4472 pminub m2, [pb_1]
4473 paddb m0, m2
4474 vextracti128 xm1, m0, 1
4475 paddb xm0, xm1
4476 psadbw xm0, xm4
4477 movhlps xm1, xm0
4478 paddd xm0, xm1
4479 movd eax, xm0
4480 RET
4481
4482
4483;--------------------------------------------------------------------------------------
4484; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
4485;--------------------------------------------------------------------------------------
4486INIT_XMM sse4
4487cglobal copy_cnt_16, 3,4,6
4488 add r2d, r2d
4489 mov r3d, 4
4490 pxor m4, m4
4491 pxor m5, m5
4492
4493.loop:
4494 ; row 0
4495 movu m0, [r1]
4496 movu m1, [r1 + 16]
4497 movu [r0], m0
4498 movu [r0 + 16], m1
4499
4500 packsswb m0, m1
4501 pcmpeqb m0, m4
4502 paddb m5, m0
4503
4504 ; row 1
4505 movu m0, [r1 + r2]
4506 movu m1, [r1 + r2 + 16]
4507 movu [r0 + 32], m0
4508 movu [r0 + 48], m1
4509
4510 packsswb m0, m1
4511 pcmpeqb m0, m4
4512 paddb m5, m0
4513
4514 ; row 2
4515 movu m0, [r1 + 2 * r2]
4516 movu m1, [r1 + 2 * r2 + 16]
4517 movu [r0 + 64], m0
4518 movu [r0 + 80], m1
4519
4520 packsswb m0, m1
4521 pcmpeqb m0, m4
4522 paddb m5, m0
4523
4524 ; row 3
4525 lea r1, [r1 + 2 * r2]
4526 movu m0, [r1 + r2]
4527 movu m1, [r1 + r2 + 16]
4528 movu [r0 + 96], m0
4529 movu [r0 + 112], m1
4530
4531 packsswb m0, m1
4532 pcmpeqb m0, m4
4533 paddb m5, m0
4534
4535 add r0, 128
4536 lea r1, [r1 + 2 * r2]
4537 dec r3d
4538 jnz .loop
4539
4540 mova m0, [pb_16]
4541 paddb m5, m0
4542 psadbw m5, m4
4543 pshufd m0, m5, 2
4544 paddw m5, m0
4545 movd eax, m5
4546 RET
4547
4548
4549INIT_YMM avx2
4550cglobal copy_cnt_16, 3, 5, 5
4551 add r2d, r2d
4552 lea r3, [r2 * 3]
4553 mov r4d, 16/4
4554
4555 mova m3, [pb_1]
4556 xorpd m4, m4
4557
4558.loop:
4559 ; row 0 - 1
4560 movu m0, [r1]
4561 movu [r0], m0
4562 movu m1, [r1 + r2]
4563 movu [r0 + 32], m1
4564
4565 packsswb m0, m1
4566 pminub m0, m3
4567
4568 ; row 2 - 3
4569 movu m1, [r1 + r2 * 2]
4570 movu [r0 + 64], m1
4571 movu m2, [r1 + r3]
4572 movu [r0 + 96], m2
4573
4574 packsswb m1, m2
4575 pminub m1, m3
4576 paddb m0, m1
4577 paddb m4, m0
4578
4579 add r0, 128
4580 lea r1, [r1 + 4 * r2]
4581 dec r4d
4582 jnz .loop
4583
4584 ; get count
4585 xorpd m0, m0
4586 vextracti128 xm1, m4, 1
4587 paddb xm4, xm1
4588 psadbw xm4, xm0
4589 movhlps xm1, xm4
4590 paddd xm4, xm1
4591 movd eax, xm4
4592 RET
4593
4594;--------------------------------------------------------------------------------------
4595; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
4596;--------------------------------------------------------------------------------------
4597INIT_XMM sse4
4598cglobal copy_cnt_32, 3,4,6
4599 add r2d, r2d
4600 mov r3d, 16
4601 pxor m4, m4
4602 pxor m5, m5
4603
4604.loop:
4605 ; row 0
4606 movu m0, [r1]
4607 movu m1, [r1 + 16]
4608 movu [r0], m0
4609 movu [r0 + 16], m1
4610
4611 packsswb m0, m1
4612 pcmpeqb m0, m4
4613 paddb m5, m0
4614
4615 movu m0, [r1 + 32]
4616 movu m1, [r1 + 48]
4617 movu [r0 + 32], m0
4618 movu [r0 + 48], m1
4619
4620 packsswb m0, m1
4621 pcmpeqb m0, m4
4622 paddb m5, m0
4623
4624 ; row 1
4625 movu m0, [r1 + r2]
4626 movu m1, [r1 + r2 + 16]
4627 movu [r0 + 64], m0
4628 movu [r0 + 80], m1
4629
4630 packsswb m0, m1
4631 pcmpeqb m0, m4
4632 paddb m5, m0
4633
4634 movu m0, [r1 + r2 + 32]
4635 movu m1, [r1 + r2 + 48]
4636 movu [r0 + 96], m0
4637 movu [r0 + 112], m1
4638
4639 packsswb m0, m1
4640 pcmpeqb m0, m4
4641 paddb m5, m0
4642
4643 add r0, 128
4644 lea r1, [r1 + 2 * r2]
4645 dec r3d
4646 jnz .loop
4647
4648 ; get count
4649 mova m0, [pb_64]
4650 paddb m5, m0
4651 psadbw m5, m4
4652 pshufd m0, m5, 2
4653 paddw m5, m0
4654 movd eax, m5
4655 RET
4656
4657
4658INIT_YMM avx2
4659cglobal copy_cnt_32, 3, 5, 5
4660 add r2d, r2d
4661 mov r3d, 32/2
4662
4663 mova m3, [pb_1]
4664 xorpd m4, m4
4665
4666.loop:
4667 ; row 0
4668 movu m0, [r1]
4669 movu [r0], m0
4670 movu m1, [r1 + 32]
4671 movu [r0 + 32], m1
4672
4673 packsswb m0, m1
4674 pminub m0, m3
4675
4676 ; row 1
4677 movu m1, [r1 + r2]
4678 movu [r0 + 64], m1
4679 movu m2, [r1 + r2 + 32]
4680 movu [r0 + 96], m2
4681
4682 packsswb m1, m2
4683 pminub m1, m3
4684 paddb m0, m1
4685 paddb m4, m0
4686
4687 add r0, 128
4688 lea r1, [r1 + 2 * r2]
4689 dec r3d
4690 jnz .loop
4691
4692 ; get count
4693 xorpd m0, m0
4694 vextracti128 xm1, m4, 1
4695 paddb xm4, xm1
4696 psadbw xm4, xm0
4697 movhlps xm1, xm4
4698 paddd xm4, xm1
4699 movd eax, xm4
4700 RET
4701
4702;-----------------------------------------------------------------------------
4703; void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
4704;-----------------------------------------------------------------------------
4705
4706INIT_XMM sse4
4707cglobal copy_shr, 4, 7, 4, dst, src, stride
4708%define rnd m2
4709%define shift m1
4710
4711 ; make shift
4712 mov r5d, r3m
4713 movd shift, r5d
4714
4715 ; make round
4716 dec r5
4717 xor r6, r6
4718 bts r6, r5
4719
4720 movd rnd, r6d
4721 pshufd rnd, rnd, 0
4722
4723 ; register alloc
4724 ; r0 - dst
4725 ; r1 - src
4726 ; r2 - stride * 2 (short*)
4727 ; r3 - lx
4728 ; r4 - size
4729 ; r5 - ly
4730 ; r6 - diff
4731 add r2d, r2d
4732
4733 mov r4d, r4m
4734 mov r5, r4 ; size
4735 mov r6, r2 ; stride
4736 sub r6, r4
4737 add r6, r6
4738
4739 shr r5, 1
4740.loop_row:
4741
4742 mov r3, r4
4743 shr r3, 2
4744.loop_col:
4745 ; row 0
4746 movh m3, [r1]
4747 pmovsxwd m0, m3
4748 paddd m0, rnd
4749 psrad m0, shift
4750 packssdw m0, m0
4751 movh [r0], m0
4752
4753 ; row 1
4754 movh m3, [r1 + r4 * 2]
4755 pmovsxwd m0, m3
4756 paddd m0, rnd
4757 psrad m0, shift
4758 packssdw m0, m0
4759 movh [r0 + r2], m0
4760
4761 ; move col pointer
4762 add r1, 8
4763 add r0, 8
4764
4765 dec r3
4766 jg .loop_col
4767
4768 ; update pointer
4769 lea r1, [r1 + r4 * 2]
4770 add r0, r6
4771
4772 ; end of loop_row
4773 dec r5
4774 jg .loop_row
4775
4776 RET
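
;-----------------------------------------------------------------------------
; Reference sketch for copy_shr (hedged: copy_shr_c is a hypothetical name
; and the layout is inferred from the asm, which reads a packed size*size
; source and writes a strided destination; note the asm saturates results
; with packssdw where this sketch truncates, and shift must be >= 1):
;
;   #include <stdint.h>
;
;   static void copy_shr_c(int16_t *dst, const int16_t *src,
;                          intptr_t stride, int shift, int size)
;   {
;       int round = 1 << (shift - 1);
;       for (int y = 0; y < size; y++)
;       {
;           for (int x = 0; x < size; x++)
;               dst[x] = (int16_t)((src[x] + round) >> shift);
;           src += size;      /* packed source rows       */
;           dst += stride;    /* strided destination rows */
;       }
;   }
;-----------------------------------------------------------------------------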

;--------------------------------------------------------------------------------------
; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal copy_shl_4, 3,3,3
    add       r2d, r2d
    movd      m0, r3m

    ; Row 0-3
    movu      m1, [r1 + 0 * mmsize]
    movu      m2, [r1 + 1 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    movh      [r0], m1
    movhps    [r0 + r2], m1
    movh      [r0 + r2 * 2], m2
    lea       r2, [r2 * 3]
    movhps    [r0 + r2], m2
    RET
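
;-----------------------------------------------------------------------------
; Reference sketch for the copy_shl_N family (hedged: copy_shl_c is a
; hypothetical name; the packed-source, strided-destination layout is
; inferred from the asm):
;
;   #include <stdint.h>
;
;   static void copy_shl_c(int16_t *dst, const int16_t *src,
;                          intptr_t stride, int shift, int n)
;   {
;       for (int y = 0; y < n; y++)
;       {
;           for (int x = 0; x < n; x++)
;               dst[x] = (int16_t)(src[x] << shift);
;           src += n;         /* packed n*n source block  */
;           dst += stride;    /* strided destination rows */
;       }
;   }
;-----------------------------------------------------------------------------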

;--------------------------------------------------------------------------------------
; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal copy_shl_8, 3,4,5
    add       r2d, r2d
    movd      m0, r3m

    ; Row 0-3
    movu      m1, [r1 + 0 * mmsize]
    movu      m2, [r1 + 1 * mmsize]
    movu      m3, [r1 + 2 * mmsize]
    movu      m4, [r1 + 3 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    movu      [r0], m1
    movu      [r0 + r2], m2
    movu      [r0 + 2 * r2], m3
    lea       r0, [r0 + 2 * r2]
    movu      [r0 + r2], m4

    ; Row 4-7
    movu      m1, [r1 + 4 * mmsize]
    movu      m2, [r1 + 5 * mmsize]
    movu      m3, [r1 + 6 * mmsize]
    movu      m4, [r1 + 7 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    movu      [r0 + r2 * 2], m1
    lea       r0, [r0 + 2 * r2]
    movu      [r0 + r2], m2
    movu      [r0 + 2 * r2], m3
    lea       r0, [r0 + 2 * r2]
    movu      [r0 + r2], m4
    RET

;--------------------------------------------------------------------------------------
; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal copy_shl_16, 3,4,5
    add       r2d, r2d
    movd      m0, r3m
    mov       r3d, 256/64

.loop:
    ; Row 0-1
    movu      m1, [r1 + 0 * mmsize]
    movu      m2, [r1 + 1 * mmsize]
    movu      m3, [r1 + 2 * mmsize]
    movu      m4, [r1 + 3 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    movu      [r0], m1
    movu      [r0 + 16], m2
    movu      [r0 + r2], m3
    movu      [r0 + r2 + 16], m4

    ; Row 2-3
    movu      m1, [r1 + 4 * mmsize]
    movu      m2, [r1 + 5 * mmsize]
    movu      m3, [r1 + 6 * mmsize]
    movu      m4, [r1 + 7 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    movu      [r0 + r2 * 2], m1
    movu      [r0 + r2 * 2 + 16], m2
    lea       r0, [r0 + r2 * 2]
    movu      [r0 + r2], m3
    movu      [r0 + r2 + 16], m4

    add       r1, 8 * mmsize
    lea       r0, [r0 + r2 * 2]
    dec       r3d
    jnz       .loop
    RET

;--------------------------------------------------------------------------------------
; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal copy_shl_32, 3,4,5
    add       r2d, r2d
    movd      m0, r3m
    mov       r3d, 1024/64

.loop:
    ; Row 0
    movu      m1, [r1 + 0 * mmsize]
    movu      m2, [r1 + 1 * mmsize]
    movu      m3, [r1 + 2 * mmsize]
    movu      m4, [r1 + 3 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    movu      [r0], m1
    movu      [r0 + 16], m2
    movu      [r0 + 32], m3
    movu      [r0 + 48], m4

    ; Row 1
    movu      m1, [r1 + 4 * mmsize]
    movu      m2, [r1 + 5 * mmsize]
    movu      m3, [r1 + 6 * mmsize]
    movu      m4, [r1 + 7 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    movu      [r0 + r2], m1
    movu      [r0 + r2 + 16], m2
    movu      [r0 + r2 + 32], m3
    movu      [r0 + r2 + 48], m4

    add       r1, 8 * mmsize
    lea       r0, [r0 + r2 * 2]
    dec       r3d
    jnz       .loop
    RET