Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / common / x86 / blockcopy8.asm
CommitLineData
72b9787e
JB
1;*****************************************************************************
2;* Copyright (C) 2013 x265 project
3;*
4;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5;* Murugan Vairavel <murugan@multicorewareinc.com>
6;*
7;* This program is free software; you can redistribute it and/or modify
8;* it under the terms of the GNU General Public License as published by
9;* the Free Software Foundation; either version 2 of the License, or
10;* (at your option) any later version.
11;*
12;* This program is distributed in the hope that it will be useful,
13;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15;* GNU General Public License for more details.
16;*
17;* You should have received a copy of the GNU General Public License
18;* along with this program; if not, write to the Free Software
19;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20;*
21;* This program is also available under a commercial proprietary license.
22;* For more information, contact us at license @ x265.com.
23;*****************************************************************************/
24
25%include "x86inc.asm"
26%include "x86util.asm"
27
28SECTION_RODATA 32
29
30tab_Vm: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
31
32cextern pb_4
33cextern pb_1
34cextern pb_16
35cextern pb_64
36cextern pw_4
37cextern pb_8
38cextern pb_32
39cextern pb_128
40
41SECTION .text
42
43;-----------------------------------------------------------------------------
b53f7c52 44; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
45;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x4, 4, 7, 0
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; A row is 2 bytes wide, so every row moves as one 16-bit word.
    ; All four rows are loaded before the first one is stored.
    mov         r6w, [r2]               ; row 0
    mov         r4w, [r2 + r3]          ; row 1
    lea         r2, [r2 + 2 * r3]       ; src -> row 2
    mov         r5w, [r2]               ; row 2
    mov         r3w, [r2 + r3]          ; row 3; overwrites the low word of the stride,
                                        ; which is not read again

    mov         [r0], r6w
    mov         [r0 + r1], r4w
    lea         r0, [r0 + 2 * r1]       ; dst -> row 2
    mov         [r0], r5w
    mov         [r0 + r1], r3w
    RET
60
61;-----------------------------------------------------------------------------
b53f7c52 62; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
63;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x8, 4, 7, 0
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Eight rows of 2 bytes, each moved as one 16-bit word.

    ; rows 0-2
    mov         r4w, [r2]
    mov         r5w, [r2 + r3]
    mov         r6w, [r2 + 2 * r3]

    mov         [r0], r4w
    mov         [r0 + r1], r5w
    mov         [r0 + 2 * r1], r6w

    ; rows 3-4, then rows 5-6: step both pointers two rows and copy the
    ; rows at offsets +1 and +2 from the new base
%rep 2
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]

    mov         r4w, [r2 + r3]
    mov         r5w, [r2 + 2 * r3]

    mov         [r0 + r1], r4w
    mov         [r0 + 2 * r1], r5w
%endrep

    ; row 7
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]

    mov         r4w, [r2 + r3]
    mov         [r0 + r1], r4w
    RET
98
99;-----------------------------------------------------------------------------
b53f7c52 100; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
101;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x16, 4, 7, 0
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Sixteen rows of 2 bytes, two rows per loop iteration.
    mov         r6d, 16 / 2             ; r6d = row pairs still to copy
.loop:
    mov         r5w, [r2]
    mov         r4w, [r2 + r3]
    lea         r2, [r2 + 2 * r3]
    mov         [r0], r5w
    mov         [r0 + r1], r4w
    lea         r0, [r0 + 2 * r1]
    dec         r6d
    jnz         .loop
    RET
115
116
117;-----------------------------------------------------------------------------
b53f7c52 118; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
119;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x2, 4, 6, 0
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Two rows of 4 bytes; each row moves through a 32-bit general register.
    mov         r5d, [r2]               ; row 0
    mov         r4d, [r2 + r3]          ; row 1

    mov         [r0], r5d
    mov         [r0 + r1], r4d
    RET
128
129;-----------------------------------------------------------------------------
b53f7c52 130; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
131;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x4, 4, 4, 4
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Four rows of 4 bytes; all rows are loaded before any is stored.
    movd        m0, [r2]                ; row 0
    movd        m1, [r2 + r3]           ; row 1
    movd        m2, [r2 + 2 * r3]       ; row 2
    lea         r3, [3 * r3]            ; r3 = 3 * srcStride
    movd        m3, [r2 + r3]           ; row 3

    movd        [r0], m0
    movd        [r0 + r1], m1
    movd        [r0 + 2 * r1], m2
    lea         r1, [3 * r1]            ; r1 = 3 * dstStride
    movd        [r0 + r1], m3
    RET
146
147;-----------------------------------------------------------------------------
b53f7c52 148; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
149;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W4_H8 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 8 times and copies eight 4-byte rows per pass.
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    mov         r4d, %2 / 8             ; r4d = groups of 8 rows still to copy
.loop:
    ; two identical batches of 4 rows each
%rep 2
    movd        m0, [r2]
    movd        m1, [r2 + r3]
    lea         r2, [r2 + 2 * r3]
    movd        m2, [r2]
    movd        m3, [r2 + r3]

    movd        [r0], m0
    movd        [r0 + r1], m1
    lea         r0, [r0 + 2 * r1]
    movd        [r0], m2
    movd        [r0 + r1], m3

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W4_H8 4, 8
BLOCKCOPY_PP_W4_H8 4, 16
BLOCKCOPY_PP_W4_H8 4, 32
193
194;-----------------------------------------------------------------------------
b53f7c52 195; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
196;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x8, 4, 7, 8
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Each 6-byte row is split into a 4-byte part (via XMM) and a 2-byte
    ; part at offset 4 (via a 16-bit GPR). The 4-byte parts of all eight
    ; rows are copied first, then the 2-byte parts.

    ; ---- 4-byte parts: load rows 0-7, r5 walks the source
    movd        m0, [r2]
    movd        m1, [r2 + r3]
    lea         r5, [r2 + 2 * r3]       ; r5 -> src row 2
    movd        m2, [r5]
    movd        m3, [r5 + r3]
    lea         r5, [r5 + 2 * r3]       ; r5 -> src row 4
    movd        m4, [r5]
    movd        m5, [r5 + r3]
    lea         r5, [r5 + 2 * r3]       ; r5 -> src row 6
    movd        m6, [r5]
    movd        m7, [r5 + r3]

    ; ---- 4-byte parts: store rows 0-7, r6 walks the destination
    movd        [r0], m0
    movd        [r0 + r1], m1
    lea         r6, [r0 + 2 * r1]       ; r6 -> dst row 2
    movd        [r6], m2
    movd        [r6 + r1], m3
    lea         r6, [r6 + 2 * r1]       ; r6 -> dst row 4
    movd        [r6], m4
    movd        [r6 + r1], m5
    lea         r6, [r6 + 2 * r1]       ; r6 -> dst row 6
    movd        [r6], m6
    movd        [r6 + r1], m7

    ; ---- 2-byte parts: rows 0-2 (r5/r6 are plain scratch from here on)
    mov         r4w, [r2 + 4]
    mov         r5w, [r2 + r3 + 4]
    mov         r6w, [r2 + 2 * r3 + 4]

    mov         [r0 + 4], r4w
    mov         [r0 + r1 + 4], r5w
    mov         [r0 + 2 * r1 + 4], r6w

    ; ---- 2-byte parts: rows 3-4, then rows 5-6
%rep 2
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]

    mov         r4w, [r2 + r3 + 4]
    mov         r5w, [r2 + 2 * r3 + 4]

    mov         [r0 + r1 + 4], r4w
    mov         [r0 + 2 * r1 + 4], r5w
%endrep

    ; ---- 2-byte part: row 7
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]

    mov         r4w, [r2 + r3 + 4]
    mov         [r0 + r1 + 4], r4w
    RET
258
259;-----------------------------------------------------------------------------
b53f7c52 260; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
261;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_6x16, 4, 7, 2
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Sixteen 6-byte rows, two per iteration. Each row is moved as a
    ; 4-byte XMM transfer plus a 16-bit GPR transfer at offset 4.
    mov         r6d, 16 / 2             ; r6d = row pairs still to copy
.loop:
    movd        m0, [r2]
    mov         r5w, [r2 + 4]
    movd        m1, [r2 + r3]
    mov         r4w, [r2 + r3 + 4]
    lea         r2, [r2 + 2 * r3]

    movd        [r0], m0
    mov         [r0 + 4], r5w
    movd        [r0 + r1], m1
    mov         [r0 + r1 + 4], r4w
    lea         r0, [r0 + 2 * r1]

    dec         r6d
    jnz         .loop
    RET
279
280
281;-----------------------------------------------------------------------------
b53f7c52 282; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
283;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x2, 4, 4, 2
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Two rows of 8 bytes, one 64-bit XMM transfer per row.
    movh        m1, [r2]                ; row 0
    movh        m0, [r2 + r3]           ; row 1

    movh        [r0], m1
    movh        [r0 + r1], m0
    RET
292
293;-----------------------------------------------------------------------------
b53f7c52 294; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
295;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x4, 4, 4, 4
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Four rows of 8 bytes; all rows are loaded before any is stored.
    movh        m0, [r2]                ; row 0
    movh        m1, [r2 + r3]           ; row 1
    movh        m2, [r2 + 2 * r3]       ; row 2
    lea         r3, [3 * r3]            ; r3 = 3 * srcStride
    movh        m3, [r2 + r3]           ; row 3

    movh        [r0], m0
    movh        [r0 + r1], m1
    movh        [r0 + 2 * r1], m2
    lea         r1, [3 * r1]            ; r1 = 3 * dstStride
    movh        [r0 + r1], m3
    RET
310
311;-----------------------------------------------------------------------------
b53f7c52 312; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
313;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x6, 4, 7, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Six rows of 8 bytes. r5 walks the source and r6 the destination,
    ; so r0 and r2 are left untouched.
    movh        m0, [r2]                ; row 0
    movh        m1, [r2 + r3]           ; row 1
    lea         r5, [r2 + 2 * r3]       ; r5 -> src row 2
    movh        m2, [r5]
    movh        m3, [r5 + r3]
    lea         r5, [r5 + 2 * r3]       ; r5 -> src row 4
    movh        m4, [r5]
    movh        m5, [r5 + r3]

    movh        [r0], m0
    movh        [r0 + r1], m1
    lea         r6, [r0 + 2 * r1]       ; r6 -> dst row 2
    movh        [r6], m2
    movh        [r6 + r1], m3
    lea         r6, [r6 + 2 * r1]       ; r6 -> dst row 4
    movh        [r6], m4
    movh        [r6 + r1], m5
    RET
334
335;-----------------------------------------------------------------------------
b53f7c52 336; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
337;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_8x12, 4, 5, 2
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Twelve rows of 8 bytes, two rows per iteration.
    mov         r4d, 12 / 2             ; r4d = row pairs still to copy
.loop:
    movh        m0, [r2]
    movh        m1, [r2 + r3]
    movh        [r0], m0
    movh        [r0 + r1], m1

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    dec         r4d
    jnz         .loop
    RET
351
352;-----------------------------------------------------------------------------
b53f7c52 353; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
354;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W8_H8 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 8 times and copies eight 8-byte rows per pass.
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    mov         r4d, %2 / 8             ; r4d = groups of 8 rows still to copy

.loop:
    ; rows 0-5: load six rows, then store them
    movh        m0, [r2]
    movh        m1, [r2 + r3]
    lea         r2, [r2 + 2 * r3]
    movh        m2, [r2]
    movh        m3, [r2 + r3]
    lea         r2, [r2 + 2 * r3]
    movh        m4, [r2]
    movh        m5, [r2 + r3]

    movh        [r0], m0
    movh        [r0 + r1], m1
    lea         r0, [r0 + 2 * r1]
    movh        [r0], m2
    movh        [r0 + r1], m3
    lea         r0, [r0 + 2 * r1]
    movh        [r0], m4
    movh        [r0 + r1], m5

    ; rows 6-7
    lea         r2, [r2 + 2 * r3]
    movh        m0, [r2]
    movh        m1, [r2 + r3]
    lea         r0, [r0 + 2 * r1]
    movh        [r0], m0
    movh        [r0 + r1], m1

    ; step to the next group of eight rows
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W8_H8 8, 8
BLOCKCOPY_PP_W8_H8 8, 16
BLOCKCOPY_PP_W8_H8 8, 32
BLOCKCOPY_PP_W8_H8 8, 64
398
399;-----------------------------------------------------------------------------
b53f7c52 400; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
401;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W12_H4 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 4 times and copies four 12-byte rows per pass;
; each row is an 8-byte transfer plus a 4-byte transfer at offset 8.
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    mov         r4d, %2 / 4             ; r4d = groups of 4 rows still to copy

.loop:
    ; two row pairs per iteration
%rep 2
    movh        m0, [r2]
    movd        m1, [r2 + 8]
    movh        m2, [r2 + r3]
    movd        m3, [r2 + r3 + 8]

    movh        [r0], m0
    movd        [r0 + 8], m1
    movh        [r0 + r1], m2
    movd        [r0 + r1 + 8], m3

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W12_H4 12, 16
BLOCKCOPY_PP_W12_H4 12, 32
440
441;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
443;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H4 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 4 times and copies four 16-byte rows per pass
; using unaligned 128-bit transfers.
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    mov         r4d, %2 / 4             ; r4d = groups of 4 rows still to copy

.loop:
    movu        m0, [r2]
    movu        m1, [r2 + r3]
    lea         r2, [r2 + 2 * r3]
    movu        m2, [r2]
    movu        m3, [r2 + r3]

    movu        [r0], m0
    movu        [r0 + r1], m1
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m2
    movu        [r0 + r1], m3

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W16_H4 16, 4
BLOCKCOPY_PP_W16_H4 16, 12
472
473;-----------------------------------------------------------------------------
b53f7c52 474; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
475;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H8 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 8 times and copies eight 16-byte rows per pass
; using unaligned 128-bit transfers.
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    mov         r4d, %2 / 8             ; r4d = groups of 8 rows still to copy

.loop:
    ; rows 0-5: load six rows, leaving r2 on row 6
    movu        m0, [r2]
    movu        m1, [r2 + r3]
    lea         r2, [r2 + 2 * r3]
    movu        m2, [r2]
    movu        m3, [r2 + r3]
    lea         r2, [r2 + 2 * r3]
    movu        m4, [r2]
    movu        m5, [r2 + r3]
    lea         r2, [r2 + 2 * r3]

    ; rows 0-5: store, leaving r0 on row 6
    movu        [r0], m0
    movu        [r0 + r1], m1
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m2
    movu        [r0 + r1], m3
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m4
    movu        [r0 + r1], m5
    lea         r0, [r0 + 2 * r1]

    ; rows 6-7
    movu        m0, [r2]
    movu        m1, [r2 + r3]
    movu        [r0], m0
    movu        [r0 + r1], m1

    ; step to the next group of eight rows
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W16_H8 16, 8
BLOCKCOPY_PP_W16_H8 16, 16
BLOCKCOPY_PP_W16_H8 16, 32
BLOCKCOPY_PP_W16_H8 16, 64
BLOCKCOPY_PP_W16_H8 16, 24
520
521;-----------------------------------------------------------------------------
b53f7c52 522; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
523;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W24_H4 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 4 times and copies four 24-byte rows per pass;
; each row is a 16-byte transfer plus an 8-byte transfer at offset 16.
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    mov         r4d, %2 / 4             ; r4d = groups of 4 rows still to copy

.loop:
    ; rows 0-2: load
    movu        m0, [r2]
    movh        m1, [r2 + 16]
    movu        m2, [r2 + r3]
    movh        m3, [r2 + r3 + 16]
    lea         r2, [r2 + 2 * r3]
    movu        m4, [r2]
    movh        m5, [r2 + 16]

    ; rows 0-2: store
    movu        [r0], m0
    movh        [r0 + 16], m1
    movu        [r0 + r1], m2
    movh        [r0 + r1 + 16], m3
    lea         r0, [r0 + 2 * r1]
    movu        [r0], m4
    movh        [r0 + 16], m5

    ; row 3
    movu        m0, [r2 + r3]
    movh        m1, [r2 + r3 + 16]
    movu        [r0 + r1], m0
    movh        [r0 + r1 + 16], m1

    ; step to the next group of four rows
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W24_H4 24, 32
BLOCKCOPY_PP_W24_H4 24, 64
561
562;-----------------------------------------------------------------------------
b53f7c52 563; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
564;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H4 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 4 times and copies four 32-byte rows per pass,
; each row as two unaligned 128-bit transfers.
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 4
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    mov         r4d, %2 / 4             ; r4d = groups of 4 rows still to copy

.loop:
    ; two row pairs per iteration
%rep 2
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + r3]
    movu        m3, [r2 + r3 + 16]

    movu        [r0], m0
    movu        [r0 + 16], m1
    movu        [r0 + r1], m2
    movu        [r0 + r1 + 16], m3

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W32_H4 32, 8
BLOCKCOPY_PP_W32_H4 32, 16
BLOCKCOPY_PP_W32_H4 32, 24
BLOCKCOPY_PP_W32_H4 32, 32
BLOCKCOPY_PP_W32_H4 32, 64
BLOCKCOPY_PP_W32_H4 32, 48
607
INIT_YMM avx
cglobal blockcopy_pp_32x8, 4, 6, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Eight rows of 32 bytes, one unaligned 256-bit transfer per row.
    lea         r4, [3 * r1]            ; r4 = 3 * dstStride
    lea         r5, [3 * r3]            ; r5 = 3 * srcStride

    ; rows 0-5: load
    movu        m0, [r2]
    movu        m1, [r2 + r3]
    movu        m2, [r2 + 2 * r3]
    movu        m3, [r2 + r5]
    lea         r2, [r2 + 4 * r3]       ; src -> row 4
    movu        m4, [r2]
    movu        m5, [r2 + r3]

    ; rows 0-5: store
    movu        [r0], m0
    movu        [r0 + r1], m1
    movu        [r0 + 2 * r1], m2
    movu        [r0 + r4], m3
    lea         r0, [r0 + 4 * r1]       ; dst -> row 4
    movu        [r0], m4
    movu        [r0 + r1], m5

    ; rows 6-7
    movu        m0, [r2 + 2 * r3]
    movu        m1, [r2 + r5]

    movu        [r0 + 2 * r1], m0
    movu        [r0 + r4], m1
    RET
635
INIT_YMM avx
cglobal blockcopy_pp_32x16, 4, 6, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Sixteen rows of 32 bytes, one unaligned 256-bit transfer per row,
    ; handled in batches of six, six and four rows.
    lea         r4, [3 * r1]            ; r4 = 3 * dstStride
    lea         r5, [3 * r3]            ; r5 = 3 * srcStride

    ; rows 0-5: load
    movu        m0, [r2]
    movu        m1, [r2 + r3]
    movu        m2, [r2 + 2 * r3]
    movu        m3, [r2 + r5]
    lea         r2, [r2 + 4 * r3]       ; src -> row 4
    movu        m4, [r2]
    movu        m5, [r2 + r3]

    ; rows 0-5: store
    movu        [r0], m0
    movu        [r0 + r1], m1
    movu        [r0 + 2 * r1], m2
    movu        [r0 + r4], m3
    lea         r0, [r0 + 4 * r1]       ; dst -> row 4
    movu        [r0], m4
    movu        [r0 + r1], m5

    ; rows 6-11: load
    movu        m0, [r2 + 2 * r3]
    movu        m1, [r2 + r5]
    lea         r2, [r2 + 4 * r3]       ; src -> row 8
    movu        m2, [r2]
    movu        m3, [r2 + r3]
    movu        m4, [r2 + 2 * r3]
    movu        m5, [r2 + r5]

    ; rows 6-11: store
    movu        [r0 + 2 * r1], m0
    movu        [r0 + r4], m1
    lea         r0, [r0 + 4 * r1]       ; dst -> row 8
    movu        [r0], m2
    movu        [r0 + r1], m3
    movu        [r0 + 2 * r1], m4
    movu        [r0 + r4], m5

    ; rows 12-15
    lea         r2, [r2 + 4 * r3]       ; src -> row 12
    movu        m0, [r2]
    movu        m1, [r2 + r3]
    movu        m2, [r2 + 2 * r3]
    movu        m3, [r2 + r5]

    lea         r0, [r0 + 4 * r1]       ; dst -> row 12
    movu        [r0], m0
    movu        [r0 + r1], m1
    movu        [r0 + 2 * r1], m2
    movu        [r0 + r4], m3
    RET
685
686;-----------------------------------------------------------------------------
b53f7c52 687; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
688;-----------------------------------------------------------------------------
INIT_YMM avx
cglobal blockcopy_pp_32x24, 4, 7, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    ; Twenty-four rows of 32 bytes, eight rows per loop iteration,
    ; one unaligned 256-bit transfer per row.
    lea         r4, [3 * r1]            ; r4 = 3 * dstStride
    lea         r5, [3 * r3]            ; r5 = 3 * srcStride
    mov         r6d, 24 / 8             ; r6d = groups of 8 rows still to copy

.loop:
    ; rows 0-5 of the group: load
    movu        m0, [r2]
    movu        m1, [r2 + r3]
    movu        m2, [r2 + 2 * r3]
    movu        m3, [r2 + r5]
    lea         r2, [r2 + 4 * r3]
    movu        m4, [r2]
    movu        m5, [r2 + r3]

    ; rows 0-5 of the group: store
    movu        [r0], m0
    movu        [r0 + r1], m1
    movu        [r0 + 2 * r1], m2
    movu        [r0 + r4], m3
    lea         r0, [r0 + 4 * r1]
    movu        [r0], m4
    movu        [r0 + r1], m5

    ; rows 6-7 of the group
    movu        m0, [r2 + 2 * r3]
    movu        m1, [r2 + r5]

    movu        [r0 + 2 * r1], m0
    movu        [r0 + r4], m1

    ; step to the next group of eight rows
    lea         r2, [r2 + 4 * r3]
    lea         r0, [r0 + 4 * r1]
    dec         r6d
    jnz         .loop
    RET
723
724;-----------------------------------------------------------------------------
b53f7c52 725; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
726;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W32_H16_avx 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 16 times and copies sixteen 32-byte rows per pass,
; one unaligned 256-bit transfer per row.
INIT_YMM avx
cglobal blockcopy_pp_%1x%2, 4, 7, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    lea         r4, [3 * r1]            ; r4 = 3 * dstStride
    lea         r5, [3 * r3]            ; r5 = 3 * srcStride
    mov         r6d, %2 / 16            ; r6d = groups of 16 rows still to copy

.loop:
    ; rows 0-5 of the group: load
    movu        m0, [r2]
    movu        m1, [r2 + r3]
    movu        m2, [r2 + 2 * r3]
    movu        m3, [r2 + r5]
    lea         r2, [r2 + 4 * r3]
    movu        m4, [r2]
    movu        m5, [r2 + r3]

    ; rows 0-5 of the group: store
    movu        [r0], m0
    movu        [r0 + r1], m1
    movu        [r0 + 2 * r1], m2
    movu        [r0 + r4], m3
    lea         r0, [r0 + 4 * r1]
    movu        [r0], m4
    movu        [r0 + r1], m5

    ; rows 6-11 of the group: load
    movu        m0, [r2 + 2 * r3]
    movu        m1, [r2 + r5]
    lea         r2, [r2 + 4 * r3]
    movu        m2, [r2]
    movu        m3, [r2 + r3]
    movu        m4, [r2 + 2 * r3]
    movu        m5, [r2 + r5]

    ; rows 6-11 of the group: store
    movu        [r0 + 2 * r1], m0
    movu        [r0 + r4], m1
    lea         r0, [r0 + 4 * r1]
    movu        [r0], m2
    movu        [r0 + r1], m3
    movu        [r0 + 2 * r1], m4
    movu        [r0 + r4], m5

    ; rows 12-15 of the group
    lea         r2, [r2 + 4 * r3]
    movu        m0, [r2]
    movu        m1, [r2 + r3]
    movu        m2, [r2 + 2 * r3]
    movu        m3, [r2 + r5]

    lea         r0, [r0 + 4 * r1]
    movu        [r0], m0
    movu        [r0 + r1], m1
    movu        [r0 + 2 * r1], m2
    movu        [r0 + r4], m3

    ; step to the next group of sixteen rows
    lea         r2, [r2 + 4 * r3]
    lea         r0, [r0 + 4 * r1]
    dec         r6d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W32_H16_avx 32, 32
BLOCKCOPY_PP_W32_H16_avx 32, 48
BLOCKCOPY_PP_W32_H16_avx 32, 64
789
790;-----------------------------------------------------------------------------
b53f7c52 791; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
792;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W48_H2 2
; %1 = block width (used only in the symbol name), %2 = block height.
; Despite the "H2" in the macro name, the loop runs %2 / 4 times and
; copies four 48-byte rows per pass (two row pairs), each row as three
; unaligned 128-bit transfers.
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    mov         r4d, %2 / 4             ; r4d = groups of 4 rows still to copy

.loop:
    ; two row pairs per iteration
%rep 2
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        m3, [r2 + r3]
    movu        m4, [r2 + r3 + 16]
    movu        m5, [r2 + r3 + 32]

    movu        [r0], m0
    movu        [r0 + 16], m1
    movu        [r0 + 32], m2
    movu        [r0 + r1], m3
    movu        [r0 + r1 + 16], m4
    movu        [r0 + r1 + 32], m5

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W48_H2 48, 64
837
838;-----------------------------------------------------------------------------
b53f7c52 839; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
840;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 4 times and copies four 64-byte rows per pass.
; Only six XMM registers are used, so the four rows are moved in three
; batches of 96, 96 and 64 bytes.
INIT_XMM sse2
cglobal blockcopy_pp_%1x%2, 4, 5, 6
    ; r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride
    mov         r4d, %2 / 4             ; r4d = groups of 4 rows still to copy

.loop:
    ; batch 1: all of row 0 and the first 32 bytes of row 1
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        m3, [r2 + 48]
    movu        m4, [r2 + r3]
    movu        m5, [r2 + r3 + 16]

    movu        [r0], m0
    movu        [r0 + 16], m1
    movu        [r0 + 32], m2
    movu        [r0 + 48], m3
    movu        [r0 + r1], m4
    movu        [r0 + r1 + 16], m5

    ; batch 2: the last 32 bytes of row 1 and all of row 2
    movu        m0, [r2 + r3 + 32]
    movu        m1, [r2 + r3 + 48]
    lea         r2, [r2 + 2 * r3]       ; src -> row 2
    movu        m2, [r2]
    movu        m3, [r2 + 16]
    movu        m4, [r2 + 32]
    movu        m5, [r2 + 48]

    movu        [r0 + r1 + 32], m0
    movu        [r0 + r1 + 48], m1
    lea         r0, [r0 + 2 * r1]       ; dst -> row 2
    movu        [r0], m2
    movu        [r0 + 16], m3
    movu        [r0 + 32], m4
    movu        [r0 + 48], m5

    ; batch 3: all of row 3
    movu        m0, [r2 + r3]
    movu        m1, [r2 + r3 + 16]
    movu        m2, [r2 + r3 + 32]
    movu        m3, [r2 + r3 + 48]

    movu        [r0 + r1], m0
    movu        [r0 + r1 + 16], m1
    movu        [r0 + r1 + 32], m2
    movu        [r0 + r1 + 48], m3

    ; step to the next group of four rows
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PP_W64_H4 64, 16
BLOCKCOPY_PP_W64_H4 64, 32
BLOCKCOPY_PP_W64_H4 64, 48
BLOCKCOPY_PP_W64_H4 64, 64
898
899;-----------------------------------------------------------------------------
b53f7c52 900; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
72b9787e
JB
901;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x4, 4, 5, 2
    ; r0 = dst, r1 = dstStride, r2 = src (16-bit elements), r3 = srcStride
    ; Four rows of two 16-bit values, packed with unsigned saturation to
    ; two bytes per row.
    add         r3, r3                  ; double srcStride so it can be used as a byte offset

    ; rows 0-1
    movd        m0, [r2]
    movd        m1, [r2 + r3]
    packuswb    m0, m1                  ; row 0 -> bytes 0-1, row 1 -> bytes 8-9
    movd        r4d, m0
    mov         [r0], r4w
    pextrw      [r0 + r1], m0, 4        ; word 4 = bytes 8-9 = row 1

    ; rows 2-3: step both pointers two rows, then repeat the pair copy
%rep 1
    lea         r2, [r2 + 2 * r3]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r2]
    movd        m1, [r2 + r3]
    packuswb    m0, m1
    movd        r4d, m0
    mov         [r0], r4w
    pextrw      [r0 + r1], m0, 4
%endrep
    RET
926
927
928;-----------------------------------------------------------------------------
b53f7c52 929; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
72b9787e
JB
930;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_2x8, 4, 5, 2
    ; r0 = dst, r1 = dstStride, r2 = src (16-bit elements), r3 = srcStride
    ; Eight rows of two 16-bit values, packed with unsigned saturation to
    ; two bytes per row.
    add         r3, r3                  ; double srcStride so it can be used as a byte offset

    ; rows 0-1
    movd        m0, [r2]
    movd        m1, [r2 + r3]
    packuswb    m0, m1                  ; row 0 -> bytes 0-1, row 1 -> bytes 8-9
    movd        r4d, m0
    mov         [r0], r4w
    pextrw      [r0 + r1], m0, 4        ; word 4 = bytes 8-9 = row 1

    ; rows 2-3, 4-5 and 6-7: step both pointers two rows, then repeat
%rep 3
    lea         r2, [r2 + 2 * r3]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r2]
    movd        m1, [r2 + r3]
    packuswb    m0, m1
    movd        r4d, m0
    mov         [r0], r4w
    pextrw      [r0 + r1], m0, 4
%endrep
    RET
975
976;-----------------------------------------------------------------------------
b53f7c52 977; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
72b9787e
JB
978;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W2_H2 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 2 times; each pass packs two rows of two 16-bit
; values to bytes (unsigned saturation) and stores 2 bytes per row.
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
    add         srcStrideq, srcStrideq  ; double srcStride so it can be used as a byte offset
    mov         r6d, %2 / 2             ; r6d = row pairs still to copy
.loop:
    movd        m0, [srcq]
    movd        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    packuswb    m0, m0
    packuswb    m1, m1
    movd        r4d, m0
    movd        r5d, m1
    mov         [dstq], r4w
    mov         [dstq + dstStrideq], r5w
    lea         dstq, [dstq + 2 * dstStrideq]
    dec         r6d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SP_W2_H2 2, 4
BLOCKCOPY_SP_W2_H2 2, 8
BLOCKCOPY_SP_W2_H2 2, 16
1004
1005;-----------------------------------------------------------------------------
b53f7c52 1006; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
72b9787e
JB
1007;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
    ; Two rows of four 16-bit values, packed with unsigned saturation to
    ; four bytes per row.
    add         srcStrideq, srcStrideq  ; double srcStride so it can be used as a byte offset

    movh        m0, [srcq]
    movh        m1, [srcq + srcStrideq]
    packuswb    m0, m1                  ; row 0 -> dword 0, row 1 -> dword 2

    movd        [dstq], m0
    pshufd      m0, m0, 2               ; bring dword 2 (row 1) down to dword 0
    movd        [dstq + dstStrideq], m0
    RET
1023
1024;-----------------------------------------------------------------------------
b53f7c52 1025; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
72b9787e
JB
1026;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
    ; Four rows of four 16-bit values, packed with unsigned saturation to
    ; four bytes per row.
    add         srcStrideq, srcStrideq  ; double srcStride so it can be used as a byte offset

    ; load all four rows
    movh        m0, [srcq]
    movh        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movh        m2, [srcq]
    movh        m3, [srcq + srcStrideq]

    ; each pack puts the even row in dword 0 and the odd row in dword 2
    packuswb    m0, m1
    packuswb    m2, m3

    ; rows 0-1
    movd        [dstq], m0
    pshufd      m0, m0, 2               ; bring dword 2 (odd row) down to dword 0
    movd        [dstq + dstStrideq], m0

    ; rows 2-3
    lea         dstq, [dstq + 2 * dstStrideq]
    movd        [dstq], m2
    pshufd      m2, m2, 2
    movd        [dstq + dstStrideq], m2
    RET
1050
1051;-----------------------------------------------------------------------------
b53f7c52 1052; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
72b9787e
JB
1053;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
    ; Eight rows of four 16-bit values, packed with unsigned saturation to
    ; four bytes per row. All rows are loaded before any is stored.
    add         srcStrideq, srcStrideq  ; double srcStride so it can be used as a byte offset

    ; load rows 0-7
    movh        m0, [srcq]
    movh        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movh        m2, [srcq]
    movh        m3, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movh        m4, [srcq]
    movh        m5, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movh        m6, [srcq]
    movh        m7, [srcq + srcStrideq]

    ; each pack puts the even row in dword 0 and the odd row in dword 2
    packuswb    m0, m1
    packuswb    m2, m3
    packuswb    m4, m5
    packuswb    m6, m7

    ; rows 0-1
    movd        [dstq], m0
    pshufd      m0, m0, 2               ; bring dword 2 (odd row) down to dword 0
    movd        [dstq + dstStrideq], m0

    ; rows 2-3
    lea         dstq, [dstq + 2 * dstStrideq]
    movd        [dstq], m2
    pshufd      m2, m2, 2
    movd        [dstq + dstStrideq], m2

    ; rows 4-5
    lea         dstq, [dstq + 2 * dstStrideq]
    movd        [dstq], m4
    pshufd      m4, m4, 2
    movd        [dstq + dstStrideq], m4

    ; rows 6-7
    lea         dstq, [dstq + 2 * dstStrideq]
    movd        [dstq], m6
    pshufd      m6, m6, 2
    movd        [dstq + dstStrideq], m6
    RET
1093
1094;-----------------------------------------------------------------------------
b53f7c52 1095; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
72b9787e
JB
1096;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W4_H8 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 8 times; each pass packs eight rows of four 16-bit
; values to bytes (unsigned saturation) and stores 4 bytes per row.
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
    mov         r4d, %2 / 8             ; r4d = groups of 8 rows still to copy
    add         srcStrideq, srcStrideq  ; double srcStride so it can be used as a byte offset

.loop:
    ; load rows 0-7 of the group
    movh        m0, [srcq]
    movh        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movh        m2, [srcq]
    movh        m3, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movh        m4, [srcq]
    movh        m5, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movh        m6, [srcq]
    movh        m7, [srcq + srcStrideq]

    ; each pack puts the even row in dword 0 and the odd row in dword 2
    packuswb    m0, m1
    packuswb    m2, m3
    packuswb    m4, m5
    packuswb    m6, m7

    ; rows 0-1
    movd        [dstq], m0
    pshufd      m0, m0, 2               ; bring dword 2 (odd row) down to dword 0
    movd        [dstq + dstStrideq], m0

    ; rows 2-3
    lea         dstq, [dstq + 2 * dstStrideq]
    movd        [dstq], m2
    pshufd      m2, m2, 2
    movd        [dstq + dstStrideq], m2

    ; rows 4-5
    lea         dstq, [dstq + 2 * dstStrideq]
    movd        [dstq], m4
    pshufd      m4, m4, 2
    movd        [dstq + dstStrideq], m4

    ; rows 6-7
    lea         dstq, [dstq + 2 * dstStrideq]
    movd        [dstq], m6
    pshufd      m6, m6, 2
    movd        [dstq + dstStrideq], m6

    ; step to the next group of eight rows
    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SP_W4_H8 4, 16
BLOCKCOPY_SP_W4_H8 4, 32
1151
1152;-----------------------------------------------------------------------------
b53f7c52 1153; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
72b9787e
JB
1154;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_sp_6x8, 4, 4, 3
    ; r0 = dst, r1 = dstStride, r2 = src (16-bit elements), r3 = srcStride
    ; Eight rows of six 16-bit values, packed with unsigned saturation to
    ; six bytes per row and written as a 4-byte store plus a 2-byte store.
    ;
    ; Each source row is read as 8 + 4 bytes, i.e. exactly the 12 bytes that
    ; belong to the row. A single 16-byte load per row would also fetch the
    ; 4 bytes that follow the row, which lie outside the block. The two
    ; unused words of every row are zero here; they are never stored.
    add         r3, r3                  ; double srcStride so it can be used as a byte offset

    ; rows 0-1
    movh        m0, [r2]                ; row 0, values 0-3
    movd        m2, [r2 + 8]            ; row 0, values 4-5
    punpcklqdq  m0, m2                  ; m0 = row 0 values 0-5, then two zero words
    movh        m1, [r2 + r3]           ; row 1, values 0-3
    movd        m2, [r2 + r3 + 8]       ; row 1, values 4-5
    punpcklqdq  m1, m2                  ; m1 = row 1 values 0-5, then two zero words
    packuswb    m0, m1                  ; row 0 -> bytes 0-5, row 1 -> bytes 8-13

    movd        [r0], m0
    pextrw      [r0 + 4], m0, 2         ; word 2 = bytes 4-5
    movhlps     m0, m0                  ; move the packed odd row into the low half
    movd        [r0 + r1], m0
    pextrw      [r0 + r1 + 4], m0, 2

    ; rows 2-3, 4-5 and 6-7: step both pointers two rows, then repeat
%rep 3
    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]

    movh        m0, [r2]
    movd        m2, [r2 + 8]
    punpcklqdq  m0, m2
    movh        m1, [r2 + r3]
    movd        m2, [r2 + r3 + 8]
    punpcklqdq  m1, m2
    packuswb    m0, m1

    movd        [r0], m0
    pextrw      [r0 + 4], m0, 2
    movhlps     m0, m0
    movd        [r0 + r1], m0
    pextrw      [r0 + r1 + 4], m0, 2
%endrep
    RET
1214
1215;-----------------------------------------------------------------------------
b53f7c52 1216; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
72b9787e
JB
1217;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W6_H2 2
; %1 = block width (used only in the symbol name), %2 = block height.
; The loop runs %2 / 2 times; each pass packs two rows of six 16-bit
; values to bytes (unsigned saturation). A row is read as 8 + 4 bytes and
; written as a 4-byte store plus a 2-byte store at offset 4.
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
    add         srcStrideq, srcStrideq  ; double srcStride so it can be used as a byte offset
    mov         r6d, %2 / 2             ; r6d = row pairs still to copy
.loop:
    movh        m0, [srcq]                      ; even row, values 0-3
    movd        m2, [srcq + 8]                  ; even row, values 4-5
    movh        m1, [srcq + srcStrideq]         ; odd row, values 0-3
    movd        m3, [srcq + srcStrideq + 8]     ; odd row, values 4-5
    lea         srcq, [srcq + 2 * srcStrideq]

    packuswb    m0, m0
    packuswb    m2, m2
    packuswb    m1, m1
    packuswb    m3, m3

    movd        r4d, m2
    movd        r5d, m3
    movd        [dstq], m0
    mov         [dstq + 4], r4w
    movd        [dstq + dstStrideq], m1
    mov         [dstq + dstStrideq + 4], r5w
    lea         dstq, [dstq + 2 * dstStrideq]

    dec         r6d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SP_W6_H2 6, 8
BLOCKCOPY_SP_W6_H2 6, 16
1248
;-----------------------------------------------------------------------------
; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Narrows two rows of eight int16_t samples to 8-bit pixels (unsigned
; saturation) and stores them.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

    movu        m0, [srcq]                          ; row 0
    movu        m1, [srcq + srcStrideq]             ; row 1
    packuswb    m0, m1                              ; low qword = row 0, high qword = row 1

    movlps      [dstq], m0
    movhps      [dstq + dstStrideq], m0
    RET
1266
;-----------------------------------------------------------------------------
; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Loads four rows of eight int16_t samples, packs each row pair into one
; register of 8-bit pixels, then stores the four rows.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

    ; load rows 0-3
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    movu        m2, [srcq + 2 * srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m3, [srcq + srcStrideq]

    ; each packed register holds two rows (low qword / high qword)
    packuswb    m0, m1
    packuswb    m2, m3

    ; store rows 0-3
    movlps      [dstq], m0
    movhps      [dstq + dstStrideq], m0
    movlps      [dstq + 2 * dstStrideq], m2
    lea         dstq, [dstq + 2 * dstStrideq]
    movhps      [dstq + dstStrideq], m2
    RET
1291
;-----------------------------------------------------------------------------
; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Loads six rows of eight int16_t samples, packs each row pair into one
; register of 8-bit pixels, then stores the six rows.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

    ; load rows 0-5 (src advances two rows at a time)
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    movu        m2, [srcq + 2 * srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m3, [srcq + srcStrideq]
    movu        m4, [srcq + 2 * srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m5, [srcq + srcStrideq]

    ; each packed register holds two rows (low qword / high qword)
    packuswb    m0, m1
    packuswb    m2, m3
    packuswb    m4, m5

    ; store rows 0-5 (dst advances two rows at a time)
    movlps      [dstq], m0
    movhps      [dstq + dstStrideq], m0
    movlps      [dstq + 2 * dstStrideq], m2
    lea         dstq, [dstq + 2 * dstStrideq]
    movhps      [dstq + dstStrideq], m2
    movlps      [dstq + 2 * dstStrideq], m4
    lea         dstq, [dstq + 2 * dstStrideq]
    movhps      [dstq + dstStrideq], m4
    RET
1323
;-----------------------------------------------------------------------------
; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Loads eight rows of eight int16_t samples, packs each row pair into one
; register of 8-bit pixels, then stores the eight rows.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

    ; load rows 0-7 (src advances two rows at a time)
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    movu        m2, [srcq + 2 * srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m3, [srcq + srcStrideq]
    movu        m4, [srcq + 2 * srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m5, [srcq + srcStrideq]
    movu        m6, [srcq + 2 * srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m7, [srcq + srcStrideq]

    ; each packed register holds two rows (low qword / high qword)
    packuswb    m0, m1
    packuswb    m2, m3
    packuswb    m4, m5
    packuswb    m6, m7

    ; store rows 0-7 (dst advances two rows at a time)
    movlps      [dstq], m0
    movhps      [dstq + dstStrideq], m0
    movlps      [dstq + 2 * dstStrideq], m2
    lea         dstq, [dstq + 2 * dstStrideq]
    movhps      [dstq + dstStrideq], m2
    movlps      [dstq + 2 * dstStrideq], m4
    lea         dstq, [dstq + 2 * dstStrideq]
    movhps      [dstq + dstStrideq], m4
    movlps      [dstq + 2 * dstStrideq], m6
    lea         dstq, [dstq + 2 * dstStrideq]
    movhps      [dstq + dstStrideq], m6
    RET
1362
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; SSE2 copy of an 8-wide block, four rows per loop iteration, narrowing
; int16_t samples to 8-bit pixels with unsigned saturation.
;   %1 = block width (8), %2 = block height (multiple of 4)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H4 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride, quads
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes
    mov         quadsd, %2/4                        ; number of 4-row groups
.loop:
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + srcStrideq * 2]
    movu        m2, [srcq]
    movu        m3, [srcq + srcStrideq]
    dec         quadsd                              ; flags survive until jnz (only lea/SSE follow)
    lea         srcq, [srcq + srcStrideq * 2]
    packuswb    m0, m1                              ; rows 0/1
    packuswb    m2, m3                              ; rows 2/3
    movlps      [dstq], m0
    movhps      [dstq + dstStrideq], m0
    lea         dstq, [dstq + dstStrideq * 2]
    movlps      [dstq], m2
    movhps      [dstq + dstStrideq], m2
    lea         dstq, [dstq + dstStrideq * 2]
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SP_W8_H4 8, 12
1392
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; SSE2 copy of an 8-wide block, eight rows per loop iteration, narrowing
; int16_t samples to 8-bit pixels with unsigned saturation.
;   %1 = block width (8), %2 = block height (multiple of 8)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W8_H8 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride, octets
    mov         octetsd, %2/8                       ; number of 8-row groups
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

.loop:
    ; load rows 0-7 of this group
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    movu        m2, [srcq + 2 * srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m3, [srcq + srcStrideq]
    movu        m4, [srcq + 2 * srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m5, [srcq + srcStrideq]
    movu        m6, [srcq + 2 * srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m7, [srcq + srcStrideq]

    ; each packed register holds two rows (low qword / high qword)
    packuswb    m0, m1
    packuswb    m2, m3
    packuswb    m4, m5
    packuswb    m6, m7

    ; store rows 0-7 of this group
    movlps      [dstq], m0
    movhps      [dstq + dstStrideq], m0
    movlps      [dstq + 2 * dstStrideq], m2
    lea         dstq, [dstq + 2 * dstStrideq]
    movhps      [dstq + dstStrideq], m2
    movlps      [dstq + 2 * dstStrideq], m4
    lea         dstq, [dstq + 2 * dstStrideq]
    movhps      [dstq + dstStrideq], m4
    movlps      [dstq + 2 * dstStrideq], m6
    lea         dstq, [dstq + 2 * dstStrideq]
    movhps      [dstq + dstStrideq], m6

    ; both pointers are at row 6 here; step them to the next group
    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         octetsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_SP_W8_H8 8, 16
BLOCKCOPY_SP_W8_H8 8, 32

BLOCKCOPY_SP_W8_H8 8, 64
1447
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; SSE2 copy of a 12-wide block, four rows per loop iteration, narrowing
; int16_t samples to 8-bit pixels with unsigned saturation.
;   %1 = block width (12), %2 = block height (multiple of 4)
;
; A row is 12 samples = 24 bytes. Samples 0-7 are read with a 16-byte load and
; samples 8-11 with an 8-byte load (movh), so no bytes beyond the row are
; touched. movh zeroes the upper half of the register; those lanes end up in
; packed bytes 12-15, which are never stored.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W12_H4 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride, quads
    mov         quadsd, %2/4                        ; number of 4-row groups
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

.loop:
    movu        m0, [srcq]                          ; row 0, samples 0-7
    movh        m1, [srcq + 16]                     ; row 0, samples 8-11
    movu        m2, [srcq + srcStrideq]             ; row 1
    movh        m3, [srcq + srcStrideq + 16]
    movu        m4, [srcq + 2 * srcStrideq]         ; row 2
    movh        m5, [srcq + 2 * srcStrideq + 16]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m6, [srcq + srcStrideq]             ; row 3
    movh        m7, [srcq + srcStrideq + 16]

    ; one packed register per row: bytes 0-11 are the row's pixels
    packuswb    m0, m1
    packuswb    m2, m3
    packuswb    m4, m5
    packuswb    m6, m7

    ; each row is stored as 8 bytes + 4 bytes (dword 2 moved down by pshufd)
    movh        [dstq], m0
    pshufd      m0, m0, 2
    movd        [dstq + 8], m0

    movh        [dstq + dstStrideq], m2
    pshufd      m2, m2, 2
    movd        [dstq + dstStrideq + 8], m2

    movh        [dstq + 2 * dstStrideq], m4
    pshufd      m4, m4, 2
    movd        [dstq + 2 * dstStrideq + 8], m4

    lea         dstq, [dstq + 2 * dstStrideq]
    movh        [dstq + dstStrideq], m6
    pshufd      m6, m6, 2
    movd        [dstq + dstStrideq + 8], m6

    ; both pointers are at row 2 here; step them to the next group
    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         quadsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_SP_W12_H4 12, 16

BLOCKCOPY_SP_W12_H4 12, 32
1504
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; SSE2 copy of a 16-wide block, four rows per loop iteration, narrowing
; int16_t samples to 8-bit pixels with unsigned saturation.
;   %1 = block width (16), %2 = block height (multiple of 4)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W16_H4 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride, quads
    mov         quadsd, %2/4                        ; number of 4-row groups
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

.loop:
    ; two 16-byte loads per row (samples 0-7 and 8-15)
    movu        m0, [srcq]
    movu        m1, [srcq + 16]
    movu        m2, [srcq + srcStrideq]
    movu        m3, [srcq + srcStrideq + 16]
    movu        m4, [srcq + 2 * srcStrideq]
    movu        m5, [srcq + 2 * srcStrideq + 16]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m6, [srcq + srcStrideq]
    movu        m7, [srcq + srcStrideq + 16]

    ; one packed register per row
    packuswb    m0, m1
    packuswb    m2, m3
    packuswb    m4, m5
    packuswb    m6, m7

    movu        [dstq], m0
    movu        [dstq + dstStrideq], m2
    movu        [dstq + 2 * dstStrideq], m4
    lea         dstq, [dstq + 2 * dstStrideq]
    movu        [dstq + dstStrideq], m6

    ; both pointers are at row 2 here; step them to the next group
    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         quadsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_SP_W16_H4 16, 4
BLOCKCOPY_SP_W16_H4 16, 8
BLOCKCOPY_SP_W16_H4 16, 12
BLOCKCOPY_SP_W16_H4 16, 16
BLOCKCOPY_SP_W16_H4 16, 32
BLOCKCOPY_SP_W16_H4 16, 64

BLOCKCOPY_SP_W16_H4 16, 24
1555
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; SSE2 copy of a 24-wide block, two rows per loop iteration, narrowing
; int16_t samples to 8-bit pixels with unsigned saturation.
;   %1 = block width (24), %2 = block height (multiple of 2)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W24_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride, pairs
    mov         pairsd, %2/2                        ; number of row pairs
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

.loop:
    movu        m0, [srcq]                          ; row 0, samples  0-7
    movu        m1, [srcq + 16]                     ; row 0, samples  8-15
    movu        m2, [srcq + 32]                     ; row 0, samples 16-23
    movu        m3, [srcq + srcStrideq]             ; row 1, samples  0-7
    movu        m4, [srcq + srcStrideq + 16]        ; row 1, samples  8-15
    movu        m5, [srcq + srcStrideq + 32]        ; row 1, samples 16-23

    packuswb    m0, m1                              ; row 0 pixels 0-15
    packuswb    m2, m3                              ; low: row 0 pixels 16-23, high: row 1 pixels 0-7
    packuswb    m4, m5                              ; row 1 pixels 8-23

    movu        [dstq], m0
    movlps      [dstq + 16], m2
    movhps      [dstq + dstStrideq], m2
    movu        [dstq + dstStrideq + 8], m4

    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         pairsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_SP_W24_H2 24, 32

BLOCKCOPY_SP_W24_H2 24, 64
1596
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; SSE2 copy of a 32-wide block, two rows per loop iteration, narrowing
; int16_t samples to 8-bit pixels with unsigned saturation.
;   %1 = block width (32), %2 = block height (multiple of 2)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W32_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride, pairs
    mov         pairsd, %2/2                        ; number of row pairs
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

.loop:
    ; row 0: 32 samples in four registers
    movu        m0, [srcq]
    movu        m1, [srcq + 16]
    movu        m2, [srcq + 32]
    movu        m3, [srcq + 48]
    ; row 1
    movu        m4, [srcq + srcStrideq]
    movu        m5, [srcq + srcStrideq + 16]
    movu        m6, [srcq + srcStrideq + 32]
    movu        m7, [srcq + srcStrideq + 48]

    packuswb    m0, m1                              ; row 0 pixels  0-15
    packuswb    m2, m3                              ; row 0 pixels 16-31
    packuswb    m4, m5                              ; row 1 pixels  0-15
    packuswb    m6, m7                              ; row 1 pixels 16-31

    movu        [dstq], m0
    movu        [dstq + 16], m2
    movu        [dstq + dstStrideq], m4
    movu        [dstq + dstStrideq + 16], m6

    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         pairsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_SP_W32_H2 32, 8
BLOCKCOPY_SP_W32_H2 32, 16
BLOCKCOPY_SP_W32_H2 32, 24
BLOCKCOPY_SP_W32_H2 32, 32
BLOCKCOPY_SP_W32_H2 32, 64

BLOCKCOPY_SP_W32_H2 32, 48
1644
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; SSE2 copy of a 48-wide block, narrowing int16_t samples to 8-bit pixels with
; unsigned saturation. Despite the "_H2" in the macro name, the loop below
; handles ONE row per iteration and runs %2 times.
;   %1 = block width (48), %2 = block height
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W48_H2 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride, rows
    mov         rowsd, %2                           ; one iteration per row
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

.loop:
    ; 48 samples = six 16-byte loads
    movu        m0, [srcq]
    movu        m1, [srcq + 16]
    movu        m2, [srcq + 32]
    movu        m3, [srcq + 48]
    movu        m4, [srcq + 64]
    movu        m5, [srcq + 80]

    packuswb    m0, m1                              ; pixels  0-15
    packuswb    m2, m3                              ; pixels 16-31
    packuswb    m4, m5                              ; pixels 32-47

    movu        [dstq], m0
    movu        [dstq + 16], m2
    movu        [dstq + 32], m4

    lea         dstq, [dstq + dstStrideq]
    lea         srcq, [srcq + srcStrideq]

    dec         rowsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_SP_W48_H2 48, 64
1682
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; SSE2 copy of a 64-wide block, one row per loop iteration, narrowing
; int16_t samples to 8-bit pixels with unsigned saturation.
;   %1 = block width (64), %2 = block height
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SP_W64_H1 2
INIT_XMM sse2
cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride, rows
    mov         rowsd, %2                           ; one iteration per row
    add         srcStrideq, srcStrideq              ; int16_t units -> bytes

.loop:
    ; 64 samples = eight 16-byte loads
    movu        m0, [srcq]
    movu        m1, [srcq + 16]
    movu        m2, [srcq + 32]
    movu        m3, [srcq + 48]
    movu        m4, [srcq + 64]
    movu        m5, [srcq + 80]
    movu        m6, [srcq + 96]
    movu        m7, [srcq + 112]

    packuswb    m0, m1                              ; pixels  0-15
    packuswb    m2, m3                              ; pixels 16-31
    packuswb    m4, m5                              ; pixels 32-47
    packuswb    m6, m7                              ; pixels 48-63

    movu        [dstq], m0
    movu        [dstq + 16], m2
    movu        [dstq + 32], m4
    movu        [dstq + 48], m6

    lea         dstq, [dstq + dstStrideq]
    lea         srcq, [srcq + srcStrideq]

    dec         rowsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_SP_W64_H1 64, 16
BLOCKCOPY_SP_W64_H1 64, 32
BLOCKCOPY_SP_W64_H1 64, 48
BLOCKCOPY_SP_W64_H1 64, 64
1727
;-----------------------------------------------------------------------------
; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
;
; Writes val into every element of a 4x4 block of int16_t.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movd        m0, vald
    pshuflw     m0, m0, 0                           ; replicate val into the low four words

    ; four rows of 8 bytes each
    movh        [dstq], m0
    movh        [dstq + dstStrideq], m0
    movh        [dstq + 2 * dstStrideq], m0
    lea         dstq, [dstq + 2 * dstStrideq]
    movh        [dstq + dstStrideq], m0
    RET
1746
;-----------------------------------------------------------------------------
; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val)
;
; Writes val into every element of an 8x8 block of int16_t.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movd        m0, vald
    pshuflw     m0, m0, 0                           ; replicate val into the low four words
    pshufd      m0, m0, 0                           ; ... and then into all eight words

    movu        [dstq], m0                          ; row 0

%rep 3
    ; rows 1-2, 3-4, 5-6: two stores, then advance two rows
    movu        [dstq + dstStrideq], m0
    movu        [dstq + 2 * dstStrideq], m0
    lea         dstq, [dstq + 2 * dstStrideq]
%endrep

    movu        [dstq + dstStrideq], m0             ; row 7
    RET
1775
;-----------------------------------------------------------------------------
; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
;
; SSE2 fill of a 16-wide int16_t block, eight rows per loop iteration.
;   %1 = block width (16), %2 = block height (multiple of 8)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W16_H8 2
INIT_XMM sse2
cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val, octets, cur
    mov         octetsd, %2/8                       ; number of 8-row groups
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movd        m0, vald
    pshuflw     m0, m0, 0                           ; replicate val into the low four words
    pshufd      m0, m0, 0                           ; ... and then into all eight words

.loop:
    ; each row is 32 bytes: two 16-byte stores
    movu        [dstq], m0                          ; row 0
    movu        [dstq + 16], m0

    movu        [dstq + dstStrideq], m0             ; row 1
    movu        [dstq + dstStrideq + 16], m0

    movu        [dstq + 2 * dstStrideq], m0         ; row 2
    movu        [dstq + 2 * dstStrideq + 16], m0

    lea         curq, [dstq + 2 * dstStrideq]
    movu        [curq + dstStrideq], m0             ; row 3
    movu        [curq + dstStrideq + 16], m0

    movu        [dstq + 4 * dstStrideq], m0         ; row 4
    movu        [dstq + 4 * dstStrideq + 16], m0

    lea         curq, [dstq + 4 * dstStrideq]
    movu        [curq + dstStrideq], m0             ; row 5
    movu        [curq + dstStrideq + 16], m0

    movu        [curq + 2 * dstStrideq], m0         ; row 6
    movu        [curq + 2 * dstStrideq + 16], m0

    lea         curq, [curq + 2 * dstStrideq]
    movu        [curq + dstStrideq], m0             ; row 7
    movu        [curq + dstStrideq + 16], m0

    lea         dstq, [dstq + 8 * dstStrideq]

    dec         octetsd
    jnz         .loop

    RET
%endmacro

BLOCKFILL_S_W16_H8 16, 16
1828
; AVX2 blockfill_s_16x16(int16_t* dst, intptr_t dstride, int16_t val):
; one 32-byte store per 16-element row, unrolled as four groups of four rows.
INIT_YMM avx2
cglobal blockfill_s_16x16, 3, 4, 1, dst, dstStride, val, stride3
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    lea         stride3q, [3 * dstStrideq]          ; byte offset of the 4th row in a group
    movd        xm0, vald
    vpbroadcastw m0, xm0                            ; val in all 16 word lanes

%assign fill16_group 0
%rep 4
%if fill16_group > 0
    lea         dstq, [dstq + 4 * dstStrideq]
%endif
    movu        [dstq], m0
    movu        [dstq + dstStrideq], m0
    movu        [dstq + 2 * dstStrideq], m0
    movu        [dstq + stride3q], m0
%assign fill16_group fill16_group + 1
%endrep
    RET
1856
;-----------------------------------------------------------------------------
; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
;
; SSE2 fill of a 32-wide int16_t block, four rows per loop iteration.
;   %1 = block width (32), %2 = block height (multiple of 4)
;-----------------------------------------------------------------------------
%macro BLOCKFILL_S_W32_H4 2
INIT_XMM sse2
cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val, quads, cur
    mov         quadsd, %2/4                        ; number of 4-row groups
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movd        m0, vald
    pshuflw     m0, m0, 0                           ; replicate val into the low four words
    pshufd      m0, m0, 0                           ; ... and then into all eight words

.loop:
    ; each row is 64 bytes: four 16-byte stores
    movu        [dstq], m0                          ; row 0
    movu        [dstq + 16], m0
    movu        [dstq + 32], m0
    movu        [dstq + 48], m0

    movu        [dstq + dstStrideq], m0             ; row 1
    movu        [dstq + dstStrideq + 16], m0
    movu        [dstq + dstStrideq + 32], m0
    movu        [dstq + dstStrideq + 48], m0

    movu        [dstq + 2 * dstStrideq], m0         ; row 2
    movu        [dstq + 2 * dstStrideq + 16], m0
    movu        [dstq + 2 * dstStrideq + 32], m0
    movu        [dstq + 2 * dstStrideq + 48], m0

    lea         curq, [dstq + 2 * dstStrideq]

    movu        [curq + dstStrideq], m0             ; row 3
    movu        [curq + dstStrideq + 16], m0
    movu        [curq + dstStrideq + 32], m0
    movu        [curq + dstStrideq + 48], m0

    lea         dstq, [dstq + 4 * dstStrideq]

    dec         quadsd
    jnz         .loop

    RET
%endmacro

BLOCKFILL_S_W32_H4 32, 32
1904
; AVX2 blockfill_s_32x32(int16_t* dst, intptr_t dstride, int16_t val):
; two 32-byte stores per 32-element row, unrolled as eight groups of four rows.
INIT_YMM avx2
cglobal blockfill_s_32x32, 3, 4, 1, dst, dstStride, val, stride3
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    lea         stride3q, [3 * dstStrideq]          ; byte offset of the 4th row in a group
    movd        xm0, vald
    vpbroadcastw m0, xm0                            ; val in all 16 word lanes

%assign fill32_group 0
%rep 8
%if fill32_group > 0
    lea         dstq, [dstq + 4 * dstStrideq]
%endif
    movu        [dstq], m0
    movu        [dstq + 32], m0
    movu        [dstq + dstStrideq], m0
    movu        [dstq + dstStrideq + 32], m0
    movu        [dstq + 2 * dstStrideq], m0
    movu        [dstq + 2 * dstStrideq + 32], m0
    movu        [dstq + stride3q], m0
    movu        [dstq + stride3q + 32], m0
%assign fill32_group fill32_group + 1
%endrep
    RET
1984
;-----------------------------------------------------------------------------
; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends a 2x4 block of 8-bit pixels to int16_t. Each row is read with
; a 4-byte load (2 pixels used) and written with a 4-byte store.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movd        m0, [srcq]                          ; row 0
    pmovzxbw    m0, m0
    movd        [dstq], m0

    movd        m0, [srcq + srcStrideq]             ; row 1
    pmovzxbw    m0, m0
    movd        [dstq + dstStrideq], m0

    movd        m0, [srcq + 2 * srcStrideq]         ; row 2
    pmovzxbw    m0, m0
    movd        [dstq + 2 * dstStrideq], m0

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]

    movd        m0, [srcq + srcStrideq]             ; row 3
    pmovzxbw    m0, m0
    movd        [dstq + dstStrideq], m0
    RET
2013
2014
;-----------------------------------------------------------------------------
; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends a 2x8 block of 8-bit pixels to int16_t. Each row is read with
; a 4-byte load (2 pixels used) and written with a 4-byte store.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movd        m0, [srcq]                          ; row 0
    pmovzxbw    m0, m0
    movd        [dstq], m0

%rep 3
    ; rows 1-2, 3-4, 5-6: copy two rows, then advance both pointers two rows
    movd        m0, [srcq + srcStrideq]
    pmovzxbw    m0, m0
    movd        [dstq + dstStrideq], m0

    movd        m0, [srcq + 2 * srcStrideq]
    pmovzxbw    m0, m0
    movd        [dstq + 2 * dstStrideq], m0

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]
%endrep

    movd        m0, [srcq + srcStrideq]             ; row 7
    pmovzxbw    m0, m0
    movd        [dstq + dstStrideq], m0
    RET
2065
2066
;-----------------------------------------------------------------------------
; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends a 2x16 block of 8-bit pixels to int16_t, two rows per loop
; iteration.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride, pairs
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    mov         pairsd, 16/2                        ; number of row pairs
.loop:
    movd        m0, [srcq]
    movd        m1, [srcq + srcStrideq]
    dec         pairsd                              ; flags survive until jnz (only lea/SSE follow)
    lea         srcq, [srcq + srcStrideq * 2]
    pmovzxbw    m0, m0
    pmovzxbw    m1, m1
    movd        [dstq], m0
    movd        [dstq + dstStrideq], m1
    lea         dstq, [dstq + dstStrideq * 2]
    jnz         .loop
    RET
2086
2087
;-----------------------------------------------------------------------------
; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends a 4x2 block of 8-bit pixels to int16_t.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movd        m0, [srcq]                          ; row 0: 4 pixels
    pmovzxbw    m0, m0
    movh        [dstq], m0                          ; 4 words

    movd        m0, [srcq + srcStrideq]             ; row 1
    pmovzxbw    m0, m0
    movh        [dstq + dstStrideq], m0
    RET
2105
2106
;-----------------------------------------------------------------------------
; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends a 4x4 block of 8-bit pixels to int16_t.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movd        m0, [srcq]                          ; row 0
    pmovzxbw    m0, m0
    movh        [dstq], m0

    movd        m0, [srcq + srcStrideq]             ; row 1
    pmovzxbw    m0, m0
    movh        [dstq + dstStrideq], m0

    movd        m0, [srcq + 2 * srcStrideq]         ; row 2
    pmovzxbw    m0, m0
    movh        [dstq + 2 * dstStrideq], m0

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]

    movd        m0, [srcq + srcStrideq]             ; row 3
    pmovzxbw    m0, m0
    movh        [dstq + dstStrideq], m0
    RET
2135
2136
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; SSE4 zero-extending copy of a 4-wide block of 8-bit pixels to int16_t,
; four rows per loop iteration.
;   %1 = block width (4), %2 = block height (multiple of 4)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W4_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride, quads
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    mov         quadsd, %2/4                        ; number of 4-row groups

.loop:
    movd        m0, [srcq]                          ; row 0
    pmovzxbw    m0, m0
    movh        [dstq], m0

    movd        m0, [srcq + srcStrideq]             ; row 1
    pmovzxbw    m0, m0
    movh        [dstq + dstStrideq], m0

    movd        m0, [srcq + 2 * srcStrideq]         ; row 2
    pmovzxbw    m0, m0
    movh        [dstq + 2 * dstStrideq], m0

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]

    movd        m0, [srcq + srcStrideq]             ; row 3
    pmovzxbw    m0, m0
    movh        [dstq + dstStrideq], m0

    ; both pointers are at row 2 here; step them to the next group
    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         quadsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_PS_W4_H4 4, 8
BLOCKCOPY_PS_W4_H4 4, 16

BLOCKCOPY_PS_W4_H4 4, 32
2180
2181
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; SSE4 zero-extending copy of a 6-wide block of 8-bit pixels to int16_t,
; four rows per loop iteration. Each row is read with an 8-byte load
; (6 pixels used) and written as 8 bytes + 4 bytes.
;   %1 = block width (6), %2 = block height (multiple of 4)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W6_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride, quads
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    mov         quadsd, %2/4                        ; number of 4-row groups

.loop:
    movh        m0, [srcq]                          ; row 0
    pmovzxbw    m0, m0
    movh        [dstq], m0                          ; words 0-3
    pextrd      [dstq + 8], m0, 2                   ; words 4-5

    movh        m0, [srcq + srcStrideq]             ; row 1
    pmovzxbw    m0, m0
    movh        [dstq + dstStrideq], m0
    pextrd      [dstq + dstStrideq + 8], m0, 2

    movh        m0, [srcq + 2 * srcStrideq]         ; row 2
    pmovzxbw    m0, m0
    movh        [dstq + 2 * dstStrideq], m0
    pextrd      [dstq + 2 * dstStrideq + 8], m0, 2

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]

    movh        m0, [srcq + srcStrideq]             ; row 3
    pmovzxbw    m0, m0
    movh        [dstq + dstStrideq], m0
    pextrd      [dstq + dstStrideq + 8], m0, 2

    ; both pointers are at row 2 here; step them to the next group
    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         quadsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_PS_W6_H4 6, 8

BLOCKCOPY_PS_W6_H4 6, 16
2228
;-----------------------------------------------------------------------------
; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends an 8x2 block of 8-bit pixels to int16_t.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movh        m0, [srcq]                          ; row 0: 8 pixels
    pmovzxbw    m0, m0
    movu        [dstq], m0                          ; 8 words

    movh        m0, [srcq + srcStrideq]             ; row 1
    pmovzxbw    m0, m0
    movu        [dstq + dstStrideq], m0
    RET
2246
;-----------------------------------------------------------------------------
; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends an 8x4 block of 8-bit pixels to int16_t.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movh        m0, [srcq]                          ; row 0
    pmovzxbw    m0, m0
    movu        [dstq], m0

    movh        m0, [srcq + srcStrideq]             ; row 1
    pmovzxbw    m0, m0
    movu        [dstq + dstStrideq], m0

    movh        m0, [srcq + 2 * srcStrideq]         ; row 2
    pmovzxbw    m0, m0
    movu        [dstq + 2 * dstStrideq], m0

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]

    movh        m0, [srcq + srcStrideq]             ; row 3
    pmovzxbw    m0, m0
    movu        [dstq + dstStrideq], m0
    RET
2275
;-----------------------------------------------------------------------------
; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends an 8x6 block of 8-bit pixels to int16_t.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes

    movh        m0, [srcq]                          ; row 0
    pmovzxbw    m0, m0
    movu        [dstq], m0

%rep 2
    ; rows 1-2, 3-4: copy two rows, then advance both pointers two rows
    movh        m0, [srcq + srcStrideq]
    pmovzxbw    m0, m0
    movu        [dstq + dstStrideq], m0

    movh        m0, [srcq + 2 * srcStrideq]
    pmovzxbw    m0, m0
    movu        [dstq + 2 * dstStrideq], m0

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]
%endrep

    movh        m0, [srcq + srcStrideq]             ; row 5
    pmovzxbw    m0, m0
    movu        [dstq + dstStrideq], m0
    RET
2315
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; SSE4 zero-extending copy of an 8-wide block of 8-bit pixels to int16_t,
; four rows per loop iteration.
;   %1 = block width (8), %2 = block height (multiple of 4)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W8_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride, quads
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    mov         quadsd, %2/4                        ; number of 4-row groups

.loop:
    movh        m0, [srcq]                          ; row 0
    pmovzxbw    m0, m0
    movu        [dstq], m0

    movh        m0, [srcq + srcStrideq]             ; row 1
    pmovzxbw    m0, m0
    movu        [dstq + dstStrideq], m0

    movh        m0, [srcq + 2 * srcStrideq]         ; row 2
    pmovzxbw    m0, m0
    movu        [dstq + 2 * dstStrideq], m0

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]

    movh        m0, [srcq + srcStrideq]             ; row 3
    pmovzxbw    m0, m0
    movu        [dstq + dstStrideq], m0

    ; both pointers are at row 2 here; step them to the next group
    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         quadsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_PS_W8_H4 8, 8
BLOCKCOPY_PS_W8_H4 8, 16
BLOCKCOPY_PS_W8_H4 8, 32

BLOCKCOPY_PS_W8_H4 8, 12
BLOCKCOPY_PS_W8_H4 8, 64
2361
2362
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; SSE4 zero-extending copy of a 12-wide block of 8-bit pixels to int16_t,
; two rows per loop iteration.
;   %1 = block width (12), %2 = block height (multiple of 2)
;
; A row is 12 bytes. Pixels 0-7 are widened straight from memory (8-byte read)
; and pixels 8-11 are fetched with a 4-byte movd, so exactly 12 source bytes
; are read per row and nothing beyond the row is touched.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W12_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 2, dst, dstStride, src, srcStride, pairs
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    mov         pairsd, %2/2                        ; number of row pairs

.loop:
    ; row 0
    pmovzxbw    m0, [srcq]                          ; pixels 0-7 -> 8 words
    movu        [dstq], m0
    movd        m1, [srcq + 8]                      ; pixels 8-11
    pmovzxbw    m1, m1
    movh        [dstq + 16], m1                     ; 4 words

    ; row 1
    pmovzxbw    m0, [srcq + srcStrideq]
    movu        [dstq + dstStrideq], m0
    movd        m1, [srcq + srcStrideq + 8]
    pmovzxbw    m1, m1
    movh        [dstq + dstStrideq + 16], m1

    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         pairsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_PS_W12_H2 12, 16

BLOCKCOPY_PS_W12_H2 12, 32
2399
;-----------------------------------------------------------------------------
; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends a 16x4 block of 8-bit pixels to int16_t. For each row the low
; eight pixels are widened with pmovzxbw and the high eight by interleaving
; with a zero register.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    pxor        m0, m0                              ; zero, used by punpckhbw

    ; row 0
    movu        m1, [srcq]
    pmovzxbw    m2, m1                              ; pixels 0-7
    movu        [dstq], m2
    punpckhbw   m1, m0                              ; pixels 8-15
    movu        [dstq + 16], m1

    ; row 1
    movu        m1, [srcq + srcStrideq]
    pmovzxbw    m2, m1
    movu        [dstq + dstStrideq], m2
    punpckhbw   m1, m0
    movu        [dstq + dstStrideq + 16], m1

    ; row 2
    movu        m1, [srcq + 2 * srcStrideq]
    pmovzxbw    m2, m1
    movu        [dstq + 2 * dstStrideq], m2
    punpckhbw   m1, m0
    movu        [dstq + 2 * dstStrideq + 16], m1

    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    ; row 3
    movu        m1, [srcq + srcStrideq]
    pmovzxbw    m2, m1
    movu        [dstq + dstStrideq], m2
    punpckhbw   m1, m0
    movu        [dstq + dstStrideq + 16], m1
    RET
2437
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; SSE4 zero-extending copy of a 16-wide block of 8-bit pixels to int16_t,
; four rows per loop iteration.
;   %1 = block width (16), %2 = block height (multiple of 4)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W16_H4 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride, quads
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    mov         quadsd, %2/4                        ; number of 4-row groups
    pxor        m0, m0                              ; zero, used by punpckhbw

.loop:
    ; row 0
    movu        m1, [srcq]
    pmovzxbw    m2, m1                              ; pixels 0-7
    movu        [dstq], m2
    punpckhbw   m1, m0                              ; pixels 8-15
    movu        [dstq + 16], m1

    ; row 1
    movu        m1, [srcq + srcStrideq]
    pmovzxbw    m2, m1
    movu        [dstq + dstStrideq], m2
    punpckhbw   m1, m0
    movu        [dstq + dstStrideq + 16], m1

    ; row 2
    movu        m1, [srcq + 2 * srcStrideq]
    pmovzxbw    m2, m1
    movu        [dstq + 2 * dstStrideq], m2
    punpckhbw   m1, m0
    movu        [dstq + 2 * dstStrideq + 16], m1

    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    ; row 3
    movu        m1, [srcq + srcStrideq]
    pmovzxbw    m2, m1
    movu        [dstq + dstStrideq], m2
    punpckhbw   m1, m0
    movu        [dstq + dstStrideq + 16], m1

    ; both pointers are at row 2 here; step them to the next group
    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         quadsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_PS_W16_H4 16, 8
BLOCKCOPY_PS_W16_H4 16, 12
BLOCKCOPY_PS_W16_H4 16, 16
BLOCKCOPY_PS_W16_H4 16, 32
BLOCKCOPY_PS_W16_H4 16, 64

BLOCKCOPY_PS_W16_H4 16, 24
2493
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; SSE4 zero-extending copy of a 24-wide block of 8-bit pixels to int16_t,
; two rows per loop iteration. Each row is read as 16 + 8 bytes.
;   %1 = block width (24), %2 = block height (multiple of 2)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W24_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride, pairs
    add         dstStrideq, dstStrideq              ; int16_t units -> bytes
    mov         pairsd, %2/2                        ; number of row pairs
    pxor        m0, m0                              ; zero, used by punpckhbw

.loop:
    ; row 0, pixels 0-15
    movu        m1, [srcq]
    pmovzxbw    m2, m1
    movu        [dstq], m2
    punpckhbw   m1, m0
    movu        [dstq + 16], m1

    ; row 0, pixels 16-23
    movh        m1, [srcq + 16]
    pmovzxbw    m1, m1
    movu        [dstq + 32], m1

    ; row 1, pixels 0-15
    movu        m1, [srcq + srcStrideq]
    pmovzxbw    m2, m1
    movu        [dstq + dstStrideq], m2
    punpckhbw   m1, m0
    movu        [dstq + dstStrideq + 16], m1

    ; row 1, pixels 16-23
    movh        m1, [srcq + srcStrideq + 16]
    pmovzxbw    m1, m1
    movu        [dstq + dstStrideq + 32], m1

    lea         dstq, [dstq + 2 * dstStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    dec         pairsd
    jnz         .loop

    RET
%endmacro

BLOCKCOPY_PS_W24_H2 24, 32

BLOCKCOPY_PS_W24_H2 24, 64
2538
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends a block of 8-bit source samples, 32 per row, into 16-bit words.
; Each row is handled as two 16-sample columns; two rows per loop iteration.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W32_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
    ; dst stride is doubled so it can be used as a byte offset into int16_t rows
    add         dstStrideq, dstStrideq
    pxor        m0, m0
    mov         r4d, %2/2

.loop:
    ; outer repetition = rows, inner repetition = 16-sample columns
%rep 2
%assign %%col 0
%rep 2
    movu        m1, [srcq + %%col]
    pmovzxbw    m2, m1
    movu        [dstq + 2 * %%col], m2
    punpckhbw   m1, m0
    movu        [dstq + 2 * %%col + 16], m1
%assign %%col %%col + 16
%endrep
    add         srcq, srcStrideq
    add         dstq, dstStrideq
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PS_W32_H2 32, 8
BLOCKCOPY_PS_W32_H2 32, 16
BLOCKCOPY_PS_W32_H2 32, 24
BLOCKCOPY_PS_W32_H2 32, 32
BLOCKCOPY_PS_W32_H2 32, 64

BLOCKCOPY_PS_W32_H2 32, 48
2591
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends a block of 8-bit source samples, 48 per row, into 16-bit words.
; Each row is handled as three 16-sample columns; two rows per loop iteration.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W48_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
    ; dst stride is doubled so it can be used as a byte offset into int16_t rows
    add         dstStrideq, dstStrideq
    pxor        m0, m0
    mov         r4d, %2/2

.loop:
    ; outer repetition = rows, inner repetition = 16-sample columns
%rep 2
%assign %%col 0
%rep 3
    movu        m1, [srcq + %%col]
    pmovzxbw    m2, m1
    movu        [dstq + 2 * %%col], m2
    punpckhbw   m1, m0
    movu        [dstq + 2 * %%col + 16], m1
%assign %%col %%col + 16
%endrep
    add         srcq, srcStrideq
    add         dstq, dstStrideq
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PS_W48_H2 48, 64
2650
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;
; Zero-extends a block of 8-bit source samples, 64 per row, into 16-bit words.
; Each row is handled as four 16-sample columns; two rows per loop iteration.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PS_W64_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
    ; dst stride is doubled so it can be used as a byte offset into int16_t rows
    add         dstStrideq, dstStrideq
    pxor        m0, m0
    mov         r4d, %2/2

.loop:
    ; outer repetition = rows, inner repetition = 16-sample columns
%rep 2
%assign %%col 0
%rep 4
    movu        m1, [srcq + %%col]
    pmovzxbw    m2, m1
    movu        [dstq + 2 * %%col], m2
    punpckhbw   m1, m0
    movu        [dstq + 2 * %%col + 16], m1
%assign %%col %%col + 16
%endrep
    add         srcq, srcStrideq
    add         dstq, dstStrideq
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_PS_W64_H2 64, 16
BLOCKCOPY_PS_W64_H2 64, 32
BLOCKCOPY_PS_W64_H2 64, 48
BLOCKCOPY_PS_W64_H2 64, 64
2724
;-----------------------------------------------------------------------------
; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies four rows of 4 bytes each (two 16-bit coefficients) through
; general-purpose registers. Both strides are doubled to become byte offsets.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x4, 4, 6, 0, dst, dstStride, src, srcStride, rowA, rowB
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq

    ; rows 0-1
    mov         rowAd, [srcq]
    mov         rowBd, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    mov         [dstq], rowAd
    mov         [dstq + dstStrideq], rowBd
    lea         dstq, [dstq + 2 * dstStrideq]

    ; rows 2-3
    mov         rowAd, [srcq]
    mov         rowBd, [srcq + srcStrideq]
    mov         [dstq], rowAd
    mov         [dstq + dstStrideq], rowBd
    RET
2747
;-----------------------------------------------------------------------------
; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies eight rows of 4 bytes each (two 16-bit coefficients) through
; general-purpose registers, two rows at a time, fully unrolled.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x8, 4, 6, 0, dst, dstStride, src, srcStride, rowA, rowB
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq

    ; rows 0-5: three row pairs, pointers advance after each pair
%rep 3
    mov         rowAd, [srcq]
    mov         rowBd, [srcq + srcStrideq]
    mov         [dstq], rowAd
    mov         [dstq + dstStrideq], rowBd
    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]
%endrep

    ; rows 6-7: last pair, no pointer update needed
    mov         rowAd, [srcq]
    mov         rowBd, [srcq + srcStrideq]
    mov         [dstq], rowAd
    mov         [dstq + dstStrideq], rowBd
    RET
2786
;-----------------------------------------------------------------------------
; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies sixteen rows of 4 bytes each (two 16-bit coefficients) through
; general-purpose registers, two rows per loop iteration.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_2x16, 4, 7, 0, dst, dstStride, src, srcStride, rowA, rowB, pairs
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    ; pairsd = row pairs still to copy
    mov         pairsd, 16/2

.loop:
    mov         rowAd, [srcq]
    mov         rowBd, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    mov         [dstq], rowAd
    mov         [dstq + dstStrideq], rowBd
    lea         dstq, [dstq + 2 * dstStrideq]
    dec         pairsd
    jnz         .loop
    RET
2805
2806
;-----------------------------------------------------------------------------
; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies two rows of 8 bytes each (four 16-bit coefficients).
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x2, 4, 4, 2, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq

    ; both rows are read before either is written
    movh        m0, [srcq]
    movh        m1, [srcq + srcStrideq]
    movh        [dstq], m0
    movh        [dstq + dstStrideq], m1
    RET
2822
;-----------------------------------------------------------------------------
; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies four rows of 8 bytes each (four 16-bit coefficients). All four rows
; are loaded into m0-m3 before any row is stored.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_4x4, 4, 4, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq

    ; load rows 0-3
    movh        m0, [srcq]
    movh        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movh        m2, [srcq]
    movh        m3, [srcq + srcStrideq]

    ; store rows 0-3
    movh        [dstq], m0
    movh        [dstq + dstStrideq], m1
    lea         dstq, [dstq + 2 * dstStrideq]
    movh        [dstq], m2
    movh        [dstq + dstStrideq], m3
    RET
2842
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies rows of 8 bytes each (four 16-bit coefficients).
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Eight rows are copied per loop iteration, as two groups of four rows.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W4_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    mov         r4d, %2/8

.loop:
    ; each repetition loads four rows, then stores them
%rep 2
    movh        m0, [srcq]
    movh        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movh        m2, [srcq]
    movh        m3, [srcq + srcStrideq]

    movh        [dstq], m0
    movh        [dstq + dstStrideq], m1
    lea         dstq, [dstq + 2 * dstStrideq]
    movh        [dstq], m2
    movh        [dstq + dstStrideq], m3

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W4_H8 4, 8
BLOCKCOPY_SS_W4_H8 4, 16

BLOCKCOPY_SS_W4_H8 4, 32
2890
;-----------------------------------------------------------------------------
; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies eight rows of 12 bytes each (six 16-bit coefficients).
; Every row is read as an 8-byte load plus a 4-byte load, so exactly 12 bytes
; are touched per source row and nothing past the end of a row is read
; (the same access pattern blockcopy_ss_6x16 uses).
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x8, 4, 4, 4
    ; strides are doubled to become byte offsets
    add         r1, r1
    add         r3, r3

    ; rows 0-5: three row pairs, pointers advance after each pair
%rep 3
    movh        m0, [r2]
    movd        m2, [r2 + 8]
    movh        m1, [r2 + r3]
    movd        m3, [r2 + r3 + 8]
    movh        [r0], m0
    movd        [r0 + 8], m2
    movh        [r0 + r1], m1
    movd        [r0 + r1 + 8], m3

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]
%endrep

    ; rows 6-7: last pair, no pointer update needed
    movh        m0, [r2]
    movd        m2, [r2 + 8]
    movh        m1, [r2 + r3]
    movd        m3, [r2 + r3 + 8]
    movh        [r0], m0
    movd        [r0 + 8], m2
    movh        [r0 + r1], m1
    movd        [r0 + r1 + 8], m3

    RET
2945
;-----------------------------------------------------------------------------
; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies sixteen rows of 12 bytes each (six 16-bit coefficients), two rows per
; loop iteration. Each row moves as an 8-byte part plus a 4-byte part.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_6x16, 4, 5, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    mov         r4d, 16/2

.loop:
    ; load two rows
    movh        m0, [srcq]
    movd        m2, [srcq + 8]
    movh        m1, [srcq + srcStrideq]
    movd        m3, [srcq + srcStrideq + 8]
    lea         srcq, [srcq + 2 * srcStrideq]

    ; store two rows
    movh        [dstq], m0
    movd        [dstq + 8], m2
    movh        [dstq + dstStrideq], m1
    movd        [dstq + dstStrideq + 8], m3
    lea         dstq, [dstq + 2 * dstStrideq]

    dec         r4d
    jnz         .loop
    RET
2968
2969
;-----------------------------------------------------------------------------
; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies two rows of 16 bytes each (eight 16-bit coefficients) with unaligned
; moves.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x2, 4, 4, 2, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq

    ; both rows are read before either is written
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    movu        [dstq], m0
    movu        [dstq + dstStrideq], m1
    RET
2985
;-----------------------------------------------------------------------------
; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies four rows of 16 bytes each (eight 16-bit coefficients). All four rows
; are loaded into m0-m3 before any row is stored.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x4, 4, 4, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq

    ; load rows 0-3
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m2, [srcq]
    movu        m3, [srcq + srcStrideq]

    ; store rows 0-3
    movu        [dstq], m0
    movu        [dstq + dstStrideq], m1
    lea         dstq, [dstq + 2 * dstStrideq]
    movu        [dstq], m2
    movu        [dstq + dstStrideq], m3
    RET
3006
;-----------------------------------------------------------------------------
; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies six rows of 16 bytes each (eight 16-bit coefficients): a group of
; four rows followed by a group of two rows.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x6, 4, 4, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq

    ; rows 0-3: load all four, then store all four
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m2, [srcq]
    movu        m3, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]

    movu        [dstq], m0
    movu        [dstq + dstStrideq], m1
    lea         dstq, [dstq + 2 * dstStrideq]
    movu        [dstq], m2
    movu        [dstq + dstStrideq], m3
    lea         dstq, [dstq + 2 * dstStrideq]

    ; rows 4-5
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    movu        [dstq], m0
    movu        [dstq + dstStrideq], m1
    RET
3035
;-----------------------------------------------------------------------------
; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies twelve rows of 16 bytes each (eight 16-bit coefficients), two rows
; per loop iteration.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_ss_8x12, 4, 5, 2, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    ; r4d = row pairs still to copy
    mov         r4d, 12/2

.loop:
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        [dstq], m0
    movu        [dstq + dstStrideq], m1
    lea         dstq, [dstq + 2 * dstStrideq]
    dec         r4d
    jnz         .loop
    RET
3054
3055
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies rows of 16 bytes each (eight 16-bit coefficients).
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Eight rows are copied per loop iteration, as two groups of four rows.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W8_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    mov         r4d, %2/8

.loop:
    ; each repetition loads four rows, then stores them
%rep 2
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    lea         srcq, [srcq + 2 * srcStrideq]
    movu        m2, [srcq]
    movu        m3, [srcq + srcStrideq]

    movu        [dstq], m0
    movu        [dstq + dstStrideq], m1
    lea         dstq, [dstq + 2 * dstStrideq]
    movu        [dstq], m2
    movu        [dstq + dstStrideq], m3

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W8_H8 8, 8
BLOCKCOPY_SS_W8_H8 8, 16
BLOCKCOPY_SS_W8_H8 8, 32

BLOCKCOPY_SS_W8_H8 8, 64
3106
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies rows of 24 bytes each (twelve 16-bit coefficients); every row moves
; as a 16-byte part plus an 8-byte part.
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Four rows are copied per loop iteration, as two row pairs.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W12_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    mov         r4d, %2/4

.loop:
    ; each repetition loads one row pair, then stores it
%rep 2
    movu        m0, [srcq]
    movh        m1, [srcq + 16]
    movu        m2, [srcq + srcStrideq]
    movh        m3, [srcq + srcStrideq + 16]

    movu        [dstq], m0
    movh        [dstq + 16], m1
    movu        [dstq + dstStrideq], m2
    movh        [dstq + dstStrideq + 16], m3

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W12_H4 12, 16

BLOCKCOPY_SS_W12_H4 12, 32
3150
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies rows of 32 bytes each (sixteen 16-bit coefficients) with xmm moves.
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Four rows are copied per loop iteration, as two row pairs.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    mov         r4d, %2/4

.loop:
    ; each repetition loads one row pair, then stores it
%rep 2
    movu        m0, [srcq]
    movu        m1, [srcq + 16]
    movu        m2, [srcq + srcStrideq]
    movu        m3, [srcq + srcStrideq + 16]

    movu        [dstq], m0
    movu        [dstq + 16], m1
    movu        [dstq + dstStrideq], m2
    movu        [dstq + dstStrideq + 16], m3

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H4 16, 4
BLOCKCOPY_SS_W16_H4 16, 12
3193
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; AVX variant: one ymm move covers a whole 32-byte row (sixteen 16-bit
; coefficients).
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Four rows are copied per loop iteration.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H4_avx 2
INIT_YMM avx
cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride, rows, srcStride3, dstStride3
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    ; precomputed offsets of the fourth row in each group
    lea         srcStride3q, [3 * srcStrideq]
    lea         dstStride3q, [3 * dstStrideq]
    mov         rowsd, %2/4

.loop:
    ; load four rows
    movu        m0, [srcq]
    movu        m1, [srcq + srcStrideq]
    movu        m2, [srcq + 2 * srcStrideq]
    movu        m3, [srcq + srcStride3q]
    lea         srcq, [srcq + 4 * srcStrideq]

    ; store four rows
    movu        [dstq], m0
    movu        [dstq + dstStrideq], m1
    movu        [dstq + 2 * dstStrideq], m2
    movu        [dstq + dstStride3q], m3
    lea         dstq, [dstq + 4 * dstStrideq]

    dec         rowsd
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H4_avx 16, 4
BLOCKCOPY_SS_W16_H4_avx 16, 12
BLOCKCOPY_SS_W16_H4_avx 16, 8
BLOCKCOPY_SS_W16_H4_avx 16, 16
BLOCKCOPY_SS_W16_H4_avx 16, 24
BLOCKCOPY_SS_W16_H4_avx 16, 32
BLOCKCOPY_SS_W16_H4_avx 16, 64
3230
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies rows of 32 bytes each (sixteen 16-bit coefficients) with xmm moves.
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Eight rows are copied per loop iteration, as four row pairs.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W16_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    mov         r4d, %2/8

.loop:
    ; each repetition loads one row pair, then stores it
%rep 4
    movu        m0, [srcq]
    movu        m1, [srcq + 16]
    movu        m2, [srcq + srcStrideq]
    movu        m3, [srcq + srcStrideq + 16]

    movu        [dstq], m0
    movu        [dstq + 16], m1
    movu        [dstq + dstStrideq], m2
    movu        [dstq + dstStrideq + 16], m3

    lea         srcq, [srcq + 2 * srcStrideq]
    lea         dstq, [dstq + 2 * dstStrideq]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H8 16, 8
BLOCKCOPY_SS_W16_H8 16, 16
BLOCKCOPY_SS_W16_H8 16, 32
BLOCKCOPY_SS_W16_H8 16, 64

BLOCKCOPY_SS_W16_H8 16, 24
3303
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies rows of 48 bytes each (twenty-four 16-bit coefficients) as three
; xmm moves per row.
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Four rows are copied per loop iteration, as two row pairs.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W24_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 6
    mov         r4d, %2/4
    ; strides are doubled to become byte offsets
    add         r1, r1
    add         r3, r3

    ; the label carries its colon so it is not reported as an orphan label
.loop:
    ; each repetition loads one row pair into m0-m5, then stores it
%rep 2
    movu        m0, [r2]
    movu        m1, [r2 + 16]
    movu        m2, [r2 + 32]
    movu        m3, [r2 + r3]
    movu        m4, [r2 + r3 + 16]
    movu        m5, [r2 + r3 + 32]

    movu        [r0], m0
    movu        [r0 + 16], m1
    movu        [r0 + 32], m2
    movu        [r0 + r1], m3
    movu        [r0 + r1 + 16], m4
    movu        [r0 + r1 + 32], m5

    lea         r2, [r2 + 2 * r3]
    lea         r0, [r0 + 2 * r1]
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W24_H4 24, 32

BLOCKCOPY_SS_W24_H4 24, 64
3355
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies rows of 64 bytes each (thirty-two 16-bit coefficients) as four xmm
; moves per row.
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Four rows are copied per loop iteration.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W32_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    mov         r4d, %2/4

.loop:
    ; one row per repetition, pointers advance after every row
%rep 4
    movu        m0, [srcq]
    movu        m1, [srcq + 16]
    movu        m2, [srcq + 32]
    movu        m3, [srcq + 48]

    movu        [dstq], m0
    movu        [dstq + 16], m1
    movu        [dstq + 32], m2
    movu        [dstq + 48], m3

    add         srcq, srcStrideq
    add         dstq, dstStrideq
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W32_H4 32, 8
BLOCKCOPY_SS_W32_H4 32, 16
BLOCKCOPY_SS_W32_H4 32, 24
BLOCKCOPY_SS_W32_H4 32, 32
BLOCKCOPY_SS_W32_H4 32, 64

BLOCKCOPY_SS_W32_H4 32, 48
3423
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies rows of 96 bytes each (forty-eight 16-bit coefficients) as six xmm
; moves per row.
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; NOTE: despite the H2 in the macro name, four rows are copied per loop
; iteration (the counter is height/4).
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W48_H2 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    mov         r4d, %2/4

.loop:
    ; one row per repetition, pointers advance after every row
%rep 4
    movu        m0, [srcq]
    movu        m1, [srcq + 16]
    movu        m2, [srcq + 32]
    movu        m3, [srcq + 48]
    movu        m4, [srcq + 64]
    movu        m5, [srcq + 80]

    movu        [dstq], m0
    movu        [dstq + 16], m1
    movu        [dstq + 32], m2
    movu        [dstq + 48], m3
    movu        [dstq + 64], m4
    movu        [dstq + 80], m5

    add         srcq, srcStrideq
    add         dstq, dstStrideq
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W48_H2 48, 64
3501
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; Copies rows of 128 bytes each (sixty-four 16-bit coefficients). Every row is
; moved as two 64-byte halves through m0-m3.
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Four rows are copied per loop iteration.
; Only four xmm registers are declared because only m0-m3 are used.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
    mov         r4d, %2/4
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq

.loop:
    ; outer repetition = rows, inner repetition = 64-byte halves of a row
%rep 4
%assign %%off 0
%rep 2
    movu        m0, [srcq + %%off]
    movu        m1, [srcq + %%off + 16]
    movu        m2, [srcq + %%off + 32]
    movu        m3, [srcq + %%off + 48]

    movu        [dstq + %%off], m0
    movu        [dstq + %%off + 16], m1
    movu        [dstq + %%off + 32], m2
    movu        [dstq + %%off + 48], m3
%assign %%off %%off + 64
%endrep
    add         srcq, srcStrideq
    add         dstq, dstStrideq
%endrep

    dec         r4d
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W64_H4 64, 16
BLOCKCOPY_SS_W64_H4 64, 32
BLOCKCOPY_SS_W64_H4 64, 48
BLOCKCOPY_SS_W64_H4 64, 64
3607
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;
; AVX variant: every 128-byte row (sixty-four 16-bit coefficients) is moved
; as four 32-byte ymm transfers.
; Macro arguments: 1 = width (only used in the symbol name), 2 = height.
; Four rows are copied per loop iteration.
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_SS_W64_H4_avx 2
INIT_YMM avx
cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride, rows, dstStride3, srcStride3
    mov         rowsd, %2/4
    ; strides are doubled to become byte offsets
    add         dstStrideq, dstStrideq
    add         srcStrideq, srcStrideq
    ; precomputed offsets of the fourth row in each group
    lea         dstStride3q, [3 * dstStrideq]
    lea         srcStride3q, [3 * srcStrideq]

.loop:
    ; row 0
    movu        m0, [srcq]
    movu        m1, [srcq + 32]
    movu        m2, [srcq + 64]
    movu        m3, [srcq + 96]
    movu        [dstq], m0
    movu        [dstq + 32], m1
    movu        [dstq + 64], m2
    movu        [dstq + 96], m3

    ; row 1
    movu        m0, [srcq + srcStrideq]
    movu        m1, [srcq + srcStrideq + 32]
    movu        m2, [srcq + srcStrideq + 64]
    movu        m3, [srcq + srcStrideq + 96]
    movu        [dstq + dstStrideq], m0
    movu        [dstq + dstStrideq + 32], m1
    movu        [dstq + dstStrideq + 64], m2
    movu        [dstq + dstStrideq + 96], m3

    ; row 2
    movu        m0, [srcq + 2 * srcStrideq]
    movu        m1, [srcq + 2 * srcStrideq + 32]
    movu        m2, [srcq + 2 * srcStrideq + 64]
    movu        m3, [srcq + 2 * srcStrideq + 96]
    movu        [dstq + 2 * dstStrideq], m0
    movu        [dstq + 2 * dstStrideq + 32], m1
    movu        [dstq + 2 * dstStrideq + 64], m2
    movu        [dstq + 2 * dstStrideq + 96], m3

    ; row 3, with the source pointer advanced once its loads are done
    movu        m0, [srcq + srcStride3q]
    movu        m1, [srcq + srcStride3q + 32]
    movu        m2, [srcq + srcStride3q + 64]
    movu        m3, [srcq + srcStride3q + 96]
    lea         srcq, [srcq + 4 * srcStrideq]
    movu        [dstq + dstStride3q], m0
    movu        [dstq + dstStride3q + 32], m1
    movu        [dstq + dstStride3q + 64], m2
    movu        [dstq + dstStride3q + 96], m3
    lea         dstq, [dstq + 4 * dstStrideq]

    dec         rowsd
    jnz         .loop
    RET
%endmacro

BLOCKCOPY_SS_W64_H4_avx 64, 16
BLOCKCOPY_SS_W64_H4_avx 64, 32
BLOCKCOPY_SS_W64_H4_avx 64, 48
BLOCKCOPY_SS_W64_H4_avx 64, 64
3671
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;
; 4x4 variant. Reads four strided rows of four 16-bit values, adds a rounding
; term and arithmetic-shifts each value right by shift, then writes the
; result as 32 contiguous bytes at dst.
; dst is written with aligned stores (mova); src is read 8 bytes at a time.
; The rounding term equals 1 << (shift - 1) for shift >= 1.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shr_4, 3, 4, 4
    ; srcStride is intptr_t: double it at full register width so that a
    ; negative or large stride is not truncated to 32 bits
    add         r2, r2
    movd        m0, r3m
    ; m1 = (0xFFFF << shift) >> 1 (arithmetic) in every word, i.e. -round
    pcmpeqw     m1, m1
    psllw       m1, m0
    psraw       m1, 1

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride (in bytes)
    ; m0 - shift
    ; m1 - word [-round]

    ; Row 0-3: two rows are packed into each xmm register
    movh        m2, [r1]
    movhps      m2, [r1 + r2]
    lea         r1, [r1 + r2 * 2]
    movh        m3, [r1]
    movhps      m3, [r1 + r2]
    ; subtracting -round adds the rounding term
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, m0
    psraw       m3, m0
    mova        [r0 + 0 * mmsize], m2
    mova        [r0 + 1 * mmsize], m3
    RET
3703
3704
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;
; 8x8 variant. Reads strided rows of eight 16-bit values, adds a rounding
; term and arithmetic-shifts each value right by shift, then writes the rows
; back to back at dst. Four rows are processed per loop iteration.
; Both src rows and dst are accessed with aligned moves (mova).
; The rounding term equals 1 << (shift - 1) for shift >= 1.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shr_8, 3, 5, 4
    ; srcStride is intptr_t: double it at full register width so that a
    ; negative or large stride is not truncated to 32 bits
    add         r2, r2
    movd        m0, r3m
    ; m1 = (0xFFFF << shift) >> 1 (arithmetic) in every word, i.e. -round
    pcmpeqw     m1, m1
    psllw       m1, m0
    psraw       m1, 1
    ; r3 is free again once shift has been moved into m0
    mov         r3d, 8/4
    lea         r4, [r2 * 3]

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride (in bytes)
    ; r3 - loop counter
    ; r4 - stride * 3
    ; m0 - shift
    ; m1 - word [-round]

.loop:
    ; Row 0-1
    mova        m2, [r1]
    mova        m3, [r1 + r2]
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, m0
    psraw       m3, m0
    mova        [r0 + 0 * mmsize], m2
    mova        [r0 + 1 * mmsize], m3

    ; Row 2-3
    mova        m2, [r1 + r2 * 2]
    mova        m3, [r1 + r4]
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, m0
    psraw       m3, m0
    mova        [r0 + 2 * mmsize], m2
    mova        [r0 + 3 * mmsize], m3

    add         r0, 4 * mmsize
    lea         r1, [r1 + r2 * 4]
    dec         r3d
    jnz         .loop
    RET
3753
3754
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;
; 16x16 variant. Reads strided rows of sixteen 16-bit values, adds a rounding
; term and arithmetic-shifts each value right by shift, then writes the rows
; back to back at dst. Two rows are processed per loop iteration.
; Both src rows and dst are accessed with aligned moves (mova).
; The rounding term equals 1 << (shift - 1) for shift >= 1.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shr_16, 3, 4, 4
    ; srcStride is intptr_t: double it at full register width so that a
    ; negative or large stride is not truncated to 32 bits
    add         r2, r2
    movd        m0, r3m
    ; m1 = (0xFFFF << shift) >> 1 (arithmetic) in every word, i.e. -round
    pcmpeqw     m1, m1
    psllw       m1, m0
    psraw       m1, 1
    ; r3 is free again once shift has been moved into m0
    mov         r3d, 16/2

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride (in bytes)
    ; r3 - loop counter
    ; m0 - shift
    ; m1 - word [-round]

.loop:
    ; Row 0
    mova        m2, [r1 + 0 * mmsize]
    mova        m3, [r1 + 1 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, m0
    psraw       m3, m0
    mova        [r0 + 0 * mmsize], m2
    mova        [r0 + 1 * mmsize], m3

    ; Row 1
    mova        m2, [r1 + r2 + 0 * mmsize]
    mova        m3, [r1 + r2 + 1 * mmsize]
    psubw       m2, m1
    psubw       m3, m1
    psraw       m2, m0
    psraw       m3, m0
    mova        [r0 + 2 * mmsize], m2
    mova        [r0 + 3 * mmsize], m3

    add         r0, 4 * mmsize
    lea         r1, [r1 + r2 * 2]
    dec         r3d
    jnz         .loop
    RET
3801
3802
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;
; 32x32 variant.  One source row (32 words, four XMM vectors) per iteration.
; Every word becomes (x + round) >> shift (arithmetic), round = 1 << (shift - 1),
; and rows are written contiguously to dst.  Every access is an aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shr_32, 3, 4, 6
    movd        m0, r3m                 ; m0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes
    pcmpeqw     m1, m1                  ; all words = -1
    psllw       m1, m0                  ; all words = -(1 << shift)
    psraw       m1, 1                   ; all words = -round
    mov         r3d, 32 / 1             ; r3 = iterations, 1 row each

    ; r0 = dst, r1 = src, r2 = stride, r3 = counter
    ; m0 = shift, m1 = -round

.loop:
    mova        m2, [r1]
    mova        m3, [r1 + mmsize]
    mova        m4, [r1 + 2 * mmsize]
    mova        m5, [r1 + 3 * mmsize]
    psubw       m2, m1
    psraw       m2, m0
    psubw       m3, m1
    psraw       m3, m0
    psubw       m4, m1
    psraw       m4, m0
    psubw       m5, m1
    psraw       m5, m0
    mova        [r0], m2
    mova        [r0 + mmsize], m3
    mova        [r0 + 2 * mmsize], m4
    mova        [r0 + 3 * mmsize], m5

    add         r1, r2                  ; next source row
    add         r0, 4 * mmsize          ; dst is packed
    dec         r3d
    jnz         .loop
    RET
3847
3848
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;
; 4x4 variant (SSE2).  Loads the 16 packed source words with two aligned
; loads, shifts every word left by `shift`, and scatters the four 4-word rows
; to dst at dstStride spacing.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_4, 3, 3, 3
    movd        m0, r3m                 ; m0 = shift
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes

    mova        m1, [r1]                ; rows 0-1
    psllw       m1, m0
    mova        m2, [r1 + mmsize]       ; rows 2-3
    psllw       m2, m0

    movh        [r0], m1                ; row 0 = low half
    movhps      [r0 + r2], m1           ; row 1 = high half
    movh        [r0 + r2 * 2], m2       ; row 2
    lea         r2, [r2 * 3]            ; stride no longer needed: turn it into 3 * stride
    movhps      [r0 + r2], m2           ; row 3
    RET
3868
3869
; cpy1Dto2D_shl, 4x4 variant (AVX2): the whole 16-word block fits in one YMM
; register, loaded unaligned, shifted left by `shift`, and written out as
; four 8-byte rows at dstStride spacing.
INIT_YMM avx2
cglobal cpy1Dto2D_shl_4, 3, 3, 2
    movd        xm0, r3m                ; xm0 = shift
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes

    movu        m1, [r1]                ; rows 0-3
    psllw       m1, xm0
    vextracti128 xm0, m1, 1             ; shift is done with: xm0 = rows 2-3

    movq        [r0], xm1               ; row 0
    movhps      [r0 + r2], xm1          ; row 1
    lea         r0, [r0 + r2 * 2]       ; dst += 2 rows
    movq        [r0], xm0               ; row 2
    movhps      [r0 + r2], xm0          ; row 3
    RET
3885
3886
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;
; 8x8 variant (SSE2).  Each packed source row is one XMM vector; rows are
; shifted left by `shift` and stored at dstStride spacing, four rows per
; group.  Every access is an aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_8, 3, 4, 5
    movd        m0, r3m                 ; m0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    lea         r3, [r2 * 3]            ; r3 = 3 * stride

    ; rows 0-3
    mova        m1, [r1]
    psllw       m1, m0
    mova        m2, [r1 + mmsize]
    psllw       m2, m0
    mova        m3, [r1 + 2 * mmsize]
    psllw       m3, m0
    mova        m4, [r1 + 3 * mmsize]
    psllw       m4, m0
    mova        [r0], m1
    mova        [r0 + r2], m2
    mova        [r0 + r2 * 2], m3
    mova        [r0 + r3], m4

    ; rows 4-7
    mova        m1, [r1 + 4 * mmsize]
    psllw       m1, m0
    mova        m2, [r1 + 5 * mmsize]
    psllw       m2, m0
    mova        m3, [r1 + 6 * mmsize]
    psllw       m3, m0
    mova        m4, [r1 + 7 * mmsize]
    psllw       m4, m0
    lea         r0, [r0 + r2 * 4]       ; dst += 4 rows
    mova        [r0], m1
    mova        [r0 + r2], m2
    mova        [r0 + r2 * 2], m3
    mova        [r0 + r3], m4
    RET
3925
3926
; cpy1Dto2D_shl, 8x8 variant (AVX2): every YMM load carries two packed source
; rows; after the left shift the low lane is stored to one dst row and the
; high lane to the next.
INIT_YMM avx2
cglobal cpy1Dto2D_shl_8, 3, 4, 3
    movd        xm0, r3m                ; xm0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    lea         r3, [r2 * 3]            ; r3 = 3 * stride

    ; rows 0-3
    movu        m1, [r1]
    psllw       m1, xm0
    movu        m2, [r1 + mmsize]
    psllw       m2, xm0
    movu        [r0], xm1               ; row 0
    vextracti128 [r0 + r2], m1, 1       ; row 1
    movu        [r0 + r2 * 2], xm2      ; row 2
    vextracti128 [r0 + r3], m2, 1       ; row 3

    ; rows 4-7
    movu        m1, [r1 + 2 * mmsize]
    psllw       m1, xm0
    movu        m2, [r1 + 3 * mmsize]
    psllw       m2, xm0
    lea         r0, [r0 + r2 * 4]       ; dst += 4 rows
    movu        [r0], xm1               ; row 4
    vextracti128 [r0 + r2], m1, 1       ; row 5
    movu        [r0 + r2 * 2], xm2      ; row 6
    vextracti128 [r0 + r3], m2, 1       ; row 7
    RET
3954
b53f7c52 3955
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;
; 16x16 variant (SSE2).  A row is 16 words (two XMM vectors).  Each loop
; iteration consumes four packed source rows, shifts them left by `shift`,
; and stores them at dstStride spacing.  Every access is an aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_16, 3, 4, 5
    movd        m0, r3m                 ; m0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    mov         r3d, 16 / 4             ; r3 = iterations, 4 rows each

.loop:
    ; rows 0-1
    mova        m1, [r1]
    psllw       m1, m0
    mova        m2, [r1 + mmsize]
    psllw       m2, m0
    mova        m3, [r1 + 2 * mmsize]
    psllw       m3, m0
    mova        m4, [r1 + 3 * mmsize]
    psllw       m4, m0
    mova        [r0], m1
    mova        [r0 + mmsize], m2
    mova        [r0 + r2], m3
    mova        [r0 + r2 + mmsize], m4

    ; rows 2-3
    mova        m1, [r1 + 4 * mmsize]
    psllw       m1, m0
    mova        m2, [r1 + 5 * mmsize]
    psllw       m2, m0
    mova        m3, [r1 + 6 * mmsize]
    psllw       m3, m0
    mova        m4, [r1 + 7 * mmsize]
    psllw       m4, m0
    lea         r0, [r0 + r2 * 2]       ; dst += 2 rows
    mova        [r0], m1
    mova        [r0 + mmsize], m2
    mova        [r0 + r2], m3
    mova        [r0 + r2 + mmsize], m4

    lea         r0, [r0 + r2 * 2]       ; dst += 2 rows
    add         r1, 8 * mmsize          ; src is packed
    dec         r3d
    jnz         .loop
    RET
4000
4001
; cpy1Dto2D_shl, 16x16 variant (AVX2): one YMM register per 16-word row,
; four rows per loop iteration, each shifted left by `shift` and stored at
; dstStride spacing.
INIT_YMM avx2
cglobal cpy1Dto2D_shl_16, 3, 5, 3
    movd        xm0, r3m                ; xm0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    lea         r4, [r2 * 3]            ; r4 = 3 * stride
    mov         r3d, 16 / 4             ; r3 = iterations, 4 rows each

.loop:
    ; rows 0-1
    movu        m1, [r1]
    psllw       m1, xm0
    movu        m2, [r1 + mmsize]
    psllw       m2, xm0
    movu        [r0], m1
    movu        [r0 + r2], m2

    ; rows 2-3
    movu        m1, [r1 + 2 * mmsize]
    psllw       m1, xm0
    movu        m2, [r1 + 3 * mmsize]
    psllw       m2, xm0
    movu        [r0 + r2 * 2], m1
    movu        [r0 + r4], m2

    lea         r0, [r0 + r2 * 4]       ; dst += 4 rows
    add         r1, 4 * mmsize          ; src is packed
    dec         r3d
    jnz         .loop
    RET
4031
4032
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;
; 32x32 variant (SSE2).  A row is 32 words (four XMM vectors).  Each loop
; iteration shifts two packed source rows left by `shift` and stores them at
; dstStride spacing.  Every access is an aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_32, 3, 4, 5
    movd        m0, r3m                 ; m0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    mov         r3d, 32 / 2             ; r3 = iterations, 2 rows each

.loop:
    ; first row of the pair
    mova        m1, [r1]
    psllw       m1, m0
    mova        m2, [r1 + mmsize]
    psllw       m2, m0
    mova        m3, [r1 + 2 * mmsize]
    psllw       m3, m0
    mova        m4, [r1 + 3 * mmsize]
    psllw       m4, m0
    mova        [r0], m1
    mova        [r0 + mmsize], m2
    mova        [r0 + 2 * mmsize], m3
    mova        [r0 + 3 * mmsize], m4

    ; second row of the pair
    mova        m1, [r1 + 4 * mmsize]
    psllw       m1, m0
    mova        m2, [r1 + 5 * mmsize]
    psllw       m2, m0
    mova        m3, [r1 + 6 * mmsize]
    psllw       m3, m0
    mova        m4, [r1 + 7 * mmsize]
    psllw       m4, m0
    mova        [r0 + r2], m1
    mova        [r0 + r2 + mmsize], m2
    mova        [r0 + r2 + 2 * mmsize], m3
    mova        [r0 + r2 + 3 * mmsize], m4

    lea         r0, [r0 + r2 * 2]       ; dst += 2 rows
    add         r1, 8 * mmsize          ; src is packed
    dec         r3d
    jnz         .loop
    RET
4076
4077
; cpy1Dto2D_shl, 32x32 variant (AVX2): a row is 32 words (two YMM vectors).
; Each loop iteration shifts two packed source rows left by `shift` and
; stores them at dstStride spacing.
INIT_YMM avx2
cglobal cpy1Dto2D_shl_32, 3, 4, 5
    movd        xm0, r3m                ; xm0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    mov         r3d, 32 / 2             ; r3 = iterations, 2 rows each

.loop:
    movu        m1, [r1]                ; row 0, left half
    psllw       m1, xm0
    movu        m2, [r1 + mmsize]       ; row 0, right half
    psllw       m2, xm0
    movu        m3, [r1 + 2 * mmsize]   ; row 1, left half
    psllw       m3, xm0
    movu        m4, [r1 + 3 * mmsize]   ; row 1, right half
    psllw       m4, xm0
    movu        [r0], m1
    movu        [r0 + mmsize], m2
    movu        [r0 + r2], m3
    movu        [r0 + r2 + mmsize], m4

    lea         r0, [r0 + r2 * 2]       ; dst += 2 rows
    add         r1, 4 * mmsize          ; src is packed
    dec         r3d
    jnz         .loop
    RET
4104
4105
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;
; 4x4 variant.  Copies four strided rows of 4 words into the packed dst block
; and returns in eax how many of the 16 words are non-zero.
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_4, 3,3,3
    pxor        m2, m2                  ; m2 = 0, used for the compare and for psadbw
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes

    ; rows 0 and 1
    movh        m0, [r1]
    movhps      m0, [r1 + r2]
    mova        [r0], m0

    ; rows 2 and 3
    movh        m1, [r1 + r2 * 2]
    lea         r2, [r2 * 3]            ; stride is no longer needed: make it 3 * stride
    movhps      m1, [r1 + r2]
    mova        [r0 + mmsize], m1

    ; signed saturation keeps non-zero words non-zero, so a byte is 0 exactly
    ; when its word was 0; pcmpeqb then yields 0xFF (-1) for zeros, 0 otherwise
    packsswb    m0, m1
    pcmpeqb     m0, m2

    ; reduce to a count
    ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
%if 0
    pmovmskb    eax, m0
    not         ax
    popcnt      ax, ax
%else
    paddb       m0, [pb_1]              ; presumably bytes of 1 (external): zero -> 0, non-zero -> 1
    psadbw      m0, m2                  ; byte sums of each 8-byte half
    pshufd      m1, m0, 2               ; bring the upper-half sum down to dword 0
    paddw       m0, m1
    movd        eax, m0
%endif
    RET
4143
4144
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;
; 8x8 variant (SSE).  Copies eight strided rows of 8 words into the packed dst
; block and returns in eax how many of the 64 words are non-zero.
;
; Counting: each row pair is packed to bytes (a byte is 0 exactly when its
; word was 0) and compared with zero, giving -1 per zero word.  Four such
; masks are accumulated in m5, then pb_4 is added so each byte lane holds its
; number of non-zero words, and psadbw sums the lanes.
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_8, 3,3,6
    pxor        m4, m4                  ; m4 = 0
    pxor        m5, m5                  ; m5 = per-lane accumulator of -zeros
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes

    ; rows 0 and 1
    movu        m0, [r1]
    movu        m1, [r1 + r2]
    movu        [r0], m0
    movu        [r0 + 16], m1
    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; rows 2 and 3
    lea         r1, [r1 + 2 * r2]
    movu        m0, [r1]
    movu        m1, [r1 + r2]
    movu        [r0 + 32], m0
    movu        [r0 + 48], m1
    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; rows 4 and 5
    lea         r1, [r1 + 2 * r2]
    movu        m0, [r1]
    movu        m1, [r1 + r2]
    movu        [r0 + 64], m0
    movu        [r0 + 80], m1
    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; rows 6 and 7
    lea         r1, [r1 + 2 * r2]
    movu        m0, [r1]
    movu        m1, [r1 + r2]
    movu        [r0 + 96], m0
    movu        [r0 + 112], m1
    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; reduce to a count
    paddb       m5, [pb_4]              ; presumably bytes of 4 (external): lane = 4 - zeros
    psadbw      m5, m4                  ; byte sums of each 8-byte half
    pshufd      m0, m5, 2               ; bring the upper-half sum down to dword 0
    paddw       m5, m0
    movd        eax, m5
    RET
4205
4206
; copy_cnt, 8x8 variant (AVX2): copies eight strided rows of 8 words into the
; packed dst block, two rows per YMM register, and returns in eax how many of
; the 64 words are non-zero.  pminub against pb_1 turns every non-zero packed
; byte into 1, so summing the bytes gives the count.
; NOTE(review): pb_1 is read as a 32-byte operand here - confirm the external
; constant is at least that wide.
INIT_YMM avx2
cglobal copy_cnt_8, 3,4,5
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes
    lea         r3, [r2 * 3]            ; r3 = 3 * stride
    pxor        m4, m4                  ; m4 = 0 for psadbw

    ; rows 0-1
    movu        xm0, [r1]
    vinserti128 m0, m0, [r1 + r2], 1
    movu        [r0], m0

    ; rows 2-3
    movu        xm1, [r1 + r2 * 2]
    vinserti128 m1, m1, [r1 + r3], 1
    movu        [r0 + 32], m1
    lea         r1, [r1 + r2 * 4]       ; src += 4 rows

    ; rows 4-5
    movu        xm2, [r1]
    vinserti128 m2, m2, [r1 + r2], 1
    movu        [r0 + 64], m2

    ; rows 6-7
    movu        xm3, [r1 + r2 * 2]
    vinserti128 m3, m3, [r1 + r3], 1
    movu        [r0 + 96], m3

    ; reduce to a count (lane order after the pack is irrelevant for a sum)
    packsswb    m0, m1
    pminub      m0, [pb_1]              ; 1 per non-zero word
    packsswb    m2, m3
    pminub      m2, [pb_1]
    paddb       m0, m2
    vextracti128 xm1, m0, 1
    paddb       xm0, xm1                ; fold the two 128-bit lanes
    psadbw      xm0, xm4                ; byte sums of each 8-byte half
    movhlps     xm1, xm0
    paddd       xm0, xm1
    movd        eax, xm0
    RET
4247
4248
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
;
; 16x16 variant (SSE).  Copies sixteen strided rows of 16 words into the
; packed dst block and returns in eax how many of the 256 words are non-zero.
;
; Counting: each row is packed to bytes (a byte is 0 exactly when its word
; was 0) and compared with zero, giving -1 per zero word.  Sixteen masks
; (4 iterations x 4 rows) are accumulated in m5, then pb_16 is added so each
; byte lane holds its number of non-zero words, and psadbw sums the lanes.
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_16, 3,4,6
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes
    mov         r3d, 4                  ; r3 = iterations, 4 rows each
    pxor        m4, m4                  ; m4 = 0
    pxor        m5, m5                  ; m5 = per-lane accumulator of -zeros

.loop:
    ; row 0
    movu        m0, [r1]
    movu        m1, [r1 + 16]
    movu        [r0], m0
    movu        [r0 + 16], m1

    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; row 1
    movu        m0, [r1 + r2]
    movu        m1, [r1 + r2 + 16]
    movu        [r0 + 32], m0
    movu        [r0 + 48], m1

    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; row 2
    movu        m0, [r1 + 2 * r2]
    movu        m1, [r1 + 2 * r2 + 16]
    movu        [r0 + 64], m0
    movu        [r0 + 80], m1

    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; row 3 (src is first advanced by two rows, so [r1 + r2] is row 3)
    lea         r1, [r1 + 2 * r2]
    movu        m0, [r1 + r2]
    movu        m1, [r1 + r2 + 16]
    movu        [r0 + 96], m0
    movu        [r0 + 112], m1

    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    add         r0, 128                 ; dst is packed: 4 rows x 32 bytes
    lea         r1, [r1 + 2 * r2]       ; src now points at row 4
    dec         r3d
    jnz         .loop

    ; reduce to a count
    mova        m0, [pb_16]             ; presumably bytes of 16 (external)
    paddb       m5, m0                  ; lane = 16 - zeros
    psadbw      m5, m4                  ; byte sums of each 8-byte half
    pshufd      m0, m5, 2               ; bring the upper-half sum down to dword 0
    paddw       m5, m0
    movd        eax, m5
    RET
4313
4314
; copy_cnt, 16x16 variant (AVX2): copies sixteen strided rows of 16 words
; (one YMM each) into the packed dst block and returns in eax how many of the
; 256 words are non-zero.  Packed bytes are clamped to 0/1 with pminub and
; accumulated per byte lane in m4.
INIT_YMM avx2
cglobal copy_cnt_16, 3, 5, 5
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes
    lea         r3, [r2 * 3]            ; r3 = 3 * stride
    mov         r4d, 16 / 4             ; r4 = iterations, 4 rows each

    pxor        m4, m4                  ; m4 = per-lane non-zero counters
    mova        m3, [pb_1]              ; NOTE(review): needs 32 aligned bytes of 1 - confirm

.loop:
    ; rows 0-1
    movu        m0, [r1]
    movu        [r0], m0
    movu        m1, [r1 + r2]
    movu        [r0 + 32], m1
    packsswb    m0, m1
    pminub      m0, m3                  ; 1 per non-zero word

    ; rows 2-3
    movu        m1, [r1 + r2 * 2]
    movu        [r0 + 64], m1
    movu        m2, [r1 + r3]
    movu        [r0 + 96], m2
    packsswb    m1, m2
    pminub      m1, m3

    paddb       m0, m1
    paddb       m4, m0

    lea         r1, [r1 + 4 * r2]       ; src += 4 rows
    add         r0, 128                 ; dst is packed: 4 rows x 32 bytes
    dec         r4d
    jnz         .loop

    ; reduce to a count
    pxor        m0, m0
    vextracti128 xm1, m4, 1
    paddb       xm4, xm1                ; fold the two 128-bit lanes
    psadbw      xm4, xm0                ; byte sums of each 8-byte half
    movhlps     xm1, xm4
    paddd       xm4, xm1
    movd        eax, xm4
    RET
4359
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t stride);
;
; 32x32 variant (SSE).  Copies thirty-two strided rows of 32 words into the
; packed dst block (64 bytes per row, i.e. 16-bit elements like src) and
; returns in eax how many of the 1024 words are non-zero.
;
; Counting: each half row is packed to bytes (a byte is 0 exactly when its
; word was 0) and compared with zero, giving -1 per zero word.  Sixty-four
; masks (16 iterations x 4 half rows) are accumulated in m5, then pb_64 is
; added so each byte lane holds its number of non-zero words, and psadbw sums
; the lanes.
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_32, 3,4,6
    add         r2d, r2d                ; stride: int16_t elements -> bytes
    mov         r3d, 16                 ; r3 = iterations, 2 rows each
    pxor        m4, m4                  ; m4 = 0
    pxor        m5, m5                  ; m5 = per-lane accumulator of -zeros

.loop:
    ; row 0, words 0-15
    movu        m0, [r1]
    movu        m1, [r1 + 16]
    movu        [r0], m0
    movu        [r0 + 16], m1

    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; row 0, words 16-31
    movu        m0, [r1 + 32]
    movu        m1, [r1 + 48]
    movu        [r0 + 32], m0
    movu        [r0 + 48], m1

    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; row 1, words 0-15
    movu        m0, [r1 + r2]
    movu        m1, [r1 + r2 + 16]
    movu        [r0 + 64], m0
    movu        [r0 + 80], m1

    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    ; row 1, words 16-31
    movu        m0, [r1 + r2 + 32]
    movu        m1, [r1 + r2 + 48]
    movu        [r0 + 96], m0
    movu        [r0 + 112], m1

    packsswb    m0, m1
    pcmpeqb     m0, m4
    paddb       m5, m0

    add         r0, 128                 ; dst is packed: 2 rows x 64 bytes
    lea         r1, [r1 + 2 * r2]       ; src += 2 rows
    dec         r3d
    jnz         .loop

    ; reduce to a count
    mova        m0, [pb_64]             ; presumably bytes of 64 (external)
    paddb       m5, m0                  ; lane = 64 - zeros
    psadbw      m5, m4                  ; byte sums of each 8-byte half
    pshufd      m0, m5, 2               ; bring the upper-half sum down to dword 0
    paddw       m5, m0
    movd        eax, m5
    RET
4422
4423
; copy_cnt, 32x32 variant (AVX2): copies thirty-two strided rows of 32 words
; (two YMM vectors each) into the packed dst block and returns in eax how
; many of the 1024 words are non-zero.  Packed bytes are clamped to 0/1 with
; pminub and accumulated per byte lane in m4.
;
; Only r0-r3 are used, so four GPRs are declared; asking for a fifth made
; x86inc save and restore an extra callee-saved register on x86-32.
INIT_YMM avx2
cglobal copy_cnt_32, 3, 4, 5
    add         r2d, r2d                ; stride: int16_t elements -> bytes
    mov         r3d, 32/2               ; r3 = iterations, 2 rows each

    mova        m3, [pb_1]              ; NOTE(review): needs 32 aligned bytes of 1 - confirm
    xorpd       m4, m4                  ; m4 = per-lane non-zero counters

.loop:
    ; row 0
    movu        m0, [r1]
    movu        [r0], m0
    movu        m1, [r1 + 32]
    movu        [r0 + 32], m1

    packsswb    m0, m1
    pminub      m0, m3                  ; 1 per non-zero word

    ; row 1
    movu        m1, [r1 + r2]
    movu        [r0 + 64], m1
    movu        m2, [r1 + r2 + 32]
    movu        [r0 + 96], m2

    packsswb    m1, m2
    pminub      m1, m3
    paddb       m0, m1
    paddb       m4, m0

    add         r0, 128                 ; dst is packed: 2 rows x 64 bytes
    lea         r1, [r1 + 2 * r2]       ; src += 2 rows
    dec         r3d
    jnz         .loop

    ; reduce to a count
    xorpd       m0, m0
    vextracti128 xm1, m4, 1
    paddb       xm4, xm1                ; fold the two 128-bit lanes
    psadbw      xm4, xm0                ; byte sums of each 8-byte half
    movhlps     xm1, xm4
    paddd       xm4, xm1
    movd        eax, xm4
    RET
4467
72b9787e 4468
b53f7c52
JB
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;
; 4x4 variant.  Gathers four strided rows of 4 words, shifts every word left
; by `shift`, and stores the 16 results contiguously with aligned stores.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_4, 4, 4, 4
    movd        m0, r3d                 ; m0 = shift
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes

    ; r0 = dst, r1 = src, r2 = stride, m0 = shift

    movh        m2, [r1]                ; row 0
    movhps      m2, [r1 + r2]           ; row 1
    lea         r1, [r1 + r2 * 2]       ; src += 2 rows
    movh        m3, [r1]                ; row 2
    movhps      m3, [r1 + r2]           ; row 3
    psllw       m2, m0
    psllw       m3, m0
    mova        [r0], m2
    mova        [r0 + mmsize], m3
    RET
72b9787e 4495
b53f7c52
JB
4496
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;
; 8x8 variant.  Reads eight strided rows of 8 words, shifts every word left
; by `shift`, and stores the rows back to back in dst.  Every access is an
; aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_8, 4, 5, 4
    movd        m0, r3d                 ; m0 = shift (taken before r3 is recycled)
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes
    lea         r4, [r2 * 3]            ; r4 = 3 * stride
    mov         r3d, 8 / 4              ; r3 = iterations, 4 rows each

    ; r0 = dst, r1 = src, r2 = stride, r3 = counter, r4 = 3 * stride, m0 = shift

.loop:
    ; source rows 0 and 1
    mova        m2, [r1]
    psllw       m2, m0
    mova        m3, [r1 + r2]
    psllw       m3, m0
    mova        [r0], m2
    mova        [r0 + mmsize], m3

    ; source rows 2 and 3
    mova        m2, [r1 + r2 * 2]
    psllw       m2, m0
    mova        m3, [r1 + r4]
    psllw       m3, m0
    mova        [r0 + 2 * mmsize], m2
    mova        [r0 + 3 * mmsize], m3

    lea         r1, [r1 + r2 * 4]       ; next four source rows
    add         r0, 4 * mmsize          ; dst is packed
    dec         r3d
    jnz         .loop
    RET
72b9787e 4537
72b9787e 4538
b53f7c52
JB
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;
; 16x16 variant.  Each source row is 16 words (two XMM vectors); every word is
; shifted left by `shift` and the rows are stored contiguously in dst.  Every
; access is an aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_16, 4, 4, 4
    movd        m0, r3d                 ; m0 = shift (taken before r3 is recycled)
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes
    mov         r3d, 16 / 2             ; r3 = iterations, 2 rows each

    ; r0 = dst, r1 = src, r2 = stride, r3 = counter, m0 = shift

.loop:
    ; first row of the pair
    mova        m2, [r1]
    psllw       m2, m0
    mova        m3, [r1 + mmsize]
    psllw       m3, m0
    mova        [r0], m2
    mova        [r0 + mmsize], m3

    ; second row of the pair
    mova        m2, [r1 + r2]
    psllw       m2, m0
    mova        m3, [r1 + r2 + mmsize]
    psllw       m3, m0
    mova        [r0 + 2 * mmsize], m2
    mova        [r0 + 3 * mmsize], m3

    lea         r1, [r1 + r2 * 2]       ; next two source rows
    add         r0, 4 * mmsize          ; dst is packed
    dec         r3d
    jnz         .loop
    RET
72b9787e 4577
72b9787e 4578
b53f7c52
JB
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;
; 32x32 variant.  One source row (32 words, four XMM vectors) per iteration;
; every word is shifted left by `shift` and the rows are stored contiguously
; in dst.  Every access is an aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_32, 4, 4, 6
    movd        m0, r3d                 ; m0 = shift (taken before r3 is recycled)
    add         r2d, r2d                ; srcStride: int16_t elements -> bytes
    mov         r3d, 32 / 1             ; r3 = iterations, 1 row each

    ; r0 = dst, r1 = src, r2 = stride, r3 = counter, m0 = shift

.loop:
    mova        m2, [r1]
    psllw       m2, m0
    mova        m3, [r1 + mmsize]
    psllw       m3, m0
    mova        m4, [r1 + 2 * mmsize]
    psllw       m4, m0
    mova        m5, [r1 + 3 * mmsize]
    psllw       m5, m0
    mova        [r0], m2
    mova        [r0 + mmsize], m3
    mova        [r0 + 2 * mmsize], m4
    mova        [r0 + 3 * mmsize], m5

    add         r1, r2                  ; next source row
    add         r0, 4 * mmsize          ; dst is packed
    dec         r3d
    jnz         .loop
    RET
4615
b53f7c52 4616
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;
; 4x4 variant (SSE2).  Loads the 16 packed source words, computes
; (x + round) >> shift (arithmetic) with round = 1 << (shift - 1), and
; scatters the four 4-word rows to dst at dstStride spacing.  shift is
; expected to be >= 1.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_4, 3, 3, 4
    movd        m0, r3m                 ; m0 = shift
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    pcmpeqw     m1, m1                  ; all words = -1
    psllw       m1, m0                  ; all words = -(1 << shift)
    psraw       m1, 1                   ; all words = -round

    mova        m2, [r1]                ; rows 0-1
    mova        m3, [r1 + mmsize]       ; rows 2-3
    psubw       m2, m1                  ; subtracting -round adds the rounding term
    psraw       m2, m0
    psubw       m3, m1
    psraw       m3, m0

    movh        [r0], m2                ; row 0
    movhps      [r0 + r2], m2           ; row 1
    movh        [r0 + r2 * 2], m3       ; row 2
    lea         r2, [r2 * 3]            ; stride no longer needed: turn it into 3 * stride
    movhps      [r0 + r2], m3           ; row 3
    RET
4641
b53f7c52
JB
4642
; cpy1Dto2D_shr, 4x4 variant (AVX2): the whole 16-word block fits in one YMM
; register.  Every word becomes (x + round) >> shift (arithmetic), with
; round = 1 << (shift - 1), and the four 8-byte rows are stored at dstStride
; spacing.
INIT_YMM avx2
cglobal cpy1Dto2D_shr_4, 3, 3, 3
    movd        xm0, r3m                ; xm0 = shift
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    pcmpeqw     m1, m1                  ; all words = -1
    psllw       m1, xm0                 ; all words = -(1 << shift)
    psraw       m1, 1                   ; all words = -round

    movu        m2, [r1]                ; rows 0-3
    psubw       m2, m1                  ; + round
    psraw       m2, xm0
    vextracti128 xm1, m2, 1             ; round is done with: xm1 = rows 2-3

    movq        [r0], xm2               ; row 0
    movhps      [r0 + r2], xm2          ; row 1
    lea         r0, [r0 + r2 * 2]       ; dst += 2 rows
    movq        [r0], xm1               ; row 2
    movhps      [r0 + r2], xm1          ; row 3
    RET
4662
4663
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;
; 8x8 variant (SSE2).  Each packed source row is one XMM vector.  Every word
; becomes (x + round) >> shift (arithmetic), round = 1 << (shift - 1), and
; rows are stored at dstStride spacing.  Every access is an aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_8, 3, 4, 6
    movd        m0, r3m                 ; m0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    pcmpeqw     m1, m1                  ; all words = -1
    psllw       m1, m0                  ; all words = -(1 << shift)
    psraw       m1, 1                   ; all words = -round
    lea         r3, [r2 * 3]            ; r3 = 3 * stride

    ; rows 0-3
    mova        m2, [r1]
    mova        m3, [r1 + mmsize]
    mova        m4, [r1 + 2 * mmsize]
    mova        m5, [r1 + 3 * mmsize]
    psubw       m2, m1
    psraw       m2, m0
    psubw       m3, m1
    psraw       m3, m0
    psubw       m4, m1
    psraw       m4, m0
    psubw       m5, m1
    psraw       m5, m0
    mova        [r0], m2
    mova        [r0 + r2], m3
    mova        [r0 + r2 * 2], m4
    mova        [r0 + r3], m5

    ; rows 4-7
    mova        m2, [r1 + 4 * mmsize]
    mova        m3, [r1 + 5 * mmsize]
    mova        m4, [r1 + 6 * mmsize]
    mova        m5, [r1 + 7 * mmsize]
    psubw       m2, m1
    psraw       m2, m0
    psubw       m3, m1
    psraw       m3, m0
    psubw       m4, m1
    psraw       m4, m0
    psubw       m5, m1
    psraw       m5, m0
    lea         r0, [r0 + r2 * 4]       ; dst += 4 rows
    mova        [r0], m2
    mova        [r0 + r2], m3
    mova        [r0 + r2 * 2], m4
    mova        [r0 + r3], m5
    RET
4713
4714
; cpy1Dto2D_shr, 8x8 variant (AVX2): every YMM load carries two packed source
; rows.  Each word becomes (x + round) >> shift (arithmetic), with
; round = 1 << (shift - 1); the low lane goes to one dst row and the high
; lane to the next.
INIT_YMM avx2
cglobal cpy1Dto2D_shr_8, 3, 4, 4
    movd        xm0, r3m                ; xm0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    pcmpeqw     m1, m1                  ; all words = -1
    psllw       m1, xm0                 ; all words = -(1 << shift)
    psraw       m1, 1                   ; all words = -round
    lea         r3, [r2 * 3]            ; r3 = 3 * stride

    ; rows 0-3
    movu        m2, [r1]
    movu        m3, [r1 + mmsize]
    psubw       m2, m1
    psraw       m2, xm0
    psubw       m3, m1
    psraw       m3, xm0
    movu        [r0], xm2               ; row 0
    vextracti128 [r0 + r2], m2, 1       ; row 1
    movu        [r0 + r2 * 2], xm3      ; row 2
    vextracti128 [r0 + r3], m3, 1       ; row 3

    ; rows 4-7
    movu        m2, [r1 + 2 * mmsize]
    movu        m3, [r1 + 3 * mmsize]
    psubw       m2, m1
    psraw       m2, xm0
    psubw       m3, m1
    psraw       m3, xm0
    lea         r0, [r0 + r2 * 4]       ; dst += 4 rows
    movu        [r0], xm2               ; row 4
    vextracti128 [r0 + r2], m2, 1       ; row 5
    movu        [r0 + r2 * 2], xm3      ; row 6
    vextracti128 [r0 + r3], m3, 1       ; row 7
    RET
4749
b53f7c52 4750
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;
; 16x16 variant (SSE2).  A row is 16 words (two XMM vectors); four packed
; source rows are handled per iteration.  Every word becomes
; (x + round) >> shift (arithmetic), round = 1 << (shift - 1), and rows are
; stored at dstStride spacing.  Every access is an aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_16, 3, 5, 6
    movd        m0, r3m                 ; m0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    pcmpeqw     m1, m1                  ; all words = -1
    psllw       m1, m0                  ; all words = -(1 << shift)
    psraw       m1, 1                   ; all words = -round
    lea         r4, [r2 * 3]            ; r4 = 3 * stride
    mov         r3d, 16 / 4             ; r3 = iterations, 4 rows each

.loop:
    ; rows 0-1
    mova        m2, [r1]
    mova        m3, [r1 + mmsize]
    mova        m4, [r1 + 2 * mmsize]
    mova        m5, [r1 + 3 * mmsize]
    psubw       m2, m1
    psraw       m2, m0
    psubw       m3, m1
    psraw       m3, m0
    psubw       m4, m1
    psraw       m4, m0
    psubw       m5, m1
    psraw       m5, m0
    mova        [r0], m2
    mova        [r0 + mmsize], m3
    mova        [r0 + r2], m4
    mova        [r0 + r2 + mmsize], m5

    ; rows 2-3
    mova        m2, [r1 + 4 * mmsize]
    mova        m3, [r1 + 5 * mmsize]
    mova        m4, [r1 + 6 * mmsize]
    mova        m5, [r1 + 7 * mmsize]
    psubw       m2, m1
    psraw       m2, m0
    psubw       m3, m1
    psraw       m3, m0
    psubw       m4, m1
    psraw       m4, m0
    psubw       m5, m1
    psraw       m5, m0
    mova        [r0 + r2 * 2], m2
    mova        [r0 + r2 * 2 + mmsize], m3
    mova        [r0 + r4], m4
    mova        [r0 + r4 + mmsize], m5

    lea         r0, [r0 + r2 * 4]       ; dst += 4 rows
    add         r1, 8 * mmsize          ; src is packed
    dec         r3d
    jnz         .loop
    RET
4806
4807
; cpy1Dto2D_shr, 16x16 variant (AVX2): one YMM register per 16-word row, four
; rows per loop iteration.  Every word becomes (x + round) >> shift
; (arithmetic), with round = 1 << (shift - 1), and rows are stored at
; dstStride spacing.
INIT_YMM avx2
cglobal cpy1Dto2D_shr_16, 3, 5, 4
    movd        xm0, r3m                ; xm0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    pcmpeqw     m1, m1                  ; all words = -1
    psllw       m1, xm0                 ; all words = -(1 << shift)
    psraw       m1, 1                   ; all words = -round
    lea         r4, [r2 * 3]            ; r4 = 3 * stride
    mov         r3d, 16 / 4             ; r3 = iterations, 4 rows each

.loop:
    ; rows 0-1
    movu        m2, [r1]
    movu        m3, [r1 + mmsize]
    psubw       m2, m1
    psraw       m2, xm0
    psubw       m3, m1
    psraw       m3, xm0
    movu        [r0], m2
    movu        [r0 + r2], m3

    ; rows 2-3
    movu        m2, [r1 + 2 * mmsize]
    movu        m3, [r1 + 3 * mmsize]
    psubw       m2, m1
    psraw       m2, xm0
    psubw       m3, m1
    psraw       m3, xm0
    movu        [r0 + r2 * 2], m2
    movu        [r0 + r4], m3

    lea         r0, [r0 + r2 * 4]       ; dst += 4 rows
    add         r1, 4 * mmsize          ; src is packed
    dec         r3d
    jnz         .loop
    RET
4844
b53f7c52 4845
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
;
; 32x32 variant (SSE2).  A row is 32 words (four XMM vectors); two packed
; source rows are handled per iteration.  Every word becomes
; (x + round) >> shift (arithmetic), round = 1 << (shift - 1), and rows are
; stored at dstStride spacing.  Every access is an aligned mova.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_32, 3, 4, 6
    movd        m0, r3m                 ; m0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    pcmpeqw     m1, m1                  ; all words = -1
    psllw       m1, m0                  ; all words = -(1 << shift)
    psraw       m1, 1                   ; all words = -round
    mov         r3d, 32 / 2             ; r3 = iterations, 2 rows each

.loop:
    ; first row of the pair
    mova        m2, [r1]
    mova        m3, [r1 + mmsize]
    mova        m4, [r1 + 2 * mmsize]
    mova        m5, [r1 + 3 * mmsize]
    psubw       m2, m1
    psraw       m2, m0
    psubw       m3, m1
    psraw       m3, m0
    psubw       m4, m1
    psraw       m4, m0
    psubw       m5, m1
    psraw       m5, m0
    mova        [r0], m2
    mova        [r0 + mmsize], m3
    mova        [r0 + 2 * mmsize], m4
    mova        [r0 + 3 * mmsize], m5

    ; second row of the pair
    mova        m2, [r1 + 4 * mmsize]
    mova        m3, [r1 + 5 * mmsize]
    mova        m4, [r1 + 6 * mmsize]
    mova        m5, [r1 + 7 * mmsize]
    psubw       m2, m1
    psraw       m2, m0
    psubw       m3, m1
    psraw       m3, m0
    psubw       m4, m1
    psraw       m4, m0
    psubw       m5, m1
    psraw       m5, m0
    mova        [r0 + r2], m2
    mova        [r0 + r2 + mmsize], m3
    mova        [r0 + r2 + 2 * mmsize], m4
    mova        [r0 + r2 + 3 * mmsize], m5

    lea         r0, [r0 + r2 * 2]       ; dst += 2 rows
    add         r1, 8 * mmsize          ; src is packed
    dec         r3d
    jnz         .loop
    RET
b53f7c52
JB
4900
4901
; cpy1Dto2D_shr, 32x32 variant (AVX2): a row is 32 words (two YMM vectors);
; two packed source rows are handled per iteration.  Every word becomes
; (x + round) >> shift (arithmetic), with round = 1 << (shift - 1), and rows
; are stored at dstStride spacing.
INIT_YMM avx2
cglobal cpy1Dto2D_shr_32, 3, 4, 6
    movd        xm0, r3m                ; xm0 = shift (fetched before r3 is recycled)
    add         r2d, r2d                ; dstStride: int16_t elements -> bytes
    pcmpeqw     m1, m1                  ; all words = -1
    psllw       m1, xm0                 ; all words = -(1 << shift)
    psraw       m1, 1                   ; all words = -round
    mov         r3d, 32 / 2             ; r3 = iterations, 2 rows each

.loop:
    movu        m2, [r1]                ; row 0, left half
    movu        m3, [r1 + mmsize]       ; row 0, right half
    movu        m4, [r1 + 2 * mmsize]   ; row 1, left half
    movu        m5, [r1 + 3 * mmsize]   ; row 1, right half
    psubw       m2, m1
    psraw       m2, xm0
    psubw       m3, m1
    psraw       m3, xm0
    psubw       m4, m1
    psraw       m4, xm0
    psubw       m5, m1
    psraw       m5, xm0
    movu        [r0], m2
    movu        [r0 + mmsize], m3
    movu        [r0 + r2], m4
    movu        [r0 + r2 + mmsize], m5

    lea         r0, [r0 + r2 * 2]       ; dst += 2 rows
    add         r1, 4 * mmsize          ; src is packed
    dec         r3d
    jnz         .loop
    RET