Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / common / x86 / blockcopy8.asm
1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
3 ;*
4 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5 ;* Murugan Vairavel <murugan@multicorewareinc.com>
6 ;*
7 ;* This program is free software; you can redistribute it and/or modify
8 ;* it under the terms of the GNU General Public License as published by
9 ;* the Free Software Foundation; either version 2 of the License, or
10 ;* (at your option) any later version.
11 ;*
12 ;* This program is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;* GNU General Public License for more details.
16 ;*
17 ;* You should have received a copy of the GNU General Public License
18 ;* along with this program; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
20 ;*
21 ;* This program is also available under a commercial proprietary license.
22 ;* For more information, contact us at license @ x265.com.
23 ;*****************************************************************************/
24
25 %include "x86inc.asm"
26 %include "x86util.asm"
27
28 SECTION_RODATA 32
29
30 tab_Vm: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
31
32 cextern pb_4
33 cextern pb_1
34 cextern pb_16
35 cextern pb_64
36 cextern pw_4
37 cextern pb_8
38 cextern pb_32
39 cextern pb_128
40
41 SECTION .text
42
43 ;-----------------------------------------------------------------------------
44 ; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
45 ;-----------------------------------------------------------------------------
46 INIT_XMM sse2
47 cglobal blockcopy_pp_2x4, 4, 7, 0
; 2x4 pixel copy. Args (cglobal): r0=dst, r1=dstStride, r2=src, r3=srcStride.
; Width is only 2 bytes, so rows move through 16-bit GPRs instead of SIMD.
48 mov r4w, [r2] ; row 0
49 mov r5w, [r2 + r3] ; row 1
50 lea r2, [r2 + r3 * 2] ; src += 2 * srcStride
51 mov r6w, [r2] ; row 2
52 mov r3w, [r2 + r3] ; row 3 (r3 reused; stride no longer needed)
53
54 mov [r0], r4w
55 mov [r0 + r1], r5w
56 lea r0, [r0 + 2 * r1] ; dst += 2 * dstStride
57 mov [r0], r6w
58 mov [r0 + r1], r3w
59 RET
60
61 ;-----------------------------------------------------------------------------
62 ; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
63 ;-----------------------------------------------------------------------------
64 INIT_XMM sse2
65 cglobal blockcopy_pp_2x8, 4, 7, 0
; 2x8 pixel copy via 16-bit GPR moves.
; Pattern: rows 0-2 are copied from the block base, then src/dst advance by
; two rows at a time while rows are read at offsets +r3 and +2*r3, covering
; rows 3..7 without ever reloading the base row.
66 mov r4w, [r2] ; row 0
67 mov r5w, [r2 + r3] ; row 1
68 mov r6w, [r2 + 2 * r3] ; row 2
69
70 mov [r0], r4w
71 mov [r0 + r1], r5w
72 mov [r0 + 2 * r1], r6w
73
74 lea r0, [r0 + 2 * r1] ; base -> row 2
75 lea r2, [r2 + 2 * r3]
76
77 mov r4w, [r2 + r3] ; row 3
78 mov r5w, [r2 + 2 * r3] ; row 4
79
80 mov [r0 + r1], r4w
81 mov [r0 + 2 * r1], r5w
82
83 lea r0, [r0 + 2 * r1] ; base -> row 4
84 lea r2, [r2 + 2 * r3]
85
86 mov r4w, [r2 + r3] ; row 5
87 mov r5w, [r2 + 2 * r3] ; row 6
88
89 mov [r0 + r1], r4w
90 mov [r0 + 2 * r1], r5w
91
92 lea r0, [r0 + 2 * r1] ; base -> row 6
93 lea r2, [r2 + 2 * r3]
94
95 mov r4w, [r2 + r3] ; row 7
96 mov [r0 + r1], r4w
97 RET
98
99 ;-----------------------------------------------------------------------------
100 ; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
101 ;-----------------------------------------------------------------------------
102 INIT_XMM sse2
103 cglobal blockcopy_pp_2x16, 4, 7, 0
; 2x16 pixel copy: loop copies two rows per iteration through 16-bit GPRs.
104 mov r6d, 16/2 ; r6d = row-pair count
105 .loop:
106 mov r4w, [r2]
107 mov r5w, [r2 + r3]
108 dec r6d ; counter updated early; flags still valid at jnz
109 lea r2, [r2 + r3 * 2] ; lea preserves flags
110 mov [r0], r4w
111 mov [r0 + r1], r5w
112 lea r0, [r0 + r1 * 2]
113 jnz .loop
114 RET
115
116
117 ;-----------------------------------------------------------------------------
118 ; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
119 ;-----------------------------------------------------------------------------
120 INIT_XMM sse2
121 cglobal blockcopy_pp_4x2, 4, 6, 0
; 4x2 pixel copy: one 32-bit GPR load/store per row.
122 mov r4d, [r2] ; row 0
123 mov r5d, [r2 + r3] ; row 1
124
125 mov [r0], r4d
126 mov [r0 + r1], r5d
127 RET
128
129 ;-----------------------------------------------------------------------------
130 ; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
131 ;-----------------------------------------------------------------------------
132 INIT_XMM sse2
133 cglobal blockcopy_pp_4x4, 4, 4, 4
; 4x4 pixel copy: four dword loads, then four dword stores.
; Strides are scaled in place (r3 *= 3, r1 *= 3) to reach row 3 without an
; extra register; safe because nothing is read after the scaled access.
134 movd m0, [r2] ; row 0
135 movd m1, [r2 + r3] ; row 1
136 movd m2, [r2 + 2 * r3] ; row 2
137 lea r3, [r3 + r3 * 2] ; r3 = 3 * srcStride
138 movd m3, [r2 + r3] ; row 3
139
140 movd [r0], m0
141 movd [r0 + r1], m1
142 movd [r0 + 2 * r1], m2
143 lea r1, [r1 + 2 * r1] ; r1 = 3 * dstStride
144 movd [r0 + r1], m3
145 RET
146
147 ;-----------------------------------------------------------------------------
148 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
149 ;-----------------------------------------------------------------------------
150 %macro BLOCKCOPY_PP_W4_H8 2
; Width-4 copy, unrolled 8 rows per loop iteration (%1 = width, %2 = height).
151 INIT_XMM sse2
152 cglobal blockcopy_pp_%1x%2, 4, 5, 4
153 mov r4d, %2/8 ; iterations of the 8-row body
154 .loop:
; rows 0-3 of this group
155 movd m0, [r2]
156 movd m1, [r2 + r3]
157 lea r2, [r2 + 2 * r3]
158 movd m2, [r2]
159 movd m3, [r2 + r3]
160
161 movd [r0], m0
162 movd [r0 + r1], m1
163 lea r0, [r0 + 2 * r1]
164 movd [r0], m2
165 movd [r0 + r1], m3
166
167 lea r0, [r0 + 2 * r1]
168 lea r2, [r2 + 2 * r3]
; rows 4-7 of this group
169 movd m0, [r2]
170 movd m1, [r2 + r3]
171 lea r2, [r2 + 2 * r3]
172 movd m2, [r2]
173 movd m3, [r2 + r3]
174
175 movd [r0], m0
176 movd [r0 + r1], m1
177 lea r0, [r0 + 2 * r1]
178 movd [r0], m2
179 movd [r0 + r1], m3
180
181 lea r0, [r0 + 2 * r1]
182 lea r2, [r2 + 2 * r3]
183
184 dec r4d
185 jnz .loop
186 RET
187 %endmacro
188
189 BLOCKCOPY_PP_W4_H8 4, 8
190 BLOCKCOPY_PP_W4_H8 4, 16
191
192 BLOCKCOPY_PP_W4_H8 4, 32
193
194 ;-----------------------------------------------------------------------------
195 ; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
196 ;-----------------------------------------------------------------------------
197 INIT_XMM sse2
198 cglobal blockcopy_pp_6x8, 4, 7, 8
; 6x8 pixel copy, split into two passes:
;   1) bytes 0-3 of all 8 rows via dword SIMD moves (r5/r6 walk src/dst),
;   2) bytes 4-5 of all 8 rows via 16-bit GPR moves (r2/r0 walk src/dst).
199
200 movd m0, [r2] ; dword part, row 0
201 movd m1, [r2 + r3] ; row 1
202 movd m2, [r2 + 2 * r3] ; row 2
203 lea r5, [r2 + 2 * r3] ; r5 = src cursor; r2 stays at block origin
204 movd m3, [r5 + r3] ; row 3
205
206 movd m4, [r5 + 2 * r3] ; row 4
207 lea r5, [r5 + 2 * r3]
208 movd m5, [r5 + r3] ; row 5
209 movd m6, [r5 + 2 * r3] ; row 6
210 lea r5, [r5 + 2 * r3]
211 movd m7, [r5 + r3] ; row 7
212
213 movd [r0], m0
214 movd [r0 + r1], m1
215 movd [r0 + 2 * r1], m2
216 lea r6, [r0 + 2 * r1] ; r6 = dst cursor; r0 stays at block origin
217 movd [r6 + r1], m3
218
219 movd [r6 + 2 * r1], m4
220 lea r6, [r6 + 2 * r1]
221 movd [r6 + r1], m5
222 movd [r6 + 2 * r1], m6
223 lea r6, [r6 + 2 * r1]
224 movd [r6 + r1], m7
225
; word part (last 2 of the 6 bytes), rows 0-2 from the block origin
226 mov r4w, [r2 + 4]
227 mov r5w, [r2 + r3 + 4]
228 mov r6w, [r2 + 2 * r3 + 4]
229
230 mov [r0 + 4], r4w
231 mov [r0 + r1 + 4], r5w
232 mov [r0 + 2 * r1 + 4], r6w
233
234 lea r0, [r0 + 2 * r1]
235 lea r2, [r2 + 2 * r3]
236
237 mov r4w, [r2 + r3 + 4] ; row 3
238 mov r5w, [r2 + 2 * r3 + 4] ; row 4
239
240 mov [r0 + r1 + 4], r4w
241 mov [r0 + 2 * r1 + 4], r5w
242
243 lea r0, [r0 + 2 * r1]
244 lea r2, [r2 + 2 * r3]
245
246 mov r4w, [r2 + r3 + 4] ; row 5
247 mov r5w, [r2 + 2 * r3 + 4] ; row 6
248
249 mov [r0 + r1 + 4], r4w
250 mov [r0 + 2 * r1 + 4], r5w
251
252 lea r0, [r0 + 2 * r1]
253 lea r2, [r2 + 2 * r3]
254
255 mov r4w, [r2 + r3 + 4] ; row 7
256 mov [r0 + r1 + 4], r4w
257 RET
258
259 ;-----------------------------------------------------------------------------
260 ; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
261 ;-----------------------------------------------------------------------------
262 INIT_XMM sse2
263 cglobal blockcopy_pp_6x16, 4, 7, 2
; 6x16 pixel copy: per row, a dword SIMD move covers bytes 0-3 and a 16-bit
; GPR move covers bytes 4-5; two rows handled per loop iteration.
264 mov r6d, 16/2 ; row-pair count
265 .loop:
266 movd m0, [r2]
267 mov r4w, [r2 + 4]
268 movd m1, [r2 + r3]
269 mov r5w, [r2 + r3 + 4]
270 lea r2, [r2 + r3 * 2]
271 movd [r0], m0
272 mov [r0 + 4], r4w
273 movd [r0 + r1], m1
274 mov [r0 + r1 + 4], r5w
275 lea r0, [r0 + r1 * 2]
276 dec r6d
277 jnz .loop
278 RET
279
280
281 ;-----------------------------------------------------------------------------
282 ; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
283 ;-----------------------------------------------------------------------------
284 INIT_XMM sse2
285 cglobal blockcopy_pp_8x2, 4, 4, 2
; 8x2 pixel copy: one 8-byte (qword) SIMD move per row.
286 movh m0, [r2]
287 movh m1, [r2 + r3]
288
289 movh [r0], m0
290 movh [r0 + r1], m1
291 RET
292
293 ;-----------------------------------------------------------------------------
294 ; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
295 ;-----------------------------------------------------------------------------
296 INIT_XMM sse2
297 cglobal blockcopy_pp_8x4, 4, 4, 4
; 8x4 pixel copy: four qword loads, then four qword stores.
; Strides are scaled to 3x in place to address row 3 (nothing reads them after).
298 movh m0, [r2] ; row 0
299 movh m1, [r2 + r3] ; row 1
300 movh m2, [r2 + 2 * r3] ; row 2
301 lea r3, [r3 + r3 * 2] ; r3 = 3 * srcStride
302 movh m3, [r2 + r3] ; row 3
303
304 movh [r0], m0
305 movh [r0 + r1], m1
306 movh [r0 + 2 * r1], m2
307 lea r1, [r1 + 2 * r1] ; r1 = 3 * dstStride
308 movh [r0 + r1], m3
309 RET
310
311 ;-----------------------------------------------------------------------------
312 ; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
313 ;-----------------------------------------------------------------------------
314 INIT_XMM sse2
315 cglobal blockcopy_pp_8x6, 4, 7, 6
; 8x6 pixel copy: six qword loads (r5 walks src), then six qword stores
; (r6 walks dst); r2/r0 remain at the block origin throughout.
316 movh m0, [r2] ; row 0
317 movh m1, [r2 + r3] ; row 1
318 movh m2, [r2 + 2 * r3] ; row 2
319 lea r5, [r2 + 2 * r3]
320 movh m3, [r5 + r3] ; row 3
321 movh m4, [r5 + 2 * r3] ; row 4
322 lea r5, [r5 + 2 * r3]
323 movh m5, [r5 + r3] ; row 5
324
325 movh [r0], m0
326 movh [r0 + r1], m1
327 movh [r0 + 2 * r1], m2
328 lea r6, [r0 + 2 * r1]
329 movh [r6 + r1], m3
330 movh [r6 + 2 * r1], m4
331 lea r6, [r6 + 2 * r1]
332 movh [r6 + r1], m5
333 RET
334
335 ;-----------------------------------------------------------------------------
336 ; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
337 ;-----------------------------------------------------------------------------
338 INIT_XMM sse2
339 cglobal blockcopy_pp_8x12, 4, 5, 2
; 8x12 pixel copy: loop moves two qword rows per iteration.
340 mov r4d, 12/2 ; row-pair count
341 .loop:
342 movh m0, [r2]
343 movh m1, [r2 + r3]
344 movh [r0], m0
345 movh [r0 + r1], m1
346 dec r4d ; lea below preserves flags for jnz
347 lea r0, [r0 + 2 * r1]
348 lea r2, [r2 + 2 * r3]
349 jnz .loop
350 RET
351
352 ;-----------------------------------------------------------------------------
353 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
354 ;-----------------------------------------------------------------------------
355 %macro BLOCKCOPY_PP_W8_H8 2
; Width-8 copy, unrolled 8 rows per loop iteration (%1 = width, %2 = height).
; Rows 0-5 are buffered in m0-m5; rows 6-7 reuse m4/m5 after they are stored.
356 INIT_XMM sse2
357 cglobal blockcopy_pp_%1x%2, 4, 5, 6
358 mov r4d, %2/8 ; iterations of the 8-row body
359
360 .loop:
361 movh m0, [r2]
362 movh m1, [r2 + r3]
363 lea r2, [r2 + 2 * r3]
364 movh m2, [r2]
365 movh m3, [r2 + r3]
366 lea r2, [r2 + 2 * r3]
367 movh m4, [r2]
368 movh m5, [r2 + r3]
369
370 movh [r0], m0
371 movh [r0 + r1], m1
372 lea r0, [r0 + 2 * r1]
373 movh [r0], m2
374 movh [r0 + r1], m3
375 lea r0, [r0 + 2 * r1]
376 movh [r0], m4
377 movh [r0 + r1], m5
378
; rows 6-7: m4/m5 are free again
379 lea r2, [r2 + 2 * r3]
380 movh m4, [r2]
381 movh m5, [r2 + r3]
382 lea r0, [r0 + 2 * r1]
383 movh [r0], m4
384 movh [r0 + r1], m5
385
386 dec r4d
387 lea r0, [r0 + 2 * r1]
388 lea r2, [r2 + 2 * r3]
389 jnz .loop
390 RET
391 %endmacro
392
393 BLOCKCOPY_PP_W8_H8 8, 8
394 BLOCKCOPY_PP_W8_H8 8, 16
395 BLOCKCOPY_PP_W8_H8 8, 32
396
397 BLOCKCOPY_PP_W8_H8 8, 64
398
399 ;-----------------------------------------------------------------------------
400 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
401 ;-----------------------------------------------------------------------------
402 %macro BLOCKCOPY_PP_W12_H4 2
; Width-12 copy, 4 rows per loop iteration (%1 = width, %2 = height).
; Each row is an 8-byte movh plus a 4-byte movd at offset +8.
403 INIT_XMM sse2
404 cglobal blockcopy_pp_%1x%2, 4, 5, 4
405 mov r4d, %2/4 ; iterations of the 4-row body
406
407 .loop:
408 movh m0, [r2]
409 movd m1, [r2 + 8]
410 movh m2, [r2 + r3]
411 movd m3, [r2 + r3 + 8]
412 lea r2, [r2 + 2 * r3]
413
414 movh [r0], m0
415 movd [r0 + 8], m1
416 movh [r0 + r1], m2
417 movd [r0 + r1 + 8], m3
418 lea r0, [r0 + 2 * r1]
419
420 movh m0, [r2]
421 movd m1, [r2 + 8]
422 movh m2, [r2 + r3]
423 movd m3, [r2 + r3 + 8]
424
425 movh [r0], m0
426 movd [r0 + 8], m1
427 movh [r0 + r1], m2
428 movd [r0 + r1 + 8], m3
429
430 dec r4d
431 lea r0, [r0 + 2 * r1]
432 lea r2, [r2 + 2 * r3]
433 jnz .loop
434 RET
435 %endmacro
436
437 BLOCKCOPY_PP_W12_H4 12, 16
438
439 BLOCKCOPY_PP_W12_H4 12, 32
440
441 ;-----------------------------------------------------------------------------
442 ; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
443 ;-----------------------------------------------------------------------------
444 %macro BLOCKCOPY_PP_W16_H4 2
; Width-16 copy, 4 rows per loop iteration; one unaligned 16-byte move per row.
445 INIT_XMM sse2
446 cglobal blockcopy_pp_%1x%2, 4, 5, 4
447 mov r4d, %2/4 ; iterations of the 4-row body
448
449 .loop:
450 movu m0, [r2]
451 movu m1, [r2 + r3]
452 lea r2, [r2 + 2 * r3]
453 movu m2, [r2]
454 movu m3, [r2 + r3]
455
456 movu [r0], m0
457 movu [r0 + r1], m1
458 lea r0, [r0 + 2 * r1]
459 movu [r0], m2
460 movu [r0 + r1], m3
461
462 dec r4d
463 lea r0, [r0 + 2 * r1]
464 lea r2, [r2 + 2 * r3]
465 jnz .loop
466
467 RET
468 %endmacro
469
470 BLOCKCOPY_PP_W16_H4 16, 4
471 BLOCKCOPY_PP_W16_H4 16, 12
472
473 ;-----------------------------------------------------------------------------
474 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
475 ;-----------------------------------------------------------------------------
476 %macro BLOCKCOPY_PP_W16_H8 2
; Width-16 copy, unrolled 8 rows per loop iteration (%1 = width, %2 = height).
477 INIT_XMM sse2
478 cglobal blockcopy_pp_%1x%2, 4, 5, 6
479 mov r4d, %2/8 ; iterations of the 8-row body
480
481 .loop:
; rows 0-5 buffered in m0-m5
482 movu m0, [r2]
483 movu m1, [r2 + r3]
484 lea r2, [r2 + 2 * r3]
485 movu m2, [r2]
486 movu m3, [r2 + r3]
487 lea r2, [r2 + 2 * r3]
488 movu m4, [r2]
489 movu m5, [r2 + r3]
490 lea r2, [r2 + 2 * r3]
491
492 movu [r0], m0
493 movu [r0 + r1], m1
494 lea r0, [r0 + 2 * r1]
495 movu [r0], m2
496 movu [r0 + r1], m3
497 lea r0, [r0 + 2 * r1]
498 movu [r0], m4
499 movu [r0 + r1], m5
500 lea r0, [r0 + 2 * r1]
501
; rows 6-7 reuse m0/m1
502 movu m0, [r2]
503 movu m1, [r2 + r3]
504 movu [r0], m0
505 movu [r0 + r1], m1
506
507 dec r4d
508 lea r0, [r0 + 2 * r1]
509 lea r2, [r2 + 2 * r3]
510 jnz .loop
511 RET
512 %endmacro
513
514 BLOCKCOPY_PP_W16_H8 16, 8
515 BLOCKCOPY_PP_W16_H8 16, 16
516 BLOCKCOPY_PP_W16_H8 16, 32
517 BLOCKCOPY_PP_W16_H8 16, 64
518
519 BLOCKCOPY_PP_W16_H8 16, 24
520
521 ;-----------------------------------------------------------------------------
522 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
523 ;-----------------------------------------------------------------------------
524 %macro BLOCKCOPY_PP_W24_H4 2
; Width-24 copy, 4 rows per loop iteration.
; Each row is a 16-byte movu plus an 8-byte movh at offset +16.
525 INIT_XMM sse2
526 cglobal blockcopy_pp_%1x%2, 4, 5, 6
527 mov r4d, %2/4 ; iterations of the 4-row body
528
529 .loop:
530 movu m0, [r2]
531 movh m1, [r2 + 16]
532 movu m2, [r2 + r3]
533 movh m3, [r2 + r3 + 16]
534 lea r2, [r2 + 2 * r3]
535 movu m4, [r2]
536 movh m5, [r2 + 16]
537
538 movu [r0], m0
539 movh [r0 + 16], m1
540 movu [r0 + r1], m2
541 movh [r0 + r1 + 16], m3
542 lea r0, [r0 + 2 * r1]
543 movu [r0], m4
544 movh [r0 + 16], m5
545
; row 3 reuses m0/m1
546 movu m0, [r2 + r3]
547 movh m1, [r2 + r3 + 16]
548 movu [r0 + r1], m0
549 movh [r0 + r1 + 16], m1
550
551 dec r4d
552 lea r0, [r0 + 2 * r1]
553 lea r2, [r2 + 2 * r3]
554 jnz .loop
555 RET
556 %endmacro
557
558 BLOCKCOPY_PP_W24_H4 24, 32
559
560 BLOCKCOPY_PP_W24_H4 24, 64
561
562 ;-----------------------------------------------------------------------------
563 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
564 ;-----------------------------------------------------------------------------
565 %macro BLOCKCOPY_PP_W32_H4 2
; Width-32 copy (SSE2), 4 rows per loop iteration; two 16-byte moves per row.
566 INIT_XMM sse2
567 cglobal blockcopy_pp_%1x%2, 4, 5, 4
568 mov r4d, %2/4 ; iterations of the 4-row body
569
570 .loop:
571 movu m0, [r2]
572 movu m1, [r2 + 16]
573 movu m2, [r2 + r3]
574 movu m3, [r2 + r3 + 16]
575 lea r2, [r2 + 2 * r3]
576
577 movu [r0], m0
578 movu [r0 + 16], m1
579 movu [r0 + r1], m2
580 movu [r0 + r1 + 16], m3
581 lea r0, [r0 + 2 * r1]
582
583 movu m0, [r2]
584 movu m1, [r2 + 16]
585 movu m2, [r2 + r3]
586 movu m3, [r2 + r3 + 16]
587
588 movu [r0], m0
589 movu [r0 + 16], m1
590 movu [r0 + r1], m2
591 movu [r0 + r1 + 16], m3
592
593 dec r4d
594 lea r0, [r0 + 2 * r1]
595 lea r2, [r2 + 2 * r3]
596 jnz .loop
597 RET
598 %endmacro
599
600 BLOCKCOPY_PP_W32_H4 32, 8
601 BLOCKCOPY_PP_W32_H4 32, 16
602 BLOCKCOPY_PP_W32_H4 32, 24
603 BLOCKCOPY_PP_W32_H4 32, 32
604 BLOCKCOPY_PP_W32_H4 32, 64
605
606 BLOCKCOPY_PP_W32_H4 32, 48
607
608 INIT_YMM avx
609 cglobal blockcopy_pp_32x8, 4, 6, 6
; 32x8 pixel copy (AVX): one 32-byte ymm move per row, fully unrolled.
; r4 = 3*dstStride, r5 = 3*srcStride so four rows are addressable per base.
610 lea r4, [3 * r1]
611 lea r5, [3 * r3]
612
613 movu m0, [r2] ; rows 0-3
614 movu m1, [r2 + r3]
615 movu m2, [r2 + 2 * r3]
616 movu m3, [r2 + r5]
617 lea r2, [r2 + 4 * r3]
618 movu m4, [r2] ; rows 4-5
619 movu m5, [r2 + r3]
620
621 movu [r0], m0
622 movu [r0 + r1], m1
623 movu [r0 + 2 * r1], m2
624 movu [r0 + r4], m3
625 lea r0, [r0 + 4 * r1]
626 movu [r0], m4
627 movu [r0 + r1], m5
628
629 movu m0, [r2 + 2 * r3] ; rows 6-7 reuse m0/m1
630 movu m1, [r2 + r5]
631
632 movu [r0 + 2 * r1], m0
633 movu [r0 + r4], m1
634 RET
635
636 INIT_YMM avx
637 cglobal blockcopy_pp_32x16, 4, 6, 6
; 32x16 pixel copy (AVX): one 32-byte ymm move per row, fully unrolled in
; groups of up to six rows. r4 = 3*dstStride, r5 = 3*srcStride.
638 lea r4, [3 * r1]
639 lea r5, [3 * r3]
640
641 movu m0, [r2] ; rows 0-5
642 movu m1, [r2 + r3]
643 movu m2, [r2 + 2 * r3]
644 movu m3, [r2 + r5]
645 lea r2, [r2 + 4 * r3]
646 movu m4, [r2]
647 movu m5, [r2 + r3]
648
649 movu [r0], m0
650 movu [r0 + r1], m1
651 movu [r0 + 2 * r1], m2
652 movu [r0 + r4], m3
653 lea r0, [r0 + 4 * r1]
654 movu [r0], m4
655 movu [r0 + r1], m5
656
657 movu m0, [r2 + 2 * r3] ; rows 6-11
658 movu m1, [r2 + r5]
659 lea r2, [r2 + 4 * r3]
660 movu m2, [r2]
661 movu m3, [r2 + r3]
662 movu m4, [r2 + 2 * r3]
663 movu m5, [r2 + r5]
664
665 movu [r0 + 2 * r1], m0
666 movu [r0 + r4], m1
667 lea r0, [r0 + 4 * r1]
668 movu [r0], m2
669 movu [r0 + r1], m3
670 movu [r0 + 2 * r1], m4
671 movu [r0 + r4], m5
672
673 lea r2, [r2 + 4 * r3]
674 movu m0, [r2] ; rows 12-15
675 movu m1, [r2 + r3]
676 movu m2, [r2 + 2 * r3]
677 movu m3, [r2 + r5]
678
679 lea r0, [r0 + 4 * r1]
680 movu [r0], m0
681 movu [r0 + r1], m1
682 movu [r0 + 2 * r1], m2
683 movu [r0 + r4], m3
684 RET
685
686 ;-----------------------------------------------------------------------------
687 ; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
688 ;-----------------------------------------------------------------------------
689 INIT_YMM avx
690 cglobal blockcopy_pp_32x24, 4, 7, 6
; 32x24 pixel copy (AVX): 8 rows per loop iteration, one ymm move per row.
; r4 = 3*dstStride, r5 = 3*srcStride, r6d = iteration count.
691 lea r4, [3 * r1]
692 lea r5, [3 * r3]
693 mov r6d, 24/8
694
695 .loop:
696 movu m0, [r2] ; rows 0-5 of the group
697 movu m1, [r2 + r3]
698 movu m2, [r2 + 2 * r3]
699 movu m3, [r2 + r5]
700 lea r2, [r2 + 4 * r3]
701 movu m4, [r2]
702 movu m5, [r2 + r3]
703
704 movu [r0], m0
705 movu [r0 + r1], m1
706 movu [r0 + 2 * r1], m2
707 movu [r0 + r4], m3
708 lea r0, [r0 + 4 * r1]
709 movu [r0], m4
710 movu [r0 + r1], m5
711
712 movu m0, [r2 + 2 * r3] ; rows 6-7 reuse m0/m1
713 movu m1, [r2 + r5]
714
715 movu [r0 + 2 * r1], m0
716 movu [r0 + r4], m1
717
718 lea r2, [r2 + 4 * r3]
719 lea r0, [r0 + 4 * r1]
720 dec r6d
721 jnz .loop
722 RET
723
724 ;-----------------------------------------------------------------------------
725 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
726 ;-----------------------------------------------------------------------------
727 %macro BLOCKCOPY_PP_W32_H16_avx 2
; Width-32 copy (AVX), 16 rows per loop iteration, one ymm move per row.
; r4 = 3*dstStride, r5 = 3*srcStride; %1 = width, %2 = height.
728 INIT_YMM avx
729 cglobal blockcopy_pp_%1x%2, 4, 7, 6
730 lea r4, [3 * r1]
731 lea r5, [3 * r3]
732 mov r6d, %2/16 ; iterations of the 16-row body
733
734 .loop:
735 movu m0, [r2] ; rows 0-5
736 movu m1, [r2 + r3]
737 movu m2, [r2 + 2 * r3]
738 movu m3, [r2 + r5]
739 lea r2, [r2 + 4 * r3]
740 movu m4, [r2]
741 movu m5, [r2 + r3]
742
743 movu [r0], m0
744 movu [r0 + r1], m1
745 movu [r0 + 2 * r1], m2
746 movu [r0 + r4], m3
747 lea r0, [r0 + 4 * r1]
748 movu [r0], m4
749 movu [r0 + r1], m5
750
751 movu m0, [r2 + 2 * r3] ; rows 6-11
752 movu m1, [r2 + r5]
753 lea r2, [r2 + 4 * r3]
754 movu m2, [r2]
755 movu m3, [r2 + r3]
756 movu m4, [r2 + 2 * r3]
757 movu m5, [r2 + r5]
758
759 movu [r0 + 2 * r1], m0
760 movu [r0 + r4], m1
761 lea r0, [r0 + 4 * r1]
762 movu [r0], m2
763 movu [r0 + r1], m3
764 movu [r0 + 2 * r1], m4
765 movu [r0 + r4], m5
766
767 lea r2, [r2 + 4 * r3]
768 movu m0, [r2] ; rows 12-15
769 movu m1, [r2 + r3]
770 movu m2, [r2 + 2 * r3]
771 movu m3, [r2 + r5]
772
773 lea r0, [r0 + 4 * r1]
774 movu [r0], m0
775 movu [r0 + r1], m1
776 movu [r0 + 2 * r1], m2
777 movu [r0 + r4], m3
778
779 lea r2, [r2 + 4 * r3]
780 lea r0, [r0 + 4 * r1]
781 dec r6d
782 jnz .loop
783 RET
784 %endmacro
785
786 BLOCKCOPY_PP_W32_H16_avx 32, 32
787 BLOCKCOPY_PP_W32_H16_avx 32, 48
788 BLOCKCOPY_PP_W32_H16_avx 32, 64
789
790 ;-----------------------------------------------------------------------------
791 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
792 ;-----------------------------------------------------------------------------
793 %macro BLOCKCOPY_PP_W48_H2 2
; Width-48 copy, 4 rows per loop iteration; three 16-byte moves per row.
794 INIT_XMM sse2
795 cglobal blockcopy_pp_%1x%2, 4, 5, 6
796 mov r4d, %2/4 ; iterations of the 4-row body
797
798 .loop:
799 movu m0, [r2]
800 movu m1, [r2 + 16]
801 movu m2, [r2 + 32]
802 movu m3, [r2 + r3]
803 movu m4, [r2 + r3 + 16]
804 movu m5, [r2 + r3 + 32]
805 lea r2, [r2 + 2 * r3]
806
807 movu [r0], m0
808 movu [r0 + 16], m1
809 movu [r0 + 32], m2
810 movu [r0 + r1], m3
811 movu [r0 + r1 + 16], m4
812 movu [r0 + r1 + 32], m5
813 lea r0, [r0 + 2 * r1]
814
815 movu m0, [r2]
816 movu m1, [r2 + 16]
817 movu m2, [r2 + 32]
818 movu m3, [r2 + r3]
819 movu m4, [r2 + r3 + 16]
820 movu m5, [r2 + r3 + 32]
821
822 movu [r0], m0
823 movu [r0 + 16], m1
824 movu [r0 + 32], m2
825 movu [r0 + r1], m3
826 movu [r0 + r1 + 16], m4
827 movu [r0 + r1 + 32], m5
828
829 dec r4d
830 lea r0, [r0 + 2 * r1]
831 lea r2, [r2 + 2 * r3]
832 jnz .loop
833 RET
834 %endmacro
835
836 BLOCKCOPY_PP_W48_H2 48, 64
837
838 ;-----------------------------------------------------------------------------
839 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
840 ;-----------------------------------------------------------------------------
841 %macro BLOCKCOPY_PP_W64_H4 2
; Width-64 copy, 4 rows per loop iteration; four 16-byte moves per row,
; software-pipelined across the six available xmm registers.
842 INIT_XMM sse2
843 cglobal blockcopy_pp_%1x%2, 4, 5, 6
844 mov r4d, %2/4 ; iterations of the 4-row body
845
846 .loop:
847 movu m0, [r2] ; row 0 + first half of row 1
848 movu m1, [r2 + 16]
849 movu m2, [r2 + 32]
850 movu m3, [r2 + 48]
851 movu m4, [r2 + r3]
852 movu m5, [r2 + r3 + 16]
853
854 movu [r0], m0
855 movu [r0 + 16], m1
856 movu [r0 + 32], m2
857 movu [r0 + 48], m3
858 movu [r0 + r1], m4
859 movu [r0 + r1 + 16], m5
860
861 movu m0, [r2 + r3 + 32] ; rest of row 1 + row 2
862 movu m1, [r2 + r3 + 48]
863 lea r2, [r2 + 2 * r3]
864 movu m2, [r2]
865 movu m3, [r2 + 16]
866 movu m4, [r2 + 32]
867 movu m5, [r2 + 48]
868
869 movu [r0 + r1 + 32], m0
870 movu [r0 + r1 + 48], m1
871 lea r0, [r0 + 2 * r1]
872 movu [r0], m2
873 movu [r0 + 16], m3
874 movu [r0 + 32], m4
875 movu [r0 + 48], m5
876
877 movu m0, [r2 + r3] ; row 3
878 movu m1, [r2 + r3 + 16]
879 movu m2, [r2 + r3 + 32]
880 movu m3, [r2 + r3 + 48]
881
882 movu [r0 + r1], m0
883 movu [r0 + r1 + 16], m1
884 movu [r0 + r1 + 32], m2
885 movu [r0 + r1 + 48], m3
886
887 dec r4d
888 lea r0, [r0 + 2 * r1]
889 lea r2, [r2 + 2 * r3]
890 jnz .loop
891 RET
892 %endmacro
893
894 BLOCKCOPY_PP_W64_H4 64, 16
895 BLOCKCOPY_PP_W64_H4 64, 32
896 BLOCKCOPY_PP_W64_H4 64, 48
897 BLOCKCOPY_PP_W64_H4 64, 64
898
899 ;-----------------------------------------------------------------------------
900 ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
901 ;-----------------------------------------------------------------------------
902 INIT_XMM sse4
903 cglobal blockcopy_sp_2x4, 4, 5, 2
; 2x4 short->pixel copy: src rows are int16, packed to bytes with unsigned
; saturation (packuswb). srcStride is doubled to byte units up front.
; After packuswb, bytes 0-1 hold the first row and bytes 8-9 the second,
; hence the movd for row N and pextrw index 4 (word 4 = bytes 8-9) for row N+1.
904
905 add r3, r3 ; srcStride in bytes (int16 source)
906
907 ;Row 0-1
908 movd m0, [r2]
909 movd m1, [r2 + r3]
910 packuswb m0, m1 ; clamp words to [0,255] and pack
911 movd r4d, m0
912 mov [r0], r4w
913 pextrw [r0 + r1], m0, 4 ; SSE4 pextrw-to-memory
914
915 ;Row 2-3
916 movd m0, [r2 + 2 * r3]
917 lea r2, [r2 + 2 * r3]
918 movd m1, [r2 + r3]
919 packuswb m0, m1
920 movd r4d, m0
921 mov [r0 + 2 * r1], r4w
922 lea r0, [r0 + 2 * r1]
923 pextrw [r0 + r1], m0, 4
924
925 RET
926
927
928 ;-----------------------------------------------------------------------------
929 ; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
930 ;-----------------------------------------------------------------------------
931 INIT_XMM sse4
932 cglobal blockcopy_sp_2x8, 4, 5, 2
; 2x8 short->pixel copy: same row-pair scheme as blockcopy_sp_2x4, repeated
; four times. packuswb leaves row N in bytes 0-1 and row N+1 in bytes 8-9.
933
934 add r3, r3 ; srcStride in bytes (int16 source)
935
936 ;Row 0-1
937 movd m0, [r2]
938 movd m1, [r2 + r3]
939 packuswb m0, m1 ; clamp words to [0,255] and pack
940 movd r4d, m0
941 mov [r0], r4w
942 pextrw [r0 + r1], m0, 4
943
944 ;Row 2-3
945 movd m0, [r2 + 2 * r3]
946 lea r2, [r2 + 2 * r3]
947 movd m1, [r2 + r3]
948 packuswb m0, m1
949 movd r4d, m0
950 mov [r0 + 2 * r1], r4w
951 lea r0, [r0 + 2 * r1]
952 pextrw [r0 + r1], m0, 4
953
954 ;Row 4-5
955 movd m0, [r2 + 2 * r3]
956 lea r2, [r2 + 2 * r3]
957 movd m1, [r2 + r3]
958 packuswb m0, m1
959 movd r4d, m0
960 mov [r0 + 2 * r1], r4w
961 lea r0, [r0 + 2 * r1]
962 pextrw [r0 + r1], m0, 4
963
964 ;Row 6-7
965 movd m0, [r2 + 2 * r3]
966 lea r2, [r2 + 2 * r3]
967 movd m1, [r2 + r3]
968 packuswb m0, m1
969 movd r4d, m0
970 mov [r0 + 2 * r1], r4w
971 lea r0, [r0 + 2 * r1]
972 pextrw [r0 + r1], m0, 4
973
974 RET
975
976 ;-----------------------------------------------------------------------------
977 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
978 ;-----------------------------------------------------------------------------
979 %macro BLOCKCOPY_SP_W2_H2 2
; Width-2 short->pixel copy, 2 rows per loop iteration (SSE2 fallback:
; no pextrw-to-memory, so each row is packed separately and stored via GPR).
980 INIT_XMM sse2
981 cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
982 add r3, r3 ; srcStride in bytes (int16 source)
983 mov r6d, %2/2 ; row-pair count
984 .loop:
985 movd m0, [r2]
986 movd m1, [r2 + r3]
987 dec r6d ; lea/pack below leave flags for jnz intact
988 lea r2, [r2 + r3 * 2]
989 packuswb m0, m0 ; clamp words to [0,255]; row now in bytes 0-1
990 packuswb m1, m1
991 movd r4d, m0
992 movd r5d, m1
993 mov [r0], r4w
994 mov [r0 + r1], r5w
995 lea r0, [r0 + r1 * 2]
996 jnz .loop
997 RET
998 %endmacro
999
1000 BLOCKCOPY_SP_W2_H2 2, 4
1001 BLOCKCOPY_SP_W2_H2 2, 8
1002
1003 BLOCKCOPY_SP_W2_H2 2, 16
1004
1005 ;-----------------------------------------------------------------------------
1006 ; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1007 ;-----------------------------------------------------------------------------
1008 INIT_XMM sse2
1009 cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
; 4x2 short->pixel copy: both rows packed into one register, then stored as
; dwords (row 0 in dword 0; row 1 in dword 2, reached via pshufd).
1010
1011 add r3, r3 ; srcStride in bytes (int16 source)
1012
1013 movh m0, [r2] ; row 0 (4 words)
1014 movh m1, [r2 + r3] ; row 1
1015
1016 packuswb m0, m1 ; clamp to [0,255]; row0 -> bytes 0-3, row1 -> bytes 8-11
1017
1018 movd [r0], m0
1019 pshufd m0, m0, 2 ; bring dword 2 (row 1) down to dword 0
1020 movd [r0 + r1], m0
1021
1022 RET
1023
1024 ;-----------------------------------------------------------------------------
1025 ; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1026 ;-----------------------------------------------------------------------------
1027 INIT_XMM sse2
1028 cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
; 4x4 short->pixel copy: rows packed in pairs; each pack puts the first row
; in dword 0 and the second in dword 2 (extracted with pshufd).
1029
1030 add r3, r3 ; srcStride in bytes (int16 source)
1031
1032 movh m0, [r2] ; row 0
1033 movh m1, [r2 + r3] ; row 1
1034 movh m2, [r2 + 2 * r3] ; row 2
1035 lea r2, [r2 + 2 * r3]
1036 movh m3, [r2 + r3] ; row 3
1037
1038 packuswb m0, m1 ; clamp words to [0,255] and pack rows 0/1
1039 packuswb m2, m3 ; rows 2/3
1040
1041 movd [r0], m0
1042 pshufd m0, m0, 2 ; dword 2 -> dword 0 (row 1)
1043 movd [r0 + r1], m0
1044 movd [r0 + 2 * r1], m2
1045 lea r0, [r0 + 2 * r1]
1046 pshufd m2, m2, 2 ; row 3
1047 movd [r0 + r1], m2
1048
1049 RET
1050
1051 ;-----------------------------------------------------------------------------
1052 ; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1053 ;-----------------------------------------------------------------------------
1054 INIT_XMM sse2
1055 cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
; 4x8 short->pixel copy, fully unrolled: eight rows loaded into m0-m7,
; packed in pairs (first row of each pair in dword 0, second in dword 2).
1056
1057 add r3, r3 ; srcStride in bytes (int16 source)
1058
1059 movh m0, [r2] ; row 0
1060 movh m1, [r2 + r3] ; row 1
1061 movh m2, [r2 + 2 * r3] ; row 2
1062 lea r2, [r2 + 2 * r3]
1063 movh m3, [r2 + r3] ; row 3
1064 movh m4, [r2 + 2 * r3] ; row 4
1065 lea r2, [r2 + 2 * r3]
1066 movh m5, [r2 + r3] ; row 5
1067 movh m6, [r2 + 2 * r3] ; row 6
1068 lea r2, [r2 + 2 * r3]
1069 movh m7, [r2 + r3] ; row 7
1070
1071 packuswb m0, m1 ; clamp words to [0,255], pack row pairs
1072 packuswb m2, m3
1073 packuswb m4, m5
1074 packuswb m6, m7
1075
1076 movd [r0], m0
1077 pshufd m0, m0, 2 ; dword 2 -> dword 0 (second row of pair)
1078 movd [r0 + r1], m0
1079 movd [r0 + 2 * r1], m2
1080 lea r0, [r0 + 2 * r1]
1081 pshufd m2, m2, 2
1082 movd [r0 + r1], m2
1083 movd [r0 + 2 * r1], m4
1084 lea r0, [r0 + 2 * r1]
1085 pshufd m4, m4, 2
1086 movd [r0 + r1], m4
1087 movd [r0 + 2 * r1], m6
1088 lea r0, [r0 + 2 * r1]
1089 pshufd m6, m6, 2
1090 movd [r0 + r1], m6
1091
1092 RET
1093
1094 ;-----------------------------------------------------------------------------
1095 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1096 ;-----------------------------------------------------------------------------
1097 %macro BLOCKCOPY_SP_W4_H8 2
; Width-4 short->pixel copy, 8 rows per loop iteration; same pack/pshufd
; scheme as blockcopy_sp_4x8, wrapped in a loop (%1 = width, %2 = height).
1098 INIT_XMM sse2
1099 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1100
1101 mov r4d, %2/8 ; iterations of the 8-row body
1102
1103 add r3, r3 ; srcStride in bytes (int16 source)
1104
1105 .loop:
1106 movh m0, [r2] ; rows 0-7 of this group
1107 movh m1, [r2 + r3]
1108 movh m2, [r2 + 2 * r3]
1109 lea r2, [r2 + 2 * r3]
1110 movh m3, [r2 + r3]
1111 movh m4, [r2 + 2 * r3]
1112 lea r2, [r2 + 2 * r3]
1113 movh m5, [r2 + r3]
1114 movh m6, [r2 + 2 * r3]
1115 lea r2, [r2 + 2 * r3]
1116 movh m7, [r2 + r3]
1117
1118 packuswb m0, m1 ; clamp words to [0,255], pack row pairs
1119 packuswb m2, m3
1120 packuswb m4, m5
1121 packuswb m6, m7
1122
1123 movd [r0], m0
1124 pshufd m0, m0, 2 ; dword 2 -> dword 0 (second row of pair)
1125 movd [r0 + r1], m0
1126 movd [r0 + 2 * r1], m2
1127 lea r0, [r0 + 2 * r1]
1128 pshufd m2, m2, 2
1129 movd [r0 + r1], m2
1130 movd [r0 + 2 * r1], m4
1131 lea r0, [r0 + 2 * r1]
1132 pshufd m4, m4, 2
1133 movd [r0 + r1], m4
1134 movd [r0 + 2 * r1], m6
1135 lea r0, [r0 + 2 * r1]
1136 pshufd m6, m6, 2
1137 movd [r0 + r1], m6
1138
1139 lea r0, [r0 + 2 * r1]
1140 lea r2, [r2 + 2 * r3]
1141
1142 dec r4d
1143 jnz .loop
1144
1145 RET
1146 %endmacro
1147
1148 BLOCKCOPY_SP_W4_H8 4, 16
1149
1150 BLOCKCOPY_SP_W4_H8 4, 32
1151
1152 ;-----------------------------------------------------------------------------
1153 ; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1154 ;-----------------------------------------------------------------------------
1155 INIT_XMM sse4
1156 cglobal blockcopy_sp_6x8, 4, 4, 2
; 6x8 short->pixel copy, two rows at a time. movu loads 8 words per row
; (only the first 6 are used); packuswb puts row N in bytes 0-5 and row N+1
; in bytes 8-13. Each row is stored as a dword plus a word (pextrw index 2).
1157
1158 add r3, r3 ; srcStride in bytes (int16 source)
1159
1160 movu m0, [r2] ; rows 0-1
1161 movu m1, [r2 + r3]
1162 packuswb m0, m1 ; clamp words to [0,255] and pack
1163
1164 movd [r0], m0 ; row 0, bytes 0-3
1165 pextrw [r0 + 4], m0, 2 ; row 0, bytes 4-5
1166
1167 movhlps m0, m0 ; high qword (row 1) -> low qword
1168 movd [r0 + r1], m0
1169 pextrw [r0 + r1 + 4], m0, 2
1170
1171 lea r0, [r0 + 2 * r1]
1172 lea r2, [r2 + 2 * r3]
1173
1174 movu m0, [r2] ; rows 2-3
1175 movu m1, [r2 + r3]
1176 packuswb m0, m1
1177
1178 movd [r0], m0
1179 pextrw [r0 + 4], m0, 2
1180
1181 movhlps m0, m0
1182 movd [r0 + r1], m0
1183 pextrw [r0 + r1 + 4], m0, 2
1184
1185 lea r0, [r0 + 2 * r1]
1186 lea r2, [r2 + 2 * r3]
1187
1188 movu m0, [r2] ; rows 4-5
1189 movu m1, [r2 + r3]
1190 packuswb m0, m1
1191
1192 movd [r0], m0
1193 pextrw [r0 + 4], m0, 2
1194
1195 movhlps m0, m0
1196 movd [r0 + r1], m0
1197 pextrw [r0 + r1 + 4], m0, 2
1198
1199 lea r0, [r0 + 2 * r1]
1200 lea r2, [r2 + 2 * r3]
1201
1202 movu m0, [r2] ; rows 6-7
1203 movu m1, [r2 + r3]
1204 packuswb m0, m1
1205
1206 movd [r0], m0
1207 pextrw [r0 + 4], m0, 2
1208
1209 movhlps m0, m0
1210 movd [r0 + r1], m0
1211 pextrw [r0 + r1 + 4], m0, 2
1212
1213 RET
1214
1215 ;-----------------------------------------------------------------------------
1216 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1217 ;-----------------------------------------------------------------------------
1218 %macro BLOCKCOPY_SP_W6_H2 2
; Width-6 short->pixel copy (SSE2), 2 rows per loop iteration.
; Per row: first 4 words packed/stored as a dword, last 2 words loaded
; separately, packed, and stored as a word through a GPR.
1219 INIT_XMM sse2
1220 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
1221 add r3, r3 ; srcStride in bytes (int16 source)
1222 mov r6d, %2/2 ; row-pair count
1223 .loop:
1224 movh m0, [r2] ; row 0: words 0-3
1225 movd m2, [r2 + 8] ; row 0: words 4-5
1226 movh m1, [r2 + r3] ; row 1: words 0-3
1227 movd m3, [r2 + r3 + 8] ; row 1: words 4-5
1228 dec r6d ; flags survive the lea/pack sequence below
1229 lea r2, [r2 + r3 * 2]
1230 packuswb m0, m0 ; clamp words to [0,255]
1231 packuswb m2, m2
1232 packuswb m1, m1
1233 packuswb m3, m3
1234 movd r4d, m2
1235 movd r5d, m3
1236 movd [r0], m0 ; row 0: bytes 0-3
1237 mov [r0 + 4], r4w ; row 0: bytes 4-5
1238 movd [r0 + r1], m1
1239 mov [r0 + r1 + 4], r5w
1240 lea r0, [r0 + r1 * 2]
1241 jnz .loop
1242 RET
1243 %endmacro
1244
1245 BLOCKCOPY_SP_W6_H2 6, 8
1246
1247 BLOCKCOPY_SP_W6_H2 6, 16
1248
1249 ;-----------------------------------------------------------------------------
1250 ; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1251 ;-----------------------------------------------------------------------------
1252 INIT_XMM sse2
1253 cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
; 8x2 short->pixel copy: both rows packed into one xmm register
; (row 0 -> low qword, row 1 -> high qword), stored with movlps/movhps.
1254
1255 add r3, r3 ; srcStride in bytes (int16 source)
1256
1257 movu m0, [r2] ; row 0 (8 words)
1258 movu m1, [r2 + r3] ; row 1
1259
1260 packuswb m0, m1 ; clamp words to [0,255] and pack
1261
1262 movlps [r0], m0 ; low qword = row 0
1263 movhps [r0 + r1], m0 ; high qword = row 1
1264
1265 RET
1266
1267 ;-----------------------------------------------------------------------------
1268 ; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1269 ;-----------------------------------------------------------------------------
; Narrow an 8x4 int16_t block to pixels, fully unrolled.
1270 INIT_XMM sse2
1271 cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
1272
; element stride -> byte stride
1273 add r3, r3
1274
1275 movu m0, [r2]
1276 movu m1, [r2 + r3]
1277 movu m2, [r2 + 2 * r3]
1278 lea r2, [r2 + 2 * r3]
1279 movu m3, [r2 + r3]
1280
; each pack yields two 8-pixel rows in one register
1281 packuswb m0, m1
1282 packuswb m2, m3
1283
1284 movlps [r0], m0
1285 movhps [r0 + r1], m0
1286 movlps [r0 + 2 * r1], m2
1287 lea r0, [r0 + 2 * r1]
1288 movhps [r0 + r1], m2
1289
1290 RET
1291
1292 ;-----------------------------------------------------------------------------
1293 ; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1294 ;-----------------------------------------------------------------------------
; Narrow an 8x6 int16_t block to pixels, fully unrolled.
1295 INIT_XMM sse2
1296 cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
1297
; element stride -> byte stride
1298 add r3, r3
1299
1300 movu m0, [r2]
1301 movu m1, [r2 + r3]
1302 movu m2, [r2 + 2 * r3]
1303 lea r2, [r2 + 2 * r3]
1304 movu m3, [r2 + r3]
1305 movu m4, [r2 + 2 * r3]
1306 lea r2, [r2 + 2 * r3]
1307 movu m5, [r2 + r3]
1308
; each pack yields two 8-pixel rows in one register
1309 packuswb m0, m1
1310 packuswb m2, m3
1311 packuswb m4, m5
1312
1313 movlps [r0], m0
1314 movhps [r0 + r1], m0
1315 movlps [r0 + 2 * r1], m2
1316 lea r0, [r0 + 2 * r1]
1317 movhps [r0 + r1], m2
1318 movlps [r0 + 2 * r1], m4
1319 lea r0, [r0 + 2 * r1]
1320 movhps [r0 + r1], m4
1321
1322 RET
1323
1324 ;-----------------------------------------------------------------------------
1325 ; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1326 ;-----------------------------------------------------------------------------
; Narrow an 8x8 int16_t block to pixels, fully unrolled (8 loads,
; 4 packs, 8 half-register stores).
1327 INIT_XMM sse2
1328 cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
1329
; element stride -> byte stride
1330 add r3, r3
1331
1332 movu m0, [r2]
1333 movu m1, [r2 + r3]
1334 movu m2, [r2 + 2 * r3]
1335 lea r2, [r2 + 2 * r3]
1336 movu m3, [r2 + r3]
1337 movu m4, [r2 + 2 * r3]
1338 lea r2, [r2 + 2 * r3]
1339 movu m5, [r2 + r3]
1340 movu m6, [r2 + 2 * r3]
1341 lea r2, [r2 + 2 * r3]
1342 movu m7, [r2 + r3]
1343
; each pack yields two 8-pixel rows in one register
1344 packuswb m0, m1
1345 packuswb m2, m3
1346 packuswb m4, m5
1347 packuswb m6, m7
1348
1349 movlps [r0], m0
1350 movhps [r0 + r1], m0
1351 movlps [r0 + 2 * r1], m2
1352 lea r0, [r0 + 2 * r1]
1353 movhps [r0 + r1], m2
1354 movlps [r0 + 2 * r1], m4
1355 lea r0, [r0 + 2 * r1]
1356 movhps [r0 + r1], m4
1357 movlps [r0 + 2 * r1], m6
1358 lea r0, [r0 + 2 * r1]
1359 movhps [r0 + r1], m6
1360
1361 RET
1362
1363 ;-----------------------------------------------------------------------------
1364 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1365 ;-----------------------------------------------------------------------------
; Narrow an 8-wide int16_t block to pixels, four rows per iteration.
1366 %macro BLOCKCOPY_SP_W8_H4 2
1367 INIT_XMM sse2
1368 cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
; element stride -> byte stride
1369 add r3, r3
; r4d = number of four-row iterations
1370 mov r4d, %2/4
1371 .loop:
1372 movu m0, [r2]
1373 movu m1, [r2 + r3]
1374 lea r2, [r2 + r3 * 2]
1375 movu m2, [r2]
1376 movu m3, [r2 + r3]
; dec here: lea/pack/stores below leave ZF intact for the jnz
1377 dec r4d
1378 lea r2, [r2 + r3 * 2]
; two 8-pixel rows per pack
1379 packuswb m0, m1
1380 packuswb m2, m3
1381 movlps [r0], m0
1382 movhps [r0 + r1], m0
1383 lea r0, [r0 + r1 * 2]
1384 movlps [r0], m2
1385 movhps [r0 + r1], m2
1386 lea r0, [r0 + r1 * 2]
1387 jnz .loop
1388 RET
1389 %endmacro
1390
1391 BLOCKCOPY_SP_W8_H4 8, 12
1392
1393 ;-----------------------------------------------------------------------------
1394 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1395 ;-----------------------------------------------------------------------------
; Narrow an 8-wide int16_t block to pixels, eight rows per iteration.
1396 %macro BLOCKCOPY_SP_W8_H8 2
1397 INIT_XMM sse2
1398 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1399
; r4d = number of eight-row iterations
1400 mov r4d, %2/8
1401
; element stride -> byte stride
1402 add r3, r3
1403
1404 .loop:
1405 movu m0, [r2]
1406 movu m1, [r2 + r3]
1407 movu m2, [r2 + 2 * r3]
1408 lea r2, [r2 + 2 * r3]
1409 movu m3, [r2 + r3]
1410 movu m4, [r2 + 2 * r3]
1411 lea r2, [r2 + 2 * r3]
1412 movu m5, [r2 + r3]
1413 movu m6, [r2 + 2 * r3]
1414 lea r2, [r2 + 2 * r3]
1415 movu m7, [r2 + r3]
1416
; two 8-pixel rows per pack
1417 packuswb m0, m1
1418 packuswb m2, m3
1419 packuswb m4, m5
1420 packuswb m6, m7
1421
1422 movlps [r0], m0
1423 movhps [r0 + r1], m0
1424 movlps [r0 + 2 * r1], m2
1425 lea r0, [r0 + 2 * r1]
1426 movhps [r0 + r1], m2
1427 movlps [r0 + 2 * r1], m4
1428 lea r0, [r0 + 2 * r1]
1429 movhps [r0 + r1], m4
1430 movlps [r0 + 2 * r1], m6
1431 lea r0, [r0 + 2 * r1]
1432 movhps [r0 + r1], m6
1433
; advance both pointers past the final row pair of this group
1434 lea r0, [r0 + 2 * r1]
1435 lea r2, [r2 + 2 * r3]
1436
1437 dec r4d
1438 jnz .loop
1439
1440 RET
1441 %endmacro
1442
1443 BLOCKCOPY_SP_W8_H8 8, 16
1444 BLOCKCOPY_SP_W8_H8 8, 32
1445
1446 BLOCKCOPY_SP_W8_H8 8, 64
1447
1448 ;-----------------------------------------------------------------------------
1449 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1450 ;-----------------------------------------------------------------------------
; Narrow a 12-wide int16_t block to pixels, four rows per iteration.
; Each row loads 16 coefficients (two xmm) but stores only 12 pixels:
; an 8-byte movh plus a 4-byte movd of packed dword 2.
1451 %macro BLOCKCOPY_SP_W12_H4 2
1452 INIT_XMM sse2
1453 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1454
; r4d = number of four-row iterations
1455 mov r4d, %2/4
1456
; element stride -> byte stride
1457 add r3, r3
1458
1459 .loop:
1460 movu m0, [r2]
1461 movu m1, [r2 + 16]
1462 movu m2, [r2 + r3]
1463 movu m3, [r2 + r3 + 16]
1464 movu m4, [r2 + 2 * r3]
1465 movu m5, [r2 + 2 * r3 + 16]
1466 lea r2, [r2 + 2 * r3]
1467 movu m6, [r2 + r3]
1468 movu m7, [r2 + r3 + 16]
1469
; one full 16-pixel row per pack (only 12 are stored)
1470 packuswb m0, m1
1471 packuswb m2, m3
1472 packuswb m4, m5
1473 packuswb m6, m7
1474
; pshufd imm 2 moves packed dword 2 (pixels 8-11) into dword 0
1475 movh [r0], m0
1476 pshufd m0, m0, 2
1477 movd [r0 + 8], m0
1478
1479 movh [r0 + r1], m2
1480 pshufd m2, m2, 2
1481 movd [r0 + r1 + 8], m2
1482
1483 movh [r0 + 2 * r1], m4
1484 pshufd m4, m4, 2
1485 movd [r0 + 2 * r1 + 8], m4
1486
1487 lea r0, [r0 + 2 * r1]
1488 movh [r0 + r1], m6
1489 pshufd m6, m6, 2
1490 movd [r0 + r1 + 8], m6
1491
1492 lea r0, [r0 + 2 * r1]
1493 lea r2, [r2 + 2 * r3]
1494
1495 dec r4d
1496 jnz .loop
1497
1498 RET
1499 %endmacro
1500
1501 BLOCKCOPY_SP_W12_H4 12, 16
1502
1503 BLOCKCOPY_SP_W12_H4 12, 32
1504
1505 ;-----------------------------------------------------------------------------
1506 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1507 ;-----------------------------------------------------------------------------
; Narrow a 16-wide int16_t block to pixels, four rows per iteration.
; Each row = two xmm loads (32 bytes of int16) -> one 16-byte store.
1508 %macro BLOCKCOPY_SP_W16_H4 2
1509 INIT_XMM sse2
1510 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1511
; r4d = number of four-row iterations
1512 mov r4d, %2/4
1513
; element stride -> byte stride
1514 add r3, r3
1515
1516 .loop:
1517 movu m0, [r2]
1518 movu m1, [r2 + 16]
1519 movu m2, [r2 + r3]
1520 movu m3, [r2 + r3 + 16]
1521 movu m4, [r2 + 2 * r3]
1522 movu m5, [r2 + 2 * r3 + 16]
1523 lea r2, [r2 + 2 * r3]
1524 movu m6, [r2 + r3]
1525 movu m7, [r2 + r3 + 16]
1526
; int16 -> uint8, unsigned saturation; one row per register
1527 packuswb m0, m1
1528 packuswb m2, m3
1529 packuswb m4, m5
1530 packuswb m6, m7
1531
1532 movu [r0], m0
1533 movu [r0 + r1], m2
1534 movu [r0 + 2 * r1], m4
1535 lea r0, [r0 + 2 * r1]
1536 movu [r0 + r1], m6
1537
1538 lea r0, [r0 + 2 * r1]
1539 lea r2, [r2 + 2 * r3]
1540
1541 dec r4d
1542 jnz .loop
1543
1544 RET
1545 %endmacro
1546
1547 BLOCKCOPY_SP_W16_H4 16, 4
1548 BLOCKCOPY_SP_W16_H4 16, 8
1549 BLOCKCOPY_SP_W16_H4 16, 12
1550 BLOCKCOPY_SP_W16_H4 16, 16
1551 BLOCKCOPY_SP_W16_H4 16, 32
1552 BLOCKCOPY_SP_W16_H4 16, 64
1553
1554 BLOCKCOPY_SP_W16_H4 16, 24
1555
1556 ;-----------------------------------------------------------------------------
1557 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1558 ;-----------------------------------------------------------------------------
; Narrow a 24-wide int16_t block to pixels, two rows per iteration.
; 48 packed bytes span three xmm: m2 straddles the row boundary
; (low half = tail of row 0, high half = head of row 1).
1559 %macro BLOCKCOPY_SP_W24_H2 2
1560 INIT_XMM sse2
1561 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
1562
; r4d = number of two-row iterations
1563 mov r4d, %2/2
1564
; element stride -> byte stride
1565 add r3, r3
1566
1567 .loop:
1568 movu m0, [r2]
1569 movu m1, [r2 + 16]
1570 movu m2, [r2 + 32]
1571 movu m3, [r2 + r3]
1572 movu m4, [r2 + r3 + 16]
1573 movu m5, [r2 + r3 + 32]
1574
1575 packuswb m0, m1
1576 packuswb m2, m3
1577 packuswb m4, m5
1578
; row 0: 16 + 8 bytes; row 1: 8 + 16 bytes
1579 movu [r0], m0
1580 movlps [r0 + 16], m2
1581 movhps [r0 + r1], m2
1582 movu [r0 + r1 + 8], m4
1583
1584 lea r0, [r0 + 2 * r1]
1585 lea r2, [r2 + 2 * r3]
1586
1587 dec r4d
1588 jnz .loop
1589
1590 RET
1591 %endmacro
1592
1593 BLOCKCOPY_SP_W24_H2 24, 32
1594
1595 BLOCKCOPY_SP_W24_H2 24, 64
1596
1597 ;-----------------------------------------------------------------------------
1598 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1599 ;-----------------------------------------------------------------------------
; Narrow a 32-wide int16_t block to pixels, two rows per iteration
; (four xmm loads -> two 16-byte stores per row).
1600 %macro BLOCKCOPY_SP_W32_H2 2
1601 INIT_XMM sse2
1602 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1603
; r4d = number of two-row iterations
1604 mov r4d, %2/2
1605
; element stride -> byte stride
1606 add r3, r3
1607
1608 .loop:
1609 movu m0, [r2]
1610 movu m1, [r2 + 16]
1611 movu m2, [r2 + 32]
1612 movu m3, [r2 + 48]
1613 movu m4, [r2 + r3]
1614 movu m5, [r2 + r3 + 16]
1615 movu m6, [r2 + r3 + 32]
1616 movu m7, [r2 + r3 + 48]
1617
1618 packuswb m0, m1
1619 packuswb m2, m3
1620 packuswb m4, m5
1621 packuswb m6, m7
1622
1623 movu [r0], m0
1624 movu [r0 + 16], m2
1625 movu [r0 + r1], m4
1626 movu [r0 + r1 + 16], m6
1627
1628 lea r0, [r0 + 2 * r1]
1629 lea r2, [r2 + 2 * r3]
1630
1631 dec r4d
1632 jnz .loop
1633
1634 RET
1635 %endmacro
1636
1637 BLOCKCOPY_SP_W32_H2 32, 8
1638 BLOCKCOPY_SP_W32_H2 32, 16
1639 BLOCKCOPY_SP_W32_H2 32, 24
1640 BLOCKCOPY_SP_W32_H2 32, 32
1641 BLOCKCOPY_SP_W32_H2 32, 64
1642
1643 BLOCKCOPY_SP_W32_H2 32, 48
1644
1645 ;-----------------------------------------------------------------------------
1646 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1647 ;-----------------------------------------------------------------------------
; Narrow a 48-wide int16_t block to pixels.  NOTE: despite the _H2
; name, this processes ONE row per iteration (r4d is loaded with the
; full height %2 and each pass advances by a single stride).
1648 %macro BLOCKCOPY_SP_W48_H2 2
1649 INIT_XMM sse2
1650 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
1651
; r4d = row count (one row per iteration)
1652 mov r4d, %2
1653
; element stride -> byte stride
1654 add r3, r3
1655
1656 .loop:
; 96 source bytes = 48 int16 coefficients for one row
1657 movu m0, [r2]
1658 movu m1, [r2 + 16]
1659 movu m2, [r2 + 32]
1660 movu m3, [r2 + 48]
1661 movu m4, [r2 + 64]
1662 movu m5, [r2 + 80]
1663
1664 packuswb m0, m1
1665 packuswb m2, m3
1666 packuswb m4, m5
1667
1668 movu [r0], m0
1669 movu [r0 + 16], m2
1670 movu [r0 + 32], m4
1671
1672 lea r0, [r0 + r1]
1673 lea r2, [r2 + r3]
1674
1675 dec r4d
1676 jnz .loop
1677
1678 RET
1679 %endmacro
1680
1681 BLOCKCOPY_SP_W48_H2 48, 64
1682
1683 ;-----------------------------------------------------------------------------
1684 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1685 ;-----------------------------------------------------------------------------
; Narrow a 64-wide int16_t block to pixels, one row per iteration
; (eight xmm loads -> four 16-byte stores).
1686 %macro BLOCKCOPY_SP_W64_H1 2
1687 INIT_XMM sse2
1688 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1689
; r4d = row count
1690 mov r4d, %2
1691
; element stride -> byte stride
1692 add r3, r3
1693
1694 .loop:
; 128 source bytes = 64 int16 coefficients for one row
1695 movu m0, [r2]
1696 movu m1, [r2 + 16]
1697 movu m2, [r2 + 32]
1698 movu m3, [r2 + 48]
1699 movu m4, [r2 + 64]
1700 movu m5, [r2 + 80]
1701 movu m6, [r2 + 96]
1702 movu m7, [r2 + 112]
1703
1704 packuswb m0, m1
1705 packuswb m2, m3
1706 packuswb m4, m5
1707 packuswb m6, m7
1708
1709 movu [r0], m0
1710 movu [r0 + 16], m2
1711 movu [r0 + 32], m4
1712 movu [r0 + 48], m6
1713
1714 lea r0, [r0 + r1]
1715 lea r2, [r2 + r3]
1716
1717 dec r4d
1718 jnz .loop
1719
1720 RET
1721 %endmacro
1722
1723 BLOCKCOPY_SP_W64_H1 64, 16
1724 BLOCKCOPY_SP_W64_H1 64, 32
1725 BLOCKCOPY_SP_W64_H1 64, 48
1726 BLOCKCOPY_SP_W64_H1 64, 64
1727
1728 ;-----------------------------------------------------------------------------
1729 ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
1730 ;-----------------------------------------------------------------------------
; Fill a 4x4 int16_t block with 'val' (r2d).  dstStride (r1) is in
; elements, so it is doubled to bytes.
1731 INIT_XMM sse2
1732 cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
1733
1734 add r1, r1
1735
; broadcast val into the low four words (8 bytes = one 4-element row)
1736 movd m0, r2d
1737 pshuflw m0, m0, 0
1738
1739 movh [r0], m0
1740 movh [r0 + r1], m0
1741 movh [r0 + 2 * r1], m0
1742 lea r0, [r0 + 2 * r1]
1743 movh [r0 + r1], m0
1744
1745 RET
1746
1747 ;-----------------------------------------------------------------------------
1748 ; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val)
1749 ;-----------------------------------------------------------------------------
; Fill an 8x8 int16_t block with 'val' (r2d), one 16-byte store per row.
1750 INIT_XMM sse2
1751 cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
1752
; element stride -> byte stride
1753 add r1, r1
1754
; broadcast val across all eight words of m0
1755 movd m0, r2d
1756 pshuflw m0, m0, 0
1757 pshufd m0, m0, 0
1758
1759 movu [r0], m0
1760 movu [r0 + r1], m0
1761 movu [r0 + 2 * r1], m0
1762
1763 lea r0, [r0 + 2 * r1]
1764 movu [r0 + r1], m0
1765 movu [r0 + 2 * r1], m0
1766
1767 lea r0, [r0 + 2 * r1]
1768 movu [r0 + r1], m0
1769 movu [r0 + 2 * r1], m0
1770
1771 lea r0, [r0 + 2 * r1]
1772 movu [r0 + r1], m0
1773
1774 RET
1775
1776 ;-----------------------------------------------------------------------------
1777 ; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
1778 ;-----------------------------------------------------------------------------
; Fill a 16-wide int16_t block with 'val', eight rows (two 16-byte
; stores each) per iteration; r4 holds intermediate row addresses.
1779 %macro BLOCKFILL_S_W16_H8 2
1780 INIT_XMM sse2
1781 cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
1782
; r3d = number of eight-row iterations
1783 mov r3d, %2/8
1784
; element stride -> byte stride
1785 add r1, r1
1786
; broadcast val across all eight words of m0
1787 movd m0, r2d
1788 pshuflw m0, m0, 0
1789 pshufd m0, m0, 0
1790
1791 .loop:
; rows 0-2 addressed directly from r0
1792 movu [r0], m0
1793 movu [r0 + 16], m0
1794
1795 movu [r0 + r1], m0
1796 movu [r0 + r1 + 16], m0
1797
1798 movu [r0 + 2 * r1], m0
1799 movu [r0 + 2 * r1 + 16], m0
1800
; row 3 via r4 = r0 + 2*stride
1801 lea r4, [r0 + 2 * r1]
1802 movu [r4 + r1], m0
1803 movu [r4 + r1 + 16], m0
1804
; row 4 directly, rows 5-7 via r4 = r0 + 4*stride
1805 movu [r0 + 4 * r1], m0
1806 movu [r0 + 4 * r1 + 16], m0
1807
1808 lea r4, [r0 + 4 * r1]
1809 movu [r4 + r1], m0
1810 movu [r4 + r1 + 16], m0
1811
1812 movu [r4 + 2 * r1], m0
1813 movu [r4 + 2 * r1 + 16], m0
1814
1815 lea r4, [r4 + 2 * r1]
1816 movu [r4 + r1], m0
1817 movu [r4 + r1 + 16], m0
1818
1819 lea r0, [r0 + 8 * r1]
1820
1821 dec r3d
1822 jnz .loop
1823
1824 RET
1825 %endmacro
1826
1827 BLOCKFILL_S_W16_H8 16, 16
1828
; AVX2 fill of a 16x16 int16_t block: one 32-byte ymm store covers a
; whole row; fully unrolled as four groups of four rows (r3 = 3*stride).
1829 INIT_YMM avx2
1830 cglobal blockfill_s_16x16, 3, 4, 1
; element stride -> byte stride
1831 add r1, r1
1832 lea r3, [3 * r1]
; broadcast val to all 16 words of the ymm register
1833 movd xm0, r2d
1834 vpbroadcastw m0, xm0
1835
1836 movu [r0], m0
1837 movu [r0 + r1], m0
1838 movu [r0 + 2 * r1], m0
1839 movu [r0 + r3], m0
1840 lea r0, [r0 + 4 * r1]
1841 movu [r0], m0
1842 movu [r0 + r1], m0
1843 movu [r0 + 2 * r1], m0
1844 movu [r0 + r3], m0
1845 lea r0, [r0 + 4 * r1]
1846 movu [r0], m0
1847 movu [r0 + r1], m0
1848 movu [r0 + 2 * r1], m0
1849 movu [r0 + r3], m0
1850 lea r0, [r0 + 4 * r1]
1851 movu [r0], m0
1852 movu [r0 + r1], m0
1853 movu [r0 + 2 * r1], m0
1854 movu [r0 + r3], m0
1855 RET
1856
1857 ;-----------------------------------------------------------------------------
1858 ; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
1859 ;-----------------------------------------------------------------------------
; Fill a 32-wide int16_t block with 'val', four rows (four 16-byte
; stores each) per iteration.
1860 %macro BLOCKFILL_S_W32_H4 2
1861 INIT_XMM sse2
1862 cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
1863
; r3d = number of four-row iterations
1864 mov r3d, %2/4
1865
; element stride -> byte stride
1866 add r1, r1
1867
; broadcast val across all eight words of m0
1868 movd m0, r2d
1869 pshuflw m0, m0, 0
1870 pshufd m0, m0, 0
1871
1872 .loop:
1873 movu [r0], m0
1874 movu [r0 + 16], m0
1875 movu [r0 + 32], m0
1876 movu [r0 + 48], m0
1877
1878 movu [r0 + r1], m0
1879 movu [r0 + r1 + 16], m0
1880 movu [r0 + r1 + 32], m0
1881 movu [r0 + r1 + 48], m0
1882
1883 movu [r0 + 2 * r1], m0
1884 movu [r0 + 2 * r1 + 16], m0
1885 movu [r0 + 2 * r1 + 32], m0
1886 movu [r0 + 2 * r1 + 48], m0
1887
; row 3 via r4 = r0 + 2*stride
1888 lea r4, [r0 + 2 * r1]
1889
1890 movu [r4 + r1], m0
1891 movu [r4 + r1 + 16], m0
1892 movu [r4 + r1 + 32], m0
1893 movu [r4 + r1 + 48], m0
1894
1895 lea r0, [r0 + 4 * r1]
1896
1897 dec r3d
1898 jnz .loop
1899
1900 RET
1901 %endmacro
1902
1903 BLOCKFILL_S_W32_H4 32, 32
1904
; AVX2 fill of a 32x32 int16_t block: two 32-byte ymm stores per row
; (64 bytes = 32 words); fully unrolled as eight groups of four rows,
; r3 = 3*stride for the fourth row of each group.
1905 INIT_YMM avx2
1906 cglobal blockfill_s_32x32, 3, 4, 1
; element stride -> byte stride
1907 add r1, r1
1908 lea r3, [3 * r1]
; broadcast val to all 16 words of the ymm register
1909 movd xm0, r2d
1910 vpbroadcastw m0, xm0
1911
1912 movu [r0], m0
1913 movu [r0 + 32], m0
1914 movu [r0 + r1], m0
1915 movu [r0 + r1 + 32], m0
1916 movu [r0 + 2 * r1], m0
1917 movu [r0 + 2 * r1 + 32], m0
1918 movu [r0 + r3], m0
1919 movu [r0 + r3 + 32], m0
1920 lea r0, [r0 + 4 * r1]
1921 movu [r0], m0
1922 movu [r0 + 32], m0
1923 movu [r0 + r1], m0
1924 movu [r0 + r1 + 32], m0
1925 movu [r0 + 2 * r1], m0
1926 movu [r0 + 2 * r1 + 32], m0
1927 movu [r0 + r3], m0
1928 movu [r0 + r3 + 32], m0
1929 lea r0, [r0 + 4 * r1]
1930 movu [r0], m0
1931 movu [r0 + 32], m0
1932 movu [r0 + r1], m0
1933 movu [r0 + r1 + 32], m0
1934 movu [r0 + 2 * r1], m0
1935 movu [r0 + 2 * r1 + 32], m0
1936 movu [r0 + r3], m0
1937 movu [r0 + r3 + 32], m0
1938 lea r0, [r0 + 4 * r1]
1939 movu [r0], m0
1940 movu [r0 + 32], m0
1941 movu [r0 + r1], m0
1942 movu [r0 + r1 + 32], m0
1943 movu [r0 + 2 * r1], m0
1944 movu [r0 + 2 * r1 + 32], m0
1945 movu [r0 + r3], m0
1946 movu [r0 + r3 + 32], m0
1947 lea r0, [r0 + 4 * r1]
1948 movu [r0], m0
1949 movu [r0 + 32], m0
1950 movu [r0 + r1], m0
1951 movu [r0 + r1 + 32], m0
1952 movu [r0 + 2 * r1], m0
1953 movu [r0 + 2 * r1 + 32], m0
1954 movu [r0 + r3], m0
1955 movu [r0 + r3 + 32], m0
1956 lea r0, [r0 + 4 * r1]
1957 movu [r0], m0
1958 movu [r0 + 32], m0
1959 movu [r0 + r1], m0
1960 movu [r0 + r1 + 32], m0
1961 movu [r0 + 2 * r1], m0
1962 movu [r0 + 2 * r1 + 32], m0
1963 movu [r0 + r3], m0
1964 movu [r0 + r3 + 32], m0
1965 lea r0, [r0 + 4 * r1]
1966 movu [r0], m0
1967 movu [r0 + 32], m0
1968 movu [r0 + r1], m0
1969 movu [r0 + r1 + 32], m0
1970 movu [r0 + 2 * r1], m0
1971 movu [r0 + 2 * r1 + 32], m0
1972 movu [r0 + r3], m0
1973 movu [r0 + r3 + 32], m0
1974 lea r0, [r0 + 4 * r1]
1975 movu [r0], m0
1976 movu [r0 + 32], m0
1977 movu [r0 + r1], m0
1978 movu [r0 + r1 + 32], m0
1979 movu [r0 + 2 * r1], m0
1980 movu [r0 + 2 * r1 + 32], m0
1981 movu [r0 + r3], m0
1982 movu [r0 + r3 + 32], m0
1983 RET
1984
1985 ;-----------------------------------------------------------------------------
1986 ; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
1987 ;-----------------------------------------------------------------------------
; Widen a 2x4 pixel block to int16_t.  Each movd load reads 4 bytes
; (2 needed); each movd store writes 4 bytes = 2 widened pixels.
1988 INIT_XMM sse4
1989 cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
1990
; dstStride counts int16_t elements -> convert to a byte stride
1991 add r1, r1
1992
; pmovzxbw zero-extends u8 pixels to 16-bit
1993 movd m0, [r2]
1994 pmovzxbw m0, m0
1995 movd [r0], m0
1996
1997 movd m0, [r2 + r3]
1998 pmovzxbw m0, m0
1999 movd [r0 + r1], m0
2000
2001 movd m0, [r2 + 2 * r3]
2002 pmovzxbw m0, m0
2003 movd [r0 + 2 * r1], m0
2004
2005 lea r2, [r2 + 2 * r3]
2006 lea r0, [r0 + 2 * r1]
2007
2008 movd m0, [r2 + r3]
2009 pmovzxbw m0, m0
2010 movd [r0 + r1], m0
2011
2012 RET
2013
2014
2015 ;-----------------------------------------------------------------------------
2016 ; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2017 ;-----------------------------------------------------------------------------
; Widen a 2x8 pixel block to int16_t, fully unrolled
; (same per-row pattern as blockcopy_ps_2x4).
2018 INIT_XMM sse4
2019 cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
2020
; element stride -> byte stride
2021 add r1, r1
2022
2023 movd m0, [r2]
2024 pmovzxbw m0, m0
2025 movd [r0], m0
2026
2027 movd m0, [r2 + r3]
2028 pmovzxbw m0, m0
2029 movd [r0 + r1], m0
2030
2031 movd m0, [r2 + 2 * r3]
2032 pmovzxbw m0, m0
2033 movd [r0 + 2 * r1], m0
2034
2035 lea r2, [r2 + 2 * r3]
2036 lea r0, [r0 + 2 * r1]
2037
2038 movd m0, [r2 + r3]
2039 pmovzxbw m0, m0
2040 movd [r0 + r1], m0
2041
2042 movd m0, [r2 + 2 * r3]
2043 pmovzxbw m0, m0
2044 movd [r0 + 2 * r1], m0
2045
2046 lea r2, [r2 + 2 * r3]
2047 lea r0, [r0 + 2 * r1]
2048
2049 movd m0, [r2 + r3]
2050 pmovzxbw m0, m0
2051 movd [r0 + r1], m0
2052
2053 movd m0, [r2 + 2 * r3]
2054 pmovzxbw m0, m0
2055 movd [r0 + 2 * r1], m0
2056
2057 lea r2, [r2 + 2 * r3]
2058 lea r0, [r0 + 2 * r1]
2059
2060 movd m0, [r2 + r3]
2061 pmovzxbw m0, m0
2062 movd [r0 + r1], m0
2063
2064 RET
2065
2066
2067 ;-----------------------------------------------------------------------------
2068 ; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2069 ;-----------------------------------------------------------------------------
; Widen a 2x16 pixel block to int16_t, two rows per loop iteration.
2070 INIT_XMM sse4
2071 cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
; element stride -> byte stride
2072 add r1, r1
; r4d = 8 two-row iterations
2073 mov r4d, 16/2
2074 .loop:
2075 movd m0, [r2]
2076 movd m1, [r2 + r3]
; dec early; lea/pmovzxbw/movd below leave ZF intact for the jnz
2077 dec r4d
2078 lea r2, [r2 + r3 * 2]
2079 pmovzxbw m0, m0
2080 pmovzxbw m1, m1
2081 movd [r0], m0
2082 movd [r0 + r1], m1
2083 lea r0, [r0 + r1 * 2]
2084 jnz .loop
2085 RET
2086
2087
2088 ;-----------------------------------------------------------------------------
2089 ; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2090 ;-----------------------------------------------------------------------------
; Widen a 4x2 pixel block to int16_t: 4-byte load -> 8-byte store per row.
2091 INIT_XMM sse4
2092 cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
2093
; element stride -> byte stride
2094 add r1, r1
2095
2096 movd m0, [r2]
2097 pmovzxbw m0, m0
2098 movh [r0], m0
2099
2100 movd m0, [r2 + r3]
2101 pmovzxbw m0, m0
2102 movh [r0 + r1], m0
2103
2104 RET
2105
2106
2107 ;-----------------------------------------------------------------------------
2108 ; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2109 ;-----------------------------------------------------------------------------
; Widen a 4x4 pixel block to int16_t: 4-byte load -> 8-byte store per row.
2110 INIT_XMM sse4
2111 cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
2112
; element stride -> byte stride
2113 add r1, r1
2114
2115 movd m0, [r2]
2116 pmovzxbw m0, m0
2117 movh [r0], m0
2118
2119 movd m0, [r2 + r3]
2120 pmovzxbw m0, m0
2121 movh [r0 + r1], m0
2122
2123 movd m0, [r2 + 2 * r3]
2124 pmovzxbw m0, m0
2125 movh [r0 + 2 * r1], m0
2126
2127 lea r2, [r2 + 2 * r3]
2128 lea r0, [r0 + 2 * r1]
2129
2130 movd m0, [r2 + r3]
2131 pmovzxbw m0, m0
2132 movh [r0 + r1], m0
2133
2134 RET
2135
2136
2137 ;-----------------------------------------------------------------------------
2138 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2139 ;-----------------------------------------------------------------------------
; Widen a 4-wide pixel block to int16_t, four rows per iteration.
2140 %macro BLOCKCOPY_PS_W4_H4 2
2141 INIT_XMM sse4
2142 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
2143
; element stride -> byte stride; r4d = number of four-row iterations
2144 add r1, r1
2145 mov r4d, %2/4
2146
2147 .loop:
2148 movd m0, [r2]
2149 pmovzxbw m0, m0
2150 movh [r0], m0
2151
2152 movd m0, [r2 + r3]
2153 pmovzxbw m0, m0
2154 movh [r0 + r1], m0
2155
2156 movd m0, [r2 + 2 * r3]
2157 pmovzxbw m0, m0
2158 movh [r0 + 2 * r1], m0
2159
2160 lea r2, [r2 + 2 * r3]
2161 lea r0, [r0 + 2 * r1]
2162
2163 movd m0, [r2 + r3]
2164 pmovzxbw m0, m0
2165 movh [r0 + r1], m0
2166
2167 lea r0, [r0 + 2 * r1]
2168 lea r2, [r2 + 2 * r3]
2169
2170 dec r4d
2171 jnz .loop
2172
2173 RET
2174 %endmacro
2175
2176 BLOCKCOPY_PS_W4_H4 4, 8
2177 BLOCKCOPY_PS_W4_H4 4, 16
2178
2179 BLOCKCOPY_PS_W4_H4 4, 32
2180
2181
2182 ;-----------------------------------------------------------------------------
2183 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2184 ;-----------------------------------------------------------------------------
; Widen a 6-wide pixel block to int16_t, four rows per iteration.
; Per row: 8-byte movh store (pixels 0-3) + pextrd of word-pair 4-5.
2185 %macro BLOCKCOPY_PS_W6_H4 2
2186 INIT_XMM sse4
2187 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
2188
; element stride -> byte stride; r4d = number of four-row iterations
2189 add r1, r1
2190 mov r4d, %2/4
2191
2192 .loop:
2193 movh m0, [r2]
2194 pmovzxbw m0, m0
2195 movh [r0], m0
2196 pextrd [r0 + 8], m0, 2
2197
2198 movh m0, [r2 + r3]
2199 pmovzxbw m0, m0
2200 movh [r0 + r1], m0
2201 pextrd [r0 + r1 + 8], m0, 2
2202
2203 movh m0, [r2 + 2 * r3]
2204 pmovzxbw m0, m0
2205 movh [r0 + 2 * r1], m0
2206 pextrd [r0 + 2 * r1 + 8], m0, 2
2207
2208 lea r2, [r2 + 2 * r3]
2209 lea r0, [r0 + 2 * r1]
2210
2211 movh m0, [r2 + r3]
2212 pmovzxbw m0, m0
2213 movh [r0 + r1], m0
2214 pextrd [r0 + r1 + 8], m0, 2
2215
2216 lea r0, [r0 + 2 * r1]
2217 lea r2, [r2 + 2 * r3]
2218
2219 dec r4d
2220 jnz .loop
2221
2222 RET
2223 %endmacro
2224
2225 BLOCKCOPY_PS_W6_H4 6, 8
2226
2227 BLOCKCOPY_PS_W6_H4 6, 16
2228
2229 ;-----------------------------------------------------------------------------
2230 ; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2231 ;-----------------------------------------------------------------------------
; Widen an 8x2 pixel block to int16_t: 8-byte load -> 16-byte store per row.
2232 INIT_XMM sse4
2233 cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
2234
; element stride -> byte stride
2235 add r1, r1
2236
2237 movh m0, [r2]
2238 pmovzxbw m0, m0
2239 movu [r0], m0
2240
2241 movh m0, [r2 + r3]
2242 pmovzxbw m0, m0
2243 movu [r0 + r1], m0
2244
2245 RET
2246
2247 ;-----------------------------------------------------------------------------
2248 ; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2249 ;-----------------------------------------------------------------------------
; Widen an 8x4 pixel block to int16_t: 8-byte load -> 16-byte store per row.
2250 INIT_XMM sse4
2251 cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
2252
; element stride -> byte stride
2253 add r1, r1
2254
2255 movh m0, [r2]
2256 pmovzxbw m0, m0
2257 movu [r0], m0
2258
2259 movh m0, [r2 + r3]
2260 pmovzxbw m0, m0
2261 movu [r0 + r1], m0
2262
2263 movh m0, [r2 + 2 * r3]
2264 pmovzxbw m0, m0
2265 movu [r0 + 2 * r1], m0
2266
2267 lea r2, [r2 + 2 * r3]
2268 lea r0, [r0 + 2 * r1]
2269
2270 movh m0, [r2 + r3]
2271 pmovzxbw m0, m0
2272 movu [r0 + r1], m0
2273
2274 RET
2275
2276 ;-----------------------------------------------------------------------------
2277 ; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2278 ;-----------------------------------------------------------------------------
; Widen an 8x6 pixel block to int16_t: 8-byte load -> 16-byte store per row.
2279 INIT_XMM sse4
2280 cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
2281
; element stride -> byte stride
2282 add r1, r1
2283
2284 movh m0, [r2]
2285 pmovzxbw m0, m0
2286 movu [r0], m0
2287
2288 movh m0, [r2 + r3]
2289 pmovzxbw m0, m0
2290 movu [r0 + r1], m0
2291
2292 movh m0, [r2 + 2 * r3]
2293 pmovzxbw m0, m0
2294 movu [r0 + 2 * r1], m0
2295
2296 lea r2, [r2 + 2 * r3]
2297 lea r0, [r0 + 2 * r1]
2298
2299 movh m0, [r2 + r3]
2300 pmovzxbw m0, m0
2301 movu [r0 + r1], m0
2302
2303 movh m0, [r2 + 2 * r3]
2304 pmovzxbw m0, m0
2305 movu [r0 + 2 * r1], m0
2306
2307 lea r2, [r2 + 2 * r3]
2308 lea r0, [r0 + 2 * r1]
2309
2310 movh m0, [r2 + r3]
2311 pmovzxbw m0, m0
2312 movu [r0 + r1], m0
2313
2314 RET
2315
2316 ;-----------------------------------------------------------------------------
2317 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2318 ;-----------------------------------------------------------------------------
; Widen an 8-wide pixel block to int16_t, four rows per iteration.
2319 %macro BLOCKCOPY_PS_W8_H4 2
2320 INIT_XMM sse4
2321 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
2322
; element stride -> byte stride; r4d = number of four-row iterations
2323 add r1, r1
2324 mov r4d, %2/4
2325
2326 .loop:
2327 movh m0, [r2]
2328 pmovzxbw m0, m0
2329 movu [r0], m0
2330
2331 movh m0, [r2 + r3]
2332 pmovzxbw m0, m0
2333 movu [r0 + r1], m0
2334
2335 movh m0, [r2 + 2 * r3]
2336 pmovzxbw m0, m0
2337 movu [r0 + 2 * r1], m0
2338
2339 lea r2, [r2 + 2 * r3]
2340 lea r0, [r0 + 2 * r1]
2341
2342 movh m0, [r2 + r3]
2343 pmovzxbw m0, m0
2344 movu [r0 + r1], m0
2345
2346 lea r0, [r0 + 2 * r1]
2347 lea r2, [r2 + 2 * r3]
2348
2349 dec r4d
2350 jnz .loop
2351
2352 RET
2353 %endmacro
2354
2355 BLOCKCOPY_PS_W8_H4 8, 8
2356 BLOCKCOPY_PS_W8_H4 8, 16
2357 BLOCKCOPY_PS_W8_H4 8, 32
2358
2359 BLOCKCOPY_PS_W8_H4 8, 12
2360 BLOCKCOPY_PS_W8_H4 8, 64
2361
2362
2363 ;-----------------------------------------------------------------------------
2364 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2365 ;-----------------------------------------------------------------------------
; Widen a 12-wide pixel block to int16_t, two rows per iteration.
; Per row: pmovzxbw widens bytes 0-7 (16-byte store); punpckhbw with
; the zero register m0 widens bytes 8-15, of which 8 bytes (pixels
; 8-11) are stored via movh.  The 16-byte source load overreads 4
; bytes past the 12 needed.
2366 %macro BLOCKCOPY_PS_W12_H2 2
2367 INIT_XMM sse4
2368 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2369
; element stride -> byte stride; r4d = number of two-row iterations
2370 add r1, r1
2371 mov r4d, %2/2
; m0 = zero, the high half for punpckhbw zero-extension
2372 pxor m0, m0
2373
2374 .loop:
2375 movu m1, [r2]
2376 pmovzxbw m2, m1
2377 movu [r0], m2
2378 punpckhbw m1, m0
2379 movh [r0 + 16], m1
2380
2381 movu m1, [r2 + r3]
2382 pmovzxbw m2, m1
2383 movu [r0 + r1], m2
2384 punpckhbw m1, m0
2385 movh [r0 + r1 + 16], m1
2386
2387 lea r0, [r0 + 2 * r1]
2388 lea r2, [r2 + 2 * r3]
2389
2390 dec r4d
2391 jnz .loop
2392
2393 RET
2394 %endmacro
2395
2396 BLOCKCOPY_PS_W12_H2 12, 16
2397
2398 BLOCKCOPY_PS_W12_H2 12, 32
2399
2400 ;-----------------------------------------------------------------------------
2401 ; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2402 ;-----------------------------------------------------------------------------
; Widen a 16x4 pixel block to int16_t, fully unrolled.  Low 8 bytes
; via pmovzxbw, high 8 bytes via punpckhbw with zeroed m0.
2403 INIT_XMM sse4
2404 cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
2405
; element stride -> byte stride; m0 = zero for punpckhbw
2406 add r1, r1
2407 pxor m0, m0
2408
2409 movu m1, [r2]
2410 pmovzxbw m2, m1
2411 movu [r0], m2
2412 punpckhbw m1, m0
2413 movu [r0 + 16], m1
2414
2415 movu m1, [r2 + r3]
2416 pmovzxbw m2, m1
2417 movu [r0 + r1], m2
2418 punpckhbw m1, m0
2419 movu [r0 + r1 + 16], m1
2420
2421 movu m1, [r2 + 2 * r3]
2422 pmovzxbw m2, m1
2423 movu [r0 + 2 * r1], m2
2424 punpckhbw m1, m0
2425 movu [r0 + 2 * r1 + 16], m1
2426
2427 lea r0, [r0 + 2 * r1]
2428 lea r2, [r2 + 2 * r3]
2429
2430 movu m1, [r2 + r3]
2431 pmovzxbw m2, m1
2432 movu [r0 + r1], m2
2433 punpckhbw m1, m0
2434 movu [r0 + r1 + 16], m1
2435
2436 RET
2437
2438 ;-----------------------------------------------------------------------------
2439 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2440 ;-----------------------------------------------------------------------------
; Widen a 16-wide pixel block to int16_t, four rows per iteration.
; Low 8 bytes via pmovzxbw, high 8 via punpckhbw with zeroed m0.
2441 %macro BLOCKCOPY_PS_W16_H4 2
2442 INIT_XMM sse4
2443 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2444
; element stride -> byte stride; r4d = four-row iteration count
2445 add r1, r1
2446 mov r4d, %2/4
; m0 = zero for punpckhbw zero-extension
2447 pxor m0, m0
2448
2449 .loop:
2450 movu m1, [r2]
2451 pmovzxbw m2, m1
2452 movu [r0], m2
2453 punpckhbw m1, m0
2454 movu [r0 + 16], m1
2455
2456 movu m1, [r2 + r3]
2457 pmovzxbw m2, m1
2458 movu [r0 + r1], m2
2459 punpckhbw m1, m0
2460 movu [r0 + r1 + 16], m1
2461
2462 movu m1, [r2 + 2 * r3]
2463 pmovzxbw m2, m1
2464 movu [r0 + 2 * r1], m2
2465 punpckhbw m1, m0
2466 movu [r0 + 2 * r1 + 16], m1
2467
2468 lea r0, [r0 + 2 * r1]
2469 lea r2, [r2 + 2 * r3]
2470
2471 movu m1, [r2 + r3]
2472 pmovzxbw m2, m1
2473 movu [r0 + r1], m2
2474 punpckhbw m1, m0
2475 movu [r0 + r1 + 16], m1
2476
2477 lea r0, [r0 + 2 * r1]
2478 lea r2, [r2 + 2 * r3]
2479
2480 dec r4d
2481 jnz .loop
2482
2483 RET
2484 %endmacro
2485
2486 BLOCKCOPY_PS_W16_H4 16, 8
2487 BLOCKCOPY_PS_W16_H4 16, 12
2488 BLOCKCOPY_PS_W16_H4 16, 16
2489 BLOCKCOPY_PS_W16_H4 16, 32
2490 BLOCKCOPY_PS_W16_H4 16, 64
2491
2492 BLOCKCOPY_PS_W16_H4 16, 24
2493
2494 ;-----------------------------------------------------------------------------
2495 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2496 ;-----------------------------------------------------------------------------
; Widen a 24-wide pixel block to int16_t, two rows per iteration.
; First 16 pixels use pmovzxbw + punpckhbw; the last 8 pixels come
; from an 8-byte movh load widened by a second pmovzxbw.
2497 %macro BLOCKCOPY_PS_W24_H2 2
2498 INIT_XMM sse4
2499 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2500
; element stride -> byte stride; r4d = two-row iteration count
2501 add r1, r1
2502 mov r4d, %2/2
; m0 = zero for punpckhbw zero-extension
2503 pxor m0, m0
2504
2505 .loop:
2506 movu m1, [r2]
2507 pmovzxbw m2, m1
2508 movu [r0], m2
2509 punpckhbw m1, m0
2510 movu [r0 + 16], m1
2511
2512 movh m1, [r2 + 16]
2513 pmovzxbw m1, m1
2514 movu [r0 + 32], m1
2515
2516 movu m1, [r2 + r3]
2517 pmovzxbw m2, m1
2518 movu [r0 + r1], m2
2519 punpckhbw m1, m0
2520 movu [r0 + r1 + 16], m1
2521
2522 movh m1, [r2 + r3 + 16]
2523 pmovzxbw m1, m1
2524 movu [r0 + r1 + 32], m1
2525
2526 lea r0, [r0 + 2 * r1]
2527 lea r2, [r2 + 2 * r3]
2528
2529 dec r4d
2530 jnz .loop
2531
2532 RET
2533 %endmacro
2534
2535 BLOCKCOPY_PS_W24_H2 24, 32
2536
2537 BLOCKCOPY_PS_W24_H2 24, 64
2538
2539 ;-----------------------------------------------------------------------------
2540 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2541 ;-----------------------------------------------------------------------------
; Widen a 32-wide pixel block to int16_t, two rows per iteration:
; two 16-byte source loads per row, each split into low (pmovzxbw)
; and high (punpckhbw with zeroed m0) 8-pixel halves.
2542 %macro BLOCKCOPY_PS_W32_H2 2
2543 INIT_XMM sse4
2544 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2545
; element stride -> byte stride; r4d = two-row iteration count
2546 add r1, r1
2547 mov r4d, %2/2
; m0 = zero for punpckhbw zero-extension
2548 pxor m0, m0
2549
2550 .loop:
2551 movu m1, [r2]
2552 pmovzxbw m2, m1
2553 movu [r0], m2
2554 punpckhbw m1, m0
2555 movu [r0 + 16], m1
2556
2557 movu m1, [r2 + 16]
2558 pmovzxbw m2, m1
2559 movu [r0 + 32], m2
2560 punpckhbw m1, m0
2561 movu [r0 + 48], m1
2562
2563 movu m1, [r2 + r3]
2564 pmovzxbw m2, m1
2565 movu [r0 + r1], m2
2566 punpckhbw m1, m0
2567 movu [r0 + r1 + 16], m1
2568
2569 movu m1, [r2 + r3 + 16]
2570 pmovzxbw m2, m1
2571 movu [r0 + r1 + 32], m2
2572 punpckhbw m1, m0
2573 movu [r0 + r1 + 48], m1
2574
2575 lea r0, [r0 + 2 * r1]
2576 lea r2, [r2 + 2 * r3]
2577
2578 dec r4d
2579 jnz .loop
2580
2581 RET
2582 %endmacro
2583
2584 BLOCKCOPY_PS_W32_H2 32, 8
2585 BLOCKCOPY_PS_W32_H2 32, 16
2586 BLOCKCOPY_PS_W32_H2 32, 24
2587 BLOCKCOPY_PS_W32_H2 32, 32
2588 BLOCKCOPY_PS_W32_H2 32, 64
2589
2590 BLOCKCOPY_PS_W32_H2 32, 48
2591
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
; Widening copy (pixel -> int16) for 48-wide blocks: three 16-pixel groups
; per row, two rows per loop iteration.
; r0=dst, r1=dstStride (bytes), r2=src, r3=srcStride, r4d=row-pair counter
%macro BLOCKCOPY_PS_W48_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride

    add     r1, r1                      ; dst stride: elements -> bytes
    mov     r4d, %2/2
    pxor    m0, m0                      ; zero for punpckhbw widening

.loop:
    movu    m1, [r2]                    ; row 0, pixels 0-15
    pmovzxbw m2, m1
    movu    [r0], m2
    punpckhbw m1, m0
    movu    [r0 + 16], m1

    movu    m1, [r2 + 16]               ; row 0, pixels 16-31
    pmovzxbw m2, m1
    movu    [r0 + 32], m2
    punpckhbw m1, m0
    movu    [r0 + 48], m1

    movu    m1, [r2 + 32]               ; row 0, pixels 32-47
    pmovzxbw m2, m1
    movu    [r0 + 64], m2
    punpckhbw m1, m0
    movu    [r0 + 80], m1

    movu    m1, [r2 + r3]               ; row 1, pixels 0-15
    pmovzxbw m2, m1
    movu    [r0 + r1], m2
    punpckhbw m1, m0
    movu    [r0 + r1 + 16], m1

    movu    m1, [r2 + r3 + 16]          ; row 1, pixels 16-31
    pmovzxbw m2, m1
    movu    [r0 + r1 + 32], m2
    punpckhbw m1, m0
    movu    [r0 + r1 + 48], m1

    movu    m1, [r2 + r3 + 32]          ; row 1, pixels 32-47
    pmovzxbw m2, m1
    movu    [r0 + r1 + 64], m2
    punpckhbw m1, m0
    movu    [r0 + r1 + 80], m1

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec     r4d
    jnz     .loop

    RET
%endmacro

BLOCKCOPY_PS_W48_H2 48, 64
2650
;-----------------------------------------------------------------------------
; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
; Widening copy (pixel -> int16) for 64-wide blocks: four 16-pixel groups
; per row, two rows per loop iteration.
; r0=dst, r1=dstStride (bytes), r2=src, r3=srcStride, r4d=row-pair counter
%macro BLOCKCOPY_PS_W64_H2 2
INIT_XMM sse4
cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride

    add     r1, r1                      ; dst stride: elements -> bytes
    mov     r4d, %2/2
    pxor    m0, m0                      ; zero for punpckhbw widening

.loop:
    movu    m1, [r2]                    ; row 0, pixels 0-15
    pmovzxbw m2, m1
    movu    [r0], m2
    punpckhbw m1, m0
    movu    [r0 + 16], m1

    movu    m1, [r2 + 16]               ; row 0, pixels 16-31
    pmovzxbw m2, m1
    movu    [r0 + 32], m2
    punpckhbw m1, m0
    movu    [r0 + 48], m1

    movu    m1, [r2 + 32]               ; row 0, pixels 32-47
    pmovzxbw m2, m1
    movu    [r0 + 64], m2
    punpckhbw m1, m0
    movu    [r0 + 80], m1

    movu    m1, [r2 + 48]               ; row 0, pixels 48-63
    pmovzxbw m2, m1
    movu    [r0 + 96], m2
    punpckhbw m1, m0
    movu    [r0 + 112], m1

    movu    m1, [r2 + r3]               ; row 1, pixels 0-15
    pmovzxbw m2, m1
    movu    [r0 + r1], m2
    punpckhbw m1, m0
    movu    [r0 + r1 + 16], m1

    movu    m1, [r2 + r3 + 16]          ; row 1, pixels 16-31
    pmovzxbw m2, m1
    movu    [r0 + r1 + 32], m2
    punpckhbw m1, m0
    movu    [r0 + r1 + 48], m1

    movu    m1, [r2 + r3 + 32]          ; row 1, pixels 32-47
    pmovzxbw m2, m1
    movu    [r0 + r1 + 64], m2
    punpckhbw m1, m0
    movu    [r0 + r1 + 80], m1

    movu    m1, [r2 + r3 + 48]          ; row 1, pixels 48-63
    pmovzxbw m2, m1
    movu    [r0 + r1 + 96], m2
    punpckhbw m1, m0
    movu    [r0 + r1 + 112], m1

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec     r4d
    jnz     .loop

    RET
%endmacro

BLOCKCOPY_PS_W64_H2 64, 16
BLOCKCOPY_PS_W64_H2 64, 32
BLOCKCOPY_PS_W64_H2 64, 48
BLOCKCOPY_PS_W64_H2 64, 64
2724
;-----------------------------------------------------------------------------
; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of a 2x4 block; one row = 2 words = 4 bytes, so each
; row is moved through a 32-bit GPR.  Strides are doubled to byte units.
INIT_XMM sse2
cglobal blockcopy_ss_2x4, 4, 6, 0
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3

    mov     r4d, [r2]                   ; rows 0 and 1
    mov     r5d, [r2 + r3]
    mov     [r0], r4d
    mov     [r0 + r1], r5d

    lea     r2, [r2 + r3 * 2]
    lea     r0, [r0 + 2 * r1]

    mov     r4d, [r2]                   ; rows 2 and 3
    mov     r5d, [r2 + r3]
    mov     [r0], r4d
    mov     [r0 + r1], r5d

    RET
2747
;-----------------------------------------------------------------------------
; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of a 2x8 block, fully unrolled; one row = 4 bytes,
; moved through 32-bit GPRs two rows at a time.
INIT_XMM sse2
cglobal blockcopy_ss_2x8, 4, 6, 0
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3

    mov     r4d, [r2]                   ; rows 0-1
    mov     r5d, [r2 + r3]
    mov     [r0], r4d
    mov     [r0 + r1], r5d

    lea     r2, [r2 + r3 * 2]
    lea     r0, [r0 + 2 * r1]

    mov     r4d, [r2]                   ; rows 2-3
    mov     r5d, [r2 + r3]
    mov     [r0], r4d
    mov     [r0 + r1], r5d

    lea     r2, [r2 + r3 * 2]
    lea     r0, [r0 + 2 * r1]

    mov     r4d, [r2]                   ; rows 4-5
    mov     r5d, [r2 + r3]
    mov     [r0], r4d
    mov     [r0 + r1], r5d

    lea     r2, [r2 + r3 * 2]
    lea     r0, [r0 + 2 * r1]

    mov     r4d, [r2]                   ; rows 6-7
    mov     r5d, [r2 + r3]
    mov     [r0], r4d
    mov     [r0 + r1], r5d

    RET
2786
;-----------------------------------------------------------------------------
; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of a 2x16 block: looped, two 4-byte rows per iteration.
; r6d = row-pair counter; dec is scheduled early to separate it from jnz.
INIT_XMM sse2
cglobal blockcopy_ss_2x16, 4, 7, 0
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
    mov     r6d, 16/2
.loop:
    mov     r4d, [r2]
    mov     r5d, [r2 + r3]
    dec     r6d
    lea     r2, [r2 + r3 * 2]
    mov     [r0], r4d
    mov     [r0 + r1], r5d
    lea     r0, [r0 + r1 * 2]
    jnz     .loop
    RET
2805
2806
;-----------------------------------------------------------------------------
; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of a 4x2 block; one row = 8 bytes, moved with movh.
INIT_XMM sse2
cglobal blockcopy_ss_4x2, 4, 4, 2
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3

    movh    m0, [r2]
    movh    m1, [r2 + r3]

    movh    [r0], m0
    movh    [r0 + r1], m1

    RET
2822
;-----------------------------------------------------------------------------
; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of a 4x4 block; all four 8-byte rows are loaded before
; any store to overlap load latency.
INIT_XMM sse2
cglobal blockcopy_ss_4x4, 4, 4, 4
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
    movh    m0, [r2]
    movh    m1, [r2 + r3]
    lea     r2, [r2 + r3 * 2]
    movh    m2, [r2]
    movh    m3, [r2 + r3]

    movh    [r0], m0
    movh    [r0 + r1], m1
    lea     r0, [r0 + 2 * r1]
    movh    [r0], m2
    movh    [r0 + r1], m3
    RET
2842
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy for 4-wide blocks; eight 8-byte rows per loop
; iteration (two unrolled 4-row groups).  r4d = iteration counter.
%macro BLOCKCOPY_SS_W4_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov     r4d, %2/8                   ; 8 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
.loop:
    movh    m0, [r2]                    ; rows 0-3
    movh    m1, [r2 + r3]
    lea     r2, [r2 + r3 * 2]
    movh    m2, [r2]
    movh    m3, [r2 + r3]

    movh    [r0], m0
    movh    [r0 + r1], m1
    lea     r0, [r0 + 2 * r1]
    movh    [r0], m2
    movh    [r0 + r1], m3

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    movh    m0, [r2]                    ; rows 4-7
    movh    m1, [r2 + r3]
    lea     r2, [r2 + r3 * 2]
    movh    m2, [r2]
    movh    m3, [r2 + r3]

    movh    [r0], m0
    movh    [r0 + r1], m1
    lea     r0, [r0 + 2 * r1]
    movh    [r0], m2
    movh    [r0 + r1], m3
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    dec     r4d
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W4_H8 4, 8
BLOCKCOPY_SS_W4_H8 4, 16

BLOCKCOPY_SS_W4_H8 4, 32
2890
;-----------------------------------------------------------------------------
; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of a 6x8 block; one row = 12 bytes, written as an
; 8-byte movh plus a 4-byte movd.  pshufd imm 2 moves source dword 2
; (bytes 8-11) into lane 0 so movd can store it.  The 16-byte movu loads
; read 4 bytes past the 12-byte row (tolerated over-read).
INIT_XMM sse2
cglobal blockcopy_ss_6x8, 4, 4, 4
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3

    movu    m0, [r2]                    ; rows 0-1
    movu    m1, [r2 + r3]
    pshufd  m2, m0, 2                   ; dword 2 -> lane 0
    pshufd  m3, m1, 2
    movh    [r0], m0
    movd    [r0 + 8], m2
    movh    [r0 + r1], m1
    movd    [r0 + r1 + 8], m3

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    movu    m0, [r2]                    ; rows 2-3
    movu    m1, [r2 + r3]
    pshufd  m2, m0, 2
    pshufd  m3, m1, 2
    movh    [r0], m0
    movd    [r0 + 8], m2
    movh    [r0 + r1], m1
    movd    [r0 + r1 + 8], m3

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    movu    m0, [r2]                    ; rows 4-5
    movu    m1, [r2 + r3]
    pshufd  m2, m0, 2
    pshufd  m3, m1, 2
    movh    [r0], m0
    movd    [r0 + 8], m2
    movh    [r0 + r1], m1
    movd    [r0 + r1 + 8], m3

    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]

    movu    m0, [r2]                    ; rows 6-7
    movu    m1, [r2 + r3]
    pshufd  m2, m0, 2
    pshufd  m3, m1, 2
    movh    [r0], m0
    movd    [r0 + 8], m2
    movh    [r0 + r1], m1
    movd    [r0 + r1 + 8], m3

    RET
2945
;-----------------------------------------------------------------------------
; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of a 6x16 block: looped, two 12-byte rows per
; iteration, each row split into an 8-byte movh and a 4-byte movd
; (loads are exact-width here, unlike the 6x8 version's over-reads).
INIT_XMM sse2
cglobal blockcopy_ss_6x16, 4, 5, 4
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
    mov     r4d, 16/2
.loop:
    movh    m0, [r2]
    movd    m2, [r2 + 8]
    movh    m1, [r2 + r3]
    movd    m3, [r2 + r3 + 8]
    dec     r4d
    lea     r2, [r2 + r3 * 2]
    movh    [r0], m0
    movd    [r0 + 8], m2
    movh    [r0 + r1], m1
    movd    [r0 + r1 + 8], m3
    lea     r0, [r0 + r1 * 2]
    jnz     .loop
    RET
2968
2969
;-----------------------------------------------------------------------------
; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of an 8x2 block; one row = 16 bytes = one XMM register.
INIT_XMM sse2
cglobal blockcopy_ss_8x2, 4, 4, 2
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3

    movu    m0, [r2]
    movu    m1, [r2 + r3]

    movu    [r0], m0
    movu    [r0 + r1], m1

    RET
2985
;-----------------------------------------------------------------------------
; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of an 8x4 block; all four 16-byte rows loaded before
; any store to overlap load latency.
INIT_XMM sse2
cglobal blockcopy_ss_8x4, 4, 4, 4
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3

    movu    m0, [r2]
    movu    m1, [r2 + r3]
    lea     r2, [r2 + r3 * 2]
    movu    m2, [r2]
    movu    m3, [r2 + r3]

    movu    [r0], m0
    movu    [r0 + r1], m1
    lea     r0, [r0 + 2 * r1]
    movu    [r0], m2
    movu    [r0 + r1], m3
    RET
3006
;-----------------------------------------------------------------------------
; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of an 8x6 block: a 4-row group followed by the final
; 2-row group, one 16-byte register per row.
INIT_XMM sse2
cglobal blockcopy_ss_8x6, 4, 4, 4

    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
    movu    m0, [r2]                    ; rows 0-3
    movu    m1, [r2 + r3]
    lea     r2, [r2 + r3 * 2]
    movu    m2, [r2]
    movu    m3, [r2 + r3]

    movu    [r0], m0
    movu    [r0 + r1], m1
    lea     r0, [r0 + 2 * r1]
    movu    [r0], m2
    movu    [r0 + r1], m3

    lea     r2, [r2 + r3 * 2]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; rows 4-5
    movu    m1, [r2 + r3]
    movu    [r0], m0
    movu    [r0 + r1], m1
    RET
3035
;-----------------------------------------------------------------------------
; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy of an 8x12 block: looped, two 16-byte rows per
; iteration, r4d = row-pair counter.
INIT_XMM sse2
cglobal blockcopy_ss_8x12, 4, 5, 2
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
    mov     r4d, 12/2
.loop:
    movu    m0, [r2]
    movu    m1, [r2 + r3]
    lea     r2, [r2 + 2 * r3]
    dec     r4d
    movu    [r0], m0
    movu    [r0 + r1], m1
    lea     r0, [r0 + 2 * r1]
    jnz     .loop
    RET
3054
3055
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy for 8-wide blocks; eight 16-byte rows per loop
; iteration (two unrolled 4-row groups).  r4d = iteration counter.
%macro BLOCKCOPY_SS_W8_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov     r4d, %2/8                   ; 8 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
.loop:
    movu    m0, [r2]                    ; rows 0-3
    movu    m1, [r2 + r3]
    lea     r2, [r2 + r3 * 2]
    movu    m2, [r2]
    movu    m3, [r2 + r3]

    movu    [r0], m0
    movu    [r0 + r1], m1
    lea     r0, [r0 + 2 * r1]
    movu    [r0], m2
    movu    [r0 + r1], m3


    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; rows 4-7
    movu    m1, [r2 + r3]
    lea     r2, [r2 + r3 * 2]
    movu    m2, [r2]
    movu    m3, [r2 + r3]

    movu    [r0], m0
    movu    [r0 + r1], m1
    lea     r0, [r0 + 2 * r1]
    movu    [r0], m2
    movu    [r0 + r1], m3

    dec     r4d
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W8_H8 8, 8
BLOCKCOPY_SS_W8_H8 8, 16
BLOCKCOPY_SS_W8_H8 8, 32

BLOCKCOPY_SS_W8_H8 8, 64
3106
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy for 12-wide blocks; one row = 24 bytes, split into a
; 16-byte movu plus an 8-byte movh.  Four rows per loop iteration.
%macro BLOCKCOPY_SS_W12_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4

    mov     r4d, %2/4                   ; 4 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
.loop:
    movu    m0, [r2]                    ; rows 0-1
    movh    m1, [r2 + 16]
    movu    m2, [r2 + r3]
    movh    m3, [r2 + r3 + 16]
    lea     r2, [r2 + 2 * r3]

    movu    [r0], m0
    movh    [r0 + 16], m1
    movu    [r0 + r1], m2
    movh    [r0 + r1 + 16], m3

    lea     r0, [r0 + 2 * r1]
    movu    m0, [r2]                    ; rows 2-3
    movh    m1, [r2 + 16]
    movu    m2, [r2 + r3]
    movh    m3, [r2 + r3 + 16]

    movu    [r0], m0
    movh    [r0 + 16], m1
    movu    [r0 + r1], m2
    movh    [r0 + r1 + 16], m3

    dec     r4d
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W12_H4 12, 16

BLOCKCOPY_SS_W12_H4 12, 32
3150
;-----------------------------------------------------------------------------
; void blockcopy_ss_16x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy for 16-wide blocks; one row = 32 bytes = two XMM
; registers.  Four rows per loop iteration.
%macro BLOCKCOPY_SS_W16_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov     r4d, %2/4                   ; 4 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
.loop:
    movu    m0, [r2]                    ; rows 0-1
    movu    m1, [r2 + 16]
    movu    m2, [r2 + r3]
    movu    m3, [r2 + r3 + 16]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + r1], m2
    movu    [r0 + r1 + 16], m3

    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; rows 2-3
    movu    m1, [r2 + 16]
    movu    m2, [r2 + r3]
    movu    m3, [r2 + r3 + 16]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + r1], m2
    movu    [r0 + r1 + 16], m3

    dec     r4d
    lea     r0, [r0 + 2 * r1]
    lea     r2, [r2 + 2 * r3]
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H4 16, 4
BLOCKCOPY_SS_W16_H4 16, 12
3193
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; AVX variant: one row = 32 bytes = one YMM register, four rows per loop
; iteration.  r5/r6 precompute 3*stride so rows 0-3 are addressed without
; intermediate pointer updates.
%macro BLOCKCOPY_SS_W16_H4_avx 2
INIT_YMM avx
cglobal blockcopy_ss_%1x%2, 4, 7, 4
    mov     r4d, %2/4                   ; 4 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
    lea     r5, [3 * r3]                ; r5 = 3 * srcStride
    lea     r6, [3 * r1]                ; r6 = 3 * dstStride
.loop:
    movu    m0, [r2]
    movu    m1, [r2 + r3]
    movu    m2, [r2 + 2 * r3]
    movu    m3, [r2 + r5]

    movu    [r0], m0
    movu    [r0 + r1], m1
    movu    [r0 + 2 * r1], m2
    movu    [r0 + r6], m3

    lea     r0, [r0 + 4 * r1]
    lea     r2, [r2 + 4 * r3]
    dec     r4d
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H4_avx 16, 4
BLOCKCOPY_SS_W16_H4_avx 16, 12
BLOCKCOPY_SS_W16_H4_avx 16, 8
BLOCKCOPY_SS_W16_H4_avx 16, 16
BLOCKCOPY_SS_W16_H4_avx 16, 24
BLOCKCOPY_SS_W16_H4_avx 16, 32
BLOCKCOPY_SS_W16_H4_avx 16, 64
3230
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy for 16-wide blocks, deeper unroll: eight rows per
; loop iteration (four identical 2-row groups).  r4d = iteration counter.
%macro BLOCKCOPY_SS_W16_H8 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov     r4d, %2/8                   ; 8 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
.loop:
    movu    m0, [r2]                    ; rows 0-1
    movu    m1, [r2 + 16]
    movu    m2, [r2 + r3]
    movu    m3, [r2 + r3 + 16]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + r1], m2
    movu    [r0 + r1 + 16], m3

    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; rows 2-3
    movu    m1, [r2 + 16]
    movu    m2, [r2 + r3]
    movu    m3, [r2 + r3 + 16]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + r1], m2
    movu    [r0 + r1 + 16], m3

    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; rows 4-5
    movu    m1, [r2 + 16]
    movu    m2, [r2 + r3]
    movu    m3, [r2 + r3 + 16]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + r1], m2
    movu    [r0 + r1 + 16], m3

    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; rows 6-7
    movu    m1, [r2 + 16]
    movu    m2, [r2 + r3]
    movu    m3, [r2 + r3 + 16]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + r1], m2
    movu    [r0 + r1 + 16], m3

    dec     r4d
    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W16_H8 16, 8
BLOCKCOPY_SS_W16_H8 16, 16
BLOCKCOPY_SS_W16_H8 16, 32
BLOCKCOPY_SS_W16_H8 16, 64

BLOCKCOPY_SS_W16_H8 16, 24
3303
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy for 24-wide blocks; one row = 48 bytes = three XMM
; registers.  Four rows per loop iteration (two unrolled 2-row groups).
; r0=dst, r1=dstStride (bytes), r2=src, r3=srcStride (bytes), r4d=counter
%macro BLOCKCOPY_SS_W24_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 6
    mov     r4d, %2/4                   ; 4 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
.loop:                                  ; fix: was ".loop" without a colon -
                                        ; NASM warns about an orphan label;
                                        ; colon matches the rest of the file
    movu    m0, [r2]                    ; rows 0-1
    movu    m1, [r2 + 16]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 16]
    movu    m5, [r2 + r3 + 32]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + 32], m2
    movu    [r0 + r1], m3
    movu    [r0 + r1 + 16], m4
    movu    [r0 + r1 + 32], m5

    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; rows 2-3
    movu    m1, [r2 + 16]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + r3]
    movu    m4, [r2 + r3 + 16]
    movu    m5, [r2 + r3 + 32]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + 32], m2
    movu    [r0 + r1], m3
    movu    [r0 + r1 + 16], m4
    movu    [r0 + r1 + 32], m5

    dec     r4d
    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W24_H4 24, 32

BLOCKCOPY_SS_W24_H4 24, 64
3355
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy for 32-wide blocks; one row = 64 bytes = four XMM
; registers.  Four rows per loop iteration.
%macro BLOCKCOPY_SS_W32_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 4
    mov     r4d, %2/4                   ; 4 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
.loop:
    movu    m0, [r2]                    ; row 0
    movu    m1, [r2 + 16]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 48]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + 32], m2
    movu    [r0 + 48], m3

    movu    m0, [r2 + r3]               ; row 1
    movu    m1, [r2 + r3 + 16]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 48]

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m1
    movu    [r0 + r1 + 32], m2
    movu    [r0 + r1 + 48], m3

    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; row 2
    movu    m1, [r2 + 16]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 48]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + 32], m2
    movu    [r0 + 48], m3

    movu    m0, [r2 + r3]               ; row 3
    movu    m1, [r2 + r3 + 16]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 48]

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m1
    movu    [r0 + r1 + 32], m2
    movu    [r0 + r1 + 48], m3

    dec     r4d
    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W32_H4 32, 8
BLOCKCOPY_SS_W32_H4 32, 16
BLOCKCOPY_SS_W32_H4 32, 24
BLOCKCOPY_SS_W32_H4 32, 32
BLOCKCOPY_SS_W32_H4 32, 64

BLOCKCOPY_SS_W32_H4 32, 48
3423
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy for 48-wide blocks; one row = 96 bytes = six XMM
; registers.  Despite the macro name, four rows are copied per loop
; iteration (counter is %2/4).
%macro BLOCKCOPY_SS_W48_H2 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 6
    mov     r4d, %2/4                   ; 4 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
.loop:
    movu    m0, [r2]                    ; row 0
    movu    m1, [r2 + 16]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 48]
    movu    m4, [r2 + 64]
    movu    m5, [r2 + 80]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + 32], m2
    movu    [r0 + 48], m3
    movu    [r0 + 64], m4
    movu    [r0 + 80], m5

    movu    m0, [r2 + r3]               ; row 1
    movu    m1, [r2 + r3 + 16]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 48]
    movu    m4, [r2 + r3 + 64]
    movu    m5, [r2 + r3 + 80]

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m1
    movu    [r0 + r1 + 32], m2
    movu    [r0 + r1 + 48], m3
    movu    [r0 + r1 + 64], m4
    movu    [r0 + r1 + 80], m5

    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; row 2
    movu    m1, [r2 + 16]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 48]
    movu    m4, [r2 + 64]
    movu    m5, [r2 + 80]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + 32], m2
    movu    [r0 + 48], m3
    movu    [r0 + 64], m4
    movu    [r0 + 80], m5

    movu    m0, [r2 + r3]               ; row 3
    movu    m1, [r2 + r3 + 16]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 48]
    movu    m4, [r2 + r3 + 64]
    movu    m5, [r2 + r3 + 80]

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m1
    movu    [r0 + r1 + 32], m2
    movu    [r0 + r1 + 48], m3
    movu    [r0 + r1 + 64], m4
    movu    [r0 + r1 + 80], m5

    dec     r4d
    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W48_H2 48, 64
3501
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; int16 -> int16 copy for 64-wide blocks; one row = 128 bytes, moved as two
; four-register (64-byte) batches.  Four rows per loop iteration.
%macro BLOCKCOPY_SS_W64_H4 2
INIT_XMM sse2
cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
    mov     r4d, %2/4                   ; 4 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
.loop:
    movu    m0, [r2]                    ; row 0, bytes 0-63
    movu    m1, [r2 + 16]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 48]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + 32], m2
    movu    [r0 + 48], m3

    movu    m0, [r2 + 64]               ; row 0, bytes 64-127
    movu    m1, [r2 + 80]
    movu    m2, [r2 + 96]
    movu    m3, [r2 + 112]

    movu    [r0 + 64], m0
    movu    [r0 + 80], m1
    movu    [r0 + 96], m2
    movu    [r0 + 112], m3

    movu    m0, [r2 + r3]               ; row 1, bytes 0-63
    movu    m1, [r2 + r3 + 16]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 48]

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m1
    movu    [r0 + r1 + 32], m2
    movu    [r0 + r1 + 48], m3

    movu    m0, [r2 + r3 + 64]          ; row 1, bytes 64-127
    movu    m1, [r2 + r3 + 80]
    movu    m2, [r2 + r3 + 96]
    movu    m3, [r2 + r3 + 112]

    movu    [r0 + r1 + 64], m0
    movu    [r0 + r1 + 80], m1
    movu    [r0 + r1 + 96], m2
    movu    [r0 + r1 + 112], m3

    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]

    movu    m0, [r2]                    ; row 2, bytes 0-63
    movu    m1, [r2 + 16]
    movu    m2, [r2 + 32]
    movu    m3, [r2 + 48]

    movu    [r0], m0
    movu    [r0 + 16], m1
    movu    [r0 + 32], m2
    movu    [r0 + 48], m3

    movu    m0, [r2 + 64]               ; row 2, bytes 64-127
    movu    m1, [r2 + 80]
    movu    m2, [r2 + 96]
    movu    m3, [r2 + 112]

    movu    [r0 + 64], m0
    movu    [r0 + 80], m1
    movu    [r0 + 96], m2
    movu    [r0 + 112], m3

    movu    m0, [r2 + r3]               ; row 3, bytes 0-63
    movu    m1, [r2 + r3 + 16]
    movu    m2, [r2 + r3 + 32]
    movu    m3, [r2 + r3 + 48]

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 16], m1
    movu    [r0 + r1 + 32], m2
    movu    [r0 + r1 + 48], m3

    movu    m0, [r2 + r3 + 64]          ; row 3, bytes 64-127
    movu    m1, [r2 + r3 + 80]
    movu    m2, [r2 + r3 + 96]
    movu    m3, [r2 + r3 + 112]

    movu    [r0 + r1 + 64], m0
    movu    [r0 + r1 + 80], m1
    movu    [r0 + r1 + 96], m2
    movu    [r0 + r1 + 112], m3

    dec     r4d
    lea     r2, [r2 + 2 * r3]
    lea     r0, [r0 + 2 * r1]
    jnz     .loop

    RET
%endmacro

BLOCKCOPY_SS_W64_H4 64, 16
BLOCKCOPY_SS_W64_H4 64, 32
BLOCKCOPY_SS_W64_H4 64, 48
BLOCKCOPY_SS_W64_H4 64, 64
3607
;-----------------------------------------------------------------------------
; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; AVX variant for 64-wide blocks: one row = 128 bytes = four YMM registers.
; Four rows per loop iteration; r5/r6 hold 3*stride so rows 0-3 are
; addressed without intermediate pointer updates.
%macro BLOCKCOPY_SS_W64_H4_avx 2
INIT_YMM avx
cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
    mov     r4d, %2/4                   ; 4 rows per iteration
    add     r1, r1                      ; strides: elements -> bytes
    add     r3, r3
    lea     r5, [3 * r1]                ; r5 = 3 * dstStride
    lea     r6, [3 * r3]                ; r6 = 3 * srcStride
.loop:
    movu    m0, [r2]                    ; row 0
    movu    m1, [r2 + 32]
    movu    m2, [r2 + 64]
    movu    m3, [r2 + 96]

    movu    [r0], m0
    movu    [r0 + 32], m1
    movu    [r0 + 64], m2
    movu    [r0 + 96], m3

    movu    m0, [r2 + r3]               ; row 1
    movu    m1, [r2 + r3 + 32]
    movu    m2, [r2 + r3 + 64]
    movu    m3, [r2 + r3 + 96]

    movu    [r0 + r1], m0
    movu    [r0 + r1 + 32], m1
    movu    [r0 + r1 + 64], m2
    movu    [r0 + r1 + 96], m3

    movu    m0, [r2 + 2 * r3]           ; row 2
    movu    m1, [r2 + 2 * r3 + 32]
    movu    m2, [r2 + 2 * r3 + 64]
    movu    m3, [r2 + 2 * r3 + 96]

    movu    [r0 + 2 * r1], m0
    movu    [r0 + 2 * r1 + 32], m1
    movu    [r0 + 2 * r1 + 64], m2
    movu    [r0 + 2 * r1 + 96], m3

    movu    m0, [r2 + r6]               ; row 3
    movu    m1, [r2 + r6 + 32]
    movu    m2, [r2 + r6 + 64]
    movu    m3, [r2 + r6 + 96]
    lea     r2, [r2 + 4 * r3]

    movu    [r0 + r5], m0
    movu    [r0 + r5 + 32], m1
    movu    [r0 + r5 + 64], m2
    movu    [r0 + r5 + 96], m3
    lea     r0, [r0 + 4 * r1]

    dec     r4d
    jnz     .loop
    RET
%endmacro

BLOCKCOPY_SS_W64_H4_avx 64, 16
BLOCKCOPY_SS_W64_H4_avx 64, 32
BLOCKCOPY_SS_W64_H4_avx 64, 48
BLOCKCOPY_SS_W64_H4_avx 64, 64
3671
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
; Copy a strided 4x4 int16 block to a linear buffer with rounding right
; shift: dst[i] = (src[i] + round) >> shift, where round = 1 << (shift-1).
; The rounding add is implemented as a subtract of -round:
;   pcmpeqw m1,m1 -> all ones (-1); psllw by shift -> -(1<<shift);
;   psraw by 1 -> -(1<<(shift-1)) = -round.
; Assumes shift >= 1 (psraw of the -1 pattern needs a set sign bit).
INIT_XMM sse2
cglobal cpy2Dto1D_shr_4, 3, 4, 4
    add     r2d, r2d                    ; srcStride: elements -> bytes
    movd    m0, r3m                     ; m0 = shift
    pcmpeqw m1, m1
    psllw   m1, m0
    psraw   m1, 1                       ; m1 = word [-round]

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; m0 - shift
    ; m1 - word [-round]

    ; Row 0-3: pack two 8-byte rows per register
    movh    m2, [r1]
    movhps  m2, [r1 + r2]
    lea     r1, [r1 + r2 * 2]
    movh    m3, [r1]
    movhps  m3, [r1 + r2]
    psubw   m2, m1                      ; x - (-round) == x + round
    psubw   m3, m1
    psraw   m2, m0
    psraw   m3, m0
    mova    [r0 + 0 * mmsize], m2
    mova    [r0 + 1 * mmsize], m3
    RET
3703
3704
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
; Copy a strided 8x8 int16 block to a linear buffer with rounding right
; shift (see cpy2Dto1D_shr_4 for the -round construction).  Four 16-byte
; rows per loop iteration.  Assumes shift >= 1.
INIT_XMM sse2
cglobal cpy2Dto1D_shr_8, 3, 5, 4
    add     r2d, r2d                    ; srcStride: elements -> bytes
    movd    m0, r3m                     ; m0 = shift
    pcmpeqw m1, m1
    psllw   m1, m0
    psraw   m1, 1                       ; m1 = word [-round]
    mov     r3d, 8/4                    ; 4 rows per iteration
    lea     r4, [r2 * 3]

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; r4 - stride * 3
    ; m0 - shift
    ; m1 - word [-round]

.loop:
    ; Row 0-1
    mova    m2, [r1]
    mova    m3, [r1 + r2]
    psubw   m2, m1                      ; x + round
    psubw   m3, m1
    psraw   m2, m0
    psraw   m3, m0
    mova    [r0 + 0 * mmsize], m2
    mova    [r0 + 1 * mmsize], m3

    ; Row 2-3
    mova    m2, [r1 + r2 * 2]
    mova    m3, [r1 + r4]
    psubw   m2, m1
    psubw   m3, m1
    psraw   m2, m0
    psraw   m3, m0
    mova    [r0 + 2 * mmsize], m2
    mova    [r0 + 3 * mmsize], m3

    add     r0, 4 * mmsize
    lea     r1, [r1 + r2 * 4]
    dec     r3d
    jnz     .loop
    RET
3753
3754
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
; Copy a strided 16x16 int16 block to a linear buffer with rounding right
; shift (see cpy2Dto1D_shr_4 for the -round construction).  Two 32-byte
; rows per loop iteration.  Assumes shift >= 1.
INIT_XMM sse2
cglobal cpy2Dto1D_shr_16, 3, 4, 4
    add     r2d, r2d                    ; srcStride: elements -> bytes
    movd    m0, r3m                     ; m0 = shift
    pcmpeqw m1, m1
    psllw   m1, m0
    psraw   m1, 1                       ; m1 = word [-round]
    mov     r3d, 16/2                   ; 2 rows per iteration

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift
    ; m1 - word [-round]

.loop:
    ; Row 0
    mova    m2, [r1 + 0 * mmsize]
    mova    m3, [r1 + 1 * mmsize]
    psubw   m2, m1                      ; x + round
    psubw   m3, m1
    psraw   m2, m0
    psraw   m3, m0
    mova    [r0 + 0 * mmsize], m2
    mova    [r0 + 1 * mmsize], m3

    ; Row 1
    mova    m2, [r1 + r2 + 0 * mmsize]
    mova    m3, [r1 + r2 + 1 * mmsize]
    psubw   m2, m1
    psubw   m3, m1
    psraw   m2, m0
    psraw   m3, m0
    mova    [r0 + 2 * mmsize], m2
    mova    [r0 + 3 * mmsize], m3

    add     r0, 4 * mmsize
    lea     r1, [r1 + r2 * 2]
    dec     r3d
    jnz     .loop
    RET
3801
3802
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
;--------------------------------------------------------------------------------------
; Copy a strided 32x32 int16 block to a linear buffer with rounding right
; shift (see cpy2Dto1D_shr_4 for the -round construction).  One 64-byte
; row per loop iteration.  Assumes shift >= 1.
INIT_XMM sse2
cglobal cpy2Dto1D_shr_32, 3, 4, 6
    add     r2d, r2d                    ; srcStride: elements -> bytes
    movd    m0, r3m                     ; m0 = shift
    pcmpeqw m1, m1
    psllw   m1, m0
    psraw   m1, 1                       ; m1 = word [-round]
    mov     r3d, 32/1                   ; 1 row per iteration

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift
    ; m1 - word [-round]

.loop:
    ; Row 0
    mova    m2, [r1 + 0 * mmsize]
    mova    m3, [r1 + 1 * mmsize]
    mova    m4, [r1 + 2 * mmsize]
    mova    m5, [r1 + 3 * mmsize]
    psubw   m2, m1                      ; x + round
    psubw   m3, m1
    psubw   m4, m1
    psubw   m5, m1
    psraw   m2, m0
    psraw   m3, m0
    psraw   m4, m0
    psraw   m5, m0
    mova    [r0 + 0 * mmsize], m2
    mova    [r0 + 1 * mmsize], m3
    mova    [r0 + 2 * mmsize], m4
    mova    [r0 + 3 * mmsize], m5

    add     r0, 4 * mmsize
    add     r1, r2
    dec     r3d
    jnz     .loop
    RET
3847
3848
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
; 4x4 block: copy contiguous 1D src into strided 2D dst, shifting each
; coefficient left: dst[y][x] = src[y*4+x] << shift.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_4, 3, 3, 3
    add       r2d, r2d                  ; dstStride: elements -> bytes (int16_t)
    movd      m0, r3m                   ; m0 = shift

    ; Row 0-3 (each xmm holds two 4-sample rows)
    mova      m1, [r1 + 0 * mmsize]
    mova      m2, [r1 + 1 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    movh      [r0], m1                  ; row 0 = low half of m1
    movhps    [r0 + r2], m1             ; row 1 = high half of m1
    movh      [r0 + r2 * 2], m2         ; row 2 = low half of m2
    lea       r2, [r2 * 3]              ; r2 = stride * 3 (stride no longer needed)
    movhps    [r0 + r2], m2             ; row 3 = high half of m2
    RET
3868
3869
; AVX2 variant: all 16 coefficients of the 4x4 block fit in one ymm register.
INIT_YMM avx2
cglobal cpy1Dto2D_shl_4, 3, 3, 2
    add       r2d, r2d                  ; dstStride in bytes
    movd      xm0, r3m                  ; xm0 = shift

    ; Row 0-3
    movu      m1, [r1]                  ; whole 4x4 block (32 bytes)
    psllw     m1, xm0                   ; << shift (shift read from xmm count)
    vextracti128 xm0, m1, 1             ; xm0 = rows 2-3 (shift no longer needed)
    movq      [r0], xm1                 ; row 0
    movhps    [r0 + r2], xm1            ; row 1
    lea       r0, [r0 + r2 * 2]
    movq      [r0], xm0                 ; row 2
    movhps    [r0 + r2], xm0            ; row 3
    RET
3885
3886
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
; 8x8 block: contiguous 1D src -> strided 2D dst, each value shifted left by 'shift'.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_8, 3, 4, 5
    add       r2d, r2d                  ; dstStride in bytes
    movd      m0, r3m                   ; m0 = shift
    lea       r3, [r2 * 3]              ; r3 = stride * 3 for row 3/7 addressing

    ; Row 0-3 (one xmm per 8-sample row)
    mova      m1, [r1 + 0 * mmsize]
    mova      m2, [r1 + 1 * mmsize]
    mova      m3, [r1 + 2 * mmsize]
    mova      m4, [r1 + 3 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    mova      [r0], m1
    mova      [r0 + r2], m2
    mova      [r0 + r2 * 2], m3
    mova      [r0 + r3], m4
    lea       r0, [r0 + r2 * 4]

    ; Row 4-7
    mova      m1, [r1 + 4 * mmsize]
    mova      m2, [r1 + 5 * mmsize]
    mova      m3, [r1 + 6 * mmsize]
    mova      m4, [r1 + 7 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    mova      [r0], m1
    mova      [r0 + r2], m2
    mova      [r0 + r2 * 2], m3
    mova      [r0 + r3], m4
    RET
3925
3926
; AVX2 variant: each ymm holds two 8-sample rows; the upper lane is stored
; with vextracti128 directly to memory.
INIT_YMM avx2
cglobal cpy1Dto2D_shl_8, 3, 4, 3
    add       r2d, r2d                  ; dstStride in bytes
    movd      xm0, r3m                  ; xm0 = shift
    lea       r3, [r2 * 3]

    ; Row 0-3
    movu      m1, [r1 + 0 * mmsize]     ; rows 0-1
    movu      m2, [r1 + 1 * mmsize]     ; rows 2-3
    psllw     m1, xm0
    psllw     m2, xm0
    movu      [r0], xm1                 ; row 0 (low lane)
    vextracti128 [r0 + r2], m1, 1       ; row 1 (high lane)
    movu      [r0 + r2 * 2], xm2
    vextracti128 [r0 + r3], m2, 1

    ; Row 4-7
    movu      m1, [r1 + 2 * mmsize]
    movu      m2, [r1 + 3 * mmsize]
    lea       r0, [r0 + r2 * 4]
    psllw     m1, xm0
    psllw     m2, xm0
    movu      [r0], xm1
    vextracti128 [r0 + r2], m1, 1
    movu      [r0 + r2 * 2], xm2
    vextracti128 [r0 + r3], m2, 1
    RET
3954
3955
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
; 16x16 block: contiguous 1D src -> strided 2D dst, each value shifted left.
; Each 16-sample row needs two xmm registers (16 + 16 bytes).
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_16, 3, 4, 5
    add       r2d, r2d                  ; dstStride in bytes
    movd      m0, r3m                   ; m0 = shift
    mov       r3d, 16/4                 ; four rows per iteration

.loop:
    ; Row 0-1
    mova      m1, [r1 + 0 * mmsize]
    mova      m2, [r1 + 1 * mmsize]
    mova      m3, [r1 + 2 * mmsize]
    mova      m4, [r1 + 3 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    mova      [r0], m1
    mova      [r0 + 16], m2
    mova      [r0 + r2], m3
    mova      [r0 + r2 + 16], m4

    ; Row 2-3
    mova      m1, [r1 + 4 * mmsize]
    mova      m2, [r1 + 5 * mmsize]
    mova      m3, [r1 + 6 * mmsize]
    mova      m4, [r1 + 7 * mmsize]
    lea       r0, [r0 + r2 * 2]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    mova      [r0], m1
    mova      [r0 + 16], m2
    mova      [r0 + r2], m3
    mova      [r0 + r2 + 16], m4

    add       r1, 8 * mmsize            ; linear src: 4 rows consumed
    lea       r0, [r0 + r2 * 2]
    dec       r3d
    jnz       .loop
    RET
4000
4001
; AVX2 variant: one ymm register per 16-sample row.
INIT_YMM avx2
cglobal cpy1Dto2D_shl_16, 3, 5, 3
    add       r2d, r2d                  ; dstStride in bytes
    movd      xm0, r3m                  ; xm0 = shift
    mov       r3d, 16/4                 ; four rows per iteration
    lea       r4, [r2 * 3]              ; stride * 3

.loop:
    ; Row 0-1
    movu      m1, [r1 + 0 * mmsize]
    movu      m2, [r1 + 1 * mmsize]
    psllw     m1, xm0
    psllw     m2, xm0
    movu      [r0], m1
    movu      [r0 + r2], m2

    ; Row 2-3
    movu      m1, [r1 + 2 * mmsize]
    movu      m2, [r1 + 3 * mmsize]
    psllw     m1, xm0
    psllw     m2, xm0
    movu      [r0 + r2 * 2], m1
    movu      [r0 + r4], m2

    add       r1, 4 * mmsize            ; linear src: 4 rows consumed
    lea       r0, [r0 + r2 * 4]
    dec       r3d
    jnz       .loop
    RET
4031
4032
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
; 32x32 block: contiguous 1D src -> strided 2D dst, each value shifted left.
; Each 32-sample row needs four xmm registers (64 bytes).
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shl_32, 3, 4, 5
    add       r2d, r2d                  ; dstStride in bytes
    movd      m0, r3m                   ; m0 = shift
    mov       r3d, 32/2                 ; two rows per iteration

.loop:
    ; Row 0
    mova      m1, [r1 + 0 * mmsize]
    mova      m2, [r1 + 1 * mmsize]
    mova      m3, [r1 + 2 * mmsize]
    mova      m4, [r1 + 3 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    mova      [r0 + 0 * mmsize], m1
    mova      [r0 + 1 * mmsize], m2
    mova      [r0 + 2 * mmsize], m3
    mova      [r0 + 3 * mmsize], m4

    ; Row 1
    mova      m1, [r1 + 4 * mmsize]
    mova      m2, [r1 + 5 * mmsize]
    mova      m3, [r1 + 6 * mmsize]
    mova      m4, [r1 + 7 * mmsize]
    psllw     m1, m0
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    mova      [r0 + r2 + 0 * mmsize], m1
    mova      [r0 + r2 + 1 * mmsize], m2
    mova      [r0 + r2 + 2 * mmsize], m3
    mova      [r0 + r2 + 3 * mmsize], m4

    add       r1, 8 * mmsize            ; linear src: 2 rows consumed
    lea       r0, [r0 + r2 * 2]
    dec       r3d
    jnz       .loop
    RET
4076
4077
; AVX2 variant: two ymm registers per 32-sample row, two rows per iteration.
INIT_YMM avx2
cglobal cpy1Dto2D_shl_32, 3, 4, 5
    add       r2d, r2d                  ; dstStride in bytes
    movd      xm0, r3m                  ; xm0 = shift
    mov       r3d, 32/2                 ; two rows per iteration

.loop:
    ; Row 0-1
    movu      m1, [r1 + 0 * mmsize]
    movu      m2, [r1 + 1 * mmsize]
    movu      m3, [r1 + 2 * mmsize]
    movu      m4, [r1 + 3 * mmsize]
    psllw     m1, xm0
    psllw     m2, xm0
    psllw     m3, xm0
    psllw     m4, xm0
    movu      [r0], m1
    movu      [r0 + mmsize], m2
    movu      [r0 + r2], m3
    movu      [r0 + r2 + mmsize], m4

    add       r1, 4 * mmsize            ; linear src: 2 rows consumed
    lea       r0, [r0 + r2 * 2]
    dec       r3d
    jnz       .loop
    RET
4104
4105
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
; 4x4: copy the strided coefficient block to a contiguous dst and return the
; number of non-zero coefficients (in eax).
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_4, 3,3,3
    add       r2d, r2d                  ; srcStride in bytes
    pxor      m2, m2                    ; m2 = zero, compare reference

    ; row 0 & 1
    movh      m0, [r1]
    movhps    m0, [r1 + r2]
    mova      [r0], m0

    ; row 2 & 3
    movh      m1, [r1 + r2 * 2]
    lea       r2, [r2 * 3]
    movhps    m1, [r1 + r2]
    mova      [r0 + 16], m1

    packsswb  m0, m1                    ; saturate words to bytes; zero stays zero
    pcmpeqb   m0, m2                    ; byte = 0xFF (-1) where coeff == 0

    ; get count
    ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
%if 0
    pmovmskb  eax, m0
    not       ax
    popcnt    ax, ax
%else
    mova      m1, [pb_1]
    paddb     m0, m1                    ; byte = 1 where coeff != 0, else 0
    psadbw    m0, m2                    ; horizontal byte sums per 64-bit half
    pshufd    m1, m0, 2
    paddw     m0, m1                    ; combine both halves
    movd      eax, m0                   ; eax = non-zero count
%endif
    RET
4143
4144
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
; 8x8: copy the strided coefficient block to a contiguous dst and return the
; number of non-zero coefficients.  m5 accumulates -1 per zero byte over the
; 4 row-pairs (range [-4, 0] per byte); adding pb_4 converts to non-zero counts.
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_8, 3,3,6
    add       r2d, r2d                  ; srcStride in bytes
    pxor      m4, m4                    ; zero, compare reference
    pxor      m5, m5                    ; per-byte zero-count accumulator

    ; row 0 & 1
    movu      m0, [r1]
    movu      m1, [r1 + r2]
    movu      [r0], m0
    movu      [r0 + 16], m1

    packsswb  m0, m1                    ; words -> saturated bytes
    pcmpeqb   m0, m4                    ; -1 where coeff == 0
    paddb     m5, m0

    ; row 2 & 3
    lea       r1, [r1 + 2 * r2]
    movu      m0, [r1]
    movu      m1, [r1 + r2]
    movu      [r0 + 32], m0
    movu      [r0 + 48], m1

    packsswb  m0, m1
    pcmpeqb   m0, m4
    paddb     m5, m0

    ; row 4 & 5
    lea       r1, [r1 + 2 * r2]
    movu      m0, [r1]
    movu      m1, [r1 + r2]
    movu      [r0 + 64], m0
    movu      [r0 + 80], m1

    packsswb  m0, m1
    pcmpeqb   m0, m4
    paddb     m5, m0

    ; row 6 & 7
    lea       r1, [r1 + 2 * r2]
    movu      m0, [r1]
    movu      m1, [r1 + r2]
    movu      [r0 + 96], m0
    movu      [r0 + 112], m1

    packsswb  m0, m1
    pcmpeqb   m0, m4
    paddb     m5, m0

    ; get count
    mova      m0, [pb_4]
    paddb     m5, m0                    ; byte = count of non-zero among its 4 samples
    psadbw    m5, m4                    ; sum bytes per 64-bit half
    pshufd    m0, m5, 2
    paddw     m5, m0
    movd      eax, m5                   ; eax = non-zero count
    RET
4205
4206
; AVX2 variant of copy_cnt_8: rows are gathered two at a time into ymm regs,
; stored linearly, then counted with pminub(x, 1) (1 per non-zero byte).
INIT_YMM avx2
cglobal copy_cnt_8, 3,4,5
    add       r2d, r2d                  ; srcStride in bytes
    lea       r3, [r2 * 3]

    ; row 0 - 1
    movu      xm0, [r1]
    vinserti128 m0, m0, [r1 + r2], 1
    movu      [r0], m0

    ; row 2 - 3
    movu      xm1, [r1 + r2 * 2]
    vinserti128 m1, m1, [r1 + r3], 1
    movu      [r0 + 32], m1
    lea       r1, [r1 + r2 * 4]

    ; row 4 - 5
    movu      xm2, [r1]
    vinserti128 m2, m2, [r1 + r2], 1
    movu      [r0 + 64], m2

    ; row 6 - 7
    movu      xm3, [r1 + r2 * 2]
    vinserti128 m3, m3, [r1 + r3], 1
    movu      [r0 + 96], m3

    ; get count
    xorpd     m4, m4                    ; zero for psadbw
    vpacksswb m0, m1                    ; words -> bytes (lane order irrelevant for counting)
    vpacksswb m2, m3
    pminub    m0, [pb_1]                ; byte = 1 where coeff != 0, else 0
    pminub    m2, [pb_1]
    paddb     m0, m2
    vextracti128 xm1, m0, 1
    paddb     xm0, xm1
    psadbw    xm0, xm4                  ; horizontal byte sum
    movhlps   xm1, xm0
    paddd     xm0, xm1
    movd      eax, xm0                  ; eax = non-zero count
    RET
4247
4248
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
; 16x16: copy the strided coefficient block to a contiguous dst and return the
; number of non-zero coefficients.  m5 accumulates -1 per zero byte over 16
; row-pairs (range [-16, 0] per byte); adding pb_16 converts to non-zero counts.
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_16, 3,4,6
    add       r2d, r2d                  ; srcStride in bytes
    mov       r3d, 4                    ; four rows per iteration
    pxor      m4, m4                    ; zero, compare reference
    pxor      m5, m5                    ; per-byte zero-count accumulator

.loop:                                  ; colon added: bare label raised NASM orphan-label warning
    ; row 0
    movu      m0, [r1]
    movu      m1, [r1 + 16]
    movu      [r0], m0
    movu      [r0 + 16], m1

    packsswb  m0, m1                    ; words -> saturated bytes
    pcmpeqb   m0, m4                    ; -1 where coeff == 0
    paddb     m5, m0

    ; row 1
    movu      m0, [r1 + r2]
    movu      m1, [r1 + r2 + 16]
    movu      [r0 + 32], m0
    movu      [r0 + 48], m1

    packsswb  m0, m1
    pcmpeqb   m0, m4
    paddb     m5, m0

    ; row 2
    movu      m0, [r1 + 2 * r2]
    movu      m1, [r1 + 2 * r2 + 16]
    movu      [r0 + 64], m0
    movu      [r0 + 80], m1

    packsswb  m0, m1
    pcmpeqb   m0, m4
    paddb     m5, m0

    ; row 3
    lea       r1, [r1 + 2 * r2]
    movu      m0, [r1 + r2]
    movu      m1, [r1 + r2 + 16]
    movu      [r0 + 96], m0
    movu      [r0 + 112], m1

    packsswb  m0, m1
    pcmpeqb   m0, m4
    paddb     m5, m0

    add       r0, 128                   ; linear dst: 4 rows written
    lea       r1, [r1 + 2 * r2]
    dec       r3d
    jnz       .loop

    ; get count
    mova      m0, [pb_16]
    paddb     m5, m0                    ; byte = count of non-zero among its 16 samples
    psadbw    m5, m4
    pshufd    m0, m5, 2
    paddw     m5, m0
    movd      eax, m5                   ; eax = non-zero count
    RET
4313
4314
; AVX2 variant of copy_cnt_16: one ymm per 16-sample row; non-zero bytes are
; clamped to 1 with pminub and summed at the end with psadbw.
; NOTE(review): per-byte accumulator m4 grows by at most 2 per iteration over
; 4 iterations (max 8) — no byte overflow.
INIT_YMM avx2
cglobal copy_cnt_16, 3, 5, 5
    add       r2d, r2d                  ; srcStride in bytes
    lea       r3, [r2 * 3]
    mov       r4d, 16/4                 ; four rows per iteration

    mova      m3, [pb_1]
    xorpd     m4, m4                    ; per-byte non-zero-count accumulator

.loop:
    ; row 0 - 1
    movu      m0, [r1]
    movu      [r0], m0
    movu      m1, [r1 + r2]
    movu      [r0 + 32], m1

    packsswb  m0, m1                    ; words -> bytes
    pminub    m0, m3                    ; byte = 1 where coeff != 0, else 0

    ; row 2 - 3
    movu      m1, [r1 + r2 * 2]
    movu      [r0 + 64], m1
    movu      m2, [r1 + r3]
    movu      [r0 + 96], m2

    packsswb  m1, m2
    pminub    m1, m3
    paddb     m0, m1
    paddb     m4, m0

    add       r0, 128                   ; linear dst: 4 rows written
    lea       r1, [r1 + 4 * r2]
    dec       r4d
    jnz       .loop

    ; get count
    xorpd     m0, m0
    vextracti128 xm1, m4, 1
    paddb     xm4, xm1
    psadbw    xm4, xm0                  ; horizontal byte sum
    movhlps   xm1, xm4
    paddd     xm4, xm1
    movd      eax, xm4                  ; eax = non-zero count
    RET
4359
;--------------------------------------------------------------------------------------
; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
; (header previously said int32_t* dst — the code copies 16-bit words, matching
; the int16_t signature used by every other copy_cnt variant in this file)
; 32x32: copy the strided coefficient block to a contiguous dst and return the
; number of non-zero coefficients.  m5 accumulates -1 per zero byte over 64
; pack results (range [-64, 0] per byte); adding pb_64 converts to counts.
;--------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal copy_cnt_32, 3,4,6
    add       r2d, r2d                  ; srcStride in bytes
    mov       r3d, 16                   ; two rows per iteration
    pxor      m4, m4                    ; zero, compare reference
    pxor      m5, m5                    ; per-byte zero-count accumulator

.loop:                                  ; colon added: bare label raised NASM orphan-label warning
    ; row 0
    movu      m0, [r1]
    movu      m1, [r1 + 16]
    movu      [r0], m0
    movu      [r0 + 16], m1

    packsswb  m0, m1                    ; words -> saturated bytes
    pcmpeqb   m0, m4                    ; -1 where coeff == 0
    paddb     m5, m0

    movu      m0, [r1 + 32]
    movu      m1, [r1 + 48]
    movu      [r0 + 32], m0
    movu      [r0 + 48], m1

    packsswb  m0, m1
    pcmpeqb   m0, m4
    paddb     m5, m0

    ; row 1
    movu      m0, [r1 + r2]
    movu      m1, [r1 + r2 + 16]
    movu      [r0 + 64], m0
    movu      [r0 + 80], m1

    packsswb  m0, m1
    pcmpeqb   m0, m4
    paddb     m5, m0

    movu      m0, [r1 + r2 + 32]
    movu      m1, [r1 + r2 + 48]
    movu      [r0 + 96], m0
    movu      [r0 + 112], m1

    packsswb  m0, m1
    pcmpeqb   m0, m4
    paddb     m5, m0

    add       r0, 128                   ; linear dst: 2 rows written
    lea       r1, [r1 + 2 * r2]
    dec       r3d
    jnz       .loop

    ; get count
    mova      m0, [pb_64]
    paddb     m5, m0                    ; byte = count of non-zero among its 64 samples
    psadbw    m5, m4
    pshufd    m0, m5, 2
    paddw     m5, m0
    movd      eax, m5                   ; eax = non-zero count
    RET
4422
4423
; AVX2 variant of copy_cnt_32: two ymm per 32-sample row; non-zero bytes are
; clamped to 1 with pminub and summed once at the end.
; NOTE(review): per-byte accumulator m4 grows by at most 2 per iteration over
; 16 iterations (max 32) — no byte overflow.
INIT_YMM avx2
cglobal copy_cnt_32, 3, 5, 5
    add       r2d, r2d                  ; srcStride in bytes
    mov       r3d, 32/2                 ; two rows per iteration

    mova      m3, [pb_1]
    xorpd     m4, m4                    ; per-byte non-zero-count accumulator

.loop:
    ; row 0
    movu      m0, [r1]
    movu      [r0], m0
    movu      m1, [r1 + 32]
    movu      [r0 + 32], m1

    packsswb  m0, m1                    ; words -> bytes
    pminub    m0, m3                    ; byte = 1 where coeff != 0, else 0

    ; row 1
    movu      m1, [r1 + r2]
    movu      [r0 + 64], m1
    movu      m2, [r1 + r2 + 32]
    movu      [r0 + 96], m2

    packsswb  m1, m2
    pminub    m1, m3
    paddb     m0, m1
    paddb     m4, m0

    add       r0, 128                   ; linear dst: 2 rows written
    lea       r1, [r1 + 2 * r2]
    dec       r3d
    jnz       .loop

    ; get count
    xorpd     m0, m0
    vextracti128 xm1, m4, 1
    paddb     xm4, xm1
    psadbw    xm4, xm0                  ; horizontal byte sum
    movhlps   xm1, xm4
    paddd     xm4, xm1
    movd      eax, xm4                  ; eax = non-zero count
    RET
4467
4468
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
; 4x4 block: strided 2D src -> contiguous 1D dst, each value shifted left.
; Note: shift arrives in r3d (4th register arg), unlike the cpy1Dto2D variants
; which read it from the stack (r3m).
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_4, 4, 4, 4
    add       r2d, r2d                  ; srcStride in bytes
    movd      m0, r3d                   ; m0 = shift

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; m0 - shift

    ; Row 0-3 (two 4-sample rows packed per xmm)
    movh      m2, [r1]
    movhps    m2, [r1 + r2]
    lea       r1, [r1 + r2 * 2]
    movh      m3, [r1]
    movhps    m3, [r1 + r2]
    psllw     m2, m0
    psllw     m3, m0
    mova      [r0 + 0 * mmsize], m2
    mova      [r0 + 1 * mmsize], m3

    RET
4495
4496
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
; 8x8 block: strided 2D src -> contiguous 1D dst, each value shifted left.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_8, 4, 5, 4
    add       r2d, r2d                  ; srcStride in bytes
    movd      m0, r3d                   ; m0 = shift (then r3 is reused as counter)
    mov       r3d, 8/4                  ; four rows per iteration
    lea       r4, [r2 * 3]

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; r4 - stride * 3
    ; m0 - shift

.loop:
    ; Row 0, 1
    mova      m2, [r1]
    mova      m3, [r1 + r2]
    psllw     m2, m0
    psllw     m3, m0
    mova      [r0 + 0 * mmsize], m2
    mova      [r0 + 1 * mmsize], m3

    ; Row 2, 3
    mova      m2, [r1 + r2 * 2]
    mova      m3, [r1 + r4]
    psllw     m2, m0
    psllw     m3, m0
    mova      [r0 + 2 * mmsize], m2
    mova      [r0 + 3 * mmsize], m3

    add       r0, 4 * mmsize            ; linear dst
    lea       r1, [r1 + r2 * 4]         ; src: advance four rows
    dec       r3d
    jnz       .loop
    RET
4537
4538
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
; 16x16 block: strided 2D src -> contiguous 1D dst, each value shifted left.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_16, 4, 4, 4
    add       r2d, r2d                  ; srcStride in bytes
    movd      m0, r3d                   ; m0 = shift (then r3 is reused as counter)
    mov       r3d, 16/2                 ; two rows per iteration

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift

.loop:
    ; Row 0
    mova      m2, [r1 + 0 * mmsize]
    mova      m3, [r1 + 1 * mmsize]
    psllw     m2, m0
    psllw     m3, m0
    mova      [r0 + 0 * mmsize], m2
    mova      [r0 + 1 * mmsize], m3

    ; Row 1
    mova      m2, [r1 + r2 + 0 * mmsize]
    mova      m3, [r1 + r2 + 1 * mmsize]
    psllw     m2, m0
    psllw     m3, m0
    mova      [r0 + 2 * mmsize], m2
    mova      [r0 + 3 * mmsize], m3

    add       r0, 4 * mmsize            ; linear dst
    lea       r1, [r1 + r2 * 2]         ; src: advance two rows
    dec       r3d
    jnz       .loop
    RET
4577
4578
;--------------------------------------------------------------------------------------
; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
; 32x32 block: strided 2D src -> contiguous 1D dst, each value shifted left.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy2Dto1D_shl_32, 4, 4, 6
    add       r2d, r2d                  ; srcStride in bytes
    movd      m0, r3d                   ; m0 = shift (then r3 is reused as counter)
    mov       r3d, 32/1                 ; one full row (64 bytes) per iteration

    ; register alloc
    ; r0 - dst
    ; r1 - src
    ; r2 - srcStride
    ; r3 - loop counter
    ; m0 - shift

.loop:
    ; Row 0
    mova      m2, [r1 + 0 * mmsize]
    mova      m3, [r1 + 1 * mmsize]
    mova      m4, [r1 + 2 * mmsize]
    mova      m5, [r1 + 3 * mmsize]
    psllw     m2, m0
    psllw     m3, m0
    psllw     m4, m0
    psllw     m5, m0
    mova      [r0 + 0 * mmsize], m2
    mova      [r0 + 1 * mmsize], m3
    mova      [r0 + 2 * mmsize], m4
    mova      [r0 + 3 * mmsize], m5

    add       r0, 4 * mmsize            ; linear dst
    add       r1, r2                    ; next src row
    dec       r3d
    jnz       .loop
    RET
4615
4616
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
; 4x4 block: contiguous 1D src -> strided 2D dst with rounding right-shift:
; dst[i] = (src[i] + round) >> shift, round = 1 << (shift - 1).  Assumes shift >= 1.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_4, 3, 3, 4
    add       r2d, r2d                  ; dstStride in bytes
    movd      m0, r3m                   ; m0 = shift
    pcmpeqw   m1, m1                    ; m1 = all ones
    psllw     m1, m0
    psraw     m1, 1                     ; m1 = -(1 << (shift - 1)) = -round

    ; Row 0-3 (two 4-sample rows per xmm)
    mova      m2, [r1 + 0 * mmsize]
    mova      m3, [r1 + 1 * mmsize]
    psubw     m2, m1                    ; x + round
    psubw     m3, m1
    psraw     m2, m0                    ; >> shift
    psraw     m3, m0
    movh      [r0], m2                  ; row 0
    movhps    [r0 + r2], m2             ; row 1
    movh      [r0 + r2 * 2], m3         ; row 2
    lea       r2, [r2 * 3]
    movhps    [r0 + r2], m3             ; row 3
    RET
4641
4642
; AVX2 variant: the whole 4x4 block fits in one ymm register.
INIT_YMM avx2
cglobal cpy1Dto2D_shr_4, 3, 3, 3
    add       r2d, r2d                  ; dstStride in bytes
    movd      xm0, r3m                  ; xm0 = shift
    pcmpeqw   m1, m1                    ; all ones
    psllw     m1, xm0
    psraw     m1, 1                     ; m1 = -round, round = 1 << (shift - 1)

    ; Row 0-3
    movu      m2, [r1]
    psubw     m2, m1                    ; x + round
    psraw     m2, xm0                   ; >> shift
    vextracti128 xm1, m2, 1             ; rows 2-3 (reuses m1; -round no longer needed)
    movq      [r0], xm2                 ; row 0
    movhps    [r0 + r2], xm2            ; row 1
    lea       r0, [r0 + r2 * 2]
    movq      [r0], xm1                 ; row 2
    movhps    [r0 + r2], xm1            ; row 3
    RET
4662
4663
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
; 8x8 block: contiguous 1D src -> strided 2D dst with rounding right-shift:
; dst[i] = (src[i] + round) >> shift, round = 1 << (shift - 1).  Assumes shift >= 1.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_8, 3, 4, 6
    add       r2d, r2d                  ; dstStride in bytes
    movd      m0, r3m                   ; m0 = shift
    pcmpeqw   m1, m1                    ; all ones
    psllw     m1, m0
    psraw     m1, 1                     ; m1 = -round
    lea       r3, [r2 * 3]

    ; Row 0-3 (one xmm per 8-sample row)
    mova      m2, [r1 + 0 * mmsize]
    mova      m3, [r1 + 1 * mmsize]
    mova      m4, [r1 + 2 * mmsize]
    mova      m5, [r1 + 3 * mmsize]
    psubw     m2, m1                    ; x + round
    psubw     m3, m1
    psubw     m4, m1
    psubw     m5, m1
    psraw     m2, m0                    ; >> shift
    psraw     m3, m0
    psraw     m4, m0
    psraw     m5, m0
    mova      [r0], m2
    mova      [r0 + r2], m3
    mova      [r0 + r2 * 2], m4
    mova      [r0 + r3], m5

    ; Row 4-7
    mova      m2, [r1 + 4 * mmsize]
    mova      m3, [r1 + 5 * mmsize]
    mova      m4, [r1 + 6 * mmsize]
    mova      m5, [r1 + 7 * mmsize]
    lea       r0, [r0 + r2 * 4]
    psubw     m2, m1
    psubw     m3, m1
    psubw     m4, m1
    psubw     m5, m1
    psraw     m2, m0
    psraw     m3, m0
    psraw     m4, m0
    psraw     m5, m0
    mova      [r0], m2
    mova      [r0 + r2], m3
    mova      [r0 + r2 * 2], m4
    mova      [r0 + r3], m5
    RET
4713
4714
; AVX2 variant: each ymm holds two 8-sample rows; upper lanes are stored with
; vextracti128 directly to memory.
INIT_YMM avx2
cglobal cpy1Dto2D_shr_8, 3, 4, 4
    add       r2d, r2d                  ; dstStride in bytes
    movd      xm0, r3m                  ; xm0 = shift
    pcmpeqw   m1, m1                    ; all ones
    psllw     m1, xm0
    psraw     m1, 1                     ; m1 = -round
    lea       r3, [r2 * 3]

    ; Row 0-3
    movu      m2, [r1 + 0 * mmsize]
    movu      m3, [r1 + 1 * mmsize]
    psubw     m2, m1                    ; x + round
    psubw     m3, m1
    psraw     m2, xm0                   ; >> shift
    psraw     m3, xm0
    movu      [r0], xm2                 ; row 0 (low lane)
    vextracti128 [r0 + r2], m2, 1       ; row 1 (high lane)
    movu      [r0 + r2 * 2], xm3
    vextracti128 [r0 + r3], m3, 1

    ; Row 4-7
    movu      m2, [r1 + 2 * mmsize]
    movu      m3, [r1 + 3 * mmsize]
    lea       r0, [r0 + r2 * 4]
    psubw     m2, m1
    psubw     m3, m1
    psraw     m2, xm0
    psraw     m3, xm0
    movu      [r0], xm2
    vextracti128 [r0 + r2], m2, 1
    movu      [r0 + r2 * 2], xm3
    vextracti128 [r0 + r3], m3, 1
    RET
4749
4750
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
; 16x16 block: contiguous 1D src -> strided 2D dst with rounding right-shift:
; dst[i] = (src[i] + round) >> shift, round = 1 << (shift - 1).  Assumes shift >= 1.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_16, 3, 5, 6
    add       r2d, r2d                  ; dstStride in bytes
    movd      m0, r3m                   ; m0 = shift
    pcmpeqw   m1, m1                    ; all ones
    psllw     m1, m0
    psraw     m1, 1                     ; m1 = -round
    mov       r3d, 16/4                 ; four rows per iteration
    lea       r4, [r2 * 3]

.loop:
    ; Row 0-1 (two xmm per 16-sample row)
    mova      m2, [r1 + 0 * mmsize]
    mova      m3, [r1 + 1 * mmsize]
    mova      m4, [r1 + 2 * mmsize]
    mova      m5, [r1 + 3 * mmsize]
    psubw     m2, m1                    ; x + round
    psubw     m3, m1
    psubw     m4, m1
    psubw     m5, m1
    psraw     m2, m0                    ; >> shift
    psraw     m3, m0
    psraw     m4, m0
    psraw     m5, m0
    mova      [r0], m2
    mova      [r0 + mmsize], m3
    mova      [r0 + r2], m4
    mova      [r0 + r2 + mmsize], m5

    ; Row 2-3
    mova      m2, [r1 + 4 * mmsize]
    mova      m3, [r1 + 5 * mmsize]
    mova      m4, [r1 + 6 * mmsize]
    mova      m5, [r1 + 7 * mmsize]
    psubw     m2, m1
    psubw     m3, m1
    psubw     m4, m1
    psubw     m5, m1
    psraw     m2, m0
    psraw     m3, m0
    psraw     m4, m0
    psraw     m5, m0
    mova      [r0 + r2 * 2], m2
    mova      [r0 + r2 * 2 + mmsize], m3
    mova      [r0 + r4], m4
    mova      [r0 + r4 + mmsize], m5

    add       r1, 8 * mmsize            ; linear src: 4 rows consumed
    lea       r0, [r0 + r2 * 4]
    dec       r3d
    jnz       .loop
    RET
4806
4807
; AVX2 variant: one ymm per 16-sample row, four rows per iteration.
INIT_YMM avx2
cglobal cpy1Dto2D_shr_16, 3, 5, 4
    add       r2d, r2d                  ; dstStride in bytes
    movd      xm0, r3m                  ; xm0 = shift
    pcmpeqw   m1, m1                    ; all ones
    psllw     m1, xm0
    psraw     m1, 1                     ; m1 = -round, round = 1 << (shift - 1)
    mov       r3d, 16/4                 ; four rows per iteration
    lea       r4, [r2 * 3]

.loop:
    ; Row 0-1
    movu      m2, [r1 + 0 * mmsize]
    movu      m3, [r1 + 1 * mmsize]
    psubw     m2, m1                    ; x + round
    psubw     m3, m1
    psraw     m2, xm0                   ; >> shift
    psraw     m3, xm0
    movu      [r0], m2
    movu      [r0 + r2], m3

    ; Row 2-3
    movu      m2, [r1 + 2 * mmsize]
    movu      m3, [r1 + 3 * mmsize]
    psubw     m2, m1
    psubw     m3, m1
    psraw     m2, xm0
    psraw     m3, xm0
    movu      [r0 + r2 * 2], m2
    movu      [r0 + r4], m3

    add       r1, 4 * mmsize            ; linear src: 4 rows consumed
    lea       r0, [r0 + r2 * 4]
    dec       r3d
    jnz       .loop
    RET
4844
4845
;--------------------------------------------------------------------------------------
; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
; 32x32 block: contiguous 1D src -> strided 2D dst with rounding right-shift:
; dst[i] = (src[i] + round) >> shift, round = 1 << (shift - 1).  Assumes shift >= 1.
;--------------------------------------------------------------------------------------
INIT_XMM sse2
cglobal cpy1Dto2D_shr_32, 3, 4, 6
    add       r2d, r2d                  ; dstStride in bytes
    movd      m0, r3m                   ; m0 = shift
    pcmpeqw   m1, m1                    ; all ones
    psllw     m1, m0
    psraw     m1, 1                     ; m1 = -round
    mov       r3d, 32/2                 ; two rows per iteration

.loop:
    ; Row 0 (four xmm per 32-sample row)
    mova      m2, [r1 + 0 * mmsize]
    mova      m3, [r1 + 1 * mmsize]
    mova      m4, [r1 + 2 * mmsize]
    mova      m5, [r1 + 3 * mmsize]
    psubw     m2, m1                    ; x + round
    psubw     m3, m1
    psubw     m4, m1
    psubw     m5, m1
    psraw     m2, m0                    ; >> shift
    psraw     m3, m0
    psraw     m4, m0
    psraw     m5, m0
    mova      [r0 + 0 * mmsize], m2
    mova      [r0 + 1 * mmsize], m3
    mova      [r0 + 2 * mmsize], m4
    mova      [r0 + 3 * mmsize], m5

    ; Row 1
    mova      m2, [r1 + 4 * mmsize]
    mova      m3, [r1 + 5 * mmsize]
    mova      m4, [r1 + 6 * mmsize]
    mova      m5, [r1 + 7 * mmsize]
    psubw     m2, m1
    psubw     m3, m1
    psubw     m4, m1
    psubw     m5, m1
    psraw     m2, m0
    psraw     m3, m0
    psraw     m4, m0
    psraw     m5, m0
    mova      [r0 + r2 + 0 * mmsize], m2
    mova      [r0 + r2 + 1 * mmsize], m3
    mova      [r0 + r2 + 2 * mmsize], m4
    mova      [r0 + r2 + 3 * mmsize], m5

    add       r1, 8 * mmsize            ; linear src: 2 rows consumed
    lea       r0, [r0 + r2 * 2]
    dec       r3d
    jnz       .loop
    RET
4900
4901
; AVX2 variant: two ymm per 32-sample row, two rows per iteration.
INIT_YMM avx2
cglobal cpy1Dto2D_shr_32, 3, 4, 6
    add       r2d, r2d                  ; dstStride in bytes
    movd      xm0, r3m                  ; xm0 = shift
    pcmpeqw   m1, m1                    ; all ones
    psllw     m1, xm0
    psraw     m1, 1                     ; m1 = -round, round = 1 << (shift - 1)
    mov       r3d, 32/2                 ; two rows per iteration

.loop:
    ; Row 0-1
    movu      m2, [r1 + 0 * mmsize]
    movu      m3, [r1 + 1 * mmsize]
    movu      m4, [r1 + 2 * mmsize]
    movu      m5, [r1 + 3 * mmsize]
    psubw     m2, m1                    ; x + round
    psubw     m3, m1
    psubw     m4, m1
    psubw     m5, m1
    psraw     m2, xm0                   ; >> shift
    psraw     m3, xm0
    psraw     m4, xm0
    psraw     m5, xm0
    movu      [r0], m2
    movu      [r0 + mmsize], m3
    movu      [r0 + r2], m4
    movu      [r0 + r2 + mmsize], m5

    add       r1, 4 * mmsize            ; linear src: 2 rows consumed
    lea       r0, [r0 + r2 * 2]
    dec       r3d
    jnz       .loop
    RET