1 ;*****************************************************************************
2 ;* Copyright (C) 2013 x265 project
4 ;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com>
5 ;* Murugan Vairavel <murugan@multicorewareinc.com>
7 ;* This program is free software; you can redistribute it and/or modify
8 ;* it under the terms of the GNU General Public License as published by
9 ;* the Free Software Foundation; either version 2 of the License, or
10 ;* (at your option) any later version.
12 ;* This program is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;* GNU General Public License for more details.
17 ;* You should have received a copy of the GNU General Public License
18 ;* along with this program; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 ;* This program is also available under a commercial proprietary license.
22 ;* For more information, contact us at license @ x265.com.
23 ;*****************************************************************************/
26 %include "x86util.asm"
30 tab_Vm: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
43 ;-----------------------------------------------------------------------------
44 ; void blockcopy_pp_2x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
45 ;-----------------------------------------------------------------------------
47 cglobal blockcopy_pp_2x4, 4, 7, 0
61 ;-----------------------------------------------------------------------------
62 ; void blockcopy_pp_2x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
63 ;-----------------------------------------------------------------------------
65 cglobal blockcopy_pp_2x8, 4, 7, 0
68 mov r6w, [r2 + 2 * r3]
72 mov [r0 + 2 * r1], r6w
78 mov r5w, [r2 + 2 * r3]
81 mov [r0 + 2 * r1], r5w
87 mov r5w, [r2 + 2 * r3]
90 mov [r0 + 2 * r1], r5w
99 ;-----------------------------------------------------------------------------
100 ; void blockcopy_pp_2x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
101 ;-----------------------------------------------------------------------------
103 cglobal blockcopy_pp_2x16, 4, 7, 0
109 lea r2, [r2 + r3 * 2]
112 lea r0, [r0 + r1 * 2]
117 ;-----------------------------------------------------------------------------
118 ; void blockcopy_pp_4x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
119 ;-----------------------------------------------------------------------------
121 cglobal blockcopy_pp_4x2, 4, 6, 0
129 ;-----------------------------------------------------------------------------
130 ; void blockcopy_pp_4x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
131 ;-----------------------------------------------------------------------------
; Copies a 4x4 block; the C prototype is given in the header above.
; NOTE(review): register roles assume the x86inc cglobal convention, i.e. the
; C arguments map in order to r0..r3 (r0 = dst, r1 = dstStride, r2 = src,
; r3 = srcStride) -- confirm against x86inc.asm.
133 cglobal blockcopy_pp_4x4, 4, 4, 4
136 movd m2, [r2 + 2 * r3]      ; m2 = 4 bytes read at r2 + 2 * r3
137 lea r3, [r3 + r3 * 2]       ; r3 = 3 * r3 (presumably the offset of the fourth source row)
142 movd [r0 + 2 * r1], m2      ; write those 4 bytes at r0 + 2 * r1
143 lea r1, [r1 + 2 * r1]       ; r1 = 3 * r1 (destination counterpart of the r3 scaling)
147 ;-----------------------------------------------------------------------------
148 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
149 ;-----------------------------------------------------------------------------
150 %macro BLOCKCOPY_PP_W4_H8 2
152 cglobal blockcopy_pp_%1x%2, 4, 5, 4
157 lea r2, [r2 + 2 * r3]
163 lea r0, [r0 + 2 * r1]
167 lea r0, [r0 + 2 * r1]
168 lea r2, [r2 + 2 * r3]
171 lea r2, [r2 + 2 * r3]
177 lea r0, [r0 + 2 * r1]
181 lea r0, [r0 + 2 * r1]
182 lea r2, [r2 + 2 * r3]
189 BLOCKCOPY_PP_W4_H8 4, 8
190 BLOCKCOPY_PP_W4_H8 4, 16
192 BLOCKCOPY_PP_W4_H8 4, 32
194 ;-----------------------------------------------------------------------------
195 ; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
196 ;-----------------------------------------------------------------------------
; Copies a 6x8 block; the C prototype is given in the header above.
; Each row is moved in two pieces: a 4-byte movd at column offset 0 and a
; 16-bit general-purpose move at column offset 4.
; NOTE(review): register roles assume the x86inc cglobal convention
; (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride) -- confirm.
198 cglobal blockcopy_pp_6x8, 4, 7, 8
; --- 4-byte part: r5 walks the source in steps of 2 * r3 ---
202 movd m2, [r2 + 2 * r3]      ; m2 = 4 bytes at r2 + 2 * r3
203 lea r5, [r2 + 2 * r3]       ; r5 = r2 + 2 * r3
206 movd m4, [r5 + 2 * r3]      ; m4 = 4 bytes two strides past r5
207 lea r5, [r5 + 2 * r3]       ; r5 += 2 * r3
209 movd m6, [r5 + 2 * r3]      ; m6 = 4 bytes two strides past r5
210 lea r5, [r5 + 2 * r3]       ; r5 += 2 * r3
; --- 4-byte part: r6 walks the destination in steps of 2 * r1 ---
215 movd [r0 + 2 * r1], m2      ; store m2 at r0 + 2 * r1
216 lea r6, [r0 + 2 * r1]       ; r6 = r0 + 2 * r1
219 movd [r6 + 2 * r1], m4      ; store m4 two strides past r6
220 lea r6, [r6 + 2 * r1]       ; r6 += 2 * r1
222 movd [r6 + 2 * r1], m6      ; store m6 two strides past r6
223 lea r6, [r6 + 2 * r1]       ; r6 += 2 * r1
; --- 2-byte part (byte offset 4 of each row) ---
; From here on r4/r5/r6 are reused as 16-bit scratch registers; the row
; pointers previously held in r5/r6 are overwritten.
227 mov r5w, [r2 + r3 + 4]      ; 16 bits at offset 4, one stride past r2
228 mov r6w, [r2 + 2 * r3 + 4]  ; 16 bits at offset 4, two strides past r2
231 mov [r0 + r1 + 4], r5w      ; matching destination positions
232 mov [r0 + 2 * r1 + 4], r6w
234 lea r0, [r0 + 2 * r1]       ; advance dst pointer by two strides
235 lea r2, [r2 + 2 * r3]       ; advance src pointer by two strides
237 mov r4w, [r2 + r3 + 4]      ; same pair of 16-bit copies for the next two rows
238 mov r5w, [r2 + 2 * r3 + 4]
240 mov [r0 + r1 + 4], r4w
241 mov [r0 + 2 * r1 + 4], r5w
243 lea r0, [r0 + 2 * r1]       ; advance dst pointer by two strides
244 lea r2, [r2 + 2 * r3]       ; advance src pointer by two strides
246 mov r4w, [r2 + r3 + 4]      ; same pair of 16-bit copies for the next two rows
247 mov r5w, [r2 + 2 * r3 + 4]
249 mov [r0 + r1 + 4], r4w
250 mov [r0 + 2 * r1 + 4], r5w
252 lea r0, [r0 + 2 * r1]       ; advance dst pointer by two strides
253 lea r2, [r2 + 2 * r3]       ; advance src pointer by two strides
255 mov r4w, [r2 + r3 + 4]      ; final 16-bit copy, one stride past the current pointers
256 mov [r0 + r1 + 4], r4w
259 ;-----------------------------------------------------------------------------
260 ; void blockcopy_pp_6x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
261 ;-----------------------------------------------------------------------------
263 cglobal blockcopy_pp_6x16, 4, 7, 2
269 mov r5w, [r2 + r3 + 4]
270 lea r2, [r2 + r3 * 2]
274 mov [r0 + r1 + 4], r5w
275 lea r0, [r0 + r1 * 2]
281 ;-----------------------------------------------------------------------------
282 ; void blockcopy_pp_8x2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
283 ;-----------------------------------------------------------------------------
285 cglobal blockcopy_pp_8x2, 4, 4, 2
293 ;-----------------------------------------------------------------------------
294 ; void blockcopy_pp_8x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
295 ;-----------------------------------------------------------------------------
; Copies an 8x4 block; the C prototype is given in the header above.
; NOTE(review): register roles assume the x86inc cglobal convention
; (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride), and movh is assumed
; to be the x86inc alias for a 64-bit (low half) vector move -- confirm.
297 cglobal blockcopy_pp_8x4, 4, 4, 4
300 movh m2, [r2 + 2 * r3]      ; m2 = data read at r2 + 2 * r3
301 lea r3, [r3 + r3 * 2]       ; r3 = 3 * r3 (presumably the offset of the fourth source row)
306 movh [r0 + 2 * r1], m2      ; write m2's low half at r0 + 2 * r1
307 lea r1, [r1 + 2 * r1]       ; r1 = 3 * r1 (destination counterpart of the r3 scaling)
311 ;-----------------------------------------------------------------------------
312 ; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
313 ;-----------------------------------------------------------------------------
; Copies an 8x6 block; the C prototype is given in the header above.
; r5 and r6 are used as moving source / destination pointers so that r2 and
; r0 themselves are not modified by the lines below.
; NOTE(review): register roles assume the x86inc cglobal convention
; (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride) -- confirm.
315 cglobal blockcopy_pp_8x6, 4, 7, 6
318 movh m2, [r2 + 2 * r3]      ; m2 = data at r2 + 2 * r3
319 lea r5, [r2 + 2 * r3]       ; r5 = r2 + 2 * r3
321 movh m4, [r5 + 2 * r3]      ; m4 = data two strides past r5
322 lea r5, [r5 + 2 * r3]       ; r5 += 2 * r3
327 movh [r0 + 2 * r1], m2      ; store m2 at r0 + 2 * r1
328 lea r6, [r0 + 2 * r1]       ; r6 = r0 + 2 * r1
330 movh [r6 + 2 * r1], m4      ; store m4 two strides past r6
331 lea r6, [r6 + 2 * r1]       ; r6 += 2 * r1
335 ;-----------------------------------------------------------------------------
336 ; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
337 ;-----------------------------------------------------------------------------
339 cglobal blockcopy_pp_8x12, 4, 5, 2
347 lea r0, [r0 + 2 * r1]
348 lea r2, [r2 + 2 * r3]
352 ;-----------------------------------------------------------------------------
353 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
354 ;-----------------------------------------------------------------------------
355 %macro BLOCKCOPY_PP_W8_H8 2
357 cglobal blockcopy_pp_%1x%2, 4, 5, 6
363 lea r2, [r2 + 2 * r3]
366 lea r2, [r2 + 2 * r3]
372 lea r0, [r0 + 2 * r1]
375 lea r0, [r0 + 2 * r1]
379 lea r2, [r2 + 2 * r3]
382 lea r0, [r0 + 2 * r1]
387 lea r0, [r0 + 2 * r1]
388 lea r2, [r2 + 2 * r3]
393 BLOCKCOPY_PP_W8_H8 8, 8
394 BLOCKCOPY_PP_W8_H8 8, 16
395 BLOCKCOPY_PP_W8_H8 8, 32
397 BLOCKCOPY_PP_W8_H8 8, 64
399 ;-----------------------------------------------------------------------------
400 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
401 ;-----------------------------------------------------------------------------
402 %macro BLOCKCOPY_PP_W12_H4 2
404 cglobal blockcopy_pp_%1x%2, 4, 5, 4
411 movd m3, [r2 + r3 + 8]
412 lea r2, [r2 + 2 * r3]
417 movd [r0 + r1 + 8], m3
418 lea r0, [r0 + 2 * r1]
423 movd m3, [r2 + r3 + 8]
428 movd [r0 + r1 + 8], m3
431 lea r0, [r0 + 2 * r1]
432 lea r2, [r2 + 2 * r3]
437 BLOCKCOPY_PP_W12_H4 12, 16
439 BLOCKCOPY_PP_W12_H4 12, 32
441 ;-----------------------------------------------------------------------------
442 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
443 ;-----------------------------------------------------------------------------
444 %macro BLOCKCOPY_PP_W16_H4 2
446 cglobal blockcopy_pp_%1x%2, 4, 5, 4
452 lea r2, [r2 + 2 * r3]
458 lea r0, [r0 + 2 * r1]
463 lea r0, [r0 + 2 * r1]
464 lea r2, [r2 + 2 * r3]
470 BLOCKCOPY_PP_W16_H4 16, 4
471 BLOCKCOPY_PP_W16_H4 16, 12
473 ;-----------------------------------------------------------------------------
474 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
475 ;-----------------------------------------------------------------------------
476 %macro BLOCKCOPY_PP_W16_H8 2
478 cglobal blockcopy_pp_%1x%2, 4, 5, 6
484 lea r2, [r2 + 2 * r3]
487 lea r2, [r2 + 2 * r3]
490 lea r2, [r2 + 2 * r3]
494 lea r0, [r0 + 2 * r1]
497 lea r0, [r0 + 2 * r1]
500 lea r0, [r0 + 2 * r1]
508 lea r0, [r0 + 2 * r1]
509 lea r2, [r2 + 2 * r3]
514 BLOCKCOPY_PP_W16_H8 16, 8
515 BLOCKCOPY_PP_W16_H8 16, 16
516 BLOCKCOPY_PP_W16_H8 16, 32
517 BLOCKCOPY_PP_W16_H8 16, 64
519 BLOCKCOPY_PP_W16_H8 16, 24
521 ;-----------------------------------------------------------------------------
522 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
523 ;-----------------------------------------------------------------------------
524 %macro BLOCKCOPY_PP_W24_H4 2
526 cglobal blockcopy_pp_%1x%2, 4, 5, 6
533 movh m3, [r2 + r3 + 16]
534 lea r2, [r2 + 2 * r3]
541 movh [r0 + r1 + 16], m3
542 lea r0, [r0 + 2 * r1]
547 movh m1, [r2 + r3 + 16]
549 movh [r0 + r1 + 16], m1
552 lea r0, [r0 + 2 * r1]
553 lea r2, [r2 + 2 * r3]
558 BLOCKCOPY_PP_W24_H4 24, 32
560 BLOCKCOPY_PP_W24_H4 24, 64
562 ;-----------------------------------------------------------------------------
563 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
564 ;-----------------------------------------------------------------------------
565 %macro BLOCKCOPY_PP_W32_H4 2
567 cglobal blockcopy_pp_%1x%2, 4, 5, 4
574 movu m3, [r2 + r3 + 16]
575 lea r2, [r2 + 2 * r3]
580 movu [r0 + r1 + 16], m3
581 lea r0, [r0 + 2 * r1]
586 movu m3, [r2 + r3 + 16]
591 movu [r0 + r1 + 16], m3
594 lea r0, [r0 + 2 * r1]
595 lea r2, [r2 + 2 * r3]
600 BLOCKCOPY_PP_W32_H4 32, 8
601 BLOCKCOPY_PP_W32_H4 32, 16
602 BLOCKCOPY_PP_W32_H4 32, 24
603 BLOCKCOPY_PP_W32_H4 32, 32
604 BLOCKCOPY_PP_W32_H4 32, 64
606 BLOCKCOPY_PP_W32_H4 32, 48
;-----------------------------------------------------------------------------
; void blockcopy_pp_32x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; Stand-alone 32x8 variant. Source and destination pointers advance four
; strides at a time.
; NOTE(review): the vector width of m0..m5 depends on the active INIT_* mode,
; which is not visible next to this definition -- confirm. Register roles
; assume the x86inc cglobal convention (r0 = dst, r1 = dstStride, r2 = src,
; r3 = srcStride).
609 cglobal blockcopy_pp_32x8, 4, 6, 6
615 movu m2, [r2 + 2 * r3]      ; unaligned load, two strides past r2
617 lea r2, [r2 + 4 * r3]       ; src pointer += 4 strides
623 movu [r0 + 2 * r1], m2      ; unaligned store, two strides past r0
625 lea r0, [r0 + 4 * r1]       ; dst pointer += 4 strides
629 movu m0, [r2 + 2 * r3]      ; same position within the next four-stride group
632 movu [r0 + 2 * r1], m0
;-----------------------------------------------------------------------------
; void blockcopy_pp_32x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; Stand-alone 32x16 variant. The source pointer (r2) and destination pointer
; (r0) each advance four strides at a time; loads and stores of neighbouring
; groups are interleaved.
; NOTE(review): the vector width of m0..m5 depends on the active INIT_* mode,
; which is not visible next to this definition -- confirm. Register roles
; assume the x86inc cglobal convention (r0 = dst, r1 = dstStride, r2 = src,
; r3 = srcStride).
637 cglobal blockcopy_pp_32x16, 4, 6, 6
643 movu m2, [r2 + 2 * r3]      ; load, two strides past r2
645 lea r2, [r2 + 4 * r3]       ; src pointer += 4 strides
651 movu [r0 + 2 * r1], m2      ; store, two strides past r0
653 lea r0, [r0 + 4 * r1]       ; dst pointer += 4 strides
657 movu m0, [r2 + 2 * r3]      ; load from the second four-stride group
659 lea r2, [r2 + 4 * r3]       ; src pointer += 4 strides
662 movu m4, [r2 + 2 * r3]      ; load from the third group before the second is stored
665 movu [r0 + 2 * r1], m0      ; store for the second group
667 lea r0, [r0 + 4 * r1]       ; dst pointer += 4 strides
670 movu [r0 + 2 * r1], m4      ; store for the third group
673 lea r2, [r2 + 4 * r3]       ; src pointer += 4 strides
676 movu m2, [r2 + 2 * r3]      ; load from the fourth group
679 lea r0, [r0 + 4 * r1]       ; dst pointer += 4 strides
682 movu [r0 + 2 * r1], m2      ; store for the fourth group
686 ;-----------------------------------------------------------------------------
687 ; void blockcopy_pp_32x24(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
688 ;-----------------------------------------------------------------------------
690 cglobal blockcopy_pp_32x24, 4, 7, 6
698 movu m2, [r2 + 2 * r3]
700 lea r2, [r2 + 4 * r3]
706 movu [r0 + 2 * r1], m2
708 lea r0, [r0 + 4 * r1]
712 movu m0, [r2 + 2 * r3]
715 movu [r0 + 2 * r1], m0
718 lea r2, [r2 + 4 * r3]
719 lea r0, [r0 + 4 * r1]
724 ;-----------------------------------------------------------------------------
725 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
726 ;-----------------------------------------------------------------------------
727 %macro BLOCKCOPY_PP_W32_H16_avx 2
729 cglobal blockcopy_pp_%1x%2, 4, 7, 6
737 movu m2, [r2 + 2 * r3]
739 lea r2, [r2 + 4 * r3]
745 movu [r0 + 2 * r1], m2
747 lea r0, [r0 + 4 * r1]
751 movu m0, [r2 + 2 * r3]
753 lea r2, [r2 + 4 * r3]
756 movu m4, [r2 + 2 * r3]
759 movu [r0 + 2 * r1], m0
761 lea r0, [r0 + 4 * r1]
764 movu [r0 + 2 * r1], m4
767 lea r2, [r2 + 4 * r3]
770 movu m2, [r2 + 2 * r3]
773 lea r0, [r0 + 4 * r1]
776 movu [r0 + 2 * r1], m2
779 lea r2, [r2 + 4 * r3]
780 lea r0, [r0 + 4 * r1]
786 BLOCKCOPY_PP_W32_H16_avx 32, 32
787 BLOCKCOPY_PP_W32_H16_avx 32, 48
788 BLOCKCOPY_PP_W32_H16_avx 32, 64
790 ;-----------------------------------------------------------------------------
791 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
792 ;-----------------------------------------------------------------------------
793 %macro BLOCKCOPY_PP_W48_H2 2
795 cglobal blockcopy_pp_%1x%2, 4, 5, 6
803 movu m4, [r2 + r3 + 16]
804 movu m5, [r2 + r3 + 32]
805 lea r2, [r2 + 2 * r3]
811 movu [r0 + r1 + 16], m4
812 movu [r0 + r1 + 32], m5
813 lea r0, [r0 + 2 * r1]
819 movu m4, [r2 + r3 + 16]
820 movu m5, [r2 + r3 + 32]
826 movu [r0 + r1 + 16], m4
827 movu [r0 + r1 + 32], m5
830 lea r0, [r0 + 2 * r1]
831 lea r2, [r2 + 2 * r3]
836 BLOCKCOPY_PP_W48_H2 48, 64
838 ;-----------------------------------------------------------------------------
839 ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
840 ;-----------------------------------------------------------------------------
841 %macro BLOCKCOPY_PP_W64_H4 2
843 cglobal blockcopy_pp_%1x%2, 4, 5, 6
852 movu m5, [r2 + r3 + 16]
859 movu [r0 + r1 + 16], m5
861 movu m0, [r2 + r3 + 32]
862 movu m1, [r2 + r3 + 48]
863 lea r2, [r2 + 2 * r3]
869 movu [r0 + r1 + 32], m0
870 movu [r0 + r1 + 48], m1
871 lea r0, [r0 + 2 * r1]
878 movu m1, [r2 + r3 + 16]
879 movu m2, [r2 + r3 + 32]
880 movu m3, [r2 + r3 + 48]
883 movu [r0 + r1 + 16], m1
884 movu [r0 + r1 + 32], m2
885 movu [r0 + r1 + 48], m3
888 lea r0, [r0 + 2 * r1]
889 lea r2, [r2 + 2 * r3]
894 BLOCKCOPY_PP_W64_H4 64, 16
895 BLOCKCOPY_PP_W64_H4 64, 32
896 BLOCKCOPY_PP_W64_H4 64, 48
897 BLOCKCOPY_PP_W64_H4 64, 64
899 ;-----------------------------------------------------------------------------
900 ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
901 ;-----------------------------------------------------------------------------
903 cglobal blockcopy_sp_2x4, 4, 5, 2
913 pextrw [r0 + r1], m0, 4
916 movd m0, [r2 + 2 * r3]
917 lea r2, [r2 + 2 * r3]
921 mov [r0 + 2 * r1], r4w
922 lea r0, [r0 + 2 * r1]
923 pextrw [r0 + r1], m0, 4
928 ;-----------------------------------------------------------------------------
929 ; void blockcopy_sp_2x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
930 ;-----------------------------------------------------------------------------
932 cglobal blockcopy_sp_2x8, 4, 5, 2
942 pextrw [r0 + r1], m0, 4
945 movd m0, [r2 + 2 * r3]
946 lea r2, [r2 + 2 * r3]
950 mov [r0 + 2 * r1], r4w
951 lea r0, [r0 + 2 * r1]
952 pextrw [r0 + r1], m0, 4
955 movd m0, [r2 + 2 * r3]
956 lea r2, [r2 + 2 * r3]
960 mov [r0 + 2 * r1], r4w
961 lea r0, [r0 + 2 * r1]
962 pextrw [r0 + r1], m0, 4
965 movd m0, [r2 + 2 * r3]
966 lea r2, [r2 + 2 * r3]
970 mov [r0 + 2 * r1], r4w
971 lea r0, [r0 + 2 * r1]
972 pextrw [r0 + r1], m0, 4
976 ;-----------------------------------------------------------------------------
977 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
978 ;-----------------------------------------------------------------------------
979 %macro BLOCKCOPY_SP_W2_H2 2
981 cglobal blockcopy_sp_%1x%2, 4, 7, 2, dst, dstStride, src, srcStride
988 lea r2, [r2 + r3 * 2]
995 lea r0, [r0 + r1 * 2]
1000 BLOCKCOPY_SP_W2_H2 2, 4
1001 BLOCKCOPY_SP_W2_H2 2, 8
1003 BLOCKCOPY_SP_W2_H2 2, 16
1005 ;-----------------------------------------------------------------------------
1006 ; void blockcopy_sp_4x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1007 ;-----------------------------------------------------------------------------
1009 cglobal blockcopy_sp_4x2, 4, 4, 2, dst, dstStride, src, srcStride
1024 ;-----------------------------------------------------------------------------
1025 ; void blockcopy_sp_4x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1026 ;-----------------------------------------------------------------------------
1028 cglobal blockcopy_sp_4x4, 4, 4, 4, dst, dstStride, src, srcStride
1034 movh m2, [r2 + 2 * r3]
1035 lea r2, [r2 + 2 * r3]
1044 movd [r0 + 2 * r1], m2
1045 lea r0, [r0 + 2 * r1]
1051 ;-----------------------------------------------------------------------------
1052 ; void blockcopy_sp_4x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1053 ;-----------------------------------------------------------------------------
1055 cglobal blockcopy_sp_4x8, 4, 4, 8, dst, dstStride, src, srcStride
1061 movh m2, [r2 + 2 * r3]
1062 lea r2, [r2 + 2 * r3]
1064 movh m4, [r2 + 2 * r3]
1065 lea r2, [r2 + 2 * r3]
1067 movh m6, [r2 + 2 * r3]
1068 lea r2, [r2 + 2 * r3]
1079 movd [r0 + 2 * r1], m2
1080 lea r0, [r0 + 2 * r1]
1083 movd [r0 + 2 * r1], m4
1084 lea r0, [r0 + 2 * r1]
1087 movd [r0 + 2 * r1], m6
1088 lea r0, [r0 + 2 * r1]
1094 ;-----------------------------------------------------------------------------
1095 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1096 ;-----------------------------------------------------------------------------
1097 %macro BLOCKCOPY_SP_W4_H8 2
1099 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1108 movh m2, [r2 + 2 * r3]
1109 lea r2, [r2 + 2 * r3]
1111 movh m4, [r2 + 2 * r3]
1112 lea r2, [r2 + 2 * r3]
1114 movh m6, [r2 + 2 * r3]
1115 lea r2, [r2 + 2 * r3]
1126 movd [r0 + 2 * r1], m2
1127 lea r0, [r0 + 2 * r1]
1130 movd [r0 + 2 * r1], m4
1131 lea r0, [r0 + 2 * r1]
1134 movd [r0 + 2 * r1], m6
1135 lea r0, [r0 + 2 * r1]
1139 lea r0, [r0 + 2 * r1]
1140 lea r2, [r2 + 2 * r3]
1148 BLOCKCOPY_SP_W4_H8 4, 16
1150 BLOCKCOPY_SP_W4_H8 4, 32
1152 ;-----------------------------------------------------------------------------
1153 ; void blockcopy_sp_6x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1154 ;-----------------------------------------------------------------------------
1156 cglobal blockcopy_sp_6x8, 4, 4, 2
1165 pextrw [r0 + 4], m0, 2
1169 pextrw [r0 + r1 + 4], m0, 2
1171 lea r0, [r0 + 2 * r1]
1172 lea r2, [r2 + 2 * r3]
1179 pextrw [r0 + 4], m0, 2
1183 pextrw [r0 + r1 + 4], m0, 2
1185 lea r0, [r0 + 2 * r1]
1186 lea r2, [r2 + 2 * r3]
1193 pextrw [r0 + 4], m0, 2
1197 pextrw [r0 + r1 + 4], m0, 2
1199 lea r0, [r0 + 2 * r1]
1200 lea r2, [r2 + 2 * r3]
1207 pextrw [r0 + 4], m0, 2
1211 pextrw [r0 + r1 + 4], m0, 2
1215 ;-----------------------------------------------------------------------------
1216 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1217 ;-----------------------------------------------------------------------------
1218 %macro BLOCKCOPY_SP_W6_H2 2
1220 cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
1227 movd m3, [r2 + r3 + 8]
1229 lea r2, [r2 + r3 * 2]
1239 mov [r0 + r1 + 4], r5w
1240 lea r0, [r0 + r1 * 2]
1245 BLOCKCOPY_SP_W6_H2 6, 8
1247 BLOCKCOPY_SP_W6_H2 6, 16
1249 ;-----------------------------------------------------------------------------
1250 ; void blockcopy_sp_8x2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1251 ;-----------------------------------------------------------------------------
1253 cglobal blockcopy_sp_8x2, 4, 4, 2, dst, dstStride, src, srcStride
1263 movhps [r0 + r1], m0
1267 ;-----------------------------------------------------------------------------
1268 ; void blockcopy_sp_8x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1269 ;-----------------------------------------------------------------------------
1271 cglobal blockcopy_sp_8x4, 4, 4, 4, dst, dstStride, src, srcStride
1277 movu m2, [r2 + 2 * r3]
1278 lea r2, [r2 + 2 * r3]
1285 movhps [r0 + r1], m0
1286 movlps [r0 + 2 * r1], m2
1287 lea r0, [r0 + 2 * r1]
1288 movhps [r0 + r1], m2
1292 ;-----------------------------------------------------------------------------
1293 ; void blockcopy_sp_8x6(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1294 ;-----------------------------------------------------------------------------
1296 cglobal blockcopy_sp_8x6, 4, 4, 6, dst, dstStride, src, srcStride
1302 movu m2, [r2 + 2 * r3]
1303 lea r2, [r2 + 2 * r3]
1305 movu m4, [r2 + 2 * r3]
1306 lea r2, [r2 + 2 * r3]
1314 movhps [r0 + r1], m0
1315 movlps [r0 + 2 * r1], m2
1316 lea r0, [r0 + 2 * r1]
1317 movhps [r0 + r1], m2
1318 movlps [r0 + 2 * r1], m4
1319 lea r0, [r0 + 2 * r1]
1320 movhps [r0 + r1], m4
1324 ;-----------------------------------------------------------------------------
1325 ; void blockcopy_sp_8x8(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1326 ;-----------------------------------------------------------------------------
; Copies an 8x8 block from an int16_t source to a pixel destination; the C
; prototype is given in the header above.
; The cglobal line names the arguments, so r0 = dst, r1 = dstStride,
; r2 = src, r3 = srcStride.
; At store time every even-numbered vector register (m0, m2, m4, m6) supplies
; two destination rows: movlps writes its low 8 bytes, movhps its high 8 bytes.
; NOTE(review): the step that narrows two source rows into one register is
; not among the lines below (presumably a pack instruction) -- confirm.
1328 cglobal blockcopy_sp_8x8, 4, 4, 8, dst, dstStride, src, srcStride
1334 movu m2, [r2 + 2 * r3]     ; unaligned load, two strides past src pointer
1335 lea r2, [r2 + 2 * r3]      ; src pointer += 2 strides
1337 movu m4, [r2 + 2 * r3]     ; unaligned load, two strides past src pointer
1338 lea r2, [r2 + 2 * r3]      ; src pointer += 2 strides
1340 movu m6, [r2 + 2 * r3]     ; unaligned load, two strides past src pointer
1341 lea r2, [r2 + 2 * r3]      ; src pointer += 2 strides
1350 movhps [r0 + r1], m0       ; high 8 bytes of m0 -> one stride past dst pointer
1351 movlps [r0 + 2 * r1], m2   ; low 8 bytes of m2 -> two strides past dst pointer
1352 lea r0, [r0 + 2 * r1]      ; dst pointer += 2 strides
1353 movhps [r0 + r1], m2       ; high 8 bytes of m2 -> the following row
1354 movlps [r0 + 2 * r1], m4   ; low 8 bytes of m4
1355 lea r0, [r0 + 2 * r1]      ; dst pointer += 2 strides
1356 movhps [r0 + r1], m4       ; high 8 bytes of m4
1357 movlps [r0 + 2 * r1], m6   ; low 8 bytes of m6
1358 lea r0, [r0 + 2 * r1]      ; dst pointer += 2 strides
1359 movhps [r0 + r1], m6       ; high 8 bytes of m6
1363 ;-----------------------------------------------------------------------------
1364 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1365 ;-----------------------------------------------------------------------------
1366 %macro BLOCKCOPY_SP_W8_H4 2
1368 cglobal blockcopy_sp_%1x%2, 4, 5, 4, dst, dstStride, src, srcStride
1374 lea r2, [r2 + r3 * 2]
1378 lea r2, [r2 + r3 * 2]
1382 movhps [r0 + r1], m0
1383 lea r0, [r0 + r1 * 2]
1385 movhps [r0 + r1], m2
1386 lea r0, [r0 + r1 * 2]
1391 BLOCKCOPY_SP_W8_H4 8, 12
1393 ;-----------------------------------------------------------------------------
1394 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1395 ;-----------------------------------------------------------------------------
1396 %macro BLOCKCOPY_SP_W8_H8 2
1398 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1407 movu m2, [r2 + 2 * r3]
1408 lea r2, [r2 + 2 * r3]
1410 movu m4, [r2 + 2 * r3]
1411 lea r2, [r2 + 2 * r3]
1413 movu m6, [r2 + 2 * r3]
1414 lea r2, [r2 + 2 * r3]
1423 movhps [r0 + r1], m0
1424 movlps [r0 + 2 * r1], m2
1425 lea r0, [r0 + 2 * r1]
1426 movhps [r0 + r1], m2
1427 movlps [r0 + 2 * r1], m4
1428 lea r0, [r0 + 2 * r1]
1429 movhps [r0 + r1], m4
1430 movlps [r0 + 2 * r1], m6
1431 lea r0, [r0 + 2 * r1]
1432 movhps [r0 + r1], m6
1434 lea r0, [r0 + 2 * r1]
1435 lea r2, [r2 + 2 * r3]
1443 BLOCKCOPY_SP_W8_H8 8, 16
1444 BLOCKCOPY_SP_W8_H8 8, 32
1446 BLOCKCOPY_SP_W8_H8 8, 64
1448 ;-----------------------------------------------------------------------------
1449 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1450 ;-----------------------------------------------------------------------------
1451 %macro BLOCKCOPY_SP_W12_H4 2
1453 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1463 movu m3, [r2 + r3 + 16]
1464 movu m4, [r2 + 2 * r3]
1465 movu m5, [r2 + 2 * r3 + 16]
1466 lea r2, [r2 + 2 * r3]
1468 movu m7, [r2 + r3 + 16]
1481 movd [r0 + r1 + 8], m2
1483 movh [r0 + 2 * r1], m4
1485 movd [r0 + 2 * r1 + 8], m4
1487 lea r0, [r0 + 2 * r1]
1490 movd [r0 + r1 + 8], m6
1492 lea r0, [r0 + 2 * r1]
1493 lea r2, [r2 + 2 * r3]
1501 BLOCKCOPY_SP_W12_H4 12, 16
1503 BLOCKCOPY_SP_W12_H4 12, 32
1505 ;-----------------------------------------------------------------------------
1506 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1507 ;-----------------------------------------------------------------------------
1508 %macro BLOCKCOPY_SP_W16_H4 2
1510 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1520 movu m3, [r2 + r3 + 16]
1521 movu m4, [r2 + 2 * r3]
1522 movu m5, [r2 + 2 * r3 + 16]
1523 lea r2, [r2 + 2 * r3]
1525 movu m7, [r2 + r3 + 16]
1534 movu [r0 + 2 * r1], m4
1535 lea r0, [r0 + 2 * r1]
1538 lea r0, [r0 + 2 * r1]
1539 lea r2, [r2 + 2 * r3]
1547 BLOCKCOPY_SP_W16_H4 16, 4
1548 BLOCKCOPY_SP_W16_H4 16, 8
1549 BLOCKCOPY_SP_W16_H4 16, 12
1550 BLOCKCOPY_SP_W16_H4 16, 16
1551 BLOCKCOPY_SP_W16_H4 16, 32
1552 BLOCKCOPY_SP_W16_H4 16, 64
1554 BLOCKCOPY_SP_W16_H4 16, 24
1556 ;-----------------------------------------------------------------------------
1557 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1558 ;-----------------------------------------------------------------------------
1559 %macro BLOCKCOPY_SP_W24_H2 2
1561 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
1572 movu m4, [r2 + r3 + 16]
1573 movu m5, [r2 + r3 + 32]
1580 movlps [r0 + 16], m2
1581 movhps [r0 + r1], m2
1582 movu [r0 + r1 + 8], m4
1584 lea r0, [r0 + 2 * r1]
1585 lea r2, [r2 + 2 * r3]
1593 BLOCKCOPY_SP_W24_H2 24, 32
1595 BLOCKCOPY_SP_W24_H2 24, 64
1597 ;-----------------------------------------------------------------------------
1598 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1599 ;-----------------------------------------------------------------------------
1600 %macro BLOCKCOPY_SP_W32_H2 2
1602 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1614 movu m5, [r2 + r3 + 16]
1615 movu m6, [r2 + r3 + 32]
1616 movu m7, [r2 + r3 + 48]
1626 movu [r0 + r1 + 16], m6
1628 lea r0, [r0 + 2 * r1]
1629 lea r2, [r2 + 2 * r3]
1637 BLOCKCOPY_SP_W32_H2 32, 8
1638 BLOCKCOPY_SP_W32_H2 32, 16
1639 BLOCKCOPY_SP_W32_H2 32, 24
1640 BLOCKCOPY_SP_W32_H2 32, 32
1641 BLOCKCOPY_SP_W32_H2 32, 64
1643 BLOCKCOPY_SP_W32_H2 32, 48
1645 ;-----------------------------------------------------------------------------
1646 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1647 ;-----------------------------------------------------------------------------
1648 %macro BLOCKCOPY_SP_W48_H2 2
1650 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
1681 BLOCKCOPY_SP_W48_H2 48, 64
1683 ;-----------------------------------------------------------------------------
1684 ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
1685 ;-----------------------------------------------------------------------------
1686 %macro BLOCKCOPY_SP_W64_H1 2
1688 cglobal blockcopy_sp_%1x%2, 4, 5, 8, dst, dstStride, src, srcStride
1723 BLOCKCOPY_SP_W64_H1 64, 16
1724 BLOCKCOPY_SP_W64_H1 64, 32
1725 BLOCKCOPY_SP_W64_H1 64, 48
1726 BLOCKCOPY_SP_W64_H1 64, 64
1728 ;-----------------------------------------------------------------------------
1729 ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val)
1730 ;-----------------------------------------------------------------------------
1732 cglobal blockfill_s_4x4, 3, 3, 1, dst, dstStride, val
1741 movh [r0 + 2 * r1], m0
1742 lea r0, [r0 + 2 * r1]
1747 ;-----------------------------------------------------------------------------
1748 ; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val)
1749 ;-----------------------------------------------------------------------------
; Fills an 8x8 block of int16_t; the C prototype is given in the header above.
; The cglobal line names the arguments, so r0 = dst, r1 = dstStride, r2 = val.
; NOTE(review): m0 is expected to already hold the fill pattern when the
; stores below run; the instructions that build it are not among these
; lines -- confirm.
1751 cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val
1761 movu [r0 + 2 * r1], m0     ; unaligned store of m0, two strides past dst pointer
1763 lea r0, [r0 + 2 * r1]      ; dst pointer += 2 strides
1765 movu [r0 + 2 * r1], m0     ; same store for the next pair of rows
1767 lea r0, [r0 + 2 * r1]      ; dst pointer += 2 strides
1769 movu [r0 + 2 * r1], m0     ; same store for the next pair of rows
1771 lea r0, [r0 + 2 * r1]      ; dst pointer += 2 strides
1776 ;-----------------------------------------------------------------------------
1777 ; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
1778 ;-----------------------------------------------------------------------------
; BLOCKFILL_S_W16_H8 width, height
;   Emits blockfill_s_<width>x<height>; %1 and %2 are used to form the name.
;   The cglobal line names the arguments, so r0 = dst, r1 = dstStride,
;   r2 = val. r4 is a scratch pointer used to reach stride multiples (3, 5,
;   6, 7) that cannot be expressed directly as a scale factor on r1.
;   Stores come in pairs at byte offsets 0 and 16 of a row.
;   NOTE(review): m0 is expected to already hold the fill pattern; the code
;   that builds it is not among the lines below -- confirm.
1779 %macro BLOCKFILL_S_W16_H8 2
1781 cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
1796 movu [r0 + r1 + 16], m0        ; r0 + 1 * r1, second 16 bytes
1798 movu [r0 + 2 * r1], m0         ; r0 + 2 * r1, first 16 bytes
1799 movu [r0 + 2 * r1 + 16], m0    ; r0 + 2 * r1, second 16 bytes
1801 lea r4, [r0 + 2 * r1]          ; r4 = r0 + 2 * r1
1803 movu [r4 + r1 + 16], m0        ; r0 + 3 * r1, second 16 bytes
1805 movu [r0 + 4 * r1], m0         ; r0 + 4 * r1, first 16 bytes
1806 movu [r0 + 4 * r1 + 16], m0    ; r0 + 4 * r1, second 16 bytes
1808 lea r4, [r0 + 4 * r1]          ; r4 = r0 + 4 * r1
1810 movu [r4 + r1 + 16], m0        ; r0 + 5 * r1, second 16 bytes
1812 movu [r4 + 2 * r1], m0         ; r0 + 6 * r1, first 16 bytes
1813 movu [r4 + 2 * r1 + 16], m0    ; r0 + 6 * r1, second 16 bytes
1815 lea r4, [r4 + 2 * r1]          ; r4 = r0 + 6 * r1
1817 movu [r4 + r1 + 16], m0        ; r0 + 7 * r1, second 16 bytes
1819 lea r0, [r0 + 8 * r1]          ; dst pointer += 8 strides
1827 BLOCKFILL_S_W16_H8 16, 16
;-----------------------------------------------------------------------------
; void blockfill_s_16x16(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
; Stand-alone 16x16 fill built around vpbroadcastw. The destination pointer
; advances four strides at a time.
; NOTE(review): register roles assume the x86inc cglobal convention
; (r0 = dst, r1 = dstride, r2 = val); how val reaches xm0 is not among the
; lines below -- confirm.
1830 cglobal blockfill_s_16x16, 3, 4, 1
1834 vpbroadcastw m0, xm0       ; replicate the low 16-bit word of xm0 into every word of m0
1838 movu [r0 + 2 * r1], m0     ; unaligned store, two strides past dst pointer
1840 lea r0, [r0 + 4 * r1]      ; dst pointer += 4 strides
1843 movu [r0 + 2 * r1], m0     ; same position within the next four-stride group
1845 lea r0, [r0 + 4 * r1]      ; dst pointer += 4 strides
1848 movu [r0 + 2 * r1], m0     ; same position within the next four-stride group
1850 lea r0, [r0 + 4 * r1]      ; dst pointer += 4 strides
1853 movu [r0 + 2 * r1], m0     ; same position within the last four-stride group
1857 ;-----------------------------------------------------------------------------
1858 ; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val)
1859 ;-----------------------------------------------------------------------------
1860 %macro BLOCKFILL_S_W32_H4 2
1862 cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val
1879 movu [r0 + r1 + 16], m0
1880 movu [r0 + r1 + 32], m0
1881 movu [r0 + r1 + 48], m0
1883 movu [r0 + 2 * r1], m0
1884 movu [r0 + 2 * r1 + 16], m0
1885 movu [r0 + 2 * r1 + 32], m0
1886 movu [r0 + 2 * r1 + 48], m0
1888 lea r4, [r0 + 2 * r1]
1891 movu [r4 + r1 + 16], m0
1892 movu [r4 + r1 + 32], m0
1893 movu [r4 + r1 + 48], m0
1895 lea r0, [r0 + 4 * r1]
1903 BLOCKFILL_S_W32_H4 32, 32
;-----------------------------------------------------------------------------
; void blockfill_s_32x32(int16_t* dst, intptr_t dstride, int16_t val)
;-----------------------------------------------------------------------------
; Stand-alone 32x32 fill built around vpbroadcastw. The body is the same
; four-stride group repeated eight times, with the destination pointer
; advancing 4 * r1 between groups. Stores within a row come at byte offsets
; 0 and 32.
; NOTE(review): register roles assume the x86inc cglobal convention
; (r0 = dst, r1 = dstride, r2 = val). r3 is a scratch register whose setup is
; not among the lines below; from its use alongside r1 and 2 * r1 it
; presumably holds 3 * r1 -- confirm.
1906 cglobal blockfill_s_32x32, 3, 4, 1
1910 vpbroadcastw m0, xm0           ; replicate the low 16-bit word of xm0 into every word of m0
; group 1
1915 movu [r0 + r1 + 32], m0        ; r0 + r1, bytes 32 and up
1916 movu [r0 + 2 * r1], m0         ; r0 + 2 * r1, bytes 0 and up
1917 movu [r0 + 2 * r1 + 32], m0    ; r0 + 2 * r1, bytes 32 and up
1919 movu [r0 + r3 + 32], m0        ; r0 + r3, bytes 32 and up
1920 lea r0, [r0 + 4 * r1]          ; dst pointer += 4 strides
; group 2
1924 movu [r0 + r1 + 32], m0
1925 movu [r0 + 2 * r1], m0
1926 movu [r0 + 2 * r1 + 32], m0
1928 movu [r0 + r3 + 32], m0
1929 lea r0, [r0 + 4 * r1]          ; dst pointer += 4 strides
; group 3
1933 movu [r0 + r1 + 32], m0
1934 movu [r0 + 2 * r1], m0
1935 movu [r0 + 2 * r1 + 32], m0
1937 movu [r0 + r3 + 32], m0
1938 lea r0, [r0 + 4 * r1]          ; dst pointer += 4 strides
; group 4
1942 movu [r0 + r1 + 32], m0
1943 movu [r0 + 2 * r1], m0
1944 movu [r0 + 2 * r1 + 32], m0
1946 movu [r0 + r3 + 32], m0
1947 lea r0, [r0 + 4 * r1]          ; dst pointer += 4 strides
; group 5
1951 movu [r0 + r1 + 32], m0
1952 movu [r0 + 2 * r1], m0
1953 movu [r0 + 2 * r1 + 32], m0
1955 movu [r0 + r3 + 32], m0
1956 lea r0, [r0 + 4 * r1]          ; dst pointer += 4 strides
; group 6
1960 movu [r0 + r1 + 32], m0
1961 movu [r0 + 2 * r1], m0
1962 movu [r0 + 2 * r1 + 32], m0
1964 movu [r0 + r3 + 32], m0
1965 lea r0, [r0 + 4 * r1]          ; dst pointer += 4 strides
; group 7
1969 movu [r0 + r1 + 32], m0
1970 movu [r0 + 2 * r1], m0
1971 movu [r0 + 2 * r1 + 32], m0
1973 movu [r0 + r3 + 32], m0
1974 lea r0, [r0 + 4 * r1]          ; dst pointer += 4 strides
; group 8 (last; no pointer advance follows)
1978 movu [r0 + r1 + 32], m0
1979 movu [r0 + 2 * r1], m0
1980 movu [r0 + 2 * r1 + 32], m0
1982 movu [r0 + r3 + 32], m0
1985 ;-----------------------------------------------------------------------------
1986 ; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
1987 ;-----------------------------------------------------------------------------
1989 cglobal blockcopy_ps_2x4, 4, 4, 1, dst, dstStride, src, srcStride
2001 movd m0, [r2 + 2 * r3]
2003 movd [r0 + 2 * r1], m0
2005 lea r2, [r2 + 2 * r3]
2006 lea r0, [r0 + 2 * r1]
2015 ;-----------------------------------------------------------------------------
2016 ; void blockcopy_ps_2x8(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2017 ;-----------------------------------------------------------------------------
; 2x8 pixel -> int16_t block copy (see prototype above).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride (cglobal argument order).
; The same load / store / advance group is unrolled; each group steps both
; pointers by twice their stride register.
2019 cglobal blockcopy_ps_2x8, 4, 4, 1, dst, dstStride, src, srcStride
2031 movd m0, [r2 + 2 * r3]  ; 32-bit load from src + 2 * r3
2033 movd [r0 + 2 * r1], m0  ; 32-bit store to dst + 2 * r1
2035 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2036 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
; next unrolled group
2042 movd m0, [r2 + 2 * r3]
2044 movd [r0 + 2 * r1], m0
2046 lea r2, [r2 + 2 * r3]
2047 lea r0, [r0 + 2 * r1]
; next unrolled group
2053 movd m0, [r2 + 2 * r3]
2055 movd [r0 + 2 * r1], m0
2057 lea r2, [r2 + 2 * r3]
2058 lea r0, [r0 + 2 * r1]
2067 ;-----------------------------------------------------------------------------
2068 ; void blockcopy_ps_2x16(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2069 ;-----------------------------------------------------------------------------
; 2x16 pixel -> int16_t block copy (see prototype above).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; one extra GPR (r4) and
;   two vector registers are requested on the cglobal line.
2071 cglobal blockcopy_ps_2x16, 4, 5, 2, dst, dstStride, src, srcStride
2078 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
2083 lea r0, [r0 + r1 * 2]  ; dst += 2 * r1
2088 ;-----------------------------------------------------------------------------
2089 ; void blockcopy_ps_4x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2090 ;-----------------------------------------------------------------------------
; 4x2 pixel -> int16_t block copy (see prototype above).
;   cglobal: 4 arguments (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride),
;   4 general-purpose registers, 1 vector register.
2092 cglobal blockcopy_ps_4x2, 4, 4, 1, dst, dstStride, src, srcStride
2107 ;-----------------------------------------------------------------------------
2108 ; void blockcopy_ps_4x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2109 ;-----------------------------------------------------------------------------
; 4x4 pixel -> int16_t block copy (see prototype above).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride.
; A 32-bit load (movd) is paired with a half-register store (movh), i.e. the
; stored data is twice as wide as the loaded data.
2111 cglobal blockcopy_ps_4x4, 4, 4, 1, dst, dstStride, src, srcStride
2123 movd m0, [r2 + 2 * r3]  ; 32-bit load from src + 2 * r3
2125 movh [r0 + 2 * r1], m0  ; half-register store to dst + 2 * r1
2127 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2128 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2137 ;-----------------------------------------------------------------------------
2138 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2139 ;-----------------------------------------------------------------------------
; Emits blockcopy_ps_%1x%2. %1 (width) and %2 (height) are spliced into the
; symbol name; every instantiation below uses width 4.
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; r4 is an extra GPR.
2140 %macro BLOCKCOPY_PS_W4_H4 2
2142 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
2156 movd m0, [r2 + 2 * r3]  ; 32-bit load from src + 2 * r3
2158 movh [r0 + 2 * r1], m0  ; half-register store to dst + 2 * r1
2160 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2161 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2167 lea r0, [r0 + 2 * r1]  ; second advance: dst += 2 * r1
2168 lea r2, [r2 + 2 * r3]  ; second advance: src += 2 * r3
; Instantiations: 4x8, 4x16, 4x32.
2176 BLOCKCOPY_PS_W4_H4 4, 8
2177 BLOCKCOPY_PS_W4_H4 4, 16
2179 BLOCKCOPY_PS_W4_H4 4, 32
2182 ;-----------------------------------------------------------------------------
2183 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2184 ;-----------------------------------------------------------------------------
; Emits blockcopy_ps_%1x%2 for width-6 blocks (%1 = width, %2 = height).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; r4 is an extra GPR.
; A 6-element int16_t row is 12 bytes: bytes 0-7 are written with movh and
; bytes 8-11 with pextrd (dword index 2 of the register).
2185 %macro BLOCKCOPY_PS_W6_H4 2
2187 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
2196 pextrd [r0 + 8], m0, 2  ; dst bytes 8-11 <- dword 2 of m0
2201 pextrd [r0 + r1 + 8], m0, 2  ; same for the row at dst + r1
2203 movh m0, [r2 + 2 * r3]  ; half-register load from src + 2 * r3
2205 movh [r0 + 2 * r1], m0  ; dst + 2 * r1, bytes 0-7
2206 pextrd [r0 + 2 * r1 + 8], m0, 2  ; dst + 2 * r1, bytes 8-11
2208 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2209 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2214 pextrd [r0 + r1 + 8], m0, 2
2216 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2217 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
; Instantiations: 6x8, 6x16.
2225 BLOCKCOPY_PS_W6_H4 6, 8
2227 BLOCKCOPY_PS_W6_H4 6, 16
2229 ;-----------------------------------------------------------------------------
2230 ; void blockcopy_ps_8x2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2231 ;-----------------------------------------------------------------------------
; 8x2 pixel -> int16_t block copy (see prototype above).
;   cglobal: 4 arguments (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride),
;   4 general-purpose registers, 1 vector register.
2233 cglobal blockcopy_ps_8x2, 4, 4, 1, dst, dstStride, src, srcStride
2247 ;-----------------------------------------------------------------------------
2248 ; void blockcopy_ps_8x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2249 ;-----------------------------------------------------------------------------
; 8x4 pixel -> int16_t block copy (see prototype above).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride.
; A half-register load (movh) is paired with a full-register store (movu),
; i.e. the stored data is twice as wide as the loaded data.
2251 cglobal blockcopy_ps_8x4, 4, 4, 1, dst, dstStride, src, srcStride
2263 movh m0, [r2 + 2 * r3]  ; half-register load from src + 2 * r3
2265 movu [r0 + 2 * r1], m0  ; full-register unaligned store to dst + 2 * r1
2267 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2268 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2276 ;-----------------------------------------------------------------------------
2277 ; void blockcopy_ps_8x6(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2278 ;-----------------------------------------------------------------------------
; 8x6 pixel -> int16_t block copy (see prototype above).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride.
; Unrolled groups: half-register load, full-register store, then both
; pointers advance by twice their stride register.
2280 cglobal blockcopy_ps_8x6, 4, 4, 1, dst, dstStride, src, srcStride
2292 movh m0, [r2 + 2 * r3]  ; half-register load from src + 2 * r3
2294 movu [r0 + 2 * r1], m0  ; full-register store to dst + 2 * r1
2296 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2297 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
; next unrolled group
2303 movh m0, [r2 + 2 * r3]
2305 movu [r0 + 2 * r1], m0
2307 lea r2, [r2 + 2 * r3]
2308 lea r0, [r0 + 2 * r1]
2316 ;-----------------------------------------------------------------------------
2317 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2318 ;-----------------------------------------------------------------------------
; Emits blockcopy_ps_%1x%2 for width-8 blocks (%1 = width, %2 = height).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; r4 is an extra GPR.
2319 %macro BLOCKCOPY_PS_W8_H4 2
2321 cglobal blockcopy_ps_%1x%2, 4, 5, 1, dst, dstStride, src, srcStride
2335 movh m0, [r2 + 2 * r3]  ; half-register load from src + 2 * r3
2337 movu [r0 + 2 * r1], m0  ; full-register store to dst + 2 * r1
2339 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2340 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2346 lea r0, [r0 + 2 * r1]  ; second advance: dst += 2 * r1
2347 lea r2, [r2 + 2 * r3]  ; second advance: src += 2 * r3
; Instantiations: 8x8, 8x16, 8x32, 8x12, 8x64.
2355 BLOCKCOPY_PS_W8_H4 8, 8
2356 BLOCKCOPY_PS_W8_H4 8, 16
2357 BLOCKCOPY_PS_W8_H4 8, 32
2359 BLOCKCOPY_PS_W8_H4 8, 12
2360 BLOCKCOPY_PS_W8_H4 8, 64
2363 ;-----------------------------------------------------------------------------
2364 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2365 ;-----------------------------------------------------------------------------
; Emits blockcopy_ps_%1x%2 for width-12 blocks (%1 = width, %2 = height).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; r4 is an extra GPR,
;   three vector registers are requested.
2366 %macro BLOCKCOPY_PS_W12_H2 2
2368 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2385 movh [r0 + r1 + 16], m1  ; half-register store at byte offset 16 of the row at dst + r1
2387 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2388 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
; Instantiations: 12x16, 12x32.
2396 BLOCKCOPY_PS_W12_H2 12, 16
2398 BLOCKCOPY_PS_W12_H2 12, 32
2400 ;-----------------------------------------------------------------------------
2401 ; void blockcopy_ps_16x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2402 ;-----------------------------------------------------------------------------
; 16x4 pixel -> int16_t block copy (see prototype above).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; three vector registers.
; Each destination row is written as two 16-byte stores (offsets 0 and 16).
2404 cglobal blockcopy_ps_16x4, 4, 4, 3, dst, dstStride, src, srcStride
2419 movu [r0 + r1 + 16], m1  ; second 16 bytes of the row at dst + r1
2421 movu m1, [r2 + 2 * r3]  ; unaligned load from src + 2 * r3
2423 movu [r0 + 2 * r1], m2  ; first 16 bytes of the row at dst + 2 * r1
2425 movu [r0 + 2 * r1 + 16], m1  ; second 16 bytes of that row
2427 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2428 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2434 movu [r0 + r1 + 16], m1
2438 ;-----------------------------------------------------------------------------
2439 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2440 ;-----------------------------------------------------------------------------
; Emits blockcopy_ps_%1x%2 for width-16 blocks (%1 = width, %2 = height).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; r4 is an extra GPR,
;   three vector registers are requested.
; Each destination row is written as two 16-byte stores (offsets 0 and 16).
2441 %macro BLOCKCOPY_PS_W16_H4 2
2443 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2460 movu [r0 + r1 + 16], m1  ; second 16 bytes of the row at dst + r1
2462 movu m1, [r2 + 2 * r3]  ; unaligned load from src + 2 * r3
2464 movu [r0 + 2 * r1], m2  ; first 16 bytes of the row at dst + 2 * r1
2466 movu [r0 + 2 * r1 + 16], m1  ; second 16 bytes of that row
2468 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2469 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2475 movu [r0 + r1 + 16], m1
2477 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2478 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
; Instantiations: 16x8, 16x12, 16x16, 16x32, 16x64, 16x24.
2486 BLOCKCOPY_PS_W16_H4 16, 8
2487 BLOCKCOPY_PS_W16_H4 16, 12
2488 BLOCKCOPY_PS_W16_H4 16, 16
2489 BLOCKCOPY_PS_W16_H4 16, 32
2490 BLOCKCOPY_PS_W16_H4 16, 64
2492 BLOCKCOPY_PS_W16_H4 16, 24
2494 ;-----------------------------------------------------------------------------
2495 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2496 ;-----------------------------------------------------------------------------
; Emits blockcopy_ps_%1x%2 for width-24 blocks (%1 = width, %2 = height).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; r4 is an extra GPR,
;   three vector registers are requested.
; A destination row is written as three 16-byte stores (offsets 0, 16, 32);
; the last one is fed by a half-register load at source byte offset 16.
2497 %macro BLOCKCOPY_PS_W24_H2 2
2499 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2520 movu [r0 + r1 + 16], m1  ; bytes 16-31 of the row at dst + r1
2522 movh m1, [r2 + r3 + 16]  ; half-register load at src + r3, byte offset 16
2524 movu [r0 + r1 + 32], m1  ; bytes 32-47 of the row at dst + r1
2526 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2527 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
; Instantiations: 24x32, 24x64.
2535 BLOCKCOPY_PS_W24_H2 24, 32
2537 BLOCKCOPY_PS_W24_H2 24, 64
2539 ;-----------------------------------------------------------------------------
2540 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2541 ;-----------------------------------------------------------------------------
; Emits blockcopy_ps_%1x%2 for width-32 blocks (%1 = width, %2 = height).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; r4 is an extra GPR,
;   three vector registers are requested.
; Source offsets step by 16 bytes while destination offsets step by 32:
; every 16-byte load is followed by an m2 / m1 pair of 16-byte stores.
2542 %macro BLOCKCOPY_PS_W32_H2 2
2544 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2567 movu [r0 + r1 + 16], m1  ; bytes 16-31 of the row at dst + r1
2569 movu m1, [r2 + r3 + 16]  ; source bytes 16-31 of the row at src + r3
2571 movu [r0 + r1 + 32], m2  ; dst bytes 32-47
2573 movu [r0 + r1 + 48], m1  ; dst bytes 48-63
2575 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2576 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
; Instantiations: 32x8, 32x16, 32x24, 32x32, 32x64, 32x48.
2584 BLOCKCOPY_PS_W32_H2 32, 8
2585 BLOCKCOPY_PS_W32_H2 32, 16
2586 BLOCKCOPY_PS_W32_H2 32, 24
2587 BLOCKCOPY_PS_W32_H2 32, 32
2588 BLOCKCOPY_PS_W32_H2 32, 64
2590 BLOCKCOPY_PS_W32_H2 32, 48
2592 ;-----------------------------------------------------------------------------
2593 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2594 ;-----------------------------------------------------------------------------
; Emits blockcopy_ps_%1x%2 for width-48 blocks (%1 = width, %2 = height).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; r4 is an extra GPR,
;   three vector registers are requested.
; Source offsets step by 16 bytes while destination offsets step by 32:
; every 16-byte load is followed by an m2 / m1 pair of 16-byte stores.
2595 %macro BLOCKCOPY_PS_W48_H2 2
2597 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2626 movu [r0 + r1 + 16], m1  ; bytes 16-31 of the row at dst + r1
2628 movu m1, [r2 + r3 + 16]  ; source bytes 16-31 of the row at src + r3
2630 movu [r0 + r1 + 32], m2
2632 movu [r0 + r1 + 48], m1
2634 movu m1, [r2 + r3 + 32]  ; source bytes 32-47
2636 movu [r0 + r1 + 64], m2
2638 movu [r0 + r1 + 80], m1  ; last 16 bytes of the 96-byte destination row
2640 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2641 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
; Instantiation: 48x64.
2649 BLOCKCOPY_PS_W48_H2 48, 64
2651 ;-----------------------------------------------------------------------------
2652 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
2653 ;-----------------------------------------------------------------------------
; Emits blockcopy_ps_%1x%2 for width-64 blocks (%1 = width, %2 = height).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride; r4 is an extra GPR,
;   three vector registers are requested.
; Source offsets step by 16 bytes while destination offsets step by 32:
; every 16-byte load is followed by an m2 / m1 pair of 16-byte stores.
2654 %macro BLOCKCOPY_PS_W64_H2 2
2656 cglobal blockcopy_ps_%1x%2, 4, 5, 3, dst, dstStride, src, srcStride
2691 movu [r0 + r1 + 16], m1  ; bytes 16-31 of the row at dst + r1
2693 movu m1, [r2 + r3 + 16]  ; source bytes 16-31 of the row at src + r3
2695 movu [r0 + r1 + 32], m2
2697 movu [r0 + r1 + 48], m1
2699 movu m1, [r2 + r3 + 32]  ; source bytes 32-47
2701 movu [r0 + r1 + 64], m2
2703 movu [r0 + r1 + 80], m1
2705 movu m1, [r2 + r3 + 48]  ; source bytes 48-63
2707 movu [r0 + r1 + 96], m2
2709 movu [r0 + r1 + 112], m1  ; last 16 bytes of the 128-byte destination row
2711 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2712 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
; Instantiations: 64x16, 64x32, 64x48, 64x64.
2720 BLOCKCOPY_PS_W64_H2 64, 16
2721 BLOCKCOPY_PS_W64_H2 64, 32
2722 BLOCKCOPY_PS_W64_H2 64, 48
2723 BLOCKCOPY_PS_W64_H2 64, 64
2725 ;-----------------------------------------------------------------------------
2726 ; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2727 ;-----------------------------------------------------------------------------
; 2x4 int16_t -> int16_t block copy (see prototype above).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride.
;   cglobal requests 6 general-purpose registers and no vector registers.
2729 cglobal blockcopy_ss_2x4, 4, 6, 0
2738 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
2739 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2748 ;-----------------------------------------------------------------------------
2749 ; void blockcopy_ss_2x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2750 ;-----------------------------------------------------------------------------
; 2x8 int16_t -> int16_t block copy (see prototype above).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride.
;   cglobal requests 6 general-purpose registers and no vector registers.
; Both pointers are advanced by twice their stride register three times.
2752 cglobal blockcopy_ss_2x8, 4, 6, 0
2761 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
2762 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2769 lea r2, [r2 + r3 * 2]
2770 lea r0, [r0 + 2 * r1]
2777 lea r2, [r2 + r3 * 2]
2778 lea r0, [r0 + 2 * r1]
2787 ;-----------------------------------------------------------------------------
2788 ; void blockcopy_ss_2x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2789 ;-----------------------------------------------------------------------------
; 2x16 int16_t -> int16_t block copy (see prototype above).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride.
;   cglobal requests 7 general-purpose registers and no vector registers.
2791 cglobal blockcopy_ss_2x16, 4, 7, 0
2799 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
2802 lea r0, [r0 + r1 * 2]  ; dst += 2 * r1
2807 ;-----------------------------------------------------------------------------
2808 ; void blockcopy_ss_4x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2809 ;-----------------------------------------------------------------------------
; 4x2 int16_t -> int16_t block copy (see prototype above).
;   cglobal: 4 arguments (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride),
;   4 general-purpose registers, 2 vector registers.
2811 cglobal blockcopy_ss_4x2, 4, 4, 2
2823 ;-----------------------------------------------------------------------------
2824 ; void blockcopy_ss_4x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2825 ;-----------------------------------------------------------------------------
; 4x4 int16_t -> int16_t block copy (see prototype above).
;   cglobal: 4 arguments (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride),
;   4 general-purpose registers, 4 vector registers.
2827 cglobal blockcopy_ss_4x4, 4, 4, 4
2832 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
2838 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2843 ;-----------------------------------------------------------------------------
2844 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2845 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-4 blocks (%1 = width, %2 = height).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR and four vector registers are requested.
; src and dst are each advanced four times by twice their stride register.
2846 %macro BLOCKCOPY_SS_W4_H8 2
2848 cglobal blockcopy_ss_%1x%2, 4, 5, 4
2855 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
2861 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2865 lea r0, [r0 + 2 * r1]
2866 lea r2, [r2 + 2 * r3]
2869 lea r2, [r2 + r3 * 2]
2875 lea r0, [r0 + 2 * r1]
2878 lea r0, [r0 + 2 * r1]
2879 lea r2, [r2 + 2 * r3]
; Instantiations: 4x8, 4x16, 4x32.
2886 BLOCKCOPY_SS_W4_H8 4, 8
2887 BLOCKCOPY_SS_W4_H8 4, 16
2889 BLOCKCOPY_SS_W4_H8 4, 32
2891 ;-----------------------------------------------------------------------------
2892 ; void blockcopy_ss_6x8(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2893 ;-----------------------------------------------------------------------------
; 6x8 int16_t -> int16_t block copy (see prototype above).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride.
; A 6-element int16_t row is 12 bytes; the 32-bit store at byte offset 8
; covers its last two elements.
2895 cglobal blockcopy_ss_6x8, 4, 4, 4
2906 movd [r0 + r1 + 8], m3  ; bytes 8-11 of the row at dst + r1
2908 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
2909 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
2918 movd [r0 + r1 + 8], m3
2920 lea r0, [r0 + 2 * r1]
2921 lea r2, [r2 + 2 * r3]
2930 movd [r0 + r1 + 8], m3
2932 lea r0, [r0 + 2 * r1]
2933 lea r2, [r2 + 2 * r3]
2942 movd [r0 + r1 + 8], m3
2946 ;-----------------------------------------------------------------------------
2947 ; void blockcopy_ss_6x16(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2948 ;-----------------------------------------------------------------------------
; 6x16 int16_t -> int16_t block copy (see prototype above).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR.
; Bytes 8-11 of a row (its last two int16_t) travel through m3 as a 32-bit value.
2950 cglobal blockcopy_ss_6x16, 4, 5, 4
2958 movd m3, [r2 + r3 + 8]  ; bytes 8-11 of the row at src + r3
2960 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
2964 movd [r0 + r1 + 8], m3  ; bytes 8-11 of the row at dst + r1
2965 lea r0, [r0 + r1 * 2]  ; dst += 2 * r1
2970 ;-----------------------------------------------------------------------------
2971 ; void blockcopy_ss_8x2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2972 ;-----------------------------------------------------------------------------
; 8x2 int16_t -> int16_t block copy (see prototype above).
;   cglobal: 4 arguments (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride),
;   4 general-purpose registers, 2 vector registers.
2974 cglobal blockcopy_ss_8x2, 4, 4, 2
2986 ;-----------------------------------------------------------------------------
2987 ; void blockcopy_ss_8x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
2988 ;-----------------------------------------------------------------------------
; 8x4 int16_t -> int16_t block copy (see prototype above).
;   cglobal: 4 arguments (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride),
;   4 general-purpose registers, 4 vector registers.
2990 cglobal blockcopy_ss_8x4, 4, 4, 4
2996 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
3002 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
3007 ;-----------------------------------------------------------------------------
3008 ; void blockcopy_ss_8x6(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3009 ;-----------------------------------------------------------------------------
; 8x6 int16_t -> int16_t block copy (see prototype above).
;   cglobal: 4 arguments (r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride),
;   4 general-purpose registers, 4 vector registers.
3011 cglobal blockcopy_ss_8x6, 4, 4, 4
3017 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
3023 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
3027 lea r2, [r2 + r3 * 2]  ; src += 2 * r3 again
3028 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1 again
3036 ;-----------------------------------------------------------------------------
3037 ; void blockcopy_ss_8x12(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3038 ;-----------------------------------------------------------------------------
; 8x12 int16_t -> int16_t block copy (see prototype above).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR and two vector registers are requested.
3040 cglobal blockcopy_ss_8x12, 4, 5, 2
3047 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
3051 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
3056 ;-----------------------------------------------------------------------------
3057 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3058 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-8 blocks (%1 = width, %2 = height).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR and four vector registers are requested.
; src and dst are each advanced four times by twice their stride register.
3059 %macro BLOCKCOPY_SS_W8_H8 2
3061 cglobal blockcopy_ss_%1x%2, 4, 5, 4
3068 lea r2, [r2 + r3 * 2]  ; src += 2 * r3
3074 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
3079 lea r2, [r2 + 2 * r3]
3080 lea r0, [r0 + 2 * r1]
3084 lea r2, [r2 + r3 * 2]
3090 lea r0, [r0 + 2 * r1]
3095 lea r0, [r0 + 2 * r1]
3096 lea r2, [r2 + 2 * r3]
; Instantiations: 8x8, 8x16, 8x32, 8x64.
3101 BLOCKCOPY_SS_W8_H8 8, 8
3102 BLOCKCOPY_SS_W8_H8 8, 16
3103 BLOCKCOPY_SS_W8_H8 8, 32
3105 BLOCKCOPY_SS_W8_H8 8, 64
3107 ;-----------------------------------------------------------------------------
3108 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3109 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-12 blocks (%1 = width, %2 = height).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR and four vector registers are requested.
; The half-register moves at byte offset 16 handle the tail of each row
; (bytes 16 onward of the 24-byte, 12-element int16_t row).
3110 %macro BLOCKCOPY_SS_W12_H4 2
3112 cglobal blockcopy_ss_%1x%2, 4, 5, 4
3121 movh m3, [r2 + r3 + 16]  ; row tail at src + r3
3122 lea r2, [r2 + 2 * r3]  ; src += 2 * r3 (done before the stores)
3127 movh [r0 + r1 + 16], m3  ; row tail at dst + r1
3129 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
3133 movh m3, [r2 + r3 + 16]
3138 movh [r0 + r1 + 16], m3
3141 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
3142 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
; Instantiations: 12x16, 12x32.
3147 BLOCKCOPY_SS_W12_H4 12, 16
3149 BLOCKCOPY_SS_W12_H4 12, 32
3151 ;-----------------------------------------------------------------------------
3152 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3153 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-16 blocks (%1 = width, %2 = height).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR and four vector registers are requested.
; m3 carries bytes 16-31 (the second half of a 32-byte row) of the row at
; src + r3 to the row at dst + r1.
3154 %macro BLOCKCOPY_SS_W16_H4 2
3156 cglobal blockcopy_ss_%1x%2, 4, 5, 4
3164 movu m3, [r2 + r3 + 16]  ; src + r3, bytes 16-31
3169 movu [r0 + r1 + 16], m3  ; dst + r1, bytes 16-31
3171 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
3172 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
3177 movu m3, [r2 + r3 + 16]
3182 movu [r0 + r1 + 16], m3
3185 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
3186 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
; Instantiations: 16x4, 16x12.
3191 BLOCKCOPY_SS_W16_H4 16, 4
3192 BLOCKCOPY_SS_W16_H4 16, 12
3194 ;-----------------------------------------------------------------------------
3195 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3196 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-16 blocks (%1 = width, %2 = height),
; stepping four stride units per pointer advance.
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   seven GPRs and four vector registers are requested.
; NOTE(review): a whole row is moved by a single movu here, which suggests
; 32-byte registers - confirm against the INIT_* directive in effect.
3197 %macro BLOCKCOPY_SS_W16_H4_avx 2
3199 cglobal blockcopy_ss_%1x%2, 4, 7, 4
3208 movu m2, [r2 + 2 * r3]  ; load the row at src + 2 * r3
3213 movu [r0 + 2 * r1], m2  ; store it to the row at dst + 2 * r1
3216 lea r0, [r0 + 4 * r1]  ; dst += 4 * r1
3217 lea r2, [r2 + 4 * r3]  ; src += 4 * r3
; Instantiations: 16x4, 16x12, 16x8, 16x16, 16x24, 16x32, 16x64.
3223 BLOCKCOPY_SS_W16_H4_avx 16, 4
3224 BLOCKCOPY_SS_W16_H4_avx 16, 12
3225 BLOCKCOPY_SS_W16_H4_avx 16, 8
3226 BLOCKCOPY_SS_W16_H4_avx 16, 16
3227 BLOCKCOPY_SS_W16_H4_avx 16, 24
3228 BLOCKCOPY_SS_W16_H4_avx 16, 32
3229 BLOCKCOPY_SS_W16_H4_avx 16, 64
3231 ;-----------------------------------------------------------------------------
3232 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3233 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-16 blocks (%1 = width, %2 = height).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR and four vector registers are requested.
; Four unrolled groups; in each, m3 carries bytes 16-31 of the row at src + r3
; to the row at dst + r1, then both pointers advance by twice their stride register.
3234 %macro BLOCKCOPY_SS_W16_H8 2
3236 cglobal blockcopy_ss_%1x%2, 4, 5, 4
; group 1
3244 movu m3, [r2 + r3 + 16]  ; src + r3, bytes 16-31
3249 movu [r0 + r1 + 16], m3  ; dst + r1, bytes 16-31
3251 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
3252 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
; group 2
3257 movu m3, [r2 + r3 + 16]
3262 movu [r0 + r1 + 16], m3
3264 lea r2, [r2 + 2 * r3]
3265 lea r0, [r0 + 2 * r1]
; group 3
3270 movu m3, [r2 + r3 + 16]
3275 movu [r0 + r1 + 16], m3
3277 lea r2, [r2 + 2 * r3]
3278 lea r0, [r0 + 2 * r1]
; group 4
3283 movu m3, [r2 + r3 + 16]
3288 movu [r0 + r1 + 16], m3
3291 lea r2, [r2 + 2 * r3]
3292 lea r0, [r0 + 2 * r1]
; Instantiations: 16x8, 16x16, 16x32, 16x64, 16x24.
3297 BLOCKCOPY_SS_W16_H8 16, 8
3298 BLOCKCOPY_SS_W16_H8 16, 16
3299 BLOCKCOPY_SS_W16_H8 16, 32
3300 BLOCKCOPY_SS_W16_H8 16, 64
3302 BLOCKCOPY_SS_W16_H8 16, 24
3304 ;-----------------------------------------------------------------------------
3305 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3306 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-24 blocks (%1 = width, %2 = height).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR and six vector registers are requested.
; A 24-element int16_t row is 48 bytes = three 16-byte chunks; m4 and m5 carry
; the chunks at byte offsets 16 and 32 of the row at src + r3.
3307 %macro BLOCKCOPY_SS_W24_H4 2
3309 cglobal blockcopy_ss_%1x%2, 4, 5, 6
3318 movu m4, [r2 + r3 + 16]  ; src + r3, bytes 16-31
3319 movu m5, [r2 + r3 + 32]  ; src + r3, bytes 32-47
3325 movu [r0 + r1 + 16], m4  ; dst + r1, bytes 16-31
3326 movu [r0 + r1 + 32], m5  ; dst + r1, bytes 32-47
3328 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
3329 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
; second unrolled group
3335 movu m4, [r2 + r3 + 16]
3336 movu m5, [r2 + r3 + 32]
3342 movu [r0 + r1 + 16], m4
3343 movu [r0 + r1 + 32], m5
3346 lea r2, [r2 + 2 * r3]
3347 lea r0, [r0 + 2 * r1]
; Instantiations: 24x32, 24x64.
3352 BLOCKCOPY_SS_W24_H4 24, 32
3354 BLOCKCOPY_SS_W24_H4 24, 64
3356 ;-----------------------------------------------------------------------------
3357 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3358 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-32 blocks (%1 = width, %2 = height).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR and four vector registers are requested.
; A 32-element int16_t row is 64 bytes = four 16-byte chunks; m1-m3 carry the
; chunks at byte offsets 16, 32 and 48 of the row at src + r3.
3359 %macro BLOCKCOPY_SS_W32_H4 2
3361 cglobal blockcopy_ss_%1x%2, 4, 5, 4
3377 movu m1, [r2 + r3 + 16]  ; src + r3, bytes 16-31
3378 movu m2, [r2 + r3 + 32]  ; src + r3, bytes 32-47
3379 movu m3, [r2 + r3 + 48]  ; src + r3, bytes 48-63
3382 movu [r0 + r1 + 16], m1  ; matching stores to the row at dst + r1
3383 movu [r0 + r1 + 32], m2
3384 movu [r0 + r1 + 48], m3
3386 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
3387 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
; second unrolled group
3400 movu m1, [r2 + r3 + 16]
3401 movu m2, [r2 + r3 + 32]
3402 movu m3, [r2 + r3 + 48]
3405 movu [r0 + r1 + 16], m1
3406 movu [r0 + r1 + 32], m2
3407 movu [r0 + r1 + 48], m3
3410 lea r2, [r2 + 2 * r3]
3411 lea r0, [r0 + 2 * r1]
; Instantiations: 32x8, 32x16, 32x24, 32x32, 32x64, 32x48.
3416 BLOCKCOPY_SS_W32_H4 32, 8
3417 BLOCKCOPY_SS_W32_H4 32, 16
3418 BLOCKCOPY_SS_W32_H4 32, 24
3419 BLOCKCOPY_SS_W32_H4 32, 32
3420 BLOCKCOPY_SS_W32_H4 32, 64
3422 BLOCKCOPY_SS_W32_H4 32, 48
3424 ;-----------------------------------------------------------------------------
3425 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3426 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-48 blocks (%1 = width, %2 = height).
;   Arguments in prototype order: r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride;
;   r4 is an extra GPR and six vector registers are requested.
; A 48-element int16_t row is 96 bytes = six 16-byte chunks; m1-m5 carry the
; chunks at byte offsets 16..80 of the row at src + r3.
3427 %macro BLOCKCOPY_SS_W48_H2 2
3429 cglobal blockcopy_ss_%1x%2, 4, 5, 6
3449 movu m1, [r2 + r3 + 16]  ; src + r3, bytes 16-31
3450 movu m2, [r2 + r3 + 32]
3451 movu m3, [r2 + r3 + 48]
3452 movu m4, [r2 + r3 + 64]
3453 movu m5, [r2 + r3 + 80]  ; src + r3, bytes 80-95 (end of row)
3456 movu [r0 + r1 + 16], m1  ; matching stores to the row at dst + r1
3457 movu [r0 + r1 + 32], m2
3458 movu [r0 + r1 + 48], m3
3459 movu [r0 + r1 + 64], m4
3460 movu [r0 + r1 + 80], m5
3462 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
3463 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
; second unrolled group
3480 movu m1, [r2 + r3 + 16]
3481 movu m2, [r2 + r3 + 32]
3482 movu m3, [r2 + r3 + 48]
3483 movu m4, [r2 + r3 + 64]
3484 movu m5, [r2 + r3 + 80]
3487 movu [r0 + r1 + 16], m1
3488 movu [r0 + r1 + 32], m2
3489 movu [r0 + r1 + 48], m3
3490 movu [r0 + r1 + 64], m4
3491 movu [r0 + r1 + 80], m5
3494 lea r2, [r2 + 2 * r3]
3495 lea r0, [r0 + 2 * r1]
; Instantiation: 48x64.
3500 BLOCKCOPY_SS_W48_H2 48, 64
3502 ;-----------------------------------------------------------------------------
3503 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3504 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-64 blocks (%1 = width, %2 = height).
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride (named on the cglobal line);
;   r4 is an extra GPR and six vector registers are requested.
; A 64-element int16_t row is 128 bytes = eight 16-byte chunks. The row is
; moved in two halves, each through m0-m3: offsets 0..48, then 64..112.
3505 %macro BLOCKCOPY_SS_W64_H4 2
3507 cglobal blockcopy_ss_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride
; row at src + r3 -> row at dst + r1, first half
3533 movu m1, [r2 + r3 + 16]
3534 movu m2, [r2 + r3 + 32]
3535 movu m3, [r2 + r3 + 48]
3538 movu [r0 + r1 + 16], m1
3539 movu [r0 + r1 + 32], m2
3540 movu [r0 + r1 + 48], m3
; second half of the same row (bytes 64-127)
3542 movu m0, [r2 + r3 + 64]
3543 movu m1, [r2 + r3 + 80]
3544 movu m2, [r2 + r3 + 96]
3545 movu m3, [r2 + r3 + 112]
3547 movu [r0 + r1 + 64], m0
3548 movu [r0 + r1 + 80], m1
3549 movu [r0 + r1 + 96], m2
3550 movu [r0 + r1 + 112], m3
3552 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
3553 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
; second unrolled group, same layout
3576 movu m1, [r2 + r3 + 16]
3577 movu m2, [r2 + r3 + 32]
3578 movu m3, [r2 + r3 + 48]
3581 movu [r0 + r1 + 16], m1
3582 movu [r0 + r1 + 32], m2
3583 movu [r0 + r1 + 48], m3
3585 movu m0, [r2 + r3 + 64]
3586 movu m1, [r2 + r3 + 80]
3587 movu m2, [r2 + r3 + 96]
3588 movu m3, [r2 + r3 + 112]
3590 movu [r0 + r1 + 64], m0
3591 movu [r0 + r1 + 80], m1
3592 movu [r0 + r1 + 96], m2
3593 movu [r0 + r1 + 112], m3
3596 lea r2, [r2 + 2 * r3]  ; src += 2 * r3
3597 lea r0, [r0 + 2 * r1]  ; dst += 2 * r1
; Instantiations: 64x16, 64x32, 64x48, 64x64.
3603 BLOCKCOPY_SS_W64_H4 64, 16
3604 BLOCKCOPY_SS_W64_H4 64, 32
3605 BLOCKCOPY_SS_W64_H4 64, 48
3606 BLOCKCOPY_SS_W64_H4 64, 64
3608 ;-----------------------------------------------------------------------------
3609 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
3610 ;-----------------------------------------------------------------------------
; Emits blockcopy_ss_%1x%2 for width-64 blocks (%1 = width, %2 = height),
; stepping four stride units per pointer advance.
;   r0 = dst, r1 = dstStride, r2 = src, r3 = srcStride (named on the cglobal line);
;   seven GPRs and four vector registers are requested.
; Chunk offsets step by 32 bytes: four chunks (0, 32, 64, 96) cover the
; 128-byte row of 64 int16_t.
; NOTE(review): r6 (with src) and r5 (with dst) are used as a further row
; offset - presumably 3 * stride; confirm against where they are set up.
3611 %macro BLOCKCOPY_SS_W64_H4_avx 2
3613 cglobal blockcopy_ss_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride
; row at src + r3 -> row at dst + r1
3631 movu m1, [r2 + r3 + 32]
3632 movu m2, [r2 + r3 + 64]
3633 movu m3, [r2 + r3 + 96]
3636 movu [r0 + r1 + 32], m1
3637 movu [r0 + r1 + 64], m2
3638 movu [r0 + r1 + 96], m3
; row at src + 2 * r3 -> row at dst + 2 * r1
3640 movu m0, [r2 + 2 * r3]
3641 movu m1, [r2 + 2 * r3 + 32]
3642 movu m2, [r2 + 2 * r3 + 64]
3643 movu m3, [r2 + 2 * r3 + 96]
3645 movu [r0 + 2 * r1], m0
3646 movu [r0 + 2 * r1 + 32], m1
3647 movu [r0 + 2 * r1 + 64], m2
3648 movu [r0 + 2 * r1 + 96], m3
; row at src + r6 -> row at dst + r5
3651 movu m1, [r2 + r6 + 32]
3652 movu m2, [r2 + r6 + 64]
3653 movu m3, [r2 + r6 + 96]
3654 lea r2, [r2 + 4 * r3]  ; src += 4 * r3 (after the last load of this group)
3657 movu [r0 + r5 + 32], m1
3658 movu [r0 + r5 + 64], m2
3659 movu [r0 + r5 + 96], m3
3660 lea r0, [r0 + 4 * r1]  ; dst += 4 * r1
; Instantiations: 64x16, 64x32, 64x48, 64x64.
3667 BLOCKCOPY_SS_W64_H4_avx 64, 16
3668 BLOCKCOPY_SS_W64_H4_avx 64, 32
3669 BLOCKCOPY_SS_W64_H4_avx 64, 48
3670 BLOCKCOPY_SS_W64_H4_avx 64, 64
3672 ;--------------------------------------------------------------------------------------
3673 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
3674 ;--------------------------------------------------------------------------------------
; cpy2Dto1D_shr for a 4-wide block: strided src rows -> contiguous dst.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = srcStride
;   (three arguments are loaded into registers; four vector registers).
; Two source rows are packed per register, so the output is two full
; register-sized stores at consecutive mmsize offsets.
3676 cglobal cpy2Dto1D_shr_4, 3, 4, 4
3688 ; m1 - word [-round]
3692 movhps m2, [r1 + r2]  ; upper 64 bits of m2 <- 8 bytes at src + r2
3693 lea r1, [r1 + r2 * 2]  ; src += 2 * r2
3695 movhps m3, [r1 + r2]  ; upper 64 bits of m3 <- 8 bytes at src + r2
3700 mova [r0 + 0 * mmsize], m2  ; aligned store, first register-sized slot of dst
3701 mova [r0 + 1 * mmsize], m3  ; aligned store, second slot
3705 ;--------------------------------------------------------------------------------------
3706 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
3707 ;--------------------------------------------------------------------------------------
; cpy2Dto1D_shr for an 8-wide block: strided src rows -> contiguous dst.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = srcStride
;   (five GPRs and four vector registers are requested).
; Four register-sized slots of dst are written per group, then src advances
; by 4 * r2.
3709 cglobal cpy2Dto1D_shr_8, 3, 5, 4
3725 ; m1 - word [-round]
3735 mova [r0 + 0 * mmsize], m2  ; dst slot 0
3736 mova [r0 + 1 * mmsize], m3  ; dst slot 1
3739 mova m2, [r1 + r2 * 2]  ; aligned load of the row at src + 2 * r2
3745 mova [r0 + 2 * mmsize], m2  ; dst slot 2
3746 mova [r0 + 3 * mmsize], m3  ; dst slot 3
3749 lea r1, [r1 + r2 * 4]  ; src += 4 * r2
3755 ;--------------------------------------------------------------------------------------
3756 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
3757 ;--------------------------------------------------------------------------------------
; cpy2Dto1D_shr for a 16-wide block: strided src rows -> contiguous dst.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = srcStride
;   (four GPRs and four vector registers are requested).
; Each source row is two registers wide; two rows (at src and src + r2) fill
; four consecutive register-sized slots of dst.
3759 cglobal cpy2Dto1D_shr_16, 3, 4, 4
3773 ; m1 - word [-round]
3777 mova m2, [r1 + 0 * mmsize]  ; row at src, first half
3778 mova m3, [r1 + 1 * mmsize]  ; row at src, second half
3783 mova [r0 + 0 * mmsize], m2
3784 mova [r0 + 1 * mmsize], m3
3787 mova m2, [r1 + r2 + 0 * mmsize]  ; row at src + r2, first half
3788 mova m3, [r1 + r2 + 1 * mmsize]  ; row at src + r2, second half
3793 mova [r0 + 2 * mmsize], m2
3794 mova [r0 + 3 * mmsize], m3
3797 lea r1, [r1 + r2 * 2]  ; src += 2 * r2
3803 ;--------------------------------------------------------------------------------------
3804 ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
3805 ;--------------------------------------------------------------------------------------
; cpy2Dto1D_shr for a 32-wide block: strided src rows -> contiguous dst.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = srcStride
;   (four GPRs and six vector registers are requested).
; One source row is four registers wide (m2-m5) and fills four consecutive
; register-sized slots of dst.
3807 cglobal cpy2Dto1D_shr_32, 3, 4, 6
3821 ; m1 - word [-round]
3825 mova m2, [r1 + 0 * mmsize]  ; aligned loads of one source row
3826 mova m3, [r1 + 1 * mmsize]
3827 mova m4, [r1 + 2 * mmsize]
3828 mova m5, [r1 + 3 * mmsize]
3837 mova [r0 + 0 * mmsize], m2  ; aligned stores to contiguous dst
3838 mova [r0 + 1 * mmsize], m3
3839 mova [r0 + 2 * mmsize], m4
3840 mova [r0 + 3 * mmsize], m5
3849 ;--------------------------------------------------------------------------------------
3850 ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
3851 ;--------------------------------------------------------------------------------------
; cpy1Dto2D_shl for a 4-wide block: contiguous src -> strided dst rows.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = dstStride
;   (three GPRs, three vector registers).
; Two registers are loaded from consecutive mmsize slots of src; their 64-bit
; halves are written out as separate destination rows.
3853 cglobal cpy1Dto2D_shl_4, 3, 3, 3
3858 mova m1, [r1 + 0 * mmsize]  ; aligned load, src slot 0
3859 mova m2, [r1 + 1 * mmsize]  ; aligned load, src slot 1
3863 movhps [r0 + r2], m1  ; upper 64 bits of m1 -> row at dst + r2
3864 movh [r0 + r2 * 2], m2  ; lower half of m2 -> row at dst + 2 * r2
3866 movhps [r0 + r2], m2  ; upper 64 bits of m2 -> dst + r2 (r2 as it stands at this point)
; Second cpy1Dto2D_shl_4 variant, working on a register wider than 128 bits
; (vextracti128 splits m1 into 128-bit halves).
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = dstStride
;   (three GPRs, two vector registers).
3871 cglobal cpy1Dto2D_shl_4, 3, 3, 2
3878 vextracti128 xm0, m1, 1  ; xm0 <- upper 128 bits of m1
3880 movhps [r0 + r2], xm1  ; upper 64 bits of xm1 -> row at dst + r2
3881 lea r0, [r0 + r2 * 2]  ; dst += 2 * r2
3883 movhps [r0 + r2], xm0  ; upper 64 bits of xm0 -> row at dst + r2
3887 ;--------------------------------------------------------------------------------------
3888 ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
3889 ;--------------------------------------------------------------------------------------
; cpy1Dto2D_shl for an 8-wide block: contiguous src -> strided dst rows.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = dstStride
;   (four GPRs, five vector registers).
; src is read as eight consecutive register-sized slots, four at a time
; (m1-m4); dst advances by 4 * r2 between the two groups.
3891 cglobal cpy1Dto2D_shl_8, 3, 4, 5
3897 mova m1, [r1 + 0 * mmsize]  ; src slots 0-3
3898 mova m2, [r1 + 1 * mmsize]
3899 mova m3, [r1 + 2 * mmsize]
3900 mova m4, [r1 + 3 * mmsize]
3907 mova [r0 + r2 * 2], m3  ; m3 -> row at dst + 2 * r2
3909 lea r0, [r0 + r2 * 4]  ; dst += 4 * r2
3912 mova m1, [r1 + 4 * mmsize]  ; src slots 4-7
3913 mova m2, [r1 + 5 * mmsize]
3914 mova m3, [r1 + 6 * mmsize]
3915 mova m4, [r1 + 7 * mmsize]
3922 mova [r0 + r2 * 2], m3  ; m3 -> row at dst + 2 * r2
; Second cpy1Dto2D_shl_8 variant, working on registers wider than 128 bits:
; each register holds two destination rows, split with vextracti128.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = dstStride
;   (four GPRs, three vector registers).
; NOTE(review): r3 is used as a row offset - presumably 3 * dstStride; confirm
; against where it is set up.
3928 cglobal cpy1Dto2D_shl_8, 3, 4, 3
3934 movu m1, [r1 + 0 * mmsize]  ; src slots 0-1
3935 movu m2, [r1 + 1 * mmsize]
3939 vextracti128 [r0 + r2], m1, 1  ; upper 128 bits of m1 -> row at dst + r2
3940 movu [r0 + r2 * 2], xm2  ; lower 128 bits of m2 -> row at dst + 2 * r2
3941 vextracti128 [r0 + r3], m2, 1  ; upper 128 bits of m2 -> row at dst + r3
3944 movu m1, [r1 + 2 * mmsize]  ; src slots 2-3
3945 movu m2, [r1 + 3 * mmsize]
3946 lea r0, [r0 + r2 * 4]  ; dst += 4 * r2
3950 vextracti128 [r0 + r2], m1, 1
3951 movu [r0 + r2 * 2], xm2
3952 vextracti128 [r0 + r3], m2, 1
3956 ;--------------------------------------------------------------------------------------
3957 ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
3958 ;--------------------------------------------------------------------------------------
; cpy1Dto2D_shl for a 16-wide block: contiguous src -> strided dst rows.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = dstStride
;   (four GPRs, five vector registers).
; src is read as consecutive register-sized slots, four at a time (m1-m4);
; dst advances by 2 * r2 per group of four registers.
3960 cglobal cpy1Dto2D_shl_16, 3, 4, 5
3967 mova m1, [r1 + 0 * mmsize]  ; src slots 0-3
3968 mova m2, [r1 + 1 * mmsize]
3969 mova m3, [r1 + 2 * mmsize]
3970 mova m4, [r1 + 3 * mmsize]
3978 mova [r0 + r2 + 16], m4  ; m4 -> bytes 16-31 of the row at dst + r2
3981 mova m1, [r1 + 4 * mmsize]  ; src slots 4-7
3982 mova m2, [r1 + 5 * mmsize]
3983 mova m3, [r1 + 6 * mmsize]
3984 mova m4, [r1 + 7 * mmsize]
3985 lea r0, [r0 + r2 * 2]  ; dst += 2 * r2
3993 mova [r0 + r2 + 16], m4
3996 lea r0, [r0 + r2 * 2]  ; dst += 2 * r2
; Second cpy1Dto2D_shl_16 variant (five GPRs, three vector registers).
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = dstStride.
; src is read as consecutive register-sized slots; dst advances by 4 * r2.
4003 cglobal cpy1Dto2D_shl_16, 3, 5, 3
4011 movu m1, [r1 + 0 * mmsize]  ; src slots 0-1
4012 movu m2, [r1 + 1 * mmsize]
4019 movu m1, [r1 + 2 * mmsize]  ; src slots 2-3
4020 movu m2, [r1 + 3 * mmsize]
4023 movu [r0 + r2 * 2], m1  ; m1 -> row at dst + 2 * r2
4027 lea r0, [r0 + r2 * 4]  ; dst += 4 * r2
4033 ;--------------------------------------------------------------------------------------
4034 ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
4035 ;--------------------------------------------------------------------------------------
; cpy1Dto2D_shl for a 32-wide block: contiguous src -> strided dst rows.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = dstStride
;   (four GPRs, five vector registers).
; One destination row is four registers wide. Eight consecutive src slots
; fill the rows at dst and dst + r2, then dst advances by 2 * r2.
4037 cglobal cpy1Dto2D_shl_32, 3, 4, 5
4044 mova m1, [r1 + 0 * mmsize]  ; src slots 0-3
4045 mova m2, [r1 + 1 * mmsize]
4046 mova m3, [r1 + 2 * mmsize]
4047 mova m4, [r1 + 3 * mmsize]
4052 mova [r0 + 0 * mmsize], m1  ; row at dst
4053 mova [r0 + 1 * mmsize], m2
4054 mova [r0 + 2 * mmsize], m3
4055 mova [r0 + 3 * mmsize], m4
4058 mova m1, [r1 + 4 * mmsize]  ; src slots 4-7
4059 mova m2, [r1 + 5 * mmsize]
4060 mova m3, [r1 + 6 * mmsize]
4061 mova m4, [r1 + 7 * mmsize]
4066 mova [r0 + r2 + 0 * mmsize], m1  ; row at dst + r2
4067 mova [r0 + r2 + 1 * mmsize], m2
4068 mova [r0 + r2 + 2 * mmsize], m3
4069 mova [r0 + r2 + 3 * mmsize], m4
4072 lea r0, [r0 + r2 * 2]  ; dst += 2 * r2
; Second cpy1Dto2D_shl_32 variant, using unaligned moves.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = dstStride
;   (four GPRs, five vector registers).
; Four consecutive src slots are loaded; the stores shown place m2 and m4 in
; the second register-sized slot of the rows at dst and dst + r2.
4079 cglobal cpy1Dto2D_shl_32, 3, 4, 5
4086 movu m1, [r1 + 0 * mmsize]  ; src slots 0-3
4087 movu m2, [r1 + 1 * mmsize]
4088 movu m3, [r1 + 2 * mmsize]
4089 movu m4, [r1 + 3 * mmsize]
4095 movu [r0 + mmsize], m2  ; second slot of the row at dst
4097 movu [r0 + r2 + mmsize], m4  ; second slot of the row at dst + r2
4100 lea r0, [r0 + r2 * 2]  ; dst += 2 * r2
4106 ;--------------------------------------------------------------------------------------
4107 ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
4108 ;--------------------------------------------------------------------------------------
; copy_cnt for a 4-wide block (see prototype above; returns uint32_t).
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = srcStride
;   (three GPRs, three vector registers).
; Source rows are gathered two per register via 64-bit half loads.
4110 cglobal copy_cnt_4, 3,3,3
4116 movhps m0, [r1 + r2]  ; upper 64 bits of m0 <- 8 bytes at src + r2
4120 movh m1, [r1 + r2 * 2]  ; lower half of m1 <- row at src + 2 * r2
4122 movhps m1, [r1 + r2]  ; upper 64 bits of m1 <- src + r2 (r2 as it stands at this point)
4129 ; CHECK_ME: Intel's documentation lists POPCNT as an SSE4.2 instruction, but it is only implemented on Nehalem and later CPUs
4145 ;--------------------------------------------------------------------------------------
4146 ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
4147 ;--------------------------------------------------------------------------------------
; copy_cnt for an 8-wide block (see prototype above; returns uint32_t).
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = srcStride
;   (three GPRs, six vector registers).
; src is advanced three times by 2 * r2 between row groups.
4149 cglobal copy_cnt_8, 3,3,6
4165 lea r1, [r1 + 2 * r2]  ; src += 2 * r2
4176 lea r1, [r1 + 2 * r2]
4187 lea r1, [r1 + 2 * r2]
; Second copy_cnt_8 variant, working on registers wider than 128 bits: two
; 128-bit source rows are combined per register with vinserti128.
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = srcStride
;   (four GPRs, five vector registers).
; NOTE(review): r3 is used as a row offset - presumably 3 * srcStride; confirm
; against where it is set up.
4208 cglobal copy_cnt_8, 3,4,5
4214 vinserti128 m0, m0, [r1 + r2], 1  ; upper 128 bits of m0 <- row at src + r2
4218 movu xm1, [r1 + r2 * 2]  ; xm1 <- row at src + 2 * r2
4219 vinserti128 m1, m1, [r1 + r3], 1  ; upper 128 bits of m1 <- row at src + r3
4221 lea r1, [r1 + r2 * 4]  ; src += 4 * r2
4225 vinserti128 m2, m2, [r1 + r2], 1
4229 movu xm3, [r1 + r2 * 2]
4230 vinserti128 m3, m3, [r1 + r3], 1
4240 vextracti128 xm1, m0, 1  ; xm1 <- upper 128 bits of m0
4249 ;--------------------------------------------------------------------------------------
4250 ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
4251 ;--------------------------------------------------------------------------------------
; copy_cnt for a 16-wide block (see prototype above; returns uint32_t).
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = srcStride
;   (four GPRs, six vector registers).
; A source row is read as 16-byte chunks at byte offsets 0 and 16.
4253 cglobal copy_cnt_16, 3,4,6
4272 movu m1, [r1 + r2 + 16]  ; row at src + r2, bytes 16-31
4281 movu m0, [r1 + 2 * r2]  ; row at src + 2 * r2, bytes 0-15
4282 movu m1, [r1 + 2 * r2 + 16]  ; row at src + 2 * r2, bytes 16-31
4291 lea r1, [r1 + 2 * r2]  ; src += 2 * r2
4293 movu m1, [r1 + r2 + 16]
4302 lea r1, [r1 + 2 * r2]  ; src += 2 * r2
; Second copy_cnt_16 variant, working on registers wider than 128 bits
; (m4 is split with vextracti128).
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = srcStride
;   (five GPRs, five vector registers).
4316 cglobal copy_cnt_16, 3, 5, 5
4335 movu m1, [r1 + r2 * 2]  ; row at src + 2 * r2
4346 lea r1, [r1 + 4 * r2]  ; src += 4 * r2
4352 vextracti128 xm1, m4, 1  ; xm1 <- upper 128 bits of m4
4360 ;--------------------------------------------------------------------------------------
4361 ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
4362 ;--------------------------------------------------------------------------------------
; copy_cnt for a 32-wide block (returns uint32_t per the prototype above).
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = stride
;   (four GPRs, six vector registers).
; A source row is read as 16-byte chunks at byte offsets 16, 32 and 48
; (plus offset 0, by the same pattern).
4364 cglobal copy_cnt_32, 3,4,6
4392 movu m1, [r1 + r2 + 16]  ; row at src + r2, bytes 16-31
4400 movu m0, [r1 + r2 + 32]  ; bytes 32-47
4401 movu m1, [r1 + r2 + 48]  ; bytes 48-63
4410 lea r1, [r1 + 2 * r2]  ; src += 2 * r2
; Second copy_cnt_32 variant, working on registers wider than 128 bits
; (m4 is split with vextracti128).
;   Arguments in prototype order: r0 = dst, r1 = src, r2 = stride
;   (five GPRs, five vector registers).
4425 cglobal copy_cnt_32, 3, 5, 5
4445 movu m2, [r1 + r2 + 32]  ; row at src + r2, starting at byte offset 32
4454 lea r1, [r1 + 2 * r2]  ; src += 2 * r2
4460 vextracti128 xm1, m4, 1  ; xm1 <- upper 128 bits of m4
4469 ;--------------------------------------------------------------------------------------
4470 ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
4471 ;--------------------------------------------------------------------------------------
; cpy2Dto1D_shl_4: x86inc entry point declared with 4 arguments (r0 = dst,
; r1 = src, r2 = srcStride, r3 = shift per the prototype), 4 GPRs and
; 4 vector registers.
4473 cglobal cpy2Dto1D_shl_4, 4, 4, 4
; Load 64 bits from [r1 + r2] into the high half of m2.
4485 movhps m2, [r1 + r2]
; Advance the source pointer: r1 += 2 * r2.
4486 lea r1, [r1 + r2 * 2]
; Load 64 bits from the new [r1 + r2] into the high half of m3.
4488 movhps m3, [r1 + r2]
; Aligned stores of m2 and m3 to two consecutive mmsize-byte slots at dst,
; so the output is written contiguously. mova requires an aligned address.
4491 mova [r0 + 0 * mmsize], m2
4492 mova [r0 + 1 * mmsize], m3
4497 ;--------------------------------------------------------------------------------------
4498 ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
4499 ;--------------------------------------------------------------------------------------
; cpy2Dto1D_shl_8: x86inc entry point declared with 4 arguments (r0 = dst,
; r1 = src, r2 = srcStride, r3 = shift per the prototype), 5 GPRs and
; 4 vector registers.
4501 cglobal cpy2Dto1D_shl_8, 4, 5, 4
; Aligned stores of m2 and m3 to dst slots 0 and 1 (mmsize bytes each).
4521 mova [r0 + 0 * mmsize], m2
4522 mova [r0 + 1 * mmsize], m3
; Aligned load from [r1 + 2 * r2]; mova requires this source address to be
; aligned.
4525 mova m2, [r1 + r2 * 2]
; Aligned stores of m2 and m3 to dst slots 2 and 3.
4529 mova [r0 + 2 * mmsize], m2
4530 mova [r0 + 3 * mmsize], m3
; Advance the source pointer: r1 += 4 * r2.
4533 lea r1, [r1 + r2 * 4]
4539 ;--------------------------------------------------------------------------------------
4540 ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
4541 ;--------------------------------------------------------------------------------------
; cpy2Dto1D_shl_16: x86inc entry point declared with 4 arguments (r0 = dst,
; r1 = src, r2 = srcStride, r3 = shift per the prototype), 4 GPRs and
; 4 vector registers.
; NOTE(review): the shift itself is presumably applied to m2/m3 between each
; load/store pair - confirm against the full routine.
4543 cglobal cpy2Dto1D_shl_16, 4, 4, 4
; Aligned loads of two consecutive mmsize-byte vectors starting at r1.
4557 mova m2, [r1 + 0 * mmsize]
4558 mova m3, [r1 + 1 * mmsize]
; Aligned stores to dst slots 0 and 1.
4561 mova [r0 + 0 * mmsize], m2
4562 mova [r0 + 1 * mmsize], m3
; Same two-vector load, this time starting at r1 + r2.
4565 mova m2, [r1 + r2 + 0 * mmsize]
4566 mova m3, [r1 + r2 + 1 * mmsize]
; Aligned stores to dst slots 2 and 3, directly after the first pair.
4569 mova [r0 + 2 * mmsize], m2
4570 mova [r0 + 3 * mmsize], m3
; Advance the source pointer: r1 += 2 * r2.
4573 lea r1, [r1 + r2 * 2]
4579 ;--------------------------------------------------------------------------------------
4580 ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
4581 ;--------------------------------------------------------------------------------------
; cpy2Dto1D_shl_32: x86inc entry point declared with 4 arguments (r0 = dst,
; r1 = src, r2 = srcStride, r3 = shift per the prototype), 4 GPRs and
; 6 vector registers.
; NOTE(review): the shift itself is presumably applied to m2..m5 between the
; loads and the stores - confirm against the full routine.
4583 cglobal cpy2Dto1D_shl_32, 4, 4, 6
; Aligned loads of four consecutive mmsize-byte vectors starting at r1.
4597 mova m2, [r1 + 0 * mmsize]
4598 mova m3, [r1 + 1 * mmsize]
4599 mova m4, [r1 + 2 * mmsize]
4600 mova m5, [r1 + 3 * mmsize]
; Aligned stores of the four vectors to consecutive slots starting at r0.
4605 mova [r0 + 0 * mmsize], m2
4606 mova [r0 + 1 * mmsize], m3
4607 mova [r0 + 2 * mmsize], m4
4608 mova [r0 + 3 * mmsize], m5
4617 ;--------------------------------------------------------------------------------------
4618 ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
4619 ;--------------------------------------------------------------------------------------
; cpy1Dto2D_shr_4: x86inc entry point declared with 3 arguments (r0 = dst,
; r1 = src, r2 = dstStride per the prototype), 3 GPRs and 4 vector registers.
; NOTE(review): the prototype has a 4th argument (shift) that is not among
; the 3 declared here; it is presumably read separately - confirm.
4621 cglobal cpy1Dto2D_shr_4, 3, 3, 4
; Aligned loads of two consecutive mmsize-byte vectors from the source.
4629 mova m2, [r1 + 0 * mmsize]
4630 mova m3, [r1 + 1 * mmsize]
; Store the high 64 bits of m2 to [r0 + r2].
4636 movhps [r0 + r2], m2
; Store the low half of m3 to [r0 + 2 * r2] (movh is an x86inc move macro).
4637 movh [r0 + r2 * 2], m3
; Store the high 64 bits of m3 to [r0 + r2].
; NOTE(review): this targets the same visible address expression as the m2
; store above; r0 or r2 must be modified in between, otherwise this store
; overwrites the earlier one - confirm against the full routine.
4639 movhps [r0 + r2], m3
; Second cpy1Dto2D_shr_4 definition: 3 arguments (r0 = dst, r1 = src,
; r2 = dstStride per the prototype), 3 GPRs, 3 vector registers.
; vextracti128 below is an AVX2 instruction.
4644 cglobal cpy1Dto2D_shr_4, 3, 3, 3
; Copy the upper 128 bits of m2 into xm1.
4655 vextracti128 xm1, m2, 1
; Store the high 64 bits of xm2 (lower lane of m2) to [r0 + r2].
4657 movhps [r0 + r2], xm2
; Advance the destination pointer: r0 += 2 * r2.
4658 lea r0, [r0 + r2 * 2]
; Store the high 64 bits of xm1 (taken from the upper lane of m2) to the
; new [r0 + r2].
4660 movhps [r0 + r2], xm1
4664 ;--------------------------------------------------------------------------------------
4665 ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
4666 ;--------------------------------------------------------------------------------------
; cpy1Dto2D_shr_8: x86inc entry point declared with 3 arguments (r0 = dst,
; r1 = src, r2 = dstStride per the prototype), 4 GPRs and 6 vector registers.
; NOTE(review): the prototype has a 4th argument (shift) that is not among
; the 3 declared here; it is presumably read separately - confirm.
4668 cglobal cpy1Dto2D_shr_8, 3, 4, 6
; Aligned loads of source vectors 0..3 (consecutive mmsize-byte slots at r1).
4677 mova m2, [r1 + 0 * mmsize]
4678 mova m3, [r1 + 1 * mmsize]
4679 mova m4, [r1 + 2 * mmsize]
4680 mova m5, [r1 + 3 * mmsize]
; Aligned store of m4 to [r0 + 2 * r2].
4691 mova [r0 + r2 * 2], m4
; Aligned loads of source vectors 4..7.
4695 mova m2, [r1 + 4 * mmsize]
4696 mova m3, [r1 + 5 * mmsize]
4697 mova m4, [r1 + 6 * mmsize]
4698 mova m5, [r1 + 7 * mmsize]
; Advance the destination pointer: r0 += 4 * r2.
4699 lea r0, [r0 + r2 * 4]
; Aligned store of m4 to the new [r0 + 2 * r2].
4710 mova [r0 + r2 * 2], m4
; Second cpy1Dto2D_shr_8 definition: 3 arguments (r0 = dst, r1 = src,
; r2 = dstStride per the prototype), 4 GPRs, 4 vector registers.
; vextracti128 below is an AVX2 instruction; each 256-bit register is split
; into two 128-bit stores at different destination offsets.
; NOTE(review): r3 is presumably 3 * r2, set up before this point - confirm.
4716 cglobal cpy1Dto2D_shr_8, 3, 4, 4
; Unaligned loads of source vectors 0 and 1.
4725 movu m2, [r1 + 0 * mmsize]
4726 movu m3, [r1 + 1 * mmsize]
; Upper lane of m2 -> [r0 + r2].
4732 vextracti128 [r0 + r2], m2, 1
; Lower lane of m3 -> [r0 + 2 * r2]; upper lane of m3 -> [r0 + r3].
4733 movu [r0 + r2 * 2], xm3
4734 vextracti128 [r0 + r3], m3, 1
; Unaligned loads of source vectors 2 and 3.
4737 movu m2, [r1 + 2 * mmsize]
4738 movu m3, [r1 + 3 * mmsize]
; Advance the destination pointer: r0 += 4 * r2.
4739 lea r0, [r0 + r2 * 4]
; Same three stores as above, relative to the advanced r0.
4745 vextracti128 [r0 + r2], m2, 1
4746 movu [r0 + r2 * 2], xm3
4747 vextracti128 [r0 + r3], m3, 1
4751 ;--------------------------------------------------------------------------------------
4752 ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
4753 ;--------------------------------------------------------------------------------------
; cpy1Dto2D_shr_16: x86inc entry point declared with 3 arguments (r0 = dst,
; r1 = src, r2 = dstStride per the prototype), 5 GPRs and 6 vector registers.
; NOTE(review): the prototype has a 4th argument (shift) that is not among
; the 3 declared here; it is presumably read separately - confirm.
4755 cglobal cpy1Dto2D_shr_16, 3, 5, 6
; Aligned loads of source vectors 0..3 (consecutive mmsize-byte slots at r1).
4766 mova m2, [r1 + 0 * mmsize]
4767 mova m3, [r1 + 1 * mmsize]
4768 mova m4, [r1 + 2 * mmsize]
4769 mova m5, [r1 + 3 * mmsize]
; m3 -> second vector slot at r0; m5 -> second vector slot at r0 + r2.
4779 mova [r0 + mmsize], m3
4781 mova [r0 + r2 + mmsize], m5
; Aligned loads of source vectors 4..7.
4784 mova m2, [r1 + 4 * mmsize]
4785 mova m3, [r1 + 5 * mmsize]
4786 mova m4, [r1 + 6 * mmsize]
4787 mova m5, [r1 + 7 * mmsize]
; m2 and m3 -> the two vector slots at r0 + 2 * r2.
4796 mova [r0 + r2 * 2], m2
4797 mova [r0 + r2 * 2 + mmsize], m3
; m5 -> second vector slot at r0 + r4.
; NOTE(review): r4 is presumably 3 * r2, set up before this point - confirm.
4799 mova [r0 + r4 + mmsize], m5
; Advance the destination pointer: r0 += 4 * r2.
4802 lea r0, [r0 + r2 * 4]
; Second cpy1Dto2D_shr_16 definition: 3 arguments (r0 = dst, r1 = src,
; r2 = dstStride per the prototype), 5 GPRs, 4 vector registers.
; All visible memory accesses use movu, so no alignment is required by them.
4809 cglobal cpy1Dto2D_shr_16, 3, 5, 4
; Unaligned loads of source vectors 0 and 1.
4820 movu m2, [r1 + 0 * mmsize]
4821 movu m3, [r1 + 1 * mmsize]
; Unaligned loads of source vectors 2 and 3 (m2/m3 are reused).
4830 movu m2, [r1 + 2 * mmsize]
4831 movu m3, [r1 + 3 * mmsize]
; Store m2 to [r0 + 2 * r2].
4836 movu [r0 + r2 * 2], m2
; Advance the destination pointer: r0 += 4 * r2.
4840 lea r0, [r0 + r2 * 4]
4846 ;--------------------------------------------------------------------------------------
4847 ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
4848 ;--------------------------------------------------------------------------------------
; cpy1Dto2D_shr_32: x86inc entry point declared with 3 arguments (r0 = dst,
; r1 = src, r2 = dstStride per the prototype), 4 GPRs and 6 vector registers.
; NOTE(review): the prototype has a 4th argument (shift) that is not among
; the 3 declared here; it is presumably read separately - confirm.
4850 cglobal cpy1Dto2D_shr_32, 3, 4, 6
; Aligned loads of source vectors 0..3 (consecutive mmsize-byte slots at r1).
4860 mova m2, [r1 + 0 * mmsize]
4861 mova m3, [r1 + 1 * mmsize]
4862 mova m4, [r1 + 2 * mmsize]
4863 mova m5, [r1 + 3 * mmsize]
; Aligned stores of the four vectors to consecutive slots starting at r0.
4872 mova [r0 + 0 * mmsize], m2
4873 mova [r0 + 1 * mmsize], m3
4874 mova [r0 + 2 * mmsize], m4
4875 mova [r0 + 3 * mmsize], m5
; Aligned loads of source vectors 4..7.
4878 mova m2, [r1 + 4 * mmsize]
4879 mova m3, [r1 + 5 * mmsize]
4880 mova m4, [r1 + 6 * mmsize]
4881 mova m5, [r1 + 7 * mmsize]
; Aligned stores of those four vectors to consecutive slots at r0 + r2.
4890 mova [r0 + r2 + 0 * mmsize], m2
4891 mova [r0 + r2 + 1 * mmsize], m3
4892 mova [r0 + r2 + 2 * mmsize], m4
4893 mova [r0 + r2 + 3 * mmsize], m5
; Advance the destination pointer: r0 += 2 * r2.
4896 lea r0, [r0 + r2 * 2]
4903 cglobal cpy1Dto2D_shr_32, 3, 4, 6
4913 movu m2, [r1 + 0 * mmsize]
4914 movu m3, [r1 + 1 * mmsize]
4915 movu m4, [r1 + 2 * mmsize]
4916 movu m5, [r1 + 3 * mmsize]
4926 movu [r0 + mmsize], m3
4928 movu [r0 + r2 + mmsize], m5
4931 lea r0, [r0 + r2 * 2]