Imported Upstream version 1.4+222+hg5f9f7194267b
[deb_x265.git] / source / common / x86 / blockcopy8.h
CommitLineData
72b9787e
JB
/*****************************************************************************
 * Copyright (C) 2013 x265 project
 *
 * Authors: Steve Borho <steve@borho.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/
23
24#ifndef X265_BLOCKCOPY8_H
25#define X265_BLOCKCOPY8_H
26
b53f7c52
JB
/* SSE2 "cpy2Dto1D" prototypes: left-shift (shl) and right-shift (shr)
 * variants for the 4, 8, 16 and 32 sizes encoded in each name.
 * Only the declarations live in this header.
 * NOTE(review): presumably these read a strided 2D source (srcStride) into a
 * contiguous dst while applying `shift` -- confirm against the implementation. */
void x265_cpy2Dto1D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
void x265_cpy2Dto1D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
void x265_cpy2Dto1D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
void x265_cpy2Dto1D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
void x265_cpy2Dto1D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
void x265_cpy2Dto1D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
void x265_cpy2Dto1D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
void x265_cpy2Dto1D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
/* "cpy1Dto2D" prototypes: left-shift (shl) and right-shift (shr) variants for
 * the 4, 8, 16 and 32 sizes, in AVX2 and SSE2 flavours.
 * The stride parameter is named dstStride on every prototype so the AVX2 and
 * SSE2 declarations agree; parameter names in prototypes do not affect callers.
 * NOTE(review): presumably these write a contiguous src into a strided 2D dst
 * while applying `shift` -- confirm against the implementation. */
void x265_cpy1Dto2D_shl_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shl_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shl_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shl_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shl_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shr_4_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shr_4_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift);
/* "copy_cnt" prototypes for the 4, 8, 16 and 32 sizes, in SSE4 and AVX2
 * flavours. Each takes a destination, a source and a source stride and
 * returns a uint32_t.
 * NOTE(review): the return value is presumably a count derived from the
 * copied coefficients -- confirm against the implementation. */
uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
uint32_t x265_copy_cnt_32_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride);
uint32_t x265_copy_cnt_4_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
uint32_t x265_copy_cnt_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
uint32_t x265_copy_cnt_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
uint32_t x265_copy_cnt_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride);
72b9787e
JB
59
/* Declares the pp, sp and ss block-copy prototypes for one W x H size and one
 * cpu suffix, i.e. x265_blockcopy_{pp,sp,ss}_<W>x<H><cpu>.
 * The type `pixel` must be declared before the macro is expanded.
 * The expansion carries no trailing semicolon; the use site supplies it, which
 * avoids an empty declaration (";;") at file scope. */
#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); \
    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
72b9787e
JB
64
/* Declares the ps block-copy prototype (int16_t destination, pixel source)
 * for one W x H size and one cpu suffix: x265_blockcopy_ps_<W>x<H><cpu>.
 * The type `pixel` must be declared before the macro is expanded.
 * The expansion carries no trailing semicolon; the use site supplies it. */
#define SETUP_BLOCKCOPY_PS(W, H, cpu) \
    void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
72b9787e
JB
67
/* Declares the sp block-copy prototype (pixel destination, int16_t source)
 * for one W x H size and one cpu suffix: x265_blockcopy_sp_<W>x<H><cpu>.
 * The type `pixel` must be declared before the macro is expanded.
 * The expansion carries no trailing semicolon; the use site supplies it. */
#define SETUP_BLOCKCOPY_SP(W, H, cpu) \
    void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
72b9787e
JB
70
/* Declares the pp and ss block-copy prototypes for one W x H size and one cpu
 * suffix: x265_blockcopy_{pp,ss}_<W>x<H><cpu>.
 * The type `pixel` must be declared before the macro is expanded.
 * The expansion carries no trailing semicolon; the use site supplies it. */
#define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \
    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); \
    void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
72b9787e
JB
74
/* Expands SETUP_BLOCKCOPY_FUNC once per listed W x H size for the given cpu
 * suffix. SETUP_BLOCKCOPY_FUNC must be defined at the point of expansion.
 * The last entry has no trailing semicolon; the invocation supplies it. */
#define BLOCKCOPY_COMMON(cpu) \
    SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \
    SETUP_BLOCKCOPY_FUNC(4, 2, cpu); \
    SETUP_BLOCKCOPY_FUNC(8, 8, cpu); \
    SETUP_BLOCKCOPY_FUNC(8, 4, cpu); \
    SETUP_BLOCKCOPY_FUNC(4, 8, cpu); \
    SETUP_BLOCKCOPY_FUNC(8, 6, cpu); \
    SETUP_BLOCKCOPY_FUNC(8, 2, cpu); \
    SETUP_BLOCKCOPY_FUNC(16, 16, cpu); \
    SETUP_BLOCKCOPY_FUNC(16, 8, cpu); \
    SETUP_BLOCKCOPY_FUNC(8, 16, cpu); \
    SETUP_BLOCKCOPY_FUNC(16, 12, cpu); \
    SETUP_BLOCKCOPY_FUNC(12, 16, cpu); \
    SETUP_BLOCKCOPY_FUNC(16, 4, cpu); \
    SETUP_BLOCKCOPY_FUNC(4, 16, cpu); \
    SETUP_BLOCKCOPY_FUNC(32, 32, cpu); \
    SETUP_BLOCKCOPY_FUNC(32, 16, cpu); \
    SETUP_BLOCKCOPY_FUNC(16, 32, cpu); \
    SETUP_BLOCKCOPY_FUNC(32, 24, cpu); \
    SETUP_BLOCKCOPY_FUNC(24, 32, cpu); \
    SETUP_BLOCKCOPY_FUNC(32, 8, cpu); \
    SETUP_BLOCKCOPY_FUNC(8, 32, cpu); \
    SETUP_BLOCKCOPY_FUNC(64, 64, cpu); \
    SETUP_BLOCKCOPY_FUNC(64, 32, cpu); \
    SETUP_BLOCKCOPY_FUNC(32, 64, cpu); \
    SETUP_BLOCKCOPY_FUNC(64, 48, cpu); \
    SETUP_BLOCKCOPY_FUNC(48, 64, cpu); \
    SETUP_BLOCKCOPY_FUNC(64, 16, cpu); \
    SETUP_BLOCKCOPY_FUNC(16, 64, cpu)
104
/* Expands SETUP_BLOCKCOPY_SP once per listed W x H size for the given cpu
 * suffix. SETUP_BLOCKCOPY_SP must be defined at the point of expansion.
 * The last entry has no trailing semicolon; the invocation supplies it. */
#define BLOCKCOPY_SP(cpu) \
    SETUP_BLOCKCOPY_SP(2, 4, cpu); \
    SETUP_BLOCKCOPY_SP(2, 8, cpu); \
    SETUP_BLOCKCOPY_SP(6, 8, cpu); \
    \
    SETUP_BLOCKCOPY_SP(2, 16, cpu); \
    SETUP_BLOCKCOPY_SP(4, 32, cpu); \
    SETUP_BLOCKCOPY_SP(6, 16, cpu); \
    SETUP_BLOCKCOPY_SP(8, 12, cpu); \
    SETUP_BLOCKCOPY_SP(8, 64, cpu); \
    SETUP_BLOCKCOPY_SP(12, 32, cpu); \
    SETUP_BLOCKCOPY_SP(16, 24, cpu); \
    SETUP_BLOCKCOPY_SP(24, 64, cpu); \
    SETUP_BLOCKCOPY_SP(32, 48, cpu)
119
/* Expands SETUP_BLOCKCOPY_SS_PP once per listed W x H size for the given cpu
 * suffix. SETUP_BLOCKCOPY_SS_PP must be defined at the point of expansion.
 * The last entry has no trailing semicolon; the invocation supplies it. */
#define BLOCKCOPY_SS_PP(cpu) \
    SETUP_BLOCKCOPY_SS_PP(2, 4, cpu); \
    SETUP_BLOCKCOPY_SS_PP(2, 8, cpu); \
    SETUP_BLOCKCOPY_SS_PP(6, 8, cpu); \
    \
    SETUP_BLOCKCOPY_SS_PP(2, 16, cpu); \
    SETUP_BLOCKCOPY_SS_PP(4, 32, cpu); \
    SETUP_BLOCKCOPY_SS_PP(6, 16, cpu); \
    SETUP_BLOCKCOPY_SS_PP(8, 12, cpu); \
    SETUP_BLOCKCOPY_SS_PP(8, 64, cpu); \
    SETUP_BLOCKCOPY_SS_PP(12, 32, cpu); \
    SETUP_BLOCKCOPY_SS_PP(16, 24, cpu); \
    SETUP_BLOCKCOPY_SS_PP(24, 64, cpu); \
    SETUP_BLOCKCOPY_SS_PP(32, 48, cpu)
134
135
/* Expands SETUP_BLOCKCOPY_PS once per listed W x H size for the given cpu
 * suffix. SETUP_BLOCKCOPY_PS must be defined at the point of expansion.
 * The last entry has no trailing semicolon; the invocation supplies it. */
#define BLOCKCOPY_PS(cpu) \
    SETUP_BLOCKCOPY_PS(2, 4, cpu); \
    SETUP_BLOCKCOPY_PS(2, 8, cpu); \
    SETUP_BLOCKCOPY_PS(4, 2, cpu); \
    SETUP_BLOCKCOPY_PS(4, 4, cpu); \
    SETUP_BLOCKCOPY_PS(4, 8, cpu); \
    SETUP_BLOCKCOPY_PS(4, 16, cpu); \
    SETUP_BLOCKCOPY_PS(6, 8, cpu); \
    SETUP_BLOCKCOPY_PS(8, 2, cpu); \
    SETUP_BLOCKCOPY_PS(8, 4, cpu); \
    SETUP_BLOCKCOPY_PS(8, 6, cpu); \
    SETUP_BLOCKCOPY_PS(8, 8, cpu); \
    SETUP_BLOCKCOPY_PS(8, 16, cpu); \
    SETUP_BLOCKCOPY_PS(8, 32, cpu); \
    SETUP_BLOCKCOPY_PS(12, 16, cpu); \
    SETUP_BLOCKCOPY_PS(16, 4, cpu); \
    SETUP_BLOCKCOPY_PS(16, 8, cpu); \
    SETUP_BLOCKCOPY_PS(16, 12, cpu); \
    SETUP_BLOCKCOPY_PS(16, 16, cpu); \
    SETUP_BLOCKCOPY_PS(16, 32, cpu); \
    SETUP_BLOCKCOPY_PS(24, 32, cpu); \
    SETUP_BLOCKCOPY_PS(32, 8, cpu); \
    SETUP_BLOCKCOPY_PS(32, 16, cpu); \
    SETUP_BLOCKCOPY_PS(32, 24, cpu); \
    SETUP_BLOCKCOPY_PS(32, 32, cpu); \
    SETUP_BLOCKCOPY_PS(16, 64, cpu); \
    SETUP_BLOCKCOPY_PS(32, 64, cpu); \
    SETUP_BLOCKCOPY_PS(48, 64, cpu); \
    SETUP_BLOCKCOPY_PS(64, 16, cpu); \
    SETUP_BLOCKCOPY_PS(64, 32, cpu); \
    SETUP_BLOCKCOPY_PS(64, 48, cpu); \
    SETUP_BLOCKCOPY_PS(64, 64, cpu); \
    \
    SETUP_BLOCKCOPY_PS(2, 16, cpu); \
    SETUP_BLOCKCOPY_PS(4, 32, cpu); \
    SETUP_BLOCKCOPY_PS(6, 16, cpu); \
    SETUP_BLOCKCOPY_PS(8, 12, cpu); \
    SETUP_BLOCKCOPY_PS(8, 64, cpu); \
    SETUP_BLOCKCOPY_PS(12, 32, cpu); \
    SETUP_BLOCKCOPY_PS(16, 24, cpu); \
    SETUP_BLOCKCOPY_PS(24, 64, cpu); \
    SETUP_BLOCKCOPY_PS(32, 48, cpu)
178
/* Emit the block-copy prototypes: the COMMON and SS_PP size lists with the
 * _sse2 suffix, and the SP and PS size lists with the _sse4 suffix. */
BLOCKCOPY_COMMON(_sse2);
BLOCKCOPY_SS_PP(_sse2);
BLOCKCOPY_SP(_sse4);
BLOCKCOPY_PS(_sse4);

/* The SP size list is declared with the _sse2 suffix as well. */
BLOCKCOPY_SP(_sse2);
185
b53f7c52
JB
/* SSE2 "blockfill_s" prototypes for the 4x4, 8x8, 16x16 and 32x32 sizes.
 * Each takes an int16_t destination, a destination stride and an int16_t value.
 * NOTE(review): presumably every element of the block is set to `val` --
 * confirm against the implementation. */
void x265_blockfill_s_4x4_sse2(int16_t* dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_8x8_sse2(int16_t* dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_16x16_sse2(int16_t* dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_32x32_sse2(int16_t* dst, intptr_t dstride, int16_t val);
/* AVX "blockcopy_ss" prototypes (int16_t destination and source, each with
 * its own stride) for the 16-wide and 64-wide sizes listed below. */
void x265_blockcopy_ss_16x4_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_16x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_16x12_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_16x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_16x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_16x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_16x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_64x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
201
202void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
203void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
204void x265_blockcopy_pp_32x24_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
205void x265_blockcopy_pp_32x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
206void x265_blockcopy_pp_32x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
207void x265_blockcopy_pp_32x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb);
208
/* AVX2 "blockfill_s" prototypes for the 16x16 and 32x32 sizes; same parameter
 * list as the SSE2 blockfill_s prototypes (destination, stride, value). */
void x265_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_32x32_avx2(int16_t* dst, intptr_t dstride, int16_t val);
72b9787e
JB
211
/* The declaration helper macros are local to this header; remove them so they
 * do not leak into files that include it. */
#undef BLOCKCOPY_COMMON
#undef BLOCKCOPY_SS_PP
#undef BLOCKCOPY_SP
#undef BLOCKCOPY_PS
#undef SETUP_BLOCKCOPY_PS
#undef SETUP_BLOCKCOPY_SP
#undef SETUP_BLOCKCOPY_SS_PP
#undef SETUP_BLOCKCOPY_FUNC
220
#endif // ifndef X265_BLOCKCOPY8_H