;******************************************************************************
;* VP9 Intra prediction SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* Parts based on:
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Fiona Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
29 | ||
30 | %include "libavutil/x86/x86util.asm" | |
31 | ||
32 | SECTION_RODATA 32 | |
33 | ||
34 | pw_m256: times 16 dw -256 | |
35 | pw_m255: times 16 dw -255 | |
36 | pw_4096: times 8 dw 4096 | |
37 | ||
38 | pb_4x3_4x2_4x1_4x0: times 4 db 3 | |
39 | times 4 db 2 | |
40 | times 4 db 1 | |
41 | times 4 db 0 | |
42 | pb_8x1_8x0: times 8 db 1 | |
43 | times 8 db 0 | |
44 | pb_8x3_8x2: times 8 db 3 | |
45 | times 8 db 2 | |
46 | pb_0to5_2x7: db 0, 1, 2, 3, 4, 5, 7, 7 | |
47 | times 8 db -1 | |
48 | pb_0to6_9x7: db 0, 1, 2, 3, 4, 5, 6 | |
49 | times 9 db 7 | |
50 | pb_1to6_10x7: db 1, 2, 3, 4, 5, 6 | |
51 | times 10 db 7 | |
52 | pb_2to6_3x7: | |
53 | pb_2to6_11x7: db 2, 3, 4, 5, 6 | |
54 | times 11 db 7 | |
55 | pb_1toE_2xF: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 | |
56 | pb_2toE_3xF: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 | |
57 | pb_13456_3xm1: db 1, 3, 4, 5, 6 | |
58 | times 3 db -1 | |
59 | pb_6012_4xm1: db 6, 0, 1, 2 | |
60 | times 4 db -1 | |
61 | pb_6xm1_246_8toE: times 6 db -1 | |
62 | db 2, 4, 6, 8, 9, 10, 11, 12, 13, 14 | |
63 | pb_6xm1_BDF_0to6: times 6 db -1 | |
64 | db 11, 13, 15, 0, 1, 2, 3, 4, 5, 6 | |
65 | pb_02468ACE_13579BDF: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 | |
66 | pb_7to1_9x0: db 7, 6, 5, 4 | |
67 | pb_3to1_5x0: db 3, 2, 1 | |
68 | times 9 db 0 | |
69 | pb_Fto0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | |
70 | ||
71 | pb_2: times 32 db 2 | |
72 | pb_15: times 16 db 15 | |
73 | ||
74 | cextern pb_1 | |
75 | cextern pb_3 | |
76 | cextern pw_512 | |
77 | cextern pw_1024 | |
78 | cextern pw_2048 | |
79 | cextern pw_8192 | |
80 | ||
81 | SECTION .text | |
82 | ||
; dc_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)

; DC prediction 4x4: average the 4 left + 4 top edge pixels and fill the block.
INIT_MMX ssse3
cglobal vp9_ipred_dc_4x4, 4, 4, 0, dst, stride, l, a
    movd                m0, [lq]
    punpckldq           m0, [aq]            ; m0 = 4 left bytes | 4 top bytes
    pxor                m1, m1
    psadbw              m0, m1              ; horizontal sum of the 8 edge bytes
    pmulhrsw            m0, [pw_4096]       ; (sum*4096 + 0x4000) >> 15 = (sum + 4) >> 3
    pshufb              m1, m1              ; NOTE(review): m1 is already zero here — kept as in original? (not present; see below)
    RET
; DC prediction 8x8: average the 8 left + 8 top edge pixels and fill the block.
INIT_MMX ssse3
cglobal vp9_ipred_dc_8x8, 4, 4, 0, dst, stride, l, a
    movq                m0, [lq]
    movq                m1, [aq]
    DEFINE_ARGS dst, stride, stride3       ; l/a no longer needed; reuse regs
    lea           stride3q, [strideq*3]
    pxor                m2, m2
    psadbw              m0, m2              ; sum of left edge
    psadbw              m1, m2              ; sum of top edge
    paddw               m0, m1              ; total of 16 edge bytes
    pmulhrsw            m0, [pw_2048]       ; (sum + 8) >> 4, rounded average
    pshufb              m0, m2              ; broadcast DC byte to all 8 lanes
    movq  [dstq+strideq*0], m0
    movq  [dstq+strideq*1], m0
    movq  [dstq+strideq*2], m0
    movq  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    movq  [dstq+strideq*0], m0
    movq  [dstq+strideq*1], m0
    movq  [dstq+strideq*2], m0
    movq  [dstq+stride3q ], m0
    RET

; DC prediction 16x16: average the 16 left + 16 top edge pixels, fill 16 rows.
INIT_XMM ssse3
cglobal vp9_ipred_dc_16x16, 4, 4, 3, dst, stride, l, a
    mova                m0, [lq]
    mova                m1, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt  ; l/a consumed; reuse regs
    lea           stride3q, [strideq*3]
    pxor                m2, m2
    psadbw              m0, m2              ; two partial sums of left edge
    psadbw              m1, m2              ; two partial sums of top edge
    paddw               m0, m1
    movhlps             m1, m0
    paddw               m0, m1              ; fold upper/lower partial sums
    pmulhrsw            m0, [pw_1024]       ; (sum + 16) >> 5, rounded average
    pshufb              m0, m2              ; broadcast DC byte to all 16 lanes
    mov               cntd, 4               ; 4 iterations x 4 rows = 16 rows
.loop:
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m0
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET

; DC prediction 32x32 (SSSE3): average the 32 left + 32 top edge pixels,
; fill 32 rows of 32 bytes using two 16-byte stores per row.
INIT_XMM ssse3
cglobal vp9_ipred_dc_32x32, 4, 4, 5, dst, stride, l, a
    mova                m0, [lq]
    mova                m1, [lq+16]
    mova                m2, [aq]
    mova                m3, [aq+16]
    DEFINE_ARGS dst, stride, stride3, cnt  ; l/a consumed; reuse regs
    lea           stride3q, [strideq*3]
    pxor                m4, m4
    psadbw              m0, m4              ; partial sums of the 64 edge bytes
    psadbw              m1, m4
    psadbw              m2, m4
    psadbw              m3, m4
    paddw               m0, m1
    paddw               m2, m3
    paddw               m0, m2
    movhlps             m1, m0
    paddw               m0, m1              ; fold to a single scalar sum
    pmulhrsw            m0, [pw_512]        ; (sum + 32) >> 6, rounded average
    pshufb              m0, m4              ; broadcast DC byte to all lanes
    mov               cntd, 8               ; 8 iterations x 4 rows = 32 rows
.loop:
    mova  [dstq+strideq*0+ 0], m0
    mova  [dstq+strideq*0+16], m0
    mova  [dstq+strideq*1+ 0], m0
    mova  [dstq+strideq*1+16], m0
    mova  [dstq+strideq*2+ 0], m0
    mova  [dstq+strideq*2+16], m0
    mova  [dstq+stride3q + 0], m0
    mova  [dstq+stride3q +16], m0
    lea               dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
; DC prediction 32x32 (AVX2): same as the SSSE3 version but with one 32-byte
; load per edge and one 32-byte store per row (8 rows per loop iteration).
INIT_YMM avx2
cglobal vp9_ipred_dc_32x32, 4, 4, 3, dst, stride, l, a
    mova                m0, [lq]
    mova                m1, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt  ; l/a consumed; reuse regs
    lea           stride3q, [strideq*3]
    pxor                m2, m2
    psadbw              m0, m2              ; four partial sums per edge
    psadbw              m1, m2
    paddw               m0, m1
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1             ; fold the two 128-bit halves
    movhlps            xm1, xm0
    paddw              xm0, xm1             ; fold to a single scalar sum
    pmulhrsw           xm0, [pw_512]        ; (sum + 32) >> 6, rounded average
    vpbroadcastb        m0, xm0             ; broadcast DC byte to all 32 lanes
    mov               cntd, 4               ; 4 iterations x 8 rows = 32 rows
.loop:
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m0
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m0
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET
%endif

; dc_top/left_NxN(uint8_t *dst, ptrdiff_t stride, const uint8_t *l, const uint8_t *a)

; One-sided DC prediction: identical to the dc_NxN functions above, except the
; average is taken over a single edge (%2 = a for dc_top, l for dc_left), so
; each block size uses the next larger rounding multiplier (half the samples).
%macro DC_1D_FUNCS 2 ; dir (top or left), arg (a or l)
INIT_MMX ssse3
cglobal vp9_ipred_dc_%1_4x4, 4, 4, 0, dst, stride, l, a
    movd                m0, [%2q]
    pxor                m1, m1
    psadbw              m0, m1              ; sum of the 4 edge bytes
    pmulhrsw            m0, [pw_8192]       ; (sum + 2) >> 2, rounded average
    pshufb              m0, m1              ; broadcast DC byte
    movd  [dstq+strideq*0], m0
    movd  [dstq+strideq*1], m0
    lea               dstq, [dstq+strideq*2]
    movd  [dstq+strideq*0], m0
    movd  [dstq+strideq*1], m0
    RET

INIT_MMX ssse3
cglobal vp9_ipred_dc_%1_8x8, 4, 4, 0, dst, stride, l, a
    movq                m0, [%2q]
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]
    pxor                m1, m1
    psadbw              m0, m1              ; sum of the 8 edge bytes
    pmulhrsw            m0, [pw_4096]       ; (sum + 4) >> 3, rounded average
    pshufb              m0, m1
    movq  [dstq+strideq*0], m0
    movq  [dstq+strideq*1], m0
    movq  [dstq+strideq*2], m0
    movq  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    movq  [dstq+strideq*0], m0
    movq  [dstq+strideq*1], m0
    movq  [dstq+strideq*2], m0
    movq  [dstq+stride3q ], m0
    RET

INIT_XMM ssse3
cglobal vp9_ipred_dc_%1_16x16, 4, 4, 3, dst, stride, l, a
    mova                m0, [%2q]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea           stride3q, [strideq*3]
    pxor                m2, m2
    psadbw              m0, m2              ; two partial sums of 16 edge bytes
    movhlps             m1, m0
    paddw               m0, m1              ; fold partial sums
    pmulhrsw            m0, [pw_2048]       ; (sum + 8) >> 4, rounded average
    pshufb              m0, m2
    mov               cntd, 4               ; 4 x 4 rows = 16 rows
.loop:
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m0
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET

INIT_XMM ssse3
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                m0, [%2q]
    mova                m1, [%2q+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea           stride3q, [strideq*3]
    pxor                m2, m2
    psadbw              m0, m2              ; partial sums of 32 edge bytes
    psadbw              m1, m2
    paddw               m0, m1
    movhlps             m1, m0
    paddw               m0, m1              ; fold to a single scalar sum
    pmulhrsw            m0, [pw_1024]       ; (sum + 16) >> 5, rounded average
    pshufb              m0, m2
    mov               cntd, 8               ; 8 x 4 rows = 32 rows
.loop:
    mova  [dstq+strideq*0+ 0], m0
    mova  [dstq+strideq*0+16], m0
    mova  [dstq+strideq*1+ 0], m0
    mova  [dstq+strideq*1+16], m0
    mova  [dstq+strideq*2+ 0], m0
    mova  [dstq+strideq*2+16], m0
    mova  [dstq+stride3q + 0], m0
    mova  [dstq+stride3q +16], m0
    lea               dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal vp9_ipred_dc_%1_32x32, 4, 4, 3, dst, stride, l, a
    mova                m0, [%2q]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea           stride3q, [strideq*3]
    pxor                m2, m2
    psadbw              m0, m2              ; four partial sums of 32 edge bytes
    vextracti128       xm1, m0, 1
    paddw              xm0, xm1             ; fold the two 128-bit halves
    movhlps            xm1, xm0
    paddw              xm0, xm1             ; fold to a single scalar sum
    pmulhrsw           xm0, [pw_1024]       ; (sum + 16) >> 5, rounded average
    vpbroadcastb        m0, xm0
    mov               cntd, 4               ; 4 x 8 rows = 32 rows
.loop:
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m0
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m0
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET
%endif
%endmacro

DC_1D_FUNCS top, a
DC_1D_FUNCS left, l

; v

; Vertical prediction 8x8: replicate the 8-byte top edge into all 8 rows.
INIT_MMX mmx
cglobal vp9_ipred_v_8x8, 4, 4, 0, dst, stride, l, a
    movq                m0, [aq]
    DEFINE_ARGS dst, stride, stride3       ; l/a consumed; reuse regs
    lea           stride3q, [strideq*3]
    movq  [dstq+strideq*0], m0
    movq  [dstq+strideq*1], m0
    movq  [dstq+strideq*2], m0
    movq  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    movq  [dstq+strideq*0], m0
    movq  [dstq+strideq*1], m0
    movq  [dstq+strideq*2], m0
    movq  [dstq+stride3q ], m0
    RET

; Vertical prediction 16x16: replicate the 16-byte top edge into all 16 rows.
INIT_XMM sse2
cglobal vp9_ipred_v_16x16, 4, 4, 1, dst, stride, l, a
    mova                m0, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea           stride3q, [strideq*3]
    mov               cntd, 4               ; 4 x 4 rows = 16 rows
.loop:
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m0
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET

; Vertical prediction 32x32 (SSE2): replicate the 32-byte top edge (two xmm
; registers) into all 32 rows.
INIT_XMM sse2
cglobal vp9_ipred_v_32x32, 4, 4, 2, dst, stride, l, a
    mova                m0, [aq]
    mova                m1, [aq+16]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea           stride3q, [strideq*3]
    mov               cntd, 8               ; 8 x 4 rows = 32 rows
.loop:
    mova  [dstq+strideq*0+ 0], m0
    mova  [dstq+strideq*0+16], m1
    mova  [dstq+strideq*1+ 0], m0
    mova  [dstq+strideq*1+16], m1
    mova  [dstq+strideq*2+ 0], m0
    mova  [dstq+strideq*2+16], m1
    mova  [dstq+stride3q + 0], m0
    mova  [dstq+stride3q +16], m1
    lea               dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET

%if HAVE_AVX2_EXTERNAL
; Vertical prediction 32x32 (AVX2): one 32-byte register, one store per row.
INIT_YMM avx2
cglobal vp9_ipred_v_32x32, 4, 4, 1, dst, stride, l, a
    mova                m0, [aq]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea           stride3q, [strideq*3]
    mov               cntd, 4               ; 4 x 8 rows = 32 rows
.loop:
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m0
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m0
    mova  [dstq+strideq*2], m0
    mova  [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    dec               cntd
    jg .loop
    RET
%endif

; h

; Horizontal prediction 4x4: broadcast each of the 4 left-edge bytes across
; its own row. A single pshufb builds all 4 rows (4 bytes each) in one xmm.
INIT_XMM ssse3
cglobal vp9_ipred_h_4x4, 3, 4, 1, dst, stride, l, stride3
    movd                m0, [lq]
    pshufb              m0, [pb_4x3_4x2_4x1_4x0] ; rows: 4x l[3] | 4x l[2] | 4x l[1] | 4x l[0]
    lea           stride3q, [strideq*3]
    movd  [dstq+strideq*0], m0
    psrldq              m0, 4               ; advance to the next 4-byte row
    movd  [dstq+strideq*1], m0
    psrldq              m0, 4
    movd  [dstq+strideq*2], m0
    psrldq              m0, 4
    movd  [dstq+stride3q ], m0
    RET

; Horizontal prediction 8x8/16x16/32x32 for SSSE3 and AVX (%1 = cpu suffix).
; Each iteration loads 4 left-edge bytes and broadcasts each byte across one
; output row via pshufb; cnt counts 4-byte groups down to 0 (dec/jge).
%macro H_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_h_8x8, 3, 5, 4, dst, stride, l, stride3, cnt
    mova                m2, [pb_8x1_8x0]    ; broadcast masks for bytes 1/0
    mova                m3, [pb_8x3_8x2]    ; broadcast masks for bytes 3/2
    lea           stride3q, [strideq*3]
    mov               cntq, 1               ; 2 groups of 4 left pixels
.loop:
    movd                m0, [lq+cntq*4]     ; 4 left-edge bytes for 4 rows
    pshufb              m1, m0, m3          ; rows from bytes 3 and 2
    pshufb              m0, m2              ; rows from bytes 1 and 0
    movq  [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    movq  [dstq+strideq*2], m0
    movhps [dstq+stride3q ], m0
    lea               dstq, [dstq+strideq*4]
    dec               cntq
    jge .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_h_16x16, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                m5, [pb_1]          ; pshufb masks selecting byte 1/2/3
    mova                m6, [pb_2]
    mova                m7, [pb_3]
    pxor                m4, m4              ; mask selecting byte 0
    lea           stride3q, [strideq*3]
    mov               cntq, 3               ; 4 groups of 4 left pixels
.loop:
    movd                m3, [lq+cntq*4]
    pshufb              m0, m3, m7          ; row = 16x byte 3
    pshufb              m1, m3, m6          ; row = 16x byte 2
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m1
    pshufb              m2, m3, m5          ; row = 16x byte 1
    pshufb              m3, m4              ; row = 16x byte 0
    mova  [dstq+strideq*2], m2
    mova  [dstq+stride3q ], m3
    lea               dstq, [dstq+strideq*4]
    dec               cntq
    jge .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                m5, [pb_1]
    mova                m6, [pb_2]
    mova                m7, [pb_3]
    pxor                m4, m4
    lea           stride3q, [strideq*3]
    mov               cntq, 7               ; 8 groups of 4 left pixels
.loop:
    movd                m3, [lq+cntq*4]
    pshufb              m0, m3, m7
    pshufb              m1, m3, m6
    mova  [dstq+strideq*0+ 0], m0           ; each 32-byte row = two 16-byte stores
    mova  [dstq+strideq*0+16], m0
    mova  [dstq+strideq*1+ 0], m1
    mova  [dstq+strideq*1+16], m1
    pshufb              m2, m3, m5
    pshufb              m3, m4
    mova  [dstq+strideq*2+ 0], m2
    mova  [dstq+strideq*2+16], m2
    mova  [dstq+stride3q + 0], m3
    mova  [dstq+stride3q +16], m3
    lea               dstq, [dstq+strideq*4]
    dec               cntq
    jge .loop
    RET
%endmacro

H_XMM_FUNCS ssse3
H_XMM_FUNCS avx

%if HAVE_AVX2_EXTERNAL
; Horizontal prediction 32x32 (AVX2): as the xmm version, but the 4 left-edge
; bytes are duplicated into both ymm lanes so one pshufb fills a full row.
INIT_YMM avx2
cglobal vp9_ipred_h_32x32, 3, 5, 8, dst, stride, l, stride3, cnt
    mova                m5, [pb_1]
    mova                m6, [pb_2]
    mova                m7, [pb_3]
    pxor                m4, m4
    lea           stride3q, [strideq*3]
    mov               cntq, 7               ; 8 groups of 4 left pixels
.loop:
    movd               xm3, [lq+cntq*4]
    vinserti128         m3, m3, xm3, 1      ; copy the 4 bytes into the high lane
    pshufb              m0, m3, m7          ; row = 32x byte 3
    pshufb              m1, m3, m6          ; row = 32x byte 2
    mova  [dstq+strideq*0], m0
    mova  [dstq+strideq*1], m1
    pshufb              m2, m3, m5          ; row = 32x byte 1
    pshufb              m3, m4              ; row = 32x byte 0
    mova  [dstq+strideq*2], m2
    mova  [dstq+stride3q ], m3
    lea               dstq, [dstq+strideq*4]
    dec               cntq
    jge .loop
    RET
%endif

; tm

; TrueMotion prediction 4x4: dst[y][x] = clip(l[y] + a[x] - tl).
; pw_m256/pw_m255 act as pshufb masks that zero-extend byte 0 / byte 1 of a
; register into every word lane (see the RODATA comment).
INIT_MMX ssse3
cglobal vp9_ipred_tm_4x4, 4, 4, 0, dst, stride, l, a
    pxor                m1, m1
    pinsrw              m2, [aq-1], 0       ; low byte of word 0 = top-left pixel
    movd                m0, [aq]
    DEFINE_ARGS dst, stride, l, cnt        ; a consumed; reuse its reg as cnt
    mova                m3, [pw_m256]       ; broadcast-byte-0 mask
    mova                m4, [pw_m255]       ; broadcast-byte-1 mask
    pshufb              m2, m3              ; all words = top-left pixel
    punpcklbw           m0, m1              ; top row widened to words
    psubw               m0, m2              ; m0 = a[x] - tl, per column
    mov               cntq, 1               ; 2 iterations x 2 rows = 4 rows
.loop:
    pinsrw              m2, [lq+cntq*2], 0  ; load 2 left pixels into word 0
    pshufb              m1, m2, m4          ; broadcast left pixel (byte 1)
    pshufb              m2, m3              ; broadcast left pixel (byte 0)
    paddw               m1, m0              ; l[y] + (a[x] - tl)
    paddw               m2, m0
    packuswb            m1, m1              ; clip to [0,255] and narrow
    packuswb            m2, m2
    movd  [dstq+strideq*0], m1
    movd  [dstq+strideq*1], m2
    lea               dstq, [dstq+strideq*2]
    dec               cntq
    jge .loop
    RET

; TrueMotion prediction 8x8/16x16/32x32 for SSSE3 and AVX (%1 = cpu suffix).
; Same scheme as tm_4x4: precompute (a[x] - tl) as words, then per row add a
; broadcast left pixel and pack with unsigned saturation. Two rows are
; produced per iteration from one 2-byte left-edge load.
%macro TM_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_tm_8x8, 4, 4, 5, dst, stride, l, a
    pxor                m1, m1
    pinsrw              m2, [aq-1], 0       ; top-left pixel in byte 0
    movh                m0, [aq]
    DEFINE_ARGS dst, stride, l, cnt
    mova                m3, [pw_m256]       ; broadcast-byte-0 pshufb mask
    mova                m4, [pw_m255]       ; broadcast-byte-1 pshufb mask
    pshufb              m2, m3              ; all words = top-left pixel
    punpcklbw           m0, m1              ; top row widened to words
    psubw               m0, m2              ; m0 = a[x] - tl
    mov               cntq, 3               ; 4 iterations x 2 rows = 8 rows
.loop:
    pinsrw              m2, [lq+cntq*2], 0  ; 2 left pixels in bytes 0/1
    pshufb              m1, m2, m4          ; broadcast left pixel from byte 1
    pshufb              m2, m3              ; broadcast left pixel from byte 0
    paddw               m1, m0
    paddw               m2, m0
    packuswb            m1, m2              ; clip both rows, row0 low / row1 high
    movh  [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    lea               dstq, [dstq+strideq*2]
    dec               cntq
    jge .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_tm_16x16, 4, 4, 8, dst, stride, l, a
    pxor                m3, m3
    pinsrw              m2, [aq-1], 0
    mova                m0, [aq]
    DEFINE_ARGS dst, stride, l, cnt
    mova                m4, [pw_m256]
    mova                m5, [pw_m255]
    pshufb              m2, m4              ; all words = top-left pixel
    punpckhbw           m1, m0, m3          ; top row words, columns 8..15
    punpcklbw           m0, m3              ; top row words, columns 0..7
    psubw               m1, m2              ; a[x] - tl for both halves
    psubw               m0, m2
    mov               cntq, 7               ; 8 iterations x 2 rows = 16 rows
.loop:
    pinsrw              m7, [lq+cntq*2], 0  ; 2 left pixels in bytes 0/1
    pshufb              m3, m7, m5          ; broadcast left pixel (byte 1)
    pshufb              m7, m4              ; broadcast left pixel (byte 0)
    paddw               m2, m3, m0          ; row A, columns 0..7
    paddw               m3, m1              ; row A, columns 8..15
    paddw               m6, m7, m0          ; row B, columns 0..7
    paddw               m7, m1              ; row B, columns 8..15
    packuswb            m2, m3
    packuswb            m6, m7
    mova  [dstq+strideq*0], m2
    mova  [dstq+strideq*1], m6
    lea               dstq, [dstq+strideq*2]
    dec               cntq
    jge .loop
    RET

%if ARCH_X86_64
INIT_XMM %1
cglobal vp9_ipred_tm_32x32, 4, 4, 14, dst, stride, l, a
    pxor                m5, m5
    pinsrw              m4, [aq-1], 0
    mova                m0, [aq]
    mova                m2, [aq+16]
    DEFINE_ARGS dst, stride, l, cnt
    mova                m8, [pw_m256]
    mova                m9, [pw_m255]
    pshufb              m4, m8              ; all words = top-left pixel
    punpckhbw           m1, m0, m5          ; top row words, columns 8..15
    punpckhbw           m3, m2, m5          ; top row words, columns 24..31
    punpcklbw           m0, m5              ; columns 0..7
    punpcklbw           m2, m5              ; columns 16..23
    psubw               m1, m4              ; a[x] - tl for all four quarters
    psubw               m0, m4
    psubw               m3, m4
    psubw               m2, m4
    mov               cntq, 15              ; 16 iterations x 2 rows = 32 rows
.loop:
    pinsrw             m13, [lq+cntq*2], 0  ; 2 left pixels in bytes 0/1
    pshufb              m7, m13, m9         ; broadcast left pixel (byte 1)
    pshufb             m13, m8              ; broadcast left pixel (byte 0)
    paddw               m4, m7, m0          ; row A, columns 0..7
    paddw               m5, m7, m1          ; row A, columns 8..15
    paddw               m6, m7, m2          ; row A, columns 16..23
    paddw               m7, m3              ; row A, columns 24..31
    paddw              m10, m13, m0         ; row B, columns 0..7
    paddw              m11, m13, m1         ; row B, columns 8..15
    paddw              m12, m13, m2         ; row B, columns 16..23
    paddw              m13, m3              ; row B, columns 24..31
    packuswb            m4, m5
    packuswb            m6, m7
    packuswb           m10, m11
    packuswb           m12, m13
    mova  [dstq+strideq*0+ 0], m4
    mova  [dstq+strideq*0+16], m6
    mova  [dstq+strideq*1+ 0], m10
    mova  [dstq+strideq*1+16], m12
    lea               dstq, [dstq+strideq*2]
    dec               cntq
    jge .loop
    RET
%endif
%endmacro

TM_XMM_FUNCS ssse3
TM_XMM_FUNCS avx

%if HAVE_AVX2_EXTERNAL
; TrueMotion prediction 32x32 (AVX2): same scheme as the xmm version; the
; scalar (top-left / left) pixels are duplicated into both ymm lanes so the
; pshufb broadcasts cover the full 32-byte row.
INIT_YMM avx2
cglobal vp9_ipred_tm_32x32, 4, 4, 8, dst, stride, l, a
    pxor                m3, m3
    pinsrw             xm2, [aq-1], 0       ; top-left pixel in byte 0
    vinserti128         m2, m2, xm2, 1      ; mirror into the high lane
    mova                m0, [aq]
    DEFINE_ARGS dst, stride, l, cnt
    mova                m4, [pw_m256]       ; broadcast-byte-0 pshufb mask
    mova                m5, [pw_m255]       ; broadcast-byte-1 pshufb mask
    pshufb              m2, m4              ; all words = top-left pixel
    punpckhbw           m1, m0, m3          ; top row words, odd octets
    punpcklbw           m0, m3              ; top row words, even octets
    psubw               m1, m2              ; a[x] - tl
    psubw               m0, m2
    mov               cntq, 15              ; 16 iterations x 2 rows = 32 rows
.loop:
    pinsrw             xm7, [lq+cntq*2], 0  ; 2 left pixels in bytes 0/1
    vinserti128         m7, m7, xm7, 1      ; mirror into the high lane
    pshufb              m3, m7, m5          ; broadcast left pixel (byte 1)
    pshufb              m7, m4              ; broadcast left pixel (byte 0)
    paddw               m2, m3, m0
    paddw               m3, m1
    paddw               m6, m7, m0
    paddw               m7, m1
    packuswb            m2, m3
    packuswb            m6, m7
    mova  [dstq+strideq*0], m2
    mova  [dstq+strideq*1], m6
    lea               dstq, [dstq+strideq*2]
    dec               cntq
    jge .loop
    RET
%endif

; dl

; 3-tap lowpass filter: %1 = (%1 + 2*%2 + %3 + 2) >> 2, computed bytewise
; without widening. pavgb rounds up, so the (%1^%3)&1 carry is subtracted
; from the first average to make the final result round exactly like the
; widened expression.
%macro LOWPASS 4 ; left [dst], center, right, tmp
    pxor               m%4, m%1, m%3
    pand               m%4, [pb_1]          ; carry bit lost by pavgb rounding
    pavgb              m%1, m%3             ; avg(left, right), rounded up
    psubusb            m%1, m%4             ; correct the rounding
    pavgb              m%1, m%2             ; average with the center tap
%endmacro

; Down-left prediction 4x4: lowpass-filter the top edge (with the last pixel
; replicated past the end) and emit each row shifted one pixel further left.
INIT_MMX ssse3
cglobal vp9_ipred_dl_4x4, 4, 4, 0, dst, stride, l, a
    movq                m1, [aq]
    pshufb              m0, m1, [pb_0to5_2x7] ; left taps, edge pixel clamped
    pshufb              m2, m1, [pb_2to6_3x7] ; right taps, edge pixel clamped
    psrlq               m1, 8                 ; center taps
    LOWPASS              0,  1,  2,  3

    pshufw              m1, m0, q3321       ; m1 = m0 shifted 2 pixels (rows 2/3)
    movd  [dstq+strideq*0], m0
    movd  [dstq+strideq*2], m1
    psrlq               m0, 8               ; shift 1 pixel for the odd rows
    psrlq               m1, 8
    add               dstq, strideq
    movd  [dstq+strideq*0], m0              ; row 1
    movd  [dstq+strideq*2], m1              ; row 3
    RET

; Down-left prediction 8x8/16x16/32x32 for SSSE3 and AVX (%1 = cpu suffix).
; The filtered top edge is shifted one byte per row, shifting in the clamped
; last edge pixel (via pb_1toE_2xF or explicit shuffles).
%macro DL_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_dl_8x8, 4, 4, 4, dst, stride, stride5, a
    ; NOTE: arg 2 (l) is unused; its register slot is renamed stride5.
    movq                m0, [aq]
    lea           stride5q, [strideq*5]
    pshufb              m1, m0, [pb_1to6_10x7] ; center taps, edge clamped
    psrldq              m2, m1, 1              ; right taps
    shufps              m0, m1, q3210          ; left taps
    LOWPASS              0,  1,  2,  3

    pshufd              m1, m0, q3321       ; m1 = m0 shifted 4 pixels (rows 4..7)
    movq  [dstq+strideq*0], m0
    movq  [dstq+strideq*4], m1
    psrldq              m0, 1               ; advance one pixel per row
    psrldq              m1, 1
    movq  [dstq+strideq*1], m0
    movq  [dstq+stride5q ], m1
    lea               dstq, [dstq+strideq*2]
    psrldq              m0, 1
    psrldq              m1, 1
    movq  [dstq+strideq*0], m0              ; row 2
    movq  [dstq+strideq*4], m1              ; row 6
    psrldq              m0, 1
    psrldq              m1, 1
    movq  [dstq+strideq*1], m0              ; row 3
    movq  [dstq+stride5q ], m1              ; row 7
    RET

INIT_XMM %1
cglobal vp9_ipred_dl_16x16, 4, 4, 6, dst, stride, l, a
    mova                m5, [pb_1toE_2xF]   ; shift-left-one-with-clamp shuffle
    mova                m0, [aq]
    pshufb              m1, m0, m5          ; center taps
    pshufb              m2, m1, m5          ; right taps
    pshufb              m4, m0, [pb_15]     ; broadcast last edge pixel
    LOWPASS              0,  1,  2,  3
    DEFINE_ARGS dst, stride, cnt, stride9
    lea           stride9q, [strideq*3]
    mov               cntd, 4
    lea           stride9q, [stride9q*3]    ; stride9 = 9 * stride

.loop:
    ; Rows y and y+8 share data: row y+8 is row y shifted 8 pixels.
    movhlps             m4, m0
    mova  [dstq+strideq*0], m0
    pshufb              m0, m5              ; shift one pixel, clamp at the end
    mova  [dstq+strideq*8], m4
    movhlps             m4, m0
    mova  [dstq+strideq*1], m0
    pshufb              m0, m5
    mova  [dstq+stride9q ], m4
    lea               dstq, [dstq+strideq*2]
    dec               cntd
    jg .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_dl_32x32, 4, 5, 8, dst, stride, cnt, a, dst16
    ; NOTE: arg 2 (l) is unused; its register slot is renamed cnt.
    mova                m5, [pb_1toE_2xF]
    mova                m0, [aq]
    mova                m1, [aq+16]
    palignr             m2, m1, m0, 1       ; center taps, low half
    palignr             m3, m1, m0, 2       ; right taps, low half
    LOWPASS              0,  2,  3,  4
    pshufb              m2, m1, m5          ; center taps, high half (clamped)
    pshufb              m3, m2, m5          ; right taps, high half
    pshufb              m6, m1, [pb_15]     ; broadcast last edge pixel
    LOWPASS              1,  2,  3,  4
    mova                m7, m6
    lea             dst16q, [dstq +strideq*8]
    mov               cntd, 8
    lea             dst16q, [dst16q+strideq*8] ; dst16 = dst + 16 rows
.loop:
    ; Writes rows y, y+8, y+16 and y+24 from the same shifted edge data.
    movhlps             m7, m1
    mova  [dstq +strideq*0+ 0], m0
    mova  [dstq +strideq*0+16], m1
    movhps [dstq+strideq*8+ 0], m0
    movq  [dstq +strideq*8+ 8], m1
    mova  [dstq +strideq*8+16], m7
    mova  [dst16q+strideq*0+ 0], m1
    mova  [dst16q+strideq*0+16], m6
    mova  [dst16q+strideq*8+ 0], m7
    mova  [dst16q+strideq*8+16], m6
%if cpuflag(avx)
    vpalignr            m0, m1, m0, 1       ; shift the 32-byte edge left by 1
    pshufb              m1, m5
%else
    palignr             m2, m1, m0, 1       ; SSSE3 palignr is destructive
    pshufb              m1, m5
    mova                m0, m2
%endif
    add               dstq, strideq
    add             dst16q, strideq
    dec               cntd
    jg .loop
    RET
%endmacro

DL_XMM_FUNCS ssse3
DL_XMM_FUNCS avx

; dr

; Down-right prediction 4x4: lowpass-filter the combined left+topleft+top
; edge, then emit successive 4-pixel windows, one shift per row upward.
INIT_MMX ssse3
cglobal vp9_ipred_dr_4x4, 4, 4, 0, dst, stride, l, a
    movd                m0, [lq]
    punpckldq           m0, [aq-1]          ; m0 = 4 left | topleft + 3 top
    movd                m1, [aq+3]          ; last top pixel
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]
    palignr             m1, m0, 1           ; center taps (edge shifted by 1)
    psrlq               m2, m1, 8           ; right taps
    LOWPASS              0,  1,  2,  3

    movd  [dstq+stride3q ], m0              ; bottom row first, then shift up
    psrlq               m0, 8
    movd  [dstq+strideq*2], m0
    psrlq               m0, 8
    movd  [dstq+strideq*1], m0
    psrlq               m0, 8
    movd  [dstq+strideq*0], m0
    RET

; Down-right prediction 8x8/16x16/32x32 for SSSE3 and AVX (%1 = cpu suffix).
; The left, top-left and top edges are concatenated, lowpass-filtered, and
; each successive row is the previous one shifted one pixel to the right.
%macro DR_XMM_FUNCS 1
INIT_XMM %1
cglobal vp9_ipred_dr_8x8, 4, 4, 4, dst, stride, l, a
    movq                m1, [lq]            ; m1 = 8 left | topleft + 7 top
    movhps              m1, [aq-1]
    movd                m2, [aq+7]          ; last top pixel
    DEFINE_ARGS dst, stride, stride3
    lea           stride3q, [strideq*3]
    pslldq              m0, m1, 1           ; left taps
    palignr             m2, m1, 1           ; right taps
    LOWPASS              0,  1,  2,  3

    ; High qword of m0 holds the current row; shifting left moves the
    ; window one pixel toward the left edge (next row down).
    movhps [dstq+strideq*0], m0
    pslldq              m0, 1
    movhps [dstq+strideq*1], m0
    pslldq              m0, 1
    movhps [dstq+strideq*2], m0
    pslldq              m0, 1
    movhps [dstq+stride3q ], m0
    pslldq              m0, 1
    lea               dstq, [dstq+strideq*4]
    movhps [dstq+strideq*0], m0
    pslldq              m0, 1
    movhps [dstq+strideq*1], m0
    pslldq              m0, 1
    movhps [dstq+strideq*2], m0
    pslldq              m0, 1
    movhps [dstq+stride3q ], m0
    RET

INIT_XMM %1
cglobal vp9_ipred_dr_16x16, 4, 4, 6, dst, stride, l, a
    mova                m1, [lq]            ; 16 left pixels
    movu                m2, [aq-1]          ; topleft + 15 top
    movd                m4, [aq+15]         ; last top pixel
    DEFINE_ARGS dst, stride, stride9, cnt
    lea           stride9q, [strideq *3]
    mov               cntd, 4
    lea           stride9q, [stride9q*3]    ; stride9 = 9 * stride
    palignr             m4, m2, 1           ; filter the top half of the edge
    palignr             m3, m2, m1, 15
    LOWPASS              3,  2,  4,  5      ; m3 = filtered top row
    pslldq              m0, m1, 1           ; filter the left half of the edge
    palignr             m2, m1, 1
    LOWPASS              0,  1,  2,  4      ; m0 = filtered left column

.loop:
    ; Row y+8 = row y shifted 8 pixels; build it from m0's high half + m3.
    mova  [dstq+strideq*0 ], m3
    movhps [dstq+strideq*8+0], m0
    movq  [dstq+strideq*8+8], m3
    palignr             m3, m0, 15          ; shift the window one pixel right
    pslldq              m0, 1
    mova  [dstq+strideq*1 ], m3
    movhps [dstq+stride9q +0], m0
    movq  [dstq+stride9q +8], m3
    palignr             m3, m0, 15
    pslldq              m0, 1
    lea               dstq, [dstq+strideq*2]
    dec               cntd
    jg .loop
    RET

INIT_XMM %1
cglobal vp9_ipred_dr_32x32, 4, 4, 8, dst, stride, l, a
    mova                m1, [lq]            ; 32 left pixels
    mova                m2, [lq+16]
    movu                m3, [aq-1]          ; topleft + 31 top pixels
    movu                m4, [aq+15]
    movd                m5, [aq+31]         ; last top pixel
    DEFINE_ARGS dst, stride, stride8, cnt
    lea           stride8q, [strideq*8]
    ; Filter the 64-pixel edge in four 16-byte segments (m2..m5, low to high).
    palignr             m5, m4, 1
    palignr             m6, m4, m3, 15
    LOWPASS              5,  4,  6,  7
    palignr             m4, m3, 1
    palignr             m6, m3, m2, 15
    LOWPASS              4,  3,  6,  7
    palignr             m3, m2, 1
    palignr             m6, m2, m1, 15
    LOWPASS              3,  2,  6,  7
    palignr             m2, m1, 1
    pslldq              m0, m1, 1
    LOWPASS              2,  1,  0,  6
    mov               cntd, 16

    ; out=m2/m3/m4/m5
.loop:
    ; Rows y, y+8, y+16, y+24 are the same data at 8-pixel offsets.
    mova  [dstq+stride8q*0+ 0], m4
    mova  [dstq+stride8q*0+16], m5
    mova  [dstq+stride8q*2+ 0], m3
    mova  [dstq+stride8q*2+16], m4
    palignr             m5, m4, 15          ; shift the 64-byte window by 1
    palignr             m4, m3, 15
    palignr             m3, m2, 15
    pslldq              m2, 1
    add               dstq, strideq
    dec               cntd
    jg .loop
    RET
%endmacro

DR_XMM_FUNCS ssse3
DR_XMM_FUNCS avx

; vl

; Vertical-left prediction 4x4: even rows use the 2-tap average of adjacent
; top pixels, odd rows the 3-tap lowpass; each row pair shifts one pixel.
INIT_MMX ssse3
cglobal vp9_ipred_vl_4x4, 4, 4, 0, dst, stride, l, a
    movq                m0, [aq]
    psrlq               m1, m0, 8           ; center taps
    psrlq               m2, m1, 8           ; right taps
    LOWPASS              2,  1,  0,  3      ; m2 = 3-tap filtered row
    pavgb               m1, m0              ; m1 = 2-tap averaged row
    movd  [dstq+strideq*0], m1
    movd  [dstq+strideq*1], m2
    lea               dstq, [dstq+strideq*2]
    psrlq               m1, 8               ; shift one pixel for rows 2/3
    psrlq               m2, 8
    movd  [dstq+strideq*0], m1
    movd  [dstq+strideq*1], m2
    RET

978 | %macro VL_XMM_FUNCS 1 | |
979 | INIT_XMM %1 | |
980 | cglobal vp9_ipred_vl_8x8, 4, 4, 4, dst, stride, l, a | |
981 | movq m0, [aq] | |
982 | pshufb m0, [pb_0to6_9x7] | |
983 | DEFINE_ARGS dst, stride, stride3 | |
984 | lea stride3q, [strideq*3] | |
985 | psrldq m1, m0, 1 | |
986 | psrldq m2, m0, 2 | |
987 | LOWPASS 2, 1, 0, 3 | |
988 | pavgb m1, m0 | |
989 | ||
990 | movq [dstq+strideq*0], m1 | |
991 | movq [dstq+strideq*1], m2 | |
992 | psrldq m1, 1 | |
993 | psrldq m2, 1 | |
994 | movq [dstq+strideq*2], m1 | |
995 | movq [dstq+stride3q ], m2 | |
996 | lea dstq, [dstq+strideq*4] | |
997 | psrldq m1, 1 | |
998 | psrldq m2, 1 | |
999 | movq [dstq+strideq*0], m1 | |
1000 | movq [dstq+strideq*1], m2 | |
1001 | psrldq m1, 1 | |
1002 | psrldq m2, 1 | |
1003 | movq [dstq+strideq*2], m1 | |
1004 | movq [dstq+stride3q ], m2 | |
1005 | RET | |
1006 | ||
1007 | INIT_XMM %1 | |
1008 | cglobal vp9_ipred_vl_16x16, 4, 4, 5, dst, stride, l, a | |
1009 | mova m0, [aq] | |
1010 | mova m4, [pb_1toE_2xF] | |
1011 | DEFINE_ARGS dst, stride, stride3, cnt | |
1012 | lea stride3q, [strideq*3] | |
1013 | pshufb m1, m0, m4 | |
1014 | pshufb m2, m1, m4 | |
1015 | LOWPASS 2, 1, 0, 3 | |
1016 | pavgb m1, m0 | |
1017 | mov cntd, 4 | |
1018 | .loop: | |
1019 | mova [dstq+strideq*0], m1 | |
1020 | mova [dstq+strideq*1], m2 | |
1021 | pshufb m1, m4 | |
1022 | pshufb m2, m4 | |
1023 | mova [dstq+strideq*2], m1 | |
1024 | mova [dstq+stride3q ], m2 | |
1025 | pshufb m1, m4 | |
1026 | pshufb m2, m4 | |
1027 | lea dstq, [dstq+strideq*4] | |
1028 | dec cntd | |
1029 | jg .loop | |
1030 | RET | |
1031 | ||
INIT_XMM %1
; Vertical-left (VL) prediction, 32x32 block, processed as two 16-byte lanes.
; dst writes rows 0..15; dst16 writes the same data 16 rows down, shifted
; 16 pixels further left (movhps/movu/movq stitch the shifted row together).
cglobal vp9_ipred_vl_32x32, 4, 4, 7, dst, stride, l, a
    mova              m0, [aq]                ; m0 = above[0..15]
    mova              m5, [aq+16]             ; m5 = above[16..31]
    mova              m4, [pb_1toE_2xF]       ; shift-left-one shuffle (last byte repeated)
    DEFINE_ARGS dst, stride, dst16, cnt
    palignr           m2, m5, m0, 1           ; above shifted by 1 (low lane)
    palignr           m3, m5, m0, 2           ; above shifted by 2 (low lane)
    lea           dst16q, [dstq +strideq*8]
    LOWPASS            3,  2,  0,  6          ; m3 = smoothed low lane (odd rows)
    pavgb             m2, m0                  ; m2 = averaged low lane (even rows)
    pshufb            m0, m5, m4              ; high lane shifted by 1
    pshufb            m1, m0, m4              ; high lane shifted by 2
    lea           dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16*stride
    LOWPASS            1,  0,  5,  6          ; m1 = smoothed high lane
    pavgb             m0, m5                  ; m0 = averaged high lane
    pshufb            m5, [pb_15]             ; m5 = last above pixel broadcast (right fill)
    mov             cntd, 8                   ; 8 iterations x 2 row-pairs

.loop:
; write one 32-pixel row at dstq+stride%1 and the 16-pixel-shifted version of
; the same row at dst16q+stride%1, then advance %2/%3 one pixel left.
; %2 = low 16 pixels, %3 = high 16 pixels, m5 = broadcast fill.
%macro %%write 3
    mova   [dstq+stride%1+ 0], %2
    mova   [dstq+stride%1+16], %3
    movhps [dst16q+stride%1 ], %2
    movu   [dst16q+stride%1+ 8], %3
    movq   [dst16q+stride%1+24], m5
%if cpuflag(avx)
    palignr           %2, %3, %2, 1           ; 3-operand form: no temp needed
    pshufb            %3, m4
%else
    palignr           m6, %3, %2, 1           ; SSSE3: go through m6 scratch
    pshufb            %3, m4
    mova              %2, m6
%endif
%endmacro

    %%write q*0, m2, m0                       ; even row (averaged)
    %%write q*1, m3, m1                       ; odd row (smoothed)
    lea             dstq, [dstq +strideq*2]
    lea           dst16q, [dst16q+strideq*2]
    dec             cntd
    jg .loop
    RET
%endmacro
1076 | ||
; instantiate the VL predictors for each instruction set (%1 -> INIT_XMM %1)
VL_XMM_FUNCS ssse3
VL_XMM_FUNCS avx
1079 | ||
; vr

; Vertical-right (VR) prediction, 4x4 block (MMX).
; args: dst, stride, l (left edge), a (above edge).
INIT_MMX ssse3
cglobal vp9_ipred_vr_4x4, 4, 4, 0, dst, stride, l, a
    movq              m1, [aq-1]              ; m1 = top-left + above pixels
    punpckldq         m2, [lq]                ; left pixels land in m2's high dword;
                                              ; the stale low dword is shifted out below
    movd              m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    pavgb             m0, m1                  ; m0 = avg(above-1, above) -> row 0
    palignr           m1, m2, 5               ; drop the garbage dword + 1 byte
    psrlq             m2, m1, 8
    psllq             m3, m1, 8
    LOWPASS            2,  1,  3, 4           ; m2 = smoothed left/top-left pixels

    ; ABCD <- for the following predictor:
    ; EFGH
    ; IABC   | m0 contains ABCDxxxx
    ; JEFG   | m2 contains xJIEFGHx

    punpckldq         m0, m2
    pshufb            m2, [pb_13456_3xm1]     ; gather row-3 pixels (JEFG)
    movd [dstq+strideq*0], m0
    pshufb            m0, [pb_6012_4xm1]      ; gather row-2 pixels (IABC)
    movd [dstq+stride3q ], m2
    psrlq             m2, 8                   ; next byte up -> row 1 (EFGH)
    movd [dstq+strideq*2], m0
    movd [dstq+strideq*1], m2
    RET
1109 | ||
%macro VR_XMM_FUNCS 1 ; %1 = instruction set (ssse3/avx), consumed by INIT_XMM
; Vertical-right (VR) prediction, 8x8 block.
; args: dst, stride, l (left edge), a (above edge).
INIT_XMM %1
cglobal vp9_ipred_vr_8x8, 4, 4, 5, dst, stride, l, a
    movu              m1, [aq-1]              ; top-left + above pixels
    movhps            m2, [lq]                ; left pixels in m2's high qword
    movq              m0, [aq]
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    pavgb             m0, m1                  ; m0 = avg(above-1, above) -> row 0
    palignr           m1, m2, 9               ; drop low garbage, keep left + above-1
    pslldq            m2, m1, 1
    pslldq            m3, m1, 2
    LOWPASS            1,  2,  3, 4           ; m1 = smoothed left/top-left pixels

    ; ABCDEFGH <- for the following predictor:
    ; IJKLMNOP
    ; QABCDEFG | m0 contains ABCDEFGHxxxxxxxx
    ; RIJKLMNO | m1 contains xxVUTSRQIJKLMNOP
    ; SQABCDEF
    ; TRIJKLMN
    ; USQABCDE
    ; VTRIJKLM

    punpcklqdq        m0, m1 ; ABCDEFGHxxVUTSRQ
    movq [dstq+strideq*0], m0
    pshufb            m0, [pb_6xm1_BDF_0to6] ; xxxxxxUSQABCDEFG
    movhps [dstq+strideq*1], m1
    pshufb            m1, [pb_6xm1_246_8toE] ; xxxxxxVTRIJKLMNO
    movhps [dstq+strideq*2], m0
    pslldq            m0, 1                   ; shift one more left pixel in per row pair
    movhps [dstq+stride3q ], m1
    lea             dstq, [dstq+strideq*4]
    pslldq            m1, 1
    movhps [dstq+strideq*0], m0
    pslldq            m0, 1
    movhps [dstq+strideq*1], m1
    pslldq            m1, 1
    movhps [dstq+strideq*2], m0
    movhps [dstq+stride3q ], m1
    RET
1150 | ||
INIT_XMM %1
; Vertical-right (VR) prediction, 16x16 block.
; Rows 0/1 start as the averaged/smoothed above row; each later row pair is the
; previous pair shifted right one pixel with a filtered left pixel inserted.
cglobal vp9_ipred_vr_16x16, 4, 4, 6, dst, stride, l, a
    mova              m0, [aq]                ; m0 = above[0..15]
    movu              m1, [aq-1]              ; m1 = above-1 (includes top-left)
    mova              m2, [lq]                ; m2 = left[0..15]
    DEFINE_ARGS dst, stride, stride3, cnt
    lea         stride3q, [strideq*3]
    palignr           m3, m1, m2, 15          ; above-2
    LOWPASS            3,  1,  0,  4          ; m3 = smoothed above -> odd rows
    pavgb             m0, m1                  ; m0 = averaged above -> even rows
    palignr           m1, m2, 1
    pslldq            m4, m2, 1
    LOWPASS            1,  2,  4,  5          ; m1 = smoothed left pixels
    pshufb            m1, [pb_02468ACE_13579BDF] ; deinterleave: even-index bytes to the
                                              ; low half, odd-index to the high half
    mov             cntd, 4                   ; 4 iterations x 4 rows = 16 rows

.loop:
    movlhps           m2, m1                  ; m2 high = feed pixels for odd rows
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m3
    palignr           m4, m0, m1, 15          ; shift in one left pixel (even row)
    palignr           m5, m3, m2, 15          ; shift in one left pixel (odd row)
    mova [dstq+strideq*2], m4
    mova [dstq+stride3q ], m5
    lea             dstq, [dstq+strideq*4]
    palignr           m0, m1, 14              ; advance base rows by two pixels
    palignr           m3, m2, 14
    pslldq            m1, 2                   ; consume two feed pixels
    dec             cntd
    jg .loop
    RET
1182 | ||
%if ARCH_X86_64 ; uses m8 (9 XMM regs), so not available on x86-32
INIT_XMM %1
; Vertical-right (VR) prediction, 32x32 block, two 16-byte lanes per row.
; dst covers rows 0..15, dst16 rows 16..31 (same data shifted further right).
cglobal vp9_ipred_vr_32x32, 4, 4, 9, dst, stride, l, a
    mova              m0, [aq]                ; above[0..15]
    mova              m2, [aq+16]             ; above[16..31]
    movu              m1, [aq-1]              ; above-1 (includes top-left)
    palignr           m3, m2, m0, 15
    palignr           m4, m2, m0, 14
    LOWPASS            4,  3,  2,  5          ; m4 = smoothed high lane (odd rows)
    pavgb             m3, m2                  ; m3 = averaged high lane (even rows)
    mova              m2, [lq+16]             ; left[16..31]
    palignr           m5, m1, m2, 15
    LOWPASS            5,  1,  0,  6          ; m5 = smoothed low lane (odd rows)
    pavgb             m0, m1                  ; m0 = averaged low lane (even rows)
    mova              m6, [lq]                ; left[0..15]
    palignr           m1, m2, 1
    palignr           m7, m2, m6, 15
    LOWPASS            1,  2,  7,  8          ; m1 = smoothed left[16..31]
    palignr           m2, m6, 1
    pslldq            m7, m6, 1
    LOWPASS            2,  6,  7,  8          ; m2 = smoothed left[0..15]
    pshufb            m1, [pb_02468ACE_13579BDF] ; deinterleave even/odd-index bytes
    pshufb            m2, [pb_02468ACE_13579BDF]
    DEFINE_ARGS dst, stride, dst16, cnt
    lea           dst16q, [dstq +strideq*8]
    lea           dst16q, [dst16q+strideq*8]  ; dst16 = dst + 16*stride
    SBUTTERFLY qdq, 2, 1, 6                   ; split feed pixels for even/odd rows
    mov             cntd, 8                   ; 8 iterations x 2 row-pairs

.loop:
    ; even lines (0, 2, 4, ...): m1 | m0, m3
    ; odd lines (1, 3, 5, ...): m2 | m5, m4
; write one 32-pixel row at dstq and its 16-pixel-shifted version at dst16q,
; then shift the whole %2:%3:%4 pipeline right by one pixel.
; %2 = left-pixel feed, %3 = low lane, %4 = high lane.
%macro %%write 4
    mova   [dstq+stride%1+ 0], %3
    mova   [dstq+stride%1+16], %4
    movhps [dst16q+stride%1 ], %2
    movu   [dst16q+stride%1+ 8], %3
    movq   [dst16q+stride%1+24], %4
    palignr           %4, %3, 15
    palignr           %3, %2, 15
    pslldq            %2, 1
%endmacro

    %%write q*0, m1, m0, m3
    %%write q*1, m2, m5, m4
    lea             dstq, [dstq +strideq*2]
    lea           dst16q, [dst16q+strideq*2]
    dec             cntd
    jg .loop
    RET
%endif
%endmacro
1235 | ||
; instantiate the VR predictors for each instruction set
VR_XMM_FUNCS ssse3
VR_XMM_FUNCS avx
1238 | ||
; hd

; Horizontal-down (HD) prediction, 4x4 block (MMX).
; args: dst, stride, l (left edge), a (above edge).
; Rows are written bottom-up: each row going up shifts two pixels in.
INIT_MMX ssse3
cglobal vp9_ipred_hd_4x4, 4, 4, 0, dst, stride, l, a
    movd              m0, [lq]                ; low dword  = left pixels
    punpckldq         m0, [aq-1]              ; high dword = top-left + above
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    psrlq             m1, m0, 8
    psrlq             m2, m1, 8
    LOWPASS            2,  1,  0,  3          ; m2 = smoothed edge pixels
    pavgb             m1, m0                  ; m1 = averaged edge pixels

    ; DHIJ <- for the following predictor:
    ; CGDH
    ; BFCG | m1 contains ABCDxxxx
    ; AEBF | m2 contains EFGHIJxx

    punpcklbw         m1, m2                  ; interleave avg/smoothed pairs
    punpckhdq         m0, m1, m2

    ; m1 contains AEBFCGDH
    ; m0 contains CGDHIJxx

    movd [dstq+stride3q ], m1                 ; bottom row first
    movd [dstq+strideq*1], m0
    psrlq             m1, 16                  ; move up = shift two pixels
    psrlq             m0, 16
    movd [dstq+strideq*2], m1
    movd [dstq+strideq*0], m0
    RET
1270 | ||
%macro HD_XMM_FUNCS 1 ; %1 = instruction set (ssse3/avx), consumed by INIT_XMM
; Horizontal-down (HD) prediction, 8x8 block.
; args: dst, stride, l (left edge), a (above edge).
INIT_XMM %1
cglobal vp9_ipred_hd_8x8, 4, 4, 4, dst, stride, l, a
    movq              m0, [lq]                ; low qword  = left pixels
    movhps            m0, [aq-1]              ; high qword = top-left + above
    DEFINE_ARGS dst, stride, stride3, dst4
    lea         stride3q, [strideq*3]
    lea            dst4q, [dstq+strideq*4]
    psrldq            m1, m0, 1
    psrldq            m2, m1, 1
    LOWPASS            2,  1,  0,  3          ; m2 = smoothed edge pixels
    pavgb             m1, m0                  ; m1 = averaged edge pixels

    ; HPQRSTUV <- for the following predictor
    ; GOHPQRST
    ; FNGOHPQR | m1 contains ABCDEFGHxxxxxxxx
    ; EMFNGOHP | m2 contains IJKLMNOPQRSTUVxx
    ; DLEMFNGO
    ; CKDLEMFN
    ; BJCKDLEM
    ; AIBJCKDL

    punpcklbw         m1, m2                  ; interleave avg/smoothed pairs
    movhlps           m2, m2

    ; m1 contains AIBJCKDLEMFNGOHP
    ; m2 contains QRSTUVxxxxxxxxxx

    ; write bottom-up; each step up shifts two pixels in via palignr
    movhps [dstq +stride3q ], m1
    movq   [dst4q+stride3q ], m1
    palignr           m3, m2, m1, 2
    movhps [dstq +strideq*2], m3
    movq   [dst4q+strideq*2], m3
    palignr           m3, m2, m1, 4
    movhps [dstq +strideq*1], m3
    movq   [dst4q+strideq*1], m3
    palignr           m2, m1, 6
    movhps [dstq +strideq*0], m2
    movq   [dst4q+strideq*0], m2
    RET
1311 | ||
INIT_XMM %1
; Horizontal-down (HD) prediction, 16x16 block.
; Writes 4 rows per iteration (one in each 4-row band), bottom-up within
; each band: stride4q counts down from 4*stride to stride.
cglobal vp9_ipred_hd_16x16, 4, 6, 7, dst, stride, l, a
    mova              m0, [lq]                ; m0 = left[0..15]
    movu              m3, [aq-1]              ; m3 = top-left + above
    DEFINE_ARGS dst, stride, stride4, dst4, dst8, dst12
    lea         stride4q, [strideq*4]
    lea            dst4q, [dstq +stride4q]
    lea            dst8q, [dst4q+stride4q]
    lea           dst12q, [dst8q+stride4q]
    psrldq            m4, m3, 1
    psrldq            m5, m3, 2
    LOWPASS            5,  4,  3,  6          ; m5 = smoothed above pixels
    palignr           m1, m3, m0, 1
    palignr           m2, m3, m0, 2
    LOWPASS            2,  1,  0,  6          ; m2 = smoothed left pixels
    pavgb             m1, m0                  ; m1 = averaged left pixels
    SBUTTERFLY bw,  1,  2,  6                 ; interleave avg/smoothed byte pairs

    ; loop state: m1:m2 = interleaved left predictor pixels (m1 = lower rows,
    ; m2 = upper rows), m5 = smoothed above, consumed two bytes per iteration
.loop:
    sub         stride4q, strideq             ; sets the flags tested by jg below;
                                              ; the SSE ops in between leave EFLAGS alone
    movhps [dstq +stride4q +0], m2
    movq   [dstq +stride4q +8], m5
    mova   [dst4q+stride4q ], m2
    movhps [dst8q+stride4q +0], m1
    movq   [dst8q+stride4q +8], m2
    mova  [dst12q+stride4q ], m1
%if cpuflag(avx)
    palignr           m1, m2, m1, 2           ; shift pipeline up by one row (2 bytes)
    palignr           m2, m5, m2, 2
%else
    palignr           m3, m2, m1, 2           ; SSSE3 2-operand form needs temporaries
    palignr           m0, m5, m2, 2
    mova              m1, m3
    mova              m2, m0
%endif
    psrldq            m5, 2
    jg .loop                                  ; loop while stride4q > 0 (4 iterations)
    RET
1352 | ||
INIT_XMM %1
; Horizontal-down (HD) prediction, 32x32 block.
; Writes 4 rows (one per 8-row band) per iteration, bottom-up within each
; band: stride8q counts down from 8*stride to stride (8 iterations).
cglobal vp9_ipred_hd_32x32, 4, 6, 8, dst, stride, l, a
    mova              m0, [lq]                ; left[0..15]
    mova              m1, [lq+16]             ; left[16..31]
    movu              m2, [aq-1]              ; top-left + above[0..14]
    movu              m3, [aq+15]             ; above[15..30]
    DEFINE_ARGS dst, stride, stride8, dst8, dst16, dst24
    lea         stride8q, [strideq*8]
    lea            dst8q, [dstq +stride8q]
    lea           dst16q, [dst8q +stride8q]
    lea           dst24q, [dst16q+stride8q]
    psrldq            m4, m3, 1
    psrldq            m5, m3, 2
    LOWPASS            5,  4,  3,  6          ; m5 = smoothed above, high lane
    palignr           m4, m3, m2, 2
    palignr           m3, m2, 1
    LOWPASS            4,  3,  2,  6          ; m4 = smoothed above, low lane
    palignr           m3, m2, m1, 2
    palignr           m2, m1, 1
    LOWPASS            3,  2,  1,  6          ; m3 = smoothed left[16..31]
    pavgb             m2, m1                  ; m2 = averaged left[16..31]
    palignr           m6, m1, m0, 1
    palignr           m1, m0, 2
    LOWPASS            1,  6,  0,  7          ; m1 = smoothed left[0..15]
    pavgb             m0, m6                  ; m0 = averaged left[0..15]
    SBUTTERFLY bw,  2,  3,  6                 ; interleave avg/smoothed byte pairs
    SBUTTERFLY bw,  0,  1,  6

    ; loop state, bottom to top: m0, m1, m2, m3, m4, m5 form one long pixel
    ; pipeline, shifted up one row (2 bytes) per iteration
.loop:
    sub         stride8q, strideq             ; sets the flags tested by jg below;
                                              ; the SSE ops in between leave EFLAGS alone
    mova [dstq  +stride8q+ 0], m3
    mova [dstq  +stride8q+16], m4
    mova [dst8q +stride8q+ 0], m2
    mova [dst8q +stride8q+16], m3
    mova [dst16q+stride8q+ 0], m1
    mova [dst16q+stride8q+16], m2
    mova [dst24q+stride8q+ 0], m0
    mova [dst24q+stride8q+16], m1
%if cpuflag(avx)
    palignr           m0, m1, m0, 2           ; 3-operand forms shift in place
    palignr           m1, m2, m1, 2
    palignr           m2, m3, m2, 2
    palignr           m3, m4, m3, 2
    palignr           m4, m5, m4, 2
    psrldq            m5, 2
%else
    psrldq            m6, m5, 2               ; SSSE3: shift top-down into temporaries,
    palignr           m5, m4, 2               ; then rotate the register chain
    palignr           m4, m3, 2
    palignr           m3, m2, 2
    palignr           m2, m1, 2
    palignr           m1, m0, 2
    mova              m0, m1
    mova              m1, m2
    mova              m2, m3
    mova              m3, m4
    mova              m4, m5
    mova              m5, m6
%endif
    jg .loop                                  ; loop while stride8q > 0
    RET
%endmacro
1416 | ||
; instantiate the HD predictors for each instruction set
HD_XMM_FUNCS ssse3
HD_XMM_FUNCS avx
1419 | ||
; Horizontal-up (HU) prediction, 4x4 block (MMX).
; args: dst, stride, l (left edge); only the left edge is used (3 args).
INIT_MMX ssse3
cglobal vp9_ipred_hu_4x4, 3, 3, 0, dst, stride, l
    movd              m0, [lq]
    pshufb            m0, [pb_3to1_5x0]       ; reverse left pixels, pad with left[0]
    psrlq             m1, m0, 8
    psrlq             m2, m1, 8
    LOWPASS            2,  1,  0,  3          ; m2 = smoothed pixels
    pavgb             m1, m0                  ; m1 = averaged pixels
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    SBUTTERFLY bw,  1,  2,  0                 ; interleave avg/smoothed byte pairs
    palignr           m2, m1, 2               ; m2 = m1 advanced by one pixel pair
    movd [dstq+strideq*0], m1
    movd [dstq+strideq*1], m2
    punpckhdq         m1, m1                  ; advance both by two pixel pairs
    punpckhdq         m2, m2
    movd [dstq+strideq*2], m1
    movd [dstq+stride3q ], m2
    RET
1439 | ||
%macro HU_XMM_FUNCS 1 ; %1 = instruction set (ssse3/avx), consumed by INIT_XMM
; Horizontal-up (HU) prediction, 8x8 block. Only the left edge is used.
INIT_XMM %1
cglobal vp9_ipred_hu_8x8, 3, 4, 4, dst, stride, l
    movq              m0, [lq]
    pshufb            m0, [pb_7to1_9x0]       ; reverse left pixels, pad with left[0]
    psrldq            m1, m0, 1
    psrldq            m2, m1, 1
    LOWPASS            2,  1,  0,  3          ; m2 = smoothed pixels
    pavgb             m1, m0                  ; m1 = averaged pixels
    DEFINE_ARGS dst, stride, stride3, dst4
    lea         stride3q, [strideq*3]
    lea            dst4q, [dstq+strideq*4]
    SBUTTERFLY bw,  1,  2,  0                 ; interleave avg/smoothed byte pairs
    ; each row down advances the interleaved pixel stream by one pair (2 bytes)
    movq   [dstq +strideq*0], m1
    movhps [dst4q+strideq*0], m1
    palignr           m0, m2, m1, 2
    movq   [dstq +strideq*1], m0
    movhps [dst4q+strideq*1], m0
    palignr           m0, m2, m1, 4
    movq   [dstq +strideq*2], m0
    movhps [dst4q+strideq*2], m0
    palignr           m2, m1, 6
    movq   [dstq +stride3q ], m2
    movhps [dst4q+stride3q ], m2
    RET
1465 | ||
INIT_XMM %1
; Horizontal-up (HU) prediction, 16x16 block. Only the left edge is used.
cglobal vp9_ipred_hu_16x16, 3, 4, 5, dst, stride, l
    mova              m0, [lq]
    pshufb            m0, [pb_Fto0]           ; reverse the 16 left pixels
    mova              m3, [pb_2toE_3xF]       ; shift-left-two shuffle, clamped at 15
    pshufb            m1, m0, [pb_1toE_2xF]   ; shifted by 1
    pshufb            m2, m0, m3              ; shifted by 2
    LOWPASS            2,  1,  0,  4          ; m2 = smoothed pixels
    pavgb             m1, m0                  ; m1 = averaged pixels
    DEFINE_ARGS dst, stride, stride9, cnt
    lea         stride9q, [strideq *3]
    mov             cntd, 4                   ; 4 iterations x 4 rows = 16 rows
    lea         stride9q, [stride9q*3]        ; 3*stride * 3 = 9*stride
    SBUTTERFLY bw,  1,  2,  0                 ; interleave avg/smoothed byte pairs
                                              ; (m1 = rows 0..7 data, m2 = rows 8..15)

.loop:
    mova [dstq+strideq*0], m1
    mova [dstq+strideq*8], m2                 ; row n+8 written alongside row n
    palignr           m0, m2, m1, 2           ; advance stream by one pixel pair
    pshufb            m2, m3
    mova [dstq+strideq*1], m0
    mova [dstq+stride9q ], m2
    palignr           m1, m2, m0, 2
    pshufb            m2, m3
    lea             dstq, [dstq+strideq*2]
    dec             cntd
    jg .loop
    RET
1494 | ||
INIT_XMM %1
; Horizontal-up (HU) prediction, 32x32 block. Only the left edge is used.
; Writes 4 rows per iteration (one in each 8-row band); stride0q walks
; 0..7*stride down the band while cnt counts 8 iterations.
cglobal vp9_ipred_hu_32x32, 3, 7, 7, dst, stride, l
    mova              m0, [lq]                ; left[0..15]
    mova              m1, [lq+16]             ; left[16..31]
    mova              m2, [pb_Fto0]           ; byte-reverse shuffle
    mova              m4, [pb_2toE_3xF]       ; shift-left-two shuffle, clamped at 15
    pshufb            m0, m2
    pshufb            m1, m2                  ; both lanes reversed
    palignr           m2, m0, m1, 1
    palignr           m3, m0, m1, 2
    LOWPASS            3,  2,  1,  5          ; m3 = smoothed low half
    pavgb             m2, m1                  ; m2 = averaged low half
    pshufb            m1, m0, m4
    pshufb            m5, m0, [pb_1toE_2xF]
    LOWPASS            1,  5,  0,  6          ; m1 = smoothed high half
    pavgb             m0, m5                  ; m0 = averaged high half
    DEFINE_ARGS dst, stride, cnt, stride0, dst8, dst16, dst24
    mov             cntd, 8
    xor         stride0q, stride0q            ; row offset within each band, starts at 0
    lea            dst8q, [dstq  +strideq*8]
    lea           dst16q, [dst8q +strideq*8]
    lea           dst24q, [dst16q+strideq*8]
    SBUTTERFLY bw,  0,  1,  5                 ; interleave avg/smoothed byte pairs
    SBUTTERFLY bw,  2,  3,  5
    pshufb            m6, m1, [pb_15]         ; m6 = bottom-left pixel broadcast (fill)

.loop:
    mova [dstq  +stride0q+ 0], m2
    mova [dstq  +stride0q+16], m3
    mova [dst8q +stride0q+ 0], m3             ; each band starts 16 pixels further along
    mova [dst8q +stride0q+16], m0
    mova [dst16q+stride0q+ 0], m0
    mova [dst16q+stride0q+16], m1
    mova [dst24q+stride0q+ 0], m1
    mova [dst24q+stride0q+16], m6
%if cpuflag(avx)
    palignr           m2, m3, m2, 2           ; advance the m2:m3:m0:m1 pipeline
    palignr           m3, m0, m3, 2           ; by one pixel pair (2 bytes)
    palignr           m0, m1, m0, 2
    pshufb            m1, m4
%else
    pshufb            m5, m1, m4              ; SSSE3 2-operand forms: compute shifted
    palignr           m1, m0, 2               ; values first, then rotate the chain
    palignr           m0, m3, 2
    palignr           m3, m2, 2
    mova              m2, m3
    mova              m3, m0
    mova              m0, m1
    mova              m1, m5
%endif
    add         stride0q, strideq
    dec             cntd
    jg .loop
    RET
%endmacro
1550 | ||
; instantiate the HU predictors for each instruction set
HU_XMM_FUNCS ssse3
HU_XMM_FUNCS avx
1553 | ||
1554 | ; FIXME 127, 128, 129 ? |