Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | ;***************************************************************************** |
2 | ;* Copyright (C) 2013 x265 project | |
3 | ;* | |
4 | ;* Authors: Min Chen <chenm003@163.com> | |
5 | ;* Nabajit Deka <nabajit@multicorewareinc.com> | |
6 | ;* Praveen Kumar Tiwari <praveen@multicorewareinc.com> | |
7 | ;* | |
8 | ;* This program is free software; you can redistribute it and/or modify | |
9 | ;* it under the terms of the GNU General Public License as published by | |
10 | ;* the Free Software Foundation; either version 2 of the License, or | |
11 | ;* (at your option) any later version. | |
12 | ;* | |
13 | ;* This program is distributed in the hope that it will be useful, | |
14 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | ;* GNU General Public License for more details. | |
17 | ;* | |
18 | ;* You should have received a copy of the GNU General Public License | |
19 | ;* along with this program; if not, write to the Free Software | |
20 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
21 | ;* | |
22 | ;* This program is also available under a commercial proprietary license. | |
23 | ;* For more information, contact us at license @ x265.com. | |
24 | ;*****************************************************************************/ | |
25 | ||
26 | %include "x86inc.asm" | |
27 | %include "x86util.asm" | |
28 | ||
SECTION_RODATA 32

; tab_Tm: pshufb byte-gather masks producing four overlapping 4-pixel
; windows per 16-byte row; used by the 4-tap horizontal (chroma) filters.
tab_Tm:    db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
           db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
           db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14

; byte interleave pattern (lo/hi lane pairing) for vertical pp kernels
; (consumers live outside this chunk)
ALIGN 32
const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15

; dword row-index pairs (0,1 / 1,2 / ...) selecting adjacent source rows
ALIGN 32
const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4
                         dd 2, 3, 3, 4, 4, 5, 5, 6

; tab_Lm: pshufb masks producing two overlapping 8-pixel windows per row;
; used by the 8-tap horizontal (luma) filters.
ALIGN 32
tab_Lm:    db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
           db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
           db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
           db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14

; tab_Vm: broadcasts byte pair 0,1 (row 0) resp. 2,3 (row 1) across a lane
tab_Vm:    db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
           db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3

; tab_Cm: repeating 0,2,1,3 byte interleave
tab_Cm:    db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3

; dword constant 8192*64 + 2048 = 526336 (rounding offset; users are
; outside this chunk)
tab_c_526336: times 4 dd 8192*64+2048
53 | ||
; tab_ChromaCoeff: HEVC 4-tap chroma interpolation taps (signed int8),
; one 4-byte row per 1/8-pel fractional position 0..7; taps of each row
; sum to 64 (6-bit filter gain).
tab_ChromaCoeff: db  0, 64,  0,  0
                 db -2, 58, 10, -2
                 db -4, 54, 16, -2
                 db -6, 46, 28, -4
                 db -4, 36, 36, -4
                 db -4, 28, 46, -6
                 db -2, 16, 54, -4
                 db -2, 10, 58, -2

; tab_ChromaCoeffV: the same 8 positions laid out as (tap0,tap1) /
; (tap2,tap3) int16 pairs, each pair repeated 4x per 16 bytes, for the
; word-based vertical filters (consumers outside this chunk).
tab_ChromaCoeffV: times 4 dw 0, 64
                  times 4 dw 0, 0

                  times 4 dw -2, 58
                  times 4 dw 10, -2

                  times 4 dw -4, 54
                  times 4 dw 16, -2

                  times 4 dw -6, 46
                  times 4 dw 28, -4

                  times 4 dw -4, 36
                  times 4 dw 36, -4

                  times 4 dw -4, 28
                  times 4 dw 46, -6

                  times 4 dw -2, 16
                  times 4 dw 54, -4

                  times 4 dw -2, 10
                  times 4 dw 58, -2
86 | ||
; tab_LumaCoeff: HEVC 8-tap luma interpolation taps (signed int8), one
; 8-byte row per 1/4-pel fractional position 0..3; taps sum to 64.
tab_LumaCoeff: db  0,  0,   0, 64,  0,   0,  0,  0
               db -1,  4, -10, 58, 17,  -5,  1,  0
               db -1,  4, -11, 40, 40, -11,  4, -1
               db  0,  1,  -5, 17, 58, -10,  4, -1

; tab_LumaCoeffV: the same 4 positions as int16 tap pairs, each pair
; repeated 4x, for word-based vertical filtering (consumers outside
; this chunk).
tab_LumaCoeffV: times 4 dw 0, 0
                times 4 dw 0, 64
                times 4 dw 0, 0
                times 4 dw 0, 0

                times 4 dw -1, 4
                times 4 dw -10, 58
                times 4 dw 17, -5
                times 4 dw 1, 0

                times 4 dw -1, 4
                times 4 dw -11, 40
                times 4 dw 40, -11
                times 4 dw 4, -1

                times 4 dw 0, 1
                times 4 dw -5, 17
                times 4 dw 58, -10
                times 4 dw 4, -1

; tab_LumaCoeffVer: int8 tap pairs, each pair repeated 8x (16-byte
; rows), for byte-based (pmaddubsw-style) vertical filtering.
tab_LumaCoeffVer: times 8 db 0, 0
                  times 8 db 0, 64
                  times 8 db 0, 0
                  times 8 db 0, 0

                  times 8 db -1, 4
                  times 8 db -10, 58
                  times 8 db 17, -5
                  times 8 db 1, 0

                  times 8 db -1, 4
                  times 8 db -11, 40
                  times 8 db 40, -11
                  times 8 db 4, -1

                  times 8 db 0, 1
                  times 8 db -5, 17
                  times 8 db 58, -10
                  times 8 db 4, -1
131 | ||
b53f7c52 JB |
; tab_LumaCoeffVer_32: 32-byte-row (YMM-width) variant of
; tab_LumaCoeffVer — int8 tap pairs repeated 16x per row, for the AVX2
; vertical luma filters (consumers outside this chunk).
ALIGN 32
tab_LumaCoeffVer_32: times 16 db 0, 0
                     times 16 db 0, 64
                     times 16 db 0, 0
                     times 16 db 0, 0

                     times 16 db -1, 4
                     times 16 db -10, 58
                     times 16 db 17, -5
                     times 16 db 1, 0

                     times 16 db -1, 4
                     times 16 db -11, 40
                     times 16 db 40, -11
                     times 16 db 4, -1

                     times 16 db 0, 1
                     times 16 db -5, 17
                     times 16 db 58, -10
                     times 16 db 4, -1

; tab_ChromaCoeffVer_32: 32-byte-row variant of tab_ChromaCoeffV for
; the AVX2 vertical chroma filters; int8 tap pairs repeated 16x.
ALIGN 32
tab_ChromaCoeffVer_32: times 16 db 0, 64
                       times 16 db 0, 0

                       times 16 db -2, 58
                       times 16 db 10, -2

                       times 16 db -4, 54
                       times 16 db 16, -2

                       times 16 db -6, 46
                       times 16 db 28, -4

                       times 16 db -4, 36
                       times 16 db 36, -4

                       times 16 db -4, 28
                       times 16 db 46, -6

                       times 16 db -2, 16
                       times 16 db 54, -4

                       times 16 db -2, 10
                       times 16 db 58, -2

; alternating +64/-64 int8 constant (users outside this chunk)
tab_c_64_n64: times 8 db 64, -64

; deinterleave pattern used below to restore row order after a
; packssdw/packuswb pipeline (see interp_8tap_horiz_pp_4x4)
const interp4_shuf, times 2 db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15

; pshufb masks: four overlapping 4-pixel windows per 8-byte half-row
ALIGN 32
interp4_horiz_shuf1: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
                     db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
72b9787e JB |
185 | |
186 | SECTION .text | |
187 | ||
b53f7c52 | 188 | cextern pb_128 |
72b9787e JB |
189 | cextern pw_1 |
190 | cextern pw_512 | |
191 | cextern pw_2000 | |
192 | ||
; FILTER_H4_w2_2 <scratch>, <scratch>, <pw_512 reg>
; Filter a 2-pixel-wide block for TWO consecutive rows (pp output).
; Requires from the enclosing cglobal: srcq/srcstrideq, dstq/dststrideq,
; Tm0 = [tab_Tm] shuffle mask, coef2 = broadcast 4-tap taps.
; Clobbers r4. Loads start at src-1 (one pixel left of center tap).
%macro FILTER_H4_w2_2 3
    movh        %2, [srcq - 1]                  ; row 0 source bytes
    pshufb      %2, %2, Tm0                     ; gather overlapping 4-pixel windows
    movh        %1, [srcq + srcstrideq - 1]     ; row 1 source bytes
    pshufb      %1, %1, Tm0
    punpcklqdq  %2, %1                          ; both rows in one register
    pmaddubsw   %2, coef2                       ; u8*s8 multiply, pairwise add
    phaddw      %2, %2                          ; complete each 4-tap sum
    pmulhrsw    %2, %3                          ; *512 >> 15 with rounding == (sum+32)>>6
    packuswb    %2, %2                          ; clip to [0,255]
    movd        r4, %2
    mov         [dstq], r4w                     ; 2 pixels of row 0
    shr         r4, 16
    mov         [dstq + dststrideq], r4w        ; 2 pixels of row 1
%endmacro
208 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; 2-wide, 4-row horizontal 4-tap chroma filter, rounded to pixel (pp);
; FILTER_H4_w2_2 produces two rows per invocation.
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m                        ; coeffIdx (5th arg: always in memory)

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]           ; PIC-safe base for the coefficient table
    movd        coef2, [r5 + r4 * 4]            ; 4 int8 taps for this fraction
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; broadcast the taps to every dword
    mova        t2, [pw_512]                    ; pmulhrsw constant -> (sum+32)>>6
    mova        Tm0, [tab_Tm]

%rep 2                                          ; 4 rows, 2 per iteration
    FILTER_H4_w2_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET

;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; Same as 2x4 above but 8 rows (4 unrolled double-row iterations).
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [pw_512]
    mova        Tm0, [tab_Tm]

%rep 4
    FILTER_H4_w2_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET

;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; Same as 2x4 above but 16 rows; uses a counted loop (r5d) instead of
; unrolling.
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [pw_512]
    mova        Tm0, [tab_Tm]

    mov         r5d, 16/2                       ; 2 rows per iteration

.loop:
    FILTER_H4_w2_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    dec         r5d
    jnz         .loop

    RET
307 | ||
; FILTER_H4_w4_2 <scratch>, <scratch>, <pw_512 reg>
; Filter a 4-pixel-wide block for TWO consecutive rows (pp output).
; Requires srcq/srcstrideq, dstq/dststrideq, Tm0, coef2 as above; no GPR
; clobbers (stores go straight from the XMM register).
%macro FILTER_H4_w4_2 3
    movh        %2, [srcq - 1]                  ; row 0
    pshufb      %2, %2, Tm0                     ; 4 overlapping windows
    pmaddubsw   %2, coef2
    movh        %1, [srcq + srcstrideq - 1]     ; row 1
    pshufb      %1, %1, Tm0
    pmaddubsw   %1, coef2
    phaddw      %2, %1                          ; row0 sums | row1 sums
    pmulhrsw    %2, %3                          ; (sum+32)>>6
    packuswb    %2, %2
    movd        [dstq], %2                      ; 4 pixels of row 0
    palignr     %2, %2, 4                       ; rotate row 1 into the low dword
    movd        [dstq + dststrideq], %2         ; 4 pixels of row 1
%endmacro
322 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; 4-wide, 2-row horizontal 4-tap chroma filter (pp); a single
; FILTER_H4_w4_2 call covers both rows.
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m                        ; coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]           ; PIC-safe table base
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0                 ; broadcast taps
    mova        t2, [pw_512]                    ; (sum+32)>>6 rounding constant
    mova        Tm0, [tab_Tm]

    FILTER_H4_w4_2 t0, t1, t2

    RET

;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; Same as 4x2 but 4 rows (2 unrolled double-row iterations).
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [pw_512]
    mova        Tm0, [tab_Tm]

%rep 2
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET

;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; Same as 4x2 but 8 rows (4 unrolled double-row iterations).
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [pw_512]
    mova        Tm0, [tab_Tm]

%rep 4
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET

;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; Same as 4x2 but 16 rows (8 unrolled double-row iterations).
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [pw_512]
    mova        Tm0, [tab_Tm]

%rep 8
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
%endrep

    RET

;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; Same as 4x2 but 32 rows; counted loop instead of unrolling.
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4
%define Tm0         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [pw_512]
    mova        Tm0, [tab_Tm]

    mov         r5d, 32/2                       ; 2 rows per iteration

.loop:
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    dec         r5d
    jnz         .loop

    RET
481 | ||
b53f7c52 JB |
; dword permute indices interleaving low/high YMM lanes (0,4,1,5,...);
; consumers (8x8 horizontal kernels) are outside this chunk
ALIGN 32
const interp_4tap_8x8_horiz_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7
484 | ||
72b9787e JB |
485 | |
; FILTER_H4_w6 <scratch>, <scratch>, <pw_512 reg>
; Filter ONE row, width 6 (pp). Requires srcq, dstq, coef2, and the two
; window masks Tm0/Tm1 from the enclosing cglobal. Stores 4 bytes + a
; 2-byte pextrw tail = 6 pixels.
%macro FILTER_H4_w6 3
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0                     ; windows for pixels 0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                     ; windows for pixels 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1                          ; complete all 8 sums
    pmulhrsw    %2, %3                          ; (sum+32)>>6
    packuswb    %2, %2
    movd        [dstq], %2                      ; pixels 0..3
    pextrw      [dstq + 4], %2, 2               ; pixels 4..5 only
%endmacro
498 | ||
; FILTER_H4_w8 <scratch>, <scratch>, <pw_512 reg>
; Filter ONE row, width 8 (pp). Same inputs as FILTER_H4_w6; stores a
; full 8-byte result.
%macro FILTER_H4_w8 3
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0                     ; pixels 0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                     ; pixels 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3                          ; (sum+32)>>6
    packuswb    %2, %2
    movh        [dstq], %2                      ; 8 pixels
%endmacro
510 | ||
; FILTER_H4_w12 <scratch>, <scratch>, <pw_512 reg>
; Filter ONE row, width 12 (pp): pixels 0..7 via the two-mask path, then
; pixels 8..11 via a second load using Tm0 only. Stores 8 bytes + a
; 4-byte pextrd tail.
%macro FILTER_H4_w12 3
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0                     ; pixels 0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                     ; pixels 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3
    movu        %1, [srcq - 1 + 8]              ; second window for pixels 8..11
    pshufb      %1, %1, Tm0
    pmaddubsw   %1, coef2
    phaddw      %1, %1
    pmulhrsw    %1, %3
    packuswb    %2, %1
    movh        [dstq], %2                      ; pixels 0..7
    pextrd      [dstq + 8], %2, 2               ; pixels 8..11
%endmacro
528 | ||
; FILTER_H4_w16 <scratch>, <scratch>, <pw_512 reg>, <scratch>
; Filter ONE row, width 16 (pp): two 8-pixel halves, packed and stored
; as one 16-byte write.
%macro FILTER_H4_w16 4
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0                     ; pixels 0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                     ; pixels 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    movu        %1, [srcq - 1 + 8]
    pshufb      %4, %1, Tm0                     ; pixels 8..11
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1                     ; pixels 12..15
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3                          ; round both halves
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq], %2                      ; 16 pixels
%endmacro
547 | ||
; FILTER_H4_w24 <scratch>, <scratch>, <pw_512 reg>, <scratch>
; Filter ONE row, width 24 (pp): a full 16-pixel store followed by an
; 8-pixel tail at offset 16.
%macro FILTER_H4_w24 4
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0                     ; pixels 0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                     ; pixels 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    movu        %1, [srcq - 1 + 8]
    pshufb      %4, %1, Tm0                     ; pixels 8..11
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1                     ; pixels 12..15
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq], %2                      ; pixels 0..15
    movu        %1, [srcq - 1 + 16]
    pshufb      %2, %1, Tm0                     ; pixels 16..19
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                     ; pixels 20..23
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movh        [dstq + 16], %2                 ; pixels 16..23
%endmacro
575 | ||
; FILTER_H4_w32 <scratch>, <scratch>, <pw_512 reg>, <scratch>
; Filter ONE row, width 32 (pp): two independent 16-pixel halves stored
; at dst+0 and dst+16.
%macro FILTER_H4_w32 4
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0                     ; pixels 0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                     ; pixels 4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    movu        %1, [srcq - 1 + 8]
    pshufb      %4, %1, Tm0                     ; pixels 8..11
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1                     ; pixels 12..15
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq], %2                      ; pixels 0..15
    movu        %1, [srcq - 1 + 16]
    pshufb      %2, %1, Tm0                     ; pixels 16..19
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                     ; pixels 20..23
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    movu        %1, [srcq - 1 + 24]
    pshufb      %4, %1, Tm0                     ; pixels 24..27
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1                     ; pixels 28..31
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq + 16], %2                 ; pixels 16..31
%endmacro
610 | ||
; FILTER_H4_w16o <scratch>, <scratch>, <pw_512 reg>, <scratch>, <offset>
; Offset variant of FILTER_H4_w16: filters 16 pixels starting %5 bytes
; into the row; building block for the w48/w64 kernels below.
%macro FILTER_H4_w16o 5
    movu        %1, [srcq + %5 - 1]
    pshufb      %2, %1, Tm0                     ; pixels %5+0..3
    pmaddubsw   %2, coef2
    pshufb      %1, %1, Tm1                     ; pixels %5+4..7
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    movu        %1, [srcq + %5 - 1 + 8]
    pshufb      %4, %1, Tm0                     ; pixels %5+8..11
    pmaddubsw   %4, coef2
    pshufb      %1, %1, Tm1                     ; pixels %5+12..15
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %2, %3
    pmulhrsw    %4, %3
    packuswb    %2, %4
    movu        [dstq + %5], %2                 ; 16 pixels at offset %5
%endmacro
629 | ||
; FILTER_H4_w48: one row of width 48 = three 16-pixel segments
%macro FILTER_H4_w48 4
    FILTER_H4_w16o %1, %2, %3, %4, 0
    FILTER_H4_w16o %1, %2, %3, %4, 16
    FILTER_H4_w16o %1, %2, %3, %4, 32
%endmacro

; FILTER_H4_w64: one row of width 64 = four 16-pixel segments
%macro FILTER_H4_w64 4
    FILTER_H4_w16o %1, %2, %3, %4, 0
    FILTER_H4_w16o %1, %2, %3, %4, 16
    FILTER_H4_w16o %1, %2, %3, %4, 32
    FILTER_H4_w16o %1, %2, %3, %4, 48
%endmacro
642 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; Generic row-loop driver for the 3-argument FILTER_H4_w%1 bodies
; (widths 6/8/12); %1 = width, %2 = height, one row per iteration.
%macro IPFILTER_CHROMA 2
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
%define coef2       m5
%define Tm0         m4
%define Tm1         m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m                        ; coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    mov         r5d, %2                         ; row counter = block height

    pshufd      coef2, coef2, 0                 ; broadcast taps
    mova        t2, [pw_512]                    ; (sum+32)>>6 rounding constant
    mova        Tm0, [tab_Tm]                   ; windows 0..3
    mova        Tm1, [tab_Tm + 16]              ; windows 4..7

.loop:
    FILTER_H4_w%1 t0, t1, t2
    add         srcq, srcstrideq
    add         dstq, dststrideq

    dec         r5d
    jnz         .loop

    RET
%endmacro


; instantiations for the width-6/8/12 chroma PU sizes
IPFILTER_CHROMA 6, 8
IPFILTER_CHROMA 8, 2
IPFILTER_CHROMA 8, 4
IPFILTER_CHROMA 8, 6
IPFILTER_CHROMA 8, 8
IPFILTER_CHROMA 8, 16
IPFILTER_CHROMA 8, 32
IPFILTER_CHROMA 12, 16

IPFILTER_CHROMA 6, 16
IPFILTER_CHROMA 8, 12
IPFILTER_CHROMA 8, 64
IPFILTER_CHROMA 12, 32
697 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
; Row-loop driver for the 4-argument FILTER_H4_w%1 bodies (widths
; 16/24/32/48/64, which need the extra t3 scratch register).
%macro IPFILTER_CHROMA_W 2
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
%define coef2       m6
%define Tm0         m5
%define Tm1         m4
%define t3          m3
%define t2          m2
%define t1          m1
%define t0          m0

    mov         r4d, r4m                        ; coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    mov         r5d, %2                         ; row counter = block height

    pshufd      coef2, coef2, 0
    mova        t2, [pw_512]                    ; (sum+32)>>6 rounding constant
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

.loop:
    FILTER_H4_w%1 t0, t1, t2, t3
    add         srcq, srcstrideq
    add         dstq, dststrideq

    dec         r5d
    jnz         .loop

    RET
%endmacro

; instantiations for the wide chroma PU sizes
IPFILTER_CHROMA_W 16, 4
IPFILTER_CHROMA_W 16, 8
IPFILTER_CHROMA_W 16, 12
IPFILTER_CHROMA_W 16, 16
IPFILTER_CHROMA_W 16, 32
IPFILTER_CHROMA_W 32, 8
IPFILTER_CHROMA_W 32, 16
IPFILTER_CHROMA_W 32, 24
IPFILTER_CHROMA_W 24, 32
IPFILTER_CHROMA_W 32, 32

IPFILTER_CHROMA_W 16, 24
IPFILTER_CHROMA_W 16, 64
IPFILTER_CHROMA_W 32, 48
IPFILTER_CHROMA_W 24, 64
IPFILTER_CHROMA_W 32, 64

IPFILTER_CHROMA_W 64, 64
IPFILTER_CHROMA_W 64, 32
IPFILTER_CHROMA_W 64, 48
IPFILTER_CHROMA_W 48, 64
IPFILTER_CHROMA_W 64, 16
761 | ||
762 | ||
; FILTER_H8_W8 t0, t1, t2, t3, coef, c512, src, [dst]
; 8-tap horizontal filter, ONE row, 8 pixels, reading from memory
; operand %7. With 8 args (pp): rounds via %6, packs and stores 8 bytes
; to %8. With 7 args (ps): leaves the raw 16-bit sums in %2 for the
; caller to offset/store.
%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
    movu        %1, %7
    pshufb      %2, %1, [tab_Lm + 0]            ; windows for pixels 0..1
    pmaddubsw   %2, %5
    pshufb      %3, %1, [tab_Lm + 16]           ; windows for pixels 2..3
    pmaddubsw   %3, %5
    phaddw      %2, %3
    pshufb      %4, %1, [tab_Lm + 32]           ; windows for pixels 4..5
    pmaddubsw   %4, %5
    pshufb      %1, %1, [tab_Lm + 48]           ; windows for pixels 6..7
    pmaddubsw   %1, %5
    phaddw      %4, %1
    phaddw      %2, %4                          ; all eight 8-tap sums
%if %0 == 8                                     ; pp path only
    pmulhrsw    %2, %6                          ; (sum+32)>>6
    packuswb    %2, %2
    movh        %8, %2
%endif
%endmacro
782 | ||
; FILTER_H8_W4 t0, t1
; 8-tap horizontal filter, ONE row, 4 pixels, at column offset r5.
; Uses m3 = coefficients and m7 as scratch (fixed registers of the
; enclosing IPFILTER_LUMA body); leaves the four 16-bit sums in %2.
%macro FILTER_H8_W4 2
    movu        %1, [r0 - 3 + r5]               ; 8-tap: start 3 pixels left
    pshufb      %2, %1, [tab_Lm]                ; pixels 0..1
    pmaddubsw   %2, m3
    pshufb      m7, %1, [tab_Lm + 16]           ; pixels 2..3
    pmaddubsw   m7, m3
    phaddw      %2, m7
    phaddw      %2, %2                          ; finish the 4 sums
%endmacro
792 | ||
;----------------------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;----------------------------------------------------------------------------------------------------------------------------
; 8-tap luma horizontal filter, %1 x %2 block, %3 = pp (round to pixel)
; or ps (int16 intermediate: subtract pw_2000, dst stride in bytes is
; doubled). For ps with isRowExt != 0 the kernel starts 3 rows above src
; and emits %2+7 rows (margin for a following vertical pass).
%macro IPFILTER_LUMA 3
INIT_XMM sse4
cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8

    mov         r4d, r4m                        ; coeffIdx

%ifdef PIC
    lea         r6, [tab_LumaCoeff]
    movh        m3, [r6 + r4 * 8]               ; 8 int8 taps for this fraction
%else
    movh        m3, [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq  m3, m3                          ; duplicate taps into both halves

%ifidn %3, pp
    mova        m2, [pw_512]                    ; pp: (sum+32)>>6 rounding constant
%else
    mova        m2, [pw_2000]                   ; ps: offset subtracted from raw sums
%endif

    mov         r4d, %2                         ; row counter
%ifidn %3, ps
    add         r3, r3                          ; int16 output: byte stride = 2*dstStride
    cmp         r5m, byte 0                     ; isRowExt?
    je          .loopH
    lea         r6, [r1 + 2 * r1]
    sub         r0, r6                          ; start 3 rows above
    add         r4d, 7                          ; and emit 7 extra rows
%endif

.loopH:
    xor         r5, r5                          ; r5 = column offset
%rep %1 / 8                                     ; full 8-pixel columns
%ifidn %3, pp
    FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
%else
    FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
    psubw       m1, m2                          ; sums - 2000h
    movu        [r2 + 2 * r5], m1
%endif
    add         r5, 8
%endrep

%rep (%1 % 8) / 4                               ; 4-pixel remainder column
    FILTER_H8_W4 m0, m1
%ifidn %3, pp
    pmulhrsw    m1, m2
    packuswb    m1, m1
    movd        [r2 + r5], m1
%else
    psubw       m1, m2
    movh        [r2 + 2 * r5], m1
%endif
%endrep

    add         r0, r1                          ; next source row
    add         r2, r3                          ; next destination row

    dec         r4d
    jnz         .loopH
    RET
%endmacro
858 | ||
859 | ||
; AVX2 8-tap luma pp filter for a 4x4 block: each YMM holds one source
; row broadcast to both lanes; pmaddubsw + pmaddwd-by-1 completes the
; 8-tap sums as dwords, four rows are merged, rounded via pw_512 and
; deinterleaved with interp4_shuf before the four dword stores.
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_4x4, 4,6,6
    mov             r4d, r4m                    ; coeffIdx

%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastq    m0, [r5 + r4 * 8]           ; 8 taps to all qwords
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif

    mova            m1, [tab_Lm]
    vpbroadcastd    m2, [pw_1]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1

    sub             r0, 3                       ; 8-tap: start 3 pixels left
    ; Row 0-1
    vbroadcasti128  m3, [r0]                    ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m3, m1
    pmaddubsw       m3, m0
    pmaddwd         m3, m2                      ; add adjacent word pairs -> dword sums
    vbroadcasti128  m4, [r0 + r1]               ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddwd         m4, m2
    phaddd          m3, m4                      ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]

    ; Row 2-3
    lea             r0, [r0 + r1 * 2]
    vbroadcasti128  m4, [r0]                    ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddwd         m4, m2
    vbroadcasti128  m5, [r0 + r1]               ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m1
    pmaddubsw       m5, m0
    pmaddwd         m5, m2
    phaddd          m4, m5                      ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]

    packssdw        m3, m4                      ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
    pmulhrsw        m3, [pw_512]                ; (sum+32)>>6
    vextracti128    xm4, m3, 1
    packuswb        xm3, xm4                    ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
    pshufb          xm3, [interp4_shuf]         ; [row3 row1 row2 row0]

    lea             r0, [r3 * 3]
    movd            [r2], xm3                   ; row 0
    pextrd          [r2+r3], xm3, 2             ; row 1
    pextrd          [r2+r3*2], xm3, 1           ; row 2
    pextrd          [r2+r0], xm3, 3             ; row 3
    RET
915 | ||
b53f7c52 JB |
; AVX2 8-tap luma pp filter for an 8x4 block: two tab_Lm shuffle tables
; give all eight windows per broadcast row; four rows are filtered,
; rounded via pw_512, packed, and written as interleaved qword stores.
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7
    mov             r4d, r4m                    ; coeffIdx

%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastq    m0, [r5 + r4 * 8]
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif

    mova            m1, [tab_Lm]
    mova            m2, [tab_Lm + 32]

    ; register map
    ; m0     - interpolate coeff
    ; m1, m2 - shuffle order table

    sub             r0, 3                       ; 8-tap: start 3 pixels left
    lea             r5, [r1 * 3]                ; 3*srcStride
    lea             r4, [r3 * 3]                ; 3*dstStride

    ; Row 0
    vbroadcasti128  m3, [r0]                    ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m4, m3, m2                  ; windows 4..7
    pshufb          m3, m1                      ; windows 0..3
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    phaddw          m3, m4
    ; Row 1
    vbroadcasti128  m4, [r0 + r1]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m2
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    phaddw          m4, m5

    phaddw          m3, m4                      ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
    pmulhrsw        m3, [pw_512]                ; (sum+32)>>6

    ; Row 2
    vbroadcasti128  m4, [r0 + r1 * 2]           ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m2
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    phaddw          m4, m5
    ; Row 3
    vbroadcasti128  m5, [r0 + r5]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m6, m5, m2
    pshufb          m5, m1
    pmaddubsw       m5, m0
    pmaddubsw       m6, m0
    phaddw          m5, m6

    phaddw          m4, m5                      ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
    pmulhrsw        m4, [pw_512]

    packuswb        m3, m4
    vextracti128    xm4, m3, 1
    punpcklwd       xm5, xm3, xm4               ; reorder rows 0/1 into qwords

    movq            [r2], xm5                   ; row 0
    movhps          [r2 + r3], xm5              ; row 1

    punpckhwd       xm5, xm3, xm4               ; rows 2/3
    movq            [r2 + r3 * 2], xm5          ; row 2
    movhps          [r2 + r4], xm5              ; row 3
    RET
985 | ||
; IPFILTER_LUMA_AVX2_8xN width, height
; Looping generalization of the AVX2 8x4 kernel above: %2/4 iterations
; of the same 4-row body, advancing src/dst by 4 strides per iteration.
%macro IPFILTER_LUMA_AVX2_8xN 2
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_%1x%2, 4, 7, 7
    mov             r4d, r4m                    ; coeffIdx

%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastq    m0, [r5 + r4 * 8]
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif

    mova            m1, [tab_Lm]
    mova            m2, [tab_Lm + 32]

    ; register map
    ; m0     - interpolate coeff
    ; m1, m2 - shuffle order table

    sub             r0, 3                       ; 8-tap: start 3 pixels left
    lea             r5, [r1 * 3]                ; 3*srcStride
    lea             r6, [r3 * 3]                ; 3*dstStride
    mov             r4d, %2 / 4                 ; 4 rows per iteration
.loop:
    ; Row 0
    vbroadcasti128  m3, [r0]                    ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m4, m3, m2                  ; windows 4..7
    pshufb          m3, m1                      ; windows 0..3
    pmaddubsw       m3, m0
    pmaddubsw       m4, m0
    phaddw          m3, m4
    ; Row 1
    vbroadcasti128  m4, [r0 + r1]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m2
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    phaddw          m4, m5

    phaddw          m3, m4                      ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
    pmulhrsw        m3, [pw_512]                ; (sum+32)>>6

    ; Row 2
    vbroadcasti128  m4, [r0 + r1 * 2]           ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m5, m4, m2
    pshufb          m4, m1
    pmaddubsw       m4, m0
    pmaddubsw       m5, m0
    phaddw          m4, m5
    ; Row 3
    vbroadcasti128  m5, [r0 + r5]               ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb          m6, m5, m2
    pshufb          m5, m1
    pmaddubsw       m5, m0
    pmaddubsw       m6, m0
    phaddw          m5, m6

    phaddw          m4, m5                      ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
    pmulhrsw        m4, [pw_512]

    packuswb        m3, m4
    vextracti128    xm4, m3, 1
    punpcklwd       xm5, xm3, xm4

    movq            [r2], xm5                   ; row 0
    movhps          [r2 + r3], xm5              ; row 1

    punpckhwd       xm5, xm3, xm4
    movq            [r2 + r3 * 2], xm5          ; row 2
    movhps          [r2 + r6], xm5              ; row 3

    lea             r0, [r0 + r1 * 4]           ; advance 4 rows
    lea             r2, [r2 + r3 * 4]
    dec             r4d
    jnz             .loop
    RET
%endmacro

IPFILTER_LUMA_AVX2_8xN 8, 8
IPFILTER_LUMA_AVX2_8xN 8, 16
IPFILTER_LUMA_AVX2_8xN 8, 32
1067 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 horizontal 8-tap luma interpolation for 16-pixel-wide blocks.
; Processes 2 rows per loop iteration; %2 (block height) must be even.
;--------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_AVX2 2
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
    sub               r0, 3                     ; taps extend 3 pixels left of the center sample
    mov               r4d, r4m                  ; r4d = coeffIdx
%ifdef PIC
    lea               r5, [tab_LumaCoeff]
    vpbroadcastd      m0, [r5 + r4 * 8]         ; taps 0-3
    vpbroadcastd      m1, [r5 + r4 * 8 + 4]     ; taps 4-7
%else
    vpbroadcastd      m0, [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1, [tab_LumaCoeff + r4 * 8 + 4]
%endif
    movu              m3, [tab_Tm + 16]
    vpbroadcastd      m7, [pw_1]

    ; register map
    ; m0 , m1 - interpolate coeff (low taps / high taps)
    ; m3      - shuffle order table
    ; m7      - pw_1

    mov               r4d, %2/2                 ; 2 rows per iteration
.loop:
    ; Row 0
    vbroadcasti128    m4, [r0]                  ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m5, m4, m3
    pshufb            m4, [tab_Tm]
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    paddw             m4, m5                    ; combine low-tap and high-tap partial sums
    pmaddwd           m4, m7                    ; pw_1 madd = horizontal add of adjacent words
    vbroadcasti128    m5, [r0 + 8]              ; second 8 elements in Row0
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m4, m5                    ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
    pmulhrsw          m4, [pw_512]              ; round/normalize the filtered sums
    ; Row 1
    vbroadcasti128    m2, [r0 + r1]             ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m5, m2, m3
    pshufb            m2, [tab_Tm]
    pmaddubsw         m2, m0
    pmaddubsw         m5, m1
    paddw             m2, m5
    pmaddwd           m2, m7
    vbroadcasti128    m5, [r0 + r1 + 8]         ; second 8 elements in Row1
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m2, m5                    ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
    pmulhrsw          m2, [pw_512]
    packuswb          m4, m2                    ; saturate to unsigned 8-bit pixels
    vpermq            m4, m4, 11011000b         ; undo lane interleaving (0,2,1,3 -> 0,1,2,3)
    vextracti128      xm5, m4, 1
    pshufd            xm4, xm4, 11011000b
    pshufd            xm5, xm5, 11011000b
    movu              [r2], xm4                 ; store row 0 (16 pixels)
    movu              [r2+r3], xm5              ; store row 1
    lea               r0, [r0 + r1 * 2]
    lea               r2, [r2 + r3 * 2]
    dec               r4d
    jnz               .loop
    RET
%endmacro
1137 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 horizontal 8-tap luma interpolation for 32-pixel-wide blocks.
; Processes one full 32-pixel row per loop iteration (four 8-pixel groups).
;--------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_32x_avx2 2
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
    sub               r0, 3                     ; taps extend 3 pixels left of the center sample
    mov               r4d, r4m                  ; r4d = coeffIdx
%ifdef PIC
    lea               r5, [tab_LumaCoeff]
    vpbroadcastd      m0, [r5 + r4 * 8]         ; taps 0-3
    vpbroadcastd      m1, [r5 + r4 * 8 + 4]     ; taps 4-7
%else
    vpbroadcastd      m0, [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1, [tab_LumaCoeff + r4 * 8 + 4]
%endif
    movu              m3, [tab_Tm + 16]
    vpbroadcastd      m7, [pw_1]

    ; register map
    ; m0 , m1 - interpolate coeff (low taps / high taps)
    ; m3      - shuffle order table
    ; m7      - pw_1

    mov               r4d, %2                   ; one source row per iteration
.loop:
    ; Row 0, pixels 0-15
    vbroadcasti128    m4, [r0]                  ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m5, m4, m3
    pshufb            m4, [tab_Tm]
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    paddw             m4, m5                    ; combine low/high tap partial sums
    pmaddwd           m4, m7                    ; pw_1 madd = horizontal add of adjacent words
    vbroadcasti128    m5, [r0 + 8]
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m4, m5                    ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
    pmulhrsw          m4, [pw_512]              ; round/normalize the filtered sums
    ; Row 0, pixels 16-31
    vbroadcasti128    m2, [r0 + 16]
    pshufb            m5, m2, m3
    pshufb            m2, [tab_Tm]
    pmaddubsw         m2, m0
    pmaddubsw         m5, m1
    paddw             m2, m5
    pmaddwd           m2, m7
    vbroadcasti128    m5, [r0 + 24]
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m2, m5
    pmulhrsw          m2, [pw_512]
    packuswb          m4, m2                    ; saturate to unsigned 8-bit pixels
    vpermq            m4, m4, 11011000b         ; undo lane interleaving
    vextracti128      xm5, m4, 1
    pshufd            xm4, xm4, 11011000b
    pshufd            xm5, xm5, 11011000b
    movu              [r2], xm4                 ; store pixels 0-15
    movu              [r2 + 16], xm5            ; store pixels 16-31
    lea               r0, [r0 + r1]
    lea               r2, [r2 + r3]
    dec               r4d
    jnz               .loop
    RET
%endmacro
1207 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 horizontal 8-tap luma interpolation for 64-pixel-wide blocks.
; Processes one full 64-pixel row per loop iteration (two 32-pixel halves).
;--------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_64x_avx2 2
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,8
    sub               r0, 3                     ; taps extend 3 pixels left of the center sample
    mov               r4d, r4m                  ; r4d = coeffIdx
%ifdef PIC
    lea               r5, [tab_LumaCoeff]
    vpbroadcastd      m0, [r5 + r4 * 8]         ; taps 0-3
    vpbroadcastd      m1, [r5 + r4 * 8 + 4]     ; taps 4-7
%else
    vpbroadcastd      m0, [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1, [tab_LumaCoeff + r4 * 8 + 4]
%endif
    movu              m3, [tab_Tm + 16]
    vpbroadcastd      m7, [pw_1]

    ; register map
    ; m0 , m1 - interpolate coeff (low taps / high taps)
    ; m3      - shuffle order table
    ; m7      - pw_1

    mov               r4d, %2                   ; one source row per iteration
.loop:
    ; Row 0, pixels 0-31
    vbroadcasti128    m4, [r0]                  ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m5, m4, m3
    pshufb            m4, [tab_Tm]
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    paddw             m4, m5                    ; combine low/high tap partial sums
    pmaddwd           m4, m7                    ; pw_1 madd = horizontal add of adjacent words
    vbroadcasti128    m5, [r0 + 8]
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m4, m5                    ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
    pmulhrsw          m4, [pw_512]              ; round/normalize the filtered sums
    vbroadcasti128    m2, [r0 + 16]
    pshufb            m5, m2, m3
    pshufb            m2, [tab_Tm]
    pmaddubsw         m2, m0
    pmaddubsw         m5, m1
    paddw             m2, m5
    pmaddwd           m2, m7
    vbroadcasti128    m5, [r0 + 24]
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m2, m5
    pmulhrsw          m2, [pw_512]
    packuswb          m4, m2                    ; saturate to unsigned 8-bit pixels
    vpermq            m4, m4, 11011000b         ; undo lane interleaving
    vextracti128      xm5, m4, 1
    pshufd            xm4, xm4, 11011000b
    pshufd            xm5, xm5, 11011000b
    movu              [r2], xm4                 ; store pixels 0-15
    movu              [r2 + 16], xm5            ; store pixels 16-31

    ; Row 0, pixels 32-63 (same pipeline, offset by 32)
    vbroadcasti128    m4, [r0 + 32]
    pshufb            m5, m4, m3
    pshufb            m4, [tab_Tm]
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    paddw             m4, m5
    pmaddwd           m4, m7
    vbroadcasti128    m5, [r0 + 40]
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m4, m5
    pmulhrsw          m4, [pw_512]
    vbroadcasti128    m2, [r0 + 48]
    pshufb            m5, m2, m3
    pshufb            m2, [tab_Tm]
    pmaddubsw         m2, m0
    pmaddubsw         m5, m1
    paddw             m2, m5
    pmaddwd           m2, m7
    vbroadcasti128    m5, [r0 + 56]
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m2, m5
    pmulhrsw          m2, [pw_512]
    packuswb          m4, m2
    vpermq            m4, m4, 11011000b
    vextracti128      xm5, m4, 1
    pshufd            xm4, xm4, 11011000b
    pshufd            xm5, xm5, 11011000b
    movu              [r2 +32], xm4             ; store pixels 32-47
    movu              [r2 + 48], xm5            ; store pixels 48-63

    lea               r0, [r0 + r1]
    lea               r2, [r2 + r3]
    dec               r4d
    jnz               .loop
    RET
%endmacro
1318 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_48x64(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 horizontal 8-tap luma interpolation, 48x64 block: one 48-pixel row
; per loop iteration (a 32-pixel group followed by a 16-pixel group).
;--------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_48x64, 4,6,8
    sub               r0, 3                     ; taps extend 3 pixels left of the center sample
    mov               r4d, r4m                  ; r4d = coeffIdx
%ifdef PIC
    lea               r5, [tab_LumaCoeff]
    vpbroadcastd      m0, [r5 + r4 * 8]         ; taps 0-3
    vpbroadcastd      m1, [r5 + r4 * 8 + 4]     ; taps 4-7
%else
    vpbroadcastd      m0, [tab_LumaCoeff + r4 * 8]
    vpbroadcastd      m1, [tab_LumaCoeff + r4 * 8 + 4]
%endif
    movu              m3, [tab_Tm + 16]
    vpbroadcastd      m7, [pw_1]

    ; register map
    ; m0 , m1 - interpolate coeff (low taps / high taps)
    ; m3      - shuffle order table
    ; m7      - pw_1

    mov               r4d, 64                   ; 64 rows
.loop:
    ; Row 0, pixels 0-31
    vbroadcasti128    m4, [r0]                  ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m5, m4, m3
    pshufb            m4, [tab_Tm]
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    paddw             m4, m5                    ; combine low/high tap partial sums
    pmaddwd           m4, m7                    ; pw_1 madd = horizontal add of adjacent words
    vbroadcasti128    m5, [r0 + 8]
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m4, m5                    ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
    pmulhrsw          m4, [pw_512]              ; round/normalize the filtered sums

    vbroadcasti128    m2, [r0 + 16]
    pshufb            m5, m2, m3
    pshufb            m2, [tab_Tm]
    pmaddubsw         m2, m0
    pmaddubsw         m5, m1
    paddw             m2, m5
    pmaddwd           m2, m7
    vbroadcasti128    m5, [r0 + 24]
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m2, m5
    pmulhrsw          m2, [pw_512]
    packuswb          m4, m2                    ; saturate to unsigned 8-bit pixels
    vpermq            m4, m4, 11011000b         ; undo lane interleaving
    vextracti128      xm5, m4, 1
    pshufd            xm4, xm4, 11011000b
    pshufd            xm5, xm5, 11011000b
    movu              [r2], xm4                 ; store pixels 0-15
    movu              [r2 + 16], xm5            ; store pixels 16-31

    ; Row 0, pixels 32-47 (half-width tail: only one 16-pixel group)
    vbroadcasti128    m4, [r0 + 32]
    pshufb            m5, m4, m3
    pshufb            m4, [tab_Tm]
    pmaddubsw         m4, m0
    pmaddubsw         m5, m1
    paddw             m4, m5
    pmaddwd           m4, m7
    vbroadcasti128    m5, [r0 + 40]
    pshufb            m6, m5, m3
    pshufb            m5, [tab_Tm]
    pmaddubsw         m5, m0
    pmaddubsw         m6, m1
    paddw             m5, m6
    pmaddwd           m5, m7
    packssdw          m4, m5
    pmulhrsw          m4, [pw_512]
    packuswb          m4, m4
    vpermq            m4, m4, 11011000b
    pshufd            xm4, xm4, 11011000b
    movu              [r2 + 32], xm4            ; store pixels 32-47

    lea               r0, [r0 + r1]
    lea               r2, [r2 + r3]
    dec               r4d
    jnz               .loop
    RET
1409 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 horizontal 4-tap chroma interpolation, 4x4 block; all four rows are
; processed straight-line (no loop), two rows per ymm register.
;--------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_4x4, 4,6,6
    mov               r4d, r4m                  ; r4d = coeffIdx

%ifdef PIC
    lea               r5, [tab_ChromaCoeff]
    vpbroadcastd      m0, [r5 + r4 * 4]         ; broadcast the selected 4 filter taps
%else
    vpbroadcastd      m0, [tab_ChromaCoeff + r4 * 4]
%endif

    vpbroadcastd      m2, [pw_1]
    vbroadcasti128    m1, [tab_Tm]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1

    dec               r0                        ; taps extend 1 pixel left of the center sample

    ; Row 0-1 (one row per 128-bit lane)
    vbroadcasti128    m3, [r0]                  ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    vinserti128       m3, m3, [r0 + r1], 1
    pshufb            m3, m1
    pmaddubsw         m3, m0                    ; u8 pixels * s8 taps -> word partial sums
    pmaddwd           m3, m2                    ; pw_1 madd = horizontal add of adjacent words

    ; Row 2-3
    lea               r0, [r0 + r1 * 2]
    vbroadcasti128    m4, [r0]                  ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    vinserti128       m4, m4, [r0 + r1], 1
    pshufb            m4, m1
    pmaddubsw         m4, m0
    pmaddwd           m4, m2

    packssdw          m3, m4
    pmulhrsw          m3, [pw_512]              ; round/normalize the filtered sums
    vextracti128      xm4, m3, 1
    packuswb          xm3, xm4                  ; saturate to unsigned 8-bit pixels

    lea               r0, [r3 * 3]              ; r0 reused as 3 * dstStride
    movd              [r2], xm3                 ; rows land in dword slots 0,2,1,3 after the packs
    pextrd            [r2+r3], xm3, 2
    pextrd            [r2+r3*2], xm3, 1
    pextrd            [r2+r0], xm3, 3
    RET
1457 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_pp_32x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 horizontal 4-tap chroma interpolation, 32x32 block; one full
; 32-pixel row per loop iteration.
;--------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_32x32, 4,6,7
    mov               r4d, r4m                  ; r4d = coeffIdx

%ifdef PIC
    lea               r5, [tab_ChromaCoeff]
    vpbroadcastd      m0, [r5 + r4 * 4]         ; broadcast the selected 4 filter taps
%else
    vpbroadcastd      m0, [tab_ChromaCoeff + r4 * 4]
%endif

    mova              m1, [interp4_horiz_shuf1]
    vpbroadcastd      m2, [pw_1]
    mova              m6, [pw_512]              ; keep rounding constant in a register (hot loop)
    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1

    dec               r0                        ; taps extend 1 pixel left of the center sample
    mov               r4d, 32                   ; 32 rows

.loop:
    ; Row 0, pixels 0-15
    vbroadcasti128    m3, [r0]                  ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m3, m1
    pmaddubsw         m3, m0
    pmaddwd           m3, m2                    ; pw_1 madd = horizontal add of adjacent words
    vbroadcasti128    m4, [r0 + 4]
    pshufb            m4, m1
    pmaddubsw         m4, m0
    pmaddwd           m4, m2
    packssdw          m3, m4
    pmulhrsw          m3, m6                    ; round/normalize the filtered sums

    ; Row 0, pixels 16-31
    vbroadcasti128    m4, [r0 + 16]
    pshufb            m4, m1
    pmaddubsw         m4, m0
    pmaddwd           m4, m2
    vbroadcasti128    m5, [r0 + 20]
    pshufb            m5, m1
    pmaddubsw         m5, m0
    pmaddwd           m5, m2
    packssdw          m4, m5
    pmulhrsw          m4, m6

    packuswb          m3, m4                    ; saturate to unsigned 8-bit pixels
    vpermq            m3, m3, 11011000b         ; undo lane interleaving

    movu              [r2], m3                  ; store full 32-pixel row
    lea               r2, [r2 + r3]
    lea               r0, [r0 + r1]
    dec               r4d
    jnz               .loop
    RET
1513 | ||
1514 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_pp_16x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 horizontal 4-tap chroma interpolation, 16x16 block; two rows per
; loop iteration.
;--------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_16x16, 4, 6, 7
    mov               r4d, r4m                  ; r4d = coeffIdx

%ifdef PIC
    lea               r5, [tab_ChromaCoeff]
    vpbroadcastd      m0, [r5 + r4 * 4]         ; broadcast the selected 4 filter taps
%else
    vpbroadcastd      m0, [tab_ChromaCoeff + r4 * 4]
%endif

    mova              m6, [pw_512]              ; keep rounding constant in a register (hot loop)
    mova              m1, [interp4_horiz_shuf1]
    vpbroadcastd      m2, [pw_1]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1

    dec               r0                        ; taps extend 1 pixel left of the center sample
    mov               r4d, 8                    ; 8 iterations x 2 rows = 16 rows

.loop:
    ; Row 0
    vbroadcasti128    m3, [r0]                  ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m3, m1
    pmaddubsw         m3, m0
    pmaddwd           m3, m2                    ; pw_1 madd = horizontal add of adjacent words
    vbroadcasti128    m4, [r0 + 4]              ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m4, m1
    pmaddubsw         m4, m0
    pmaddwd           m4, m2
    packssdw          m3, m4
    pmulhrsw          m3, m6                    ; round/normalize the filtered sums

    ; Row 1
    vbroadcasti128    m4, [r0 + r1]             ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m4, m1
    pmaddubsw         m4, m0
    pmaddwd           m4, m2
    vbroadcasti128    m5, [r0 + r1 + 4]         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m5, m1
    pmaddubsw         m5, m0
    pmaddwd           m5, m2
    packssdw          m4, m5
    pmulhrsw          m4, m6

    packuswb          m3, m4                    ; saturate to unsigned 8-bit pixels
    vpermq            m3, m3, 11011000b         ; undo lane interleaving

    vextracti128      xm4, m3, 1
    movu              [r2], xm3                 ; store row 0
    movu              [r2 + r3], xm4            ; store row 1
    lea               r2, [r2 + r3 * 2]
    lea               r0, [r0 + r1 * 2]
    dec               r4d
    jnz               .loop
    RET
72b9787e JB |
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; SSE-path pp instantiations for the narrow block sizes not covered by the
; AVX2 macros above.
;--------------------------------------------------------------------------------------------------------------
IPFILTER_LUMA 4, 4, pp
IPFILTER_LUMA 4, 8, pp
IPFILTER_LUMA 12, 16, pp
IPFILTER_LUMA 4, 16, pp
b53f7c52 JB |
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_horiz_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 horizontal 4-tap chroma interpolation, 8x8 block; four rows per
; loop iteration (two iterations total).
;--------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_4tap_horiz_pp_8x8, 4,6,6
    mov               r4d, r4m                  ; r4d = coeffIdx

%ifdef PIC
    lea               r5, [tab_ChromaCoeff]
    vpbroadcastd      m0, [r5 + r4 * 4]         ; broadcast the selected 4 filter taps
%else
    vpbroadcastd      m0, [tab_ChromaCoeff + r4 * 4]
%endif

    movu              m1, [tab_Tm]
    vpbroadcastd      m2, [pw_1]

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1

    sub               r0, 1                     ; taps extend 1 pixel left of the center sample
    mov               r4d, 2                    ; 2 iterations x 4 rows = 8 rows

.loop:
    ; Row 0
    vbroadcasti128    m3, [r0]                  ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m3, m1
    pmaddubsw         m3, m0
    pmaddwd           m3, m2                    ; pw_1 madd = horizontal add of adjacent words

    ; Row 1
    vbroadcasti128    m4, [r0 + r1]             ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m4, m1
    pmaddubsw         m4, m0
    pmaddwd           m4, m2
    packssdw          m3, m4
    pmulhrsw          m3, [pw_512]              ; round/normalize the filtered sums
    lea               r0, [r0 + r1 * 2]

    ; Row 2
    vbroadcasti128    m4, [r0 ]                 ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m4, m1
    pmaddubsw         m4, m0
    pmaddwd           m4, m2

    ; Row 3
    vbroadcasti128    m5, [r0 + r1]             ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb            m5, m1
    pmaddubsw         m5, m0
    pmaddwd           m5, m2
    packssdw          m4, m5
    pmulhrsw          m4, [pw_512]

    packuswb          m3, m4                    ; saturate to unsigned 8-bit pixels
    mova              m5, [interp_4tap_8x8_horiz_shuf]
    vpermd            m3, m5, m3                ; gather rows back into raster order
    vextracti128      xm4, m3, 1
    movq              [r2], xm3                 ; store row 0
    movhps            [r2 + r3], xm3            ; store row 1
    lea               r2, [r2 + r3 * 2]
    movq              [r2], xm4                 ; store row 2
    movhps            [r2 + r3], xm4            ; store row 3
    lea               r2, [r2 + r3 * 2]
    lea               r0, [r0 + r1*2]
    dec               r4d
    jnz               .loop
    RET
1648 | ||
; AVX2 8-tap luma pp instantiations: 16-wide, 32-wide and 64-wide block sizes
IPFILTER_LUMA_AVX2 16, 4
IPFILTER_LUMA_AVX2 16, 8
IPFILTER_LUMA_AVX2 16, 12
IPFILTER_LUMA_AVX2 16, 16
IPFILTER_LUMA_AVX2 16, 32
IPFILTER_LUMA_AVX2 16, 64

IPFILTER_LUMA_32x_avx2 32 , 8
IPFILTER_LUMA_32x_avx2 32 , 16
IPFILTER_LUMA_32x_avx2 32 , 24
IPFILTER_LUMA_32x_avx2 32 , 32
IPFILTER_LUMA_32x_avx2 32 , 64

IPFILTER_LUMA_64x_avx2 64 , 64
IPFILTER_LUMA_64x_avx2 64 , 48
IPFILTER_LUMA_64x_avx2 64 , 32
IPFILTER_LUMA_64x_avx2 64 , 16
1666 | ||
72b9787e JB |
1667 | ;-------------------------------------------------------------------------------------------------------------- |
1668 | ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
1669 | ;-------------------------------------------------------------------------------------------------------------- | |
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; SSE4 horizontal 8-tap luma interpolation for any width that is a multiple
; of 8: the inner %rep unrolls across the row in 8-pixel groups, the outer
; loop walks the %2 rows.
;--------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PP_W8 2
INIT_XMM sse4
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
    mov       r4d, r4m                     ; r4d = coeffIdx

%ifdef PIC
    lea       r5, [tab_LumaCoeff]
    movh      m3, [r5 + r4 * 8]            ; load the selected 8 filter taps
%else
    movh      m3, [tab_LumaCoeff + r4 * 8]
%endif
    pshufd    m0, m3, 0                    ; m0 = coeff-L
    pshufd    m1, m3, 0x55                 ; m1 = coeff-H
    lea       r5, [tab_Tm]                 ; r5 = shuffle
    mova      m2, [pw_512]                 ; m2 = 512 (pmulhrsw rounding constant)

    mov       r4d, %2                      ; row counter
.loopH:
%assign x 0
%rep %1 / 8
    movu      m3, [r0 - 3 + x]             ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
    pshufb    m4, m3, [r5 + 0*16]          ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
    pshufb    m5, m3, [r5 + 1*16]          ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
    pshufb    m3, [r5 + 2*16]              ; m3 = [E D C B D C B A C B A 9 B A 9 8]
    pmaddubsw m4, m0
    pmaddubsw m6, m5, m1
    pmaddubsw m5, m0
    pmaddubsw m3, m1
    paddw     m4, m6                       ; low/high tap partial sums, pixels 0-3
    paddw     m5, m3                       ; low/high tap partial sums, pixels 4-7
    phaddw    m4, m5                       ; finish the horizontal 8-tap sums
    pmulhrsw  m4, m2                       ; round/normalize
    packuswb  m4, m4                       ; saturate to unsigned 8-bit pixels
    movh      [r2 + x], m4                 ; store 8 pixels
%assign x x+8
%endrep

    add       r0, r1                       ; next source row
    add       r2, r3                       ; next destination row

    dec       r4d
    jnz       .loopH
    RET
%endmacro

IPFILTER_LUMA_PP_W8 8, 4
IPFILTER_LUMA_PP_W8 8, 8
IPFILTER_LUMA_PP_W8 8, 16
IPFILTER_LUMA_PP_W8 8, 32
IPFILTER_LUMA_PP_W8 16, 4
IPFILTER_LUMA_PP_W8 16, 8
IPFILTER_LUMA_PP_W8 16, 12
IPFILTER_LUMA_PP_W8 16, 16
IPFILTER_LUMA_PP_W8 16, 32
IPFILTER_LUMA_PP_W8 16, 64
IPFILTER_LUMA_PP_W8 24, 32
IPFILTER_LUMA_PP_W8 32, 8
IPFILTER_LUMA_PP_W8 32, 16
IPFILTER_LUMA_PP_W8 32, 24
IPFILTER_LUMA_PP_W8 32, 32
IPFILTER_LUMA_PP_W8 32, 64
IPFILTER_LUMA_PP_W8 48, 64
IPFILTER_LUMA_PP_W8 64, 16
IPFILTER_LUMA_PP_W8 64, 32
IPFILTER_LUMA_PP_W8 64, 48
IPFILTER_LUMA_PP_W8 64, 64
1736 | ||
;----------------------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;
; ps variants: pixel input, 16-bit (short) intermediate output.
;----------------------------------------------------------------------------------------------------------------------------
IPFILTER_LUMA 4, 4, ps
IPFILTER_LUMA 8, 8, ps
IPFILTER_LUMA 8, 4, ps
IPFILTER_LUMA 4, 8, ps
IPFILTER_LUMA 16, 16, ps
IPFILTER_LUMA 16, 8, ps
IPFILTER_LUMA 8, 16, ps
IPFILTER_LUMA 16, 12, ps
IPFILTER_LUMA 12, 16, ps
IPFILTER_LUMA 16, 4, ps
IPFILTER_LUMA 4, 16, ps
IPFILTER_LUMA 32, 32, ps
IPFILTER_LUMA 32, 16, ps
IPFILTER_LUMA 16, 32, ps
IPFILTER_LUMA 32, 24, ps
IPFILTER_LUMA 24, 32, ps
IPFILTER_LUMA 32, 8, ps
IPFILTER_LUMA 8, 32, ps
IPFILTER_LUMA 64, 64, ps
IPFILTER_LUMA 64, 32, ps
IPFILTER_LUMA 32, 64, ps
IPFILTER_LUMA 64, 48, ps
IPFILTER_LUMA 48, 64, ps
IPFILTER_LUMA 64, 16, ps
IPFILTER_LUMA 16, 64, ps
1765 | ||
1766 | ;----------------------------------------------------------------------------- | |
1767 | ; Interpolate HV | |
1768 | ;----------------------------------------------------------------------------- | |
;-----------------------------------------------------------------------------
; Vertical-pass start: load the first three 16-bit intermediate rows and
; accumulate the first coefficient pair for two output rows (low and high
; word halves kept separate for 32-bit precision).
;-----------------------------------------------------------------------------
%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
    mova        %5, [r0 + (%6 + 0) * 16]        ; row N   of intermediate buffer
    mova        %1, [r0 + (%6 + 1) * 16]        ; row N+1
    mova        %2, [r0 + (%6 + 2) * 16]        ; row N+2
    punpcklwd   %3, %5, %1                      ; interleave rows N/N+1, low words
    punpckhwd   %5, %1                          ; high words
    pmaddwd     %3, [r5 + (%7) * 16]            ; R3 = L[0+1] -- Row 0
    pmaddwd     %5, [r5 + (%7) * 16]            ; R0 = H[0+1]
    punpcklwd   %4, %1, %2                      ; interleave rows N+1/N+2 for the
    punpckhwd   %1, %2                          ; second output row
    pmaddwd     %4, [r5 + (%7) * 16]            ; R4 = L[1+2] -- Row 1
    pmaddwd     %1, [r5 + (%7) * 16]            ; R1 = H[1+2]
%endmacro ; FILTER_HV8_START
1782 | ||
;-----------------------------------------------------------------------------
; Vertical-pass middle step: load two more intermediate rows and accumulate
; the next coefficient pair into the four running 32-bit sums produced by
; FILTER_HV8_START.
;-----------------------------------------------------------------------------
%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
    mova        %8, [r0 + (%9 + 0) * 16]        ; next two intermediate rows
    mova        %1, [r0 + (%9 + 1) * 16]
    punpcklwd   %7, %2, %8
    punpckhwd   %2, %8
    pmaddwd     %7, [r5 + %10 * 16]
    pmaddwd     %2, [r5 + %10 * 16]
    paddd       %3, %7                          ; R3 = L[0+1+2+3] -- Row 0
    paddd       %5, %2                          ; R0 = H[0+1+2+3]
    punpcklwd   %7, %8, %1
    punpckhwd   %8, %1
    pmaddwd     %7, [r5 + %10 * 16]
    pmaddwd     %8, [r5 + %10 * 16]
    paddd       %4, %7                          ; R4 = L[1+2+3+4] -- Row 1
    paddd       %6, %8                          ; R1 = H[1+2+3+4]
%endmacro ; FILTER_HV8_MID
1799 | ||
; Round and Saturate
;
; Finish the vertical pass: add the combined rounding/bias constant,
; shift the 32-bit sums down by 12, then narrow to words and finally to
; unsigned bytes. tab_c_526336 is presumably 8192*64 + 2048, i.e. the undo
; of the pw_2000 bias applied in the horizontal pass plus the >>12 rounding
; term -- TODO confirm against the constants table.
%macro FILTER_HV8_END 4 ; output in [1, 3]
    paddd       %1, [tab_c_526336]
    paddd       %2, [tab_c_526336]
    paddd       %3, [tab_c_526336]
    paddd       %4, [tab_c_526336]
    psrad       %1, 12
    psrad       %2, 12
    psrad       %3, 12
    psrad       %4, 12
    packssdw    %1, %2                          ; dwords -> signed words
    packssdw    %3, %4

    ; TODO: is merge better? I think this way is short dependency link
    packuswb    %1, %3                          ; words -> unsigned bytes (saturated)
%endmacro ; FILTER_HV8_END
1816 | ||
1817 | ;----------------------------------------------------------------------------- | |
1818 | ; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY) | |
1819 | ;----------------------------------------------------------------------------- | |
;-----------------------------------------------------------------------------
; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
;
; SSSE3 8x8 combined horizontal+vertical 8-tap luma filter. Two phases:
;   1) horizontal: filter 8+7 source rows into a 16-bit intermediate buffer
;      on the stack (biased by -pw_2000);
;   2) vertical: run the 8-tap vertical filter over that buffer via the
;      FILTER_HV8_* macros, two output rows per iteration.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
    %define coef        m7
    %define stk_buf     rsp

    mov         r4d,    r4m                     ; r4d = idxX (horizontal coeff index)
    mov         r5d,    r5m                     ; r5d = idxY (vertical coeff index)

%ifdef PIC
    lea         r6,     [tab_LumaCoeff]
    movh        coef,   [r6 + r4 * 8]           ; load the 8 horizontal taps
%else
    movh        coef,   [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq  coef,   coef                    ; duplicate taps into both qwords

    ; move to row -3 (vertical taps need 3 rows above the block)
    lea         r6,     [r1 + r1 * 2]
    sub         r0,     r6

    xor         r6,     r6                      ; r6 = row counter for phase H
    mov         r4,     rsp                     ; r4 = write cursor into stack buffer

.loopH:
    FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
    psubw       m1,     [pw_2000]               ; bias down so values fit signed 16-bit
    mova        [r4],   m1
    add         r0,     r1
    add         r4,     16
    inc         r6
    cmp         r6,     8+7                     ; 8 output rows + 7 extra for the 8-tap window
    jnz         .loopH

    ; ready to phase V
    ; Here all of mN is free

    ; load coeff table
    shl         r5,     6                       ; 64 bytes per vertical coeff set
    lea         r6,     [tab_LumaCoeffV]
    lea         r5,     [r5 + r6]

    ; load intermedia buffer
    mov         r0,     stk_buf

    ; register mapping
    ; r0 - src (intermediate buffer)
    ; r5 - coeff
    ; r6 - loop_i

    ; let's go
    xor         r6,     r6

    ; TODO: this loop have more than 70 instructions, I think it is more than Intel loop decode cache
.loopV:

    FILTER_HV8_START    m1, m2, m3, m4, m0,                 0, 0
    FILTER_HV8_MID      m6, m2, m3, m4, m0, m1, m7, m5,     3, 1
    FILTER_HV8_MID      m5, m6, m3, m4, m0, m1, m7, m2,     5, 2
    FILTER_HV8_MID      m6, m5, m3, m4, m0, m1, m7, m2,     7, 3
    FILTER_HV8_END      m3, m0, m4, m1

    movh        [r2],       m3                  ; store row 0 of the pair
    movhps      [r2 + r3],  m3                  ; store row 1

    lea         r0,     [r0 + 16 * 2]           ; advance 2 intermediate rows
    lea         r2,     [r2 + r3 * 2]

    inc         r6
    cmp         r6,     8/2                     ; 4 iterations x 2 rows = 8 rows
    jnz         .loopV

    RET
1893 | ||
1894 | ;----------------------------------------------------------------------------- | |
1895 | ;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
1896 | ;----------------------------------------------------------------------------- | |
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; SSE4 vertical 4-tap chroma interpolation, 2x4 block, fully straight-line.
; Each output row needs the row above and two rows below, hence the initial
; "sub r0, r1" and the reads down to r5 + 2*srcStride.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_2x4, 4, 6, 8

    mov         r4d,       r4m                  ; r4d = coeffIdx
    sub         r0,        r1                   ; start one row above the block

%ifdef PIC
    lea         r5,        [tab_ChromaCoeff]
    movd        m0,        [r5 + r4 * 4]        ; load the 4 vertical taps
%else
    movd        m0,        [tab_ChromaCoeff + r4 * 4]
%endif
    lea         r4,        [r1 * 3]
    lea         r5,        [r0 + 4 * r1]
    pshufb      m0,        [tab_Cm]             ; replicate taps per pixel lane
    mova        m1,        [pw_512]             ; rounding constant for pmulhrsw

    movd        m2,        [r0]                 ; rows -1 .. 2 of the source column
    movd        m3,        [r0 + r1]
    movd        m4,        [r0 + 2 * r1]
    movd        m5,        [r0 + r4]

    ; output row 0: interleave its 4 source rows, then madd with the taps
    punpcklbw   m2,        m3
    punpcklbw   m6,        m4, m5
    punpcklbw   m2,        m6

    pmaddubsw   m2,        m0

    movd        m6,        [r5]

    ; output row 1
    punpcklbw   m3,        m4
    punpcklbw   m7,        m5, m6
    punpcklbw   m3,        m7

    pmaddubsw   m3,        m0

    phaddw      m2,        m3                   ; finish the 4-tap sums for rows 0/1

    pmulhrsw    m2,        m1                   ; round/normalize

    movd        m7,        [r5 + r1]

    ; output row 2
    punpcklbw   m4,        m5
    punpcklbw   m3,        m6, m7
    punpcklbw   m4,        m3

    pmaddubsw   m4,        m0

    movd        m3,        [r5 + 2 * r1]

    ; output row 3
    punpcklbw   m5,        m6
    punpcklbw   m7,        m3
    punpcklbw   m5,        m7

    pmaddubsw   m5,        m0

    phaddw      m4,        m5                   ; finish the 4-tap sums for rows 2/3

    pmulhrsw    m4,        m1
    packuswb    m2,        m4                   ; saturate all 4 rows to bytes

    pextrw      [r2],      m2, 0                ; 2 pixels per row
    pextrw      [r2 + r3], m2, 2
    lea         r2,        [r2 + 2 * r3]
    pextrw      [r2],      m2, 4
    pextrw      [r2 + r3], m2, 6

    RET
1965 | ||
1966 | ;----------------------------------------------------------------------------- | |
1967 | ; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
1968 | ;----------------------------------------------------------------------------- | |
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; SSE4 vertical 4-tap chroma interpolation for 2-wide blocks; four output
; rows per loop iteration, so %2 (height) must be a multiple of 4.
;-----------------------------------------------------------------------------
%macro FILTER_V4_W2_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8

    mov         r4d,       r4m                  ; r4d = coeffIdx
    sub         r0,        r1                   ; start one row above the block

%ifdef PIC
    lea         r5,        [tab_ChromaCoeff]
    movd        m0,        [r5 + r4 * 4]        ; load the 4 vertical taps
%else
    movd        m0,        [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0,        [tab_Cm]             ; replicate taps per pixel lane

    mova        m1,        [pw_512]             ; rounding constant for pmulhrsw

    mov         r4d,       %2                   ; remaining-row counter
    lea         r5,        [3 * r1]

.loop:
    movd        m2,        [r0]                 ; 4 consecutive source rows
    movd        m3,        [r0 + r1]
    movd        m4,        [r0 + 2 * r1]
    movd        m5,        [r0 + r5]

    ; output row 0
    punpcklbw   m2,        m3
    punpcklbw   m6,        m4, m5
    punpcklbw   m2,        m6

    pmaddubsw   m2,        m0

    lea         r0,        [r0 + 4 * r1]
    movd        m6,        [r0]

    ; output row 1
    punpcklbw   m3,        m4
    punpcklbw   m7,        m5, m6
    punpcklbw   m3,        m7

    pmaddubsw   m3,        m0

    phaddw      m2,        m3                   ; finish the 4-tap sums for rows 0/1

    pmulhrsw    m2,        m1                   ; round/normalize

    movd        m7,        [r0 + r1]

    ; output row 2
    punpcklbw   m4,        m5
    punpcklbw   m3,        m6, m7
    punpcklbw   m4,        m3

    pmaddubsw   m4,        m0

    movd        m3,        [r0 + 2 * r1]

    ; output row 3
    punpcklbw   m5,        m6
    punpcklbw   m7,        m3
    punpcklbw   m5,        m7

    pmaddubsw   m5,        m0

    phaddw      m4,        m5                   ; finish the 4-tap sums for rows 2/3

    pmulhrsw    m4,        m1
    packuswb    m2,        m4                   ; saturate all 4 rows to bytes

    pextrw      [r2],      m2, 0                ; 2 pixels per row
    pextrw      [r2 + r3], m2, 2
    lea         r2,        [r2 + 2 * r3]
    pextrw      [r2],      m2, 4
    pextrw      [r2 + r3], m2, 6

    lea         r2,        [r2 + 2 * r3]

    sub         r4,        4
    jnz         .loop
    RET
%endmacro

FILTER_V4_W2_H4 2, 8

FILTER_V4_W2_H4 2, 16
2052 | ||
2053 | ;----------------------------------------------------------------------------- | |
2054 | ; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
2055 | ;----------------------------------------------------------------------------- | |
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; SSE4 vertical 4-tap chroma interpolation, 4x2 block, straight-line
; (reads 5 source rows: one above, the two block rows, two below).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x2, 4, 6, 6

    mov         r4d,       r4m                  ; r4d = coeffIdx
    sub         r0,        r1                   ; start one row above the block

%ifdef PIC
    lea         r5,        [tab_ChromaCoeff]
    movd        m0,        [r5 + r4 * 4]        ; load the 4 vertical taps
%else
    movd        m0,        [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0,        [tab_Cm]             ; replicate taps per pixel lane
    lea         r5,        [r0 + 2 * r1]

    movd        m2,        [r0]                 ; rows -1 .. 2
    movd        m3,        [r0 + r1]
    movd        m4,        [r5]
    movd        m5,        [r5 + r1]

    ; output row 0
    punpcklbw   m2,        m3
    punpcklbw   m1,        m4, m5
    punpcklbw   m2,        m1

    pmaddubsw   m2,        m0

    movd        m1,        [r0 + 4 * r1]        ; row 3

    ; output row 1
    punpcklbw   m3,        m4
    punpcklbw   m5,        m1
    punpcklbw   m3,        m5

    pmaddubsw   m3,        m0

    phaddw      m2,        m3                   ; finish the 4-tap sums for both rows

    pmulhrsw    m2,        [pw_512]             ; round/normalize
    packuswb    m2,        m2                   ; saturate to bytes
    movd        [r2],      m2                   ; 4 pixels, row 0
    pextrd      [r2 + r3], m2, 1                ; 4 pixels, row 1

    RET
2099 | ||
2100 | ;----------------------------------------------------------------------------- | |
2101 | ; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
2102 | ;----------------------------------------------------------------------------- | |
; interp_4tap_vert_pp_4x4 (SSE4): 4-tap vertical chroma filter, 4x4 block,
; pixel in / pixel out.  r0=src, r1=srcStride, r2=dst, r3=dstStride,
; r4m=coeffIdx.  Consumes rows -1..5 (r0 rewound one row); output rounded
; via pmulhrsw with pw_512 (rounded >>6) then packed to bytes.
2103 | INIT_XMM sse4 | |
2104 | cglobal interp_4tap_vert_pp_4x4, 4, 6, 8 | |
2105 | ||
2106 | mov r4d, r4m | |
2107 | sub r0, r1 | |
2108 | ||
2109 | %ifdef PIC | |
2110 | lea r5, [tab_ChromaCoeff] | |
2111 | movd m0, [r5 + r4 * 4] | |
2112 | %else | |
2113 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
2114 | %endif | |
2115 | ||
2116 | pshufb m0, [tab_Cm] | |
b53f7c52 | 2117 | mova m1, [pw_512] |
72b9787e JB |
2118 | lea r5, [r0 + 4 * r1] |
2119 | lea r4, [r1 * 3] | |
2120 | ||
; Rows 0..3 of the padded source window.
2121 | movd m2, [r0] | |
2122 | movd m3, [r0 + r1] | |
2123 | movd m4, [r0 + 2 * r1] | |
2124 | movd m5, [r0 + r4] | |
2125 | ||
2126 | punpcklbw m2, m3 | |
2127 | punpcklbw m6, m4, m5 | |
2128 | punpcklbw m2, m6 | |
2129 | ||
2130 | pmaddubsw m2, m0 | |
2131 | ||
2132 | movd m6, [r5] | |
2133 | ||
2134 | punpcklbw m3, m4 | |
2135 | punpcklbw m7, m5, m6 | |
2136 | punpcklbw m3, m7 | |
2137 | ||
2138 | pmaddubsw m3, m0 | |
2139 | ||
; Horizontal add folds the two partial products per pixel; then round.
2140 | phaddw m2, m3 | |
2141 | ||
2142 | pmulhrsw m2, m1 | |
2143 | ||
2144 | movd m7, [r5 + r1] | |
2145 | ||
2146 | punpcklbw m4, m5 | |
2147 | punpcklbw m3, m6, m7 | |
2148 | punpcklbw m4, m3 | |
2149 | ||
2150 | pmaddubsw m4, m0 | |
2151 | ||
2152 | movd m3, [r5 + 2 * r1] | |
2153 | ||
2154 | punpcklbw m5, m6 | |
2155 | punpcklbw m7, m3 | |
2156 | punpcklbw m5, m7 | |
2157 | ||
2158 | pmaddubsw m5, m0 | |
2159 | ||
2160 | phaddw m4, m5 | |
2161 | ||
2162 | pmulhrsw m4, m1 | |
2163 | ||
; Pack the 4 output rows (4 bytes each) and store one dword per row.
2164 | packuswb m2, m4 | |
2165 | movd [r2], m2 | |
2166 | pextrd [r2 + r3], m2, 1 | |
2167 | lea r2, [r2 + 2 * r3] | |
2168 | pextrd [r2], m2, 2 | |
2169 | pextrd [r2 + r3], m2, 3 | |
2170 | ||
2171 | RET | |
2172 | ||
b53f7c52 JB |
; interp_4tap_vert_pp_4x4 (AVX2): same contract as the SSE4 version above,
; but processes all 4 output rows in one 256-bit register.  coeffIdx is
; scaled by 64 (shl 6) to index the 32-byte-replicated coefficient table
; tab_ChromaCoeffVer_32; vpermd/pshufb build the per-row tap pairs.
2173 | INIT_YMM avx2 | |
2174 | cglobal interp_4tap_vert_pp_4x4, 4, 6, 3 | |
2175 | mov r4d, r4m | |
2176 | shl r4d, 6 | |
2177 | sub r0, r1 | |
2178 | ||
2179 | %ifdef PIC | |
2180 | lea r5, [tab_ChromaCoeffVer_32] | |
2181 | add r5, r4 | |
2182 | %else | |
2183 | lea r5, [tab_ChromaCoeffVer_32 + r4] | |
2184 | %endif | |
2185 | ||
2186 | lea r4, [r1 * 3] | |
2187 | ||
; Gather 7 source rows (4 bytes each) into one ymm: row[x 6 5 4 3 2 1 0].
2188 | movd xm1, [r0] | |
2189 | pinsrd xm1, [r0 + r1], 1 | |
2190 | pinsrd xm1, [r0 + r1 * 2], 2 | |
2191 | pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] | |
2192 | lea r0, [r0 + r1 * 4] | |
2193 | movd xm2, [r0] | |
2194 | pinsrd xm2, [r0 + r1], 1 | |
2195 | pinsrd xm2, [r0 + r1 * 2], 2 ; m2 = row[x 6 5 4] | |
2196 | vinserti128 m1, m1, xm2, 1 ; m1 = row[x 6 5 4 3 2 1 0] | |
2197 | mova m2, [interp4_vpp_shuf1] | |
2198 | vpermd m0, m2, m1 ; m0 = row[4 3 3 2 2 1 1 0] | |
2199 | mova m2, [interp4_vpp_shuf1 + mmsize] | |
2200 | vpermd m1, m2, m1 ; m1 = row[6 5 5 4 4 3 3 2] | |
2201 | ||
2202 | mova m2, [interp4_vpp_shuf] | |
2203 | pshufb m0, m0, m2 | |
2204 | pshufb m1, m1, m2 | |
2205 | pmaddubsw m0, [r5] | |
2206 | pmaddubsw m1, [r5 + mmsize] | |
2207 | paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] | |
; Rounded >>6 via pmulhrsw with 512, pack to bytes, store 4 dword rows.
2208 | pmulhrsw m0, [pw_512] | |
2209 | vextracti128 xm1, m0, 1 | |
2210 | packuswb xm0, xm1 | |
2211 | lea r5, [r3 * 3] | |
2212 | movd [r2], xm0 | |
2213 | pextrd [r2 + r3], xm0, 1 | |
2214 | pextrd [r2 + r3 * 2], xm0, 2 | |
2215 | pextrd [r2 + r5], xm0, 3 | |
2216 | RET | |
2217 | ||
72b9787e JB |
2218 | ;----------------------------------------------------------------------------- |
2219 | ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
2220 | ;----------------------------------------------------------------------------- | |
; FILTER_V4_W4_H4 %1=width(4) %2=height: 4-wide vertical chroma pp filter.
; Same per-4-rows scheme as interp_4tap_vert_pp_4x4, looped height/4 times.
; r4 counts remaining rows; r0/r2 advance 4 source/dest rows per iteration.
2221 | %macro FILTER_V4_W4_H4 2 | |
2222 | INIT_XMM sse4 | |
2223 | cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 | |
2224 | ||
2225 | mov r4d, r4m | |
2226 | sub r0, r1 | |
2227 | ||
2228 | %ifdef PIC | |
2229 | lea r5, [tab_ChromaCoeff] | |
2230 | movd m0, [r5 + r4 * 4] | |
2231 | %else | |
2232 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
2233 | %endif | |
2234 | ||
2235 | pshufb m0, [tab_Cm] | |
2236 | ||
b53f7c52 | 2237 | mova m1, [pw_512] |
72b9787e JB |
2238 | |
2239 | mov r4d, %2 | |
2240 | ||
2241 | lea r5, [3 * r1] | |
2242 | ||
2243 | .loop: | |
; Rows n..n+3 of the window feed output rows n..n+1 ... n+3 below.
2244 | movd m2, [r0] | |
2245 | movd m3, [r0 + r1] | |
2246 | movd m4, [r0 + 2 * r1] | |
2247 | movd m5, [r0 + r5] | |
2248 | ||
2249 | punpcklbw m2, m3 | |
2250 | punpcklbw m6, m4, m5 | |
2251 | punpcklbw m2, m6 | |
2252 | ||
2253 | pmaddubsw m2, m0 | |
2254 | ||
2255 | lea r0, [r0 + 4 * r1] | |
2256 | movd m6, [r0] | |
2257 | ||
2258 | punpcklbw m3, m4 | |
2259 | punpcklbw m7, m5, m6 | |
2260 | punpcklbw m3, m7 | |
2261 | ||
2262 | pmaddubsw m3, m0 | |
2263 | ||
2264 | phaddw m2, m3 | |
2265 | ||
2266 | pmulhrsw m2, m1 | |
2267 | ||
2268 | movd m7, [r0 + r1] | |
2269 | ||
2270 | punpcklbw m4, m5 | |
2271 | punpcklbw m3, m6, m7 | |
2272 | punpcklbw m4, m3 | |
2273 | ||
2274 | pmaddubsw m4, m0 | |
2275 | ||
2276 | movd m3, [r0 + 2 * r1] | |
2277 | ||
2278 | punpcklbw m5, m6 | |
2279 | punpcklbw m7, m3 | |
2280 | punpcklbw m5, m7 | |
2281 | ||
2282 | pmaddubsw m5, m0 | |
2283 | ||
2284 | phaddw m4, m5 | |
2285 | ||
2286 | pmulhrsw m4, m1 | |
2287 | packuswb m2, m4 | |
; Store the 4 finished rows, one dword each.
2288 | movd [r2], m2 | |
2289 | pextrd [r2 + r3], m2, 1 | |
2290 | lea r2, [r2 + 2 * r3] | |
2291 | pextrd [r2], m2, 2 | |
2292 | pextrd [r2 + r3], m2, 3 | |
2293 | ||
2294 | lea r2, [r2 + 2 * r3] | |
2295 | ||
2296 | sub r4, 4 | |
2297 | jnz .loop | |
2298 | RET | |
2299 | %endmacro | |
2300 | ||
; Instantiate 4-wide vertical chroma pp filters: 4x8, 4x16, 4x32.
2301 | FILTER_V4_W4_H4 4, 8 | |
2302 | FILTER_V4_W4_H4 4, 16 | |
2303 | ||
2304 | FILTER_V4_W4_H4 4, 32 | |
2305 | ||
; FILTER_V4_W8_H2: body fragment producing one extra 8-wide pp output row.
; Relies on registers left live by FILTER_V4_W8: m1..m3 = previous source
; rows, m0 = newest row, m5/m6 = coefficient pairs, m4 = pw_512 rounder.
; Result (8 packed bytes) is left in m1; the caller stores it.
2306 | %macro FILTER_V4_W8_H2 0 | |
2307 | punpcklbw m1, m2 | |
2308 | punpcklbw m7, m3, m0 | |
2309 | ||
2310 | pmaddubsw m1, m6 | |
2311 | pmaddubsw m7, m5 | |
2312 | ||
2313 | paddw m1, m7 | |
2314 | ||
2315 | pmulhrsw m1, m4 | |
2316 | packuswb m1, m1 | |
2317 | %endmacro | |
2318 | ||
; FILTER_V4_W8_H3: like FILTER_V4_W8_H2 but rotated one row: rows m2..m0
; plus newest row m1; packed result left in m2 for the caller to store.
2319 | %macro FILTER_V4_W8_H3 0 | |
2320 | punpcklbw m2, m3 | |
2321 | punpcklbw m7, m0, m1 | |
2322 | ||
2323 | pmaddubsw m2, m6 | |
2324 | pmaddubsw m7, m5 | |
2325 | ||
2326 | paddw m2, m7 | |
2327 | ||
2328 | pmulhrsw m2, m4 | |
2329 | packuswb m2, m2 | |
2330 | %endmacro | |
2331 | ||
; FILTER_V4_W8_H4: next rotation of the 4-row window (m3,m0,m1,m2);
; packed result left in m3 for the caller to store.
2332 | %macro FILTER_V4_W8_H4 0 | |
2333 | punpcklbw m3, m0 | |
2334 | punpcklbw m7, m1, m2 | |
2335 | ||
2336 | pmaddubsw m3, m6 | |
2337 | pmaddubsw m7, m5 | |
2338 | ||
2339 | paddw m3, m7 | |
2340 | ||
2341 | pmulhrsw m3, m4 | |
2342 | packuswb m3, m3 | |
2343 | %endmacro | |
2344 | ||
; FILTER_V4_W8_H5: final rotation of the 4-row window (m0,m1,m2,m3);
; packed result left in m0 for the caller to store.
2345 | %macro FILTER_V4_W8_H5 0 | |
2346 | punpcklbw m0, m1 | |
2347 | punpcklbw m7, m2, m3 | |
2348 | ||
2349 | pmaddubsw m0, m6 | |
2350 | pmaddubsw m7, m5 | |
2351 | ||
2352 | paddw m0, m7 | |
2353 | ||
2354 | pmulhrsw m0, m4 | |
2355 | packuswb m0, m0 | |
2356 | %endmacro | |
2357 | ||
; FILTER_V4_W8_8x2: emit an 8x2 pp function: FILTER_V4_W8 produces row 0,
; then one more source row + FILTER_V4_W8_H2 produce row 1.
2358 | %macro FILTER_V4_W8_8x2 2 | |
2359 | FILTER_V4_W8 %1, %2 | |
2360 | movq m0, [r0 + 4 * r1] | |
2361 | ||
2362 | FILTER_V4_W8_H2 | |
2363 | ||
2364 | movh [r2 + r3], m1 | |
2365 | %endmacro | |
2366 | ||
; FILTER_V4_W8_8x4: extend the 8x2 body with rows 2 and 3, loading one new
; source row per output row and rotating the window via the _H3/_H4 helpers.
2367 | %macro FILTER_V4_W8_8x4 2 | |
2368 | FILTER_V4_W8_8x2 %1, %2 | |
2369 | ;8x3 | |
2370 | lea r6, [r0 + 4 * r1] | |
2371 | movq m1, [r6 + r1] | |
2372 | ||
2373 | FILTER_V4_W8_H3 | |
2374 | ||
2375 | movh [r2 + 2 * r3], m2 | |
2376 | ||
2377 | ;8x4 | |
2378 | movq m2, [r6 + 2 * r1] | |
2379 | ||
2380 | FILTER_V4_W8_H4 | |
2381 | ||
2382 | lea r5, [r2 + 2 * r3] | |
2383 | movh [r5 + r3], m3 | |
2384 | %endmacro | |
2385 | ||
; FILTER_V4_W8_8x6: extend the 8x4 body with rows 4 and 5 (_H5 then a
; second _H2 pass after loading src row 7 at r0 + 8*r1).
2386 | %macro FILTER_V4_W8_8x6 2 | |
2387 | FILTER_V4_W8_8x4 %1, %2 | |
2388 | ;8x5 | |
2389 | lea r6, [r6 + 2 * r1] | |
2390 | movq m3, [r6 + r1] | |
2391 | ||
2392 | FILTER_V4_W8_H5 | |
2393 | ||
2394 | movh [r2 + 4 * r3], m0 | |
2395 | ||
2396 | ;8x6 | |
2397 | movq m0, [r0 + 8 * r1] | |
2398 | ||
2399 | FILTER_V4_W8_H2 | |
2400 | ||
2401 | lea r5, [r2 + 4 * r3] | |
2402 | movh [r5 + r3], m1 | |
2403 | %endmacro | |
2404 | ||
2405 | ;----------------------------------------------------------------------------- | |
2406 | ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
2407 | ;----------------------------------------------------------------------------- | |
; FILTER_V4_W8: prologue + first output row for the 8-wide pp functions.
; Leaves live for the _H2.._H5 helpers: m1..m3 = source rows 1..3,
; m6/m5 = tap pairs (tab_Vm split), m4 = pw_512 rounder.  No RET here —
; the FILTER_V4_W8_8x* wrappers append further rows and the final RET.
2408 | %macro FILTER_V4_W8 2 | |
2409 | INIT_XMM sse4 | |
2410 | cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 | |
2411 | ||
2412 | mov r4d, r4m | |
2413 | ||
2414 | sub r0, r1 | |
2415 | movq m0, [r0] | |
2416 | movq m1, [r0 + r1] | |
2417 | movq m2, [r0 + 2 * r1] | |
2418 | lea r5, [r0 + 2 * r1] | |
2419 | movq m3, [r5 + r1] | |
2420 | ||
2421 | punpcklbw m0, m1 | |
2422 | punpcklbw m4, m2, m3 | |
2423 | ||
2424 | %ifdef PIC | |
2425 | lea r6, [tab_ChromaCoeff] | |
2426 | movd m5, [r6 + r4 * 4] | |
2427 | %else | |
2428 | movd m5, [tab_ChromaCoeff + r4 * 4] | |
2429 | %endif | |
2430 | ||
2431 | pshufb m6, m5, [tab_Vm] | |
2432 | pmaddubsw m0, m6 | |
2433 | ||
2434 | pshufb m5, [tab_Vm + 16] | |
2435 | pmaddubsw m4, m5 | |
2436 | ||
2437 | paddw m0, m4 | |
2438 | ||
b53f7c52 | 2439 | mova m4, [pw_512] |
72b9787e JB |
2440 | |
2441 | pmulhrsw m0, m4 | |
2442 | packuswb m0, m0 | |
2443 | movh [r2], m0 | |
2444 | %endmacro | |
2445 | ||
2446 | ;----------------------------------------------------------------------------- | |
2447 | ; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
2448 | ;----------------------------------------------------------------------------- | |
2449 | FILTER_V4_W8_8x2 8, 2 | |
2450 | ||
2451 | RET | |
2452 | ||
2453 | ;----------------------------------------------------------------------------- | |
2454 | ; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
2455 | ;----------------------------------------------------------------------------- | |
2456 | FILTER_V4_W8_8x4 8, 4 | |
2457 | ||
2458 | RET | |
2459 | ||
2460 | ;----------------------------------------------------------------------------- | |
2461 | ; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
2462 | ;----------------------------------------------------------------------------- | |
2463 | FILTER_V4_W8_8x6 8, 6 | |
2464 | ||
2465 | RET | |
2466 | ||
2467 | ;------------------------------------------------------------------------------------------------------------- | |
2468 | ; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
2469 | ;------------------------------------------------------------------------------------------------------------- | |
; interp_4tap_vert_ps_4x2 (SSE4): pixel in / int16_t out ("ps").  Output
; keeps full precision: no rounding shift, instead subtract pw_2000 bias.
; dstStride is doubled (add r3d, r3d) because dst elements are 2 bytes.
2470 | INIT_XMM sse4 | |
2471 | cglobal interp_4tap_vert_ps_4x2, 4, 6, 6 | |
2472 | ||
2473 | mov r4d, r4m | |
2474 | sub r0, r1 | |
2475 | add r3d, r3d | |
2476 | ||
2477 | %ifdef PIC | |
2478 | lea r5, [tab_ChromaCoeff] | |
2479 | movd m0, [r5 + r4 * 4] | |
2480 | %else | |
2481 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
2482 | %endif | |
2483 | ||
2484 | pshufb m0, [tab_Cm] | |
2485 | ||
2486 | movd m2, [r0] | |
2487 | movd m3, [r0 + r1] | |
2488 | lea r5, [r0 + 2 * r1] | |
2489 | movd m4, [r5] | |
2490 | movd m5, [r5 + r1] | |
2491 | ||
2492 | punpcklbw m2, m3 | |
2493 | punpcklbw m1, m4, m5 | |
2494 | punpcklbw m2, m1 | |
2495 | ||
2496 | pmaddubsw m2, m0 | |
2497 | ||
2498 | movd m1, [r0 + 4 * r1] | |
2499 | ||
2500 | punpcklbw m3, m4 | |
2501 | punpcklbw m5, m1 | |
2502 | punpcklbw m3, m5 | |
2503 | ||
2504 | pmaddubsw m3, m0 | |
2505 | ||
2506 | phaddw m2, m3 | |
2507 | ||
; Bias then store both 4-sample rows (low/high qwords of m2).
2508 | psubw m2, [pw_2000] | |
2509 | movh [r2], m2 | |
2510 | movhps [r2 + r3], m2 | |
2511 | ||
2512 | RET | |
2513 | ||
2514 | ;------------------------------------------------------------------------------------------------------------- | |
2515 | ; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
2516 | ;------------------------------------------------------------------------------------------------------------- | |
; interp_4tap_vert_ps_4x4 (SSE4): 4x4 ps variant — same windowing as the
; pp 4x4 kernel, but results stay 16-bit, biased by pw_2000, and dstStride
; is in int16_t units (doubled on entry).
2517 | INIT_XMM sse4 | |
2518 | cglobal interp_4tap_vert_ps_4x4, 4, 6, 7 | |
2519 | ||
2520 | mov r4d, r4m | |
2521 | sub r0, r1 | |
2522 | add r3d, r3d | |
2523 | ||
2524 | %ifdef PIC | |
2525 | lea r5, [tab_ChromaCoeff] | |
2526 | movd m0, [r5 + r4 * 4] | |
2527 | %else | |
2528 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
2529 | %endif | |
2530 | ||
2531 | pshufb m0, [tab_Cm] | |
2532 | ||
2533 | lea r4, [r1 * 3] | |
2534 | lea r5, [r0 + 4 * r1] | |
2535 | ||
2536 | movd m2, [r0] | |
2537 | movd m3, [r0 + r1] | |
2538 | movd m4, [r0 + 2 * r1] | |
2539 | movd m5, [r0 + r4] | |
2540 | ||
2541 | punpcklbw m2, m3 | |
2542 | punpcklbw m6, m4, m5 | |
2543 | punpcklbw m2, m6 | |
2544 | ||
2545 | pmaddubsw m2, m0 | |
2546 | ||
2547 | movd m6, [r5] | |
2548 | ||
2549 | punpcklbw m3, m4 | |
2550 | punpcklbw m1, m5, m6 | |
2551 | punpcklbw m3, m1 | |
2552 | ||
2553 | pmaddubsw m3, m0 | |
2554 | ||
2555 | phaddw m2, m3 | |
2556 | ||
2557 | mova m1, [pw_2000] | |
2558 | ||
2559 | psubw m2, m1 | |
2560 | movh [r2], m2 | |
2561 | movhps [r2 + r3], m2 | |
2562 | ||
2563 | movd m2, [r5 + r1] | |
2564 | ||
2565 | punpcklbw m4, m5 | |
2566 | punpcklbw m3, m6, m2 | |
2567 | punpcklbw m4, m3 | |
2568 | ||
2569 | pmaddubsw m4, m0 | |
2570 | ||
2571 | movd m3, [r5 + 2 * r1] | |
2572 | ||
2573 | punpcklbw m5, m6 | |
2574 | punpcklbw m2, m3 | |
2575 | punpcklbw m5, m2 | |
2576 | ||
2577 | pmaddubsw m5, m0 | |
2578 | ||
2579 | phaddw m4, m5 | |
2580 | ||
2581 | psubw m4, m1 | |
2582 | lea r2, [r2 + 2 * r3] | |
2583 | movh [r2], m4 | |
2584 | movhps [r2 + r3], m4 | |
2585 | ||
2586 | RET | |
2587 | ||
2588 | ;--------------------------------------------------------------------------------------------------------------- | |
2589 | ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
2590 | ;--------------------------------------------------------------------------------------------------------------- | |
; FILTER_V_PS_W4_H4 %1=width(4) %2=height: looped 4-wide ps filter,
; 4 output rows per iteration (%2/4 iterations).  16-bit output with
; pw_2000 bias; dstStride doubled for int16_t elements.
2591 | %macro FILTER_V_PS_W4_H4 2 | |
2592 | INIT_XMM sse4 | |
2593 | cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 | |
2594 | ||
2595 | mov r4d, r4m | |
2596 | sub r0, r1 | |
2597 | add r3d, r3d | |
2598 | ||
2599 | %ifdef PIC | |
2600 | lea r5, [tab_ChromaCoeff] | |
2601 | movd m0, [r5 + r4 * 4] | |
2602 | %else | |
2603 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
2604 | %endif | |
2605 | ||
2606 | pshufb m0, [tab_Cm] | |
2607 | ||
2608 | mova m1, [pw_2000] | |
2609 | ||
2610 | mov r4d, %2/4 | |
2611 | lea r5, [3 * r1] | |
2612 | ||
2613 | .loop: | |
2614 | movd m2, [r0] | |
2615 | movd m3, [r0 + r1] | |
2616 | movd m4, [r0 + 2 * r1] | |
2617 | movd m5, [r0 + r5] | |
2618 | ||
2619 | punpcklbw m2, m3 | |
2620 | punpcklbw m6, m4, m5 | |
2621 | punpcklbw m2, m6 | |
2622 | ||
2623 | pmaddubsw m2, m0 | |
2624 | ||
2625 | lea r0, [r0 + 4 * r1] | |
2626 | movd m6, [r0] | |
2627 | ||
2628 | punpcklbw m3, m4 | |
2629 | punpcklbw m7, m5, m6 | |
2630 | punpcklbw m3, m7 | |
2631 | ||
2632 | pmaddubsw m3, m0 | |
2633 | ||
2634 | phaddw m2, m3 | |
2635 | ||
2636 | psubw m2, m1 | |
2637 | movh [r2], m2 | |
2638 | movhps [r2 + r3], m2 | |
2639 | ||
2640 | movd m2, [r0 + r1] | |
2641 | ||
2642 | punpcklbw m4, m5 | |
2643 | punpcklbw m3, m6, m2 | |
2644 | punpcklbw m4, m3 | |
2645 | ||
2646 | pmaddubsw m4, m0 | |
2647 | ||
2648 | movd m3, [r0 + 2 * r1] | |
2649 | ||
2650 | punpcklbw m5, m6 | |
2651 | punpcklbw m2, m3 | |
2652 | punpcklbw m5, m2 | |
2653 | ||
2654 | pmaddubsw m5, m0 | |
2655 | ||
2656 | phaddw m4, m5 | |
2657 | ||
2658 | psubw m4, m1 | |
2659 | lea r2, [r2 + 2 * r3] | |
2660 | movh [r2], m4 | |
2661 | movhps [r2 + r3], m4 | |
2662 | ||
2663 | lea r2, [r2 + 2 * r3] | |
2664 | ||
2665 | dec r4d | |
2666 | jnz .loop | |
2667 | RET | |
2668 | %endmacro | |
2669 | ||
; Instantiate 4-wide vertical chroma ps filters: 4x8, 4x16, 4x32.
2670 | FILTER_V_PS_W4_H4 4, 8 | |
2671 | FILTER_V_PS_W4_H4 4, 16 | |
2672 | ||
2673 | FILTER_V_PS_W4_H4 4, 32 | |
2674 | ||
2675 | ;-------------------------------------------------------------------------------------------------------------- | |
2676 | ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
2677 | ;-------------------------------------------------------------------------------------------------------------- | |
; FILTER_V_PS_W8_H8_H16_H2 %1=width(8) %2=height: 8-wide ps filter,
; 2 output rows per iteration (%2/2 iterations), advancing src/dst by
; 2 rows each pass.  m6/m5 = tap pairs, m4 = pw_2000 bias.
2678 | %macro FILTER_V_PS_W8_H8_H16_H2 2 | |
2679 | INIT_XMM sse4 | |
2680 | cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7 | |
2681 | ||
2682 | mov r4d, r4m | |
2683 | sub r0, r1 | |
2684 | add r3d, r3d | |
2685 | ||
2686 | %ifdef PIC | |
2687 | lea r5, [tab_ChromaCoeff] | |
2688 | movd m5, [r5 + r4 * 4] | |
2689 | %else | |
2690 | movd m5, [tab_ChromaCoeff + r4 * 4] | |
2691 | %endif | |
2692 | ||
2693 | pshufb m6, m5, [tab_Vm] | |
2694 | pshufb m5, [tab_Vm + 16] | |
2695 | mova m4, [pw_2000] | |
2696 | ||
2697 | mov r4d, %2/2 | |
2698 | lea r5, [3 * r1] | |
2699 | ||
2700 | .loopH: | |
2701 | movq m0, [r0] | |
2702 | movq m1, [r0 + r1] | |
2703 | movq m2, [r0 + 2 * r1] | |
2704 | movq m3, [r0 + r5] | |
2705 | ||
2706 | punpcklbw m0, m1 | |
2707 | punpcklbw m1, m2 | |
2708 | punpcklbw m2, m3 | |
2709 | ||
2710 | pmaddubsw m0, m6 | |
2711 | pmaddubsw m2, m5 | |
2712 | ||
2713 | paddw m0, m2 | |
2714 | ||
2715 | psubw m0, m4 | |
2716 | movu [r2], m0 | |
2717 | ||
2718 | movq m0, [r0 + 4 * r1] | |
2719 | ||
2720 | punpcklbw m3, m0 | |
2721 | ||
2722 | pmaddubsw m1, m6 | |
2723 | pmaddubsw m3, m5 | |
2724 | ||
2725 | paddw m1, m3 | |
2726 | psubw m1, m4 | |
2727 | ||
2728 | movu [r2 + r3], m1 | |
2729 | ||
2730 | lea r0, [r0 + 2 * r1] | |
2731 | lea r2, [r2 + 2 * r3] | |
2732 | ||
2733 | dec r4d | |
2734 | jnz .loopH | |
2735 | ||
2736 | RET | |
2737 | %endmacro | |
2738 | ||
; Instantiate 8-wide ps filters handled 2 rows per loop: heights 2,4,6,12,64.
2739 | FILTER_V_PS_W8_H8_H16_H2 8, 2 | |
2740 | FILTER_V_PS_W8_H8_H16_H2 8, 4 | |
2741 | FILTER_V_PS_W8_H8_H16_H2 8, 6 | |
2742 | ||
2743 | FILTER_V_PS_W8_H8_H16_H2 8, 12 | |
2744 | FILTER_V_PS_W8_H8_H16_H2 8, 64 | |
2745 | ||
2746 | ;-------------------------------------------------------------------------------------------------------------- | |
2747 | ; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
2748 | ;-------------------------------------------------------------------------------------------------------------- | |
; FILTER_V_PS_W8_H8_H16_H32 %1=width(8) %2=height: 8-wide ps filter,
; 4 output rows per iteration (%2/4 iterations), rotating the 4-row
; window through m0..m3 and reloading one fresh row per output row.
2749 | %macro FILTER_V_PS_W8_H8_H16_H32 2 | |
2750 | INIT_XMM sse4 | |
2751 | cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 | |
2752 | ||
2753 | mov r4d, r4m | |
2754 | sub r0, r1 | |
2755 | add r3d, r3d | |
2756 | ||
2757 | %ifdef PIC | |
2758 | lea r5, [tab_ChromaCoeff] | |
2759 | movd m5, [r5 + r4 * 4] | |
2760 | %else | |
2761 | movd m5, [tab_ChromaCoeff + r4 * 4] | |
2762 | %endif | |
2763 | ||
2764 | pshufb m6, m5, [tab_Vm] | |
2765 | pshufb m5, [tab_Vm + 16] | |
2766 | mova m4, [pw_2000] | |
2767 | ||
2768 | mov r4d, %2/4 | |
2769 | lea r5, [3 * r1] | |
2770 | ||
2771 | .loop: | |
2772 | movq m0, [r0] | |
2773 | movq m1, [r0 + r1] | |
2774 | movq m2, [r0 + 2 * r1] | |
2775 | movq m3, [r0 + r5] | |
2776 | ||
2777 | punpcklbw m0, m1 | |
2778 | punpcklbw m1, m2 | |
2779 | punpcklbw m2, m3 | |
2780 | ||
2781 | pmaddubsw m0, m6 | |
2782 | pmaddubsw m7, m2, m5 | |
2783 | ||
2784 | paddw m0, m7 | |
2785 | ||
2786 | psubw m0, m4 | |
2787 | movu [r2], m0 | |
2788 | ||
2789 | lea r0, [r0 + 4 * r1] | |
2790 | movq m0, [r0] | |
2791 | ||
2792 | punpcklbw m3, m0 | |
2793 | ||
2794 | pmaddubsw m1, m6 | |
2795 | pmaddubsw m7, m3, m5 | |
2796 | ||
2797 | paddw m1, m7 | |
2798 | ||
2799 | psubw m1, m4 | |
2800 | movu [r2 + r3], m1 | |
2801 | ||
2802 | movq m1, [r0 + r1] | |
2803 | ||
2804 | punpcklbw m0, m1 | |
2805 | ||
2806 | pmaddubsw m2, m6 | |
2807 | pmaddubsw m0, m5 | |
2808 | ||
2809 | paddw m2, m0 | |
2810 | ||
2811 | psubw m2, m4 | |
2812 | lea r2, [r2 + 2 * r3] | |
2813 | movu [r2], m2 | |
2814 | ||
2815 | movq m2, [r0 + 2 * r1] | |
2816 | ||
2817 | punpcklbw m1, m2 | |
2818 | ||
2819 | pmaddubsw m3, m6 | |
2820 | pmaddubsw m1, m5 | |
2821 | ||
2822 | paddw m3, m1 | |
2823 | psubw m3, m4 | |
2824 | ||
2825 | movu [r2 + r3], m3 | |
2826 | ||
2827 | lea r2, [r2 + 2 * r3] | |
2828 | ||
2829 | dec r4d | |
2830 | jnz .loop | |
2831 | RET | |
2832 | %endmacro | |
2833 | ||
; Instantiate 8-wide ps filters handled 4 rows per loop: heights 8,16,32.
2834 | FILTER_V_PS_W8_H8_H16_H32 8, 8 | |
2835 | FILTER_V_PS_W8_H8_H16_H32 8, 16 | |
2836 | FILTER_V_PS_W8_H8_H16_H32 8, 32 | |
2837 | ||
2838 | ;------------------------------------------------------------------------------------------------------------ | |
2839 | ;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
2840 | ;------------------------------------------------------------------------------------------------------------ | |
; FILTER_V_PS_W6 %1=width(6) %2=height: 6-wide ps filter.  Computes 8
; samples per row like the W8 kernel but stores only 6: a qword (4
; int16) plus one dword (2 int16) extracted via pshufd.
2841 | %macro FILTER_V_PS_W6 2 | |
2842 | INIT_XMM sse4 | |
2843 | cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8 | |
2844 | ||
2845 | mov r4d, r4m | |
2846 | sub r0, r1 | |
2847 | add r3d, r3d | |
2848 | ||
2849 | %ifdef PIC | |
2850 | lea r5, [tab_ChromaCoeff] | |
2851 | movd m5, [r5 + r4 * 4] | |
2852 | %else | |
2853 | movd m5, [tab_ChromaCoeff + r4 * 4] | |
2854 | %endif | |
2855 | ||
2856 | pshufb m6, m5, [tab_Vm] | |
2857 | pshufb m5, [tab_Vm + 16] | |
2858 | mova m4, [pw_2000] | |
2859 | lea r5, [3 * r1] | |
2860 | mov r4d, %2/4 | |
2861 | ||
2862 | .loop: | |
2863 | movq m0, [r0] | |
2864 | movq m1, [r0 + r1] | |
2865 | movq m2, [r0 + 2 * r1] | |
2866 | movq m3, [r0 + r5] | |
2867 | ||
2868 | punpcklbw m0, m1 | |
2869 | punpcklbw m1, m2 | |
2870 | punpcklbw m2, m3 | |
2871 | ||
2872 | pmaddubsw m0, m6 | |
2873 | pmaddubsw m7, m2, m5 | |
2874 | ||
2875 | paddw m0, m7 | |
2876 | psubw m0, m4 | |
2877 | ||
; Store 4+2 int16 samples: qword then dword 2 of the register.
2878 | movh [r2], m0 | |
2879 | pshufd m0, m0, 2 | |
2880 | movd [r2 + 8], m0 | |
2881 | ||
2882 | lea r0, [r0 + 4 * r1] | |
2883 | movq m0, [r0] | |
2884 | punpcklbw m3, m0 | |
2885 | ||
2886 | pmaddubsw m1, m6 | |
2887 | pmaddubsw m7, m3, m5 | |
2888 | ||
2889 | paddw m1, m7 | |
2890 | psubw m1, m4 | |
2891 | ||
2892 | movh [r2 + r3], m1 | |
2893 | pshufd m1, m1, 2 | |
2894 | movd [r2 + r3 + 8], m1 | |
2895 | ||
2896 | movq m1, [r0 + r1] | |
2897 | punpcklbw m0, m1 | |
2898 | ||
2899 | pmaddubsw m2, m6 | |
2900 | pmaddubsw m0, m5 | |
2901 | ||
2902 | paddw m2, m0 | |
2903 | psubw m2, m4 | |
2904 | ||
2905 | lea r2,[r2 + 2 * r3] | |
2906 | movh [r2], m2 | |
2907 | pshufd m2, m2, 2 | |
2908 | movd [r2 + 8], m2 | |
2909 | ||
2910 | movq m2,[r0 + 2 * r1] | |
2911 | punpcklbw m1, m2 | |
2912 | ||
2913 | pmaddubsw m3, m6 | |
2914 | pmaddubsw m1, m5 | |
2915 | ||
2916 | paddw m3, m1 | |
2917 | psubw m3, m4 | |
2918 | ||
2919 | movh [r2 + r3], m3 | |
2920 | pshufd m3, m3, 2 | |
2921 | movd [r2 + r3 + 8], m3 | |
2922 | ||
2923 | lea r2, [r2 + 2 * r3] | |
2924 | ||
2925 | dec r4d | |
2926 | jnz .loop | |
2927 | RET | |
2928 | %endmacro | |
2929 | ||
; Instantiate 6-wide vertical chroma ps filters: 6x8 and 6x16.
2930 | FILTER_V_PS_W6 6, 8 | |
2931 | FILTER_V_PS_W6 6, 16 | |
2932 | ||
2933 | ;--------------------------------------------------------------------------------------------------------------- | |
2934 | ; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
2935 | ;--------------------------------------------------------------------------------------------------------------- | |
; FILTER_V_PS_W12 %1=width(12) %2=height: 12-wide ps filter, 2 rows per
; iteration.  Full 16-byte rows are filtered (punpcklbw/punpckhbw halves);
; stores write 16 bytes (movu) + 8 bytes (movh) = 12 int16 per row.
2936 | %macro FILTER_V_PS_W12 2 | |
2937 | INIT_XMM sse4 | |
2938 | cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8 | |
2939 | ||
2940 | mov r4d, r4m | |
2941 | sub r0, r1 | |
2942 | add r3d, r3d | |
2943 | ||
2944 | %ifdef PIC | |
2945 | lea r5, [tab_ChromaCoeff] | |
2946 | movd m0, [r5 + r4 * 4] | |
2947 | %else | |
2948 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
2949 | %endif | |
2950 | ||
2951 | pshufb m1, m0, [tab_Vm] | |
2952 | pshufb m0, [tab_Vm + 16] | |
2953 | ||
2954 | mov r4d, %2/2 | |
2955 | ||
2956 | .loop: | |
2957 | movu m2, [r0] | |
2958 | movu m3, [r0 + r1] | |
2959 | ||
2960 | punpcklbw m4, m2, m3 | |
2961 | punpckhbw m2, m3 | |
2962 | ||
2963 | pmaddubsw m4, m1 | |
2964 | pmaddubsw m2, m1 | |
2965 | ||
2966 | lea r0, [r0 + 2 * r1] | |
2967 | movu m5, [r0] | |
2968 | movu m7, [r0 + r1] | |
2969 | ||
2970 | punpcklbw m6, m5, m7 | |
2971 | pmaddubsw m6, m0 | |
2972 | paddw m4, m6 | |
2973 | ||
2974 | punpckhbw m6, m5, m7 | |
2975 | pmaddubsw m6, m0 | |
2976 | paddw m2, m6 | |
2977 | ||
2978 | mova m6, [pw_2000] | |
2979 | ||
2980 | psubw m4, m6 | |
2981 | psubw m2, m6 | |
2982 | ||
2983 | movu [r2], m4 | |
2984 | movh [r2 + 16], m2 | |
2985 | ||
2986 | punpcklbw m4, m3, m5 | |
2987 | punpckhbw m3, m5 | |
2988 | ||
2989 | pmaddubsw m4, m1 | |
2990 | pmaddubsw m3, m1 | |
2991 | ||
2992 | movu m2, [r0 + 2 * r1] | |
2993 | ||
2994 | punpcklbw m5, m7, m2 | |
2995 | punpckhbw m7, m2 | |
2996 | ||
2997 | pmaddubsw m5, m0 | |
2998 | pmaddubsw m7, m0 | |
2999 | ||
3000 | paddw m4, m5 | |
3001 | paddw m3, m7 | |
3002 | ||
3003 | psubw m4, m6 | |
3004 | psubw m3, m6 | |
3005 | ||
3006 | movu [r2 + r3], m4 | |
3007 | movh [r2 + r3 + 16], m3 | |
3008 | ||
3009 | lea r2, [r2 + 2 * r3] | |
3010 | ||
3011 | dec r4d | |
3012 | jnz .loop | |
3013 | RET | |
3014 | %endmacro | |
3015 | ||
; Instantiate 12-wide vertical chroma ps filters: 12x16 and 12x32.
3016 | FILTER_V_PS_W12 12, 16 | |
3017 | FILTER_V_PS_W12 12, 32 | |
3018 | ||
3019 | ;--------------------------------------------------------------------------------------------------------------- | |
3020 | ; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
3021 | ;--------------------------------------------------------------------------------------------------------------- | |
; FILTER_V_PS_W16 %1=width(16) %2=height: 16-wide ps filter, 2 rows per
; iteration.  Low/high byte halves filtered separately; each output row
; is 32 bytes (two movu stores of 8 int16 each... 16 int16 total).
3022 | %macro FILTER_V_PS_W16 2 | |
3023 | INIT_XMM sse4 | |
3024 | cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 | |
3025 | ||
3026 | mov r4d, r4m | |
3027 | sub r0, r1 | |
3028 | add r3d, r3d | |
3029 | ||
3030 | %ifdef PIC | |
3031 | lea r5, [tab_ChromaCoeff] | |
3032 | movd m0, [r5 + r4 * 4] | |
3033 | %else | |
3034 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
3035 | %endif | |
3036 | ||
3037 | pshufb m1, m0, [tab_Vm] | |
3038 | pshufb m0, [tab_Vm + 16] | |
3039 | mov r4d, %2/2 | |
3040 | ||
3041 | .loop: | |
3042 | movu m2, [r0] | |
3043 | movu m3, [r0 + r1] | |
3044 | ||
3045 | punpcklbw m4, m2, m3 | |
3046 | punpckhbw m2, m3 | |
3047 | ||
3048 | pmaddubsw m4, m1 | |
3049 | pmaddubsw m2, m1 | |
3050 | ||
3051 | lea r0, [r0 + 2 * r1] | |
3052 | movu m5, [r0] | |
3053 | movu m7, [r0 + r1] | |
3054 | ||
3055 | punpcklbw m6, m5, m7 | |
3056 | pmaddubsw m6, m0 | |
3057 | paddw m4, m6 | |
3058 | ||
3059 | punpckhbw m6, m5, m7 | |
3060 | pmaddubsw m6, m0 | |
3061 | paddw m2, m6 | |
3062 | ||
3063 | mova m6, [pw_2000] | |
3064 | ||
3065 | psubw m4, m6 | |
3066 | psubw m2, m6 | |
3067 | ||
3068 | movu [r2], m4 | |
3069 | movu [r2 + 16], m2 | |
3070 | ||
3071 | punpcklbw m4, m3, m5 | |
3072 | punpckhbw m3, m5 | |
3073 | ||
3074 | pmaddubsw m4, m1 | |
3075 | pmaddubsw m3, m1 | |
3076 | ||
3077 | movu m5, [r0 + 2 * r1] | |
3078 | ||
3079 | punpcklbw m2, m7, m5 | |
3080 | punpckhbw m7, m5 | |
3081 | ||
3082 | pmaddubsw m2, m0 | |
3083 | pmaddubsw m7, m0 | |
3084 | ||
3085 | paddw m4, m2 | |
3086 | paddw m3, m7 | |
3087 | ||
3088 | psubw m4, m6 | |
3089 | psubw m3, m6 | |
3090 | ||
3091 | movu [r2 + r3], m4 | |
3092 | movu [r2 + r3 + 16], m3 | |
3093 | ||
3094 | lea r2, [r2 + 2 * r3] | |
3095 | ||
3096 | dec r4d | |
3097 | jnz .loop | |
3098 | RET | |
3099 | %endmacro | |
3100 | ||
; Instantiate 16-wide vertical chroma ps filters for all supported heights.
3101 | FILTER_V_PS_W16 16, 4 | |
3102 | FILTER_V_PS_W16 16, 8 | |
3103 | FILTER_V_PS_W16 16, 12 | |
3104 | FILTER_V_PS_W16 16, 16 | |
3105 | FILTER_V_PS_W16 16, 32 | |
3106 | ||
3107 | FILTER_V_PS_W16 16, 24 | |
3108 | FILTER_V_PS_W16 16, 64 | |
3109 | ||
3110 | ;-------------------------------------------------------------------------------------------------------------- | |
3111 | ;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
3112 | ;-------------------------------------------------------------------------------------------------------------- | |
; FILTER_V4_PS_W24 %1=width(24) %2=height: 24-wide ps filter, 2 rows per
; iteration.  First 16 columns use the full W16 scheme; the remaining 8
; columns ([rX + 16]) are handled with qword loads at the end of the loop.
3113 | %macro FILTER_V4_PS_W24 2 | |
3114 | INIT_XMM sse4 | |
3115 | cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8 | |
3116 | ||
3117 | mov r4d, r4m | |
3118 | sub r0, r1 | |
3119 | add r3d, r3d | |
3120 | ||
3121 | %ifdef PIC | |
3122 | lea r5, [tab_ChromaCoeff] | |
3123 | movd m0, [r5 + r4 * 4] | |
3124 | %else | |
3125 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
3126 | %endif | |
3127 | ||
3128 | pshufb m1, m0, [tab_Vm] | |
3129 | pshufb m0, [tab_Vm + 16] | |
3130 | ||
3131 | mov r4d, %2/2 | |
3132 | ||
3133 | .loop: | |
3134 | movu m2, [r0] | |
3135 | movu m3, [r0 + r1] | |
3136 | ||
3137 | punpcklbw m4, m2, m3 | |
3138 | punpckhbw m2, m3 | |
3139 | ||
3140 | pmaddubsw m4, m1 | |
3141 | pmaddubsw m2, m1 | |
3142 | ||
3143 | lea r5, [r0 + 2 * r1] | |
3144 | ||
3145 | movu m5, [r5] | |
3146 | movu m7, [r5 + r1] | |
3147 | ||
3148 | punpcklbw m6, m5, m7 | |
3149 | pmaddubsw m6, m0 | |
3150 | paddw m4, m6 | |
3151 | ||
3152 | punpckhbw m6, m5, m7 | |
3153 | pmaddubsw m6, m0 | |
3154 | paddw m2, m6 | |
3155 | ||
3156 | mova m6, [pw_2000] | |
3157 | ||
3158 | psubw m4, m6 | |
3159 | psubw m2, m6 | |
3160 | ||
3161 | movu [r2], m4 | |
3162 | movu [r2 + 16], m2 | |
3163 | ||
3164 | punpcklbw m4, m3, m5 | |
3165 | punpckhbw m3, m5 | |
3166 | ||
3167 | pmaddubsw m4, m1 | |
3168 | pmaddubsw m3, m1 | |
3169 | ||
3170 | movu m2, [r5 + 2 * r1] | |
3171 | ||
3172 | punpcklbw m5, m7, m2 | |
3173 | punpckhbw m7, m2 | |
3174 | ||
3175 | pmaddubsw m5, m0 | |
3176 | pmaddubsw m7, m0 | |
3177 | ||
3178 | paddw m4, m5 | |
3179 | paddw m3, m7 | |
3180 | ||
3181 | psubw m4, m6 | |
3182 | psubw m3, m6 | |
3183 | ||
3184 | movu [r2 + r3], m4 | |
3185 | movu [r2 + r3 + 16], m3 | |
3186 | ||
; Columns 16..23: qword loads, same tap pairs, two rows of 8 int16 out.
3187 | movq m2, [r0 + 16] | |
3188 | movq m3, [r0 + r1 + 16] | |
3189 | movq m4, [r5 + 16] | |
3190 | movq m5, [r5 + r1 + 16] | |
3191 | ||
3192 | punpcklbw m2, m3 | |
3193 | punpcklbw m7, m4, m5 | |
3194 | ||
3195 | pmaddubsw m2, m1 | |
3196 | pmaddubsw m7, m0 | |
3197 | ||
3198 | paddw m2, m7 | |
3199 | psubw m2, m6 | |
3200 | ||
3201 | movu [r2 + 32], m2 | |
3202 | ||
3203 | movq m2, [r5 + 2 * r1 + 16] | |
3204 | ||
3205 | punpcklbw m3, m4 | |
3206 | punpcklbw m5, m2 | |
3207 | ||
3208 | pmaddubsw m3, m1 | |
3209 | pmaddubsw m5, m0 | |
3210 | ||
3211 | paddw m3, m5 | |
3212 | psubw m3, m6 | |
3213 | ||
3214 | movu [r2 + r3 + 32], m3 | |
3215 | ||
3216 | mov r0, r5 | |
3217 | lea r2, [r2 + 2 * r3] | |
3218 | ||
3219 | dec r4d | |
3220 | jnz .loop | |
3221 | RET | |
3222 | %endmacro | |
3223 | ||
; Instantiate 24-wide vertical chroma ps filters: 24x32 and 24x64.
3224 | FILTER_V4_PS_W24 24, 32 | |
3225 | ||
3226 | FILTER_V4_PS_W24 24, 64 | |
3227 | ||
3228 | ;--------------------------------------------------------------------------------------------------------------- | |
3229 | ; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
3230 | ;--------------------------------------------------------------------------------------------------------------- | |
; FILTER_V_PS_W32 %1=width(32) %2=height: 32-wide ps filter, ONE output
; row per iteration (r4d = %2): two 16-byte column groups per row, each
; split into low/high byte halves; src/dst advance a single row per pass.
3231 | %macro FILTER_V_PS_W32 2 | |
3232 | INIT_XMM sse4 | |
3233 | cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8 | |
3234 | ||
3235 | mov r4d, r4m | |
3236 | sub r0, r1 | |
3237 | add r3d, r3d | |
3238 | ||
3239 | %ifdef PIC | |
3240 | lea r5, [tab_ChromaCoeff] | |
3241 | movd m0, [r5 + r4 * 4] | |
3242 | %else | |
3243 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
3244 | %endif | |
3245 | ||
3246 | pshufb m1, m0, [tab_Vm] | |
3247 | pshufb m0, [tab_Vm + 16] | |
3248 | ||
3249 | mova m7, [pw_2000] | |
3250 | ||
3251 | mov r4d, %2 | |
3252 | ||
3253 | .loop: | |
3254 | movu m2, [r0] | |
3255 | movu m3, [r0 + r1] | |
3256 | ||
3257 | punpcklbw m4, m2, m3 | |
3258 | punpckhbw m2, m3 | |
3259 | ||
3260 | pmaddubsw m4, m1 | |
3261 | pmaddubsw m2, m1 | |
3262 | ||
3263 | lea r5, [r0 + 2 * r1] | |
3264 | movu m3, [r5] | |
3265 | movu m5, [r5 + r1] | |
3266 | ||
3267 | punpcklbw m6, m3, m5 | |
3268 | punpckhbw m3, m5 | |
3269 | ||
3270 | pmaddubsw m6, m0 | |
3271 | pmaddubsw m3, m0 | |
3272 | ||
3273 | paddw m4, m6 | |
3274 | paddw m2, m3 | |
3275 | ||
3276 | psubw m4, m7 | |
3277 | psubw m2, m7 | |
3278 | ||
3279 | movu [r2], m4 | |
3280 | movu [r2 + 16], m2 | |
3281 | ||
; Second 16 columns of the same output row.
3282 | movu m2, [r0 + 16] | |
3283 | movu m3, [r0 + r1 + 16] | |
3284 | ||
3285 | punpcklbw m4, m2, m3 | |
3286 | punpckhbw m2, m3 | |
3287 | ||
3288 | pmaddubsw m4, m1 | |
3289 | pmaddubsw m2, m1 | |
3290 | ||
3291 | movu m3, [r5 + 16] | |
3292 | movu m5, [r5 + r1 + 16] | |
3293 | ||
3294 | punpcklbw m6, m3, m5 | |
3295 | punpckhbw m3, m5 | |
3296 | ||
3297 | pmaddubsw m6, m0 | |
3298 | pmaddubsw m3, m0 | |
3299 | ||
3300 | paddw m4, m6 | |
3301 | paddw m2, m3 | |
3302 | ||
3303 | psubw m4, m7 | |
3304 | psubw m2, m7 | |
3305 | ||
3306 | movu [r2 + 32], m4 | |
3307 | movu [r2 + 48], m2 | |
3308 | ||
3309 | lea r0, [r0 + r1] | |
3310 | lea r2, [r2 + r3] | |
3311 | ||
3312 | dec r4d | |
3313 | jnz .loop | |
3314 | RET | |
3315 | %endmacro | |
3316 | ||
3317 | FILTER_V_PS_W32 32, 8 | |
3318 | FILTER_V_PS_W32 32, 16 | |
3319 | FILTER_V_PS_W32 32, 24 | |
3320 | FILTER_V_PS_W32 32, 32 | |
3321 | ||
3322 | FILTER_V_PS_W32 32, 48 | |
3323 | FILTER_V_PS_W32 32, 64 | |
3324 | ||
3325 | ;----------------------------------------------------------------------------- | |
3326 | ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3327 | ;----------------------------------------------------------------------------- | |
3328 | %macro FILTER_V4_W8_H8_H16_H32 2 | |
3329 | INIT_XMM sse4 | |
3330 | cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 | |
3331 | ||
3332 | mov r4d, r4m | |
3333 | sub r0, r1 | |
3334 | ||
3335 | %ifdef PIC | |
3336 | lea r5, [tab_ChromaCoeff] | |
3337 | movd m5, [r5 + r4 * 4] | |
3338 | %else | |
3339 | movd m5, [tab_ChromaCoeff + r4 * 4] | |
3340 | %endif | |
3341 | ||
3342 | pshufb m6, m5, [tab_Vm] | |
3343 | pshufb m5, [tab_Vm + 16] | |
b53f7c52 | 3344 | mova m4, [pw_512] |
72b9787e JB |
3345 | lea r5, [r1 * 3] |
3346 | ||
3347 | mov r4d, %2 | |
3348 | ||
3349 | .loop: | |
3350 | movq m0, [r0] | |
3351 | movq m1, [r0 + r1] | |
3352 | movq m2, [r0 + 2 * r1] | |
3353 | movq m3, [r0 + r5] | |
3354 | ||
3355 | punpcklbw m0, m1 | |
3356 | punpcklbw m1, m2 | |
3357 | punpcklbw m2, m3 | |
3358 | ||
3359 | pmaddubsw m0, m6 | |
3360 | pmaddubsw m7, m2, m5 | |
3361 | ||
3362 | paddw m0, m7 | |
3363 | ||
3364 | pmulhrsw m0, m4 | |
3365 | packuswb m0, m0 | |
3366 | movh [r2], m0 | |
3367 | ||
3368 | lea r0, [r0 + 4 * r1] | |
3369 | movq m0, [r0] | |
3370 | ||
3371 | punpcklbw m3, m0 | |
3372 | ||
3373 | pmaddubsw m1, m6 | |
3374 | pmaddubsw m7, m3, m5 | |
3375 | ||
3376 | paddw m1, m7 | |
3377 | ||
3378 | pmulhrsw m1, m4 | |
3379 | packuswb m1, m1 | |
3380 | movh [r2 + r3], m1 | |
3381 | ||
3382 | movq m1, [r0 + r1] | |
3383 | ||
3384 | punpcklbw m0, m1 | |
3385 | ||
3386 | pmaddubsw m2, m6 | |
3387 | pmaddubsw m0, m5 | |
3388 | ||
3389 | paddw m2, m0 | |
3390 | ||
3391 | pmulhrsw m2, m4 | |
3392 | ||
3393 | movq m7, [r0 + 2 * r1] | |
3394 | punpcklbw m1, m7 | |
3395 | ||
3396 | pmaddubsw m3, m6 | |
3397 | pmaddubsw m1, m5 | |
3398 | ||
3399 | paddw m3, m1 | |
3400 | ||
3401 | pmulhrsw m3, m4 | |
3402 | packuswb m2, m3 | |
3403 | ||
3404 | lea r2, [r2 + 2 * r3] | |
3405 | movh [r2], m2 | |
3406 | movhps [r2 + r3], m2 | |
3407 | ||
3408 | lea r2, [r2 + 2 * r3] | |
3409 | ||
3410 | sub r4, 4 | |
3411 | jnz .loop | |
3412 | RET | |
3413 | %endmacro | |
3414 | ||
3415 | FILTER_V4_W8_H8_H16_H32 8, 8 | |
3416 | FILTER_V4_W8_H8_H16_H32 8, 16 | |
3417 | FILTER_V4_W8_H8_H16_H32 8, 32 | |
3418 | ||
3419 | FILTER_V4_W8_H8_H16_H32 8, 12 | |
3420 | FILTER_V4_W8_H8_H16_H32 8, 64 | |
3421 | ||
b53f7c52 JB |
3422 | %macro PROCESS_CHROMA_AVX2_W8_8R 0 |
3423 | movq xm1, [r0] ; m1 = row 0 | |
3424 | movq xm2, [r0 + r1] ; m2 = row 1 | |
3425 | punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] | |
3426 | movq xm3, [r0 + r1 * 2] ; m3 = row 2 | |
3427 | punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] | |
3428 | vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] | |
3429 | pmaddubsw m5, [r5] | |
3430 | movq xm4, [r0 + r4] ; m4 = row 3 | |
3431 | punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] | |
3432 | lea r0, [r0 + r1 * 4] | |
3433 | movq xm1, [r0] ; m1 = row 4 | |
3434 | punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] | |
3435 | vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] | |
3436 | pmaddubsw m0, m2, [r5 + 1 * mmsize] | |
3437 | paddw m5, m0 | |
3438 | pmaddubsw m2, [r5] | |
3439 | movq xm3, [r0 + r1] ; m3 = row 5 | |
3440 | punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] | |
3441 | movq xm4, [r0 + r1 * 2] ; m4 = row 6 | |
3442 | punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] | |
3443 | vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] | |
3444 | pmaddubsw m0, m1, [r5 + 1 * mmsize] | |
3445 | paddw m2, m0 | |
3446 | pmaddubsw m1, [r5] | |
3447 | movq xm3, [r0 + r4] ; m3 = row 7 | |
3448 | punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] | |
3449 | lea r0, [r0 + r1 * 4] | |
3450 | movq xm0, [r0] ; m0 = row 8 | |
3451 | punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] | |
3452 | vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] | |
3453 | pmaddubsw m3, m4, [r5 + 1 * mmsize] | |
3454 | paddw m1, m3 | |
3455 | pmaddubsw m4, [r5] | |
3456 | movq xm3, [r0 + r1] ; m3 = row 9 | |
3457 | punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] | |
3458 | movq xm6, [r0 + r1 * 2] ; m6 = row 10 | |
3459 | punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] | |
3460 | vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] | |
3461 | pmaddubsw m0, [r5 + 1 * mmsize] | |
3462 | paddw m4, m0 | |
3463 | %endmacro | |
3464 | ||
3465 | INIT_YMM avx2 | |
3466 | cglobal interp_4tap_vert_pp_8x8, 4, 6, 7 | |
3467 | mov r4d, r4m | |
3468 | shl r4d, 6 | |
3469 | ||
3470 | %ifdef PIC | |
3471 | lea r5, [tab_ChromaCoeffVer_32] | |
3472 | add r5, r4 | |
3473 | %else | |
3474 | lea r5, [tab_ChromaCoeffVer_32 + r4] | |
3475 | %endif | |
3476 | ||
3477 | lea r4, [r1 * 3] | |
3478 | sub r0, r1 | |
3479 | PROCESS_CHROMA_AVX2_W8_8R | |
3480 | lea r4, [r3 * 3] | |
3481 | mova m3, [pw_512] | |
3482 | pmulhrsw m5, m3 ; m5 = word: row 0, row 1 | |
3483 | pmulhrsw m2, m3 ; m2 = word: row 2, row 3 | |
3484 | pmulhrsw m1, m3 ; m1 = word: row 4, row 5 | |
3485 | pmulhrsw m4, m3 ; m4 = word: row 6, row 7 | |
3486 | packuswb m5, m2 | |
3487 | packuswb m1, m4 | |
3488 | vextracti128 xm2, m5, 1 | |
3489 | vextracti128 xm4, m1, 1 | |
3490 | movq [r2], xm5 | |
3491 | movq [r2 + r3], xm2 | |
3492 | movhps [r2 + r3 * 2], xm5 | |
3493 | movhps [r2 + r4], xm2 | |
3494 | lea r2, [r2 + r3 * 4] | |
3495 | movq [r2], xm1 | |
3496 | movq [r2 + r3], xm4 | |
3497 | movhps [r2 + r3 * 2], xm1 | |
3498 | movhps [r2 + r4], xm4 | |
3499 | RET | |
72b9787e JB |
3500 | |
3501 | ;----------------------------------------------------------------------------- | |
3502 | ;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3503 | ;----------------------------------------------------------------------------- | |
3504 | %macro FILTER_V4_W6_H4 2 | |
3505 | INIT_XMM sse4 | |
3506 | cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8 | |
3507 | ||
3508 | mov r4d, r4m | |
3509 | sub r0, r1 | |
3510 | ||
3511 | %ifdef PIC | |
3512 | lea r5, [tab_ChromaCoeff] | |
3513 | movd m5, [r5 + r4 * 4] | |
3514 | %else | |
3515 | movd m5, [tab_ChromaCoeff + r4 * 4] | |
3516 | %endif | |
3517 | ||
3518 | pshufb m6, m5, [tab_Vm] | |
3519 | pshufb m5, [tab_Vm + 16] | |
b53f7c52 | 3520 | mova m4, [pw_512] |
72b9787e JB |
3521 | |
3522 | mov r4d, %2 | |
3523 | lea r5, [3 * r1] | |
3524 | ||
3525 | .loop: | |
3526 | movq m0, [r0] | |
3527 | movq m1, [r0 + r1] | |
3528 | movq m2, [r0 + 2 * r1] | |
3529 | movq m3, [r0 + r5] | |
3530 | ||
3531 | punpcklbw m0, m1 | |
3532 | punpcklbw m1, m2 | |
3533 | punpcklbw m2, m3 | |
3534 | ||
3535 | pmaddubsw m0, m6 | |
3536 | pmaddubsw m7, m2, m5 | |
3537 | ||
3538 | paddw m0, m7 | |
3539 | ||
3540 | pmulhrsw m0, m4 | |
3541 | packuswb m0, m0 | |
3542 | movd [r2], m0 | |
3543 | pextrw [r2 + 4], m0, 2 | |
3544 | ||
3545 | lea r0, [r0 + 4 * r1] | |
3546 | ||
3547 | movq m0, [r0] | |
3548 | punpcklbw m3, m0 | |
3549 | ||
3550 | pmaddubsw m1, m6 | |
3551 | pmaddubsw m7, m3, m5 | |
3552 | ||
3553 | paddw m1, m7 | |
3554 | ||
3555 | pmulhrsw m1, m4 | |
3556 | packuswb m1, m1 | |
3557 | movd [r2 + r3], m1 | |
3558 | pextrw [r2 + r3 + 4], m1, 2 | |
3559 | ||
3560 | movq m1, [r0 + r1] | |
3561 | punpcklbw m7, m0, m1 | |
3562 | ||
3563 | pmaddubsw m2, m6 | |
3564 | pmaddubsw m7, m5 | |
3565 | ||
3566 | paddw m2, m7 | |
3567 | ||
3568 | pmulhrsw m2, m4 | |
3569 | packuswb m2, m2 | |
3570 | lea r2, [r2 + 2 * r3] | |
3571 | movd [r2], m2 | |
3572 | pextrw [r2 + 4], m2, 2 | |
3573 | ||
3574 | movq m2, [r0 + 2 * r1] | |
3575 | punpcklbw m1, m2 | |
3576 | ||
3577 | pmaddubsw m3, m6 | |
3578 | pmaddubsw m1, m5 | |
3579 | ||
3580 | paddw m3, m1 | |
3581 | ||
3582 | pmulhrsw m3, m4 | |
3583 | packuswb m3, m3 | |
3584 | ||
3585 | movd [r2 + r3], m3 | |
3586 | pextrw [r2 + r3 + 4], m3, 2 | |
3587 | ||
3588 | lea r2, [r2 + 2 * r3] | |
3589 | ||
3590 | sub r4, 4 | |
3591 | jnz .loop | |
3592 | RET | |
3593 | %endmacro | |
3594 | ||
3595 | FILTER_V4_W6_H4 6, 8 | |
3596 | ||
3597 | FILTER_V4_W6_H4 6, 16 | |
3598 | ||
3599 | ;----------------------------------------------------------------------------- | |
3600 | ; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3601 | ;----------------------------------------------------------------------------- | |
3602 | %macro FILTER_V4_W12_H2 2 | |
3603 | INIT_XMM sse4 | |
3604 | cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8 | |
3605 | ||
3606 | mov r4d, r4m | |
3607 | sub r0, r1 | |
3608 | ||
3609 | %ifdef PIC | |
3610 | lea r5, [tab_ChromaCoeff] | |
3611 | movd m0, [r5 + r4 * 4] | |
3612 | %else | |
3613 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
3614 | %endif | |
3615 | ||
3616 | pshufb m1, m0, [tab_Vm] | |
3617 | pshufb m0, [tab_Vm + 16] | |
3618 | ||
3619 | mov r4d, %2 | |
3620 | ||
3621 | .loop: | |
3622 | movu m2, [r0] | |
3623 | movu m3, [r0 + r1] | |
3624 | ||
3625 | punpcklbw m4, m2, m3 | |
3626 | punpckhbw m2, m3 | |
3627 | ||
3628 | pmaddubsw m4, m1 | |
3629 | pmaddubsw m2, m1 | |
3630 | ||
3631 | lea r0, [r0 + 2 * r1] | |
3632 | movu m5, [r0] | |
3633 | movu m7, [r0 + r1] | |
3634 | ||
3635 | punpcklbw m6, m5, m7 | |
3636 | pmaddubsw m6, m0 | |
3637 | paddw m4, m6 | |
3638 | ||
3639 | punpckhbw m6, m5, m7 | |
3640 | pmaddubsw m6, m0 | |
3641 | paddw m2, m6 | |
3642 | ||
b53f7c52 | 3643 | mova m6, [pw_512] |
72b9787e JB |
3644 | |
3645 | pmulhrsw m4, m6 | |
3646 | pmulhrsw m2, m6 | |
3647 | ||
3648 | packuswb m4, m2 | |
3649 | ||
3650 | movh [r2], m4 | |
3651 | pextrd [r2 + 8], m4, 2 | |
3652 | ||
3653 | punpcklbw m4, m3, m5 | |
3654 | punpckhbw m3, m5 | |
3655 | ||
3656 | pmaddubsw m4, m1 | |
3657 | pmaddubsw m3, m1 | |
3658 | ||
3659 | movu m5, [r0 + 2 * r1] | |
3660 | ||
3661 | punpcklbw m2, m7, m5 | |
3662 | punpckhbw m7, m5 | |
3663 | ||
3664 | pmaddubsw m2, m0 | |
3665 | pmaddubsw m7, m0 | |
3666 | ||
3667 | paddw m4, m2 | |
3668 | paddw m3, m7 | |
3669 | ||
3670 | pmulhrsw m4, m6 | |
3671 | pmulhrsw m3, m6 | |
3672 | ||
3673 | packuswb m4, m3 | |
3674 | ||
3675 | movh [r2 + r3], m4 | |
3676 | pextrd [r2 + r3 + 8], m4, 2 | |
3677 | ||
3678 | lea r2, [r2 + 2 * r3] | |
3679 | ||
3680 | sub r4, 2 | |
3681 | jnz .loop | |
3682 | RET | |
3683 | %endmacro | |
3684 | ||
3685 | FILTER_V4_W12_H2 12, 16 | |
3686 | ||
3687 | FILTER_V4_W12_H2 12, 32 | |
3688 | ||
3689 | ;----------------------------------------------------------------------------- | |
3690 | ; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3691 | ;----------------------------------------------------------------------------- | |
3692 | %macro FILTER_V4_W16_H2 2 | |
3693 | INIT_XMM sse4 | |
3694 | cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8 | |
3695 | ||
3696 | mov r4d, r4m | |
3697 | sub r0, r1 | |
3698 | ||
3699 | %ifdef PIC | |
3700 | lea r5, [tab_ChromaCoeff] | |
3701 | movd m0, [r5 + r4 * 4] | |
3702 | %else | |
3703 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
3704 | %endif | |
3705 | ||
3706 | pshufb m1, m0, [tab_Vm] | |
3707 | pshufb m0, [tab_Vm + 16] | |
3708 | ||
3709 | mov r4d, %2/2 | |
3710 | ||
3711 | .loop: | |
3712 | movu m2, [r0] | |
3713 | movu m3, [r0 + r1] | |
3714 | ||
3715 | punpcklbw m4, m2, m3 | |
3716 | punpckhbw m2, m3 | |
3717 | ||
3718 | pmaddubsw m4, m1 | |
3719 | pmaddubsw m2, m1 | |
3720 | ||
3721 | lea r0, [r0 + 2 * r1] | |
3722 | movu m5, [r0] | |
3723 | movu m6, [r0 + r1] | |
3724 | ||
3725 | punpckhbw m7, m5, m6 | |
3726 | pmaddubsw m7, m0 | |
3727 | paddw m2, m7 | |
3728 | ||
3729 | punpcklbw m7, m5, m6 | |
3730 | pmaddubsw m7, m0 | |
3731 | paddw m4, m7 | |
3732 | ||
b53f7c52 | 3733 | mova m7, [pw_512] |
72b9787e JB |
3734 | |
3735 | pmulhrsw m4, m7 | |
3736 | pmulhrsw m2, m7 | |
3737 | ||
3738 | packuswb m4, m2 | |
3739 | ||
3740 | movu [r2], m4 | |
3741 | ||
3742 | punpcklbw m4, m3, m5 | |
3743 | punpckhbw m3, m5 | |
3744 | ||
3745 | pmaddubsw m4, m1 | |
3746 | pmaddubsw m3, m1 | |
3747 | ||
3748 | movu m5, [r0 + 2 * r1] | |
3749 | ||
3750 | punpcklbw m2, m6, m5 | |
3751 | punpckhbw m6, m5 | |
3752 | ||
3753 | pmaddubsw m2, m0 | |
3754 | pmaddubsw m6, m0 | |
3755 | ||
3756 | paddw m4, m2 | |
3757 | paddw m3, m6 | |
3758 | ||
3759 | pmulhrsw m4, m7 | |
3760 | pmulhrsw m3, m7 | |
3761 | ||
3762 | packuswb m4, m3 | |
3763 | ||
3764 | movu [r2 + r3], m4 | |
3765 | ||
3766 | lea r2, [r2 + 2 * r3] | |
3767 | ||
3768 | dec r4d | |
3769 | jnz .loop | |
3770 | RET | |
3771 | %endmacro | |
3772 | ||
3773 | FILTER_V4_W16_H2 16, 4 | |
3774 | FILTER_V4_W16_H2 16, 8 | |
3775 | FILTER_V4_W16_H2 16, 12 | |
3776 | FILTER_V4_W16_H2 16, 16 | |
3777 | FILTER_V4_W16_H2 16, 32 | |
3778 | ||
3779 | FILTER_V4_W16_H2 16, 24 | |
3780 | FILTER_V4_W16_H2 16, 64 | |
3781 | ||
b53f7c52 JB |
3782 | INIT_YMM avx2 |
3783 | %if ARCH_X86_64 == 1 | |
3784 | cglobal interp_4tap_vert_pp_16x16, 4, 6, 15 | |
3785 | mov r4d, r4m | |
3786 | shl r4d, 6 | |
3787 | ||
3788 | %ifdef PIC | |
3789 | lea r5, [tab_ChromaCoeffVer_32] | |
3790 | add r5, r4 | |
3791 | %else | |
3792 | lea r5, [tab_ChromaCoeffVer_32 + r4] | |
3793 | %endif | |
3794 | ||
3795 | mova m12, [r5] | |
3796 | mova m13, [r5 + mmsize] | |
3797 | lea r4, [r1 * 3] | |
3798 | sub r0, r1 | |
3799 | lea r5, [r3 * 3] | |
3800 | mova m14, [pw_512] | |
3801 | ||
3802 | movu xm0, [r0] ; m0 = row 0 | |
3803 | movu xm1, [r0 + r1] ; m1 = row 1 | |
3804 | punpckhbw xm2, xm0, xm1 | |
3805 | punpcklbw xm0, xm1 | |
3806 | vinserti128 m0, m0, xm2, 1 | |
3807 | pmaddubsw m0, m12 | |
3808 | movu xm2, [r0 + r1 * 2] ; m2 = row 2 | |
3809 | punpckhbw xm3, xm1, xm2 | |
3810 | punpcklbw xm1, xm2 | |
3811 | vinserti128 m1, m1, xm3, 1 | |
3812 | pmaddubsw m1, m12 | |
3813 | movu xm3, [r0 + r4] ; m3 = row 3 | |
3814 | punpckhbw xm4, xm2, xm3 | |
3815 | punpcklbw xm2, xm3 | |
3816 | vinserti128 m2, m2, xm4, 1 | |
3817 | pmaddubsw m4, m2, m13 | |
3818 | paddw m0, m4 | |
3819 | pmaddubsw m2, m12 | |
3820 | lea r0, [r0 + r1 * 4] | |
3821 | movu xm4, [r0] ; m4 = row 4 | |
3822 | punpckhbw xm5, xm3, xm4 | |
3823 | punpcklbw xm3, xm4 | |
3824 | vinserti128 m3, m3, xm5, 1 | |
3825 | pmaddubsw m5, m3, m13 | |
3826 | paddw m1, m5 | |
3827 | pmaddubsw m3, m12 | |
3828 | movu xm5, [r0 + r1] ; m5 = row 5 | |
3829 | punpckhbw xm6, xm4, xm5 | |
3830 | punpcklbw xm4, xm5 | |
3831 | vinserti128 m4, m4, xm6, 1 | |
3832 | pmaddubsw m6, m4, m13 | |
3833 | paddw m2, m6 | |
3834 | pmaddubsw m4, m12 | |
3835 | movu xm6, [r0 + r1 * 2] ; m6 = row 6 | |
3836 | punpckhbw xm7, xm5, xm6 | |
3837 | punpcklbw xm5, xm6 | |
3838 | vinserti128 m5, m5, xm7, 1 | |
3839 | pmaddubsw m7, m5, m13 | |
3840 | paddw m3, m7 | |
3841 | pmaddubsw m5, m12 | |
3842 | movu xm7, [r0 + r4] ; m7 = row 7 | |
3843 | punpckhbw xm8, xm6, xm7 | |
3844 | punpcklbw xm6, xm7 | |
3845 | vinserti128 m6, m6, xm8, 1 | |
3846 | pmaddubsw m8, m6, m13 | |
3847 | paddw m4, m8 | |
3848 | pmaddubsw m6, m12 | |
3849 | lea r0, [r0 + r1 * 4] | |
3850 | movu xm8, [r0] ; m8 = row 8 | |
3851 | punpckhbw xm9, xm7, xm8 | |
3852 | punpcklbw xm7, xm8 | |
3853 | vinserti128 m7, m7, xm9, 1 | |
3854 | pmaddubsw m9, m7, m13 | |
3855 | paddw m5, m9 | |
3856 | pmaddubsw m7, m12 | |
3857 | movu xm9, [r0 + r1] ; m9 = row 9 | |
3858 | punpckhbw xm10, xm8, xm9 | |
3859 | punpcklbw xm8, xm9 | |
3860 | vinserti128 m8, m8, xm10, 1 | |
3861 | pmaddubsw m10, m8, m13 | |
3862 | paddw m6, m10 | |
3863 | pmaddubsw m8, m12 | |
3864 | movu xm10, [r0 + r1 * 2] ; m10 = row 10 | |
3865 | punpckhbw xm11, xm9, xm10 | |
3866 | punpcklbw xm9, xm10 | |
3867 | vinserti128 m9, m9, xm11, 1 | |
3868 | pmaddubsw m11, m9, m13 | |
3869 | paddw m7, m11 | |
3870 | pmaddubsw m9, m12 | |
3871 | ||
3872 | pmulhrsw m0, m14 ; m0 = word: row 0 | |
3873 | pmulhrsw m1, m14 ; m1 = word: row 1 | |
3874 | pmulhrsw m2, m14 ; m2 = word: row 2 | |
3875 | pmulhrsw m3, m14 ; m3 = word: row 3 | |
3876 | pmulhrsw m4, m14 ; m4 = word: row 4 | |
3877 | pmulhrsw m5, m14 ; m5 = word: row 5 | |
3878 | pmulhrsw m6, m14 ; m6 = word: row 6 | |
3879 | pmulhrsw m7, m14 ; m7 = word: row 7 | |
3880 | packuswb m0, m1 | |
3881 | packuswb m2, m3 | |
3882 | packuswb m4, m5 | |
3883 | packuswb m6, m7 | |
3884 | vpermq m0, m0, 11011000b | |
3885 | vpermq m2, m2, 11011000b | |
3886 | vpermq m4, m4, 11011000b | |
3887 | vpermq m6, m6, 11011000b | |
3888 | vextracti128 xm1, m0, 1 | |
3889 | vextracti128 xm3, m2, 1 | |
3890 | vextracti128 xm5, m4, 1 | |
3891 | vextracti128 xm7, m6, 1 | |
3892 | movu [r2], xm0 | |
3893 | movu [r2 + r3], xm1 | |
3894 | movu [r2 + r3 * 2], xm2 | |
3895 | movu [r2 + r5], xm3 | |
3896 | lea r2, [r2 + r3 * 4] | |
3897 | movu [r2], xm4 | |
3898 | movu [r2 + r3], xm5 | |
3899 | movu [r2 + r3 * 2], xm6 | |
3900 | movu [r2 + r5], xm7 | |
3901 | lea r2, [r2 + r3 * 4] | |
3902 | ||
3903 | movu xm11, [r0 + r4] ; m11 = row 11 | |
3904 | punpckhbw xm6, xm10, xm11 | |
3905 | punpcklbw xm10, xm11 | |
3906 | vinserti128 m10, m10, xm6, 1 | |
3907 | pmaddubsw m6, m10, m13 | |
3908 | paddw m8, m6 | |
3909 | pmaddubsw m10, m12 | |
3910 | lea r0, [r0 + r1 * 4] | |
3911 | movu xm6, [r0] ; m6 = row 12 | |
3912 | punpckhbw xm7, xm11, xm6 | |
3913 | punpcklbw xm11, xm6 | |
3914 | vinserti128 m11, m11, xm7, 1 | |
3915 | pmaddubsw m7, m11, m13 | |
3916 | paddw m9, m7 | |
3917 | pmaddubsw m11, m12 | |
3918 | ||
3919 | movu xm7, [r0 + r1] ; m7 = row 13 | |
3920 | punpckhbw xm0, xm6, xm7 | |
3921 | punpcklbw xm6, xm7 | |
3922 | vinserti128 m6, m6, xm0, 1 | |
3923 | pmaddubsw m0, m6, m13 | |
3924 | paddw m10, m0 | |
3925 | pmaddubsw m6, m12 | |
3926 | movu xm0, [r0 + r1 * 2] ; m0 = row 14 | |
3927 | punpckhbw xm1, xm7, xm0 | |
3928 | punpcklbw xm7, xm0 | |
3929 | vinserti128 m7, m7, xm1, 1 | |
3930 | pmaddubsw m1, m7, m13 | |
3931 | paddw m11, m1 | |
3932 | pmaddubsw m7, m12 | |
3933 | movu xm1, [r0 + r4] ; m1 = row 15 | |
3934 | punpckhbw xm2, xm0, xm1 | |
3935 | punpcklbw xm0, xm1 | |
3936 | vinserti128 m0, m0, xm2, 1 | |
3937 | pmaddubsw m2, m0, m13 | |
3938 | paddw m6, m2 | |
3939 | pmaddubsw m0, m12 | |
3940 | lea r0, [r0 + r1 * 4] | |
3941 | movu xm2, [r0] ; m2 = row 16 | |
3942 | punpckhbw xm3, xm1, xm2 | |
3943 | punpcklbw xm1, xm2 | |
3944 | vinserti128 m1, m1, xm3, 1 | |
3945 | pmaddubsw m3, m1, m13 | |
3946 | paddw m7, m3 | |
3947 | pmaddubsw m1, m12 | |
3948 | movu xm3, [r0 + r1] ; m3 = row 17 | |
3949 | punpckhbw xm4, xm2, xm3 | |
3950 | punpcklbw xm2, xm3 | |
3951 | vinserti128 m2, m2, xm4, 1 | |
3952 | pmaddubsw m2, m13 | |
3953 | paddw m0, m2 | |
3954 | movu xm4, [r0 + r1 * 2] ; m4 = row 18 | |
3955 | punpckhbw xm5, xm3, xm4 | |
3956 | punpcklbw xm3, xm4 | |
3957 | vinserti128 m3, m3, xm5, 1 | |
3958 | pmaddubsw m3, m13 | |
3959 | paddw m1, m3 | |
3960 | ||
3961 | pmulhrsw m8, m14 ; m8 = word: row 8 | |
3962 | pmulhrsw m9, m14 ; m9 = word: row 9 | |
3963 | pmulhrsw m10, m14 ; m10 = word: row 10 | |
3964 | pmulhrsw m11, m14 ; m11 = word: row 11 | |
3965 | pmulhrsw m6, m14 ; m6 = word: row 12 | |
3966 | pmulhrsw m7, m14 ; m7 = word: row 13 | |
3967 | pmulhrsw m0, m14 ; m0 = word: row 14 | |
3968 | pmulhrsw m1, m14 ; m1 = word: row 15 | |
3969 | packuswb m8, m9 | |
3970 | packuswb m10, m11 | |
3971 | packuswb m6, m7 | |
3972 | packuswb m0, m1 | |
3973 | vpermq m8, m8, 11011000b | |
3974 | vpermq m10, m10, 11011000b | |
3975 | vpermq m6, m6, 11011000b | |
3976 | vpermq m0, m0, 11011000b | |
3977 | vextracti128 xm9, m8, 1 | |
3978 | vextracti128 xm11, m10, 1 | |
3979 | vextracti128 xm7, m6, 1 | |
3980 | vextracti128 xm1, m0, 1 | |
3981 | movu [r2], xm8 | |
3982 | movu [r2 + r3], xm9 | |
3983 | movu [r2 + r3 * 2], xm10 | |
3984 | movu [r2 + r5], xm11 | |
3985 | lea r2, [r2 + r3 * 4] | |
3986 | movu [r2], xm6 | |
3987 | movu [r2 + r3], xm7 | |
3988 | movu [r2 + r3 * 2], xm0 | |
3989 | movu [r2 + r5], xm1 | |
3990 | RET | |
3991 | %endif | |
3992 | ||
72b9787e JB |
3993 | ;----------------------------------------------------------------------------- |
3994 | ;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3995 | ;----------------------------------------------------------------------------- | |
3996 | %macro FILTER_V4_W24 2 | |
3997 | INIT_XMM sse4 | |
3998 | cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 | |
3999 | ||
4000 | mov r4d, r4m | |
4001 | sub r0, r1 | |
4002 | ||
4003 | %ifdef PIC | |
4004 | lea r5, [tab_ChromaCoeff] | |
4005 | movd m0, [r5 + r4 * 4] | |
4006 | %else | |
4007 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
4008 | %endif | |
4009 | ||
4010 | pshufb m1, m0, [tab_Vm] | |
4011 | pshufb m0, [tab_Vm + 16] | |
4012 | ||
4013 | mov r4d, %2 | |
4014 | ||
4015 | .loop: | |
4016 | movu m2, [r0] | |
4017 | movu m3, [r0 + r1] | |
4018 | ||
4019 | punpcklbw m4, m2, m3 | |
4020 | punpckhbw m2, m3 | |
4021 | ||
4022 | pmaddubsw m4, m1 | |
4023 | pmaddubsw m2, m1 | |
4024 | ||
4025 | lea r5, [r0 + 2 * r1] | |
4026 | movu m5, [r5] | |
4027 | movu m7, [r5 + r1] | |
4028 | ||
4029 | punpcklbw m6, m5, m7 | |
4030 | pmaddubsw m6, m0 | |
4031 | paddw m4, m6 | |
4032 | ||
4033 | punpckhbw m6, m5, m7 | |
4034 | pmaddubsw m6, m0 | |
4035 | paddw m2, m6 | |
4036 | ||
b53f7c52 | 4037 | mova m6, [pw_512] |
72b9787e JB |
4038 | |
4039 | pmulhrsw m4, m6 | |
4040 | pmulhrsw m2, m6 | |
4041 | ||
4042 | packuswb m4, m2 | |
4043 | ||
4044 | movu [r2], m4 | |
4045 | ||
4046 | punpcklbw m4, m3, m5 | |
4047 | punpckhbw m3, m5 | |
4048 | ||
4049 | pmaddubsw m4, m1 | |
4050 | pmaddubsw m3, m1 | |
4051 | ||
4052 | movu m2, [r5 + 2 * r1] | |
4053 | ||
4054 | punpcklbw m5, m7, m2 | |
4055 | punpckhbw m7, m2 | |
4056 | ||
4057 | pmaddubsw m5, m0 | |
4058 | pmaddubsw m7, m0 | |
4059 | ||
4060 | paddw m4, m5 | |
4061 | paddw m3, m7 | |
4062 | ||
4063 | pmulhrsw m4, m6 | |
4064 | pmulhrsw m3, m6 | |
4065 | ||
4066 | packuswb m4, m3 | |
4067 | ||
4068 | movu [r2 + r3], m4 | |
4069 | ||
4070 | movq m2, [r0 + 16] | |
4071 | movq m3, [r0 + r1 + 16] | |
4072 | movq m4, [r5 + 16] | |
4073 | movq m5, [r5 + r1 + 16] | |
4074 | ||
4075 | punpcklbw m2, m3 | |
4076 | punpcklbw m4, m5 | |
4077 | ||
4078 | pmaddubsw m2, m1 | |
4079 | pmaddubsw m4, m0 | |
4080 | ||
4081 | paddw m2, m4 | |
4082 | ||
4083 | pmulhrsw m2, m6 | |
4084 | ||
4085 | movq m3, [r0 + r1 + 16] | |
4086 | movq m4, [r5 + 16] | |
4087 | movq m5, [r5 + r1 + 16] | |
4088 | movq m7, [r5 + 2 * r1 + 16] | |
4089 | ||
4090 | punpcklbw m3, m4 | |
4091 | punpcklbw m5, m7 | |
4092 | ||
4093 | pmaddubsw m3, m1 | |
4094 | pmaddubsw m5, m0 | |
4095 | ||
4096 | paddw m3, m5 | |
4097 | ||
4098 | pmulhrsw m3, m6 | |
4099 | packuswb m2, m3 | |
4100 | ||
4101 | movh [r2 + 16], m2 | |
4102 | movhps [r2 + r3 + 16], m2 | |
4103 | ||
4104 | mov r0, r5 | |
4105 | lea r2, [r2 + 2 * r3] | |
4106 | ||
4107 | sub r4, 2 | |
4108 | jnz .loop | |
4109 | RET | |
4110 | %endmacro | |
4111 | ||
4112 | FILTER_V4_W24 24, 32 | |
4113 | ||
4114 | FILTER_V4_W24 24, 64 | |
4115 | ||
4116 | ;----------------------------------------------------------------------------- | |
4117 | ; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
4118 | ;----------------------------------------------------------------------------- | |
4119 | %macro FILTER_V4_W32 2 | |
4120 | INIT_XMM sse4 | |
4121 | cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 | |
4122 | ||
4123 | mov r4d, r4m | |
4124 | sub r0, r1 | |
4125 | ||
4126 | %ifdef PIC | |
4127 | lea r5, [tab_ChromaCoeff] | |
4128 | movd m0, [r5 + r4 * 4] | |
4129 | %else | |
4130 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
4131 | %endif | |
4132 | ||
4133 | pshufb m1, m0, [tab_Vm] | |
4134 | pshufb m0, [tab_Vm + 16] | |
4135 | ||
b53f7c52 | 4136 | mova m7, [pw_512] |
72b9787e JB |
4137 | |
4138 | mov r4d, %2 | |
4139 | ||
4140 | .loop: | |
4141 | movu m2, [r0] | |
4142 | movu m3, [r0 + r1] | |
4143 | ||
4144 | punpcklbw m4, m2, m3 | |
4145 | punpckhbw m2, m3 | |
4146 | ||
4147 | pmaddubsw m4, m1 | |
4148 | pmaddubsw m2, m1 | |
4149 | ||
4150 | lea r5, [r0 + 2 * r1] | |
4151 | movu m3, [r5] | |
4152 | movu m5, [r5 + r1] | |
4153 | ||
4154 | punpcklbw m6, m3, m5 | |
4155 | punpckhbw m3, m5 | |
4156 | ||
4157 | pmaddubsw m6, m0 | |
4158 | pmaddubsw m3, m0 | |
4159 | ||
4160 | paddw m4, m6 | |
4161 | paddw m2, m3 | |
4162 | ||
4163 | pmulhrsw m4, m7 | |
4164 | pmulhrsw m2, m7 | |
4165 | ||
4166 | packuswb m4, m2 | |
4167 | ||
4168 | movu [r2], m4 | |
4169 | ||
4170 | movu m2, [r0 + 16] | |
4171 | movu m3, [r0 + r1 + 16] | |
4172 | ||
4173 | punpcklbw m4, m2, m3 | |
4174 | punpckhbw m2, m3 | |
4175 | ||
4176 | pmaddubsw m4, m1 | |
4177 | pmaddubsw m2, m1 | |
4178 | ||
4179 | movu m3, [r5 + 16] | |
4180 | movu m5, [r5 + r1 + 16] | |
4181 | ||
4182 | punpcklbw m6, m3, m5 | |
4183 | punpckhbw m3, m5 | |
4184 | ||
4185 | pmaddubsw m6, m0 | |
4186 | pmaddubsw m3, m0 | |
4187 | ||
4188 | paddw m4, m6 | |
4189 | paddw m2, m3 | |
4190 | ||
4191 | pmulhrsw m4, m7 | |
4192 | pmulhrsw m2, m7 | |
4193 | ||
4194 | packuswb m4, m2 | |
4195 | ||
4196 | movu [r2 + 16], m4 | |
4197 | ||
4198 | lea r0, [r0 + r1] | |
4199 | lea r2, [r2 + r3] | |
4200 | ||
4201 | dec r4 | |
4202 | jnz .loop | |
4203 | RET | |
4204 | %endmacro | |
4205 | ||
4206 | FILTER_V4_W32 32, 8 | |
4207 | FILTER_V4_W32 32, 16 | |
4208 | FILTER_V4_W32 32, 24 | |
4209 | FILTER_V4_W32 32, 32 | |
4210 | ||
4211 | FILTER_V4_W32 32, 48 | |
4212 | FILTER_V4_W32 32, 64 | |
4213 | ||
b53f7c52 JB |
4214 | INIT_YMM avx2 |
4215 | %if ARCH_X86_64 == 1 | |
4216 | cglobal interp_4tap_vert_pp_32x32, 4, 7, 13 | |
4217 | mov r4d, r4m | |
4218 | shl r4d, 6 | |
4219 | ||
4220 | %ifdef PIC | |
4221 | lea r5, [tab_ChromaCoeffVer_32] | |
4222 | add r5, r4 | |
4223 | %else | |
4224 | lea r5, [tab_ChromaCoeffVer_32 + r4] | |
4225 | %endif | |
4226 | ||
4227 | mova m10, [r5] | |
4228 | mova m11, [r5 + mmsize] | |
4229 | lea r4, [r1 * 3] | |
4230 | sub r0, r1 | |
4231 | lea r5, [r3 * 3] | |
4232 | mova m12, [pw_512] | |
4233 | mov r6d, 8 | |
4234 | .loopW: | |
4235 | movu m0, [r0] ; m0 = row 0 | |
4236 | movu m1, [r0 + r1] ; m1 = row 1 | |
4237 | punpcklbw m2, m0, m1 | |
4238 | punpckhbw m3, m0, m1 | |
4239 | pmaddubsw m2, m10 | |
4240 | pmaddubsw m3, m10 | |
4241 | movu m0, [r0 + r1 * 2] ; m0 = row 2 | |
4242 | punpcklbw m4, m1, m0 | |
4243 | punpckhbw m5, m1, m0 | |
4244 | pmaddubsw m4, m10 | |
4245 | pmaddubsw m5, m10 | |
4246 | movu m1, [r0 + r4] ; m1 = row 3 | |
4247 | punpcklbw m6, m0, m1 | |
4248 | punpckhbw m7, m0, m1 | |
4249 | pmaddubsw m8, m6, m11 | |
4250 | pmaddubsw m9, m7, m11 | |
4251 | pmaddubsw m6, m10 | |
4252 | pmaddubsw m7, m10 | |
4253 | paddw m2, m8 | |
4254 | paddw m3, m9 | |
4255 | pmulhrsw m2, m12 | |
4256 | pmulhrsw m3, m12 | |
4257 | packuswb m2, m3 | |
4258 | movu [r2], m2 | |
4259 | ||
4260 | lea r0, [r0 + r1 * 4] | |
4261 | movu m0, [r0] ; m0 = row 4 | |
4262 | punpcklbw m2, m1, m0 | |
4263 | punpckhbw m3, m1, m0 | |
4264 | pmaddubsw m8, m2, m11 | |
4265 | pmaddubsw m9, m3, m11 | |
4266 | pmaddubsw m2, m10 | |
4267 | pmaddubsw m3, m10 | |
4268 | paddw m4, m8 | |
4269 | paddw m5, m9 | |
4270 | pmulhrsw m4, m12 | |
4271 | pmulhrsw m5, m12 | |
4272 | packuswb m4, m5 | |
4273 | movu [r2 + r3], m4 | |
4274 | ||
4275 | movu m1, [r0 + r1] ; m1 = row 5 | |
4276 | punpcklbw m4, m0, m1 | |
4277 | punpckhbw m5, m0, m1 | |
4278 | pmaddubsw m4, m11 | |
4279 | pmaddubsw m5, m11 | |
4280 | paddw m6, m4 | |
4281 | paddw m7, m5 | |
4282 | pmulhrsw m6, m12 | |
4283 | pmulhrsw m7, m12 | |
4284 | packuswb m6, m7 | |
4285 | movu [r2 + r3 * 2], m6 | |
4286 | ||
4287 | movu m0, [r0 + r1 * 2] ; m0 = row 6 | |
4288 | punpcklbw m6, m1, m0 | |
4289 | punpckhbw m7, m1, m0 | |
4290 | pmaddubsw m6, m11 | |
4291 | pmaddubsw m7, m11 | |
4292 | paddw m2, m6 | |
4293 | paddw m3, m7 | |
4294 | pmulhrsw m2, m12 | |
4295 | pmulhrsw m3, m12 | |
4296 | packuswb m2, m3 | |
4297 | movu [r2 + r5], m2 | |
4298 | ||
4299 | lea r2, [r2 + r3 * 4] | |
4300 | dec r6d | |
4301 | jnz .loopW | |
4302 | RET | |
4303 | %endif | |
72b9787e JB |
4304 | |
4305 | ;----------------------------------------------------------------------------- | |
4306 | ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
4307 | ;----------------------------------------------------------------------------- | |
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap chroma interpolation, pixel in / pixel out, for block sizes
; whose width is a multiple of 16.  %1 = width, %2 = height.
; Produces two output rows per .loop iteration, 16 columns per .loopW pass.
;-----------------------------------------------------------------------------
%macro FILTER_V4_W16n_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8

    mov         r4d,       r4m
    sub         r0,        r1                   ; step back one row: taps read rows -1..+2

%ifdef PIC
    lea         r5,        [tab_ChromaCoeff]
    movd        m0,        [r5 + r4 * 4]        ; 4 chroma filter taps for coeffIdx
%else
    movd        m0,        [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1,        m0, [tab_Vm]         ; m1 = taps 0/1 replicated for pmaddubsw
    pshufb      m0,        [tab_Vm + 16]        ; m0 = taps 2/3 replicated

    mov         r4d,       %2/2                 ; output row-pair counter

.loop:

    mov         r6d,       %1/16                ; 16-pixel column-group counter

.loopW:

    movu        m2,        [r0]                 ; source row 0
    movu        m3,        [r0 + r1]            ; source row 1

    punpcklbw   m4,        m2, m3
    punpckhbw   m2,        m3

    pmaddubsw   m4,        m1                   ; low 8 cols: tap0*row0 + tap1*row1
    pmaddubsw   m2,        m1                   ; high 8 cols

    lea         r5,        [r0 + 2 * r1]
    movu        m5,        [r5]                 ; source row 2
    movu        m6,        [r5 + r1]            ; source row 3

    punpckhbw   m7,        m5, m6
    pmaddubsw   m7,        m0
    paddw       m2,        m7                   ; + tap2*row2 + tap3*row3 (high)

    punpcklbw   m7,        m5, m6
    pmaddubsw   m7,        m0
    paddw       m4,        m7                   ; + tap2*row2 + tap3*row3 (low)

    mova        m7,        [pw_512]             ; pmulhrsw constant: rounded >> 6

    pmulhrsw    m4,        m7
    pmulhrsw    m2,        m7

    packuswb    m4,        m2

    movu        [r2],      m4                   ; store first output row

    ; second output row: reuse rows 1..3, load row 4
    punpcklbw   m4,        m3, m5
    punpckhbw   m3,        m5

    pmaddubsw   m4,        m1
    pmaddubsw   m3,        m1

    movu        m5,        [r5 + 2 * r1]        ; source row 4

    punpcklbw   m2,        m6, m5
    punpckhbw   m6,        m5

    pmaddubsw   m2,        m0
    pmaddubsw   m6,        m0

    paddw       m4,        m2
    paddw       m3,        m6

    pmulhrsw    m4,        m7
    pmulhrsw    m3,        m7

    packuswb    m4,        m3

    movu        [r2 + r3], m4                   ; store second output row

    add         r0,        16                   ; next 16-column group
    add         r2,        16
    dec         r6d
    jnz         .loopW

    lea         r0,        [r0 + r1 * 2 - %1]   ; rewind columns, advance 2 source rows
    lea         r2,        [r2 + r3 * 2 - %1]

    dec         r4d
    jnz         .loop
    RET
%endmacro
4399 | ||
; instantiate the 4-tap vertical chroma pp kernel for the 64xN and 48x64 sizes
FILTER_V4_W16n_H2 64, 64
FILTER_V4_W16n_H2 64, 32
FILTER_V4_W16n_H2 64, 48
FILTER_V4_W16n_H2 48, 64
FILTER_V4_W16n_H2 64, 16
4405 | ||
4406 | ||
;-----------------------------------------------------------------------------
; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
;
; Converts 8-bit pixels to the biased 16-bit short form: each pel p is
; interleaved with pb_128 and fed to pmaddubsw against tab_c_64_n64
; (presumably {64, -64} pairs), giving 64*p - 64*128, i.e. (p << 6) - 8192.
; Destination rows are FENC_STRIDE int16_t apart; 4 rows x 8 cols are produced
; per inner iteration, with a 4-pel store for a width tail.
; Assumes height is a multiple of 4 and width a multiple of 4 — TODO confirm.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal luma_p2s, 3, 7, 6

    ; load width and height
    mov         r3d, r3m
    mov         r4d, r4m

    ; load constants
    mova        m4, [pb_128]                    ; bias byte interleaved with each pel
    mova        m5, [tab_c_64_n64]              ; multiplier pairs for pmaddubsw

.loopH:

    xor         r5d, r5d                        ; r5 = current column offset
.loopW:
    lea         r6, [r0 + r5]

    movh        m0, [r6]                        ; row 0: 8 pels
    punpcklbw   m0, m4
    pmaddubsw   m0, m5                          ; 64*p - 64*128

    movh        m1, [r6 + r1]                   ; row 1
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r6 + r1 * 2]               ; row 2
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    lea         r6, [r6 + r1 * 2]
    movh        m3, [r6 + r1]                   ; row 3
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    add         r5, 8
    cmp         r5, r3
    jg          .width4                         ; fewer than 8 pels remain -> 4-pel tail
    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
    je          .nextH                          ; row width exactly consumed
    jmp         .loopW

.width4:                                        ; store only the low 4 results per row
    movh        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
    movh        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
    movh        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
    movh        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3

.nextH:
    lea         r0, [r0 + r1 * 4]               ; advance 4 source rows
    add         r2, FENC_STRIDE * 8             ; advance 4 dest rows (2 bytes/sample)

    sub         r4d, 4
    jnz         .loopH

    RET
4468 | ||
; Compute 4 rows of 4-pel-wide 8-tap vertical luma filtering.
; In:  r0 = src - 3*srcStride, r1 = srcStride, r6 -> 4 x 16B coefficient rows
; Out: m4 = word results rows 1-2, m5 = word results rows 3-4
; Side effect: advances r0 by 8 rows (caller rewinds).  Clobbers m0-m2.
%macro PROCESS_LUMA_W4_4R 0
    movd        m0, [r0]
    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                      ; m2=[0 1]

    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                          ; m1=[1 2]
    punpcklqdq  m2, m1                          ; m2=[0 1 1 2]
    pmaddubsw   m4, m2, [r6 + 0 * 16]           ; m4=[0+1 1+2]

    movd        m1, [r0 + r1]
    punpcklbw   m5, m0, m1                      ; m5=[2 3]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                          ; m1=[3 4]
    punpcklqdq  m5, m1                          ; m5=[2 3 3 4]
    pmaddubsw   m2, m5, [r6 + 1 * 16]           ; m2=[2+3 3+4]
    paddw       m4, m2                          ; m4=[0+1+2+3 1+2+3+4]          Row1-2
    pmaddubsw   m5, [r6 + 0 * 16]               ; m5=[2+3 3+4]                  Row3-4

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                      ; m2=[4 5]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                          ; m1=[5 6]
    punpcklqdq  m2, m1                          ; m2=[4 5 5 6]
    pmaddubsw   m1, m2, [r6 + 2 * 16]           ; m1=[4+5 5+6]
    paddw       m4, m1                          ; m4=[0+1+2+3+4+5 1+2+3+4+5+6]  Row1-2
    pmaddubsw   m2, [r6 + 1 * 16]               ; m2=[4+5 5+6]
    paddw       m5, m2                          ; m5=[2+3+4+5 3+4+5+6]          Row3-4

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                      ; m2=[6 7]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                          ; m1=[7 8]
    punpcklqdq  m2, m1                          ; m2=[6 7 7 8]
    pmaddubsw   m1, m2, [r6 + 3 * 16]           ; m1=[6+7 7+8]
    paddw       m4, m1                          ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8]   Row1-2 end
    pmaddubsw   m2, [r6 + 2 * 16]               ; m2=[6+7 7+8]
    paddw       m5, m2                          ; m5=[2+3+4+5+6+7 3+4+5+6+7+8]  Row3-4

    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                      ; m2=[8 9]
    movd        m0, [r0 + 2 * r1]
    punpcklbw   m1, m0                          ; m1=[9 10]
    punpcklqdq  m2, m1                          ; m2=[8 9 9 10]
    pmaddubsw   m2, [r6 + 3 * 16]               ; m2=[8+9 9+10]
    paddw       m5, m2                          ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10]  Row3-4 end
%endmacro
4520 | ||
; Compute 4 rows of 8-pel-wide 8-tap vertical luma filtering.
; In:  r0 = src - 3*srcStride, r1 = srcStride, r6 -> 4 x 16B coefficient rows
; Out: m7 = row1, m6 = row2, m5 = row3, m4 = row4 (word precision)
; Side effect: advances r0 by 8 rows (caller rewinds).
; Clobbers m0-m2; m3 is intentionally untouched (holds the caller's constant).
%macro PROCESS_LUMA_W8_4R 0
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m7, m0, [r6 + 0 *16]            ; m7=[0+1]               Row1

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m6, m1, [r6 + 0 *16]            ; m6=[1+2]               Row2

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m5, m0, [r6 + 0 *16]            ; m5=[2+3]               Row3
    pmaddubsw   m0, [r6 + 1 * 16]
    paddw       m7, m0                          ; m7=[0+1+2+3]           Row1

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m4, m1, [r6 + 0 *16]            ; m4=[3+4]               Row4
    pmaddubsw   m1, [r6 + 1 * 16]
    paddw       m6, m1                          ; m6=[1+2+3+4]           Row2

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m2, m0, [r6 + 1 * 16]
    pmaddubsw   m0, [r6 + 2 * 16]
    paddw       m7, m0                          ; m7=[0+1+2+3+4+5]       Row1
    paddw       m5, m2                          ; m5=[2+3+4+5]           Row3

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m2, m1, [r6 + 1 * 16]
    pmaddubsw   m1, [r6 + 2 * 16]
    paddw       m6, m1                          ; m6=[1+2+3+4+5+6]       Row2
    paddw       m4, m2                          ; m4=[3+4+5+6]           Row4

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m2, m0, [r6 + 2 * 16]
    pmaddubsw   m0, [r6 + 3 * 16]
    paddw       m7, m0                          ; m7=[0+1+2+3+4+5+6+7]   Row1 end
    paddw       m5, m2                          ; m5=[2+3+4+5+6+7]       Row3

    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m2, m1, [r6 + 2 * 16]
    pmaddubsw   m1, [r6 + 3 * 16]
    paddw       m6, m1                          ; m6=[1+2+3+4+5+6+7+8]   Row2 end
    paddw       m4, m2                          ; m4=[3+4+5+6+7+8]       Row4

    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m0, [r6 + 3 * 16]
    paddw       m5, m0                          ; m5=[2+3+4+5+6+7+8+9]   Row3 end

    movq        m0, [r0 + 2 * r1]
    punpcklbw   m1, m0
    pmaddubsw   m1, [r6 + 3 * 16]
    paddw       m4, m1                          ; m4=[3+4+5+6+7+8+9+10]  Row4 end
%endmacro
4585 | ||
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 8-tap luma interpolation for 4-wide blocks.  %1 = width (4),
; %2 = height, %3 = pp (pixel out, round+pack) or ps (int16 out, bias-subtract).
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_4xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
    lea       r5, [3 * r1]
    sub       r0, r5                            ; start 3 rows above: taps span rows -3..+4
    shl       r4d, 6                            ; coeffIdx * 64 = offset of 4x16B tap rows
%ifidn %3,ps
    add       r3d, r3d                          ; ps writes int16, double dst stride
%endif

%ifdef PIC
    lea       r5, [tab_LumaCoeffVer]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova      m3, [pw_512]                      ; pmulhrsw constant: rounded >> 6
%else
    mova      m3, [pw_2000]                     ; ps bias to subtract
%endif

    mov       r4d, %2/4                         ; 4 rows per iteration
    lea       r5, [4 * r1]

.loopH:
    PROCESS_LUMA_W4_4R                          ; m4 = rows 1-2, m5 = rows 3-4 (words)

%ifidn %3,pp
    pmulhrsw  m4, m3
    pmulhrsw  m5, m3

    packuswb  m4, m5

    movd      [r2], m4
    pextrd    [r2 + r3], m4, 1
    lea       r2, [r2 + 2 * r3]
    pextrd    [r2], m4, 2
    pextrd    [r2 + r3], m4, 3
%else
    psubw     m4, m3
    psubw     m5, m3

    movlps    [r2], m4
    movhps    [r2 + r3], m4
    lea       r2, [r2 + 2 * r3]
    movlps    [r2], m5
    movhps    [r2 + r3], m5
%endif

    sub       r0, r5                            ; macro advanced 8 rows; rewind 4
    lea       r2, [r2 + 2 * r3]

    dec       r4d
    jnz       .loopH

    RET
%endmacro
4648 | ||
b53f7c52 JB |
4649 | |
; AVX2 vertical 8-tap luma pp filter, 4x4 block.  Rows 0-10 are transposed
; into byte-pair/word lanes so the 8 taps reduce to pmaddubsw + pmaddwd.
INIT_YMM avx2
cglobal interp_8tap_vert_pp_4x4, 4,6,8
    mov             r4d, r4m
    lea             r5, [r1 * 3]
    sub             r0, r5                      ; start 3 rows above

    ; TODO: VPGATHERDD
    movd            xm1, [r0]                   ; m1 = row0
    movd            xm2, [r0 + r1]              ; m2 = row1
    punpcklbw       xm1, xm2                    ; m1 = [13 03 12 02 11 01 10 00]

    movd            xm3, [r0 + r1 * 2]          ; m3 = row2
    punpcklbw       xm2, xm3                    ; m2 = [23 13 22 12 21 11 20 10]
    movd            xm4, [r0 + r5]
    punpcklbw       xm3, xm4                    ; m3 = [33 23 32 22 31 21 30 20]
    punpcklwd       xm1, xm3                    ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]

    lea             r0, [r0 + r1 * 4]
    movd            xm5, [r0]                   ; m5 = row4
    punpcklbw       xm4, xm5                    ; m4 = [43 33 42 32 41 31 40 30]
    punpcklwd       xm2, xm4                    ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10]
    vinserti128     m1, m1, xm2, 1              ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
    movd            xm2, [r0 + r1]              ; m2 = row5
    punpcklbw       xm5, xm2                    ; m5 = [53 43 52 42 51 41 50 40]
    punpcklwd       xm3, xm5                    ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
    movd            xm6, [r0 + r1 * 2]          ; m6 = row6
    punpcklbw       xm2, xm6                    ; m2 = [63 53 62 52 61 51 60 50]
    punpcklwd       xm4, xm2                    ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]
    vinserti128     m3, m3, xm4, 1              ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
    movd            xm4, [r0 + r5]              ; m4 = row7
    punpcklbw       xm6, xm4                    ; m6 = [73 63 72 62 71 61 70 60]
    punpcklwd       xm5, xm6                    ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]

    lea             r0, [r0 + r1 * 4]
    movd            xm7, [r0]                   ; m7 = row8
    punpcklbw       xm4, xm7                    ; m4 = [83 73 82 72 81 71 80 70]
    punpcklwd       xm2, xm4                    ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]
    vinserti128     m5, m5, xm2, 1              ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
    movd            xm2, [r0 + r1]              ; m2 = row9
    punpcklbw       xm7, xm2                    ; m7 = [93 83 92 82 91 81 90 80]
    punpcklwd       xm6, xm7                    ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
    movd            xm7, [r0 + r1 * 2]          ; m7 = rowA
    punpcklbw       xm2, xm7                    ; m2 = [A3 93 A2 92 A1 91 A0 90]
    punpcklwd       xm4, xm2                    ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]
    vinserti128     m6, m6, xm4, 1              ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]

    ; load filter coeff: taps 0-3 in m0, taps 4-7 in m2
%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastd    m0, [r5 + r4 * 8 + 0]
    vpbroadcastd    m2, [r5 + r4 * 8 + 4]
%else
    vpbroadcastd    m0, [tab_LumaCoeff + r4 * 8 + 0]
    vpbroadcastd    m2, [tab_LumaCoeff + r4 * 8 + 4]
%endif

    pmaddubsw       m1, m0
    pmaddubsw       m3, m0
    pmaddubsw       m5, m2
    pmaddubsw       m6, m2
    vbroadcasti128  m0, [pw_1]
    pmaddwd         m1, m0                      ; horizontal word-pair sums
    pmaddwd         m3, m0
    pmaddwd         m5, m0
    pmaddwd         m6, m0
    paddd           m1, m5                      ; m1 = DQWORD ROW[1 0]
    paddd           m3, m6                      ; m3 = DQWORD ROW[3 2]
    packssdw        m1, m3                      ; m1 = QWORD ROW[3 1 2 0]

    ; TODO: does it overflow?
    pmulhrsw        m1, [pw_512]                ; rounded >> 6
    vextracti128    xm2, m1, 1
    packuswb        xm1, xm2                    ; m1 = DWORD ROW[3 1 2 0]
    movd            [r2], xm1
    pextrd          [r2 + r3], xm1, 2
    pextrd          [r2 + r3 * 2], xm1, 1
    lea             r4, [r3 * 3]
    pextrd          [r2 + r4], xm1, 3
    RET
4729 | ||
; AVX2 vertical 8-tap luma ps filter, 4x4 block: writes int16 results
; (filtered value minus pw_2000 bias), dst stride doubled for 2-byte samples.
INIT_YMM avx2
cglobal interp_8tap_vert_ps_4x4, 4, 6, 5
    mov             r4d, r4m
    shl             r4d, 7                      ; coeffIdx * 128 = offset of 4x32B tap rows

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4                      ; start 3 rows above

    add             r3d, r3d                    ; int16 output stride

    movd            xm1, [r0]
    pinsrd          xm1, [r0 + r1], 1
    pinsrd          xm1, [r0 + r1 * 2], 2
    pinsrd          xm1, [r0 + r4], 3           ; m1 = row[3 2 1 0]
    lea             r0, [r0 + r1 * 4]
    movd            xm2, [r0]
    pinsrd          xm2, [r0 + r1], 1
    pinsrd          xm2, [r0 + r1 * 2], 2
    pinsrd          xm2, [r0 + r4], 3           ; m2 = row[7 6 5 4]
    vinserti128     m1, m1, xm2, 1              ; m1 = row[7 6 5 4 3 2 1 0]
    lea             r0, [r0 + r1 * 4]
    movd            xm3, [r0]
    pinsrd          xm3, [r0 + r1], 1
    pinsrd          xm3, [r0 + r1 * 2], 2       ; m3 = row[x 10 9 8]
    vinserti128     m2, m2, xm3, 1              ; m2 = row[x 10 9 8 7 6 5 4]

    ; gather the row pairs each tap pair needs, then interleave bytes
    mova            m3, [interp4_vpp_shuf1]
    vpermd          m0, m3, m1                  ; m0 = row[4 3 3 2 2 1 1 0]
    vpermd          m4, m3, m2                  ; m4 = row[8 7 7 6 6 5 5 4]
    mova            m3, [interp4_vpp_shuf1 + mmsize]
    vpermd          m1, m3, m1                  ; m1 = row[6 5 5 4 4 3 3 2]
    vpermd          m2, m3, m2                  ; m2 = row[10 9 9 8 8 7 7 6]

    mova            m3, [interp4_vpp_shuf]
    pshufb          m0, m0, m3
    pshufb          m1, m1, m3
    pshufb          m4, m4, m3
    pshufb          m2, m2, m3
    pmaddubsw       m0, [r5]                    ; taps 0-1
    pmaddubsw       m1, [r5 + mmsize]           ; taps 2-3
    pmaddubsw       m4, [r5 + 2 * mmsize]       ; taps 4-5
    pmaddubsw       m2, [r5 + 3 * mmsize]       ; taps 6-7
    paddw           m0, m1
    paddw           m0, m4
    paddw           m0, m2                      ; m0 = WORD ROW[3 2 1 0]

    vbroadcasti128  m3, [pw_2000]
    psubw           m0, m3                      ; remove ps bias
    vextracti128    xm2, m0, 1
    lea             r5, [r3 * 3]
    movq            [r2], xm0
    movhps          [r2 + r3], xm0
    movq            [r2 + r3 * 2], xm2
    movhps          [r2 + r5], xm2
    RET
4791 | ||
72b9787e JB |
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 4, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 8, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 16, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 4, ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 8, ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_4xN 4, 16, ps
4821 | ||
b53f7c52 JB |
; AVX2: compute 8 rows of 8-pel-wide 8-tap vertical luma filtering.
; In:  r0 = src - 3*srcStride, r1 = srcStride, r4 = 3*srcStride,
;      r5 -> 4 x 32B coefficient rows (tap pairs 01/23/45/67)
; Out: m5 = rows 0-1, m2 = rows 2-3, m1 = rows 4-5, m4 = rows 6-7 (words)
; Side effect: leaves r0 advanced by 12 rows (callers rewind as needed).
; Clobbers m0, m3, m6.
%macro PROCESS_LUMA_AVX2_W8_8R 0
    movq            xm1, [r0]                   ; m1 = row 0
    movq            xm2, [r0 + r1]              ; m2 = row 1
    punpcklbw       xm1, xm2                    ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
    movq            xm3, [r0 + r1 * 2]          ; m3 = row 2
    punpcklbw       xm2, xm3                    ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
    vinserti128     m5, m1, xm2, 1              ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
    pmaddubsw       m5, [r5]
    movq            xm4, [r0 + r4]              ; m4 = row 3
    punpcklbw       xm3, xm4                    ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
    lea             r0, [r0 + r1 * 4]
    movq            xm1, [r0]                   ; m1 = row 4
    punpcklbw       xm4, xm1                    ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
    vinserti128     m2, m3, xm4, 1              ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
    paddw           m5, m0
    pmaddubsw       m2, [r5]
    movq            xm3, [r0 + r1]              ; m3 = row 5
    punpcklbw       xm1, xm3                    ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
    movq            xm4, [r0 + r1 * 2]          ; m4 = row 6
    punpcklbw       xm3, xm4                    ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
    vinserti128     m1, m1, xm3, 1              ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m5, m3
    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
    paddw           m2, m0
    pmaddubsw       m1, [r5]
    movq            xm3, [r0 + r4]              ; m3 = row 7
    punpcklbw       xm4, xm3                    ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
    lea             r0, [r0 + r1 * 4]
    movq            xm0, [r0]                   ; m0 = row 8
    punpcklbw       xm3, xm0                    ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
    vinserti128     m4, m4, xm3, 1              ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
    pmaddubsw       m3, m4, [r5 + 3 * mmsize]
    paddw           m5, m3                      ; rows 0-1 complete
    pmaddubsw       m3, m4, [r5 + 2 * mmsize]
    paddw           m2, m3
    pmaddubsw       m3, m4, [r5 + 1 * mmsize]
    paddw           m1, m3
    pmaddubsw       m4, [r5]
    movq            xm3, [r0 + r1]              ; m3 = row 9
    punpcklbw       xm0, xm3                    ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
    movq            xm6, [r0 + r1 * 2]          ; m6 = row 10
    punpcklbw       xm3, xm6                    ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
    vinserti128     m0, m0, xm3, 1              ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
    pmaddubsw       m3, m0, [r5 + 3 * mmsize]
    paddw           m2, m3                      ; rows 2-3 complete
    pmaddubsw       m3, m0, [r5 + 2 * mmsize]
    paddw           m1, m3
    pmaddubsw       m0, [r5 + 1 * mmsize]
    paddw           m4, m0

    movq            xm3, [r0 + r4]              ; m3 = row 11
    punpcklbw       xm6, xm3                    ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
    lea             r0, [r0 + r1 * 4]
    movq            xm0, [r0]                   ; m0 = row 12
    punpcklbw       xm3, xm0                    ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0]
    vinserti128     m6, m6, xm3, 1              ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0]
    pmaddubsw       m3, m6, [r5 + 3 * mmsize]
    paddw           m1, m3                      ; rows 4-5 complete
    pmaddubsw       m6, [r5 + 2 * mmsize]
    paddw           m4, m6
    movq            xm3, [r0 + r1]              ; m3 = row 13
    punpcklbw       xm0, xm3                    ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
    movq            xm6, [r0 + r1 * 2]          ; m6 = row 14
    punpcklbw       xm3, xm6                    ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0]
    vinserti128     m0, m0, xm3, 1              ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0]
    pmaddubsw       m0, [r5 + 3 * mmsize]
    paddw           m4, m0                      ; rows 6-7 complete
%endmacro
4892 | ||
; AVX2: compute 4 rows of 8-pel-wide 8-tap vertical luma filtering.
; In:  r0 = src - 3*srcStride, r1 = srcStride, r4 = 3*srcStride,
;      r5 -> 4 x 32B coefficient rows
; Out: m5 = rows 0-1, m2 = rows 2-3 (word precision)
; Side effect: leaves r0 advanced by 8 rows.  Clobbers m0, m1, m3, m4, m6.
%macro PROCESS_LUMA_AVX2_W8_4R 0
    movq            xm1, [r0]                   ; m1 = row 0
    movq            xm2, [r0 + r1]              ; m2 = row 1
    punpcklbw       xm1, xm2                    ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
    movq            xm3, [r0 + r1 * 2]          ; m3 = row 2
    punpcklbw       xm2, xm3                    ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10]
    vinserti128     m5, m1, xm2, 1              ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00]
    pmaddubsw       m5, [r5]
    movq            xm4, [r0 + r4]              ; m4 = row 3
    punpcklbw       xm3, xm4                    ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
    lea             r0, [r0 + r1 * 4]
    movq            xm1, [r0]                   ; m1 = row 4
    punpcklbw       xm4, xm1                    ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30]
    vinserti128     m2, m3, xm4, 1              ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20]
    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
    paddw           m5, m0
    pmaddubsw       m2, [r5]
    movq            xm3, [r0 + r1]              ; m3 = row 5
    punpcklbw       xm1, xm3                    ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
    movq            xm4, [r0 + r1 * 2]          ; m4 = row 6
    punpcklbw       xm3, xm4                    ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50]
    vinserti128     m1, m1, xm3, 1              ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40]
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m5, m3
    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
    paddw           m2, m0
    movq            xm3, [r0 + r4]              ; m3 = row 7
    punpcklbw       xm4, xm3                    ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
    lea             r0, [r0 + r1 * 4]
    movq            xm0, [r0]                   ; m0 = row 8
    punpcklbw       xm3, xm0                    ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70]
    vinserti128     m4, m4, xm3, 1              ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60]
    pmaddubsw       m3, m4, [r5 + 3 * mmsize]
    paddw           m5, m3                      ; rows 0-1 complete
    pmaddubsw       m3, m4, [r5 + 2 * mmsize]
    paddw           m2, m3
    movq            xm3, [r0 + r1]              ; m3 = row 9
    punpcklbw       xm0, xm3                    ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
    movq            xm6, [r0 + r1 * 2]          ; m6 = row 10
    punpcklbw       xm3, xm6                    ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90]
    vinserti128     m0, m0, xm3, 1              ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80]
    pmaddubsw       m3, m0, [r5 + 3 * mmsize]
    paddw           m2, m3                      ; rows 2-3 complete
%endmacro
4937 | ||
72b9787e JB |
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; SSE4 vertical 8-tap luma interpolation for 8-wide blocks.  %1 = width (8),
; %2 = height, %3 = pp (pixel out) or ps (int16 out, bias-subtract).
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_8xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
    lea       r5, [3 * r1]
    sub       r0, r5                            ; start 3 rows above
    shl       r4d, 6                            ; coeffIdx * 64 = offset of 4x16B tap rows

%ifidn %3,ps
    add       r3d, r3d                          ; ps writes int16, double dst stride
%endif

%ifdef PIC
    lea       r5, [tab_LumaCoeffVer]
    lea       r6, [r5 + r4]
%else
    lea       r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova      m3, [pw_512]                      ; pmulhrsw constant (W8_4R preserves m3)
%else
    mova      m3, [pw_2000]                     ; ps bias
%endif

    mov       r4d, %2/4                         ; 4 rows per iteration
    lea       r5, [4 * r1]

.loopH:
    PROCESS_LUMA_W8_4R                          ; m7..m4 = rows 1..4 (words)

%ifidn %3,pp
    pmulhrsw  m7, m3
    pmulhrsw  m6, m3
    pmulhrsw  m5, m3
    pmulhrsw  m4, m3

    packuswb  m7, m6
    packuswb  m5, m4

    movlps    [r2], m7
    movhps    [r2 + r3], m7
    lea       r2, [r2 + 2 * r3]
    movlps    [r2], m5
    movhps    [r2 + r3], m5
%else
    psubw     m7, m3
    psubw     m6, m3
    psubw     m5, m3
    psubw     m4, m3

    movu      [r2], m7
    movu      [r2 + r3], m6
    lea       r2, [r2 + 2 * r3]
    movu      [r2], m5
    movu      [r2 + r3], m4
%endif

    sub       r0, r5                            ; macro advanced 8 rows; rewind 4
    lea       r2, [r2 + 2 * r3]

    dec       r4d
    jnz       .loopH

    RET
%endmacro
5006 | ||
b53f7c52 JB |
; AVX2 vertical 8-tap luma pp filter for 8x%2 blocks (%2 a multiple of 8).
; All GPRs are in use, so the row-group counter lives in a stack word
; (cglobal reserves gprsize bytes of local stack).
%macro FILTER_VER_LUMA_AVX2_8xN 2
INIT_YMM avx2
cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize
    mov             r4d, r4m
    shl             r4d, 7                      ; coeffIdx * 128 = offset of 4x32B tap rows

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4                      ; start 3 rows above
    lea             r6, [r1 * 4]
    mov             word [rsp], %2 / 8          ; stack-resident loop counter
    mova            m7, [pw_512]                ; pmulhrsw constant (macro keeps m7)

.loop:
    PROCESS_LUMA_AVX2_W8_8R
    pmulhrsw        m5, m7                      ; m5 = word: row 0, row 1
    pmulhrsw        m2, m7                      ; m2 = word: row 2, row 3
    pmulhrsw        m1, m7                      ; m1 = word: row 4, row 5
    pmulhrsw        m4, m7                      ; m4 = word: row 6, row 7
    packuswb        m5, m2
    packuswb        m1, m4
    vextracti128    xm2, m5, 1
    vextracti128    xm4, m1, 1
    movq            [r2], xm5
    movq            [r2 + r3], xm2
    lea             r2, [r2 + r3 * 2]
    movhps          [r2], xm5
    movhps          [r2 + r3], xm2
    lea             r2, [r2 + r3 * 2]
    movq            [r2], xm1
    movq            [r2 + r3], xm4
    lea             r2, [r2 + r3 * 2]
    movhps          [r2], xm1
    movhps          [r2 + r3], xm4
    lea             r2, [r2 + r3 * 2]
    sub             r0, r6                      ; macro advanced 12 rows; rewind 4
    dec             word [rsp]
    jnz             .loop
    RET
%endmacro
5052 | ||
; AVX2 vertical 8-tap luma pp filter, 8x8 block: one straight-line pass of the
; 8-row macro, then round/pack/store four 8-byte row pairs.
INIT_YMM avx2
cglobal interp_8tap_vert_pp_8x8, 4, 6, 7
    mov             r4d, r4m
    shl             r4d, 7                      ; coeffIdx * 128

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4                      ; start 3 rows above
    PROCESS_LUMA_AVX2_W8_8R
    lea             r4, [r3 * 3]
    mova            m3, [pw_512]                ; pmulhrsw constant: rounded >> 6
    pmulhrsw        m5, m3                      ; m5 = word: row 0, row 1
    pmulhrsw        m2, m3                      ; m2 = word: row 2, row 3
    pmulhrsw        m1, m3                      ; m1 = word: row 4, row 5
    pmulhrsw        m4, m3                      ; m4 = word: row 6, row 7
    packuswb        m5, m2
    packuswb        m1, m4
    vextracti128    xm2, m5, 1
    vextracti128    xm4, m1, 1
    movq            [r2], xm5
    movq            [r2 + r3], xm2
    movhps          [r2 + r3 * 2], xm5
    movhps          [r2 + r4], xm2
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm1
    movq            [r2 + r3], xm4
    movhps          [r2 + r3 * 2], xm1
    movhps          [r2 + r4], xm4
    RET
5088 | ||
; AVX2 vertical 8-tap luma pp filter, 8x4 block: one pass of the 4-row macro,
; then round/pack/store the four 8-byte rows.
INIT_YMM avx2
cglobal interp_8tap_vert_pp_8x4, 4, 6, 7
    mov             r4d, r4m
    shl             r4d, 7                      ; coeffIdx * 128

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4                      ; start 3 rows above
    PROCESS_LUMA_AVX2_W8_4R
    lea             r4, [r3 * 3]
    mova            m3, [pw_512]                ; pmulhrsw constant: rounded >> 6
    pmulhrsw        m5, m3                      ; m5 = word: row 0, row 1
    pmulhrsw        m2, m3                      ; m2 = word: row 2, row 3
    packuswb        m5, m2
    vextracti128    xm2, m5, 1
    movq            [r2], xm5
    movq            [r2 + r3], xm2
    movhps          [r2 + r3 * 2], xm5
    movhps          [r2 + r4], xm2
    RET
5115 | ||
72b9787e JB |
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 4, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 8, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 16, pp
FILTER_VER_LUMA_AVX2_8xN 8, 16

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 32, pp
FILTER_VER_LUMA_AVX2_8xN 8, 32

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 4, ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 8, ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 16, ps

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_8xN 8, 32, ps
5157 | ||
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; SSE4 vertical 8-tap luma filter for 12-wide blocks.
;   %1 = width (12), %2 = height, %3 = pp (rounded pixel output) or
;   ps (16-bit intermediate output, dst stride is in int16 units).
; Each .loopH iteration produces a band of 4 output rows: an 8-wide strip
; via PROCESS_LUMA_W8_4R, then the remaining 4-wide strip via
; PROCESS_LUMA_W4_4R.  Both helper macros are defined earlier in this file;
; from the stores below they leave the filtered rows as 16-bit words in
; m7/m6/m5/m4 (W8 path) and m4/m5 (W4 path).
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_12xN 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
    lea         r5, [3 * r1]
    sub         r0, r5                  ; r0 -> src - 3 * srcStride (first of the 8 taps)
    shl         r4d, 6                  ; r4d = coeffIdx * 64 = byte offset into tab_LumaCoeffVer
%ifidn %3,ps
    add         r3d, r3d                ; ps writes 16-bit samples: double the dst byte stride
%endif

%ifdef PIC
    lea         r5, [tab_LumaCoeffVer]
    lea         r6, [r5 + r4]           ; r6 -> selected coefficient row
%else
    lea         r6, [tab_LumaCoeffVer + r4]
%endif

%ifidn %3,pp
    mova        m3, [pw_512]            ; pmulhrsw by 512: (x*512 + 2^14) >> 15 == (x + 32) >> 6
%else
    mova        m3, [pw_2000]           ; ps path: bias subtracted from the 16-bit intermediates
%endif

    mov         r4d, %2/4               ; band counter, 4 rows per iteration

.loopH:
    ; left 8-wide strip of the current 4-row band
    PROCESS_LUMA_W8_4R

%ifidn %3,pp
    pmulhrsw    m7, m3                  ; round-shift each row by 6
    pmulhrsw    m6, m3
    pmulhrsw    m5, m3
    pmulhrsw    m4, m3

    packuswb    m7, m6                  ; clamp to u8: rows 0|1 and 2|3
    packuswb    m5, m4

    movlps      [r2], m7
    movhps      [r2 + r3], m7
    lea         r5, [r2 + 2 * r3]
    movlps      [r5], m5
    movhps      [r5 + r3], m5
%else
    psubw       m7, m3                  ; remove bias, keep 16-bit intermediates
    psubw       m6, m3
    psubw       m5, m3
    psubw       m4, m3

    movu        [r2], m7
    movu        [r2 + r3], m6
    lea         r5, [r2 + 2 * r3]
    movu        [r5], m5
    movu        [r5 + r3], m4
%endif

    ; r0 -= 8*srcStride - 8: back to the top of the band, 8 pixels right,
    ; for the 4-wide remainder.
    ; NOTE(review): assumes PROCESS_LUMA_W8_4R advanced r0 by 8 rows — confirm
    ; against the macro definition earlier in the file.
    lea         r5, [8 * r1 - 8]
    sub         r0, r5
%ifidn %3,pp
    add         r2, 8                   ; 8 output pixels already written
%else
    add         r2, 16                  ; 8 output words already written
%endif

    ; right 4-wide strip of the band
    PROCESS_LUMA_W4_4R

%ifidn %3,pp
    pmulhrsw    m4, m3
    pmulhrsw    m5, m3

    packuswb    m4, m5                  ; 4 rows x 4 pixels packed into one xmm

    movd        [r2], m4
    pextrd      [r2 + r3], m4, 1
    lea         r5, [r2 + 2 * r3]
    pextrd      [r5], m4, 2
    pextrd      [r5 + r3], m4, 3
%else
    psubw       m4, m3
    psubw       m5, m3

    movlps      [r2], m4
    movhps      [r2 + r3], m4
    lea         r5, [r2 + 2 * r3]
    movlps      [r5], m5
    movhps      [r5 + r3], m5
%endif

    ; advance to the next 4-row band: r0 -= 4*srcStride + 8 (undo the column
    ; shift, move down net 4 rows); dst moves down 4 rows, back to column 0
    lea         r5, [4 * r1 + 8]
    sub         r0, r5
%ifidn %3,pp
    lea         r2, [r2 + 4 * r3 - 8]
%else
    lea         r2, [r2 + 4 * r3 - 16]
%endif

    dec         r4d
    jnz         .loopH

    RET
%endmacro
5261 | ||
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_12xN 12, 16, pp

;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_12xN 12, 16, ps
5271 | ||
b53f7c52 JB |
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 override, x86-64 only (uses ymm0-ymm14).  Fully unrolled 8-tap
; vertical filter for one 12x16 block:
;   - each source row is loaded 16 bytes wide; adjacent rows are byte
;     interleaved (punpcklbw -> low lane, punpckhbw -> high lane via
;     vinserti128) so a single pmaddubsw against a coefficient-pair row of
;     tab_LumaCoeffVer_32 accumulates two taps at once;
;   - [r5 + k*mmsize] is the k-th tap pair (k = 0..3); each output row sums
;     four such products;
;   - pmulhrsw with pw_512 performs the (x + 32) >> 6 rounding, packuswb
;     clamps, and only 12 of the 16 computed pixels per row are stored
;     (movq = 8 bytes, pextrd = 4 bytes).
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
; r4 = 3*srcStride, r5 = coefficient base, r6 = 3*dstStride, m14 = pw_512.
;-------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_12x16, 4, 7, 15
    mov             r4d, r4m            ; coeffIdx is the 5th arg, fetched from stack
    shl             r4d, 7              ; r4d = coeffIdx * 128 (tab_LumaCoeffVer_32 row size)

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4              ; r0 -> src - 3 * srcStride (first tap)
    lea             r6, [r3 * 3]
    mova            m14, [pw_512]

    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8                          ; output row 0 complete (all 4 tap pairs)
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    pmaddubsw       m8, [r5]
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    pmaddubsw       m9, [r5]
    movu            xm11, [r0 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
    paddw           m8, m12
    pmaddubsw       m10, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm12, [r0]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13
    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
    paddw           m9, m13
    pmaddubsw       m11, [r5]

    ; output rows 0-5 are complete: round, clamp, un-interleave, store 12 px
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b               ; fix lane order after in-lane packuswb
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movq            [r2], xm0
    pextrd          [r2 + 8], xm0, 2
    movq            [r2 + r3], xm1
    pextrd          [r2 + r3 + 8], xm1, 2
    movq            [r2 + r3 * 2], xm2
    pextrd          [r2 + r3 * 2 + 8], xm2, 2
    movq            [r2 + r6], xm3
    pextrd          [r2 + r6 + 8], xm3, 2
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm4
    pextrd          [r2 + 8], xm4, 2
    movq            [r2 + r3], xm5
    pextrd          [r2 + r3 + 8], xm5, 2

    movu            xm13, [r0 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
    paddw           m8, m0
    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
    paddw           m10, m0
    pmaddubsw       m12, [r5]
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1
    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
    paddw           m9, m1
    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
    paddw           m11, m1
    pmaddubsw       m13, [r5]

    ; output rows 6-7 complete
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movq            [r2 + r3 * 2], xm6
    pextrd          [r2 + r3 * 2 + 8], xm6, 2
    movq            [r2 + r6], xm7
    pextrd          [r2 + r6 + 8], xm7, 2
    lea             r2, [r2 + r3 * 4]

    ; tail: rows 15-22 feed the remaining output rows 8-15; later input rows
    ; need progressively fewer tap pairs, so the pmaddubsw forms turn destructive
    movu            xm1, [r0 + r4]                  ; m1 = row 15
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
    paddw           m8, m2
    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
    paddw           m10, m2
    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
    paddw           m12, m2
    pmaddubsw       m0, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm2, [r0]                       ; m2 = row 16
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
    paddw           m9, m3
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m11, m3
    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
    paddw           m13, m3
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r1]                  ; m3 = row 17
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
    paddw           m10, m4
    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
    paddw           m12, m4
    pmaddubsw       m2, [r5 + 1 * mmsize]
    paddw           m0, m2
    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
    paddw           m11, m5
    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
    paddw           m13, m5
    pmaddubsw       m3, [r5 + 1 * mmsize]
    paddw           m1, m3
    movu            xm5, [r0 + r4]                  ; m5 = row 19
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
    paddw           m12, m6
    pmaddubsw       m4, [r5 + 2 * mmsize]
    paddw           m0, m4
    lea             r0, [r0 + r1 * 4]
    movu            xm6, [r0]                       ; m6 = row 20
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
    paddw           m13, m7
    pmaddubsw       m5, [r5 + 2 * mmsize]
    paddw           m1, m5
    movu            xm7, [r0 + r1]                  ; m7 = row 21
    punpckhbw       xm2, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm2, 1
    pmaddubsw       m6, [r5 + 3 * mmsize]
    paddw           m0, m6
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
    punpckhbw       xm3, xm7, xm2
    punpcklbw       xm7, xm2
    vinserti128     m7, m7, xm3, 1
    pmaddubsw       m7, [r5 + 3 * mmsize]
    paddw           m1, m7

    ; output rows 8-15 complete: round, clamp, store
    pmulhrsw        m8, m14                         ; m8 = word: row 8
    pmulhrsw        m9, m14                         ; m9 = word: row 9
    pmulhrsw        m10, m14                        ; m10 = word: row 10
    pmulhrsw        m11, m14                        ; m11 = word: row 11
    pmulhrsw        m12, m14                        ; m12 = word: row 12
    pmulhrsw        m13, m14                        ; m13 = word: row 13
    pmulhrsw        m0, m14                         ; m0 = word: row 14
    pmulhrsw        m1, m14                         ; m1 = word: row 15
    packuswb        m8, m9
    packuswb        m10, m11
    packuswb        m12, m13
    packuswb        m0, m1
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vpermq          m12, m12, 11011000b
    vpermq          m0, m0, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    vextracti128    xm13, m12, 1
    vextracti128    xm1, m0, 1
    movq            [r2], xm8
    pextrd          [r2 + 8], xm8, 2
    movq            [r2 + r3], xm9
    pextrd          [r2 + r3 + 8], xm9, 2
    movq            [r2 + r3 * 2], xm10
    pextrd          [r2 + r3 * 2 + 8], xm10, 2
    movq            [r2 + r6], xm11
    pextrd          [r2 + r6 + 8], xm11, 2
    lea             r2, [r2 + r3 * 4]
    movq            [r2], xm12
    pextrd          [r2 + 8], xm12, 2
    movq            [r2 + r3], xm13
    pextrd          [r2 + r3 + 8], xm13, 2
    movq            [r2 + r3 * 2], xm0
    pextrd          [r2 + r3 * 2 + 8], xm0, 2
    movq            [r2 + r6], xm1
    pextrd          [r2 + r6 + 8], xm1, 2
    RET
%endif
5578 | ||
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_16x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 override, x86-64 only (uses ymm0-ymm14).  Same fully unrolled 8-tap
; vertical filtering scheme as interp_8tap_vert_pp_12x16 above — row pairs
; byte-interleaved across the two ymm lanes, pmaddubsw against the four
; coefficient-pair rows at [r5 + k*mmsize], pmulhrsw/pw_512 rounding
; ((x + 32) >> 6) — but the full 16 pixels of every output row are stored
; with movu.
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
; r4 = 3*srcStride, r5 = coefficient base, r6 = 3*dstStride, m14 = pw_512.
;-------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_16x16, 4, 7, 15
    mov             r4d, r4m            ; coeffIdx (5th arg) from stack
    shl             r4d, 7              ; r4d = coeffIdx * 128 (tab_LumaCoeffVer_32 row size)

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4              ; r0 -> src - 3 * srcStride (first tap)
    lea             r6, [r3 * 3]
    mova            m14, [pw_512]

    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8                          ; output row 0 complete
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    pmaddubsw       m8, [r5]
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    pmaddubsw       m9, [r5]
    movu            xm11, [r0 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
    paddw           m8, m12
    pmaddubsw       m10, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm12, [r0]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13
    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
    paddw           m9, m13
    pmaddubsw       m11, [r5]

    ; output rows 0-5 complete: round, clamp, un-interleave, store
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b               ; fix lane order after in-lane packuswb
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm4
    movu            [r2 + r3], xm5

    movu            xm13, [r0 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
    paddw           m8, m0
    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
    paddw           m10, m0
    pmaddubsw       m12, [r5]
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1
    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
    paddw           m9, m1
    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
    paddw           m11, m1
    pmaddubsw       m13, [r5]

    ; output rows 6-7 complete
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movu            [r2 + r3 * 2], xm6
    movu            [r2 + r6], xm7
    lea             r2, [r2 + r3 * 4]

    ; tail: rows 15-22 finish output rows 8-15; later input rows need fewer
    ; tap pairs, so the pmaddubsw forms become destructive
    movu            xm1, [r0 + r4]                  ; m1 = row 15
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
    paddw           m8, m2
    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
    paddw           m10, m2
    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
    paddw           m12, m2
    pmaddubsw       m0, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm2, [r0]                       ; m2 = row 16
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
    paddw           m9, m3
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m11, m3
    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
    paddw           m13, m3
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r1]                  ; m3 = row 17
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
    paddw           m10, m4
    pmaddubsw       m4, m2, [r5 + 2 * mmsize]
    paddw           m12, m4
    pmaddubsw       m2, [r5 + 1 * mmsize]
    paddw           m0, m2
    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
    paddw           m11, m5
    pmaddubsw       m5, m3, [r5 + 2 * mmsize]
    paddw           m13, m5
    pmaddubsw       m3, [r5 + 1 * mmsize]
    paddw           m1, m3
    movu            xm5, [r0 + r4]                  ; m5 = row 19
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 3 * mmsize]
    paddw           m12, m6
    pmaddubsw       m4, [r5 + 2 * mmsize]
    paddw           m0, m4
    lea             r0, [r0 + r1 * 4]
    movu            xm6, [r0]                       ; m6 = row 20
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 3 * mmsize]
    paddw           m13, m7
    pmaddubsw       m5, [r5 + 2 * mmsize]
    paddw           m1, m5
    movu            xm7, [r0 + r1]                  ; m7 = row 21
    punpckhbw       xm2, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm2, 1
    pmaddubsw       m6, [r5 + 3 * mmsize]
    paddw           m0, m6
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 22
    punpckhbw       xm3, xm7, xm2
    punpcklbw       xm7, xm2
    vinserti128     m7, m7, xm3, 1
    pmaddubsw       m7, [r5 + 3 * mmsize]
    paddw           m1, m7

    ; output rows 8-15 complete: round, clamp, store
    pmulhrsw        m8, m14                         ; m8 = word: row 8
    pmulhrsw        m9, m14                         ; m9 = word: row 9
    pmulhrsw        m10, m14                        ; m10 = word: row 10
    pmulhrsw        m11, m14                        ; m11 = word: row 11
    pmulhrsw        m12, m14                        ; m12 = word: row 12
    pmulhrsw        m13, m14                        ; m13 = word: row 13
    pmulhrsw        m0, m14                         ; m0 = word: row 14
    pmulhrsw        m1, m14                         ; m1 = word: row 15
    packuswb        m8, m9
    packuswb        m10, m11
    packuswb        m12, m13
    packuswb        m0, m1
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vpermq          m12, m12, 11011000b
    vpermq          m0, m0, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    vextracti128    xm13, m12, 1
    vextracti128    xm1, m0, 1
    movu            [r2], xm8
    movu            [r2 + r3], xm9
    movu            [r2 + r3 * 2], xm10
    movu            [r2 + r6], xm11
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm12
    movu            [r2 + r3], xm13
    movu            [r2 + r3 * 2], xm0
    movu            [r2 + r6], xm1
    RET
%endif
5869 | ||
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_pp_16x12(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; AVX2 override, x86-64 only.  Identical fully unrolled scheme to
; interp_8tap_vert_pp_16x16, truncated to 12 output rows: the tail only
; loads source rows up to row 18 and drops the accumulator updates that
; would feed output rows 12-15.
; Register roles: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride,
; r4 = 3*srcStride, r5 = coefficient base, r6 = 3*dstStride, m14 = pw_512.
;-------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_16x12, 4, 7, 15
    mov             r4d, r4m            ; coeffIdx (5th arg) from stack
    shl             r4d, 7              ; r4d = coeffIdx * 128 (tab_LumaCoeffVer_32 row size)

%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif

    lea             r4, [r1 * 3]
    sub             r0, r4              ; r0 -> src - 3 * srcStride (first tap)
    lea             r6, [r3 * 3]
    mova            m14, [pw_512]

    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm4, [r0]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r0 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r0 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8                          ; output row 0 complete
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm8, [r0]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r0 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    pmaddubsw       m8, [r5]
    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    pmaddubsw       m9, [r5]
    movu            xm11, [r0 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    pmaddubsw       m12, m10, [r5 + 1 * mmsize]
    paddw           m8, m12
    pmaddubsw       m10, [r5]
    lea             r0, [r0 + r1 * 4]
    movu            xm12, [r0]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13
    pmaddubsw       m13, m11, [r5 + 1 * mmsize]
    paddw           m9, m13
    pmaddubsw       m11, [r5]

    ; output rows 0-5 complete: round, clamp, un-interleave, store
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b               ; fix lane order after in-lane packuswb
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    lea             r2, [r2 + r3 * 4]
    movu            [r2], xm4
    movu            [r2 + r3], xm5

    ; rows 13-14: no [r5] products here — rows 12+ of the output don't exist
    movu            xm13, [r0 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    pmaddubsw       m0, m12, [r5 + 2 * mmsize]
    paddw           m8, m0
    pmaddubsw       m0, m12, [r5 + 1 * mmsize]
    paddw           m10, m0
    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1
    pmaddubsw       m1, m13, [r5 + 2 * mmsize]
    paddw           m9, m1
    pmaddubsw       m1, m13, [r5 + 1 * mmsize]
    paddw           m11, m1

    ; output rows 6-7 complete
    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movu            [r2 + r3 * 2], xm6
    movu            [r2 + r6], xm7
    lea             r2, [r2 + r3 * 4]

    ; tail: rows 15-18 finish the last four output rows (8-11)
    movu            xm1, [r0 + r4]                  ; m1 = row 15
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m2, m0, [r5 + 3 * mmsize]
    paddw           m8, m2
    pmaddubsw       m2, m0, [r5 + 2 * mmsize]
    paddw           m10, m2
    lea             r0, [r0 + r1 * 4]
    movu            xm2, [r0]                       ; m2 = row 16
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 3 * mmsize]
    paddw           m9, m3
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m11, m3
    movu            xm3, [r0 + r1]                  ; m3 = row 17
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 3 * mmsize]
    paddw           m10, m4
    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 3 * mmsize]
    paddw           m11, m5

    ; output rows 8-11 complete: round, clamp, store
    pmulhrsw        m8, m14                         ; m8 = word: row 8
    pmulhrsw        m9, m14                         ; m9 = word: row 9
    pmulhrsw        m10, m14                        ; m10 = word: row 10
    pmulhrsw        m11, m14                        ; m11 = word: row 11
    packuswb        m8, m9
    packuswb        m10, m11
    vpermq          m8, m8, 11011000b
    vpermq          m10, m10, 11011000b
    vextracti128    xm9, m8, 1
    vextracti128    xm11, m10, 1
    movu            [r2], xm8
    movu            [r2 + r3], xm9
    movu            [r2 + r3 * 2], xm10
    movu            [r2 + r6], xm11
    RET
%endif
6100 | ||
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_16x8 -- AVX2 8-tap vertical luma filter, pixel->pixel,
; for one 16-wide x 8-high block (x86-64 only; needs 15 ymm registers).
; Register roles (grounded in the code below):
;   r0 = src (rewound by 3*srcStride so taps -3..+4 surround each output row)
;   r1 = srcStride, r2 = dst, r3 = dstStride
;   r4m = filter index; <<7 selects one 128-byte entry (4 ymm rows of
;         coefficients) in tab_LumaCoeffVer_32, then r4 is reused as 3*r1
;   r5 = coefficient pointer, r6 = 3*dstStride, m14 = pw_512 round constant
; Each source row pair is byte-interleaved (punpckl/hbw + vinserti128) so
; pmaddubsw applies two vertical taps at once; partial sums for up to four
; output rows are kept live simultaneously and finished with
; pmulhrsw(pw_512) -- i.e. (sum + 32) >> 6 -- then packuswb clamps to bytes.
; NOTE(review): the (sum+32)>>6 normalization assumes 8-bit pixel depth with
; 6-bit filter scaling, per the x265 luma filter convention -- confirm.
;-----------------------------------------------------------------------------
6101 | INIT_YMM avx2 | |
6102 | %if ARCH_X86_64 == 1 | |
6103 | cglobal interp_8tap_vert_pp_16x8, 4, 7, 15 | |
6104 | mov r4d, r4m | |
6105 | shl r4d, 7 | |
6106 | ||
6107 | %ifdef PIC | |
6108 | lea r5, [tab_LumaCoeffVer_32] | |
6109 | add r5, r4 | |
6110 | %else | |
6111 | lea r5, [tab_LumaCoeffVer_32 + r4] | |
6112 | %endif | |
6113 | ||
6114 | lea r4, [r1 * 3] | |
6115 | sub r0, r4 | |
6116 | lea r6, [r3 * 3] | |
6117 | mova m14, [pw_512] | |
6118 | ||
; Prologue to the steady state: load rows 0..12, interleaving each adjacent
; row pair and accumulating tap products into the partial sums m0..m7
; (one accumulator per output row; [r5 + k*mmsize] = tap pair k).
6119 | movu xm0, [r0] ; m0 = row 0 | |
6120 | movu xm1, [r0 + r1] ; m1 = row 1 | |
6121 | punpckhbw xm2, xm0, xm1 | |
6122 | punpcklbw xm0, xm1 | |
6123 | vinserti128 m0, m0, xm2, 1 | |
6124 | pmaddubsw m0, [r5] | |
6125 | movu xm2, [r0 + r1 * 2] ; m2 = row 2 | |
6126 | punpckhbw xm3, xm1, xm2 | |
6127 | punpcklbw xm1, xm2 | |
6128 | vinserti128 m1, m1, xm3, 1 | |
6129 | pmaddubsw m1, [r5] | |
6130 | movu xm3, [r0 + r4] ; m3 = row 3 | |
6131 | punpckhbw xm4, xm2, xm3 | |
6132 | punpcklbw xm2, xm3 | |
6133 | vinserti128 m2, m2, xm4, 1 | |
6134 | pmaddubsw m4, m2, [r5 + 1 * mmsize] | |
6135 | paddw m0, m4 | |
6136 | pmaddubsw m2, [r5] | |
6137 | lea r0, [r0 + r1 * 4] | |
6138 | movu xm4, [r0] ; m4 = row 4 | |
6139 | punpckhbw xm5, xm3, xm4 | |
6140 | punpcklbw xm3, xm4 | |
6141 | vinserti128 m3, m3, xm5, 1 | |
6142 | pmaddubsw m5, m3, [r5 + 1 * mmsize] | |
6143 | paddw m1, m5 | |
6144 | pmaddubsw m3, [r5] | |
6145 | movu xm5, [r0 + r1] ; m5 = row 5 | |
6146 | punpckhbw xm6, xm4, xm5 | |
6147 | punpcklbw xm4, xm5 | |
6148 | vinserti128 m4, m4, xm6, 1 | |
6149 | pmaddubsw m6, m4, [r5 + 2 * mmsize] | |
6150 | paddw m0, m6 | |
6151 | pmaddubsw m6, m4, [r5 + 1 * mmsize] | |
6152 | paddw m2, m6 | |
6153 | pmaddubsw m4, [r5] | |
6154 | movu xm6, [r0 + r1 * 2] ; m6 = row 6 | |
6155 | punpckhbw xm7, xm5, xm6 | |
6156 | punpcklbw xm5, xm6 | |
6157 | vinserti128 m5, m5, xm7, 1 | |
6158 | pmaddubsw m7, m5, [r5 + 2 * mmsize] | |
6159 | paddw m1, m7 | |
6160 | pmaddubsw m7, m5, [r5 + 1 * mmsize] | |
6161 | paddw m3, m7 | |
6162 | pmaddubsw m5, [r5] | |
6163 | movu xm7, [r0 + r4] ; m7 = row 7 | |
6164 | punpckhbw xm8, xm6, xm7 | |
6165 | punpcklbw xm6, xm7 | |
6166 | vinserti128 m6, m6, xm8, 1 | |
6167 | pmaddubsw m8, m6, [r5 + 3 * mmsize] | |
6168 | paddw m0, m8 | |
6169 | pmaddubsw m8, m6, [r5 + 2 * mmsize] | |
6170 | paddw m2, m8 | |
6171 | pmaddubsw m8, m6, [r5 + 1 * mmsize] | |
6172 | paddw m4, m8 | |
6173 | pmaddubsw m6, [r5] | |
6174 | lea r0, [r0 + r1 * 4] | |
6175 | movu xm8, [r0] ; m8 = row 8 | |
6176 | punpckhbw xm9, xm7, xm8 | |
6177 | punpcklbw xm7, xm8 | |
6178 | vinserti128 m7, m7, xm9, 1 | |
6179 | pmaddubsw m9, m7, [r5 + 3 * mmsize] | |
6180 | paddw m1, m9 | |
6181 | pmaddubsw m9, m7, [r5 + 2 * mmsize] | |
6182 | paddw m3, m9 | |
6183 | pmaddubsw m9, m7, [r5 + 1 * mmsize] | |
6184 | paddw m5, m9 | |
6185 | pmaddubsw m7, [r5] | |
6186 | movu xm9, [r0 + r1] ; m9 = row 9 | |
6187 | punpckhbw xm10, xm8, xm9 | |
6188 | punpcklbw xm8, xm9 | |
6189 | vinserti128 m8, m8, xm10, 1 | |
6190 | pmaddubsw m10, m8, [r5 + 3 * mmsize] | |
6191 | paddw m2, m10 | |
6192 | pmaddubsw m10, m8, [r5 + 2 * mmsize] | |
6193 | paddw m4, m10 | |
6194 | pmaddubsw m10, m8, [r5 + 1 * mmsize] | |
6195 | paddw m6, m10 | |
6196 | movu xm10, [r0 + r1 * 2] ; m10 = row 10 | |
6197 | punpckhbw xm11, xm9, xm10 | |
6198 | punpcklbw xm9, xm10 | |
6199 | vinserti128 m9, m9, xm11, 1 | |
6200 | pmaddubsw m11, m9, [r5 + 3 * mmsize] | |
6201 | paddw m3, m11 | |
6202 | pmaddubsw m11, m9, [r5 + 2 * mmsize] | |
6203 | paddw m5, m11 | |
6204 | pmaddubsw m11, m9, [r5 + 1 * mmsize] | |
6205 | paddw m7, m11 | |
6206 | movu xm11, [r0 + r4] ; m11 = row 11 | |
6207 | punpckhbw xm12, xm10, xm11 | |
6208 | punpcklbw xm10, xm11 | |
6209 | vinserti128 m10, m10, xm12, 1 | |
6210 | pmaddubsw m12, m10, [r5 + 3 * mmsize] | |
6211 | paddw m4, m12 | |
6212 | pmaddubsw m12, m10, [r5 + 2 * mmsize] | |
6213 | paddw m6, m12 | |
6214 | lea r0, [r0 + r1 * 4] | |
6215 | movu xm12, [r0] ; m12 = row 12 | |
6216 | punpckhbw xm13, xm11, xm12 | |
6217 | punpcklbw xm11, xm12 | |
6218 | vinserti128 m11, m11, xm13, 1 | |
6219 | pmaddubsw m13, m11, [r5 + 3 * mmsize] | |
6220 | paddw m5, m13 | |
6221 | pmaddubsw m13, m11, [r5 + 2 * mmsize] | |
6222 | paddw m7, m13 | |
6223 | ||
; Output rows 0-5 are complete: round ((x + 32) >> 6 via pmulhrsw pw_512),
; pack word->byte with unsigned saturation, undo the lane interleave with
; vpermq 0xD8, and store 16 bytes per row.
6224 | pmulhrsw m0, m14 ; m0 = word: row 0 | |
6225 | pmulhrsw m1, m14 ; m1 = word: row 1 | |
6226 | pmulhrsw m2, m14 ; m2 = word: row 2 | |
6227 | pmulhrsw m3, m14 ; m3 = word: row 3 | |
6228 | pmulhrsw m4, m14 ; m4 = word: row 4 | |
6229 | pmulhrsw m5, m14 ; m5 = word: row 5 | |
6230 | packuswb m0, m1 | |
6231 | packuswb m2, m3 | |
6232 | packuswb m4, m5 | |
6233 | vpermq m0, m0, 11011000b | |
6234 | vpermq m2, m2, 11011000b | |
6235 | vpermq m4, m4, 11011000b | |
6236 | vextracti128 xm1, m0, 1 | |
6237 | vextracti128 xm3, m2, 1 | |
6238 | vextracti128 xm5, m4, 1 | |
6239 | movu [r2], xm0 | |
6240 | movu [r2 + r3], xm1 | |
6241 | movu [r2 + r3 * 2], xm2 | |
6242 | movu [r2 + r6], xm3 | |
6243 | lea r2, [r2 + r3 * 4] | |
6244 | movu [r2], xm4 | |
6245 | movu [r2 + r3], xm5 | |
6246 | ||
; Rows 13-14 supply the last tap pair for output rows 6 and 7.
6247 | movu xm13, [r0 + r1] ; m13 = row 13 | |
6248 | punpckhbw xm0, xm12, xm13 | |
6249 | punpcklbw xm12, xm13 | |
6250 | vinserti128 m12, m12, xm0, 1 | |
6251 | pmaddubsw m0, m12, [r5 + 3 * mmsize] | |
6252 | paddw m6, m0 | |
6253 | movu xm0, [r0 + r1 * 2] ; m0 = row 14 | |
6254 | punpckhbw xm1, xm13, xm0 | |
6255 | punpcklbw xm13, xm0 | |
6256 | vinserti128 m13, m13, xm1, 1 | |
6257 | pmaddubsw m1, m13, [r5 + 3 * mmsize] | |
6258 | paddw m7, m1 | |
6259 | ||
6260 | pmulhrsw m6, m14 ; m6 = word: row 6 | |
6261 | pmulhrsw m7, m14 ; m7 = word: row 7 | |
6262 | packuswb m6, m7 | |
6263 | vpermq m6, m6, 11011000b | |
6264 | vextracti128 xm7, m6, 1 | |
6265 | movu [r2 + r3 * 2], xm6 | |
6266 | movu [r2 + r6], xm7 | |
6267 | RET | |
6268 | %endif | |
6269 | ||
;-----------------------------------------------------------------------------
; interp_8tap_vert_pp_16x4 -- AVX2 8-tap vertical luma filter, pixel->pixel,
; for one 16-wide x 4-high block (x86-64 only; 13 ymm registers).
; Same scheme as the 16x8 variant above it in the file, sized down:
;   r0 = src - 3*srcStride, r1 = srcStride, r2 = dst, r3 = dstStride,
;   r4m = filter index (<<7 indexes tab_LumaCoeffVer_32), r4 reused = 3*r1,
;   r5 = coefficient pointer, r6 = 3*dstStride, m12 = pw_512 round constant.
; Rows 0..10 are loaded once; each adjacent pair is byte-interleaved so
; pmaddubsw applies two vertical taps per pass, accumulating into m0..m3
; (one 16-pixel output row each), then pmulhrsw(pw_512) == (sum + 32) >> 6
; and packuswb produce the final clamped bytes.
;-----------------------------------------------------------------------------
6270 | INIT_YMM avx2 | |
6271 | %if ARCH_X86_64 == 1 | |
6272 | cglobal interp_8tap_vert_pp_16x4, 4, 7, 13 | |
6273 | mov r4d, r4m | |
6274 | shl r4d, 7 | |
6275 | ||
6276 | %ifdef PIC | |
6277 | lea r5, [tab_LumaCoeffVer_32] | |
6278 | add r5, r4 | |
6279 | %else | |
6280 | lea r5, [tab_LumaCoeffVer_32 + r4] | |
6281 | %endif | |
6282 | ||
6283 | lea r4, [r1 * 3] | |
6284 | sub r0, r4 | |
6285 | lea r6, [r3 * 3] | |
6286 | mova m12, [pw_512] | |
6287 | ||
6288 | movu xm0, [r0] ; m0 = row 0 | |
6289 | movu xm1, [r0 + r1] ; m1 = row 1 | |
6290 | punpckhbw xm2, xm0, xm1 | |
6291 | punpcklbw xm0, xm1 | |
6292 | vinserti128 m0, m0, xm2, 1 | |
6293 | pmaddubsw m0, [r5] | |
6294 | movu xm2, [r0 + r1 * 2] ; m2 = row 2 | |
6295 | punpckhbw xm3, xm1, xm2 | |
6296 | punpcklbw xm1, xm2 | |
6297 | vinserti128 m1, m1, xm3, 1 | |
6298 | pmaddubsw m1, [r5] | |
6299 | movu xm3, [r0 + r4] ; m3 = row 3 | |
6300 | punpckhbw xm4, xm2, xm3 | |
6301 | punpcklbw xm2, xm3 | |
6302 | vinserti128 m2, m2, xm4, 1 | |
6303 | pmaddubsw m4, m2, [r5 + 1 * mmsize] | |
6304 | paddw m0, m4 | |
6305 | pmaddubsw m2, [r5] | |
6306 | lea r0, [r0 + r1 * 4] | |
6307 | movu xm4, [r0] ; m4 = row 4 | |
6308 | punpckhbw xm5, xm3, xm4 | |
6309 | punpcklbw xm3, xm4 | |
6310 | vinserti128 m3, m3, xm5, 1 | |
6311 | pmaddubsw m5, m3, [r5 + 1 * mmsize] | |
6312 | paddw m1, m5 | |
6313 | pmaddubsw m3, [r5] | |
6314 | movu xm5, [r0 + r1] ; m5 = row 5 | |
6315 | punpckhbw xm6, xm4, xm5 | |
6316 | punpcklbw xm4, xm5 | |
6317 | vinserti128 m4, m4, xm6, 1 | |
6318 | pmaddubsw m6, m4, [r5 + 2 * mmsize] | |
6319 | paddw m0, m6 | |
6320 | pmaddubsw m6, m4, [r5 + 1 * mmsize] | |
6321 | paddw m2, m6 | |
6322 | movu xm6, [r0 + r1 * 2] ; m6 = row 6 | |
6323 | punpckhbw xm7, xm5, xm6 | |
6324 | punpcklbw xm5, xm6 | |
6325 | vinserti128 m5, m5, xm7, 1 | |
6326 | pmaddubsw m7, m5, [r5 + 2 * mmsize] | |
6327 | paddw m1, m7 | |
6328 | pmaddubsw m7, m5, [r5 + 1 * mmsize] | |
6329 | paddw m3, m7 | |
6330 | movu xm7, [r0 + r4] ; m7 = row 7 | |
6331 | punpckhbw xm8, xm6, xm7 | |
6332 | punpcklbw xm6, xm7 | |
6333 | vinserti128 m6, m6, xm8, 1 | |
6334 | pmaddubsw m8, m6, [r5 + 3 * mmsize] | |
6335 | paddw m0, m8 | |
6336 | pmaddubsw m8, m6, [r5 + 2 * mmsize] | |
6337 | paddw m2, m8 | |
6338 | lea r0, [r0 + r1 * 4] | |
6339 | movu xm8, [r0] ; m8 = row 8 | |
6340 | punpckhbw xm9, xm7, xm8 | |
6341 | punpcklbw xm7, xm8 | |
6342 | vinserti128 m7, m7, xm9, 1 | |
6343 | pmaddubsw m9, m7, [r5 + 3 * mmsize] | |
6344 | paddw m1, m9 | |
6345 | pmaddubsw m9, m7, [r5 + 2 * mmsize] | |
6346 | paddw m3, m9 | |
6347 | movu xm9, [r0 + r1] ; m9 = row 9 | |
6348 | punpckhbw xm10, xm8, xm9 | |
6349 | punpcklbw xm8, xm9 | |
6350 | vinserti128 m8, m8, xm10, 1 | |
6351 | pmaddubsw m10, m8, [r5 + 3 * mmsize] | |
6352 | paddw m2, m10 | |
6353 | movu xm10, [r0 + r1 * 2] ; m10 = row 10 | |
6354 | punpckhbw xm11, xm9, xm10 | |
6355 | punpcklbw xm9, xm10 | |
6356 | vinserti128 m9, m9, xm11, 1 | |
6357 | pmaddubsw m11, m9, [r5 + 3 * mmsize] | |
6358 | paddw m3, m11 | |
6359 | ||
; Round, clamp, de-interleave lanes (vpermq 0xD8) and store 4 x 16 bytes.
6360 | pmulhrsw m0, m12 ; m0 = word: row 0 | |
6361 | pmulhrsw m1, m12 ; m1 = word: row 1 | |
6362 | pmulhrsw m2, m12 ; m2 = word: row 2 | |
6363 | pmulhrsw m3, m12 ; m3 = word: row 3 | |
6364 | packuswb m0, m1 | |
6365 | packuswb m2, m3 | |
6366 | vpermq m0, m0, 11011000b | |
6367 | vpermq m2, m2, 11011000b | |
6368 | vextracti128 xm1, m0, 1 | |
6369 | vextracti128 xm3, m2, 1 | |
6370 | movu [r2], xm0 | |
6371 | movu [r2 + r3], xm1 | |
6372 | movu [r2 + r3 * 2], xm2 | |
6373 | movu [r2 + r6], xm3 | |
6374 | RET | |
6375 | %endif | |
6376 | ||
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_16xN width, height
; Generates interp_8tap_vert_pp_%1x%2: AVX2 8-tap vertical luma filter,
; pixel->pixel, for 16-wide blocks whose height is a multiple of 16
; (r8d = %2/16 loop count).  Register roles (from the code below):
;   r0 = src - 3*srcStride, r1 = srcStride, r2 = dst, r3 = dstStride,
;   r4m = filter index (<<7 indexes tab_LumaCoeffVer_32), r4 reused = 3*r1,
;   r5 = coefficient pointer, r6 = 3*dstStride, r7 = 4*srcStride,
;   m14 = pw_512 rounding constant.
; Each .loop iteration produces 16 output rows: rows are loaded once,
; byte-interleaved pairwise, and pmaddubsw/paddw build 16 live word
; accumulators; pmulhrsw(pw_512) == (sum + 32) >> 6 then packuswb finishes
; them in batches of 6/2/8.  The five `lea r0,[r0+r1*4]` advances move src
; down 20 rows; `sub r0, r7` rewinds 4, netting exactly 16 rows/iteration.
;-----------------------------------------------------------------------------
6377 | %macro FILTER_VER_LUMA_AVX2_16xN 2 | |
6378 | INIT_YMM avx2 | |
6379 | %if ARCH_X86_64 == 1 | |
6380 | cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15 | |
6381 | mov r4d, r4m | |
6382 | shl r4d, 7 | |
6383 | ||
6384 | %ifdef PIC | |
6385 | lea r5, [tab_LumaCoeffVer_32] | |
6386 | add r5, r4 | |
6387 | %else | |
6388 | lea r5, [tab_LumaCoeffVer_32 + r4] | |
6389 | %endif | |
6390 | ||
6391 | lea r4, [r1 * 3] | |
6392 | sub r0, r4 | |
6393 | lea r6, [r3 * 3] | |
6394 | lea r7, [r1 * 4] | |
6395 | mova m14, [pw_512] | |
6396 | mov r8d, %2 / 16 | |
6397 | ||
6398 | .loop: | |
6399 | movu xm0, [r0] ; m0 = row 0 | |
6400 | movu xm1, [r0 + r1] ; m1 = row 1 | |
6401 | punpckhbw xm2, xm0, xm1 | |
6402 | punpcklbw xm0, xm1 | |
6403 | vinserti128 m0, m0, xm2, 1 | |
6404 | pmaddubsw m0, [r5] | |
6405 | movu xm2, [r0 + r1 * 2] ; m2 = row 2 | |
6406 | punpckhbw xm3, xm1, xm2 | |
6407 | punpcklbw xm1, xm2 | |
6408 | vinserti128 m1, m1, xm3, 1 | |
6409 | pmaddubsw m1, [r5] | |
6410 | movu xm3, [r0 + r4] ; m3 = row 3 | |
6411 | punpckhbw xm4, xm2, xm3 | |
6412 | punpcklbw xm2, xm3 | |
6413 | vinserti128 m2, m2, xm4, 1 | |
6414 | pmaddubsw m4, m2, [r5 + 1 * mmsize] | |
6415 | paddw m0, m4 | |
6416 | pmaddubsw m2, [r5] | |
6417 | lea r0, [r0 + r1 * 4] | |
6418 | movu xm4, [r0] ; m4 = row 4 | |
6419 | punpckhbw xm5, xm3, xm4 | |
6420 | punpcklbw xm3, xm4 | |
6421 | vinserti128 m3, m3, xm5, 1 | |
6422 | pmaddubsw m5, m3, [r5 + 1 * mmsize] | |
6423 | paddw m1, m5 | |
6424 | pmaddubsw m3, [r5] | |
6425 | movu xm5, [r0 + r1] ; m5 = row 5 | |
6426 | punpckhbw xm6, xm4, xm5 | |
6427 | punpcklbw xm4, xm5 | |
6428 | vinserti128 m4, m4, xm6, 1 | |
6429 | pmaddubsw m6, m4, [r5 + 2 * mmsize] | |
6430 | paddw m0, m6 | |
6431 | pmaddubsw m6, m4, [r5 + 1 * mmsize] | |
6432 | paddw m2, m6 | |
6433 | pmaddubsw m4, [r5] | |
6434 | movu xm6, [r0 + r1 * 2] ; m6 = row 6 | |
6435 | punpckhbw xm7, xm5, xm6 | |
6436 | punpcklbw xm5, xm6 | |
6437 | vinserti128 m5, m5, xm7, 1 | |
6438 | pmaddubsw m7, m5, [r5 + 2 * mmsize] | |
6439 | paddw m1, m7 | |
6440 | pmaddubsw m7, m5, [r5 + 1 * mmsize] | |
6441 | paddw m3, m7 | |
6442 | pmaddubsw m5, [r5] | |
6443 | movu xm7, [r0 + r4] ; m7 = row 7 | |
6444 | punpckhbw xm8, xm6, xm7 | |
6445 | punpcklbw xm6, xm7 | |
6446 | vinserti128 m6, m6, xm8, 1 | |
6447 | pmaddubsw m8, m6, [r5 + 3 * mmsize] | |
6448 | paddw m0, m8 | |
6449 | pmaddubsw m8, m6, [r5 + 2 * mmsize] | |
6450 | paddw m2, m8 | |
6451 | pmaddubsw m8, m6, [r5 + 1 * mmsize] | |
6452 | paddw m4, m8 | |
6453 | pmaddubsw m6, [r5] | |
6454 | lea r0, [r0 + r1 * 4] | |
6455 | movu xm8, [r0] ; m8 = row 8 | |
6456 | punpckhbw xm9, xm7, xm8 | |
6457 | punpcklbw xm7, xm8 | |
6458 | vinserti128 m7, m7, xm9, 1 | |
6459 | pmaddubsw m9, m7, [r5 + 3 * mmsize] | |
6460 | paddw m1, m9 | |
6461 | pmaddubsw m9, m7, [r5 + 2 * mmsize] | |
6462 | paddw m3, m9 | |
6463 | pmaddubsw m9, m7, [r5 + 1 * mmsize] | |
6464 | paddw m5, m9 | |
6465 | pmaddubsw m7, [r5] | |
6466 | movu xm9, [r0 + r1] ; m9 = row 9 | |
6467 | punpckhbw xm10, xm8, xm9 | |
6468 | punpcklbw xm8, xm9 | |
6469 | vinserti128 m8, m8, xm10, 1 | |
6470 | pmaddubsw m10, m8, [r5 + 3 * mmsize] | |
6471 | paddw m2, m10 | |
6472 | pmaddubsw m10, m8, [r5 + 2 * mmsize] | |
6473 | paddw m4, m10 | |
6474 | pmaddubsw m10, m8, [r5 + 1 * mmsize] | |
6475 | paddw m6, m10 | |
6476 | pmaddubsw m8, [r5] | |
6477 | movu xm10, [r0 + r1 * 2] ; m10 = row 10 | |
6478 | punpckhbw xm11, xm9, xm10 | |
6479 | punpcklbw xm9, xm10 | |
6480 | vinserti128 m9, m9, xm11, 1 | |
6481 | pmaddubsw m11, m9, [r5 + 3 * mmsize] | |
6482 | paddw m3, m11 | |
6483 | pmaddubsw m11, m9, [r5 + 2 * mmsize] | |
6484 | paddw m5, m11 | |
6485 | pmaddubsw m11, m9, [r5 + 1 * mmsize] | |
6486 | paddw m7, m11 | |
6487 | pmaddubsw m9, [r5] | |
6488 | movu xm11, [r0 + r4] ; m11 = row 11 | |
6489 | punpckhbw xm12, xm10, xm11 | |
6490 | punpcklbw xm10, xm11 | |
6491 | vinserti128 m10, m10, xm12, 1 | |
6492 | pmaddubsw m12, m10, [r5 + 3 * mmsize] | |
6493 | paddw m4, m12 | |
6494 | pmaddubsw m12, m10, [r5 + 2 * mmsize] | |
6495 | paddw m6, m12 | |
6496 | pmaddubsw m12, m10, [r5 + 1 * mmsize] | |
6497 | paddw m8, m12 | |
6498 | pmaddubsw m10, [r5] | |
6499 | lea r0, [r0 + r1 * 4] | |
6500 | movu xm12, [r0] ; m12 = row 12 | |
6501 | punpckhbw xm13, xm11, xm12 | |
6502 | punpcklbw xm11, xm12 | |
6503 | vinserti128 m11, m11, xm13, 1 | |
6504 | pmaddubsw m13, m11, [r5 + 3 * mmsize] | |
6505 | paddw m5, m13 | |
6506 | pmaddubsw m13, m11, [r5 + 2 * mmsize] | |
6507 | paddw m7, m13 | |
6508 | pmaddubsw m13, m11, [r5 + 1 * mmsize] | |
6509 | paddw m9, m13 | |
6510 | pmaddubsw m11, [r5] | |
6511 | ||
; Output rows 0-5 complete: round, clamp and store to free accumulators.
6512 | pmulhrsw m0, m14 ; m0 = word: row 0 | |
6513 | pmulhrsw m1, m14 ; m1 = word: row 1 | |
6514 | pmulhrsw m2, m14 ; m2 = word: row 2 | |
6515 | pmulhrsw m3, m14 ; m3 = word: row 3 | |
6516 | pmulhrsw m4, m14 ; m4 = word: row 4 | |
6517 | pmulhrsw m5, m14 ; m5 = word: row 5 | |
6518 | packuswb m0, m1 | |
6519 | packuswb m2, m3 | |
6520 | packuswb m4, m5 | |
6521 | vpermq m0, m0, 11011000b | |
6522 | vpermq m2, m2, 11011000b | |
6523 | vpermq m4, m4, 11011000b | |
6524 | vextracti128 xm1, m0, 1 | |
6525 | vextracti128 xm3, m2, 1 | |
6526 | vextracti128 xm5, m4, 1 | |
6527 | movu [r2], xm0 | |
6528 | movu [r2 + r3], xm1 | |
6529 | movu [r2 + r3 * 2], xm2 | |
6530 | movu [r2 + r6], xm3 | |
6531 | lea r2, [r2 + r3 * 4] | |
6532 | movu [r2], xm4 | |
6533 | movu [r2 + r3], xm5 | |
6534 | ||
6535 | movu xm13, [r0 + r1] ; m13 = row 13 | |
6536 | punpckhbw xm0, xm12, xm13 | |
6537 | punpcklbw xm12, xm13 | |
6538 | vinserti128 m12, m12, xm0, 1 | |
6539 | pmaddubsw m0, m12, [r5 + 3 * mmsize] | |
6540 | paddw m6, m0 | |
6541 | pmaddubsw m0, m12, [r5 + 2 * mmsize] | |
6542 | paddw m8, m0 | |
6543 | pmaddubsw m0, m12, [r5 + 1 * mmsize] | |
6544 | paddw m10, m0 | |
6545 | pmaddubsw m12, [r5] | |
6546 | movu xm0, [r0 + r1 * 2] ; m0 = row 14 | |
6547 | punpckhbw xm1, xm13, xm0 | |
6548 | punpcklbw xm13, xm0 | |
6549 | vinserti128 m13, m13, xm1, 1 | |
6550 | pmaddubsw m1, m13, [r5 + 3 * mmsize] | |
6551 | paddw m7, m1 | |
6552 | pmaddubsw m1, m13, [r5 + 2 * mmsize] | |
6553 | paddw m9, m1 | |
6554 | pmaddubsw m1, m13, [r5 + 1 * mmsize] | |
6555 | paddw m11, m1 | |
6556 | pmaddubsw m13, [r5] | |
6557 | ||
6558 | pmulhrsw m6, m14 ; m6 = word: row 6 | |
6559 | pmulhrsw m7, m14 ; m7 = word: row 7 | |
6560 | packuswb m6, m7 | |
6561 | vpermq m6, m6, 11011000b | |
6562 | vextracti128 xm7, m6, 1 | |
6563 | movu [r2 + r3 * 2], xm6 | |
6564 | movu [r2 + r6], xm7 | |
6565 | lea r2, [r2 + r3 * 4] | |
6566 | ||
; Rows 15-22 finish accumulators for output rows 8-15 (registers freed by
; the stores above are recycled as new accumulators m0/m1 for rows 14-15).
6567 | movu xm1, [r0 + r4] ; m1 = row 15 | |
6568 | punpckhbw xm2, xm0, xm1 | |
6569 | punpcklbw xm0, xm1 | |
6570 | vinserti128 m0, m0, xm2, 1 | |
6571 | pmaddubsw m2, m0, [r5 + 3 * mmsize] | |
6572 | paddw m8, m2 | |
6573 | pmaddubsw m2, m0, [r5 + 2 * mmsize] | |
6574 | paddw m10, m2 | |
6575 | pmaddubsw m2, m0, [r5 + 1 * mmsize] | |
6576 | paddw m12, m2 | |
6577 | pmaddubsw m0, [r5] | |
6578 | lea r0, [r0 + r1 * 4] | |
6579 | movu xm2, [r0] ; m2 = row 16 | |
6580 | punpckhbw xm3, xm1, xm2 | |
6581 | punpcklbw xm1, xm2 | |
6582 | vinserti128 m1, m1, xm3, 1 | |
6583 | pmaddubsw m3, m1, [r5 + 3 * mmsize] | |
6584 | paddw m9, m3 | |
6585 | pmaddubsw m3, m1, [r5 + 2 * mmsize] | |
6586 | paddw m11, m3 | |
6587 | pmaddubsw m3, m1, [r5 + 1 * mmsize] | |
6588 | paddw m13, m3 | |
6589 | pmaddubsw m1, [r5] | |
6590 | movu xm3, [r0 + r1] ; m3 = row 17 | |
6591 | punpckhbw xm4, xm2, xm3 | |
6592 | punpcklbw xm2, xm3 | |
6593 | vinserti128 m2, m2, xm4, 1 | |
6594 | pmaddubsw m4, m2, [r5 + 3 * mmsize] | |
6595 | paddw m10, m4 | |
6596 | pmaddubsw m4, m2, [r5 + 2 * mmsize] | |
6597 | paddw m12, m4 | |
6598 | pmaddubsw m2, [r5 + 1 * mmsize] | |
6599 | paddw m0, m2 | |
6600 | movu xm4, [r0 + r1 * 2] ; m4 = row 18 | |
6601 | punpckhbw xm5, xm3, xm4 | |
6602 | punpcklbw xm3, xm4 | |
6603 | vinserti128 m3, m3, xm5, 1 | |
6604 | pmaddubsw m5, m3, [r5 + 3 * mmsize] | |
6605 | paddw m11, m5 | |
6606 | pmaddubsw m5, m3, [r5 + 2 * mmsize] | |
6607 | paddw m13, m5 | |
6608 | pmaddubsw m3, [r5 + 1 * mmsize] | |
6609 | paddw m1, m3 | |
6610 | movu xm5, [r0 + r4] ; m5 = row 19 | |
6611 | punpckhbw xm6, xm4, xm5 | |
6612 | punpcklbw xm4, xm5 | |
6613 | vinserti128 m4, m4, xm6, 1 | |
6614 | pmaddubsw m6, m4, [r5 + 3 * mmsize] | |
6615 | paddw m12, m6 | |
6616 | pmaddubsw m4, [r5 + 2 * mmsize] | |
6617 | paddw m0, m4 | |
6618 | lea r0, [r0 + r1 * 4] | |
6619 | movu xm6, [r0] ; m6 = row 20 | |
6620 | punpckhbw xm7, xm5, xm6 | |
6621 | punpcklbw xm5, xm6 | |
6622 | vinserti128 m5, m5, xm7, 1 | |
6623 | pmaddubsw m7, m5, [r5 + 3 * mmsize] | |
6624 | paddw m13, m7 | |
6625 | pmaddubsw m5, [r5 + 2 * mmsize] | |
6626 | paddw m1, m5 | |
6627 | movu xm7, [r0 + r1] ; m7 = row 21 | |
6628 | punpckhbw xm2, xm6, xm7 | |
6629 | punpcklbw xm6, xm7 | |
6630 | vinserti128 m6, m6, xm2, 1 | |
6631 | pmaddubsw m6, [r5 + 3 * mmsize] | |
6632 | paddw m0, m6 | |
6633 | movu xm2, [r0 + r1 * 2] ; m2 = row 22 | |
6634 | punpckhbw xm3, xm7, xm2 | |
6635 | punpcklbw xm7, xm2 | |
6636 | vinserti128 m7, m7, xm3, 1 | |
6637 | pmaddubsw m7, [r5 + 3 * mmsize] | |
6638 | paddw m1, m7 | |
6639 | ||
; Output rows 8-15 complete: round, clamp, store, then step src back 4 rows
; (net +16) and loop for the next 16-row band.
6640 | pmulhrsw m8, m14 ; m8 = word: row 8 | |
6641 | pmulhrsw m9, m14 ; m9 = word: row 9 | |
6642 | pmulhrsw m10, m14 ; m10 = word: row 10 | |
6643 | pmulhrsw m11, m14 ; m11 = word: row 11 | |
6644 | pmulhrsw m12, m14 ; m12 = word: row 12 | |
6645 | pmulhrsw m13, m14 ; m13 = word: row 13 | |
6646 | pmulhrsw m0, m14 ; m0 = word: row 14 | |
6647 | pmulhrsw m1, m14 ; m1 = word: row 15 | |
6648 | packuswb m8, m9 | |
6649 | packuswb m10, m11 | |
6650 | packuswb m12, m13 | |
6651 | packuswb m0, m1 | |
6652 | vpermq m8, m8, 11011000b | |
6653 | vpermq m10, m10, 11011000b | |
6654 | vpermq m12, m12, 11011000b | |
6655 | vpermq m0, m0, 11011000b | |
6656 | vextracti128 xm9, m8, 1 | |
6657 | vextracti128 xm11, m10, 1 | |
6658 | vextracti128 xm13, m12, 1 | |
6659 | vextracti128 xm1, m0, 1 | |
6660 | movu [r2], xm8 | |
6661 | movu [r2 + r3], xm9 | |
6662 | movu [r2 + r3 * 2], xm10 | |
6663 | movu [r2 + r6], xm11 | |
6664 | lea r2, [r2 + r3 * 4] | |
6665 | movu [r2], xm12 | |
6666 | movu [r2 + r3], xm13 | |
6667 | movu [r2 + r3 * 2], xm0 | |
6668 | movu [r2 + r6], xm1 | |
6669 | lea r2, [r2 + r3 * 4] | |
6670 | sub r0, r7 | |
6671 | dec r8d | |
6672 | jnz .loop | |
6673 | RET | |
6674 | %endif | |
6675 | %endmacro | |
6676 | ||
; Instantiate the looped 16-wide vertical luma filter for heights 32 and 64
; (height must be a multiple of 16; each invocation emits one cglobal).
6677 | FILTER_VER_LUMA_AVX2_16xN 16, 32 | |
6678 | FILTER_VER_LUMA_AVX2_16xN 16, 64 | |
6679 | ||
;-----------------------------------------------------------------------------
; PROCESS_LUMA_AVX2_W16_16R -- emit the body that 8-tap vertical-filters one
; 16-wide x 16-row column (pixel->pixel).  Unlike the standalone 16xN
; functions above, it advances cursor registers r7 (src) and r8 (dst) and
; leaves r0/r2 untouched, so a caller can step r0/r2 sideways and reuse it
; per 16-pixel column.  Expected on entry (from the code below):
;   r0 = src - 3*srcStride, r1 = srcStride, r4 = 3*r1,
;   r5 = coefficient pointer (4 ymm tap-pair rows, [r5 + k*mmsize]),
;   r2 = dst, r3 = dstStride, r6 = 3*r3, m14 = pw_512 rounding constant.
; Clobbers m0-m13 and r7/r8.  Scheme: byte-interleave adjacent source rows,
; pmaddubsw two taps at a time into 16 word accumulators, finish with
; pmulhrsw(pw_512) == (sum + 32) >> 6, packuswb, vpermq 0xD8, 16-byte store.
;-----------------------------------------------------------------------------
6680 | %macro PROCESS_LUMA_AVX2_W16_16R 0 | |
6681 | movu xm0, [r0] ; m0 = row 0 | |
6682 | movu xm1, [r0 + r1] ; m1 = row 1 | |
6683 | punpckhbw xm2, xm0, xm1 | |
6684 | punpcklbw xm0, xm1 | |
6685 | vinserti128 m0, m0, xm2, 1 | |
6686 | pmaddubsw m0, [r5] | |
6687 | movu xm2, [r0 + r1 * 2] ; m2 = row 2 | |
6688 | punpckhbw xm3, xm1, xm2 | |
6689 | punpcklbw xm1, xm2 | |
6690 | vinserti128 m1, m1, xm3, 1 | |
6691 | pmaddubsw m1, [r5] | |
6692 | movu xm3, [r0 + r4] ; m3 = row 3 | |
6693 | punpckhbw xm4, xm2, xm3 | |
6694 | punpcklbw xm2, xm3 | |
6695 | vinserti128 m2, m2, xm4, 1 | |
6696 | pmaddubsw m4, m2, [r5 + 1 * mmsize] | |
6697 | paddw m0, m4 | |
6698 | pmaddubsw m2, [r5] | |
6699 | lea r7, [r0 + r1 * 4] | |
6700 | movu xm4, [r7] ; m4 = row 4 | |
6701 | punpckhbw xm5, xm3, xm4 | |
6702 | punpcklbw xm3, xm4 | |
6703 | vinserti128 m3, m3, xm5, 1 | |
6704 | pmaddubsw m5, m3, [r5 + 1 * mmsize] | |
6705 | paddw m1, m5 | |
6706 | pmaddubsw m3, [r5] | |
6707 | movu xm5, [r7 + r1] ; m5 = row 5 | |
6708 | punpckhbw xm6, xm4, xm5 | |
6709 | punpcklbw xm4, xm5 | |
6710 | vinserti128 m4, m4, xm6, 1 | |
6711 | pmaddubsw m6, m4, [r5 + 2 * mmsize] | |
6712 | paddw m0, m6 | |
6713 | pmaddubsw m6, m4, [r5 + 1 * mmsize] | |
6714 | paddw m2, m6 | |
6715 | pmaddubsw m4, [r5] | |
6716 | movu xm6, [r7 + r1 * 2] ; m6 = row 6 | |
6717 | punpckhbw xm7, xm5, xm6 | |
6718 | punpcklbw xm5, xm6 | |
6719 | vinserti128 m5, m5, xm7, 1 | |
6720 | pmaddubsw m7, m5, [r5 + 2 * mmsize] | |
6721 | paddw m1, m7 | |
6722 | pmaddubsw m7, m5, [r5 + 1 * mmsize] | |
6723 | paddw m3, m7 | |
6724 | pmaddubsw m5, [r5] | |
6725 | movu xm7, [r7 + r4] ; m7 = row 7 | |
6726 | punpckhbw xm8, xm6, xm7 | |
6727 | punpcklbw xm6, xm7 | |
6728 | vinserti128 m6, m6, xm8, 1 | |
6729 | pmaddubsw m8, m6, [r5 + 3 * mmsize] | |
6730 | paddw m0, m8 | |
6731 | pmaddubsw m8, m6, [r5 + 2 * mmsize] | |
6732 | paddw m2, m8 | |
6733 | pmaddubsw m8, m6, [r5 + 1 * mmsize] | |
6734 | paddw m4, m8 | |
6735 | pmaddubsw m6, [r5] | |
6736 | lea r7, [r7 + r1 * 4] | |
6737 | movu xm8, [r7] ; m8 = row 8 | |
6738 | punpckhbw xm9, xm7, xm8 | |
6739 | punpcklbw xm7, xm8 | |
6740 | vinserti128 m7, m7, xm9, 1 | |
6741 | pmaddubsw m9, m7, [r5 + 3 * mmsize] | |
6742 | paddw m1, m9 | |
6743 | pmaddubsw m9, m7, [r5 + 2 * mmsize] | |
6744 | paddw m3, m9 | |
6745 | pmaddubsw m9, m7, [r5 + 1 * mmsize] | |
6746 | paddw m5, m9 | |
6747 | pmaddubsw m7, [r5] | |
6748 | movu xm9, [r7 + r1] ; m9 = row 9 | |
6749 | punpckhbw xm10, xm8, xm9 | |
6750 | punpcklbw xm8, xm9 | |
6751 | vinserti128 m8, m8, xm10, 1 | |
6752 | pmaddubsw m10, m8, [r5 + 3 * mmsize] | |
6753 | paddw m2, m10 | |
6754 | pmaddubsw m10, m8, [r5 + 2 * mmsize] | |
6755 | paddw m4, m10 | |
6756 | pmaddubsw m10, m8, [r5 + 1 * mmsize] | |
6757 | paddw m6, m10 | |
6758 | pmaddubsw m8, [r5] | |
6759 | movu xm10, [r7 + r1 * 2] ; m10 = row 10 | |
6760 | punpckhbw xm11, xm9, xm10 | |
6761 | punpcklbw xm9, xm10 | |
6762 | vinserti128 m9, m9, xm11, 1 | |
6763 | pmaddubsw m11, m9, [r5 + 3 * mmsize] | |
6764 | paddw m3, m11 | |
6765 | pmaddubsw m11, m9, [r5 + 2 * mmsize] | |
6766 | paddw m5, m11 | |
6767 | pmaddubsw m11, m9, [r5 + 1 * mmsize] | |
6768 | paddw m7, m11 | |
6769 | pmaddubsw m9, [r5] | |
6770 | movu xm11, [r7 + r4] ; m11 = row 11 | |
6771 | punpckhbw xm12, xm10, xm11 | |
6772 | punpcklbw xm10, xm11 | |
6773 | vinserti128 m10, m10, xm12, 1 | |
6774 | pmaddubsw m12, m10, [r5 + 3 * mmsize] | |
6775 | paddw m4, m12 | |
6776 | pmaddubsw m12, m10, [r5 + 2 * mmsize] | |
6777 | paddw m6, m12 | |
6778 | pmaddubsw m12, m10, [r5 + 1 * mmsize] | |
6779 | paddw m8, m12 | |
6780 | pmaddubsw m10, [r5] | |
6781 | lea r7, [r7 + r1 * 4] | |
6782 | movu xm12, [r7] ; m12 = row 12 | |
6783 | punpckhbw xm13, xm11, xm12 | |
6784 | punpcklbw xm11, xm12 | |
6785 | vinserti128 m11, m11, xm13, 1 | |
6786 | pmaddubsw m13, m11, [r5 + 3 * mmsize] | |
6787 | paddw m5, m13 | |
6788 | pmaddubsw m13, m11, [r5 + 2 * mmsize] | |
6789 | paddw m7, m13 | |
6790 | pmaddubsw m13, m11, [r5 + 1 * mmsize] | |
6791 | paddw m9, m13 | |
6792 | pmaddubsw m11, [r5] | |
6793 | ||
; Output rows 0-5 complete: round/clamp/store; r8 takes over as dst cursor.
6794 | pmulhrsw m0, m14 ; m0 = word: row 0 | |
6795 | pmulhrsw m1, m14 ; m1 = word: row 1 | |
6796 | pmulhrsw m2, m14 ; m2 = word: row 2 | |
6797 | pmulhrsw m3, m14 ; m3 = word: row 3 | |
6798 | pmulhrsw m4, m14 ; m4 = word: row 4 | |
6799 | pmulhrsw m5, m14 ; m5 = word: row 5 | |
6800 | packuswb m0, m1 | |
6801 | packuswb m2, m3 | |
6802 | packuswb m4, m5 | |
6803 | vpermq m0, m0, 11011000b | |
6804 | vpermq m2, m2, 11011000b | |
6805 | vpermq m4, m4, 11011000b | |
6806 | vextracti128 xm1, m0, 1 | |
6807 | vextracti128 xm3, m2, 1 | |
6808 | vextracti128 xm5, m4, 1 | |
6809 | movu [r2], xm0 | |
6810 | movu [r2 + r3], xm1 | |
6811 | movu [r2 + r3 * 2], xm2 | |
6812 | movu [r2 + r6], xm3 | |
6813 | lea r8, [r2 + r3 * 4] | |
6814 | movu [r8], xm4 | |
6815 | movu [r8 + r3], xm5 | |
6816 | ||
6817 | movu xm13, [r7 + r1] ; m13 = row 13 | |
6818 | punpckhbw xm0, xm12, xm13 | |
6819 | punpcklbw xm12, xm13 | |
6820 | vinserti128 m12, m12, xm0, 1 | |
6821 | pmaddubsw m0, m12, [r5 + 3 * mmsize] | |
6822 | paddw m6, m0 | |
6823 | pmaddubsw m0, m12, [r5 + 2 * mmsize] | |
6824 | paddw m8, m0 | |
6825 | pmaddubsw m0, m12, [r5 + 1 * mmsize] | |
6826 | paddw m10, m0 | |
6827 | pmaddubsw m12, [r5] | |
6828 | movu xm0, [r7 + r1 * 2] ; m0 = row 14 | |
6829 | punpckhbw xm1, xm13, xm0 | |
6830 | punpcklbw xm13, xm0 | |
6831 | vinserti128 m13, m13, xm1, 1 | |
6832 | pmaddubsw m1, m13, [r5 + 3 * mmsize] | |
6833 | paddw m7, m1 | |
6834 | pmaddubsw m1, m13, [r5 + 2 * mmsize] | |
6835 | paddw m9, m1 | |
6836 | pmaddubsw m1, m13, [r5 + 1 * mmsize] | |
6837 | paddw m11, m1 | |
6838 | pmaddubsw m13, [r5] | |
6839 | ||
6840 | pmulhrsw m6, m14 ; m6 = word: row 6 | |
6841 | pmulhrsw m7, m14 ; m7 = word: row 7 | |
6842 | packuswb m6, m7 | |
6843 | vpermq m6, m6, 11011000b | |
6844 | vextracti128 xm7, m6, 1 | |
6845 | movu [r8 + r3 * 2], xm6 | |
6846 | movu [r8 + r6], xm7 | |
6847 | lea r8, [r8 + r3 * 4] | |
6848 | ||
; Rows 15-22 finish accumulators for output rows 8-15 (freed registers are
; recycled as new accumulators m0/m1 for rows 14-15).
6849 | movu xm1, [r7 + r4] ; m1 = row 15 | |
6850 | punpckhbw xm2, xm0, xm1 | |
6851 | punpcklbw xm0, xm1 | |
6852 | vinserti128 m0, m0, xm2, 1 | |
6853 | pmaddubsw m2, m0, [r5 + 3 * mmsize] | |
6854 | paddw m8, m2 | |
6855 | pmaddubsw m2, m0, [r5 + 2 * mmsize] | |
6856 | paddw m10, m2 | |
6857 | pmaddubsw m2, m0, [r5 + 1 * mmsize] | |
6858 | paddw m12, m2 | |
6859 | pmaddubsw m0, [r5] | |
6860 | lea r7, [r7 + r1 * 4] | |
6861 | movu xm2, [r7] ; m2 = row 16 | |
6862 | punpckhbw xm3, xm1, xm2 | |
6863 | punpcklbw xm1, xm2 | |
6864 | vinserti128 m1, m1, xm3, 1 | |
6865 | pmaddubsw m3, m1, [r5 + 3 * mmsize] | |
6866 | paddw m9, m3 | |
6867 | pmaddubsw m3, m1, [r5 + 2 * mmsize] | |
6868 | paddw m11, m3 | |
6869 | pmaddubsw m3, m1, [r5 + 1 * mmsize] | |
6870 | paddw m13, m3 | |
6871 | pmaddubsw m1, [r5] | |
6872 | movu xm3, [r7 + r1] ; m3 = row 17 | |
6873 | punpckhbw xm4, xm2, xm3 | |
6874 | punpcklbw xm2, xm3 | |
6875 | vinserti128 m2, m2, xm4, 1 | |
6876 | pmaddubsw m4, m2, [r5 + 3 * mmsize] | |
6877 | paddw m10, m4 | |
6878 | pmaddubsw m4, m2, [r5 + 2 * mmsize] | |
6879 | paddw m12, m4 | |
6880 | pmaddubsw m2, [r5 + 1 * mmsize] | |
6881 | paddw m0, m2 | |
6882 | movu xm4, [r7 + r1 * 2] ; m4 = row 18 | |
6883 | punpckhbw xm5, xm3, xm4 | |
6884 | punpcklbw xm3, xm4 | |
6885 | vinserti128 m3, m3, xm5, 1 | |
6886 | pmaddubsw m5, m3, [r5 + 3 * mmsize] | |
6887 | paddw m11, m5 | |
6888 | pmaddubsw m5, m3, [r5 + 2 * mmsize] | |
6889 | paddw m13, m5 | |
6890 | pmaddubsw m3, [r5 + 1 * mmsize] | |
6891 | paddw m1, m3 | |
6892 | movu xm5, [r7 + r4] ; m5 = row 19 | |
6893 | punpckhbw xm6, xm4, xm5 | |
6894 | punpcklbw xm4, xm5 | |
6895 | vinserti128 m4, m4, xm6, 1 | |
6896 | pmaddubsw m6, m4, [r5 + 3 * mmsize] | |
6897 | paddw m12, m6 | |
6898 | pmaddubsw m4, [r5 + 2 * mmsize] | |
6899 | paddw m0, m4 | |
6900 | lea r7, [r7 + r1 * 4] | |
6901 | movu xm6, [r7] ; m6 = row 20 | |
6902 | punpckhbw xm7, xm5, xm6 | |
6903 | punpcklbw xm5, xm6 | |
6904 | vinserti128 m5, m5, xm7, 1 | |
6905 | pmaddubsw m7, m5, [r5 + 3 * mmsize] | |
6906 | paddw m13, m7 | |
6907 | pmaddubsw m5, [r5 + 2 * mmsize] | |
6908 | paddw m1, m5 | |
6909 | movu xm7, [r7 + r1] ; m7 = row 21 | |
6910 | punpckhbw xm2, xm6, xm7 | |
6911 | punpcklbw xm6, xm7 | |
6912 | vinserti128 m6, m6, xm2, 1 | |
6913 | pmaddubsw m6, [r5 + 3 * mmsize] | |
6914 | paddw m0, m6 | |
6915 | movu xm2, [r7 + r1 * 2] ; m2 = row 22 | |
6916 | punpckhbw xm3, xm7, xm2 | |
6917 | punpcklbw xm7, xm2 | |
6918 | vinserti128 m7, m7, xm3, 1 | |
6919 | pmaddubsw m7, [r5 + 3 * mmsize] | |
6920 | paddw m1, m7 | |
6921 | ||
; Output rows 8-15 complete: round, clamp and store through the r8 cursor.
6922 | pmulhrsw m8, m14 ; m8 = word: row 8 | |
6923 | pmulhrsw m9, m14 ; m9 = word: row 9 | |
6924 | pmulhrsw m10, m14 ; m10 = word: row 10 | |
6925 | pmulhrsw m11, m14 ; m11 = word: row 11 | |
6926 | pmulhrsw m12, m14 ; m12 = word: row 12 | |
6927 | pmulhrsw m13, m14 ; m13 = word: row 13 | |
6928 | pmulhrsw m0, m14 ; m0 = word: row 14 | |
6929 | pmulhrsw m1, m14 ; m1 = word: row 15 | |
6930 | packuswb m8, m9 | |
6931 | packuswb m10, m11 | |
6932 | packuswb m12, m13 | |
6933 | packuswb m0, m1 | |
6934 | vpermq m8, m8, 11011000b | |
6935 | vpermq m10, m10, 11011000b | |
6936 | vpermq m12, m12, 11011000b | |
6937 | vpermq m0, m0, 11011000b | |
6938 | vextracti128 xm9, m8, 1 | |
6939 | vextracti128 xm11, m10, 1 | |
6940 | vextracti128 xm13, m12, 1 | |
6941 | vextracti128 xm1, m0, 1 | |
6942 | movu [r8], xm8 | |
6943 | movu [r8 + r3], xm9 | |
6944 | movu [r8 + r3 * 2], xm10 | |
6945 | movu [r8 + r6], xm11 | |
6946 | lea r8, [r8 + r3 * 4] | |
6947 | movu [r8], xm12 | |
6948 | movu [r8 + r3], xm13 | |
6949 | movu [r8 + r3 * 2], xm0 | |
6950 | movu [r8 + r6], xm1 | |
6951 | %endmacro | |
6952 | ||
;-----------------------------------------------------------------------------
; PROCESS_LUMA_AVX2_W16_8R: 8-tap vertical luma interpolation (pp), one
; 16-column x 8-row tile.
; In:   r0 = src (points 3 rows above first output row), r1 = srcStride,
;       r2 = dst, r3 = dstStride, r4 = 3*srcStride, r5 = 4x32B coeff rows,
;       r6 = 3*dstStride, m14 = pw_512 rounding constant.
; Out:  8 rows of 16 pixels stored at r2; r7 left pointing into src (used by
;       callers to advance), r8 left at r2 + 4*dstStride.
; Clobbers: r7, r8, m0-m13, flags.
;-----------------------------------------------------------------------------
%macro PROCESS_LUMA_AVX2_W16_8R 0
    ; interleave consecutive source rows (low/high 8 columns into the two
    ; lanes of a ymm) and accumulate pmaddubsw taps per output row
    movu            xm0, [r0]                       ; m0 = row 0
    movu            xm1, [r0 + r1]                  ; m1 = row 1
    punpckhbw       xm2, xm0, xm1
    punpcklbw       xm0, xm1
    vinserti128     m0, m0, xm2, 1
    pmaddubsw       m0, [r5]
    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
    punpckhbw       xm3, xm1, xm2
    punpcklbw       xm1, xm2
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m1, [r5]
    movu            xm3, [r0 + r4]                  ; m3 = row 3
    punpckhbw       xm4, xm2, xm3
    punpcklbw       xm2, xm3
    vinserti128     m2, m2, xm4, 1
    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
    paddw           m0, m4
    pmaddubsw       m2, [r5]
    lea             r7, [r0 + r1 * 4]
    movu            xm4, [r7]                       ; m4 = row 4
    punpckhbw       xm5, xm3, xm4
    punpcklbw       xm3, xm4
    vinserti128     m3, m3, xm5, 1
    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
    paddw           m1, m5
    pmaddubsw       m3, [r5]
    movu            xm5, [r7 + r1]                  ; m5 = row 5
    punpckhbw       xm6, xm4, xm5
    punpcklbw       xm4, xm5
    vinserti128     m4, m4, xm6, 1
    pmaddubsw       m6, m4, [r5 + 2 * mmsize]
    paddw           m0, m6
    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
    paddw           m2, m6
    pmaddubsw       m4, [r5]
    movu            xm6, [r7 + r1 * 2]              ; m6 = row 6
    punpckhbw       xm7, xm5, xm6
    punpcklbw       xm5, xm6
    vinserti128     m5, m5, xm7, 1
    pmaddubsw       m7, m5, [r5 + 2 * mmsize]
    paddw           m1, m7
    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
    paddw           m3, m7
    pmaddubsw       m5, [r5]
    movu            xm7, [r7 + r4]                  ; m7 = row 7
    punpckhbw       xm8, xm6, xm7
    punpcklbw       xm6, xm7
    vinserti128     m6, m6, xm8, 1
    pmaddubsw       m8, m6, [r5 + 3 * mmsize]
    paddw           m0, m8
    pmaddubsw       m8, m6, [r5 + 2 * mmsize]
    paddw           m2, m8
    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
    paddw           m4, m8
    pmaddubsw       m6, [r5]
    lea             r7, [r7 + r1 * 4]
    movu            xm8, [r7]                       ; m8 = row 8
    punpckhbw       xm9, xm7, xm8
    punpcklbw       xm7, xm8
    vinserti128     m7, m7, xm9, 1
    pmaddubsw       m9, m7, [r5 + 3 * mmsize]
    paddw           m1, m9
    pmaddubsw       m9, m7, [r5 + 2 * mmsize]
    paddw           m3, m9
    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
    paddw           m5, m9
    pmaddubsw       m7, [r5]
    movu            xm9, [r7 + r1]                  ; m9 = row 9
    punpckhbw       xm10, xm8, xm9
    punpcklbw       xm8, xm9
    vinserti128     m8, m8, xm10, 1
    pmaddubsw       m10, m8, [r5 + 3 * mmsize]
    paddw           m2, m10
    pmaddubsw       m10, m8, [r5 + 2 * mmsize]
    paddw           m4, m10
    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
    paddw           m6, m10
    movu            xm10, [r7 + r1 * 2]             ; m10 = row 10
    punpckhbw       xm11, xm9, xm10
    punpcklbw       xm9, xm10
    vinserti128     m9, m9, xm11, 1
    pmaddubsw       m11, m9, [r5 + 3 * mmsize]
    paddw           m3, m11
    pmaddubsw       m11, m9, [r5 + 2 * mmsize]
    paddw           m5, m11
    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
    paddw           m7, m11
    movu            xm11, [r7 + r4]                 ; m11 = row 11
    punpckhbw       xm12, xm10, xm11
    punpcklbw       xm10, xm11
    vinserti128     m10, m10, xm12, 1
    pmaddubsw       m12, m10, [r5 + 3 * mmsize]
    paddw           m4, m12
    pmaddubsw       m12, m10, [r5 + 2 * mmsize]
    paddw           m6, m12
    lea             r7, [r7 + r1 * 4]
    movu            xm12, [r7]                      ; m12 = row 12
    punpckhbw       xm13, xm11, xm12
    punpcklbw       xm11, xm12
    vinserti128     m11, m11, xm13, 1
    pmaddubsw       m13, m11, [r5 + 3 * mmsize]
    paddw           m5, m13
    pmaddubsw       m13, m11, [r5 + 2 * mmsize]
    paddw           m7, m13

    ; rows 0-5 complete: round (x * 512 >> 16 == x + 32 >> 6), clamp to u8,
    ; restore column order across lanes, and store
    pmulhrsw        m0, m14                         ; m0 = word: row 0
    pmulhrsw        m1, m14                         ; m1 = word: row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2
    pmulhrsw        m3, m14                         ; m3 = word: row 3
    pmulhrsw        m4, m14                         ; m4 = word: row 4
    pmulhrsw        m5, m14                         ; m5 = word: row 5
    packuswb        m0, m1
    packuswb        m2, m3
    packuswb        m4, m5
    vpermq          m0, m0, 11011000b
    vpermq          m2, m2, 11011000b
    vpermq          m4, m4, 11011000b
    vextracti128    xm1, m0, 1
    vextracti128    xm3, m2, 1
    vextracti128    xm5, m4, 1
    movu            [r2], xm0
    movu            [r2 + r3], xm1
    movu            [r2 + r3 * 2], xm2
    movu            [r2 + r6], xm3
    lea             r8, [r2 + r3 * 4]
    movu            [r8], xm4
    movu            [r8 + r3], xm5

    ; rows 6-7 still need the last two source rows (13, 14)
    movu            xm13, [r7 + r1]                 ; m13 = row 13
    punpckhbw       xm0, xm12, xm13
    punpcklbw       xm12, xm13
    vinserti128     m12, m12, xm0, 1
    pmaddubsw       m0, m12, [r5 + 3 * mmsize]
    paddw           m6, m0
    movu            xm0, [r7 + r1 * 2]              ; m0 = row 14
    punpckhbw       xm1, xm13, xm0
    punpcklbw       xm13, xm0
    vinserti128     m13, m13, xm1, 1
    pmaddubsw       m1, m13, [r5 + 3 * mmsize]
    paddw           m7, m1

    pmulhrsw        m6, m14                         ; m6 = word: row 6
    pmulhrsw        m7, m14                         ; m7 = word: row 7
    packuswb        m6, m7
    vpermq          m6, m6, 11011000b
    vextracti128    xm7, m6, 1
    movu            [r8 + r3 * 2], xm6
    movu            [r8 + r6], xm7
%endmacro
7103 | ||
;-----------------------------------------------------------------------------
; void interp_8tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst,
;                                intptr_t dstStride, int coeffIdx)
; AVX2, x86-64 only. Processes the left 16 columns with the W16 macro, then
; the remaining 8 columns inline, 16 rows per iteration, two iterations.
;-----------------------------------------------------------------------------
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_24x32, 4, 11, 15
    mov             r4d, r4m
    shl             r4d, 7                          ; coeffIdx * 128 = offset into 4x32B coeff rows
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4                          ; back up 3 rows for the 8-tap window
    lea             r6, [r3 * 3]
    lea             r10, [r1 * 4]
    mova            m14, [pw_512]
    mov             r9d, 2                          ; 32 rows = 2 x 16-row passes
.loopH:
    PROCESS_LUMA_AVX2_W16_16R                       ; columns 0-15
    add             r2, 16
    add             r0, 16

    ; columns 16-23: two 8-pixel rows packed per ymm register
    movq            xm1, [r0]                       ; m1 = row 0
    movq            xm2, [r0 + r1]                  ; m2 = row 1
    punpcklbw       xm1, xm2
    movq            xm3, [r0 + r1 * 2]              ; m3 = row 2
    punpcklbw       xm2, xm3
    vinserti128     m5, m1, xm2, 1
    pmaddubsw       m5, [r5]
    movq            xm4, [r0 + r4]                  ; m4 = row 3
    punpcklbw       xm3, xm4
    lea             r7, [r0 + r1 * 4]
    movq            xm1, [r7]                       ; m1 = row 4
    punpcklbw       xm4, xm1
    vinserti128     m2, m3, xm4, 1
    pmaddubsw       m0, m2, [r5 + 1 * mmsize]
    paddw           m5, m0
    pmaddubsw       m2, [r5]
    movq            xm3, [r7 + r1]                  ; m3 = row 5
    punpcklbw       xm1, xm3
    movq            xm4, [r7 + r1 * 2]              ; m4 = row 6
    punpcklbw       xm3, xm4
    vinserti128     m1, m1, xm3, 1
    pmaddubsw       m3, m1, [r5 + 2 * mmsize]
    paddw           m5, m3
    pmaddubsw       m0, m1, [r5 + 1 * mmsize]
    paddw           m2, m0
    pmaddubsw       m1, [r5]
    movq            xm3, [r7 + r4]                  ; m3 = row 7
    punpcklbw       xm4, xm3
    lea             r7, [r7 + r1 * 4]
    movq            xm0, [r7]                       ; m0 = row 8
    punpcklbw       xm3, xm0
    vinserti128     m4, m4, xm3, 1
    pmaddubsw       m3, m4, [r5 + 3 * mmsize]
    paddw           m5, m3
    pmaddubsw       m3, m4, [r5 + 2 * mmsize]
    paddw           m2, m3
    pmaddubsw       m3, m4, [r5 + 1 * mmsize]
    paddw           m1, m3
    pmaddubsw       m4, [r5]
    movq            xm3, [r7 + r1]                  ; m3 = row 9
    punpcklbw       xm0, xm3
    movq            xm6, [r7 + r1 * 2]              ; m6 = row 10
    punpcklbw       xm3, xm6
    vinserti128     m0, m0, xm3, 1
    pmaddubsw       m3, m0, [r5 + 3 * mmsize]
    paddw           m2, m3
    pmaddubsw       m3, m0, [r5 + 2 * mmsize]
    paddw           m1, m3
    pmaddubsw       m3, m0, [r5 + 1 * mmsize]
    paddw           m4, m3
    pmaddubsw       m0, [r5]

    movq            xm3, [r7 + r4]                  ; m3 = row 11
    punpcklbw       xm6, xm3
    lea             r7, [r7 + r1 * 4]
    movq            xm7, [r7]                       ; m7 = row 12
    punpcklbw       xm3, xm7
    vinserti128     m6, m6, xm3, 1
    pmaddubsw       m3, m6, [r5 + 3 * mmsize]
    paddw           m1, m3
    pmaddubsw       m3, m6, [r5 + 2 * mmsize]
    paddw           m4, m3
    pmaddubsw       m3, m6, [r5 + 1 * mmsize]
    paddw           m0, m3
    pmaddubsw       m6, [r5]
    movq            xm3, [r7 + r1]                  ; m3 = row 13
    punpcklbw       xm7, xm3
    movq            xm8, [r7 + r1 * 2]              ; m8 = row 14
    punpcklbw       xm3, xm8
    vinserti128     m7, m7, xm3, 1
    pmaddubsw       m3, m7, [r5 + 3 * mmsize]
    paddw           m4, m3
    pmaddubsw       m3, m7, [r5 + 2 * mmsize]
    paddw           m0, m3
    pmaddubsw       m3, m7, [r5 + 1 * mmsize]
    paddw           m6, m3
    pmaddubsw       m7, [r5]
    movq            xm3, [r7 + r4]                  ; m3 = row 15
    punpcklbw       xm8, xm3
    lea             r7, [r7 + r1 * 4]
    movq            xm9, [r7]                       ; m9 = row 16
    punpcklbw       xm3, xm9
    vinserti128     m8, m8, xm3, 1
    pmaddubsw       m3, m8, [r5 + 3 * mmsize]
    paddw           m0, m3
    pmaddubsw       m3, m8, [r5 + 2 * mmsize]
    paddw           m6, m3
    pmaddubsw       m3, m8, [r5 + 1 * mmsize]
    paddw           m7, m3
    pmaddubsw       m8, [r5]
    movq            xm3, [r7 + r1]                  ; m3 = row 17
    punpcklbw       xm9, xm3
    movq            xm10, [r7 + r1 * 2]             ; m10 = row 18
    punpcklbw       xm3, xm10
    vinserti128     m9, m9, xm3, 1
    pmaddubsw       m3, m9, [r5 + 3 * mmsize]
    paddw           m6, m3
    pmaddubsw       m3, m9, [r5 + 2 * mmsize]
    paddw           m7, m3
    pmaddubsw       m3, m9, [r5 + 1 * mmsize]
    paddw           m8, m3
    movq            xm3, [r7 + r4]                  ; m3 = row 19
    punpcklbw       xm10, xm3
    lea             r7, [r7 + r1 * 4]
    movq            xm9, [r7]                       ; m9 = row 20
    punpcklbw       xm3, xm9
    vinserti128     m10, m10, xm3, 1
    pmaddubsw       m3, m10, [r5 + 3 * mmsize]
    paddw           m7, m3
    pmaddubsw       m3, m10, [r5 + 2 * mmsize]
    paddw           m8, m3
    movq            xm3, [r7 + r1]                  ; m3 = row 21
    punpcklbw       xm9, xm3
    movq            xm10, [r7 + r1 * 2]             ; m10 = row 22
    punpcklbw       xm3, xm10
    vinserti128     m9, m9, xm3, 1
    pmaddubsw       m3, m9, [r5 + 3 * mmsize]
    paddw           m8, m3

    ; round, clamp and store the 8-column strip, two rows per register
    pmulhrsw        m5, m14                         ; m5 = word: row 0, row 1
    pmulhrsw        m2, m14                         ; m2 = word: row 2, row 3
    pmulhrsw        m1, m14                         ; m1 = word: row 4, row 5
    pmulhrsw        m4, m14                         ; m4 = word: row 6, row 7
    pmulhrsw        m0, m14                         ; m0 = word: row 8, row 9
    pmulhrsw        m6, m14                         ; m6 = word: row 10, row 11
    pmulhrsw        m7, m14                         ; m7 = word: row 12, row 13
    pmulhrsw        m8, m14                         ; m8 = word: row 14, row 15
    packuswb        m5, m2
    packuswb        m1, m4
    packuswb        m0, m6
    packuswb        m7, m8
    vextracti128    xm2, m5, 1
    vextracti128    xm4, m1, 1
    vextracti128    xm6, m0, 1
    vextracti128    xm8, m7, 1
    movq            [r2], xm5
    movq            [r2 + r3], xm2
    movhps          [r2 + r3 * 2], xm5
    movhps          [r2 + r6], xm2
    lea             r8, [r2 + r3 * 4]
    movq            [r8], xm1
    movq            [r8 + r3], xm4
    movhps          [r8 + r3 * 2], xm1
    movhps          [r8 + r6], xm4
    lea             r8, [r8 + r3 * 4]
    movq            [r8], xm0
    movq            [r8 + r3], xm6
    movhps          [r8 + r3 * 2], xm0
    movhps          [r8 + r6], xm6
    lea             r8, [r8 + r3 * 4]
    movq            [r8], xm7
    movq            [r8 + r3], xm8
    movhps          [r8 + r3 * 2], xm7
    movhps          [r8 + r6], xm8

    ; rewind r7 by 4 rows and step both pointers to the next 16-row band
    sub             r7, r10
    lea             r0, [r7 - 16]
    lea             r2, [r8 + r3 * 4 - 16]
    dec             r9d
    jnz             .loopH
    RET
%endif
7290 | ||
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_32xN %1 = width, %2 = height
; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst,
;                                intptr_t dstStride, int coeffIdx)
; AVX2, x86-64 only. Tiles the frame in 16x16 blocks via
; PROCESS_LUMA_AVX2_W16_16R (which leaves r7/r8 positioned for the advance).
;-----------------------------------------------------------------------------
%macro FILTER_VER_LUMA_AVX2_32xN 2
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
    mov             r4d, r4m
    shl             r4d, 7                          ; coeffIdx * 128
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4                          ; back up 3 rows for the 8-tap window
    lea             r6, [r3 * 3]
    lea             r11, [r1 * 4]
    mova            m14, [pw_512]
    mov             r9d, %2 / 16                    ; row-band counter
.loopH:
    mov             r10d, %1 / 16                   ; 16-column tile counter
.loopW:
    PROCESS_LUMA_AVX2_W16_16R
    add             r2, 16
    add             r0, 16
    dec             r10d
    jnz             .loopW
    sub             r7, r11                         ; r7 overshot by 4 rows inside the macro
    lea             r0, [r7 - 16]
    lea             r2, [r8 + r3 * 4 - 16]
    dec             r9d
    jnz             .loopH
    RET
%endif
%endmacro

FILTER_VER_LUMA_AVX2_32xN 32, 32
FILTER_VER_LUMA_AVX2_32xN 32, 64
7330 | ||
;-----------------------------------------------------------------------------
; void interp_8tap_vert_pp_32x16(pixel *src, intptr_t srcStride, pixel *dst,
;                                intptr_t dstStride, int coeffIdx)
; AVX2, x86-64 only. Height fits one 16-row pass, so only the two 16-column
; tiles are looped; no band-advance bookkeeping is needed.
;-----------------------------------------------------------------------------
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_32x16, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7                          ; coeffIdx * 128
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4                          ; back up 3 rows for the 8-tap window
    lea             r6, [r3 * 3]
    mova            m14, [pw_512]
    mov             r9d, 2                          ; 32 columns = 2 tiles
.loopW:
    PROCESS_LUMA_AVX2_W16_16R
    add             r2, 16
    add             r0, 16
    dec             r9d
    jnz             .loopW
    RET
%endif
7357 | ||
;-----------------------------------------------------------------------------
; void interp_8tap_vert_pp_32x24(pixel *src, intptr_t srcStride, pixel *dst,
;                                intptr_t dstStride, int coeffIdx)
; AVX2, x86-64 only. Two 16-column tiles of 16 rows, then two tiles of the
; remaining 8 rows via the W16_8R macro.
;-----------------------------------------------------------------------------
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_32x24, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7                          ; coeffIdx * 128
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4                          ; back up 3 rows for the 8-tap window
    lea             r6, [r3 * 3]
    mova            m14, [pw_512]
    mov             r9d, 2
.loopW:
    PROCESS_LUMA_AVX2_W16_16R
    add             r2, 16
    add             r0, 16
    dec             r9d
    jnz             .loopW
    ; advance to the final 8-row band (r7/r8 were left set by the macro)
    lea             r9, [r1 * 4]
    sub             r7, r9
    lea             r0, [r7 - 16]
    lea             r2, [r8 + r3 * 4 - 16]
    mov             r9d, 2
.loop:
    PROCESS_LUMA_AVX2_W16_8R
    add             r2, 16
    add             r0, 16
    dec             r9d
    jnz             .loop
    RET
%endif
7395 | ||
;-----------------------------------------------------------------------------
; void interp_8tap_vert_pp_32x8(pixel *src, intptr_t srcStride, pixel *dst,
;                               intptr_t dstStride, int coeffIdx)
; AVX2, x86-64 only. Two 16-column tiles, single 8-row pass each.
;-----------------------------------------------------------------------------
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_32x8, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7                          ; coeffIdx * 128
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4                          ; back up 3 rows for the 8-tap window
    lea             r6, [r3 * 3]
    mova            m14, [pw_512]
    mov             r9d, 2                          ; 32 columns = 2 tiles
.loopW:
    PROCESS_LUMA_AVX2_W16_8R
    add             r2, 16
    add             r0, 16
    dec             r9d
    jnz             .loopW
    RET
%endif
7422 | ||
;-----------------------------------------------------------------------------
; void interp_8tap_vert_pp_48x64(pixel *src, intptr_t srcStride, pixel *dst,
;                                intptr_t dstStride, int coeffIdx)
; AVX2, x86-64 only. 3 tiles of 16 columns x 4 bands of 16 rows.
;-----------------------------------------------------------------------------
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_48x64, 4, 12, 15
    mov             r4d, r4m
    shl             r4d, 7                          ; coeffIdx * 128
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4                          ; back up 3 rows for the 8-tap window
    lea             r6, [r3 * 3]
    lea             r11, [r1 * 4]
    mova            m14, [pw_512]
    mov             r9d, 4                          ; 64 rows = 4 bands
.loopH:
    mov             r10d, 3                         ; 48 columns = 3 tiles
.loopW:
    PROCESS_LUMA_AVX2_W16_16R
    add             r2, 16
    add             r0, 16
    dec             r10d
    jnz             .loopW
    sub             r7, r11                         ; undo the macro's 4-row overshoot
    lea             r0, [r7 - 32]                   ; rewind the 2 x 16-column advances
    lea             r2, [r8 + r3 * 4 - 32]
    dec             r9d
    jnz             .loopH
    RET
%endif
7457 | ||
;-----------------------------------------------------------------------------
; FILTER_VER_LUMA_AVX2_64xN %1 = width (64), %2 = height
; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst,
;                                intptr_t dstStride, int coeffIdx)
; AVX2, x86-64 only. 4 tiles of 16 columns per band of 16 rows.
;-----------------------------------------------------------------------------
%macro FILTER_VER_LUMA_AVX2_64xN 2
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15
    mov             r4d, r4m
    shl             r4d, 7                          ; coeffIdx * 128
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4                          ; back up 3 rows for the 8-tap window
    lea             r6, [r3 * 3]
    lea             r11, [r1 * 4]
    mova            m14, [pw_512]
    mov             r9d, %2 / 16                    ; row-band counter
.loopH:
    mov             r10d, %1 / 16                   ; column-tile counter
.loopW:
    PROCESS_LUMA_AVX2_W16_16R
    add             r2, 16
    add             r0, 16
    dec             r10d
    jnz             .loopW
    sub             r7, r11                         ; undo the macro's 4-row overshoot
    lea             r0, [r7 - 48]                   ; rewind the 3 x 16-column advances
    lea             r2, [r8 + r3 * 4 - 48]
    dec             r9d
    jnz             .loopH
    RET
%endif
%endmacro

FILTER_VER_LUMA_AVX2_64xN 64, 32
FILTER_VER_LUMA_AVX2_64xN 64, 48
FILTER_VER_LUMA_AVX2_64xN 64, 64
7498 | ||
;-----------------------------------------------------------------------------
; void interp_8tap_vert_pp_64x16(pixel *src, intptr_t srcStride, pixel *dst,
;                                intptr_t dstStride, int coeffIdx)
; AVX2, x86-64 only. Single 16-row band, 4 tiles of 16 columns.
;-----------------------------------------------------------------------------
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_vert_pp_64x16, 4, 10, 15
    mov             r4d, r4m
    shl             r4d, 7                          ; coeffIdx * 128
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer_32]
    add             r5, r4
%else
    lea             r5, [tab_LumaCoeffVer_32 + r4]
%endif
    lea             r4, [r1 * 3]
    sub             r0, r4                          ; back up 3 rows for the 8-tap window
    lea             r6, [r3 * 3]
    mova            m14, [pw_512]
    mov             r9d, 4                          ; 64 columns = 4 tiles
.loopW:
    PROCESS_LUMA_AVX2_W16_16R
    add             r2, 16
    add             r0, 16
    dec             r9d
    jnz             .loopW
    RET
%endif
7525 | ||
72b9787e JB |
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; SSE4 fallback; %1 = width, %2 = height, %3 = pp (pixel output) or ps
; (intermediate 16-bit output, dstStride doubled, pw_2000 offset subtracted).
; Works in 8-column x 4-row sub-blocks via PROCESS_LUMA_W8_4R; the height/4
; loop counter lives in the reserved gprsize stack slot.
;-------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA 3
INIT_XMM sse4
cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8, 0-gprsize
    lea             r5, [3 * r1]
    sub             r0, r5                          ; back up 3 rows for the 8-tap window
    shl             r4d, 6                          ; coeffIdx * 64 = offset into 4x16B coeff rows
%ifidn %3,ps
    add             r3d, r3d                        ; ps writes int16_t: stride in bytes doubles
%endif
%ifdef PIC
    lea             r5, [tab_LumaCoeffVer]
    lea             r6, [r5 + r4]
%else
    lea             r6, [tab_LumaCoeffVer + r4]
%endif
%ifidn %3,pp
    mova            m3, [pw_512]                    ; pp: pmulhrsw rounding factor
%else
    mova            m3, [pw_2000]                   ; ps: DC offset to subtract
%endif
    mov             dword [rsp], %2/4               ; row-quad counter
.loopH:
    mov             r4d, (%1/8)                     ; 8-column block counter
.loopW:
    PROCESS_LUMA_W8_4R                              ; m7..m4 = 4 filtered rows
%ifidn %3,pp
    pmulhrsw        m7, m3
    pmulhrsw        m6, m3
    pmulhrsw        m5, m3
    pmulhrsw        m4, m3
    packuswb        m7, m6
    packuswb        m5, m4
    movlps          [r2], m7
    movhps          [r2 + r3], m7
    lea             r5, [r2 + 2 * r3]
    movlps          [r5], m5
    movhps          [r5 + r3], m5
%else
    psubw           m7, m3
    psubw           m6, m3
    psubw           m5, m3
    psubw           m4, m3
    movu            [r2], m7
    movu            [r2 + r3], m6
    lea             r5, [r2 + 2 * r3]
    movu            [r5], m5
    movu            [r5 + r3], m4
%endif
    ; step src back up 8 rows and right 8 columns for the next block
    lea             r5, [8 * r1 - 8]
    sub             r0, r5
%ifidn %3,pp
    add             r2, 8                           ; 8 pixels
%else
    add             r2, 16                          ; 8 int16_t
%endif
    dec             r4d
    jnz             .loopW
    ; advance to the next 4-row band
    lea             r0, [r0 + 4 * r1 - %1]
%ifidn %3,pp
    lea             r2, [r2 + 4 * r3 - %1]
%else
    lea             r2, [r2 + 4 * r3 - 2 * %1]
%endif
    dec             dword [rsp]
    jnz             .loopH
    RET
%endmacro

FILTER_VER_LUMA 16, 4, pp
FILTER_VER_LUMA 16, 8, pp
FILTER_VER_LUMA 16, 12, pp
FILTER_VER_LUMA 16, 16, pp
FILTER_VER_LUMA 16, 32, pp
FILTER_VER_LUMA 16, 64, pp
FILTER_VER_LUMA 24, 32, pp
FILTER_VER_LUMA 32, 8, pp
FILTER_VER_LUMA 32, 16, pp
FILTER_VER_LUMA 32, 24, pp
FILTER_VER_LUMA 32, 32, pp
FILTER_VER_LUMA 32, 64, pp
FILTER_VER_LUMA 48, 64, pp
FILTER_VER_LUMA 64, 16, pp
FILTER_VER_LUMA 64, 32, pp
FILTER_VER_LUMA 64, 48, pp
FILTER_VER_LUMA 64, 64, pp

FILTER_VER_LUMA 16, 4, ps
FILTER_VER_LUMA 16, 8, ps
FILTER_VER_LUMA 16, 12, ps
FILTER_VER_LUMA 16, 16, ps
FILTER_VER_LUMA 16, 32, ps
FILTER_VER_LUMA 16, 64, ps
FILTER_VER_LUMA 24, 32, ps
FILTER_VER_LUMA 32, 8, ps
FILTER_VER_LUMA 32, 16, ps
FILTER_VER_LUMA 32, 24, ps
FILTER_VER_LUMA 32, 32, ps
FILTER_VER_LUMA 32, 64, ps
FILTER_VER_LUMA 48, 64, ps
FILTER_VER_LUMA 64, 16, ps
FILTER_VER_LUMA 64, 32, ps
FILTER_VER_LUMA 64, 48, ps
FILTER_VER_LUMA 64, 64, ps
7642 | ||
;-----------------------------------------------------------------------------
; PROCESS_LUMA_SP_W4_4R: 8-tap vertical filter on 16-bit intermediates,
; 4 columns x 4 rows.
; In:   r0 = src (int16_t, advanced 8 rows on exit), r1 = srcStride in bytes,
;       r6 = 4x16B coeff rows.
; Out:  m0-m3 = 32-bit accumulated rows 1-4 (unrounded).
; Clobbers: m4, m5, m6.
;-----------------------------------------------------------------------------
%macro PROCESS_LUMA_SP_W4_4R 0
    movq            m0, [r0]
    movq            m1, [r0 + r1]
    punpcklwd       m0, m1                          ; m0 = [0 1]
    pmaddwd         m0, [r6 + 0 * 16]               ; m0 = [0+1] Row1

    lea             r0, [r0 + 2 * r1]
    movq            m4, [r0]
    punpcklwd       m1, m4                          ; m1 = [1 2]
    pmaddwd         m1, [r6 + 0 * 16]               ; m1 = [1+2] Row2

    movq            m5, [r0 + r1]
    punpcklwd       m4, m5                          ; m4 = [2 3]
    pmaddwd         m2, m4, [r6 + 0 * 16]           ; m2 = [2+3] Row3
    pmaddwd         m4, [r6 + 1 * 16]
    paddd           m0, m4                          ; m0 = [0+1+2+3] Row1

    lea             r0, [r0 + 2 * r1]
    movq            m4, [r0]
    punpcklwd       m5, m4                          ; m5 = [3 4]
    pmaddwd         m3, m5, [r6 + 0 * 16]           ; m3 = [3+4] Row4
    pmaddwd         m5, [r6 + 1 * 16]
    paddd           m1, m5                          ; m1 = [1+2+3+4] Row2

    movq            m5, [r0 + r1]
    punpcklwd       m4, m5                          ; m4 = [4 5]
    pmaddwd         m6, m4, [r6 + 1 * 16]
    paddd           m2, m6                          ; m2 = [2+3+4+5] Row3
    pmaddwd         m4, [r6 + 2 * 16]
    paddd           m0, m4                          ; m0 = [0+1+2+3+4+5] Row1

    lea             r0, [r0 + 2 * r1]
    movq            m4, [r0]
    punpcklwd       m5, m4                          ; m5 = [5 6]
    pmaddwd         m6, m5, [r6 + 1 * 16]
    paddd           m3, m6                          ; m3 = [3+4+5+6] Row4
    pmaddwd         m5, [r6 + 2 * 16]
    paddd           m1, m5                          ; m1 = [1+2+3+4+5+6] Row2

    movq            m5, [r0 + r1]
    punpcklwd       m4, m5                          ; m4 = [6 7]
    pmaddwd         m6, m4, [r6 + 2 * 16]
    paddd           m2, m6                          ; m2 = [2+3+4+5+6+7] Row3
    pmaddwd         m4, [r6 + 3 * 16]
    paddd           m0, m4                          ; m0 = [0+1+2+3+4+5+6+7] Row1 end

    lea             r0, [r0 + 2 * r1]
    movq            m4, [r0]
    punpcklwd       m5, m4                          ; m5 = [7 8]
    pmaddwd         m6, m5, [r6 + 2 * 16]
    paddd           m3, m6                          ; m3 = [3+4+5+6+7+8] Row4
    pmaddwd         m5, [r6 + 3 * 16]
    paddd           m1, m5                          ; m1 = [1+2+3+4+5+6+7+8] Row2 end

    movq            m5, [r0 + r1]
    punpcklwd       m4, m5                          ; m4 = [8 9]
    pmaddwd         m4, [r6 + 3 * 16]
    paddd           m2, m4                          ; m2 = [2+3+4+5+6+7+8+9] Row3 end

    movq            m4, [r0 + 2 * r1]
    punpcklwd       m5, m4                          ; m5 = [9 10]
    pmaddwd         m5, [r6 + 3 * 16]
    paddd           m3, m5                          ; m3 = [3+4+5+6+7+8+9+10] Row4 end
%endmacro
7707 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; SSE4 short->pixel vertical 8-tap: accumulate in 32-bit, add tab_c_526336
; (rounding offset), shift right 12, clamp to u8. 4x4 sub-blocks; the
; height/4 counter lives in the reserved gprsize stack slot.
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SP 2
INIT_XMM sse4
cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8, 0-gprsize
    add             r1d, r1d                        ; srcStride in bytes (int16_t source)
    lea             r5, [r1 + 2 * r1]
    sub             r0, r5                          ; back up 3 rows for the 8-tap window
    shl             r4d, 6                          ; coeffIdx * 64
%ifdef PIC
    lea             r5, [tab_LumaCoeffV]
    lea             r6, [r5 + r4]
%else
    lea             r6, [tab_LumaCoeffV + r4]
%endif
    mova            m7, [tab_c_526336]              ; rounding offset before >> 12
    mov             dword [rsp], %2/4               ; row-quad counter
.loopH:
    mov             r4d, (%1/4)                     ; 4-column block counter
.loopW:
    PROCESS_LUMA_SP_W4_4R

    paddd           m0, m7
    paddd           m1, m7
    paddd           m2, m7
    paddd           m3, m7

    psrad           m0, 12
    psrad           m1, 12
    psrad           m2, 12
    psrad           m3, 12

    packssdw        m0, m1
    packssdw        m2, m3
    packuswb        m0, m2                          ; 4 rows of 4 pixels in m0

    movd            [r2], m0
    pextrd          [r2 + r3], m0, 1
    lea             r5, [r2 + 2 * r3]
    pextrd          [r5], m0, 2
    pextrd          [r5 + r3], m0, 3

    ; rewind 8 rows, advance 4 columns (2 bytes per int16_t sample)
    lea             r5, [8 * r1 - 2 * 4]
    sub             r0, r5
    add             r2, 4

    dec             r4d
    jnz             .loopW

    lea             r0, [r0 + 4 * r1 - 2 * %1]
    lea             r2, [r2 + 4 * r3 - %1]

    dec             dword [rsp]
    jnz             .loopH
    RET
%endmacro

;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;--------------------------------------------------------------------------------------------------------------
FILTER_VER_LUMA_SP 4, 4
FILTER_VER_LUMA_SP 8, 8
FILTER_VER_LUMA_SP 8, 4
FILTER_VER_LUMA_SP 4, 8
FILTER_VER_LUMA_SP 16, 16
FILTER_VER_LUMA_SP 16, 8
FILTER_VER_LUMA_SP 8, 16
FILTER_VER_LUMA_SP 16, 12
FILTER_VER_LUMA_SP 12, 16
FILTER_VER_LUMA_SP 16, 4
FILTER_VER_LUMA_SP 4, 16
FILTER_VER_LUMA_SP 32, 32
FILTER_VER_LUMA_SP 32, 16
FILTER_VER_LUMA_SP 16, 32
FILTER_VER_LUMA_SP 32, 24
FILTER_VER_LUMA_SP 24, 32
FILTER_VER_LUMA_SP 32, 8
FILTER_VER_LUMA_SP 8, 32
FILTER_VER_LUMA_SP 64, 64
FILTER_VER_LUMA_SP 64, 32
FILTER_VER_LUMA_SP 32, 64
FILTER_VER_LUMA_SP 64, 48
FILTER_VER_LUMA_SP 48, 64
FILTER_VER_LUMA_SP 64, 16
FILTER_VER_LUMA_SP 16, 64
7800 | ||
; TODO: combining U and V would perform better, but needs more registers
; TODO: separate paths for height aligned/unaligned to 4 could gain ~10%, but the code is more complex, so it is disabled
;-----------------------------------------------------------------------------
; chroma_p2s: convert chroma pixels to the 16-bit FENC intermediate layout.
; In:  r0 = src, r1 = srcStride, r2 = dst, r3m = width, r4m = height
; Processes 2 rows per iteration, 8 columns at a time; width remainders of
; 4 and 2 are handled by the .width4/.width2 tails.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal chroma_p2s, 3, 7, 4
    ; load width and height
    mov             r3d, r3m
    mov             r4d, r4m

    ; load constants
    mova            m2, [pb_128]
    mova            m3, [tab_c_64_n64]

.loopH:
    xor             r5d, r5d                        ; r5 = column offset
.loopW:
    lea             r6, [r0 + r5]

    movh            m0, [r6]
    punpcklbw       m0, m2
    pmaddubsw       m0, m3                          ; row 0: (pix - 128) * 64

    movh            m1, [r6 + r1]
    punpcklbw       m1, m2
    pmaddubsw       m1, m3                          ; row 1

    add             r5d, 8
    cmp             r5d, r3d
    lea             r6, [r2 + r5 * 2]
    jg              .width4                         ; fewer than 8 columns remain
    movu            [r6 + FENC_STRIDE / 2 * 0 - 16], m0
    movu            [r6 + FENC_STRIDE / 2 * 2 - 16], m1
    je              .nextH
    jmp             .loopW

.width4:
    test            r3d, 4
    jz              .width2
    test            r3d, 2                          ; ZF consumed after the stores below
    movh            [r6 + FENC_STRIDE / 2 * 0 - 16], m0
    movh            [r6 + FENC_STRIDE / 2 * 2 - 16], m1
    lea             r6, [r6 + 8]
    pshufd          m0, m0, 2                       ; shift next 2 results into low dword
    pshufd          m1, m1, 2
    jz              .nextH

.width2:
    movd            [r6 + FENC_STRIDE / 2 * 0 - 16], m0
    movd            [r6 + FENC_STRIDE / 2 * 2 - 16], m1

.nextH:
    lea             r0, [r0 + r1 * 2]
    add             r2, FENC_STRIDE / 2 * 4

    sub             r4d, 2
    jnz             .loopH
    RET
7860 | ||
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_SP_W4_4R: 4-tap vertical filter on 16-bit intermediates,
; 4 columns x 4 rows.
; In:   r0 = src (int16_t, advanced 4 rows on exit), r1 = srcStride in bytes,
;       r6 = 2x16B coeff rows.
; Out:  m0-m3 = 32-bit accumulated rows 1-4 (unrounded). Clobbers m4, m5.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_SP_W4_4R 0
    movq            m0, [r0]
    movq            m1, [r0 + r1]
    punpcklwd       m0, m1                          ; m0 = [0 1]
    pmaddwd         m0, [r6 + 0 * 16]               ; m0 = [0+1] Row1

    lea             r0, [r0 + 2 * r1]
    movq            m4, [r0]
    punpcklwd       m1, m4                          ; m1 = [1 2]
    pmaddwd         m1, [r6 + 0 * 16]               ; m1 = [1+2] Row2

    movq            m5, [r0 + r1]
    punpcklwd       m4, m5                          ; m4 = [2 3]
    pmaddwd         m2, m4, [r6 + 0 * 16]           ; m2 = [2+3] Row3
    pmaddwd         m4, [r6 + 1 * 16]
    paddd           m0, m4                          ; m0 = [0+1+2+3] Row1 done

    lea             r0, [r0 + 2 * r1]
    movq            m4, [r0]
    punpcklwd       m5, m4                          ; m5 = [3 4]
    pmaddwd         m3, m5, [r6 + 0 * 16]           ; m3 = [3+4] Row4
    pmaddwd         m5, [r6 + 1 * 16]
    paddd           m1, m5                          ; m1 = [1+2+3+4] Row2

    movq            m5, [r0 + r1]
    punpcklwd       m4, m5                          ; m4 = [4 5]
    pmaddwd         m4, [r6 + 1 * 16]
    paddd           m2, m4                          ; m2 = [2+3+4+5] Row3

    movq            m4, [r0 + 2 * r1]
    punpcklwd       m5, m4                          ; m5 = [5 6]
    pmaddwd         m5, [r6 + 1 * 16]
    paddd           m3, m5                          ; m3 = [3+4+5+6] Row4
%endmacro
7895 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; SSE4 chroma short->pixel vertical 4-tap: 32-bit accumulation + tab_c_526336
; offset, >> 12, clamp to u8. 4x4 sub-blocks; the height/4 counter lives in
; the reserved gprsize stack slot.
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP 2
INIT_XMM sse4
cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7, 0-gprsize
    add             r1d, r1d                        ; srcStride in bytes (int16_t source)
    sub             r0, r1                          ; back up 1 row for the 4-tap window
    shl             r4d, 5                          ; coeffIdx * 32
%ifdef PIC
    lea             r5, [tab_ChromaCoeffV]
    lea             r6, [r5 + r4]
%else
    lea             r6, [tab_ChromaCoeffV + r4]
%endif
    mova            m6, [tab_c_526336]              ; rounding offset before >> 12
    mov             dword [rsp], %2/4               ; row-quad counter

.loopH:
    mov             r4d, (%1/4)                     ; 4-column block counter
.loopW:
    PROCESS_CHROMA_SP_W4_4R

    paddd           m0, m6
    paddd           m1, m6
    paddd           m2, m6
    paddd           m3, m6

    psrad           m0, 12
    psrad           m1, 12
    psrad           m2, 12
    psrad           m3, 12

    packssdw        m0, m1
    packssdw        m2, m3
    packuswb        m0, m2                          ; 4 rows of 4 pixels in m0

    movd            [r2], m0
    pextrd          [r2 + r3], m0, 1
    lea             r5, [r2 + 2 * r3]
    pextrd          [r5], m0, 2
    pextrd          [r5 + r3], m0, 3

    ; rewind 4 rows, advance 4 columns (2 bytes per int16_t sample)
    lea             r5, [4 * r1 - 2 * 4]
    sub             r0, r5
    add             r2, 4

    dec             r4d
    jnz             .loopW

    lea             r0, [r0 + 4 * r1 - 2 * %1]
    lea             r2, [r2 + 4 * r3 - %1]

    dec             dword [rsp]
    jnz             .loopH
    RET
%endmacro

FILTER_VER_CHROMA_SP 4, 4
FILTER_VER_CHROMA_SP 4, 8
FILTER_VER_CHROMA_SP 16, 16
FILTER_VER_CHROMA_SP 16, 8
FILTER_VER_CHROMA_SP 16, 12
FILTER_VER_CHROMA_SP 12, 16
FILTER_VER_CHROMA_SP 16, 4
FILTER_VER_CHROMA_SP 4, 16
FILTER_VER_CHROMA_SP 32, 32
FILTER_VER_CHROMA_SP 32, 16
FILTER_VER_CHROMA_SP 16, 32
FILTER_VER_CHROMA_SP 32, 24
FILTER_VER_CHROMA_SP 24, 32
FILTER_VER_CHROMA_SP 32, 8

FILTER_VER_CHROMA_SP 16, 24
FILTER_VER_CHROMA_SP 16, 64
FILTER_VER_CHROMA_SP 12, 32
FILTER_VER_CHROMA_SP 4, 32
FILTER_VER_CHROMA_SP 32, 64
FILTER_VER_CHROMA_SP 32, 48
FILTER_VER_CHROMA_SP 24, 64

FILTER_VER_CHROMA_SP 64, 64
FILTER_VER_CHROMA_SP 64, 32
FILTER_VER_CHROMA_SP 64, 48
FILTER_VER_CHROMA_SP 48, 64
FILTER_VER_CHROMA_SP 64, 16
7989 | ||
;-----------------------------------------------------------------------------
; PROCESS_CHROMA_SP_W2_4R %1 = coeff table base register/label
; 4-tap vertical filter on 16-bit intermediates, 2 columns x 4 rows; row
; pairs are packed into one xmm so rows 1-2 land in m0 and rows 3-4 in m2.
; In:   r0 = src (int16_t, advanced 4 rows on exit), r1 = srcStride in bytes.
; Out:  m0 = 32-bit rows 1-2, m2 = 32-bit rows 3-4 (unrounded).
; Clobbers: m1, m3, m4.
;-----------------------------------------------------------------------------
%macro PROCESS_CHROMA_SP_W2_4R 1
    movd            m0, [r0]
    movd            m1, [r0 + r1]
    punpcklwd       m0, m1                          ; m0 = [0 1]

    lea             r0, [r0 + 2 * r1]
    movd            m2, [r0]
    punpcklwd       m1, m2                          ; m1 = [1 2]
    punpcklqdq      m0, m1                          ; m0 = [0 1 1 2]
    pmaddwd         m0, [%1 + 0 * 16]               ; m0 = [0+1 1+2] Row 1-2

    movd            m1, [r0 + r1]
    punpcklwd       m2, m1                          ; m2 = [2 3]

    lea             r0, [r0 + 2 * r1]
    movd            m3, [r0]
    punpcklwd       m1, m3                          ; m1 = [3 4]
    punpcklqdq      m2, m1                          ; m2 = [2 3 3 4]

    pmaddwd         m4, m2, [%1 + 1 * 16]           ; m4 = [2+3 3+4] Row 1-2
    pmaddwd         m2, [%1 + 0 * 16]               ; m2 = [2+3 3+4] Row 3-4
    paddd           m0, m4                          ; m0 = [0+1+2+3 1+2+3+4] Row 1-2

    movd            m1, [r0 + r1]
    punpcklwd       m3, m1                          ; m3 = [4 5]

    movd            m4, [r0 + 2 * r1]
    punpcklwd       m1, m4                          ; m1 = [5 6]
    punpcklqdq      m3, m1                          ; m3 = [4 5 5 6]
    pmaddwd         m3, [%1 + 1 * 16]               ; m3 = [4+5 5+6] Row 3-4
    paddd           m2, m3                          ; m2 = [2+3+4+5 3+4+5+6] Row 3-4
%endmacro
8022 | ||
;-------------------------------------------------------------------------------------------------------------------
; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; SSE4, width-2 variant: processes 2 columns x 4 rows per iteration via
; PROCESS_CHROMA_SP_W2_4R, then rounds (+tab_c_526336, >> 12) and clamps.
;-------------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SP_W2_4R 2
INIT_XMM sse4
cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6
    add             r1d, r1d                        ; srcStride in bytes (int16_t source)
    sub             r0, r1                          ; back up 1 row for the 4-tap window
    shl             r4d, 5                          ; coeffIdx * 32
%ifdef PIC
    lea             r5, [tab_ChromaCoeffV]
    lea             r5, [r5 + r4]
%else
    lea             r5, [tab_ChromaCoeffV + r4]
%endif
    mova            m5, [tab_c_526336]              ; rounding offset before >> 12
    mov             r4d, (%2/4)                     ; row-quad counter

.loopH:
    PROCESS_CHROMA_SP_W2_4R r5

    paddd           m0, m5
    paddd           m2, m5

    psrad           m0, 12
    psrad           m2, 12

    packssdw        m0, m2
    packuswb        m0, m0                          ; 4 rows x 2 pixels in low words

    pextrw          [r2], m0, 0
    pextrw          [r2 + r3], m0, 1
    lea             r2, [r2 + 2 * r3]
    pextrw          [r2], m0, 2
    pextrw          [r2 + r3], m0, 3

    lea             r2, [r2 + 2 * r3]

    dec             r4d
    jnz             .loopH
    RET
%endmacro

FILTER_VER_CHROMA_SP_W2_4R 2, 4
FILTER_VER_CHROMA_SP_W2_4R 2, 8

FILTER_VER_CHROMA_SP_W2_4R 2, 16
8075 | ||
8076 | ;-------------------------------------------------------------------------------------------------------------- | |
8077 | ; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
8078 | ;-------------------------------------------------------------------------------------------------------------- | |
8079 | INIT_XMM sse4 | |
8080 | cglobal interp_4tap_vert_sp_4x2, 5, 6, 5 | |
8081 | ||
8082 | add r1d, r1d | |
8083 | sub r0, r1 | |
8084 | shl r4d, 5 | |
8085 | ||
8086 | %ifdef PIC | |
8087 | lea r5, [tab_ChromaCoeffV] | |
8088 | lea r5, [r5 + r4] | |
8089 | %else | |
8090 | lea r5, [tab_ChromaCoeffV + r4] | |
8091 | %endif | |
8092 | ||
8093 | mova m4, [tab_c_526336] | |
8094 | ||
8095 | movq m0, [r0] | |
8096 | movq m1, [r0 + r1] | |
8097 | punpcklwd m0, m1 ;m0=[0 1] | |
8098 | pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 | |
8099 | ||
8100 | lea r0, [r0 + 2 * r1] | |
8101 | movq m2, [r0] | |
8102 | punpcklwd m1, m2 ;m1=[1 2] | |
8103 | pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 | |
8104 | ||
8105 | movq m3, [r0 + r1] | |
8106 | punpcklwd m2, m3 ;m4=[2 3] | |
8107 | pmaddwd m2, [r5 + 1 * 16] | |
8108 | paddd m0, m2 ;m0=[0+1+2+3] Row1 done | |
8109 | paddd m0, m4 | |
8110 | psrad m0, 12 | |
8111 | ||
8112 | movq m2, [r0 + 2 * r1] | |
8113 | punpcklwd m3, m2 ;m5=[3 4] | |
8114 | pmaddwd m3, [r5 + 1 * 16] | |
8115 | paddd m1, m3 ;m1 = [1+2+3+4] Row2 done | |
8116 | paddd m1, m4 | |
8117 | psrad m1, 12 | |
8118 | ||
8119 | packssdw m0, m1 | |
8120 | packuswb m0, m0 | |
8121 | ||
8122 | movd [r2], m0 | |
8123 | pextrd [r2 + r3], m0, 1 | |
8124 | ||
8125 | RET | |
8126 | ||
8127 | ;------------------------------------------------------------------------------------------------------------------- | |
8128 | ; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
8129 | ;------------------------------------------------------------------------------------------------------------------- | |
8130 | %macro FILTER_VER_CHROMA_SP_W6_H4 2 | |
8131 | INIT_XMM sse4 | |
8132 | cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7 | |
8133 | ||
8134 | add r1d, r1d | |
8135 | sub r0, r1 | |
8136 | shl r4d, 5 | |
8137 | ||
8138 | %ifdef PIC | |
8139 | lea r5, [tab_ChromaCoeffV] | |
8140 | lea r6, [r5 + r4] | |
8141 | %else | |
8142 | lea r6, [tab_ChromaCoeffV + r4] | |
8143 | %endif | |
8144 | ||
8145 | mova m6, [tab_c_526336] | |
8146 | ||
8147 | mov r4d, %2/4 | |
8148 | ||
8149 | .loopH: | |
8150 | PROCESS_CHROMA_SP_W4_4R | |
8151 | ||
8152 | paddd m0, m6 | |
8153 | paddd m1, m6 | |
8154 | paddd m2, m6 | |
8155 | paddd m3, m6 | |
8156 | ||
8157 | psrad m0, 12 | |
8158 | psrad m1, 12 | |
8159 | psrad m2, 12 | |
8160 | psrad m3, 12 | |
8161 | ||
8162 | packssdw m0, m1 | |
8163 | packssdw m2, m3 | |
8164 | ||
8165 | packuswb m0, m2 | |
8166 | ||
8167 | movd [r2], m0 | |
8168 | pextrd [r2 + r3], m0, 1 | |
8169 | lea r5, [r2 + 2 * r3] | |
8170 | pextrd [r5], m0, 2 | |
8171 | pextrd [r5 + r3], m0, 3 | |
8172 | ||
8173 | lea r5, [4 * r1 - 2 * 4] | |
8174 | sub r0, r5 | |
8175 | add r2, 4 | |
8176 | ||
8177 | PROCESS_CHROMA_SP_W2_4R r6 | |
8178 | ||
8179 | paddd m0, m6 | |
8180 | paddd m2, m6 | |
8181 | ||
8182 | psrad m0, 12 | |
8183 | psrad m2, 12 | |
8184 | ||
8185 | packssdw m0, m2 | |
8186 | packuswb m0, m0 | |
8187 | ||
8188 | pextrw [r2], m0, 0 | |
8189 | pextrw [r2 + r3], m0, 1 | |
8190 | lea r2, [r2 + 2 * r3] | |
8191 | pextrw [r2], m0, 2 | |
8192 | pextrw [r2 + r3], m0, 3 | |
8193 | ||
8194 | sub r0, 2 * 4 | |
8195 | lea r2, [r2 + 2 * r3 - 4] | |
8196 | ||
8197 | dec r4d | |
8198 | jnz .loopH | |
8199 | ||
8200 | RET | |
8201 | %endmacro | |
8202 | ||
8203 | FILTER_VER_CHROMA_SP_W6_H4 6, 8 | |
8204 | ||
8205 | FILTER_VER_CHROMA_SP_W6_H4 6, 16 | |
8206 | ||
8207 | %macro PROCESS_CHROMA_SP_W8_2R 0 | |
8208 | movu m1, [r0] | |
8209 | movu m3, [r0 + r1] | |
8210 | punpcklwd m0, m1, m3 | |
8211 | pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l | |
8212 | punpckhwd m1, m3 | |
8213 | pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h | |
8214 | ||
8215 | movu m4, [r0 + 2 * r1] | |
8216 | punpcklwd m2, m3, m4 | |
8217 | pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l | |
8218 | punpckhwd m3, m4 | |
8219 | pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h | |
8220 | ||
8221 | lea r0, [r0 + 2 * r1] | |
8222 | movu m5, [r0 + r1] | |
8223 | punpcklwd m6, m4, m5 | |
8224 | pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l | |
8225 | paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum | |
8226 | punpckhwd m4, m5 | |
8227 | pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h | |
8228 | paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum | |
8229 | ||
8230 | movu m4, [r0 + 2 * r1] | |
8231 | punpcklwd m6, m5, m4 | |
8232 | pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l | |
8233 | paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum | |
8234 | punpckhwd m5, m4 | |
8235 | pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h | |
8236 | paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum | |
8237 | %endmacro | |
8238 | ||
8239 | ;-------------------------------------------------------------------------------------------------------------- | |
8240 | ; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
8241 | ;-------------------------------------------------------------------------------------------------------------- | |
8242 | %macro FILTER_VER_CHROMA_SP_W8_H2 2 | |
8243 | INIT_XMM sse2 | |
8244 | cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8 | |
8245 | ||
8246 | add r1d, r1d | |
8247 | sub r0, r1 | |
8248 | shl r4d, 5 | |
8249 | ||
8250 | %ifdef PIC | |
8251 | lea r5, [tab_ChromaCoeffV] | |
8252 | lea r5, [r5 + r4] | |
8253 | %else | |
8254 | lea r5, [tab_ChromaCoeffV + r4] | |
8255 | %endif | |
8256 | ||
8257 | mova m7, [tab_c_526336] | |
8258 | ||
8259 | mov r4d, %2/2 | |
8260 | .loopH: | |
8261 | PROCESS_CHROMA_SP_W8_2R | |
8262 | ||
8263 | paddd m0, m7 | |
8264 | paddd m1, m7 | |
8265 | paddd m2, m7 | |
8266 | paddd m3, m7 | |
8267 | ||
8268 | psrad m0, 12 | |
8269 | psrad m1, 12 | |
8270 | psrad m2, 12 | |
8271 | psrad m3, 12 | |
8272 | ||
8273 | packssdw m0, m1 | |
8274 | packssdw m2, m3 | |
8275 | ||
8276 | packuswb m0, m2 | |
8277 | ||
8278 | movlps [r2], m0 | |
8279 | movhps [r2 + r3], m0 | |
8280 | ||
8281 | lea r2, [r2 + 2 * r3] | |
8282 | ||
8283 | dec r4d | |
8284 | jnz .loopH | |
8285 | ||
8286 | RET | |
8287 | %endmacro | |
8288 | ||
8289 | FILTER_VER_CHROMA_SP_W8_H2 8, 2 | |
8290 | FILTER_VER_CHROMA_SP_W8_H2 8, 4 | |
8291 | FILTER_VER_CHROMA_SP_W8_H2 8, 6 | |
8292 | FILTER_VER_CHROMA_SP_W8_H2 8, 8 | |
8293 | FILTER_VER_CHROMA_SP_W8_H2 8, 16 | |
8294 | FILTER_VER_CHROMA_SP_W8_H2 8, 32 | |
8295 | ||
8296 | FILTER_VER_CHROMA_SP_W8_H2 8, 12 | |
8297 | FILTER_VER_CHROMA_SP_W8_H2 8, 64 | |
8298 | ||
8299 | ||
8300 | ;----------------------------------------------------------------------------------------------------------------------------- | |
8301 | ; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
8302 | ;----------------------------------------------------------------------------------------------------------------------------- | |
8303 | %macro FILTER_HORIZ_CHROMA_2xN 2 | |
8304 | INIT_XMM sse4 | |
8305 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride | |
8306 | %define coef2 m3 | |
8307 | %define Tm0 m2 | |
8308 | %define t1 m1 | |
8309 | %define t0 m0 | |
8310 | ||
8311 | dec srcq | |
8312 | mov r4d, r4m | |
8313 | add dststrided, dststrided | |
8314 | ||
8315 | %ifdef PIC | |
8316 | lea r6, [tab_ChromaCoeff] | |
8317 | movd coef2, [r6 + r4 * 4] | |
8318 | %else | |
8319 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
8320 | %endif | |
8321 | ||
8322 | pshufd coef2, coef2, 0 | |
8323 | mova t1, [pw_2000] | |
8324 | mova Tm0, [tab_Tm] | |
8325 | ||
8326 | mov r4d, %2 | |
8327 | cmp r5m, byte 0 | |
8328 | je .loopH | |
8329 | sub srcq, srcstrideq | |
8330 | add r4d, 3 | |
8331 | ||
8332 | .loopH: | |
8333 | movh t0, [srcq] | |
8334 | pshufb t0, t0, Tm0 | |
8335 | pmaddubsw t0, coef2 | |
8336 | phaddw t0, t0 | |
8337 | psubw t0, t1 | |
8338 | movd [dstq], t0 | |
8339 | ||
8340 | lea srcq, [srcq + srcstrideq] | |
8341 | lea dstq, [dstq + dststrideq] | |
8342 | ||
8343 | dec r4d | |
8344 | jnz .loopH | |
8345 | ||
8346 | RET | |
8347 | %endmacro | |
8348 | ||
8349 | FILTER_HORIZ_CHROMA_2xN 2, 4 | |
8350 | FILTER_HORIZ_CHROMA_2xN 2, 8 | |
8351 | ||
8352 | FILTER_HORIZ_CHROMA_2xN 2, 16 | |
8353 | ||
8354 | ;----------------------------------------------------------------------------------------------------------------------------- | |
8355 | ; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
8356 | ;----------------------------------------------------------------------------------------------------------------------------- | |
8357 | %macro FILTER_HORIZ_CHROMA_4xN 2 | |
8358 | INIT_XMM sse4 | |
8359 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride | |
8360 | %define coef2 m3 | |
8361 | %define Tm0 m2 | |
8362 | %define t1 m1 | |
8363 | %define t0 m0 | |
8364 | ||
8365 | dec srcq | |
8366 | mov r4d, r4m | |
8367 | add dststrided, dststrided | |
8368 | ||
8369 | %ifdef PIC | |
8370 | lea r6, [tab_ChromaCoeff] | |
8371 | movd coef2, [r6 + r4 * 4] | |
8372 | %else | |
8373 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
8374 | %endif | |
8375 | ||
8376 | pshufd coef2, coef2, 0 | |
8377 | mova t1, [pw_2000] | |
8378 | mova Tm0, [tab_Tm] | |
8379 | ||
8380 | mov r4d, %2 | |
8381 | cmp r5m, byte 0 | |
8382 | je .loopH | |
8383 | sub srcq, srcstrideq | |
8384 | add r4d, 3 | |
8385 | ||
8386 | .loopH: | |
8387 | movh t0, [srcq] | |
8388 | pshufb t0, t0, Tm0 | |
8389 | pmaddubsw t0, coef2 | |
8390 | phaddw t0, t0 | |
8391 | psubw t0, t1 | |
8392 | movlps [dstq], t0 | |
8393 | ||
8394 | lea srcq, [srcq + srcstrideq] | |
8395 | lea dstq, [dstq + dststrideq] | |
8396 | ||
8397 | dec r4d | |
8398 | jnz .loopH | |
8399 | RET | |
8400 | %endmacro | |
8401 | ||
8402 | FILTER_HORIZ_CHROMA_4xN 4, 2 | |
8403 | FILTER_HORIZ_CHROMA_4xN 4, 4 | |
8404 | FILTER_HORIZ_CHROMA_4xN 4, 8 | |
8405 | FILTER_HORIZ_CHROMA_4xN 4, 16 | |
8406 | ||
8407 | FILTER_HORIZ_CHROMA_4xN 4, 32 | |
8408 | ||
8409 | %macro PROCESS_CHROMA_W6 3 | |
8410 | movu %1, [srcq] | |
8411 | pshufb %2, %1, Tm0 | |
8412 | pmaddubsw %2, coef2 | |
8413 | pshufb %1, %1, Tm1 | |
8414 | pmaddubsw %1, coef2 | |
8415 | phaddw %2, %1 | |
8416 | psubw %2, %3 | |
8417 | movh [dstq], %2 | |
8418 | pshufd %2, %2, 2 | |
8419 | movd [dstq + 8], %2 | |
8420 | %endmacro | |
8421 | ||
8422 | %macro PROCESS_CHROMA_W12 3 | |
8423 | movu %1, [srcq] | |
8424 | pshufb %2, %1, Tm0 | |
8425 | pmaddubsw %2, coef2 | |
8426 | pshufb %1, %1, Tm1 | |
8427 | pmaddubsw %1, coef2 | |
8428 | phaddw %2, %1 | |
8429 | psubw %2, %3 | |
8430 | movu [dstq], %2 | |
8431 | movu %1, [srcq + 8] | |
8432 | pshufb %1, %1, Tm0 | |
8433 | pmaddubsw %1, coef2 | |
8434 | phaddw %1, %1 | |
8435 | psubw %1, %3 | |
8436 | movh [dstq + 16], %1 | |
8437 | %endmacro | |
8438 | ||
8439 | ;----------------------------------------------------------------------------------------------------------------------------- | |
8440 | ; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
8441 | ;----------------------------------------------------------------------------------------------------------------------------- | |
8442 | %macro FILTER_HORIZ_CHROMA 2 | |
8443 | INIT_XMM sse4 | |
8444 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride | |
8445 | %define coef2 m5 | |
8446 | %define Tm0 m4 | |
8447 | %define Tm1 m3 | |
8448 | %define t2 m2 | |
8449 | %define t1 m1 | |
8450 | %define t0 m0 | |
8451 | ||
8452 | dec srcq | |
8453 | mov r4d, r4m | |
8454 | add dststrided, dststrided | |
8455 | ||
8456 | %ifdef PIC | |
8457 | lea r6, [tab_ChromaCoeff] | |
8458 | movd coef2, [r6 + r4 * 4] | |
8459 | %else | |
8460 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
8461 | %endif | |
8462 | ||
8463 | pshufd coef2, coef2, 0 | |
8464 | mova t2, [pw_2000] | |
8465 | mova Tm0, [tab_Tm] | |
8466 | mova Tm1, [tab_Tm + 16] | |
8467 | ||
8468 | mov r4d, %2 | |
8469 | cmp r5m, byte 0 | |
8470 | je .loopH | |
8471 | sub srcq, srcstrideq | |
8472 | add r4d, 3 | |
8473 | ||
8474 | .loopH: | |
8475 | PROCESS_CHROMA_W%1 t0, t1, t2 | |
8476 | add srcq, srcstrideq | |
8477 | add dstq, dststrideq | |
8478 | ||
8479 | dec r4d | |
8480 | jnz .loopH | |
8481 | ||
8482 | RET | |
8483 | %endmacro | |
8484 | ||
8485 | FILTER_HORIZ_CHROMA 6, 8 | |
8486 | FILTER_HORIZ_CHROMA 12, 16 | |
8487 | ||
8488 | FILTER_HORIZ_CHROMA 6, 16 | |
8489 | FILTER_HORIZ_CHROMA 12, 32 | |
8490 | ||
8491 | %macro PROCESS_CHROMA_W8 3 | |
8492 | movu %1, [srcq] | |
8493 | pshufb %2, %1, Tm0 | |
8494 | pmaddubsw %2, coef2 | |
8495 | pshufb %1, %1, Tm1 | |
8496 | pmaddubsw %1, coef2 | |
8497 | phaddw %2, %1 | |
8498 | psubw %2, %3 | |
8499 | movu [dstq], %2 | |
8500 | %endmacro | |
8501 | ||
8502 | ;----------------------------------------------------------------------------------------------------------------------------- | |
8503 | ; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
8504 | ;----------------------------------------------------------------------------------------------------------------------------- | |
8505 | %macro FILTER_HORIZ_CHROMA_8xN 2 | |
8506 | INIT_XMM sse4 | |
8507 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride | |
8508 | %define coef2 m5 | |
8509 | %define Tm0 m4 | |
8510 | %define Tm1 m3 | |
8511 | %define t2 m2 | |
8512 | %define t1 m1 | |
8513 | %define t0 m0 | |
8514 | ||
8515 | dec srcq | |
8516 | mov r4d, r4m | |
8517 | add dststrided, dststrided | |
8518 | ||
8519 | %ifdef PIC | |
8520 | lea r6, [tab_ChromaCoeff] | |
8521 | movd coef2, [r6 + r4 * 4] | |
8522 | %else | |
8523 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
8524 | %endif | |
8525 | ||
8526 | pshufd coef2, coef2, 0 | |
8527 | mova t2, [pw_2000] | |
8528 | mova Tm0, [tab_Tm] | |
8529 | mova Tm1, [tab_Tm + 16] | |
8530 | ||
8531 | mov r4d, %2 | |
8532 | cmp r5m, byte 0 | |
8533 | je .loopH | |
8534 | sub srcq, srcstrideq | |
8535 | add r4d, 3 | |
8536 | ||
8537 | .loopH: | |
8538 | PROCESS_CHROMA_W8 t0, t1, t2 | |
8539 | add srcq, srcstrideq | |
8540 | add dstq, dststrideq | |
8541 | ||
8542 | dec r4d | |
8543 | jnz .loopH | |
8544 | ||
8545 | RET | |
8546 | %endmacro | |
8547 | ||
8548 | FILTER_HORIZ_CHROMA_8xN 8, 2 | |
8549 | FILTER_HORIZ_CHROMA_8xN 8, 4 | |
8550 | FILTER_HORIZ_CHROMA_8xN 8, 6 | |
8551 | FILTER_HORIZ_CHROMA_8xN 8, 8 | |
8552 | FILTER_HORIZ_CHROMA_8xN 8, 16 | |
8553 | FILTER_HORIZ_CHROMA_8xN 8, 32 | |
8554 | ||
8555 | FILTER_HORIZ_CHROMA_8xN 8, 12 | |
8556 | FILTER_HORIZ_CHROMA_8xN 8, 64 | |
8557 | ||
8558 | %macro PROCESS_CHROMA_W16 4 | |
8559 | movu %1, [srcq] | |
8560 | pshufb %2, %1, Tm0 | |
8561 | pmaddubsw %2, coef2 | |
8562 | pshufb %1, %1, Tm1 | |
8563 | pmaddubsw %1, coef2 | |
8564 | phaddw %2, %1 | |
8565 | movu %1, [srcq + 8] | |
8566 | pshufb %4, %1, Tm0 | |
8567 | pmaddubsw %4, coef2 | |
8568 | pshufb %1, %1, Tm1 | |
8569 | pmaddubsw %1, coef2 | |
8570 | phaddw %4, %1 | |
8571 | psubw %2, %3 | |
8572 | psubw %4, %3 | |
8573 | movu [dstq], %2 | |
8574 | movu [dstq + 16], %4 | |
8575 | %endmacro | |
8576 | ||
8577 | %macro PROCESS_CHROMA_W24 4 | |
8578 | movu %1, [srcq] | |
8579 | pshufb %2, %1, Tm0 | |
8580 | pmaddubsw %2, coef2 | |
8581 | pshufb %1, %1, Tm1 | |
8582 | pmaddubsw %1, coef2 | |
8583 | phaddw %2, %1 | |
8584 | movu %1, [srcq + 8] | |
8585 | pshufb %4, %1, Tm0 | |
8586 | pmaddubsw %4, coef2 | |
8587 | pshufb %1, %1, Tm1 | |
8588 | pmaddubsw %1, coef2 | |
8589 | phaddw %4, %1 | |
8590 | psubw %2, %3 | |
8591 | psubw %4, %3 | |
8592 | movu [dstq], %2 | |
8593 | movu [dstq + 16], %4 | |
8594 | movu %1, [srcq + 16] | |
8595 | pshufb %2, %1, Tm0 | |
8596 | pmaddubsw %2, coef2 | |
8597 | pshufb %1, %1, Tm1 | |
8598 | pmaddubsw %1, coef2 | |
8599 | phaddw %2, %1 | |
8600 | psubw %2, %3 | |
8601 | movu [dstq + 32], %2 | |
8602 | %endmacro | |
8603 | ||
8604 | %macro PROCESS_CHROMA_W32 4 | |
8605 | movu %1, [srcq] | |
8606 | pshufb %2, %1, Tm0 | |
8607 | pmaddubsw %2, coef2 | |
8608 | pshufb %1, %1, Tm1 | |
8609 | pmaddubsw %1, coef2 | |
8610 | phaddw %2, %1 | |
8611 | movu %1, [srcq + 8] | |
8612 | pshufb %4, %1, Tm0 | |
8613 | pmaddubsw %4, coef2 | |
8614 | pshufb %1, %1, Tm1 | |
8615 | pmaddubsw %1, coef2 | |
8616 | phaddw %4, %1 | |
8617 | psubw %2, %3 | |
8618 | psubw %4, %3 | |
8619 | movu [dstq], %2 | |
8620 | movu [dstq + 16], %4 | |
8621 | movu %1, [srcq + 16] | |
8622 | pshufb %2, %1, Tm0 | |
8623 | pmaddubsw %2, coef2 | |
8624 | pshufb %1, %1, Tm1 | |
8625 | pmaddubsw %1, coef2 | |
8626 | phaddw %2, %1 | |
8627 | movu %1, [srcq + 24] | |
8628 | pshufb %4, %1, Tm0 | |
8629 | pmaddubsw %4, coef2 | |
8630 | pshufb %1, %1, Tm1 | |
8631 | pmaddubsw %1, coef2 | |
8632 | phaddw %4, %1 | |
8633 | psubw %2, %3 | |
8634 | psubw %4, %3 | |
8635 | movu [dstq + 32], %2 | |
8636 | movu [dstq + 48], %4 | |
8637 | %endmacro | |
8638 | ||
8639 | %macro PROCESS_CHROMA_W16o 5 | |
8640 | movu %1, [srcq + %5] | |
8641 | pshufb %2, %1, Tm0 | |
8642 | pmaddubsw %2, coef2 | |
8643 | pshufb %1, %1, Tm1 | |
8644 | pmaddubsw %1, coef2 | |
8645 | phaddw %2, %1 | |
8646 | movu %1, [srcq + %5 + 8] | |
8647 | pshufb %4, %1, Tm0 | |
8648 | pmaddubsw %4, coef2 | |
8649 | pshufb %1, %1, Tm1 | |
8650 | pmaddubsw %1, coef2 | |
8651 | phaddw %4, %1 | |
8652 | psubw %2, %3 | |
8653 | psubw %4, %3 | |
8654 | movu [dstq + %5 * 2], %2 | |
8655 | movu [dstq + %5 * 2 + 16], %4 | |
8656 | %endmacro | |
8657 | ||
8658 | %macro PROCESS_CHROMA_W48 4 | |
8659 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 | |
8660 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 | |
8661 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 | |
8662 | %endmacro | |
8663 | ||
8664 | %macro PROCESS_CHROMA_W64 4 | |
8665 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 | |
8666 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 | |
8667 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 | |
8668 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 48 | |
8669 | %endmacro | |
8670 | ||
8671 | ;------------------------------------------------------------------------------------------------------------------------------ | |
8672 | ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
8673 | ;------------------------------------------------------------------------------------------------------------------------------ | |
8674 | %macro FILTER_HORIZ_CHROMA_WxN 2 | |
8675 | INIT_XMM sse4 | |
8676 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride | |
8677 | %define coef2 m6 | |
8678 | %define Tm0 m5 | |
8679 | %define Tm1 m4 | |
8680 | %define t3 m3 | |
8681 | %define t2 m2 | |
8682 | %define t1 m1 | |
8683 | %define t0 m0 | |
8684 | ||
8685 | dec srcq | |
8686 | mov r4d, r4m | |
8687 | add dststrided, dststrided | |
8688 | ||
8689 | %ifdef PIC | |
8690 | lea r6, [tab_ChromaCoeff] | |
8691 | movd coef2, [r6 + r4 * 4] | |
8692 | %else | |
8693 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
8694 | %endif | |
8695 | ||
8696 | pshufd coef2, coef2, 0 | |
8697 | mova t2, [pw_2000] | |
8698 | mova Tm0, [tab_Tm] | |
8699 | mova Tm1, [tab_Tm + 16] | |
8700 | ||
8701 | mov r4d, %2 | |
8702 | cmp r5m, byte 0 | |
8703 | je .loopH | |
8704 | sub srcq, srcstrideq | |
8705 | add r4d, 3 | |
8706 | ||
8707 | .loopH: | |
8708 | PROCESS_CHROMA_W%1 t0, t1, t2, t3 | |
8709 | add srcq, srcstrideq | |
8710 | add dstq, dststrideq | |
8711 | ||
8712 | dec r4d | |
8713 | jnz .loopH | |
8714 | ||
8715 | RET | |
8716 | %endmacro | |
8717 | ||
8718 | FILTER_HORIZ_CHROMA_WxN 16, 4 | |
8719 | FILTER_HORIZ_CHROMA_WxN 16, 8 | |
8720 | FILTER_HORIZ_CHROMA_WxN 16, 12 | |
8721 | FILTER_HORIZ_CHROMA_WxN 16, 16 | |
8722 | FILTER_HORIZ_CHROMA_WxN 16, 32 | |
8723 | FILTER_HORIZ_CHROMA_WxN 24, 32 | |
8724 | FILTER_HORIZ_CHROMA_WxN 32, 8 | |
8725 | FILTER_HORIZ_CHROMA_WxN 32, 16 | |
8726 | FILTER_HORIZ_CHROMA_WxN 32, 24 | |
8727 | FILTER_HORIZ_CHROMA_WxN 32, 32 | |
8728 | ||
8729 | FILTER_HORIZ_CHROMA_WxN 16, 24 | |
8730 | FILTER_HORIZ_CHROMA_WxN 16, 64 | |
8731 | FILTER_HORIZ_CHROMA_WxN 24, 64 | |
8732 | FILTER_HORIZ_CHROMA_WxN 32, 48 | |
8733 | FILTER_HORIZ_CHROMA_WxN 32, 64 | |
8734 | ||
8735 | FILTER_HORIZ_CHROMA_WxN 64, 64 | |
8736 | FILTER_HORIZ_CHROMA_WxN 64, 32 | |
8737 | FILTER_HORIZ_CHROMA_WxN 64, 48 | |
8738 | FILTER_HORIZ_CHROMA_WxN 48, 64 | |
8739 | FILTER_HORIZ_CHROMA_WxN 64, 16 | |
8740 | ||
8741 | ||
8742 | ;--------------------------------------------------------------------------------------------------------------- | |
8743 | ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
8744 | ;--------------------------------------------------------------------------------------------------------------- | |
8745 | %macro FILTER_V_PS_W16n 2 | |
8746 | INIT_XMM sse4 | |
8747 | cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 | |
8748 | ||
8749 | mov r4d, r4m | |
8750 | sub r0, r1 | |
8751 | add r3d, r3d | |
8752 | ||
8753 | %ifdef PIC | |
8754 | lea r5, [tab_ChromaCoeff] | |
8755 | movd m0, [r5 + r4 * 4] | |
8756 | %else | |
8757 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
8758 | %endif | |
8759 | ||
8760 | pshufb m1, m0, [tab_Vm] | |
8761 | pshufb m0, [tab_Vm + 16] | |
8762 | mov r4d, %2/2 | |
8763 | ||
8764 | .loop: | |
8765 | ||
8766 | mov r6d, %1/16 | |
8767 | ||
8768 | .loopW: | |
8769 | ||
8770 | movu m2, [r0] | |
8771 | movu m3, [r0 + r1] | |
8772 | ||
8773 | punpcklbw m4, m2, m3 | |
8774 | punpckhbw m2, m3 | |
8775 | ||
8776 | pmaddubsw m4, m1 | |
8777 | pmaddubsw m2, m1 | |
8778 | ||
8779 | lea r5, [r0 + 2 * r1] | |
8780 | movu m5, [r5] | |
8781 | movu m7, [r5 + r1] | |
8782 | ||
8783 | punpcklbw m6, m5, m7 | |
8784 | pmaddubsw m6, m0 | |
8785 | paddw m4, m6 | |
8786 | ||
8787 | punpckhbw m6, m5, m7 | |
8788 | pmaddubsw m6, m0 | |
8789 | paddw m2, m6 | |
8790 | ||
8791 | mova m6, [pw_2000] | |
8792 | ||
8793 | psubw m4, m6 | |
8794 | psubw m2, m6 | |
8795 | ||
8796 | movu [r2], m4 | |
8797 | movu [r2 + 16], m2 | |
8798 | ||
8799 | punpcklbw m4, m3, m5 | |
8800 | punpckhbw m3, m5 | |
8801 | ||
8802 | pmaddubsw m4, m1 | |
8803 | pmaddubsw m3, m1 | |
8804 | ||
8805 | movu m5, [r5 + 2 * r1] | |
8806 | ||
8807 | punpcklbw m2, m7, m5 | |
8808 | punpckhbw m7, m5 | |
8809 | ||
8810 | pmaddubsw m2, m0 | |
8811 | pmaddubsw m7, m0 | |
8812 | ||
8813 | paddw m4, m2 | |
8814 | paddw m3, m7 | |
8815 | ||
8816 | psubw m4, m6 | |
8817 | psubw m3, m6 | |
8818 | ||
8819 | movu [r2 + r3], m4 | |
8820 | movu [r2 + r3 + 16], m3 | |
8821 | ||
8822 | add r0, 16 | |
8823 | add r2, 32 | |
8824 | dec r6d | |
8825 | jnz .loopW | |
8826 | ||
8827 | lea r0, [r0 + r1 * 2 - %1] | |
8828 | lea r2, [r2 + r3 * 2 - %1 * 2] | |
8829 | ||
8830 | dec r4d | |
8831 | jnz .loop | |
8832 | RET | |
8833 | %endmacro | |
8834 | ||
8835 | FILTER_V_PS_W16n 64, 64 | |
8836 | FILTER_V_PS_W16n 64, 32 | |
8837 | FILTER_V_PS_W16n 64, 48 | |
8838 | FILTER_V_PS_W16n 48, 64 | |
8839 | FILTER_V_PS_W16n 64, 16 | |
8840 | ||
8841 | ||
8842 | ;------------------------------------------------------------------------------------------------------------ | |
8843 | ;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
8844 | ;------------------------------------------------------------------------------------------------------------ | |
8845 | INIT_XMM sse4 | |
8846 | cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 | |
8847 | ||
8848 | mov r4d, r4m | |
8849 | sub r0, r1 | |
8850 | add r3d, r3d | |
8851 | ||
8852 | %ifdef PIC | |
8853 | lea r5, [tab_ChromaCoeff] | |
8854 | movd m0, [r5 + r4 * 4] | |
8855 | %else | |
8856 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
8857 | %endif | |
8858 | ||
8859 | pshufb m0, [tab_Cm] | |
8860 | ||
8861 | lea r5, [3 * r1] | |
8862 | ||
8863 | movd m2, [r0] | |
8864 | movd m3, [r0 + r1] | |
8865 | movd m4, [r0 + 2 * r1] | |
8866 | movd m5, [r0 + r5] | |
8867 | ||
8868 | punpcklbw m2, m3 | |
8869 | punpcklbw m6, m4, m5 | |
8870 | punpcklbw m2, m6 | |
8871 | ||
8872 | pmaddubsw m2, m0 | |
8873 | ||
8874 | lea r0, [r0 + 4 * r1] | |
8875 | movd m6, [r0] | |
8876 | ||
8877 | punpcklbw m3, m4 | |
8878 | punpcklbw m1, m5, m6 | |
8879 | punpcklbw m3, m1 | |
8880 | ||
8881 | pmaddubsw m3, m0 | |
8882 | phaddw m2, m3 | |
8883 | ||
8884 | mova m1, [pw_2000] | |
8885 | ||
8886 | psubw m2, m1 | |
8887 | ||
8888 | movd [r2], m2 | |
8889 | pextrd [r2 + r3], m2, 2 | |
8890 | ||
8891 | movd m2, [r0 + r1] | |
8892 | ||
8893 | punpcklbw m4, m5 | |
8894 | punpcklbw m3, m6, m2 | |
8895 | punpcklbw m4, m3 | |
8896 | ||
8897 | pmaddubsw m4, m0 | |
8898 | ||
8899 | movd m3, [r0 + 2 * r1] | |
8900 | ||
8901 | punpcklbw m5, m6 | |
8902 | punpcklbw m2, m3 | |
8903 | punpcklbw m5, m2 | |
8904 | ||
8905 | pmaddubsw m5, m0 | |
8906 | phaddw m4, m5 | |
8907 | psubw m4, m1 | |
8908 | ||
8909 | lea r2, [r2 + 2 * r3] | |
8910 | movd [r2], m4 | |
8911 | pextrd [r2 + r3], m4, 2 | |
8912 | ||
8913 | RET | |
8914 | ||
8915 | ;------------------------------------------------------------------------------------------------------------- | |
8916 | ; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
8917 | ;------------------------------------------------------------------------------------------------------------- | |
8918 | %macro FILTER_V_PS_W2 2 | |
8919 | INIT_XMM sse4 | |
8920 | cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 | |
8921 | ||
8922 | mov r4d, r4m | |
8923 | sub r0, r1 | |
8924 | add r3d, r3d | |
8925 | ||
8926 | %ifdef PIC | |
8927 | lea r5, [tab_ChromaCoeff] | |
8928 | movd m0, [r5 + r4 * 4] | |
8929 | %else | |
8930 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
8931 | %endif | |
8932 | ||
8933 | pshufb m0, [tab_Cm] | |
8934 | ||
8935 | mova m1, [pw_2000] | |
8936 | lea r5, [3 * r1] | |
8937 | mov r4d, %2/4 | |
8938 | .loop: | |
8939 | movd m2, [r0] | |
8940 | movd m3, [r0 + r1] | |
8941 | movd m4, [r0 + 2 * r1] | |
8942 | movd m5, [r0 + r5] | |
8943 | ||
8944 | punpcklbw m2, m3 | |
8945 | punpcklbw m6, m4, m5 | |
8946 | punpcklbw m2, m6 | |
8947 | ||
8948 | pmaddubsw m2, m0 | |
8949 | ||
8950 | lea r0, [r0 + 4 * r1] | |
8951 | movd m6, [r0] | |
8952 | ||
8953 | punpcklbw m3, m4 | |
8954 | punpcklbw m7, m5, m6 | |
8955 | punpcklbw m3, m7 | |
8956 | ||
8957 | pmaddubsw m3, m0 | |
8958 | ||
8959 | phaddw m2, m3 | |
8960 | psubw m2, m1 | |
8961 | ||
8962 | ||
8963 | movd [r2], m2 | |
8964 | pshufd m2, m2, 2 | |
8965 | movd [r2 + r3], m2 | |
8966 | ||
8967 | movd m2, [r0 + r1] | |
8968 | ||
8969 | punpcklbw m4, m5 | |
8970 | punpcklbw m3, m6, m2 | |
8971 | punpcklbw m4, m3 | |
8972 | ||
8973 | pmaddubsw m4, m0 | |
8974 | ||
8975 | movd m3, [r0 + 2 * r1] | |
8976 | ||
8977 | punpcklbw m5, m6 | |
8978 | punpcklbw m2, m3 | |
8979 | punpcklbw m5, m2 | |
8980 | ||
8981 | pmaddubsw m5, m0 | |
8982 | ||
8983 | phaddw m4, m5 | |
8984 | ||
8985 | psubw m4, m1 | |
8986 | ||
8987 | lea r2, [r2 + 2 * r3] | |
8988 | movd [r2], m4 | |
8989 | pshufd m4 , m4 ,2 | |
8990 | movd [r2 + r3], m4 | |
8991 | ||
8992 | lea r2, [r2 + 2 * r3] | |
8993 | ||
8994 | dec r4d | |
8995 | jnz .loop | |
8996 | ||
8997 | RET | |
8998 | %endmacro | |
8999 | ||
9000 | FILTER_V_PS_W2 2, 8 | |
9001 | ||
9002 | FILTER_V_PS_W2 2, 16 | |
9003 | ||
9004 | ;----------------------------------------------------------------------------------------------------------------- | |
9005 | ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
9006 | ;----------------------------------------------------------------------------------------------------------------- | |
9007 | %macro FILTER_VER_CHROMA_SS 2 | |
9008 | INIT_XMM sse2 | |
9009 | cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize | |
9010 | ||
9011 | add r1d, r1d | |
9012 | add r3d, r3d | |
9013 | sub r0, r1 | |
9014 | shl r4d, 5 | |
9015 | ||
9016 | %ifdef PIC | |
9017 | lea r5, [tab_ChromaCoeffV] | |
9018 | lea r6, [r5 + r4] | |
9019 | %else | |
9020 | lea r6, [tab_ChromaCoeffV + r4] | |
9021 | %endif | |
9022 | ||
9023 | mov dword [rsp], %2/4 | |
9024 | ||
9025 | .loopH: | |
9026 | mov r4d, (%1/4) | |
9027 | .loopW: | |
9028 | PROCESS_CHROMA_SP_W4_4R | |
9029 | ||
9030 | psrad m0, 6 | |
9031 | psrad m1, 6 | |
9032 | psrad m2, 6 | |
9033 | psrad m3, 6 | |
9034 | ||
9035 | packssdw m0, m1 | |
9036 | packssdw m2, m3 | |
9037 | ||
9038 | movlps [r2], m0 | |
9039 | movhps [r2 + r3], m0 | |
9040 | lea r5, [r2 + 2 * r3] | |
9041 | movlps [r5], m2 | |
9042 | movhps [r5 + r3], m2 | |
9043 | ||
9044 | lea r5, [4 * r1 - 2 * 4] | |
9045 | sub r0, r5 | |
9046 | add r2, 2 * 4 | |
9047 | ||
9048 | dec r4d | |
9049 | jnz .loopW | |
9050 | ||
9051 | lea r0, [r0 + 4 * r1 - 2 * %1] | |
9052 | lea r2, [r2 + 4 * r3 - 2 * %1] | |
9053 | ||
9054 | dec dword [rsp] | |
9055 | jnz .loopH | |
9056 | ||
9057 | RET | |
9058 | %endmacro | |
9059 | ||
9060 | FILTER_VER_CHROMA_SS 4, 4 | |
9061 | FILTER_VER_CHROMA_SS 4, 8 | |
9062 | FILTER_VER_CHROMA_SS 16, 16 | |
9063 | FILTER_VER_CHROMA_SS 16, 8 | |
9064 | FILTER_VER_CHROMA_SS 16, 12 | |
9065 | FILTER_VER_CHROMA_SS 12, 16 | |
9066 | FILTER_VER_CHROMA_SS 16, 4 | |
9067 | FILTER_VER_CHROMA_SS 4, 16 | |
9068 | FILTER_VER_CHROMA_SS 32, 32 | |
9069 | FILTER_VER_CHROMA_SS 32, 16 | |
9070 | FILTER_VER_CHROMA_SS 16, 32 | |
9071 | FILTER_VER_CHROMA_SS 32, 24 | |
9072 | FILTER_VER_CHROMA_SS 24, 32 | |
9073 | FILTER_VER_CHROMA_SS 32, 8 | |
9074 | ||
9075 | FILTER_VER_CHROMA_SS 16, 24 | |
9076 | FILTER_VER_CHROMA_SS 12, 32 | |
9077 | FILTER_VER_CHROMA_SS 4, 32 | |
9078 | FILTER_VER_CHROMA_SS 32, 64 | |
9079 | FILTER_VER_CHROMA_SS 16, 64 | |
9080 | FILTER_VER_CHROMA_SS 32, 48 | |
9081 | FILTER_VER_CHROMA_SS 24, 64 | |
9082 | ||
9083 | FILTER_VER_CHROMA_SS 64, 64 | |
9084 | FILTER_VER_CHROMA_SS 64, 32 | |
9085 | FILTER_VER_CHROMA_SS 64, 48 | |
9086 | FILTER_VER_CHROMA_SS 48, 64 | |
9087 | FILTER_VER_CHROMA_SS 64, 16 | |
9088 | ||
9089 | ||
9090 | ;--------------------------------------------------------------------------------------------------------------------- | |
9091 | ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
9092 | ;--------------------------------------------------------------------------------------------------------------------- | |
; %1 = block width (2 for all instantiations), %2 = block height (multiple of 4).
; Each .loopH iteration produces 4 output rows of 2 pixels using the shared
; PROCESS_CHROMA_SP_W2_4R helper (defined elsewhere in this file).
9093 | %macro FILTER_VER_CHROMA_SS_W2_4R 2 | |
9094 | INIT_XMM sse4 | |
9095 | cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 | |
9096 | ||
; Strides are passed in int16_t elements; convert to byte strides.
9097 | add r1d, r1d | |
9098 | add r3d, r3d | |
; Back up one source row: the 4-tap vertical filter reads rows [-1 .. +2].
9099 | sub r0, r1 | |
; coeffIdx * 32 = byte offset of the selected coefficient set in
; tab_ChromaCoeffV (each set is used as two 16-byte rows, see [r5 + n*16]).
9100 | shl r4d, 5 | |
9101 | ||
9102 | %ifdef PIC | |
9103 | lea r5, [tab_ChromaCoeffV] | |
9104 | lea r5, [r5 + r4] | |
9105 | %else | |
9106 | lea r5, [tab_ChromaCoeffV + r4] | |
9107 | %endif | |
9108 | ||
; Loop counter: number of 4-row groups.
9109 | mov r4d, (%2/4) | |
9110 | ||
9111 | .loopH: | |
; Leaves 32-bit filtered sums for the 4 rows (2 pixels each) in m0/m2.
9112 | PROCESS_CHROMA_SP_W2_4R r5 | |
9113 | ||
; >> 6 drops the interim precision; shift only, no rounding bias (ss path).
9114 | psrad m0, 6 | |
9115 | psrad m2, 6 | |
9116 | ||
9117 | packssdw m0, m2 | |
9118 | ||
; Store one dword (2 x int16) per output row.
9119 | movd [r2], m0 | |
9120 | pextrd [r2 + r3], m0, 1 | |
9121 | lea r2, [r2 + 2 * r3] | |
9122 | pextrd [r2], m0, 2 | |
9123 | pextrd [r2 + r3], m0, 3 | |
9124 | ||
9125 | lea r2, [r2 + 2 * r3] | |
9126 | ||
9127 | dec r4d | |
9128 | jnz .loopH | |
9129 | ||
9130 | RET | |
9131 | %endmacro | |
9132 | ||
; Width-2 chroma vertical ss kernels (2x4, 2x8, 2x16).
9133 | FILTER_VER_CHROMA_SS_W2_4R 2, 4 | |
9134 | FILTER_VER_CHROMA_SS_W2_4R 2, 8 | |
9135 | ||
9136 | FILTER_VER_CHROMA_SS_W2_4R 2, 16 | |
9137 | ||
9138 | ;--------------------------------------------------------------------------------------------------------------- | |
9139 | ; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
9140 | ;--------------------------------------------------------------------------------------------------------------- | |
; Fully unrolled 4x2 case: filters 2 output rows of 4 pixels.
; Row numbers in the comments below refer to source rows relative to the
; (already rewound) r0; each output row n = 4-tap over rows [n .. n+3].
9141 | INIT_XMM sse2 | |
9142 | cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 | |
9143 | ||
; Element strides -> byte strides; rewind src one row (taps [-1 .. +2]).
9144 | add r1d, r1d | |
9145 | add r3d, r3d | |
9146 | sub r0, r1 | |
; coeffIdx * 32: byte offset of the coefficient set (two 16-byte rows).
9147 | shl r4d, 5 | |
9148 | ||
9149 | %ifdef PIC | |
9150 | lea r5, [tab_ChromaCoeffV] | |
9151 | lea r5, [r5 + r4] | |
9152 | %else | |
9153 | lea r5, [tab_ChromaCoeffV + r4] | |
9154 | %endif | |
9155 | ||
9156 | movq m0, [r0] | |
9157 | movq m1, [r0 + r1] | |
9158 | punpcklwd m0, m1 ;m0=[0 1] | |
9159 | pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 | |
9160 | ||
9161 | lea r0, [r0 + 2 * r1] | |
9162 | movq m2, [r0] | |
9163 | punpcklwd m1, m2 ;m1=[1 2] | |
9164 | pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 | |
9165 | ||
9166 | movq m3, [r0 + r1] | |
9167 | punpcklwd m2, m3 ;m2=[2 3] | |
9168 | pmaddwd m2, [r5 + 1 * 16] | |
9169 | paddd m0, m2 ;m0=[0+1+2+3] Row1 done | |
9170 | psrad m0, 6 | |
9171 | ||
9172 | movq m2, [r0 + 2 * r1] | |
9173 | punpcklwd m3, m2 ;m3=[3 4] | |
9174 | pmaddwd m3, [r5 + 1 * 16] | |
9175 | paddd m1, m3 ;m1=[1+2+3+4] Row2 done | |
9176 | psrad m1, 6 | |
9177 | ||
; Pack both rows to int16 and store 8 bytes per row.
9178 | packssdw m0, m1 | |
9179 | ||
9180 | movlps [r2], m0 | |
9181 | movhps [r2 + r3], m0 | |
9182 | ||
9183 | RET | |
9184 | ||
9185 | ;------------------------------------------------------------------------------------------------------------------- | |
9186 | ; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
9187 | ;------------------------------------------------------------------------------------------------------------------- | |
; Width 6 is handled as a 4-wide pass plus a 2-wide pass per group of 4 rows,
; reusing the shared W4/W2 helper macros defined elsewhere in this file.
9188 | %macro FILTER_VER_CHROMA_SS_W6_H4 2 | |
9189 | INIT_XMM sse4 | |
9190 | cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6 | |
9191 | ||
; Element strides -> byte strides; rewind src one row (taps [-1 .. +2]).
9192 | add r1d, r1d | |
9193 | add r3d, r3d | |
9194 | sub r0, r1 | |
; coeffIdx * 32: byte offset of the coefficient set in tab_ChromaCoeffV.
9195 | shl r4d, 5 | |
9196 | ||
9197 | %ifdef PIC | |
9198 | lea r5, [tab_ChromaCoeffV] | |
9199 | lea r6, [r5 + r4] | |
9200 | %else | |
9201 | lea r6, [tab_ChromaCoeffV + r4] | |
9202 | %endif | |
9203 | ||
; r4d = number of 4-row groups; r5 is reused as scratch inside the loop.
9204 | mov r4d, %2/4 | |
9205 | ||
9206 | .loopH: | |
; --- columns 0..3: 4 rows of 32-bit sums in m0..m3 (uses r6 implicitly) ---
9207 | PROCESS_CHROMA_SP_W4_4R | |
9208 | ||
9209 | psrad m0, 6 | |
9210 | psrad m1, 6 | |
9211 | psrad m2, 6 | |
9212 | psrad m3, 6 | |
9213 | ||
9214 | packssdw m0, m1 | |
9215 | packssdw m2, m3 | |
9216 | ||
9217 | movlps [r2], m0 | |
9218 | movhps [r2 + r3], m0 | |
9219 | lea r5, [r2 + 2 * r3] | |
9220 | movlps [r5], m2 | |
9221 | movhps [r5 + r3], m2 | |
9222 | ||
; Rewind src 4 rows (the W4 helper advanced it) and step 4 pixels (8 bytes)
; right on both src and dst for the 2-wide tail pass.
9223 | lea r5, [4 * r1 - 2 * 4] | |
9224 | sub r0, r5 | |
9225 | add r2, 2 * 4 | |
9226 | ||
; --- columns 4..5: 4 rows of 32-bit sums in m0/m2 ---
9227 | PROCESS_CHROMA_SP_W2_4R r6 | |
9228 | ||
9229 | psrad m0, 6 | |
9230 | psrad m2, 6 | |
9231 | ||
9232 | packssdw m0, m2 | |
9233 | ||
9234 | movd [r2], m0 | |
9235 | pextrd [r2 + r3], m0, 1 | |
9236 | lea r2, [r2 + 2 * r3] | |
9237 | pextrd [r2], m0, 2 | |
9238 | pextrd [r2 + r3], m0, 3 | |
9239 | ||
; Undo the 8-byte column offset and advance dst past the 4 finished rows.
9240 | sub r0, 2 * 4 | |
9241 | lea r2, [r2 + 2 * r3 - 2 * 4] | |
9242 | ||
9243 | dec r4d | |
9244 | jnz .loopH | |
9245 | ||
9246 | RET | |
9247 | %endmacro | |
9248 | ||
; Width-6 chroma vertical ss kernels (6x8, 6x16).
9249 | FILTER_VER_CHROMA_SS_W6_H4 6, 8 | |
9250 | ||
9251 | FILTER_VER_CHROMA_SS_W6_H4 6, 16 | |
9252 | ||
9253 | ||
9254 | ;---------------------------------------------------------------------------------------------------------------- | |
9255 | ; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
9256 | ;---------------------------------------------------------------------------------------------------------------- | |
; %1 = 8 (width), %2 = height (multiple of 2). Each .loopH iteration filters
; 2 output rows of 8 pixels via the shared PROCESS_CHROMA_SP_W8_2R helper.
9257 | %macro FILTER_VER_CHROMA_SS_W8_H2 2 | |
9258 | INIT_XMM sse2 | |
9259 | cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 | |
9260 | ||
; Element strides -> byte strides; rewind src one row (taps [-1 .. +2]).
9261 | add r1d, r1d | |
9262 | add r3d, r3d | |
9263 | sub r0, r1 | |
; coeffIdx * 32: byte offset of the coefficient set in tab_ChromaCoeffV.
9264 | shl r4d, 5 | |
9265 | ||
9266 | %ifdef PIC | |
9267 | lea r5, [tab_ChromaCoeffV] | |
9268 | lea r5, [r5 + r4] | |
9269 | %else | |
9270 | lea r5, [tab_ChromaCoeffV + r4] | |
9271 | %endif | |
9272 | ||
; r4d = number of 2-row groups.
9273 | mov r4d, %2/2 | |
9274 | .loopH: | |
; Leaves row 0 sums in m0/m1 and row 1 sums in m2/m3 (32-bit each).
9275 | PROCESS_CHROMA_SP_W8_2R | |
9276 | ||
; >> 6 drops interim precision; shift only, no rounding bias (ss path).
9277 | psrad m0, 6 | |
9278 | psrad m1, 6 | |
9279 | psrad m2, 6 | |
9280 | psrad m3, 6 | |
9281 | ||
9282 | packssdw m0, m1 | |
9283 | packssdw m2, m3 | |
9284 | ||
; 16 bytes (8 x int16) per output row; dst alignment not assumed.
9285 | movu [r2], m0 | |
9286 | movu [r2 + r3], m2 | |
9287 | ||
9288 | lea r2, [r2 + 2 * r3] | |
9289 | ||
9290 | dec r4d | |
9291 | jnz .loopH | |
9292 | ||
9293 | RET | |
9294 | %endmacro | |
9295 | ||
; Width-8 chroma vertical ss kernels for every required height.
9296 | FILTER_VER_CHROMA_SS_W8_H2 8, 2 | |
9297 | FILTER_VER_CHROMA_SS_W8_H2 8, 4 | |
9298 | FILTER_VER_CHROMA_SS_W8_H2 8, 6 | |
9299 | FILTER_VER_CHROMA_SS_W8_H2 8, 8 | |
9300 | FILTER_VER_CHROMA_SS_W8_H2 8, 16 | |
9301 | FILTER_VER_CHROMA_SS_W8_H2 8, 32 | |
9302 | ||
9303 | FILTER_VER_CHROMA_SS_W8_H2 8, 12 | |
9304 | FILTER_VER_CHROMA_SS_W8_H2 8, 64 | |
9305 | ||
9306 | ;----------------------------------------------------------------------------------------------------------------- | |
9307 | ; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
9308 | ;----------------------------------------------------------------------------------------------------------------- | |
; 8-tap luma vertical ss filter, %1 x %2 block (both multiples of 4).
; Register roles inside the loops:
;   r0 = src cursor (starts 3 rows above the block: taps [-3 .. +4])
;   r6 -> selected 64-byte coefficient set, read as four 16-byte pairs
;   r4d = remaining 4-column strips (.loopW); [rsp] = remaining 4-row
;         groups (.loopH) -- kept on the stack because GPRs run out
;   m0..m3 accumulate Row1..Row4; m4/m5 hold interleaved source row pairs
; Source-row comments like ;m4=[2 3] name the two rows interleaved by
; punpcklwd, relative to the current strip origin.
9309 | %macro FILTER_VER_LUMA_SS 2 | |
9310 | INIT_XMM sse2 | |
9311 | cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize | |
9312 | ||
; Element strides -> byte strides; rewind src 3 rows for the 8-tap window.
9313 | add r1d, r1d | |
9314 | add r3d, r3d | |
9315 | lea r5, [3 * r1] | |
9316 | sub r0, r5 | |
; coeffIdx * 64: byte offset of the coefficient set in tab_LumaCoeffV.
9317 | shl r4d, 6 | |
9318 | ||
9319 | %ifdef PIC | |
9320 | lea r5, [tab_LumaCoeffV] | |
9321 | lea r6, [r5 + r4] | |
9322 | %else | |
9323 | lea r6, [tab_LumaCoeffV + r4] | |
9324 | %endif | |
9325 | ||
9326 | mov dword [rsp], %2/4 | |
9327 | .loopH: | |
9328 | mov r4d, (%1/4) | |
9329 | .loopW: | |
; Seed the four row accumulators with the first tap pair of each row.
9330 | movq m0, [r0] | |
9331 | movq m1, [r0 + r1] | |
9332 | punpcklwd m0, m1 ;m0=[0 1] | |
9333 | pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 | |
9334 | ||
9335 | lea r0, [r0 + 2 * r1] | |
9336 | movq m4, [r0] | |
9337 | punpcklwd m1, m4 ;m1=[1 2] | |
9338 | pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 | |
9339 | ||
9340 | movq m5, [r0 + r1] | |
9341 | punpcklwd m4, m5 ;m4=[2 3] | |
9342 | pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 | |
9343 | pmaddwd m4, [r6 + 1 * 16] | |
9344 | paddd m0, m4 ;m0=[0+1+2+3] Row1 | |
9345 | ||
9346 | lea r0, [r0 + 2 * r1] | |
9347 | movq m4, [r0] | |
9348 | punpcklwd m5, m4 ;m5=[3 4] | |
9349 | pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 | |
9350 | pmaddwd m5, [r6 + 1 * 16] | |
9351 | paddd m1, m5 ;m1 = [1+2+3+4] Row2 | |
9352 | ||
; Each fresh row pair feeds two accumulators with different tap pairs.
9353 | movq m5, [r0 + r1] | |
9354 | punpcklwd m4, m5 ;m4=[4 5] | |
9355 | pmaddwd m6, m4, [r6 + 1 * 16] | |
9356 | paddd m2, m6 ;m2=[2+3+4+5] Row3 | |
9357 | pmaddwd m4, [r6 + 2 * 16] | |
9358 | paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 | |
9359 | ||
9360 | lea r0, [r0 + 2 * r1] | |
9361 | movq m4, [r0] | |
9362 | punpcklwd m5, m4 ;m5=[5 6] | |
9363 | pmaddwd m6, m5, [r6 + 1 * 16] | |
9364 | paddd m3, m6 ;m3=[3+4+5+6] Row4 | |
9365 | pmaddwd m5, [r6 + 2 * 16] | |
9366 | paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 | |
9367 | ||
9368 | movq m5, [r0 + r1] | |
9369 | punpcklwd m4, m5 ;m4=[6 7] | |
9370 | pmaddwd m6, m4, [r6 + 2 * 16] | |
9371 | paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 | |
9372 | pmaddwd m4, [r6 + 3 * 16] | |
9373 | paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end | |
; >> 6 drops interim precision; shift only, no rounding bias (ss path).
9374 | psrad m0, 6 | |
9375 | ||
9376 | lea r0, [r0 + 2 * r1] | |
9377 | movq m4, [r0] | |
9378 | punpcklwd m5, m4 ;m5=[7 8] | |
9379 | pmaddwd m6, m5, [r6 + 2 * 16] | |
9380 | paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 | |
9381 | pmaddwd m5, [r6 + 3 * 16] | |
9382 | paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end | |
9383 | psrad m1, 6 | |
9384 | ||
9385 | packssdw m0, m1 | |
9386 | ||
; Store rows 1-2 of the strip (8 bytes = 4 x int16 each).
9387 | movlps [r2], m0 | |
9388 | movhps [r2 + r3], m0 | |
9389 | ||
9390 | movq m5, [r0 + r1] | |
9391 | punpcklwd m4, m5 ;m4=[8 9] | |
9392 | pmaddwd m4, [r6 + 3 * 16] | |
9393 | paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end | |
9394 | psrad m2, 6 | |
9395 | ||
9396 | movq m4, [r0 + 2 * r1] | |
9397 | punpcklwd m5, m4 ;m5=[9 10] | |
9398 | pmaddwd m5, [r6 + 3 * 16] | |
9399 | paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end | |
9400 | psrad m3, 6 | |
9401 | ||
9402 | packssdw m2, m3 | |
9403 | ||
; Store rows 3-4 of the strip.
9404 | movlps [r2 + 2 * r3], m2 | |
9405 | lea r5, [3 * r3] | |
9406 | movhps [r2 + r5], m2 | |
9407 | ||
; Rewind src the 8 rows consumed and step both cursors 4 pixels right.
9408 | lea r5, [8 * r1 - 2 * 4] | |
9409 | sub r0, r5 | |
9410 | add r2, 2 * 4 | |
9411 | ||
9412 | dec r4d | |
9413 | jnz .loopW | |
9414 | ||
; Next 4-row group: advance 4 rows and undo the accumulated column offset.
9415 | lea r0, [r0 + 4 * r1 - 2 * %1] | |
9416 | lea r2, [r2 + 4 * r3 - 2 * %1] | |
9417 | ||
9418 | dec dword [rsp] | |
9419 | jnz .loopH | |
9420 | ||
9421 | RET | |
9422 | %endmacro | |
9423 | ||
; Instantiate the 8-tap luma vertical ss filter for all HEVC PU sizes.
9424 | FILTER_VER_LUMA_SS 4, 4 | |
9425 | FILTER_VER_LUMA_SS 8, 8 | |
9426 | FILTER_VER_LUMA_SS 8, 4 | |
9427 | FILTER_VER_LUMA_SS 4, 8 | |
9428 | FILTER_VER_LUMA_SS 16, 16 | |
9429 | FILTER_VER_LUMA_SS 16, 8 | |
9430 | FILTER_VER_LUMA_SS 8, 16 | |
9431 | FILTER_VER_LUMA_SS 16, 12 | |
9432 | FILTER_VER_LUMA_SS 12, 16 | |
9433 | FILTER_VER_LUMA_SS 16, 4 | |
9434 | FILTER_VER_LUMA_SS 4, 16 | |
9435 | FILTER_VER_LUMA_SS 32, 32 | |
9436 | FILTER_VER_LUMA_SS 32, 16 | |
9437 | FILTER_VER_LUMA_SS 16, 32 | |
9438 | FILTER_VER_LUMA_SS 32, 24 | |
9439 | FILTER_VER_LUMA_SS 24, 32 | |
9440 | FILTER_VER_LUMA_SS 32, 8 | |
9441 | FILTER_VER_LUMA_SS 8, 32 | |
9442 | FILTER_VER_LUMA_SS 64, 64 | |
9443 | FILTER_VER_LUMA_SS 64, 32 | |
9444 | FILTER_VER_LUMA_SS 32, 64 | |
9445 | FILTER_VER_LUMA_SS 64, 48 | |
9446 | FILTER_VER_LUMA_SS 48, 64 | |
9447 | FILTER_VER_LUMA_SS 64, 16 | |
9448 | FILTER_VER_LUMA_SS 16, 64