Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | ;***************************************************************************** |
2 | ;* Copyright (C) 2013 x265 project | |
3 | ;* | |
4 | ;* Authors: Min Chen <chenm003@163.com> | |
5 | ;* Nabajit Deka <nabajit@multicorewareinc.com> | |
6 | ;* Praveen Kumar Tiwari <praveen@multicorewareinc.com> | |
7 | ;* | |
8 | ;* This program is free software; you can redistribute it and/or modify | |
9 | ;* it under the terms of the GNU General Public License as published by | |
10 | ;* the Free Software Foundation; either version 2 of the License, or | |
11 | ;* (at your option) any later version. | |
12 | ;* | |
13 | ;* This program is distributed in the hope that it will be useful, | |
14 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | ;* GNU General Public License for more details. | |
17 | ;* | |
18 | ;* You should have received a copy of the GNU General Public License | |
19 | ;* along with this program; if not, write to the Free Software | |
20 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
21 | ;* | |
22 | ;* This program is also available under a commercial proprietary license. | |
23 | ;* For more information, contact us at license @ x265.com. | |
24 | ;*****************************************************************************/ | |
25 | ||
26 | %include "x86inc.asm" | |
27 | %include "x86util.asm" | |
28 | ||
29 | SECTION_RODATA 32 | |
; tab_Tm: three 16-byte pshufb masks for the 4-tap (and split 8-tap) horizontal
; filters.  Each mask is four 4-byte groups; every group selects four
; consecutive source bytes and starts one byte after the previous group, so a
; shuffled register holds four overlapping 4-byte windows.
;   +0  : windows starting at source bytes 0..3
;   +16 : windows starting at source bytes 4..7
;   +32 : windows starting at source bytes 8..11
; Code in this file addresses the rows by fixed offsets ([tab_Tm],
; [tab_Tm + 16], [r5 + 2*16]), so the row order and size must not change.
30 | tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 | |
31 | db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 | |
32 | db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14 | |
33 | ||
; tab_Lm: four 16-byte pshufb masks for the 8-tap horizontal filters.  Each
; mask holds two 8-byte windows; successive windows start one source byte
; later (window k covers source bytes k..k+7, k = 0..7 across the table).
; FILTER_H8_W8 / FILTER_H8_W4 read the rows at +0/+16/+32/+48, and the AVX2
; 4x4 routine loads the first 32 bytes with a single aligned ymm load, which
; is why the table is 32-byte aligned.
34 | ALIGN 32 | |
35 | tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 | |
36 | db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10 | |
37 | db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12 | |
38 | db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14 | |
39 | ||
; tab_Vm: two 16-byte byte-index patterns (0,1 repeated / 2,3 repeated).
; NOTE(review): not referenced in the portion of the file reviewed here;
; presumably a pshufb mask for the vertical filters - confirm against users.
40 | tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 | |
41 | db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 | |
42 | ||
; tab_Cm: 16-byte pattern 0,2,1,3 repeated four times.
; NOTE(review): not referenced in the portion of the file reviewed here.
43 | tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3 | |
44 | ||
; tab_c_512: eight words of 512.  Used as the pmulhrsw multiplier in the
; pixel-to-pixel horizontal filters; pmulhrsw by 512 evaluates (x + 32) >> 6.
45 | tab_c_512: times 8 dw 512 | |
; tab_c_526336: four dwords of 8192*64 + 2048, added by FILTER_HV8_END before
; its arithmetic shift right by 12 (2048 is half of 1 << 12).
; NOTE(review): the 8192*64 term presumably cancels the pw_2000 bias applied
; to the intermediate rows - confirm against the HV routine.
46 | tab_c_526336: times 4 dd 8192*64+2048 | |
47 | ||
; tab_ChromaCoeff: eight 4-tap filters, one per row, stored as signed bytes.
; The horizontal chroma routines load one row with
; [tab_ChromaCoeff + coeffIdx * 4], so each row must stay exactly 4 bytes.
; Every row sums to 64.
48 | tab_ChromaCoeff: db 0, 64, 0, 0 | |
49 | db -2, 58, 10, -2 | |
50 | db -4, 54, 16, -2 | |
51 | db -6, 46, 28, -4 | |
52 | db -4, 36, 36, -4 | |
53 | db -4, 28, 46, -6 | |
54 | db -2, 16, 54, -4 | |
55 | db -2, 10, 58, -2 | |
56 | ||
; tab_ChromaCoeffV: the same eight 4-tap filters as tab_ChromaCoeff, widened
; to signed words.  Each filter occupies 32 bytes: taps (0,1) repeated four
; times (16 bytes) followed by taps (2,3) repeated four times (16 bytes).
; NOTE(review): not referenced in the portion of the file reviewed here;
; the pair layout looks intended for pmaddwd on interleaved rows - confirm.
57 | tab_ChromaCoeffV: times 4 dw 0, 64 | |
58 | times 4 dw 0, 0 | |
59 | ||
60 | times 4 dw -2, 58 | |
61 | times 4 dw 10, -2 | |
62 | ||
63 | times 4 dw -4, 54 | |
64 | times 4 dw 16, -2 | |
65 | ||
66 | times 4 dw -6, 46 | |
67 | times 4 dw 28, -4 | |
68 | ||
69 | times 4 dw -4, 36 | |
70 | times 4 dw 36, -4 | |
71 | ||
72 | times 4 dw -4, 28 | |
73 | times 4 dw 46, -6 | |
74 | ||
75 | times 4 dw -2, 16 | |
76 | times 4 dw 54, -4 | |
77 | ||
78 | times 4 dw -2, 10 | |
79 | times 4 dw 58, -2 | |
80 | ||
; tab_LumaCoeff: four 8-tap filters, one per row, stored as signed bytes.
; The horizontal luma routines load one row with
; [tab_LumaCoeff + coeffIdx * 8], so each row must stay exactly 8 bytes.
; Every row sums to 64.
81 | tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0 | |
82 | db -1, 4, -10, 58, 17, -5, 1, 0 | |
83 | db -1, 4, -11, 40, 40, -11, 4, -1 | |
84 | db 0, 1, -5, 17, 58, -10, 4, -1 | |
85 | ||
; tab_LumaCoeffV: the four 8-tap filters of tab_LumaCoeff widened to signed
; words.  Each filter occupies 64 bytes: four 16-byte rows, each holding one
; adjacent tap pair repeated four times.  interp_8tap_hv_pp_8x8 selects a
; filter with (idxY << 6) + tab_LumaCoeffV and the FILTER_HV8_* macros step
; through its rows in 16-byte units, so the 64-byte stride must not change.
86 | tab_LumaCoeffV: times 4 dw 0, 0 | |
87 | times 4 dw 0, 64 | |
88 | times 4 dw 0, 0 | |
89 | times 4 dw 0, 0 | |
90 | ||
91 | times 4 dw -1, 4 | |
92 | times 4 dw -10, 58 | |
93 | times 4 dw 17, -5 | |
94 | times 4 dw 1, 0 | |
95 | ||
96 | times 4 dw -1, 4 | |
97 | times 4 dw -11, 40 | |
98 | times 4 dw 40, -11 | |
99 | times 4 dw 4, -1 | |
100 | ||
101 | times 4 dw 0, 1 | |
102 | times 4 dw -5, 17 | |
103 | times 4 dw 58, -10 | |
104 | times 4 dw 4, -1 | |
105 | ||
; tab_LumaCoeffVer: the four 8-tap filters of tab_LumaCoeff kept as signed
; bytes.  Each filter occupies 64 bytes: four 16-byte rows, each holding one
; adjacent tap pair repeated eight times.
; NOTE(review): not referenced in the portion of the file reviewed here;
; the byte-pair layout looks intended for pmaddubsw on interleaved rows -
; confirm against the vertical filters.
106 | tab_LumaCoeffVer: times 8 db 0, 0 | |
107 | times 8 db 0, 64 | |
108 | times 8 db 0, 0 | |
109 | times 8 db 0, 0 | |
110 | ||
111 | times 8 db -1, 4 | |
112 | times 8 db -10, 58 | |
113 | times 8 db 17, -5 | |
114 | times 8 db 1, 0 | |
115 | ||
116 | times 8 db -1, 4 | |
117 | times 8 db -11, 40 | |
118 | times 8 db 40, -11 | |
119 | times 8 db 4, -1 | |
120 | ||
121 | times 8 db 0, 1 | |
122 | times 8 db -5, 17 | |
123 | times 8 db 58, -10 | |
124 | times 8 db 4, -1 | |
125 | ||
; tab_c_128: sixteen bytes of 0x80.
; tab_c_64_n64: sixteen bytes alternating 64, -64.
; NOTE(review): neither constant is referenced in the portion of the file
; reviewed here - confirm their users before changing them.
126 | tab_c_128: times 16 db 0x80 | |
127 | tab_c_64_n64: times 8 db 64, -64 | |
128 | ||
129 | ||
130 | SECTION .text | |
131 | ||
132 | cextern idct4_shuf1 | |
133 | cextern pw_1 | |
134 | cextern pw_512 | |
135 | cextern pw_2000 | |
136 | ||
; FILTER_H4_w2_2  tmp, acc, round
; 4-tap horizontal filter for a 2-pixel-wide block, two rows per invocation.
;   %1 = scratch register (row 1 windows)
;   %2 = working register (row 0 windows, then the packed result)
;   %3 = register holding the pmulhrsw multiplier
; Relies on the caller's srcq / srcstrideq / dstq / dststrideq names and on
; its Tm0 (shuffle mask) and coef2 (broadcast taps) definitions.
; Stores 2 bytes at [dstq] and 2 bytes at [dstq + dststrideq]; clobbers r4.
%macro FILTER_H4_w2_2 3
    ; Build the sliding 4-byte windows for both rows.
    movh        %1, [srcq + srcstrideq - 1]
    movh        %2, [srcq - 1]
    pshufb      %1, Tm0
    pshufb      %2, Tm0

    ; Keep the first two windows of each row: row 0 low half, row 1 high half.
    punpcklqdq  %2, %1
    pmaddubsw   %2, coef2
    phaddw      %2, %2                  ; one word per output pixel
    pmulhrsw    %2, %3
    packuswb    %2, %2                  ; bytes: r0p0 r0p1 r1p0 r1p1 ...

    ; Bounce through a GPR to store two bytes per row.
    movd        r4d, %2
    mov         [dstq], r4w
    shr         r4d, 16
    mov         [dstq + dststrideq], r4w
%endmacro
152 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap horizontal filter, 8-bit pixels in and out, block 2 wide by 4 high.
;   src / dst             : first pixel of the block (reads start at src - 1)
;   srcStride / dstStride : row pitch in bytes
;   coeffIdx              : row index into tab_ChromaCoeff (4 bytes per row)
; Clobbers r4, r5 (PIC builds only), m0-m4.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4                  ; four taps broadcast to every dword
%define Tm0         m3                  ; pshufb mask (first row of tab_Tm)
%define t2          m2                  ; pmulhrsw multiplier
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

    ; Rows 0-1.
    FILTER_H4_w2_2 t0, t1, t2

    ; Rows 2-3.  The pointers are advanced only between row pairs, so no
    ; dead pointer update is left in front of RET.
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    FILTER_H4_w2_2 t0, t1, t2

    RET
184 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap horizontal filter, 8-bit pixels in and out, block 2 wide by 8 high.
;   src / dst             : first pixel of the block (reads start at src - 1)
;   srcStride / dstStride : row pitch in bytes
;   coeffIdx              : row index into tab_ChromaCoeff (4 bytes per row)
; Clobbers r4, r5 (PIC builds only), m0-m4.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4                  ; four taps broadcast to every dword
%define Tm0         m3                  ; pshufb mask (first row of tab_Tm)
%define t2          m2                  ; pmulhrsw multiplier
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

    ; Rows 0-1.
    FILTER_H4_w2_2 t0, t1, t2

    ; Remaining three row pairs.  Advancing the pointers ahead of each pair
    ; (rather than after it) avoids a dead pointer update before RET.
%rep 3
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    FILTER_H4_w2_2 t0, t1, t2
%endrep

    RET
216 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap horizontal filter, 8-bit pixels in and out, block 2 wide by 16 high.
; Processes two rows per loop iteration; r5d counts the remaining row pairs.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2                  ; pmulhrsw multiplier
%define Tm0         m3                  ; pshufb mask (first row of tab_Tm)
%define coef2       m4                  ; four taps broadcast to every dword

    ; Constants first, then the selected filter taps.
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

    mov         r4d, r4m
%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    mov         r5d, 16 / 2             ; row pairs left

.row_pair:
    FILTER_H4_w2_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    dec         r5d
    jnz         .row_pair

    RET
251 | ||
; FILTER_H4_w4_2  tmp, acc, round
; 4-tap horizontal filter for a 4-pixel-wide block, two rows per invocation.
;   %1 = scratch register (row 1)
;   %2 = working register (row 0, then the packed result)
;   %3 = register holding the pmulhrsw multiplier
; Relies on the caller's srcq / srcstrideq / dstq / dststrideq names and on
; its Tm0 (shuffle mask) and coef2 (broadcast taps) definitions.
; Stores 4 bytes at [dstq] and 4 bytes at [dstq + dststrideq].
%macro FILTER_H4_w4_2 3
    ; Sliding 4-byte windows for both rows.
    movh        %2, [srcq - 1]
    movh        %1, [srcq + srcstrideq - 1]
    pshufb      %2, Tm0
    pshufb      %1, Tm0

    ; Multiply by the taps and reduce to one word per pixel:
    ; row 0 ends up in the low four words, row 1 in the high four.
    pmaddubsw   %2, coef2
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3
    packuswb    %2, %2                  ; bytes: row0[0..3] row1[0..3] ...

    movd        [dstq], %2
    palignr     %2, %2, 4               ; rotate row 1 into the low dword
    movd        [dstq + dststrideq], %2
%endmacro
266 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap horizontal filter, 8-bit pixels in and out, block 4 wide by 2 high.
; Both rows are produced by a single FILTER_H4_w4_2 invocation.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2                  ; pmulhrsw multiplier
%define Tm0         m3                  ; pshufb mask (first row of tab_Tm)
%define coef2       m4                  ; four taps broadcast to every dword

    ; Constants first, then the selected filter taps.
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

    mov         r4d, r4m
%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    FILTER_H4_w4_2 t0, t1, t2

    RET
294 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap horizontal filter, 8-bit pixels in and out, block 4 wide by 4 high.
;   src / dst             : first pixel of the block (reads start at src - 1)
;   srcStride / dstStride : row pitch in bytes
;   coeffIdx              : row index into tab_ChromaCoeff (4 bytes per row)
; Clobbers r4, r5 (PIC builds only), m0-m4.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4                  ; four taps broadcast to every dword
%define Tm0         m3                  ; pshufb mask (first row of tab_Tm)
%define t2          m2                  ; pmulhrsw multiplier
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

    ; Rows 0-1.
    FILTER_H4_w4_2 t0, t1, t2

    ; Rows 2-3.  The pointers are advanced only between row pairs, so no
    ; dead pointer update is left in front of RET.
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    FILTER_H4_w4_2 t0, t1, t2

    RET
326 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap horizontal filter, 8-bit pixels in and out, block 4 wide by 8 high.
;   src / dst             : first pixel of the block (reads start at src - 1)
;   srcStride / dstStride : row pitch in bytes
;   coeffIdx              : row index into tab_ChromaCoeff (4 bytes per row)
; Clobbers r4, r5 (PIC builds only), m0-m4.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4                  ; four taps broadcast to every dword
%define Tm0         m3                  ; pshufb mask (first row of tab_Tm)
%define t2          m2                  ; pmulhrsw multiplier
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

    ; Rows 0-1.
    FILTER_H4_w4_2 t0, t1, t2

    ; Remaining three row pairs.  Advancing the pointers ahead of each pair
    ; (rather than after it) avoids a dead pointer update before RET.
%rep 3
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    FILTER_H4_w4_2 t0, t1, t2
%endrep

    RET
358 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap horizontal filter, 8-bit pixels in and out, block 4 wide by 16 high.
;   src / dst             : first pixel of the block (reads start at src - 1)
;   srcStride / dstStride : row pitch in bytes
;   coeffIdx              : row index into tab_ChromaCoeff (4 bytes per row)
; Clobbers r4, r5 (PIC builds only), m0-m4.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride
%define coef2       m4                  ; four taps broadcast to every dword
%define Tm0         m3                  ; pshufb mask (first row of tab_Tm)
%define t2          m2                  ; pmulhrsw multiplier
%define t1          m1
%define t0          m0

    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufd      coef2, coef2, 0
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

    ; Rows 0-1.
    FILTER_H4_w4_2 t0, t1, t2

    ; Remaining seven row pairs.  Advancing the pointers ahead of each pair
    ; (rather than after it) avoids a dead pointer update before RET.
%rep 7
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    FILTER_H4_w4_2 t0, t1, t2
%endrep

    RET
390 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; 4-tap horizontal filter, 8-bit pixels in and out, block 4 wide by 32 high.
; Processes two rows per loop iteration; r5d counts the remaining row pairs.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2                  ; pmulhrsw multiplier
%define Tm0         m3                  ; pshufb mask (first row of tab_Tm)
%define coef2       m4                  ; four taps broadcast to every dword

    ; Constants first, then the selected filter taps.
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]

    mov         r4d, r4m
%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    mov         r5d, 32 / 2             ; row pairs left

.row_pair:
    FILTER_H4_w4_2 t0, t1, t2
    lea         srcq, [srcq + srcstrideq * 2]
    lea         dstq, [dstq + dststrideq * 2]
    dec         r5d
    jnz         .row_pair

    RET
425 | ||
426 | ||
; FILTER_H4_w6  tmp, acc, round
; 4-tap horizontal filter for one row, 6 output pixels.
;   %1 = scratch register, %2 = result register, %3 = pmulhrsw multiplier.
; Uses the caller's Tm0 / Tm1 shuffle masks and coef2 taps.  Eight pixels
; are computed; only the first six bytes are stored at [dstq].
%macro FILTER_H4_w6 3
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0             ; windows for pixels 0-3
    pshufb      %1, Tm1                 ; windows for pixels 4-7
    pmaddubsw   %2, coef2
    pmaddubsw   %1, coef2
    phaddw      %2, %1                  ; one word per pixel
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movd        [dstq], %2              ; pixels 0-3
    pextrw      [dstq + 4], %2, 2       ; pixels 4-5
%endmacro
439 | ||
; FILTER_H4_w8  tmp, acc, round
; 4-tap horizontal filter for one row, 8 output pixels.
;   %1 = scratch register, %2 = result register, %3 = pmulhrsw multiplier.
; Uses the caller's Tm0 / Tm1 shuffle masks and coef2 taps; stores 8 bytes
; at [dstq].
%macro FILTER_H4_w8 3
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0             ; windows for pixels 0-3
    pshufb      %1, Tm1                 ; windows for pixels 4-7
    pmaddubsw   %2, coef2
    pmaddubsw   %1, coef2
    phaddw      %2, %1                  ; one word per pixel
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movh        [dstq], %2
%endmacro
451 | ||
; FILTER_H4_w12  tmp, acc, round
; 4-tap horizontal filter for one row, 12 output pixels.
;   %1 = scratch register, %2 = result register, %3 = pmulhrsw multiplier.
; Uses the caller's Tm0 / Tm1 shuffle masks and coef2 taps; stores 12 bytes
; at [dstq].
%macro FILTER_H4_w12 3
    ; Pixels 0-7.
    movu        %1, [srcq - 1]
    pshufb      %2, %1, Tm0
    pshufb      %1, Tm1
    pmaddubsw   %2, coef2
    pmaddubsw   %1, coef2
    phaddw      %2, %1

    ; Pixels 8-11 (only the Tm0 windows are needed).
    movu        %1, [srcq - 1 + 8]
    pshufb      %1, Tm0
    pmaddubsw   %1, coef2
    phaddw      %1, %1

    ; Round, pack to bytes and store 8 + 4 pixels.
    pmulhrsw    %2, %3
    pmulhrsw    %1, %3
    packuswb    %2, %1
    movh        [dstq], %2
    pextrd      [dstq + 8], %2, 2
%endmacro
469 | ||
; FILTER_H4_w16  tmp0, acc, round, tmp1
; 4-tap horizontal filter for one row, 16 output pixels stored at [dstq].
;   %1 / %4 = scratch registers, %2 = result register,
;   %3 = pmulhrsw multiplier.
; Expressed through FILTER_H4_w16o with a zero column offset, which expands
; to the same instruction sequence.
%macro FILTER_H4_w16 4
    FILTER_H4_w16o %1, %2, %3, %4, 0
%endmacro
488 | ||
; FILTER_H4_w24  tmp0, acc, round, tmp1
; 4-tap horizontal filter for one row, 24 output pixels stored at [dstq].
;   %1 / %4 = scratch registers, %2 = result register,
;   %3 = pmulhrsw multiplier.
; Uses the caller's Tm0 / Tm1 shuffle masks and coef2 taps.
%macro FILTER_H4_w24 4
    ; Pixels 0-15.
    FILTER_H4_w16o %1, %2, %3, %4, 0

    ; Pixels 16-23.
    movu        %1, [srcq - 1 + 16]
    pshufb      %2, %1, Tm0
    pshufb      %1, Tm1
    pmaddubsw   %2, coef2
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3
    packuswb    %2, %2
    movh        [dstq + 16], %2
%endmacro
516 | ||
; FILTER_H4_w32  tmp0, acc, round, tmp1
; 4-tap horizontal filter for one row, 32 output pixels stored at [dstq].
;   %1 / %4 = scratch registers, %2 = result register,
;   %3 = pmulhrsw multiplier.
; Two 16-pixel halves, each produced by FILTER_H4_w16o.
%macro FILTER_H4_w32 4
    FILTER_H4_w16o %1, %2, %3, %4, 0
    FILTER_H4_w16o %1, %2, %3, %4, 16
%endmacro
551 | ||
; FILTER_H4_w16o  tmp0, acc, round, tmp1, offset
; 4-tap horizontal filter for 16 output pixels of one row, starting at
; column %5 (a byte offset applied to both srcq and dstq).
;   %1 / %4 = scratch registers, %2 = result register,
;   %3 = pmulhrsw multiplier.
; Uses the caller's Tm0 / Tm1 shuffle masks and coef2 taps.
%macro FILTER_H4_w16o 5
    ; Pixels %5 .. %5 + 7.
    movu        %1, [srcq + %5 - 1]
    pshufb      %2, %1, Tm0
    pshufb      %1, Tm1
    pmaddubsw   %2, coef2
    pmaddubsw   %1, coef2
    phaddw      %2, %1
    pmulhrsw    %2, %3

    ; Pixels %5 + 8 .. %5 + 15.
    movu        %1, [srcq + %5 - 1 + 8]
    pshufb      %4, %1, Tm0
    pshufb      %1, Tm1
    pmaddubsw   %4, coef2
    pmaddubsw   %1, coef2
    phaddw      %4, %1
    pmulhrsw    %4, %3

    packuswb    %2, %4
    movu        [dstq + %5], %2
%endmacro
570 | ||
; FILTER_H4_w48  tmp0, acc, round, tmp1
; 4-tap horizontal filter for one row, 48 output pixels: three 16-pixel
; columns at offsets 0, 16 and 32.
%macro FILTER_H4_w48 4
%assign %%col 0
%rep 3
    FILTER_H4_w16o %1, %2, %3, %4, %%col
%assign %%col %%col + 16
%endrep
%endmacro
576 | ||
; FILTER_H4_w64  tmp0, acc, round, tmp1
; 4-tap horizontal filter for one row, 64 output pixels: four 16-pixel
; columns at offsets 0, 16, 32 and 48.
%macro FILTER_H4_w64 4
%assign %%col 0
%rep 4
    FILTER_H4_w16o %1, %2, %3, %4, %%col
%assign %%col %%col + 16
%endrep
%endmacro
583 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; IPFILTER_CHROMA width, height
; Emits the SSE4 4-tap horizontal filter for widths handled by the
; three-register row macros (FILTER_H4_w6 / w8 / w12).  One row is filtered
; per loop iteration; r5d counts the remaining rows.
;-----------------------------------------------------------------------------
%macro IPFILTER_CHROMA 2
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2                  ; pmulhrsw multiplier
%define Tm1         m3                  ; pshufb mask, windows 4-7
%define Tm0         m4                  ; pshufb mask, windows 0-3
%define coef2       m5                  ; four taps broadcast to every dword

    ; Constants.
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

    ; Selected filter taps.
    mov         r4d, r4m
%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    mov         r5d, %2                 ; rows left

.row:
    FILTER_H4_w%1 t0, t1, t2
    add         srcq, srcstrideq
    add         dstq, dststrideq
    dec         r5d
    jnz         .row

    RET
%endmacro

; Instantiations: width, height.
IPFILTER_CHROMA 6, 8
IPFILTER_CHROMA 8, 2
IPFILTER_CHROMA 8, 4
IPFILTER_CHROMA 8, 6
IPFILTER_CHROMA 8, 8
IPFILTER_CHROMA 8, 16
IPFILTER_CHROMA 8, 32
IPFILTER_CHROMA 12, 16

IPFILTER_CHROMA 6, 16
IPFILTER_CHROMA 8, 12
IPFILTER_CHROMA 8, 64
IPFILTER_CHROMA 12, 32
638 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; IPFILTER_CHROMA_W width, height
; Emits the SSE4 4-tap horizontal filter for widths handled by the
; four-register row macros (FILTER_H4_w16 / w24 / w32 / w48 / w64).  One row
; is filtered per loop iteration; r5d counts the remaining rows.
;-----------------------------------------------------------------------------
%macro IPFILTER_CHROMA_W 2
INIT_XMM sse4
cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
%define t0          m0
%define t1          m1
%define t2          m2                  ; pmulhrsw multiplier
%define t3          m3
%define Tm1         m4                  ; pshufb mask, windows 4-7
%define Tm0         m5                  ; pshufb mask, windows 0-3
%define coef2       m6                  ; four taps broadcast to every dword

    ; Constants.
    mova        t2, [tab_c_512]
    mova        Tm0, [tab_Tm]
    mova        Tm1, [tab_Tm + 16]

    ; Selected filter taps.
    mov         r4d, r4m
%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        coef2, [r5 + r4 * 4]
%else
    movd        coef2, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufd      coef2, coef2, 0

    mov         r5d, %2                 ; rows left

.row:
    FILTER_H4_w%1 t0, t1, t2, t3
    add         srcq, srcstrideq
    add         dstq, dststrideq
    dec         r5d
    jnz         .row

    RET
%endmacro

; Instantiations: width, height.
IPFILTER_CHROMA_W 16, 4
IPFILTER_CHROMA_W 16, 8
IPFILTER_CHROMA_W 16, 12
IPFILTER_CHROMA_W 16, 16
IPFILTER_CHROMA_W 16, 32
IPFILTER_CHROMA_W 32, 8
IPFILTER_CHROMA_W 32, 16
IPFILTER_CHROMA_W 32, 24
IPFILTER_CHROMA_W 24, 32
IPFILTER_CHROMA_W 32, 32

IPFILTER_CHROMA_W 16, 24
IPFILTER_CHROMA_W 16, 64
IPFILTER_CHROMA_W 32, 48
IPFILTER_CHROMA_W 24, 64
IPFILTER_CHROMA_W 32, 64

IPFILTER_CHROMA_W 64, 64
IPFILTER_CHROMA_W 64, 32
IPFILTER_CHROMA_W 64, 48
IPFILTER_CHROMA_W 48, 64
IPFILTER_CHROMA_W 64, 16
702 | ||
703 | ||
; FILTER_H8_W8  t0, t1, t2, t3, coef, c512, src [, dst]
; 8-tap horizontal filter for 8 output pixels of one row.
;   %1..%4 = scratch registers (the eight word sums are left in %2)
;   %5     = eight taps duplicated in both register halves
;   %6     = pmulhrsw multiplier (only read when dst is given)
;   %7     = memory operand for the 16 source bytes
;   %8     = optional destination; when present the sums are rounded,
;            packed to bytes and 8 bytes are stored there
%macro FILTER_H8_W8 7-8
    movu        %1, %7

    ; Pixels 0-3: two 8-byte windows per shuffled register.
    pshufb      %2, %1, [tab_Lm +  0]
    pshufb      %3, %1, [tab_Lm + 16]
    pmaddubsw   %2, %5
    pmaddubsw   %3, %5
    phaddw      %2, %3

    ; Pixels 4-7.
    pshufb      %4, %1, [tab_Lm + 32]
    pshufb      %1, [tab_Lm + 48]
    pmaddubsw   %4, %5
    pmaddubsw   %1, %5
    phaddw      %4, %1

    phaddw      %2, %4                  ; one word per output pixel
%if %0 > 7
    pmulhrsw    %2, %6
    packuswb    %2, %2
    movh        %8, %2
%endif
%endmacro
723 | ||
; FILTER_H8_W4  tmp, acc
; 8-tap horizontal filter for 4 output pixels of one row, read from
; [r0 - 3 + r5].
;   %1 = scratch register for the source bytes
;   %2 = result: the four word sums, repeated in both register halves
; Expects the taps in m3 and clobbers m7.
%macro FILTER_H8_W4 2
    movu        %1, [r0 - 3 + r5]
    pshufb      %2, %1, [tab_Lm]        ; windows for pixels 0-1
    pshufb      m7, %1, [tab_Lm + 16]   ; windows for pixels 2-3
    pmaddubsw   %2, m3
    pmaddubsw   m7, m3
    phaddw      %2, m7
    phaddw      %2, %2
%endmacro
733 | ||
;----------------------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;
; IPFILTER_LUMA width, height, pp|ps
;   pp : sums are rounded with pmulhrsw and stored as bytes.
;   ps : sums have the pw_2000 constant subtracted and are stored as words;
;        dstStride is doubled up front.  When isRowExt is non-zero the source
;        pointer is moved up three rows and seven extra rows are filtered.
; Register use: m3 = taps, m2 = pp multiplier / ps offset, r4d = rows left,
; r5 = column offset inside the current row.
;----------------------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA 3
INIT_XMM sse4
cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8

    ; Load the eight selected taps and duplicate them into both halves.
    mov         r4d, r4m
%ifdef PIC
    lea         r6, [tab_LumaCoeff]
    movh        m3, [r6 + r4 * 8]
%else
    movh        m3, [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq  m3, m3

    mov         r4d, %2                 ; rows left

%ifidn %3, pp
    mova        m2, [tab_c_512]
%else
    mova        m2, [pw_2000]
%endif

%ifidn %3, ps
    add         r3, r3                  ; destination rows hold words
    cmp         r5m, byte 0
    je          .row
    lea         r6, [r1 + r1 * 2]
    sub         r0, r6                  ; start three rows earlier
    add         r4d, 7                  ; and cover seven more rows
%endif

.row:
    xor         r5d, r5d                ; column offset = 0

    ; Full groups of eight pixels.
%rep %1 / 8
  %ifnidn %3, pp
    FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
    psubw       m1, m2
    movu        [r2 + 2 * r5], m1
  %else
    FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
  %endif
    add         r5, 8
%endrep

    ; Trailing group of four pixels (widths 4 and 12).
%if (%1 % 8) >= 4
    FILTER_H8_W4 m0, m1
  %ifnidn %3, pp
    psubw       m1, m2
    movh        [r2 + 2 * r5], m1
  %else
    pmulhrsw    m1, m2
    packuswb    m1, m1
    movd        [r2 + r5], m1
  %endif
%endif

    add         r0, r1
    add         r2, r3

    dec         r4d
    jnz         .row
    RET
%endmacro
799 | ||
800 | ||
; LUMA_H8_W4_ROW_AVX2  dst, src
; Filters one 4-pixel row for interp_8tap_horiz_pp_4x4 (AVX2).
;   %1 = destination ymm: two dword partial sums per pixel, pixels 0-1 in
;        the low lane and pixels 2-3 in the high lane
;   %2 = memory operand for the 16 source bytes
; Expects m0 = taps, m1 = shuffle masks, m2 = words of 1.
%macro LUMA_H8_W4_ROW_AVX2 2
    vbroadcasti128  %1, %2              ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
    pshufb          %1, m1
    pmaddubsw       %1, m0
    pmaddwd         %1, m2
%endmacro

;-----------------------------------------------------------------------------
; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; AVX2 version: all four rows are filtered without a loop.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal interp_8tap_horiz_pp_4x4, 4,6,6
    mov             r4d, r4m

%ifdef PIC
    lea             r5, [tab_LumaCoeff]
    vpbroadcastq    m0, [r5 + r4 * 8]
%else
    vpbroadcastq    m0, [tab_LumaCoeff + r4 * 8]
%endif

    ; register map
    ; m0 - interpolate coeff
    ; m1 - shuffle order table
    ; m2 - constant word 1
    mova            m1, [tab_Lm]
    vpbroadcastd    m2, [pw_1]

    sub             r0, 3               ; first tap sits three pixels left

    ; Row 0-1
    LUMA_H8_W4_ROW_AVX2 m3, [r0]
    LUMA_H8_W4_ROW_AVX2 m4, [r0 + r1]
    phaddd          m3, m4              ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]

    ; Row 2-3
    lea             r0, [r0 + r1 * 2]
    LUMA_H8_W4_ROW_AVX2 m4, [r0]
    LUMA_H8_W4_ROW_AVX2 m5, [r0 + r1]
    phaddd          m4, m5              ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]

    packssdw        m3, m4              ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
    pmulhrsw        m3, [pw_512]
    vextracti128    xm4, m3, 1
    packuswb        xm3, xm4            ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
    pshufb          xm3, [idct4_shuf1]  ; [row3 row1 row2 row0]

    ; One dword per destination row; r0 is reused for 3 * dstStride.
    lea             r0, [r3 + r3 * 2]
    movd            [r2], xm3
    pextrd          [r2 + r3], xm3, 2
    pextrd          [r2 + r3 * 2], xm3, 1
    pextrd          [r2 + r0], xm3, 3
    RET
856 | ||
857 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; SSE4 pixel-to-pixel instances built from IPFILTER_LUMA: the block sizes
; whose width is 4 or 12 (width, height, pp).
;--------------------------------------------------------------------------------------------------------------
IPFILTER_LUMA  4,  4, pp
IPFILTER_LUMA  4,  8, pp
IPFILTER_LUMA 12, 16, pp
IPFILTER_LUMA  4, 16, pp
865 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; IPFILTER_LUMA_PP_W8 width, height
; SSE4 8-tap horizontal pixel-to-pixel filter for widths that are multiples
; of 8.  The eight taps are split into a low and a high group of four so
; that the 4-byte window masks of tab_Tm can be reused.
; Register use: m0 = taps 0-3, m1 = taps 4-7, m2 = pmulhrsw multiplier,
; r5 = tab_Tm, r4d = rows left.
;--------------------------------------------------------------------------------------------------------------
%macro IPFILTER_LUMA_PP_W8 2
INIT_XMM sse4
cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
    mov         r4d, r4m

%ifdef PIC
    lea         r5, [tab_LumaCoeff]
    movh        m3, [r5 + r4 * 8]
%else
    movh        m3, [tab_LumaCoeff + r4 * 8]
%endif
    pshufd      m1, m3, 0x55            ; m1 = coeff-H
    pshufd      m0, m3, 0               ; m0 = coeff-L
    mova        m2, [pw_512]            ; m2 = 512
    lea         r5, [tab_Tm]            ; r5 = shuffle

    mov         r4d, %2                 ; rows left
.row:
%assign x 0
%rep %1 / 8
    movu        m3, [r0 - 3 + x]        ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]

    ; Pixels x+0 .. x+3: low taps on windows 0-3, high taps on windows 4-7.
    pshufb      m4, m3, [r5 + 0*16]     ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
    pshufb      m5, m3, [r5 + 1*16]     ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
    pshufb      m3, [r5 + 2*16]         ; m3 = [E D C B D C B A C B A 9 B A 9 8]
    pmaddubsw   m4, m0
    pmaddubsw   m6, m5, m1
    paddw       m4, m6

    ; Pixels x+4 .. x+7: low taps on windows 4-7, high taps on windows 8-11.
    pmaddubsw   m5, m0
    pmaddubsw   m3, m1
    paddw       m5, m3

    phaddw      m4, m5                  ; one word per output pixel
    pmulhrsw    m4, m2
    packuswb    m4, m4
    movh        [r2 + x], m4
%assign x x+8
%endrep

    add         r0, r1
    add         r2, r3

    dec         r4d
    jnz         .row
    RET
%endmacro

; Instantiations: width, height.
IPFILTER_LUMA_PP_W8 8, 4
IPFILTER_LUMA_PP_W8 8, 8
IPFILTER_LUMA_PP_W8 8, 16
IPFILTER_LUMA_PP_W8 8, 32
IPFILTER_LUMA_PP_W8 16, 4
IPFILTER_LUMA_PP_W8 16, 8
IPFILTER_LUMA_PP_W8 16, 12
IPFILTER_LUMA_PP_W8 16, 16
IPFILTER_LUMA_PP_W8 16, 32
IPFILTER_LUMA_PP_W8 16, 64
IPFILTER_LUMA_PP_W8 24, 32
IPFILTER_LUMA_PP_W8 32, 8
IPFILTER_LUMA_PP_W8 32, 16
IPFILTER_LUMA_PP_W8 32, 24
IPFILTER_LUMA_PP_W8 32, 32
IPFILTER_LUMA_PP_W8 32, 64
IPFILTER_LUMA_PP_W8 48, 64
IPFILTER_LUMA_PP_W8 64, 16
IPFILTER_LUMA_PP_W8 64, 32
IPFILTER_LUMA_PP_W8 64, 48
IPFILTER_LUMA_PP_W8 64, 64
935 | ||
;----------------------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;
; SSE4 pixel-to-short instances built from IPFILTER_LUMA (width, height, ps).
;----------------------------------------------------------------------------------------------------------------------------
IPFILTER_LUMA  4,  4, ps
IPFILTER_LUMA  8,  8, ps
IPFILTER_LUMA  8,  4, ps
IPFILTER_LUMA  4,  8, ps
IPFILTER_LUMA 16, 16, ps
IPFILTER_LUMA 16,  8, ps
IPFILTER_LUMA  8, 16, ps
IPFILTER_LUMA 16, 12, ps
IPFILTER_LUMA 12, 16, ps
IPFILTER_LUMA 16,  4, ps
IPFILTER_LUMA  4, 16, ps
IPFILTER_LUMA 32, 32, ps
IPFILTER_LUMA 32, 16, ps
IPFILTER_LUMA 16, 32, ps
IPFILTER_LUMA 32, 24, ps
IPFILTER_LUMA 24, 32, ps
IPFILTER_LUMA 32,  8, ps
IPFILTER_LUMA  8, 32, ps
IPFILTER_LUMA 64, 64, ps
IPFILTER_LUMA 64, 32, ps
IPFILTER_LUMA 32, 64, ps
IPFILTER_LUMA 64, 48, ps
IPFILTER_LUMA 48, 64, ps
IPFILTER_LUMA 64, 16, ps
IPFILTER_LUMA 16, 64, ps
964 | ||
965 | ;----------------------------------------------------------------------------- | |
966 | ; Interpolate HV | |
967 | ;----------------------------------------------------------------------------- | |
; FILTER_HV8_START: first step of the vertical pass over the intermediate
; rows at r0 (16 bytes per row), with the word coefficient rows at r5.
;   %1, %2   : receive rows off_src + 1 and off_src + 2; %1 is then
;              overwritten with the high-half products of rows (1,2),
;              while %2 keeps row off_src + 2 unmodified
;   %3, %4   : receive the low-half products of rows (0,1) and rows (1,2)
;   %5       : receives row off_src, then the high-half products of (0,1)
;   %6       : off_src, index of the first row to load
;   %7       : off_coeff, index of the 16-byte coefficient row to apply
; Adjacent rows are interleaved word-wise so that one pmaddwd applies a pair
; of taps to a pair of rows.
968 | %macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2] | |
969 | mova %5, [r0 + (%6 + 0) * 16] | |
970 | mova %1, [r0 + (%6 + 1) * 16] | |
971 | mova %2, [r0 + (%6 + 2) * 16] | |
972 | punpcklwd %3, %5, %1 | |
973 | punpckhwd %5, %1 | |
974 | pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0 | |
975 | pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1] | |
976 | punpcklwd %4, %1, %2 | |
977 | punpckhwd %1, %2 | |
978 | pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1 | |
979 | pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2] | |
980 | %endmacro ; FILTER_HV8_START | |
981 | ||
; FILTER_HV8_MID: accumulates one further tap pair into the running sums of
; two output rows.
;   %1       : receives row off_src + 1 and is left unmodified
;   %2       : on entry the previously loaded row; clobbered
;   %3, %5   : low / high dword sums of the first output row (accumulated)
;   %4, %6   : low / high dword sums of the second output row (accumulated)
;   %7, %8   : scratch; %8 first receives row off_src
;   %9       : off_src, index of the first intermediate row to load from r0
;   %10      : off_coeff, index of the 16-byte coefficient row at r5
; The first output row gets the pair (%2, row off_src), the second gets
; (row off_src, row off_src + 1).
982 | %macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6] | |
983 | mova %8, [r0 + (%9 + 0) * 16] | |
984 | mova %1, [r0 + (%9 + 1) * 16] | |
985 | punpcklwd %7, %2, %8 | |
986 | punpckhwd %2, %8 | |
987 | pmaddwd %7, [r5 + %10 * 16] | |
988 | pmaddwd %2, [r5 + %10 * 16] | |
989 | paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0 | |
990 | paddd %5, %2 ; R0 = H[0+1+2+3] | |
991 | punpcklwd %7, %8, %1 | |
992 | punpckhwd %8, %1 | |
993 | pmaddwd %7, [r5 + %10 * 16] | |
994 | pmaddwd %8, [r5 + %10 * 16] | |
995 | paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1 | |
996 | paddd %6, %8 ; R1 = H[1+2+3+4] | |
997 | %endmacro ; FILTER_HV8_MID | |
998 | ||
; FILTER_HV8_END: round, scale down and saturate two rows of dword sums.
;   %1 / %2 = low / high dword sums of the first row
;   %3 / %4 = low / high dword sums of the second row
;   Each sum gets the constant at tab_c_526336 added (defined elsewhere in this
;   file) and is shifted right arithmetically by 12.  The rows are then packed
;   to signed words and finally to unsigned bytes.
;   Result: %1 = first row in the low 8 bytes, second row in the high 8 bytes.
;           %3 is left holding the second row as packed words; %2 and %4 are
;           left holding their shifted sums.
%macro FILTER_HV8_END 4 ; output in [1, 3]
    paddd       %1, [tab_c_526336]
    psrad       %1, 12
    paddd       %2, [tab_c_526336]
    psrad       %2, 12
    packssdw    %1, %2

    paddd       %3, [tab_c_526336]
    psrad       %3, 12
    paddd       %4, [tab_c_526336]
    psrad       %4, 12
    packssdw    %3, %4

    ; TODO: would merging be better? Packing the rows separately keeps the
    ;       dependency chains short.
    packuswb    %1, %3
%endmacro ; FILTER_HV8_END
1015 | ||
;-----------------------------------------------------------------------------
; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
;
; Two passes over an 8x8 block:
;   1. Horizontal: 8 + 7 = 15 source rows, starting three rows above src, go
;      through FILTER_H8_W8; pw_2000 is subtracted from the result in m1 and
;      the row is stored as 16 bytes in a 15 * 16 byte stack buffer.
;   2. Vertical: every iteration reads nine buffered rows and writes two rows
;      of 8 pixels, using the four 16-byte coefficient vectors found at
;      tab_LumaCoeffV + idxY * 64.
;
; Note: the coef / stk_buf %defines below stay defined after this function.
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
%define coef        m7
%define stk_buf     rsp

    mov         r5d, r5m                    ; r5 = idxY
    mov         r4d, r4m                    ; r4 = idxX

    ; load the 8-byte coefficient entry for idxX and copy it to both qwords
%ifdef PIC
    lea         r6, [tab_LumaCoeff]
    movh        coef, [r6 + r4 * 8]
%else
    movh        coef, [tab_LumaCoeff + r4 * 8]
%endif
    punpcklqdq  coef, coef

    ; move to row -3
    lea         r6, [r1 + r1 * 2]
    sub         r0, r6

    mov         r4, stk_buf                 ; r4 = write cursor in the row buffer
    xor         r6, r6                      ; r6 = rows filtered so far

.hpass_row:
    FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3]
    psubw       m1, [pw_2000]
    mova        [r4], m1

    add         r4, 16
    add         r0, r1
    inc         r6
    cmp         r6, 8+7
    jnz         .hpass_row

    ; vertical pass: every xmm register is free again from here on

    ; r5 = tab_LumaCoeffV + idxY * 64
    shl         r5, 6
    lea         r6, [tab_LumaCoeffV]
    lea         r5, [r5 + r6]

    ; register mapping for the loop below
    ;   r0 - read cursor in the intermediate buffer
    ;   r5 - vertical coefficient vectors
    ;   r6 - number of row pairs written
    mov         r0, stk_buf
    xor         r6, r6

    ; TODO: this loop has more than 70 instructions, which is probably more
    ;       than the Intel loop decode cache can hold
.vpass_pair:
    FILTER_HV8_START    m1, m2, m3, m4, m0,             0, 0
    FILTER_HV8_MID      m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
    FILTER_HV8_MID      m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
    FILTER_HV8_MID      m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
    FILTER_HV8_END      m3, m0, m4, m1

    movh        [r2], m3                    ; first row of the pair
    movhps      [r2 + r3], m3               ; second row of the pair

    lea         r0, [r0 + 16 * 2]           ; advance two buffered rows
    lea         r2, [r2 + r3 * 2]

    inc         r6
    cmp         r6, 8/2
    jnz         .vpass_pair

    RET
1092 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter producing a 2x4 block of pixels.  Seven source rows
; are read (row 0 = src - srcStride); each load fetches 4 bytes per row even
; though only two output pixels per row are stored.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_2x4, 4, 6, 8

    sub         r0, r1                      ; r0 -> source row 0 (one row above src)
    mov         r4d, r4m                    ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m0, [tab_Cm]                ; m0 = the four taps, arranged by tab_Cm
    mova        m1, [tab_c_512]             ; m1 = pmulhrsw multiplier
    lea         r5, [r0 + 4 * r1]           ; r5 -> source row 4
    lea         r4, [r1 * 3]                ; r4 = 3 * srcStride

    movd        m2, [r0]                    ; source rows 0..3
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r4]

    ; output row 0 <- source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6
    pmaddubsw   m2, m0

    ; output row 1 <- source rows 1..4
    movd        m6, [r5]
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7
    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1
    pmulhrsw    m2, m1

    ; output row 2 <- source rows 2..5
    movd        m7, [r5 + r1]
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3
    pmaddubsw   m4, m0

    ; output row 3 <- source rows 3..6
    movd        m3, [r5 + 2 * r1]
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7
    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; low qword = row 2, high qword = row 3
    pmulhrsw    m4, m1
    packuswb    m2, m4                      ; dword i of m2 = output row i

    ; store only the first two bytes (one word) of every row
    pextrw      [r2], m2, 0
    pextrw      [r2 + r3], m2, 2
    lea         r2, [r2 + 2 * r3]
    pextrw      [r2], m2, 4
    pextrw      [r2 + r3], m2, 6

    RET
1164 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V4_W2_H4 width, height
;   Emits the vertical 4-tap pixel filter for a 2-pixel-wide block.
;   %1 is not used by the body (the width is fixed at 2); %2 is the block
;   height and has to be a positive multiple of 4 because every loop
;   iteration produces four rows.
;-----------------------------------------------------------------------------
%macro FILTER_V4_W2_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8

    sub         r0, r1                      ; r0 -> one row above src
    mov         r4d, r4m                    ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    mova        m1, [tab_c_512]             ; m1 = pmulhrsw multiplier
    pshufb      m0, [tab_Cm]                ; m0 = the four taps, arranged by tab_Cm

    lea         r5, [3 * r1]                ; r5 = 3 * srcStride
    mov         r4d, %2                     ; r4 = rows still to produce

.next4:
    movd        m2, [r0]                    ; source rows 0..3 of this group
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r5]

    ; output row 0 <- source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6
    pmaddubsw   m2, m0

    ; output row 1 <- source rows 1..4 (r0 moves on to source row 4)
    lea         r0, [r0 + 4 * r1]
    movd        m6, [r0]
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7
    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1
    pmulhrsw    m2, m1

    ; output row 2 <- source rows 2..5
    movd        m7, [r0 + r1]
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3
    pmaddubsw   m4, m0

    ; output row 3 <- source rows 3..6
    movd        m3, [r0 + 2 * r1]
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7
    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; low qword = row 2, high qword = row 3
    pmulhrsw    m4, m1
    packuswb    m2, m4                      ; dword i of m2 = output row i

    ; store only the first two bytes (one word) of every row
    pextrw      [r2], m2, 0
    pextrw      [r2 + r3], m2, 2
    lea         r2, [r2 + 2 * r3]
    pextrw      [r2], m2, 4
    pextrw      [r2 + r3], m2, 6
    lea         r2, [r2 + 2 * r3]

    sub         r4, 4
    jnz         .next4
    RET
%endmacro

FILTER_V4_W2_H4 2, 8

FILTER_V4_W2_H4 2, 16
1251 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter producing a 4x2 block of pixels from five source
; rows (row 0 = src - srcStride).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x2, 4, 6, 6

    sub         r0, r1                      ; r0 -> source row 0 (one row above src)
    mov         r4d, r4m                    ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                ; m0 = the four taps, arranged by tab_Cm
    lea         r5, [r0 + 2 * r1]           ; r5 -> source row 2

    movd        m2, [r0]                    ; source rows 0..3
    movd        m3, [r0 + r1]
    movd        m4, [r5]
    movd        m5, [r5 + r1]

    ; output row 0 <- source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m1, m4, m5
    punpcklbw   m2, m1
    pmaddubsw   m2, m0

    ; output row 1 <- source rows 1..4
    movd        m1, [r0 + 4 * r1]
    punpcklbw   m3, m4
    punpcklbw   m5, m1
    punpcklbw   m3, m5
    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1

    pmulhrsw    m2, [tab_c_512]
    packuswb    m2, m2                      ; dword 0 = row 0, dword 1 = row 1
    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1

    RET
1298 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter producing a 4x4 block of pixels from seven source
; rows (row 0 = src - srcStride).
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_pp_4x4, 4, 6, 8

    sub         r0, r1                      ; r0 -> source row 0 (one row above src)
    mov         r4d, r4m                    ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    mova        m1, [tab_c_512]             ; m1 = pmulhrsw multiplier
    pshufb      m0, [tab_Cm]                ; m0 = the four taps, arranged by tab_Cm
    lea         r4, [r1 * 3]                ; r4 = 3 * srcStride
    lea         r5, [r0 + 4 * r1]           ; r5 -> source row 4

    movd        m2, [r0]                    ; source rows 0..3
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r4]

    ; output row 0 <- source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6
    pmaddubsw   m2, m0

    ; output row 1 <- source rows 1..4
    movd        m6, [r5]
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7
    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1
    pmulhrsw    m2, m1

    ; output row 2 <- source rows 2..5
    movd        m7, [r5 + r1]
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3
    pmaddubsw   m4, m0

    ; output row 3 <- source rows 3..6
    movd        m3, [r5 + 2 * r1]
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7
    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; low qword = row 2, high qword = row 3
    pmulhrsw    m4, m1

    packuswb    m2, m4                      ; dword i of m2 = output row i
    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m2, 2
    pextrd      [r2 + r3], m2, 3

    RET
1371 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V4_W4_H4 width, height
;   Emits the vertical 4-tap pixel filter for a 4-pixel-wide block.
;   %1 only appears in the function name (the body always stores 4 pixels per
;   row); %2 is the block height and has to be a positive multiple of 4
;   because every loop iteration produces four rows.
;-----------------------------------------------------------------------------
%macro FILTER_V4_W4_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    sub         r0, r1                      ; r0 -> one row above src
    mov         r4d, r4m                    ; r4 = coeffIdx

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    mova        m1, [tab_c_512]             ; m1 = pmulhrsw multiplier
    pshufb      m0, [tab_Cm]                ; m0 = the four taps, arranged by tab_Cm

    lea         r5, [3 * r1]                ; r5 = 3 * srcStride
    mov         r4d, %2                     ; r4 = rows still to produce

.next4:
    movd        m2, [r0]                    ; source rows 0..3 of this group
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r5]

    ; output row 0 <- source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6
    pmaddubsw   m2, m0

    ; output row 1 <- source rows 1..4 (r0 moves on to source row 4)
    lea         r0, [r0 + 4 * r1]
    movd        m6, [r0]
    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7
    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1
    pmulhrsw    m2, m1

    ; output row 2 <- source rows 2..5
    movd        m7, [r0 + r1]
    punpcklbw   m4, m5
    punpcklbw   m3, m6, m7
    punpcklbw   m4, m3
    pmaddubsw   m4, m0

    ; output row 3 <- source rows 3..6
    movd        m3, [r0 + 2 * r1]
    punpcklbw   m5, m6
    punpcklbw   m7, m3
    punpcklbw   m5, m7
    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; low qword = row 2, high qword = row 3
    pmulhrsw    m4, m1
    packuswb    m2, m4                      ; dword i of m2 = output row i

    movd        [r2], m2
    pextrd      [r2 + r3], m2, 1
    lea         r2, [r2 + 2 * r3]
    pextrd      [r2], m2, 2
    pextrd      [r2 + r3], m2, 3
    lea         r2, [r2 + 2 * r3]

    sub         r4, 4
    jnz         .next4
    RET
%endmacro

FILTER_V4_W4_H4 4, 8
FILTER_V4_W4_H4 4, 16

FILTER_V4_W4_H4 4, 32
1459 | ||
; FILTER_V4_W8_ROW rowA, rowB, rowC, rowD   (private helper of FILTER_V4_W8_H2..H5)
;   Produces one 8-pixel output row from four consecutive source rows held as
;   raw bytes in the low qword of the given registers.
;     %1      in : first source row; out: the 8 output bytes (present in both qwords)
;     %2..%4  in : the next three source rows, left unmodified
;   Fixed registers: m6 = coefficient vector applied to rows A/B,
;                    m5 = coefficient vector applied to rows C/D,
;                    m4 = pmulhrsw multiplier, m7 = scratch.
%macro FILTER_V4_W8_ROW 4
    punpcklbw   %1, %2
    punpcklbw   m7, %3, %4

    pmaddubsw   %1, m6
    pmaddubsw   m7, m5

    paddw       %1, m7

    pmulhrsw    %1, m4
    packuswb    %1, %1
%endmacro

; The four variants below differ only in which register holds the oldest row
; of the four-row window; the output is left in that register.

%macro FILTER_V4_W8_H2 0                    ; window m1, m2, m3, m0 -> m1
    FILTER_V4_W8_ROW m1, m2, m3, m0
%endmacro

%macro FILTER_V4_W8_H3 0                    ; window m2, m3, m0, m1 -> m2
    FILTER_V4_W8_ROW m2, m3, m0, m1
%endmacro

%macro FILTER_V4_W8_H4 0                    ; window m3, m0, m1, m2 -> m3
    FILTER_V4_W8_ROW m3, m0, m1, m2
%endmacro

%macro FILTER_V4_W8_H5 0                    ; window m0, m1, m2, m3 -> m0
    FILTER_V4_W8_ROW m0, m1, m2, m3
%endmacro
1511 | ||
; FILTER_V4_W8_8x2 width, height
;   Opens interp_4tap_vert_pp_%1x%2 through FILTER_V4_W8 (which writes output
;   row 0) and adds output row 1.  The function is left open: the caller of
;   this macro supplies further rows and/or the final RET.
%macro FILTER_V4_W8_8x2 2
    FILTER_V4_W8 %1, %2

    ; output row 1 <- source rows 1..4 (m1, m2, m3 plus the newly loaded m0)
    movq        m0, [r0 + 4 * r1]
    FILTER_V4_W8_H2
    movh        [r2 + r3], m1
%endmacro
1520 | ||
; FILTER_V4_W8_8x4 width, height
;   Extends FILTER_V4_W8_8x2 with output rows 2 and 3.  r6 is left pointing at
;   source row 4 (r0 + 4 * srcStride) and r5 at output row 2.  The function is
;   left open for the caller to finish.
%macro FILTER_V4_W8_8x4 2
    FILTER_V4_W8_8x2 %1, %2

    ; output row 2 <- source rows 2..5
    lea         r6, [r0 + 4 * r1]
    movq        m1, [r6 + r1]
    FILTER_V4_W8_H3
    movh        [r2 + 2 * r3], m2

    ; output row 3 <- source rows 3..6
    lea         r5, [r2 + 2 * r3]
    movq        m2, [r6 + 2 * r1]
    FILTER_V4_W8_H4
    movh        [r5 + r3], m3
%endmacro
1539 | ||
; FILTER_V4_W8_8x6 width, height
;   Extends FILTER_V4_W8_8x4 with output rows 4 and 5.  The function is left
;   open for the caller to finish with RET.
%macro FILTER_V4_W8_8x6 2
    FILTER_V4_W8_8x4 %1, %2

    ; output row 4 <- source rows 4..7 (r6 moves on to source row 6)
    lea         r6, [r6 + 2 * r1]
    movq        m3, [r6 + r1]
    FILTER_V4_W8_H5
    movh        [r2 + 4 * r3], m0

    ; output row 5 <- source rows 5..8
    lea         r5, [r2 + 4 * r3]
    movq        m0, [r0 + 8 * r1]
    FILTER_V4_W8_H2
    movh        [r5 + r3], m1
%endmacro
1558 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V4_W8 width, height
;   Opens the function and writes output row 0 only; it contains no RET.
;   State left behind for the FILTER_V4_W8_8xN continuation macros:
;     r0 = src - srcStride (source row 0)
;     m1, m2, m3 = raw source rows 1, 2, 3
;     m6 / m5    = coefficient vectors shuffled by tab_Vm / tab_Vm + 16
;     m4         = pmulhrsw multiplier (tab_c_512)
;     m0         = already stored output row 0 (free for reuse)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W8 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; r0 -> source row 0 (one row above src)

    ; coefficient vectors
%ifdef PIC
    lea         r6, [tab_ChromaCoeff]
    movd        m5, [r6 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif
    pshufb      m6, m5, [tab_Vm]            ; applied to the row 0/1 pair
    pshufb      m5, [tab_Vm + 16]           ; applied to the row 2/3 pair

    ; source rows 0..3
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    lea         r5, [r0 + 2 * r1]
    movq        m3, [r5 + r1]

    punpcklbw   m0, m1
    punpcklbw   m4, m2, m3

    pmaddubsw   m0, m6
    pmaddubsw   m4, m5
    paddw       m0, m4

    mova        m4, [tab_c_512]

    pmulhrsw    m0, m4
    packuswb    m0, m0
    movh        [r2], m0
%endmacro
1599 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Each FILTER_V4_W8_8xN expansion opens its function (through FILTER_V4_W8)
; and emits the straight-line body; the RET that follows closes it.
;-----------------------------------------------------------------------------
FILTER_V4_W8_8x2 8, 2
    RET

FILTER_V4_W8_8x4 8, 4
    RET

FILTER_V4_W8_8x6 8, 6
    RET
1620 | ||
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter writing a 4x2 block of 16-bit intermediates: the
; filter sums have pw_2000 subtracted and are stored without rounding or
; packing.  Five source rows are read (row 0 = src - srcStride).
;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_ps_4x2, 4, 6, 6

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; r0 -> source row 0 (one row above src)
    ; dstStride counts int16_t elements: double it to get bytes.  The add is
    ; done at full register width so that an intptr_t stride keeps its sign
    ; and upper bits (a 32-bit add would zero-extend on x86-64).
    add         r3, r3

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                ; m0 = the four taps, arranged by tab_Cm

    movd        m2, [r0]                    ; source rows 0..3
    movd        m3, [r0 + r1]
    lea         r5, [r0 + 2 * r1]
    movd        m4, [r5]
    movd        m5, [r5 + r1]

    ; output row 0 <- source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m1, m4, m5
    punpcklbw   m2, m1

    pmaddubsw   m2, m0

    ; output row 1 <- source rows 1..4
    movd        m1, [r0 + 4 * r1]

    punpcklbw   m3, m4
    punpcklbw   m5, m1
    punpcklbw   m3, m5

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1

    psubw       m2, [pw_2000]
    movh        [r2], m2
    movhps      [r2 + r3], m2

    RET
1667 | ||
;-------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter writing a 4x4 block of 16-bit intermediates: the
; filter sums have pw_2000 subtracted and are stored without rounding or
; packing.  Seven source rows are read (row 0 = src - srcStride).
;-------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal interp_4tap_vert_ps_4x4, 4, 6, 7

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; r0 -> source row 0 (one row above src)
    ; dstStride counts int16_t elements: double it to get bytes.  The add is
    ; done at full register width so that an intptr_t stride keeps its sign
    ; and upper bits (a 32-bit add would zero-extend on x86-64).
    add         r3, r3

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                ; m0 = the four taps, arranged by tab_Cm

    lea         r4, [r1 * 3]                ; r4 = 3 * srcStride
    lea         r5, [r0 + 4 * r1]           ; r5 -> source row 4

    movd        m2, [r0]                    ; source rows 0..3
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r4]

    ; output row 0 <- source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    ; output row 1 <- source rows 1..4
    movd        m6, [r5]

    punpcklbw   m3, m4
    punpcklbw   m1, m5, m6
    punpcklbw   m3, m1

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1

    mova        m1, [pw_2000]

    psubw       m2, m1
    movh        [r2], m2
    movhps      [r2 + r3], m2

    ; output row 2 <- source rows 2..5
    movd        m2, [r5 + r1]

    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    ; output row 3 <- source rows 3..6
    movd        m3, [r5 + 2 * r1]

    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; low qword = row 2, high qword = row 3

    psubw       m4, m1
    lea         r2, [r2 + 2 * r3]
    movh        [r2], m4
    movhps      [r2 + r3], m4

    RET
1741 | ||
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V_PS_W4_H4 width, height
;   Emits the vertical 4-tap filter that writes 16-bit intermediates (filter
;   sum minus pw_2000, no rounding or packing) for a 4-sample-wide block.
;   %1 only appears in the function name; %2 is the block height and has to
;   be a positive multiple of 4 (the loop runs %2/4 times, four rows each).
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W4_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; r0 -> one row above src
    ; dstStride counts int16_t elements: double it to get bytes.  The add is
    ; done at full register width so that an intptr_t stride keeps its sign
    ; and upper bits (a 32-bit add would zero-extend on x86-64).
    add         r3, r3

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m0, [tab_Cm]                ; m0 = the four taps, arranged by tab_Cm

    mova        m1, [pw_2000]               ; m1 = offset subtracted from every sum

    mov         r4d, %2/4                   ; r4 = groups of four rows left
    lea         r5, [3 * r1]                ; r5 = 3 * srcStride

.loop:
    movd        m2, [r0]                    ; source rows 0..3 of this group
    movd        m3, [r0 + r1]
    movd        m4, [r0 + 2 * r1]
    movd        m5, [r0 + r5]

    ; output row 0 <- source rows 0..3
    punpcklbw   m2, m3
    punpcklbw   m6, m4, m5
    punpcklbw   m2, m6

    pmaddubsw   m2, m0

    ; output row 1 <- source rows 1..4 (r0 moves on to source row 4)
    lea         r0, [r0 + 4 * r1]
    movd        m6, [r0]

    punpcklbw   m3, m4
    punpcklbw   m7, m5, m6
    punpcklbw   m3, m7

    pmaddubsw   m3, m0

    phaddw      m2, m3                      ; low qword = row 0, high qword = row 1

    psubw       m2, m1
    movh        [r2], m2
    movhps      [r2 + r3], m2

    ; output row 2 <- source rows 2..5
    movd        m2, [r0 + r1]

    punpcklbw   m4, m5
    punpcklbw   m3, m6, m2
    punpcklbw   m4, m3

    pmaddubsw   m4, m0

    ; output row 3 <- source rows 3..6
    movd        m3, [r0 + 2 * r1]

    punpcklbw   m5, m6
    punpcklbw   m2, m3
    punpcklbw   m5, m2

    pmaddubsw   m5, m0

    phaddw      m4, m5                      ; low qword = row 2, high qword = row 3

    psubw       m4, m1
    lea         r2, [r2 + 2 * r3]
    movh        [r2], m4
    movhps      [r2 + r3], m4

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W4_H4 4, 8
FILTER_V_PS_W4_H4 4, 16

FILTER_V_PS_W4_H4 4, 32
1828 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V_PS_W8_H8_H16_H2 width, height
;   Emits the vertical 4-tap filter that writes 16-bit intermediates (filter
;   sum minus pw_2000) for an 8-sample-wide block, two rows per iteration.
;   %1 only appears in the function name; %2 is the block height and has to
;   be a positive multiple of 2 (the loop runs %2/2 times).
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W8_H8_H16_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; r0 -> one row above src
    ; dstStride counts int16_t elements: double it to get bytes.  The add is
    ; done at full register width so that an intptr_t stride keeps its sign
    ; and upper bits (a 32-bit add would zero-extend on x86-64).
    add         r3, r3

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]            ; m6: applied to the first row pair of a window
    pshufb      m5, [tab_Vm + 16]           ; m5: applied to the second row pair
    mova        m4, [pw_2000]               ; m4 = offset subtracted from every sum

    mov         r4d, %2/2                   ; r4 = row pairs left
    lea         r5, [3 * r1]                ; r5 = 3 * srcStride

.loopH:
    movq        m0, [r0]                    ; source rows 0..3 of this pair
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                      ; rows 0/1
    punpcklbw   m1, m2                      ; rows 1/2
    punpcklbw   m2, m3                      ; rows 2/3

    ; output row 0 <- source rows 0..3
    pmaddubsw   m0, m6
    pmaddubsw   m2, m5

    paddw       m0, m2

    psubw       m0, m4
    movu        [r2], m0

    ; output row 1 <- source rows 1..4
    movq        m0, [r0 + 4 * r1]

    punpcklbw   m3, m0                      ; rows 3/4

    pmaddubsw   m1, m6
    pmaddubsw   m3, m5

    paddw       m1, m3
    psubw       m1, m4

    movu        [r2 + r3], m1

    lea         r0, [r0 + 2 * r1]
    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loopH

    RET
%endmacro

FILTER_V_PS_W8_H8_H16_H2 8, 2
FILTER_V_PS_W8_H8_H16_H2 8, 4
FILTER_V_PS_W8_H8_H16_H2 8, 6

FILTER_V_PS_W8_H8_H16_H2 8, 12
FILTER_V_PS_W8_H8_H16_H2 8, 64
1899 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V_PS_W8_H8_H16_H32 width, height
;   Emits the vertical 4-tap filter that writes 16-bit intermediates (filter
;   sum minus pw_2000) for an 8-sample-wide block, four rows per iteration.
;   %1 only appears in the function name; %2 is the block height and has to
;   be a positive multiple of 4 (the loop runs %2/4 times).
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W8_H8_H16_H32 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; r0 -> one row above src
    ; dstStride counts int16_t elements: double it to get bytes.  The add is
    ; done at full register width so that an intptr_t stride keeps its sign
    ; and upper bits (a 32-bit add would zero-extend on x86-64).
    add         r3, r3

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]            ; m6: applied to the first row pair of a window
    pshufb      m5, [tab_Vm + 16]           ; m5: applied to the second row pair
    mova        m4, [pw_2000]               ; m4 = offset subtracted from every sum

    mov         r4d, %2/4                   ; r4 = groups of four rows left
    lea         r5, [3 * r1]                ; r5 = 3 * srcStride

.loop:
    movq        m0, [r0]                    ; source rows 0..3 of this group
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                      ; rows 0/1
    punpcklbw   m1, m2                      ; rows 1/2
    punpcklbw   m2, m3                      ; rows 2/3

    ; output row 0 <- source rows 0..3 (m2 is kept for output row 2)
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5

    paddw       m0, m7

    psubw       m0, m4
    movu        [r2], m0

    ; output row 1 <- source rows 1..4 (r0 moves on to source row 4)
    lea         r0, [r0 + 4 * r1]
    movq        m0, [r0]

    punpcklbw   m3, m0                      ; rows 3/4

    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5

    paddw       m1, m7

    psubw       m1, m4
    movu        [r2 + r3], m1

    ; output row 2 <- source rows 2..5
    movq        m1, [r0 + r1]

    punpcklbw   m0, m1                      ; rows 4/5

    pmaddubsw   m2, m6
    pmaddubsw   m0, m5

    paddw       m2, m0

    psubw       m2, m4
    lea         r2, [r2 + 2 * r3]
    movu        [r2], m2

    ; output row 3 <- source rows 3..6
    movq        m2, [r0 + 2 * r1]

    punpcklbw   m1, m2                      ; rows 5/6

    pmaddubsw   m3, m6
    pmaddubsw   m1, m5

    paddw       m3, m1
    psubw       m3, m4

    movu        [r2 + r3], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W8_H8_H16_H32 8, 8
FILTER_V_PS_W8_H8_H16_H32 8, 16
FILTER_V_PS_W8_H8_H16_H32 8, 32
1991 | ||
;------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V_PS_W6 width, height
;   Emits the vertical 4-tap filter that writes 16-bit intermediates (filter
;   sum minus pw_2000) for a 6-sample-wide block, four rows per iteration.
;   %1 is not used by the body (the width is fixed at 6); %2 is the block
;   height and has to be a positive multiple of 4 (the loop runs %2/4 times).
;   Eight source bytes are loaded per row; only six results are stored
;   (8 bytes with movh plus 4 bytes with movd).
;------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W6 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; r0 -> one row above src
    ; dstStride counts int16_t elements: double it to get bytes.  The add is
    ; done at full register width so that an intptr_t stride keeps its sign
    ; and upper bits (a 32-bit add would zero-extend on x86-64).
    add         r3, r3

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]            ; m6: applied to the first row pair of a window
    pshufb      m5, [tab_Vm + 16]           ; m5: applied to the second row pair
    mova        m4, [pw_2000]               ; m4 = offset subtracted from every sum
    lea         r5, [3 * r1]                ; r5 = 3 * srcStride
    mov         r4d, %2/4                   ; r4 = groups of four rows left

.loop:
    movq        m0, [r0]                    ; source rows 0..3 of this group
    movq        m1, [r0 + r1]
    movq        m2, [r0 + 2 * r1]
    movq        m3, [r0 + r5]

    punpcklbw   m0, m1                      ; rows 0/1
    punpcklbw   m1, m2                      ; rows 1/2
    punpcklbw   m2, m3                      ; rows 2/3

    ; output row 0 <- source rows 0..3 (m2 is kept for output row 2)
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5

    paddw       m0, m7
    psubw       m0, m4

    movh        [r2], m0                    ; samples 0..3
    pshufd      m0, m0, 2                   ; bring dword 2 (samples 4, 5) down
    movd        [r2 + 8], m0

    ; output row 1 <- source rows 1..4 (r0 moves on to source row 4)
    lea         r0, [r0 + 4 * r1]
    movq        m0, [r0]
    punpcklbw   m3, m0                      ; rows 3/4

    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5

    paddw       m1, m7
    psubw       m1, m4

    movh        [r2 + r3], m1
    pshufd      m1, m1, 2
    movd        [r2 + r3 + 8], m1

    ; output row 2 <- source rows 2..5
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1                      ; rows 4/5

    pmaddubsw   m2, m6
    pmaddubsw   m0, m5

    paddw       m2, m0
    psubw       m2, m4

    lea         r2, [r2 + 2 * r3]
    movh        [r2], m2
    pshufd      m2, m2, 2
    movd        [r2 + 8], m2

    ; output row 3 <- source rows 3..6
    movq        m2, [r0 + 2 * r1]
    punpcklbw   m1, m2                      ; rows 5/6

    pmaddubsw   m3, m6
    pmaddubsw   m1, m5

    paddw       m3, m1
    psubw       m3, m4

    movh        [r2 + r3], m3
    pshufd      m3, m3, 2
    movd        [r2 + r3 + 8], m3

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W6 6, 8
FILTER_V_PS_W6 6, 16
2086 | ||
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_12x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V_PS_W12 width, height
;   Emits the vertical 4-tap filter that writes 16-bit intermediates (filter
;   sum minus pw_2000) for a 12-sample-wide block, two rows per iteration.
;   %1 is not used by the body (the width is fixed at 12); %2 is the block
;   height and has to be a positive multiple of 2 (the loop runs %2/2 times).
;   Sixteen source bytes are loaded per row; twelve results are stored
;   (16 bytes with movu plus 8 bytes with movh).
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W12 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; r0 -> one row above src
    ; dstStride counts int16_t elements: double it to get bytes.  The add is
    ; done at full register width so that an intptr_t stride keeps its sign
    ; and upper bits (a 32-bit add would zero-extend on x86-64).
    add         r3, r3

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]            ; m1: applied to the first row pair of a window
    pshufb      m0, [tab_Vm + 16]           ; m0: applied to the second row pair

    mov         r4d, %2/2                   ; r4 = row pairs left

.loop:
    movu        m2, [r0]                    ; source rows 0 and 1 of this pair
    movu        m3, [r0 + r1]

    punpcklbw   m4, m2, m3                  ; rows 0/1, columns 0..7
    punpckhbw   m2, m3                      ; rows 0/1, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r0, [r0 + 2 * r1]           ; r0 moves on to source row 2
    movu        m5, [r0]                    ; source rows 2 and 3
    movu        m7, [r0 + r1]

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [pw_2000]               ; m6 doubles as scratch above, so it is reloaded here

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4                    ; output row 0, samples 0..7
    movh        [r2 + 16], m2               ; output row 0, samples 8..11

    punpcklbw   m4, m3, m5                  ; rows 1/2, columns 0..7
    punpckhbw   m3, m5                      ; rows 1/2, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m2, [r0 + 2 * r1]           ; source row 4

    punpcklbw   m5, m7, m2                  ; rows 3/4, columns 0..7
    punpckhbw   m7, m2                      ; rows 3/4, columns 8..15

    pmaddubsw   m5, m0
    pmaddubsw   m7, m0

    paddw       m4, m5
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4               ; output row 1, samples 0..7
    movh        [r2 + r3 + 16], m3          ; output row 1, samples 8..11

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W12 12, 16
FILTER_V_PS_W12 12, 32
2172 | ||
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; FILTER_V_PS_W16 width, height
;   Emits the vertical 4-tap filter that writes 16-bit intermediates (filter
;   sum minus pw_2000) for a 16-sample-wide block, two rows per iteration.
;   %1 only appears in the function name (the body always processes 16
;   columns); %2 is the block height and has to be a positive multiple of 2
;   (the loop runs %2/2 times).
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W16 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; r4 = coeffIdx
    sub         r0, r1                      ; r0 -> one row above src
    ; dstStride counts int16_t elements: double it to get bytes.  The add is
    ; done at full register width so that an intptr_t stride keeps its sign
    ; and upper bits (a 32-bit add would zero-extend on x86-64).
    add         r3, r3

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]            ; m1: applied to the first row pair of a window
    pshufb      m0, [tab_Vm + 16]           ; m0: applied to the second row pair
    mov         r4d, %2/2                   ; r4 = row pairs left

.loop:
    movu        m2, [r0]                    ; source rows 0 and 1 of this pair
    movu        m3, [r0 + r1]

    punpcklbw   m4, m2, m3                  ; rows 0/1, columns 0..7
    punpckhbw   m2, m3                      ; rows 0/1, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r0, [r0 + 2 * r1]           ; r0 moves on to source row 2
    movu        m5, [r0]                    ; source rows 2 and 3
    movu        m7, [r0 + r1]

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [pw_2000]               ; m6 doubles as scratch above, so it is reloaded here

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4                    ; output row 0, samples 0..7
    movu        [r2 + 16], m2               ; output row 0, samples 8..15

    punpcklbw   m4, m3, m5                  ; rows 1/2, columns 0..7
    punpckhbw   m3, m5                      ; rows 1/2, columns 8..15

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m5, [r0 + 2 * r1]           ; source row 4

    punpcklbw   m2, m7, m5                  ; rows 3/4, columns 0..7
    punpckhbw   m7, m5                      ; rows 3/4, columns 8..15

    pmaddubsw   m2, m0
    pmaddubsw   m7, m0

    paddw       m4, m2
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4               ; output row 1, samples 0..7
    movu        [r2 + r3 + 16], m3          ; output row 1, samples 8..15

    lea         r2, [r2 + 2 * r3]

    dec         r4d
    jnz         .loop
    RET
%endmacro
2254 | ||
2255 | FILTER_V_PS_W16 16, 4 | |
2256 | FILTER_V_PS_W16 16, 8 | |
2257 | FILTER_V_PS_W16 16, 12 | |
2258 | FILTER_V_PS_W16 16, 16 | |
2259 | FILTER_V_PS_W16 16, 32 | |
2260 | ||
2261 | FILTER_V_PS_W16 16, 24 | |
2262 | FILTER_V_PS_W16 16, 64 | |
2263 | ||
;--------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_24x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter over a 24-pixel-wide block, storing 16-bit results ("ps").
;   %1 = block width  (unused by the body; the symbol name hard-codes 24)
;   %2 = block height (two output rows are produced per iteration, so it must be even)
;
; Each iteration handles columns 0..15 with full 16-byte loads and then columns 16..23
; with 8-byte loads.
;
; Register use:
;   r0 = src, rewound by one row so that the 4 taps cover rows -1 .. +2 of each output row
;   r1 = srcStride in bytes
;   r2 = dst
;   r3 = dstStride, converted from int16_t elements to bytes
;   r4 = coeffIdx on entry, then the row-pair counter
;   r5 = table base under PIC, then src + 2 rows inside the loop
;   m1 / m0 = coefficient bytes shuffled by [tab_Vm] / [tab_Vm + 16]; m1 multiplies the
;             interleaved rows (k, k+1), m0 the rows (k+2, k+3)
;   m6 = scratch, then [pw_2000] (defined elsewhere in this file) which is subtracted from every sum
;--------------------------------------------------------------------------------------------------------------
%macro FILTER_V4_PS_W24 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; start one row above the block
    add         r3, r3                      ; elements -> bytes; full register width so the
                                            ; whole intptr_t stride is doubled, not just its low half

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]           ; 4 coefficient bytes for this coeffIdx
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2/2                   ; number of row pairs

.loop:
    ; ---- columns 0..15 ----
    movu        m2, [r0]                    ; row k
    movu        m3, [r0 + r1]               ; row k+1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]           ; r5 -> row k+2 (r0 itself is kept for columns 16..23)

    movu        m5, [r5]                    ; row k+2
    movu        m7, [r5 + r1]               ; row k+3

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [pw_2000]               ; m6 stops being scratch here

    psubw       m4, m6
    psubw       m2, m6

    movu        [r2], m4                    ; first output row, columns 0..7
    movu        [r2 + 16], m2               ; first output row, columns 8..15

    punpcklbw   m4, m3, m5                  ; rows (k+1, k+2) for the second output row
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m2, [r5 + 2 * r1]           ; row k+4

    punpcklbw   m5, m7, m2                  ; rows (k+3, k+4)
    punpckhbw   m7, m2

    pmaddubsw   m5, m0
    pmaddubsw   m7, m0

    paddw       m4, m5
    paddw       m3, m7

    psubw       m4, m6
    psubw       m3, m6

    movu        [r2 + r3], m4               ; second output row, columns 0..7
    movu        [r2 + r3 + 16], m3          ; second output row, columns 8..15

    ; ---- columns 16..23 ----
    movq        m2, [r0 + 16]               ; row k
    movq        m3, [r0 + r1 + 16]          ; row k+1
    movq        m4, [r5 + 16]               ; row k+2
    movq        m5, [r5 + r1 + 16]          ; row k+3

    punpcklbw   m2, m3                      ; rows (k, k+1)
    punpcklbw   m7, m4, m5                  ; rows (k+2, k+3); m3..m5 stay intact for the next row

    pmaddubsw   m2, m1
    pmaddubsw   m7, m0

    paddw       m2, m7
    psubw       m2, m6

    movu        [r2 + 32], m2               ; first output row, columns 16..23

    movq        m2, [r5 + 2 * r1 + 16]      ; row k+4

    punpcklbw   m3, m4                      ; rows (k+1, k+2)
    punpcklbw   m5, m2                      ; rows (k+3, k+4)

    pmaddubsw   m3, m1
    pmaddubsw   m5, m0

    paddw       m3, m5
    psubw       m3, m6

    movu        [r2 + r3 + 32], m3          ; second output row, columns 16..23

    mov         r0, r5                      ; src += 2 rows
    lea         r2, [r2 + 2 * r3]           ; dst += 2 rows

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V4_PS_W24 24, 32

FILTER_V4_PS_W24 24, 64
2381 | ||
;---------------------------------------------------------------------------------------------------------------
; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter over a 32-pixel-wide block, storing 16-bit results ("ps").
;   %1 = block width  (only used to build the symbol name; the body always handles 32 columns)
;   %2 = block height (one output row per iteration)
;
; Each iteration produces one output row as two independent 16-column halves.
;
; Register use:
;   r0 = src, rewound by one row so that the 4 taps cover rows -1 .. +2 of the output row
;   r1 = srcStride in bytes
;   r2 = dst
;   r3 = dstStride, converted from int16_t elements to bytes
;   r4 = coeffIdx on entry, then the row counter
;   r5 = table base under PIC, then src + 2 rows inside the loop
;   m1 / m0 = coefficient bytes shuffled by [tab_Vm] / [tab_Vm + 16]; m1 multiplies the
;             interleaved rows (k, k+1), m0 the rows (k+2, k+3)
;   m7 = [pw_2000] (defined elsewhere in this file), subtracted from every sum
;---------------------------------------------------------------------------------------------------------------
%macro FILTER_V_PS_W32 2
INIT_XMM sse4
cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; start one row above the block
    add         r3, r3                      ; elements -> bytes; full register width so the
                                            ; whole intptr_t stride is doubled, not just its low half

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]           ; 4 coefficient bytes for this coeffIdx
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mova        m7, [pw_2000]               ; loop-invariant offset

    mov         r4d, %2                     ; number of rows

.loop:
    ; ---- columns 0..15 ----
    movu        m2, [r0]                    ; row k
    movu        m3, [r0 + r1]               ; row k+1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]           ; r5 -> row k+2
    movu        m3, [r5]                    ; row k+2
    movu        m5, [r5 + r1]               ; row k+3

    punpcklbw   m6, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m6, m0
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    psubw       m4, m7
    psubw       m2, m7

    movu        [r2], m4                    ; columns 0..7
    movu        [r2 + 16], m2               ; columns 8..15

    ; ---- columns 16..31 ----
    movu        m2, [r0 + 16]
    movu        m3, [r0 + r1 + 16]

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    movu        m3, [r5 + 16]
    movu        m5, [r5 + r1 + 16]

    punpcklbw   m6, m3, m5
    punpckhbw   m3, m5

    pmaddubsw   m6, m0
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    psubw       m4, m7
    psubw       m2, m7

    movu        [r2 + 32], m4               ; columns 16..23
    movu        [r2 + 48], m2               ; columns 24..31

    lea         r0, [r0 + r1]               ; src += 1 row
    lea         r2, [r2 + r3]               ; dst += 1 row

    dec         r4d
    jnz         .loop
    RET
%endmacro

FILTER_V_PS_W32 32, 8
FILTER_V_PS_W32 32, 16
FILTER_V_PS_W32 32, 24
FILTER_V_PS_W32 32, 32

FILTER_V_PS_W32 32, 48
FILTER_V_PS_W32 32, 64
2478 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter, 8 pixels wide, pixel output ("pp"). Four output rows
; are produced per loop iteration, so %2 must be a multiple of 4.
;   %1 = block width (only used in the symbol name), %2 = block height
;
;   r0 = src - srcStride     r1 = srcStride     r2 = dst     r3 = dstStride
;   r4 = coeffIdx, then rows remaining          r5 = 3 * srcStride
;   m6 / m5 = coefficient bytes shuffled by [tab_Vm] / [tab_Vm + 16]; m6 multiplies
;             the interleaved rows (k, k+1), m5 the rows (k+2, k+3)
;   m4 = [tab_c_512] (defined elsewhere; multiplier for the pmulhrsw scaling step)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W8_H8_H16_H32 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; first tap sits one row above the block
    mova        m4, [tab_c_512]

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]
    pshufb      m5, [tab_Vm + 16]

    mov         r4d, %2                     ; rows remaining
    lea         r5, [r1 * 3]

.loopRows:
    movq        m0, [r0]                    ; row k
    movq        m1, [r0 + r1]               ; row k+1
    movq        m2, [r0 + 2 * r1]           ; row k+2
    movq        m3, [r0 + r5]               ; row k+3

    punpcklbw   m0, m1                      ; m0 = rows (k,   k+1)
    punpcklbw   m1, m2                      ; m1 = rows (k+1, k+2)
    punpcklbw   m2, m3                      ; m2 = rows (k+2, k+3)

    ; output row 0 = (k, k+1) * m6 + (k+2, k+3) * m5
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5
    paddw       m0, m7
    pmulhrsw    m0, m4
    packuswb    m0, m0
    movh        [r2], m0

    lea         r0, [r0 + 4 * r1]           ; r0 -> row k+4
    movq        m0, [r0]
    punpcklbw   m3, m0                      ; m3 = rows (k+3, k+4)

    ; output row 1 = (k+1, k+2) * m6 + (k+3, k+4) * m5
    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5
    paddw       m1, m7
    pmulhrsw    m1, m4
    packuswb    m1, m1
    movh        [r2 + r3], m1

    movq        m1, [r0 + r1]               ; row k+5
    punpcklbw   m0, m1                      ; m0 = rows (k+4, k+5)

    ; output row 2 = (k+2, k+3) * m6 + (k+4, k+5) * m5
    pmaddubsw   m2, m6
    pmaddubsw   m0, m5
    paddw       m2, m0
    pmulhrsw    m2, m4

    movq        m7, [r0 + 2 * r1]           ; row k+6
    punpcklbw   m1, m7                      ; m1 = rows (k+5, k+6)

    ; output row 3 = (k+3, k+4) * m6 + (k+5, k+6) * m5
    pmaddubsw   m3, m6
    pmaddubsw   m1, m5
    paddw       m3, m1
    pmulhrsw    m3, m4

    packuswb    m2, m3                      ; low half = row 2, high half = row 3

    lea         r2, [r2 + 2 * r3]
    movh        [r2], m2
    movhps      [r2 + r3], m2

    lea         r2, [r2 + 2 * r3]           ; dst += 4 rows in total

    sub         r4d, 4
    jnz         .loopRows
    RET
%endmacro

FILTER_V4_W8_H8_H16_H32 8, 8
FILTER_V4_W8_H8_H16_H32 8, 16
FILTER_V4_W8_H8_H16_H32 8, 32

FILTER_V4_W8_H8_H16_H32 8, 12
FILTER_V4_W8_H8_H16_H32 8, 64
2575 | ||
2576 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter, 6 pixels wide, pixel output ("pp"). Four output rows
; are produced per loop iteration, so %2 must be a multiple of 4.
;   %1 = block width (unused by the body; the symbol name hard-codes 6), %2 = block height
;
; Eight columns are computed from 8-byte loads; only six bytes are stored per
; row (a 4-byte movd followed by a 2-byte pextrw).
;
;   r0 = src - srcStride     r1 = srcStride     r2 = dst     r3 = dstStride
;   r4 = coeffIdx, then rows remaining          r5 = 3 * srcStride
;   m6 / m5 = coefficient bytes shuffled by [tab_Vm] / [tab_Vm + 16]; m6 multiplies
;             the interleaved rows (k, k+1), m5 the rows (k+2, k+3)
;   m4 = [tab_c_512] (defined elsewhere; multiplier for the pmulhrsw scaling step)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W6_H4 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; first tap sits one row above the block
    mova        m4, [tab_c_512]

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m5, [r5 + r4 * 4]
%else
    movd        m5, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m6, m5, [tab_Vm]
    pshufb      m5, [tab_Vm + 16]

    lea         r5, [3 * r1]
    mov         r4d, %2                     ; rows remaining

.loopRows:
    movq        m0, [r0]                    ; row k
    movq        m1, [r0 + r1]               ; row k+1
    movq        m2, [r0 + 2 * r1]           ; row k+2
    movq        m3, [r0 + r5]               ; row k+3

    punpcklbw   m0, m1                      ; m0 = rows (k,   k+1)
    punpcklbw   m1, m2                      ; m1 = rows (k+1, k+2)
    punpcklbw   m2, m3                      ; m2 = rows (k+2, k+3)

    ; output row 0
    pmaddubsw   m0, m6
    pmaddubsw   m7, m2, m5
    paddw       m0, m7
    pmulhrsw    m0, m4
    packuswb    m0, m0
    movd        [r2], m0                    ; columns 0..3
    pextrw      [r2 + 4], m0, 2             ; columns 4..5

    lea         r0, [r0 + 4 * r1]           ; r0 -> row k+4
    movq        m0, [r0]
    punpcklbw   m3, m0                      ; m3 = rows (k+3, k+4)

    ; output row 1
    pmaddubsw   m1, m6
    pmaddubsw   m7, m3, m5
    paddw       m1, m7
    pmulhrsw    m1, m4
    packuswb    m1, m1
    movd        [r2 + r3], m1
    pextrw      [r2 + r3 + 4], m1, 2

    movq        m1, [r0 + r1]               ; row k+5
    punpcklbw   m7, m0, m1                  ; m7 = rows (k+4, k+5)

    ; output row 2
    pmaddubsw   m2, m6
    pmaddubsw   m7, m5
    paddw       m2, m7
    pmulhrsw    m2, m4
    packuswb    m2, m2
    lea         r2, [r2 + 2 * r3]
    movd        [r2], m2
    pextrw      [r2 + 4], m2, 2

    movq        m2, [r0 + 2 * r1]           ; row k+6
    punpcklbw   m1, m2                      ; m1 = rows (k+5, k+6)

    ; output row 3
    pmaddubsw   m3, m6
    pmaddubsw   m1, m5
    paddw       m3, m1
    pmulhrsw    m3, m4
    packuswb    m3, m3
    movd        [r2 + r3], m3
    pextrw      [r2 + r3 + 4], m3, 2

    lea         r2, [r2 + 2 * r3]           ; dst += 4 rows in total

    sub         r4d, 4
    jnz         .loopRows
    RET
%endmacro

FILTER_V4_W6_H4 6, 8

FILTER_V4_W6_H4 6, 16
2674 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter, 12 pixels wide, pixel output ("pp"). Two output rows
; are produced per loop iteration, so %2 must be even.
;   %1 = block width (unused by the body; the symbol name hard-codes 12), %2 = block height
;
; Source rows are fetched with 16-byte loads and all 16 columns are filtered;
; only 12 bytes are stored per row (an 8-byte movh followed by a 4-byte pextrd).
;
;   r0 = src - srcStride     r1 = srcStride     r2 = dst     r3 = dstStride
;   r4 = coeffIdx, then rows remaining
;   m1 / m0 = coefficient bytes shuffled by [tab_Vm] / [tab_Vm + 16]; m1 multiplies
;             the interleaved rows (k, k+1), m0 the rows (k+2, k+3)
;   m6 = scratch, then [tab_c_512] (defined elsewhere; multiplier for pmulhrsw)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W12_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; first tap sits one row above the block

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2                     ; rows remaining

.loopRowPair:
    movu        m2, [r0]                    ; row k
    movu        m3, [r0 + r1]               ; row k+1

    punpcklbw   m4, m2, m3                  ; low  8 columns of rows (k, k+1)
    pmaddubsw   m4, m1
    punpckhbw   m2, m3                      ; high 8 columns of rows (k, k+1)
    pmaddubsw   m2, m1

    lea         r0, [r0 + 2 * r1]           ; r0 -> row k+2
    movu        m5, [r0]                    ; row k+2
    movu        m7, [r0 + r1]               ; row k+3

    punpckhbw   m6, m5, m7                  ; high 8 columns of rows (k+2, k+3)
    pmaddubsw   m6, m0
    paddw       m2, m6

    punpcklbw   m6, m5, m7                  ; low 8 columns of rows (k+2, k+3)
    pmaddubsw   m6, m0
    paddw       m4, m6

    mova        m6, [tab_c_512]             ; m6 stops being scratch here

    pmulhrsw    m4, m6
    pmulhrsw    m2, m6
    packuswb    m4, m2

    movh        [r2], m4                    ; columns 0..7
    pextrd      [r2 + 8], m4, 2             ; columns 8..11

    ; second output row: rows (k+1, k+2) and (k+3, k+4)
    punpcklbw   m4, m3, m5
    pmaddubsw   m4, m1
    punpckhbw   m3, m5
    pmaddubsw   m3, m1

    movu        m5, [r0 + 2 * r1]           ; row k+4

    punpcklbw   m2, m7, m5
    pmaddubsw   m2, m0
    punpckhbw   m7, m5
    pmaddubsw   m7, m0

    paddw       m4, m2
    paddw       m3, m7

    pmulhrsw    m4, m6
    pmulhrsw    m3, m6
    packuswb    m4, m3

    movh        [r2 + r3], m4
    pextrd      [r2 + r3 + 8], m4, 2

    lea         r2, [r2 + 2 * r3]           ; dst += 2 rows

    sub         r4d, 2
    jnz         .loopRowPair
    RET
%endmacro

FILTER_V4_W12_H2 12, 16

FILTER_V4_W12_H2 12, 32
2764 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter, 16 pixels wide, pixel output ("pp"). Two output rows
; are produced per loop iteration, so %2 must be even.
;   %1 = block width (unused by the body; the symbol name hard-codes 16), %2 = block height
;
;   r0 = src - srcStride     r1 = srcStride     r2 = dst     r3 = dstStride
;   r4 = coeffIdx, then the row-pair counter
;   m1 / m0 = coefficient bytes shuffled by [tab_Vm] / [tab_Vm + 16]; m1 multiplies
;             the interleaved rows (k, k+1), m0 the rows (k+2, k+3)
;   m7 = scratch, then [tab_c_512] (defined elsewhere; multiplier for pmulhrsw)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W16_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; first tap sits one row above the block

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2/2                   ; number of row pairs

.loopRowPair:
    movu        m2, [r0]                    ; row k
    movu        m3, [r0 + r1]               ; row k+1

    punpcklbw   m4, m2, m3                  ; low  8 columns of rows (k, k+1)
    pmaddubsw   m4, m1
    punpckhbw   m2, m3                      ; high 8 columns of rows (k, k+1)
    pmaddubsw   m2, m1

    lea         r0, [r0 + 2 * r1]           ; r0 -> row k+2
    movu        m5, [r0]                    ; row k+2
    movu        m6, [r0 + r1]               ; row k+3

    punpcklbw   m7, m5, m6                  ; low 8 columns of rows (k+2, k+3)
    pmaddubsw   m7, m0
    paddw       m4, m7

    punpckhbw   m7, m5, m6                  ; high 8 columns of rows (k+2, k+3)
    pmaddubsw   m7, m0
    paddw       m2, m7

    mova        m7, [tab_c_512]             ; m7 stops being scratch here

    pmulhrsw    m4, m7
    pmulhrsw    m2, m7
    packuswb    m4, m2
    movu        [r2], m4                    ; first output row

    ; second output row: rows (k+1, k+2) and (k+3, k+4)
    punpcklbw   m4, m3, m5
    pmaddubsw   m4, m1
    punpckhbw   m3, m5
    pmaddubsw   m3, m1

    movu        m5, [r0 + 2 * r1]           ; row k+4

    punpcklbw   m2, m6, m5
    pmaddubsw   m2, m0
    punpckhbw   m6, m5
    pmaddubsw   m6, m0

    paddw       m4, m2
    paddw       m3, m6

    pmulhrsw    m4, m7
    pmulhrsw    m3, m7
    packuswb    m4, m3
    movu        [r2 + r3], m4               ; second output row

    lea         r2, [r2 + 2 * r3]           ; dst += 2 rows

    dec         r4d
    jnz         .loopRowPair
    RET
%endmacro

FILTER_V4_W16_H2 16, 4
FILTER_V4_W16_H2 16, 8
FILTER_V4_W16_H2 16, 12
FILTER_V4_W16_H2 16, 16
FILTER_V4_W16_H2 16, 32

FILTER_V4_W16_H2 16, 24
FILTER_V4_W16_H2 16, 64
2857 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter, 24 pixels wide, pixel output ("pp"). Two output rows
; are produced per loop iteration, so %2 must be even.
;   %1 = block width (unused by the body; the symbol name hard-codes 24), %2 = block height
;
; Each iteration handles columns 0..15 with 16-byte loads and then columns
; 16..23 with 8-byte loads.
;
;   r0 = src - srcStride     r1 = srcStride     r2 = dst     r3 = dstStride
;   r4 = coeffIdx, then rows remaining
;   r5 = table base under PIC, then src + 2 rows inside the loop
;   m1 / m0 = coefficient bytes shuffled by [tab_Vm] / [tab_Vm + 16]; m1 multiplies
;             the interleaved rows (k, k+1), m0 the rows (k+2, k+3)
;   m6 = scratch, then [tab_c_512] (defined elsewhere; multiplier for pmulhrsw)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W24 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; first tap sits one row above the block

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2                     ; rows remaining

.loop:
    ; ---- columns 0..15 ----
    movu        m2, [r0]                    ; row k
    movu        m3, [r0 + r1]               ; row k+1

    punpcklbw   m4, m2, m3
    punpckhbw   m2, m3

    pmaddubsw   m4, m1
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]           ; r5 -> row k+2 (r0 is kept for columns 16..23)
    movu        m5, [r5]                    ; row k+2
    movu        m7, [r5 + r1]               ; row k+3

    punpcklbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m4, m6

    punpckhbw   m6, m5, m7
    pmaddubsw   m6, m0
    paddw       m2, m6

    mova        m6, [tab_c_512]             ; m6 stops being scratch here

    pmulhrsw    m4, m6
    pmulhrsw    m2, m6

    packuswb    m4, m2

    movu        [r2], m4                    ; first output row, columns 0..15

    punpcklbw   m4, m3, m5                  ; rows (k+1, k+2) for the second output row
    punpckhbw   m3, m5

    pmaddubsw   m4, m1
    pmaddubsw   m3, m1

    movu        m2, [r5 + 2 * r1]           ; row k+4

    punpcklbw   m5, m7, m2                  ; rows (k+3, k+4)
    punpckhbw   m7, m2

    pmaddubsw   m5, m0
    pmaddubsw   m7, m0

    paddw       m4, m5
    paddw       m3, m7

    pmulhrsw    m4, m6
    pmulhrsw    m3, m6

    packuswb    m4, m3

    movu        [r2 + r3], m4               ; second output row, columns 0..15

    ; ---- columns 16..23 ----
    ; Rows k+1 .. k+3 are loaded once and shared by both output rows; m7 takes the
    ; (k+2, k+3) interleave so that m3, m4 and m5 keep the raw rows.
    movq        m2, [r0 + 16]               ; row k
    movq        m3, [r0 + r1 + 16]          ; row k+1
    movq        m4, [r5 + 16]               ; row k+2
    movq        m5, [r5 + r1 + 16]          ; row k+3

    punpcklbw   m2, m3                      ; rows (k,   k+1)
    punpcklbw   m7, m4, m5                  ; rows (k+2, k+3)

    pmaddubsw   m2, m1
    pmaddubsw   m7, m0

    paddw       m2, m7

    pmulhrsw    m2, m6                      ; first output row, still 16-bit

    movq        m7, [r5 + 2 * r1 + 16]      ; row k+4

    punpcklbw   m3, m4                      ; rows (k+1, k+2)
    punpcklbw   m5, m7                      ; rows (k+3, k+4)

    pmaddubsw   m3, m1
    pmaddubsw   m5, m0

    paddw       m3, m5

    pmulhrsw    m3, m6
    packuswb    m2, m3                      ; low half = first row, high half = second row

    movh        [r2 + 16], m2
    movhps      [r2 + r3 + 16], m2

    mov         r0, r5                      ; src += 2 rows
    lea         r2, [r2 + 2 * r3]           ; dst += 2 rows

    sub         r4, 2
    jnz         .loop
    RET
%endmacro

FILTER_V4_W24 24, 32

FILTER_V4_W24 24, 64
2980 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter, 32 pixels wide, pixel output ("pp"). One output row is
; produced per loop iteration, as two independent 16-column halves.
;   %1 = block width (only used in the symbol name), %2 = block height
;
;   r0 = src - srcStride     r1 = srcStride     r2 = dst     r3 = dstStride
;   r4 = coeffIdx, then the row counter
;   r5 = table base under PIC, then src + 2 rows inside the loop
;   m1 / m0 = coefficient bytes shuffled by [tab_Vm] / [tab_Vm + 16]; m1 multiplies
;             the interleaved rows (k, k+1), m0 the rows (k+2, k+3)
;   m7 = [tab_c_512] (defined elsewhere; multiplier for the pmulhrsw scaling step)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W32 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; first tap sits one row above the block
    mova        m7, [tab_c_512]             ; loop-invariant multiplier

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2                     ; number of rows

.loopRow:
    lea         r5, [r0 + 2 * r1]           ; r5 -> row k+2

    ; ---- columns 0..15 ----
    movu        m2, [r0]                    ; row k
    movu        m3, [r0 + r1]               ; row k+1

    punpcklbw   m4, m2, m3
    pmaddubsw   m4, m1
    punpckhbw   m2, m3
    pmaddubsw   m2, m1

    movu        m3, [r5]                    ; row k+2
    movu        m5, [r5 + r1]               ; row k+3

    punpcklbw   m6, m3, m5
    pmaddubsw   m6, m0
    punpckhbw   m3, m5
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    pmulhrsw    m4, m7
    pmulhrsw    m2, m7
    packuswb    m4, m2

    movu        [r2], m4

    ; ---- columns 16..31 ----
    movu        m2, [r0 + 16]
    movu        m3, [r0 + r1 + 16]

    punpcklbw   m4, m2, m3
    pmaddubsw   m4, m1
    punpckhbw   m2, m3
    pmaddubsw   m2, m1

    movu        m3, [r5 + 16]
    movu        m5, [r5 + r1 + 16]

    punpcklbw   m6, m3, m5
    pmaddubsw   m6, m0
    punpckhbw   m3, m5
    pmaddubsw   m3, m0

    paddw       m4, m6
    paddw       m2, m3

    pmulhrsw    m4, m7
    pmulhrsw    m2, m7
    packuswb    m4, m2

    movu        [r2 + 16], m4

    add         r0, r1                      ; src += 1 row
    add         r2, r3                      ; dst += 1 row

    dec         r4d
    jnz         .loopRow
    RET
%endmacro

FILTER_V4_W32 32, 8
FILTER_V4_W32 32, 16
FILTER_V4_W32 32, 24
FILTER_V4_W32 32, 32

FILTER_V4_W32 32, 48
FILTER_V4_W32 32, 64
3078 | ||
3079 | ||
;-----------------------------------------------------------------------------
; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;
; Vertical 4-tap filter, pixel output ("pp"), for widths that are a multiple of
; 16. The outer loop walks row pairs (%2 must be even); the inner loop walks the
; row pair in 16-column strips (%1 / 16 strips).
;   %1 = block width, %2 = block height
;
;   r0 = src - srcStride     r1 = srcStride     r2 = dst     r3 = dstStride
;   r4 = coeffIdx, then the row-pair counter
;   r5 = table base under PIC, then src + 2 rows inside the loop
;   r6 = strip counter
;   m1 / m0 = coefficient bytes shuffled by [tab_Vm] / [tab_Vm + 16]; m1 multiplies
;             the interleaved rows (k, k+1), m0 the rows (k+2, k+3)
;   m7 = scratch, then [tab_c_512] (defined elsewhere; multiplier for pmulhrsw)
;-----------------------------------------------------------------------------
%macro FILTER_V4_W16n_H2 2
INIT_XMM sse4
cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8

    mov         r4d, r4m                    ; coeffIdx
    sub         r0, r1                      ; first tap sits one row above the block

%ifdef PIC
    lea         r5, [tab_ChromaCoeff]
    movd        m0, [r5 + r4 * 4]
%else
    movd        m0, [tab_ChromaCoeff + r4 * 4]
%endif

    pshufb      m1, m0, [tab_Vm]
    pshufb      m0, [tab_Vm + 16]

    mov         r4d, %2/2                   ; number of row pairs

.loopRowPair:
    mov         r6d, %1/16                  ; number of 16-column strips

.loopStrip:
    movu        m2, [r0]                    ; row k
    movu        m3, [r0 + r1]               ; row k+1

    punpcklbw   m4, m2, m3                  ; low  8 columns of rows (k, k+1)
    pmaddubsw   m4, m1
    punpckhbw   m2, m3                      ; high 8 columns of rows (k, k+1)
    pmaddubsw   m2, m1

    lea         r5, [r0 + 2 * r1]           ; r5 -> row k+2
    movu        m5, [r5]                    ; row k+2
    movu        m6, [r5 + r1]               ; row k+3

    punpcklbw   m7, m5, m6                  ; low 8 columns of rows (k+2, k+3)
    pmaddubsw   m7, m0
    paddw       m4, m7

    punpckhbw   m7, m5, m6                  ; high 8 columns of rows (k+2, k+3)
    pmaddubsw   m7, m0
    paddw       m2, m7

    mova        m7, [tab_c_512]             ; m7 stops being scratch here

    pmulhrsw    m4, m7
    pmulhrsw    m2, m7
    packuswb    m4, m2
    movu        [r2], m4                    ; first output row of this strip

    ; second output row: rows (k+1, k+2) and (k+3, k+4)
    punpcklbw   m4, m3, m5
    pmaddubsw   m4, m1
    punpckhbw   m3, m5
    pmaddubsw   m3, m1

    movu        m5, [r5 + 2 * r1]           ; row k+4

    punpcklbw   m2, m6, m5
    pmaddubsw   m2, m0
    punpckhbw   m6, m5
    pmaddubsw   m6, m0

    paddw       m4, m2
    paddw       m3, m6

    pmulhrsw    m4, m7
    pmulhrsw    m3, m7
    packuswb    m4, m3
    movu        [r2 + r3], m4               ; second output row of this strip

    add         r0, 16                      ; next strip
    add         r2, 16
    dec         r6d
    jnz         .loopStrip

    ; undo the %1 bytes of horizontal travel and step down two rows
    lea         r0, [r0 + r1 * 2 - %1]
    lea         r2, [r2 + r3 * 2 - %1]

    dec         r4d
    jnz         .loopRowPair
    RET
%endmacro

FILTER_V4_W16n_H2 64, 64
FILTER_V4_W16n_H2 64, 32
FILTER_V4_W16n_H2 64, 48
FILTER_V4_W16n_H2 48, 64
FILTER_V4_W16n_H2 64, 16
3180 | ||
3181 | ||
;-----------------------------------------------------------------------------
; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
;
; Converts a block of 8-bit pixels to 16-bit values. Every pixel byte is
; interleaved with a byte from [tab_c_128] and the pair is run through
; pmaddubsw with [tab_c_64_n64] (both tables are defined elsewhere; going by
; their names this yields pixel * 64 - 128 * 64 -- confirm against the tables).
;
; Work is done in tiles of 8 columns x 4 rows. When fewer than 8 columns remain
; in a row group, only 4 columns (8 bytes) are stored for that last tile.
; height is consumed 4 rows at a time and the loop exits only on exactly zero.
; The destination row pitch is fixed at FENC_STRIDE int16_t elements.
;
;   r0 = src     r1 = srcStride     r2 = dst
;   r3 = width   r4 = height (rows remaining)
;   r5 = column offset within the row group     r6 = src + column offset
;   m4 = [tab_c_128]     m5 = [tab_c_64_n64]
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal luma_p2s, 3, 7, 6

    ; width and height are not auto-loaded (only 3 arguments requested above)
    mov         r3d, r3m
    mov         r4d, r4m

    ; conversion constants
    mova        m4, [tab_c_128]
    mova        m5, [tab_c_64_n64]

.loopH:
    xor         r5d, r5d                    ; column offset = 0

.loopW:
    lea         r6, [r0 + r5]               ; r6 -> current tile, row 0

    movh        m0, [r6]                    ; row 0
    punpcklbw   m0, m4
    pmaddubsw   m0, m5

    movh        m1, [r6 + r1]               ; row 1
    punpcklbw   m1, m4
    pmaddubsw   m1, m5

    movh        m2, [r6 + r1 * 2]           ; row 2
    punpcklbw   m2, m4
    pmaddubsw   m2, m5

    lea         r6, [r6 + r1 * 2]
    movh        m3, [r6 + r1]               ; row 3
    punpcklbw   m3, m4
    pmaddubsw   m3, m5

    add         r5, 8                       ; r5 = columns covered once this tile is stored
    cmp         r5, r3
    jg          .width4                     ; tile runs past width: store 4 columns only

    ; full 8-column tile; the -16 compensates for r5 having been advanced already
    movu        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
    movu        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
    movu        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
    movu        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
    jl          .loopW                      ; flags still come from the cmp: columns remain
    jmp         .nextH                      ; r5 == width: row group finished

.width4:
    movh        [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
    movh        [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
    movh        [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
    movh        [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3

.nextH:
    lea         r0, [r0 + r1 * 4]           ; src += 4 rows
    add         r2, FENC_STRIDE * 8         ; dst += 4 rows of FENC_STRIDE int16_t each

    sub         r4d, 4
    jnz         .loopH

    RET
3243 | ||
; PROCESS_LUMA_W4_4R
; 8-tap vertical filter core for a 4-pixel-wide column, producing 4 output rows.
;   in:       r0 = first source row of the tap window, r1 = source stride,
;             r6 = base of four 16-byte coefficient vectors (tap pairs 0/1, 2/3, 4/5, 6/7)
;   out:      m4 = 16-bit sums for output rows 1-2 (low / high half)
;             m5 = 16-bit sums for output rows 3-4 (low / high half)
;   clobbers: m0, m1, m2; r0 is left advanced by 8 source rows
; In the comments "[a b]" means source rows a and b interleaved byte-wise and
; "[a+b]" means the pmaddubsw pair-sum of those rows.
%macro PROCESS_LUMA_W4_4R 0
    ; source rows 0..2
    movd        m0, [r0]
    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2 = [0 1]

    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1 = [1 2]
    punpcklqdq  m2, m1                      ; m2 = [0 1 | 1 2]
    pmaddubsw   m4, m2, [r6 + 0 * 16]       ; m4 = [0+1 | 1+2]

    ; source rows 3..4
    movd        m1, [r0 + r1]
    punpcklbw   m5, m0, m1                  ; m5 = [2 3]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1 = [3 4]
    punpcklqdq  m5, m1                      ; m5 = [2 3 | 3 4]
    pmaddubsw   m2, m5, [r6 + 1 * 16]       ; m2 = [2+3 | 3+4] with tap pair 1
    pmaddubsw   m5, [r6 + 0 * 16]           ; m5 = [2+3 | 3+4] with tap pair 0 (rows 3-4 start)
    paddw       m4, m2                      ; m4 = [0+1+2+3 | 1+2+3+4]            rows 1-2

    ; source rows 5..6
    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2 = [4 5]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1 = [5 6]
    punpcklqdq  m2, m1                      ; m2 = [4 5 | 5 6]
    pmaddubsw   m1, m2, [r6 + 2 * 16]       ; m1 = [4+5 | 5+6] with tap pair 2
    pmaddubsw   m2, [r6 + 1 * 16]           ; m2 = [4+5 | 5+6] with tap pair 1
    paddw       m4, m1                      ; m4 = [0+..+5 | 1+..+6]              rows 1-2
    paddw       m5, m2                      ; m5 = [2+3+4+5 | 3+4+5+6]            rows 3-4

    ; source rows 7..8
    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2 = [6 7]
    lea         r0, [r0 + 2 * r1]
    movd        m0, [r0]
    punpcklbw   m1, m0                      ; m1 = [7 8]
    punpcklqdq  m2, m1                      ; m2 = [6 7 | 7 8]
    pmaddubsw   m1, m2, [r6 + 3 * 16]       ; m1 = [6+7 | 7+8] with tap pair 3
    pmaddubsw   m2, [r6 + 2 * 16]           ; m2 = [6+7 | 7+8] with tap pair 2
    paddw       m4, m1                      ; m4 = [0+..+7 | 1+..+8]              rows 1-2 done
    paddw       m5, m2                      ; m5 = [2+..+7 | 3+..+8]              rows 3-4

    ; source rows 9..10 (r0 is not advanced any further)
    movd        m1, [r0 + r1]
    punpcklbw   m2, m0, m1                  ; m2 = [8 9]
    movd        m0, [r0 + 2 * r1]
    punpcklbw   m1, m0                      ; m1 = [9 10]
    punpcklqdq  m2, m1                      ; m2 = [8 9 | 9 10]
    pmaddubsw   m2, [r6 + 3 * 16]           ; m2 = [8+9 | 9+10] with tap pair 3
    paddw       m5, m2                      ; m5 = [2+..+9 | 3+..+10]             rows 3-4 done
%endmacro
3295 | ||
; PROCESS_LUMA_W8_4R
; 8-tap vertical filter core for an 8-pixel-wide column, producing 4 output rows.
;   in:       r0 = first source row of the tap window, r1 = source stride,
;             r6 = base of four 16-byte coefficient vectors (tap pairs 0/1, 2/3, 4/5, 6/7)
;   out:      m7, m6, m5, m4 = 16-bit sums for output rows 1, 2, 3, 4
;   clobbers: m0, m1, m2; r0 is left advanced by 8 source rows
; In the comments "[a+b]" means the pmaddubsw pair-sum of interleaved source rows a and b.
%macro PROCESS_LUMA_W8_4R 0
    ; source rows 0, 1
    movq        m0, [r0]
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m7, m0, [r6 + 0 * 16]       ; m7 = [0+1]                      row 1

    ; source row 2
    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m6, m1, [r6 + 0 * 16]       ; m6 = [1+2]                      row 2

    ; source row 3
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m5, m0, [r6 + 0 * 16]       ; m5 = [2+3]                      row 3
    pmaddubsw   m0, [r6 + 1 * 16]
    paddw       m7, m0                      ; m7 = [0+1+2+3]                  row 1

    ; source row 4
    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m4, m1, [r6 + 0 * 16]       ; m4 = [3+4]                      row 4
    pmaddubsw   m1, [r6 + 1 * 16]
    paddw       m6, m1                      ; m6 = [1+2+3+4]                  row 2

    ; source row 5
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m2, m0, [r6 + 1 * 16]
    pmaddubsw   m0, [r6 + 2 * 16]
    paddw       m5, m2                      ; m5 = [2+3+4+5]                  row 3
    paddw       m7, m0                      ; m7 = [0+1+2+3+4+5]              row 1

    ; source row 6
    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m2, m1, [r6 + 1 * 16]
    pmaddubsw   m1, [r6 + 2 * 16]
    paddw       m4, m2                      ; m4 = [3+4+5+6]                  row 4
    paddw       m6, m1                      ; m6 = [1+2+3+4+5+6]              row 2

    ; source row 7
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m2, m0, [r6 + 2 * 16]
    pmaddubsw   m0, [r6 + 3 * 16]
    paddw       m5, m2                      ; m5 = [2+3+4+5+6+7]              row 3
    paddw       m7, m0                      ; m7 = [0+1+2+3+4+5+6+7]          row 1 done

    ; source row 8
    lea         r0, [r0 + 2 * r1]
    movq        m0, [r0]
    punpcklbw   m1, m0
    pmaddubsw   m2, m1, [r6 + 2 * 16]
    pmaddubsw   m1, [r6 + 3 * 16]
    paddw       m4, m2                      ; m4 = [3+4+5+6+7+8]              row 4
    paddw       m6, m1                      ; m6 = [1+2+3+4+5+6+7+8]          row 2 done

    ; source row 9
    movq        m1, [r0 + r1]
    punpcklbw   m0, m1
    pmaddubsw   m0, [r6 + 3 * 16]
    paddw       m5, m0                      ; m5 = [2+3+4+5+6+7+8+9]          row 3 done

    ; source row 10 (r0 is not advanced any further)
    movq        m0, [r0 + 2 * r1]
    punpcklbw   m1, m0
    pmaddubsw   m1, [r6 + 3 * 16]
    paddw       m4, m1                      ; m4 = [3+4+5+6+7+8+9+10]         row 4 done
%endmacro
3360 | ||
3361 | ;------------------------------------------------------------------------------------------------------------- | |
3362 | ; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; Generates one 8-tap vertical luma filter function for a 4-pixel-wide block.
;   %1 = block width (only used in the function name), %2 = block height (processed four rows at a time),
;   %3 = variant: pp (8-bit output) or ps (16-bit output).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx and later the row-group counter,
;   r5 = scratch / 4 * srcStride, r6 = selected coefficient set, m3 = per-variant constant, m4/m5 = filtered rows.
; PROCESS_LUMA_W4_4R is defined outside this section. NOTE(review): it appears to leave r0 eight rows further
;   down, which is why r0 is pulled back by 4 * srcStride after each group - confirm against the macro.
3363 | ;------------------------------------------------------------------------------------------------------------- | |
3364 | %macro FILTER_VER_LUMA_4xN 3 | |
3365 | INIT_XMM sse4 | |
3366 | cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 | |
; Move src back three rows so the filter window starts three rows above the first output row.
3367 | lea r5, [3 * r1] | |
3368 | sub r0, r5 | |
; r4 = coeffIdx * 64, the byte offset of one coefficient set (four 16-byte rows are read through r6).
3369 | shl r4d, 6 | |
3370 | %ifidn %3,ps | |
; ps stores 16-bit results, so dstStride is doubled to a byte stride (assumes the caller passes it in elements).
; NOTE(review): on x86-64 the 32-bit add zero-extends, so this relies on a non-negative stride below 2^31.
3371 | add r3d, r3d | |
3372 | %endif | |
3373 | ||
; r6 = tab_LumaCoeffVer + coeffIdx * 64; the PIC build loads the table address into a register first.
3374 | %ifdef PIC | |
3375 | lea r5, [tab_LumaCoeffVer] | |
3376 | lea r6, [r5 + r4] | |
3377 | %else | |
3378 | lea r6, [tab_LumaCoeffVer + r4] | |
3379 | %endif | |
3380 | ||
; m3 = constant for the final step: pmulhrsw multiplier for pp, value subtracted for ps.
; Both tables are defined outside this section (presumably 512 and 0x2000 per word - confirm).
3381 | %ifidn %3,pp | |
3382 | mova m3, [tab_c_512] | |
3383 | %else | |
3384 | mova m3, [pw_2000] | |
3385 | %endif | |
3386 | ||
; r4d = number of four-row groups, r5 = 4 * srcStride (loop invariant).
3387 | mov r4d, %2/4 | |
3388 | lea r5, [4 * r1] | |
3389 | ||
3390 | .loopH: | |
3391 | PROCESS_LUMA_W4_4R | |
3392 | ||
3393 | %ifidn %3,pp | |
; Scale the 16-bit sums, then saturate to bytes: m4 ends up holding four rows of four pixels, one dword per row.
3394 | pmulhrsw m4, m3 | |
3395 | pmulhrsw m5, m3 | |
3396 | ||
3397 | packuswb m4, m5 | |
3398 | ||
3399 | movd [r2], m4 | |
3400 | pextrd [r2 + r3], m4, 1 | |
3401 | lea r2, [r2 + 2 * r3] | |
3402 | pextrd [r2], m4, 2 | |
3403 | pextrd [r2 + r3], m4, 3 | |
3404 | %else | |
; Subtract the ps constant and store the words unpacked: each register half is one row of four 16-bit values.
3405 | psubw m4, m3 | |
3406 | psubw m5, m3 | |
3407 | ||
3408 | movlps [r2], m4 | |
3409 | movhps [r2 + r3], m4 | |
3410 | lea r2, [r2 + 2 * r3] | |
3411 | movlps [r2], m5 | |
3412 | movhps [r2 + r3], m5 | |
3413 | %endif | |
3414 | ||
; Step src back by four rows and dst forward by the remaining two rows for the next group.
3415 | sub r0, r5 | |
3416 | lea r2, [r2 + 2 * r3] | |
3417 | ||
3418 | dec r4d | |
3419 | jnz .loopH | |
3420 | ||
3421 | RET | |
3422 | %endmacro | |
3423 | ||
; Instantiations of FILTER_VER_LUMA_4xN for 4x4, 4x8 and 4x16: the pp variants first, then the ps variants.
3424 | ;------------------------------------------------------------------------------------------------------------- | |
3425 | ; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3426 | ;------------------------------------------------------------------------------------------------------------- | |
3427 | FILTER_VER_LUMA_4xN 4, 4, pp | |
3428 | ||
3429 | ;------------------------------------------------------------------------------------------------------------- | |
3430 | ; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3431 | ;------------------------------------------------------------------------------------------------------------- | |
3432 | FILTER_VER_LUMA_4xN 4, 8, pp | |
3433 | ||
3434 | ;------------------------------------------------------------------------------------------------------------- | |
3435 | ; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3436 | ;------------------------------------------------------------------------------------------------------------- | |
3437 | FILTER_VER_LUMA_4xN 4, 16, pp | |
3438 | ||
3439 | ;------------------------------------------------------------------------------------------------------------- | |
3440 | ; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3441 | ;------------------------------------------------------------------------------------------------------------- | |
3442 | FILTER_VER_LUMA_4xN 4, 4, ps | |
3443 | ||
3444 | ;------------------------------------------------------------------------------------------------------------- | |
3445 | ; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3446 | ;------------------------------------------------------------------------------------------------------------- | |
3447 | FILTER_VER_LUMA_4xN 4, 8, ps | |
3448 | ||
3449 | ;------------------------------------------------------------------------------------------------------------- | |
3450 | ; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3451 | ;------------------------------------------------------------------------------------------------------------- | |
3452 | FILTER_VER_LUMA_4xN 4, 16, ps | |
3453 | ||
3454 | ;------------------------------------------------------------------------------------------------------------- | |
3455 | ; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; Generates one 8-tap vertical luma filter function for an 8-pixel-wide block.
;   %1 = block width (only used in the function name), %2 = block height (processed four rows at a time),
;   %3 = variant: pp (8-bit output) or ps (16-bit output).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx and later the row-group counter,
;   r5 = scratch / 4 * srcStride, r6 = selected coefficient set, m3 = per-variant constant.
; PROCESS_LUMA_W8_4R leaves the four filtered rows in m7, m6, m5, m4 (rows 1 to 4, eight words each).
; NOTE(review): the start of that macro is outside this section; it appears to leave r0 eight rows further down,
;   which is why r0 is pulled back by 4 * srcStride after each group - confirm.
3456 | ;------------------------------------------------------------------------------------------------------------- | |
3457 | %macro FILTER_VER_LUMA_8xN 3 | |
3458 | INIT_XMM sse4 | |
3459 | cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 | |
; Move src back three rows so the filter window starts three rows above the first output row.
3460 | lea r5, [3 * r1] | |
3461 | sub r0, r5 | |
; r4 = coeffIdx * 64, the byte offset of one coefficient set (four 16-byte rows).
3462 | shl r4d, 6 | |
3463 | ||
3464 | %ifidn %3,ps | |
; ps stores 16-bit results, so dstStride is doubled to a byte stride (assumes the caller passes it in elements).
; NOTE(review): on x86-64 the 32-bit add zero-extends, so this relies on a non-negative stride below 2^31.
3465 | add r3d, r3d | |
3466 | %endif | |
3467 | ||
; r6 = tab_LumaCoeffVer + coeffIdx * 64; the PIC build loads the table address into a register first.
3468 | %ifdef PIC | |
3469 | lea r5, [tab_LumaCoeffVer] | |
3470 | lea r6, [r5 + r4] | |
3471 | %else | |
3472 | lea r6, [tab_LumaCoeffVer + r4] | |
3473 | %endif | |
3474 | ||
; m3 = constant for the final step: pmulhrsw multiplier for pp, value subtracted for ps.
; Both tables are defined outside this section (presumably 512 and 0x2000 per word - confirm).
3475 | %ifidn %3,pp | |
3476 | mova m3, [tab_c_512] | |
3477 | %else | |
3478 | mova m3, [pw_2000] | |
3479 | %endif | |
3480 | ||
; r4d = number of four-row groups, r5 = 4 * srcStride (loop invariant).
3481 | mov r4d, %2/4 | |
3482 | lea r5, [4 * r1] | |
3483 | ||
3484 | .loopH: | |
3485 | PROCESS_LUMA_W8_4R | |
3486 | ||
3487 | %ifidn %3,pp | |
; Scale the sums, then pack two rows per register: m7 = rows 1-2, m5 = rows 3-4, eight bytes per row.
3488 | pmulhrsw m7, m3 | |
3489 | pmulhrsw m6, m3 | |
3490 | pmulhrsw m5, m3 | |
3491 | pmulhrsw m4, m3 | |
3492 | ||
3493 | packuswb m7, m6 | |
3494 | packuswb m5, m4 | |
3495 | ||
3496 | movlps [r2], m7 | |
3497 | movhps [r2 + r3], m7 | |
3498 | lea r2, [r2 + 2 * r3] | |
3499 | movlps [r2], m5 | |
3500 | movhps [r2 + r3], m5 | |
3501 | %else | |
; Subtract the ps constant and store each row as eight 16-bit values.
3502 | psubw m7, m3 | |
3503 | psubw m6, m3 | |
3504 | psubw m5, m3 | |
3505 | psubw m4, m3 | |
3506 | ||
3507 | movu [r2], m7 | |
3508 | movu [r2 + r3], m6 | |
3509 | lea r2, [r2 + 2 * r3] | |
3510 | movu [r2], m5 | |
3511 | movu [r2 + r3], m4 | |
3512 | %endif | |
3513 | ||
; Step src back by four rows and dst forward by the remaining two rows for the next group.
3514 | sub r0, r5 | |
3515 | lea r2, [r2 + 2 * r3] | |
3516 | ||
3517 | dec r4d | |
3518 | jnz .loopH | |
3519 | ||
3520 | RET | |
3521 | %endmacro | |
3522 | ||
; Instantiations of FILTER_VER_LUMA_8xN for 8x4, 8x8, 8x16 and 8x32: the pp variants first, then the ps variants.
3523 | ;------------------------------------------------------------------------------------------------------------- | |
3524 | ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3525 | ;------------------------------------------------------------------------------------------------------------- | |
3526 | FILTER_VER_LUMA_8xN 8, 4, pp | |
3527 | ||
3528 | ;------------------------------------------------------------------------------------------------------------- | |
3529 | ; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3530 | ;------------------------------------------------------------------------------------------------------------- | |
3531 | FILTER_VER_LUMA_8xN 8, 8, pp | |
3532 | ||
3533 | ;------------------------------------------------------------------------------------------------------------- | |
3534 | ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3535 | ;------------------------------------------------------------------------------------------------------------- | |
3536 | FILTER_VER_LUMA_8xN 8, 16, pp | |
3537 | ||
3538 | ;------------------------------------------------------------------------------------------------------------- | |
3539 | ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3540 | ;------------------------------------------------------------------------------------------------------------- | |
3541 | FILTER_VER_LUMA_8xN 8, 32, pp | |
3542 | ||
3543 | ;------------------------------------------------------------------------------------------------------------- | |
3544 | ; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3545 | ;------------------------------------------------------------------------------------------------------------- | |
3546 | FILTER_VER_LUMA_8xN 8, 4, ps | |
3547 | ||
3548 | ;------------------------------------------------------------------------------------------------------------- | |
3549 | ; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3550 | ;------------------------------------------------------------------------------------------------------------- | |
3551 | FILTER_VER_LUMA_8xN 8, 8, ps | |
3552 | ||
3553 | ;------------------------------------------------------------------------------------------------------------- | |
3554 | ; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3555 | ;------------------------------------------------------------------------------------------------------------- | |
3556 | FILTER_VER_LUMA_8xN 8, 16, ps | |
3557 | ||
3558 | ;------------------------------------------------------------------------------------------------------------- | |
3559 | ; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3560 | ;------------------------------------------------------------------------------------------------------------- | |
3561 | FILTER_VER_LUMA_8xN 8, 32, ps | |
3562 | ||
3563 | ;------------------------------------------------------------------------------------------------------------- | |
3564 | ; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; Generates one 8-tap vertical luma filter function for a 12-pixel-wide block.
; Each four-row group is filtered as an 8-wide part (PROCESS_LUMA_W8_4R) followed by a 4-wide part
; (PROCESS_LUMA_W4_4R) for columns 8 to 11.
;   %1 = block width (only used in the function name), %2 = block height, %3 = pp (8-bit out) or ps (16-bit out).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx and later the row-group counter,
;   r5 = scratch (row pointer / rewind offset), r6 = selected coefficient set, m3 = per-variant constant.
; NOTE(review): both helper macros begin outside this section; the rewind offsets below assume each one
;   leaves r0 eight rows further down - confirm.
3565 | ;------------------------------------------------------------------------------------------------------------- | |
3566 | %macro FILTER_VER_LUMA_12xN 3 | |
3567 | INIT_XMM sse4 | |
3568 | cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 | |
; Move src back three rows so the filter window starts three rows above the first output row.
3569 | lea r5, [3 * r1] | |
3570 | sub r0, r5 | |
; r4 = coeffIdx * 64, the byte offset of one coefficient set (four 16-byte rows).
3571 | shl r4d, 6 | |
3572 | %ifidn %3,ps | |
; ps stores 16-bit results, so dstStride is doubled to a byte stride (assumes the caller passes it in elements).
; NOTE(review): on x86-64 the 32-bit add zero-extends, so this relies on a non-negative stride below 2^31.
3573 | add r3d, r3d | |
3574 | %endif | |
3575 | ||
; r6 = tab_LumaCoeffVer + coeffIdx * 64; the PIC build loads the table address into a register first.
3576 | %ifdef PIC | |
3577 | lea r5, [tab_LumaCoeffVer] | |
3578 | lea r6, [r5 + r4] | |
3579 | %else | |
3580 | lea r6, [tab_LumaCoeffVer + r4] | |
3581 | %endif | |
3582 | ||
; m3 = constant for the final step: pmulhrsw multiplier for pp, value subtracted for ps (tables defined elsewhere).
3583 | %ifidn %3,pp | |
3584 | mova m3, [tab_c_512] | |
3585 | %else | |
3586 | mova m3, [pw_2000] | |
3587 | %endif | |
3588 | ||
; r4d = number of four-row groups.
3589 | mov r4d, %2/4 | |
3590 | ||
3591 | .loopH: | |
; Columns 0-7: the four filtered rows come back in m7, m6, m5, m4.
3592 | PROCESS_LUMA_W8_4R | |
3593 | ||
3594 | %ifidn %3,pp | |
3595 | pmulhrsw m7, m3 | |
3596 | pmulhrsw m6, m3 | |
3597 | pmulhrsw m5, m3 | |
3598 | pmulhrsw m4, m3 | |
3599 | ||
3600 | packuswb m7, m6 | |
3601 | packuswb m5, m4 | |
3602 | ||
; r2 stays at the top row of the group; r5 addresses rows 3 and 4.
3603 | movlps [r2], m7 | |
3604 | movhps [r2 + r3], m7 | |
3605 | lea r5, [r2 + 2 * r3] | |
3606 | movlps [r5], m5 | |
3607 | movhps [r5 + r3], m5 | |
3608 | %else | |
3609 | psubw m7, m3 | |
3610 | psubw m6, m3 | |
3611 | psubw m5, m3 | |
3612 | psubw m4, m3 | |
3613 | ||
3614 | movu [r2], m7 | |
3615 | movu [r2 + r3], m6 | |
3616 | lea r5, [r2 + 2 * r3] | |
3617 | movu [r5], m5 | |
3618 | movu [r5 + r3], m4 | |
3619 | %endif | |
3620 | ||
; Rewind src by eight rows and step eight pixels right; advance dst by eight outputs (8 or 16 bytes).
3621 | lea r5, [8 * r1 - 8] | |
3622 | sub r0, r5 | |
3623 | %ifidn %3,pp | |
3624 | add r2, 8 | |
3625 | %else | |
3626 | add r2, 16 | |
3627 | %endif | |
3628 | ||
; Columns 8-11: the four filtered rows come back in m4 (rows 1-2) and m5 (rows 3-4).
3629 | PROCESS_LUMA_W4_4R | |
3630 | ||
3631 | %ifidn %3,pp | |
3632 | pmulhrsw m4, m3 | |
3633 | pmulhrsw m5, m3 | |
3634 | ||
3635 | packuswb m4, m5 | |
3636 | ||
3637 | movd [r2], m4 | |
3638 | pextrd [r2 + r3], m4, 1 | |
3639 | lea r5, [r2 + 2 * r3] | |
3640 | pextrd [r5], m4, 2 | |
3641 | pextrd [r5 + r3], m4, 3 | |
3642 | %else | |
3643 | psubw m4, m3 | |
3644 | psubw m5, m3 | |
3645 | ||
3646 | movlps [r2], m4 | |
3647 | movhps [r2 + r3], m4 | |
3648 | lea r5, [r2 + 2 * r3] | |
3649 | movlps [r5], m5 | |
3650 | movhps [r5 + r3], m5 | |
3651 | %endif | |
3652 | ||
; Next group: src goes back four rows and eight pixels left; dst goes down four rows and back to column 0.
3653 | lea r5, [4 * r1 + 8] | |
3654 | sub r0, r5 | |
3655 | %ifidn %3,pp | |
3656 | lea r2, [r2 + 4 * r3 - 8] | |
3657 | %else | |
3658 | lea r2, [r2 + 4 * r3 - 16] | |
3659 | %endif | |
3660 | ||
3661 | dec r4d | |
3662 | jnz .loopH | |
3663 | ||
3664 | RET | |
3665 | %endmacro | |
3666 | ||
; Instantiations of FILTER_VER_LUMA_12xN: 12x16 in the pp and ps variants.
3667 | ;------------------------------------------------------------------------------------------------------------- | |
3668 | ; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3669 | ;------------------------------------------------------------------------------------------------------------- | |
3670 | FILTER_VER_LUMA_12xN 12, 16, pp | |
3671 | ||
3672 | ;------------------------------------------------------------------------------------------------------------- | |
3673 | ; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
3674 | ;------------------------------------------------------------------------------------------------------------- | |
3675 | FILTER_VER_LUMA_12xN 12, 16, ps | |
3676 | ||
3677 | ;------------------------------------------------------------------------------------------------------------- | |
3678 | ; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; Generates one 8-tap vertical luma filter function for a block that is processed as %1/8 columns of
; eight pixels and %2/4 groups of four rows.
;   %1 = block width, %2 = block height, %3 = variant: pp (8-bit output) or ps (16-bit output).
; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx and later the column counter,
;   r5 = scratch, r6 = selected coefficient set, m3 = per-variant constant.
; The row-group counter lives in the stack slot at [rsp] that the cglobal stack-size argument requests.
; PROCESS_LUMA_W8_4R leaves the four filtered rows in m7, m6, m5, m4.
; NOTE(review): the start of that macro is outside this section; the rewind by 8 * srcStride below assumes it
;   leaves r0 eight rows further down - confirm.
3679 | ;------------------------------------------------------------------------------------------------------------- | |
3680 | %macro FILTER_VER_LUMA 3 | |
3681 | INIT_XMM sse4 | |
3682 | cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize | |
; Move src back three rows so the filter window starts three rows above the first output row.
3683 | lea r5, [3 * r1] | |
3684 | sub r0, r5 | |
; r4 = coeffIdx * 64, the byte offset of one coefficient set (four 16-byte rows).
3685 | shl r4d, 6 | |
3686 | %ifidn %3,ps | |
; ps stores 16-bit results, so dstStride is doubled to a byte stride (assumes the caller passes it in elements).
; NOTE(review): on x86-64 the 32-bit add zero-extends, so this relies on a non-negative stride below 2^31.
3687 | add r3d, r3d | |
3688 | %endif | |
3689 | ||
; r6 = tab_LumaCoeffVer + coeffIdx * 64; the PIC build loads the table address into a register first.
3690 | %ifdef PIC | |
3691 | lea r5, [tab_LumaCoeffVer] | |
3692 | lea r6, [r5 + r4] | |
3693 | %else | |
3694 | lea r6, [tab_LumaCoeffVer + r4] | |
3695 | %endif | |
3696 | ||
; m3 = constant for the final step: pmulhrsw multiplier for pp, value subtracted for ps (tables defined elsewhere).
3697 | %ifidn %3,pp | |
3698 | mova m3, [tab_c_512] | |
3699 | %else | |
3700 | mova m3, [pw_2000] | |
3701 | %endif | |
; [rsp] = number of four-row groups still to do.
3702 | mov dword [rsp], %2/4 | |
3703 | ||
3704 | .loopH: | |
; r4d = number of 8-pixel columns in this row group.
3705 | mov r4d, (%1/8) | |
3706 | .loopW: | |
3707 | PROCESS_LUMA_W8_4R | |
3708 | %ifidn %3,pp | |
; Scale the sums, then pack two rows per register: m7 = rows 1-2, m5 = rows 3-4, eight bytes per row.
3709 | pmulhrsw m7, m3 | |
3710 | pmulhrsw m6, m3 | |
3711 | pmulhrsw m5, m3 | |
3712 | pmulhrsw m4, m3 | |
3713 | ||
3714 | packuswb m7, m6 | |
3715 | packuswb m5, m4 | |
3716 | ||
; r2 stays at the top row of the group; r5 addresses rows 3 and 4.
3717 | movlps [r2], m7 | |
3718 | movhps [r2 + r3], m7 | |
3719 | lea r5, [r2 + 2 * r3] | |
3720 | movlps [r5], m5 | |
3721 | movhps [r5 + r3], m5 | |
3722 | %else | |
; Subtract the ps constant and store each row as eight 16-bit values.
3723 | psubw m7, m3 | |
3724 | psubw m6, m3 | |
3725 | psubw m5, m3 | |
3726 | psubw m4, m3 | |
3727 | ||
3728 | movu [r2], m7 | |
3729 | movu [r2 + r3], m6 | |
3730 | lea r5, [r2 + 2 * r3] | |
3731 | movu [r5], m5 | |
3732 | movu [r5 + r3], m4 | |
3733 | %endif | |
3734 | ||
; Next column: rewind src by eight rows and step eight pixels right; advance dst by eight outputs.
3735 | lea r5, [8 * r1 - 8] | |
3736 | sub r0, r5 | |
3737 | %ifidn %3,pp | |
3738 | add r2, 8 | |
3739 | %else | |
3740 | add r2, 16 | |
3741 | %endif | |
3742 | dec r4d | |
3743 | jnz .loopW | |
3744 | ||
; Next row group: four rows down, back to column 0 (the column loop advanced src by %1 bytes and dst by
; %1 bytes for pp or 2 * %1 bytes for ps).
3745 | lea r0, [r0 + 4 * r1 - %1] | |
3746 | %ifidn %3,pp | |
3747 | lea r2, [r2 + 4 * r3 - %1] | |
3748 | %else | |
3749 | lea r2, [r2 + 4 * r3 - 2 * %1] | |
3750 | %endif | |
3751 | ||
3752 | dec dword [rsp] | |
3753 | jnz .loopH | |
3754 | ||
3755 | RET | |
3756 | %endmacro | |
3757 | ||
; Instantiations of FILTER_VER_LUMA (width, height, variant). Every width listed is a multiple of 8 and every
; height a multiple of 4, matching the %1/8 column and %2/4 row-group loops of the macro.
; pp variants:
3758 | FILTER_VER_LUMA 16, 4, pp | |
3759 | FILTER_VER_LUMA 16, 8, pp | |
3760 | FILTER_VER_LUMA 16, 12, pp | |
3761 | FILTER_VER_LUMA 16, 16, pp | |
3762 | FILTER_VER_LUMA 16, 32, pp | |
3763 | FILTER_VER_LUMA 16, 64, pp | |
3764 | FILTER_VER_LUMA 24, 32, pp | |
3765 | FILTER_VER_LUMA 32, 8, pp | |
3766 | FILTER_VER_LUMA 32, 16, pp | |
3767 | FILTER_VER_LUMA 32, 24, pp | |
3768 | FILTER_VER_LUMA 32, 32, pp | |
3769 | FILTER_VER_LUMA 32, 64, pp | |
3770 | FILTER_VER_LUMA 48, 64, pp | |
3771 | FILTER_VER_LUMA 64, 16, pp | |
3772 | FILTER_VER_LUMA 64, 32, pp | |
3773 | FILTER_VER_LUMA 64, 48, pp | |
3774 | FILTER_VER_LUMA 64, 64, pp | |
3775 | ||
; ps variants:
3776 | FILTER_VER_LUMA 16, 4, ps | |
3777 | FILTER_VER_LUMA 16, 8, ps | |
3778 | FILTER_VER_LUMA 16, 12, ps | |
3779 | FILTER_VER_LUMA 16, 16, ps | |
3780 | FILTER_VER_LUMA 16, 32, ps | |
3781 | FILTER_VER_LUMA 16, 64, ps | |
3782 | FILTER_VER_LUMA 24, 32, ps | |
3783 | FILTER_VER_LUMA 32, 8, ps | |
3784 | FILTER_VER_LUMA 32, 16, ps | |
3785 | FILTER_VER_LUMA 32, 24, ps | |
3786 | FILTER_VER_LUMA 32, 32, ps | |
3787 | FILTER_VER_LUMA 32, 64, ps | |
3788 | FILTER_VER_LUMA 48, 64, ps | |
3789 | FILTER_VER_LUMA 64, 16, ps | |
3790 | FILTER_VER_LUMA 64, 32, ps | |
3791 | FILTER_VER_LUMA 64, 48, ps | |
3792 | FILTER_VER_LUMA 64, 64, ps | |
3793 | ||
; PROCESS_LUMA_SP_W4_4R: 8-tap vertical filter over 16-bit input for a 4-column, 4-row tile.
;   In:  r0 = address of the first input row, r1 = row stride in bytes,
;        r6 = coefficient set (four 16-byte rows, one per pair of taps, used with pmaddwd).
;   Out: m0, m1, m2, m3 = 32-bit sums for output rows 1 to 4 (four dwords each).
;   Clobbers m4, m5, m6. Reads eleven input rows and leaves r0 advanced by 8 * r1.
; The numbers in the comments below are input row indices: [a b] is rows a and b interleaved word by word,
; [a+b+...] is the partial sum accumulated over those rows.
3794 | %macro PROCESS_LUMA_SP_W4_4R 0 | |
3795 | movq m0, [r0] | |
3796 | movq m1, [r0 + r1] | |
3797 | punpcklwd m0, m1 ;m0=[0 1] | |
3798 | pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 | |
3799 | ||
3800 | lea r0, [r0 + 2 * r1] | |
3801 | movq m4, [r0] | |
3802 | punpcklwd m1, m4 ;m1=[1 2] | |
3803 | pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 | |
3804 | ||
3805 | movq m5, [r0 + r1] | |
3806 | punpcklwd m4, m5 ;m4=[2 3] | |
3807 | pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 | |
3808 | pmaddwd m4, [r6 + 1 * 16] | |
3809 | paddd m0, m4 ;m0=[0+1+2+3] Row1 | |
3810 | ||
3811 | lea r0, [r0 + 2 * r1] | |
3812 | movq m4, [r0] | |
3813 | punpcklwd m5, m4 ;m5=[3 4] | |
3814 | pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 | |
3815 | pmaddwd m5, [r6 + 1 * 16] | |
3816 | paddd m1, m5 ;m1 = [1+2+3+4] Row2 | |
3817 | ||
3818 | movq m5, [r0 + r1] | |
3819 | punpcklwd m4, m5 ;m4=[4 5] | |
3820 | pmaddwd m6, m4, [r6 + 1 * 16] | |
3821 | paddd m2, m6 ;m2=[2+3+4+5] Row3 | |
3822 | pmaddwd m4, [r6 + 2 * 16] | |
3823 | paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 | |
3824 | ||
3825 | lea r0, [r0 + 2 * r1] | |
3826 | movq m4, [r0] | |
3827 | punpcklwd m5, m4 ;m5=[5 6] | |
3828 | pmaddwd m6, m5, [r6 + 1 * 16] | |
3829 | paddd m3, m6 ;m3=[3+4+5+6] Row4 | |
3830 | pmaddwd m5, [r6 + 2 * 16] | |
3831 | paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 | |
3832 | ||
3833 | movq m5, [r0 + r1] | |
3834 | punpcklwd m4, m5 ;m4=[6 7] | |
3835 | pmaddwd m6, m4, [r6 + 2 * 16] | |
3836 | paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 | |
3837 | pmaddwd m4, [r6 + 3 * 16] | |
3838 | paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end | |
3839 | ||
3840 | lea r0, [r0 + 2 * r1] | |
3841 | movq m4, [r0] | |
3842 | punpcklwd m5, m4 ;m5=[7 8] | |
3843 | pmaddwd m6, m5, [r6 + 2 * 16] | |
3844 | paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 | |
3845 | pmaddwd m5, [r6 + 3 * 16] | |
3846 | paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end | |
3847 | ||
3848 | movq m5, [r0 + r1] | |
3849 | punpcklwd m4, m5 ;m4=[8 9] | |
3850 | pmaddwd m4, [r6 + 3 * 16] | |
3851 | paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end | |
3852 | ||
3853 | movq m4, [r0 + 2 * r1] | |
3854 | punpcklwd m5, m4 ;m5=[9 10] | |
3855 | pmaddwd m5, [r6 + 3 * 16] | |
3856 | paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end | |
3857 | %endmacro | |
3858 | ||
3859 | ;-------------------------------------------------------------------------------------------------------------- | |
3860 | ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; Generates one 8-tap vertical luma filter function that reads 16-bit input and writes 8-bit pixels.
; The block is processed as %1/4 columns of four pixels and %2/4 groups of four rows.
;   %1 = block width, %2 = block height.
; Registers: r0 = src, r1 = srcStride (doubled to bytes), r2 = dst, r3 = dstStride,
;   r4 = coeffIdx and later the column counter, r5 = scratch, r6 = selected coefficient set,
;   m7 = constant added to every sum before the shift.
; The row-group counter lives in the stack slot at [rsp] that the cglobal stack-size argument requests.
3861 | ;-------------------------------------------------------------------------------------------------------------- | |
3862 | %macro FILTER_VER_LUMA_SP 2 | |
3863 | INIT_XMM sse4 | |
3864 | cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize | |
3865 | ||
; src holds 16-bit values, so srcStride is doubled to a byte stride (assumes the caller passes it in elements).
; NOTE(review): on x86-64 the 32-bit add zero-extends, so this relies on a non-negative stride below 2^31.
3866 | add r1d, r1d | |
; Move src back three rows so the filter window starts three rows above the first output row.
3867 | lea r5, [r1 + 2 * r1] | |
3868 | sub r0, r5 | |
; r4 = coeffIdx * 64, the byte offset of one coefficient set (four 16-byte rows).
3869 | shl r4d, 6 | |
3870 | ||
; r6 = tab_LumaCoeffV + coeffIdx * 64; the PIC build loads the table address into a register first.
3871 | %ifdef PIC | |
3872 | lea r5, [tab_LumaCoeffV] | |
3873 | lea r6, [r5 + r4] | |
3874 | %else | |
3875 | lea r6, [tab_LumaCoeffV + r4] | |
3876 | %endif | |
3877 | ||
; m7 = tab_c_526336 (defined outside this section; presumably the rounding term for the 12-bit shift plus
; an offset that undoes the bias of the 16-bit intermediate - confirm).
3878 | mova m7, [tab_c_526336] | |
3879 | ||
; [rsp] = number of four-row groups still to do.
3880 | mov dword [rsp], %2/4 | |
3881 | .loopH: | |
; r4d = number of 4-pixel columns in this row group.
3882 | mov r4d, (%1/4) | |
3883 | .loopW: | |
; m0..m3 = 32-bit sums for the four output rows; r0 ends up eight rows further down.
3884 | PROCESS_LUMA_SP_W4_4R | |
3885 | ||
3886 | paddd m0, m7 | |
3887 | paddd m1, m7 | |
3888 | paddd m2, m7 | |
3889 | paddd m3, m7 | |
3890 | ||
3891 | psrad m0, 12 | |
3892 | psrad m1, 12 | |
3893 | psrad m2, 12 | |
3894 | psrad m3, 12 | |
3895 | ||
; Narrow dwords -> words -> bytes with saturation: m0 then holds four rows of four pixels, one dword per row.
3896 | packssdw m0, m1 | |
3897 | packssdw m2, m3 | |
3898 | ||
3899 | packuswb m0, m2 | |
3900 | ||
3901 | movd [r2], m0 | |
3902 | pextrd [r2 + r3], m0, 1 | |
3903 | lea r5, [r2 + 2 * r3] | |
3904 | pextrd [r5], m0, 2 | |
3905 | pextrd [r5 + r3], m0, 3 | |
3906 | ||
; Next column: rewind src by eight rows and step four 16-bit samples right; advance dst by four pixels.
3907 | lea r5, [8 * r1 - 2 * 4] | |
3908 | sub r0, r5 | |
3909 | add r2, 4 | |
3910 | ||
3911 | dec r4d | |
3912 | jnz .loopW | |
3913 | ||
; Next row group: four rows down, back to column 0.
3914 | lea r0, [r0 + 4 * r1 - 2 * %1] | |
3915 | lea r2, [r2 + 4 * r3 - %1] | |
3916 | ||
3917 | dec dword [rsp] | |
3918 | jnz .loopH | |
3919 | ||
3920 | RET | |
3921 | %endmacro | |
3922 | ||
3923 | ;-------------------------------------------------------------------------------------------------------------- | |
3924 | ; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; Instantiations of FILTER_VER_LUMA_SP (width, height). Every width and height listed is a multiple of 4,
; matching the %1/4 column and %2/4 row-group loops of the macro.
3925 | ;-------------------------------------------------------------------------------------------------------------- | |
3926 | FILTER_VER_LUMA_SP 4, 4 | |
3927 | FILTER_VER_LUMA_SP 8, 8 | |
3928 | FILTER_VER_LUMA_SP 8, 4 | |
3929 | FILTER_VER_LUMA_SP 4, 8 | |
3930 | FILTER_VER_LUMA_SP 16, 16 | |
3931 | FILTER_VER_LUMA_SP 16, 8 | |
3932 | FILTER_VER_LUMA_SP 8, 16 | |
3933 | FILTER_VER_LUMA_SP 16, 12 | |
3934 | FILTER_VER_LUMA_SP 12, 16 | |
3935 | FILTER_VER_LUMA_SP 16, 4 | |
3936 | FILTER_VER_LUMA_SP 4, 16 | |
3937 | FILTER_VER_LUMA_SP 32, 32 | |
3938 | FILTER_VER_LUMA_SP 32, 16 | |
3939 | FILTER_VER_LUMA_SP 16, 32 | |
3940 | FILTER_VER_LUMA_SP 32, 24 | |
3941 | FILTER_VER_LUMA_SP 24, 32 | |
3942 | FILTER_VER_LUMA_SP 32, 8 | |
3943 | FILTER_VER_LUMA_SP 8, 32 | |
3944 | FILTER_VER_LUMA_SP 64, 64 | |
3945 | FILTER_VER_LUMA_SP 64, 32 | |
3946 | FILTER_VER_LUMA_SP 32, 64 | |
3947 | FILTER_VER_LUMA_SP 64, 48 | |
3948 | FILTER_VER_LUMA_SP 48, 64 | |
3949 | FILTER_VER_LUMA_SP 64, 16 | |
3950 | FILTER_VER_LUMA_SP 16, 64 | |
3951 | ||
3952 | ; TODO: combining U and V would be faster, but needs more registers | |
3953 | ; TODO: using two code paths (height a multiple of 4, and otherwise) may improve performance by about 10%, but the code is more complex, so it is disabled | |
; chroma_p2s: converts a block of 8-bit samples to 16-bit values, two rows per outer iteration.
;   r0 = source bytes, r1 = source row stride, r2 = destination (16-bit values, row pitch FENC_STRIDE bytes),
;   r3d = width and r4d = height, loaded from the 4th and 5th arguments.
;   r5d = column index within the row, r6 = scratch address.
; The width is handled in steps of eight samples, with a tail of four and/or two samples.
; The height is consumed two rows at a time, so it is expected to be even.
3954 | INIT_XMM ssse3 | |
3955 | cglobal chroma_p2s, 3, 7, 4 | |
3956 | ||
3957 | ; load width and height | |
3958 | mov r3d, r3m | |
3959 | mov r4d, r4m | |
3960 | ||
3961 | ; load constant | |
; Each source byte is interleaved with a byte of m2, and pmaddubsw against m3 combines every pair into one word.
; Judging by the table names this is presumably pixel * 64 - 128 * 64 - confirm against the table definitions.
3962 | mova m2, [tab_c_128] | |
3963 | mova m3, [tab_c_64_n64] | |
3964 | ||
3965 | .loopH: | |
3966 | ||
3967 | xor r5d, r5d | |
3968 | .loopW: | |
3969 | lea r6, [r0 + r5] | |
3970 | ||
; m0 = eight converted samples of the first row, m1 = the same columns of the second row.
3971 | movh m0, [r6] | |
3972 | punpcklbw m0, m2 | |
3973 | pmaddubsw m0, m3 | |
3974 | ||
3975 | movh m1, [r6 + r1] | |
3976 | punpcklbw m1, m2 | |
3977 | pmaddubsw m1, m3 | |
3978 | ||
; r5 has already been advanced by 8, so the stores below use a -16 byte displacement.
; The flags from cmp survive the lea and the stores: jg = fewer than eight samples were left (tail),
; je = the row ended exactly on this group.
3979 | add r5d, 8 | |
3980 | cmp r5d, r3d | |
3981 | lea r6, [r2 + r5 * 2] | |
3982 | jg .width4 | |
3983 | movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0 | |
3984 | movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1 | |
3985 | je .nextH | |
3986 | jmp .loopW | |
3987 | ||
3988 | .width4: | |
; Tail: store four samples if bit 2 of the width is set, then two more if bit 1 is set.
; The flags from `test r3d, 2` survive the stores, the lea and the shuffles up to the jz below.
3989 | test r3d, 4 | |
3990 | jz .width2 | |
3991 | test r3d, 2 | |
3992 | movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 | |
3993 | movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1 | |
3994 | lea r6, [r6 + 8] | |
; Bring samples 4 and 5 (dword 2) down to dword 0 for the two-sample store.
3995 | pshufd m0, m0, 2 | |
3996 | pshufd m1, m1, 2 | |
3997 | jz .nextH | |
3998 | ||
3999 | .width2: | |
4000 | movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 | |
4001 | movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1 | |
4002 | ||
4003 | .nextH: | |
; Advance both pointers by two rows.
4004 | lea r0, [r0 + r1 * 2] | |
4005 | add r2, FENC_STRIDE / 2 * 4 | |
4006 | ||
4007 | sub r4d, 2 | |
4008 | jnz .loopH | |
4009 | ||
4010 | RET | |
4011 | ||
; PROCESS_CHROMA_SP_W4_4R: 4-tap vertical filter over 16-bit input for a 4-column, 4-row tile.
;   In:  r0 = address of the first input row, r1 = row stride in bytes,
;        r6 = coefficient set (two 16-byte rows, one per pair of taps, used with pmaddwd).
;   Out: m0, m1, m2, m3 = 32-bit sums for output rows 1 to 4 (four dwords each).
;   Clobbers m4, m5. Reads seven input rows and leaves r0 advanced by 4 * r1.
; The numbers in the comments below are input row indices: [a b] is rows a and b interleaved word by word,
; [a+b+...] is the sum accumulated over those rows.
4012 | %macro PROCESS_CHROMA_SP_W4_4R 0 | |
4013 | movq m0, [r0] | |
4014 | movq m1, [r0 + r1] | |
4015 | punpcklwd m0, m1 ;m0=[0 1] | |
4016 | pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 | |
4017 | ||
4018 | lea r0, [r0 + 2 * r1] | |
4019 | movq m4, [r0] | |
4020 | punpcklwd m1, m4 ;m1=[1 2] | |
4021 | pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 | |
4022 | ||
4023 | movq m5, [r0 + r1] | |
4024 | punpcklwd m4, m5 ;m4=[2 3] | |
4025 | pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 | |
4026 | pmaddwd m4, [r6 + 1 * 16] | |
4027 | paddd m0, m4 ;m0=[0+1+2+3] Row1 done | |
4028 | ||
4029 | lea r0, [r0 + 2 * r1] | |
4030 | movq m4, [r0] | |
4031 | punpcklwd m5, m4 ;m5=[3 4] | |
4032 | pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 | |
4033 | pmaddwd m5, [r6 + 1 * 16] | |
4034 | paddd m1, m5 ;m1 = [1+2+3+4] Row2 done | |
4035 | ||
4036 | movq m5, [r0 + r1] | |
4037 | punpcklwd m4, m5 ;m4=[4 5] | |
4038 | pmaddwd m4, [r6 + 1 * 16] | |
4039 | paddd m2, m4 ;m2=[2+3+4+5] Row3 done | |
4040 | ||
4041 | movq m4, [r0 + 2 * r1] | |
4042 | punpcklwd m5, m4 ;m5=[5 6] | |
4043 | pmaddwd m5, [r6 + 1 * 16] | |
4044 | paddd m3, m5 ;m3=[3+4+5+6] Row4 done | |
4045 | %endmacro | |
4046 | ||
4047 | ;-------------------------------------------------------------------------------------------------------------- | |
4048 | ; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; Generates one 4-tap vertical chroma filter function that reads 16-bit input and writes 8-bit pixels.
; The block is processed as %1/4 columns of four pixels and %2/4 groups of four rows.
;   %1 = block width, %2 = block height.
; Registers: r0 = src, r1 = srcStride (doubled to bytes), r2 = dst, r3 = dstStride,
;   r4 = coeffIdx and later the column counter, r5 = scratch, r6 = selected coefficient set,
;   m6 = constant added to every sum before the shift.
; The row-group counter lives in the stack slot at [rsp] that the cglobal stack-size argument requests.
4049 | ;-------------------------------------------------------------------------------------------------------------- | |
4050 | %macro FILTER_VER_CHROMA_SP 2 | |
4051 | INIT_XMM sse4 | |
4052 | cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize | |
4053 | ||
; src holds 16-bit values, so srcStride is doubled to a byte stride (assumes the caller passes it in elements).
; NOTE(review): on x86-64 the 32-bit add zero-extends, so this relies on a non-negative stride below 2^31.
4054 | add r1d, r1d | |
; Move src back one row so the 4-tap window starts one row above the first output row.
4055 | sub r0, r1 | |
; r4 = coeffIdx * 32, the byte offset of one coefficient set (two 16-byte rows).
4056 | shl r4d, 5 | |
4057 | ||
; r6 = tab_ChromaCoeffV + coeffIdx * 32; the PIC build loads the table address into a register first.
4058 | %ifdef PIC | |
4059 | lea r5, [tab_ChromaCoeffV] | |
4060 | lea r6, [r5 + r4] | |
4061 | %else | |
4062 | lea r6, [tab_ChromaCoeffV + r4] | |
4063 | %endif | |
4064 | ||
; m6 = tab_c_526336 (defined outside this section; presumably the rounding term for the 12-bit shift plus
; an offset that undoes the bias of the 16-bit intermediate - confirm).
4065 | mova m6, [tab_c_526336] | |
4066 | ||
; [rsp] = number of four-row groups still to do.
4067 | mov dword [rsp], %2/4 | |
4068 | ||
4069 | .loopH: | |
; r4d = number of 4-pixel columns in this row group.
4070 | mov r4d, (%1/4) | |
4071 | .loopW: | |
; m0..m3 = 32-bit sums for the four output rows; r0 ends up four rows further down.
4072 | PROCESS_CHROMA_SP_W4_4R | |
4073 | ||
4074 | paddd m0, m6 | |
4075 | paddd m1, m6 | |
4076 | paddd m2, m6 | |
4077 | paddd m3, m6 | |
4078 | ||
4079 | psrad m0, 12 | |
4080 | psrad m1, 12 | |
4081 | psrad m2, 12 | |
4082 | psrad m3, 12 | |
4083 | ||
; Narrow dwords -> words -> bytes with saturation: m0 then holds four rows of four pixels, one dword per row.
4084 | packssdw m0, m1 | |
4085 | packssdw m2, m3 | |
4086 | ||
4087 | packuswb m0, m2 | |
4088 | ||
4089 | movd [r2], m0 | |
4090 | pextrd [r2 + r3], m0, 1 | |
4091 | lea r5, [r2 + 2 * r3] | |
4092 | pextrd [r5], m0, 2 | |
4093 | pextrd [r5 + r3], m0, 3 | |
4094 | ||
; Next column: rewind src by four rows and step four 16-bit samples right; advance dst by four pixels.
4095 | lea r5, [4 * r1 - 2 * 4] | |
4096 | sub r0, r5 | |
4097 | add r2, 4 | |
4098 | ||
4099 | dec r4d | |
4100 | jnz .loopW | |
4101 | ||
; Next row group: four rows down, back to column 0.
4102 | lea r0, [r0 + 4 * r1 - 2 * %1] | |
4103 | lea r2, [r2 + 4 * r3 - %1] | |
4104 | ||
4105 | dec dword [rsp] | |
4106 | jnz .loopH | |
4107 | ||
4108 | RET | |
4109 | %endmacro | |
4110 | ||
; Instantiations of FILTER_VER_CHROMA_SP (width, height). Every width and height listed is a multiple of 4,
; matching the %1/4 column and %2/4 row-group loops of the macro.
4111 | FILTER_VER_CHROMA_SP 4, 4 | |
4112 | FILTER_VER_CHROMA_SP 4, 8 | |
4113 | FILTER_VER_CHROMA_SP 16, 16 | |
4114 | FILTER_VER_CHROMA_SP 16, 8 | |
4115 | FILTER_VER_CHROMA_SP 16, 12 | |
4116 | FILTER_VER_CHROMA_SP 12, 16 | |
4117 | FILTER_VER_CHROMA_SP 16, 4 | |
4118 | FILTER_VER_CHROMA_SP 4, 16 | |
4119 | FILTER_VER_CHROMA_SP 32, 32 | |
4120 | FILTER_VER_CHROMA_SP 32, 16 | |
4121 | FILTER_VER_CHROMA_SP 16, 32 | |
4122 | FILTER_VER_CHROMA_SP 32, 24 | |
4123 | FILTER_VER_CHROMA_SP 24, 32 | |
4124 | FILTER_VER_CHROMA_SP 32, 8 | |
4125 | ||
4126 | FILTER_VER_CHROMA_SP 16, 24 | |
4127 | FILTER_VER_CHROMA_SP 16, 64 | |
4128 | FILTER_VER_CHROMA_SP 12, 32 | |
4129 | FILTER_VER_CHROMA_SP 4, 32 | |
4130 | FILTER_VER_CHROMA_SP 32, 64 | |
4131 | FILTER_VER_CHROMA_SP 32, 48 | |
4132 | FILTER_VER_CHROMA_SP 24, 64 | |
4133 | ||
4134 | FILTER_VER_CHROMA_SP 64, 64 | |
4135 | FILTER_VER_CHROMA_SP 64, 32 | |
4136 | FILTER_VER_CHROMA_SP 64, 48 | |
4137 | FILTER_VER_CHROMA_SP 48, 64 | |
4138 | FILTER_VER_CHROMA_SP 64, 16 | |
4139 | ||
4140 | ||
; PROCESS_CHROMA_SP_W2_4R: 4-tap vertical filter over 16-bit input for a 2-column, 4-row tile.
;   %1  = register holding the coefficient set (two 16-byte rows, used with pmaddwd).
;   In:  r0 = address of the first input row, r1 = row stride in bytes.
;   Out: m0 = 32-bit sums for output rows 1-2, m2 = 32-bit sums for output rows 3-4 (two dwords per row).
;   Clobbers m1, m3, m4. Reads seven input rows and leaves r0 advanced by 4 * r1.
; The numbers in the comments below are input row indices; two row pairs share one register
; (low half / high half), so each pmaddwd filters two output rows at once.
4141 | %macro PROCESS_CHROMA_SP_W2_4R 1 | |
4142 | movd m0, [r0] | |
4143 | movd m1, [r0 + r1] | |
4144 | punpcklwd m0, m1 ;m0=[0 1] | |
4145 | ||
4146 | lea r0, [r0 + 2 * r1] | |
4147 | movd m2, [r0] | |
4148 | punpcklwd m1, m2 ;m1=[1 2] | |
4149 | punpcklqdq m0, m1 ;m0=[0 1 1 2] | |
4150 | pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2 | |
4151 | ||
4152 | movd m1, [r0 + r1] | |
4153 | punpcklwd m2, m1 ;m2=[2 3] | |
4154 | ||
4155 | lea r0, [r0 + 2 * r1] | |
4156 | movd m3, [r0] | |
4157 | punpcklwd m1, m3 ;m1=[3 4] | |
4158 | punpcklqdq m2, m1 ;m2=[2 3 3 4] | |
4159 | ||
4160 | pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2 | |
4161 | pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4 | |
4162 | paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 | |
4163 | ||
4164 | movd m1, [r0 + r1] | |
4165 | punpcklwd m3, m1 ;m3=[4 5] | |
4166 | ||
4167 | movd m4, [r0 + 2 * r1] | |
4168 | punpcklwd m1, m4 ;m1=[5 6] | |
4169 | punpcklqdq m3, m1 ;m3=[4 5 5 6] | |
4170 | pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4 | |
4171 | paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 | |
4172 | %endmacro | |
4173 | ||
4174 | ;------------------------------------------------------------------------------------------------------------------- | |
4175 | ; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; Generates one 4-tap vertical chroma filter function for a 2-pixel-wide block that reads 16-bit input
; and writes 8-bit pixels, four rows per loop iteration.
;   %1 = block width (only used in the function name), %2 = block height.
; Registers: r0 = src, r1 = srcStride (doubled to bytes), r2 = dst, r3 = dstStride,
;   r4 = coeffIdx and later the row-group counter, r5 = selected coefficient set,
;   m5 = constant added to every sum before the shift.
4176 | ;------------------------------------------------------------------------------------------------------------------- | |
4177 | %macro FILTER_VER_CHROMA_SP_W2_4R 2 | |
4178 | INIT_XMM sse4 | |
4179 | cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6 | |
4180 | ||
; src holds 16-bit values, so srcStride is doubled to a byte stride (assumes the caller passes it in elements).
; NOTE(review): on x86-64 the 32-bit add zero-extends, so this relies on a non-negative stride below 2^31.
4181 | add r1d, r1d | |
; Move src back one row so the 4-tap window starts one row above the first output row.
4182 | sub r0, r1 | |
; r4 = coeffIdx * 32, the byte offset of one coefficient set (two 16-byte rows).
4183 | shl r4d, 5 | |
4184 | ||
; r5 = tab_ChromaCoeffV + coeffIdx * 32; the PIC build loads the table address into a register first.
4185 | %ifdef PIC | |
4186 | lea r5, [tab_ChromaCoeffV] | |
4187 | lea r5, [r5 + r4] | |
4188 | %else | |
4189 | lea r5, [tab_ChromaCoeffV + r4] | |
4190 | %endif | |
4191 | ||
; m5 = tab_c_526336 (defined outside this section; presumably rounding term plus bias correction - confirm).
4192 | mova m5, [tab_c_526336] | |
4193 | ||
; r4d = number of four-row groups.
4194 | mov r4d, (%2/4) | |
4195 | ||
4196 | .loopH: | |
; m0 = sums for rows 1-2, m2 = sums for rows 3-4. The macro leaves r0 four rows further down, which is
; already the start of the next group, so src needs no adjustment in this loop.
4197 | PROCESS_CHROMA_SP_W2_4R r5 | |
4198 | ||
4199 | paddd m0, m5 | |
4200 | paddd m2, m5 | |
4201 | ||
4202 | psrad m0, 12 | |
4203 | psrad m2, 12 | |
4204 | ||
; Narrow to bytes with saturation: words 0..3 of m0 then each hold one output row of two pixels.
4205 | packssdw m0, m2 | |
4206 | packuswb m0, m0 | |
4207 | ||
4208 | pextrw [r2], m0, 0 | |
4209 | pextrw [r2 + r3], m0, 1 | |
4210 | lea r2, [r2 + 2 * r3] | |
4211 | pextrw [r2], m0, 2 | |
4212 | pextrw [r2 + r3], m0, 3 | |
4213 | ||
4214 | lea r2, [r2 + 2 * r3] | |
4215 | ||
4216 | dec r4d | |
4217 | jnz .loopH | |
4218 | ||
4219 | RET | |
4220 | %endmacro | |
4221 | ||
; Instantiations of FILTER_VER_CHROMA_SP_W2_4R (width, height): 2x4, 2x8 and 2x16.
4222 | FILTER_VER_CHROMA_SP_W2_4R 2, 4 | |
4223 | FILTER_VER_CHROMA_SP_W2_4R 2, 8 | |
4224 | ||
4225 | FILTER_VER_CHROMA_SP_W2_4R 2, 16 | |
4226 | ||
4227 | ;-------------------------------------------------------------------------------------------------------------- | |
4228 | ; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; 4-tap vertical chroma filter for a single 4x2 block: reads 16-bit input, writes 8-bit pixels, no loop.
; Registers: r0 = src, r1 = srcStride (doubled to bytes), r2 = dst, r3 = dstStride, r4 = coeffIdx,
;   r5 = selected coefficient set, m4 = constant added to every sum before the shift.
; Five input rows are read, starting one row above src. The numbers in the comments below are input row indices.
4229 | ;-------------------------------------------------------------------------------------------------------------- | |
4230 | INIT_XMM sse4 | |
4231 | cglobal interp_4tap_vert_sp_4x2, 5, 6, 5 | |
4232 | ||
; src holds 16-bit values, so srcStride is doubled to a byte stride (assumes the caller passes it in elements).
; NOTE(review): on x86-64 the 32-bit add zero-extends, so this relies on a non-negative stride below 2^31.
4233 | add r1d, r1d | |
4234 | sub r0, r1 | |
; r4 = coeffIdx * 32, the byte offset of one coefficient set (two 16-byte rows).
4235 | shl r4d, 5 | |
4236 | ||
4237 | %ifdef PIC | |
4238 | lea r5, [tab_ChromaCoeffV] | |
4239 | lea r5, [r5 + r4] | |
4240 | %else | |
4241 | lea r5, [tab_ChromaCoeffV + r4] | |
4242 | %endif | |
4243 | ||
; m4 = tab_c_526336 (defined outside this section; presumably rounding term plus bias correction - confirm).
4244 | mova m4, [tab_c_526336] | |
4245 | ||
4246 | movq m0, [r0] | |
4247 | movq m1, [r0 + r1] | |
4248 | punpcklwd m0, m1 ;m0=[0 1] | |
4249 | pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 | |
4250 | ||
4251 | lea r0, [r0 + 2 * r1] | |
4252 | movq m2, [r0] | |
4253 | punpcklwd m1, m2 ;m1=[1 2] | |
4254 | pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 | |
4255 | ||
4256 | movq m3, [r0 + r1] | |
4257 | punpcklwd m2, m3 ;m2=[2 3] | |
4258 | pmaddwd m2, [r5 + 1 * 16] | |
4259 | paddd m0, m2 ;m0=[0+1+2+3] Row1 done | |
4260 | paddd m0, m4 | |
4261 | psrad m0, 12 | |
4262 | ||
4263 | movq m2, [r0 + 2 * r1] | |
4264 | punpcklwd m3, m2 ;m3=[3 4] | |
4265 | pmaddwd m3, [r5 + 1 * 16] | |
4266 | paddd m1, m3 ;m1 = [1+2+3+4] Row2 done | |
4267 | paddd m1, m4 | |
4268 | psrad m1, 12 | |
4269 | ||
; Narrow to bytes with saturation: dword 0 of m0 = output row 1, dword 1 = output row 2.
4270 | packssdw m0, m1 | |
4271 | packuswb m0, m0 | |
4272 | ||
4273 | movd [r2], m0 | |
4274 | pextrd [r2 + r3], m0, 1 | |
4275 | ||
4276 | RET | |
4277 | ||
4278 | ;------------------------------------------------------------------------------------------------------------------- | |
4279 | ; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
; Generates one 4-tap vertical chroma filter function for a 6-pixel-wide block that reads 16-bit input
; and writes 8-bit pixels. Each four-row group is filtered as a 4-wide part (PROCESS_CHROMA_SP_W4_4R)
; followed by a 2-wide part (PROCESS_CHROMA_SP_W2_4R) for columns 4 and 5.
;   %1 = unused (the width 6 is hard-coded in the function name), %2 = block height.
; Registers: r0 = src, r1 = srcStride (doubled to bytes), r2 = dst, r3 = dstStride,
;   r4 = coeffIdx and later the row-group counter, r5 = scratch, r6 = selected coefficient set,
;   m6 = constant added to every sum before the shift.
4280 | ;------------------------------------------------------------------------------------------------------------------- | |
4281 | %macro FILTER_VER_CHROMA_SP_W6_H4 2 | |
4282 | INIT_XMM sse4 | |
4283 | cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7 | |
4284 | ||
; src holds 16-bit values, so srcStride is doubled to a byte stride (assumes the caller passes it in elements).
; NOTE(review): on x86-64 the 32-bit add zero-extends, so this relies on a non-negative stride below 2^31.
4285 | add r1d, r1d | |
; Move src back one row so the 4-tap window starts one row above the first output row.
4286 | sub r0, r1 | |
; r4 = coeffIdx * 32, the byte offset of one coefficient set (two 16-byte rows).
4287 | shl r4d, 5 | |
4288 | ||
; r6 = tab_ChromaCoeffV + coeffIdx * 32; the PIC build loads the table address into a register first.
4289 | %ifdef PIC | |
4290 | lea r5, [tab_ChromaCoeffV] | |
4291 | lea r6, [r5 + r4] | |
4292 | %else | |
4293 | lea r6, [tab_ChromaCoeffV + r4] | |
4294 | %endif | |
4295 | ||
; m6 = tab_c_526336 (defined outside this section; presumably rounding term plus bias correction - confirm).
4296 | mova m6, [tab_c_526336] | |
4297 | ||
; r4d = number of four-row groups.
4298 | mov r4d, %2/4 | |
4299 | ||
4300 | .loopH: | |
; Columns 0-3: m0..m3 = 32-bit sums for the four output rows; r0 ends up four rows further down.
4301 | PROCESS_CHROMA_SP_W4_4R | |
4302 | ||
4303 | paddd m0, m6 | |
4304 | paddd m1, m6 | |
4305 | paddd m2, m6 | |
4306 | paddd m3, m6 | |
4307 | ||
4308 | psrad m0, 12 | |
4309 | psrad m1, 12 | |
4310 | psrad m2, 12 | |
4311 | psrad m3, 12 | |
4312 | ||
; Narrow to bytes with saturation: m0 then holds four rows of four pixels, one dword per row.
4313 | packssdw m0, m1 | |
4314 | packssdw m2, m3 | |
4315 | ||
4316 | packuswb m0, m2 | |
4317 | ||
4318 | movd [r2], m0 | |
4319 | pextrd [r2 + r3], m0, 1 | |
4320 | lea r5, [r2 + 2 * r3] | |
4321 | pextrd [r5], m0, 2 | |
4322 | pextrd [r5 + r3], m0, 3 | |
4323 | ||
; Rewind src by four rows and step four 16-bit samples right; advance dst by four pixels.
4324 | lea r5, [4 * r1 - 2 * 4] | |
4325 | sub r0, r5 | |
4326 | add r2, 4 | |
4327 | ||
; Columns 4-5: m0 = sums for rows 1-2, m2 = sums for rows 3-4; r0 again ends up four rows further down.
4328 | PROCESS_CHROMA_SP_W2_4R r6 | |
4329 | ||
4330 | paddd m0, m6 | |
4331 | paddd m2, m6 | |
4332 | ||
4333 | psrad m0, 12 | |
4334 | psrad m2, 12 | |
4335 | ||
; Words 0..3 of m0 each hold one output row of two pixels.
4336 | packssdw m0, m2 | |
4337 | packuswb m0, m0 | |
4338 | ||
4339 | pextrw [r2], m0, 0 | |
4340 | pextrw [r2 + r3], m0, 1 | |
4341 | lea r2, [r2 + 2 * r3] | |
4342 | pextrw [r2], m0, 2 | |
4343 | pextrw [r2 + r3], m0, 3 | |
4344 | ||
; Next group: src is already four rows down, so only step it back to column 0; dst moves down the
; remaining two rows and back to column 0.
4345 | sub r0, 2 * 4 | |
4346 | lea r2, [r2 + 2 * r3 - 4] | |
4347 | ||
4348 | dec r4d | |
4349 | jnz .loopH | |
4350 | ||
4351 | RET | |
4352 | %endmacro | |
4353 | ||
4354 | FILTER_VER_CHROMA_SP_W6_H4 6, 8 | |
4355 | ||
4356 | FILTER_VER_CHROMA_SP_W6_H4 6, 16 | |
4357 | ||
; PROCESS_CHROMA_SP_W8_2R: 4-tap vertical sums for two output rows of eight
; 16-bit samples each.
; In:   r0 = first source row, r1 = row stride in bytes, r5 = coefficient entry
;       (two 16-byte vectors at +0*16 and +1*16).
; Out:  m0/m1 = low/high four 32-bit sums of output row 1,
;       m2/m3 = low/high four 32-bit sums of output row 2 (no rounding/shift).
;       r0 is left advanced by two rows.
; Clobbers m4, m5, m6.
4358 | %macro PROCESS_CHROMA_SP_W8_2R 0 | |
4359 | movu m1, [r0] | |
4360 | movu m3, [r0 + r1] | |
4361 | punpcklwd m0, m1, m3 | |
4362 | pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l | |
4363 | punpckhwd m1, m3 | |
4364 | pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h | |
4365 | ||
4366 | movu m4, [r0 + 2 * r1] | |
4367 | punpcklwd m2, m3, m4 | |
4368 | pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l | |
4369 | punpckhwd m3, m4 | |
4370 | pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h | |
4371 | ||
4372 | lea r0, [r0 + 2 * r1] | |
4373 | movu m5, [r0 + r1] | |
4374 | punpcklwd m6, m4, m5 | |
4375 | pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l | |
4376 | paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum | |
4377 | punpckhwd m4, m5 | |
4378 | pmaddwd m4, [r5 + 1 * 16] ;m4 = [2h+3h] Row1h | |
4379 | paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum | |
4380 | ||
4381 | movu m4, [r0 + 2 * r1] | |
4382 | punpcklwd m6, m5, m4 | |
4383 | pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l | |
4384 | paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum | |
4385 | punpckhwd m5, m4 | |
4386 | pmaddwd m5, [r5 + 1 * 16] ;m5 = [3h+4h] Row2h | |
4387 | paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum | |
4388 | %endmacro | |
4389 | ||
4390 | ;-------------------------------------------------------------------------------------------------------------- | |
4391 | ; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) | |
4392 | ;-------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 = width used in the symbol name (every instantiation
; below passes 8), %2 = height, processed two rows per iteration.
; Register roles: r0 = src, r1 = src stride (doubled), r2 = dst,
; r3 = dst stride, r4 = coeffIdx then loop counter, r5 = coefficient entry,
; m7 = constant added before the shift by 12.
4393 | %macro FILTER_VER_CHROMA_SP_W8_H2 2 | |
4394 | INIT_XMM sse2 | |
4395 | cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8 | |
4396 | ||
4397 | add r1d, r1d | |
4398 | sub r0, r1 | |
4399 | shl r4d, 5 | |
4400 | ||
4401 | %ifdef PIC | |
4402 | lea r5, [tab_ChromaCoeffV] | |
4403 | lea r5, [r5 + r4] | |
4404 | %else | |
4405 | lea r5, [tab_ChromaCoeffV + r4] | |
4406 | %endif | |
4407 | ||
4408 | mova m7, [tab_c_526336] | |
4409 | ||
4410 | mov r4d, %2/2 | |
4411 | .loopH: | |
; The helper returns row 1 in m0/m1 and row 2 in m2/m3 and advances r0 by two
; rows, so only dst needs stepping at the bottom of the loop.
4412 | PROCESS_CHROMA_SP_W8_2R | |
4413 | ||
4414 | paddd m0, m7 | |
4415 | paddd m1, m7 | |
4416 | paddd m2, m7 | |
4417 | paddd m3, m7 | |
4418 | ||
4419 | psrad m0, 12 | |
4420 | psrad m1, 12 | |
4421 | psrad m2, 12 | |
4422 | psrad m3, 12 | |
4423 | ||
4424 | packssdw m0, m1 | |
4425 | packssdw m2, m3 | |
4426 | ||
; After the byte pack the low 8 bytes of m0 are row 1 and the high 8 bytes row 2.
4427 | packuswb m0, m2 | |
4428 | ||
4429 | movlps [r2], m0 | |
4430 | movhps [r2 + r3], m0 | |
4431 | ||
4432 | lea r2, [r2 + 2 * r3] | |
4433 | ||
4434 | dec r4d | |
4435 | jnz .loopH | |
4436 | ||
4437 | RET | |
4438 | %endmacro | |
4439 | ||
; Instantiations (all heights are even, as the %2/2 loop count requires).
4440 | FILTER_VER_CHROMA_SP_W8_H2 8, 2 | |
4441 | FILTER_VER_CHROMA_SP_W8_H2 8, 4 | |
4442 | FILTER_VER_CHROMA_SP_W8_H2 8, 6 | |
4443 | FILTER_VER_CHROMA_SP_W8_H2 8, 8 | |
4444 | FILTER_VER_CHROMA_SP_W8_H2 8, 16 | |
4445 | FILTER_VER_CHROMA_SP_W8_H2 8, 32 | |
4446 | ||
4447 | FILTER_VER_CHROMA_SP_W8_H2 8, 12 | |
4448 | FILTER_VER_CHROMA_SP_W8_H2 8, 64 | |
4449 | ||
4450 | ||
4451 | ;----------------------------------------------------------------------------------------------------------------------------- | |
4452 | ; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
4453 | ;----------------------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 = width used in the symbol name, %2 = height.
; Only four arguments are loaded by the prologue; coeffIdx and isRowExt are
; read through r4m / r5m. r4 is reused as the row counter, r6 is the PIC
; table base.
4454 | %macro FILTER_HORIZ_CHROMA_2xN 2 | |
4455 | INIT_XMM sse4 | |
4456 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride | |
4457 | %define coef2 m3 | |
4458 | %define Tm0 m2 | |
4459 | %define t1 m1 | |
4460 | %define t0 m0 | |
4461 | ||
; Start one byte to the left of the addressed pixel; double the destination
; stride (16-bit output samples).
4462 | dec srcq | |
4463 | mov r4d, r4m | |
4464 | add dststrided, dststrided | |
4465 | ||
; coef2 = the 4-byte tab_ChromaCoeff entry for coeffIdx, broadcast to all dwords.
4466 | %ifdef PIC | |
4467 | lea r6, [tab_ChromaCoeff] | |
4468 | movd coef2, [r6 + r4 * 4] | |
4469 | %else | |
4470 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
4471 | %endif | |
4472 | ||
4473 | pshufd coef2, coef2, 0 | |
; t1 = word constant subtracted from every result (pw_2000 is defined outside
; this chunk); Tm0 = shuffle mask gathering four overlapping 4-byte windows.
4474 | mova t1, [pw_2000] | |
4475 | mova Tm0, [tab_Tm] | |
4476 | ||
; Row count is %2; when isRowExt is non-zero, begin one row earlier and
; process three additional rows.
4477 | mov r4d, %2 | |
4478 | cmp r5m, byte 0 | |
4479 | je .loopH | |
4480 | sub srcq, srcstrideq | |
4481 | add r4d, 3 | |
4482 | ||
4483 | .loopH: | |
; Per row: load 8 bytes, build the tap windows, multiply-add byte pairs, add
; adjacent words to finish each 4-tap sum, subtract the offset, then store the
; first two 16-bit results (4 bytes).
4484 | movh t0, [srcq] | |
4485 | pshufb t0, t0, Tm0 | |
4486 | pmaddubsw t0, coef2 | |
4487 | phaddw t0, t0 | |
4488 | psubw t0, t1 | |
4489 | movd [dstq], t0 | |
4490 | ||
4491 | lea srcq, [srcq + srcstrideq] | |
4492 | lea dstq, [dstq + dststrideq] | |
4493 | ||
4494 | dec r4d | |
4495 | jnz .loopH | |
4496 | ||
4497 | RET | |
4498 | %endmacro | |
4499 | ||
; Instantiations: 2x4, 2x8, 2x16.
4500 | FILTER_HORIZ_CHROMA_2xN 2, 4 | |
4501 | FILTER_HORIZ_CHROMA_2xN 2, 8 | |
4502 | ||
4503 | FILTER_HORIZ_CHROMA_2xN 2, 16 | |
4504 | ||
4505 | ;----------------------------------------------------------------------------------------------------------------------------- | |
4506 | ; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
4507 | ;----------------------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 = width used in the symbol name, %2 = height.
; Same structure as the 2-wide kernel; the only difference in the row loop is
; that all four 16-bit results (8 bytes) are stored.
4508 | %macro FILTER_HORIZ_CHROMA_4xN 2 | |
4509 | INIT_XMM sse4 | |
4510 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride | |
4511 | %define coef2 m3 | |
4512 | %define Tm0 m2 | |
4513 | %define t1 m1 | |
4514 | %define t0 m0 | |
4515 | ||
; Start one byte to the left; coeffIdx comes from r4m; output stride doubled
; for 16-bit samples.
4516 | dec srcq | |
4517 | mov r4d, r4m | |
4518 | add dststrided, dststrided | |
4519 | ||
; coef2 = 4-byte coefficient entry, broadcast below.
4520 | %ifdef PIC | |
4521 | lea r6, [tab_ChromaCoeff] | |
4522 | movd coef2, [r6 + r4 * 4] | |
4523 | %else | |
4524 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
4525 | %endif | |
4526 | ||
4527 | pshufd coef2, coef2, 0 | |
4528 | mova t1, [pw_2000] | |
4529 | mova Tm0, [tab_Tm] | |
4530 | ||
; r4 = row counter; a non-zero isRowExt (r5m) starts one row earlier and adds
; three rows.
4531 | mov r4d, %2 | |
4532 | cmp r5m, byte 0 | |
4533 | je .loopH | |
4534 | sub srcq, srcstrideq | |
4535 | add r4d, 3 | |
4536 | ||
4537 | .loopH: | |
; One row: four 4-tap sums in the low four words of t0, offset by t1, stored
; as 8 bytes.
4538 | movh t0, [srcq] | |
4539 | pshufb t0, t0, Tm0 | |
4540 | pmaddubsw t0, coef2 | |
4541 | phaddw t0, t0 | |
4542 | psubw t0, t1 | |
4543 | movlps [dstq], t0 | |
4544 | ||
4545 | lea srcq, [srcq + srcstrideq] | |
4546 | lea dstq, [dstq + dststrideq] | |
4547 | ||
4548 | dec r4d | |
4549 | jnz .loopH | |
4550 | RET | |
4551 | %endmacro | |
4552 | ||
; Instantiations: 4x2, 4x4, 4x8, 4x16, 4x32.
4553 | FILTER_HORIZ_CHROMA_4xN 4, 2 | |
4554 | FILTER_HORIZ_CHROMA_4xN 4, 4 | |
4555 | FILTER_HORIZ_CHROMA_4xN 4, 8 | |
4556 | FILTER_HORIZ_CHROMA_4xN 4, 16 | |
4557 | ||
4558 | FILTER_HORIZ_CHROMA_4xN 4, 32 | |
4559 | ||
; PROCESS_CHROMA_W6 tmp, acc, offset
; Filters one row and stores six 16-bit results at [dstq].
;   %1 = scratch register (receives the 16 source bytes), %2 = result register,
;   %3 = word constant subtracted from the sums.
; Relies on srcq, dstq, Tm0, Tm1 and coef2 being set up by the caller.
4560 | %macro PROCESS_CHROMA_W6 3 | |
4561 | movu %1, [srcq] | |
4562 | pshufb %2, %1, Tm0 | |
4563 | pmaddubsw %2, coef2 | |
4564 | pshufb %1, %1, Tm1 | |
4565 | pmaddubsw %1, coef2 | |
; %2 now holds eight sums (Tm0 windows in the low half, Tm1 windows in the high half).
4566 | phaddw %2, %1 | |
4567 | psubw %2, %3 | |
; Store results 0-3 (8 bytes), then move dword 2 down and store results 4-5.
4568 | movh [dstq], %2 | |
4569 | pshufd %2, %2, 2 | |
4570 | movd [dstq + 8], %2 | |
4571 | %endmacro | |
4572 | ||
; PROCESS_CHROMA_W12 tmp, acc, offset
; Filters one row and stores twelve 16-bit results at [dstq].
;   %1 = scratch register, %2 = result register, %3 = word constant subtracted
;   from the sums. Uses srcq, dstq, Tm0, Tm1 and coef2 from the caller.
4573 | %macro PROCESS_CHROMA_W12 3 | |
; Results 0-7 from the 16 bytes at srcq.
4574 | movu %1, [srcq] | |
4575 | pshufb %2, %1, Tm0 | |
4576 | pmaddubsw %2, coef2 | |
4577 | pshufb %1, %1, Tm1 | |
4578 | pmaddubsw %1, coef2 | |
4579 | phaddw %2, %1 | |
4580 | psubw %2, %3 | |
4581 | movu [dstq], %2 | |
; Results 8-11 from the bytes at srcq + 8; only the Tm0 windows are needed and
; the low four words are stored.
4582 | movu %1, [srcq + 8] | |
4583 | pshufb %1, %1, Tm0 | |
4584 | pmaddubsw %1, coef2 | |
4585 | phaddw %1, %1 | |
4586 | psubw %1, %3 | |
4587 | movh [dstq + 16], %1 | |
4588 | %endmacro | |
4589 | ||
4590 | ;----------------------------------------------------------------------------------------------------------------------------- | |
4591 | ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
4592 | ;----------------------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 = width (selects the PROCESS_CHROMA_W%1 row helper; the
; instantiations below use 6 and 12), %2 = height.
4593 | %macro FILTER_HORIZ_CHROMA 2 | |
4594 | INIT_XMM sse4 | |
4595 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride | |
4596 | %define coef2 m5 | |
4597 | %define Tm0 m4 | |
4598 | %define Tm1 m3 | |
4599 | %define t2 m2 | |
4600 | %define t1 m1 | |
4601 | %define t0 m0 | |
4602 | ||
; Start one byte to the left; coeffIdx comes from r4m; output stride doubled
; for 16-bit samples.
4603 | dec srcq | |
4604 | mov r4d, r4m | |
4605 | add dststrided, dststrided | |
4606 | ||
4607 | %ifdef PIC | |
4608 | lea r6, [tab_ChromaCoeff] | |
4609 | movd coef2, [r6 + r4 * 4] | |
4610 | %else | |
4611 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
4612 | %endif | |
4613 | ||
; Broadcast the taps; t2 = offset constant; Tm0/Tm1 = the first and second
; 16-byte rows of tab_Tm (windows starting at bytes 0-3 and 4-7).
4614 | pshufd coef2, coef2, 0 | |
4615 | mova t2, [pw_2000] | |
4616 | mova Tm0, [tab_Tm] | |
4617 | mova Tm1, [tab_Tm + 16] | |
4618 | ||
; r4 = row counter; a non-zero isRowExt (r5m) starts one row earlier and adds
; three rows.
4619 | mov r4d, %2 | |
4620 | cmp r5m, byte 0 | |
4621 | je .loopH | |
4622 | sub srcq, srcstrideq | |
4623 | add r4d, 3 | |
4624 | ||
4625 | .loopH: | |
4626 | PROCESS_CHROMA_W%1 t0, t1, t2 | |
4627 | add srcq, srcstrideq | |
4628 | add dstq, dststrideq | |
4629 | ||
4630 | dec r4d | |
4631 | jnz .loopH | |
4632 | ||
4633 | RET | |
4634 | %endmacro | |
4635 | ||
; Instantiations: 6x8, 12x16, 6x16, 12x32.
4636 | FILTER_HORIZ_CHROMA 6, 8 | |
4637 | FILTER_HORIZ_CHROMA 12, 16 | |
4638 | ||
4639 | FILTER_HORIZ_CHROMA 6, 16 | |
4640 | FILTER_HORIZ_CHROMA 12, 32 | |
4641 | ||
; PROCESS_CHROMA_W8 tmp, acc, offset
; Filters one row and stores eight 16-bit results (16 bytes) at [dstq].
;   %1 = scratch register, %2 = result register, %3 = word constant subtracted
;   from the sums. Uses srcq, dstq, Tm0, Tm1 and coef2 from the caller.
4642 | %macro PROCESS_CHROMA_W8 3 | |
4643 | movu %1, [srcq] | |
4644 | pshufb %2, %1, Tm0 | |
4645 | pmaddubsw %2, coef2 | |
4646 | pshufb %1, %1, Tm1 | |
4647 | pmaddubsw %1, coef2 | |
4648 | phaddw %2, %1 | |
4649 | psubw %2, %3 | |
4650 | movu [dstq], %2 | |
4651 | %endmacro | |
4652 | ||
4653 | ;----------------------------------------------------------------------------------------------------------------------------- | |
4654 | ; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
4655 | ;----------------------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 = width used in the symbol name (8 in every
; instantiation below), %2 = height. Each row is handled by PROCESS_CHROMA_W8.
4656 | %macro FILTER_HORIZ_CHROMA_8xN 2 | |
4657 | INIT_XMM sse4 | |
4658 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride | |
4659 | %define coef2 m5 | |
4660 | %define Tm0 m4 | |
4661 | %define Tm1 m3 | |
4662 | %define t2 m2 | |
4663 | %define t1 m1 | |
4664 | %define t0 m0 | |
4665 | ||
; Start one byte to the left; coeffIdx comes from r4m; output stride doubled
; for 16-bit samples.
4666 | dec srcq | |
4667 | mov r4d, r4m | |
4668 | add dststrided, dststrided | |
4669 | ||
4670 | %ifdef PIC | |
4671 | lea r6, [tab_ChromaCoeff] | |
4672 | movd coef2, [r6 + r4 * 4] | |
4673 | %else | |
4674 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
4675 | %endif | |
4676 | ||
; Broadcast the taps; load the offset constant and both shuffle masks.
4677 | pshufd coef2, coef2, 0 | |
4678 | mova t2, [pw_2000] | |
4679 | mova Tm0, [tab_Tm] | |
4680 | mova Tm1, [tab_Tm + 16] | |
4681 | ||
; r4 = row counter; a non-zero isRowExt (r5m) starts one row earlier and adds
; three rows.
4682 | mov r4d, %2 | |
4683 | cmp r5m, byte 0 | |
4684 | je .loopH | |
4685 | sub srcq, srcstrideq | |
4686 | add r4d, 3 | |
4687 | ||
4688 | .loopH: | |
4689 | PROCESS_CHROMA_W8 t0, t1, t2 | |
4690 | add srcq, srcstrideq | |
4691 | add dstq, dststrideq | |
4692 | ||
4693 | dec r4d | |
4694 | jnz .loopH | |
4695 | ||
4696 | RET | |
4697 | %endmacro | |
4698 | ||
; Instantiations: 8x2, 8x4, 8x6, 8x8, 8x16, 8x32, 8x12, 8x64.
4699 | FILTER_HORIZ_CHROMA_8xN 8, 2 | |
4700 | FILTER_HORIZ_CHROMA_8xN 8, 4 | |
4701 | FILTER_HORIZ_CHROMA_8xN 8, 6 | |
4702 | FILTER_HORIZ_CHROMA_8xN 8, 8 | |
4703 | FILTER_HORIZ_CHROMA_8xN 8, 16 | |
4704 | FILTER_HORIZ_CHROMA_8xN 8, 32 | |
4705 | ||
4706 | FILTER_HORIZ_CHROMA_8xN 8, 12 | |
4707 | FILTER_HORIZ_CHROMA_8xN 8, 64 | |
4708 | ||
; PROCESS_CHROMA_W16 tmp, acc0, offset, acc1
; Filters one row and stores sixteen 16-bit results (32 bytes) at [dstq].
;   %1 = scratch register, %2 / %4 = result registers for outputs 0-7 / 8-15,
;   %3 = word constant subtracted from the sums.
; Uses srcq, dstq, Tm0, Tm1 and coef2 from the caller.
4709 | %macro PROCESS_CHROMA_W16 4 | |
; Outputs 0-7.
4710 | movu %1, [srcq] | |
4711 | pshufb %2, %1, Tm0 | |
4712 | pmaddubsw %2, coef2 | |
4713 | pshufb %1, %1, Tm1 | |
4714 | pmaddubsw %1, coef2 | |
4715 | phaddw %2, %1 | |
; Outputs 8-15.
4716 | movu %1, [srcq + 8] | |
4717 | pshufb %4, %1, Tm0 | |
4718 | pmaddubsw %4, coef2 | |
4719 | pshufb %1, %1, Tm1 | |
4720 | pmaddubsw %1, coef2 | |
4721 | phaddw %4, %1 | |
4722 | psubw %2, %3 | |
4723 | psubw %4, %3 | |
4724 | movu [dstq], %2 | |
4725 | movu [dstq + 16], %4 | |
4726 | %endmacro | |
4727 | ||
; PROCESS_CHROMA_W24 tmp, acc0, offset, acc1
; Filters one row and stores twenty-four 16-bit results (48 bytes) at [dstq].
;   %1 = scratch register, %2 / %4 = result registers, %3 = word constant
;   subtracted from the sums. Uses srcq, dstq, Tm0, Tm1, coef2 from the caller.
4728 | %macro PROCESS_CHROMA_W24 4 | |
; Outputs 0-7.
4729 | movu %1, [srcq] | |
4730 | pshufb %2, %1, Tm0 | |
4731 | pmaddubsw %2, coef2 | |
4732 | pshufb %1, %1, Tm1 | |
4733 | pmaddubsw %1, coef2 | |
4734 | phaddw %2, %1 | |
; Outputs 8-15.
4735 | movu %1, [srcq + 8] | |
4736 | pshufb %4, %1, Tm0 | |
4737 | pmaddubsw %4, coef2 | |
4738 | pshufb %1, %1, Tm1 | |
4739 | pmaddubsw %1, coef2 | |
4740 | phaddw %4, %1 | |
4741 | psubw %2, %3 | |
4742 | psubw %4, %3 | |
4743 | movu [dstq], %2 | |
4744 | movu [dstq + 16], %4 | |
; Outputs 16-23 (%2 is reused once its first result has been stored).
4745 | movu %1, [srcq + 16] | |
4746 | pshufb %2, %1, Tm0 | |
4747 | pmaddubsw %2, coef2 | |
4748 | pshufb %1, %1, Tm1 | |
4749 | pmaddubsw %1, coef2 | |
4750 | phaddw %2, %1 | |
4751 | psubw %2, %3 | |
4752 | movu [dstq + 32], %2 | |
4753 | %endmacro | |
4754 | ||
; PROCESS_CHROMA_W32 tmp, acc0, offset, acc1
; Filters one row and stores thirty-two 16-bit results (64 bytes) at [dstq].
;   %1 = scratch register, %2 / %4 = result registers, %3 = word constant
;   subtracted from the sums. Uses srcq, dstq, Tm0, Tm1, coef2 from the caller.
4755 | %macro PROCESS_CHROMA_W32 4 | |
; Outputs 0-7.
4756 | movu %1, [srcq] | |
4757 | pshufb %2, %1, Tm0 | |
4758 | pmaddubsw %2, coef2 | |
4759 | pshufb %1, %1, Tm1 | |
4760 | pmaddubsw %1, coef2 | |
4761 | phaddw %2, %1 | |
; Outputs 8-15.
4762 | movu %1, [srcq + 8] | |
4763 | pshufb %4, %1, Tm0 | |
4764 | pmaddubsw %4, coef2 | |
4765 | pshufb %1, %1, Tm1 | |
4766 | pmaddubsw %1, coef2 | |
4767 | phaddw %4, %1 | |
4768 | psubw %2, %3 | |
4769 | psubw %4, %3 | |
4770 | movu [dstq], %2 | |
4771 | movu [dstq + 16], %4 | |
; Outputs 16-23.
4772 | movu %1, [srcq + 16] | |
4773 | pshufb %2, %1, Tm0 | |
4774 | pmaddubsw %2, coef2 | |
4775 | pshufb %1, %1, Tm1 | |
4776 | pmaddubsw %1, coef2 | |
4777 | phaddw %2, %1 | |
; Outputs 24-31.
4778 | movu %1, [srcq + 24] | |
4779 | pshufb %4, %1, Tm0 | |
4780 | pmaddubsw %4, coef2 | |
4781 | pshufb %1, %1, Tm1 | |
4782 | pmaddubsw %1, coef2 | |
4783 | phaddw %4, %1 | |
4784 | psubw %2, %3 | |
4785 | psubw %4, %3 | |
4786 | movu [dstq + 32], %2 | |
4787 | movu [dstq + 48], %4 | |
4788 | %endmacro | |
4789 | ||
; PROCESS_CHROMA_W16o tmp, acc0, offset, acc1, col
; Same work as the 16-wide row helper, but at a column offset: reads source
; bytes starting at srcq + %5 and writes sixteen 16-bit results starting at
; dstq + %5 * 2.
;   %1 = scratch register, %2 / %4 = result registers, %3 = word constant
;   subtracted from the sums, %5 = column offset in pixels.
4790 | %macro PROCESS_CHROMA_W16o 5 | |
4791 | movu %1, [srcq + %5] | |
4792 | pshufb %2, %1, Tm0 | |
4793 | pmaddubsw %2, coef2 | |
4794 | pshufb %1, %1, Tm1 | |
4795 | pmaddubsw %1, coef2 | |
4796 | phaddw %2, %1 | |
4797 | movu %1, [srcq + %5 + 8] | |
4798 | pshufb %4, %1, Tm0 | |
4799 | pmaddubsw %4, coef2 | |
4800 | pshufb %1, %1, Tm1 | |
4801 | pmaddubsw %1, coef2 | |
4802 | phaddw %4, %1 | |
4803 | psubw %2, %3 | |
4804 | psubw %4, %3 | |
4805 | movu [dstq + %5 * 2], %2 | |
4806 | movu [dstq + %5 * 2 + 16], %4 | |
4807 | %endmacro | |
4808 | ||
; PROCESS_CHROMA_W48 tmp, acc0, offset, acc1
; One 48-pixel row, done as three 16-pixel pieces at column offsets 0, 16, 32.
4809 | %macro PROCESS_CHROMA_W48 4 | |
4810 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 | |
4811 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 | |
4812 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 | |
4813 | %endmacro | |
4814 | ||
; PROCESS_CHROMA_W64 tmp, acc0, offset, acc1
; One 64-pixel row, done as four 16-pixel pieces at column offsets 0, 16, 32, 48.
4815 | %macro PROCESS_CHROMA_W64 4 | |
4816 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 | |
4817 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 | |
4818 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 | |
4819 | PROCESS_CHROMA_W16o %1, %2, %3, %4, 48 | |
4820 | %endmacro | |
4821 | ||
4822 | ;------------------------------------------------------------------------------------------------------------------------------ | |
4823 | ; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) | |
4824 | ;------------------------------------------------------------------------------------------------------------------------------ | |
; Macro arguments: %1 = width (selects the four-operand PROCESS_CHROMA_W%1 row
; helper: 16, 24, 32, 48 or 64 below), %2 = height.
4825 | %macro FILTER_HORIZ_CHROMA_WxN 2 | |
4826 | INIT_XMM sse4 | |
4827 | cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride | |
4828 | %define coef2 m6 | |
4829 | %define Tm0 m5 | |
4830 | %define Tm1 m4 | |
4831 | %define t3 m3 | |
4832 | %define t2 m2 | |
4833 | %define t1 m1 | |
4834 | %define t0 m0 | |
4835 | ||
; Start one byte to the left; coeffIdx comes from r4m; output stride doubled
; for 16-bit samples.
4836 | dec srcq | |
4837 | mov r4d, r4m | |
4838 | add dststrided, dststrided | |
4839 | ||
4840 | %ifdef PIC | |
4841 | lea r6, [tab_ChromaCoeff] | |
4842 | movd coef2, [r6 + r4 * 4] | |
4843 | %else | |
4844 | movd coef2, [tab_ChromaCoeff + r4 * 4] | |
4845 | %endif | |
4846 | ||
; Broadcast the taps; t2 = offset constant; Tm0/Tm1 = shuffle masks.
4847 | pshufd coef2, coef2, 0 | |
4848 | mova t2, [pw_2000] | |
4849 | mova Tm0, [tab_Tm] | |
4850 | mova Tm1, [tab_Tm + 16] | |
4851 | ||
; r4 = row counter; a non-zero isRowExt (r5m) starts one row earlier and adds
; three rows.
4852 | mov r4d, %2 | |
4853 | cmp r5m, byte 0 | |
4854 | je .loopH | |
4855 | sub srcq, srcstrideq | |
4856 | add r4d, 3 | |
4857 | ||
4858 | .loopH: | |
; t0 = scratch, t1/t3 = result registers, t2 = offset constant.
4859 | PROCESS_CHROMA_W%1 t0, t1, t2, t3 | |
4860 | add srcq, srcstrideq | |
4861 | add dstq, dststrideq | |
4862 | ||
4863 | dec r4d | |
4864 | jnz .loopH | |
4865 | ||
4866 | RET | |
4867 | %endmacro | |
4868 | ||
; Instantiations, listed as width, height.
4869 | FILTER_HORIZ_CHROMA_WxN 16, 4 | |
4870 | FILTER_HORIZ_CHROMA_WxN 16, 8 | |
4871 | FILTER_HORIZ_CHROMA_WxN 16, 12 | |
4872 | FILTER_HORIZ_CHROMA_WxN 16, 16 | |
4873 | FILTER_HORIZ_CHROMA_WxN 16, 32 | |
4874 | FILTER_HORIZ_CHROMA_WxN 24, 32 | |
4875 | FILTER_HORIZ_CHROMA_WxN 32, 8 | |
4876 | FILTER_HORIZ_CHROMA_WxN 32, 16 | |
4877 | FILTER_HORIZ_CHROMA_WxN 32, 24 | |
4878 | FILTER_HORIZ_CHROMA_WxN 32, 32 | |
4879 | ||
4880 | FILTER_HORIZ_CHROMA_WxN 16, 24 | |
4881 | FILTER_HORIZ_CHROMA_WxN 16, 64 | |
4882 | FILTER_HORIZ_CHROMA_WxN 24, 64 | |
4883 | FILTER_HORIZ_CHROMA_WxN 32, 48 | |
4884 | FILTER_HORIZ_CHROMA_WxN 32, 64 | |
4885 | ||
4886 | FILTER_HORIZ_CHROMA_WxN 64, 64 | |
4887 | FILTER_HORIZ_CHROMA_WxN 64, 32 | |
4888 | FILTER_HORIZ_CHROMA_WxN 64, 48 | |
4889 | FILTER_HORIZ_CHROMA_WxN 48, 64 | |
4890 | FILTER_HORIZ_CHROMA_WxN 64, 16 | |
4891 | ||
4892 | ||
4893 | ;--------------------------------------------------------------------------------------------------------------- | |
4894 | ; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
4895 | ;--------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 = width, consumed 16 pixels at a time (inner loop count
; %1/16), %2 = height, consumed two rows at a time (outer loop count %2/2).
; Register roles: r0 = src, r1 = src stride, r2 = dst, r3 = dst stride
; (doubled), r4 = coeffIdx then row-pair counter, r5 = scratch pointer,
; r6 = 16-pixel column counter.
4896 | %macro FILTER_V_PS_W16n 2 | |
4897 | INIT_XMM sse4 | |
4898 | cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 | |
4899 | ||
; coeffIdx comes from r4m; step src back one row; output stride doubled for
; 16-bit samples.
4900 | mov r4d, r4m | |
4901 | sub r0, r1 | |
4902 | add r3d, r3d | |
4903 | ||
4904 | %ifdef PIC | |
4905 | lea r5, [tab_ChromaCoeff] | |
4906 | movd m0, [r5 + r4 * 4] | |
4907 | %else | |
4908 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
4909 | %endif | |
4910 | ||
; m1 / m0 = the 4-byte coefficient entry rearranged by the two halves of tab_Vm.
; NOTE(review): tab_Vm is defined outside this chunk; presumably it replicates
; the (tap0,tap1) and (tap2,tap3) byte pairs -- confirm.
4911 | pshufb m1, m0, [tab_Vm] | |
4912 | pshufb m0, [tab_Vm + 16] | |
4913 | mov r4d, %2/2 | |
4914 | ||
4915 | .loop: | |
4916 | ||
4917 | mov r6d, %1/16 | |
4918 | ||
4919 | .loopW: | |
4920 | ||
; First output row: input rows 0/1 interleaved byte-wise and multiplied by m1,
; input rows 2/3 interleaved and multiplied by m0, then summed.
4921 | movu m2, [r0] | |
4922 | movu m3, [r0 + r1] | |
4923 | ||
4924 | punpcklbw m4, m2, m3 | |
4925 | punpckhbw m2, m3 | |
4926 | ||
4927 | pmaddubsw m4, m1 | |
4928 | pmaddubsw m2, m1 | |
4929 | ||
4930 | lea r5, [r0 + 2 * r1] | |
4931 | movu m5, [r5] | |
4932 | movu m7, [r5 + r1] | |
4933 | ||
4934 | punpcklbw m6, m5, m7 | |
4935 | pmaddubsw m6, m0 | |
4936 | paddw m4, m6 | |
4937 | ||
4938 | punpckhbw m6, m5, m7 | |
4939 | pmaddubsw m6, m0 | |
4940 | paddw m2, m6 | |
4941 | ||
; m6 served as a temporary above, so the offset constant is loaded here on
; every pass.
4942 | mova m6, [pw_2000] | |
4943 | ||
4944 | psubw m4, m6 | |
4945 | psubw m2, m6 | |
4946 | ||
4947 | movu [r2], m4 | |
4948 | movu [r2 + 16], m2 | |
4949 | ||
; Second output row: input rows 1/2 with m1, input rows 3/4 with m0.
4950 | punpcklbw m4, m3, m5 | |
4951 | punpckhbw m3, m5 | |
4952 | ||
4953 | pmaddubsw m4, m1 | |
4954 | pmaddubsw m3, m1 | |
4955 | ||
4956 | movu m5, [r5 + 2 * r1] | |
4957 | ||
4958 | punpcklbw m2, m7, m5 | |
4959 | punpckhbw m7, m5 | |
4960 | ||
4961 | pmaddubsw m2, m0 | |
4962 | pmaddubsw m7, m0 | |
4963 | ||
4964 | paddw m4, m2 | |
4965 | paddw m3, m7 | |
4966 | ||
4967 | psubw m4, m6 | |
4968 | psubw m3, m6 | |
4969 | ||
4970 | movu [r2 + r3], m4 | |
4971 | movu [r2 + r3 + 16], m3 | |
4972 | ||
; Next 16-pixel column: 16 source bytes, 32 destination bytes.
4973 | add r0, 16 | |
4974 | add r2, 32 | |
4975 | dec r6d | |
4976 | jnz .loopW | |
4977 | ||
; Rewind the column advance (%1 source bytes, 2*%1 destination bytes) and move
; both pointers down two rows.
4978 | lea r0, [r0 + r1 * 2 - %1] | |
4979 | lea r2, [r2 + r3 * 2 - %1 * 2] | |
4980 | ||
4981 | dec r4d | |
4982 | jnz .loop | |
4983 | RET | |
4984 | %endmacro | |
4985 | ||
; Instantiations, listed as width, height.
4986 | FILTER_V_PS_W16n 64, 64 | |
4987 | FILTER_V_PS_W16n 64, 32 | |
4988 | FILTER_V_PS_W16n 64, 48 | |
4989 | FILTER_V_PS_W16n 48, 64 | |
4990 | FILTER_V_PS_W16n 64, 16 | |
4991 | ||
4992 | ||
4993 | ;------------------------------------------------------------------------------------------------------------ | |
4994 | ;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
4995 | ;------------------------------------------------------------------------------------------------------------ | |
; Straight-line 2x4 kernel: seven input rows are loaded (4 bytes each) and four
; output rows of two 16-bit samples are stored.
; Register roles: r0 = src, r1 = src stride, r2 = dst, r3 = dst stride
; (doubled), r4 = coeffIdx, r5 = PIC table base then 3 * r1.
4996 | INIT_XMM sse4 | |
4997 | cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 | |
4998 | ||
4999 | mov r4d, r4m | |
5000 | sub r0, r1 | |
5001 | add r3d, r3d | |
5002 | ||
5003 | %ifdef PIC | |
5004 | lea r5, [tab_ChromaCoeff] | |
5005 | movd m0, [r5 + r4 * 4] | |
5006 | %else | |
5007 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
5008 | %endif | |
5009 | ||
; m0 = coefficient bytes rearranged by tab_Cm to match the byte order produced
; by the double interleave below.
; NOTE(review): tab_Cm is defined outside this chunk -- confirm its layout.
5010 | pshufb m0, [tab_Cm] | |
5011 | ||
5012 | lea r5, [3 * r1] | |
5013 | ||
; m2..m5 = input rows 0..3.
5014 | movd m2, [r0] | |
5015 | movd m3, [r0 + r1] | |
5016 | movd m4, [r0 + 2 * r1] | |
5017 | movd m5, [r0 + r5] | |
5018 | ||
; Interleave rows 0/1 and 2/3, then interleave those results, so the four taps
; of each pixel sit in four consecutive bytes.
5019 | punpcklbw m2, m3 | |
5020 | punpcklbw m6, m4, m5 | |
5021 | punpcklbw m2, m6 | |
5022 | ||
5023 | pmaddubsw m2, m0 | |
5024 | ||
; m6 = input row 4; rows 1..4 give the second output row.
5025 | lea r0, [r0 + 4 * r1] | |
5026 | movd m6, [r0] | |
5027 | ||
5028 | punpcklbw m3, m4 | |
5029 | punpcklbw m1, m5, m6 | |
5030 | punpcklbw m3, m1 | |
5031 | ||
5032 | pmaddubsw m3, m0 | |
; After phaddw the low half of m2 holds output row 0 and the high half row 1.
5033 | phaddw m2, m3 | |
5034 | ||
5035 | mova m1, [pw_2000] | |
5036 | ||
5037 | psubw m2, m1 | |
5038 | ||
; Store two 16-bit results per row: dword 0 -> row 0, dword 2 -> row 1.
5039 | movd [r2], m2 | |
5040 | pextrd [r2 + r3], m2, 2 | |
5041 | ||
; m2 = input row 5; rows 2..5 give the third output row.
5042 | movd m2, [r0 + r1] | |
5043 | ||
5044 | punpcklbw m4, m5 | |
5045 | punpcklbw m3, m6, m2 | |
5046 | punpcklbw m4, m3 | |
5047 | ||
5048 | pmaddubsw m4, m0 | |
5049 | ||
; m3 = input row 6; rows 3..6 give the fourth output row.
5050 | movd m3, [r0 + 2 * r1] | |
5051 | ||
5052 | punpcklbw m5, m6 | |
5053 | punpcklbw m2, m3 | |
5054 | punpcklbw m5, m2 | |
5055 | ||
5056 | pmaddubsw m5, m0 | |
5057 | phaddw m4, m5 | |
5058 | psubw m4, m1 | |
5059 | ||
5060 | lea r2, [r2 + 2 * r3] | |
5061 | movd [r2], m4 | |
5062 | pextrd [r2 + r3], m4, 2 | |
5063 | ||
5064 | RET | |
5065 | ||
5066 | ;------------------------------------------------------------------------------------------------------------- | |
5067 | ; void interp_4tap_vert_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
5068 | ;------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 is accepted but not referenced (the width 2 is hard-coded
; in the symbol name); %2 = height, processed four rows per iteration.
; Register roles: r0 = src, r1 = src stride, r2 = dst, r3 = dst stride
; (doubled), r4 = coeffIdx then loop counter, r5 = PIC table base then 3 * r1.
5069 | %macro FILTER_V_PS_W2 2 | |
5070 | INIT_XMM sse4 | |
5071 | cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 | |
5072 | ||
5073 | mov r4d, r4m | |
5074 | sub r0, r1 | |
5075 | add r3d, r3d | |
5076 | ||
5077 | %ifdef PIC | |
5078 | lea r5, [tab_ChromaCoeff] | |
5079 | movd m0, [r5 + r4 * 4] | |
5080 | %else | |
5081 | movd m0, [tab_ChromaCoeff + r4 * 4] | |
5082 | %endif | |
5083 | ||
; m0 = coefficient bytes rearranged by tab_Cm (defined outside this chunk).
5084 | pshufb m0, [tab_Cm] | |
5085 | ||
; m1 = word constant subtracted from every result; it stays live across the loop.
5086 | mova m1, [pw_2000] | |
5087 | lea r5, [3 * r1] | |
5088 | mov r4d, %2/4 | |
5089 | .loop: | |
; m2..m5 = input rows 0..3 of this group.
5090 | movd m2, [r0] | |
5091 | movd m3, [r0 + r1] | |
5092 | movd m4, [r0 + 2 * r1] | |
5093 | movd m5, [r0 + r5] | |
5094 | ||
5095 | punpcklbw m2, m3 | |
5096 | punpcklbw m6, m4, m5 | |
5097 | punpcklbw m2, m6 | |
5098 | ||
5099 | pmaddubsw m2, m0 | |
5100 | ||
; r0 moves down four rows; this is the only source-pointer update per iteration.
5101 | lea r0, [r0 + 4 * r1] | |
5102 | movd m6, [r0] | |
5103 | ||
5104 | punpcklbw m3, m4 | |
5105 | punpcklbw m7, m5, m6 | |
5106 | punpcklbw m3, m7 | |
5107 | ||
5108 | pmaddubsw m3, m0 | |
5109 | ||
5110 | phaddw m2, m3 | |
5111 | psubw m2, m1 | |
5112 | ||
5113 | ||
; Output rows 0 and 1: dword 0, then dword 2 moved down to dword 0.
5114 | movd [r2], m2 | |
5115 | pshufd m2, m2, 2 | |
5116 | movd [r2 + r3], m2 | |
5117 | ||
5118 | movd m2, [r0 + r1] | |
5119 | ||
5120 | punpcklbw m4, m5 | |
5121 | punpcklbw m3, m6, m2 | |
5122 | punpcklbw m4, m3 | |
5123 | ||
5124 | pmaddubsw m4, m0 | |
5125 | ||
5126 | movd m3, [r0 + 2 * r1] | |
5127 | ||
5128 | punpcklbw m5, m6 | |
5129 | punpcklbw m2, m3 | |
5130 | punpcklbw m5, m2 | |
5131 | ||
5132 | pmaddubsw m5, m0 | |
5133 | ||
5134 | phaddw m4, m5 | |
5135 | ||
5136 | psubw m4, m1 | |
5137 | ||
; Output rows 2 and 3.
5138 | lea r2, [r2 + 2 * r3] | |
5139 | movd [r2], m4 | |
5140 | pshufd m4 , m4 ,2 | |
5141 | movd [r2 + r3], m4 | |
5142 | ||
5143 | lea r2, [r2 + 2 * r3] | |
5144 | ||
5145 | dec r4d | |
5146 | jnz .loop | |
5147 | ||
5148 | RET | |
5149 | %endmacro | |
5150 | ||
; Instantiations: 2x8 and 2x16.
5151 | FILTER_V_PS_W2 2, 8 | |
5152 | ||
5153 | FILTER_V_PS_W2 2, 16 | |
5154 | ||
5155 | ;----------------------------------------------------------------------------------------------------------------- | |
5156 | ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
5157 | ;----------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 = width, consumed 4 columns at a time (inner loop count
; %1/4), %2 = height, consumed 4 rows at a time (outer loop count %2/4).
; Register roles: r0 = src, r1 = src stride (doubled), r2 = dst, r3 = dst
; stride (doubled), r4 = coeffIdx then column counter, r5 = scratch,
; r6 = coefficient entry. The row-group counter lives in the stack slot
; requested through the last cglobal argument.
; NOTE(review): PROCESS_CHROMA_SP_W4_4R is defined outside this chunk; the
; fix-up after it implies it leaves r0 advanced by four rows -- confirm.
5158 | %macro FILTER_VER_CHROMA_SS 2 | |
5159 | INIT_XMM sse2 | |
5160 | cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize | |
5161 | ||
5162 | add r1d, r1d | |
5163 | add r3d, r3d | |
5164 | sub r0, r1 | |
5165 | shl r4d, 5 | |
5166 | ||
5167 | %ifdef PIC | |
5168 | lea r5, [tab_ChromaCoeffV] | |
5169 | lea r6, [r5 + r4] | |
5170 | %else | |
5171 | lea r6, [tab_ChromaCoeffV + r4] | |
5172 | %endif | |
5173 | ||
; [rsp] = number of 4-row groups still to process.
5174 | mov dword [rsp], %2/4 | |
5175 | ||
5176 | .loopH: | |
5177 | mov r4d, (%1/4) | |
5178 | .loopW: | |
; A 4x4 tile: the four sums (m0..m3) are shifted right by 6 and packed to
; words; there is no rounding add on this path.
5179 | PROCESS_CHROMA_SP_W4_4R | |
5180 | ||
5181 | psrad m0, 6 | |
5182 | psrad m1, 6 | |
5183 | psrad m2, 6 | |
5184 | psrad m3, 6 | |
5185 | ||
5186 | packssdw m0, m1 | |
5187 | packssdw m2, m3 | |
5188 | ||
; 8 bytes (four 16-bit samples) per output row.
5189 | movlps [r2], m0 | |
5190 | movhps [r2 + r3], m0 | |
5191 | lea r5, [r2 + 2 * r3] | |
5192 | movlps [r5], m2 | |
5193 | movhps [r5 + r3], m2 | |
5194 | ||
; Next tile to the right: src back four rows and forward 8 bytes, dst forward
; 8 bytes.
5195 | lea r5, [4 * r1 - 2 * 4] | |
5196 | sub r0, r5 | |
5197 | add r2, 2 * 4 | |
5198 | ||
5199 | dec r4d | |
5200 | jnz .loopW | |
5201 | ||
; Next row group: down four rows, back to column 0 (2*%1 bytes were consumed
; horizontally on both src and dst).
5202 | lea r0, [r0 + 4 * r1 - 2 * %1] | |
5203 | lea r2, [r2 + 4 * r3 - 2 * %1] | |
5204 | ||
5205 | dec dword [rsp] | |
5206 | jnz .loopH | |
5207 | ||
5208 | RET | |
5209 | %endmacro | |
5210 | ||
; Instantiations, listed as width, height.
5211 | FILTER_VER_CHROMA_SS 4, 4 | |
5212 | FILTER_VER_CHROMA_SS 4, 8 | |
5213 | FILTER_VER_CHROMA_SS 16, 16 | |
5214 | FILTER_VER_CHROMA_SS 16, 8 | |
5215 | FILTER_VER_CHROMA_SS 16, 12 | |
5216 | FILTER_VER_CHROMA_SS 12, 16 | |
5217 | FILTER_VER_CHROMA_SS 16, 4 | |
5218 | FILTER_VER_CHROMA_SS 4, 16 | |
5219 | FILTER_VER_CHROMA_SS 32, 32 | |
5220 | FILTER_VER_CHROMA_SS 32, 16 | |
5221 | FILTER_VER_CHROMA_SS 16, 32 | |
5222 | FILTER_VER_CHROMA_SS 32, 24 | |
5223 | FILTER_VER_CHROMA_SS 24, 32 | |
5224 | FILTER_VER_CHROMA_SS 32, 8 | |
5225 | ||
5226 | FILTER_VER_CHROMA_SS 16, 24 | |
5227 | FILTER_VER_CHROMA_SS 12, 32 | |
5228 | FILTER_VER_CHROMA_SS 4, 32 | |
5229 | FILTER_VER_CHROMA_SS 32, 64 | |
5230 | FILTER_VER_CHROMA_SS 16, 64 | |
5231 | FILTER_VER_CHROMA_SS 32, 48 | |
5232 | FILTER_VER_CHROMA_SS 24, 64 | |
5233 | ||
5234 | FILTER_VER_CHROMA_SS 64, 64 | |
5235 | FILTER_VER_CHROMA_SS 64, 32 | |
5236 | FILTER_VER_CHROMA_SS 64, 48 | |
5237 | FILTER_VER_CHROMA_SS 48, 64 | |
5238 | FILTER_VER_CHROMA_SS 64, 16 | |
5239 | ||
5240 | ||
5241 | ;--------------------------------------------------------------------------------------------------------------------- | |
5242 | ; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
5243 | ;--------------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 = width used in the symbol name (2 in every instantiation
; below), %2 = height, processed four rows per iteration.
; Register roles: r0 = src, r1 = src stride (doubled), r2 = dst, r3 = dst
; stride (doubled), r4 = coeffIdx then loop counter, r5 = coefficient entry.
; NOTE(review): PROCESS_CHROMA_SP_W2_4R is defined outside this chunk; no
; source-pointer update appears in this loop, so the helper presumably
; advances r0 by four rows itself -- confirm.
5244 | %macro FILTER_VER_CHROMA_SS_W2_4R 2 | |
5245 | INIT_XMM sse4 | |
5246 | cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 | |
5247 | ||
5248 | add r1d, r1d | |
5249 | add r3d, r3d | |
5250 | sub r0, r1 | |
5251 | shl r4d, 5 | |
5252 | ||
5253 | %ifdef PIC | |
5254 | lea r5, [tab_ChromaCoeffV] | |
5255 | lea r5, [r5 + r4] | |
5256 | %else | |
5257 | lea r5, [tab_ChromaCoeffV + r4] | |
5258 | %endif | |
5259 | ||
5260 | mov r4d, (%2/4) | |
5261 | ||
5262 | .loopH: | |
; The helper's results arrive in m0 and m2; shift right by 6 and pack to words,
; which leaves one output row (two 16-bit samples) in each dword of m0.
5263 | PROCESS_CHROMA_SP_W2_4R r5 | |
5264 | ||
5265 | psrad m0, 6 | |
5266 | psrad m2, 6 | |
5267 | ||
5268 | packssdw m0, m2 | |
5269 | ||
5270 | movd [r2], m0 | |
5271 | pextrd [r2 + r3], m0, 1 | |
5272 | lea r2, [r2 + 2 * r3] | |
5273 | pextrd [r2], m0, 2 | |
5274 | pextrd [r2 + r3], m0, 3 | |
5275 | ||
5276 | lea r2, [r2 + 2 * r3] | |
5277 | ||
5278 | dec r4d | |
5279 | jnz .loopH | |
5280 | ||
5281 | RET | |
5282 | %endmacro | |
5283 | ||
; Instantiations: 2x4, 2x8, 2x16.
5284 | FILTER_VER_CHROMA_SS_W2_4R 2, 4 | |
5285 | FILTER_VER_CHROMA_SS_W2_4R 2, 8 | |
5286 | ||
5287 | FILTER_VER_CHROMA_SS_W2_4R 2, 16 | |
5288 | ||
5289 | ;--------------------------------------------------------------------------------------------------------------- | |
5290 | ; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
5291 | ;--------------------------------------------------------------------------------------------------------------- | |
; Straight-line 4x2 kernel: five input rows of four 16-bit samples produce two
; output rows of four 16-bit samples. Sums are shifted right by 6 with no
; rounding add.
; Register roles: r0 = src, r1 = src stride (doubled), r2 = dst, r3 = dst
; stride (doubled), r4 = coeffIdx (scaled by 32), r5 = coefficient entry.
5292 | INIT_XMM sse2 | |
5293 | cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 | |
5294 | ||
5295 | add r1d, r1d | |
5296 | add r3d, r3d | |
5297 | sub r0, r1 | |
5298 | shl r4d, 5 | |
5299 | ||
5300 | %ifdef PIC | |
5301 | lea r5, [tab_ChromaCoeffV] | |
5302 | lea r5, [r5 + r4] | |
5303 | %else | |
5304 | lea r5, [tab_ChromaCoeffV + r4] | |
5305 | %endif | |
5306 | ||
; Adjacent rows are interleaved word-wise so pmaddwd applies one tap pair per
; 32-bit lane.
5307 | movq m0, [r0] | |
5308 | movq m1, [r0 + r1] | |
5309 | punpcklwd m0, m1 ;m0=[0 1] | |
5310 | pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 | |
5311 | ||
5312 | lea r0, [r0 + 2 * r1] | |
5313 | movq m2, [r0] | |
5314 | punpcklwd m1, m2 ;m1=[1 2] | |
5315 | pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 | |
5316 | ||
5317 | movq m3, [r0 + r1] | |
5318 | punpcklwd m2, m3 ;m2=[2 3] | |
5319 | pmaddwd m2, [r5 + 1 * 16] | |
5320 | paddd m0, m2 ;m0=[0+1+2+3] Row1 done | |
5321 | psrad m0, 6 | |
5322 | ||
5323 | movq m2, [r0 + 2 * r1] | |
5324 | punpcklwd m3, m2 ;m3=[3 4] | |
5325 | pmaddwd m3, [r5 + 1 * 16] | |
5326 | paddd m1, m3 ;m1=[1+2+3+4] Row2 done | |
5327 | psrad m1, 6 | |
5328 | ||
; Low 8 bytes of m0 = output row 1, high 8 bytes = output row 2.
5329 | packssdw m0, m1 | |
5330 | ||
5331 | movlps [r2], m0 | |
5332 | movhps [r2 + r3], m0 | |
5333 | ||
5334 | RET | |
5335 | ||
5336 | ;------------------------------------------------------------------------------------------------------------------- | |
5337 | ; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
5338 | ;------------------------------------------------------------------------------------------------------------------- | |
; Macro arguments: %1 is accepted but not referenced (the width 6 is hard-coded
; in the symbol name); %2 = height, processed four rows per iteration.
; Register roles: r0 = src, r1 = src stride (doubled), r2 = dst, r3 = dst
; stride (doubled), r4 = coeffIdx then loop counter, r5 = scratch,
; r6 = coefficient entry.
; NOTE(review): PROCESS_CHROMA_SP_W4_4R / PROCESS_CHROMA_SP_W2_4R are defined
; outside this chunk; the fix-ups below imply each leaves r0 advanced by four
; rows -- confirm.
5339 | %macro FILTER_VER_CHROMA_SS_W6_H4 2 | |
5340 | INIT_XMM sse4 | |
5341 | cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6 | |
5342 | ||
5343 | add r1d, r1d | |
5344 | add r3d, r3d | |
5345 | sub r0, r1 | |
5346 | shl r4d, 5 | |
5347 | ||
5348 | %ifdef PIC | |
5349 | lea r5, [tab_ChromaCoeffV] | |
5350 | lea r6, [r5 + r4] | |
5351 | %else | |
5352 | lea r6, [tab_ChromaCoeffV + r4] | |
5353 | %endif | |
5354 | ||
5355 | mov r4d, %2/4 | |
5356 | ||
5357 | .loopH: | |
; Columns 0-3: four rows of four 16-bit results (8 bytes stored per row).
5358 | PROCESS_CHROMA_SP_W4_4R | |
5359 | ||
5360 | psrad m0, 6 | |
5361 | psrad m1, 6 | |
5362 | psrad m2, 6 | |
5363 | psrad m3, 6 | |
5364 | ||
5365 | packssdw m0, m1 | |
5366 | packssdw m2, m3 | |
5367 | ||
5368 | movlps [r2], m0 | |
5369 | movhps [r2 + r3], m0 | |
5370 | lea r5, [r2 + 2 * r3] | |
5371 | movlps [r5], m2 | |
5372 | movhps [r5 + r3], m2 | |
5373 | ||
; Move src back four rows and forward 8 bytes; move dst forward 8 bytes, ready
; for columns 4-5.
5374 | lea r5, [4 * r1 - 2 * 4] | |
5375 | sub r0, r5 | |
5376 | add r2, 2 * 4 | |
5377 | ||
; Columns 4-5: four rows of two 16-bit results (4 bytes stored per row).
5378 | PROCESS_CHROMA_SP_W2_4R r6 | |
5379 | ||
5380 | psrad m0, 6 | |
5381 | psrad m2, 6 | |
5382 | ||
5383 | packssdw m0, m2 | |
5384 | ||
5385 | movd [r2], m0 | |
5386 | pextrd [r2 + r3], m0, 1 | |
5387 | lea r2, [r2 + 2 * r3] | |
5388 | pextrd [r2], m0, 2 | |
5389 | pextrd [r2 + r3], m0, 3 | |
5390 | ||
; Undo the 8-byte column offset on both pointers; dst also moves down to the
; next 4-row group.
5391 | sub r0, 2 * 4 | |
5392 | lea r2, [r2 + 2 * r3 - 2 * 4] | |
5393 | ||
5394 | dec r4d | |
5395 | jnz .loopH | |
5396 | ||
5397 | RET | |
5398 | %endmacro | |
5399 | ||
; Instantiations: 6x8 and 6x16.
5400 | FILTER_VER_CHROMA_SS_W6_H4 6, 8 | |
5401 | ||
5402 | FILTER_VER_CHROMA_SS_W6_H4 6, 16 | |
5403 | ||
5404 | ||
5405 | ;---------------------------------------------------------------------------------------------------------------- | |
5406 | ; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
5407 | ;---------------------------------------------------------------------------------------------------------------- | |
; FILTER_VER_CHROMA_SS_W8_H2 width, height
;   Emits interp_4tap_vert_ss_<width>x<height>: a 4-tap vertical filter that writes two
;   rows of 8 int16 samples per loop iteration. %2 (height) must be a multiple of 2.
;
;   Register roles inside the generated function:
;     r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (both strides doubled on entry)
;     r4 = coeffIdx on entry, reused as the 2-row group counter
;     r5 = pointer to the selected entry of tab_ChromaCoeffV
;   NOTE(review): PROCESS_CHROMA_SP_W8_2R is defined outside this section. This loop never
;   adjusts r0 itself, so the helper is presumably responsible for advancing src by two
;   rows and for reading its coefficients through r5 -- confirm against its definition.
5408 | %macro FILTER_VER_CHROMA_SS_W8_H2 2 | |
5409 | INIT_XMM sse2 | |
5410 | cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 | |
5411 | ||
; Double both strides (samples are int16_t, i.e. 2 bytes wide), step src back one row,
; and scale coeffIdx by 32 to index tab_ChromaCoeffV.
5412 | add r1d, r1d | |
5413 | add r3d, r3d | |
5414 | sub r0, r1 | |
5415 | shl r4d, 5 | |
5416 | ||
; r5 = address of the selected coefficient entry.
5417 | %ifdef PIC | |
5418 | lea r5, [tab_ChromaCoeffV] | |
5419 | lea r5, [r5 + r4] | |
5420 | %else | |
5421 | lea r5, [tab_ChromaCoeffV + r4] | |
5422 | %endif | |
5423 | ||
; r4 = number of 2-row groups still to process.
5424 | mov r4d, %2/2 | |
5425 | .loopH: | |
; The helper's results are taken from m0..m3 as 32-bit values.
5426 | PROCESS_CHROMA_SP_W8_2R | |
5427 | ||
; Arithmetic right shift by 6 on each 32-bit value.
5428 | psrad m0, 6 | |
5429 | psrad m1, 6 | |
5430 | psrad m2, 6 | |
5431 | psrad m3, 6 | |
5432 | ||
; Narrow to int16 with signed saturation: m0 becomes the first output row, m2 the second.
5433 | packssdw m0, m1 | |
5434 | packssdw m2, m3 | |
5435 | ||
; Unaligned 16-byte stores: 8 samples to each of the two destination rows.
5436 | movu [r2], m0 | |
5437 | movu [r2 + r3], m2 | |
5438 | ||
; Advance dst by two rows.
5439 | lea r2, [r2 + 2 * r3] | |
5440 | ||
5441 | dec r4d | |
5442 | jnz .loopH | |
5443 | ||
5444 | RET | |
5445 | %endmacro | |
5446 | ||
; Instantiations for 8-wide blocks of height 2, 4, 6, 8, 16, 32, 12 and 64.
5447 | FILTER_VER_CHROMA_SS_W8_H2 8, 2 | |
5448 | FILTER_VER_CHROMA_SS_W8_H2 8, 4 | |
5449 | FILTER_VER_CHROMA_SS_W8_H2 8, 6 | |
5450 | FILTER_VER_CHROMA_SS_W8_H2 8, 8 | |
5451 | FILTER_VER_CHROMA_SS_W8_H2 8, 16 | |
5452 | FILTER_VER_CHROMA_SS_W8_H2 8, 32 | |
5453 | ||
5454 | FILTER_VER_CHROMA_SS_W8_H2 8, 12 | |
5455 | FILTER_VER_CHROMA_SS_W8_H2 8, 64 | |
5456 | ||
5457 | ;----------------------------------------------------------------------------------------------------------------- | |
5458 | ; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) | |
5459 | ;----------------------------------------------------------------------------------------------------------------- | |
; FILTER_VER_LUMA_SS width, height
;   Emits interp_8tap_vert_ss_<width>x<height>: an 8-tap vertical filter over int16 samples.
;   The block is processed in tiles of 4 columns x 4 rows, so %1 (width) and %2 (height)
;   must both be multiples of 4.
;
;   Register roles inside the generated function:
;     r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (both strides doubled on entry)
;     r4 = coeffIdx on entry, reused as the per-row-group tile (column) counter
;     r5 = scratch, r6 = pointer to the selected 64-byte entry of tab_LumaCoeffV
;     [rsp] = remaining 4-row groups (stack slot requested via the cglobal stack argument)
;     m0..m3 = 32-bit accumulators for output rows 1..4 of the tile, m4..m6 = temporaries
;
;   For every tile, 11 source rows are loaded (4 samples each). Adjacent rows are
;   interleaved with punpcklwd so that one pmaddwd against [r6 + k*16] applies two taps at
;   once; the four coefficient vectors k = 0..3 together cover the 8 taps.
5460 | %macro FILTER_VER_LUMA_SS 2 | |
5461 | INIT_XMM sse2 | |
5462 | cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize | |
5463 | ||
; Double both strides (samples are int16_t, i.e. 2 bytes wide), step src back three rows,
; and scale coeffIdx by 64 (four 16-byte coefficient vectors per entry).
5464 | add r1d, r1d | |
5465 | add r3d, r3d | |
5466 | lea r5, [3 * r1] | |
5467 | sub r0, r5 | |
5468 | shl r4d, 6 | |
5469 | ||
; r6 = address of the selected coefficient entry (r5 is only a temporary in the PIC path).
5470 | %ifdef PIC | |
5471 | lea r5, [tab_LumaCoeffV] | |
5472 | lea r6, [r5 + r4] | |
5473 | %else | |
5474 | lea r6, [tab_LumaCoeffV + r4] | |
5475 | %endif | |
5476 | ||
; Outer loop: one iteration per group of 4 output rows; the counter lives on the stack.
5477 | mov dword [rsp], %2/4 | |
5478 | .loopH: | |
; Inner loop: one iteration per 4-column tile within the current row group.
5479 | mov r4d, (%1/4) | |
5480 | .loopW: | |
5481 | movq m0, [r0] | |
5482 | movq m1, [r0 + r1] | |
5483 | punpcklwd m0, m1 ;m0=[0 1] | |
5484 | pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 | |
5485 | ||
5486 | lea r0, [r0 + 2 * r1] | |
5487 | movq m4, [r0] | |
5488 | punpcklwd m1, m4 ;m1=[1 2] | |
5489 | pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 | |
5490 | ||
; From here on each interleaved row pair feeds more than one output row: the three-operand
; pmaddwd writes a fresh register first, then the pair is consumed in place.
5491 | movq m5, [r0 + r1] | |
5492 | punpcklwd m4, m5 ;m4=[2 3] | |
5493 | pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 | |
5494 | pmaddwd m4, [r6 + 1 * 16] | |
5495 | paddd m0, m4 ;m0=[0+1+2+3] Row1 | |
5496 | ||
5497 | lea r0, [r0 + 2 * r1] | |
5498 | movq m4, [r0] | |
5499 | punpcklwd m5, m4 ;m5=[3 4] | |
5500 | pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 | |
5501 | pmaddwd m5, [r6 + 1 * 16] | |
5502 | paddd m1, m5 ;m1 = [1+2+3+4] Row2 | |
5503 | ||
5504 | movq m5, [r0 + r1] | |
5505 | punpcklwd m4, m5 ;m4=[4 5] | |
5506 | pmaddwd m6, m4, [r6 + 1 * 16] | |
5507 | paddd m2, m6 ;m2=[2+3+4+5] Row3 | |
5508 | pmaddwd m4, [r6 + 2 * 16] | |
5509 | paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 | |
5510 | ||
5511 | lea r0, [r0 + 2 * r1] | |
5512 | movq m4, [r0] | |
5513 | punpcklwd m5, m4 ;m5=[5 6] | |
5514 | pmaddwd m6, m5, [r6 + 1 * 16] | |
5515 | paddd m3, m6 ;m3=[3+4+5+6] Row4 | |
5516 | pmaddwd m5, [r6 + 2 * 16] | |
5517 | paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 | |
5518 | ||
5519 | movq m5, [r0 + r1] | |
5520 | punpcklwd m4, m5 ;m4=[6 7] | |
5521 | pmaddwd m6, m4, [r6 + 2 * 16] | |
5522 | paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 | |
5523 | pmaddwd m4, [r6 + 3 * 16] | |
5524 | paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end | |
5525 | psrad m0, 6 | |
5526 | ||
5527 | lea r0, [r0 + 2 * r1] | |
5528 | movq m4, [r0] | |
5529 | punpcklwd m5, m4 ;m5=[7 8] | |
5530 | pmaddwd m6, m5, [r6 + 2 * 16] | |
5531 | paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 | |
5532 | pmaddwd m5, [r6 + 3 * 16] | |
5533 | paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end | |
5534 | psrad m1, 6 | |
5535 | ||
; Rows 1 and 2 are complete: narrow to int16 with signed saturation and store 4 samples
; (8 bytes) to each of the first two destination rows.
5536 | packssdw m0, m1 | |
5537 | ||
5538 | movlps [r2], m0 | |
5539 | movhps [r2 + r3], m0 | |
5540 | ||
5541 | movq m5, [r0 + r1] | |
5542 | punpcklwd m4, m5 ;m4=[8 9] | |
5543 | pmaddwd m4, [r6 + 3 * 16] | |
5544 | paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end | |
5545 | psrad m2, 6 | |
5546 | ||
5547 | movq m4, [r0 + 2 * r1] | |
5548 | punpcklwd m5, m4 ;m5=[9 10] | |
5549 | pmaddwd m5, [r6 + 3 * 16] | |
5550 | paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end | |
5551 | psrad m3, 6 | |
5552 | ||
; Rows 3 and 4 are complete: narrow and store to destination rows 2 and 3 of the tile.
5553 | packssdw m2, m3 | |
5554 | ||
5555 | movlps [r2 + 2 * r3], m2 | |
5556 | lea r5, [3 * r3] | |
5557 | movhps [r2 + r5], m2 | |
5558 | ||
; r0 was advanced by 8 rows while loading; rewind it to the tile's first row and move
; both pointers 4 samples (8 bytes) right to the next tile.
5559 | lea r5, [8 * r1 - 2 * 4] | |
5560 | sub r0, r5 | |
5561 | add r2, 2 * 4 | |
5562 | ||
5563 | dec r4d | |
5564 | jnz .loopW | |
5565 | ||
; End of the row group: move both pointers down 4 rows and back to column 0
; (2 * %1 bytes = %1 samples).
5566 | lea r0, [r0 + 4 * r1 - 2 * %1] | |
5567 | lea r2, [r2 + 4 * r3 - 2 * %1] | |
5568 | ||
5569 | dec dword [rsp] | |
5570 | jnz .loopH | |
5571 | ||
5572 | RET | |
5573 | %endmacro | |
5574 | ||
; Instantiations: one interp_8tap_vert_ss_<width>x<height> function per listed block size.
5575 | FILTER_VER_LUMA_SS 4, 4 | |
5576 | FILTER_VER_LUMA_SS 8, 8 | |
5577 | FILTER_VER_LUMA_SS 8, 4 | |
5578 | FILTER_VER_LUMA_SS 4, 8 | |
5579 | FILTER_VER_LUMA_SS 16, 16 | |
5580 | FILTER_VER_LUMA_SS 16, 8 | |
5581 | FILTER_VER_LUMA_SS 8, 16 | |
5582 | FILTER_VER_LUMA_SS 16, 12 | |
5583 | FILTER_VER_LUMA_SS 12, 16 | |
5584 | FILTER_VER_LUMA_SS 16, 4 | |
5585 | FILTER_VER_LUMA_SS 4, 16 | |
5586 | FILTER_VER_LUMA_SS 32, 32 | |
5587 | FILTER_VER_LUMA_SS 32, 16 | |
5588 | FILTER_VER_LUMA_SS 16, 32 | |
5589 | FILTER_VER_LUMA_SS 32, 24 | |
5590 | FILTER_VER_LUMA_SS 24, 32 | |
5591 | FILTER_VER_LUMA_SS 32, 8 | |
5592 | FILTER_VER_LUMA_SS 8, 32 | |
5593 | FILTER_VER_LUMA_SS 64, 64 | |
5594 | FILTER_VER_LUMA_SS 64, 32 | |
5595 | FILTER_VER_LUMA_SS 32, 64 | |
5596 | FILTER_VER_LUMA_SS 64, 48 | |
5597 | FILTER_VER_LUMA_SS 48, 64 | |
5598 | FILTER_VER_LUMA_SS 64, 16 | |
5599 | FILTER_VER_LUMA_SS 16, 64 | |