; /*
; * Provide SSE luma and chroma mc functions for HEVC decoding
; * Copyright (c) 2013 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_8:          times 8 dw (1 <<  9)
pw_10:         times 8 dw (1 << 11)
pw_12:         times 8 dw (1 << 13)
pw_bi_8:       times 8 dw (1 <<  8)
pw_bi_10:      times 8 dw (1 << 10)
pw_bi_12:      times 8 dw (1 << 12)
max_pixels_10: times 8 dw ((1 << 10)-1)
max_pixels_12: times 8 dw ((1 << 12)-1)
zero:          times 4 dd 0
one_per_32:    times 4 dd 1
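
; The pw_* constants are, as far as the surrounding code suggests, pmulhrsw
; rounding factors: intermediate samples are kept at a 14-bit scale, so
; unidirectional output at bit depth d needs a rounded shift by (14 - d),
; which pmulhrsw by 1 << (15 - (14 - d)) provides (e.g. pw_8 = 1 << 9).
; The pw_bi_* values shift by one bit more for the bi-prediction average.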

SECTION .text
%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
                        times %2 d%3 10, -2
                        times %2 d%3 -4, 54
                        times %2 d%3 16, -2
                        times %2 d%3 -6, 46
                        times %2 d%3 28, -4
                        times %2 d%3 -4, 36
                        times %2 d%3 36, -4
                        times %2 d%3 -4, 28
                        times %2 d%3 46, -6
                        times %2 d%3 -2, 16
                        times %2 d%3 54, -4
                        times %2 d%3 -2, 10
                        times %2 d%3 58, -2
%endmacro


EPEL_TABLE  8, 8, b, sse4
EPEL_TABLE 10, 4, w, sse4
EPEL_TABLE 12, 4, w, sse4
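
; Each 4-tap epel filter (fractional positions 1..7) is stored as two rows:
; taps (c0,c1) repeated %2 times, then taps (c2,c3) repeated %2 times, so a
; filter occupies 32 bytes and can be indexed as (mx-1)*32 (see EPEL_FILTER).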

%macro QPEL_TABLE 4
hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
                        times %2 d%3 -10, 58
                        times %2 d%3  17, -5
                        times %2 d%3   1,  0
                        times %2 d%3  -1,  4
                        times %2 d%3 -11, 40
                        times %2 d%3  40,-11
                        times %2 d%3   4, -1
                        times %2 d%3   0,  1
                        times %2 d%3  -5, 17
                        times %2 d%3  58,-10
                        times %2 d%3   4, -1
%endmacro

QPEL_TABLE  8, 8, b, sse4
QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4
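
; The 8-tap qpel filters (fractional positions 1..3) are stored as four rows
; of tap pairs, 64 bytes per filter; QPEL_FILTER below turns the filter index
; into a byte offset of (mx-1)*64 via "lea %2q, [%2q*8-8]" plus *8 scaling in
; the effective address.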

%define MAX_PB_SIZE 64

%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10

%if ARCH_X86_64

%macro SIMPLE_BILOAD 4 ;width, tab, r1, r2
%if %1 <= 4
    movq %3, [%2]      ; load data from source2
%elif %1 <= 8
    movdqa %3, [%2]    ; load data from source2
%elif %1 <= 12
    movdqa %3, [%2]    ; load data from source2
    movq %4, [%2+16]   ; load data from source2
%else
    movdqa %3, [%2]    ; load data from source2
    movdqa %4, [%2+16] ; load data from source2
%endif
%endmacro

%macro SIMPLE_LOAD 4 ;width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movd %4, [%3]      ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movq %4, [%3]      ; load data from source
%else
    movdqu %4, [%3]    ; load data from source
%endif
%endmacro

%macro SIMPLE_8LOAD 5 ;width, bitd, tab, r1, r2
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movq %4, [%3]      ; load data from source2
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movdqa %4, [%3]    ; load data from source2
%elif %1 <= 12
    movdqa %4, [%3]    ; load data from source2
    movq %5, [%3+16]   ; load data from source2
%else
    movdqa %4, [%3]    ; load data from source2
    movdqa %5, [%3+16] ; load data from source2
%endif
%endmacro

%macro EPEL_FILTER 2-4 ; bit depth, filter index
%ifdef PIC
    lea rfilterq, [hevc_epel_filters_sse4_%1]
%else
    %define rfilterq hevc_epel_filters_sse4_%1
%endif
    sub %2q, 1
    shl %2q, 5 ; multiply by 32
%if %0 == 2
    movdqa m14, [rfilterq + %2q]    ; get 2 first values of filters
    movdqa m15, [rfilterq + %2q+16] ; get 2 last values of filters
%else
    movdqa %3, [rfilterq + %2q]     ; get 2 first values of filters
    movdqa %4, [rfilterq + %2q+16]  ; get 2 last values of filters
%endif
%endmacro

%macro EPEL_HV_FILTER 1
%ifdef PIC
    lea rfilterq, [hevc_epel_filters_sse4_%1]
%else
    %define rfilterq hevc_epel_filters_sse4_%1
%endif
    sub mxq, 1
    sub myq, 1
    shl mxq, 5 ; multiply by 32
    shl myq, 5 ; multiply by 32
    movdqa m14, [rfilterq + mxq]    ; get 2 first values of filters
    movdqa m15, [rfilterq + mxq+16] ; get 2 last values of filters
    lea r3srcq, [srcstrideq*3]

%ifdef PIC
    lea rfilterq, [hevc_epel_filters_sse4_10]
%else
    %define rfilterq hevc_epel_filters_sse4_10
%endif
    movdqa m12, [rfilterq + myq]    ; get 2 first values of filters
    movdqa m13, [rfilterq + myq+16] ; get 2 last values of filters
%endmacro

%macro QPEL_FILTER 2
%ifdef PIC
    lea rfilterq, [hevc_qpel_filters_sse4_%1]
%else
    %define rfilterq hevc_qpel_filters_sse4_%1
%endif
    lea %2q, [%2q*8-8]
    movdqa m12, [rfilterq + %2q*8]      ; get first 4 values of filters
    movdqa m13, [rfilterq + %2q*8 + 16] ; get second 4 values of filters
    movdqa m14, [rfilterq + %2q*8 + 32] ; get third 4 values of filters
    movdqa m15, [rfilterq + %2q*8 + 48] ; get last 4 values of filters
%endmacro

%macro EPEL_LOAD 4
%ifdef PIC
    lea rfilterq, [%2]
%else
    %define rfilterq %2
%endif
%if (%1 == 8 && %4 <= 4)
%define %%load movd
%elif (%1 == 8 && %4 <= 8) || (%1 > 8 && %4 <= 4)
%define %%load movq
%else
%define %%load movdqu
%endif

    %%load m0, [rfilterq ]
%ifnum %3
    %%load m1, [rfilterq+  %3]
    %%load m2, [rfilterq+2*%3]
    %%load m3, [rfilterq+3*%3]
%else
    %%load m1, [rfilterq+  %3q]
    %%load m2, [rfilterq+2*%3q]
    %%load m3, [rfilterq+r3srcq]
%endif

%if %1 == 8
%if %4 > 8
    SBUTTERFLY bw, 0, 1, 10
    SBUTTERFLY bw, 2, 3, 10
%else
    punpcklbw m0, m1
    punpcklbw m2, m3
%endif
%else
%if %4 > 4
    SBUTTERFLY wd, 0, 1, 10
    SBUTTERFLY wd, 2, 3, 10
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
%endif
%endif
%endmacro
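
; EPEL_LOAD fetches the four taps' worth of input, either four horizontally
; neighbouring samples (when %3 is the immediate %%stride) or four rows (when
; %3 is the srcstride register), and interleaves them so EPEL_COMPUTE can
; apply the coefficient pairs with pmaddubsw/pmaddwd.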


%macro QPEL_H_LOAD 4
%assign %%stride (%1+7)/8
%if %1 == 8
%if %3 <= 4
%define %%load movd
%elif %3 == 8
%define %%load movq
%else
%define %%load movdqu
%endif
%else
%if %3 == 2
%define %%load movd
%elif %3 == 4
%define %%load movq
%else
%define %%load movdqu
%endif
%endif
    %%load m0, [%2-3*%%stride] ;load data from source
    %%load m1, [%2-2*%%stride]
    %%load m2, [%2-%%stride  ]
    %%load m3, [%2           ]
    %%load m4, [%2+%%stride  ]
    %%load m5, [%2+2*%%stride]
    %%load m6, [%2+3*%%stride]
    %%load m7, [%2+4*%%stride]

%if %1 == 8
%if %3 > 8
    SBUTTERFLY wd, 0, 1, %4
    SBUTTERFLY wd, 2, 3, %4
    SBUTTERFLY wd, 4, 5, %4
    SBUTTERFLY wd, 6, 7, %4
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpcklwd m4, m5
    punpcklwd m6, m7
%endif
%else
%if %3 > 4
    SBUTTERFLY dq, 0, 1, %4
    SBUTTERFLY dq, 2, 3, %4
    SBUTTERFLY dq, 4, 5, %4
    SBUTTERFLY dq, 6, 7, %4
%else
    punpckldq m0, m1
    punpckldq m2, m3
    punpckldq m4, m5
    punpckldq m6, m7
%endif
%endif
%endmacro

%macro QPEL_V_LOAD 5
    lea %5q, [%2]
    sub %5q, r3srcq
    movdqu m0, [%5q       ] ;load x- 3*srcstride
    movdqu m1, [%5q+   %3q] ;load x- 2*srcstride
    movdqu m2, [%5q+ 2*%3q] ;load x-srcstride
    movdqu m3, [%2        ] ;load x
    movdqu m4, [%2+    %3q] ;load x+stride
    movdqu m5, [%2+  2*%3q] ;load x+2*stride
    movdqu m6, [%2+ r3srcq] ;load x+3*stride
    movdqu m7, [%2+  4*%3q] ;load x+4*stride
%if %1 == 8
%if %4 > 8
    SBUTTERFLY bw, 0, 1, 8
    SBUTTERFLY bw, 2, 3, 8
    SBUTTERFLY bw, 4, 5, 8
    SBUTTERFLY bw, 6, 7, 8
%else
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
%endif
%else
%if %4 > 4
    SBUTTERFLY wd, 0, 1, 8
    SBUTTERFLY wd, 2, 3, 8
    SBUTTERFLY wd, 4, 5, 8
    SBUTTERFLY wd, 6, 7, 8
%else
    punpcklwd m0, m1
    punpcklwd m2, m3
    punpcklwd m4, m5
    punpcklwd m6, m7
%endif
%endif
%endmacro

%macro PEL_12STORE2 3
    movd [%1], %2
%endmacro
%macro PEL_12STORE4 3
    movq [%1], %2
%endmacro
%macro PEL_12STORE6 3
    movq [%1], %2
    psrldq %2, 8
    movd [%1+8], %2
%endmacro
%macro PEL_12STORE8 3
    movdqa [%1], %2
%endmacro
%macro PEL_12STORE12 3
    movdqa [%1], %2
    movq [%1+16], %3
%endmacro
%macro PEL_12STORE16 3
    PEL_12STORE8 %1, %2, %3
    movdqa [%1+16], %3
%endmacro

%macro PEL_10STORE2 3
    movd [%1], %2
%endmacro
%macro PEL_10STORE4 3
    movq [%1], %2
%endmacro
%macro PEL_10STORE6 3
    movq [%1], %2
    psrldq %2, 8
    movd [%1+8], %2
%endmacro
%macro PEL_10STORE8 3
    movdqa [%1], %2
%endmacro
%macro PEL_10STORE12 3
    movdqa [%1], %2
    movq [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
    PEL_10STORE8 %1, %2, %3
    movdqa [%1+16], %3
%endmacro

%macro PEL_8STORE2 3
    pextrw [%1], %2, 0
%endmacro
%macro PEL_8STORE4 3
    movd [%1], %2
%endmacro
%macro PEL_8STORE6 3
    movd [%1], %2
    pextrw [%1+4], %2, 2
%endmacro
%macro PEL_8STORE8 3
    movq [%1], %2
%endmacro
%macro PEL_8STORE12 3
    movq [%1], %2
    psrldq %2, 8
    movd [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
    movdqa [%1], %2
%endmacro

%macro LOOP_END 3
    add %1q, 2*MAX_PB_SIZE ; dst += dststride
    add %2q, %3q           ; src += srcstride
    dec heightd            ; cmp height
    jnz .loop              ; height loop
%endmacro


%macro MC_PIXEL_COMPUTE 2 ;width, bitdepth
%if %2 == 8
%if %1 > 8
    punpckhbw m1, m0, m2
    psllw m1, 14-%2
%endif
    punpcklbw m0, m2
%endif
    psllw m0, 14-%2
%endmacro
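
; MC_PIXEL_COMPUTE only rescales the source to the 14-bit intermediate format
; used by the int16_t dst plane; per sample this amounts to roughly
;     dst[x] = src[x] << (14 - bitdepth)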


%macro EPEL_COMPUTE 4 ; bitdepth, width, filter1, filter2
%if %1 == 8
    pmaddubsw m0, %3 ;x1*c1+x2*c2
    pmaddubsw m2, %4 ;x3*c3+x4*c4
    paddw m0, m2
%if %2 > 8
    pmaddubsw m1, %3
    pmaddubsw m3, %4
    paddw m1, m3
%endif
%else
    pmaddwd m0, %3
    pmaddwd m2, %4
    paddd m0, m2
%if %2 > 4
    pmaddwd m1, %3
    pmaddwd m3, %4
    paddd m1, m3
%endif
%if %1 != 8
    psrad m0, %1-8
    psrad m1, %1-8
%endif
    packssdw m0, m1
%endif
%endmacro
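
; EPEL_COMPUTE is the 4-tap MAC on the interleaved data: two multiply-adds and
; one add give c0*x0 + c1*x1 + c2*x2 + c3*x3 per lane. For bit depths above 8
; the 32-bit sums are shifted right by (bitdepth - 8) and packed back to words,
; keeping the result at the same 14-bit intermediate scale as the 8-bit path.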

%macro QPEL_HV_COMPUTE 4 ; width, bitdepth, filter idx
%ifdef PIC
    lea rfilterq, [hevc_qpel_filters_sse4_%2]
%else
    %define rfilterq hevc_qpel_filters_sse4_%2
%endif

%if %2 == 8
    pmaddubsw m0, [rfilterq + %3q*8   ] ;x1*c1+x2*c2
    pmaddubsw m2, [rfilterq + %3q*8+16] ;x3*c3+x4*c4
    pmaddubsw m4, [rfilterq + %3q*8+32] ;x5*c5+x6*c6
    pmaddubsw m6, [rfilterq + %3q*8+48] ;x7*c7+x8*c8
    paddw m0, m2
    paddw m4, m6
    paddw m0, m4
%else
    pmaddwd m0, [rfilterq + %3q*8   ]
    pmaddwd m2, [rfilterq + %3q*8+16]
    pmaddwd m4, [rfilterq + %3q*8+32]
    pmaddwd m6, [rfilterq + %3q*8+48]
    paddd m0, m2
    paddd m4, m6
    paddd m0, m4
%if %2 != 8
    psrad m0, %2-8
%endif
%if %1 > 4
    pmaddwd m1, [rfilterq + %3q*8   ]
    pmaddwd m3, [rfilterq + %3q*8+16]
    pmaddwd m5, [rfilterq + %3q*8+32]
    pmaddwd m7, [rfilterq + %3q*8+48]
    paddd m1, m3
    paddd m5, m7
    paddd m1, m5
%if %2 != 8
    psrad m1, %2-8
%endif
%endif
    p%4 m0, m1
%endif
%endmacro

%macro QPEL_COMPUTE 2 ; width, bitdepth
%if %2 == 8
    pmaddubsw m0, m12 ;x1*c1+x2*c2
    pmaddubsw m2, m13 ;x3*c3+x4*c4
    pmaddubsw m4, m14 ;x5*c5+x6*c6
    pmaddubsw m6, m15 ;x7*c7+x8*c8
    paddw m0, m2
    paddw m4, m6
    paddw m0, m4
%if %1 > 8
    pmaddubsw m1, m12
    pmaddubsw m3, m13
    pmaddubsw m5, m14
    pmaddubsw m7, m15
    paddw m1, m3
    paddw m5, m7
    paddw m1, m5
%endif
%else
    pmaddwd m0, m12
    pmaddwd m2, m13
    pmaddwd m4, m14
    pmaddwd m6, m15
    paddd m0, m2
    paddd m4, m6
    paddd m0, m4
%if %2 != 8
    psrad m0, %2-8
%endif
%if %1 > 4
    pmaddwd m1, m12
    pmaddwd m3, m13
    pmaddwd m5, m14
    pmaddwd m7, m15
    paddd m1, m3
    paddd m5, m7
    paddd m1, m5
%if %2 != 8
    psrad m1, %2-8
%endif
%endif
%endif
%endmacro

%macro BI_COMPUTE 7 ; width, bitd, src1l, src1h, src2l, src2h, pw
    paddsw %3, %5
%if %1 > 8
    paddsw %4, %6
%endif
    UNI_COMPUTE %1, %2, %3, %4, %7
%endmacro

%macro UNI_COMPUTE 5
    pmulhrsw %3, %5
%if %1 > 8 || (%2 > 8 && %1 > 4)
    pmulhrsw %4, %5
%endif
%if %2 == 8
    packuswb %3, %4
%else
    pminsw %3, [max_pixels_%2]
    pmaxsw %3, [zero]
%if %1 > 8
    pminsw %4, [max_pixels_%2]
    pmaxsw %4, [zero]
%endif
%endif
%endmacro
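
; UNI_COMPUTE and BI_COMPUTE turn the 14-bit intermediates into output pixels:
; pmulhrsw with the matching pw_* constant performs the rounded down-shift,
; followed by byte packing (8 bit) or clamping to [0, max_pixels_*] (10/12 bit).
; BI_COMPUTE first adds the second 14-bit source with saturation and then uses
; the pw_bi_* constant, i.e. roughly out = clip((src1 + src2) >> (15 - bitd))
; with rounding.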

INIT_XMM sse4 ; adds ff_ and _sse4 to function name
; ******************************
; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
;                         uint8_t *_src, ptrdiff_t _srcstride,
;                         int height, int mx, int my)
; ******************************

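; Three variants are emitted per width/bit depth: the plain function writes the
; 14-bit intermediate to an int16_t block with stride MAX_PB_SIZE, the uni_
; variant writes final pixels, and the bi_ variant first combines the result
; with a second 14-bit block (src2) before writing pixels.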
%macro HEVC_PUT_HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 4, 4, 3, dst, src, srcstride, height
    pxor m2, m2
.loop
    SIMPLE_LOAD %1, %2, srcq, m0
    MC_PIXEL_COMPUTE %1, %2
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 2, dst, dststride, src, srcstride, height
.loop
    SIMPLE_LOAD %1, %2, srcq, m0
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq ; dst += dststride
    add srcq, srcstrideq ; src += srcstride
    dec heightd          ; cmp height
    jnz .loop            ; height loop
    RET

cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 6, 6, 6, dst, dststride, src, srcstride, src2, height
    pxor m2, m2
    movdqa m5, [pw_bi_%2]
.loop
    SIMPLE_LOAD %1, %2, srcq, m0
    SIMPLE_BILOAD %1, src2q, m3, m4
    MC_PIXEL_COMPUTE %1, %2
    BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq     ; dst += dststride
    add srcq, srcstrideq     ; src += srcstride
    add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
    dec heightd              ; cmp height
    jnz .loop                ; height loop
    RET

%endmacro


; ******************************
; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int width, int height, int mx, int my,
;                       int16_t* mcbuffer)
; ******************************


%macro HEVC_PUT_HEVC_EPEL 2
cglobal hevc_put_hevc_epel_h%1_%2, 5, 6, 6, dst, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_FILTER %2, mx, m4, m5
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    movdqa m6, [pw_%2]
    EPEL_FILTER %2, mx, m4, m5
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    UNI_COMPUTE %1, %2, m0, m1, m6
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq ; dst += dststride
    add srcq, srcstrideq ; src += srcstride
    dec heightd          ; cmp height
    jnz .loop            ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_h%1_%2, 7, 8, 7, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa m6, [pw_bi_%2]
    EPEL_FILTER %2, mx, m4, m5
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    SIMPLE_BILOAD %1, src2q, m2, m3
    BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq     ; dst += dststride
    add srcq, srcstrideq     ; src += srcstride
    add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
    dec heightd              ; cmp height
    jnz .loop                ; height loop
    RET

; ******************************
; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
;                      uint8_t *_src, ptrdiff_t _srcstride,
;                      int width, int height, int mx, int my,
;                      int16_t* mcbuffer)
; ******************************

cglobal hevc_put_hevc_epel_v%1_%2, 6, 7, 6, dst, src, srcstride, height, r3src, my, rfilter
    lea r3srcq, [srcstrideq*3]
    sub srcq, srcstrideq
    EPEL_FILTER %2, my, m4, m5
.loop
    EPEL_LOAD %2, srcq, srcstride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
    lea r3srcq, [srcstrideq*3]
    movdqa m6, [pw_%2]
    sub srcq, srcstrideq
    EPEL_FILTER %2, my, m4, m5
.loop
    EPEL_LOAD %2, srcq, srcstride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    UNI_COMPUTE %1, %2, m0, m1, m6
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq ; dst += dststride
    add srcq, srcstrideq ; src += srcstride
    dec heightd          ; cmp height
    jnz .loop            ; height loop
    RET


cglobal hevc_put_hevc_bi_epel_v%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    lea r3srcq, [srcstrideq*3]
    movdqa m6, [pw_bi_%2]
    sub srcq, srcstrideq
    EPEL_FILTER %2, my, m4, m5
.loop
    EPEL_LOAD %2, srcq, srcstride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    SIMPLE_BILOAD %1, src2q, m2, m3
    BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq     ; dst += dststride
    add srcq, srcstrideq     ; src += srcstride
    add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
    dec heightd              ; cmp height
    jnz .loop                ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int width, int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_EPEL_HV 2
cglobal hevc_put_hevc_epel_hv%1_%2, 6, 8, 12, dst, src, srcstride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    sub srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m4, m0
    add srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m5, m0
    add srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m6, m0
    add srcq, srcstrideq
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m7, m0
    punpcklwd m0, m4, m5
    punpcklwd m2, m6, m7
%if %1 > 4
    punpckhwd m1, m4, m5
    punpckhwd m3, m6, m7
%endif
    EPEL_COMPUTE 14, %1, m12, m13
    PEL_10STORE%1 dstq, m0, m1
    movdqa m4, m5
    movdqa m5, m6
    movdqa m6, m7
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    sub srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m4, m0
    add srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m5, m0
    add srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m6, m0
    add srcq, srcstrideq
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m7, m0
    punpcklwd m0, m4, m5
    punpcklwd m2, m6, m7
%if %1 > 4
    punpckhwd m1, m4, m5
    punpckhwd m3, m6, m7
%endif
    EPEL_COMPUTE 14, %1, m12, m13
    UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1 dstq, m0, m1
    movdqa m4, m5
    movdqa m5, m6
    movdqa m6, m7
    add dstq, dststrideq ; dst += dststride
    add srcq, srcstrideq ; src += srcstride
    dec heightd          ; cmp height
    jnz .loop            ; height loop
    RET


cglobal hevc_put_hevc_bi_epel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    sub srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m4, m0
    add srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m5, m0
    add srcq, srcstrideq
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m6, m0
    add srcq, srcstrideq
.loop
    EPEL_LOAD %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP m7, m0
    punpcklwd m0, m4, m5
    punpcklwd m2, m6, m7
%if %1 > 4
    punpckhwd m1, m4, m5
    punpckhwd m3, m6, m7
%endif
    EPEL_COMPUTE 14, %1, m12, m13
    SIMPLE_BILOAD %1, src2q, m8, m9
    BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1 dstq, m0, m1
    movdqa m4, m5
    movdqa m5, m6
    movdqa m6, m7
    add dstq, dststrideq     ; dst += dststride
    add srcq, srcstrideq     ; src += srcstride
    add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
    dec heightd              ; cmp height
    jnz .loop                ; height loop
    RET
%endmacro
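
; In the hv variants above, the horizontal 4-tap pass runs once per input row
; and its 14-bit results are kept in m4..m7 as a sliding window of the last
; four rows; the vertical pass is then EPEL_COMPUTE at "bit depth" 14 on those
; registers, so only one new row is filtered horizontally per output row.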

; ******************************
; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int width, int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_QPEL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 15, dst, src, srcstride, height, mx, rfilter
    QPEL_FILTER %2, mx
.loop
    QPEL_H_LOAD %2, srcq, %1, 10
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw m0, m1
%endif
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15, dst, dststride, src, srcstride, height, mx, rfilter
    movdqa m9, [pw_%2]
    QPEL_FILTER %2, mx
.loop
    QPEL_H_LOAD %2, srcq, %1, 10
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw m0, m1
%endif
    UNI_COMPUTE %1, %2, m0, m1, m9
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq ; dst += dststride
    add srcq, srcstrideq ; src += srcstride
    dec heightd          ; cmp height
    jnz .loop            ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_h%1_%2, 7, 8, 16, dst, dststride, src, srcstride, src2, height, mx, rfilter
    movdqa m9, [pw_bi_%2]
    QPEL_FILTER %2, mx
.loop
    QPEL_H_LOAD %2, srcq, %1, 10
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw m0, m1
%endif
    SIMPLE_BILOAD %1, src2q, m10, m11
    BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq     ; dst += dststride
    add srcq, srcstrideq     ; src += srcstride
    add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
    dec heightd              ; cmp height
    jnz .loop                ; height loop
    RET


; ******************************
; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int width, int height, int mx, int my)
; ******************************

cglobal hevc_put_hevc_qpel_v%1_%2, 6, 8, 15, dst, src, srcstride, height, r3src, my, rfilter
    lea r3srcq, [srcstrideq*3]
    QPEL_FILTER %2, my
.loop
    QPEL_V_LOAD %2, srcq, srcstride, %1, r7
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw m0, m1
%endif
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 9, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
    movdqa m9, [pw_%2]
    lea r3srcq, [srcstrideq*3]
    QPEL_FILTER %2, my
.loop
    QPEL_V_LOAD %2, srcq, srcstride, %1, r8
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw m0, m1
%endif
    UNI_COMPUTE %1, %2, m0, m1, m9
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq ; dst += dststride
    add srcq, srcstrideq ; src += srcstride
    dec heightd          ; cmp height
    jnz .loop            ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_v%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, r3src, my, rfilter
    movdqa m9, [pw_bi_%2]
    lea r3srcq, [srcstrideq*3]
    QPEL_FILTER %2, my
.loop
    SIMPLE_BILOAD %1, src2q, m10, m11
    QPEL_V_LOAD %2, srcq, srcstride, %1, r9
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw m0, m1
%endif
    BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq     ; dst += dststride
    add srcq, srcstrideq     ; src += srcstride
    add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
    dec heightd              ; cmp height
    jnz .loop                ; height loop
    RET
%endmacro


; ******************************
; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
;                          uint8_t *_src, ptrdiff_t _srcstride,
;                          int height, int mx, int my)
; ******************************
%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 8, 12, dst, src, srcstride, height, mx, my, r3src, rfilter
    lea mxq, [mxq*8-8]
    lea myq, [myq*8-8]
    lea r3srcq, [srcstrideq*3]
    sub srcq, r3srcq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m8, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m9, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m10, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m11, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m12, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m13, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m14, m0
    add srcq, srcstrideq
.loop
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m15, m0
    punpcklwd m0, m8, m9
    punpcklwd m2, m10, m11
    punpcklwd m4, m12, m13
    punpcklwd m6, m14, m15
%if %1 > 4
    punpckhwd m1, m8, m9
    punpckhwd m3, m10, m11
    punpckhwd m5, m12, m13
    punpckhwd m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackssdw
    PEL_10STORE%1 dstq, m0, m1
%if %1 <= 4
    movq m8, m9
    movq m9, m10
    movq m10, m11
    movq m11, m12
    movq m12, m13
    movq m13, m14
    movq m14, m15
%else
    movdqa m8, m9
    movdqa m9, m10
    movdqa m10, m11
    movdqa m11, m12
    movdqa m12, m13
    movdqa m13, m14
    movdqa m14, m15
%endif
    LOOP_END dst, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
    lea mxq, [mxq*8-8]
    lea myq, [myq*8-8]
    lea r3srcq, [srcstrideq*3]
    sub srcq, r3srcq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m8, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m9, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m10, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m11, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m12, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m13, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m14, m0
    add srcq, srcstrideq
.loop
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m15, m0
    punpcklwd m0, m8, m9
    punpcklwd m2, m10, m11
    punpcklwd m4, m12, m13
    punpcklwd m6, m14, m15
%if %1 > 4
    punpckhwd m1, m8, m9
    punpckhwd m3, m10, m11
    punpckhwd m5, m12, m13
    punpckhwd m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackusdw
    UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1 dstq, m0, m1

%if %1 <= 4
    movq m8, m9
    movq m9, m10
    movq m10, m11
    movq m11, m12
    movq m12, m13
    movq m13, m14
    movq m14, m15
%else
    movdqa m8, m9
    movdqa m9, m10
    movdqa m10, m11
    movdqa m11, m12
    movdqa m12, m13
    movdqa m13, m14
    movdqa m14, m15
%endif
    add dstq, dststrideq ; dst += dststride
    add srcq, srcstrideq ; src += srcstride
    dec heightd          ; cmp height
    jnz .loop            ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride, src2, height, mx, my, r3src, rfilter
    lea mxq, [mxq*8-8]
    lea myq, [myq*8-8]
    lea r3srcq, [srcstrideq*3]
    sub srcq, r3srcq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m8, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m9, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m10, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m11, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m12, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m13, m0
    add srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m14, m0
    add srcq, srcstrideq
.loop
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP m15, m0
    punpcklwd m0, m8, m9
    punpcklwd m2, m10, m11
    punpcklwd m4, m12, m13
    punpcklwd m6, m14, m15
%if %1 > 4
    punpckhwd m1, m8, m9
    punpckhwd m3, m10, m11
    punpckhwd m5, m12, m13
    punpckhwd m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackssdw
    SIMPLE_BILOAD %1, src2q, m8, m9 ;m9 not used in this case
    BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1 dstq, m0, m1

%if %1 <= 4
    movq m8, m9
    movq m9, m10
    movq m10, m11
    movq m11, m12
    movq m12, m13
    movq m13, m14
    movq m14, m15
%else
    movdqa m8, m9
    movdqa m9, m10
    movdqa m10, m11
    movdqa m11, m12
    movdqa m12, m13
    movdqa m13, m14
    movdqa m14, m15
%endif
    add dstq, dststrideq     ; dst += dststride
    add srcq, srcstrideq     ; src += srcstride
    add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
    dec heightd              ; cmp height
    jnz .loop                ; height loop
    RET
%endmacro

%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
    mov r4d, denomm
%define SHIFT r4d
%else
cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
%define SHIFT denomd
%endif
    lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom
%if %1 <= 4
    pxor m1, m1
%endif
    movd m2, wxm   ; WX
    movd m4, SHIFT ; shift
%if %1 <= 4
    punpcklwd m2, m1
%else
    punpcklwd m2, m2
%endif
    dec SHIFT
    movdqu m5, [one_per_32]
    movd m6, SHIFT
    pshufd m2, m2, 0
    mov SHIFT, oxm
    pslld m5, m6
%if %2 != 8
    shl SHIFT, %2-8 ; ox << (bitd - 8)
%endif
    movd m3, SHIFT  ; OX
    pshufd m3, m3, 0
%if WIN64 || ARCH_X86_32
    mov SHIFT, heightm
%endif
.loop
    SIMPLE_LOAD %1, 10, srcq, m0
%if %1 <= 4
    punpcklwd m0, m1
    pmaddwd m0, m2
    paddd m0, m5
    psrad m0, m4
    paddd m0, m3
%else
    pmulhw m6, m0, m2
    pmullw m0, m2
    punpckhwd m1, m0, m6
    punpcklwd m0, m6
    paddd m0, m5
    paddd m1, m5
    psrad m0, m4
    psrad m1, m4
    paddd m0, m3
    paddd m1, m3
%endif
    packssdw m0, m1
%if %2 == 8
    packuswb m0, m0
%else
    pminsw m0, [max_pixels_%2]
    pmaxsw m0, [zero]
%endif
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq    ; dst += dststride
    add srcq, 2*MAX_PB_SIZE ; src += srcstride
    dec heightd             ; cmp height
    jnz .loop               ; height loop
    RET

cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1
    mov r6d, denomm
%if %1 <= 4
    pxor m1, m1
%endif
    movd m2, wx0m          ; WX0
    lea r6d, [r6d+14-%2]   ; shift = 14 - bitd + denom
    movd m3, wx1m          ; WX1
    movd m0, r6d           ; shift
%if %1 <= 4
    punpcklwd m2, m1
    punpcklwd m3, m1
%else
    punpcklwd m2, m2
    punpcklwd m3, m3
%endif
    inc r6d
    movd m5, r6d           ; shift+1
    pshufd m2, m2, 0
    mov r6d, ox0m
    pshufd m3, m3, 0
    add r6d, ox1m
%if %2 != 8
    shl r6d, %2-8          ; ox << (bitd - 8)
%endif
    inc r6d
    movd m4, r6d           ; offset
    pshufd m4, m4, 0
    mov r6d, heightm
    pslld m4, m0

.loop
    SIMPLE_LOAD %1, 10, srcq, m0
    SIMPLE_LOAD %1, 10, src2q, m8
%if %1 <= 4
    punpcklwd m0, m1
    punpcklwd m8, m1
    pmaddwd m0, m3
    pmaddwd m8, m2
    paddd m0, m4
    paddd m0, m8
    psrad m0, m5
%else
    pmulhw m6, m0, m3
    pmullw m0, m3
    pmulhw m7, m8, m2
    pmullw m8, m2
    punpckhwd m1, m0, m6
    punpcklwd m0, m6
    punpckhwd m9, m8, m7
    punpcklwd m8, m7
    paddd m0, m8
    paddd m1, m9
    paddd m0, m4
    paddd m1, m4
    psrad m0, m5
    psrad m1, m5
%endif
    packssdw m0, m1
%if %2 == 8
    packuswb m0, m0
%else
    pminsw m0, [max_pixels_%2]
    pmaxsw m0, [zero]
%endif
    PEL_%2STORE%1 dstq, m0, m1
    add dstq, dststrideq     ; dst += dststride
    add srcq, 2*MAX_PB_SIZE  ; src += srcstride
    add src2q, 2*MAX_PB_SIZE ; src2 += srcstride
    dec r6d                  ; cmp height
    jnz .loop                ; height loop
    RET
%endmacro
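
; The weighting functions apply HEVC explicit weighted prediction to the 14-bit
; intermediate blocks; per sample this corresponds roughly to
;     uni: out = clip(((src * wx + (1 << (shift-1))) >> shift) + ox)
;     bi:  out = clip((src1 * wx0 + src2 * wx1 + offset) >> (shift+1))
; with shift = denom + 14 - bitdepth and
; offset = (((ox0 + ox1) << (bitdepth - 8)) + 1) << shift, clipped to [0, max].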

WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
WEIGHTING_FUNCS 8, 8

WEIGHTING_FUNCS 2, 10
WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10

WEIGHTING_FUNCS 2, 12
WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12

HEVC_PUT_HEVC_PEL_PIXELS  2, 8
HEVC_PUT_HEVC_PEL_PIXELS  4, 8
HEVC_PUT_HEVC_PEL_PIXELS  6, 8
HEVC_PUT_HEVC_PEL_PIXELS  8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8

HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10

HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12

HEVC_PUT_HEVC_EPEL  2, 8
HEVC_PUT_HEVC_EPEL  4, 8
HEVC_PUT_HEVC_EPEL  6, 8
HEVC_PUT_HEVC_EPEL  8, 8
HEVC_PUT_HEVC_EPEL 12, 8
HEVC_PUT_HEVC_EPEL 16, 8


HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10

HEVC_PUT_HEVC_EPEL 2, 12
HEVC_PUT_HEVC_EPEL 4, 12
HEVC_PUT_HEVC_EPEL 6, 12
HEVC_PUT_HEVC_EPEL 8, 12

HEVC_PUT_HEVC_EPEL_HV 2, 8
HEVC_PUT_HEVC_EPEL_HV 4, 8
HEVC_PUT_HEVC_EPEL_HV 6, 8
HEVC_PUT_HEVC_EPEL_HV 8, 8

HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
HEVC_PUT_HEVC_EPEL_HV 6, 10
HEVC_PUT_HEVC_EPEL_HV 8, 10

HEVC_PUT_HEVC_EPEL_HV 2, 12
HEVC_PUT_HEVC_EPEL_HV 4, 12
HEVC_PUT_HEVC_EPEL_HV 6, 12
HEVC_PUT_HEVC_EPEL_HV 8, 12

HEVC_PUT_HEVC_QPEL  4, 8
HEVC_PUT_HEVC_QPEL  8, 8
HEVC_PUT_HEVC_QPEL 12, 8
HEVC_PUT_HEVC_QPEL 16, 8

HEVC_PUT_HEVC_QPEL 4, 10
HEVC_PUT_HEVC_QPEL 8, 10

HEVC_PUT_HEVC_QPEL 4, 12
HEVC_PUT_HEVC_QPEL 8, 12

HEVC_PUT_HEVC_QPEL_HV 2, 8
HEVC_PUT_HEVC_QPEL_HV 4, 8
HEVC_PUT_HEVC_QPEL_HV 6, 8
HEVC_PUT_HEVC_QPEL_HV 8, 8

HEVC_PUT_HEVC_QPEL_HV 2, 10
HEVC_PUT_HEVC_QPEL_HV 4, 10
HEVC_PUT_HEVC_QPEL_HV 6, 10
HEVC_PUT_HEVC_QPEL_HV 8, 10

HEVC_PUT_HEVC_QPEL_HV 2, 12
HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12

%endif ; ARCH_X86_64