;*****************************************************************************
;* mc-a2.asm: x86 motion compensation
;*****************************************************************************
;* Copyright (C) 2005-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*          Mathieu Monnier <manao@melix.net>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA 32

deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15

%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
%else
deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif
pw_1024: times 16 dw 1024

pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
pf_inv256: times 8 dd 0.00390625

SECTION .text

cextern pb_0
cextern pw_1
cextern pw_16
cextern pw_32
cextern pw_512
cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
cextern pd_ffff

;The hpel_filter routines use non-temporal writes for output.
;The following defines may be uncommented for testing.
;Doing the hpel_filter temporal may be a win if the last level cache
;is big enough (preliminary benching suggests on the order of 4* framesize).

;%define movntq movq
;%define movntps movaps
;%define sfence
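; (uncommenting the three defines above remaps the non-temporal stores to
;  ordinary cached stores and drops the trailing sfence, i.e. the "temporal"
;  variant discussed in the note above)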
70 | ||
71 | %if HIGH_BIT_DEPTH == 0 | |
72 | %undef movntq | |
73 | %undef movntps | |
74 | %undef sfence | |
75 | %endif ; !HIGH_BIT_DEPTH | |
76 | ||
77 | ;----------------------------------------------------------------------------- | |
78 | ; void plane_copy_core( pixel *dst, intptr_t i_dst, | |
79 | ; pixel *src, intptr_t i_src, int w, int h ) | |
80 | ;----------------------------------------------------------------------------- | |
81 | ; assumes i_dst and w are multiples of 16, and i_dst>w | |
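; the main loop below copies 64 bytes per iteration while more than 63 bytes of
; the row remain, then the 16-byte tail loop finishes the row; with w a multiple
; of 16 this writes exactly w bytes per row, so only the prefetches touch memory
; past the end of a source row.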
INIT_MMX
cglobal plane_copy_core_mmx2, 6,7
    FIX_STRIDES r1, r3, r4d
%if HIGH_BIT_DEPTH == 0
    movsxdifnidn r4, r4d
%endif
    sub r1, r4
    sub r3, r4
.loopy:
    lea r6d, [r4-63]
.loopx:
    prefetchnta [r2+256]
    movq m0, [r2 ]
    movq m1, [r2+ 8]
    movntq [r0 ], m0
    movntq [r0+ 8], m1
    movq m2, [r2+16]
    movq m3, [r2+24]
    movntq [r0+16], m2
    movntq [r0+24], m3
    movq m4, [r2+32]
    movq m5, [r2+40]
    movntq [r0+32], m4
    movntq [r0+40], m5
    movq m6, [r2+48]
    movq m7, [r2+56]
    movntq [r0+48], m6
    movntq [r0+56], m7
    add r2, 64
    add r0, 64
    sub r6d, 64
    jg .loopx
    prefetchnta [r2+256]
    add r6d, 63
    jle .end16
.loop16:
    movq m0, [r2 ]
    movq m1, [r2+8]
    movntq [r0 ], m0
    movntq [r0+8], m1
    add r2, 16
    add r0, 16
    sub r6d, 16
    jg .loop16
.end16:
    add r0, r1
    add r2, r3
    dec r5d
    jg .loopy
    sfence
    emms
    RET


%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
%if HIGH_BIT_DEPTH
%assign x 0
%rep 16/mmsize
    mov%4 m0, [%2+(x/2)*mmsize]
    mov%4 m1, [%3+(x/2)*mmsize]
    punpckhwd m2, m0, m1
    punpcklwd m0, m1
    mov%5a [%1+(x+0)*mmsize], m0
    mov%5a [%1+(x+1)*mmsize], m2
%assign x (x+2)
%endrep
%else
    movq m0, [%2]
%if mmsize==16
%ifidn %4, a
    punpcklbw m0, [%3]
%else
    movq m1, [%3]
    punpcklbw m0, m1
%endif
    mov%5a [%1], m0
%else
    movq m1, [%3]
    punpckhbw m2, m0, m1
    punpcklbw m0, m1
    mov%5a [%1+0], m0
    mov%5a [%1+8], m2
%endif
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
%if HIGH_BIT_DEPTH
%assign n 0
%rep 16/mmsize
    mova m0, [%3+(n+0)*mmsize]
    mova m1, [%3+(n+1)*mmsize]
    psrld m2, m0, 16
    psrld m3, m1, 16
    pand m0, %5
    pand m1, %5
    packssdw m0, m1
    packssdw m2, m3
    mov%6 [%1+(n/2)*mmsize], m0
    mov%6 [%2+(n/2)*mmsize], m2
%assign n (n+2)
%endrep
%else ; !HIGH_BIT_DEPTH
%if mmsize==16
    mova m0, [%3]
%if cpuflag(ssse3)
    pshufb m0, %5
%else
    mova m1, m0
    pand m0, %5
    psrlw m1, 8
    packuswb m0, m1
%endif
%if %4
    mova [%1], m0
%else
    movq [%1], m0
    movhps [%2], m0
%endif
%else
    mova m0, [%3]
    mova m1, [%3+8]
    mova m2, m0
    mova m3, m1
    pand m0, %5
    pand m1, %5
    psrlw m2, 8
    psrlw m3, 8
    packuswb m0, m1
    packuswb m2, m3
    mova [%1], m0
    mova [%2], m2
%endif ; mmsize == 16
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro PLANE_INTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_interleave_core( uint8_t *dst,  intptr_t i_dst,
;                                  uint8_t *srcu, intptr_t i_srcu,
;                                  uint8_t *srcv, intptr_t i_srcv, int w, int h )
;-----------------------------------------------------------------------------
; assumes i_dst and w are multiples of 16, and i_dst>2*w
cglobal plane_copy_interleave_core, 6,9
    mov r6d, r6m
%if HIGH_BIT_DEPTH
    FIX_STRIDES r1, r3, r5, r6d
    movifnidn r1mp, r1
    movifnidn r3mp, r3
    mov r6m, r6d
%endif
    lea r0, [r0+r6*2]
    add r2, r6
    add r4, r6
%if ARCH_X86_64
    DECLARE_REG_TMP 7,8
%else
    DECLARE_REG_TMP 1,3
%endif
    mov t1, r1
    shr t1, SIZEOF_PIXEL
    sub t1, r6
    mov t0d, r7m
.loopy:
    mov r6d, r6m
    neg r6
.prefetch:
    prefetchnta [r2+r6]
    prefetchnta [r4+r6]
    add r6, 64
    jl .prefetch
    mov r6d, r6m
    neg r6
.loopx:
    INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
    INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
    add r6, 16*SIZEOF_PIXEL
    jl .loopx
.pad:
%assign n 0
%rep SIZEOF_PIXEL
%if mmsize==8
    movntq [r0+r6*2+(n+ 0)], m0
    movntq [r0+r6*2+(n+ 8)], m0
    movntq [r0+r6*2+(n+16)], m0
    movntq [r0+r6*2+(n+24)], m0
%else
    movntdq [r0+r6*2+(n+ 0)], m0
    movntdq [r0+r6*2+(n+16)], m0
%endif
%assign n n+32
%endrep
    add r6, 16*SIZEOF_PIXEL
    cmp r6, t1
    jl .pad
    add r0, r1mp
    add r2, r3mp
    add r4, r5
    dec t0d
    jg .loopy
    sfence
    emms
    RET

;-----------------------------------------------------------------------------
; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
cglobal store_interleave_chroma, 5,5
    FIX_STRIDES r1
.loop:
    INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
    INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
    add r2, FDEC_STRIDEB*2
    add r3, FDEC_STRIDEB*2
    lea r0, [r0+r1*2]
    sub r4d, 2
    jg .loop
    RET
%endmacro ; PLANE_INTERLEAVE

%macro DEINTERLEAVE_START 0
%if HIGH_BIT_DEPTH
    mova m4, [pd_ffff]
%elif cpuflag(ssse3)
    mova m4, [deinterleave_shuf]
%else
    mova m4, [pw_00ff]
%endif ; HIGH_BIT_DEPTH
%endmacro

%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
;                               pixel *dstv, intptr_t i_dstv,
;                               pixel *src,  intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal plane_copy_deinterleave, 6,7
    DEINTERLEAVE_START
    mov r6d, r6m
    FIX_STRIDES r1, r3, r5, r6d
%if HIGH_BIT_DEPTH
    mov r6m, r6d
%endif
    add r0, r6
    add r2, r6
    lea r4, [r4+r6*2]
.loopy:
    mov r6d, r6m
    neg r6
.loopx:
    DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
    DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
    add r6, 16*SIZEOF_PIXEL
    jl .loopx
    add r0, r1
    add r2, r3
    add r4, r5
    dec dword r7m
    jg .loopy
    RET

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fenc, 4,4
    DEINTERLEAVE_START
    FIX_STRIDES r2
.loop:
    DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
    DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
    add r0, FENC_STRIDEB*2
    lea r1, [r1+r2*2]
    sub r3d, 2
    jg .loop
    RET

;-----------------------------------------------------------------------------
; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
;-----------------------------------------------------------------------------
cglobal load_deinterleave_chroma_fdec, 4,4
    DEINTERLEAVE_START
    FIX_STRIDES r2
.loop:
    DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
    DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
    add r0, FDEC_STRIDEB*2
    lea r1, [r1+r2*2]
    sub r3d, 2
    jg .loop
    RET
%endmacro ; PLANE_DEINTERLEAVE

%if HIGH_BIT_DEPTH
INIT_MMX mmx2
PLANE_INTERLEAVE
INIT_MMX mmx
PLANE_DEINTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
INIT_XMM avx
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
%else
INIT_MMX mmx2
PLANE_INTERLEAVE
INIT_MMX mmx
PLANE_DEINTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
INIT_XMM ssse3
PLANE_DEINTERLEAVE
%endif

; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
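; i.e. the expected caller contract is, roughly: pointers aligned to the vector
; size in use, n a multiple of 16 for memcpy_aligned, and n a multiple of 128
; for the SSE memzero_aligned (it clears 8*16 = 128 bytes per loop iteration).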
400 | ||
401 | ;----------------------------------------------------------------------------- | |
402 | ; void *memcpy_aligned( void *dst, const void *src, size_t n ); | |
403 | ;----------------------------------------------------------------------------- | |
404 | %macro MEMCPY 0 | |
405 | cglobal memcpy_aligned, 3,3 | |
406 | %if mmsize == 16 | |
407 | test r2d, 16 | |
408 | jz .copy2 | |
409 | mova m0, [r1+r2-16] | |
410 | mova [r0+r2-16], m0 | |
411 | sub r2d, 16 | |
412 | .copy2: | |
413 | %endif | |
414 | test r2d, 2*mmsize | |
415 | jz .copy4start | |
416 | mova m0, [r1+r2-1*mmsize] | |
417 | mova m1, [r1+r2-2*mmsize] | |
418 | mova [r0+r2-1*mmsize], m0 | |
419 | mova [r0+r2-2*mmsize], m1 | |
420 | sub r2d, 2*mmsize | |
421 | .copy4start: | |
422 | test r2d, r2d | |
423 | jz .ret | |
424 | .copy4: | |
425 | mova m0, [r1+r2-1*mmsize] | |
426 | mova m1, [r1+r2-2*mmsize] | |
427 | mova m2, [r1+r2-3*mmsize] | |
428 | mova m3, [r1+r2-4*mmsize] | |
429 | mova [r0+r2-1*mmsize], m0 | |
430 | mova [r0+r2-2*mmsize], m1 | |
431 | mova [r0+r2-3*mmsize], m2 | |
432 | mova [r0+r2-4*mmsize], m3 | |
433 | sub r2d, 4*mmsize | |
434 | jg .copy4 | |
435 | .ret: | |
436 | REP_RET | |
437 | %endmacro | |
438 | ||
439 | INIT_MMX mmx | |
440 | MEMCPY | |
441 | INIT_XMM sse | |
442 | MEMCPY | |
443 | ||
444 | ;----------------------------------------------------------------------------- | |
445 | ; void *memzero_aligned( void *dst, size_t n ); | |
446 | ;----------------------------------------------------------------------------- | |
447 | %macro MEMZERO 1 | |
448 | cglobal memzero_aligned, 2,2 | |
449 | add r0, r1 | |
450 | neg r1 | |
451 | %if mmsize == 8 | |
452 | pxor m0, m0 | |
453 | %else | |
454 | xorps m0, m0 | |
455 | %endif | |
456 | .loop: | |
457 | %assign i 0 | |
458 | %rep %1 | |
459 | mova [r0 + r1 + i], m0 | |
460 | %assign i i+mmsize | |
461 | %endrep | |
462 | add r1, mmsize*%1 | |
463 | jl .loop | |
464 | RET | |
465 | %endmacro | |
466 | ||
467 | INIT_MMX mmx | |
468 | MEMZERO 8 | |
469 | INIT_XMM sse | |
470 | MEMZERO 8 | |
471 | INIT_YMM avx | |
472 | MEMZERO 4 | |
473 | ||
474 | %if HIGH_BIT_DEPTH == 0 | |
475 | ;----------------------------------------------------------------------------- | |
476 | ; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride ) | |
477 | ;----------------------------------------------------------------------------- | |
478 | %macro INTEGRAL_INIT4H 0 | |
479 | cglobal integral_init4h, 3,4 | |
480 | lea r3, [r0+r2*2] | |
481 | add r1, r2 | |
482 | neg r2 | |
483 | pxor m4, m4 | |
484 | .loop: | |
485 | mova m0, [r1+r2] | |
486 | %if mmsize==32 | |
487 | movu m1, [r1+r2+8] | |
488 | %else | |
489 | mova m1, [r1+r2+16] | |
490 | palignr m1, m0, 8 | |
491 | %endif | |
492 | mpsadbw m0, m4, 0 | |
493 | mpsadbw m1, m4, 0 | |
494 | paddw m0, [r0+r2*2] | |
495 | paddw m1, [r0+r2*2+mmsize] | |
496 | mova [r3+r2*2 ], m0 | |
497 | mova [r3+r2*2+mmsize], m1 | |
498 | add r2, mmsize | |
499 | jl .loop | |
500 | RET | |
501 | %endmacro | |
502 | ||
503 | INIT_XMM sse4 | |
504 | INTEGRAL_INIT4H | |
505 | INIT_YMM avx2 | |
506 | INTEGRAL_INIT4H | |
507 | ||
508 | %macro INTEGRAL_INIT8H 0 | |
509 | cglobal integral_init8h, 3,4 | |
510 | lea r3, [r0+r2*2] | |
511 | add r1, r2 | |
512 | neg r2 | |
513 | pxor m4, m4 | |
514 | .loop: | |
515 | mova m0, [r1+r2] | |
516 | %if mmsize==32 | |
517 | movu m1, [r1+r2+8] | |
518 | mpsadbw m2, m0, m4, 100100b | |
519 | mpsadbw m3, m1, m4, 100100b | |
520 | %else | |
521 | mova m1, [r1+r2+16] | |
522 | palignr m1, m0, 8 | |
523 | mpsadbw m2, m0, m4, 100b | |
524 | mpsadbw m3, m1, m4, 100b | |
525 | %endif | |
526 | mpsadbw m0, m4, 0 | |
527 | mpsadbw m1, m4, 0 | |
528 | paddw m0, [r0+r2*2] | |
529 | paddw m1, [r0+r2*2+mmsize] | |
530 | paddw m0, m2 | |
531 | paddw m1, m3 | |
532 | mova [r3+r2*2 ], m0 | |
533 | mova [r3+r2*2+mmsize], m1 | |
534 | add r2, mmsize | |
535 | jl .loop | |
536 | RET | |
537 | %endmacro | |
538 | ||
539 | INIT_XMM sse4 | |
540 | INTEGRAL_INIT8H | |
541 | INIT_XMM avx | |
542 | INTEGRAL_INIT8H | |
543 | INIT_YMM avx2 | |
544 | INTEGRAL_INIT8H | |
545 | %endif ; !HIGH_BIT_DEPTH | |
546 | ||
547 | %macro INTEGRAL_INIT_8V 0 | |
548 | ;----------------------------------------------------------------------------- | |
549 | ; void integral_init8v( uint16_t *sum8, intptr_t stride ) | |
550 | ;----------------------------------------------------------------------------- | |
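; roughly equivalent to, per element: sum8[x] = sum8[x + 8*stride] - sum8[x]
; (the difference of two horizontal-sum rows located 8 rows apart)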
cglobal integral_init8v, 3,3
    add r1, r1
    add r0, r1
    lea r2, [r0+r1*8]
    neg r1
.loop:
    mova m0, [r2+r1]
    mova m1, [r2+r1+mmsize]
    psubw m0, [r0+r1]
    psubw m1, [r0+r1+mmsize]
    mova [r0+r1], m0
    mova [r0+r1+mmsize], m1
    add r1, 2*mmsize
    jl .loop
    RET
%endmacro

INIT_MMX mmx
INTEGRAL_INIT_8V
INIT_XMM sse2
INTEGRAL_INIT_8V
INIT_YMM avx2
INTEGRAL_INIT_8V

;-----------------------------------------------------------------------------
; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
;-----------------------------------------------------------------------------
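; per element, roughly (offsets in uint16_t elements):
;   sum4[x] = sum8[x+4*stride] - sum8[x]
;   sum8[x] = sum8[x+8*stride] + sum8[x+8*stride+4] - sum8[x] - sum8[x+4]
; where the +4 terms come from the shufpd/palignr/unaligned-load step in the
; versions below.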
INIT_MMX mmx
cglobal integral_init4v, 3,5
    shl r2, 1
    lea r3, [r0+r2*4]
    lea r4, [r0+r2*8]
    mova m0, [r0+r2]
    mova m4, [r4+r2]
.loop:
    mova m1, m4
    psubw m1, m0
    mova m4, [r4+r2-8]
    mova m0, [r0+r2-8]
    paddw m1, m4
    mova m3, [r3+r2-8]
    psubw m1, m0
    psubw m3, m0
    mova [r0+r2-8], m1
    mova [r1+r2-8], m3
    sub r2, 8
    jge .loop
    RET

INIT_XMM sse2
cglobal integral_init4v, 3,5
    shl r2, 1
    add r0, r2
    add r1, r2
    lea r3, [r0+r2*4]
    lea r4, [r0+r2*8]
    neg r2
.loop:
    mova m0, [r0+r2]
    mova m1, [r4+r2]
    mova m2, m0
    mova m4, m1
    shufpd m0, [r0+r2+16], 1
    shufpd m1, [r4+r2+16], 1
    paddw m0, m2
    paddw m1, m4
    mova m3, [r3+r2]
    psubw m1, m0
    psubw m3, m2
    mova [r0+r2], m1
    mova [r1+r2], m3
    add r2, 16
    jl .loop
    RET

INIT_XMM ssse3
cglobal integral_init4v, 3,5
    shl r2, 1
    add r0, r2
    add r1, r2
    lea r3, [r0+r2*4]
    lea r4, [r0+r2*8]
    neg r2
.loop:
    mova m2, [r0+r2]
    mova m0, [r0+r2+16]
    mova m4, [r4+r2]
    mova m1, [r4+r2+16]
    palignr m0, m2, 8
    palignr m1, m4, 8
    paddw m0, m2
    paddw m1, m4
    mova m3, [r3+r2]
    psubw m1, m0
    psubw m3, m2
    mova [r0+r2], m1
    mova [r1+r2], m3
    add r2, 16
    jl .loop
    RET

INIT_YMM avx2
cglobal integral_init4v, 3,5
    add r2, r2
    add r0, r2
    add r1, r2
    lea r3, [r0+r2*4]
    lea r4, [r0+r2*8]
    neg r2
.loop:
    mova m2, [r0+r2]
    movu m1, [r4+r2+8]
    paddw m0, m2, [r0+r2+8]
    paddw m1, [r4+r2]
    mova m3, [r3+r2]
    psubw m1, m0
    psubw m3, m2
    mova [r0+r2], m1
    mova [r1+r2], m3
    add r2, 32
    jl .loop
    RET

%macro FILT8x4 7
    mova %3, [r0+%7]
    mova %4, [r0+r5+%7]
    pavgb %3, %4
    pavgb %4, [r0+r5*2+%7]
    PALIGNR %1, %3, 1, m6
    PALIGNR %2, %4, 1, m6
%if cpuflag(xop)
    pavgb %1, %3
    pavgb %2, %4
%else
    pavgb %1, %3
    pavgb %2, %4
    psrlw %5, %1, 8
    psrlw %6, %2, 8
    pand %1, m7
    pand %2, m7
%endif
%endmacro

%macro FILT32x4U 4
    mova m1, [r0+r5]
    pavgb m0, m1, [r0]
    movu m3, [r0+r5+1]
    pavgb m2, m3, [r0+1]
    pavgb m1, [r0+r5*2]
    pavgb m3, [r0+r5*2+1]
    pavgb m0, m2
    pavgb m1, m3

    mova m3, [r0+r5+mmsize]
    pavgb m2, m3, [r0+mmsize]
    movu m5, [r0+r5+1+mmsize]
    pavgb m4, m5, [r0+1+mmsize]
    pavgb m3, [r0+r5*2+mmsize]
    pavgb m5, [r0+r5*2+1+mmsize]
    pavgb m2, m4
    pavgb m3, m5

    pshufb m0, m7
    pshufb m1, m7
    pshufb m2, m7
    pshufb m3, m7
    punpckhqdq m4, m0, m2
    punpcklqdq m0, m0, m2
    punpckhqdq m5, m1, m3
    punpcklqdq m2, m1, m3
    vpermq m0, m0, q3120
    vpermq m1, m4, q3120
    vpermq m2, m2, q3120
    vpermq m3, m5, q3120
    mova [%1], m0
    mova [%2], m1
    mova [%3], m2
    mova [%4], m3
%endmacro

%macro FILT16x2 4
    mova m3, [r0+%4+mmsize]
    mova m2, [r0+%4]
    pavgb m3, [r0+%4+r5+mmsize]
    pavgb m2, [r0+%4+r5]
    PALIGNR %1, m3, 1, m6
    pavgb %1, m3
    PALIGNR m3, m2, 1, m6
    pavgb m3, m2
%if cpuflag(xop)
    vpperm m5, m3, %1, m7
    vpperm m3, m3, %1, m6
%else
    psrlw m5, m3, 8
    psrlw m4, %1, 8
    pand m3, m7
    pand %1, m7
    packuswb m3, %1
    packuswb m5, m4
%endif
    mova [%2], m3
    mova [%3], m5
    mova %1, m2
%endmacro

%macro FILT8x2U 3
    mova m3, [r0+%3+8]
    mova m2, [r0+%3]
    pavgb m3, [r0+%3+r5+8]
    pavgb m2, [r0+%3+r5]
    mova m1, [r0+%3+9]
    mova m0, [r0+%3+1]
    pavgb m1, [r0+%3+r5+9]
    pavgb m0, [r0+%3+r5+1]
    pavgb m1, m3
    pavgb m0, m2
    psrlw m3, m1, 8
    psrlw m2, m0, 8
    pand m1, m7
    pand m0, m7
    packuswb m0, m1
    packuswb m2, m3
    mova [%1], m0
    mova [%2], m2
%endmacro

%macro FILT8xU 3
    mova m3, [r0+%3+8]
    mova m2, [r0+%3]
    pavgw m3, [r0+%3+r5+8]
    pavgw m2, [r0+%3+r5]
    movu m1, [r0+%3+10]
    movu m0, [r0+%3+2]
    pavgw m1, [r0+%3+r5+10]
    pavgw m0, [r0+%3+r5+2]
    pavgw m1, m3
    pavgw m0, m2
    psrld m3, m1, 16
    psrld m2, m0, 16
    pand m1, m7
    pand m0, m7
    packssdw m0, m1
    packssdw m2, m3
    movu [%1], m0
    mova [%2], m2
%endmacro

%macro FILT8xA 4
    mova m3, [r0+%4+mmsize]
    mova m2, [r0+%4]
    pavgw m3, [r0+%4+r5+mmsize]
    pavgw m2, [r0+%4+r5]
    PALIGNR %1, m3, 2, m6
    pavgw %1, m3
    PALIGNR m3, m2, 2, m6
    pavgw m3, m2
%if cpuflag(xop)
    vpperm m5, m3, %1, m7
    vpperm m3, m3, %1, m6
%else
    psrld m5, m3, 16
    psrld m4, %1, 16
    pand m3, m7
    pand %1, m7
    packssdw m3, %1
    packssdw m5, m4
%endif
    mova [%2], m3
    mova [%3], m5
    mova %1, m2
%endmacro

;-----------------------------------------------------------------------------
; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                              intptr_t src_stride, intptr_t dst_stride, int width, int height )
;-----------------------------------------------------------------------------
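; Generates the four half-resolution planes used by the lookahead: each output
; pixel is the average of a 2x2 block of source pixels, with dst0 starting at
; the top-left sample, dsth offset by one source column, dstv by one source
; row, and dstc by both (i.e. the four half-pel phases of the downscale).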
%macro FRAME_INIT_LOWRES 0
cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
%if HIGH_BIT_DEPTH
    shl dword r6m, 1
    FIX_STRIDES r5
    shl dword r7m, 1
%endif
%if mmsize >= 16
    add dword r7m, mmsize-1
    and dword r7m, ~(mmsize-1)
%endif
    ; src += 2*(height-1)*stride + 2*width
    mov r6d, r8m
    dec r6d
    imul r6d, r5d
    add r6d, r7m
    lea r0, [r0+r6*2]
    ; dst += (height-1)*stride + width
    mov r6d, r8m
    dec r6d
    imul r6d, r6m
    add r6d, r7m
    add r1, r6
    add r2, r6
    add r3, r6
    add r4, r6
    ; gap = stride - width
    mov r6d, r6m
    sub r6d, r7m
    PUSH r6
%define dst_gap [rsp+gprsize]
    mov r6d, r5d
    sub r6d, r7m
    shl r6d, 1
    PUSH r6
%define src_gap [rsp]
%if HIGH_BIT_DEPTH
%if cpuflag(xop)
    mova m6, [deinterleave_shuf32a]
    mova m7, [deinterleave_shuf32b]
%else
    pcmpeqw m7, m7
    psrld m7, 16
%endif
.vloop:
    mov r6d, r7m
%ifnidn cpuname, mmx2
    mova m0, [r0]
    mova m1, [r0+r5]
    pavgw m0, m1
    pavgw m1, [r0+r5*2]
%endif
.hloop:
    sub r0, mmsize*2
    sub r1, mmsize
    sub r2, mmsize
    sub r3, mmsize
    sub r4, mmsize
%ifidn cpuname, mmx2
    FILT8xU r1, r2, 0
    FILT8xU r3, r4, r5
%else
    FILT8xA m0, r1, r2, 0
    FILT8xA m1, r3, r4, r5
%endif
    sub r6d, mmsize
    jg .hloop
%else ; !HIGH_BIT_DEPTH
%if cpuflag(avx2)
    mova m7, [deinterleave_shuf]
%elif cpuflag(xop)
    mova m6, [deinterleave_shuf32a]
    mova m7, [deinterleave_shuf32b]
%else
    pcmpeqb m7, m7
    psrlw m7, 8
%endif
.vloop:
    mov r6d, r7m
%ifnidn cpuname, mmx2
%if mmsize <= 16
    mova m0, [r0]
    mova m1, [r0+r5]
    pavgb m0, m1
    pavgb m1, [r0+r5*2]
%endif
%endif
.hloop:
    sub r0, mmsize*2
    sub r1, mmsize
    sub r2, mmsize
    sub r3, mmsize
    sub r4, mmsize
%if mmsize==32
    FILT32x4U r1, r2, r3, r4
%elifdef m8
    FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
    mova m8, m0
    mova m9, m1
    FILT8x4 m2, m3, m0, m1, m4, m5, 0
%if cpuflag(xop)
    vpperm m4, m2, m8, m7
    vpperm m2, m2, m8, m6
    vpperm m5, m3, m9, m7
    vpperm m3, m3, m9, m6
%else
    packuswb m2, m8
    packuswb m3, m9
    packuswb m4, m10
    packuswb m5, m11
%endif
    mova [r1], m2
    mova [r2], m4
    mova [r3], m3
    mova [r4], m5
%elifidn cpuname, mmx2
    FILT8x2U r1, r2, 0
    FILT8x2U r3, r4, r5
%else
    FILT16x2 m0, r1, r2, 0
    FILT16x2 m1, r3, r4, r5
%endif
    sub r6d, mmsize
    jg .hloop
%endif ; HIGH_BIT_DEPTH
.skip:
    mov r6, dst_gap
    sub r0, src_gap
    sub r1, r6
    sub r2, r6
    sub r3, r6
    sub r4, r6
    dec dword r8m
    jg .vloop
    ADD rsp, 2*gprsize
    emms
    RET
%endmacro ; FRAME_INIT_LOWRES

INIT_MMX mmx2
FRAME_INIT_LOWRES
%if ARCH_X86_64 == 0
INIT_MMX cache32, mmx2
FRAME_INIT_LOWRES
%endif
INIT_XMM sse2
FRAME_INIT_LOWRES
INIT_XMM ssse3
FRAME_INIT_LOWRES
INIT_XMM avx
FRAME_INIT_LOWRES
INIT_XMM xop
FRAME_INIT_LOWRES
%if HIGH_BIT_DEPTH==0
INIT_YMM avx2
FRAME_INIT_LOWRES
%endif

;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
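; Per element this evaluates, in float (see the per-instruction comments below):
;   dst[i] = (prop[i] + intra[i]*invq[i]*fps_factor/256)
;          * (intra[i] - (inter[i] & 0x3fff)) / intra[i]
; with the division done as rcpps plus one Newton-Raphson refinement step.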
%macro MBTREE 0
cglobal mbtree_propagate_cost, 7,7,7
    add r6d, r6d
    lea r0, [r0+r6*2]
    add r1, r6
    add r2, r6
    add r3, r6
    add r4, r6
    neg r6
    pxor xmm4, xmm4
    movss xmm6, [r5]
    shufps xmm6, xmm6, 0
    mulps xmm6, [pf_inv256]
    movdqa xmm5, [pw_3fff]
.loop:
    movq xmm2, [r2+r6] ; intra
    movq xmm0, [r4+r6] ; invq
    movq xmm3, [r3+r6] ; inter
    movq xmm1, [r1+r6] ; prop
    punpcklwd xmm2, xmm4
    punpcklwd xmm0, xmm4
    pmaddwd xmm0, xmm2
    pand xmm3, xmm5
    punpcklwd xmm1, xmm4
    punpcklwd xmm3, xmm4
%if cpuflag(fma4)
    cvtdq2ps xmm0, xmm0
    cvtdq2ps xmm1, xmm1
    fmaddps xmm0, xmm0, xmm6, xmm1
    cvtdq2ps xmm1, xmm2
    psubd xmm2, xmm3
    cvtdq2ps xmm2, xmm2
    rcpps xmm3, xmm1
    mulps xmm1, xmm3
    mulps xmm0, xmm2
    addps xmm2, xmm3, xmm3
    fnmaddps xmm3, xmm1, xmm3, xmm2
    mulps xmm0, xmm3
%else
    cvtdq2ps xmm0, xmm0
    mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
    cvtdq2ps xmm1, xmm1 ; prop
    addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
    cvtdq2ps xmm1, xmm2 ; intra
    psubd xmm2, xmm3 ; intra - inter
    cvtdq2ps xmm2, xmm2 ; intra - inter
    rcpps xmm3, xmm1 ; 1 / intra 1st approximation
    mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
    mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
    mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
    subps xmm3, xmm1 ; 2nd approximation for 1/intra
    mulps xmm0, xmm3 ; / intra
%endif
    cvtps2dq xmm0, xmm0
    movdqa [r0+r6*2], xmm0
    add r6, 8
    jl .loop
    RET
%endmacro

INIT_XMM sse2
MBTREE
; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
INIT_XMM fma4
MBTREE

%macro INT16_UNPACK 1
    vpunpckhwd xm4, xm%1, xm7
    vpunpcklwd xm%1, xm7
    vinsertf128 m%1, m%1, xm4, 1
%endmacro

; FIXME: align loads/stores to 16 bytes
%macro MBTREE_AVX 0
cglobal mbtree_propagate_cost, 7,7,8
    add r6d, r6d
    lea r0, [r0+r6*2]
    add r1, r6
    add r2, r6
    add r3, r6
    add r4, r6
    neg r6
    mova xm5, [pw_3fff]
    vbroadcastss m6, [r5]
    mulps m6, [pf_inv256]
%if notcpuflag(avx2)
    pxor xm7, xm7
%endif
.loop:
%if cpuflag(avx2)
    pmovzxwd m0, [r2+r6] ; intra
    pmovzxwd m1, [r4+r6] ; invq
    pmovzxwd m2, [r1+r6] ; prop
    pand xm3, xm5, [r3+r6] ; inter
    pmovzxwd m3, xm3
    pmaddwd m1, m0
    psubd m4, m0, m3
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m4, m4
    fmaddps m1, m1, m6, m2
    rcpps m3, m0
    mulps m2, m0, m3
    mulps m1, m4
    addps m4, m3, m3
    fnmaddps m4, m2, m3, m4
    mulps m1, m4
%else
    movu xm0, [r2+r6]
    movu xm1, [r4+r6]
    movu xm2, [r1+r6]
    pand xm3, xm5, [r3+r6]
    INT16_UNPACK 0
    INT16_UNPACK 1
    INT16_UNPACK 2
    INT16_UNPACK 3
    cvtdq2ps m0, m0
    cvtdq2ps m1, m1
    cvtdq2ps m2, m2
    cvtdq2ps m3, m3
    mulps m1, m0
    subps m4, m0, m3
    mulps m1, m6 ; intra*invq*fps_factor>>8
    addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
    rcpps m3, m0 ; 1 / intra 1st approximation
    mulps m2, m0, m3 ; intra * (1/intra 1st approx)
    mulps m2, m3 ; intra * (1/intra 1st approx)^2
    mulps m1, m4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
    addps m3, m3 ; 2 * (1/intra 1st approx)
    subps m3, m2 ; 2nd approximation for 1/intra
    mulps m1, m3 ; / intra
%endif
    vcvtps2dq m1, m1
    movu [r0+r6*2], m1
    add r6, 16
    jl .loop
    RET
%endmacro

INIT_YMM avx
MBTREE_AVX
INIT_YMM avx2,fma3
MBTREE_AVX