/*
 * QNS functions are compiled 3 times for MMX/3DNOW/SSSE3
 * Copyright (c) 2004 Michael Niedermayer
 *
 * MMX optimization by Michael Niedermayer <michaelni@gmx.at>
 * 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"

#include "inline_asm.h"

33 | #define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0)) | |
34 | ||
35 | static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale) | |
36 | { | |
37 | x86_reg i=0; | |
38 | ||
39 | av_assert2(FFABS(scale) < MAX_ABS); | |
40 | scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; | |
41 | ||
42 | SET_RND(mm6); | |
43 | __asm__ volatile( | |
44 | "pxor %%mm7, %%mm7 \n\t" | |
45 | "movd %4, %%mm5 \n\t" | |
46 | "punpcklwd %%mm5, %%mm5 \n\t" | |
47 | "punpcklwd %%mm5, %%mm5 \n\t" | |
48 | ".p2align 4 \n\t" | |
49 | "1: \n\t" | |
50 | "movq (%1, %0), %%mm0 \n\t" | |
51 | "movq 8(%1, %0), %%mm1 \n\t" | |
52 | PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) | |
53 | "paddw (%2, %0), %%mm0 \n\t" | |
54 | "paddw 8(%2, %0), %%mm1 \n\t" | |
55 | "psraw $6, %%mm0 \n\t" | |
56 | "psraw $6, %%mm1 \n\t" | |
57 | "pmullw (%3, %0), %%mm0 \n\t" | |
58 | "pmullw 8(%3, %0), %%mm1 \n\t" | |
59 | "pmaddwd %%mm0, %%mm0 \n\t" | |
60 | "pmaddwd %%mm1, %%mm1 \n\t" | |
61 | "paddd %%mm1, %%mm0 \n\t" | |
62 | "psrld $4, %%mm0 \n\t" | |
63 | "paddd %%mm0, %%mm7 \n\t" | |
64 | "add $16, %0 \n\t" | |
65 | "cmp $128, %0 \n\t" //FIXME optimize & bench | |
66 | " jb 1b \n\t" | |
67 | PHADDD(%%mm7, %%mm6) | |
68 | "psrld $2, %%mm7 \n\t" | |
69 | "movd %%mm7, %0 \n\t" | |
70 | ||
71 | : "+r" (i) | |
72 | : "r"(basis), "r"(rem), "r"(weight), "g"(scale) | |
73 | ); | |
74 | return i; | |
75 | } | |
76 | ||
77 | static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale) | |
78 | { | |
79 | x86_reg i=0; | |
80 | ||
81 | if(FFABS(scale) < MAX_ABS){ | |
82 | scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; | |
83 | SET_RND(mm6); | |
84 | __asm__ volatile( | |
85 | "movd %3, %%mm5 \n\t" | |
86 | "punpcklwd %%mm5, %%mm5 \n\t" | |
87 | "punpcklwd %%mm5, %%mm5 \n\t" | |
88 | ".p2align 4 \n\t" | |
89 | "1: \n\t" | |
90 | "movq (%1, %0), %%mm0 \n\t" | |
91 | "movq 8(%1, %0), %%mm1 \n\t" | |
92 | PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) | |
93 | "paddw (%2, %0), %%mm0 \n\t" | |
94 | "paddw 8(%2, %0), %%mm1 \n\t" | |
95 | "movq %%mm0, (%2, %0) \n\t" | |
96 | "movq %%mm1, 8(%2, %0) \n\t" | |
97 | "add $16, %0 \n\t" | |
98 | "cmp $128, %0 \n\t" // FIXME optimize & bench | |
99 | " jb 1b \n\t" | |
100 | ||
101 | : "+r" (i) | |
102 | : "r"(basis), "r"(rem), "g"(scale) | |
103 | ); | |
104 | }else{ | |
105 | for(i=0; i<8*8; i++){ | |
106 | rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); | |
107 | } | |
108 | } | |
109 | } |