;*****************************************************************************
;* pixel-32.asm: x86_32 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

; Sign-pattern word constants defined elsewhere in the project.
; NOTE(review): presumably packed ±1 words matching their names
; (ppmm = {+,+,-,-,...}, pmpm = {+,-,+,-,...}) — confirm in const tables.
cextern pw_ppmmppmm
cextern pw_pmpmpmpm

SECTION .text
INIT_MMX mmx2

;-----------------------------------------------------------------------------
; LOAD_DIFF_4x8P dx
; Load an 8-row, 4-pixel-wide column of differences (pix1 - pix2) into
; m0..m7 as packed words, starting at byte offset dx into each row.
; In:  r0/r1 = pix1/stride1, r2/r3 = pix2/stride2.
; Advances r0/r2 by 4 rows (the mid-macro lea), so both halves of an 8x8
; block can be loaded with two invocations after restoring the pointers.
; NOTE(review): rows 3 and 7 index with r4/r5 — presumably r4 = 3*stride1
; and r5 = 3*stride2, set up by the caller; confirm at the call sites.
; Uses [spill] to park m5 while the last row is loaded, since all eight
; mmregs are live by then.
;-----------------------------------------------------------------------------
%macro LOAD_DIFF_4x8P 1 ; dx
    LOAD_DIFF  m0, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF  m1, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF  m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    LOAD_DIFF  m3, m6, none, [r0+%1+r4],   [r2+%1+r5]
    lea        r0, [r0+4*r1]               ; advance to rows 4..7
    lea        r2, [r2+4*r3]
    LOAD_DIFF  m4, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF  m5, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF  m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    movq       [spill], m5                 ; free a temp for the final row
    LOAD_DIFF  m7, m5, none, [r0+%1+r4],   [r2+%1+r5]
    movq       m5, [spill]
%endmacro

;-----------------------------------------------------------------------------
; SUM4x8_MM
; Reduce m0..m7 to per-lane sums of absolute values:
;   m0 = |m0|+|m1|+|m2|+|m3|+|m4|+|m5|+|m6|+|m7|   (packed words)
; m6/m7 are parked in [spill] so they can serve as ABSW2 scratch for the
; first four registers, then restored and folded in themselves.
;-----------------------------------------------------------------------------
%macro SUM4x8_MM 0
    movq  [spill], m6
    movq  [spill+8], m7
    ABSW2 m0, m1, m0, m1, m6, m7
    ABSW2 m2, m3, m2, m3, m6, m7
    paddw m0, m2
    paddw m1, m3
    movq  m6, [spill]
    movq  m7, [spill+8]
    ABSW2 m4, m5, m4, m5, m2, m3
    ABSW2 m6, m7, m6, m7, m2, m3
    paddw m4, m6
    paddw m5, m7
    paddw m0, m4
    paddw m1, m5
    paddw m0, m1
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
; x86_32 internal helper for the 8x8 SA8D metric.
; In:  r0/r1 = pix1/stride1, r2/r3 = pix2/stride2.
; Out: m0 = packed word sums; the public wrapper performs the final
;      horizontal reduction and rounding.
; NOTE(review): depends on r4/r5 holding 3*stride1/3*stride2 (used by
; LOAD_DIFF_4x8P) — presumably set up by the caller; confirm.
; Applies HADAMARD8_V to columns, transposes 4x4 quadrants through the
; [trans] scratch area, applies HADAMARD8_V again (rows), then sums
; absolute values; left and right 4-column halves are averaged at the end.
cglobal pixel_sa8d_8x8_internal
    push  r0                     ; keep original pointers: they are re-read
    push  r2                     ; after LOAD_DIFF_4x8P advances r0/r2
    sub   esp, 0x74
%define args  esp+0x74           ; [args] = saved r2, [args+4] = saved r0
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
    LOAD_DIFF_4x8P 0             ; left 4 columns, diffs in m0..m7
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    ; Transpose both 4x4 quadrants, staging everything in [trans];
    ; m1 is spilled because TRANSPOSE4x4W needs a free temp register.
    movq  [spill], m1
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    movq  [trans+0x00], m4
    movq  [trans+0x08], m5
    movq  [trans+0x10], m6
    movq  [trans+0x18], m7
    movq  m1, [spill]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq  [trans+0x20], m0
    movq  [trans+0x28], m1
    movq  [trans+0x30], m2
    movq  [trans+0x38], m3

    mov   r0, [args+4]           ; reload original pix1/pix2 for the
    mov   r2, [args]             ; right 4-column half
    LOAD_DIFF_4x8P 4
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq  [spill], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq  [trans+0x40], m0
    movq  [trans+0x48], m1
    movq  [trans+0x50], m2
    movq  [trans+0x58], m3
    movq  m7, [spill]
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    movq  m0, [trans+0x00]
    movq  m1, [trans+0x08]
    movq  m2, [trans+0x10]
    movq  m3, [trans+0x18]

    ; Second (row) Hadamard pass on the first transposed half, then sum.
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    SUM4x8_MM
    movq  [trans], m0            ; stash first half's word sums

    movq  m0, [trans+0x20]
    movq  m1, [trans+0x28]
    movq  m2, [trans+0x30]
    movq  m3, [trans+0x38]
    movq  m4, [trans+0x40]
    movq  m5, [trans+0x48]
    movq  m6, [trans+0x50]
    movq  m7, [trans+0x58]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    SUM4x8_MM

    pavgw m0, [trans]            ; average the two half sums
    add   esp, 0x7c              ; 0x74 locals + the two 4-byte pushes
    ret
%undef args
%undef spill
%undef trans

;-----------------------------------------------------------------------------
; SUM_MM_X3 sum1, sum2, sum3, tmp1, tmp2, tmp3, zero, op
; Horizontally reduce three registers of packed word sums (%1..%3) toward
; a single scalar each: fold high half into low with saturating adds
; (paddusw clamps instead of wrapping on overflow), widen the low words
; to dwords against %7 (zeroed here), then combine the final pair with
; the caller-chosen op %8 (e.g. pavgw or paddw).
;-----------------------------------------------------------------------------
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pxor        %7, %7
    pshufw      %4, %1, q1032
    pshufw      %5, %2, q1032
    pshufw      %6, %3, q1032
    paddusw     %1, %4
    paddusw     %2, %5
    paddusw     %3, %6
    punpcklwd   %1, %7          ; words -> dwords, zero-extended
    punpcklwd   %2, %7
    punpcklwd   %3, %7
    pshufw      %4, %1, q1032
    pshufw      %5, %2, q1032
    pshufw      %6, %3, q1032
    %8          %1, %4
    %8          %2, %5
    %8          %3, %6
%endmacro

;-----------------------------------------------------------------------------
; LOAD_4x8P dx
; Load 8 rows of 4 fenc pixels (fixed FENC_STRIDE) starting at byte
; offset dx, zero-extended to packed words in m0..m7.
; Row 7 is loaded first into m6 and parked in [spill], then row 6 reuses
; m6, because all eight mmregs are needed and m7 doubles as the zero reg.
;-----------------------------------------------------------------------------
%macro LOAD_4x8P 1 ; dx
    pxor      m7, m7
    movd      m6, [r0+%1+7*FENC_STRIDE]
    movd      m0, [r0+%1+0*FENC_STRIDE]
    movd      m1, [r0+%1+1*FENC_STRIDE]
    movd      m2, [r0+%1+2*FENC_STRIDE]
    movd      m3, [r0+%1+3*FENC_STRIDE]
    movd      m4, [r0+%1+4*FENC_STRIDE]
    movd      m5, [r0+%1+5*FENC_STRIDE]
    punpcklbw m6, m7
    punpcklbw m0, m7
    punpcklbw m1, m7
    movq      [spill], m6       ; park row 7; m6 is reused for row 6
    punpcklbw m2, m7
    punpcklbw m3, m7
    movd      m6, [r0+%1+6*FENC_STRIDE]
    punpcklbw m4, m7
    punpcklbw m5, m7
    punpcklbw m6, m7
    movq      m7, [spill]       ; m7 = row 7 (zero reg no longer needed)
%endmacro

;-----------------------------------------------------------------------------
; HSUMSUB2 a, b, shuf, sign
; One butterfly stage of a horizontal sum/difference over two registers:
; each register is combined with a 'shuf'-shuffled copy, with the packed
; ±1 word pattern in 'sign' selecting add vs subtract per lane.
; NOTE(review): asymmetric on purpose? — the sign mask multiplies %1
; itself but only the shuffled copy of %2, so the two results carry
; opposite sign conventions; appears compensated downstream — confirm
; before reusing this macro elsewhere.
;-----------------------------------------------------------------------------
%macro HSUMSUB2 4
    pshufw m4, %1, %3
    pshufw m5, %2, %3
    pmullw %1, %4
    pmullw m5, %4
    paddw  %1, m4
    paddw  %2, m5
%endmacro

;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
; Computes SA8D costs of three 8x8 intra prediction modes against fenc in
; one pass, writing them to res (per the store comments below: res[0]=v,
; res[1]=h, res[2]=dc).
; NOTE(review): edge is read at [r1+7] and [r1+16] — presumably the left
; and top neighbour pixels of the standard x264 edge layout; confirm.
; Strategy: Hadamard-transform the neighbour pixels once up front (stored
; at [edge]), transform fenc with the same two-pass HADAMARD8_V +
; transpose scheme as sa8d, then each mode's cost is the fenc SATD with
; the appropriate DC/edge term substituted before taking absolute values.
cglobal intra_sa8d_x3_8x8, 2,3
    SUB  esp, 0x94
%define edge  esp+0x70 ; +32
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
%define sum   esp+0    ; +32  (aliases trans; trans is dead by then)

    ; Hadamard-transform the 8 left and 8 top neighbour pixels.
    pxor      m7, m7
    movq      m0, [r1+7]
    movq      m2, [r1+16]
    movq      m1, m0
    movq      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    movq      m6, [pw_ppmmppmm]
    HSUMSUB2  m0, m2, q1032, m6
    HSUMSUB2  m1, m3, q1032, m6
    movq      m6, [pw_pmpmpmpm]
    HSUMSUB2  m0, m2, q2301, m6
    HSUMSUB2  m1, m3, q2301, m6
    movq      m4, m0
    movq      m5, m2
    paddw     m0, m1
    paddw     m2, m3
    psubw     m4, m1
    psubw     m3, m5
    movq      [edge+0], m0
    movq      [edge+8], m4
    movq      [edge+16], m2
    movq      [edge+24], m3

    ; First (column) Hadamard pass over fenc, left 4 columns.
    LOAD_4x8P 0
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq  [spill], m0
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq  [trans+0x00], m4
    movq  [trans+0x08], m5
    movq  [trans+0x10], m6
    movq  [trans+0x18], m7
    movq  m0, [spill]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq  [trans+0x20], m0
    movq  [trans+0x28], m1
    movq  [trans+0x30], m2
    movq  [trans+0x38], m3

    ; Right 4 columns.
    LOAD_4x8P 4
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq  [spill], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq  [trans+0x40], m0
    movq  [trans+0x48], m1
    movq  [trans+0x50], m2
    movq  [trans+0x58], m3
    movq  m7, [spill]
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq  m0, [trans+0x00]
    movq  m1, [trans+0x08]
    movq  m2, [trans+0x10]
    movq  m3, [trans+0x18]

    ; Second (row) pass, first transposed half.
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    ; Sum |coeffs| of rows 2..7 (m0 = DC row, m1 handled separately).
    movq  [spill+0], m0
    movq  [spill+8], m1
    ABSW2 m2, m3, m2, m3, m0, m1
    ABSW2 m4, m5, m4, m5, m0, m1
    paddw m2, m4
    paddw m3, m5
    ABSW2 m6, m7, m6, m7, m4, m5
    movq  m0, [spill+0]
    movq  m1, [spill+8]
    paddw m2, m6
    paddw m3, m7
    paddw m2, m3
    ABSW  m1, m1, m4
    paddw m2, m1 ; 7x4 sum
    movq  m7, m0
    movq  m1, [edge+8] ; left bottom
    psllw m1, 3                  ; scale edge sums to match transform gain
    psubw m7, m1
    ABSW2 m0, m7, m0, m7, m5, m3
    paddw m0, m2
    paddw m7, m2
    movq  [sum+0], m0 ; dc
    movq  [sum+8], m7 ; left

    movq  m0, [trans+0x20]
    movq  m1, [trans+0x28]
    movq  m2, [trans+0x30]
    movq  m3, [trans+0x38]
    movq  m4, [trans+0x40]
    movq  m5, [trans+0x48]
    movq  m6, [trans+0x50]
    movq  m7, [trans+0x58]

    ; Second pass, second transposed half.
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    ; Keep each register's first word (DC column coeffs) for the V mode.
    movd  [sum+0x10], m0
    movd  [sum+0x12], m1
    movd  [sum+0x14], m2
    movd  [sum+0x16], m3
    movd  [sum+0x18], m4
    movd  [sum+0x1a], m5
    movd  [sum+0x1c], m6
    movd  [sum+0x1e], m7

    movq  [spill], m0
    movq  [spill+8], m1
    ABSW2 m2, m3, m2, m3, m0, m1
    ABSW2 m4, m5, m4, m5, m0, m1
    paddw m2, m4
    paddw m3, m5
    paddw m2, m3
    movq  m0, [spill]
    movq  m1, [spill+8]
    ABSW2 m6, m7, m6, m7, m4, m5
    ABSW  m1, m1, m3
    paddw m2, m7
    paddw m1, m6
    paddw m2, m1 ; 7x4 sum
    movq  m1, m0

    movq  m7, [edge+0]
    psllw m7, 3 ; left top

    ; DC prediction term: ((left_sum + top_sum)*4 + 32) rounded to a
    ; multiple of 64 (the dword load picks up the first two word sums).
    mov   r2, [edge+0]
    add   r2, [edge+16]
    lea   r2, [4*r2+32]
    and   r2, 0xffc0
    movd  m6, r2 ; dc

    psubw m1, m7
    psubw m0, m6
    ABSW2 m0, m1, m0, m1, m5, m6
    movq  m3, [sum+0] ; dc
    paddw m0, m2
    paddw m1, m2
    movq  m2, m0
    paddw m0, m3
    paddw m1, [sum+8] ; h
    psrlq m2, 16
    paddw m2, m3

    movq  m3, [edge+16] ; top left
    movq  m4, [edge+24] ; top right
    psllw m3, 3
    psllw m4, 3
    psubw m3, [sum+16]
    psubw m4, [sum+24]
    ABSW2 m3, m4, m3, m4, m5, m6
    paddw m2, m3
    paddw m2, m4 ; v

    ; Reduce the three mode sums and store res = {v, h, dc}.
    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
    mov   r2, r2m
    pxor  m7, m7
    punpckldq m2, m1
    pavgw m0, m7
    pavgw m2, m7
    movd  [r2+8], m0 ; dc
    movq  [r2+0], m2 ; v, h
    ADD   esp, 0x94
    RET
%undef edge
%undef spill
%undef trans
%undef sum


;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
; For each of two horizontally adjacent 4x4 blocks, accumulates the SSIM
; partial sums:
;   sums[i] = { sum(pix1), sum(pix2), sum(pix1^2)+sum(pix2^2), sum(pix1*pix2) }
; The loop runs with r4 = 4 then 0, so block 1 (x offset 4, sums[1]) is
; written before block 0.  m0 stays zero throughout (byte->word widening).
cglobal pixel_ssim_4x4x2_core, 0,5
    mov   r1, r1m
    mov   r3, r3m
    mov   r4, 4                  ; r4 = current block's x offset (4, then 0)
    pxor  m0, m0
.loop:
    mov   r0, r0m                ; re-derive block pointers each iteration;
    mov   r2, r2m                ; r0/r2 are advanced by stride inside %rep
    add   r0, r4
    add   r2, r4
    pxor  m1, m1                 ; m1 = sum(pix1)      (words)
    pxor  m2, m2                 ; m2 = sum(pix2)      (words)
    pxor  m3, m3                 ; m3 = sum(p1^2+p2^2) (dwords)
    pxor  m4, m4                 ; m4 = sum(p1*p2)     (dwords)
%rep 4
    movd      m5, [r0]
    movd      m6, [r2]
    punpcklbw m5, m0
    punpcklbw m6, m0
    paddw     m1, m5
    paddw     m2, m6
    movq      m7, m5
    pmaddwd   m5, m5             ; pix1^2 pairs
    pmaddwd   m7, m6             ; pix1*pix2 pairs
    pmaddwd   m6, m6             ; pix2^2 pairs
    paddd     m3, m5
    paddd     m4, m7
    paddd     m3, m6             ; fold both squares into one accumulator
    add       r0, r1
    add       r2, r3
%endrep
    mov   r0, r4m
    lea   r0, [r0+r4*4]          ; r0 = &sums[r4/4] (16 bytes per entry)
    ; Horizontal reduction: fold high halves into low halves.
    pshufw m5, m1, q0032
    pshufw m6, m2, q0032
    paddusw m1, m5               ; saturating, but 4x4x255 fits in a word
    paddusw m2, m6
    punpcklwd m1, m2             ; interleave s1/s2 partials
    pshufw m2, m1, q0032
    pshufw m5, m3, q0032
    pshufw m6, m4, q0032
    paddusw m1, m2
    paddd  m3, m5
    paddd  m4, m6
    punpcklwd m1, m0             ; {s1, s2} widened to dwords
    punpckldq m3, m4             ; {ss, s12}
    movq  [r0+0], m1
    movq  [r0+8], m3
    sub   r4, 4
    jge   .loop
    emms                         ; leave x87/MMX state clean for the caller
    RET
