;*****************************************************************************
;* pixel-32.asm: x86_32 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"
cextern pw_ppmmppmm ; dw  1, 1,-1,-1, 1, 1,-1,-1
cextern pw_pmpmpmpm ; dw  1,-1, 1,-1, 1,-1, 1,-1

SECTION .text
INIT_MMX mmx2

; Load 8 rows x 4 columns of (fenc - ref) differences, starting at column
; offset %1, into m0-m7.  r1/r3 are the strides; the caller must have
; r4 = 3*r1 and r5 = 3*r3.  Advances r0/r2 by four rows and uses [spill]
; to free a temporary while the last row is loaded.
%macro LOAD_DIFF_4x8P 1 ; dx
    LOAD_DIFF  m0, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF  m1, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF  m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    LOAD_DIFF  m3, m6, none, [r0+%1+r4],   [r2+%1+r5]
    lea        r0, [r0+4*r1]
    lea        r2, [r2+4*r3]
    LOAD_DIFF  m4, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF  m5, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF  m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    movq       [spill], m5
    LOAD_DIFF  m7, m5, none, [r0+%1+r4],   [r2+%1+r5]
    movq       m5, [spill]
%endmacro

; Sum the absolute values of the eight word vectors m0-m7 into m0 (four word
; accumulators), spilling m6/m7 so they can double as ABSW2 temporaries.
%macro SUM4x8_MM 0
    movq  [spill],   m6
    movq  [spill+8], m7
    ABSW2 m0, m1, m0, m1, m6, m7
    ABSW2 m2, m3, m2, m3, m6, m7
    paddw m0, m2
    paddw m1, m3
    movq  m6, [spill]
    movq  m7, [spill+8]
    ABSW2 m4, m5, m4, m5, m2, m3
    ABSW2 m6, m7, m6, m7, m2, m3
    paddw m4, m6
    paddw m5, m7
    paddw m0, m4
    paddw m1, m5
    paddw m0, m1
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
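; SA8D is the sum of absolute values of the 2-D 8x8 Hadamard transform of the
; difference block (fenc - ref).  With 64-bit MMX registers the block is
; handled as two 4-column halves: each half gets the vertical transform, is
; transposed through the [trans] scratch area, then the horizontal transform
; and absolute-value sum are done per 8x4 group and the two partial sums are
; averaged with pavgw.  Final horizontal reduction and scaling of the word
; sums left in m0 are up to the caller of this _internal routine.
;
; Rough scalar model (a sketch for orientation only; hadamard_8x8 is just a
; placeholder name for the unnormalized 2-D transform, and the final scaling
; is applied by the caller):
;
;   int16_t d[8][8];
;   for (int y = 0; y < 8; y++)
;       for (int x = 0; x < 8; x++)
;           d[y][x] = pix1[y*i_pix1 + x] - pix2[y*i_pix2 + x];
;   hadamard_8x8(d);
;   int sum = 0;
;   for (int y = 0; y < 8; y++)
;       for (int x = 0; x < 8; x++)
;           sum += abs(d[y][x]);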
cglobal pixel_sa8d_8x8_internal
    push r0
    push r2
    sub  esp, 0x74
%define args  esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
    ; vertical transform of columns 0-3, transposed into [trans]
    LOAD_DIFF_4x8P 0
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq [spill], m1
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    movq [trans+0x00], m4
    movq [trans+0x08], m5
    movq [trans+0x10], m6
    movq [trans+0x18], m7
    movq m1, [spill]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq [trans+0x20], m0
    movq [trans+0x28], m1
    movq [trans+0x30], m2
    movq [trans+0x38], m3

    ; restore the original pixel pointers and do columns 4-7
    mov r0, [args+4]
    mov r2, [args]
    LOAD_DIFF_4x8P 4
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq [spill], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq [trans+0x40], m0
    movq [trans+0x48], m1
    movq [trans+0x50], m2
    movq [trans+0x58], m3
    movq m7, [spill]
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    movq m0, [trans+0x00]
    movq m1, [trans+0x08]
    movq m2, [trans+0x10]
    movq m3, [trans+0x18]

    ; horizontal transform + absolute-value sum, first 8x4 group
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    SUM4x8_MM
    movq [trans], m0

    movq m0, [trans+0x20]
    movq m1, [trans+0x28]
    movq m2, [trans+0x30]
    movq m3, [trans+0x38]
    movq m4, [trans+0x40]
    movq m5, [trans+0x48]
    movq m6, [trans+0x50]
    movq m7, [trans+0x58]

    ; horizontal transform + absolute-value sum, second 8x4 group
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    SUM4x8_MM

    pavgw m0, [trans] ; average the two partial sums
    add   esp, 0x7c
    ret
%undef args
%undef spill
%undef trans

; Horizontally reduce three registers of four words each down to one value in
; the low dword, using %8 (pavgw in this file) for the final combining step.
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pxor      %7, %7
    pshufw    %4, %1, q1032
    pshufw    %5, %2, q1032
    pshufw    %6, %3, q1032
    paddusw   %1, %4
    paddusw   %2, %5
    paddusw   %3, %6
    punpcklwd %1, %7
    punpcklwd %2, %7
    punpcklwd %3, %7
    pshufw    %4, %1, q1032
    pshufw    %5, %2, q1032
    pshufw    %6, %3, q1032
    %8        %1, %4
    %8        %2, %5
    %8        %3, %6
%endmacro

; Load 8 rows x 4 columns of fenc (stride FENC_STRIDE) at column offset %1,
; zero-extended to words, into m0-m7.
%macro LOAD_4x8P 1 ; dx
    pxor      m7, m7
    movd      m6, [r0+%1+7*FENC_STRIDE]
    movd      m0, [r0+%1+0*FENC_STRIDE]
    movd      m1, [r0+%1+1*FENC_STRIDE]
    movd      m2, [r0+%1+2*FENC_STRIDE]
    movd      m3, [r0+%1+3*FENC_STRIDE]
    movd      m4, [r0+%1+4*FENC_STRIDE]
    movd      m5, [r0+%1+5*FENC_STRIDE]
    punpcklbw m6, m7
    punpcklbw m0, m7
    punpcklbw m1, m7
    movq      [spill], m6 ; park row 7 while m6 is reused for row 6
    punpcklbw m2, m7
    punpcklbw m3, m7
    movd      m6, [r0+%1+6*FENC_STRIDE]
    punpcklbw m4, m7
    punpcklbw m5, m7
    punpcklbw m6, m7
    movq      m7, [spill] ; m7 = row 7 (the zero register is no longer needed)
%endmacro

; One butterfly stage of a horizontal Hadamard transform: word pairs selected
; by the shuffle constant %3 are combined into sums and differences according
; to the +/-1 pattern in %4, in place in %1 and %2 (m4/m5 are clobbered).
%macro HSUMSUB2 4
    pshufw m4, %1, %3
    pshufw m5, %2, %3
    pmullw %1, %4
    pmullw m5, %4
    paddw  %1, m4
    paddw  %2, m5
%endmacro

;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
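; Computes the SA8D cost of the V, H and DC intra prediction modes against
; fenc in one pass, writing res[0..2] = { cost_V, cost_H, cost_DC }.
; The trick: fenc itself is Hadamard-transformed once; since the three flat
; predictions only have energy in the first transformed row (V), the first
; transformed column (H) or the DC coefficient (DC), each cost is obtained by
; correcting just those coefficients with the 1-D Hadamard transform of the
; edge pixels (scaled by 8) instead of transforming three whole difference
; blocks.  The eight left neighbours are read from edge+7 and the eight top
; neighbours from edge+16 (x264's filtered prediction-edge layout).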
cglobal intra_sa8d_x3_8x8, 2,3
    SUB  esp, 0x94
%define edge  esp+0x70 ; +32
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
%define sum   esp+0    ; +32

    ; 1-D Hadamard-transform the eight left neighbours (edge+7) and the eight
    ; top neighbours (edge+16); the results stay in [edge+0..31]
    pxor      m7, m7
    movq      m0, [r1+7]
    movq      m2, [r1+16]
    movq      m1, m0
    movq      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    movq      m6, [pw_ppmmppmm]
    HSUMSUB2  m0, m2, q1032, m6
    HSUMSUB2  m1, m3, q1032, m6
    movq      m6, [pw_pmpmpmpm]
    HSUMSUB2  m0, m2, q2301, m6
    HSUMSUB2  m1, m3, q2301, m6
    movq      m4, m0
    movq      m5, m2
    paddw     m0, m1
    paddw     m2, m3
    psubw     m4, m1
    psubw     m3, m5
    movq      [edge+0],  m0
    movq      [edge+8],  m4
    movq      [edge+16], m2
    movq      [edge+24], m3

    ; 2-D Hadamard of the source block, as in sa8d: columns 0-3, then 4-7
    LOAD_4x8P 0
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq [spill], m0
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq [trans+0x00], m4
    movq [trans+0x08], m5
    movq [trans+0x10], m6
    movq [trans+0x18], m7
    movq m0, [spill]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq [trans+0x20], m0
    movq [trans+0x28], m1
    movq [trans+0x30], m2
    movq [trans+0x38], m3

    LOAD_4x8P 4
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq [spill], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq [trans+0x40], m0
    movq [trans+0x48], m1
    movq [trans+0x50], m2
    movq [trans+0x58], m3
    movq m7, [spill]
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq m0, [trans+0x00]
    movq m1, [trans+0x08]
    movq m2, [trans+0x10]
    movq m3, [trans+0x18]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq  [spill+0], m0
    movq  [spill+8], m1
    ABSW2 m2, m3, m2, m3, m0, m1
    ABSW2 m4, m5, m4, m5, m0, m1
    paddw m2, m4
    paddw m3, m5
    ABSW2 m6, m7, m6, m7, m4, m5
    movq  m0, [spill+0]
    movq  m1, [spill+8]
    paddw m2, m6
    paddw m3, m7
    paddw m2, m3
    ABSW  m1, m1, m4
    paddw m2, m1 ; 7x4 sum
    movq  m7, m0
    movq  m1, [edge+8] ; left bottom
    psllw m1, 3
    psubw m7, m1
    ABSW2 m0, m7, m0, m7, m5, m3
    paddw m0, m2
    paddw m7, m2
    movq  [sum+0], m0 ; dc
    movq  [sum+8], m7 ; left

    movq m0, [trans+0x20]
    movq m1, [trans+0x28]
    movq m2, [trans+0x30]
    movq m3, [trans+0x38]
    movq m4, [trans+0x40]
    movq m5, [trans+0x48]
    movq m6, [trans+0x50]
    movq m7, [trans+0x58]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movd [sum+0x10], m0
    movd [sum+0x12], m1
    movd [sum+0x14], m2
    movd [sum+0x16], m3
    movd [sum+0x18], m4
    movd [sum+0x1a], m5
    movd [sum+0x1c], m6
    movd [sum+0x1e], m7

    movq  [spill],   m0
    movq  [spill+8], m1
    ABSW2 m2, m3, m2, m3, m0, m1
    ABSW2 m4, m5, m4, m5, m0, m1
    paddw m2, m4
    paddw m3, m5
    paddw m2, m3
    movq  m0, [spill]
    movq  m1, [spill+8]
    ABSW2 m6, m7, m6, m7, m4, m5
    ABSW  m1, m1, m3
    paddw m2, m7
    paddw m1, m6
    paddw m2, m1 ; 7x4 sum
    movq  m1, m0

    movq  m7, [edge+0]
    psllw m7, 3 ; left top

    ; DC coefficient of the DC prediction: 64 * ((sum_left + sum_top + 8) >> 4)
    mov r2, [edge+0]
    add r2, [edge+16]
    lea r2, [4*r2+32]
    and r2, 0xffc0
    movd m6, r2 ; dc

    psubw m1, m7
    psubw m0, m6
    ABSW2 m0, m1, m0, m1, m5, m6
    movq  m3, [sum+0] ; dc
    paddw m0, m2
    paddw m1, m2
    movq  m2, m0
    paddw m0, m3
    paddw m1, [sum+8] ; h
    psrlq m2, 16
    paddw m2, m3

    movq  m3, [edge+16] ; top left
    movq  m4, [edge+24] ; top right
    psllw m3, 3
    psllw m4, 3
    psubw m3, [sum+16]
    psubw m4, [sum+24]
    ABSW2 m3, m4, m3, m4, m5, m6
    paddw m2, m3
    paddw m2, m4 ; v

    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
    mov   r2, r2m
    pxor  m7, m7
    punpckldq m2, m1
    pavgw m0, m7
    pavgw m2, m7
    movd  [r2+8], m0 ; dc
    movq  [r2+0], m2 ; v, h
    ADD esp, 0x94
    RET
%undef edge
%undef spill
%undef trans
%undef sum



;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
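; For each of two horizontally adjacent 4x4 blocks this accumulates the four
; sums that the SSIM formula needs: s1 = sum(pix1), s2 = sum(pix2),
; ss = sum(pix1^2 + pix2^2), s12 = sum(pix1*pix2), stored as
; sums[z][0..3] = { s1, s2, ss, s12 }.  Roughly the scalar equivalent
; (a sketch, not part of the build):
;
;   for (int z = 0; z < 2; z++, pix1 += 4, pix2 += 4) {
;       int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;       for (int y = 0; y < 4; y++)
;           for (int x = 0; x < 4; x++) {
;               int a = pix1[x + y*stride1];
;               int b = pix2[x + y*stride2];
;               s1 += a;  s2 += b;  ss += a*a + b*b;  s12 += a*b;
;           }
;       sums[z][0] = s1; sums[z][1] = s2; sums[z][2] = ss; sums[z][3] = s12;
;   }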
cglobal pixel_ssim_4x4x2_core, 0,5
    mov  r1, r1m
    mov  r3, r3m
    mov  r4, 4 ; block offset: right 4x4 block first, then the left one
    pxor m0, m0
.loop:
    mov  r0, r0m
    mov  r2, r2m
    add  r0, r4
    add  r2, r4
    pxor m1, m1 ; m1 = sum pix1
    pxor m2, m2 ; m2 = sum pix2
    pxor m3, m3 ; m3 = sum pix1^2 + pix2^2
    pxor m4, m4 ; m4 = sum pix1*pix2
%rep 4
    movd      m5, [r0]
    movd      m6, [r2]
    punpcklbw m5, m0
    punpcklbw m6, m0
    paddw     m1, m5
    paddw     m2, m6
    movq      m7, m5
    pmaddwd   m5, m5
    pmaddwd   m7, m6
    pmaddwd   m6, m6
    paddd     m3, m5
    paddd     m4, m7
    paddd     m3, m6
    add       r0, r1
    add       r2, r3
%endrep
    ; horizontal reduction; store sums[z][0..3] = { s1, s2, ss, s12 }
    mov  r0, r4m
    lea  r0, [r0+r4*4]
    pshufw    m5, m1, q0032
    pshufw    m6, m2, q0032
    paddusw   m1, m5
    paddusw   m2, m6
    punpcklwd m1, m2
    pshufw    m2, m1, q0032
    pshufw    m5, m3, q0032
    pshufw    m6, m4, q0032
    paddusw   m1, m2
    paddd     m3, m5
    paddd     m4, m6
    punpcklwd m1, m0
    punpckldq m3, m4
    movq [r0+0], m1
    movq [r0+8], m3
    sub  r4, 4
    jge .loop
    emms
    RET