1 ;*****************************************************************************
2 ;* pixel-32.asm: x86_32 pixel metrics
3 ;*****************************************************************************
4 ;* Copyright (C) 2003-2013 x264 project
6 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
7 ;* Laurent Aimar <fenrir@via.ecp.fr>
9 ;* This program is free software; you can redistribute it and/or modify
10 ;* it under the terms of the GNU General Public License as published by
11 ;* the Free Software Foundation; either version 2 of the License, or
12 ;* (at your option) any later version.
14 ;* This program is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;* GNU General Public License for more details.
19 ;* You should have received a copy of the GNU General Public License
20 ;* along with this program; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 ;* This program is also available under a commercial proprietary license.
24 ;* For more information, contact us at license @ x265.com.
25 ;*****************************************************************************
28 %include "x86util.asm"
36 %macro LOAD_DIFF_4x8P 1 ; dx
; Load eight rows of pixel differences (fenc - fref, widened to words) into
; m0..m7 at horizontal byte offset %1 (dx).
;   r0/r2 = source/reference row pointers, r1/r3 = their strides,
;   r4/r5 = presumably 3*stride for each source -- TODO confirm (their setup
;   is not visible in this extract).
; LOAD_DIFF is defined in x86util.asm; per its call shape here it writes the
; difference of the two memory operands into the first register, using the
; second as scratch ("none" = no preloaded zero) -- confirm against x86util.asm.
; NOTE(review): original file lines 41-42 and 46 are elided in this extract;
; they presumably advance r0/r2 past the first four rows -- verify in the
; full source before editing.
37 LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1]
38 LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3]
39 LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
40 LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5]
; Second group of four rows (same addressing pattern after the elided
; pointer advance).
43 LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1]
44 LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3]
45 LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
; m5 is reused as the scratch register here because m6 already holds row 6.
47 LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5]
; --- Fragment of a sibling macro (its %macro/%endmacro lines are elided) ---
; ABSW2 (x86util.asm) appears to take |abs| of two packed-word registers at
; once, with the last two operands as temporaries -- TODO confirm.
54 ABSW2 m0, m1, m0, m1, m6, m7
55 ABSW2 m2, m3, m2, m3, m6, m7
; NOTE(review): lines 56-59 elided; m2/m3 are free to serve as temps below,
; implying their sums were folded elsewhere in the missing lines.
60 ABSW2 m4, m5, m4, m5, m2, m3
61 ABSW2 m6, m7, m6, m7, m2, m3
69 ;-----------------------------------------------------------------------------
70 ; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
71 ;-----------------------------------------------------------------------------
; Internal body of the 8x8 SA8D metric (sum of absolute values of the 8x8
; Hadamard transform of the pixel differences).  MMX path: each 8x16-bit row
; is split across two mm registers, so the transform is done in two 4-column
; halves spilled through a stack scratch buffer.
; NOTE(review): large spans of this routine are elided in this extract
; (stores to [trans], ABSW/summation, and the return sequence); only the
; transform skeleton is visible.
72 cglobal pixel_sa8d_8x8_internal
; Stack scratch layout (caller presumably reserves >= 0x70 bytes -- the
; prologue is not visible here, confirm in full source):
77 %define spill esp+0x60 ; +16
78 %define trans esp+0 ; +96
; Vertical 8-point Hadamard across m0..m7 (macro from x86util.asm).
80 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
; Transpose 4x4 word sub-blocks; last argument is the temp register
; (TRANSPOSE4x4W is defined in x86util.asm -- confirm operand roles there).
83 TRANSPOSE4x4W 4, 5, 6, 7, 1
89 TRANSPOSE4x4W 0, 1, 2, 3, 4
; Second (horizontal, post-transpose) Hadamard pass on the first half.
98 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
101 TRANSPOSE4x4W 0, 1, 2, 3, 7
; Spill the transposed quarter to the scratch buffer at +0x40..0x58.
102 movq [trans+0x40], m0
103 movq [trans+0x48], m1
104 movq [trans+0x50], m2
105 movq [trans+0x58], m3
107 TRANSPOSE4x4W 4, 5, 6, 7, 1
; Reload the quarter previously stored at +0x00..0x18 (store elided here).
108 movq m0, [trans+0x00]
109 movq m1, [trans+0x08]
110 movq m2, [trans+0x10]
111 movq m3, [trans+0x18]
113 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
; Final quarter: reload both halves from scratch and transform.
117 movq m0, [trans+0x20]
118 movq m1, [trans+0x28]
119 movq m2, [trans+0x30]
120 movq m3, [trans+0x38]
121 movq m4, [trans+0x40]
122 movq m5, [trans+0x48]
123 movq m6, [trans+0x50]
124 movq m7, [trans+0x58]
126 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
; Reduce three mm accumulators to scalar sums in one pass.
; Args (8): three sum registers, four temporaries, and the combining op
; (the only visible call site passes pavgw -- see intra_sa8d_x3_8x8).
; NOTE(review): the macro body is elided in this extract; semantics above
; are inferred from the argument comment only -- confirm in full source.
136 %macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
; Load eight rows of a 4-pixel-wide fenc column (byte offset %1 = dx) into
; mm registers.  fenc uses the fixed compile-time FENC_STRIDE pitch, so no
; stride register is needed.
; NOTE(review): the interleaving/unpack instructions between these loads
; (original lines 164-169 and after 170) are elided in this extract, which
; is presumably why row 7 is loaded first and m6 is reloaded with row 6 at
; the end -- verify against the full source before editing.
155 %macro LOAD_4x8P 1 ; dx
157 movd m6, [r0+%1+7*FENC_STRIDE]
158 movd m0, [r0+%1+0*FENC_STRIDE]
159 movd m1, [r0+%1+1*FENC_STRIDE]
160 movd m2, [r0+%1+2*FENC_STRIDE]
161 movd m3, [r0+%1+3*FENC_STRIDE]
162 movd m4, [r0+%1+4*FENC_STRIDE]
163 movd m5, [r0+%1+5*FENC_STRIDE]
170 movd m6, [r0+%1+6*FENC_STRIDE]
186 ;-----------------------------------------------------------------------------
187 ; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
188 ;-----------------------------------------------------------------------------
; Compute the SA8D cost of the 8x8 fenc block against three intra
; predictions at once (the res stores and the inline "dc"/"left"/"v, h"
; comments indicate V, H and DC modes), writing the three costs to res.
; NOTE(review): many spans of this routine are elided in this extract
; (prologue/stack setup, edge loads, several summation sequences, and the
; epilogue); only the transform/reduction skeleton is visible.
189 cglobal intra_sa8d_x3_8x8, 2,3
; Stack scratch layout -- trans and sum alias the same bytes, so sum is only
; valid once the corresponding trans data has been consumed:
191 %define edge esp+0x70 ; +32
192 %define spill esp+0x60 ; +16
193 %define trans esp+0 ; +96
194 %define sum esp+0 ; +32
; Butterfly passes using sign-mask constants (pw_ppmmppmm / pw_pmpmpmpm are
; defined elsewhere in the project; HSUMSUB2 is an x86util.asm macro --
; presumably a shuffled add/sub pair, TODO confirm).
205 movq m6, [pw_ppmmppmm]
206 HSUMSUB2 m0, m2, q1032, m6
207 HSUMSUB2 m1, m3, q1032, m6
208 movq m6, [pw_pmpmpmpm]
209 HSUMSUB2 m0, m2, q2301, m6
210 HSUMSUB2 m1, m3, q2301, m6
; Vertical 8-point Hadamard across m0..m7 (x86util.asm macro).
223 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
; Transpose 4x4 word sub-blocks (last arg = temp) and spill to scratch.
226 TRANSPOSE4x4W 4, 5, 6, 7, 0
227 movq [trans+0x00], m4
228 movq [trans+0x08], m5
229 movq [trans+0x10], m6
230 movq [trans+0x18], m7
232 TRANSPOSE4x4W 0, 1, 2, 3, 4
233 movq [trans+0x20], m0
234 movq [trans+0x28], m1
235 movq [trans+0x30], m2
236 movq [trans+0x38], m3
; Second (post-transpose) Hadamard pass, first half.
239 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
242 TRANSPOSE4x4W 0, 1, 2, 3, 7
243 movq [trans+0x40], m0
244 movq [trans+0x48], m1
245 movq [trans+0x50], m2
246 movq [trans+0x58], m3
248 TRANSPOSE4x4W 4, 5, 6, 7, 0
249 movq m0, [trans+0x00]
250 movq m1, [trans+0x08]
251 movq m2, [trans+0x10]
252 movq m3, [trans+0x18]
254 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
; Absolute values of the transform coefficients (ABSW2 from x86util.asm:
; |abs| of two packed-word regs, last two operands as temps -- TODO confirm).
258 ABSW2 m2, m3, m2, m3, m0, m1
259 ABSW2 m4, m5, m4, m5, m0, m1
262 ABSW2 m6, m7, m6, m7, m4, m5
269 paddw m2, m1 ; 7x4 sum
; Fold in the left-edge bottom half (edge layout per the signature:
; uint8_t edge[36]; offsets 8/16/24 are used as left-bottom/top-left/
; top-right below).
271 movq m1, [edge+8] ; left bottom
274 ABSW2 m0, m7, m0, m7, m5, m3
; Stash partial DC/left costs; sum aliases trans, so the data stored there
; earlier must already be consumed at this point.
277 movq [sum+0], m0 ; dc
278 movq [sum+8], m7 ; left
; Second half: reload spilled quarters and repeat transform + reduction.
280 movq m0, [trans+0x20]
281 movq m1, [trans+0x28]
282 movq m2, [trans+0x30]
283 movq m3, [trans+0x38]
284 movq m4, [trans+0x40]
285 movq m5, [trans+0x48]
286 movq m6, [trans+0x50]
287 movq m7, [trans+0x58]
289 HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
302 ABSW2 m2, m3, m2, m3, m0, m1
303 ABSW2 m4, m5, m4, m5, m0, m1
309 ABSW2 m6, m7, m6, m7, m4, m5
313 paddw m2, m1 ; 7x4 sum
; Scale the left-top term by 8 (<<3) -- presumably the DC/edge weighting of
; the sa8d intra costs; confirm against the elided surrounding lines.
317 psllw m7, 3 ; left top
327 ABSW2 m0, m1, m0, m1, m5, m6
328 movq m3, [sum+0] ; dc
333 paddw m1, [sum+8] ; h
; Top-edge contributions for the V prediction.
337 movq m3, [edge+16] ; top left
338 movq m4, [edge+24] ; top right
343 ABSW2 m3, m4, m3, m4, m5, m6
; Horizontal reduction of the three mode costs; pavgw halves while summing
; (SUM_MM_X3 body is elided -- see its definition for the exact reduction).
347 SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
; Store the V and H results to res (r2); the DC store is elided here.
354 movq [r2+0], m2 ; v, h
364 ;-----------------------------------------------------------------------------
365 ; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
366 ; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
367 ;-----------------------------------------------------------------------------
368 cglobal pixel_ssim_4x4x2_core, 0,5