Commit | Line | Data |
---|---|---|
72b9787e JB |
1 | ;***************************************************************************** |
2 | ;* Copyright (C) 2013 x265 project | |
3 | ;* | |
4 | ;* Authors: Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com> | |
5 | ;* | |
6 | ;* This program is free software; you can redistribute it and/or modify | |
7 | ;* it under the terms of the GNU General Public License as published by | |
8 | ;* the Free Software Foundation; either version 2 of the License, or | |
9 | ;* (at your option) any later version. | |
10 | ;* | |
11 | ;* This program is distributed in the hope that it will be useful, | |
12 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | ;* GNU General Public License for more details. | |
15 | ;* | |
16 | ;* You should have received a copy of the GNU General Public License | |
17 | ;* along with this program; if not, write to the Free Software | |
18 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. | |
19 | ;* | |
20 | ;* This program is also available under a commercial proprietary license. | |
21 | ;* For more information, contact us at license @ x265.com. | |
22 | ;*****************************************************************************/ | |
23 | ||
24 | %include "x86inc.asm" | |
25 | %include "x86util.asm" | |
26 | ||
27 | SECTION_RODATA 32 |
28 | ||
; Read-only byte tables (pshufb masks and index lists) for the routines below.
; pb_0_8 / pb_unpackbw1: applied with pshufb to 4- or 8-byte loads in
; intra_pred_ang4_10 and intra_pred_ang4_26; source byte 8 is zero after such
; a load, so each (index, 8) pair yields one zero-extended word.
29 | pb_0_8 times 8 db 0, 8 |
30 | pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8 |
31 | pb_swap8: times 2 db 7, 6, 5, 4, 3, 2, 1, 0 |
; c_trans_4x4: pshufb mask that transposes a 4x4 byte block held in one
; register (used by the shared tail .do_filter4x4 in intra_pred_ang4_3).
32 | c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 |
33 | tab_Si: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 |
34 | pb_fact0: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 |
35 | c_mode32_12_0: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0 |
36 | c_mode32_13_0: db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
37 | c_mode32_13_shuf: db 0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0 |
38 | c_mode32_14_shuf: db 15, 14, 13, 0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15 |
39 | c_mode32_14_0: db 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
40 | c_mode32_15_0: db 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0 |
41 | c_mode32_16_0: db 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0 |
42 | c_mode32_17_0: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0 |
43 | c_mode32_18_0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
44 | c_shuf8_0: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
45 | c_deinterval8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 |
46 | tab_S1: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0 |
47 | pb_unpackbq: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 |
48 | c_mode16_12: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6 |
49 | c_mode16_13: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4 |
50 | c_mode16_14: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2 |
51 | c_mode16_15: db 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2 |
52 | c_mode16_16: db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2 |
53 | c_mode16_17: db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1 |
54 | c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 |
55 | tab_S2: db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0 |
56 | ||
; ang_table: 32 rows of 16 bytes. Row x is the byte pair (32 - x, x) repeated
; 8 times. The ang4_* routines address it as [ang_table + x * 16] and use the
; rows as pmaddubsw weights for a two-tap blend of adjacent reference bytes.
57 | const ang_table |
58 | %assign x 0 |
59 | %rep 32 |
60 | times 8 db (32-x), x |
61 | %assign x x+1 |
62 | %endrep |
63 | ||
64 | SECTION .text | |
65 | ||
66 | cextern pw_8 | |
67 | cextern pw_1024 | |
68 | cextern pb_unpackbd1 | |
69 | cextern multiL | |
70 | cextern multiH | |
71 | cextern multiH2 | |
72 | cextern multiH3 | |
73 | cextern multi_2Row | |
74 | ||
75 | ;----------------------------------------------------------------------------- |
76 | ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) |
; 4x4 DC prediction: fill the block with the rounded mean of 4 left and
; 4 above samples, then (when filter != 0) smooth the first row and column.
; Registers: r0 = dst, r1 = dstStride, r2 = left, r3 = above; the filter flag
; is read from r5m into r4d. dirMode is not read.
77 | ;----------------------------------------------------------------------------- |
78 | INIT_XMM sse4 |
79 | cglobal intra_pred_dc4, 4,6,3 |
80 | mov r4d, r5m |
; step past element 0 of both neighbour arrays; samples are read from index 1
81 | inc r2 |
82 | inc r3 |
83 | pxor m0, m0 |
84 | movd m1, [r2] |
85 | movd m2, [r3] |
86 | punpckldq m1, m2 |
87 | psadbw m1, m0 ; m1 = sum of the 8 neighbour bytes |
88 | ||
; ZF = (filter == 0). It is consumed by `jz .end` after the DC store, so no
; instruction between here and that jump may modify EFLAGS.
89 | test r4d, r4d |
90 | ||
; pmulhrsw by 4096 computes (sum * 4096 + 0x4000) >> 15 = (sum + 4) >> 3
91 | mov r4d, 4096 |
92 | movd m2, r4d |
93 | pmulhrsw m1, m2 ; m1 = (sum + 4) / 8 |
94 | movd r4d, m1 ; r4d = dc_val |
95 | pshufb m1, m0 ; m1 = byte [dc_val ...] (all-zero mask broadcasts byte 0) |
96 | ||
97 | ; store DC 4x4 |
98 | lea r5, [r1 * 3] |
99 | movd [r0], m1 |
100 | movd [r0 + r1], m1 |
101 | movd [r0 + r1 * 2], m1 |
102 | movd [r0 + r5], m1 |
103 | ||
104 | ; do DC filter (skipped when the earlier `test r4d, r4d` set ZF) |
105 | jz .end |
106 | lea r5d, [r4d * 2 + 2] ; r5d = DC * 2 + 2 |
107 | add r4d, r5d ; r4d = DC * 3 + 2 |
108 | movd m1, r4d |
109 | pshuflw m1, m1, 0 ; m1 = pixDCx3 (DC * 3 + 2 in the 4 low words) |
110 | ||
111 | ; filter top: dst[0][x] = (above[x] + 3 * DC + 2) >> 2 |
112 | pmovzxbw m2, [r3] |
113 | paddw m2, m1 |
114 | psraw m2, 2 |
115 | packuswb m2, m2 |
116 | movd [r0], m2 ; overwrite top-left pixel, we will update it later |
117 | ||
118 | ; filter top-left: dst[0][0] = (left + above + 2 * DC + 2) >> 2 |
119 | movzx r3d, byte [r3] |
120 | add r5d, r3d |
121 | movzx r3d, byte [r2] |
122 | add r3d, r5d |
123 | shr r3d, 2 |
124 | mov [r0], r3b |
125 | ||
126 | ; filter left: column 0 of rows 1..3 = (left[y] + 3 * DC + 2) >> 2 |
127 | add r0, r1 |
128 | pmovzxbw m2, [r2 + 1] |
129 | paddw m2, m1 |
130 | psraw m2, 2 |
131 | packuswb m2, m2 |
132 | pextrb [r0], m2, 0 |
133 | pextrb [r0 + r1], m2, 1 |
134 | pextrb [r0 + r1 * 2], m2, 2 |
135 | ||
136 | .end: |
137 | RET |
138 | ||
139 | ||
140 | ;------------------------------------------------------------------------------------------- |
141 | ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) |
; 8x8 DC prediction: fill the block with the rounded mean of 8 left and
; 8 above samples, then (when filter != 0) smooth the first row and column.
; Registers: r0 = dst (advanced while storing), r6 = saved dst, r1 = dstStride,
; r2 = left, r3 = above; the filter flag is read from r5m. dirMode is not read.
142 | ;------------------------------------------------------------------------------------------- |
143 | INIT_XMM sse4 |
144 | cglobal intra_pred_dc8, 4, 7, 3 |
145 | mov r4d, r5m |
; step past element 0 of both neighbour arrays; samples are read from index 1
146 | inc r2 |
147 | inc r3 |
148 | pxor m0, m0 |
149 | movh m1, [r2] |
150 | movh m2, [r3] |
151 | punpcklqdq m1, m2 |
152 | psadbw m1, m0 |
; psadbw leaves one partial sum per qword; fold the high one onto the low one
153 | pshufd m2, m1, 2 |
154 | paddw m1, m2 |
155 | ||
156 | movd r5d, m1 |
157 | add r5d, 8 |
158 | shr r5d, 4 ; r5d = dc_val = (sum + 8) / 16 |
159 | movd m1, r5d |
160 | pshufb m1, m0 ; m1 = byte [dc_val ...] |
161 | ||
; ZF = (filter == 0). It is consumed by `jz .end` after the DC store, so no
; instruction between here and that jump may modify EFLAGS.
162 | test r4d, r4d |
163 | ||
164 | ; store DC 8x8 |
165 | mov r6, r0 |
166 | movh [r0], m1 |
167 | movh [r0 + r1], m1 |
168 | lea r0, [r0 + r1 * 2] |
169 | movh [r0], m1 |
170 | movh [r0 + r1], m1 |
171 | lea r0, [r0 + r1 * 2] |
172 | movh [r0], m1 |
173 | movh [r0 + r1], m1 |
174 | lea r0, [r0 + r1 * 2] |
175 | movh [r0], m1 |
176 | movh [r0 + r1], m1 |
177 | ||
178 | ; Do DC Filter (skipped when the earlier `test r4d, r4d` set ZF) |
179 | jz .end |
180 | lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 |
181 | add r5d, r4d ; r5d = DC * 3 + 2 |
182 | movd m1, r5d |
183 | pshuflw m1, m1, 0 ; m1 = pixDCx3 |
184 | pshufd m1, m1, 0 |
185 | ||
186 | ; filter top: dst[0][x] = (above[x] + 3 * DC + 2) >> 2 |
187 | pmovzxbw m2, [r3] |
188 | paddw m2, m1 |
189 | psraw m2, 2 |
190 | packuswb m2, m2 |
191 | movh [r6], m2 |
192 | ||
193 | ; filter top-left: dst[0][0] = (left + above + 2 * DC + 2) >> 2 |
194 | movzx r3d, byte [r3] |
195 | add r4d, r3d |
196 | movzx r3d, byte [r2] |
197 | add r3d, r4d |
198 | shr r3d, 2 |
199 | mov [r6], r3b |
200 | ||
201 | ; filter left: column 0 of rows 1..7 = (left[y] + 3 * DC + 2) >> 2 |
202 | add r6, r1 |
203 | pmovzxbw m2, [r2 + 1] |
204 | paddw m2, m1 |
205 | psraw m2, 2 |
206 | packuswb m2, m2 |
207 | pextrb [r6], m2, 0 |
208 | pextrb [r6 + r1], m2, 1 |
209 | pextrb [r6 + 2 * r1], m2, 2 |
210 | lea r6, [r6 + r1 * 2] |
211 | pextrb [r6 + r1], m2, 3 |
212 | pextrb [r6 + r1 * 2], m2, 4 |
; row 7 (byte 6) is written before row 6 (byte 5): r1 is tripled last so that
; [r6 + r1] can reach row 6
213 | pextrb [r6 + r1 * 4], m2, 6 |
214 | lea r1, [r1 * 3] |
215 | pextrb [r6 + r1], m2, 5 |
216 | ||
217 | .end: |
218 | RET |
219 | ||
220 | ;------------------------------------------------------------------------------------------- |
221 | ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) |
; 16x16 DC prediction: fill the block with the rounded mean of 16 left and
; 16 above samples, then (when filter != 0) smooth the first row and column.
; Registers: r0 = dst (advanced while storing), r6 = saved dst, r1 = dstStride,
; r2 = left, r3 = above; the filter flag is read from r5m.
; NOTE(review): cglobal asks for 5 loaded arguments, but r4 is overwritten by
; the first instruction; the 4x4 and 8x8 variants request 4.
222 | ;------------------------------------------------------------------------------------------- |
223 | INIT_XMM sse4 |
224 | cglobal intra_pred_dc16, 5, 7, 4 |
225 | mov r4d, r5m |
; step past element 0 of both neighbour arrays; samples are read from index 1
226 | inc r2 |
227 | inc r3 |
228 | pxor m0, m0 |
229 | movu m1, [r2] |
230 | movu m2, [r3] |
231 | psadbw m1, m0 |
232 | psadbw m2, m0 |
233 | paddw m1, m2 |
; fold the high-qword partial sum onto the low one
234 | pshufd m2, m1, 2 |
235 | paddw m1, m2 |
236 | ||
237 | movd r5d, m1 |
238 | add r5d, 16 |
239 | shr r5d, 5 ; r5d = dc_val = (sum + 16) / 32 |
240 | movd m1, r5d |
241 | pshufb m1, m0 ; m1 = byte [dc_val ...] |
242 | ||
; ZF = (filter == 0). It is consumed by `jz .end` after the DC store, so no
; instruction between here and that jump may modify EFLAGS.
243 | test r4d, r4d |
244 | ||
245 | ; store DC 16x16 |
246 | mov r6, r0 |
247 | movu [r0], m1 |
248 | movu [r0 + r1], m1 |
249 | lea r0, [r0 + r1 * 2] |
250 | movu [r0], m1 |
251 | movu [r0 + r1], m1 |
252 | lea r0, [r0 + r1 * 2] |
253 | movu [r0], m1 |
254 | movu [r0 + r1], m1 |
255 | lea r0, [r0 + r1 * 2] |
256 | movu [r0], m1 |
257 | movu [r0 + r1], m1 |
258 | lea r0, [r0 + r1 * 2] |
259 | movu [r0], m1 |
260 | movu [r0 + r1], m1 |
261 | lea r0, [r0 + r1 * 2] |
262 | movu [r0], m1 |
263 | movu [r0 + r1], m1 |
264 | lea r0, [r0 + r1 * 2] |
265 | movu [r0], m1 |
266 | movu [r0 + r1], m1 |
267 | lea r0, [r0 + r1 * 2] |
268 | movu [r0], m1 |
269 | movu [r0 + r1], m1 |
270 | ||
271 | ; Do DC Filter (skipped when the earlier `test r4d, r4d` set ZF) |
272 | jz .end |
273 | lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 |
274 | add r5d, r4d ; r5d = DC * 3 + 2 |
275 | movd m1, r5d |
276 | pshuflw m1, m1, 0 ; m1 = pixDCx3 |
277 | pshufd m1, m1, 0 |
278 | ||
279 | ; filter top: dst[0][x] = (above[x] + 3 * DC + 2) >> 2, two halves of 8 |
280 | pmovzxbw m2, [r3] |
281 | paddw m2, m1 |
282 | psraw m2, 2 |
283 | packuswb m2, m2 |
284 | movh [r6], m2 |
285 | pmovzxbw m3, [r3 + 8] |
286 | paddw m3, m1 |
287 | psraw m3, 2 |
288 | packuswb m3, m3 |
289 | movh [r6 + 8], m3 |
290 | ||
291 | ; filter top-left: dst[0][0] = (left + above + 2 * DC + 2) >> 2 |
292 | movzx r3d, byte [r3] |
293 | add r4d, r3d |
294 | movzx r3d, byte [r2] |
295 | add r3d, r4d |
296 | shr r3d, 2 |
297 | mov [r6], r3b |
298 | ||
299 | ; filter left, rows 1..8 (bytes from [r2 + 1]) |
300 | add r6, r1 |
301 | pmovzxbw m2, [r2 + 1] |
302 | paddw m2, m1 |
303 | psraw m2, 2 |
304 | packuswb m2, m2 |
305 | pextrb [r6], m2, 0 |
306 | pextrb [r6 + r1], m2, 1 |
307 | pextrb [r6 + r1 * 2], m2, 2 |
308 | lea r6, [r6 + r1 * 2] |
309 | pextrb [r6 + r1], m2, 3 |
310 | pextrb [r6 + r1 * 2], m2, 4 |
311 | lea r6, [r6 + r1 * 2] |
312 | pextrb [r6 + r1], m2, 5 |
313 | pextrb [r6 + r1 * 2], m2, 6 |
314 | lea r6, [r6 + r1 * 2] |
315 | pextrb [r6 + r1], m2, 7 |
316 | ||
; filter left, rows 9..15 (bytes from [r2 + 9]; only 7 of the 8 are stored)
317 | pmovzxbw m3, [r2 + 9] |
318 | paddw m3, m1 |
319 | psraw m3, 2 |
320 | packuswb m3, m3 |
321 | pextrb [r6 + r1 * 2], m3, 0 |
322 | lea r6, [r6 + r1 * 2] |
323 | pextrb [r6 + r1], m3, 1 |
324 | pextrb [r6 + r1 * 2], m3, 2 |
325 | lea r6, [r6 + r1 * 2] |
326 | pextrb [r6 + r1], m3, 3 |
327 | pextrb [r6 + r1 * 2], m3, 4 |
328 | lea r6, [r6 + r1 * 2] |
329 | pextrb [r6 + r1], m3, 5 |
330 | pextrb [r6 + r1 * 2], m3, 6 |
331 | ||
332 | .end: |
333 | RET |
334 | ||
335 | ;------------------------------------------------------------------------------------------- |
336 | ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) |
; 32x32 DC prediction: fill the block with the rounded mean of 32 left and
; 32 above samples. Unlike the smaller sizes this routine has no edge-filter
; path; dirMode and filter are not read.
; Registers: r0 = dst (advanced while storing), r1 = dstStride, r2 = left,
; r3 = above, r4d = dc_val.
337 | ;------------------------------------------------------------------------------------------- |
338 | INIT_XMM sse4 |
339 | cglobal intra_pred_dc32, 4, 5, 5 |
; step past element 0 of both neighbour arrays; samples are read from index 1
340 | inc r2 |
341 | inc r3 |
342 | pxor m0, m0 |
343 | movu m1, [r2] |
344 | movu m2, [r2 + 16] |
345 | movu m3, [r3] |
346 | movu m4, [r3 + 16] |
347 | psadbw m1, m0 |
348 | psadbw m2, m0 |
349 | psadbw m3, m0 |
350 | psadbw m4, m0 |
351 | paddw m1, m2 |
352 | paddw m3, m4 |
353 | paddw m1, m3 |
; fold the high-qword partial sum onto the low one
354 | pshufd m2, m1, 2 |
355 | paddw m1, m2 |
356 | ||
357 | movd r4d, m1 |
358 | add r4d, 32 |
359 | shr r4d, 6 ; r4d = dc_val = (sum + 32) / 64 |
360 | movd m1, r4d |
361 | pshufb m1, m0 ; m1 = byte [dc_val ...] |
362 | ||
363 | %rep 2 |
364 | ; store DC: each repetition writes 16 rows of 32 pixels (8 groups of 2 rows) |
365 | movu [r0], m1 |
366 | movu [r0 + r1], m1 |
367 | movu [r0 + 16], m1 |
368 | movu [r0 + r1 + 16],m1 |
369 | lea r0, [r0 + 2 * r1] |
370 | movu [r0], m1 |
371 | movu [r0 + r1], m1 |
372 | movu [r0 + 16], m1 |
373 | movu [r0 + r1 + 16],m1 |
374 | lea r0, [r0 + 2 * r1] |
375 | movu [r0], m1 |
376 | movu [r0 + r1], m1 |
377 | movu [r0 + 16], m1 |
378 | movu [r0 + r1 + 16],m1 |
379 | lea r0, [r0 + 2 * r1] |
380 | movu [r0], m1 |
381 | movu [r0 + r1], m1 |
382 | movu [r0 + 16], m1 |
383 | movu [r0 + r1 + 16],m1 |
384 | lea r0, [r0 + 2 * r1] |
385 | movu [r0], m1 |
386 | movu [r0 + r1], m1 |
387 | movu [r0 + 16], m1 |
388 | movu [r0 + r1 + 16],m1 |
389 | lea r0, [r0 + 2 * r1] |
390 | movu [r0], m1 |
391 | movu [r0 + r1], m1 |
392 | movu [r0 + 16], m1 |
393 | movu [r0 + r1 + 16],m1 |
394 | lea r0, [r0 + 2 * r1] |
395 | movu [r0], m1 |
396 | movu [r0 + r1], m1 |
397 | movu [r0 + 16], m1 |
398 | movu [r0 + r1 + 16],m1 |
399 | lea r0, [r0 + 2 * r1] |
400 | movu [r0], m1 |
401 | movu [r0 + r1], m1 |
402 | movu [r0 + 16], m1 |
403 | movu [r0 + r1 + 16],m1 |
404 | lea r0, [r0 + 2 * r1] |
405 | %endrep |
406 | ||
407 | RET |
408 | ||
409 | ;----------------------------------------------------------------------------------------------------------- |
410 | ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) |
; 4x4 planar prediction, two rows per macro invocation.
; Registers: r0 = dst (advanced by two rows per invocation), r1 = dstStride,
; r2 = left, r3 = above (both stepped past element 0), r6d = topRight.
; dirMode and filter are not read.
; m0 = running vertical term: low qword for the first row of a pair, high
;      qword for the second row; m2 = 2 * bottomRow, added once per pair.
411 | ;----------------------------------------------------------------------------------------------------------- |
412 | INIT_XMM sse4 |
413 | cglobal intra_pred_planar4, 4,7,5 |
414 | inc r2 |
415 | inc r3 |
416 | pmovzxbw m0, [r3] ; topRow[i] = above[i]; |
417 | punpcklqdq m0, m0 |
418 | ||
419 | pxor m1, m1 |
420 | movd m2, [r2 + 4] ; bottomLeft = left[4] |
421 | movzx r6d, byte [r3 + 4] ; topRight = above[4]; |
422 | pshufb m2, m1 |
423 | punpcklbw m2, m1 |
424 | psubw m2, m0 ; bottomRow[i] = bottomLeft - topRow[i] |
425 | psllw m0, 2 |
; bias m0 so that adding 2 * bottomRow per pair yields 4*top + bottomRow in
; the low qword and 4*top + 2*bottomRow in the high qword on the first pair
426 | punpcklqdq m3, m2, m1 |
427 | psubw m0, m3 |
428 | paddw m2, m2 |
429 | ||
; COMP_PRED_PLANAR_2ROW y: emit rows y and y+1.
; Clobbers r4d, r5d, m1, m3, m4; updates m0 and advances r0 by two rows.
; multi_2Row is defined elsewhere - presumably the per-column horizontal
; weights for two rows; confirm against its definition.
430 | %macro COMP_PRED_PLANAR_2ROW 1 |
431 | movzx r4d, byte [r2 + %1] |
432 | lea r4d, [r4d * 4 + 4] |
433 | movd m3, r4d |
434 | pshuflw m3, m3, 0 |
435 | ||
436 | movzx r4d, byte [r2 + %1 + 1] |
437 | lea r4d, [r4d * 4 + 4] |
438 | movd m4, r4d |
439 | pshuflw m4, m4, 0 |
440 | punpcklqdq m3, m4 ; horPred = left[y] * 4 + 4 (low: row y, high: row y+1) |
441 | ||
442 | movzx r4d, byte [r2 + %1] |
443 | mov r5d, r6d |
444 | sub r5d, r4d |
445 | movd m4, r5d |
446 | pshuflw m4, m4, 0 |
447 | ||
448 | movzx r4d, byte [r2 + %1 + 1] |
449 | mov r5d, r6d |
450 | sub r5d, r4d |
451 | movd m1, r5d |
452 | pshuflw m1, m1, 0 |
453 | punpcklqdq m4, m1 ; rightColumnN = topRight - left[y] (low: row y, high: row y+1) |
454 | ||
455 | pmullw m4, [multi_2Row] |
456 | paddw m3, m4 |
457 | paddw m0, m2 |
458 | paddw m3, m0 |
459 | psraw m3, 3 |
460 | packuswb m3, m3 |
461 | ||
; after packing, bytes 0-3 hold row y and bytes 4-7 hold row y+1
462 | movd [r0], m3 |
463 | pshufd m3, m3, 0x55 |
464 | movd [r0 + r1], m3 |
465 | lea r0, [r0 + 2 * r1] |
466 | %endmacro |
467 | ||
468 | COMP_PRED_PLANAR_2ROW 0 |
469 | COMP_PRED_PLANAR_2ROW 2 |
470 | ||
471 | RET |
472 | ||
473 | ;----------------------------------------------------------------------------------------------------------- |
474 | ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) |
; 8x8 planar prediction, one row per macro invocation.
; Registers: r0 = dst (advanced one row per invocation), r1 = dstStride,
; r2 = left, r3 = above (both stepped past element 0). dirMode and filter are
; not read.
; m1 = topRow * 8, accumulating bottomRow once per row; m3 = rightColumn;
; m4 = bottomRow; m6 = leftColumn * 8 + [pw_8].
475 | ;----------------------------------------------------------------------------------------------------------- |
476 | INIT_XMM sse4 |
477 | cglobal intra_pred_planar8, 4,4,7 |
478 | inc r2 |
479 | inc r3 |
480 | pxor m0, m0 |
481 | pmovzxbw m1, [r3] ; v_topRow |
482 | pmovzxbw m2, [r2] ; v_leftColumn |
483 | ||
484 | movd m3, [r3 + 8] ; topRight = above[8]; |
485 | movd m4, [r2 + 8] ; bottomLeft = left[8]; |
486 | ||
; broadcast byte 0 (all-zero pshufb mask), then widen to words
487 | pshufb m3, m0 |
488 | pshufb m4, m0 |
489 | punpcklbw m3, m0 ; v_topRight |
490 | punpcklbw m4, m0 ; v_bottomLeft |
491 | ||
492 | psubw m4, m1 ; v_bottomRow = bottomLeft - topRow |
493 | psubw m3, m2 ; v_rightColumn = topRight - leftColumn |
494 | ||
495 | psllw m1, 3 ; v_topRow * 8 |
496 | psllw m2, 3 ; v_leftColumn * 8 |
497 | ||
; pw_8 is defined elsewhere - presumably the rounding term 8 in every word
498 | paddw m6, m2, [pw_8] |
499 | ||
; PRED_PLANAR_ROW8 y: emit row y. Broadcasts word y of m6 and of m3
; (0x55 * k replicates the 2-bit selector k into all four fields), then
; combines them with the running vertical term in m1.
; Clobbers m2, m5; updates m1 and advances r0 by one row.
; multiL is defined elsewhere - presumably the per-column weights; confirm.
500 | %macro PRED_PLANAR_ROW8 1 |
501 | %if (%1 < 4) |
502 | pshuflw m5, m6, 0x55 * %1 |
503 | pshufd m5, m5, 0 |
504 | pshuflw m2, m3, 0x55 * %1 |
505 | pshufd m2, m2, 0 |
506 | %else |
507 | pshufhw m5, m6, 0x55 * (%1 - 4) |
508 | pshufd m5, m5, 0xAA |
509 | pshufhw m2, m3, 0x55 * (%1 - 4) |
510 | pshufd m2, m2, 0xAA |
511 | %endif |
512 | ||
513 | pmullw m2, [multiL] |
514 | paddw m5, m2 |
515 | paddw m1, m4 |
516 | paddw m5, m1 |
517 | psraw m5, 4 |
518 | packuswb m5, m5 |
519 | ||
520 | movh [r0], m5 |
521 | lea r0, [r0 + r1] |
522 | ||
523 | %endmacro |
524 | ||
525 | PRED_PLANAR_ROW8 0 |
526 | PRED_PLANAR_ROW8 1 |
527 | PRED_PLANAR_ROW8 2 |
528 | PRED_PLANAR_ROW8 3 |
529 | PRED_PLANAR_ROW8 4 |
530 | PRED_PLANAR_ROW8 5 |
531 | PRED_PLANAR_ROW8 6 |
532 | PRED_PLANAR_ROW8 7 |
533 | ||
534 | RET |
535 | ||
536 | ||
537 | ;----------------------------------------------------------------------------------------------------------- |
538 | ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) |
; 16x16 planar prediction, one row per macro invocation.
; Registers: r0 = dst (advanced one row per invocation), r1 = dstStride,
; r2 = left (stepped past element 0), r4d = topRight. r3 holds `above` only
; during setup and is reused as scratch inside the macro. dirMode and filter
; are not read.
; m1/m2 = topRow * 16 for columns 0-7 / 8-15, each accumulating its bottomRow
; (m4/m5) once per row.
539 | ;----------------------------------------------------------------------------------------------------------- |
540 | INIT_XMM sse4 |
541 | cglobal intra_pred_planar16, 4,6,8 |
542 | inc r2 |
543 | inc r3 |
544 | pxor m0, m0 |
545 | pmovzxbw m1, [r3] ; topRow[0-7] |
546 | pmovzxbw m2, [r3 + 8] ; topRow[8-15] |
547 | ||
548 | movd m3, [r2 + 16] |
549 | pshufb m3, m0 |
550 | punpcklbw m3, m0 ; v_bottomLeft = left[16] |
551 | movzx r4d, byte [r3 + 16] ; topRight = above[16] |
552 | ||
553 | psubw m4, m3, m1 ; v_bottomRow[0] |
554 | psubw m5, m3, m2 ; v_bottomRow[1] |
555 | ||
556 | psllw m1, 4 |
557 | psllw m2, 4 |
558 | ||
; PRED_PLANAR_ROW16 y: emit row y.
; Clobbers r3d, r5d, m3, m6, m7; updates m1, m2 and advances r0 by one row.
; multiL / multiH are defined elsewhere - presumably the column weights for
; columns 0-7 and 8-15; confirm against their definitions.
559 | %macro PRED_PLANAR_ROW16 1 |
; r5d = left[y] * 16 + 16
560 | movzx r5d, byte [r2 + %1] |
561 | add r5d, r5d |
562 | lea r5d, [r5d * 8 + 16] |
563 | movd m3, r5d |
564 | pshuflw m3, m3, 0 |
565 | pshufd m3, m3, 0 ; horPred |
566 | ||
; m6 = topRight - left[y], broadcast to all words
567 | movzx r5d, byte [r2 + %1] |
568 | mov r3d, r4d |
569 | sub r3d, r5d |
570 | movd m6, r3d |
571 | pshuflw m6, m6, 0 |
572 | pshufd m6, m6, 0 |
573 | ||
; columns 0-7
574 | pmullw m7, m6, [multiL] |
575 | paddw m7, m3 |
576 | paddw m1, m4 |
577 | paddw m7, m1 |
578 | psraw m7, 5 |
579 | ||
; columns 8-15
580 | pmullw m6, m6, [multiH] |
581 | paddw m3, m6 |
582 | paddw m2, m5 |
583 | paddw m3, m2 |
584 | psraw m3, 5 |
585 | ||
586 | packuswb m7, m3 |
587 | movu [r0], m7 |
588 | lea r0, [r0 + r1] |
589 | %endmacro |
590 | ||
591 | PRED_PLANAR_ROW16 0 |
592 | PRED_PLANAR_ROW16 1 |
593 | PRED_PLANAR_ROW16 2 |
594 | PRED_PLANAR_ROW16 3 |
595 | PRED_PLANAR_ROW16 4 |
596 | PRED_PLANAR_ROW16 5 |
597 | PRED_PLANAR_ROW16 6 |
598 | PRED_PLANAR_ROW16 7 |
599 | PRED_PLANAR_ROW16 8 |
600 | PRED_PLANAR_ROW16 9 |
601 | PRED_PLANAR_ROW16 10 |
602 | PRED_PLANAR_ROW16 11 |
603 | PRED_PLANAR_ROW16 12 |
604 | PRED_PLANAR_ROW16 13 |
605 | PRED_PLANAR_ROW16 14 |
606 | PRED_PLANAR_ROW16 15 |
607 | ||
608 | RET |
609 | ||
610 | ||
611 | ;----------------------------------------------------------------------------------------------------------- |
612 | ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) |
; 32x32 planar prediction: a 32-iteration loop, each iteration emitting one
; row as two 16-pixel halves.
; Registers: r0 = dst (advanced one row per iteration), r1 = dstStride,
; r2 = left (stepped past element 0, then advanced one byte per row),
; r4d = topRight. r3 holds `above` during setup and is then reused as the row
; counter. dirMode and filter are not read.
; m1..m4 = topRow * 32 for the four 8-column groups, each accumulating its
; bottomRow. bottomRow0..3 live in m8..m11 on x86-64 and in four stack slots
; otherwise.
613 | ;----------------------------------------------------------------------------------------------------------- |
614 | INIT_XMM sse4 |
615 | %if ARCH_X86_64 == 1 |
616 | cglobal intra_pred_planar32, 4,7,12 |
617 | %define bottomRow0 m8 |
618 | %define bottomRow1 m9 |
619 | %define bottomRow2 m10 |
620 | %define bottomRow3 m11 |
621 | %else |
622 | cglobal intra_pred_planar32, 4,7,8,0-(4*mmsize) |
623 | %define bottomRow0 [rsp + 0 * mmsize] |
624 | %define bottomRow1 [rsp + 1 * mmsize] |
625 | %define bottomRow2 [rsp + 2 * mmsize] |
626 | %define bottomRow3 [rsp + 3 * mmsize] |
627 | %endif |
628 | inc r2 |
629 | inc r3 |
; m3 is only a zero mask here; it is reloaded as topRow[2] below
630 | pxor m3, m3 |
631 | movd m0, [r2 + 32] |
632 | pshufb m0, m3 |
633 | punpcklbw m0, m3 ; v_bottomLeft = left[32] |
634 | movzx r4d, byte [r3 + 32] ; topRight = above[32] |
635 | ||
636 | pmovzxbw m1, [r3 + 0] ; topRow[0] (columns 0-7) |
637 | pmovzxbw m2, [r3 + 8] ; topRow[1] (columns 8-15) |
638 | pmovzxbw m3, [r3 +16] ; topRow[2] (columns 16-23) |
639 | pmovzxbw m4, [r3 +24] ; topRow[3] (columns 24-31) |
640 | ||
641 | psubw m5, m0, m1 ; v_bottomRow[0] |
642 | psubw m6, m0, m2 ; v_bottomRow[1] |
643 | psubw m7, m0, m3 ; v_bottomRow[2] |
644 | psubw m0, m4 ; v_bottomRow[3] |
645 | ||
646 | mova bottomRow0, m5 |
647 | mova bottomRow1, m6 |
648 | mova bottomRow2, m7 |
649 | mova bottomRow3, m0 |
650 | ||
651 | psllw m1, 5 |
652 | psllw m2, 5 |
653 | psllw m3, 5 |
654 | psllw m4, 5 |
655 | ||
; COMP_PRED_PLANAR_ROW off: emit 16 pixels of the current row at [r0 + off].
; off == 0 handles columns 0-15 (m1/m2, multiL/multiH); any other value
; handles the second half (m3/m4, multiH2/multiH3).
; Clobbers r5d, r6d, m5, m6, m7. The multi* tables are defined elsewhere -
; presumably the per-column weights; confirm against their definitions.
656 | %macro COMP_PRED_PLANAR_ROW 1 |
; r5d = left[y] * 32 + 32
657 | movzx r5d, byte [r2] |
658 | shl r5d, 5 |
659 | add r5d, 32 |
660 | movd m5, r5d |
661 | pshuflw m5, m5, 0 |
662 | pshufd m5, m5, 0 ; horPred |
663 | ||
; m6 = topRight - left[y], broadcast to all words
664 | movzx r5d, byte [r2] |
665 | mov r6d, r4d |
666 | sub r6d, r5d |
667 | movd m6, r6d |
668 | pshuflw m6, m6, 0 |
669 | pshufd m6, m6, 0 |
670 | ||
671 | %if (%1 == 0) |
672 | pmullw m7, m6, [multiL] |
673 | %else |
674 | pmullw m7, m6, [multiH2] |
675 | %endif |
676 | ||
677 | paddw m7, m5 |
678 | %if (%1 == 0) |
679 | paddw m1, bottomRow0 |
680 | paddw m7, m1 |
681 | %else |
682 | paddw m3, bottomRow2 |
683 | paddw m7, m3 |
684 | %endif |
685 | psraw m7, 6 |
686 | ||
687 | %if (%1 == 0) |
688 | pmullw m6, [multiH] |
689 | %else |
690 | pmullw m6, [multiH3] |
691 | %endif |
692 | paddw m6, m5 |
693 | %if (%1 == 0) |
694 | paddw m2, bottomRow1 |
695 | paddw m6, m2 |
696 | %else |
697 | paddw m4, bottomRow3 |
698 | paddw m6, m4 |
699 | %endif |
700 | psraw m6, 6 |
701 | ||
702 | packuswb m7, m6 |
703 | movu [r0 + %1], m7 |
704 | %endmacro |
705 | ||
; r3 = number of rows left to emit
706 | mov r3, 32 |
707 | .loop: |
708 | COMP_PRED_PLANAR_ROW 0 |
709 | COMP_PRED_PLANAR_ROW 16 |
710 | inc r2 |
711 | lea r0, [r0 + r1] |
712 | ||
713 | dec r3 |
714 | jnz .loop |
; NOTE(review): %undef removes single-line macros, while COMP_PRED_PLANAR_ROW
; is a multi-line %macro, so this line probably has no effect - confirm
; against the assembler's preprocessor documentation before changing it.
715 | %undef COMP_PRED_PLANAR_ROW |
716 | ||
717 | RET |
718 | ||
719 | ;----------------------------------------------------------------------------- |
720 | ; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) |
; 4x4 angular prediction shared by modes 2 and 34: row y is a plain copy of
; ref[y + 2 .. y + 5]. r2 = refLeft unless dirMode == 34, in which case the
; refAbove argument (r3mp) is used instead. bFilter is not read.
721 | ;----------------------------------------------------------------------------- |
722 | INIT_XMM ssse3 |
723 | cglobal intra_pred_ang4_2, 3,3,4 |
724 | cmp r4m, byte 34 |
725 | cmove r2, r3mp |
726 | movh m0, [r2 + 2] |
727 | movd [r0], m0 |
; palignr here acts as a byte shift of m0; the bytes shifted in at the top
; come from the uninitialised destination, but only the low dword is stored
728 | palignr m1, m0, 1 |
729 | movd [r0 + r1], m1 |
730 | palignr m2, m0, 2 |
731 | movd [r0 + r1 * 2], m2 |
732 | lea r1, [r1 * 3] |
733 | psrldq m0, 3 |
734 | movd [r0 + r1], m0 |
735 | RET |
736 | ||
737 | ||
; 4x4 angular prediction for mode 3 (and 33 when dirMode == 33, which selects
; the refAbove argument and skips the final transpose).
; This function also owns the shared tail .do_filter4x4, which the other
; intra_pred_ang4_* routines in this file jump into.
738 | INIT_XMM sse4 |
739 | cglobal intra_pred_ang4_3, 3,4,5 |
; ZF from this compare is consumed twice: by cmove here and by `jz .store`
; in the shared tail, so nothing in between may modify EFLAGS
740 | cmp r4m, byte 33 |
741 | cmove r2, r3mp |
742 | lea r3, [ang_table + 20 * 16] |
743 | movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] |
744 | palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] |
745 | punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
746 | palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] |
747 | palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] |
748 | palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4] |
749 | punpcklqdq m0, m1 |
750 | punpcklqdq m2, m3 |
751 | ||
; weights: the bracketed number is the ang_table row index
752 | movh m3, [r3 + 6 * 16] ; [26] |
753 | movhps m3, [r3] ; [20] |
754 | movh m4, [r3 - 6 * 16] ; [14] |
755 | movhps m4, [r3 - 12 * 16] ; [ 8] |
756 | jmp .do_filter4x4 |
757 | ||
758 | ; NOTE: shared path, input is m0=[row1 row0], m2=[row3 row2] byte pairs, m3,m4=coef, ZF set = no transpose |
759 | ALIGN 16 |
760 | .do_filter4x4: |
; pw_1024 is defined elsewhere; if it holds words of 1024, pmulhrsw yields
; (x * 1024 + 0x4000) >> 15 = (x + 16) >> 5
761 | mova m1, [pw_1024] |
762 | ||
763 | pmaddubsw m0, m3 |
764 | pmulhrsw m0, m1 |
765 | pmaddubsw m2, m4 |
766 | pmulhrsw m2, m1 |
767 | packuswb m0, m2 |
768 | ||
769 | ; NOTE: mode 33 doesn't reorder (no transpose). UNSAFE: relies on ZF from the caller's `cmp r4m`; no instruction before this point may affect EFLAGS |
770 | jz .store |
771 | ||
772 | ; transpose 4x4 |
773 | pshufb m0, [c_trans_4x4] |
774 | ||
775 | .store: |
776 | ; NOTE(review): rows 1-3 below are already stored with pextrd, so the earlier TODO about switching to pextrd looks resolved |
777 | movd [r0], m0 |
778 | pextrd [r0 + r1], m0, 1 |
779 | pextrd [r0 + r1 * 2], m0, 2 |
780 | lea r1, [r1 * 3] |
781 | pextrd [r0 + r1], m0, 3 |
782 | RET |
783 | ||
784 | ||
; 4x4 angular prediction for mode 4 (dirMode == 32 selects the refAbove
; argument and, through ZF, skips the transpose in the shared tail).
; Prepares m0/m2 (reference byte pairs) and m3/m4 (ang_table weights), then
; tail-jumps into intra_pred_ang4_3.do_filter4x4. No EFLAGS-modifying
; instruction may be placed after the cmp.
785 | cglobal intra_pred_ang4_4, 3,4,5 |
786 | cmp r4m, byte 32 |
787 | cmove r2, r3mp |
788 | lea r3, [ang_table + 18 * 16] |
789 | movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] |
790 | palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] |
791 | punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
792 | palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] |
793 | palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] |
794 | punpcklqdq m0, m1 |
795 | punpcklqdq m2, m1, m3 |
796 | ||
; weights: the bracketed number is the ang_table row index
797 | movh m3, [r3 + 3 * 16] ; [21] |
798 | movhps m3, [r3 - 8 * 16] ; [10] |
799 | movh m4, [r3 + 13 * 16] ; [31] |
800 | movhps m4, [r3 + 2 * 16] ; [20] |
801 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
802 | ||
803 | ||
; 4x4 angular prediction for mode 5 (dirMode == 31 selects the refAbove
; argument and, through ZF, skips the transpose in the shared tail).
; Prepares m0/m2 (reference byte pairs) and m3/m4 (ang_table weights), then
; tail-jumps into intra_pred_ang4_3.do_filter4x4. No EFLAGS-modifying
; instruction may be placed after the cmp.
804 | cglobal intra_pred_ang4_5, 3,4,5 |
805 | cmp r4m, byte 31 |
806 | cmove r2, r3mp |
807 | lea r3, [ang_table + 10 * 16] |
808 | movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] |
809 | palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] |
810 | punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
811 | palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] |
812 | palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] |
813 | punpcklqdq m0, m1 |
814 | punpcklqdq m2, m1, m3 |
815 | ||
; weights: the bracketed number is the ang_table row index
816 | movh m3, [r3 + 7 * 16] ; [17] |
817 | movhps m3, [r3 - 8 * 16] ; [ 2] |
818 | movh m4, [r3 + 9 * 16] ; [19] |
819 | movhps m4, [r3 - 6 * 16] ; [ 4] |
820 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
821 | ||
822 | ||
; 4x4 angular prediction for mode 6 (dirMode == 30 selects the refAbove
; argument and, through ZF, skips the transpose in the shared tail).
; Rows 0/1 share one set of reference pairs (m0) and rows 2/3 the next (m2);
; only the weights differ. Tail-jumps into intra_pred_ang4_3.do_filter4x4, so
; no EFLAGS-modifying instruction may be placed after the cmp.
823 | cglobal intra_pred_ang4_6, 3,4,5 |
824 | cmp r4m, byte 30 |
825 | cmove r2, r3mp |
826 | lea r3, [ang_table + 19 * 16] |
827 | movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] |
828 | palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] |
829 | punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
830 | palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] |
831 | punpcklqdq m0, m0 |
832 | punpcklqdq m2, m2 |
833 | ||
; weights: the bracketed number is the ang_table row index
834 | movh m3, [r3 - 6 * 16] ; [13] |
835 | movhps m3, [r3 + 7 * 16] ; [26] |
836 | movh m4, [r3 - 12 * 16] ; [ 7] |
837 | movhps m4, [r3 + 1 * 16] ; [20] |
838 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
839 | ||
840 | ||
; 4x4 angular prediction for mode 7 (dirMode == 29 selects the refAbove
; argument and, through ZF, skips the transpose in the shared tail).
; Rows 0-2 use the same reference pairs; row 3 uses the pairs shifted by one
; sample. Tail-jumps into intra_pred_ang4_3.do_filter4x4, so no
; EFLAGS-modifying instruction may be placed after the cmp.
841 | cglobal intra_pred_ang4_7, 3,4,5 |
842 | cmp r4m, byte 29 |
843 | cmove r2, r3mp |
844 | lea r3, [ang_table + 20 * 16] |
845 | movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] |
846 | palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] |
847 | punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
848 | palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] |
849 | punpcklqdq m2, m0, m3 |
850 | punpcklqdq m0, m0 |
851 | ||
; weights: the bracketed number is the ang_table row index
852 | movh m3, [r3 - 11 * 16] ; [ 9] |
853 | movhps m3, [r3 - 2 * 16] ; [18] |
854 | movh m4, [r3 + 7 * 16] ; [27] |
855 | movhps m4, [r3 - 16 * 16] ; [ 4] |
856 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
857 | ||
858 | ||
; 4x4 angular prediction for mode 8 (dirMode == 28 selects the refAbove
; argument and, through ZF, skips the transpose in the shared tail).
; All four rows use the same reference pairs (m0 == m2); only the weights
; differ. Tail-jumps into intra_pred_ang4_3.do_filter4x4, so no
; EFLAGS-modifying instruction may be placed after the cmp.
859 | cglobal intra_pred_ang4_8, 3,4,5 |
860 | cmp r4m, byte 28 |
861 | cmove r2, r3mp |
862 | lea r3, [ang_table + 13 * 16] |
863 | movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] |
864 | palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] |
865 | punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
866 | punpcklqdq m0, m0 |
867 | mova m2, m0 |
868 | ||
; weights: the bracketed number is the ang_table row index
869 | movh m3, [r3 - 8 * 16] ; [ 5] |
870 | movhps m3, [r3 - 3 * 16] ; [10] |
871 | movh m4, [r3 + 2 * 16] ; [15] |
872 | movhps m4, [r3 + 7 * 16] ; [20] |
873 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
874 | ||
875 | ||
; 4x4 angular prediction for mode 9 (dirMode == 27 selects the refAbove
; argument and, through ZF, skips the transpose in the shared tail).
; All four rows use the same reference pairs (m0 == m2); only the weights
; differ. Tail-jumps into intra_pred_ang4_3.do_filter4x4, so no
; EFLAGS-modifying instruction may be placed after the cmp.
876 | cglobal intra_pred_ang4_9, 3,4,5 |
877 | cmp r4m, byte 27 |
878 | cmove r2, r3mp |
879 | lea r3, [ang_table + 4 * 16] |
880 | movh m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] |
881 | palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] |
882 | punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] |
883 | punpcklqdq m0, m0 |
884 | mova m2, m0 |
885 | ||
; weights: the bracketed number is the ang_table row index
886 | movh m3, [r3 - 2 * 16] ; [ 2] |
887 | movhps m3, [r3 - 0 * 16] ; [ 4] |
888 | movh m4, [r3 + 2 * 16] ; [ 6] |
889 | movhps m4, [r3 + 4 * 16] ; [ 8] |
890 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
891 | ||
892 | ||
; 4x4 angular prediction for mode 10: row y is filled with one reference byte
; from [r2 + 1 + y]. Rows 1-3 are stored first; row 0 is stored at .quit,
; either as-is or, when bFilter (r5m) != 0, after adding
; (above[x + 1] - above[0]) >> 1 per column, using the refAbove argument.
; Registers: r0 = dst, r1 = dstStride (tripled before row 3), r2 = refLeft.
893 | cglobal intra_pred_ang4_10, 3,3,4 |
894 | movd m0, [r2 + 1] ; [4 3 2 1] (movd loads 4 bytes) |
; pb_unpackbd1 is defined elsewhere - presumably it replicates each of the
; 4 bytes across one dword; confirm against its definition
895 | pshufb m0, [pb_unpackbd1] |
896 | ||
897 | pshufd m1, m0, 1 |
898 | movhlps m2, m0 |
899 | pshufd m3, m0, 3 |
900 | movd [r0 + r1], m1 |
901 | movd [r0 + r1 * 2], m2 |
902 | lea r1, [r1 * 3] |
903 | movd [r0 + r1], m3 |
904 | ||
905 | cmp r5m, byte 0 |
906 | jz .quit |
907 | ||
908 | ; filter |
909 | mov r2, r3mp |
910 | pmovzxbw m0, m0 ; [-1 -1 -1 -1] |
911 | movh m1, [r2] ; [4 3 2 1 0] |
912 | pshufb m2, m1, [pb_0_8] ; [0 0 0 0] |
913 | pshufb m1, [pb_unpackbw1] ; [4 3 2 1] |
914 | psubw m1, m2 |
915 | psraw m1, 1 |
916 | paddw m0, m1 |
917 | packuswb m0, m0 |
918 | ||
919 | .quit: |
920 | movd [r0], m0 |
921 | RET |
922 | ||
923 | ||
; 4x4 angular prediction for mode 26: every row is a copy of the 4 bytes at
; [refAbove + 1]. When bFilter (r5m) != 0, column 0 of each row is then
; replaced by above[1] + ((left[y + 1] - left[0]) >> 1), saturated to a byte.
; Registers: r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (reused as
; 3 * dstStride after the load).
924 | INIT_XMM sse4 |
925 | cglobal intra_pred_ang4_26, 4,4,3 |
926 | movd m0, [r3 + 1] ; [4 3 2 1] (movd loads 4 bytes) |
927 | ||
928 | ; store |
929 | movd [r0], m0 |
930 | movd [r0 + r1], m0 |
931 | movd [r0 + r1 * 2], m0 |
932 | lea r3, [r1 * 3] |
933 | movd [r0 + r3], m0 |
934 | ||
935 | ; filter |
936 | cmp r5m, byte 0 |
937 | jz .quit |
938 | ||
939 | pshufb m0, [pb_0_8] ; [ 1 1 1 1] |
940 | movh m1, [r2] ; [-4 -3 -2 -1 0] |
941 | pshufb m2, m1, [pb_0_8] ; [0 0 0 0] |
942 | pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1] |
943 | psubw m1, m2 |
944 | psraw m1, 1 |
945 | paddw m0, m1 |
946 | packuswb m0, m0 |
947 | ||
; overwrite only the first pixel of each row
948 | pextrb [r0], m0, 0 |
949 | pextrb [r0 + r1], m0, 1 |
950 | pextrb [r0 + r1 * 2], m0, 2 |
951 | pextrb [r0 + r3], m0, 3 |
952 | ||
953 | .quit: |
954 | RET |
955 | ||
956 | ||
; 4x4 angular prediction for mode 11 (dirMode == 25 selects the refAbove
; argument and, through ZF, skips the transpose in the shared tail).
; The reference is read from [r2] (index 0 included); all four rows use the
; same reference pairs (m0 == m2). Tail-jumps into
; intra_pred_ang4_3.do_filter4x4, so no EFLAGS-modifying instruction may be
; placed after the cmp.
957 | cglobal intra_pred_ang4_11, 3,4,5 |
958 | cmp r4m, byte 25 |
959 | cmove r2, r3mp |
960 | lea r3, [ang_table + 24 * 16] |
961 | movh m0, [r2] ; [x x x 4 3 2 1 0] |
962 | palignr m1, m0, 1 ; [x x x x 4 3 2 1] |
963 | punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] |
964 | punpcklqdq m0, m0 |
965 | mova m2, m0 |
966 | ||
; weights: the bracketed number is the ang_table row index (base 24 + offset)
967 | movh m3, [r3 + 6 * 16] ; [30] |
968 | movhps m3, [r3 + 4 * 16] ; [28] |
969 | movh m4, [r3 + 2 * 16] ; [26] |
970 | movhps m4, [r3 + 0 * 16] ; [24] |
971 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
972 | ||
973 | ||
; 4x4 angular prediction for mode 12 (dirMode == 24 selects the refAbove
; argument and, through ZF, skips the transpose in the shared tail).
; The reference is read from [r2] (index 0 included); all four rows use the
; same reference pairs (m0 == m2). Tail-jumps into
; intra_pred_ang4_3.do_filter4x4, so no EFLAGS-modifying instruction may be
; placed after the cmp.
974 | cglobal intra_pred_ang4_12, 3,4,5 |
975 | cmp r4m, byte 24 |
976 | cmove r2, r3mp |
977 | lea r3, [ang_table + 20 * 16] |
978 | movh m0, [r2] ; [x x x 4 3 2 1 0] |
979 | palignr m1, m0, 1 ; [x x x x 4 3 2 1] |
980 | punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] |
981 | punpcklqdq m0, m0 |
982 | mova m2, m0 |
983 | ||
; weights: the bracketed number is the ang_table row index
984 | movh m3, [r3 + 7 * 16] ; [27] |
985 | movhps m3, [r3 + 2 * 16] ; [22] |
986 | movh m4, [r3 - 3 * 16] ; [17] |
987 | movhps m4, [r3 - 8 * 16] ; [12] |
988 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
989 | ||
990 | ||
; 4x4 angular prediction for mode 13 (and 23). When dirMode == 23 the two
; reference pointers are swapped, so r2 is always the main reference and r3
; the other one; ZF from the cmp also tells the shared tail to skip the
; transpose. xchg, the SIMD moves and lea leave EFLAGS untouched, which the
; tail-jump into intra_pred_ang4_3.do_filter4x4 relies on.
991 | cglobal intra_pred_ang4_13, 4,4,5 |
992 | cmp r4m, byte 23 |
993 | jnz .load |
994 | xchg r2, r3 |
995 | .load: |
996 | movh m1, [r2 - 1] ; [x x 4 3 2 1 0 x] |
997 | palignr m0, m1, 1 ; [x x x 4 3 2 1 0] |
998 | palignr m2, m1, 2 ; [x x x x 4 3 2 1] |
; byte 0 (the sample placed before index 0) is taken from the other
; reference array at offset 4
999 | pinsrb m1, [r3 + 4], 0 |
1000 | punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x] |
1001 | punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0] |
1002 | punpcklqdq m2, m0, m1 |
1003 | punpcklqdq m0, m0 |
1004 | ||
; weights: the bracketed number is the ang_table row index
1005 | lea r3, [ang_table + 21 * 16] |
1006 | movh m3, [r3 + 2 * 16] ; [23] |
1007 | movhps m3, [r3 - 7 * 16] ; [14] |
1008 | movh m4, [r3 - 16 * 16] ; [ 5] |
1009 | movhps m4, [r3 + 7 * 16] ; [28] |
1010 | jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) |
1011 | ||
1012 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang4_14
; Set-up stage of a 4x4 angular prediction that needs one extra sample in
; front of the main reference.  r2 is the main reference and r3 the other
; one; the two pointers are exchanged when the mode argument (r4m) is 22.
; Prepares operands and tail-jumps into intra_pred_ang4_3.do_filter4x4
; (defined outside this block), which finishes the prediction and returns.
;   m0 : sample pairs starting at main[0], duplicated in both qwords
;   m2 : sample pairs starting at the inserted sample other[2], duplicated
;   m3 : ang_table rows 19 / 6       m4 : ang_table rows 25 / 12
; EFLAGS from the cmp stay untouched up to the jump.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_14, 4,4,5
    cmp         r4m, byte 22
    jne         .load
    xchg        r2, r3                      ; mode 22: swap main and other reference
.load:
    movh        m2, [r2 - 1]                ; [x x 4 3 2 1 0 x]  (starts one byte before r2)
    palignr     m0, m2, 1                   ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2                   ; [x x x x 4 3 2 1]
    pinsrb      m2, [r3 + 2], 0             ; byte 0 <- other[2]
    punpcklbw   m2, m0                      ; [3 2 2 1 1 0 0 x]  (must read m0 before it is rewritten)
    punpcklbw   m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m2, m2                      ; duplicate the low qword
    punpcklqdq  m0, m0                      ; duplicate the low qword

    lea         r3, [ang_table + 19 * 16]   ; r3 is free once the insert above is done
    movh        m3, [r3]                    ; row 19
    movhps      m3, [r3 - 13 * 16]          ; row  6
    movh        m4, [r3 + 6 * 16]           ; row 25
    movhps      m4, [r3 - 7 * 16]           ; row 12
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1033 | ||
1034 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang4_15
; Set-up stage of a 4x4 angular prediction that needs two extra samples in
; front of the main reference.  r2 is the main reference and r3 the other
; one; the two pointers are exchanged when the mode argument (r4m) is 21.
; Prepares operands and tail-jumps into intra_pred_ang4_3.do_filter4x4
; (defined outside this block), which finishes the prediction and returns.
;   x = other[2], y = other[4] are the inserted samples
;   m0 : low = pairs starting at main[0], high = pairs starting at x
;   m2 : low = pairs starting at x,       high = pairs starting at y
;   m3 : ang_table rows 15 / 30      m4 : ang_table rows 13 / 28
; EFLAGS from the cmp stay untouched up to the jump.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_15, 4,4,5
    cmp         r4m, byte 21
    jne         .load
    xchg        r2, r3                      ; mode 21: swap main and other reference
.load:
    movh        m2, [r2 - 1]                ; [x x 4 3 2 1 0 x]  (starts one byte before r2)
    palignr     m0, m2, 1                   ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2                   ; [x x x x 4 3 2 1]
    pinsrb      m2, [r3 + 2], 0             ; byte 0 <- other[2]  (x)
    mova        m3, m2
    pslldq      m3, 1                       ; [x 4 3 2 1 0 x y]
    pinsrb      m3, [r3 + 4], 0             ; byte 0 <- other[4]  (y)
    mova        m4, m3
    punpcklbw   m4, m2                      ; [2 1 1 0 0 x x y]
    punpcklbw   m2, m0                      ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m2                      ; high qword <- pairs starting at x
    punpcklqdq  m2, m4                      ; high qword <- pairs starting at y

    lea         r3, [ang_table + 23 * 16]   ; r3 is free once both inserts are done
    movh        m3, [r3 - 8 * 16]           ; row 15
    movhps      m3, [r3 + 7 * 16]           ; row 30
    movh        m4, [r3 - 10 * 16]          ; row 13
    movhps      m4, [r3 + 5 * 16]           ; row 28
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1058 | ||
1059 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang4_16
; Set-up stage of a 4x4 angular prediction that needs two extra samples in
; front of the main reference.  r2 is the main reference and r3 the other
; one; the two pointers are exchanged when the mode argument (r4m) is 20.
; Prepares operands and tail-jumps into intra_pred_ang4_3.do_filter4x4
; (defined outside this block), which finishes the prediction and returns.
;   x = other[2], y = other[3] are the inserted samples
;   m0 : low = pairs starting at main[0], high = pairs starting at x
;   m2 : low = pairs starting at x,       high = pairs starting at y
;   m3 : ang_table rows 11 / 22      m4 : ang_table rows 1 / 12
; EFLAGS from the cmp stay untouched up to the jump.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_16, 4,4,5
    cmp         r4m, byte 20
    jne         .load
    xchg        r2, r3                      ; mode 20: swap main and other reference
.load:
    movh        m2, [r2 - 1]                ; [x x 4 3 2 1 0 x]  (starts one byte before r2)
    palignr     m0, m2, 1                   ; [x x x 4 3 2 1 0]
    palignr     m1, m2, 2                   ; [x x x x 4 3 2 1]
    pinsrb      m2, [r3 + 2], 0             ; byte 0 <- other[2]  (x)
    mova        m3, m2
    pslldq      m3, 1                       ; [x 4 3 2 1 0 x y]
    pinsrb      m3, [r3 + 3], 0             ; byte 0 <- other[3]  (y)
    mova        m4, m3
    punpcklbw   m4, m2                      ; [2 1 1 0 0 x x y]
    punpcklbw   m2, m0                      ; [3 2 2 1 1 0 0 x]
    punpcklbw   m0, m1                      ; [4 3 3 2 2 1 1 0]
    punpcklqdq  m0, m2                      ; high qword <- pairs starting at x
    punpcklqdq  m2, m4                      ; high qword <- pairs starting at y

    lea         r3, [ang_table + 19 * 16]   ; r3 is free once both inserts are done
    movh        m3, [r3 - 8 * 16]           ; row 11
    movhps      m3, [r3 + 3 * 16]           ; row 22
    movh        m4, [r3 - 18 * 16]          ; row  1
    movhps      m4, [r3 - 7 * 16]           ; row 12
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1083 | ||
1084 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang4_17
; Set-up stage of a 4x4 angular prediction that needs three extra samples in
; front of the main reference.  r2 is the main reference and r3 the other
; one; the two pointers are exchanged when the mode argument (r4m) is 19.
; Prepares operands and tail-jumps into intra_pred_ang4_3.do_filter4x4
; (defined outside this block), which finishes the prediction and returns.
;   x = other[1], y = other[2], z = other[4] are the inserted samples
;   m0 : low = pairs starting at main[0], high = pairs starting at x
;   m2 : low = pairs starting at y,       high = pairs starting at z
;   m3 : ang_table rows 6 / 12       m4 : ang_table rows 18 / 24
; EFLAGS from the cmp stay untouched up to the jump.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_17, 4,4,5
    cmp         r4m, byte 19
    jne         .load
    xchg        r2, r3                      ; mode 19: swap main and other reference
.load:
    movh        m3, [r2 - 1]                ; [- - 4 3 2 1 0 x]  (starts one byte before r2)
    palignr     m0, m3, 1                   ; [- - - 4 3 2 1 0]
    palignr     m1, m3, 2                   ; [- - - - 4 3 2 1]
    mova        m4, m0                      ; keep the un-interleaved samples for the x pairs
    punpcklbw   m0, m1                      ; [4 3 3 2 2 1 1 0]

    pinsrb      m3, [r3 + 1], 0             ; byte 0 <- other[1]  (x)
    mova        m1, m3
    punpcklbw   m1, m4                      ; [3 2 2 1 1 0 0 x]
    punpcklqdq  m0, m1                      ; high qword <- pairs starting at x

    mova        m2, m3
    pslldq      m2, 1                       ; [- 4 3 2 1 0 x y]
    pinsrb      m2, [r3 + 2], 0             ; byte 0 <- other[2]  (y)
    mova        m1, m2
    pslldq      m1, 1                       ; [4 3 2 1 0 x y z]
    pinsrb      m1, [r3 + 4], 0             ; byte 0 <- other[4]  (z)
    punpcklbw   m1, m2                      ; [1 0 0 x x y y z]
    punpcklbw   m2, m3                      ; [2 1 1 0 0 x x y]
    punpcklqdq  m2, m1                      ; high qword <- pairs starting at z

    lea         r3, [ang_table + 14 * 16]   ; r3 is free once all inserts are done
    movh        m3, [r3 - 8 * 16]           ; row  6
    movhps      m3, [r3 - 2 * 16]           ; row 12
    movh        m4, [r3 + 4 * 16]           ; row 18
    movhps      m4, [r3 + 10 * 16]          ; row 24
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4)
1114 | ||
1115 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang4_18
; 4x4 prediction that copies samples without any filtering.
;   r0 = destination, r1 = destination stride, r2 / r3 = the two references.
; The first four bytes at [r2] are byte-reversed and followed by the four
; bytes at [r3 + 1]; each output row is a 4-byte window into that sequence,
; sliding by one byte per row.  Rows are written bottom-up (row 3 first).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang4_18, 4,4,1
    mov         r2d, [r2]
    bswap       r2d                         ; reverse the byte order of r2[0..3]
    movd        m0, r2d
    pinsrd      m0, [r3 + 1], 1             ; [- 3 2 1 0 -1 -2 -3]
    lea         r3, [r1 * 3]                ; r3 is no longer needed as a pointer

    movd        [r0 + r3], m0               ; row 3
    psrldq      m0, 1
    movd        [r0 + r1 * 2], m0           ; row 2
    psrldq      m0, 1
    movd        [r0 + r1], m0               ; row 1
    psrldq      m0, 1
    movd        [r0], m0                    ; row 0
    RET
1130 | ;----------------------------------------------------------------------------- | |
1131 | ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) | |
1132 | ;----------------------------------------------------------------------------- | |
INIT_XMM ssse3
;-----------------------------------------------------------------------------
; intra_pred_ang8_2
; 8x8 prediction that copies samples without interpolation.
;   r0 = dst, r1 = dstStride, r2 = refLeft, arg 3 = refAbove, r4m = dirMode
; The source is refLeft, or refAbove when dirMode == 34.  Row k receives the
; eight bytes starting at ref[2 + k].
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_2, 3,5,2
    cmp         r4m, byte 34
    cmovz       r2, r3mp                    ; dirMode 34: read from arg 3 instead
    movu        m0, [r2 + 2]
    lea         r4, [r1 * 3]                ; r4 = 3 * stride
    lea         r3, [r0 + r1 * 4]           ; r3 = address of row 4 (r3mp already consumed above)

    ; Only the low 8 bytes of m1 are stored, and for shifts 1..7 they come
    ; entirely from m0, so the previous contents of m1 never matter.
    movh        [r0], m0                    ; row 0
    palignr     m1, m0, 1
    movh        [r0 + r1], m1               ; row 1
    palignr     m1, m0, 2
    movh        [r0 + r1 * 2], m1           ; row 2
    palignr     m1, m0, 3
    movh        [r0 + r4], m1               ; row 3
    palignr     m1, m0, 4
    movh        [r3], m1                    ; row 4
    palignr     m1, m0, 5
    movh        [r3 + r1], m1               ; row 5
    palignr     m1, m0, 6
    movh        [r3 + r1 * 2], m1           ; row 6
    palignr     m1, m0, 7
    movh        [r3 + r4], m1               ; row 7
    RET
1157 | ||
INIT_XMM sse4
;-----------------------------------------------------------------------------
; intra_pred_ang8_3
; 8x8 angular prediction with interpolation between neighbouring samples.
;   r0 = dst, r1 = dstStride, r2 = refLeft, arg 3 = refAbove, r4m = dirMode
; The source is refLeft, or refAbove when dirMode == 33.  Each of the eight
; output lines is pmaddubsw(sample pairs, ang_table row) followed by pmulhrsw
; with pw_1024 (presumably a rounded >> 5 -- pw_1024 is defined elsewhere).
; Lines are packed as  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  (low|high qword).
;
; .transpose8x8 / .store form a shared tail: other intra_pred_ang8_* functions
; jump to .transpose8x8 with the same register layout and with ZF still
; holding the result of their own dirMode compare.  ZF set -> lines are
; stored as rows unchanged; ZF clear -> the 8x8 block is transposed first.
; NOTE(review): since those functions return through the RET below, their
; cglobal register counts presumably have to stay compatible with this one.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_3, 3,5,8
    cmp         r4m, byte 33
    cmovz       r2, r3mp                    ; dirMode 33: read from arg 3 instead
    lea         r3, [ang_table + 22 * 16]
    lea         r4, [ang_table +  8 * 16]
    mova        m3, [pw_1024]

    movu        m0, [r2 + 1]                ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1                   ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m0, m1                  ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; lines 0 and 1
    palignr     m1, m2, m0, 2               ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    pmaddubsw   m4, m0, [r3 + 4 * 16]       ; row 26
    pmaddubsw   m1, [r3 - 2 * 16]           ; row 20
    pmulhrsw    m4, m3
    pmulhrsw    m1, m3
    packuswb    m4, m1

    ; lines 2 and 3
    palignr     m5, m2, m0, 4               ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    palignr     m6, m2, m0, 6               ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
    pmaddubsw   m5, [r3 - 8 * 16]           ; row 14
    pmaddubsw   m6, [r4]                    ; row  8
    pmulhrsw    m5, m3
    pmulhrsw    m6, m3
    packuswb    m5, m6

    ; lines 4 and 5 (both use the same sample pairs)
    palignr     m1, m2, m0, 8               ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
    mova        m6, m1
    pmaddubsw   m6, [r4 - 6 * 16]           ; row  2
    pmaddubsw   m1, [r3 + 6 * 16]           ; row 28
    pmulhrsw    m6, m3
    pmulhrsw    m1, m3
    packuswb    m6, m1

    ; lines 6 and 7
    palignr     m1, m2, m0, 10              ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
    palignr     m2, m0, 12                  ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
    pmaddubsw   m1, [r3]                    ; row 22
    pmaddubsw   m2, [r3 - 6 * 16]           ; row 16
    pmulhrsw    m1, m3
    pmulhrsw    m2, m3
    packuswb    m1, m2
    jmp         .transpose8x8

ALIGN 16
.transpose8x8:
    je          .store                      ; ZF from the caller's dirMode compare: no transpose

    ; transpose 8x8 (bytes -> byte pairs -> dwords)
    punpckhbw   m0, m4, m5
    punpcklbw   m4, m5
    punpckhbw   m2, m4, m0
    punpcklbw   m4, m0

    punpckhbw   m0, m6, m1
    punpcklbw   m6, m1
    punpckhbw   m1, m6, m0
    punpcklbw   m6, m0

    punpckhdq   m5, m4, m6
    punpckldq   m4, m6
    punpckldq   m6, m2, m1
    punpckhdq   m2, m1
    mova        m1, m2                      ; .store expects the last two rows in m1

.store:
    lea         r4, [r1 * 3]                ; r4 = 3 * stride
    movh        [r0], m4                    ; row 0
    movhps      [r0 + r1], m4               ; row 1
    movh        [r0 + r1 * 2], m5           ; row 2
    movhps      [r0 + r4], m5               ; row 3
    add         r0, r4                      ; r0 -> row 3
    movh        [r0 + r1], m6               ; row 4
    movhps      [r0 + r1 * 2], m6           ; row 5
    movh        [r0 + r4], m1               ; row 6
    movhps      [r0 + r1 * 4], m1           ; row 7
    RET
1244 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_4
; 8x8 angular prediction with interpolation between neighbouring samples.
;   r0 = dst, r1 = dstStride, r2 = refLeft, arg 3 = refAbove, r4m = dirMode
; The source is refLeft, or refAbove when dirMode == 32.  Produces the packed
; lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and tail-jumps to
; intra_pred_ang8_3.transpose8x8, which tests the still-live ZF of the cmp
; below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_4, 3,5,8
    cmp         r4m, byte 32
    cmovz       r2, r3mp                    ; dirMode 32: read from arg 3 instead
    lea         r3, [ang_table + 24 * 16]
    lea         r4, [ang_table + 10 * 16]
    mova        m3, [pw_1024]

    movu        m0, [r2 + 1]                ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1                   ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m0, m1                  ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m2, m0, 2               ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    ; line 2 shares its sample pairs with line 1, so start it before m1 is consumed
    pmaddubsw   m5, m1, [r3 + 7 * 16]       ; row 31

    ; lines 0 and 1
    pmaddubsw   m4, m0, [r3 - 3 * 16]       ; row 21
    pmaddubsw   m1, [r4]                    ; row 10
    pmulhrsw    m4, m3
    pmulhrsw    m1, m3
    packuswb    m4, m1

    ; lines 2 and 3
    pmulhrsw    m5, m3
    palignr     m6, m2, m0, 4               ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    pmaddubsw   m6, [r3 - 4 * 16]           ; row 20
    pmulhrsw    m6, m3
    packuswb    m5, m6

    ; lines 4 and 5 (both use the same sample pairs)
    palignr     m1, m2, m0, 6               ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
    mova        m6, m1
    pmaddubsw   m6, [r4 - 1 * 16]           ; row  9
    pmaddubsw   m1, [r3 + 6 * 16]           ; row 30
    pmulhrsw    m6, m3
    pmulhrsw    m1, m3
    packuswb    m6, m1

    ; lines 6 and 7
    palignr     m1, m2, m0, 8               ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
    palignr     m2, m0, 10                  ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
    pmaddubsw   m1, [r3 - 5 * 16]           ; row 19
    pmaddubsw   m2, [r4 - 2 * 16]           ; row  8
    pmulhrsw    m1, m3
    pmulhrsw    m2, m3
    packuswb    m1, m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1295 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_5
; 8x8 angular prediction with interpolation between neighbouring samples.
;   r0 = dst, r1 = dstStride, r2 = refLeft, arg 3 = refAbove, r4m = dirMode
; The source is refLeft, or refAbove when dirMode == 31.  Produces the packed
; lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and tail-jumps to
; intra_pred_ang8_3.transpose8x8, which tests the still-live ZF of the cmp
; below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_5, 3,5,8
    cmp         r4m, byte 31
    cmovz       r2, r3mp                    ; dirMode 31: read from arg 3 instead
    lea         r3, [ang_table + 17 * 16]
    lea         r4, [ang_table +  2 * 16]
    mova        m3, [pw_1024]

    movu        m0, [r2 + 1]                ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1                   ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m0, m1                  ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m1, m2, m0, 2               ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    ; line 2 shares its sample pairs with line 1, so start it before m1 is consumed
    pmaddubsw   m5, m1, [r3 + 2 * 16]       ; row 19

    ; lines 0 and 1
    pmaddubsw   m4, m0, [r3]                ; row 17
    pmaddubsw   m1, [r4]                    ; row  2
    pmulhrsw    m4, m3
    pmulhrsw    m1, m3
    packuswb    m4, m1

    ; lines 2 and 3
    pmulhrsw    m5, m3
    palignr     m6, m2, m0, 4               ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    pmaddubsw   m1, m6, [r4 + 2 * 16]       ; row  4
    pmulhrsw    m1, m3
    packuswb    m5, m1

    ; lines 4 and 5
    pmaddubsw   m6, [r3 + 4 * 16]           ; row 21
    pmulhrsw    m6, m3
    palignr     m1, m2, m0, 6               ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
    pmaddubsw   m7, m1, [r4 + 4 * 16]       ; row  6
    pmulhrsw    m7, m3
    packuswb    m6, m7

    ; lines 6 and 7
    pmaddubsw   m1, [r3 + 6 * 16]           ; row 23
    pmulhrsw    m1, m3
    palignr     m2, m0, 8                   ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
    pmaddubsw   m2, [r4 + 6 * 16]           ; row  8
    pmulhrsw    m2, m3
    packuswb    m1, m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1346 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_6
; 8x8 angular prediction with interpolation between neighbouring samples.
;   r0 = dst, r1 = dstStride, r2 = refLeft, arg 3 = refAbove, r4m = dirMode
; The source is refLeft, or refAbove when dirMode == 30.  Produces the packed
; lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and tail-jumps to
; intra_pred_ang8_3.transpose8x8, which tests the still-live ZF of the cmp
; below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_6, 3,5,8
    cmp         r4m, byte 30
    cmovz       r2, r3mp                    ; dirMode 30: read from arg 3 instead
    lea         r3, [ang_table + 20 * 16]
    lea         r4, [ang_table +  8 * 16]
    mova        m7, [pw_1024]

    movu        m0, [r2 + 1]                ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1                   ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m0, m1                  ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; lines 0 and 1 (same sample pairs)
    pmaddubsw   m4, m0, [r3 - 7 * 16]       ; row 13
    pmaddubsw   m1, m0, [r3 + 6 * 16]       ; row 26
    pmulhrsw    m4, m7
    pmulhrsw    m1, m7
    packuswb    m4, m1

    ; lines 2 and 3 (same sample pairs)
    palignr     m6, m2, m0, 2               ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    mova        m5, m6
    pmaddubsw   m5, [r4 - 1 * 16]           ; row  7
    pmaddubsw   m6, [r3]                    ; row 20
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    ; lines 4, 5 and 6 share one set of sample pairs
    palignr     m1, m2, m0, 4               ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    pmaddubsw   m6, m1, [r4 - 7 * 16]       ; row  1
    pmaddubsw   m3, m1, [r3 - 6 * 16]       ; row 14
    pmulhrsw    m6, m7
    pmulhrsw    m3, m7
    packuswb    m6, m3

    pmaddubsw   m1, [r3 + 7 * 16]           ; row 27
    pmulhrsw    m1, m7

    ; line 7
    palignr     m2, m0, 6                   ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
    pmaddubsw   m2, [r4]                    ; row  8
    pmulhrsw    m2, m7
    packuswb    m1, m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1395 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_7
; 8x8 angular prediction with interpolation between neighbouring samples.
;   r0 = dst, r1 = dstStride, r2 = refLeft, arg 3 = refAbove, r4m = dirMode
; The source is refLeft, or refAbove when dirMode == 29.  Produces the packed
; lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and tail-jumps to
; intra_pred_ang8_3.transpose8x8, which tests the still-live ZF of the cmp
; below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_7, 3,5,8
    cmp         r4m, byte 29
    cmovz       r2, r3mp                    ; dirMode 29: read from arg 3 instead
    lea         r3, [ang_table + 24 * 16]
    lea         r4, [ang_table +  6 * 16]
    mova        m7, [pw_1024]

    movu        m0, [r2 + 1]                ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1                   ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m0, m1                  ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; lines 0, 1 and 2 use the unshifted sample pairs
    pmaddubsw   m4, m0, [r4 + 3 * 16]       ; row  9
    pmaddubsw   m3, m0, [r3 - 6 * 16]       ; row 18
    pmaddubsw   m5, m0, [r3 + 3 * 16]       ; row 27
    pmulhrsw    m4, m7
    pmulhrsw    m3, m7
    pmulhrsw    m5, m7
    packuswb    m4, m3

    ; lines 3, 4, 5 and 6 use the pairs shifted by one sample
    palignr     m1, m2, m0, 2               ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    pmaddubsw   m6, m1, [r4 - 2 * 16]       ; row  4
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m1, [r4 + 7 * 16]       ; row 13
    pmaddubsw   m3, m1, [r3 - 2 * 16]       ; row 22
    pmulhrsw    m6, m7
    pmulhrsw    m3, m7
    packuswb    m6, m3

    pmaddubsw   m1, [r3 + 7 * 16]           ; row 31
    pmulhrsw    m1, m7

    ; line 7
    palignr     m2, m0, 4                   ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    pmaddubsw   m2, [r4 + 2 * 16]           ; row  8
    pmulhrsw    m2, m7
    packuswb    m1, m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1441 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_8
; 8x8 angular prediction with interpolation between neighbouring samples.
;   r0 = dst, r1 = dstStride, r2 = refLeft, arg 3 = refAbove, r4m = dirMode
; The source is refLeft, or refAbove when dirMode == 28.  Produces the packed
; lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and tail-jumps to
; intra_pred_ang8_3.transpose8x8, which tests the still-live ZF of the cmp
; below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_8, 3,5,8
    cmp         r4m, byte 28
    cmovz       r2, r3mp                    ; dirMode 28: read from arg 3 instead
    lea         r3, [ang_table + 23 * 16]
    lea         r4, [ang_table +  8 * 16]
    mova        m7, [pw_1024]

    movu        m0, [r2 + 1]                ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1                   ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m0, m1                  ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m0, m1                      ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]
    palignr     m2, m0, 2                   ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]

    ; lines 6 and 7 use the pairs shifted by one sample
    mova        m1, m2
    pmaddubsw   m1, [r4 - 5 * 16]           ; row  3
    pmaddubsw   m2, [r4]                    ; row  8
    pmulhrsw    m1, m7
    pmulhrsw    m2, m7
    packuswb    m1, m2

    ; lines 0 to 5 all use the unshifted sample pairs
    pmaddubsw   m4, m0, [r4 - 3 * 16]       ; row  5
    pmaddubsw   m3, m0, [r4 + 2 * 16]       ; row 10
    pmulhrsw    m4, m7
    pmulhrsw    m3, m7
    packuswb    m4, m3

    pmaddubsw   m5, m0, [r3 - 8 * 16]       ; row 15
    pmaddubsw   m6, m0, [r3 - 3 * 16]       ; row 20
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m0, [r3 + 2 * 16]       ; row 25
    pmaddubsw   m0, [r3 + 7 * 16]           ; row 30
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7
    packuswb    m6, m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1483 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_9
; 8x8 angular prediction with interpolation between neighbouring samples.
;   r0 = dst, r1 = dstStride, r2 = refLeft, arg 3 = refAbove, r4m = dirMode
; The source is refLeft, or refAbove when dirMode == 27.  All eight lines use
; the same sample pairs and differ only in the ang_table row (2, 4, ... 16).
; Produces the packed lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and
; tail-jumps to intra_pred_ang8_3.transpose8x8, which tests the still-live ZF
; of the cmp below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_9, 3,5,8
    cmp         r4m, byte 27
    cmovz       r2, r3mp                    ; dirMode 27: read from arg 3 instead
    lea         r3, [ang_table + 10 * 16]
    mova        m7, [pw_1024]

    movu        m0, [r2 + 1]                ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m0, 1                   ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpcklbw   m0, m1                      ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; lines 0 and 1
    pmaddubsw   m4, m0, [r3 - 8 * 16]       ; row  2
    pmaddubsw   m3, m0, [r3 - 6 * 16]       ; row  4
    pmulhrsw    m4, m7
    pmulhrsw    m3, m7
    packuswb    m4, m3

    ; lines 2 and 3
    pmaddubsw   m5, m0, [r3 - 4 * 16]       ; row  6
    pmaddubsw   m6, m0, [r3 - 2 * 16]       ; row  8
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    ; lines 4 and 5
    pmaddubsw   m6, m0, [r3]                ; row 10
    pmaddubsw   m2, m0, [r3 + 2 * 16]       ; row 12
    pmulhrsw    m6, m7
    pmulhrsw    m2, m7
    packuswb    m6, m2

    ; lines 6 and 7
    pmaddubsw   m1, m0, [r3 + 4 * 16]       ; row 14
    pmaddubsw   m0, [r3 + 6 * 16]           ; row 16
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1522 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_10
; 8x8 prediction built from the eight samples at refLeft[1..8].
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r5m = bFilter
; Each register is shuffled with pb_unpackbq (defined outside this chunk;
; presumably it replicates byte 0 across the low qword and byte 1 across the
; high qword -- confirm) and then supplies two output rows.
; Rows 1..7 are stored first.  Row 0 is stored last at .quit: unfiltered when
; bFilter == 0, otherwise as  clip(row0 + ((refAbove[1 + i] - refAbove[0]) >> 1)).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_10, 4,5,5
    movh        m0, [r2 + 1]
    mova        m4, [pb_unpackbq]
    palignr     m1, m0, 2                   ; low bytes = samples 2, 3
    palignr     m2, m0, 4                   ; low bytes = samples 4, 5
    palignr     m3, m0, 6                   ; low bytes = samples 6, 7
    pshufb      m1, m4
    pshufb      m2, m4
    pshufb      m3, m4
    pshufb      m0, m4                      ; must come last: m0 feeds the shifts above

    lea         r4, [r1 * 3]                ; r4 = 3 * stride
    movhps      [r0 + r1], m0               ; row 1 (row 0 is written at .quit)
    movh        [r0 + r1 * 2], m1           ; row 2
    movhps      [r0 + r4], m1               ; row 3
    lea         r2, [r0 + r1 * 4]           ; r2 -> row 4
    movh        [r2], m2                    ; row 4
    movhps      [r2 + r1], m2               ; row 5
    movh        [r2 + r1 * 2], m3           ; row 6
    movhps      [r2 + r4], m3               ; row 7

    ; optional filtering of row 0
    cmp         r5m, byte 0
    je          .quit

    movu        m1, [r3]
    palignr     m2, m1, 1                   ; refAbove[1..]
    pmovzxbw    m2, m2
    pshufb      m1, m4                      ; low qword = refAbove[0] in every byte
    pmovzxbw    m1, m1
    pmovzxbw    m0, m0                      ; row 0 widened to words
    psubw       m2, m1                      ; refAbove[1 + i] - refAbove[0]
    psraw       m2, 1
    paddw       m0, m2
    packuswb    m0, m0                      ; saturate back to bytes

.quit:
    movh        [r0], m0                    ; row 0 (filtered or not)
    RET
1562 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_26
; 8x8 prediction that copies the eight samples refAbove[1..8] into every row.
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r5m = bFilter
; When bFilter != 0 the first byte of each row k is then overwritten with
;   clip(refAbove[1] + ((refLeft[1 + k] - refLeft[0]) >> 1)).
; pb_unpackbq is defined outside this chunk; the code relies on its low qword
; broadcasting byte 0 (presumably -- confirm against the table).
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_26, 4,5,3
    movh        m0, [r3 + 1]

    lea         r4, [r1 * 3]                ; r4 = 3 * stride
    movh        [r0], m0                    ; row 0
    movh        [r0 + r1], m0               ; row 1
    movh        [r0 + r1 * 2], m0           ; row 2
    movh        [r0 + r4], m0               ; row 3
    lea         r3, [r0 + r1 * 4]           ; r3 -> row 4 (refAbove pointer no longer needed)
    movh        [r3], m0                    ; row 4
    movh        [r3 + r1], m0               ; row 5
    movh        [r3 + r1 * 2], m0           ; row 6
    movh        [r3 + r4], m0               ; row 7

    ; optional filtering of column 0
    cmp         r5m, byte 0
    je          .quit

    movu        m1, [r2]
    palignr     m2, m1, 1                   ; refLeft[1..]
    pmovzxbw    m2, m2
    pshufb      m1, [pb_unpackbq]           ; low qword = refLeft[0] in every byte
    pmovzxbw    m1, m1
    pshufb      m0, [pb_unpackbq]           ; low qword = refAbove[1] in every byte
    pmovzxbw    m0, m0
    psubw       m2, m1                      ; refLeft[1 + k] - refLeft[0]
    psraw       m2, 1
    paddw       m0, m2
    packuswb    m0, m0                      ; saturate back to bytes

    pextrb      [r0], m0, 0                 ; first byte of rows 0..7
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r4], m0, 3
    pextrb      [r3], m0, 4
    pextrb      [r3 + r1], m0, 5
    pextrb      [r3 + r1 * 2], m0, 6
    pextrb      [r3 + r4], m0, 7

.quit:
    RET
1603 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_11
; 8x8 angular prediction with interpolation between neighbouring samples.
;   r0 = dst, r1 = dstStride, r2 = refLeft, arg 3 = refAbove, r4m = dirMode
; The source is refLeft, or refAbove when dirMode == 25.  All eight lines use
; the sample pairs starting at ref[0] and differ only in the ang_table row
; (30, 28, ... 16).  Produces the packed lines  m4 = 0|1, m5 = 2|3,
; m6 = 4|5, m1 = 6|7  and tail-jumps to intra_pred_ang8_3.transpose8x8, which
; tests the still-live ZF of the cmp below, optionally transposes, stores the
; block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_11, 3,5,8
    cmp         r4m, byte 25
    cmovz       r2, r3mp                    ; dirMode 25: read from arg 3 instead
    lea         r3, [ang_table + 23 * 16]
    mova        m7, [pw_1024]

    movu        m0, [r2]                    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    palignr     m1, m0, 1                   ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    punpcklbw   m0, m1                      ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    ; lines 0 and 1
    pmaddubsw   m4, m0, [r3 + 7 * 16]       ; row 30
    pmaddubsw   m3, m0, [r3 + 5 * 16]       ; row 28
    pmulhrsw    m4, m7
    pmulhrsw    m3, m7
    packuswb    m4, m3

    ; lines 2 and 3
    pmaddubsw   m5, m0, [r3 + 3 * 16]       ; row 26
    pmaddubsw   m6, m0, [r3 + 1 * 16]       ; row 24
    pmulhrsw    m5, m7
    pmulhrsw    m6, m7
    packuswb    m5, m6

    ; lines 4 and 5
    pmaddubsw   m6, m0, [r3 - 1 * 16]       ; row 22
    pmaddubsw   m2, m0, [r3 - 3 * 16]       ; row 20
    pmulhrsw    m6, m7
    pmulhrsw    m2, m7
    packuswb    m6, m2

    ; lines 6 and 7
    pmaddubsw   m1, m0, [r3 - 5 * 16]       ; row 18
    pmaddubsw   m0, [r3 - 7 * 16]           ; row 16
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1642 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_12
; 8x8 angular prediction that needs one extra sample in front of the main
; reference.
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r4m = dirMode
; r2 is the main reference and r3 the other one; they are exchanged when
; dirMode == 24.  'a' below is the inserted sample other[6].
; Produces the packed lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and
; tail-jumps to intra_pred_ang8_3.transpose8x8, which tests the still-live ZF
; of the cmp below (mov, cmov and lea do not alter it), optionally
; transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_12, 4,5,8
    cmp         r4m, byte 24                ; must precede the write to r4 (r4m may alias r4)
    mov         r4, r2
    cmove       r2, r3                      ; dirMode 24: swap main and other reference
    cmove       r3, r4

    lea         r4, [ang_table + 22 * 16]
    mova        m7, [pw_1024]

    movu        m1, [r2]                    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    mova        m0, m1
    pslldq      m0, 1                       ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    pinsrb      m0, [r3 + 6], 0             ; a <- other[6]
    punpckhbw   m2, m0, m1                  ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7]
    punpcklbw   m0, m1                      ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m2, m0, 2                   ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    ; lines 0 and 1
    pmaddubsw   m4, m2, [r4 + 5 * 16]       ; row 27
    pmaddubsw   m3, m2, [r4]                ; row 22
    pmulhrsw    m4, m7
    pmulhrsw    m3, m7
    packuswb    m4, m3

    ; lines 6 and 7 (pairs starting at a)
    pmaddubsw   m1, m0, [r4 + 7 * 16]       ; row 29
    pmaddubsw   m0, [r4 + 2 * 16]           ; row 24
    pmulhrsw    m1, m7
    pmulhrsw    m0, m7
    packuswb    m1, m0

    ; lines 2 and 3
    pmaddubsw   m5, m2, [r4 - 5 * 16]       ; row 17
    pmulhrsw    m5, m7

    lea         r4, [ang_table + 7 * 16]
    pmaddubsw   m6, m2, [r4 + 5 * 16]       ; row 12
    pmulhrsw    m6, m7
    packuswb    m5, m6

    ; lines 4 and 5
    pmaddubsw   m6, m2, [r4]                ; row  7
    pmaddubsw   m2, [r4 - 5 * 16]           ; row  2
    pmulhrsw    m6, m7
    pmulhrsw    m2, m7
    packuswb    m6, m2
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1687 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_13
; 8x8 angular prediction that needs two extra samples in front of the main
; reference.
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r4m = dirMode
; r2 is the main reference and r3 the other one; they are exchanged when
; dirMode == 23.  Inserted samples: a = other[4], b = other[7].
; Produces the packed lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and
; tail-jumps to intra_pred_ang8_3.transpose8x8, which tests the still-live ZF
; of the cmp below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_13, 4,5,8
    cmp         r4m, byte 23                ; must precede the write to r4 (r4m may alias r4)
    mov         r4, r2
    cmove       r2, r3                      ; dirMode 23: swap main and other reference
    cmove       r3, r4

    lea         r4, [ang_table + 24 * 16]
    mova        m7, [pw_1024]

    movu        m1, [r2]                    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    pslldq      m1, 1                       ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a]
    pinsrb      m1, [r3 + 4], 0             ; a <- other[4]
    mova        m0, m1
    pslldq      m0, 1                       ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
    pinsrb      m0, [r3 + 7], 0             ; b <- other[7]
    punpckhbw   m5, m0, m1                  ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6]
    punpcklbw   m0, m1                      ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m1, m5, m0, 2               ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m5, m0, 4                   ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4, m5, [r4 - 1 * 16]       ; row 23  (line 0)
    pmaddubsw   m6, m1, [r4 + 4 * 16]       ; row 28  (line 3)
    pmaddubsw   m0, [r4]                    ; row 24  (line 7)
    pmulhrsw    m4, m7
    pmulhrsw    m6, m7
    pmulhrsw    m0, m7

    lea         r4, [ang_table + 13 * 16]
    pmaddubsw   m3, m5, [r4 + 1 * 16]       ; row 14  (line 1)
    pmulhrsw    m3, m7
    packuswb    m4, m3

    pmaddubsw   m5, [r4 - 8 * 16]           ; row  5  (line 2)
    pmulhrsw    m5, m7
    packuswb    m5, m6

    mova        m6, m1
    pmaddubsw   m6, [r4 + 6 * 16]           ; row 19  (line 4)
    mova        m2, m1
    pmaddubsw   m2, [r4 - 3 * 16]           ; row 10  (line 5)
    pmulhrsw    m6, m7
    pmulhrsw    m2, m7
    packuswb    m6, m2

    pmaddubsw   m1, [r4 - 12 * 16]          ; row  1  (line 6)
    pmulhrsw    m1, m7
    packuswb    m1, m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1736 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_14
; 8x8 angular prediction that needs three extra samples in front of the main
; reference.
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r4m = dirMode
; r2 is the main reference and r3 the other one; they are exchanged when
; dirMode == 22.  Inserted samples: a = other[2], b = other[5], c = other[7].
; Note the initial load starts two bytes before r2; those two bytes are
; immediately replaced by a and b.
; Produces the packed lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and
; tail-jumps to intra_pred_ang8_3.transpose8x8, which tests the still-live ZF
; of the cmp below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_14, 4,5,8
    cmp         r4m, byte 22                ; must precede the write to r4 (r4m may alias r4)
    mov         r4, r2
    cmove       r2, r3                      ; dirMode 22: swap main and other reference
    cmove       r3, r4

    lea         r4, [ang_table + 24 * 16]
    mova        m3, [pw_1024]

    movu        m1, [r2 - 2]                ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b]
    pinsrb      m1, [r3 + 5], 0             ; b <- other[5]
    pinsrb      m1, [r3 + 2], 1             ; a <- other[2]
    mova        m0, m1
    pslldq      m0, 1                       ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
    pinsrb      m0, [r3 + 7], 0             ; c <- other[7]
    punpckhbw   m2, m0, m1                  ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5]
    punpcklbw   m0, m1                      ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    palignr     m1, m2, m0, 2               ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m6, m2, m0, 4               ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m2, m0, 6                   ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m4, m2, [r4 - 5 * 16]       ; row 19  (line 0)
    pmaddubsw   m0, [r4]                    ; row 24  (line 7)
    pmaddubsw   m5, m6, [r4 + 1 * 16]       ; row 25  (line 2)
    pmulhrsw    m4, m3
    pmulhrsw    m0, m3
    pmulhrsw    m5, m3

    lea         r4, [ang_table + 12 * 16]
    pmaddubsw   m6, [r4]                    ; row 12  (line 3)
    pmulhrsw    m6, m3
    packuswb    m5, m6

    pmaddubsw   m6, m1, [r4 + 19 * 16]      ; row 31  (line 4)
    pmulhrsw    m6, m3

    pmaddubsw   m2, [r4 - 6 * 16]           ; row  6  (line 1)
    pmulhrsw    m2, m3
    packuswb    m4, m2

    pmaddubsw   m2, m1, [r4 + 6 * 16]       ; row 18  (line 5)
    pmulhrsw    m2, m3
    packuswb    m6, m2

    pmaddubsw   m1, [r4 - 7 * 16]           ; row  5  (line 6)
    pmulhrsw    m1, m3
    packuswb    m1, m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1786 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_15
; 8x8 angular prediction that needs four extra samples (a..d) in front of the
; main reference.
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r4m = dirMode
; r2 is the main reference and r3 the other one; they are exchanged when
; dirMode == 21.  a, b, c are gathered from the other reference with the
; c_mode16_15 shuffle (table defined outside this chunk) and d = other[8].
; Produces the packed lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and
; tail-jumps to intra_pred_ang8_3.transpose8x8, which tests the still-live ZF
; of the cmp below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_15, 4,5,8
    cmp         r4m, byte 21                ; must precede the write to r4 (r4m may alias r4)
    mov         r4, r2
    cmove       r2, r3                      ; dirMode 21: swap main and other reference
    cmove       r3, r4

    lea         r4, [ang_table + 23 * 16]
    mova        m3, [pw_1024]

    movu        m2, [r3]
    pshufb      m2, [c_mode16_15]           ; top three bytes become c, b, a
    movu        m1, [r2]                    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    palignr     m1, m2, 13                  ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c]
    mova        m0, m1
    pslldq      m0, 1                       ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
    pinsrb      m0, [r3 + 8], 0             ; d <- other[8]
    punpckhbw   m4, m0, m1                  ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4]
    punpcklbw   m0, m1                      ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
    palignr     m1, m4, m0, 2               ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    palignr     m6, m4, m0, 4               ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m5, m4, m0, 6               ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m4, m0, 8                   ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    ; lines 0 and 1
    pmaddubsw   m4, [r4 - 8 * 16]           ; row 15
    pmaddubsw   m2, m5, [r4 + 7 * 16]       ; row 30
    pmulhrsw    m4, m3
    pmulhrsw    m2, m3
    packuswb    m4, m2

    ; lines 2 and 3
    pmaddubsw   m5, [r4 - 10 * 16]          ; row 13
    pmaddubsw   m2, m6, [r4 + 5 * 16]       ; row 28
    pmulhrsw    m5, m3
    pmulhrsw    m2, m3
    packuswb    m5, m2

    pmaddubsw   m2, m1, [r4 + 3 * 16]       ; row 26  (line 5)
    pmaddubsw   m0, [r4 + 1 * 16]           ; row 24  (line 7)
    pmulhrsw    m2, m3
    pmulhrsw    m0, m3

    lea         r4, [ang_table + 11 * 16]
    pmaddubsw   m6, [r4]                    ; row 11  (line 4)
    pmulhrsw    m6, m3
    packuswb    m6, m2

    pmaddubsw   m1, [r4 - 2 * 16]           ; row  9  (line 6)
    pmulhrsw    m1, m3
    packuswb    m1, m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1838 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_16
; 8x8 angular prediction that needs five extra samples (a..e) in front of the
; main reference.
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r4m = dirMode
; r2 is the main reference and r3 the other one; they are exchanged when
; dirMode == 20.  a..d are gathered from the other reference with the
; c_mode16_16 shuffle (table defined outside this chunk) and e = other[8].
; Produces the packed lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and
; tail-jumps to intra_pred_ang8_3.transpose8x8, which tests the still-live ZF
; of the cmp below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_16, 4,5,8
    cmp         r4m, byte 20                ; must precede the write to r4 (r4m may alias r4)
    mov         r4, r2
    cmove       r2, r3                      ; dirMode 20: swap main and other reference
    cmove       r3, r4

    lea         r4, [ang_table + 22 * 16]
    mova        m7, [pw_1024]

    movu        m2, [r3]
    pshufb      m2, [c_mode16_16]           ; top four bytes become d, c, b, a
    movu        m1, [r2]                    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    palignr     m1, m2, 12                  ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d]
    mova        m0, m1
    pslldq      m0, 1                       ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
    pinsrb      m0, [r3 + 8], 0             ; e <- other[8]
    punpckhbw   m4, m0, m1                  ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3]
    punpcklbw   m0, m1                      ; [3 2 2 1 1 0 0 a a b b c c d d e]
    palignr     m1, m4, m0, 2               ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
    palignr     m6, m4, m0, 4               ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    palignr     m2, m4, m0, 6               ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m5, m4, m0, 8               ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m4, m0, 10                  ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    pmaddubsw   m3, m5, [r4]                ; row 22  (line 1)
    pmaddubsw   m0, [r4 + 2 * 16]           ; row 24  (line 7)
    pmulhrsw    m3, m7
    pmulhrsw    m0, m7

    lea         r4, [ang_table + 9 * 16]

    pmaddubsw   m4, [r4 + 2 * 16]           ; row 11  (line 0)
    pmulhrsw    m4, m7
    packuswb    m4, m3

    pmaddubsw   m2, [r4 + 3 * 16]           ; row 12  (line 3)
    pmaddubsw   m5, [r4 - 8 * 16]           ; row  1  (line 2)
    pmulhrsw    m2, m7
    pmulhrsw    m5, m7
    packuswb    m5, m2

    ; lines 4 and 5 share their sample pairs; take the copy before m6 is consumed
    pmaddubsw   m2, m6, [r4 - 7 * 16]       ; row  2  (line 5)
    pmaddubsw   m6, [r4 + 14 * 16]          ; row 23  (line 4)
    pmulhrsw    m6, m7
    pmulhrsw    m2, m7
    packuswb    m6, m2

    pmaddubsw   m1, [r4 + 4 * 16]           ; row 13  (line 6)
    pmulhrsw    m1, m7
    packuswb    m1, m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1893 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang8_17
; 8x8 angular prediction that needs six extra samples (a..f) in front of the
; main reference.
;   r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r4m = dirMode
; r2 is the main reference and r3 the other one; they are exchanged when
; dirMode == 19.  a..e are gathered from the other reference with the
; c_mode16_17 shuffle (table defined outside this chunk) and f = other[7].
; Produces the packed lines  m4 = 0|1, m5 = 2|3, m6 = 4|5, m1 = 6|7  and
; tail-jumps to intra_pred_ang8_3.transpose8x8, which tests the still-live ZF
; of the cmp below, optionally transposes, stores the block and returns.
;-----------------------------------------------------------------------------
cglobal intra_pred_ang8_17, 4,5,8
    cmp         r4m, byte 19                ; must precede the write to r4 (r4m may alias r4)
    mov         r4, r2
    cmove       r2, r3                      ; dirMode 19: swap main and other reference
    cmove       r3, r4

    lea         r4, [ang_table + 17 * 16]
    mova        m3, [pw_1024]

    movu        m1, [r3]
    pshufb      m1, [c_mode16_17]           ; top five bytes become e, d, c, b, a
    movu        m2, [r2]                    ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
    palignr     m2, m1, 11                  ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e]
    mova        m0, m2
    pslldq      m0, 1                       ; [9 8 7 6 5 4 3 2 1 0 a b c d e f]
    pinsrb      m0, [r3 + 7], 0             ; f <- other[7]
    punpckhbw   m1, m0, m2                  ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2]
    punpcklbw   m0, m2                      ; [2 1 1 0 0 a a b b c c d d e e f]

    palignr     m5, m1, m0, 8               ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b]
    palignr     m2, m1, m0, 10              ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a]
    palignr     m4, m1, m0, 12              ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0]

    ; lines 0 and 1
    pmaddubsw   m4, [r4 - 11 * 16]          ; row  6
    pmaddubsw   m2, [r4 - 5 * 16]           ; row 12
    pmulhrsw    m4, m3
    pmulhrsw    m2, m3
    packuswb    m4, m2

    ; lines 2 and 3
    pmaddubsw   m5, [r4 + 1 * 16]           ; row 18
    pmulhrsw    m5, m3
    palignr     m2, m1, m0, 6               ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c]
    pmaddubsw   m2, [r4 + 7 * 16]           ; row 24
    pmulhrsw    m2, m3
    packuswb    m5, m2

    ; lines 4 and 5 share their sample pairs; take the copy before m6 is consumed
    palignr     m6, m1, m0, 4               ; [4 3 3 2 2 1 1 0 0 a a b b c c d]
    pmaddubsw   m2, m6, [r4 - 13 * 16]      ; row  4
    pmaddubsw   m6, [r4 + 13 * 16]          ; row 30
    pmulhrsw    m6, m3
    pmulhrsw    m2, m3
    packuswb    m6, m2

    ; lines 6 and 7
    palignr     m1, m0, 2                   ; [3 2 2 1 1 0 0 a a b b c c d d e]
    pmaddubsw   m1, [r4 - 7 * 16]           ; row 10
    pmaddubsw   m0, [r4 - 1 * 16]           ; row 16
    pmulhrsw    m1, m3
    pmulhrsw    m0, m3
    packuswb    m1, m0
    jmp         mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8)
1949 | ||
cglobal intra_pred_ang8_18, 4,4,1
    ; 8x8 prediction that is a pure byte shift of the reference: no filtering.
    ;   r0 = dst, r1 = dst stride, r2 / r3 = the two reference pointers
    ; m0 is assembled as
    ;   low  8 bytes = bytes 7..0 of [r2] in reversed order (pb_swap8)
    ;   high 8 bytes = [r3 + 1 .. r3 + 8]
    ; and the rows are written bottom-up, shifting one byte per row.
    movu        m0, [r2]
    pshufb      m0, [pb_swap8]
    movhps      m0, [r3 + 1]

    lea         r2, [r0 + r1 * 4]               ; r2 = address of row 4
    lea         r3, [r1 * 3]                    ; r3 = 3 * stride

    movh        [r2 + r3], m0                   ; row 7
    psrldq      m0, 1
    movh        [r2 + r1 * 2], m0               ; row 6
    psrldq      m0, 1
    movh        [r2 + r1], m0                   ; row 5
    psrldq      m0, 1
    movh        [r2], m0                        ; row 4
    psrldq      m0, 1
    movh        [r0 + r3], m0                   ; row 3
    psrldq      m0, 1
    movh        [r0 + r1 * 2], m0               ; row 2
    psrldq      m0, 1
    movh        [r0 + r1], m0                   ; row 1
    psrldq      m0, 1
    movh        [r0], m0                        ; row 0
    RET
1972 | ||
1973 | ||
1974 | ;----------------------------------------------------------------------------- | |
1975 | ; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) | |
1976 | ;----------------------------------------------------------------------------- | |
INIT_XMM ssse3
cglobal intra_pred_ang16_2, 3,3,3
    ; 16x16 prediction that copies the reference with a one-byte slide per row.
    ;   r0 = dst, r1 = dst stride
    ;   r2 = third argument, replaced by the fourth argument (r3mp) when the
    ;        fifth argument (r4m) equals 34
    ; m1:m0 holds the 32 reference bytes starting at [r2 + 2]; row k is the
    ; 16-byte window that starts k bytes into that pair.
    cmp         r4m, byte 34
    cmovz       r2, r3mp
    movu        m0, [r2 + 2]
    movu        m1, [r2 + 18]

    movu        [r0], m0                        ; row 0
    palignr     m2, m1, m0, 1
    movu        [r0 + r1], m2                   ; row 1

    lea         r0, [r0 + r1 * 2]
    palignr     m2, m1, m0, 2
    movu        [r0], m2                        ; row 2
    palignr     m2, m1, m0, 3
    movu        [r0 + r1], m2                   ; row 3

    lea         r0, [r0 + r1 * 2]
    palignr     m2, m1, m0, 4
    movu        [r0], m2                        ; row 4
    palignr     m2, m1, m0, 5
    movu        [r0 + r1], m2                   ; row 5

    lea         r0, [r0 + r1 * 2]
    palignr     m2, m1, m0, 6
    movu        [r0], m2                        ; row 6
    palignr     m2, m1, m0, 7
    movu        [r0 + r1], m2                   ; row 7

    lea         r0, [r0 + r1 * 2]
    palignr     m2, m1, m0, 8
    movu        [r0], m2                        ; row 8
    palignr     m2, m1, m0, 9
    movu        [r0 + r1], m2                   ; row 9

    lea         r0, [r0 + r1 * 2]
    palignr     m2, m1, m0, 10
    movu        [r0], m2                        ; row 10
    palignr     m2, m1, m0, 11
    movu        [r0 + r1], m2                   ; row 11

    lea         r0, [r0 + r1 * 2]
    palignr     m2, m1, m0, 12
    movu        [r0], m2                        ; row 12
    palignr     m2, m1, m0, 13
    movu        [r0 + r1], m2                   ; row 13

    lea         r0, [r0 + r1 * 2]
    palignr     m2, m1, m0, 14
    movu        [r0], m2                        ; row 14
    palignr     m2, m1, m0, 15
    movu        [r0 + r1], m2                   ; row 15
    RET
2022 | ||
;-----------------------------------------------------------------------------
; TRANSPOSE_STORE_8x8 tile, transpose, rA, rB, rC, rD
;   Stores one 8x8 byte tile held in four XMM registers (two 8-byte lines per
;   register: low half first, high half second).
;
;   %1      tile index; the transpose path writes at byte offset %1 * 8 within
;           each row (unused by the plain store path)
;   %2      1 -> transpose the tile before storing, anything else -> store as is
;   %3..%6  registers holding lines 0-1, 2-3, 4-5, 6-7; must not be m0
;
;   Transpose path: needs r5 = 3 * stride and r6 = r0 + 4 * stride; clobbers
;                   m0 and %3..%6; r0 and r6 are left untouched.
;   Store path:     needs r5 = 3 * stride; advances r0 by 8 rows.
;-----------------------------------------------------------------------------
%macro TRANSPOSE_STORE_8x8 6
  %if %2 == 1
    ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
    ; Pass 1: interleave bytes so %3/%4 hold columns 0-3 / 4-7 of lines 0-3.
    punpckhbw   m0, %3, %4
    punpcklbw   %3, %4
    punpckhbw   %4, %3, m0
    punpcklbw   %3, m0

    ; Same for lines 4-7. The high-half interleave must read the macro's own
    ; fourth register (%6) rather than a fixed m1.
    punpckhbw   m0, %5, %6
    punpcklbw   %5, %6
    punpckhbw   %6, %5, m0
    punpcklbw   %5, m0

    ; Pass 2: join the 4-byte column pieces into complete 8-byte columns.
    punpckhdq   m0, %3, %5                      ; columns 2, 3
    punpckldq   %3, %5                          ; columns 0, 1
    punpckldq   %5, %4, %6                      ; columns 4, 5
    punpckhdq   %4, %6                          ; columns 6, 7

    movh        [r0        + %1 * 8], %3
    movhps      [r0 + r1   + %1 * 8], %3
    movh        [r0 + r1*2 + %1 * 8], m0
    movhps      [r0 + r5   + %1 * 8], m0
    movh        [r6        + %1 * 8], %5
    movhps      [r6 + r1   + %1 * 8], %5
    movh        [r6 + r1*2 + %1 * 8], %4
    movhps      [r6 + r5   + %1 * 8], %4
  %else
    ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
    movh        [r0         ], %3
    movhps      [r0 + r1    ], %3
    movh        [r0 + r1 * 2], %4
    movhps      [r0 + r5    ], %4
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], %5
    movhps      [r0 + r1    ], %5
    movh        [r0 + r1 * 2], %6
    movhps      [r0 + r5    ], %6
    lea         r0, [r0 + r1 * 4]
  %endif
%endmacro
2063 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_3, 3,7,8
    ; 16x16 angular prediction from the reference at r2 (third argument),
    ; written through the transposing path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst, r1 = dst stride
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass emits two 8x8 tiles side by side)
    ;   r5 = 3 * stride, r6 = r0 + 4 * stride (required by the store macro)
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mova        m7, [pw_1024]

.loop:
    ; ---- tile 0: pair up neighbouring reference bytes starting at r2 + 1 ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1

    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1                          ; m2:m0 = interleaved neighbour pairs
    palignr     m1, m2, m0, 2

    pmaddubsw   m4, m0, [r3 + 10 * 16]          ; ang_table[26]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 4 * 16]               ; ang_table[20]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 4

    pmaddubsw   m5, [r3 - 2 * 16]               ; ang_table[14]
    pmulhrsw    m5, m7

    palignr     m6, m2, m0, 6

    pmaddubsw   m6, [r3 - 8 * 16]               ; ang_table[8]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 8

    pmaddubsw   m6, m1, [r3 - 14 * 16]          ; ang_table[2]
    pmulhrsw    m6, m7

    pmaddubsw   m1, [r3 + 12 * 16]              ; ang_table[28]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 10

    pmaddubsw   m1, [r3 + 6 * 16]               ; ang_table[22]
    pmulhrsw    m1, m7

    palignr     m2, m0, 12

    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- tile 1: same scheme, reference window starts at r2 + 8 ----
    movu        m0, [r2 + 8]
    palignr     m1, m0, 1

    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m5, m2, m0, 2

    pmaddubsw   m4, m0, [r3 - 6 * 16]           ; ang_table[10]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 12 * 16]          ; ang_table[4]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, [r3 + 14 * 16]              ; ang_table[30]
    pmulhrsw    m5, m7

    palignr     m6, m2, m0, 4

    pmaddubsw   m6, [r3 + 8 * 16]               ; ang_table[24]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 6

    pmaddubsw   m6, m1, [r3 + 2 * 16]           ; ang_table[18]
    pmulhrsw    m6, m7

    palignr     m1, m2, m0, 8

    pmaddubsw   m1, [r3 - 4 * 16]               ; ang_table[12]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 10

    pmaddubsw   m1, [r3 - 10 * 16]              ; ang_table[6]
    pmulhrsw    m1, m7
    packuswb    m1, m1

    ; Last line of the tile is an unfiltered copy of 8 reference bytes.
    movhps      m1, [r2 + 14]                   ; [00]

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; Next pass: move both row pointers down 8 rows, reference forward 8 bytes.
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
2170 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_33, 3,7,8
    ; 16x16 angular prediction from the fourth argument's reference (r3mp),
    ; stored row by row without transposition.
    ;   r0 = dst write cursor, r1 = dst stride, r2 = reference pointer
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass fills an 8-column, 16-row strip)
    ;   r5 = 3 * stride, r6 = original dst
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    mov         r2, r3mp
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    ; ---- rows 0-7 of the strip: reference window starts at r2 + 1 ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1

    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1                          ; m2:m0 = interleaved neighbour pairs
    palignr     m1, m2, m0, 2

    pmaddubsw   m4, m0, [r3 + 10 * 16]          ; ang_table[26]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 4 * 16]               ; ang_table[20]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 4

    pmaddubsw   m5, [r3 - 2 * 16]               ; ang_table[14]
    pmulhrsw    m5, m7

    palignr     m6, m2, m0, 6

    pmaddubsw   m6, [r3 - 8 * 16]               ; ang_table[8]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 8

    pmaddubsw   m6, m1, [r3 - 14 * 16]          ; ang_table[2]
    pmulhrsw    m6, m7

    pmaddubsw   m1, [r3 + 12 * 16]              ; ang_table[28]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 10

    pmaddubsw   m1, [r3 + 6 * 16]               ; ang_table[22]
    pmulhrsw    m1, m7

    palignr     m2, m0, 12

    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1    ; plain store, r0 += 8 rows

    ; ---- rows 8-15 of the strip: reference window starts at r2 + 8 ----
    movu        m0, [r2 + 8]
    palignr     m1, m0, 1

    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m5, m2, m0, 2

    pmaddubsw   m4, m0, [r3 - 6 * 16]           ; ang_table[10]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 12 * 16]          ; ang_table[4]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, [r3 + 14 * 16]              ; ang_table[30]
    pmulhrsw    m5, m7

    palignr     m6, m2, m0, 4

    pmaddubsw   m6, [r3 + 8 * 16]               ; ang_table[24]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 6

    pmaddubsw   m6, m1, [r3 + 2 * 16]           ; ang_table[18]
    pmulhrsw    m6, m7

    palignr     m1, m2, m0, 8

    pmaddubsw   m1, [r3 - 4 * 16]               ; ang_table[12]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 10

    pmaddubsw   m1, [r3 - 10 * 16]              ; ang_table[6]
    pmulhrsw    m1, m7
    packuswb    m1, m1

    ; Final row is an unfiltered copy of 8 reference bytes.
    movh        m2, [r2 + 14]                   ; [00]

    ; Hand-written store: the last register pair is (m1 low, m2 low), so the
    ; macro's movh/movhps pattern does not apply to rows 14 and 15.
    movh        [r0         ], m4
    movhps      [r0 + r1    ], m4
    movh        [r0 + r1 * 2], m5
    movhps      [r0 + r5    ], m5
    lea         r0, [r0 + r1 * 4]
    movh        [r0         ], m6
    movhps      [r0 + r1    ], m6
    movh        [r0 + r1 * 2], m1
    movh        [r0 + r5    ], m2

    ; Next pass: right-hand 8 columns, reference forward 8 bytes.
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
2284 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_4, 3,7,8
    ; 16x16 angular prediction from the reference at r2 (third argument),
    ; written through the transposing path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst, r1 = dst stride
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass emits two 8x8 tiles side by side)
    ;   r5 = 3 * stride, r6 = r0 + 4 * stride (required by the store macro)
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mova        m7, [pw_1024]

.loop:
    ; ---- tile 0: reference window starts at r2 + 1 ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1

    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1                          ; m2:m0 = interleaved neighbour pairs
    palignr     m1, m2, m0, 2

    ; The window in m1 feeds two lines; take the [31] line from a copy
    ; (3-operand form) before m1 is consumed by the [10] line.
    pmaddubsw   m5, m1, [r3 + 15 * 16]          ; ang_table[31]
    pmulhrsw    m5, m7

    pmaddubsw   m4, m0, [r3 + 5 * 16]           ; ang_table[21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 6 * 16]               ; ang_table[10]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 4

    pmaddubsw   m6, [r3 + 4 * 16]               ; ang_table[20]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 6

    pmaddubsw   m6, m1, [r3 - 7 * 16]           ; ang_table[9]
    pmulhrsw    m6, m7

    pmaddubsw   m1, [r3 + 14 * 16]              ; ang_table[30]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 8

    pmaddubsw   m1, [r3 + 3 * 16]               ; ang_table[19]
    pmulhrsw    m1, m7

    palignr     m2, m0, 10                      ; m2 survives the store macro below

    pmaddubsw   m3, m2, [r3 - 8 * 16]           ; ang_table[8]
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- tile 1: first line reuses the window kept in m2 ----
    pmaddubsw   m4, m2, [r3 + 13 * 16]          ; ang_table[29]
    pmulhrsw    m4, m7

    movu        m0, [r2 + 6]
    palignr     m1, m0, 1

    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2

    pmaddubsw   m1, [r3 + 2 * 16]               ; ang_table[18]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 4

    ; Same window, two lines: [28] from a copy first, then [07] in place.
    pmaddubsw   m6, m5, [r3 + 12 * 16]          ; ang_table[28]
    pmulhrsw    m6, m7

    pmaddubsw   m5, [r3 - 9 * 16]               ; ang_table[7]
    pmulhrsw    m5, m7
    packuswb    m5, m6

    palignr     m6, m2, m0, 6

    pmaddubsw   m6, [r3 + 1 * 16]               ; ang_table[17]
    pmulhrsw    m6, m7

    palignr     m1, m2, m0, 8
    palignr     m2, m0, 10

    pmaddubsw   m3, m1, [r3 - 10 * 16]          ; ang_table[6]
    pmulhrsw    m3, m7
    packuswb    m6, m3

    pmaddubsw   m1, [r3 + 11 * 16]              ; ang_table[27]
    pmulhrsw    m1, m7

    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; Next pass: move both row pointers down 8 rows, reference forward 8 bytes.
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
2392 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_32, 3,7,8
    ; 16x16 angular prediction from the fourth argument's reference (r3mp),
    ; stored row by row through the plain path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst write cursor, r1 = dst stride, r2 = reference pointer
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass fills an 8-column, 16-row strip)
    ;   r5 = 3 * stride, r6 = original dst
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    mov         r2, r3mp
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    ; ---- rows 0-7 of the strip: reference window starts at r2 + 1 ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1

    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1                          ; m2:m0 = interleaved neighbour pairs
    palignr     m1, m2, m0, 2

    ; The window in m1 feeds two rows; take the [31] row from a copy
    ; (3-operand form) before m1 is consumed by the [10] row.
    pmaddubsw   m5, m1, [r3 + 15 * 16]          ; ang_table[31]
    pmulhrsw    m5, m7

    pmaddubsw   m4, m0, [r3 + 5 * 16]           ; ang_table[21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 6 * 16]               ; ang_table[10]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m0, 4

    pmaddubsw   m6, [r3 + 4 * 16]               ; ang_table[20]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m0, 6

    pmaddubsw   m6, m1, [r3 - 7 * 16]           ; ang_table[9]
    pmulhrsw    m6, m7

    pmaddubsw   m1, [r3 + 14 * 16]              ; ang_table[30]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    palignr     m1, m2, m0, 8

    pmaddubsw   m1, [r3 + 3 * 16]               ; ang_table[19]
    pmulhrsw    m1, m7

    palignr     m2, m0, 10                      ; m2 is reused after the store

    pmaddubsw   m3, m2, [r3 - 8 * 16]           ; ang_table[8]
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1    ; plain store, r0 += 8 rows

    ; ---- rows 8-15: first row reuses the window kept in m2 ----
    pmaddubsw   m4, m2, [r3 + 13 * 16]          ; ang_table[29]
    pmulhrsw    m4, m7

    movu        m0, [r2 + 6]
    palignr     m1, m0, 1

    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2

    pmaddubsw   m1, [r3 + 2 * 16]               ; ang_table[18]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m5, m2, m0, 4

    ; Same window, two rows: [28] from a copy first, then [07] in place.
    pmaddubsw   m6, m5, [r3 + 12 * 16]          ; ang_table[28]
    pmulhrsw    m6, m7

    pmaddubsw   m5, [r3 - 9 * 16]               ; ang_table[7]
    pmulhrsw    m5, m7
    packuswb    m5, m6

    palignr     m6, m2, m0, 6

    pmaddubsw   m6, [r3 + 1 * 16]               ; ang_table[17]
    pmulhrsw    m6, m7

    palignr     m1, m2, m0, 8
    palignr     m2, m0, 10

    pmaddubsw   m3, m1, [r3 - 10 * 16]          ; ang_table[6]
    pmulhrsw    m3, m7
    packuswb    m6, m3

    pmaddubsw   m1, [r3 + 11 * 16]              ; ang_table[27]
    pmulhrsw    m1, m7

    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1    ; plain store, r0 += 8 rows

    ; Next pass: right-hand 8 columns, reference forward 8 bytes.
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
2500 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_5, 3,7,8
    ; 16x16 angular prediction from the reference at r2 (third argument),
    ; written through the transposing path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst, r1 = dst stride
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass emits two 8x8 tiles side by side)
    ;   r5 = 3 * stride, r6 = r0 + 4 * stride (required by the store macro)
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mova        m7, [pw_1024]

.loop:
    ; m2:m3 = neighbouring reference pairs; both survive the first store and
    ; are shared by the two tiles of this pass.
    movu        m3, [r2 + 1]                    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                    ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1                      ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                          ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- tile 0 ----
    palignr     m5, m2, m3, 2

    pmaddubsw   m4, m3, [r3 + 1 * 16]           ; ang_table[17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 14 * 16]          ; ang_table[2]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 4

    pmaddubsw   m5, [r3 + 3 * 16]               ; ang_table[19]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 12 * 16]          ; ang_table[4]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m1, m2, m3, 6

    pmaddubsw   m6, [r3 + 5 * 16]               ; ang_table[21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 10 * 16]          ; ang_table[6]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 8

    pmaddubsw   m1, [r3 + 7 * 16]               ; ang_table[23]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]               ; ang_table[8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- tile 1: continue along the same m2:m3 pair stream ----
    palignr     m4, m2, m3, 8
    palignr     m5, m2, m3, 10

    pmaddubsw   m4, [r3 + 9 * 16]               ; ang_table[25]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 6 * 16]           ; ang_table[10]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 12

    pmaddubsw   m5, [r3 + 11 * 16]              ; ang_table[27]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 4 * 16]           ; ang_table[12]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m1, m2, m3, 14

    pmaddubsw   m6, [r3 + 13 * 16]              ; ang_table[29]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]           ; ang_table[14]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, [r3 + 15 * 16]              ; ang_table[31]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; Next pass: move both row pointers down 8 rows, reference forward 8 bytes.
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
2590 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_31, 3,7,8
    ; 16x16 angular prediction from the fourth argument's reference (r3mp),
    ; stored row by row through the plain path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst write cursor, r1 = dst stride, r2 = reference pointer
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass fills an 8-column, 16-row strip)
    ;   r5 = 3 * stride, r6 = original dst
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    mov         r2, r3mp
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    ; m2:m3 = neighbouring reference pairs, shared by both halves of the strip.
    movu        m3, [r2 + 1]                    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    movu        m1, [r2 + 2]                    ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1                      ; [17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                          ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- rows 0-7 ----
    palignr     m5, m2, m3, 2

    pmaddubsw   m4, m3, [r3 + 1 * 16]           ; ang_table[17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 14 * 16]          ; ang_table[2]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 4

    pmaddubsw   m5, [r3 + 3 * 16]               ; ang_table[19]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 12 * 16]          ; ang_table[4]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m1, m2, m3, 6

    pmaddubsw   m6, [r3 + 5 * 16]               ; ang_table[21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 10 * 16]          ; ang_table[6]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 8

    pmaddubsw   m1, [r3 + 7 * 16]               ; ang_table[23]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]               ; ang_table[8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1    ; plain store, r0 += 8 rows

    ; ---- rows 8-15: continue along the same m2:m3 pair stream ----
    palignr     m4, m2, m3, 8
    palignr     m5, m2, m3, 10

    pmaddubsw   m4, [r3 + 9 * 16]               ; ang_table[25]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m5, [r3 - 6 * 16]           ; ang_table[10]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 12

    pmaddubsw   m5, [r3 + 11 * 16]              ; ang_table[27]
    pmulhrsw    m5, m7
    pmaddubsw   m1, m6, [r3 - 4 * 16]           ; ang_table[12]
    pmulhrsw    m1, m7
    packuswb    m5, m1

    palignr     m1, m2, m3, 14

    pmaddubsw   m6, [r3 + 13 * 16]              ; ang_table[29]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]           ; ang_table[14]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    pmaddubsw   m1, [r3 + 15 * 16]              ; ang_table[31]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1    ; plain store, r0 += 8 rows

    ; Next pass: right-hand 8 columns, reference forward 8 bytes.
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
2679 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_6, 3,7,8
    ; 16x16 angular prediction from the reference at r2 (third argument),
    ; written through the transposing path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst, r1 = dst stride
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass emits two 8x8 tiles side by side)
    ;   r5 = 3 * stride, r6 = r0 + 4 * stride (required by the store macro)
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mova        m7, [pw_1024]

.loop:
    ; m2:m3 = neighbouring reference pairs, shared by both tiles of the pass.
    ; The top byte of m1 (and so of m2) is whatever m1 held before: "x".
    movu        m3, [r2 + 1]                    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1                       ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1                      ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                          ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- tile 0 ----
    pmaddubsw   m4, m3, [r3 - 3 * 16]           ; ang_table[13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m3, [r3 + 10 * 16]          ; ang_table[26]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 2

    pmaddubsw   m5, m6, [r3 - 9 * 16]           ; ang_table[7]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 4 * 16]               ; ang_table[20]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m3, 4

    pmaddubsw   m6, m1, [r3 - 15 * 16]          ; ang_table[1]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]           ; ang_table[14]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 6

    pmaddubsw   m1, [r3 + 11 * 16]              ; ang_table[27]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]               ; ang_table[8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- tile 1: continue along the same m2:m3 pair stream ----
    palignr     m4, m2, m3, 6
    palignr     m6, m2, m3, 8

    pmaddubsw   m4, [r3 + 5 * 16]               ; ang_table[21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m6, [r3 - 14 * 16]          ; ang_table[2]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m6, [r3 - 1 * 16]           ; ang_table[15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]              ; ang_table[28]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m0, m2, m3, 10

    pmaddubsw   m6, m0, [r3 - 7 * 16]           ; ang_table[9]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 6 * 16]               ; ang_table[22]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m2, m3, 12

    pmaddubsw   m1, m2, [r3 - 13 * 16]          ; ang_table[3]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; Next pass: move both row pointers down 8 rows, reference forward 8 bytes.
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
2767 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_30, 3,7,8
    ; 16x16 angular prediction from the fourth argument's reference (r3mp),
    ; stored row by row through the plain path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst write cursor, r1 = dst stride, r2 = reference pointer
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass fills an 8-column, 16-row strip)
    ;   r5 = 3 * stride, r6 = original dst
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    mov         r2, r3mp
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    ; m2:m3 = neighbouring reference pairs, shared by both halves of the strip.
    ; The top byte of m1 (and so of m2) is whatever m1 held before: "x".
    movu        m3, [r2 + 1]                    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1                       ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1                      ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                          ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- rows 0-7 ----
    pmaddubsw   m4, m3, [r3 - 3 * 16]           ; ang_table[13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m3, [r3 + 10 * 16]          ; ang_table[26]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m6, m2, m3, 2

    pmaddubsw   m5, m6, [r3 - 9 * 16]           ; ang_table[7]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 4 * 16]               ; ang_table[20]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m1, m2, m3, 4

    pmaddubsw   m6, m1, [r3 - 15 * 16]          ; ang_table[1]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 - 2 * 16]           ; ang_table[14]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 6

    pmaddubsw   m1, [r3 + 11 * 16]              ; ang_table[27]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]               ; ang_table[8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1    ; plain store, r0 += 8 rows

    ; ---- rows 8-15: continue along the same m2:m3 pair stream ----
    palignr     m4, m2, m3, 6
    palignr     m6, m2, m3, 8

    pmaddubsw   m4, [r3 + 5 * 16]               ; ang_table[21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m6, [r3 - 14 * 16]          ; ang_table[2]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    pmaddubsw   m5, m6, [r3 - 1 * 16]           ; ang_table[15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]              ; ang_table[28]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    palignr     m0, m2, m3, 10

    pmaddubsw   m6, m0, [r3 - 7 * 16]           ; ang_table[9]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 6 * 16]               ; ang_table[22]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m2, m3, 12

    pmaddubsw   m1, m2, [r3 - 13 * 16]          ; ang_table[3]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1    ; plain store, r0 += 8 rows

    ; Next pass: right-hand 8 columns, reference forward 8 bytes.
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
2854 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_7, 3,7,8
    ; 16x16 angular prediction from the reference at r2 (third argument),
    ; written through the transposing path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst, r1 = dst stride
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass emits two 8x8 tiles side by side)
    ;   r5 = 3 * stride, r6 = r0 + 4 * stride (required by the store macro)
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mova        m7, [pw_1024]

.loop:
    ; m2:m3 = neighbouring reference pairs, shared by both tiles of the pass.
    ; The top byte of m1 (and so of m2) is whatever m1 held before: "x".
    movu        m3, [r2 + 1]                    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1                       ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1                      ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                          ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- tile 0 ----
    pmaddubsw   m4, m3, [r3 - 7 * 16]           ; ang_table[9]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 + 2 * 16]           ; ang_table[18]
    pmulhrsw    m0, m7
    packuswb    m4, m0

    palignr     m1, m2, m3, 2

    pmaddubsw   m5, m3, [r3 + 11 * 16]          ; ang_table[27]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m1, [r3 - 12 * 16]          ; ang_table[4]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m1, [r3 - 3 * 16]           ; ang_table[13]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 + 6 * 16]           ; ang_table[22]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 4

    pmaddubsw   m1, [r3 + 15 * 16]              ; ang_table[31]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]               ; ang_table[8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- tile 1: continue along the same m2:m3 pair stream ----
    palignr     m1, m2, m3, 4

    pmaddubsw   m4, m1, [r3 + 1 * 16]           ; ang_table[17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 10 * 16]              ; ang_table[26]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m0, m2, m3, 6

    pmaddubsw   m5, m0, [r3 - 13 * 16]          ; ang_table[3]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 - 4 * 16]           ; ang_table[12]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m0, [r3 + 5 * 16]           ; ang_table[21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 14 * 16]              ; ang_table[30]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m2, m3, 8

    pmaddubsw   m1, m2, [r3 - 9 * 16]           ; ang_table[7]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; Next pass: move both row pointers down 8 rows, reference forward 8 bytes.
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
2939 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_29, 3,7,8
    ; 16x16 angular prediction from the fourth argument's reference (r3mp),
    ; stored row by row through the plain path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst write cursor, r1 = dst stride, r2 = reference pointer
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass fills an 8-column, 16-row strip)
    ;   r5 = 3 * stride, r6 = original dst
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    mov         r2, r3mp
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    ; m2:m3 = neighbouring reference pairs, shared by both halves of the strip.
    ; The top byte of m1 (and so of m2) is whatever m1 held before: "x".
    movu        m3, [r2 + 1]                    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m1, m3, 1                       ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m2, m3, m1                      ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m3, m1                          ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- rows 0-7 ----
    pmaddubsw   m4, m3, [r3 - 7 * 16]           ; ang_table[9]
    pmulhrsw    m4, m7
    pmaddubsw   m0, m3, [r3 + 2 * 16]           ; ang_table[18]
    pmulhrsw    m0, m7
    packuswb    m4, m0

    palignr     m1, m2, m3, 2

    pmaddubsw   m5, m3, [r3 + 11 * 16]          ; ang_table[27]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m1, [r3 - 12 * 16]          ; ang_table[4]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m1, [r3 - 3 * 16]           ; ang_table[13]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 + 6 * 16]           ; ang_table[22]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m0, m2, m3, 4

    pmaddubsw   m1, [r3 + 15 * 16]              ; ang_table[31]
    pmulhrsw    m1, m7
    pmaddubsw   m0, [r3 - 8 * 16]               ; ang_table[8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1    ; plain store, r0 += 8 rows

    ; ---- rows 8-15: continue along the same m2:m3 pair stream ----
    palignr     m1, m2, m3, 4

    pmaddubsw   m4, m1, [r3 + 1 * 16]           ; ang_table[17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 + 10 * 16]              ; ang_table[26]
    pmulhrsw    m1, m7
    packuswb    m4, m1

    palignr     m0, m2, m3, 6

    pmaddubsw   m5, m0, [r3 - 13 * 16]          ; ang_table[3]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 - 4 * 16]           ; ang_table[12]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m0, [r3 + 5 * 16]           ; ang_table[21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 14 * 16]              ; ang_table[30]
    pmulhrsw    m0, m7
    packuswb    m6, m0

    palignr     m2, m3, 8

    pmaddubsw   m1, m2, [r3 - 9 * 16]           ; ang_table[7]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3]                        ; ang_table[16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1    ; plain store, r0 += 8 rows

    ; Next pass: right-hand 8 columns, reference forward 8 bytes.
    lea         r0, [r6 + 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
3023 | ||
INIT_XMM sse4
cglobal intra_pred_ang16_8, 3,7,8
    ; 16x16 angular prediction from the reference at r2 (third argument),
    ; written through the transposing path of TRANSPOSE_STORE_8x8.
    ;   r0 = dst, r1 = dst stride
    ;   r3 = ang_table + 16 * 16, so [r3 + k * 16] is ang_table[16 + k]
    ;   r4 = pass counter (2 passes; each pass emits two 8x8 tiles side by side)
    ;   r5 = 3 * stride, r6 = r0 + 4 * stride (required by the store macro)
    ;   m7 = pw_1024, multiplier for the pmulhrsw after every pmaddubsw
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 2
    lea         r5, [r1 * 3]
    lea         r6, [r0 + r1 * 4]
    mova        m7, [pw_1024]

.loop:
    ; m0:m1 = neighbouring reference pairs. The top byte of m3 (and so of m0)
    ; is whatever m3 held before: "x".
    movu        m1, [r2 + 1]                    ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
    palignr     m3, m1, 1                       ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2]
    punpckhbw   m0, m1, m3                      ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9]
    punpcklbw   m1, m3                          ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1]

    ; ---- tile 0: six lines share the unshifted window in m1 ----
    pmaddubsw   m4, m1, [r3 - 11 * 16]          ; ang_table[5]
    pmulhrsw    m4, m7
    pmaddubsw   m2, m1, [r3 - 6 * 16]           ; ang_table[10]
    pmulhrsw    m2, m7
    packuswb    m4, m2

    pmaddubsw   m5, m1, [r3 - 1 * 16]           ; ang_table[15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m1, [r3 + 4 * 16]           ; ang_table[20]
    pmulhrsw    m6, m7
    packuswb    m5, m6

    pmaddubsw   m6, m1, [r3 + 9 * 16]           ; ang_table[25]
    pmulhrsw    m6, m7
    pmaddubsw   m2, m1, [r3 + 14 * 16]          ; ang_table[30]
    pmulhrsw    m2, m7
    packuswb    m6, m2

    ; Extract the two shifted windows now: m0 and m1 are overwritten next,
    ; while m2 and m3 survive the store macro for tile 1.
    palignr     m2, m0, m1, 2
    palignr     m3, m0, m1, 4

    pmaddubsw   m1, m2, [r3 - 13 * 16]          ; ang_table[3]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m2, [r3 - 8 * 16]           ; ang_table[8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1

    ; ---- tile 1: four lines from m2, four from m3 ----
    pmaddubsw   m4, m2, [r3 - 3 * 16]           ; ang_table[13]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m2, [r3 + 2 * 16]           ; ang_table[18]
    pmulhrsw    m5, m7
    packuswb    m4, m5

    pmaddubsw   m5, m2, [r3 + 7 * 16]           ; ang_table[23]
    pmulhrsw    m5, m7
    pmaddubsw   m2, [r3 + 12 * 16]              ; ang_table[28]
    pmulhrsw    m2, m7
    packuswb    m5, m2

    pmaddubsw   m6, m3, [r3 - 15 * 16]          ; ang_table[1]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m3, [r3 - 10 * 16]          ; ang_table[6]
    pmulhrsw    m1, m7
    packuswb    m6, m1

    pmaddubsw   m1, m3, [r3 - 5 * 16]           ; ang_table[11]
    pmulhrsw    m1, m7
    pmaddubsw   m3, [r3]                        ; ang_table[16]
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1

    ; Next pass: move both row pointers down 8 rows, reference forward 8 bytes.
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         r4
    jnz         .loop

    RET
3101 | ||
3102 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_28
; Uses the same table rows and register schedule as intra_pred_ang16_8, with
; three visible differences:
;   * r2 is replaced by the fourth argument (r3mp) before any load, so the
;     source bytes come from that pointer;
;   * TRANSPOSE_STORE_8x8 is invoked with a second argument of 0 instead of 1;
;   * between the two passes r0 is set to r6 + 8 (r6 = r0 on entry), i.e. the
;     second pass targets a position 8 bytes further along the same rows.
; r1 = stride, r5 = 3 * r1, r3 = ang_table + 16 * 16 (bracketed numbers are
; the table row 16 + k), m7 = pw_1024 scale for pmulhrsw.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; its effect
; on r0 and on the XMM registers is not visible here.
;-----------------------------------------------------------------------------
3103 | cglobal intra_pred_ang16_28, 3,7,8 | |
3104 | mov r2, r3mp | |
3105 | lea r3, [ang_table + 16 * 16] | |
3106 | mov r4d, 2 | |
3107 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
3108 | mov r6, r0 | |
3109 | mova m7, [pw_1024] | |
3110 | ||
3111 | .loop: | |
3112 | movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3113 | palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3114 | punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
3115 | punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3116 | ||
3117 | pmaddubsw m4, m1, [r3 - 11 * 16] ; [5] | |
3118 | pmulhrsw m4, m7 | |
3119 | pmaddubsw m2, m1, [r3 - 6 * 16] ; [10] | |
3120 | pmulhrsw m2, m7 | |
3121 | packuswb m4, m2 | |
3122 | ||
3123 | pmaddubsw m5, m1, [r3 - 1 * 16] ; [15] | |
3124 | pmulhrsw m5, m7 | |
3125 | pmaddubsw m6, m1, [r3 + 4 * 16] ; [20] | |
3126 | pmulhrsw m6, m7 | |
3127 | packuswb m5, m6 | |
3128 | ||
3129 | pmaddubsw m6, m1, [r3 + 9 * 16] ; [25] | |
3130 | pmulhrsw m6, m7 | |
3131 | pmaddubsw m2, m1, [r3 + 14 * 16] ; [30] | |
3132 | pmulhrsw m2, m7 | |
3133 | packuswb m6, m2 | |
3134 | ||
; m2 / m3 = the interleaved byte pairs moved on by one / two source bytes
3135 | palignr m2, m0, m1, 2 | |
3136 | palignr m3, m0, m1, 4 | |
3137 | ||
3138 | pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] | |
3139 | pmulhrsw m1, m7 | |
3140 | pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] | |
3141 | pmulhrsw m0, m7 | |
3142 | packuswb m1, m0 | |
3143 | ||
3144 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
3145 | ||
3146 | pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] | |
3147 | pmulhrsw m4, m7 | |
3148 | pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] | |
3149 | pmulhrsw m5, m7 | |
3150 | packuswb m4, m5 | |
3151 | ||
3152 | pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] | |
3153 | pmulhrsw m5, m7 | |
3154 | pmaddubsw m2, [r3 + 12 * 16] ; [28] | |
3155 | pmulhrsw m2, m7 | |
3156 | packuswb m5, m2 | |
3157 | ||
3158 | pmaddubsw m6, m3, [r3 - 15 * 16] ; [01] | |
3159 | pmulhrsw m6, m7 | |
3160 | pmaddubsw m1, m3, [r3 - 10 * 16] ; [06] | |
3161 | pmulhrsw m1, m7 | |
3162 | packuswb m6, m1 | |
3163 | ||
3164 | pmaddubsw m1, m3, [r3 - 5 * 16] ; [11] | |
3165 | pmulhrsw m1, m7 | |
3166 | pmaddubsw m3, [r3] ; [16] | |
3167 | pmulhrsw m3, m7 | |
3168 | packuswb m1, m3 | |
3169 | ||
3170 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
3171 | ||
; next pass: destination 8 bytes to the right of the entry r0, source + 8
3172 | lea r0, [r6 + 8] | |
3173 | add r2, 8 | |
3174 | dec r4 | |
3175 | jnz .loop | |
3176 | ||
3177 | RET | |
3178 | ||
3179 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_9
; cglobal line: 3 arguments loaded, 7 general registers, 8 XMM registers.
;   r1 = stride (r5 = 3 * r1, r6 = r0 + 4 * r1)
;   r2 = source bytes; each pass loads 16 of them from [r2 + 1]
; Only the low eight interleaved byte pairs (m2) are needed: every result in
; a pass is m2 weighted by a different ang_table row (even rows 2..30, shown
; in brackets), scaled with pmulhrsw by m7 and packed to bytes.
; The sixteenth result of a pass takes no arithmetic: it is the low qword of
; m3, i.e. the source bytes starting at [r2 + 2], merged in with punpcklqdq.
; The loop body runs twice (r4d = 2).
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; what it
; stores and which registers it modifies cannot be confirmed from here.
;-----------------------------------------------------------------------------
3180 | cglobal intra_pred_ang16_9, 3,7,8 | |
3181 | ||
3182 | lea r3, [ang_table + 16 * 16] | |
3183 | mov r4d, 2 | |
3184 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
3185 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
3186 | mova m7, [pw_1024] | |
3187 | ||
3188 | .loop: | |
3189 | movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3190 | palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3191 | punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3192 | ||
3193 | pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] | |
3194 | pmulhrsw m4, m7 | |
3195 | pmaddubsw m0, m2, [r3 - 12 * 16] ; [4] | |
3196 | pmulhrsw m0, m7 | |
3197 | packuswb m4, m0 | |
3198 | ||
3199 | pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] | |
3200 | pmulhrsw m5, m7 | |
3201 | pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] | |
3202 | pmulhrsw m6, m7 | |
3203 | packuswb m5, m6 | |
3204 | ||
3205 | pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] | |
3206 | pmulhrsw m6, m7 | |
3207 | pmaddubsw m0, m2, [r3 - 4 * 16] ; [12] | |
3208 | pmulhrsw m0, m7 | |
3209 | packuswb m6, m0 | |
3210 | ||
3211 | pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] | |
3212 | pmulhrsw m1, m7 | |
3213 | pmaddubsw m0, m2, [r3] ; [16] | |
3214 | pmulhrsw m0, m7 | |
3215 | packuswb m1, m0 | |
3216 | ||
3217 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
3218 | ||
3219 | pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] | |
3220 | pmulhrsw m4, m7 | |
3221 | pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] | |
3222 | pmulhrsw m5, m7 | |
3223 | packuswb m4, m5 | |
3224 | ||
3225 | pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] | |
3226 | pmulhrsw m5, m7 | |
3227 | pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] | |
3228 | pmulhrsw m6, m7 | |
3229 | packuswb m5, m6 | |
3230 | ||
3231 | pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] | |
3232 | pmulhrsw m6, m7 | |
3233 | pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] | |
3234 | pmulhrsw m1, m7 | |
3235 | packuswb m6, m1 | |
3236 | ||
3237 | pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] | |
3238 | pmulhrsw m1, m7 | |
3239 | packuswb m1, m1 | |
3240 | ||
3241 | punpcklqdq m1, m3 ; [00] | |
3242 | ||
3243 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
3244 | ||
; next pass: r0 is reloaded from r6 + 4 * stride, r6 moves on by 8 * stride,
; and the source pointer moves on by 8 bytes
3245 | lea r0, [r6 + r1 * 4] | |
3246 | lea r6, [r6 + r1 * 8] | |
3247 | add r2, 8 | |
3248 | dec r4 | |
3249 | jnz .loop | |
3250 | ||
3251 | RET | |
3252 | ||
3253 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_27
; Same table rows as intra_pred_ang16_9 (even rows 2..30 of ang_table, then
; an unweighted copy), but:
;   * r2 is replaced by the fourth argument (r3mp) before any load;
;   * the first eight results of a pass go through TRANSPOSE_STORE_8x8 with a
;     second argument of 0, the last eight are written here with movh/movhps
;     as 8-byte rows at r0, r0 + r1, ... (r5 = 3 * r1);
;   * the final row of a pass is the low qword of m2, i.e. the source bytes
;     starting at [r2 + 2];
;   * between the two passes r0 is set to r6 + 8 (r6 = r0 on entry).
; NOTE(review): the explicit stores rely on the value TRANSPOSE_STORE_8x8
; leaves in r0; that macro is defined outside this chunk -- confirm there.
;-----------------------------------------------------------------------------
3254 | cglobal intra_pred_ang16_27, 3,7,8 | |
3255 | mov r2, r3mp | |
3256 | lea r3, [ang_table + 16 * 16] | |
3257 | mov r4d, 2 | |
3258 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
3259 | mov r6, r0 | |
3260 | mova m7, [pw_1024] | |
3261 | ||
3262 | .loop: | |
3263 | movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3264 | palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
3265 | punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
3266 | ||
3267 | pmaddubsw m4, m3, [r3 - 14 * 16] ; [2] | |
3268 | pmulhrsw m4, m7 | |
3269 | pmaddubsw m0, m3, [r3 - 12 * 16] ; [4] | |
3270 | pmulhrsw m0, m7 | |
3271 | packuswb m4, m0 | |
3272 | ||
3273 | pmaddubsw m5, m3, [r3 - 10 * 16] ; [6] | |
3274 | pmulhrsw m5, m7 | |
3275 | pmaddubsw m6, m3, [r3 - 8 * 16] ; [8] | |
3276 | pmulhrsw m6, m7 | |
3277 | packuswb m5, m6 | |
3278 | ||
3279 | pmaddubsw m6, m3, [r3 - 6 * 16] ; [10] | |
3280 | pmulhrsw m6, m7 | |
3281 | pmaddubsw m0, m3, [r3 - 4 * 16] ; [12] | |
3282 | pmulhrsw m0, m7 | |
3283 | packuswb m6, m0 | |
3284 | ||
3285 | pmaddubsw m1, m3, [r3 - 2 * 16] ; [14] | |
3286 | pmulhrsw m1, m7 | |
3287 | pmaddubsw m0, m3, [r3] ; [16] | |
3288 | pmulhrsw m0, m7 | |
3289 | packuswb m1, m0 | |
3290 | ||
3291 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
3292 | ||
3293 | pmaddubsw m4, m3, [r3 + 2 * 16] ; [18] | |
3294 | pmulhrsw m4, m7 | |
3295 | pmaddubsw m5, m3, [r3 + 4 * 16] ; [20] | |
3296 | pmulhrsw m5, m7 | |
3297 | packuswb m4, m5 | |
3298 | ||
3299 | pmaddubsw m5, m3, [r3 + 6 * 16] ; [22] | |
3300 | pmulhrsw m5, m7 | |
3301 | pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] | |
3302 | pmulhrsw m6, m7 | |
3303 | packuswb m5, m6 | |
3304 | ||
3305 | pmaddubsw m6, m3, [r3 + 10 * 16] ; [26] | |
3306 | pmulhrsw m6, m7 | |
3307 | pmaddubsw m1, m3, [r3 + 12 * 16] ; [28] | |
3308 | pmulhrsw m1, m7 | |
3309 | packuswb m6, m1 | |
3310 | ||
3311 | pmaddubsw m1, m3, [r3 + 14 * 16] ; [30] | |
3312 | pmulhrsw m1, m7 | |
3313 | packuswb m1, m1 | |
3314 | ||
; eight 8-byte rows: low/high qwords of m4, m5, m6, then m1 and raw m2
3315 | movh [r0 ], m4 | |
3316 | movhps [r0 + r1 ], m4 | |
3317 | movh [r0 + r1 * 2], m5 | |
3318 | movhps [r0 + r5 ], m5 | |
3319 | lea r0, [r0 + r1 * 4] | |
3320 | movh [r0 ], m6 | |
3321 | movhps [r0 + r1 ], m6 | |
3322 | movh [r0 + r1 * 2], m1 | |
3323 | movh [r0 + r5 ], m2 | |
3324 | ||
; next pass: destination 8 bytes to the right of the entry r0, source + 8
3325 | lea r0, [r6 + 8] | |
3326 | add r2, 8 | |
3327 | dec r4 | |
3328 | jnz .loop | |
3329 | ||
3330 | RET | |
3331 | ||
3332 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_10
; cglobal line: 6 arguments loaded, 6 general registers, 8 XMM registers.
;   r0 = destination, r1 = stride (r4 = 3 * r1)
;   r2 = source bytes, 16 loaded from [r2 + 1] into m0 (r2 is then reused as
;        a running row pointer)
;   r3 = second source, read only by the filter path
;   r5 = filter flag; only its low 16 bits (r5w) are tested
; Row k of the 16x16 output is byte k of m0 replicated 16 times: palignr (or
; movhlps for k = 8) moves byte k to lane 0 and pshufb with the all-zero mask
; m7 broadcasts lane 0. Rows 1..15 are stored as they are produced; row 0 is
; stored last at .quit.
; Filter path (flag != 0): row 0 becomes, per byte i,
;   saturate_u8( m0[0] + (([r3 + 1 + i] - [r3]) >> 1) )
; computed in 16-bit lanes (psubw / psraw / paddw) and packed with packuswb.
;-----------------------------------------------------------------------------
3333 | cglobal intra_pred_ang16_10, 6,6,8 | |
3334 | lea r4, [r1 * 3] | |
3335 | pxor m7, m7 | |
3336 | ||
3337 | movu m0, [r2 + 1] | |
3338 | palignr m1, m0, 1 | |
3339 | pshufb m1, m7 | |
3340 | palignr m2, m0, 2 | |
3341 | pshufb m2, m7 | |
3342 | palignr m3, m0, 3 | |
3343 | pshufb m3, m7 | |
3344 | palignr m4, m0, 4 | |
3345 | pshufb m4, m7 | |
3346 | palignr m5, m0, 5 | |
3347 | pshufb m5, m7 | |
3348 | palignr m6, m0, 6 | |
3349 | pshufb m6, m7 | |
3350 | ||
; rows 1..6
3351 | movu [r0 + r1], m1 | |
3352 | movu [r0 + r1 * 2], m2 | |
3353 | movu [r0 + r4], m3 | |
3354 | lea r2, [r0 + r1 * 4] | |
3355 | movu [r2], m4 | |
3356 | movu [r2 + r1], m5 | |
3357 | movu [r2 + r1 * 2], m6 | |
3358 | ||
3359 | palignr m1, m0, 7 | |
3360 | pshufb m1, m7 | |
3361 | movhlps m2, m0 | |
3362 | pshufb m2, m7 | |
3363 | palignr m3, m0, 9 | |
3364 | pshufb m3, m7 | |
3365 | palignr m4, m0, 10 | |
3366 | pshufb m4, m7 | |
3367 | palignr m5, m0, 11 | |
3368 | pshufb m5, m7 | |
3369 | palignr m6, m0, 12 | |
3370 | pshufb m6, m7 | |
3371 | ||
; rows 7..12
3372 | movu [r2 + r4], m1 | |
3373 | lea r2, [r2 + r1 * 4] | |
3374 | movu [r2], m2 | |
3375 | movu [r2 + r1], m3 | |
3376 | movu [r2 + r1 * 2], m4 | |
3377 | movu [r2 + r4], m5 | |
3378 | lea r2, [r2 + r1 * 4] | |
3379 | movu [r2], m6 | |
3380 | ||
3381 | palignr m1, m0, 13 | |
3382 | pshufb m1, m7 | |
3383 | palignr m2, m0, 14 | |
3384 | pshufb m2, m7 | |
3385 | palignr m3, m0, 15 | |
3386 | pshufb m3, m7 | |
; m0 itself is broadcast last: it now holds row 0 (byte 0 replicated)
3387 | pshufb m0, m7 | |
3388 | ||
; rows 13..15
3389 | movu [r2 + r1], m1 | |
3390 | movu [r2 + r1 * 2], m2 | |
3391 | movu [r2 + r4], m3 | |
3392 | ||
3393 | ; filter | |
3394 | cmp r5w, byte 0 | |
3395 | jz .quit | |
; widen row 0 to words in m0 (and a copy in m1 for the upper eight lanes)
3396 | pmovzxbw m0, m0 | |
3397 | mova m1, m0 | |
3398 | movu m2, [r3] | |
3399 | movu m3, [r3 + 1] | |
3400 | ||
; m2 = [r3] byte 0 in every word lane; m3 / m4 = low / high eight bytes of
; [r3 + 1 ...] widened to words
3401 | pshufb m2, m7 | |
3402 | pmovzxbw m2, m2 | |
3403 | movhlps m4, m3 | |
3404 | pmovzxbw m3, m3 | |
3405 | pmovzxbw m4, m4 | |
3406 | psubw m3, m2 | |
3407 | psubw m4, m2 | |
3408 | psraw m3, 1 | |
3409 | psraw m4, 1 | |
3410 | paddw m0, m3 | |
3411 | paddw m1, m4 | |
3412 | packuswb m0, m1 | |
3413 | ||
3414 | .quit: | |
; row 0: filtered if the branch above fell through, plain broadcast otherwise
3415 | movu [r0], m0 | |
3416 | ||
3417 | RET | |
3418 | ||
3419 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_26
;   r0 = destination, r1 = stride (r4 = 3 * r1)
;   r2 = second source, read only by the filter path
;   r3 = source bytes, 16 loaded from [r3 + 1] into m0 (r3 is then reused as
;        a row pointer)
;   bfilter = sixth argument. With ARCH_X86_64 it is fetched from r5mp into
;        r7 and tested as r7w; otherwise it is spilled to a dword at [rsp]
;        because r5 is needed as a row pointer.
;        NOTE(review): the trailing "0 - 4" on the 32-bit cglobal line is
;        presumably the x86inc stack-size request for that slot -- confirm
;        against x86inc.asm.
; All 16 output rows receive the same 16 bytes (m0). r0, r3, r5 and r6 point
; at rows 0, 4, 8 and 12.
; Filter path (bfilter != 0): byte 0 of row k is overwritten with
;   saturate_u8( m0[0] + (([r2 + 1 + k] - [r2]) >> 1) )
; computed in 16-bit lanes and written one byte at a time with pextrb.
;-----------------------------------------------------------------------------
3420 | %if ARCH_X86_64 == 1 | |
3421 | cglobal intra_pred_ang16_26, 4,8,5 | |
3422 | mov r7, r5mp | |
3423 | %define bfilter r7w | |
3424 | %else | |
3425 | cglobal intra_pred_ang16_26, 6,7,5,0 - 4 | |
3426 | %define bfilter dword[rsp] | |
3427 | mov bfilter, r5 | |
3428 | %endif | |
3429 | movu m0, [r3 + 1] | |
3430 | ||
3431 | lea r4, [r1 * 3] | |
3432 | lea r3, [r0 + r1 * 4] | |
3433 | lea r5, [r3 + r1 * 4] | |
3434 | lea r6, [r5 + r1 * 4] | |
3435 | ||
3436 | movu [r0], m0 | |
3437 | movu [r0 + r1], m0 | |
3438 | movu [r0 + r1 * 2], m0 | |
3439 | movu [r0 + r4], m0 | |
3440 | movu [r3], m0 | |
3441 | movu [r3 + r1], m0 | |
3442 | movu [r3 + r1 * 2], m0 | |
3443 | movu [r3 + r4], m0 | |
3444 | movu [r5], m0 | |
3445 | movu [r5 + r1], m0 | |
3446 | movu [r5 + r1 * 2], m0 | |
3447 | movu [r5 + r4], m0 | |
3448 | ||
3449 | movu [r6], m0 | |
3450 | movu [r6 + r1], m0 | |
3451 | movu [r6 + r1 * 2], m0 | |
3452 | movu [r6 + r4], m0 | |
3453 | ||
3454 | ; filter | |
3455 | cmp bfilter, byte 0 | |
3456 | jz .quit | |
3457 | ||
; m0 / m1 = byte 0 of the stored row, replicated and widened to words
3458 | pxor m4, m4 | |
3459 | pshufb m0, m4 | |
3460 | pmovzxbw m0, m0 | |
3461 | mova m1, m0 | |
3462 | movu m2, [r2] | |
3463 | movu m3, [r2 + 1] | |
3464 | ||
; m2 = [r2] byte 0 in every word lane; m3 / m4 = low / high eight bytes of
; [r2 + 1 ...] widened to words (m4 is no longer needed as the zero mask)
3465 | pshufb m2, m4 | |
3466 | pmovzxbw m2, m2 | |
3467 | movhlps m4, m3 | |
3468 | pmovzxbw m3, m3 | |
3469 | pmovzxbw m4, m4 | |
3470 | psubw m3, m2 | |
3471 | psubw m4, m2 | |
3472 | psraw m3, 1 | |
3473 | psraw m4, 1 | |
3474 | paddw m0, m3 | |
3475 | paddw m1, m4 | |
3476 | packuswb m0, m1 | |
3477 | ||
; lane k of m0 replaces the first byte of row k
3478 | pextrb [r0], m0, 0 | |
3479 | pextrb [r0 + r1], m0, 1 | |
3480 | pextrb [r0 + r1 * 2], m0, 2 | |
3481 | pextrb [r0 + r4], m0, 3 | |
3482 | pextrb [r3], m0, 4 | |
3483 | pextrb [r3 + r1], m0, 5 | |
3484 | pextrb [r3 + r1 * 2], m0, 6 | |
3485 | pextrb [r3 + r4], m0, 7 | |
3486 | pextrb [r5], m0, 8 | |
3487 | pextrb [r5 + r1], m0, 9 | |
3488 | pextrb [r5 + r1 * 2], m0, 10 | |
3489 | pextrb [r5 + r4], m0, 11 | |
3490 | pextrb [r6], m0, 12 | |
3491 | pextrb [r6 + r1], m0, 13 | |
3492 | pextrb [r6 + r1 * 2], m0, 14 | |
3493 | pextrb [r6 + r4], m0, 15 | |
3494 | ||
3495 | .quit: | |
3496 | RET | |
3497 | ||
3498 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_11
; cglobal line: 3 arguments loaded, 7 general registers, 8 XMM registers.
;   r1 = stride (r5 = 3 * r1, r6 = r0 + 4 * r1)
;   r2 = source bytes; each pass loads 16 of them from [r2] (offset 0, unlike
;        the [r2 + 1] loads of modes 8 and 9)
; Every result in a pass is the same eight interleaved byte pairs (m3)
; weighted by a different ang_table row, walking the even rows downwards from
; 30 to 2 (bracketed numbers), scaled with pmulhrsw by m7 and packed to bytes.
; The sixteenth result is the unweighted low qword of m2, a copy of the bytes
; loaded from [r2].
; The loop body runs twice (r4d = 2).
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; what it
; stores and which registers it modifies cannot be confirmed from here.
;-----------------------------------------------------------------------------
3499 | cglobal intra_pred_ang16_11, 3,7,8 | |
3500 | ||
3501 | lea r3, [ang_table + 16 * 16] | |
3502 | mov r4d, 2 | |
3503 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
3504 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
3505 | mova m7, [pw_1024] | |
3506 | ||
3507 | .loop: | |
3508 | movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
3509 | mova m2, m3 | |
3510 | palignr m1, m3, 1 ;[x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3511 | punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
3512 | ||
3513 | pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] | |
3514 | pmulhrsw m4, m7 | |
3515 | pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] | |
3516 | pmulhrsw m0, m7 | |
3517 | packuswb m4, m0 | |
3518 | ||
3519 | pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] | |
3520 | pmulhrsw m5, m7 | |
3521 | pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] | |
3522 | pmulhrsw m6, m7 | |
3523 | packuswb m5, m6 | |
3524 | ||
3525 | pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] | |
3526 | pmulhrsw m6, m7 | |
3527 | pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] | |
3528 | pmulhrsw m0, m7 | |
3529 | packuswb m6, m0 | |
3530 | ||
3531 | pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] | |
3532 | pmulhrsw m1, m7 | |
3533 | pmaddubsw m0, m3, [r3] ; [16] | |
3534 | pmulhrsw m0, m7 | |
3535 | packuswb m1, m0 | |
3536 | ||
3537 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
3538 | ||
3539 | pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] | |
3540 | pmulhrsw m4, m7 | |
3541 | pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] | |
3542 | pmulhrsw m5, m7 | |
3543 | packuswb m4, m5 | |
3544 | ||
3545 | pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] | |
3546 | pmulhrsw m5, m7 | |
3547 | pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] | |
3548 | pmulhrsw m6, m7 | |
3549 | packuswb m5, m6 | |
3550 | ||
3551 | pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] | |
3552 | pmulhrsw m6, m7 | |
3553 | pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] | |
3554 | pmulhrsw m1, m7 | |
3555 | packuswb m6, m1 | |
3556 | ||
3557 | pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] | |
3558 | pmulhrsw m1, m7 | |
3559 | packuswb m1, m1 | |
3560 | punpcklqdq m1, m2 ;[00] | |
3561 | ||
3562 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
3563 | ||
; next pass: r0 is reloaded from r6 + 4 * stride, r6 moves on by 8 * stride,
; and the source pointer moves on by 8 bytes
3564 | lea r0, [r6 + r1 * 4] | |
3565 | lea r6, [r6 + r1 * 8] | |
3566 | add r2, 8 | |
3567 | dec r4 | |
3568 | jnz .loop | |
3569 | ||
3570 | RET | |
3571 | ||
3572 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_25
; Same table rows as intra_pred_ang16_11 (even rows 30 down to 2 of
; ang_table, then an unweighted copy), but:
;   * r2 is replaced by the fourth argument (r3mp) before any load;
;   * the first eight results of a pass go through TRANSPOSE_STORE_8x8 with a
;     second argument of 0, the last eight are written here with movh/movhps
;     as 8-byte rows at r0, r0 + r1, ... (r5 = 3 * r1);
;   * the final row of a pass is the low qword of m2, the bytes loaded from
;     [r2];
;   * between the two passes r0 is set to r6 + 8 (r6 = r0 on entry).
; NOTE(review): the explicit stores rely on the value TRANSPOSE_STORE_8x8
; leaves in r0; that macro is defined outside this chunk -- confirm there.
;-----------------------------------------------------------------------------
3573 | cglobal intra_pred_ang16_25, 3,7,8 | |
3574 | mov r2, r3mp | |
3575 | lea r3, [ang_table + 16 * 16] | |
3576 | mov r4d, 2 | |
3577 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
3578 | mov r6, r0 | |
3579 | mova m7, [pw_1024] | |
3580 | ||
3581 | .loop: | |
3582 | movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
3583 | mova m2, m3 | |
3584 | palignr m1, m3, 1 ;[x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3585 | punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
3586 | ||
3587 | pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] | |
3588 | pmulhrsw m4, m7 | |
3589 | pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] | |
3590 | pmulhrsw m0, m7 | |
3591 | packuswb m4, m0 | |
3592 | ||
3593 | pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] | |
3594 | pmulhrsw m5, m7 | |
3595 | pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] | |
3596 | pmulhrsw m6, m7 | |
3597 | packuswb m5, m6 | |
3598 | ||
3599 | pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] | |
3600 | pmulhrsw m6, m7 | |
3601 | pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] | |
3602 | pmulhrsw m0, m7 | |
3603 | packuswb m6, m0 | |
3604 | ||
3605 | pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] | |
3606 | pmulhrsw m1, m7 | |
3607 | pmaddubsw m0, m3, [r3] ; [16] | |
3608 | pmulhrsw m0, m7 | |
3609 | packuswb m1, m0 | |
3610 | ||
3611 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
3612 | ||
3613 | pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] | |
3614 | pmulhrsw m4, m7 | |
3615 | pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] | |
3616 | pmulhrsw m5, m7 | |
3617 | packuswb m4, m5 | |
3618 | ||
3619 | pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] | |
3620 | pmulhrsw m5, m7 | |
3621 | pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] | |
3622 | pmulhrsw m6, m7 | |
3623 | packuswb m5, m6 | |
3624 | ||
3625 | pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] | |
3626 | pmulhrsw m6, m7 | |
3627 | pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] | |
3628 | pmulhrsw m1, m7 | |
3629 | packuswb m6, m1 | |
3630 | ||
3631 | pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] | |
3632 | pmulhrsw m1, m7 | |
3633 | packuswb m1, m1 | |
3634 | ||
; eight 8-byte rows: low/high qwords of m4, m5, m6, then m1 and raw m2
3635 | movh [r0 ], m4 | |
3636 | movhps [r0 + r1 ], m4 | |
3637 | movh [r0 + r1 * 2], m5 | |
3638 | movhps [r0 + r5 ], m5 | |
3639 | lea r0, [r0 + r1 * 4] | |
3640 | movh [r0 ], m6 | |
3641 | movhps [r0 + r1 ], m6 | |
3642 | movh [r0 + r1 * 2], m1 | |
3643 | movh [r0 + r5 ], m2 | |
3644 | ||
; next pass: destination 8 bytes to the right of the entry r0, source + 8
3645 | lea r0, [r6 + 8] | |
3646 | add r2, 8 | |
3647 | dec r4 | |
3648 | jnz .loop | |
3649 | ||
3650 | RET | |
3651 | ||
3652 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_12
; cglobal line: 4 arguments loaded, 7 general registers, 8 XMM registers.
;   r1 = stride (r5 = 3 * r1, r6 = r0 + 4 * r1)
;   r2 = main source bytes ([r2] for the first half, [r2 + 1] for the second)
;   r3 = secondary source; 16 bytes are loaded and reordered by the
;        c_mode16_12 shuffle mask (defined outside this chunk)
; r4 = ang_table + 16 * 16, so [r4 + k * 16] selects table row 16 + k (the
; bracketed number). Table rows used per half: 27 22 17 12 7 2 | 29 24 19 14
; 9 4 | 31 26 21 16.
; Straight-line code, two halves. At each "|" above the byte-pair vector is
; moved back by one source byte with palignr, pulling the next byte in from
; the top of m2.
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; what it
; stores and which registers it modifies cannot be confirmed from here.
;-----------------------------------------------------------------------------
3653 | cglobal intra_pred_ang16_12, 4,7,8 | |
3654 | ||
3655 | lea r4, [ang_table + 16 * 16] | |
3656 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
3657 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
3658 | mova m7, [pw_1024] | |
3659 | ||
3660 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
3661 | punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
3662 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
3663 | movu m2, [r3] | |
3664 | pshufb m2, [c_mode16_12] | |
3665 | ||
3666 | palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
3667 | ||
3668 | pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] | |
3669 | pmulhrsw m4, m7 | |
3670 | pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] | |
3671 | pmulhrsw m1, m7 | |
3672 | packuswb m4, m1 | |
3673 | ||
3674 | pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] | |
3675 | pmulhrsw m5, m7 | |
3676 | pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] | |
3677 | pmulhrsw m6, m7 | |
3678 | packuswb m5, m6 | |
3679 | ||
3680 | pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] | |
3681 | pmulhrsw m6, m7 | |
3682 | pmaddubsw m0, [r4 - 14 * 16] ; [2] | |
3683 | pmulhrsw m0, m7 | |
3684 | packuswb m6, m0 | |
3685 | ||
; m3 = [0 0 1 1 ...] preceded by the top byte of m2: pairs (m2[15],0),(0,1)...
3686 | palignr m3, m2, 15 | |
3687 | ||
3688 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
3689 | pmulhrsw m1, m7 | |
3690 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
3691 | pmulhrsw m0, m7 | |
3692 | packuswb m1, m0 | |
3693 | ||
3694 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
3695 | ||
3696 | pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
3697 | pmulhrsw m4, m7 | |
3698 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
3699 | pmulhrsw m5, m7 | |
3700 | packuswb m4, m5 | |
3701 | ||
3702 | pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] | |
3703 | pmulhrsw m5, m7 | |
3704 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
3705 | pmulhrsw m6, m7 | |
3706 | packuswb m5, m6 | |
3707 | ||
; move back one more byte: pairs now start at (m2[14], m2[15])
3708 | palignr m3, m2, 14 | |
3709 | ||
3710 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
3711 | pmulhrsw m6, m7 | |
3712 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
3713 | pmulhrsw m1, m7 | |
3714 | packuswb m6, m1 | |
3715 | ||
3716 | pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] | |
3717 | pmulhrsw m1, m7 | |
3718 | pmaddubsw m3, [r4] ; [16] | |
3719 | pmulhrsw m3, m7 | |
3720 | packuswb m1, m3 | |
3721 | ||
3722 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
3723 | ||
; second half: r0 is reloaded from r6 + 4 * stride, r6 moves on by 8 * stride
3724 | lea r0, [r6 + r1 * 4] | |
3725 | lea r6, [r6 + r1 * 8] | |
3726 | ||
3727 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3728 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
3729 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
3730 | movlhps m2, m1 ; high qword = [8 7 6 5 4 3 2 1], low qword of m2 kept | |
3731 | ||
3732 | pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] | |
3733 | pmulhrsw m4, m7 | |
3734 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
3735 | pmulhrsw m5, m7 | |
3736 | packuswb m4, m5 | |
3737 | ||
3738 | pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] | |
3739 | pmulhrsw m5, m7 | |
3740 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
3741 | pmulhrsw m6, m7 | |
3742 | packuswb m5, m6 | |
3743 | ||
3744 | pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] | |
3745 | pmulhrsw m6, m7 | |
3746 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] | |
3747 | pmulhrsw m0, m7 | |
3748 | packuswb m6, m0 | |
3749 | ||
3750 | palignr m3, m2, 14 | |
3751 | ||
3752 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
3753 | pmulhrsw m1, m7 | |
3754 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
3755 | pmulhrsw m0, m7 | |
3756 | packuswb m1, m0 | |
3757 | ||
3758 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
3759 | ||
3760 | pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
3761 | pmulhrsw m4, m7 | |
3762 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
3763 | pmulhrsw m5, m7 | |
3764 | packuswb m4, m5 | |
3765 | ||
3766 | pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] | |
3767 | pmulhrsw m5, m7 | |
3768 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
3769 | pmulhrsw m6, m7 | |
3770 | packuswb m5, m6 | |
3771 | ||
; shift m2 up one byte so the next older byte sits in lanes 14/15
3772 | pslldq m2, 1 | |
3773 | palignr m3, m2, 14 | |
3774 | ||
3775 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
3776 | pmulhrsw m6, m7 | |
3777 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
3778 | pmulhrsw m1, m7 | |
3779 | packuswb m6, m1 | |
3780 | ||
3781 | pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] | |
3782 | pmulhrsw m1, m7 | |
3783 | pmaddubsw m3, [r4] ; [16] | |
3784 | pmulhrsw m3, m7 | |
3785 | packuswb m1, m3 | |
3786 | ||
3787 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
3788 | ||
3789 | RET | |
3790 | ||
3791 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_24
; Same table rows and register schedule as intra_pred_ang16_12, with three
; visible differences:
;   * the roles of r2 and r3 are swapped: the main source is [r3] / [r3 + 1]
;     and the bytes reordered by c_mode16_12 come from [r2];
;   * TRANSPOSE_STORE_8x8 is invoked with a second argument of 0 instead of 1;
;   * for the second half r0 is set to r6 + 8 (r6 = r0 on entry) instead of
;     moving down by 8 * stride.
; r1 = stride, r5 = 3 * r1, r4 = ang_table + 16 * 16 (bracketed numbers are
; the table row 16 + k), m7 = pw_1024 scale for pmulhrsw.
; NOTE(review): TRANSPOSE_STORE_8x8 and c_mode16_12 are defined outside this
; chunk; their exact behaviour/contents cannot be confirmed from here.
;-----------------------------------------------------------------------------
3792 | cglobal intra_pred_ang16_24, 4,7,8 | |
3793 | ||
3794 | lea r4, [ang_table + 16 * 16] | |
3795 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
3796 | mov r6, r0 | |
3797 | mova m7, [pw_1024] | |
3798 | ||
3799 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
3800 | punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
3801 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
3802 | movu m2, [r2] | |
3803 | pshufb m2, [c_mode16_12] | |
3804 | ||
3805 | palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
3806 | ||
3807 | pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] | |
3808 | pmulhrsw m4, m7 | |
3809 | pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] | |
3810 | pmulhrsw m1, m7 | |
3811 | packuswb m4, m1 | |
3812 | ||
3813 | pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] | |
3814 | pmulhrsw m5, m7 | |
3815 | pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] | |
3816 | pmulhrsw m6, m7 | |
3817 | packuswb m5, m6 | |
3818 | ||
3819 | pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] | |
3820 | pmulhrsw m6, m7 | |
3821 | pmaddubsw m0, [r4 - 14 * 16] ; [2] | |
3822 | pmulhrsw m0, m7 | |
3823 | packuswb m6, m0 | |
3824 | ||
; m3 = [0 0 1 1 ...] preceded by the top byte of m2: pairs (m2[15],0),(0,1)...
3825 | palignr m3, m2, 15 | |
3826 | ||
3827 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
3828 | pmulhrsw m1, m7 | |
3829 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
3830 | pmulhrsw m0, m7 | |
3831 | packuswb m1, m0 | |
3832 | ||
3833 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
3834 | ||
3835 | pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
3836 | pmulhrsw m4, m7 | |
3837 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
3838 | pmulhrsw m5, m7 | |
3839 | packuswb m4, m5 | |
3840 | ||
3841 | pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] | |
3842 | pmulhrsw m5, m7 | |
3843 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
3844 | pmulhrsw m6, m7 | |
3845 | packuswb m5, m6 | |
3846 | ||
; move back one more byte: pairs now start at (m2[14], m2[15])
3847 | palignr m3, m2, 14 | |
3848 | ||
3849 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
3850 | pmulhrsw m6, m7 | |
3851 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
3852 | pmulhrsw m1, m7 | |
3853 | packuswb m6, m1 | |
3854 | ||
3855 | pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] | |
3856 | pmulhrsw m1, m7 | |
3857 | pmaddubsw m3, [r4] ; [16] | |
3858 | pmulhrsw m3, m7 | |
3859 | packuswb m1, m3 | |
3860 | ||
3861 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
3862 | ||
; second half: destination 8 bytes to the right of the entry r0
3863 | lea r0, [r6 + 8] | |
3864 | ||
3865 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
3866 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
3867 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
3868 | movlhps m2, m1 ; high qword = [8 7 6 5 4 3 2 1], low qword of m2 kept | |
3869 | ||
3870 | pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] | |
3871 | pmulhrsw m4, m7 | |
3872 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
3873 | pmulhrsw m5, m7 | |
3874 | packuswb m4, m5 | |
3875 | ||
3876 | pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] | |
3877 | pmulhrsw m5, m7 | |
3878 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
3879 | pmulhrsw m6, m7 | |
3880 | packuswb m5, m6 | |
3881 | ||
3882 | pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] | |
3883 | pmulhrsw m6, m7 | |
3884 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] | |
3885 | pmulhrsw m0, m7 | |
3886 | packuswb m6, m0 | |
3887 | ||
3888 | palignr m3, m2, 14 | |
3889 | ||
3890 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
3891 | pmulhrsw m1, m7 | |
3892 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
3893 | pmulhrsw m0, m7 | |
3894 | packuswb m1, m0 | |
3895 | ||
3896 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
3897 | ||
3898 | pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
3899 | pmulhrsw m4, m7 | |
3900 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
3901 | pmulhrsw m5, m7 | |
3902 | packuswb m4, m5 | |
3903 | ||
3904 | pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] | |
3905 | pmulhrsw m5, m7 | |
3906 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
3907 | pmulhrsw m6, m7 | |
3908 | packuswb m5, m6 | |
3909 | ||
; shift m2 up one byte so the next older byte sits in lanes 14/15
3910 | pslldq m2, 1 | |
3911 | palignr m3, m2, 14 | |
3912 | ||
3913 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
3914 | pmulhrsw m6, m7 | |
3915 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
3916 | pmulhrsw m1, m7 | |
3917 | packuswb m6, m1 | |
3918 | ||
3919 | pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] | |
3920 | pmulhrsw m1, m7 | |
3921 | pmaddubsw m3, [r4] ; [16] | |
3922 | pmulhrsw m3, m7 | |
3923 | packuswb m1, m3 | |
3924 | ||
3925 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
3926 | ||
3927 | RET | |
3928 | ||
3929 | INIT_XMM sse4 | |
;-----------------------------------------------------------------------------
; intra_pred_ang16_13
; cglobal line: 4 arguments loaded, 7 general registers, 8 XMM registers.
;   r1 = stride (r5 = 3 * r1, r6 = r0 + 4 * r1)
;   r2 = main source bytes ([r2] for the first half, [r2 + 1] for the second)
;   r3 = secondary source; 16 bytes are loaded and reordered by the
;        c_mode16_13 shuffle mask (defined outside this chunk)
; r4 = ang_table + 16 * 16, so [r4 + k * 16] selects table row 16 + k (the
; bracketed number). Table rows used per half: 23 14 5 | 28 19 10 1 | 24 15 6
; | 29 20 11 2 | 25 16.
; Straight-line code, two halves. At each "|" above the byte-pair vector is
; moved back by one source byte with palignr, pulling the next byte in from
; the top of m2 (m2 is shifted up with pslldq between steps).
; NOTE(review): TRANSPOSE_STORE_8x8 is defined outside this chunk; what it
; stores and which registers it modifies cannot be confirmed from here.
;-----------------------------------------------------------------------------
3930 | cglobal intra_pred_ang16_13, 4,7,8 | |
3931 | ||
3932 | lea r4, [ang_table + 16 * 16] | |
3933 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
3934 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
3935 | mova m7, [pw_1024] | |
3936 | ||
3937 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
3938 | punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
3939 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
3940 | movu m2, [r3] | |
3941 | pshufb m2, [c_mode16_13] | |
3942 | ||
3943 | palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
3944 | ||
3945 | pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] | |
3946 | pmulhrsw m4, m7 | |
3947 | pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] | |
3948 | pmulhrsw m0, m7 | |
3949 | packuswb m4, m0 | |
3950 | ||
3951 | pmaddubsw m5, [r4 - 11 * 16] ; [05] | |
3952 | pmulhrsw m5, m7 | |
3953 | ||
; m3 = [0 0 1 1 ...] preceded by the top byte of m2: pairs (m2[15],0),(0,1)...
3954 | palignr m3, m2, 15 | |
3955 | ||
3956 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
3957 | pmulhrsw m6, m7 | |
3958 | packuswb m5, m6 | |
3959 | ||
3960 | pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] | |
3961 | pmulhrsw m6, m7 | |
3962 | pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] | |
3963 | pmulhrsw m0, m7 | |
3964 | packuswb m6, m0 | |
3965 | ||
3966 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
3967 | pmulhrsw m1, m7 | |
3968 | ||
; move back one more byte: pairs now start at (m2[14], m2[15])
3969 | palignr m3, m2, 14 | |
3970 | ||
3971 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
3972 | pmulhrsw m0, m7 | |
3973 | packuswb m1, m0 | |
3974 | ||
3975 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
3976 | ||
3977 | pmaddubsw m4, m3, [r4 - 16] ; [15] | |
3978 | pmulhrsw m4, m7 | |
3979 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
3980 | pmulhrsw m5, m7 | |
3981 | packuswb m4, m5 | |
3982 | ||
; shift m2 up one byte so the next older byte sits in lanes 14/15
3983 | pslldq m2, 1 | |
3984 | palignr m3, m2, 14 | |
3985 | ||
3986 | pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] | |
3987 | pmulhrsw m5, m7 | |
3988 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
3989 | pmulhrsw m6, m7 | |
3990 | packuswb m5, m6 | |
3991 | ||
3992 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
3993 | pmulhrsw m6, m7 | |
3994 | pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] | |
3995 | pmulhrsw m1, m7 | |
3996 | packuswb m6, m1 | |
3997 | ||
3998 | pslldq m2, 1 | |
3999 | palignr m3, m2, 14 | |
4000 | ||
4001 | pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] | |
4002 | pmulhrsw m1, m7 | |
4003 | pmaddubsw m3, [r4] ; [16] | |
4004 | pmulhrsw m3, m7 | |
4005 | packuswb m1, m3 | |
4006 | ||
4007 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
4008 | ||
; second half: r0 is reloaded from r6 + 4 * stride, r6 moves on by 8 * stride
4009 | lea r0, [r6 + r1 * 4] | |
4010 | lea r6, [r6 + r1 * 8] | |
4011 | ||
4012 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
4013 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
4014 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
4015 | movlhps m2, m1 ; high qword = [8 7 6 5 4 3 2 1], low qword of m2 kept | |
4016 | ||
4017 | pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] | |
4018 | pmulhrsw m4, m7 | |
4019 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
4020 | pmulhrsw m5, m7 | |
4021 | packuswb m4, m5 | |
4022 | ||
4023 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
4024 | pmulhrsw m5, m7 | |
4025 | ||
4026 | palignr m3, m2, 14 | |
4027 | ||
4028 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
4029 | pmulhrsw m6, m7 | |
4030 | packuswb m5, m6 | |
4031 | ||
4032 | pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] | |
4033 | pmulhrsw m6, m7 | |
4034 | pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] | |
4035 | pmulhrsw m0, m7 | |
4036 | packuswb m6, m0 | |
4037 | ||
4038 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
4039 | pmulhrsw m1, m7 | |
4040 | ||
4041 | pslldq m2, 1 | |
4042 | palignr m3, m2, 14 | |
4043 | ||
4044 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4045 | pmulhrsw m0, m7 | |
4046 | packuswb m1, m0 | |
4047 | ||
4048 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
4049 | ||
4050 | pmaddubsw m4, m3, [r4 - 16] ; [15] | |
4051 | pmulhrsw m4, m7 | |
4052 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
4053 | pmulhrsw m5, m7 | |
4054 | packuswb m4, m5 | |
4055 | ||
4056 | pslldq m2, 1 | |
4057 | palignr m3, m2, 14 | |
4058 | ||
4059 | pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] | |
4060 | pmulhrsw m5, m7 | |
4061 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
4062 | pmulhrsw m6, m7 | |
4063 | packuswb m5, m6 | |
4064 | ||
4065 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
4066 | pmulhrsw m6, m7 | |
4067 | pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] | |
4068 | pmulhrsw m1, m7 | |
4069 | packuswb m6, m1 | |
4070 | ||
4071 | pslldq m2, 1 | |
4072 | palignr m3, m2, 14 | |
4073 | ||
4074 | pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] | |
4075 | pmulhrsw m1, m7 | |
4076 | pmaddubsw m3, [r4] ; [16] | |
4077 | pmulhrsw m3, m7 | |
4078 | packuswb m1, m3 | |
4079 | ||
4080 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
4081 | ||
4082 | RET | |
4083 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang16_23 (INIT_XMM sse4)
; Going by its name, the 16x16 angular intra predictor for mode 23.
; cglobal 4,7,8 follows the x86inc.asm convention: 4 arguments (r0-r3),
; 7 general purpose registers (r0-r6) and 8 vector registers (m0-m7).
;
; Register roles visible in this body:
;   r0  output pointer consumed by TRANSPOSE_STORE_8x8 (the stores live in
;       that macro, which is defined outside this chunk)
;   r1  stride; r5 = 3 * stride
;   r3  main reference samples, read at [r3] and [r3 + 1]
;   r2  second reference array, reordered through c_mode16_13 into m2
;       NOTE(review): presumably r2 = left and r3 = above neighbours -
;       confirm against the C prototype
;   r4  ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;       bracketed number after each pmaddubsw is that entry index
;   r6  copy of r0 taken at entry, used to form r0 for the second half
;   m7  pw_1024; NOTE(review): pmulhrsw by 1024 would be a rounding shift
;       right by 5 - the constant is defined elsewhere, confirm
;
; Pattern: m3 holds byte-interleaved neighbouring reference samples. Each
; result is pmaddubsw(m3, table entry) followed by pmulhrsw(m7), and two
; results are merged with packuswb. palignr m3, m2, 15 or 14 slides m3 up by
; one or two bytes while pulling the top bytes of m2 in at the bottom;
; pslldq m2, 1 moves the next m2 byte to the top for the following palignr.
; Four packed registers (eight result vectors) go to each
; TRANSPOSE_STORE_8x8 call (second macro argument is 0 throughout); the body
; makes four such calls, two per half.
;-----------------------------------------------------------------------------
4084 | INIT_XMM sse4 | |
4085 | cglobal intra_pred_ang16_23, 4,7,8 | |
4086 | |
4087 | lea r4, [ang_table + 16 * 16] | |
4088 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
4089 | mov r6, r0 | |
4090 | mova m7, [pw_1024] | |
4091 | |
4092 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4093 | punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
4094 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
4095 | movu m2, [r2] | |
4096 | pshufb m2, [c_mode16_13] | |
4097 | |
4098 | palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4099 | |
4100 | pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] | |
4101 | pmulhrsw m4, m7 | |
4102 | pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] | |
4103 | pmulhrsw m0, m7 | |
4104 | packuswb m4, m0 | |
4105 | |
4106 | pmaddubsw m5, [r4 - 11 * 16] ; [05] | |
4107 | pmulhrsw m5, m7 | |
4108 | |
4109 | palignr m3, m2, 15 | |
4110 | |
4111 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
4112 | pmulhrsw m6, m7 | |
4113 | packuswb m5, m6 | |
4114 | |
4115 | pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] | |
4116 | pmulhrsw m6, m7 | |
4117 | pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] | |
4118 | pmulhrsw m0, m7 | |
4119 | packuswb m6, m0 | |
4120 | |
4121 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
4122 | pmulhrsw m1, m7 | |
4123 | |
4124 | palignr m3, m2, 14 | |
4125 | |
4126 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4127 | pmulhrsw m0, m7 | |
4128 | packuswb m1, m0 | |
4129 | |
4130 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
4131 | |
4132 | pmaddubsw m4, m3, [r4 - 16] ; [15] | |
4133 | pmulhrsw m4, m7 | |
4134 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
4135 | pmulhrsw m5, m7 | |
4136 | packuswb m4, m5 | |
4137 | |
4138 | pslldq m2, 1 | |
4139 | palignr m3, m2, 14 | |
4140 | |
4141 | pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] | |
4142 | pmulhrsw m5, m7 | |
4143 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
4144 | pmulhrsw m6, m7 | |
4145 | packuswb m5, m6 | |
4146 | |
4147 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
4148 | pmulhrsw m6, m7 | |
4149 | pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] | |
4150 | pmulhrsw m1, m7 | |
4151 | packuswb m6, m1 | |
4152 | |
4153 | pslldq m2, 1 | |
4154 | palignr m3, m2, 14 | |
4155 | |
4156 | pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] | |
4157 | pmulhrsw m1, m7 | |
4158 | pmaddubsw m3, [r4] ; [16] | |
4159 | pmulhrsw m3, m7 | |
4160 | packuswb m1, m3 | |
4161 | |
4162 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
4163 | |
; ---- second half: r0 = r6 + 8, main reference re-read from r3 + 1 ----
; movlhps below copies the low 8 bytes of m1 into the high 8 bytes of m2, so
; the following palignr/pslldq steps keep feeding m3 from the main reference.
4164 | lea r0, [r6 + 8] | |
4165 | |
4166 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
4167 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
4168 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
4169 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] | |
4170 | |
4171 | pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] | |
4172 | pmulhrsw m4, m7 | |
4173 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
4174 | pmulhrsw m5, m7 | |
4175 | packuswb m4, m5 | |
4176 | |
4177 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
4178 | pmulhrsw m5, m7 | |
4179 | |
4180 | palignr m3, m2, 14 | |
4181 | |
4182 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
4183 | pmulhrsw m6, m7 | |
4184 | packuswb m5, m6 | |
4185 | |
4186 | pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] | |
4187 | pmulhrsw m6, m7 | |
4188 | pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] | |
4189 | pmulhrsw m0, m7 | |
4190 | packuswb m6, m0 | |
4191 | |
4192 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
4193 | pmulhrsw m1, m7 | |
4194 | |
4195 | pslldq m2, 1 | |
4196 | palignr m3, m2, 14 | |
4197 | |
4198 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4199 | pmulhrsw m0, m7 | |
4200 | packuswb m1, m0 | |
4201 | |
4202 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
4203 | |
4204 | pmaddubsw m4, m3, [r4 - 16] ; [15] | |
4205 | pmulhrsw m4, m7 | |
4206 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
4207 | pmulhrsw m5, m7 | |
4208 | packuswb m4, m5 | |
4209 | |
4210 | pslldq m2, 1 | |
4211 | palignr m3, m2, 14 | |
4212 | |
4213 | pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] | |
4214 | pmulhrsw m5, m7 | |
4215 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
4216 | pmulhrsw m6, m7 | |
4217 | packuswb m5, m6 | |
4218 | |
4219 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
4220 | pmulhrsw m6, m7 | |
4221 | pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] | |
4222 | pmulhrsw m1, m7 | |
4223 | packuswb m6, m1 | |
4224 | |
4225 | pslldq m2, 1 | |
4226 | palignr m3, m2, 14 | |
4227 | |
4228 | pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] | |
4229 | pmulhrsw m1, m7 | |
4230 | pmaddubsw m3, [r4] ; [16] | |
4231 | pmulhrsw m3, m7 | |
4232 | packuswb m1, m3 | |
4233 | |
4234 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
4235 | |
4236 | RET | |
4237 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang16_14 (INIT_XMM sse4)
; Going by its name, the 16x16 angular intra predictor for mode 14.
; cglobal 4,7,8 follows the x86inc.asm convention: 4 arguments (r0-r3),
; 7 general purpose registers (r0-r6) and 8 vector registers (m0-m7).
;
; Register roles visible in this body:
;   r0  output pointer consumed by TRANSPOSE_STORE_8x8 (the stores live in
;       that macro, which is defined outside this chunk)
;   r1  stride; r5 = 3 * stride
;   r2  main reference samples, read at [r2] and [r2 + 1]
;   r3  second reference array, reordered through c_mode16_14 into m2
;       NOTE(review): presumably r2 = left and r3 = above neighbours -
;       confirm against the C prototype
;   r4  ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;       bracketed number after each pmaddubsw is that entry index
;   r6  r0 + 4 * stride; advanced by 8 * stride before the second half
;   m7  pw_1024; NOTE(review): pmulhrsw by 1024 would be a rounding shift
;       right by 5 - the constant is defined elsewhere, confirm
;
; Pattern: m3 holds byte-interleaved neighbouring reference samples. Each
; result is pmaddubsw(m3, table entry) followed by pmulhrsw(m7), and two
; results are merged with packuswb. palignr m3, m2, 15 or 14 slides m3 up by
; one or two bytes while pulling the top bytes of m2 in at the bottom;
; pslldq m2, 1 moves the next m2 byte to the top for the following palignr.
; Four packed registers (eight result vectors) go to each
; TRANSPOSE_STORE_8x8 call (second macro argument is 1 throughout,
; presumably requesting a transposed store - confirm in the macro); the body
; makes four such calls, two per half.
;-----------------------------------------------------------------------------
4238 | INIT_XMM sse4 | |
4239 | cglobal intra_pred_ang16_14, 4,7,8 | |
4240 | |
4241 | lea r4, [ang_table + 16 * 16] | |
4242 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
4243 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
4244 | mova m7, [pw_1024] | |
4245 | |
4246 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4247 | punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
4248 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
4249 | movu m2, [r3] | |
4250 | pshufb m2, [c_mode16_14] | |
4251 | |
4252 | palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4253 | |
4254 | pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] | |
4255 | pmulhrsw m4, m7 | |
4256 | pmaddubsw m5, [r4 - 10 * 16] ; [06] | |
4257 | pmulhrsw m5, m7 | |
4258 | packuswb m4, m5 | |
4259 | |
4260 | palignr m3, m2, 15 | |
4261 | |
4262 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
4263 | pmulhrsw m5, m7 | |
4264 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
4265 | pmulhrsw m6, m7 | |
4266 | packuswb m5, m6 | |
4267 | |
4268 | palignr m3, m2, 14 | |
4269 | |
4270 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
4271 | pmulhrsw m6, m7 | |
4272 | pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] | |
4273 | pmulhrsw m0, m7 | |
4274 | packuswb m6, m0 | |
4275 | |
4276 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
4277 | pmulhrsw m1, m7 | |
4278 | |
4279 | pslldq m2, 1 | |
4280 | palignr m3, m2, 14 | |
4281 | |
4282 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4283 | pmulhrsw m0, m7 | |
4284 | packuswb m1, m0 | |
4285 | |
4286 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
4287 | |
4288 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
4289 | pmulhrsw m4, m7 | |
4290 | |
4291 | pslldq m2, 1 | |
4292 | palignr m3, m2, 14 | |
4293 | |
4294 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
4295 | pmulhrsw m5, m7 | |
4296 | packuswb m4, m5 | |
4297 | |
4298 | pmaddubsw m5, m3, [r4 + 16] ; [17] | |
4299 | pmulhrsw m5, m7 | |
4300 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
4301 | pmulhrsw m6, m7 | |
4302 | packuswb m5, m6 | |
4303 | |
4304 | pslldq m2, 1 | |
4305 | palignr m3, m2, 14 | |
4306 | |
4307 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
4308 | pmulhrsw m6, m7 | |
4309 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
4310 | pmulhrsw m1, m7 | |
4311 | packuswb m6, m1 | |
4312 | |
4313 | pslldq m2, 1 | |
4314 | palignr m3, m2, 14 | |
4315 | |
4316 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
4317 | pmulhrsw m1, m7 | |
4318 | pmaddubsw m3, [r4] ; [16] | |
4319 | pmulhrsw m3, m7 | |
4320 | packuswb m1, m3 | |
4321 | |
4322 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
4323 | |
; ---- second half: r0 = r6 + 4 * stride, then r6 += 8 * stride; main reference re-read from r2 + 1 ----
; movlhps below copies the low 8 bytes of m1 into the high 8 bytes of m2, so
; the following palignr/pslldq steps keep feeding m3 from the main reference.
4324 | lea r0, [r6 + r1 * 4] | |
4325 | lea r6, [r6 + r1 * 8] | |
4326 | |
4327 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
4328 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
4329 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
4330 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] | |
4331 | |
4332 | pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
4333 | pmulhrsw m4, m7 | |
4334 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
4335 | pmulhrsw m5, m7 | |
4336 | packuswb m4, m5 | |
4337 | |
4338 | palignr m3, m2, 14 | |
4339 | |
4340 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
4341 | pmulhrsw m5, m7 | |
4342 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
4343 | pmulhrsw m6, m7 | |
4344 | packuswb m5, m6 | |
4345 | |
4346 | pslldq m2, 1 | |
4347 | palignr m3, m2, 14 | |
4348 | |
4349 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
4350 | pmulhrsw m6, m7 | |
4351 | pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] | |
4352 | pmulhrsw m0, m7 | |
4353 | packuswb m6, m0 | |
4354 | |
4355 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
4356 | pmulhrsw m1, m7 | |
4357 | |
4358 | pslldq m2, 1 | |
4359 | palignr m3, m2, 14 | |
4360 | |
4361 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4362 | pmulhrsw m0, m7 | |
4363 | packuswb m1, m0 | |
4364 | |
4365 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
4366 | |
4367 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
4368 | pmulhrsw m4, m7 | |
4369 | |
4370 | pslldq m2, 1 | |
4371 | palignr m3, m2, 14 | |
4372 | |
4373 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
4374 | pmulhrsw m5, m7 | |
4375 | packuswb m4, m5 | |
4376 | |
4377 | pmaddubsw m5, m3, [r4 + 16] ; [17] | |
4378 | pmulhrsw m5, m7 | |
4379 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
4380 | pmulhrsw m6, m7 | |
4381 | packuswb m5, m6 | |
4382 | |
4383 | pslldq m2, 1 | |
4384 | palignr m3, m2, 14 | |
4385 | |
4386 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
4387 | pmulhrsw m6, m7 | |
4388 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
4389 | pmulhrsw m1, m7 | |
4390 | packuswb m6, m1 | |
4391 | |
4392 | pslldq m2, 1 | |
4393 | palignr m3, m2, 14 | |
4394 | |
4395 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
4396 | pmulhrsw m1, m7 | |
4397 | pmaddubsw m3, [r4] ; [16] | |
4398 | pmulhrsw m3, m7 | |
4399 | packuswb m1, m3 | |
4400 | |
4401 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
4402 | |
4403 | RET | |
4404 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang16_22 (INIT_XMM sse4)
; Going by its name, the 16x16 angular intra predictor for mode 22.
; cglobal 4,7,8 follows the x86inc.asm convention: 4 arguments (r0-r3),
; 7 general purpose registers (r0-r6) and 8 vector registers (m0-m7).
;
; Register roles visible in this body:
;   r0  output pointer consumed by TRANSPOSE_STORE_8x8 (the stores live in
;       that macro, which is defined outside this chunk)
;   r1  stride; r5 = 3 * stride
;   r3  main reference samples, read at [r3] and [r3 + 1]
;   r2  second reference array, reordered through c_mode16_14 into m2
;       NOTE(review): presumably r2 = left and r3 = above neighbours -
;       confirm against the C prototype
;   r4  ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;       bracketed number after each pmaddubsw is that entry index
;   r6  copy of r0 taken at entry, used to form r0 for the second half
;   m7  pw_1024; NOTE(review): pmulhrsw by 1024 would be a rounding shift
;       right by 5 - the constant is defined elsewhere, confirm
;
; The table entries and the pslldq/palignr schedule are the same as in
; intra_pred_ang16_14; the differences are the swapped roles of r2 and r3,
; the r6/r0 handling, and the second TRANSPOSE_STORE_8x8 argument (0 here).
; Each result is pmaddubsw(m3, table entry) followed by pmulhrsw(m7), two
; results are merged with packuswb, and four packed registers go to each of
; the four TRANSPOSE_STORE_8x8 calls.
;-----------------------------------------------------------------------------
4405 | INIT_XMM sse4 | |
4406 | cglobal intra_pred_ang16_22, 4,7,8 | |
4407 | |
4408 | lea r4, [ang_table + 16 * 16] | |
4409 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
4410 | mov r6, r0 | |
4411 | mova m7, [pw_1024] | |
4412 | |
4413 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4414 | punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
4415 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
4416 | movu m2, [r2] | |
4417 | pshufb m2, [c_mode16_14] | |
4418 | |
4419 | palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4420 | |
4421 | pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] | |
4422 | pmulhrsw m4, m7 | |
4423 | pmaddubsw m5, [r4 - 10 * 16] ; [06] | |
4424 | pmulhrsw m5, m7 | |
4425 | packuswb m4, m5 | |
4426 | |
4427 | palignr m3, m2, 15 | |
4428 | |
4429 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
4430 | pmulhrsw m5, m7 | |
4431 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
4432 | pmulhrsw m6, m7 | |
4433 | packuswb m5, m6 | |
4434 | |
4435 | palignr m3, m2, 14 | |
4436 | |
4437 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
4438 | pmulhrsw m6, m7 | |
4439 | pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] | |
4440 | pmulhrsw m0, m7 | |
4441 | packuswb m6, m0 | |
4442 | |
4443 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
4444 | pmulhrsw m1, m7 | |
4445 | |
4446 | pslldq m2, 1 | |
4447 | palignr m3, m2, 14 | |
4448 | |
4449 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4450 | pmulhrsw m0, m7 | |
4451 | packuswb m1, m0 | |
4452 | |
4453 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
4454 | |
4455 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
4456 | pmulhrsw m4, m7 | |
4457 | |
4458 | pslldq m2, 1 | |
4459 | palignr m3, m2, 14 | |
4460 | |
4461 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
4462 | pmulhrsw m5, m7 | |
4463 | packuswb m4, m5 | |
4464 | |
4465 | pmaddubsw m5, m3, [r4 + 16] ; [17] | |
4466 | pmulhrsw m5, m7 | |
4467 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
4468 | pmulhrsw m6, m7 | |
4469 | packuswb m5, m6 | |
4470 | |
4471 | pslldq m2, 1 | |
4472 | palignr m3, m2, 14 | |
4473 | |
4474 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
4475 | pmulhrsw m6, m7 | |
4476 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
4477 | pmulhrsw m1, m7 | |
4478 | packuswb m6, m1 | |
4479 | |
4480 | pslldq m2, 1 | |
4481 | palignr m3, m2, 14 | |
4482 | |
4483 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
4484 | pmulhrsw m1, m7 | |
4485 | pmaddubsw m3, [r4] ; [16] | |
4486 | pmulhrsw m3, m7 | |
4487 | packuswb m1, m3 | |
4488 | |
4489 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
4490 | |
; ---- second half: r0 = r6 + 8, main reference re-read from r3 + 1 ----
; movlhps below copies the low 8 bytes of m1 into the high 8 bytes of m2, so
; the following palignr/pslldq steps keep feeding m3 from the main reference.
4491 | lea r0, [r6 + 8] | |
4492 | |
4493 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
4494 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
4495 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
4496 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x x] | |
4497 | |
4498 | pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] | |
4499 | pmulhrsw m4, m7 | |
4500 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] | |
4501 | pmulhrsw m5, m7 | |
4502 | packuswb m4, m5 | |
4503 | |
4504 | palignr m3, m2, 14 | |
4505 | |
4506 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
4507 | pmulhrsw m5, m7 | |
4508 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
4509 | pmulhrsw m6, m7 | |
4510 | packuswb m5, m6 | |
4511 | |
4512 | pslldq m2, 1 | |
4513 | palignr m3, m2, 14 | |
4514 | |
4515 | pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] | |
4516 | pmulhrsw m6, m7 | |
4517 | pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] | |
4518 | pmulhrsw m0, m7 | |
4519 | packuswb m6, m0 | |
4520 | |
4521 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
4522 | pmulhrsw m1, m7 | |
4523 | |
4524 | pslldq m2, 1 | |
4525 | palignr m3, m2, 14 | |
4526 | |
4527 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4528 | pmulhrsw m0, m7 | |
4529 | packuswb m1, m0 | |
4530 | |
4531 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
4532 | |
4533 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
4534 | pmulhrsw m4, m7 | |
4535 | |
4536 | pslldq m2, 1 | |
4537 | palignr m3, m2, 14 | |
4538 | |
4539 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
4540 | pmulhrsw m5, m7 | |
4541 | packuswb m4, m5 | |
4542 | |
4543 | pmaddubsw m5, m3, [r4 + 16] ; [17] | |
4544 | pmulhrsw m5, m7 | |
4545 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
4546 | pmulhrsw m6, m7 | |
4547 | packuswb m5, m6 | |
4548 | |
4549 | pslldq m2, 1 | |
4550 | palignr m3, m2, 14 | |
4551 | |
4552 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
4553 | pmulhrsw m6, m7 | |
4554 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
4555 | pmulhrsw m1, m7 | |
4556 | packuswb m6, m1 | |
4557 | |
4558 | pslldq m2, 1 | |
4559 | palignr m3, m2, 14 | |
4560 | |
4561 | pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] | |
4562 | pmulhrsw m1, m7 | |
4563 | pmaddubsw m3, [r4] ; [16] | |
4564 | pmulhrsw m3, m7 | |
4565 | packuswb m1, m3 | |
4566 | |
4567 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
4568 | |
4569 | RET | |
4570 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang16_15 (INIT_XMM sse4)
; Going by its name, the 16x16 angular intra predictor for mode 15.
; cglobal 4,7,8 follows the x86inc.asm convention: 4 arguments (r0-r3),
; 7 general purpose registers (r0-r6) and 8 vector registers (m0-m7).
;
; Register roles visible in this body:
;   r0  output pointer consumed by TRANSPOSE_STORE_8x8 (the stores live in
;       that macro, which is defined outside this chunk)
;   r1  stride; r5 = 3 * stride
;   r2  main reference samples, read at [r2] and [r2 + 1]
;   r3  second reference array, reordered through c_mode16_15 into m2
;       NOTE(review): presumably r2 = left and r3 = above neighbours -
;       confirm against the C prototype
;   r4  ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;       bracketed number after each pmaddubsw is that entry index
;   r6  r0 + 4 * stride; advanced by 8 * stride before the second half
;   m7  pw_1024; NOTE(review): pmulhrsw by 1024 would be a rounding shift
;       right by 5 - the constant is defined elsewhere, confirm
;
; Pattern: m3 holds byte-interleaved neighbouring reference samples. Each
; result is pmaddubsw(m3, table entry) followed by pmulhrsw(m7), and two
; results are merged with packuswb. palignr m3, m2, 15 or 14 slides m3 up by
; one or two bytes while pulling the top bytes of m2 in at the bottom;
; pslldq m2, 1 moves the next m2 byte to the top for the following palignr.
; In this mode m3 is advanced after almost every second result. The second
; half performs seven pslldq steps, so it also consumes byte 7 of m2, which
; movlhps leaves untouched (left over from the first half).
; Four packed registers go to each of the four TRANSPOSE_STORE_8x8 calls
; (second macro argument is 1 throughout, presumably requesting a
; transposed store - confirm in the macro).
;-----------------------------------------------------------------------------
4571 | INIT_XMM sse4 | |
4572 | cglobal intra_pred_ang16_15, 4,7,8 | |
4573 | |
4574 | lea r4, [ang_table + 16 * 16] | |
4575 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
4576 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
4577 | mova m7, [pw_1024] | |
4578 | |
4579 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4580 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
4581 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
4582 | movu m2, [r3] | |
4583 | pshufb m2, [c_mode16_15] | |
4584 | |
4585 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4586 | |
4587 | pmaddubsw m4, [r4 - 16] ; [15] | |
4588 | pmulhrsw m4, m7 | |
4589 | |
4590 | palignr m3, m2, 15 | |
4591 | |
4592 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
4593 | pmulhrsw m5, m7 | |
4594 | packuswb m4, m5 | |
4595 | |
4596 | pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] | |
4597 | pmulhrsw m5, m7 | |
4598 | |
4599 | palignr m3, m2, 14 | |
4600 | |
4601 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
4602 | pmulhrsw m6, m7 | |
4603 | packuswb m5, m6 | |
4604 | |
4605 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
4606 | pmulhrsw m6, m7 | |
4607 | |
4608 | pslldq m2, 1 | |
4609 | palignr m3, m2, 14 | |
4610 | |
4611 | pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] | |
4612 | pmulhrsw m0, m7 | |
4613 | packuswb m6, m0 | |
4614 | |
4615 | pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] | |
4616 | pmulhrsw m1, m7 | |
4617 | |
4618 | pslldq m2, 1 | |
4619 | palignr m3, m2, 14 | |
4620 | |
4621 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4622 | pmulhrsw m0, m7 | |
4623 | packuswb m1, m0 | |
4624 | |
4625 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
4626 | |
4627 | pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] | |
4628 | pmulhrsw m4, m7 | |
4629 | |
4630 | pslldq m2, 1 | |
4631 | palignr m3, m2, 14 | |
4632 | |
4633 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
4634 | pmulhrsw m5, m7 | |
4635 | packuswb m4, m5 | |
4636 | |
4637 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
4638 | pmulhrsw m5, m7 | |
4639 | |
4640 | pslldq m2, 1 | |
4641 | palignr m3, m2, 14 | |
4642 | |
4643 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
4644 | pmulhrsw m6, m7 | |
4645 | packuswb m5, m6 | |
4646 | |
4647 | pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] | |
4648 | pmulhrsw m6, m7 | |
4649 | |
4650 | pslldq m2, 1 | |
4651 | palignr m3, m2, 14 | |
4652 | |
4653 | pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] | |
4654 | pmulhrsw m1, m7 | |
4655 | packuswb m6, m1 | |
4656 | |
4657 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
4658 | pmulhrsw m1, m7 | |
4659 | |
4660 | pslldq m2, 1 | |
4661 | palignr m3, m2, 14 | |
4662 | |
4663 | pmaddubsw m3, [r4] ; [16] | |
4664 | pmulhrsw m3, m7 | |
4665 | packuswb m1, m3 | |
4666 | |
4667 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
4668 | |
; ---- second half: r0 = r6 + 4 * stride, then r6 += 8 * stride; main reference re-read from r2 + 1 ----
; movlhps below copies the low 8 bytes of m1 into the high 8 bytes of m2; the
; low 8 bytes of m2 keep whatever the first half left there.
4669 | lea r0, [r6 + r1 * 4] | |
4670 | lea r6, [r6 + r1 * 8] | |
4671 | |
4672 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
4673 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
4674 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
4675 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] | |
4676 | |
4677 | pmaddubsw m4, m3, [r4 - 16] ; [15] | |
4678 | pmulhrsw m4, m7 | |
4679 | |
4680 | palignr m3, m2, 14 | |
4681 | |
4682 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
4683 | pmulhrsw m5, m7 | |
4684 | packuswb m4, m5 | |
4685 | |
4686 | pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] | |
4687 | pmulhrsw m5, m7 | |
4688 | |
4689 | pslldq m2, 1 | |
4690 | palignr m3, m2, 14 | |
4691 | |
4692 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
4693 | pmulhrsw m6, m7 | |
4694 | packuswb m5, m6 | |
4695 | |
4696 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
4697 | pmulhrsw m6, m7 | |
4698 | |
4699 | pslldq m2, 1 | |
4700 | palignr m3, m2, 14 | |
4701 | |
4702 | pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] | |
4703 | pmulhrsw m0, m7 | |
4704 | packuswb m6, m0 | |
4705 | |
4706 | pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] | |
4707 | pmulhrsw m1, m7 | |
4708 | |
4709 | pslldq m2, 1 | |
4710 | palignr m3, m2, 14 | |
4711 | |
4712 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4713 | pmulhrsw m0, m7 | |
4714 | packuswb m1, m0 | |
4715 | |
4716 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
4717 | |
4718 | pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] | |
4719 | pmulhrsw m4, m7 | |
4720 | |
4721 | pslldq m2, 1 | |
4722 | palignr m3, m2, 14 | |
4723 | |
4724 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
4725 | pmulhrsw m5, m7 | |
4726 | packuswb m4, m5 | |
4727 | |
4728 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
4729 | pmulhrsw m5, m7 | |
4730 | |
4731 | pslldq m2, 1 | |
4732 | palignr m3, m2, 14 | |
4733 | |
4734 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
4735 | pmulhrsw m6, m7 | |
4736 | packuswb m5, m6 | |
4737 | |
4738 | pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] | |
4739 | pmulhrsw m6, m7 | |
4740 | |
4741 | pslldq m2, 1 | |
4742 | palignr m3, m2, 14 | |
4743 | |
4744 | pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] | |
4745 | pmulhrsw m1, m7 | |
4746 | packuswb m6, m1 | |
4747 | |
4748 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
4749 | pmulhrsw m1, m7 | |
4750 | |
4751 | pslldq m2, 1 | |
4752 | palignr m3, m2, 14 | |
4753 | |
4754 | pmaddubsw m3, [r4] ; [16] | |
4755 | pmulhrsw m3, m7 | |
4756 | packuswb m1, m3 | |
4757 | |
4758 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
4759 | |
4760 | RET | |
4761 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang16_21 (INIT_XMM sse4)
; Going by its name, the 16x16 angular intra predictor for mode 21.
; cglobal 4,7,8 follows the x86inc.asm convention: 4 arguments (r0-r3),
; 7 general purpose registers (r0-r6) and 8 vector registers (m0-m7).
;
; Register roles visible in this body:
;   r0  output pointer consumed by TRANSPOSE_STORE_8x8 (the stores live in
;       that macro, which is defined outside this chunk)
;   r1  stride; r5 = 3 * stride
;   r3  main reference samples, read at [r3] and [r3 + 1]
;   r2  second reference array, reordered through c_mode16_15 into m2
;       NOTE(review): presumably r2 = left and r3 = above neighbours -
;       confirm against the C prototype
;   r4  ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;       bracketed number after each pmaddubsw is that entry index
;   r6  copy of r0 taken at entry, used to form r0 for the second half
;   m7  pw_1024; NOTE(review): pmulhrsw by 1024 would be a rounding shift
;       right by 5 - the constant is defined elsewhere, confirm
;
; The table entries and the pslldq/palignr schedule are the same as in
; intra_pred_ang16_15; the differences are the swapped roles of r2 and r3,
; the r6/r0 handling, and the second TRANSPOSE_STORE_8x8 argument (0 here).
; Each result is pmaddubsw(m3, table entry) followed by pmulhrsw(m7), two
; results are merged with packuswb, and four packed registers go to each of
; the four TRANSPOSE_STORE_8x8 calls.
;-----------------------------------------------------------------------------
4762 | INIT_XMM sse4 | |
4763 | cglobal intra_pred_ang16_21, 4,7,8 | |
4764 | |
4765 | lea r4, [ang_table + 16 * 16] | |
4766 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
4767 | mov r6, r0 | |
4768 | mova m7, [pw_1024] | |
4769 | |
4770 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4771 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
4772 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
4773 | movu m2, [r2] | |
4774 | pshufb m2, [c_mode16_15] | |
4775 | |
4776 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4777 | |
4778 | pmaddubsw m4, [r4 - 16] ; [15] | |
4779 | pmulhrsw m4, m7 | |
4780 | |
4781 | palignr m3, m2, 15 | |
4782 | |
4783 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
4784 | pmulhrsw m5, m7 | |
4785 | packuswb m4, m5 | |
4786 | |
4787 | pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] | |
4788 | pmulhrsw m5, m7 | |
4789 | |
4790 | palignr m3, m2, 14 | |
4791 | |
4792 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
4793 | pmulhrsw m6, m7 | |
4794 | packuswb m5, m6 | |
4795 | |
4796 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
4797 | pmulhrsw m6, m7 | |
4798 | |
4799 | pslldq m2, 1 | |
4800 | palignr m3, m2, 14 | |
4801 | |
4802 | pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] | |
4803 | pmulhrsw m0, m7 | |
4804 | packuswb m6, m0 | |
4805 | |
4806 | pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] | |
4807 | pmulhrsw m1, m7 | |
4808 | |
4809 | pslldq m2, 1 | |
4810 | palignr m3, m2, 14 | |
4811 | |
4812 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4813 | pmulhrsw m0, m7 | |
4814 | packuswb m1, m0 | |
4815 | |
4816 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
4817 | |
4818 | pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] | |
4819 | pmulhrsw m4, m7 | |
4820 | |
4821 | pslldq m2, 1 | |
4822 | palignr m3, m2, 14 | |
4823 | |
4824 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
4825 | pmulhrsw m5, m7 | |
4826 | packuswb m4, m5 | |
4827 | |
4828 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
4829 | pmulhrsw m5, m7 | |
4830 | |
4831 | pslldq m2, 1 | |
4832 | palignr m3, m2, 14 | |
4833 | |
4834 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
4835 | pmulhrsw m6, m7 | |
4836 | packuswb m5, m6 | |
4837 | |
4838 | pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] | |
4839 | pmulhrsw m6, m7 | |
4840 | |
4841 | pslldq m2, 1 | |
4842 | palignr m3, m2, 14 | |
4843 | |
4844 | pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] | |
4845 | pmulhrsw m1, m7 | |
4846 | packuswb m6, m1 | |
4847 | |
4848 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
4849 | pmulhrsw m1, m7 | |
4850 | |
4851 | pslldq m2, 1 | |
4852 | palignr m3, m2, 14 | |
4853 | |
4854 | pmaddubsw m3, [r4] ; [16] | |
4855 | pmulhrsw m3, m7 | |
4856 | packuswb m1, m3 | |
4857 | |
4858 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
4859 | |
; ---- second half: r0 = r6 + 8, main reference re-read from r3 + 1 ----
; movlhps below copies the low 8 bytes of m1 into the high 8 bytes of m2; the
; low 8 bytes of m2 keep whatever the first half left there.
4860 | lea r0, [r6 + 8] | |
4861 | |
4862 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
4863 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
4864 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
4865 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] | |
4866 | |
4867 | pmaddubsw m4, m3, [r4 - 16] ; [15] | |
4868 | pmulhrsw m4, m7 | |
4869 | |
4870 | palignr m3, m2, 14 | |
4871 | |
4872 | pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] | |
4873 | pmulhrsw m5, m7 | |
4874 | packuswb m4, m5 | |
4875 | |
4876 | pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] | |
4877 | pmulhrsw m5, m7 | |
4878 | |
4879 | pslldq m2, 1 | |
4880 | palignr m3, m2, 14 | |
4881 | |
4882 | pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] | |
4883 | pmulhrsw m6, m7 | |
4884 | packuswb m5, m6 | |
4885 | |
4886 | pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] | |
4887 | pmulhrsw m6, m7 | |
4888 | |
4889 | pslldq m2, 1 | |
4890 | palignr m3, m2, 14 | |
4891 | |
4892 | pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] | |
4893 | pmulhrsw m0, m7 | |
4894 | packuswb m6, m0 | |
4895 | |
4896 | pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] | |
4897 | pmulhrsw m1, m7 | |
4898 | |
4899 | pslldq m2, 1 | |
4900 | palignr m3, m2, 14 | |
4901 | |
4902 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
4903 | pmulhrsw m0, m7 | |
4904 | packuswb m1, m0 | |
4905 | |
4906 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
4907 | |
4908 | pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] | |
4909 | pmulhrsw m4, m7 | |
4910 | |
4911 | pslldq m2, 1 | |
4912 | palignr m3, m2, 14 | |
4913 | |
4914 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
4915 | pmulhrsw m5, m7 | |
4916 | packuswb m4, m5 | |
4917 | |
4918 | pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] | |
4919 | pmulhrsw m5, m7 | |
4920 | |
4921 | pslldq m2, 1 | |
4922 | palignr m3, m2, 14 | |
4923 | |
4924 | pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] | |
4925 | pmulhrsw m6, m7 | |
4926 | packuswb m5, m6 | |
4927 | |
4928 | pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] | |
4929 | pmulhrsw m6, m7 | |
4930 | |
4931 | pslldq m2, 1 | |
4932 | palignr m3, m2, 14 | |
4933 | |
4934 | pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] | |
4935 | pmulhrsw m1, m7 | |
4936 | packuswb m6, m1 | |
4937 | |
4938 | pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] | |
4939 | pmulhrsw m1, m7 | |
4940 | |
4941 | pslldq m2, 1 | |
4942 | palignr m3, m2, 14 | |
4943 | |
4944 | pmaddubsw m3, [r4] ; [16] | |
4945 | pmulhrsw m3, m7 | |
4946 | packuswb m1, m3 | |
4947 | |
4948 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
4949 | |
4950 | RET | |
4951 | ||
;-----------------------------------------------------------------------------
; intra_pred_ang16_16 (INIT_XMM sse4)
; Going by its name, the 16x16 angular intra predictor for mode 16.
; cglobal 4,7,8 follows the x86inc.asm convention: 4 arguments (r0-r3),
; 7 general purpose registers (r0-r6) and 8 vector registers (m0-m7).
;
; Register roles visible in this body:
;   r0  output pointer consumed by TRANSPOSE_STORE_8x8 (the stores live in
;       that macro, which is defined outside this chunk)
;   r1  stride; r5 = 3 * stride
;   r2  main reference samples, read at [r2] and [r2 + 1]
;   r3  second reference array, reordered through c_mode16_16 into m2
;       NOTE(review): presumably r2 = left and r3 = above neighbours -
;       confirm against the C prototype
;   r4  ang_table + 16 * 16, so [r4 + k * 16] is table entry 16 + k; the
;       bracketed number after each pmaddubsw is that entry index
;   r6  r0 + 4 * stride; advanced by 8 * stride before the second half
;   m7  pw_1024; NOTE(review): pmulhrsw by 1024 would be a rounding shift
;       right by 5 - the constant is defined elsewhere, confirm
;
; Pattern: m3 holds byte-interleaved neighbouring reference samples. Each
; result is pmaddubsw(m3, table entry) followed by pmulhrsw(m7), and two
; results are merged with packuswb. palignr m3, m2, 15 or 14 slides m3 up by
; one or two bytes while pulling the top bytes of m2 in at the bottom;
; pslldq m2, 1 moves the next m2 byte to the top for the following palignr.
; The first half shifts m2 left eight times. Before the second half,
; palignr m2, m2, 6 rotates m2 so that six still-unused shuffled bytes sit
; directly below the eight main-reference bytes that movlhps inserts; the
; second half then performs nine pslldq steps and reaches into those bytes.
; Four packed registers go to each of the four TRANSPOSE_STORE_8x8 calls
; (second macro argument is 1 throughout, presumably requesting a
; transposed store - confirm in the macro).
;-----------------------------------------------------------------------------
4952 | INIT_XMM sse4 | |
4953 | cglobal intra_pred_ang16_16, 4,7,8 | |
4954 | |
4955 | lea r4, [ang_table + 16 * 16] | |
4956 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
4957 | lea r6, [r0 + r1 * 4] ; r6 = r0 + 4 * stride | |
4958 | mova m7, [pw_1024] | |
4959 | |
4960 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
4961 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
4962 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
4963 | movu m2, [r3] | |
4964 | pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] | |
4965 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
4966 | |
4967 | pmaddubsw m4, [r4 - 5 * 16] ; [11] | |
4968 | pmulhrsw m4, m7 | |
4969 | |
4970 | palignr m3, m2, 15 | |
4971 | |
4972 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
4973 | pmulhrsw m5, m7 | |
4974 | packuswb m4, m5 | |
4975 | |
4976 | pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] | |
4977 | pmulhrsw m5, m7 | |
4978 | |
4979 | palignr m3, m2, 14 | |
4980 | |
4981 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
4982 | pmulhrsw m6, m7 | |
4983 | packuswb m5, m6 | |
4984 | |
4985 | pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] | |
4986 | palignr m3, m2, 14 | |
4987 | |
4988 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
4989 | pmulhrsw m6, m7 | |
4990 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] | |
4991 | pmulhrsw m0, m7 | |
4992 | packuswb m6, m0 | |
4993 | |
4994 | pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] | |
4995 | palignr m3, m2, 14 | |
4996 | |
4997 | pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] | |
4998 | pmulhrsw m1, m7 | |
4999 | |
5000 | pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] | |
5001 | palignr m3, m2, 14 | |
5002 | |
5003 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
5004 | pmulhrsw m0, m7 | |
5005 | packuswb m1, m0 | |
5006 | |
5007 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
5008 | |
5009 | pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] | |
5010 | pmulhrsw m4, m7 | |
5011 | |
5012 | pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] | |
5013 | palignr m3, m2, 14 | |
5014 | |
5015 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
5016 | pmulhrsw m5, m7 | |
5017 | packuswb m4, m5 | |
5018 | |
5019 | pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] | |
5020 | palignr m3, m2, 14 | |
5021 | |
5022 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
5023 | pmulhrsw m5, m7 | |
5024 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
5025 | pmulhrsw m6, m7 | |
5026 | packuswb m5, m6 | |
5027 | |
5028 | pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] | |
5029 | palignr m3, m2, 14 | |
5030 | |
5031 | pmaddubsw m6, m3, [r4 - 16] ; [15] | |
5032 | pmulhrsw m6, m7 | |
5033 | |
5034 | pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] | |
5035 | palignr m3, m2, 14 | |
5036 | |
5037 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
5038 | pmulhrsw m1, m7 | |
5039 | packuswb m6, m1 | |
5040 | |
5041 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
5042 | pmulhrsw m1, m7 | |
5043 | |
5044 | pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] | |
5045 | palignr m3, m2, 14 | |
5046 | |
5047 | pmaddubsw m3, [r4] ; [16] | |
5048 | pmulhrsw m3, m7 | |
5049 | packuswb m1, m3 | |
5050 | |
5051 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
5052 | |
; ---- second half: r0 = r6 + 4 * stride, then r6 += 8 * stride; main reference re-read from r2 + 1 ----
; m2 is rotated by 6 bytes and its high 8 bytes are then replaced by the low
; 8 bytes of m1, so the remaining shuffled bytes follow the main reference.
5053 | lea r0, [r6 + r1 * 4] | |
5054 | lea r6, [r6 + r1 * 8] | |
5055 | |
5056 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5057 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
5058 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
5059 | palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] | |
5060 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] | |
5061 | |
5062 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
5063 | pmulhrsw m4, m7 | |
5064 | |
5065 | palignr m3, m2, 14 | |
5066 | |
5067 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
5068 | pmulhrsw m5, m7 | |
5069 | packuswb m4, m5 | |
5070 | |
5071 | pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] | |
5072 | pmulhrsw m5, m7 | |
5073 | |
5074 | pslldq m2, 1 | |
5075 | palignr m3, m2, 14 | |
5076 | |
5077 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
5078 | pmulhrsw m6, m7 | |
5079 | packuswb m5, m6 | |
5080 | |
5081 | pslldq m2, 1 | |
5082 | palignr m3, m2, 14 | |
5083 | |
5084 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
5085 | pmulhrsw m6, m7 | |
5086 | |
5087 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] | |
5088 | pmulhrsw m0, m7 | |
5089 | packuswb m6, m0 | |
5090 | |
5091 | pslldq m2, 1 | |
5092 | palignr m3, m2, 14 | |
5093 | |
5094 | pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] | |
5095 | pmulhrsw m1, m7 | |
5096 | |
5097 | pslldq m2, 1 | |
5098 | palignr m3, m2, 14 | |
5099 | |
5100 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
5101 | pmulhrsw m0, m7 | |
5102 | packuswb m1, m0 | |
5103 | |
5104 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
5105 | |
5106 | pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] | |
5107 | pmulhrsw m4, m7 | |
5108 | |
5109 | pslldq m2, 1 | |
5110 | palignr m3, m2, 14 | |
5111 | |
5112 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
5113 | pmulhrsw m5, m7 | |
5114 | packuswb m4, m5 | |
5115 | |
5116 | pslldq m2, 1 | |
5117 | palignr m3, m2, 14 | |
5118 | |
5119 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
5120 | pmulhrsw m5, m7 | |
5121 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
5122 | pmulhrsw m6, m7 | |
5123 | packuswb m5, m6 | |
5124 | |
5125 | pslldq m2, 1 | |
5126 | palignr m3, m2, 14 | |
5127 | |
5128 | pmaddubsw m6, m3, [r4 - 16] ; [15] | |
5129 | pmulhrsw m6, m7 | |
5130 | |
5131 | pslldq m2, 1 | |
5132 | palignr m3, m2, 14 | |
5133 | |
5134 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
5135 | pmulhrsw m1, m7 | |
5136 | packuswb m6, m1 | |
5137 | |
5138 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
5139 | pmulhrsw m1, m7 | |
5140 | |
5141 | pslldq m2, 1 | |
5142 | palignr m3, m2, 14 | |
5143 | |
5144 | pmaddubsw m3, [r4] ; [16] | |
5145 | pmulhrsw m3, m7 | |
5146 | packuswb m1, m3 | |
5147 | |
5148 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
5149 | |
5150 | RET | |
5151 | ||
; intra_pred_ang16_20 -- SSE4 kernel; per its name, 16x16 angular intra prediction, mode 20.
; cglobal (x86inc) declares 4 loaded arguments, 7 general registers and 8 XMM registers.
; Register roles visible in this body:
;   r0/r6 : r6 keeps the entry value of r0; r0 is later set to r6 + 8 for the second pass
;   r1/r5 : r5 = 3 * r1
;   r2    : 16 bytes loaded and reordered through the c_mode16_16 shuffle table
;   r3    : 16 bytes loaded at [r3], later again at [r3 + 1]
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] addresses 16-byte table row (16 + k);
;           the bracketed number in each pmaddubsw comment is that row index
;   m7    : pw_1024; pmulhrsw by 1024 computes (x + 16) >> 5 on every word
; NOTE(review): presumably r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove as in the
; intraPredAng prototype documented elsewhere in this file -- confirm against the caller.
; Each group of eight pmaddubsw/pmulhrsw results is packed pairwise into m4, m5, m6, m1 and
; handed to TRANSPOSE_STORE_8x8, which is defined outside this part of the file.
5152 | INIT_XMM sse4 | |
5153 | cglobal intra_pred_ang16_20, 4,7,8 | |
5154 | ||
5155 | lea r4, [ang_table + 16 * 16] | |
5156 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5157 | mov r6, r0 | |
5158 | mova m7, [pw_1024] | |
5159 | ||
5160 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
5161 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
5162 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
5163 | movu m2, [r2] | |
5164 | pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] | |
5165 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
5166 | ||
5167 | pmaddubsw m4, [r4 - 5 * 16] ; [11] | |
5168 | pmulhrsw m4, m7 | |
5169 | ||
5170 | palignr m3, m2, 15 | |
5171 | ||
5172 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
5173 | pmulhrsw m5, m7 | |
5174 | packuswb m4, m5 | |
5175 | ||
5176 | pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] | |
5177 | pmulhrsw m5, m7 | |
5178 | ||
5179 | palignr m3, m2, 14 | |
5180 | ||
5181 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
5182 | pmulhrsw m6, m7 | |
5183 | packuswb m5, m6 | |
5184 | ||
5185 | pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] | |
5186 | palignr m3, m2, 14 | |
5187 | ||
5188 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
5189 | pmulhrsw m6, m7 | |
5190 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] | |
5191 | pmulhrsw m0, m7 | |
5192 | packuswb m6, m0 | |
5193 | ||
5194 | pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] | |
5195 | palignr m3, m2, 14 | |
5196 | ||
5197 | pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] | |
5198 | pmulhrsw m1, m7 | |
5199 | ||
5200 | pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] | |
5201 | palignr m3, m2, 14 | |
5202 | ||
5203 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
5204 | pmulhrsw m0, m7 | |
5205 | packuswb m1, m0 | |
5206 | ||
5207 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
5208 | ||
5209 | pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] | |
5210 | pmulhrsw m4, m7 | |
5211 | ||
5212 | pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] | |
5213 | palignr m3, m2, 14 | |
5214 | ||
5215 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
5216 | pmulhrsw m5, m7 | |
5217 | packuswb m4, m5 | |
5218 | ||
5219 | pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] | |
5220 | palignr m3, m2, 14 | |
5221 | ||
5222 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
5223 | pmulhrsw m5, m7 | |
5224 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
5225 | pmulhrsw m6, m7 | |
5226 | packuswb m5, m6 | |
5227 | ||
5228 | pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] | |
5229 | palignr m3, m2, 14 | |
5230 | ||
5231 | pmaddubsw m6, m3, [r4 - 16] ; [15] | |
5232 | pmulhrsw m6, m7 | |
5233 | ||
5234 | pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] | |
5235 | palignr m3, m2, 14 | |
5236 | ||
5237 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
5238 | pmulhrsw m1, m7 | |
5239 | packuswb m6, m1 | |
5240 | ||
5241 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
5242 | pmulhrsw m1, m7 | |
5243 | ||
5244 | pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] | |
5245 | palignr m3, m2, 14 | |
5246 | ||
5247 | pmaddubsw m3, [r4] ; [16] | |
5248 | pmulhrsw m3, m7 | |
5249 | packuswb m1, m3 | |
5250 | ||
5251 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
5252 | ||
; Second pass: r0 is placed 8 bytes past the saved entry pointer (r6) and the working
; registers are rebuilt from [r3 + 1]; the same table-row sequence as above is then repeated.
5253 | lea r0, [r6 + 8] | |
5254 | ||
5255 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5256 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
5257 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
5258 | palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] | |
5259 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] | |
5260 | ||
5261 | pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] | |
5262 | pmulhrsw m4, m7 | |
5263 | ||
5264 | palignr m3, m2, 14 | |
5265 | ||
5266 | pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] | |
5267 | pmulhrsw m5, m7 | |
5268 | packuswb m4, m5 | |
5269 | ||
5270 | pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] | |
5271 | pmulhrsw m5, m7 | |
5272 | ||
5273 | pslldq m2, 1 | |
5274 | palignr m3, m2, 14 | |
5275 | ||
5276 | pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] | |
5277 | pmulhrsw m6, m7 | |
5278 | packuswb m5, m6 | |
5279 | ||
5280 | pslldq m2, 1 | |
5281 | palignr m3, m2, 14 | |
5282 | ||
5283 | pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] | |
5284 | pmulhrsw m6, m7 | |
5285 | ||
5286 | pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] | |
5287 | pmulhrsw m0, m7 | |
5288 | packuswb m6, m0 | |
5289 | ||
5290 | pslldq m2, 1 | |
5291 | palignr m3, m2, 14 | |
5292 | ||
5293 | pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] | |
5294 | pmulhrsw m1, m7 | |
5295 | ||
5296 | pslldq m2, 1 | |
5297 | palignr m3, m2, 14 | |
5298 | ||
5299 | pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] | |
5300 | pmulhrsw m0, m7 | |
5301 | packuswb m1, m0 | |
5302 | ||
5303 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
5304 | ||
5305 | pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] | |
5306 | pmulhrsw m4, m7 | |
5307 | ||
5308 | pslldq m2, 1 | |
5309 | palignr m3, m2, 14 | |
5310 | ||
5311 | pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] | |
5312 | pmulhrsw m5, m7 | |
5313 | packuswb m4, m5 | |
5314 | ||
5315 | pslldq m2, 1 | |
5316 | palignr m3, m2, 14 | |
5317 | ||
5318 | pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] | |
5319 | pmulhrsw m5, m7 | |
5320 | pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] | |
5321 | pmulhrsw m6, m7 | |
5322 | packuswb m5, m6 | |
5323 | ||
5324 | pslldq m2, 1 | |
5325 | palignr m3, m2, 14 | |
5326 | ||
5327 | pmaddubsw m6, m3, [r4 - 16] ; [15] | |
5328 | pmulhrsw m6, m7 | |
5329 | ||
5330 | pslldq m2, 1 | |
5331 | palignr m3, m2, 14 | |
5332 | ||
5333 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
5334 | pmulhrsw m1, m7 | |
5335 | packuswb m6, m1 | |
5336 | ||
5337 | pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] | |
5338 | pmulhrsw m1, m7 | |
5339 | ||
5340 | pslldq m2, 1 | |
5341 | palignr m3, m2, 14 | |
5342 | ||
5343 | pmaddubsw m3, [r4] ; [16] | |
5344 | pmulhrsw m3, m7 | |
5345 | packuswb m1, m3 | |
5346 | ||
5347 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
5348 | ||
5349 | RET | |
5350 | ||
; intra_pred_ang16_17 -- SSE4 kernel; per its name, 16x16 angular intra prediction, mode 17.
; cglobal (x86inc) declares 4 loaded arguments, 7 general registers and 8 XMM registers.
; Register roles visible in this body:
;   r0/r6 : r6 = r0 + 4 * r1 on entry; before the second pass r0 = r6 + 4 * r1, r6 += 8 * r1
;   r1/r5 : r5 = 3 * r1
;   r2    : 16 bytes loaded at [r2], later again at [r2 + 1]
;   r3    : 16 bytes loaded and reordered through the c_mode16_17 shuffle table;
;           the byte at [r3 + 5] is additionally inserted into lane 0 of m2
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] addresses 16-byte table row (16 + k);
;           the bracketed number in each pmaddubsw comment is that row index
;   m7    : pw_1024; pmulhrsw by 1024 computes (x + 16) >> 5 on every word
; NOTE(review): presumably r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove as in the
; intraPredAng prototype documented elsewhere in this file -- confirm against the caller.
; Each group of eight results is packed pairwise into m4, m5, m6, m1 and handed to
; TRANSPOSE_STORE_8x8 (defined outside this part of the file) with second argument 1.
5351 | INIT_XMM sse4 | |
5352 | cglobal intra_pred_ang16_17, 4,7,8 | |
5353 | ||
5354 | lea r4, [ang_table + 16 * 16] | |
5355 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5356 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
5357 | mova m7, [pw_1024] | |
5358 | ||
5359 | movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
5360 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
5361 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
5362 | movu m2, [r3] | |
5363 | pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] | |
5364 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
5365 | ||
5366 | pmaddubsw m4, [r4 - 10 * 16] ; [06] | |
5367 | pmulhrsw m4, m7 | |
5368 | ||
5369 | palignr m3, m2, 15 | |
5370 | ||
5371 | pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] | |
5372 | pmulhrsw m5, m7 | |
5373 | packuswb m4, m5 | |
5374 | ||
5375 | palignr m3, m2, 14 | |
5376 | ||
5377 | pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] | |
5378 | pmulhrsw m5, m7 | |
5379 | ||
5380 | pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] | |
5381 | pinsrb m2, [r3 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] | |
5382 | palignr m3, m2, 14 | |
5383 | ||
5384 | pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] | |
5385 | pmulhrsw m6, m7 | |
5386 | packuswb m5, m6 | |
5387 | ||
5388 | pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] | |
5389 | palignr m3, m2, 14 | |
5390 | ||
5391 | pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] | |
5392 | pmulhrsw m6, m7 | |
5393 | pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] | |
5394 | pmulhrsw m0, m7 | |
5395 | packuswb m6, m0 | |
5396 | ||
5397 | pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] | |
5398 | palignr m3, m2, 14 | |
5399 | ||
5400 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
5401 | pmulhrsw m1, m7 | |
5402 | ||
5403 | pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] | |
5404 | palignr m3, m2, 14 | |
5405 | ||
5406 | pmaddubsw m0, m3, [r4] ; [16] | |
5407 | pmulhrsw m0, m7 | |
5408 | packuswb m1, m0 | |
5409 | ||
5410 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
5411 | ||
5412 | pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] | |
5413 | palignr m3, m2, 14 | |
5414 | ||
5415 | pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] | |
5416 | pmulhrsw m4, m7 | |
5417 | ||
5418 | pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] | |
5419 | palignr m3, m2, 14 | |
5420 | ||
5421 | pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] | |
5422 | pmulhrsw m5, m7 | |
5423 | packuswb m4, m5 | |
5424 | ||
5425 | pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] | |
5426 | pmulhrsw m5, m7 | |
5427 | ||
5428 | pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] | |
5429 | palignr m3, m2, 14 | |
5430 | ||
5431 | pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] | |
5432 | pmulhrsw m6, m7 | |
5433 | packuswb m5, m6 | |
5434 | ||
5435 | pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] | |
5436 | palignr m3, m2, 14 | |
5437 | ||
5438 | pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] | |
5439 | pmulhrsw m6, m7 | |
5440 | ||
5441 | pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] | |
5442 | palignr m3, m2, 14 | |
5443 | ||
5444 | pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] | |
5445 | pmulhrsw m1, m7 | |
5446 | packuswb m6, m1 | |
5447 | ||
5448 | pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] | |
5449 | palignr m3, m2, 14 | |
5450 | ||
5451 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
5452 | pmulhrsw m1, m7 | |
5453 | pmaddubsw m3, [r4 - 16 * 16] ; [00] | |
5454 | pmulhrsw m3, m7 | |
5455 | packuswb m1, m3 | |
5456 | ||
5457 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
5458 | ||
; Second pass: r0 becomes r6 + 4 * r1 and r6 advances by 8 * r1; the working registers are
; rebuilt from [r2 + 1] and the same table-row sequence as above is repeated.
5459 | lea r0, [r6 + r1 * 4] | |
5460 | lea r6, [r6 + r1 * 8] | |
5461 | ||
5462 | movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5463 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
5464 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
5465 | palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] | |
5466 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] | |
5467 | ||
5468 | pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] | |
5469 | pmulhrsw m4, m7 | |
5470 | ||
5471 | palignr m3, m2, 14 | |
5472 | ||
5473 | pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] | |
5474 | pmulhrsw m5, m7 | |
5475 | packuswb m4, m5 | |
5476 | ||
5477 | pslldq m2, 1 | |
5478 | palignr m3, m2, 14 | |
5479 | ||
5480 | pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] | |
5481 | pmulhrsw m5, m7 | |
5482 | ||
5483 | pslldq m2, 1 | |
5484 | palignr m3, m2, 14 | |
5485 | ||
5486 | pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] | |
5487 | pmulhrsw m6, m7 | |
5488 | packuswb m5, m6 | |
5489 | ||
5490 | pslldq m2, 1 | |
5491 | palignr m3, m2, 14 | |
5492 | ||
5493 | pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] | |
5494 | pmulhrsw m6, m7 | |
5495 | pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] | |
5496 | pmulhrsw m0, m7 | |
5497 | packuswb m6, m0 | |
5498 | ||
5499 | pslldq m2, 1 | |
5500 | palignr m3, m2, 14 | |
5501 | ||
5502 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
5503 | pmulhrsw m1, m7 | |
5504 | ||
5505 | pslldq m2, 1 | |
5506 | palignr m3, m2, 14 | |
5507 | ||
5508 | pmaddubsw m0, m3, [r4] ; [16] | |
5509 | pmulhrsw m0, m7 | |
5510 | packuswb m1, m0 | |
5511 | ||
5512 | TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 | |
5513 | ||
5514 | pslldq m2, 1 | |
5515 | palignr m3, m2, 14 | |
5516 | ||
5517 | pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] | |
5518 | pmulhrsw m4, m7 | |
5519 | ||
5520 | pslldq m2, 1 | |
5521 | palignr m3, m2, 14 | |
5522 | ||
5523 | pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] | |
5524 | pmulhrsw m5, m7 | |
5525 | packuswb m4, m5 | |
5526 | ||
5527 | pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] | |
5528 | pmulhrsw m5, m7 | |
5529 | ||
5530 | pslldq m2, 1 | |
5531 | palignr m3, m2, 14 | |
5532 | ||
5533 | pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] | |
5534 | pmulhrsw m6, m7 | |
5535 | packuswb m5, m6 | |
5536 | ||
5537 | pslldq m2, 1 | |
5538 | palignr m3, m2, 14 | |
5539 | ||
5540 | pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] | |
5541 | pmulhrsw m6, m7 | |
5542 | ||
5543 | pslldq m2, 1 | |
5544 | palignr m3, m2, 14 | |
5545 | ||
5546 | pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] | |
5547 | pmulhrsw m1, m7 | |
5548 | packuswb m6, m1 | |
5549 | ||
5550 | pslldq m2, 1 | |
5551 | palignr m3, m2, 14 | |
5552 | ||
5553 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
5554 | pmulhrsw m1, m7 | |
5555 | pmaddubsw m3, [r4 - 16 * 16] ; [00] | |
5556 | pmulhrsw m3, m7 | |
5557 | packuswb m1, m3 | |
5558 | ||
5559 | TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 | |
5560 | ||
5561 | RET | |
5562 | ||
; intra_pred_ang16_19 -- SSE4 kernel; per its name, 16x16 angular intra prediction, mode 19.
; cglobal (x86inc) declares 4 loaded arguments, 7 general registers and 8 XMM registers.
; The instruction sequence and table rows match intra_pred_ang16_17 with the roles of r2 and
; r3 exchanged, TRANSPOSE_STORE_8x8 called with second argument 0, and the second pass
; starting at r0 = r6 + 8 instead of moving r0/r6 by multiples of r1.
; Register roles visible in this body:
;   r0/r6 : r6 keeps the entry value of r0; r0 is later set to r6 + 8 for the second pass
;   r1/r5 : r5 = 3 * r1
;   r2    : 16 bytes loaded and reordered through the c_mode16_17 shuffle table;
;           the byte at [r2 + 5] is additionally inserted into lane 0 of m2
;   r3    : 16 bytes loaded at [r3], later again at [r3 + 1]
;   r4    : ang_table + 16 * 16, so [r4 + k * 16] addresses 16-byte table row (16 + k);
;           the bracketed number in each pmaddubsw comment is that row index
;   m7    : pw_1024; pmulhrsw by 1024 computes (x + 16) >> 5 on every word
; NOTE(review): presumably r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove as in the
; intraPredAng prototype documented elsewhere in this file -- confirm against the caller.
5563 | INIT_XMM sse4 | |
5564 | cglobal intra_pred_ang16_19, 4,7,8 | |
5565 | ||
5566 | lea r4, [ang_table + 16 * 16] | |
5567 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
5568 | mov r6, r0 | |
5569 | mova m7, [pw_1024] | |
5570 | ||
5571 | movu m3, [r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
5572 | punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] | |
5573 | punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
5574 | movu m2, [r2] | |
5575 | pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] | |
5576 | palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] | |
5577 | ||
5578 | pmaddubsw m4, [r4 - 10 * 16] ; [06] | |
5579 | pmulhrsw m4, m7 | |
5580 | ||
5581 | palignr m3, m2, 15 | |
5582 | ||
5583 | pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] | |
5584 | pmulhrsw m5, m7 | |
5585 | packuswb m4, m5 | |
5586 | ||
5587 | palignr m3, m2, 14 | |
5588 | ||
5589 | pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] | |
5590 | pmulhrsw m5, m7 | |
5591 | ||
5592 | pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] | |
5593 | pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] | |
5594 | palignr m3, m2, 14 | |
5595 | ||
5596 | pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] | |
5597 | pmulhrsw m6, m7 | |
5598 | packuswb m5, m6 | |
5599 | ||
5600 | pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] | |
5601 | palignr m3, m2, 14 | |
5602 | ||
5603 | pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] | |
5604 | pmulhrsw m6, m7 | |
5605 | pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] | |
5606 | pmulhrsw m0, m7 | |
5607 | packuswb m6, m0 | |
5608 | ||
5609 | pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] | |
5610 | palignr m3, m2, 14 | |
5611 | ||
5612 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
5613 | pmulhrsw m1, m7 | |
5614 | ||
5615 | pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] | |
5616 | palignr m3, m2, 14 | |
5617 | ||
5618 | pmaddubsw m0, m3, [r4] ; [16] | |
5619 | pmulhrsw m0, m7 | |
5620 | packuswb m1, m0 | |
5621 | ||
5622 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
5623 | ||
5624 | pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] | |
5625 | palignr m3, m2, 14 | |
5626 | ||
5627 | pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] | |
5628 | pmulhrsw m4, m7 | |
5629 | ||
5630 | pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] | |
5631 | palignr m3, m2, 14 | |
5632 | ||
5633 | pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] | |
5634 | pmulhrsw m5, m7 | |
5635 | packuswb m4, m5 | |
5636 | ||
5637 | pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] | |
5638 | pmulhrsw m5, m7 | |
5639 | ||
5640 | pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] | |
5641 | palignr m3, m2, 14 | |
5642 | ||
5643 | pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] | |
5644 | pmulhrsw m6, m7 | |
5645 | packuswb m5, m6 | |
5646 | ||
5647 | pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] | |
5648 | palignr m3, m2, 14 | |
5649 | ||
5650 | pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] | |
5651 | pmulhrsw m6, m7 | |
5652 | ||
5653 | pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] | |
5654 | palignr m3, m2, 14 | |
5655 | ||
5656 | pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] | |
5657 | pmulhrsw m1, m7 | |
5658 | packuswb m6, m1 | |
5659 | ||
5660 | pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] | |
5661 | palignr m3, m2, 14 | |
5662 | ||
5663 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
5664 | pmulhrsw m1, m7 | |
5665 | pmaddubsw m3, [r4 - 16 * 16] ; [00] | |
5666 | pmulhrsw m3, m7 | |
5667 | packuswb m1, m3 | |
5668 | ||
5669 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
5670 | ||
; Second pass: r0 is placed 8 bytes past the saved entry pointer (r6) and the working
; registers are rebuilt from [r3 + 1]; the same table-row sequence as above is then repeated.
5671 | lea r0, [r6 + 8] | |
5672 | ||
5673 | movu m1, [r3 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
5674 | pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] | |
5675 | punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] | |
5676 | palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] | |
5677 | movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] | |
5678 | ||
5679 | pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] | |
5680 | pmulhrsw m4, m7 | |
5681 | ||
5682 | palignr m3, m2, 14 | |
5683 | ||
5684 | pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] | |
5685 | pmulhrsw m5, m7 | |
5686 | packuswb m4, m5 | |
5687 | ||
5688 | pslldq m2, 1 | |
5689 | palignr m3, m2, 14 | |
5690 | ||
5691 | pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] | |
5692 | pmulhrsw m5, m7 | |
5693 | ||
5694 | pslldq m2, 1 | |
5695 | palignr m3, m2, 14 | |
5696 | ||
5697 | pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] | |
5698 | pmulhrsw m6, m7 | |
5699 | packuswb m5, m6 | |
5700 | ||
5701 | pslldq m2, 1 | |
5702 | palignr m3, m2, 14 | |
5703 | ||
5704 | pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] | |
5705 | pmulhrsw m6, m7 | |
5706 | pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] | |
5707 | pmulhrsw m0, m7 | |
5708 | packuswb m6, m0 | |
5709 | ||
5710 | pslldq m2, 1 | |
5711 | palignr m3, m2, 14 | |
5712 | ||
5713 | pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] | |
5714 | pmulhrsw m1, m7 | |
5715 | ||
5716 | pslldq m2, 1 | |
5717 | palignr m3, m2, 14 | |
5718 | ||
5719 | pmaddubsw m0, m3, [r4] ; [16] | |
5720 | pmulhrsw m0, m7 | |
5721 | packuswb m1, m0 | |
5722 | ||
5723 | TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 | |
5724 | ||
5725 | pslldq m2, 1 | |
5726 | palignr m3, m2, 14 | |
5727 | ||
5728 | pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] | |
5729 | pmulhrsw m4, m7 | |
5730 | ||
5731 | pslldq m2, 1 | |
5732 | palignr m3, m2, 14 | |
5733 | ||
5734 | pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] | |
5735 | pmulhrsw m5, m7 | |
5736 | packuswb m4, m5 | |
5737 | ||
5738 | pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] | |
5739 | pmulhrsw m5, m7 | |
5740 | ||
5741 | pslldq m2, 1 | |
5742 | palignr m3, m2, 14 | |
5743 | ||
5744 | pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] | |
5745 | pmulhrsw m6, m7 | |
5746 | packuswb m5, m6 | |
5747 | ||
5748 | pslldq m2, 1 | |
5749 | palignr m3, m2, 14 | |
5750 | ||
5751 | pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] | |
5752 | pmulhrsw m6, m7 | |
5753 | ||
5754 | pslldq m2, 1 | |
5755 | palignr m3, m2, 14 | |
5756 | ||
5757 | pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] | |
5758 | pmulhrsw m1, m7 | |
5759 | packuswb m6, m1 | |
5760 | ||
5761 | pslldq m2, 1 | |
5762 | palignr m3, m2, 14 | |
5763 | ||
5764 | pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] | |
5765 | pmulhrsw m1, m7 | |
5766 | pmaddubsw m3, [r4 - 16 * 16] ; [00] | |
5767 | pmulhrsw m3, m7 | |
5768 | packuswb m1, m3 | |
5769 | ||
5770 | TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 | |
5771 | ||
5772 | RET | |
5773 | ||
; intra_pred_ang16_18 -- per its name, 16x16 angular intra prediction, mode 18.
; cglobal (x86inc) declares 4 loaded arguments, 5 general registers and 3 XMM registers.
; No arithmetic is performed: every output row is a 16-byte window over the
; concatenation m0:m1, where
;   m0 = the 16 bytes at [r3]
;   m1 = the 16 bytes at [r2] reordered through the c_mode16_18 shuffle table
; Row 0 is m0 itself; row y (1..15) is palignr(m0, m1, 16 - y), i.e. the window slides
; one byte further into m1 for every row written.
; r0 is the store pointer and r1 the row pitch. After the two loads, r2, r3 and r4 are
; reused as 2 * r1, 3 * r1 and 4 * r1, and m2 (initially the shuffle mask) becomes scratch.
; NOTE(review): presumably r2 = refLeft and r3 = refAbove as in the intraPredAng prototype
; documented elsewhere in this file -- confirm against the caller.
5774 | INIT_XMM sse4 | |
5775 | cglobal intra_pred_ang16_18, 4,5,3 | |
5776 | ||
5777 | movu m0, [r3] | |
5778 | movu m1, [r2] | |
5779 | mova m2, [c_mode16_18] | |
5780 | pshufb m1, m2 | |
5781 | ||
5782 | lea r2, [r1 * 2] | |
5783 | lea r3, [r1 * 3] | |
5784 | lea r4, [r1 * 4] | |
5785 | movu [r0], m0 | |
5786 | palignr m2, m0, m1, 15 | |
5787 | movu [r0 + r1], m2 | |
5788 | palignr m2, m0, m1, 14 | |
5789 | movu [r0 + r2], m2 | |
5790 | palignr m2, m0, m1, 13 | |
5791 | movu [r0 + r3], m2 | |
5792 | lea r0, [r0 + r4] | |
5793 | palignr m2, m0, m1, 12 | |
5794 | movu [r0], m2 | |
5795 | palignr m2, m0, m1, 11 | |
5796 | movu [r0 + r1], m2 | |
5797 | palignr m2, m0, m1, 10 | |
5798 | movu [r0 + r2], m2 | |
5799 | palignr m2, m0, m1, 9 | |
5800 | movu [r0 + r3], m2 | |
5801 | lea r0, [r0 + r4] | |
5802 | palignr m2, m0, m1, 8 | |
5803 | movu [r0], m2 | |
5804 | palignr m2, m0, m1, 7 | |
5805 | movu [r0 + r1], m2 | |
5806 | palignr m2, m0, m1, 6 | |
5807 | movu [r0 + r2], m2 | |
5808 | palignr m2, m0, m1, 5 | |
5809 | movu [r0 + r3], m2 | |
5810 | lea r0, [r0 + r4] | |
5811 | palignr m2, m0, m1, 4 | |
5812 | movu [r0], m2 | |
5813 | palignr m2, m0, m1, 3 | |
5814 | movu [r0 + r1], m2 | |
5815 | palignr m2, m0, m1, 2 | |
5816 | movu [r0 + r2], m2 | |
; Last row: m0 is not needed afterwards, so it is shifted in place instead of going through m2.
5817 | palignr m0, m1, 1 | |
5818 | movu [r0 + r3], m0 | |
5819 | RET | |
5820 | ||
5821 | ;--------------------------------------------------------------------------------------------------------------- | |
5822 | ; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) | |
5823 | ;--------------------------------------------------------------------------------------------------------------- | |
; intra_pred_ang32_2 -- SSSE3 kernel shared by two modes of the 32x32 predictor.
; cglobal (x86inc) declares 3 loaded arguments, 4 general registers and 4 XMM registers.
; The fifth argument (dirMode in the prototype above) is compared with 34; when equal, the
; reference pointer r2 is replaced by the fourth argument (refAbove), otherwise r2 keeps
; the third argument (refLeft).
; Output is a pure copy: the 32 bytes of row y are ref[y + 2 .. y + 33], i.e. every row is
; the previous one advanced by one reference byte. Each row is written as two 16-byte stores.
;   rows 0..15  are cut from m0 = ref[2..17],  m1 = ref[18..33], m3 = ref[34..49]
;   rows 16..31 are cut from m1, m3 and m0 reloaded with ref[50..65]
; r0 is the store pointer (advanced by 4 * r1 after every four rows), r1 the row pitch,
; r3 is reused as 3 * r1 and m2 is scratch.
5824 | INIT_XMM ssse3 | |
5825 | cglobal intra_pred_ang32_2, 3,4,4 | |
5826 | cmp r4m, byte 34 | |
5827 | cmove r2, r3mp | |
5828 | movu m0, [r2 + 2] | |
5829 | movu m1, [r2 + 18] | |
5830 | movu m3, [r2 + 34] | |
5831 | ||
5832 | lea r3, [r1 * 3] | |
5833 | ||
5834 | movu [r0], m0 | |
5835 | movu [r0 + 16], m1 | |
5836 | palignr m2, m1, m0, 1 | |
5837 | movu [r0 + r1], m2 | |
5838 | palignr m2, m3, m1, 1 | |
5839 | movu [r0 + r1 + 16], m2 | |
5840 | palignr m2, m1, m0, 2 | |
5841 | movu [r0 + r1 * 2], m2 | |
5842 | palignr m2, m3, m1, 2 | |
5843 | movu [r0 + r1 * 2 + 16], m2 | |
5844 | palignr m2, m1, m0, 3 | |
5845 | movu [r0 + r3], m2 | |
5846 | palignr m2, m3, m1, 3 | |
5847 | movu [r0 + r3 + 16], m2 | |
5848 | ||
5849 | lea r0, [r0 + r1 * 4] | |
5850 | ||
5851 | palignr m2, m1, m0, 4 | |
5852 | movu [r0], m2 | |
5853 | palignr m2, m3, m1, 4 | |
5854 | movu [r0 + 16], m2 | |
5855 | palignr m2, m1, m0, 5 | |
5856 | movu [r0 + r1], m2 | |
5857 | palignr m2, m3, m1, 5 | |
5858 | movu [r0 + r1 + 16], m2 | |
5859 | palignr m2, m1, m0, 6 | |
5860 | movu [r0 + r1 * 2], m2 | |
5861 | palignr m2, m3, m1, 6 | |
5862 | movu [r0 + r1 * 2 + 16], m2 | |
5863 | palignr m2, m1, m0, 7 | |
5864 | movu [r0 + r3], m2 | |
5865 | palignr m2, m3, m1, 7 | |
5866 | movu [r0 + r3 + 16], m2 | |
5867 | ||
5868 | lea r0, [r0 + r1 * 4] | |
5869 | ||
5870 | palignr m2, m1, m0, 8 | |
5871 | movu [r0], m2 | |
5872 | palignr m2, m3, m1, 8 | |
5873 | movu [r0 + 16], m2 | |
5874 | palignr m2, m1, m0, 9 | |
5875 | movu [r0 + r1], m2 | |
5876 | palignr m2, m3, m1, 9 | |
5877 | movu [r0 + r1 + 16], m2 | |
5878 | palignr m2, m1, m0, 10 | |
5879 | movu [r0 + r1 * 2], m2 | |
5880 | palignr m2, m3, m1, 10 | |
5881 | movu [r0 + r1 * 2 + 16], m2 | |
5882 | palignr m2, m1, m0, 11 | |
5883 | movu [r0 + r3], m2 | |
5884 | palignr m2, m3, m1, 11 | |
5885 | movu [r0 + r3 + 16], m2 | |
5886 | ||
5887 | lea r0, [r0 + r1 * 4] | |
5888 | ||
5889 | palignr m2, m1, m0, 12 | |
5890 | movu [r0], m2 | |
5891 | palignr m2, m3, m1, 12 | |
5892 | movu [r0 + 16], m2 | |
5893 | palignr m2, m1, m0, 13 | |
5894 | movu [r0 + r1], m2 | |
5895 | palignr m2, m3, m1, 13 | |
5896 | movu [r0 + r1 + 16], m2 | |
5897 | palignr m2, m1, m0, 14 | |
5898 | movu [r0 + r1 * 2], m2 | |
5899 | palignr m2, m3, m1, 14 | |
5900 | movu [r0 + r1 * 2 + 16], m2 | |
5901 | palignr m2, m1, m0, 15 | |
5902 | movu [r0 + r3], m2 | |
5903 | palignr m2, m3, m1, 15 | |
5904 | movu [r0 + r3 + 16], m2 | |
5905 | ||
5906 | lea r0, [r0 + r1 * 4] | |
5907 | ||
; Rows 16..31: the window is now m1 | m3 | m0, with m0 reloaded as ref[50..65].
5908 | movu [r0], m1 | |
5909 | movu m0, [r2 + 50] | |
5910 | movu [r0 + 16], m3 | |
5911 | palignr m2, m3, m1, 1 | |
5912 | movu [r0 + r1], m2 | |
5913 | palignr m2, m0, m3, 1 | |
5914 | movu [r0 + r1 + 16], m2 | |
5915 | palignr m2, m3, m1, 2 | |
5916 | movu [r0 + r1 * 2], m2 | |
5917 | palignr m2, m0, m3, 2 | |
5918 | movu [r0 + r1 * 2 + 16], m2 | |
5919 | palignr m2, m3, m1, 3 | |
5920 | movu [r0 + r3], m2 | |
5921 | palignr m2, m0, m3, 3 | |
5922 | movu [r0 + r3 + 16], m2 | |
5923 | ||
5924 | lea r0, [r0 + r1 * 4] | |
5925 | ||
5926 | palignr m2, m3, m1, 4 | |
5927 | movu [r0], m2 | |
5928 | palignr m2, m0, m3, 4 | |
5929 | movu [r0 + 16], m2 | |
5930 | palignr m2, m3, m1, 5 | |
5931 | movu [r0 + r1], m2 | |
5932 | palignr m2, m0, m3, 5 | |
5933 | movu [r0 + r1 + 16], m2 | |
5934 | palignr m2, m3, m1, 6 | |
5935 | movu [r0 + r1 * 2], m2 | |
5936 | palignr m2, m0, m3, 6 | |
5937 | movu [r0 + r1 * 2 + 16], m2 | |
5938 | palignr m2, m3, m1, 7 | |
5939 | movu [r0 + r3], m2 | |
5940 | palignr m2, m0, m3, 7 | |
5941 | movu [r0 + r3 + 16], m2 | |
5942 | ||
5943 | lea r0, [r0 + r1 * 4] | |
5944 | ||
5945 | palignr m2, m3, m1, 8 | |
5946 | movu [r0], m2 | |
5947 | palignr m2, m0, m3, 8 | |
5948 | movu [r0 + 16], m2 | |
5949 | palignr m2, m3, m1, 9 | |
5950 | movu [r0 + r1], m2 | |
5951 | palignr m2, m0, m3, 9 | |
5952 | movu [r0 + r1 + 16], m2 | |
5953 | palignr m2, m3, m1, 10 | |
5954 | movu [r0 + r1 * 2], m2 | |
5955 | palignr m2, m0, m3, 10 | |
5956 | movu [r0 + r1 * 2 + 16], m2 | |
5957 | palignr m2, m3, m1, 11 | |
5958 | movu [r0 + r3], m2 | |
5959 | palignr m2, m0, m3, 11 | |
5960 | movu [r0 + r3 + 16], m2 | |
5961 | ||
5962 | lea r0, [r0 + r1 * 4] | |
5963 | ||
5964 | palignr m2, m3, m1, 12 | |
5965 | movu [r0], m2 | |
5966 | palignr m2, m0, m3, 12 | |
5967 | movu [r0 + 16], m2 | |
5968 | palignr m2, m3, m1, 13 | |
5969 | movu [r0 + r1], m2 | |
5970 | palignr m2, m0, m3, 13 | |
5971 | movu [r0 + r1 + 16], m2 | |
5972 | palignr m2, m3, m1, 14 | |
5973 | movu [r0 + r1 * 2], m2 | |
5974 | palignr m2, m0, m3, 14 | |
5975 | movu [r0 + r1 * 2 + 16], m2 | |
5976 | palignr m2, m3, m1, 15 | |
5977 | movu [r0 + r3], m2 | |
5978 | palignr m2, m0, m3, 15 | |
5979 | movu [r0 + r3 + 16], m2 | |
5980 | RET | |
5981 | ||
5982 | ; Process Intra32x32, input 8x8 in [m0, m1, m2, m3, m4, m5, m6, m7], output 8x8 | |
; Macro arguments:
;   %1      8-byte group index; only used by the transposed store, as byte offset %1 * 8
;   %2      1 = transpose the 8x8 result before storing, anything else = store as is
;   %3..%10 table row selector for m0..m7 respectively, addressed as [r4 + %n * 16];
;           0 means "no filtering for this input"
; For a non-zero selector the input is reordered with the pshufb control at [r3],
; multiplied by the selected table row (pmaddubsw) and rounded with pw_1024
; (pmulhrsw by 1024 computes (x + 16) >> 5 on every word).
; For a zero selector an odd-numbered input is zero-extended (pmovzxbw) so it can be packed
; with its partner, while an even-numbered input is left untouched and its low 8 bytes are
; combined with the packed partner through movlhps.
; After packing, m0, m2, m4 and m6 each hold two 8-byte result rows.
; Registers expected from the caller: r0 (store pointer), r1 (row pitch), r3, r4, r5, and
; r6 in the transposed path. NOTE(review): the stores use r5 and r6 like 3 * r1 and
; r0 + 4 * r1 -- confirm against the call sites, which are outside this view.
; Side effects: m1 is overwritten with pw_1024 after the first pair; the non-transposed path
; advances r0 by 4 * r1 and ignores %1.
5983 | %macro PROC32_8x8 10 ; col4, transpose[0/1] c0, c1, c2, c3, c4, c5, c6, c7 | |
5984 | %if %3 == 0 | |
5985 | %else | |
5986 | pshufb m0, [r3] | |
5987 | pmaddubsw m0, [r4 + %3 * 16] | |
5988 | pmulhrsw m0, [pw_1024] | |
5989 | %endif | |
5990 | %if %4 == 0 | |
5991 | pmovzxbw m1, m1 | |
5992 | %else | |
5993 | pshufb m1, [r3] | |
5994 | pmaddubsw m1, [r4 + %4 * 16] | |
5995 | pmulhrsw m1, [pw_1024] | |
5996 | %endif | |
5997 | %if %3 == 0 | |
5998 | packuswb m1, m1 | |
5999 | movlhps m0, m1 | |
6000 | %else | |
6001 | packuswb m0, m1 | |
6002 | %endif | |
; m1 has been consumed; from here on it holds the rounding constant for the other pairs.
6003 | mova m1, [pw_1024] | |
6004 | %if %5 == 0 | |
6005 | %else | |
6006 | pshufb m2, [r3] | |
6007 | pmaddubsw m2, [r4 + %5 * 16] | |
6008 | pmulhrsw m2, m1 | |
6009 | %endif | |
6010 | %if %6 == 0 | |
6011 | pmovzxbw m3, m3 | |
6012 | %else | |
6013 | pshufb m3, [r3] | |
6014 | pmaddubsw m3, [r4 + %6 * 16] | |
6015 | pmulhrsw m3, m1 | |
6016 | %endif | |
6017 | %if %5 == 0 | |
6018 | packuswb m3, m3 | |
6019 | movlhps m2, m3 | |
6020 | %else | |
6021 | packuswb m2, m3 | |
6022 | %endif | |
6023 | %if %7 == 0 | |
6024 | %else | |
6025 | pshufb m4, [r3] | |
6026 | pmaddubsw m4, [r4 + %7 * 16] | |
6027 | pmulhrsw m4, m1 | |
6028 | %endif | |
6029 | %if %8 == 0 | |
6030 | pmovzxbw m5, m5 | |
6031 | %else | |
6032 | pshufb m5, [r3] | |
6033 | pmaddubsw m5, [r4 + %8 * 16] | |
6034 | pmulhrsw m5, m1 | |
6035 | %endif | |
6036 | %if %7 == 0 | |
6037 | packuswb m5, m5 | |
6038 | movlhps m4, m5 | |
6039 | %else | |
6040 | packuswb m4, m5 | |
6041 | %endif | |
6042 | %if %9 == 0 | |
6043 | %else | |
6044 | pshufb m6, [r3] | |
6045 | pmaddubsw m6, [r4 + %9 * 16] | |
6046 | pmulhrsw m6, m1 | |
6047 | %endif | |
6048 | %if %10 == 0 | |
6049 | pmovzxbw m7, m7 | |
6050 | %else | |
6051 | pshufb m7, [r3] | |
6052 | pmaddubsw m7, [r4 + %10 * 16] | |
6053 | pmulhrsw m7, m1 | |
6054 | %endif | |
6055 | %if %9 == 0 | |
6056 | packuswb m7, m7 | |
6057 | movlhps m6, m7 | |
6058 | %else | |
6059 | packuswb m6, m7 | |
6060 | %endif | |
6061 | ||
6062 | %if %2 == 1 | |
6063 | ; transpose | |
6064 | punpckhbw m1, m0, m2 | |
6065 | punpcklbw m0, m2 | |
6066 | punpckhbw m3, m0, m1 | |
6067 | punpcklbw m0, m1 | |
6068 | ||
6069 | punpckhbw m1, m4, m6 | |
6070 | punpcklbw m4, m6 | |
6071 | punpckhbw m6, m4, m1 | |
6072 | punpcklbw m4, m1 | |
6073 | ||
6074 | punpckhdq m2, m0, m4 | |
6075 | punpckldq m0, m4 | |
6076 | punpckldq m4, m3, m6 | |
6077 | punpckhdq m3, m6 | |
6078 | ||
; Transposed rows now live in m0, m2, m4, m3 (two 8-byte rows each); four are stored
; relative to r0 and four relative to r6.
6079 | movh [r0 + + %1 * 8], m0 | |
6080 | movhps [r0 + r1 + %1 * 8], m0 | |
6081 | movh [r0 + r1*2 + %1 * 8], m2 | |
6082 | movhps [r0 + r5 + %1 * 8], m2 | |
6083 | movh [r6 + %1 * 8], m4 | |
6084 | movhps [r6 + r1 + %1 * 8], m4 | |
6085 | movh [r6 + r1*2 + %1 * 8], m3 | |
6086 | movhps [r6 + r5 + %1 * 8], m3 | |
6087 | %else | |
6088 | movh [r0 ], m0 | |
6089 | movhps [r0 + r1 ], m0 | |
6090 | movh [r0 + r1 * 2], m2 | |
6091 | movhps [r0 + r5 ], m2 | |
6092 | lea r0, [r0 + r1 * 4] | |
6093 | movh [r0 ], m4 | |
6094 | movhps [r0 + r1 ], m4 | |
6095 | movh [r0 + r1 * 2], m6 | |
6096 | movhps [r0 + r5 ], m6 | |
6097 | %endif | |
6098 | %endmacro | |
6099 | ||
; MODE_3_33 -- shared body for the 32x32 predictors named in the macro (modes 3 and 33).
; Macro argument:
;   %1  forwarded unchanged as the second argument of TRANSPOSE_STORE_8x8
;       (defined outside this part of the file)
; Registers expected from the caller:
;   r2  reference pointer; 16-byte windows are read at r2 + 1, + 8, + 14 and + 21
;   r3  table base such that [r3 + k * 16] is 16-byte table row (16 + k); the bracketed
;       number in each comment is that row index
;   m7  rounding constant for pmulhrsw (pw_1024 at the visible call site)
; Four groups of eight lines are produced and passed to TRANSPOSE_STORE_8x8 with first
; argument 0, 1, 2 and 3. Lines marked [00] need no filtering, so eight reference bytes are
; loaded straight into the upper half of m1 with movhps.
; Clobbers m0-m2 and m4-m6.
6100 | %macro MODE_3_33 1 | |
6101 | movu m0, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] | |
6102 | palignr m1, m0, 1 ; [ x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] | |
6103 | punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] | |
6104 | punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] | |
6105 | palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] | |
6106 | pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] | |
6107 | pmulhrsw m4, m7 | |
6108 | pmaddubsw m1, [r3 + 4 * 16] ; [20] | |
6109 | pmulhrsw m1, m7 | |
6110 | packuswb m4, m1 | |
6111 | palignr m5, m2, m0, 4 | |
6112 | pmaddubsw m5, [r3 - 2 * 16] ; [14] | |
6113 | pmulhrsw m5, m7 | |
6114 | palignr m6, m2, m0, 6 | |
6115 | pmaddubsw m6, [r3 - 8 * 16] ; [ 8] | |
6116 | pmulhrsw m6, m7 | |
6117 | packuswb m5, m6 | |
6118 | palignr m1, m2, m0, 8 | |
6119 | pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] | |
6120 | pmulhrsw m6, m7 | |
6121 | pmaddubsw m1, [r3 + 12 * 16] ; [28] | |
6122 | pmulhrsw m1, m7 | |
6123 | packuswb m6, m1 | |
6124 | palignr m1, m2, m0, 10 | |
6125 | pmaddubsw m1, [r3 + 6 * 16] ; [22] | |
6126 | pmulhrsw m1, m7 | |
6127 | palignr m2, m0, 12 | |
6128 | pmaddubsw m2, [r3] ; [16] | |
6129 | pmulhrsw m2, m7 | |
6130 | packuswb m1, m2 | |
6131 | ||
6132 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
6133 | ||
6134 | movu m0, [r2 + 8] | |
6135 | palignr m1, m0, 1 | |
6136 | punpckhbw m2, m0, m1 | |
6137 | punpcklbw m0, m1 | |
6138 | palignr m5, m2, m0, 2 | |
6139 | pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] | |
6140 | pmulhrsw m4, m7 | |
6141 | pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] | |
6142 | pmulhrsw m1, m7 | |
6143 | packuswb m4, m1 | |
6144 | pmaddubsw m5, [r3 + 14 * 16] ; [30] | |
6145 | pmulhrsw m5, m7 | |
6146 | palignr m6, m2, m0, 4 | |
6147 | pmaddubsw m6, [r3 + 8 * 16] ; [24] | |
6148 | pmulhrsw m6, m7 | |
6149 | packuswb m5, m6 | |
6150 | palignr m1, m2, m0, 6 | |
6151 | pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] | |
6152 | pmulhrsw m6, m7 | |
6153 | palignr m1, m2, m0, 8 | |
6154 | pmaddubsw m1, [r3 - 4 * 16] ; [12] | |
6155 | pmulhrsw m1, m7 | |
6156 | packuswb m6, m1 | |
6157 | palignr m1, m2, m0, 10 | |
6158 | pmaddubsw m1, [r3 - 10 * 16] ; [06] | |
6159 | pmulhrsw m1, m7 | |
6160 | packuswb m1, m1 | |
6161 | movhps m1, [r2 + 14] ; [00] | |
6162 | ||
6163 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
6164 | ||
6165 | movu m0, [r2 + 14] | |
6166 | palignr m1, m0, 1 | |
6167 | punpckhbw m2, m0, m1 | |
6168 | punpcklbw m0, m1 | |
6169 | palignr m1, m2, m0, 2 | |
6170 | pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] | |
6171 | pmulhrsw m4, m7 | |
6172 | pmaddubsw m1, [r3 + 4 * 16] ; [20] | |
6173 | pmulhrsw m1, m7 | |
6174 | packuswb m4, m1 | |
6175 | palignr m5, m2, m0, 4 | |
6176 | pmaddubsw m5, [r3 - 2 * 16] ; [14] | |
6177 | pmulhrsw m5, m7 | |
6178 | palignr m6, m2, m0, 6 | |
6179 | pmaddubsw m6, [r3 - 8 * 16] ; [ 8] | |
6180 | pmulhrsw m6, m7 | |
6181 | packuswb m5, m6 | |
6182 | palignr m1, m2, m0, 8 | |
6183 | pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] | |
6184 | pmulhrsw m6, m7 | |
6185 | pmaddubsw m1, [r3 + 12 * 16] ; [28] | |
6186 | pmulhrsw m1, m7 | |
6187 | packuswb m6, m1 | |
6188 | palignr m1, m2, m0, 10 | |
6189 | pmaddubsw m1, [r3 + 6 * 16] ; [22] | |
6190 | pmulhrsw m1, m7 | |
6191 | palignr m2, m0, 12 | |
6192 | pmaddubsw m2, [r3] ; [16] | |
6193 | pmulhrsw m2, m7 | |
6194 | packuswb m1, m2 | |
6195 | ||
6196 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
6197 | ||
6198 | movu m0, [r2 + 21] | |
6199 | palignr m1, m0, 1 | |
6200 | punpckhbw m2, m0, m1 | |
6201 | punpcklbw m0, m1 | |
6202 | palignr m5, m2, m0, 2 | |
6203 | pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] | |
6204 | pmulhrsw m4, m7 | |
6205 | pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] | |
6206 | pmulhrsw m1, m7 | |
6207 | packuswb m4, m1 | |
6208 | pmaddubsw m5, [r3 + 14 * 16] ; [30] | |
6209 | pmulhrsw m5, m7 | |
6210 | palignr m6, m2, m0, 4 | |
6211 | pmaddubsw m6, [r3 + 8 * 16] ; [24] | |
6212 | pmulhrsw m6, m7 | |
6213 | packuswb m5, m6 | |
6214 | palignr m1, m2, m0, 6 | |
6215 | pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] | |
6216 | pmulhrsw m6, m7 | |
6217 | palignr m1, m2, m0, 8 | |
6218 | pmaddubsw m1, [r3 - 4 * 16] ; [12] | |
6219 | pmulhrsw m1, m7 | |
6220 | packuswb m6, m1 | |
6221 | palignr m1, m2, m0, 10 | |
6222 | pmaddubsw m1, [r3 - 10 * 16] ; [06] | |
6223 | pmulhrsw m1, m7 | |
6224 | packuswb m1, m1 | |
6225 | movhps m1, [r2 + 27] ; [00] | |
6226 | ||
6227 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
6228 | %endmacro | |
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Driver loop around MODE_3_33: the macro is expanded once and executed four
; times.  After every pass both destination pointers (r0 and r6) move down by
; 8 * dstStride and the reference pointer r2 moves forward by 8 bytes.
;
; Register roles (r0-r2 are the first three prototype arguments):
;   r0 = dst                 r1 = dstStride           r2 = refLeft
;   r3 = ang_table + 16 * 16 r4 = passes remaining    r5 = 3 * dstStride
;   r6 = r0 + 4 * dstStride  m7 = [pw_1024], the pmulhrsw multiplier
; r5 is not referenced in this function body itself; presumably it is
; consumed by the store macro -- confirm against TRANSPOSE_STORE_8x8.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_3, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r6, [r0 + r1 * 4]           ; second row pointer, 4 rows below r0
    lea         r5, [r1 * 3]
    mov         r4d, 4                      ; four passes

.loop:
    MODE_3_33 1

    add         r2, 8                       ; next 8 reference bytes
    lea         r0, [r6 + r1 * 4]           ; r0 += 8 * dstStride
    lea         r6, [r6 + r1 * 8]           ; r6 += 8 * dstStride
    dec         r4
    jnz         .loop
    RET
6247 | ||
;------------------------------------------------------------------------------
; MODE_4_32 <arg>
;
; Straight-line interpolation body expanded inside the loop of
; intra_pred_ang32_4 (the name suggests the mirrored mode 32 shares it; that
; user is not visible in this part of the file).  <arg> is forwarded
; unchanged as the second argument of every TRANSPOSE_STORE_8x8.
;
; Expects : r2 -> reference bytes
;           r3 -> ang_table + 16 * 16, so [r3 + k * 16] is the table entry
;                 for fraction 16 + k (the bracketed number on each line)
;           m7  = word multiplier for pmulhrsw (pw_1024 in the caller, which
;                 makes pmulhrsw a rounding shift right by 5)
; Produces: four groups of eight results, two results packed per register in
;           m4, m5, m6, m1, each group handed to TRANSPOSE_STORE_8x8 0..3.
; Clobbers: m0-m6.
;
; The two-operand "palignr mX, m0, n" form shifts in bytes from the old
; destination at the top; those stale bytes are not selected by any of the
; later palignr extractions in this macro.
;------------------------------------------------------------------------------
%macro MODE_4_32 1
    ; ---- group 0: window starts at [r2 + 1] ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1                  ; m2:m0 = interleaved neighbour pairs
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    mova        m5, m1
    pmaddubsw   m4, m0, [r3 + 5 * 16]       ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 6 * 16]           ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 15 * 16]          ; [31]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 + 4 * 16]           ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m0, 6
    pmaddubsw   m6, m1, [r3 - 7 * 16]       ; [ 9]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 14 * 16]          ; [30]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 + 3 * 16]           ; [19]
    pmulhrsw    m1, m7
    palignr     m2, m0, 10                  ; kept in m2 for the first row of group 1
    pmaddubsw   m3, m2, [r3 - 8 * 16]       ; [ 8]
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    ; ---- group 1: first row reuses m2, then window restarts at [r2 + 6] ----
    pmaddubsw   m4, m2, [r3 + 13 * 16]      ; [29]
    pmulhrsw    m4, m7
    movu        m0, [r2 + 6]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    pmaddubsw   m1, [r3 + 2 * 16]           ; [18]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 4
    mova        m6, m5
    pmaddubsw   m5, [r3 - 9 * 16]           ; [ 7]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 12 * 16]          ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m6, m2, m0, 6
    pmaddubsw   m6, [r3 + 16]               ; [17]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 8
    pmaddubsw   m3, m1, [r3 - 10 * 16]      ; [ 6]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    pmaddubsw   m1, [r3 + 11 * 16]          ; [27]
    pmulhrsw    m1, m7
    palignr     m2, m0, 10
    pmaddubsw   m2, [r3]                    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    ; ---- group 2: window starts at [r2 + 12] ----
    movu        m0, [r2 + 12]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    pmaddubsw   m4, m0, [r3 - 11 * 16]      ; [ 5]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m0, [r3 + 10 * 16]      ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 2
    pmaddubsw   m5, [r3 - 16]               ; [15]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m1, m6, [r3 - 12 * 16]      ; [ 4]
    pmulhrsw    m1, m7
    packuswb    m5, m1
    pmaddubsw   m6, [r3 + 9 * 16]           ; [25]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 6
    pmaddubsw   m1, [r3 - 2 * 16]           ; [14]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 8
    mova        m2, m1
    pmaddubsw   m1, [r3 - 13 * 16]          ; [ 3]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3 + 8 * 16]           ; [24]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    ; ---- group 3: window starts at [r2 + 17] ----
    movu        m0, [r2 + 17]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    pmaddubsw   m4, m0, [r3 - 3 * 16]       ; [13]
    pmulhrsw    m4, m7
    palignr     m5, m2, m0, 2
    pmaddubsw   m1, m5, [r3 - 14 * 16]      ; [ 2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 7 * 16]           ; [23]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    pmaddubsw   m6, [r3 - 4 * 16]           ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m6, m2, m0, 6
    mova        m1, m6
    pmaddubsw   m6, [r3 - 15 * 16]          ; [ 1]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 6 * 16]           ; [22]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m1, m2, m0, 8
    pmaddubsw   m1, [r3 - 5 * 16]           ; [11]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + 22]               ; [00] fraction 0: raw reference bytes

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Driver loop around MODE_4_32: the macro body is executed four times.  After
; every pass both destination pointers (r0 and r6) move down by
; 8 * dstStride and the reference pointer r2 moves forward by 8 bytes.
;
; Register roles (r0-r2 are the first three prototype arguments):
;   r0 = dst                 r1 = dstStride           r2 = refLeft
;   r3 = ang_table + 16 * 16 r4 = passes remaining    r5 = 3 * dstStride
;   r6 = r0 + 4 * dstStride  m7 = [pw_1024], the pmulhrsw multiplier
; r5 is not referenced in this function body or in MODE_4_32 directly;
; presumably it is consumed by TRANSPOSE_STORE_8x8 -- confirm there.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_4, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r6, [r0 + r1 * 4]           ; second row pointer, 4 rows below r0
    lea         r5, [r1 * 3]
    mov         r4d, 4                      ; four passes

.loop:
    MODE_4_32 1

    add         r2, 8                       ; next 8 reference bytes
    lea         r0, [r6 + r1 * 4]           ; r0 += 8 * dstStride
    lea         r6, [r6 + r1 * 8]           ; r6 += 8 * dstStride
    dec         r4
    jnz         .loop
    RET
6398 | ||
;------------------------------------------------------------------------------
; MODE_5_31 <arg>
;
; Straight-line interpolation body expanded inside the loop of
; intra_pred_ang32_5 (the name suggests the mirrored mode 31 shares it; that
; user is not visible in this part of the file).  <arg> is forwarded
; unchanged as the second argument of every TRANSPOSE_STORE_8x8.
;
; Expects : r2 -> reference bytes
;           r3 -> ang_table + 16 * 16, so [r3 + k * 16] is the table entry
;                 for fraction 16 + k (the bracketed number on each line)
;           m7  = word multiplier for pmulhrsw (pw_1024 in the caller, which
;                 makes pmulhrsw a rounding shift right by 5)
; Produces: four groups of eight results, two results packed per register in
;           m4, m5, m6, m1, each group handed to TRANSPOSE_STORE_8x8 0..3.
; Clobbers: m0-m6.
;------------------------------------------------------------------------------
%macro MODE_5_31 1
    ; ---- group 0: window starts at [r2 + 1] ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1                  ; m2:m0 = interleaved neighbour pairs
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    mova        m5, m1
    pmaddubsw   m4, m0, [r3 + 16]           ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 14 * 16]          ; [ 2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 3 * 16]           ; [19]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    mova        m1, m6                      ; keep the unscaled pairs for [21]
    pmaddubsw   m6, [r3 - 12 * 16]          ; [ 4]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m1, [r3 + 5 * 16]       ; [21]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 6
    pmaddubsw   m3, m1, [r3 - 10 * 16]      ; [ 6]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    pmaddubsw   m1, [r3 + 7 * 16]           ; [23]
    pmulhrsw    m1, m7
    palignr     m2, m0, 8
    pmaddubsw   m2, [r3 - 8 * 16]           ; [ 8]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    ; ---- group 1: window starts at [r2 + 5] ----
    movu        m0, [r2 + 5]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m1, m2, m0, 2
    mova        m5, m1
    pmaddubsw   m4, m0, [r3 + 9 * 16]       ; [25]
    pmulhrsw    m4, m7
    pmaddubsw   m1, [r3 - 6 * 16]           ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 11 * 16]          ; [27]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 4
    mova        m1, m6                      ; keep the unscaled pairs for [29]
    pmaddubsw   m6, [r3 - 4 * 16]           ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m1, [r3 + 13 * 16]      ; [29]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 6
    pmaddubsw   m3, m1, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    pmaddubsw   m1, [r3 + 15 * 16]          ; [31]
    pmulhrsw    m1, m7
    palignr     m2, m0, 8
    pmaddubsw   m2, [r3]                    ; [16]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    ; ---- group 2: window starts at [r2 + 10] ----
    movu        m0, [r2 + 10]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    pmaddubsw   m4, m0, [r3 - 15 * 16]      ; [ 1]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m0, [r3 + 2 * 16]       ; [18]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 2
    mova        m1, m5
    pmaddubsw   m5, [r3 - 13 * 16]          ; [ 3]
    pmulhrsw    m5, m7
    pmaddubsw   m1, [r3 + 4 * 16]           ; [20]
    pmulhrsw    m1, m7
    packuswb    m5, m1
    palignr     m1, m2, m0, 4
    pmaddubsw   m6, m1, [r3 - 11 * 16]      ; [ 5]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 6 * 16]           ; [22]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m2, m0, 6
    pmaddubsw   m1, m2, [r3 - 9 * 16]       ; [ 7]
    pmulhrsw    m1, m7
    pmaddubsw   m2, [r3 + 8 * 16]           ; [24]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    ; ---- group 3: window starts at [r2 + 14] ----
    movu        m0, [r2 + 14]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    pmaddubsw   m4, m0, [r3 - 7 * 16]       ; [ 9]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m0, [r3 + 10 * 16]      ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m5, m2, m0, 2
    mova        m1, m5
    pmaddubsw   m5, [r3 - 5 * 16]           ; [11]
    pmulhrsw    m5, m7
    pmaddubsw   m1, [r3 + 12 * 16]          ; [28]
    pmulhrsw    m1, m7
    packuswb    m5, m1
    palignr     m1, m2, m0, 4
    pmaddubsw   m6, m1, [r3 - 3 * 16]       ; [13]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 14 * 16]          ; [30]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m2, m0, 6
    pmaddubsw   m1, m2, [r3 - 16]           ; [15]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + 18]               ; [00] fraction 0: raw reference bytes

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Driver loop around MODE_5_31: the macro body is executed four times.  After
; every pass both destination pointers (r0 and r6) move down by
; 8 * dstStride and the reference pointer r2 moves forward by 8 bytes.
;
; Register roles (r0-r2 are the first three prototype arguments):
;   r0 = dst                 r1 = dstStride           r2 = refLeft
;   r3 = ang_table + 16 * 16 r4 = passes remaining    r5 = 3 * dstStride
;   r6 = r0 + 4 * dstStride  m7 = [pw_1024], the pmulhrsw multiplier
; r5 is not referenced in this function body or in MODE_5_31 directly;
; presumably it is consumed by TRANSPOSE_STORE_8x8 -- confirm there.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_5, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r6, [r0 + r1 * 4]           ; second row pointer, 4 rows below r0
    lea         r5, [r1 * 3]
    mov         r4d, 4                      ; four passes

.loop:
    MODE_5_31 1

    add         r2, 8                       ; next 8 reference bytes
    lea         r0, [r6 + r1 * 4]           ; r0 += 8 * dstStride
    lea         r6, [r6 + r1 * 8]           ; r6 += 8 * dstStride
    dec         r4
    jnz         .loop
    RET
6549 | ||
;------------------------------------------------------------------------------
; MODE_6_30 <arg>
;
; Straight-line interpolation body expanded inside the loop of
; intra_pred_ang32_6 (the name suggests the mirrored mode 30 shares it; that
; user is not visible in this part of the file).  <arg> is forwarded
; unchanged as the second argument of every TRANSPOSE_STORE_8x8.
;
; Expects : r2 -> reference bytes
;           r3 -> ang_table + 16 * 16, so [r3 + k * 16] is the table entry
;                 for fraction 16 + k (the bracketed number on each line)
;           m7  = word multiplier for pmulhrsw (pw_1024 in the caller, which
;                 makes pmulhrsw a rounding shift right by 5)
; Produces: four groups of eight results, two results packed per register in
;           m4, m5, m6, m1, each group handed to TRANSPOSE_STORE_8x8 0..3.
; Clobbers: m0-m6.  m2 is deliberately carried across the first two
;           TRANSPOSE_STORE_8x8 invocations (it feeds the first row of the
;           following group).
;------------------------------------------------------------------------------
%macro MODE_6_30 1
    ; ---- group 0: window starts at [r2 + 1] ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1                  ; m2:m0 = interleaved neighbour pairs
    punpcklbw   m0, m1
    pmaddubsw   m4, m0, [r3 - 3 * 16]       ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m1, m0, [r3 + 10 * 16]      ; [26]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    palignr     m6, m2, m0, 2
    pmaddubsw   m5, m6, [r3 - 9 * 16]       ; [ 7]
    pmulhrsw    m5, m7
    pmaddubsw   m6, [r3 + 4 * 16]           ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m1, m2, m0, 4
    pmaddubsw   m6, m1, [r3 - 15 * 16]      ; [ 1]
    pmulhrsw    m6, m7
    pmaddubsw   m3, m1, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    pmaddubsw   m1, [r3 + 11 * 16]          ; [27]
    pmulhrsw    m1, m7
    palignr     m2, m0, 6                   ; kept in m2 for the first row of group 1
    pmaddubsw   m3, m2, [r3 - 8 * 16]       ; [ 8]
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    ; ---- group 1: first row reuses m2, then window restarts at [r2 + 5] ----
    pmaddubsw   m4, m2, [r3 + 5 * 16]       ; [21]
    pmulhrsw    m4, m7
    movu        m0, [r2 + 5]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    pmaddubsw   m1, m0, [r3 - 14 * 16]      ; [ 2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, m0, [r3 - 16]           ; [15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m3, m2, m0, 2
    pmaddubsw   m6, m3, [r3 - 7 * 16]       ; [ 9]
    pmulhrsw    m6, m7
    pmaddubsw   m3, [r3 + 6 * 16]           ; [22]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    palignr     m2, m0, 4                   ; kept in m2 for the first row of group 2
    pmaddubsw   m1, m2, [r3 - 13 * 16]      ; [ 3]
    pmulhrsw    m1, m7
    pmaddubsw   m3, m2, [r3]                ; [16]
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    ; ---- group 2: first row reuses m2, then window restarts at [r2 + 7] ----
    pmaddubsw   m4, m2, [r3 + 13 * 16]      ; [29]
    pmulhrsw    m4, m7
    movu        m0, [r2 + 7]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m5, m2, m0, 2
    pmaddubsw   m1, m5, [r3 - 6 * 16]       ; [10]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, [r3 + 7 * 16]           ; [23]
    pmulhrsw    m5, m7
    palignr     m1, m2, m0, 4
    pmaddubsw   m6, m1, [r3 - 12 * 16]      ; [ 4]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m1, [r3 + 16]           ; [17]
    pmulhrsw    m6, m7
    pmaddubsw   m1, [r3 + 14 * 16]          ; [30]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    palignr     m2, m2, m0, 6
    pmaddubsw   m1, m2, [r3 - 5 * 16]       ; [11]
    pmulhrsw    m1, m7
    pmaddubsw   m2, m2, [r3 + 8 * 16]       ; [24]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    ; ---- group 3: window starts at [r2 + 11] ----
    movu        m0, [r2 + 11]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    pmaddubsw   m4, m0, [r3 - 11 * 16]      ; [ 5]
    pmulhrsw    m4, m7
    pmaddubsw   m3, m0, [r3 + 2 * 16]       ; [18]
    pmulhrsw    m3, m7
    packuswb    m4, m3
    pmaddubsw   m5, m0, [r3 + 15 * 16]      ; [31]
    pmulhrsw    m5, m7
    palignr     m6, m2, m0, 2
    pmaddubsw   m1, m6, [r3 - 4 * 16]       ; [12]
    pmulhrsw    m1, m7
    packuswb    m5, m1
    pmaddubsw   m6, [r3 + 9 * 16]           ; [25]
    pmulhrsw    m6, m7
    palignr     m1, m2, m0, 4
    pmaddubsw   m2, m1, [r3 - 10 * 16]      ; [ 6]
    pmulhrsw    m2, m7
    packuswb    m6, m2
    pmaddubsw   m1, [r3 + 3 * 16]           ; [19]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + 14]               ; [00] fraction 0: raw reference bytes

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Driver loop around MODE_6_30: the macro body is executed four times.  After
; every pass both destination pointers (r0 and r6) move down by
; 8 * dstStride and the reference pointer r2 moves forward by 8 bytes.
;
; Register roles (r0-r2 are the first three prototype arguments):
;   r0 = dst                 r1 = dstStride           r2 = refLeft
;   r3 = ang_table + 16 * 16 r4 = passes remaining    r5 = 3 * dstStride
;   r6 = r0 + 4 * dstStride  m7 = [pw_1024], the pmulhrsw multiplier
; r5 is not referenced in this function body or in MODE_6_30 directly;
; presumably it is consumed by TRANSPOSE_STORE_8x8 -- confirm there.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_6, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r6, [r0 + r1 * 4]           ; second row pointer, 4 rows below r0
    lea         r5, [r1 * 3]
    mov         r4d, 4                      ; four passes

.loop:
    MODE_6_30 1

    add         r2, 8                       ; next 8 reference bytes
    lea         r0, [r6 + r1 * 4]           ; r0 += 8 * dstStride
    lea         r6, [r6 + r1 * 8]           ; r6 += 8 * dstStride
    dec         r4
    jnz         .loop
    RET
6689 | ||
;------------------------------------------------------------------------------
; MODE_7_29 <arg>
;
; Straight-line interpolation body expanded inside the loop of
; intra_pred_ang32_7 (the name suggests the mirrored mode 29 shares it; that
; user is not visible in this part of the file).  <arg> is forwarded
; unchanged as the second argument of every TRANSPOSE_STORE_8x8.
;
; Expects : r2 -> reference bytes
;           r3 -> ang_table + 16 * 16, so [r3 + k * 16] is the table entry
;                 for fraction 16 + k (the bracketed number on each line)
;           m7  = word multiplier for pmulhrsw (pw_1024 in the caller, which
;                 makes pmulhrsw a rounding shift right by 5)
; Produces: four groups of eight results, two results packed per register in
;           m4, m5, m6, m1, each group handed to TRANSPOSE_STORE_8x8 0..3.
; Clobbers: m0-m6.  m2 is deliberately carried across the first two
;           TRANSPOSE_STORE_8x8 invocations (it feeds the first row(s) of the
;           following group).
;------------------------------------------------------------------------------
%macro MODE_7_29 1
    ; ---- group 0: window starts at [r2 + 1] ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1                  ; m2:m0 = interleaved neighbour pairs
    punpcklbw   m0, m1
    pmaddubsw   m4, m0, [r3 - 7 * 16]       ; [ 9]
    pmulhrsw    m4, m7
    pmaddubsw   m3, m0, [r3 + 2 * 16]       ; [18]
    pmulhrsw    m3, m7
    packuswb    m4, m3
    pmaddubsw   m5, m0, [r3 + 11 * 16]      ; [27]
    pmulhrsw    m5, m7
    palignr     m1, m2, m0, 2
    palignr     m2, m0, 4                   ; kept in m2 into group 1
    pmaddubsw   m6, m1, [r3 - 12 * 16]      ; [ 4]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m1, [r3 - 3 * 16]       ; [13]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m1, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, [r3 + 15 * 16]          ; [31]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m2, [r3 - 8 * 16]       ; [ 8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    ; ---- group 1: first two rows reuse m2, then window restarts at [r2 + 4] ----
    pmaddubsw   m4, m2, [r3 + 16]           ; [17]
    pmulhrsw    m4, m7
    pmaddubsw   m2, [r3 + 10 * 16]          ; [26]
    pmulhrsw    m2, m7
    packuswb    m4, m2
    movu        m0, [r2 + 4]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m2, m0, 2
    pmaddubsw   m5, m0, [r3 - 13 * 16]      ; [ 3]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 - 4 * 16]       ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m0, [r3 + 5 * 16]       ; [21]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 14 * 16]          ; [30]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m2, [r3 - 9 * 16]       ; [ 7]
    pmulhrsw    m1, m7
    pmaddubsw   m3, m2, [r3]                ; [16]
    pmulhrsw    m3, m7
    packuswb    m1, m3

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    ; ---- group 2: first row reuses m2, then window restarts at [r2 + 6] ----
    pmaddubsw   m4, m2, [r3 + 9 * 16]       ; [25]
    pmulhrsw    m4, m7
    movu        m0, [r2 + 6]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    palignr     m2, m0, 2
    pmaddubsw   m1, m0, [r3 - 14 * 16]      ; [ 2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, m0, [r3 - 5 * 16]       ; [11]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m0, [r3 + 13 * 16]      ; [29]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m2, [r3 - 10 * 16]      ; [ 6]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m2, [r3 - 16]           ; [15]
    pmulhrsw    m1, m7
    pmaddubsw   m2, m2, [r3 + 8 * 16]       ; [24]
    pmulhrsw    m2, m7
    packuswb    m1, m2

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    ; ---- group 3: window starts at [r2 + 8] ----
    movu        m0, [r2 + 8]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1
    punpcklbw   m0, m1
    pmaddubsw   m4, m0, [r3 - 15 * 16]      ; [ 1]
    pmulhrsw    m4, m7
    pmaddubsw   m3, m0, [r3 - 6 * 16]       ; [10]
    pmulhrsw    m3, m7
    packuswb    m4, m3
    pmaddubsw   m5, m0, [r3 + 3 * 16]       ; [19]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    palignr     m2, m0, 2
    pmaddubsw   m6, m2, [r3 - 11 * 16]      ; [ 5]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m2, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m2, [r3 + 7 * 16]       ; [23]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + 10]               ; [00] fraction 0: raw reference bytes

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Driver loop around MODE_7_29: the macro body is executed four times.  After
; every pass both destination pointers (r0 and r6) move down by
; 8 * dstStride and the reference pointer r2 moves forward by 8 bytes.
;
; Register roles (r0-r2 are the first three prototype arguments):
;   r0 = dst                 r1 = dstStride           r2 = refLeft
;   r3 = ang_table + 16 * 16 r4 = passes remaining    r5 = 3 * dstStride
;   r6 = r0 + 4 * dstStride  m7 = [pw_1024], the pmulhrsw multiplier
; r5 is not referenced in this function body or in MODE_7_29 directly;
; presumably it is consumed by TRANSPOSE_STORE_8x8 -- confirm there.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_7, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r6, [r0 + r1 * 4]           ; second row pointer, 4 rows below r0
    lea         r5, [r1 * 3]
    mov         r4d, 4                      ; four passes

.loop:
    MODE_7_29 1

    add         r2, 8                       ; next 8 reference bytes
    lea         r0, [r6 + r1 * 4]           ; r0 += 8 * dstStride
    lea         r6, [r6 + r1 * 8]           ; r6 += 8 * dstStride
    dec         r4
    jnz         .loop
    RET
6822 | ||
;------------------------------------------------------------------------------
; MODE_8_28 <arg>
;
; Straight-line interpolation body expanded inside the loop of
; intra_pred_ang32_8 (the name suggests the mirrored mode 28 shares it; that
; user is not visible in this part of the file).  <arg> is forwarded
; unchanged as the second argument of every TRANSPOSE_STORE_8x8.
;
; Expects : r2 -> reference bytes
;           r3 -> ang_table + 16 * 16, so [r3 + k * 16] is the table entry
;                 for fraction 16 + k (the bracketed number on each line)
;           m7  = word multiplier for pmulhrsw (pw_1024 in the caller, which
;                 makes pmulhrsw a rounding shift right by 5)
; Produces: four groups of eight results, two results packed per register in
;           m4, m5, m6, m1, each group handed to TRANSPOSE_STORE_8x8 0..3.
; Clobbers: m0-m6.  m2 is deliberately carried across the first three
;           TRANSPOSE_STORE_8x8 invocations (it feeds the first row(s) of the
;           following group).
;
; Only the first window needs the high half of the interleave (for the
; one-pixel-shifted palignr).  The later windows use the low 8 pairs only,
; so no punpckhbw is issued for them.
;------------------------------------------------------------------------------
%macro MODE_8_28 1
    ; ---- group 0: window starts at [r2 + 1] ----
    movu        m0, [r2 + 1]
    palignr     m1, m0, 1
    punpckhbw   m2, m0, m1                  ; m2:m0 = interleaved neighbour pairs
    punpcklbw   m0, m1
    palignr     m2, m0, 2                   ; pairs advanced by one pixel
    pmaddubsw   m4, m0, [r3 - 11 * 16]      ; [ 5]
    pmulhrsw    m4, m7
    pmaddubsw   m3, m0, [r3 - 6 * 16]       ; [10]
    pmulhrsw    m3, m7
    packuswb    m4, m3
    pmaddubsw   m5, m0, [r3 - 1 * 16]       ; [15]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m0, [r3 + 9 * 16]       ; [25]
    pmulhrsw    m6, m7
    pmaddubsw   m0, [r3 + 14 * 16]          ; [30]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m2, [r3 - 13 * 16]      ; [ 3]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m2, [r3 - 8 * 16]       ; [ 8]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    ; ---- group 1: first four rows reuse m2, then window restarts at [r2 + 3] ----
    pmaddubsw   m4, m2, [r3 - 3 * 16]       ; [13]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m2, [r3 + 2 * 16]       ; [18]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m2, [r3 + 7 * 16]       ; [23]
    pmulhrsw    m5, m7
    pmaddubsw   m2, [r3 + 12 * 16]          ; [28]
    pmulhrsw    m2, m7
    packuswb    m5, m2
    movu        m0, [r2 + 3]
    palignr     m1, m0, 1
    punpcklbw   m0, m1                      ; low 8 pairs are all this window needs
    pmaddubsw   m6, m0, [r3 - 15 * 16]      ; [ 1]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m0, [r3 - 10 * 16]      ; [ 6]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m0, [r3 - 5 * 16]       ; [11]
    pmulhrsw    m1, m7
    mova        m2, m0                      ; keep the pairs for group 2
    pmaddubsw   m0, [r3]                    ; [16]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    ; ---- group 2: first three rows reuse m2, then window restarts at [r2 + 4] ----
    pmaddubsw   m4, m2, [r3 + 5 * 16]       ; [21]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m2, [r3 + 10 * 16]      ; [26]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m2, [r3 + 15 * 16]      ; [31]
    pmulhrsw    m5, m7
    movu        m0, [r2 + 4]
    palignr     m1, m0, 1
    punpcklbw   m0, m1                      ; low 8 pairs are all this window needs
    pmaddubsw   m2, m0, [r3 - 12 * 16]      ; [ 4]
    pmulhrsw    m2, m7
    packuswb    m5, m2
    pmaddubsw   m6, m0, [r3 - 7 * 16]       ; [ 9]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m0, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m0, [r3 + 3 * 16]       ; [19]
    pmulhrsw    m1, m7
    mova        m2, m0                      ; keep the pairs for group 3
    pmaddubsw   m0, [r3 + 8 * 16]           ; [24]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    ; ---- group 3: first row reuses m2, then window restarts at [r2 + 5] ----
    pmaddubsw   m4, m2, [r3 + 13 * 16]      ; [29]
    pmulhrsw    m4, m7
    movu        m0, [r2 + 5]
    palignr     m1, m0, 1
    punpcklbw   m0, m1                      ; low 8 pairs are all this window needs
    pmaddubsw   m1, m0, [r3 - 14 * 16]      ; [ 2]
    pmulhrsw    m1, m7
    packuswb    m4, m1
    pmaddubsw   m5, m0, [r3 - 9 * 16]       ; [ 7]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m0, [r3 - 4 * 16]       ; [12]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m0, [r3 + 16]           ; [17]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m0, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m0, [r3 + 11 * 16]      ; [27]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + 6]                ; [00] fraction 0: raw reference bytes

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Driver loop around MODE_8_28: the macro body is executed four times.  After
; every pass both destination pointers (r0 and r6) move down by
; 8 * dstStride and the reference pointer r2 moves forward by 8 bytes.
;
; Register roles (r0-r2 are the first three prototype arguments):
;   r0 = dst                 r1 = dstStride           r2 = refLeft
;   r3 = ang_table + 16 * 16 r4 = passes remaining    r5 = 3 * dstStride
;   r6 = r0 + 4 * dstStride  m7 = [pw_1024], the pmulhrsw multiplier
; r5 is not referenced in this function body or in MODE_8_28 directly;
; presumably it is consumed by TRANSPOSE_STORE_8x8 -- confirm there.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_8, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r6, [r0 + r1 * 4]           ; second row pointer, 4 rows below r0
    lea         r5, [r1 * 3]
    mov         r4d, 4                      ; four passes

.loop:
    MODE_8_28 1

    add         r2, 8                       ; next 8 reference bytes
    lea         r0, [r6 + r1 * 4]           ; r0 += 8 * dstStride
    lea         r6, [r6 + r1 * 8]           ; r6 += 8 * dstStride
    dec         r4
    jnz         .loop
    RET
6952 | ||
;------------------------------------------------------------------------------
; MODE_9_27 <arg>
;
; Straight-line interpolation body expanded inside the loop of
; intra_pred_ang32_9 (the name suggests the mirrored mode 27 shares it; that
; user is not visible in this part of the file).  <arg> is forwarded
; unchanged as the second argument of every TRANSPOSE_STORE_8x8.
;
; Expects : r2 -> reference bytes
;           r3 -> ang_table + 16 * 16, so [r3 + k * 16] is the table entry
;                 for fraction 16 + k (the bracketed number on each line)
;           m7  = word multiplier for pmulhrsw (pw_1024 in the caller, which
;                 makes pmulhrsw a rounding shift right by 5)
; Produces: four groups of eight results, two results packed per register in
;           m4, m5, m6, m1, each group handed to TRANSPOSE_STORE_8x8 0..3.
; Clobbers: m0-m6.
;
; Only the low 8 interleaved pairs of each window are used, so no punpckhbw
; is issued.  m2 is carried across TRANSPOSE_STORE_8x8: groups 0/1 share the
; [r2 + 1] window and groups 2/3 share the [r2 + 2] window.  This relies on
; the store macro leaving m2 untouched, the same assumption the other MODE_*
; macros in this file already make.
;------------------------------------------------------------------------------
%macro MODE_9_27 1
    ; ---- groups 0 and 1: window at [r2 + 1] ----
    movu        m2, [r2 + 1]
    palignr     m1, m2, 1
    punpcklbw   m2, m1                      ; m2 = low 8 interleaved neighbour pairs
    pmaddubsw   m4, m2, [r3 - 14 * 16]      ; [ 2]
    pmulhrsw    m4, m7
    pmaddubsw   m3, m2, [r3 - 12 * 16]      ; [ 4]
    pmulhrsw    m3, m7
    packuswb    m4, m3
    pmaddubsw   m5, m2, [r3 - 10 * 16]      ; [ 6]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m2, [r3 - 8 * 16]       ; [ 8]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m2, [r3 - 6 * 16]       ; [10]
    pmulhrsw    m6, m7
    pmaddubsw   m3, m2, [r3 - 4 * 16]       ; [12]
    pmulhrsw    m3, m7
    packuswb    m6, m3
    pmaddubsw   m1, m2, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m2, [r3]                ; [16]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1

    pmaddubsw   m4, m2, [r3 + 2 * 16]       ; [18]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m2, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m2, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m2, [r3 + 8 * 16]       ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m2, [r3 + 10 * 16]      ; [26]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m2, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m2, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + 2]                ; [00] fraction 0: raw reference bytes

    TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1

    ; ---- groups 2 and 3: window at [r2 + 2] ----
    movu        m2, [r2 + 2]
    palignr     m1, m2, 1
    punpcklbw   m2, m1
    pmaddubsw   m4, m2, [r3 - 14 * 16]      ; [ 2]
    pmulhrsw    m4, m7
    pmaddubsw   m3, m2, [r3 - 12 * 16]      ; [ 4]
    pmulhrsw    m3, m7
    packuswb    m4, m3
    pmaddubsw   m5, m2, [r3 - 10 * 16]      ; [ 6]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m2, [r3 - 8 * 16]       ; [ 8]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m2, [r3 - 6 * 16]       ; [10]
    pmulhrsw    m6, m7
    pmaddubsw   m0, m2, [r3 - 4 * 16]       ; [12]
    pmulhrsw    m0, m7
    packuswb    m6, m0
    pmaddubsw   m1, m2, [r3 - 2 * 16]       ; [14]
    pmulhrsw    m1, m7
    pmaddubsw   m0, m2, [r3]                ; [16]
    pmulhrsw    m0, m7
    packuswb    m1, m0

    TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1

    ; m2 still holds the [r2 + 2] pairs here; no reload is needed.
    pmaddubsw   m4, m2, [r3 + 2 * 16]       ; [18]
    pmulhrsw    m4, m7
    pmaddubsw   m5, m2, [r3 + 4 * 16]       ; [20]
    pmulhrsw    m5, m7
    packuswb    m4, m5
    pmaddubsw   m5, m2, [r3 + 6 * 16]       ; [22]
    pmulhrsw    m5, m7
    pmaddubsw   m6, m2, [r3 + 8 * 16]       ; [24]
    pmulhrsw    m6, m7
    packuswb    m5, m6
    pmaddubsw   m6, m2, [r3 + 10 * 16]      ; [26]
    pmulhrsw    m6, m7
    pmaddubsw   m1, m2, [r3 + 12 * 16]      ; [28]
    pmulhrsw    m1, m7
    packuswb    m6, m1
    pmaddubsw   m1, m2, [r3 + 14 * 16]      ; [30]
    pmulhrsw    m1, m7
    packuswb    m1, m1
    movhps      m1, [r2 + 3]                ; [00] fraction 0: raw reference bytes

    TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1
%endmacro
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Driver loop around MODE_9_27: the macro body is executed four times.  After
; every pass both destination pointers (r0 and r6) move down by
; 8 * dstStride and the reference pointer r2 moves forward by 8 bytes.
;
; Register roles (r0-r2 are the first three prototype arguments):
;   r0 = dst                 r1 = dstStride           r2 = refLeft
;   r3 = ang_table + 16 * 16 r4 = passes remaining    r5 = 3 * dstStride
;   r6 = r0 + 4 * dstStride  m7 = [pw_1024], the pmulhrsw multiplier
; r5 is not referenced in this function body or in MODE_9_27 directly;
; presumably it is consumed by TRANSPOSE_STORE_8x8 -- confirm there.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_9, 3,7,8
    mova        m7, [pw_1024]
    lea         r3, [ang_table + 16 * 16]
    lea         r6, [r0 + r1 * 4]           ; second row pointer, 4 rows below r0
    lea         r5, [r1 * 3]
    mov         r4d, 4                      ; four passes

.loop:
    MODE_9_27 1

    add         r2, 8                       ; next 8 reference bytes
    lea         r0, [r6 + r1 * 4]           ; r0 += 8 * dstStride
    lea         r6, [r6 + r1 * 8]           ; r6 += 8 * dstStride
    dec         r4
    jnz         .loop
    RET
7072 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Each output row is one reference byte replicated across 32 columns: within
; a pass, row k is byte k of the 16 bytes loaded from [r2 + 1], broadcast by
; pshufb with an all-zero mask and stored twice (columns 0-15 and 16-31).
; Two passes of 16 rows are executed.
;
; Register roles (r0-r5 are the six prototype arguments on entry):
;   r0 = current dst row     r1 = dstStride        r2 = refLeft
;   r3 = refAbove on entry, then reused to hold bFilter
;   r4 = 3 * dstStride       r5 = bFilter on entry, then scratch row pointer
;   r6 = passes remaining    m7 = zero (pshufb broadcast mask)
;   m8 / m9 = stack slots holding the 16 bytes at [refAbove] / [refAbove + 1]
;
; When bFilter is non-zero, row 0 of the pass is replaced by
;   saturate( row + ((above[1 + x] - above[0]) >> 1) ),  x = 0..15
; NOTE(review): the filtered path produces only 16 distinct bytes and writes
; them to both halves of the row, and it runs on both passes (rows 0 and 16)
; with the same refAbove data.  That looks unintended for a 32-wide block;
; presumably callers never pass bFilter != 0 here -- confirm before relying
; on it.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_10, 6,7,8,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    lea         r4, [r1 * 3]                ; r4 = 3 * dstStride
    pxor        m7, m7                      ; zero mask: pshufb broadcasts byte 0
    mov         r6, 2                       ; two passes of 16 rows
    movu        m0, [r3]
    movu        m1, [r3 + 1]
    mova        m8, m0                      ; spill above[0..15]
    mova        m9, m1                      ; spill above[1..16]
    mov         r3d, r5d                    ; keep bFilter; r5 is reused below

.loop:
    movu        m0, [r2 + 1]

    ; rows 1-6.  The two-operand palignr shifts stale destination bytes in at
    ; the top, but only byte 0 survives the pshufb broadcast.
    palignr     m1, m0, 1
    pshufb      m1, m7
    palignr     m2, m0, 2
    pshufb      m2, m7
    palignr     m3, m0, 3
    pshufb      m3, m7
    palignr     m4, m0, 4
    pshufb      m4, m7
    palignr     m5, m0, 5
    pshufb      m5, m7
    palignr     m6, m0, 6
    pshufb      m6, m7

    movu        [r0 + r1], m1
    movu        [r0 + r1 + 16], m1
    movu        [r0 + r1 * 2], m2
    movu        [r0 + r1 * 2 + 16], m2
    movu        [r0 + r4], m3
    movu        [r0 + r4 + 16], m3
    lea         r5, [r0 + r1 * 4]           ; r5 -> row 4
    movu        [r5], m4
    movu        [r5 + 16], m4
    movu        [r5 + r1], m5
    movu        [r5 + r1 + 16], m5
    movu        [r5 + r1 * 2], m6
    movu        [r5 + r1 * 2 + 16], m6

    ; rows 7-12
    palignr     m1, m0, 7
    pshufb      m1, m7
    movhlps     m2, m0                      ; byte 8 of m0 moves to byte 0
    pshufb      m2, m7
    palignr     m3, m0, 9
    pshufb      m3, m7
    palignr     m4, m0, 10
    pshufb      m4, m7
    palignr     m5, m0, 11
    pshufb      m5, m7
    palignr     m6, m0, 12
    pshufb      m6, m7

    movu        [r5 + r4], m1
    movu        [r5 + r4 + 16], m1
    lea         r5, [r5 + r1 * 4]           ; r5 -> row 8
    movu        [r5], m2
    movu        [r5 + 16], m2
    movu        [r5 + r1], m3
    movu        [r5 + r1 + 16], m3
    movu        [r5 + r1 * 2], m4
    movu        [r5 + r1 * 2 + 16], m4
    movu        [r5 + r4], m5
    movu        [r5 + r4 + 16], m5
    lea         r5, [r5 + r1 * 4]           ; r5 -> row 12
    movu        [r5], m6
    movu        [r5 + 16], m6

    ; rows 13-15, then row 0 (kept in m0 for the optional filter)
    palignr     m1, m0, 13
    pshufb      m1, m7
    palignr     m2, m0, 14
    pshufb      m2, m7
    palignr     m3, m0, 15
    pshufb      m3, m7
    pshufb      m0, m7

    movu        [r5 + r1], m1
    movu        [r5 + r1 + 16], m1
    movu        [r5 + r1 * 2], m2
    movu        [r5 + r1 * 2 + 16], m2
    movu        [r5 + r4], m3
    movu        [r5 + r4 + 16], m3

    ; filter
    test        r3d, r3d
    jz          .quit
    pmovzxbw    m0, m0                      ; row value as 8 words
    mova        m1, m0                      ; same value for the upper 8 columns
    movu        m2, m8
    movu        m3, m9

    pshufb      m2, m7                      ; broadcast above[0]
    pmovzxbw    m2, m2
    movhlps     m4, m3                      ; m4 = above[9..16]
    pmovzxbw    m3, m3                      ; m3 = above[1..8] as words
    pmovzxbw    m4, m4
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1                      ; saturate back to bytes

.quit:
    movu        [r0], m0
    movu        [r0 + 16], m0
    dec         r6
    lea         r0, [r5 + r1 * 4]           ; r0 += 16 rows (lea leaves flags intact)
    lea         r2, [r2 + 16]               ; next 16 reference bytes
    jnz         .loop                       ; tests ZF from the dec above
    RET
7191 | ||
7192 | ;------------------------------------------------------------------------------------------------------------------- | |
7193 | ; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) | |
7194 | ;------------------------------------------------------------------------------------------------------------------- | |
; intra_pred_ang32_11 (SSE4): copies reference bytes into a 64-byte-aligned stack buffer
; and then invokes PROC32_8x8 four times per .loop pass, for four passes.
; Stack buffer layout (relative to the aligned rsp):
;   byte 0       = [r3 + 16] (a broadcast fills bytes 0..15; bytes 1..15 are then overwritten)
;   bytes 1..48  = 48 bytes copied from [r2]
;   byte 63      = pass counter (starts at 4)
;   [rsp + 64]   = caller's rsp, restored before RET
; PROC32_8x8 is defined outside this chunk; its register contract is not visible here.
7195 | INIT_XMM sse4 | |
7196 | cglobal intra_pred_ang32_11, 4,7,8 | |
7197 | ; NOTE: align the stack to 64 bytes so that all local data shares one cache line | |
7198 | |
7199 | mov r6, rsp | |
7200 | sub rsp, 64+gprsize | |
7201 | and rsp, ~63 | |
7202 | mov [rsp+64], r6 | |
7203 | |
7204 | ; collect reference pixel | |
7205 | movu m0, [r3 + 16] | |
7206 | pxor m1, m1 | |
7207 | pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] | |
7208 | mova [rsp], m0 | |
7209 | movu m0, [r2] | |
7210 | movu m1, [r2 + 16] | |
7211 | movu m2, [r2 + 32] | |
7212 | movu [rsp + 1], m0 | |
7213 | movu [rsp + 1 + 16], m1 | |
7214 | movu [rsp + 1 + 32], m2 | |
7215 | mov [rsp + 63], byte 4 | |
7216 | |
; NOTE(review): m5 and m6 loaded here are overwritten by "mova m5, m7" / "mova m6, m7" at
; the top of .loop before any visible read, so these two loads never reach PROC32_8x8.
; Presumably the macro fetches the constants itself — confirm against its definition.
7217 | ; filter | |
7218 | lea r2, [rsp + 1] ; r2 -> [0] | |
7219 | lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
7220 | lea r4, [ang_table] ; r4 -> ang_table | |
7221 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
7222 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
7223 | mova m5, [pw_1024] ; m5 -> 1024 | |
7224 | mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
7225 | |
; Each group below loads 16 buffer bytes into m7 and copies them to m0..m6 before invoking
; PROC32_8x8. The first two groups read from [r2], the last two from [r2 - 1].
; NOTE(review): the eight trailing macro arguments look like per-row ang_table indices —
; confirm against the macro definition.
7226 | .loop: | |
7227 | ; Row[0 - 7] | |
7228 | movu m7, [r2] | |
7229 | mova m0, m7 | |
7230 | mova m1, m7 | |
7231 | mova m2, m7 | |
7232 | mova m3, m7 | |
7233 | mova m4, m7 | |
7234 | mova m5, m7 | |
7235 | mova m6, m7 | |
7236 | PROC32_8x8 0, 1, 30,28,26,24,22,20,18,16 | |
7237 | |
7238 | ; Row[8 - 15] | |
7239 | movu m7, [r2] | |
7240 | mova m0, m7 | |
7241 | mova m1, m7 | |
7242 | mova m2, m7 | |
7243 | mova m3, m7 | |
7244 | mova m4, m7 | |
7245 | mova m5, m7 | |
7246 | mova m6, m7 | |
7247 | PROC32_8x8 1, 1, 14,12,10,8,6,4,2,0 | |
7248 | |
7249 | ; Row[16 - 23] | |
7250 | movu m7, [r2 - 1] | |
7251 | mova m0, m7 | |
7252 | mova m1, m7 | |
7253 | mova m2, m7 | |
7254 | mova m3, m7 | |
7255 | mova m4, m7 | |
7256 | mova m5, m7 | |
7257 | mova m6, m7 | |
7258 | PROC32_8x8 2, 1, 30,28,26,24,22,20,18,16 | |
7259 | |
7260 | ; Row[24 - 31] | |
7261 | movu m7, [r2 - 1] | |
7262 | mova m0, m7 | |
7263 | mova m1, m7 | |
7264 | mova m2, m7 | |
7265 | mova m3, m7 | |
7266 | mova m4, m7 | |
7267 | mova m5, m7 | |
7268 | mova m6, m7 | |
7269 | PROC32_8x8 3, 1, 14,12,10,8,6,4,2,0 | |
7270 | |
; Next pass: r0 and r6 both advance by 8 * stride (r6 stays 4 * stride ahead of r0), the
; buffer pointer advances by 8 bytes, and the in-memory pass counter is decremented.
7271 | lea r0, [r6 + r1 * 4] | |
7272 | lea r6, [r6 + r1 * 8] | |
7273 | add r2, 8 | |
7274 | dec byte [rsp + 63] | |
7275 | jnz .loop | |
7276 | mov rsp, [rsp+64] | |
7277 | RET | |
7278 | ||
; MODE_12_24_ROW0 %1: first 8-pixel step of the mode 12/24 predictor. %1 is forwarded
; unchanged as the second argument of every TRANSPOSE_STORE_8x8 invocation.
; Expects from the invoking function: "above" %defined as a 16-byte stack slot, r2 = main
; reference bytes, r3 = source of the side samples, r4 = ang_table + 16 * 16 (so
; [r4 + k * 16] is table entry 16 + k, the value shown in brackets), m7 = [pw_1024].
; Each result is pmaddubsw(pixel pairs, weights) followed by pmulhrsw with m7; assuming
; pw_1024 holds words of 1024 this is (sum + 16) >> 5.
; TRANSPOSE_STORE_8x8 is defined outside this chunk.
7279 | %macro MODE_12_24_ROW0 1 | |
; Build "above": with c_mode32_12_0 (see the table near the top of the file) bytes 13, 14, 15
; become [r3 + 19], [r3 + 13], [r3 + 6]; pinsrb sets byte 12 = [r3 + 26]; bytes 0..11 are
; copies of [r3 + 6]. The code below consumes the slot from byte 15 downwards.
7280 | movu m0, [r3 + 6] | |
7281 | pshufb m0, [c_mode32_12_0] | |
7282 | pinsrb m0, [r3 + 26], 12 | |
7283 | mova above, m0 | |
; m2 = interleaved pairs ([r2 + i], [r2 + i + 1]) for i = 0..7.
7284 | movu m2, [r2] | |
7285 | palignr m1, m2, 1 | |
7286 | punpcklbw m2, m1 | |
7287 | pmaddubsw m4, m2, [r4 + 11 * 16] ; [27] | |
7288 | pmulhrsw m4, m7 | |
7289 | pmaddubsw m3, m2, [r4 + 6 * 16] ; [22] | |
7290 | pmulhrsw m3, m7 | |
7291 | packuswb m4, m3 | |
7292 | pmaddubsw m5, m2, [r4 + 16] ; [17] | |
7293 | pmulhrsw m5, m7 | |
7294 | pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] | |
7295 | pmulhrsw m6, m7 | |
7296 | packuswb m5, m6 | |
7297 | pmaddubsw m6, m2, [r4 - 9 * 16] ; [7] | |
7298 | pmulhrsw m6, m7 | |
7299 | pmaddubsw m3, m2, [r4 - 14 * 16] ; [2] | |
7300 | pmulhrsw m3, m7 | |
7301 | packuswb m6, m3 | |
; Pairs shifted back by one pixel: the first pair is (above byte 15, [r2]).
7302 | movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
7303 | palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] | |
7304 | punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] | |
7305 | pmaddubsw m1, m2, [r4 + 13 * 16] ; [29] | |
7306 | pmulhrsw m1, m7 | |
7307 | pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] | |
7308 | pmulhrsw m3, m7 | |
7309 | packuswb m1, m3 | |
7310 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
7311 | pmaddubsw m4, m2, [r4 + 3 * 16] ; [19] | |
7312 | pmulhrsw m4, m7 | |
7313 | pmaddubsw m5, m2, [r4 - 2 * 16] ; [14] | |
7314 | pmulhrsw m5, m7 | |
7315 | packuswb m4, m5 | |
7316 | pmaddubsw m5, m2, [r4 - 7 * 16] ; [09] | |
7317 | pmulhrsw m5, m7 | |
7318 | pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] | |
7319 | pmulhrsw m6, m7 | |
7320 | packuswb m5, m6 | |
; Prepend one more pair taken from bytes 14 and 15 of "above".
7321 | palignr m2, above, 14 ;[6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] | |
7322 | pmaddubsw m6, m2, [r4 + 15 * 16] ; [31] | |
7323 | pmulhrsw m6, m7 | |
7324 | pmaddubsw m1, m2, [r4 + 10 * 16] ; [26] | |
7325 | pmulhrsw m1, m7 | |
7326 | packuswb m6, m1 | |
7327 | pmaddubsw m1, m2, [r4 + 5 * 16] ; [21] | |
7328 | pmulhrsw m1, m7 | |
7329 | pmaddubsw m3, m2, [r4] ; [16] | |
7330 | pmulhrsw m3, m7 | |
7331 | packuswb m1, m3 | |
7332 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
7333 | pmaddubsw m4, m2, [r4 - 5 * 16] ; [11] | |
7334 | pmulhrsw m4, m7 | |
7335 | pmaddubsw m3, m2, [r4 - 10 * 16] ; [06] | |
7336 | pmulhrsw m3, m7 | |
7337 | packuswb m4, m3 | |
7338 | pmaddubsw m5, m2, [r4 - 15 * 16] ; [1] | |
7339 | pmulhrsw m5, m7 | |
; "above" shifted left by one byte, so the prepended pair is (above byte 13, above byte 14).
7340 | pslldq m1, above, 1 | |
7341 | palignr m2, m1, 14 | |
7342 | pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] | |
7343 | pmulhrsw m6, m7 | |
7344 | packuswb m5, m6 | |
7345 | pmaddubsw m6, m2, [r4 + 7 * 16] ; [23] | |
7346 | pmulhrsw m6, m7 | |
7347 | pmaddubsw m3, m2, [r4 + 2 * 16] ; [18] | |
7348 | pmulhrsw m3, m7 | |
7349 | packuswb m6, m3 | |
7350 | pmaddubsw m1, m2, [r4 - 3 * 16] ; [13] | |
7351 | pmulhrsw m1, m7 | |
7352 | pmaddubsw m3, m2, [r4 - 8 * 16] ; [8] | |
7353 | pmulhrsw m3, m7 | |
7354 | packuswb m1, m3 | |
7355 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
7356 | pmaddubsw m4, m2, [r4 - 13 * 16] ; [3] | |
7357 | pmulhrsw m4, m7 | |
; "above" shifted left by two bytes: the prepended pair is (above byte 12, above byte 13).
7358 | pslldq m1, above, 2 | |
7359 | palignr m2, m1, 14 | |
7360 | pmaddubsw m5, m2, [r4 + 14 * 16] ; [30] | |
7361 | pmulhrsw m5, m7 | |
7362 | packuswb m4, m5 | |
7363 | pmaddubsw m5, m2, [r4 + 9 * 16] ; [25] | |
7364 | pmulhrsw m5, m7 | |
7365 | pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] | |
7366 | pmulhrsw m6, m7 | |
7367 | packuswb m5, m6 | |
7368 | pmaddubsw m6, m2, [r4 - 16] ; [15] | |
7369 | pmulhrsw m6, m7 | |
7370 | pmaddubsw m1, m2, [r4 - 6 * 16] ; [10] | |
7371 | pmulhrsw m1, m7 | |
7372 | packuswb m6, m1 | |
7373 | pmaddubsw m1, m2, [r4 - 11 * 16] ; [05] | |
7374 | pmulhrsw m1, m7 | |
; Last row of the group: pb_fact0 selects the even bytes (first pixel of each pair) of m2,
; which are widened to words and packed unmodified — no weighted sum is computed.
; Presumably this is the weight-0 row of the sequence — confirm.
7375 | movu m0, [pb_fact0] | |
7376 | pshufb m2, m0 | |
7377 | pmovzxbw m2, m2 | |
7378 | packuswb m1, m2 | |
7379 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
7380 | %endmacro | |
7381 | ||
; MODE_12_24 %1: 8-pixel step of the mode 12/24 predictor for the positions after the
; first; all pixel pairs come from memory at [r2], [r2 - 2], [r2 - 3] and [r2 - 4].
; %1 is forwarded as the second argument of TRANSPOSE_STORE_8x8 (defined outside this chunk).
; Expects r4 = ang_table + 16 * 16 (bracketed numbers are the table entry) and m7 = [pw_1024].
; Pair construction used four times below: after punpcklbw the "low" register holds pairs
; ([p + i], [p + i + 1]) for i = 0..7, and after palignr ..., 2 the other register holds the
; same pairs for i = 1..8.
7382 | %macro MODE_12_24 1 | |
7383 | movu m2, [r2] | |
7384 | palignr m1, m2, 1 | |
7385 | punpckhbw m0, m2, m1 | |
7386 | punpcklbw m2, m1 | |
7387 | palignr m0, m2, 2 | |
7388 | pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] | |
7389 | pmulhrsw m4, m7 | |
7390 | pmaddubsw m3, m0, [r4 + 6 * 16] ; [22] | |
7391 | pmulhrsw m3, m7 | |
7392 | packuswb m4, m3 | |
7393 | pmaddubsw m5, m0, [r4 + 16] ; [17] | |
7394 | pmulhrsw m5, m7 | |
7395 | pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] | |
7396 | pmulhrsw m6, m7 | |
7397 | packuswb m5, m6 | |
7398 | pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] | |
7399 | pmulhrsw m6, m7 | |
7400 | pmaddubsw m3, m0, [r4 - 14 * 16] ; [2] | |
7401 | pmulhrsw m3, m7 | |
7402 | packuswb m6, m3 | |
; From here the pairs starting one pixel earlier (m2) are used.
7403 | pmaddubsw m1, m2, [r4 + 13 * 16] ; [29] | |
7404 | pmulhrsw m1, m7 | |
7405 | pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] | |
7406 | pmulhrsw m3, m7 | |
7407 | packuswb m1, m3 | |
7408 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
7409 | pmaddubsw m4, m2, [r4 + 3 * 16] ; [19] | |
7410 | pmulhrsw m4, m7 | |
7411 | pmaddubsw m5, m2, [r4 - 2 * 16] ; [14] | |
7412 | pmulhrsw m5, m7 | |
7413 | packuswb m4, m5 | |
7414 | pmaddubsw m5, m2, [r4 - 7 * 16] ; [09] | |
7415 | pmulhrsw m5, m7 | |
7416 | pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] | |
7417 | pmulhrsw m6, m7 | |
7418 | packuswb m5, m6 | |
; Reload from [r2 - 2]; after the palignr, m2 holds the pairs starting at [r2 - 1].
7419 | movu m0, [r2 - 2] | |
7420 | palignr m1, m0, 1 | |
7421 | punpckhbw m2, m0, m1 | |
7422 | punpcklbw m0, m1 | |
7423 | palignr m2, m0, 2 | |
7424 | pmaddubsw m6, m2, [r4 + 15 * 16] ; [31] | |
7425 | pmulhrsw m6, m7 | |
7426 | pmaddubsw m1, m2, [r4 + 10 * 16] ; [26] | |
7427 | pmulhrsw m1, m7 | |
7428 | packuswb m6, m1 | |
7429 | pmaddubsw m1, m2, [r4 + 5 * 16] ; [21] | |
7430 | pmulhrsw m1, m7 | |
7431 | pmaddubsw m3, m2, [r4] ; [16] | |
7432 | pmulhrsw m3, m7 | |
7433 | packuswb m1, m3 | |
7434 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
7435 | pmaddubsw m4, m2, [r4 - 5 * 16] ; [11] | |
7436 | pmulhrsw m4, m7 | |
7437 | pmaddubsw m3, m2, [r4 - 10 * 16] ; [06] | |
7438 | pmulhrsw m3, m7 | |
7439 | packuswb m4, m3 | |
7440 | pmaddubsw m5, m2, [r4 - 15 * 16] ; [1] | |
7441 | pmulhrsw m5, m7 | |
; Reload from [r2 - 3]; after the palignr, m2 holds the pairs starting at [r2 - 2].
7442 | movu m0, [r2 - 3] | |
7443 | palignr m1, m0, 1 | |
7444 | punpckhbw m2, m0, m1 | |
7445 | punpcklbw m0, m1 | |
7446 | palignr m2, m0, 2 | |
7447 | pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] | |
7448 | pmulhrsw m6, m7 | |
7449 | packuswb m5, m6 | |
7450 | pmaddubsw m6, m2, [r4 + 7 * 16] ; [23] | |
7451 | pmulhrsw m6, m7 | |
7452 | pmaddubsw m3, m2, [r4 + 2 * 16] ; [18] | |
7453 | pmulhrsw m3, m7 | |
7454 | packuswb m6, m3 | |
7455 | pmaddubsw m1, m2, [r4 - 3 * 16] ; [13] | |
7456 | pmulhrsw m1, m7 | |
7457 | pmaddubsw m3, m2, [r4 - 8 * 16] ; [8] | |
7458 | pmulhrsw m3, m7 | |
7459 | packuswb m1, m3 | |
7460 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
7461 | pmaddubsw m4, m2, [r4 - 13 * 16] ; [3] | |
7462 | pmulhrsw m4, m7 | |
; Reload from [r2 - 4]; after the palignr, m0 holds the pairs starting at [r2 - 3].
7463 | movu m2, [r2 - 4] | |
7464 | palignr m1, m2, 1 | |
7465 | punpckhbw m0, m2, m1 | |
7466 | punpcklbw m2, m1 | |
7467 | palignr m0, m2, 2 | |
7468 | pmaddubsw m5, m0, [r4 + 14 * 16] ; [30] | |
7469 | pmulhrsw m5, m7 | |
7470 | packuswb m4, m5 | |
7471 | pmaddubsw m5, m0, [r4 + 9 * 16] ; [25] | |
7472 | pmulhrsw m5, m7 | |
7473 | pmaddubsw m6, m0, [r4 + 4 * 16] ; [20] | |
7474 | pmulhrsw m6, m7 | |
7475 | packuswb m5, m6 | |
7476 | pmaddubsw m6, m0, [r4 - 16] ; [15] | |
7477 | pmulhrsw m6, m7 | |
7478 | pmaddubsw m1, m0, [r4 - 6 * 16] ; [10] | |
7479 | pmulhrsw m1, m7 | |
7480 | packuswb m6, m1 | |
7481 | pmaddubsw m1, m0, [r4 - 11 * 16] ; [05] | |
7482 | pmulhrsw m1, m7 | |
; Last row of the group: pb_fact0 selects the even bytes (first pixel of each pair) of m0,
; which are widened and packed unmodified — no weighted sum is computed.
; Presumably this is the weight-0 row of the sequence — confirm.
7483 | movu m2, [pb_fact0] | |
7484 | pshufb m0, m2 | |
7485 | pmovzxbw m0, m0 | |
7486 | packuswb m1, m0 | |
7487 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
7488 | %endmacro | |
7489 | ;----------------------------------------------------------------------------------------------------------------- | |
7490 | ; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) | |
7491 | ;----------------------------------------------------------------------------------------------------------------- | |
; intra_pred_ang32_12 (SSE4): runs MODE_12_24_ROW0 once, then MODE_12_24 three times,
; advancing the output pointers by 8 * stride after each macro expansion.
; "above" is one mmsize stack slot that MODE_12_24_ROW0 fills from r3 (refAbove per the
; signature comment above); r3 is free afterwards and is reused as the loop counter.
7492 | INIT_XMM sse4 | |
7493 | cglobal intra_pred_ang32_12, 4,7,8,0-(1*mmsize) | |
7494 | %define above [rsp + 0 * mmsize] | |
7495 | |
; r4 points at entry 16 of ang_table so the macros can reach entries 0..31 with offsets of
; -16..+15 entries. m7 is the pmulhrsw multiplier used by both macros.
7496 | lea r4, [ang_table + 16 * 16] | |
7497 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
7498 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
7499 | mova m7, [pw_1024] | |
7500 | |
; The reference pointer advances by 7 after the first step and by 8 after each later one.
7501 | MODE_12_24_ROW0 1 | |
7502 | lea r0, [r6 + r1 * 4] | |
7503 | lea r6, [r6 + r1 * 8] | |
7504 | add r2, 7 | |
7505 | mov r3, 3 | |
7506 | .loop: | |
7507 | MODE_12_24 1 | |
7508 | lea r0, [r6 + r1 * 4] | |
7509 | lea r6, [r6 + r1 * 8] | |
7510 | add r2, 8 | |
7511 | dec r3 | |
7512 | jnz .loop | |
7513 | RET | |
7514 | ||
; MODE_13_23_ROW0 %1: first 8-pixel step of the mode 13/23 predictor. %1 is forwarded
; unchanged as the second argument of every TRANSPOSE_STORE_8x8 invocation.
; Expects from the invoking function: "above" %defined as a 16-byte stack slot, r2 = main
; reference bytes, r3 = source of the side samples, r4 = ang_table + 16 * 16 (bracketed
; numbers are the table entry), m7 = [pw_1024].
; TRANSPOSE_STORE_8x8 is defined outside this chunk.
7515 | %macro MODE_13_23_ROW0 1 | |
; Build "above". With c_mode32_13_0 = {3, 6, 10, 13, ...} and c_mode32_13_shuf (both near
; the top of the file), bytes 15 down to 8 of the slot end up as
; [r3 + 4], [r3 + 7], [r3 + 11], [r3 + 14], [r3 + 18], [r3 + 21], [r3 + 25], [r3 + 28];
; bytes 0..7 are copies of [r3 + 4]. The code below consumes the slot from byte 15 down.
7516 | movu m0, [r3 + 1] | |
7517 | movu m1, [r3 + 15] | |
7518 | pshufb m0, [c_mode32_13_0] | |
7519 | pshufb m1, [c_mode32_13_0] | |
7520 | punpckldq m0, m1 | |
7521 | pshufb m0, [c_mode32_13_shuf] | |
7522 | mova above, m0 | |
; m2 = interleaved pairs ([r2 + i], [r2 + i + 1]) for i = 0..7.
7523 | movu m2, [r2] | |
7524 | palignr m1, m2, 1 | |
7525 | punpcklbw m2, m1 | |
7526 | pmaddubsw m4, m2, [r4 + 7 * 16] ; [23] | |
7527 | pmulhrsw m4, m7 | |
7528 | pmaddubsw m3, m2, [r4 - 2 * 16] ; [14] | |
7529 | pmulhrsw m3, m7 | |
7530 | packuswb m4, m3 | |
7531 | pmaddubsw m5, m2, [r4 - 11 * 16] ; [5] | |
7532 | pmulhrsw m5, m7 | |
; Pairs shifted back by one pixel: the first pair is (above byte 15, [r2]).
7533 | movu m1, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] | |
7534 | palignr m2, m1, above, 15 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] | |
7535 | punpcklbw m2, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] | |
7536 | pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] | |
7537 | pmulhrsw m6, m7 | |
7538 | packuswb m5, m6 | |
7539 | pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] | |
7540 | pmulhrsw m6, m7 | |
7541 | pmaddubsw m0, m2, [r4 - 6 * 16] ; [10] | |
7542 | pmulhrsw m0, m7 | |
7543 | packuswb m6, m0 | |
7544 | pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] | |
7545 | pmulhrsw m1, m7 | |
; Prepend the pair (above byte 14, above byte 15).
7546 | palignr m2, above, 14 | |
7547 | pmaddubsw m3, m2, [r4 + 8 * 16] ; [24] | |
7548 | pmulhrsw m3, m7 | |
7549 | packuswb m1, m3 | |
7550 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
7551 | pmaddubsw m4, m2, [r4 - 16] ; [15] | |
7552 | pmulhrsw m4, m7 | |
7553 | pmaddubsw m5, m2, [r4 - 10 * 16] ; [6] | |
7554 | pmulhrsw m5, m7 | |
7555 | packuswb m4, m5 | |
; Each "pslldq + palignr ..., 14" step below prepends one more pair from "above", moving
; one byte further down the slot each time. m0 holds the shifted copy, and it is reloaded
; from "above" with the cumulative shift whenever m0 was used as a result register in between.
7556 | pslldq m0, above, 1 | |
7557 | palignr m2, m0, 14 | |
7558 | pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] | |
7559 | pmulhrsw m5, m7 | |
7560 | pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] | |
7561 | pmulhrsw m6, m7 | |
7562 | packuswb m5, m6 | |
7563 | pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] | |
7564 | pmulhrsw m6, m7 | |
7565 | pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] | |
7566 | pmulhrsw m1, m7 | |
7567 | packuswb m6, m1 | |
7568 | pslldq m0, 1 | |
7569 | palignr m2, m0, 14 | |
7570 | pmaddubsw m1, m2, [r4 + 9 * 16] ; [25] | |
7571 | pmulhrsw m1, m7 | |
7572 | pmaddubsw m0, m2, [r4] ; [16] | |
7573 | pmulhrsw m0, m7 | |
7574 | packuswb m1, m0 | |
7575 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
7576 | pmaddubsw m4, m2, [r4 - 9 * 16] ; [7] | |
7577 | pmulhrsw m4, m7 | |
7578 | pslldq m0, above, 3 | |
7579 | palignr m2, m0, 14 | |
7580 | pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] | |
7581 | pmulhrsw m3, m7 | |
7582 | packuswb m4, m3 | |
7583 | pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] | |
7584 | pmulhrsw m5, m7 | |
7585 | pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] | |
7586 | pmulhrsw m6, m7 | |
7587 | packuswb m5, m6 | |
7588 | pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] | |
7589 | pmulhrsw m6, m7 | |
7590 | pslldq m0, 1 | |
7591 | palignr m2, m0, 14 | |
7592 | pmaddubsw m0, m2, [r4 + 10 * 16] ; [26] | |
7593 | pmulhrsw m0, m7 | |
7594 | packuswb m6, m0 | |
7595 | pmaddubsw m1, m2, [r4 + 16] ; [17] | |
7596 | pmulhrsw m1, m7 | |
7597 | pmaddubsw m0, m2, [r4 - 8 * 16] ; [8] | |
7598 | pmulhrsw m0, m7 | |
7599 | packuswb m1, m0 | |
7600 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
7601 | pslldq m0, above, 5 | |
7602 | palignr m2, m0, 14 | |
7603 | pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] | |
7604 | pmulhrsw m4, m7 | |
7605 | pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] | |
7606 | pmulhrsw m5, m7 | |
7607 | packuswb m4, m5 | |
7608 | pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] | |
7609 | pmulhrsw m5, m7 | |
7610 | pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] | |
7611 | pmulhrsw m6, m7 | |
7612 | packuswb m5, m6 | |
7613 | pslldq m0, 1 | |
7614 | palignr m2, m0, 14 | |
7615 | pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] | |
7616 | pmulhrsw m6, m7 | |
7617 | pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] | |
7618 | pmulhrsw m1, m7 | |
7619 | packuswb m6, m1 | |
7620 | pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] | |
7621 | pmulhrsw m1, m7 | |
; Unlike the other MODE_* macros in this section, the final row here goes through the
; multiply path using table entry 0 instead of the pb_fact0 byte selection.
7622 | pmaddubsw m3, m2, [r4 - 16 * 16] ; [00] | |
7623 | pmulhrsw m3, m7 | |
7624 | packuswb m1, m3 | |
7625 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
7626 | %endmacro | |
7627 | ||
; MODE_13_23 %1: 8-pixel step of the mode 13/23 predictor for the positions after the
; first; all pixel pairs come from memory at [r2], [r2 - 2], [r2 - 4], [r2 - 6], [r2 - 7].
; %1 is forwarded as the second argument of TRANSPOSE_STORE_8x8 (defined outside this chunk).
; Expects r4 = ang_table + 16 * 16 (bracketed numbers are the table entry) and m7 = [pw_1024].
; NOTE(review): the lane-layout comments on the reloads below are relative to each load
; address, not to r2; several were copied verbatim from the first load.
7628 | %macro MODE_13_23 1 | |
7629 | movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
7630 | palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] | |
7631 | punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] | |
7632 | punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] | |
7633 | palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] | |
7634 | pmaddubsw m4, m0, [r4 + 7 * 16] ; [23] | |
7635 | pmulhrsw m4, m7 | |
7636 | pmaddubsw m3, m0, [r4 - 2 * 16] ; [14] | |
7637 | pmulhrsw m3, m7 | |
7638 | packuswb m4, m3 | |
7639 | pmaddubsw m5, m0, [r4 - 11 * 16] ; [05] | |
7640 | pmulhrsw m5, m7 | |
; From here the pairs starting one pixel earlier (m2) are used.
7641 | pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] | |
7642 | pmulhrsw m6, m7 | |
7643 | packuswb m5, m6 | |
7644 | pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] | |
7645 | pmulhrsw m6, m7 | |
7646 | pmaddubsw m3, m2, [r4 - 6 * 16] ; [10] | |
7647 | pmulhrsw m3, m7 | |
7648 | packuswb m6, m3 | |
7649 | pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] | |
7650 | pmulhrsw m1, m7 | |
7651 | movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1] | |
7652 | palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
7653 | punpckhbw m0, m2, m3 | |
7654 | punpcklbw m2, m3 | |
7655 | palignr m0, m2, 2 | |
7656 | pmaddubsw m3, m0, [r4 + 8 * 16] ; [24] | |
7657 | pmulhrsw m3, m7 | |
7658 | packuswb m1, m3 | |
; The pairs in m0 are copied to m3 before the store macro and read back from m3 after it.
; NOTE(review): presumably TRANSPOSE_STORE_8x8 clobbers m0 but not m2/m3 — confirm.
7659 | mova m3, m0 | |
7660 | TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 | |
7661 | pmaddubsw m4, m3, [r4 - 16] ; [15] | |
7662 | pmulhrsw m4, m7 | |
7663 | pmaddubsw m5, m3, [r4 - 10 * 16] ; [6] | |
7664 | pmulhrsw m5, m7 | |
7665 | packuswb m4, m5 | |
7666 | pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] | |
7667 | pmulhrsw m5, m7 | |
7668 | pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] | |
7669 | pmulhrsw m6, m7 | |
7670 | packuswb m5, m6 | |
7671 | pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] | |
7672 | pmulhrsw m6, m7 | |
7673 | pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] | |
7674 | pmulhrsw m1, m7 | |
7675 | packuswb m6, m1 | |
7676 | movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
7677 | palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] | |
7678 | punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] | |
7679 | punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] | |
7680 | palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] | |
7681 | pmaddubsw m1, m0, [r4 + 9 * 16] ; [25] | |
7682 | pmulhrsw m1, m7 | |
7683 | pmaddubsw m3, m0, [r4] ; [16] | |
7684 | pmulhrsw m3, m7 | |
7685 | packuswb m1, m3 | |
7686 | mova m3, m0 | |
7687 | TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 | |
7688 | pmaddubsw m4, m3, [r4 - 9 * 16] ; [7] | |
7689 | pmulhrsw m4, m7 | |
7690 | pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] | |
7691 | pmulhrsw m3, m7 | |
7692 | packuswb m4, m3 | |
7693 | pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] | |
7694 | pmulhrsw m5, m7 | |
7695 | pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] | |
7696 | pmulhrsw m6, m7 | |
7697 | packuswb m5, m6 | |
7698 | pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] | |
7699 | pmulhrsw m6, m7 | |
7700 | movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
7701 | palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] | |
7702 | punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] | |
7703 | punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] | |
7704 | palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] | |
7705 | pmaddubsw m3, m0, [r4 + 10 * 16] ; [26] | |
7706 | pmulhrsw m3, m7 | |
7707 | packuswb m6, m3 | |
7708 | pmaddubsw m1, m0, [r4 + 16] ; [17] | |
7709 | pmulhrsw m1, m7 | |
7710 | pmaddubsw m3, m0, [r4 - 8 * 16] ; [8] | |
7711 | pmulhrsw m3, m7 | |
7712 | packuswb m1, m3 | |
7713 | TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 | |
7714 | pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] | |
7715 | pmulhrsw m4, m7 | |
7716 | pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] | |
7717 | pmulhrsw m5, m7 | |
7718 | packuswb m4, m5 | |
7719 | pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] | |
7720 | pmulhrsw m5, m7 | |
7721 | pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] | |
7722 | pmulhrsw m6, m7 | |
7723 | packuswb m5, m6 | |
; Final reload: only the low pairs starting at [r2 - 7] are built (no punpckhbw/palignr).
7724 | movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] | |
7725 | palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] | |
7726 | punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] | |
7727 | pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] | |
7728 | pmulhrsw m6, m7 | |
7729 | pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] | |
7730 | pmulhrsw m1, m7 | |
7731 | packuswb m6, m1 | |
7732 | pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] | |
7733 | pmulhrsw m1, m7 | |
; Last row of the group: pb_fact0 selects the even bytes (first pixel of each pair) of m2,
; which are widened and packed unmodified — no weighted sum is computed.
; Presumably this is the weight-0 row of the sequence — confirm.
7734 | movu m0, [pb_fact0] | |
7735 | pshufb m2, m0 | |
7736 | pmovzxbw m2, m2 | |
7737 | packuswb m1, m2 | |
7738 | TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 | |
7739 | %endmacro | |
7740 | ;----------------------------------------------------------------------------------------------------------------- | |
7741 | ; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) | |
7742 | ;----------------------------------------------------------------------------------------------------------------- | |
; intra_pred_ang32_13 (SSE4): runs MODE_13_23_ROW0 once, then MODE_13_23 three times,
; advancing the output pointers by 8 * stride after each macro expansion.
; "above" is one mmsize stack slot that MODE_13_23_ROW0 fills from r3 (refAbove per the
; signature comment above); r3 is free afterwards and is reused as the loop counter.
7743 | INIT_XMM sse4 | |
7744 | cglobal intra_pred_ang32_13, 4,7,8,0-(1*mmsize) | |
7745 | %define above [rsp + 0 * mmsize] | |
; r4 points at entry 16 of ang_table so the macros can reach entries 0..31 with offsets of
; -16..+15 entries. m7 is the pmulhrsw multiplier used by both macros.
7746 | lea r4, [ang_table + 16 * 16] | |
7747 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
7748 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
7749 | mova m7, [pw_1024] | |
7750 | |
; The reference pointer advances by 7 after the first step and by 8 after each later one.
7751 | MODE_13_23_ROW0 1 | |
7752 | lea r0, [r6 + r1 * 4] | |
7753 | lea r6, [r6 + r1 * 8] | |
7754 | add r2, 7 | |
7755 | mov r3, 3 | |
7756 | .loop: | |
7757 | MODE_13_23 1 | |
7758 | lea r0, [r6 + r1 * 4] | |
7759 | lea r6, [r6 + r1 * 8] | |
7760 | add r2, 8 | |
7761 | dec r3 | |
7762 | jnz .loop | |
7763 | RET | |
7764 | ||
7765 | ;------------------------------------------------------------------------------------------------------------------- | |
7766 | ; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) | |
7767 | ;------------------------------------------------------------------------------------------------------------------- | |
; intra_pred_ang32_14 (SSE4): gathers shuffled samples from r3 plus 32 bytes from r2 + 1
; into a 64-byte-aligned stack buffer, then invokes PROC32_8x8 four times per .loop pass,
; for four passes.
; Stack buffer layout (relative to the aligned rsp):
;   bytes 0..12  = samples selected from [r3] and [r3 + 15] through c_mode32_14_0
;   bytes 13..44 = 32 bytes copied from [r2 + 1]
;   byte 63      = pass counter (starts at 4)
;   [rsp + 64]   = caller's rsp, restored before RET
; PROC32_8x8 and c_mode32_14_0 are defined outside this chunk.
7768 | INIT_XMM sse4 | |
7769 | cglobal intra_pred_ang32_14, 4,7,8 | |
7770 | ; NOTE: align the stack to 64 bytes so that all local data shares one cache line | |
7771 | mov r6, rsp | |
7772 | sub rsp, 64+gprsize | |
7773 | and rsp, ~63 | |
7774 | mov [rsp+64], r6 | |
7775 | |
7776 | ; collect reference pixel | |
7777 | movu m0, [r3] | |
7778 | movu m1, [r3 + 15] | |
7779 | pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15] | |
7780 | pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] | |
7781 | pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] | |
7782 | palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] | |
7783 | mova [rsp], m0 | |
7784 | movu m0, [r2 + 1] | |
7785 | movu m1, [r2 + 1 + 16] | |
7786 | movu [rsp + 13], m0 | |
7787 | movu [rsp + 13 + 16], m1 | |
7788 | mov [rsp + 63], byte 4 | |
7789 | |
; NOTE(review): m5 and m6 loaded here are overwritten at the top of .loop ("mova m5, m4" /
; "mova m6, m4") before any visible read, so these two loads never reach PROC32_8x8.
; Presumably the macro fetches the constants itself — confirm against its definition.
7790 | ; filter | |
7791 | lea r2, [rsp + 13] ; r2 -> [0] | |
7792 | lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
7793 | lea r4, [ang_table] ; r4 -> ang_table | |
7794 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
7795 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
7796 | mova m5, [pw_1024] ; m5 -> 1024 | |
7797 | mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
7798 | |
; Each group loads 16 buffer bytes into m7 and fills m0..m6 with copies of it shifted right
; by 0..3 bytes. The 2-operand palignr keeps stale destination bytes in the top <imm>
; positions, so only the low (16 - imm) bytes of each register are meaningful.
; NOTE(review): the eight trailing macro arguments look like per-row ang_table indices —
; confirm against the macro definition.
7799 | .loop: | |
7800 | ; Row[0 - 7] | |
7801 | movu m7, [r2 - 4] | |
7802 | palignr m0, m7, 3 | |
7803 | mova m1, m0 | |
7804 | palignr m2, m7, 2 | |
7805 | mova m3, m2 | |
7806 | palignr m4, m7, 1 | |
7807 | mova m5, m4 | |
7808 | mova m6, m4 | |
7809 | PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24 | |
7810 | |
7811 | ; Row[8 - 15] | |
7812 | movu m7, [r2 - 7] | |
7813 | palignr m0, m7, 3 | |
7814 | palignr m1, m7, 2 | |
7815 | mova m2, m1 | |
7816 | mova m3, m1 | |
7817 | palignr m4, m7, 1 | |
7818 | mova m5, m4 | |
7819 | mova m6, m7 | |
7820 | PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16 | |
7821 | |
7822 | ; Row[16 - 23] | |
7823 | movu m7, [r2 - 10] | |
7824 | palignr m0, m7, 3 | |
7825 | palignr m1, m7, 2 | |
7826 | mova m2, m1 | |
7827 | palignr m3, m7, 1 | |
7828 | mova m4, m3 | |
7829 | mova m5, m3 | |
7830 | mova m6, m7 | |
7831 | PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8 | |
7832 | |
7833 | ; Row[24 - 31] | |
7834 | movu m7, [r2 - 13] | |
7835 | palignr m0, m7, 2 | |
7836 | mova m1, m0 | |
7837 | mova m2, m0 | |
7838 | palignr m3, m7, 1 | |
7839 | mova m4, m3 | |
7840 | mova m5, m7 | |
7841 | mova m6, m7 | |
7842 | PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0 | |
7843 | |
; Next pass: r0 and r6 both advance by 8 * stride (r6 stays 4 * stride ahead of r0), the
; buffer pointer advances by 8 bytes, and the in-memory pass counter is decremented.
7844 | lea r0, [r6 + r1 * 4] | |
7845 | lea r6, [r6 + r1 * 8] | |
7846 | add r2, 8 | |
7847 | dec byte [rsp + 63] | |
7848 | jnz .loop | |
7849 | mov rsp, [rsp+64] | |
7850 | RET | |
7851 | ||
7852 | ;------------------------------------------------------------------------------------------------------------------- | |
7853 | ; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) | |
7854 | ;------------------------------------------------------------------------------------------------------------------- | |
; intra_pred_ang32_15 (SSE4): gathers shuffled samples from r3 plus 32 bytes from r2 + 1
; into a 64-byte-aligned stack buffer, then invokes PROC32_8x8 four times per .loop pass,
; for four passes.
; Stack buffer layout (relative to the aligned rsp), after the overlapping stores:
;   bytes 0..7   = low 8 bytes of the shuffle of [r3 + 15]
;   bytes 8..16  = low 9 bytes of the shuffle of [r3]
;   bytes 17..48 = 32 bytes copied from [r2 + 1]
;   byte 63      = pass counter (starts at 4)
;   [rsp + 64]   = caller's rsp, restored before RET
; PROC32_8x8 and c_mode32_15_0 are defined outside this chunk.
7855 | INIT_XMM sse4 | |
7856 | cglobal intra_pred_ang32_15, 4,7,8 | |
7857 | ; NOTE: align the stack to 64 bytes so that all local data shares one cache line | |
7858 | mov r6, rsp | |
7859 | sub rsp, 64+gprsize | |
7860 | and rsp, ~63 | |
7861 | mov [rsp+64], r6 | |
7862 | |
7863 | ; collect reference pixel | |
7864 | movu m0, [r3] | |
7865 | movu m1, [r3 + 15] | |
7866 | pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] | |
7867 | pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30] | |
7868 | mova [rsp], m1 | |
7869 | movu [rsp + 8], m0 | |
7870 | movu m0, [r2 + 1] | |
7871 | movu m1, [r2 + 1 + 16] | |
7872 | movu [rsp + 17], m0 | |
7873 | movu [rsp + 17 + 16], m1 | |
7874 | mov [rsp + 63], byte 4 | |
7875 | |
; NOTE(review): m5 and m6 loaded here are overwritten at the top of .loop ("palignr m5, m7, 1"
; replaces the low 15 bytes of m5, "mova m6, m5" replaces m6) before any visible read.
; Presumably PROC32_8x8 fetches the constants itself — confirm against its definition.
7876 | ; filter | |
7877 | lea r2, [rsp + 17] ; r2 -> [0] | |
7878 | lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
7879 | lea r4, [ang_table] ; r4 -> ang_table | |
7880 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
7881 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
7882 | mova m5, [pw_1024] ; m5 -> 1024 | |
7883 | mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
7884 | |
; Each group loads 16 buffer bytes into m7 and fills m0..m6 with copies of it shifted right
; by 0..4 bytes. The 2-operand palignr keeps stale destination bytes in the top <imm>
; positions, so only the low (16 - imm) bytes of each register are meaningful.
; NOTE(review): the eight trailing macro arguments look like per-row ang_table indices —
; confirm against the macro definition.
7885 | .loop: | |
7886 | ; Row[0 - 7] | |
7887 | movu m7, [r2 - 5] | |
7888 | palignr m0, m7, 4 | |
7889 | palignr m1, m7, 3 | |
7890 | mova m2, m1 | |
7891 | palignr m3, m7, 2 | |
7892 | mova m4, m3 | |
7893 | palignr m5, m7, 1 | |
7894 | mova m6, m5 | |
7895 | PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24 | |
7896 | |
7897 | ; Row[8 - 15] | |
7898 | movu m7, [r2 - 9] | |
7899 | palignr m0, m7, 4 | |
7900 | palignr m1, m7, 3 | |
7901 | mova m2, m1 | |
7902 | palignr m3, m7, 2 | |
7903 | mova m4, m3 | |
7904 | palignr m5, m7, 1 | |
7905 | mova m6, m5 | |
7906 | PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16 | |
7907 | |
7908 | ; Row[16 - 23] | |
7909 | movu m7, [r2 - 13] | |
7910 | palignr m0, m7, 3 | |
7911 | mova m1, m0 | |
7912 | palignr m2, m7, 2 | |
7913 | mova m3, m2 | |
7914 | palignr m4, m7, 1 | |
7915 | mova m5, m4 | |
7916 | mova m6, m7 | |
7917 | PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8 | |
7918 | |
7919 | ; Row[24 - 31] | |
7920 | movu m7, [r2 - 17] | |
7921 | palignr m0, m7, 3 | |
7922 | mova m1, m0 | |
7923 | palignr m2, m7, 2 | |
7924 | mova m3, m2 | |
7925 | palignr m4, m7, 1 | |
7926 | mova m5, m4 | |
7927 | mova m6, m7 | |
7928 | PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0 | |
7929 | |
; Next pass: r0 and r6 both advance by 8 * stride (r6 stays 4 * stride ahead of r0), the
; buffer pointer advances by 8 bytes, and the in-memory pass counter is decremented.
7930 | lea r0, [r6 + r1 * 4] | |
7931 | lea r6, [r6 + r1 * 8] | |
7932 | add r2, 8 | |
7933 | dec byte [rsp + 63] | |
7934 | jnz .loop | |
7935 | mov rsp, [rsp+64] | |
7936 | RET | |
7937 | ||
7938 | ;------------------------------------------------------------------------------------------------------------------- | |
7939 | ; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter) | |
7940 | ;------------------------------------------------------------------------------------------------------------------- | |
; intra_pred_ang32_16 (SSE4): gathers shuffled samples from r3 plus 32 bytes from r2 + 1
; into a 64-byte-aligned stack buffer, then invokes PROC32_8x8 four times per .loop pass,
; for four passes.
; Stack buffer layout (relative to the aligned rsp), after the overlapping stores:
;   bytes 0..9   = low 10 bytes of the shuffle of [r3 + 15]
;   bytes 10..20 = low 11 bytes of the shuffle of [r3]
;   bytes 21..52 = 32 bytes copied from [r2 + 1]
;   byte 63      = pass counter (starts at 4)
;   [rsp + 64]   = caller's rsp, restored before RET
; PROC32_8x8 and c_mode32_16_0 are defined outside this chunk.
7941 | INIT_XMM sse4 | |
7942 | cglobal intra_pred_ang32_16, 4,7,8 | |
7943 | ; NOTE: align the stack to 64 bytes so that all local data shares one cache line | |
7944 | mov r6, rsp | |
7945 | sub rsp, 64+gprsize | |
7946 | and rsp, ~63 | |
7947 | mov [rsp+64], r6 | |
7948 | |
7949 | ; collect reference pixel | |
7950 | movu m0, [r3] | |
7951 | movu m1, [r3 + 15] | |
7952 | pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] | |
7953 | pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] | |
7954 | mova [rsp], m1 | |
7955 | movu [rsp + 10], m0 | |
7956 | movu m0, [r2 + 1] | |
7957 | movu m1, [r2 + 1 + 16] | |
7958 | movu [rsp + 21], m0 | |
7959 | movu [rsp + 21 + 16], m1 | |
7960 | mov [rsp + 63], byte 4 | |
7961 | |
; NOTE(review): m5 and m6 loaded here are overwritten at the top of .loop ("mova m5, m4"
; replaces m5, "palignr m6, m7, 1" replaces the low 15 bytes of m6) before any visible read.
; Presumably PROC32_8x8 fetches the constants itself — confirm against its definition.
7962 | ; filter | |
7963 | lea r2, [rsp + 21] ; r2 -> [0] | |
7964 | lea r3, [c_shuf8_0] ; r3 -> shuffle8 | |
7965 | lea r4, [ang_table] ; r4 -> ang_table | |
7966 | lea r5, [r1 * 3] ; r5 -> 3 * stride | |
7967 | lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride | |
7968 | mova m5, [pw_1024] ; m5 -> 1024 | |
7969 | mova m6, [c_deinterval8] ; m6 -> c_deinterval8 | |
7970 | |
; Each group loads 16 buffer bytes into m7 and fills m0..m6 with copies of it shifted right
; by 0..5 bytes. The 2-operand palignr keeps stale destination bytes in the top <imm>
; positions, so only the low (16 - imm) bytes of each register are meaningful.
; NOTE(review): the eight trailing macro arguments look like per-row ang_table indices —
; confirm against the macro definition.
7971 | .loop: | |
7972 | ; Row[0 - 7] | |
7973 | movu m7, [r2 - 6] | |
7974 | palignr m0, m7, 5 | |
7975 | palignr m1, m7, 4 | |
7976 | mova m2, m1 | |
7977 | palignr m3, m7, 3 | |
7978 | palignr m4, m7, 2 | |
7979 | mova m5, m4 | |
7980 | palignr m6, m7, 1 | |
7981 | PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24 | |
7982 | |
7983 | ; Row[8 - 15] | |
7984 | movu m7, [r2 - 11] | |
7985 | palignr m0, m7, 5 | |
7986 | palignr m1, m7, 4 | |
7987 | palignr m2, m7, 3 | |
7988 | mova m3, m2 | |
7989 | palignr m4, m7, 2 | |
7990 | palignr m5, m7, 1 | |
7991 | mova m6, m5 | |
7992 | PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16 | |
7993 | |
7994 | ; Row[16 - 23] | |
7995 | movu m7, [r2 - 16] | |
7996 | palignr m0, m7, 4 | |
7997 | mova m1, m0 | |
7998 | palignr m2, m7, 3 | |
7999 | palignr m3, m7, 2 | |
8000 | mova m4, m3 | |
8001 | palignr m5, m7, 1 | |
8002 | mova m6, m7 | |
8003 | PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8 | |
8004 | |
8005 | ; Row[24 - 31] | |
8006 | movu m7, [r2 - 21] | |
8007 | palignr m0, m7, 4 | |
8008 | palignr m1, m7, 3 | |
8009 | mova m2, m1 | |
8010 | palignr m3, m7, 2 | |
8011 | palignr m4, m7, 1 | |
8012 | mova m5, m4 | |
8013 | mova m6, m7 | |
8014 | PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0 | |
8015 | |
; Next pass: r0 and r6 both advance by 8 * stride (r6 stays 4 * stride ahead of r0), the
; buffer pointer advances by 8 bytes, and the in-memory pass counter is decremented.
8016 | lea r0, [r6 + r1 * 4] | |
8017 | lea r6, [r6 + r1 * 8] | |
8018 | add r2, 8 | |
8019 | dec byte [rsp + 63] | |
8020 | jnz .loop | |
8021 | mov rsp, [rsp+64] | |
8022 | RET | |
8023 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 17 (SSE4).
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (only four arguments are loaded)
; Stack: hand-aligned 64-byte scratch area
;          [rsp +  0 .. rsp + 57]  reference line: refAbove bytes shuffled through c_mode32_17_0
;                                  (table defined elsewhere), then refLeft[1..32] at offset 26
;          [rsp + 63]              pass counter (4)
;          [rsp + 64]              caller's rsp, restored before RET
; Every pass of .loop invokes PROC32_8x8 four times (macro defined elsewhere in this file), then
; sets r0 = r6 + 4 * stride, r6 += 8 * stride and moves the reference pointer r2 forward by 8 bytes.
; Note: the two-operand "palignr mN, m7, k" puts m7 >> k bytes in the low part of mN; the top
; k bytes come from whatever mN held before.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_17, 4,7,8
    ; Align the scratch area to 64 bytes so the local data sits in a single cache line.
    mov         r6, rsp                         ; keep the caller's stack pointer
    sub         rsp, 64 + gprsize
    and         rsp, ~63
    mov         [rsp + 64], r6                  ; saved just above the 64-byte scratch area

    ; Build the reference line on the stack.
    movu        m0, [r3]                        ; refAbove[0..15]
    movu        m1, [r3 + 16]                   ; refAbove[16..31]
    pshufb      m0, [c_mode32_17_0]
    pshufb      m1, [c_mode32_17_0]
    mova        [rsp], m1
    movu        [rsp + 13], m0                  ; overlaps the tail of the previous store
    movu        m0, [r2 + 1]                    ; refLeft[1..16]
    movu        m1, [r2 + 1 + 16]               ; refLeft[17..32]
    movu        [rsp + 26], m0
    movu        [rsp + 26 + 16], m1
    mov         byte [rsp + 63], 4              ; four passes

    ; Set up the pointers used by PROC32_8x8.
    lea         r2, [rsp + 25]                  ; r2 -> byte just before the refLeft[1..] copy
    lea         r3, [c_shuf8_0]                 ; r3 -> shuffle table
    lea         r4, [ang_table]                 ; r4 -> ang_table
    lea         r5, [r1 * 3]                    ; r5 = 3 * stride
    lea         r6, [r0 + r1 * 4]               ; r6 = dst + 4 * stride
    ; NOTE(review): m5 and m6 are clobbered by the group set-up inside .loop before each
    ; PROC32_8x8, so these two preloads look vestigial -- confirm against the macro.
    mova        m5, [pw_1024]
    mova        m6, [c_deinterval8]

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 1, 6, 12, 18, 24, 30, 4, 10, 16

    ; Row[8 - 15]
    movu        m7, [r2 - 12]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  1, 1, 22, 28, 2, 8, 14, 20, 26, 0

    ; Row[16 - 23]
    movu        m7, [r2 - 19]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  2, 1, 6, 12, 18, 24, 30, 4, 10, 16

    ; Row[24 - 31]
    movu        m7, [r2 - 25]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    PROC32_8x8  3, 1, 22, 28, 2, 8, 14, 20, 26, 0

    ; Next pass: destination pointers are re-derived from r6, reference moves on by 8 bytes.
    lea         r0, [r6 + r1 * 4]
    lea         r6, [r6 + r1 * 8]
    add         r2, 8
    dec         byte [rsp + 63]
    jnz         .loop
    mov         rsp, [rsp + 64]                 ; undo the manual stack alignment

    RET
8110 | ||
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 18 (SSE4).
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove
; Regs:  m0 = refAbove[0..15], m1 = refAbove[16..31],
;        m2 = refLeft[1..16] and m3 = refLeft[17..32], both shuffled through c_mode32_18_0
;        (table defined elsewhere; per the original annotations it reverses the byte order),
;        m4 = scratch, r2/r3/r4 = 2x/3x/4x stride once the reference has been loaded.
; Output row y is the 32-byte window that starts at byte (32 - y) of the 64-byte sequence
; m3 | m2 | m0 | m1 (lowest address first), i.e. each row is the previous one shifted by one byte.
;-------------------------------------------------------------------------------------------------------------------

; Store one 32-byte row at [%1]:
;   bytes  0..15 = low 16 bytes of (%2:%3) >> %5 bytes
;   bytes 16..31 = low 16 bytes of (%4:%2) >> %5 bytes
; Clobbers m4.
%macro ANG32_18_ROW 5 ; dst address, middle reg, low reg, high reg, byte shift
    palignr     m4, %2, %3, %5
    movu        [%1], m4
    palignr     m4, %4, %2, %5
    movu        [%1 + 16], m4
%endmacro

INIT_XMM sse4
cglobal intra_pred_ang32_18, 4,5,5
    movu        m0, [r3]                        ; [15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0]
    movu        m1, [r3 + 16]                   ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16]
    movu        m2, [r2 + 1]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
    movu        m3, [r2 + 17]                   ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17]

    lea         r2, [r1 * 2]                    ; r2 = 2 * stride
    lea         r3, [r1 * 3]                    ; r3 = 3 * stride
    lea         r4, [r1 * 4]                    ; r4 = 4 * stride

    ; row 0 is refAbove[0..31] unchanged
    movu        [r0], m0
    movu        [r0 + 16], m1

    pshufb      m2, [c_mode32_18_0]             ; [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
    pshufb      m3, [c_mode32_18_0]             ; [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]

    ; rows 1 - 3
    ANG32_18_ROW r0 + r1, m0, m2, m1, 15
    ANG32_18_ROW r0 + r2, m0, m2, m1, 14
    ANG32_18_ROW r0 + r3, m0, m2, m1, 13

    lea         r0, [r0 + r4]

    ; rows 4 - 7
    ANG32_18_ROW r0,      m0, m2, m1, 12
    ANG32_18_ROW r0 + r1, m0, m2, m1, 11
    ANG32_18_ROW r0 + r2, m0, m2, m1, 10
    ANG32_18_ROW r0 + r3, m0, m2, m1, 9

    lea         r0, [r0 + r4]

    ; rows 8 - 11
    ANG32_18_ROW r0,      m0, m2, m1, 8
    ANG32_18_ROW r0 + r1, m0, m2, m1, 7
    ANG32_18_ROW r0 + r2, m0, m2, m1, 6
    ANG32_18_ROW r0 + r3, m0, m2, m1, 5

    lea         r0, [r0 + r4]

    ; rows 12 - 15
    ANG32_18_ROW r0,      m0, m2, m1, 4
    ANG32_18_ROW r0 + r1, m0, m2, m1, 3
    ANG32_18_ROW r0 + r2, m0, m2, m1, 2
    ANG32_18_ROW r0 + r3, m0, m2, m1, 1

    lea         r0, [r0 + r4]

    ; row 16 needs no shift: shuffled left half followed by refAbove[0..15]
    movu        [r0], m2
    movu        [r0 + 16], m0

    ; rows 17 - 19
    ANG32_18_ROW r0 + r1, m2, m3, m0, 15
    ANG32_18_ROW r0 + r2, m2, m3, m0, 14
    ANG32_18_ROW r0 + r3, m2, m3, m0, 13

    lea         r0, [r0 + r4]

    ; rows 20 - 23
    ANG32_18_ROW r0,      m2, m3, m0, 12
    ANG32_18_ROW r0 + r1, m2, m3, m0, 11
    ANG32_18_ROW r0 + r2, m2, m3, m0, 10
    ANG32_18_ROW r0 + r3, m2, m3, m0, 9

    lea         r0, [r0 + r4]

    ; rows 24 - 27
    ANG32_18_ROW r0,      m2, m3, m0, 8
    ANG32_18_ROW r0 + r1, m2, m3, m0, 7
    ANG32_18_ROW r0 + r2, m2, m3, m0, 6
    ANG32_18_ROW r0 + r3, m2, m3, m0, 5

    lea         r0, [r0 + r4]

    ; rows 28 - 31
    ANG32_18_ROW r0,      m2, m3, m0, 4
    ANG32_18_ROW r0 + r1, m2, m3, m0, 3
    ANG32_18_ROW r0 + r2, m2, m3, m0, 2
    ANG32_18_ROW r0 + r3, m2, m3, m0, 1
    RET
8275 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 19 (SSE4).
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (swapped on entry, so the code
;        below reads refAbove through r2 and refLeft through r3)
; Stack: hand-aligned 64-byte scratch area
;          [rsp +  0 .. rsp + 57]  reference line: refLeft bytes shuffled through c_mode32_17_0
;                                  (table defined elsewhere), then refAbove[1..32] at offset 26
;          [rsp + 63]              pass counter (4)
;          [rsp + 64]              caller's rsp, restored before RET
; Every pass of .loop invokes PROC32_8x8 four times (macro defined elsewhere in this file) with
; r0 bumped by 4 * stride between invocations, then restarts at r0 = r6 = previous column base + 8
; and moves the reference pointer r2 forward by 8 bytes.
; Note: the two-operand "palignr mN, m7, k" puts m7 >> k bytes in the low part of mN; the top
; k bytes come from whatever mN held before.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_19, 4,7,8
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    ; Align the scratch area to 64 bytes so the local data sits in a single cache line.
    mov         r6, rsp                         ; keep the caller's stack pointer
    sub         rsp, 64 + gprsize
    and         rsp, ~63
    mov         [rsp + 64], r6                  ; saved just above the 64-byte scratch area

    ; Build the reference line on the stack.
    movu        m0, [r3]                        ; refLeft[0..15]
    movu        m1, [r3 + 16]                   ; refLeft[16..31]
    pshufb      m0, [c_mode32_17_0]
    pshufb      m1, [c_mode32_17_0]
    mova        [rsp], m1
    movu        [rsp + 13], m0                  ; overlaps the tail of the previous store
    movu        m0, [r2 + 1]                    ; refAbove[1..16]
    movu        m1, [r2 + 1 + 16]               ; refAbove[17..32]
    movu        [rsp + 26], m0
    movu        [rsp + 26 + 16], m1
    mov         byte [rsp + 63], 4              ; four passes

    ; Set up the pointers used by PROC32_8x8.
    lea         r2, [rsp + 25]                  ; r2 -> byte just before the refAbove[1..] copy
    lea         r3, [c_shuf8_0]                 ; r3 -> shuffle table
    lea         r4, [ang_table]                 ; r4 -> ang_table
    lea         r5, [r1 * 3]                    ; r5 = 3 * stride
    mov         r6, r0                          ; r6 = base of the current 8-byte column strip
    ; NOTE(review): m5 and m6 are clobbered by the group set-up inside .loop before each
    ; PROC32_8x8, so these two preloads look vestigial -- confirm against the macro.
    mova        m5, [pw_1024]
    mova        m6, [c_deinterval8]

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 6, 12, 18, 24, 30, 4, 10, 16

    ; Row[8 - 15]
    movu        m7, [r2 - 12]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 22, 28, 2, 8, 14, 20, 26, 0

    ; Row[16 - 23]
    movu        m7, [r2 - 19]
    palignr     m0, m7, 6
    palignr     m1, m7, 5
    palignr     m2, m7, 4
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 6, 12, 18, 24, 30, 4, 10, 16

    ; Row[24 - 31]
    movu        m7, [r2 - 25]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 22, 28, 2, 8, 14, 20, 26, 0

    ; Next pass: 8 bytes to the right in dst, 8 bytes further along the reference line.
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz         .loop
    mov         rsp, [rsp + 64]                 ; undo the manual stack alignment
    RET
8365 | ||
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 20 (SSE4).
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (swapped on entry, so the code
;        below reads refAbove through r2 and refLeft through r3)
; Stack: hand-aligned 64-byte scratch area
;          [rsp +  0 .. rsp + 52]  reference line: refLeft bytes shuffled through c_mode32_16_0
;                                  (table defined elsewhere), then refAbove[1..32] at offset 21
;          [rsp + 63]              pass counter (4)
;          [rsp + 64]              caller's rsp, restored before RET
; Every pass of .loop invokes PROC32_8x8 four times (macro defined elsewhere in this file) with
; r0 bumped by 4 * stride between invocations, then restarts at r0 = r6 = previous column base + 8
; and moves the reference pointer r2 forward by 8 bytes.
; Note: the two-operand "palignr mN, m7, k" puts m7 >> k bytes in the low part of mN; the top
; k bytes come from whatever mN held before.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_20, 4,7,8
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    ; Align the scratch area to 64 bytes so the local data sits in a single cache line.
    mov         r6, rsp                         ; keep the caller's stack pointer
    sub         rsp, 64 + gprsize
    and         rsp, ~63
    mov         [rsp + 64], r6                  ; saved just above the 64-byte scratch area

    ; Build the reference line on the stack (index lists are the original annotations).
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_16_0]             ; [x x x x x  0  2  3  5  6  8  9 11 12 14 15]
    pshufb      m1, [c_mode32_16_0]             ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30]
    mova        [rsp], m1
    movu        [rsp + 10], m0                  ; overlaps the tail of the previous store
    movu        m0, [r2 + 1]                    ; refAbove[1..16]
    movu        m1, [r2 + 1 + 16]               ; refAbove[17..32]
    movu        [rsp + 21], m0
    movu        [rsp + 21 + 16], m1
    mov         byte [rsp + 63], 4              ; four passes

    ; Set up the pointers used by PROC32_8x8.
    lea         r2, [rsp + 21]                  ; r2 -> start of the refAbove[1..] copy
    lea         r3, [c_shuf8_0]                 ; r3 -> shuffle table
    lea         r4, [ang_table]                 ; r4 -> ang_table
    lea         r5, [r1 * 3]                    ; r5 = 3 * stride
    mov         r6, r0                          ; r6 = base of the current 8-byte column strip
    ; NOTE(review): m5 and m6 are clobbered by the group set-up inside .loop before each
    ; PROC32_8x8, so these two preloads look vestigial -- confirm against the macro.
    mova        m5, [pw_1024]
    mova        m6, [c_deinterval8]

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 6]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    mova        m2, m1
    palignr     m3, m7, 3
    palignr     m4, m7, 2
    mova        m5, m4
    palignr     m6, m7, 1
    PROC32_8x8  0, 0, 11, 22, 1, 12, 23, 2, 13, 24

    ; Row[8 - 15]
    movu        m7, [r2 - 11]
    palignr     m0, m7, 5
    palignr     m1, m7, 4
    palignr     m2, m7, 3
    mova        m3, m2
    palignr     m4, m7, 2
    palignr     m5, m7, 1
    mova        m6, m5
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 3, 14, 25, 4, 15, 26, 5, 16

    ; Row[16 - 23]
    movu        m7, [r2 - 16]
    palignr     m0, m7, 4
    mova        m1, m0
    palignr     m2, m7, 3
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 27, 6, 17, 28, 7, 18, 29, 8

    ; Row[24 - 31]
    movu        m7, [r2 - 21]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 19, 30, 9, 20, 31, 10, 21, 0

    ; Next pass: 8 bytes to the right in dst, 8 bytes further along the reference line.
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz         .loop
    mov         rsp, [rsp + 64]                 ; undo the manual stack alignment
    RET
8455 | ||
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 21 (SSE4).
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (swapped on entry, so the code
;        below reads refAbove through r2 and refLeft through r3)
; Stack: hand-aligned 64-byte scratch area
;          [rsp +  0 .. rsp + 48]  reference line: refLeft bytes shuffled through c_mode32_15_0
;                                  (table defined elsewhere), then refAbove[1..32] at offset 17
;          [rsp + 63]              pass counter (4)
;          [rsp + 64]              caller's rsp, restored before RET
; Every pass of .loop invokes PROC32_8x8 four times (macro defined elsewhere in this file) with
; r0 bumped by 4 * stride between invocations, then restarts at r0 = r6 = previous column base + 8
; and moves the reference pointer r2 forward by 8 bytes.
; Note: the two-operand "palignr mN, m7, k" puts m7 >> k bytes in the low part of mN; the top
; k bytes come from whatever mN held before.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_21, 4,7,8
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    ; Align the scratch area to 64 bytes so the local data sits in a single cache line.
    mov         r6, rsp                         ; keep the caller's stack pointer
    sub         rsp, 64 + gprsize
    and         rsp, ~63
    mov         [rsp + 64], r6                  ; saved just above the 64-byte scratch area

    ; Build the reference line on the stack (index lists are the original annotations).
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_15_0]             ; [x x x x x x x  0  2  4  6  8  9 11 13 15]
    pshufb      m1, [c_mode32_15_0]             ; [x x x x x x x 15 17 19 21 23 24 26 28 30]
    mova        [rsp], m1
    movu        [rsp + 8], m0                   ; overlaps the tail of the previous store
    movu        m0, [r2 + 1]                    ; refAbove[1..16]
    movu        m1, [r2 + 1 + 16]               ; refAbove[17..32]
    movu        [rsp + 17], m0
    movu        [rsp + 17 + 16], m1
    mov         byte [rsp + 63], 4              ; four passes

    ; Set up the pointers used by PROC32_8x8.
    lea         r2, [rsp + 17]                  ; r2 -> start of the refAbove[1..] copy
    lea         r3, [c_shuf8_0]                 ; r3 -> shuffle table
    lea         r4, [ang_table]                 ; r4 -> ang_table
    lea         r5, [r1 * 3]                    ; r5 = 3 * stride
    mov         r6, r0                          ; r6 = base of the current 8-byte column strip
    ; NOTE(review): m5 and m6 are clobbered by the group set-up inside .loop before each
    ; PROC32_8x8, so these two preloads look vestigial -- confirm against the macro.
    mova        m5, [pw_1024]
    mova        m6, [c_deinterval8]

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 5]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    PROC32_8x8  0, 0, 15, 30, 13, 28, 11, 26, 9, 24

    ; Row[8 - 15]
    movu        m7, [r2 - 9]
    palignr     m0, m7, 4
    palignr     m1, m7, 3
    mova        m2, m1
    palignr     m3, m7, 2
    mova        m4, m3
    palignr     m5, m7, 1
    mova        m6, m5
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 7, 22, 5, 20, 3, 18, 1, 16

    ; Row[16 - 23]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 31, 14, 29, 12, 27, 10, 25, 8

    ; Row[24 - 31]
    movu        m7, [r2 - 17]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 23, 6, 21, 4, 19, 2, 17, 0

    ; Next pass: 8 bytes to the right in dst, 8 bytes further along the reference line.
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz         .loop
    mov         rsp, [rsp + 64]                 ; undo the manual stack alignment
    RET
8545 | ||
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 22 (SSE4).
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (swapped on entry, so the code
;        below reads refAbove through r2 and refLeft through r3)
; Stack: hand-aligned 64-byte scratch area
;          [rsp +  0 .. rsp + 44]  reference line: 16 bytes merged from two c_mode32_14_0
;                                  shuffles of refLeft (table defined elsewhere), then
;                                  refAbove[1..32] at offset 13
;          [rsp + 63]              pass counter (4)
;          [rsp + 64]              caller's rsp, restored before RET
; Every pass of .loop invokes PROC32_8x8 four times (macro defined elsewhere in this file) with
; r0 bumped by 4 * stride between invocations, then restarts at r0 = r6 = previous column base + 8
; and moves the reference pointer r2 forward by 8 bytes.
; Note: the two-operand "palignr mN, m7, k" puts m7 >> k bytes in the low part of mN; the top
; k bytes come from whatever mN held before.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_22, 4,7,8
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    ; Align the scratch area to 64 bytes so the local data sits in a single cache line.
    mov         r6, rsp                         ; keep the caller's stack pointer
    sub         rsp, 64 + gprsize
    and         rsp, ~63
    mov         [rsp + 64], r6                  ; saved just above the 64-byte scratch area

    ; Build the reference line on the stack (index lists are the original annotations).
    movu        m0, [r3]
    movu        m1, [r3 + 15]
    pshufb      m0, [c_mode32_14_0]             ; [x x x x x x x x x  0  2  5  7 10 12 15]
    pshufb      m1, [c_mode32_14_0]             ; [x x x x x x x x x 15 17 20 22 25 27 30]
    pslldq      m1, 10                          ; [17 20 22 25 27 30 x x x x x x x x x x]
    palignr     m0, m1, 10                      ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30]
    mova        [rsp], m0
    movu        m0, [r2 + 1]                    ; refAbove[1..16]
    movu        m1, [r2 + 1 + 16]               ; refAbove[17..32]
    movu        [rsp + 13], m0                  ; overwrites the three 'x' bytes at the top
    movu        [rsp + 13 + 16], m1
    mov         byte [rsp + 63], 4              ; four passes

    ; Set up the pointers used by PROC32_8x8.
    lea         r2, [rsp + 13]                  ; r2 -> start of the refAbove[1..] copy
    lea         r3, [c_shuf8_0]                 ; r3 -> shuffle table
    lea         r4, [ang_table]                 ; r4 -> ang_table
    lea         r5, [r1 * 3]                    ; r5 = 3 * stride
    mov         r6, r0                          ; r6 = base of the current 8-byte column strip
    ; NOTE(review): m5 and m6 are overwritten by the group set-up inside .loop before each
    ; PROC32_8x8, so these two preloads look vestigial -- confirm against the macro.
    mova        m5, [pw_1024]
    mova        m6, [c_deinterval8]

.loop:
    ; Row[0 - 7]
    movu        m7, [r2 - 4]
    palignr     m0, m7, 3
    mova        m1, m0
    palignr     m2, m7, 2
    mova        m3, m2
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m4
    PROC32_8x8  0, 0, 19, 6, 25, 12, 31, 18, 5, 24

    ; Row[8 - 15]
    movu        m7, [r2 - 7]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    mova        m3, m1
    palignr     m4, m7, 1
    mova        m5, m4
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 11, 30, 17, 4, 23, 10, 29, 16

    ; Row[16 - 23]
    movu        m7, [r2 - 10]
    palignr     m0, m7, 3
    palignr     m1, m7, 2
    mova        m2, m1
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m3
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 3, 22, 9, 28, 15, 2, 21, 8

    ; Row[24 - 31]
    movu        m7, [r2 - 13]
    palignr     m0, m7, 2
    mova        m1, m0
    mova        m2, m0
    palignr     m3, m7, 1
    mova        m4, m3
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 27, 14, 1, 20, 7, 26, 13, 0

    ; Next pass: 8 bytes to the right in dst, 8 bytes further along the reference line.
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz         .loop
    mov         rsp, [rsp + 64]                 ; undo the manual stack alignment
    RET
8637 | ||
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 23 (SSE4).
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (swapped on entry)
; Stack: one mmsize-sized slot reserved through cglobal and exposed as "above"
;        (presumably consumed by the MODE_13_23* macros, which are defined elsewhere -- confirm).
; The block is produced as four 8-byte-wide strips: the first by MODE_13_23_ROW0, the other
; three by MODE_13_23 inside .loop. dst moves right by 8 bytes after every strip; the reference
; pointer r2 moves by 7 bytes after the first strip and by 8 bytes after each later one.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    lea         r4, [ang_table + 16 * 16]       ; r4 = ang_table + 256 bytes
    lea         r5, [r1 * 3]                    ; r5 = 3 * stride
    mov         r6, r0                          ; r6 = base of the current strip
    mova        m7, [pw_1024]

    ; first strip
    MODE_13_23_ROW0 0
    add         r6, 8
    mov         r0, r6
    add         r2, 7
    mov         r3, 3                           ; r3 is reused as the counter for the remaining strips

.loop:
    MODE_13_23 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r3
    jnz         .loop
    RET
8663 | ||
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 24 (SSE4).
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (swapped on entry)
; Stack: one mmsize-sized slot reserved through cglobal and exposed as "above"
;        (presumably consumed by the MODE_12_24* macros, which are defined elsewhere -- confirm).
; The block is produced as four 8-byte-wide strips: the first by MODE_12_24_ROW0, the other
; three by MODE_12_24 inside .loop. dst moves right by 8 bytes after every strip; the reference
; pointer r2 moves by 7 bytes after the first strip and by 8 bytes after each later one.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize)
%define above [rsp + 0 * mmsize]
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    lea         r4, [ang_table + 16 * 16]       ; r4 = ang_table + 256 bytes
    lea         r5, [r1 * 3]                    ; r5 = 3 * stride
    mov         r6, r0                          ; r6 = base of the current strip
    mova        m7, [pw_1024]

    ; first strip
    MODE_12_24_ROW0 0
    add         r6, 8
    mov         r0, r6
    add         r2, 7
    mov         r3, 3                           ; r3 is reused as the counter for the remaining strips

.loop:
    MODE_12_24 0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r3
    jnz         .loop
    RET
8689 | ||
;-------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 25 (SSE4).
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove (swapped on entry, so the code
;        below reads refAbove through r2 and refLeft through r3)
; Stack: hand-aligned 64-byte scratch area
;          [rsp +  0]              refLeft[16] (stored as a 16-byte broadcast, bytes 1.. are
;                                  then overwritten)
;          [rsp +  1 .. rsp + 48]  refAbove[0..47]
;          [rsp + 63]              pass counter (4)
;          [rsp + 64]              caller's rsp, restored before RET
; Every pass of .loop invokes PROC32_8x8 four times (macro defined elsewhere in this file) with
; r0 bumped by 4 * stride between invocations, then restarts at r0 = r6 = previous column base + 8
; and moves the reference pointer r2 forward by 8 bytes. The first two groups read the reference
; at [r2], the last two at [r2 - 1]; all seven working registers receive the same 16 bytes.
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_25, 4,7,8
    xchg        r2, r3                          ; r2 = refAbove, r3 = refLeft
    ; Align the scratch area to 64 bytes so the local data sits in a single cache line.
    mov         r6, rsp                         ; keep the caller's stack pointer
    sub         rsp, 64 + gprsize
    and         rsp, ~63
    mov         [rsp + 64], r6                  ; saved just above the 64-byte scratch area

    ; Build the reference line on the stack.
    movu        m0, [r3 + 16]
    pxor        m1, m1
    pshufb      m0, m1                          ; broadcast byte 0, i.e. refLeft[16], to all lanes
    mova        [rsp], m0
    movu        m0, [r2]                        ; refAbove[0..15]
    movu        m1, [r2 + 16]                   ; refAbove[16..31]
    movu        m2, [r2 + 32]                   ; refAbove[32..47]
    movu        [rsp + 1], m0
    movu        [rsp + 1 + 16], m1
    movu        [rsp + 1 + 32], m2
    mov         byte [rsp + 63], 4              ; four passes

    ; Set up the pointers used by PROC32_8x8.
    lea         r2, [rsp + 1]                   ; r2 -> copy of refAbove[0]
    lea         r3, [c_shuf8_0]                 ; r3 -> shuffle table
    lea         r4, [ang_table]                 ; r4 -> ang_table
    lea         r5, [r1 * 3]                    ; r5 = 3 * stride
    mov         r6, r0                          ; r6 = base of the current 8-byte column strip
    ; NOTE(review): m5 and m6 are overwritten by the group set-up inside .loop before each
    ; PROC32_8x8, so these two preloads look vestigial -- confirm against the macro.
    mova        m5, [pw_1024]
    mova        m6, [c_deinterval8]

.loop:
    ; Row[0 - 7]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    PROC32_8x8  0, 0, 30, 28, 26, 24, 22, 20, 18, 16

    ; Row[8 - 15]
    movu        m7, [r2]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  1, 0, 14, 12, 10, 8, 6, 4, 2, 0

    ; Row[16 - 23]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  2, 0, 30, 28, 26, 24, 22, 20, 18, 16

    ; Row[24 - 31]
    movu        m7, [r2 - 1]
    mova        m0, m7
    mova        m1, m7
    mova        m2, m7
    mova        m3, m7
    mova        m4, m7
    mova        m5, m7
    mova        m6, m7
    lea         r0, [r0 + r1 * 4]
    PROC32_8x8  3, 0, 14, 12, 10, 8, 6, 4, 2, 0

    ; Next pass: 8 bytes to the right in dst, 8 bytes further along the reference line.
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         byte [rsp + 63]
    jnz         .loop
    mov         rsp, [rsp + 64]                 ; undo the manual stack alignment
    RET
8779 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 intra prediction, mode 26 (SSE4): every output row is a copy of refAbove[1..32].
; In:    r0 = dst, r1 = dstStride, r2 = refLeft, r3 = refAbove, r5 = bFilter
;        (r4 / dirMode is loaded by cglobal but immediately reused for 3 * stride)
; Stack: two mmsize-sized slots reserved through cglobal; "m8" and "m9" are redefined here as
;        those slots and hold the 16 bytes at refLeft[0] and refLeft[1] respectively.
; Regs:  r2d = bFilter after set-up, r4 = 3 * stride, r5 = scratch row pointer,
;        r6d = pass counter (two passes of 16 columns).
; Each pass stores the same 16 reference bytes to all 32 rows, then, when bFilter is non-zero,
; rewrites the first byte of rows 0..15 with
;        sat_u8(above + ((left[y + 1] - left[0]) >> 1))
; where "above" is byte 0 of the 16 bytes just copied.
; NOTE(review): with bFilter != 0 the second pass applies the same rewrite to column 16 and only
; rows 0..15 are ever touched -- presumably callers never set bFilter for 32x32 blocks; confirm.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_26, 6,7,7,0-(2*mmsize)
%define m8 [rsp + 0 * mmsize]
%define m9 [rsp + 1 * mmsize]
    lea         r4, [r1 * 3]                    ; r4 = 3 * stride
    mov         r6d, 2                          ; two 16-column passes (matches the dec r6d below)
    movu        m0, [r2]                        ; refLeft[0..15]
    movu        m1, [r2 + 1]                    ; refLeft[1..16]
    mova        m8, m0
    mova        m9, m1
    mov         r2d, r5d                        ; keep bFilter; r5 becomes the row pointer

.loop:
    movu        m0, [r3 + 1]                    ; 16 reference bytes for this pass

    ; rows 0 - 3 are addressed from r0, which the filter below still needs
    movu        [r0], m0
    movu        [r0 + r1], m0
    movu        [r0 + r1 * 2], m0
    movu        [r0 + r4], m0

    ; rows 4 - 7
    lea         r5, [r0 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0

    ; rows 8 - 31: keep stepping four rows at a time. The previous code reset r5 to
    ; r0 + 4 * stride half-way through and stored rows 4 - 15 twice.
%rep 6
    lea         r5, [r5 + r1 * 4]
    movu        [r5], m0
    movu        [r5 + r1], m0
    movu        [r5 + r1 * 2], m0
    movu        [r5 + r4], m0
%endrep

    ; filter
    test        r2d, r2d
    jz          .quit

    pxor        m4, m4
    pshufb      m0, m4                          ; broadcast byte 0 of the copied reference
    pmovzxbw    m0, m0                          ; ... as eight words
    mova        m1, m0
    movu        m2, m8                          ; 16 bytes from refLeft[0]
    movu        m3, m9                          ; 16 bytes from refLeft[1]

    pshufb      m2, m4                          ; broadcast refLeft[0]
    pmovzxbw    m2, m2
    movhlps     m4, m3                          ; m4 low half = upper 8 bytes of m3
    pmovzxbw    m3, m3                          ; refLeft[1..8]  as words
    pmovzxbw    m4, m4                          ; refLeft[9..16] as words
    psubw       m3, m2
    psubw       m4, m2
    psraw       m3, 1
    psraw       m4, 1
    paddw       m0, m3
    paddw       m1, m4
    packuswb    m0, m1                          ; saturate back to 16 unsigned bytes

    ; scatter the 16 filtered bytes down the first column of rows 0 - 15
    pextrb      [r0], m0, 0
    pextrb      [r0 + r1], m0, 1
    pextrb      [r0 + r1 * 2], m0, 2
    pextrb      [r0 + r4], m0, 3
    lea         r5, [r0 + r1 * 4]
    pextrb      [r5], m0, 4
    pextrb      [r5 + r1], m0, 5
    pextrb      [r5 + r1 * 2], m0, 6
    pextrb      [r5 + r4], m0, 7
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 8
    pextrb      [r5 + r1], m0, 9
    pextrb      [r5 + r1 * 2], m0, 10
    pextrb      [r5 + r4], m0, 11
    lea         r5, [r5 + r1 * 4]
    pextrb      [r5], m0, 12
    pextrb      [r5 + r1], m0, 13
    pextrb      [r5 + r1 * 2], m0, 14
    pextrb      [r5 + r4], m0, 15

.quit:
    lea         r3, [r3 + 16]                   ; next 16 reference bytes
    add         r0, 16                          ; next 16 destination columns
    dec         r6d
    jnz         .loop
    RET
8903 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 27 (SSE4).
; In:    r0 = dst, r1 = dstStride; refAbove is fetched from its argument slot (r3mp) into r2,
;        replacing the refLeft pointer that cglobal loaded there.
; Regs:  r3 = ang_table + 256 bytes, r4 = strip counter, r5 = 3 * stride, r6 = strip base,
;        m7 = pw_1024.
; The block is produced as four 8-byte-wide strips by MODE_9_27 (macro defined elsewhere in this
; file); after each strip dst moves right by 8 bytes and the reference pointer forward by 8 bytes.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_27, 3,7,8
    mov         r2, r3mp                        ; r2 = refAbove
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4                          ; four strips
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    MODE_9_27   0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
8923 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 28 (SSE4).
; In:    r0 = dst, r1 = dstStride; refAbove is fetched from its argument slot (r3mp) into r2,
;        replacing the refLeft pointer that cglobal loaded there.
; Regs:  r3 = ang_table + 256 bytes, r4 = strip counter, r5 = 3 * stride, r6 = strip base,
;        m7 = pw_1024.
; The block is produced as four 8-byte-wide strips by MODE_8_28 (macro defined elsewhere in this
; file); after each strip dst moves right by 8 bytes and the reference pointer forward by 8 bytes.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_28, 3,7,8
    mov         r2, r3mp                        ; r2 = refAbove
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4                          ; four strips
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    MODE_8_28   0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
8943 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 29 (SSE4).
; In:    r0 = dst, r1 = dstStride; refAbove is fetched from its argument slot (r3mp) into r2,
;        replacing the refLeft pointer that cglobal loaded there.
; Regs:  r3 = ang_table + 256 bytes, r4 = strip counter, r5 = 3 * stride, r6 = strip base,
;        m7 = pw_1024.
; The block is produced as four 8-byte-wide strips by MODE_7_29 (macro defined elsewhere in this
; file); after each strip dst moves right by 8 bytes and the reference pointer forward by 8 bytes.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_29, 3,7,8
    mov         r2, r3mp                        ; r2 = refAbove
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4                          ; four strips
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    MODE_7_29   0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
8963 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 30 (SSE4).
; In:    r0 = dst, r1 = dstStride; refAbove is fetched from its argument slot (r3mp) into r2,
;        replacing the refLeft pointer that cglobal loaded there.
; Regs:  r3 = ang_table + 256 bytes, r4 = strip counter, r5 = 3 * stride, r6 = strip base,
;        m7 = pw_1024.
; The block is produced as four 8-byte-wide strips by MODE_6_30 (macro defined elsewhere in this
; file); after each strip dst moves right by 8 bytes and the reference pointer forward by 8 bytes.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_30, 3,7,8
    mov         r2, r3mp                        ; r2 = refAbove
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4                          ; four strips
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    MODE_6_30   0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
8983 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; 32x32 angular intra prediction, mode 31 (SSE4).
; In:    r0 = dst, r1 = dstStride; refAbove is fetched from its argument slot (r3mp) into r2,
;        replacing the refLeft pointer that cglobal loaded there.
; Regs:  r3 = ang_table + 256 bytes, r4 = strip counter, r5 = 3 * stride, r6 = strip base,
;        m7 = pw_1024.
; The block is produced as four 8-byte-wide strips by MODE_5_31 (macro defined elsewhere in this
; file); after each strip dst moves right by 8 bytes and the reference pointer forward by 8 bytes.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_31, 3,7,8
    mov         r2, r3mp                        ; r2 = refAbove
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4                          ; four strips
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]

.loop:
    MODE_5_31   0
    add         r6, 8
    mov         r0, r6
    add         r2, 8
    dec         r4
    jnz         .loop
    RET
9003 | ||
;-----------------------------------------------------------------------------------------------------------------
; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Entry point for angular mode 32 at width 32: expands MODE_4_32 (argument 0) four times,
; advancing both the destination and the reference pointer by 8 bytes after every pass.
;
; Registers prepared for the macro:
;   r0 = destination cursor (reset from r6 after every pass)
;   r1 = dstStride, r5 = 3 * dstStride
;   r2 = refAbove (4th argument); the refLeft value loaded by the prologue is discarded
;   r3 = ang_table + 16 * 16
;   r4 = pass counter (4 down to 0), r6 = start-of-pass destination
;   m7 = [pw_1024]
; How MODE_4_32 consumes these registers is defined outside this block.
;-----------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_32, 3,7,8
    mov         r2, r3mp                    ; keep ahead of the lea: r3mp may name r3 itself (x86inc)
    lea         r3, [ang_table + 16 * 16]
    mova        m7, [pw_1024]
    mov         r6, r0
    lea         r5, [r1 * 3]
    mov         r4d, 4
.loop:
    MODE_4_32 0
    add         r2, 8                       ; next 8 reference bytes
    add         r6, 8                       ; next 8 destination bytes
    mov         r0, r6
    dec         r4
    jnz         .loop
    RET
9023 | ||
;------------------------------------------------------------------------------------------------------------------
; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;
; Entry point for angular mode 33 at width 32: expands MODE_3_33 (argument 0) four times,
; advancing both the destination and the reference pointer by 8 bytes after every pass.
;
; In:    r0 = dst, r1 = dstStride, r3mp = refAbove (the refLeft value in r2 is not used)
; Setup: r2 = refAbove, r3 = ang_table + 16 * 16, r4 = pass counter (4 down to 0),
;        r5 = 3 * dstStride, r6 = start-of-pass dst, m7 = [pw_1024]
; How MODE_3_33 consumes these registers is defined outside this block.
;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
cglobal intra_pred_ang32_33, 3,7,8
    ; Plain load, matching the ang32_29..32 entry points. An xchg here would also write
    ; refLeft back into the argument and, when r3mp is a memory operand, would be an
    ; implicitly locked read-modify-write; the old r2 value is never needed.
    mov         r2, r3mp                    ; must stay ahead of the lea: r3mp may name r3 itself
    lea         r3, [ang_table + 16 * 16]
    mov         r4d, 4
    lea         r5, [r1 * 3]
    mov         r6, r0
    mova        m7, [pw_1024]
.loop:
    MODE_3_33 0
    add         r6, 8                       ; next 8 destination bytes
    mov         r0, r6
    add         r2, 8                       ; next 8 reference bytes
    dec         r4
    jnz         .loop
    RET
9043 | ||
9044 | ;----------------------------------------------------------------------------- | |
9045 | ; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) | |
9046 | ;----------------------------------------------------------------------------- | |
; Writes the 4x4 predictions for modes 2..34 in one pass. Each mode occupies 16 consecutive
; bytes at dest + (mode - 2) * 16, normally emitted as four 4-byte movd stores.
;
; Arguments actually read: r0 = dest, r1 = above0, r2 = left0. r3/r4 (above1, left1) are never
; referenced, and r5 (bLuma) is overwritten with the ang_table base without being tested.
;
; Long-lived registers:
;   m0 = [pw_1024]          (from mode 3 until mode 34 reuses m0 for data)
;   r5 = ang_table          (from mode 3 onwards; rows are addressed as [r5 + n * 16])
;   m7 = [r5 + 20 * 16]     (table row 20, reused by modes 4, 8, 28, 30 and 32)
;
; Recurring step: pmaddubsw (interleaved pixel pairs x table row) -> pmulhrsw by m0 ->
; packuswb -> movd. ang_table and pw_1024 are defined outside this block; presumably the
; pmulhrsw by 1024 is the rounded >> 5 of the weighted sum - confirm against their definitions.
;
; Two-operand palignr/psrldq results are only consumed through their low bytes, so whatever
; the destination register held in its high part beforehand does not reach memory.
9047 | INIT_XMM sse4 | |
9048 | cglobal all_angs_pred_4x4, 6, 6, 8 | |
9049 | |
9050 | ; mode 2 | |
; rows are left0[2..5], left0[3..6], left0[4..7], left0[5..8]
9051 | |
9052 | movh m0, [r2 + 2] | |
9053 | movd [r0], m0 | |
9054 | |
9055 | palignr m1, m0, 1 | |
9056 | movd [r0 + 4], m1 | |
9057 | |
9058 | palignr m1, m0, 2 | |
9059 | movd [r0 + 8], m1 | |
9060 | |
9061 | psrldq m0, 3 | |
9062 | movd [r0 + 12], m0 | |
9063 | |
9064 | ; mode 3 | |
; m1 = byte pairs (left0[i+1], left0[i+2]); m2, m3 and m4 are the same pairs advanced by
; one, two and three pixels. m1/m2/m3 stay live through mode 9.
; m5 and m6 (rows 0 and 1 of this mode) are kept and stored again as rows 1 and 3 of mode 6.
9065 | |
9066 | mova m0, [pw_1024] | |
9067 | |
9068 | movh m1, [r2 + 1] | |
9069 | |
9070 | palignr m2, m1, 1 | |
9071 | punpcklbw m1, m2 | |
9072 | |
9073 | lea r5, [ang_table] | |
9074 | |
9075 | pmaddubsw m5, m1, [r5 + 26 * 16] | |
9076 | pmulhrsw m5, m0 | |
9077 | packuswb m5, m5 | |
9078 | movd [r0 + 16], m5 | |
9079 | |
9080 | palignr m2, m1, 2 | |
9081 | |
9082 | mova m7, [r5 + 20 * 16] | |
9083 | |
9084 | pmaddubsw m6, m2, m7 | |
9085 | pmulhrsw m6, m0 | |
9086 | packuswb m6, m6 | |
9087 | movd [r0 + 20], m6 | |
9088 | |
9089 | palignr m3, m1, 4 | |
9090 | |
9091 | pmaddubsw m4, m3, [r5 + 14 * 16] | |
9092 | pmulhrsw m4, m0 | |
9093 | packuswb m4, m4 | |
9094 | movd [r0 + 24], m4 | |
9095 | |
9096 | palignr m4, m1, 6 | |
9097 | |
9098 | pmaddubsw m4, [r5 + 8 * 16] | |
9099 | pmulhrsw m4, m0 | |
9100 | packuswb m4, m4 | |
9101 | movd [r0 + 28], m4 | |
9102 | |
9103 | ; mode 4 | |
9104 | |
9105 | pmaddubsw m4, m1, [r5 + 21 * 16] | |
9106 | pmulhrsw m4, m0 | |
9107 | packuswb m4, m4 | |
9108 | movd [r0 + 32], m4 | |
9109 | |
9110 | pmaddubsw m4, m2, [r5 + 10 * 16] | |
9111 | pmulhrsw m4, m0 | |
9112 | packuswb m4, m4 | |
9113 | movd [r0 + 36], m4 | |
9114 | |
9115 | pmaddubsw m4, m2, [r5 + 31 * 16] | |
9116 | pmulhrsw m4, m0 | |
9117 | packuswb m4, m4 | |
9118 | movd [r0 + 40], m4 | |
9119 | |
9120 | pmaddubsw m4, m3, m7 | |
9121 | pmulhrsw m4, m0 | |
9122 | packuswb m4, m4 | |
9123 | movd [r0 + 44], m4 | |
9124 | |
9125 | ; mode 5 | |
; the last row consumes m3 in place; the two-pixel-advanced pairs are not needed afterwards
9126 | |
9127 | pmaddubsw m4, m1, [r5 + 17 * 16] | |
9128 | pmulhrsw m4, m0 | |
9129 | packuswb m4, m4 | |
9130 | movd [r0 + 48], m4 | |
9131 | |
9132 | pmaddubsw m4, m2, [r5 + 2 * 16] | |
9133 | pmulhrsw m4, m0 | |
9134 | packuswb m4, m4 | |
9135 | movd [r0 + 52], m4 | |
9136 | |
9137 | pmaddubsw m4, m2, [r5 + 19 * 16] | |
9138 | pmulhrsw m4, m0 | |
9139 | packuswb m4, m4 | |
9140 | movd [r0 + 56], m4 | |
9141 | |
9142 | pmaddubsw m3, [r5 + 4 * 16] | |
9143 | pmulhrsw m3, m0 | |
9144 | packuswb m3, m3 | |
9145 | movd [r0 + 60], m3 | |
9146 | |
9147 | ; mode 6 | |
; rows 1 and 3 reuse m5 and m6 computed in mode 3 (same pairs, same table rows 26 and 20)
9148 | |
9149 | pmaddubsw m3, m1, [r5 + 13 * 16] | |
9150 | pmulhrsw m3, m0 | |
9151 | packuswb m3, m3 | |
9152 | movd [r0 + 64], m3 | |
9153 | |
9154 | movd [r0 + 68], m5 | |
9155 | |
9156 | pmaddubsw m3, m2, [r5 + 7 * 16] | |
9157 | pmulhrsw m3, m0 | |
9158 | packuswb m3, m3 | |
9159 | movd [r0 + 72], m3 | |
9160 | |
9161 | movd [r0 + 76], m6 | |
9162 | |
9163 | ; mode 7 | |
; the last row consumes m2 in place
9164 | |
9165 | pmaddubsw m3, m1, [r5 + 9 * 16] | |
9166 | pmulhrsw m3, m0 | |
9167 | packuswb m3, m3 | |
9168 | movd [r0 + 80], m3 | |
9169 | |
9170 | pmaddubsw m3, m1, [r5 + 18 * 16] | |
9171 | pmulhrsw m3, m0 | |
9172 | packuswb m3, m3 | |
9173 | movd [r0 + 84], m3 | |
9174 | |
9175 | pmaddubsw m3, m1, [r5 + 27 * 16] | |
9176 | pmulhrsw m3, m0 | |
9177 | packuswb m3, m3 | |
9178 | movd [r0 + 88], m3 | |
9179 | |
9180 | pmaddubsw m2, [r5 + 4 * 16] | |
9181 | pmulhrsw m2, m0 | |
9182 | packuswb m2, m2 | |
9183 | movd [r0 + 92], m2 | |
9184 | |
9185 | ; mode 8 | |
9186 | |
9187 | pmaddubsw m2, m1, [r5 + 5 * 16] | |
9188 | pmulhrsw m2, m0 | |
9189 | packuswb m2, m2 | |
9190 | movd [r0 + 96], m2 | |
9191 | |
9192 | pmaddubsw m2, m1, [r5 + 10 * 16] | |
9193 | pmulhrsw m2, m0 | |
9194 | packuswb m2, m2 | |
9195 | movd [r0 + 100], m2 | |
9196 | |
9197 | pmaddubsw m2, m1, [r5 + 15 * 16] | |
9198 | pmulhrsw m2, m0 | |
9199 | packuswb m2, m2 | |
9200 | movd [r0 + 104], m2 | |
9201 | |
9202 | pmaddubsw m2, m1, m7 | |
9203 | pmulhrsw m2, m0 | |
9204 | packuswb m2, m2 | |
9205 | movd [r0 + 108], m2 | |
9206 | |
9207 | ; mode 9 | |
; the last row consumes m1 in place; mode 10 reloads it
9208 | |
9209 | pmaddubsw m2, m1, [r5 + 2 * 16] | |
9210 | pmulhrsw m2, m0 | |
9211 | packuswb m2, m2 | |
9212 | movd [r0 + 112], m2 | |
9213 | |
9214 | pmaddubsw m2, m1, [r5 + 4 * 16] | |
9215 | pmulhrsw m2, m0 | |
9216 | packuswb m2, m2 | |
9217 | movd [r0 + 116], m2 | |
9218 | |
9219 | pmaddubsw m2, m1, [r5 + 6 * 16] | |
9220 | pmulhrsw m2, m0 | |
9221 | packuswb m2, m2 | |
9222 | movd [r0 + 120], m2 | |
9223 | |
9224 | pmaddubsw m1, [r5 + 8 * 16] | |
9225 | pmulhrsw m1, m0 | |
9226 | packuswb m1, m1 | |
9227 | movd [r0 + 124], m1 | |
9228 | |
9229 | ; mode 10 | |
; First all four rows are filled with left0[1..4] (one 16-byte store). Then byte 0 of row i
; is overwritten with left0[1] + ((above0[1 + i] - above0[0]) >> 1), saturated to 8 bits by
; packuswb. This adjustment is applied unconditionally (bLuma is not consulted).
9230 | |
9231 | movh m1, [r2] | |
9232 | palignr m2, m1, 1 | |
9233 | pshufd m3, m2, 0 | |
9234 | movu [r0 + 128], m3 | |
9235 | |
9236 | pxor m3, m3 | |
9237 | |
9238 | pshufb m4, m2, m3 | |
9239 | punpcklbw m4, m3 | |
9240 | |
9241 | movh m5, [r1] | |
9242 | |
9243 | pshufb m6, m5, m3 | |
9244 | punpcklbw m6, m3 | |
9245 | |
9246 | psrldq m5, 1 | |
9247 | punpcklbw m5, m3 | |
9248 | |
9249 | psubw m5, m6 | |
9250 | psraw m5, 1 | |
9251 | |
9252 | paddw m4, m5 | |
9253 | |
9254 | packuswb m4, m3 | |
9255 | |
9256 | pextrb [r0 + 128], m4, 0 | |
9257 | pextrb [r0 + 132], m4, 1 | |
9258 | pextrb [r0 + 136], m4, 2 | |
9259 | pextrb [r0 + 140], m4, 3 | |
9260 | |
9261 | ; mode 11 | |
; m1 = byte pairs (left0[i], left0[i+1]), built from the [r2] load of mode 10;
; it stays live until mode 17 starts shifting it
9262 | |
9263 | palignr m2, m1, 1 | |
9264 | punpcklbw m1, m2 | |
9265 | |
9266 | pmaddubsw m2, m1, [r5 + 30 * 16] | |
9267 | pmulhrsw m2, m0 | |
9268 | packuswb m2, m2 | |
9269 | movd [r0 + 144], m2 | |
9270 | |
9271 | pmaddubsw m2, m1, [r5 + 28 * 16] | |
9272 | pmulhrsw m2, m0 | |
9273 | packuswb m2, m2 | |
9274 | movd [r0 + 148], m2 | |
9275 | |
9276 | pmaddubsw m2, m1, [r5 + 26 * 16] | |
9277 | pmulhrsw m2, m0 | |
9278 | packuswb m2, m2 | |
9279 | movd [r0 + 152], m2 | |
9280 | |
9281 | pmaddubsw m2, m1, [r5 + 24 * 16] | |
9282 | pmulhrsw m2, m0 | |
9283 | packuswb m2, m2 | |
9284 | movd [r0 + 156], m2 | |
9285 | |
9286 | ; mode 12 | |
9287 | |
9288 | pmaddubsw m2, m1, [r5 + 27 * 16] | |
9289 | pmulhrsw m2, m0 | |
9290 | packuswb m2, m2 | |
9291 | movd [r0 + 160], m2 | |
9292 | |
9293 | pmaddubsw m2, m1, [r5 + 22 * 16] | |
9294 | pmulhrsw m2, m0 | |
9295 | packuswb m2, m2 | |
9296 | movd [r0 + 164], m2 | |
9297 | |
9298 | pmaddubsw m2, m1, [r5 + 17 * 16] | |
9299 | pmulhrsw m2, m0 | |
9300 | packuswb m2, m2 | |
9301 | movd [r0 + 168], m2 | |
9302 | |
9303 | pmaddubsw m2, m1, [r5 + 12 * 16] | |
9304 | pmulhrsw m2, m0 | |
9305 | packuswb m2, m2 | |
9306 | movd [r0 + 172], m2 | |
9307 | |
9308 | ; mode 13 | |
; for the last row, m2 = m1 moved up by one pair with a new leading pair (above0[4], above0[0]);
; m2 stays live (with its leading byte patched) through mode 16
9309 | |
9310 | pmaddubsw m2, m1, [r5 + 23 * 16] | |
9311 | pmulhrsw m2, m0 | |
9312 | packuswb m2, m2 | |
9313 | movd [r0 + 176], m2 | |
9314 | |
9315 | pmaddubsw m2, m1, [r5 + 14 * 16] | |
9316 | pmulhrsw m2, m0 | |
9317 | packuswb m2, m2 | |
9318 | movd [r0 + 180], m2 | |
9319 | |
9320 | pmaddubsw m2, m1, [r5 + 5 * 16] | |
9321 | pmulhrsw m2, m0 | |
9322 | packuswb m2, m2 | |
9323 | movd [r0 + 184], m2 | |
9324 | |
9325 | pslldq m2, m1, 2 | |
9326 | pinsrb m2, [r1 + 0], 1 | |
9327 | pinsrb m2, [r1 + 4], 0 | |
9328 | |
9329 | pmaddubsw m3, m2, [r5 + 28 * 16] | |
9330 | pmulhrsw m3, m0 | |
9331 | packuswb m3, m3 | |
9332 | movd [r0 + 188], m3 | |
9333 | |
9334 | ; mode 14 | |
; m5 (row 1, table row 6) is kept and stored again as row 0 of mode 17.
; The leading pair of m2 becomes (above0[2], above0[0]) for rows 2 and 3.
9335 | |
9336 | pmaddubsw m3, m1, [r5 + 19 * 16] | |
9337 | pmulhrsw m3, m0 | |
9338 | packuswb m3, m3 | |
9339 | movd [r0 + 192], m3 | |
9340 | |
9341 | pmaddubsw m5, m1, [r5 + 6 * 16] | |
9342 | pmulhrsw m5, m0 | |
9343 | packuswb m5, m5 | |
9344 | movd [r0 + 196], m5 | |
9345 | |
9346 | pinsrb m2, [r1 + 2], 0 | |
9347 | |
9348 | pmaddubsw m3, m2, [r5 + 25 * 16] | |
9349 | pmulhrsw m3, m0 | |
9350 | packuswb m3, m3 | |
9351 | movd [r0 + 200], m3 | |
9352 | |
9353 | pmaddubsw m3, m2, [r5 + 12 * 16] | |
9354 | pmulhrsw m3, m0 | |
9355 | packuswb m3, m3 | |
9356 | movd [r0 + 204], m3 | |
9357 | |
9358 | ; mode 15 | |
; for the last row, m3 = m2 moved up by one pair with leading pair (above0[4], above0[2])
9359 | |
9360 | pmaddubsw m3, m1, [r5 + 15 * 16] | |
9361 | pmulhrsw m3, m0 | |
9362 | packuswb m3, m3 | |
9363 | movd [r0 + 208], m3 | |
9364 | |
9365 | pmaddubsw m3, m2, [r5 + 30 * 16] | |
9366 | pmulhrsw m3, m0 | |
9367 | packuswb m3, m3 | |
9368 | movd [r0 + 212], m3 | |
9369 | |
9370 | pmaddubsw m3, m2, [r5 + 13 * 16] | |
9371 | pmulhrsw m3, m0 | |
9372 | packuswb m3, m3 | |
9373 | movd [r0 + 216], m3 | |
9374 | |
9375 | pslldq m3, m2, 2 | |
9376 | pinsrb m3, [r1 + 2], 1 | |
9377 | pinsrb m3, [r1 + 4], 0 | |
9378 | |
9379 | pmaddubsw m4, m3, [r5 + 28 * 16] | |
9380 | pmulhrsw m4, m0 | |
9381 | packuswb m4, m4 | |
9382 | movd [r0 + 220], m4 | |
9383 | |
9384 | ; mode 16 | |
; the last row reuses m3 from mode 15 with its leading byte replaced by above0[3]
9385 | |
9386 | pmaddubsw m4, m1, [r5 + 11 * 16] | |
9387 | pmulhrsw m4, m0 | |
9388 | packuswb m4, m4 | |
9389 | movd [r0 + 224], m4 | |
9390 | |
9391 | pmaddubsw m4, m2, [r5 + 22 * 16] | |
9392 | pmulhrsw m4, m0 | |
9393 | packuswb m4, m4 | |
9394 | movd [r0 + 228], m4 | |
9395 | |
9396 | pmaddubsw m4, m2, [r5 + 1 * 16] | |
9397 | pmulhrsw m4, m0 | |
9398 | packuswb m4, m4 | |
9399 | movd [r0 + 232], m4 | |
9400 | |
9401 | pinsrb m3, [r1 + 3], 0 | |
9402 | |
9403 | pmaddubsw m3, [r5 + 12 * 16] | |
9404 | pmulhrsw m3, m0 | |
9405 | packuswb m3, m3 | |
9406 | movd [r0 + 236], m3 | |
9407 | |
9408 | ; mode 17 | |
; row 0 is m5 saved in mode 14. m1 is then moved up one pair per row, gaining the leading
; pairs (above0[1], above0[0]), (above0[2], above0[1]) and (above0[4], above0[2]) in turn.
9409 | |
9410 | movd [r0 + 240], m5 | |
9411 | |
9412 | pslldq m1, 2 | |
9413 | pinsrb m1, [r1 + 1], 0 | |
9414 | pinsrb m1, [r1 + 0], 1 | |
9415 | |
9416 | pmaddubsw m2, m1, [r5 + 12 * 16] | |
9417 | pmulhrsw m2, m0 | |
9418 | packuswb m2, m2 | |
9419 | movd [r0 + 244], m2 | |
9420 | |
9421 | pslldq m1, 2 | |
9422 | pinsrb m1, [r1 + 2], 0 | |
9423 | pinsrb m1, [r1 + 1], 1 | |
9424 | |
9425 | pmaddubsw m2, m1, [r5 + 18 * 16] | |
9426 | pmulhrsw m2, m0 | |
9427 | packuswb m2, m2 | |
9428 | movd [r0 + 248], m2 | |
9429 | |
9430 | pslldq m1, 2 | |
9431 | pinsrb m1, [r1 + 4], 0 | |
9432 | pinsrb m1, [r1 + 2], 1 | |
9433 | |
9434 | pmaddubsw m1, [r5 + 24 * 16] | |
9435 | pmulhrsw m1, m0 | |
9436 | packuswb m1, m1 | |
9437 | movd [r0 + 252], m1 | |
9438 | |
9439 | ; mode 18 | |
; row 0 = above0[0..3]; every following row is the previous one moved up by one byte with
; left0[1], left0[2], left0[3] inserted at byte 0. No table lookup is involved.
9440 | |
9441 | movh m1, [r1] | |
9442 | movd [r0 + 256], m1 | |
9443 | |
9444 | pslldq m2, m1, 1 | |
9445 | pinsrb m2, [r2 + 1], 0 | |
9446 | movd [r0 + 260], m2 | |
9447 | |
9448 | pslldq m3, m2, 1 | |
9449 | pinsrb m3, [r2 + 2], 0 | |
9450 | movd [r0 + 264], m3 | |
9451 | |
9452 | pslldq m4, m3, 1 | |
9453 | pinsrb m4, [r2 + 3], 0 | |
9454 | movd [r0 + 268], m4 | |
9455 | |
9456 | ; mode 19 | |
; m1 = byte pairs (above0[i], above0[i+1]), live through mode 25.
; m5 (row 0, table row 6) is kept and stored again as row 1 of mode 22.
; m2 / m3 are m1 moved up by one / two pairs with leading bytes taken from left0.
9457 | |
9458 | palignr m4, m1, 1 | |
9459 | punpcklbw m1, m4 | |
9460 | |
9461 | pmaddubsw m5, m1, [r5 + 6 * 16] | |
9462 | pmulhrsw m5, m0 | |
9463 | packuswb m5, m5 | |
9464 | movd [r0 + 272], m5 | |
9465 | |
9466 | pslldq m2, m1, 2 | |
9467 | pinsrb m2, [r2 + 1], 0 | |
9468 | pinsrb m2, [r2], 1 | |
9469 | |
9470 | pmaddubsw m3, m2, [r5 + 12 * 16] | |
9471 | pmulhrsw m3, m0 | |
9472 | packuswb m3, m3 | |
9473 | movd [r0 + 276], m3 | |
9474 | |
9475 | pslldq m3, m2, 2 | |
9476 | pinsrb m3, [r2 + 1], 1 | |
9477 | pinsrb m3, [r2 + 2], 0 | |
9478 | |
9479 | pmaddubsw m4, m3, [r5 + 18 * 16] | |
9480 | pmulhrsw m4, m0 | |
9481 | packuswb m4, m4 | |
9482 | movd [r0 + 280], m4 | |
9483 | |
9484 | pslldq m3, 2 | |
9485 | pinsrb m3, [r2 + 2], 1 | |
9486 | pinsrb m3, [r2 + 4], 0 | |
9487 | |
9488 | pmaddubsw m3, [r5 + 24 * 16] | |
9489 | pmulhrsw m3, m0 | |
9490 | packuswb m3, m3 | |
9491 | movd [r0 + 284], m3 | |
9492 | |
9493 | ; mode 20 | |
; leading pair of m2 becomes (left0[2], left0[0]); m3 = m2 moved up one pair with
; leading pair (left0[3], left0[2])
9494 | |
9495 | pmaddubsw m3, m1, [r5 + 11 * 16] | |
9496 | pmulhrsw m3, m0 | |
9497 | packuswb m3, m3 | |
9498 | movd [r0 + 288], m3 | |
9499 | |
9500 | pinsrb m2, [r2 + 2], 0 | |
9501 | |
9502 | pmaddubsw m3, m2, [r5 + 22 * 16] | |
9503 | pmulhrsw m3, m0 | |
9504 | packuswb m3, m3 | |
9505 | movd [r0 + 292], m3 | |
9506 | |
9507 | pmaddubsw m3, m2, [r5 + 1 * 16] | |
9508 | pmulhrsw m3, m0 | |
9509 | packuswb m3, m3 | |
9510 | movd [r0 + 296], m3 | |
9511 | |
9512 | pslldq m3, m2, 2 | |
9513 | pinsrb m3, [r2 + 2], 1 | |
9514 | pinsrb m3, [r2 + 3], 0 | |
9515 | |
9516 | pmaddubsw m4, m3, [r5 + 12 * 16] | |
9517 | pmulhrsw m4, m0 | |
9518 | packuswb m4, m4 | |
9519 | movd [r0 + 300], m4 | |
9520 | |
9521 | ; mode 21 | |
; the last row reuses m3 from mode 20 with its leading byte replaced by left0[4]
9522 | |
9523 | pmaddubsw m4, m1, [r5 + 15 * 16] | |
9524 | pmulhrsw m4, m0 | |
9525 | packuswb m4, m4 | |
9526 | movd [r0 + 304], m4 | |
9527 | |
9528 | pmaddubsw m4, m2, [r5 + 30 * 16] | |
9529 | pmulhrsw m4, m0 | |
9530 | packuswb m4, m4 | |
9531 | movd [r0 + 308], m4 | |
9532 | |
9533 | pmaddubsw m4, m2, [r5 + 13 * 16] | |
9534 | pmulhrsw m4, m0 | |
9535 | packuswb m4, m4 | |
9536 | movd [r0 + 312], m4 | |
9537 | |
9538 | pinsrb m3, [r2 + 4], 0 | |
9539 | |
9540 | pmaddubsw m3, [r5 + 28 * 16] | |
9541 | pmulhrsw m3, m0 | |
9542 | packuswb m3, m3 | |
9543 | movd [r0 + 316], m3 | |
9544 | |
9545 | ; mode 22 | |
; row 1 is m5 saved in mode 19
9546 | |
9547 | pmaddubsw m3, m1, [r5 + 19 * 16] | |
9548 | pmulhrsw m3, m0 | |
9549 | packuswb m3, m3 | |
9550 | movd [r0 + 320], m3 | |
9551 | |
9552 | movd [r0 + 324], m5 | |
9553 | |
9554 | pmaddubsw m3, m2, [r5 + 25 * 16] | |
9555 | pmulhrsw m3, m0 | |
9556 | packuswb m3, m3 | |
9557 | movd [r0 + 328], m3 | |
9558 | |
9559 | pmaddubsw m3, m2, [r5 + 12 * 16] | |
9560 | pmulhrsw m3, m0 | |
9561 | packuswb m3, m3 | |
9562 | movd [r0 + 332], m3 | |
9563 | |
9564 | ; mode 23 | |
; the last row patches the leading byte of m2 to left0[4] and consumes m2 in place
9565 | |
9566 | pmaddubsw m3, m1, [r5 + 23 * 16] | |
9567 | pmulhrsw m3, m0 | |
9568 | packuswb m3, m3 | |
9569 | movd [r0 + 336], m3 | |
9570 | |
9571 | pmaddubsw m3, m1, [r5 + 14 * 16] | |
9572 | pmulhrsw m3, m0 | |
9573 | packuswb m3, m3 | |
9574 | movd [r0 + 340], m3 | |
9575 | |
9576 | pmaddubsw m3, m1, [r5 + 5 * 16] | |
9577 | pmulhrsw m3, m0 | |
9578 | packuswb m3, m3 | |
9579 | movd [r0 + 344], m3 | |
9580 | |
9581 | pinsrb m2, [r2 + 4], 0 | |
9582 | |
9583 | pmaddubsw m2, [r5 + 28 * 16] | |
9584 | pmulhrsw m2, m0 | |
9585 | packuswb m2, m2 | |
9586 | movd [r0 + 348], m2 | |
9587 | |
9588 | ; mode 24 | |
9589 | |
9590 | pmaddubsw m2, m1, [r5 + 27 * 16] | |
9591 | pmulhrsw m2, m0 | |
9592 | packuswb m2, m2 | |
9593 | movd [r0 + 352], m2 | |
9594 | |
9595 | pmaddubsw m2, m1, [r5 + 22 * 16] | |
9596 | pmulhrsw m2, m0 | |
9597 | packuswb m2, m2 | |
9598 | movd [r0 + 356], m2 | |
9599 | |
9600 | pmaddubsw m2, m1, [r5 + 17 * 16] | |
9601 | pmulhrsw m2, m0 | |
9602 | packuswb m2, m2 | |
9603 | movd [r0 + 360], m2 | |
9604 | |
9605 | pmaddubsw m2, m1, [r5 + 12 * 16] | |
9606 | pmulhrsw m2, m0 | |
9607 | packuswb m2, m2 | |
9608 | movd [r0 + 364], m2 | |
9609 | |
9610 | ; mode 25 | |
9611 | |
9612 | pmaddubsw m2, m1, [r5 + 30 * 16] | |
9613 | pmulhrsw m2, m0 | |
9614 | packuswb m2, m2 | |
9615 | movd [r0 + 368], m2 | |
9616 | |
9617 | pmaddubsw m2, m1, [r5 + 28 * 16] | |
9618 | pmulhrsw m2, m0 | |
9619 | packuswb m2, m2 | |
9620 | movd [r0 + 372], m2 | |
9621 | |
9622 | pmaddubsw m2, m1, [r5 + 26 * 16] | |
9623 | pmulhrsw m2, m0 | |
9624 | packuswb m2, m2 | |
9625 | movd [r0 + 376], m2 | |
9626 | |
9627 | pmaddubsw m2, m1, [r5 + 24 * 16] | |
9628 | pmulhrsw m2, m0 | |
9629 | packuswb m2, m2 | |
9630 | movd [r0 + 380], m2 | |
9631 | |
9632 | ; mode 26 | |
; Counterpart of mode 10 with the two references swapped: all rows are filled with
; above0[1..4], then byte 0 of row i becomes above0[1] + ((left0[1 + i] - left0[0]) >> 1),
; saturated to 8 bits. Applied unconditionally as well.
9633 | |
9634 | movh m1, [r1 + 1] | |
9635 | pshufd m2, m1, 0 | |
9636 | movu [r0 + 384], m2 | |
9637 | |
9638 | pxor m2, m2 | |
9639 | |
9640 | pshufb m3, m1, m2 | |
9641 | punpcklbw m3, m2 | |
9642 | |
9643 | movh m4, [r2] | |
9644 | |
9645 | pshufb m5, m4, m2 | |
9646 | punpcklbw m5, m2 | |
9647 | |
9648 | psrldq m4, 1 | |
9649 | punpcklbw m4, m2 | |
9650 | |
9651 | psubw m4, m5 | |
9652 | psraw m4, 1 | |
9653 | |
9654 | paddw m3, m4 | |
9655 | |
9656 | packuswb m3, m2 | |
9657 | |
9658 | pextrb [r0 + 384], m3, 0 | |
9659 | pextrb [r0 + 388], m3, 1 | |
9660 | pextrb [r0 + 392], m3, 2 | |
9661 | pextrb [r0 + 396], m3, 3 | |
9662 | |
9663 | ; mode 27 | |
; m1 = byte pairs (above0[i+1], above0[i+2]), built from the [r1 + 1] load of mode 26;
; it stays live through mode 32
9664 | |
9665 | palignr m2, m1, 1 | |
9666 | punpcklbw m1, m2 | |
9667 | |
9668 | pmaddubsw m2, m1, [r5 + 2 * 16] | |
9669 | pmulhrsw m2, m0 | |
9670 | packuswb m2, m2 | |
9671 | movd [r0 + 400], m2 | |
9672 | |
9673 | pmaddubsw m2, m1, [r5 + 4 * 16] | |
9674 | pmulhrsw m2, m0 | |
9675 | packuswb m2, m2 | |
9676 | movd [r0 + 404], m2 | |
9677 | |
9678 | pmaddubsw m2, m1, [r5 + 6 * 16] | |
9679 | pmulhrsw m2, m0 | |
9680 | packuswb m2, m2 | |
9681 | movd [r0 + 408], m2 | |
9682 | |
9683 | pmaddubsw m2, m1, [r5 + 8 * 16] | |
9684 | pmulhrsw m2, m0 | |
9685 | packuswb m2, m2 | |
9686 | movd [r0 + 412], m2 | |
9687 | |
9688 | ; mode 28 | |
9689 | |
9690 | pmaddubsw m2, m1, [r5 + 5 * 16] | |
9691 | pmulhrsw m2, m0 | |
9692 | packuswb m2, m2 | |
9693 | movd [r0 + 416], m2 | |
9694 | |
9695 | pmaddubsw m2, m1, [r5 + 10 * 16] | |
9696 | pmulhrsw m2, m0 | |
9697 | packuswb m2, m2 | |
9698 | movd [r0 + 420], m2 | |
9699 | |
9700 | pmaddubsw m2, m1, [r5 + 15 * 16] | |
9701 | pmulhrsw m2, m0 | |
9702 | packuswb m2, m2 | |
9703 | movd [r0 + 424], m2 | |
9704 | |
9705 | pmaddubsw m2, m1, m7 | |
9706 | pmulhrsw m2, m0 | |
9707 | packuswb m2, m2 | |
9708 | movd [r0 + 428], m2 | |
9709 | |
9710 | ; mode 29 | |
; for the last row, m2 = the pairs of m1 advanced by one pixel; m2 stays live through mode 32
9711 | |
9712 | pmaddubsw m2, m1, [r5 + 9 * 16] | |
9713 | pmulhrsw m2, m0 | |
9714 | packuswb m2, m2 | |
9715 | movd [r0 + 432], m2 | |
9716 | |
9717 | pmaddubsw m2, m1, [r5 + 18 * 16] | |
9718 | pmulhrsw m2, m0 | |
9719 | packuswb m2, m2 | |
9720 | movd [r0 + 436], m2 | |
9721 | |
9722 | pmaddubsw m2, m1, [r5 + 27 * 16] | |
9723 | pmulhrsw m2, m0 | |
9724 | packuswb m2, m2 | |
9725 | movd [r0 + 440], m2 | |
9726 | |
9727 | palignr m2, m1, 2 | |
9728 | |
9729 | pmaddubsw m3, m2, [r5 + 4 * 16] | |
9730 | pmulhrsw m3, m0 | |
9731 | packuswb m3, m3 | |
9732 | movd [r0 + 444], m3 | |
9733 | |
9734 | ; mode 30 | |
; m6 (row 1, table row 26) and m5 (row 3, table row 20) are kept and stored again as
; rows 0 and 1 of mode 33
9735 | |
9736 | pmaddubsw m3, m1, [r5 + 13 * 16] | |
9737 | pmulhrsw m3, m0 | |
9738 | packuswb m3, m3 | |
9739 | movd [r0 + 448], m3 | |
9740 | |
9741 | pmaddubsw m6, m1, [r5 + 26 * 16] | |
9742 | pmulhrsw m6, m0 | |
9743 | packuswb m6, m6 | |
9744 | movd [r0 + 452], m6 | |
9745 | |
9746 | pmaddubsw m3, m2, [r5 + 7 * 16] | |
9747 | pmulhrsw m3, m0 | |
9748 | packuswb m3, m3 | |
9749 | movd [r0 + 456], m3 | |
9750 | |
9751 | pmaddubsw m5, m2, m7 | |
9752 | pmulhrsw m5, m0 | |
9753 | packuswb m5, m5 | |
9754 | movd [r0 + 460], m5 | |
9755 | |
9756 | ; mode 31 | |
; for the last row, m3 = the pairs of m2 advanced by one more pixel; m3 stays live through mode 33
9757 | |
9758 | pmaddubsw m3, m1, [r5 + 17 * 16] | |
9759 | pmulhrsw m3, m0 | |
9760 | packuswb m3, m3 | |
9761 | movd [r0 + 464], m3 | |
9762 | |
9763 | pmaddubsw m3, m2, [r5 + 2 * 16] | |
9764 | pmulhrsw m3, m0 | |
9765 | packuswb m3, m3 | |
9766 | movd [r0 + 468], m3 | |
9767 | |
9768 | pmaddubsw m3, m2, [r5 + 19 * 16] | |
9769 | pmulhrsw m3, m0 | |
9770 | packuswb m3, m3 | |
9771 | movd [r0 + 472], m3 | |
9772 | |
9773 | palignr m3, m2, 2 | |
9774 | |
9775 | pmaddubsw m4, m3, [r5 + 4 * 16] | |
9776 | pmulhrsw m4, m0 | |
9777 | packuswb m4, m4 | |
9778 | movd [r0 + 476], m4 | |
9779 | |
9780 | ; mode 32 | |
9781 | |
9782 | pmaddubsw m4, m1, [r5 + 21 * 16] | |
9783 | pmulhrsw m4, m0 | |
9784 | packuswb m4, m4 | |
9785 | movd [r0 + 480], m4 | |
9786 | |
9787 | pmaddubsw m4, m2, [r5 + 10 * 16] | |
9788 | pmulhrsw m4, m0 | |
9789 | packuswb m4, m4 | |
9790 | movd [r0 + 484], m4 | |
9791 | |
9792 | pmaddubsw m4, m2, [r5 + 31 * 16] | |
9793 | pmulhrsw m4, m0 | |
9794 | packuswb m4, m4 | |
9795 | movd [r0 + 488], m4 | |
9796 | |
9797 | pmaddubsw m4, m3, m7 | |
9798 | pmulhrsw m4, m0 | |
9799 | packuswb m4, m4 | |
9800 | movd [r0 + 492], m4 | |
9801 | |
9802 | ; mode 33 | |
; rows 0 and 1 are m6 and m5 saved in mode 30; the last row advances m3 by one more pixel in place
9803 | |
9804 | movd [r0 + 496], m6 | |
9805 | |
9806 | movd [r0 + 500], m5 | |
9807 | |
9808 | pmaddubsw m4, m3, [r5 + 14 * 16] | |
9809 | pmulhrsw m4, m0 | |
9810 | packuswb m4, m4 | |
9811 | movd [r0 + 504], m4 | |
9812 | |
9813 | psrldq m3, 2 | |
9814 | |
9815 | pmaddubsw m3, [r5 + 8 * 16] | |
9816 | pmulhrsw m3, m0 | |
9817 | packuswb m3, m3 | |
9818 | movd [r0 + 508], m3 | |
9819 | |
9820 | ; mode 34 | |
; rows are above0[2..5], above0[3..6], above0[4..7], above0[5..8];
; m0 is reused for data here, the pw_1024 constant is no longer needed
9821 | |
9822 | movh m0, [r1 + 2] | |
9823 | movd [r0 + 512], m0 | |
9824 | |
9825 | palignr m1, m0, 1 | |
9826 | movd [r0 + 516], m1 | |
9827 | |
9828 | palignr m1, m0, 2 | |
9829 | movd [r0 + 520], m1 | |
9830 | |
9831 | palignr m1, m0, 3 | |
9832 | movd [r0 + 524], m1 | |
9833 | |
9834 | RET | |
9835 | ||
9836 | ;----------------------------------------------------------------------------- | |
9837 | ; void all_angs_pred_8x8(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) | |
9838 | ;----------------------------------------------------------------------------- | |
9839 | INIT_XMM sse4 | |
9840 | cglobal all_angs_pred_8x8, 6, 6, 8, dest, above0, left0, above1, left1, bLuma | |
9841 | ||
9842 | ; mode 2 | |
9843 | ||
9844 | movu m0, [r4 + 2] | |
9845 | ||
9846 | palignr m1, m0, 1 | |
9847 | punpcklqdq m2, m0, m1 | |
9848 | movu [r0], m2 | |
9849 | ||
9850 | palignr m1, m0, 2 | |
9851 | palignr m2, m0, 3 | |
9852 | punpcklqdq m1, m2 | |
9853 | movu [r0 + 16], m1 | |
9854 | ||
9855 | palignr m1, m0, 4 | |
9856 | palignr m2, m0, 5 | |
9857 | punpcklqdq m1, m2 | |
9858 | movu [r0 + 32], m1 | |
9859 | ||
9860 | palignr m1, m0, 6 | |
9861 | palignr m2, m0, 7 | |
9862 | punpcklqdq m1, m2 | |
9863 | movu [r0 + 48], m1 | |
9864 | ||
9865 | ; mode 3 [row 0, 1] | |
9866 | ||
9867 | mova m7, [pw_1024] | |
9868 | lea r5, [ang_table] | |
9869 | ||
9870 | movu m0, [r2 + 1] | |
9871 | ||
9872 | palignr m1, m0, 1 | |
9873 | palignr m2, m0, 2 | |
9874 | ||
9875 | punpcklbw m3, m0, m1 | |
9876 | pmaddubsw m4, m3, [r5 + 26 * 16] | |
9877 | pmulhrsw m4, m7 | |
9878 | ||
9879 | punpcklbw m1, m2 | |
9880 | pmaddubsw m5, m1, [r5 + 20 * 16] | |
9881 | pmulhrsw m5, m7 | |
9882 | ||
9883 | packuswb m4, m5 | |
9884 | ||
9885 | movu [r0 + 64], m4 | |
9886 | ||
9887 | ; mode 6 [row 1] | |
9888 | ||
9889 | movh [r0 + 264], m4 | |
9890 | ||
9891 | ; mode 6 [row 3] | |
9892 | ||
9893 | movhps [r0 + 280], m4 | |
9894 | ||
9895 | ; mode 4 [row 0, 1] | |
9896 | ||
9897 | pmaddubsw m4, m3, [r5 + 21 * 16] | |
9898 | pmulhrsw m4, m7 | |
9899 | ||
9900 | pmaddubsw m5, m1, [r5 + 10 * 16] | |
9901 | pmulhrsw m5, m7 | |
9902 | ||
9903 | packuswb m4, m5 | |
9904 | movu [r0 + 128], m4 | |
9905 | ||
9906 | ; mode 5 [row 0, 1] | |
9907 | ||
9908 | pmaddubsw m4, m3, [r5 + 17 * 16] | |
9909 | pmulhrsw m4, m7 | |
9910 | ||
9911 | pmaddubsw m5, m1, [r5 + 2 * 16] | |
9912 | pmulhrsw m5, m7 | |
9913 | ||
9914 | packuswb m4, m5 | |
9915 | movu [r0 + 192], m4 | |
9916 | ||
9917 | ; mode 6 [row 0] | |
9918 | ||
9919 | pmaddubsw m4, m3, [r5 + 13 * 16] | |
9920 | pmulhrsw m4, m7 | |
9921 | ||
9922 | pxor m5, m5 | |
9923 | ||
9924 | packuswb m4, m5 | |
9925 | movh [r0 + 256], m4 | |
9926 | ||
9927 | ; mode 7 [row 0, 1] | |
9928 | ||
9929 | pmaddubsw m4, m3, [r5 + 9 * 16] | |
9930 | pmulhrsw m4, m7 | |
9931 | ||
9932 | pmaddubsw m5, m3, [r5 + 18 * 16] | |
9933 | pmulhrsw m5, m7 | |
9934 | ||
9935 | packuswb m4, m5 | |
9936 | movu [r0 + 320], m4 | |
9937 | ||
9938 | ; mode 8 [row 0, 1] | |
9939 | ||
9940 | pmaddubsw m4, m3, [r5 + 5 * 16] | |
9941 | pmulhrsw m4, m7 | |
9942 | ||
9943 | pmaddubsw m5, m3, [r5 + 10 * 16] | |
9944 | pmulhrsw m5, m7 | |
9945 | ||
9946 | packuswb m4, m5 | |
9947 | movu [r0 + 384], m4 | |
9948 | ||
9949 | ; mode 8 [row 2, 3] | |
9950 | ||
9951 | pmaddubsw m4, m3, [r5 + 15 * 16] | |
9952 | pmulhrsw m4, m7 | |
9953 | ||
9954 | pmaddubsw m5, m3, [r5 + 20 * 16] | |
9955 | pmulhrsw m5, m7 | |
9956 | ||
9957 | packuswb m4, m5 | |
9958 | movu [r0 + 400], m4 | |
9959 | ||
9960 | ; mode 8 [row 4, 5] | |
9961 | ||
9962 | pmaddubsw m4, m3, [r5 + 25 * 16] | |
9963 | pmulhrsw m4, m7 | |
9964 | ||
9965 | pmaddubsw m5, m3, [r5 + 30 * 16] | |
9966 | pmulhrsw m5, m7 | |
9967 | ||
9968 | packuswb m4, m5 | |
9969 | movu [r0 + 416], m4 | |
9970 | ||
9971 | ; mode 8 [row 6, 7] | |
9972 | ||
9973 | pmaddubsw m4, m1, [r5 + 3 * 16] | |
9974 | pmulhrsw m4, m7 | |
9975 | ||
9976 | pmaddubsw m5, m1, [r5 + 8 * 16] | |
9977 | pmulhrsw m5, m7 | |
9978 | ||
9979 | packuswb m4, m5 | |
9980 | movu [r0 + 432], m4 | |
9981 | ||
9982 | ; mode 9 [row 0, 1] | |
9983 | ||
9984 | pmaddubsw m4, m3, [r5 + 2 * 16] | |
9985 | pmulhrsw m4, m7 | |
9986 | ||
9987 | pmaddubsw m5, m3, [r5 + 4 * 16] | |
9988 | pmulhrsw m5, m7 | |
9989 | ||
9990 | packuswb m4, m5 | |
9991 | movu [r0 + 448], m4 | |
9992 | ||
9993 | ; mode 9 [row 2, 3] | |
9994 | ||
9995 | pmaddubsw m4, m3, [r5 + 6 * 16] | |
9996 | pmulhrsw m4, m7 | |
9997 | ||
9998 | pmaddubsw m5, m3, [r5 + 8 * 16] | |
9999 | pmulhrsw m5, m7 | |
10000 | ||
10001 | packuswb m4, m5 | |
10002 | movu [r0 + 464], m4 | |
10003 | ||
10004 | ; mode 9 [row 4, 5] | |
10005 | ||
10006 | pmaddubsw m4, m3, [r5 + 10 * 16] | |
10007 | pmulhrsw m4, m7 | |
10008 | ||
10009 | pmaddubsw m5, m3, [r5 + 12 * 16] | |
10010 | pmulhrsw m5, m7 | |
10011 | ||
10012 | packuswb m4, m5 | |
10013 | movu [r0 + 480], m4 | |
10014 | ||
10015 | ; mode 9 [row 6, 7] | |
10016 | ||
10017 | pmaddubsw m4, m3, [r5 + 14 * 16] | |
10018 | pmulhrsw m4, m7 | |
10019 | ||
10020 | pmaddubsw m5, m3, [r5 + 16 * 16] | |
10021 | pmulhrsw m5, m7 | |
10022 | ||
10023 | packuswb m4, m5 | |
10024 | movu [r0 + 496], m4 | |
10025 | ||
10026 | ; mode 7 [row 2, 3] | |
10027 | ||
10028 | pmaddubsw m4, m3, [r5 + 27 * 16] | |
10029 | pmulhrsw m4, m7 | |
10030 | ||
10031 | pmaddubsw m5, m1, [r5 + 4 * 16] | |
10032 | pmulhrsw m5, m7 | |
10033 | ||
10034 | packuswb m4, m5 | |
10035 | movu [r0 + 336], m4 | |
10036 | ||
10037 | ; mode 7 [row 4, 5] | |
10038 | ||
10039 | pmaddubsw m4, m1, [r5 + 13 * 16] | |
10040 | pmulhrsw m4, m7 | |
10041 | ||
10042 | pmaddubsw m5, m1, [r5 + 22 * 16] | |
10043 | pmulhrsw m5, m7 | |
10044 | ||
10045 | packuswb m4, m5 | |
10046 | movu [r0 + 352], m4 | |
10047 | ||
10048 | ; mode 6 [row 2] | |
10049 | ||
10050 | pmaddubsw m4, m1, [r5 + 7 * 16] | |
10051 | pmulhrsw m4, m7 | |
10052 | ||
10053 | pxor m5, m5 | |
10054 | ||
10055 | packuswb m4, m5 | |
10056 | movh [r0 + 272], m4 | |
10057 | ||
10058 | ; mode 3 [row 2, 3] | |
10059 | ||
10060 | palignr m1, m0, 3 | |
10061 | palignr m3, m0, 4 | |
10062 | ||
10063 | punpcklbw m2, m1 | |
10064 | pmaddubsw m5, m2, [r5 + 14 * 16] | |
10065 | pmulhrsw m5, m7 | |
10066 | ||
10067 | punpcklbw m1, m3 | |
10068 | pmaddubsw m6, m1, [r5 + 8 * 16] | |
10069 | pmulhrsw m6, m7 | |
10070 | ||
10071 | packuswb m5, m6 | |
10072 | movu [r0 + 80], m5 | |
10073 | ||
10074 | ; mode 6 [row 7] | |
10075 | ||
10076 | movhps [r0 + 312], m5 | |
10077 | ||
10078 | ; mode 6 [row 5] | |
10079 | ||
10080 | movh [r0 + 296], m5 | |
10081 | ||
10082 | ; mode 4 [calculate and store row 4, 5] | |
10083 | ||
10084 | pmaddubsw m4, m1, [r5 + 9 * 16] | |
10085 | pmulhrsw m4, m7 | |
10086 | ||
10087 | pmaddubsw m5, m1, [r5 + 30 * 16] | |
10088 | pmulhrsw m5, m7 | |
10089 | ||
10090 | packuswb m4, m5 | |
10091 | movu [r0 + 160], m4 | |
10092 | ||
10093 | ; mode 5 [row 4, 5] | |
10094 | ||
10095 | pmaddubsw m4, m2, [r5 + 21 * 16] | |
10096 | pmulhrsw m4, m7 | |
10097 | ||
10098 | pmaddubsw m5, m1, [r5 + 6 * 16] | |
10099 | pmulhrsw m5, m7 | |
10100 | ||
10101 | packuswb m4, m5 | |
10102 | movu [r0 + 224], m4 | |
10103 | ||
10104 | ; mode 6 [row 4, 5] | |
10105 | ||
10106 | pmaddubsw m5, m2, [r5 + 1 * 16] | |
10107 | pmulhrsw m5, m7 | |
10108 | ||
10109 | pxor m6, m6 | |
10110 | ||
10111 | packuswb m5, m6 | |
10112 | movh [r0 + 288], m5 | |
10113 | ||
10114 | ; mode 6 [row 6, 7] | |
10115 | ||
10116 | pmaddubsw m5, m2, [r5 + 27 * 16] | |
10117 | pmulhrsw m5, m7 | |
10118 | ||
10119 | pxor m6, m6 | |
10120 | ||
10121 | packuswb m5, m6 | |
10122 | movh [r0 + 304], m5 | |
10123 | ||
10124 | ; mode 5 [calculate row 6] | |
10125 | ||
10126 | pmaddubsw m6, m1, [r5 + 23 * 16] | |
10127 | pmulhrsw m6, m7 | |
10128 | ||
10129 | ; mode 3 [row 4, 5] | |
10130 | ||
10131 | palignr m1, m0, 5 | |
10132 | ||
10133 | punpcklbw m3, m1 | |
10134 | pmaddubsw m4, m3, [r5 + 2 * 16] | |
10135 | pmulhrsw m4, m7 | |
10136 | ||
10137 | pmaddubsw m5, m3, [r5 + 28 * 16] | |
10138 | pmulhrsw m5, m7 | |
10139 | ||
10140 | packuswb m4, m5 | |
10141 | movu [r0 + 96], m4 | |
10142 | ||
10143 | ; mode 4 [calculate row 7] | |
10144 | ||
10145 | pmaddubsw m5, m3, [r5 + 19 * 16] | |
10146 | pmulhrsw m5, m7 | |
10147 | ||
10148 | ; mode 5 [calculate row 6] | |
10149 | ||
10150 | pmaddubsw m4, m3, [r5 + 8 * 16] | |
10151 | pmulhrsw m4, m7 | |
10152 | ||
10153 | packuswb m6, m4 | |
10154 | movu [r0 + 240], m6 | |
10155 | ||
10156 | ; mode 3 [row 6, 7] | |
10157 | ||
10158 | palignr m2, m0, 6 | |
10159 | palignr m3, m0, 7 | |
10160 | ||
10161 | punpcklbw m1, m2 | |
10162 | pmaddubsw m4, m1, [r5 + 22 * 16] | |
10163 | pmulhrsw m4, m7 | |
10164 | ||
10165 | punpcklbw m2, m3 | |
10166 | pmaddubsw m2, [r5 + 16 * 16] | |
10167 | pmulhrsw m2, m7 | |
10168 | ||
10169 | packuswb m4, m2 | |
10170 | movu [r0 + 112], m4 | |
10171 | ||
10172 | ; mode 4 [calculate row 7] | |
10173 | ||
10174 | pmaddubsw m2, m1, [r5 + 8 * 16] | |
10175 | pmulhrsw m2, m7 | |
10176 | ||
10177 | ; mode 4 [store row 6 and 7] | |
10178 | ||
10179 | packuswb m5, m2 | |
10180 | movu [r0 + 176], m5 | |
10181 | ||
10182 | ; mode 4 [row 2, 3] | |
10183 | ||
10184 | palignr m1, m0, 1 | |
10185 | palignr m2, m0, 2 | |
10186 | palignr m3, m0, 3 | |
10187 | ||
10188 | punpcklbw m1, m2 | |
10189 | pmaddubsw m4, m1, [r5 + 31 * 16] | |
10190 | pmulhrsw m4, m7 | |
10191 | ||
10192 | punpcklbw m2, m3 | |
10193 | pmaddubsw m5, m2, [r5 + 20 * 16] | |
10194 | pmulhrsw m5, m7 | |
10195 | ||
10196 | packuswb m4, m5 | |
10197 | movu [r0 + 144], m4 | |
10198 | ||
10199 | ; mode 5 [row 2, 3] | |
10200 | ||
10201 | pmaddubsw m4, m1, [r5 + 19 * 16] | |
10202 | pmulhrsw m4, m7 | |
10203 | ||
10204 | pmaddubsw m5, m2, [r5 + 4 * 16] | |
10205 | pmulhrsw m5, m7 | |
10206 | ||
10207 | packuswb m4, m5 | |
10208 | movu [r0 + 208], m4 | |
10209 | ||
10210 | ; mode 7 [row 6, 7] | |
10211 | ||
10212 | pmaddubsw m4, m1, [r5 + 31 * 16] | |
10213 | pmulhrsw m4, m7 | |
10214 | ||
10215 | pmaddubsw m5, m2, [r5 + 8 * 16] | |
10216 | pmulhrsw m5, m7 | |
10217 | ||
10218 | packuswb m4, m5 | |
10219 | movu [r0 + 368], m4 | |
10220 | ||
10221 | ; mode 10 | |
10222 | ||
10223 | pshufb m1, m0, [tab_Si] | |
10224 | movu [r0 + 512], m1 | |
10225 | movu [r0 + 528], m1 | |
10226 | movu [r0 + 544], m1 | |
10227 | movu [r0 + 560], m1 | |
10228 | ||
10229 | pxor m0, m0 | |
10230 | ||
10231 | pshufb m1, m1, m0 | |
10232 | punpcklbw m1, m0 | |
10233 | ||
10234 | movu m2, [r1] | |
10235 | ||
10236 | pshufb m3, m2, m0 | |
10237 | punpcklbw m3, m0 | |
10238 | ||
10239 | psrldq m4, m2, 1 | |
10240 | punpcklbw m4, m0 | |
10241 | ||
10242 | movu m2, [r1 + 9] | |
10243 | punpcklbw m2, m0 | |
10244 | ||
10245 | psubw m4, m3 | |
10246 | psubw m2, m3 | |
10247 | ||
10248 | psraw m4, 1 | |
10249 | psraw m2, 1 | |
10250 | ||
10251 | paddw m4, m1 | |
10252 | paddw m2, m1 | |
10253 | ||
10254 | packuswb m4, m2 | |
10255 | ||
10256 | pextrb [r0 + 512], m4, 0 | |
10257 | pextrb [r0 + 520], m4, 1 | |
10258 | pextrb [r0 + 528], m4, 2 | |
10259 | pextrb [r0 + 536], m4, 3 | |
10260 | pextrb [r0 + 544], m4, 4 | |
10261 | pextrb [r0 + 552], m4, 5 | |
10262 | pextrb [r0 + 560], m4, 6 | |
10263 | pextrb [r0 + 568], m4, 7 | |
10264 | ||
10265 | ; mode 11 [row 0, 1] | |
10266 | ||
10267 | movu m0, [r2] | |
10268 | palignr m1, m0, 1 | |
10269 | punpcklbw m2, m0, m1 | |
10270 | ||
10271 | pmaddubsw m3, m2, [r5 + 30 * 16] | |
10272 | pmulhrsw m3, m7 | |
10273 | ||
10274 | pmaddubsw m4, m2, [r5 + 28 * 16] | |
10275 | pmulhrsw m4, m7 | |
10276 | ||
10277 | packuswb m3, m4 | |
10278 | movu [r0 + 576], m3 | |
10279 | ||
10280 | ; mode 11 [row 2, 3] | |
10281 | ||
10282 | pmaddubsw m3, m2, [r5 + 26 * 16] | |
10283 | pmulhrsw m3, m7 | |
10284 | ||
10285 | pmaddubsw m4, m2, [r5 + 24 * 16] | |
10286 | pmulhrsw m4, m7 | |
10287 | ||
10288 | packuswb m3, m4 | |
10289 | movu [r0 + 592], m3 | |
10290 | ||
10291 | ; mode 11 [row 4, 5] | |
10292 | ||
10293 | pmaddubsw m3, m2, [r5 + 22 * 16] | |
10294 | pmulhrsw m3, m7 | |
10295 | ||
10296 | pmaddubsw m4, m2, [r5 + 20 * 16] | |
10297 | pmulhrsw m4, m7 | |
10298 | ||
10299 | packuswb m5, m3, m4 | |
10300 | movu [r0 + 608], m5 | |
10301 | ||
10302 | ; mode 12 [row 0, 1] | |
10303 | ||
10304 | pmaddubsw m4, m2, [r5 + 27 * 16] | |
10305 | pmulhrsw m4, m7 | |
10306 | ||
10307 | packuswb m4, m3 | |
10308 | movu [r0 + 640], m4 | |
10309 | ||
10310 | ; mode 11 [row 6, 7] | |
10311 | ||
10312 | pmaddubsw m3, m2, [r5 + 18 * 16] | |
10313 | pmulhrsw m3, m7 | |
10314 | ||
10315 | pmaddubsw m4, m2, [r5 + 16 * 16] | |
10316 | pmulhrsw m4, m7 | |
10317 | ||
10318 | packuswb m3, m4 | |
10319 | movu [r0 + 624], m3 | |
10320 | ||
10321 | ; mode 12 [row 2, 3] | |
10322 | ||
10323 | pmaddubsw m3, m2, [r5 + 17 * 16] | |
10324 | pmulhrsw m3, m7 | |
10325 | ||
10326 | pmaddubsw m4, m2, [r5 + 12 * 16] | |
10327 | pmulhrsw m4, m7 | |
10328 | ||
10329 | packuswb m3, m4 | |
10330 | movu [r0 + 656], m3 | |
10331 | ||
10332 | ; mode 12 [row 4, 5] | |
10333 | ||
10334 | pmaddubsw m3, m2, [r5 + 7 * 16] | |
10335 | pmulhrsw m3, m7 | |
10336 | ||
10337 | pmaddubsw m4, m2, [r5 + 2 * 16] | |
10338 | pmulhrsw m4, m7 | |
10339 | ||
10340 | packuswb m3, m4 | |
10341 | movu [r0 + 672], m3 | |
10342 | ||
10343 | ; mode 12 [row 6, 7] | |
10344 | ||
10345 | pslldq m3, m2, 2 | |
10346 | pinsrb m3, [r1 + 0], 1 | |
10347 | pinsrb m3, [r1 + 6], 0 | |
10348 | ||
10349 | pmaddubsw m4, m3, [r5 + 29 * 16] | |
10350 | pmulhrsw m4, m7 | |
10351 | ||
10352 | pmaddubsw m5, m3, [r5 + 24 * 16] | |
10353 | pmulhrsw m5, m7 | |
10354 | ||
10355 | packuswb m4, m5 | |
10356 | movu [r0 + 688], m4 | |
10357 | ||
10358 | ; mode 13 [row 0, 1] | |
10359 | ||
10360 | pmaddubsw m4, m2, [r5 + 23 * 16] | |
10361 | pmulhrsw m4, m7 | |
10362 | ||
10363 | pmaddubsw m5, m2, [r5 + 14 * 16] | |
10364 | pmulhrsw m5, m7 | |
10365 | ||
10366 | packuswb m4, m5 | |
10367 | movu [r0 + 704], m4 | |
10368 | ||
10369 | ; mode 13 [row 2, 3] | |
10370 | ||
10371 | pmaddubsw m4, m2, [r5 + 5 * 16] | |
10372 | pmulhrsw m4, m7 | |
10373 | ||
10374 | pinsrb m3, [r1 + 4], 0 | |
10375 | pmaddubsw m5, m3, [r5 + 28 * 16] | |
10376 | pmulhrsw m5, m7 | |
10377 | ||
10378 | packuswb m4, m5 | |
10379 | movu [r0 + 720], m4 | |
10380 | ||
10381 | ; mode 13 [row 4, 5] | |
10382 | ||
10383 | pmaddubsw m4, m3, [r5 + 19 * 16] | |
10384 | pmulhrsw m4, m7 | |
10385 | ||
10386 | pmaddubsw m5, m3, [r5 + 10 * 16] | |
10387 | pmulhrsw m5, m7 | |
10388 | ||
10389 | packuswb m4, m5 | |
10390 | movu [r0 + 736], m4 | |
10391 | ||
10392 | ; mode 13 [row 6, 7] | |
10393 | ||
10394 | pmaddubsw m4, m3, [r5 + 1 * 16] | |
10395 | pmulhrsw m4, m7 | |
10396 | ||
10397 | pslldq m5, m3, 2 | |
10398 | pinsrb m5, [r1 + 4], 1 | |
10399 | pinsrb m5, [r1 + 7], 0 | |
10400 | ||
10401 | pmaddubsw m5, [r5 + 24 * 16] | |
10402 | pmulhrsw m5, m7 | |
10403 | ||
10404 | packuswb m4, m5 | |
10405 | movu [r0 + 752], m4 | |
10406 | ||
10407 | ; mode 14 [row 0, 1] | |
10408 | ||
10409 | pmaddubsw m4, m2, [r5 + 19 * 16] | |
10410 | pmulhrsw m4, m7 | |
10411 | ||
10412 | pmaddubsw m5, m2, [r5 + 6 * 16] | |
10413 | pmulhrsw m5, m7 | |
10414 | ||
10415 | packuswb m4, m5 | |
10416 | movu [r0 + 768], m4 | |
10417 | ||
10418 | ; mode 14 [row 2, 3] | |
10419 | ||
10420 | pinsrb m3, [r1 + 2], 0 | |
10421 | ||
10422 | pmaddubsw m4, m3, [r5 + 25 * 16] | |
10423 | pmulhrsw m4, m7 | |
10424 | ||
10425 | pmaddubsw m5, m3, [r5 + 12 * 16] | |
10426 | pmulhrsw m5, m7 | |
10427 | ||
10428 | packuswb m4, m5 | |
10429 | movu [r0 + 784], m4 | |
10430 | ||
10431 | ; mode 14 [row 4, 5] | |
10432 | ||
10433 | pslldq m1, m3, 2 | |
10434 | pinsrb m1, [r1 + 2], 1 | |
10435 | pinsrb m1, [r1 + 5], 0 | |
10436 | ||
10437 | pmaddubsw m4, m1, [r5 + 31 * 16] | |
10438 | pmulhrsw m4, m7 | |
10439 | ||
10440 | pmaddubsw m5, m1, [r5 + 18 * 16] | |
10441 | pmulhrsw m5, m7 | |
10442 | ||
10443 | packuswb m4, m5 | |
10444 | movu [r0 + 800], m4 | |
10445 | ||
10446 | ; mode 14 [row 6, 7] | |
10447 | ||
10448 | pmaddubsw m4, m1, [r5 + 5 * 16] | |
10449 | pmulhrsw m4, m7 | |
10450 | ||
10451 | pslldq m1, 2 | |
10452 | pinsrb m1, [r1 + 5], 1 | |
10453 | pinsrb m1, [r1 + 7], 0 | |
10454 | ||
10455 | pmaddubsw m5, m1, [r5 + 24 * 16] | |
10456 | pmulhrsw m5, m7 | |
10457 | ||
10458 | packuswb m4, m5 | |
10459 | movu [r0 + 816], m4 | |
10460 | ||
10461 | ; mode 15 [row 0, 1] | |
10462 | ||
10463 | pmaddubsw m4, m2, [r5 + 15 * 16] | |
10464 | pmulhrsw m4, m7 | |
10465 | ||
10466 | pmaddubsw m5, m3, [r5 + 30 * 16] | |
10467 | pmulhrsw m5, m7 | |
10468 | ||
10469 | packuswb m4, m5 | |
10470 | movu [r0 + 832], m4 | |
10471 | ||
10472 | ; mode 15 [row 2, 3] | |
10473 | ||
10474 | pmaddubsw m4, m3, [r5 + 13 * 16] | |
10475 | pmulhrsw m4, m7 | |
10476 | ||
10477 | pslldq m1, m3, 2 | |
10478 | pinsrb m1, [r1 + 2], 1 | |
10479 | pinsrb m1, [r1 + 4], 0 | |
10480 | ||
10481 | pmaddubsw m5, m1, [r5 + 28 * 16] | |
10482 | pmulhrsw m5, m7 | |
10483 | ||
10484 | packuswb m4, m5 | |
10485 | movu [r0 + 848], m4 | |
10486 | ||
10487 | ; mode 15 [row 4, 5] | |
10488 | ||
10489 | pmaddubsw m4, m1, [r5 + 11 * 16] | |
10490 | pmulhrsw m4, m7 | |
10491 | ||
10492 | pslldq m1, 2 | |
10493 | pinsrb m1, [r1 + 4], 1 | |
10494 | pinsrb m1, [r1 + 6], 0 | |
10495 | ||
10496 | pmaddubsw m5, m1, [r5 + 26 * 16] | |
10497 | pmulhrsw m5, m7 | |
10498 | ||
10499 | packuswb m4, m5 | |
10500 | movu [r0 + 864], m4 | |
10501 | ||
10502 | ; mode 15 [row 6, 7] | |
10503 | ||
10504 | pmaddubsw m4, m1, [r5 + 9 * 16] | |
10505 | pmulhrsw m4, m7 | |
10506 | ||
10507 | pslldq m1, 2 | |
10508 | pinsrb m1, [r1 + 6], 1 | |
10509 | pinsrb m1, [r1 + 8], 0 | |
10510 | ||
10511 | pmaddubsw m1, [r5 + 24 * 16] | |
10512 | pmulhrsw m1, m7 | |
10513 | ||
10514 | packuswb m4, m1 | |
10515 | movu [r0 + 880], m4 | |
10516 | ||
10517 | ; mode 16 [row 0, 1] | |
10518 | ||
10519 | pmaddubsw m4, m2, [r5 + 11 * 16] | |
10520 | pmulhrsw m4, m7 | |
10521 | ||
10522 | pmaddubsw m5, m3, [r5 + 22 * 16] | |
10523 | pmulhrsw m5, m7 | |
10524 | ||
10525 | packuswb m4, m5 | |
10526 | movu [r0 + 896], m4 | |
10527 | ||
10528 | ; mode 16 [row 2, 3] | |
10529 | ||
10530 | pmaddubsw m4, m3, [r5 + 1 * 16] | |
10531 | pmulhrsw m4, m7 | |
10532 | ||
10533 | pslldq m3, 2 | |
10534 | pinsrb m3, [r1 + 2], 1 | |
10535 | pinsrb m3, [r1 + 3], 0 | |
10536 | ||
10537 | pmaddubsw m5, m3, [r5 + 12 * 16] | |
10538 | pmulhrsw m5, m7 | |
10539 | ||
10540 | packuswb m4, m5 | |
10541 | movu [r0 + 912], m4 | |
10542 | ||
10543 | ; mode 16 [row 4, 5] | |
10544 | ||
10545 | pslldq m3, 2 | |
10546 | pinsrb m3, [r1 + 3], 1 | |
10547 | pinsrb m3, [r1 + 5], 0 | |
10548 | ||
10549 | pmaddubsw m4, m3, [r5 + 23 * 16] | |
10550 | pmulhrsw m4, m7 | |
10551 | ||
10552 | pmaddubsw m5, m3, [r5 + 2 * 16] | |
10553 | pmulhrsw m5, m7 | |
10554 | ||
10555 | packuswb m4, m5 | |
10556 | movu [r0 + 928], m4 | |
10557 | ||
10558 | ; mode 16 [row 6, 7] | |
10559 | ||
10560 | pslldq m3, 2 | |
10561 | pinsrb m3, [r1 + 5], 1 | |
10562 | pinsrb m3, [r1 + 6], 0 | |
10563 | ||
10564 | pmaddubsw m4, m3, [r5 + 13 * 16] | |
10565 | pmulhrsw m4, m7 | |
10566 | ||
10567 | pslldq m3, 2 | |
10568 | pinsrb m3, [r1 + 6], 1 | |
10569 | pinsrb m3, [r1 + 8], 0 | |
10570 | ||
10571 | pmaddubsw m3, [r5 + 24 * 16] | |
10572 | pmulhrsw m3, m7 | |
10573 | ||
10574 | packuswb m4, m3 | |
10575 | movu [r0 + 944], m4 | |
10576 | ||
10577 | ; mode 17 [row 0, 1] | |
10578 | ||
10579 | pmaddubsw m4, m2, [r5 + 6 * 16] | |
10580 | pmulhrsw m4, m7 | |
10581 | ||
10582 | pslldq m2, 2 | |
10583 | pinsrb m2, [r1 + 0], 1 | |
10584 | pinsrb m2, [r1 + 1], 0 | |
10585 | ||
10586 | pmaddubsw m3, m2, [r5 + 12 * 16] | |
10587 | pmulhrsw m3, m7 | |
10588 | ||
10589 | packuswb m4, m3 | |
10590 | movu [r0 + 960], m4 | |
10591 | ||
10592 | ; mode 17 [row 2, 3] | |
10593 | ||
10594 | pslldq m2, 2 | |
10595 | pinsrb m2, [r1 + 1], 1 | |
10596 | pinsrb m2, [r1 + 2], 0 | |
10597 | ||
10598 | pmaddubsw m4, m2, [r5 + 18 * 16] | |
10599 | pmulhrsw m4, m7 | |
10600 | ||
10601 | pslldq m2, 2 | |
10602 | pinsrb m2, [r1 + 2], 1 | |
10603 | pinsrb m2, [r1 + 4], 0 | |
10604 | ||
10605 | pmaddubsw m3, m2, [r5 + 24 * 16] | |
10606 | pmulhrsw m3, m7 | |
10607 | ||
10608 | packuswb m4, m3 | |
10609 | movu [r0 + 976], m4 | |
10610 | ||
10611 | ; mode 17 [row 4, 5] | |
10612 | ||
10613 | pslldq m2, 2 | |
10614 | pinsrb m2, [r1 + 4], 1 | |
10615 | pinsrb m2, [r1 + 5], 0 | |
10616 | ||
10617 | pmaddubsw m4, m2, [r5 + 30 * 16] | |
10618 | pmulhrsw m4, m7 | |
10619 | ||
10620 | pmaddubsw m3, m2, [r5 + 4 * 16] | |
10621 | pmulhrsw m3, m7 | |
10622 | ||
10623 | packuswb m4, m3 | |
10624 | movu [r0 + 992], m4 | |
10625 | ||
10626 | ; mode 17 [row 6, 7] | |
10627 | ||
10628 | pslldq m2, 2 | |
10629 | pinsrb m2, [r1 + 5], 1 | |
10630 | pinsrb m2, [r1 + 6], 0 | |
10631 | ||
10632 | pmaddubsw m4, m2, [r5 + 10 * 16] | |
10633 | pmulhrsw m4, m7 | |
10634 | ||
10635 | pslldq m2, 2 | |
10636 | pinsrb m2, [r1 + 6], 1 | |
10637 | pinsrb m2, [r1 + 7], 0 | |
10638 | ||
10639 | pmaddubsw m3, m2, [r5 + 16 * 16] | |
10640 | pmulhrsw m3, m7 | |
10641 | ||
10642 | packuswb m4, m3 | |
10643 | movu [r0 + 1008], m4 | |
10644 | ||
10645 | ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7] | |
10646 | ||
10647 | movh m1, [r3] | |
10648 | movh [r0 + 1024], m1 | |
10649 | ||
10650 | pslldq m2, m1, 1 | |
10651 | pinsrb m2, [r4 + 1], 0 | |
10652 | movh [r0 + 1032], m2 | |
10653 | ||
10654 | pslldq m2, 1 | |
10655 | pinsrb m2, [r4 + 2], 0 | |
10656 | movh [r0 + 1040], m2 | |
10657 | ||
10658 | pslldq m2, 1 | |
10659 | pinsrb m2, [r4 + 3], 0 | |
10660 | movh [r0 + 1048], m2 | |
10661 | ||
10662 | pslldq m2, 1 | |
10663 | pinsrb m2, [r4 + 4], 0 | |
10664 | movh [r0 + 1056], m2 | |
10665 | ||
10666 | pslldq m2, 1 | |
10667 | pinsrb m2, [r4 + 5], 0 | |
10668 | movh [r0 + 1064], m2 | |
10669 | ||
10670 | pslldq m2, 1 | |
10671 | pinsrb m2, [r4 + 6], 0 | |
10672 | movh [r0 + 1072], m2 | |
10673 | ||
10674 | pslldq m2, 1 | |
10675 | pinsrb m2, [r4 + 7], 0 | |
10676 | movh [r0 + 1080], m2 | |
10677 | ||
10678 | ; mode 19 [row 0, 1] | |
10679 | ||
10680 | movu m0, [r1] | |
10681 | palignr m1, m0, 1 | |
10682 | punpcklbw m0, m1 | |
10683 | ||
10684 | pmaddubsw m1, m0, [r5 + 6 * 16] | |
10685 | pmulhrsw m1, m7 | |
10686 | ||
10687 | pslldq m2, m0, 2 | |
10688 | pinsrb m2, [r2 + 0], 1 | |
10689 | pinsrb m2, [r2 + 1], 0 | |
10690 | ||
10691 | pmaddubsw m3, m2, [r5 + 12 * 16] | |
10692 | pmulhrsw m3, m7 | |
10693 | ||
10694 | packuswb m1, m3 | |
10695 | movu [r0 + 1088], m1 | |
10696 | ||
10697 | ; mode 19 [row 2, 3] | |
10698 | ||
10699 | pslldq m2, 2 | |
10700 | pinsrb m2, [r2 + 1], 1 | |
10701 | pinsrb m2, [r2 + 2], 0 | |
10702 | ||
10703 | pmaddubsw m4, m2, [r5 + 18 * 16] | |
10704 | pmulhrsw m4, m7 | |
10705 | ||
10706 | pslldq m2, 2 | |
10707 | pinsrb m2, [r2 + 2], 1 | |
10708 | pinsrb m2, [r2 + 4], 0 | |
10709 | ||
10710 | pmaddubsw m5, m2, [r5 + 24 * 16] | |
10711 | pmulhrsw m5, m7 | |
10712 | ||
10713 | packuswb m4, m5 | |
10714 | movu [r0 + 1104], m4 | |
10715 | ||
10716 | ; mode 19 [row 4, 5] | |
10717 | ||
10718 | pslldq m2, 2 | |
10719 | pinsrb m2, [r2 + 4], 1 | |
10720 | pinsrb m2, [r2 + 5], 0 | |
10721 | ||
10722 | pmaddubsw m4, m2, [r5 + 30 * 16] | |
10723 | pmulhrsw m4, m7 | |
10724 | ||
10725 | pmaddubsw m5, m2, [r5 + 4 * 16] | |
10726 | pmulhrsw m5, m7 | |
10727 | ||
10728 | packuswb m4, m5 | |
10729 | movu [r0 + 1120], m4 | |
10730 | ||
10731 | ; mode 19 [row 6, 7] | |
10732 | ||
10733 | pslldq m2, 2 | |
10734 | pinsrb m2, [r2 + 5], 1 | |
10735 | pinsrb m2, [r2 + 6], 0 | |
10736 | ||
10737 | pmaddubsw m4, m2, [r5 + 10 * 16] | |
10738 | pmulhrsw m4, m7 | |
10739 | ||
10740 | pslldq m2, 2 | |
10741 | pinsrb m2, [r2 + 6], 1 | |
10742 | pinsrb m2, [r2 + 7], 0 | |
10743 | ||
10744 | pmaddubsw m2, [r5 + 16 * 16] | |
10745 | pmulhrsw m2, m7 | |
10746 | ||
10747 | packuswb m4, m2 | |
10748 | movu [r0 + 1136], m4 | |
10749 | ||
10750 | ; mode 20 [row 0, 1] | |
10751 | ||
10752 | pmaddubsw m3, m0, [r5 + 11 * 16] | |
10753 | pmulhrsw m3, m7 | |
10754 | ||
10755 | pslldq m1, m0, 2 | |
10756 | pinsrb m1, [r2 + 0], 1 | |
10757 | pinsrb m1, [r2 + 2], 0 | |
10758 | ||
10759 | pmaddubsw m4, m1, [r5 + 22 * 16] | |
10760 | pmulhrsw m4, m7 | |
10761 | ||
10762 | packuswb m3, m4 | |
10763 | movu [r0 + 1152], m3 | |
10764 | ||
10765 | ; mode 20 [row 2, 3] | |
10766 | ||
10767 | pmaddubsw m3, m1, [r5 + 1 * 16] | |
10768 | pmulhrsw m3, m7 | |
10769 | ||
10770 | pslldq m2, m1, 2 | |
10771 | pinsrb m2, [r2 + 2], 1 | |
10772 | pinsrb m2, [r2 + 3], 0 | |
10773 | ||
10774 | pmaddubsw m4, m2, [r5 + 12 * 16] | |
10775 | pmulhrsw m4, m7 | |
10776 | ||
10777 | packuswb m3, m4 | |
10778 | movu [r0 + 1168], m3 | |
10779 | ||
10780 | ; mode 20 [row 4, 5] | |
10781 | ||
10782 | pslldq m2, 2 | |
10783 | pinsrb m2, [r2 + 3], 1 | |
10784 | pinsrb m2, [r2 + 5], 0 | |
10785 | ||
10786 | pmaddubsw m3, m2, [r5 + 23 * 16] | |
10787 | pmulhrsw m3, m7 | |
10788 | ||
10789 | pmaddubsw m4, m2, [r5 + 2 * 16] | |
10790 | pmulhrsw m4, m7 | |
10791 | ||
10792 | packuswb m3, m4 | |
10793 | movu [r0 + 1184], m3 | |
10794 | ||
10795 | ; mode 20 [row 6, 7] | |
10796 | ||
10797 | pslldq m2, 2 | |
10798 | pinsrb m2, [r2 + 5], 1 | |
10799 | pinsrb m2, [r2 + 6], 0 | |
10800 | ||
10801 | pmaddubsw m3, m2, [r5 + 13 * 16] | |
10802 | pmulhrsw m3, m7 | |
10803 | ||
10804 | pslldq m2, 2 | |
10805 | pinsrb m2, [r2 + 6], 1 | |
10806 | pinsrb m2, [r2 + 8], 0 | |
10807 | ||
10808 | pmaddubsw m4, m2, [r5 + 24 * 16] | |
10809 | pmulhrsw m4, m7 | |
10810 | ||
10811 | packuswb m3, m4 | |
10812 | movu [r0 + 1200], m3 | |
10813 | ||
10814 | ; mode 21 [row 0, 1] | |
10815 | ||
10816 | pmaddubsw m2, m0, [r5 + 15 * 16] | |
10817 | pmulhrsw m2, m7 | |
10818 | ||
10819 | pmaddubsw m3, m1, [r5 + 30 * 16] | |
10820 | pmulhrsw m3, m7 | |
10821 | ||
10822 | packuswb m2, m3 | |
10823 | movu [r0 + 1216], m2 | |
10824 | ||
10825 | ; mode 21 [row 2, 3] | |
10826 | ||
10827 | pmaddubsw m2, m1, [r5 + 13 * 16] | |
10828 | pmulhrsw m2, m7 | |
10829 | ||
10830 | pslldq m3, m1, 2 | |
10831 | pinsrb m3, [r2 + 2], 1 | |
10832 | pinsrb m3, [r2 + 4], 0 | |
10833 | ||
10834 | pmaddubsw m4, m3, [r5 + 28 * 16] | |
10835 | pmulhrsw m4, m7 | |
10836 | ||
10837 | packuswb m2, m4 | |
10838 | movu [r0 + 1232], m2 | |
10839 | ||
10840 | ; mode 21 [row 4, 5] | |
10841 | ||
10842 | pmaddubsw m2, m3, [r5 + 11 * 16] | |
10843 | pmulhrsw m2, m7 | |
10844 | ||
10845 | pslldq m3, 2 | |
10846 | pinsrb m3, [r2 + 4], 1 | |
10847 | pinsrb m3, [r2 + 6], 0 | |
10848 | ||
10849 | pmaddubsw m4, m3, [r5 + 26 * 16] | |
10850 | pmulhrsw m4, m7 | |
10851 | ||
10852 | packuswb m2, m4 | |
10853 | movu [r0 + 1248], m2 | |
10854 | ||
10855 | ; mode 21 [row 6, 7] | |
10856 | ||
10857 | pmaddubsw m2, m3, [r5 + 9 * 16] | |
10858 | pmulhrsw m2, m7 | |
10859 | ||
10860 | pslldq m3, 2 | |
10861 | pinsrb m3, [r2 + 6], 1 | |
10862 | pinsrb m3, [r2 + 8], 0 | |
10863 | ||
10864 | pmaddubsw m4, m3, [r5 + 24 * 16] | |
10865 | pmulhrsw m4, m7 | |
10866 | ||
10867 | packuswb m2, m4 | |
10868 | movu [r0 + 1264], m2 | |
10869 | ||
10870 | ; mode 22 [row 0, 1] | |
10871 | ||
10872 | pmaddubsw m2, m0, [r5 + 19 * 16] | |
10873 | pmulhrsw m2, m7 | |
10874 | ||
10875 | pmaddubsw m4, m0, [r5 + 6 * 16] | |
10876 | pmulhrsw m4, m7 | |
10877 | ||
10878 | packuswb m2, m4 | |
10879 | movu [r0 + 1280], m2 | |
10880 | ||
10881 | ; mode 22 [row 2, 3] | |
10882 | ||
10883 | pmaddubsw m2, m1, [r5 + 25 * 16] | |
10884 | pmulhrsw m2, m7 | |
10885 | ||
10886 | pmaddubsw m3, m1, [r5 + 12 * 16] | |
10887 | pmulhrsw m3, m7 | |
10888 | ||
10889 | packuswb m2, m3 | |
10890 | movu [r0 + 1296], m2 | |
10891 | ||
10892 | ; mode 22 [row 4, 5] | |
10893 | ||
10894 | pslldq m1, 2 | |
10895 | pinsrb m1, [r2 + 5], 0 | |
10896 | pinsrb m1, [r2 + 2], 1 | |
10897 | ||
10898 | pmaddubsw m2, m1, [r5 + 31 * 16] | |
10899 | pmulhrsw m2, m7 | |
10900 | ||
10901 | pmaddubsw m3, m1, [r5 + 18 * 16] | |
10902 | pmulhrsw m3, m7 | |
10903 | ||
10904 | packuswb m2, m3 | |
10905 | movu [r0 + 1312], m2 | |
10906 | ||
10907 | ; mode 22 [row 6, 7] | |
10908 | ||
10909 | pmaddubsw m2, m1, [r5 + 5 * 16] | |
10910 | pmulhrsw m2, m7 | |
10911 | ||
10912 | pslldq m1, 2 | |
10913 | pinsrb m1, [r2 + 5], 1 | |
10914 | pinsrb m1, [r2 + 7], 0 | |
10915 | ||
10916 | pmaddubsw m1, [r5 + 24 * 16] | |
10917 | pmulhrsw m1, m7 | |
10918 | ||
10919 | packuswb m2, m1 | |
10920 | movu [r0 + 1328], m2 | |
10921 | ||
10922 | ; mode 23 [row 0, 1] | |
10923 | ||
10924 | pmaddubsw m2, m0, [r5 + 23 * 16] | |
10925 | pmulhrsw m2, m7 | |
10926 | ||
10927 | pmaddubsw m3, m0, [r5 + 14 * 16] | |
10928 | pmulhrsw m3, m7 | |
10929 | ||
10930 | packuswb m2, m3 | |
10931 | movu [r0 + 1344], m2 | |
10932 | ||
10933 | ; mode 23 [row 2, 3] | |
10934 | ||
10935 | pmaddubsw m2, m0, [r5 + 5 * 16] | |
10936 | pmulhrsw m2, m7 | |
10937 | ||
10938 | pslldq m1, m0, 2 | |
10939 | pinsrb m1, [r2 + 0], 1 | |
10940 | pinsrb m1, [r2 + 4], 0 | |
10941 | ||
10942 | pmaddubsw m3, m1, [r5 + 28 * 16] | |
10943 | pmulhrsw m3, m7 | |
10944 | ||
10945 | packuswb m2, m3 | |
10946 | movu [r0 + 1360], m2 | |
10947 | ||
10948 | ; mode 23 [row 4, 5] | |
10949 | ||
10950 | pmaddubsw m2, m1, [r5 + 19 * 16] | |
10951 | pmulhrsw m2, m7 | |
10952 | ||
10953 | pmaddubsw m3, m1, [r5 + 10 * 16] | |
10954 | pmulhrsw m3, m7 | |
10955 | ||
10956 | packuswb m2, m3 | |
10957 | movu [r0 + 1376], m2 | |
10958 | ||
10959 | ; mode 23 [row 6, 7] | |
10960 | ||
10961 | pmaddubsw m2, m1, [r5 + 1 * 16] | |
10962 | pmulhrsw m2, m7 | |
10963 | ||
10964 | pslldq m3, m1, 2 | |
10965 | pinsrb m3, [r2 + 4], 1 | |
10966 | pinsrb m3, [r2 + 7], 0 | |
10967 | ||
10968 | pmaddubsw m3, [r5 + 24 * 16] | |
10969 | pmulhrsw m3, m7 | |
10970 | ||
10971 | packuswb m2, m3 | |
10972 | movu [r0 + 1392], m2 | |
10973 | ||
10974 | ; mode 24 [row 0, 1] | |
10975 | ||
10976 | pmaddubsw m2, m0, [r5 + 27 * 16] | |
10977 | pmulhrsw m2, m7 | |
10978 | ||
10979 | pmaddubsw m5, m0, [r5 + 22 * 16] | |
10980 | pmulhrsw m5, m7 | |
10981 | ||
10982 | packuswb m2, m5 | |
10983 | movu [r0 + 1408], m2 | |
10984 | ||
10985 | ; mode 24 [row 2, 3] | |
10986 | ||
10987 | pmaddubsw m2, m0, [r5 + 17 * 16] | |
10988 | pmulhrsw m2, m7 | |
10989 | ||
10990 | pmaddubsw m3, m0, [r5 + 12 * 16] | |
10991 | pmulhrsw m3, m7 | |
10992 | ||
10993 | packuswb m2, m3 | |
10994 | movu [r0 + 1424], m2 | |
10995 | ||
10996 | ; mode 24 [row 4, 5] | |
10997 | ||
10998 | pmaddubsw m2, m0, [r5 + 7 * 16] | |
10999 | pmulhrsw m2, m7 | |
11000 | ||
11001 | pmaddubsw m3, m0, [r5 + 2 * 16] | |
11002 | pmulhrsw m3, m7 | |
11003 | ||
11004 | packuswb m2, m3 | |
11005 | movu [r0 + 1440], m2 | |
11006 | ||
11007 | ; mode 24 [row 6, 7] | |
11008 | ||
11009 | pinsrb m1, [r2 + 6], 0 | |
11010 | ||
11011 | pmaddubsw m2, m1, [r5 + 29 * 16] | |
11012 | pmulhrsw m2, m7 | |
11013 | ||
11014 | pmaddubsw m1, [r5 + 24 * 16] | |
11015 | pmulhrsw m1, m7 | |
11016 | ||
11017 | packuswb m2, m1 | |
11018 | movu [r0 + 1456], m2 | |
11019 | ||
11020 | ; mode 25 [row 0, 1] | |
11021 | ||
11022 | pmaddubsw m2, m0, [r5 + 30 * 16] | |
11023 | pmulhrsw m2, m7 | |
11024 | ||
11025 | pmaddubsw m1, m0, [r5 + 28 * 16] | |
11026 | pmulhrsw m1, m7 | |
11027 | ||
11028 | packuswb m2, m1 | |
11029 | movu [r0 + 1472], m2 | |
11030 | ||
11031 | ; mode 25 [row 2, 3] | |
11032 | ||
11033 | pmaddubsw m2, m0, [r5 + 26 * 16] | |
11034 | pmulhrsw m2, m7 | |
11035 | ||
11036 | pmaddubsw m1, m0, [r5 + 24 * 16] | |
11037 | pmulhrsw m1, m7 | |
11038 | ||
11039 | packuswb m2, m1 | |
11040 | movu [r0 + 1488], m2 | |
11041 | ||
11042 | ; mode 25 [row 4, 5] | |
11043 | ||
11044 | pmaddubsw m1, m0, [r5 + 20 * 16] | |
11045 | pmulhrsw m1, m7 | |
11046 | ||
11047 | packuswb m5, m1 | |
11048 | movu [r0 + 1504], m5 | |
11049 | ||
11050 | ; mode 25 [row 6, 7] | |
11051 | ||
11052 | pmaddubsw m2, m0, [r5 + 18 * 16] | |
11053 | pmulhrsw m2, m7 | |
11054 | ||
11055 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
11056 | pmulhrsw m1, m7 | |
11057 | ||
11058 | packuswb m2, m1 | |
11059 | movu [r0 + 1520], m2 | |
11060 | ||
11061 | ; mode 26 | |
11062 | ||
11063 | movu m0, [r1 + 1] | |
11064 | ||
11065 | pshufb m1, m0, [tab_Si] | |
11066 | movu [r0 + 1536], m1 | |
11067 | movu [r0 + 1552], m1 | |
11068 | movu [r0 + 1568], m1 | |
11069 | movu [r0 + 1584], m1 | |
11070 | ||
11071 | pxor m5, m5 | |
11072 | ||
11073 | pshufb m1, m1, m5 | |
11074 | punpcklbw m1, m5 | |
11075 | ||
11076 | movu m2, [r2] | |
11077 | ||
11078 | pshufb m3, m2, m5 | |
11079 | punpcklbw m3, m5 | |
11080 | ||
11081 | psrldq m4, m2, 1 | |
11082 | punpcklbw m4, m5 | |
11083 | ||
11084 | movu m2, [r2 + 9] | |
11085 | punpcklbw m2, m5 | |
11086 | ||
11087 | psubw m4, m3 | |
11088 | psubw m2, m3 | |
11089 | ||
11090 | psraw m4, 1 | |
11091 | psraw m2, 1 | |
11092 | ||
11093 | paddw m4, m1 | |
11094 | paddw m2, m1 | |
11095 | ||
11096 | packuswb m4, m2 | |
11097 | ||
11098 | pextrb [r0 + 1536], m4, 0 | |
11099 | pextrb [r0 + 1544], m4, 1 | |
11100 | pextrb [r0 + 1552], m4, 2 | |
11101 | pextrb [r0 + 1560], m4, 3 | |
11102 | pextrb [r0 + 1568], m4, 4 | |
11103 | pextrb [r0 + 1576], m4, 5 | |
11104 | pextrb [r0 + 1584], m4, 6 | |
11105 | pextrb [r0 + 1592], m4, 7 | |
11106 | ||
11107 | ; mode 27 [row 0, 1] | |
11108 | ||
11109 | palignr m6, m0, 1 | |
11110 | punpcklbw m4, m0, m6 | |
11111 | ||
11112 | pmaddubsw m1, m4, [r5 + 2 * 16] | |
11113 | pmulhrsw m1, m7 | |
11114 | ||
11115 | pmaddubsw m2, m4, [r5 + 4 * 16] | |
11116 | pmulhrsw m2, m7 | |
11117 | ||
11118 | packuswb m1, m2 | |
11119 | movu [r0 + 1600], m1 | |
11120 | ||
11121 | ; mode 27 [row 2, 3] | |
11122 | ||
11123 | pmaddubsw m1, m4, [r5 + 6 * 16] | |
11124 | pmulhrsw m1, m7 | |
11125 | ||
11126 | pmaddubsw m2, m4, [r5 + 8 * 16] | |
11127 | pmulhrsw m2, m7 | |
11128 | ||
11129 | packuswb m1, m2 | |
11130 | movu [r0 + 1616], m1 | |
11131 | ||
11132 | ; mode 27 [row 4, 5] | |
11133 | ||
11134 | pmaddubsw m3, m4, [r5 + 10 * 16] | |
11135 | pmulhrsw m3, m7 | |
11136 | ||
11137 | pmaddubsw m2, m4, [r5 + 12 * 16] | |
11138 | pmulhrsw m2, m7 | |
11139 | ||
11140 | packuswb m1, m3, m2 | |
11141 | movu [r0 + 1632], m1 | |
11142 | ||
11143 | ; mode 27 [row 6, 7] | |
11144 | ||
11145 | pmaddubsw m1, m4, [r5 + 14 * 16] | |
11146 | pmulhrsw m1, m7 | |
11147 | ||
11148 | pmaddubsw m2, m4, [r5 + 16 * 16] | |
11149 | pmulhrsw m2, m7 | |
11150 | ||
11151 | packuswb m1, m2 | |
11152 | movu [r0 + 1648], m1 | |
11153 | ||
11154 | ; mode 28 [row 0, 1] | |
11155 | ||
11156 | pmaddubsw m1, m4, [r5 + 5 * 16] | |
11157 | pmulhrsw m1, m7 | |
11158 | ||
11159 | packuswb m1, m3 | |
11160 | movu [r0 + 1664], m1 | |
11161 | ||
11162 | ; mode 28 [row 2, 3] | |
11163 | ||
11164 | pmaddubsw m1, m4, [r5 + 15 * 16] | |
11165 | pmulhrsw m1, m7 | |
11166 | ||
11167 | pmaddubsw m2, m4, [r5 + 20 * 16] | |
11168 | pmulhrsw m2, m7 | |
11169 | ||
11170 | packuswb m1, m2 | |
11171 | movu [r0 + 1680], m1 | |
11172 | ||
11173 | ; mode 28 [row 4, 5] | |
11174 | ||
11175 | pmaddubsw m1, m4, [r5 + 25 * 16] | |
11176 | pmulhrsw m1, m7 | |
11177 | ||
11178 | pmaddubsw m2, m4, [r5 + 30 * 16] | |
11179 | pmulhrsw m2, m7 | |
11180 | ||
11181 | packuswb m1, m2 | |
11182 | movu [r0 + 1696], m1 | |
11183 | ||
11184 | ; mode 28 [row 6, 7] | |
11185 | ||
11186 | palignr m1, m0, 2 | |
11187 | punpcklbw m5, m6, m1 | |
11188 | ||
11189 | pmaddubsw m2, m5, [r5 + 3 * 16] | |
11190 | pmulhrsw m2, m7 | |
11191 | ||
11192 | pmaddubsw m3, m5, [r5 + 8 * 16] | |
11193 | pmulhrsw m3, m7 | |
11194 | ||
11195 | packuswb m2, m3 | |
11196 | movu [r0 + 1712], m2 | |
11197 | ||
11198 | ; mode 29 [row 0, 1] | |
11199 | ||
11200 | pmaddubsw m2, m4, [r5 + 9 * 16] | |
11201 | pmulhrsw m2, m7 | |
11202 | ||
11203 | pmaddubsw m3, m4, [r5 + 18 * 16] | |
11204 | pmulhrsw m3, m7 | |
11205 | ||
11206 | packuswb m2, m3 | |
11207 | movu [r0 + 1728], m2 | |
11208 | ||
11209 | ; mode 29 [row 2, 3] | |
11210 | ||
11211 | pmaddubsw m2, m4, [r5 + 27 * 16] | |
11212 | pmulhrsw m2, m7 | |
11213 | ||
11214 | pmaddubsw m3, m5, [r5 + 4 * 16] | |
11215 | pmulhrsw m3, m7 | |
11216 | ||
11217 | packuswb m2, m3 | |
11218 | movu [r0 + 1744], m2 | |
11219 | ||
11220 | ; mode 29 [row 4, 5] | |
11221 | ||
11222 | pmaddubsw m2, m5, [r5 + 13 * 16] | |
11223 | pmulhrsw m2, m7 | |
11224 | ||
11225 | pmaddubsw m3, m5, [r5 + 22 * 16] | |
11226 | pmulhrsw m3, m7 | |
11227 | ||
11228 | packuswb m2, m3 | |
11229 | movu [r0 + 1760], m2 | |
11230 | ||
11231 | ; mode 29 [row 6, 7] | |
11232 | ||
11233 | pmaddubsw m2, m5, [r5 + 31 * 16] | |
11234 | pmulhrsw m2, m7 | |
11235 | ||
11236 | palignr m6, m0, 3 | |
11237 | punpcklbw m1, m6 | |
11238 | ||
11239 | pmaddubsw m3, m1, [r5 + 8 * 16] | |
11240 | pmulhrsw m3, m7 | |
11241 | ||
11242 | packuswb m2, m3 | |
11243 | movu [r0 + 1776], m2 | |
11244 | ||
11245 | ; mode 32 [row 2] | |
11246 | ||
11247 | movh [r0 + 1936], m2 | |
11248 | ||
11249 | ; mode 30 [row 0, 1] | |
11250 | ||
11251 | pmaddubsw m2, m4, [r5 + 13 * 16] | |
11252 | pmulhrsw m2, m7 | |
11253 | ||
11254 | pmaddubsw m3, m4, [r5 + 26 * 16] | |
11255 | pmulhrsw m3, m7 | |
11256 | ||
11257 | packuswb m2, m3 | |
11258 | movu [r0 + 1792], m2 | |
11259 | ||
11260 | ; mode 30 [row 2, 3] | |
11261 | ||
11262 | pmaddubsw m2, m5, [r5 + 7 * 16] | |
11263 | pmulhrsw m2, m7 | |
11264 | ||
11265 | pmaddubsw m3, m5, [r5 + 20 * 16] | |
11266 | pmulhrsw m3, m7 | |
11267 | ||
11268 | packuswb m2, m3 | |
11269 | movu [r0 + 1808], m2 | |
11270 | ||
11271 | ; mode 33 [row 1] | |
11272 | ||
11273 | movhps [r0 + 1992], m2 | |
11274 | ||
11275 | ; mode 30 [row 4, 5] | |
11276 | ||
11277 | pmaddubsw m2, m1, [r5 + 1 * 16] | |
11278 | pmulhrsw m2, m7 | |
11279 | ||
11280 | pmaddubsw m3, m1, [r5 + 14 * 16] | |
11281 | pmulhrsw m3, m7 | |
11282 | ||
11283 | packuswb m2, m3 | |
11284 | movu [r0 + 1824], m2 | |
11285 | ||
11286 | ; mode 33 [row 2] | |
11287 | ||
11288 | movhps [r0 + 2000], m2 | |
11289 | ||
11290 | ; mode 30 [row 6, 7] | |
11291 | ||
11292 | pmaddubsw m2, m1, [r5 + 27 * 16] | |
11293 | pmulhrsw m2, m7 | |
11294 | ||
11295 | psrldq m0, 4 | |
11296 | punpcklbw m6, m0 | |
11297 | ||
11298 | pmaddubsw m3, m6, [r5 + 8 * 16] | |
11299 | pmulhrsw m3, m7 | |
11300 | ||
11301 | packuswb m2, m3 | |
11302 | movu [r0 + 1840], m2 | |
11303 | ||
11304 | ; mode 33 [row 3] | |
11305 | ||
11306 | movhps [r0 + 2008], m2 | |
11307 | ||
11308 | ; mode 31 [row 0, 1] | |
11309 | ||
11310 | pmaddubsw m2, m4, [r5 + 17 * 16] | |
11311 | pmulhrsw m2, m7 | |
11312 | ||
11313 | pmaddubsw m3, m5, [r5 + 2 * 16] | |
11314 | pmulhrsw m3, m7 | |
11315 | ||
11316 | packuswb m2, m3 | |
11317 | movu [r0 + 1856], m2 | |
11318 | ||
11319 | ; mode 31 [row 2, 3] | |
11320 | ||
11321 | pmaddubsw m2, m5, [r5 + 19 * 16] | |
11322 | pmulhrsw m2, m7 | |
11323 | ||
11324 | pmaddubsw m3, m1, [r5 + 4 * 16] | |
11325 | pmulhrsw m3, m7 | |
11326 | ||
11327 | packuswb m2, m3 | |
11328 | movu [r0 + 1872], m2 | |
11329 | ||
11330 | ; mode 31 [row 4, 5] | |
11331 | ||
11332 | pmaddubsw m2, m1, [r5 + 21 * 16] | |
11333 | pmulhrsw m2, m7 | |
11334 | ||
11335 | pmaddubsw m3, m6, [r5 + 6 * 16] | |
11336 | pmulhrsw m3, m7 | |
11337 | ||
11338 | packuswb m2, m3 | |
11339 | movu [r0 + 1888], m2 | |
11340 | ||
11341 | ; mode 31 [row 6, 7] | |
11342 | ||
11343 | pmaddubsw m2, m6, [r5 + 23 * 16] | |
11344 | pmulhrsw m2, m7 | |
11345 | ||
11346 | movu m3, [r1 + 6] | |
11347 | punpcklbw m0, m3 | |
11348 | ||
11349 | pmaddubsw m3, m0, [r5 + 8 * 16] | |
11350 | pmulhrsw m3, m7 | |
11351 | ||
11352 | packuswb m2, m3 | |
11353 | movu [r0 + 1904], m2 | |
11354 | ||
11355 | ; mode 32 [row 0, 1] | |
11356 | ||
11357 | pmaddubsw m2, m4, [r5 + 21 * 16] | |
11358 | pmulhrsw m2, m7 | |
11359 | ||
11360 | pmaddubsw m3, m5, [r5 + 10 * 16] | |
11361 | pmulhrsw m3, m7 | |
11362 | ||
11363 | packuswb m2, m3 | |
11364 | movu [r0 + 1920], m2 | |
11365 | ||
11366 | ; mode 32 [row 3] | |
11367 | ||
11368 | pmaddubsw m2, m1, [r5 + 20 * 16] | |
11369 | pmulhrsw m2, m7 | |
11370 | ||
11371 | pxor m3, m3 | |
11372 | ||
11373 | packuswb m2, m3 | |
11374 | movh [r0 + 1944], m2 | |
11375 | ||
11376 | ; mode 32 [row 4, 5] | |
11377 | ||
11378 | pmaddubsw m2, m6, [r5 + 9 * 16] | |
11379 | pmulhrsw m2, m7 | |
11380 | ||
11381 | pmaddubsw m3, m6, [r5 + 30 * 16] | |
11382 | pmulhrsw m3, m7 | |
11383 | ||
11384 | packuswb m2, m3 | |
11385 | movu [r0 + 1952], m2 | |
11386 | ||
11387 | ; mode 33 [row 4, 5] | |
11388 | ||
11389 | pmaddubsw m2, m0, [r5 + 2 * 16] | |
11390 | pmulhrsw m2, m7 | |
11391 | ||
11392 | pmaddubsw m3, m0, [r5 + 28 * 16] | |
11393 | pmulhrsw m3, m7 | |
11394 | ||
11395 | packuswb m2, m3 | |
11396 | movu [r0 + 2016], m2 | |
11397 | ||
11398 | ; mode 32 [row 6] | |
11399 | ||
11400 | pmaddubsw m2, m0, [r5 + 19 * 16] | |
11401 | pmulhrsw m2, m7 | |
11402 | ||
11403 | ; mode 32 [row 7] | |
11404 | ||
11405 | movu m0, [r1 + 6] | |
11406 | palignr m3, m0, 1 | |
11407 | punpcklbw m0, m3 | |
11408 | ||
11409 | pmaddubsw m3, m0, [r5 + 8 * 16] | |
11410 | pmulhrsw m3, m7 | |
11411 | ||
11412 | packuswb m2, m3 | |
11413 | movu [r0 + 1968], m2 | |
11414 | ||
11415 | ; mode 33 [row 6, 7] | |
11416 | ||
11417 | pmaddubsw m2, m0, [r5 + 22 * 16] | |
11418 | pmulhrsw m2, m7 | |
11419 | ||
11420 | movu m0, [r1 + 7] | |
11421 | palignr m3, m0, 1 | |
11422 | punpcklbw m0, m3 | |
11423 | ||
11424 | pmaddubsw m3, m0, [r5 + 16 * 16] | |
11425 | pmulhrsw m3, m7 | |
11426 | ||
11427 | packuswb m2, m3 | |
11428 | movu [r0 + 2032], m2 | |
11429 | ||
11430 | ; mode 33 [row 0] | |
11431 | ||
11432 | pmaddubsw m2, m4, [r5 + 26 * 16] | |
11433 | pmulhrsw m2, m7 | |
11434 | ||
11435 | pxor m3, m3 | |
11436 | ||
11437 | packuswb m2, m3 | |
11438 | movh [r0 + 1984], m2 | |
11439 | ||
11440 | ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7] | |
11441 | ||
11442 | movu m0, [r3 + 2] | |
11443 | palignr m1, m0, 1 | |
11444 | punpcklqdq m2, m0, m1 | |
11445 | movu [r0 + 2048], m2 | |
11446 | ||
11447 | palignr m1, m0, 2 | |
11448 | palignr m2, m0, 3 | |
11449 | punpcklqdq m1, m2 | |
11450 | movu [r0 + 2064], m1 | |
11451 | ||
11452 | palignr m1, m0, 4 | |
11453 | palignr m2, m0, 5 | |
11454 | punpcklqdq m1, m2 | |
11455 | movu [r0 + 2080], m1 | |
11456 | ||
11457 | palignr m1, m0, 6 | |
11458 | palignr m2, m0, 7 | |
11459 | punpcklqdq m1, m2 | |
11460 | movu [r0 + 2096], m1 | |
11461 | ||
11462 | RET | |
11463 | ||
11464 | ;----------------------------------------------------------------------------- | |
11465 | ; void all_angs_pred_16x16(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) | |
11466 | ;----------------------------------------------------------------------------- | |
11467 | INIT_XMM sse4 | |
11468 | cglobal all_angs_pred_16x16, 6, 6, 8, dest, above0, left0, above1, left1, bLuma | |
11469 | ||
11470 | movu m0, [r4 + 2] | |
11471 | movu [r0 + 0 * 16], m0 | |
11472 | ||
11473 | movu m1, m0 | |
11474 | ||
11475 | movu m6, [r4 + 18] | |
11476 | palignr m5, m6, m0, 1 | |
11477 | movu [r0 + 1 * 16], m5 | |
11478 | ||
11479 | movu m4, m5 | |
11480 | ||
11481 | palignr m5, m6, m0, 2 | |
11482 | movu [r0 + 2 * 16], m5 | |
11483 | palignr m5, m6, m0, 3 | |
11484 | movu [r0 + 3 * 16], m5 | |
11485 | palignr m5, m6, m0, 4 | |
11486 | movu [r0 + 4 * 16], m5 | |
11487 | palignr m5, m6, m0, 5 | |
11488 | movu [r0 + 5 * 16], m5 | |
11489 | palignr m5, m6, m0, 6 | |
11490 | movu [r0 + 6 * 16], m5 | |
11491 | palignr m5, m6, m0, 7 | |
11492 | movu [r0 + 7 * 16], m5 | |
11493 | ||
11494 | movu m7, m5 | |
11495 | ||
11496 | palignr m5, m6, m0, 8 | |
11497 | movu [r0 + 8 * 16], m5 | |
11498 | ||
11499 | movu m2, m5 | |
11500 | ||
11501 | palignr m5, m6, m0, 9 | |
11502 | movu [r0 + 9 * 16], m5 | |
11503 | ||
11504 | palignr m3, m6, m0, 10 | |
11505 | movu [r0 + 10 * 16], m3 | |
11506 | palignr m3, m6, m0, 11 | |
11507 | movu [r0 + 11 * 16], m3 | |
11508 | palignr m3, m6, m0, 12 | |
11509 | movu [r0 + 12 * 16], m3 | |
11510 | ||
11511 | ; mode 3 [row 15] | |
11512 | movu [r0 + (3-2)*16*16 + 15 * 16], m3 | |
11513 | ||
11514 | palignr m3, m6, m0, 13 | |
11515 | movu [r0 + 13 * 16], m3 | |
11516 | palignr m3, m6, m0, 14 | |
11517 | movu [r0 + 14 * 16], m3 | |
11518 | palignr m3, m6, m0, 15 | |
11519 | movu [r0 + 15 * 16], m3 | |
11520 | ||
11521 | ; mode 3 [row 0] | |
11522 | lea r5, [ang_table] | |
11523 | movu m3, [pw_1024] | |
11524 | movu m0, [r4 + 1] | |
11525 | punpcklbw m0, m1 | |
11526 | ||
11527 | ; mode 17 [row 8 - second half] | |
11528 | pmaddubsw m1, m0, [r5 + 22 * 16] | |
11529 | pmulhrsw m1, m3 | |
11530 | packuswb m1, m1 | |
11531 | movh [r0 + 248 * 16 + 8], m1 | |
11532 | ; mode 17 [row 8 - second half] end | |
11533 | ||
11534 | pmaddubsw m1, m0, [r5 + 26 * 16] | |
11535 | pmulhrsw m1, m3 | |
11536 | punpcklbw m7, m2 | |
11537 | pmaddubsw m2, m7, [r5 + 26 * 16] | |
11538 | pmulhrsw m2, m3 | |
11539 | packuswb m1, m2 | |
11540 | movu [r0 + 16 * 16], m1 | |
11541 | ||
11542 | ;mode 6 [row 1] | |
11543 | movu [r0 + 65 * 16], m1 | |
11544 | ||
11545 | ; mode 4 [row 0] | |
11546 | pmaddubsw m1, m0, [r5 + 21 * 16] | |
11547 | pmulhrsw m1, m3 | |
11548 | pmaddubsw m2, m7, [r5 + 21 * 16] | |
11549 | pmulhrsw m2, m3 | |
11550 | packuswb m1, m2 | |
11551 | movu [r0 + 32 * 16], m1 | |
11552 | ||
11553 | ; mode 5 [row 0] | |
11554 | pmaddubsw m1, m0, [r5 + 17 * 16] | |
11555 | pmulhrsw m1, m3 | |
11556 | pmaddubsw m2, m7, [r5 + 17 * 16] | |
11557 | pmulhrsw m2, m3 | |
11558 | packuswb m1, m2 | |
11559 | movu [r0 + 48 * 16], m1 | |
11560 | ||
11561 | ; mode 6 [row 0] | |
11562 | pmaddubsw m1, m0, [r5 + 13 * 16] | |
11563 | pmulhrsw m1, m3 | |
11564 | pmaddubsw m2, m7, [r5 + 13 * 16] | |
11565 | pmulhrsw m2, m3 | |
11566 | packuswb m1, m2 | |
11567 | movu [r0 + 64 * 16], m1 | |
11568 | ||
11569 | ; mode 7 [row 0] | |
11570 | pmaddubsw m1, m0, [r5 + 9 * 16] | |
11571 | pmulhrsw m1, m3 | |
11572 | pmaddubsw m2, m7, [r5 + 9 * 16] | |
11573 | pmulhrsw m2, m3 | |
11574 | packuswb m1, m2 | |
11575 | movu [r0 + 80 * 16], m1 | |
11576 | ||
11577 | ; mode 7 [row 1] | |
11578 | pmaddubsw m1, m0, [r5 + 18 * 16] | |
11579 | pmulhrsw m1, m3 | |
11580 | pmaddubsw m2, m7, [r5 + 18 * 16] | |
11581 | pmulhrsw m2, m3 | |
11582 | packuswb m1, m2 | |
11583 | movu [r0 + 81 * 16], m1 | |
11584 | ||
11585 | ; mode 7 [row 2] | |
11586 | pmaddubsw m1, m0, [r5 + 27 * 16] | |
11587 | pmulhrsw m1, m3 | |
11588 | pmaddubsw m2, m7, [r5 + 27 * 16] | |
11589 | pmulhrsw m2, m3 | |
11590 | packuswb m1, m2 | |
11591 | movu [r0 + 82 * 16], m1 | |
11592 | ||
11593 | ; mode 8 [row 0] | |
11594 | pmaddubsw m1, m0, [r5 + 5 * 16] | |
11595 | pmulhrsw m1, m3 | |
11596 | pmaddubsw m2, m7, [r5 + 5 * 16] | |
11597 | pmulhrsw m2, m3 | |
11598 | packuswb m1, m2 | |
11599 | movu [r0 + 96 * 16], m1 | |
11600 | ||
11601 | ; mode 8 [row 1] | |
11602 | pmaddubsw m1, m0, [r5 + 10 * 16] | |
11603 | pmulhrsw m1, m3 | |
11604 | pmaddubsw m2, m7, [r5 + 10 * 16] | |
11605 | pmulhrsw m2, m3 | |
11606 | packuswb m1, m2 | |
11607 | movu [r0 + 97 * 16], m1 | |
11608 | ||
11609 | ; mode 8 [row 2] | |
11610 | pmaddubsw m1, m0, [r5 + 15 * 16] | |
11611 | pmulhrsw m1, m3 | |
11612 | pmaddubsw m2, m7, [r5 + 15 * 16] | |
11613 | pmulhrsw m2, m3 | |
11614 | packuswb m1, m2 | |
11615 | movu [r0 + 98 * 16], m1 | |
11616 | ||
11617 | ; mode 8 [row 3] | |
11618 | pmaddubsw m1, m0, [r5 + 20 * 16] | |
11619 | pmulhrsw m1, m3 | |
11620 | pmaddubsw m2, m7, [r5 + 20 * 16] | |
11621 | pmulhrsw m2, m3 | |
11622 | packuswb m1, m2 | |
11623 | movu [r0 + 99 * 16], m1 | |
11624 | ||
11625 | ; mode 8 [row 4] | |
11626 | pmaddubsw m1, m0, [r5 + 25 * 16] | |
11627 | pmulhrsw m1, m3 | |
11628 | pmaddubsw m2, m7, [r5 + 25 * 16] | |
11629 | pmulhrsw m2, m3 | |
11630 | packuswb m1, m2 | |
11631 | movu [r0 + 100 * 16], m1 | |
11632 | ||
11633 | ; mode 8 [row 5] | |
11634 | pmaddubsw m1, m0, [r5 + 30 * 16] | |
11635 | pmulhrsw m1, m3 | |
11636 | pmaddubsw m2, m7, [r5 + 30 * 16] | |
11637 | pmulhrsw m2, m3 | |
11638 | packuswb m1, m2 | |
11639 | movu [r0 + 101 * 16], m1 | |
11640 | ||
11641 | ; mode 15 [row 13 - second half] | |
11642 | pmaddubsw m1, m0, [r5 + 18 * 16] | |
11643 | pmulhrsw m1, m3 | |
11644 | packuswb m1, m1 | |
11645 | movh [r0 + 221 * 16 + 8], m1 | |
11646 | ; mode 15 [row 13 - second half] end | |
11647 | ||
11648 | ; mode 15 [row 14 - second half] | |
11649 | pmaddubsw m1, m0, [r5 + 1 * 16] | |
11650 | pmulhrsw m1, m3 | |
11651 | packuswb m1, m1 | |
11652 | movh [r0 + 222 * 16 + 8], m1 | |
11653 | ; mode 15 [row 14 - second half] end | |
11654 | ||
11655 | ; mode 16 [row 10 - second half] | |
11656 | pmaddubsw m1, m0, [r5 + 25 * 16] | |
11657 | pmulhrsw m1, m3 | |
11658 | packuswb m1, m1 | |
11659 | movh [r0 + 234 * 16 + 8], m1 | |
11660 | ; mode 16 [row 10 - second half] end | |
11661 | ||
11662 | ; mode 16 [row 11 - second half] | |
11663 | pmaddubsw m1, m0, [r5 + 4 * 16] | |
11664 | pmulhrsw m1, m3 | |
11665 | packuswb m1, m1 | |
11666 | movh [r0 + 235 * 16 + 8], m1 | |
11667 | ; mode 16 [row 11 - second half] end | |
11668 | ||
11669 | ; mode 3 [row 1] | |
11670 | movu m6, [r5 + 20 * 16] | |
11671 | movu m0, [r4 + 2] | |
11672 | punpcklbw m0, m4 | |
11673 | ||
11674 | ; mode 17 [row 7 - second half] | |
11675 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
11676 | pmulhrsw m1, m3 | |
11677 | packuswb m1, m1 | |
11678 | movh [r0 + 247 * 16 + 8], m1 | |
11679 | ||
11680 | ; mode 17 [row 7 - second half] end | |
11681 | pmaddubsw m1, m0, m6 | |
11682 | pmulhrsw m1, m3 | |
11683 | movu m2, [r4 + 10] | |
11684 | punpcklbw m2, m5 | |
11685 | pmaddubsw m4, m2, m6 | |
11686 | pmulhrsw m4, m3 | |
11687 | packuswb m1, m4 | |
11688 | movu [r0 + 17 * 16], m1 | |
11689 | ||
11690 | ;mode 6 [row 3] | |
11691 | movu [r0 + 67 * 16], m1 | |
11692 | ||
11693 | ; mode 4 row [row 1] | |
11694 | pmaddubsw m1, m0, [r5 + 10 * 16] | |
11695 | pmulhrsw m1, m3 | |
11696 | pmaddubsw m4, m2, [r5 + 10 * 16] | |
11697 | pmulhrsw m4, m3 | |
11698 | packuswb m1, m4 | |
11699 | movu [r0 + 33 * 16], m1 | |
11700 | ||
11701 | ; mode 4 row [row 2] | |
11702 | pmaddubsw m1, m0, [r5 + 31 * 16] | |
11703 | pmulhrsw m1, m3 | |
11704 | pmaddubsw m4, m2, [r5 + 31 * 16] | |
11705 | pmulhrsw m4, m3 | |
11706 | packuswb m1, m4 | |
11707 | movu [r0 + 34 * 16], m1 | |
11708 | ||
11709 | ; mode 7 [row 6] | |
11710 | movu [r0 + 86 * 16], m1 | |
11711 | ||
11712 | ; mode 5 row [row 1] | |
11713 | pmaddubsw m1, m0, [r5 + 2 * 16] | |
11714 | pmulhrsw m1, m3 | |
11715 | pmaddubsw m4, m2, [r5 + 2 * 16] | |
11716 | pmulhrsw m4, m3 | |
11717 | packuswb m1, m4 | |
11718 | movu [r0 + 49 * 16], m1 | |
11719 | ||
11720 | ; mode 5 row [row 2] | |
11721 | pmaddubsw m1, m0, [r5 + 19 * 16] | |
11722 | pmulhrsw m1, m3 | |
11723 | pmaddubsw m4, m2, [r5 + 19 * 16] | |
11724 | pmulhrsw m4, m3 | |
11725 | packuswb m1, m4 | |
11726 | movu [r0 + 50 * 16], m1 | |
11727 | ||
11728 | ; mode 6 [row 2] | |
11729 | pmaddubsw m1, m0, [r5 + 7 * 16] | |
11730 | pmulhrsw m1, m3 | |
11731 | pmaddubsw m4, m2, [r5 + 7 * 16] | |
11732 | pmulhrsw m4, m3 | |
11733 | packuswb m1, m4 | |
11734 | movu [r0 + 66 * 16], m1 | |
11735 | ||
11736 | ; mode 7 [row 3] | |
11737 | pmaddubsw m1, m0, [r5 + 4 * 16] | |
11738 | pmulhrsw m1, m3 | |
11739 | pmaddubsw m4, m2, [r5 + 4 * 16] | |
11740 | pmulhrsw m4, m3 | |
11741 | packuswb m1, m4 | |
11742 | movu [r0 + 83 * 16], m1 | |
11743 | ||
11744 | ; mode 7 [row 4] | |
11745 | pmaddubsw m1, m0, [r5 + 13 * 16] | |
11746 | pmulhrsw m1, m3 | |
11747 | pmaddubsw m4, m2, [r5 + 13 * 16] | |
11748 | pmulhrsw m4, m3 | |
11749 | packuswb m1, m4 | |
11750 | movu [r0 + 84 * 16], m1 | |
11751 | ||
11752 | ; mode 8 [row 8] | |
11753 | movu [r0 + 104 * 16], m1 | |
11754 | ||
11755 | ; mode 7 [row 5] | |
11756 | pmaddubsw m1, m0, [r5 + 22 * 16] | |
11757 | pmulhrsw m1, m3 | |
11758 | pmaddubsw m4, m2, [r5 + 22 * 16] | |
11759 | pmulhrsw m4, m3 | |
11760 | packuswb m1, m4 | |
11761 | movu [r0 + 85 * 16], m1 | |
11762 | ||
11763 | ; mode 8 [row 6] | |
11764 | pmaddubsw m1, m0, [r5 + 3 * 16] | |
11765 | pmulhrsw m1, m3 | |
11766 | pmaddubsw m4, m2, [r5 + 3 * 16] | |
11767 | pmulhrsw m4, m3 | |
11768 | packuswb m1, m4 | |
11769 | movu [r0 + 102 * 16], m1 | |
11770 | ||
11771 | ; mode 8 [row 7] | |
11772 | pmaddubsw m1, m0, [r5 + 8 * 16] | |
11773 | pmulhrsw m1, m3 | |
11774 | pmaddubsw m4, m2, [r5 + 8 * 16] | |
11775 | pmulhrsw m4, m3 | |
11776 | packuswb m1, m4 | |
11777 | movu [r0 + 103 * 16], m1 | |
11778 | ||
11779 | ; mode 8 [row 9] | |
11780 | pmaddubsw m1, m0, [r5 + 18 * 16] | |
11781 | pmulhrsw m1, m3 | |
11782 | pmaddubsw m4, m2, [r5 + 18 * 16] | |
11783 | pmulhrsw m4, m3 | |
11784 | packuswb m1, m4 | |
11785 | movu [r0 + 105 * 16], m1 | |
11786 | ||
11787 | ; mode 8 [row 10] | |
11788 | pmaddubsw m1, m0, [r5 + 23 * 16] | |
11789 | pmulhrsw m1, m3 | |
11790 | pmaddubsw m4, m2, [r5 + 23 * 16] | |
11791 | pmulhrsw m4, m3 | |
11792 | packuswb m1, m4 | |
11793 | movu [r0 + 106 * 16], m1 | |
11794 | ||
11795 | ; mode 8 [row 11] | |
11796 | pmaddubsw m1, m0, [r5 + 28 * 16] | |
11797 | pmulhrsw m1, m3 | |
11798 | pmaddubsw m4, m2, [r5 + 28 * 16] | |
11799 | pmulhrsw m4, m3 | |
11800 | packuswb m1, m4 | |
11801 | movu [r0 + 107 * 16], m1 | |
11802 | ||
11803 | ; mode 3 [row 2] | |
11804 | movu m0, [r4 + 3] | |
11805 | movd m1, [r4 + 19] | |
11806 | palignr m1, m0, 1 | |
11807 | punpcklbw m0, m1 | |
11808 | ||
11809 | ; mode 17 [row 6 - second half] | |
11810 | pmaddubsw m1, m0, [r5 + 10 * 16] | |
11811 | pmulhrsw m1, m3 | |
11812 | packuswb m1, m1 | |
11813 | movh [r0 + 246 * 16 + 8], m1 | |
11814 | ; mode 17 [row 6 - second half] end | |
11815 | ||
11816 | pmaddubsw m1, m0, [r5 + 14 * 16] | |
11817 | pmulhrsw m1, m3 | |
11818 | movu m2, [r4 + 11] | |
11819 | movd m4, [r4 + 27] | |
11820 | palignr m4, m2, 1 | |
11821 | punpcklbw m2, m4 | |
11822 | pmaddubsw m4, m2, [r5 + 14 * 16] | |
11823 | pmulhrsw m4, m3 | |
11824 | packuswb m1, m4 | |
11825 | movu [r0 + 18 * 16], m1 | |
11826 | ||
11827 | ; mode 6 [row 5] | |
11828 | movu [r0 + 69 * 16], m1 | |
11829 | ||
11830 | ; mode 4 row [row 3] | |
11831 | pmaddubsw m1, m0, [r5 + 20 * 16] | |
11832 | pmulhrsw m1, m3 | |
11833 | pmaddubsw m4, m2, [r5 + 20 * 16] | |
11834 | pmulhrsw m4, m3 | |
11835 | packuswb m1, m4 | |
11836 | movu [r0 + 35 * 16], m1 | |
11837 | ||
11838 | ; mode 5 row [row 3] | |
11839 | pmaddubsw m1, m0, [r5 + 4 * 16] | |
11840 | pmulhrsw m1, m3 | |
11841 | pmaddubsw m4, m2, [r5 + 4 * 16] | |
11842 | pmulhrsw m4, m3 | |
11843 | packuswb m1, m4 | |
11844 | movu [r0 + 51 * 16], m1 | |
11845 | ||
11846 | ; mode 5 row [row 4] | |
11847 | pmaddubsw m1, m0, [r5 + 21 * 16] | |
11848 | pmulhrsw m1, m3 | |
11849 | pmaddubsw m4, m2, [r5 + 21 * 16] | |
11850 | pmulhrsw m4, m3 | |
11851 | packuswb m1, m4 | |
11852 | movu [r0 + 52 * 16], m1 | |
11853 | ||
11854 | ; mode 6 [row 4] | |
11855 | pmaddubsw m1, m0, [r5 + 1 * 16] | |
11856 | pmulhrsw m1, m3 | |
11857 | pmaddubsw m4, m2, [r5 + 1 * 16] | |
11858 | pmulhrsw m4, m3 | |
11859 | packuswb m1, m4 | |
11860 | movu [r0 + 68 * 16], m1 | |
11861 | ||
11862 | ; mode 6 [row 6] | |
11863 | pmaddubsw m1, m0, [r5 + 27 * 16] | |
11864 | pmulhrsw m1, m3 | |
11865 | pmaddubsw m4, m2, [r5 + 27 * 16] | |
11866 | pmulhrsw m4, m3 | |
11867 | packuswb m1, m4 | |
11868 | movu [r0 + 70 * 16], m1 | |
11869 | ||
11870 | ; mode 7 [row 7] | |
11871 | pmaddubsw m1, m0, [r5 + 8 * 16] | |
11872 | pmulhrsw m1, m3 | |
11873 | pmaddubsw m4, m2, [r5 + 8 * 16] | |
11874 | pmulhrsw m4, m3 | |
11875 | packuswb m1, m4 | |
11876 | movu [r0 + 87 * 16], m1 | |
11877 | ||
11878 | ; mode 7 [row 8] | |
11879 | pmaddubsw m1, m0, [r5 + 17 * 16] | |
11880 | pmulhrsw m1, m3 | |
11881 | pmaddubsw m4, m2, [r5 + 17 * 16] | |
11882 | pmulhrsw m4, m3 | |
11883 | packuswb m1, m4 | |
11884 | movu [r0 + 88 * 16], m1 | |
11885 | ||
11886 | ; mode 7 [row 9] | |
11887 | pmaddubsw m1, m0, [r5 + 26 * 16] | |
11888 | pmulhrsw m1, m3 | |
11889 | pmaddubsw m4, m2, [r5 + 26 * 16] | |
11890 | pmulhrsw m4, m3 | |
11891 | packuswb m1, m4 | |
11892 | movu [r0 + 89 * 16], m1 | |
11893 | ||
11894 | ; mode 8 [row 12] | |
11895 | pmaddubsw m1, m0, [r5 + 1 * 16] | |
11896 | pmulhrsw m1, m3 | |
11897 | pmaddubsw m4, m2, [r5 + 1 * 16] | |
11898 | pmulhrsw m4, m3 | |
11899 | packuswb m1, m4 | |
11900 | movu [r0 + 108 * 16], m1 | |
11901 | ||
11902 | ; mode 8 [row 13] | |
11903 | pmaddubsw m1, m0, [r5 + 6 * 16] | |
11904 | pmulhrsw m1, m3 | |
11905 | pmaddubsw m4, m2, [r5 + 6 * 16] | |
11906 | pmulhrsw m4, m3 | |
11907 | packuswb m1, m4 | |
11908 | movu [r0 + 109 * 16], m1 | |
11909 | ||
11910 | ; mode 8 [row 14] | |
11911 | pmaddubsw m1, m0, [r5 + 11 * 16] | |
11912 | pmulhrsw m1, m3 | |
11913 | pmaddubsw m4, m2, [r5 + 11 * 16] | |
11914 | pmulhrsw m4, m3 | |
11915 | packuswb m1, m4 | |
11916 | movu [r0 + 110 * 16], m1 | |
11917 | ||
11918 | ; mode 8 [row 15] | |
11919 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
11920 | pmulhrsw m1, m3 | |
11921 | pmaddubsw m4, m2, [r5 + 16 * 16] | |
11922 | pmulhrsw m4, m3 | |
11923 | packuswb m1, m4 | |
11924 | movu [r0 + 111 * 16], m1 | |
11925 | ||
11926 | ; mode 3 [row 3] | |
11927 | movu m0, [r4 + 4] | |
11928 | movd m1, [r4 + 20] | |
11929 | palignr m1, m0, 1 | |
11930 | punpcklbw m0, m1 | |
11931 | ||
11932 | ; mode 17 [row 4 - second half] | |
11933 | pmaddubsw m1, m0, [r5 + 30 * 16] | |
11934 | pmulhrsw m1, m3 | |
11935 | packuswb m1, m1 | |
11936 | movh [r0 + 244 * 16 + 8], m1 | |
11937 | ; mode 17 [row 4 - second half] end | |
11938 | ||
11939 | ; mode 17 [row 5 - second half] | |
11940 | pmaddubsw m1, m0, [r5 + 4 * 16] | |
11941 | pmulhrsw m1, m3 | |
11942 | packuswb m1, m1 | |
11943 | movh [r0 + 245 * 16 + 8], m1 | |
11944 | ; mode 17 [row 5 - second half] end | |
11945 | ||
11946 | pmaddubsw m1, m0, [r5 + 8 * 16] | |
11947 | pmulhrsw m1, m3 | |
11948 | movu m2, [r4 + 12] | |
11949 | movd m4, [r4 + 28] | |
11950 | palignr m4, m2, 1 | |
11951 | punpcklbw m2, m4 | |
11952 | pmaddubsw m4, m2, [r5 + 8 * 16] | |
11953 | pmulhrsw m4, m3 | |
11954 | packuswb m1, m4 | |
11955 | movu [r0 + 19 * 16], m1 | |
11956 | ||
11957 | ; mode 6 [row 7] | |
11958 | movu [r0 + 71 * 16], m1 | |
11959 | ||
11960 | ; mode 4 row [row 4] | |
11961 | pmaddubsw m1, m0, [r5 + 9 * 16] | |
11962 | pmulhrsw m1, m3 | |
11963 | pmaddubsw m4, m2, [r5 + 9 * 16] | |
11964 | pmulhrsw m4, m3 | |
11965 | packuswb m1, m4 | |
11966 | movu [r0 + 36 * 16], m1 | |
11967 | ||
11968 | ; mode 4 row [row 5] | |
11969 | pmaddubsw m1, m0, [r5 + 30 * 16] | |
11970 | pmulhrsw m1, m3 | |
11971 | pmaddubsw m4, m2, [r5 + 30 * 16] | |
11972 | pmulhrsw m4, m3 | |
11973 | packuswb m1, m4 | |
11974 | movu [r0 + 37 * 16], m1 | |
11975 | ||
11976 | ; mode 7 row [row 13] | |
11977 | movu [r0 + 93 * 16], m1 | |
11978 | ||
11979 | ; mode 5 row [row 5] | |
11980 | pmaddubsw m1, m0, [r5 + 6 * 16] | |
11981 | pmulhrsw m1, m3 | |
11982 | pmaddubsw m4, m2, [r5 + 6 * 16] | |
11983 | pmulhrsw m4, m3 | |
11984 | packuswb m1, m4 | |
11985 | movu [r0 + 53 * 16], m1 | |
11986 | ||
11987 | ; mode 5 row [row 6] | |
11988 | pmaddubsw m1, m0, [r5 + 23 * 16] | |
11989 | pmulhrsw m1, m3 | |
11990 | pmaddubsw m4, m2, [r5 + 23 * 16] | |
11991 | pmulhrsw m4, m3 | |
11992 | packuswb m1, m4 | |
11993 | movu [r0 + 54 * 16], m1 | |
11994 | ||
11995 | ; mode 6 [row 8] | |
11996 | pmaddubsw m1, m0, [r5 + 21 * 16] | |
11997 | pmulhrsw m1, m3 | |
11998 | pmaddubsw m4, m2, [r5 + 21 * 16] | |
11999 | pmulhrsw m4, m3 | |
12000 | packuswb m1, m4 | |
12001 | movu [r0 + 72 * 16], m1 | |
12002 | ||
12003 | ; mode 7 [row 12] | |
12004 | movu [r0 + 92 * 16], m1 | |
12005 | ||
12006 | ; mode 7 [row 10] | |
12007 | pmaddubsw m1, m0, [r5 + 3 * 16] | |
12008 | pmulhrsw m1, m3 | |
12009 | pmaddubsw m4, m2, [r5 + 3 * 16] | |
12010 | pmulhrsw m4, m3 | |
12011 | packuswb m1, m4 | |
12012 | movu [r0 + 90 * 16], m1 | |
12013 | ||
12014 | ; mode 7 [row 11] | |
12015 | pmaddubsw m1, m0, [r5 + 12 * 16] | |
12016 | pmulhrsw m1, m3 | |
12017 | pmaddubsw m4, m2, [r5 + 12 * 16] | |
12018 | pmulhrsw m4, m3 | |
12019 | packuswb m1, m4 | |
12020 | movu [r0 + 91 * 16], m1 | |
12021 | ||
12022 | ; mode 3 [row 4] | |
12023 | movu m0, [r4 + 5] | |
12024 | movd m1, [r4 + 20] | |
12025 | palignr m1, m0, 1 | |
12026 | punpcklbw m0, m1 | |
12027 | ||
12028 | ; mode 17 [row 3 - second half] | |
12029 | pmaddubsw m1, m0, [r5 + 24 * 16] | |
12030 | pmulhrsw m1, m3 | |
12031 | packuswb m1, m1 | |
12032 | movh [r0 + 243 * 16 + 8], m1 | |
12033 | ||
12034 | ; mode 17 [row 3 - second half] end | |
12035 | pmaddubsw m1, m0, [r5 + 2 * 16] | |
12036 | pmulhrsw m1, m3 | |
12037 | movu m2, [r4 + 13] | |
12038 | movd m4, [r4 + 29] | |
12039 | palignr m4, m2, 1 | |
12040 | punpcklbw m2, m4 | |
12041 | pmaddubsw m4, m2, [r5 + 2 * 16] | |
12042 | pmulhrsw m4, m3 | |
12043 | packuswb m1, m4 | |
12044 | movu [r0 + 20 * 16], m1 | |
12045 | ||
12046 | ;mode 6 [row 9] | |
12047 | movu [r0 + 73 * 16], m1 | |
12048 | ||
12049 | ; mode 4 row [row 6] | |
12050 | movu m6, [r5 + 19 * 16] | |
12051 | pmaddubsw m1, m0, m6 | |
12052 | pmulhrsw m1, m3 | |
12053 | pmaddubsw m4, m2, m6 | |
12054 | pmulhrsw m4, m3 | |
12055 | packuswb m1, m4 | |
12056 | movu [r0 + 38 * 16], m1 | |
12057 | ||
12058 | ; mode 3 [row 5] | |
12059 | pmaddubsw m1, m0, [r5 + 28 * 16] | |
12060 | pmulhrsw m1, m3 | |
12061 | pmaddubsw m4, m2, [r5 + 28 * 16] | |
12062 | pmulhrsw m4, m3 | |
12063 | packuswb m1, m4 | |
12064 | movu [r0 + 21 * 16], m1 | |
12065 | ||
12066 | ;mode 6 [row 11] | |
12067 | movu [r0 + 75 * 16], m1 | |
12068 | ||
12069 | ; mode 5 row [row 7] | |
12070 | pmaddubsw m1, m0, [r5 + 8 * 16] | |
12071 | pmulhrsw m1, m3 | |
12072 | pmaddubsw m4, m2, [r5 + 8 * 16] | |
12073 | pmulhrsw m4, m3 | |
12074 | packuswb m1, m4 | |
12075 | movu [r0 + 55 * 16], m1 | |
12076 | ||
12077 | ; mode 5 row [row 8] | |
12078 | pmaddubsw m1, m0, [r5 + 25 * 16] | |
12079 | pmulhrsw m1, m3 | |
12080 | pmaddubsw m4, m2, [r5 + 25 * 16] | |
12081 | pmulhrsw m4, m3 | |
12082 | packuswb m1, m4 | |
12083 | movu [r0 + 56 * 16], m1 | |
12084 | ||
12085 | ; mode 6 [row 10] | |
12086 | pmaddubsw m1, m0, [r5 + 15 * 16] | |
12087 | pmulhrsw m1, m3 | |
12088 | pmaddubsw m4, m2, [r5 + 15 * 16] | |
12089 | pmulhrsw m4, m3 | |
12090 | packuswb m1, m4 | |
12091 | movu [r0 + 74 * 16], m1 | |
12092 | ||
12093 | ; mode 7 [row 14] | |
12094 | pmaddubsw m1, m0, [r5 + 7 * 16] | |
12095 | pmulhrsw m1, m3 | |
12096 | pmaddubsw m4, m2, [r5 + 7 * 16] | |
12097 | pmulhrsw m4, m3 | |
12098 | packuswb m1, m4 | |
12099 | movu [r0 + 94 * 16], m1 | |
12100 | ||
12101 | ; mode 7 [row 15] | |
12102 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
12103 | pmulhrsw m1, m3 | |
12104 | pmaddubsw m4, m2, [r5 + 16 * 16] | |
12105 | pmulhrsw m4, m3 | |
12106 | packuswb m1, m4 | |
12107 | movu [r0 + 95 * 16], m1 | |
12108 | ||
12109 | ; mode 3 [row 6] | |
12110 | movu m0, [r4 + 6] | |
12111 | movd m1, [r4 + 22] | |
12112 | palignr m1, m0, 1 | |
12113 | punpcklbw m0, m1 | |
12114 | ||
12115 | ; mode 17 [row 2 - second half] | |
12116 | pmaddubsw m1, m0, [r5 + 18 * 16] | |
12117 | pmulhrsw m1, m3 | |
12118 | packuswb m1, m1 | |
12119 | movh [r0 + 242 * 16 + 8], m1 | |
12120 | ; mode 17 [row 2 - second half] end | |
12121 | ||
12122 | pmaddubsw m1, m0, [r5 + 22 * 16] | |
12123 | pmulhrsw m1, m3 | |
12124 | movu m2, [r4 + 14] | |
12125 | movd m4, [r4 + 30] | |
12126 | palignr m4, m2, 1 | |
12127 | punpcklbw m2, m4 | |
12128 | pmaddubsw m4, m2, [r5 + 22 * 16] | |
12129 | pmulhrsw m4, m3 | |
12130 | packuswb m1, m4 | |
12131 | movu [r0 + 22 * 16], m1 | |
12132 | ||
12133 | ; mode 6 [row 13] | |
12134 | movu [r0 + 77 * 16], m1 | |
12135 | ||
12136 | ; mode 4 row [row 7] | |
12137 | pmaddubsw m1, m0, [r5 + 8 * 16] | |
12138 | pmulhrsw m1, m3 | |
12139 | pmaddubsw m4, m2, [r5 + 8 * 16] | |
12140 | pmulhrsw m4, m3 | |
12141 | packuswb m1, m4 | |
12142 | movu [r0 + 39 * 16], m1 | |
12143 | ||
12144 | ; mode 4 row [row 8] | |
12145 | pmaddubsw m1, m0, [r5 + 29 * 16] | |
12146 | pmulhrsw m1, m3 | |
12147 | pmaddubsw m4, m2, [r5 + 29 * 16] | |
12148 | pmulhrsw m4, m3 | |
12149 | packuswb m1, m4 | |
12150 | movu [r0 + 40 * 16], m1 | |
12151 | ||
12152 | ; mode 5 row [row 9] | |
12153 | pmaddubsw m1, m0, [r5 + 10 * 16] | |
12154 | pmulhrsw m1, m3 | |
12155 | pmaddubsw m4, m2, [r5 + 10 * 16] | |
12156 | pmulhrsw m4, m3 | |
12157 | packuswb m1, m4 | |
12158 | movu [r0 + 57 * 16], m1 | |
12159 | ||
12160 | ; mode 5 row [row 10] | |
12161 | pmaddubsw m1, m0, [r5 + 27 * 16] | |
12162 | pmulhrsw m1, m3 | |
12163 | pmaddubsw m4, m2, [r5 + 27 * 16] | |
12164 | pmulhrsw m4, m3 | |
12165 | packuswb m1, m4 | |
12166 | movu [r0 + 58 * 16], m1 | |
12167 | ||
12168 | ; mode 6 [row 12] | |
12169 | pmaddubsw m1, m0, [r5 + 9 * 16] | |
12170 | pmulhrsw m1, m3 | |
12171 | pmaddubsw m4, m2, [r5 + 9 * 16] | |
12172 | pmulhrsw m4, m3 | |
12173 | packuswb m1, m4 | |
12174 | movu [r0 + 76 * 16], m1 | |
12175 | ||
12176 | ; mode 3 [row 7] | |
12177 | movu m0, [r4 + 7] | |
12178 | movd m1, [r4 + 27] | |
12179 | palignr m1, m0, 1 | |
12180 | punpcklbw m0, m1 | |
12181 | ||
12182 | ; mode 17 [row 1 - second half] | |
12183 | pmaddubsw m1, m0, [r5 + 12 * 16] | |
12184 | pmulhrsw m1, m3 | |
12185 | packuswb m1, m1 | |
12186 | movh [r0 + 241 * 16 + 8], m1 | |
12187 | ; mode 17 [row 1 - second half] end | |
12188 | ||
12189 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
12190 | pmulhrsw m1, m3 | |
12191 | movu m2, [r4 + 15] | |
12192 | movd m4, [r4 + 25] | |
12193 | palignr m4, m2, 1 | |
12194 | punpcklbw m2, m4 | |
12195 | pmaddubsw m4, m2, [r5 + 16 * 16] | |
12196 | pmulhrsw m4, m3 | |
12197 | packuswb m1, m4 | |
12198 | movu [r0 + 23 * 16], m1 | |
12199 | ||
12200 | ; mode 6 [row 15] | |
12201 | movu [r0 + 79 * 16], m1 | |
12202 | ||
12203 | ; mode 4 row [row 9] | |
12204 | pmaddubsw m1, m0, [r5 + 18 * 16] | |
12205 | pmulhrsw m1, m3 | |
12206 | pmaddubsw m4, m2, [r5 + 18 * 16] | |
12207 | pmulhrsw m4, m3 | |
12208 | packuswb m1, m4 | |
12209 | movu [r0 + 41 * 16], m1 | |
12210 | ||
12211 | ; mode 5 row [row 11] | |
12212 | pmaddubsw m1, m0, [r5 + 12 * 16] | |
12213 | pmulhrsw m1, m3 | |
12214 | pmaddubsw m4, m2, [r5 + 12 * 16] | |
12215 | pmulhrsw m4, m3 | |
12216 | packuswb m1, m4 | |
12217 | movu [r0 + 59 * 16], m1 | |
12218 | ||
12219 | ; mode 5 row [row 12] | |
12220 | pmaddubsw m1, m0, [r5 + 29 * 16] | |
12221 | pmulhrsw m1, m3 | |
12222 | pmaddubsw m4, m2, [r5 + 29 * 16] | |
12223 | pmulhrsw m4, m3 | |
12224 | packuswb m1, m4 | |
12225 | movu [r0 + 60 * 16], m1 | |
12226 | ||
12227 | ; mode 6 [row 14] | |
12228 | pmaddubsw m1, m0, [r5 + 3 * 16] | |
12229 | pmulhrsw m1, m3 | |
12230 | pmaddubsw m4, m2, [r5 + 3 * 16] | |
12231 | pmulhrsw m4, m3 | |
12232 | packuswb m1, m4 | |
12233 | movu [r0 + 78 * 16], m1 | |
12234 | ||
12235 | ; mode 3 [row 8] | |
12236 | movu m0, [r4 + 8] | |
12237 | movd m1, [r4 + 24] | |
12238 | palignr m1, m0, 1 | |
12239 | punpcklbw m0, m1 | |
12240 | pmaddubsw m1, m0, [r5 + 10 * 16] | |
12241 | pmulhrsw m1, m3 | |
12242 | movu m2, [r4 + 16] | |
12243 | psrldq m4, m2, 1 | |
12244 | pinsrb m4, [r4 + 32], 15 | |
12245 | punpcklbw m2, m4 | |
12246 | pmaddubsw m4, m2, [r5 + 10 * 16] | |
12247 | pmulhrsw m4, m3 | |
12248 | packuswb m1, m4 | |
12249 | movu [r0 + 24 * 16], m1 | |
12250 | ||
12251 | ; mode 4 row [row 10] | |
12252 | pmaddubsw m1, m0, [r5 + 7 * 16] | |
12253 | pmulhrsw m1, m3 | |
12254 | pmaddubsw m4, m2, [r5 + 7 * 16] | |
12255 | pmulhrsw m4, m3 | |
12256 | packuswb m1, m4 | |
12257 | movu [r0 + 42 * 16], m1 | |
12258 | ||
12259 | ; mode 4 row [row 11] | |
12260 | pmaddubsw m1, m0, [r5 + 28 * 16] | |
12261 | pmulhrsw m1, m3 | |
12262 | pmaddubsw m4, m2, [r5 + 28 * 16] | |
12263 | pmulhrsw m4, m3 | |
12264 | packuswb m1, m4 | |
12265 | movu [r0 + 43 * 16], m1 | |
12266 | ||
12267 | ; mode 5 row [row 13] | |
12268 | pmaddubsw m1, m0, [r5 + 14 * 16] | |
12269 | pmulhrsw m1, m3 | |
12270 | pmaddubsw m4, m2, [r5 + 14 * 16] | |
12271 | pmulhrsw m4, m3 | |
12272 | packuswb m1, m4 | |
12273 | movu [r0 + 61 * 16], m1 | |
12274 | ||
12275 | ; mode 5 row [row 14] | |
12276 | pmaddubsw m1, m0, [r5 + 31 * 16] | |
12277 | pmulhrsw m1, m3 | |
12278 | pmaddubsw m4, m2, [r5 + 31 * 16] | |
12279 | pmulhrsw m4, m3 | |
12280 | packuswb m1, m4 | |
12281 | movu [r0 + 62 * 16], m1 | |
12282 | ||
12283 | ; mode 3 [row 9] | |
12284 | movu m0, [r4 + 9] | |
12285 | movd m1, [r4 + 16] | |
12286 | palignr m1, m0, 1 | |
12287 | punpcklbw m0, m1 | |
12288 | pmaddubsw m1, m0, [r5 + 4 * 16] | |
12289 | pmulhrsw m1, m3 | |
12290 | movu m2, [r4 + 17] | |
12291 | movd m4, [r4 + 33] | |
12292 | palignr m4, m2, 1 | |
12293 | punpcklbw m2, m4 | |
12294 | pmaddubsw m4, m2, [r5 + 4 * 16] | |
12295 | pmulhrsw m4, m3 | |
12296 | packuswb m1, m4 | |
12297 | movu [r0 + 25 * 16], m1 | |
12298 | ||
12299 | ; mode 4 row [row 12] | |
12300 | pmaddubsw m1, m0, [r5 + 17 * 16] | |
12301 | pmulhrsw m1, m3 | |
12302 | pmaddubsw m4, m2, [r5 + 17 * 16] | |
12303 | pmulhrsw m4, m3 | |
12304 | packuswb m1, m4 | |
12305 | movu [r0 + 44 * 16], m1 | |
12306 | ||
12307 | ; mode 3 [row 10] | |
12308 | pmaddubsw m1, m0, [r5 + 30 * 16] | |
12309 | pmulhrsw m1, m3 | |
12310 | pmaddubsw m4, m2, [r5 + 30 * 16] | |
12311 | pmulhrsw m4, m3 | |
12312 | packuswb m1, m4 | |
12313 | movu [r0 + 26 * 16], m1 | |
12314 | ||
12315 | ; mode 5 row [row 15] | |
12316 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
12317 | pmulhrsw m1, m3 | |
12318 | pmaddubsw m4, m2, [r5 + 16 * 16] | |
12319 | pmulhrsw m4, m3 | |
12320 | packuswb m1, m4 | |
12321 | movu [r0 + 63 * 16], m1 | |
12322 | ||
12323 | ; mode 3 [row 11] | |
12324 | movu m0, [r4 + 10] | |
12325 | movd m1, [r4 + 26] | |
12326 | palignr m1, m0, 1 | |
12327 | punpcklbw m0, m1 | |
12328 | pmaddubsw m1, m0, [r5 + 24 * 16] | |
12329 | pmulhrsw m1, m3 | |
12330 | movu m2, [r4 + 18] | |
12331 | movd m4, [r4 + 34] | |
12332 | palignr m4, m2, 1 | |
12333 | punpcklbw m2, m4 | |
12334 | pmaddubsw m4, m2, [r5 + 24 * 16] | |
12335 | pmulhrsw m4, m3 | |
12336 | packuswb m1, m4 | |
12337 | movu [r0 + 27 * 16], m1 | |
12338 | ||
12339 | ; mode 4 row [row 13] | |
12340 | pmaddubsw m1, m0, [r5 + 6 * 16] | |
12341 | pmulhrsw m1, m3 | |
12342 | pmaddubsw m4, m2, [r5 + 6 * 16] | |
12343 | pmulhrsw m4, m3 | |
12344 | packuswb m1, m4 | |
12345 | movu [r0 + 45 * 16], m1 | |
12346 | ||
12347 | ; mode 4 row [row 14] | |
12348 | pmaddubsw m1, m0, [r5 + 27 * 16] | |
12349 | pmulhrsw m1, m3 | |
12350 | pmaddubsw m4, m2, [r5 + 27 * 16] | |
12351 | pmulhrsw m4, m3 | |
12352 | packuswb m1, m4 | |
12353 | movu [r0 + 46 * 16], m1 | |
12354 | ||
12355 | ; mode 3 [row 12] | |
12356 | movu m0, [r4 + 11] | |
12357 | movd m1, [r4 + 27] | |
12358 | palignr m1, m0, 1 | |
12359 | punpcklbw m0, m1 | |
12360 | pmaddubsw m1, m0, [r5 + 18 * 16] | |
12361 | pmulhrsw m1, m3 | |
12362 | movu m2, [r4 + 19] | |
12363 | movd m4, [r4 + 35] | |
12364 | palignr m4, m2, 1 | |
12365 | punpcklbw m2, m4 | |
12366 | pmaddubsw m4, m2, [r5 + 18 * 16] | |
12367 | pmulhrsw m4, m3 | |
12368 | packuswb m1, m4 | |
12369 | movu [r0 + 28 * 16], m1 | |
12370 | ||
12371 | ; mode 4 row [row 15] | |
12372 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
12373 | pmulhrsw m1, m3 | |
12374 | pmaddubsw m4, m2, [r5 + 16 * 16] | |
12375 | pmulhrsw m4, m3 | |
12376 | packuswb m1, m4 | |
12377 | movu [r0 + 47 * 16], m1 | |
12378 | ||
12379 | ; mode 3 [row 13] | |
12380 | movu m0, [r4 + 12] | |
12381 | movd m1, [r4 + 28] | |
12382 | palignr m1, m0, 1 | |
12383 | punpcklbw m0, m1 | |
12384 | pmaddubsw m1, m0, [r5 + 12 * 16] | |
12385 | pmulhrsw m1, m3 | |
12386 | movu m2, [r4 + 20] | |
12387 | movd m4, [r4 + 36] | |
12388 | palignr m4, m2, 1 | |
12389 | punpcklbw m2, m4 | |
12390 | pmaddubsw m4, m2, [r5 + 12 * 16] | |
12391 | pmulhrsw m4, m3 | |
12392 | packuswb m1, m4 | |
12393 | movu [r0 + 29 * 16], m1 | |
12394 | ||
12395 | ; mode 3 [row 14] | |
12396 | movu m0, [r4 + 13] | |
12397 | movd m1, [r4 + 29] | |
12398 | palignr m1, m0, 1 | |
12399 | punpcklbw m0, m1 | |
12400 | pmaddubsw m1, m0, [r5 + 6 * 16] | |
12401 | pmulhrsw m1, m3 | |
12402 | movu m2, [r4 + 21] | |
12403 | movd m4, [r4 + 37] | |
12404 | palignr m4, m2, 1 | |
12405 | punpcklbw m2, m4 | |
12406 | pmaddubsw m4, m2, [r5 + 6 * 16] | |
12407 | pmulhrsw m4, m3 | |
12408 | packuswb m1, m4 | |
12409 | movu [r0 + 30 * 16], m1 | |
12410 | ||
12411 | ; mode 9 | |
12412 | movu m0, [r2 + 1] | |
12413 | movd m1, [r2 + 17] | |
12414 | palignr m1, m0, 1 | |
12415 | ||
12416 | ; mode 9 [row 15] | |
12417 | movu [r0 + 127 * 16], m1 | |
12418 | ||
12419 | ; mode 9 [row 0] | |
12420 | punpcklbw m0, m1 | |
12421 | pmaddubsw m1, m0, [r5 + 2 * 16] | |
12422 | pmulhrsw m1, m3 | |
12423 | movu m7, [r2 + 9] | |
12424 | movd m4, [r4 + 25] | |
12425 | palignr m2, m7, 1 | |
12426 | punpcklbw m7, m2 | |
12427 | pmaddubsw m2, m7, [r5 + 2 * 16] | |
12428 | pmulhrsw m2, m3 | |
12429 | packuswb m1, m2 | |
12430 | movu [r0 + 112 * 16], m1 | |
12431 | ||
12432 | ; mode 9 [row 1] | |
12433 | pmaddubsw m1, m0, [r5 + 4 * 16] | |
12434 | pmulhrsw m1, m3 | |
12435 | pmaddubsw m2, m7, [r5 + 4 * 16] | |
12436 | pmulhrsw m2, m3 | |
12437 | packuswb m1, m2 | |
12438 | movu [r0 + 113 * 16], m1 | |
12439 | ||
12440 | ; mode 9 [row 2] | |
12441 | pmaddubsw m1, m0, [r5 + 6 * 16] | |
12442 | pmulhrsw m1, m3 | |
12443 | pmaddubsw m2, m7, [r5 + 6 * 16] | |
12444 | pmulhrsw m2, m3 | |
12445 | packuswb m1, m2 | |
12446 | movu [r0 + 114 * 16], m1 | |
12447 | ||
12448 | ; mode 9 [row 3] | |
12449 | pmaddubsw m1, m0, [r5 + 8 * 16] | |
12450 | pmulhrsw m1, m3 | |
12451 | pmaddubsw m2, m7, [r5 + 8 * 16] | |
12452 | pmulhrsw m2, m3 | |
12453 | packuswb m1, m2 | |
12454 | movu [r0 + 115 * 16], m1 | |
12455 | ||
12456 | ; mode 9 [row 4] | |
12457 | pmaddubsw m1, m0, [r5 + 10 * 16] | |
12458 | pmulhrsw m1, m3 | |
12459 | pmaddubsw m2, m7, [r5 + 10 * 16] | |
12460 | pmulhrsw m2, m3 | |
12461 | packuswb m1, m2 | |
12462 | movu [r0 + 116 * 16], m1 | |
12463 | ||
12464 | ; mode 9 [row 5] | |
12465 | pmaddubsw m1, m0, [r5 + 12 * 16] | |
12466 | pmulhrsw m1, m3 | |
12467 | pmaddubsw m2, m7, [r5 + 12 * 16] | |
12468 | pmulhrsw m2, m3 | |
12469 | packuswb m1, m2 | |
12470 | movu [r0 + 117 * 16], m1 | |
12471 | ||
12472 | ; mode 9 [row 6] | |
12473 | pmaddubsw m1, m0, [r5 + 14 * 16] | |
12474 | pmulhrsw m1, m3 | |
12475 | pmaddubsw m2, m7, [r5 + 14 * 16] | |
12476 | pmulhrsw m2, m3 | |
12477 | packuswb m1, m2 | |
12478 | movu [r0 + 118 * 16], m1 | |
12479 | ||
12480 | ; mode 9 [row 7] | |
12481 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
12482 | pmulhrsw m1, m3 | |
12483 | pmaddubsw m2, m7, [r5 + 16 * 16] | |
12484 | pmulhrsw m2, m3 | |
12485 | packuswb m1, m2 | |
12486 | movu [r0 + 119 * 16], m1 | |
12487 | ||
12488 | ; mode 9 [row 8] | |
12489 | pmaddubsw m1, m0, [r5 + 18 * 16] | |
12490 | pmulhrsw m1, m3 | |
12491 | pmaddubsw m2, m7, [r5 + 18 * 16] | |
12492 | pmulhrsw m2, m3 | |
12493 | packuswb m1, m2 | |
12494 | movu [r0 + 120 * 16], m1 | |
12495 | ||
12496 | ; mode 9 [row 9] | |
12497 | pmaddubsw m1, m0, [r5 + 20 * 16] | |
12498 | pmulhrsw m1, m3 | |
12499 | pmaddubsw m2, m7, [r5 + 20 * 16] | |
12500 | pmulhrsw m2, m3 | |
12501 | packuswb m1, m2 | |
12502 | movu [r0 + 121 * 16], m1 | |
12503 | ||
12504 | ; mode 9 [row 10] | |
12505 | pmaddubsw m1, m0, [r5 + 22 * 16] | |
12506 | pmulhrsw m1, m3 | |
12507 | pmaddubsw m2, m7, [r5 + 22 * 16] | |
12508 | pmulhrsw m2, m3 | |
12509 | packuswb m1, m2 | |
12510 | movu [r0 + 122 * 16], m1 | |
12511 | ||
12512 | ; mode 9 [row 11] | |
12513 | pmaddubsw m1, m0, [r5 + 24 * 16] | |
12514 | pmulhrsw m1, m3 | |
12515 | pmaddubsw m2, m7, [r5 + 24 * 16] | |
12516 | pmulhrsw m2, m3 | |
12517 | packuswb m1, m2 | |
12518 | movu [r0 + 123 * 16], m1 | |
12519 | ||
12520 | ; mode 9 [row 12] | |
12521 | pmaddubsw m1, m0, [r5 + 26 * 16] | |
12522 | pmulhrsw m1, m3 | |
12523 | pmaddubsw m2, m7, [r5 + 26 * 16] | |
12524 | pmulhrsw m2, m3 | |
12525 | packuswb m1, m2 | |
12526 | movu [r0 + 124 * 16], m1 | |
12527 | ||
12528 | ; mode 9 [row 13] | |
12529 | pmaddubsw m1, m0, [r5 + 28 * 16] | |
12530 | pmulhrsw m1, m3 | |
12531 | pmaddubsw m2, m7, [r5 + 28 * 16] | |
12532 | pmulhrsw m2, m3 | |
12533 | packuswb m1, m2 | |
12534 | movu [r0 + 125 * 16], m1 | |
12535 | ||
12536 | ; mode 9 [row 14] | |
12537 | pmaddubsw m1, m0, [r5 + 30 * 16] | |
12538 | pmulhrsw m1, m3 | |
12539 | pmaddubsw m2, m7, [r5 + 30 * 16] | |
12540 | pmulhrsw m2, m3 | |
12541 | packuswb m1, m2 | |
12542 | movu [r0 + 126 * 16], m1 | |
12543 | ||
12544 | ; mode 10 | |
12545 | movu m1, [r2 + 1] | |
12546 | movu [r0 + 128 * 16], m1 | |
12547 | movu [r0 + 129 * 16], m1 | |
12548 | movu [r0 + 130 * 16], m1 | |
12549 | movu [r0 + 131 * 16], m1 | |
12550 | movu [r0 + 132 * 16], m1 | |
12551 | movu [r0 + 133 * 16], m1 | |
12552 | movu [r0 + 134 * 16], m1 | |
12553 | movu [r0 + 135 * 16], m1 | |
12554 | movu [r0 + 136 * 16], m1 | |
12555 | movu [r0 + 137 * 16], m1 | |
12556 | movu [r0 + 138 * 16], m1 | |
12557 | movu [r0 + 139 * 16], m1 | |
12558 | movu [r0 + 140 * 16], m1 | |
12559 | movu [r0 + 141 * 16], m1 | |
12560 | movu [r0 + 142 * 16], m1 | |
12561 | movu [r0 + 143 * 16], m1 | |
12562 | ||
12563 | pxor m0, m0 | |
12564 | pshufb m1, m1, m0 | |
12565 | punpcklbw m1, m0 | |
12566 | movu m2, [r1] | |
12567 | pshufb m2, m2, m0 | |
12568 | punpcklbw m2, m0 | |
12569 | movu m4, [r1 + 1] | |
12570 | punpcklbw m5, m4, m0 | |
12571 | punpckhbw m4, m0 | |
12572 | psubw m5, m2 | |
12573 | psubw m4, m2 | |
12574 | psraw m5, 1 | |
12575 | psraw m4, 1 | |
12576 | paddw m5, m1 | |
12577 | paddw m4, m1 | |
12578 | packuswb m5, m4 | |
12579 | ||
12580 | pextrb [r0 + 128 * 16], m5, 0 | |
12581 | pextrb [r0 + 129 * 16], m5, 1 | |
12582 | pextrb [r0 + 130 * 16], m5, 2 | |
12583 | pextrb [r0 + 131 * 16], m5, 3 | |
12584 | pextrb [r0 + 132 * 16], m5, 4 | |
12585 | pextrb [r0 + 133 * 16], m5, 5 | |
12586 | pextrb [r0 + 134 * 16], m5, 6 | |
12587 | pextrb [r0 + 135 * 16], m5, 7 | |
12588 | pextrb [r0 + 136 * 16], m5, 8 | |
12589 | pextrb [r0 + 137 * 16], m5, 9 | |
12590 | pextrb [r0 + 138 * 16], m5, 10 | |
12591 | pextrb [r0 + 139 * 16], m5, 11 | |
12592 | pextrb [r0 + 140 * 16], m5, 12 | |
12593 | pextrb [r0 + 141 * 16], m5, 13 | |
12594 | pextrb [r0 + 142 * 16], m5, 14 | |
12595 | pextrb [r0 + 143 * 16], m5, 15 | |
12596 | ||
12597 | ; mode 11 | |
12598 | movu m0, [r2] | |
12599 | ||
12600 | ; mode 11 [row 15] | |
12601 | movu [r0 + 159 * 16], m0 | |
12602 | ||
12603 | ; mode 11 [row 0] | |
12604 | movu m1, [r2 + 1] | |
12605 | punpcklbw m0, m1 | |
12606 | pmaddubsw m1, m0, [r5 + 30 * 16] | |
12607 | pmulhrsw m1, m3 | |
12608 | movu m7, [r2 + 8] | |
12609 | movu m2, [r2 + 9] | |
12610 | punpcklbw m7, m2 | |
12611 | pmaddubsw m2, m7, [r5 + 30 * 16] | |
12612 | pmulhrsw m2, m3 | |
12613 | packuswb m1, m2 | |
12614 | movu [r0 + 144 * 16], m1 | |
12615 | ||
12616 | ; mode 11 [row 1] | |
12617 | pmaddubsw m1, m0, [r5 + 28 * 16] | |
12618 | pmulhrsw m1, m3 | |
12619 | pmaddubsw m2, m7, [r5 + 28 * 16] | |
12620 | pmulhrsw m2, m3 | |
12621 | packuswb m1, m2 | |
12622 | movu [r0 + 145 * 16], m1 | |
12623 | ||
12624 | ; mode 11 [row 2] | |
12625 | pmaddubsw m1, m0, [r5 + 26 * 16] | |
12626 | pmulhrsw m1, m3 | |
12627 | pmaddubsw m2, m7, [r5 + 26 * 16] | |
12628 | pmulhrsw m2, m3 | |
12629 | packuswb m1, m2 | |
12630 | movu [r0 + 146 * 16], m1 | |
12631 | ||
12632 | ; mode 11 [row 3] | |
12633 | pmaddubsw m1, m0, [r5 + 24 * 16] | |
12634 | pmulhrsw m1, m3 | |
12635 | pmaddubsw m2, m7, [r5 + 24 * 16] | |
12636 | pmulhrsw m2, m3 | |
12637 | packuswb m1, m2 | |
12638 | movu [r0 + 147 * 16], m1 | |
12639 | ||
12640 | ; mode 11 [row 4] | |
12641 | pmaddubsw m1, m0, [r5 + 22 * 16] | |
12642 | pmulhrsw m1, m3 | |
12643 | pmaddubsw m2, m7, [r5 + 22 * 16] | |
12644 | pmulhrsw m2, m3 | |
12645 | packuswb m1, m2 | |
12646 | movu [r0 + 148 * 16], m1 | |
12647 | ||
12648 | ; mode 11 [row 5] | |
12649 | pmaddubsw m1, m0, [r5 + 20 * 16] | |
12650 | pmulhrsw m1, m3 | |
12651 | pmaddubsw m2, m7, [r5 + 20 * 16] | |
12652 | pmulhrsw m2, m3 | |
12653 | packuswb m1, m2 | |
12654 | movu [r0 + 149 * 16], m1 | |
12655 | ||
12656 | ; mode 11 [row 6] | |
12657 | pmaddubsw m1, m0, [r5 + 18 * 16] | |
12658 | pmulhrsw m1, m3 | |
12659 | pmaddubsw m2, m7, [r5 + 18 * 16] | |
12660 | pmulhrsw m2, m3 | |
12661 | packuswb m1, m2 | |
12662 | movu [r0 + 150 * 16], m1 | |
12663 | ||
12664 | ; mode 11 [row 7] | |
12665 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
12666 | pmulhrsw m1, m3 | |
12667 | pmaddubsw m2, m7, [r5 + 16 * 16] | |
12668 | pmulhrsw m2, m3 | |
12669 | packuswb m1, m2 | |
12670 | movu [r0 + 151 * 16], m1 | |
12671 | ||
12672 | ; mode 11 [row 8] | |
12673 | pmaddubsw m1, m0, [r5 + 14 * 16] | |
12674 | pmulhrsw m1, m3 | |
12675 | pmaddubsw m2, m7, [r5 + 14 * 16] | |
12676 | pmulhrsw m2, m3 | |
12677 | packuswb m1, m2 | |
12678 | movu [r0 + 152 * 16], m1 | |
12679 | ||
12680 | ; mode 11 [row 9] | |
12681 | pmaddubsw m1, m0, [r5 + 12 * 16] | |
12682 | pmulhrsw m1, m3 | |
12683 | pmaddubsw m2, m7, [r5 + 12 * 16] | |
12684 | pmulhrsw m2, m3 | |
12685 | packuswb m1, m2 | |
12686 | movu [r0 + 153 * 16], m1 | |
12687 | ||
12688 | ; mode 11 [row 10] | |
12689 | pmaddubsw m1, m0, [r5 + 10 * 16] | |
12690 | pmulhrsw m1, m3 | |
12691 | pmaddubsw m2, m7, [r5 + 10 * 16] | |
12692 | pmulhrsw m2, m3 | |
12693 | packuswb m1, m2 | |
12694 | movu [r0 + 154 * 16], m1 | |
12695 | ||
12696 | ; mode 11 [row 11] | |
12697 | pmaddubsw m1, m0, [r5 + 8 * 16] | |
12698 | pmulhrsw m1, m3 | |
12699 | pmaddubsw m2, m7, [r5 + 8 * 16] | |
12700 | pmulhrsw m2, m3 | |
12701 | packuswb m1, m2 | |
12702 | movu [r0 + 155 * 16], m1 | |
12703 | ||
12704 | ; mode 11 [row 12] | |
12705 | pmaddubsw m1, m0, [r5 + 6 * 16] | |
12706 | pmulhrsw m1, m3 | |
12707 | pmaddubsw m2, m7, [r5 + 6 * 16] | |
12708 | pmulhrsw m2, m3 | |
12709 | packuswb m1, m2 | |
12710 | movu [r0 + 156 * 16], m1 | |
12711 | ||
12712 | ; mode 11 [row 13] | |
12713 | pmaddubsw m1, m0, [r5 + 4 * 16] | |
12714 | pmulhrsw m1, m3 | |
12715 | pmaddubsw m2, m7, [r5 + 4 * 16] | |
12716 | pmulhrsw m2, m3 | |
12717 | packuswb m1, m2 | |
12718 | movu [r0 + 157 * 16], m1 | |
12719 | ||
12720 | ; mode 11 [row 14] | |
12721 | pmaddubsw m1, m0, [r5 + 2 * 16] | |
12722 | pmulhrsw m1, m3 | |
12723 | pmaddubsw m2, m7, [r5 + 2 * 16] | |
12724 | pmulhrsw m2, m3 | |
12725 | packuswb m1, m2 | |
12726 | movu [r0 + 158 * 16], m1 | |
12727 | ||
12728 | ; mode 12 [row 0] | |
12729 | movu m0, [r4] | |
12730 | movu m1, [r4 + 1] | |
12731 | punpcklbw m0, m1 | |
12732 | pmaddubsw m1, m0, [r5 + 27 * 16] | |
12733 | pmulhrsw m1, m3 | |
12734 | movu m7, [r4 + 8] | |
12735 | movd m2, [r4 + 24] | |
12736 | palignr m2, m7, 1 | |
12737 | punpcklbw m7, m2 | |
12738 | pmaddubsw m2, m7, [r5 + 27 * 16] | |
12739 | pmulhrsw m2, m3 | |
12740 | packuswb m1, m2 | |
12741 | movu [r0 + 160 * 16], m1 | |
12742 | ||
12743 | ; mode 12 [row 1] | |
12744 | pmaddubsw m1, m0, [r5 + 22 * 16] | |
12745 | pmulhrsw m1, m3 | |
12746 | pmaddubsw m2, m7, [r5 + 22 * 16] | |
12747 | pmulhrsw m2, m3 | |
12748 | packuswb m1, m2 | |
12749 | movu [r0 + 161 * 16], m1 | |
12750 | ||
12751 | ; mode 12 [row 2] | |
12752 | pmaddubsw m1, m0, [r5 + 17 * 16] | |
12753 | pmulhrsw m1, m3 | |
12754 | pmaddubsw m2, m7, [r5 + 17 * 16] | |
12755 | pmulhrsw m2, m3 | |
12756 | packuswb m1, m2 | |
12757 | movu [r0 + 162 * 16], m1 | |
12758 | ||
12759 | ; mode 12 [row 3] | |
12760 | pmaddubsw m1, m0, [r5 + 12 * 16] | |
12761 | pmulhrsw m1, m3 | |
12762 | pmaddubsw m2, m7, [r5 + 12 * 16] | |
12763 | pmulhrsw m2, m3 | |
12764 | packuswb m1, m2 | |
12765 | movu [r0 + 163 * 16], m1 | |
12766 | ||
12767 | ; mode 12 [row 4] | |
12768 | pmaddubsw m1, m0, [r5 + 7 * 16] | |
12769 | pmulhrsw m1, m3 | |
12770 | pmaddubsw m2, m7, [r5 + 7 * 16] | |
12771 | pmulhrsw m2, m3 | |
12772 | packuswb m1, m2 | |
12773 | movu [r0 + 164 * 16], m1 | |
12774 | ||
12775 | ; mode 12 [row 5] | |
12776 | pmaddubsw m1, m0, [r5 + 2 * 16] | |
12777 | pmulhrsw m1, m3 | |
12778 | pmaddubsw m2, m7, [r5 + 2 * 16] | |
12779 | pmulhrsw m2, m3 | |
12780 | packuswb m1, m2 | |
12781 | movu [r0 + 165 * 16], m1 | |
12782 | ||
12783 | ; mode 13 [row 0] | |
12784 | pmaddubsw m1, m0, [r5 + 23 * 16] | |
12785 | pmulhrsw m1, m3 | |
12786 | pmaddubsw m2, m7, [r5 + 23 * 16] | |
12787 | pmulhrsw m2, m3 | |
12788 | packuswb m1, m2 | |
12789 | movu [r0 + 176 * 16], m1 | |
12790 | ||
12791 | ; mode 13 [row 1] | |
12792 | pmaddubsw m1, m0, [r5 + 14 * 16] | |
12793 | pmulhrsw m1, m3 | |
12794 | pmaddubsw m2, m7, [r5 + 14 * 16] | |
12795 | pmulhrsw m2, m3 | |
12796 | packuswb m1, m2 | |
12797 | movu [r0 + 177 * 16], m1 | |
12798 | ||
12799 | ; mode 13 [row 2] | |
12800 | pmaddubsw m1, m0, [r5 + 5 * 16] | |
12801 | pmulhrsw m1, m3 | |
12802 | pmaddubsw m2, m7, [r5 + 5 * 16] | |
12803 | pmulhrsw m2, m3 | |
12804 | packuswb m1, m2 | |
12805 | movu [r0 + 178 * 16], m1 | |
12806 | ||
12807 | ; mode 14 [row 0] | |
12808 | pmaddubsw m1, m0, [r5 + 19 * 16] | |
12809 | pmulhrsw m1, m3 | |
12810 | pmaddubsw m2, m7, [r5 + 19 * 16] | |
12811 | pmulhrsw m2, m3 | |
12812 | packuswb m1, m2 | |
12813 | movu [r0 + 192 * 16], m1 | |
12814 | ||
12815 | ; mode 14 [row 1] | |
12816 | pmaddubsw m1, m0, [r5 + 6 * 16] | |
12817 | pmulhrsw m1, m3 | |
12818 | pmaddubsw m2, m7, [r5 + 6 * 16] | |
12819 | pmulhrsw m2, m3 | |
12820 | packuswb m1, m2 | |
12821 | movu [r0 + 193 * 16], m1 | |
12822 | ||
12823 | ; mode 17 [row 0] | |
12824 | movu [r0 + 240 * 16], m1 | |
12825 | ||
12826 | ; mode 15 [row 0] | |
12827 | pmaddubsw m1, m0, [r5 + 15 * 16] | |
12828 | pmulhrsw m1, m3 | |
12829 | pmaddubsw m2, m7, [r5 + 15 * 16] | |
12830 | pmulhrsw m2, m3 | |
12831 | packuswb m1, m2 | |
12832 | movu [r0 + 208 * 16], m1 | |
12833 | ||
12834 | ; mode 15 [row 15 - second half] | |
12835 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
12836 | pmulhrsw m1, m3 | |
12837 | packuswb m1, m1 | |
12838 | movh [r0 + 223 * 16 + 8], m1 | |
12839 | ; mode 15 [row 15 - second half] end | |
12840 | ||
12841 | ; mode 16 [row 0] | |
12842 | pmaddubsw m1, m0, [r5 + 11 * 16] | |
12843 | pmulhrsw m1, m3 | |
12844 | pmaddubsw m2, m7, [r5 + 11 * 16] | |
12845 | pmulhrsw m2, m3 | |
12846 | packuswb m1, m2 | |
12847 | movu [r0 + 224 * 16], m1 | |
12848 | ||
12849 | ; mode 17 [row 9 - second half] | |
12850 | pmaddubsw m1, m0, [r5 + 28 * 16] | |
12851 | pmulhrsw m1, m3 | |
12852 | packuswb m1, m1 | |
12853 | movh [r0 + 249 * 16 + 8], m1 | |
12854 | ; mode 17 [row 9 - second half] end | |
12855 | ||
12856 | ; mode 17 [row 10 - second half] | |
12857 | pmaddubsw m1, m0, [r5 + 2 * 16] | |
12858 | pmulhrsw m1, m3 | |
12859 | packuswb m1, m1 | |
12860 | movh [r0 + 250 * 16 + 8], m1 | |
12861 | ; mode 17 [row 10 - second half] end | |
12862 | ||
12863 | ; mode 17 [row 1 - first half] | |
12864 | pslldq m6, m0, 2 | |
12865 | pinsrb m6, [r3 + 0], 1 | |
12866 | pinsrb m6, [r3 + 1], 0 | |
12867 | pmaddubsw m1, m6, [r5 + 12 * 16] | |
12868 | pmulhrsw m1, m3 | |
12869 | packuswb m1, m1 | |
12870 | movh [r0 + 241 * 16], m1 | |
12871 | ||
12872 | ; mode 17 [row 11 - second half] | |
12873 | pmaddubsw m1, m6, [r5 + 8 * 16] | |
12874 | pmulhrsw m1, m3 | |
12875 | packuswb m1, m1 | |
12876 | movh [r0 + 251 * 16 + 8], m1 | |
12877 | ; mode 17 [row 11 - second half] end | |
12878 | ||
12879 | ; mode 17 [row 2 - first half] | |
12880 | pslldq m6, 2 | |
12881 | pinsrb m6, [r3 + 1], 1 | |
12882 | pinsrb m6, [r3 + 2], 0 | |
12883 | pmaddubsw m1, m6, [r5 + 18 * 16] | |
12884 | pmulhrsw m1, m3 | |
12885 | packuswb m1, m1 | |
12886 | movh [r0 + 242 * 16], m1 | |
12887 | ||
12888 | ; mode 17 [row 12 - second half] | |
12889 | pmaddubsw m1, m6, [r5 + 14 * 16] | |
12890 | pmulhrsw m1, m3 | |
12891 | packuswb m1, m1 | |
12892 | movh [r0 + 252 * 16 + 8], m1 | |
12893 | ; mode 17 [row 12 - second half] end | |
12894 | ||
12895 | ; mode 17 [row 3 - first half] | |
12896 | pslldq m6, 2 | |
12897 | pinsrb m6, [r3 + 2], 1 | |
12898 | pinsrb m6, [r3 + 4], 0 | |
12899 | pmaddubsw m1, m6, [r5 + 24 * 16] | |
12900 | pmulhrsw m1, m3 | |
12901 | packuswb m1, m1 | |
12902 | movh [r0 + 243 * 16], m1 | |
12903 | ||
12904 | ; mode 17 [row 13 - second half] | |
12905 | pmaddubsw m1, m6, [r5 + 20 * 16] | |
12906 | pmulhrsw m1, m3 | |
12907 | packuswb m1, m1 | |
12908 | movh [r0 + 253 * 16 + 8], m1 | |
12909 | ||
12910 | ; mode 17 [row 4 - first half] | |
12911 | pslldq m6, 2 | |
12912 | pinsrb m6, [r3 + 4], 1 | |
12913 | pinsrb m6, [r3 + 5], 0 | |
12914 | pmaddubsw m1, m6, [r5 + 30 * 16] | |
12915 | pmulhrsw m1, m3 | |
12916 | packuswb m1, m1 | |
12917 | movh [r0 + 244 * 16], m1 | |
12918 | ||
12919 | ; mode 17 [row 5 - first half] | |
12920 | pmaddubsw m1, m6, [r5 + 4 * 16] | |
12921 | pmulhrsw m1, m3 | |
12922 | packuswb m1, m1 | |
12923 | movh [r0 + 245 * 16], m1 | |
12924 | ||
12925 | ; mode 17 [row 14 - second half] | |
12926 | pmaddubsw m1, m6, [r5 + 26 * 16] | |
12927 | pmulhrsw m1, m3 | |
12928 | packuswb m1, m1 | |
12929 | movh [r0 + 254 * 16 + 8], m1 | |
12930 | ; mode 17 [row 14 - second half] end | |
12931 | ||
12932 | ; mode 17 [row 6 - first half] | |
12933 | pslldq m6, 2 | |
12934 | pinsrb m6, [r3 + 5], 1 | |
12935 | pinsrb m6, [r3 + 6], 0 | |
12936 | pmaddubsw m1, m6, [r5 + 10 * 16] | |
12937 | pmulhrsw m1, m3 | |
12938 | packuswb m1, m1 | |
12939 | movh [r0 + 246 * 16], m1 | |
12940 | ||
12941 | ; mode 17 [row 7 - first half] | |
12942 | pslldq m6, 2 | |
12943 | pinsrb m6, [r3 + 6], 1 | |
12944 | pinsrb m6, [r3 + 7], 0 | |
12945 | pmaddubsw m1, m6, [r5 + 16 * 16] | |
12946 | pmulhrsw m1, m3 | |
12947 | packuswb m1, m1 | |
12948 | movh [r0 + 247 * 16], m1 | |
12949 | ||
12950 | ; mode 17 [row 8 - first half] | |
12951 | pslldq m6, 2 | |
12952 | pinsrb m6, [r3 + 7], 1 | |
12953 | pinsrb m6, [r3 + 9], 0 | |
12954 | pmaddubsw m1, m6, [r5 + 22 * 16] | |
12955 | pmulhrsw m1, m3 | |
12956 | packuswb m1, m1 | |
12957 | movh [r0 + 248 * 16], m1 | |
12958 | ||
12959 | ; mode 17 [row 9 - first half] | |
12960 | pslldq m6, 2 | |
12961 | pinsrb m6, [r3 + 9], 1 | |
12962 | pinsrb m6, [r3 + 10], 0 | |
12963 | pmaddubsw m1, m6, [r5 + 28 * 16] | |
12964 | pmulhrsw m1, m3 | |
12965 | packuswb m1, m1 | |
12966 | movh [r0 + 249 * 16], m1 | |
12967 | ||
12968 | ; mode 17 [row 10 - first half] | |
12969 | pmaddubsw m1, m6, [r5 + 2 * 16] | |
12970 | pmulhrsw m1, m3 | |
12971 | packuswb m1, m1 | |
12972 | movh [r0 + 250 * 16], m1 | |
12973 | ||
12974 | ; mode 17 [row 11 - first half] | |
12975 | pslldq m6, 2 | |
12976 | pinsrb m6, [r3 + 10], 1 | |
12977 | pinsrb m6, [r3 + 11], 0 | |
12978 | pmaddubsw m1, m6, [r5 + 8 * 16] | |
12979 | pmulhrsw m1, m3 | |
12980 | packuswb m1, m1 | |
12981 | movh [r0 + 251 * 16], m1 | |
12982 | ||
12983 | ; mode 17 [row 12 - first half] | |
12984 | pslldq m6, 2 | |
12985 | pinsrb m6, [r3 + 11], 1 | |
12986 | pinsrb m6, [r3 + 12], 0 | |
12987 | pmaddubsw m1, m6, [r5 + 14 * 16] | |
12988 | pmulhrsw m1, m3 | |
12989 | packuswb m1, m1 | |
12990 | movh [r0 + 252 * 16], m1 | |
12991 | ||
12992 | ; mode 17 [row 13 - first half] | |
12993 | pslldq m6, 2 | |
12994 | pinsrb m6, [r3 + 12], 1 | |
12995 | pinsrb m6, [r3 + 14], 0 | |
12996 | pmaddubsw m1, m6, [r5 + 20 * 16] | |
12997 | pmulhrsw m1, m3 | |
12998 | packuswb m1, m1 | |
12999 | movh [r0 + 253 * 16], m1 | |
13000 | ||
13001 | ; mode 17 [row 14 - first half] | |
13002 | pslldq m6, 2 | |
13003 | pinsrb m6, [r3 + 14], 1 | |
13004 | pinsrb m6, [r3 + 15], 0 | |
13005 | pmaddubsw m1, m6, [r5 + 26 * 16] | |
13006 | pmulhrsw m1, m3 | |
13007 | packuswb m1, m1 | |
13008 | movh [r0 + 254 * 16], m1 | |
13009 | ||
13010 | ; mode 16 [row 12 - second half] | |
13011 | pmaddubsw m1, m0, [r5 + 15 * 16] | |
13012 | pmulhrsw m1, m3 | |
13013 | packuswb m1, m1 | |
13014 | movh [r0 + 236 * 16 + 8], m1 | |
13015 | ; mode 16 [row 12 - second half] end | |
13016 | ||
13017 | ; mode 12 [row 6] | |
13018 | pslldq m2, m0, 2 | |
13019 | pinsrb m2, [r3 + 0], 1 | |
13020 | pinsrb m2, [r3 + 6], 0 | |
13021 | pmaddubsw m1, m2, [r5 + 29 * 16] | |
13022 | pmulhrsw m1, m3 | |
13023 | movu m0, [r4 + 7] | |
13024 | psrldq m4, m0, 1 | |
13025 | punpcklbw m0, m4 | |
13026 | pmaddubsw m4, m0, [r5 + 29 * 16] | |
13027 | pmulhrsw m4, m3 | |
13028 | packuswb m1, m4 | |
13029 | movu [r0 + 166 * 16], m1 | |
13030 | ||
13031 | ; mode 12 [row 7] | |
13032 | pmaddubsw m1, m2, [r5 + 24 * 16] | |
13033 | pmulhrsw m1, m3 | |
13034 | pmaddubsw m4, m0, [r5 + 24 * 16] | |
13035 | pmulhrsw m4, m3 | |
13036 | packuswb m1, m4 | |
13037 | movu [r0 + 167 * 16], m1 | |
13038 | ||
13039 | ; mode 12 [row 8] | |
13040 | pmaddubsw m1, m2, [r5 + 19 * 16] | |
13041 | pmulhrsw m1, m3 | |
13042 | pmaddubsw m4, m0, [r5 + 19 * 16] | |
13043 | pmulhrsw m4, m3 | |
13044 | packuswb m1, m4 | |
13045 | movu [r0 + 168 * 16], m1 | |
13046 | ||
13047 | ; mode 12 [row 9] | |
13048 | pmaddubsw m1, m2, [r5 + 14 * 16] | |
13049 | pmulhrsw m1, m3 | |
13050 | pmaddubsw m4, m0, [r5 + 14 * 16] | |
13051 | pmulhrsw m4, m3 | |
13052 | packuswb m1, m4 | |
13053 | movu [r0 + 169 * 16], m1 | |
13054 | ||
13055 | ; mode 12 [row 10] | |
13056 | pmaddubsw m1, m2, [r5 + 9 * 16] | |
13057 | pmulhrsw m1, m3 | |
13058 | pmaddubsw m4, m0, [r5 + 9 * 16] | |
13059 | pmulhrsw m4, m3 | |
13060 | packuswb m1, m4 | |
13061 | movu [r0 + 170 * 16], m1 | |
13062 | ||
13063 | ; mode 12 [row 11] | |
13064 | pmaddubsw m1, m2, [r5 + 4 * 16] | |
13065 | pmulhrsw m1, m3 | |
13066 | pmaddubsw m4, m0, [r5 + 4 * 16] | |
13067 | pmulhrsw m4, m3 | |
13068 | packuswb m1, m4 | |
13069 | movu [r0 + 171 * 16], m1 | |
13070 | ||
13071 | ; mode 13 [row 3] | |
13072 | pinsrb m7, m2, [r3 + 4], 0 | |
13073 | pmaddubsw m1, m7, [r5 + 28 * 16] | |
13074 | pmulhrsw m1, m3 | |
13075 | pmaddubsw m4, m0, [r5 + 28 * 16] | |
13076 | pmulhrsw m4, m3 | |
13077 | packuswb m1, m4 | |
13078 | movu [r0 + 179 * 16], m1 | |
13079 | ||
13080 | ; mode 13 [row 4] | |
13081 | pmaddubsw m1, m7, [r5 + 19 * 16] | |
13082 | pmulhrsw m1, m3 | |
13083 | pmaddubsw m4, m0, [r5 + 19 * 16] | |
13084 | pmulhrsw m4, m3 | |
13085 | packuswb m1, m4 | |
13086 | movu [r0 + 180 * 16], m1 | |
13087 | ||
13088 | ; mode 13 [row 5] | |
13089 | pmaddubsw m1, m7, [r5 + 10 * 16] | |
13090 | pmulhrsw m1, m3 | |
13091 | pmaddubsw m4, m0, [r5 + 10 * 16] | |
13092 | pmulhrsw m4, m3 | |
13093 | packuswb m1, m4 | |
13094 | movu [r0 + 181 * 16], m1 | |
13095 | ||
13096 | ; mode 13 [row 6] | |
13097 | pmaddubsw m1, m7, [r5 + 1 * 16] | |
13098 | pmulhrsw m1, m3 | |
13099 | pmaddubsw m4, m0, [r5 + 1 * 16] | |
13100 | pmulhrsw m4, m3 | |
13101 | packuswb m1, m4 | |
13102 | movu [r0 + 182 * 16], m1 | |
13103 | ||
13104 | ; mode 14 [row 2] | |
13105 | pinsrb m5, m7, [r3 + 2], 0 | |
13106 | pmaddubsw m1, m5, [r5 + 25 * 16] | |
13107 | pmulhrsw m1, m3 | |
13108 | pmaddubsw m4, m0, [r5 + 25 * 16] | |
13109 | pmulhrsw m4, m3 | |
13110 | packuswb m1, m4 | |
13111 | movu [r0 + 194 * 16], m1 | |
13112 | ||
13113 | ; mode 14 [row 3] | |
13114 | pmaddubsw m1, m5, [r5 + 12 * 16] | |
13115 | pmulhrsw m1, m3 | |
13116 | pmaddubsw m4, m0, [r5 + 12 * 16] | |
13117 | pmulhrsw m4, m3 | |
13118 | packuswb m1, m4 | |
13119 | movu [r0 + 195 * 16], m1 | |
13120 | ||
13121 | ; mode 15 [row 1] | |
13122 | pmaddubsw m1, m5, [r5 + 30 * 16] | |
13123 | pmulhrsw m1, m3 | |
13124 | pmaddubsw m4, m0, [r5 + 30 * 16] | |
13125 | pmulhrsw m4, m3 | |
13126 | packuswb m1, m4 | |
13127 | movu [r0 + 209 * 16], m1 | |
13128 | ||
13129 | ; mode 15 [row 2] | |
13130 | pmaddubsw m1, m5, [r5 + 13 * 16] | |
13131 | pmulhrsw m1, m3 | |
13132 | pmaddubsw m4, m0, [r5 + 13 * 16] | |
13133 | pmulhrsw m4, m3 | |
13134 | packuswb m1, m4 | |
13135 | movu [r0 + 210 * 16], m1 | |
13136 | ||
13137 | ; mode 16 [row 1] | |
13138 | pmaddubsw m1, m5, [r5 + 22 * 16] | |
13139 | pmulhrsw m1, m3 | |
13140 | pmaddubsw m4, m0, [r5 + 22 * 16] | |
13141 | pmulhrsw m4, m3 | |
13142 | packuswb m1, m4 | |
13143 | movu [r0 + 225 * 16], m1 | |
13144 | ||
13145 | ; mode 16 [row 2] | |
13146 | pmaddubsw m1, m5, [r5 + 1 * 16] | |
13147 | pmulhrsw m1, m3 | |
13148 | pmaddubsw m4, m0, [r5 + 1 * 16] | |
13149 | pmulhrsw m4, m3 | |
13150 | packuswb m1, m4 | |
13151 | movu [r0 + 226 * 16], m1 | |
13152 | ||
13153 | ; mode 16 [row 13 - second half] | |
13154 | pmaddubsw m1, m5, [r5 + 26 * 16] | |
13155 | pmulhrsw m1, m3 | |
13156 | packuswb m1, m1 | |
13157 | movh [r0 + 237 * 16 + 8], m1 | |
13158 | ; mode 16 [row 13 - second half] end | |
13159 | ||
13160 | ; mode 16 [row 14 - second half] | |
13161 | pmaddubsw m1, m5, [r5 + 5 * 16] | |
13162 | pmulhrsw m1, m3 | |
13163 | packuswb m1, m1 | |
13164 | movh [r0 + 238 * 16 + 8], m1 | |
13165 | ; mode 16 [row 14 - second half] end | |
13166 | ||
13167 | ; mode 16 [row 3 - first half] | |
13168 | pslldq m6, m5, 2 | |
13169 | pinsrb m6, [r3 + 2], 1 | |
13170 | pinsrb m6, [r3 + 3], 0 | |
13171 | pmaddubsw m1, m6, [r5 + 12 * 16] | |
13172 | pmulhrsw m1, m3 | |
13173 | packuswb m1, m1 | |
13174 | movh [r0 + 227 * 16], m1 | |
13175 | ||
13176 | ; mode 16 [row 15 - second half] | |
13177 | pmaddubsw m1, m6, [r5 + 16 * 16] | |
13178 | pmulhrsw m1, m3 | |
13179 | packuswb m1, m1 | |
13180 | movh [r0 + 239 * 16 + 8], m1 | |
13181 | ; mode 16 [row 15 - second half] end | |
13182 | ||
13183 | ; mode 16 [row 4- first half] | |
13184 | pslldq m6, 2 | |
13185 | pinsrb m6, [r3 + 3], 1 | |
13186 | pinsrb m6, [r3 + 5], 0 | |
13187 | pmaddubsw m1, m6, [r5 + 23 * 16] | |
13188 | pmulhrsw m1, m3 | |
13189 | packuswb m1, m1 | |
13190 | movh [r0 + 228 * 16], m1 | |
13191 | ||
13192 | ; mode 16 [row 5- first half] | |
13193 | pmaddubsw m1, m6, [r5 + 2 * 16] | |
13194 | pmulhrsw m1, m3 | |
13195 | packuswb m1, m1 | |
13196 | movh [r0 + 229 * 16], m1 | |
13197 | ||
13198 | ; mode 16 [row 6- first half] | |
13199 | pslldq m6, 2 | |
13200 | pinsrb m6, [r3 + 5], 1 | |
13201 | pinsrb m6, [r3 + 6], 0 | |
13202 | pmaddubsw m1, m6, [r5 + 13 * 16] | |
13203 | pmulhrsw m1, m3 | |
13204 | packuswb m1, m1 | |
13205 | movh [r0 + 230 * 16], m1 | |
13206 | ||
13207 | ; mode 16 [row 7- first half] | |
13208 | pslldq m6, 2 | |
13209 | pinsrb m6, [r3 + 6], 1 | |
13210 | pinsrb m6, [r3 + 8], 0 | |
13211 | pmaddubsw m1, m6, [r5 + 24 * 16] | |
13212 | pmulhrsw m1, m3 | |
13213 | packuswb m1, m1 | |
13214 | movh [r0 + 231 * 16], m1 | |
13215 | ||
13216 | ; mode 16 [row 8- first half] | |
13217 | pmaddubsw m1, m6, [r5 + 3 * 16] | |
13218 | pmulhrsw m1, m3 | |
13219 | packuswb m1, m1 | |
13220 | movh [r0 + 232 * 16], m1 | |
13221 | ; mode 16 [row 8 - first half] end | |
13222 | ||
13223 | ; mode 16 [row 9- first half] | |
13224 | pslldq m6, 2 | |
13225 | pinsrb m6, [r3 + 8], 1 | |
13226 | pinsrb m6, [r3 + 9], 0 | |
13227 | pmaddubsw m1, m6, [r5 + 14 * 16] | |
13228 | pmulhrsw m1, m3 | |
13229 | packuswb m1, m1 | |
13230 | movh [r0 + 233 * 16], m1 | |
13231 | ||
13232 | ; mode 16 [row 10 - first half] | |
13233 | pslldq m6, 2 | |
13234 | pinsrb m6, [r3 + 9], 1 | |
13235 | pinsrb m6, [r3 + 11], 0 | |
13236 | pmaddubsw m1, m6, [r5 + 25 * 16] | |
13237 | pmulhrsw m1, m3 | |
13238 | packuswb m1, m1 | |
13239 | movh [r0 + 234 * 16], m1 | |
13240 | ||
13241 | ; mode 16 [row 11 - first half] | |
13242 | pmaddubsw m1, m6, [r5 + 4 * 16] | |
13243 | pmulhrsw m1, m3 | |
13244 | packuswb m1, m1 | |
13245 | movh [r0 + 235 * 16], m1 | |
13246 | ||
13247 | ; mode 16 [row 12 - first half] | |
13248 | pslldq m6, 2 | |
13249 | pinsrb m6, [r3 + 11], 1 | |
13250 | pinsrb m6, [r3 + 12], 0 | |
13251 | pmaddubsw m1, m6, [r5 + 15 * 16] | |
13252 | pmulhrsw m1, m3 | |
13253 | packuswb m1, m1 | |
13254 | movh [r0 + 236 * 16], m1 | |
13255 | ||
13256 | ; mode 16 [row 13 - first half] | |
13257 | pslldq m6, 2 | |
13258 | pinsrb m6, [r3 + 12], 1 | |
13259 | pinsrb m6, [r3 + 14], 0 | |
13260 | pmaddubsw m1, m6, [r5 + 26 * 16] | |
13261 | pmulhrsw m1, m3 | |
13262 | packuswb m1, m1 | |
13263 | movh [r0 + 237 * 16], m1 | |
13264 | ||
13265 | ; mode 16 [row 14 - first half] | |
13266 | pmaddubsw m1, m6, [r5 + 5 * 16] | |
13267 | pmulhrsw m1, m3 | |
13268 | packuswb m1, m1 | |
13269 | movh [r0 + 238 * 16], m1 | |
13270 | ||
13271 | ; mode 16 [row 15 - first half] | |
13272 | pslldq m6, 2 | |
13273 | pinsrb m6, [r3 + 14], 1 | |
13274 | pinsrb m6, [r3 + 15], 0 | |
13275 | pmaddubsw m1, m6, [r5 + 16 * 16] | |
13276 | pmulhrsw m1, m3 | |
13277 | packuswb m1, m1 | |
13278 | movh [r0 + 239 * 16], m1 | |
13279 | ||
13280 | ; mode 14 [row 4] | |
13281 | pslldq m5, 2 | |
13282 | pinsrb m5, [r3 + 2], 1 | |
13283 | pinsrb m5, [r3 + 5], 0 | |
13284 | movu m4, [r4 + 6] | |
13285 | psrldq m0, m4, 1 | |
13286 | punpcklbw m4, m0 | |
13287 | ||
13288 | ; mode 16 [row 3 - second half] | |
13289 | pmaddubsw m1, m4, [r5 + 12 * 16] | |
13290 | pmulhrsw m1, m3 | |
13291 | packuswb m1, m1 | |
13292 | movh [r0 + 227 * 16 + 8], m1 | |
13293 | ||
13294 | ; mode 16 [row 3 - second half] end | |
13295 | pmaddubsw m1, m5, [r5 + 31 * 16] | |
13296 | pmulhrsw m1, m3 | |
13297 | pmaddubsw m0, m4, [r5 + 31 * 16] | |
13298 | pmulhrsw m0, m3 | |
13299 | packuswb m1, m0 | |
13300 | movu [r0 + 196 * 16], m1 | |
13301 | ||
13302 | ; mode 14 [row 5] | |
13303 | pmaddubsw m1, m5, [r5 + 18 * 16] | |
13304 | pmulhrsw m1, m3 | |
13305 | pmaddubsw m0, m4, [r5 + 18 * 16] | |
13306 | pmulhrsw m0, m3 | |
13307 | packuswb m1, m0 | |
13308 | movu [r0 + 197 * 16], m1 | |
13309 | ||
13310 | ; mode 14 [row 6] | |
13311 | pmaddubsw m1, m5, [r5 + 5 * 16] | |
13312 | pmulhrsw m1, m3 | |
13313 | pmaddubsw m0, m4, [r5 + 5 * 16] | |
13314 | pmulhrsw m0, m3 | |
13315 | packuswb m1, m0 | |
13316 | movu [r0 + 198 * 16], m1 | |
13317 | ||
13318 | ; mode 15 [row 3] | |
13319 | movu m6, m5 | |
13320 | pinsrb m6, [r3 + 4], 0 | |
13321 | pmaddubsw m1, m6, [r5 + 28 * 16] | |
13322 | pmulhrsw m1, m3 | |
13323 | pmaddubsw m0, m4, [r5 + 28 * 16] | |
13324 | pmulhrsw m0, m3 | |
13325 | packuswb m1, m0 | |
13326 | movu [r0 + 211 * 16], m1 | |
13327 | ||
13328 | ; mode 15 [row 4] | |
13329 | pmaddubsw m1, m6, [r5 + 11 * 16] | |
13330 | pmulhrsw m1, m3 | |
13331 | pmaddubsw m0, m4, [r5 + 11 * 16] | |
13332 | pmulhrsw m0, m3 | |
13333 | packuswb m1, m0 | |
13334 | movu [r0 + 212 * 16], m1 | |
13335 | ||
13336 | ; mode 15 [row 5 - first half] | |
13337 | pslldq m6, 2 | |
13338 | pinsrb m6, [r3 + 4], 1 | |
13339 | pinsrb m6, [r3 + 6], 0 | |
13340 | pmaddubsw m1, m6, [r5 + 26 * 16] | |
13341 | pmulhrsw m1, m3 | |
13342 | packuswb m1, m1 | |
13343 | movh [r0 + 213 * 16], m1 | |
13344 | ||
13345 | ; mode 15 [row 6 - first half] | |
13346 | pmaddubsw m1, m6, [r5 + 9 * 16] | |
13347 | pmulhrsw m1, m3 | |
13348 | packuswb m1, m1 | |
13349 | movh [r0 + 214 * 16], m1 | |
13350 | ||
13351 | ; mode 15 [row 7 - first half] | |
13352 | pslldq m6, 2 | |
13353 | pinsrb m6, [r3 + 6], 1 | |
13354 | pinsrb m6, [r3 + 8], 0 | |
13355 | pmaddubsw m1, m6, [r5 + 24 * 16] | |
13356 | pmulhrsw m1, m3 | |
13357 | packuswb m1, m1 | |
13358 | movh [r0 + 215 * 16], m1 | |
13359 | ||
13360 | ; mode 15 [row 8 - first half] | |
13361 | pmaddubsw m1, m6, [r5 + 7 * 16] | |
13362 | pmulhrsw m1, m3 | |
13363 | packuswb m1, m1 | |
13364 | movh [r0 + 216 * 16], m1 | |
13365 | ||
13366 | ; mode 15 [row 9 - first half] | |
13367 | pslldq m6, 2 | |
13368 | pinsrb m6, [r3 + 8], 1 | |
13369 | pinsrb m6, [r3 + 9], 0 | |
13370 | pmaddubsw m1, m6, [r5 + 22 * 16] | |
13371 | pmulhrsw m1, m3 | |
13372 | packuswb m1, m1 | |
13373 | movh [r0 + 217 * 16], m1 | |
13374 | ||
13375 | ; mode 15 [row 10 - first half] | |
13376 | pmaddubsw m1, m6, [r5 + 5 * 16] | |
13377 | pmulhrsw m1, m3 | |
13378 | packuswb m1, m1 | |
13379 | movh [r0 + 218 * 16], m1 | |
13380 | ||
13381 | ; mode 15 [row 11 - first half] | |
13382 | pslldq m6, 2 | |
13383 | pinsrb m6, [r3 + 9], 1 | |
13384 | pinsrb m6, [r3 + 11], 0 | |
13385 | pmaddubsw m1, m6, [r5 + 20 * 16] | |
13386 | pmulhrsw m1, m3 | |
13387 | packuswb m1, m1 | |
13388 | movh [r0 + 219 * 16], m1 | |
13389 | ||
13390 | ; mode 15 [row 12 - first half] | |
13391 | pmaddubsw m1, m6, [r5 + 3 * 16] | |
13392 | pmulhrsw m1, m3 | |
13393 | packuswb m1, m1 | |
13394 | movh [r0 + 220 * 16], m1 | |
13395 | ||
13396 | ; mode 15 [row 13 - first half] | |
13397 | pslldq m6, 2 | |
13398 | pinsrb m6, [r3 + 11], 1 | |
13399 | pinsrb m6, [r3 + 13], 0 | |
13400 | pmaddubsw m1, m6, [r5 + 18 * 16] | |
13401 | pmulhrsw m1, m3 | |
13402 | packuswb m1, m1 | |
13403 | movh [r0 + 221 * 16], m1 | |
13404 | ||
13405 | ; mode 15 [row 14 - first half] | |
13406 | pmaddubsw m1, m6, [r5 + 1 * 16] | |
13407 | pmulhrsw m1, m3 | |
13408 | packuswb m1, m1 | |
13409 | movh [r0 + 222 * 16], m1 | |
13410 | ||
13411 | ; mode 15 [row 15 - first half] | |
13412 | pslldq m6, 2 | |
13413 | pinsrb m6, [r3 + 13], 1 | |
13414 | pinsrb m6, [r3 + 15], 0 | |
13415 | pmaddubsw m1, m6, [r5 + 16 * 16] | |
13416 | pmulhrsw m1, m3 | |
13417 | packuswb m1, m1 | |
13418 | movh [r0 + 223 * 16], m1 | |
13419 | ||
13420 | ; mode 14 [row 7] | |
13421 | pslldq m5, 2 | |
13422 | pinsrb m5, [r3 + 5], 1 | |
13423 | pinsrb m5, [r3 + 7], 0 | |
13424 | movu m0, [r4 + 5] | |
13425 | psrldq m6, m0, 1 | |
13426 | punpcklbw m0, m6 | |
13427 | ||
13428 | ; mode 15 [row 5 - second half] | |
13429 | pmaddubsw m1, m0, [r5 + 26 * 16] | |
13430 | pmulhrsw m1, m3 | |
13431 | packuswb m1, m1 | |
13432 | movh [r0 + 213 * 16 + 8], m1 | |
13433 | ; mode 15 [row 5 - second half] end | |
13434 | ||
13435 | ; mode 15 [row 6 - second half] | |
13436 | pmaddubsw m1, m0, [r5 + 9 * 16] | |
13437 | pmulhrsw m1, m3 | |
13438 | packuswb m1, m1 | |
13439 | movh [r0 + 214 * 16 + 8], m1 | |
13440 | ; mode 15 [row 6 - second half] end | |
13441 | ||
13442 | ; mode 16 [row 4 - second half] | |
13443 | pmaddubsw m1, m0, [r5 + 23 * 16] | |
13444 | pmulhrsw m1, m3 | |
13445 | packuswb m1, m1 | |
13446 | movh [r0 + 228 * 16 + 8], m1 | |
13447 | ; mode 16 [row 4 - second half] end | |
13448 | ||
13449 | ; mode 16 [row 5 - second half] | |
13450 | pmaddubsw m1, m0, [r5 + 2 * 16] | |
13451 | pmulhrsw m1, m3 | |
13452 | packuswb m1, m1 | |
13453 | movh [r0 + 229 * 16 + 8], m1 | |
13454 | ||
13455 | ; mode 16 [row 5 - second half] end | |
13456 | pmaddubsw m1, m5, [r5 + 24 * 16] | |
13457 | pmulhrsw m1, m3 | |
13458 | pmaddubsw m6, m0, [r5 + 24 * 16] | |
13459 | pmulhrsw m6, m3 | |
13460 | packuswb m1, m6 | |
13461 | movu [r0 + 199 * 16], m1 | |
13462 | ||
13463 | ; mode 14 [row 8] | |
13464 | pmaddubsw m1, m5, [r5 + 11 * 16] | |
13465 | pmulhrsw m1, m3 | |
13466 | pmaddubsw m6, m0, [r5 + 11 * 16] | |
13467 | pmulhrsw m6, m3 | |
13468 | packuswb m1, m6 | |
13469 | movu [r0 + 200 * 16], m1 | |
13470 | ||
13471 | ; mode 14 [row 9] | |
13472 | pslldq m5, 2 | |
13473 | pinsrb m5, [r3 + 7], 1 | |
13474 | pinsrb m5, [r3 + 10], 0 | |
13475 | movu m0, [r4 + 4] | |
13476 | psrldq m6, m0, 1 | |
13477 | punpcklbw m0, m6 | |
13478 | ||
13479 | ; mode 15 [row 7 - second half] | |
13480 | pmaddubsw m1, m0, [r5 + 24 * 16] | |
13481 | pmulhrsw m1, m3 | |
13482 | packuswb m1, m1 | |
13483 | movh [r0 + 215 * 16 + 8], m1 | |
13484 | ; mode 15 [row 7 - second half] end | |
13485 | ||
13486 | ; mode 15 [row 8 - second half] | |
13487 | pmaddubsw m1, m0, [r5 + 7 * 16] | |
13488 | pmulhrsw m1, m3 | |
13489 | packuswb m1, m1 | |
13490 | movh [r0 + 216 * 16 + 8], m1 | |
13491 | ; mode 15 [row 8 - second half] end | |
13492 | ||
13493 | ; mode 16 [row 6 - second half] | |
13494 | pmaddubsw m1, m0, [r5 + 13 * 16] | |
13495 | pmulhrsw m1, m3 | |
13496 | packuswb m1, m1 | |
13497 | movh [r0 + 230 * 16 + 8], m1 | |
13498 | ; mode 16 [row 6 - second half] end | |
13499 | ||
13500 | ; mode 14 [row 9] - compute and store | |
13501 | pmaddubsw m1, m5, [r5 + 30 * 16] | |
13502 | pmulhrsw m1, m3 | |
13503 | pmaddubsw m6, m0, [r5 + 30 * 16] | |
13504 | pmulhrsw m6, m3 | |
13505 | packuswb m1, m6 | |
13506 | movu [r0 + 201 * 16], m1 | |
13507 | ||
13508 | ; mode 14 [row 10] | |
13509 | pmaddubsw m1, m5, [r5 + 17 * 16] | |
13510 | pmulhrsw m1, m3 | |
13511 | pmaddubsw m6, m0, [r5 + 17 * 16] | |
13512 | pmulhrsw m6, m3 | |
13513 | packuswb m1, m6 | |
13514 | movu [r0 + 202 * 16], m1 | |
13515 | ||
13516 | ; mode 14 [row 11] | |
13517 | pmaddubsw m1, m5, [r5 + 4 * 16] | |
13518 | pmulhrsw m1, m3 | |
13519 | pmaddubsw m6, m0, [r5 + 4 * 16] | |
13520 | pmulhrsw m6, m3 | |
13521 | packuswb m1, m6 | |
13522 | movu [r0 + 203 * 16], m1 | |
13523 | ||
13524 | ; mode 14 [row 12] | |
13525 | pslldq m5, 2 | |
13526 | pinsrb m5, [r3 + 10], 1 | |
13527 | pinsrb m5, [r3 + 12], 0 | |
13528 | movu m0, [r4 + 3] | |
13529 | psrldq m6, m0, 1 | |
13530 | punpcklbw m0, m6 | |
13531 | ||
13532 | ; mode 15 [row 9 - second half] | |
13533 | pmaddubsw m1, m0, [r5 + 22 * 16] | |
13534 | pmulhrsw m1, m3 | |
13535 | packuswb m1, m1 | |
13536 | movh [r0 + 217 * 16 + 8], m1 | |
13537 | ; mode 15 [row 9 - second half] end | |
13538 | ||
13539 | ; mode 15 [row 10 - second half] | |
13540 | pmaddubsw m1, m0, [r5 + 5 * 16] | |
13541 | pmulhrsw m1, m3 | |
13542 | packuswb m1, m1 | |
13543 | movh [r0 + 218 * 16 + 8], m1 | |
13544 | ; mode 15 [row 10 - second half] end | |
13545 | ||
13546 | ; mode 16 [row 7 - second half] | |
13547 | pmaddubsw m1, m0, [r5 + 24 * 16] | |
13548 | pmulhrsw m1, m3 | |
13549 | packuswb m1, m1 | |
13550 | movh [r0 + 231 * 16 + 8], m1 | |
13551 | ; mode 16 [row 7 - second half] end | |
13552 | ||
13553 | ; mode 16 [row 8 - second half] | |
13554 | pmaddubsw m1, m0, [r5 + 3 * 16] | |
13555 | pmulhrsw m1, m3 | |
13556 | packuswb m1, m1 | |
13557 | movh [r0 + 232 * 16 + 8], m1 | |
13558 | ; mode 16 [row 8 - second half] end | |
13559 | ||
13560 | pmaddubsw m1, m5, [r5 + 23 * 16] | |
13561 | pmulhrsw m1, m3 | |
13562 | pmaddubsw m6, m0, [r5 + 23 * 16] | |
13563 | pmulhrsw m6, m3 | |
13564 | packuswb m1, m6 | |
13565 | movu [r0 + 204 * 16], m1 | |
13566 | ||
13567 | ; mode 14 [row 13] | |
13568 | pmaddubsw m1, m5, [r5 + 10 * 16] | |
13569 | pmulhrsw m1, m3 | |
13570 | pmaddubsw m6, m0, [r5 + 10 * 16] | |
13571 | pmulhrsw m6, m3 | |
13572 | packuswb m1, m6 | |
13573 | movu [r0 + 205 * 16], m1 | |
13574 | ||
13575 | ; mode 14 [row 14] | |
13576 | pslldq m5, 2 | |
13577 | pinsrb m5, [r3 + 12], 1 | |
13578 | pinsrb m5, [r3 + 15], 0 | |
13579 | movu m0, [r4 + 2] | |
13580 | psrldq m6, m0, 1 | |
13581 | punpcklbw m0, m6 | |
13582 | ||
13583 | ; mode 15 [row 11 - second half] | |
13584 | pmaddubsw m1, m0, [r5 + 20 * 16] | |
13585 | pmulhrsw m1, m3 | |
13586 | packuswb m1, m1 | |
13587 | movh [r0 + 219 * 16 + 8], m1 | |
13588 | ; mode 15 [row 11 - second half] end | |
13589 | ||
13590 | ; mode 15 [row 12 - second half] | |
13591 | pmaddubsw m1, m0, [r5 + 3 * 16] | |
13592 | pmulhrsw m1, m3 | |
13593 | packuswb m1, m1 | |
13594 | movh [r0 + 220 * 16 + 8], m1 | |
13595 | ; mode 15 [row 12 - second half] end | |
13596 | ||
13597 | ; mode 16 [row 9 - second half] | |
13598 | pmaddubsw m1, m0, [r5 + 14 * 16] | |
13599 | pmulhrsw m1, m3 | |
13600 | packuswb m1, m1 | |
13601 | movh [r0 + 233 * 16 + 8], m1 | |
13602 | ||
13603 | ; mode 16 [row 9 - second half] end | |
13604 | pmaddubsw m1, m5, [r5 + 29 * 16] | |
13605 | pmulhrsw m1, m3 | |
13606 | pmaddubsw m6, m0, [r5 + 29 * 16] | |
13607 | pmulhrsw m6, m3 | |
13608 | packuswb m1, m6 | |
13609 | movu [r0 + 206 * 16], m1 | |
13610 | ||
13611 | ; mode 14 [row 15] | |
13612 | pmaddubsw m1, m5, [r5 + 16 * 16] | |
13613 | pmulhrsw m1, m3 | |
13614 | pmaddubsw m6, m0, [r5 + 16 * 16] | |
13615 | pmulhrsw m6, m3 | |
13616 | packuswb m1, m6 | |
13617 | movu [r0 + 207 * 16], m1 | |
13618 | ||
13619 | ; mode 12 [row 12] | |
13620 | pslldq m0, m2, 2 | |
13621 | pinsrb m0, [r3 + 6], 1 | |
13622 | pinsrb m0, [r3 + 13], 0 | |
13623 | pmaddubsw m1, m0, [r5 + 31 * 16] | |
13624 | pmulhrsw m1, m3 | |
13625 | pmaddubsw m5, m4, [r5 + 31 * 16] | |
13626 | pmulhrsw m5, m3 | |
13627 | packuswb m1, m5 | |
13628 | movu [r0 + 172 * 16], m1 | |
13629 | ||
13630 | ; mode 12 [row 13] | |
13631 | pmaddubsw m1, m0, [r5 + 26 * 16] | |
13632 | pmulhrsw m1, m3 | |
13633 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
13634 | pmulhrsw m5, m3 | |
13635 | packuswb m1, m5 | |
13636 | movu [r0 + 173 * 16], m1 | |
13637 | ||
13638 | ; mode 12 [row 14] | |
13639 | pmaddubsw m1, m0, [r5 + 21 * 16] | |
13640 | pmulhrsw m1, m3 | |
13641 | pmaddubsw m5, m4, [r5 + 21 * 16] | |
13642 | pmulhrsw m5, m3 | |
13643 | packuswb m1, m5 | |
13644 | movu [r0 + 174 * 16], m1 | |
13645 | ||
13646 | ; mode 12 [row 15] | |
13647 | pmaddubsw m1, m0, [r5 + 16 * 16] | |
13648 | pmulhrsw m1, m3 | |
13649 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
13650 | pmulhrsw m5, m3 | |
13651 | packuswb m1, m5 | |
13652 | movu [r0 + 175 * 16], m1 | |
13653 | ||
13654 | ; mode 13 [row 7] | |
13655 | pslldq m7, 2 | |
13656 | pinsrb m7, [r3 + 4], 1 | |
13657 | pinsrb m7, [r3 + 7], 0 | |
13658 | pmaddubsw m1, m7, [r5 + 24 * 16] | |
13659 | pmulhrsw m1, m3 | |
13660 | pmaddubsw m5, m4, [r5 + 24 * 16] | |
13661 | pmulhrsw m5, m3 | |
13662 | packuswb m1, m5 | |
13663 | movu [r0 + 183 * 16], m1 | |
13664 | ||
13665 | ; mode 13 [row 8] | |
13666 | pmaddubsw m1, m7, [r5 + 15 * 16] | |
13667 | pmulhrsw m1, m3 | |
13668 | pmaddubsw m5, m4, [r5 + 15 * 16] | |
13669 | pmulhrsw m5, m3 | |
13670 | packuswb m1, m5 | |
13671 | movu [r0 + 184 * 16], m1 | |
13672 | ||
13673 | ; mode 13 [row 9] | |
13674 | pmaddubsw m1, m7, [r5 + 6 * 16] | |
13675 | pmulhrsw m1, m3 | |
13676 | pmaddubsw m5, m4, [r5 + 6 * 16] | |
13677 | pmulhrsw m5, m3 | |
13678 | packuswb m1, m5 | |
13679 | movu [r0 + 185 * 16], m1 | |
13680 | ||
13681 | ; mode 13 [row 10] | |
13682 | pslldq m7, 2 | |
13683 | pinsrb m7, [r3 + 7], 1 | |
13684 | pinsrb m7, [r3 + 11], 0 | |
13685 | pmaddubsw m1, m7, [r5 + 29 * 16] | |
13686 | pmulhrsw m1, m3 | |
13687 | movu m4, [r4 + 5] | |
13688 | psrldq m5, m4, 1 | |
13689 | punpcklbw m4, m5 | |
13690 | pmaddubsw m5, m4, [r5 + 29 * 16] | |
13691 | pmulhrsw m5, m3 | |
13692 | packuswb m1, m5 | |
13693 | movu [r0 + 186 * 16], m1 | |
13694 | ||
13695 | ; mode 13 [row 11] | |
13696 | pmaddubsw m1, m7, [r5 + 20 * 16] | |
13697 | pmulhrsw m1, m3 | |
13698 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
13699 | pmulhrsw m5, m3 | |
13700 | packuswb m1, m5 | |
13701 | movu [r0 + 187 * 16], m1 | |
13702 | ||
13703 | ; mode 13 [row 12] | |
13704 | pmaddubsw m1, m7, [r5 + 11 * 16] | |
13705 | pmulhrsw m1, m3 | |
13706 | pmaddubsw m5, m4, [r5 + 11 * 16] | |
13707 | pmulhrsw m5, m3 | |
13708 | packuswb m1, m5 | |
13709 | movu [r0 + 188 * 16], m1 | |
13710 | ||
13711 | ; mode 13 [row 13] | |
13712 | pmaddubsw m1, m7, [r5 + 2 * 16] | |
13713 | pmulhrsw m1, m3 | |
13714 | pmaddubsw m5, m4, [r5 + 2 * 16] | |
13715 | pmulhrsw m5, m3 | |
13716 | packuswb m1, m5 | |
13717 | movu [r0 + 189 * 16], m1 | |
13718 | ||
13719 | ; mode 13 [row 14] | |
13720 | pslldq m7, 2 | |
13721 | pinsrb m7, [r3 + 11], 1 | |
13722 | pinsrb m7, [r3 + 14], 0 | |
13723 | pmaddubsw m1, m7, [r5 + 25 * 16] | |
13724 | pmulhrsw m1, m3 | |
13725 | movu m4, [r4 + 4] | |
13726 | psrldq m5, m4, 1 | |
13727 | punpcklbw m4, m5 | |
13728 | pmaddubsw m5, m4, [r5 + 25 * 16] | |
13729 | pmulhrsw m5, m3 | |
13730 | packuswb m1, m5 | |
13731 | movu [r0 + 190 * 16], m1 | |
13732 | ||
13733 | ; mode 13 [row 15] | |
13734 | pmaddubsw m1, m7, [r5 + 16 * 16] | |
13735 | pmulhrsw m1, m3 | |
13736 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
13737 | pmulhrsw m5, m3 | |
13738 | packuswb m1, m5 | |
13739 | movu [r0 + 191 * 16], m1 | |
13740 | ||
13741 | ; mode 17 [row 15] | |
13742 | movu m0, [r3] | |
13743 | pshufb m1, m0, [tab_S1] | |
13744 | movu [r0 + 255 * 16], m1 | |
13745 | movu m2, [r4] | |
13746 | movd [r0 + 255 * 16 + 12], m2 | |
13747 | ||
13748 | ; mode 18 [row 0] | |
13749 | movu [r0 + 256 * 16], m0 | |
13750 | ||
13751 | ; mode 18 [row 1] | |
13752 | pslldq m4, m0, 1 | |
13753 | pinsrb m4, [r4 + 1], 0 | |
13754 | movu [r0 + 257 * 16], m4 | |
13755 | pslldq m4, 1 | |
13756 | pinsrb m4, [r4 + 2], 0 | |
13757 | movu [r0 + 258 * 16], m4 | |
13758 | pslldq m4, 1 | |
13759 | pinsrb m4, [r4 + 3], 0 | |
13760 | movu [r0 + 259 * 16], m4 | |
13761 | pslldq m4, 1 | |
13762 | pinsrb m4, [r4 + 4], 0 | |
13763 | movu [r0 + 260 * 16], m4 | |
13764 | pslldq m4, 1 | |
13765 | pinsrb m4, [r4 + 5], 0 | |
13766 | movu [r0 + 261 * 16], m4 | |
13767 | pslldq m4, 1 | |
13768 | pinsrb m4, [r4 + 6], 0 | |
13769 | movu [r0 + 262 * 16], m4 | |
13770 | pslldq m4, 1 | |
13771 | pinsrb m4, [r4 + 7], 0 | |
13772 | movu [r0 + 263 * 16], m4 | |
13773 | pslldq m4, 1 | |
13774 | pinsrb m4, [r4 + 8], 0 | |
13775 | movu [r0 + 264 * 16], m4 | |
13776 | pslldq m4, 1 | |
13777 | pinsrb m4, [r4 + 9], 0 | |
13778 | movu [r0 + 265 * 16], m4 | |
13779 | pslldq m4, 1 | |
13780 | pinsrb m4, [r4 + 10], 0 | |
13781 | movu [r0 + 266 * 16], m4 | |
13782 | pslldq m4, 1 | |
13783 | pinsrb m4, [r4 + 11], 0 | |
13784 | movu [r0 + 267 * 16], m4 | |
13785 | pslldq m4, 1 | |
13786 | pinsrb m4, [r4 + 12], 0 | |
13787 | movu [r0 + 268 * 16], m4 | |
13788 | pslldq m4, 1 | |
13789 | pinsrb m4, [r4 + 13], 0 | |
13790 | movu [r0 + 269 * 16], m4 | |
13791 | pslldq m4, 1 | |
13792 | pinsrb m4, [r4 + 14], 0 | |
13793 | movu [r0 + 270 * 16], m4 | |
13794 | pslldq m4, 1 | |
13795 | pinsrb m4, [r4 + 15], 0 | |
13796 | movu [r0 + 271 * 16], m4 | |
13797 | ||
13798 | ; mode 19 [row 0] | |
13799 | psrldq m2, m0, 1 | |
13800 | punpcklbw m0, m2 | |
13801 | movu m5, [r3 + 8] | |
13802 | psrldq m6, m5, 1 | |
13803 | punpcklbw m5, m6 | |
13804 | pmaddubsw m4, m0, [r5 + 6 * 16] | |
13805 | pmulhrsw m4, m3 | |
13806 | pmaddubsw m6, m5, [r5 + 6 * 16] | |
13807 | pmulhrsw m6, m3 | |
13808 | packuswb m4, m6 | |
13809 | movu [r0 + 272 * 16], m4 | |
13810 | ||
13811 | ; mode 20 [row 0] | |
13812 | pmaddubsw m4, m0, [r5 + 11 * 16] | |
13813 | pmulhrsw m4, m3 | |
13814 | pmaddubsw m6, m5, [r5 + 11 * 16] | |
13815 | pmulhrsw m6, m3 | |
13816 | packuswb m4, m6 | |
13817 | movu [r0 + 288 * 16], m4 | |
13818 | ||
13819 | ; mode 21 [row 0] | |
13820 | pmaddubsw m4, m0, [r5 + 15 * 16] | |
13821 | pmulhrsw m4, m3 | |
13822 | pmaddubsw m6, m5, [r5 + 15 * 16] | |
13823 | pmulhrsw m6, m3 | |
13824 | packuswb m4, m6 | |
13825 | movu [r0 + 304 * 16], m4 | |
13826 | ||
13827 | ; mode 22 [row 0] | |
13828 | pmaddubsw m4, m0, [r5 + 19 * 16] | |
13829 | pmulhrsw m4, m3 | |
13830 | pmaddubsw m6, m5, [r5 + 19 * 16] | |
13831 | pmulhrsw m6, m3 | |
13832 | packuswb m4, m6 | |
13833 | movu [r0 + 320 * 16], m4 | |
13834 | ||
13835 | ; mode 22 [row 1] | |
13836 | pmaddubsw m4, m0, [r5 + 6 * 16] | |
13837 | pmulhrsw m4, m3 | |
13838 | pmaddubsw m6, m5, [r5 + 6 * 16] | |
13839 | pmulhrsw m6, m3 | |
13840 | packuswb m4, m6 | |
13841 | movu [r0 + 321 * 16], m4 | |
13842 | ||
13843 | ; mode 23 [row 0] | |
13844 | pmaddubsw m4, m0, [r5 + 23 * 16] | |
13845 | pmulhrsw m4, m3 | |
13846 | pmaddubsw m6, m5, [r5 + 23 * 16] | |
13847 | pmulhrsw m6, m3 | |
13848 | packuswb m4, m6 | |
13849 | movu [r0 + 336 * 16], m4 | |
13850 | ||
13851 | ; mode 23 [row 1] | |
13852 | pmaddubsw m4, m0, [r5 + 14 * 16] | |
13853 | pmulhrsw m4, m3 | |
13854 | pmaddubsw m6, m5, [r5 + 14 * 16] | |
13855 | pmulhrsw m6, m3 | |
13856 | packuswb m4, m6 | |
13857 | movu [r0 + 337 * 16], m4 | |
13858 | ||
13859 | ; mode 23 [row 2] | |
13860 | pmaddubsw m4, m0, [r5 + 5 * 16] | |
13861 | pmulhrsw m4, m3 | |
13862 | pmaddubsw m6, m5, [r5 + 5 * 16] | |
13863 | pmulhrsw m6, m3 | |
13864 | packuswb m4, m6 | |
13865 | movu [r0 + 338 * 16], m4 | |
13866 | ||
13867 | ; mode 24 [row 0] | |
13868 | pmaddubsw m4, m0, [r5 + 27 * 16] | |
13869 | pmulhrsw m4, m3 | |
13870 | pmaddubsw m6, m5, [r5 + 27 * 16] | |
13871 | pmulhrsw m6, m3 | |
13872 | packuswb m4, m6 | |
13873 | movu [r0 + 352 * 16], m4 | |
13874 | ||
13875 | ; mode 24 [row 1] | |
13876 | pmaddubsw m4, m0, [r5 + 22 * 16] | |
13877 | pmulhrsw m4, m3 | |
13878 | pmaddubsw m6, m5, [r5 + 22 * 16] | |
13879 | pmulhrsw m6, m3 | |
13880 | packuswb m4, m6 | |
13881 | movu [r0 + 353 * 16], m4 | |
13882 | ||
13883 | ; mode 24 [row 2] | |
13884 | pmaddubsw m4, m0, [r5 + 17 * 16] | |
13885 | pmulhrsw m4, m3 | |
13886 | pmaddubsw m6, m5, [r5 + 17 * 16] | |
13887 | pmulhrsw m6, m3 | |
13888 | packuswb m4, m6 | |
13889 | movu [r0 + 354 * 16], m4 | |
13890 | ||
13891 | ; mode 24 [row 3] | |
13892 | pmaddubsw m4, m0, [r5 + 12 * 16] | |
13893 | pmulhrsw m4, m3 | |
13894 | pmaddubsw m6, m5, [r5 + 12 * 16] | |
13895 | pmulhrsw m6, m3 | |
13896 | packuswb m4, m6 | |
13897 | movu [r0 + 355 * 16], m4 | |
13898 | ||
13899 | ; mode 24 [row 4] | |
13900 | pmaddubsw m4, m0, [r5 + 7 * 16] | |
13901 | pmulhrsw m4, m3 | |
13902 | pmaddubsw m6, m5, [r5 + 7 * 16] | |
13903 | pmulhrsw m6, m3 | |
13904 | packuswb m4, m6 | |
13905 | movu [r0 + 356 * 16], m4 | |
13906 | ||
13907 | ; mode 24 [row 5] | |
13908 | pmaddubsw m4, m0, [r5 + 2 * 16] | |
13909 | pmulhrsw m4, m3 | |
13910 | pmaddubsw m6, m5, [r5 + 2 * 16] | |
13911 | pmulhrsw m6, m3 | |
13912 | packuswb m4, m6 | |
13913 | movu [r0 + 357 * 16], m4 | |
13914 | ||
13915 | ; mode 24 [row 6 - first half] | |
13916 | pslldq m7, m0, 2 | |
13917 | pinsrb m7, [r4 + 0], 1 | |
13918 | pinsrb m7, [r4 + 6], 0 | |
13919 | pmaddubsw m4, m7, [r5 + 29 * 16] | |
13920 | pmulhrsw m4, m3 | |
13921 | packuswb m4, m4 | |
13922 | movh [r0 + 358 * 16], m4 | |
13923 | ||
13924 | ; mode 24 [row 7 - first half] | |
13925 | pmaddubsw m4, m7, [r5 + 24 * 16] | |
13926 | pmulhrsw m4, m3 | |
13927 | packuswb m4, m4 | |
13928 | movh [r0 + 359 * 16], m4 | |
13929 | ||
13930 | ; mode 24 [row 8 - first half] | |
13931 | pmaddubsw m4, m7, [r5 + 19 * 16] | |
13932 | pmulhrsw m4, m3 | |
13933 | packuswb m4, m4 | |
13934 | movh [r0 + 360 * 16], m4 | |
13935 | ||
13936 | ; mode 24 [row 9 - first half] | |
13937 | pmaddubsw m4, m7, [r5 + 14 * 16] | |
13938 | pmulhrsw m4, m3 | |
13939 | packuswb m4, m4 | |
13940 | movh [r0 + 361 * 16], m4 | |
13941 | ||
13942 | ; mode 24 [row 10 - first half] | |
13943 | pmaddubsw m4, m7, [r5 + 9 * 16] | |
13944 | pmulhrsw m4, m3 | |
13945 | packuswb m4, m4 | |
13946 | movh [r0 + 362 * 16], m4 | |
13947 | ||
13948 | ; mode 24 [row 11 - first half] | |
13949 | pmaddubsw m4, m7, [r5 + 4 * 16] | |
13950 | pmulhrsw m4, m3 | |
13951 | packuswb m4, m4 | |
13952 | movh [r0 + 363 * 16], m4 | |
13953 | ||
13954 | ; mode 24 [row 12 - first half] | |
13955 | pslldq m7, 2 | |
13956 | pinsrb m7, [r4 + 6], 1 | |
13957 | pinsrb m7, [r4 + 13], 0 | |
13958 | pmaddubsw m4, m7, [r5 + 31 * 16] | |
13959 | pmulhrsw m4, m3 | |
13960 | packuswb m4, m4 | |
13961 | movh [r0 + 364 * 16], m4 | |
13962 | ||
13963 | ; mode 24 [row 13 - first half] | |
13964 | pmaddubsw m4, m7, [r5 + 26 * 16] | |
13965 | pmulhrsw m4, m3 | |
13966 | packuswb m4, m4 | |
13967 | movh [r0 + 365 * 16], m4 | |
13968 | ||
13969 | ; mode 24 [row 14 - first half] | |
13970 | pmaddubsw m4, m7, [r5 + 21 * 16] | |
13971 | pmulhrsw m4, m3 | |
13972 | packuswb m4, m4 | |
13973 | movh [r0 + 366 * 16], m4 | |
13974 | ||
13975 | ; mode 24 [row 15 - first half] | |
13976 | pmaddubsw m4, m7, [r5 + 16 * 16] | |
13977 | pmulhrsw m4, m3 | |
13978 | packuswb m4, m4 | |
13979 | movh [r0 + 367 * 16], m4 | |
13980 | ||
13981 | ; mode 23 [row 3 - first half] | |
13982 | pslldq m7, m0, 2 | |
13983 | pinsrb m7, [r4 + 0], 1 | |
13984 | pinsrb m7, [r4 + 4], 0 | |
13985 | pmaddubsw m4, m7, [r5 + 28 * 16] | |
13986 | pmulhrsw m4, m3 | |
13987 | packuswb m4, m4 | |
13988 | movh [r0 + 339 * 16], m4 | |
13989 | ||
13990 | ; mode 23 [row 4 - first half] | |
13991 | pmaddubsw m4, m7, [r5 + 19 * 16] | |
13992 | pmulhrsw m4, m3 | |
13993 | packuswb m4, m4 | |
13994 | movh [r0 + 340 * 16], m4 | |
13995 | ||
13996 | ; mode 23 [row 5 - first half] | |
13997 | pmaddubsw m4, m7, [r5 + 10 * 16] | |
13998 | pmulhrsw m4, m3 | |
13999 | packuswb m4, m4 | |
14000 | movh [r0 + 341 * 16], m4 | |
14001 | ||
14002 | ; mode 23 [row 6 - first half] | |
14003 | pmaddubsw m4, m7, [r5 + 1 * 16] | |
14004 | pmulhrsw m4, m3 | |
14005 | packuswb m4, m4 | |
14006 | movh [r0 + 342 * 16], m4 | |
14007 | ||
14008 | ; mode 23 [row 7 - first half] | |
14009 | pslldq m7, 2 | |
14010 | pinsrb m7, [r4 + 4], 1 | |
14011 | pinsrb m7, [r4 + 7], 0 | |
14012 | pmaddubsw m4, m7, [r5 + 24 * 16] | |
14013 | pmulhrsw m4, m3 | |
14014 | packuswb m4, m4 | |
14015 | movh [r0 + 343 * 16], m4 | |
14016 | ||
14017 | ; mode 23 [row 8 - first half] | |
14018 | pmaddubsw m4, m7, [r5 + 15 * 16] | |
14019 | pmulhrsw m4, m3 | |
14020 | packuswb m4, m4 | |
14021 | movh [r0 + 344 * 16], m4 | |
14022 | ||
14023 | ; mode 23 [row 9 - first half] | |
14024 | pmaddubsw m4, m7, [r5 + 6 * 16] | |
14025 | pmulhrsw m4, m3 | |
14026 | packuswb m4, m4 | |
14027 | movh [r0 + 345 * 16], m4 | |
14028 | ||
14029 | ; mode 23 [row 10 - first half] | |
14030 | pslldq m7, 2 | |
14031 | pinsrb m7, [r4 + 7], 1 | |
14032 | pinsrb m7, [r4 + 11], 0 | |
14033 | pmaddubsw m4, m7, [r5 + 29 * 16] | |
14034 | pmulhrsw m4, m3 | |
14035 | packuswb m4, m4 | |
14036 | movh [r0 + 346 * 16], m4 | |
14037 | ||
14038 | ; mode 23 [row 11 - first half] | |
14039 | pmaddubsw m4, m7, [r5 + 20 * 16] | |
14040 | pmulhrsw m4, m3 | |
14041 | packuswb m4, m4 | |
14042 | movh [r0 + 347 * 16], m4 | |
14043 | ||
14044 | ; mode 23 [row 12 - first half] | |
14045 | pmaddubsw m4, m7, [r5 + 11 * 16] | |
14046 | pmulhrsw m4, m3 | |
14047 | packuswb m4, m4 | |
14048 | movh [r0 + 348 * 16], m4 | |
14049 | ||
14050 | ; mode 23 [row 13 - first half] | |
14051 | pmaddubsw m4, m7, [r5 + 2 * 16] | |
14052 | pmulhrsw m4, m3 | |
14053 | packuswb m4, m4 | |
14054 | movh [r0 + 349 * 16], m4 | |
14055 | ||
14056 | ; mode 23 [row 14 - first half] | |
14057 | pslldq m7, 2 | |
14058 | pinsrb m7, [r4 + 11], 1 | |
14059 | pinsrb m7, [r4 + 14], 0 | |
14060 | pmaddubsw m4, m7, [r5 + 25 * 16] | |
14061 | pmulhrsw m4, m3 | |
14062 | packuswb m4, m4 | |
14063 | movh [r0 + 350 * 16], m4 | |
14064 | ||
14065 | ; mode 23 [row 15 - first half] | |
14066 | pmaddubsw m4, m7, [r5 + 16 * 16] | |
14067 | pmulhrsw m4, m3 | |
14068 | packuswb m4, m4 | |
14069 | movh [r0 + 351 * 16], m4 | |
14070 | ||
14071 | ; mode 21 [row 15 - second half] | |
14072 | pmaddubsw m4, m0, [r5 + 16 * 16] | |
14073 | pmulhrsw m4, m3 | |
14074 | packuswb m4, m4 | |
14075 | movh [r0 + 319 * 16 + 8], m4 | |
14076 | ; mode 21 [row 15 - second half] end | |
14077 | ||
14078 | ; mode 20 [row 1 - first half] | |
14079 | pslldq m7, m0, 2 | |
14080 | pinsrb m7, [r4 + 0], 1 | |
14081 | pinsrb m7, [r4 + 2], 0 | |
14082 | pmaddubsw m4, m7, [r5 + 22 * 16] | |
14083 | pmulhrsw m4, m3 | |
14084 | packuswb m4, m4 | |
14085 | movh [r0 + 289 * 16], m4 | |
14086 | ||
14087 | ; mode 20 [row 2 - first half] | |
14088 | pmaddubsw m4, m7, [r5 + 1 * 16] | |
14089 | pmulhrsw m4, m3 | |
14090 | packuswb m4, m4 | |
14091 | movh [r0 + 290 * 16], m4 | |
14092 | ||
14093 | ; mode 21 [row 1 - first half] | |
14094 | pmaddubsw m4, m7, [r5 + 30 * 16] | |
14095 | pmulhrsw m4, m3 | |
14096 | packuswb m4, m4 | |
14097 | movh [r0 + 305 * 16], m4 | |
14098 | ||
14099 | ; mode 21 [row 2 - first half] | |
14100 | pmaddubsw m4, m7, [r5 + 13 * 16] | |
14101 | pmulhrsw m4, m3 | |
14102 | packuswb m4, m4 | |
14103 | movh [r0 + 306 * 16], m4 | |
14104 | ||
14105 | ; mode 22 [row 2 - first half] | |
14106 | pmaddubsw m4, m7, [r5 + 25 * 16] | |
14107 | pmulhrsw m4, m3 | |
14108 | packuswb m4, m4 | |
14109 | movh [r0 + 322 * 16], m4 | |
14110 | ||
14111 | ; mode 22 [row 3 - first half] | |
14112 | pmaddubsw m4, m7, [r5 + 12 * 16] | |
14113 | pmulhrsw m4, m3 | |
14114 | packuswb m4, m4 | |
14115 | movh [r0 + 323 * 16], m4 | |
14116 | ||
14117 | ; mode 22 [row 4 - first half] | |
14118 | pslldq m1, m7, 2 | |
14119 | pinsrb m1, [r4 + 2], 1 | |
14120 | pinsrb m1, [r4 + 5], 0 | |
14121 | pmaddubsw m4, m1, [r5 + 31 * 16] | |
14122 | pmulhrsw m4, m3 | |
14123 | packuswb m4, m4 | |
14124 | movh [r0 + 324 * 16], m4 | |
14125 | ||
14126 | ; mode 22 [row 5 - first half] | |
14127 | pmaddubsw m4, m1, [r5 + 18 * 16] | |
14128 | pmulhrsw m4, m3 | |
14129 | packuswb m4, m4 | |
14130 | movh [r0 + 325 * 16], m4 | |
14131 | ||
14132 | ; mode 22 [row 6 - first half] | |
14133 | pmaddubsw m4, m1, [r5 + 5 * 16] | |
14134 | pmulhrsw m4, m3 | |
14135 | packuswb m4, m4 | |
14136 | movh [r0 + 326 * 16], m4 | |
14137 | ||
14138 | ; mode 22 [row 7 - first half] | |
14139 | pslldq m1, 2 | |
14140 | pinsrb m1, [r4 + 5], 1 | |
14141 | pinsrb m1, [r4 + 7], 0 | |
14142 | pmaddubsw m4, m1, [r5 + 24 * 16] | |
14143 | pmulhrsw m4, m3 | |
14144 | packuswb m4, m4 | |
14145 | movh [r0 + 327 * 16], m4 | |
14146 | ||
14147 | ; mode 22 [row 8 - first half] | |
14148 | pmaddubsw m4, m1, [r5 + 11 * 16] | |
14149 | pmulhrsw m4, m3 | |
14150 | packuswb m4, m4 | |
14151 | movh [r0 + 328 * 16], m4 | |
14152 | ||
14153 | ; mode 22 [row 9 - first half] | |
14154 | pslldq m1, 2 | |
14155 | pinsrb m1, [r4 + 7], 1 | |
14156 | pinsrb m1, [r4 + 10], 0 | |
14157 | pmaddubsw m4, m1, [r5 + 30 * 16] | |
14158 | pmulhrsw m4, m3 | |
14159 | packuswb m4, m4 | |
14160 | movh [r0 + 329 * 16], m4 | |
14161 | ||
14162 | ; mode 22 [row 10 - first half] | |
14163 | pmaddubsw m4, m1, [r5 + 17 * 16] | |
14164 | pmulhrsw m4, m3 | |
14165 | packuswb m4, m4 | |
14166 | movh [r0 + 330 * 16], m4 | |
14167 | ||
14168 | ; mode 22 [row 11 - first half] | |
14169 | pmaddubsw m4, m1, [r5 + 4 * 16] | |
14170 | pmulhrsw m4, m3 | |
14171 | packuswb m4, m4 | |
14172 | movh [r0 + 331 * 16], m4 | |
14173 | ||
14174 | ; mode 22 [row 12 - first half] | |
14175 | pslldq m1, 2 | |
14176 | pinsrb m1, [r4 + 10], 1 | |
14177 | pinsrb m1, [r4 + 12], 0 | |
14178 | pmaddubsw m4, m1, [r5 + 23 * 16] | |
14179 | pmulhrsw m4, m3 | |
14180 | packuswb m4, m4 | |
14181 | movh [r0 + 332 * 16], m4 | |
14182 | ||
14183 | ; mode 22 [row 13 - first half] | |
14184 | pmaddubsw m4, m1, [r5 + 10 * 16] | |
14185 | pmulhrsw m4, m3 | |
14186 | packuswb m4, m4 | |
14187 | movh [r0 + 333 * 16], m4 | |
14188 | ||
14189 | ; mode 22 [row 14 - first half] | |
14190 | pslldq m1, 2 | |
14191 | pinsrb m1, [r4 + 12], 1 | |
14192 | pinsrb m1, [r4 + 15], 0 | |
14193 | pmaddubsw m4, m1, [r5 + 29 * 16] | |
14194 | pmulhrsw m4, m3 | |
14195 | packuswb m4, m4 | |
14196 | movh [r0 + 334 * 16], m4 | |
14197 | ||
14198 | ; mode 22 [row 15 - first half] | |
14199 | pmaddubsw m4, m1, [r5 + 16 * 16] | |
14200 | pmulhrsw m4, m3 | |
14201 | packuswb m4, m4 | |
14202 | movh [r0 + 335 * 16], m4 | |
14203 | ||
14204 | ; mode 21 [row 3 - first half] | |
14205 | pslldq m6, m7, 2 | |
14206 | pinsrb m6, [r4 + 2], 1 | |
14207 | pinsrb m6, [r4 + 4], 0 | |
14208 | pmaddubsw m4, m6, [r5 + 28 * 16] | |
14209 | pmulhrsw m4, m3 | |
14210 | packuswb m4, m4 | |
14211 | movh [r0 + 307 * 16], m4 | |
14212 | ||
14213 | ; mode 21 [row 4 - first half] | |
14214 | pmaddubsw m4, m6, [r5 + 11 * 16] | |
14215 | pmulhrsw m4, m3 | |
14216 | packuswb m4, m4 | |
14217 | movh [r0 + 308 * 16], m4 | |
14218 | ||
14219 | ; mode 21 [row 5 - first half] | |
14220 | pslldq m6, 2 | |
14221 | pinsrb m6, [r4 + 4], 1 | |
14222 | pinsrb m6, [r4 + 6], 0 | |
14223 | pmaddubsw m4, m6, [r5 + 26 * 16] | |
14224 | pmulhrsw m4, m3 | |
14225 | packuswb m4, m4 | |
14226 | movh [r0 + 309 * 16], m4 | |
14227 | ||
14228 | ; mode 21 [row 6 - first half] | |
14229 | pmaddubsw m4, m6, [r5 + 9 * 16] | |
14230 | pmulhrsw m4, m3 | |
14231 | packuswb m4, m4 | |
14232 | movh [r0 + 310 * 16], m4 | |
14233 | ||
14234 | ; mode 21 [row 7 - first half] | |
14235 | pslldq m6, 2 | |
14236 | pinsrb m6, [r4 + 6], 1 | |
14237 | pinsrb m6, [r4 + 8], 0 | |
14238 | pmaddubsw m4, m6, [r5 + 24 * 16] | |
14239 | pmulhrsw m4, m3 | |
14240 | packuswb m4, m4 | |
14241 | movh [r0 + 311 * 16], m4 | |
14242 | ||
14243 | ; mode 21 [row 8 - first half] | |
14244 | pmaddubsw m4, m6, [r5 + 7 * 16] | |
14245 | pmulhrsw m4, m3 | |
14246 | packuswb m4, m4 | |
14247 | movh [r0 + 312 * 16], m4 | |
14248 | ||
14249 | ; mode 21 [row 9 - first half] | |
14250 | pslldq m6, 2 | |
14251 | pinsrb m6, [r4 + 8], 1 | |
14252 | pinsrb m6, [r4 + 9], 0 | |
14253 | pmaddubsw m4, m6, [r5 + 22 * 16] | |
14254 | pmulhrsw m4, m3 | |
14255 | packuswb m4, m4 | |
14256 | movh [r0 + 313 * 16], m4 | |
14257 | ||
14258 | ; mode 21 [row 10 - first half] | |
14259 | pmaddubsw m4, m6, [r5 + 5 * 16] | |
14260 | pmulhrsw m4, m3 | |
14261 | packuswb m4, m4 | |
14262 | movh [r0 + 314 * 16], m4 | |
14263 | ||
14264 | ; mode 21 [row 11 - first half] | |
14265 | pslldq m6, 2 | |
14266 | pinsrb m6, [r4 + 9], 1 | |
14267 | pinsrb m6, [r4 + 11], 0 | |
14268 | pmaddubsw m4, m6, [r5 + 20 * 16] | |
14269 | pmulhrsw m4, m3 | |
14270 | packuswb m4, m4 | |
14271 | movh [r0 + 315 * 16], m4 | |
14272 | ||
14273 | ; mode 21 [row 12 - first half] | |
14274 | pmaddubsw m4, m6, [r5 + 3 * 16] | |
14275 | pmulhrsw m4, m3 | |
14276 | packuswb m4, m4 | |
14277 | movh [r0 + 316 * 16], m4 | |
14278 | ||
14279 | ; mode 21 [row 13 - first half] | |
14280 | pslldq m6, 2 | |
14281 | pinsrb m6, [r4 + 11], 1 | |
14282 | pinsrb m6, [r4 + 13], 0 | |
14283 | pmaddubsw m4, m6, [r5 + 18 * 16] | |
14284 | pmulhrsw m4, m3 | |
14285 | packuswb m4, m4 | |
14286 | movh [r0 + 317 * 16], m4 | |
14287 | ||
14288 | ; mode 21 [row 14 - first half] | |
14289 | pmaddubsw m4, m6, [r5 + 1 * 16] | |
14290 | pmulhrsw m4, m3 | |
14291 | packuswb m4, m4 | |
14292 | movh [r0 + 318 * 16], m4 | |
14293 | ||
14294 | ; mode 21 [row 15 - first half] | |
14295 | pslldq m6, 2 | |
14296 | pinsrb m6, [r4 + 13], 1 | |
14297 | pinsrb m6, [r4 + 15], 0 | |
14298 | pmaddubsw m4, m6, [r5 + 16 * 16] | |
14299 | pmulhrsw m4, m3 | |
14300 | packuswb m4, m4 | |
14301 | movh [r0 + 319 * 16], m4 | |
14302 | ||
14303 | ; mode 20 [row 13 - second half] | |
14304 | pmaddubsw m4, m7, [r5 + 26 * 16] | |
14305 | pmulhrsw m4, m3 | |
14306 | packuswb m4, m4 | |
14307 | movh [r0 + 301 * 16 + 8], m4 | |
14308 | ; mode 20 [row 13 - second half] end | |
14309 | ||
14310 | ; mode 20 [row 14 - second half] | |
14311 | pmaddubsw m4, m7, [r5 + 5 * 16] | |
14312 | pmulhrsw m4, m3 | |
14313 | packuswb m4, m4 | |
14314 | movh [r0 + 302 * 16 + 8], m4 | |
14315 | ; mode 20 [row 14 - second half] end | |
14316 | ||
14317 | ; mode 20 [row 3 - first half] | |
14318 | pslldq m7, 2 | |
14319 | pinsrb m7, [r4 + 2], 1 | |
14320 | pinsrb m7, [r4 + 3], 0 | |
14321 | pmaddubsw m4, m7, [r5 + 12 * 16] | |
14322 | pmulhrsw m4, m3 | |
14323 | packuswb m4, m4 | |
14324 | movh [r0 + 291 * 16], m4 | |
14325 | ||
14326 | ; mode 20 [row 15 - second half] | |
14327 | pmaddubsw m4, m7, [r5 + 16 * 16] | |
14328 | pmulhrsw m4, m3 | |
14329 | packuswb m4, m4 | |
14330 | movh [r0 + 303 * 16 + 8], m4 | |
14331 | ; mode 20 [row 15 - second half] end | |
14332 | ||
14333 | ; mode 20 [row 4 - first half] | |
14334 | pslldq m7, 2 | |
14335 | pinsrb m7, [r4 + 3], 1 | |
14336 | pinsrb m7, [r4 + 5], 0 | |
14337 | pmaddubsw m4, m7, [r5 + 23 * 16] | |
14338 | pmulhrsw m4, m3 | |
14339 | packuswb m4, m4 | |
14340 | movh [r0 + 292 * 16], m4 | |
14341 | ||
14342 | ; mode 20 [row 5 - first half] | |
14343 | pmaddubsw m4, m7, [r5 + 2 * 16] | |
14344 | pmulhrsw m4, m3 | |
14345 | packuswb m4, m4 | |
14346 | movh [r0 + 293 * 16], m4 | |
14347 | ||
14348 | ; mode 20 [row 6 - first half] | |
14349 | pslldq m7, 2 | |
14350 | pinsrb m7, [r4 + 5], 1 | |
14351 | pinsrb m7, [r4 + 6], 0 | |
14352 | pmaddubsw m4, m7, [r5 + 13 * 16] | |
14353 | pmulhrsw m4, m3 | |
14354 | packuswb m4, m4 | |
14355 | movh [r0 + 294 * 16], m4 | |
14356 | ||
14357 | ; mode 20 [row 7 - first half] | |
14358 | pslldq m7, 2 | |
14359 | pinsrb m7, [r4 + 6], 1 | |
14360 | pinsrb m7, [r4 + 8], 0 | |
14361 | pmaddubsw m4, m7, [r5 + 24 * 16] | |
14362 | pmulhrsw m4, m3 | |
14363 | packuswb m4, m4 | |
14364 | movh [r0 + 295 * 16], m4 | |
14365 | ||
14366 | ; mode 20 [row 8 - first half] | |
14367 | pmaddubsw m4, m7, [r5 + 3 * 16] | |
14368 | pmulhrsw m4, m3 | |
14369 | packuswb m4, m4 | |
14370 | movh [r0 + 296 * 16], m4 | |
14371 | ||
14372 | ; mode 20 [row 9 - first half] | |
14373 | pslldq m7, 2 | |
14374 | pinsrb m7, [r4 + 8], 1 | |
14375 | pinsrb m7, [r4 + 9], 0 | |
14376 | pmaddubsw m4, m7, [r5 + 14 * 16] | |
14377 | pmulhrsw m4, m3 | |
14378 | packuswb m4, m4 | |
14379 | movh [r0 + 297 * 16], m4 | |
14380 | ||
14381 | ; mode 20 [row 10 - first half] | |
14382 | pslldq m7, 2 | |
14383 | pinsrb m7, [r4 + 9], 1 | |
14384 | pinsrb m7, [r4 + 11], 0 | |
14385 | pmaddubsw m4, m7, [r5 + 25 * 16] | |
14386 | pmulhrsw m4, m3 | |
14387 | packuswb m4, m4 | |
14388 | movh [r0 + 298 * 16], m4 | |
14389 | ||
14390 | ; mode 20 [row 11 - first half] | |
14391 | pmaddubsw m4, m7, [r5 + 4 * 16] | |
14392 | pmulhrsw m4, m3 | |
14393 | packuswb m4, m4 | |
14394 | movh [r0 + 299 * 16], m4 | |
14395 | ||
14396 | ; mode 20 [row 12 - first half] | |
14397 | movu m1, [r5 + 15 * 16] | |
14398 | pslldq m7, 2 | |
14399 | pinsrb m7, [r4 + 11], 1 | |
14400 | pinsrb m7, [r4 + 12], 0 | |
14401 | pmaddubsw m4, m7, [r5 + 15 * 16] | |
14402 | pmulhrsw m4, m3 | |
14403 | packuswb m4, m4 | |
14404 | movh [r0 + 300 * 16], m4 | |
14405 | ||
14406 | ; mode 20 [row 13 - first half] | |
14407 | pslldq m7, 2 | |
14408 | pinsrb m7, [r4 + 12], 1 | |
14409 | pinsrb m7, [r4 + 14], 0 | |
14410 | pmaddubsw m4, m7, [r5 + 26 * 16] | |
14411 | pmulhrsw m4, m3 | |
14412 | packuswb m4, m4 | |
14413 | movh [r0 + 301 * 16], m4 | |
14414 | ||
14415 | ; mode 20 [row 14 - first half] | |
14416 | pmaddubsw m4, m7, [r5 + 5 * 16] | |
14417 | pmulhrsw m4, m3 | |
14418 | packuswb m4, m4 | |
14419 | movh [r0 + 302 * 16], m4 | |
14420 | ||
14421 | ; mode 20 [row 15 - first half] | |
14422 | pslldq m7, 2 | |
14423 | pinsrb m7, [r4 + 14], 1 | |
14424 | pinsrb m7, [r4 + 15], 0 | |
14425 | pmaddubsw m4, m7, [r5 + 16 * 16] | |
14426 | pmulhrsw m4, m3 | |
14427 | packuswb m4, m4 | |
14428 | movh [r0 + 303 * 16], m4 | |
14429 | ||
14430 | ; mode 19 [row 1] | |
14431 | pslldq m0, 2 | |
14432 | pinsrb m0, [r4 + 0], 1 | |
14433 | pinsrb m0, [r4 + 1], 0 | |
14434 | pslldq m5, 2 | |
14435 | pinsrb m5, [r3 + 8], 1 | |
14436 | pinsrb m5, [r3 + 7], 0 | |
14437 | ||
14438 | ; mode 20 [row 1 - second half] | |
14439 | pmaddubsw m4, m5, [r5 + 22 * 16] | |
14440 | pmulhrsw m4, m3 | |
14441 | packuswb m4, m4 | |
14442 | movh [r0 + 289 * 16 + 8], m4 | |
14443 | ; mode 20 [row 1 - second half] end | |
14444 | ||
14445 | ; mode 20 [row 2 - second half] | |
14446 | pmaddubsw m4, m5, [r5 + 1 * 16] | |
14447 | pmulhrsw m4, m3 | |
14448 | packuswb m4, m4 | |
14449 | movh [r0 + 290 * 16 + 8], m4 | |
14450 | ; mode 20 [row 2 - second half] end | |
14451 | ||
14452 | ; mode 21 [row 1 - second half] | |
14453 | pmaddubsw m4, m5, [r5 + 30 * 16] | |
14454 | pmulhrsw m4, m3 | |
14455 | packuswb m4, m4 | |
14456 | movh [r0 + 305 * 16 + 8], m4 | |
14457 | ; mode 21 [row 1 - second half] end | |
14458 | ||
14459 | ; mode 21 [row 2 - second half] | |
14460 | pmaddubsw m4, m5, [r5 + 13 * 16] | |
14461 | pmulhrsw m4, m3 | |
14462 | packuswb m4, m4 | |
14463 | movh [r0 + 306 * 16 + 8], m4 | |
14464 | ; mode 21 [row 2 - second half] end | |
14465 | ||
14466 | ; mode 21 [row 4 - second half] | |
14467 | pmaddubsw m4, m5, [r5 + 11 * 16] | |
14468 | pmulhrsw m4, m3 | |
14469 | packuswb m4, m4 | |
14470 | movh [r0 + 307 * 16 + 8], m4 | |
14471 | ; mode 21 [row 4 - second half] end | |
14472 | ||
14473 | ; mode 22 [row 2 - second half] | |
14474 | pmaddubsw m4, m5, [r5 + 25 * 16] | |
14475 | pmulhrsw m4, m3 | |
14476 | packuswb m4, m4 | |
14477 | movh [r0 + 322 * 16 + 8], m4 | |
14478 | ; mode 22 [row 2 - second half] end | |
14479 | ||
14480 | ; mode 22 [row 3 - second half] | |
14481 | pmaddubsw m4, m5, [r5 + 12 * 16] | |
14482 | pmulhrsw m4, m3 | |
14483 | packuswb m4, m4 | |
14484 | movh [r0 + 323 * 16 + 8], m4 | |
14485 | ; mode 22 [row 3 - second half] end | |
14486 | ||
14487 | ; mode 23 [row 3 - second half] | |
14488 | pmaddubsw m4, m5, [r5 + 28 * 16] | |
14489 | pmulhrsw m4, m3 | |
14490 | packuswb m4, m4 | |
14491 | movh [r0 + 339 * 16 + 8], m4 | |
14492 | ; mode 23 [row 3 - second half] end | |
14493 | ||
14494 | ; mode 23 [row 4 - second half] | |
14495 | pmaddubsw m4, m5, [r5 + 19 * 16] | |
14496 | pmulhrsw m4, m3 | |
14497 | packuswb m4, m4 | |
14498 | movh [r0 + 340 * 16 + 8], m4 | |
14499 | ; mode 23 [row 4 - second half] end | |
14500 | ||
14501 | ; mode 23 [row 5 - second half] | |
14502 | pmaddubsw m4, m5, [r5 + 10 * 16] | |
14503 | pmulhrsw m4, m3 | |
14504 | packuswb m4, m4 | |
14505 | movh [r0 + 341 * 16 + 8], m4 | |
14506 | ; mode 23 [row 5 - second half] end | |
14507 | ||
14508 | ; mode 23 [row 6 - second half] | |
14509 | pmaddubsw m4, m5, [r5 + 1 * 16] | |
14510 | pmulhrsw m4, m3 | |
14511 | packuswb m4, m4 | |
14512 | movh [r0 + 342 * 16 + 8], m4 | |
14513 | ; mode 23 [row 6 - second half] end | |
14514 | ||
14515 | ; mode 24 [row 6 - second half] | |
14516 | pmaddubsw m4, m5, [r5 + 29 * 16] | |
14517 | pmulhrsw m4, m3 | |
14518 | packuswb m4, m4 | |
14519 | movh [r0 + 358 * 16 + 8], m4 | |
14520 | ; mode 24 [row 6 - second half] end | |
14521 | ||
14522 | ; mode 24 [row 7 - second half] | |
14523 | pmaddubsw m4, m5, [r5 + 24 * 16] | |
14524 | pmulhrsw m4, m3 | |
14525 | packuswb m4, m4 | |
14526 | movh [r0 + 359 * 16 + 8], m4 | |
14527 | ; mode 24 [row 7 - second half] end | |
14528 | ||
14529 | ; mode 24 [row 8 - second half] | |
14530 | pmaddubsw m4, m5, [r5 + 19 * 16] | |
14531 | pmulhrsw m4, m3 | |
14532 | packuswb m4, m4 | |
14533 | movh [r0 + 360 * 16 + 8], m4 | |
14534 | ; mode 24 [row 8 - second half] end | |
14535 | ||
14536 | ; mode 24 [row 9 - second half] | |
14537 | pmaddubsw m4, m5, [r5 + 14 * 16] | |
14538 | pmulhrsw m4, m3 | |
14539 | packuswb m4, m4 | |
14540 | movh [r0 + 361 * 16 + 8], m4 | |
14541 | ; mode 24 [row 9 - second half] end | |
14542 | ||
14543 | ; mode 24 [row 10 - second half] | |
14544 | pmaddubsw m4, m5, [r5 + 9 * 16] | |
14545 | pmulhrsw m4, m3 | |
14546 | packuswb m4, m4 | |
14547 | movh [r0 + 362 * 16 + 8], m4 | |
14548 | ; mode 24 [row 10 - second half] end | |
14549 | ||
14550 | ; mode 24 [row 11 - second half] | |
14551 | pmaddubsw m4, m5, [r5 + 4 * 16] | |
14552 | pmulhrsw m4, m3 | |
14553 | packuswb m4, m4 | |
14554 | movh [r0 + 363 * 16 + 8], m4 | |
14555 | ; mode 24 [row 11 - second half] end | |
14556 | ||
14557 | pmaddubsw m4, m0, [r5 + 12 * 16] | |
14558 | pmulhrsw m4, m3 | |
14559 | pmaddubsw m6, m5, [r5 + 12 * 16] | |
14560 | pmulhrsw m6, m3 | |
14561 | packuswb m4, m6 | |
14562 | movu [r0 + 273 * 16], m4 | |
14563 | ||
14564 | ; mode 19 [row 2] | |
14565 | pslldq m0, 2 | |
14566 | pinsrb m0, [r4 + 1], 1 | |
14567 | pinsrb m0, [r4 + 2], 0 | |
14568 | pslldq m5, 2 | |
14569 | pinsrb m5, [r3 + 7], 1 | |
14570 | pinsrb m5, [r3 + 6], 0 | |
14571 | ||
14572 | ; mode 20 [row 3 - second half] | |
14573 | pmaddubsw m4, m5, [r5 + 12 * 16] | |
14574 | pmulhrsw m4, m3 | |
14575 | packuswb m4, m4 | |
14576 | movh [r0 + 291 * 16 + 8], m4 | |
14577 | ; mode 20 [row 3 - second half] end | |
14578 | ||
14579 | ; mode 21 [row 3 - second half] | |
14580 | pmaddubsw m4, m5, [r5 + 28 * 16] | |
14581 | pmulhrsw m4, m3 | |
14582 | packuswb m4, m4 | |
14583 | movh [r0 + 307 * 16 + 8], m4 | |
14584 | ; mode 21 [row 3 - second half] end | |
14585 | ||
14586 | ; mode 21 [row 4 - second half] | |
14587 | pmaddubsw m4, m5, [r5 + 11 * 16] | |
14588 | pmulhrsw m4, m3 | |
14589 | packuswb m4, m4 | |
14590 | movh [r0 + 308 * 16 + 8], m4 | |
14591 | ; mode 21 [row 4 - second half] end | |
14592 | ||
14593 | ; mode 22 [row 4 - second half] | |
14594 | pmaddubsw m4, m5, [r5 + 31 * 16] | |
14595 | pmulhrsw m4, m3 | |
14596 | packuswb m4, m4 | |
14597 | movh [r0 + 324 * 16 + 8], m4 | |
14598 | ; mode 22 [row 4 - second half] end | |
14599 | ||
14600 | ; mode 22 [row 5 - second half] | |
14601 | pmaddubsw m4, m5, [r5 + 18 * 16] | |
14602 | pmulhrsw m4, m3 | |
14603 | packuswb m4, m4 | |
14604 | movh [r0 + 325 * 16 + 8], m4 | |
14605 | ; mode 22 [row 5 - second half] end | |
14606 | ||
14607 | ; mode 22 [row 6 - second half] | |
14608 | pmaddubsw m4, m5, [r5 + 5 * 16] | |
14609 | pmulhrsw m4, m3 | |
14610 | packuswb m4, m4 | |
14611 | movh [r0 + 326 * 16 + 8], m4 | |
14612 | ; mode 22 [row 6 - second half] end | |
14613 | ||
14614 | ; mode 23 [row 7 - second half] | |
14615 | pmaddubsw m4, m5, [r5 + 24 * 16] | |
14616 | pmulhrsw m4, m3 | |
14617 | packuswb m4, m4 | |
14618 | movh [r0 + 343 * 16 + 8], m4 | |
14619 | ; mode 23 [row 7 - second half] end | |
14620 | ||
14621 | ; mode 23 [row 8 - second half] | |
14622 | pmaddubsw m4, m5, [r5 + 15 * 16] | |
14623 | pmulhrsw m4, m3 | |
14624 | packuswb m4, m4 | |
14625 | movh [r0 + 344 * 16 + 8], m4 | |
14626 | ; mode 23 [row 8 - second half] end | |
14627 | ||
14628 | ; mode 23 [row 9 - second half] | |
14629 | pmaddubsw m4, m5, [r5 + 6 * 16] | |
14630 | pmulhrsw m4, m3 | |
14631 | packuswb m4, m4 | |
14632 | movh [r0 + 345 * 16 + 8], m4 | |
14633 | ; mode 23 [row 9 - second half] end | |
14634 | ||
14635 | ; mode 24 [row 12 - second half] | |
14636 | pmaddubsw m4, m5, [r5 + 31 * 16] | |
14637 | pmulhrsw m4, m3 | |
14638 | packuswb m4, m4 | |
14639 | movh [r0 + 364 * 16 + 8], m4 | |
14640 | ; mode 24 [row 12 - second half] end | |
14641 | ||
14642 | ; mode 24 [row 13 - second half] | |
14643 | pmaddubsw m4, m5, [r5 + 26 * 16] | |
14644 | pmulhrsw m4, m3 | |
14645 | packuswb m4, m4 | |
14646 | movh [r0 + 365 * 16 + 8], m4 | |
14647 | ; mode 24 [row 13 - second half] end | |
14648 | ||
14649 | ; mode 24 [row 14 - second half] | |
14650 | pmaddubsw m4, m5, [r5 + 21 * 16] | |
14651 | pmulhrsw m4, m3 | |
14652 | packuswb m4, m4 | |
14653 | movh [r0 + 366 * 16 + 8], m4 | |
14654 | ; mode 24 [row 14 - second half] end | |
14655 | ||
14656 | ; mode 24 [row 15 - second half] | |
14657 | pmaddubsw m4, m5, [r5 + 16 * 16] | |
14658 | pmulhrsw m4, m3 | |
14659 | packuswb m4, m4 | |
14660 | movh [r0 + 367 * 16 + 8], m4 | |
14661 | ; mode 24 [row 15 - second half] end | |
14662 | ||
14663 | pmaddubsw m4, m0, [r5 + 18 * 16] | |
14664 | pmulhrsw m4, m3 | |
14665 | pmaddubsw m6, m5, [r5 + 18 * 16] | |
14666 | pmulhrsw m6, m3 | |
14667 | packuswb m4, m6 | |
14668 | movu [r0 + 274 * 16], m4 | |
14669 | ||
14670 | ; mode 19 [row 3] | |
14671 | pslldq m0, 2 | |
14672 | pinsrb m0, [r4 + 2], 1 | |
14673 | pinsrb m0, [r4 + 4], 0 | |
14674 | pslldq m5, 2 | |
14675 | pinsrb m5, [r3 + 6], 1 | |
14676 | pinsrb m5, [r3 + 5], 0 | |
14677 | ||
14678 | ; mode 20 [row 4 - second half] | |
14679 | pmaddubsw m4, m5, [r5 + 23 * 16] | |
14680 | pmulhrsw m4, m3 | |
14681 | packuswb m4, m4 | |
14682 | movh [r0 + 292 * 16 + 8], m4 | |
14683 | ; mode 20 [row 4 - second half] end | |
14684 | ||
14685 | ; mode 20 [row 5 - second half] | |
14686 | pmaddubsw m4, m5, [r5 + 2 * 16] | |
14687 | pmulhrsw m4, m3 | |
14688 | packuswb m4, m4 | |
14689 | movh [r0 + 293 * 16 + 8], m4 | |
14690 | ; mode 20 [row 5 - second half] end | |
14691 | ||
14692 | ; mode 21 [row 5 - second half] | |
14693 | pmaddubsw m4, m5, [r5 + 26 * 16] | |
14694 | pmulhrsw m4, m3 | |
14695 | packuswb m4, m4 | |
14696 | movh [r0 + 309 * 16 + 8], m4 | |
14697 | ; mode 21 [row 5 - second half] end | |
14698 | ||
14699 | ; mode 21 [row 6 - second half] | |
14700 | pmaddubsw m4, m5, [r5 + 9 * 16] | |
14701 | pmulhrsw m4, m3 | |
14702 | packuswb m4, m4 | |
14703 | movh [r0 + 310 * 16 + 8], m4 | |
14704 | ; mode 21 [row 6 - second half] end | |
14705 | ||
14706 | ; mode 22 [row 7 - second half] | |
14707 | pmaddubsw m4, m5, [r5 + 24 * 16] | |
14708 | pmulhrsw m4, m3 | |
14709 | packuswb m4, m4 | |
14710 | movh [r0 + 327 * 16 + 8], m4 | |
14711 | ; mode 22 [row 7 - second half] end | |
14712 | ||
14713 | ; mode 22 [row 8 - second half] | |
14714 | pmaddubsw m4, m5, [r5 + 11 * 16] | |
14715 | pmulhrsw m4, m3 | |
14716 | packuswb m4, m4 | |
14717 | movh [r0 + 328 * 16 + 8], m4 | |
14718 | ; mode 22 [row 8 - second half] end | |
14719 | ||
14720 | ; mode 23 [row 10 - second half] | |
14721 | pmaddubsw m4, m5, [r5 + 29 * 16] | |
14722 | pmulhrsw m4, m3 | |
14723 | packuswb m4, m4 | |
14724 | movh [r0 + 346 * 16 + 8], m4 | |
14725 | ; mode 23 [row 10 - second half] end | |
14726 | ||
14727 | ; mode 23 [row 11 - second half] | |
14728 | pmaddubsw m4, m5, [r5 + 20 * 16] | |
14729 | pmulhrsw m4, m3 | |
14730 | packuswb m4, m4 | |
14731 | movh [r0 + 347 * 16 + 8], m4 | |
14732 | ; mode 23 [row 11 - second half] end | |
14733 | ||
14734 | ; mode 23 [row 12 - second half] | |
14735 | pmaddubsw m4, m5, [r5 + 11 * 16] | |
14736 | pmulhrsw m4, m3 | |
14737 | packuswb m4, m4 | |
14738 | movh [r0 + 348 * 16 + 8], m4 | |
14739 | ; mode 23 [row 12 - second half] end | |
14740 | ||
14741 | ; mode 23 [row 13 - second half] | |
14742 | pmaddubsw m4, m5, [r5 + 2 * 16] | |
14743 | pmulhrsw m4, m3 | |
14744 | packuswb m4, m4 | |
14745 | movh [r0 + 349 * 16 + 8], m4 | |
14746 | ; mode 23 [row 13 - second half] end | |
14747 | ||
14748 | pmaddubsw m4, m0, [r5 + 24 * 16] | |
14749 | pmulhrsw m4, m3 | |
14750 | pmaddubsw m6, m5, [r5 + 24 * 16] | |
14751 | pmulhrsw m6, m3 | |
14752 | packuswb m4, m6 | |
14753 | movu [r0 + 275 * 16], m4 | |
14754 | ||
14755 | ; mode 19 [row 4] | |
14756 | pslldq m0, 2 | |
14757 | pinsrb m0, [r4 + 4], 1 | |
14758 | pinsrb m0, [r4 + 5], 0 | |
14759 | pslldq m5, 2 | |
14760 | pinsrb m5, [r3 + 5], 1 | |
14761 | pinsrb m5, [r3 + 4], 0 | |
14762 | ||
14763 | ; mode 20 [row 6 - second half] | |
14764 | pmaddubsw m4, m5, [r5 + 13 * 16] | |
14765 | pmulhrsw m4, m3 | |
14766 | packuswb m4, m4 | |
14767 | movh [r0 + 294 * 16 + 8], m4 | |
14768 | ; mode 20 [row 6 - second half] end | |
14769 | ||
14770 | ; mode 21 [row 7 - second half] | |
14771 | pmaddubsw m4, m5, [r5 + 24 * 16] | |
14772 | pmulhrsw m4, m3 | |
14773 | packuswb m4, m4 | |
14774 | movh [r0 + 311 * 16 + 8], m4 | |
14775 | ; mode 21 [row 7 - second half] end | |
14776 | ||
14777 | ; mode 21 [row 8 - second half] | |
14778 | pmaddubsw m4, m5, [r5 + 7 * 16] | |
14779 | pmulhrsw m4, m3 | |
14780 | packuswb m4, m4 | |
14781 | movh [r0 + 312 * 16 + 8], m4 | |
14782 | ; mode 21 [row 8 - second half] end | |
14783 | ||
14784 | ; mode 22 [row 9 - second half] | |
14785 | pmaddubsw m4, m5, [r5 + 30 * 16] | |
14786 | pmulhrsw m4, m3 | |
14787 | packuswb m4, m4 | |
14788 | movh [r0 + 329 * 16 + 8], m4 | |
14789 | ; mode 22 [row 9 - second half] end | |
14790 | ||
14791 | ; mode 22 [row 10 - second half] | |
14792 | pmaddubsw m4, m5, [r5 + 17 * 16] | |
14793 | pmulhrsw m4, m3 | |
14794 | packuswb m4, m4 | |
14795 | movh [r0 + 330 * 16 + 8], m4 | |
14796 | ; mode 22 [row 10 - second half] end | |
14797 | ||
14798 | ; mode 22 [row 11 - second half] | |
14799 | pmaddubsw m4, m5, [r5 + 4 * 16] | |
14800 | pmulhrsw m4, m3 | |
14801 | packuswb m4, m4 | |
14802 | movh [r0 + 331 * 16 + 8], m4 | |
14803 | ; mode 22 [row 11 - second half] end | |
14804 | ||
14805 | ; mode 23 [row 14 - second half] | |
14806 | pmaddubsw m4, m5, [r5 + 25 * 16] | |
14807 | pmulhrsw m4, m3 | |
14808 | packuswb m4, m4 | |
14809 | movh [r0 + 350 * 16 + 8], m4 | |
14810 | ; mode 23 [row 14 - second half] end | |
14811 | ||
14812 | ; mode 23 [row 15 - second half] | |
14813 | pmaddubsw m4, m5, [r5 + 16 * 16] | |
14814 | pmulhrsw m4, m3 | |
14815 | packuswb m4, m4 | |
14816 | movh [r0 + 351 * 16 + 8], m4 | |
14817 | ||
14818 | ; mode 23 [row 15 - second half] end | |
14819 | pmaddubsw m4, m0, [r5 + 30 * 16] | |
14820 | pmulhrsw m4, m3 | |
14821 | pmaddubsw m6, m5, [r5 + 30 * 16] | |
14822 | pmulhrsw m6, m3 | |
14823 | packuswb m4, m6 | |
14824 | movu [r0 + 276 * 16], m4 | |
14825 | ||
14826 | ; mode 19 [row 5] | |
14827 | pmaddubsw m4, m0, [r5 + 4 * 16] | |
14828 | pmulhrsw m4, m3 | |
14829 | pmaddubsw m6, m5, [r5 + 4 * 16] | |
14830 | pmulhrsw m6, m3 | |
14831 | packuswb m4, m6 | |
14832 | movu [r0 + 277 * 16], m4 | |
14833 | ||
14834 | ; mode 19 [row 6] | |
14835 | pslldq m0, 2 | |
14836 | pinsrb m0, [r4 + 5], 1 | |
14837 | pinsrb m0, [r4 + 6], 0 | |
14838 | pslldq m5, 2 | |
14839 | pinsrb m5, [r3 + 4], 1 | |
14840 | pinsrb m5, [r3 + 3], 0 | |
14841 | ||
14842 | ; mode 20 [row 7 - second half] | |
14843 | pmaddubsw m4, m5, [r5 + 24 * 16] | |
14844 | pmulhrsw m4, m3 | |
14845 | packuswb m4, m4 | |
14846 | movh [r0 + 295 * 16 + 8], m4 | |
14847 | ; mode 20 [row 7 - second half] end | |
14848 | ||
14849 | ; mode 20 [row 8 - second half] | |
14850 | pmaddubsw m4, m5, [r5 + 3 * 16] | |
14851 | pmulhrsw m4, m3 | |
14852 | packuswb m4, m4 | |
14853 | movh [r0 + 296 * 16 + 8], m4 | |
14854 | ; mode 20 [row 8 - second half] end | |
14855 | ||
14856 | ; mode 21 [row 9 - second half] | |
14857 | pmaddubsw m4, m5, [r5 + 22 * 16] | |
14858 | pmulhrsw m4, m3 | |
14859 | packuswb m4, m4 | |
14860 | movh [r0 + 313 * 16 + 8], m4 | |
14861 | ; mode 21 [row 9 - second half] end | |
14862 | ||
14863 | ; mode 21 [row 10 - second half] | |
14864 | pmaddubsw m4, m5, [r5 + 5 * 16] | |
14865 | pmulhrsw m4, m3 | |
14866 | packuswb m4, m4 | |
14867 | movh [r0 + 314 * 16 + 8], m4 | |
14868 | ; mode 21 [row 10 - second half] end | |
14869 | ||
14870 | ; mode 22 [row 12 - second half] | |
14871 | pmaddubsw m4, m5, [r5 + 23 * 16] | |
14872 | pmulhrsw m4, m3 | |
14873 | packuswb m4, m4 | |
14874 | movh [r0 + 332 * 16 + 8], m4 | |
14875 | ; mode 22 [row 12 - second half] end | |
14876 | ||
14877 | ; mode 22 [row 13 - second half] | |
14878 | pmaddubsw m4, m5, [r5 + 10 * 16] | |
14879 | pmulhrsw m4, m3 | |
14880 | packuswb m4, m4 | |
14881 | movh [r0 + 333 * 16 + 8], m4 | |
14882 | ; mode 22 [row 13 - second half] end | |
14883 | ||
14884 | pmaddubsw m4, m0, [r5 + 10 * 16] | |
14885 | pmulhrsw m4, m3 | |
14886 | pmaddubsw m6, m5, [r5 + 10 * 16] | |
14887 | pmulhrsw m6, m3 | |
14888 | packuswb m4, m6 | |
14889 | movu [r0 + 278 * 16], m4 | |
14890 | ||
14891 | ; mode 19 [row 7] | |
14892 | pslldq m0, 2 | |
14893 | pinsrb m0, [r4 + 6], 1 | |
14894 | pinsrb m0, [r4 + 7], 0 | |
14895 | pslldq m5, 2 | |
14896 | pinsrb m5, [r3 + 3], 1 | |
14897 | pinsrb m5, [r3 + 2], 0 | |
14898 | ||
14899 | ; mode 20 [row 9 - second half] | |
14900 | pmaddubsw m4, m5, [r5 + 14 * 16] | |
14901 | pmulhrsw m4, m3 | |
14902 | packuswb m4, m4 | |
14903 | movh [r0 + 297 * 16 + 8], m4 | |
14904 | ; mode 20 [row 9 - second half] end | |
14905 | ||
14906 | ; mode 21 [row 11 - second half] | |
14907 | pmaddubsw m4, m5, [r5 + 20 * 16] | |
14908 | pmulhrsw m4, m3 | |
14909 | packuswb m4, m4 | |
14910 | movh [r0 + 315 * 16 + 8], m4 | |
14911 | ; mode 21 [row 11 - second half] end | |
14912 | ||
14913 | ; mode 21 [row 12 - second half] | |
14914 | pmaddubsw m4, m5, [r5 + 3 * 16] | |
14915 | pmulhrsw m4, m3 | |
14916 | packuswb m4, m4 | |
14917 | movh [r0 + 316 * 16 + 8], m4 | |
14918 | ; mode 21 [row 12 - second half] end | |
14919 | ||
14920 | ; mode 22 [row 14 - second half] | |
14921 | pmaddubsw m4, m5, [r5 + 29 * 16] | |
14922 | pmulhrsw m4, m3 | |
14923 | packuswb m4, m4 | |
14924 | movh [r0 + 334 * 16 + 8], m4 | |
14925 | ; mode 22 [row 14 - second half] end | |
14926 | ||
14927 | ; mode 22 [row 15 - second half] | |
14928 | pmaddubsw m4, m5, [r5 + 16 * 16] | |
14929 | pmulhrsw m4, m3 | |
14930 | packuswb m4, m4 | |
14931 | movh [r0 + 335 * 16 + 8], m4 | |
14932 | ; mode 22 [row 15 - second half] end | |
14933 | ||
14934 | pmaddubsw m4, m0, [r5 + 16 * 16] | |
14935 | pmulhrsw m4, m3 | |
14936 | pmaddubsw m6, m5, [r5 + 16 * 16] | |
14937 | pmulhrsw m6, m3 | |
14938 | packuswb m4, m6 | |
14939 | movu [r0 + 279 * 16], m4 | |
14940 | ||
14941 | ; mode 19 [row 8] | |
14942 | pslldq m0, 2 | |
14943 | pinsrb m0, [r4 + 7], 1 | |
14944 | pinsrb m0, [r4 + 9], 0 | |
14945 | pslldq m5, 2 | |
14946 | pinsrb m5, [r3 + 2], 1 | |
14947 | pinsrb m5, [r3 + 1], 0 | |
14948 | ||
14949 | ; mode 20 [row 10 - second half] | |
14950 | pmaddubsw m4, m5, [r5 + 25 * 16] | |
14951 | pmulhrsw m4, m3 | |
14952 | packuswb m4, m4 | |
14953 | movh [r0 + 298 * 16 + 8], m4 | |
14954 | ; mode 20 [row 10 - second half] end | |
14955 | ||
14956 | ; mode 20 [row 11 - second half] | |
14957 | pmaddubsw m4, m5, [r5 + 4 * 16] | |
14958 | pmulhrsw m4, m3 | |
14959 | packuswb m4, m4 | |
14960 | movh [r0 + 299 * 16 + 8], m4 | |
14961 | ; mode 20 [row 11 - second half] end | |
14962 | ||
14963 | ; mode 21 [row 13 - second half] | |
14964 | pmaddubsw m4, m5, [r5 + 18 * 16] | |
14965 | pmulhrsw m4, m3 | |
14966 | packuswb m4, m4 | |
14967 | movh [r0 + 317 * 16 + 8], m4 | |
14968 | ; mode 21 [row 13 - second half] end | |
14969 | ||
14970 | ; mode 21 [row 14 - second half] | |
14971 | pmaddubsw m4, m5, [r5 + 1 * 16] | |
14972 | pmulhrsw m4, m3 | |
14973 | packuswb m4, m4 | |
14974 | movh [r0 + 318 * 16 + 8], m4 | |
14975 | ; mode 21 [row 14 - second half] end | |
14976 | ||
14977 | pmaddubsw m4, m0, [r5 + 22 * 16] | |
14978 | pmulhrsw m4, m3 | |
14979 | pmaddubsw m6, m5, [r5 + 22 * 16] | |
14980 | pmulhrsw m6, m3 | |
14981 | packuswb m4, m6 | |
14982 | movu [r0 + 280 * 16], m4 | |
14983 | ||
14984 | ; mode 19 [row 9] | |
14985 | pslldq m0, 2 | |
14986 | pinsrb m0, [r4 + 9], 1 | |
14987 | pinsrb m0, [r4 + 10], 0 | |
14988 | pslldq m5, 2 | |
14989 | pinsrb m5, [r3 + 1], 1 | |
14990 | pinsrb m5, [r3 + 0], 0 | |
14991 | ||
14992 | ; mode 20 [row 12 - second half] | |
14993 | pmaddubsw m4, m5, [r5 + 15 * 16] | |
14994 | pmulhrsw m4, m3 | |
14995 | packuswb m4, m4 | |
14996 | movh [r0 + 300 * 16 + 8], m4 | |
14997 | ||
14998 | ; mode 20 [row 12 - second half] end | |
14999 | pmaddubsw m4, m0, [r5 + 28 * 16] | |
15000 | pmulhrsw m4, m3 | |
15001 | pmaddubsw m6, m5, [r5 + 28 * 16] | |
15002 | pmulhrsw m6, m3 | |
15003 | packuswb m4, m6 | |
15004 | movu [r0 + 281 * 16], m4 | |
15005 | ||
15006 | ; mode 19 [row 10] | |
15007 | pmaddubsw m4, m0, [r5 + 2 * 16] | |
15008 | pmulhrsw m4, m3 | |
15009 | pmaddubsw m6, m5, [r5 + 2 * 16] | |
15010 | pmulhrsw m6, m3 | |
15011 | packuswb m4, m6 | |
15012 | movu [r0 + 282 * 16], m4 | |
15013 | ||
15014 | ; mode 19 [row 11] | |
15015 | pslldq m0, 2 | |
15016 | pinsrb m0, [r4 + 10], 1 | |
15017 | pinsrb m0, [r4 + 11], 0 | |
15018 | pmaddubsw m4, m0, [r5 + 8 * 16] | |
15019 | pmulhrsw m4, m3 | |
15020 | pslldq m5, 2 | |
15021 | pinsrb m5, [r4 + 0], 1 | |
15022 | pinsrb m5, [r4 + 1], 0 | |
15023 | pmaddubsw m6, m5, [r5 + 8 * 16] | |
15024 | pmulhrsw m6, m3 | |
15025 | packuswb m4, m6 | |
15026 | movu [r0 + 283 * 16], m4 | |
15027 | ||
15028 | ; mode 19 [row 12] | |
15029 | pslldq m0, 2 | |
15030 | pinsrb m0, [r4 + 11], 1 | |
15031 | pinsrb m0, [r4 + 12], 0 | |
15032 | pslldq m5, 2 | |
15033 | pinsrb m5, [r4 + 1], 1 | |
15034 | pinsrb m5, [r4 + 2], 0 | |
15035 | pmaddubsw m4, m0, [r5 + 14 * 16] | |
15036 | pmulhrsw m4, m3 | |
15037 | pmaddubsw m6, m5, [r5 + 14 * 16] | |
15038 | pmulhrsw m6, m3 | |
15039 | packuswb m4, m6 | |
15040 | movu [r0 + 284 * 16], m4 | |
15041 | ||
15042 | ; mode 19 [row 13] | |
15043 | pslldq m0, 2 | |
15044 | pinsrb m0, [r4 + 12], 1 | |
15045 | pinsrb m0, [r4 + 14], 0 | |
15046 | pmaddubsw m4, m0, [r5 + 20 * 16] | |
15047 | pmulhrsw m4, m3 | |
15048 | pslldq m5, 2 | |
15049 | pinsrb m5, [r4 + 2], 1 | |
15050 | pinsrb m5, [r4 + 4], 0 | |
15051 | pmaddubsw m6, m5, [r5 + 20 * 16] | |
15052 | pmulhrsw m6, m3 | |
15053 | packuswb m4, m6 | |
15054 | movu [r0 + 285 * 16], m4 | |
15055 | ||
15056 | ; mode 19 [row 14] | |
15057 | pslldq m0, 2 | |
15058 | pinsrb m0, [r4 + 14], 1 | |
15059 | pinsrb m0, [r4 + 15], 0 | |
15060 | pmaddubsw m4, m0, [r5 + 26 * 16] | |
15061 | pmulhrsw m4, m3 | |
15062 | pslldq m5, 2 | |
15063 | pinsrb m5, [r4 + 4], 1 | |
15064 | pinsrb m5, [r4 + 5], 0 | |
15065 | pmaddubsw m6, m5, [r5 + 26 * 16] | |
15066 | pmulhrsw m6, m3 | |
15067 | packuswb m4, m6 | |
15068 | movu [r0 + 286 * 16], m4 | |
15069 | ||
15070 | ; mode 19 [row 15] | |
15071 | movu m0, [r4] | |
15072 | pshufb m0, [tab_S1] | |
15073 | movu [r0 + 287 * 16], m0 | |
15074 | movd m1, [r3] | |
15075 | movd [r0 + 287 * 16 + 12], m1 | |
15076 | ||
15077 | ; mode 25 | |
15078 | movu m1, [r1] | |
15079 | ||
15080 | ; mode 26 [all rows] | |
15081 | psrldq m6, m1, 1 | |
15082 | pinsrb m6, [r1 + 16], 15 | |
15083 | movu m7, m6 | |
15084 | movu [r0 + 384 * 16], m6 | |
15085 | movu [r0 + 385 * 16], m6 | |
15086 | movu [r0 + 386 * 16], m6 | |
15087 | movu [r0 + 387 * 16], m6 | |
15088 | movu [r0 + 388 * 16], m6 | |
15089 | movu [r0 + 389 * 16], m6 | |
15090 | movu [r0 + 390 * 16], m6 | |
15091 | movu [r0 + 391 * 16], m6 | |
15092 | movu [r0 + 392 * 16], m6 | |
15093 | movu [r0 + 393 * 16], m6 | |
15094 | movu [r0 + 394 * 16], m6 | |
15095 | movu [r0 + 395 * 16], m6 | |
15096 | movu [r0 + 396 * 16], m6 | |
15097 | movu [r0 + 397 * 16], m6 | |
15098 | movu [r0 + 398 * 16], m6 | |
15099 | movu [r0 + 399 * 16], m6 | |
15100 | ||
15101 | pxor m0, m0 | |
15102 | pshufb m6, m6, m0 | |
15103 | punpcklbw m6, m0 | |
15104 | movu m2, [r2] | |
15105 | pshufb m2, m2, m0 | |
15106 | punpcklbw m2, m0 | |
15107 | movu m4, [r2 + 1] | |
15108 | punpcklbw m5, m4, m0 | |
15109 | punpckhbw m4, m0 | |
15110 | psubw m5, m2 | |
15111 | psubw m4, m2 | |
15112 | psraw m5, 1 | |
15113 | psraw m4, 1 | |
15114 | paddw m5, m6 | |
15115 | paddw m4, m6 | |
15116 | packuswb m5, m4 | |
15117 | ||
15118 | pextrb [r0 + 384 * 16], m5, 0 | |
15119 | pextrb [r0 + 385 * 16], m5, 1 | |
15120 | pextrb [r0 + 386 * 16], m5, 2 | |
15121 | pextrb [r0 + 387 * 16], m5, 3 | |
15122 | pextrb [r0 + 388 * 16], m5, 4 | |
15123 | pextrb [r0 + 389 * 16], m5, 5 | |
15124 | pextrb [r0 + 390 * 16], m5, 6 | |
15125 | pextrb [r0 + 391 * 16], m5, 7 | |
15126 | pextrb [r0 + 392 * 16], m5, 8 | |
15127 | pextrb [r0 + 393 * 16], m5, 9 | |
15128 | pextrb [r0 + 394 * 16], m5, 10 | |
15129 | pextrb [r0 + 395 * 16], m5, 11 | |
15130 | pextrb [r0 + 396 * 16], m5, 12 | |
15131 | pextrb [r0 + 397 * 16], m5, 13 | |
15132 | pextrb [r0 + 398 * 16], m5, 14 | |
15133 | pextrb [r0 + 399 * 16], m5, 15 | |
15134 | ||
15135 | ; mode 25 [row 15] | |
15136 | movu [r0 + 383 * 16], m1 | |
15137 | ||
15138 | ; mode 25 [row 0] | |
15139 | psrldq m2, m1, 1 | |
15140 | punpcklbw m1, m2 | |
15141 | movu m2, [r1 + 8] | |
15142 | psrldq m4, m2, 1 | |
15143 | punpcklbw m2, m4 | |
15144 | pmaddubsw m4, m1, [r5 + 30 * 16] | |
15145 | pmulhrsw m4, m3 | |
15146 | pmaddubsw m5, m2, [r5 + 30 * 16] | |
15147 | pmulhrsw m5, m3 | |
15148 | packuswb m4, m5 | |
15149 | movu [r0 + 368 * 16], m4 | |
15150 | ||
15151 | ; mode 25 [row 1] | |
15152 | pmaddubsw m4, m1, [r5 + 28 * 16] | |
15153 | pmulhrsw m4, m3 | |
15154 | pmaddubsw m5, m2, [r5 + 28 * 16] | |
15155 | pmulhrsw m5, m3 | |
15156 | packuswb m4, m5 | |
15157 | movu [r0 + 369 * 16], m4 | |
15158 | ||
15159 | ; mode 25 [row 2] | |
15160 | pmaddubsw m4, m1, [r5 + 26 * 16] | |
15161 | pmulhrsw m4, m3 | |
15162 | pmaddubsw m5, m2, [r5 + 26 * 16] | |
15163 | pmulhrsw m5, m3 | |
15164 | packuswb m4, m5 | |
15165 | movu [r0 + 370 * 16], m4 | |
15166 | ||
15167 | ; mode 25 [row 3] | |
15168 | pmaddubsw m4, m1, [r5 + 24 * 16] | |
15169 | pmulhrsw m4, m3 | |
15170 | pmaddubsw m5, m2, [r5 + 24 * 16] | |
15171 | pmulhrsw m5, m3 | |
15172 | packuswb m4, m5 | |
15173 | movu [r0 + 371 * 16], m4 | |
15174 | ||
15175 | ; mode 25 [row 4] | |
15176 | pmaddubsw m4, m1, [r5 + 22 * 16] | |
15177 | pmulhrsw m4, m3 | |
15178 | pmaddubsw m5, m2, [r5 + 22 * 16] | |
15179 | pmulhrsw m5, m3 | |
15180 | packuswb m4, m5 | |
15181 | movu [r0 + 372 * 16], m4 | |
15182 | ||
15183 | ; mode 25 [row 5] | |
15184 | pmaddubsw m4, m1, [r5 + 20 * 16] | |
15185 | pmulhrsw m4, m3 | |
15186 | pmaddubsw m5, m2, [r5 + 20 * 16] | |
15187 | pmulhrsw m5, m3 | |
15188 | packuswb m4, m5 | |
15189 | movu [r0 + 373 * 16], m4 | |
15190 | ||
15191 | ; mode 25 [row 6] | |
15192 | pmaddubsw m4, m1, [r5 + 18 * 16] | |
15193 | pmulhrsw m4, m3 | |
15194 | pmaddubsw m5, m2, [r5 + 18 * 16] | |
15195 | pmulhrsw m5, m3 | |
15196 | packuswb m4, m5 | |
15197 | movu [r0 + 374 * 16], m4 | |
15198 | ||
15199 | ; mode 25 [row 7] | |
15200 | pmaddubsw m4, m1, [r5 + 16 * 16] | |
15201 | pmulhrsw m4, m3 | |
15202 | pmaddubsw m5, m2, [r5 + 16 * 16] | |
15203 | pmulhrsw m5, m3 | |
15204 | packuswb m4, m5 | |
15205 | movu [r0 + 375 * 16], m4 | |
15206 | ||
15207 | ; mode 25 [row 8] | |
15208 | pmaddubsw m4, m1, [r5 + 14 * 16] | |
15209 | pmulhrsw m4, m3 | |
15210 | pmaddubsw m5, m2, [r5 + 14 * 16] | |
15211 | pmulhrsw m5, m3 | |
15212 | packuswb m4, m5 | |
15213 | movu [r0 + 376 * 16], m4 | |
15214 | ||
15215 | ; mode 25 [row 9] | |
15216 | pmaddubsw m4, m1, [r5 + 12 * 16] | |
15217 | pmulhrsw m4, m3 | |
15218 | pmaddubsw m5, m2, [r5 + 12 * 16] | |
15219 | pmulhrsw m5, m3 | |
15220 | packuswb m4, m5 | |
15221 | movu [r0 + 377 * 16], m4 | |
15222 | ||
15223 | ; mode 25 [row 10] | |
15224 | pmaddubsw m4, m1, [r5 + 10 * 16] | |
15225 | pmulhrsw m4, m3 | |
15226 | pmaddubsw m5, m2, [r5 + 10 * 16] | |
15227 | pmulhrsw m5, m3 | |
15228 | packuswb m4, m5 | |
15229 | movu [r0 + 378 * 16], m4 | |
15230 | ||
15231 | ; mode 25 [row 11] | |
15232 | pmaddubsw m4, m1, [r5 + 8 * 16] | |
15233 | pmulhrsw m4, m3 | |
15234 | pmaddubsw m5, m2, [r5 + 8 * 16] | |
15235 | pmulhrsw m5, m3 | |
15236 | packuswb m4, m5 | |
15237 | movu [r0 + 379 * 16], m4 | |
15238 | ||
15239 | ; mode 25 [row 12] | |
15240 | pmaddubsw m4, m1, [r5 + 6 * 16] | |
15241 | pmulhrsw m4, m3 | |
15242 | pmaddubsw m5, m2, [r5 + 6 * 16] | |
15243 | pmulhrsw m5, m3 | |
15244 | packuswb m4, m5 | |
15245 | movu [r0 + 380 * 16], m4 | |
15246 | ||
15247 | ; mode 25 [row 13] | |
15248 | pmaddubsw m4, m1, [r5 + 4 * 16] | |
15249 | pmulhrsw m4, m3 | |
15250 | pmaddubsw m5, m2, [r5 + 4 * 16] | |
15251 | pmulhrsw m5, m3 | |
15252 | packuswb m4, m5 | |
15253 | movu [r0 + 381 * 16], m4 | |
15254 | ||
15255 | ; mode 25 [row 14] | |
15256 | pmaddubsw m4, m1, [r5 + 2 * 16] | |
15257 | pmulhrsw m4, m3 | |
15258 | pmaddubsw m5, m2, [r5 + 2 * 16] | |
15259 | pmulhrsw m5, m3 | |
15260 | packuswb m4, m5 | |
15261 | movu [r0 + 382 * 16], m4 | |
15262 | ||
15263 | ; mode 27 [row 15] | |
15264 | psrldq m6, m7, 1 | |
15265 | punpcklbw m7, m6 | |
15266 | pinsrb m6, [r1 + 17], 15 | |
15267 | movu [r0 + 415 * 16], m6 | |
15268 | ||
15269 | ; mode 27 [row 0] | |
15270 | movu m4, [r1 + 9] | |
15271 | psrldq m5, m4, 1 | |
15272 | punpcklbw m4, m5 | |
15273 | pmaddubsw m6, m7, [r5 + 2 * 16] | |
15274 | pmulhrsw m6, m3 | |
15275 | pmaddubsw m5, m4, [r5 + 2 * 16] | |
15276 | pmulhrsw m5, m3 | |
15277 | packuswb m6, m5 | |
15278 | movu [r0 + 400 * 16], m6 | |
15279 | ||
15280 | ; mode 27 [row 1] | |
15281 | pmaddubsw m6, m7, [r5 + 4 * 16] | |
15282 | pmulhrsw m6, m3 | |
15283 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
15284 | pmulhrsw m5, m3 | |
15285 | packuswb m6, m5 | |
15286 | movu [r0 + 401 * 16], m6 | |
15287 | ||
15288 | ; mode 27 [row 2] | |
15289 | pmaddubsw m6, m7, [r5 + 6 * 16] | |
15290 | pmulhrsw m6, m3 | |
15291 | pmaddubsw m5, m4, [r5 + 6 * 16] | |
15292 | pmulhrsw m5, m3 | |
15293 | packuswb m6, m5 | |
15294 | movu [r0 + 402 * 16], m6 | |
15295 | ||
15296 | ; mode 27 [row 3] | |
15297 | pmaddubsw m6, m7, [r5 + 8 * 16] | |
15298 | pmulhrsw m6, m3 | |
15299 | pmaddubsw m5, m4, [r5 + 8 * 16] | |
15300 | pmulhrsw m5, m3 | |
15301 | packuswb m6, m5 | |
15302 | movu [r0 + 403 * 16], m6 | |
15303 | ||
15304 | ; mode 27 [row 4] | |
15305 | pmaddubsw m6, m7, [r5 + 10 * 16] | |
15306 | pmulhrsw m6, m3 | |
15307 | pmaddubsw m5, m4, [r5 + 10 * 16] | |
15308 | pmulhrsw m5, m3 | |
15309 | packuswb m6, m5 | |
15310 | movu [r0 + 404 * 16], m6 | |
15311 | ||
15312 | ; mode 27 [row 5] | |
15313 | pmaddubsw m6, m7, [r5 + 12 * 16] | |
15314 | pmulhrsw m6, m3 | |
15315 | pmaddubsw m5, m4, [r5 + 12 * 16] | |
15316 | pmulhrsw m5, m3 | |
15317 | packuswb m6, m5 | |
15318 | movu [r0 + 405 * 16], m6 | |
15319 | ||
15320 | ; mode 27 [row 6] | |
15321 | pmaddubsw m6, m7, [r5 + 14 * 16] | |
15322 | pmulhrsw m6, m3 | |
15323 | pmaddubsw m5, m4, [r5 + 14 * 16] | |
15324 | pmulhrsw m5, m3 | |
15325 | packuswb m6, m5 | |
15326 | movu [r0 + 406 * 16], m6 | |
15327 | ||
15328 | ; mode 27 [row 7] | |
15329 | pmaddubsw m6, m7, [r5 + 16 * 16] | |
15330 | pmulhrsw m6, m3 | |
15331 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
15332 | pmulhrsw m5, m3 | |
15333 | packuswb m6, m5 | |
15334 | movu [r0 + 407 * 16], m6 | |
15335 | ||
15336 | ; mode 27 [row 8] | |
15337 | pmaddubsw m6, m7, [r5 + 18 * 16] | |
15338 | pmulhrsw m6, m3 | |
15339 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
15340 | pmulhrsw m5, m3 | |
15341 | packuswb m6, m5 | |
15342 | movu [r0 + 408 * 16], m6 | |
15343 | ||
15344 | ; mode 27 [row 9] | |
15345 | pmaddubsw m6, m7, [r5 + 20 * 16] | |
15346 | pmulhrsw m6, m3 | |
15347 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
15348 | pmulhrsw m5, m3 | |
15349 | packuswb m6, m5 | |
15350 | movu [r0 + 409 * 16], m6 | |
15351 | ||
15352 | ; mode 27 [row 10] | |
15353 | pmaddubsw m6, m7, [r5 + 22 * 16] | |
15354 | pmulhrsw m6, m3 | |
15355 | pmaddubsw m5, m4, [r5 + 22 * 16] | |
15356 | pmulhrsw m5, m3 | |
15357 | packuswb m6, m5 | |
15358 | movu [r0 + 410 * 16], m6 | |
15359 | ||
15360 | ; mode 27 [row 11] | |
15361 | pmaddubsw m6, m7, [r5 + 24 * 16] | |
15362 | pmulhrsw m6, m3 | |
15363 | pmaddubsw m5, m4, [r5 + 24 * 16] | |
15364 | pmulhrsw m5, m3 | |
15365 | packuswb m6, m5 | |
15366 | movu [r0 + 411 * 16], m6 | |
15367 | ||
15368 | ; mode 27 [row 12] | |
15369 | pmaddubsw m6, m7, [r5 + 26 * 16] | |
15370 | pmulhrsw m6, m3 | |
15371 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
15372 | pmulhrsw m5, m3 | |
15373 | packuswb m6, m5 | |
15374 | movu [r0 + 412 * 16], m6 | |
15375 | ||
15376 | ; mode 27 [row 13] | |
15377 | pmaddubsw m6, m7, [r5 + 28 * 16] | |
15378 | pmulhrsw m6, m3 | |
15379 | pmaddubsw m5, m4, [r5 + 28 * 16] | |
15380 | pmulhrsw m5, m3 | |
15381 | packuswb m6, m5 | |
15382 | movu [r0 + 413 * 16], m6 | |
15383 | ||
15384 | ; mode 27 [row 14] | |
15385 | pmaddubsw m6, m7, [r5 + 30 * 16] | |
15386 | pmulhrsw m6, m3 | |
15387 | pmaddubsw m5, m4, [r5 + 30 * 16] | |
15388 | pmulhrsw m5, m3 | |
15389 | packuswb m6, m5 | |
15390 | movu [r0 + 414 * 16], m6 | |
15391 | ||
15392 | ; mode 28 [row 0] | |
15393 | movu m1, [r3 + 1] | |
15394 | psrldq m2, m1, 1 | |
15395 | punpcklbw m1, m2 | |
15396 | movu m4, [r3 + 9] | |
15397 | psrldq m5, m4, 1 | |
15398 | punpcklbw m4, m5 | |
15399 | pmaddubsw m2, m1, [r5 + 5 * 16] | |
15400 | pmulhrsw m2, m3 | |
15401 | pmaddubsw m5, m4, [r5 + 5 * 16] | |
15402 | pmulhrsw m5, m3 | |
15403 | packuswb m2, m5 | |
15404 | movu [r0 + 416 * 16], m2 | |
15405 | ||
15406 | ; mode 28 [row 0] (NOTE(review): exact repeat of the preceding row 0 block; same inputs, same store) | |
15407 | pmaddubsw m2, m1, [r5 + 5 * 16] | |
15408 | pmulhrsw m2, m3 | |
15409 | pmaddubsw m5, m4, [r5 + 5 * 16] | |
15410 | pmulhrsw m5, m3 | |
15411 | packuswb m2, m5 | |
15412 | movu [r0 + 416 * 16], m2 | |
15413 | ||
15414 | ; mode 28 [row 1] | |
15415 | pmaddubsw m2, m1, [r5 + 10 * 16] | |
15416 | pmulhrsw m2, m3 | |
15417 | pmaddubsw m5, m4, [r5 + 10 * 16] | |
15418 | pmulhrsw m5, m3 | |
15419 | packuswb m2, m5 | |
15420 | movu [r0 + 417 * 16], m2 | |
15421 | ||
15422 | ; mode 28 [row 2] | |
15423 | pmaddubsw m2, m1, [r5 + 15 * 16] | |
15424 | pmulhrsw m2, m3 | |
15425 | pmaddubsw m5, m4, [r5 + 15 * 16] | |
15426 | pmulhrsw m5, m3 | |
15427 | packuswb m2, m5 | |
15428 | movu [r0 + 418 * 16], m2 | |
15429 | ||
15430 | ; mode 28 [row 3] | |
15431 | pmaddubsw m2, m1, [r5 + 20 * 16] | |
15432 | pmulhrsw m2, m3 | |
15433 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
15434 | pmulhrsw m5, m3 | |
15435 | packuswb m2, m5 | |
15436 | movu [r0 + 419 * 16], m2 | |
15437 | ||
15438 | ; mode 28 [row 4] | |
15439 | pmaddubsw m2, m1, [r5 + 25 * 16] | |
15440 | pmulhrsw m2, m3 | |
15441 | pmaddubsw m5, m4, [r5 + 25 * 16] | |
15442 | pmulhrsw m5, m3 | |
15443 | packuswb m2, m5 | |
15444 | movu [r0 + 420 * 16], m2 | |
15445 | ||
15446 | ; mode 28 [row 5] | |
15447 | pmaddubsw m2, m1, [r5 + 30 * 16] | |
15448 | pmulhrsw m2, m3 | |
15449 | pmaddubsw m5, m4, [r5 + 30 * 16] | |
15450 | pmulhrsw m5, m3 | |
15451 | packuswb m2, m5 | |
15452 | movu [r0 + 421 * 16], m2 | |
15453 | ||
15454 | ; mode 29 [row 0] | |
15455 | pmaddubsw m2, m1, [r5 + 9 * 16] | |
15456 | pmulhrsw m2, m3 | |
15457 | pmaddubsw m5, m4, [r5 + 9 * 16] | |
15458 | pmulhrsw m5, m3 | |
15459 | packuswb m2, m5 | |
15460 | movu [r0 + 432 * 16], m2 | |
15461 | ||
15462 | ; mode 29 [row 1] | |
15463 | pmaddubsw m2, m1, [r5 + 18 * 16] | |
15464 | pmulhrsw m2, m3 | |
15465 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
15466 | pmulhrsw m5, m3 | |
15467 | packuswb m2, m5 | |
15468 | movu [r0 + 433 * 16], m2 | |
15469 | ||
15470 | ; mode 29 [row 2] | |
15471 | pmaddubsw m2, m1, [r5 + 27 * 16] | |
15472 | pmulhrsw m2, m3 | |
15473 | pmaddubsw m5, m4, [r5 + 27 * 16] | |
15474 | pmulhrsw m5, m3 | |
15475 | packuswb m2, m5 | |
15476 | movu [r0 + 434 * 16], m2 | |
15477 | ||
15478 | ; mode 30 [row 0] | |
15479 | pmaddubsw m2, m1, [r5 + 13 * 16] | |
15480 | pmulhrsw m2, m3 | |
15481 | pmaddubsw m5, m4, [r5 + 13 * 16] | |
15482 | pmulhrsw m5, m3 | |
15483 | packuswb m2, m5 | |
15484 | movu [r0 + 448 * 16], m2 | |
15485 | ||
15486 | ; mode 30 [row 1] | |
15487 | pmaddubsw m2, m1, [r5 + 26 * 16] | |
15488 | pmulhrsw m2, m3 | |
15489 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
15490 | pmulhrsw m5, m3 | |
15491 | packuswb m2, m5 | |
15492 | movu [r0 + 449 * 16], m2 | |
15493 | ||
15494 | ; mode 33 [row 0] | |
15495 | movu [r0 + 496 * 16], m2 | |
15496 | ||
15497 | ; mode 31 [row 0] | |
15498 | pmaddubsw m2, m1, [r5 + 17 * 16] | |
15499 | pmulhrsw m2, m3 | |
15500 | pmaddubsw m5, m4, [r5 + 17 * 16] | |
15501 | pmulhrsw m5, m3 | |
15502 | packuswb m2, m5 | |
15503 | movu [r0 + 464 * 16], m2 | |
15504 | ||
15505 | ; mode 32 [row 0] | |
15506 | pmaddubsw m2, m1, [r5 + 21 * 16] | |
15507 | pmulhrsw m2, m3 | |
15508 | pmaddubsw m5, m4, [r5 + 21 * 16] | |
15509 | pmulhrsw m5, m3 | |
15510 | packuswb m2, m5 | |
15511 | movu [r0 + 480 * 16], m2 | |
15512 | ||
15513 | ; mode 28 [row 6] | |
15514 | movd m7, [r3 + 9] | |
15515 | palignr m7, m1, 2 | |
15516 | pmaddubsw m2, m7, [r5 + 3 * 16] | |
15517 | pmulhrsw m2, m3 | |
15518 | movd m6, [r3 + 17] | |
15519 | palignr m6, m4, 2 | |
15520 | pmaddubsw m5, m6, [r5 + 3 * 16] | |
15521 | pmulhrsw m5, m3 | |
15522 | packuswb m2, m5 | |
15523 | movu [r0 + 422 * 16], m2 | |
15524 | ||
15525 | ; mode 28 [row 7] | |
15526 | pmaddubsw m2, m7, [r5 + 8 * 16] | |
15527 | pmulhrsw m2, m3 | |
15528 | pmaddubsw m5, m6, [r5 + 8 * 16] | |
15529 | pmulhrsw m5, m3 | |
15530 | packuswb m2, m5 | |
15531 | movu [r0 + 423 * 16], m2 | |
15532 | ||
15533 | ; mode 28 [row 8] | |
15534 | pmaddubsw m2, m7, [r5 + 13 * 16] | |
15535 | pmulhrsw m2, m3 | |
15536 | pmaddubsw m5, m6, [r5 + 13 * 16] | |
15537 | pmulhrsw m5, m3 | |
15538 | packuswb m2, m5 | |
15539 | movu [r0 + 424 * 16], m2 | |
15540 | ||
15541 | ; mode 28 [row 9] | |
15542 | pmaddubsw m2, m7, [r5 + 18 * 16] | |
15543 | pmulhrsw m2, m3 | |
15544 | pmaddubsw m5, m6, [r5 + 18 * 16] | |
15545 | pmulhrsw m5, m3 | |
15546 | packuswb m2, m5 | |
15547 | movu [r0 + 425 * 16], m2 | |
15548 | ||
15549 | ; mode 28 [row 10] | |
15550 | pmaddubsw m2, m7, [r5 + 23 * 16] | |
15551 | pmulhrsw m2, m3 | |
15552 | pmaddubsw m5, m6, [r5 + 23 * 16] | |
15553 | pmulhrsw m5, m3 | |
15554 | packuswb m2, m5 | |
15555 | movu [r0 + 426 * 16], m2 | |
15556 | ||
15557 | ; mode 29 [row 3] | |
15558 | pmaddubsw m2, m7, [r5 + 4 * 16] | |
15559 | pmulhrsw m2, m3 | |
15560 | pmaddubsw m5, m6, [r5 + 4 * 16] | |
15561 | pmulhrsw m5, m3 | |
15562 | packuswb m2, m5 | |
15563 | movu [r0 + 435 * 16], m2 | |
15564 | ||
15565 | ; mode 29 [row 4] | |
15566 | pmaddubsw m2, m7, [r5 + 13 * 16] | |
15567 | pmulhrsw m2, m3 | |
15568 | pmaddubsw m5, m6, [r5 + 13 * 16] | |
15569 | pmulhrsw m5, m3 | |
15570 | packuswb m2, m5 | |
15571 | movu [r0 + 436 * 16], m2 | |
15572 | ||
15573 | ; mode 29 [row 5] | |
15574 | pmaddubsw m2, m7, [r5 + 22 * 16] | |
15575 | pmulhrsw m2, m3 | |
15576 | pmaddubsw m5, m6, [r5 + 22 * 16] | |
15577 | pmulhrsw m5, m3 | |
15578 | packuswb m2, m5 | |
15579 | movu [r0 + 437 * 16], m2 | |
15580 | ||
15581 | ; mode 29 [row 6] | |
15582 | pmaddubsw m2, m7, [r5 + 31 * 16] | |
15583 | pmulhrsw m2, m3 | |
15584 | pmaddubsw m5, m6, [r5 + 31 * 16] | |
15585 | pmulhrsw m5, m3 | |
15586 | packuswb m2, m5 | |
15587 | movu [r0 + 438 * 16], m2 | |
15588 | ||
15589 | ; mode 32 [row 2] | |
15590 | movu [r0 + 482 * 16], m2 | |
15591 | ||
15592 | ; mode 30 [row 2] | |
15593 | pmaddubsw m2, m7, [r5 + 7 * 16] | |
15594 | pmulhrsw m2, m3 | |
15595 | pmaddubsw m5, m6, [r5 + 7 * 16] | |
15596 | pmulhrsw m5, m3 | |
15597 | packuswb m2, m5 | |
15598 | movu [r0 + 450 * 16], m2 | |
15599 | ||
15600 | ; mode 30 [row 3] | |
15601 | pmaddubsw m2, m7, [r5 + 20 * 16] | |
15602 | pmulhrsw m2, m3 | |
15603 | pmaddubsw m5, m6, [r5 + 20 * 16] | |
15604 | pmulhrsw m5, m3 | |
15605 | packuswb m2, m5 | |
15606 | movu [r0 + 451 * 16], m2 | |
15607 | ||
15608 | ; mode 33 [row 1] | |
15609 | movu [r0 + 497 * 16], m2 | |
15610 | ||
15611 | ; mode 31 [row 1] | |
15612 | pmaddubsw m2, m7, [r5 + 2 * 16] | |
15613 | pmulhrsw m2, m3 | |
15614 | pmaddubsw m5, m6, [r5 + 2 * 16] | |
15615 | pmulhrsw m5, m3 | |
15616 | packuswb m2, m5 | |
15617 | movu [r0 + 465 * 16], m2 | |
15618 | ||
15619 | ; mode 31 [row 2] | |
15620 | pmaddubsw m2, m7, [r5 + 19 * 16] | |
15621 | pmulhrsw m2, m3 | |
15622 | pmaddubsw m5, m6, [r5 + 19 * 16] | |
15623 | pmulhrsw m5, m3 | |
15624 | packuswb m2, m5 | |
15625 | movu [r0 + 466 * 16], m2 | |
15626 | ||
15627 | ; mode 32 [row 1] | |
15628 | pmaddubsw m2, m7, [r5 + 10 * 16] | |
15629 | pmulhrsw m2, m3 | |
15630 | pmaddubsw m5, m6, [r5 + 10 * 16] | |
15631 | pmulhrsw m5, m3 | |
15632 | packuswb m2, m5 | |
15633 | movu [r0 + 481 * 16], m2 | |
15634 | ||
15635 | ; mode 28 [row 11] | |
15636 | pmaddubsw m2, m7, [r5 + 28 * 16] | |
15637 | pmulhrsw m2, m3 | |
15638 | pmaddubsw m5, m6, [r5 + 28 * 16] | |
15639 | pmulhrsw m5, m3 | |
15640 | packuswb m2, m5 | |
15641 | movu [r0 + 427 * 16], m2 | |
15642 | ||
15643 | ; mode 28 [row 12] | |
15644 | movd m1, [r3 + 10] | |
15645 | palignr m1, m7, 2 | |
15646 | pmaddubsw m2, m1, [r5 + 1 * 16] | |
15647 | pmulhrsw m2, m3 | |
15648 | movd m4, [r3 + 18] | |
15649 | palignr m4, m6, 2 | |
15650 | pmaddubsw m5, m4, [r5 + 1 * 16] | |
15651 | pmulhrsw m5, m3 | |
15652 | packuswb m2, m5 | |
15653 | movu [r0 + 428 * 16], m2 | |
15654 | ||
15655 | ; mode 30 [row 4] | |
15656 | movu [r0 + 452 * 16], m2 | |
15657 | ||
15658 | ; mode 28 [row 13] | |
15659 | pmaddubsw m2, m1, [r5 + 6 * 16] | |
15660 | pmulhrsw m2, m3 | |
15661 | pmaddubsw m5, m4, [r5 + 6 * 16] | |
15662 | pmulhrsw m5, m3 | |
15663 | packuswb m2, m5 | |
15664 | movu [r0 + 429 * 16], m2 | |
15665 | ||
15666 | ; mode 28 [row 14] | |
15667 | pmaddubsw m2, m1, [r5 + 11 * 16] | |
15668 | pmulhrsw m2, m3 | |
15669 | pmaddubsw m5, m4, [r5 + 11 * 16] | |
15670 | pmulhrsw m5, m3 | |
15671 | packuswb m2, m5 | |
15672 | movu [r0 + 430 * 16], m2 | |
15673 | ||
15674 | ; mode 28 [row 15] | |
15675 | pmaddubsw m2, m1, [r5 + 16 * 16] | |
15676 | pmulhrsw m2, m3 | |
15677 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
15678 | pmulhrsw m5, m3 | |
15679 | packuswb m2, m5 | |
15680 | movu [r0 + 431 * 16], m2 | |
15681 | ||
15682 | ; mode 29 [row 7] | |
15683 | pmaddubsw m2, m1, [r5 + 8 * 16] | |
15684 | pmulhrsw m2, m3 | |
15685 | pmaddubsw m5, m4, [r5 + 8 * 16] | |
15686 | pmulhrsw m5, m3 | |
15687 | packuswb m2, m5 | |
15688 | movu [r0 + 439 * 16], m2 | |
15689 | ||
15690 | ; mode 29 [row 8] | |
15691 | pmaddubsw m2, m1, [r5 + 17 * 16] | |
15692 | pmulhrsw m2, m3 | |
15693 | pmaddubsw m5, m4, [r5 + 17 * 16] | |
15694 | pmulhrsw m5, m3 | |
15695 | packuswb m2, m5 | |
15696 | movu [r0 + 440 * 16], m2 | |
15697 | ||
15698 | ; mode 29 [row 9] | |
15699 | pmaddubsw m2, m1, [r5 + 26 * 16] | |
15700 | pmulhrsw m2, m3 | |
15701 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
15702 | pmulhrsw m5, m3 | |
15703 | packuswb m2, m5 | |
15704 | movu [r0 + 441 * 16], m2 | |
15705 | ||
15706 | ; mode 30 [row 5] | |
15707 | pmaddubsw m2, m1, [r5 + 14 * 16] | |
15708 | pmulhrsw m2, m3 | |
15709 | pmaddubsw m5, m4, [r5 + 14 * 16] | |
15710 | pmulhrsw m5, m3 | |
15711 | packuswb m2, m5 | |
15712 | movu [r0 + 453 * 16], m2 | |
15713 | ||
15714 | ; mode 33 [row 2] | |
15715 | movu [r0 + 498 * 16], m2 | |
15716 | ||
15717 | ; mode 30 [row 6] | |
15718 | pmaddubsw m2, m1, [r5 + 27 * 16] | |
15719 | pmulhrsw m2, m3 | |
15720 | pmaddubsw m5, m4, [r5 + 27 * 16] | |
15721 | pmulhrsw m5, m3 | |
15722 | packuswb m2, m5 | |
15723 | movu [r0 + 454 * 16], m2 | |
15724 | ||
15725 | ; mode 31 [row 3] | |
15726 | pmaddubsw m2, m1, [r5 + 4 * 16] | |
15727 | pmulhrsw m2, m3 | |
15728 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
15729 | pmulhrsw m5, m3 | |
15730 | packuswb m2, m5 | |
15731 | movu [r0 + 467 * 16], m2 | |
15732 | ||
15733 | ; mode 31 [row 4] | |
15734 | pmaddubsw m2, m1, [r5 + 21 * 16] | |
15735 | pmulhrsw m2, m3 | |
15736 | pmaddubsw m5, m4, [r5 + 21 * 16] | |
15737 | pmulhrsw m5, m3 | |
15738 | packuswb m2, m5 | |
15739 | movu [r0 + 468 * 16], m2 | |
15740 | ||
15741 | ; mode 32 [row 3] | |
15742 | pmaddubsw m2, m1, [r5 + 20 * 16] | |
15743 | pmulhrsw m2, m3 | |
15744 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
15745 | pmulhrsw m5, m3 | |
15746 | packuswb m2, m5 | |
15747 | movu [r0 + 483 * 16], m2 | |
15748 | ||
15749 | ; mode 29 [row 10] | |
15750 | movd m7, [r3 + 11] | |
15751 | palignr m7, m1, 2 | |
15752 | pmaddubsw m2, m7, [r5 + 3 * 16] | |
15753 | pmulhrsw m2, m3 | |
15754 | movd m6, [r3 + 19] | |
15755 | palignr m6, m4, 2 | |
15756 | pmaddubsw m5, m6, [r5 + 3 * 16] | |
15757 | pmulhrsw m5, m3 | |
15758 | packuswb m2, m5 | |
15759 | movu [r0 + 442 * 16], m2 | |
15760 | ||
15761 | ; mode 29 [row 11] | |
15762 | pmaddubsw m2, m7, [r5 + 12 * 16] | |
15763 | pmulhrsw m2, m3 | |
15764 | pmaddubsw m5, m6, [r5 + 12 * 16] | |
15765 | pmulhrsw m5, m3 | |
15766 | packuswb m2, m5 | |
15767 | movu [r0 + 443 * 16], m2 | |
15768 | ||
15769 | ; mode 29 [row 12] | |
15770 | pmaddubsw m2, m7, [r5 + 21 * 16] | |
15771 | pmulhrsw m2, m3 | |
15772 | pmaddubsw m5, m6, [r5 + 21 * 16] | |
15773 | pmulhrsw m5, m3 | |
15774 | packuswb m2, m5 | |
15775 | movu [r0 + 444 * 16], m2 | |
15776 | ||
15777 | ; mode 30 [row 8] | |
15778 | movu [r0 + 456 * 16], m2 | |
15779 | ||
15780 | ; mode 29 [row 13] | |
15781 | pmaddubsw m2, m7, [r5 + 30 * 16] | |
15782 | pmulhrsw m2, m3 | |
15783 | pmaddubsw m5, m6, [r5 + 30 * 16] | |
15784 | pmulhrsw m5, m3 | |
15785 | packuswb m2, m5 | |
15786 | movu [r0 + 445 * 16], m2 | |
15787 | ||
15788 | ; mode 32 [row 5] | |
15789 | movu [r0 + 485 * 16], m2 | |
15790 | ||
15791 | ; mode 30 [row 7] | |
15792 | pmaddubsw m2, m7, [r5 + 8 * 16] | |
15793 | pmulhrsw m2, m3 | |
15794 | pmaddubsw m5, m6, [r5 + 8 * 16] | |
15795 | pmulhrsw m5, m3 | |
15796 | packuswb m2, m5 | |
15797 | movu [r0 + 455 * 16], m2 | |
15798 | ||
15799 | ; mode 33 [row 3] | |
15800 | movu [r0 + 499 * 16], m2 | |
15801 | ||
15802 | ; mode 31 [row 5] | |
15803 | pmaddubsw m2, m7, [r5 + 6 * 16] | |
15804 | pmulhrsw m2, m3 | |
15805 | pmaddubsw m5, m6, [r5 + 6 * 16] | |
15806 | pmulhrsw m5, m3 | |
15807 | packuswb m2, m5 | |
15808 | movu [r0 + 469 * 16], m2 | |
15809 | ||
15810 | ; mode 31 [row 6] | |
15811 | pmaddubsw m2, m7, [r5 + 23 * 16] | |
15812 | pmulhrsw m2, m3 | |
15813 | pmaddubsw m5, m6, [r5 + 23 * 16] | |
15814 | pmulhrsw m5, m3 | |
15815 | packuswb m2, m5 | |
15816 | movu [r0 + 470 * 16], m2 | |
15817 | ||
15818 | ; mode 32 [row 4] | |
15819 | pmaddubsw m2, m7, [r5 + 9 * 16] | |
15820 | pmulhrsw m2, m3 | |
15821 | pmaddubsw m5, m6, [r5 + 9 * 16] | |
15822 | pmulhrsw m5, m3 | |
15823 | packuswb m2, m5 | |
15824 | movu [r0 + 484 * 16], m2 | |
15825 | ||
15826 | movu m1, m7 | |
15827 | movu m4, m6 | |
15828 | ||
15829 | ; mode 29 [row 14] | |
15830 | movu m1, [r3 + 12] | |
15831 | palignr m1, m7, 2 | |
15832 | pmaddubsw m2, m1, [r5 + 7 * 16] | |
15833 | pmulhrsw m2, m3 | |
15834 | movd m4, [r3 + 20] | |
15835 | palignr m4, m6, 2 | |
15836 | pmaddubsw m5, m4, [r5 + 7 * 16] | |
15837 | pmulhrsw m5, m3 | |
15838 | packuswb m2, m5 | |
15839 | movu [r0 + 446 * 16], m2 | |
15840 | ||
15841 | ; mode 29 [row 15] | |
15842 | pmaddubsw m2, m1, [r5 + 16 * 16] | |
15843 | pmulhrsw m2, m3 | |
15844 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
15845 | pmulhrsw m5, m3 | |
15846 | packuswb m2, m5 | |
15847 | movu [r0 + 447 * 16], m2 | |
15848 | ||
15849 | ; mode 30 [row 9] | |
15850 | pmaddubsw m2, m1, [r5 + 2 * 16] | |
15851 | pmulhrsw m2, m3 | |
15852 | pmaddubsw m5, m4, [r5 + 2 * 16] | |
15853 | pmulhrsw m5, m3 | |
15854 | packuswb m2, m5 | |
15855 | movu [r0 + 457 * 16], m2 | |
15856 | ||
15857 | ; mode 33 [row 4] | |
15858 | movu [r0 + 500 * 16], m2 | |
15859 | ||
15860 | ; mode 30 [row 10] | |
15861 | pmaddubsw m2, m1, [r5 + 15 * 16] | |
15862 | pmulhrsw m2, m3 | |
15863 | pmaddubsw m5, m4, [r5 + 15 * 16] | |
15864 | pmulhrsw m5, m3 | |
15865 | packuswb m2, m5 | |
15866 | movu [r0 + 458 * 16], m2 | |
15867 | ||
15868 | ; mode 30 [row 11] | |
15869 | pmaddubsw m2, m1, [r5 + 28 * 16] | |
15870 | pmulhrsw m2, m3 | |
15871 | pmaddubsw m5, m4, [r5 + 28 * 16] | |
15872 | pmulhrsw m5, m3 | |
15873 | packuswb m2, m5 | |
15874 | movu [r0 + 459 * 16], m2 | |
15875 | ||
15876 | ; mode 33 [row 5] | |
15877 | movu [r0 + 501 * 16], m2 | |
15878 | ||
15879 | ; mode 31 [row 7] | |
15880 | pmaddubsw m2, m1, [r5 + 8 * 16] | |
15881 | pmulhrsw m2, m3 | |
15882 | pmaddubsw m5, m4, [r5 + 8 * 16] | |
15883 | pmulhrsw m5, m3 | |
15884 | packuswb m2, m5 | |
15885 | movu [r0 + 471 * 16], m2 | |
15886 | ||
15887 | ; mode 31 [row 8] | |
15888 | pmaddubsw m2, m1, [r5 + 25 * 16] | |
15889 | pmulhrsw m2, m3 | |
15890 | pmaddubsw m5, m4, [r5 + 25 * 16] | |
15891 | pmulhrsw m5, m3 | |
15892 | packuswb m2, m5 | |
15893 | movu [r0 + 472 * 16], m2 | |
15894 | ||
15895 | ; mode 32 [row 6] | |
15896 | pmaddubsw m2, m1, [r5 + 19 * 16] | |
15897 | pmulhrsw m2, m3 | |
15898 | pmaddubsw m5, m4, [r5 + 19 * 16] | |
15899 | pmulhrsw m5, m3 | |
15900 | packuswb m2, m5 | |
15901 | movu [r0 + 486 * 16], m2 | |
15902 | ||
15903 | ; mode 30 [row 12] | |
15904 | movd m7, [r3 + 13] | |
15905 | palignr m7, m1, 2 | |
15906 | pmaddubsw m2, m7, [r5 + 9 * 16] | |
15907 | pmulhrsw m2, m3 | |
15908 | movd m6, [r3 + 21] | |
15909 | palignr m6, m4, 2 | |
15910 | pmaddubsw m5, m6, [r5 + 9 * 16] | |
15911 | pmulhrsw m5, m3 | |
15912 | packuswb m2, m5 | |
15913 | movu [r0 + 460 * 16], m2 | |
15914 | ||
15915 | ; mode 30 [row 13] | |
15916 | pmaddubsw m2, m7, [r5 + 22 * 16] | |
15917 | pmulhrsw m2, m3 | |
15918 | pmaddubsw m5, m6, [r5 + 22 * 16] | |
15919 | pmulhrsw m5, m3 | |
15920 | packuswb m2, m5 | |
15921 | movu [r0 + 461 * 16], m2 | |
15922 | ||
15923 | ; mode 33 [row 6] | |
15924 | movu [r0 + 502 * 16], m2 | |
15925 | ||
15926 | ; mode 31 [row 9] | |
15927 | pmaddubsw m2, m7, [r5 + 10 * 16] | |
15928 | pmulhrsw m2, m3 | |
15929 | pmaddubsw m5, m6, [r5 + 10 * 16] | |
15930 | pmulhrsw m5, m3 | |
15931 | packuswb m2, m5 | |
15932 | movu [r0 + 473 * 16], m2 | |
15933 | ||
15934 | ; mode 31 [row 10] | |
15935 | pmaddubsw m2, m7, [r5 + 27 * 16] | |
15936 | pmulhrsw m2, m3 | |
15937 | pmaddubsw m5, m6, [r5 + 27 * 16] | |
15938 | pmulhrsw m5, m3 | |
15939 | packuswb m2, m5 | |
15940 | movu [r0 + 474 * 16], m2 | |
15941 | ||
15942 | ; mode 32 [row 7] | |
15943 | pmaddubsw m2, m7, [r5 + 8 * 16] | |
15944 | pmulhrsw m2, m3 | |
15945 | pmaddubsw m5, m6, [r5 + 8 * 16] | |
15946 | pmulhrsw m5, m3 | |
15947 | packuswb m2, m5 | |
15948 | movu [r0 + 487 * 16], m2 | |
15949 | ||
15950 | ; mode 32 [row 8] | |
15951 | pmaddubsw m2, m7, [r5 + 29 * 16] | |
15952 | pmulhrsw m2, m3 | |
15953 | pmaddubsw m5, m6, [r5 + 29 * 16] | |
15954 | pmulhrsw m5, m3 | |
15955 | packuswb m2, m5 | |
15956 | movu [r0 + 488 * 16], m2 | |
15957 | ||
15958 | ||
15959 | movu m1, m7 | |
15960 | movu m4, m6 | |
15961 | ||
15962 | ; mode 30 [row 14] | |
15963 | movd m1, [r3 + 14] | |
15964 | palignr m1, m7, 2 | |
15965 | pmaddubsw m2, m1, [r5 + 3 * 16] | |
15966 | pmulhrsw m2, m3 | |
15967 | movd m4, [r3 + 22] | |
15968 | palignr m4, m6, 2 | |
15969 | pmaddubsw m5, m4, [r5 + 3 * 16] | |
15970 | pmulhrsw m5, m3 | |
15971 | packuswb m2, m5 | |
15972 | movu [r0 + 462 * 16], m2 | |
15973 | ||
15974 | ; mode 30 [row 15] | |
15975 | pmaddubsw m2, m1, [r5 + 16 * 16] | |
15976 | pmulhrsw m2, m3 | |
15977 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
15978 | pmulhrsw m5, m3 | |
15979 | packuswb m2, m5 | |
15980 | movu [r0 + 463 * 16], m2 | |
15981 | ||
15982 | ; mode 33 [row 7] | |
15983 | movu [r0 + 503 * 16], m2 | |
15984 | ||
15985 | ; mode 31 [row 11] | |
15986 | pmaddubsw m2, m1, [r5 + 12 * 16] | |
15987 | pmulhrsw m2, m3 | |
15988 | pmaddubsw m5, m4, [r5 + 12 * 16] | |
15989 | pmulhrsw m5, m3 | |
15990 | packuswb m2, m5 | |
15991 | movu [r0 + 475 * 16], m2 | |
15992 | ||
15993 | ; mode 31 [row 12] | |
15994 | pmaddubsw m2, m1, [r5 + 29 * 16] | |
15995 | pmulhrsw m2, m3 | |
15996 | pmaddubsw m5, m4, [r5 + 29 * 16] | |
15997 | pmulhrsw m5, m3 | |
15998 | packuswb m2, m5 | |
15999 | movu [r0 + 476 * 16], m2 | |
16000 | ||
16001 | ; mode 32 [row 9] | |
16002 | pmaddubsw m2, m1, [r5 + 18 * 16] | |
16003 | pmulhrsw m2, m3 | |
16004 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
16005 | pmulhrsw m5, m3 | |
16006 | packuswb m2, m5 | |
16007 | movu [r0 + 489 * 16], m2 | |
16008 | ||
16009 | ; mode 31 [row 13] | |
16010 | movd m7, [r3 + 15] | |
16011 | palignr m7, m1, 2 | |
16012 | pmaddubsw m2, m7, [r5 + 14 * 16] | |
16013 | pmulhrsw m2, m3 | |
16014 | movd m6, [r3 + 23] | |
16015 | palignr m6, m4, 2 | |
16016 | pmaddubsw m5, m6, [r5 + 14 * 16] | |
16017 | pmulhrsw m5, m3 | |
16018 | packuswb m2, m5 | |
16019 | movu [r0 + 477 * 16], m2 | |
16020 | ||
16021 | ; mode 31 [row 14] | |
16022 | pmaddubsw m2, m7, [r5 + 31 * 16] | |
16023 | pmulhrsw m2, m3 | |
16024 | pmaddubsw m5, m6, [r5 + 31 * 16] | |
16025 | pmulhrsw m5, m3 | |
16026 | packuswb m2, m5 | |
16027 | movu [r0 + 478 * 16], m2 | |
16028 | ||
16029 | ; mode 32 [row 10] | |
16030 | pmaddubsw m2, m7, [r5 + 7 * 16] | |
16031 | pmulhrsw m2, m3 | |
16032 | pmaddubsw m5, m6, [r5 + 7 * 16] | |
16033 | pmulhrsw m5, m3 | |
16034 | packuswb m2, m5 | |
16035 | movu [r0 + 490 * 16], m2 | |
16036 | ||
16037 | ; mode 32 [row 11] | |
16038 | pmaddubsw m2, m7, [r5 + 28 * 16] | |
16039 | pmulhrsw m2, m3 | |
16040 | pmaddubsw m5, m6, [r5 + 28 * 16] | |
16041 | pmulhrsw m5, m3 | |
16042 | packuswb m2, m5 | |
16043 | movu [r0 + 491 * 16], m2 | |
16044 | ||
16045 | ; mode 33 [row 8] | |
16046 | pmaddubsw m2, m7, [r5 + 10 * 16] | |
16047 | pmulhrsw m2, m3 | |
16048 | pmaddubsw m5, m6, [r5 + 10 * 16] | |
16049 | pmulhrsw m5, m3 | |
16050 | packuswb m2, m5 | |
16051 | movu [r0 + 504 * 16], m2 | |
16052 | ||
16053 | ; mode 31 [row 15] | |
16054 | movd m1, [r3 + 16] | |
16055 | palignr m1, m7, 2 | |
16056 | pmaddubsw m2, m1, [r5 + 16 * 16] | |
16057 | pmulhrsw m2, m3 | |
16058 | movd m4, [r3 + 24] | |
16059 | palignr m4, m6, 2 | |
16060 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
16061 | pmulhrsw m5, m3 | |
16062 | packuswb m2, m5 | |
16063 | movu [r0 + 479 * 16], m2 | |
16064 | ||
16065 | ; mode 32 [row 12] | |
16066 | pmaddubsw m2, m1, [r5 + 17 * 16] | |
16067 | pmulhrsw m2, m3 | |
16068 | pmaddubsw m5, m4, [r5 + 17 * 16] | |
16069 | pmulhrsw m5, m3 | |
16070 | packuswb m2, m5 | |
16071 | movu [r0 + 492 * 16], m2 | |
16072 | ||
16073 | ; mode 33 [row 9] | |
16074 | pmaddubsw m2, m1, [r5 + 4 * 16] | |
16075 | pmulhrsw m2, m3 | |
16076 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
16077 | pmulhrsw m5, m3 | |
16078 | packuswb m2, m5 | |
16079 | movu [r0 + 505 * 16], m2 | |
16080 | ||
16081 | ; mode 33 [row 10] | |
16082 | pmaddubsw m2, m1, [r5 + 30 * 16] | |
16083 | pmulhrsw m2, m3 | |
16084 | pmaddubsw m5, m4, [r5 + 30 * 16] | |
16085 | pmulhrsw m5, m3 | |
16086 | packuswb m2, m5 | |
16087 | movu [r0 + 506 * 16], m2 | |
16088 | ||
16089 | ; mode 33 [row 9] (repeated: same inputs, table entry and destination as the row 9 block above) | |
16090 | pmaddubsw m2, m1, [r5 + 4 * 16] | |
16091 | pmulhrsw m2, m3 | |
16092 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
16093 | pmulhrsw m5, m3 | |
16094 | packuswb m2, m5 | |
16095 | movu [r0 + 505 * 16], m2 | |
16096 | ||
16097 | ; mode 32 [row 13] | |
16098 | movd m7, [r3 + 17] | |
16099 | palignr m7, m1, 2 | |
16100 | pmaddubsw m2, m7, [r5 + 6 * 16] | |
16101 | pmulhrsw m2, m3 | |
16102 | ||
16103 | movd m6, [r3 + 25] | |
16104 | palignr m6, m4, 2 | |
16105 | pmaddubsw m5, m6, [r5 + 6 * 16] | |
16106 | pmulhrsw m5, m3 | |
16107 | packuswb m2, m5 | |
16108 | movu [r0 + 493 * 16], m2 | |
16109 | ||
16110 | ; mode 32 [row 14] | |
16111 | pmaddubsw m2, m7, [r5 + 27 * 16] | |
16112 | pmulhrsw m2, m3 | |
16113 | pmaddubsw m5, m6, [r5 + 27 * 16] | |
16114 | pmulhrsw m5, m3 | |
16115 | packuswb m2, m5 | |
16116 | movu [r0 + 494 * 16], m2 | |
16117 | ||
16118 | ; mode 33 [row 11] | |
16119 | pmaddubsw m2, m7, [r5 + 24 * 16] | |
16120 | pmulhrsw m2, m3 | |
16121 | pmaddubsw m5, m6, [r5 + 24 * 16] | |
16122 | pmulhrsw m5, m3 | |
16123 | packuswb m2, m5 | |
16124 | movu [r0 + 507 * 16], m2 | |
16125 | ||
16126 | ; mode 32 [row 15] | |
16127 | movd m1, [r3 + 18] | |
16128 | palignr m1, m7, 2 | |
16129 | pmaddubsw m2, m1, [r5 + 16 * 16] | |
16130 | pmulhrsw m2, m3 | |
16131 | psrldq m4, 2 | |
16132 | pinsrb m4, [r3 + 26], 14 | |
16133 | pinsrb m4, [r3 + 27], 15 | |
16134 | movd m4, [r3 + 26] | |
16135 | palignr m4, m6, 2 | |
16136 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
16137 | pmulhrsw m5, m3 | |
16138 | packuswb m2, m5 | |
16139 | movu [r0 + 495 * 16], m2 | |
16140 | ||
16141 | ; mode 33 [row 12] | |
16142 | pmaddubsw m2, m1, [r5 + 18 * 16] | |
16143 | pmulhrsw m2, m3 | |
16144 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
16145 | pmulhrsw m5, m3 | |
16146 | packuswb m2, m5 | |
16147 | movu [r0 + 508 * 16], m2 | |
16148 | ||
16149 | ; mode 33 [row 13] | |
16150 | movd m7, [r3 + 19] | |
16151 | palignr m7, m1, 2 | |
16152 | pmaddubsw m2, m7, [r5 + 12 * 16] | |
16153 | pmulhrsw m2, m3 | |
16154 | movd m6, [r3 + 27] | |
16155 | palignr m6, m4, 2 | |
16156 | pmaddubsw m5, m6, [r5 + 12 * 16] | |
16157 | pmulhrsw m5, m3 | |
16158 | packuswb m2, m5 | |
16159 | movu [r0 + 509 * 16], m2 | |
16160 | ||
16161 | ; mode 33 [row 14] | |
16162 | movd m1, [r3 + 20] | |
16163 | palignr m1, m7, 2 | |
16164 | pmaddubsw m2, m1, [r5 + 6 * 16] | |
16165 | pmulhrsw m2, m3 | |
16166 | movd m4, [r3 + 28] | |
16167 | palignr m4, m6, 2 | |
16168 | pmaddubsw m5, m4, [r5 + 6 * 16] | |
16169 | pmulhrsw m5, m3 | |
16170 | packuswb m2, m5 | |
16171 | movu [r0 + 510 * 16], m2 | |
16172 | ||
16173 | ; mode 34 [row 0] | |
16174 | movu m1, [r3 + 2] | |
16175 | movu [r0 + 512 * 16], m1 | |
16176 | movu m2, [r3 + 18] | |
16177 | palignr m3, m2, m1, 1 | |
16178 | movu [r0 + 513 * 16], m3 | |
16179 | palignr m3, m2, m1, 2 | |
16180 | movu [r0 + 514 * 16], m3 | |
16181 | palignr m3, m2, m1, 3 | |
16182 | movu [r0 + 515 * 16], m3 | |
16183 | palignr m3, m2, m1, 4 | |
16184 | movu [r0 + 516 * 16], m3 | |
16185 | palignr m3, m2, m1, 5 | |
16186 | movu [r0 + 517 * 16], m3 | |
16187 | palignr m3, m2, m1, 6 | |
16188 | movu [r0 + 518 * 16], m3 | |
16189 | palignr m3, m2, m1, 7 | |
16190 | movu [r0 + 519 * 16], m3 | |
16191 | palignr m3, m2, m1, 8 | |
16192 | movu [r0 + 520 * 16], m3 | |
16193 | palignr m3, m2, m1, 9 | |
16194 | movu [r0 + 521 * 16], m3 | |
16195 | palignr m3, m2, m1, 10 | |
16196 | movu [r0 + 522 * 16], m3 | |
16197 | palignr m3, m2, m1, 11 | |
16198 | movu [r0 + 523 * 16], m3 | |
16199 | palignr m3, m2, m1, 12 | |
16200 | movu [r0 + 524 * 16], m3 | |
16201 | ||
16202 | ; mode 33 [row 15] | |
16203 | movu [r0 + 511 * 16], m3 | |
16204 | ||
16205 | ; mode 34 | |
16206 | palignr m3, m2, m1, 13 | |
16207 | movu [r0 + 525 * 16], m3 | |
16208 | palignr m3, m2, m1, 14 | |
16209 | movu [r0 + 526 * 16], m3 | |
16210 | palignr m3, m2, m1, 15 | |
16211 | movu [r0 + 527 * 16], m3 | |
16212 | ||
16213 | RET | |
16214 | ||
16215 | ;----------------------------------------------------------------------------- | |
16216 | ; void all_angs_pred_32x32(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma) | |
16217 | ;----------------------------------------------------------------------------- | |
16218 | INIT_XMM sse4 | |
16219 | cglobal all_angs_pred_32x32, 6, 6, 8, dest, above0, left0, above1, left1, bLuma | |
16220 | ||
16221 | ;mode 2[row 0] | |
16222 | movu m0, [r4 + 2] | |
16223 | movu [r0 + 0 * 16], m0 | |
16224 | movu m1, [r4 + 18] | |
16225 | movu [r0 + 1 * 16], m1 | |
16226 | ||
16227 | ;mode 9 [row 15] | |
16228 | movu [r0 + 478 * 16], m0 | |
16229 | movu [r0 + 479 * 16], m1 | |
16230 | ||
16231 | ;mode 2[row 1] | |
16232 | movu m2, [r4 + 34] | |
16233 | palignr m3, m1, m0, 1 | |
16234 | movu [r0 + 2 * 16], m3 | |
16235 | palignr m4, m2, m1, 1 | |
16236 | movu [r0 + 3 * 16], m4 | |
16237 | ||
16238 | ; mode 9 [row 31] | |
16239 | movu [r0 + 510 * 16], m3 | |
16240 | movu [r0 + 511 * 16], m4 | |
16241 | ||
16242 | ;mode 2[row 17] | |
16243 | movu [r0 + 34 * 16], m4 | |
16244 | movu m5, [r4 + 35] | |
16245 | movu [r0 + 35 * 16], m5 | |
16246 | ||
16247 | ;mode 2[row 2] | |
16248 | palignr m3, m1, m0, 2 | |
16249 | movu [r0 + 4 * 16], m3 | |
16250 | palignr m4, m2, m1, 2 | |
16251 | movu [r0 + 5 * 16], m4 | |
16252 | ||
16253 | ;mode 2[row 18] | |
16254 | movu [r0 + 36 * 16], m4 | |
16255 | movu m6, [r4 + 51] | |
16256 | palignr m7, m6, m5, 1 | |
16257 | movu [r0 + 37 * 16], m7 | |
16258 | ||
16259 | ;mode 2[row 3] | |
16260 | palignr m3, m1, m0, 3 | |
16261 | movu [r0 + 6 * 16], m3 | |
16262 | palignr m4, m2, m1, 3 | |
16263 | movu [r0 + 7 * 16], m4 | |
16264 | ||
16265 | ;mode 2[row 19] | |
16266 | movu [r0 + 38 * 16], m4 | |
16267 | palignr m7, m6, m5, 2 | |
16268 | movu [r0 + 39 * 16], m7 | |
16269 | ||
16270 | ;mode 2[row 4] | |
16271 | palignr m3, m1, m0, 4 | |
16272 | movu [r0 + 8 * 16], m3 | |
16273 | palignr m4, m2, m1, 4 | |
16274 | movu [r0 + 9 * 16], m4 | |
16275 | ||
16276 | ; mode 8 [row 31] | |
16277 | movu [r0 + 446 * 16], m3 | |
16278 | movu [r0 + 447 * 16], m4 | |
16279 | ||
16280 | ;mode 2[row 20] | |
16281 | movu [r0 + 40 * 16], m4 | |
16282 | palignr m7, m6, m5, 3 | |
16283 | movu [r0 + 41 * 16], m7 | |
16284 | ||
16285 | ; mode 4 [row 31] | |
16286 | movu [r0 + 190 * 16], m4 | |
16287 | movu [r0 + 191 * 16], m7 | |
16288 | ||
16289 | ;mode 2[row 5] | |
16290 | palignr m3, m1, m0, 5 | |
16291 | movu [r0 + 10 * 16], m3 | |
16292 | palignr m4, m2, m1, 5 | |
16293 | movu [r0 + 11 * 16], m4 | |
16294 | ||
16295 | ;mode 2[row 21] | |
16296 | movu [r0 + 42 * 16], m4 | |
16297 | palignr m7, m6, m5, 4 | |
16298 | movu [r0 + 43 * 16], m7 | |
16299 | ||
16300 | ;mode 2[row 6] | |
16301 | palignr m3, m1, m0, 6 | |
16302 | movu [r0 + 12 * 16], m3 | |
16303 | palignr m4, m2, m1, 6 | |
16304 | movu [r0 + 13 * 16], m4 | |
16305 | ||
16306 | ;mode 2[row 22] | |
16307 | movu [r0 + 44 * 16], m4 | |
16308 | palignr m7, m6, m5, 5 | |
16309 | movu [r0 + 45 * 16], m7 | |
16310 | ||
16311 | ;mode 2[row 7] | |
16312 | palignr m3, m1, m0, 7 | |
16313 | movu [r0 + 14 * 16], m3 | |
16314 | palignr m4, m2, m1, 7 | |
16315 | movu [r0 + 15 * 16], m4 | |
16316 | ||
16317 | ;mode 2[row 23] | |
16318 | movu [r0 + 46 * 16], m4 | |
16319 | palignr m7, m6, m5, 6 | |
16320 | movu [r0 + 47 * 16], m7 | |
16321 | ||
16322 | ;mode 2[row 8] | |
16323 | palignr m3, m1, m0, 8 | |
16324 | movu [r0 + 16 * 16], m3 | |
16325 | palignr m4, m2, m1, 8 | |
16326 | movu [r0 + 17 * 16], m4 | |
16327 | ||
16328 | ;mode 7[row 31] | |
16329 | movu [r0 + 382 * 16], m3 | |
16330 | movu [r0 + 383 * 16], m4 | |
16331 | ||
16332 | ;mode 2[row 24] | |
16333 | movu [r0 + 48 * 16], m4 | |
16334 | palignr m7, m6, m5, 7 | |
16335 | movu [r0 + 49 * 16], m7 | |
16336 | ||
16337 | ;mode 2[row 9] | |
16338 | palignr m3, m1, m0, 9 | |
16339 | movu [r0 + 18 * 16], m3 | |
16340 | palignr m4, m2, m1, 9 | |
16341 | movu [r0 + 19 * 16], m4 | |
16342 | ||
16343 | ;mode 2[row 25] | |
16344 | movu [r0 + 50 * 16], m4 | |
16345 | palignr m7, m6, m5, 8 | |
16346 | movu [r0 + 51 * 16], m7 | |
16347 | ||
16348 | ; mode 3 [row 31] | |
16349 | movu [r0 + 126 * 16], m4 | |
16350 | movu [r0 + 127 * 16], m7 | |
16351 | ||
16352 | ;mode 2[row 10] | |
16353 | palignr m3, m1, m0, 10 | |
16354 | movu [r0 + 20 * 16], m3 | |
16355 | palignr m4, m2, m1, 10 | |
16356 | movu [r0 + 21 * 16], m4 | |
16357 | ||
16358 | ;mode 2[row 26] | |
16359 | movu [r0 + 52 * 16], m4 | |
16360 | palignr m7, m6, m5, 9 | |
16361 | movu [r0 + 53 * 16], m7 | |
16362 | ||
16363 | ;mode 2[row 11] | |
16364 | palignr m3, m1, m0, 11 | |
16365 | movu [r0 + 22 * 16], m3 | |
16366 | palignr m4, m2, m1, 11 | |
16367 | movu [r0 + 23 * 16], m4 | |
16368 | ||
16369 | ;mode 2[row 27] | |
16370 | movu [r0 + 54 * 16], m4 | |
16371 | palignr m7, m6, m5, 10 | |
16372 | movu [r0 + 55 * 16], m7 | |
16373 | ||
16374 | ;mode 2[row 12] | |
16375 | palignr m3, m1, m0, 12 | |
16376 | movu [r0 + 24 * 16], m3 | |
16377 | palignr m4, m2, m1, 12 | |
16378 | movu [r0 + 25 * 16], m4 | |
16379 | ||
16380 | ; mode 6 [row 31] | |
16381 | movu [r0 + 318 * 16], m3 | |
16382 | movu [r0 + 319 * 16], m4 | |
16383 | ||
16384 | ; mode 3 [row 15] | |
16385 | movu [r0 + 94 * 16], m3 | |
16386 | movu [r0 + 95 * 16], m4 | |
16387 | ||
16388 | ;mode 2[row 28] | |
16389 | movu [r0 + 56 * 16], m4 | |
16390 | palignr m7, m6, m5, 11 | |
16391 | movu [r0 + 57 * 16], m7 | |
16392 | ||
16393 | ;mode 2[row 13] | |
16394 | palignr m3, m1, m0, 13 | |
16395 | movu [r0 + 26 * 16], m3 | |
16396 | palignr m4, m2, m1, 13 | |
16397 | movu [r0 + 27 * 16], m4 | |
16398 | ||
16399 | ;mode 2[row 29] | |
16400 | movu [r0 + 58 * 16], m4 | |
16401 | palignr m7, m6, m5, 12 | |
16402 | movu [r0 + 59 * 16], m7 | |
16403 | ||
16404 | ;mode 2[row 14] | |
16405 | palignr m3, m1, m0, 14 | |
16406 | movu [r0 + 28 * 16], m3 | |
16407 | palignr m4, m2, m1, 14 | |
16408 | movu [r0 + 29 * 16], m4 | |
16409 | ||
16410 | ;mode 2[row 30] | |
16411 | movu [r0 + 60 * 16], m4 | |
16412 | palignr m7, m6, m5, 13 | |
16413 | movu [r0 + 61 * 16], m7 | |
16414 | ||
16415 | ;mode 2[row 15] | |
16416 | palignr m3, m1, m0, 15 | |
16417 | movu [r0 + 30 * 16], m3 | |
16418 | palignr m4, m2, m1, 15 | |
16419 | movu [r0 + 31 * 16], m4 | |
16420 | ||
16421 | ;mode 2[row 31] | |
16422 | movu [r0 + 62 * 16], m4 | |
16423 | palignr m7, m6, m5, 14 | |
16424 | movu [r0 + 63 * 16], m7 | |
16425 | ||
16426 | ;mode 2[row 16] | |
16427 | movu [r0 + 32 * 16], m1 | |
16428 | movu [r0 + 33 * 16], m2 | |
16429 | ||
16430 | ; mode 5[row 31] | |
16431 | movu [r0 + 254 * 16], m1 | |
16432 | movu [r0 + 255 * 16], m2 | |
16433 | ||
16434 | ; mode 3 [row 0] | |
16435 | lea r5, [ang_table] | |
16436 | movu m6, [r5 + 26 * 16] | |
16437 | movu m7, [pw_1024 ] | |
16438 | movu m1, [r4 + 1 ] | |
16439 | punpcklbw m1, m0 | |
16440 | pmaddubsw m0, m1, m6 | |
16441 | pmulhrsw m0, m7 | |
16442 | movu m2, [r4 + 9] | |
16443 | movd m3, [r4 + 10] | |
16444 | palignr m3, m2, 1 | |
16445 | punpcklbw m2, m3 | |
16446 | pmaddubsw m3, m2, m6 | |
16447 | pmulhrsw m3, m7 | |
16448 | packuswb m0, m3 | |
16449 | movu [r0 + 64 * 16], m0 | |
16450 | ||
16451 | ; mode 6 [row 1 - first half] | |
16452 | movu [r0 + 258 * 16], m0 | |
16453 | ||
16454 | ; mode 9 [row 12 - first half] | |
16455 | movu [r0 + 472 * 16], m0 | |
16456 | ||
16457 | movu m0, [r4 + 17] | |
16458 | movd m3, [r4 + 18] | |
16459 | palignr m3, m0, 1 | |
16460 | punpcklbw m0, m3 | |
16461 | pmaddubsw m3, m0, m6 | |
16462 | pmulhrsw m3, m7 | |
16463 | movu m4, [r4 + 25] | |
16464 | movd m5, [r4 + 26] | |
16465 | palignr m5, m4, 1 | |
16466 | punpcklbw m4, m5 | |
16467 | pmaddubsw m5, m4, m6 | |
16468 | pmulhrsw m5, m7 | |
16469 | packuswb m3, m5 | |
16470 | movu [r0 + 65 * 16], m3 | |
16471 | ||
16472 | ; mode 6 [row 1 - second half] | |
16473 | movu [r0 + 259 * 16], m3 | |
16474 | ||
16475 | ; mode 9 [row 12 - second half] | |
16476 | movu [r0 + 473 * 16], m3 | |
16477 | ||
16478 | ; mode 4 [row 0] | |
16479 | movu m6, [r5 + 21 * 16] | |
16480 | pmaddubsw m3, m1, m6 | |
16481 | pmulhrsw m3, m7 | |
16482 | pmaddubsw m5, m2, m6 | |
16483 | pmulhrsw m5, m7 | |
16484 | packuswb m3, m5 | |
16485 | movu [r0 + 128 * 16], m3 | |
16486 | pmaddubsw m3, m0, m6 | |
16487 | pmulhrsw m3, m7 | |
16488 | pmaddubsw m5, m4, m6 | |
16489 | pmulhrsw m5, m7 | |
16490 | packuswb m3, m5 | |
16491 | movu [r0 + 129 * 16], m3 | |
16492 | ||
16493 | ; mode 5 [row 0] | |
16494 | movu m6, [r5 + 17 * 16] | |
16495 | pmaddubsw m3, m1, m6 | |
16496 | pmulhrsw m3, m7 | |
16497 | pmaddubsw m5, m2, m6 | |
16498 | pmulhrsw m5, m7 | |
16499 | packuswb m3, m5 | |
16500 | movu [r0 + 192 * 16], m3 | |
16501 | pmaddubsw m3, m0, m6 | |
16502 | pmulhrsw m3, m7 | |
16503 | pmaddubsw m5, m4, m6 | |
16504 | pmulhrsw m5, m7 | |
16505 | packuswb m3, m5 | |
16506 | movu [r0 + 193 * 16], m3 | |
16507 | ||
16508 | ; mode 6 [row 0] | |
16509 | movu m6, [r5 + 13 * 16] | |
16510 | pmaddubsw m3, m1, m6 | |
16511 | pmulhrsw m3, m7 | |
16512 | pmaddubsw m5, m2, m6 | |
16513 | pmulhrsw m5, m7 | |
16514 | packuswb m3, m5 | |
16515 | movu [r0 + 256 * 16], m3 | |
16516 | pmaddubsw m3, m0, m6 | |
16517 | pmulhrsw m3, m7 | |
16518 | pmaddubsw m5, m4, m6 | |
16519 | pmulhrsw m5, m7 | |
16520 | packuswb m3, m5 | |
16521 | movu [r0 + 257 * 16], m3 | |
16522 | ||
16523 | ; mode 7 [row 0] | |
16524 | movu m6, [r5 + 9 * 16] | |
16525 | pmaddubsw m3, m1, m6 | |
16526 | pmulhrsw m3, m7 | |
16527 | pmaddubsw m5, m2, m6 | |
16528 | pmulhrsw m5, m7 | |
16529 | packuswb m3, m5 | |
16530 | movu [r0 + 320 * 16], m3 | |
16531 | pmaddubsw m3, m0, m6 | |
16532 | pmulhrsw m3, m7 | |
16533 | pmaddubsw m5, m4, m6 | |
16534 | pmulhrsw m5, m7 | |
16535 | packuswb m3, m5 | |
16536 | movu [r0 + 321 * 16], m3 | |
16537 | ||
16538 | ; mode 7 [row 1] | |
16539 | movu m6, [r5 + 18 * 16] | |
16540 | pmaddubsw m3, m1, m6 | |
16541 | pmulhrsw m3, m7 | |
16542 | pmaddubsw m5, m2, m6 | |
16543 | pmulhrsw m5, m7 | |
16544 | packuswb m3, m5 | |
16545 | movu [r0 + 322 * 16], m3 | |
16546 | ||
16547 | ; mode 9 [row 8 - first half] | |
16548 | movu [r0 + 464 * 16], m3 | |
16549 | ||
16550 | pmaddubsw m3, m0, m6 | |
16551 | pmulhrsw m3, m7 | |
16552 | pmaddubsw m5, m4, m6 | |
16553 | pmulhrsw m5, m7 | |
16554 | packuswb m3, m5 | |
16555 | movu [r0 + 323 * 16], m3 | |
16556 | ||
16557 | ; mode 9 [row 8 - second half] | |
16558 | movu [r0 + 465 * 16], m3 | |
16559 | ||
16560 | ; mode 7 [row 2] | |
16561 | movu m6, [r5 + 27 * 16] | |
16562 | pmaddubsw m3, m1, m6 | |
16563 | pmulhrsw m3, m7 | |
16564 | pmaddubsw m5, m2, m6 | |
16565 | pmulhrsw m5, m7 | |
16566 | packuswb m3, m5 | |
16567 | movu [r0 + 324 * 16], m3 | |
16568 | pmaddubsw m3, m0, m6 | |
16569 | pmulhrsw m3, m7 | |
16570 | pmaddubsw m5, m4, m6 | |
16571 | pmulhrsw m5, m7 | |
16572 | packuswb m3, m5 | |
16573 | movu [r0 + 325 * 16], m3 | |
16574 | ||
16575 | ; mode 8 [row 0] | |
16576 | movu m6, [r5 + 5 * 16] | |
16577 | pmaddubsw m3, m1, m6 | |
16578 | pmulhrsw m3, m7 | |
16579 | pmaddubsw m5, m2, m6 | |
16580 | pmulhrsw m5, m7 | |
16581 | packuswb m3, m5 | |
16582 | movu [r0 + 384 * 16], m3 | |
16583 | pmaddubsw m3, m0, m6 | |
16584 | pmulhrsw m3, m7 | |
16585 | pmaddubsw m5, m4, m6 | |
16586 | pmulhrsw m5, m7 | |
16587 | packuswb m3, m5 | |
16588 | movu [r0 + 385 * 16], m3 | |
16589 | ||
16590 | ; mode 8 [row 1] | |
16591 | movu m6, [r5 + 10 * 16] | |
16592 | pmaddubsw m3, m1, m6 | |
16593 | pmulhrsw m3, m7 | |
16594 | pmaddubsw m5, m2, m6 | |
16595 | pmulhrsw m5, m7 | |
16596 | packuswb m3, m5 | |
16597 | movu [r0 + 386 * 16], m3 | |
16598 | ||
16599 | ; mode 9 [row 4 - first half] | |
16600 | movu [r0 + 456 * 16], m3 | |
16601 | ||
16602 | pmaddubsw m3, m0, m6 | |
16603 | pmulhrsw m3, m7 | |
16604 | pmaddubsw m5, m4, m6 | |
16605 | pmulhrsw m5, m7 | |
16606 | packuswb m3, m5 | |
16607 | movu [r0 + 387 * 16], m3 | |
16608 | ||
16609 | ; mode 9 [row 4 - second half] | |
16610 | movu [r0 + 457 * 16], m3 | |
16611 | ||
16612 | ; mode 8 [row 2] | |
16613 | movu m6, [r5 + 15 * 16] | |
16614 | pmaddubsw m3, m1, m6 | |
16615 | pmulhrsw m3, m7 | |
16616 | pmaddubsw m5, m2, m6 | |
16617 | pmulhrsw m5, m7 | |
16618 | packuswb m3, m5 | |
16619 | movu [r0 + 388 * 16], m3 | |
16620 | pmaddubsw m3, m0, m6 | |
16621 | pmulhrsw m3, m7 | |
16622 | pmaddubsw m5, m4, m6 | |
16623 | pmulhrsw m5, m7 | |
16624 | packuswb m3, m5 | |
16625 | movu [r0 + 389 * 16], m3 | |
16626 | ||
16627 | ; mode 8 [row 3] | |
16628 | movu m6, [r5 + 20 * 16] | |
16629 | pmaddubsw m3, m1, m6 | |
16630 | pmulhrsw m3, m7 | |
16631 | pmaddubsw m5, m2, m6 | |
16632 | pmulhrsw m5, m7 | |
16633 | packuswb m3, m5 | |
16634 | movu [r0 + 390 * 16], m3 | |
16635 | ||
16636 | ; mode 9 [row 9 - first half] | |
16637 | movu [r0 + 466 * 16], m3 | |
16638 | ||
16639 | pmaddubsw m3, m0, m6 | |
16640 | pmulhrsw m3, m7 | |
16641 | pmaddubsw m5, m4, m6 | |
16642 | pmulhrsw m5, m7 | |
16643 | packuswb m3, m5 | |
16644 | movu [r0 + 391 * 16], m3 | |
16645 | ||
16646 | ; mode 9 [row 9 - second half] | |
16647 | movu [r0 + 467 * 16], m3 | |
16648 | ||
16649 | ; mode 8 [row 4] | |
16650 | movu m6, [r5 + 25 * 16] | |
16651 | pmaddubsw m3, m1, m6 | |
16652 | pmulhrsw m3, m7 | |
16653 | pmaddubsw m5, m2, m6 | |
16654 | pmulhrsw m5, m7 | |
16655 | packuswb m3, m5 | |
16656 | movu [r0 + 392 * 16], m3 | |
16657 | pmaddubsw m3, m0, m6 | |
16658 | pmulhrsw m3, m7 | |
16659 | pmaddubsw m5, m4, m6 | |
16660 | pmulhrsw m5, m7 | |
16661 | packuswb m3, m5 | |
16662 | movu [r0 + 393 * 16], m3 | |
16663 | ||
16664 | ; mode 8 [row 5] | |
16665 | movu m6, [r5 + 30 * 16] | |
16666 | pmaddubsw m3, m1, m6 | |
16667 | pmulhrsw m3, m7 | |
16668 | pmaddubsw m5, m2, m6 | |
16669 | pmulhrsw m5, m7 | |
16670 | packuswb m3, m5 | |
16671 | movu [r0 + 394 * 16], m3 | |
16672 | ||
16673 | ; mode 9 [row 14 - first half] | |
16674 | movu [r0 + 476 * 16], m3 | |
16675 | ||
16676 | pmaddubsw m3, m0, m6 | |
16677 | pmulhrsw m3, m7 | |
16678 | pmaddubsw m5, m4, m6 | |
16679 | pmulhrsw m5, m7 | |
16680 | packuswb m3, m5 | |
16681 | movu [r0 + 395 * 16], m3 | |
16682 | ||
16683 | ; mode 9 [row 14 - second half] | |
16684 | movu [r0 + 477 * 16], m3 | |
16685 | ||
16686 | ; mode 9 [row 0] | |
16687 | movu m6, [r5 + 2 * 16] | |
16688 | pmaddubsw m3, m1, m6 | |
16689 | pmulhrsw m3, m7 | |
16690 | pmaddubsw m5, m2, m6 | |
16691 | pmulhrsw m5, m7 | |
16692 | packuswb m3, m5 | |
16693 | movu [r0 + 448 * 16], m3 | |
16694 | pmaddubsw m3, m0, m6 | |
16695 | pmulhrsw m3, m7 | |
16696 | pmaddubsw m5, m4, m6 | |
16697 | pmulhrsw m5, m7 | |
16698 | packuswb m3, m5 | |
16699 | movu [r0 + 449 * 16], m3 | |
16700 | ||
16701 | ; mode 9 [row 1] | |
16702 | movu m6, [r5 + 4 * 16] | |
16703 | pmaddubsw m3, m1, m6 | |
16704 | pmulhrsw m3, m7 | |
16705 | pmaddubsw m5, m2, m6 | |
16706 | pmulhrsw m5, m7 | |
16707 | packuswb m3, m5 | |
16708 | movu [r0 + 450 * 16], m3 | |
16709 | pmaddubsw m3, m0, m6 | |
16710 | pmulhrsw m3, m7 | |
16711 | pmaddubsw m5, m4, m6 | |
16712 | pmulhrsw m5, m7 | |
16713 | packuswb m3, m5 | |
16714 | movu [r0 + 451 * 16], m3 | |
16715 | ||
16716 | ; mode 9 [row 2] | |
16717 | movu m6, [r5 + 6 * 16] | |
16718 | pmaddubsw m3, m1, m6 | |
16719 | pmulhrsw m3, m7 | |
16720 | pmaddubsw m5, m2, m6 | |
16721 | pmulhrsw m5, m7 | |
16722 | packuswb m3, m5 | |
16723 | movu [r0 + 452 * 16], m3 | |
16724 | pmaddubsw m3, m0, m6 | |
16725 | pmulhrsw m3, m7 | |
16726 | pmaddubsw m5, m4, m6 | |
16727 | pmulhrsw m5, m7 | |
16728 | packuswb m3, m5 | |
16729 | movu [r0 + 453 * 16], m3 | |
16730 | ||
16731 | ; mode 9 [row 3] | |
16732 | movu m6, [r5 + 8 * 16] | |
16733 | pmaddubsw m3, m1, m6 | |
16734 | pmulhrsw m3, m7 | |
16735 | pmaddubsw m5, m2, m6 | |
16736 | pmulhrsw m5, m7 | |
16737 | packuswb m3, m5 | |
16738 | movu [r0 + 454 * 16], m3 | |
16739 | pmaddubsw m3, m0, m6 | |
16740 | pmulhrsw m3, m7 | |
16741 | pmaddubsw m5, m4, m6 | |
16742 | pmulhrsw m5, m7 | |
16743 | packuswb m3, m5 | |
16744 | movu [r0 + 455 * 16], m3 | |
16745 | ||
16746 | ; mode 9 [row 5] | |
16747 | movu m6, [r5 + 12 * 16] | |
16748 | pmaddubsw m3, m1, m6 | |
16749 | pmulhrsw m3, m7 | |
16750 | pmaddubsw m5, m2, m6 | |
16751 | pmulhrsw m5, m7 | |
16752 | packuswb m3, m5 | |
16753 | movu [r0 + 458 * 16], m3 | |
16754 | pmaddubsw m3, m0, m6 | |
16755 | pmulhrsw m3, m7 | |
16756 | pmaddubsw m5, m4, m6 | |
16757 | pmulhrsw m5, m7 | |
16758 | packuswb m3, m5 | |
16759 | movu [r0 + 459 * 16], m3 | |
16760 | ||
16761 | ; mode 9 [row 6] | |
16762 | movu m6, [r5 + 14 * 16] | |
16763 | pmaddubsw m3, m1, m6 | |
16764 | pmulhrsw m3, m7 | |
16765 | pmaddubsw m5, m2, m6 | |
16766 | pmulhrsw m5, m7 | |
16767 | packuswb m3, m5 | |
16768 | movu [r0 + 460 * 16], m3 | |
16769 | pmaddubsw m3, m0, m6 | |
16770 | pmulhrsw m3, m7 | |
16771 | pmaddubsw m5, m4, m6 | |
16772 | pmulhrsw m5, m7 | |
16773 | packuswb m3, m5 | |
16774 | movu [r0 + 461 * 16], m3 | |
16775 | ||
16776 | ; mode 9 [row 7] | |
16777 | movu m6, [r5 + 16 * 16] | |
16778 | pmaddubsw m3, m1, m6 | |
16779 | pmulhrsw m3, m7 | |
16780 | pmaddubsw m5, m2, m6 | |
16781 | pmulhrsw m5, m7 | |
16782 | packuswb m3, m5 | |
16783 | movu [r0 + 462 * 16], m3 | |
16784 | pmaddubsw m3, m0, m6 | |
16785 | pmulhrsw m3, m7 | |
16786 | pmaddubsw m5, m4, m6 | |
16787 | pmulhrsw m5, m7 | |
16788 | packuswb m3, m5 | |
16789 | movu [r0 + 463 * 16], m3 | |
16790 | ||
16791 | ; mode 9 [row 10] | |
16792 | movu m6, [r5 + 22 * 16] | |
16793 | pmaddubsw m3, m1, m6 | |
16794 | pmulhrsw m3, m7 | |
16795 | pmaddubsw m5, m2, m6 | |
16796 | pmulhrsw m5, m7 | |
16797 | packuswb m3, m5 | |
16798 | movu [r0 + 468 * 16], m3 | |
16799 | pmaddubsw m3, m0, m6 | |
16800 | pmulhrsw m3, m7 | |
16801 | pmaddubsw m5, m4, m6 | |
16802 | pmulhrsw m5, m7 | |
16803 | packuswb m3, m5 | |
16804 | movu [r0 + 469 * 16], m3 | |
16805 | ||
16806 | ; mode 9 [row 11] | |
16807 | movu m6, [r5 + 24 * 16] | |
16808 | pmaddubsw m3, m1, m6 | |
16809 | pmulhrsw m3, m7 | |
16810 | pmaddubsw m5, m2, m6 | |
16811 | pmulhrsw m5, m7 | |
16812 | packuswb m3, m5 | |
16813 | movu [r0 + 470 * 16], m3 | |
16814 | pmaddubsw m3, m0, m6 | |
16815 | pmulhrsw m3, m7 | |
16816 | pmaddubsw m5, m4, m6 | |
16817 | pmulhrsw m5, m7 | |
16818 | packuswb m3, m5 | |
16819 | movu [r0 + 471 * 16], m3 | |
16820 | ||
16821 | ; mode 9 [row 13] | |
16822 | movu m6, [r5 + 28 * 16] | |
16823 | pmaddubsw m3, m1, m6 | |
16824 | pmulhrsw m3, m7 | |
16825 | pmaddubsw m5, m2, m6 | |
16826 | pmulhrsw m5, m7 | |
16827 | packuswb m3, m5 | |
16828 | movu [r0 + 474 * 16], m3 | |
16829 | pmaddubsw m3, m0, m6 | |
16830 | pmulhrsw m3, m7 | |
16831 | pmaddubsw m5, m4, m6 | |
16832 | pmulhrsw m5, m7 | |
16833 | packuswb m3, m5 | |
16834 | movu [r0 + 475 * 16], m3 | |
16835 | ||
16836 | ; mode 3 [row 1] | |
16837 | movu m6, [r5 + 20 * 16] | |
16838 | movu m0, [r4 + 2] | |
16839 | movd m1, [r4 + 3] | |
16840 | palignr m1, m0, 1 | |
16841 | punpcklbw m0, m1 | |
16842 | pmaddubsw m1, m0, m6 | |
16843 | pmulhrsw m1, m7 | |
16844 | movu m2, [r4 + 10] | |
16845 | movd m3, [r4 + 11] | |
16846 | palignr m3, m2, 1 | |
16847 | punpcklbw m2, m3 | |
16848 | pmaddubsw m3, m2, m6 | |
16849 | pmulhrsw m3, m7 | |
16850 | packuswb m1, m3 | |
16851 | movu [r0 + 66 * 16], m1 | |
16852 | ||
16853 | ; mode 6 [row 3 - first half] | |
16854 | movu [r0 + 262 * 16], m1 | |
16855 | ||
16856 | ; mode 9 [row 25 - first half] | |
16857 | movu [r0 + 498 * 16], m1 | |
16858 | ||
16859 | movu m1, [r4 + 18] | |
16860 | movd m3, [r4 + 19] | |
16861 | palignr m3, m1, 1 | |
16862 | punpcklbw m1, m3 | |
16863 | pmaddubsw m3, m1, m6 | |
16864 | pmulhrsw m3, m7 | |
16865 | movu m4, [r4 + 26] | |
16866 | movd m5, [r4 + 27] | |
16867 | palignr m5, m4, 1 | |
16868 | punpcklbw m4, m5 | |
16869 | pmaddubsw m5, m4, m6 | |
16870 | pmulhrsw m5, m7 | |
16871 | packuswb m3, m5 | |
16872 | movu [r0 + 67 * 16], m3 | |
16873 | ||
16874 | ; mode 6 [row 3 - second half] | |
16875 | movu [r0 + 263 * 16], m3 | |
16876 | ||
16877 | ; mode 9 [row 25 - second half] | |
16878 | movu [r0 + 499 * 16], m3 | |
16879 | ||
16880 | ; mode 4 [row 1] | |
16881 | movu m6, [r5 + 10 * 16] | |
16882 | pmaddubsw m3, m0, m6 | |
16883 | pmulhrsw m3, m7 | |
16884 | pmaddubsw m5, m2, m6 | |
16885 | pmulhrsw m5, m7 | |
16886 | packuswb m3, m5 | |
16887 | movu [r0 + 130 * 16], m3 | |
16888 | ||
16889 | ; mode 9 [row 20 - first half] | |
16890 | movu [r0 + 488 * 16], m3 | |
16891 | ||
16892 | pmaddubsw m3, m1, m6 | |
16893 | pmulhrsw m3, m7 | |
16894 | pmaddubsw m5, m4, m6 | |
16895 | pmulhrsw m5, m7 | |
16896 | packuswb m3, m5 | |
16897 | movu [r0 + 131 * 16], m3 | |
16898 | ||
16899 | ; mode 9 [row 20 - second half] | |
16900 | movu [r0 + 489 * 16], m3 | |
16901 | ||
16902 | ; mode 4 [row 2] | |
16903 | movu m6, [r5 + 31 * 16] | |
16904 | pmaddubsw m3, m0, m6 | |
16905 | pmulhrsw m3, m7 | |
16906 | pmaddubsw m5, m2, m6 | |
16907 | pmulhrsw m5, m7 | |
16908 | packuswb m3, m5 | |
16909 | movu [r0 + 132 * 16], m3 | |
16910 | ||
16911 | ; mode 7 [row 6 - first half] | |
16912 | movu [r0 + 332 * 16], m3 | |
16913 | ||
16914 | pmaddubsw m3, m1, m6 | |
16915 | pmulhrsw m3, m7 | |
16916 | pmaddubsw m5, m4, m6 | |
16917 | pmulhrsw m5, m7 | |
16918 | packuswb m3, m5 | |
16919 | movu [r0 + 133 * 16], m3 | |
16920 | ||
16921 | ; mode 7 [row 6 - second half] | |
16922 | movu [r0 + 333 * 16], m3 | |
16923 | ||
16924 | ; mode 5 [row 1] | |
16925 | movu m6, [r5 + 2 * 16] | |
16926 | pmaddubsw m3, m0, m6 | |
16927 | pmulhrsw m3, m7 | |
16928 | pmaddubsw m5, m2, m6 | |
16929 | pmulhrsw m5, m7 | |
16930 | packuswb m3, m5 | |
16931 | movu [r0 + 194 * 16], m3 | |
16932 | ||
16933 | ; mode 9 [row 16 - first half] | |
16934 | movu [r0 + 480 * 16], m3 | |
16935 | ||
16936 | pmaddubsw m3, m1, m6 | |
16937 | pmulhrsw m3, m7 | |
16938 | pmaddubsw m5, m4, m6 | |
16939 | pmulhrsw m5, m7 | |
16940 | packuswb m3, m5 | |
16941 | movu [r0 + 195 * 16], m3 | |
16942 | ||
16943 | ; mode 9 [row 16 - second half] | |
16944 | movu [r0 + 481 * 16], m3 | |
16945 | ||
16946 | ; mode 5 [row 2] | |
16947 | movu m6, [r5 + 19 * 16] | |
16948 | pmaddubsw m3, m0, m6 | |
16949 | pmulhrsw m3, m7 | |
16950 | pmaddubsw m5, m2, m6 | |
16951 | pmulhrsw m5, m7 | |
16952 | packuswb m3, m5 | |
16953 | movu [r0 + 196 * 16], m3 | |
16954 | pmaddubsw m3, m1, m6 | |
16955 | pmulhrsw m3, m7 | |
16956 | pmaddubsw m5, m4, m6 | |
16957 | pmulhrsw m5, m7 | |
16958 | packuswb m3, m5 | |
16959 | movu [r0 + 197 * 16], m3 | |
16960 | ||
16961 | ; mode 6 [row 2] | |
16962 | movu m6, [r5 + 7 * 16] | |
16963 | pmaddubsw m3, m0, m6 | |
16964 | pmulhrsw m3, m7 | |
16965 | pmaddubsw m5, m2, m6 | |
16966 | pmulhrsw m5, m7 | |
16967 | packuswb m3, m5 | |
16968 | movu [r0 + 260 * 16], m3 | |
16969 | pmaddubsw m3, m1, m6 | |
16970 | pmulhrsw m3, m7 | |
16971 | pmaddubsw m5, m4, m6 | |
16972 | pmulhrsw m5, m7 | |
16973 | packuswb m3, m5 | |
16974 | movu [r0 + 261 * 16], m3 | |
16975 | ||
16976 | ; mode 7 [row 3] | |
16977 | movu m6, [r5 + 4 * 16] | |
16978 | pmaddubsw m3, m0, m6 | |
16979 | pmulhrsw m3, m7 | |
16980 | pmaddubsw m5, m2, m6 | |
16981 | pmulhrsw m5, m7 | |
16982 | packuswb m3, m5 | |
16983 | movu [r0 + 326 * 16], m3 | |
16984 | ||
16985 | ; mode 9 [row 17 - first half] | |
16986 | movu [r0 + 482 * 16], m3 | |
16987 | ||
16988 | pmaddubsw m3, m1, m6 | |
16989 | pmulhrsw m3, m7 | |
16990 | pmaddubsw m5, m4, m6 | |
16991 | pmulhrsw m5, m7 | |
16992 | packuswb m3, m5 | |
16993 | movu [r0 + 327 * 16], m3 | |
16994 | ||
16995 | ; mode 9 [row 17 - second half] | |
16996 | movu [r0 + 483 * 16], m3 | |
16997 | ||
16998 | ; mode 7 [row 4] | |
16999 | movu m6, [r5 + 13 * 16] | |
17000 | pmaddubsw m3, m0, m6 | |
17001 | pmulhrsw m3, m7 | |
17002 | pmaddubsw m5, m2, m6 | |
17003 | pmulhrsw m5, m7 | |
17004 | packuswb m3, m5 | |
17005 | movu [r0 + 328 * 16], m3 | |
17006 | ||
17007 | ; mode 8 [row 8 - first half] | |
17008 | movu [r0 + 400 * 16], m3 | |
17009 | ||
17010 | pmaddubsw m3, m1, m6 | |
17011 | pmulhrsw m3, m7 | |
17012 | pmaddubsw m5, m4, m6 | |
17013 | pmulhrsw m5, m7 | |
17014 | packuswb m3, m5 | |
17015 | movu [r0 + 329 * 16], m3 | |
17016 | ||
17017 | ; mode 8 [row 8 - second half] | |
17018 | movu [r0 + 401 * 16], m3 | |
17019 | ||
17020 | ; mode 7 [row 5] | |
17021 | movu m6, [r5 + 22 * 16] | |
17022 | pmaddubsw m3, m0, m6 | |
17023 | pmulhrsw m3, m7 | |
17024 | pmaddubsw m5, m2, m6 | |
17025 | pmulhrsw m5, m7 | |
17026 | packuswb m3, m5 | |
17027 | movu [r0 + 330 * 16], m3 | |
17028 | ||
17029 | ; mode 9 [row 26 - first half] | |
17030 | movu [r0 + 500 * 16], m3 | |
17031 | ||
17032 | pmaddubsw m3, m1, m6 | |
17033 | pmulhrsw m3, m7 | |
17034 | pmaddubsw m5, m4, m6 | |
17035 | pmulhrsw m5, m7 | |
17036 | packuswb m3, m5 | |
17037 | movu [r0 + 331 * 16], m3 | |
17038 | ||
17039 | ; mode 9 [row 26 - second half] | |
17040 | movu [r0 + 501 * 16], m3 | |
17041 | ||
17042 | ; mode 8 [row 6] | |
17043 | movu m6, [r5 + 3 * 16] | |
17044 | pmaddubsw m3, m0, m6 | |
17045 | pmulhrsw m3, m7 | |
17046 | pmaddubsw m5, m2, m6 | |
17047 | pmulhrsw m5, m7 | |
17048 | packuswb m3, m5 | |
17049 | movu [r0 + 396 * 16], m3 | |
17050 | pmaddubsw m3, m1, m6 | |
17051 | pmulhrsw m3, m7 | |
17052 | pmaddubsw m5, m4, m6 | |
17053 | pmulhrsw m5, m7 | |
17054 | packuswb m3, m5 | |
17055 | movu [r0 + 397 * 16], m3 | |
17056 | ||
17057 | ; mode 9 [row 18] | |
17058 | movu m6, [r5 + 6 * 16] | |
17059 | pmaddubsw m3, m0, m6 | |
17060 | pmulhrsw m3, m7 | |
17061 | pmaddubsw m5, m2, m6 | |
17062 | pmulhrsw m5, m7 | |
17063 | packuswb m3, m5 | |
17064 | movu [r0 + 484 * 16], m3 | |
17065 | pmaddubsw m3, m1, m6 | |
17066 | pmulhrsw m3, m7 | |
17067 | pmaddubsw m5, m4, m6 | |
17068 | pmulhrsw m5, m7 | |
17069 | packuswb m3, m5 | |
17070 | movu [r0 + 485 * 16], m3 | |
17071 | ||
17072 | ; mode 9 [row 21] | |
17073 | movu m6, [r5 + 12 * 16] | |
17074 | pmaddubsw m3, m0, m6 | |
17075 | pmulhrsw m3, m7 | |
17076 | pmaddubsw m5, m2, m6 | |
17077 | pmulhrsw m5, m7 | |
17078 | packuswb m3, m5 | |
17079 | movu [r0 + 490 * 16], m3 | |
17080 | pmaddubsw m3, m1, m6 | |
17081 | pmulhrsw m3, m7 | |
17082 | pmaddubsw m5, m4, m6 | |
17083 | pmulhrsw m5, m7 | |
17084 | packuswb m3, m5 | |
17085 | movu [r0 + 491 * 16], m3 | |
17086 | ||
17087 | ; mode 9 [row 22] | |
17088 | movu m6, [r5 + 14 * 16] | |
17089 | pmaddubsw m3, m0, m6 | |
17090 | pmulhrsw m3, m7 | |
17091 | pmaddubsw m5, m2, m6 | |
17092 | pmulhrsw m5, m7 | |
17093 | packuswb m3, m5 | |
17094 | movu [r0 + 492 * 16], m3 | |
17095 | pmaddubsw m3, m1, m6 | |
17096 | pmulhrsw m3, m7 | |
17097 | pmaddubsw m5, m4, m6 | |
17098 | pmulhrsw m5, m7 | |
17099 | packuswb m3, m5 | |
17100 | movu [r0 + 493 * 16], m3 | |
17101 | ||
17102 | ; mode 9 [row 23] | |
17103 | movu m6, [r5 + 16 * 16] | |
17104 | pmaddubsw m3, m0, m6 | |
17105 | pmulhrsw m3, m7 | |
17106 | pmaddubsw m5, m2, m6 | |
17107 | pmulhrsw m5, m7 | |
17108 | packuswb m3, m5 | |
17109 | movu [r0 + 494 * 16], m3 | |
17110 | pmaddubsw m3, m1, m6 | |
17111 | pmulhrsw m3, m7 | |
17112 | pmaddubsw m5, m4, m6 | |
17113 | pmulhrsw m5, m7 | |
17114 | packuswb m3, m5 | |
17115 | movu [r0 + 495 * 16], m3 | |
17116 | ||
17117 | ; mode 9 [row 27] | |
17118 | movu m6, [r5 + 24 * 16] | |
17119 | pmaddubsw m3, m0, m6 | |
17120 | pmulhrsw m3, m7 | |
17121 | pmaddubsw m5, m2, m6 | |
17122 | pmulhrsw m5, m7 | |
17123 | packuswb m3, m5 | |
17124 | movu [r0 + 502 * 16], m3 | |
17125 | pmaddubsw m3, m1, m6 | |
17126 | pmulhrsw m3, m7 | |
17127 | pmaddubsw m5, m4, m6 | |
17128 | pmulhrsw m5, m7 | |
17129 | packuswb m3, m5 | |
17130 | movu [r0 + 503 * 16], m3 | |
17131 | ||
17132 | ; mode 9 [row 28] | |
17133 | movu m6, [r5 + 26 * 16] | |
17134 | pmaddubsw m3, m0, m6 | |
17135 | pmulhrsw m3, m7 | |
17136 | pmaddubsw m5, m2, m6 | |
17137 | pmulhrsw m5, m7 | |
17138 | packuswb m3, m5 | |
17139 | movu [r0 + 504 * 16], m3 | |
17140 | pmaddubsw m3, m1, m6 | |
17141 | pmulhrsw m3, m7 | |
17142 | pmaddubsw m5, m4, m6 | |
17143 | pmulhrsw m5, m7 | |
17144 | packuswb m3, m5 | |
17145 | movu [r0 + 505 * 16], m3 | |
17146 | ||
17147 | ; mode 9 [row 30] | |
17148 | movu m6, [r5 + 30 * 16] | |
17149 | pmaddubsw m3, m0, m6 | |
17150 | pmulhrsw m3, m7 | |
17151 | pmaddubsw m5, m2, m6 | |
17152 | pmulhrsw m5, m7 | |
17153 | packuswb m3, m5 | |
17154 | movu [r0 + 508 * 16], m3 | |
17155 | pmaddubsw m3, m1, m6 | |
17156 | pmulhrsw m3, m7 | |
17157 | pmaddubsw m5, m4, m6 | |
17158 | pmulhrsw m5, m7 | |
17159 | packuswb m3, m5 | |
17160 | movu [r0 + 509 * 16], m3 | |
17161 | ||
17162 | ; mode 8 [row 7] | |
17163 | movu m6, [r5 + 8 * 16] | |
17164 | pmaddubsw m3, m0, m6 | |
17165 | pmulhrsw m3, m7 | |
17166 | pmaddubsw m5, m2, m6 | |
17167 | pmulhrsw m5, m7 | |
17168 | packuswb m3, m5 | |
17169 | movu [r0 + 398 * 16], m3 | |
17170 | ||
17171 | ; mode 9 [row 19 - first half] | |
17172 | movu [r0 + 486 * 16], m3 | |
17173 | ||
17174 | pmaddubsw m3, m1, m6 | |
17175 | pmulhrsw m3, m7 | |
17176 | pmaddubsw m5, m4, m6 | |
17177 | pmulhrsw m5, m7 | |
17178 | packuswb m3, m5 | |
17179 | movu [r0 + 399 * 16], m3 | |
17180 | ||
17181 | ; mode 9 [row 19 - second half] | |
17182 | movu [r0 + 487 * 16], m3 | |
17183 | ||
17184 | ; mode 8 [row 9] | |
17185 | movu m6, [r5 + 18 * 16] | |
17186 | pmaddubsw m3, m0, m6 | |
17187 | pmulhrsw m3, m7 | |
17188 | pmaddubsw m5, m2, m6 | |
17189 | pmulhrsw m5, m7 | |
17190 | packuswb m3, m5 | |
17191 | movu [r0 + 402 * 16], m3 | |
17192 | ||
17193 | ; mode 9 [row 24 - first half] | |
17194 | movu [r0 + 496 * 16], m3 | |
17195 | ||
17196 | pmaddubsw m3, m1, m6 | |
17197 | pmulhrsw m3, m7 | |
17198 | pmaddubsw m5, m4, m6 | |
17199 | pmulhrsw m5, m7 | |
17200 | packuswb m3, m5 | |
17201 | movu [r0 + 403 * 16], m3 | |
17202 | ||
17203 | ; mode 9 [row 24 - second half] | |
17204 | movu [r0 + 497 * 16], m3 | |
17205 | ||
17206 | ; mode 8 [row 10] | |
17207 | movu m6, [r5 + 23 * 16] | |
17208 | pmaddubsw m3, m0, m6 | |
17209 | pmulhrsw m3, m7 | |
17210 | pmaddubsw m5, m2, m6 | |
17211 | pmulhrsw m5, m7 | |
17212 | packuswb m3, m5 | |
17213 | movu [r0 + 404 * 16], m3 | |
17214 | pmaddubsw m3, m1, m6 | |
17215 | pmulhrsw m3, m7 | |
17216 | pmaddubsw m5, m4, m6 | |
17217 | pmulhrsw m5, m7 | |
17218 | packuswb m3, m5 | |
17219 | movu [r0 + 405 * 16], m3 | |
17220 | ||
17221 | ; mode 8 [row 11] | |
17222 | movu m6, [r5 + 28 * 16] | |
17223 | pmaddubsw m3, m0, m6 | |
17224 | pmulhrsw m3, m7 | |
17225 | pmaddubsw m5, m2, m6 | |
17226 | pmulhrsw m5, m7 | |
17227 | packuswb m3, m5 | |
17228 | movu [r0 + 406 * 16], m3 | |
17229 | ||
17230 | ; mode 9 [row 29 - first half] | |
17231 | movu [r0 + 506 * 16], m3 | |
17232 | ||
17233 | pmaddubsw m3, m1, m6 | |
17234 | pmulhrsw m3, m7 | |
17235 | pmaddubsw m5, m4, m6 | |
17236 | pmulhrsw m5, m7 | |
17237 | packuswb m3, m5 | |
17238 | movu [r0 + 407 * 16], m3 | |
17239 | ||
17240 | ; mode 9 [row 29 - second half] | |
17241 | movu [r0 + 507 * 16], m3 | |
17242 | ||
17243 | ; mode 3 [row 2] | |
17244 | movu m6, [r5 + 14 * 16] | |
17245 | movu m0, [r4 + 3] | |
17246 | movd m1, [r4 + 4] | |
17247 | palignr m1, m0, 1 | |
17248 | punpcklbw m0, m1 | |
17249 | pmaddubsw m1, m0, m6 | |
17250 | pmulhrsw m1, m7 | |
17251 | movu m2, [r4 + 11] | |
17252 | movd m3, [r4 + 12] | |
17253 | palignr m3, m2, 1 | |
17254 | punpcklbw m2, m3 | |
17255 | pmaddubsw m3, m2, m6 | |
17256 | pmulhrsw m3, m7 | |
17257 | packuswb m1, m3 | |
17258 | movu [r0 + 68 * 16], m1 | |
17259 | ||
17260 | ; mode 6 [row 5 - first half] | |
17261 | movu [r0 + 266 * 16], m1 | |
17262 | ||
17263 | movu m1, [r4 + 19] | |
17264 | movd m3, [r4 + 20] | |
17265 | palignr m3, m1, 1 | |
17266 | punpcklbw m1, m3 | |
17267 | pmaddubsw m3, m1, m6 | |
17268 | pmulhrsw m3, m7 | |
17269 | movu m4, [r4 + 27] | |
17270 | movd m5, [r4 + 28] | |
17271 | palignr m5, m4, 1 | |
17272 | punpcklbw m4, m5 | |
17273 | pmaddubsw m5, m4, m6 | |
17274 | pmulhrsw m5, m7 | |
17275 | packuswb m3, m5 | |
17276 | movu [r0 + 69 * 16], m3 | |
17277 | ||
17278 | ; mode 6 [row 5 - second half] | |
17279 | movu [r0 + 267 * 16], m3 | |
17280 | ||
17281 | ; mode 4 [row 3] | |
17282 | movu m6, [r5 + 20 * 16] | |
17283 | pmaddubsw m3, m0, m6 | |
17284 | pmulhrsw m3, m7 | |
17285 | pmaddubsw m5, m2, m6 | |
17286 | pmulhrsw m5, m7 | |
17287 | packuswb m3, m5 | |
17288 | movu [r0 + 134 * 16], m3 | |
17289 | pmaddubsw m3, m1, m6 | |
17290 | pmulhrsw m3, m7 | |
17291 | pmaddubsw m5, m4, m6 | |
17292 | pmulhrsw m5, m7 | |
17293 | packuswb m3, m5 | |
17294 | movu [r0 + 135 * 16], m3 | |
17295 | ||
17296 | ; mode 5 [row 3] | |
17297 | movu m6, [r5 + 4 * 16] | |
17298 | pmaddubsw m3, m0, m6 | |
17299 | pmulhrsw m3, m7 | |
17300 | pmaddubsw m5, m2, m6 | |
17301 | pmulhrsw m5, m7 | |
17302 | packuswb m3, m5 | |
17303 | movu [r0 + 198 * 16], m3 | |
17304 | pmaddubsw m3, m1, m6 | |
17305 | pmulhrsw m3, m7 | |
17306 | pmaddubsw m5, m4, m6 | |
17307 | pmulhrsw m5, m7 | |
17308 | packuswb m3, m5 | |
17309 | movu [r0 + 199 * 16], m3 | |
17310 | ||
17311 | ; mode 5 [row 4] | |
17312 | movu m6, [r5 + 21 * 16] | |
17313 | pmaddubsw m3, m0, m6 | |
17314 | pmulhrsw m3, m7 | |
17315 | pmaddubsw m5, m2, m6 | |
17316 | pmulhrsw m5, m7 | |
17317 | packuswb m3, m5 | |
17318 | movu [r0 + 200 * 16], m3 | |
17319 | ||
17320 | ; mode 8 [row 16 - first half] | |
17321 | movu [r0 + 416 * 16], m3 | |
17322 | ||
17323 | pmaddubsw m3, m1, m6 | |
17324 | pmulhrsw m3, m7 | |
17325 | pmaddubsw m5, m4, m6 | |
17326 | pmulhrsw m5, m7 | |
17327 | packuswb m3, m5 | |
17328 | movu [r0 + 201 * 16], m3 | |
17329 | ||
17330 | ; mode 8 [row 16 - second half] | |
17331 | movu [r0 + 417 * 16], m3 | |
17332 | ||
17333 | ; mode 6 [row 4] | |
17334 | movu m6, [r5 + 1 * 16] | |
17335 | pmaddubsw m3, m0, m6 | |
17336 | pmulhrsw m3, m7 | |
17337 | pmaddubsw m5, m2, m6 | |
17338 | pmulhrsw m5, m7 | |
17339 | packuswb m3, m5 | |
17340 | movu [r0 + 264 * 16], m3 | |
17341 | ||
17342 | ; mode 8 [row 12 - first half] | |
17343 | movu [r0 + 408 * 16], m3 | |
17344 | ||
17345 | pmaddubsw m3, m1, m6 | |
17346 | pmulhrsw m3, m7 | |
17347 | pmaddubsw m5, m4, m6 | |
17348 | pmulhrsw m5, m7 | |
17349 | packuswb m3, m5 | |
17350 | movu [r0 + 265 * 16], m3 | |
17351 | ||
17352 | ; mode 8 [row 12 - second half] | |
17353 | movu [r0 + 409 * 16], m3 | |
17354 | ||
17355 | ; mode 6 [row 6] | |
17356 | movu m6, [r5 + 27 * 16] | |
17357 | pmaddubsw m3, m0, m6 | |
17358 | pmulhrsw m3, m7 | |
17359 | pmaddubsw m5, m2, m6 | |
17360 | pmulhrsw m5, m7 | |
17361 | packuswb m3, m5 | |
17362 | movu [r0 + 268 * 16], m3 | |
17363 | pmaddubsw m3, m1, m6 | |
17364 | pmulhrsw m3, m7 | |
17365 | pmaddubsw m5, m4, m6 | |
17366 | pmulhrsw m5, m7 | |
17367 | packuswb m3, m5 | |
17368 | movu [r0 + 269 * 16], m3 | |
17369 | ||
17370 | ; mode 7 [row 7] | |
17371 | movu m6, [r5 + 8 * 16] | |
17372 | pmaddubsw m3, m0, m6 | |
17373 | pmulhrsw m3, m7 | |
17374 | pmaddubsw m5, m2, m6 | |
17375 | pmulhrsw m5, m7 | |
17376 | packuswb m3, m5 | |
17377 | movu [r0 + 334 * 16], m3 | |
17378 | pmaddubsw m3, m1, m6 | |
17379 | pmulhrsw m3, m7 | |
17380 | pmaddubsw m5, m4, m6 | |
17381 | pmulhrsw m5, m7 | |
17382 | packuswb m3, m5 | |
17383 | movu [r0 + 335 * 16], m3 | |
17384 | ||
17385 | ; mode 7 [row 8] | |
17386 | movu m6, [r5 + 17 * 16] | |
17387 | pmaddubsw m3, m0, m6 | |
17388 | pmulhrsw m3, m7 | |
17389 | pmaddubsw m5, m2, m6 | |
17390 | pmulhrsw m5, m7 | |
17391 | packuswb m3, m5 | |
17392 | movu [r0 + 336 * 16], m3 | |
17393 | pmaddubsw m3, m1, m6 | |
17394 | pmulhrsw m3, m7 | |
17395 | pmaddubsw m5, m4, m6 | |
17396 | pmulhrsw m5, m7 | |
17397 | packuswb m3, m5 | |
17398 | movu [r0 + 337 * 16], m3 | |
17399 | ||
17400 | ; mode 7 [row 9] | |
17401 | movu m6, [r5 + 26 * 16] | |
17402 | pmaddubsw m3, m0, m6 | |
17403 | pmulhrsw m3, m7 | |
17404 | pmaddubsw m5, m2, m6 | |
17405 | pmulhrsw m5, m7 | |
17406 | packuswb m3, m5 | |
17407 | movu [r0 + 338 * 16], m3 | |
17408 | ||
17409 | ; mode 8 [row 17 - first half] | |
17410 | movu [r0 + 418 * 16], m3 | |
17411 | ||
17412 | pmaddubsw m3, m1, m6 | |
17413 | pmulhrsw m3, m7 | |
17414 | pmaddubsw m5, m4, m6 | |
17415 | pmulhrsw m5, m7 | |
17416 | packuswb m3, m5 | |
17417 | movu [r0 + 339 * 16], m3 | |
17418 | ||
17419 | ; mode 8 [row 17 - second half] | |
17420 | movu [r0 + 419 * 16], m3 | |
17421 | ||
17422 | ; mode 8 [row 13] | |
17423 | movu m6, [r5 + 6 * 16] | |
17424 | pmaddubsw m3, m0, m6 | |
17425 | pmulhrsw m3, m7 | |
17426 | pmaddubsw m5, m2, m6 | |
17427 | pmulhrsw m5, m7 | |
17428 | packuswb m3, m5 | |
17429 | movu [r0 + 410 * 16], m3 | |
17430 | pmaddubsw m3, m1, m6 | |
17431 | pmulhrsw m3, m7 | |
17432 | pmaddubsw m5, m4, m6 | |
17433 | pmulhrsw m5, m7 | |
17434 | packuswb m3, m5 | |
17435 | movu [r0 + 411 * 16], m3 | |
17436 | ||
17437 | ; mode 8 [row 14] | |
17438 | movu m6, [r5 + 11 * 16] | |
17439 | pmaddubsw m3, m0, m6 | |
17440 | pmulhrsw m3, m7 | |
17441 | pmaddubsw m5, m2, m6 | |
17442 | pmulhrsw m5, m7 | |
17443 | packuswb m3, m5 | |
17444 | movu [r0 + 412 * 16], m3 | |
17445 | pmaddubsw m3, m1, m6 | |
17446 | pmulhrsw m3, m7 | |
17447 | pmaddubsw m5, m4, m6 | |
17448 | pmulhrsw m5, m7 | |
17449 | packuswb m3, m5 | |
17450 | movu [r0 + 413 * 16], m3 | |
17451 | ||
17452 | ; mode 8 [row 15] | |
17453 | movu m6, [r5 + 16 * 16] | |
17454 | pmaddubsw m3, m0, m6 | |
17455 | pmulhrsw m3, m7 | |
17456 | pmaddubsw m5, m2, m6 | |
17457 | pmulhrsw m5, m7 | |
17458 | packuswb m3, m5 | |
17459 | movu [r0 + 414 * 16], m3 | |
17460 | pmaddubsw m3, m1, m6 | |
17461 | pmulhrsw m3, m7 | |
17462 | pmaddubsw m5, m4, m6 | |
17463 | pmulhrsw m5, m7 | |
17464 | packuswb m3, m5 | |
17465 | movu [r0 + 415 * 16], m3 | |
17466 | ||
17467 | ; mode 8 [row 18] | |
17468 | movu m6, [r5 + 31 * 16] | |
17469 | pmaddubsw m3, m0, m6 | |
17470 | pmulhrsw m3, m7 | |
17471 | pmaddubsw m5, m2, m6 | |
17472 | pmulhrsw m5, m7 | |
17473 | packuswb m3, m5 | |
17474 | movu [r0 + 420 * 16], m3 | |
17475 | pmaddubsw m3, m1, m6 | |
17476 | pmulhrsw m3, m7 | |
17477 | pmaddubsw m5, m4, m6 | |
17478 | pmulhrsw m5, m7 | |
17479 | packuswb m3, m5 | |
17480 | movu [r0 + 421 * 16], m3 | |
17481 | ||
17482 | ; mode 3 [row 3] | |
17483 | movu m6, [r5 + 8 * 16] | |
17484 | movu m0, [r4 + 4] | |
17485 | movd m1, [r4 + 5] | |
17486 | palignr m1, m0, 1 | |
17487 | punpcklbw m0, m1 | |
17488 | pmaddubsw m1, m0, m6 | |
17489 | pmulhrsw m1, m7 | |
17490 | movu m2, [r4 + 12] | |
17491 | movd m3, [r4 + 13] | |
17492 | palignr m3, m2, 1 | |
17493 | punpcklbw m2, m3 | |
17494 | pmaddubsw m3, m2, m6 | |
17495 | pmulhrsw m3, m7 | |
17496 | packuswb m1, m3 | |
17497 | movu [r0 + 70 * 16], m1 | |
17498 | ||
17499 | ; mode 6 [row 7 - first half] | |
17500 | movu [r0 + 270 * 16], m1 | |
17501 | ||
17502 | movu m1, [r4 + 20] | |
17503 | movd m3, [r4 + 21] | |
17504 | palignr m3, m1, 1 | |
17505 | punpcklbw m1, m3 | |
17506 | pmaddubsw m3, m1, m6 | |
17507 | pmulhrsw m3, m7 | |
17508 | movu m4, [r4 + 28] | |
17509 | movd m5, [r4 + 29] | |
17510 | palignr m5, m4, 1 | |
17511 | punpcklbw m4, m5 | |
17512 | pmaddubsw m5, m4, m6 | |
17513 | pmulhrsw m5, m7 | |
17514 | packuswb m3, m5 | |
17515 | movu [r0 + 71 * 16], m3 | |
17516 | ||
17517 | ; mode 6 [row 7 - second half] | |
17518 | movu [r0 + 271 * 16], m3 | |
17519 | ||
17520 | ; mode 4 [row 4] | |
17521 | movu m6, [r5 + 9 * 16] | |
17522 | pmaddubsw m3, m0, m6 | |
17523 | pmulhrsw m3, m7 | |
17524 | pmaddubsw m5, m2, m6 | |
17525 | pmulhrsw m5, m7 | |
17526 | packuswb m3, m5 | |
17527 | movu [r0 + 136 * 16], m3 | |
17528 | ||
17529 | ; mode 8 [row 20 - first half] | |
17530 | movu [r0 + 424 * 16], m3 | |
17531 | ||
17532 | pmaddubsw m3, m1, m6 | |
17533 | pmulhrsw m3, m7 | |
17534 | pmaddubsw m5, m4, m6 | |
17535 | pmulhrsw m5, m7 | |
17536 | packuswb m3, m5 | |
17537 | movu [r0 + 137 * 16], m3 | |
17538 | ||
17539 | ; mode 8 [row 20 - second half] | |
17540 | movu [r0 + 425 * 16], m3 | |
17541 | ||
17542 | ; mode 4 [row 5] | |
17543 | movu m6, [r5 + 30 * 16] | |
17544 | pmaddubsw m3, m0, m6 | |
17545 | pmulhrsw m3, m7 | |
17546 | pmaddubsw m5, m2, m6 | |
17547 | pmulhrsw m5, m7 | |
17548 | packuswb m3, m5 | |
17549 | movu [r0 + 138 * 16], m3 | |
17550 | ||
17551 | ; mode 7 [row 13 - first half] | |
17552 | movu [r0 + 346 * 16], m3 | |
17553 | ||
17554 | pmaddubsw m3, m1, m6 | |
17555 | pmulhrsw m3, m7 | |
17556 | pmaddubsw m5, m4, m6 | |
17557 | pmulhrsw m5, m7 | |
17558 | packuswb m3, m5 | |
17559 | movu [r0 + 139 * 16], m3 | |
17560 | ||
17561 | ; mode 7 [row 13 - second half] | |
17562 | movu [r0 + 347 * 16], m3 | |
17563 | ||
17564 | ; mode 5 [row 5] | |
17565 | movu m6, [r5 + 6 * 16] | |
17566 | pmaddubsw m3, m0, m6 | |
17567 | pmulhrsw m3, m7 | |
17568 | pmaddubsw m5, m2, m6 | |
17569 | pmulhrsw m5, m7 | |
17570 | packuswb m3, m5 | |
17571 | movu [r0 + 202 * 16], m3 | |
17572 | pmaddubsw m3, m1, m6 | |
17573 | pmulhrsw m3, m7 | |
17574 | pmaddubsw m5, m4, m6 | |
17575 | pmulhrsw m5, m7 | |
17576 | packuswb m3, m5 | |
17577 | movu [r0 + 203 * 16], m3 | |
17578 | ||
17579 | ; mode 5 [row 6] | |
17580 | movu m6, [r5 + 23 * 16] | |
17581 | pmaddubsw m3, m0, m6 | |
17582 | pmulhrsw m3, m7 | |
17583 | pmaddubsw m5, m2, m6 | |
17584 | pmulhrsw m5, m7 | |
17585 | packuswb m3, m5 | |
17586 | movu [r0 + 204 * 16], m3 | |
17587 | pmaddubsw m3, m1, m6 | |
17588 | pmulhrsw m3, m7 | |
17589 | pmaddubsw m5, m4, m6 | |
17590 | pmulhrsw m5, m7 | |
17591 | packuswb m3, m5 | |
17592 | movu [r0 + 205 * 16], m3 | |
17593 | ||
17594 | ; mode 6 [row 8] | |
17595 | movu m6, [r5 + 21 * 16] | |
17596 | pmaddubsw m3, m0, m6 | |
17597 | pmulhrsw m3, m7 | |
17598 | pmaddubsw m5, m2, m6 | |
17599 | pmulhrsw m5, m7 | |
17600 | packuswb m3, m5 | |
17601 | movu [r0 + 272 * 16], m3 | |
17602 | ||
17603 | ; mode 7 [row 12 - first half] | |
17604 | movu [r0 + 344 * 16], m3 | |
17605 | ||
17606 | pmaddubsw m3, m1, m6 | |
17607 | pmulhrsw m3, m7 | |
17608 | pmaddubsw m5, m4, m6 | |
17609 | pmulhrsw m5, m7 | |
17610 | packuswb m3, m5 | |
17611 | movu [r0 + 273 * 16], m3 | |
17612 | ||
17613 | ; mode 7 [row 12 - second half] | |
17614 | movu [r0 + 345 * 16], m3 | |
17615 | ||
17616 | ; mode 7 [row 10] | |
17617 | movu m6, [r5 + 3 * 16] | |
17618 | pmaddubsw m3, m0, m6 | |
17619 | pmulhrsw m3, m7 | |
17620 | pmaddubsw m5, m2, m6 | |
17621 | pmulhrsw m5, m7 | |
17622 | packuswb m3, m5 | |
17623 | movu [r0 + 340 * 16], m3 | |
17624 | pmaddubsw m3, m1, m6 | |
17625 | pmulhrsw m3, m7 | |
17626 | pmaddubsw m5, m4, m6 | |
17627 | pmulhrsw m5, m7 | |
17628 | packuswb m3, m5 | |
17629 | movu [r0 + 341 * 16], m3 | |
17630 | ||
17631 | ; mode 7 [row 11] | |
17632 | movu m6, [r5 + 12 * 16] | |
17633 | pmaddubsw m3, m0, m6 | |
17634 | pmulhrsw m3, m7 | |
17635 | pmaddubsw m5, m2, m6 | |
17636 | pmulhrsw m5, m7 | |
17637 | packuswb m3, m5 | |
17638 | movu [r0 + 342 * 16], m3 | |
17639 | pmaddubsw m3, m1, m6 | |
17640 | pmulhrsw m3, m7 | |
17641 | pmaddubsw m5, m4, m6 | |
17642 | pmulhrsw m5, m7 | |
17643 | packuswb m3, m5 | |
17644 | movu [r0 + 343 * 16], m3 | |
17645 | ||
17646 | ; mode 8 [row 19] | |
17647 | movu m6, [r5 + 4 * 16] | |
17648 | pmaddubsw m3, m0, m6 | |
17649 | pmulhrsw m3, m7 | |
17650 | pmaddubsw m5, m2, m6 | |
17651 | pmulhrsw m5, m7 | |
17652 | packuswb m3, m5 | |
17653 | movu [r0 + 422 * 16], m3 | |
17654 | pmaddubsw m3, m1, m6 | |
17655 | pmulhrsw m3, m7 | |
17656 | pmaddubsw m5, m4, m6 | |
17657 | pmulhrsw m5, m7 | |
17658 | packuswb m3, m5 | |
17659 | movu [r0 + 423 * 16], m3 | |
17660 | ||
17661 | ; mode 8 [row 21] | |
17662 | movu m6, [r5 + 14 * 16] | |
17663 | pmaddubsw m3, m0, m6 | |
17664 | pmulhrsw m3, m7 | |
17665 | pmaddubsw m5, m2, m6 | |
17666 | pmulhrsw m5, m7 | |
17667 | packuswb m3, m5 | |
17668 | movu [r0 + 426 * 16], m3 | |
17669 | pmaddubsw m3, m1, m6 | |
17670 | pmulhrsw m3, m7 | |
17671 | pmaddubsw m5, m4, m6 | |
17672 | pmulhrsw m5, m7 | |
17673 | packuswb m3, m5 | |
17674 | movu [r0 + 427 * 16], m3 | |
17675 | ||
17676 | ; mode 8 [row 22] | |
17677 | movu m6, [r5 + 19 * 16] | |
17678 | pmaddubsw m3, m0, m6 | |
17679 | pmulhrsw m3, m7 | |
17680 | pmaddubsw m5, m2, m6 | |
17681 | pmulhrsw m5, m7 | |
17682 | packuswb m3, m5 | |
17683 | movu [r0 + 428 * 16], m3 | |
17684 | pmaddubsw m3, m1, m6 | |
17685 | pmulhrsw m3, m7 | |
17686 | pmaddubsw m5, m4, m6 | |
17687 | pmulhrsw m5, m7 | |
17688 | packuswb m3, m5 | |
17689 | movu [r0 + 429 * 16], m3 | |
17690 | ||
17691 | ; mode 8 [row 23] | |
17692 | movu m6, [r5 + 24 * 16] | |
17693 | pmaddubsw m3, m0, m6 | |
17694 | pmulhrsw m3, m7 | |
17695 | pmaddubsw m5, m2, m6 | |
17696 | pmulhrsw m5, m7 | |
17697 | packuswb m3, m5 | |
17698 | movu [r0 + 430 * 16], m3 | |
17699 | pmaddubsw m3, m1, m6 | |
17700 | pmulhrsw m3, m7 | |
17701 | pmaddubsw m5, m4, m6 | |
17702 | pmulhrsw m5, m7 | |
17703 | packuswb m3, m5 | |
17704 | movu [r0 + 431 * 16], m3 | |
17705 | ||
17706 | ; mode 8 [row 24] | |
17707 | movu m6, [r5 + 29 * 16] | |
17708 | pmaddubsw m3, m0, m6 | |
17709 | pmulhrsw m3, m7 | |
17710 | pmaddubsw m5, m2, m6 | |
17711 | pmulhrsw m5, m7 | |
17712 | packuswb m3, m5 | |
17713 | movu [r0 + 432 * 16], m3 | |
17714 | pmaddubsw m3, m1, m6 | |
17715 | pmulhrsw m3, m7 | |
17716 | pmaddubsw m5, m4, m6 | |
17717 | pmulhrsw m5, m7 | |
17718 | packuswb m3, m5 | |
17719 | movu [r0 + 433 * 16], m3 | |
17720 | ||
17721 | ; mode 3 [row 4] | |
17722 | movu m6, [r5 + 2 * 16] | |
17723 | movu m0, [r4 + 5] | |
17724 | movd m1, [r4 + 6] | |
17725 | palignr m1, m0, 1 | |
17726 | punpcklbw m0, m1 | |
17727 | pmaddubsw m1, m0, m6 | |
17728 | pmulhrsw m1, m7 | |
17729 | movu m2, [r4 + 13] | |
17730 | movd m3, [r4 + 14] | |
17731 | palignr m3, m2, 1 | |
17732 | punpcklbw m2, m3 | |
17733 | pmaddubsw m3, m2, m6 | |
17734 | pmulhrsw m3, m7 | |
17735 | packuswb m1, m3 | |
17736 | movu [r0 + 72 * 16], m1 | |
17737 | ||
17738 | ; mode 6 [row 9 - first half] | |
17739 | movu [r0 + 274 * 16], m1 | |
17740 | ||
17741 | ; mode 8 [row 25 - first half] | |
17742 | movu [r0 + 434 * 16], m1 | |
17743 | ||
17744 | movu m1, [r4 + 21] | |
17745 | movd m3, [r4 + 22] | |
17746 | palignr m3, m1, 1 | |
17747 | punpcklbw m1, m3 | |
17748 | pmaddubsw m3, m1, m6 | |
17749 | pmulhrsw m3, m7 | |
17750 | movu m4, [r4 + 29] | |
17751 | movd m5, [r4 + 30] | |
17752 | palignr m5, m4, 1 | |
17753 | punpcklbw m4, m5 | |
17754 | pmaddubsw m5, m4, m6 | |
17755 | pmulhrsw m5, m7 | |
17756 | packuswb m3, m5 | |
17757 | movu [r0 + 73 * 16], m3 | |
17758 | ||
17759 | ; mode 6 [row 9 - second half] | |
17760 | movu [r0 + 275 * 16], m3 | |
17761 | ||
17762 | ; mode 8 [row 25 - second half] | |
17763 | movu [r0 + 435 * 16], m3 | |
17764 | ||
17765 | ; mode 3 [row 5] | |
17766 | movu m6, [r5 + 28 * 16] | |
17767 | pmaddubsw m3, m0, m6 | |
17768 | pmulhrsw m3, m7 | |
17769 | pmaddubsw m5, m2, m6 | |
17770 | pmulhrsw m5, m7 | |
17771 | packuswb m3, m5 | |
17772 | movu [r0 + 74 * 16], m3 | |
17773 | ||
17774 | ; mode 6 [row 11 - first half] | |
17775 | movu [r0 + 278 * 16], m3 | |
17776 | ||
17777 | pmaddubsw m3, m1, m6 | |
17778 | pmulhrsw m3, m7 | |
17779 | pmaddubsw m5, m4, m6 | |
17780 | pmulhrsw m5, m7 | |
17781 | packuswb m3, m5 | |
17782 | movu [r0 + 75 * 16], m3 | |
17783 | ||
17784 | ; mode 6 [row 11 - second half] | |
17785 | movu [r0 + 279 * 16], m3 | |
17786 | ||
17787 | ; mode 4 [row 6] | |
17788 | movu m6, [r5 + 19 * 16] | |
17789 | pmaddubsw m3, m0, m6 | |
17790 | pmulhrsw m3, m7 | |
17791 | pmaddubsw m5, m2, m6 | |
17792 | pmulhrsw m5, m7 | |
17793 | packuswb m3, m5 | |
17794 | movu [r0 + 140 * 16], m3 | |
17795 | pmaddubsw m3, m1, m6 | |
17796 | pmulhrsw m3, m7 | |
17797 | pmaddubsw m5, m4, m6 | |
17798 | pmulhrsw m5, m7 | |
17799 | packuswb m3, m5 | |
17800 | movu [r0 + 141 * 16], m3 | |
17801 | ||
17802 | ; mode 5 [row 7] | |
17803 | movu m6, [r5 + 8 * 16] | |
17804 | pmaddubsw m3, m0, m6 | |
17805 | pmulhrsw m3, m7 | |
17806 | pmaddubsw m5, m2, m6 | |
17807 | pmulhrsw m5, m7 | |
17808 | packuswb m3, m5 | |
17809 | movu [r0 + 206 * 16], m3 | |
17810 | pmaddubsw m3, m1, m6 | |
17811 | pmulhrsw m3, m7 | |
17812 | pmaddubsw m5, m4, m6 | |
17813 | pmulhrsw m5, m7 | |
17814 | packuswb m3, m5 | |
17815 | movu [r0 + 207 * 16], m3 | |
17816 | ||
17817 | ; mode 5 [row 8] | |
17818 | movu m6, [r5 + 25 * 16] | |
17819 | pmaddubsw m3, m0, m6 | |
17820 | pmulhrsw m3, m7 | |
17821 | pmaddubsw m5, m2, m6 | |
17822 | pmulhrsw m5, m7 | |
17823 | packuswb m3, m5 | |
17824 | movu [r0 + 208 * 16], m3 | |
17825 | ||
17826 | ; mode 7 [row 16 - first half] | |
17827 | movu [r0 + 352 * 16], m3 | |
17828 | ||
17829 | pmaddubsw m3, m1, m6 | |
17830 | pmulhrsw m3, m7 | |
17831 | pmaddubsw m5, m4, m6 | |
17832 | pmulhrsw m5, m7 | |
17833 | packuswb m3, m5 | |
17834 | movu [r0 + 209 * 16], m3 | |
17835 | ||
17836 | ; mode 7 [row 16 - second half] | |
17837 | movu [r0 + 353 * 16], m3 | |
17838 | ||
17839 | ; mode 6 [row 10] | |
17840 | movu m6, [r5 + 15 * 16] | |
17841 | pmaddubsw m3, m0, m6 | |
17842 | pmulhrsw m3, m7 | |
17843 | pmaddubsw m5, m2, m6 | |
17844 | pmulhrsw m5, m7 | |
17845 | packuswb m3, m5 | |
17846 | movu [r0 + 276 * 16], m3 | |
17847 | pmaddubsw m3, m1, m6 | |
17848 | pmulhrsw m3, m7 | |
17849 | pmaddubsw m5, m4, m6 | |
17850 | pmulhrsw m5, m7 | |
17851 | packuswb m3, m5 | |
17852 | movu [r0 + 277 * 16], m3 | |
17853 | ||
17854 | ; mode 7 [row 14] | |
17855 | movu m6, [r5 + 7 * 16] | |
17856 | pmaddubsw m3, m0, m6 | |
17857 | pmulhrsw m3, m7 | |
17858 | pmaddubsw m5, m2, m6 | |
17859 | pmulhrsw m5, m7 | |
17860 | packuswb m3, m5 | |
17861 | movu [r0 + 348 * 16], m3 | |
17862 | ||
17863 | ; mode 8 [row 26 - first half] | |
17864 | movu [r0 + 436 * 16], m3 | |
17865 | ||
17866 | pmaddubsw m3, m1, m6 | |
17867 | pmulhrsw m3, m7 | |
17868 | pmaddubsw m5, m4, m6 | |
17869 | pmulhrsw m5, m7 | |
17870 | packuswb m3, m5 | |
17871 | movu [r0 + 349 * 16], m3 | |
17872 | ||
17873 | ; mode 8 [row 26 - second half] | |
17874 | movu [r0 + 437 * 16], m3 | |
17875 | ||
17876 | ; mode 7 [row 15] | |
17877 | movu m6, [r5 + 16 * 16] | |
17878 | pmaddubsw m3, m0, m6 | |
17879 | pmulhrsw m3, m7 | |
17880 | pmaddubsw m5, m2, m6 | |
17881 | pmulhrsw m5, m7 | |
17882 | packuswb m3, m5 | |
17883 | movu [r0 + 350 * 16], m3 | |
17884 | pmaddubsw m3, m1, m6 | |
17885 | pmulhrsw m3, m7 | |
17886 | pmaddubsw m5, m4, m6 | |
17887 | pmulhrsw m5, m7 | |
17888 | packuswb m3, m5 | |
17889 | movu [r0 + 351 * 16], m3 | |
17890 | ||
17891 | ; mode 8 [row 27] | |
17892 | movu m6, [r5 + 12 * 16] | |
17893 | pmaddubsw m3, m0, m6 | |
17894 | pmulhrsw m3, m7 | |
17895 | pmaddubsw m5, m2, m6 | |
17896 | pmulhrsw m5, m7 | |
17897 | packuswb m3, m5 | |
17898 | movu [r0 + 438 * 16], m3 | |
17899 | pmaddubsw m3, m1, m6 | |
17900 | pmulhrsw m3, m7 | |
17901 | pmaddubsw m5, m4, m6 | |
17902 | pmulhrsw m5, m7 | |
17903 | packuswb m3, m5 | |
17904 | movu [r0 + 439 * 16], m3 | |
17905 | ||
17906 | ; mode 8 [row 28] | |
17907 | movu m6, [r5 + 17 * 16] | |
17908 | pmaddubsw m3, m0, m6 | |
17909 | pmulhrsw m3, m7 | |
17910 | pmaddubsw m5, m2, m6 | |
17911 | pmulhrsw m5, m7 | |
17912 | packuswb m3, m5 | |
17913 | movu [r0 + 440 * 16], m3 | |
17914 | pmaddubsw m3, m1, m6 | |
17915 | pmulhrsw m3, m7 | |
17916 | pmaddubsw m5, m4, m6 | |
17917 | pmulhrsw m5, m7 | |
17918 | packuswb m3, m5 | |
17919 | movu [r0 + 441 * 16], m3 | |
17920 | ||
17921 | ; mode 8 [row 29] | |
17922 | movu m6, [r5 + 22 * 16] | |
17923 | pmaddubsw m3, m0, m6 | |
17924 | pmulhrsw m3, m7 | |
17925 | pmaddubsw m5, m2, m6 | |
17926 | pmulhrsw m5, m7 | |
17927 | packuswb m3, m5 | |
17928 | movu [r0 + 442 * 16], m3 | |
17929 | pmaddubsw m3, m1, m6 | |
17930 | pmulhrsw m3, m7 | |
17931 | pmaddubsw m5, m4, m6 | |
17932 | pmulhrsw m5, m7 | |
17933 | packuswb m3, m5 | |
17934 | movu [r0 + 443 * 16], m3 | |
17935 | ||
17936 | ; mode 8 [row 30] | |
17937 | movu m6, [r5 + 27 * 16] | |
17938 | pmaddubsw m3, m0, m6 | |
17939 | pmulhrsw m3, m7 | |
17940 | pmaddubsw m5, m2, m6 | |
17941 | pmulhrsw m5, m7 | |
17942 | packuswb m3, m5 | |
17943 | movu [r0 + 444 * 16], m3 | |
17944 | pmaddubsw m3, m1, m6 | |
17945 | pmulhrsw m3, m7 | |
17946 | pmaddubsw m5, m4, m6 | |
17947 | pmulhrsw m5, m7 | |
17948 | packuswb m3, m5 | |
17949 | movu [r0 + 445 * 16], m3 | |
17950 | ||
17951 | ; mode 3 [row 6] | |
17952 | movu m6, [r5 + 22 * 16] | |
17953 | movu m0, [r4 + 6] | |
17954 | movd m1, [r4 + 7] | |
17955 | palignr m1, m0, 1 | |
17956 | punpcklbw m0, m1 | |
17957 | pmaddubsw m1, m0, m6 | |
17958 | pmulhrsw m1, m7 | |
17959 | movu m2, [r4 + 14] | |
17960 | movd m3, [r4 + 15] | |
17961 | palignr m3, m2, 1 | |
17962 | punpcklbw m2, m3 | |
17963 | pmaddubsw m3, m2, m6 | |
17964 | pmulhrsw m3, m7 | |
17965 | packuswb m1, m3 | |
17966 | movu [r0 + 76 * 16], m1 | |
17967 | ||
17968 | ; mode 6 [row 13 - first half] | |
17969 | movu [r0 + 282 * 16], m1 | |
17970 | ||
17971 | movu m1, [r4 + 22] | |
17972 | movd m3, [r4 + 23] | |
17973 | palignr m3, m1, 1 | |
17974 | punpcklbw m1, m3 | |
17975 | pmaddubsw m3, m1, m6 | |
17976 | pmulhrsw m3, m7 | |
17977 | movu m4, [r4 + 30] | |
17978 | movd m5, [r4 + 31] | |
17979 | palignr m5, m4, 1 | |
17980 | punpcklbw m4, m5 | |
17981 | pmaddubsw m5, m4, m6 | |
17982 | pmulhrsw m5, m7 | |
17983 | packuswb m3, m5 | |
17984 | movu [r0 + 77 * 16], m3 | |
17985 | ||
17986 | ; mode 6 [row 13 - second half] | |
17987 | movu [r0 + 283 * 16], m3 | |
17988 | ||
17989 | ; mode 4 [row 7] | |
17990 | movu m6, [r5 + 8 * 16] | |
17991 | pmaddubsw m3, m0, m6 | |
17992 | pmulhrsw m3, m7 | |
17993 | pmaddubsw m5, m2, m6 | |
17994 | pmulhrsw m5, m7 | |
17995 | packuswb m3, m5 | |
17996 | movu [r0 + 142 * 16], m3 | |
17997 | pmaddubsw m3, m1, m6 | |
17998 | pmulhrsw m3, m7 | |
17999 | pmaddubsw m5, m4, m6 | |
18000 | pmulhrsw m5, m7 | |
18001 | packuswb m3, m5 | |
18002 | movu [r0 + 143 * 16], m3 | |
18003 | ||
18004 | ; mode 4 [row 8] | |
18005 | movu m6, [r5 + 29 * 16] | |
18006 | pmaddubsw m3, m0, m6 | |
18007 | pmulhrsw m3, m7 | |
18008 | pmaddubsw m5, m2, m6 | |
18009 | pmulhrsw m5, m7 | |
18010 | packuswb m3, m5 | |
18011 | movu [r0 + 144 * 16], m3 | |
18012 | ||
18013 | ; mode 7 [row 20 - first half] | |
18014 | movu [r0 + 360 * 16], m3 | |
18015 | ||
18016 | pmaddubsw m3, m1, m6 | |
18017 | pmulhrsw m3, m7 | |
18018 | pmaddubsw m5, m4, m6 | |
18019 | pmulhrsw m5, m7 | |
18020 | packuswb m3, m5 | |
18021 | movu [r0 + 145 * 16], m3 | |
18022 | ||
18023 | ; mode 7 [row 20 - second half] | |
18024 | movu [r0 + 361 * 16], m3 | |
18025 | ||
18026 | ; mode 5 [row 9] | |
18027 | movu m6, [r5 + 10 * 16] | |
18028 | pmaddubsw m3, m0, m6 | |
18029 | pmulhrsw m3, m7 | |
18030 | pmaddubsw m5, m2, m6 | |
18031 | pmulhrsw m5, m7 | |
18032 | packuswb m3, m5 | |
18033 | movu [r0 + 210 * 16], m3 | |
18034 | pmaddubsw m3, m1, m6 | |
18035 | pmulhrsw m3, m7 | |
18036 | pmaddubsw m5, m4, m6 | |
18037 | pmulhrsw m5, m7 | |
18038 | packuswb m3, m5 | |
18039 | movu [r0 + 211 * 16], m3 | |
18040 | ||
18041 | ; mode 5 [row 10] | |
18042 | movu m6, [r5 + 27 * 16] | |
18043 | pmaddubsw m3, m0, m6 | |
18044 | pmulhrsw m3, m7 | |
18045 | pmaddubsw m5, m2, m6 | |
18046 | pmulhrsw m5, m7 | |
18047 | packuswb m3, m5 | |
18048 | movu [r0 + 212 * 16], m3 | |
18049 | pmaddubsw m3, m1, m6 | |
18050 | pmulhrsw m3, m7 | |
18051 | pmaddubsw m5, m4, m6 | |
18052 | pmulhrsw m5, m7 | |
18053 | packuswb m3, m5 | |
18054 | movu [r0 + 213 * 16], m3 | |
18055 | ||
18056 | ; mode 7 [row 17] | |
18057 | movu m6, [r5 + 2 * 16] | |
18058 | pmaddubsw m3, m0, m6 | |
18059 | pmulhrsw m3, m7 | |
18060 | pmaddubsw m5, m2, m6 | |
18061 | pmulhrsw m5, m7 | |
18062 | packuswb m3, m5 | |
18063 | movu [r0 + 354 * 16], m3 | |
18064 | pmaddubsw m3, m1, m6 | |
18065 | pmulhrsw m3, m7 | |
18066 | pmaddubsw m5, m4, m6 | |
18067 | pmulhrsw m5, m7 | |
18068 | packuswb m3, m5 | |
18069 | movu [r0 + 355 * 16], m3 | |
18070 | ||
18071 | ; mode 7 [row 18] | |
18072 | movu m6, [r5 + 11 * 16] | |
18073 | pmaddubsw m3, m0, m6 | |
18074 | pmulhrsw m3, m7 | |
18075 | pmaddubsw m5, m2, m6 | |
18076 | pmulhrsw m5, m7 | |
18077 | packuswb m3, m5 | |
18078 | movu [r0 + 356 * 16], m3 | |
18079 | pmaddubsw m3, m1, m6 | |
18080 | pmulhrsw m3, m7 | |
18081 | pmaddubsw m5, m4, m6 | |
18082 | pmulhrsw m5, m7 | |
18083 | packuswb m3, m5 | |
18084 | movu [r0 + 357 * 16], m3 | |
18085 | ||
18086 | ; mode 7 [row 19] | |
18087 | movu m6, [r5 + 20 * 16] | |
18088 | pmaddubsw m3, m0, m6 | |
18089 | pmulhrsw m3, m7 | |
18090 | pmaddubsw m5, m2, m6 | |
18091 | pmulhrsw m5, m7 | |
18092 | packuswb m3, m5 | |
18093 | movu [r0 + 358 * 16], m3 | |
18094 | pmaddubsw m3, m1, m6 | |
18095 | pmulhrsw m3, m7 | |
18096 | pmaddubsw m5, m4, m6 | |
18097 | pmulhrsw m5, m7 | |
18098 | packuswb m3, m5 | |
18099 | movu [r0 + 359 * 16], m3 | |
18100 | ||
18101 | ; mode 6 [row 12] | |
18102 | movu m6, [r5 + 9 * 16] | |
18103 | pmaddubsw m3, m0, m6 | |
18104 | pmulhrsw m3, m7 | |
18105 | pmaddubsw m5, m2, m6 | |
18106 | pmulhrsw m5, m7 | |
18107 | packuswb m3, m5 | |
18108 | movu [r0 + 280 * 16], m3 | |
18109 | pmaddubsw m3, m1, m6 | |
18110 | pmulhrsw m3, m7 | |
18111 | pmaddubsw m5, m4, m6 | |
18112 | pmulhrsw m5, m7 | |
18113 | packuswb m3, m5 | |
18114 | movu [r0 + 281 * 16], m3 | |
18115 | ||
18116 | ; mode 3 [row 7] | |
18117 | movu m6, [r5 + 16 * 16] | |
18118 | movu m0, [r4 + 7] | |
18119 | movd m1, [r4 + 8] | |
18120 | palignr m1, m0, 1 | |
18121 | punpcklbw m0, m1 | |
18122 | pmaddubsw m1, m0, m6 | |
18123 | pmulhrsw m1, m7 | |
18124 | movu m2, [r4 + 15] | |
18125 | movd m3, [r4 + 16] | |
18126 | palignr m3, m2, 1 | |
18127 | punpcklbw m2, m3 | |
18128 | pmaddubsw m3, m2, m6 | |
18129 | pmulhrsw m3, m7 | |
18130 | packuswb m1, m3 | |
18131 | movu [r0 + 78 * 16], m1 | |
18132 | ||
18133 | ; mode 6 [row 15 - first half] | |
18134 | movu [r0 + 286 * 16], m1 | |
18135 | ||
18136 | movu m1, [r4 + 23] | |
18137 | movd m3, [r4 + 24] | |
18138 | palignr m3, m1, 1 | |
18139 | punpcklbw m1, m3 | |
18140 | pmaddubsw m3, m1, m6 | |
18141 | pmulhrsw m3, m7 | |
18142 | movu m4, [r4 + 31] | |
18143 | movd m5, [r4 + 32] | |
18144 | palignr m5, m4, 1 | |
18145 | punpcklbw m4, m5 | |
18146 | pmaddubsw m5, m4, m6 | |
18147 | pmulhrsw m5, m7 | |
18148 | packuswb m3, m5 | |
18149 | movu [r0 + 79 * 16], m3 | |
18150 | ||
18151 | ; mode 6 [row 15 - second half] | |
18152 | movu [r0 + 287 * 16], m3 | |
18153 | ||
18154 | ; mode 4 [row 9] | |
18155 | movu m6, [r5 + 18 * 16] | |
18156 | pmaddubsw m3, m0, m6 | |
18157 | pmulhrsw m3, m7 | |
18158 | pmaddubsw m5, m2, m6 | |
18159 | pmulhrsw m5, m7 | |
18160 | packuswb m3, m5 | |
18161 | movu [r0 + 146 * 16], m3 | |
18162 | pmaddubsw m3, m1, m6 | |
18163 | pmulhrsw m3, m7 | |
18164 | pmaddubsw m5, m4, m6 | |
18165 | pmulhrsw m5, m7 | |
18166 | packuswb m3, m5 | |
18167 | movu [r0 + 147 * 16], m3 | |
18168 | ||
18169 | ; mode 5 [row 11] | |
18170 | movu m6, [r5 + 12 * 16] | |
18171 | pmaddubsw m3, m0, m6 | |
18172 | pmulhrsw m3, m7 | |
18173 | pmaddubsw m5, m2, m6 | |
18174 | pmulhrsw m5, m7 | |
18175 | packuswb m3, m5 | |
18176 | movu [r0 + 214 * 16], m3 | |
18177 | pmaddubsw m3, m1, m6 | |
18178 | pmulhrsw m3, m7 | |
18179 | pmaddubsw m5, m4, m6 | |
18180 | pmulhrsw m5, m7 | |
18181 | packuswb m3, m5 | |
18182 | movu [r0 + 215 * 16], m3 | |
18183 | ||
18184 | ; mode 5 [row 12] | |
18185 | movu m6, [r5 + 29 * 16] | |
18186 | pmaddubsw m3, m0, m6 | |
18187 | pmulhrsw m3, m7 | |
18188 | pmaddubsw m5, m2, m6 | |
18189 | pmulhrsw m5, m7 | |
18190 | packuswb m3, m5 | |
18191 | movu [r0 + 216 * 16], m3 | |
18192 | ||
18193 | ; mode 6 [row 16 - first half] | |
18194 | movu [r0 + 288 * 16], m3 | |
18195 | ||
18196 | pmaddubsw m3, m1, m6 | |
18197 | pmulhrsw m3, m7 | |
18198 | pmaddubsw m5, m4, m6 | |
18199 | pmulhrsw m5, m7 | |
18200 | packuswb m3, m5 | |
18201 | movu [r0 + 217 * 16], m3 | |
18202 | ||
18203 | ; mode 6 [row 16 - second half] | |
18204 | movu [r0 + 289 * 16], m3 | |
18205 | ||
18206 | ; mode 6 [row 14] | |
18207 | movu m6, [r5 + 3 * 16] | |
18208 | pmaddubsw m3, m0, m6 | |
18209 | pmulhrsw m3, m7 | |
18210 | pmaddubsw m5, m2, m6 | |
18211 | pmulhrsw m5, m7 | |
18212 | packuswb m3, m5 | |
18213 | movu [r0 + 284 * 16], m3 | |
18214 | pmaddubsw m3, m1, m6 | |
18215 | pmulhrsw m3, m7 | |
18216 | pmaddubsw m5, m4, m6 | |
18217 | pmulhrsw m5, m7 | |
18218 | packuswb m3, m5 | |
18219 | movu [r0 + 285 * 16], m3 | |
18220 | ||
18221 | ; mode 7 [row 21] | |
18222 | movu m6, [r5 + 6 * 16] | |
18223 | pmaddubsw m3, m0, m6 | |
18224 | pmulhrsw m3, m7 | |
18225 | pmaddubsw m5, m2, m6 | |
18226 | pmulhrsw m5, m7 | |
18227 | packuswb m3, m5 | |
18228 | movu [r0 + 362 * 16], m3 | |
18229 | pmaddubsw m3, m1, m6 | |
18230 | pmulhrsw m3, m7 | |
18231 | pmaddubsw m5, m4, m6 | |
18232 | pmulhrsw m5, m7 | |
18233 | packuswb m3, m5 | |
18234 | movu [r0 + 363 * 16], m3 | |
18235 | ||
18236 | ; mode 7 [row 22] | |
18237 | movu m6, [r5 + 15 * 16] | |
18238 | pmaddubsw m3, m0, m6 | |
18239 | pmulhrsw m3, m7 | |
18240 | pmaddubsw m5, m2, m6 | |
18241 | pmulhrsw m5, m7 | |
18242 | packuswb m3, m5 | |
18243 | movu [r0 + 364 * 16], m3 | |
18244 | pmaddubsw m3, m1, m6 | |
18245 | pmulhrsw m3, m7 | |
18246 | pmaddubsw m5, m4, m6 | |
18247 | pmulhrsw m5, m7 | |
18248 | packuswb m3, m5 | |
18249 | movu [r0 + 365 * 16], m3 | |
18250 | ||
18251 | ; mode 7 [row 23] | |
18252 | movu m6, [r5 + 24 * 16] | |
18253 | pmaddubsw m3, m0, m6 | |
18254 | pmulhrsw m3, m7 | |
18255 | pmaddubsw m5, m2, m6 | |
18256 | pmulhrsw m5, m7 | |
18257 | packuswb m3, m5 | |
18258 | movu [r0 + 366 * 16], m3 | |
18259 | pmaddubsw m3, m1, m6 | |
18260 | pmulhrsw m3, m7 | |
18261 | pmaddubsw m5, m4, m6 | |
18262 | pmulhrsw m5, m7 | |
18263 | packuswb m3, m5 | |
18264 | movu [r0 + 367 * 16], m3 | |
18265 | ||
18266 | ; mode 3 [row 8] | |
18267 | movu m6, [r5 + 10 * 16] | |
18268 | movu m0, [r4 + 8] | |
18269 | movd m1, [r4 + 9] | |
18270 | palignr m1, m0, 1 | |
18271 | punpcklbw m0, m1 | |
18272 | pmaddubsw m1, m0, m6 | |
18273 | pmulhrsw m1, m7 | |
18274 | movu m2, [r4 + 16] | |
18275 | movd m3, [r4 + 17] | |
18276 | palignr m3, m2, 1 | |
18277 | punpcklbw m2, m3 | |
18278 | pmaddubsw m3, m2, m6 | |
18279 | pmulhrsw m3, m7 | |
18280 | packuswb m1, m3 | |
18281 | movu [r0 + 80 * 16], m1 | |
18282 | ||
18283 | ; mode 6 [row 17 - first half] | |
18284 | movu [r0 + 290 * 16], m1 | |
18285 | ||
18286 | ; mode 7 [row 25 - first half] | |
18287 | movu [r0 + 370 * 16], m1 | |
18288 | ||
18289 | movu m1, [r4 + 24] | |
18290 | movd m3, [r4 + 25] | |
18291 | palignr m3, m1, 1 | |
18292 | punpcklbw m1, m3 | |
18293 | pmaddubsw m3, m1, m6 | |
18294 | pmulhrsw m3, m7 | |
18295 | movu m4, [r4 + 32] | |
18296 | movd m5, [r4 + 33] | |
18297 | palignr m5, m4, 1 | |
18298 | punpcklbw m4, m5 | |
18299 | pmaddubsw m5, m4, m6 | |
18300 | pmulhrsw m5, m7 | |
18301 | packuswb m3, m5 | |
18302 | movu [r0 + 81 * 16], m3 | |
18303 | ||
18304 | ; mode 6 [row 17 - second half] | |
18305 | movu [r0 + 291 * 16], m3 | |
18306 | ||
18307 | ; mode 7 [row 25 - second half] | |
18308 | movu [r0 + 371 * 16], m3 | |
18309 | ||
18310 | ; mode 4 [row 10] | |
18311 | movu m6, [r5 + 7 * 16] | |
18312 | pmaddubsw m3, m0, m6 | |
18313 | pmulhrsw m3, m7 | |
18314 | pmaddubsw m5, m2, m6 | |
18315 | pmulhrsw m5, m7 | |
18316 | packuswb m3, m5 | |
18317 | movu [r0 + 148 * 16], m3 | |
18318 | pmaddubsw m3, m1, m6 | |
18319 | pmulhrsw m3, m7 | |
18320 | pmaddubsw m5, m4, m6 | |
18321 | pmulhrsw m5, m7 | |
18322 | packuswb m3, m5 | |
18323 | movu [r0 + 149 * 16], m3 | |
18324 | ||
18325 | ; mode 4 [row 11] | |
18326 | movu m6, [r5 + 28 * 16] | |
18327 | pmaddubsw m3, m0, m6 | |
18328 | pmulhrsw m3, m7 | |
18329 | pmaddubsw m5, m2, m6 | |
18330 | pmulhrsw m5, m7 | |
18331 | packuswb m3, m5 | |
18332 | movu [r0 + 150 * 16], m3 | |
18333 | ||
18334 | ; mode 7 [row 27 - first half] | |
18335 | movu [r0 + 374 * 16], m3 | |
18336 | ||
18337 | pmaddubsw m3, m1, m6 | |
18338 | pmulhrsw m3, m7 | |
18339 | pmaddubsw m5, m4, m6 | |
18340 | pmulhrsw m5, m7 | |
18341 | packuswb m3, m5 | |
18342 | movu [r0 + 151 * 16], m3 | |
18343 | ||
18344 | ; mode 7 [row 27 - second half] | |
18345 | movu [r0 + 375 * 16], m3 | |
18346 | ||
18347 | ; mode 5 [row 13] | |
18348 | movu m6, [r5 + 14 * 16] | |
18349 | pmaddubsw m3, m0, m6 | |
18350 | pmulhrsw m3, m7 | |
18351 | pmaddubsw m5, m2, m6 | |
18352 | pmulhrsw m5, m7 | |
18353 | packuswb m3, m5 | |
18354 | movu [r0 + 218 * 16], m3 | |
18355 | pmaddubsw m3, m1, m6 | |
18356 | pmulhrsw m3, m7 | |
18357 | pmaddubsw m5, m4, m6 | |
18358 | pmulhrsw m5, m7 | |
18359 | packuswb m3, m5 | |
18360 | movu [r0 + 219 * 16], m3 | |
18361 | ||
18362 | ; mode 5 [row 14] | |
18363 | movu m6, [r5 + 31 * 16] | |
18364 | pmaddubsw m3, m0, m6 | |
18365 | pmulhrsw m3, m7 | |
18366 | pmaddubsw m5, m2, m6 | |
18367 | pmulhrsw m5, m7 | |
18368 | packuswb m3, m5 | |
18369 | movu [r0 + 220 * 16], m3 | |
18370 | pmaddubsw m3, m1, m6 | |
18371 | pmulhrsw m3, m7 | |
18372 | pmaddubsw m5, m4, m6 | |
18373 | pmulhrsw m5, m7 | |
18374 | packuswb m3, m5 | |
18375 | movu [r0 + 221 * 16], m3 | |
18376 | ||
18377 | ; mode 6 [row 18] | |
18378 | movu m6, [r5 + 23 * 16] | |
18379 | pmaddubsw m3, m0, m6 | |
18380 | pmulhrsw m3, m7 | |
18381 | pmaddubsw m5, m2, m6 | |
18382 | pmulhrsw m5, m7 | |
18383 | packuswb m3, m5 | |
18384 | movu [r0 + 292 * 16], m3 | |
18385 | pmaddubsw m3, m1, m6 | |
18386 | pmulhrsw m3, m7 | |
18387 | pmaddubsw m5, m4, m6 | |
18388 | pmulhrsw m5, m7 | |
18389 | packuswb m3, m5 | |
18390 | movu [r0 + 293 * 16], m3 | |
18391 | ||
18392 | ; mode 7 [row 24] | |
18393 | movu m6, [r5 + 1 * 16] | |
18394 | pmaddubsw m3, m0, m6 | |
18395 | pmulhrsw m3, m7 | |
18396 | pmaddubsw m5, m2, m6 | |
18397 | pmulhrsw m5, m7 | |
18398 | packuswb m3, m5 | |
18399 | movu [r0 + 368 * 16], m3 | |
18400 | pmaddubsw m3, m1, m6 | |
18401 | pmulhrsw m3, m7 | |
18402 | pmaddubsw m5, m4, m6 | |
18403 | pmulhrsw m5, m7 | |
18404 | packuswb m3, m5 | |
18405 | movu [r0 + 369 * 16], m3 | |
18406 | ||
18407 | ; mode 7 [row 26] | |
18408 | movu m6, [r5 + 19 * 16] | |
18409 | pmaddubsw m3, m0, m6 | |
18410 | pmulhrsw m3, m7 | |
18411 | pmaddubsw m5, m2, m6 | |
18412 | pmulhrsw m5, m7 | |
18413 | packuswb m3, m5 | |
18414 | movu [r0 + 372 * 16], m3 | |
18415 | pmaddubsw m3, m1, m6 | |
18416 | pmulhrsw m3, m7 | |
18417 | pmaddubsw m5, m4, m6 | |
18418 | pmulhrsw m5, m7 | |
18419 | packuswb m3, m5 | |
18420 | movu [r0 + 373 * 16], m3 | |
18421 | ||
18422 | ; mode 3 [row 9] | |
18423 | movu m6, [r5 + 4 * 16] | |
18424 | movu m0, [r4 + 9] | |
18425 | movd m1, [r4 + 10] | |
18426 | palignr m1, m0, 1 | |
18427 | punpcklbw m0, m1 | |
18428 | pmaddubsw m1, m0, m6 | |
18429 | pmulhrsw m1, m7 | |
18430 | movu m2, [r4 + 17] | |
18431 | movd m3, [r4 + 18] | |
18432 | palignr m3, m2, 1 | |
18433 | punpcklbw m2, m3 | |
18434 | pmaddubsw m3, m2, m6 | |
18435 | pmulhrsw m3, m7 | |
18436 | packuswb m1, m3 | |
18437 | movu [r0 + 82 * 16], m1 | |
18438 | ||
18439 | ; mode 6 [row 19 - first half] | |
18440 | movu [r0 + 294 * 16], m1 | |
18441 | ||
18442 | movu m1, [r4 + 25] | |
18443 | movd m3, [r4 + 26] | |
18444 | palignr m3, m1, 1 | |
18445 | punpcklbw m1, m3 | |
18446 | pmaddubsw m3, m1, m6 | |
18447 | pmulhrsw m3, m7 | |
18448 | movu m4, [r4 + 33] | |
18449 | movd m5, [r4 + 34] | |
18450 | palignr m5, m4, 1 | |
18451 | punpcklbw m4, m5 | |
18452 | pmaddubsw m5, m4, m6 | |
18453 | pmulhrsw m5, m7 | |
18454 | packuswb m3, m5 | |
18455 | movu [r0 + 83 * 16], m3 | |
18456 | ||
18457 | ; mode 6 [row 19 - second half] | |
18458 | movu [r0 + 295 * 16], m3 | |
18459 | ||
18460 | ; mode 4 [row 12] | |
18461 | movu m6, [r5 + 17 * 16] | |
18462 | pmaddubsw m3, m0, m6 | |
18463 | pmulhrsw m3, m7 | |
18464 | pmaddubsw m5, m2, m6 | |
18465 | pmulhrsw m5, m7 | |
18466 | packuswb m3, m5 | |
18467 | movu [r0 + 152 * 16], m3 | |
18468 | ||
18469 | ; mode 6 [row 20 - first half] | |
18470 | movu [r0 + 296 * 16], m3 | |
18471 | ||
18472 | pmaddubsw m3, m1, m6 | |
18473 | pmulhrsw m3, m7 | |
18474 | pmaddubsw m5, m4, m6 | |
18475 | pmulhrsw m5, m7 | |
18476 | packuswb m3, m5 | |
18477 | movu [r0 + 153 * 16], m3 | |
18478 | ||
18479 | ; mode 6 [row 20 - second half] | |
18480 | movu [r0 + 297 * 16], m3 | |
18481 | ||
18482 | ; mode 3 [row 10] | |
18483 | movu m6, [r5 + 30 * 16] | |
18484 | pmaddubsw m3, m0, m6 | |
18485 | pmulhrsw m3, m7 | |
18486 | pmaddubsw m5, m2, m6 | |
18487 | pmulhrsw m5, m7 | |
18488 | packuswb m3, m5 | |
18489 | movu [r0 + 84 * 16], m3 | |
18490 | ||
18491 | ; mode 6 [row 21 - first half] | |
18492 | movu [r0 + 298 * 16], m3 | |
18493 | ||
18494 | pmaddubsw m3, m1, m6 | |
18495 | pmulhrsw m3, m7 | |
18496 | pmaddubsw m5, m4, m6 | |
18497 | pmulhrsw m5, m7 | |
18498 | packuswb m3, m5 | |
18499 | movu [r0 + 85 * 16], m3 | |
18500 | ||
18501 | ; mode 6 [row 21 - second half] | |
18502 | movu [r0 + 299 * 16], m3 | |
18503 | ||
18504 | ; mode 5 [row 15] | |
18505 | movu m6, [r5 + 16 * 16] | |
18506 | pmaddubsw m3, m0, m6 | |
18507 | pmulhrsw m3, m7 | |
18508 | pmaddubsw m5, m2, m6 | |
18509 | pmulhrsw m5, m7 | |
18510 | packuswb m3, m5 | |
18511 | movu [r0 + 222 * 16], m3 | |
18512 | pmaddubsw m3, m1, m6 | |
18513 | pmulhrsw m3, m7 | |
18514 | pmaddubsw m5, m4, m6 | |
18515 | pmulhrsw m5, m7 | |
18516 | packuswb m3, m5 | |
18517 | movu [r0 + 223 * 16], m3 | |
18518 | ||
18519 | ; mode 7 [row 28] | |
18520 | movu m6, [r5 + 5 * 16] | |
18521 | pmaddubsw m3, m0, m6 | |
18522 | pmulhrsw m3, m7 | |
18523 | pmaddubsw m5, m2, m6 | |
18524 | pmulhrsw m5, m7 | |
18525 | packuswb m3, m5 | |
18526 | movu [r0 + 376 * 16], m3 | |
18527 | pmaddubsw m3, m1, m6 | |
18528 | pmulhrsw m3, m7 | |
18529 | pmaddubsw m5, m4, m6 | |
18530 | pmulhrsw m5, m7 | |
18531 | packuswb m3, m5 | |
18532 | movu [r0 + 377 * 16], m3 | |
18533 | ||
18534 | ; mode 7 [row 29] | |
18535 | movu m6, [r5 + 14 * 16] | |
18536 | pmaddubsw m3, m0, m6 | |
18537 | pmulhrsw m3, m7 | |
18538 | pmaddubsw m5, m2, m6 | |
18539 | pmulhrsw m5, m7 | |
18540 | packuswb m3, m5 | |
18541 | movu [r0 + 378 * 16], m3 | |
18542 | pmaddubsw m3, m1, m6 | |
18543 | pmulhrsw m3, m7 | |
18544 | pmaddubsw m5, m4, m6 | |
18545 | pmulhrsw m5, m7 | |
18546 | packuswb m3, m5 | |
18547 | movu [r0 + 379 * 16], m3 | |
18548 | ||
18549 | ; mode 7 [row 30] | |
18550 | movu m6, [r5 + 23 * 16] | |
18551 | pmaddubsw m3, m0, m6 | |
18552 | pmulhrsw m3, m7 | |
18553 | pmaddubsw m5, m2, m6 | |
18554 | pmulhrsw m5, m7 | |
18555 | packuswb m3, m5 | |
18556 | movu [r0 + 380 * 16], m3 | |
18557 | pmaddubsw m3, m1, m6 | |
18558 | pmulhrsw m3, m7 | |
18559 | pmaddubsw m5, m4, m6 | |
18560 | pmulhrsw m5, m7 | |
18561 | packuswb m3, m5 | |
18562 | movu [r0 + 381 * 16], m3 | |
18563 | ||
18564 | ; mode 3 [row 11] | |
18565 | movu m6, [r5 + 24 * 16] | |
18566 | movu m0, [r4 + 10] | |
18567 | movd m1, [r4 + 11] | |
18568 | palignr m1, m0, 1 | |
18569 | punpcklbw m0, m1 | |
18570 | pmaddubsw m1, m0, m6 | |
18571 | pmulhrsw m1, m7 | |
18572 | movu m2, [r4 + 18] | |
18573 | movd m3, [r4 + 19] | |
18574 | palignr m3, m2, 1 | |
18575 | punpcklbw m2, m3 | |
18576 | pmaddubsw m3, m2, m6 | |
18577 | pmulhrsw m3, m7 | |
18578 | packuswb m1, m3 | |
18579 | movu [r0 + 86 * 16], m1 | |
18580 | ||
18581 | ; mode 6 [row 23 - first half] | |
18582 | movu [r0 + 302 * 16], m1 | |
18583 | ||
18584 | movu m1, [r4 + 26] | |
18585 | movd m3, [r4 + 27] | |
18586 | palignr m3, m1, 1 | |
18587 | punpcklbw m1, m3 | |
18588 | pmaddubsw m3, m1, m6 | |
18589 | pmulhrsw m3, m7 | |
18590 | movu m4, [r4 + 34] | |
18591 | movd m5, [r4 + 35] | |
18592 | palignr m5, m4, 1 | |
18593 | punpcklbw m4, m5 | |
18594 | pmaddubsw m5, m4, m6 | |
18595 | pmulhrsw m5, m7 | |
18596 | packuswb m3, m5 | |
18597 | movu [r0 + 87 * 16], m3 | |
18598 | ||
18599 | ; mode 6 [row 23 - second half] | |
18600 | movu [r0 + 303 * 16], m3 | |
18601 | ||
18602 | ; mode 4 [row 13] | |
18603 | movu m6, [r5 + 6 * 16] | |
18604 | pmaddubsw m3, m0, m6 | |
18605 | pmulhrsw m3, m7 | |
18606 | pmaddubsw m5, m2, m6 | |
18607 | pmulhrsw m5, m7 | |
18608 | packuswb m3, m5 | |
18609 | movu [r0 + 154 * 16], m3 | |
18610 | pmaddubsw m3, m1, m6 | |
18611 | pmulhrsw m3, m7 | |
18612 | pmaddubsw m5, m4, m6 | |
18613 | pmulhrsw m5, m7 | |
18614 | packuswb m3, m5 | |
18615 | movu [r0 + 155 * 16], m3 | |
18616 | ||
18617 | ; mode 4 [row 14] | |
18618 | movu m6, [r5 + 27 * 16] | |
18619 | pmaddubsw m3, m0, m6 | |
18620 | pmulhrsw m3, m7 | |
18621 | pmaddubsw m5, m2, m6 | |
18622 | pmulhrsw m5, m7 | |
18623 | packuswb m3, m5 | |
18624 | movu [r0 + 156 * 16], m3 | |
18625 | pmaddubsw m3, m1, m6 | |
18626 | pmulhrsw m3, m7 | |
18627 | pmaddubsw m5, m4, m6 | |
18628 | pmulhrsw m5, m7 | |
18629 | packuswb m3, m5 | |
18630 | movu [r0 + 157 * 16], m3 | |
18631 | ||
18632 | ; mode 5 [row 16] | |
18633 | movu m6, [r5 + 1 * 16] | |
18634 | pmaddubsw m3, m0, m6 | |
18635 | pmulhrsw m3, m7 | |
18636 | pmaddubsw m5, m2, m6 | |
18637 | pmulhrsw m5, m7 | |
18638 | packuswb m3, m5 | |
18639 | movu [r0 + 224 * 16], m3 | |
18640 | pmaddubsw m3, m1, m6 | |
18641 | pmulhrsw m3, m7 | |
18642 | pmaddubsw m5, m4, m6 | |
18643 | pmulhrsw m5, m7 | |
18644 | packuswb m3, m5 | |
18645 | movu [r0 + 225 * 16], m3 | |
18646 | ||
18647 | ; mode 5 [row 17] | |
18648 | movu m6, [r5 + 18 * 16] | |
18649 | pmaddubsw m3, m0, m6 | |
18650 | pmulhrsw m3, m7 | |
18651 | pmaddubsw m5, m2, m6 | |
18652 | pmulhrsw m5, m7 | |
18653 | packuswb m3, m5 | |
18654 | movu [r0 + 226 * 16], m3 | |
18655 | pmaddubsw m3, m1, m6 | |
18656 | pmulhrsw m3, m7 | |
18657 | pmaddubsw m5, m4, m6 | |
18658 | pmulhrsw m5, m7 | |
18659 | packuswb m3, m5 | |
18660 | movu [r0 + 227 * 16], m3 | |
18661 | ||
18662 | ; mode 6 [row 22] | |
18663 | movu m6, [r5 + 11 * 16] | |
18664 | pmaddubsw m3, m0, m6 | |
18665 | pmulhrsw m3, m7 | |
18666 | pmaddubsw m5, m2, m6 | |
18667 | pmulhrsw m5, m7 | |
18668 | packuswb m3, m5 | |
18669 | movu [r0 + 300 * 16], m3 | |
18670 | pmaddubsw m3, m1, m6 | |
18671 | pmulhrsw m3, m7 | |
18672 | pmaddubsw m5, m4, m6 | |
18673 | pmulhrsw m5, m7 | |
18674 | packuswb m3, m5 | |
18675 | movu [r0 + 301 * 16], m3 | |
18676 | ||
18677 | ; mode 3 [row 12] | |
18678 | movu m6, [r5 + 18 * 16] | |
18679 | movu m0, [r4 + 11] | |
18680 | movd m1, [r4 + 12] | |
18681 | palignr m1, m0, 1 | |
18682 | punpcklbw m0, m1 | |
18683 | pmaddubsw m1, m0, m6 | |
18684 | pmulhrsw m1, m7 | |
18685 | movu m2, [r4 + 19] | |
18686 | movd m3, [r4 + 20] | |
18687 | palignr m3, m2, 1 | |
18688 | punpcklbw m2, m3 | |
18689 | pmaddubsw m3, m2, m6 | |
18690 | pmulhrsw m3, m7 | |
18691 | packuswb m1, m3 | |
18692 | movu [r0 + 88 * 16], m1 | |
18693 | ||
18694 | ; mode 6 [row 25 - first half] | |
18695 | movu [r0 + 306 * 16], m1 | |
18696 | ||
18697 | movu m1, [r4 + 27] | |
18698 | movd m3, [r4 + 28] | |
18699 | palignr m3, m1, 1 | |
18700 | punpcklbw m1, m3 | |
18701 | pmaddubsw m3, m1, m6 | |
18702 | pmulhrsw m3, m7 | |
18703 | movu m4, [r4 + 35] | |
18704 | movd m5, [r4 + 36] | |
18705 | palignr m5, m4, 1 | |
18706 | punpcklbw m4, m5 | |
18707 | pmaddubsw m5, m4, m6 | |
18708 | pmulhrsw m5, m7 | |
18709 | packuswb m3, m5 | |
18710 | movu [r0 + 89 * 16], m3 | |
18711 | ||
18712 | ; mode 6 [row 25 - second half] | |
18713 | movu [r0 + 307 * 16], m3 | |
18714 | ||
18715 | ; mode 4 [row 15] | |
18716 | movu m6, [r5 + 16 * 16] | |
18717 | pmaddubsw m3, m0, m6 | |
18718 | pmulhrsw m3, m7 | |
18719 | pmaddubsw m5, m2, m6 | |
18720 | pmulhrsw m5, m7 | |
18721 | packuswb m3, m5 | |
18722 | movu [r0 + 158 * 16], m3 | |
18723 | pmaddubsw m3, m1, m6 | |
18724 | pmulhrsw m3, m7 | |
18725 | pmaddubsw m5, m4, m6 | |
18726 | pmulhrsw m5, m7 | |
18727 | packuswb m3, m5 | |
18728 | movu [r0 + 159 * 16], m3 | |
18729 | ||
18730 | ; mode 5 [row 18] | |
18731 | movu m6, [r5 + 3 * 16] | |
18732 | pmaddubsw m3, m0, m6 | |
18733 | pmulhrsw m3, m7 | |
18734 | pmaddubsw m5, m2, m6 | |
18735 | pmulhrsw m5, m7 | |
18736 | packuswb m3, m5 | |
18737 | movu [r0 + 228 * 16], m3 | |
18738 | pmaddubsw m3, m1, m6 | |
18739 | pmulhrsw m3, m7 | |
18740 | pmaddubsw m5, m4, m6 | |
18741 | pmulhrsw m5, m7 | |
18742 | packuswb m3, m5 | |
18743 | movu [r0 + 229 * 16], m3 | |
18744 | ||
18745 | ; mode 5 [row 19] | |
18746 | movu m6, [r5 + 20 * 16] | |
18747 | pmaddubsw m3, m0, m6 | |
18748 | pmulhrsw m3, m7 | |
18749 | pmaddubsw m5, m2, m6 | |
18750 | pmulhrsw m5, m7 | |
18751 | packuswb m3, m5 | |
18752 | movu [r0 + 230 * 16], m3 | |
18753 | pmaddubsw m3, m1, m6 | |
18754 | pmulhrsw m3, m7 | |
18755 | pmaddubsw m5, m4, m6 | |
18756 | pmulhrsw m5, m7 | |
18757 | packuswb m3, m5 | |
18758 | movu [r0 + 231 * 16], m3 | |
18759 | ||
18760 | ; mode 6 [row 24] | |
18761 | movu m6, [r5 + 5 * 16] | |
18762 | pmaddubsw m3, m0, m6 | |
18763 | pmulhrsw m3, m7 | |
18764 | pmaddubsw m5, m2, m6 | |
18765 | pmulhrsw m5, m7 | |
18766 | packuswb m3, m5 | |
18767 | movu [r0 + 304 * 16], m3 | |
18768 | pmaddubsw m3, m1, m6 | |
18769 | pmulhrsw m3, m7 | |
18770 | pmaddubsw m5, m4, m6 | |
18771 | pmulhrsw m5, m7 | |
18772 | packuswb m3, m5 | |
18773 | movu [r0 + 305 * 16], m3 | |
18774 | ||
18775 | ; mode 6 [row 26] | |
18776 | movu m6, [r5 + 31 * 16] | |
18777 | pmaddubsw m3, m0, m6 | |
18778 | pmulhrsw m3, m7 | |
18779 | pmaddubsw m5, m2, m6 | |
18780 | pmulhrsw m5, m7 | |
18781 | packuswb m3, m5 | |
18782 | movu [r0 + 308 * 16], m3 | |
18783 | pmaddubsw m3, m1, m6 | |
18784 | pmulhrsw m3, m7 | |
18785 | pmaddubsw m5, m4, m6 | |
18786 | pmulhrsw m5, m7 | |
18787 | packuswb m3, m5 | |
18788 | movu [r0 + 309 * 16], m3 | |
18789 | ||
18790 | ; mode 3 [row 13] | |
18791 | movu m6, [r5 + 12 * 16] | |
18792 | movu m0, [r4 + 12] | |
18793 | movd m1, [r4 + 13] | |
18794 | palignr m1, m0, 1 | |
18795 | punpcklbw m0, m1 | |
18796 | pmaddubsw m1, m0, m6 | |
18797 | pmulhrsw m1, m7 | |
18798 | movu m2, [r4 + 20] | |
18799 | movd m3, [r4 + 21] | |
18800 | palignr m3, m2, 1 | |
18801 | punpcklbw m2, m3 | |
18802 | pmaddubsw m3, m2, m6 | |
18803 | pmulhrsw m3, m7 | |
18804 | packuswb m1, m3 | |
18805 | movu [r0 + 90 * 16], m1 | |
18806 | ||
18807 | movu m1, [r4 + 28] | |
18808 | movd m3, [r4 + 29] | |
18809 | palignr m3, m1, 1 | |
18810 | punpcklbw m1, m3 | |
18811 | pmaddubsw m3, m1, m6 | |
18812 | pmulhrsw m3, m7 | |
18813 | movu m4, [r4 + 36] | |
18814 | movd m5, [r4 + 37] | |
18815 | palignr m5, m4, 1 | |
18816 | punpcklbw m4, m5 | |
18817 | pmaddubsw m5, m4, m6 | |
18818 | pmulhrsw m5, m7 | |
18819 | packuswb m3, m5 | |
18820 | movu [r0 + 91 * 16], m3 | |
18821 | ||
18822 | ; mode 4 [row 16] | |
18823 | movu m6, [r5 + 5 * 16] | |
18824 | pmaddubsw m3, m0, m6 | |
18825 | pmulhrsw m3, m7 | |
18826 | pmaddubsw m5, m2, m6 | |
18827 | pmulhrsw m5, m7 | |
18828 | packuswb m3, m5 | |
18829 | movu [r0 + 160 * 16], m3 | |
18830 | ||
18831 | ; mode 5 [row 20 - first half] | |
18832 | movu [r0 + 232 * 16], m3 | |
18833 | ||
18834 | pmaddubsw m3, m1, m6 | |
18835 | pmulhrsw m3, m7 | |
18836 | pmaddubsw m5, m4, m6 | |
18837 | pmulhrsw m5, m7 | |
18838 | packuswb m3, m5 | |
18839 | movu [r0 + 161 * 16], m3 | |
18840 | ||
18841 | ; mode 5 [row 20 - second half] | |
18842 | movu [r0 + 233 * 16], m3 | |
18843 | ||
18844 | ; mode 4 [row 17] | |
18845 | movu m6, [r5 + 26 * 16] | |
18846 | pmaddubsw m3, m0, m6 | |
18847 | pmulhrsw m3, m7 | |
18848 | pmaddubsw m5, m2, m6 | |
18849 | pmulhrsw m5, m7 | |
18850 | packuswb m3, m5 | |
18851 | movu [r0 + 162 * 16], m3 | |
18852 | pmaddubsw m3, m1, m6 | |
18853 | pmulhrsw m3, m7 | |
18854 | pmaddubsw m5, m4, m6 | |
18855 | pmulhrsw m5, m7 | |
18856 | packuswb m3, m5 | |
18857 | movu [r0 + 163 * 16], m3 | |
18858 | ||
18859 | ; mode 5 [row 21] | |
18860 | movu m6, [r5 + 22 * 16] | |
18861 | pmaddubsw m3, m0, m6 | |
18862 | pmulhrsw m3, m7 | |
18863 | pmaddubsw m5, m2, m6 | |
18864 | pmulhrsw m5, m7 | |
18865 | packuswb m3, m5 | |
18866 | movu [r0 + 234 * 16], m3 | |
18867 | pmaddubsw m3, m1, m6 | |
18868 | pmulhrsw m3, m7 | |
18869 | pmaddubsw m5, m4, m6 | |
18870 | pmulhrsw m5, m7 | |
18871 | packuswb m3, m5 | |
18872 | movu [r0 + 235 * 16], m3 | |
18873 | ||
18874 | ; mode 6 [row 27] | |
18875 | movu m6, [r5 + 12 * 16] | |
18876 | pmaddubsw m3, m0, m6 | |
18877 | pmulhrsw m3, m7 | |
18878 | pmaddubsw m5, m2, m6 | |
18879 | pmulhrsw m5, m7 | |
18880 | packuswb m3, m5 | |
18881 | movu [r0 + 310 * 16], m3 | |
18882 | pmaddubsw m3, m1, m6 | |
18883 | pmulhrsw m3, m7 | |
18884 | pmaddubsw m5, m4, m6 | |
18885 | pmulhrsw m5, m7 | |
18886 | packuswb m3, m5 | |
18887 | movu [r0 + 311 * 16], m3 | |
18888 | ||
18889 | ; mode 6 [row 28] | |
18890 | movu m6, [r5 + 25 * 16] | |
18891 | pmaddubsw m3, m0, m6 | |
18892 | pmulhrsw m3, m7 | |
18893 | pmaddubsw m5, m2, m6 | |
18894 | pmulhrsw m5, m7 | |
18895 | packuswb m3, m5 | |
18896 | movu [r0 + 312 * 16], m3 | |
18897 | pmaddubsw m3, m1, m6 | |
18898 | pmulhrsw m3, m7 | |
18899 | pmaddubsw m5, m4, m6 | |
18900 | pmulhrsw m5, m7 | |
18901 | packuswb m3, m5 | |
18902 | movu [r0 + 313 * 16], m3 | |
18903 | ||
18904 | ; mode 3 [row 14] | |
18905 | movu m6, [r5 + 6 * 16] | |
18906 | movu m0, [r4 + 13] | |
18907 | movd m1, [r4 + 14] | |
18908 | palignr m1, m0, 1 | |
18909 | punpcklbw m0, m1 | |
18910 | pmaddubsw m1, m0, m6 | |
18911 | pmulhrsw m1, m7 | |
18912 | movu m2, [r4 + 21] | |
18913 | movd m3, [r4 + 22] | |
18914 | palignr m3, m2, 1 | |
18915 | punpcklbw m2, m3 | |
18916 | pmaddubsw m3, m2, m6 | |
18917 | pmulhrsw m3, m7 | |
18918 | packuswb m1, m3 | |
18919 | movu [r0 + 92 * 16], m1 | |
18920 | ||
18921 | ; mode 6 [row 29 - first half] | |
18922 | movu [r0 + 314 * 16], m1 | |
18923 | ||
18924 | movu m1, [r4 + 29] | |
18925 | movd m3, [r4 + 30] | |
18926 | palignr m3, m1, 1 | |
18927 | punpcklbw m1, m3 | |
18928 | pmaddubsw m3, m1, m6 | |
18929 | pmulhrsw m3, m7 | |
18930 | movu m4, [r4 + 37] | |
18931 | movd m5, [r4 + 38] | |
18932 | palignr m5, m4, 1 | |
18933 | punpcklbw m4, m5 | |
18934 | pmaddubsw m5, m4, m6 | |
18935 | pmulhrsw m5, m7 | |
18936 | packuswb m3, m5 | |
18937 | movu [r0 + 93 * 16], m3 | |
18938 | ||
18939 | ; mode 6 [row 29 - second half] | |
18940 | movu [r0 + 315 * 16], m3 | |
18941 | ||
18942 | ; mode 4 [row 18] | |
18943 | movu m6, [r5 + 15 * 16] | |
18944 | pmaddubsw m3, m0, m6 | |
18945 | pmulhrsw m3, m7 | |
18946 | pmaddubsw m5, m2, m6 | |
18947 | pmulhrsw m5, m7 | |
18948 | packuswb m3, m5 | |
18949 | movu [r0 + 164 * 16], m3 | |
18950 | pmaddubsw m3, m1, m6 | |
18951 | pmulhrsw m3, m7 | |
18952 | pmaddubsw m5, m4, m6 | |
18953 | pmulhrsw m5, m7 | |
18954 | packuswb m3, m5 | |
18955 | movu [r0 + 165 * 16], m3 | |
18956 | ||
18957 | ; mode 5 [row 22] | |
18958 | movu m6, [r5 + 7 * 16] | |
18959 | pmaddubsw m3, m0, m6 | |
18960 | pmulhrsw m3, m7 | |
18961 | pmaddubsw m5, m2, m6 | |
18962 | pmulhrsw m5, m7 | |
18963 | packuswb m3, m5 | |
18964 | movu [r0 + 236 * 16], m3 | |
18965 | pmaddubsw m3, m1, m6 | |
18966 | pmulhrsw m3, m7 | |
18967 | pmaddubsw m5, m4, m6 | |
18968 | pmulhrsw m5, m7 | |
18969 | packuswb m3, m5 | |
18970 | movu [r0 + 237 * 16], m3 | |
18971 | ||
18972 | ; mode 5 [row 23] | |
18973 | movu m6, [r5 + 24 * 16] | |
18974 | pmaddubsw m3, m0, m6 | |
18975 | pmulhrsw m3, m7 | |
18976 | pmaddubsw m5, m2, m6 | |
18977 | pmulhrsw m5, m7 | |
18978 | packuswb m3, m5 | |
18979 | movu [r0 + 238 * 16], m3 | |
18980 | pmaddubsw m3, m1, m6 | |
18981 | pmulhrsw m3, m7 | |
18982 | pmaddubsw m5, m4, m6 | |
18983 | pmulhrsw m5, m7 | |
18984 | packuswb m3, m5 | |
18985 | movu [r0 + 239 * 16], m3 | |
18986 | ||
18987 | ; mode 6 [row 30] | |
18988 | movu m6, [r5 + 19 * 16] | |
18989 | pmaddubsw m3, m0, m6 | |
18990 | pmulhrsw m3, m7 | |
18991 | pmaddubsw m5, m2, m6 | |
18992 | pmulhrsw m5, m7 | |
18993 | packuswb m3, m5 | |
18994 | movu [r0 + 316 * 16], m3 | |
18995 | pmaddubsw m3, m1, m6 | |
18996 | pmulhrsw m3, m7 | |
18997 | pmaddubsw m5, m4, m6 | |
18998 | pmulhrsw m5, m7 | |
18999 | packuswb m3, m5 | |
19000 | movu [r0 + 317 * 16], m3 | |
19001 | ||
19002 | ; mode 3 [row 16] | |
19003 | movu m6, [r5 + 26 * 16] | |
19004 | movu m0, [r4 + 14] | |
19005 | movd m1, [r4 + 15] | |
19006 | palignr m1, m0, 1 | |
19007 | punpcklbw m0, m1 | |
19008 | pmaddubsw m1, m0, m6 | |
19009 | pmulhrsw m1, m7 | |
19010 | movu m2, [r4 + 22] | |
19011 | movd m3, [r4 + 23] | |
19012 | palignr m3, m2, 1 | |
19013 | punpcklbw m2, m3 | |
19014 | pmaddubsw m3, m2, m6 | |
19015 | pmulhrsw m3, m7 | |
19016 | packuswb m1, m3 | |
19017 | movu [r0 + 96 * 16], m1 | |
19018 | ||
19019 | ; mode 5 [row 25 - first half] | |
19020 | movu [r0 + 242 * 16], m1 | |
19021 | ||
19022 | movu m1, [r4 + 30] | |
19023 | movd m3, [r4 + 31] | |
19024 | palignr m3, m1, 1 | |
19025 | punpcklbw m1, m3 | |
19026 | pmaddubsw m3, m1, m6 | |
19027 | pmulhrsw m3, m7 | |
19028 | movu m4, [r4 + 38] | |
19029 | movd m5, [r4 + 39] | |
19030 | palignr m5, m4, 1 | |
19031 | punpcklbw m4, m5 | |
19032 | pmaddubsw m5, m4, m6 | |
19033 | pmulhrsw m5, m7 | |
19034 | packuswb m3, m5 | |
19035 | movu [r0 + 97 * 16], m3 | |
19036 | ||
19037 | ; mode 5 [row 25 - second half] | |
19038 | movu [r0 + 243 * 16], m3 | |
19039 | ||
19040 | ; mode 4 [row 19] | |
19041 | movu m6, [r5 + 4 * 16] | |
19042 | pmaddubsw m3, m0, m6 | |
19043 | pmulhrsw m3, m7 | |
19044 | pmaddubsw m5, m2, m6 | |
19045 | pmulhrsw m5, m7 | |
19046 | packuswb m3, m5 | |
19047 | movu [r0 + 166 * 16], m3 | |
19048 | pmaddubsw m3, m1, m6 | |
19049 | pmulhrsw m3, m7 | |
19050 | pmaddubsw m5, m4, m6 | |
19051 | pmulhrsw m5, m7 | |
19052 | packuswb m3, m5 | |
19053 | movu [r0 + 167 * 16], m3 | |
19054 | ||
19055 | ; mode 4 [row 20] | |
19056 | movu m6, [r5 + 25 * 16] | |
19057 | pmaddubsw m3, m0, m6 | |
19058 | pmulhrsw m3, m7 | |
19059 | pmaddubsw m5, m2, m6 | |
19060 | pmulhrsw m5, m7 | |
19061 | packuswb m3, m5 | |
19062 | movu [r0 + 168 * 16], m3 | |
19063 | pmaddubsw m3, m1, m6 | |
19064 | pmulhrsw m3, m7 | |
19065 | pmaddubsw m5, m4, m6 | |
19066 | pmulhrsw m5, m7 | |
19067 | packuswb m3, m5 | |
19068 | movu [r0 + 169 * 16], m3 | |
19069 | ||
19070 | ; mode 5 [row 24] | |
19071 | movu m6, [r5 + 9 * 16] | |
19072 | pmaddubsw m3, m0, m6 | |
19073 | pmulhrsw m3, m7 | |
19074 | pmaddubsw m5, m2, m6 | |
19075 | pmulhrsw m5, m7 | |
19076 | packuswb m3, m5 | |
19077 | movu [r0 + 240 * 16], m3 | |
19078 | pmaddubsw m3, m1, m6 | |
19079 | pmulhrsw m3, m7 | |
19080 | pmaddubsw m5, m4, m6 | |
19081 | pmulhrsw m5, m7 | |
19082 | packuswb m3, m5 | |
19083 | movu [r0 + 241 * 16], m3 | |
19084 | ||
19085 | ; mode 3 [row 17] | |
19086 | movu m6, [r5 + 20 * 16] | |
19087 | movu m0, [r4 + 15] | |
19088 | movd m1, [r4 + 16] | |
19089 | palignr m1, m0, 1 | |
19090 | punpcklbw m0, m1 | |
19091 | pmaddubsw m1, m0, m6 | |
19092 | pmulhrsw m1, m7 | |
19093 | movu m2, [r4 + 23] | |
19094 | movd m3, [r4 + 24] | |
19095 | palignr m3, m2, 1 | |
19096 | punpcklbw m2, m3 | |
19097 | pmaddubsw m3, m2, m6 | |
19098 | pmulhrsw m3, m7 | |
19099 | packuswb m1, m3 | |
19100 | movu [r0 + 98 * 16], m1 | |
19101 | ||
19102 | movu m1, [r4 + 31] | |
19103 | movd m3, [r4 + 32] | |
19104 | palignr m3, m1, 1 | |
19105 | punpcklbw m1, m3 | |
19106 | pmaddubsw m3, m1, m6 | |
19107 | pmulhrsw m3, m7 | |
19108 | movu m4, [r4 + 39] | |
19109 | movd m5, [r4 + 40] | |
19110 | palignr m5, m4, 1 | |
19111 | punpcklbw m4, m5 | |
19112 | pmaddubsw m5, m4, m6 | |
19113 | pmulhrsw m5, m7 | |
19114 | packuswb m3, m5 | |
19115 | movu [r0 + 99 * 16], m3 | |
19116 | ||
19117 | ; mode 4 [row 21] | |
19118 | movu m6, [r5 + 14 * 16] | |
19119 | pmaddubsw m3, m0, m6 | |
19120 | pmulhrsw m3, m7 | |
19121 | pmaddubsw m5, m2, m6 | |
19122 | pmulhrsw m5, m7 | |
19123 | packuswb m3, m5 | |
19124 | movu [r0 + 170 * 16], m3 | |
19125 | pmaddubsw m3, m1, m6 | |
19126 | pmulhrsw m3, m7 | |
19127 | pmaddubsw m5, m4, m6 | |
19128 | pmulhrsw m5, m7 | |
19129 | packuswb m3, m5 | |
19130 | movu [r0 + 171 * 16], m3 | |
19131 | ||
19132 | ; mode 5 [row 26] | |
19133 | movu m6, [r5 + 11 * 16] | |
19134 | pmaddubsw m3, m0, m6 | |
19135 | pmulhrsw m3, m7 | |
19136 | pmaddubsw m5, m2, m6 | |
19137 | pmulhrsw m5, m7 | |
19138 | packuswb m3, m5 | |
19139 | movu [r0 + 244 * 16], m3 | |
19140 | pmaddubsw m3, m1, m6 | |
19141 | pmulhrsw m3, m7 | |
19142 | pmaddubsw m5, m4, m6 | |
19143 | pmulhrsw m5, m7 | |
19144 | packuswb m3, m5 | |
19145 | movu [r0 + 245 * 16], m3 | |
19146 | ||
19147 | ; mode 5 [row 27] | |
19148 | movu m6, [r5 + 28 * 16] | |
19149 | pmaddubsw m3, m0, m6 | |
19150 | pmulhrsw m3, m7 | |
19151 | pmaddubsw m5, m2, m6 | |
19152 | pmulhrsw m5, m7 | |
19153 | packuswb m3, m5 | |
19154 | movu [r0 + 246 * 16], m3 | |
19155 | pmaddubsw m3, m1, m6 | |
19156 | pmulhrsw m3, m7 | |
19157 | pmaddubsw m5, m4, m6 | |
19158 | pmulhrsw m5, m7 | |
19159 | packuswb m3, m5 | |
19160 | movu [r0 + 247 * 16], m3 | |
19161 | ||
19162 | ; mode 3 [row 18] | |
19163 | movu m6, [r5 + 14 * 16] | |
19164 | movu m0, [r4 + 16] | |
19165 | movd m1, [r4 + 17] | |
19166 | palignr m1, m0, 1 | |
19167 | punpcklbw m0, m1 | |
19168 | pmaddubsw m1, m0, m6 | |
19169 | pmulhrsw m1, m7 | |
19170 | movu m2, [r4 + 24] | |
19171 | movd m3, [r4 + 25] | |
19172 | palignr m3, m2, 1 | |
19173 | punpcklbw m2, m3 | |
19174 | pmaddubsw m3, m2, m6 | |
19175 | pmulhrsw m3, m7 | |
19176 | packuswb m1, m3 | |
19177 | movu [r0 + 100 * 16], m1 | |
19178 | ||
19179 | movu m1, [r4 + 32] | |
19180 | movd m3, [r4 + 33] | |
19181 | palignr m3, m1, 1 | |
19182 | punpcklbw m1, m3 | |
19183 | pmaddubsw m3, m1, m6 | |
19184 | pmulhrsw m3, m7 | |
19185 | movu m4, [r4 + 40] | |
19186 | movd m5, [r4 + 41] | |
19187 | palignr m5, m4, 1 | |
19188 | punpcklbw m4, m5 | |
19189 | pmaddubsw m5, m4, m6 | |
19190 | pmulhrsw m5, m7 | |
19191 | packuswb m3, m5 | |
19192 | movu [r0 + 101 * 16], m3 | |
19193 | ||
19194 | ; mode 4 [row 22] | |
19195 | movu m6, [r5 + 3 * 16] | |
19196 | pmaddubsw m3, m0, m6 | |
19197 | pmulhrsw m3, m7 | |
19198 | pmaddubsw m5, m2, m6 | |
19199 | pmulhrsw m5, m7 | |
19200 | packuswb m3, m5 | |
19201 | movu [r0 + 172 * 16], m3 | |
19202 | pmaddubsw m3, m1, m6 | |
19203 | pmulhrsw m3, m7 | |
19204 | pmaddubsw m5, m4, m6 | |
19205 | pmulhrsw m5, m7 | |
19206 | packuswb m3, m5 | |
19207 | movu [r0 + 173 * 16], m3 | |
19208 | ||
19209 | ; mode 4 [row 23] | |
19210 | movu m6, [r5 + 24 * 16] | |
19211 | pmaddubsw m3, m0, m6 | |
19212 | pmulhrsw m3, m7 | |
19213 | pmaddubsw m5, m2, m6 | |
19214 | pmulhrsw m5, m7 | |
19215 | packuswb m3, m5 | |
19216 | movu [r0 + 174 * 16], m3 | |
19217 | pmaddubsw m3, m1, m6 | |
19218 | pmulhrsw m3, m7 | |
19219 | pmaddubsw m5, m4, m6 | |
19220 | pmulhrsw m5, m7 | |
19221 | packuswb m3, m5 | |
19222 | movu [r0 + 175 * 16], m3 | |
19223 | ||
19224 | ; mode 5 [row 28] | |
19225 | movu m6, [r5 + 13 * 16] | |
19226 | pmaddubsw m3, m0, m6 | |
19227 | pmulhrsw m3, m7 | |
19228 | pmaddubsw m5, m2, m6 | |
19229 | pmulhrsw m5, m7 | |
19230 | packuswb m3, m5 | |
19231 | movu [r0 + 248 * 16], m3 | |
19232 | pmaddubsw m3, m1, m6 | |
19233 | pmulhrsw m3, m7 | |
19234 | pmaddubsw m5, m4, m6 | |
19235 | pmulhrsw m5, m7 | |
19236 | packuswb m3, m5 | |
19237 | movu [r0 + 249 * 16], m3 | |
19238 | ||
19239 | ; mode 5 [row 29] | |
19240 | movu m6, [r5 + 30 * 16] | |
19241 | pmaddubsw m3, m0, m6 | |
19242 | pmulhrsw m3, m7 | |
19243 | pmaddubsw m5, m2, m6 | |
19244 | pmulhrsw m5, m7 | |
19245 | packuswb m3, m5 | |
19246 | movu [r0 + 250 * 16], m3 | |
19247 | pmaddubsw m3, m1, m6 | |
19248 | pmulhrsw m3, m7 | |
19249 | pmaddubsw m5, m4, m6 | |
19250 | pmulhrsw m5, m7 | |
19251 | packuswb m3, m5 | |
19252 | movu [r0 + 251 * 16], m3 | |
19253 | ||
19254 | ; mode 3 [row 19] | |
19255 | movu m6, [r5 + 8 * 16] | |
19256 | movu m0, [r4 + 17] | |
19257 | movd m1, [r4 + 18] | |
19258 | palignr m1, m0, 1 | |
19259 | punpcklbw m0, m1 | |
19260 | pmaddubsw m1, m0, m6 | |
19261 | pmulhrsw m1, m7 | |
19262 | movu m2, [r4 + 25] | |
19263 | movd m3, [r4 + 26] | |
19264 | palignr m3, m2, 1 | |
19265 | punpcklbw m2, m3 | |
19266 | pmaddubsw m3, m2, m6 | |
19267 | pmulhrsw m3, m7 | |
19268 | packuswb m1, m3 | |
19269 | movu [r0 + 102 * 16], m1 | |
19270 | ||
19271 | movu m1, [r4 + 33] | |
19272 | movd m3, [r4 + 34] | |
19273 | palignr m3, m1, 1 | |
19274 | punpcklbw m1, m3 | |
19275 | pmaddubsw m3, m1, m6 | |
19276 | pmulhrsw m3, m7 | |
19277 | movu m4, [r4 + 41] | |
19278 | movd m5, [r4 + 42] | |
19279 | palignr m5, m4, 1 | |
19280 | punpcklbw m4, m5 | |
19281 | pmaddubsw m5, m4, m6 | |
19282 | pmulhrsw m5, m7 | |
19283 | packuswb m3, m5 | |
19284 | movu [r0 + 103 * 16], m3 | |
19285 | ||
19286 | ; mode 4 [row 24] | |
19287 | movu m6, [r5 + 13 * 16] | |
19288 | pmaddubsw m3, m0, m6 | |
19289 | pmulhrsw m3, m7 | |
19290 | pmaddubsw m5, m2, m6 | |
19291 | pmulhrsw m5, m7 | |
19292 | packuswb m3, m5 | |
19293 | movu [r0 + 176 * 16], m3 | |
19294 | pmaddubsw m3, m1, m6 | |
19295 | pmulhrsw m3, m7 | |
19296 | pmaddubsw m5, m4, m6 | |
19297 | pmulhrsw m5, m7 | |
19298 | packuswb m3, m5 | |
19299 | movu [r0 + 177 * 16], m3 | |
19300 | ||
19301 | ; mode 5 [row 30] | |
19302 | movu m6, [r5 + 15 * 16] | |
19303 | pmaddubsw m3, m0, m6 | |
19304 | pmulhrsw m3, m7 | |
19305 | pmaddubsw m5, m2, m6 | |
19306 | pmulhrsw m5, m7 | |
19307 | packuswb m3, m5 | |
19308 | movu [r0 + 252 * 16], m3 | |
19309 | pmaddubsw m3, m1, m6 | |
19310 | pmulhrsw m3, m7 | |
19311 | pmaddubsw m5, m4, m6 | |
19312 | pmulhrsw m5, m7 | |
19313 | packuswb m3, m5 | |
19314 | movu [r0 + 253 * 16], m3 | |
19315 | ||
19316 | ; mode 3 [row 20] | |
19317 | movu m6, [r5 + 2 * 16] | |
19318 | movu m0, [r4 + 18] | |
19319 | movd m1, [r4 + 19] | |
19320 | palignr m1, m0, 1 | |
19321 | punpcklbw m0, m1 | |
19322 | pmaddubsw m1, m0, m6 | |
19323 | pmulhrsw m1, m7 | |
19324 | movu m2, [r4 + 26] | |
19325 | movd m3, [r4 + 27] | |
19326 | palignr m3, m2, 1 | |
19327 | punpcklbw m2, m3 | |
19328 | pmaddubsw m3, m2, m6 | |
19329 | pmulhrsw m3, m7 | |
19330 | packuswb m1, m3 | |
19331 | movu [r0 + 104 * 16], m1 | |
19332 | ||
19333 | movu m1, [r4 + 34] | |
19334 | movd m3, [r4 + 35] | |
19335 | palignr m3, m1, 1 | |
19336 | punpcklbw m1, m3 | |
19337 | pmaddubsw m3, m1, m6 | |
19338 | pmulhrsw m3, m7 | |
19339 | movu m4, [r4 + 42] | |
19340 | movd m5, [r4 + 43] | |
19341 | palignr m5, m4, 1 | |
19342 | punpcklbw m4, m5 | |
19343 | pmaddubsw m5, m4, m6 | |
19344 | pmulhrsw m5, m7 | |
19345 | packuswb m3, m5 | |
19346 | movu [r0 + 105 * 16], m3 | |
19347 | ||
19348 | ; mode 4 [row 25] - no coefficient load here: reuses m6 = [r5 + 2 * 16] loaded above for mode 3 [row 20] | |
19349 | pmaddubsw m3, m0, m6 | |
19350 | pmulhrsw m3, m7 | |
19351 | pmaddubsw m5, m2, m6 | |
19352 | pmulhrsw m5, m7 | |
19353 | packuswb m3, m5 | |
19354 | movu [r0 + 178 * 16], m3 | |
19355 | pmaddubsw m3, m1, m6 | |
19356 | pmulhrsw m3, m7 | |
19357 | pmaddubsw m5, m4, m6 | |
19358 | pmulhrsw m5, m7 | |
19359 | packuswb m3, m5 | |
19360 | movu [r0 + 179 * 16], m3 | |
19361 | ||
19362 | ; mode 4 [row 26] | |
19363 | movu m6, [r5 + 23 * 16] | |
19364 | pmaddubsw m3, m0, m6 | |
19365 | pmulhrsw m3, m7 | |
19366 | pmaddubsw m5, m2, m6 | |
19367 | pmulhrsw m5, m7 | |
19368 | packuswb m3, m5 | |
19369 | movu [r0 + 180 * 16], m3 | |
19370 | pmaddubsw m3, m1, m6 | |
19371 | pmulhrsw m3, m7 | |
19372 | pmaddubsw m5, m4, m6 | |
19373 | pmulhrsw m5, m7 | |
19374 | packuswb m3, m5 | |
19375 | movu [r0 + 181 * 16], m3 | |
19376 | ||
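19377 | ; mode 3 [row 21] - reuses the pixel pairs still held in m0/m2/m1/m4 from mode 3 [row 20]; only the coefficients in m6 change | |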
19378 | movu m6, [r5 + 28 * 16] | |
19379 | pmaddubsw m3, m0, m6 | |
19380 | pmulhrsw m3, m7 | |
19381 | pmaddubsw m5, m2, m6 | |
19382 | pmulhrsw m5, m7 | |
19383 | packuswb m3, m5 | |
19384 | movu [r0 + 106 * 16], m3 | |
19385 | pmaddubsw m3, m1, m6 | |
19386 | pmulhrsw m3, m7 | |
19387 | pmaddubsw m5, m4, m6 | |
19388 | pmulhrsw m5, m7 | |
19389 | packuswb m3, m5 | |
19390 | movu [r0 + 107 * 16], m3 | |
19391 | ||
19392 | ; mode 3 [row 22] | |
19393 | movu m6, [r5 + 22 * 16] | |
19394 | movu m0, [r4 + 19] | |
19395 | movd m1, [r4 + 20] | |
19396 | palignr m1, m0, 1 | |
19397 | punpcklbw m0, m1 | |
19398 | pmaddubsw m1, m0, m6 | |
19399 | pmulhrsw m1, m7 | |
19400 | movu m2, [r4 + 27] | |
19401 | movd m3, [r4 + 28] | |
19402 | palignr m3, m2, 1 | |
19403 | punpcklbw m2, m3 | |
19404 | pmaddubsw m3, m2, m6 | |
19405 | pmulhrsw m3, m7 | |
19406 | packuswb m1, m3 | |
19407 | movu [r0 + 108 * 16], m1 | |
19408 | ||
19409 | movu m1, [r4 + 35] | |
19410 | movd m3, [r4 + 36] | |
19411 | palignr m3, m1, 1 | |
19412 | punpcklbw m1, m3 | |
19413 | pmaddubsw m3, m1, m6 | |
19414 | pmulhrsw m3, m7 | |
19415 | movu m4, [r4 + 43] | |
19416 | movd m5, [r4 + 44] | |
19417 | palignr m5, m4, 1 | |
19418 | punpcklbw m4, m5 | |
19419 | pmaddubsw m5, m4, m6 | |
19420 | pmulhrsw m5, m7 | |
19421 | packuswb m3, m5 | |
19422 | movu [r0 + 109 * 16], m3 | |
19423 | ||
19424 | ; mode 4 [row 27] | |
19425 | movu m6, [r5 + 12 * 16] | |
19426 | pmaddubsw m3, m0, m6 | |
19427 | pmulhrsw m3, m7 | |
19428 | pmaddubsw m5, m2, m6 | |
19429 | pmulhrsw m5, m7 | |
19430 | packuswb m3, m5 | |
19431 | movu [r0 + 182 * 16], m3 | |
19432 | pmaddubsw m3, m1, m6 | |
19433 | pmulhrsw m3, m7 | |
19434 | pmaddubsw m5, m4, m6 | |
19435 | pmulhrsw m5, m7 | |
19436 | packuswb m3, m5 | |
19437 | movu [r0 + 183 * 16], m3 | |
19438 | ||
19439 | ; mode 3 [row 23] | |
19440 | movu m6, [r5 + 16 * 16] | |
19441 | movu m0, [r4 + 20] | |
19442 | movd m1, [r4 + 21] | |
19443 | palignr m1, m0, 1 | |
19444 | punpcklbw m0, m1 | |
19445 | pmaddubsw m1, m0, m6 | |
19446 | pmulhrsw m1, m7 | |
19447 | movu m2, [r4 + 28] | |
19448 | movd m3, [r4 + 29] | |
19449 | palignr m3, m2, 1 | |
19450 | punpcklbw m2, m3 | |
19451 | pmaddubsw m3, m2, m6 | |
19452 | pmulhrsw m3, m7 | |
19453 | packuswb m1, m3 | |
19454 | movu [r0 + 110 * 16], m1 | |
19455 | ||
19456 | movu m1, [r4 + 36] | |
19457 | movd m3, [r4 + 37] | |
19458 | palignr m3, m1, 1 | |
19459 | punpcklbw m1, m3 | |
19460 | pmaddubsw m3, m1, m6 | |
19461 | pmulhrsw m3, m7 | |
19462 | movu m4, [r4 + 44] | |
19463 | movd m5, [r4 + 45] | |
19464 | palignr m5, m4, 1 | |
19465 | punpcklbw m4, m5 | |
19466 | pmaddubsw m5, m4, m6 | |
19467 | pmulhrsw m5, m7 | |
19468 | packuswb m3, m5 | |
19469 | movu [r0 + 111 * 16], m3 | |
19470 | ||
19471 | ; mode 4 [row 28] | |
19472 | movu m6, [r5 + 1 * 16] | |
19473 | pmaddubsw m3, m0, m6 | |
19474 | pmulhrsw m3, m7 | |
19475 | pmaddubsw m5, m2, m6 | |
19476 | pmulhrsw m5, m7 | |
19477 | packuswb m3, m5 | |
19478 | movu [r0 + 184 * 16], m3 | |
19479 | pmaddubsw m3, m1, m6 | |
19480 | pmulhrsw m3, m7 | |
19481 | pmaddubsw m5, m4, m6 | |
19482 | pmulhrsw m5, m7 | |
19483 | packuswb m3, m5 | |
19484 | movu [r0 + 185 * 16], m3 | |
19485 | ||
19486 | ; mode 4 [row 29] | |
19487 | movu m6, [r5 + 22 * 16] | |
19488 | pmaddubsw m3, m0, m6 | |
19489 | pmulhrsw m3, m7 | |
19490 | pmaddubsw m5, m2, m6 | |
19491 | pmulhrsw m5, m7 | |
19492 | packuswb m3, m5 | |
19493 | movu [r0 + 186 * 16], m3 | |
19494 | pmaddubsw m3, m1, m6 | |
19495 | pmulhrsw m3, m7 | |
19496 | pmaddubsw m5, m4, m6 | |
19497 | pmulhrsw m5, m7 | |
19498 | packuswb m3, m5 | |
19499 | movu [r0 + 187 * 16], m3 | |
19500 | ||
19501 | ; mode 3 [row 24] | |
19502 | movu m6, [r5 + 10 * 16] | |
19503 | movu m0, [r4 + 21] | |
19504 | movd m1, [r4 + 22] | |
19505 | palignr m1, m0, 1 | |
19506 | punpcklbw m0, m1 | |
19507 | pmaddubsw m1, m0, m6 | |
19508 | pmulhrsw m1, m7 | |
19509 | movu m2, [r4 + 29] | |
19510 | movd m3, [r4 + 30] | |
19511 | palignr m3, m2, 1 | |
19512 | punpcklbw m2, m3 | |
19513 | pmaddubsw m3, m2, m6 | |
19514 | pmulhrsw m3, m7 | |
19515 | packuswb m1, m3 | |
19516 | movu [r0 + 112 * 16], m1 | |
19517 | ||
19518 | movu m1, [r4 + 37] | |
19519 | movd m3, [r4 + 38] | |
19520 | palignr m3, m1, 1 | |
19521 | punpcklbw m1, m3 | |
19522 | pmaddubsw m3, m1, m6 | |
19523 | pmulhrsw m3, m7 | |
19524 | movu m4, [r4 + 45] | |
19525 | movd m5, [r4 + 46] | |
19526 | palignr m5, m4, 1 | |
19527 | punpcklbw m4, m5 | |
19528 | pmaddubsw m5, m4, m6 | |
19529 | pmulhrsw m5, m7 | |
19530 | packuswb m3, m5 | |
19531 | movu [r0 + 113 * 16], m3 | |
19532 | ||
19533 | ; mode 4 [row 30] | |
19534 | movu m6, [r5 + 11 * 16] | |
19535 | pmaddubsw m3, m0, m6 | |
19536 | pmulhrsw m3, m7 | |
19537 | pmaddubsw m5, m2, m6 | |
19538 | pmulhrsw m5, m7 | |
19539 | packuswb m3, m5 | |
19540 | movu [r0 + 188 * 16], m3 | |
19541 | pmaddubsw m3, m1, m6 | |
19542 | pmulhrsw m3, m7 | |
19543 | pmaddubsw m5, m4, m6 | |
19544 | pmulhrsw m5, m7 | |
19545 | packuswb m3, m5 | |
19546 | movu [r0 + 189 * 16], m3 | |
19547 | ||
19548 | ; mode 3 [row 25] | |
19549 | movu m6, [r5 + 4 * 16] | |
19550 | movu m0, [r4 + 22] | |
19551 | movd m1, [r4 + 23] | |
19552 | palignr m1, m0, 1 | |
19553 | punpcklbw m0, m1 | |
19554 | pmaddubsw m1, m0, m6 | |
19555 | pmulhrsw m1, m7 | |
19556 | movu m2, [r4 + 30] | |
19557 | movd m3, [r4 + 31] | |
19558 | palignr m3, m2, 1 | |
19559 | punpcklbw m2, m3 | |
19560 | pmaddubsw m3, m2, m6 | |
19561 | pmulhrsw m3, m7 | |
19562 | packuswb m1, m3 | |
19563 | movu [r0 + 114 * 16], m1 | |
19564 | ||
19565 | movu m1, [r4 + 38] | |
19566 | movd m3, [r4 + 39] | |
19567 | palignr m3, m1, 1 | |
19568 | punpcklbw m1, m3 | |
19569 | pmaddubsw m3, m1, m6 | |
19570 | pmulhrsw m3, m7 | |
19571 | movu m4, [r4 + 46] | |
19572 | movd m5, [r4 + 47] | |
19573 | palignr m5, m4, 1 | |
19574 | punpcklbw m4, m5 | |
19575 | pmaddubsw m5, m4, m6 | |
19576 | pmulhrsw m5, m7 | |
19577 | packuswb m3, m5 | |
19578 | movu [r0 + 115 * 16], m3 | |
19579 | ||
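19580 | ; mode 3 [row 26] - reuses the pixel pairs still held in m0/m2/m1/m4 from mode 3 [row 25]; only the coefficients in m6 change | |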
19581 | movu m6, [r5 + 30 * 16] | |
19582 | pmaddubsw m3, m0, m6 | |
19583 | pmulhrsw m3, m7 | |
19584 | pmaddubsw m5, m2, m6 | |
19585 | pmulhrsw m5, m7 | |
19586 | packuswb m3, m5 | |
19587 | movu [r0 + 116 * 16], m3 | |
19588 | pmaddubsw m3, m1, m6 | |
19589 | pmulhrsw m3, m7 | |
19590 | pmaddubsw m5, m4, m6 | |
19591 | pmulhrsw m5, m7 | |
19592 | packuswb m3, m5 | |
19593 | movu [r0 + 117 * 16], m3 | |
19594 | ||
19595 | ; mode 3 [row 27] | |
19596 | movu m6, [r5 + 24 * 16] | |
19597 | movu m0, [r4 + 23] | |
19598 | movd m1, [r4 + 24] | |
19599 | palignr m1, m0, 1 | |
19600 | punpcklbw m0, m1 | |
19601 | pmaddubsw m1, m0, m6 | |
19602 | pmulhrsw m1, m7 | |
19603 | movu m2, [r4 + 31] | |
19604 | movd m3, [r4 + 32] | |
19605 | palignr m3, m2, 1 | |
19606 | punpcklbw m2, m3 | |
19607 | pmaddubsw m3, m2, m6 | |
19608 | pmulhrsw m3, m7 | |
19609 | packuswb m1, m3 | |
19610 | movu [r0 + 118 * 16], m1 | |
19611 | ||
19612 | movu m1, [r4 + 39] | |
19613 | movd m3, [r4 + 40] | |
19614 | palignr m3, m1, 1 | |
19615 | punpcklbw m1, m3 | |
19616 | pmaddubsw m3, m1, m6 | |
19617 | pmulhrsw m3, m7 | |
19618 | movu m4, [r4 + 47] | |
19619 | movd m5, [r4 + 48] | |
19620 | palignr m5, m4, 1 | |
19621 | punpcklbw m4, m5 | |
19622 | pmaddubsw m5, m4, m6 | |
19623 | pmulhrsw m5, m7 | |
19624 | packuswb m3, m5 | |
19625 | movu [r0 + 119 * 16], m3 | |
19626 | ||
19627 | ; mode 3 [row 28] | |
19628 | movu m6, [r5 + 18 * 16] | |
19629 | movu m0, [r4 + 24] | |
19630 | movd m1, [r4 + 25] | |
19631 | palignr m1, m0, 1 | |
19632 | punpcklbw m0, m1 | |
19633 | pmaddubsw m1, m0, m6 | |
19634 | pmulhrsw m1, m7 | |
19635 | movu m2, [r4 + 32] | |
19636 | movd m3, [r4 + 33] | |
19637 | palignr m3, m2, 1 | |
19638 | punpcklbw m2, m3 | |
19639 | pmaddubsw m3, m2, m6 | |
19640 | pmulhrsw m3, m7 | |
19641 | packuswb m1, m3 | |
19642 | movu [r0 + 120 * 16], m1 | |
19643 | ||
19644 | movu m1, [r4 + 40] | |
19645 | movd m3, [r4 + 41] | |
19646 | palignr m3, m1, 1 | |
19647 | punpcklbw m1, m3 | |
19648 | pmaddubsw m3, m1, m6 | |
19649 | pmulhrsw m3, m7 | |
19650 | movu m4, [r4 + 48] | |
19651 | movd m5, [r4 + 49] | |
19652 | palignr m5, m4, 1 | |
19653 | punpcklbw m4, m5 | |
19654 | pmaddubsw m5, m4, m6 | |
19655 | pmulhrsw m5, m7 | |
19656 | packuswb m3, m5 | |
19657 | movu [r0 + 121 * 16], m3 | |
19658 | ||
19659 | ; mode 3 [row 29] | |
19660 | movu m6, [r5 + 12 * 16] | |
19661 | movu m0, [r4 + 25] | |
19662 | movd m1, [r4 + 26] | |
19663 | palignr m1, m0, 1 | |
19664 | punpcklbw m0, m1 | |
19665 | pmaddubsw m1, m0, m6 | |
19666 | pmulhrsw m1, m7 | |
19667 | movu m2, [r4 + 33] | |
19668 | movd m3, [r4 + 34] | |
19669 | palignr m3, m2, 1 | |
19670 | punpcklbw m2, m3 | |
19671 | pmaddubsw m3, m2, m6 | |
19672 | pmulhrsw m3, m7 | |
19673 | packuswb m1, m3 | |
19674 | movu [r0 + 122 * 16], m1 | |
19675 | ||
19676 | movu m1, [r4 + 41] | |
19677 | movd m3, [r4 + 42] | |
19678 | palignr m3, m1, 1 | |
19679 | punpcklbw m1, m3 | |
19680 | pmaddubsw m3, m1, m6 | |
19681 | pmulhrsw m3, m7 | |
19682 | movu m4, [r4 + 49] | |
19683 | movd m5, [r4 + 50] | |
19684 | palignr m5, m4, 1 | |
19685 | punpcklbw m4, m5 | |
19686 | pmaddubsw m5, m4, m6 | |
19687 | pmulhrsw m5, m7 | |
19688 | packuswb m3, m5 | |
19689 | movu [r0 + 123 * 16], m3 | |
19690 | ||
19691 | ; mode 3 [row 30] | |
19692 | movu m6, [r5 + 6 * 16] | |
19693 | movu m0, [r4 + 26] | |
19694 | movd m1, [r4 + 27] | |
19695 | palignr m1, m0, 1 | |
19696 | punpcklbw m0, m1 | |
19697 | pmaddubsw m1, m0, m6 | |
19698 | pmulhrsw m1, m7 | |
19699 | movu m2, [r4 + 34] | |
19700 | movd m3, [r4 + 35] | |
19701 | palignr m3, m2, 1 | |
19702 | punpcklbw m2, m3 | |
19703 | pmaddubsw m3, m2, m6 | |
19704 | pmulhrsw m3, m7 | |
19705 | packuswb m1, m3 | |
19706 | movu [r0 + 124 * 16], m1 | |
19707 | ||
19708 | movu m1, [r4 + 42] | |
19709 | movd m3, [r4 + 43] | |
19710 | palignr m3, m1, 1 | |
19711 | punpcklbw m1, m3 | |
19712 | pmaddubsw m3, m1, m6 | |
19713 | pmulhrsw m3, m7 | |
19714 | movu m4, [r4 + 50] | |
19715 | movd m5, [r4 + 51] | |
19716 | palignr m5, m4, 1 | |
19717 | punpcklbw m4, m5 | |
19718 | pmaddubsw m5, m4, m6 | |
19719 | pmulhrsw m5, m7 | |
19720 | packuswb m3, m5 | |
19721 | movu [r0 + 125 * 16], m3 | |
19722 | ||
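19723 | ; mode 10 - the two 16-byte vectors loaded from [r2 + 1] and [r2 + 17] are stored unchanged, alternating, into consecutive 16-byte slots starting at [r0 + 512 * 16] | |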
19724 | movu m1, [r2 + 1] | |
19725 | movu m2, [r2 + 17] | |
19726 | movu [r0 + 512 * 16], m1 | |
19727 | movu [r0 + 513 * 16], m2 | |
19728 | movu [r0 + 514 * 16], m1 | |
19729 | movu [r0 + 515 * 16], m2 | |
19730 | movu [r0 + 516 * 16], m1 | |
19731 | movu [r0 + 517 * 16], m2 | |
19732 | movu [r0 + 518 * 16], m1 | |
19733 | movu [r0 + 519 * 16], m2 | |
19734 | movu [r0 + 520 * 16], m1 | |
19735 | movu [r0 + 521 * 16], m2 | |
19736 | movu [r0 + 522 * 16], m1 | |
19737 | movu [r0 + 523 * 16], m2 | |
19738 | movu [r0 + 524 * 16], m1 | |
19739 | movu [r0 + 525 * 16], m2 | |
19740 | movu [r0 + 526 * 16], m1 | |
19741 | movu [r0 + 527 * 16], m2 | |
19742 | ||
19743 | movu [r0 + 528 * 16], m1 | |
19744 | movu [r0 + 529 * 16], m2 | |
19745 | movu [r0 + 530 * 16], m1 | |
19746 | movu [r0 + 531 * 16], m2 | |
19747 | movu [r0 + 532 * 16], m1 | |
19748 | movu [r0 + 533 * 16], m2 | |
19749 | movu [r0 + 534 * 16], m1 | |
19750 | movu [r0 + 535 * 16], m2 | |
19751 | movu [r0 + 536 * 16], m1 | |
19752 | movu [r0 + 537 * 16], m2 | |
19753 | movu [r0 + 538 * 16], m1 | |
19754 | movu [r0 + 539 * 16], m2 | |
19755 | movu [r0 + 540 * 16], m1 | |
19756 | movu [r0 + 541 * 16], m2 | |
19757 | movu [r0 + 542 * 16], m1 | |
19758 | movu [r0 + 543 * 16], m2 | |
19759 | ||
19760 | movu [r0 + 544 * 16], m1 | |
19761 | movu [r0 + 545 * 16], m2 | |
19762 | movu [r0 + 546 * 16], m1 | |
19763 | movu [r0 + 547 * 16], m2 | |
19764 | movu [r0 + 548 * 16], m1 | |
19765 | movu [r0 + 549 * 16], m2 | |
19766 | movu [r0 + 550 * 16], m1 | |
19767 | movu [r0 + 551 * 16], m2 | |
19768 | movu [r0 + 552 * 16], m1 | |
19769 | movu [r0 + 553 * 16], m2 | |
19770 | movu [r0 + 554 * 16], m1 | |
19771 | movu [r0 + 555 * 16], m2 | |
19772 | movu [r0 + 556 * 16], m1 | |
19773 | movu [r0 + 557 * 16], m2 | |
19774 | movu [r0 + 558 * 16], m1 | |
19775 | movu [r0 + 559 * 16], m2 | |
19776 | ||
19777 | movu [r0 + 560 * 16], m1 | |
19778 | movu [r0 + 561 * 16], m2 | |
19779 | movu [r0 + 562 * 16], m1 | |
19780 | movu [r0 + 563 * 16], m2 | |
19781 | movu [r0 + 564 * 16], m1 | |
19782 | movu [r0 + 565 * 16], m2 | |
19783 | movu [r0 + 566 * 16], m1 | |
19784 | movu [r0 + 567 * 16], m2 | |
19785 | movu [r0 + 568 * 16], m1 | |
19786 | movu [r0 + 569 * 16], m2 | |
19787 | movu [r0 + 570 * 16], m1 | |
19788 | movu [r0 + 571 * 16], m2 | |
19789 | movu [r0 + 572 * 16], m1 | |
19790 | movu [r0 + 573 * 16], m2 | |
19791 | movu [r0 + 574 * 16], m1 | |
19792 | movu [r0 + 575 * 16], m2 | |
19793 | ||
19794 | ; mode 11 [row 0]: m0 = 16 bytes at r4, kept for the unweighted copies below and the weighted rows that follow | |
19795 | movu m0, [r4] | |
19796 | ||
19797 | ; mode 11 [row 15 - first half]: slot 606 takes the bytes at r4 unmodified; a single store is sufficient | |
19798 | movu [r0 + 606 * 16], m0 | |
19801 | ||
19802 | ; mode 12 [row 31]: no weighting - first half is m0 (the bytes at r4) moved up 4 bytes with bytes 0-3 filled from r3 + 26, 19, 13, 6; second half is the 16 bytes at r4 + 12 | |
19803 | pslldq m6, m0, 4 | |
19804 | pinsrb m6, [r3 + 26], 0 | |
19805 | pinsrb m6, [r3 + 19], 1 | |
19806 | pinsrb m6, [r3 + 13], 2 | |
19807 | pinsrb m6, [r3 + 6], 3 | |
19808 | movu [r0 + 702 * 16], m6 | |
19809 | movu m6, [r4 + 12] | |
19810 | movu [r0 + 703 * 16], m6 | |
19811 | ||
19812 | ; mode 11 [row 31]: no weighting - first half is m0 (the bytes at r4) moved up 1 byte with byte 0 filled from r3 + 16; second half is the 16 bytes at r4 + 15 | |
19813 | pslldq m6, m0, 1 | |
19814 | pinsrb m6, [r3 + 16], 0 | |
19815 | movu [r0 + 638 * 16], m6 | |
19816 | movu m6, [r4 + 15] | |
19817 | movu [r0 + 639 * 16], m6 | |
19818 | ||
19819 | movd m1, [r4 + 1] | |
19820 | palignr m1, m0, 1 | |
19821 | punpcklbw m0, m1 | |
19822 | pmaddubsw m1, m0, [r5 + 30 * 16] | |
19823 | pmulhrsw m1, m7 | |
19824 | movu m2, [r4 + 8] | |
19825 | movd m3, [r4 + 9] | |
19826 | palignr m3, m2, 1 | |
19827 | punpcklbw m2, m3 | |
19828 | pmaddubsw m3, m2, [r5 + 30 * 16] | |
19829 | pmulhrsw m3, m7 | |
19830 | packuswb m1, m3 | |
19831 | movu [r0 + 576 * 16], m1 | |
19832 | ||
19833 | movu m1, [r4 + 16] | |
19834 | ||
19835 | ; mode 11 [row 15 - second half] | |
19836 | movu [r0 + 607 * 16], m1 | |
19837 | ||
19838 | movd m3, [r4 + 17] | |
19839 | palignr m3, m1, 1 | |
19840 | punpcklbw m1, m3 | |
19841 | pmaddubsw m3, m1, [r5 + 30 * 16] | |
19842 | pmulhrsw m3, m7 | |
19843 | movu m4, [r4 + 24] | |
19844 | movd m5, [r4 + 25] | |
19845 | palignr m5, m4, 1 | |
19846 | punpcklbw m4, m5 | |
19847 | pmaddubsw m5, m4, [r5 + 30 * 16] | |
19848 | pmulhrsw m5, m7 | |
19849 | packuswb m3, m5 | |
19850 | movu [r0 + 577 * 16], m3 | |
19851 | ||
19852 | ; mode 11 [row 1] | |
19853 | pmaddubsw m3, m0, [r5 + 28 * 16] | |
19854 | pmulhrsw m3, m7 | |
19855 | pmaddubsw m5, m2, [r5 + 28 * 16] | |
19856 | pmulhrsw m5, m7 | |
19857 | packuswb m3, m5 | |
19858 | movu [r0 + 578 * 16], m3 | |
19859 | pmaddubsw m3, m1, [r5 + 28 * 16] | |
19860 | pmulhrsw m3, m7 | |
19861 | pmaddubsw m5, m4, [r5 + 28 * 16] | |
19862 | pmulhrsw m5, m7 | |
19863 | packuswb m3, m5 | |
19864 | movu [r0 + 579 * 16], m3 | |
19865 | ||
19866 | ; mode 11 [row 2] | |
19867 | pmaddubsw m3, m0, [r5 + 26 * 16] | |
19868 | pmulhrsw m3, m7 | |
19869 | pmaddubsw m5, m2, [r5 + 26 * 16] | |
19870 | pmulhrsw m5, m7 | |
19871 | packuswb m3, m5 | |
19872 | movu [r0 + 580 * 16], m3 | |
19873 | pmaddubsw m3, m1, [r5 + 26 * 16] | |
19874 | pmulhrsw m3, m7 | |
19875 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
19876 | pmulhrsw m5, m7 | |
19877 | packuswb m3, m5 | |
19878 | movu [r0 + 581 * 16], m3 | |
19879 | ||
19880 | ; mode 11 [row 3] | |
19881 | pmaddubsw m3, m0, [r5 + 24 * 16] | |
19882 | pmulhrsw m3, m7 | |
19883 | pmaddubsw m5, m2, [r5 + 24 * 16] | |
19884 | pmulhrsw m5, m7 | |
19885 | packuswb m3, m5 | |
19886 | movu [r0 + 582 * 16], m3 | |
19887 | pmaddubsw m3, m1, [r5 + 24 * 16] | |
19888 | pmulhrsw m3, m7 | |
19889 | pmaddubsw m5, m4, [r5 + 24 * 16] | |
19890 | pmulhrsw m5, m7 | |
19891 | packuswb m3, m5 | |
19892 | movu [r0 + 583 * 16], m3 | |
19893 | ||
19894 | ; mode 11 [row 4] | |
19895 | pmaddubsw m3, m0, [r5 + 22 * 16] | |
19896 | pmulhrsw m3, m7 | |
19897 | pmaddubsw m5, m2, [r5 + 22 * 16] | |
19898 | pmulhrsw m5, m7 | |
19899 | packuswb m3, m5 | |
19900 | movu [r0 + 584 * 16], m3 | |
19901 | ||
19902 | ; mode 12 [row 1 - first half] | |
19903 | movu [r0 + 642 * 16], m3 | |
19904 | ||
19905 | pmaddubsw m3, m1, [r5 + 22 * 16] | |
19906 | pmulhrsw m3, m7 | |
19907 | pmaddubsw m5, m4, [r5 + 22 * 16] | |
19908 | pmulhrsw m5, m7 | |
19909 | packuswb m3, m5 | |
19910 | movu [r0 + 585 * 16], m3 | |
19911 | ||
19912 | ; mode 12 [row 1 - second half] | |
19913 | movu [r0 + 643 * 16], m3 | |
19914 | ||
19915 | ; mode 11 [row 5] | |
19916 | pmaddubsw m3, m0, [r5 + 20 * 16] | |
19917 | pmulhrsw m3, m7 | |
19918 | pmaddubsw m5, m2, [r5 + 20 * 16] | |
19919 | pmulhrsw m5, m7 | |
19920 | packuswb m3, m5 | |
19921 | movu [r0 + 586 * 16], m3 | |
19922 | pmaddubsw m3, m1, [r5 + 20 * 16] | |
19923 | pmulhrsw m3, m7 | |
19924 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
19925 | pmulhrsw m5, m7 | |
19926 | packuswb m3, m5 | |
19927 | movu [r0 + 587 * 16], m3 | |
19928 | ||
19929 | ; mode 11 [row 6] | |
19930 | pmaddubsw m3, m0, [r5 + 18 * 16] | |
19931 | pmulhrsw m3, m7 | |
19932 | pmaddubsw m5, m2, [r5 + 18 * 16] | |
19933 | pmulhrsw m5, m7 | |
19934 | packuswb m3, m5 | |
19935 | movu [r0 + 588 * 16], m3 | |
19936 | pmaddubsw m3, m1, [r5 + 18 * 16] | |
19937 | pmulhrsw m3, m7 | |
19938 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
19939 | pmulhrsw m5, m7 | |
19940 | packuswb m3, m5 | |
19941 | movu [r0 + 589 * 16], m3 | |
19942 | ||
19943 | ; mode 11 [row 7] | |
19944 | pmaddubsw m3, m0, [r5 + 16 * 16] | |
19945 | pmulhrsw m3, m7 | |
19946 | pmaddubsw m5, m2, [r5 + 16 * 16] | |
19947 | pmulhrsw m5, m7 | |
19948 | packuswb m3, m5 | |
19949 | movu [r0 + 590 * 16], m3 | |
19950 | pmaddubsw m3, m1, [r5 + 16 * 16] | |
19951 | pmulhrsw m3, m7 | |
19952 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
19953 | pmulhrsw m5, m7 | |
19954 | packuswb m3, m5 | |
19955 | movu [r0 + 591 * 16], m3 | |
19956 | ||
19957 | ; mode 11 [row 8] | |
19958 | pmaddubsw m3, m0, [r5 + 14 * 16] | |
19959 | pmulhrsw m3, m7 | |
19960 | pmaddubsw m5, m2, [r5 + 14 * 16] | |
19961 | pmulhrsw m5, m7 | |
19962 | packuswb m3, m5 | |
19963 | movu [r0 + 592 * 16], m3 | |
19964 | ||
19965 | ; mode 13 [row 1 - first half] | |
19966 | movu [r0 + 706 * 16], m3 | |
19967 | ||
19968 | pmaddubsw m3, m1, [r5 + 14 * 16] | |
19969 | pmulhrsw m3, m7 | |
19970 | pmaddubsw m5, m4, [r5 + 14 * 16] | |
19971 | pmulhrsw m5, m7 | |
19972 | packuswb m3, m5 | |
19973 | movu [r0 + 593 * 16], m3 | |
19974 | ||
19975 | ; mode 13 [row 1 - second half] | |
19976 | movu [r0 + 707 * 16], m3 | |
19977 | ||
19978 | ; mode 11 [row 9] | |
19979 | pmaddubsw m3, m0, [r5 + 12 * 16] | |
19980 | pmulhrsw m3, m7 | |
19981 | pmaddubsw m5, m2, [r5 + 12 * 16] | |
19982 | pmulhrsw m5, m7 | |
19983 | packuswb m3, m5 | |
19984 | movu [r0 + 594 * 16], m3 | |
19985 | ||
19986 | ; mode 12 [row 3 - first half] | |
19987 | movu [r0 + 646 * 16], m3 | |
19988 | ||
19989 | pmaddubsw m3, m1, [r5 + 12 * 16] | |
19990 | pmulhrsw m3, m7 | |
19991 | pmaddubsw m5, m4, [r5 + 12 * 16] | |
19992 | pmulhrsw m5, m7 | |
19993 | packuswb m3, m5 | |
19994 | movu [r0 + 595 * 16], m3 | |
19995 | ||
19996 | ; mode 12 [row 3 - second half] | |
19997 | movu [r0 + 647 * 16], m3 | |
19998 | ||
19999 | ; mode 11 [row 10] | |
20000 | pmaddubsw m3, m0, [r5 + 10 * 16] | |
20001 | pmulhrsw m3, m7 | |
20002 | pmaddubsw m5, m2, [r5 + 10 * 16] | |
20003 | pmulhrsw m5, m7 | |
20004 | packuswb m3, m5 | |
20005 | movu [r0 + 596 * 16], m3 | |
20006 | pmaddubsw m3, m1, [r5 + 10 * 16] | |
20007 | pmulhrsw m3, m7 | |
20008 | pmaddubsw m5, m4, [r5 + 10 * 16] | |
20009 | pmulhrsw m5, m7 | |
20010 | packuswb m3, m5 | |
20011 | movu [r0 + 597 * 16], m3 | |
20012 | ||
20013 | ; mode 11 [row 11] | |
20014 | pmaddubsw m3, m0, [r5 + 8 * 16] | |
20015 | pmulhrsw m3, m7 | |
20016 | pmaddubsw m5, m2, [r5 + 8 * 16] | |
20017 | pmulhrsw m5, m7 | |
20018 | packuswb m3, m5 | |
20019 | movu [r0 + 598 * 16], m3 | |
20020 | pmaddubsw m3, m1, [r5 + 8 * 16] | |
20021 | pmulhrsw m3, m7 | |
20022 | pmaddubsw m5, m4, [r5 + 8 * 16] | |
20023 | pmulhrsw m5, m7 | |
20024 | packuswb m3, m5 | |
20025 | movu [r0 + 599 * 16], m3 | |
20026 | ||
20027 | ; mode 11 [row 12] | |
20028 | pmaddubsw m3, m0, [r5 + 6 * 16] | |
20029 | pmulhrsw m3, m7 | |
20030 | pmaddubsw m5, m2, [r5 + 6 * 16] | |
20031 | pmulhrsw m5, m7 | |
20032 | packuswb m3, m5 | |
20033 | movu [r0 + 600 * 16], m3 | |
20034 | ||
20035 | ; mode 14 [row 1 - first half] | |
20036 | movu [r0 + 770 * 16], m3 | |
20037 | ||
20038 | pmaddubsw m3, m1, [r5 + 6 * 16] | |
20039 | pmulhrsw m3, m7 | |
20040 | pmaddubsw m5, m4, [r5 + 6 * 16] | |
20041 | pmulhrsw m5, m7 | |
20042 | packuswb m3, m5 | |
20043 | movu [r0 + 601 * 16], m3 | |
20044 | ||
20045 | ; mode 14 [row 1 - second half] | |
20046 | movu [r0 + 771 * 16], m3 | |
20047 | ||
20048 | ; mode 11 [row 13] | |
20049 | pmaddubsw m3, m0, [r5 + 4 * 16] | |
20050 | pmulhrsw m3, m7 | |
20051 | pmaddubsw m5, m2, [r5 + 4 * 16] | |
20052 | pmulhrsw m5, m7 | |
20053 | packuswb m3, m5 | |
20054 | movu [r0 + 602 * 16], m3 | |
20055 | pmaddubsw m3, m1, [r5 + 4 * 16] | |
20056 | pmulhrsw m3, m7 | |
20057 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
20058 | pmulhrsw m5, m7 | |
20059 | packuswb m3, m5 | |
20060 | movu [r0 + 603 * 16], m3 | |
20061 | ||
20062 | ; mode 11 [row 14]: weights at r5 + 2 * 16; each packed half is also stored to the mode 12 row 5 slots | |
20063 | pmaddubsw m3, m0, [r5 + 2 * 16] | |
20064 | pmulhrsw m3, m7 | |
20065 | pmaddubsw m5, m2, [r5 + 2 * 16] | |
20066 | pmulhrsw m5, m7 | |
20067 | packuswb m3, m5 | |
20068 | movu [r0 + 604 * 16], m3 | |
20069 | ||
20070 | ; mode 12 [row 5 - first half]: slot 650 = 640 + 2 * 5 lies in the mode 12 range (mode 12 row 0 is slot 640); same bytes as slot 604 | |
20071 | movu [r0 + 650 * 16], m3 | |
20072 | ||
20073 | pmaddubsw m3, m1, [r5 + 2 * 16] | |
20074 | pmulhrsw m3, m7 | |
20075 | pmaddubsw m5, m4, [r5 + 2 * 16] | |
20076 | pmulhrsw m5, m7 | |
20077 | packuswb m3, m5 | |
20078 | movu [r0 + 605 * 16], m3 | |
20079 | ||
20080 | ; mode 12 [row 5 - second half]: same bytes as slot 605 | |
20081 | movu [r0 + 651 * 16], m3 | |
20082 | ||
20083 | ; mode 12 [row 0] | |
20084 | pmaddubsw m3, m0, [r5 + 27 * 16] | |
20085 | pmulhrsw m3, m7 | |
20086 | pmaddubsw m5, m2, [r5 + 27 * 16] | |
20087 | pmulhrsw m5, m7 | |
20088 | packuswb m3, m5 | |
20089 | movu [r0 + 640 * 16], m3 | |
20090 | pmaddubsw m3, m1, [r5 + 27 * 16] | |
20091 | pmulhrsw m3, m7 | |
20092 | pmaddubsw m5, m4, [r5 + 27 * 16] | |
20093 | pmulhrsw m5, m7 | |
20094 | packuswb m3, m5 | |
20095 | movu [r0 + 641 * 16], m3 | |
20096 | ||
20097 | ; mode 12 [row 2] | |
20098 | pmaddubsw m3, m0, [r5 + 17 * 16] | |
20099 | pmulhrsw m3, m7 | |
20100 | pmaddubsw m5, m2, [r5 + 17 * 16] | |
20101 | pmulhrsw m5, m7 | |
20102 | packuswb m3, m5 | |
20103 | movu [r0 + 644 * 16], m3 | |
20104 | pmaddubsw m3, m1, [r5 + 17 * 16] | |
20105 | pmulhrsw m3, m7 | |
20106 | pmaddubsw m5, m4, [r5 + 17 * 16] | |
20107 | pmulhrsw m5, m7 | |
20108 | packuswb m3, m5 | |
20109 | movu [r0 + 645 * 16], m3 | |
20110 | ||
20111 | ; mode 12 [row 4] | |
20112 | pmaddubsw m3, m0, [r5 + 7 * 16] | |
20113 | pmulhrsw m3, m7 | |
20114 | pmaddubsw m5, m2, [r5 + 7 * 16] | |
20115 | pmulhrsw m5, m7 | |
20116 | packuswb m3, m5 | |
20117 | movu [r0 + 648 * 16], m3 | |
20118 | pmaddubsw m3, m1, [r5 + 7 * 16] | |
20119 | pmulhrsw m3, m7 | |
20120 | pmaddubsw m5, m4, [r5 + 7 * 16] | |
20121 | pmulhrsw m5, m7 | |
20122 | packuswb m3, m5 | |
20123 | movu [r0 + 649 * 16], m3 | |
20124 | ||
20125 | ; mode 13 [row 0] | |
20126 | pmaddubsw m3, m0, [r5 + 23 * 16] | |
20127 | pmulhrsw m3, m7 | |
20128 | pmaddubsw m5, m2, [r5 + 23 * 16] | |
20129 | pmulhrsw m5, m7 | |
20130 | packuswb m3, m5 | |
20131 | movu [r0 + 704 * 16], m3 | |
20132 | pmaddubsw m3, m1, [r5 + 23 * 16] | |
20133 | pmulhrsw m3, m7 | |
20134 | pmaddubsw m5, m4, [r5 + 23 * 16] | |
20135 | pmulhrsw m5, m7 | |
20136 | packuswb m3, m5 | |
20137 | movu [r0 + 705 * 16], m3 | |
20138 | ||
20139 | ; mode 13 [row 2] | |
20140 | pmaddubsw m3, m0, [r5 + 5 * 16] | |
20141 | pmulhrsw m3, m7 | |
20142 | pmaddubsw m5, m2, [r5 + 5 * 16] | |
20143 | pmulhrsw m5, m7 | |
20144 | packuswb m3, m5 | |
20145 | movu [r0 + 708 * 16], m3 | |
20146 | pmaddubsw m3, m1, [r5 + 5 * 16] | |
20147 | pmulhrsw m3, m7 | |
20148 | pmaddubsw m5, m4, [r5 + 5 * 16] | |
20149 | pmulhrsw m5, m7 | |
20150 | packuswb m3, m5 | |
20151 | movu [r0 + 709 * 16], m3 | |
20152 | ||
20153 | ; mode 14 [row 0] | |
20154 | pmaddubsw m3, m0, [r5 + 19 * 16] | |
20155 | pmulhrsw m3, m7 | |
20156 | pmaddubsw m5, m2, [r5 + 19 * 16] | |
20157 | pmulhrsw m5, m7 | |
20158 | packuswb m3, m5 | |
20159 | movu [r0 + 768 * 16], m3 | |
20160 | pmaddubsw m3, m1, [r5 + 19 * 16] | |
20161 | pmulhrsw m3, m7 | |
20162 | pmaddubsw m5, m4, [r5 + 19 * 16] | |
20163 | pmulhrsw m5, m7 | |
20164 | packuswb m3, m5 | |
20165 | movu [r0 + 769 * 16], m3 | |
20166 | ||
20167 | ; mode 15 [row 0] | |
20168 | pmaddubsw m3, m0, [r5 + 15 * 16] | |
20169 | pmulhrsw m3, m7 | |
20170 | pmaddubsw m5, m2, [r5 + 15 * 16] | |
20171 | pmulhrsw m5, m7 | |
20172 | packuswb m3, m5 | |
20173 | movu [r0 + 832 * 16], m3 | |
20174 | pmaddubsw m3, m1, [r5 + 15 * 16] | |
20175 | pmulhrsw m3, m7 | |
20176 | pmaddubsw m5, m4, [r5 + 15 * 16] | |
20177 | pmulhrsw m5, m7 | |
20178 | packuswb m3, m5 | |
20179 | movu [r0 + 833 * 16], m3 | |
20180 | ||
20181 | ; mode 11 [row 16]: m0, m2, m1, m4 each move up one byte pair (pslldq 2) and get a new low pair - [r3 + 16]/[r4 + 0] for m0, [r4 + 7]/[r4 + 8], [r4 + 15]/[r4 + 16], [r4 + 23]/[r4 + 24] for m2, m1, m4; rows 17-30 reuse them in this form | |
20182 | pslldq m0, 2 | |
20183 | pinsrb m0, [r4 + 0], 1 | |
20184 | pinsrb m0, [r3 + 16], 0 | |
20185 | pmaddubsw m3, m0, [r5 + 30 * 16] | |
20186 | pmulhrsw m3, m7 | |
20187 | pslldq m2, 2 | |
20188 | pinsrb m2, [r4 + 8], 1 | |
20189 | pinsrb m2, [r4 + 7], 0 | |
20190 | pmaddubsw m5, m2, [r5 + 30 * 16] | |
20191 | pmulhrsw m5, m7 | |
20192 | packuswb m3, m5 | |
20193 | movu [r0 + 608 * 16], m3 | |
20194 | pslldq m1, 2 | |
20195 | pinsrb m1, [r4 + 16], 1 | |
20196 | pinsrb m1, [r4 + 15], 0 | |
20197 | pmaddubsw m3, m1, [r5 + 30 * 16] | |
20198 | pmulhrsw m3, m7 | |
20199 | pslldq m4, 2 | |
20200 | pinsrb m4, [r4 + 24], 1 | |
20201 | pinsrb m4, [r4 + 23], 0 | |
20202 | pmaddubsw m5, m4, [r5 + 30 * 16] | |
20203 | pmulhrsw m5, m7 | |
20204 | packuswb m3, m5 | |
20205 | movu [r0 + 609 * 16], m3 | |
20206 | ||
20207 | ; mode 11 [row 17] | |
20208 | pmaddubsw m3, m0, [r5 + 28 * 16] | |
20209 | pmulhrsw m3, m7 | |
20210 | pmaddubsw m5, m2, [r5 + 28 * 16] | |
20211 | pmulhrsw m5, m7 | |
20212 | packuswb m3, m5 | |
20213 | movu [r0 + 610 * 16], m3 | |
20214 | pmaddubsw m3, m1, [r5 + 28 * 16] | |
20215 | pmulhrsw m3, m7 | |
20216 | pmaddubsw m5, m4, [r5 + 28 * 16] | |
20217 | pmulhrsw m5, m7 | |
20218 | packuswb m3, m5 | |
20219 | movu [r0 + 611 * 16], m3 | |
20220 | ||
20221 | ; mode 11 [row 18] | |
20222 | pmaddubsw m3, m0, [r5 + 26 * 16] | |
20223 | pmulhrsw m3, m7 | |
20224 | pmaddubsw m5, m2, [r5 + 26 * 16] | |
20225 | pmulhrsw m5, m7 | |
20226 | packuswb m3, m5 | |
20227 | movu [r0 + 612 * 16], m3 | |
20228 | pmaddubsw m3, m1, [r5 + 26 * 16] | |
20229 | pmulhrsw m3, m7 | |
20230 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
20231 | pmulhrsw m5, m7 | |
20232 | packuswb m3, m5 | |
20233 | movu [r0 + 613 * 16], m3 | |
20234 | ||
20235 | ; mode 11 [row 19] | |
20236 | pmaddubsw m3, m0, [r5 + 24 * 16] | |
20237 | pmulhrsw m3, m7 | |
20238 | pmaddubsw m5, m2, [r5 + 24 * 16] | |
20239 | pmulhrsw m5, m7 | |
20240 | packuswb m3, m5 | |
20241 | movu [r0 + 614 * 16], m3 | |
20242 | pmaddubsw m3, m1, [r5 + 24 * 16] | |
20243 | pmulhrsw m3, m7 | |
20244 | pmaddubsw m5, m4, [r5 + 24 * 16] | |
20245 | pmulhrsw m5, m7 | |
20246 | packuswb m3, m5 | |
20247 | movu [r0 + 615 * 16], m3 | |
20248 | ||
20249 | ; mode 11 [row 20] | |
20250 | pmaddubsw m3, m0, [r5 + 22 * 16] | |
20251 | pmulhrsw m3, m7 | |
20252 | pmaddubsw m5, m2, [r5 + 22 * 16] | |
20253 | pmulhrsw m5, m7 | |
20254 | packuswb m3, m5 | |
20255 | movu [r0 + 616 * 16], m3 | |
20256 | pmaddubsw m3, m1, [r5 + 22 * 16] | |
20257 | pmulhrsw m3, m7 | |
20258 | pmaddubsw m5, m4, [r5 + 22 * 16] | |
20259 | pmulhrsw m5, m7 | |
20260 | packuswb m3, m5 | |
20261 | movu [r0 + 617 * 16], m3 | |
20262 | ||
20263 | ; mode 11 [row 21] | |
20264 | pmaddubsw m3, m0, [r5 + 20 * 16] | |
20265 | pmulhrsw m3, m7 | |
20266 | pmaddubsw m5, m2, [r5 + 20 * 16] | |
20267 | pmulhrsw m5, m7 | |
20268 | packuswb m3, m5 | |
20269 | movu [r0 + 618 * 16], m3 | |
20270 | pmaddubsw m3, m1, [r5 + 20 * 16] | |
20271 | pmulhrsw m3, m7 | |
20272 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
20273 | pmulhrsw m5, m7 | |
20274 | packuswb m3, m5 | |
20275 | movu [r0 + 619 * 16], m3 | |
20276 | ||
20277 | ; mode 11 [row 22] | |
20278 | pmaddubsw m3, m0, [r5 + 18 * 16] | |
20279 | pmulhrsw m3, m7 | |
20280 | pmaddubsw m5, m2, [r5 + 18 * 16] | |
20281 | pmulhrsw m5, m7 | |
20282 | packuswb m3, m5 | |
20283 | movu [r0 + 620 * 16], m3 | |
20284 | pmaddubsw m3, m1, [r5 + 18 * 16] | |
20285 | pmulhrsw m3, m7 | |
20286 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
20287 | pmulhrsw m5, m7 | |
20288 | packuswb m3, m5 | |
20289 | movu [r0 + 621 * 16], m3 | |
20290 | ||
20291 | ; mode 11 [row 23] | |
20292 | pmaddubsw m3, m0, [r5 + 16 * 16] | |
20293 | pmulhrsw m3, m7 | |
20294 | pmaddubsw m5, m2, [r5 + 16 * 16] | |
20295 | pmulhrsw m5, m7 | |
20296 | packuswb m3, m5 | |
20297 | movu [r0 + 622 * 16], m3 | |
20298 | pmaddubsw m3, m1, [r5 + 16 * 16] | |
20299 | pmulhrsw m3, m7 | |
20300 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
20301 | pmulhrsw m5, m7 | |
20302 | packuswb m3, m5 | |
20303 | movu [r0 + 623 * 16], m3 | |
20304 | ||
20305 | ; mode 11 [row 24] | |
20306 | pmaddubsw m3, m0, [r5 + 14 * 16] | |
20307 | pmulhrsw m3, m7 | |
20308 | pmaddubsw m5, m2, [r5 + 14 * 16] | |
20309 | pmulhrsw m5, m7 | |
20310 | packuswb m3, m5 | |
20311 | movu [r0 + 624 * 16], m3 | |
20312 | pmaddubsw m3, m1, [r5 + 14 * 16] | |
20313 | pmulhrsw m3, m7 | |
20314 | pmaddubsw m5, m4, [r5 + 14 * 16] | |
20315 | pmulhrsw m5, m7 | |
20316 | packuswb m3, m5 | |
20317 | movu [r0 + 625 * 16], m3 | |
20318 | ||
20319 | ; mode 11 [row 25] | |
20320 | pmaddubsw m3, m0, [r5 + 12 * 16] | |
20321 | pmulhrsw m3, m7 | |
20322 | pmaddubsw m5, m2, [r5 + 12 * 16] | |
20323 | pmulhrsw m5, m7 | |
20324 | packuswb m3, m5 | |
20325 | movu [r0 + 626 * 16], m3 | |
20326 | pmaddubsw m3, m1, [r5 + 12 * 16] | |
20327 | pmulhrsw m3, m7 | |
20328 | pmaddubsw m5, m4, [r5 + 12 * 16] | |
20329 | pmulhrsw m5, m7 | |
20330 | packuswb m3, m5 | |
20331 | movu [r0 + 627 * 16], m3 | |
20332 | ||
20333 | ; mode 11 [row 26] | |
20334 | pmaddubsw m3, m0, [r5 + 10 * 16] | |
20335 | pmulhrsw m3, m7 | |
20336 | pmaddubsw m5, m2, [r5 + 10 * 16] | |
20337 | pmulhrsw m5, m7 | |
20338 | packuswb m3, m5 | |
20339 | movu [r0 + 628 * 16], m3 | |
20340 | pmaddubsw m3, m1, [r5 + 10 * 16] | |
20341 | pmulhrsw m3, m7 | |
20342 | pmaddubsw m5, m4, [r5 + 10 * 16] | |
20343 | pmulhrsw m5, m7 | |
20344 | packuswb m3, m5 | |
20345 | movu [r0 + 629 * 16], m3 | |
20346 | ||
20347 | ; mode 11 [row 27] | |
20348 | pmaddubsw m3, m0, [r5 + 8 * 16] | |
20349 | pmulhrsw m3, m7 | |
20350 | pmaddubsw m5, m2, [r5 + 8 * 16] | |
20351 | pmulhrsw m5, m7 | |
20352 | packuswb m3, m5 | |
20353 | movu [r0 + 630 * 16], m3 | |
20354 | pmaddubsw m3, m1, [r5 + 8 * 16] | |
20355 | pmulhrsw m3, m7 | |
20356 | pmaddubsw m5, m4, [r5 + 8 * 16] | |
20357 | pmulhrsw m5, m7 | |
20358 | packuswb m3, m5 | |
20359 | movu [r0 + 631 * 16], m3 | |
20360 | ||
20361 | ; mode 11 [row 28] | |
20362 | pmaddubsw m3, m0, [r5 + 6 * 16] | |
20363 | pmulhrsw m3, m7 | |
20364 | pmaddubsw m5, m2, [r5 + 6 * 16] | |
20365 | pmulhrsw m5, m7 | |
20366 | packuswb m3, m5 | |
20367 | movu [r0 + 632 * 16], m3 | |
20368 | pmaddubsw m3, m1, [r5 + 6 * 16] | |
20369 | pmulhrsw m3, m7 | |
20370 | pmaddubsw m5, m4, [r5 + 6 * 16] | |
20371 | pmulhrsw m5, m7 | |
20372 | packuswb m3, m5 | |
20373 | movu [r0 + 633 * 16], m3 | |
20374 | ||
20375 | ; mode 11 [row 29] | |
20376 | pmaddubsw m3, m0, [r5 + 4 * 16] | |
20377 | pmulhrsw m3, m7 | |
20378 | pmaddubsw m5, m2, [r5 + 4 * 16] | |
20379 | pmulhrsw m5, m7 | |
20380 | packuswb m3, m5 | |
20381 | movu [r0 + 634 * 16], m3 | |
20382 | pmaddubsw m3, m1, [r5 + 4 * 16] | |
20383 | pmulhrsw m3, m7 | |
20384 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
20385 | pmulhrsw m5, m7 | |
20386 | packuswb m3, m5 | |
20387 | movu [r0 + 635 * 16], m3 | |
20388 | ||
20389 | ; mode 11 [row 30] | |
20390 | pmaddubsw m3, m0, [r5 + 2 * 16] | |
20391 | pmulhrsw m3, m7 | |
20392 | pmaddubsw m5, m2, [r5 + 2 * 16] | |
20393 | pmulhrsw m5, m7 | |
20394 | packuswb m3, m5 | |
20395 | movu [r0 + 636 * 16], m3 | |
20396 | pmaddubsw m3, m1, [r5 + 2 * 16] | |
20397 | pmulhrsw m3, m7 | |
20398 | pmaddubsw m5, m4, [r5 + 2 * 16] | |
20399 | pmulhrsw m5, m7 | |
20400 | packuswb m3, m5 | |
20401 | movu [r0 + 637 * 16], m3 | |
20402 | ||
20403 | ; mode 12 [row 6] | |
20404 | pinsrb m0, [r3 + 6], 0 | |
20405 | pmaddubsw m3, m0, [r5 + 29 * 16] | |
20406 | pmulhrsw m3, m7 | |
20407 | pmaddubsw m5, m2, [r5 + 29 * 16] | |
20408 | pmulhrsw m5, m7 | |
20409 | packuswb m3, m5 | |
20410 | movu [r0 + 652 * 16], m3 | |
20411 | pmaddubsw m3, m1, [r5 + 29 * 16] | |
20412 | pmulhrsw m3, m7 | |
20413 | pmaddubsw m5, m4, [r5 + 29 * 16] | |
20414 | pmulhrsw m5, m7 | |
20415 | packuswb m3, m5 | |
20416 | movu [r0 + 653 * 16], m3 | |
20417 | ||
20418 | ; mode 12 [row 7] | |
20419 | pmaddubsw m3, m0, [r5 + 24 * 16] | |
20420 | pmulhrsw m3, m7 | |
20421 | pmaddubsw m5, m2, [r5 + 24 * 16] | |
20422 | pmulhrsw m5, m7 | |
20423 | packuswb m3, m5 | |
20424 | movu [r0 + 654 * 16], m3 | |
20425 | pmaddubsw m3, m1, [r5 + 24 * 16] | |
20426 | pmulhrsw m3, m7 | |
20427 | pmaddubsw m5, m4, [r5 + 24 * 16] | |
20428 | pmulhrsw m5, m7 | |
20429 | packuswb m3, m5 | |
20430 | movu [r0 + 655 * 16], m3 | |
20431 | ||
20432 | ; mode 12 [row 8] | |
20433 | pmaddubsw m3, m0, [r5 + 19 * 16] | |
20434 | pmulhrsw m3, m7 | |
20435 | pmaddubsw m5, m2, [r5 + 19 * 16] | |
20436 | pmulhrsw m5, m7 | |
20437 | packuswb m3, m5 | |
20438 | movu [r0 + 656 * 16], m3 | |
20439 | pmaddubsw m3, m1, [r5 + 19 * 16] | |
20440 | pmulhrsw m3, m7 | |
20441 | pmaddubsw m5, m4, [r5 + 19 * 16] | |
20442 | pmulhrsw m5, m7 | |
20443 | packuswb m3, m5 | |
20444 | movu [r0 + 657 * 16], m3 | |
20445 | ||
20446 | ; mode 12 [row 9] | |
20447 | pmaddubsw m3, m0, [r5 + 14 * 16] | |
20448 | pmulhrsw m3, m7 | |
20449 | pmaddubsw m5, m2, [r5 + 14 * 16] | |
20450 | pmulhrsw m5, m7 | |
20451 | packuswb m3, m5 | |
20452 | movu [r0 + 658 * 16], m3 | |
20453 | pmaddubsw m3, m1, [r5 + 14 * 16] | |
20454 | pmulhrsw m3, m7 | |
20455 | pmaddubsw m5, m4, [r5 + 14 * 16] | |
20456 | pmulhrsw m5, m7 | |
20457 | packuswb m3, m5 | |
20458 | movu [r0 + 659 * 16], m3 | |
20459 | ||
20460 | ; mode 12 [row 10] | |
20461 | pmaddubsw m3, m0, [r5 + 9 * 16] | |
20462 | pmulhrsw m3, m7 | |
20463 | pmaddubsw m5, m2, [r5 + 9 * 16] | |
20464 | pmulhrsw m5, m7 | |
20465 | packuswb m3, m5 | |
20466 | movu [r0 + 660 * 16], m3 | |
20467 | pmaddubsw m3, m1, [r5 + 9 * 16] | |
20468 | pmulhrsw m3, m7 | |
20469 | pmaddubsw m5, m4, [r5 + 9 * 16] | |
20470 | pmulhrsw m5, m7 | |
20471 | packuswb m3, m5 | |
20472 | movu [r0 + 661 * 16], m3 | |
20473 | ||
20474 | ; mode 12 [row 11] | |
20475 | pmaddubsw m3, m0, [r5 + 4 * 16] | |
20476 | pmulhrsw m3, m7 | |
20477 | pmaddubsw m5, m2, [r5 + 4 * 16] | |
20478 | pmulhrsw m5, m7 | |
20479 | packuswb m3, m5 | |
20480 | movu [r0 + 662 * 16], m3 | |
20481 | pmaddubsw m3, m1, [r5 + 4 * 16] | |
20482 | pmulhrsw m3, m7 | |
20483 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
20484 | pmulhrsw m5, m7 | |
20485 | packuswb m3, m5 | |
20486 | movu [r0 + 663 * 16], m3 | |
20487 | ||
20488 | ; mode 13 [row 3] | |
20489 | movu m6, m0 | |
20490 | pinsrb m6, [r3 + 4], 0 | |
20491 | pmaddubsw m3, m6, [r5 + 28 * 16] | |
20492 | pmulhrsw m3, m7 | |
20493 | pmaddubsw m5, m2, [r5 + 28 * 16] | |
20494 | pmulhrsw m5, m7 | |
20495 | packuswb m3, m5 | |
20496 | movu [r0 + 710 * 16], m3 | |
20497 | pmaddubsw m3, m1, [r5 + 28 * 16] | |
20498 | pmulhrsw m3, m7 | |
20499 | pmaddubsw m5, m4, [r5 + 28 * 16] | |
20500 | pmulhrsw m5, m7 | |
20501 | packuswb m3, m5 | |
20502 | movu [r0 + 711 * 16], m3 | |
20503 | ||
20504 | ; mode 13 [row 4] | |
20505 | pmaddubsw m3, m6, [r5 + 19 * 16] | |
20506 | pmulhrsw m3, m7 | |
20507 | pmaddubsw m5, m2, [r5 + 19 * 16] | |
20508 | pmulhrsw m5, m7 | |
20509 | packuswb m3, m5 | |
20510 | movu [r0 + 712 * 16], m3 | |
20511 | pmaddubsw m3, m1, [r5 + 19 * 16] | |
20512 | pmulhrsw m3, m7 | |
20513 | pmaddubsw m5, m4, [r5 + 19 * 16] | |
20514 | pmulhrsw m5, m7 | |
20515 | packuswb m3, m5 | |
20516 | movu [r0 + 713 * 16], m3 | |
20517 | ||
20518 | ; mode 13 [row 5] | |
20519 | pmaddubsw m3, m6, [r5 + 10 * 16] | |
20520 | pmulhrsw m3, m7 | |
20521 | pmaddubsw m5, m2, [r5 + 10 * 16] | |
20522 | pmulhrsw m5, m7 | |
20523 | packuswb m3, m5 | |
20524 | movu [r0 + 714 * 16], m3 | |
20525 | pmaddubsw m3, m1, [r5 + 10 * 16] | |
20526 | pmulhrsw m3, m7 | |
20527 | pmaddubsw m5, m4, [r5 + 10 * 16] | |
20528 | pmulhrsw m5, m7 | |
20529 | packuswb m3, m5 | |
20530 | movu [r0 + 715 * 16], m3 | |
20531 | ||
20532 | ; mode 13 [row 6] | |
20533 | pmaddubsw m3, m6, [r5 + 1 * 16] | |
20534 | pmulhrsw m3, m7 | |
20535 | pmaddubsw m5, m2, [r5 + 1 * 16] | |
20536 | pmulhrsw m5, m7 | |
20537 | packuswb m3, m5 | |
20538 | movu [r0 + 716 * 16], m3 | |
20539 | pmaddubsw m3, m1, [r5 + 1 * 16] | |
20540 | pmulhrsw m3, m7 | |
20541 | pmaddubsw m5, m4, [r5 + 1 * 16] | |
20542 | pmulhrsw m5, m7 | |
20543 | packuswb m3, m5 | |
20544 | movu [r0 + 717 * 16], m3 | |
20545 | ||
20546 | ; mode 14 [row 2]: m6 = copy of m0 with byte 0 set from r3 + 2 and byte 1 re-inserted from r4 + 0 (the same source byte used for mode 11 row 16); m2, m1, m4 are used as they stand; weights at r5 + 25 * 16 | |
20547 | movu m6, m0 | |
20548 | pinsrb m6, [r4 + 0], 1 | |
20549 | pinsrb m6, [r3 + 2], 0 | |
20550 | pmaddubsw m3, m6, [r5 + 25 * 16] | |
20551 | pmulhrsw m3, m7 | |
20552 | pmaddubsw m5, m2, [r5 + 25 * 16] | |
20553 | pmulhrsw m5, m7 | |
20554 | packuswb m3, m5 | |
20555 | movu [r0 + 772 * 16], m3 | |
20556 | pmaddubsw m3, m1, [r5 + 25 * 16] | |
20557 | pmulhrsw m3, m7 | |
20558 | pmaddubsw m5, m4, [r5 + 25 * 16] | |
20559 | pmulhrsw m5, m7 | |
20560 | packuswb m3, m5 | |
20561 | movu [r0 + 773 * 16], m3 | |
20562 | ||
20563 | ; mode 14 [row 3] | |
20564 | pmaddubsw m3, m6, [r5 + 12 * 16] | |
20565 | pmulhrsw m3, m7 | |
20566 | pmaddubsw m5, m2, [r5 + 12 * 16] | |
20567 | pmulhrsw m5, m7 | |
20568 | packuswb m3, m5 | |
20569 | movu [r0 + 774 * 16], m3 | |
20570 | pmaddubsw m3, m1, [r5 + 12 * 16] | |
20571 | pmulhrsw m3, m7 | |
20572 | pmaddubsw m5, m4, [r5 + 12 * 16] | |
20573 | pmulhrsw m5, m7 | |
20574 | packuswb m3, m5 | |
20575 | movu [r0 + 775 * 16], m3 | |
20576 | ||
20577 | ; mode 15 [row 1] | |
20578 | pmaddubsw m3, m6, [r5 + 30 * 16] | |
20579 | pmulhrsw m3, m7 | |
20580 | pmaddubsw m5, m2, [r5 + 30 * 16] | |
20581 | pmulhrsw m5, m7 | |
20582 | packuswb m3, m5 | |
20583 | movu [r0 + 834 * 16], m3 | |
20584 | pmaddubsw m3, m1, [r5 + 30 * 16] | |
20585 | pmulhrsw m3, m7 | |
20586 | pmaddubsw m5, m4, [r5 + 30 * 16] | |
20587 | pmulhrsw m5, m7 | |
20588 | packuswb m3, m5 | |
20589 | movu [r0 + 835 * 16], m3 | |
20590 | ||
20591 | ; mode 15 [row 2] | |
20592 | pmaddubsw m3, m6, [r5 + 13 * 16] | |
20593 | pmulhrsw m3, m7 | |
20594 | pmaddubsw m5, m2, [r5 + 13 * 16] | |
20595 | pmulhrsw m5, m7 | |
20596 | packuswb m3, m5 | |
20597 | movu [r0 + 836 * 16], m3 | |
20598 | pmaddubsw m3, m1, [r5 + 13 * 16] | |
20599 | pmulhrsw m3, m7 | |
20600 | pmaddubsw m5, m4, [r5 + 13 * 16] | |
20601 | pmulhrsw m5, m7 | |
20602 | packuswb m3, m5 | |
20603 | movu [r0 + 837 * 16], m3 | |
20604 | ||
20605 | ; mode 15 [row 3]: m6, m2, m1, m4 each move up one more byte pair; m6's new low pair is [r3 + 4]/[r3 + 2], the others refill from r4 (+6/+7, +14/+15, +22/+23); weights at r5 + 28 * 16 | |
20606 | pslldq m6, 2 | |
20607 | pinsrb m6, [r3 + 2], 1 | |
20608 | pinsrb m6, [r3 + 4], 0 | |
20609 | pmaddubsw m3, m6, [r5 + 28 * 16] | |
20610 | pmulhrsw m3, m7 | |
20611 | pslldq m2, 2 | |
20612 | pinsrb m2, [r4 + 7], 1 | |
20613 | pinsrb m2, [r4 + 6], 0 | |
20614 | pmaddubsw m5, m2, [r5 + 28 * 16] | |
20615 | pmulhrsw m5, m7 | |
20616 | packuswb m3, m5 | |
20617 | movu [r0 + 838 * 16], m3 | |
20618 | pslldq m1, 2 | |
20619 | pinsrb m1, [r4 + 15], 1 | |
20620 | pinsrb m1, [r4 + 14], 0 | |
20621 | pmaddubsw m3, m1, [r5 + 28 * 16] | |
20622 | pmulhrsw m3, m7 | |
20623 | pslldq m4, 2 | |
20624 | pinsrb m4, [r4 + 23], 1 | |
20625 | pinsrb m4, [r4 + 22], 0 | |
20626 | pmaddubsw m5, m4, [r5 + 28 * 16] | |
20627 | pmulhrsw m5, m7 | |
20628 | packuswb m3, m5 | |
20629 | movu [r0 + 839 * 16], m3 | |
20630 | ||
20631 | ; mode 15 [row 4] | |
20632 | pmaddubsw m3, m6, [r5 + 11 * 16] | |
20633 | pmulhrsw m3, m7 | |
20634 | pmaddubsw m5, m2, [r5 + 11 * 16] | |
20635 | pmulhrsw m5, m7 | |
20636 | packuswb m3, m5 | |
20637 | movu [r0 + 840 * 16], m3 | |
20638 | pmaddubsw m3, m1, [r5 + 11 * 16] | |
20639 | pmulhrsw m3, m7 | |
20640 | pmaddubsw m5, m4, [r5 + 11 * 16] | |
20641 | pmulhrsw m5, m7 | |
20642 | packuswb m3, m5 | |
20643 | movu [r0 + 841 * 16], m3 | |
20644 | ||
20645 | ; mode 15 [row 5, 0-7]: only m6 is advanced here (new low pair [r3 + 6]/[r3 + 4]); weights at r5 + 26 * 16; just the low 8 packed bytes go to slot 842 via movh | |
20646 | pslldq m6, 2 | |
20647 | pinsrb m6, [r3 + 4], 1 | |
20648 | pinsrb m6, [r3 + 6], 0 | |
20649 | pmaddubsw m3, m6, [r5 + 26 * 16] | |
20650 | pmulhrsw m3, m7 | |
20651 | packuswb m3, m3 | |
20652 | movh [r0 + 842 * 16], m3 | |
20653 | ||
20654 | ; mode 15 [row 6, 0-7]: same m6 pairs as row 5, weights at r5 + 9 * 16; low 8 packed bytes go to slot 844 | |
20655 | pmaddubsw m3, m6, [r5 + 9 * 16] | |
20656 | pmulhrsw m3, m7 | |
20657 | packuswb m3, m3 | |
20658 | movh [r0 + 844 * 16], m3 | |
20659 | ||
20660 | ; mode 15 [row 7, 0-7] | |
20661 | pslldq m6, 2 | |
20662 | pinsrb m6, [r3 + 6], 1 | |
20663 | pinsrb m6, [r3 + 8], 0 | |
20664 | pmaddubsw m3, m6, [r5 + 24 * 16] | |
20665 | pmulhrsw m3, m7 | |
20666 | packuswb m3, m3 | |
20667 | movh [r0 + 846 * 16], m3 | |
20668 | ||
20669 | ; mode 15 [row 8, 0-7] | |
20670 | pmaddubsw m3, m6, [r5 + 7 * 16] | |
20671 | pmulhrsw m3, m7 | |
20672 | packuswb m3, m3 | |
20673 | movh [r0 + 848 * 16], m3 | |
20674 | ||
20675 | ; mode 15 [row 9, 0-7] | |
20676 | pslldq m6, 2 | |
20677 | pinsrb m6, [r3 + 8], 1 | |
20678 | pinsrb m6, [r3 + 9], 0 | |
20679 | pmaddubsw m3, m6, [r5 + 22 * 16] | |
20680 | pmulhrsw m3, m7 | |
20681 | packuswb m3, m3 | |
20682 | movh [r0 + 850 * 16], m3 | |
20683 | ||
20684 | ; mode 15 [row 10, 0-7] | |
20685 | pmaddubsw m3, m6, [r5 + 5 * 16] | |
20686 | pmulhrsw m3, m7 | |
20687 | packuswb m3, m3 | |
20688 | movh [r0 + 852 * 16], m3 | |
20689 | ||
20690 | ; mode 15 [row 11, 0-7] | |
20691 | pslldq m6, 2 | |
20692 | pinsrb m6, [r3 + 9], 1 | |
20693 | pinsrb m6, [r3 + 11], 0 | |
20694 | pmaddubsw m3, m6, [r5 + 20 * 16] | |
20695 | pmulhrsw m3, m7 | |
20696 | packuswb m3, m3 | |
20697 | movh [r0 + 854 * 16], m3 | |
20698 | ||
20699 | ; mode 15 [row 12, 0-7] | |
20700 | pmaddubsw m3, m6, [r5 + 3 * 16] | |
20701 | pmulhrsw m3, m7 | |
20702 | packuswb m3, m3 | |
20703 | movh [r0 + 856 * 16], m3 | |
20704 | ||
20705 | ; mode 15 [row 13, 0-7] | |
20706 | pslldq m6, 2 | |
20707 | pinsrb m6, [r3 + 11], 1 | |
20708 | pinsrb m6, [r3 + 13], 0 | |
20709 | pmaddubsw m3, m6, [r5 + 18 * 16] | |
20710 | pmulhrsw m3, m7 | |
20711 | packuswb m3, m3 | |
20712 | movh [r0 + 858 * 16], m3 | |
20713 | ||
20714 | ; mode 15 [row 14, 0-7] | |
20715 | pmaddubsw m3, m6, [r5 + 1 * 16] | |
20716 | pmulhrsw m3, m7 | |
20717 | packuswb m3, m3 | |
20718 | movh [r0 + 860 * 16], m3 | |
20719 | ||
20720 | ; mode 15 [row 15, 0-7] | |
20721 | pslldq m6, 2 | |
20722 | pinsrb m6, [r3 + 13], 1 | |
20723 | pinsrb m6, [r3 + 15], 0 | |
20724 | pmaddubsw m3, m6, [r5 + 16 * 16] | |
20725 | pmulhrsw m3, m7 | |
20726 | packuswb m3, m3 | |
20727 | movh [r0 + 862 * 16], m3 | |
20728 | ||
20729 | ; mode 15 [row 16, 0-7] | |
20730 | pslldq m6, 2 | |
20731 | pinsrb m6, [r3 + 15], 1 | |
20732 | pinsrb m6, [r3 + 17], 0 | |
20733 | pmaddubsw m3, m6, [r5 + 31 * 16] | |
20734 | pmulhrsw m3, m7 | |
20735 | packuswb m3, m3 | |
20736 | movh [r0 + 864 * 16], m3 | |
20737 | ||
20738 | ; mode 15 [row 17, 0-7] | |
20739 | pmaddubsw m3, m6, [r5 + 14 * 16] | |
20740 | pmulhrsw m3, m7 | |
20741 | packuswb m3, m3 | |
20742 | movh [r0 + 866 * 16], m3 | |
20743 | ||
20744 | ; mode 15 [row 18, 0-7] | |
20745 | pslldq m6, 2 | |
20746 | pinsrb m6, [r3 + 17], 1 | |
20747 | pinsrb m6, [r3 + 19], 0 | |
20748 | pmaddubsw m3, m6, [r5 + 29 * 16] | |
20749 | pmulhrsw m3, m7 | |
20750 | packuswb m3, m3 | |
20751 | movh [r0 + 868 * 16], m3 | |
20752 | ||
20753 | ; mode 15 [row 19, 0-7] | |
20754 | pmaddubsw m3, m6, [r5 + 12 * 16] | |
20755 | pmulhrsw m3, m7 | |
20756 | packuswb m3, m3 | |
20757 | movh [r0 + 870 * 16], m3 | |
20758 | ||
20759 | ; mode 15 [row 20, 0-7] | |
20760 | pslldq m6, 2 | |
20761 | pinsrb m6, [r3 + 19], 1 | |
20762 | pinsrb m6, [r3 + 21], 0 | |
20763 | pmaddubsw m3, m6, [r5 + 27 * 16] | |
20764 | pmulhrsw m3, m7 | |
20765 | packuswb m3, m3 | |
20766 | movh [r0 + 872 * 16], m3 | |
20767 | ||
20768 | ; mode 15 [row 21, 0-7] | |
20769 | pmaddubsw m3, m6, [r5 + 10 * 16] | |
20770 | pmulhrsw m3, m7 | |
20771 | packuswb m3, m3 | |
20772 | movh [r0 + 874 * 16], m3 | |
20773 | ||
20774 | ; mode 15 [row 22, 0-7] | |
20775 | pslldq m6, 2 | |
20776 | pinsrb m6, [r3 + 21], 1 | |
20777 | pinsrb m6, [r3 + 23], 0 | |
20778 | pmaddubsw m3, m6, [r5 + 25 * 16] | |
20779 | pmulhrsw m3, m7 | |
20780 | packuswb m3, m3 | |
20781 | movh [r0 + 876 * 16], m3 | |
20782 | ||
20783 | ; mode 15 [row 23, 0-7] | |
20784 | pmaddubsw m3, m6, [r5 + 8 * 16] | |
20785 | pmulhrsw m3, m7 | |
20786 | packuswb m3, m3 | |
20787 | movh [r0 + 878 * 16], m3 | |
20788 | ||
20789 | ; mode 15 [row 24, 0-7] | |
20790 | pslldq m6, 2 | |
20791 | pinsrb m6, [r3 + 23], 1 | |
20792 | pinsrb m6, [r3 + 24], 0 | |
20793 | pmaddubsw m3, m6, [r5 + 23 * 16] | |
20794 | pmulhrsw m3, m7 | |
20795 | packuswb m3, m3 | |
20796 | movh [r0 + 880 * 16], m3 | |
20797 | ||
20798 | ; mode 15 [row 25, 0-7] | |
20799 | pmaddubsw m3, m6, [r5 + 6 * 16] | |
20800 | pmulhrsw m3, m7 | |
20801 | packuswb m3, m3 | |
20802 | movh [r0 + 882 * 16], m3 | |
20803 | ||
20804 | ; mode 15 [row 26, 0-7] | |
20805 | pslldq m6, 2 | |
20806 | pinsrb m6, [r3 + 24], 1 | |
20807 | pinsrb m6, [r3 + 26], 0 | |
20808 | pmaddubsw m3, m6, [r5 + 21 * 16] | |
20809 | pmulhrsw m3, m7 | |
20810 | packuswb m3, m3 | |
20811 | movh [r0 + 884 * 16], m3 | |
20812 | ||
20813 | ; mode 15 [row 27, 0-7] | |
20814 | pmaddubsw m3, m6, [r5 + 4 * 16] | |
20815 | pmulhrsw m3, m7 | |
20816 | packuswb m3, m3 | |
20817 | movh [r0 + 886 * 16], m3 | |
20818 | ||
20819 | ; mode 15 [row 28, 0-7] | |
20820 | pslldq m6, 2 | |
20821 | pinsrb m6, [r3 + 26], 1 | |
20822 | pinsrb m6, [r3 + 28], 0 | |
20823 | pmaddubsw m3, m6, [r5 + 19 * 16] | |
20824 | pmulhrsw m3, m7 | |
20825 | packuswb m3, m3 | |
20826 | movh [r0 + 888 * 16], m3 | |
20827 | ||
20828 | ; mode 15 [row 29, 0-7] | |
20829 | pmaddubsw m3, m6, [r5 + 2 * 16] | |
20830 | pmulhrsw m3, m7 | |
20831 | packuswb m3, m3 | |
20832 | movh [r0 + 890 * 16], m3 | |
20833 | ||
20834 | ; mode 15 [row 30, 0-7] | |
20835 | pslldq m6, 2 | |
20836 | pinsrb m6, [r3 + 28], 1 | |
20837 | pinsrb m6, [r3 + 30], 0 | |
20838 | pmaddubsw m3, m6, [r5 + 17 * 16] | |
20839 | pmulhrsw m3, m7 | |
20840 | packuswb m3, m3 | |
20841 | movh [r0 + 892 * 16], m3 | |
20842 | ||
20843 | ; mode 15 [row 31, 0-7] | |
20844 | pshufb m3, m6, [tab_S2] | |
20845 | movh [r0 + 894 * 16], m3 | |
20846 | ||
20847 | ; mode 12 [row 12] | |
20848 | pslldq m0, 2 | |
20849 | pinsrb m0, [r3 + 6], 1 | |
20850 | pinsrb m0, [r3 + 13], 0 | |
20851 | pmaddubsw m3, m0, [r5 + 31 * 16] | |
20852 | pmulhrsw m3, m7 | |
20853 | pmaddubsw m5, m2, [r5 + 31 * 16] | |
20854 | pmulhrsw m5, m7 | |
20855 | packuswb m3, m5 | |
20856 | movu [r0 + 664 * 16], m3 | |
20857 | pmaddubsw m3, m1, [r5 + 31 * 16] | |
20858 | pmulhrsw m3, m7 | |
20859 | pmaddubsw m5, m4, [r5 + 31 * 16] | |
20860 | pmulhrsw m5, m7 | |
20861 | packuswb m3, m5 | |
20862 | movu [r0 + 665 * 16], m3 | |
20863 | ||
20864 | ; mode 12 [row 13] | |
20865 | pmaddubsw m3, m0, [r5 + 26 * 16] | |
20866 | pmulhrsw m3, m7 | |
20867 | pmaddubsw m5, m2, [r5 + 26 * 16] | |
20868 | pmulhrsw m5, m7 | |
20869 | packuswb m3, m5 | |
20870 | movu [r0 + 666 * 16], m3 | |
20871 | pmaddubsw m3, m1, [r5 + 26 * 16] | |
20872 | pmulhrsw m3, m7 | |
20873 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
20874 | pmulhrsw m5, m7 | |
20875 | packuswb m3, m5 | |
20876 | movu [r0 + 667 * 16], m3 | |
20877 | ||
20878 | ; mode 12 [row 14] | |
20879 | pmaddubsw m3, m0, [r5 + 21 * 16] | |
20880 | pmulhrsw m3, m7 | |
20881 | pmaddubsw m5, m2, [r5 + 21 * 16] | |
20882 | pmulhrsw m5, m7 | |
20883 | packuswb m3, m5 | |
20884 | movu [r0 + 668 * 16], m3 | |
20885 | pmaddubsw m3, m1, [r5 + 21 * 16] | |
20886 | pmulhrsw m3, m7 | |
20887 | pmaddubsw m5, m4, [r5 + 21 * 16] | |
20888 | pmulhrsw m5, m7 | |
20889 | packuswb m3, m5 | |
20890 | movu [r0 + 669 * 16], m3 | |
20891 | ||
20892 | ; mode 12 [row 15] | |
20893 | pmaddubsw m3, m0, [r5 + 16 * 16] | |
20894 | pmulhrsw m3, m7 | |
20895 | pmaddubsw m5, m2, [r5 + 16 * 16] | |
20896 | pmulhrsw m5, m7 | |
20897 | packuswb m3, m5 | |
20898 | movu [r0 + 670 * 16], m3 | |
20899 | pmaddubsw m3, m1, [r5 + 16 * 16] | |
20900 | pmulhrsw m3, m7 | |
20901 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
20902 | pmulhrsw m5, m7 | |
20903 | packuswb m3, m5 | |
20904 | movu [r0 + 671 * 16], m3 | |
20905 | ||
20906 | ; mode 12 [row 16] | |
20907 | pmaddubsw m3, m0, [r5 + 11 * 16] | |
20908 | pmulhrsw m3, m7 | |
20909 | pmaddubsw m5, m2, [r5 + 11 * 16] | |
20910 | pmulhrsw m5, m7 | |
20911 | packuswb m3, m5 | |
20912 | movu [r0 + 672 * 16], m3 | |
20913 | pmaddubsw m3, m1, [r5 + 11 * 16] | |
20914 | pmulhrsw m3, m7 | |
20915 | pmaddubsw m5, m4, [r5 + 11 * 16] | |
20916 | pmulhrsw m5, m7 | |
20917 | packuswb m3, m5 | |
20918 | movu [r0 + 673 * 16], m3 | |
20919 | ||
20920 | ; mode 12 [row 17] | |
20921 | pmaddubsw m3, m0, [r5 + 6 * 16] | |
20922 | pmulhrsw m3, m7 | |
20923 | pmaddubsw m5, m2, [r5 + 6 * 16] | |
20924 | pmulhrsw m5, m7 | |
20925 | packuswb m3, m5 | |
20926 | movu [r0 + 674 * 16], m3 | |
20927 | pmaddubsw m3, m1, [r5 + 6 * 16] | |
20928 | pmulhrsw m3, m7 | |
20929 | pmaddubsw m5, m4, [r5 + 6 * 16] | |
20930 | pmulhrsw m5, m7 | |
20931 | packuswb m3, m5 | |
20932 | movu [r0 + 675 * 16], m3 | |
20933 | ||
20934 | ; mode 12 [row 18] | |
20935 | pmaddubsw m3, m0, [r5 + 1 * 16] | |
20936 | pmulhrsw m3, m7 | |
20937 | pmaddubsw m5, m2, [r5 + 1 * 16] | |
20938 | pmulhrsw m5, m7 | |
20939 | packuswb m3, m5 | |
20940 | movu [r0 + 676 * 16], m3 | |
20941 | pmaddubsw m3, m1, [r5 + 1 * 16] | |
20942 | pmulhrsw m3, m7 | |
20943 | pmaddubsw m5, m4, [r5 + 1 * 16] | |
20944 | pmulhrsw m5, m7 | |
20945 | packuswb m3, m5 | |
20946 | movu [r0 + 677 * 16], m3 | |
20947 | ||
20948 | ; mode 13 [row 7] | |
20949 | movu m6, m0 | |
20950 | pinsrb m6, [r3 + 4], 2 | |
20951 | pinsrb m6, [r3 + 4], 1 | |
20952 | pinsrb m6, [r3 + 7], 0 | |
20953 | pmaddubsw m3, m6, [r5 + 24 * 16] | |
20954 | pmulhrsw m3, m7 | |
20955 | pmaddubsw m5, m2, [r5 + 24 * 16] | |
20956 | pmulhrsw m5, m7 | |
20957 | packuswb m3, m5 | |
20958 | movu [r0 + 718 * 16], m3 | |
20959 | pmaddubsw m3, m1, [r5 + 24 * 16] | |
20960 | pmulhrsw m3, m7 | |
20961 | pmaddubsw m5, m4, [r5 + 24 * 16] | |
20962 | pmulhrsw m5, m7 | |
20963 | packuswb m3, m5 | |
20964 | movu [r0 + 719 * 16], m3 | |
20965 | ||
20966 | ; mode 13 [row 8] | |
20967 | pmaddubsw m3, m6, [r5 + 15 * 16] | |
20968 | pmulhrsw m3, m7 | |
20969 | pmaddubsw m5, m2, [r5 + 15 * 16] | |
20970 | pmulhrsw m5, m7 | |
20971 | packuswb m3, m5 | |
20972 | movu [r0 + 720 * 16], m3 | |
20973 | pmaddubsw m3, m1, [r5 + 15 * 16] | |
20974 | pmulhrsw m3, m7 | |
20975 | pmaddubsw m5, m4, [r5 + 15 * 16] | |
20976 | pmulhrsw m5, m7 | |
20977 | packuswb m3, m5 | |
20978 | movu [r0 + 721 * 16], m3 | |
20979 | ||
20980 | ; mode 13 [row 9] | |
20981 | pmaddubsw m3, m6, [r5 + 6 * 16] | |
20982 | pmulhrsw m3, m7 | |
20983 | pmaddubsw m5, m2, [r5 + 6 * 16] | |
20984 | pmulhrsw m5, m7 | |
20985 | packuswb m3, m5 | |
20986 | movu [r0 + 722 * 16], m3 | |
20987 | pmaddubsw m3, m1, [r5 + 6 * 16] | |
20988 | pmulhrsw m3, m7 | |
20989 | pmaddubsw m5, m4, [r5 + 6 * 16] | |
20990 | pmulhrsw m5, m7 | |
20991 | packuswb m3, m5 | |
20992 | movu [r0 + 723 * 16], m3 | |
20993 | ||
20994 | ; mode 14 [row 4] | |
20995 | pinsrb m6, [r3 + 2], 2 | |
20996 | pinsrb m6, [r3 + 2], 1 | |
20997 | pinsrb m6, [r3 + 5], 0 | |
20998 | pmaddubsw m3, m6, [r5 + 31 * 16] | |
20999 | pmulhrsw m3, m7 | |
21000 | pmaddubsw m5, m2, [r5 + 31 * 16] | |
21001 | pmulhrsw m5, m7 | |
21002 | packuswb m3, m5 | |
21003 | movu [r0 + 776 * 16], m3 | |
21004 | pmaddubsw m3, m1, [r5 + 31 * 16] | |
21005 | pmulhrsw m3, m7 | |
21006 | pmaddubsw m5, m4, [r5 + 31 * 16] | |
21007 | pmulhrsw m5, m7 | |
21008 | packuswb m3, m5 | |
21009 | movu [r0 + 777 * 16], m3 | |
21010 | ||
21011 | ; mode 14 [row 5] | |
21012 | pmaddubsw m3, m6, [r5 + 18 * 16] | |
21013 | pmulhrsw m3, m7 | |
21014 | pmaddubsw m5, m2, [r5 + 18 * 16] | |
21015 | pmulhrsw m5, m7 | |
21016 | packuswb m3, m5 | |
21017 | movu [r0 + 778 * 16], m3 | |
21018 | pmaddubsw m3, m1, [r5 + 18 * 16] | |
21019 | pmulhrsw m3, m7 | |
21020 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
21021 | pmulhrsw m5, m7 | |
21022 | packuswb m3, m5 | |
21023 | movu [r0 + 779 * 16], m3 | |
21024 | ||
21025 | ; mode 14 [row 6] | |
21026 | pmaddubsw m3, m6, [r5 + 5 * 16] | |
21027 | pmulhrsw m3, m7 | |
21028 | pmaddubsw m5, m2, [r5 + 5 * 16] | |
21029 | pmulhrsw m5, m7 | |
21030 | packuswb m3, m5 | |
21031 | movu [r0 + 780 * 16], m3 | |
21032 | pmaddubsw m3, m1, [r5 + 5 * 16] | |
21033 | pmulhrsw m3, m7 | |
21034 | pmaddubsw m5, m4, [r5 + 5 * 16] | |
21035 | pmulhrsw m5, m7 | |
21036 | packuswb m3, m5 | |
21037 | movu [r0 + 781 * 16], m3 | |
21038 | ||
21039 | ; mode 14 [row 7] | |
21040 | pslldq m6, 2 | |
21041 | pinsrb m6, [r3 + 5], 1 | |
21042 | pinsrb m6, [r3 + 7], 0 | |
21043 | pmaddubsw m3, m6, [r5 + 24 * 16] | |
21044 | pmulhrsw m3, m7 | |
21045 | pslldq m2, 2 | |
21046 | pinsrw m2, [r4 + 5], 0 | |
21047 | pmaddubsw m5, m2, [r5 + 24 * 16] | |
21048 | pmulhrsw m5, m7 | |
21049 | packuswb m3, m5 | |
21050 | movu [r0 + 782 * 16], m3 | |
21051 | pslldq m1, 2 | |
21052 | pinsrw m1, [r4 + 13], 0 | |
21053 | pmaddubsw m3, m1, [r5 + 24 * 16] | |
21054 | pmulhrsw m3, m7 | |
21055 | pslldq m4, 2 | |
21056 | pinsrw m4, [r4 + 21], 0 | |
21057 | pmaddubsw m5, m4, [r5 + 24 * 16] | |
21058 | pmulhrsw m5, m7 | |
21059 | packuswb m3, m5 | |
21060 | movu [r0 + 783 * 16], m3 | |
21061 | ||
21062 | ; mode 14 [row 8] | |
21063 | pmaddubsw m3, m6, [r5 + 11 * 16] | |
21064 | pmulhrsw m3, m7 | |
21065 | pmaddubsw m5, m2, [r5 + 11 * 16] | |
21066 | pmulhrsw m5, m7 | |
21067 | packuswb m3, m5 | |
21068 | movu [r0 + 784 * 16], m3 | |
21069 | pmaddubsw m3, m1, [r5 + 11 * 16] | |
21070 | pmulhrsw m3, m7 | |
21071 | pmaddubsw m5, m4, [r5 + 11 * 16] | |
21072 | pmulhrsw m5, m7 | |
21073 | packuswb m3, m5 | |
21074 | movu [r0 + 785 * 16], m3 | |
21075 | ||
21076 | ; mode 15 [row 5, 8-31] | |
21077 | pmaddubsw m5, m2, [r5 + 26 * 16] | |
21078 | pmulhrsw m5, m7 | |
21079 | packuswb m5, m5 | |
21080 | movh [r0 + 842 * 16 + 8], m5 | |
21081 | pmaddubsw m3, m1, [r5 + 26 * 16] | |
21082 | pmulhrsw m3, m7 | |
21083 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
21084 | pmulhrsw m5, m7 | |
21085 | packuswb m3, m5 | |
21086 | movu [r0 + 843 * 16], m3 | |
21087 | ||
21088 | ; mode 15 [row 6, 8-31] | |
21089 | pmaddubsw m5, m2, [r5 + 9 * 16] | |
21090 | pmulhrsw m5, m7 | |
21091 | packuswb m5, m5 | |
21092 | movh [r0 + 844 * 16 + 8], m5 | |
21093 | pmaddubsw m3, m1, [r5 + 9 * 16] | |
21094 | pmulhrsw m3, m7 | |
21095 | pmaddubsw m5, m4, [r5 + 9 * 16] | |
21096 | pmulhrsw m5, m7 | |
21097 | packuswb m3, m5 | |
21098 | movu [r0 + 845 * 16], m3 | |
21099 | ||
21100 | ; mode 12 [row 19] | |
21101 | pslldq m0, 2 | |
21102 | pinsrb m0, [r3 + 13], 1 | |
21103 | pinsrb m0, [r3 + 19], 0 | |
21104 | pmaddubsw m3, m0, [r5 + 28 * 16] | |
21105 | pmulhrsw m3, m7 | |
21106 | pmaddubsw m5, m2, [r5 + 28 * 16] | |
21107 | pmulhrsw m5, m7 | |
21108 | packuswb m3, m5 | |
21109 | movu [r0 + 678 * 16], m3 | |
21110 | pmaddubsw m3, m1, [r5 + 28 * 16] | |
21111 | pmulhrsw m3, m7 | |
21112 | pmaddubsw m5, m4, [r5 + 28 * 16] | |
21113 | pmulhrsw m5, m7 | |
21114 | packuswb m3, m5 | |
21115 | movu [r0 + 679 * 16], m3 | |
21116 | ||
21117 | ; mode 12 [row 20] | |
21118 | pmaddubsw m3, m0, [r5 + 23 * 16] | |
21119 | pmulhrsw m3, m7 | |
21120 | pmaddubsw m5, m2, [r5 + 23 * 16] | |
21121 | pmulhrsw m5, m7 | |
21122 | packuswb m3, m5 | |
21123 | movu [r0 + 680 * 16], m3 | |
21124 | pmaddubsw m3, m1, [r5 + 23 * 16] | |
21125 | pmulhrsw m3, m7 | |
21126 | pmaddubsw m5, m4, [r5 + 23 * 16] | |
21127 | pmulhrsw m5, m7 | |
21128 | packuswb m3, m5 | |
21129 | movu [r0 + 681 * 16], m3 | |
21130 | ||
21131 | ; mode 12 [row 21] | |
21132 | pmaddubsw m3, m0, [r5 + 18 * 16] | |
21133 | pmulhrsw m3, m7 | |
21134 | pmaddubsw m5, m2, [r5 + 18 * 16] | |
21135 | pmulhrsw m5, m7 | |
21136 | packuswb m3, m5 | |
21137 | movu [r0 + 682 * 16], m3 | |
21138 | pmaddubsw m3, m1, [r5 + 18 * 16] | |
21139 | pmulhrsw m3, m7 | |
21140 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
21141 | pmulhrsw m5, m7 | |
21142 | packuswb m3, m5 | |
21143 | movu [r0 + 683 * 16], m3 | |
21144 | ||
21145 | ; mode 12 [row 22] | |
21146 | pmaddubsw m3, m0, [r5 + 13 * 16] | |
21147 | pmulhrsw m3, m7 | |
21148 | pmaddubsw m5, m2, [r5 + 13 * 16] | |
21149 | pmulhrsw m5, m7 | |
21150 | packuswb m3, m5 | |
21151 | movu [r0 + 684 * 16], m3 | |
21152 | pmaddubsw m3, m1, [r5 + 13 * 16] | |
21153 | pmulhrsw m3, m7 | |
21154 | pmaddubsw m5, m4, [r5 + 13 * 16] | |
21155 | pmulhrsw m5, m7 | |
21156 | packuswb m3, m5 | |
21157 | movu [r0 + 685 * 16], m3 | |
21158 | ||
21159 | ; mode 12 [row 23] | |
21160 | pmaddubsw m3, m0, [r5 + 8 * 16] | |
21161 | pmulhrsw m3, m7 | |
21162 | pmaddubsw m5, m2, [r5 + 8 * 16] | |
21163 | pmulhrsw m5, m7 | |
21164 | packuswb m3, m5 | |
21165 | movu [r0 + 686 * 16], m3 | |
21166 | pmaddubsw m3, m1, [r5 + 8 * 16] | |
21167 | pmulhrsw m3, m7 | |
21168 | pmaddubsw m5, m4, [r5 + 8 * 16] | |
21169 | pmulhrsw m5, m7 | |
21170 | packuswb m3, m5 | |
21171 | movu [r0 + 687 * 16], m3 | |
21172 | ||
21173 | ; mode 12 [row 24] | |
21174 | pmaddubsw m3, m0, [r5 + 3 * 16] | |
21175 | pmulhrsw m3, m7 | |
21176 | pmaddubsw m5, m2, [r5 + 3 * 16] | |
21177 | pmulhrsw m5, m7 | |
21178 | packuswb m3, m5 | |
21179 | movu [r0 + 688 * 16], m3 | |
21180 | pmaddubsw m3, m1, [r5 + 3 * 16] | |
21181 | pmulhrsw m3, m7 | |
21182 | pmaddubsw m5, m4, [r5 + 3 * 16] | |
21183 | pmulhrsw m5, m7 | |
21184 | packuswb m3, m5 | |
21185 | movu [r0 + 689 * 16], m3 | |
21186 | ||
21187 | ; mode 13 [row 10] | |
21188 | movu m7, m6 ; park the current m6 in m7: the mode 14 rows further down keep shifting/patching it there. From here on the rounding multiplier is read from [pw_1024] instead of m7 (NOTE(review): m7 presumably held pw_1024 until now - its load is outside this excerpt) | |
21189 | movu m6, m0 ; start from m0 (last updated for mode 12 row 19) and patch only its low five bytes below | |
21190 | pinsrb m6, [r3 + 4], 4 | |
21191 | pinsrb m6, [r3 + 4], 3 | |
21192 | pinsrb m6, [r3 + 7], 2 | |
21193 | pinsrb m6, [r3 + 7], 1 | |
21194 | pinsrb m6, [r3 + 11], 0 | |
21195 | pmaddubsw m3, m6, [r5 + 29 * 16] ; table entry 29 of r5 is applied to all four source registers of this row | |
21196 | pmulhrsw m3, [pw_1024] | |
21197 | pmaddubsw m5, m2, [r5 + 29 * 16] | |
21198 | pmulhrsw m5, [pw_1024] | |
21199 | packuswb m3, m5 ; m3 = 16 result bytes (from m6 and m2) | |
21200 | movu [r0 + 724 * 16], m3 ; first 16 bytes of the row | |
21201 | pmaddubsw m3, m1, [r5 + 29 * 16] | |
21202 | pmulhrsw m3, [pw_1024] | |
21203 | pmaddubsw m5, m4, [r5 + 29 * 16] | |
21204 | pmulhrsw m5, [pw_1024] | |
21205 | packuswb m3, m5 ; m3 = 16 result bytes (from m1 and m4) | |
21206 | movu [r0 + 725 * 16], m3 ; second 16 bytes of the row | |
21207 | ||
21208 | ; mode 13 [row 11] | |
21209 | pmaddubsw m3, m6, [r5 + 20 * 16] | |
21210 | pmulhrsw m3, [pw_1024] | |
21211 | pmaddubsw m5, m2, [r5 + 20 * 16] | |
21212 | pmulhrsw m5, [pw_1024] | |
21213 | packuswb m3, m5 | |
21214 | movu [r0 + 726 * 16], m3 | |
21215 | pmaddubsw m3, m1, [r5 + 20 * 16] | |
21216 | pmulhrsw m3, [pw_1024] | |
21217 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
21218 | pmulhrsw m5, [pw_1024] | |
21219 | packuswb m3, m5 | |
21220 | movu [r0 + 727 * 16], m3 | |
21221 | ||
21222 | ; mode 13 [row 12] | |
21223 | pmaddubsw m3, m6, [r5 + 11 * 16] | |
21224 | pmulhrsw m3, [pw_1024] | |
21225 | pmaddubsw m5, m2, [r5 + 11 * 16] | |
21226 | pmulhrsw m5, [pw_1024] | |
21227 | packuswb m3, m5 | |
21228 | movu [r0 + 728 * 16], m3 | |
21229 | pmaddubsw m3, m1, [r5 + 11 * 16] | |
21230 | pmulhrsw m3, [pw_1024] | |
21231 | pmaddubsw m5, m4, [r5 + 11 * 16] | |
21232 | pmulhrsw m5, [pw_1024] | |
21233 | packuswb m3, m5 | |
21234 | movu [r0 + 729 * 16], m3 | |
21235 | ||
21236 | ; mode 13 [row 13] | |
21237 | pmaddubsw m3, m6, [r5 + 2 * 16] | |
21238 | pmulhrsw m3, [pw_1024] | |
21239 | pmaddubsw m5, m2, [r5 + 2 * 16] | |
21240 | pmulhrsw m5, [pw_1024] | |
21241 | packuswb m3, m5 | |
21242 | movu [r0 + 730 * 16], m3 | |
21243 | pmaddubsw m3, m1, [r5 + 2 * 16] | |
21244 | pmulhrsw m3, [pw_1024] | |
21245 | pmaddubsw m5, m4, [r5 + 2 * 16] | |
21246 | pmulhrsw m5, [pw_1024] | |
21247 | packuswb m3, m5 | |
21248 | movu [r0 + 731 * 16], m3 | |
21249 | ||
21250 | ; mode 14 [row 9] | |
21251 | pslldq m7, 2 | |
21252 | pinsrb m7, [r3 + 7], 1 | |
21253 | pinsrb m7, [r3 + 10], 0 | |
21254 | pmaddubsw m3, m7, [r5 + 30 * 16] | |
21255 | pmulhrsw m3, [pw_1024] | |
21256 | pslldq m2, 2 | |
21257 | pinsrw m2, [r4 + 4], 0 | |
21258 | pmaddubsw m5, m2, [r5 + 30 * 16] | |
21259 | pmulhrsw m5, [pw_1024] | |
21260 | packuswb m3, m5 | |
21261 | movu [r0 + 786 * 16], m3 | |
21262 | pslldq m1, 2 | |
21263 | pinsrw m1, [r4 + 12], 0 | |
21264 | pmaddubsw m3, m1, [r5 + 30 * 16] | |
21265 | pmulhrsw m3, [pw_1024] | |
21266 | pslldq m4, 2 | |
21267 | pinsrb m4, [r4 + 21], 1 | |
21268 | pinsrb m4, [r4 + 20], 0 | |
21269 | pmaddubsw m5, m4, [r5 + 30 * 16] | |
21270 | pmulhrsw m5, [pw_1024] | |
21271 | packuswb m3, m5 | |
21272 | movu [r0 + 787 * 16], m3 | |
21273 | ||
21274 | ; mode 14 [row 10] | |
21275 | pmaddubsw m3, m7, [r5 + 17 * 16] | |
21276 | pmulhrsw m3, [pw_1024] | |
21277 | pmaddubsw m5, m2, [r5 + 17 * 16] | |
21278 | pmulhrsw m5, [pw_1024] | |
21279 | packuswb m3, m5 | |
21280 | movu [r0 + 788 * 16], m3 | |
21281 | pmaddubsw m3, m1, [r5 + 17 * 16] | |
21282 | pmulhrsw m3, [pw_1024] | |
21283 | pmaddubsw m5, m4, [r5 + 17 * 16] | |
21284 | pmulhrsw m5, [pw_1024] | |
21285 | packuswb m3, m5 | |
21286 | movu [r0 + 789 * 16], m3 | |
21287 | ||
21288 | ; mode 14 [row 11] | |
21289 | pmaddubsw m3, m7, [r5 + 4 * 16] | |
21290 | pmulhrsw m3, [pw_1024] | |
21291 | pmaddubsw m5, m2, [r5 + 4 * 16] | |
21292 | pmulhrsw m5, [pw_1024] | |
21293 | packuswb m3, m5 | |
21294 | movu [r0 + 790 * 16], m3 | |
21295 | pmaddubsw m3, m1, [r5 + 4 * 16] | |
21296 | pmulhrsw m3, [pw_1024] | |
21297 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
21298 | pmulhrsw m5, [pw_1024] | |
21299 | packuswb m3, m5 | |
21300 | movu [r0 + 791 * 16], m3 | |
21301 | ||
21302 | movu m6, [pw_1024] | |
21303 | ||
21304 | ; mode 15 [row 7, 8-31] | |
21305 | pmaddubsw m5, m2, [r5 + 24 * 16] | |
21306 | pmulhrsw m5, m6 | |
21307 | packuswb m5, m5 | |
21308 | movh [r0 + 846 * 16 + 8], m5 | |
21309 | pmaddubsw m3, m1, [r5 + 24 * 16] | |
21310 | pmulhrsw m3, m6 | |
21311 | pmaddubsw m5, m4, [r5 + 24 * 16] | |
21312 | pmulhrsw m5, m6 | |
21313 | packuswb m3, m5 | |
21314 | movu [r0 + 847 * 16], m3 | |
21315 | ||
21316 | ; mode 15 [row 8, 8-31] | |
21317 | pmaddubsw m5, m2, [r5 + 7 * 16] | |
21318 | pmulhrsw m5, m6 | |
21319 | packuswb m5, m5 | |
21320 | movh [r0 + 848 * 16 + 8], m5 | |
21321 | pmaddubsw m3, m1, [r5 + 7 * 16] | |
21322 | pmulhrsw m3, m6 | |
21323 | pmaddubsw m5, m4, [r5 + 7 * 16] | |
21324 | pmulhrsw m5, m6 | |
21325 | packuswb m3, m5 | |
21326 | movu [r0 + 849 * 16], m3 | |
21327 | ||
21328 | ; mode 12 [row 25] | |
21329 | pslldq m0, 2 | |
21330 | pinsrb m0, [r3 + 19], 1 | |
21331 | pinsrb m0, [r3 + 26], 0 | |
21332 | pmaddubsw m3, m0, [r5 + 30 * 16] | |
21333 | pmulhrsw m3, [pw_1024] | |
21334 | pmaddubsw m5, m2, [r5 + 30 * 16] | |
21335 | pmulhrsw m5, [pw_1024] | |
21336 | packuswb m3, m5 | |
21337 | movu [r0 + 690 * 16], m3 | |
21338 | pmaddubsw m3, m1, [r5 + 30 * 16] | |
21339 | pmulhrsw m3, [pw_1024] | |
21340 | pmaddubsw m5, m4, [r5 + 30 * 16] | |
21341 | pmulhrsw m5, [pw_1024] | |
21342 | packuswb m3, m5 | |
21343 | movu [r0 + 691 * 16], m3 | |
21344 | ||
21345 | ; mode 12 [row 26] | |
21346 | pmaddubsw m3, m0, [r5 + 25 * 16] | |
21347 | pmulhrsw m3, [pw_1024] | |
21348 | pmaddubsw m5, m2, [r5 + 25 * 16] | |
21349 | pmulhrsw m5, [pw_1024] | |
21350 | packuswb m3, m5 | |
21351 | movu [r0 + 692 * 16], m3 | |
21352 | pmaddubsw m3, m1, [r5 + 25 * 16] | |
21353 | pmulhrsw m3, [pw_1024] | |
21354 | pmaddubsw m5, m4, [r5 + 25 * 16] | |
21355 | pmulhrsw m5, [pw_1024] | |
21356 | packuswb m3, m5 | |
21357 | movu [r0 + 693 * 16], m3 | |
21358 | ||
21359 | ; mode 12 [row 27] | |
21360 | pmaddubsw m3, m0, [r5 + 20 * 16] | |
21361 | pmulhrsw m3, [pw_1024] | |
21362 | pmaddubsw m5, m2, [r5 + 20 * 16] | |
21363 | pmulhrsw m5, [pw_1024] | |
21364 | packuswb m3, m5 | |
21365 | movu [r0 + 694 * 16], m3 | |
21366 | pmaddubsw m3, m1, [r5 + 20 * 16] | |
21367 | pmulhrsw m3, [pw_1024] | |
21368 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
21369 | pmulhrsw m5, [pw_1024] | |
21370 | packuswb m3, m5 | |
21371 | movu [r0 + 695 * 16], m3 | |
21372 | ||
21373 | ; mode 12 [row 28] | |
21374 | pmaddubsw m3, m0, [r5 + 15 * 16] | |
21375 | pmulhrsw m3, [pw_1024] | |
21376 | pmaddubsw m5, m2, [r5 + 15 * 16] | |
21377 | pmulhrsw m5, [pw_1024] | |
21378 | packuswb m3, m5 | |
21379 | movu [r0 + 696 * 16], m3 | |
21380 | pmaddubsw m3, m1, [r5 + 15 * 16] | |
21381 | pmulhrsw m3, [pw_1024] | |
21382 | pmaddubsw m5, m4, [r5 + 15 * 16] | |
21383 | pmulhrsw m5, [pw_1024] | |
21384 | packuswb m3, m5 | |
21385 | movu [r0 + 697 * 16], m3 | |
21386 | ||
21387 | ; mode 12 [row 29] | |
21388 | pmaddubsw m3, m0, [r5 + 10 * 16] | |
21389 | pmulhrsw m3, [pw_1024] | |
21390 | pmaddubsw m5, m2, [r5 + 10 * 16] | |
21391 | pmulhrsw m5, [pw_1024] | |
21392 | packuswb m3, m5 | |
21393 | movu [r0 + 698 * 16], m3 | |
21394 | pmaddubsw m3, m1, [r5 + 10 * 16] | |
21395 | pmulhrsw m3, [pw_1024] | |
21396 | pmaddubsw m5, m4, [r5 + 10 * 16] | |
21397 | pmulhrsw m5, [pw_1024] | |
21398 | packuswb m3, m5 | |
21399 | movu [r0 + 699 * 16], m3 | |
21400 | ||
21401 | ; mode 12 [row 30] | |
21402 | pmaddubsw m3, m0, [r5 + 5 * 16] | |
21403 | pmulhrsw m3, [pw_1024] | |
21404 | pmaddubsw m5, m2, [r5 + 5 * 16] | |
21405 | pmulhrsw m5, [pw_1024] | |
21406 | packuswb m3, m5 | |
21407 | movu [r0 + 700 * 16], m3 | |
21408 | pmaddubsw m3, m1, [r5 + 5 * 16] | |
21409 | pmulhrsw m3, [pw_1024] | |
21410 | pmaddubsw m5, m4, [r5 + 5 * 16] | |
21411 | pmulhrsw m5, [pw_1024] | |
21412 | packuswb m3, m5 | |
21413 | movu [r0 + 701 * 16], m3 | |
21414 | ||
21415 | ; mode 13 [row 14] | |
21416 | movu m6, m0 | |
21417 | pinsrb m6, [r3 + 4], 6 | |
21418 | pinsrb m6, [r3 + 4], 5 | |
21419 | pinsrb m6, [r3 + 7], 4 | |
21420 | pinsrb m6, [r3 + 7], 3 | |
21421 | pinsrb m6, [r3 + 11], 2 | |
21422 | pinsrb m6, [r3 + 11], 1 | |
21423 | pinsrb m6, [r3 + 14], 0 | |
21424 | pmaddubsw m3, m6, [r5 + 25 * 16] | |
21425 | pmulhrsw m3, [pw_1024] | |
21426 | pmaddubsw m5, m2, [r5 + 25 * 16] | |
21427 | pmulhrsw m5, [pw_1024] | |
21428 | packuswb m3, m5 | |
21429 | movu [r0 + 732 * 16], m3 | |
21430 | pmaddubsw m3, m1, [r5 + 25 * 16] | |
21431 | pmulhrsw m3, [pw_1024] | |
21432 | pmaddubsw m5, m4, [r5 + 25 * 16] | |
21433 | pmulhrsw m5, [pw_1024] | |
21434 | packuswb m3, m5 | |
21435 | movu [r0 + 733 * 16], m3 | |
21436 | ||
21437 | ; mode 13 [row 15] | |
21438 | pmaddubsw m3, m6, [r5 + 16 * 16] | |
21439 | pmulhrsw m3, [pw_1024] | |
21440 | pmaddubsw m5, m2, [r5 + 16 * 16] | |
21441 | pmulhrsw m5, [pw_1024] | |
21442 | packuswb m3, m5 | |
21443 | movu [r0 + 734 * 16], m3 | |
21444 | pmaddubsw m3, m1, [r5 + 16 * 16] | |
21445 | pmulhrsw m3, [pw_1024] | |
21446 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
21447 | pmulhrsw m5, [pw_1024] | |
21448 | packuswb m3, m5 | |
21449 | movu [r0 + 735 * 16], m3 | |
21450 | ||
21451 | ; mode 13 [row 16] | |
21452 | pmaddubsw m3, m6, [r5 + 7 * 16] | |
21453 | pmulhrsw m3, [pw_1024] | |
21454 | pmaddubsw m5, m2, [r5 + 7 * 16] | |
21455 | pmulhrsw m5, [pw_1024] | |
21456 | packuswb m3, m5 | |
21457 | movu [r0 + 736 * 16], m3 | |
21458 | pmaddubsw m3, m1, [r5 + 7 * 16] | |
21459 | pmulhrsw m3, [pw_1024] | |
21460 | pmaddubsw m5, m4, [r5 + 7 * 16] | |
21461 | pmulhrsw m5, [pw_1024] | |
21462 | packuswb m3, m5 | |
21463 | movu [r0 + 737 * 16], m3 | |
21464 | ||
21465 | ; mode 13 [row 17] | |
21466 | pslldq m6, 2 | |
21467 | pinsrb m6, [r3 + 14], 1 | |
21468 | pinsrb m6, [r3 + 18], 0 | |
21469 | pmaddubsw m3, m6, [r5 + 30 * 16] | |
21470 | pmulhrsw m3, [pw_1024] | |
21471 | pslldq m2, 2 | |
21472 | pinsrw m2, [r4 + 3], 0 | |
21473 | pmaddubsw m5, m2, [r5 + 30 * 16] | |
21474 | pmulhrsw m5, [pw_1024] | |
21475 | packuswb m3, m5 | |
21476 | movu [r0 + 738 * 16], m3 | |
21477 | pslldq m1, 2 | |
21478 | pinsrw m1, [r4 + 11], 0 | |
21479 | pmaddubsw m3, m1, [r5 + 30 * 16] | |
21480 | pmulhrsw m3, [pw_1024] | |
21481 | pslldq m4, 2 | |
21482 | pinsrw m4, [r4 + 19], 0 | |
21483 | pmaddubsw m5, m4, [r5 + 30 * 16] | |
21484 | pmulhrsw m5, [pw_1024] | |
21485 | packuswb m3, m5 | |
21486 | movu [r0 + 739 * 16], m3 | |
21487 | ||
21488 | ; mode 13 [row 18] | |
21489 | pmaddubsw m3, m6, [r5 + 21 * 16] | |
21490 | pmulhrsw m3, [pw_1024] | |
21491 | pmaddubsw m5, m2, [r5 + 21 * 16] | |
21492 | pmulhrsw m5, [pw_1024] | |
21493 | packuswb m3, m5 | |
21494 | movu [r0 + 740 * 16], m3 | |
21495 | pmaddubsw m3, m1, [r5 + 21 * 16] | |
21496 | pmulhrsw m3, [pw_1024] | |
21497 | pmaddubsw m5, m4, [r5 + 21 * 16] | |
21498 | pmulhrsw m5, [pw_1024] | |
21499 | packuswb m3, m5 | |
21500 | movu [r0 + 741 * 16], m3 | |
21501 | ||
21502 | ; mode 13 [row 19] | |
21503 | pmaddubsw m3, m6, [r5 + 12 * 16] | |
21504 | pmulhrsw m3, [pw_1024] | |
21505 | pmaddubsw m5, m2, [r5 + 12 * 16] | |
21506 | pmulhrsw m5, [pw_1024] | |
21507 | packuswb m3, m5 | |
21508 | movu [r0 + 742 * 16], m3 | |
21509 | pmaddubsw m3, m1, [r5 + 12 * 16] | |
21510 | pmulhrsw m3, [pw_1024] | |
21511 | pmaddubsw m5, m4, [r5 + 12 * 16] | |
21512 | pmulhrsw m5, [pw_1024] | |
21513 | packuswb m3, m5 | |
21514 | movu [r0 + 743 * 16], m3 | |
21515 | ||
21516 | ; mode 13 [row 20] | |
21517 | pmaddubsw m3, m6, [r5 + 3 * 16] | |
21518 | pmulhrsw m3, [pw_1024] | |
21519 | pmaddubsw m5, m2, [r5 + 3 * 16] | |
21520 | pmulhrsw m5, [pw_1024] | |
21521 | packuswb m3, m5 | |
21522 | movu [r0 + 744 * 16], m3 | |
21523 | pmaddubsw m3, m1, [r5 + 3 * 16] | |
21524 | pmulhrsw m3, [pw_1024] | |
21525 | pmaddubsw m5, m4, [r5 + 3 * 16] | |
21526 | pmulhrsw m5, [pw_1024] | |
21527 | packuswb m3, m5 | |
21528 | movu [r0 + 745 * 16], m3 | |
21529 | ||
21530 | ; mode 14 [row 12] | |
21531 | pslldq m7, 2 | |
21532 | pinsrb m7, [r3 + 10], 1 | |
21533 | pinsrb m7, [r3 + 12], 0 | |
21534 | pmaddubsw m3, m7, [r5 + 23 * 16] | |
21535 | pmulhrsw m3, [pw_1024] | |
21536 | pmaddubsw m5, m2, [r5 + 23 * 16] | |
21537 | pmulhrsw m5, [pw_1024] | |
21538 | packuswb m3, m5 | |
21539 | movu [r0 + 792 * 16], m3 | |
21540 | pmaddubsw m3, m1, [r5 + 23 * 16] | |
21541 | pmulhrsw m3, [pw_1024] | |
21542 | pmaddubsw m5, m4, [r5 + 23 * 16] | |
21543 | pmulhrsw m5, [pw_1024] | |
21544 | packuswb m3, m5 | |
21545 | movu [r0 + 793 * 16], m3 | |
21546 | ||
21547 | ; mode 14 [row 13] | |
21548 | pmaddubsw m3, m7, [r5 + 10 * 16] | |
21549 | pmulhrsw m3, [pw_1024] | |
21550 | pmaddubsw m5, m2, [r5 + 10 * 16] | |
21551 | pmulhrsw m5, [pw_1024] | |
21552 | packuswb m3, m5 | |
21553 | movu [r0 + 794 * 16], m3 | |
21554 | pmaddubsw m3, m1, [r5 + 10 * 16] | |
21555 | pmulhrsw m3, [pw_1024] | |
21556 | pmaddubsw m5, m4, [r5 + 10 * 16] | |
21557 | pmulhrsw m5, [pw_1024] | |
21558 | packuswb m3, m5 | |
21559 | movu [r0 + 795 * 16], m3 | |
21560 | ||
21561 | ; mode 15 [row 9, 8-31] | |
21562 | pmaddubsw m5, m2, [r5 + 22 * 16] | |
21563 | pmulhrsw m5, [pw_1024] | |
21564 | packuswb m5, m5 ; both halves of m5 now carry the same 8 result bytes | |
21565 | movh [r0 + 850 * 16 + 8], m5 ; 8-byte store of bytes 8-15 of this row (bytes 0-7 were stored by the "row 9, 0-7" section); a 16-byte store here would spill into the next slot | |
21566 | pmaddubsw m3, m1, [r5 + 22 * 16] | |
21567 | pmulhrsw m3, [pw_1024] | |
21568 | pmaddubsw m5, m4, [r5 + 22 * 16] | |
21569 | pmulhrsw m5, [pw_1024] | |
21570 | packuswb m3, m5 ; m3 = 16 result bytes (from m1 and m4) | |
21571 | movu [r0 + 851 * 16], m3 ; bytes 16-31 of this row | |
21572 | ||
21573 | ; mode 15 [row 10, 8-31] | |
21574 | pmaddubsw m5, m2, [r5 + 5 * 16] | |
21575 | pmulhrsw m5, [pw_1024] | |
21576 | packuswb m5, m5 ; both halves of m5 now carry the same 8 result bytes | |
21577 | movh [r0 + 852 * 16 + 8], m5 ; 8-byte store of bytes 8-15 of this row (bytes 0-7 were stored by the "row 10, 0-7" section); a 16-byte store here would spill into the next slot | |
21578 | pmaddubsw m3, m1, [r5 + 5 * 16] | |
21579 | pmulhrsw m3, [pw_1024] | |
21580 | pmaddubsw m5, m4, [r5 + 5 * 16] | |
21581 | pmulhrsw m5, [pw_1024] | |
21582 | packuswb m3, m5 ; m3 = 16 result bytes (from m1 and m4) | |
21583 | movu [r0 + 853 * 16], m3 ; bytes 16-31 of this row | |
21584 | ||
21585 | ; mode 13 [row 21] | |
21586 | pslldq m6, 2 | |
21587 | pinsrb m6, [r3 + 18], 1 | |
21588 | pinsrb m6, [r3 + 21], 0 | |
21589 | pmaddubsw m3, m6, [r5 + 26 * 16] | |
21590 | pmulhrsw m3, [pw_1024] | |
21591 | pslldq m2, 2 | |
21592 | pinsrw m2, [r4 + 2], 0 | |
21593 | pmaddubsw m5, m2, [r5 + 26 * 16] | |
21594 | pmulhrsw m5, [pw_1024] | |
21595 | packuswb m3, m5 | |
21596 | movu [r0 + 746 * 16], m3 | |
21597 | pslldq m1, 2 | |
21598 | pinsrw m1, [r4 + 10], 0 | |
21599 | pmaddubsw m3, m1, [r5 + 26 * 16] | |
21600 | pmulhrsw m3, [pw_1024] | |
21601 | pslldq m4, 2 | |
21602 | pinsrw m4, [r4 + 18], 0 | |
21603 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
21604 | pmulhrsw m5, [pw_1024] | |
21605 | packuswb m3, m5 | |
21606 | movu [r0 + 747 * 16], m3 | |
21607 | ||
21608 | ; mode 13 [row 22] | |
21609 | pmaddubsw m3, m6, [r5 + 17 * 16] | |
21610 | pmulhrsw m3, [pw_1024] | |
21611 | pmaddubsw m5, m2, [r5 + 17 * 16] | |
21612 | pmulhrsw m5, [pw_1024] | |
21613 | packuswb m3, m5 | |
21614 | movu [r0 + 748 * 16], m3 | |
21615 | pmaddubsw m3, m1, [r5 + 17 * 16] | |
21616 | pmulhrsw m3, [pw_1024] | |
21617 | pmaddubsw m5, m4, [r5 + 17 * 16] | |
21618 | pmulhrsw m5, [pw_1024] | |
21619 | packuswb m3, m5 | |
21620 | movu [r0 + 749 * 16], m3 | |
21621 | ||
21622 | ; mode 13 [row 23] | |
21623 | pmaddubsw m3, m6, [r5 + 8 * 16] | |
21624 | pmulhrsw m3, [pw_1024] | |
21625 | pmaddubsw m5, m2, [r5 + 8 * 16] | |
21626 | pmulhrsw m5, [pw_1024] | |
21627 | packuswb m3, m5 | |
21628 | movu [r0 + 750 * 16], m3 | |
21629 | pmaddubsw m3, m1, [r5 + 8 * 16] | |
21630 | pmulhrsw m3, [pw_1024] | |
21631 | pmaddubsw m5, m4, [r5 + 8 * 16] | |
21632 | pmulhrsw m5, [pw_1024] | |
21633 | packuswb m3, m5 | |
21634 | movu [r0 + 751 * 16], m3 | |
21635 | ||
21636 | ; mode 14 [row 14] | |
21637 | pslldq m7, 2 | |
21638 | pinsrb m7, [r3 + 12], 1 | |
21639 | pinsrb m7, [r3 + 15], 0 | |
21640 | pmaddubsw m3, m7, [r5 + 29 * 16] | |
21641 | pmulhrsw m3, [pw_1024] | |
21642 | pmaddubsw m5, m2, [r5 + 29 * 16] | |
21643 | pmulhrsw m5, [pw_1024] | |
21644 | packuswb m3, m5 | |
21645 | movu [r0 + 796 * 16], m3 | |
21646 | pmaddubsw m3, m1, [r5 + 29 * 16] | |
21647 | pmulhrsw m3, [pw_1024] | |
21648 | pmaddubsw m5, m4, [r5 + 29 * 16] | |
21649 | pmulhrsw m5, [pw_1024] | |
21650 | packuswb m3, m5 | |
21651 | movu [r0 + 797 * 16], m3 | |
21652 | ||
21653 | ; mode 14 [row 15] | |
21654 | pmaddubsw m3, m7, [r5 + 16 * 16] | |
21655 | pmulhrsw m3, [pw_1024] | |
21656 | pmaddubsw m5, m2, [r5 + 16 * 16] | |
21657 | pmulhrsw m5, [pw_1024] | |
21658 | packuswb m3, m5 | |
21659 | movu [r0 + 798 * 16], m3 | |
21660 | pmaddubsw m3, m1, [r5 + 16 * 16] | |
21661 | pmulhrsw m3, [pw_1024] | |
21662 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
21663 | pmulhrsw m5, [pw_1024] | |
21664 | packuswb m3, m5 | |
21665 | movu [r0 + 799 * 16], m3 | |
21666 | ||
21667 | ; mode 14 [row 16] | |
21668 | pmaddubsw m3, m7, [r5 + 3 * 16] | |
21669 | pmulhrsw m3, [pw_1024] | |
21670 | pmaddubsw m5, m2, [r5 + 3 * 16] | |
21671 | pmulhrsw m5, [pw_1024] | |
21672 | packuswb m3, m5 | |
21673 | movu [r0 + 800 * 16], m3 | |
21674 | pmaddubsw m3, m1, [r5 + 3 * 16] | |
21675 | pmulhrsw m3, [pw_1024] | |
21676 | pmaddubsw m5, m4, [r5 + 3 * 16] | |
21677 | pmulhrsw m5, [pw_1024] | |
21678 | packuswb m3, m5 | |
21679 | movu [r0 + 801 * 16], m3 | |
21680 | ||
21681 | ; mode 15 [row 11] | |
21682 | pmaddubsw m5, m2, [r5 + 20 * 16] | |
21683 | pmulhrsw m5, [pw_1024] | |
21684 | packuswb m5, m5 | |
21685 | movh [r0 + 854 * 16 + 8], m5 | |
21686 | pmaddubsw m3, m1, [r5 + 20 * 16] | |
21687 | pmulhrsw m3, [pw_1024] | |
21688 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
21689 | pmulhrsw m5, [pw_1024] | |
21690 | packuswb m3, m5 | |
21691 | movu [r0 + 855 * 16], m3 | |
21692 | ||
21693 | ; mode 15 [row 12] | |
21694 | pmaddubsw m5, m2, [r5 + 3 * 16] | |
21695 | pmulhrsw m5, [pw_1024] | |
21696 | packuswb m5, m5 | |
21697 | movh [r0 + 856 * 16 + 8], m5 | |
21698 | pmaddubsw m3, m1, [r5 + 3 * 16] | |
21699 | pmulhrsw m3, [pw_1024] | |
21700 | pmaddubsw m5, m4, [r5 + 3 * 16] | |
21701 | pmulhrsw m5, [pw_1024] | |
21702 | packuswb m3, m5 | |
21703 | movu [r0 + 857 * 16], m3 | |
21704 | ||
21705 | ; mode 13 [row 24] | |
21706 | pslldq m6, 2 | |
21707 | pinsrb m6, [r3 + 21], 1 | |
21708 | pinsrb m6, [r3 + 25], 0 | |
21709 | pmaddubsw m3, m6, [r5 + 31 * 16] | |
21710 | pmulhrsw m3, [pw_1024] | |
21711 | pslldq m2, 2 | |
21712 | pinsrw m2, [r4 + 1], 0 | |
21713 | pmaddubsw m5, m2, [r5 + 31 * 16] | |
21714 | pmulhrsw m5, [pw_1024] | |
21715 | packuswb m3, m5 | |
21716 | movu [r0 + 752 * 16], m3 | |
21717 | pslldq m1, 2 | |
21718 | pinsrw m1, [r4 + 9], 0 | |
21719 | pmaddubsw m3, m1, [r5 + 31 * 16] | |
21720 | pmulhrsw m3, [pw_1024] | |
21721 | pslldq m4, 2 | |
21722 | pinsrw m4, [r4 + 17], 0 | |
21723 | pmaddubsw m5, m4, [r5 + 31 * 16] | |
21724 | pmulhrsw m5, [pw_1024] | |
21725 | packuswb m3, m5 | |
21726 | movu [r0 + 753 * 16], m3 | |
21727 | ||
21728 | ; mode 13 [row 25] | |
21729 | pmaddubsw m3, m6, [r5 + 22 * 16] | |
21730 | pmulhrsw m3, [pw_1024] | |
21731 | pmaddubsw m5, m2, [r5 + 22 * 16] | |
21732 | pmulhrsw m5, [pw_1024] | |
21733 | packuswb m3, m5 | |
21734 | movu [r0 + 754 * 16], m3 | |
21735 | pmaddubsw m3, m1, [r5 + 22 * 16] | |
21736 | pmulhrsw m3, [pw_1024] | |
21737 | pmaddubsw m5, m4, [r5 + 22 * 16] | |
21738 | pmulhrsw m5, [pw_1024] | |
21739 | packuswb m3, m5 | |
21740 | movu [r0 + 755 * 16], m3 | |
21741 | ||
21742 | ; mode 13 [row 26] | |
21743 | pmaddubsw m3, m6, [r5 + 13 * 16] | |
21744 | pmulhrsw m3, [pw_1024] | |
21745 | pmaddubsw m5, m2, [r5 + 13 * 16] | |
21746 | pmulhrsw m5, [pw_1024] | |
21747 | packuswb m3, m5 | |
21748 | movu [r0 + 756 * 16], m3 | |
21749 | pmaddubsw m3, m1, [r5 + 13 * 16] | |
21750 | pmulhrsw m3, [pw_1024] | |
21751 | pmaddubsw m5, m4, [r5 + 13 * 16] | |
21752 | pmulhrsw m5, [pw_1024] | |
21753 | packuswb m3, m5 | |
21754 | movu [r0 + 757 * 16], m3 | |
21755 | ||
21756 | ; mode 13 [row 27] | |
21757 | pmaddubsw m3, m6, [r5 + 4 * 16] | |
21758 | pmulhrsw m3, [pw_1024] | |
21759 | pmaddubsw m5, m2, [r5 + 4 * 16] | |
21760 | pmulhrsw m5, [pw_1024] | |
21761 | packuswb m3, m5 | |
21762 | movu [r0 + 758 * 16], m3 | |
21763 | pmaddubsw m3, m1, [r5 + 4 * 16] | |
21764 | pmulhrsw m3, [pw_1024] | |
21765 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
21766 | pmulhrsw m5, [pw_1024] | |
21767 | packuswb m3, m5 | |
21768 | movu [r0 + 759 * 16], m3 | |
21769 | ||
21770 | ; mode 14 [row 17] | |
21771 | pslldq m7, 2 | |
21772 | pinsrb m7, [r3 + 15], 1 | |
21773 | pinsrb m7, [r3 + 17], 0 | |
21774 | pmaddubsw m3, m7, [r5 + 22 * 16] | |
21775 | pmulhrsw m3, [pw_1024] | |
21776 | pmaddubsw m5, m2, [r5 + 22 * 16] | |
21777 | pmulhrsw m5, [pw_1024] | |
21778 | packuswb m3, m5 | |
21779 | movu [r0 + 802 * 16], m3 | |
21780 | pmaddubsw m3, m1, [r5 + 22 * 16] | |
21781 | pmulhrsw m3, [pw_1024] | |
21782 | pmaddubsw m5, m4, [r5 + 22 * 16] | |
21783 | pmulhrsw m5, [pw_1024] | |
21784 | packuswb m3, m5 | |
21785 | movu [r0 + 803 * 16], m3 | |
21786 | ||
21787 | ; mode 14 [row 18] | |
21788 | pmaddubsw m3, m7, [r5 + 9 * 16] | |
21789 | pmulhrsw m3, [pw_1024] | |
21790 | pmaddubsw m5, m2, [r5 + 9 * 16] | |
21791 | pmulhrsw m5, [pw_1024] | |
21792 | packuswb m3, m5 | |
21793 | movu [r0 + 804 * 16], m3 | |
21794 | pmaddubsw m3, m1, [r5 + 9 * 16] | |
21795 | pmulhrsw m3, [pw_1024] | |
21796 | pmaddubsw m5, m4, [r5 + 9 * 16] | |
21797 | pmulhrsw m5, [pw_1024] | |
21798 | packuswb m3, m5 | |
21799 | movu [r0 + 805 * 16], m3 | |
21800 | ||
21801 | ; mode 15 [row 13] | |
21802 | pmaddubsw m5, m2, [r5 + 18 * 16] | |
21803 | pmulhrsw m5, [pw_1024] | |
21804 | packuswb m5, m5 | |
21805 | movh [r0 + 858 * 16 + 8], m5 | |
21806 | pmaddubsw m3, m1, [r5 + 18 * 16] | |
21807 | pmulhrsw m3, [pw_1024] | |
21808 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
21809 | pmulhrsw m5, [pw_1024] | |
21810 | packuswb m3, m5 | |
21811 | movu [r0 + 859 * 16], m3 | |
21812 | ||
21813 | ; mode 15 [row 14] | |
21814 | pmaddubsw m5, m2, [r5 + 1 * 16] | |
21815 | pmulhrsw m5, [pw_1024] | |
21816 | packuswb m5, m5 | |
21817 | movh [r0 + 860 * 16 + 8], m5 | |
21818 | pmaddubsw m3, m1, [r5 + 1 * 16] | |
21819 | pmulhrsw m3, [pw_1024] | |
21820 | pmaddubsw m5, m4, [r5 + 1 * 16] | |
21821 | pmulhrsw m5, [pw_1024] | |
21822 | packuswb m3, m5 | |
21823 | movu [r0 + 861 * 16], m3 | |
21824 | ||
21825 | ; mode 13 [row 28] | |
21826 | pslldq m6, 2 | |
21827 | pinsrb m6, [r3 + 25], 1 | |
21828 | pinsrb m6, [r3 + 28], 0 | |
21829 | pmaddubsw m3, m6, [r5 + 27 * 16] | |
21830 | pmulhrsw m3, [pw_1024] | |
21831 | pslldq m2, 2 | |
21832 | pinsrw m2, [r4 + 0], 0 | |
21833 | pmaddubsw m5, m2, [r5 + 27 * 16] | |
21834 | pmulhrsw m5, [pw_1024] | |
21835 | packuswb m3, m5 | |
21836 | movu [r0 + 760 * 16], m3 | |
21837 | pslldq m1, 2 | |
21838 | pinsrw m1, [r4 + 8], 0 | |
21839 | pmaddubsw m3, m1, [r5 + 27 * 16] | |
21840 | pmulhrsw m3, [pw_1024] | |
21841 | pslldq m4, 2 | |
21842 | pinsrw m4, [r4 + 16], 0 | |
21843 | pmaddubsw m5, m4, [r5 + 27 * 16] | |
21844 | pmulhrsw m5, [pw_1024] | |
21845 | packuswb m3, m5 | |
21846 | movu [r0 + 761 * 16], m3 | |
21847 | ||
21848 | ; mode 13 [row 29] | |
21849 | pmaddubsw m3, m6, [r5 + 18 * 16] | |
21850 | pmulhrsw m3, [pw_1024] | |
21851 | pmaddubsw m5, m2, [r5 + 18 * 16] | |
21852 | pmulhrsw m5, [pw_1024] | |
21853 | packuswb m3, m5 | |
21854 | movu [r0 + 762 * 16], m3 | |
21855 | pmaddubsw m3, m1, [r5 + 18 * 16] | |
21856 | pmulhrsw m3, [pw_1024] | |
21857 | pmaddubsw m5, m4, [r5 + 18 * 16] | |
21858 | pmulhrsw m5, [pw_1024] | |
21859 | packuswb m3, m5 | |
21860 | movu [r0 + 763 * 16], m3 | |
21861 | ||
21862 | ; mode 13 [row 30] | |
21863 | pmaddubsw m3, m6, [r5 + 9 * 16] | |
21864 | pmulhrsw m3, [pw_1024] | |
21865 | pmaddubsw m5, m2, [r5 + 9 * 16] | |
21866 | pmulhrsw m5, [pw_1024] | |
21867 | packuswb m3, m5 | |
21868 | movu [r0 + 764 * 16], m3 | |
21869 | pmaddubsw m3, m1, [r5 + 9 * 16] | |
21870 | pmulhrsw m3, [pw_1024] | |
21871 | pmaddubsw m5, m4, [r5 + 9 * 16] | |
21872 | pmulhrsw m5, [pw_1024] | |
21873 | packuswb m3, m5 | |
21874 | movu [r0 + 765 * 16], m3 | |
21875 | ||
21876 | ; mode 14 [row 19] | |
21877 | pslldq m7, 2 | |
21878 | pinsrb m7, [r3 + 17], 1 | |
21879 | pinsrb m7, [r3 + 20], 0 | |
21880 | pmaddubsw m3, m7, [r5 + 28 * 16] | |
21881 | pmulhrsw m3, [pw_1024] | |
21882 | pmaddubsw m5, m2, [r5 + 28 * 16] | |
21883 | pmulhrsw m5, [pw_1024] | |
21884 | packuswb m3, m5 | |
21885 | movu [r0 + 806 * 16], m3 | |
21886 | pmaddubsw m3, m1, [r5 + 28 * 16] | |
21887 | pmulhrsw m3, [pw_1024] | |
21888 | pmaddubsw m5, m4, [r5 + 28 * 16] | |
21889 | pmulhrsw m5, [pw_1024] | |
21890 | packuswb m3, m5 | |
21891 | movu [r0 + 807 * 16], m3 | |
21892 | ||
21893 | ; mode 14 [row 20] | |
21894 | pmaddubsw m3, m7, [r5 + 15 * 16] | |
21895 | pmulhrsw m3, [pw_1024] | |
21896 | pmaddubsw m5, m2, [r5 + 15 * 16] | |
21897 | pmulhrsw m5, [pw_1024] | |
21898 | packuswb m3, m5 | |
21899 | movu [r0 + 808 * 16], m3 | |
21900 | pmaddubsw m3, m1, [r5 + 15 * 16] | |
21901 | pmulhrsw m3, [pw_1024] | |
21902 | pmaddubsw m5, m4, [r5 + 15 * 16] | |
21903 | pmulhrsw m5, [pw_1024] | |
21904 | packuswb m3, m5 | |
21905 | movu [r0 + 809 * 16], m3 | |
21906 | ||
21907 | ; mode 14 [row 21] | |
21908 | pmaddubsw m3, m7, [r5 + 2 * 16] | |
21909 | pmulhrsw m3, [pw_1024] | |
21910 | pmaddubsw m5, m2, [r5 + 2 * 16] | |
21911 | pmulhrsw m5, [pw_1024] | |
21912 | packuswb m3, m5 | |
21913 | movu [r0 + 810 * 16], m3 | |
21914 | pmaddubsw m3, m1, [r5 + 2 * 16] | |
21915 | pmulhrsw m3, [pw_1024] | |
21916 | pmaddubsw m5, m4, [r5 + 2 * 16] | |
21917 | pmulhrsw m5, [pw_1024] | |
21918 | packuswb m3, m5 | |
21919 | movu [r0 + 811 * 16], m3 | |
21920 | ||
21921 | ; mode 15 [row 15] | |
21922 | pmaddubsw m5, m2, [r5 + 16 * 16] | |
21923 | pmulhrsw m5, [pw_1024] | |
21924 | packuswb m5, m5 | |
21925 | movh [r0 + 862 * 16 + 8], m5 | |
21926 | pmaddubsw m3, m1, [r5 + 16 * 16] | |
21927 | pmulhrsw m3, [pw_1024] | |
21928 | pmaddubsw m5, m4, [r5 + 16 * 16] | |
21929 | pmulhrsw m5, [pw_1024] | |
21930 | packuswb m3, m5 | |
21931 | movu [r0 + 863 * 16], m3 | |
21932 | ||
21933 | ; mode 14 [row 22] | |
21934 | pslldq m7, 2 | |
21935 | pinsrb m7, [r3 + 20], 1 | |
21936 | pinsrb m7, [r3 + 22], 0 | |
21937 | pmaddubsw m3, m7, [r5 + 21 * 16] | |
21938 | pmulhrsw m3, [pw_1024] | |
21939 | pslldq m2, 2 | |
21940 | pinsrb m2, [r4 + 0], 1 | |
21941 | pinsrb m2, [r3 + 2], 0 | |
21942 | pmaddubsw m5, m2, [r5 + 21 * 16] | |
21943 | pmulhrsw m5, [pw_1024] | |
21944 | packuswb m3, m5 | |
21945 | movu [r0 + 812 * 16], m3 | |
21946 | pslldq m1, 2 | |
21947 | pinsrw m1, [r4 + 7], 0 | |
21948 | pmaddubsw m3, m1, [r5 + 21 * 16] | |
21949 | pmulhrsw m3, [pw_1024] | |
21950 | pslldq m4, 2 | |
21951 | pinsrw m4, [r4 + 15], 0 | |
21952 | pmaddubsw m5, m4, [r5 + 21 * 16] | |
21953 | pmulhrsw m5, [pw_1024] | |
21954 | packuswb m3, m5 | |
21955 | movu [r0 + 813 * 16], m3 | |
21956 | ||
21957 | ; mode 14 [row 23] | |
21958 | pmaddubsw m3, m7, [r5 + 8 * 16] | |
21959 | pmulhrsw m3, [pw_1024] | |
21960 | pmaddubsw m5, m2, [r5 + 8 * 16] | |
21961 | pmulhrsw m5, [pw_1024] | |
21962 | packuswb m3, m5 | |
21963 | movu [r0 + 814 * 16], m3 | |
21964 | pmaddubsw m3, m1, [r5 + 8 * 16] | |
21965 | pmulhrsw m3, [pw_1024] | |
21966 | pmaddubsw m5, m4, [r5 + 8 * 16] | |
21967 | pmulhrsw m5, [pw_1024] | |
21968 | packuswb m3, m5 | |
21969 | movu [r0 + 815 * 16], m3 | |
21970 | ||
21971 | ; mode 15 [row 16] | |
21972 | pmaddubsw m5, m2, [r5 + 31 * 16] | |
21973 | pmulhrsw m5, [pw_1024] | |
21974 | packuswb m5, m5 | |
21975 | movh [r0 + 864 * 16 + 8], m5 | |
21976 | pmaddubsw m3, m1, [r5 + 31 * 16] | |
21977 | pmulhrsw m3, [pw_1024] | |
21978 | pmaddubsw m5, m4, [r5 + 31 * 16] | |
21979 | pmulhrsw m5, [pw_1024] | |
21980 | packuswb m3, m5 | |
21981 | movu [r0 + 865 * 16], m3 | |
21982 | ||
21983 | ; mode 15 [row 17] | |
21984 | pmaddubsw m5, m2, [r5 + 14 * 16] | |
21985 | pmulhrsw m5, [pw_1024] | |
21986 | packuswb m5, m5 | |
21987 | movh [r0 + 866 * 16 + 8], m5 | |
21988 | pmaddubsw m3, m1, [r5 + 14 * 16] | |
21989 | pmulhrsw m3, [pw_1024] | |
21990 | pmaddubsw m5, m4, [r5 + 14 * 16] | |
21991 | pmulhrsw m5, [pw_1024] | |
21992 | packuswb m3, m5 | |
21993 | movu [r0 + 867 * 16], m3 | |
21994 | ||
21995 | ; mode 14 [row 24] | |
21996 | pslldq m7, 2 | |
21997 | pinsrb m7, [r3 + 22], 1 | |
21998 | pinsrb m7, [r3 + 25], 0 | |
21999 | pmaddubsw m3, m7, [r5 + 27 * 16] | |
22000 | pmulhrsw m3, [pw_1024] | |
22001 | pslldq m2, 2 | |
22002 | pinsrb m2, [r3 + 2], 1 | |
22003 | pinsrb m2, [r3 + 5], 0 | |
22004 | pmaddubsw m5, m2, [r5 + 27 * 16] | |
22005 | pmulhrsw m5, [pw_1024] | |
22006 | packuswb m3, m5 | |
22007 | movu [r0 + 816 * 16], m3 | |
22008 | pslldq m1, 2 | |
22009 | pinsrw m1, [r4 + 6], 0 | |
22010 | pmaddubsw m3, m1, [r5 + 27 * 16] | |
22011 | pmulhrsw m3, [pw_1024] | |
22012 | pslldq m4, 2 | |
22013 | pinsrw m4, [r4 + 14], 0 | |
22014 | pmaddubsw m5, m4, [r5 + 27 * 16] | |
22015 | pmulhrsw m5, [pw_1024] | |
22016 | packuswb m3, m5 | |
22017 | movu [r0 + 817 * 16], m3 | |
22018 | ||
22019 | ; mode 14 [row 25] | |
22020 | pmaddubsw m3, m7, [r5 + 14 * 16] | |
22021 | pmulhrsw m3, [pw_1024] | |
22022 | pmaddubsw m5, m2, [r5 + 14 * 16] | |
22023 | pmulhrsw m5, [pw_1024] | |
22024 | packuswb m3, m5 | |
22025 | movu [r0 + 818 * 16], m3 | |
22026 | pmaddubsw m3, m1, [r5 + 14 * 16] | |
22027 | pmulhrsw m3, [pw_1024] | |
22028 | pmaddubsw m5, m4, [r5 + 14 * 16] | |
22029 | pmulhrsw m5, [pw_1024] | |
22030 | packuswb m3, m5 | |
22031 | movu [r0 + 819 * 16], m3 | |
22032 | ||
22033 | ; mode 14 [row 26] | |
22034 | pmaddubsw m3, m7, [r5 + 1 * 16] | |
22035 | pmulhrsw m3, [pw_1024] | |
22036 | pmaddubsw m5, m2, [r5 + 1 * 16] | |
22037 | pmulhrsw m5, [pw_1024] | |
22038 | packuswb m3, m5 | |
22039 | movu [r0 + 820 * 16], m3 | |
22040 | pmaddubsw m3, m1, [r5 + 1 * 16] | |
22041 | pmulhrsw m3, [pw_1024] | |
22042 | pmaddubsw m5, m4, [r5 + 1 * 16] | |
22043 | pmulhrsw m5, [pw_1024] | |
22044 | packuswb m3, m5 | |
22045 | movu [r0 + 821 * 16], m3 | |
22046 | ||
22047 | ; mode 15 [row 18] | |
22048 | pinsrb m2, [r3 + 4], 0 | |
22049 | pmaddubsw m5, m2, [r5 + 29 * 16] | |
22050 | pmulhrsw m5, [pw_1024] | |
22051 | packuswb m5, m5 | |
22052 | movh [r0 + 868 * 16 + 8], m5 | |
22053 | pmaddubsw m3, m1, [r5 + 29 * 16] | |
22054 | pmulhrsw m3, [pw_1024] | |
22055 | pmaddubsw m5, m4, [r5 + 29 * 16] | |
22056 | pmulhrsw m5, [pw_1024] | |
22057 | packuswb m3, m5 | |
22058 | movu [r0 + 869 * 16], m3 | |
22059 | ||
22060 | ; mode 15 [row 19] | |
22061 | pmaddubsw m5, m2, [r5 + 12 * 16] | |
22062 | pmulhrsw m5, [pw_1024] | |
22063 | packuswb m5, m5 | |
22064 | movh [r0 + 870 * 16 + 8], m5 | |
22065 | pmaddubsw m3, m1, [r5 + 12 * 16] | |
22066 | pmulhrsw m3, [pw_1024] | |
22067 | pmaddubsw m5, m4, [r5 + 12 * 16] | |
22068 | pmulhrsw m5, [pw_1024] | |
22069 | packuswb m3, m5 | |
22070 | movu [r0 + 871 * 16], m3 | |
22071 | ||
22072 | ; mode 15 [row 20 - 8 to 15] | |
22073 | pslldq m3, m2, 2 | |
22074 | pinsrb m3, [r3 + 4], 1 | |
22075 | pinsrb m3, [r3 + 6], 0 | |
22076 | pmaddubsw m5, m3, [r5 + 27 * 16] | |
22077 | pmulhrsw m5, [pw_1024] | |
22078 | packuswb m5, m5 | |
22079 | movh [r0 + 872 * 16 + 8], m5 | |
22080 | ||
22081 | ; mode 15 [row 21 - 8 to 15] | |
22082 | pmaddubsw m5, m3, [r5 + 10 * 16] | |
22083 | pmulhrsw m5, [pw_1024] | |
22084 | packuswb m5, m5 | |
22085 | movh [r0 + 874 * 16 + 8], m5 | |
22086 | ||
22087 | ; mode 15 [row 22 - 8 to 15] | |
22088 | pslldq m3, 2 | |
22089 | pinsrb m3, [r3 + 6], 1 | |
22090 | pinsrb m3, [r3 + 8], 0 | |
22091 | pmaddubsw m5, m3, [r5 + 25 * 16] | |
22092 | pmulhrsw m5, [pw_1024] | |
22093 | packuswb m5, m5 | |
22094 | movh [r0 + 876 * 16 + 8], m5 | |
22095 | ||
22096 | ; mode 15 [row 23 - 8 to 15] | |
22097 | pmaddubsw m5, m3, [r5 + 8 * 16] | |
22098 | pmulhrsw m5, [pw_1024] | |
22099 | packuswb m5, m5 | |
22100 | movh [r0 + 878 * 16 + 8], m5 | |
22101 | ||
22102 | ; mode 15 [row 24 - 8 to 15] | |
22103 | pslldq m3, 2 | |
22104 | pinsrb m3, [r3 + 8], 1 | |
22105 | pinsrb m3, [r3 + 9], 0 | |
22106 | pmaddubsw m5, m3, [r5 + 23 * 16] | |
22107 | pmulhrsw m5, [pw_1024] | |
22108 | packuswb m5, m5 | |
22109 | movh [r0 + 880 * 16 + 8], m5 | |
22110 | ||
22111 | ; mode 15 [row 25 - 8 to 15] | |
22112 | pmaddubsw m5, m3, [r5 + 6 * 16] | |
22113 | pmulhrsw m5, [pw_1024] | |
22114 | packuswb m5, m5 | |
22115 | movh [r0 + 882 * 16 + 8], m5 | |
22116 | ||
22117 | ; mode 15 [row 26 - 8 to 15] | |
22118 | pslldq m3, 2 | |
22119 | pinsrb m3, [r3 + 9], 1 | |
22120 | pinsrb m3, [r3 + 11], 0 | |
22121 | pmaddubsw m5, m3, [r5 + 21 * 16] | |
22122 | pmulhrsw m5, [pw_1024] | |
22123 | packuswb m5, m5 | |
22124 | movh [r0 + 884 * 16 + 8], m5 | |
22125 | ||
22126 | ; mode 15 [row 27 - 8 to 15] | |
22127 | pmaddubsw m5, m3, [r5 + 4 * 16] | |
22128 | pmulhrsw m5, [pw_1024] | |
22129 | packuswb m5, m5 | |
22130 | movh [r0 + 886 * 16 + 8], m5 | |
22131 | ||
22132 | ; mode 15 [row 28 - 8 to 15] | |
22133 | pslldq m3, 2 | |
22134 | pinsrb m3, [r3 + 11], 1 | |
22135 | pinsrb m3, [r3 + 13], 0 | |
22136 | pmaddubsw m5, m3, [r5 + 19 * 16] | |
22137 | pmulhrsw m5, [pw_1024] | |
22138 | packuswb m5, m5 | |
22139 | movh [r0 + 888 * 16 + 8], m5 | |
22140 | ||
22141 | ; mode 15 [row 29 - 8 to 15] | |
22142 | pmaddubsw m5, m3, [r5 + 2 * 16] | |
22143 | pmulhrsw m5, [pw_1024] | |
22144 | packuswb m5, m5 | |
22145 | movh [r0 + 890 * 16 + 8], m5 | |
22146 | ||
22147 | ; mode 15 [row 30 - 8 to 15] | |
22148 | pslldq m3, 2 | |
22149 | pinsrb m3, [r3 + 13], 1 | |
22150 | pinsrb m3, [r3 + 15], 0 | |
22151 | pmaddubsw m5, m3, [r5 + 17 * 16] | |
22152 | pmulhrsw m5, [pw_1024] | |
22153 | packuswb m5, m5 | |
22154 | movh [r0 + 892 * 16 + 8], m5 | |
22155 | ||
22156 | ; mode 15 [row 31, 8 to 15] | |
22157 | pshufb m5, m3, [tab_S2] | |
22158 | movh [r0 + 894 * 16 + 8], m5 | |
22159 | ||
22160 | ; mode 14 [row 27] | |
22161 | pinsrb m2, [r3 + 5], 0 | |
22162 | pslldq m7, 2 | |
22163 | pinsrb m7, [r3 + 25], 1 | |
22164 | pinsrb m7, [r3 + 27], 0 | |
22165 | pmaddubsw m3, m7, [r5 + 20 * 16] | |
22166 | pmulhrsw m3, [pw_1024] | |
22167 | pslldq m2, 2 | |
22168 | pinsrb m2, [r3 + 5], 1 | |
22169 | pinsrb m2, [r3 + 7], 0 | |
22170 | pmaddubsw m5, m2, [r5 + 20 * 16] | |
22171 | pmulhrsw m5, [pw_1024] | |
22172 | packuswb m3, m5 | |
22173 | movu [r0 + 822 * 16], m3 | |
22174 | pslldq m1, 2 | |
22175 | pinsrw m1, [r4 + 5], 0 | |
22176 | pmaddubsw m3, m1, [r5 + 20 * 16] | |
22177 | pmulhrsw m3, [pw_1024] | |
22178 | pslldq m4, 2 | |
22179 | pinsrw m4, [r4 + 13], 0 | |
22180 | pmaddubsw m5, m4, [r5 + 20 * 16] | |
22181 | pmulhrsw m5, [pw_1024] | |
22182 | packuswb m3, m5 | |
22183 | movu [r0 + 823 * 16], m3 | |
22184 | ||
22185 | ; mode 15 [row 20 - 16 to 31] | |
22186 | pmaddubsw m3, m1, [r5 + 27 * 16] | |
22187 | pmulhrsw m3, [pw_1024] | |
22188 | pmaddubsw m5, m4, [r5 + 27 * 16] | |
22189 | pmulhrsw m5, [pw_1024] | |
22190 | packuswb m3, m5 | |
22191 | movu [r0 + 873 * 16], m3 | |
22192 | ||
22193 | ; mode 15 [row 21 - 16 to 31] | |
22194 | pmaddubsw m3, m1, [r5 + 10 * 16] | |
22195 | pmulhrsw m3, [pw_1024] | |
22196 | pmaddubsw m5, m4, [r5 + 10 * 16] | |
22197 | pmulhrsw m5, [pw_1024] | |
22198 | packuswb m3, m5 | |
22199 | movu [r0 + 875 * 16], m3 | |
22200 | ||
22201 | ; mode 14 [row 28] | |
22202 | pmaddubsw m3, m7, [r5 + 7 * 16] | |
22203 | pmulhrsw m3, [pw_1024] | |
22204 | pmaddubsw m5, m2, [r5 + 7 * 16] | |
22205 | pmulhrsw m5, [pw_1024] | |
22206 | packuswb m3, m5 | |
22207 | movu [r0 + 824 * 16], m3 | |
22208 | pmaddubsw m3, m1, [r5 + 7 * 16] | |
22209 | pmulhrsw m3, [pw_1024] | |
22210 | pmaddubsw m5, m4, [r5 + 7 * 16] | |
22211 | pmulhrsw m5, [pw_1024] | |
22212 | packuswb m3, m5 | |
22213 | movu [r0 + 825 * 16], m3 | |
22214 | ||
22215 | ; mode 14 [row 29] | |
22216 | pslldq m7, 2 | |
22217 | pinsrb m7, [r3 + 27], 1 | |
22218 | pinsrb m7, [r3 + 30], 0 | |
22219 | pmaddubsw m3, m7, [r5 + 26 * 16] | |
22220 | pmulhrsw m3, [pw_1024] | |
22221 | pslldq m2, 2 | |
22222 | pinsrb m2, [r3 + 7], 1 | |
22223 | pinsrb m2, [r3 + 10], 0 | |
22224 | pmaddubsw m5, m2, [r5 + 26 * 16] | |
22225 | pmulhrsw m5, [pw_1024] | |
22226 | packuswb m3, m5 | |
22227 | movu [r0 + 826 * 16], m3 | |
22228 | pslldq m1, 2 | |
22229 | pinsrw m1, [r4 + 4], 0 | |
22230 | pmaddubsw m3, m1, [r5 + 26 * 16] | |
22231 | pmulhrsw m3, [pw_1024] | |
22232 | pslldq m4, 2 | |
22233 | pinsrw m4, [r4 + 12], 0 | |
22234 | pmaddubsw m5, m4, [r5 + 26 * 16] | |
22235 | pmulhrsw m5, [pw_1024] | |
22236 | packuswb m3, m5 | |
22237 | movu [r0 + 827 * 16], m3 | |
22238 | ||
22239 | ; mode 14 [row 30] | |
22240 | pmaddubsw m3, m7, [r5 + 13 * 16] | |
22241 | pmulhrsw m3, [pw_1024] | |
22242 | pmaddubsw m5, m2, [r5 + 13 * 16] | |
22243 | pmulhrsw m5, [pw_1024] | |
22244 | packuswb m3, m5 | |
22245 | movu [r0 + 828 * 16], m3 | |
22246 | pmaddubsw m3, m1, [r5 + 13 * 16] | |
22247 | pmulhrsw m3, [pw_1024] | |
22248 | pmaddubsw m5, m4, [r5 + 13 * 16] | |
22249 | pmulhrsw m5, [pw_1024] | |
22250 | packuswb m3, m5 | |
22251 | movu [r0 + 829 * 16], m3 | |
22252 | ||
22253 | ; mode 15 [row 22] | |
22254 | pmaddubsw m3, m1, [r5 + 25 * 16] | |
22255 | pmulhrsw m3, [pw_1024] | |
22256 | pmaddubsw m5, m4, [r5 + 25 * 16] | |
22257 | pmulhrsw m5, [pw_1024] | |
22258 | packuswb m3, m5 | |
22259 | movu [r0 + 877 * 16], m3 | |
22260 | ||
22261 | ; mode 15 [row 23] | |
22262 | pmaddubsw m3, m1, [r5 + 8 * 16] | |
22263 | pmulhrsw m3, [pw_1024] | |
22264 | pmaddubsw m5, m4, [r5 + 8 * 16] | |
22265 | pmulhrsw m5, [pw_1024] | |
22266 | packuswb m3, m5 | |
22267 | movu [r0 + 879 * 16], m3 | |
22268 | ||
22269 | ; mode 14 [row 31] | |
22270 | pshufb m3, m7, [tab_S2] | |
22271 | movh [r0 + 830 * 16], m3 | |
22272 | pshufb m3, m2, [tab_S2] | |
22273 | movh [r0 + 830 * 16 + 8], m3 | |
22274 | pshufb m3, m1, [tab_S2] | |
22275 | movh [r0 + 831 * 16], m3 | |
22276 | pshufb m3, m4, [tab_S2] | |
22277 | movh [r0 + 831 * 16 + 8], m3 | |
22278 | ||
22279 | ; mode 13 [row 31] | |
22280 | pshufb m0, m6, [tab_S2] | |
22281 | movh [r0 + 766 * 16], m0 | |
22282 | movh m0, [r4] | |
22283 | movh [r0 + 766 * 16 + 8], m0 | |
22284 | movu m0, [r4 + 8] | |
22285 | movu [r0 + 767 * 16], m0 | |
22286 | ||
22287 | ; mode 15 [row 24] | |
22288 | pslldq m1, 2 | |
22289 | pinsrw m1, [r4 + 3], 0 | |
22290 | pmaddubsw m3, m1, [r5 + 23 * 16] | |
22291 | pmulhrsw m3, [pw_1024] | |
22292 | pslldq m4, 2 | |
22293 | pinsrw m4, [r4 + 11], 0 | |
22294 | pmaddubsw m5, m4, [r5 + 23 * 16] | |
22295 | pmulhrsw m5, [pw_1024] | |
22296 | packuswb m3, m5 | |
22297 | movu [r0 + 881 * 16], m3 | |
22298 | ||
22299 | ; mode 15 [row 25] | |
22300 | pmaddubsw m3, m1, [r5 + 6 * 16] | |
22301 | pmulhrsw m3, [pw_1024] | |
22302 | pmaddubsw m5, m4, [r5 + 6 * 16] | |
22303 | pmulhrsw m5, [pw_1024] | |
22304 | packuswb m3, m5 | |
22305 | movu [r0 + 883 * 16], m3 | |
22306 | ||
22307 | ; mode 15 [row 26] | |
22308 | pslldq m1, 2 | |
22309 | pinsrw m1, [r4 + 2], 0 | |
22310 | pmaddubsw m3, m1, [r5 + 21 * 16] | |
22311 | pmulhrsw m3, [pw_1024] | |
22312 | pslldq m4, 2 | |
22313 | pinsrw m4, [r4 + 10], 0 | |
22314 | pmaddubsw m5, m4, [r5 + 21 * 16] | |
22315 | pmulhrsw m5, [pw_1024] | |
22316 | packuswb m3, m5 | |
22317 | movu [r0 + 885 * 16], m3 | |
22318 | ||
22319 | ; mode 15 [row 27] | |
22320 | pmaddubsw m3, m1, [r5 + 4 * 16] | |
22321 | pmulhrsw m3, [pw_1024] | |
22322 | pmaddubsw m5, m4, [r5 + 4 * 16] | |
22323 | pmulhrsw m5, [pw_1024] | |
22324 | packuswb m3, m5 | |
22325 | movu [r0 + 887 * 16], m3 | |
22326 | ||
22327 | ; mode 15 [row 28] | |
22328 | pslldq m1, 2 | |
22329 | pinsrw m1, [r4 + 1], 0 | |
22330 | pmaddubsw m3, m1, [r5 + 19 * 16] | |
22331 | pmulhrsw m3, [pw_1024] | |
22332 | pslldq m4, 2 | |
22333 | pinsrw m4, [r4 + 9], 0 | |
22334 | pmaddubsw m5, m4, [r5 + 19 * 16] | |
22335 | pmulhrsw m5, [pw_1024] | |
22336 | packuswb m3, m5 | |
22337 | movu [r0 + 889 * 16], m3 | |
22338 | ||
22339 | ; mode 15 [row 29] | |
22340 | pmaddubsw m3, m1, [r5 + 2 * 16] | |
22341 | pmulhrsw m3, [pw_1024] | |
22342 | pmaddubsw m5, m4, [r5 + 2 * 16] | |
22343 | pmulhrsw m5, [pw_1024] | |
22344 | packuswb m3, m5 | |
22345 | movu [r0 + 891 * 16], m3 | |
22346 | ||
22347 | ; mode 15 [row 30] | |
22348 | pslldq m1, 2 | |
22349 | pinsrw m1, [r4 + 0], 0 | |
22350 | pmaddubsw m3, m1, [r5 + 17 * 16] | |
22351 | pmulhrsw m3, [pw_1024] | |
22352 | pslldq m4, 2 | |
22353 | pinsrw m4, [r4 + 8], 0 | |
22354 | pmaddubsw m5, m4, [r5 + 17 * 16] | |
22355 | pmulhrsw m5, [pw_1024] | |
22356 | packuswb m3, m5 | |
22357 | movu [r0 + 893 * 16], m3 | |
22358 | ||
22359 | ; mode 15 [row 31] | |
22360 | pshufb m5, m1, [tab_S2] | |
22361 | movh [r0 + 895 * 16], m5 | |
22362 | pshufb m5, m4, [tab_S2] | |
22363 | movh [r0 + 895 * 16 + 8], m5 | |
22364 | ||
22365 | ; mode 16 [row 0] | |
22366 | movu m6, [r5 + 11 * 16] | |
22367 | movu m7, [pw_1024] | |
22368 | movh m0, [r4 ] | |
22369 | movh m1, [r4 + 1 ] | |
22370 | punpcklbw m0, m1 | |
22371 | pmaddubsw m1, m0, m6 | |
22372 | pmulhrsw m1, m7 | |
22373 | movh m2, [r4 + 8] | |
22374 | movh m3, [r4 + 9] | |
22375 | punpcklbw m2, m3 | |
22376 | pmaddubsw m3, m2, m6 | |
22377 | pmulhrsw m3, m7 | |
22378 | packuswb m1, m3 | |
22379 | movu [r0 + 896 * 16], m1 | |
22380 | ||
22381 | movh m1, [r4 + 16] | |
22382 | movh m3, [r4 + 17] | |
22383 | punpcklbw m1, m3 | |
22384 | pmaddubsw m3, m1, m6 | |
22385 | pmulhrsw m3, m7 | |
22386 | movh m4, [r4 + 24] | |
22387 | movh m5, [r4 + 25] | |
22388 | punpcklbw m4, m5 | |
22389 | pmaddubsw m5, m4, m6 | |
22390 | pmulhrsw m5, m7 | |
22391 | packuswb m3, m5 | |
22392 | movu [r0 + 897 * 16], m3 | |
22393 | ||
22394 | ; mode16 [row 1] | |
22395 | movu m6, [r5 + 22 * 16] | |
22396 | pslldq m0, 2 | |
22397 | pinsrb m0, [r4], 1 | |
22398 | pinsrb m0, [r3 + 2], 0 | |
22399 | pmaddubsw m3, m0, m6 | |
22400 | pmulhrsw m3, m7 | |
22401 | pslldq m2, 2 | |
22402 | pinsrw m2, [r4 + 7], 0 | |
22403 | pmaddubsw m5, m2, m6 | |
22404 | pmulhrsw m5, m7 | |
22405 | packuswb m3, m5 | |
22406 | movu [r0 + 898 * 16], m3 | |
22407 | ||
22408 | pslldq m1, 2 | |
22409 | pinsrw m1, [r4 + 15], 0 | |
22410 | pmaddubsw m3, m1, m6 | |
22411 | pmulhrsw m3, m7 | |
22412 | pslldq m4, 2 | |
22413 | pinsrw m4, [r4 + 23], 0 | |
22414 | pmaddubsw m5, m4, m6 | |
22415 | pmulhrsw m5, m7 | |
22416 | packuswb m3, m5 | |
22417 | movu [r0 + 899 * 16], m3 | |
22418 | ||
22419 | ; mode16 [row 2] | |
22420 | movu m6, [r5 + 1 * 16] | |
22421 | pmaddubsw m3, m0, m6 | |
22422 | pmulhrsw m3, m7 | |
22423 | pmaddubsw m5, m2, m6 | |
22424 | pmulhrsw m5, m7 | |
22425 | packuswb m3, m5 | |
22426 | movu [r0 + 900 * 16], m3 | |
22427 | ||
22428 | pmaddubsw m3, m1, m6 | |
22429 | pmulhrsw m3, m7 | |
22430 | pmaddubsw m5, m4, m6 | |
22431 | pmulhrsw m5, m7 | |
22432 | packuswb m3, m5 | |
22433 | movu [r0 + 901 * 16], m3 | |
22434 | ||
22435 | ; mode16 [row 3] | |
22436 | movu m6, [r5 + 12 * 16] | |
22437 | pslldq m0, 2 | |
22438 | pinsrb m0, [r3 + 2], 1 | |
22439 | pinsrb m0, [r3 + 3], 0 | |
22440 | pmaddubsw m3, m0, m6 | |
22441 | pmulhrsw m3, m7 | |
22442 | pslldq m2, 2 | |
22443 | pinsrw m2, [r4 + 6], 0 | |
22444 | pmaddubsw m5, m2, m6 | |
22445 | pmulhrsw m5, m7 | |
22446 | packuswb m3, m5 | |
22447 | movu [r0 + 902 * 16], m3 | |
22448 | ||
22449 | pslldq m1, 2 | |
22450 | pinsrw m1, [r4 + 14], 0 | |
22451 | pmaddubsw m3, m1, m6 | |
22452 | pmulhrsw m3, m7 | |
22453 | pslldq m4, 2 | |
22454 | pinsrw m4, [r4 + 22], 0 | |
22455 | pmaddubsw m5, m4, m6 | |
22456 | pmulhrsw m5, m7 | |
22457 | packuswb m3, m5 | |
22458 | movu [r0 + 903 * 16], m3 | |
22459 | ||
22460 | ; mode16 [row 4] | |
22461 | movu m6, [r5 + 23 * 16] | |
22462 | pslldq m0, 2 | |
22463 | pinsrb m0, [r3 + 3], 1 | |
22464 | pinsrb m0, [r3 + 5], 0 | |
22465 | pmaddubsw m3, m0, m6 | |
22466 | pmulhrsw m3, m7 | |
22467 | pslldq m2, 2 | |
22468 | pinsrw m2, [r4 + 5], 0 | |
22469 | pmaddubsw m5, m2, m6 | |
22470 | pmulhrsw m5, m7 | |
22471 | packuswb m3, m5 | |
22472 | movu [r0 + 904 * 16], m3 | |
22473 | ||
22474 | pslldq m1, 2 | |
22475 | pinsrw m1, [r4 + 13], 0 | |
22476 | pmaddubsw m3, m1, m6 | |
22477 | pmulhrsw m3, m7 | |
22478 | pslldq m4, 2 | |
22479 | pinsrw m4, [r4 + 21], 0 | |
22480 | pmaddubsw m5, m4, m6 | |
22481 | pmulhrsw m5, m7 | |
22482 | packuswb m3, m5 | |
22483 | movu [r0 + 905 * 16], m3 | |
22484 | ||
22485 | ; mode16 [row 5] | |
22486 | movu m6, [r5 + 2 * 16] | |
22487 | pmaddubsw m3, m0, m6 | |
22488 | pmulhrsw m3, m7 | |
22489 | pmaddubsw m5, m2, m6 | |
22490 | pmulhrsw m5, m7 | |
22491 | packuswb m3, m5 | |
22492 | movu [r0 + 906 * 16], m3 | |
22493 | ||
22494 | pmaddubsw m3, m1, m6 | |
22495 | pmulhrsw m3, m7 | |
22496 | pmaddubsw m5, m4, m6 | |
22497 | pmulhrsw m5, m7 | |
22498 | packuswb m3, m5 | |
22499 | movu [r0 + 907 * 16], m3 | |
22500 | ||
22501 | ; mode16 [row 6] | |
22502 | movu m6, [r5 + 13 * 16] | |
22503 | pslldq m0, 2 | |
22504 | pinsrb m0, [r3 + 5], 1 | |
22505 | pinsrb m0, [r3 + 6], 0 | |
22506 | pmaddubsw m3, m0, m6 | |
22507 | pmulhrsw m3, m7 | |
22508 | pslldq m2, 2 | |
22509 | pinsrb m2, [r4 + 5], 1 | |
22510 | pinsrb m2, [r4 + 4], 0 | |
22511 | pmaddubsw m5, m2, m6 | |
22512 | pmulhrsw m5, m7 | |
22513 | packuswb m3, m5 | |
22514 | movu [r0 + 908 * 16], m3 | |
22515 | pslldq m1, 2 | |
22516 | pinsrw m1, [r4 + 12], 0 | |
22517 | pmaddubsw m3, m1, m6 | |
22518 | pmulhrsw m3, m7 | |
22519 | pslldq m4, 2 | |
22520 | pinsrw m4, [r4 + 20], 0 | |
22521 | pmaddubsw m5, m4, m6 | |
22522 | pmulhrsw m5, m7 | |
22523 | packuswb m3, m5 | |
22524 | movu [r0 + 909 * 16], m3 | |
22525 | ||
22526 | ; mode16 [row 7] | |
22527 | movu m6, [r5 + 24 * 16] | |
22528 | pslldq m0, 2 | |
22529 | pinsrb m0, [r3 + 6], 1 | |
22530 | pinsrb m0, [r3 + 8], 0 | |
22531 | pmaddubsw m3, m0, m6 | |
22532 | pmulhrsw m3, m7 | |
22533 | pslldq m2, 2 | |
22534 | pinsrw m2, [r4 + 3], 0 | |
22535 | pmaddubsw m5, m2, m6 | |
22536 | pmulhrsw m5, m7 | |
22537 | packuswb m3, m5 | |
22538 | movu [r0 + 910 * 16], m3 | |
22539 | ||
22540 | pslldq m1, 2 | |
22541 | pinsrw m1, [r4 + 11], 0 | |
22542 | pmaddubsw m3, m1, m6 | |
22543 | pmulhrsw m3, m7 | |
22544 | pslldq m4, 2 | |
22545 | pinsrw m4, [r4 + 19], 0 | |
22546 | pmaddubsw m5, m4, m6 | |
22547 | pmulhrsw m5, m7 | |
22548 | packuswb m3, m5 | |
22549 | movu [r0 + 911 * 16], m3 | |
22550 | ||
22551 | ; mode16 [row 8] | |
22552 | movu m6, [r5 + 3 * 16] | |
22553 | pmaddubsw m3, m0, m6 | |
22554 | pmulhrsw m3, m7 | |
22555 | pmaddubsw m5, m2, m6 | |
22556 | pmulhrsw m5, m7 | |
22557 | packuswb m3, m5 | |
22558 | movu [r0 + 912 * 16], m3 | |
22559 | ||
22560 | pmaddubsw m3, m1, m6 | |
22561 | pmulhrsw m3, m7 | |
22562 | pmaddubsw m5, m4, m6 | |
22563 | pmulhrsw m5, m7 | |
22564 | packuswb m3, m5 | |
22565 | movu [r0 + 913 * 16], m3 | |
22566 | ||
22567 | ; mode16 [row 9] | |
22568 | movu m6, [r5 + 14 * 16] | |
22569 | pslldq m0, 2 | |
22570 | pinsrb m0, [r3 + 8], 1 | |
22571 | pinsrb m0, [r3 + 9], 0 | |
22572 | pmaddubsw m3, m0, m6 | |
22573 | pmulhrsw m3, m7 | |
22574 | pslldq m2, 2 | |
22575 | pinsrw m2, [r4 + 2], 0 | |
22576 | pmaddubsw m5, m2, m6 | |
22577 | pmulhrsw m5, m7 | |
22578 | packuswb m3, m5 | |
22579 | movu [r0 + 914 * 16], m3 | |
22580 | ||
22581 | pslldq m1, 2 | |
22582 | pinsrw m1, [r4 + 10], 0 | |
22583 | pmaddubsw m3, m1, m6 | |
22584 | pmulhrsw m3, m7 | |
22585 | pslldq m4, 2 | |
22586 | pinsrw m4, [r4 + 18], 0 | |
22587 | pmaddubsw m5, m4, m6 | |
22588 | pmulhrsw m5, m7 | |
22589 | packuswb m3, m5 | |
22590 | movu [r0 + 915 * 16], m3 | |
22591 | ||
22592 | ; mode16 [row 10] | |
22593 | movu m6, [r5 + 25 * 16] | |
22594 | pslldq m0, 2 | |
22595 | pinsrb m0, [r3 + 9], 1 | |
22596 | pinsrb m0, [r3 + 11], 0 | |
22597 | pmaddubsw m3, m0, m6 | |
22598 | pmulhrsw m3, m7 | |
22599 | pslldq m2, 2 | |
22600 | pinsrw m2, [r4 + 1], 0 | |
22601 | pmaddubsw m5, m2, m6 | |
22602 | pmulhrsw m5, m7 | |
22603 | packuswb m3, m5 | |
22604 | movu [r0 + 916 * 16], m3 | |
22605 | ||
22606 | pslldq m1, 2 | |
22607 | pinsrw m1, [r4 + 9], 0 | |
22608 | pmaddubsw m3, m1, m6 | |
22609 | pmulhrsw m3, m7 | |
22610 | pslldq m4, 2 | |
22611 | pinsrb m4, [r4 + 18], 1 | |
22612 | pinsrb m4, [r4 + 17], 0 | |
22613 | pmaddubsw m5, m4, m6 | |
22614 | pmulhrsw m5, m7 | |
22615 | packuswb m3, m5 | |
22616 | movu [r0 + 917 * 16], m3 | |
22617 | ||
22618 | ; mode16 [row 11] | |
22619 | movu m6, [r5 + 4 * 16] | |
22620 | pmaddubsw m3, m0, m6 | |
22621 | pmulhrsw m3, m7 | |
22622 | pmaddubsw m5, m2, m6 | |
22623 | pmulhrsw m5, m7 | |
22624 | packuswb m3, m5 | |
22625 | movu [r0 + 918 * 16], m3 | |
22626 | ||
22627 | pmaddubsw m3, m1, m6 | |
22628 | pmulhrsw m3, m7 | |
22629 | pmaddubsw m5, m4, m6 | |
22630 | pmulhrsw m5, m7 | |
22631 | packuswb m3, m5 | |
22632 | movu [r0 + 919 * 16], m3 | |
22633 | ||
22634 | ; mode16 [row 12] | |
22635 | movu m6, [r5 + 15 * 16] | |
22636 | pslldq m0, 2 | |
22637 | pinsrb m0, [r3 + 11], 1 | |
22638 | pinsrb m0, [r3 + 12], 0 | |
22639 | pmaddubsw m3, m0, m6 | |
22640 | pmulhrsw m3, m7 | |
22641 | pslldq m2, 2 | |
22642 | pinsrw m2, [r4 + 0], 0 | |
22643 | pmaddubsw m5, m2, m6 | |
22644 | pmulhrsw m5, m7 | |
22645 | packuswb m3, m5 | |
22646 | movu [r0 + 920 * 16], m3 | |
22647 | ||
22648 | pslldq m1, 2 | |
22649 | pinsrw m1, [r4 + 8], 0 | |
22650 | pmaddubsw m3, m1, m6 | |
22651 | pmulhrsw m3, m7 | |
22652 | pslldq m4, 2 | |
22653 | pinsrw m4, [r4 + 16], 0 | |
22654 | pmaddubsw m5, m4, m6 | |
22655 | pmulhrsw m5, m7 | |
22656 | packuswb m3, m5 | |
22657 | movu [r0 + 921 * 16], m3 | |
22658 | ||
22659 | ; mode16 [row 13] | |
22660 | movu m6, [r5 + 26 * 16] | |
22661 | pslldq m0, 2 | |
22662 | pinsrb m0, [r3 + 12], 1 | |
22663 | pinsrb m0, [r3 + 14], 0 | |
22664 | pmaddubsw m3, m0, m6 | |
22665 | pmulhrsw m3, m7 | |
22666 | pslldq m2, 2 | |
22667 | pinsrb m2, [r4 + 0], 1 | |
22668 | pinsrb m2, [r3 + 2], 0 | |
22669 | pmaddubsw m5, m2, m6 | |
22670 | pmulhrsw m5, m7 | |
22671 | packuswb m3, m5 | |
22672 | movu [r0 + 922 * 16], m3 | |
22673 | ||
22674 | pslldq m1, 2 | |
22675 | pinsrw m1, [r4 + 7], 0 | |
22676 | pmaddubsw m3, m1, m6 | |
22677 | pmulhrsw m3, m7 | |
22678 | pslldq m4, 2 | |
22679 | pinsrw m4, [r4 + 15], 0 | |
22680 | pmaddubsw m5, m4, m6 | |
22681 | pmulhrsw m5, m7 | |
22682 | packuswb m3, m5 | |
22683 | movu [r0 + 923 * 16], m3 | |
22684 | ||
22685 | ; mode16 [row 14] | |
22686 | movu m6, [r5 + 5 * 16] | |
22687 | pmaddubsw m3, m0, m6 | |
22688 | pmulhrsw m3, m7 | |
22689 | pmaddubsw m5, m2, m6 | |
22690 | pmulhrsw m5, m7 | |
22691 | packuswb m3, m5 | |
22692 | movu [r0 + 924 * 16], m3 | |
22693 | ||
22694 | pmaddubsw m3, m1, m6 | |
22695 | pmulhrsw m3, m7 | |
22696 | pmaddubsw m5, m4, m6 | |
22697 | pmulhrsw m5, m7 | |
22698 | packuswb m3, m5 | |
22699 | movu [r0 + 925 * 16], m3 | |
22700 | ||
22701 | ; mode16 [row 15] | |
22702 | movu m6, [r5 + 16 * 16] | |
22703 | pslldq m0, 2 | |
22704 | pinsrb m0, [r3 + 14], 1 | |
22705 | pinsrb m0, [r3 + 15], 0 | |
22706 | pmaddubsw m3, m0, m6 | |
22707 | pmulhrsw m3, m7 | |
22708 | pslldq m2, 2 | |
22709 | pinsrb m2, [r3 + 2], 1 | |
22710 | pinsrb m2, [r3 + 3], 0 | |
22711 | pmaddubsw m5, m2, m6 | |
22712 | pmulhrsw m5, m7 | |
22713 | packuswb m3, m5 | |
22714 | movu [r0 + 926 * 16], m3 | |
22715 | ||
22716 | pslldq m1, 2 | |
22717 | pinsrw m1, [r4 + 6], 0 | |
22718 | pmaddubsw m3, m1, m6 | |
22719 | pmulhrsw m3, m7 | |
22720 | pslldq m4, 2 | |
22721 | pinsrw m4, [r4 + 14], 0 | |
22722 | pmaddubsw m5, m4, m6 | |
22723 | pmulhrsw m5, m7 | |
22724 | packuswb m3, m5 | |
22725 | movu [r0 + 927 * 16], m3 | |
22726 | ||
22727 | ; mode16 [row 16] | |
22728 | movu m6, [r5 + 27 * 16] | |
22729 | pslldq m0, 2 | |
22730 | pinsrb m0, [r3 + 15], 1 | |
22731 | pinsrb m0, [r3 + 17], 0 | |
22732 | pmaddubsw m3, m0, m6 | |
22733 | pmulhrsw m3, m7 | |
22734 | pslldq m2, 2 | |
22735 | pinsrb m2, [r3 + 3], 1 | |
22736 | pinsrb m2, [r3 + 5], 0 | |
22737 | pmaddubsw m5, m2, m6 | |
22738 | pmulhrsw m5, m7 | |
22739 | packuswb m3, m5 | |
22740 | movu [r0 + 928 * 16], m3 | |
22741 | ||
22742 | pslldq m1, 2 | |
22743 | pinsrw m1, [r4 + 5], 0 | |
22744 | pmaddubsw m3, m1, m6 | |
22745 | pmulhrsw m3, m7 | |
22746 | pslldq m4, 2 | |
22747 | pinsrw m4, [r4 + 13], 0 | |
22748 | pmaddubsw m5, m4, m6 | |
22749 | pmulhrsw m5, m7 | |
22750 | packuswb m3, m5 | |
22751 | movu [r0 + 929 * 16], m3 | |
22752 | ||
22753 | ; mode16 [row 17] | |
22754 | movu m6, [r5 + 6 * 16] | |
22755 | pmaddubsw m3, m0, m6 | |
22756 | pmulhrsw m3, m7 | |
22757 | pmaddubsw m5, m2, m6 | |
22758 | pmulhrsw m5, m7 | |
22759 | packuswb m3, m5 | |
22760 | movu [r0 + 930 * 16], m3 | |
22761 | ||
22762 | pmaddubsw m3, m1, m6 | |
22763 | pmulhrsw m3, m7 | |
22764 | pmaddubsw m5, m4, m6 | |
22765 | pmulhrsw m5, m7 | |
22766 | packuswb m3, m5 | |
22767 | movu [r0 + 931 * 16], m3 | |
22768 | ||
22769 | ; mode16 [row 18] | |
22770 | movu m6, [r5 + 17 * 16] | |
22771 | pslldq m0, 2 | |
22772 | pinsrb m0, [r3 + 17], 1 | |
22773 | pinsrb m0, [r3 + 18], 0 | |
22774 | pmaddubsw m3, m0, m6 | |
22775 | pmulhrsw m3, m7 | |
22776 | pslldq m2, 2 | |
22777 | pinsrb m2, [r3 + 5], 1 | |
22778 | pinsrb m2, [r3 + 6], 0 | |
22779 | pmaddubsw m5, m2, m6 | |
22780 | pmulhrsw m5, m7 | |
22781 | packuswb m3, m5 | |
22782 | movu [r0 + 932 * 16], m3 | |
22783 | ||
22784 | pslldq m1, 2 | |
22785 | pinsrw m1, [r4 + 4], 0 | |
22786 | pmaddubsw m3, m1, m6 | |
22787 | pmulhrsw m3, m7 | |
22788 | pslldq m4, 2 | |
22789 | pinsrw m4, [r4 + 12], 0 | |
22790 | pmaddubsw m5, m4, m6 | |
22791 | pmulhrsw m5, m7 | |
22792 | packuswb m3, m5 | |
22793 | movu [r0 + 933 * 16], m3 | |
22794 | ||
22795 | ; mode16 [row 19] | |
22796 | movu m6, [r5 + 28 * 16] | |
22797 | pslldq m0, 2 | |
22798 | pinsrb m0, [r3 + 18], 1 | |
22799 | pinsrb m0, [r3 + 20], 0 | |
22800 | pmaddubsw m3, m0, m6 | |
22801 | pmulhrsw m3, m7 | |
22802 | pslldq m2, 2 | |
22803 | pinsrb m2, [r3 + 6], 1 | |
22804 | pinsrb m2, [r3 + 8], 0 | |
22805 | pmaddubsw m5, m2, m6 | |
22806 | pmulhrsw m5, m7 | |
22807 | packuswb m3, m5 | |
22808 | movu [r0 + 934 * 16], m3 | |
22809 | ||
22810 | pslldq m1, 2 | |
22811 | pinsrw m1, [r4 + 3], 0 | |
22812 | pmaddubsw m3, m1, m6 | |
22813 | pmulhrsw m3, m7 | |
22814 | pslldq m4, 2 | |
22815 | pinsrw m4, [r4 + 11], 0 | |
22816 | pmaddubsw m5, m4, m6 | |
22817 | pmulhrsw m5, m7 | |
22818 | packuswb m3, m5 | |
22819 | movu [r0 + 935 * 16], m3 | |
22820 | ||
22821 | ; mode16 [row 20] | |
22822 | movu m6, [r5 + 7 * 16] | |
22823 | pmaddubsw m3, m0, m6 | |
22824 | pmulhrsw m3, m7 | |
22825 | pmaddubsw m5, m2, m6 | |
22826 | pmulhrsw m5, m7 | |
22827 | packuswb m3, m5 | |
22828 | movu [r0 + 936 * 16], m3 | |
22829 | ||
22830 | pmaddubsw m3, m1, m6 | |
22831 | pmulhrsw m3, m7 | |
22832 | pmaddubsw m5, m4, m6 | |
22833 | pmulhrsw m5, m7 | |
22834 | packuswb m3, m5 | |
22835 | movu [r0 + 937 * 16], m3 | |
22836 | ||
22837 | ; mode16 [row 21] | |
22838 | movu m6, [r5 + 18 * 16] | |
22839 | pslldq m0, 2 | |
22840 | pinsrb m0, [r3 + 20], 1 | |
22841 | pinsrb m0, [r3 + 21], 0 | |
22842 | pmaddubsw m3, m0, m6 | |
22843 | pmulhrsw m3, m7 | |
22844 | pslldq m2, 2 | |
22845 | pinsrb m2, [r3 + 8], 1 | |
22846 | pinsrb m2, [r3 + 9], 0 | |
22847 | pmaddubsw m5, m2, m6 | |
22848 | pmulhrsw m5, m7 | |
22849 | packuswb m3, m5 | |
22850 | movu [r0 + 938 * 16], m3 | |
22851 | ||
22852 | pslldq m1, 2 | |
22853 | pinsrw m1, [r4 + 2], 0 | |
22854 | pmaddubsw m3, m1, m6 | |
22855 | pmulhrsw m3, m7 | |
22856 | pslldq m4, 2 | |
22857 | pinsrw m4, [r4 + 10], 0 | |
22858 | pmaddubsw m5, m4, m6 | |
22859 | pmulhrsw m5, m7 | |
22860 | packuswb m3, m5 | |
22861 | movu [r0 + 939 * 16], m3 | |
22862 | ||
22863 | ; mode16 [row 22] | |
22864 | movu m6, [r5 + 29 * 16] | |
22865 | pslldq m0, 2 | |
22866 | pinsrb m0, [r3 + 21], 1 | |
22867 | pinsrb m0, [r3 + 23], 0 | |
22868 | pmaddubsw m3, m0, m6 | |
22869 | pmulhrsw m3, m7 | |
22870 | pslldq m2, 2 | |
22871 | pinsrb m2, [r3 + 9], 1 | |
22872 | pinsrb m2, [r3 + 11], 0 | |
22873 | pmaddubsw m5, m2, m6 | |
22874 | pmulhrsw m5, m7 | |
22875 | packuswb m3, m5 | |
22876 | movu [r0 + 940 * 16], m3 | |
22877 | ||
22878 | pslldq m1, 2 | |
22879 | pinsrw m1, [r4 + 1], 0 | |
22880 | pmaddubsw m3, m1, m6 | |
22881 | pmulhrsw m3, m7 | |
22882 | pslldq m4, 2 | |
22883 | pinsrw m4, [r4 + 9], 0 | |
22884 | pmaddubsw m5, m4, m6 | |
22885 | pmulhrsw m5, m7 | |
22886 | packuswb m3, m5 | |
22887 | movu [r0 + 941 * 16], m3 | |
22888 | ||
22889 | ; mode16 [row 23] | |
22890 | movu m6, [r5 + 8 * 16] | |
22891 | pmaddubsw m3, m0, m6 | |
22892 | pmulhrsw m3, m7 | |
22893 | pmaddubsw m5, m2, m6 | |
22894 | pmulhrsw m5, m7 | |
22895 | packuswb m3, m5 | |
22896 | movu [r0 + 942 * 16], m3 | |
22897 | ||
22898 | pmaddubsw m3, m1, m6 | |
22899 | pmulhrsw m3, m7 | |
22900 | pmaddubsw m5, m4, m6 | |
22901 | pmulhrsw m5, m7 | |
22902 | packuswb m3, m5 | |
22903 | movu [r0 + 943 * 16], m3 | |
22904 | ||
22905 | ; mode16 [row 24] | |
22906 | movu m6, [r5 + 19 * 16] | |
22907 | pslldq m0, 2 | |
22908 | pinsrb m0, [r3 + 23], 1 | |
22909 | pinsrb m0, [r3 + 24], 0 | |
22910 | pmaddubsw m3, m0, m6 | |
22911 | pmulhrsw m3, m7 | |
22912 | pslldq m2, 2 | |
22913 | pinsrb m2, [r3 + 11], 1 | |
22914 | pinsrb m2, [r3 + 12], 0 | |
22915 | pmaddubsw m5, m2, m6 | |
22916 | pmulhrsw m5, m7 | |
22917 | packuswb m3, m5 | |
22918 | movu [r0 + 944 * 16], m3 | |
22919 | ||
22920 | pslldq m1, 2 | |
22921 | pinsrw m1, [r4 + 0], 0 | |
22922 | pmaddubsw m3, m1, m6 | |
22923 | pmulhrsw m3, m7 | |
22924 | pslldq m4, 2 | |
22925 | pinsrw m4, [r4 + 8], 0 | |
22926 | pmaddubsw m5, m4, m6 | |
22927 | pmulhrsw m5, m7 | |
22928 | packuswb m3, m5 | |
22929 | movu [r0 + 945 * 16], m3 | |
22930 | ||
22931 | ; mode16 [row 25] | |
22932 | movu m6, [r5 + 30 * 16] | |
22933 | pslldq m0, 2 | |
22934 | pinsrb m0, [r3 + 24], 1 | |
22935 | pinsrb m0, [r3 + 26], 0 | |
22936 | pmaddubsw m3, m0, m6 | |
22937 | pmulhrsw m3, m7 | |
22938 | pslldq m2, 2 | |
22939 | pinsrb m2, [r3 + 12], 1 | |
22940 | pinsrb m2, [r3 + 14], 0 | |
22941 | pmaddubsw m5, m2, m6 | |
22942 | pmulhrsw m5, m7 | |
22943 | packuswb m3, m5 | |
22944 | movu [r0 + 946 * 16], m3 | |
22945 | ||
22946 | pslldq m1, 2 | |
22947 | pinsrb m1, [r4 + 0], 1 | |
22948 | pinsrb m1, [r3 + 2], 0 | |
22949 | pmaddubsw m3, m1, m6 | |
22950 | pmulhrsw m3, m7 | |
22951 | pslldq m4, 2 | |
22952 | pinsrw m4, [r4 + 7], 0 | |
22953 | pmaddubsw m5, m4, m6 | |
22954 | pmulhrsw m5, m7 | |
22955 | packuswb m3, m5 | |
22956 | movu [r0 + 947 * 16], m3 | |
22957 | ||
22958 | ; mode16 [row 26] | |
22959 | movu m6, [r5 + 9 * 16] | |
22960 | pmaddubsw m3, m0, m6 | |
22961 | pmulhrsw m3, m7 | |
22962 | pmaddubsw m5, m2, m6 | |
22963 | pmulhrsw m5, m7 | |
22964 | packuswb m3, m5 | |
22965 | movu [r0 + 948 * 16], m3 | |
22966 | ||
22967 | pmaddubsw m3, m1, m6 | |
22968 | pmulhrsw m3, m7 | |
22969 | pmaddubsw m5, m4, m6 | |
22970 | pmulhrsw m5, m7 | |
22971 | packuswb m3, m5 | |
22972 | movu [r0 + 949 * 16], m3 | |
22973 | ||
22974 | ; mode16 [row 27] | |
22975 | movu m6, [r5 + 20 * 16] | |
22976 | pslldq m0, 2 | |
22977 | pinsrb m0, [r3 + 26], 1 | |
22978 | pinsrb m0, [r3 + 27], 0 | |
22979 | pmaddubsw m3, m0, m6 | |
22980 | pmulhrsw m3, m7 | |
22981 | pslldq m2, 2 | |
22982 | pinsrb m2, [r3 + 14], 1 | |
22983 | pinsrb m2, [r3 + 15], 0 | |
22984 | pmaddubsw m5, m2, m6 | |
22985 | pmulhrsw m5, m7 | |
22986 | packuswb m3, m5 | |
22987 | movu [r0 + 950 * 16], m3 | |
22988 | ||
22989 | pslldq m1, 2 | |
22990 | pinsrb m1, [r3 + 2], 1 | |
22991 | pinsrb m1, [r3 + 3], 0 | |
22992 | pmaddubsw m3, m1, m6 | |
22993 | pmulhrsw m3, m7 | |
22994 | pslldq m4, 2 | |
22995 | pinsrw m4, [r4 + 6], 0 | |
22996 | pmaddubsw m5, m4, m6 | |
22997 | pmulhrsw m5, m7 | |
22998 | packuswb m3, m5 | |
22999 | movu [r0 + 951 * 16], m3 | |
23000 | ||
23001 | ; mode16 [row 28] | |
23002 | movu m6, [r5 + 31 * 16] | |
23003 | pslldq m0, 2 | |
23004 | pinsrb m0, [r3 + 27], 1 | |
23005 | pinsrb m0, [r3 + 29], 0 | |
23006 | pmaddubsw m3, m0, m6 | |
23007 | pmulhrsw m3, m7 | |
23008 | pslldq m2, 2 | |
23009 | pinsrb m2, [r3 + 15], 1 | |
23010 | pinsrb m2, [r3 + 17], 0 | |
23011 | pmaddubsw m5, m2, m6 | |
23012 | pmulhrsw m5, m7 | |
23013 | packuswb m3, m5 | |
23014 | movu [r0 + 952 * 16], m3 | |
23015 | ||
23016 | pslldq m1, 2 | |
23017 | pinsrb m1, [r3 + 3], 1 | |
23018 | pinsrb m1, [r3 + 5], 0 | |
23019 | pmaddubsw m3, m1, m6 | |
23020 | pmulhrsw m3, m7 | |
23021 | pslldq m4, 2 | |
23022 | pinsrw m4, [r4 + 5], 0 | |
23023 | pmaddubsw m5, m4, m6 | |
23024 | pmulhrsw m5, m7 | |
23025 | packuswb m3, m5 | |
23026 | movu [r0 + 953 * 16], m3 | |
23027 | ||
23028 | ; mode16 [row 29] | |
23029 | movu m6, [r5 + 10 * 16] | |
23030 | pmaddubsw m3, m0, m6 | |
23031 | pmulhrsw m3, m7 | |
23032 | pmaddubsw m5, m2, m6 | |
23033 | pmulhrsw m5, m7 | |
23034 | packuswb m3, m5 | |
23035 | movu [r0 + 954 * 16], m3 | |
23036 | ||
23037 | pmaddubsw m3, m1, m6 | |
23038 | pmulhrsw m3, m7 | |
23039 | pmaddubsw m5, m4, m6 | |
23040 | pmulhrsw m5, m7 | |
23041 | packuswb m3, m5 | |
23042 | movu [r0 + 955 * 16], m3 | |
23043 | ||
23044 | ; mode16 [row 30] | |
23045 | movu m6, [r5 + 21 * 16] | |
23046 | pslldq m0, 2 | |
23047 | pinsrb m0, [r3 + 29], 1 | |
23048 | pinsrb m0, [r3 + 30], 0 | |
23049 | pmaddubsw m3, m0, m6 | |
23050 | pmulhrsw m3, m7 | |
23051 | pslldq m2, 2 | |
23052 | pinsrb m2, [r3 + 17], 1 | |
23053 | pinsrb m2, [r3 + 18], 0 | |
23054 | pmaddubsw m5, m2, m6 | |
23055 | pmulhrsw m5, m7 | |
23056 | packuswb m3, m5 | |
23057 | movu [r0 + 956 * 16], m3 | |
23058 | ||
23059 | pslldq m1, 2 | |
23060 | pinsrb m1, [r3 + 5], 1 | |
23061 | pinsrb m1, [r3 + 6], 0 | |
23062 | pmaddubsw m3, m1, m6 | |
23063 | pmulhrsw m3, m7 | |
23064 | pslldq m4, 2 | |
23065 | pinsrw m4, [r4 + 4], 0 | |
23066 | pmaddubsw m5, m4, m6 | |
23067 | pmulhrsw m5, m7 | |
23068 | packuswb m3, m5 | |
23069 | movu [r0 + 957 * 16], m3 | |
23070 | ||
23071 | ; mode16 [row 31] | |
23072 | pshufb m5, m0, [tab_S2] | |
23073 | movh [r0 + 958 * 16], m5 | |
23074 | pshufb m5, m2, [tab_S2] | |
23075 | movh [r0 + 958 * 16 + 8], m5 | |
23076 | pshufb m5, m1, [tab_S2] | |
23077 | movh [r0 + 959 * 16], m5 | |
23078 | pshufb m5, m4, [tab_S2] | |
23079 | movh [r0 + 959 * 16 + 8], m5 | |
23080 | ||
23081 | ; mode17 [row 0] | |
23082 | movu m6, [r5 + 6 * 16] | |
23083 | movu m7, [pw_1024] | |
23084 | movh m0, [r4 ] | |
23085 | movh m1, [r4 + 1 ] | |
23086 | punpcklbw m0, m1 | |
23087 | pmaddubsw m1, m0, m6 | |
23088 | pmulhrsw m1, m7 | |
23089 | movh m2, [r4 + 8] | |
23090 | movh m3, [r4 + 9] | |
23091 | punpcklbw m2, m3 | |
23092 | pmaddubsw m3, m2, m6 | |
23093 | pmulhrsw m3, m7 | |
23094 | packuswb m1, m3 | |
23095 | movu [r0 + 960 * 16], m1 | |
23096 | ||
23097 | movh m1, [r4 + 16] | |
23098 | movh m3, [r4 + 17] | |
23099 | punpcklbw m1, m3 | |
23100 | pmaddubsw m3, m1, m6 | |
23101 | pmulhrsw m3, m7 | |
23102 | movh m4, [r4 + 24] | |
23103 | movh m5, [r4 + 25] | |
23104 | punpcklbw m4, m5 | |
23105 | pmaddubsw m5, m4, m6 | |
23106 | pmulhrsw m5, m7 | |
23107 | packuswb m3, m5 | |
23108 | movu [r0 + 961 * 16], m3 | |
23109 | ||
23110 | ; mode17 [row 1] | |
23111 | movu m6, [r5 + 12 * 16] | |
23112 | pslldq m0, 2 | |
23113 | pinsrb m0, [r3 + 0], 1 | |
23114 | pinsrb m0, [r3 + 1], 0 | |
23115 | pmaddubsw m3, m0, m6 | |
23116 | pmulhrsw m3, m7 | |
23117 | pslldq m2, 2 | |
23118 | pinsrw m2, [r4 + 7], 0 | |
23119 | pmaddubsw m5, m2, m6 | |
23120 | pmulhrsw m5, m7 | |
23121 | packuswb m3, m5 | |
23122 | movu [r0 + 962 * 16], m3 | |
23123 | ||
23124 | pslldq m1, 2 | |
23125 | pinsrw m1, [r4 + 15], 0 | |
23126 | pmaddubsw m3, m1, m6 | |
23127 | pmulhrsw m3, m7 | |
23128 | pslldq m4, 2 | |
23129 | pinsrw m4, [r4 + 23], 0 | |
23130 | pmaddubsw m5, m4, m6 | |
23131 | pmulhrsw m5, m7 | |
23132 | packuswb m3, m5 | |
23133 | movu [r0 + 963 * 16], m3 | |
23134 | ||
23135 | ; mode17 [row 2] | |
23136 | movu m6, [r5 + 18 * 16] | |
23137 | pslldq m0, 2 | |
23138 | pinsrb m0, [r3 + 1], 1 | |
23139 | pinsrb m0, [r3 + 2], 0 | |
23140 | pmaddubsw m3, m0, m6 | |
23141 | pmulhrsw m3, m7 | |
23142 | pslldq m2, 2 | |
23143 | pinsrw m2, [r4 + 6], 0 | |
23144 | pmaddubsw m5, m2, m6 | |
23145 | pmulhrsw m5, m7 | |
23146 | packuswb m3, m5 | |
23147 | movu [r0 + 964 * 16], m3 | |
23148 | ||
23149 | pslldq m1, 2 | |
23150 | pinsrw m1, [r4 + 14], 0 | |
23151 | pmaddubsw m3, m1, m6 | |
23152 | pmulhrsw m3, m7 | |
23153 | pslldq m4, 2 | |
23154 | pinsrw m4, [r4 + 22], 0 | |
23155 | pmaddubsw m5, m4, m6 | |
23156 | pmulhrsw m5, m7 | |
23157 | packuswb m3, m5 | |
23158 | movu [r0 + 965 * 16], m3 | |
23159 | ||
23160 | ; mode17 [row 3] | |
23161 | movu m6, [r5 + 24 * 16] | |
23162 | pslldq m0, 2 | |
23163 | pinsrb m0, [r3 + 2], 1 | |
23164 | pinsrb m0, [r3 + 4], 0 | |
23165 | pmaddubsw m3, m0, m6 | |
23166 | pmulhrsw m3, m7 | |
23167 | pslldq m2, 2 | |
23168 | pinsrw m2, [r4 + 5], 0 | |
23169 | pmaddubsw m5, m2, m6 | |
23170 | pmulhrsw m5, m7 | |
23171 | packuswb m3, m5 | |
23172 | movu [r0 + 966 * 16], m3 | |
23173 | ||
23174 | pslldq m1, 2 | |
23175 | pinsrw m1, [r4 + 13], 0 | |
23176 | pmaddubsw m3, m1, m6 | |
23177 | pmulhrsw m3, m7 | |
23178 | pslldq m4, 2 | |
23179 | pinsrw m4, [r4 + 21], 0 | |
23180 | pmaddubsw m5, m4, m6 | |
23181 | pmulhrsw m5, m7 | |
23182 | packuswb m3, m5 | |
23183 | movu [r0 + 967 * 16], m3 | |
23184 | ||
23185 | ; mode17 [row 4] | |
23186 | movu m6, [r5 + 30 * 16] | |
23187 | pslldq m0, 2 | |
23188 | pinsrb m0, [r3 + 4], 1 | |
23189 | pinsrb m0, [r3 + 5], 0 | |
23190 | pmaddubsw m3, m0, m6 | |
23191 | pmulhrsw m3, m7 | |
23192 | pslldq m2, 2 | |
23193 | pinsrw m2, [r4 + 4], 0 | |
23194 | pmaddubsw m5, m2, m6 | |
23195 | pmulhrsw m5, m7 | |
23196 | packuswb m3, m5 | |
23197 | movu [r0 + 968 * 16], m3 | |
23198 | ||
23199 | pslldq m1, 2 | |
23200 | pinsrw m1, [r4 + 12], 0 | |
23201 | pmaddubsw m3, m1, m6 | |
23202 | pmulhrsw m3, m7 | |
23203 | pslldq m4, 2 | |
23204 | pinsrw m4, [r4 + 20], 0 | |
23205 | pmaddubsw m5, m4, m6 | |
23206 | pmulhrsw m5, m7 | |
23207 | packuswb m3, m5 | |
23208 | movu [r0 + 969 * 16], m3 | |
23209 | ||
23210 | ; mode17 [row 5] | |
23211 | movu m6, [r5 + 4 * 16] | |
23212 | pmaddubsw m3, m0, m6 | |
23213 | pmulhrsw m3, m7 | |
23214 | pmaddubsw m5, m2, m6 | |
23215 | pmulhrsw m5, m7 | |
23216 | packuswb m3, m5 | |
23217 | movu [r0 + 970 * 16], m3 | |
23218 | ||
23219 | pmaddubsw m3, m1, m6 | |
23220 | pmulhrsw m3, m7 | |
23221 | pmaddubsw m5, m4, m6 | |
23222 | pmulhrsw m5, m7 | |
23223 | packuswb m3, m5 | |
23224 | movu [r0 + 971 * 16], m3 | |
23225 | ||
23226 | ; mode17 [row 6] | |
23227 | movu m6, [r5 + 10 * 16] | |
23228 | pslldq m0, 2 | |
23229 | pinsrb m0, [r3 + 5], 1 | |
23230 | pinsrb m0, [r3 + 6], 0 | |
23231 | pmaddubsw m3, m0, m6 | |
23232 | pmulhrsw m3, m7 | |
23233 | pslldq m2, 2 | |
23234 | pinsrw m2, [r4 + 3], 0 | |
23235 | pmaddubsw m5, m2, m6 | |
23236 | pmulhrsw m5, m7 | |
23237 | packuswb m3, m5 | |
23238 | movu [r0 + 972 * 16], m3 | |
23239 | ||
23240 | pslldq m1, 2 | |
23241 | pinsrw m1, [r4 + 11], 0 | |
23242 | pmaddubsw m3, m1, m6 | |
23243 | pmulhrsw m3, m7 | |
23244 | pslldq m4, 2 | |
23245 | pinsrw m4, [r4 + 19], 0 | |
23246 | pmaddubsw m5, m4, m6 | |
23247 | pmulhrsw m5, m7 | |
23248 | packuswb m3, m5 | |
23249 | movu [r0 + 973 * 16], m3 | |
23250 | ||
23251 | ; mode17 [row 7] | |
23252 | movu m6, [r5 + 16 * 16] | |
23253 | pslldq m0, 2 | |
23254 | pinsrb m0, [r3 + 6], 1 | |
23255 | pinsrb m0, [r3 + 7], 0 | |
23256 | pmaddubsw m3, m0, m6 | |
23257 | pmulhrsw m3, m7 | |
23258 | pslldq m2, 2 | |
23259 | pinsrw m2, [r4 + 2], 0 | |
23260 | pmaddubsw m5, m2, m6 | |
23261 | pmulhrsw m5, m7 | |
23262 | packuswb m3, m5 | |
23263 | movu [r0 + 974 * 16], m3 | |
23264 | ||
23265 | pslldq m1, 2 | |
23266 | pinsrw m1, [r4 + 10], 0 | |
23267 | pmaddubsw m3, m1, m6 | |
23268 | pmulhrsw m3, m7 | |
23269 | pslldq m4, 2 | |
23270 | pinsrw m4, [r4 + 18], 0 | |
23271 | pmaddubsw m5, m4, m6 | |
23272 | pmulhrsw m5, m7 | |
23273 | packuswb m3, m5 | |
23274 | movu [r0 + 975 * 16], m3 | |
23275 | ||
23276 | ; mode17 [row 8] | |
23277 | movu m6, [r5 + 22 * 16] | |
23278 | pslldq m0, 2 | |
23279 | pinsrb m0, [r3 + 7], 1 | |
23280 | pinsrb m0, [r3 + 9], 0 | |
23281 | pmaddubsw m3, m0, m6 | |
23282 | pmulhrsw m3, m7 | |
23283 | pslldq m2, 2 | |
23284 | pinsrw m2, [r4 + 1], 0 | |
23285 | pmaddubsw m5, m2, m6 | |
23286 | pmulhrsw m5, m7 | |
23287 | packuswb m3, m5 | |
23288 | movu [r0 + 976 * 16], m3 | |
23289 | ||
23290 | pslldq m1, 2 | |
23291 | pinsrw m1, [r4 + 9], 0 | |
23292 | pmaddubsw m3, m1, m6 | |
23293 | pmulhrsw m3, m7 | |
23294 | pslldq m4, 2 | |
23295 | pinsrw m4, [r4 + 17], 0 | |
23296 | pmaddubsw m5, m4, m6 | |
23297 | pmulhrsw m5, m7 | |
23298 | packuswb m3, m5 | |
23299 | movu [r0 + 977 * 16], m3 | |
23300 | ||
23301 | ; mode17 [row 9] | |
23302 | movu m6, [r5 + 28 * 16] | |
23303 | pslldq m0, 2 | |
23304 | pinsrb m0, [r3 + 9], 1 | |
23305 | pinsrb m0, [r3 + 10], 0 | |
23306 | pmaddubsw m3, m0, m6 | |
23307 | pmulhrsw m3, m7 | |
23308 | pslldq m2, 2 | |
23309 | pinsrw m2, [r4 + 0], 0 | |
23310 | pmaddubsw m5, m2, m6 | |
23311 | pmulhrsw m5, m7 | |
23312 | packuswb m3, m5 | |
23313 | movu [r0 + 978 * 16], m3 | |
23314 | ||
23315 | pslldq m1, 2 | |
23316 | pinsrw m1, [r4 + 8], 0 | |
23317 | pmaddubsw m3, m1, m6 | |
23318 | pmulhrsw m3, m7 | |
23319 | pslldq m4, 2 | |
23320 | pinsrw m4, [r4 + 16], 0 | |
23321 | pmaddubsw m5, m4, m6 | |
23322 | pmulhrsw m5, m7 | |
23323 | packuswb m3, m5 | |
23324 | movu [r0 + 979 * 16], m3 | |
23325 | ||
23326 | ; mode17 [row 10] | |
23327 | movu m6, [r5 + 2 * 16] | |
23328 | pmaddubsw m3, m0, m6 | |
23329 | pmulhrsw m3, m7 | |
23330 | pmaddubsw m5, m2, m6 | |
23331 | pmulhrsw m5, m7 | |
23332 | packuswb m3, m5 | |
23333 | movu [r0 + 980 * 16], m3 | |
23334 | ||
23335 | pmaddubsw m3, m1, m6 | |
23336 | pmulhrsw m3, m7 | |
23337 | pmaddubsw m5, m4, m6 | |
23338 | pmulhrsw m5, m7 | |
23339 | packuswb m3, m5 | |
23340 | movu [r0 + 981 * 16], m3 | |
23341 | ||
23342 | ; mode17 [row 11] | |
23343 | movu m6, [r5 + 8 * 16] | |
23344 | pslldq m0, 2 | |
23345 | pinsrb m0, [r3 + 10], 1 | |
23346 | pinsrb m0, [r3 + 11], 0 | |
23347 | pmaddubsw m3, m0, m6 | |
23348 | pmulhrsw m3, m7 | |
23349 | pslldq m2, 2 | |
23350 | pinsrb m2, [r4 + 0], 1 | |
23351 | pinsrb m2, [r3 + 1], 0 | |
23352 | pmaddubsw m5, m2, m6 | |
23353 | pmulhrsw m5, m7 | |
23354 | packuswb m3, m5 | |
23355 | movu [r0 + 982 * 16], m3 | |
23356 | ||
23357 | pslldq m1, 2 | |
23358 | pinsrw m1, [r4 + 7], 0 | |
23359 | pmaddubsw m3, m1, m6 | |
23360 | pmulhrsw m3, m7 | |
23361 | pslldq m4, 2 | |
23362 | pinsrw m4, [r4 + 15], 0 | |
23363 | pmaddubsw m5, m4, m6 | |
23364 | pmulhrsw m5, m7 | |
23365 | packuswb m3, m5 | |
23366 | movu [r0 + 983 * 16], m3 | |
23367 | ||
23368 | ; mode17 [row 12] | |
23369 | movu m6, [r5 + 14 * 16] | |
23370 | pslldq m0, 2 | |
23371 | pinsrb m0, [r3 + 11], 1 | |
23372 | pinsrb m0, [r3 + 12], 0 | |
23373 | pmaddubsw m3, m0, m6 | |
23374 | pmulhrsw m3, m7 | |
23375 | pslldq m2, 2 | |
23376 | pinsrb m2, [r3 + 1], 1 | |
23377 | pinsrb m2, [r3 + 2], 0 | |
23378 | pmaddubsw m5, m2, m6 | |
23379 | pmulhrsw m5, m7 | |
23380 | packuswb m3, m5 | |
23381 | movu [r0 + 984 * 16], m3 | |
23382 | ||
23383 | pslldq m1, 2 | |
23384 | pinsrw m1, [r4 + 6], 0 | |
23385 | pmaddubsw m3, m1, m6 | |
23386 | pmulhrsw m3, m7 | |
23387 | pslldq m4, 2 | |
23388 | pinsrw m4, [r4 + 14], 0 | |
23389 | pmaddubsw m5, m4, m6 | |
23390 | pmulhrsw m5, m7 | |
23391 | packuswb m3, m5 | |
23392 | movu [r0 + 985 * 16], m3 | |
23393 | ||
23394 | ; mode17 [row 13] | |
23395 | movu m6, [r5 + 20 * 16] | |
23396 | pslldq m0, 2 | |
23397 | pinsrb m0, [r3 + 12], 1 | |
23398 | pinsrb m0, [r3 + 14], 0 | |
23399 | pmaddubsw m3, m0, m6 | |
23400 | pmulhrsw m3, m7 | |
23401 | pslldq m2, 2 | |
23402 | pinsrb m2, [r3 + 2], 1 | |
23403 | pinsrb m2, [r3 + 4], 0 | |
23404 | pmaddubsw m5, m2, m6 | |
23405 | pmulhrsw m5, m7 | |
23406 | packuswb m3, m5 | |
23407 | movu [r0 + 986 * 16], m3 | |
23408 | ||
23409 | pslldq m1, 2 | |
23410 | pinsrw m1, [r4 + 5], 0 | |
23411 | pmaddubsw m3, m1, m6 | |
23412 | pmulhrsw m3, m7 | |
23413 | pslldq m4, 2 | |
23414 | pinsrw m4, [r4 + 13], 0 | |
23415 | pmaddubsw m5, m4, m6 | |
23416 | pmulhrsw m5, m7 | |
23417 | packuswb m3, m5 | |
23418 | movu [r0 + 987 * 16], m3 | |
23419 | ||
23420 | ; mode17 [row 14] | |
23421 | movu m6, [r5 + 26 * 16] | |
23422 | pslldq m0, 2 | |
23423 | pinsrb m0, [r3 + 14], 1 | |
23424 | pinsrb m0, [r3 + 15], 0 | |
23425 | pmaddubsw m3, m0, m6 | |
23426 | pmulhrsw m3, m7 | |
23427 | pslldq m2, 2 | |
23428 | pinsrb m2, [r3 + 4], 1 | |
23429 | pinsrb m2, [r3 + 5], 0 | |
23430 | pmaddubsw m5, m2, m6 | |
23431 | pmulhrsw m5, m7 | |
23432 | packuswb m3, m5 | |
23433 | movu [r0 + 988 * 16], m3 | |
23434 | ||
23435 | pslldq m1, 2 | |
23436 | pinsrw m1, [r4 + 4], 0 | |
23437 | pmaddubsw m3, m1, m6 | |
23438 | pmulhrsw m3, m7 | |
23439 | pslldq m4, 2 | |
23440 | pinsrw m4, [r4 + 12], 0 | |
23441 | pmaddubsw m5, m4, m6 | |
23442 | pmulhrsw m5, m7 | |
23443 | packuswb m3, m5 | |
23444 | movu [r0 + 989 * 16], m3 | |
23445 | ||
23446 | ; mode17 [row 15] | |
23447 | pshufb m5, m0, [tab_S2] | |
23448 | movh [r0 + 990 * 16], m5 | |
23449 | pshufb m5, m2, [tab_S2] | |
23450 | movh [r0 + 990 * 16 + 8], m5 | |
23451 | pshufb m5, m1, [tab_S2] | |
23452 | movh [r0 + 991 * 16], m5 | |
23453 | pshufb m5, m4, [tab_S2] | |
23454 | movh [r0 + 991 * 16 + 8], m5 | |
23455 | ||
23456 | ; mode17 [row 16] | |
23457 | movu m6, [r5 + 6 * 16] | |
23458 | pslldq m0, 2 | |
23459 | pinsrb m0, [r3 + 15], 1 | |
23460 | pinsrb m0, [r3 + 16], 0 | |
23461 | pmaddubsw m3, m0, m6 | |
23462 | pmulhrsw m3, m7 | |
23463 | pslldq m2, 2 | |
23464 | pinsrb m2, [r3 + 5], 1 | |
23465 | pinsrb m2, [r3 + 6], 0 | |
23466 | pmaddubsw m5, m2, m6 | |
23467 | pmulhrsw m5, m7 | |
23468 | packuswb m3, m5 | |
23469 | movu [r0 + 992 * 16], m3 | |
23470 | ||
23471 | pslldq m1, 2 | |
23472 | pinsrw m1, [r4 + 3], 0 | |
23473 | pmaddubsw m3, m1, m6 | |
23474 | pmulhrsw m3, m7 | |
23475 | pslldq m4, 2 | |
23476 | pinsrw m4, [r4 + 11], 0 | |
23477 | pmaddubsw m5, m4, m6 | |
23478 | pmulhrsw m5, m7 | |
23479 | packuswb m3, m5 | |
23480 | movu [r0 + 993 * 16], m3 | |
23481 | ||
23482 | ; mode17 [row 17] | |
23483 | movu m6, [r5 + 12 * 16] | |
23484 | pslldq m0, 2 | |
23485 | pinsrb m0, [r3 + 16], 1 | |
23486 | pinsrb m0, [r3 + 17], 0 | |
23487 | pmaddubsw m3, m0, m6 | |
23488 | pmulhrsw m3, m7 | |
23489 | pslldq m2, 2 | |
23490 | pinsrb m2, [r3 + 6], 1 | |
23491 | pinsrb m2, [r3 + 7], 0 | |
23492 | pmaddubsw m5, m2, m6 | |
23493 | pmulhrsw m5, m7 | |
23494 | packuswb m3, m5 | |
23495 | movu [r0 + 994 * 16], m3 | |
23496 | ||
23497 | pslldq m1, 2 | |
23498 | pinsrw m1, [r4 + 2], 0 | |
23499 | pmaddubsw m3, m1, m6 | |
23500 | pmulhrsw m3, m7 | |
23501 | pslldq m4, 2 | |
23502 | pinsrw m4, [r4 + 10], 0 | |
23503 | pmaddubsw m5, m4, m6 | |
23504 | pmulhrsw m5, m7 | |
23505 | packuswb m3, m5 | |
23506 | movu [r0 + 995 * 16], m3 | |
23507 | ||
23508 | ; mode17 [row 18] | |
23509 | movu m6, [r5 + 18 * 16] | |
23510 | pslldq m0, 2 | |
23511 | pinsrb m0, [r3 + 17], 1 | |
23512 | pinsrb m0, [r3 + 18], 0 | |
23513 | pmaddubsw m3, m0, m6 | |
23514 | pmulhrsw m3, m7 | |
23515 | pslldq m2, 2 | |
23516 | pinsrb m2, [r3 + 7], 1 | |
23517 | pinsrb m2, [r3 + 9], 0 | |
23518 | pmaddubsw m5, m2, m6 | |
23519 | pmulhrsw m5, m7 | |
23520 | packuswb m3, m5 | |
23521 | movu [r0 + 996 * 16], m3 | |
23522 | ||
23523 | pslldq m1, 2 | |
23524 | pinsrw m1, [r4 + 1], 0 | |
23525 | pmaddubsw m3, m1, m6 | |
23526 | pmulhrsw m3, m7 | |
23527 | pslldq m4, 2 | |
23528 | pinsrw m4, [r4 + 9], 0 | |
23529 | pmaddubsw m5, m4, m6 | |
23530 | pmulhrsw m5, m7 | |
23531 | packuswb m3, m5 | |
23532 | movu [r0 + 997 * 16], m3 | |
23533 | ||
23534 | ; mode17 [row 19] | |
23535 | movu m6, [r5 + 24 * 16] | |
23536 | pslldq m0, 2 | |
23537 | pinsrb m0, [r3 + 18], 1 | |
23538 | pinsrb m0, [r3 + 20], 0 | |
23539 | pmaddubsw m3, m0, m6 | |
23540 | pmulhrsw m3, m7 | |
23541 | pslldq m2, 2 | |
23542 | pinsrb m2, [r3 + 9], 1 | |
23543 | pinsrb m2, [r3 + 10], 0 | |
23544 | pmaddubsw m5, m2, m6 | |
23545 | pmulhrsw m5, m7 | |
23546 | packuswb m3, m5 | |
23547 | movu [r0 + 998 * 16], m3 | |
23548 | ||
23549 | pslldq m1, 2 | |
23550 | pinsrw m1, [r4 + 0], 0 | |
23551 | pmaddubsw m3, m1, m6 | |
23552 | pmulhrsw m3, m7 | |
23553 | pslldq m4, 2 | |
23554 | pinsrw m4, [r4 + 8], 0 | |
23555 | pmaddubsw m5, m4, m6 | |
23556 | pmulhrsw m5, m7 | |
23557 | packuswb m3, m5 | |
23558 | movu [r0 + 999 * 16], m3 | |
23559 | ||
23560 | ; mode17 [row 20] | |
23561 | movu m6, [r5 + 30 * 16] | |
23562 | pslldq m0, 2 | |
23563 | pinsrb m0, [r3 + 20], 1 | |
23564 | pinsrb m0, [r3 + 21], 0 | |
23565 | pmaddubsw m3, m0, m6 | |
23566 | pmulhrsw m3, m7 | |
23567 | pslldq m2, 2 | |
23568 | pinsrb m2, [r3 + 10], 1 | |
23569 | pinsrb m2, [r3 + 11], 0 | |
23570 | pmaddubsw m5, m2, m6 | |
23571 | pmulhrsw m5, m7 | |
23572 | packuswb m3, m5 | |
23573 | movu [r0 + 1000 * 16], m3 | |
23574 | ||
23575 | pslldq m1, 2 | |
23576 | pinsrb m1, [r4 + 0], 1 | |
23577 | pinsrb m1, [r3 + 1], 0 | |
23578 | pmaddubsw m3, m1, m6 | |
23579 | pmulhrsw m3, m7 | |
23580 | pslldq m4, 2 | |
23581 | ; byte-wise equivalent of the pinsrw below, kept for reference: | |
23582 | ;   pinsrb m4, [r4 + 8], 1  /  pinsrb m4, [r4 + 7], 0 | |
23583 | pinsrw m4, [r4 + 7], 0 | |
23584 | pmaddubsw m5, m4, m6 | |
23585 | pmulhrsw m5, m7 | |
23586 | packuswb m3, m5 | |
23587 | movu [r0 + 1001 * 16], m3 | |
23588 | ||
23589 | ; mode17 [row 21] | |
23590 | movu m6, [r5 + 4 * 16] | |
23591 | pmaddubsw m3, m0, m6 | |
23592 | pmulhrsw m3, m7 | |
23593 | pmaddubsw m5, m2, m6 | |
23594 | pmulhrsw m5, m7 | |
23595 | packuswb m3, m5 | |
23596 | movu [r0 + 1002 * 16], m3 | |
23597 | ||
23598 | pmaddubsw m3, m1, m6 | |
23599 | pmulhrsw m3, m7 | |
23600 | pmaddubsw m5, m4, m6 | |
23601 | pmulhrsw m5, m7 | |
23602 | packuswb m3, m5 | |
23603 | movu [r0 + 1003 * 16], m3 | |
23604 | ||
23605 | ; mode17 [row 22] | |
23606 | movu m6, [r5 + 10 * 16] | |
23607 | pslldq m0, 2 | |
23608 | pinsrb m0, [r3 + 21], 1 | |
23609 | pinsrb m0, [r3 + 22], 0 | |
23610 | pmaddubsw m3, m0, m6 | |
23611 | pmulhrsw m3, m7 | |
23612 | pslldq m2, 2 | |
23613 | pinsrb m2, [r3 + 11], 1 | |
23614 | pinsrb m2, [r3 + 12], 0 | |
23615 | pmaddubsw m5, m2, m6 | |
23616 | pmulhrsw m5, m7 | |
23617 | packuswb m3, m5 | |
23618 | movu [r0 + 1004 * 16], m3 | |
23619 | ||
23620 | pslldq m1, 2 | |
23621 | pinsrb m1, [r3 + 1], 1 | |
23622 | pinsrb m1, [r3 + 2], 0 | |
23623 | pmaddubsw m3, m1, m6 | |
23624 | pmulhrsw m3, m7 | |
23625 | pslldq m4, 2 | |
23626 | pinsrw m4, [r4 + 6], 0 | |
23627 | pmaddubsw m5, m4, m6 | |
23628 | pmulhrsw m5, m7 | |
23629 | packuswb m3, m5 | |
23630 | movu [r0 + 1005 * 16], m3 | |
23631 | ||
23632 | ; mode17 [row 23] | |
23633 | movu m6, [r5 + 16 * 16] | |
23634 | pslldq m0, 2 | |
23635 | pinsrb m0, [r3 + 22], 1 | |
23636 | pinsrb m0, [r3 + 23], 0 | |
23637 | pmaddubsw m3, m0, m6 | |
23638 | pmulhrsw m3, m7 | |
23639 | pslldq m2, 2 | |
23640 | pinsrb m2, [r3 + 12], 1 | |
23641 | pinsrb m2, [r3 + 14], 0 | |
23642 | pmaddubsw m5, m2, m6 | |
23643 | pmulhrsw m5, m7 | |
23644 | packuswb m3, m5 | |
23645 | movu [r0 + 1006 * 16], m3 | |
23646 | ||
23647 | pslldq m1, 2 | |
23648 | pinsrb m1, [r3 + 2], 1 | |
23649 | pinsrb m1, [r3 + 4], 0 | |
23650 | pmaddubsw m3, m1, m6 | |
23651 | pmulhrsw m3, m7 | |
23652 | pslldq m4, 2 | |
23653 | pinsrw m4, [r4 + 5], 0 | |
23654 | pmaddubsw m5, m4, m6 | |
23655 | pmulhrsw m5, m7 | |
23656 | packuswb m3, m5 | |
23657 | movu [r0 + 1007 * 16], m3 | |
23658 | ||
23659 | ; mode17 [row 24] | |
23660 | movu m6, [r5 + 22 * 16] | |
23661 | pslldq m0, 2 | |
23662 | pinsrb m0, [r3 + 23], 1 | |
23663 | pinsrb m0, [r3 + 25], 0 | |
23664 | pmaddubsw m3, m0, m6 | |
23665 | pmulhrsw m3, m7 | |
23666 | pslldq m2, 2 | |
23667 | pinsrb m2, [r3 + 14], 1 | |
23668 | pinsrb m2, [r3 + 15], 0 | |
23669 | pmaddubsw m5, m2, m6 | |
23670 | pmulhrsw m5, m7 | |
23671 | packuswb m3, m5 | |
23672 | movu [r0 + 1008 * 16], m3 | |
23673 | ||
23674 | pslldq m1, 2 | |
23675 | pinsrb m1, [r3 + 4], 1 | |
23676 | pinsrb m1, [r3 + 5], 0 | |
23677 | pmaddubsw m3, m1, m6 | |
23678 | pmulhrsw m3, m7 | |
23679 | pslldq m4, 2 | |
23680 | pinsrw m4, [r4 + 4], 0 | |
23681 | pmaddubsw m5, m4, m6 | |
23682 | pmulhrsw m5, m7 | |
23683 | packuswb m3, m5 | |
23684 | movu [r0 + 1009 * 16], m3 | |
23685 | ||
23686 | ; mode17 [row 25] | |
23687 | movu m6, [r5 + 28 * 16] | |
23688 | pslldq m0, 2 | |
23689 | pinsrb m0, [r3 + 25], 1 | |
23690 | pinsrb m0, [r3 + 26], 0 | |
23691 | pmaddubsw m3, m0, m6 | |
23692 | pmulhrsw m3, m7 | |
23693 | pslldq m2, 2 | |
23694 | pinsrb m2, [r3 + 15], 1 | |
23695 | pinsrb m2, [r3 + 16], 0 | |
23696 | pmaddubsw m5, m2, m6 | |
23697 | pmulhrsw m5, m7 | |
23698 | packuswb m3, m5 | |
23699 | movu [r0 + 1010 * 16], m3 | |
23700 | ||
23701 | pslldq m1, 2 | |
23702 | pinsrb m1, [r3 + 5], 1 | |
23703 | pinsrb m1, [r3 + 6], 0 | |
23704 | pmaddubsw m3, m1, m6 | |
23705 | pmulhrsw m3, m7 | |
23706 | pslldq m4, 2 | |
23707 | pinsrw m4, [r4 + 3], 0 | |
23708 | pmaddubsw m5, m4, m6 | |
23709 | pmulhrsw m5, m7 | |
23710 | packuswb m3, m5 | |
23711 | movu [r0 + 1011 * 16], m3 | |
23712 | ||
23713 | ; mode17 [row 26] | |
23714 | movu m6, [r5 + 2 * 16] | |
23715 | pmaddubsw m3, m0, m6 | |
23716 | pmulhrsw m3, m7 | |
23717 | pmaddubsw m5, m2, m6 | |
23718 | pmulhrsw m5, m7 | |
23719 | packuswb m3, m5 | |
23720 | movu [r0 + 1012 * 16], m3 | |
23721 | ||
23722 | pmaddubsw m3, m1, m6 | |
23723 | pmulhrsw m3, m7 | |
23724 | pmaddubsw m5, m4, m6 | |
23725 | pmulhrsw m5, m7 | |
23726 | packuswb m3, m5 | |
23727 | movu [r0 + 1013 * 16], m3 | |
23728 | ||
23729 | ; mode17 [row 27] | |
23730 | movu m6, [r5 + 8 * 16] | |
23731 | pslldq m0, 2 | |
23732 | pinsrb m0, [r3 + 26], 1 | |
23733 | pinsrb m0, [r3 + 27], 0 | |
23734 | pmaddubsw m3, m0, m6 | |
23735 | pmulhrsw m3, m7 | |
23736 | pslldq m2, 2 | |
23737 | pinsrb m2, [r3 + 16], 1 | |
23738 | pinsrb m2, [r3 + 17], 0 | |
23739 | pmaddubsw m5, m2, m6 | |
23740 | pmulhrsw m5, m7 | |
23741 | packuswb m3, m5 | |
23742 | movu [r0 + 1014 * 16], m3 | |
23743 | ||
23744 | pslldq m1, 2 | |
23745 | pinsrb m1, [r3 + 6], 1 | |
23746 | pinsrb m1, [r3 + 7], 0 | |
23747 | pmaddubsw m3, m1, m6 | |
23748 | pmulhrsw m3, m7 | |
23749 | pslldq m4, 2 | |
23750 | pinsrw m4, [r4 + 2], 0 | |
23751 | pmaddubsw m5, m4, m6 | |
23752 | pmulhrsw m5, m7 | |
23753 | packuswb m3, m5 | |
23754 | movu [r0 + 1015 * 16], m3 | |
23755 | ||
23756 | ; mode17 [row 28] | |
23757 | movu m6, [r5 + 14 * 16] | |
23758 | pslldq m0, 2 | |
23759 | pinsrb m0, [r3 + 27], 1 | |
23760 | pinsrb m0, [r3 + 28], 0 | |
23761 | pmaddubsw m3, m0, m6 | |
23762 | pmulhrsw m3, m7 | |
23763 | pslldq m2, 2 | |
23764 | pinsrb m2, [r3 + 17], 1 | |
23765 | pinsrb m2, [r3 + 18], 0 | |
23766 | pmaddubsw m5, m2, m6 | |
23767 | pmulhrsw m5, m7 | |
23768 | packuswb m3, m5 | |
23769 | movu [r0 + 1016 * 16], m3 | |
23770 | ||
23771 | pslldq m1, 2 | |
23772 | pinsrb m1, [r3 + 7], 1 | |
23773 | pinsrb m1, [r3 + 9], 0 | |
23774 | pmaddubsw m3, m1, m6 | |
23775 | pmulhrsw m3, m7 | |
23776 | pslldq m4, 2 | |
23777 | pinsrw m4, [r4 + 1], 0 | |
23778 | pmaddubsw m5, m4, m6 | |
23779 | pmulhrsw m5, m7 | |
23780 | packuswb m3, m5 | |
23781 | movu [r0 + 1017 * 16], m3 | |
23782 | ||
23783 | ; mode17 [row 29] | |
23784 | movu m6, [r5 + 20 * 16] | |
23785 | pslldq m0, 2 | |
23786 | pinsrb m0, [r3 + 28], 1 | |
23787 | pinsrb m0, [r3 + 30], 0 | |
23788 | pmaddubsw m3, m0, m6 | |
23789 | pmulhrsw m3, m7 | |
23790 | pslldq m2, 2 | |
23791 | pinsrb m2, [r3 + 18], 1 | |
23792 | pinsrb m2, [r3 + 20], 0 | |
23793 | pmaddubsw m5, m2, m6 | |
23794 | pmulhrsw m5, m7 | |
23795 | packuswb m3, m5 | |
23796 | movu [r0 + 1018 * 16], m3 | |
23797 | ||
23798 | pslldq m1, 2 | |
23799 | pinsrb m1, [r3 + 9], 1 | |
23800 | pinsrb m1, [r3 + 10], 0 | |
23801 | pmaddubsw m3, m1, m6 | |
23802 | pmulhrsw m3, m7 | |
23803 | pslldq m4, 2 | |
23804 | pinsrw m4, [r4 + 0], 0 | |
23805 | pmaddubsw m5, m4, m6 | |
23806 | pmulhrsw m5, m7 | |
23807 | packuswb m3, m5 | |
23808 | movu [r0 + 1019 * 16], m3 | |
23809 | ||
23810 | ; mode17 [row 30] | |
23811 | movu m6, [r5 + 26 * 16] | |
23812 | pslldq m0, 2 | |
23813 | pinsrb m0, [r3 + 30], 1 | |
23814 | pinsrb m0, [r3 + 31], 0 | |
23815 | pmaddubsw m3, m0, m6 | |
23816 | pmulhrsw m3, m7 | |
23817 | pslldq m2, 2 | |
23818 | pinsrb m2, [r3 + 20], 1 | |
23819 | pinsrb m2, [r3 + 21], 0 | |
23820 | pmaddubsw m5, m2, m6 | |
23821 | pmulhrsw m5, m7 | |
23822 | packuswb m3, m5 | |
23823 | movu [r0 + 1020 * 16], m3 | |
23824 | ||
23825 | pslldq m1, 2 | |
23826 | pinsrb m1, [r3 + 10], 1 | |
23827 | pinsrb m1, [r3 + 11], 0 | |
23828 | pmaddubsw m3, m1, m6 | |
23829 | pmulhrsw m3, m7 | |
23830 | pslldq m4, 2 | |
23831 | pinsrb m4, [r4 + 0], 1 | |
23832 | pinsrb m4, [r3 + 1], 0 | |
23833 | pmaddubsw m5, m4, m6 | |
23834 | pmulhrsw m5, m7 | |
23835 | packuswb m3, m5 | |
23836 | movu [r0 + 1021 * 16], m3 | |
23837 | ||
23838 | ; mode17 [row 31] | |
23839 | pshufb m5, m0, [tab_S2] | |
23840 | movh [r0 + 1022 * 16], m5 | |
23841 | pshufb m5, m2, [tab_S2] | |
23842 | movh [r0 + 1022 * 16 + 8], m5 | |
23843 | pshufb m5, m1, [tab_S2] | |
23844 | movh [r0 + 1023 * 16], m5 | |
23845 | pshufb m5, m4, [tab_S2] | |
23846 | movh [r0 + 1023 * 16 + 8], m5 | |
23847 | ||
23848 | ;mode 18[row 0] | |
23849 | movu m0, [r3] | |
23850 | movu [r0 + 1024 * 16], m0 | |
23851 | movu m1, [r3 + 16] | |
23852 | movu [r0 + 1025 * 16], m1 | |
23853 | ||
23854 | ;mode 18[row 1] | |
23855 | pslldq m0, 1 | |
23856 | pinsrb m0, [r4 + 1], 0 | |
23857 | movu [r0 + 1026 * 16], m0 | |
23858 | pslldq m1, 1 | |
23859 | pinsrb m1, [r3 + 15], 0 | |
23860 | movu [r0 + 1027 * 16], m1 | |
23861 | ||
23862 | ;mode 18[row 2] | |
23863 | pslldq m0, 1 | |
23864 | pinsrb m0, [r4 + 2], 0 | |
23865 | movu [r0 + 1028 * 16], m0 | |
23866 | pslldq m1, 1 | |
23867 | pinsrb m1, [r3 + 14], 0 | |
23868 | movu [r0 + 1029 * 16], m1 | |
23869 | ||
23870 | ;mode 18[row 3] | |
23871 | pslldq m0, 1 | |
23872 | pinsrb m0, [r4 + 3], 0 | |
23873 | movu [r0 + 1030 * 16], m0 | |
23874 | pslldq m1, 1 | |
23875 | pinsrb m1, [r3 + 13], 0 | |
23876 | movu [r0 + 1031 * 16], m1 | |
23877 | ||
23878 | ;mode 18[row 4] | |
23879 | pslldq m0, 1 | |
23880 | pinsrb m0, [r4 + 4], 0 | |
23881 | movu [r0 + 1032 * 16], m0 | |
23882 | pslldq m1, 1 | |
23883 | pinsrb m1, [r3 + 12], 0 | |
23884 | movu [r0 + 1033 * 16], m1 | |
23885 | ||
23886 | ;mode 18[row 5] | |
23887 | pslldq m0, 1 | |
23888 | pinsrb m0, [r4 + 5], 0 | |
23889 | movu [r0 + 1034 * 16], m0 | |
23890 | pslldq m1, 1 | |
23891 | pinsrb m1, [r3 + 11], 0 | |
23892 | movu [r0 + 1035 * 16], m1 | |
23893 | ||
23894 | ;mode 18[row 6] | |
23895 | pslldq m0, 1 | |
23896 | pinsrb m0, [r4 + 6], 0 | |
23897 | movu [r0 + 1036 * 16], m0 | |
23898 | pslldq m1, 1 | |
23899 | pinsrb m1, [r3 + 10], 0 | |
23900 | movu [r0 + 1037 * 16], m1 | |
23901 | ||
23902 | ;mode 18[row 7] | |
23903 | pslldq m0, 1 | |
23904 | pinsrb m0, [r4 + 7], 0 | |
23905 | movu [r0 + 1038 * 16], m0 | |
23906 | pslldq m1, 1 | |
23907 | pinsrb m1, [r3 + 9], 0 | |
23908 | movu [r0 + 1039 * 16], m1 | |
23909 | ||
23910 | ;mode 18[row 8] | |
23911 | pslldq m0, 1 | |
23912 | pinsrb m0, [r4 + 8], 0 | |
23913 | movu [r0 + 1040 * 16], m0 | |
23914 | pslldq m1, 1 | |
23915 | pinsrb m1, [r3 + 8], 0 | |
23916 | movu [r0 + 1041 * 16], m1 | |
23917 | ||
23918 | ;mode 18[row 9] | |
23919 | pslldq m0, 1 | |
23920 | pinsrb m0, [r4 + 9], 0 | |
23921 | movu [r0 + 1042 * 16], m0 | |
23922 | pslldq m1, 1 | |
23923 | pinsrb m1, [r3 + 7], 0 | |
23924 | movu [r0 + 1043 * 16], m1 | |
23925 | ||
23926 | ;mode 18[row 10] | |
23927 | pslldq m0, 1 | |
23928 | pinsrb m0, [r4 + 10], 0 | |
23929 | movu [r0 + 1044 * 16], m0 | |
23930 | pslldq m1, 1 | |
23931 | pinsrb m1, [r3 + 6], 0 | |
23932 | movu [r0 + 1045 * 16], m1 | |
23933 | ||
23934 | ;mode 18[row 11] | |
23935 | pslldq m0, 1 | |
23936 | pinsrb m0, [r4 + 11], 0 | |
23937 | movu [r0 + 1046 * 16], m0 | |
23938 | pslldq m1, 1 | |
23939 | pinsrb m1, [r3 + 5], 0 | |
23940 | movu [r0 + 1047 * 16], m1 | |
23941 | ||
23942 | ;mode 18[row 12] | |
23943 | pslldq m0, 1 | |
23944 | pinsrb m0, [r4 + 12], 0 | |
23945 | movu [r0 + 1048 * 16], m0 | |
23946 | pslldq m1, 1 | |
23947 | pinsrb m1, [r3 + 4], 0 | |
23948 | movu [r0 + 1049 * 16], m1 | |
23949 | ||
23950 | ;mode 18[row 13] | |
23951 | pslldq m0, 1 | |
23952 | pinsrb m0, [r4 + 13], 0 | |
23953 | movu [r0 + 1050 * 16], m0 | |
23954 | pslldq m1, 1 | |
23955 | pinsrb m1, [r3 + 3], 0 | |
23956 | movu [r0 + 1051 * 16], m1 | |
23957 | ||
23958 | ;mode 18[row 14] | |
23959 | pslldq m0, 1 | |
23960 | pinsrb m0, [r4 + 14], 0 | |
23961 | movu [r0 + 1052 * 16], m0 | |
23962 | pslldq m1, 1 | |
23963 | pinsrb m1, [r3 + 2], 0 | |
23964 | movu [r0 + 1053 * 16], m1 | |
23965 | ||
23966 | ;mode 18[row 15] | |
23967 | pslldq m0, 1 | |
23968 | pinsrb m0, [r4 + 15], 0 | |
23969 | movu [r0 + 1054 * 16], m0 | |
23970 | pslldq m1, 1 | |
23971 | pinsrb m1, [r3 + 1], 0 | |
23972 | movu [r0 + 1055 * 16], m1 | |
23973 | ||
23974 | ;mode 18[row 16] | |
23975 | pslldq m0, 1 | |
23976 | pinsrb m0, [r4 + 16], 0 | |
23977 | movu [r0 + 1056 * 16], m0 | |
23978 | pslldq m1, 1 | |
23979 | pinsrb m1, [r3 + 0], 0 | |
23980 | movu [r0 + 1057 * 16], m1 | |
23981 | ||
23982 | ;mode 18[row 17] | |
23983 | pslldq m0, 1 | |
23984 | pinsrb m0, [r4 + 17], 0 | |
23985 | movu [r0 + 1058 * 16], m0 | |
23986 | pslldq m1, 1 | |
23987 | pinsrb m1, [r4 + 1], 0 | |
23988 | movu [r0 + 1059 * 16], m1 | |
23989 | ||
23990 | ;mode 18[row 18] | |
23991 | pslldq m0, 1 | |
23992 | pinsrb m0, [r4 + 18], 0 | |
23993 | movu [r0 + 1060 * 16], m0 | |
23994 | pslldq m1, 1 | |
23995 | pinsrb m1, [r4 + 2], 0 | |
23996 | movu [r0 + 1061 * 16], m1 | |
23997 | ||
23998 | ;mode 18[row 19] | |
23999 | pslldq m0, 1 | |
24000 | pinsrb m0, [r4 + 19], 0 | |
24001 | movu [r0 + 1062 * 16], m0 | |
24002 | pslldq m1, 1 | |
24003 | pinsrb m1, [r4 + 3], 0 | |
24004 | movu [r0 + 1063 * 16], m1 | |
24005 | ||
24006 | ;mode 18[row 20] | |
24007 | pslldq m0, 1 | |
24008 | pinsrb m0, [r4 + 20], 0 | |
24009 | movu [r0 + 1064 * 16], m0 | |
24010 | pslldq m1, 1 | |
24011 | pinsrb m1, [r4 + 4], 0 | |
24012 | movu [r0 + 1065 * 16], m1 | |
24013 | ||
24014 | ;mode 18[row 21] | |
24015 | pslldq m0, 1 | |
24016 | pinsrb m0, [r4 + 21], 0 | |
24017 | movu [r0 + 1066 * 16], m0 | |
24018 | pslldq m1, 1 | |
24019 | pinsrb m1, [r4 + 5], 0 | |
24020 | movu [r0 + 1067 * 16], m1 | |
24021 | ||
24022 | ;mode 18[row 22] | |
24023 | pslldq m0, 1 | |
24024 | pinsrb m0, [r4 + 22], 0 | |
24025 | movu [r0 + 1068 * 16], m0 | |
24026 | pslldq m1, 1 | |
24027 | pinsrb m1, [r4 + 6], 0 | |
24028 | movu [r0 + 1069 * 16], m1 | |
24029 | ||
24030 | ;mode 18[row 23] | |
24031 | pslldq m0, 1 | |
24032 | pinsrb m0, [r4 + 23], 0 | |
24033 | movu [r0 + 1070 * 16], m0 | |
24034 | pslldq m1, 1 | |
24035 | pinsrb m1, [r4 + 7], 0 | |
24036 | movu [r0 + 1071 * 16], m1 | |
24037 | ||
24038 | ;mode 18[row 24] | |
24039 | pslldq m0, 1 | |
24040 | pinsrb m0, [r4 + 24], 0 | |
24041 | movu [r0 + 1072 * 16], m0 | |
24042 | pslldq m1, 1 | |
24043 | pinsrb m1, [r4 + 8], 0 | |
24044 | movu [r0 + 1073 * 16], m1 | |
24045 | ||
24046 | ;mode 18[row 25] | |
24047 | pslldq m0, 1 | |
24048 | pinsrb m0, [r4 + 25], 0 | |
24049 | movu [r0 + 1074 * 16], m0 | |
24050 | pslldq m1, 1 | |
24051 | pinsrb m1, [r4 + 9], 0 | |
24052 | movu [r0 + 1075 * 16], m1 | |
24053 | ||
24054 | ;mode 18[row 26] | |
24055 | pslldq m0, 1 | |
24056 | pinsrb m0, [r4 + 26], 0 | |
24057 | movu [r0 + 1076 * 16], m0 | |
24058 | pslldq m1, 1 | |
24059 | pinsrb m1, [r4 + 10], 0 | |
24060 | movu [r0 + 1077 * 16], m1 | |
24061 | ||
24062 | ;mode 18[row 27] | |
24063 | pslldq m0, 1 | |
24064 | pinsrb m0, [r4 + 27], 0 | |
24065 | movu [r0 + 1078 * 16], m0 | |
24066 | pslldq m1, 1 | |
24067 | pinsrb m1, [r4 + 11], 0 | |
24068 | movu [r0 + 1079 * 16], m1 | |
24069 | ||
24070 | ;mode 18[row 28] | |
24071 | pslldq m0, 1 | |
24072 | pinsrb m0, [r4 + 28], 0 | |
24073 | movu [r0 + 1080 * 16], m0 | |
24074 | pslldq m1, 1 | |
24075 | pinsrb m1, [r4 + 12], 0 | |
24076 | movu [r0 + 1081 * 16], m1 | |
24077 | ||
24078 | ;mode 18[row 29] | |
24079 | pslldq m0, 1 | |
24080 | pinsrb m0, [r4 + 29], 0 | |
24081 | movu [r0 + 1082 * 16], m0 | |
24082 | pslldq m1, 1 | |
24083 | pinsrb m1, [r4 + 13], 0 | |
24084 | movu [r0 + 1083 * 16], m1 | |
24085 | ||
24086 | ;mode 18[row 30] | |
24087 | pslldq m0, 1 | |
24088 | pinsrb m0, [r4 + 30], 0 | |
24089 | movu [r0 + 1084 * 16], m0 | |
24090 | pslldq m1, 1 | |
24091 | pinsrb m1, [r4 + 14], 0 | |
24092 | movu [r0 + 1085 * 16], m1 | |
24093 | ||
24094 | ;mode 18[row 31] | |
24095 | pslldq m0, 1 | |
24096 | pinsrb m0, [r4 + 31], 0 | |
24097 | movu [r0 + 1086 * 16], m0 | |
24098 | pslldq m1, 1 | |
24099 | pinsrb m1, [r4 + 15], 0 | |
24100 | movu [r0 + 1087 * 16], m1 | |
24101 | ||
24102 | ; mode 19 [row 0] | |
24103 | movu m6, [r5 + 6 * 16] | |
24104 | movu m0, [r3 ] | |
24105 | movu m1, [r3 + 1 ] | |
24106 | punpcklbw m0, m1 | |
24107 | pmaddubsw m1, m0, m6 | |
24108 | pmulhrsw m1, m7 | |
24109 | movu m2, [r3 + 8] | |
24110 | movu m3, [r3 + 9] | |
24111 | punpcklbw m2, m3 | |
24112 | pmaddubsw m3, m2, m6 | |
24113 | pmulhrsw m3, m7 | |
24114 | packuswb m1, m3 | |
24115 | movu [r0 + 1088 * 16], m1 | |
24116 | ||
24117 | movu m1, [r3 + 16] | |
24118 | movu m3, [r3 + 17] | |
24119 | punpcklbw m1, m3 | |
24120 | pmaddubsw m4, m1, m6 | |
24121 | pmulhrsw m4, m7 | |
24122 | movu m3, [r3 + 24] | |
24123 | movu m5, [r3 + 25] | |
24124 | punpcklbw m3, m5 | |
24125 | pmaddubsw m5, m3, m6 | |
24126 | pmulhrsw m5, m7 | |
24127 | packuswb m4, m5 | |
24128 | movu [r0 + 1089 * 16], m4 | |
24129 | ||
24130 | ; mode 19 [row 1] | |
24131 | movu m6, [r5 + 12 * 16] | |
24132 | pslldq m0, 2 | |
24133 | pinsrb m0, [r4 + 0], 1 | |
24134 | pinsrb m0, [r4 + 1], 0 | |
24135 | pmaddubsw m4, m0, m6 | |
24136 | pmulhrsw m4, m7 | |
24137 | pslldq m2, 2 | |
24138 | pinsrw m2, [r3 + 7], 0 | |
24139 | pmaddubsw m5, m2, m6 | |
24140 | pmulhrsw m5, m7 | |
24141 | packuswb m4, m5 | |
24142 | movu [r0 + 1090 * 16], m4 | |
24143 | pslldq m1, 2 | |
24144 | pinsrw m1, [r3 + 15], 0 | |
24145 | pmaddubsw m4, m1, m6 | |
24146 | pmulhrsw m4, m7 | |
24147 | pslldq m3, 2 | |
24148 | pinsrw m3, [r3 + 23], 0 | |
24149 | pmaddubsw m5, m3, m6 | |
24150 | pmulhrsw m5, m7 | |
24151 | packuswb m4, m5 | |
24152 | movu [r0 + 1091 * 16], m4 | |
24153 | ||
24154 | ; mode 19 [row 2] | |
24155 | movu m6, [r5 + 18 * 16] | |
24156 | pslldq m0, 2 | |
24157 | pinsrb m0, [r4 + 1], 1 | |
24158 | pinsrb m0, [r4 + 2], 0 | |
24159 | pmaddubsw m4, m0, m6 | |
24160 | pmulhrsw m4, m7 | |
24161 | pslldq m2, 2 | |
24162 | pinsrw m2, [r3 + 6], 0 | |
24163 | pmaddubsw m5, m2, m6 | |
24164 | pmulhrsw m5, m7 | |
24165 | packuswb m4, m5 | |
24166 | movu [r0 + 1092 * 16], m4 | |
24167 | pslldq m1, 2 | |
24168 | pinsrw m1, [r3 + 14], 0 | |
24169 | pmaddubsw m4, m1, m6 | |
24170 | pmulhrsw m4, m7 | |
24171 | pslldq m3, 2 | |
24172 | pinsrw m3, [r3 + 22], 0 | |
24173 | pmaddubsw m5, m3, m6 | |
24174 | pmulhrsw m5, m7 | |
24175 | packuswb m4, m5 | |
24176 | movu [r0 + 1093 * 16], m4 | |
24177 | ||
24178 | ; mode 19 [row 3] | |
24179 | movu m6, [r5 + 24 * 16] | |
24180 | pslldq m0, 2 | |
24181 | pinsrb m0, [r4 + 2], 1 | |
24182 | pinsrb m0, [r4 + 4], 0 | |
24183 | pmaddubsw m4, m0, m6 | |
24184 | pmulhrsw m4, m7 | |
24185 | pslldq m2, 2 | |
24186 | pinsrw m2, [r3 + 5], 0 | |
24187 | pmaddubsw m5, m2, m6 | |
24188 | pmulhrsw m5, m7 | |
24189 | packuswb m4, m5 | |
24190 | movu [r0 + 1094 * 16], m4 | |
24191 | pslldq m1, 2 | |
24192 | pinsrw m1, [r3 + 13], 0 | |
24193 | pmaddubsw m4, m1, m6 | |
24194 | pmulhrsw m4, m7 | |
24195 | pslldq m3, 2 | |
24196 | pinsrw m3, [r3 + 21], 0 | |
24197 | pmaddubsw m5, m3, m6 | |
24198 | pmulhrsw m5, m7 | |
24199 | packuswb m4, m5 | |
24200 | movu [r0 + 1095 * 16], m4 | |
24201 | ||
24202 | ; mode 19 [row 4] | |
24203 | movu m6, [r5 + 30 * 16] | |
24204 | pslldq m0, 2 | |
24205 | pinsrb m0, [r4 + 4], 1 | |
24206 | pinsrb m0, [r4 + 5], 0 | |
24207 | pmaddubsw m4, m0, m6 | |
24208 | pmulhrsw m4, m7 | |
24209 | pslldq m2, 2 | |
24210 | pinsrw m2, [r3 + 4], 0 | |
24211 | pmaddubsw m5, m2, m6 | |
24212 | pmulhrsw m5, m7 | |
24213 | packuswb m4, m5 | |
24214 | movu [r0 + 1096 * 16], m4 | |
24215 | pslldq m1, 2 | |
24216 | pinsrw m1, [r3 + 12], 0 | |
24217 | pmaddubsw m4, m1, m6 | |
24218 | pmulhrsw m4, m7 | |
24219 | pslldq m3, 2 | |
24220 | pinsrw m3, [r3 + 20], 0 | |
24221 | pmaddubsw m5, m3, m6 | |
24222 | pmulhrsw m5, m7 | |
24223 | packuswb m4, m5 | |
24224 | movu [r0 + 1097 * 16], m4 | |
24225 | ||
24226 | ; mode 19 [row 5] | |
24227 | movu m6, [r5 + 4 * 16] | |
24228 | pmaddubsw m4, m0, m6 | |
24229 | pmulhrsw m4, m7 | |
24230 | pmaddubsw m5, m2, m6 | |
24231 | pmulhrsw m5, m7 | |
24232 | packuswb m4, m5 | |
24233 | movu [r0 + 1098 * 16], m4 | |
24234 | pmaddubsw m4, m1, m6 | |
24235 | pmulhrsw m4, m7 | |
24236 | pmaddubsw m5, m3, m6 | |
24237 | pmulhrsw m5, m7 | |
24238 | packuswb m4, m5 | |
24239 | movu [r0 + 1099 * 16], m4 | |
24240 | ||
24241 | ; mode 19 [row 6] | |
24242 | movu m6, [r5 + 10 * 16] | |
24243 | pslldq m0, 2 | |
24244 | pinsrb m0, [r4 + 5], 1 | |
24245 | pinsrb m0, [r4 + 6], 0 | |
24246 | pmaddubsw m4, m0, m6 | |
24247 | pmulhrsw m4, m7 | |
24248 | pslldq m2, 2 | |
24249 | pinsrw m2, [r3 + 3], 0 | |
24250 | pmaddubsw m5, m2, m6 | |
24251 | pmulhrsw m5, m7 | |
24252 | packuswb m4, m5 | |
24253 | movu [r0 + 1100 * 16], m4 | |
24254 | pslldq m1, 2 | |
24255 | pinsrw m1, [r3 + 11], 0 | |
24256 | pmaddubsw m4, m1, m6 | |
24257 | pmulhrsw m4, m7 | |
24258 | pslldq m3, 2 | |
24259 | pinsrw m3, [r3 + 19], 0 | |
24260 | pmaddubsw m5, m3, m6 | |
24261 | pmulhrsw m5, m7 | |
24262 | packuswb m4, m5 | |
24263 | movu [r0 + 1101 * 16], m4 | |
24264 | ||
24265 | ; mode 19 [row 7] | |
24266 | movu m6, [r5 + 16 * 16] | |
24267 | pslldq m0, 2 | |
24268 | pinsrb m0, [r4 + 6], 1 | |
24269 | pinsrb m0, [r4 + 7], 0 | |
24270 | pmaddubsw m4, m0, m6 | |
24271 | pmulhrsw m4, m7 | |
24272 | pslldq m2, 2 | |
24273 | pinsrw m2, [r3 + 2], 0 | |
24274 | pmaddubsw m5, m2, m6 | |
24275 | pmulhrsw m5, m7 | |
24276 | packuswb m4, m5 | |
24277 | movu [r0 + 1102 * 16], m4 | |
24278 | pslldq m1, 2 | |
24279 | pinsrw m1, [r3 + 10], 0 | |
24280 | pmaddubsw m4, m1, m6 | |
24281 | pmulhrsw m4, m7 | |
24282 | pslldq m3, 2 | |
24283 | pinsrw m3, [r3 + 18], 0 | |
24284 | pmaddubsw m5, m3, m6 | |
24285 | pmulhrsw m5, m7 | |
24286 | packuswb m4, m5 | |
24287 | movu [r0 + 1103 * 16], m4 | |
24288 | ||
24289 | ; mode 19 [row 8] | |
24290 | movu m6, [r5 + 22 * 16] | |
24291 | pslldq m0, 2 | |
24292 | pinsrb m0, [r4 + 7], 1 | |
24293 | pinsrb m0, [r4 + 9], 0 | |
24294 | pmaddubsw m4, m0, m6 | |
24295 | pmulhrsw m4, m7 | |
24296 | pslldq m2, 2 | |
24297 | pinsrw m2, [r3 + 1], 0 | |
24298 | pmaddubsw m5, m2, m6 | |
24299 | pmulhrsw m5, m7 | |
24300 | packuswb m4, m5 | |
24301 | movu [r0 + 1104 * 16], m4 | |
24302 | pslldq m1, 2 | |
24303 | pinsrw m1, [r3 + 9], 0 | |
24304 | pmaddubsw m4, m1, m6 | |
24305 | pmulhrsw m4, m7 | |
24306 | pslldq m3, 2 | |
24307 | pinsrw m3, [r3 + 17], 0 | |
24308 | pmaddubsw m5, m3, m6 | |
24309 | pmulhrsw m5, m7 | |
24310 | packuswb m4, m5 | |
24311 | movu [r0 + 1105 * 16], m4 | |
24312 | ||
24313 | ; mode 19 [row 9] | |
24314 | movu m6, [r5 + 28 * 16] | |
24315 | pslldq m0, 2 | |
24316 | pinsrb m0, [r4 + 9], 1 | |
24317 | pinsrb m0, [r4 + 10], 0 | |
24318 | pmaddubsw m4, m0, m6 | |
24319 | pmulhrsw m4, m7 | |
24320 | pslldq m2, 2 | |
24321 | pinsrw m2, [r3 + 0], 0 | |
24322 | pmaddubsw m5, m2, m6 | |
24323 | pmulhrsw m5, m7 | |
24324 | packuswb m4, m5 | |
24325 | movu [r0 + 1106 * 16], m4 | |
24326 | pslldq m1, 2 | |
24327 | pinsrw m1, [r3 + 8], 0 | |
24328 | pmaddubsw m4, m1, m6 | |
24329 | pmulhrsw m4, m7 | |
24330 | pslldq m3, 2 | |
24331 | pinsrw m3, [r3 + 16], 0 | |
24332 | pmaddubsw m5, m3, m6 | |
24333 | pmulhrsw m5, m7 | |
24334 | packuswb m4, m5 | |
24335 | movu [r0 + 1107 * 16], m4 | |
24336 | ||
24337 | ; mode 19 [row 10] | |
24338 | movu m6, [r5 + 2 * 16] | |
24339 | pmaddubsw m4, m0, m6 | |
24340 | pmulhrsw m4, m7 | |
24341 | pmaddubsw m5, m2, m6 | |
24342 | pmulhrsw m5, m7 | |
24343 | packuswb m4, m5 | |
24344 | movu [r0 + 1108 * 16], m4 | |
24345 | pmaddubsw m4, m1, m6 | |
24346 | pmulhrsw m4, m7 | |
24347 | pmaddubsw m5, m3, m6 | |
24348 | pmulhrsw m5, m7 | |
24349 | packuswb m4, m5 | |
24350 | movu [r0 + 1109 * 16], m4 | |
24351 | ||
24352 | ; mode 19 [row 11] | |
24353 | movu m6, [r5 + 8 * 16] | |
24354 | pslldq m0, 2 | |
24355 | pinsrb m0, [r4 + 10], 1 | |
24356 | pinsrb m0, [r4 + 11], 0 | |
24357 | pmaddubsw m4, m0, m6 | |
24358 | pmulhrsw m4, m7 | |
24359 | pslldq m2, 2 | |
24360 | pinsrb m2, [r3 + 0], 1 | |
24361 | pinsrb m2, [r4 + 1], 0 | |
24362 | pmaddubsw m5, m2, m6 | |
24363 | pmulhrsw m5, m7 | |
24364 | packuswb m4, m5 | |
24365 | movu [r0 + 1110 * 16], m4 | |
24366 | pslldq m1, 2 | |
24367 | pinsrw m1, [r3 + 7], 0 | |
24368 | pmaddubsw m4, m1, m6 | |
24369 | pmulhrsw m4, m7 | |
24370 | pslldq m3, 2 | |
24371 | pinsrw m3, [r3 + 15], 0 | |
24372 | pmaddubsw m5, m3, m6 | |
24373 | pmulhrsw m5, m7 | |
24374 | packuswb m4, m5 | |
24375 | movu [r0 + 1111 * 16], m4 | |
24376 | ||
24377 | ; mode 19 [row 12] | |
24378 | movu m6, [r5 + 14 * 16] | |
24379 | pslldq m0, 2 | |
24380 | pinsrb m0, [r4 + 11], 1 | |
24381 | pinsrb m0, [r4 + 12], 0 | |
24382 | pmaddubsw m4, m0, m6 | |
24383 | pmulhrsw m4, m7 | |
24384 | pslldq m2, 2 | |
24385 | pinsrb m2, [r4 + 1], 1 | |
24386 | pinsrb m2, [r4 + 2], 0 | |
24387 | pmaddubsw m5, m2, m6 | |
24388 | pmulhrsw m5, m7 | |
24389 | packuswb m4, m5 | |
24390 | movu [r0 + 1112 * 16], m4 | |
24391 | pslldq m1, 2 | |
24392 | pinsrw m1, [r3 + 6], 0 | |
24393 | pmaddubsw m4, m1, m6 | |
24394 | pmulhrsw m4, m7 | |
24395 | pslldq m3, 2 | |
24396 | pinsrw m3, [r3 + 14], 0 | |
24397 | pmaddubsw m5, m3, m6 | |
24398 | pmulhrsw m5, m7 | |
24399 | packuswb m4, m5 | |
24400 | movu [r0 + 1113 * 16], m4 | |
24401 | ||
24402 | ; mode 19 [row 13] | |
24403 | movu m6, [r5 + 20 * 16] | |
24404 | pslldq m0, 2 | |
24405 | pinsrb m0, [r4 + 12], 1 | |
24406 | pinsrb m0, [r4 + 14], 0 | |
24407 | pmaddubsw m4, m0, m6 | |
24408 | pmulhrsw m4, m7 | |
24409 | pslldq m2, 2 | |
24410 | pinsrb m2, [r4 + 2], 1 | |
24411 | pinsrb m2, [r4 + 4], 0 | |
24412 | pmaddubsw m5, m2, m6 | |
24413 | pmulhrsw m5, m7 | |
24414 | packuswb m4, m5 | |
24415 | movu [r0 + 1114 * 16], m4 | |
24416 | pslldq m1, 2 | |
24417 | pinsrw m1, [r3 + 5], 0 | |
24418 | pmaddubsw m4, m1, m6 | |
24419 | pmulhrsw m4, m7 | |
24420 | pslldq m3, 2 | |
24421 | pinsrw m3, [r3 + 13], 0 | |
24422 | pmaddubsw m5, m3, m6 | |
24423 | pmulhrsw m5, m7 | |
24424 | packuswb m4, m5 | |
24425 | movu [r0 + 1115 * 16], m4 | |
24426 | ||
24427 | ; mode 19 [row 14] | |
24428 | movu m6, [r5 + 26 * 16] | |
24429 | pslldq m0, 2 | |
24430 | pinsrb m0, [r4 + 14], 1 | |
24431 | pinsrb m0, [r4 + 15], 0 | |
24432 | pmaddubsw m4, m0, m6 | |
24433 | pmulhrsw m4, m7 | |
24434 | pslldq m2, 2 | |
24435 | pinsrb m2, [r4 + 4], 1 | |
24436 | pinsrb m2, [r4 + 5], 0 | |
24437 | pmaddubsw m5, m2, m6 | |
24438 | pmulhrsw m5, m7 | |
24439 | packuswb m4, m5 | |
24440 | movu [r0 + 1116 * 16], m4 | |
24441 | pslldq m1, 2 | |
24442 | pinsrw m1, [r3 + 4], 0 | |
24443 | pmaddubsw m4, m1, m6 | |
24444 | pmulhrsw m4, m7 | |
24445 | pslldq m3, 2 | |
24446 | pinsrw m3, [r3 + 12], 0 | |
24447 | pmaddubsw m5, m3, m6 | |
24448 | pmulhrsw m5, m7 | |
24449 | packuswb m4, m5 | |
24450 | movu [r0 + 1117 * 16], m4 | |
24451 | ||
24452 | ; mode19 [row 15] | |
24453 | pshufb m5, m0, [tab_S2] | |
24454 | movh [r0 + 1118 * 16], m5 | |
24455 | pshufb m5, m2, [tab_S2] | |
24456 | movh [r0 + 1118 * 16 + 8], m5 | |
24457 | pshufb m5, m1, [tab_S2] | |
24458 | movh [r0 + 1119 * 16], m5 | |
24459 | pshufb m5, m3, [tab_S2] | |
24460 | movh [r0 + 1119 * 16 + 8], m5 | |
24461 | ||
24462 | ; mode 19 [row 16] | |
24463 | movu m6, [r5 + 6 * 16] | |
24464 | pslldq m0, 2 | |
24465 | pinsrb m0, [r4 + 15], 1 | |
24466 | pinsrb m0, [r4 + 16], 0 | |
24467 | pmaddubsw m4, m0, m6 | |
24468 | pmulhrsw m4, m7 | |
24469 | pslldq m2, 2 | |
24470 | pinsrb m2, [r4 + 5], 1 | |
24471 | pinsrb m2, [r4 + 6], 0 | |
24472 | pmaddubsw m5, m2, m6 | |
24473 | pmulhrsw m5, m7 | |
24474 | packuswb m4, m5 | |
24475 | movu [r0 + 1120 * 16], m4 | |
24476 | pslldq m1, 2 | |
24477 | pinsrw m1, [r3 + 3], 0 | |
24478 | pmaddubsw m4, m1, m6 | |
24479 | pmulhrsw m4, m7 | |
24480 | pslldq m3, 2 | |
24481 | pinsrw m3, [r3 + 11], 0 | |
24482 | pmaddubsw m5, m3, m6 | |
24483 | pmulhrsw m5, m7 | |
24484 | packuswb m4, m5 | |
24485 | movu [r0 + 1121 * 16], m4 | |
24486 | ||
24487 | ; mode 19 [row 17] | |
24488 | movu m6, [r5 + 12 * 16] | |
24489 | pslldq m0, 2 | |
24490 | pinsrb m0, [r4 + 16], 1 | |
24491 | pinsrb m0, [r4 + 17], 0 | |
24492 | pmaddubsw m4, m0, m6 | |
24493 | pmulhrsw m4, m7 | |
24494 | pslldq m2, 2 | |
24495 | pinsrb m2, [r4 + 6], 1 | |
24496 | pinsrb m2, [r4 + 7], 0 | |
24497 | pmaddubsw m5, m2, m6 | |
24498 | pmulhrsw m5, m7 | |
24499 | packuswb m4, m5 | |
24500 | movu [r0 + 1122 * 16], m4 | |
24501 | pslldq m1, 2 | |
24502 | pinsrw m1, [r3 + 2], 0 | |
24503 | pmaddubsw m4, m1, m6 | |
24504 | pmulhrsw m4, m7 | |
24505 | pslldq m3, 2 | |
24506 | pinsrw m3, [r3 + 10], 0 | |
24507 | pmaddubsw m5, m3, m6 | |
24508 | pmulhrsw m5, m7 | |
24509 | packuswb m4, m5 | |
24510 | movu [r0 + 1123 * 16], m4 | |
24511 | ||
24512 | ; mode 19 [row 18] | |
24513 | movu m6, [r5 + 18 * 16] | |
24514 | pslldq m0, 2 | |
24515 | pinsrb m0, [r4 + 17], 1 | |
24516 | pinsrb m0, [r4 + 18], 0 | |
24517 | pmaddubsw m4, m0, m6 | |
24518 | pmulhrsw m4, m7 | |
24519 | pslldq m2, 2 | |
24520 | pinsrb m2, [r4 + 7], 1 | |
24521 | pinsrb m2, [r4 + 9], 0 | |
24522 | pmaddubsw m5, m2, m6 | |
24523 | pmulhrsw m5, m7 | |
24524 | packuswb m4, m5 | |
24525 | movu [r0 + 1124 * 16], m4 | |
24526 | pslldq m1, 2 | |
24527 | pinsrw m1, [r3 + 1], 0 | |
24528 | pmaddubsw m4, m1, m6 | |
24529 | pmulhrsw m4, m7 | |
24530 | pslldq m3, 2 | |
24531 | pinsrw m3, [r3 + 9], 0 | |
24532 | pmaddubsw m5, m3, m6 | |
24533 | pmulhrsw m5, m7 | |
24534 | packuswb m4, m5 | |
24535 | movu [r0 + 1125 * 16], m4 | |
24536 | ||
24537 | ; mode 19 [row 19] | |
24538 | movu m6, [r5 + 24 * 16] | |
24539 | pslldq m0, 2 | |
24540 | pinsrb m0, [r4 + 18], 1 | |
24541 | pinsrb m0, [r4 + 20], 0 | |
24542 | pmaddubsw m4, m0, m6 | |
24543 | pmulhrsw m4, m7 | |
24544 | pslldq m2, 2 | |
24545 | pinsrb m2, [r4 + 9], 1 | |
24546 | pinsrb m2, [r4 + 10], 0 | |
24547 | pmaddubsw m5, m2, m6 | |
24548 | pmulhrsw m5, m7 | |
24549 | packuswb m4, m5 | |
24550 | movu [r0 + 1126 * 16], m4 | |
24551 | pslldq m1, 2 | |
24552 | pinsrw m1, [r3 + 0], 0 | |
24553 | pmaddubsw m4, m1, m6 | |
24554 | pmulhrsw m4, m7 | |
24555 | pslldq m3, 2 | |
24556 | pinsrw m3, [r3 + 8], 0 | |
24557 | pmaddubsw m5, m3, m6 | |
24558 | pmulhrsw m5, m7 | |
24559 | packuswb m4, m5 | |
24560 | movu [r0 + 1127 * 16], m4 | |
24561 | ||
24562 | ; mode 19 [row 20] | |
24563 | movu m6, [r5 + 30 * 16] | |
24564 | pslldq m0, 2 | |
24565 | pinsrb m0, [r4 + 20], 1 | |
24566 | pinsrb m0, [r4 + 21], 0 | |
24567 | pmaddubsw m4, m0, m6 | |
24568 | pmulhrsw m4, m7 | |
24569 | pslldq m2, 2 | |
24570 | pinsrb m2, [r4 + 10], 1 | |
24571 | pinsrb m2, [r4 + 11], 0 | |
24572 | pmaddubsw m5, m2, m6 | |
24573 | pmulhrsw m5, m7 | |
24574 | packuswb m4, m5 | |
24575 | movu [r0 + 1128 * 16], m4 | |
24576 | pslldq m1, 2 | |
24577 | pinsrb m1, [r4 + 0], 1 | |
24578 | pinsrb m1, [r4 + 1], 0 | |
24579 | pmaddubsw m4, m1, m6 | |
24580 | pmulhrsw m4, m7 | |
24581 | pslldq m3, 2 | |
24582 | pinsrb m3, [r3 + 8], 1 | |
24583 | pinsrb m3, [r3 + 7], 0 | |
24584 | pmaddubsw m5, m3, m6 | |
24585 | pmulhrsw m5, m7 | |
24586 | packuswb m4, m5 | |
24587 | movu [r0 + 1129 * 16], m4 | |
24588 | ||
24589 | ; mode 19 [row 21] | |
24590 | movu m6, [r5 + 4 * 16] | |
24591 | pmaddubsw m4, m0, m6 | |
24592 | pmulhrsw m4, m7 | |
24593 | pmaddubsw m5, m2, m6 | |
24594 | pmulhrsw m5, m7 | |
24595 | packuswb m4, m5 | |
24596 | movu [r0 + 1130 * 16], m4 | |
24597 | pmaddubsw m4, m1, m6 | |
24598 | pmulhrsw m4, m7 | |
24599 | pmaddubsw m5, m3, m6 | |
24600 | pmulhrsw m5, m7 | |
24601 | packuswb m4, m5 | |
24602 | movu [r0 + 1131 * 16], m4 | |
24603 | ||
24604 | ; mode 19 [row 22] | |
24605 | movu m6, [r5 + 10 * 16] | |
24606 | pslldq m0, 2 | |
24607 | pinsrb m0, [r4 + 21], 1 | |
24608 | pinsrb m0, [r4 + 22], 0 | |
24609 | pmaddubsw m4, m0, m6 | |
24610 | pmulhrsw m4, m7 | |
24611 | pslldq m2, 2 | |
24612 | pinsrb m2, [r4 + 11], 1 | |
24613 | pinsrb m2, [r4 + 12], 0 | |
24614 | pmaddubsw m5, m2, m6 | |
24615 | pmulhrsw m5, m7 | |
24616 | packuswb m4, m5 | |
24617 | movu [r0 + 1132 * 16], m4 | |
24618 | pslldq m1, 2 | |
24619 | pinsrb m1, [r4 + 1], 1 | |
24620 | pinsrb m1, [r4 + 2], 0 | |
24621 | pmaddubsw m4, m1, m6 | |
24622 | pmulhrsw m4, m7 | |
24623 | pslldq m3, 2 | |
24624 | pinsrw m3, [r3 + 6], 0 | |
24625 | pmaddubsw m5, m3, m6 | |
24626 | pmulhrsw m5, m7 | |
24627 | packuswb m4, m5 | |
24628 | movu [r0 + 1133 * 16], m4 | |
24629 | ||
24630 | ; mode 19 [row 23] | |
24631 | movu m6, [r5 + 16 * 16] | |
24632 | pslldq m0, 2 | |
24633 | pinsrb m0, [r4 + 22], 1 | |
24634 | pinsrb m0, [r4 + 23], 0 | |
24635 | pmaddubsw m4, m0, m6 | |
24636 | pmulhrsw m4, m7 | |
24637 | pslldq m2, 2 | |
24638 | pinsrb m2, [r4 + 12], 1 | |
24639 | pinsrb m2, [r4 + 14], 0 | |
24640 | pmaddubsw m5, m2, m6 | |
24641 | pmulhrsw m5, m7 | |
24642 | packuswb m4, m5 | |
24643 | movu [r0 + 1134 * 16], m4 | |
24644 | pslldq m1, 2 | |
24645 | pinsrb m1, [r4 + 2], 1 | |
24646 | pinsrb m1, [r4 + 4], 0 | |
24647 | pmaddubsw m4, m1, m6 | |
24648 | pmulhrsw m4, m7 | |
24649 | pslldq m3, 2 | |
24650 | pinsrw m3, [r3 + 5], 0 | |
24651 | pmaddubsw m5, m3, m6 | |
24652 | pmulhrsw m5, m7 | |
24653 | packuswb m4, m5 | |
24654 | movu [r0 + 1135 * 16], m4 | |
24655 | ||
24656 | ; mode 19 [row 24] | |
24657 | movu m6, [r5 + 22 * 16] | |
24658 | pslldq m0, 2 | |
24659 | pinsrb m0, [r4 + 23], 1 | |
24660 | pinsrb m0, [r4 + 25], 0 | |
24661 | pmaddubsw m4, m0, m6 | |
24662 | pmulhrsw m4, m7 | |
24663 | pslldq m2, 2 | |
24664 | pinsrb m2, [r4 + 14], 1 | |
24665 | pinsrb m2, [r4 + 15], 0 | |
24666 | pmaddubsw m5, m2, m6 | |
24667 | pmulhrsw m5, m7 | |
24668 | packuswb m4, m5 | |
24669 | movu [r0 + 1136 * 16], m4 | |
24670 | pslldq m1, 2 | |
24671 | pinsrb m1, [r4 + 4], 1 | |
24672 | pinsrb m1, [r4 + 5], 0 | |
24673 | pmaddubsw m4, m1, m6 | |
24674 | pmulhrsw m4, m7 | |
24675 | pslldq m3, 2 | |
24676 | pinsrw m3, [r3 + 4], 0 | |
24677 | pmaddubsw m5, m3, m6 | |
24678 | pmulhrsw m5, m7 | |
24679 | packuswb m4, m5 | |
24680 | movu [r0 + 1137 * 16], m4 | |
24681 | ||
24682 | ; mode 19 [row 25] | |
24683 | movu m6, [r5 + 28 * 16] | |
24684 | pslldq m0, 2 | |
24685 | pinsrb m0, [r4 + 25], 1 | |
24686 | pinsrb m0, [r4 + 26], 0 | |
24687 | pmaddubsw m4, m0, m6 | |
24688 | pmulhrsw m4, m7 | |
24689 | pslldq m2, 2 | |
24690 | pinsrb m2, [r4 + 15], 1 | |
24691 | pinsrb m2, [r4 + 16], 0 | |
24692 | pmaddubsw m5, m2, m6 | |
24693 | pmulhrsw m5, m7 | |
24694 | packuswb m4, m5 | |
24695 | movu [r0 + 1138 * 16], m4 | |
24696 | pslldq m1, 2 | |
24697 | pinsrb m1, [r4 + 5], 1 | |
24698 | pinsrb m1, [r4 + 6], 0 | |
24699 | pmaddubsw m4, m1, m6 | |
24700 | pmulhrsw m4, m7 | |
24701 | pslldq m3, 2 | |
24702 | pinsrw m3, [r3 + 3], 0 | |
24703 | pmaddubsw m5, m3, m6 | |
24704 | pmulhrsw m5, m7 | |
24705 | packuswb m4, m5 | |
24706 | movu [r0 + 1139 * 16], m4 | |
24707 | ||
24708 | ; mode 19 [row 26] | |
24709 | movu m6, [r5 + 2 * 16] | |
24710 | pmaddubsw m4, m0, m6 | |
24711 | pmulhrsw m4, m7 | |
24712 | pmaddubsw m5, m2, m6 | |
24713 | pmulhrsw m5, m7 | |
24714 | packuswb m4, m5 | |
24715 | movu [r0 + 1140 * 16], m4 | |
24716 | pmaddubsw m4, m1, m6 | |
24717 | pmulhrsw m4, m7 | |
24718 | pmaddubsw m5, m3, m6 | |
24719 | pmulhrsw m5, m7 | |
24720 | packuswb m4, m5 | |
24721 | movu [r0 + 1141 * 16], m4 | |
24722 | ||
24723 | ; mode 19 [row 27] | |
24724 | movu m6, [r5 + 8 * 16] | |
24725 | pslldq m0, 2 | |
24726 | pinsrb m0, [r4 + 26], 1 | |
24727 | pinsrb m0, [r4 + 27], 0 | |
24728 | pmaddubsw m4, m0, m6 | |
24729 | pmulhrsw m4, m7 | |
24730 | pslldq m2, 2 | |
24731 | pinsrb m2, [r4 + 16], 1 | |
24732 | pinsrb m2, [r4 + 17], 0 | |
24733 | pmaddubsw m5, m2, m6 | |
24734 | pmulhrsw m5, m7 | |
24735 | packuswb m4, m5 | |
24736 | movu [r0 + 1142 * 16], m4 | |
24737 | pslldq m1, 2 | |
24738 | pinsrb m1, [r4 + 6], 1 | |
24739 | pinsrb m1, [r4 + 7], 0 | |
24740 | pmaddubsw m4, m1, m6 | |
24741 | pmulhrsw m4, m7 | |
24742 | pslldq m3, 2 | |
24743 | pinsrw m3, [r3 + 2], 0 | |
24744 | pmaddubsw m5, m3, m6 | |
24745 | pmulhrsw m5, m7 | |
24746 | packuswb m4, m5 | |
24747 | movu [r0 + 1143 * 16], m4 | |
24748 | ||
24749 | ; mode 19 [row 28] | |
24750 | movu m6, [r5 + 14 * 16] | |
24751 | pslldq m0, 2 | |
24752 | pinsrb m0, [r4 + 27], 1 | |
24753 | pinsrb m0, [r4 + 28], 0 | |
24754 | pmaddubsw m4, m0, m6 | |
24755 | pmulhrsw m4, m7 | |
24756 | pslldq m2, 2 | |
24757 | pinsrb m2, [r4 + 17], 1 | |
24758 | pinsrb m2, [r4 + 18], 0 | |
24759 | pmaddubsw m5, m2, m6 | |
24760 | pmulhrsw m5, m7 | |
24761 | packuswb m4, m5 | |
24762 | movu [r0 + 1144 * 16], m4 | |
24763 | pslldq m1, 2 | |
24764 | pinsrb m1, [r4 + 7], 1 | |
24765 | pinsrb m1, [r4 + 9], 0 | |
24766 | pmaddubsw m4, m1, m6 | |
24767 | pmulhrsw m4, m7 | |
24768 | pslldq m3, 2 | |
24769 | pinsrw m3, [r3 + 1], 0 | |
24770 | pmaddubsw m5, m3, m6 | |
24771 | pmulhrsw m5, m7 | |
24772 | packuswb m4, m5 | |
24773 | movu [r0 + 1145 * 16], m4 | |
24774 | ||
24775 | ; mode 19 [row 29] | |
24776 | movu m6, [r5 + 20 * 16] | |
24777 | pslldq m0, 2 | |
24778 | pinsrb m0, [r4 + 28], 1 | |
24779 | pinsrb m0, [r4 + 30], 0 | |
24780 | pmaddubsw m4, m0, m6 | |
24781 | pmulhrsw m4, m7 | |
24782 | pslldq m2, 2 | |
24783 | pinsrb m2, [r4 + 18], 1 | |
24784 | pinsrb m2, [r4 + 20], 0 | |
24785 | pmaddubsw m5, m2, m6 | |
24786 | pmulhrsw m5, m7 | |
24787 | packuswb m4, m5 | |
24788 | movu [r0 + 1146 * 16], m4 | |
24789 | pslldq m1, 2 | |
24790 | pinsrb m1, [r4 + 9], 1 | |
24791 | pinsrb m1, [r4 + 10], 0 | |
24792 | pmaddubsw m4, m1, m6 | |
24793 | pmulhrsw m4, m7 | |
24794 | pslldq m3, 2 | |
24795 | pinsrw m3, [r3 + 0], 0 | |
24796 | pmaddubsw m5, m3, m6 | |
24797 | pmulhrsw m5, m7 | |
24798 | packuswb m4, m5 | |
24799 | movu [r0 + 1147 * 16], m4 | |
24800 | ||
24801 | ; mode 19 [row 30] | |
24802 | movu m6, [r5 + 26 * 16] | |
24803 | pslldq m0, 2 | |
24804 | pinsrb m0, [r4 + 30], 1 | |
24805 | pinsrb m0, [r4 + 31], 0 | |
24806 | pmaddubsw m4, m0, m6 | |
24807 | pmulhrsw m4, m7 | |
24808 | pslldq m2, 2 | |
24809 | pinsrb m2, [r4 + 20], 1 | |
24810 | pinsrb m2, [r4 + 21], 0 | |
24811 | pmaddubsw m5, m2, m6 | |
24812 | pmulhrsw m5, m7 | |
24813 | packuswb m4, m5 | |
24814 | movu [r0 + 1148 * 16], m4 | |
24815 | pslldq m1, 2 | |
24816 | pinsrb m1, [r4 + 10], 1 | |
24817 | pinsrb m1, [r4 + 11], 0 | |
24818 | pmaddubsw m4, m1, m6 | |
24819 | pmulhrsw m4, m7 | |
24820 | pslldq m3, 2 | |
24821 | pinsrb m3, [r4 + 0], 1 | |
24822 | pinsrb m3, [r4 + 1], 0 | |
24823 | pmaddubsw m5, m3, m6 | |
24824 | pmulhrsw m5, m7 | |
24825 | packuswb m4, m5 | |
24826 | movu [r0 + 1149 * 16], m4 | |
24827 | ||
24828 | ; mode 19 [row 31] | |
24829 | pshufb m5, m0, [tab_S2] | |
24830 | movh [r0 + 1150 * 16], m5 | |
24831 | pshufb m5, m2, [tab_S2] | |
24832 | movh [r0 + 1150 * 16 + 8], m5 | |
24833 | pshufb m5, m1, [tab_S2] | |
24834 | movh [r0 + 1151 * 16], m5 | |
24835 | pshufb m5, m3, [tab_S2] | |
24836 | movh [r0 + 1151 * 16 + 8], m5 | |
24837 | ||
24838 | ; mode 20 [row 0] | |
24839 | movu m6, [r5 + 11 * 16] | |
24840 | movu m0, [r3 ] | |
24841 | movu m1, [r3 + 1 ] | |
24842 | punpcklbw m0, m1 | |
24843 | pmaddubsw m1, m0, m6 | |
24844 | pmulhrsw m1, m7 | |
24845 | movu m2, [r3 + 8] | |
24846 | movu m3, [r3 + 9] | |
24847 | punpcklbw m2, m3 | |
24848 | pmaddubsw m3, m2, m6 | |
24849 | pmulhrsw m3, m7 | |
24850 | packuswb m1, m3 | |
24851 | movu [r0 + 1152 * 16], m1 | |
24852 | ||
24853 | movu m1, [r3 + 16] | |
24854 | movu m3, [r3 + 17] | |
24855 | punpcklbw m1, m3 | |
24856 | pmaddubsw m4, m1, m6 | |
24857 | pmulhrsw m4, m7 | |
24858 | movu m3, [r3 + 24] | |
24859 | movu m5, [r3 + 25] | |
24860 | punpcklbw m3, m5 | |
24861 | pmaddubsw m5, m3, m6 | |
24862 | pmulhrsw m5, m7 | |
24863 | packuswb m4, m5 | |
24864 | movu [r0 + 1153 * 16], m4 | |
24865 | ||
24866 | ; mode 20 [row 1] | |
24867 | movu m6, [r5 + 22 * 16] | |
24868 | pslldq m0, 2 | |
24869 | pinsrb m0, [r4 + 0], 1 | |
24870 | pinsrb m0, [r4 + 2], 0 | |
24871 | pmaddubsw m4, m0, m6 | |
24872 | pmulhrsw m4, m7 | |
24873 | pslldq m2, 2 | |
24874 | pinsrw m2, [r3 + 7], 0 | |
24875 | pmaddubsw m5, m2, m6 | |
24876 | pmulhrsw m5, m7 | |
24877 | packuswb m4, m5 | |
24878 | movu [r0 + 1154 * 16], m4 | |
24879 | pslldq m1, 2 | |
24880 | pinsrw m1, [r3 + 15], 0 | |
24881 | pmaddubsw m4, m1, m6 | |
24882 | pmulhrsw m4, m7 | |
24883 | pslldq m3, 2 | |
24884 | pinsrw m3, [r3 + 23], 0 | |
24885 | pmaddubsw m5, m3, m6 | |
24886 | pmulhrsw m5, m7 | |
24887 | packuswb m4, m5 | |
24888 | movu [r0 + 1155 * 16], m4 | |
24889 | ||
24890 | ; mode 20 [row 2] | |
24891 | movu m6, [r5 + 1 * 16] | |
24892 | pmaddubsw m4, m0, m6 | |
24893 | pmulhrsw m4, m7 | |
24894 | pmaddubsw m5, m2, m6 | |
24895 | pmulhrsw m5, m7 | |
24896 | packuswb m4, m5 | |
24897 | movu [r0 + 1156 * 16], m4 | |
24898 | pmaddubsw m4, m1, m6 | |
24899 | pmulhrsw m4, m7 | |
24900 | pmaddubsw m5, m3, m6 | |
24901 | pmulhrsw m5, m7 | |
24902 | packuswb m4, m5 | |
24903 | movu [r0 + 1157 * 16], m4 | |
24904 | ||
24905 | ; mode 20 [row 3] | |
24906 | movu m6, [r5 + 12 * 16] | |
24907 | pslldq m0, 2 | |
24908 | pinsrb m0, [r4 + 2], 1 | |
24909 | pinsrb m0, [r4 + 3], 0 | |
24910 | pmaddubsw m4, m0, m6 | |
24911 | pmulhrsw m4, m7 | |
24912 | pslldq m2, 2 | |
24913 | pinsrw m2, [r3 + 6], 0 | |
24914 | pmaddubsw m5, m2, m6 | |
24915 | pmulhrsw m5, m7 | |
24916 | packuswb m4, m5 | |
24917 | movu [r0 + 1158 * 16], m4 | |
24918 | pslldq m1, 2 | |
24919 | pinsrw m1, [r3 + 14], 0 | |
24920 | pmaddubsw m4, m1, m6 | |
24921 | pmulhrsw m4, m7 | |
24922 | pslldq m3, 2 | |
24923 | pinsrw m3, [r3 + 22], 0 | |
24924 | pmaddubsw m5, m3, m6 | |
24925 | pmulhrsw m5, m7 | |
24926 | packuswb m4, m5 | |
24927 | movu [r0 + 1159 * 16], m4 | |
24928 | ||
24929 | ; mode 20 [row 4] | |
24930 | movu m6, [r5 + 23 * 16] | |
24931 | pslldq m0, 2 | |
24932 | pinsrb m0, [r4 + 3], 1 | |
24933 | pinsrb m0, [r4 + 5], 0 | |
24934 | pmaddubsw m4, m0, m6 | |
24935 | pmulhrsw m4, m7 | |
24936 | pslldq m2, 2 | |
24937 | pinsrw m2, [r3 + 5], 0 | |
24938 | pmaddubsw m5, m2, m6 | |
24939 | pmulhrsw m5, m7 | |
24940 | packuswb m4, m5 | |
24941 | movu [r0 + 1160 * 16], m4 | |
24942 | pslldq m1, 2 | |
24943 | pinsrw m1, [r3 + 13], 0 | |
24944 | pmaddubsw m4, m1, m6 | |
24945 | pmulhrsw m4, m7 | |
24946 | pslldq m3, 2 | |
24947 | pinsrw m3, [r3 + 21], 0 | |
24948 | pmaddubsw m5, m3, m6 | |
24949 | pmulhrsw m5, m7 | |
24950 | packuswb m4, m5 | |
24951 | movu [r0 + 1161 * 16], m4 | |
24952 | ||
24953 | ; mode 20 [row 5] | |
24954 | movu m6, [r5 + 2 * 16] | |
24955 | pmaddubsw m4, m0, m6 | |
24956 | pmulhrsw m4, m7 | |
24957 | pmaddubsw m5, m2, m6 | |
24958 | pmulhrsw m5, m7 | |
24959 | packuswb m4, m5 | |
24960 | movu [r0 + 1162 * 16], m4 | |
24961 | pmaddubsw m4, m1, m6 | |
24962 | pmulhrsw m4, m7 | |
24963 | pmaddubsw m5, m3, m6 | |
24964 | pmulhrsw m5, m7 | |
24965 | packuswb m4, m5 | |
24966 | movu [r0 + 1163 * 16], m4 | |
24967 | ||
24968 | ; mode 20 [row 6] | |
24969 | movu m6, [r5 + 13 * 16] | |
24970 | pslldq m0, 2 | |
24971 | pinsrb m0, [r4 + 5], 1 | |
24972 | pinsrb m0, [r4 + 6], 0 | |
24973 | pmaddubsw m4, m0, m6 | |
24974 | pmulhrsw m4, m7 | |
24975 | pslldq m2, 2 | |
24976 | pinsrw m2, [r3 + 4], 0 | |
24977 | pmaddubsw m5, m2, m6 | |
24978 | pmulhrsw m5, m7 | |
24979 | packuswb m4, m5 | |
24980 | movu [r0 + 1164 * 16], m4 | |
24981 | pslldq m1, 2 | |
24982 | pinsrw m1, [r3 + 12], 0 | |
24983 | pmaddubsw m4, m1, m6 | |
24984 | pmulhrsw m4, m7 | |
24985 | pslldq m3, 2 | |
24986 | pinsrw m3, [r3 + 20], 0 | |
24987 | pmaddubsw m5, m3, m6 | |
24988 | pmulhrsw m5, m7 | |
24989 | packuswb m4, m5 | |
24990 | movu [r0 + 1165 * 16], m4 | |
24991 | ||
24992 | ; mode 20 [row 7] | |
24993 | movu m6, [r5 + 24 * 16] | |
24994 | pslldq m0, 2 | |
24995 | pinsrb m0, [r4 + 6], 1 | |
24996 | pinsrb m0, [r4 + 8], 0 | |
24997 | pmaddubsw m4, m0, m6 | |
24998 | pmulhrsw m4, m7 | |
24999 | pslldq m2, 2 | |
25000 | pinsrw m2, [r3 + 3], 0 | |
25001 | pmaddubsw m5, m2, m6 | |
25002 | pmulhrsw m5, m7 | |
25003 | packuswb m4, m5 | |
25004 | movu [r0 + 1166 * 16], m4 | |
25005 | pslldq m1, 2 | |
25006 | pinsrw m1, [r3 + 11], 0 | |
25007 | pmaddubsw m4, m1, m6 | |
25008 | pmulhrsw m4, m7 | |
25009 | pslldq m3, 2 | |
25010 | pinsrw m3, [r3 + 19], 0 | |
25011 | pmaddubsw m5, m3, m6 | |
25012 | pmulhrsw m5, m7 | |
25013 | packuswb m4, m5 | |
25014 | movu [r0 + 1167 * 16], m4 | |
25015 | ||
25016 | ; mode 20 [row 8] | |
25017 | movu m6, [r5 + 3 * 16] | |
25018 | pmaddubsw m4, m0, m6 | |
25019 | pmulhrsw m4, m7 | |
25020 | pmaddubsw m5, m2, m6 | |
25021 | pmulhrsw m5, m7 | |
25022 | packuswb m4, m5 | |
25023 | movu [r0 + 1168 * 16], m4 | |
25024 | pmaddubsw m4, m1, m6 | |
25025 | pmulhrsw m4, m7 | |
25026 | pmaddubsw m5, m3, m6 | |
25027 | pmulhrsw m5, m7 | |
25028 | packuswb m4, m5 | |
25029 | movu [r0 + 1169 * 16], m4 | |
25030 | ||
25031 | ; mode 20 [row 9] | |
25032 | movu m6, [r5 + 14 * 16] | |
25033 | pslldq m0, 2 | |
25034 | pinsrb m0, [r4 + 8], 1 | |
25035 | pinsrb m0, [r4 + 9], 0 | |
25036 | pmaddubsw m4, m0, m6 | |
25037 | pmulhrsw m4, m7 | |
25038 | pslldq m2, 2 | |
25039 | pinsrb m2, [r3 + 3], 1 | |
25040 | pinsrb m2, [r3 + 2], 0 | |
25041 | pmaddubsw m5, m2, m6 | |
25042 | pmulhrsw m5, m7 | |
25043 | packuswb m4, m5 | |
25044 | movu [r0 + 1170 * 16], m4 | |
25045 | pslldq m1, 2 | |
25046 | pinsrw m1, [r3 + 10], 0 | |
25047 | pmaddubsw m4, m1, m6 | |
25048 | pmulhrsw m4, m7 | |
25049 | pslldq m3, 2 | |
25050 | pinsrw m3, [r3 + 18], 0 | |
25051 | pmaddubsw m5, m3, m6 | |
25052 | pmulhrsw m5, m7 | |
25053 | packuswb m4, m5 | |
25054 | movu [r0 + 1171 * 16], m4 | |
25055 | ||
25056 | ; mode 20 [row 10] | |
25057 | movu m6, [r5 + 25 * 16] | |
25058 | pslldq m0, 2 | |
25059 | pinsrb m0, [r4 + 9], 1 | |
25060 | pinsrb m0, [r4 + 11], 0 | |
25061 | pmaddubsw m4, m0, m6 | |
25062 | pmulhrsw m4, m7 | |
25063 | pslldq m2, 2 | |
25064 | pinsrw m2, [r3 + 1], 0 | |
25065 | pmaddubsw m5, m2, m6 | |
25066 | pmulhrsw m5, m7 | |
25067 | packuswb m4, m5 | |
25068 | movu [r0 + 1172 * 16], m4 | |
25069 | pslldq m1, 2 | |
25070 | pinsrw m1, [r3 + 9], 0 | |
25071 | pmaddubsw m4, m1, m6 | |
25072 | pmulhrsw m4, m7 | |
25073 | pslldq m3, 2 | |
25074 | pinsrw m3, [r3 + 17], 0 | |
25075 | pmaddubsw m5, m3, m6 | |
25076 | pmulhrsw m5, m7 | |
25077 | packuswb m4, m5 | |
25078 | movu [r0 + 1173 * 16], m4 | |
25079 | ||
25080 | ; mode 20 [row 11] | |
25081 | movu m6, [r5 + 4 * 16] | |
25082 | pmaddubsw m4, m0, m6 | |
25083 | pmulhrsw m4, m7 | |
25084 | pmaddubsw m5, m2, m6 | |
25085 | pmulhrsw m5, m7 | |
25086 | packuswb m4, m5 | |
25087 | movu [r0 + 1174 * 16], m4 | |
25088 | pmaddubsw m4, m1, m6 | |
25089 | pmulhrsw m4, m7 | |
25090 | pmaddubsw m5, m3, m6 | |
25091 | pmulhrsw m5, m7 | |
25092 | packuswb m4, m5 | |
25093 | movu [r0 + 1175 * 16], m4 | |
25094 | ||
25095 | ; mode 20 [row 12] | |
25096 | movu m6, [r5 + 15 * 16] | |
25097 | pslldq m0, 2 | |
25098 | pinsrb m0, [r4 + 11], 1 | |
25099 | pinsrb m0, [r4 + 12], 0 | |
25100 | pmaddubsw m4, m0, m6 | |
25101 | pmulhrsw m4, m7 | |
25102 | pslldq m2, 2 | |
25103 | pinsrb m2, [r3 + 1], 1 | |
25104 | pinsrb m2, [r3 + 0], 0 | |
25105 | pmaddubsw m5, m2, m6 | |
25106 | pmulhrsw m5, m7 | |
25107 | packuswb m4, m5 | |
25108 | movu [r0 + 1176 * 16], m4 | |
25109 | pslldq m1, 2 | |
25110 | pinsrw m1, [r3 + 8], 0 | |
25111 | pmaddubsw m4, m1, m6 | |
25112 | pmulhrsw m4, m7 | |
25113 | pslldq m3, 2 | |
25114 | pinsrw m3, [r3 + 16], 0 | |
25115 | pmaddubsw m5, m3, m6 | |
25116 | pmulhrsw m5, m7 | |
25117 | packuswb m4, m5 | |
25118 | movu [r0 + 1177 * 16], m4 | |
25119 | ||
25120 | ; mode 20 [row 13] | |
25121 | movu m6, [r5 + 26 * 16] | |
25122 | pslldq m0, 2 | |
25123 | pinsrb m0, [r4 + 12], 1 | |
25124 | pinsrb m0, [r4 + 14], 0 | |
25125 | pmaddubsw m4, m0, m6 | |
25126 | pmulhrsw m4, m7 | |
25127 | pslldq m2, 2 | |
25128 | pinsrb m2, [r4 + 0], 1 | |
25129 | pinsrb m2, [r4 + 2], 0 | |
25130 | pmaddubsw m5, m2, m6 | |
25131 | pmulhrsw m5, m7 | |
25132 | packuswb m4, m5 | |
25133 | movu [r0 + 1178 * 16], m4 | |
25134 | pslldq m1, 2 | |
25135 | pinsrw m1, [r3 + 7], 0 | |
25136 | pmaddubsw m4, m1, m6 | |
25137 | pmulhrsw m4, m7 | |
25138 | pslldq m3, 2 | |
25139 | pinsrw m3, [r3 + 15], 0 | |
25140 | pmaddubsw m5, m3, m6 | |
25141 | pmulhrsw m5, m7 | |
25142 | packuswb m4, m5 | |
25143 | movu [r0 + 1179 * 16], m4 | |
25144 | ||
25145 | ; mode 20 [row 14] | |
25146 | movu m6, [r5 + 5 * 16] | |
25147 | pmaddubsw m4, m0, m6 | |
25148 | pmulhrsw m4, m7 | |
25149 | pmaddubsw m5, m2, m6 | |
25150 | pmulhrsw m5, m7 | |
25151 | packuswb m4, m5 | |
25152 | movu [r0 + 1180 * 16], m4 | |
25153 | pmaddubsw m4, m1, m6 | |
25154 | pmulhrsw m4, m7 | |
25155 | pmaddubsw m5, m3, m6 | |
25156 | pmulhrsw m5, m7 | |
25157 | packuswb m4, m5 | |
25158 | movu [r0 + 1181 * 16], m4 | |
25159 | ||
25160 | ; mode 20 [row 15] | |
25161 | movu m6, [r5 + 16 * 16] | |
25162 | pslldq m0, 2 | |
25163 | pinsrb m0, [r4 + 14], 1 | |
25164 | pinsrb m0, [r4 + 15], 0 | |
25165 | pmaddubsw m4, m0, m6 | |
25166 | pmulhrsw m4, m7 | |
25167 | pslldq m2, 2 | |
25168 | pinsrb m2, [r4 + 2], 1 | |
25169 | pinsrb m2, [r4 + 3], 0 | |
25170 | pmaddubsw m5, m2, m6 | |
25171 | pmulhrsw m5, m7 | |
25172 | packuswb m4, m5 | |
25173 | movu [r0 + 1182 * 16], m4 | |
25174 | pslldq m1, 2 | |
25175 | pinsrw m1, [r3 + 6], 0 | |
25176 | pmaddubsw m4, m1, m6 | |
25177 | pmulhrsw m4, m7 | |
25178 | pslldq m3, 2 | |
25179 | pinsrw m3, [r3 + 14], 0 | |
25180 | pmaddubsw m5, m3, m6 | |
25181 | pmulhrsw m5, m7 | |
25182 | packuswb m4, m5 | |
25183 | movu [r0 + 1183 * 16], m4 | |
25184 | ||
25185 | ; mode 20 [row 16] | |
25186 | movu m6, [r5 + 27 * 16] | |
25187 | pslldq m0, 2 | |
25188 | pinsrb m0, [r4 + 15], 1 | |
25189 | pinsrb m0, [r4 + 17], 0 | |
25190 | pmaddubsw m4, m0, m6 | |
25191 | pmulhrsw m4, m7 | |
25192 | pslldq m2, 2 | |
25193 | pinsrb m2, [r4 + 3], 1 | |
25194 | pinsrb m2, [r4 + 5], 0 | |
25195 | pmaddubsw m5, m2, m6 | |
25196 | pmulhrsw m5, m7 | |
25197 | packuswb m4, m5 | |
25198 | movu [r0 + 1184 * 16], m4 | |
25199 | pslldq m1, 2 | |
25200 | pinsrw m1, [r3 + 5], 0 | |
25201 | pmaddubsw m4, m1, m6 | |
25202 | pmulhrsw m4, m7 | |
25203 | pslldq m3, 2 | |
25204 | pinsrw m3, [r3 + 13], 0 | |
25205 | pmaddubsw m5, m3, m6 | |
25206 | pmulhrsw m5, m7 | |
25207 | packuswb m4, m5 | |
25208 | movu [r0 + 1185 * 16], m4 | |
25209 | ||
25210 | ; mode 20 [row 17] | |
25211 | movu m6, [r5 + 6 * 16] | |
25212 | pmaddubsw m4, m0, m6 | |
25213 | pmulhrsw m4, m7 | |
25214 | pmaddubsw m5, m2, m6 | |
25215 | pmulhrsw m5, m7 | |
25216 | packuswb m4, m5 | |
25217 | movu [r0 + 1186 * 16], m4 | |
25218 | pmaddubsw m4, m1, m6 | |
25219 | pmulhrsw m4, m7 | |
25220 | pmaddubsw m5, m3, m6 | |
25221 | pmulhrsw m5, m7 | |
25222 | packuswb m4, m5 | |
25223 | movu [r0 + 1187 * 16], m4 | |
25224 | ||
25225 | ; mode 20 [row 18] | |
25226 | movu m6, [r5 + 17 * 16] | |
25227 | pslldq m0, 2 | |
25228 | pinsrb m0, [r4 + 17], 1 | |
25229 | pinsrb m0, [r4 + 18], 0 | |
25230 | pmaddubsw m4, m0, m6 | |
25231 | pmulhrsw m4, m7 | |
25232 | pslldq m2, 2 | |
25233 | pinsrb m2, [r4 + 5], 1 | |
25234 | pinsrb m2, [r4 + 6], 0 | |
25235 | pmaddubsw m5, m2, m6 | |
25236 | pmulhrsw m5, m7 | |
25237 | packuswb m4, m5 | |
25238 | movu [r0 + 1188 * 16], m4 | |
25239 | pslldq m1, 2 | |
25240 | pinsrw m1, [r3 + 4], 0 | |
25241 | pmaddubsw m4, m1, m6 | |
25242 | pmulhrsw m4, m7 | |
25243 | pslldq m3, 2 | |
25244 | pinsrw m3, [r3 + 12], 0 | |
25245 | pmaddubsw m5, m3, m6 | |
25246 | pmulhrsw m5, m7 | |
25247 | packuswb m4, m5 | |
25248 | movu [r0 + 1189 * 16], m4 | |
25249 | ||
25250 | ; mode 20 [row 19] | |
25251 | movu m6, [r5 + 28 * 16] | |
25252 | pslldq m0, 2 | |
25253 | pinsrb m0, [r4 + 18], 1 | |
25254 | pinsrb m0, [r4 + 20], 0 | |
25255 | pmaddubsw m4, m0, m6 | |
25256 | pmulhrsw m4, m7 | |
25257 | pslldq m2, 2 | |
25258 | pinsrb m2, [r4 + 6], 1 | |
25259 | pinsrb m2, [r4 + 8], 0 | |
25260 | pmaddubsw m5, m2, m6 | |
25261 | pmulhrsw m5, m7 | |
25262 | packuswb m4, m5 | |
25263 | movu [r0 + 1190 * 16], m4 | |
25264 | pslldq m1, 2 | |
25265 | pinsrw m1, [r3 + 3], 0 | |
25266 | pmaddubsw m4, m1, m6 | |
25267 | pmulhrsw m4, m7 | |
25268 | pslldq m3, 2 | |
25269 | pinsrw m3, [r3 + 11], 0 | |
25270 | pmaddubsw m5, m3, m6 | |
25271 | pmulhrsw m5, m7 | |
25272 | packuswb m4, m5 | |
25273 | movu [r0 + 1191 * 16], m4 | |
25274 | ||
25275 | ; mode 20 [row 20] | |
25276 | movu m6, [r5 + 7 * 16] | |
25277 | pmaddubsw m4, m0, m6 | |
25278 | pmulhrsw m4, m7 | |
25279 | pmaddubsw m5, m2, m6 | |
25280 | pmulhrsw m5, m7 | |
25281 | packuswb m4, m5 | |
25282 | movu [r0 + 1192 * 16], m4 | |
25283 | pmaddubsw m4, m1, m6 | |
25284 | pmulhrsw m4, m7 | |
25285 | pmaddubsw m5, m3, m6 | |
25286 | pmulhrsw m5, m7 | |
25287 | packuswb m4, m5 | |
25288 | movu [r0 + 1193 * 16], m4 | |
25289 | ||
25290 | ; mode 20 [row 21] | |
25291 | movu m6, [r5 + 18 * 16] | |
25292 | pslldq m0, 2 | |
25293 | pinsrb m0, [r4 + 20], 1 | |
25294 | pinsrb m0, [r4 + 21], 0 | |
25295 | pmaddubsw m4, m0, m6 | |
25296 | pmulhrsw m4, m7 | |
25297 | pslldq m2, 2 | |
25298 | pinsrb m2, [r4 + 8], 1 | |
25299 | pinsrb m2, [r4 + 9], 0 | |
25300 | pmaddubsw m5, m2, m6 | |
25301 | pmulhrsw m5, m7 | |
25302 | packuswb m4, m5 | |
25303 | movu [r0 + 1194 * 16], m4 | |
25304 | pslldq m1, 2 | |
25305 | pinsrw m1, [r3 + 2], 0 | |
25306 | pmaddubsw m4, m1, m6 | |
25307 | pmulhrsw m4, m7 | |
25308 | pslldq m3, 2 | |
25309 | pinsrw m3, [r3 + 10], 0 | |
25310 | pmaddubsw m5, m3, m6 | |
25311 | pmulhrsw m5, m7 | |
25312 | packuswb m4, m5 | |
25313 | movu [r0 + 1195 * 16], m4 | |
25314 | ||
25315 | ; mode 20 [row 22] | |
25316 | movu m6, [r5 + 29 * 16] | |
25317 | pslldq m0, 2 | |
25318 | pinsrb m0, [r4 + 21], 1 | |
25319 | pinsrb m0, [r4 + 23], 0 | |
25320 | pmaddubsw m4, m0, m6 | |
25321 | pmulhrsw m4, m7 | |
25322 | pslldq m2, 2 | |
25323 | pinsrb m2, [r4 + 9], 1 | |
25324 | pinsrb m2, [r4 + 11], 0 | |
25325 | pmaddubsw m5, m2, m6 | |
25326 | pmulhrsw m5, m7 | |
25327 | packuswb m4, m5 | |
25328 | movu [r0 + 1196 * 16], m4 | |
25329 | pslldq m1, 2 | |
25330 | pinsrw m1, [r3 + 1], 0 | |
25331 | pmaddubsw m4, m1, m6 | |
25332 | pmulhrsw m4, m7 | |
25333 | pslldq m3, 2 | |
25334 | pinsrw m3, [r3 + 9], 0 | |
25335 | pmaddubsw m5, m3, m6 | |
25336 | pmulhrsw m5, m7 | |
25337 | packuswb m4, m5 | |
25338 | movu [r0 + 1197 * 16], m4 | |
25339 | ||
25340 | ; mode 20 [row 23] | |
25341 | movu m6, [r5 + 8 * 16] | |
25342 | pmaddubsw m4, m0, m6 | |
25343 | pmulhrsw m4, m7 | |
25344 | pmaddubsw m5, m2, m6 | |
25345 | pmulhrsw m5, m7 | |
25346 | packuswb m4, m5 | |
25347 | movu [r0 + 1198 * 16], m4 | |
25348 | pmaddubsw m4, m1, m6 | |
25349 | pmulhrsw m4, m7 | |
25350 | pmaddubsw m5, m3, m6 | |
25351 | pmulhrsw m5, m7 | |
25352 | packuswb m4, m5 | |
25353 | movu [r0 + 1199 * 16], m4 | |
25354 | ||
25355 | ; mode 20 [row 24] | |
25356 | movu m6, [r5 + 19 * 16] | |
25357 | pslldq m0, 2 | |
25358 | pinsrb m0, [r4 + 23], 1 | |
25359 | pinsrb m0, [r4 + 24], 0 | |
25360 | pmaddubsw m4, m0, m6 | |
25361 | pmulhrsw m4, m7 | |
25362 | pslldq m2, 2 | |
25363 | pinsrb m2, [r4 + 11], 1 | |
25364 | pinsrb m2, [r4 + 12], 0 | |
25365 | pmaddubsw m5, m2, m6 | |
25366 | pmulhrsw m5, m7 | |
25367 | packuswb m4, m5 | |
25368 | movu [r0 + 1200 * 16], m4 | |
25369 | pslldq m1, 2 | |
25370 | pinsrw m1, [r3 + 0], 0 | |
25371 | pmaddubsw m4, m1, m6 | |
25372 | pmulhrsw m4, m7 | |
25373 | pslldq m3, 2 | |
25374 | pinsrw m3, [r3 + 8], 0 | |
25375 | pmaddubsw m5, m3, m6 | |
25376 | pmulhrsw m5, m7 | |
25377 | packuswb m4, m5 | |
25378 | movu [r0 + 1201 * 16], m4 | |
25379 | ||
25380 | ; mode 20 [row 25] | |
25381 | movu m6, [r5 + 30 * 16] | |
25382 | pslldq m0, 2 | |
25383 | pinsrb m0, [r4 + 24], 1 | |
25384 | pinsrb m0, [r4 + 26], 0 | |
25385 | pmaddubsw m4, m0, m6 | |
25386 | pmulhrsw m4, m7 | |
25387 | pslldq m2, 2 | |
25388 | pinsrb m2, [r4 + 12], 1 | |
25389 | pinsrb m2, [r4 + 14], 0 | |
25390 | pmaddubsw m5, m2, m6 | |
25391 | pmulhrsw m5, m7 | |
25392 | packuswb m4, m5 | |
25393 | movu [r0 + 1202 * 16], m4 | |
25394 | pslldq m1, 2 | |
25395 | pinsrb m1, [r4 + 0], 1 | |
25396 | pinsrb m1, [r4 + 2], 0 | |
25397 | pmaddubsw m4, m1, m6 | |
25398 | pmulhrsw m4, m7 | |
25399 | pslldq m3, 2 | |
25400 | pinsrw m3, [r3 + 7], 0 | |
25401 | pmaddubsw m5, m3, m6 | |
25402 | pmulhrsw m5, m7 | |
25403 | packuswb m4, m5 | |
25404 | movu [r0 + 1203 * 16], m4 | |
25405 | ||
25406 | ; mode 20 [row 26] | |
25407 | movu m6, [r5 + 9 * 16] | |
25408 | pmaddubsw m4, m0, m6 | |
25409 | pmulhrsw m4, m7 | |
25410 | pmaddubsw m5, m2, m6 | |
25411 | pmulhrsw m5, m7 | |
25412 | packuswb m4, m5 | |
25413 | movu [r0 + 1204 * 16], m4 | |
25414 | pmaddubsw m4, m1, m6 | |
25415 | pmulhrsw m4, m7 | |
25416 | pmaddubsw m5, m3, m6 | |
25417 | pmulhrsw m5, m7 | |
25418 | packuswb m4, m5 | |
25419 | movu [r0 + 1205 * 16], m4 | |
25420 | ||
25421 | ; mode 20 [row 27] | |
25422 | movu m6, [r5 + 20 * 16] | |
25423 | pslldq m0, 2 | |
25424 | pinsrb m0, [r4 + 26], 1 | |
25425 | pinsrb m0, [r4 + 27], 0 | |
25426 | pmaddubsw m4, m0, m6 | |
25427 | pmulhrsw m4, m7 | |
25428 | pslldq m2, 2 | |
25429 | pinsrb m2, [r4 + 14], 1 | |
25430 | pinsrb m2, [r4 + 15], 0 | |
25431 | pmaddubsw m5, m2, m6 | |
25432 | pmulhrsw m5, m7 | |
25433 | packuswb m4, m5 | |
25434 | movu [r0 + 1206 * 16], m4 | |
25435 | pslldq m1, 2 | |
25436 | pinsrb m1, [r4 + 2], 1 | |
25437 | pinsrb m1, [r4 + 3], 0 | |
25438 | pmaddubsw m4, m1, m6 | |
25439 | pmulhrsw m4, m7 | |
25440 | pslldq m3, 2 | |
25441 | pinsrw m3, [r3 + 6], 0 | |
25442 | pmaddubsw m5, m3, m6 | |
25443 | pmulhrsw m5, m7 | |
25444 | packuswb m4, m5 | |
25445 | movu [r0 + 1207 * 16], m4 | |
25446 | ||
25447 | ; mode 20 [row 28] | |
25448 | movu m6, [r5 + 31 * 16] | |
25449 | pslldq m0, 2 | |
25450 | pinsrb m0, [r4 + 27], 1 | |
25451 | pinsrb m0, [r4 + 29], 0 | |
25452 | pmaddubsw m4, m0, m6 | |
25453 | pmulhrsw m4, m7 | |
25454 | pslldq m2, 2 | |
25455 | pinsrb m2, [r4 + 15], 1 | |
25456 | pinsrb m2, [r4 + 17], 0 | |
25457 | pmaddubsw m5, m2, m6 | |
25458 | pmulhrsw m5, m7 | |
25459 | packuswb m4, m5 | |
25460 | movu [r0 + 1208 * 16], m4 | |
25461 | pslldq m1, 2 | |
25462 | pinsrb m1, [r4 + 3], 1 | |
25463 | pinsrb m1, [r4 + 5], 0 | |
25464 | pmaddubsw m4, m1, m6 | |
25465 | pmulhrsw m4, m7 | |
25466 | pslldq m3, 2 | |
25467 | pinsrw m3, [r3 + 5], 0 | |
25468 | pmaddubsw m5, m3, m6 | |
25469 | pmulhrsw m5, m7 | |
25470 | packuswb m4, m5 | |
25471 | movu [r0 + 1209 * 16], m4 | |
25472 | ||
25473 | ; mode 20 [row 29] | |
25474 | movu m6, [r5 + 10 * 16] | |
25475 | pmaddubsw m4, m0, m6 | |
25476 | pmulhrsw m4, m7 | |
25477 | pmaddubsw m5, m2, m6 | |
25478 | pmulhrsw m5, m7 | |
25479 | packuswb m4, m5 | |
25480 | movu [r0 + 1210 * 16], m4 | |
25481 | pmaddubsw m4, m1, m6 | |
25482 | pmulhrsw m4, m7 | |
25483 | pmaddubsw m5, m3, m6 | |
25484 | pmulhrsw m5, m7 | |
25485 | packuswb m4, m5 | |
25486 | movu [r0 + 1211 * 16], m4 | |
25487 | ||
25488 | ; mode 20 [row 30] | |
25489 | movu m6, [r5 + 21 * 16] | |
25490 | pslldq m0, 2 | |
25491 | pinsrb m0, [r4 + 29], 1 | |
25492 | pinsrb m0, [r4 + 30], 0 | |
25493 | pmaddubsw m4, m0, m6 | |
25494 | pmulhrsw m4, m7 | |
25495 | pslldq m2, 2 | |
25496 | pinsrb m2, [r4 + 17], 1 | |
25497 | pinsrb m2, [r4 + 18], 0 | |
25498 | pmaddubsw m5, m2, m6 | |
25499 | pmulhrsw m5, m7 | |
25500 | packuswb m4, m5 | |
25501 | movu [r0 + 1212 * 16], m4 | |
25502 | pslldq m1, 2 | |
25503 | pinsrb m1, [r4 + 5], 1 | |
25504 | pinsrb m1, [r4 + 6], 0 | |
25505 | pmaddubsw m4, m1, m6 | |
25506 | pmulhrsw m4, m7 | |
25507 | pslldq m3, 2 | |
25508 | pinsrw m3, [r3 + 4], 0 | |
25509 | pmaddubsw m5, m3, m6 | |
25510 | pmulhrsw m5, m7 | |
25511 | packuswb m4, m5 | |
25512 | movu [r0 + 1213 * 16], m4 | |
25513 | ||
25514 | ; mode 20 [row 31] | |
25515 | pshufb m5, m0, [tab_S2] | |
25516 | movh [r0 + 1214 * 16], m5 | |
25517 | pshufb m5, m2, [tab_S2] | |
25518 | movh [r0 + 1214 * 16 + 8], m5 | |
25519 | pshufb m5, m1, [tab_S2] | |
25520 | movh [r0 + 1215 * 16], m5 | |
25521 | pshufb m5, m3, [tab_S2] | |
25522 | movh [r0 + 1215 * 16 + 8], m5 | |
25523 | ||
25524 | ; mode 21 [row 0] | |
25525 | movu m6, [r5 + 15 * 16] | |
25526 | movu m0, [r3 ] | |
25527 | movu m1, [r3 + 1 ] | |
25528 | punpcklbw m0, m1 | |
25529 | pmaddubsw m1, m0, m6 | |
25530 | pmulhrsw m1, m7 | |
25531 | movu m2, [r3 + 8] | |
25532 | movu m3, [r3 + 9] | |
25533 | punpcklbw m2, m3 | |
25534 | pmaddubsw m3, m2, m6 | |
25535 | pmulhrsw m3, m7 | |
25536 | packuswb m1, m3 | |
25537 | movu [r0 + 1216 * 16], m1 | |
25538 | ||
25539 | movu m1, [r3 + 16] | |
25540 | movu m3, [r3 + 17] | |
25541 | punpcklbw m1, m3 | |
25542 | pmaddubsw m4, m1, m6 | |
25543 | pmulhrsw m4, m7 | |
25544 | movu m3, [r3 + 24] | |
25545 | movu m5, [r3 + 25] | |
25546 | punpcklbw m3, m5 | |
25547 | pmaddubsw m5, m3, m6 | |
25548 | pmulhrsw m5, m7 | |
25549 | packuswb m4, m5 | |
25550 | movu [r0 + 1217 * 16], m4 | |
25551 | ||
25552 | ; mode 21 [row 1] | |
25553 | movu m6, [r5 + 30 * 16] | |
25554 | pslldq m0, 2 | |
25555 | pinsrb m0, [r4 + 0], 1 | |
25556 | pinsrb m0, [r4 + 2], 0 | |
25557 | pmaddubsw m4, m0, m6 | |
25558 | pmulhrsw m4, m7 | |
25559 | pslldq m2, 2 | |
25560 | pinsrw m2, [r3 + 7], 0 | |
25561 | pmaddubsw m5, m2, m6 | |
25562 | pmulhrsw m5, m7 | |
25563 | packuswb m4, m5 | |
25564 | movu [r0 + 1218 * 16], m4 | |
25565 | pslldq m1, 2 | |
25566 | pinsrw m1, [r3 + 15], 0 | |
25567 | pmaddubsw m4, m1, m6 | |
25568 | pmulhrsw m4, m7 | |
25569 | pslldq m3, 2 | |
25570 | pinsrw m3, [r3 + 23], 0 | |
25571 | pmaddubsw m5, m3, m6 | |
25572 | pmulhrsw m5, m7 | |
25573 | packuswb m4, m5 | |
25574 | movu [r0 + 1219 * 16], m4 | |
25575 | ||
25576 | ; mode 21 [row 2] | |
25577 | movu m6, [r5 + 13 * 16] | |
25578 | pmaddubsw m4, m0, m6 | |
25579 | pmulhrsw m4, m7 | |
25580 | pmaddubsw m5, m2, m6 | |
25581 | pmulhrsw m5, m7 | |
25582 | packuswb m4, m5 | |
25583 | movu [r0 + 1220 * 16], m4 | |
25584 | pmaddubsw m4, m1, m6 | |
25585 | pmulhrsw m4, m7 | |
25586 | pmaddubsw m5, m3, m6 | |
25587 | pmulhrsw m5, m7 | |
25588 | packuswb m4, m5 | |
25589 | movu [r0 + 1221 * 16], m4 | |
25590 | ||
25591 | ; mode 21 [row 3] | |
25592 | movu m6, [r5 + 28 * 16] | |
25593 | pslldq m0, 2 | |
25594 | pinsrb m0, [r4 + 2], 1 | |
25595 | pinsrb m0, [r4 + 4], 0 | |
25596 | pmaddubsw m4, m0, m6 | |
25597 | pmulhrsw m4, m7 | |
25598 | pslldq m2, 2 | |
25599 | pinsrw m2, [r3 + 6], 0 | |
25600 | pmaddubsw m5, m2, m6 | |
25601 | pmulhrsw m5, m7 | |
25602 | packuswb m4, m5 | |
25603 | movu [r0 + 1222 * 16], m4 | |
25604 | pslldq m1, 2 | |
25605 | pinsrw m1, [r3 + 14], 0 | |
25606 | pmaddubsw m4, m1, m6 | |
25607 | pmulhrsw m4, m7 | |
25608 | pslldq m3, 2 | |
25609 | pinsrw m3, [r3 + 22], 0 | |
25610 | pmaddubsw m5, m3, m6 | |
25611 | pmulhrsw m5, m7 | |
25612 | packuswb m4, m5 | |
25613 | movu [r0 + 1223 * 16], m4 | |
25614 | ||
25615 | ; mode 21 [row 4] | |
25616 | movu m6, [r5 + 11 * 16] | |
25617 | pmaddubsw m4, m0, m6 | |
25618 | pmulhrsw m4, m7 | |
25619 | pmaddubsw m5, m2, m6 | |
25620 | pmulhrsw m5, m7 | |
25621 | packuswb m4, m5 | |
25622 | movu [r0 + 1224 * 16], m4 | |
25623 | pmaddubsw m4, m1, m6 | |
25624 | pmulhrsw m4, m7 | |
25625 | pmaddubsw m5, m3, m6 | |
25626 | pmulhrsw m5, m7 | |
25627 | packuswb m4, m5 | |
25628 | movu [r0 + 1225 * 16], m4 | |
25629 | ||
25630 | ; mode 21 [row 5] | |
25631 | movu m6, [r5 + 26 * 16] | |
25632 | pslldq m0, 2 | |
25633 | pinsrb m0, [r4 + 4], 1 | |
25634 | pinsrb m0, [r4 + 6], 0 | |
25635 | pmaddubsw m4, m0, m6 | |
25636 | pmulhrsw m4, m7 | |
25637 | pslldq m2, 2 | |
25638 | pinsrw m2, [r3 + 5], 0 | |
25639 | pmaddubsw m5, m2, m6 | |
25640 | pmulhrsw m5, m7 | |
25641 | packuswb m4, m5 | |
25642 | movu [r0 + 1226 * 16], m4 | |
25643 | pslldq m1, 2 | |
25644 | pinsrw m1, [r3 + 13], 0 | |
25645 | pmaddubsw m4, m1, m6 | |
25646 | pmulhrsw m4, m7 | |
25647 | pslldq m3, 2 | |
25648 | pinsrw m3, [r3 + 21], 0 | |
25649 | pmaddubsw m5, m3, m6 | |
25650 | pmulhrsw m5, m7 | |
25651 | packuswb m4, m5 | |
25652 | movu [r0 + 1227 * 16], m4 | |
25653 | ||
25654 | ; mode 21 [row 6] | |
25655 | movu m6, [r5 + 9 * 16] | |
25656 | pmaddubsw m4, m0, m6 | |
25657 | pmulhrsw m4, m7 | |
25658 | pmaddubsw m5, m2, m6 | |
25659 | pmulhrsw m5, m7 | |
25660 | packuswb m4, m5 | |
25661 | movu [r0 + 1228 * 16], m4 | |
25662 | pmaddubsw m4, m1, m6 | |
25663 | pmulhrsw m4, m7 | |
25664 | pmaddubsw m5, m3, m6 | |
25665 | pmulhrsw m5, m7 | |
25666 | packuswb m4, m5 | |
25667 | movu [r0 + 1229 * 16], m4 | |
25668 | ||
25669 | ; mode 21 [row 7] | |
25670 | movu m6, [r5 + 24 * 16] | |
25671 | pslldq m0, 2 | |
25672 | pinsrb m0, [r4 + 6], 1 | |
25673 | pinsrb m0, [r4 + 8], 0 | |
25674 | pmaddubsw m4, m0, m6 | |
25675 | pmulhrsw m4, m7 | |
25676 | pslldq m2, 2 | |
25677 | pinsrw m2, [r3 + 4], 0 | |
25678 | pmaddubsw m5, m2, m6 | |
25679 | pmulhrsw m5, m7 | |
25680 | packuswb m4, m5 | |
25681 | movu [r0 + 1230 * 16], m4 | |
25682 | pslldq m1, 2 | |
25683 | pinsrw m1, [r3 + 12], 0 | |
25684 | pmaddubsw m4, m1, m6 | |
25685 | pmulhrsw m4, m7 | |
25686 | pslldq m3, 2 | |
25687 | pinsrw m3, [r3 + 20], 0 | |
25688 | pmaddubsw m5, m3, m6 | |
25689 | pmulhrsw m5, m7 | |
25690 | packuswb m4, m5 | |
25691 | movu [r0 + 1231 * 16], m4 | |
25692 | ||
25693 | ; mode 21 [row 8] | |
25694 | movu m6, [r5 + 7 * 16] | |
25695 | pmaddubsw m4, m0, m6 | |
25696 | pmulhrsw m4, m7 | |
25697 | pmaddubsw m5, m2, m6 | |
25698 | pmulhrsw m5, m7 | |
25699 | packuswb m4, m5 | |
25700 | movu [r0 + 1232 * 16], m4 | |
25701 | pmaddubsw m4, m1, m6 | |
25702 | pmulhrsw m4, m7 | |
25703 | pmaddubsw m5, m3, m6 | |
25704 | pmulhrsw m5, m7 | |
25705 | packuswb m4, m5 | |
25706 | movu [r0 + 1233 * 16], m4 | |
25707 | ||
25708 | ; mode 21 [row 9] | |
25709 | movu m6, [r5 + 22 * 16] | |
25710 | pslldq m0, 2 | |
25711 | pinsrb m0, [r4 + 8], 1 | |
25712 | pinsrb m0, [r4 + 9], 0 | |
25713 | pmaddubsw m4, m0, m6 | |
25714 | pmulhrsw m4, m7 | |
25715 | pslldq m2, 2 | |
25716 | pinsrw m2, [r3 + 3], 0 | |
25717 | pmaddubsw m5, m2, m6 | |
25718 | pmulhrsw m5, m7 | |
25719 | packuswb m4, m5 | |
25720 | movu [r0 + 1234 * 16], m4 | |
25721 | pslldq m1, 2 | |
25722 | pinsrw m1, [r3 + 11], 0 | |
25723 | pmaddubsw m4, m1, m6 | |
25724 | pmulhrsw m4, m7 | |
25725 | pslldq m3, 2 | |
25726 | pinsrw m3, [r3 + 19], 0 | |
25727 | pmaddubsw m5, m3, m6 | |
25728 | pmulhrsw m5, m7 | |
25729 | packuswb m4, m5 | |
25730 | movu [r0 + 1235 * 16], m4 | |
25731 | ||
25732 | ; mode 21 [row 10] | |
25733 | movu m6, [r5 + 5 * 16] | |
25734 | pmaddubsw m4, m0, m6 | |
25735 | pmulhrsw m4, m7 | |
25736 | pmaddubsw m5, m2, m6 | |
25737 | pmulhrsw m5, m7 | |
25738 | packuswb m4, m5 | |
25739 | movu [r0 + 1236 * 16], m4 | |
25740 | pmaddubsw m4, m1, m6 | |
25741 | pmulhrsw m4, m7 | |
25742 | pmaddubsw m5, m3, m6 | |
25743 | pmulhrsw m5, m7 | |
25744 | packuswb m4, m5 | |
25745 | movu [r0 + 1237 * 16], m4 | |
25746 | ||
25747 | ; mode 21 [row 11] | |
25748 | movu m6, [r5 + 20 * 16] | |
25749 | pslldq m0, 2 | |
25750 | pinsrb m0, [r4 + 9], 1 | |
25751 | pinsrb m0, [r4 + 11], 0 | |
25752 | pmaddubsw m4, m0, m6 | |
25753 | pmulhrsw m4, m7 | |
25754 | pslldq m2, 2 | |
25755 | pinsrw m2, [r3 + 2], 0 | |
25756 | pmaddubsw m5, m2, m6 | |
25757 | pmulhrsw m5, m7 | |
25758 | packuswb m4, m5 | |
25759 | movu [r0 + 1238 * 16], m4 | |
25760 | pslldq m1, 2 | |
25761 | pinsrw m1, [r3 + 10], 0 | |
25762 | pmaddubsw m4, m1, m6 | |
25763 | pmulhrsw m4, m7 | |
25764 | pslldq m3, 2 | |
25765 | pinsrw m3, [r3 + 18], 0 | |
25766 | pmaddubsw m5, m3, m6 | |
25767 | pmulhrsw m5, m7 | |
25768 | packuswb m4, m5 | |
25769 | movu [r0 + 1239 * 16], m4 | |
25770 | ||
25771 | ; mode 21 [row 12] | |
25772 | movu m6, [r5 + 3 * 16] | |
25773 | pmaddubsw m4, m0, m6 | |
25774 | pmulhrsw m4, m7 | |
25775 | pmaddubsw m5, m2, m6 | |
25776 | pmulhrsw m5, m7 | |
25777 | packuswb m4, m5 | |
25778 | movu [r0 + 1240 * 16], m4 | |
25779 | pmaddubsw m4, m1, m6 | |
25780 | pmulhrsw m4, m7 | |
25781 | pmaddubsw m5, m3, m6 | |
25782 | pmulhrsw m5, m7 | |
25783 | packuswb m4, m5 | |
25784 | movu [r0 + 1241 * 16], m4 | |
25785 | ||
25786 | ; mode 21 [row 13] | |
25787 | movu m6, [r5 + 18 * 16] | |
25788 | pslldq m0, 2 | |
25789 | pinsrb m0, [r4 + 11], 1 | |
25790 | pinsrb m0, [r4 + 13], 0 | |
25791 | pmaddubsw m4, m0, m6 | |
25792 | pmulhrsw m4, m7 | |
25793 | pslldq m2, 2 | |
25794 | pinsrw m2, [r3 + 1], 0 | |
25795 | pmaddubsw m5, m2, m6 | |
25796 | pmulhrsw m5, m7 | |
25797 | packuswb m4, m5 | |
25798 | movu [r0 + 1242 * 16], m4 | |
25799 | pslldq m1, 2 | |
25800 | pinsrw m1, [r3 + 9], 0 | |
25801 | pmaddubsw m4, m1, m6 | |
25802 | pmulhrsw m4, m7 | |
25803 | pslldq m3, 2 | |
25804 | pinsrw m3, [r3 + 17], 0 | |
25805 | pmaddubsw m5, m3, m6 | |
25806 | pmulhrsw m5, m7 | |
25807 | packuswb m4, m5 | |
25808 | movu [r0 + 1243 * 16], m4 | |
25809 | ||
25810 | ; mode 21 [row 14] | |
25811 | movu m6, [r5 + 1 * 16] | |
25812 | pmaddubsw m4, m0, m6 | |
25813 | pmulhrsw m4, m7 | |
25814 | pmaddubsw m5, m2, m6 | |
25815 | pmulhrsw m5, m7 | |
25816 | packuswb m4, m5 | |
25817 | movu [r0 + 1244 * 16], m4 | |
25818 | pmaddubsw m4, m1, m6 | |
25819 | pmulhrsw m4, m7 | |
25820 | pmaddubsw m5, m3, m6 | |
25821 | pmulhrsw m5, m7 | |
25822 | packuswb m4, m5 | |
25823 | movu [r0 + 1245 * 16], m4 | |
25824 | ||
25825 | ; mode 21 [row 15] | |
25826 | movu m6, [r5 + 16 * 16] | |
25827 | pslldq m0, 2 | |
25828 | pinsrb m0, [r4 + 13], 1 | |
25829 | pinsrb m0, [r4 + 15], 0 | |
25830 | pmaddubsw m4, m0, m6 | |
25831 | pmulhrsw m4, m7 | |
25832 | pslldq m2, 2 | |
25833 | pinsrw m2, [r3 + 0], 0 | |
25834 | pmaddubsw m5, m2, m6 | |
25835 | pmulhrsw m5, m7 | |
25836 | packuswb m4, m5 | |
25837 | movu [r0 + 1246 * 16], m4 | |
25838 | pslldq m1, 2 | |
25839 | pinsrw m1, [r3 + 8], 0 | |
25840 | pmaddubsw m4, m1, m6 | |
25841 | pmulhrsw m4, m7 | |
25842 | pslldq m3, 2 | |
25843 | pinsrw m3, [r3 + 16], 0 | |
25844 | pmaddubsw m5, m3, m6 | |
25845 | pmulhrsw m5, m7 | |
25846 | packuswb m4, m5 | |
25847 | movu [r0 + 1247 * 16], m4 | |
25848 | ||
25849 | ; mode 21 [row 16] | |
25850 | movu m6, [r5 + 31 * 16] | |
25851 | pslldq m0, 2 | |
25852 | pinsrb m0, [r4 + 15], 1 | |
25853 | pinsrb m0, [r4 + 17], 0 | |
25854 | pmaddubsw m4, m0, m6 | |
25855 | pmulhrsw m4, m7 | |
25856 | pslldq m2, 2 | |
25857 | pinsrb m2, [r4 + 0], 1 | |
25858 | pinsrb m2, [r4 + 2], 0 | |
25859 | pmaddubsw m5, m2, m6 | |
25860 | pmulhrsw m5, m7 | |
25861 | packuswb m4, m5 | |
25862 | movu [r0 + 1248 * 16], m4 | |
25863 | pslldq m1, 2 | |
25864 | pinsrw m1, [r3 + 7], 0 | |
25865 | pmaddubsw m4, m1, m6 | |
25866 | pmulhrsw m4, m7 | |
25867 | pslldq m3, 2 | |
25868 | pinsrw m3, [r3 + 15], 0 | |
25869 | pmaddubsw m5, m3, m6 | |
25870 | pmulhrsw m5, m7 | |
25871 | packuswb m4, m5 | |
25872 | movu [r0 + 1249 * 16], m4 | |
25873 | ||
25874 | ; mode 21 [row 17] | |
25875 | movu m6, [r5 + 14 * 16] | |
25876 | pmaddubsw m4, m0, m6 | |
25877 | pmulhrsw m4, m7 | |
25878 | pmaddubsw m5, m2, m6 | |
25879 | pmulhrsw m5, m7 | |
25880 | packuswb m4, m5 | |
25881 | movu [r0 + 1250 * 16], m4 | |
25882 | pmaddubsw m4, m1, m6 | |
25883 | pmulhrsw m4, m7 | |
25884 | pmaddubsw m5, m3, m6 | |
25885 | pmulhrsw m5, m7 | |
25886 | packuswb m4, m5 | |
25887 | movu [r0 + 1251 * 16], m4 | |
25888 | ||
25889 | ; mode 21 [row 18] | |
25890 | movu m6, [r5 + 29 * 16] | |
25891 | pslldq m0, 2 | |
25892 | pinsrb m0, [r4 + 17], 1 | |
25893 | pinsrb m0, [r4 + 19], 0 | |
25894 | pmaddubsw m4, m0, m6 | |
25895 | pmulhrsw m4, m7 | |
25896 | pslldq m2, 2 | |
25897 | pinsrb m2, [r4 + 2], 1 | |
25898 | pinsrb m2, [r4 + 4], 0 | |
25899 | pmaddubsw m5, m2, m6 | |
25900 | pmulhrsw m5, m7 | |
25901 | packuswb m4, m5 | |
25902 | movu [r0 + 1252 * 16], m4 | |
25903 | pslldq m1, 2 | |
25904 | pinsrb m1, [r3 + 7], 1 | |
25905 | pinsrb m1, [r3 + 6], 0 | |
25906 | pmaddubsw m4, m1, m6 | |
25907 | pmulhrsw m4, m7 | |
25908 | pslldq m3, 2 | |
25909 | pinsrb m3, [r3 + 15], 1 | |
25910 | pinsrb m3, [r3 + 14], 0 | |
25911 | pmaddubsw m5, m3, m6 | |
25912 | pmulhrsw m5, m7 | |
25913 | packuswb m4, m5 | |
25914 | movu [r0 + 1253 * 16], m4 | |
25915 | ||
25916 | ; mode 21 [row 19] | |
25917 | movu m6, [r5 + 12 * 16] | |
25918 | pmaddubsw m4, m0, m6 | |
25919 | pmulhrsw m4, m7 | |
25920 | pmaddubsw m5, m2, m6 | |
25921 | pmulhrsw m5, m7 | |
25922 | packuswb m4, m5 | |
25923 | movu [r0 + 1254 * 16], m4 | |
25924 | pmaddubsw m4, m1, m6 | |
25925 | pmulhrsw m4, m7 | |
25926 | pmaddubsw m5, m3, m6 | |
25927 | pmulhrsw m5, m7 | |
25928 | packuswb m4, m5 | |
25929 | movu [r0 + 1255 * 16], m4 | |
25930 | ||
25931 | ; mode 21 [row 20] | |
25932 | movu m6, [r5 + 27 * 16] | |
25933 | pslldq m0, 2 | |
25934 | pinsrb m0, [r4 + 19], 1 | |
25935 | pinsrb m0, [r4 + 21], 0 | |
25936 | pmaddubsw m4, m0, m6 | |
25937 | pmulhrsw m4, m7 | |
25938 | pslldq m2, 2 | |
25939 | pinsrb m2, [r4 + 4], 1 | |
25940 | pinsrb m2, [r4 + 6], 0 | |
25941 | pmaddubsw m5, m2, m6 | |
25942 | pmulhrsw m5, m7 | |
25943 | packuswb m4, m5 | |
25944 | movu [r0 + 1256 * 16], m4 | |
25945 | pslldq m1, 2 | |
25946 | pinsrw m1, [r3 + 5], 0 | |
25947 | pmaddubsw m4, m1, m6 | |
25948 | pmulhrsw m4, m7 | |
25949 | pslldq m3, 2 | |
25950 | pinsrw m3, [r3 + 13], 0 | |
25951 | pmaddubsw m5, m3, m6 | |
25952 | pmulhrsw m5, m7 | |
25953 | packuswb m4, m5 | |
25954 | movu [r0 + 1257 * 16], m4 | |
25955 | ||
25956 | ; mode 21 [row 21] | |
25957 | movu m6, [r5 + 10 * 16] | |
25958 | pmaddubsw m4, m0, m6 | |
25959 | pmulhrsw m4, m7 | |
25960 | pmaddubsw m5, m2, m6 | |
25961 | pmulhrsw m5, m7 | |
25962 | packuswb m4, m5 | |
25963 | movu [r0 + 1258 * 16], m4 | |
25964 | pmaddubsw m4, m1, m6 | |
25965 | pmulhrsw m4, m7 | |
25966 | pmaddubsw m5, m3, m6 | |
25967 | pmulhrsw m5, m7 | |
25968 | packuswb m4, m5 | |
25969 | movu [r0 + 1259 * 16], m4 | |
25970 | ||
25971 | ; mode 21 [row 22] | |
25972 | movu m6, [r5 + 25 * 16] | |
25973 | pslldq m0, 2 | |
25974 | pinsrb m0, [r4 + 21], 1 | |
25975 | pinsrb m0, [r4 + 23], 0 | |
25976 | pmaddubsw m4, m0, m6 | |
25977 | pmulhrsw m4, m7 | |
25978 | pslldq m2, 2 | |
25979 | pinsrb m2, [r4 + 6], 1 | |
25980 | pinsrb m2, [r4 + 8], 0 | |
25981 | pmaddubsw m5, m2, m6 | |
25982 | pmulhrsw m5, m7 | |
25983 | packuswb m4, m5 | |
25984 | movu [r0 + 1260 * 16], m4 | |
25985 | pslldq m1, 2 | |
25986 | pinsrw m1, [r3 + 4], 0 | |
25987 | pmaddubsw m4, m1, m6 | |
25988 | pmulhrsw m4, m7 | |
25989 | pslldq m3, 2 | |
25990 | pinsrw m3, [r3 + 12], 0 | |
25991 | pmaddubsw m5, m3, m6 | |
25992 | pmulhrsw m5, m7 | |
25993 | packuswb m4, m5 | |
25994 | movu [r0 + 1261 * 16], m4 | |
25995 | ||
25996 | ; mode 21 [row 23] | |
25997 | movu m6, [r5 + 8 * 16] | |
25998 | pmaddubsw m4, m0, m6 | |
25999 | pmulhrsw m4, m7 | |
26000 | pmaddubsw m5, m2, m6 | |
26001 | pmulhrsw m5, m7 | |
26002 | packuswb m4, m5 | |
26003 | movu [r0 + 1262 * 16], m4 | |
26004 | pmaddubsw m4, m1, m6 | |
26005 | pmulhrsw m4, m7 | |
26006 | pmaddubsw m5, m3, m6 | |
26007 | pmulhrsw m5, m7 | |
26008 | packuswb m4, m5 | |
26009 | movu [r0 + 1263 * 16], m4 | |
26010 | ||
26011 | ; mode 21 [row 24] | |
26012 | movu m6, [r5 + 23 * 16] | |
26013 | pslldq m0, 2 | |
26014 | pinsrb m0, [r4 + 23], 1 | |
26015 | pinsrb m0, [r4 + 24], 0 | |
26016 | pmaddubsw m4, m0, m6 | |
26017 | pmulhrsw m4, m7 | |
26018 | pslldq m2, 2 | |
26019 | pinsrb m2, [r4 + 8], 1 | |
26020 | pinsrb m2, [r4 + 9], 0 | |
26021 | pmaddubsw m5, m2, m6 | |
26022 | pmulhrsw m5, m7 | |
26023 | packuswb m4, m5 | |
26024 | movu [r0 + 1264 * 16], m4 | |
26025 | pslldq m1, 2 | |
26026 | pinsrw m1, [r3 + 3], 0 | |
26027 | pmaddubsw m4, m1, m6 | |
26028 | pmulhrsw m4, m7 | |
26029 | pslldq m3, 2 | |
26030 | pinsrw m3, [r3 + 11], 0 | |
26031 | pmaddubsw m5, m3, m6 | |
26032 | pmulhrsw m5, m7 | |
26033 | packuswb m4, m5 | |
26034 | movu [r0 + 1265 * 16], m4 | |
26035 | ||
26036 | ; mode 21 [row 25] | |
26037 | movu m6, [r5 + 6 * 16] | |
26038 | pmaddubsw m4, m0, m6 | |
26039 | pmulhrsw m4, m7 | |
26040 | pmaddubsw m5, m2, m6 | |
26041 | pmulhrsw m5, m7 | |
26042 | packuswb m4, m5 | |
26043 | movu [r0 + 1266 * 16], m4 | |
26044 | pmaddubsw m4, m1, m6 | |
26045 | pmulhrsw m4, m7 | |
26046 | pmaddubsw m5, m3, m6 | |
26047 | pmulhrsw m5, m7 | |
26048 | packuswb m4, m5 | |
26049 | movu [r0 + 1267 * 16], m4 | |
26050 | ||
26051 | ; mode 21 [row 26] | |
26052 | movu m6, [r5 + 21 * 16] | |
26053 | pslldq m0, 2 | |
26054 | pinsrb m0, [r4 + 24], 1 | |
26055 | pinsrb m0, [r4 + 26], 0 | |
26056 | pmaddubsw m4, m0, m6 | |
26057 | pmulhrsw m4, m7 | |
26058 | pslldq m2, 2 | |
26059 | pinsrb m2, [r4 + 9], 1 | |
26060 | pinsrb m2, [r4 + 11], 0 | |
26061 | pmaddubsw m5, m2, m6 | |
26062 | pmulhrsw m5, m7 | |
26063 | packuswb m4, m5 | |
26064 | movu [r0 + 1268 * 16], m4 | |
26065 | pslldq m1, 2 | |
26066 | pinsrw m1, [r3 + 2], 0 | |
26067 | pmaddubsw m4, m1, m6 | |
26068 | pmulhrsw m4, m7 | |
26069 | pslldq m3, 2 | |
26070 | pinsrw m3, [r3 + 10], 0 | |
26071 | pmaddubsw m5, m3, m6 | |
26072 | pmulhrsw m5, m7 | |
26073 | packuswb m4, m5 | |
26074 | movu [r0 + 1269 * 16], m4 | |
26075 | ||
26076 | ; mode 21 [row 27] | |
26077 | movu m6, [r5 + 4 * 16] | |
26078 | pmaddubsw m4, m0, m6 | |
26079 | pmulhrsw m4, m7 | |
26080 | pmaddubsw m5, m2, m6 | |
26081 | pmulhrsw m5, m7 | |
26082 | packuswb m4, m5 | |
26083 | movu [r0 + 1270 * 16], m4 | |
26084 | pmaddubsw m4, m1, m6 | |
26085 | pmulhrsw m4, m7 | |
26086 | pmaddubsw m5, m3, m6 | |
26087 | pmulhrsw m5, m7 | |
26088 | packuswb m4, m5 | |
26089 | movu [r0 + 1271 * 16], m4 | |
26090 | ||
26091 | ; mode 21 [row 28] | |
26092 | movu m6, [r5 + 19 * 16] | |
26093 | pslldq m0, 2 | |
26094 | pinsrb m0, [r4 + 26], 1 | |
26095 | pinsrb m0, [r4 + 28], 0 | |
26096 | pmaddubsw m4, m0, m6 | |
26097 | pmulhrsw m4, m7 | |
26098 | pslldq m2, 2 | |
26099 | pinsrb m2, [r4 + 11], 1 | |
26100 | pinsrb m2, [r4 + 13], 0 | |
26101 | pmaddubsw m5, m2, m6 | |
26102 | pmulhrsw m5, m7 | |
26103 | packuswb m4, m5 | |
26104 | movu [r0 + 1272 * 16], m4 | |
26105 | pslldq m1, 2 | |
26106 | pinsrw m1, [r3 + 1], 0 | |
26107 | pmaddubsw m4, m1, m6 | |
26108 | pmulhrsw m4, m7 | |
26109 | pslldq m3, 2 | |
26110 | pinsrw m3, [r3 + 9], 0 | |
26111 | pmaddubsw m5, m3, m6 | |
26112 | pmulhrsw m5, m7 | |
26113 | packuswb m4, m5 | |
26114 | movu [r0 + 1273 * 16], m4 | |
26115 | ||
26116 | ; mode 21 [row 29] | |
26117 | movu m6, [r5 + 2 * 16] | |
26118 | pmaddubsw m4, m0, m6 | |
26119 | pmulhrsw m4, m7 | |
26120 | pmaddubsw m5, m2, m6 | |
26121 | pmulhrsw m5, m7 | |
26122 | packuswb m4, m5 | |
26123 | movu [r0 + 1274 * 16], m4 | |
26124 | pmaddubsw m4, m1, m6 | |
26125 | pmulhrsw m4, m7 | |
26126 | pmaddubsw m5, m3, m6 | |
26127 | pmulhrsw m5, m7 | |
26128 | packuswb m4, m5 | |
26129 | movu [r0 + 1275 * 16], m4 | |
26130 | ||
26131 | ; mode 21 [row 30] | |
26132 | movu m6, [r5 + 17 * 16] | |
26133 | pslldq m0, 2 | |
26134 | pinsrb m0, [r4 + 28], 1 | |
26135 | pinsrb m0, [r4 + 30], 0 | |
26136 | pmaddubsw m4, m0, m6 | |
26137 | pmulhrsw m4, m7 | |
26138 | pslldq m2, 2 | |
26139 | pinsrb m2, [r4 + 13], 1 | |
26140 | pinsrb m2, [r4 + 15], 0 | |
26141 | pmaddubsw m5, m2, m6 | |
26142 | pmulhrsw m5, m7 | |
26143 | packuswb m4, m5 | |
26144 | movu [r0 + 1276 * 16], m4 | |
26145 | pslldq m1, 2 | |
26146 | pinsrw m1, [r3 + 0], 0 | |
26147 | pmaddubsw m4, m1, m6 | |
26148 | pmulhrsw m4, m7 | |
26149 | pslldq m3, 2 | |
26150 | pinsrw m3, [r3 + 8], 0 | |
26151 | pmaddubsw m5, m3, m6 | |
26152 | pmulhrsw m5, m7 | |
26153 | packuswb m4, m5 | |
26154 | movu [r0 + 1277 * 16], m4 | |
26155 | ||
26156 | ; mode 21 [row 31] | |
26157 | pshufb m5, m0, [tab_S2] | |
26158 | movh [r0 + 1278 * 16], m5 | |
26159 | pshufb m5, m2, [tab_S2] | |
26160 | movh [r0 + 1278 * 16 + 8], m5 | |
26161 | pshufb m5, m1, [tab_S2] | |
26162 | movh [r0 + 1279 * 16], m5 | |
26163 | pshufb m5, m3, [tab_S2] | |
26164 | movh [r0 + 1279 * 16 + 8], m5 | |
26165 | ||
26166 | ; mode 22 [row 0] | |
26167 | movu m6, [r5 + 19 * 16] | |
26168 | movu m0, [r3 ] | |
26169 | movu m1, [r3 + 1 ] | |
26170 | punpcklbw m0, m1 | |
26171 | pmaddubsw m1, m0, m6 | |
26172 | pmulhrsw m1, m7 | |
26173 | movu m2, [r3 + 8] | |
26174 | movu m3, [r3 + 9] | |
26175 | punpcklbw m2, m3 | |
26176 | pmaddubsw m3, m2, m6 | |
26177 | pmulhrsw m3, m7 | |
26178 | packuswb m1, m3 | |
26179 | movu [r0 + 1280 * 16], m1 | |
26180 | ||
26181 | movu m1, [r3 + 16] | |
26182 | movu m3, [r3 + 17] | |
26183 | punpcklbw m1, m3 | |
26184 | pmaddubsw m4, m1, m6 | |
26185 | pmulhrsw m4, m7 | |
26186 | movu m3, [r3 + 24] | |
26187 | movu m5, [r3 + 25] | |
26188 | punpcklbw m3, m5 | |
26189 | pmaddubsw m5, m3, m6 | |
26190 | pmulhrsw m5, m7 | |
26191 | packuswb m4, m5 | |
26192 | movu [r0 + 1281 * 16], m4 | |
26193 | ||
26194 | ; mode 22 [row 1] | |
26195 | movu m6, [r5 + 6 * 16] | |
26196 | pmaddubsw m4, m0, m6 | |
26197 | pmulhrsw m4, m7 | |
26198 | pmaddubsw m5, m2, m6 | |
26199 | pmulhrsw m5, m7 | |
26200 | packuswb m4, m5 | |
26201 | movu [r0 + 1282 * 16], m4 | |
26202 | pmaddubsw m4, m1, m6 | |
26203 | pmulhrsw m4, m7 | |
26204 | pmaddubsw m5, m3, m6 | |
26205 | pmulhrsw m5, m7 | |
26206 | packuswb m4, m5 | |
26207 | movu [r0 + 1283 * 16], m4 | |
26208 | ||
26209 | ; mode 22 [row 2] | |
26210 | movu m6, [r5 + 25 * 16] | |
26211 | pslldq m0, 2 | |
26212 | pinsrb m0, [r4 + 0], 1 | |
26213 | pinsrb m0, [r4 + 2], 0 | |
26214 | pmaddubsw m4, m0, m6 | |
26215 | pmulhrsw m4, m7 | |
26216 | pslldq m2, 2 | |
26217 | pinsrw m2, [r3 + 7], 0 | |
26218 | pmaddubsw m5, m2, m6 | |
26219 | pmulhrsw m5, m7 | |
26220 | packuswb m4, m5 | |
26221 | movu [r0 + 1284 * 16], m4 | |
26222 | pslldq m1, 2 | |
26223 | pinsrw m1, [r3 + 15], 0 | |
26224 | pmaddubsw m4, m1, m6 | |
26225 | pmulhrsw m4, m7 | |
26226 | pslldq m3, 2 | |
26227 | pinsrw m3, [r3 + 23], 0 | |
26228 | pmaddubsw m5, m3, m6 | |
26229 | pmulhrsw m5, m7 | |
26230 | packuswb m4, m5 | |
26231 | movu [r0 + 1285 * 16], m4 | |
26232 | ||
26233 | ; mode 22 [row 3] | |
26234 | movu m6, [r5 + 12 * 16] | |
26235 | pmaddubsw m4, m0, m6 | |
26236 | pmulhrsw m4, m7 | |
26237 | pmaddubsw m5, m2, m6 | |
26238 | pmulhrsw m5, m7 | |
26239 | packuswb m4, m5 | |
26240 | movu [r0 + 1286 * 16], m4 | |
26241 | pmaddubsw m4, m1, m6 | |
26242 | pmulhrsw m4, m7 | |
26243 | pmaddubsw m5, m3, m6 | |
26244 | pmulhrsw m5, m7 | |
26245 | packuswb m4, m5 | |
26246 | movu [r0 + 1287 * 16], m4 | |
26247 | ||
26248 | ; mode 22 [row 4] | |
26249 | movu m6, [r5 + 31 * 16] | |
26250 | pslldq m0, 2 | |
26251 | pinsrb m0, [r4 + 2], 1 | |
26252 | pinsrb m0, [r4 + 5], 0 | |
26253 | pmaddubsw m4, m0, m6 | |
26254 | pmulhrsw m4, m7 | |
26255 | pslldq m2, 2 | |
26256 | pinsrw m2, [r3 + 6], 0 | |
26257 | pmaddubsw m5, m2, m6 | |
26258 | pmulhrsw m5, m7 | |
26259 | packuswb m4, m5 | |
26260 | movu [r0 + 1288 * 16], m4 | |
26261 | pslldq m1, 2 | |
26262 | pinsrw m1, [r3 + 14], 0 | |
26263 | pmaddubsw m4, m1, m6 | |
26264 | pmulhrsw m4, m7 | |
26265 | pslldq m3, 2 | |
26266 | pinsrw m3, [r3 + 22], 0 | |
26267 | pmaddubsw m5, m3, m6 | |
26268 | pmulhrsw m5, m7 | |
26269 | packuswb m4, m5 | |
26270 | movu [r0 + 1289 * 16], m4 | |
26271 | ||
26272 | ; mode 22 [row 5] | |
26273 | movu m6, [r5 + 18 * 16] | |
26274 | pmaddubsw m4, m0, m6 | |
26275 | pmulhrsw m4, m7 | |
26276 | pmaddubsw m5, m2, m6 | |
26277 | pmulhrsw m5, m7 | |
26278 | packuswb m4, m5 | |
26279 | movu [r0 + 1290 * 16], m4 | |
26280 | pmaddubsw m4, m1, m6 | |
26281 | pmulhrsw m4, m7 | |
26282 | pmaddubsw m5, m3, m6 | |
26283 | pmulhrsw m5, m7 | |
26284 | packuswb m4, m5 | |
26285 | movu [r0 + 1291 * 16], m4 | |
26286 | ||
26287 | ; mode 22 [row 6] | |
26288 | movu m6, [r5 + 5 * 16] | |
26289 | pmaddubsw m4, m0, m6 | |
26290 | pmulhrsw m4, m7 | |
26291 | pmaddubsw m5, m2, m6 | |
26292 | pmulhrsw m5, m7 | |
26293 | packuswb m4, m5 | |
26294 | movu [r0 + 1292 * 16], m4 | |
26295 | pmaddubsw m4, m1, m6 | |
26296 | pmulhrsw m4, m7 | |
26297 | pmaddubsw m5, m3, m6 | |
26298 | pmulhrsw m5, m7 | |
26299 | packuswb m4, m5 | |
26300 | movu [r0 + 1293 * 16], m4 | |
26301 | ||
26302 | ; mode 22 [row 7] | |
26303 | movu m6, [r5 + 24 * 16] | |
26304 | pslldq m0, 2 | |
26305 | pinsrb m0, [r4 + 5], 1 | |
26306 | pinsrb m0, [r4 + 7], 0 | |
26307 | pmaddubsw m4, m0, m6 | |
26308 | pmulhrsw m4, m7 | |
26309 | pslldq m2, 2 | |
26310 | pinsrw m2, [r3 + 5], 0 | |
26311 | pmaddubsw m5, m2, m6 | |
26312 | pmulhrsw m5, m7 | |
26313 | packuswb m4, m5 | |
26314 | movu [r0 + 1294 * 16], m4 | |
26315 | pslldq m1, 2 | |
26316 | pinsrw m1, [r3 + 13], 0 | |
26317 | pmaddubsw m4, m1, m6 | |
26318 | pmulhrsw m4, m7 | |
26319 | pslldq m3, 2 | |
26320 | pinsrw m3, [r3 + 21], 0 | |
26321 | pmaddubsw m5, m3, m6 | |
26322 | pmulhrsw m5, m7 | |
26323 | packuswb m4, m5 | |
26324 | movu [r0 + 1295 * 16], m4 | |
26325 | ||
26326 | ; mode 22 [row 8] | |
26327 | movu m6, [r5 + 11 * 16] | |
26328 | pmaddubsw m4, m0, m6 | |
26329 | pmulhrsw m4, m7 | |
26330 | pmaddubsw m5, m2, m6 | |
26331 | pmulhrsw m5, m7 | |
26332 | packuswb m4, m5 | |
26333 | movu [r0 + 1296 * 16], m4 | |
26334 | pmaddubsw m4, m1, m6 | |
26335 | pmulhrsw m4, m7 | |
26336 | pmaddubsw m5, m3, m6 | |
26337 | pmulhrsw m5, m7 | |
26338 | packuswb m4, m5 | |
26339 | movu [r0 + 1297 * 16], m4 | |
26340 | ||
26341 | ; mode 22 [row 9] | |
26342 | movu m6, [r5 + 30 * 16] | |
26343 | pslldq m0, 2 | |
26344 | pinsrb m0, [r4 + 7], 1 | |
26345 | pinsrb m0, [r4 + 10], 0 | |
26346 | pmaddubsw m4, m0, m6 | |
26347 | pmulhrsw m4, m7 | |
26348 | pslldq m2, 2 | |
26349 | pinsrw m2, [r3 + 4], 0 | |
26350 | pmaddubsw m5, m2, m6 | |
26351 | pmulhrsw m5, m7 | |
26352 | packuswb m4, m5 | |
26353 | movu [r0 + 1298 * 16], m4 | |
26354 | pslldq m1, 2 | |
26355 | pinsrw m1, [r3 + 12], 0 | |
26356 | pmaddubsw m4, m1, m6 | |
26357 | pmulhrsw m4, m7 | |
26358 | pslldq m3, 2 | |
26359 | pinsrw m3, [r3 + 20], 0 | |
26360 | pmaddubsw m5, m3, m6 | |
26361 | pmulhrsw m5, m7 | |
26362 | packuswb m4, m5 | |
26363 | movu [r0 + 1299 * 16], m4 | |
26364 | ||
26365 | ; mode 22 [row 10] | |
26366 | movu m6, [r5 + 17 * 16] | |
26367 | pmaddubsw m4, m0, m6 | |
26368 | pmulhrsw m4, m7 | |
26369 | pmaddubsw m5, m2, m6 | |
26370 | pmulhrsw m5, m7 | |
26371 | packuswb m4, m5 | |
26372 | movu [r0 + 1300 * 16], m4 | |
26373 | pmaddubsw m4, m1, m6 | |
26374 | pmulhrsw m4, m7 | |
26375 | pmaddubsw m5, m3, m6 | |
26376 | pmulhrsw m5, m7 | |
26377 | packuswb m4, m5 | |
26378 | movu [r0 + 1301 * 16], m4 | |
26379 | ||
26380 | ; mode 22 [row 11] | |
26381 | movu m6, [r5 + 4 * 16] | |
26382 | pmaddubsw m4, m0, m6 | |
26383 | pmulhrsw m4, m7 | |
26384 | pmaddubsw m5, m2, m6 | |
26385 | pmulhrsw m5, m7 | |
26386 | packuswb m4, m5 | |
26387 | movu [r0 + 1302 * 16], m4 | |
26388 | pmaddubsw m4, m1, m6 | |
26389 | pmulhrsw m4, m7 | |
26390 | pmaddubsw m5, m3, m6 | |
26391 | pmulhrsw m5, m7 | |
26392 | packuswb m4, m5 | |
26393 | movu [r0 + 1303 * 16], m4 | |
26394 | ||
26395 | ; mode 22 [row 12] | |
26396 | movu m6, [r5 + 23 * 16] | |
26397 | pslldq m0, 2 | |
26398 | pinsrb m0, [r4 + 10], 1 | |
26399 | pinsrb m0, [r4 + 12], 0 | |
26400 | pmaddubsw m4, m0, m6 | |
26401 | pmulhrsw m4, m7 | |
26402 | pslldq m2, 2 | |
26403 | pinsrw m2, [r3 + 3], 0 | |
26404 | pmaddubsw m5, m2, m6 | |
26405 | pmulhrsw m5, m7 | |
26406 | packuswb m4, m5 | |
26407 | movu [r0 + 1304 * 16], m4 | |
26408 | pslldq m1, 2 | |
26409 | pinsrw m1, [r3 + 11], 0 | |
26410 | pmaddubsw m4, m1, m6 | |
26411 | pmulhrsw m4, m7 | |
26412 | pslldq m3, 2 | |
26413 | pinsrw m3, [r3 + 19], 0 | |
26414 | pmaddubsw m5, m3, m6 | |
26415 | pmulhrsw m5, m7 | |
26416 | packuswb m4, m5 | |
26417 | movu [r0 + 1305 * 16], m4 | |
26418 | ||
26419 | ; mode 22 [row 13] | |
26420 | movu m6, [r5 + 10 * 16] | |
26421 | pmaddubsw m4, m0, m6 | |
26422 | pmulhrsw m4, m7 | |
26423 | pmaddubsw m5, m2, m6 | |
26424 | pmulhrsw m5, m7 | |
26425 | packuswb m4, m5 | |
26426 | movu [r0 + 1306 * 16], m4 | |
26427 | pmaddubsw m4, m1, m6 | |
26428 | pmulhrsw m4, m7 | |
26429 | pmaddubsw m5, m3, m6 | |
26430 | pmulhrsw m5, m7 | |
26431 | packuswb m4, m5 | |
26432 | movu [r0 + 1307 * 16], m4 | |
26433 | ||
26434 | ; mode 22 [row 14] | |
26435 | movu m6, [r5 + 29 * 16] | |
26436 | pslldq m0, 2 | |
26437 | pinsrb m0, [r4 + 12], 1 | |
26438 | pinsrb m0, [r4 + 15], 0 | |
26439 | pmaddubsw m4, m0, m6 | |
26440 | pmulhrsw m4, m7 | |
26441 | pslldq m2, 2 | |
26442 | pinsrw m2, [r3 + 2], 0 | |
26443 | pmaddubsw m5, m2, m6 | |
26444 | pmulhrsw m5, m7 | |
26445 | packuswb m4, m5 | |
26446 | movu [r0 + 1308 * 16], m4 | |
26447 | pslldq m1, 2 | |
26448 | pinsrw m1, [r3 + 10], 0 | |
26449 | pmaddubsw m4, m1, m6 | |
26450 | pmulhrsw m4, m7 | |
26451 | pslldq m3, 2 | |
26452 | pinsrw m3, [r3 + 18], 0 | |
26453 | pmaddubsw m5, m3, m6 | |
26454 | pmulhrsw m5, m7 | |
26455 | packuswb m4, m5 | |
26456 | movu [r0 + 1309 * 16], m4 | |
26457 | ||
26458 | ; mode 22 [row 15] | |
26459 | movu m6, [r5 + 16 * 16] | |
26460 | pmaddubsw m4, m0, m6 | |
26461 | pmulhrsw m4, m7 | |
26462 | pmaddubsw m5, m2, m6 | |
26463 | pmulhrsw m5, m7 | |
26464 | packuswb m4, m5 | |
26465 | movu [r0 + 1310 * 16], m4 | |
26466 | pmaddubsw m4, m1, m6 | |
26467 | pmulhrsw m4, m7 | |
26468 | pmaddubsw m5, m3, m6 | |
26469 | pmulhrsw m5, m7 | |
26470 | packuswb m4, m5 | |
26471 | movu [r0 + 1311 * 16], m4 | |
26472 | ||
26473 | ; mode 22 [row 16] | |
26474 | movu m6, [r5 + 3 * 16] | |
26475 | pmaddubsw m4, m0, m6 | |
26476 | pmulhrsw m4, m7 | |
26477 | pmaddubsw m5, m2, m6 | |
26478 | pmulhrsw m5, m7 | |
26479 | packuswb m4, m5 | |
26480 | movu [r0 + 1312 * 16], m4 | |
26481 | pmaddubsw m4, m1, m6 | |
26482 | pmulhrsw m4, m7 | |
26483 | pmaddubsw m5, m3, m6 | |
26484 | pmulhrsw m5, m7 | |
26485 | packuswb m4, m5 | |
26486 | movu [r0 + 1313 * 16], m4 | |
26487 | ||
26488 | ; mode 22 [row 17] | |
26489 | movu m6, [r5 + 22 * 16] | |
26490 | pslldq m0, 2 | |
26491 | pinsrb m0, [r4 + 15], 1 | |
26492 | pinsrb m0, [r4 + 17], 0 | |
26493 | pmaddubsw m4, m0, m6 | |
26494 | pmulhrsw m4, m7 | |
26495 | pslldq m2, 2 | |
26496 | pinsrw m2, [r3 + 1], 0 | |
26497 | pmaddubsw m5, m2, m6 | |
26498 | pmulhrsw m5, m7 | |
26499 | packuswb m4, m5 | |
26500 | movu [r0 + 1314 * 16], m4 | |
26501 | pslldq m1, 2 | |
26502 | pinsrw m1, [r3 + 9], 0 | |
26503 | pmaddubsw m4, m1, m6 | |
26504 | pmulhrsw m4, m7 | |
26505 | pslldq m3, 2 | |
26506 | pinsrw m3, [r3 + 17], 0 | |
26507 | pmaddubsw m5, m3, m6 | |
26508 | pmulhrsw m5, m7 | |
26509 | packuswb m4, m5 | |
26510 | movu [r0 + 1315 * 16], m4 | |
26511 | ||
26512 | ; mode 22 [row 18] | |
26513 | movu m6, [r5 + 9 * 16] | |
26514 | pmaddubsw m4, m0, m6 | |
26515 | pmulhrsw m4, m7 | |
26516 | pmaddubsw m5, m2, m6 | |
26517 | pmulhrsw m5, m7 | |
26518 | packuswb m4, m5 | |
26519 | movu [r0 + 1316 * 16], m4 | |
26520 | pmaddubsw m4, m1, m6 | |
26521 | pmulhrsw m4, m7 | |
26522 | pmaddubsw m5, m3, m6 | |
26523 | pmulhrsw m5, m7 | |
26524 | packuswb m4, m5 | |
26525 | movu [r0 + 1317 * 16], m4 | |
26526 | ||
26527 | ; mode 22 [row 19] | |
26528 | movu m6, [r5 + 28 * 16] | |
26529 | pslldq m0, 2 | |
26530 | pinsrb m0, [r4 + 17], 1 | |
26531 | pinsrb m0, [r4 + 20], 0 | |
26532 | pmaddubsw m4, m0, m6 | |
26533 | pmulhrsw m4, m7 | |
26534 | pslldq m2, 2 | |
26535 | pinsrw m2, [r3 + 0], 0 | |
26536 | pmaddubsw m5, m2, m6 | |
26537 | pmulhrsw m5, m7 | |
26538 | packuswb m4, m5 | |
26539 | movu [r0 + 1318 * 16], m4 | |
26540 | pslldq m1, 2 | |
26541 | pinsrw m1, [r3 + 8], 0 | |
26542 | pmaddubsw m4, m1, m6 | |
26543 | pmulhrsw m4, m7 | |
26544 | pslldq m3, 2 | |
26545 | pinsrw m3, [r3 + 16], 0 | |
26546 | pmaddubsw m5, m3, m6 | |
26547 | pmulhrsw m5, m7 | |
26548 | packuswb m4, m5 | |
26549 | movu [r0 + 1319 * 16], m4 | |
26550 | ||
26551 | ; mode 22 [row 20] | |
26552 | movu m6, [r5 + 15 * 16] | |
26553 | pmaddubsw m4, m0, m6 | |
26554 | pmulhrsw m4, m7 | |
26555 | pmaddubsw m5, m2, m6 | |
26556 | pmulhrsw m5, m7 | |
26557 | packuswb m4, m5 | |
26558 | movu [r0 + 1320 * 16], m4 | |
26559 | pmaddubsw m4, m1, m6 | |
26560 | pmulhrsw m4, m7 | |
26561 | pmaddubsw m5, m3, m6 | |
26562 | pmulhrsw m5, m7 | |
26563 | packuswb m4, m5 | |
26564 | movu [r0 + 1321 * 16], m4 | |
26565 | ||
26566 | ; mode 22 [row 21] | |
26567 | movu m6, [r5 + 2 * 16] | |
26568 | pmaddubsw m4, m0, m6 | |
26569 | pmulhrsw m4, m7 | |
26570 | pmaddubsw m5, m2, m6 | |
26571 | pmulhrsw m5, m7 | |
26572 | packuswb m4, m5 | |
26573 | movu [r0 + 1322 * 16], m4 | |
26574 | pmaddubsw m4, m1, m6 | |
26575 | pmulhrsw m4, m7 | |
26576 | pmaddubsw m5, m3, m6 | |
26577 | pmulhrsw m5, m7 | |
26578 | packuswb m4, m5 | |
26579 | movu [r0 + 1323 * 16], m4 | |
26580 | ||
26581 | ; mode 22 [row 22] | |
26582 | movu m6, [r5 + 21 * 16] | |
26583 | pslldq m0, 2 | |
26584 | pinsrb m0, [r4 + 20], 1 | |
26585 | pinsrb m0, [r4 + 22], 0 | |
26586 | pmaddubsw m4, m0, m6 | |
26587 | pmulhrsw m4, m7 | |
26588 | pslldq m2, 2 | |
26589 | pinsrb m2, [r4 + 0], 1 | |
26590 | pinsrb m2, [r4 + 2], 0 | |
26591 | pmaddubsw m5, m2, m6 | |
26592 | pmulhrsw m5, m7 | |
26593 | packuswb m4, m5 | |
26594 | movu [r0 + 1324 * 16], m4 | |
26595 | pslldq m1, 2 | |
26596 | pinsrw m1, [r3 + 7], 0 | |
26597 | pmaddubsw m4, m1, m6 | |
26598 | pmulhrsw m4, m7 | |
26599 | pslldq m3, 2 | |
26600 | pinsrw m3, [r3 + 15], 0 | |
26601 | pmaddubsw m5, m3, m6 | |
26602 | pmulhrsw m5, m7 | |
26603 | packuswb m4, m5 | |
26604 | movu [r0 + 1325 * 16], m4 | |
26605 | ||
26606 | ; mode 22 [row 23] | |
26607 | movu m6, [r5 + 8 * 16] | |
26608 | pmaddubsw m4, m0, m6 | |
26609 | pmulhrsw m4, m7 | |
26610 | pmaddubsw m5, m2, m6 | |
26611 | pmulhrsw m5, m7 | |
26612 | packuswb m4, m5 | |
26613 | movu [r0 + 1326 * 16], m4 | |
26614 | pmaddubsw m4, m1, m6 | |
26615 | pmulhrsw m4, m7 | |
26616 | pmaddubsw m5, m3, m6 | |
26617 | pmulhrsw m5, m7 | |
26618 | packuswb m4, m5 | |
26619 | movu [r0 + 1327 * 16], m4 | |
26620 | ||
26621 | ; mode 22 [row 24] | |
26622 | movu m6, [r5 + 27 * 16] | |
26623 | pslldq m0, 2 | |
26624 | pinsrb m0, [r4 + 22], 1 | |
26625 | pinsrb m0, [r4 + 25], 0 | |
26626 | pmaddubsw m4, m0, m6 | |
26627 | pmulhrsw m4, m7 | |
26628 | pslldq m2, 2 | |
26629 | pinsrb m2, [r4 + 2], 1 | |
26630 | pinsrb m2, [r4 + 5], 0 | |
26631 | pmaddubsw m5, m2, m6 | |
26632 | pmulhrsw m5, m7 | |
26633 | packuswb m4, m5 | |
26634 | movu [r0 + 1328 * 16], m4 | |
26635 | pslldq m1, 2 | |
26636 | pinsrw m1, [r3 + 6], 0 | |
26637 | pmaddubsw m4, m1, m6 | |
26638 | pmulhrsw m4, m7 | |
26639 | pslldq m3, 2 | |
26640 | pinsrw m3, [r3 + 14], 0 | |
26641 | pmaddubsw m5, m3, m6 | |
26642 | pmulhrsw m5, m7 | |
26643 | packuswb m4, m5 | |
26644 | movu [r0 + 1329 * 16], m4 | |
26645 | ||
26646 | ; mode 22 [row 25] | |
26647 | movu m6, [r5 + 14 * 16] | |
26648 | pmaddubsw m4, m0, m6 | |
26649 | pmulhrsw m4, m7 | |
26650 | pmaddubsw m5, m2, m6 | |
26651 | pmulhrsw m5, m7 | |
26652 | packuswb m4, m5 | |
26653 | movu [r0 + 1330 * 16], m4 | |
26654 | pmaddubsw m4, m1, m6 | |
26655 | pmulhrsw m4, m7 | |
26656 | pmaddubsw m5, m3, m6 | |
26657 | pmulhrsw m5, m7 | |
26658 | packuswb m4, m5 | |
26659 | movu [r0 + 1331 * 16], m4 | |
26660 | ||
26661 | ; mode 22 [row 26] | |
26662 | movu m6, [r5 + 1 * 16] | |
26663 | pmaddubsw m4, m0, m6 | |
26664 | pmulhrsw m4, m7 | |
26665 | pmaddubsw m5, m2, m6 | |
26666 | pmulhrsw m5, m7 | |
26667 | packuswb m4, m5 | |
26668 | movu [r0 + 1332 * 16], m4 | |
26669 | pmaddubsw m4, m1, m6 | |
26670 | pmulhrsw m4, m7 | |
26671 | pmaddubsw m5, m3, m6 | |
26672 | pmulhrsw m5, m7 | |
26673 | packuswb m4, m5 | |
26674 | movu [r0 + 1333 * 16], m4 | |
26675 | ||
26676 | ; mode 22 [row 27] | |
26677 | movu m6, [r5 + 20 * 16] | |
26678 | pslldq m0, 2 | |
26679 | pinsrb m0, [r4 + 25], 1 | |
26680 | pinsrb m0, [r4 + 27], 0 | |
26681 | pmaddubsw m4, m0, m6 | |
26682 | pmulhrsw m4, m7 | |
26683 | pslldq m2, 2 | |
26684 | pinsrb m2, [r4 + 5], 1 | |
26685 | pinsrb m2, [r4 + 7], 0 | |
26686 | pmaddubsw m5, m2, m6 | |
26687 | pmulhrsw m5, m7 | |
26688 | packuswb m4, m5 | |
26689 | movu [r0 + 1334 * 16], m4 | |
26690 | pslldq m1, 2 | |
26691 | pinsrw m1, [r3 + 5], 0 | |
26692 | pmaddubsw m4, m1, m6 | |
26693 | pmulhrsw m4, m7 | |
26694 | pslldq m3, 2 | |
26695 | pinsrw m3, [r3 + 13], 0 | |
26696 | pmaddubsw m5, m3, m6 | |
26697 | pmulhrsw m5, m7 | |
26698 | packuswb m4, m5 | |
26699 | movu [r0 + 1335 * 16], m4 | |
26700 | ||
26701 | ; mode 22 [row 28] | |
26702 | movu m6, [r5 + 7 * 16] | |
26703 | pmaddubsw m4, m0, m6 | |
26704 | pmulhrsw m4, m7 | |
26705 | pmaddubsw m5, m2, m6 | |
26706 | pmulhrsw m5, m7 | |
26707 | packuswb m4, m5 | |
26708 | movu [r0 + 1336 * 16], m4 | |
26709 | pmaddubsw m4, m1, m6 | |
26710 | pmulhrsw m4, m7 | |
26711 | pmaddubsw m5, m3, m6 | |
26712 | pmulhrsw m5, m7 | |
26713 | packuswb m4, m5 | |
26714 | movu [r0 + 1337 * 16], m4 | |
26715 | ||
26716 | ; mode 22 [row 29] | |
26717 | movu m6, [r5 + 26 * 16] | |
26718 | pslldq m0, 2 | |
26719 | pinsrb m0, [r4 + 27], 1 | |
26720 | pinsrb m0, [r4 + 30], 0 | |
26721 | pmaddubsw m4, m0, m6 | |
26722 | pmulhrsw m4, m7 | |
26723 | pslldq m2, 2 | |
26724 | pinsrb m2, [r4 + 7], 1 | |
26725 | pinsrb m2, [r4 + 10], 0 | |
26726 | pmaddubsw m5, m2, m6 | |
26727 | pmulhrsw m5, m7 | |
26728 | packuswb m4, m5 | |
26729 | movu [r0 + 1338 * 16], m4 | |
26730 | pslldq m1, 2 | |
26731 | pinsrw m1, [r3 + 4], 0 | |
26732 | pmaddubsw m4, m1, m6 | |
26733 | pmulhrsw m4, m7 | |
26734 | pslldq m3, 2 | |
26735 | pinsrw m3, [r3 + 12], 0 | |
26736 | pmaddubsw m5, m3, m6 | |
26737 | pmulhrsw m5, m7 | |
26738 | packuswb m4, m5 | |
26739 | movu [r0 + 1339 * 16], m4 | |
26740 | ||
26741 | ; mode 22 [row 30] | |
26742 | movu m6, [r5 + 13 * 16] | |
26743 | pmaddubsw m4, m0, m6 | |
26744 | pmulhrsw m4, m7 | |
26745 | pmaddubsw m5, m2, m6 | |
26746 | pmulhrsw m5, m7 | |
26747 | packuswb m4, m5 | |
26748 | movu [r0 + 1340 * 16], m4 | |
26749 | pmaddubsw m4, m1, m6 | |
26750 | pmulhrsw m4, m7 | |
26751 | pmaddubsw m5, m3, m6 | |
26752 | pmulhrsw m5, m7 | |
26753 | packuswb m4, m5 | |
26754 | movu [r0 + 1341 * 16], m4 | |
26755 | ||
26756 | ; mode 22 [row 31] | |
26757 | pshufb m5, m0, [tab_S2] | |
26758 | movh [r0 + 1342 * 16], m5 | |
26759 | pshufb m5, m2, [tab_S2] | |
26760 | movh [r0 + 1342 * 16 + 8], m5 | |
26761 | pshufb m5, m1, [tab_S2] | |
26762 | movh [r0 + 1343 * 16], m5 | |
26763 | pshufb m5, m3, [tab_S2] | |
26764 | movh [r0 + 1343 * 16 + 8], m5 | |
26765 | ||
26766 | ; mode 23 [row 0] | |
26767 | movu m6, [r5 + 23 * 16] | |
26768 | movu m0, [r3 ] | |
26769 | movu m1, [r3 + 1 ] | |
26770 | punpcklbw m0, m1 | |
26771 | pmaddubsw m1, m0, m6 | |
26772 | pmulhrsw m1, m7 | |
26773 | movu m2, [r3 + 8] | |
26774 | movu m3, [r3 + 9] | |
26775 | punpcklbw m2, m3 | |
26776 | pmaddubsw m3, m2, m6 | |
26777 | pmulhrsw m3, m7 | |
26778 | packuswb m1, m3 | |
26779 | movu [r0 + 1344 * 16], m1 | |
26780 | ||
26781 | movu m1, [r3 + 16] | |
26782 | movu m3, [r3 + 17] | |
26783 | punpcklbw m1, m3 | |
26784 | pmaddubsw m4, m1, m6 | |
26785 | pmulhrsw m4, m7 | |
26786 | movu m3, [r3 + 24] | |
26787 | movu m5, [r3 + 25] | |
26788 | punpcklbw m3, m5 | |
26789 | pmaddubsw m5, m3, m6 | |
26790 | pmulhrsw m5, m7 | |
26791 | packuswb m4, m5 | |
26792 | movu [r0 + 1345 * 16], m4 | |
26793 | ||
26794 | ; mode 23 [row 1] | |
26795 | movu m6, [r5 + 14 * 16] | |
26796 | pmaddubsw m4, m0, m6 | |
26797 | pmulhrsw m4, m7 | |
26798 | pmaddubsw m5, m2, m6 | |
26799 | pmulhrsw m5, m7 | |
26800 | packuswb m4, m5 | |
26801 | movu [r0 + 1346 * 16], m4 | |
26802 | pmaddubsw m4, m1, m6 | |
26803 | pmulhrsw m4, m7 | |
26804 | pmaddubsw m5, m3, m6 | |
26805 | pmulhrsw m5, m7 | |
26806 | packuswb m4, m5 | |
26807 | movu [r0 + 1347 * 16], m4 | |
26808 | ||
26809 | ; mode 23 [row 2] | |
26810 | movu m6, [r5 + 5 * 16] | |
26811 | pmaddubsw m4, m0, m6 | |
26812 | pmulhrsw m4, m7 | |
26813 | pmaddubsw m5, m2, m6 | |
26814 | pmulhrsw m5, m7 | |
26815 | packuswb m4, m5 | |
26816 | movu [r0 + 1348 * 16], m4 | |
26817 | pmaddubsw m4, m1, m6 | |
26818 | pmulhrsw m4, m7 | |
26819 | pmaddubsw m5, m3, m6 | |
26820 | pmulhrsw m5, m7 | |
26821 | packuswb m4, m5 | |
26822 | movu [r0 + 1349 * 16], m4 | |
26823 | ||
26824 | ; mode 23 [row 3] | |
26825 | movu m6, [r5 + 28 * 16] | |
26826 | pslldq m0, 2 | |
26827 | pinsrb m0, [r4 + 0], 1 | |
26828 | pinsrb m0, [r4 + 4], 0 | |
26829 | pmaddubsw m4, m0, m6 | |
26830 | pmulhrsw m4, m7 | |
26831 | pslldq m2, 2 | |
26832 | pinsrw m2, [r3 + 7], 0 | |
26833 | pmaddubsw m5, m2, m6 | |
26834 | pmulhrsw m5, m7 | |
26835 | packuswb m4, m5 | |
26836 | movu [r0 + 1350 * 16], m4 | |
26837 | pslldq m1, 2 | |
26838 | pinsrw m1, [r3 + 15], 0 | |
26839 | pmaddubsw m4, m1, m6 | |
26840 | pmulhrsw m4, m7 | |
26841 | pslldq m3, 2 | |
26842 | pinsrw m3, [r3 + 23], 0 | |
26843 | pmaddubsw m5, m3, m6 | |
26844 | pmulhrsw m5, m7 | |
26845 | packuswb m4, m5 | |
26846 | movu [r0 + 1351 * 16], m4 | |
26847 | ||
26848 | ; mode 23 [row 4] | |
26849 | movu m6, [r5 + 19 * 16] | |
26850 | pmaddubsw m4, m0, m6 | |
26851 | pmulhrsw m4, m7 | |
26852 | pmaddubsw m5, m2, m6 | |
26853 | pmulhrsw m5, m7 | |
26854 | packuswb m4, m5 | |
26855 | movu [r0 + 1352 * 16], m4 | |
26856 | pmaddubsw m4, m1, m6 | |
26857 | pmulhrsw m4, m7 | |
26858 | pmaddubsw m5, m3, m6 | |
26859 | pmulhrsw m5, m7 | |
26860 | packuswb m4, m5 | |
26861 | movu [r0 + 1353 * 16], m4 | |
26862 | ||
26863 | ; mode 23 [row 5] | |
26864 | movu m6, [r5 + 10 * 16] | |
26865 | pmaddubsw m4, m0, m6 | |
26866 | pmulhrsw m4, m7 | |
26867 | pmaddubsw m5, m2, m6 | |
26868 | pmulhrsw m5, m7 | |
26869 | packuswb m4, m5 | |
26870 | movu [r0 + 1354 * 16], m4 | |
26871 | pmaddubsw m4, m1, m6 | |
26872 | pmulhrsw m4, m7 | |
26873 | pmaddubsw m5, m3, m6 | |
26874 | pmulhrsw m5, m7 | |
26875 | packuswb m4, m5 | |
26876 | movu [r0 + 1355 * 16], m4 | |
26877 | ||
26878 | ; mode 23 [row 6] | |
26879 | movu m6, [r5 + 1 * 16] | |
26880 | pmaddubsw m4, m0, m6 | |
26881 | pmulhrsw m4, m7 | |
26882 | pmaddubsw m5, m2, m6 | |
26883 | pmulhrsw m5, m7 | |
26884 | packuswb m4, m5 | |
26885 | movu [r0 + 1356 * 16], m4 | |
26886 | pmaddubsw m4, m1, m6 | |
26887 | pmulhrsw m4, m7 | |
26888 | pmaddubsw m5, m3, m6 | |
26889 | pmulhrsw m5, m7 | |
26890 | packuswb m4, m5 | |
26891 | movu [r0 + 1357 * 16], m4 | |
26892 | ||
26893 | ; mode 23 [row 7] | |
26894 | movu m6, [r5 + 24 * 16] | |
26895 | pslldq m0, 2 | |
26896 | pinsrb m0, [r4 + 4], 1 | |
26897 | pinsrb m0, [r4 + 7], 0 | |
26898 | pmaddubsw m4, m0, m6 | |
26899 | pmulhrsw m4, m7 | |
26900 | pslldq m2, 2 | |
26901 | pinsrw m2, [r3 + 6], 0 | |
26902 | pmaddubsw m5, m2, m6 | |
26903 | pmulhrsw m5, m7 | |
26904 | packuswb m4, m5 | |
26905 | movu [r0 + 1358 * 16], m4 | |
26906 | pslldq m1, 2 | |
26907 | pinsrw m1, [r3 + 14], 0 | |
26908 | pmaddubsw m4, m1, m6 | |
26909 | pmulhrsw m4, m7 | |
26910 | pslldq m3, 2 | |
26911 | pinsrw m3, [r3 + 22], 0 | |
26912 | pmaddubsw m5, m3, m6 | |
26913 | pmulhrsw m5, m7 | |
26914 | packuswb m4, m5 | |
26915 | movu [r0 + 1359 * 16], m4 | |
26916 | ||
26917 | ; mode 23 [row 8] | |
26918 | movu m6, [r5 + 15 * 16] | |
26919 | pmaddubsw m4, m0, m6 | |
26920 | pmulhrsw m4, m7 | |
26921 | pmaddubsw m5, m2, m6 | |
26922 | pmulhrsw m5, m7 | |
26923 | packuswb m4, m5 | |
26924 | movu [r0 + 1360 * 16], m4 | |
26925 | pmaddubsw m4, m1, m6 | |
26926 | pmulhrsw m4, m7 | |
26927 | pmaddubsw m5, m3, m6 | |
26928 | pmulhrsw m5, m7 | |
26929 | packuswb m4, m5 | |
26930 | movu [r0 + 1361 * 16], m4 | |
26931 | ||
26932 | ; mode 23 [row 9] | |
26933 | movu m6, [r5 + 6 * 16] | |
26934 | pmaddubsw m4, m0, m6 | |
26935 | pmulhrsw m4, m7 | |
26936 | pmaddubsw m5, m2, m6 | |
26937 | pmulhrsw m5, m7 | |
26938 | packuswb m4, m5 | |
26939 | movu [r0 + 1362 * 16], m4 | |
26940 | pmaddubsw m4, m1, m6 | |
26941 | pmulhrsw m4, m7 | |
26942 | pmaddubsw m5, m3, m6 | |
26943 | pmulhrsw m5, m7 | |
26944 | packuswb m4, m5 | |
26945 | movu [r0 + 1363 * 16], m4 | |
26946 | ||
26947 | ; mode 23 [row 10] | |
26948 | movu m6, [r5 + 29 * 16] | |
26949 | pslldq m0, 2 | |
26950 | pinsrb m0, [r4 + 7], 1 | |
26951 | pinsrb m0, [r4 + 11], 0 | |
26952 | pmaddubsw m4, m0, m6 | |
26953 | pmulhrsw m4, m7 | |
26954 | pslldq m2, 2 | |
26955 | pinsrw m2, [r3 + 5], 0 | |
26956 | pmaddubsw m5, m2, m6 | |
26957 | pmulhrsw m5, m7 | |
26958 | packuswb m4, m5 | |
26959 | movu [r0 + 1364 * 16], m4 | |
26960 | pslldq m1, 2 | |
26961 | pinsrw m1, [r3 + 13], 0 | |
26962 | pmaddubsw m4, m1, m6 | |
26963 | pmulhrsw m4, m7 | |
26964 | pslldq m3, 2 | |
26965 | pinsrw m3, [r3 + 21], 0 | |
26966 | pmaddubsw m5, m3, m6 | |
26967 | pmulhrsw m5, m7 | |
26968 | packuswb m4, m5 | |
26969 | movu [r0 + 1365 * 16], m4 | |
26970 | ||
26971 | ; mode 23 [row 11] | |
26972 | movu m6, [r5 + 20 * 16] | |
26973 | pmaddubsw m4, m0, m6 | |
26974 | pmulhrsw m4, m7 | |
26975 | pmaddubsw m5, m2, m6 | |
26976 | pmulhrsw m5, m7 | |
26977 | packuswb m4, m5 | |
26978 | movu [r0 + 1366 * 16], m4 | |
26979 | pmaddubsw m4, m1, m6 | |
26980 | pmulhrsw m4, m7 | |
26981 | pmaddubsw m5, m3, m6 | |
26982 | pmulhrsw m5, m7 | |
26983 | packuswb m4, m5 | |
26984 | movu [r0 + 1367 * 16], m4 | |
26985 | ||
26986 | ; mode 23 [row 12] | |
26987 | movu m6, [r5 + 11 * 16] | |
26988 | pmaddubsw m4, m0, m6 | |
26989 | pmulhrsw m4, m7 | |
26990 | pmaddubsw m5, m2, m6 | |
26991 | pmulhrsw m5, m7 | |
26992 | packuswb m4, m5 | |
26993 | movu [r0 + 1368 * 16], m4 | |
26994 | pmaddubsw m4, m1, m6 | |
26995 | pmulhrsw m4, m7 | |
26996 | pmaddubsw m5, m3, m6 | |
26997 | pmulhrsw m5, m7 | |
26998 | packuswb m4, m5 | |
26999 | movu [r0 + 1369 * 16], m4 | |
27000 | ||
27001 | ; mode 23 [row 13] | |
27002 | movu m6, [r5 + 2 * 16] | |
27003 | pmaddubsw m4, m0, m6 | |
27004 | pmulhrsw m4, m7 | |
27005 | pmaddubsw m5, m2, m6 | |
27006 | pmulhrsw m5, m7 | |
27007 | packuswb m4, m5 | |
27008 | movu [r0 + 1370 * 16], m4 | |
27009 | pmaddubsw m4, m1, m6 | |
27010 | pmulhrsw m4, m7 | |
27011 | pmaddubsw m5, m3, m6 | |
27012 | pmulhrsw m5, m7 | |
27013 | packuswb m4, m5 | |
27014 | movu [r0 + 1371 * 16], m4 | |
27015 | ||
27016 | ; mode 23 [row 14] | |
27017 | movu m6, [r5 + 25 * 16] | |
27018 | pslldq m0, 2 | |
27019 | pinsrb m0, [r4 + 11], 1 | |
27020 | pinsrb m0, [r4 + 14], 0 | |
27021 | pmaddubsw m4, m0, m6 | |
27022 | pmulhrsw m4, m7 | |
27023 | pslldq m2, 2 | |
27024 | pinsrw m2, [r3 + 4], 0 | |
27025 | pmaddubsw m5, m2, m6 | |
27026 | pmulhrsw m5, m7 | |
27027 | packuswb m4, m5 | |
27028 | movu [r0 + 1372 * 16], m4 | |
27029 | pslldq m1, 2 | |
27030 | pinsrw m1, [r3 + 12], 0 | |
27031 | pmaddubsw m4, m1, m6 | |
27032 | pmulhrsw m4, m7 | |
27033 | pslldq m3, 2 | |
27034 | pinsrw m3, [r3 + 20], 0 | |
27035 | pmaddubsw m5, m3, m6 | |
27036 | pmulhrsw m5, m7 | |
27037 | packuswb m4, m5 | |
27038 | movu [r0 + 1373 * 16], m4 | |
27039 | ||
27040 | ; mode 23 [row 15] | |
27041 | movu m6, [r5 + 16 * 16] | |
27042 | pmaddubsw m4, m0, m6 | |
27043 | pmulhrsw m4, m7 | |
27044 | pmaddubsw m5, m2, m6 | |
27045 | pmulhrsw m5, m7 | |
27046 | packuswb m4, m5 | |
27047 | movu [r0 + 1374 * 16], m4 | |
27048 | pmaddubsw m4, m1, m6 | |
27049 | pmulhrsw m4, m7 | |
27050 | pmaddubsw m5, m3, m6 | |
27051 | pmulhrsw m5, m7 | |
27052 | packuswb m4, m5 | |
27053 | movu [r0 + 1375 * 16], m4 | |
27054 | ||
27055 | ; mode 23 [row 16] | |
27056 | movu m6, [r5 + 7 * 16] | |
27057 | pmaddubsw m4, m0, m6 | |
27058 | pmulhrsw m4, m7 | |
27059 | pmaddubsw m5, m2, m6 | |
27060 | pmulhrsw m5, m7 | |
27061 | packuswb m4, m5 | |
27062 | movu [r0 + 1376 * 16], m4 | |
27063 | pmaddubsw m4, m1, m6 | |
27064 | pmulhrsw m4, m7 | |
27065 | pmaddubsw m5, m3, m6 | |
27066 | pmulhrsw m5, m7 | |
27067 | packuswb m4, m5 | |
27068 | movu [r0 + 1377 * 16], m4 | |
27069 | ||
27070 | ; mode 23 [row 17] | |
27071 | movu m6, [r5 + 30 * 16] | |
27072 | pslldq m0, 2 | |
27073 | pinsrb m0, [r4 + 14], 1 | |
27074 | pinsrb m0, [r4 + 18], 0 | |
27075 | pmaddubsw m4, m0, m6 | |
27076 | pmulhrsw m4, m7 | |
27077 | pslldq m2, 2 | |
27078 | pinsrw m2, [r3 + 3], 0 | |
27079 | pmaddubsw m5, m2, m6 | |
27080 | pmulhrsw m5, m7 | |
27081 | packuswb m4, m5 | |
27082 | movu [r0 + 1378 * 16], m4 | |
27083 | pslldq m1, 2 | |
27084 | pinsrw m1, [r3 + 11], 0 | |
27085 | pmaddubsw m4, m1, m6 | |
27086 | pmulhrsw m4, m7 | |
27087 | pslldq m3, 2 | |
27088 | pinsrw m3, [r3 + 19], 0 | |
27089 | pmaddubsw m5, m3, m6 | |
27090 | pmulhrsw m5, m7 | |
27091 | packuswb m4, m5 | |
27092 | movu [r0 + 1379 * 16], m4 | |
27093 | ||
27094 | ; mode 23 [row 18] | |
27095 | movu m6, [r5 + 21 * 16] | |
27096 | pmaddubsw m4, m0, m6 | |
27097 | pmulhrsw m4, m7 | |
27098 | pmaddubsw m5, m2, m6 | |
27099 | pmulhrsw m5, m7 | |
27100 | packuswb m4, m5 | |
27101 | movu [r0 + 1380 * 16], m4 | |
27102 | pmaddubsw m4, m1, m6 | |
27103 | pmulhrsw m4, m7 | |
27104 | pmaddubsw m5, m3, m6 | |
27105 | pmulhrsw m5, m7 | |
27106 | packuswb m4, m5 | |
27107 | movu [r0 + 1381 * 16], m4 | |
27108 | ||
27109 | ; mode 23 [row 19] | |
27110 | movu m6, [r5 + 12 * 16] | |
27111 | pmaddubsw m4, m0, m6 | |
27112 | pmulhrsw m4, m7 | |
27113 | pmaddubsw m5, m2, m6 | |
27114 | pmulhrsw m5, m7 | |
27115 | packuswb m4, m5 | |
27116 | movu [r0 + 1382 * 16], m4 | |
27117 | pmaddubsw m4, m1, m6 | |
27118 | pmulhrsw m4, m7 | |
27119 | pmaddubsw m5, m3, m6 | |
27120 | pmulhrsw m5, m7 | |
27121 | packuswb m4, m5 | |
27122 | movu [r0 + 1383 * 16], m4 | |
27123 | ||
27124 | ; mode 23 [row 20] | |
27125 | movu m6, [r5 + 3 * 16] | |
27126 | pmaddubsw m4, m0, m6 | |
27127 | pmulhrsw m4, m7 | |
27128 | pmaddubsw m5, m2, m6 | |
27129 | pmulhrsw m5, m7 | |
27130 | packuswb m4, m5 | |
27131 | movu [r0 + 1384 * 16], m4 | |
27132 | pmaddubsw m4, m1, m6 | |
27133 | pmulhrsw m4, m7 | |
27134 | pmaddubsw m5, m3, m6 | |
27135 | pmulhrsw m5, m7 | |
27136 | packuswb m4, m5 | |
27137 | movu [r0 + 1385 * 16], m4 | |
27138 | ||
27139 | ; mode 23 [row 21] | |
27140 | movu m6, [r5 + 26 * 16] | |
27141 | pslldq m0, 2 | |
27142 | pinsrb m0, [r4 + 18], 1 | |
27143 | pinsrb m0, [r4 + 21], 0 | |
27144 | pmaddubsw m4, m0, m6 | |
27145 | pmulhrsw m4, m7 | |
27146 | pslldq m2, 2 | |
27147 | pinsrw m2, [r3 + 2], 0 | |
27148 | pmaddubsw m5, m2, m6 | |
27149 | pmulhrsw m5, m7 | |
27150 | packuswb m4, m5 | |
27151 | movu [r0 + 1386 * 16], m4 | |
27152 | pslldq m1, 2 | |
27153 | pinsrw m1, [r3 + 10], 0 | |
27154 | pmaddubsw m4, m1, m6 | |
27155 | pmulhrsw m4, m7 | |
27156 | pslldq m3, 2 | |
27157 | pinsrw m3, [r3 + 18], 0 | |
27158 | pmaddubsw m5, m3, m6 | |
27159 | pmulhrsw m5, m7 | |
27160 | packuswb m4, m5 | |
27161 | movu [r0 + 1387 * 16], m4 | |
27162 | ||
27163 | ; mode 23 [row 22] | |
27164 | movu m6, [r5 + 17 * 16] | |
27165 | pmaddubsw m4, m0, m6 | |
27166 | pmulhrsw m4, m7 | |
27167 | pmaddubsw m5, m2, m6 | |
27168 | pmulhrsw m5, m7 | |
27169 | packuswb m4, m5 | |
27170 | movu [r0 + 1388 * 16], m4 | |
27171 | pmaddubsw m4, m1, m6 | |
27172 | pmulhrsw m4, m7 | |
27173 | pmaddubsw m5, m3, m6 | |
27174 | pmulhrsw m5, m7 | |
27175 | packuswb m4, m5 | |
27176 | movu [r0 + 1389 * 16], m4 | |
27177 | ||
27178 | ; mode 23 [row 23] | |
27179 | movu m6, [r5 + 8 * 16] | |
27180 | pmaddubsw m4, m0, m6 | |
27181 | pmulhrsw m4, m7 | |
27182 | pmaddubsw m5, m2, m6 | |
27183 | pmulhrsw m5, m7 | |
27184 | packuswb m4, m5 | |
27185 | movu [r0 + 1390 * 16], m4 | |
27186 | pmaddubsw m4, m1, m6 | |
27187 | pmulhrsw m4, m7 | |
27188 | pmaddubsw m5, m3, m6 | |
27189 | pmulhrsw m5, m7 | |
27190 | packuswb m4, m5 | |
27191 | movu [r0 + 1391 * 16], m4 | |
27192 | ||
27193 | ; mode 23 [row 24] | |
27194 | movu m6, [r5 + 31 * 16] | |
27195 | pslldq m0, 2 | |
27196 | pinsrb m0, [r4 + 21], 1 | |
27197 | pinsrb m0, [r4 + 25], 0 | |
27198 | pmaddubsw m4, m0, m6 | |
27199 | pmulhrsw m4, m7 | |
27200 | pslldq m2, 2 | |
27201 | pinsrw m2, [r3 + 1], 0 | |
27202 | pmaddubsw m5, m2, m6 | |
27203 | pmulhrsw m5, m7 | |
27204 | packuswb m4, m5 | |
27205 | movu [r0 + 1392 * 16], m4 | |
27206 | pslldq m1, 2 | |
27207 | pinsrw m1, [r3 + 9], 0 | |
27208 | pmaddubsw m4, m1, m6 | |
27209 | pmulhrsw m4, m7 | |
27210 | pslldq m3, 2 | |
27211 | pinsrw m3, [r3 + 17], 0 | |
27212 | pmaddubsw m5, m3, m6 | |
27213 | pmulhrsw m5, m7 | |
27214 | packuswb m4, m5 | |
27215 | movu [r0 + 1393 * 16], m4 | |
27216 | ||
27217 | ; mode 23 [row 25] | |
27218 | movu m6, [r5 + 22 * 16] | |
27219 | pmaddubsw m4, m0, m6 | |
27220 | pmulhrsw m4, m7 | |
27221 | pmaddubsw m5, m2, m6 | |
27222 | pmulhrsw m5, m7 | |
27223 | packuswb m4, m5 | |
27224 | movu [r0 + 1394 * 16], m4 | |
27225 | pmaddubsw m4, m1, m6 | |
27226 | pmulhrsw m4, m7 | |
27227 | pmaddubsw m5, m3, m6 | |
27228 | pmulhrsw m5, m7 | |
27229 | packuswb m4, m5 | |
27230 | movu [r0 + 1395 * 16], m4 | |
27231 | ||
27232 | ; mode 23 [row 26] | |
27233 | movu m6, [r5 + 13 * 16] | |
27234 | pmaddubsw m4, m0, m6 | |
27235 | pmulhrsw m4, m7 | |
27236 | pmaddubsw m5, m2, m6 | |
27237 | pmulhrsw m5, m7 | |
27238 | packuswb m4, m5 | |
27239 | movu [r0 + 1396 * 16], m4 | |
27240 | pmaddubsw m4, m1, m6 | |
27241 | pmulhrsw m4, m7 | |
27242 | pmaddubsw m5, m3, m6 | |
27243 | pmulhrsw m5, m7 | |
27244 | packuswb m4, m5 | |
27245 | movu [r0 + 1397 * 16], m4 | |
27246 | ||
27247 | ; mode 23 [row 27] | |
27248 | movu m6, [r5 + 4 * 16] | |
27249 | pmaddubsw m4, m0, m6 | |
27250 | pmulhrsw m4, m7 | |
27251 | pmaddubsw m5, m2, m6 | |
27252 | pmulhrsw m5, m7 | |
27253 | packuswb m4, m5 | |
27254 | movu [r0 + 1398 * 16], m4 | |
27255 | pmaddubsw m4, m1, m6 | |
27256 | pmulhrsw m4, m7 | |
27257 | pmaddubsw m5, m3, m6 | |
27258 | pmulhrsw m5, m7 | |
27259 | packuswb m4, m5 | |
27260 | movu [r0 + 1399 * 16], m4 | |
27261 | ||
27262 | ; mode 23 [row 28] | |
27263 | movu m6, [r5 + 27 * 16] | |
27264 | pslldq m0, 2 | |
27265 | pinsrb m0, [r4 + 25], 1 | |
27266 | pinsrb m0, [r4 + 28], 0 | |
27267 | pmaddubsw m4, m0, m6 | |
27268 | pmulhrsw m4, m7 | |
27269 | pslldq m2, 2 | |
27270 | pinsrw m2, [r3 + 0], 0 | |
27271 | pmaddubsw m5, m2, m6 | |
27272 | pmulhrsw m5, m7 | |
27273 | packuswb m4, m5 | |
27274 | movu [r0 + 1400 * 16], m4 | |
27275 | pslldq m1, 2 | |
27276 | pinsrw m1, [r3 + 8], 0 | |
27277 | pmaddubsw m4, m1, m6 | |
27278 | pmulhrsw m4, m7 | |
27279 | pslldq m3, 2 | |
27280 | pinsrw m3, [r3 + 16], 0 | |
27281 | pmaddubsw m5, m3, m6 | |
27282 | pmulhrsw m5, m7 | |
27283 | packuswb m4, m5 | |
27284 | movu [r0 + 1401 * 16], m4 | |
27285 | ||
27286 | ; mode 23 [row 29] | |
27287 | movu m6, [r5 + 18 * 16] | |
27288 | pmaddubsw m4, m0, m6 | |
27289 | pmulhrsw m4, m7 | |
27290 | pmaddubsw m5, m2, m6 | |
27291 | pmulhrsw m5, m7 | |
27292 | packuswb m4, m5 | |
27293 | movu [r0 + 1402 * 16], m4 | |
27294 | pmaddubsw m4, m1, m6 | |
27295 | pmulhrsw m4, m7 | |
27296 | pmaddubsw m5, m3, m6 | |
27297 | pmulhrsw m5, m7 | |
27298 | packuswb m4, m5 | |
27299 | movu [r0 + 1403 * 16], m4 | |
27300 | ||
27301 | ; mode 23 [row 30] | |
27302 | movu m6, [r5 + 9 * 16] | |
27303 | pmaddubsw m4, m0, m6 | |
27304 | pmulhrsw m4, m7 | |
27305 | pmaddubsw m5, m2, m6 | |
27306 | pmulhrsw m5, m7 | |
27307 | packuswb m4, m5 | |
27308 | movu [r0 + 1404 * 16], m4 | |
27309 | pmaddubsw m4, m1, m6 | |
27310 | pmulhrsw m4, m7 | |
27311 | pmaddubsw m5, m3, m6 | |
27312 | pmulhrsw m5, m7 | |
27313 | packuswb m4, m5 | |
27314 | movu [r0 + 1405 * 16], m4 | |
27315 | ||
27316 | ; mode 23 [row 31] | |
27317 | pshufb m5, m0, [tab_S2] | |
27318 | movh [r0 + 1406 * 16], m5 | |
27319 | pshufb m5, m2, [tab_S2] | |
27320 | movh [r0 + 1406 * 16 + 8], m5 | |
27321 | pshufb m5, m1, [tab_S2] | |
27322 | movh [r0 + 1407 * 16], m5 | |
27323 | pshufb m5, m3, [tab_S2] | |
27324 | movh [r0 + 1407 * 16 + 8], m5 | |
27325 | ||
27326 | ; mode 24 [row 0] | |
27327 | movu m6, [r5 + 27 * 16] | |
27328 | movu m0, [r3 ] | |
27329 | movu m1, [r3 + 1 ] | |
27330 | punpcklbw m0, m1 | |
27331 | pmaddubsw m4, m0, m6 | |
27332 | pmulhrsw m4, m7 | |
27333 | movu m2, [r3 + 8] | |
27334 | movu m3, [r3 + 9] | |
27335 | punpcklbw m2, m3 | |
27336 | pmaddubsw m5, m2, m6 | |
27337 | pmulhrsw m5, m7 | |
27338 | packuswb m4, m5 | |
27339 | movu [r0 + 1408 * 16], m4 | |
27340 | ||
27341 | movu m1, [r3 + 16] | |
27342 | movu m3, [r3 + 17] | |
27343 | punpcklbw m1, m3 | |
27344 | pmaddubsw m4, m1, m6 | |
27345 | pmulhrsw m4, m7 | |
27346 | movu m3, [r3 + 24] | |
27347 | movu m5, [r3 + 25] | |
27348 | punpcklbw m3, m5 | |
27349 | pmaddubsw m5, m3, m6 | |
27350 | pmulhrsw m5, m7 | |
27351 | packuswb m4, m5 | |
27352 | movu [r0 + 1409 * 16], m4 | |
27353 | ||
27354 | ; mode 24 [row 1] | |
27355 | movu m6, [r5 + 22 * 16] | |
27356 | pmaddubsw m4, m0, m6 | |
27357 | pmulhrsw m4, m7 | |
27358 | pmaddubsw m5, m2, m6 | |
27359 | pmulhrsw m5, m7 | |
27360 | packuswb m4, m5 | |
27361 | movu [r0 + 1410 * 16], m4 | |
27362 | pmaddubsw m4, m1, m6 | |
27363 | pmulhrsw m4, m7 | |
27364 | pmaddubsw m5, m3, m6 | |
27365 | pmulhrsw m5, m7 | |
27366 | packuswb m4, m5 | |
27367 | movu [r0 + 1411 * 16], m4 | |
27368 | ||
27369 | ; mode 24 [row 2] | |
27370 | movu m6, [r5 + 17 * 16] | |
27371 | pmaddubsw m4, m0, m6 | |
27372 | pmulhrsw m4, m7 | |
27373 | pmaddubsw m5, m2, m6 | |
27374 | pmulhrsw m5, m7 | |
27375 | packuswb m4, m5 | |
27376 | movu [r0 + 1412 * 16], m4 | |
27377 | pmaddubsw m4, m1, m6 | |
27378 | pmulhrsw m4, m7 | |
27379 | pmaddubsw m5, m3, m6 | |
27380 | pmulhrsw m5, m7 | |
27381 | packuswb m4, m5 | |
27382 | movu [r0 + 1413 * 16], m4 | |
27383 | ||
27384 | ; mode 24 [row 3] | |
27385 | movu m6, [r5 + 12 * 16] | |
27386 | pmaddubsw m4, m0, m6 | |
27387 | pmulhrsw m4, m7 | |
27388 | pmaddubsw m5, m2, m6 | |
27389 | pmulhrsw m5, m7 | |
27390 | packuswb m4, m5 | |
27391 | movu [r0 + 1414 * 16], m4 | |
27392 | pmaddubsw m4, m1, m6 | |
27393 | pmulhrsw m4, m7 | |
27394 | pmaddubsw m5, m3, m6 | |
27395 | pmulhrsw m5, m7 | |
27396 | packuswb m4, m5 | |
27397 | movu [r0 + 1415 * 16], m4 | |
27398 | ||
27399 | ; mode 24 [row 4] | |
27400 | movu m6, [r5 + 7 * 16] | |
27401 | pmaddubsw m4, m0, m6 | |
27402 | pmulhrsw m4, m7 | |
27403 | pmaddubsw m5, m2, m6 | |
27404 | pmulhrsw m5, m7 | |
27405 | packuswb m4, m5 | |
27406 | movu [r0 + 1416 * 16], m4 | |
27407 | pmaddubsw m4, m1, m6 | |
27408 | pmulhrsw m4, m7 | |
27409 | pmaddubsw m5, m3, m6 | |
27410 | pmulhrsw m5, m7 | |
27411 | packuswb m4, m5 | |
27412 | movu [r0 + 1417 * 16], m4 | |
27413 | ||
27414 | ; mode 24 [row 5] | |
27415 | movu m6, [r5 + 2 * 16] | |
27416 | pmaddubsw m4, m0, m6 | |
27417 | pmulhrsw m4, m7 | |
27418 | pmaddubsw m5, m2, m6 | |
27419 | pmulhrsw m5, m7 | |
27420 | packuswb m4, m5 | |
27421 | movu [r0 + 1418 * 16], m4 | |
27422 | pmaddubsw m4, m1, m6 | |
27423 | pmulhrsw m4, m7 | |
27424 | pmaddubsw m5, m3, m6 | |
27425 | pmulhrsw m5, m7 | |
27426 | packuswb m4, m5 | |
27427 | movu [r0 + 1419 * 16], m4 | |
27428 | ||
27429 | ; mode 24 [row 6] | |
27430 | movu m6, [r5 + 29 * 16] | |
27431 | pslldq m0, 2 | |
27432 | pinsrb m0, [r4 + 0], 1 | |
27433 | pinsrb m0, [r4 + 6], 0 | |
27434 | pmaddubsw m4, m0, m6 | |
27435 | pmulhrsw m4, m7 | |
27436 | pslldq m2, 2 | |
27437 | pinsrw m2, [r3 + 7], 0 | |
27438 | pmaddubsw m5, m2, m6 | |
27439 | pmulhrsw m5, m7 | |
27440 | packuswb m4, m5 | |
27441 | movu [r0 + 1420 * 16], m4 | |
27442 | pslldq m1, 2 | |
27443 | pinsrw m1, [r3 + 15], 0 | |
27444 | pmaddubsw m4, m1, m6 | |
27445 | pmulhrsw m4, m7 | |
27446 | pslldq m3, 2 | |
27447 | pinsrw m3, [r3 + 23], 0 | |
27448 | pmaddubsw m5, m3, m6 | |
27449 | pmulhrsw m5, m7 | |
27450 | packuswb m4, m5 | |
27451 | movu [r0 + 1421 * 16], m4 | |
27452 | ||
27453 | ; mode 24 [row 7] | |
27454 | movu m6, [r5 + 24 * 16] | |
27455 | pmaddubsw m4, m0, m6 | |
27456 | pmulhrsw m4, m7 | |
27457 | pmaddubsw m5, m2, m6 | |
27458 | pmulhrsw m5, m7 | |
27459 | packuswb m4, m5 | |
27460 | movu [r0 + 1422 * 16], m4 | |
27461 | pmaddubsw m4, m1, m6 | |
27462 | pmulhrsw m4, m7 | |
27463 | pmaddubsw m5, m3, m6 | |
27464 | pmulhrsw m5, m7 | |
27465 | packuswb m4, m5 | |
27466 | movu [r0 + 1423 * 16], m4 | |
27467 | ||
27468 | ; mode 24 [row 8] | |
27469 | movu m6, [r5 + 19 * 16] | |
27470 | pmaddubsw m4, m0, m6 | |
27471 | pmulhrsw m4, m7 | |
27472 | pmaddubsw m5, m2, m6 | |
27473 | pmulhrsw m5, m7 | |
27474 | packuswb m4, m5 | |
27475 | movu [r0 + 1424 * 16], m4 | |
27476 | pmaddubsw m4, m1, m6 | |
27477 | pmulhrsw m4, m7 | |
27478 | pmaddubsw m5, m3, m6 | |
27479 | pmulhrsw m5, m7 | |
27480 | packuswb m4, m5 | |
27481 | movu [r0 + 1425 * 16], m4 | |
27482 | ||
27483 | ; mode 24 [row 9] | |
27484 | movu m6, [r5 + 14 * 16] | |
27485 | pmaddubsw m4, m0, m6 | |
27486 | pmulhrsw m4, m7 | |
27487 | pmaddubsw m5, m2, m6 | |
27488 | pmulhrsw m5, m7 | |
27489 | packuswb m4, m5 | |
27490 | movu [r0 + 1426 * 16], m4 | |
27491 | pmaddubsw m4, m1, m6 | |
27492 | pmulhrsw m4, m7 | |
27493 | pmaddubsw m5, m3, m6 | |
27494 | pmulhrsw m5, m7 | |
27495 | packuswb m4, m5 | |
27496 | movu [r0 + 1427 * 16], m4 | |
27497 | ||
27498 | ; mode 24 [row 10] | |
27499 | movu m6, [r5 + 9 * 16] | |
27500 | pmaddubsw m4, m0, m6 | |
27501 | pmulhrsw m4, m7 | |
27502 | pmaddubsw m5, m2, m6 | |
27503 | pmulhrsw m5, m7 | |
27504 | packuswb m4, m5 | |
27505 | movu [r0 + 1428 * 16], m4 | |
27506 | pmaddubsw m4, m1, m6 | |
27507 | pmulhrsw m4, m7 | |
27508 | pmaddubsw m5, m3, m6 | |
27509 | pmulhrsw m5, m7 | |
27510 | packuswb m4, m5 | |
27511 | movu [r0 + 1429 * 16], m4 | |
27512 | ||
27513 | ; mode 24 [row 11] | |
27514 | movu m6, [r5 + 4 * 16] | |
27515 | pmaddubsw m4, m0, m6 | |
27516 | pmulhrsw m4, m7 | |
27517 | pmaddubsw m5, m2, m6 | |
27518 | pmulhrsw m5, m7 | |
27519 | packuswb m4, m5 | |
27520 | movu [r0 + 1430 * 16], m4 | |
27521 | pmaddubsw m4, m1, m6 | |
27522 | pmulhrsw m4, m7 | |
27523 | pmaddubsw m5, m3, m6 | |
27524 | pmulhrsw m5, m7 | |
27525 | packuswb m4, m5 | |
27526 | movu [r0 + 1431 * 16], m4 | |
27527 | ||
27528 | ; mode 24 [row 12] | |
27529 | movu m6, [r5 + 31 * 16] | |
27530 | pslldq m0, 2 | |
27531 | pinsrb m0, [r4 + 6], 1 | |
27532 | pinsrb m0, [r4 + 13], 0 | |
27533 | pmaddubsw m4, m0, m6 | |
27534 | pmulhrsw m4, m7 | |
27535 | pslldq m2, 2 | |
27536 | pinsrw m2, [r3 + 6], 0 | |
27537 | pmaddubsw m5, m2, m6 | |
27538 | pmulhrsw m5, m7 | |
27539 | packuswb m4, m5 | |
27540 | movu [r0 + 1432 * 16], m4 | |
27541 | pslldq m1, 2 | |
27542 | pinsrw m1, [r3 + 14], 0 | |
27543 | pmaddubsw m4, m1, m6 | |
27544 | pmulhrsw m4, m7 | |
27545 | pslldq m3, 2 | |
27546 | pinsrw m3, [r3 + 22], 0 | |
27547 | pmaddubsw m5, m3, m6 | |
27548 | pmulhrsw m5, m7 | |
27549 | packuswb m4, m5 | |
27550 | movu [r0 + 1433 * 16], m4 | |
27551 | ||
27552 | ; mode 24 [row 13] | |
27553 | movu m6, [r5 + 26 * 16] | |
27554 | pmaddubsw m4, m0, m6 | |
27555 | pmulhrsw m4, m7 | |
27556 | pmaddubsw m5, m2, m6 | |
27557 | pmulhrsw m5, m7 | |
27558 | packuswb m4, m5 | |
27559 | movu [r0 + 1434 * 16], m4 | |
27560 | pmaddubsw m4, m1, m6 | |
27561 | pmulhrsw m4, m7 | |
27562 | pmaddubsw m5, m3, m6 | |
27563 | pmulhrsw m5, m7 | |
27564 | packuswb m4, m5 | |
27565 | movu [r0 + 1435 * 16], m4 | |
27566 | ||
27567 | ; mode 24 [row 14] | |
27568 | movu m6, [r5 + 21 * 16] | |
27569 | pmaddubsw m4, m0, m6 | |
27570 | pmulhrsw m4, m7 | |
27571 | pmaddubsw m5, m2, m6 | |
27572 | pmulhrsw m5, m7 | |
27573 | packuswb m4, m5 | |
27574 | movu [r0 + 1436 * 16], m4 | |
27575 | pmaddubsw m4, m1, m6 | |
27576 | pmulhrsw m4, m7 | |
27577 | pmaddubsw m5, m3, m6 | |
27578 | pmulhrsw m5, m7 | |
27579 | packuswb m4, m5 | |
27580 | movu [r0 + 1437 * 16], m4 | |
27581 | ||
27582 | ; mode 24 [row 15] | |
27583 | movu m6, [r5 + 16 * 16] | |
27584 | pmaddubsw m4, m0, m6 | |
27585 | pmulhrsw m4, m7 | |
27586 | pmaddubsw m5, m2, m6 | |
27587 | pmulhrsw m5, m7 | |
27588 | packuswb m4, m5 | |
27589 | movu [r0 + 1438 * 16], m4 | |
27590 | pmaddubsw m4, m1, m6 | |
27591 | pmulhrsw m4, m7 | |
27592 | pmaddubsw m5, m3, m6 | |
27593 | pmulhrsw m5, m7 | |
27594 | packuswb m4, m5 | |
27595 | movu [r0 + 1439 * 16], m4 | |
27596 | ||
27597 | ; mode 24 [row 16] | |
27598 | movu m6, [r5 + 11 * 16] | |
27599 | pmaddubsw m4, m0, m6 | |
27600 | pmulhrsw m4, m7 | |
27601 | pmaddubsw m5, m2, m6 | |
27602 | pmulhrsw m5, m7 | |
27603 | packuswb m4, m5 | |
27604 | movu [r0 + 1440 * 16], m4 | |
27605 | pmaddubsw m4, m1, m6 | |
27606 | pmulhrsw m4, m7 | |
27607 | pmaddubsw m5, m3, m6 | |
27608 | pmulhrsw m5, m7 | |
27609 | packuswb m4, m5 | |
27610 | movu [r0 + 1441 * 16], m4 | |
27611 | ||
27612 | ; mode 24 [row 17] | |
27613 | movu m6, [r5 + 6 * 16] | |
27614 | pmaddubsw m4, m0, m6 | |
27615 | pmulhrsw m4, m7 | |
27616 | pmaddubsw m5, m2, m6 | |
27617 | pmulhrsw m5, m7 | |
27618 | packuswb m4, m5 | |
27619 | movu [r0 + 1442 * 16], m4 | |
27620 | pmaddubsw m4, m1, m6 | |
27621 | pmulhrsw m4, m7 | |
27622 | pmaddubsw m5, m3, m6 | |
27623 | pmulhrsw m5, m7 | |
27624 | packuswb m4, m5 | |
27625 | movu [r0 + 1443 * 16], m4 | |
27626 | ||
27627 | ; mode 24 [row 18] | |
27628 | movu m6, [r5 + 1 * 16] | |
27629 | pmaddubsw m4, m0, m6 | |
27630 | pmulhrsw m4, m7 | |
27631 | pmaddubsw m5, m2, m6 | |
27632 | pmulhrsw m5, m7 | |
27633 | packuswb m4, m5 | |
27634 | movu [r0 + 1444 * 16], m4 | |
27635 | pmaddubsw m4, m1, m6 | |
27636 | pmulhrsw m4, m7 | |
27637 | pmaddubsw m5, m3, m6 | |
27638 | pmulhrsw m5, m7 | |
27639 | packuswb m4, m5 | |
27640 | movu [r0 + 1445 * 16], m4 | |
27641 | ||
27642 | ; mode 24 [row 19] | |
27643 | movu m6, [r5 + 28 * 16] | |
27644 | pslldq m0, 2 | |
27645 | pinsrb m0, [r4 + 13], 1 | |
27646 | pinsrb m0, [r4 + 19], 0 | |
27647 | pmaddubsw m4, m0, m6 | |
27648 | pmulhrsw m4, m7 | |
27649 | pslldq m2, 2 | |
27650 | pinsrw m2, [r3 + 5], 0 | |
27651 | pmaddubsw m5, m2, m6 | |
27652 | pmulhrsw m5, m7 | |
27653 | packuswb m4, m5 | |
27654 | movu [r0 + 1446 * 16], m4 | |
27655 | pslldq m1, 2 | |
27656 | pinsrw m1, [r3 + 13], 0 | |
27657 | pmaddubsw m4, m1, m6 | |
27658 | pmulhrsw m4, m7 | |
27659 | pslldq m3, 2 | |
27660 | pinsrw m3, [r3 + 21], 0 | |
27661 | pmaddubsw m5, m3, m6 | |
27662 | pmulhrsw m5, m7 | |
27663 | packuswb m4, m5 | |
27664 | movu [r0 + 1447 * 16], m4 | |
27665 | ||
27666 | ; mode 24 [row 20] | |
27667 | movu m6, [r5 + 23 * 16] | |
27668 | pmaddubsw m4, m0, m6 | |
27669 | pmulhrsw m4, m7 | |
27670 | pmaddubsw m5, m2, m6 | |
27671 | pmulhrsw m5, m7 | |
27672 | packuswb m4, m5 | |
27673 | movu [r0 + 1448 * 16], m4 | |
27674 | pmaddubsw m4, m1, m6 | |
27675 | pmulhrsw m4, m7 | |
27676 | pmaddubsw m5, m3, m6 | |
27677 | pmulhrsw m5, m7 | |
27678 | packuswb m4, m5 | |
27679 | movu [r0 + 1449 * 16], m4 | |
27680 | ||
27681 | ; mode 24 [row 21] | |
27682 | movu m6, [r5 + 18 * 16] | |
27683 | pmaddubsw m4, m0, m6 | |
27684 | pmulhrsw m4, m7 | |
27685 | pmaddubsw m5, m2, m6 | |
27686 | pmulhrsw m5, m7 | |
27687 | packuswb m4, m5 | |
27688 | movu [r0 + 1450 * 16], m4 | |
27689 | pmaddubsw m4, m1, m6 | |
27690 | pmulhrsw m4, m7 | |
27691 | pmaddubsw m5, m3, m6 | |
27692 | pmulhrsw m5, m7 | |
27693 | packuswb m4, m5 | |
27694 | movu [r0 + 1451 * 16], m4 | |
27695 | ||
27696 | ; mode 24 [row 22] | |
27697 | movu m6, [r5 + 13 * 16] | |
27698 | pmaddubsw m4, m0, m6 | |
27699 | pmulhrsw m4, m7 | |
27700 | pmaddubsw m5, m2, m6 | |
27701 | pmulhrsw m5, m7 | |
27702 | packuswb m4, m5 | |
27703 | movu [r0 + 1452 * 16], m4 | |
27704 | pmaddubsw m4, m1, m6 | |
27705 | pmulhrsw m4, m7 | |
27706 | pmaddubsw m5, m3, m6 | |
27707 | pmulhrsw m5, m7 | |
27708 | packuswb m4, m5 | |
27709 | movu [r0 + 1453 * 16], m4 | |
27710 | ||
27711 | ; mode 24 [row 23] | |
27712 | movu m6, [r5 + 8 * 16] | |
27713 | pmaddubsw m4, m0, m6 | |
27714 | pmulhrsw m4, m7 | |
27715 | pmaddubsw m5, m2, m6 | |
27716 | pmulhrsw m5, m7 | |
27717 | packuswb m4, m5 | |
27718 | movu [r0 + 1454 * 16], m4 | |
27719 | pmaddubsw m4, m1, m6 | |
27720 | pmulhrsw m4, m7 | |
27721 | pmaddubsw m5, m3, m6 | |
27722 | pmulhrsw m5, m7 | |
27723 | packuswb m4, m5 | |
27724 | movu [r0 + 1455 * 16], m4 | |
27725 | ||
27726 | ; mode 24 [row 24] | |
27727 | movu m6, [r5 + 3 * 16] | |
27728 | pmaddubsw m4, m0, m6 | |
27729 | pmulhrsw m4, m7 | |
27730 | pmaddubsw m5, m2, m6 | |
27731 | pmulhrsw m5, m7 | |
27732 | packuswb m4, m5 | |
27733 | movu [r0 + 1456 * 16], m4 | |
27734 | pmaddubsw m4, m1, m6 | |
27735 | pmulhrsw m4, m7 | |
27736 | pmaddubsw m5, m3, m6 | |
27737 | pmulhrsw m5, m7 | |
27738 | packuswb m4, m5 | |
27739 | movu [r0 + 1457 * 16], m4 | |
27740 | ||
27741 | ; mode 24 [row 25] | |
27742 | movu m6, [r5 + 30 * 16] | |
27743 | pslldq m0, 2 | |
27744 | pinsrb m0, [r4 + 19], 1 | |
27745 | pinsrb m0, [r4 + 26], 0 | |
27746 | pmaddubsw m4, m0, m6 | |
27747 | pmulhrsw m4, m7 | |
27748 | pslldq m2, 2 | |
27749 | pinsrw m2, [r3 + 4], 0 | |
27750 | pmaddubsw m5, m2, m6 | |
27751 | pmulhrsw m5, m7 | |
27752 | packuswb m4, m5 | |
27753 | movu [r0 + 1458 * 16], m4 | |
27754 | pslldq m1, 2 | |
27755 | pinsrw m1, [r3 + 12], 0 | |
27756 | pmaddubsw m4, m1, m6 | |
27757 | pmulhrsw m4, m7 | |
27758 | pslldq m3, 2 | |
27759 | pinsrw m3, [r3 + 20], 0 | |
27760 | pmaddubsw m5, m3, m6 | |
27761 | pmulhrsw m5, m7 | |
27762 | packuswb m4, m5 | |
27763 | movu [r0 + 1459 * 16], m4 | |
27764 | ||
27765 | ; mode 24 [row 26] | |
27766 | movu m6, [r5 + 25 * 16] | |
27767 | pmaddubsw m4, m0, m6 | |
27768 | pmulhrsw m4, m7 | |
27769 | pmaddubsw m5, m2, m6 | |
27770 | pmulhrsw m5, m7 | |
27771 | packuswb m4, m5 | |
27772 | movu [r0 + 1460 * 16], m4 | |
27773 | pmaddubsw m4, m1, m6 | |
27774 | pmulhrsw m4, m7 | |
27775 | pmaddubsw m5, m3, m6 | |
27776 | pmulhrsw m5, m7 | |
27777 | packuswb m4, m5 | |
27778 | movu [r0 + 1461 * 16], m4 | |
27779 | ||
27780 | ; mode 24 [row 27] | |
27781 | movu m6, [r5 + 20 * 16] | |
27782 | pmaddubsw m4, m0, m6 | |
27783 | pmulhrsw m4, m7 | |
27784 | pmaddubsw m5, m2, m6 | |
27785 | pmulhrsw m5, m7 | |
27786 | packuswb m4, m5 | |
27787 | movu [r0 + 1462 * 16], m4 | |
27788 | pmaddubsw m4, m1, m6 | |
27789 | pmulhrsw m4, m7 | |
27790 | pmaddubsw m5, m3, m6 | |
27791 | pmulhrsw m5, m7 | |
27792 | packuswb m4, m5 | |
27793 | movu [r0 + 1463 * 16], m4 | |
27794 | ||
27795 | ; mode 24 [row 28] | |
27796 | movu m6, [r5 + 15 * 16] | |
27797 | pmaddubsw m4, m0, m6 | |
27798 | pmulhrsw m4, m7 | |
27799 | pmaddubsw m5, m2, m6 | |
27800 | pmulhrsw m5, m7 | |
27801 | packuswb m4, m5 | |
27802 | movu [r0 + 1464 * 16], m4 | |
27803 | pmaddubsw m4, m1, m6 | |
27804 | pmulhrsw m4, m7 | |
27805 | pmaddubsw m5, m3, m6 | |
27806 | pmulhrsw m5, m7 | |
27807 | packuswb m4, m5 | |
27808 | movu [r0 + 1465 * 16], m4 | |
27809 | ||
27810 | ; mode 24 [row 29] | |
27811 | movu m6, [r5 + 10 * 16] | |
27812 | pmaddubsw m4, m0, m6 | |
27813 | pmulhrsw m4, m7 | |
27814 | pmaddubsw m5, m2, m6 | |
27815 | pmulhrsw m5, m7 | |
27816 | packuswb m4, m5 | |
27817 | movu [r0 + 1466 * 16], m4 | |
27818 | pmaddubsw m4, m1, m6 | |
27819 | pmulhrsw m4, m7 | |
27820 | pmaddubsw m5, m3, m6 | |
27821 | pmulhrsw m5, m7 | |
27822 | packuswb m4, m5 | |
27823 | movu [r0 + 1467 * 16], m4 | |
27824 | ||
27825 | ; mode 24 [row 30] | |
27826 | movu m6, [r5 + 5 * 16] | |
27827 | pmaddubsw m4, m0, m6 | |
27828 | pmulhrsw m4, m7 | |
27829 | pmaddubsw m5, m2, m6 | |
27830 | pmulhrsw m5, m7 | |
27831 | packuswb m4, m5 | |
27832 | movu [r0 + 1468 * 16], m4 | |
27833 | pmaddubsw m4, m1, m6 | |
27834 | pmulhrsw m4, m7 | |
27835 | pmaddubsw m5, m3, m6 | |
27836 | pmulhrsw m5, m7 | |
27837 | packuswb m4, m5 | |
27838 | movu [r0 + 1469 * 16], m4 | |
27839 | ||
27840 | ; mode 24 [row 31] | |
27841 | pshufb m5, m0, [tab_S2] | |
27842 | movh [r0 + 1470 * 16], m5 | |
27843 | pshufb m5, m2, [tab_S2] | |
27844 | movh [r0 + 1470 * 16 + 8], m5 | |
27845 | pshufb m5, m1, [tab_S2] | |
27846 | movh [r0 + 1471 * 16], m5 | |
27847 | pshufb m5, m3, [tab_S2] | |
27848 | movh [r0 + 1471 * 16 + 8], m5 | |
27849 | ||
27850 | ; mode 25 [row 0] | |
27851 | movu m6, [r5 + 30 * 16] | |
27852 | movu m0, [r3 ] | |
27853 | movu m1, [r3 + 1 ] | |
27854 | punpcklbw m0, m1 | |
27855 | pmaddubsw m4, m0, m6 | |
27856 | pmulhrsw m4, m7 | |
27857 | movu m2, [r3 + 8] | |
27858 | movu m3, [r3 + 9] | |
27859 | punpcklbw m2, m3 | |
27860 | pmaddubsw m5, m2, m6 | |
27861 | pmulhrsw m5, m7 | |
27862 | packuswb m4, m5 | |
27863 | movu [r0 + 1472 * 16], m4 | |
27864 | ||
27865 | movu m1, [r3 + 16] | |
27866 | movu m3, [r3 + 17] | |
27867 | punpcklbw m1, m3 | |
27868 | pmaddubsw m4, m1, m6 | |
27869 | pmulhrsw m4, m7 | |
27870 | movu m3, [r3 + 24] | |
27871 | movu m5, [r3 + 25] | |
27872 | punpcklbw m3, m5 | |
27873 | pmaddubsw m5, m3, m6 | |
27874 | pmulhrsw m5, m7 | |
27875 | packuswb m4, m5 | |
27876 | movu [r0 + 1473 * 16], m4 | |
27877 | ||
27878 | ; mode 25 [row 1] | |
27879 | movu m6, [r5 + 28 * 16] | |
27880 | pmaddubsw m4, m0, m6 | |
27881 | pmulhrsw m4, m7 | |
27882 | pmaddubsw m5, m2, m6 | |
27883 | pmulhrsw m5, m7 | |
27884 | packuswb m4, m5 | |
27885 | movu [r0 + 1474 * 16], m4 | |
27886 | pmaddubsw m4, m1, m6 | |
27887 | pmulhrsw m4, m7 | |
27888 | pmaddubsw m5, m3, m6 | |
27889 | pmulhrsw m5, m7 | |
27890 | packuswb m4, m5 | |
27891 | movu [r0 + 1475 * 16], m4 | |
27892 | ||
27893 | ; mode 25 [row 2] | |
27894 | movu m6, [r5 + 26 * 16] | |
27895 | pmaddubsw m4, m0, m6 | |
27896 | pmulhrsw m4, m7 | |
27897 | pmaddubsw m5, m2, m6 | |
27898 | pmulhrsw m5, m7 | |
27899 | packuswb m4, m5 | |
27900 | movu [r0 + 1476 * 16], m4 | |
27901 | pmaddubsw m4, m1, m6 | |
27902 | pmulhrsw m4, m7 | |
27903 | pmaddubsw m5, m3, m6 | |
27904 | pmulhrsw m5, m7 | |
27905 | packuswb m4, m5 | |
27906 | movu [r0 + 1477 * 16], m4 | |
27907 | ||
27908 | ; mode 25 [row 3] | |
27909 | movu m6, [r5 + 24 * 16] | |
27910 | pmaddubsw m4, m0, m6 | |
27911 | pmulhrsw m4, m7 | |
27912 | pmaddubsw m5, m2, m6 | |
27913 | pmulhrsw m5, m7 | |
27914 | packuswb m4, m5 | |
27915 | movu [r0 + 1478 * 16], m4 | |
27916 | pmaddubsw m4, m1, m6 | |
27917 | pmulhrsw m4, m7 | |
27918 | pmaddubsw m5, m3, m6 | |
27919 | pmulhrsw m5, m7 | |
27920 | packuswb m4, m5 | |
27921 | movu [r0 + 1479 * 16], m4 | |
27922 | ||
27923 | ; mode 25 [row 4] | |
27924 | movu m6, [r5 + 22 * 16] | |
27925 | pmaddubsw m4, m0, m6 | |
27926 | pmulhrsw m4, m7 | |
27927 | pmaddubsw m5, m2, m6 | |
27928 | pmulhrsw m5, m7 | |
27929 | packuswb m4, m5 | |
27930 | movu [r0 + 1480 * 16], m4 | |
27931 | pmaddubsw m4, m1, m6 | |
27932 | pmulhrsw m4, m7 | |
27933 | pmaddubsw m5, m3, m6 | |
27934 | pmulhrsw m5, m7 | |
27935 | packuswb m4, m5 | |
27936 | movu [r0 + 1481 * 16], m4 | |
27937 | ||
27938 | ; mode 25 [row 5] | |
27939 | movu m6, [r5 + 20 * 16] | |
27940 | pmaddubsw m4, m0, m6 | |
27941 | pmulhrsw m4, m7 | |
27942 | pmaddubsw m5, m2, m6 | |
27943 | pmulhrsw m5, m7 | |
27944 | packuswb m4, m5 | |
27945 | movu [r0 + 1482 * 16], m4 | |
27946 | pmaddubsw m4, m1, m6 | |
27947 | pmulhrsw m4, m7 | |
27948 | pmaddubsw m5, m3, m6 | |
27949 | pmulhrsw m5, m7 | |
27950 | packuswb m4, m5 | |
27951 | movu [r0 + 1483 * 16], m4 | |
27952 | ||
27953 | ; mode 25 [row 6] | |
27954 | movu m6, [r5 + 18 * 16] | |
27955 | pmaddubsw m4, m0, m6 | |
27956 | pmulhrsw m4, m7 | |
27957 | pmaddubsw m5, m2, m6 | |
27958 | pmulhrsw m5, m7 | |
27959 | packuswb m4, m5 | |
27960 | movu [r0 + 1484 * 16], m4 | |
27961 | pmaddubsw m4, m1, m6 | |
27962 | pmulhrsw m4, m7 | |
27963 | pmaddubsw m5, m3, m6 | |
27964 | pmulhrsw m5, m7 | |
27965 | packuswb m4, m5 | |
27966 | movu [r0 + 1485 * 16], m4 | |
27967 | ||
27968 | ; mode 25 [row 7] | |
27969 | movu m6, [r5 + 16 * 16] | |
27970 | pmaddubsw m4, m0, m6 | |
27971 | pmulhrsw m4, m7 | |
27972 | pmaddubsw m5, m2, m6 | |
27973 | pmulhrsw m5, m7 | |
27974 | packuswb m4, m5 | |
27975 | movu [r0 + 1486 * 16], m4 | |
27976 | pmaddubsw m4, m1, m6 | |
27977 | pmulhrsw m4, m7 | |
27978 | pmaddubsw m5, m3, m6 | |
27979 | pmulhrsw m5, m7 | |
27980 | packuswb m4, m5 | |
27981 | movu [r0 + 1487 * 16], m4 | |
27982 | ||
27983 | ; mode 25 [row 8] | |
27984 | movu m6, [r5 + 14 * 16] | |
27985 | pmaddubsw m4, m0, m6 | |
27986 | pmulhrsw m4, m7 | |
27987 | pmaddubsw m5, m2, m6 | |
27988 | pmulhrsw m5, m7 | |
27989 | packuswb m4, m5 | |
27990 | movu [r0 + 1488 * 16], m4 | |
27991 | pmaddubsw m4, m1, m6 | |
27992 | pmulhrsw m4, m7 | |
27993 | pmaddubsw m5, m3, m6 | |
27994 | pmulhrsw m5, m7 | |
27995 | packuswb m4, m5 | |
27996 | movu [r0 + 1489 * 16], m4 | |
27997 | ||
27998 | ; mode 25 [row 9] | |
27999 | movu m6, [r5 + 12 * 16] | |
28000 | pmaddubsw m4, m0, m6 | |
28001 | pmulhrsw m4, m7 | |
28002 | pmaddubsw m5, m2, m6 | |
28003 | pmulhrsw m5, m7 | |
28004 | packuswb m4, m5 | |
28005 | movu [r0 + 1490 * 16], m4 | |
28006 | pmaddubsw m4, m1, m6 | |
28007 | pmulhrsw m4, m7 | |
28008 | pmaddubsw m5, m3, m6 | |
28009 | pmulhrsw m5, m7 | |
28010 | packuswb m4, m5 | |
28011 | movu [r0 + 1491 * 16], m4 | |
28012 | ||
28013 | ; mode 25 [row 10] | |
28014 | movu m6, [r5 + 10 * 16] | |
28015 | pmaddubsw m4, m0, m6 | |
28016 | pmulhrsw m4, m7 | |
28017 | pmaddubsw m5, m2, m6 | |
28018 | pmulhrsw m5, m7 | |
28019 | packuswb m4, m5 | |
28020 | movu [r0 + 1492 * 16], m4 | |
28021 | pmaddubsw m4, m1, m6 | |
28022 | pmulhrsw m4, m7 | |
28023 | pmaddubsw m5, m3, m6 | |
28024 | pmulhrsw m5, m7 | |
28025 | packuswb m4, m5 | |
28026 | movu [r0 + 1493 * 16], m4 | |
28027 | ||
28028 | ; mode 25 [row 11] | |
28029 | movu m6, [r5 + 8 * 16] | |
28030 | pmaddubsw m4, m0, m6 | |
28031 | pmulhrsw m4, m7 | |
28032 | pmaddubsw m5, m2, m6 | |
28033 | pmulhrsw m5, m7 | |
28034 | packuswb m4, m5 | |
28035 | movu [r0 + 1494 * 16], m4 | |
28036 | pmaddubsw m4, m1, m6 | |
28037 | pmulhrsw m4, m7 | |
28038 | pmaddubsw m5, m3, m6 | |
28039 | pmulhrsw m5, m7 | |
28040 | packuswb m4, m5 | |
28041 | movu [r0 + 1495 * 16], m4 | |
28042 | ||
28043 | ; mode 25 [row 12] | |
28044 | movu m6, [r5 + 6 * 16] | |
28045 | pmaddubsw m4, m0, m6 | |
28046 | pmulhrsw m4, m7 | |
28047 | pmaddubsw m5, m2, m6 | |
28048 | pmulhrsw m5, m7 | |
28049 | packuswb m4, m5 | |
28050 | movu [r0 + 1496 * 16], m4 | |
28051 | pmaddubsw m4, m1, m6 | |
28052 | pmulhrsw m4, m7 | |
28053 | pmaddubsw m5, m3, m6 | |
28054 | pmulhrsw m5, m7 | |
28055 | packuswb m4, m5 | |
28056 | movu [r0 + 1497 * 16], m4 | |
28057 | ||
28058 | ; mode 25 [row 13] | |
28059 | movu m6, [r5 + 4 * 16] | |
28060 | pmaddubsw m4, m0, m6 | |
28061 | pmulhrsw m4, m7 | |
28062 | pmaddubsw m5, m2, m6 | |
28063 | pmulhrsw m5, m7 | |
28064 | packuswb m4, m5 | |
28065 | movu [r0 + 1498 * 16], m4 | |
28066 | pmaddubsw m4, m1, m6 | |
28067 | pmulhrsw m4, m7 | |
28068 | pmaddubsw m5, m3, m6 | |
28069 | pmulhrsw m5, m7 | |
28070 | packuswb m4, m5 | |
28071 | movu [r0 + 1499 * 16], m4 | |
28072 | ||
28073 | ; mode 25 [row 14] | |
28074 | movu m6, [r5 + 2 * 16] | |
28075 | pmaddubsw m4, m0, m6 | |
28076 | pmulhrsw m4, m7 | |
28077 | pmaddubsw m5, m2, m6 | |
28078 | pmulhrsw m5, m7 | |
28079 | packuswb m4, m5 | |
28080 | movu [r0 + 1500 * 16], m4 | |
28081 | pmaddubsw m4, m1, m6 | |
28082 | pmulhrsw m4, m7 | |
28083 | pmaddubsw m5, m3, m6 | |
28084 | pmulhrsw m5, m7 | |
28085 | packuswb m4, m5 | |
28086 | movu [r0 + 1501 * 16], m4 | |
28087 | ||
28088 | ; mode 25 [row 15] | |
28089 | pshufb m5, m0, [tab_S2] | |
28090 | movh [r0 + 1502 * 16], m5 | |
28091 | pshufb m5, m2, [tab_S2] | |
28092 | movh [r0 + 1502 * 16 + 8], m5 | |
28093 | pshufb m5, m1, [tab_S2] | |
28094 | movh [r0 + 1503 * 16], m5 | |
28095 | pshufb m5, m3, [tab_S2] | |
28096 | movh [r0 + 1503 * 16 + 8], m5 | |
28097 | ||
28098 | ; mode 25 [row 16]: shift each pair register (m0, m2, m1, m3) up by 2 bytes and refill the vacated low pair - m0 gets byte 0 from [r4 + 16] and byte 1 from [r4 + 0], m2/m1/m3 get the word at [r3 + 7] / [r3 + 15] / [r3 + 23]; rows 17-31 reuse these shifted registers unchanged | |
28099 | movu m6, [r5 + 30 * 16] | |
28100 | pslldq m0, 2 | |
28101 | pinsrb m0, [r4 + 0], 1 | |
28102 | pinsrb m0, [r4 + 16], 0 | |
28103 | pmaddubsw m4, m0, m6 | |
28104 | pmulhrsw m4, m7 | |
28105 | pslldq m2, 2 | |
28106 | pinsrw m2, [r3 + 7], 0 | |
28107 | pmaddubsw m5, m2, m6 | |
28108 | pmulhrsw m5, m7 | |
28109 | packuswb m4, m5 | |
28110 | movu [r0 + 1504 * 16], m4 | |
28111 | pslldq m1, 2 | |
28112 | pinsrw m1, [r3 + 15], 0 | |
28113 | pmaddubsw m4, m1, m6 | |
28114 | pmulhrsw m4, m7 | |
28115 | pslldq m3, 2 | |
28116 | pinsrw m3, [r3 + 23], 0 | |
28117 | pmaddubsw m5, m3, m6 | |
28118 | pmulhrsw m5, m7 | |
28119 | packuswb m4, m5 | |
28120 | movu [r0 + 1505 * 16], m4 | |
28121 | ||
28122 | ; mode 25 [row 17] | |
28123 | movu m6, [r5 + 28 * 16] | |
28124 | pmaddubsw m4, m0, m6 | |
28125 | pmulhrsw m4, m7 | |
28126 | pmaddubsw m5, m2, m6 | |
28127 | pmulhrsw m5, m7 | |
28128 | packuswb m4, m5 | |
28129 | movu [r0 + 1506 * 16], m4 | |
28130 | pmaddubsw m4, m1, m6 | |
28131 | pmulhrsw m4, m7 | |
28132 | pmaddubsw m5, m3, m6 | |
28133 | pmulhrsw m5, m7 | |
28134 | packuswb m4, m5 | |
28135 | movu [r0 + 1507 * 16], m4 | |
28136 | ||
28137 | ; mode 25 [row 18] | |
28138 | movu m6, [r5 + 26 * 16] | |
28139 | pmaddubsw m4, m0, m6 | |
28140 | pmulhrsw m4, m7 | |
28141 | pmaddubsw m5, m2, m6 | |
28142 | pmulhrsw m5, m7 | |
28143 | packuswb m4, m5 | |
28144 | movu [r0 + 1508 * 16], m4 | |
28145 | pmaddubsw m4, m1, m6 | |
28146 | pmulhrsw m4, m7 | |
28147 | pmaddubsw m5, m3, m6 | |
28148 | pmulhrsw m5, m7 | |
28149 | packuswb m4, m5 | |
28150 | movu [r0 + 1509 * 16], m4 | |
28151 | ||
28152 | ; mode 25 [row 19] | |
28153 | movu m6, [r5 + 24 * 16] | |
28154 | pmaddubsw m4, m0, m6 | |
28155 | pmulhrsw m4, m7 | |
28156 | pmaddubsw m5, m2, m6 | |
28157 | pmulhrsw m5, m7 | |
28158 | packuswb m4, m5 | |
28159 | movu [r0 + 1510 * 16], m4 | |
28160 | pmaddubsw m4, m1, m6 | |
28161 | pmulhrsw m4, m7 | |
28162 | pmaddubsw m5, m3, m6 | |
28163 | pmulhrsw m5, m7 | |
28164 | packuswb m4, m5 | |
28165 | movu [r0 + 1511 * 16], m4 | |
28166 | ||
28167 | ; mode 25 [row 20] | |
28168 | movu m6, [r5 + 22 * 16] | |
28169 | pmaddubsw m4, m0, m6 | |
28170 | pmulhrsw m4, m7 | |
28171 | pmaddubsw m5, m2, m6 | |
28172 | pmulhrsw m5, m7 | |
28173 | packuswb m4, m5 | |
28174 | movu [r0 + 1512 * 16], m4 | |
28175 | pmaddubsw m4, m1, m6 | |
28176 | pmulhrsw m4, m7 | |
28177 | pmaddubsw m5, m3, m6 | |
28178 | pmulhrsw m5, m7 | |
28179 | packuswb m4, m5 | |
28180 | movu [r0 + 1513 * 16], m4 | |
28181 | ||
28182 | ; mode 25 [row 21] | |
28183 | movu m6, [r5 + 20 * 16] | |
28184 | pmaddubsw m4, m0, m6 | |
28185 | pmulhrsw m4, m7 | |
28186 | pmaddubsw m5, m2, m6 | |
28187 | pmulhrsw m5, m7 | |
28188 | packuswb m4, m5 | |
28189 | movu [r0 + 1514 * 16], m4 | |
28190 | pmaddubsw m4, m1, m6 | |
28191 | pmulhrsw m4, m7 | |
28192 | pmaddubsw m5, m3, m6 | |
28193 | pmulhrsw m5, m7 | |
28194 | packuswb m4, m5 | |
28195 | movu [r0 + 1515 * 16], m4 | |
28196 | ||
28197 | ; mode 25 [row 22] | |
28198 | movu m6, [r5 + 18 * 16] | |
28199 | pmaddubsw m4, m0, m6 | |
28200 | pmulhrsw m4, m7 | |
28201 | pmaddubsw m5, m2, m6 | |
28202 | pmulhrsw m5, m7 | |
28203 | packuswb m4, m5 | |
28204 | movu [r0 + 1516 * 16], m4 | |
28205 | pmaddubsw m4, m1, m6 | |
28206 | pmulhrsw m4, m7 | |
28207 | pmaddubsw m5, m3, m6 | |
28208 | pmulhrsw m5, m7 | |
28209 | packuswb m4, m5 | |
28210 | movu [r0 + 1517 * 16], m4 | |
28211 | ||
28212 | ; mode 25 [row 23] | |
28213 | movu m6, [r5 + 16 * 16] | |
28214 | pmaddubsw m4, m0, m6 | |
28215 | pmulhrsw m4, m7 | |
28216 | pmaddubsw m5, m2, m6 | |
28217 | pmulhrsw m5, m7 | |
28218 | packuswb m4, m5 | |
28219 | movu [r0 + 1518 * 16], m4 | |
28220 | pmaddubsw m4, m1, m6 | |
28221 | pmulhrsw m4, m7 | |
28222 | pmaddubsw m5, m3, m6 | |
28223 | pmulhrsw m5, m7 | |
28224 | packuswb m4, m5 | |
28225 | movu [r0 + 1519 * 16], m4 | |
28226 | ||
28227 | ; mode 25 [row 24] | |
28228 | movu m6, [r5 + 14 * 16] | |
28229 | pmaddubsw m4, m0, m6 | |
28230 | pmulhrsw m4, m7 | |
28231 | pmaddubsw m5, m2, m6 | |
28232 | pmulhrsw m5, m7 | |
28233 | packuswb m4, m5 | |
28234 | movu [r0 + 1520 * 16], m4 | |
28235 | pmaddubsw m4, m1, m6 | |
28236 | pmulhrsw m4, m7 | |
28237 | pmaddubsw m5, m3, m6 | |
28238 | pmulhrsw m5, m7 | |
28239 | packuswb m4, m5 | |
28240 | movu [r0 + 1521 * 16], m4 | |
28241 | ||
28242 | ; mode 25 [row 25] | |
28243 | movu m6, [r5 + 12 * 16] | |
28244 | pmaddubsw m4, m0, m6 | |
28245 | pmulhrsw m4, m7 | |
28246 | pmaddubsw m5, m2, m6 | |
28247 | pmulhrsw m5, m7 | |
28248 | packuswb m4, m5 | |
28249 | movu [r0 + 1522 * 16], m4 | |
28250 | pmaddubsw m4, m1, m6 | |
28251 | pmulhrsw m4, m7 | |
28252 | pmaddubsw m5, m3, m6 | |
28253 | pmulhrsw m5, m7 | |
28254 | packuswb m4, m5 | |
28255 | movu [r0 + 1523 * 16], m4 | |
28256 | ||
28257 | ; mode 25 [row 26] | |
28258 | movu m6, [r5 + 10 * 16] | |
28259 | pmaddubsw m4, m0, m6 | |
28260 | pmulhrsw m4, m7 | |
28261 | pmaddubsw m5, m2, m6 | |
28262 | pmulhrsw m5, m7 | |
28263 | packuswb m4, m5 | |
28264 | movu [r0 + 1524 * 16], m4 | |
28265 | pmaddubsw m4, m1, m6 | |
28266 | pmulhrsw m4, m7 | |
28267 | pmaddubsw m5, m3, m6 | |
28268 | pmulhrsw m5, m7 | |
28269 | packuswb m4, m5 | |
28270 | movu [r0 + 1525 * 16], m4 | |
28271 | ||
28272 | ; mode 25 [row 27] | |
28273 | movu m6, [r5 + 8 * 16] | |
28274 | pmaddubsw m4, m0, m6 | |
28275 | pmulhrsw m4, m7 | |
28276 | pmaddubsw m5, m2, m6 | |
28277 | pmulhrsw m5, m7 | |
28278 | packuswb m4, m5 | |
28279 | movu [r0 + 1526 * 16], m4 | |
28280 | pmaddubsw m4, m1, m6 | |
28281 | pmulhrsw m4, m7 | |
28282 | pmaddubsw m5, m3, m6 | |
28283 | pmulhrsw m5, m7 | |
28284 | packuswb m4, m5 | |
28285 | movu [r0 + 1527 * 16], m4 | |
28286 | ||
28287 | ; mode 25 [row 28] | |
28288 | movu m6, [r5 + 6 * 16] | |
28289 | pmaddubsw m4, m0, m6 | |
28290 | pmulhrsw m4, m7 | |
28291 | pmaddubsw m5, m2, m6 | |
28292 | pmulhrsw m5, m7 | |
28293 | packuswb m4, m5 | |
28294 | movu [r0 + 1528 * 16], m4 | |
28295 | pmaddubsw m4, m1, m6 | |
28296 | pmulhrsw m4, m7 | |
28297 | pmaddubsw m5, m3, m6 | |
28298 | pmulhrsw m5, m7 | |
28299 | packuswb m4, m5 | |
28300 | movu [r0 + 1529 * 16], m4 | |
28301 | ||
28302 | ; mode 25 [row 29] | |
28303 | movu m6, [r5 + 4 * 16] | |
28304 | pmaddubsw m4, m0, m6 | |
28305 | pmulhrsw m4, m7 | |
28306 | pmaddubsw m5, m2, m6 | |
28307 | pmulhrsw m5, m7 | |
28308 | packuswb m4, m5 | |
28309 | movu [r0 + 1530 * 16], m4 | |
28310 | pmaddubsw m4, m1, m6 | |
28311 | pmulhrsw m4, m7 | |
28312 | pmaddubsw m5, m3, m6 | |
28313 | pmulhrsw m5, m7 | |
28314 | packuswb m4, m5 | |
28315 | movu [r0 + 1531 * 16], m4 | |
28316 | ||
28317 | ; mode 25 [row 30] | |
28318 | movu m6, [r5 + 2 * 16] | |
28319 | pmaddubsw m4, m0, m6 | |
28320 | pmulhrsw m4, m7 | |
28321 | pmaddubsw m5, m2, m6 | |
28322 | pmulhrsw m5, m7 | |
28323 | packuswb m4, m5 | |
28324 | movu [r0 + 1532 * 16], m4 | |
28325 | pmaddubsw m4, m1, m6 | |
28326 | pmulhrsw m4, m7 | |
28327 | pmaddubsw m5, m3, m6 | |
28328 | pmulhrsw m5, m7 | |
28329 | packuswb m4, m5 | |
28330 | movu [r0 + 1533 * 16], m4 | |
28331 | ||
28332 | ; mode 25 [row 31] | |
28333 | pshufb m5, m0, [tab_S2] | |
28334 | movh [r0 + 1534 * 16], m5 | |
28335 | pshufb m5, m2, [tab_S2] | |
28336 | movh [r0 + 1534 * 16 + 8], m5 | |
28337 | pshufb m5, m1, [tab_S2] | |
28338 | movh [r0 + 1535 * 16], m5 | |
28339 | pshufb m5, m3, [tab_S2] | |
28340 | movh [r0 + 1535 * 16 + 8], m5 | |
28341 | ||
28342 | ; mode 26: no multiply step - m1 = 16 bytes at [r1 + 1], m2 = 16 bytes at [r1 + 17]; the same m1/m2 pair is written to all 64 consecutive 16-byte slots 1536..1599 (even slot = m1, odd slot = m2) | |
28343 | movu m1, [r1 + 1] | |
28344 | movu m2, [r1 + 17] | |
28345 | movu [r0 + 1536 * 16], m1 | |
28346 | movu [r0 + 1537 * 16], m2 | |
28347 | movu [r0 + 1538 * 16], m1 | |
28348 | movu [r0 + 1539 * 16], m2 | |
28349 | movu [r0 + 1540 * 16], m1 | |
28350 | movu [r0 + 1541 * 16], m2 | |
28351 | movu [r0 + 1542 * 16], m1 | |
28352 | movu [r0 + 1543 * 16], m2 | |
28353 | movu [r0 + 1544 * 16], m1 | |
28354 | movu [r0 + 1545 * 16], m2 | |
28355 | movu [r0 + 1546 * 16], m1 | |
28356 | movu [r0 + 1547 * 16], m2 | |
28357 | movu [r0 + 1548 * 16], m1 | |
28358 | movu [r0 + 1549 * 16], m2 | |
28359 | movu [r0 + 1550 * 16], m1 | |
28360 | movu [r0 + 1551 * 16], m2 | |
28361 | ||
28362 | movu [r0 + 1552 * 16], m1 | |
28363 | movu [r0 + 1553 * 16], m2 | |
28364 | movu [r0 + 1554 * 16], m1 | |
28365 | movu [r0 + 1555 * 16], m2 | |
28366 | movu [r0 + 1556 * 16], m1 | |
28367 | movu [r0 + 1557 * 16], m2 | |
28368 | movu [r0 + 1558 * 16], m1 | |
28369 | movu [r0 + 1559 * 16], m2 | |
28370 | movu [r0 + 1560 * 16], m1 | |
28371 | movu [r0 + 1561 * 16], m2 | |
28372 | movu [r0 + 1562 * 16], m1 | |
28373 | movu [r0 + 1563 * 16], m2 | |
28374 | movu [r0 + 1564 * 16], m1 | |
28375 | movu [r0 + 1565 * 16], m2 | |
28376 | movu [r0 + 1566 * 16], m1 | |
28377 | movu [r0 + 1567 * 16], m2 | |
28378 | ||
28379 | movu [r0 + 1568 * 16], m1 | |
28380 | movu [r0 + 1569 * 16], m2 | |
28381 | movu [r0 + 1570 * 16], m1 | |
28382 | movu [r0 + 1571 * 16], m2 | |
28383 | movu [r0 + 1572 * 16], m1 | |
28384 | movu [r0 + 1573 * 16], m2 | |
28385 | movu [r0 + 1574 * 16], m1 | |
28386 | movu [r0 + 1575 * 16], m2 | |
28387 | movu [r0 + 1576 * 16], m1 | |
28388 | movu [r0 + 1577 * 16], m2 | |
28389 | movu [r0 + 1578 * 16], m1 | |
28390 | movu [r0 + 1579 * 16], m2 | |
28391 | movu [r0 + 1580 * 16], m1 | |
28392 | movu [r0 + 1581 * 16], m2 | |
28393 | movu [r0 + 1582 * 16], m1 | |
28394 | movu [r0 + 1583 * 16], m2 | |
28395 | ||
28396 | movu [r0 + 1584 * 16], m1 | |
28397 | movu [r0 + 1585 * 16], m2 | |
28398 | movu [r0 + 1586 * 16], m1 | |
28399 | movu [r0 + 1587 * 16], m2 | |
28400 | movu [r0 + 1588 * 16], m1 | |
28401 | movu [r0 + 1589 * 16], m2 | |
28402 | movu [r0 + 1590 * 16], m1 | |
28403 | movu [r0 + 1591 * 16], m2 | |
28404 | movu [r0 + 1592 * 16], m1 | |
28405 | movu [r0 + 1593 * 16], m2 | |
28406 | movu [r0 + 1594 * 16], m1 | |
28407 | movu [r0 + 1595 * 16], m2 | |
28408 | movu [r0 + 1596 * 16], m1 | |
28409 | movu [r0 + 1597 * 16], m2 | |
28410 | movu [r0 + 1598 * 16], m1 | |
28411 | movu [r0 + 1599 * 16], m2 | |
28412 | ||
28413 | ; mode 27 [row 0] | |
28414 | movu m6, [r5 + 2 * 16] | |
28415 | movu m0, [r3 + 1 ] | |
28416 | movu m1, [r3 + 2 ] | |
28417 | punpcklbw m0, m1 | |
28418 | pmaddubsw m4, m0, m6 | |
28419 | pmulhrsw m4, m7 | |
28420 | movu m2, [r3 + 9] | |
28421 | movu m3, [r3 + 10] | |
28422 | punpcklbw m2, m3 | |
28423 | pmaddubsw m5, m2, m6 | |
28424 | pmulhrsw m5, m7 | |
28425 | packuswb m4, m5 | |
28426 | movu [r0 + 1600 * 16], m4 | |
28427 | ||
28428 | movu m1, [r3 + 17] | |
28429 | movu m3, [r3 + 18] | |
28430 | punpcklbw m1, m3 | |
28431 | pmaddubsw m4, m1, m6 | |
28432 | pmulhrsw m4, m7 | |
28433 | movu m3, [r3 + 25] | |
28434 | movu m5, [r3 + 26] | |
28435 | punpcklbw m3, m5 | |
28436 | pmaddubsw m5, m3, m6 | |
28437 | pmulhrsw m5, m7 | |
28438 | packuswb m4, m5 | |
28439 | movu [r0 + 1601 * 16], m4 | |
28440 | ||
28441 | ; mode 27 [row 1] | |
28442 | movu m6, [r5 + 4 * 16] | |
28443 | pmaddubsw m4, m0, m6 | |
28444 | pmulhrsw m4, m7 | |
28445 | pmaddubsw m5, m2, m6 | |
28446 | pmulhrsw m5, m7 | |
28447 | packuswb m4, m5 | |
28448 | movu [r0 + 1602 * 16], m4 | |
28449 | pmaddubsw m4, m1, m6 | |
28450 | pmulhrsw m4, m7 | |
28451 | pmaddubsw m5, m3, m6 | |
28452 | pmulhrsw m5, m7 | |
28453 | packuswb m4, m5 | |
28454 | movu [r0 + 1603 * 16], m4 | |
28455 | ||
28456 | ; mode 27 [row 2] | |
28457 | movu m6, [r5 + 6 * 16] | |
28458 | pmaddubsw m4, m0, m6 | |
28459 | pmulhrsw m4, m7 | |
28460 | pmaddubsw m5, m2, m6 | |
28461 | pmulhrsw m5, m7 | |
28462 | packuswb m4, m5 | |
28463 | movu [r0 + 1604 * 16], m4 | |
28464 | pmaddubsw m4, m1, m6 | |
28465 | pmulhrsw m4, m7 | |
28466 | pmaddubsw m5, m3, m6 | |
28467 | pmulhrsw m5, m7 | |
28468 | packuswb m4, m5 | |
28469 | movu [r0 + 1605 * 16], m4 | |
28470 | ||
28471 | ; mode 27 [row 3] | |
28472 | movu m6, [r5 + 8 * 16] | |
28473 | pmaddubsw m4, m0, m6 | |
28474 | pmulhrsw m4, m7 | |
28475 | pmaddubsw m5, m2, m6 | |
28476 | pmulhrsw m5, m7 | |
28477 | packuswb m4, m5 | |
28478 | movu [r0 + 1606 * 16], m4 | |
28479 | pmaddubsw m4, m1, m6 | |
28480 | pmulhrsw m4, m7 | |
28481 | pmaddubsw m5, m3, m6 | |
28482 | pmulhrsw m5, m7 | |
28483 | packuswb m4, m5 | |
28484 | movu [r0 + 1607 * 16], m4 | |
28485 | ||
28486 | ; mode 27 [row 4] | |
28487 | movu m6, [r5 + 10 * 16] | |
28488 | pmaddubsw m4, m0, m6 | |
28489 | pmulhrsw m4, m7 | |
28490 | pmaddubsw m5, m2, m6 | |
28491 | pmulhrsw m5, m7 | |
28492 | packuswb m4, m5 | |
28493 | movu [r0 + 1608 * 16], m4 | |
28494 | ||
28495 | ; mode 28 [row 1 - first half] | |
28496 | movu [r0 + 1666 * 16], m4 | |
28497 | ||
28498 | pmaddubsw m4, m1, m6 | |
28499 | pmulhrsw m4, m7 | |
28500 | pmaddubsw m5, m3, m6 | |
28501 | pmulhrsw m5, m7 | |
28502 | packuswb m4, m5 | |
28503 | movu [r0 + 1609 * 16], m4 | |
28504 | ||
28505 | ; mode 28 [row 1 - second half] | |
28506 | movu [r0 + 1667 * 16], m4 | |
28507 | ||
28508 | ; mode 27 [row 5] | |
28509 | movu m6, [r5 + 12 * 16] | |
28510 | pmaddubsw m4, m0, m6 | |
28511 | pmulhrsw m4, m7 | |
28512 | pmaddubsw m5, m2, m6 | |
28513 | pmulhrsw m5, m7 | |
28514 | packuswb m4, m5 | |
28515 | movu [r0 + 1610 * 16], m4 | |
28516 | ||
28517 | pmaddubsw m4, m1, m6 | |
28518 | pmulhrsw m4, m7 | |
28519 | pmaddubsw m5, m3, m6 | |
28520 | pmulhrsw m5, m7 | |
28521 | packuswb m4, m5 | |
28522 | movu [r0 + 1611 * 16], m4 | |
28523 | ||
28524 | ; mode 27 [row 6] | |
28525 | movu m6, [r5 + 14 * 16] | |
28526 | pmaddubsw m4, m0, m6 | |
28527 | pmulhrsw m4, m7 | |
28528 | pmaddubsw m5, m2, m6 | |
28529 | pmulhrsw m5, m7 | |
28530 | packuswb m4, m5 | |
28531 | movu [r0 + 1612 * 16], m4 | |
28532 | pmaddubsw m4, m1, m6 | |
28533 | pmulhrsw m4, m7 | |
28534 | pmaddubsw m5, m3, m6 | |
28535 | pmulhrsw m5, m7 | |
28536 | packuswb m4, m5 | |
28537 | movu [r0 + 1613 * 16], m4 | |
28538 | ||
28539 | ; mode 27 [row 7] | |
28540 | movu m6, [r5 + 16 * 16] | |
28541 | pmaddubsw m4, m0, m6 | |
28542 | pmulhrsw m4, m7 | |
28543 | pmaddubsw m5, m2, m6 | |
28544 | pmulhrsw m5, m7 | |
28545 | packuswb m4, m5 | |
28546 | movu [r0 + 1614 * 16], m4 | |
28547 | pmaddubsw m4, m1, m6 | |
28548 | pmulhrsw m4, m7 | |
28549 | pmaddubsw m5, m3, m6 | |
28550 | pmulhrsw m5, m7 | |
28551 | packuswb m4, m5 | |
28552 | movu [r0 + 1615 * 16], m4 | |
28553 | ||
28554 | ; mode 27 [row 8] | |
28555 | movu m6, [r5 + 18 * 16] | |
28556 | pmaddubsw m4, m0, m6 | |
28557 | pmulhrsw m4, m7 | |
28558 | pmaddubsw m5, m2, m6 | |
28559 | pmulhrsw m5, m7 | |
28560 | packuswb m4, m5 | |
28561 | movu [r0 + 1616 * 16], m4 | |
28562 | ||
28563 | ; mode 29 [row 1 - first half] | |
28564 | movu [r0 + 1730 * 16], m4 | |
28565 | ||
28566 | pmaddubsw m4, m1, m6 | |
28567 | pmulhrsw m4, m7 | |
28568 | pmaddubsw m5, m3, m6 | |
28569 | pmulhrsw m5, m7 | |
28570 | packuswb m4, m5 | |
28571 | movu [r0 + 1617 * 16], m4 | |
28572 | ||
28573 | ; mode 29 [row 1 - second half] | |
28574 | movu [r0 + 1731 * 16], m4 | |
28575 | ||
28576 | ; mode 27 [row 9] | |
28577 | movu m6, [r5 + 20 * 16] | |
28578 | pmaddubsw m4, m0, m6 | |
28579 | pmulhrsw m4, m7 | |
28580 | pmaddubsw m5, m2, m6 | |
28581 | pmulhrsw m5, m7 | |
28582 | packuswb m4, m5 | |
28583 | movu [r0 + 1618 * 16], m4 | |
28584 | ||
28585 | ; mode 28 [row 3 - first half] | |
28586 | movu [r0 + 1670 * 16], m4 | |
28587 | ||
28588 | pmaddubsw m4, m1, m6 | |
28589 | pmulhrsw m4, m7 | |
28590 | pmaddubsw m5, m3, m6 | |
28591 | pmulhrsw m5, m7 | |
28592 | packuswb m4, m5 | |
28593 | movu [r0 + 1619 * 16], m4 | |
28594 | ||
28595 | ; mode 28 [row 3 - second half] | |
28596 | movu [r0 + 1671 * 16], m4 | |
28597 | ||
28598 | ; mode 27 [row 10] | |
28599 | movu m6, [r5 + 22 * 16] | |
28600 | pmaddubsw m4, m0, m6 | |
28601 | pmulhrsw m4, m7 | |
28602 | pmaddubsw m5, m2, m6 | |
28603 | pmulhrsw m5, m7 | |
28604 | packuswb m4, m5 | |
28605 | movu [r0 + 1620 * 16], m4 | |
28606 | pmaddubsw m4, m1, m6 | |
28607 | pmulhrsw m4, m7 | |
28608 | pmaddubsw m5, m3, m6 | |
28609 | pmulhrsw m5, m7 | |
28610 | packuswb m4, m5 | |
28611 | movu [r0 + 1621 * 16], m4 | |
28612 | ||
28613 | ; mode 27 [row 11] | |
28614 | movu m6, [r5 + 24 * 16] | |
28615 | pmaddubsw m4, m0, m6 | |
28616 | pmulhrsw m4, m7 | |
28617 | pmaddubsw m5, m2, m6 | |
28618 | pmulhrsw m5, m7 | |
28619 | packuswb m4, m5 | |
28620 | movu [r0 + 1622 * 16], m4 | |
28621 | pmaddubsw m4, m1, m6 | |
28622 | pmulhrsw m4, m7 | |
28623 | pmaddubsw m5, m3, m6 | |
28624 | pmulhrsw m5, m7 | |
28625 | packuswb m4, m5 | |
28626 | movu [r0 + 1623 * 16], m4 | |
28627 | ||
28628 | ; mode 27 [row 12] | |
28629 | movu m6, [r5 + 26 * 16] | |
28630 | pmaddubsw m4, m0, m6 | |
28631 | pmulhrsw m4, m7 | |
28632 | pmaddubsw m5, m2, m6 | |
28633 | pmulhrsw m5, m7 | |
28634 | packuswb m4, m5 | |
28635 | movu [r0 + 1624 * 16], m4 | |
28636 | ||
28637 | ; mode 30 [row 1 - first half] | |
28638 | movu [r0 + 1794 * 16], m4 | |
28639 | ||
28640 | ; mode 33 [row 0 - first half] | |
28641 | movu [r0 + 1984 * 16], m4 | |
28642 | ||
28643 | pmaddubsw m4, m1, m6 | |
28644 | pmulhrsw m4, m7 | |
28645 | pmaddubsw m5, m3, m6 | |
28646 | pmulhrsw m5, m7 | |
28647 | packuswb m4, m5 | |
28648 | movu [r0 + 1625 * 16], m4 | |
28649 | ||
28650 | ; mode 30 [row 1 - second half] | |
28651 | movu [r0 + 1795 * 16], m4 | |
28652 | ||
28653 | ; mode 33 [row 0 - second half] | |
28654 | movu [r0 + 1985 * 16], m4 | |
28655 | ||
28656 | ; mode 27 [row 13] | |
28657 | movu m6, [r5 + 28 * 16] | |
28658 | pmaddubsw m4, m0, m6 | |
28659 | pmulhrsw m4, m7 | |
28660 | pmaddubsw m5, m2, m6 | |
28661 | pmulhrsw m5, m7 | |
28662 | packuswb m4, m5 | |
28663 | movu [r0 + 1626 * 16], m4 | |
28664 | pmaddubsw m4, m1, m6 | |
28665 | pmulhrsw m4, m7 | |
28666 | pmaddubsw m5, m3, m6 | |
28667 | pmulhrsw m5, m7 | |
28668 | packuswb m4, m5 | |
28669 | movu [r0 + 1627 * 16], m4 | |
28670 | ||
28671 | ; mode 27 [row 14] | |
28672 | movu m6, [r5 + 30 * 16] | |
28673 | pmaddubsw m4, m0, m6 | |
28674 | pmulhrsw m4, m7 | |
28675 | pmaddubsw m5, m2, m6 | |
28676 | pmulhrsw m5, m7 | |
28677 | packuswb m4, m5 | |
28678 | movu [r0 + 1628 * 16], m4 | |
28679 | ||
28680 | ; mode 28 [row 5 - first half] | |
28681 | movu [r0 + 1674 * 16], m4 | |
28682 | ||
28683 | pmaddubsw m4, m1, m6 | |
28684 | pmulhrsw m4, m7 | |
28685 | pmaddubsw m5, m3, m6 | |
28686 | pmulhrsw m5, m7 | |
28687 | packuswb m4, m5 | |
28688 | movu [r0 + 1629 * 16], m4 | |
28689 | ||
28690 | ; mode 28 [row 5 - second half] | |
28691 | movu [r0 + 1675 * 16], m4 | |
28692 | ||
28693 | ; mode 28 [row 0] | |
28694 | movu m6, [r5 + 5 * 16] | |
28695 | pmaddubsw m4, m0, m6 | |
28696 | pmulhrsw m4, m7 | |
28697 | pmaddubsw m5, m2, m6 | |
28698 | pmulhrsw m5, m7 | |
28699 | packuswb m4, m5 | |
28700 | movu [r0 + 1664 * 16], m4 | |
28701 | pmaddubsw m4, m1, m6 | |
28702 | pmulhrsw m4, m7 | |
28703 | pmaddubsw m5, m3, m6 | |
28704 | pmulhrsw m5, m7 | |
28705 | packuswb m4, m5 | |
28706 | movu [r0 + 1665 * 16], m4 | |
28707 | ||
28708 | ; mode 28 [row 2] | |
28709 | movu m6, [r5 + 15 * 16] | |
28710 | pmaddubsw m4, m0, m6 | |
28711 | pmulhrsw m4, m7 | |
28712 | pmaddubsw m5, m2, m6 | |
28713 | pmulhrsw m5, m7 | |
28714 | packuswb m4, m5 | |
28715 | movu [r0 + 1668 * 16], m4 | |
28716 | pmaddubsw m4, m1, m6 | |
28717 | pmulhrsw m4, m7 | |
28718 | pmaddubsw m5, m3, m6 | |
28719 | pmulhrsw m5, m7 | |
28720 | packuswb m4, m5 | |
28721 | movu [r0 + 1669 * 16], m4 | |
28722 | ||
28723 | ; mode 28 [row 4] | |
28724 | movu m6, [r5 + 25 * 16] | |
28725 | pmaddubsw m4, m0, m6 | |
28726 | pmulhrsw m4, m7 | |
28727 | pmaddubsw m5, m2, m6 | |
28728 | pmulhrsw m5, m7 | |
28729 | packuswb m4, m5 | |
28730 | movu [r0 + 1672 * 16], m4 | |
28731 | pmaddubsw m4, m1, m6 | |
28732 | pmulhrsw m4, m7 | |
28733 | pmaddubsw m5, m3, m6 | |
28734 | pmulhrsw m5, m7 | |
28735 | packuswb m4, m5 | |
28736 | movu [r0 + 1673 * 16], m4 | |
28737 | ||
28738 | ; mode 30 [row 0] | |
28739 | movu m6, [r5 + 13 * 16] | |
28740 | pmaddubsw m4, m0, m6 | |
28741 | pmulhrsw m4, m7 | |
28742 | pmaddubsw m5, m2, m6 | |
28743 | pmulhrsw m5, m7 | |
28744 | packuswb m4, m5 | |
28745 | movu [r0 + 1792 * 16], m4 | |
28746 | pmaddubsw m4, m1, m6 | |
28747 | pmulhrsw m4, m7 | |
28748 | pmaddubsw m5, m3, m6 | |
28749 | pmulhrsw m5, m7 | |
28750 | packuswb m4, m5 | |
28751 | movu [r0 + 1793 * 16], m4 | |
28752 | ||
28753 | ; mode 29 [row 0] | |
28754 | movu m6, [r5 + 9 * 16] | |
28755 | pmaddubsw m4, m0, m6 | |
28756 | pmulhrsw m4, m7 | |
28757 | pmaddubsw m5, m2, m6 | |
28758 | pmulhrsw m5, m7 | |
28759 | packuswb m4, m5 | |
28760 | movu [r0 + 1728 * 16], m4 | |
28761 | pmaddubsw m4, m1, m6 | |
28762 | pmulhrsw m4, m7 | |
28763 | pmaddubsw m5, m3, m6 | |
28764 | pmulhrsw m5, m7 | |
28765 | packuswb m4, m5 | |
28766 | movu [r0 + 1729 * 16], m4 | |
28767 | ||
28768 | ; mode 29 [row 2] | |
28769 | movu m6, [r5 + 27 * 16] | |
28770 | pmaddubsw m4, m0, m6 | |
28771 | pmulhrsw m4, m7 | |
28772 | pmaddubsw m5, m2, m6 | |
28773 | pmulhrsw m5, m7 | |
28774 | packuswb m4, m5 | |
28775 | movu [r0 + 1732 * 16], m4 | |
28776 | pmaddubsw m4, m1, m6 | |
28777 | pmulhrsw m4, m7 | |
28778 | pmaddubsw m5, m3, m6 | |
28779 | pmulhrsw m5, m7 | |
28780 | packuswb m4, m5 | |
28781 | movu [r0 + 1733 * 16], m4 | |
28782 | ||
28783 | ; mode 31 [row 0] | |
28784 | movu m6, [r5 + 17 * 16] | |
28785 | pmaddubsw m4, m0, m6 | |
28786 | pmulhrsw m4, m7 | |
28787 | pmaddubsw m5, m2, m6 | |
28788 | pmulhrsw m5, m7 | |
28789 | packuswb m4, m5 | |
28790 | movu [r0 + 1856 * 16], m4 | |
28791 | pmaddubsw m4, m1, m6 | |
28792 | pmulhrsw m4, m7 | |
28793 | pmaddubsw m5, m3, m6 | |
28794 | pmulhrsw m5, m7 | |
28795 | packuswb m4, m5 | |
28796 | movu [r0 + 1857 * 16], m4 | |
28797 | ||
28798 | ; mode 32 [row 0] | |
28799 | movu m6, [r5 + 21 * 16] | |
28800 | pmaddubsw m4, m0, m6 | |
28801 | pmulhrsw m4, m7 | |
28802 | pmaddubsw m5, m2, m6 | |
28803 | pmulhrsw m5, m7 | |
28804 | packuswb m4, m5 | |
28805 | movu [r0 + 1920 * 16], m4 | |
28806 | pmaddubsw m4, m1, m6 | |
28807 | pmulhrsw m4, m7 | |
28808 | pmaddubsw m5, m3, m6 | |
28809 | pmulhrsw m5, m7 | |
28810 | packuswb m4, m5 | |
28811 | movu [r0 + 1921 * 16], m4 | |
28812 | ||
28813 | ; mode 27 [row 15]: rebuild the interleaved byte pairs one byte further along (pairs now start at r3 + 2, + 10, + 18, + 26); the fourth pair register becomes m4, so rows 16 onward use m3 as the result register. This row skips the pmaddubsw/pmulhrsw path and stores 8 bytes per pshufb through tab_S2 (table defined outside this chunk) | |
28814 | movu m0, [r3 + 2] | |
28815 | movd m1, [r3 + 3] | |
28816 | palignr m1, m0, 1 | |
28817 | punpcklbw m0, m1 | |
28818 | movu m2, [r3 + 10] | |
28819 | movd m3, [r3 + 11] | |
28820 | palignr m3, m2, 1 | |
28821 | punpcklbw m2, m3 | |
28822 | movu m1, [r3 + 18] | |
28823 | movd m3, [r3 + 19] | |
28824 | palignr m3, m1, 1 | |
28825 | punpcklbw m1, m3 | |
28826 | movu m4, [r3 + 26] | |
28827 | movd m5, [r3 + 27] | |
28828 | palignr m5, m4, 1 | |
28829 | punpcklbw m4, m5 | |
28830 | ||
28831 | pshufb m5, m0, [tab_S2] | |
28832 | movh [r0 + 1630 * 16], m5 | |
28833 | pshufb m5, m2, [tab_S2] | |
28834 | movh [r0 + 1630 * 16 + 8], m5 | |
28835 | pshufb m5, m1, [tab_S2] | |
28836 | movh [r0 + 1631 * 16], m5 | |
28837 | pshufb m5, m4, [tab_S2] | |
28838 | movh [r0 + 1631 * 16 + 8], m5 | |
28839 | ||
28840 | ; mode 27 [row 16] | |
28841 | movu m6, [r5 + 2 * 16] | |
28842 | pmaddubsw m3, m0, m6 | |
28843 | pmulhrsw m3, m7 | |
28844 | pmaddubsw m5, m2, m6 | |
28845 | pmulhrsw m5, m7 | |
28846 | packuswb m3, m5 | |
28847 | movu [r0 + 1632 * 16], m3 | |
28848 | ||
28849 | ; mode 31 [row 1 - first half] | |
28850 | movu [r0 + 1858 * 16], m3 | |
28851 | ||
28852 | pmaddubsw m3, m1, m6 | |
28853 | pmulhrsw m3, m7 | |
28854 | pmaddubsw m5, m4, m6 | |
28855 | pmulhrsw m5, m7 | |
28856 | packuswb m3, m5 | |
28857 | movu [r0 + 1633 * 16], m3 | |
28858 | ||
28859 | ; mode 31 [row 1 - second half] | |
28860 | movu [r0 + 1859 * 16], m3 | |
28861 | ||
28862 | ; mode 27 [row 17] | |
28863 | movu m6, [r5 + 4 * 16] | |
28864 | pmaddubsw m3, m0, m6 | |
28865 | pmulhrsw m3, m7 | |
28866 | pmaddubsw m5, m2, m6 | |
28867 | pmulhrsw m5, m7 | |
28868 | packuswb m3, m5 | |
28869 | movu [r0 + 1634 * 16], m3 | |
28870 | ||
28871 | ; mode 29 [row 3 - first half] | |
28872 | movu [r0 + 1734 * 16], m3 | |
28873 | ||
28874 | pmaddubsw m3, m1, m6 | |
28875 | pmulhrsw m3, m7 | |
28876 | pmaddubsw m5, m4, m6 | |
28877 | pmulhrsw m5, m7 | |
28878 | packuswb m3, m5 | |
28879 | movu [r0 + 1635 * 16], m3 | |
28880 | ||
28881 | ; mode 29 [row 3 - second half] | |
28882 | movu [r0 + 1735 * 16], m3 | |
28883 | ||
28884 | ; mode 27 [row 18] | |
28885 | movu m6, [r5 + 6 * 16] | |
28886 | pmaddubsw m3, m0, m6 | |
28887 | pmulhrsw m3, m7 | |
28888 | pmaddubsw m5, m2, m6 | |
28889 | pmulhrsw m5, m7 | |
28890 | packuswb m3, m5 | |
28891 | movu [r0 + 1636 * 16], m3 | |
28892 | pmaddubsw m3, m1, m6 | |
28893 | pmulhrsw m3, m7 | |
28894 | pmaddubsw m5, m4, m6 | |
28895 | pmulhrsw m5, m7 | |
28896 | packuswb m3, m5 | |
28897 | movu [r0 + 1637 * 16], m3 | |
28898 | ||
28899 | ; mode 27 [row 19] | |
28900 | movu m6, [r5 + 8 * 16] | |
28901 | pmaddubsw m3, m0, m6 | |
28902 | pmulhrsw m3, m7 | |
28903 | pmaddubsw m5, m2, m6 | |
28904 | pmulhrsw m5, m7 | |
28905 | packuswb m3, m5 | |
28906 | movu [r0 + 1638 * 16], m3 | |
28907 | ||
28908 | ; mode 28 [row 7 - first half] | |
28909 | movu [r0 + 1678 * 16], m3 | |
28910 | ||
28911 | pmaddubsw m3, m1, m6 | |
28912 | pmulhrsw m3, m7 | |
28913 | pmaddubsw m5, m4, m6 | |
28914 | pmulhrsw m5, m7 | |
28915 | packuswb m3, m5 | |
28916 | movu [r0 + 1639 * 16], m3 | |
28917 | ||
28918 | ; mode 28 [row 7 - second half] | |
28919 | movu [r0 + 1679 * 16], m3 | |
28920 | ||
28921 | ; mode 27 [row 20] | |
28922 | movu m6, [r5 + 10 * 16] | |
28923 | pmaddubsw m3, m0, m6 | |
28924 | pmulhrsw m3, m7 | |
28925 | pmaddubsw m5, m2, m6 | |
28926 | pmulhrsw m5, m7 | |
28927 | packuswb m3, m5 | |
28928 | movu [r0 + 1640 * 16], m3 | |
28929 | ||
28930 | ; mode 32 [row 1 - first half] | |
28931 | movu [r0 + 1922 * 16], m3 | |
28932 | ||
28933 | pmaddubsw m3, m1, m6 | |
28934 | pmulhrsw m3, m7 | |
28935 | pmaddubsw m5, m4, m6 | |
28936 | pmulhrsw m5, m7 | |
28937 | packuswb m3, m5 | |
28938 | movu [r0 + 1641 * 16], m3 | |
28939 | ||
28940 | ; mode 32 [row 1 - second half] | |
28941 | movu [r0 + 1923 * 16], m3 | |
28942 | ||
28943 | ; mode 27 [row 21] | |
28944 | movu m6, [r5 + 12 * 16] | |
28945 | pmaddubsw m3, m0, m6 | |
28946 | pmulhrsw m3, m7 | |
28947 | pmaddubsw m5, m2, m6 | |
28948 | pmulhrsw m5, m7 | |
28949 | packuswb m3, m5 | |
28950 | movu [r0 + 1642 * 16], m3 | |
28951 | pmaddubsw m3, m1, m6 | |
28952 | pmulhrsw m3, m7 | |
28953 | pmaddubsw m5, m4, m6 | |
28954 | pmulhrsw m5, m7 | |
28955 | packuswb m3, m5 | |
28956 | movu [r0 + 1643 * 16], m3 | |
28957 | ||
28958 | ; mode 27 [row 22] | |
28959 | movu m6, [r5 + 14 * 16] | |
28960 | pmaddubsw m3, m0, m6 | |
28961 | pmulhrsw m3, m7 | |
28962 | pmaddubsw m5, m2, m6 | |
28963 | pmulhrsw m5, m7 | |
28964 | packuswb m3, m5 | |
28965 | movu [r0 + 1644 * 16], m3 | |
28966 | pmaddubsw m3, m1, m6 | |
28967 | pmulhrsw m3, m7 | |
28968 | pmaddubsw m5, m4, m6 | |
28969 | pmulhrsw m5, m7 | |
28970 | packuswb m3, m5 | |
28971 | movu [r0 + 1645 * 16], m3 | |
28972 | ||
28973 | ; mode 27 [row 23] | |
28974 | movu m6, [r5 + 16 * 16] | |
28975 | pmaddubsw m3, m0, m6 | |
28976 | pmulhrsw m3, m7 | |
28977 | pmaddubsw m5, m2, m6 | |
28978 | pmulhrsw m5, m7 | |
28979 | packuswb m3, m5 | |
28980 | movu [r0 + 1646 * 16], m3 | |
28981 | pmaddubsw m3, m1, m6 | |
28982 | pmulhrsw m3, m7 | |
28983 | pmaddubsw m5, m4, m6 | |
28984 | pmulhrsw m5, m7 | |
28985 | packuswb m3, m5 | |
28986 | movu [r0 + 1647 * 16], m3 | |
28987 | ||
28988 | ; mode 27 [row 24] | |
28989 | movu m6, [r5 + 18 * 16] | |
28990 | pmaddubsw m3, m0, m6 | |
28991 | pmulhrsw m3, m7 | |
28992 | pmaddubsw m5, m2, m6 | |
28993 | pmulhrsw m5, m7 | |
28994 | packuswb m3, m5 | |
28995 | movu [r0 + 1648 * 16], m3 | |
28996 | ||
28997 | ; mode 28 [row 9 - first half] | |
28998 | movu [r0 + 1682 * 16], m3 | |
28999 | ||
29000 | pmaddubsw m3, m1, m6 | |
29001 | pmulhrsw m3, m7 | |
29002 | pmaddubsw m5, m4, m6 | |
29003 | pmulhrsw m5, m7 | |
29004 | packuswb m3, m5 | |
29005 | movu [r0 + 1649 * 16], m3 | |
29006 | ||
29007 | ; mode 28 [row 9 - second half] | |
29008 | movu [r0 + 1683 * 16], m3 | |
29009 | ||
29010 | ; mode 27 [row 25] | |
29011 | movu m6, [r5 + 20 * 16] | |
29012 | pmaddubsw m3, m0, m6 | |
29013 | pmulhrsw m3, m7 | |
29014 | pmaddubsw m5, m2, m6 | |
29015 | pmulhrsw m5, m7 | |
29016 | packuswb m3, m5 | |
29017 | movu [r0 + 1650 * 16], m3 | |
29018 | ||
29019 | ; mode 30 [row 3 - first half] | |
29020 | movu [r0 + 1798 * 16], m3 | |
29021 | ||
29022 | ; mode 33 [row 1 - first half] | |
29023 | movu [r0 + 1986 * 16], m3 | |
29024 | ||
29025 | pmaddubsw m3, m1, m6 | |
29026 | pmulhrsw m3, m7 | |
29027 | pmaddubsw m5, m4, m6 | |
29028 | pmulhrsw m5, m7 | |
29029 | packuswb m3, m5 | |
29030 | movu [r0 + 1651 * 16], m3 | |
29031 | ||
29032 | ; mode 30 [row 3 - second half] | |
29033 | movu [r0 + 1799 * 16], m3 | |
29034 | ||
29035 | ; mode 33 [row 1 - second half] | |
29036 | movu [r0 + 1987 * 16], m3 | |
29037 | ||
29038 | ; mode 27 [row 26] | |
29039 | movu m6, [r5 + 22 * 16] | |
29040 | pmaddubsw m3, m0, m6 | |
29041 | pmulhrsw m3, m7 | |
29042 | pmaddubsw m5, m2, m6 | |
29043 | pmulhrsw m5, m7 | |
29044 | packuswb m3, m5 | |
29045 | movu [r0 + 1652 * 16], m3 | |
29046 | ||
29047 | ; mode 29 [row 5 - first half] | |
29048 | movu [r0 + 1738 * 16], m3 | |
29049 | ||
29050 | pmaddubsw m3, m1, m6 | |
29051 | pmulhrsw m3, m7 | |
29052 | pmaddubsw m5, m4, m6 | |
29053 | pmulhrsw m5, m7 | |
29054 | packuswb m3, m5 | |
29055 | movu [r0 + 1653 * 16], m3 | |
29056 | ||
29057 | ; mode 29 [row 5 - second half] | |
29058 | movu [r0 + 1739 * 16], m3 | |
29059 | ||
29060 | ; mode 27 [row 27] | |
29061 | movu m6, [r5 + 24 * 16] | |
29062 | pmaddubsw m3, m0, m6 | |
29063 | pmulhrsw m3, m7 | |
29064 | pmaddubsw m5, m2, m6 | |
29065 | pmulhrsw m5, m7 | |
29066 | packuswb m3, m5 | |
29067 | movu [r0 + 1654 * 16], m3 | |
29068 | pmaddubsw m3, m1, m6 | |
29069 | pmulhrsw m3, m7 | |
29070 | pmaddubsw m5, m4, m6 | |
29071 | pmulhrsw m5, m7 | |
29072 | packuswb m3, m5 | |
29073 | movu [r0 + 1655 * 16], m3 | |
29074 | ||
29075 | ; mode 27 [row 28] | |
29076 | movu m6, [r5 + 26 * 16] | |
29077 | pmaddubsw m3, m0, m6 | |
29078 | pmulhrsw m3, m7 | |
29079 | pmaddubsw m5, m2, m6 | |
29080 | pmulhrsw m5, m7 | |
29081 | packuswb m3, m5 | |
29082 | movu [r0 + 1656 * 16], m3 | |
29083 | pmaddubsw m3, m1, m6 | |
29084 | pmulhrsw m3, m7 | |
29085 | pmaddubsw m5, m4, m6 | |
29086 | pmulhrsw m5, m7 | |
29087 | packuswb m3, m5 | |
29088 | movu [r0 + 1657 * 16], m3 | |
29089 | ||
29090 | ; mode 27 [row 29] | |
29091 | movu m6, [r5 + 28 * 16] | |
29092 | pmaddubsw m3, m0, m6 | |
29093 | pmulhrsw m3, m7 | |
29094 | pmaddubsw m5, m2, m6 | |
29095 | pmulhrsw m5, m7 | |
29096 | packuswb m3, m5 | |
29097 | movu [r0 + 1658 * 16], m3 | |
29098 | ||
29099 | ; mode 28 [row 11 - first half] | |
29100 | movu [r0 + 1686 * 16], m3 | |
29101 | ||
29102 | pmaddubsw m3, m1, m6 | |
29103 | pmulhrsw m3, m7 | |
29104 | pmaddubsw m5, m4, m6 | |
29105 | pmulhrsw m5, m7 | |
29106 | packuswb m3, m5 | |
29107 | movu [r0 + 1659 * 16], m3 | |
29108 | ||
29109 | ; mode 28 [row 11 - second half] | |
29110 | movu [r0 + 1687 * 16], m3 | |
29111 | ||
29112 | ; mode 27 [row 30] | |
29113 | movu m6, [r5 + 30 * 16] | |
29114 | pmaddubsw m3, m0, m6 | |
29115 | pmulhrsw m3, m7 | |
29116 | pmaddubsw m5, m2, m6 | |
29117 | pmulhrsw m5, m7 | |
29118 | packuswb m3, m5 | |
29119 | movu [r0 + 1660 * 16], m3 | |
29120 | pmaddubsw m3, m1, m6 | |
29121 | pmulhrsw m3, m7 | |
29122 | pmaddubsw m5, m4, m6 | |
29123 | pmulhrsw m5, m7 | |
29124 | packuswb m3, m5 | |
29125 | movu [r0 + 1661 * 16], m3 | |
29126 | ||
29127 | ; mode 28 [row 6] | |
29128 | movu m6, [r5 + 3 * 16] | |
29129 | pmaddubsw m3, m0, m6 | |
29130 | pmulhrsw m3, m7 | |
29131 | pmaddubsw m5, m2, m6 | |
29132 | pmulhrsw m5, m7 | |
29133 | packuswb m3, m5 | |
29134 | movu [r0 + 1676 * 16], m3 | |
29135 | pmaddubsw m3, m1, m6 | |
29136 | pmulhrsw m3, m7 | |
29137 | pmaddubsw m5, m4, m6 | |
29138 | pmulhrsw m5, m7 | |
29139 | packuswb m3, m5 | |
29140 | movu [r0 + 1677 * 16], m3 | |
29141 | ||
29142 | ; mode 28 [row 8] | |
29143 | movu m6, [r5 + 13 * 16] | |
29144 | pmaddubsw m3, m0, m6 | |
29145 | pmulhrsw m3, m7 | |
29146 | pmaddubsw m5, m2, m6 | |
29147 | pmulhrsw m5, m7 | |
29148 | packuswb m3, m5 | |
29149 | movu [r0 + 1680 * 16], m3 | |
29150 | ||
29151 | ; mode 29 [row 4 - first half] | |
29152 | movu [r0 + 1736 * 16], m3 | |
29153 | ||
29154 | pmaddubsw m3, m1, m6 | |
29155 | pmulhrsw m3, m7 | |
29156 | pmaddubsw m5, m4, m6 | |
29157 | pmulhrsw m5, m7 | |
29158 | packuswb m3, m5 | |
29159 | movu [r0 + 1681 * 16], m3 | |
29160 | ||
29161 | ; mode 29 [row 4 - second half] | |
29162 | movu [r0 + 1737 * 16], m3 | |
29163 | ||
29164 | ; mode 28 [row 10] | |
29165 | movu m6, [r5 + 23 * 16] | |
29166 | pmaddubsw m3, m0, m6 | |
29167 | pmulhrsw m3, m7 | |
29168 | pmaddubsw m5, m2, m6 | |
29169 | pmulhrsw m5, m7 | |
29170 | packuswb m3, m5 | |
29171 | movu [r0 + 1684 * 16], m3 | |
29172 | pmaddubsw m3, m1, m6 | |
29173 | pmulhrsw m3, m7 | |
29174 | pmaddubsw m5, m4, m6 | |
29175 | pmulhrsw m5, m7 | |
29176 | packuswb m3, m5 | |
29177 | movu [r0 + 1685 * 16], m3 | |
29178 | ||
29179 | ; mode 29 [row 6] | |
29180 | movu m6, [r5 + 31 * 16] | |
29181 | pmaddubsw m3, m0, m6 | |
29182 | pmulhrsw m3, m7 | |
29183 | pmaddubsw m5, m2, m6 | |
29184 | pmulhrsw m5, m7 | |
29185 | packuswb m3, m5 | |
29186 | movu [r0 + 1740 * 16], m3 | |
29187 | ||
29188 | ; mode 32 [row 2 - first half] | |
29189 | movu [r0 + 1924 * 16], m3 | |
29190 | ||
29191 | pmaddubsw m3, m1, m6 | |
29192 | pmulhrsw m3, m7 | |
29193 | pmaddubsw m5, m4, m6 | |
29194 | pmulhrsw m5, m7 | |
29195 | packuswb m3, m5 | |
29196 | movu [r0 + 1741 * 16], m3 | |
29197 | ||
29198 | ; mode 32 [row 2 - second half] | |
29199 | movu [r0 + 1925 * 16], m3 | |
29200 | ||
29201 | ; mode 30 [row 2] | |
29202 | movu m6, [r5 + 7 * 16] | |
29203 | pmaddubsw m3, m0, m6 | |
29204 | pmulhrsw m3, m7 | |
29205 | pmaddubsw m5, m2, m6 | |
29206 | pmulhrsw m5, m7 | |
29207 | packuswb m3, m5 | |
29208 | movu [r0 + 1796 * 16], m3 | |
29209 | pmaddubsw m3, m1, m6 | |
29210 | pmulhrsw m3, m7 | |
29211 | pmaddubsw m5, m4, m6 | |
29212 | pmulhrsw m5, m7 | |
29213 | packuswb m3, m5 | |
29214 | movu [r0 + 1797 * 16], m3 | |
29215 | ||
29216 | ; mode 31 [row 2] | |
29217 | movu m6, [r5 + 19 * 16] | |
29218 | pmaddubsw m3, m0, m6 | |
29219 | pmulhrsw m3, m7 | |
29220 | pmaddubsw m5, m2, m6 | |
29221 | pmulhrsw m5, m7 | |
29222 | packuswb m3, m5 | |
29223 | movu [r0 + 1860 * 16], m3 | |
29224 | pmaddubsw m3, m1, m6 | |
29225 | pmulhrsw m3, m7 | |
29226 | pmaddubsw m5, m4, m6 | |
29227 | pmulhrsw m5, m7 | |
29228 | packuswb m3, m5 | |
29229 | movu [r0 + 1861 * 16], m3 | |
29230 | ||
29231 | ; mode 27 [row 31] | |
29232 | movu m0, [r3 + 3] | |
29233 | movd m1, [r3 + 4] | |
29234 | palignr m1, m0, 1 | |
29235 | punpcklbw m0, m1 | |
29236 | movu m2, [r3 + 11] | |
29237 | movd m3, [r3 + 12] | |
29238 | palignr m3, m2, 1 | |
29239 | punpcklbw m2, m3 | |
29240 | movu m1, [r3 + 19] | |
29241 | movd m3, [r3 + 20] | |
29242 | palignr m3, m1, 1 | |
29243 | punpcklbw m1, m3 | |
29244 | movu m4, [r3 + 27] | |
29245 | movd m5, [r3 + 28] | |
29246 | palignr m5, m4, 1 | |
29247 | punpcklbw m4, m5 | |
29248 | ||
29249 | pshufb m5, m0, [tab_S2] | |
29250 | movh [r0 + 1662 * 16], m5 | |
29251 | pshufb m5, m2, [tab_S2] | |
29252 | movh [r0 + 1662 * 16 + 8], m5 | |
29253 | pshufb m5, m1, [tab_S2] | |
29254 | movh [r0 + 1663 * 16], m5 | |
29255 | pshufb m5, m4, [tab_S2] | |
29256 | movh [r0 + 1663 * 16 + 8], m5 | |
29257 | ||
29258 | ; mode 28 [row 12] | |
29259 | movu m6, [r5 + 1 * 16] | |
29260 | pmaddubsw m3, m0, m6 | |
29261 | pmulhrsw m3, m7 | |
29262 | pmaddubsw m5, m2, m6 | |
29263 | pmulhrsw m5, m7 | |
29264 | packuswb m3, m5 | |
29265 | movu [r0 + 1688 * 16], m3 | |
29266 | ||
29267 | ; mode 30 [row 4 - first half] | |
29268 | movu [r0 + 1800 * 16], m3 | |
29269 | ||
29270 | pmaddubsw m3, m1, m6 | |
29271 | pmulhrsw m3, m7 | |
29272 | pmaddubsw m5, m4, m6 | |
29273 | pmulhrsw m5, m7 | |
29274 | packuswb m3, m5 | |
29275 | movu [r0 + 1689 * 16], m3 | |
29276 | ||
29277 | ; mode 30 [row 4 - second half] | |
29278 | movu [r0 + 1801 * 16], m3 | |
29279 | ||
29280 | ; mode 28 [row 13] | |
29281 | movu m6, [r5 + 6 * 16] | |
29282 | pmaddubsw m3, m0, m6 | |
29283 | pmulhrsw m3, m7 | |
29284 | pmaddubsw m5, m2, m6 | |
29285 | pmulhrsw m5, m7 | |
29286 | packuswb m3, m5 | |
29287 | movu [r0 + 1690 * 16], m3 | |
29288 | pmaddubsw m3, m1, m6 | |
29289 | pmulhrsw m3, m7 | |
29290 | pmaddubsw m5, m4, m6 | |
29291 | pmulhrsw m5, m7 | |
29292 | packuswb m3, m5 | |
29293 | movu [r0 + 1691 * 16], m3 | |
29294 | ||
29295 | ; mode 28 [row 14] | |
29296 | movu m6, [r5 + 11 * 16] | |
29297 | pmaddubsw m3, m0, m6 | |
29298 | pmulhrsw m3, m7 | |
29299 | pmaddubsw m5, m2, m6 | |
29300 | pmulhrsw m5, m7 | |
29301 | packuswb m3, m5 | |
29302 | movu [r0 + 1692 * 16], m3 | |
29303 | pmaddubsw m3, m1, m6 | |
29304 | pmulhrsw m3, m7 | |
29305 | pmaddubsw m5, m4, m6 | |
29306 | pmulhrsw m5, m7 | |
29307 | packuswb m3, m5 | |
29308 | movu [r0 + 1693 * 16], m3 | |
29309 | ||
29310 | ; mode 28 [row 15] | |
29311 | movu m6, [r5 + 16 * 16] | |
29312 | pmaddubsw m3, m0, m6 | |
29313 | pmulhrsw m3, m7 | |
29314 | pmaddubsw m5, m2, m6 | |
29315 | pmulhrsw m5, m7 | |
29316 | packuswb m3, m5 | |
29317 | movu [r0 + 1694 * 16], m3 | |
29318 | pmaddubsw m3, m1, m6 | |
29319 | pmulhrsw m3, m7 | |
29320 | pmaddubsw m5, m4, m6 | |
29321 | pmulhrsw m5, m7 | |
29322 | packuswb m3, m5 | |
29323 | movu [r0 + 1695 * 16], m3 | |
29324 | ||
29325 | ; mode 28 [row 16] | |
29326 | movu m6, [r5 + 21 * 16] | |
29327 | pmaddubsw m3, m0, m6 | |
29328 | pmulhrsw m3, m7 | |
29329 | pmaddubsw m5, m2, m6 | |
29330 | pmulhrsw m5, m7 | |
29331 | packuswb m3, m5 | |
29332 | movu [r0 + 1696 * 16], m3 | |
29333 | ||
29334 | ; mode 31 [row 4 - first half] | |
29335 | movu [r0 + 1864 * 16], m3 | |
29336 | ||
29337 | pmaddubsw m3, m1, m6 | |
29338 | pmulhrsw m3, m7 | |
29339 | pmaddubsw m5, m4, m6 | |
29340 | pmulhrsw m5, m7 | |
29341 | packuswb m3, m5 | |
29342 | movu [r0 + 1697 * 16], m3 | |
29343 | ||
29344 | ; mode 31 [row 4 - second half] | |
29345 | movu [r0 + 1865 * 16], m3 | |
29346 | ||
29347 | ; mode 28 [row 17] | |
29348 | movu m6, [r5 + 26 * 16] | |
29349 | pmaddubsw m3, m0, m6 | |
29350 | pmulhrsw m3, m7 | |
29351 | pmaddubsw m5, m2, m6 | |
29352 | pmulhrsw m5, m7 | |
29353 | packuswb m3, m5 | |
29354 | movu [r0 + 1698 * 16], m3 | |
29355 | ||
29356 | ; mode 29 [row 9 - first half] | |
29357 | movu [r0 + 1746 * 16], m3 | |
29358 | ||
29359 | pmaddubsw m3, m1, m6 | |
29360 | pmulhrsw m3, m7 | |
29361 | pmaddubsw m5, m4, m6 | |
29362 | pmulhrsw m5, m7 | |
29363 | packuswb m3, m5 | |
29364 | movu [r0 + 1699 * 16], m3 | |
29365 | ||
29366 | ; mode 29 [row 9 - second half] | |
29367 | movu [r0 + 1747 * 16], m3 | |
29368 | ||
29369 | ; mode 28 [row 18] | |
29370 | movu m6, [r5 + 31 * 16] | |
29371 | pmaddubsw m3, m0, m6 | |
29372 | pmulhrsw m3, m7 | |
29373 | pmaddubsw m5, m2, m6 | |
29374 | pmulhrsw m5, m7 | |
29375 | packuswb m3, m5 | |
29376 | movu [r0 + 1700 * 16], m3 | |
29377 | pmaddubsw m3, m1, m6 | |
29378 | pmulhrsw m3, m7 | |
29379 | pmaddubsw m5, m4, m6 | |
29380 | pmulhrsw m5, m7 | |
29381 | packuswb m3, m5 | |
29382 | movu [r0 + 1701 * 16], m3 | |
29383 | ||
29384 | ; mode 29 [row 7] | |
29385 | movu m6, [r5 + 8 * 16] | |
29386 | pmaddubsw m3, m0, m6 | |
29387 | pmulhrsw m3, m7 | |
29388 | pmaddubsw m5, m2, m6 | |
29389 | pmulhrsw m5, m7 | |
29390 | packuswb m3, m5 | |
29391 | movu [r0 + 1742 * 16], m3 | |
29392 | pmaddubsw m3, m1, m6 | |
29393 | pmulhrsw m3, m7 | |
29394 | pmaddubsw m5, m4, m6 | |
29395 | pmulhrsw m5, m7 | |
29396 | packuswb m3, m5 | |
29397 | movu [r0 + 1743 * 16], m3 | |
29398 | ||
29399 | ; mode 29 [row 8] | |
29400 | movu m6, [r5 + 17 * 16] | |
29401 | pmaddubsw m3, m0, m6 | |
29402 | pmulhrsw m3, m7 | |
29403 | pmaddubsw m5, m2, m6 | |
29404 | pmulhrsw m5, m7 | |
29405 | packuswb m3, m5 | |
29406 | movu [r0 + 1744 * 16], m3 | |
29407 | pmaddubsw m3, m1, m6 | |
29408 | pmulhrsw m3, m7 | |
29409 | pmaddubsw m5, m4, m6 | |
29410 | pmulhrsw m5, m7 | |
29411 | packuswb m3, m5 | |
29412 | movu [r0 + 1745 * 16], m3 | |
29413 | ||
29414 | ; mode 30 [row 5] | |
29415 | movu m6, [r5 + 14 * 16] | |
29416 | pmaddubsw m3, m0, m6 | |
29417 | pmulhrsw m3, m7 | |
29418 | pmaddubsw m5, m2, m6 | |
29419 | pmulhrsw m5, m7 | |
29420 | packuswb m3, m5 | |
29421 | movu [r0 + 1802 * 16], m3 | |
29422 | ||
29423 | ; mode 33 [row 2 - first half] | |
29424 | movu [r0 + 1988 * 16], m3 | |
29425 | ||
29426 | pmaddubsw m3, m1, m6 | |
29427 | pmulhrsw m3, m7 | |
29428 | pmaddubsw m5, m4, m6 | |
29429 | pmulhrsw m5, m7 | |
29430 | packuswb m3, m5 | |
29431 | movu [r0 + 1803 * 16], m3 | |
29432 | ||
29433 | ; mode 33 [row 2 - second half] | |
29434 | movu [r0 + 1989 * 16], m3 | |
29435 | ||
29436 | ; mode 30 [row 6] | |
29437 | movu m6, [r5 + 27 * 16] | |
29438 | pmaddubsw m3, m0, m6 | |
29439 | pmulhrsw m3, m7 | |
29440 | pmaddubsw m5, m2, m6 | |
29441 | pmulhrsw m5, m7 | |
29442 | packuswb m3, m5 | |
29443 | movu [r0 + 1804 * 16], m3 | |
29444 | pmaddubsw m3, m1, m6 | |
29445 | pmulhrsw m3, m7 | |
29446 | pmaddubsw m5, m4, m6 | |
29447 | pmulhrsw m5, m7 | |
29448 | packuswb m3, m5 | |
29449 | movu [r0 + 1805 * 16], m3 | |
29450 | ||
29451 | ; mode 31 [row 3] | |
29452 | movu m6, [r5 + 4 * 16] | |
29453 | pmaddubsw m3, m0, m6 | |
29454 | pmulhrsw m3, m7 | |
29455 | pmaddubsw m5, m2, m6 | |
29456 | pmulhrsw m5, m7 | |
29457 | packuswb m3, m5 | |
29458 | movu [r0 + 1862 * 16], m3 | |
29459 | pmaddubsw m3, m1, m6 | |
29460 | pmulhrsw m3, m7 | |
29461 | pmaddubsw m5, m4, m6 | |
29462 | pmulhrsw m5, m7 | |
29463 | packuswb m3, m5 | |
29464 | movu [r0 + 1863 * 16], m3 | |
29465 | ||
29466 | ; mode 32 [row 3] | |
29467 | movu m6, [r5 + 20 * 16] | |
29468 | pmaddubsw m3, m0, m6 | |
29469 | pmulhrsw m3, m7 | |
29470 | pmaddubsw m5, m2, m6 | |
29471 | pmulhrsw m5, m7 | |
29472 | packuswb m3, m5 | |
29473 | movu [r0 + 1926 * 16], m3 | |
29474 | pmaddubsw m3, m1, m6 | |
29475 | pmulhrsw m3, m7 | |
29476 | pmaddubsw m5, m4, m6 | |
29477 | pmulhrsw m5, m7 | |
29478 | packuswb m3, m5 | |
29479 | movu [r0 + 1927 * 16], m3 | |
29480 | ||
29481 | ; mode 28 [row 19] | |
29482 | movu m6, [r5 + 4 * 16] | |
29483 | movu m0, [r3 + 4] | |
29484 | movd m1, [r3 + 5] | |
29485 | palignr m1, m0, 1 | |
29486 | punpcklbw m0, m1 | |
29487 | pmaddubsw m3, m0, m6 | |
29488 | pmulhrsw m3, m7 | |
29489 | movu m2, [r3 + 12] | |
29490 | movd m4, [r3 + 13] | |
29491 | palignr m4, m2, 1 | |
29492 | punpcklbw m2, m4 | |
29493 | pmaddubsw m5, m2, m6 | |
29494 | pmulhrsw m5, m7 | |
29495 | packuswb m3, m5 | |
29496 | movu [r0 + 1702 * 16], m3 | |
29497 | ||
29498 | movu m1, [r3 + 20] | |
29499 | movd m3, [r3 + 21] | |
29500 | palignr m3, m1, 1 | |
29501 | punpcklbw m1, m3 | |
29502 | pmaddubsw m3, m1, m6 | |
29503 | pmulhrsw m3, m7 | |
29504 | movu m4, [r3 + 28] | |
29505 | movd m5, [r3 + 29] | |
29506 | palignr m5, m4, 1 | |
29507 | punpcklbw m4, m5 | |
29508 | pmaddubsw m5, m4, m6 | |
29509 | pmulhrsw m5, m7 | |
29510 | packuswb m3, m5 | |
29511 | movu [r0 + 1703 * 16], m3 | |
29512 | ||
29513 | ; mode 28 [row 20] | |
29514 | movu m6, [r5 + 9 * 16] | |
29515 | pmaddubsw m3, m0, m6 | |
29516 | pmulhrsw m3, m7 | |
29517 | pmaddubsw m5, m2, m6 | |
29518 | pmulhrsw m5, m7 | |
29519 | packuswb m3, m5 | |
29520 | movu [r0 + 1704 * 16], m3 | |
29521 | ||
29522 | ; mode 32 [row 4 - first half] | |
29523 | movu [r0 + 1928 * 16], m3 | |
29524 | ||
29525 | pmaddubsw m3, m1, m6 | |
29526 | pmulhrsw m3, m7 | |
29527 | pmaddubsw m5, m4, m6 | |
29528 | pmulhrsw m5, m7 | |
29529 | packuswb m3, m5 | |
29530 | movu [r0 + 1705 * 16], m3 | |
29531 | ||
29532 | ; mode 32 [row 4 - second half] | |
29533 | movu [r0 + 1929 * 16], m3 | |
29534 | ||
29535 | ; mode 28 [row 21] | |
29536 | movu m6, [r5 + 14 * 16] | |
29537 | pmaddubsw m3, m0, m6 | |
29538 | pmulhrsw m3, m7 | |
29539 | pmaddubsw m5, m2, m6 | |
29540 | pmulhrsw m5, m7 | |
29541 | packuswb m3, m5 | |
29542 | movu [r0 + 1706 * 16], m3 | |
29543 | pmaddubsw m3, m1, m6 | |
29544 | pmulhrsw m3, m7 | |
29545 | pmaddubsw m5, m4, m6 | |
29546 | pmulhrsw m5, m7 | |
29547 | packuswb m3, m5 | |
29548 | movu [r0 + 1707 * 16], m3 | |
29549 | ||
29550 | ; mode 28 [row 22] | |
29551 | movu m6, [r5 + 19 * 16] | |
29552 | pmaddubsw m3, m0, m6 | |
29553 | pmulhrsw m3, m7 | |
29554 | pmaddubsw m5, m2, m6 | |
29555 | pmulhrsw m5, m7 | |
29556 | packuswb m3, m5 | |
29557 | movu [r0 + 1708 * 16], m3 | |
29558 | pmaddubsw m3, m1, m6 | |
29559 | pmulhrsw m3, m7 | |
29560 | pmaddubsw m5, m4, m6 | |
29561 | pmulhrsw m5, m7 | |
29562 | packuswb m3, m5 | |
29563 | movu [r0 + 1709 * 16], m3 | |
29564 | ||
29565 | ; mode 28 [row 23] | |
29566 | movu m6, [r5 + 24 * 16] | |
29567 | pmaddubsw m3, m0, m6 | |
29568 | pmulhrsw m3, m7 | |
29569 | pmaddubsw m5, m2, m6 | |
29570 | pmulhrsw m5, m7 | |
29571 | packuswb m3, m5 | |
29572 | movu [r0 + 1710 * 16], m3 | |
29573 | pmaddubsw m3, m1, m6 | |
29574 | pmulhrsw m3, m7 | |
29575 | pmaddubsw m5, m4, m6 | |
29576 | pmulhrsw m5, m7 | |
29577 | packuswb m3, m5 | |
29578 | movu [r0 + 1711 * 16], m3 | |
29579 | ||
29580 | ; mode 28 [row 24] | |
29581 | movu m6, [r5 + 29 * 16] | |
29582 | pmaddubsw m3, m0, m6 | |
29583 | pmulhrsw m3, m7 | |
29584 | pmaddubsw m5, m2, m6 | |
29585 | pmulhrsw m5, m7 | |
29586 | packuswb m3, m5 | |
29587 | movu [r0 + 1712 * 16], m3 | |
29588 | pmaddubsw m3, m1, m6 | |
29589 | pmulhrsw m3, m7 | |
29590 | pmaddubsw m5, m4, m6 | |
29591 | pmulhrsw m5, m7 | |
29592 | packuswb m3, m5 | |
29593 | movu [r0 + 1713 * 16], m3 | |
29594 | ||
29595 | ; mode 29 [row 10] | |
29596 | movu m6, [r5 + 3 * 16] | |
29597 | pmaddubsw m3, m0, m6 | |
29598 | pmulhrsw m3, m7 | |
29599 | pmaddubsw m5, m2, m6 | |
29600 | pmulhrsw m5, m7 | |
29601 | packuswb m3, m5 | |
29602 | movu [r0 + 1748 * 16], m3 | |
29603 | pmaddubsw m3, m1, m6 | |
29604 | pmulhrsw m3, m7 | |
29605 | pmaddubsw m5, m4, m6 | |
29606 | pmulhrsw m5, m7 | |
29607 | packuswb m3, m5 | |
29608 | movu [r0 + 1749 * 16], m3 | |
29609 | ||
29610 | ; mode 29 [row 11] | |
29611 | movu m6, [r5 + 12 * 16] | |
29612 | pmaddubsw m3, m0, m6 | |
29613 | pmulhrsw m3, m7 | |
29614 | pmaddubsw m5, m2, m6 | |
29615 | pmulhrsw m5, m7 | |
29616 | packuswb m3, m5 | |
29617 | movu [r0 + 1750 * 16], m3 | |
29618 | pmaddubsw m3, m1, m6 | |
29619 | pmulhrsw m3, m7 | |
29620 | pmaddubsw m5, m4, m6 | |
29621 | pmulhrsw m5, m7 | |
29622 | packuswb m3, m5 | |
29623 | movu [r0 + 1751 * 16], m3 | |
29624 | ||
29625 | ; mode 29 [row 12] | |
29626 | movu m6, [r5 + 21 * 16] | |
29627 | pmaddubsw m3, m0, m6 | |
29628 | pmulhrsw m3, m7 | |
29629 | pmaddubsw m5, m2, m6 | |
29630 | pmulhrsw m5, m7 | |
29631 | packuswb m3, m5 | |
29632 | movu [r0 + 1752 * 16], m3 | |
29633 | ||
29634 | ; mode 30 [row 8 - first half] | |
29635 | movu [r0 + 1808 * 16], m3 | |
29636 | ||
29637 | pmaddubsw m3, m1, m6 | |
29638 | pmulhrsw m3, m7 | |
29639 | pmaddubsw m5, m4, m6 | |
29640 | pmulhrsw m5, m7 | |
29641 | packuswb m3, m5 | |
29642 | movu [r0 + 1753 * 16], m3 | |
29643 | ||
29644 | ; mode 30 [row 8 - second half] | |
29645 | movu [r0 + 1809 * 16], m3 | |
29646 | ||
29647 | ; mode 29 [row 13] | |
29648 | movu m6, [r5 + 30 * 16] | |
29649 | pmaddubsw m3, m0, m6 | |
29650 | pmulhrsw m3, m7 | |
29651 | pmaddubsw m5, m2, m6 | |
29652 | pmulhrsw m5, m7 | |
29653 | packuswb m3, m5 | |
29654 | movu [r0 + 1754 * 16], m3 | |
29655 | ||
29656 | ; mode 32 [row 5 - first half] | |
29657 | movu [r0 + 1930 * 16], m3 | |
29658 | ||
29659 | pmaddubsw m3, m1, m6 | |
29660 | pmulhrsw m3, m7 | |
29661 | pmaddubsw m5, m4, m6 | |
29662 | pmulhrsw m5, m7 | |
29663 | packuswb m3, m5 | |
29664 | movu [r0 + 1755 * 16], m3 | |
29665 | ||
29666 | ; mode 32 [row 5 - second half] | |
29667 | movu [r0 + 1931 * 16], m3 | |
29668 | ||
29669 | ; mode 30 [row 7] | |
29670 | movu m6, [r5 + 8 * 16] | |
29671 | pmaddubsw m3, m0, m6 | |
29672 | pmulhrsw m3, m7 | |
29673 | pmaddubsw m5, m2, m6 | |
29674 | pmulhrsw m5, m7 | |
29675 | packuswb m3, m5 | |
29676 | movu [r0 + 1806 * 16], m3 | |
29677 | ||
29678 | ; mode 33 [row 3 - first half] | |
29679 | movu [r0 + 1990 * 16], m3 | |
29680 | ||
29681 | pmaddubsw m3, m1, m6 | |
29682 | pmulhrsw m3, m7 | |
29683 | pmaddubsw m5, m4, m6 | |
29684 | pmulhrsw m5, m7 | |
29685 | packuswb m3, m5 | |
29686 | movu [r0 + 1807 * 16], m3 | |
29687 | ||
29688 | ; mode 33 [row 3 - second half] | |
29689 | movu [r0 + 1991 * 16], m3 | |
29690 | ||
29691 | ; mode 31 [row 5] | |
29692 | movu m6, [r5 + 6 * 16] | |
29693 | pmaddubsw m3, m0, m6 | |
29694 | pmulhrsw m3, m7 | |
29695 | pmaddubsw m5, m2, m6 | |
29696 | pmulhrsw m5, m7 | |
29697 | packuswb m3, m5 | |
29698 | movu [r0 + 1866 * 16], m3 | |
29699 | pmaddubsw m3, m1, m6 | |
29700 | pmulhrsw m3, m7 | |
29701 | pmaddubsw m5, m4, m6 | |
29702 | pmulhrsw m5, m7 | |
29703 | packuswb m3, m5 | |
29704 | movu [r0 + 1867 * 16], m3 | |
29705 | ||
29706 | ; mode 31 [row 6] | |
29707 | movu m6, [r5 + 23 * 16] | |
29708 | pmaddubsw m3, m0, m6 | |
29709 | pmulhrsw m3, m7 | |
29710 | pmaddubsw m5, m2, m6 | |
29711 | pmulhrsw m5, m7 | |
29712 | packuswb m3, m5 | |
29713 | movu [r0 + 1868 * 16], m3 | |
29714 | pmaddubsw m3, m1, m6 | |
29715 | pmulhrsw m3, m7 | |
29716 | pmaddubsw m5, m4, m6 | |
29717 | pmulhrsw m5, m7 | |
29718 | packuswb m3, m5 | |
29719 | movu [r0 + 1869 * 16], m3 | |
29720 | ||
29721 | ; mode 28 [row 25] | |
29722 | movu m6, [r5 + 2 * 16] | |
29723 | movu m0, [r3 + 5] | |
29724 | movd m1, [r3 + 6] | |
29725 | palignr m1, m0, 1 | |
29726 | punpcklbw m0, m1 | |
29727 | pmaddubsw m3, m0, m6 | |
29728 | pmulhrsw m3, m7 | |
29729 | movu m2, [r3 + 13] | |
29730 | movd m4, [r3 + 14] | |
29731 | palignr m4, m2, 1 | |
29732 | punpcklbw m2, m4 | |
29733 | pmaddubsw m5, m2, m6 | |
29734 | pmulhrsw m5, m7 | |
29735 | packuswb m3, m5 | |
29736 | movu [r0 + 1714 * 16], m3 | |
29737 | ||
29738 | movu m1, [r3 + 21] | |
29739 | movd m3, [r3 + 22] | |
29740 | palignr m3, m1, 1 | |
29741 | punpcklbw m1, m3 | |
29742 | pmaddubsw m3, m1, m6 | |
29743 | pmulhrsw m3, m7 | |
29744 | movu m4, [r3 + 29] | |
29745 | movd m5, [r3 + 30] | |
29746 | palignr m5, m4, 1 | |
29747 | punpcklbw m4, m5 | |
29748 | pmaddubsw m5, m4, m6 | |
29749 | pmulhrsw m5, m7 | |
29750 | packuswb m3, m5 | |
29751 | movu [r0 + 1715 * 16], m3 | |
29752 | ||
29753 | ; mode 28 [row 26] | |
29754 | movu m6, [r5 + 7 * 16] | |
29755 | pmaddubsw m3, m0, m6 | |
29756 | pmulhrsw m3, m7 | |
29757 | pmaddubsw m5, m2, m6 | |
29758 | pmulhrsw m5, m7 | |
29759 | packuswb m3, m5 | |
29760 | movu [r0 + 1716 * 16], m3 | |
29761 | ||
29762 | ; mode 29 [row 14 - first half] | |
29763 | movu [r0 + 1756 * 16], m3 | |
29764 | ||
29765 | pmaddubsw m3, m1, m6 | |
29766 | pmulhrsw m3, m7 | |
29767 | pmaddubsw m5, m4, m6 | |
29768 | pmulhrsw m5, m7 | |
29769 | packuswb m3, m5 | |
29770 | movu [r0 + 1717 * 16], m3 | |
29771 | ||
29772 | ; mode 29 [row 14 - second half] | |
29773 | movu [r0 + 1757 * 16], m3 | |
29774 | ||
29775 | ; mode 28 [row 27] | |
29776 | movu m6, [r5 + 12 * 16] | |
29777 | pmaddubsw m3, m0, m6 | |
29778 | pmulhrsw m3, m7 | |
29779 | pmaddubsw m5, m2, m6 | |
29780 | pmulhrsw m5, m7 | |
29781 | packuswb m3, m5 | |
29782 | movu [r0 + 1718 * 16], m3 | |
29783 | pmaddubsw m3, m1, m6 | |
29784 | pmulhrsw m3, m7 | |
29785 | pmaddubsw m5, m4, m6 | |
29786 | pmulhrsw m5, m7 | |
29787 | packuswb m3, m5 | |
29788 | movu [r0 + 1719 * 16], m3 | |
29789 | ||
29790 | ; mode 28 [row 28] | |
29791 | movu m6, [r5 + 17 * 16] | |
29792 | pmaddubsw m3, m0, m6 | |
29793 | pmulhrsw m3, m7 | |
29794 | pmaddubsw m5, m2, m6 | |
29795 | pmulhrsw m5, m7 | |
29796 | packuswb m3, m5 | |
29797 | movu [r0 + 1720 * 16], m3 | |
29798 | pmaddubsw m3, m1, m6 | |
29799 | pmulhrsw m3, m7 | |
29800 | pmaddubsw m5, m4, m6 | |
29801 | pmulhrsw m5, m7 | |
29802 | packuswb m3, m5 | |
29803 | movu [r0 + 1721 * 16], m3 | |
29804 | ||
29805 | ; mode 28 [row 29] | |
29806 | movu m6, [r5 + 22 * 16] | |
29807 | pmaddubsw m3, m0, m6 | |
29808 | pmulhrsw m3, m7 | |
29809 | pmaddubsw m5, m2, m6 | |
29810 | pmulhrsw m5, m7 | |
29811 | packuswb m3, m5 | |
29812 | movu [r0 + 1722 * 16], m3 | |
29813 | pmaddubsw m3, m1, m6 | |
29814 | pmulhrsw m3, m7 | |
29815 | pmaddubsw m5, m4, m6 | |
29816 | pmulhrsw m5, m7 | |
29817 | packuswb m3, m5 | |
29818 | movu [r0 + 1723 * 16], m3 | |
29819 | ||
29820 | ; mode 28 [row 30] | |
29821 | movu m6, [r5 + 27 * 16] | |
29822 | pmaddubsw m3, m0, m6 | |
29823 | pmulhrsw m3, m7 | |
29824 | pmaddubsw m5, m2, m6 | |
29825 | pmulhrsw m5, m7 | |
29826 | packuswb m3, m5 | |
29827 | movu [r0 + 1724 * 16], m3 | |
29828 | pmaddubsw m3, m1, m6 | |
29829 | pmulhrsw m3, m7 | |
29830 | pmaddubsw m5, m4, m6 | |
29831 | pmulhrsw m5, m7 | |
29832 | packuswb m3, m5 | |
29833 | movu [r0 + 1725 * 16], m3 | |
29834 | ||
29835 | ; mode 29 [row 15] | |
29836 | movu m6, [r5 + 16 * 16] | |
29837 | pmaddubsw m3, m0, m6 | |
29838 | pmulhrsw m3, m7 | |
29839 | pmaddubsw m5, m2, m6 | |
29840 | pmulhrsw m5, m7 | |
29841 | packuswb m3, m5 | |
29842 | movu [r0 + 1758 * 16], m3 | |
29843 | pmaddubsw m3, m1, m6 | |
29844 | pmulhrsw m3, m7 | |
29845 | pmaddubsw m5, m4, m6 | |
29846 | pmulhrsw m5, m7 | |
29847 | packuswb m3, m5 | |
29848 | movu [r0 + 1759 * 16], m3 | |
29849 | ||
29850 | ; mode 29 [row 16] | |
29851 | movu m6, [r5 + 25 * 16] | |
29852 | pmaddubsw m3, m0, m6 | |
29853 | pmulhrsw m3, m7 | |
29854 | pmaddubsw m5, m2, m6 | |
29855 | pmulhrsw m5, m7 | |
29856 | packuswb m3, m5 | |
29857 | movu [r0 + 1760 * 16], m3 | |
29858 | pmaddubsw m3, m1, m6 | |
29859 | pmulhrsw m3, m7 | |
29860 | pmaddubsw m5, m4, m6 | |
29861 | pmulhrsw m5, m7 | |
29862 | packuswb m3, m5 | |
29863 | movu [r0 + 1761 * 16], m3 | |
29864 | ||
29865 | ; mode 30 [row 9] | |
29866 | movu m6, [r5 + 2 * 16] | |
29867 | pmaddubsw m3, m0, m6 | |
29868 | pmulhrsw m3, m7 | |
29869 | pmaddubsw m5, m2, m6 | |
29870 | pmulhrsw m5, m7 | |
29871 | packuswb m3, m5 | |
29872 | movu [r0 + 1810 * 16], m3 | |
29873 | ||
29874 | ; mode 33 [row 4 - first half] | |
29875 | movu [r0 + 1992 * 16], m3 | |
29876 | ||
29877 | pmaddubsw m3, m1, m6 | |
29878 | pmulhrsw m3, m7 | |
29879 | pmaddubsw m5, m4, m6 | |
29880 | pmulhrsw m5, m7 | |
29881 | packuswb m3, m5 | |
29882 | movu [r0 + 1811 * 16], m3 | |
29883 | ||
29884 | ; mode 33 [row 4 - second half] | |
29885 | movu [r0 + 1993 * 16], m3 | |
29886 | ||
29887 | ; mode 30 [row 10] | |
29888 | movu m6, [r5 + 15 * 16] | |
29889 | pmaddubsw m3, m0, m6 | |
29890 | pmulhrsw m3, m7 | |
29891 | pmaddubsw m5, m2, m6 | |
29892 | pmulhrsw m5, m7 | |
29893 | packuswb m3, m5 | |
29894 | movu [r0 + 1812 * 16], m3 | |
29895 | pmaddubsw m3, m1, m6 | |
29896 | pmulhrsw m3, m7 | |
29897 | pmaddubsw m5, m4, m6 | |
29898 | pmulhrsw m5, m7 | |
29899 | packuswb m3, m5 | |
29900 | movu [r0 + 1813 * 16], m3 | |
29901 | ||
29902 | ; mode 31 [row 7] | |
29903 | movu m6, [r5 + 8 * 16] | |
29904 | pmaddubsw m3, m0, m6 | |
29905 | pmulhrsw m3, m7 | |
29906 | pmaddubsw m5, m2, m6 | |
29907 | pmulhrsw m5, m7 | |
29908 | packuswb m3, m5 | |
29909 | movu [r0 + 1870 * 16], m3 | |
29910 | pmaddubsw m3, m1, m6 | |
29911 | pmulhrsw m3, m7 | |
29912 | pmaddubsw m5, m4, m6 | |
29913 | pmulhrsw m5, m7 | |
29914 | packuswb m3, m5 | |
29915 | movu [r0 + 1871 * 16], m3 | |
29916 | ||
29917 | ; mode 31 [row 8] | |
29918 | movu m6, [r5 + 25 * 16] | |
29919 | pmaddubsw m3, m0, m6 | |
29920 | pmulhrsw m3, m7 | |
29921 | pmaddubsw m5, m2, m6 | |
29922 | pmulhrsw m5, m7 | |
29923 | packuswb m3, m5 | |
29924 | movu [r0 + 1872 * 16], m3 | |
29925 | pmaddubsw m3, m1, m6 | |
29926 | pmulhrsw m3, m7 | |
29927 | pmaddubsw m5, m4, m6 | |
29928 | pmulhrsw m5, m7 | |
29929 | packuswb m3, m5 | |
29930 | movu [r0 + 1873 * 16], m3 | |
29931 | ||
29932 | ; mode 32 [row 6] | |
29933 | movu m6, [r5 + 19 * 16] | |
29934 | pmaddubsw m3, m0, m6 | |
29935 | pmulhrsw m3, m7 | |
29936 | pmaddubsw m5, m2, m6 | |
29937 | pmulhrsw m5, m7 | |
29938 | packuswb m3, m5 | |
29939 | movu [r0 + 1932 * 16], m3 | |
29940 | pmaddubsw m3, m1, m6 | |
29941 | pmulhrsw m3, m7 | |
29942 | pmaddubsw m5, m4, m6 | |
29943 | pmulhrsw m5, m7 | |
29944 | packuswb m3, m5 | |
29945 | movu [r0 + 1933 * 16], m3 | |
29946 | ||
29947 | ; mode 30 [row 11] | |
29948 | movu m6, [r5 + 28 * 16] | |
29949 | pmaddubsw m3, m0, m6 | |
29950 | pmulhrsw m3, m7 | |
29951 | pmaddubsw m5, m2, m6 | |
29952 | pmulhrsw m5, m7 | |
29953 | packuswb m3, m5 | |
29954 | movu [r0 + 1814 * 16], m3 | |
29955 | ||
29956 | ; mode 33 [row 5 - first half] | |
29957 | movu [r0 + 1994 * 16], m3 | |
29958 | ||
29959 | pmaddubsw m3, m1, m6 | |
29960 | pmulhrsw m3, m7 | |
29961 | pmaddubsw m5, m4, m6 | |
29962 | pmulhrsw m5, m7 | |
29963 | packuswb m3, m5 | |
29964 | movu [r0 + 1815 * 16], m3 | |
29965 | ||
29966 | ; mode 33 [row 5 - second half] | |
29967 | movu [r0 + 1995 * 16], m3 | |
29968 | ||
29969 | ; mode 28 [row 31] | |
29970 | movu m0, [r3 + 6] | |
29971 | movd m1, [r3 + 7] | |
29972 | palignr m1, m0, 1 | |
29973 | punpcklbw m0, m1 | |
29974 | movu m2, [r3 + 14] | |
29975 | movd m3, [r3 + 15] | |
29976 | palignr m3, m2, 1 | |
29977 | punpcklbw m2, m3 | |
29978 | movu m1, [r3 + 22] | |
29979 | movd m3, [r3 + 23] | |
29980 | palignr m3, m1, 1 | |
29981 | punpcklbw m1, m3 | |
29982 | movu m4, [r3 + 30] | |
29983 | movd m5, [r3 + 31] | |
29984 | palignr m5, m4, 1 | |
29985 | punpcklbw m4, m5 | |
29986 | ||
29987 | pshufb m5, m0, [tab_S2] | |
29988 | movh [r0 + 1726 * 16], m5 | |
29989 | pshufb m5, m2, [tab_S2] | |
29990 | movh [r0 + 1726 * 16 + 8], m5 | |
29991 | pshufb m5, m1, [tab_S2] | |
29992 | movh [r0 + 1727 * 16], m5 | |
29993 | pshufb m5, m4, [tab_S2] | |
29994 | movh [r0 + 1727 * 16 + 8], m5 | |
29995 | ||
29996 | ; mode 29 [row 17] | |
29997 | movu m6, [r5 + 2 * 16] | |
29998 | pmaddubsw m3, m0, m6 | |
29999 | pmulhrsw m3, m7 | |
30000 | pmaddubsw m5, m2, m6 | |
30001 | pmulhrsw m5, m7 | |
30002 | packuswb m3, m5 | |
30003 | movu [r0 + 1762 * 16], m3 | |
30004 | pmaddubsw m3, m1, m6 | |
30005 | pmulhrsw m3, m7 | |
30006 | pmaddubsw m5, m4, m6 | |
30007 | pmulhrsw m5, m7 | |
30008 | packuswb m3, m5 | |
30009 | movu [r0 + 1763 * 16], m3 | |
30010 | ||
30011 | ; mode 29 [row 18] | |
30012 | movu m6, [r5 + 11 * 16] | |
30013 | pmaddubsw m3, m0, m6 | |
30014 | pmulhrsw m3, m7 | |
30015 | pmaddubsw m5, m2, m6 | |
30016 | pmulhrsw m5, m7 | |
30017 | packuswb m3, m5 | |
30018 | movu [r0 + 1764 * 16], m3 | |
30019 | pmaddubsw m3, m1, m6 | |
30020 | pmulhrsw m3, m7 | |
30021 | pmaddubsw m5, m4, m6 | |
30022 | pmulhrsw m5, m7 | |
30023 | packuswb m3, m5 | |
30024 | movu [r0 + 1765 * 16], m3 | |
30025 | ||
30026 | ; mode 29 [row 19] | |
30027 | movu m6, [r5 + 20 * 16] | |
30028 | pmaddubsw m3, m0, m6 | |
30029 | pmulhrsw m3, m7 | |
30030 | pmaddubsw m5, m2, m6 | |
30031 | pmulhrsw m5, m7 | |
30032 | packuswb m3, m5 | |
30033 | movu [r0 + 1766 * 16], m3 | |
30034 | pmaddubsw m3, m1, m6 | |
30035 | pmulhrsw m3, m7 | |
30036 | pmaddubsw m5, m4, m6 | |
30037 | pmulhrsw m5, m7 | |
30038 | packuswb m3, m5 | |
30039 | movu [r0 + 1767 * 16], m3 | |
30040 | ||
30041 | ; mode 29 [row 20] | |
30042 | movu m6, [r5 + 29 * 16] | |
30043 | pmaddubsw m3, m0, m6 | |
30044 | pmulhrsw m3, m7 | |
30045 | pmaddubsw m5, m2, m6 | |
30046 | pmulhrsw m5, m7 | |
30047 | packuswb m3, m5 | |
30048 | movu [r0 + 1768 * 16], m3 | |
30049 | ||
30050 | ; mode 32 [row 8 - first half] | |
30051 | movu [r0 + 1936 * 16], m3 | |
30052 | ||
30053 | pmaddubsw m3, m1, m6 | |
30054 | pmulhrsw m3, m7 | |
30055 | pmaddubsw m5, m4, m6 | |
30056 | pmulhrsw m5, m7 | |
30057 | packuswb m3, m5 | |
30058 | movu [r0 + 1769 * 16], m3 | |
30059 | ||
30060 | ; mode 32 [row 8 - second half] | |
30061 | movu [r0 + 1937 * 16], m3 | |
30062 | ||
30063 | ; mode 30 [row 12] | |
30064 | movu m6, [r5 + 9 * 16] | |
30065 | pmaddubsw m3, m0, m6 | |
30066 | pmulhrsw m3, m7 | |
30067 | pmaddubsw m5, m2, m6 | |
30068 | pmulhrsw m5, m7 | |
30069 | packuswb m3, m5 | |
30070 | movu [r0 + 1816 * 16], m3 | |
30071 | pmaddubsw m3, m1, m6 | |
30072 | pmulhrsw m3, m7 | |
30073 | pmaddubsw m5, m4, m6 | |
30074 | pmulhrsw m5, m7 | |
30075 | packuswb m3, m5 | |
30076 | movu [r0 + 1817 * 16], m3 | |
30077 | ||
30078 | ; mode 30 [row 13] | |
30079 | movu m6, [r5 + 22 * 16] | |
30080 | pmaddubsw m3, m0, m6 | |
30081 | pmulhrsw m3, m7 | |
30082 | pmaddubsw m5, m2, m6 | |
30083 | pmulhrsw m5, m7 | |
30084 | packuswb m3, m5 | |
30085 | movu [r0 + 1818 * 16], m3 | |
30086 | ||
30087 | ; mode 33 [row 6 - first half] | |
30088 | movu [r0 + 1996 * 16], m3 | |
30089 | ||
30090 | pmaddubsw m3, m1, m6 | |
30091 | pmulhrsw m3, m7 | |
30092 | pmaddubsw m5, m4, m6 | |
30093 | pmulhrsw m5, m7 | |
30094 | packuswb m3, m5 | |
30095 | movu [r0 + 1819 * 16], m3 | |
30096 | ||
30097 | ; mode 33 [row 6 - second half] | |
30098 | movu [r0 + 1997 * 16], m3 | |
30099 | ||
30100 | ; mode 31 [row 9] | |
30101 | movu m6, [r5 + 10 * 16] | |
30102 | pmaddubsw m3, m0, m6 | |
30103 | pmulhrsw m3, m7 | |
30104 | pmaddubsw m5, m2, m6 | |
30105 | pmulhrsw m5, m7 | |
30106 | packuswb m3, m5 | |
30107 | movu [r0 + 1874 * 16], m3 | |
30108 | pmaddubsw m3, m1, m6 | |
30109 | pmulhrsw m3, m7 | |
30110 | pmaddubsw m5, m4, m6 | |
30111 | pmulhrsw m5, m7 | |
30112 | packuswb m3, m5 | |
30113 | movu [r0 + 1875 * 16], m3 | |
30114 | ||
30115 | ; mode 31 [row 10] | |
30116 | movu m6, [r5 + 27 * 16] | |
30117 | pmaddubsw m3, m0, m6 | |
30118 | pmulhrsw m3, m7 | |
30119 | pmaddubsw m5, m2, m6 | |
30120 | pmulhrsw m5, m7 | |
30121 | packuswb m3, m5 | |
30122 | movu [r0 + 1876 * 16], m3 | |
30123 | pmaddubsw m3, m1, m6 | |
30124 | pmulhrsw m3, m7 | |
30125 | pmaddubsw m5, m4, m6 | |
30126 | pmulhrsw m5, m7 | |
30127 | packuswb m3, m5 | |
30128 | movu [r0 + 1877 * 16], m3 | |
30129 | ||
30130 | ; mode 32 [row 7] | |
30131 | movu m6, [r5 + 8 * 16] | |
30132 | pmaddubsw m3, m0, m6 | |
30133 | pmulhrsw m3, m7 | |
30134 | pmaddubsw m5, m2, m6 | |
30135 | pmulhrsw m5, m7 | |
30136 | packuswb m3, m5 | |
30137 | movu [r0 + 1934 * 16], m3 | |
30138 | pmaddubsw m3, m1, m6 | |
30139 | pmulhrsw m3, m7 | |
30140 | pmaddubsw m5, m4, m6 | |
30141 | pmulhrsw m5, m7 | |
30142 | packuswb m3, m5 | |
30143 | movu [r0 + 1935 * 16], m3 | |
30144 | ||
30145 | ; mode 29 [row 21] | |
30146 | movu m6, [r5 + 6 * 16] | |
30147 | movu m0, [r3 + 7] | |
30148 | movd m1, [r3 + 8] | |
30149 | palignr m1, m0, 1 | |
30150 | punpcklbw m0, m1 | |
30151 | pmaddubsw m3, m0, m6 | |
30152 | pmulhrsw m3, m7 | |
30153 | movu m2, [r3 + 15] | |
30154 | movd m4, [r3 + 16] | |
30155 | palignr m4, m2, 1 | |
30156 | punpcklbw m2, m4 | |
30157 | pmaddubsw m5, m2, m6 | |
30158 | pmulhrsw m5, m7 | |
30159 | packuswb m3, m5 | |
30160 | movu [r0 + 1770 * 16], m3 | |
30161 | ||
30162 | movu m1, [r3 + 23] | |
30163 | movd m3, [r3 + 24] | |
30164 | palignr m3, m1, 1 | |
30165 | punpcklbw m1, m3 | |
30166 | pmaddubsw m3, m1, m6 | |
30167 | pmulhrsw m3, m7 | |
30168 | movu m4, [r3 + 31] | |
30169 | movd m5, [r3 + 32] | |
30170 | palignr m5, m4, 1 | |
30171 | punpcklbw m4, m5 | |
30172 | pmaddubsw m5, m4, m6 | |
30173 | pmulhrsw m5, m7 | |
30174 | packuswb m3, m5 | |
30175 | movu [r0 + 1771 * 16], m3 | |
30176 | ||
30177 | ; mode 29 [row 22] | |
30178 | movu m6, [r5 + 15 * 16] | |
30179 | pmaddubsw m3, m0, m6 | |
30180 | pmulhrsw m3, m7 | |
30181 | pmaddubsw m5, m2, m6 | |
30182 | pmulhrsw m5, m7 | |
30183 | packuswb m3, m5 | |
30184 | movu [r0 + 1772 * 16], m3 | |
30185 | pmaddubsw m3, m1, m6 | |
30186 | pmulhrsw m3, m7 | |
30187 | pmaddubsw m5, m4, m6 | |
30188 | pmulhrsw m5, m7 | |
30189 | packuswb m3, m5 | |
30190 | movu [r0 + 1773 * 16], m3 | |
30191 | ||
30192 | ; mode 29 [row 23] | |
30193 | movu m6, [r5 + 24 * 16] | |
30194 | pmaddubsw m3, m0, m6 | |
30195 | pmulhrsw m3, m7 | |
30196 | pmaddubsw m5, m2, m6 | |
30197 | pmulhrsw m5, m7 | |
30198 | packuswb m3, m5 | |
30199 | movu [r0 + 1774 * 16], m3 | |
30200 | pmaddubsw m3, m1, m6 | |
30201 | pmulhrsw m3, m7 | |
30202 | pmaddubsw m5, m4, m6 | |
30203 | pmulhrsw m5, m7 | |
30204 | packuswb m3, m5 | |
30205 | movu [r0 + 1775 * 16], m3 | |
30206 | ||
30207 | ; mode 30 [row 14] | |
30208 | movu m6, [r5 + 3 * 16] | |
30209 | pmaddubsw m3, m0, m6 | |
30210 | pmulhrsw m3, m7 | |
30211 | pmaddubsw m5, m2, m6 | |
30212 | pmulhrsw m5, m7 | |
30213 | packuswb m3, m5 | |
30214 | movu [r0 + 1820 * 16], m3 | |
30215 | pmaddubsw m3, m1, m6 | |
30216 | pmulhrsw m3, m7 | |
30217 | pmaddubsw m5, m4, m6 | |
30218 | pmulhrsw m5, m7 | |
30219 | packuswb m3, m5 | |
30220 | movu [r0 + 1821 * 16], m3 | |
30221 | ||
30222 | ; mode 30 [row 15] | |
30223 | movu m6, [r5 + 16 * 16] | |
30224 | pmaddubsw m3, m0, m6 | |
30225 | pmulhrsw m3, m7 | |
30226 | pmaddubsw m5, m2, m6 | |
30227 | pmulhrsw m5, m7 | |
30228 | packuswb m3, m5 | |
30229 | movu [r0 + 1822 * 16], m3 | |
30230 | ||
30231 | ; mode 33 [row 7 - first half] | |
30232 | movu [r0 + 1998 * 16], m3 | |
30233 | ||
30234 | pmaddubsw m3, m1, m6 | |
30235 | pmulhrsw m3, m7 | |
30236 | pmaddubsw m5, m4, m6 | |
30237 | pmulhrsw m5, m7 | |
30238 | packuswb m3, m5 | |
30239 | movu [r0 + 1823 * 16], m3 | |
30240 | ||
30241 | ; mode 33 [row 7 - second half] | |
30242 | movu [r0 + 1999 * 16], m3 | |
30243 | ||
30244 | ; mode 30 [row 16] | |
30245 | movu m6, [r5 + 29 * 16] | |
30246 | pmaddubsw m3, m0, m6 | |
30247 | pmulhrsw m3, m7 | |
30248 | pmaddubsw m5, m2, m6 | |
30249 | pmulhrsw m5, m7 | |
30250 | packuswb m3, m5 | |
30251 | movu [r0 + 1824 * 16], m3 | |
30252 | ||
30253 | ; mode 31 [row 12 - first half] | |
30254 | movu [r0 + 1880 * 16], m3 | |
30255 | ||
30256 | pmaddubsw m3, m1, m6 | |
30257 | pmulhrsw m3, m7 | |
30258 | pmaddubsw m5, m4, m6 | |
30259 | pmulhrsw m5, m7 | |
30260 | packuswb m3, m5 | |
30261 | movu [r0 + 1825 * 16], m3 | |
30262 | ||
30263 | ; mode 31 [row 12 - second half] | |
30264 | movu [r0 + 1881 * 16], m3 | |
30265 | ||
30266 | ; mode 31 [row 11] | |
30267 | movu m6, [r5 + 12 * 16] | |
30268 | pmaddubsw m3, m0, m6 | |
30269 | pmulhrsw m3, m7 | |
30270 | pmaddubsw m5, m2, m6 | |
30271 | pmulhrsw m5, m7 | |
30272 | packuswb m3, m5 | |
30273 | movu [r0 + 1878 * 16], m3 | |
30274 | pmaddubsw m3, m1, m6 | |
30275 | pmulhrsw m3, m7 | |
30276 | pmaddubsw m5, m4, m6 | |
30277 | pmulhrsw m5, m7 | |
30278 | packuswb m3, m5 | |
30279 | movu [r0 + 1879 * 16], m3 | |
30280 | ||
30281 | ; mode 32 [row 9] | |
30282 | movu m6, [r5 + 18 * 16] | |
30283 | pmaddubsw m3, m0, m6 | |
30284 | pmulhrsw m3, m7 | |
30285 | pmaddubsw m5, m2, m6 | |
30286 | pmulhrsw m5, m7 | |
30287 | packuswb m3, m5 | |
30288 | movu [r0 + 1938 * 16], m3 | |
30289 | pmaddubsw m3, m1, m6 | |
30290 | pmulhrsw m3, m7 | |
30291 | pmaddubsw m5, m4, m6 | |
30292 | pmulhrsw m5, m7 | |
30293 | packuswb m3, m5 | |
30294 | movu [r0 + 1939 * 16], m3 | |
30295 | ||
30296 | ; mode 29 [row 24] | |
30297 | movu m6, [r5 + 1 * 16] | |
30298 | movu m0, [r3 + 8] | |
30299 | movd m1, [r3 + 9] | |
30300 | palignr m1, m0, 1 | |
30301 | punpcklbw m0, m1 | |
30302 | pmaddubsw m3, m0, m6 | |
30303 | pmulhrsw m3, m7 | |
30304 | movu m2, [r3 + 16] | |
30305 | movd m4, [r3 + 17] | |
30306 | palignr m4, m2, 1 | |
30307 | punpcklbw m2, m4 | |
30308 | pmaddubsw m5, m2, m6 | |
30309 | pmulhrsw m5, m7 | |
30310 | packuswb m3, m5 | |
30311 | movu [r0 + 1776 * 16], m3 | |
30312 | ||
30313 | movu m1, [r3 + 24] | |
30314 | movd m3, [r3 + 25] | |
30315 | palignr m3, m1, 1 | |
30316 | punpcklbw m1, m3 | |
30317 | pmaddubsw m3, m1, m6 | |
30318 | pmulhrsw m3, m7 | |
30319 | movu m4, [r3 + 32] | |
30320 | movd m5, [r3 + 33] | |
30321 | palignr m5, m4, 1 | |
30322 | punpcklbw m4, m5 | |
30323 | pmaddubsw m5, m4, m6 | |
30324 | pmulhrsw m5, m7 | |
30325 | packuswb m3, m5 | |
30326 | movu [r0 + 1777 * 16], m3 | |
30327 | ||
30328 | ; mode 29 [row 25] | |
30329 | movu m6, [r5 + 10 * 16] | |
30330 | pmaddubsw m3, m0, m6 | |
30331 | pmulhrsw m3, m7 | |
30332 | pmaddubsw m5, m2, m6 | |
30333 | pmulhrsw m5, m7 | |
30334 | packuswb m3, m5 | |
30335 | movu [r0 + 1778 * 16], m3 | |
30336 | ||
30337 | ; mode 30 [row 17 - first half] | |
30338 | movu [r0 + 1826 * 16], m3 | |
30339 | ||
30340 | ; mode 33 [row 8 - first half] | |
30341 | movu [r0 + 2000 * 16], m3 | |
30342 | ||
30343 | pmaddubsw m3, m1, m6 | |
30344 | pmulhrsw m3, m7 | |
30345 | pmaddubsw m5, m4, m6 | |
30346 | pmulhrsw m5, m7 | |
30347 | packuswb m3, m5 | |
30348 | movu [r0 + 1779 * 16], m3 | |
30349 | ||
30350 | ; mode 30 [row 17 - second half] | |
30351 | movu [r0 + 1827 * 16], m3 | |
30352 | ||
30353 | ; mode 33 [row 8 - second half] | |
30354 | movu [r0 + 2001 * 16], m3 | |
30355 | ||
30356 | ; mode 29 [row 26] | |
30357 | movu m6, [r5 + 19 * 16] | |
30358 | pmaddubsw m3, m0, m6 | |
30359 | pmulhrsw m3, m7 | |
30360 | pmaddubsw m5, m2, m6 | |
30361 | pmulhrsw m5, m7 | |
30362 | packuswb m3, m5 | |
30363 | movu [r0 + 1780 * 16], m3 | |
30364 | pmaddubsw m3, m1, m6 | |
30365 | pmulhrsw m3, m7 | |
30366 | pmaddubsw m5, m4, m6 | |
30367 | pmulhrsw m5, m7 | |
30368 | packuswb m3, m5 | |
30369 | movu [r0 + 1781 * 16], m3 | |
30370 | ||
30371 | ; mode 29 [row 27] | |
30372 | movu m6, [r5 + 28 * 16] | |
30373 | pmaddubsw m3, m0, m6 | |
30374 | pmulhrsw m3, m7 | |
30375 | pmaddubsw m5, m2, m6 | |
30376 | pmulhrsw m5, m7 | |
30377 | packuswb m3, m5 | |
30378 | movu [r0 + 1782 * 16], m3 | |
30379 | ||
30380 | ; mode 32 [row 11 - first half] | |
30381 | movu [r0 + 1942 * 16], m3 | |
30382 | ||
30383 | pmaddubsw m3, m1, m6 | |
30384 | pmulhrsw m3, m7 | |
30385 | pmaddubsw m5, m4, m6 | |
30386 | pmulhrsw m5, m7 | |
30387 | packuswb m3, m5 | |
30388 | movu [r0 + 1783 * 16], m3 | |
30389 | ||
30390 | ; mode 32 [row 11 - second half] | |
30391 | movu [r0 + 1943 * 16], m3 | |
30392 | ||
30393 | ; mode 30 [row 18] | |
30394 | movu m6, [r5 + 23 * 16] | |
30395 | pmaddubsw m3, m0, m6 | |
30396 | pmulhrsw m3, m7 | |
30397 | pmaddubsw m5, m2, m6 | |
30398 | pmulhrsw m5, m7 | |
30399 | packuswb m3, m5 | |
30400 | movu [r0 + 1828 * 16], m3 | |
30401 | pmaddubsw m3, m1, m6 | |
30402 | pmulhrsw m3, m7 | |
30403 | pmaddubsw m5, m4, m6 | |
30404 | pmulhrsw m5, m7 | |
30405 | packuswb m3, m5 | |
30406 | movu [r0 + 1829 * 16], m3 | |
30407 | ||
30408 | ; mode 31 [row 13] | |
30409 | movu m6, [r5 + 14 * 16] | |
30410 | pmaddubsw m3, m0, m6 | |
30411 | pmulhrsw m3, m7 | |
30412 | pmaddubsw m5, m2, m6 | |
30413 | pmulhrsw m5, m7 | |
30414 | packuswb m3, m5 | |
30415 | movu [r0 + 1882 * 16], m3 | |
30416 | pmaddubsw m3, m1, m6 | |
30417 | pmulhrsw m3, m7 | |
30418 | pmaddubsw m5, m4, m6 | |
30419 | pmulhrsw m5, m7 | |
30420 | packuswb m3, m5 | |
30421 | movu [r0 + 1883 * 16], m3 | |
30422 | ||
30423 | ; mode 31 [row 14] | |
30424 | movu m6, [r5 + 31 * 16] | |
30425 | pmaddubsw m3, m0, m6 | |
30426 | pmulhrsw m3, m7 | |
30427 | pmaddubsw m5, m2, m6 | |
30428 | pmulhrsw m5, m7 | |
30429 | packuswb m3, m5 | |
30430 | movu [r0 + 1884 * 16], m3 | |
30431 | pmaddubsw m3, m1, m6 | |
30432 | pmulhrsw m3, m7 | |
30433 | pmaddubsw m5, m4, m6 | |
30434 | pmulhrsw m5, m7 | |
30435 | packuswb m3, m5 | |
30436 | movu [r0 + 1885 * 16], m3 | |
30437 | ||
30438 | ; mode 32 [row 10] | |
30439 | movu m6, [r5 + 7 * 16] | |
30440 | pmaddubsw m3, m0, m6 | |
30441 | pmulhrsw m3, m7 | |
30442 | pmaddubsw m5, m2, m6 | |
30443 | pmulhrsw m5, m7 | |
30444 | packuswb m3, m5 | |
30445 | movu [r0 + 1940 * 16], m3 | |
30446 | pmaddubsw m3, m1, m6 | |
30447 | pmulhrsw m3, m7 | |
30448 | pmaddubsw m5, m4, m6 | |
30449 | pmulhrsw m5, m7 | |
30450 | packuswb m3, m5 | |
30451 | movu [r0 + 1941 * 16], m3 | |
30452 | ||
30453 | ; mode 29 [row 28] | |
30454 | movu m6, [r5 + 5 * 16] | |
30455 | movu m0, [r3 + 9] | |
30456 | movd m1, [r3 + 10] | |
30457 | palignr m1, m0, 1 | |
30458 | punpcklbw m0, m1 | |
30459 | pmaddubsw m3, m0, m6 | |
30460 | pmulhrsw m3, m7 | |
30461 | movu m2, [r3 + 17] | |
30462 | movd m4, [r3 + 18] | |
30463 | palignr m4, m2, 1 | |
30464 | punpcklbw m2, m4 | |
30465 | pmaddubsw m5, m2, m6 | |
30466 | pmulhrsw m5, m7 | |
30467 | packuswb m3, m5 | |
30468 | movu [r0 + 1784 * 16], m3 | |
30469 | ||
30470 | movu m1, [r3 + 25] | |
30471 | movd m3, [r3 + 26] | |
30472 | palignr m3, m1, 1 | |
30473 | punpcklbw m1, m3 | |
30474 | pmaddubsw m3, m1, m6 | |
30475 | pmulhrsw m3, m7 | |
30476 | movu m4, [r3 + 33] | |
30477 | movd m5, [r3 + 34] | |
30478 | palignr m5, m4, 1 | |
30479 | punpcklbw m4, m5 | |
30480 | pmaddubsw m5, m4, m6 | |
30481 | pmulhrsw m5, m7 | |
30482 | packuswb m3, m5 | |
30483 | movu [r0 + 1785 * 16], m3 | |
30484 | ||
30485 | ; mode 29 [row 29] | |
30486 | movu m6, [r5 + 14 * 16] | |
30487 | pmaddubsw m3, m0, m6 | |
30488 | pmulhrsw m3, m7 | |
30489 | pmaddubsw m5, m2, m6 | |
30490 | pmulhrsw m5, m7 | |
30491 | packuswb m3, m5 | |
30492 | movu [r0 + 1786 * 16], m3 | |
30493 | pmaddubsw m3, m1, m6 | |
30494 | pmulhrsw m3, m7 | |
30495 | pmaddubsw m5, m4, m6 | |
30496 | pmulhrsw m5, m7 | |
30497 | packuswb m3, m5 | |
30498 | movu [r0 + 1787 * 16], m3 | |
30499 | ||
30500 | ; mode 29 [row 30] | |
30501 | movu m6, [r5 + 23 * 16] | |
30502 | pmaddubsw m3, m0, m6 | |
30503 | pmulhrsw m3, m7 | |
30504 | pmaddubsw m5, m2, m6 | |
30505 | pmulhrsw m5, m7 | |
30506 | packuswb m3, m5 | |
30507 | movu [r0 + 1788 * 16], m3 | |
30508 | pmaddubsw m3, m1, m6 | |
30509 | pmulhrsw m3, m7 | |
30510 | pmaddubsw m5, m4, m6 | |
30511 | pmulhrsw m5, m7 | |
30512 | packuswb m3, m5 | |
30513 | movu [r0 + 1789 * 16], m3 | |
30514 | ||
30515 | ; mode 30 [row 19] | |
30516 | movu m6, [r5 + 4 * 16] | |
30517 | pmaddubsw m3, m0, m6 | |
30518 | pmulhrsw m3, m7 | |
30519 | pmaddubsw m5, m2, m6 | |
30520 | pmulhrsw m5, m7 | |
30521 | packuswb m3, m5 | |
30522 | movu [r0 + 1830 * 16], m3 | |
30523 | ||
30524 | ; mode 33 [row 9 - first half] | |
30525 | movu [r0 + 2002 * 16], m3 | |
30526 | ||
30527 | pmaddubsw m3, m1, m6 | |
30528 | pmulhrsw m3, m7 | |
30529 | pmaddubsw m5, m4, m6 | |
30530 | pmulhrsw m5, m7 | |
30531 | packuswb m3, m5 | |
30532 | movu [r0 + 1831 * 16], m3 | |
30533 | ||
30534 | ; mode 33 [row 9 - second half] | |
30535 | movu [r0 + 2003 * 16], m3 | |
30536 | ||
30537 | ; mode 30 [row 20] | |
30538 | movu m6, [r5 + 17 * 16] | |
30539 | pmaddubsw m3, m0, m6 | |
30540 | pmulhrsw m3, m7 | |
30541 | pmaddubsw m5, m2, m6 | |
30542 | pmulhrsw m5, m7 | |
30543 | packuswb m3, m5 | |
30544 | movu [r0 + 1832 * 16], m3 | |
30545 | ||
30546 | ; mode 32 [row 12 - first half] | |
30547 | movu [r0 + 1944 * 16], m3 | |
30548 | ||
30549 | pmaddubsw m3, m1, m6 | |
30550 | pmulhrsw m3, m7 | |
30551 | pmaddubsw m5, m4, m6 | |
30552 | pmulhrsw m5, m7 | |
30553 | packuswb m3, m5 | |
30554 | movu [r0 + 1833 * 16], m3 | |
30555 | ||
30556 | ; mode 32 [row 12 - second half] | |
30557 | movu [r0 + 1945 * 16], m3 | |
30558 | ||
30559 | ; mode 30 [row 21] | |
30560 | movu m6, [r5 + 30 * 16] | |
30561 | pmaddubsw m3, m0, m6 | |
30562 | pmulhrsw m3, m7 | |
30563 | pmaddubsw m5, m2, m6 | |
30564 | pmulhrsw m5, m7 | |
30565 | packuswb m3, m5 | |
30566 | movu [r0 + 1834 * 16], m3 | |
30567 | ||
30568 | ; mode 33 [row 10 - first half] | |
30569 | movu [r0 + 2004 * 16], m3 | |
30570 | ||
30571 | pmaddubsw m3, m1, m6 | |
30572 | pmulhrsw m3, m7 | |
30573 | pmaddubsw m5, m4, m6 | |
30574 | pmulhrsw m5, m7 | |
30575 | packuswb m3, m5 | |
30576 | movu [r0 + 1835 * 16], m3 | |
30577 | ||
30578 | ; mode 33 [row 10 - second half] | |
30579 | movu [r0 + 2005 * 16], m3 | |
30580 | ||
30581 | ; mode 31 [row 15] | |
30582 | movu m6, [r5 + 16 * 16] | |
30583 | pmaddubsw m3, m0, m6 | |
30584 | pmulhrsw m3, m7 | |
30585 | pmaddubsw m5, m2, m6 | |
30586 | pmulhrsw m5, m7 | |
30587 | packuswb m3, m5 | |
30588 | movu [r0 + 1886 * 16], m3 | |
30589 | pmaddubsw m3, m1, m6 | |
30590 | pmulhrsw m3, m7 | |
30591 | pmaddubsw m5, m4, m6 | |
30592 | pmulhrsw m5, m7 | |
30593 | packuswb m3, m5 | |
30594 | movu [r0 + 1887 * 16], m3 | |
30595 | ||
30596 | ; mode 29 [row 31] | |
30597 | movu m0, [r3 + 10] | |
30598 | movd m1, [r3 + 11] | |
30599 | palignr m1, m0, 1 | |
30600 | punpcklbw m0, m1 | |
30601 | movu m2, [r3 + 18] | |
30602 | movd m3, [r3 + 19] | |
30603 | palignr m3, m2, 1 | |
30604 | punpcklbw m2, m3 | |
30605 | movu m1, [r3 + 26] | |
30606 | movd m3, [r3 + 27] | |
30607 | palignr m3, m1, 1 | |
30608 | punpcklbw m1, m3 | |
30609 | movu m4, [r3 + 34] | |
30610 | movd m5, [r3 + 35] | |
30611 | palignr m5, m4, 1 | |
30612 | punpcklbw m4, m5 | |
30613 | ||
30614 | pshufb m5, m0, [tab_S2] | |
30615 | movh [r0 + 1790 * 16], m5 | |
30616 | pshufb m5, m2, [tab_S2] | |
30617 | movh [r0 + 1790 * 16 + 8], m5 | |
30618 | pshufb m5, m1, [tab_S2] | |
30619 | movh [r0 + 1791 * 16], m5 | |
30620 | pshufb m5, m4, [tab_S2] | |
30621 | movh [r0 + 1791 * 16 + 8], m5 | |
30622 | ||
30623 | ; mode 30 [row 22] | |
30624 | movu m6, [r5 + 11 * 16] | |
30625 | pmaddubsw m3, m0, m6 | |
30626 | pmulhrsw m3, m7 | |
30627 | pmaddubsw m5, m2, m6 | |
30628 | pmulhrsw m5, m7 | |
30629 | packuswb m3, m5 | |
30630 | movu [r0 + 1836 * 16], m3 | |
30631 | pmaddubsw m3, m1, m6 | |
30632 | pmulhrsw m3, m7 | |
30633 | pmaddubsw m5, m4, m6 | |
30634 | pmulhrsw m5, m7 | |
30635 | packuswb m3, m5 | |
30636 | movu [r0 + 1837 * 16], m3 | |
30637 | ||
30638 | ; mode 30 [row 23] | |
30639 | movu m6, [r5 + 24 * 16] | |
30640 | pmaddubsw m3, m0, m6 | |
30641 | pmulhrsw m3, m7 | |
30642 | pmaddubsw m5, m2, m6 | |
30643 | pmulhrsw m5, m7 | |
30644 | packuswb m3, m5 | |
30645 | movu [r0 + 1838 * 16], m3 | |
30646 | ||
30647 | ; mode 33 [row 11 - first half] | |
30648 | movu [r0 + 2006 * 16], m3 | |
30649 | ||
30650 | pmaddubsw m3, m1, m6 | |
30651 | pmulhrsw m3, m7 | |
30652 | pmaddubsw m5, m4, m6 | |
30653 | pmulhrsw m5, m7 | |
30654 | packuswb m3, m5 | |
30655 | movu [r0 + 1839 * 16], m3 | |
30656 | ||
30657 | ; mode 33 [row 11 - second half] | |
30658 | movu [r0 + 2007 * 16], m3 | |
30659 | ||
30660 | ; mode 31 [row 16] | |
30661 | movu m6, [r5 + 1 * 16] | |
30662 | pmaddubsw m3, m0, m6 | |
30663 | pmulhrsw m3, m7 | |
30664 | pmaddubsw m5, m2, m6 | |
30665 | pmulhrsw m5, m7 | |
30666 | packuswb m3, m5 | |
30667 | movu [r0 + 1888 * 16], m3 | |
30668 | pmaddubsw m3, m1, m6 | |
30669 | pmulhrsw m3, m7 | |
30670 | pmaddubsw m5, m4, m6 | |
30671 | pmulhrsw m5, m7 | |
30672 | packuswb m3, m5 | |
30673 | movu [r0 + 1889 * 16], m3 | |
30674 | ||
30675 | ; mode 31 [row 17] | |
30676 | movu m6, [r5 + 18 * 16] | |
30677 | pmaddubsw m3, m0, m6 | |
30678 | pmulhrsw m3, m7 | |
30679 | pmaddubsw m5, m2, m6 | |
30680 | pmulhrsw m5, m7 | |
30681 | packuswb m3, m5 | |
30682 | movu [r0 + 1890 * 16], m3 | |
30683 | pmaddubsw m3, m1, m6 | |
30684 | pmulhrsw m3, m7 | |
30685 | pmaddubsw m5, m4, m6 | |
30686 | pmulhrsw m5, m7 | |
30687 | packuswb m3, m5 | |
30688 | movu [r0 + 1891 * 16], m3 | |
30689 | ||
30690 | ; mode 32 [row 13] | |
30691 | movu m6, [r5 + 6 * 16] | |
30692 | pmaddubsw m3, m0, m6 | |
30693 | pmulhrsw m3, m7 | |
30694 | pmaddubsw m5, m2, m6 | |
30695 | pmulhrsw m5, m7 | |
30696 | packuswb m3, m5 | |
30697 | movu [r0 + 1946 * 16], m3 | |
30698 | pmaddubsw m3, m1, m6 | |
30699 | pmulhrsw m3, m7 | |
30700 | pmaddubsw m5, m4, m6 | |
30701 | pmulhrsw m5, m7 | |
30702 | packuswb m3, m5 | |
30703 | movu [r0 + 1947 * 16], m3 | |
30704 | ||
30705 | ; mode 32 [row 14] | |
30706 | movu m6, [r5 + 27 * 16] | |
30707 | pmaddubsw m3, m0, m6 | |
30708 | pmulhrsw m3, m7 | |
30709 | pmaddubsw m5, m2, m6 | |
30710 | pmulhrsw m5, m7 | |
30711 | packuswb m3, m5 | |
30712 | movu [r0 + 1948 * 16], m3 | |
30713 | pmaddubsw m3, m1, m6 | |
30714 | pmulhrsw m3, m7 | |
30715 | pmaddubsw m5, m4, m6 | |
30716 | pmulhrsw m5, m7 | |
30717 | packuswb m3, m5 | |
30718 | movu [r0 + 1949 * 16], m3 | |
30719 | ||
30720 | ; mode 30 [row 24] | |
30721 | movu m6, [r5 + 5 * 16] | |
30722 | movu m0, [r3 + 11] | |
30723 | movd m1, [r3 + 12] | |
30724 | palignr m1, m0, 1 | |
30725 | punpcklbw m0, m1 | |
30726 | pmaddubsw m3, m0, m6 | |
30727 | pmulhrsw m3, m7 | |
30728 | movu m2, [r3 + 19] | |
30729 | movd m4, [r3 + 20] | |
30730 | palignr m4, m2, 1 | |
30731 | punpcklbw m2, m4 | |
30732 | pmaddubsw m5, m2, m6 | |
30733 | pmulhrsw m5, m7 | |
30734 | packuswb m3, m5 | |
30735 | movu [r0 + 1840 * 16], m3 | |
30736 | ||
30737 | movu m1, [r3 + 27] | |
30738 | movd m3, [r3 + 28] | |
30739 | palignr m3, m1, 1 | |
30740 | punpcklbw m1, m3 | |
30741 | pmaddubsw m3, m1, m6 | |
30742 | pmulhrsw m3, m7 | |
30743 | movu m4, [r3 + 35] | |
30744 | movd m5, [r3 + 36] | |
30745 | palignr m5, m4, 1 | |
30746 | punpcklbw m4, m5 | |
30747 | pmaddubsw m5, m4, m6 | |
30748 | pmulhrsw m5, m7 | |
30749 | packuswb m3, m5 | |
30750 | movu [r0 + 1841 * 16], m3 | |
30751 | ||
30752 | ; mode 30 [row 25] | |
30753 | movu m6, [r5 + 18 * 16] | |
30754 | pmaddubsw m3, m0, m6 | |
30755 | pmulhrsw m3, m7 | |
30756 | pmaddubsw m5, m2, m6 | |
30757 | pmulhrsw m5, m7 | |
30758 | packuswb m3, m5 | |
30759 | movu [r0 + 1842 * 16], m3 | |
30760 | ||
30761 | ; mode 33 [row 12 - first half] | |
30762 | movu [r0 + 2008 * 16], m3 | |
30763 | ||
30764 | pmaddubsw m3, m1, m6 | |
30765 | pmulhrsw m3, m7 | |
30766 | pmaddubsw m5, m4, m6 | |
30767 | pmulhrsw m5, m7 | |
30768 | packuswb m3, m5 | |
30769 | movu [r0 + 1843 * 16], m3 | |
30770 | ||
30771 | ; mode 33 [row 12 - second half] | |
30772 | movu [r0 + 2009 * 16], m3 | |
30773 | ||
30774 | ; mode 30 [row 26] | |
30775 | movu m6, [r5 + 31 * 16] | |
30776 | pmaddubsw m3, m0, m6 | |
30777 | pmulhrsw m3, m7 | |
30778 | pmaddubsw m5, m2, m6 | |
30779 | pmulhrsw m5, m7 | |
30780 | packuswb m3, m5 | |
30781 | movu [r0 + 1844 * 16], m3 | |
30782 | pmaddubsw m3, m1, m6 | |
30783 | pmulhrsw m3, m7 | |
30784 | pmaddubsw m5, m4, m6 | |
30785 | pmulhrsw m5, m7 | |
30786 | packuswb m3, m5 | |
30787 | movu [r0 + 1845 * 16], m3 | |
30788 | ||
30789 | ; mode 31 [row 18] | |
30790 | movu m6, [r5 + 3 * 16] | |
30791 | pmaddubsw m3, m0, m6 | |
30792 | pmulhrsw m3, m7 | |
30793 | pmaddubsw m5, m2, m6 | |
30794 | pmulhrsw m5, m7 | |
30795 | packuswb m3, m5 | |
30796 | movu [r0 + 1892 * 16], m3 | |
30797 | pmaddubsw m3, m1, m6 | |
30798 | pmulhrsw m3, m7 | |
30799 | pmaddubsw m5, m4, m6 | |
30800 | pmulhrsw m5, m7 | |
30801 | packuswb m3, m5 | |
30802 | movu [r0 + 1893 * 16], m3 | |
30803 | ||
30804 | ; mode 31 [row 19] | |
30805 | movu m6, [r5 + 20 * 16] | |
30806 | pmaddubsw m3, m0, m6 | |
30807 | pmulhrsw m3, m7 | |
30808 | pmaddubsw m5, m2, m6 | |
30809 | pmulhrsw m5, m7 | |
30810 | packuswb m3, m5 | |
30811 | movu [r0 + 1894 * 16], m3 | |
30812 | pmaddubsw m3, m1, m6 | |
30813 | pmulhrsw m3, m7 | |
30814 | pmaddubsw m5, m4, m6 | |
30815 | pmulhrsw m5, m7 | |
30816 | packuswb m3, m5 | |
30817 | movu [r0 + 1895 * 16], m3 | |
30818 | ||
30819 | ; mode 32 [row 15] | |
30820 | movu m6, [r5 + 16 * 16] | |
30821 | pmaddubsw m3, m0, m6 | |
30822 | pmulhrsw m3, m7 | |
30823 | pmaddubsw m5, m2, m6 | |
30824 | pmulhrsw m5, m7 | |
30825 | packuswb m3, m5 | |
30826 | movu [r0 + 1950 * 16], m3 | |
30827 | pmaddubsw m3, m1, m6 | |
30828 | pmulhrsw m3, m7 | |
30829 | pmaddubsw m5, m4, m6 | |
30830 | pmulhrsw m5, m7 | |
30831 | packuswb m3, m5 | |
30832 | movu [r0 + 1951 * 16], m3 | |
30833 | ||
30834 | ; mode 30 [row 27] | |
30835 | movu m6, [r5 + 12 * 16] | |
30836 | movu m0, [r3 + 12] | |
30837 | movd m1, [r3 + 13] | |
30838 | palignr m1, m0, 1 | |
30839 | punpcklbw m0, m1 | |
30840 | pmaddubsw m3, m0, m6 | |
30841 | pmulhrsw m3, m7 | |
30842 | movu m2, [r3 + 20] | |
30843 | movd m4, [r3 + 21] | |
30844 | palignr m4, m2, 1 | |
30845 | punpcklbw m2, m4 | |
30846 | pmaddubsw m5, m2, m6 | |
30847 | pmulhrsw m5, m7 | |
30848 | packuswb m3, m5 | |
30849 | movu [r0 + 1846 * 16], m3 | |
30850 | ||
30851 | ; mode 33 [row 13 - first half] | |
30852 | movu [r0 + 2010 * 16], m3 | |
30853 | ||
30854 | movu m1, [r3 + 28] | |
30855 | movd m3, [r3 + 29] | |
30856 | palignr m3, m1, 1 | |
30857 | punpcklbw m1, m3 | |
30858 | pmaddubsw m3, m1, m6 | |
30859 | pmulhrsw m3, m7 | |
30860 | movu m4, [r3 + 36] | |
30861 | movd m5, [r3 + 37] | |
30862 | palignr m5, m4, 1 | |
30863 | punpcklbw m4, m5 | |
30864 | pmaddubsw m5, m4, m6 | |
30865 | pmulhrsw m5, m7 | |
30866 | packuswb m3, m5 | |
30867 | movu [r0 + 1847 * 16], m3 | |
30868 | ||
30869 | ; mode 33 [row 13 - second half] | |
30870 | movu [r0 + 2011 * 16], m3 | |
30871 | ||
30872 | ; mode 30 [row 28] | |
30873 | movu m6, [r5 + 25 * 16] | |
30874 | pmaddubsw m3, m0, m6 | |
30875 | pmulhrsw m3, m7 | |
30876 | pmaddubsw m5, m2, m6 | |
30877 | pmulhrsw m5, m7 | |
30878 | packuswb m3, m5 | |
30879 | movu [r0 + 1848 * 16], m3 | |
30880 | pmaddubsw m3, m1, m6 | |
30881 | pmulhrsw m3, m7 | |
30882 | pmaddubsw m5, m4, m6 | |
30883 | pmulhrsw m5, m7 | |
30884 | packuswb m3, m5 | |
30885 | movu [r0 + 1849 * 16], m3 | |
30886 | ||
30887 | ; mode 31 [row 20] | |
30888 | movu m6, [r5 + 5 * 16] | |
30889 | pmaddubsw m3, m0, m6 | |
30890 | pmulhrsw m3, m7 | |
30891 | pmaddubsw m5, m2, m6 | |
30892 | pmulhrsw m5, m7 | |
30893 | packuswb m3, m5 | |
30894 | movu [r0 + 1896 * 16], m3 | |
30895 | ||
30896 | ; mode 32 [row 16 - first half] | |
30897 | movu [r0 + 1952 * 16], m3 | |
30898 | ||
30899 | pmaddubsw m3, m1, m6 | |
30900 | pmulhrsw m3, m7 | |
30901 | pmaddubsw m5, m4, m6 | |
30902 | pmulhrsw m5, m7 | |
30903 | packuswb m3, m5 | |
30904 | movu [r0 + 1897 * 16], m3 | |
30905 | ||
30906 | ; mode 32 [row 16 - second half] | |
30907 | movu [r0 + 1953 * 16], m3 | |
30908 | ||
30909 | ; mode 31 [row 21] | |
30910 | movu m6, [r5 + 22 * 16] | |
30911 | pmaddubsw m3, m0, m6 | |
30912 | pmulhrsw m3, m7 | |
30913 | pmaddubsw m5, m2, m6 | |
30914 | pmulhrsw m5, m7 | |
30915 | packuswb m3, m5 | |
30916 | movu [r0 + 1898 * 16], m3 | |
30917 | pmaddubsw m3, m1, m6 | |
30918 | pmulhrsw m3, m7 | |
30919 | pmaddubsw m5, m4, m6 | |
30920 | pmulhrsw m5, m7 | |
30921 | packuswb m3, m5 | |
30922 | movu [r0 + 1899 * 16], m3 | |
30923 | ||
30924 | ; mode 32 [row 17] | |
30925 | movu m6, [r5 + 26 * 16] | |
30926 | pmaddubsw m3, m0, m6 | |
30927 | pmulhrsw m3, m7 | |
30928 | pmaddubsw m5, m2, m6 | |
30929 | pmulhrsw m5, m7 | |
30930 | packuswb m3, m5 | |
30931 | movu [r0 + 1954 * 16], m3 | |
30932 | pmaddubsw m3, m1, m6 | |
30933 | pmulhrsw m3, m7 | |
30934 | pmaddubsw m5, m4, m6 | |
30935 | pmulhrsw m5, m7 | |
30936 | packuswb m3, m5 | |
30937 | movu [r0 + 1955 * 16], m3 | |
30938 | ||
30939 | ; mode 30 [row 29] | |
30940 | movu m6, [r5 + 6 * 16] | |
30941 | movu m0, [r3 + 13] | |
30942 | movd m1, [r3 + 14] | |
30943 | palignr m1, m0, 1 | |
30944 | punpcklbw m0, m1 | |
30945 | pmaddubsw m3, m0, m6 | |
30946 | pmulhrsw m3, m7 | |
30947 | movu m2, [r3 + 21] | |
30948 | movd m4, [r3 + 22] | |
30949 | palignr m4, m2, 1 | |
30950 | punpcklbw m2, m4 | |
30951 | pmaddubsw m5, m2, m6 | |
30952 | pmulhrsw m5, m7 | |
30953 | packuswb m3, m5 | |
30954 | movu [r0 + 1850 * 16], m3 | |
30955 | ||
30956 | ; mode 33 [row 14 - first half] | |
30957 | movu [r0 + 2012 * 16], m3 | |
30958 | ||
30959 | movu m1, [r3 + 29] | |
30960 | movd m3, [r3 + 30] | |
30961 | palignr m3, m1, 1 | |
30962 | punpcklbw m1, m3 | |
30963 | pmaddubsw m3, m1, m6 | |
30964 | pmulhrsw m3, m7 | |
30965 | movu m4, [r3 + 37] | |
30966 | movd m5, [r3 + 38] | |
30967 | palignr m5, m4, 1 | |
30968 | punpcklbw m4, m5 | |
30969 | pmaddubsw m5, m4, m6 | |
30970 | pmulhrsw m5, m7 | |
30971 | packuswb m3, m5 | |
30972 | movu [r0 + 1851 * 16], m3 | |
30973 | ||
30974 | ; mode 33 [row 14 - second half] | |
30975 | movu [r0 + 2013 * 16], m3 | |
30976 | ||
30977 | ; mode 30 [row 30] | |
30978 | movu m6, [r5 + 19 * 16] | |
30979 | pmaddubsw m3, m0, m6 | |
30980 | pmulhrsw m3, m7 | |
30981 | pmaddubsw m5, m2, m6 | |
30982 | pmulhrsw m5, m7 | |
30983 | packuswb m3, m5 | |
30984 | movu [r0 + 1852 * 16], m3 | |
30985 | pmaddubsw m3, m1, m6 | |
30986 | pmulhrsw m3, m7 | |
30987 | pmaddubsw m5, m4, m6 | |
30988 | pmulhrsw m5, m7 | |
30989 | packuswb m3, m5 | |
30990 | movu [r0 + 1853 * 16], m3 | |
30991 | ||
30992 | ; mode 31 [row 22] | |
30993 | movu m6, [r5 + 7 * 16] | |
30994 | pmaddubsw m3, m0, m6 | |
30995 | pmulhrsw m3, m7 | |
30996 | pmaddubsw m5, m2, m6 | |
30997 | pmulhrsw m5, m7 | |
30998 | packuswb m3, m5 | |
30999 | movu [r0 + 1900 * 16], m3 | |
31000 | pmaddubsw m3, m1, m6 | |
31001 | pmulhrsw m3, m7 | |
31002 | pmaddubsw m5, m4, m6 | |
31003 | pmulhrsw m5, m7 | |
31004 | packuswb m3, m5 | |
31005 | movu [r0 + 1901 * 16], m3 | |
31006 | ||
31007 | ; mode 31 [row 23] | |
31008 | movu m6, [r5 + 24 * 16] | |
31009 | pmaddubsw m3, m0, m6 | |
31010 | pmulhrsw m3, m7 | |
31011 | pmaddubsw m5, m2, m6 | |
31012 | pmulhrsw m5, m7 | |
31013 | packuswb m3, m5 | |
31014 | movu [r0 + 1902 * 16], m3 | |
31015 | pmaddubsw m3, m1, m6 | |
31016 | pmulhrsw m3, m7 | |
31017 | pmaddubsw m5, m4, m6 | |
31018 | pmulhrsw m5, m7 | |
31019 | packuswb m3, m5 | |
31020 | movu [r0 + 1903 * 16], m3 | |
31021 | ||
31022 | ; mode 32 [row 18] | |
31023 | movu m6, [r5 + 15 * 16] | |
31024 | pmaddubsw m3, m0, m6 | |
31025 | pmulhrsw m3, m7 | |
31026 | pmaddubsw m5, m2, m6 | |
31027 | pmulhrsw m5, m7 | |
31028 | packuswb m3, m5 | |
31029 | movu [r0 + 1956 * 16], m3 | |
31030 | pmaddubsw m3, m1, m6 | |
31031 | pmulhrsw m3, m7 | |
31032 | pmaddubsw m5, m4, m6 | |
31033 | pmulhrsw m5, m7 | |
31034 | packuswb m3, m5 | |
31035 | movu [r0 + 1957 * 16], m3 | |
31036 | ||
31037 | ; mode 30 [row 31] | |
31038 | movu m0, [r3 + 14] | |
31039 | movd m1, [r3 + 15] | |
31040 | palignr m1, m0, 1 | |
31041 | punpcklbw m0, m1 | |
31042 | movu m2, [r3 + 22] | |
31043 | movd m3, [r3 + 23] | |
31044 | palignr m3, m2, 1 | |
31045 | punpcklbw m2, m3 | |
31046 | movu m1, [r3 + 30] | |
31047 | movd m3, [r3 + 31] | |
31048 | palignr m3, m1, 1 | |
31049 | punpcklbw m1, m3 | |
31050 | movu m4, [r3 + 38] | |
31051 | movd m5, [r3 + 39] | |
31052 | palignr m5, m4, 1 | |
31053 | punpcklbw m4, m5 | |
31054 | ||
31055 | pshufb m5, m0, [tab_S2] | |
31056 | movh [r0 + 1854 * 16], m5 | |
31057 | ||
31058 | ; mode 33 [row 15 - first eight] | |
31059 | movh [r0 + 2014 * 16], m5 | |
31060 | ||
31061 | pshufb m5, m2, [tab_S2] | |
31062 | movh [r0 + 1854 * 16 + 8], m5 | |
31063 | ||
31064 | ; mode 33 [row 15 - second eight] | |
31065 | movh [r0 + 2014 * 16 + 8], m5 | |
31066 | ||
31067 | pshufb m5, m1, [tab_S2] | |
31068 | movh [r0 + 1855 * 16], m5 | |
31069 | ||
31070 | ; mode 33 [row 15 - third eight] | |
31071 | movh [r0 + 2015 * 16], m5 | |
31072 | ||
31073 | pshufb m5, m4, [tab_S2] | |
31074 | movh [r0 + 1855 * 16 + 8], m5 | |
31075 | ||
31076 | ; mode 33 [row 15 - fourth eight] | |
31077 | movh [r0 + 2015 * 16 + 8], m5 | |
31078 | ||
31079 | ; mode 31 [row 24] | |
31080 | movu m6, [r5 + 9 * 16] | |
31081 | pmaddubsw m3, m0, m6 | |
31082 | pmulhrsw m3, m7 | |
31083 | pmaddubsw m5, m2, m6 | |
31084 | pmulhrsw m5, m7 | |
31085 | packuswb m3, m5 | |
31086 | movu [r0 + 1904 * 16], m3 | |
31087 | pmaddubsw m3, m1, m6 | |
31088 | pmulhrsw m3, m7 | |
31089 | pmaddubsw m5, m4, m6 | |
31090 | pmulhrsw m5, m7 | |
31091 | packuswb m3, m5 | |
31092 | movu [r0 + 1905 * 16], m3 | |
31093 | ||
31094 | ; mode 31 [row 25] | |
31095 | movu m6, [r5 + 26 * 16] | |
31096 | pmaddubsw m3, m0, m6 | |
31097 | pmulhrsw m3, m7 | |
31098 | pmaddubsw m5, m2, m6 | |
31099 | pmulhrsw m5, m7 | |
31100 | packuswb m3, m5 | |
31101 | movu [r0 + 1906 * 16], m3 | |
31102 | ||
31103 | ; mode 33 [row 16 - first half] | |
31104 | movu [r0 + 2016 * 16], m3 | |
31105 | ||
31106 | pmaddubsw m3, m1, m6 | |
31107 | pmulhrsw m3, m7 | |
31108 | pmaddubsw m5, m4, m6 | |
31109 | pmulhrsw m5, m7 | |
31110 | packuswb m3, m5 | |
31111 | movu [r0 + 1907 * 16], m3 | |
31112 | ||
31113 | ; mode 33 [row 16 - second half] | |
31114 | movu [r0 + 2017 * 16], m3 | |
31115 | ||
31116 | ; mode 32 [row 19] | |
31117 | movu m6, [r5 + 4 * 16] | |
31118 | pmaddubsw m3, m0, m6 | |
31119 | pmulhrsw m3, m7 | |
31120 | pmaddubsw m5, m2, m6 | |
31121 | pmulhrsw m5, m7 | |
31122 | packuswb m3, m5 | |
31123 | movu [r0 + 1958 * 16], m3 | |
31124 | pmaddubsw m3, m1, m6 | |
31125 | pmulhrsw m3, m7 | |
31126 | pmaddubsw m5, m4, m6 | |
31127 | pmulhrsw m5, m7 | |
31128 | packuswb m3, m5 | |
31129 | movu [r0 + 1959 * 16], m3 | |
31130 | ||
31131 | ; mode 32 [row 20] | |
31132 | movu m6, [r5 + 25 * 16] | |
31133 | pmaddubsw m3, m0, m6 | |
31134 | pmulhrsw m3, m7 | |
31135 | pmaddubsw m5, m2, m6 | |
31136 | pmulhrsw m5, m7 | |
31137 | packuswb m3, m5 | |
31138 | movu [r0 + 1960 * 16], m3 | |
31139 | pmaddubsw m3, m1, m6 | |
31140 | pmulhrsw m3, m7 | |
31141 | pmaddubsw m5, m4, m6 | |
31142 | pmulhrsw m5, m7 | |
31143 | packuswb m3, m5 | |
31144 | movu [r0 + 1961 * 16], m3 | |
31145 | ||
31146 | ; mode 31 [row 26] | |
31147 | movu m6, [r5 + 11 * 16] | |
31148 | movu m0, [r3 + 15] | |
31149 | movd m1, [r3 + 16] | |
31150 | palignr m1, m0, 1 | |
31151 | punpcklbw m0, m1 | |
31152 | pmaddubsw m3, m0, m6 | |
31153 | pmulhrsw m3, m7 | |
31154 | movu m2, [r3 + 23] | |
31155 | movd m4, [r3 + 24] | |
31156 | palignr m4, m2, 1 | |
31157 | punpcklbw m2, m4 | |
31158 | pmaddubsw m5, m2, m6 | |
31159 | pmulhrsw m5, m7 | |
31160 | packuswb m3, m5 | |
31161 | movu [r0 + 1908 * 16], m3 | |
31162 | ||
31163 | movu m1, [r3 + 31] | |
31164 | movd m3, [r3 + 32] | |
31165 | palignr m3, m1, 1 | |
31166 | punpcklbw m1, m3 | |
31167 | pmaddubsw m3, m1, m6 | |
31168 | pmulhrsw m3, m7 | |
31169 | movu m4, [r3 + 39] | |
31170 | movd m5, [r3 + 40] | |
31171 | palignr m5, m4, 1 | |
31172 | punpcklbw m4, m5 | |
31173 | pmaddubsw m5, m4, m6 | |
31174 | pmulhrsw m5, m7 | |
31175 | packuswb m3, m5 | |
31176 | movu [r0 + 1909 * 16], m3 | |
31177 | ||
31178 | ; mode 31 [row 27] | |
31179 | movu m6, [r5 + 28 * 16] | |
31180 | pmaddubsw m3, m0, m6 | |
31181 | pmulhrsw m3, m7 | |
31182 | pmaddubsw m5, m2, m6 | |
31183 | pmulhrsw m5, m7 | |
31184 | packuswb m3, m5 | |
31185 | movu [r0 + 1910 * 16], m3 | |
31186 | pmaddubsw m3, m1, m6 | |
31187 | pmulhrsw m3, m7 | |
31188 | pmaddubsw m5, m4, m6 | |
31189 | pmulhrsw m5, m7 | |
31190 | packuswb m3, m5 | |
31191 | movu [r0 + 1911 * 16], m3 | |
31192 | ||
31193 | ; mode 32 [row 21] | |
31194 | movu m6, [r5 + 14 * 16] | |
31195 | pmaddubsw m3, m0, m6 | |
31196 | pmulhrsw m3, m7 | |
31197 | pmaddubsw m5, m2, m6 | |
31198 | pmulhrsw m5, m7 | |
31199 | packuswb m3, m5 | |
31200 | movu [r0 + 1962 * 16], m3 | |
31201 | pmaddubsw m3, m1, m6 | |
31202 | pmulhrsw m3, m7 | |
31203 | pmaddubsw m5, m4, m6 | |
31204 | pmulhrsw m5, m7 | |
31205 | packuswb m3, m5 | |
31206 | movu [r0 + 1963 * 16], m3 | |
31207 | ||
31208 | ; mode 33 [row 17] | |
31209 | movu m6, [r5 + 20 * 16] | |
31210 | pmaddubsw m3, m0, m6 | |
31211 | pmulhrsw m3, m7 | |
31212 | pmaddubsw m5, m2, m6 | |
31213 | pmulhrsw m5, m7 | |
31214 | packuswb m3, m5 | |
31215 | movu [r0 + 2018 * 16], m3 | |
31216 | pmaddubsw m3, m1, m6 | |
31217 | pmulhrsw m3, m7 | |
31218 | pmaddubsw m5, m4, m6 | |
31219 | pmulhrsw m5, m7 | |
31220 | packuswb m3, m5 | |
31221 | movu [r0 + 2019 * 16], m3 | |
31222 | ||
31223 | ; mode 31 [row 28] | |
31224 | movu m6, [r5 + 13 * 16] | |
31225 | movu m0, [r3 + 16] | |
31226 | movd m1, [r3 + 17] | |
31227 | palignr m1, m0, 1 | |
31228 | punpcklbw m0, m1 | |
31229 | pmaddubsw m3, m0, m6 | |
31230 | pmulhrsw m3, m7 | |
31231 | movu m2, [r3 + 24] | |
31232 | movd m4, [r3 + 25] | |
31233 | palignr m4, m2, 1 | |
31234 | punpcklbw m2, m4 | |
31235 | pmaddubsw m5, m2, m6 | |
31236 | pmulhrsw m5, m7 | |
31237 | packuswb m3, m5 | |
31238 | movu [r0 + 1912 * 16], m3 | |
31239 | ||
31240 | movu m1, [r3 + 32] | |
31241 | movd m3, [r3 + 33] | |
31242 | palignr m3, m1, 1 | |
31243 | punpcklbw m1, m3 | |
31244 | pmaddubsw m3, m1, m6 | |
31245 | pmulhrsw m3, m7 | |
31246 | movu m4, [r3 + 40] | |
31247 | movd m5, [r3 + 41] | |
31248 | palignr m5, m4, 1 | |
31249 | punpcklbw m4, m5 | |
31250 | pmaddubsw m5, m4, m6 | |
31251 | pmulhrsw m5, m7 | |
31252 | packuswb m3, m5 | |
31253 | movu [r0 + 1913 * 16], m3 | |
31254 | ||
31255 | ; mode 31 [row 29] | |
31256 | movu m6, [r5 + 30 * 16] | |
31257 | pmaddubsw m3, m0, m6 | |
31258 | pmulhrsw m3, m7 | |
31259 | pmaddubsw m5, m2, m6 | |
31260 | pmulhrsw m5, m7 | |
31261 | packuswb m3, m5 | |
31262 | movu [r0 + 1914 * 16], m3 | |
31263 | pmaddubsw m3, m1, m6 | |
31264 | pmulhrsw m3, m7 | |
31265 | pmaddubsw m5, m4, m6 | |
31266 | pmulhrsw m5, m7 | |
31267 | packuswb m3, m5 | |
31268 | movu [r0 + 1915 * 16], m3 | |
31269 | ||
31270 | ; mode 32 [row 22] | |
31271 | movu m6, [r5 + 3 * 16] | |
31272 | pmaddubsw m3, m0, m6 | |
31273 | pmulhrsw m3, m7 | |
31274 | pmaddubsw m5, m2, m6 | |
31275 | pmulhrsw m5, m7 | |
31276 | packuswb m3, m5 | |
31277 | movu [r0 + 1964 * 16], m3 | |
31278 | pmaddubsw m3, m1, m6 | |
31279 | pmulhrsw m3, m7 | |
31280 | pmaddubsw m5, m4, m6 | |
31281 | pmulhrsw m5, m7 | |
31282 | packuswb m3, m5 | |
31283 | movu [r0 + 1965 * 16], m3 | |
31284 | ||
31285 | ; mode 32 [row 23] | |
31286 | movu m6, [r5 + 24 * 16] | |
31287 | pmaddubsw m3, m0, m6 | |
31288 | pmulhrsw m3, m7 | |
31289 | pmaddubsw m5, m2, m6 | |
31290 | pmulhrsw m5, m7 | |
31291 | packuswb m3, m5 | |
31292 | movu [r0 + 1966 * 16], m3 | |
31293 | pmaddubsw m3, m1, m6 | |
31294 | pmulhrsw m3, m7 | |
31295 | pmaddubsw m5, m4, m6 | |
31296 | pmulhrsw m5, m7 | |
31297 | packuswb m3, m5 | |
31298 | movu [r0 + 1967 * 16], m3 | |
31299 | ||
31300 | ; mode 33 [row 18] | |
31301 | movu m6, [r5 + 14 * 16] | |
31302 | pmaddubsw m3, m0, m6 | |
31303 | pmulhrsw m3, m7 | |
31304 | pmaddubsw m5, m2, m6 | |
31305 | pmulhrsw m5, m7 | |
31306 | packuswb m3, m5 | |
31307 | movu [r0 + 2020 * 16], m3 | |
31308 | pmaddubsw m3, m1, m6 | |
31309 | pmulhrsw m3, m7 | |
31310 | pmaddubsw m5, m4, m6 | |
31311 | pmulhrsw m5, m7 | |
31312 | packuswb m3, m5 | |
31313 | movu [r0 + 2021 * 16], m3 | |
31314 | ||
31315 | ; mode 31 [row 30] | |
31316 | movu m6, [r5 + 15 * 16] | |
31317 | movu m0, [r3 + 17] | |
31318 | movd m1, [r3 + 18] | |
31319 | palignr m1, m0, 1 | |
31320 | punpcklbw m0, m1 | |
31321 | pmaddubsw m3, m0, m6 | |
31322 | pmulhrsw m3, m7 | |
31323 | movu m2, [r3 + 25] | |
31324 | movd m4, [r3 + 26] | |
31325 | palignr m4, m2, 1 | |
31326 | punpcklbw m2, m4 | |
31327 | pmaddubsw m5, m2, m6 | |
31328 | pmulhrsw m5, m7 | |
31329 | packuswb m3, m5 | |
31330 | movu [r0 + 1916 * 16], m3 | |
31331 | ||
31332 | movu m1, [r3 + 33] | |
31333 | movd m3, [r3 + 34] | |
31334 | palignr m3, m1, 1 | |
31335 | punpcklbw m1, m3 | |
31336 | pmaddubsw m3, m1, m6 | |
31337 | pmulhrsw m3, m7 | |
31338 | movu m4, [r3 + 41] | |
31339 | movd m5, [r3 + 42] | |
31340 | palignr m5, m4, 1 | |
31341 | punpcklbw m4, m5 | |
31342 | pmaddubsw m5, m4, m6 | |
31343 | pmulhrsw m5, m7 | |
31344 | packuswb m3, m5 | |
31345 | movu [r0 + 1917 * 16], m3 | |
31346 | ||
31347 | ; mode 32 [row 24] | |
31348 | movu m6, [r5 + 13 * 16] | |
31349 | pmaddubsw m3, m0, m6 | |
31350 | pmulhrsw m3, m7 | |
31351 | pmaddubsw m5, m2, m6 | |
31352 | pmulhrsw m5, m7 | |
31353 | packuswb m3, m5 | |
31354 | movu [r0 + 1968 * 16], m3 | |
31355 | pmaddubsw m3, m1, m6 | |
31356 | pmulhrsw m3, m7 | |
31357 | pmaddubsw m5, m4, m6 | |
31358 | pmulhrsw m5, m7 | |
31359 | packuswb m3, m5 | |
31360 | movu [r0 + 1969 * 16], m3 | |
31361 | ||
31362 | ; mode 33 [row 19] | |
31363 | movu m6, [r5 + 8 * 16] | |
31364 | pmaddubsw m3, m0, m6 | |
31365 | pmulhrsw m3, m7 | |
31366 | pmaddubsw m5, m2, m6 | |
31367 | pmulhrsw m5, m7 | |
31368 | packuswb m3, m5 | |
31369 | movu [r0 + 2022 * 16], m3 | |
31370 | pmaddubsw m3, m1, m6 | |
31371 | pmulhrsw m3, m7 | |
31372 | pmaddubsw m5, m4, m6 | |
31373 | pmulhrsw m5, m7 | |
31374 | packuswb m3, m5 | |
31375 | movu [r0 + 2023 * 16], m3 | |
31376 | ||
31377 | ; mode 31 [row 31] | |
31378 | movu m0, [r3 + 18] | |
31379 | movd m1, [r3 + 19] | |
31380 | palignr m1, m0, 1 | |
31381 | punpcklbw m0, m1 | |
31382 | movu m2, [r3 + 26] | |
31383 | movd m3, [r3 + 27] | |
31384 | palignr m3, m2, 1 | |
31385 | punpcklbw m2, m3 | |
31386 | movu m1, [r3 + 34] | |
31387 | movd m3, [r3 + 35] | |
31388 | palignr m3, m1, 1 | |
31389 | punpcklbw m1, m3 | |
31390 | movu m4, [r3 + 42] | |
31391 | movd m5, [r3 + 43] | |
31392 | palignr m5, m4, 1 | |
31393 | punpcklbw m4, m5 | |
31394 | ||
31395 | pshufb m5, m0, [tab_S2] | |
31396 | movh [r0 + 1918 * 16], m5 | |
31397 | pshufb m5, m2, [tab_S2] | |
31398 | movh [r0 + 1918 * 16 + 8], m5 | |
31399 | pshufb m5, m1, [tab_S2] | |
31400 | movh [r0 + 1919 * 16], m5 | |
31401 | pshufb m5, m4, [tab_S2] | |
31402 | movh [r0 + 1919 * 16 + 8], m5 | |
31403 | ||
31404 | ; mode 32 [row 25] | |
31405 | movu m6, [r5 + 2 * 16] | |
31406 | pmaddubsw m3, m0, m6 | |
31407 | pmulhrsw m3, m7 | |
31408 | pmaddubsw m5, m2, m6 | |
31409 | pmulhrsw m5, m7 | |
31410 | packuswb m3, m5 | |
31411 | movu [r0 + 1970 * 16], m3 | |
31412 | ||
31413 | ; mode 33 [row 20 - first half] | |
31414 | movu [r0 + 2024 * 16], m3 | |
31415 | ||
31416 | pmaddubsw m3, m1, m6 | |
31417 | pmulhrsw m3, m7 | |
31418 | pmaddubsw m5, m4, m6 | |
31419 | pmulhrsw m5, m7 | |
31420 | packuswb m3, m5 | |
31421 | movu [r0 + 1971 * 16], m3 | |
31422 | ||
31423 | ; mode 33 [row 20 - second half] | |
31424 | movu [r0 + 2025 * 16], m3 | |
31425 | ||
31426 | ; mode 32 [row 26] | |
31427 | movu m6, [r5 + 23 * 16] | |
31428 | pmaddubsw m3, m0, m6 | |
31429 | pmulhrsw m3, m7 | |
31430 | pmaddubsw m5, m2, m6 | |
31431 | pmulhrsw m5, m7 | |
31432 | packuswb m3, m5 | |
31433 | movu [r0 + 1972 * 16], m3 | |
31434 | pmaddubsw m3, m1, m6 | |
31435 | pmulhrsw m3, m7 | |
31436 | pmaddubsw m5, m4, m6 | |
31437 | pmulhrsw m5, m7 | |
31438 | packuswb m3, m5 | |
31439 | movu [r0 + 1973 * 16], m3 | |
31440 | ||
31441 | ; mode 33 [row 21] | |
31442 | movu m6, [r5 + 28 * 16] | |
31443 | pmaddubsw m3, m0, m6 | |
31444 | pmulhrsw m3, m7 | |
31445 | pmaddubsw m5, m2, m6 | |
31446 | pmulhrsw m5, m7 | |
31447 | packuswb m3, m5 | |
31448 | movu [r0 + 2026 * 16], m3 | |
31449 | pmaddubsw m3, m1, m6 | |
31450 | pmulhrsw m3, m7 | |
31451 | pmaddubsw m5, m4, m6 | |
31452 | pmulhrsw m5, m7 | |
31453 | packuswb m3, m5 | |
31454 | movu [r0 + 2027 * 16], m3 | |
31455 | ||
31456 | ; mode 32 [row 27] | |
31457 | movu m6, [r5 + 12 * 16] | |
31458 | movu m0, [r3 + 19] | |
31459 | movd m1, [r3 + 20] | |
31460 | palignr m1, m0, 1 | |
31461 | punpcklbw m0, m1 | |
31462 | pmaddubsw m3, m0, m6 | |
31463 | pmulhrsw m3, m7 | |
31464 | movu m2, [r3 + 27] | |
31465 | movd m4, [r3 + 28] | |
31466 | palignr m4, m2, 1 | |
31467 | punpcklbw m2, m4 | |
31468 | pmaddubsw m5, m2, m6 | |
31469 | pmulhrsw m5, m7 | |
31470 | packuswb m3, m5 | |
31471 | movu [r0 + 1974 * 16], m3 | |
31472 | ||
31473 | movu m1, [r3 + 35] | |
31474 | movd m3, [r3 + 36] | |
31475 | palignr m3, m1, 1 | |
31476 | punpcklbw m1, m3 | |
31477 | pmaddubsw m3, m1, m6 | |
31478 | pmulhrsw m3, m7 | |
31479 | movu m4, [r3 + 43] | |
31480 | movd m5, [r3 + 44] | |
31481 | palignr m5, m4, 1 | |
31482 | punpcklbw m4, m5 | |
31483 | pmaddubsw m5, m4, m6 | |
31484 | pmulhrsw m5, m7 | |
31485 | packuswb m3, m5 | |
31486 | movu [r0 + 1975 * 16], m3 | |
31487 | ||
31488 | ; mode 33 [row 22] | |
31489 | movu m6, [r5 + 22 * 16] | |
31490 | pmaddubsw m3, m0, m6 | |
31491 | pmulhrsw m3, m7 | |
31492 | pmaddubsw m5, m2, m6 | |
31493 | pmulhrsw m5, m7 | |
31494 | packuswb m3, m5 | |
31495 | movu [r0 + 2028 * 16], m3 | |
31496 | pmaddubsw m3, m1, m6 | |
31497 | pmulhrsw m3, m7 | |
31498 | pmaddubsw m5, m4, m6 | |
31499 | pmulhrsw m5, m7 | |
31500 | packuswb m3, m5 | |
31501 | movu [r0 + 2029 * 16], m3 | |
31502 | ||
31503 | ; mode 32 [row 28] | |
31504 | movu m6, [r5 + 1 * 16] | |
31505 | movu m0, [r3 + 20] | |
31506 | movd m1, [r3 + 21] | |
31507 | palignr m1, m0, 1 | |
31508 | punpcklbw m0, m1 | |
31509 | pmaddubsw m3, m0, m6 | |
31510 | pmulhrsw m3, m7 | |
31511 | movu m2, [r3 + 28] | |
31512 | movd m4, [r3 + 29] | |
31513 | palignr m4, m2, 1 | |
31514 | punpcklbw m2, m4 | |
31515 | pmaddubsw m5, m2, m6 | |
31516 | pmulhrsw m5, m7 | |
31517 | packuswb m3, m5 | |
31518 | movu [r0 + 1976 * 16], m3 | |
31519 | ||
31520 | movu m1, [r3 + 36] | |
31521 | movd m3, [r3 + 37] | |
31522 | palignr m3, m1, 1 | |
31523 | punpcklbw m1, m3 | |
31524 | pmaddubsw m3, m1, m6 | |
31525 | pmulhrsw m3, m7 | |
31526 | movu m4, [r3 + 44] | |
31527 | movd m5, [r3 + 45] | |
31528 | palignr m5, m4, 1 | |
31529 | punpcklbw m4, m5 | |
31530 | pmaddubsw m5, m4, m6 | |
31531 | pmulhrsw m5, m7 | |
31532 | packuswb m3, m5 | |
31533 | movu [r0 + 1977 * 16], m3 | |
31534 | ||
31535 | ; mode 32 [row 29] | |
31536 | movu m6, [r5 + 22 * 16] | |
31537 | pmaddubsw m3, m0, m6 | |
31538 | pmulhrsw m3, m7 | |
31539 | pmaddubsw m5, m2, m6 | |
31540 | pmulhrsw m5, m7 | |
31541 | packuswb m3, m5 | |
31542 | movu [r0 + 1978 * 16], m3 | |
31543 | pmaddubsw m3, m1, m6 | |
31544 | pmulhrsw m3, m7 | |
31545 | pmaddubsw m5, m4, m6 | |
31546 | pmulhrsw m5, m7 | |
31547 | packuswb m3, m5 | |
31548 | movu [r0 + 1979 * 16], m3 | |
31549 | ||
31550 | ; mode 33 [row 23] | |
31551 | movu m6, [r5 + 16 * 16] | |
31552 | pmaddubsw m3, m0, m6 | |
31553 | pmulhrsw m3, m7 | |
31554 | pmaddubsw m5, m2, m6 | |
31555 | pmulhrsw m5, m7 | |
31556 | packuswb m3, m5 | |
31557 | movu [r0 + 2030 * 16], m3 | |
31558 | pmaddubsw m3, m1, m6 | |
31559 | pmulhrsw m3, m7 | |
31560 | pmaddubsw m5, m4, m6 | |
31561 | pmulhrsw m5, m7 | |
31562 | packuswb m3, m5 | |
31563 | movu [r0 + 2031 * 16], m3 | |
31564 | ||
31565 | ; mode 32 [row 30] | |
31566 | movu m6, [r5 + 11 * 16] | |
31567 | movu m0, [r3 + 21] | |
31568 | movd m1, [r3 + 22] | |
31569 | palignr m1, m0, 1 | |
31570 | punpcklbw m0, m1 | |
31571 | pmaddubsw m3, m0, m6 | |
31572 | pmulhrsw m3, m7 | |
31573 | movu m2, [r3 + 29] | |
31574 | movd m4, [r3 + 30] | |
31575 | palignr m4, m2, 1 | |
31576 | punpcklbw m2, m4 | |
31577 | pmaddubsw m5, m2, m6 | |
31578 | pmulhrsw m5, m7 | |
31579 | packuswb m3, m5 | |
31580 | movu [r0 + 1980 * 16], m3 | |
31581 | ||
31582 | movu m1, [r3 + 37] | |
31583 | movd m3, [r3 + 38] | |
31584 | palignr m3, m1, 1 | |
31585 | punpcklbw m1, m3 | |
31586 | pmaddubsw m3, m1, m6 | |
31587 | pmulhrsw m3, m7 | |
31588 | movu m4, [r3 + 45] | |
31589 | movd m5, [r3 + 46] | |
31590 | palignr m5, m4, 1 | |
31591 | punpcklbw m4, m5 | |
31592 | pmaddubsw m5, m4, m6 | |
31593 | pmulhrsw m5, m7 | |
31594 | packuswb m3, m5 | |
31595 | movu [r0 + 1981 * 16], m3 | |
31596 | ||
31597 | ; mode 33 [row 24] | |
31598 | movu m6, [r5 + 10 * 16] | |
31599 | pmaddubsw m3, m0, m6 | |
31600 | pmulhrsw m3, m7 | |
31601 | pmaddubsw m5, m2, m6 | |
31602 | pmulhrsw m5, m7 | |
31603 | packuswb m3, m5 | |
31604 | movu [r0 + 2032 * 16], m3 | |
31605 | pmaddubsw m3, m1, m6 | |
31606 | pmulhrsw m3, m7 | |
31607 | pmaddubsw m5, m4, m6 | |
31608 | pmulhrsw m5, m7 | |
31609 | packuswb m3, m5 | |
31610 | movu [r0 + 2033 * 16], m3 | |
31611 | ||
31612 | ; mode 32 [row 31] | |
31613 | movu m0, [r3 + 22] | |
31614 | movd m1, [r3 + 23] | |
31615 | palignr m1, m0, 1 | |
31616 | punpcklbw m0, m1 | |
31617 | movu m2, [r3 + 30] | |
31618 | movd m3, [r3 + 31] | |
31619 | palignr m3, m2, 1 | |
31620 | punpcklbw m2, m3 | |
31621 | movu m1, [r3 + 38] | |
31622 | movd m3, [r3 + 39] | |
31623 | palignr m3, m1, 1 | |
31624 | punpcklbw m1, m3 | |
31625 | movu m4, [r3 + 46] | |
31626 | movd m5, [r3 + 47] | |
31627 | palignr m5, m4, 1 | |
31628 | punpcklbw m4, m5 | |
31629 | ||
31630 | pshufb m5, m0, [tab_S2] | |
31631 | movh [r0 + 1982 * 16], m5 | |
31632 | pshufb m5, m2, [tab_S2] | |
31633 | movh [r0 + 1982 * 16 + 8], m5 | |
31634 | pshufb m5, m1, [tab_S2] | |
31635 | movh [r0 + 1983 * 16], m5 | |
31636 | pshufb m5, m4, [tab_S2] | |
31637 | movh [r0 + 1983 * 16 + 8], m5 | |
31638 | ||
31639 | ; mode 33 [row 25] | |
31640 | movu m6, [r5 + 4 * 16] | |
31641 | pmaddubsw m3, m0, m6 | |
31642 | pmulhrsw m3, m7 | |
31643 | pmaddubsw m5, m2, m6 | |
31644 | pmulhrsw m5, m7 | |
31645 | packuswb m3, m5 | |
31646 | movu [r0 + 2034 * 16], m3 | |
31647 | pmaddubsw m3, m1, m6 | |
31648 | pmulhrsw m3, m7 | |
31649 | pmaddubsw m5, m4, m6 | |
31650 | pmulhrsw m5, m7 | |
31651 | packuswb m3, m5 | |
31652 | movu [r0 + 2035 * 16], m3 | |
31653 | ||
31654 | ; mode 33 [row 26] | |
31655 | movu m6, [r5 + 30 * 16] | |
31656 | pmaddubsw m3, m0, m6 | |
31657 | pmulhrsw m3, m7 | |
31658 | pmaddubsw m5, m2, m6 | |
31659 | pmulhrsw m5, m7 | |
31660 | packuswb m3, m5 | |
31661 | movu [r0 + 2036 * 16], m3 | |
31662 | pmaddubsw m3, m1, m6 | |
31663 | pmulhrsw m3, m7 | |
31664 | pmaddubsw m5, m4, m6 | |
31665 | pmulhrsw m5, m7 | |
31666 | packuswb m3, m5 | |
31667 | movu [r0 + 2037 * 16], m3 | |
31668 | ||
31669 | ; mode 33 [row 27] | |
31670 | movu m6, [r5 + 24 * 16] | |
31671 | movu m0, [r3 + 23] | |
31672 | movd m1, [r3 + 24] | |
31673 | palignr m1, m0, 1 | |
31674 | punpcklbw m0, m1 | |
31675 | pmaddubsw m3, m0, m6 | |
31676 | pmulhrsw m3, m7 | |
31677 | movu m2, [r3 + 31] | |
31678 | movd m4, [r3 + 32] | |
31679 | palignr m4, m2, 1 | |
31680 | punpcklbw m2, m4 | |
31681 | pmaddubsw m5, m2, m6 | |
31682 | pmulhrsw m5, m7 | |
31683 | packuswb m3, m5 | |
31684 | movu [r0 + 2038 * 16], m3 | |
31685 | ||
31686 | movu m1, [r3 + 39] | |
31687 | movd m3, [r3 + 40] | |
31688 | palignr m3, m1, 1 | |
31689 | punpcklbw m1, m3 | |
31690 | pmaddubsw m3, m1, m6 | |
31691 | pmulhrsw m3, m7 | |
31692 | movu m4, [r3 + 47] | |
31693 | movd m5, [r3 + 48] | |
31694 | palignr m5, m4, 1 | |
31695 | punpcklbw m4, m5 | |
31696 | pmaddubsw m5, m4, m6 | |
31697 | pmulhrsw m5, m7 | |
31698 | packuswb m3, m5 | |
31699 | movu [r0 + 2039 * 16], m3 | |
31700 | ||
31701 | ; mode 33 [row 28] | |
31702 | movu m6, [r5 + 18 * 16] | |
31703 | movu m0, [r3 + 24] | |
31704 | movd m1, [r3 + 25] | |
31705 | palignr m1, m0, 1 | |
31706 | punpcklbw m0, m1 | |
31707 | pmaddubsw m3, m0, m6 | |
31708 | pmulhrsw m3, m7 | |
31709 | movu m2, [r3 + 32] | |
31710 | movd m4, [r3 + 33] | |
31711 | palignr m4, m2, 1 | |
31712 | punpcklbw m2, m4 | |
31713 | pmaddubsw m5, m2, m6 | |
31714 | pmulhrsw m5, m7 | |
31715 | packuswb m3, m5 | |
31716 | movu [r0 + 2040 * 16], m3 | |
31717 | ||
31718 | movu m1, [r3 + 40] | |
31719 | movd m3, [r3 + 41] | |
31720 | palignr m3, m1, 1 | |
31721 | punpcklbw m1, m3 | |
31722 | pmaddubsw m3, m1, m6 | |
31723 | pmulhrsw m3, m7 | |
31724 | movu m4, [r3 + 48] | |
31725 | movd m5, [r3 + 49] | |
31726 | palignr m5, m4, 1 | |
31727 | punpcklbw m4, m5 | |
31728 | pmaddubsw m5, m4, m6 | |
31729 | pmulhrsw m5, m7 | |
31730 | packuswb m3, m5 | |
31731 | movu [r0 + 2041 * 16], m3 | |
31732 | ||
31733 | ; mode 33 [row 29] | |
31734 | movu m6, [r5 + 12 * 16] | |
31735 | movu m0, [r3 + 25] | |
31736 | movd m1, [r3 + 26] | |
31737 | palignr m1, m0, 1 | |
31738 | punpcklbw m0, m1 | |
31739 | pmaddubsw m3, m0, m6 | |
31740 | pmulhrsw m3, m7 | |
31741 | movu m2, [r3 + 33] | |
31742 | movd m4, [r3 + 34] | |
31743 | palignr m4, m2, 1 | |
31744 | punpcklbw m2, m4 | |
31745 | pmaddubsw m5, m2, m6 | |
31746 | pmulhrsw m5, m7 | |
31747 | packuswb m3, m5 | |
31748 | movu [r0 + 2042 * 16], m3 | |
31749 | ||
31750 | movu m1, [r3 + 41] | |
31751 | movd m3, [r3 + 42] | |
31752 | palignr m3, m1, 1 | |
31753 | punpcklbw m1, m3 | |
31754 | pmaddubsw m3, m1, m6 | |
31755 | pmulhrsw m3, m7 | |
31756 | movu m4, [r3 + 49] | |
31757 | movd m5, [r3 + 50] | |
31758 | palignr m5, m4, 1 | |
31759 | punpcklbw m4, m5 | |
31760 | pmaddubsw m5, m4, m6 | |
31761 | pmulhrsw m5, m7 | |
31762 | packuswb m3, m5 | |
31763 | movu [r0 + 2043 * 16], m3 | |
31764 | ||
31765 | ; mode 33 [row 30] | |
31766 | movu m6, [r5 + 6 * 16] | |
31767 | movu m0, [r3 + 26] | |
31768 | movd m1, [r3 + 27] | |
31769 | palignr m1, m0, 1 | |
31770 | punpcklbw m0, m1 | |
31771 | pmaddubsw m3, m0, m6 | |
31772 | pmulhrsw m3, m7 | |
31773 | movu m2, [r3 + 34] | |
31774 | movd m4, [r3 + 35] | |
31775 | palignr m4, m2, 1 | |
31776 | punpcklbw m2, m4 | |
31777 | pmaddubsw m5, m2, m6 | |
31778 | pmulhrsw m5, m7 | |
31779 | packuswb m3, m5 | |
31780 | movu [r0 + 2044 * 16], m3 | |
31781 | ||
31782 | movu m1, [r3 + 42] | |
31783 | movd m3, [r3 + 43] | |
31784 | palignr m3, m1, 1 | |
31785 | punpcklbw m1, m3 | |
31786 | pmaddubsw m3, m1, m6 | |
31787 | pmulhrsw m3, m7 | |
31788 | movu m4, [r3 + 50] | |
31789 | movd m5, [r3 + 51] | |
31790 | palignr m5, m4, 1 | |
31791 | punpcklbw m4, m5 | |
31792 | pmaddubsw m5, m4, m6 | |
31793 | pmulhrsw m5, m7 | |
31794 | packuswb m3, m5 | |
31795 | movu [r0 + 2045 * 16], m3 | |
31796 | ||
31797 | ; mode 33 [row 31] | |
31798 | movu m5, [r3 + 27] | |
31799 | movu [r0 + 2046 * 16], m5 | |
31800 | movu m5, [r3 + 43] | |
31801 | movu [r0 + 2047 * 16], m5 | |
31802 | ||
31803 | ;mode 34 [row 0] | |
31804 | movu m0, [r3 + 2] | |
31805 | movu [r0 + 2048 * 16], m0 | |
31806 | movu m1, [r3 + 18] | |
31807 | movu [r0 + 2049 * 16], m1 | |
31808 | ||
31809 | ;mode 34 [row 1] | |
31810 | movu m2, [r3 + 34] | |
31811 | palignr m3, m1, m0, 1 | |
31812 | movu [r0 + 2050 * 16], m3 | |
31813 | palignr m4, m2, m1, 1 | |
31814 | movu [r0 + 2051 * 16], m4 | |
31815 | ||
31816 | ;mode 34 [row 2] | |
31817 | palignr m3, m1, m0, 2 | |
31818 | movu [r0 + 2052 * 16], m3 | |
31819 | palignr m4, m2, m1, 2 | |
31820 | movu [r0 + 2053 * 16], m4 | |
31821 | ||
31822 | ;mode 34 [row 3] | |
31823 | palignr m3, m1, m0, 3 | |
31824 | movu [r0 + 2054 * 16], m3 | |
31825 | palignr m4, m2, m1, 3 | |
31826 | movu [r0 + 2055 * 16], m4 | |
31827 | ||
31828 | ;mode 34 [row 4] | |
31829 | palignr m3, m1, m0, 4 | |
31830 | movu [r0 + 2056 * 16], m3 | |
31831 | palignr m4, m2, m1, 4 | |
31832 | movu [r0 + 2057 * 16], m4 | |
31833 | ||
31834 | ;mode 34 [row 5] | |
31835 | palignr m3, m1, m0, 5 | |
31836 | movu [r0 + 2058 * 16], m3 | |
31837 | palignr m4, m2, m1, 5 | |
31838 | movu [r0 + 2059 * 16], m4 | |
31839 | ||
31840 | ;mode 34 [row 6] | |
31841 | palignr m3, m1, m0, 6 | |
31842 | movu [r0 + 2060 * 16], m3 | |
31843 | palignr m4, m2, m1, 6 | |
31844 | movu [r0 + 2061 * 16], m4 | |
31845 | ||
31846 | ;mode 34 [row 7] | |
31847 | palignr m3, m1, m0, 7 | |
31848 | movu [r0 + 2062 * 16], m3 | |
31849 | palignr m4, m2, m1, 7 | |
31850 | movu [r0 + 2063 * 16], m4 | |
31851 | ||
31852 | ;mode 34 [row 8] | |
31853 | palignr m3, m1, m0, 8 | |
31854 | movu [r0 + 2064 * 16], m3 | |
31855 | palignr m4, m2, m1, 8 | |
31856 | movu [r0 + 2065 * 16], m4 | |
31857 | ||
31858 | ;mode 34 [row 9] | |
31859 | palignr m3, m1, m0, 9 | |
31860 | movu [r0 + 2066 * 16], m3 | |
31861 | palignr m4, m2, m1, 9 | |
31862 | movu [r0 + 2067 * 16], m4 | |
31863 | ||
31864 | ;mode 34 [row 10] | |
31865 | palignr m3, m1, m0, 10 | |
31866 | movu [r0 + 2068 * 16], m3 | |
31867 | palignr m4, m2, m1, 10 | |
31868 | movu [r0 + 2069 * 16], m4 | |
31869 | ||
31870 | ;mode 34 [row 11] | |
31871 | palignr m3, m1, m0, 11 | |
31872 | movu [r0 + 2070 * 16], m3 | |
31873 | palignr m4, m2, m1, 11 | |
31874 | movu [r0 + 2071 * 16], m4 | |
31875 | ||
31876 | ;mode 34 [row 12] | |
31877 | palignr m3, m1, m0, 12 | |
31878 | movu [r0 + 2072 * 16], m3 | |
31879 | palignr m4, m2, m1, 12 | |
31880 | movu [r0 + 2073 * 16], m4 | |
31881 | ||
31882 | ;mode 34 [row 13] | |
31883 | palignr m3, m1, m0, 13 | |
31884 | movu [r0 + 2074 * 16], m3 | |
31885 | palignr m4, m2, m1, 13 | |
31886 | movu [r0 + 2075 * 16], m4 | |
31887 | ||
31888 | ;mode 34 [row 14] | |
31889 | palignr m3, m1, m0, 14 | |
31890 | movu [r0 + 2076 * 16], m3 | |
31891 | palignr m4, m2, m1, 14 | |
31892 | movu [r0 + 2077 * 16], m4 | |
31893 | ||
31894 | ;mode 34 [row 15] | |
31895 | palignr m3, m1, m0, 15 | |
31896 | movu [r0 + 2078 * 16], m3 | |
31897 | palignr m4, m2, m1, 15 | |
31898 | movu [r0 + 2079 * 16], m4 | |
31899 | ||
31900 | ;mode 34 [row 16] | |
31901 | palignr m3, m1, m0, 16 | |
31902 | movu [r0 + 2080 * 16], m3 | |
31903 | palignr m4, m2, m1, 16 | |
31904 | movu [r0 + 2081 * 16], m4 | |
31905 | ||
31906 | ;mode 34 [row 17] | |
31907 | movu m0, [r3 + 19] | |
31908 | movu [r0 + 2082 * 16], m0 | |
31909 | movu m1, [r3 + 35] | |
31910 | movu [r0 + 2083 * 16], m1 | |
31911 | ||
31912 | ;mode 34 [row 18] | |
31913 | movu m2, [r3 + 51] | |
31914 | palignr m3, m1, m0, 1 | |
31915 | movu [r0 + 2084 * 16], m3 | |
31916 | palignr m4, m2, m1, 1 | |
31917 | movu [r0 + 2085 * 16], m4 | |
31918 | ||
31919 | ;mode 34 [row 19] | |
31920 | palignr m3, m1, m0, 2 | |
31921 | movu [r0 + 2086 * 16], m3 | |
31922 | palignr m4, m2, m1, 2 | |
31923 | movu [r0 + 2087 * 16], m4 | |
31924 | ||
31925 | ;mode 34 [row 20] | |
31926 | palignr m3, m1, m0, 3 | |
31927 | movu [r0 + 2088 * 16], m3 | |
31928 | palignr m4, m2, m1, 3 | |
31929 | movu [r0 + 2089 * 16], m4 | |
31930 | ||
31931 | ;mode 34 [row 21] | |
31932 | palignr m3, m1, m0, 4 | |
31933 | movu [r0 + 2090 * 16], m3 | |
31934 | palignr m4, m2, m1, 4 | |
31935 | movu [r0 + 2091 * 16], m4 | |
31936 | ||
31937 | ;mode 34 [row 22] | |
31938 | palignr m3, m1, m0, 5 | |
31939 | movu [r0 + 2092 * 16], m3 | |
31940 | palignr m4, m2, m1, 5 | |
31941 | movu [r0 + 2093 * 16], m4 | |
31942 | ||
31943 | ;mode 34 [row 23] | |
31944 | palignr m3, m1, m0, 6 | |
31945 | movu [r0 + 2094 * 16], m3 | |
31946 | palignr m4, m2, m1, 6 | |
31947 | movu [r0 + 2095 * 16], m4 | |
31948 | ||
31949 | ;mode 34 [row 24] | |
31950 | palignr m3, m1, m0, 7 | |
31951 | movu [r0 + 2096 * 16], m3 | |
31952 | palignr m4, m2, m1, 7 | |
31953 | movu [r0 + 2097 * 16], m4 | |
31954 | ||
31955 | ;mode 34 [row 25] | |
31956 | palignr m3, m1, m0, 8 | |
31957 | movu [r0 + 2098 * 16], m3 | |
31958 | palignr m4, m2, m1, 8 | |
31959 | movu [r0 + 2099 * 16], m4 | |
31960 | ||
31961 | ;mode 34 [row 26] | |
31962 | palignr m3, m1, m0, 9 | |
31963 | movu [r0 + 2100 * 16], m3 | |
31964 | palignr m4, m2, m1, 9 | |
31965 | movu [r0 + 2101 * 16], m4 | |
31966 | ||
31967 | ;mode 34 [row 27] | |
31968 | palignr m3, m1, m0, 10 | |
31969 | movu [r0 + 2102 * 16], m3 | |
31970 | palignr m4, m2, m1, 10 | |
31971 | movu [r0 + 2103 * 16], m4 | |
31972 | ||
31973 | ;mode 34 [row 28] | |
31974 | palignr m3, m1, m0, 11 | |
31975 | movu [r0 + 2104 * 16], m3 | |
31976 | palignr m4, m2, m1, 11 | |
31977 | movu [r0 + 2105 * 16], m4 | |
31978 | ||
31979 | ;mode 34 [row 29] | |
31980 | palignr m3, m1, m0, 12 | |
31981 | movu [r0 + 2106 * 16], m3 | |
31982 | palignr m4, m2, m1, 12 | |
31983 | movu [r0 + 2107 * 16], m4 | |
31984 | ||
31985 | ;mode 34 [row 30] | |
31986 | palignr m3, m1, m0, 13 | |
31987 | movu [r0 + 2108 * 16], m3 | |
31988 | palignr m4, m2, m1, 13 | |
31989 | movu [r0 + 2109 * 16], m4 | |
31990 | ||
31991 | ;mode 34 [row 31] | |
31992 | palignr m3, m1, m0, 14 | |
31993 | movu [r0 + 2110 * 16], m3 | |
31994 | palignr m4, m2, m1, 14 | |
31995 | movu [r0 + 2111 * 16], m4 | |
31996 | ||
31997 | RET |