Imported Debian version 2.4.3~trusty1
[deb_ffmpeg.git] / ffmpeg / libavcodec / x86 / bswapdsp.asm
CommitLineData
2ba45a60
DM
1;******************************************************************************
2;* optimized bswap buffer functions
3;* Copyright (c) 2008 Loren Merritt
4;* Copyright (c) 2003-2013 Michael Niedermayer
5;* Copyright (c) 2013 Daniel Kang
6;*
7;* This file is part of FFmpeg.
8;*
9;* FFmpeg is free software; you can redistribute it and/or
10;* modify it under the terms of the GNU Lesser General Public
11;* License as published by the Free Software Foundation; either
12;* version 2.1 of the License, or (at your option) any later version.
13;*
14;* FFmpeg is distributed in the hope that it will be useful,
15;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17;* Lesser General Public License for more details.
18;*
19;* You should have received a copy of the GNU Lesser General Public
20;* License along with FFmpeg; if not, write to the Free Software
21;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22;******************************************************************************
23
24%include "libavutil/x86/x86util.asm"
25
SECTION_RODATA

; Byte-shuffle control mask used with pshufb: within each of the four 32-bit
; lanes of a 128-bit register the source byte indices are listed in reverse
; order, so applying it reverses the byte order of every 32-bit word.
pb_bswap32: db  3,  2,  1,  0           ; lane 0
            db  7,  6,  5,  4           ; lane 1
            db 11, 10,  9,  8           ; lane 2
            db 15, 14, 13, 12           ; lane 3

; Imported symbol; not referenced by any code visible in this file.
cextern pb_80

SECTION_TEXT
32
;------------------------------------------------------------------------------
; BSWAP_LOOPS: byte-swap the bulk of a buffer of 32-bit words.
;
; %1 = a (aligned) or u (unaligned); selects mova/movu for every vector
;      load and store in the macro.
;
; Register contract, as set up by BSWAP32_BUF:
;   r0  = dst pointer (advanced past every word written)
;   r1  = src pointer (advanced past every word read)
;   r2d = w, the number of 32-bit words (assumed non-negative)
;   r3  = scratch
;   m2  = pb_bswap32 shuffle mask on the SSSE3 path; on the SSE2 path m2 and
;         m3 are temporaries
;
; The macro handles all 8-word (32-byte) blocks, then one 4-word (16-byte)
; block if bit 2 of w is set. It leaves r2d equal to the original w so the
; caller can finish the last w & 3 words. When there is no 4-word block it
; jumps to .left, which the invoking macro must define; otherwise it falls
; through to whatever follows the macro invocation.
;
; w is a C int, so only the low 32 bits of r2 are meaningful: on x86-64 the
; upper half of a register carrying a 32-bit argument is undefined. All count
; arithmetic therefore uses r2d/r3d; a 64-bit shift would turn stale high
; bits into a huge block count and overrun both buffers.
;------------------------------------------------------------------------------
%macro BSWAP_LOOPS 1
    mov      r3d, r2d                   ; remember w for the tail decisions
    sar      r2d, 3                     ; r2d = number of 8-word blocks
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb   m0, m2                     ; one shuffle reverses all four dwords
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + 16], m1
%else
    ; Step 1: swap the two 16-bit halves of every dword.
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    ; Step 2: swap the two bytes of every 16-bit word via shift/shift/or.
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, 32
    add      r1, 32
    dec      r2d
    jnz      .loop8_%1
.left4_%1:
    mov      r2d, r3d                   ; restore w (also zero-extends r2)
    test     r3d, 4                     ; is there one more 4-word block?
    jz       .left
    mov%1    m0, [r1]
%if cpuflag(ssse3)
    pshufb   m0, m2
    mov%1    [r0], m0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro
86
;------------------------------------------------------------------------------
; void ff_bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
;
; Reverses the byte order of each of the w 32-bit words read from src and
; stores the results to dst.
; NOTE(review): the cglobal name is bswap32_buf; the ff_ prefix and any
; per-CPU suffix are presumably added by x86inc - confirm against the C-side
; declaration.
;
;   r0 = dst, r1 = src, r2 = w, r3 = scratch
;
; Flow:
;   1. If both pointers are 16-byte aligned use the aligned BSWAP_LOOPS
;      variant, otherwise the unaligned one.
;   2. At .left, finish the remaining w & 3 words:
;        SSSE3: one 2-word movq/pshufb step, then one scalar bswap
;        SSE2:  scalar bswap loop
;------------------------------------------------------------------------------
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mova     m2, [pb_bswap32]           ; shuffle mask stays in m2 throughout
%else
cglobal bswap32_buf, 3,4,5
%endif
    mov      r3, r0
    or       r3, r1                     ; merge low bits of both pointers
    test     r3, 15
    jz       .start_align               ; both 16-byte aligned
    BSWAP_LOOPS u
    jmp      .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    test     r2d, 2                     ; two words left?
    jz       .left1
    movq     m0, [r1]
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    test     r2d, 1                     ; one last word?
    jz       .end
    mov      r2d, [r1]                  ; w is no longer needed; reuse r2d
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2d, 3                     ; r2d = words still to convert (0..3)
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2d
    jnz      .loop2
%endif
.end:
    RET
%endmacro

; Instantiate the function once per instruction set; cpuflag() inside the
; macros picks the matching code path for each expansion.
INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF