;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%ifndef private_prefix
    %define private_prefix x264
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%define WIN64 0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64 1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64 1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64 1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; aout does not support align=
; NOTE: This section is out of sync with x264, in order to
; keep supporting OS/2.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro
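
; For illustration only (not part of the upstream macros): a read-only table
; would typically be placed with the section macro above, e.g.
;     SECTION_RODATA 32
;     pw_one: times 8 dw 1    ; hypothetical constant in a 32-byte aligned section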

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
;      MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

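; Illustrative sketch (comments only, hypothetical function): the example above
; could be fleshed out as follows; dstq, srcq and tmpd are usable as soon as
; cglobal has run PROLOGUE.
;     INIT_XMM sse2
;     cglobal foo, 2,3,0, dst, src, tmp
;         mov   tmpd, [srcq]
;         mov   [dstq], tmpd
;         RET
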
; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
%if ARCH_X86_64 == 0
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
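
; For illustration (hypothetical usage): a kernel that wants its scratch
; registers in a fixed order can write
;     DECLARE_REG_TMP 2,0,1
; after which t0 is r2, t1 is r0 and t2 is r1, with t0d/t0w/t0b etc. provided
; by DECLARE_REG_TMP_SIZE above.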

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro
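
; Illustrative sketch (comments only): inside a function declared as
; "cglobal foo, 2,3,0, dst, src, tmp", the argument registers can be renamed
; later on with
;     DEFINE_ARGS buf, stride, x
; so that bufq, strideq and xd refer to r0, r1 and r2 from that point onwards.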

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%stack_alignment ((mmsize + 15) & ~15)
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %assign stack_size_padded stack_size
            %if WIN64
                %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
                    %endif
                %endif
            %endif
            %if mmsize <= 16 && HAVE_ALIGNED_STACK
                %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                mov rstk, rsp
                %if %1 < 0 ; need to store rsp on stack
                    sub rsp, gprsize+stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm [rsp+stack_size_padded]
                    mov rstkm, rstk
                %else ; can keep rsp in rstk during whole function
                    sub rsp, stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm rstk
                %endif
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
                %warning "Stack pointer will overwrite register argument"
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx
DECLARE_REG 1, rdx
DECLARE_REG 2, R8
DECLARE_REG 3, R9
DECLARE_REG 4, R10, 40
DECLARE_REG 5, R11, 48
DECLARE_REG 6, rax, 56
DECLARE_REG 7, rdi, 64
DECLARE_REG 8, rsi, 72
DECLARE_REG 9, rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6
        movaps [rstk + stack_offset + 8], xmm6
    %endif
    %if xmm_regs_used > 7
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %if xmm_regs_used > 8
        %assign %%i 8
        %rep xmm_regs_used-8
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 8
        %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %assign %%pad_size 0
    %if xmm_regs_used > 8
        %assign %%i xmm_regs_used
        %rep xmm_regs_used-8
            %assign %%i %%i-1
            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7
        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6
        movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi
DECLARE_REG 1, rsi
DECLARE_REG 2, rdx
DECLARE_REG 3, rcx
DECLARE_REG 4, R8
DECLARE_REG 5, R9
DECLARE_REG 6, rax, 8
DECLARE_REG 7, R10, 16
DECLARE_REG 8, R11, 24
DECLARE_REG 9, rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0

%macro RET 0
%if stack_size_padded > 0
%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
    mov rsp, rstkm
%else
    add rsp, stack_size_padded
%endif
%endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
%if stack_size_padded > 0
%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
    mov rsp, rstkm
%else
    add rsp, stack_size_padded
%endif
%endif
    POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %ifndef cpuflags
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
    %elif notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep
    %endif
    ret
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %%branch_instr:
            %xdefine last_branch_adr %%branch_instr
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %ifidn __OUTPUT_FORMAT__,elf
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0 ; stack pointer offset relative to the return address
    %assign stack_size 0 ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx (1<<0)
%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
%assign cpuflags_avx (1<<11)| cpuflags_sse42
%assign cpuflags_xop (1<<12)| cpuflags_avx
%assign cpuflags_fma4 (1<<13)| cpuflags_avx
%assign cpuflags_avx2 (1<<14)| cpuflags_avx
%assign cpuflags_fma3 (1<<15)| cpuflags_avx

%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
%assign cpuflags_slowctz (1<<18)
%assign cpuflags_lzcnt (1<<19)
%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<21)
%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1

%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
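
; Illustrative sketch (comments only): within a function built under
; "INIT_XMM ssse3", code can test the selected flags, e.g.
;     %if cpuflag(ssse3)
;         pshufb m0, m1
;     %else
;         ; hypothetical fallback for CPUs without SSSE3
;     %endif
; The flags are cumulative, so cpuflag(sse2) is also true when ssse3 is selected.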

; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if cpuflag(sse2)
        CPUNOP amdnop
    %else
        CPUNOP basicnop
    %endif
%endmacro

; Merge mmx and sse*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)
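
; For illustration: under "INIT_YMM avx2", m0 is ymm0 while xm0 refers to xmm0
; (the low half of the same register); under "INIT_XMM sse2", m0, xm0 and ym0
; all resolve to xmm0.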

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nnmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nnmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nnxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nnymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM
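
; Illustrative sketch (comments only, hypothetical function name): selecting a
; SIMD width and cpu retargets mova/movu and appends the cpuflags to the symbol.
;     INIT_XMM sse2
;     cglobal add_row, 2,2,2, dst, src   ; symbol becomes <private_prefix>_add_row_sse2
;         movu  m0, [srcq]
;         movu  m1, [dstq]
;         paddw m0, m1
;         movu  [dstq], m0
;         RET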

%macro DECLARE_MMCAST 1
    %define mmmm%1 mm%1
    %define mmxmm%1 mm%1
    %define mmymm%1 mm%1
    %define xmmmm%1 mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define ymmmm%1 mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
%endmacro

%assign i 0
%rep 16
    DECLARE_MMCAST i
%assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine %%tmp%2 m%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 %%tmp%2
    CAT_XDEFINE nn, m%1, %1
    %rotate 2
%endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
%ifnum %1 ; SWAP 0, 1, ...
    SWAP_INTERNAL_NUM %1, %2
%else ; SWAP m0, m1, ...
    SWAP_INTERNAL_NAME %1, %2
%endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro
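
; For illustration: after
;     SWAP 0, 1
; the names m0 and m1 are exchanged, so code that subsequently writes "m0"
; uses the register previously known as m1; no instructions are emitted.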

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro
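
; Illustrative sketch (comments only, hypothetical names): a helper that
; returns a value in a SIMD register records its permutation before returning,
;     cglobal scale_helper, 1,1,1, src
;         movu m0, [srcq]
;         SAVE_MM_PERMUTATION
;         RET
; and a caller assembled from the same file that uses "call scale_helper"
; picks that permutation up again through LOAD_MM_PERMUTATION.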

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
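
; For illustration: immediates in [-128,127] fit the sign-extended imm8 form,
; so "add rax, 128" needs a 4-byte immediate while the equivalent
; "sub rax, -128" produced by the macro above encodes it in a single byte.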

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%5+: operands
%macro RUN_AVX_INSTR 5-8+
    %ifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %elifnum sizeof%5
        %assign __sizeofreg sizeof%5
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 7+%3
            %assign __emulate_avx 1
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %6
        %xdefine __src2 %7
        %ifnidn %5, %6
            %if %0 >= 8
                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
            %else
                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
            %endif
            %if %4 && %3 == 0
                %ifnid %7
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %7
                    %xdefine __src2 %6
                %endif
            %endif
            %if __sizeofreg == 8
                MOVQ %5, __src1
            %elif %2
                MOVAPS %5, __src1
            %else
                MOVDQA %5, __src1
            %endif
        %endif
        %if %0 >= 8
            %1 %5, __src2, %8
        %else
            %1 %5, __src2
        %endif
    %elif %0 >= 8
        __instr %5, %6, %7, %8
    %elif %0 == 7
        __instr %5, %6, %7
    %elif %0 == 6
        __instr %5, %6
    %else
        __instr %5
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-4 0, 1, 0
    %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
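
; Illustrative sketch (comments only): once wrapped by AVX_INSTR, the same
; source works for both encodings. Under "INIT_XMM sse2",
;     mulps m0, m1, m2
; expands to "movaps xmm0, xmm1" followed by "mulps xmm0, xmm2", while under
; "INIT_XMM avx" it becomes the single instruction "vmulps xmm0, xmm1, xmm2".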

; Instructions with both VEX and non-VEX encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR aesdec, 0, 0, 0
AVX_INSTR aesdeclast, 0, 0, 0
AVX_INSTR aesenc, 0, 0, 0
AVX_INSTR aesenclast, 0, 0, 0
AVX_INSTR aesimc
AVX_INSTR aeskeygenassist
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 1, 0
AVX_INSTR cmpps, 1, 1, 0
AVX_INSTR cmpsd, 1, 1, 0
AVX_INSTR cmpss, 1, 1, 0
AVX_INSTR comisd
AVX_INSTR comiss
AVX_INSTR cvtdq2pd
AVX_INSTR cvtdq2ps
AVX_INSTR cvtpd2dq
AVX_INSTR cvtpd2ps
AVX_INSTR cvtps2dq
AVX_INSTR cvtps2pd
AVX_INSTR cvtsd2si
AVX_INSTR cvtsd2ss
AVX_INSTR cvtsi2sd
AVX_INSTR cvtsi2ss
AVX_INSTR cvtss2sd
AVX_INSTR cvtss2si
AVX_INSTR cvttpd2dq
AVX_INSTR cvttps2dq
AVX_INSTR cvttsd2si
AVX_INSTR cvttss2si
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR extractps
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR insertps, 1, 1, 0
AVX_INSTR lddqu
AVX_INSTR ldmxcsr
AVX_INSTR maskmovdqu
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movapd
AVX_INSTR movaps
AVX_INSTR movd
AVX_INSTR movddup
AVX_INSTR movdqa
AVX_INSTR movdqu
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movhpd, 1, 0, 0
AVX_INSTR movhps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movlpd, 1, 0, 0
AVX_INSTR movlps, 1, 0, 0
AVX_INSTR movmskpd
AVX_INSTR movmskps
AVX_INSTR movntdq
AVX_INSTR movntdqa
AVX_INSTR movntpd
AVX_INSTR movntps
AVX_INSTR movq
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movshdup
AVX_INSTR movsldup
AVX_INSTR movss, 1, 0, 0
AVX_INSTR movupd
AVX_INSTR movups
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb
AVX_INSTR pabsd
AVX_INSTR pabsw
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pclmulqdq, 0, 1, 0
AVX_INSTR pcmpestri
AVX_INSTR pcmpestrm
AVX_INSTR pcmpistri
AVX_INSTR pcmpistrm
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR pextrb
AVX_INSTR pextrd
AVX_INSTR pextrq
AVX_INSTR pextrw
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phminposuw
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pinsrb, 0, 1, 0
AVX_INSTR pinsrd, 0, 1, 0
AVX_INSTR pinsrq, 0, 1, 0
AVX_INSTR pinsrw, 0, 1, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb
AVX_INSTR pmovsxbw
AVX_INSTR pmovsxbd
AVX_INSTR pmovsxbq
AVX_INSTR pmovsxwd
AVX_INSTR pmovsxwq
AVX_INSTR pmovsxdq
AVX_INSTR pmovzxbw
AVX_INSTR pmovzxbd
AVX_INSTR pmovzxbq
AVX_INSTR pmovzxwd
AVX_INSTR pmovzxwq
AVX_INSTR pmovzxdq
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd
AVX_INSTR pshufhw
AVX_INSTR pshuflw
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR rcpps, 1, 0, 0
AVX_INSTR rcpss, 1, 0, 0
AVX_INSTR roundpd
AVX_INSTR roundps
AVX_INSTR roundsd
AVX_INSTR roundss
AVX_INSTR rsqrtps, 1, 0, 0
AVX_INSTR rsqrtss, 1, 0, 0
AVX_INSTR shufpd, 1, 1, 0
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR sqrtpd, 1, 0, 0
AVX_INSTR sqrtps, 1, 0, 0
AVX_INSTR sqrtsd, 1, 0, 0
AVX_INSTR sqrtss, 1, 0, 0
AVX_INSTR stmxcsr
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR ucomisd
AVX_INSTR ucomiss
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j
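
; For illustration: q0123 expands to 0x1B, so "pshufd m0, m1, q0123" reverses
; the four dwords of m1; each digit names the source element for lanes 3..0.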

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf

; convert FMA4 to FMA3 if possible
%macro FMA4_INSTR 4
    %macro %1 4-8 %1, %2, %3, %4
        %if cpuflag(fma4)
            v%5 %1, %2, %3, %4
        %elifidn %1, %2
            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
        %elifidn %1, %3
            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
        %elifidn %1, %4
            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
        %else
            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss

FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps

FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss

FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss

FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
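
; Illustrative sketch (comments only, assuming an xmm-sized INIT): with the
; wrappers above,
;     fmaddps m0, m0, m1, m2
; emits "vfmaddps xmm0, xmm0, xmm1, xmm2" when the fma4 cpuflag is set, and
; because the destination matches the first source it maps to
; "vfmadd132ps xmm0, xmm2, xmm1" (m0 = m0*m1 + m2) with only FMA3 available.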

; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
%if ARCH_X86_64 == 0
%macro vpbroadcastq 2
%if sizeof%1 == 16
    movddup %1, %2
%else
    vbroadcastsd %1, %2
%endif
%endmacro
%endif