- "jb 1b \n\t"\
- :: "g" (filter),
- "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
- : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
- "%"REG_d, "%"REG_S, "%"REG_c
- );
+ "jb 1b \n\t"
+
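+    /*
+     * Both branches share the scaling loop through the MAIN_FUNCTION macro
+     * ending above; only the dither setup in %%xmm3 differs.
+     * Operands: %0 filter, %1 dest-offset, %2 dstW+offset, %3 offset,
+     * %4 filterSize, %5 first qword of dither. Biasing dest by -offset
+     * lets the loop counter run from offset up to dstW+offset.
+     */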
+    if (offset) {
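+        /* offset != 0: load the 8-byte dither pattern and rotate it right
+         * by 3 bytes (psrlq $24 combined with psllq $40), so byte i of
+         * %%xmm3 holds dither[(i + 3) & 7]:
+         * {d0,d1,d2,d3,d4,d5,d6,d7} -> {d3,d4,d5,d6,d7,d0,d1,d2}. */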
+        __asm__ volatile(
+            "movq          %5, %%xmm3 \n\t"
+            "movdqa    %%xmm3, %%xmm4 \n\t"
+            "psrlq        $24, %%xmm3 \n\t"
+            "psllq        $40, %%xmm4 \n\t"
+            "por       %%xmm4, %%xmm3 \n\t"
+            MAIN_FUNCTION
+            :: "g" (filter),
+               "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
+               "m"(filterSize), "m"(((uint64_t *) dither)[0])
+            : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
+              "%"REG_d, "%"REG_S, "%"REG_c
+            );
+    } else {
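+        /* offset == 0: use the dither pattern as stored, without rotation. */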
+        __asm__ volatile(
+            "movq          %5, %%xmm3 \n\t"
+            MAIN_FUNCTION
+            :: "g" (filter),
+               "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
+               "m"(filterSize), "m"(((uint64_t *) dither)[0])
+            : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
+              "%"REG_d, "%"REG_S, "%"REG_c
+            );
+    }