| 1 | /* |
| 2 | * Copyright (C) 2010 David Conrad |
| 3 | * |
| 4 | * This file is part of FFmpeg. |
| 5 | * |
| 6 | * FFmpeg is free software; you can redistribute it and/or |
| 7 | * modify it under the terms of the GNU Lesser General Public |
| 8 | * License as published by the Free Software Foundation; either |
| 9 | * version 2.1 of the License, or (at your option) any later version. |
| 10 | * |
| 11 | * FFmpeg is distributed in the hope that it will be useful, |
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 14 | * Lesser General Public License for more details. |
| 15 | * |
| 16 | * You should have received a copy of the GNU Lesser General Public |
| 17 | * License along with FFmpeg; if not, write to the Free Software |
| 18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 19 | */ |
| 20 | |
| 21 | #include "libavutil/x86/cpu.h" |
| 22 | #include "diracdsp_mmx.h" |
| 23 | #include "fpel.h" |
| 24 | |
/*
 * Prototypes for rectangle store routines implemented outside this file.
 *
 * All four share one signature: an 8-bit destination with its stride, a
 * 16-bit source with its stride, and the rectangle's width and height.
 * NOTE(review): presumably they narrow int16 samples to uint8 with
 * saturation, the "signed" variants applying a bias first -- confirm
 * against the assembly implementation.
 */
void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride,
                             const int16_t *src, int src_stride,
                             int width, int height);
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride,
                              const int16_t *src, int src_stride,
                              int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride,
                                    const int16_t *src, int src_stride,
                                    int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride,
                                     const int16_t *src, int src_stride,
                                     int width, int height);
| 29 | |
/*
 * HPEL_FILTER(MMSIZE, EXT) declares the external row filters
 * ff_dirac_hpel_filter_v_<EXT> / ff_dirac_hpel_filter_h_<EXT> and defines
 * the static driver dirac_hpel_filter_<EXT>() that applies them to `height`
 * consecutive rows.
 *
 * For every row the driver:
 *   1. runs the vertical filter from src into dstv, starting MMSIZE bytes
 *      before the row and passing stride and width+MMSIZE+5 as the last two
 *      arguments;
 *   2. runs the horizontal filter from src into dsth;
 *   3. runs the horizontal filter from the dstv row just written into dstc.
 * All four pointers then advance by `stride`.
 *
 * NOTE(review): the extra MMSIZE / +5 margin on the vertical pass is
 * presumably there so step 3 has valid neighbours on both sides of the row;
 * confirm against the assembly. Callers must provide that much readable and
 * writable space around src and dstv.
 */
#define HPEL_FILTER(MMSIZE, EXT)                                               \
    void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
    void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int);      \
                                                                               \
    static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv,        \
                                          uint8_t *dstc, const uint8_t *src,   \
                                          int stride, int width, int height)   \
    {                                                                          \
        for (; height != 0; height--) {                                        \
            ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE,            \
                                           stride, width+MMSIZE+5);            \
            ff_dirac_hpel_filter_h_ ## EXT(dsth, src,  width);                 \
            ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width);                 \
                                                                               \
            src  += stride;                                                    \
            dsth += stride;                                                    \
            dstv += stride;                                                    \
            dstc += stride;                                                    \
        }                                                                      \
    }

/* The 8-byte MMX driver is only instantiated when not building for x86-64. */
#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
/* The 16-byte SSE2 driver is always instantiated. */
HPEL_FILTER(16, sse2)
| 54 | |
/*
 * PIXFUNC(PFX, IDX, EXT) installs the ff_<PFX>_dirac_pixels16_<EXT> and
 * ff_<PFX>_dirac_pixels32_<EXT> wrappers into entry IDX of
 * c-><PFX>_dirac_pixels_tab[1] and c-><PFX>_dirac_pixels_tab[2].
 *
 * A pointer named `c` must be in scope at the expansion site.
 *
 * Row [0] of the table (the pixels8 variant) is deliberately left alone;
 * the original assignment is kept below, commented out and tagged
 * MMXDISABLED.
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement: it is safe after an unbraced `if` and before an `else`.
 * Invoke it with a trailing semicolon.
 */
#define PIXFUNC(PFX, IDX, EXT)                                                                      \
    do {                                                                                            \
        /*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/  \
        c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT;                \
        c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT;                \
    } while (0)
| 59 | |
/*
 * DIRAC_PIXOP(OPNAME2, OPNAME, EXT) defines three public wrappers,
 * ff_<OPNAME2>_dirac_pixels{8,16,32}_<EXT>, with the signature
 * (uint8_t *dst, const uint8_t *src[5], int stride, int h).
 *
 * Each wrapper:
 *   - forwards the call unchanged to ff_<OPNAME2>_dirac_pixels<N>_c when h
 *     is not a multiple of 4;
 *   - otherwise calls <OPNAME>_pixels<N>_<EXT> with src[0] only. The
 *     32-wide wrapper issues two 16-wide calls, for byte offsets 0 and 16.
 *
 * NOTE(review): the multiple-of-4 requirement presumably reflects how many
 * rows the fpel routines handle per iteration -- confirm against fpel.asm.
 */
#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)                                      \
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst,                    \
                                              const uint8_t *src[5],           \
                                              int stride, int h)               \
{                                                                              \
    if (h & 3) {                                                               \
        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);               \
        return;                                                                \
    }                                                                          \
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);                        \
}                                                                              \
                                                                               \
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst,                   \
                                               const uint8_t *src[5],          \
                                               int stride, int h)              \
{                                                                              \
    if (h & 3) {                                                               \
        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);              \
        return;                                                                \
    }                                                                          \
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);                       \
}                                                                              \
                                                                               \
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst,                   \
                                               const uint8_t *src[5],          \
                                               int stride, int h)              \
{                                                                              \
    if (h & 3) {                                                               \
        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);              \
        return;                                                                \
    }                                                                          \
    OPNAME ## _pixels16_ ## EXT(dst,      src[0],      stride, h);             \
    OPNAME ## _pixels16_ ## EXT(dst + 16, src[0] + 16, stride, h);             \
}

/* Wrappers generated: put/mmx, avg/mmx and avg/mmxext. */
DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
DIRAC_PIXOP(avg, ff_avg, mmxext)
| 88 | |
/**
 * 16-pixel-wide "put" wrapper backed by ff_put_pixels16_sse2.
 *
 * @param dst    destination pixels
 * @param src    array of source pointers; only src[0] is used on the SSE2
 *               path, the whole array is forwarded to the C fallback
 * @param stride passed through unchanged to the callee
 * @param h      row count; when it is not a multiple of 4 the call is
 *               delegated to ff_put_dirac_pixels16_c instead
 */
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3) {
        ff_put_dirac_pixels16_c(dst, src, stride, h);
        return;
    }

    ff_put_pixels16_sse2(dst, src[0], stride, h);
}
/**
 * 16-pixel-wide "avg" wrapper backed by ff_avg_pixels16_sse2.
 *
 * @param dst    destination pixels
 * @param src    array of source pointers; only src[0] is used on the SSE2
 *               path, the whole array is forwarded to the C fallback
 * @param stride passed through unchanged to the callee
 * @param h      row count; when it is not a multiple of 4 the call is
 *               delegated to ff_avg_dirac_pixels16_c instead
 */
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3) {
        ff_avg_dirac_pixels16_c(dst, src, stride, h);
        return;
    }

    ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
/**
 * 32-pixel-wide "put" wrapper built from two ff_put_pixels16_sse2 calls.
 *
 * @param dst    destination pixels
 * @param src    array of source pointers; only src[0] is used on the SSE2
 *               path, the whole array is forwarded to the C fallback
 * @param stride passed through unchanged to the callee
 * @param h      row count; when it is not a multiple of 4 the call is
 *               delegated to ff_put_dirac_pixels32_c instead
 */
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3) {
        ff_put_dirac_pixels32_c(dst, src, stride, h);
        return;
    }

    /* Left 16 columns first, then the right 16. */
    ff_put_pixels16_sse2(dst,      src[0],      stride, h);
    ff_put_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
}
/**
 * 32-pixel-wide "avg" wrapper built from two ff_avg_pixels16_sse2 calls.
 *
 * @param dst    destination pixels
 * @param src    array of source pointers; only src[0] is used on the SSE2
 *               path, the whole array is forwarded to the C fallback
 * @param stride passed through unchanged to the callee
 * @param h      row count; when it is not a multiple of 4 the call is
 *               delegated to ff_avg_dirac_pixels32_c instead
 */
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3) {
        ff_avg_dirac_pixels32_c(dst, src, stride, h);
        return;
    }

    /* Left 16 columns first, then the right 16. */
    ff_avg_pixels16_sse2(dst,      src[0],      stride, h);
    ff_avg_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
}
| 121 | |
| 122 | void ff_diracdsp_init_mmx(DiracDSPContext* c) |
| 123 | { |
| 124 | int mm_flags = av_get_cpu_flags(); |
| 125 | |
| 126 | if (EXTERNAL_MMX(mm_flags)) { |
| 127 | c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx; |
| 128 | #if !ARCH_X86_64 |
| 129 | c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx; |
| 130 | c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx; |
| 131 | c->dirac_hpel_filter = dirac_hpel_filter_mmx; |
| 132 | c->add_rect_clamped = ff_add_rect_clamped_mmx; |
| 133 | c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx; |
| 134 | #endif |
| 135 | PIXFUNC(put, 0, mmx); |
| 136 | PIXFUNC(avg, 0, mmx); |
| 137 | } |
| 138 | |
| 139 | if (EXTERNAL_MMXEXT(mm_flags)) { |
| 140 | PIXFUNC(avg, 0, mmxext); |
| 141 | } |
| 142 | |
| 143 | if (EXTERNAL_SSE2(mm_flags)) { |
| 144 | c->dirac_hpel_filter = dirac_hpel_filter_sse2; |
| 145 | c->add_rect_clamped = ff_add_rect_clamped_sse2; |
| 146 | c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2; |
| 147 | |
| 148 | c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2; |
| 149 | c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2; |
| 150 | |
| 151 | c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2; |
| 152 | c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2; |
| 153 | c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; |
| 154 | c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; |
| 155 | } |
| 156 | } |