| 1 | /* |
| 2 | * Copyright (c) 2002 Brian Foley |
| 3 | * Copyright (c) 2002 Dieter Shirley |
| 4 | * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> |
| 5 | * |
| 6 | * This file is part of FFmpeg. |
| 7 | * |
| 8 | * FFmpeg is free software; you can redistribute it and/or |
| 9 | * modify it under the terms of the GNU Lesser General Public |
| 10 | * License as published by the Free Software Foundation; either |
| 11 | * version 2.1 of the License, or (at your option) any later version. |
| 12 | * |
| 13 | * FFmpeg is distributed in the hope that it will be useful, |
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 16 | * Lesser General Public License for more details. |
| 17 | * |
| 18 | * You should have received a copy of the GNU Lesser General Public |
| 19 | * License along with FFmpeg; if not, write to the Free Software |
| 20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 21 | */ |
| 22 | |
| 23 | #include "config.h" |
| 24 | #if HAVE_ALTIVEC_H |
| 25 | #include <altivec.h> |
| 26 | #endif |
| 27 | |
| 28 | #include "libavutil/attributes.h" |
| 29 | #include "libavutil/cpu.h" |
| 30 | #include "libavutil/ppc/cpu.h" |
| 31 | #include "libavutil/ppc/types_altivec.h" |
| 32 | #include "libavutil/ppc/util_altivec.h" |
| 33 | #include "libavcodec/avcodec.h" |
| 34 | #include "libavcodec/mpegvideo.h" |
| 35 | #include "libavcodec/me_cmp.h" |
| 36 | |
| 37 | #if HAVE_ALTIVEC |
| 38 | static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
| 39 | int line_size, int h) |
| 40 | { |
| 41 | int i, s = 0; |
| 42 | const vector unsigned char zero = |
| 43 | (const vector unsigned char) vec_splat_u8(0); |
| 44 | vector unsigned char perm1 = vec_lvsl(0, pix2); |
| 45 | vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); |
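    /* perm1/perm2 implement the usual AltiVec misaligned-load idiom:
     * vec_ld can only load from 16-byte-aligned addresses, so we load
     * the two aligned blocks straddling pix2 and vec_perm the wanted
     * window out of them. Each mask entry is a byte index, so perm2 =
     * perm1 + 1 selects the same window shifted right by one byte,
     * i.e. pix2[1] - pix2[16] for the x2 interpolation. */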
| 46 | vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
| 47 | vector signed int sumdiffs; |
| 48 | |
| 49 | for (i = 0; i < h; i++) { |
| 50 | /* Read unaligned pixels into our vectors. The vectors are as follows: |
| 51 | * pix1v: pix1[0] - pix1[15] |
| 52 | * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] */ |
| 53 | vector unsigned char pix1v = vec_ld(0, pix1); |
| 54 | vector unsigned char pix2l = vec_ld(0, pix2); |
| 55 | vector unsigned char pix2r = vec_ld(16, pix2); |
| 56 | vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); |
| 57 | vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); |
| 58 | |
| 59 | /* Calculate the average vector. */ |
| 60 | vector unsigned char avgv = vec_avg(pix2v, pix2iv); |
| 61 | |
| 62 | /* Calculate a sum of abs differences vector. */ |
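        /* AltiVec has no unsigned absolute-difference instruction, but
         * max(a, b) - min(a, b) == |a - b| without ever leaving the
         * unsigned char domain. */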
| 63 | vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv), |
| 64 | vec_min(pix1v, avgv)); |
| 65 | |
| 66 | /* Add each 4 pixel group together and put 4 results into sad. */ |
| 67 | sad = vec_sum4s(t5, sad); |
| 68 | |
| 69 | pix1 += line_size; |
| 70 | pix2 += line_size; |
| 71 | } |
| 72 | /* Sum up the four partial sums, and put the result into s. */ |
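    /* vec_sums adds all four words of its first operand (plus element 3
     * of the second) into element 3 of the result; vec_splat broadcasts
     * that element so that vec_ste, which stores the single element
     * selected by the target address, writes the total whatever the
     * alignment of &s. */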
| 73 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
| 74 | sumdiffs = vec_splat(sumdiffs, 3); |
| 75 | vec_ste(sumdiffs, 0, &s); |
| 76 | |
| 77 | return s; |
| 78 | } |
| 79 | |
| 80 | static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
| 81 | int line_size, int h) |
| 82 | { |
| 83 | int i, s = 0; |
| 84 | const vector unsigned char zero = |
| 85 | (const vector unsigned char) vec_splat_u8(0); |
| 86 | vector unsigned char perm = vec_lvsl(0, pix2); |
| 87 | vector unsigned char pix1v, pix3v, avgv, t5; |
| 88 | vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
| 89 | vector signed int sumdiffs; |
| 90 | uint8_t *pix3 = pix2 + line_size; |
| 91 | |
    /* Because pix3 = pix2 + line_size, the pix3 of one iteration
     * becomes pix2 in the next iteration. We can use this fact to
     * avoid a potentially expensive unaligned read each time around
     * the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15] */
| 99 | vector unsigned char pix2l = vec_ld(0, pix2); |
| 100 | vector unsigned char pix2r = vec_ld(15, pix2); |
| 101 | vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm); |
| 102 | |
| 103 | for (i = 0; i < h; i++) { |
| 104 | /* Read unaligned pixels into our vectors. The vectors are as follows: |
| 105 | * pix1v: pix1[0] - pix1[15] |
| 106 | * pix3v: pix3[0] - pix3[15] */ |
| 107 | pix1v = vec_ld(0, pix1); |
| 108 | |
| 109 | pix2l = vec_ld(0, pix3); |
| 110 | pix2r = vec_ld(15, pix3); |
| 111 | pix3v = vec_perm(pix2l, pix2r, perm); |
| 112 | |
| 113 | /* Calculate the average vector. */ |
| 114 | avgv = vec_avg(pix2v, pix3v); |
| 115 | |
| 116 | /* Calculate a sum of abs differences vector. */ |
| 117 | t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
| 118 | |
| 119 | /* Add each 4 pixel group together and put 4 results into sad. */ |
| 120 | sad = vec_sum4s(t5, sad); |
| 121 | |
| 122 | pix1 += line_size; |
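        /* The bottom row of this iteration is the top row of the next,
         * so pix3v can be reused as pix2v without another load. */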
| 123 | pix2v = pix3v; |
| 124 | pix3 += line_size; |
| 125 | } |
| 126 | |
| 127 | /* Sum up the four partial sums, and put the result into s. */ |
| 128 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
| 129 | sumdiffs = vec_splat(sumdiffs, 3); |
| 130 | vec_ste(sumdiffs, 0, &s); |
| 131 | return s; |
| 132 | } |
| 133 | |
| 134 | static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
| 135 | int line_size, int h) |
| 136 | { |
| 137 | int i, s = 0; |
| 138 | uint8_t *pix3 = pix2 + line_size; |
| 139 | const vector unsigned char zero = |
| 140 | (const vector unsigned char) vec_splat_u8(0); |
| 141 | const vector unsigned short two = |
| 142 | (const vector unsigned short) vec_splat_u16(2); |
| 143 | vector unsigned char avgv, t5; |
| 144 | vector unsigned char perm1 = vec_lvsl(0, pix2); |
| 145 | vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1)); |
| 146 | vector unsigned char pix1v, pix3v, pix3iv; |
| 147 | vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; |
| 148 | vector unsigned short avghv, avglv; |
| 149 | vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
| 150 | vector signed int sumdiffs; |
| 151 | |
    /* Because pix3 = pix2 + line_size, the pix3 of one iteration
     * becomes pix2 in the next iteration. We can use this fact to
     * avoid a potentially expensive unaligned read, as well as some
     * splitting and vector addition, each time around the loop.
| 156 | * Read unaligned pixels into our vectors. The vectors are as follows: |
| 157 | * pix2v: pix2[0] - pix2[15] pix2iv: pix2[1] - pix2[16] |
| 158 | * Split the pixel vectors into shorts. */ |
| 159 | vector unsigned char pix2l = vec_ld(0, pix2); |
| 160 | vector unsigned char pix2r = vec_ld(16, pix2); |
| 161 | vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm1); |
| 162 | vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2); |
| 163 | |
| 164 | vector unsigned short pix2hv = |
| 165 | (vector unsigned short) vec_mergeh(zero, pix2v); |
| 166 | vector unsigned short pix2lv = |
| 167 | (vector unsigned short) vec_mergel(zero, pix2v); |
| 168 | vector unsigned short pix2ihv = |
| 169 | (vector unsigned short) vec_mergeh(zero, pix2iv); |
| 170 | vector unsigned short pix2ilv = |
| 171 | (vector unsigned short) vec_mergel(zero, pix2iv); |
| 172 | vector unsigned short t1 = vec_add(pix2hv, pix2ihv); |
| 173 | vector unsigned short t2 = vec_add(pix2lv, pix2ilv); |
| 174 | vector unsigned short t3, t4; |
| 175 | |
| 176 | for (i = 0; i < h; i++) { |
| 177 | /* Read unaligned pixels into our vectors. The vectors are as follows: |
| 178 | * pix1v: pix1[0] - pix1[15] |
| 179 | * pix3v: pix3[0] - pix3[15] pix3iv: pix3[1] - pix3[16] */ |
| 180 | pix1v = vec_ld(0, pix1); |
| 181 | |
| 182 | pix2l = vec_ld(0, pix3); |
| 183 | pix2r = vec_ld(16, pix3); |
| 184 | pix3v = vec_perm(pix2l, pix2r, perm1); |
| 185 | pix3iv = vec_perm(pix2l, pix2r, perm2); |
| 186 | |
| 187 | /* Note that AltiVec does have vec_avg, but this works on vector pairs |
| 188 | * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the |
| 189 | * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when |
| 190 | * it should be 1. Instead, we have to split the pixel vectors into |
| 191 | * vectors of shorts and do the averaging by hand. */ |
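        /* Worked example for a = 3, b = 0, c = 0, d = 1:
         *   correct:     (3 + 0 + 0 + 1 + 2) >> 2              = 1
         *   nested avg:  avg(avg(3, 0), avg(0, 1)) = avg(2, 1) = 2 */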
| 192 | |
| 193 | /* Split the pixel vectors into shorts. */ |
| 194 | pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); |
| 195 | pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); |
| 196 | pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); |
| 197 | pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); |
| 198 | |
| 199 | /* Do the averaging on them. */ |
| 200 | t3 = vec_add(pix3hv, pix3ihv); |
| 201 | t4 = vec_add(pix3lv, pix3ilv); |
| 202 | |
| 203 | avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); |
| 204 | avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); |
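        /* Each short lane now holds (pix2[x] + pix2[x+1] + pix3[x] +
         * pix3[x+1] + 2) >> 2, the correctly rounded xy half-pel
         * average; t1/t2 are the row sums carried over from the
         * previous iteration. */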
| 205 | |
| 206 | /* Pack the shorts back into a result. */ |
| 207 | avgv = vec_pack(avghv, avglv); |
| 208 | |
| 209 | /* Calculate a sum of abs differences vector. */ |
| 210 | t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); |
| 211 | |
| 212 | /* Add each 4 pixel group together and put 4 results into sad. */ |
| 213 | sad = vec_sum4s(t5, sad); |
| 214 | |
| 215 | pix1 += line_size; |
| 216 | pix3 += line_size; |
| 217 | /* Transfer the calculated values for pix3 into pix2. */ |
| 218 | t1 = t3; |
| 219 | t2 = t4; |
| 220 | } |
| 221 | /* Sum up the four partial sums, and put the result into s. */ |
| 222 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
| 223 | sumdiffs = vec_splat(sumdiffs, 3); |
| 224 | vec_ste(sumdiffs, 0, &s); |
| 225 | |
| 226 | return s; |
| 227 | } |
| 228 | |
| 229 | static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
| 230 | int line_size, int h) |
| 231 | { |
| 232 | int i, s; |
| 233 | const vector unsigned int zero = |
| 234 | (const vector unsigned int) vec_splat_u32(0); |
| 235 | vector unsigned char perm = vec_lvsl(0, pix2); |
| 236 | vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
| 237 | vector signed int sumdiffs; |
| 238 | |
| 239 | for (i = 0; i < h; i++) { |
| 240 | /* Read potentially unaligned pixels into t1 and t2. */ |
| 241 | vector unsigned char pix2l = vec_ld(0, pix2); |
| 242 | vector unsigned char pix2r = vec_ld(15, pix2); |
| 243 | vector unsigned char t1 = vec_ld(0, pix1); |
| 244 | vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); |
| 245 | |
| 246 | /* Calculate a sum of abs differences vector. */ |
| 247 | vector unsigned char t3 = vec_max(t1, t2); |
| 248 | vector unsigned char t4 = vec_min(t1, t2); |
| 249 | vector unsigned char t5 = vec_sub(t3, t4); |
| 250 | |
| 251 | /* Add each 4 pixel group together and put 4 results into sad. */ |
| 252 | sad = vec_sum4s(t5, sad); |
| 253 | |
| 254 | pix1 += line_size; |
| 255 | pix2 += line_size; |
| 256 | } |
| 257 | |
| 258 | /* Sum up the four partial sums, and put the result into s. */ |
| 259 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
| 260 | sumdiffs = vec_splat(sumdiffs, 3); |
| 261 | vec_ste(sumdiffs, 0, &s); |
| 262 | |
| 263 | return s; |
| 264 | } |
| 265 | |
| 266 | static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
| 267 | int line_size, int h) |
| 268 | { |
| 269 | int i, s; |
| 270 | const vector unsigned int zero = |
| 271 | (const vector unsigned int) vec_splat_u32(0); |
| 272 | const vector unsigned char permclear = |
| 273 | (vector unsigned char) |
| 274 | { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; |
| 275 | vector unsigned char perm1 = vec_lvsl(0, pix1); |
| 276 | vector unsigned char perm2 = vec_lvsl(0, pix2); |
| 277 | vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); |
| 278 | vector signed int sumdiffs; |
| 279 | |
| 280 | for (i = 0; i < h; i++) { |
| 281 | /* Read potentially unaligned pixels into t1 and t2. |
| 282 | * Since we're reading 16 pixels, and actually only want 8, |
| 283 | * mask out the last 8 pixels. The 0s don't change the sum. */ |
| 284 | vector unsigned char pix1l = vec_ld(0, pix1); |
| 285 | vector unsigned char pix1r = vec_ld(7, pix1); |
| 286 | vector unsigned char pix2l = vec_ld(0, pix2); |
| 287 | vector unsigned char pix2r = vec_ld(7, pix2); |
| 288 | vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), |
| 289 | permclear); |
| 290 | vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), |
| 291 | permclear); |
| 292 | |
| 293 | /* Calculate a sum of abs differences vector. */ |
| 294 | vector unsigned char t3 = vec_max(t1, t2); |
| 295 | vector unsigned char t4 = vec_min(t1, t2); |
| 296 | vector unsigned char t5 = vec_sub(t3, t4); |
| 297 | |
| 298 | /* Add each 4 pixel group together and put 4 results into sad. */ |
| 299 | sad = vec_sum4s(t5, sad); |
| 300 | |
| 301 | pix1 += line_size; |
| 302 | pix2 += line_size; |
| 303 | } |
| 304 | |
| 305 | /* Sum up the four partial sums, and put the result into s. */ |
| 306 | sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); |
| 307 | sumdiffs = vec_splat(sumdiffs, 3); |
| 308 | vec_ste(sumdiffs, 0, &s); |
| 309 | |
| 310 | return s; |
| 311 | } |
| 312 | |
| 313 | /* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. |
| 314 | * It's the sad8_altivec code above w/ squaring added. */ |
| 315 | static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
| 316 | int line_size, int h) |
| 317 | { |
| 318 | int i, s; |
| 319 | const vector unsigned int zero = |
| 320 | (const vector unsigned int) vec_splat_u32(0); |
| 321 | const vector unsigned char permclear = |
| 322 | (vector unsigned char) |
| 323 | { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 }; |
| 324 | vector unsigned char perm1 = vec_lvsl(0, pix1); |
| 325 | vector unsigned char perm2 = vec_lvsl(0, pix2); |
| 326 | vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); |
| 327 | vector signed int sumsqr; |
| 328 | |
| 329 | for (i = 0; i < h; i++) { |
| 330 | /* Read potentially unaligned pixels into t1 and t2. |
| 331 | * Since we're reading 16 pixels, and actually only want 8, |
| 332 | * mask out the last 8 pixels. The 0s don't change the sum. */ |
| 333 | vector unsigned char pix1l = vec_ld(0, pix1); |
| 334 | vector unsigned char pix1r = vec_ld(7, pix1); |
| 335 | vector unsigned char pix2l = vec_ld(0, pix2); |
| 336 | vector unsigned char pix2r = vec_ld(7, pix2); |
| 337 | vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1), |
| 338 | permclear); |
| 339 | vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2), |
| 340 | permclear); |
| 341 | |
        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) squared equals (a - b) squared. */
| 344 | |
| 345 | /* Calculate abs differences vector. */ |
| 346 | vector unsigned char t3 = vec_max(t1, t2); |
| 347 | vector unsigned char t4 = vec_min(t1, t2); |
| 348 | vector unsigned char t5 = vec_sub(t3, t4); |
| 349 | |
| 350 | /* Square the values and add them to our sum. */ |
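        /* vec_msum multiplies each unsigned byte of t5 by itself and
         * adds every group of four products into the matching word of
         * sum, so the squaring and the partial summation cost a single
         * instruction. */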
| 351 | sum = vec_msum(t5, t5, sum); |
| 352 | |
| 353 | pix1 += line_size; |
| 354 | pix2 += line_size; |
| 355 | } |
| 356 | |
| 357 | /* Sum up the four partial sums, and put the result into s. */ |
| 358 | sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
| 359 | sumsqr = vec_splat(sumsqr, 3); |
| 360 | vec_ste(sumsqr, 0, &s); |
| 361 | |
| 362 | return s; |
| 363 | } |
| 364 | |
| 365 | /* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced. |
| 366 | * It's the sad16_altivec code above w/ squaring added. */ |
| 367 | static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, |
| 368 | int line_size, int h) |
| 369 | { |
| 370 | int i, s; |
| 371 | const vector unsigned int zero = |
| 372 | (const vector unsigned int) vec_splat_u32(0); |
| 373 | vector unsigned char perm = vec_lvsl(0, pix2); |
| 374 | vector unsigned int sum = (vector unsigned int) vec_splat_u32(0); |
| 375 | vector signed int sumsqr; |
| 376 | |
| 377 | for (i = 0; i < h; i++) { |
| 378 | /* Read potentially unaligned pixels into t1 and t2. */ |
| 379 | vector unsigned char pix2l = vec_ld(0, pix2); |
| 380 | vector unsigned char pix2r = vec_ld(15, pix2); |
| 381 | vector unsigned char t1 = vec_ld(0, pix1); |
| 382 | vector unsigned char t2 = vec_perm(pix2l, pix2r, perm); |
| 383 | |
        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) squared equals (a - b) squared. */
| 386 | |
| 387 | /* Calculate abs differences vector. */ |
| 388 | vector unsigned char t3 = vec_max(t1, t2); |
| 389 | vector unsigned char t4 = vec_min(t1, t2); |
| 390 | vector unsigned char t5 = vec_sub(t3, t4); |
| 391 | |
| 392 | /* Square the values and add them to our sum. */ |
| 393 | sum = vec_msum(t5, t5, sum); |
| 394 | |
| 395 | pix1 += line_size; |
| 396 | pix2 += line_size; |
| 397 | } |
| 398 | |
| 399 | /* Sum up the four partial sums, and put the result into s. */ |
| 400 | sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); |
| 401 | sumsqr = vec_splat(sumsqr, 3); |
| 402 | vec_ste(sumsqr, 0, &s); |
| 403 | |
| 404 | return s; |
| 405 | } |
| 406 | |
| 407 | static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst, |
| 408 | uint8_t *src, int stride, int h) |
| 409 | { |
| 410 | int sum; |
| 411 | register const vector unsigned char vzero = |
| 412 | (const vector unsigned char) vec_splat_u8(0); |
| 413 | register vector signed short temp0, temp1, temp2, temp3, temp4, |
| 414 | temp5, temp6, temp7; |
| 415 | { |
| 416 | register const vector signed short vprod1 = |
| 417 | (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; |
| 418 | register const vector signed short vprod2 = |
| 419 | (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; |
| 420 | register const vector signed short vprod3 = |
| 421 | (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; |
| 422 | register const vector unsigned char perm1 = |
| 423 | (const vector unsigned char) |
| 424 | { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
| 425 | 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; |
| 426 | register const vector unsigned char perm2 = |
| 427 | (const vector unsigned char) |
| 428 | { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, |
| 429 | 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; |
| 430 | register const vector unsigned char perm3 = |
| 431 | (const vector unsigned char) |
| 432 | { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
| 433 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
| 434 | |
| 435 | #define ONEITERBUTTERFLY(i, res) \ |
| 436 | { \ |
| 437 | register vector unsigned char src1 = vec_ld(stride * i, src); \ |
| 438 | register vector unsigned char src2 = vec_ld(stride * i + 15, src); \ |
| 439 | register vector unsigned char srcO = \ |
| 440 | vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ |
| 441 | register vector unsigned char dst1 = vec_ld(stride * i, dst); \ |
| 442 | register vector unsigned char dst2 = vec_ld(stride * i + 15, dst); \ |
| 443 | register vector unsigned char dstO = \ |
| 444 | vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ |
| 445 | \ |
| 446 | /* Promote the unsigned chars to signed shorts. */ \ |
    /* We're in the 8x8 function, so we only care about the first 8. */ \
| 448 | register vector signed short srcV = \ |
| 449 | (vector signed short) vec_mergeh((vector signed char) vzero, \ |
| 450 | (vector signed char) srcO); \ |
| 451 | register vector signed short dstV = \ |
| 452 | (vector signed short) vec_mergeh((vector signed char) vzero, \ |
| 453 | (vector signed char) dstO); \ |
| 454 | \ |
| 455 | /* subtractions inside the first butterfly */ \ |
| 456 | register vector signed short but0 = vec_sub(srcV, dstV); \ |
| 457 | register vector signed short op1 = vec_perm(but0, but0, perm1); \ |
| 458 | register vector signed short but1 = vec_mladd(but0, vprod1, op1); \ |
| 459 | register vector signed short op2 = vec_perm(but1, but1, perm2); \ |
| 460 | register vector signed short but2 = vec_mladd(but1, vprod2, op2); \ |
| 461 | register vector signed short op3 = vec_perm(but2, but2, perm3); \ |
| 462 | res = vec_mladd(but2, vprod3, op3); \ |
| 463 | } |
| 464 | ONEITERBUTTERFLY(0, temp0); |
| 465 | ONEITERBUTTERFLY(1, temp1); |
| 466 | ONEITERBUTTERFLY(2, temp2); |
| 467 | ONEITERBUTTERFLY(3, temp3); |
| 468 | ONEITERBUTTERFLY(4, temp4); |
| 469 | ONEITERBUTTERFLY(5, temp5); |
| 470 | ONEITERBUTTERFLY(6, temp6); |
| 471 | ONEITERBUTTERFLY(7, temp7); |
| 472 | } |
| 473 | #undef ONEITERBUTTERFLY |
| 474 | { |
| 475 | register vector signed int vsum; |
| 476 | register vector signed short line0 = vec_add(temp0, temp1); |
| 477 | register vector signed short line1 = vec_sub(temp0, temp1); |
| 478 | register vector signed short line2 = vec_add(temp2, temp3); |
| 479 | register vector signed short line3 = vec_sub(temp2, temp3); |
| 480 | register vector signed short line4 = vec_add(temp4, temp5); |
| 481 | register vector signed short line5 = vec_sub(temp4, temp5); |
| 482 | register vector signed short line6 = vec_add(temp6, temp7); |
| 483 | register vector signed short line7 = vec_sub(temp6, temp7); |
| 484 | |
| 485 | register vector signed short line0B = vec_add(line0, line2); |
| 486 | register vector signed short line2B = vec_sub(line0, line2); |
| 487 | register vector signed short line1B = vec_add(line1, line3); |
| 488 | register vector signed short line3B = vec_sub(line1, line3); |
| 489 | register vector signed short line4B = vec_add(line4, line6); |
| 490 | register vector signed short line6B = vec_sub(line4, line6); |
| 491 | register vector signed short line5B = vec_add(line5, line7); |
| 492 | register vector signed short line7B = vec_sub(line5, line7); |
| 493 | |
| 494 | register vector signed short line0C = vec_add(line0B, line4B); |
| 495 | register vector signed short line4C = vec_sub(line0B, line4B); |
| 496 | register vector signed short line1C = vec_add(line1B, line5B); |
| 497 | register vector signed short line5C = vec_sub(line1B, line5B); |
| 498 | register vector signed short line2C = vec_add(line2B, line6B); |
| 499 | register vector signed short line6C = vec_sub(line2B, line6B); |
| 500 | register vector signed short line3C = vec_add(line3B, line7B); |
| 501 | register vector signed short line7C = vec_sub(line3B, line7B); |
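
        /* vec_abs folds each transform coefficient to its magnitude,
         * and the short variant of vec_sum4s adds each pair of shorts
         * into one of vsum's four word accumulators. */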
| 502 | |
| 503 | vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); |
| 504 | vsum = vec_sum4s(vec_abs(line1C), vsum); |
| 505 | vsum = vec_sum4s(vec_abs(line2C), vsum); |
| 506 | vsum = vec_sum4s(vec_abs(line3C), vsum); |
| 507 | vsum = vec_sum4s(vec_abs(line4C), vsum); |
| 508 | vsum = vec_sum4s(vec_abs(line5C), vsum); |
| 509 | vsum = vec_sum4s(vec_abs(line6C), vsum); |
| 510 | vsum = vec_sum4s(vec_abs(line7C), vsum); |
| 511 | vsum = vec_sums(vsum, (vector signed int) vzero); |
| 512 | vsum = vec_splat(vsum, 3); |
| 513 | vec_ste(vsum, 0, &sum); |
| 514 | } |
| 515 | return sum; |
| 516 | } |
| 517 | |
| 518 | /* |
| 519 | * 16x8 works with 16 elements; it allows to avoid replicating loads, and |
| 520 | * gives the compiler more room for scheduling. It's only used from |
| 521 | * inside hadamard8_diff16_altivec. |
| 522 | * |
| 523 | * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has |
| 524 | * a LOT of spill code, it seems gcc (unlike xlc) cannot keep everything in |
| 525 | * registers by itself. The following code includes hand-made register |
| 526 | * allocation. It's not clean, but on a 7450 the resulting code is much faster |
| 527 | * (best case falls from 700+ cycles to 550). |
| 528 | * |
 * xlc doesn't add spill code, but it doesn't know how to schedule for
 * the 7450, and its code isn't much faster than gcc-3.3's on the 7450
 * (though it uses 25% fewer instructions...)
| 532 | * |
| 533 | * On the 970, the hand-made RA is still a win (around 690 vs. around 780), |
| 534 | * but xlc goes to around 660 on the regular C code... |
| 535 | */ |
| 536 | static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst, |
| 537 | uint8_t *src, int stride, int h) |
| 538 | { |
| 539 | int sum; |
| 540 | register vector signed short |
| 541 | temp0 __asm__ ("v0"), |
| 542 | temp1 __asm__ ("v1"), |
| 543 | temp2 __asm__ ("v2"), |
| 544 | temp3 __asm__ ("v3"), |
| 545 | temp4 __asm__ ("v4"), |
| 546 | temp5 __asm__ ("v5"), |
| 547 | temp6 __asm__ ("v6"), |
| 548 | temp7 __asm__ ("v7"); |
| 549 | register vector signed short |
| 550 | temp0S __asm__ ("v8"), |
| 551 | temp1S __asm__ ("v9"), |
| 552 | temp2S __asm__ ("v10"), |
| 553 | temp3S __asm__ ("v11"), |
| 554 | temp4S __asm__ ("v12"), |
| 555 | temp5S __asm__ ("v13"), |
| 556 | temp6S __asm__ ("v14"), |
| 557 | temp7S __asm__ ("v15"); |
| 558 | register const vector unsigned char vzero __asm__ ("v31") = |
| 559 | (const vector unsigned char) vec_splat_u8(0); |
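    /* The explicit __asm__ register pins implement the hand-made
     * register allocation described above; v22-v30 are reused inside
     * the macro where their lifetimes do not overlap. */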
| 560 | { |
| 561 | register const vector signed short vprod1 __asm__ ("v16") = |
| 562 | (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 }; |
| 563 | |
| 564 | register const vector signed short vprod2 __asm__ ("v17") = |
| 565 | (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 }; |
| 566 | |
| 567 | register const vector signed short vprod3 __asm__ ("v18") = |
| 568 | (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 }; |
| 569 | |
| 570 | register const vector unsigned char perm1 __asm__ ("v19") = |
| 571 | (const vector unsigned char) |
| 572 | { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, |
| 573 | 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D }; |
| 574 | |
| 575 | register const vector unsigned char perm2 __asm__ ("v20") = |
| 576 | (const vector unsigned char) |
| 577 | { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, |
| 578 | 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B }; |
| 579 | |
| 580 | register const vector unsigned char perm3 __asm__ ("v21") = |
| 581 | (const vector unsigned char) |
| 582 | { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
| 583 | 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
| 584 | |
| 585 | #define ONEITERBUTTERFLY(i, res1, res2) \ |
| 586 | { \ |
| 587 | register vector unsigned char src1 __asm__ ("v22") = \ |
| 588 | vec_ld(stride * i, src); \ |
| 589 | register vector unsigned char src2 __asm__ ("v23") = \ |
| 590 | vec_ld(stride * i + 16, src); \ |
| 591 | register vector unsigned char srcO __asm__ ("v22") = \ |
| 592 | vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ |
| 593 | register vector unsigned char dst1 __asm__ ("v24") = \ |
| 594 | vec_ld(stride * i, dst); \ |
| 595 | register vector unsigned char dst2 __asm__ ("v25") = \ |
| 596 | vec_ld(stride * i + 16, dst); \ |
| 597 | register vector unsigned char dstO __asm__ ("v23") = \ |
| 598 | vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ |
| 599 | \ |
| 600 | /* Promote the unsigned chars to signed shorts. */ \ |
| 601 | register vector signed short srcV __asm__ ("v24") = \ |
| 602 | (vector signed short) vec_mergeh((vector signed char) vzero, \ |
| 603 | (vector signed char) srcO); \ |
| 604 | register vector signed short dstV __asm__ ("v25") = \ |
| 605 | (vector signed short) vec_mergeh((vector signed char) vzero, \ |
| 606 | (vector signed char) dstO); \ |
| 607 | register vector signed short srcW __asm__ ("v26") = \ |
| 608 | (vector signed short) vec_mergel((vector signed char) vzero, \ |
| 609 | (vector signed char) srcO); \ |
| 610 | register vector signed short dstW __asm__ ("v27") = \ |
| 611 | (vector signed short) vec_mergel((vector signed char) vzero, \ |
| 612 | (vector signed char) dstO); \ |
| 613 | \ |
| 614 | /* subtractions inside the first butterfly */ \ |
| 615 | register vector signed short but0 __asm__ ("v28") = \ |
| 616 | vec_sub(srcV, dstV); \ |
| 617 | register vector signed short but0S __asm__ ("v29") = \ |
| 618 | vec_sub(srcW, dstW); \ |
| 619 | register vector signed short op1 __asm__ ("v30") = \ |
| 620 | vec_perm(but0, but0, perm1); \ |
| 621 | register vector signed short but1 __asm__ ("v22") = \ |
| 622 | vec_mladd(but0, vprod1, op1); \ |
| 623 | register vector signed short op1S __asm__ ("v23") = \ |
| 624 | vec_perm(but0S, but0S, perm1); \ |
| 625 | register vector signed short but1S __asm__ ("v24") = \ |
| 626 | vec_mladd(but0S, vprod1, op1S); \ |
| 627 | register vector signed short op2 __asm__ ("v25") = \ |
| 628 | vec_perm(but1, but1, perm2); \ |
| 629 | register vector signed short but2 __asm__ ("v26") = \ |
| 630 | vec_mladd(but1, vprod2, op2); \ |
| 631 | register vector signed short op2S __asm__ ("v27") = \ |
| 632 | vec_perm(but1S, but1S, perm2); \ |
| 633 | register vector signed short but2S __asm__ ("v28") = \ |
| 634 | vec_mladd(but1S, vprod2, op2S); \ |
| 635 | register vector signed short op3 __asm__ ("v29") = \ |
| 636 | vec_perm(but2, but2, perm3); \ |
| 637 | register vector signed short op3S __asm__ ("v30") = \ |
| 638 | vec_perm(but2S, but2S, perm3); \ |
| 639 | res1 = vec_mladd(but2, vprod3, op3); \ |
| 640 | res2 = vec_mladd(but2S, vprod3, op3S); \ |
| 641 | } |
| 642 | ONEITERBUTTERFLY(0, temp0, temp0S); |
| 643 | ONEITERBUTTERFLY(1, temp1, temp1S); |
| 644 | ONEITERBUTTERFLY(2, temp2, temp2S); |
| 645 | ONEITERBUTTERFLY(3, temp3, temp3S); |
| 646 | ONEITERBUTTERFLY(4, temp4, temp4S); |
| 647 | ONEITERBUTTERFLY(5, temp5, temp5S); |
| 648 | ONEITERBUTTERFLY(6, temp6, temp6S); |
| 649 | ONEITERBUTTERFLY(7, temp7, temp7S); |
| 650 | } |
| 651 | #undef ONEITERBUTTERFLY |
| 652 | { |
| 653 | register vector signed int vsum; |
| 654 | |
| 655 | register vector signed short line0 = vec_add(temp0, temp1); |
| 656 | register vector signed short line1 = vec_sub(temp0, temp1); |
| 657 | register vector signed short line2 = vec_add(temp2, temp3); |
| 658 | register vector signed short line3 = vec_sub(temp2, temp3); |
| 659 | register vector signed short line4 = vec_add(temp4, temp5); |
| 660 | register vector signed short line5 = vec_sub(temp4, temp5); |
| 661 | register vector signed short line6 = vec_add(temp6, temp7); |
| 662 | register vector signed short line7 = vec_sub(temp6, temp7); |
| 663 | |
| 664 | register vector signed short line0B = vec_add(line0, line2); |
| 665 | register vector signed short line2B = vec_sub(line0, line2); |
| 666 | register vector signed short line1B = vec_add(line1, line3); |
| 667 | register vector signed short line3B = vec_sub(line1, line3); |
| 668 | register vector signed short line4B = vec_add(line4, line6); |
| 669 | register vector signed short line6B = vec_sub(line4, line6); |
| 670 | register vector signed short line5B = vec_add(line5, line7); |
| 671 | register vector signed short line7B = vec_sub(line5, line7); |
| 672 | |
| 673 | register vector signed short line0C = vec_add(line0B, line4B); |
| 674 | register vector signed short line4C = vec_sub(line0B, line4B); |
| 675 | register vector signed short line1C = vec_add(line1B, line5B); |
| 676 | register vector signed short line5C = vec_sub(line1B, line5B); |
| 677 | register vector signed short line2C = vec_add(line2B, line6B); |
| 678 | register vector signed short line6C = vec_sub(line2B, line6B); |
| 679 | register vector signed short line3C = vec_add(line3B, line7B); |
| 680 | register vector signed short line7C = vec_sub(line3B, line7B); |
| 681 | |
| 682 | register vector signed short line0S = vec_add(temp0S, temp1S); |
| 683 | register vector signed short line1S = vec_sub(temp0S, temp1S); |
| 684 | register vector signed short line2S = vec_add(temp2S, temp3S); |
| 685 | register vector signed short line3S = vec_sub(temp2S, temp3S); |
| 686 | register vector signed short line4S = vec_add(temp4S, temp5S); |
| 687 | register vector signed short line5S = vec_sub(temp4S, temp5S); |
| 688 | register vector signed short line6S = vec_add(temp6S, temp7S); |
| 689 | register vector signed short line7S = vec_sub(temp6S, temp7S); |
| 690 | |
| 691 | register vector signed short line0BS = vec_add(line0S, line2S); |
| 692 | register vector signed short line2BS = vec_sub(line0S, line2S); |
| 693 | register vector signed short line1BS = vec_add(line1S, line3S); |
| 694 | register vector signed short line3BS = vec_sub(line1S, line3S); |
| 695 | register vector signed short line4BS = vec_add(line4S, line6S); |
| 696 | register vector signed short line6BS = vec_sub(line4S, line6S); |
| 697 | register vector signed short line5BS = vec_add(line5S, line7S); |
| 698 | register vector signed short line7BS = vec_sub(line5S, line7S); |
| 699 | |
| 700 | register vector signed short line0CS = vec_add(line0BS, line4BS); |
| 701 | register vector signed short line4CS = vec_sub(line0BS, line4BS); |
| 702 | register vector signed short line1CS = vec_add(line1BS, line5BS); |
| 703 | register vector signed short line5CS = vec_sub(line1BS, line5BS); |
| 704 | register vector signed short line2CS = vec_add(line2BS, line6BS); |
| 705 | register vector signed short line6CS = vec_sub(line2BS, line6BS); |
| 706 | register vector signed short line3CS = vec_add(line3BS, line7BS); |
| 707 | register vector signed short line7CS = vec_sub(line3BS, line7BS); |
| 708 | |
| 709 | vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); |
| 710 | vsum = vec_sum4s(vec_abs(line1C), vsum); |
| 711 | vsum = vec_sum4s(vec_abs(line2C), vsum); |
| 712 | vsum = vec_sum4s(vec_abs(line3C), vsum); |
| 713 | vsum = vec_sum4s(vec_abs(line4C), vsum); |
| 714 | vsum = vec_sum4s(vec_abs(line5C), vsum); |
| 715 | vsum = vec_sum4s(vec_abs(line6C), vsum); |
| 716 | vsum = vec_sum4s(vec_abs(line7C), vsum); |
| 717 | |
| 718 | vsum = vec_sum4s(vec_abs(line0CS), vsum); |
| 719 | vsum = vec_sum4s(vec_abs(line1CS), vsum); |
| 720 | vsum = vec_sum4s(vec_abs(line2CS), vsum); |
| 721 | vsum = vec_sum4s(vec_abs(line3CS), vsum); |
| 722 | vsum = vec_sum4s(vec_abs(line4CS), vsum); |
| 723 | vsum = vec_sum4s(vec_abs(line5CS), vsum); |
| 724 | vsum = vec_sum4s(vec_abs(line6CS), vsum); |
| 725 | vsum = vec_sum4s(vec_abs(line7CS), vsum); |
| 726 | vsum = vec_sums(vsum, (vector signed int) vzero); |
| 727 | vsum = vec_splat(vsum, 3); |
| 728 | vec_ste(vsum, 0, &sum); |
| 729 | } |
| 730 | return sum; |
| 731 | } |
| 732 | |
| 733 | static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst, |
| 734 | uint8_t *src, int stride, int h) |
| 735 | { |
| 736 | int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); |
| 737 | |
| 738 | if (h == 16) { |
| 739 | dst += 8 * stride; |
| 740 | src += 8 * stride; |
| 741 | score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); |
| 742 | } |
| 743 | return score; |
| 744 | } |
| 745 | #endif /* HAVE_ALTIVEC */ |
| 746 | |
| 747 | av_cold void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx) |
| 748 | { |
| 749 | #if HAVE_ALTIVEC |
| 750 | if (!PPC_ALTIVEC(av_get_cpu_flags())) |
| 751 | return; |
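
    /* pix_abs[0][] covers 16-pixel-wide blocks and pix_abs[1][]
     * 8-pixel-wide ones; the second index selects full-pel (0),
     * horizontal (1), vertical (2) or diagonal (3) half-pel SAD. */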
| 752 | |
| 753 | c->pix_abs[0][1] = sad16_x2_altivec; |
| 754 | c->pix_abs[0][2] = sad16_y2_altivec; |
| 755 | c->pix_abs[0][3] = sad16_xy2_altivec; |
| 756 | c->pix_abs[0][0] = sad16_altivec; |
| 757 | c->pix_abs[1][0] = sad8_altivec; |
| 758 | |
| 759 | c->sad[0] = sad16_altivec; |
| 760 | c->sad[1] = sad8_altivec; |
| 761 | c->sse[0] = sse16_altivec; |
| 762 | c->sse[1] = sse8_altivec; |
| 763 | |
| 764 | c->hadamard8_diff[0] = hadamard8_diff16_altivec; |
| 765 | c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; |
| 766 | #endif /* HAVE_ALTIVEC */ |
| 767 | } |