/*
 * Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

.macro alias name, tgt, set=1
.if \set != 0
    \name   .req    \tgt
.else
    .unreq  \name
.endif
.endm

.altmacro

.macro alias_dw_all qw, dw_l, dw_h
    alias   q\qw\()_l, d\dw_l
    alias   q\qw\()_h, d\dw_h
    .if \qw < 15
        alias_dw_all  %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
    .endif
.endm

alias_dw_all    0, 0, 1

.noaltmacro

.macro alias_qw     name, qw, set=1
    alias   \name\(), \qw, \set
    alias   \name\()_l, \qw\()_l, \set
    alias   \name\()_h, \qw\()_h, \set
.endm

.macro prologue
    push            {r4-r12, lr}
    vpush           {q4-q7}
.endm

.macro epilogue
    vpop            {q4-q7}
    pop             {r4-r12, pc}
.endm

.macro  load_arg    reg, ix
    ldr     \reg,   [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
.endm


/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma
 *                  int width, int height,
 *                  int y_stride, int c_stride, int src_stride,
 *                  int32_t coeff_table[9]);
 */
.macro  alias_loop_420sp set=1
    alias   src,        r0, \set
    alias   src0,       src, \set
    alias   y,          r1, \set
    alias   y0,         y, \set
    alias   chroma,     r2, \set
    alias   width,      r3, \set
    alias   header,     width, \set

    alias   height,     r4, \set
    alias   y_stride,   r5, \set
    alias   c_stride,   r6, \set
    alias   c_padding,  c_stride, \set
    alias   src_stride, r7, \set

    alias   y0_end,     r8, \set

    alias   src_padding,r9, \set
    alias   y_padding,  r10, \set

    alias   src1,       r11, \set
    alias   y1,         r12, \set

    alias   coeff_table,r12, \set
.endm


.macro  loop_420sp s_fmt, d_fmt, init, kernel, precision

function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
    prologue

    alias_loop_420sp

    load_arg    height,         4
    load_arg    y_stride,       5
    load_arg    c_stride,       6
    load_arg    src_stride,     7
    load_arg    coeff_table,    8

    \init       coeff_table

    sub         y_padding,      y_stride,       width
    sub         c_padding,      c_stride,       width
    sub         src_padding,    src_stride,     width, LSL #2

    add         y0_end,         y0,             width
    and         header,         width,          #15

    add         y1,             y0,             y_stride
    add         src1,           src0,           src_stride

0:
    cmp         header,     #0
    beq         1f

    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header

1:
    \kernel     \s_fmt, \d_fmt, src0, src1, y0, y1, chroma

    cmp         y0,         y0_end
    blt         1b
2:
    add         y0,         y1,         y_padding
    add         y0_end,     y1,         y_stride
    add         chroma,     chroma,     c_padding
    add         src0,       src1,       src_padding

    add         y1,         y0,         y_stride
    add         src1,       src0,       src_stride

    subs        height,     height,     #2

    bgt         0b

    epilogue

    alias_loop_420sp 0

endfunc
.endm

.macro downsample
    vpaddl.u8   r16x8,  r8x16
    vpaddl.u8   g16x8,  g8x16
    vpaddl.u8   b16x8,  b8x16
.endm


/* acculumate and right shift by 2 */
.macro downsample_ars2
    vpadal.u8   r16x8,  r8x16
    vpadal.u8   g16x8,  g8x16
    vpadal.u8   b16x8,  b8x16

    vrshr.u16   r16x8,  r16x8,  #2
    vrshr.u16   g16x8,  g16x8,  #2
    vrshr.u16   b16x8,  b16x8,  #2
.endm

.macro store_y8_16x1            dst, count
.ifc "\count",""
    vstmia      \dst!,  {y8x16}
.else
    vstmia      \dst,   {y8x16}
    add         \dst,   \dst,           \count
.endif
.endm

.macro store_chroma_nv12_8x1    dst, count
.ifc "\count",""
    vst2.i8     {u8x8, v8x8},   [\dst]!
.else
    vst2.i8     {u8x8, v8x8},   [\dst], \count
.endif
.endm

.macro store_chroma_nv21_8x1    dst, count
.ifc "\count",""
    vst2.i8     {v8x8, u8x8},   [\dst]!
.else
    vst2.i8     {v8x8, u8x8},   [\dst], \count
.endif
.endm

.macro load_8888_16x1   a, b, c, d, src, count
.ifc "\count",""
    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l},  [\src]!
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]!
.else
    vld4.8      {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l},  [\src]!
    vld4.8      {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h},  [\src]
    sub         \src,   \src,   #32
    add         \src,   \src,   \count, LSL #2
.endif
.endm

.macro load_rgbx_16x1   src, count
    load_8888_16x1  r, g, b, x, \src, \count
.endm

.macro load_bgrx_16x1   src, count
    load_8888_16x1  b, g, r, x, \src, \count
.endm

.macro alias_src_rgbx   set=1
    alias_src_8888  r, g, b, x, \set
.endm

.macro alias_src_bgrx   set=1
    alias_src_8888  b, g, r, x, \set
.endm

.macro alias_dst_nv12   set=1
    alias   u8x8, c8x8x2_l, \set
    alias   v8x8, c8x8x2_h, \set
.endm

.macro alias_dst_nv21   set=1
    alias   v8x8, c8x8x2_l, \set
    alias   u8x8, c8x8x2_h, \set
.endm


// common aliases

alias   CO_R    d0
CO_RY   .dn     d0.s16[0]
CO_RU   .dn     d0.s16[1]
CO_RV   .dn     d0.s16[2]

alias   CO_G    d1
CO_GY   .dn     d1.s16[0]
CO_GU   .dn     d1.s16[1]
CO_GV   .dn     d1.s16[2]

alias   CO_B    d2
CO_BY   .dn     d2.s16[0]
CO_BU   .dn     d2.s16[1]
CO_BV   .dn     d2.s16[2]

alias   BIAS_U, d3
alias   BIAS_V, BIAS_U

alias   BIAS_Y, q2


/* q3-q6 R8G8B8X8 x16 */

.macro alias_src_8888   a, b, c, d, set
    alias_qw  \a\()8x16, q3, \set
    alias_qw  \b\()8x16, q4, \set
    alias_qw  \c\()8x16, q5, \set
    alias_qw  \d\()8x16, q6, \set
.endm

.macro kernel_420_16x2  rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
    alias_src_\rgb_fmt
    alias_dst_\yuv_fmt

    load_\rgb_fmt\()_16x1   \rgb0, \count

    downsample
    compute_y_16x1
    store_y8_16x1   \y0, \count


    load_\rgb_fmt\()_16x1   \rgb1, \count
    downsample_ars2
    compute_y_16x1
    store_y8_16x1   \y1, \count

    compute_chroma_8x1  u, U
    compute_chroma_8x1  v, V

    store_chroma_\yuv_fmt\()_8x1 \chroma, \count

    alias_dst_\yuv_fmt 0
    alias_src_\rgb_fmt 0
.endm