[deb_ffmpeg.git] / me_cmp_mvi_asm.S

/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "regdef.h"

/* Some nicer register names.  ta..td give the code four scratch registers
   beyond t0-t9.  td is AT, the register normally reserved for the
   assembler, which is why ".set noat" is in effect below.  */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/* Danger: these overlap with the argument list and the return value.
   In the code visible here only te is referenced (by the aligned loop of
   pix_abs16x16_mvi_asm).  tf, tg and th alias a4, a3 and v0, which that
   function keeps live as its row counter, stride and running sum, so they
   must not be used as temporaries there.  */
#define te a5
#define tf a4
#define tg a3
#define th v0

        /* noat: AT is used explicitly (as td), so the assembler must not
           claim it.  noreorder: ask the assembler to leave the
           hand-scheduled instruction order as written.  .arch pca56:
           presumably needed so the MVI instruction perr is accepted --
           confirm against the assembler documentation.  */
        .set noat
        .set noreorder
        .arch pca56
        .text

/*****************************************************************************
 * int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)
 *
 * NOTE(review): the prototype above looks stale.  The code never reads a0
 * (it is only overwritten as a scratch register in the aligned loop) and
 * takes its inputs from a1..a4, i.e. it behaves as if declared with a
 * leading unused argument and a trailing row count.  Confirm against the
 * C declaration before relying on either form.
 *
 * Registers as used by the code:
 *   a1  "pix1" / reference block.  Always read with plain ldq, so it is
 *       presumably required to be 8-byte aligned -- TODO confirm.
 *   a2  "pix2" block.  Its low three bits select the aligned or the
 *       unaligned loop once, at entry.
 *   a3  stride added to both a1 and a2 after every row.
 *   a4  row count.  The unaligned loop consumes 2 rows per iteration, the
 *       aligned loop 4, and both leave only when a4 reaches exactly 0, so
 *       a4 must be a positive multiple of the unroll factor.
 *   v0  result: the sum of the perr results for both 8-byte halves of
 *       every processed row (perr being the MVI pixel-error instruction).
 *
 * Modified besides v0: a1, a2, a4, t0-t9, ta, tb, tc, td (AT) and, in the
 * aligned loop only, te (a5) and a0.  No stack frame is set up.
 *
 * This code is written with a pca56 in mind. For ev6, one should
 * really take the increased latency of 3 cycles for MVI instructions
 * into account.
 *
 * It is important to keep the loading and first use of a register as
 * far apart as possible, because if a register is accessed before it
 * has been fetched from memory, the CPU will stall.
 */
        .align 4
        .globl pix_abs16x16_mvi_asm
        .ent pix_abs16x16_mvi_asm
pix_abs16x16_mvi_asm:
        .frame sp, 0, ra, 0
        .prologue 0

        and     a2, 7, t0       # t0 = low three bits of pix2
        clr     v0              # running sum = 0
        beq     t0, $aligned    # pix2 is 8-byte aligned: plain ldq loop
        .align 4
$unaligned:
        /* Two rows per iteration.  Each 16-byte pix2 row is fetched as
           three ldq_u quadwords (left_u, mid, right_u) and the two wanted
           quadwords are reassembled with extql/extqh before being compared
           against the pix1 row.

           Registers:
           line 0:
           t0:  left_u -> left lo -> left
           t1:  mid
           t2:  right_u -> right hi -> right
           t3:  ref left
           t4:  ref right
           line 1:
           t5:  left_u -> left lo -> left
           t6:  mid
           t7:  right_u -> right hi -> right
           t8:  ref left
           t9:  ref right
           temp:
           ta:  left hi
           tb:  right lo
           tc:  error left
           td:  error right  */

        /* load line 0 */
        ldq_u   t0, 0(a2)       # left_u
        ldq_u   t1, 8(a2)       # mid
        ldq_u   t2, 16(a2)      # right_u
        ldq     t3, 0(a1)       # ref left
        ldq     t4, 8(a1)       # ref right
        addq    a1, a3, a1      # pix1
        addq    a2, a3, a2      # pix2
        /* load line 1 */
        ldq_u   t5, 0(a2)       # left_u
        ldq_u   t6, 8(a2)       # mid
        ldq_u   t7, 16(a2)      # right_u
        ldq     t8, 0(a1)       # ref left
        ldq     t9, 8(a1)       # ref right
        addq    a1, a3, a1      # pix1
        addq    a2, a3, a2      # pix2
        /* calc line 0 */
        /* a2 has already been advanced by two strides at this point.  It is
           used below only as the byte-shift operand of extql/extqh, so the
           result is right only if a2 has the same low bits on every row,
           i.e. if a3 is a multiple of 8 -- TODO confirm callers guarantee
           this.  */
        extql   t0, a2, t0      # left lo
        extqh   t1, a2, ta      # left hi
        extql   t1, a2, tb      # right lo
        or      t0, ta, t0      # left
        extqh   t2, a2, t2      # right hi
        perr    t3, t0, tc      # error left
        or      t2, tb, t2      # right
        perr    t4, t2, td      # error right
        addq    v0, tc, v0      # add error left
        addq    v0, td, v0      # add error right
        /* calc line 1 */
        extql   t5, a2, t5      # left lo
        extqh   t6, a2, ta      # left hi
        extql   t6, a2, tb      # right lo
        or      t5, ta, t5      # left
        extqh   t7, a2, t7      # right hi
        perr    t8, t5, tc      # error left
        or      t7, tb, t7      # right
        perr    t9, t7, td      # error right
        addq    v0, tc, v0      # add error left
        addq    v0, td, v0      # add error right
        /* loop */
        subq    a4,  2, a4      # h -= 2
        bne     a4, $unaligned  # leaves only when h is exactly 0
        ret

        .align 4
$aligned:
        /* Four rows per iteration.  All eight pix2 quadwords and all eight
           pix1 quadwords are loaded up front, into t0-t9, ta-te and a0,
           before the first perr, keeping each load well away from its
           first use.  The perr results are then written to t0/t1, which
           are reused for every row.  */
        /* load line 0 */
        ldq     t0, 0(a2)       # left
        ldq     t1, 8(a2)       # right
        addq    a2, a3, a2      # pix2
        ldq     t2, 0(a1)       # ref left
        ldq     t3, 8(a1)       # ref right
        addq    a1, a3, a1      # pix1
        /* load line 1 */
        ldq     t4, 0(a2)       # left
        ldq     t5, 8(a2)       # right
        addq    a2, a3, a2      # pix2
        ldq     t6, 0(a1)       # ref left
        ldq     t7, 8(a1)       # ref right
        addq    a1, a3, a1      # pix1
        /* load line 2 */
        ldq     t8, 0(a2)       # left
        ldq     t9, 8(a2)       # right
        addq    a2, a3, a2      # pix2
        ldq     ta, 0(a1)       # ref left
        ldq     tb, 8(a1)       # ref right
        addq    a1, a3, a1      # pix1
        /* load line 3 */
        ldq     tc, 0(a2)       # left
        ldq     td, 8(a2)       # right
        addq    a2, a3, a2      # pix2
        ldq     te, 0(a1)       # ref left (te is a5)
        ldq     a0, 8(a1)       # ref right (a0 reused as scratch)
        /* calc line 0 */
        perr    t0, t2, t0      # error left
        addq    a1, a3, a1      # pix1 (line 3 advance, after its loads)
        perr    t1, t3, t1      # error right
        addq    v0, t0, v0      # add error left
        /* calc line 1 */
        perr    t4, t6, t0      # error left
        addq    v0, t1, v0      # add error right
        perr    t5, t7, t1      # error right
        addq    v0, t0, v0      # add error left
        /* calc line 2 */
        perr    t8, ta, t0      # error left
        addq    v0, t1, v0      # add error right
        perr    t9, tb, t1      # error right
        addq    v0, t0, v0      # add error left
        /* calc line 3 */
        perr    tc, te, t0      # error left
        addq    v0, t1, v0      # add error right
        perr    td, a0, t1      # error right
        addq    v0, t0, v0      # add error left
        addq    v0, t1, v0      # add error right
        /* loop */
        subq    a4,  4, a4      # h -= 4
        bne     a4, $aligned    # leaves only when h is exactly 0
        ret
        .end pix_abs16x16_mvi_asm
Commit	Line	Data
	1	/*
	2	* Alpha optimized DSP utils
	3	* Copyright (c) 2002 Falk Hueffner <falk@debian.org>
	4	*
	5	* This file is part of FFmpeg.
	6	*
	7	* FFmpeg is free software; you can redistribute it and/or
	8	* modify it under the terms of the GNU Lesser General Public
	9	* License as published by the Free Software Foundation; either
	10	* version 2.1 of the License, or (at your option) any later version.
	11	*
	12	* FFmpeg is distributed in the hope that it will be useful,
	13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	15	* Lesser General Public License for more details.
	16	*
	17	* You should have received a copy of the GNU Lesser General Public
	18	* License along with FFmpeg; if not, write to the Free Software
	19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
	20	*/
	21
	22	#include "regdef.h"
	23
	24	/* Some nicer register names. */
	25	#define ta t10
	26	#define tb t11
	27	#define tc t12
	28	#define td AT
	29	/* Danger: these overlap with the argument list and the return value */
	30	#define te a5
	31	#define tf a4
	32	#define tg a3
	33	#define th v0
	34
	35	.set noat
	36	.set noreorder
	37	.arch pca56
	38	.text
	39
	40	/*****************************************************************************
	41	* int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size)
	42	*
	43	* This code is written with a pca56 in mind. For ev6, one should
	44	* really take the increased latency of 3 cycles for MVI instructions
	45	* into account.
	46	*
	47	* It is important to keep the loading and first use of a register as
	48	* far apart as possible, because if a register is accessed before it
	49	* has been fetched from memory, the CPU will stall.
	50	*/
	51	.align 4
	52	.globl pix_abs16x16_mvi_asm
	53	.ent pix_abs16x16_mvi_asm
	54	pix_abs16x16_mvi_asm:
	55	.frame sp, 0, ra, 0
	56	.prologue 0
	57
	58	and a2, 7, t0
	59	clr v0
	60	beq t0, $aligned
	61	.align 4
	62	$unaligned:
	63	/* Registers:
	64	line 0:
	65	t0: left_u -> left lo -> left
	66	t1: mid
	67	t2: right_u -> right hi -> right
	68	t3: ref left
	69	t4: ref right
	70	line 1:
	71	t5: left_u -> left lo -> left
	72	t6: mid
	73	t7: right_u -> right hi -> right
	74	t8: ref left
	75	t9: ref right
	76	temp:
	77	ta: left hi
	78	tb: right lo
	79	tc: error left
	80	td: error right */
	81
	82	/* load line 0 */
	83	ldq_u t0, 0(a2) # left_u
	84	ldq_u t1, 8(a2) # mid
	85	ldq_u t2, 16(a2) # right_u
	86	ldq t3, 0(a1) # ref left
	87	ldq t4, 8(a1) # ref right
	88	addq a1, a3, a1 # pix1
	89	addq a2, a3, a2 # pix2
	90	/* load line 1 */
	91	ldq_u t5, 0(a2) # left_u
	92	ldq_u t6, 8(a2) # mid
	93	ldq_u t7, 16(a2) # right_u
	94	ldq t8, 0(a1) # ref left
	95	ldq t9, 8(a1) # ref right
	96	addq a1, a3, a1 # pix1
	97	addq a2, a3, a2 # pix2
	98	/* calc line 0 */
	99	extql t0, a2, t0 # left lo
	100	extqh t1, a2, ta # left hi
	101	extql t1, a2, tb # right lo
	102	or t0, ta, t0 # left
	103	extqh t2, a2, t2 # right hi
	104	perr t3, t0, tc # error left
	105	or t2, tb, t2 # right
	106	perr t4, t2, td # error right
	107	addq v0, tc, v0 # add error left
	108	addq v0, td, v0 # add error right
	109	/* calc line 1 */
	110	extql t5, a2, t5 # left lo
	111	extqh t6, a2, ta # left hi
	112	extql t6, a2, tb # right lo
	113	or t5, ta, t5 # left
	114	extqh t7, a2, t7 # right hi
	115	perr t8, t5, tc # error left
	116	or t7, tb, t7 # right
	117	perr t9, t7, td # error right
	118	addq v0, tc, v0 # add error left
	119	addq v0, td, v0 # add error right
	120	/* loop */
	121	subq a4, 2, a4 # h -= 2
	122	bne a4, $unaligned
	123	ret
	124
	125	.align 4
	126	$aligned:
	127	/* load line 0 */
	128	ldq t0, 0(a2) # left
	129	ldq t1, 8(a2) # right
	130	addq a2, a3, a2 # pix2
	131	ldq t2, 0(a1) # ref left
	132	ldq t3, 8(a1) # ref right
	133	addq a1, a3, a1 # pix1
	134	/* load line 1 */
	135	ldq t4, 0(a2) # left
	136	ldq t5, 8(a2) # right
	137	addq a2, a3, a2 # pix2
	138	ldq t6, 0(a1) # ref left
	139	ldq t7, 8(a1) # ref right
	140	addq a1, a3, a1 # pix1
	141	/* load line 2 */
	142	ldq t8, 0(a2) # left
	143	ldq t9, 8(a2) # right
	144	addq a2, a3, a2 # pix2
	145	ldq ta, 0(a1) # ref left
	146	ldq tb, 8(a1) # ref right
	147	addq a1, a3, a1 # pix1
	148	/* load line 3 */
	149	ldq tc, 0(a2) # left
	150	ldq td, 8(a2) # right
	151	addq a2, a3, a2 # pix2
	152	ldq te, 0(a1) # ref left
	153	ldq a0, 8(a1) # ref right
	154	/* calc line 0 */
	155	perr t0, t2, t0 # error left
	156	addq a1, a3, a1 # pix1
	157	perr t1, t3, t1 # error right
	158	addq v0, t0, v0 # add error left
	159	/* calc line 1 */
	160	perr t4, t6, t0 # error left
	161	addq v0, t1, v0 # add error right
	162	perr t5, t7, t1 # error right
	163	addq v0, t0, v0 # add error left
	164	/* calc line 2 */
	165	perr t8, ta, t0 # error left
	166	addq v0, t1, v0 # add error right
	167	perr t9, tb, t1 # error right
	168	addq v0, t0, v0 # add error left
	169	/* calc line 3 */
	170	perr tc, te, t0 # error left
	171	addq v0, t1, v0 # add error right
	172	perr td, a0, t1 # error right
	173	addq v0, t0, v0 # add error left
	174	addq v0, t1, v0 # add error right
	175	/* loop */
	176	subq a4, 4, a4 # h -= 4
	177	bne a4, $aligned
	178	ret
	179	.end pix_abs16x16_mvi_asm