1
0
Fork 0

lavc/me_cmp: R-V V sse

C908:
sse_0_c: 614.7
sse_0_rvv_i32: 138.2
sse_1_c: 302.7
sse_1_rvv_i32: 107.2
sse_2_c: 175.7
sse_2_rvv_i32: 104.2

Signed-off-by: Rémi Denis-Courmont <remi@remlab.net>
This commit is contained in:
sunyuechi 2024-02-06 21:55:07 +08:00 committed by Rémi Denis-Courmont
parent 37463d7979
commit 9cb8f262f2
2 changed files with 77 additions and 0 deletions

View File

@ -39,6 +39,13 @@ int ff_pix_abs16_y2_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *p
/* RVV assembly routine (defined with `func ff_pix_abs8_y2_rvv` in the
 * accompanying .S file); ff_me_cmp_init_riscv() installs it as
 * c->pix_abs[1][2]. */
int ff_pix_abs8_y2_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
/* Sum of squared differences between two 16-byte-wide blocks of h rows.
 * Both pix1 and pix2 advance by stride bytes per row. The RVV assembly
 * implementation never reads v and runs its row loop at least once, so h
 * must be >= 1. Installed as c->sse[0] by ff_me_cmp_init_riscv(). */
int ff_sse16_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
/* Sum of squared differences between two 8-byte-wide blocks of h rows.
 * Both pix1 and pix2 advance by stride bytes per row. The RVV assembly
 * implementation never reads v and runs its row loop at least once, so h
 * must be >= 1. Installed as c->sse[1] by ff_me_cmp_init_riscv(). */
int ff_sse8_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
/* Sum of squared differences between two 4-byte-wide blocks of h rows.
 * Both pix1 and pix2 advance by stride bytes per row. The RVV assembly
 * implementation never reads v and runs its row loop at least once, so h
 * must be >= 1. Installed as c->sse[2] by ff_me_cmp_init_riscv(). */
int ff_sse4_rvv(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_riscv(MECmpContext *c, AVCodecContext *avctx)
{
#if HAVE_RVV
@ -53,6 +60,10 @@ av_cold void ff_me_cmp_init_riscv(MECmpContext *c, AVCodecContext *avctx)
c->pix_abs[1][1] = ff_pix_abs8_x2_rvv;
c->pix_abs[0][2] = ff_pix_abs16_y2_rvv;
c->pix_abs[1][2] = ff_pix_abs8_y2_rvv;
c->sse[0] = ff_sse16_rvv;
c->sse[1] = ff_sse8_rvv;
c->sse[2] = ff_sse4_rvv;
}
#endif
}

View File

@ -165,3 +165,69 @@ func ff_pix_abs8_y2_rvv, zve32x
pix_abs_ret
endfunc
/*
 * int ff_sse16_rvv(MpegEncContext *v, const uint8_t *pix1,
 *                  const uint8_t *pix2, ptrdiff_t stride, int h)
 *
 * Sum of squared differences over a block 16 bytes wide and h rows tall.
 *
 * Registers on entry (standard integer argument registers):
 *   a0 = v (never read; overwritten with the result)
 *   a1 = pix1, a2 = pix2, a3 = stride in bytes, a4 = h
 *
 * v24 (e32, m4) holds 16 per-column 32-bit accumulators; v0[0] seeds the
 * final reduction. Every vsetvli in the loop uses the rd=x0/rs1=x0 form,
 * which keeps vl and only changes vtype; this works because SEW/LMUL stays
 * at 8 (e32/m4, e8/m1, e16/m2).
 *
 * The row counter is decremented before it is tested, so the body always
 * runs once: h must be >= 1.
 *
 * NOTE(review): vl only reaches 16 when VLEN >= 128; the guard that ensures
 * this is presumably in ff_me_cmp_init_riscv() but is not visible here.
 */
func ff_sse16_rvv, zve32x
// vl = 16 elements of 32 bits; t0 receives vl but is not used afterwards.
vsetivli t0, 16, e32, m4, ta, ma
// Clear the 16 column accumulators.
vmv.v.x v24, zero
// v0[0] = 0: scalar seed for the final vredsum.
vmv.s.x v0, zero
1:
// 8-bit elements, same vl: load one row from each source.
vsetvli zero, zero, e8, m1, tu, ma
vle8.v v4, (a1)
vle8.v v12, (a2)
// One row consumed (scalar op interleaved with the vector work).
addi a4, a4, -1
// v16 (16-bit) = pix1 - pix2 modulo 2^16; read as signed this is the true
// difference, which lies in [-255, 255].
vwsubu.vv v16, v4, v12
// 16-bit source elements for the widening multiply-accumulate.
vsetvli zero, zero, e16, m2, tu, ma
// v24[i] += v16[i] * v16[i], signed, accumulated at 32 bits.
vwmacc.vv v24, v16, v16
// Advance both row pointers by the stride.
add a1, a1, a3
add a2, a2, a3
bnez a4, 1b
// Back to 32-bit elements to fold the column accumulators.
vsetvli zero, zero, e32, m4, tu, ma
// v0[0] = v0[0] + sum(v24[0 .. vl-1])
vredsum.vs v0, v24, v0
// Return the total in a0.
vmv.x.s a0, v0
ret
endfunc
/*
 * int ff_sse8_rvv(MpegEncContext *v, const uint8_t *pix1,
 *                 const uint8_t *pix2, ptrdiff_t stride, int h)
 *
 * Sum of squared differences over a block 8 bytes wide and h rows tall.
 *
 * Registers on entry (standard integer argument registers):
 *   a0 = v (never read; overwritten with the result)
 *   a1 = pix1, a2 = pix2, a3 = stride in bytes, a4 = h
 *
 * v24 (e32, m2) holds 8 per-column 32-bit accumulators; v0[0] seeds the
 * final reduction. Every vsetvli in the loop uses the rd=x0/rs1=x0 form,
 * which keeps vl and only changes vtype; this works because SEW/LMUL stays
 * at 16 (e32/m2, e8/mf2, e16/m1).
 *
 * The row counter is decremented before it is tested, so the body always
 * runs once: h must be >= 1.
 *
 * NOTE(review): vl only reaches 8 when VLEN >= 128; the guard that ensures
 * this is presumably in ff_me_cmp_init_riscv() but is not visible here.
 */
func ff_sse8_rvv, zve32x
// vl = 8 elements of 32 bits; t0 receives vl but is not used afterwards.
vsetivli t0, 8, e32, m2, ta, ma
// Clear the 8 column accumulators.
vmv.v.x v24, zero
// v0[0] = 0: scalar seed for the final vredsum.
vmv.s.x v0, zero
1:
// 8-bit elements, same vl: load one row from each source.
vsetvli zero, zero, e8, mf2, tu, ma
vle8.v v4, (a1)
vle8.v v12, (a2)
// One row consumed (scalar op interleaved with the vector work).
addi a4, a4, -1
// v16 (16-bit) = pix1 - pix2 modulo 2^16; read as signed this is the true
// difference, which lies in [-255, 255].
vwsubu.vv v16, v4, v12
// 16-bit source elements for the widening multiply-accumulate.
vsetvli zero, zero, e16, m1, tu, ma
// v24[i] += v16[i] * v16[i], signed, accumulated at 32 bits.
vwmacc.vv v24, v16, v16
// Advance both row pointers by the stride.
add a1, a1, a3
add a2, a2, a3
bnez a4, 1b
// Back to 32-bit elements to fold the column accumulators.
vsetvli zero, zero, e32, m2, tu, ma
// v0[0] = v0[0] + sum(v24[0 .. vl-1])
vredsum.vs v0, v24, v0
// Return the total in a0.
vmv.x.s a0, v0
ret
endfunc
/*
 * int ff_sse4_rvv(MpegEncContext *v, const uint8_t *pix1,
 *                 const uint8_t *pix2, ptrdiff_t stride, int h)
 *
 * Sum of squared differences over a block 4 bytes wide and h rows tall.
 *
 * Registers on entry (standard integer argument registers):
 *   a0 = v (never read; overwritten with the result)
 *   a1 = pix1, a2 = pix2, a3 = stride in bytes, a4 = h
 *
 * v24 (e32, m1) holds 4 per-column 32-bit accumulators; v0[0] seeds the
 * final reduction. Every vsetvli in the loop uses the rd=x0/rs1=x0 form,
 * which keeps vl and only changes vtype; this works because SEW/LMUL stays
 * at 32 (e32/m1, e8/mf4, e16/mf2).
 *
 * The row counter is decremented before it is tested, so the body always
 * runs once: h must be >= 1.
 *
 * NOTE(review): vl only reaches 4 when VLEN >= 128; the guard that ensures
 * this is presumably in ff_me_cmp_init_riscv() but is not visible here.
 */
func ff_sse4_rvv, zve32x
// vl = 4 elements of 32 bits; t0 receives vl but is not used afterwards.
vsetivli t0, 4, e32, m1, ta, ma
// Clear the 4 column accumulators.
vmv.v.x v24, zero
// v0[0] = 0: scalar seed for the final vredsum.
vmv.s.x v0, zero
1:
// 8-bit elements, same vl: load one row from each source.
vsetvli zero, zero, e8, mf4, tu, ma
vle8.v v4, (a1)
vle8.v v12, (a2)
// One row consumed (scalar op interleaved with the vector work).
addi a4, a4, -1
// v16 (16-bit) = pix1 - pix2 modulo 2^16; read as signed this is the true
// difference, which lies in [-255, 255].
vwsubu.vv v16, v4, v12
// 16-bit source elements for the widening multiply-accumulate.
vsetvli zero, zero, e16, mf2, tu, ma
// v24[i] += v16[i] * v16[i], signed, accumulated at 32 bits.
vwmacc.vv v24, v16, v16
// Advance both row pointers by the stride.
add a1, a1, a3
add a2, a2, a3
bnez a4, 1b
// Back to 32-bit elements to fold the column accumulators.
vsetvli zero, zero, e32, m1, tu, ma
// v0[0] = v0[0] + sum(v24[0 .. vl-1])
vredsum.vs v0, v24, v0
// Return the total in a0.
vmv.x.s a0, v0
ret
endfunc