
lavc/opusdsp: simplify R-V V postfilter

This skips the round-trip to scalar registers for the sliding 'x'
coefficients, improving performance by about 5%. The trick here is that
the vector slide-up instruction preserves the elements of the destination
vector below the slide offset.
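
To illustrate the trick, here is a minimal scalar C model of the
vslideup.vi semantics relied on here (illustrative only, assuming VL=4
and made-up names; not part of this commit):

    #include <stdio.h>

    #define VL 4

    /* Model of "vslideup.vi vd, vs, OFFSET": vd[i] = vs[i - OFFSET] for
     * i >= OFFSET, while vd[0 .. OFFSET-1] keep their previous values. */
    static void vslideup(float *vd, const float *vs, int offset)
    {
        for (int i = VL - 1; i >= offset; i--)
            vd[i] = vs[i - offset];
        /* vd[0 .. offset-1] deliberately left untouched */
    }

    int main(void)
    {
        float v0[VL] = { 10, 11, 12, 13 }; /* x0: this iteration's samples */
        float v4[VL] = { 9, 0, 0, 0 };     /* [0] pre-seeded with the sample
                                            * carried over from last time */

        /* v4 becomes { 9, 10, 11, 12 }: the window shifted by one sample,
         * with the carried value kept in the preserved low element -- no
         * scalar round-trip needed. */
        vslideup(v4, v0, 1);

        for (int i = 0; i < VL; i++)
            printf("%g ", v4[i]);
        putchar('\n');
        return 0;
    }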

The switch from vfslide1up.vf to vslideup.vi also allows the elimination
of data dependencies between consecutive slides. Since the specification
recommends sticking to power-of-two offsets, we could slide as follows:

        vslideup.vi v8, v0, 2
        vslideup.vi v4, v0, 1
        vslideup.vi v12, v8, 1
        vslideup.vi v16, v8, 2

However, on the device under test this seems to make performance slightly
worse, so it is left for (in)validation on future, better hardware.
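
For context, the routine implements the Opus postfilter, a 5-tap
symmetric FIR over samples delayed by 'period'. A scalar C sketch,
modeled on the generic implementation in libavcodec/opusdsp.c
(paraphrased, details may differ); the sliding x1..x4 below are the 'x'
coefficients that the old code round-tripped through scalar registers
ft1..ft4:

    /* Scalar sketch of the postfilter vectorized by this function. */
    static void postfilter_ref(float *data, int period, const float *gains,
                               int len)
    {
        const float g0 = gains[0], g1 = gains[1], g2 = gains[2];
        float x4 = data[-period - 2];
        float x3 = data[-period - 1];
        float x2 = data[-period];
        float x1 = data[-period + 1];

        for (int i = 0; i < len; i++) {
            float x0 = data[i - period + 2];

            data[i] += g0 * x2 + g1 * (x1 + x3) + g2 * (x0 + x4);
            /* slide the window one sample forward */
            x4 = x3;
            x3 = x2;
            x2 = x1;
            x1 = x0;
        }
    }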
Rémi Denis-Courmont 2023-12-16 10:02:08 +02:00
parent 04cb307508
commit db32f75c63
1 changed file with 12 additions and 18 deletions

@@ -26,40 +26,34 @@ func ff_opus_postfilter_rvv, zve32f
         flw     fa1, 4(a2)     // g1
         sub     t0, a0, t1
         flw     fa2, 8(a2)     // g2
+        addi    t1, t0, -2 * 4 // data - (period + 2) = initial &x4
         vsetivli zero, 4, e32, m4, ta, ma
         addi    t0, t0, 2 * 4  // data - (period - 2) = initial &x0
-        flw     ft4, -16(t0)
+        vle32.v v16, (t1)
         addi    t3, a1, -2     // maximum parallelism w/o stepping our tail
-        flw     ft3, -12(t0)
-        flw     ft2, -8(t0)
-        flw     ft1, -4(t0)
 1:
+        vslidedown.vi v8, v16, 2
         min     t1, a3, t3
+        vslide1down.vx v12, v16, zero
         vsetvli t1, t1, e32, m4, ta, ma
         vle32.v v0, (t0)       // x0
         sub     a3, a3, t1
-        vle32.v v28, (a0)
+        vslide1down.vx v4, v8, zero
         sh2add  t0, t1, t0
-        vfslide1up.vf v4, v0, ft1
+        vle32.v v28, (a0)
         addi    t2, t1, -4
-        vfslide1up.vf v8, v4, ft2
-        vfslide1up.vf v12, v8, ft3
-        vfslide1up.vf v16, v12, ft4
+        vslideup.vi v4, v0, 1
+        vslideup.vi v8, v4, 1
+        vslideup.vi v12, v8, 1
+        vslideup.vi v16, v12, 1
         vfadd.vv v20, v4, v12
         vfadd.vv v24, v0, v16
-        vslidedown.vx v12, v0, t2
+        vslidedown.vx v16, v0, t2
         vfmacc.vf v28, fa0, v8
-        vslidedown.vi v4, v12, 2
         vfmacc.vf v28, fa1, v20
-        vslide1down.vx v8, v12, zero
         vfmacc.vf v28, fa2, v24
-        vslide1down.vx v0, v4, zero
         vse32.v v28, (a0)
-        vfmv.f.s ft4, v12
         sh2add  a0, t1, a0
-        vfmv.f.s ft2, v4
-        vfmv.f.s ft3, v8
-        vfmv.f.s ft1, v0
         bnez    a3, 1b
         ret