avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt

tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_pel_uni_w_pixels4_8_c:    2.7     1.0
put_hevc_pel_uni_w_pixels6_8_c:    6.2     2.0     1.5
put_hevc_pel_uni_w_pixels8_8_c:    10.7    2.5     1.7
put_hevc_pel_uni_w_pixels12_8_c:   23.0    5.5     5.0
put_hevc_pel_uni_w_pixels16_8_c:   41.0    8.2     5.0
put_hevc_pel_uni_w_pixels24_8_c:   91.0    19.7    13.2
put_hevc_pel_uni_w_pixels32_8_c:   161.7   32.5    16.2
put_hevc_pel_uni_w_pixels48_8_c:   354.5   73.7    43.0
put_hevc_pel_uni_w_pixels64_8_c:   641.5   130.0   64.2

Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000
with 8 threads is 1fps(47fps-->48fps).

Reviewed-by: yinshiyou-hf@loongson.cn
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
2023-12-28 16:21:01 +08:00 · 2023-12-28 16:21:01 +08:00 · a28eea2a27
parent cfbdda607d
commit a28eea2a27
5 changed files with 596 additions and 1 deletions
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@ -28,7 +28,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)       += loongarch/hevcdsp_lsx.o \
                                         loongarch/hevc_mc_bi_lsx.o \
                                         loongarch/hevc_mc_uni_lsx.o \
                                         loongarch/hevc_mc_uniw_lsx.o \
-                                         loongarch/hevc_add_res.o
+                                         loongarch/hevc_add_res.o \
+                                         loongarch/hevc_mc.o
 LSX-OBJS-$(CONFIG_H264DSP)            += loongarch/h264idct.o \
                                         loongarch/h264idct_loongarch.o \
                                         loongarch/h264dsp.o
--- a/libavcodec/loongarch/hevc_mc.S
+++ b/libavcodec/loongarch/hevc_mc.S
@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro LOAD_VAR bit  // broadcast the weighting constants: bit == 128 fills vr1-vr4 (LSX), any other value fills xr1-xr4 (LASX)
+    addi.w         t1,     a5,      6  // shift = a5 + 6 (a5 presumably carries the denom argument - confirm against the C prototype)
+    addi.w         t3,     zero,    1  // one
+    sub.w          t4,     t1,      t3  // t4 = shift - 1
+    sll.w          t3,     t3,      t4 // offset = 1 << (shift - 1), added before the right shift
+.if \bit == 128
+    vreplgr2vr.w   vr1,    a6          // wx replicated into every 32-bit lane
+    vreplgr2vr.w   vr2,    t3          // offset replicated into every 32-bit lane
+    vreplgr2vr.w   vr3,    t1          // shift replicated into every 32-bit lane
+    vreplgr2vr.w   vr4,    a7          // ox replicated into every 32-bit lane
+.else
+    xvreplgr2vr.w  xr1,    a6  // wx
+    xvreplgr2vr.w  xr2,    t3  // offset
+    xvreplgr2vr.w  xr3,    t1  // shift
+    xvreplgr2vr.w  xr4,    a7  // ox
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w  // one row: weight 8 source bytes, store 6 bytes when w == 6, else 8; expects vr1-vr4 from LOAD_VAR 128
+    vldrepl.d      vr0,    \src0,   0  // load 8 source bytes (one doubleword)
+    vsllwil.hu.bu  vr0,    vr0,     0  // widen the low 8 unsigned bytes to halfwords
+    vexth.wu.hu    vr5,    vr0  // vr5 = pixels 4-7 widened to words
+    vsllwil.wu.hu  vr0,    vr0,     0  // vr0 = pixels 0-3 widened to words
+    vslli.w        vr0,    vr0,     6  // pixel << 6
+    vslli.w        vr5,    vr5,     6
+    vmul.w         vr0,    vr0,     vr1  // * wx
+    vmul.w         vr5,    vr5,     vr1
+    vadd.w         vr0,    vr0,     vr2  // + offset
+    vadd.w         vr5,    vr5,     vr2
+    vsra.w         vr0,    vr0,     vr3  // arithmetic >> shift
+    vsra.w         vr5,    vr5,     vr3
+    vadd.w         vr0,    vr0,     vr4  // + ox
+    vadd.w         vr5,    vr5,     vr4
+    vssrani.h.w    vr5,    vr0,     0  // saturating narrow of both word vectors to halfwords, packed into vr5
+    vssrani.bu.h   vr5,    vr5,     0  // saturating narrow to unsigned bytes; result row sits in the low 8 bytes
+.if \w == 6
+    fst.s          f5,     \dst0,   0  // store bytes 0-3 (f5 is the low part of vr5)
+    vstelm.h       vr5,    \dst0,   4,     2  // store halfword element 2 (bytes 4-5) at dst + 4
+.else
+    fst.d          f5,     \dst0,   0  // store all 8 result bytes
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w  // two rows at once, 8 source bytes each (6 stored when w == 6); uses a3/a1 as row strides, clobbers t2 and t3
+    vldrepl.d      vr0,    \src0,   0  // 8 bytes of the first row
+    add.d          t2,     \src0,   a3  // t2 = address of the second row (src0 + a3)
+    vldrepl.d      vr5,    t2,      0  // 8 bytes of the second row
+    xvpermi.q      xr0,    xr5,     0x02  // NOTE(review): intended to leave row 0 in the low 128-bit lane and row 1 in the high lane - confirm selector 0x02 against the ISA manual
+    xvsllwil.hu.bu xr0,    xr0,     0  // per lane: widen the low 8 bytes to halfwords
+    xvexth.wu.hu   xr5,    xr0  // per lane: pixels 4-7 as words
+    xvsllwil.wu.hu xr0,    xr0,     0  // per lane: pixels 0-3 as words
+    xvslli.w       xr0,    xr0,     6  // pixel << 6
+    xvslli.w       xr5,    xr5,     6
+    xvmul.w        xr0,    xr0,     xr1  // * wx
+    xvmul.w        xr5,    xr5,     xr1
+    xvadd.w        xr0,    xr0,     xr2  // + offset
+    xvadd.w        xr5,    xr5,     xr2
+    xvsra.w        xr0,    xr0,     xr3  // arithmetic >> shift
+    xvsra.w        xr5,    xr5,     xr3
+    xvadd.w        xr0,    xr0,     xr4  // + ox
+    xvadd.w        xr5,    xr5,     xr4
+    xvssrani.h.w   xr5,    xr0,     0  // saturating narrow to halfwords; each lane of xr5 now holds one full row
+    xvpermi.q      xr0,    xr5,     0x01  // xr0 = xr5 with its 128-bit lanes swapped (presumably - confirm selector 0x01)
+    xvssrani.bu.h  xr0,    xr5,     0  // saturating narrow to unsigned bytes; the stores below read row 0 from bytes 0-7 and row 1 from bytes 8-15 of vr0
+    add.d          t3,     \dst0,   a1  // t3 = destination of the second row (dst0 + a1)
+.if \w == 6
+    vstelm.w       vr0,    \dst0,   0,     0  // row 0, bytes 0-3
+    vstelm.h       vr0,    \dst0,   4,     2  // row 0, bytes 4-5
+    vstelm.w       vr0,    t3,      0,     2  // row 1, bytes 0-3 (word element 2)
+    vstelm.h       vr0,    t3,      4,     6  // row 1, bytes 4-5 (halfword element 6)
+.else
+    vstelm.d       vr0,    \dst0,   0,     0  // row 0, 8 bytes
+    vstelm.d       vr0,    t3,      0,     1  // row 1, 8 bytes
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LSX src0, dst0  // one row: weight 16 source bytes and store 16 result bytes; expects vr1-vr4 from LOAD_VAR 128
+    vld            vr0,    \src0,   0  // load 16 source bytes
+    vexth.hu.bu    vr7,    vr0  // high 8 bytes widened to halfwords
+    vexth.wu.hu    vr8,    vr7  // vr8 = pixels 12-15 as words
+    vsllwil.wu.hu  vr7,    vr7,     0  // vr7 = pixels 8-11 as words
+    vsllwil.hu.bu  vr5,    vr0,     0  // low 8 bytes widened to halfwords
+    vexth.wu.hu    vr6,    vr5  // vr6 = pixels 4-7 as words
+    vsllwil.wu.hu  vr5,    vr5,     0  // vr5 = pixels 0-3 as words
+    vslli.w        vr5,    vr5,     6  // pixel << 6
+    vslli.w        vr6,    vr6,     6
+    vslli.w        vr7,    vr7,     6
+    vslli.w        vr8,    vr8,     6
+    vmul.w         vr5,    vr5,     vr1  // * wx
+    vmul.w         vr6,    vr6,     vr1
+    vmul.w         vr7,    vr7,     vr1
+    vmul.w         vr8,    vr8,     vr1
+    vadd.w         vr5,    vr5,     vr2  // + offset
+    vadd.w         vr6,    vr6,     vr2
+    vadd.w         vr7,    vr7,     vr2
+    vadd.w         vr8,    vr8,     vr2
+    vsra.w         vr5,    vr5,     vr3  // arithmetic >> shift
+    vsra.w         vr6,    vr6,     vr3
+    vsra.w         vr7,    vr7,     vr3
+    vsra.w         vr8,    vr8,     vr3
+    vadd.w         vr5,    vr5,     vr4  // + ox
+    vadd.w         vr6,    vr6,     vr4
+    vadd.w         vr7,    vr7,     vr4
+    vadd.w         vr8,    vr8,     vr4
+    vssrani.h.w    vr6,    vr5,     0  // vr6 = pixels 0-7 as saturated halfwords
+    vssrani.h.w    vr8,    vr7,     0  // vr8 = pixels 8-15 as saturated halfwords
+    vssrani.bu.h   vr8,    vr6,     0  // vr8 = pixels 0-15 saturated to unsigned bytes
+    vst            vr8,    \dst0,   0  // store 16 result bytes
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS16_LASX src0, dst0  // one row of 16 pixels using 256-bit registers; expects xr1-xr4 from LOAD_VAR 256
+    vld            vr0,    \src0,   0  // load 16 source bytes into the low 128 bits
+    xvpermi.d      xr0,    xr0,     0xd8  // NOTE(review): intended to put bytes 0-7 at the bottom of the low lane and bytes 8-15 at the bottom of the high lane - confirm 0xd8 ordering
+    xvsllwil.hu.bu xr0,    xr0,     0  // per lane: widen the low 8 bytes to halfwords
+    xvexth.wu.hu   xr6,    xr0  // per lane: upper 4 halfwords as words
+    xvsllwil.wu.hu xr5,    xr0,     0  // per lane: lower 4 halfwords as words
+    xvslli.w       xr5,    xr5,     6  // pixel << 6
+    xvslli.w       xr6,    xr6,     6
+    xvmul.w        xr5,    xr5,     xr1  // * wx
+    xvmul.w        xr6,    xr6,     xr1
+    xvadd.w        xr5,    xr5,     xr2  // + offset
+    xvadd.w        xr6,    xr6,     xr2
+    xvsra.w        xr5,    xr5,     xr3  // arithmetic >> shift
+    xvsra.w        xr6,    xr6,     xr3
+    xvadd.w        xr5,    xr5,     xr4  // + ox
+    xvadd.w        xr6,    xr6,     xr4
+    xvssrani.h.w   xr6,    xr5,     0  // saturating narrow to halfwords, 8 pixels per lane
+    xvpermi.q      xr7,    xr6,     0x01  // xr7 = xr6 with its 128-bit lanes swapped (presumably - confirm selector 0x01)
+    xvssrani.bu.h  xr7,    xr6,     0  // saturating narrow to unsigned bytes; the 16 result bytes are read from vr7 below
+    vst            vr7,    \dst0,   0  // store 16 result bytes
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS32_LASX src0, dst0, w  // weight 32 bytes per call: two 16-byte rows when w == 16, else one row storing 24 (w == 24) or 32 bytes; clobbers t2 when w == 16
+.if \w == 16
+    vld            vr0,    \src0,   0  // 16 bytes of the first row
+    add.d          t2,     \src0,   a3  // t2 = address of the second row (src0 + a3)
+    vld            vr5,    t2,      0  // 16 bytes of the second row
+    xvpermi.q      xr0,    xr5,     0x02  // NOTE(review): intended to leave row 0 in the low lane and row 1 in the high lane - confirm selector 0x02
+.else //w=24/32
+    xvld           xr0,    \src0,   0  // 32 source bytes of a single row (also for w == 24, so 8 bytes past the row are read)
+.endif
+    xvexth.hu.bu   xr7,    xr0  // per lane: high 8 bytes widened to halfwords
+    xvexth.wu.hu   xr8,    xr7  // per lane: pixels 12-15 as words
+    xvsllwil.wu.hu xr7,    xr7,     0  // per lane: pixels 8-11 as words
+    xvsllwil.hu.bu xr5,    xr0,     0  // per lane: low 8 bytes widened to halfwords
+    xvexth.wu.hu   xr6,    xr5  // per lane: pixels 4-7 as words
+    xvsllwil.wu.hu xr5,    xr5,     0  // per lane: pixels 0-3 as words
+    xvslli.w       xr5,    xr5,     6  // pixel << 6
+    xvslli.w       xr6,    xr6,     6
+    xvslli.w       xr7,    xr7,     6
+    xvslli.w       xr8,    xr8,     6
+    xvmul.w        xr5,    xr5,     xr1  // * wx
+    xvmul.w        xr6,    xr6,     xr1
+    xvmul.w        xr7,    xr7,     xr1
+    xvmul.w        xr8,    xr8,     xr1
+    xvadd.w        xr5,    xr5,     xr2  // + offset
+    xvadd.w        xr6,    xr6,     xr2
+    xvadd.w        xr7,    xr7,     xr2
+    xvadd.w        xr8,    xr8,     xr2
+    xvsra.w        xr5,    xr5,     xr3  // arithmetic >> shift
+    xvsra.w        xr6,    xr6,     xr3
+    xvsra.w        xr7,    xr7,     xr3
+    xvsra.w        xr8,    xr8,     xr3
+    xvadd.w        xr5,    xr5,     xr4  // + ox
+    xvadd.w        xr6,    xr6,     xr4
+    xvadd.w        xr7,    xr7,     xr4
+    xvadd.w        xr8,    xr8,     xr4
+    xvssrani.h.w   xr6,    xr5,     0  // per lane: pixels 0-7 as saturated halfwords
+    xvssrani.h.w   xr8,    xr7,     0  // per lane: pixels 8-15 as saturated halfwords
+    xvssrani.bu.h  xr8,    xr6,     0  // per lane: 16 pixels saturated to unsigned bytes
+.if \w == 16
+    vst            vr8,    \dst0,   0  // first row: low 128 bits
+    add.d          t2,     \dst0,   a1  // t2 = destination of the second row (dst0 + a1)
+    xvpermi.q      xr8,    xr8,     0x01  // bring the high lane down to the low 128 bits (presumably - confirm selector 0x01)
+    vst            vr8,    t2,      0  // second row
+.elseif \w == 24
+    vst            vr8,    \dst0,   0  // bytes 0-15
+    xvstelm.d      xr8,    \dst0,   16,    2  // bytes 16-23 (doubleword element 2) at dst + 16
+.else
+    xvst           xr8,    \dst0,   0  // all 32 result bytes
+.endif
+.endm
+
+function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx  // 4-byte-wide weighted copy (LSX), two rows per iteration; a0/a1 = dst/stride, a2/a3 = src/stride, a4 = height (assumed from the C prototype order)
+    LOAD_VAR 128  // vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox
+    srli.w         t0,     a4,      1  // t0 = a4 / 2 iterations; NOTE(review): loop tests after decrement, so a4 is assumed even and >= 2
+.LOOP_PIXELS4:
+    vldrepl.w      vr0,    a2,      0  // 4 bytes of the first row
+    add.d          t1,     a2,      a3  // t1 = address of the second row (t1 is free again after LOAD_VAR)
+    vldrepl.w      vr5,    t1,      0  // 4 bytes of the second row
+    vsllwil.hu.bu  vr0,    vr0,     0  // bytes -> halfwords
+    vsllwil.wu.hu  vr0,    vr0,     0  // halfwords -> words, first row
+    vsllwil.hu.bu  vr5,    vr5,     0
+    vsllwil.wu.hu  vr5,    vr5,     0  // second row as words
+    vslli.w        vr0,    vr0,     6  // pixel << 6
+    vslli.w        vr5,    vr5,     6
+    vmul.w         vr0,    vr0,     vr1  // * wx
+    vmul.w         vr5,    vr5,     vr1
+    vadd.w         vr0,    vr0,     vr2  // + offset
+    vadd.w         vr5,    vr5,     vr2
+    vsra.w         vr0,    vr0,     vr3  // arithmetic >> shift
+    vsra.w         vr5,    vr5,     vr3
+    vadd.w         vr0,    vr0,     vr4  // + ox
+    vadd.w         vr5,    vr5,     vr4
+    vssrani.h.w    vr5,    vr0,     0  // both rows narrowed to saturated halfwords in vr5
+    vssrani.bu.h   vr5,    vr5,     0  // narrowed to unsigned bytes: word 0 = first row, word 1 = second row
+    fst.s          f5,     a0,      0  // store the first row (4 bytes)
+    add.d          t2,     a0,      a1  // t2 = destination of the second row
+    vstelm.w       vr5,    t2,      0,     1  // store the second row (word element 1)
+    alsl.d         a2,     a3,      a2,    1  // a2 += 2 * a3
+    alsl.d         a0,     a1,      a0,    1  // a0 += 2 * a1
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS4
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx  // 6-byte-wide weighted copy (LSX), one row per iteration
+    LOAD_VAR 128  // vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox
+.LOOP_PIXELS6:
+    HEVC_PEL_UNI_W_PIXELS8_LSX      a2,    a0,    6  // loads 8 bytes from a2, stores 6 bytes at a0
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; tested after the decrement, so a4 >= 1 is assumed
+    bnez           a4,     .LOOP_PIXELS6
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx  // 6-byte-wide weighted copy (LASX), two rows per iteration
+    LOAD_VAR 256  // xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox
+    srli.w         t0,     a4,      1  // t0 = a4 / 2 iterations; NOTE(review): assumes a4 (height) is even and >= 2
+.LOOP_PIXELS6_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX   a2,    a0,    6  // rows at a2 and a2 + a3 -> 6 bytes each at a0 and a0 + a1
+    alsl.d         a2,     a3,      a2,    1  // a2 += 2 * a3
+    alsl.d         a0,     a1,      a0,    1  // a0 += 2 * a1
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS6_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx  // 8-byte-wide weighted copy (LSX), one row per iteration
+    LOAD_VAR 128  // vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox
+.LOOP_PIXELS8:
+    HEVC_PEL_UNI_W_PIXELS8_LSX      a2,    a0,    8  // 8 bytes from a2 -> 8 bytes at a0
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS8
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx  // 8-byte-wide weighted copy (LASX), two rows per iteration
+    LOAD_VAR 256  // xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox
+    srli.w         t0,     a4,      1  // t0 = a4 / 2 iterations; NOTE(review): assumes a4 (height) is even and >= 2
+.LOOP_PIXELS8_LASX:
+    HEVC_PEL_UNI_W_PIXELS8x2_LASX   a2,    a0,    8  // rows at a2 and a2 + a3 -> 8 bytes each at a0 and a0 + a1
+    alsl.d         a2,     a3,      a2,    1  // a2 += 2 * a3
+    alsl.d         a0,     a1,      a0,    1  // a0 += 2 * a1
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS8_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx  // 12-byte-wide weighted copy (LSX), one row per iteration
+    LOAD_VAR 128  // vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox
+.LOOP_PIXELS12:
+    vld            vr0,    a2,      0  // loads 16 bytes although only 12 are used
+    vexth.hu.bu    vr7,    vr0  // high 8 bytes widened to halfwords
+    vsllwil.wu.hu  vr7,    vr7,     0  // vr7 = pixels 8-11 as words
+    vsllwil.hu.bu  vr5,    vr0,     0  // low 8 bytes widened to halfwords
+    vexth.wu.hu    vr6,    vr5  // vr6 = pixels 4-7 as words
+    vsllwil.wu.hu  vr5,    vr5,     0  // vr5 = pixels 0-3 as words
+    vslli.w        vr5,    vr5,     6  // pixel << 6
+    vslli.w        vr6,    vr6,     6
+    vslli.w        vr7,    vr7,     6
+    vmul.w         vr5,    vr5,     vr1  // * wx
+    vmul.w         vr6,    vr6,     vr1
+    vmul.w         vr7,    vr7,     vr1
+    vadd.w         vr5,    vr5,     vr2  // + offset
+    vadd.w         vr6,    vr6,     vr2
+    vadd.w         vr7,    vr7,     vr2
+    vsra.w         vr5,    vr5,     vr3  // arithmetic >> shift
+    vsra.w         vr6,    vr6,     vr3
+    vsra.w         vr7,    vr7,     vr3
+    vadd.w         vr5,    vr5,     vr4  // + ox
+    vadd.w         vr6,    vr6,     vr4
+    vadd.w         vr7,    vr7,     vr4
+    vssrani.h.w    vr6,    vr5,     0  // vr6 = pixels 0-7 as saturated halfwords
+    vssrani.h.w    vr7,    vr7,     0  // vr7 = pixels 8-11 as saturated halfwords (duplicated in both halves)
+    vssrani.bu.h   vr7,    vr6,     0  // unsigned bytes: bytes 0-7 = pixels 0-7, bytes 8-11 = pixels 8-11
+    fst.d          f7,     a0,      0  // store bytes 0-7
+    vstelm.w       vr7,    a0,      8,     2  // store bytes 8-11 (word element 2) at a0 + 8
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS12
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx  // 12-byte-wide weighted copy (LASX), one row per iteration
+    LOAD_VAR 256  // xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox
+.LOOP_PIXELS12_LASX:
+    vld            vr0,    a2,      0  // loads 16 bytes although only 12 are stored
+    xvpermi.d      xr0,    xr0,     0xd8  // NOTE(review): intended to put bytes 0-7 at the bottom of the low lane and bytes 8-15 at the bottom of the high lane - confirm 0xd8 ordering
+    xvsllwil.hu.bu xr0,    xr0,     0  // per lane: widen the low 8 bytes to halfwords
+    xvexth.wu.hu   xr6,    xr0  // per lane: upper 4 halfwords as words
+    xvsllwil.wu.hu xr5,    xr0,     0  // per lane: lower 4 halfwords as words
+    xvslli.w       xr5,    xr5,     6  // pixel << 6
+    xvslli.w       xr6,    xr6,     6
+    xvmul.w        xr5,    xr5,     xr1  // * wx
+    xvmul.w        xr6,    xr6,     xr1
+    xvadd.w        xr5,    xr5,     xr2  // + offset
+    xvadd.w        xr6,    xr6,     xr2
+    xvsra.w        xr5,    xr5,     xr3  // arithmetic >> shift
+    xvsra.w        xr6,    xr6,     xr3
+    xvadd.w        xr5,    xr5,     xr4  // + ox
+    xvadd.w        xr6,    xr6,     xr4
+    xvssrani.h.w   xr6,    xr5,     0  // saturating narrow to halfwords, 8 pixels per lane
+    xvpermi.q      xr7,    xr6,     0x01  // xr7 = xr6 with its 128-bit lanes swapped (presumably - confirm selector 0x01)
+    xvssrani.bu.h  xr7,    xr6,     0  // saturating narrow to unsigned bytes; the result row is read from vr7 below
+    fst.d          f7,     a0,      0  // store bytes 0-7
+    vstelm.w       vr7,    a0,      8,     2  // store bytes 8-11 (word element 2) at a0 + 8
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS12_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx  // 16-byte-wide weighted copy (LSX), one row per iteration
+    LOAD_VAR 128  // vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox
+.LOOP_PIXELS16:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0  // 16 bytes from a2 -> 16 bytes at a0
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS16
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx  // 16-byte-wide weighted copy (LASX), two rows per iteration
+    LOAD_VAR 256  // xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox
+    srli.w         t0,     a4,      1  // t0 = a4 / 2 iterations; NOTE(review): assumes a4 (height) is even and >= 2
+.LOOP_PIXELS16_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,   16  // w == 16 variant: rows at a2 and a2 + a3 -> a0 and a0 + a1
+    alsl.d         a2,     a3,      a2,    1  // a2 += 2 * a3
+    alsl.d         a0,     a1,      a0,    1  // a0 += 2 * a1
+    addi.w         t0,     t0,      -1
+    bnez           t0,     .LOOP_PIXELS16_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx  // 24-byte-wide weighted copy (LSX): a 16-byte and an 8-byte chunk per row
+    LOAD_VAR 128  // vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox
+.LOOP_PIXELS24:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0  // bytes 0-15
+    addi.d         t0,     a2,      16  // t0 = source of bytes 16-23
+    addi.d         t1,     a0,      16  // t1 = destination of bytes 16-23
+    HEVC_PEL_UNI_W_PIXELS8_LSX      t0,    t1,   8  // bytes 16-23
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS24
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx  // 24-byte-wide weighted copy (LASX), one row per iteration
+    LOAD_VAR 256  // xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox
+.LOOP_PIXELS24_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,   24  // w == 24 variant: loads 32 bytes from a2, stores 24 bytes at a0
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS24_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx  // 32-byte-wide weighted copy (LSX): two 16-byte chunks per row
+    LOAD_VAR 128  // vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox
+.LOOP_PIXELS32:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0  // bytes 0-15
+    addi.d         t0,     a2,      16  // t0 = source of bytes 16-31
+    addi.d         t1,     a0,      16  // t1 = destination of bytes 16-31
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1  // bytes 16-31
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS32
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx  // 32-byte-wide weighted copy (LASX), one row per iteration
+    LOAD_VAR 256  // xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox
+.LOOP_PIXELS32_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,    32  // 32 bytes from a2 -> 32 bytes at a0
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS32_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx  // 48-byte-wide weighted copy (LSX): three 16-byte chunks per row
+    LOAD_VAR 128  // vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox
+.LOOP_PIXELS48:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0  // bytes 0-15
+    addi.d         t0,     a2,      16  // source of bytes 16-31
+    addi.d         t1,     a0,      16  // destination of bytes 16-31
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1  // bytes 16-31
+    addi.d         t0,     a2,      32  // source of bytes 32-47
+    addi.d         t1,     a0,      32  // destination of bytes 32-47
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1  // bytes 32-47
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS48
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx  // 48-byte-wide weighted copy (LASX): a 32-byte and a 16-byte chunk per row
+    LOAD_VAR 256  // xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox
+.LOOP_PIXELS48_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,    32  // bytes 0-31
+    addi.d         t0,     a2,      32  // source of bytes 32-47
+    addi.d         t1,     a0,      32  // destination of bytes 32-47
+    HEVC_PEL_UNI_W_PIXELS16_LASX    t0,    t1  // bytes 32-47
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS48_LASX
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx  // 64-byte-wide weighted copy (LSX): four 16-byte chunks per row
+    LOAD_VAR 128  // vr1 = wx, vr2 = offset, vr3 = shift, vr4 = ox
+.LOOP_PIXELS64:
+    HEVC_PEL_UNI_W_PIXELS16_LSX     a2,    a0  // bytes 0-15
+    addi.d         t0,     a2,      16  // source of bytes 16-31
+    addi.d         t1,     a0,      16  // destination of bytes 16-31
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1  // bytes 16-31
+    addi.d         t0,     a2,      32  // source of bytes 32-47
+    addi.d         t1,     a0,      32  // destination of bytes 32-47
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1  // bytes 32-47
+    addi.d         t0,     a2,      48  // source of bytes 48-63
+    addi.d         t1,     a0,      48  // destination of bytes 48-63
+    HEVC_PEL_UNI_W_PIXELS16_LSX     t0,    t1  // bytes 48-63
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS64
+endfunc
+
+function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx  // 64-byte-wide weighted copy (LASX): two 32-byte chunks per row
+    LOAD_VAR 256  // xr1 = wx, xr2 = offset, xr3 = shift, xr4 = ox
+.LOOP_PIXELS64_LASX:
+    HEVC_PEL_UNI_W_PIXELS32_LASX    a2,    a0,    32  // bytes 0-31
+    addi.d         t0,     a2,      32  // source of bytes 32-63
+    addi.d         t1,     a0,      32  // destination of bytes 32-63
+    HEVC_PEL_UNI_W_PIXELS32_LASX    t0,    t1,    32  // bytes 32-63
+    add.d          a2,     a2,      a3  // next source row
+    add.d          a0,     a0,      a1  // next destination row
+    addi.w         a4,     a4,      -1  // a4 counts remaining rows; a4 >= 1 is assumed on entry
+    bnez           a4,     .LOOP_PIXELS64_LASX
+endfunc
--- a/libavcodec/loongarch/hevcdsp_init_loongarch.c
+++ b/libavcodec/loongarch/hevcdsp_init_loongarch.c
@ -22,6 +22,7 @@

 #include "libavutil/loongarch/cpu.h"
 #include "hevcdsp_lsx.h"
+#include "hevcdsp_lasx.h"

 void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
 {
@ -160,6 +161,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
            c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_8_lsx;
            c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_lsx;

+            c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;  /* first index 1..9 -> widths 4, 6, 8, 12, 16, 24, 32, 48, 64 */
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;  /* NOTE(review): [0][0] presumably selects the case without fractional offsets - confirm in hevcdsp.h */
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
+            c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx;  /* the epel table reuses the very same pel kernels as the qpel table */
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx;
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx;
+
            c->put_hevc_qpel_uni_w[3][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv8_8_lsx;
            c->put_hevc_qpel_uni_w[5][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv16_8_lsx;
            c->put_hevc_qpel_uni_w[6][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv24_8_lsx;
@ -196,4 +217,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
            c->add_residual[3] = ff_hevc_add_residual32x32_8_lsx;
        }
    }
+
+    if (have_lasx(cpu_flags)) {  /* placed after the LSX setup, so these assignments replace the LSX pointers stored in the same slots */
+        if (bit_depth == 8) {  /* only 8-bit kernels are installed here */
+            c->put_hevc_qpel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;  /* slot [1] (width 4) is not overridden: no LASX width-4 kernel is declared */
+            c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_qpel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+
+            c->put_hevc_epel_uni_w[2][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx;  /* the epel table reuses the same pel kernels as the qpel table */
+            c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx;
+            c->put_hevc_epel_uni_w[4][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx;
+            c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx;
+            c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx;
+            c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
+            c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
+            c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
+        }
+    }
 }
--- a/libavcodec/loongarch/hevcdsp_lasx.h
+++ b/libavcodec/loongarch/hevcdsp_lasx.h
@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo <jinbo@loongson.cn>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+#define AVCODEC_LOONGARCH_HEVCDSP_LASX_H
+
+#include "libavcodec/hevcdsp.h"
+
+#define PEL_UNI_W(PEL, DIR, WIDTH)                                       \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lasx(uint8_t *dst,  \
+                                                          ptrdiff_t      \
+                                                          dst_stride,    \
+                                                          const uint8_t *src,  \
+                                                          ptrdiff_t      \
+                                                          src_stride,    \
+                                                          int height,    \
+                                                          int denom,     \
+                                                          int wx,        \
+                                                          int ox,        \
+                                                          intptr_t mx,   \
+                                                          intptr_t my,   \
+                                                          int width)
+
+PEL_UNI_W(pel, pixels, 6);   /* declares ff_hevc_put_hevc_pel_uni_w_pixels6_8_lasx; each line below declares one prototype with this shared signature */
+PEL_UNI_W(pel, pixels, 8);   /* ff_hevc_put_hevc_pel_uni_w_pixels8_8_lasx */
+PEL_UNI_W(pel, pixels, 12);  /* ff_hevc_put_hevc_pel_uni_w_pixels12_8_lasx */
+PEL_UNI_W(pel, pixels, 16);  /* ff_hevc_put_hevc_pel_uni_w_pixels16_8_lasx */
+PEL_UNI_W(pel, pixels, 24);  /* ff_hevc_put_hevc_pel_uni_w_pixels24_8_lasx */
+PEL_UNI_W(pel, pixels, 32);  /* ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx */
+PEL_UNI_W(pel, pixels, 48);  /* ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx */
+PEL_UNI_W(pel, pixels, 64);  /* ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx */
+
+#undef PEL_UNI_W  /* the generator macro is not left defined for includers */
+
+#endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H
--- a/libavcodec/loongarch/hevcdsp_lsx.h
+++ b/libavcodec/loongarch/hevcdsp_lsx.h
@ -232,4 +232,31 @@ void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t s
 void ff_hevc_add_residual16x16_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
 void ff_hevc_add_residual32x32_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride);

+#define PEL_UNI_W(PEL, DIR, WIDTH)                                      \
+void ff_hevc_put_hevc_##PEL##_uni_w_##DIR##WIDTH##_8_lsx(uint8_t *dst,  \
+                                                         ptrdiff_t      \
+                                                         dst_stride,    \
+                                                         const uint8_t *src,  \
+                                                         ptrdiff_t      \
+                                                         src_stride,    \
+                                                         int height,    \
+                                                         int denom,     \
+                                                         int wx,        \
+                                                         int ox,        \
+                                                         intptr_t mx,   \
+                                                         intptr_t my,   \
+                                                         int width)
+
+PEL_UNI_W(pel, pixels, 4);   /* declares ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx; each line below declares one prototype with this shared signature */
+PEL_UNI_W(pel, pixels, 6);   /* ff_hevc_put_hevc_pel_uni_w_pixels6_8_lsx */
+PEL_UNI_W(pel, pixels, 8);   /* ff_hevc_put_hevc_pel_uni_w_pixels8_8_lsx */
+PEL_UNI_W(pel, pixels, 12);  /* ff_hevc_put_hevc_pel_uni_w_pixels12_8_lsx */
+PEL_UNI_W(pel, pixels, 16);  /* ff_hevc_put_hevc_pel_uni_w_pixels16_8_lsx */
+PEL_UNI_W(pel, pixels, 24);  /* ff_hevc_put_hevc_pel_uni_w_pixels24_8_lsx */
+PEL_UNI_W(pel, pixels, 32);  /* ff_hevc_put_hevc_pel_uni_w_pixels32_8_lsx */
+PEL_UNI_W(pel, pixels, 48);  /* ff_hevc_put_hevc_pel_uni_w_pixels48_8_lsx */
+PEL_UNI_W(pel, pixels, 64);  /* ff_hevc_put_hevc_pel_uni_w_pixels64_8_lsx */
+
+#undef PEL_UNI_W  /* the generator macro is not left defined for includers */
+
 #endif  // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H