1
0
Fork 0

avcodec/hevc: Add qpel_uni_w_v|h4/6/8/12/16/24/32/48/64 asm opt

tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_qpel_uni_w_h4_8_c:        6.5     1.7     1.2
put_hevc_qpel_uni_w_h6_8_c:        14.5    4.5     3.7
put_hevc_qpel_uni_w_h8_8_c:        24.5    5.7     4.5
put_hevc_qpel_uni_w_h12_8_c:       54.7    17.5    12.0
put_hevc_qpel_uni_w_h16_8_c:       96.5    22.7    13.2
put_hevc_qpel_uni_w_h24_8_c:       216.0   51.2    33.2
put_hevc_qpel_uni_w_h32_8_c:       385.7   87.0    53.2
put_hevc_qpel_uni_w_h48_8_c:       860.5   192.0   113.2
put_hevc_qpel_uni_w_h64_8_c:       1531.0  334.2   200.0

put_hevc_qpel_uni_w_v4_8_c:        8.0     1.7
put_hevc_qpel_uni_w_v6_8_c:        17.2    4.5
put_hevc_qpel_uni_w_v8_8_c:        29.5    6.0     5.2
put_hevc_qpel_uni_w_v12_8_c:       65.2    16.0    11.7
put_hevc_qpel_uni_w_v16_8_c:       116.5   20.5    14.0
put_hevc_qpel_uni_w_v24_8_c:       259.2   48.5    37.2
put_hevc_qpel_uni_w_v32_8_c:       459.5   80.5    56.0
put_hevc_qpel_uni_w_v48_8_c:       1028.5  180.2   126.5
put_hevc_qpel_uni_w_v64_8_c:       1831.2  319.2   224.2

Speedup of decoding H265 4K 30FPS 30Mbps on
3A6000 with 8 threads is 4fps(48fps-->52fps).

Change-Id: I1178848541d90083869225ba98a02e6aa8bb8c5a
Reviewed-by: yinshiyou-hf@loongson.cn
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
This commit is contained in:
jinbo 2023-12-28 16:21:02 +08:00 committed by Michael Niedermayer
parent a28eea2a27
commit 6c6bf18ce8
No known key found for this signature in database
GPG Key ID: B18E8928B3948D64
4 changed files with 1370 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@ -188,6 +188,26 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni_w[8][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv48_8_lsx;
c->put_hevc_qpel_uni_w[9][1][1] = ff_hevc_put_hevc_uni_w_qpel_hv64_8_lsx;
c->put_hevc_qpel_uni_w[1][1][0] = ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx;
c->put_hevc_qpel_uni_w[2][1][0] = ff_hevc_put_hevc_qpel_uni_w_v6_8_lsx;
c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_qpel_uni_w_v8_8_lsx;
c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_qpel_uni_w_v12_8_lsx;
c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_qpel_uni_w_v16_8_lsx;
c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_qpel_uni_w_v24_8_lsx;
c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_qpel_uni_w_v32_8_lsx;
c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_qpel_uni_w_v48_8_lsx;
c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_qpel_uni_w_v64_8_lsx;
c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_qpel_uni_w_h4_8_lsx;
c->put_hevc_qpel_uni_w[2][0][1] = ff_hevc_put_hevc_qpel_uni_w_h6_8_lsx;
c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_qpel_uni_w_h8_8_lsx;
c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_qpel_uni_w_h12_8_lsx;
c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_qpel_uni_w_h16_8_lsx;
c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_qpel_uni_w_h24_8_lsx;
c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_qpel_uni_w_h32_8_lsx;
c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_qpel_uni_w_h48_8_lsx;
c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_qpel_uni_w_h64_8_lsx;
c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_lsx;
c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_8_lsx;
c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_8_lsx;
@ -237,6 +257,24 @@ void ff_hevc_dsp_init_loongarch(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels32_8_lasx;
c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels48_8_lasx;
c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx;
c->put_hevc_qpel_uni_w[3][1][0] = ff_hevc_put_hevc_qpel_uni_w_v8_8_lasx;
c->put_hevc_qpel_uni_w[4][1][0] = ff_hevc_put_hevc_qpel_uni_w_v12_8_lasx;
c->put_hevc_qpel_uni_w[5][1][0] = ff_hevc_put_hevc_qpel_uni_w_v16_8_lasx;
c->put_hevc_qpel_uni_w[6][1][0] = ff_hevc_put_hevc_qpel_uni_w_v24_8_lasx;
c->put_hevc_qpel_uni_w[7][1][0] = ff_hevc_put_hevc_qpel_uni_w_v32_8_lasx;
c->put_hevc_qpel_uni_w[8][1][0] = ff_hevc_put_hevc_qpel_uni_w_v48_8_lasx;
c->put_hevc_qpel_uni_w[9][1][0] = ff_hevc_put_hevc_qpel_uni_w_v64_8_lasx;
c->put_hevc_qpel_uni_w[1][0][1] = ff_hevc_put_hevc_qpel_uni_w_h4_8_lasx;
c->put_hevc_qpel_uni_w[2][0][1] = ff_hevc_put_hevc_qpel_uni_w_h6_8_lasx;
c->put_hevc_qpel_uni_w[3][0][1] = ff_hevc_put_hevc_qpel_uni_w_h8_8_lasx;
c->put_hevc_qpel_uni_w[4][0][1] = ff_hevc_put_hevc_qpel_uni_w_h12_8_lasx;
c->put_hevc_qpel_uni_w[5][0][1] = ff_hevc_put_hevc_qpel_uni_w_h16_8_lasx;
c->put_hevc_qpel_uni_w[6][0][1] = ff_hevc_put_hevc_qpel_uni_w_h24_8_lasx;
c->put_hevc_qpel_uni_w[7][0][1] = ff_hevc_put_hevc_qpel_uni_w_h32_8_lasx;
c->put_hevc_qpel_uni_w[8][0][1] = ff_hevc_put_hevc_qpel_uni_w_h48_8_lasx;
c->put_hevc_qpel_uni_w[9][0][1] = ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx;
}
}
}

View File

@ -48,6 +48,24 @@ PEL_UNI_W(pel, pixels, 32);
PEL_UNI_W(pel, pixels, 48);
PEL_UNI_W(pel, pixels, 64);
PEL_UNI_W(qpel, v, 8);
PEL_UNI_W(qpel, v, 12);
PEL_UNI_W(qpel, v, 16);
PEL_UNI_W(qpel, v, 24);
PEL_UNI_W(qpel, v, 32);
PEL_UNI_W(qpel, v, 48);
PEL_UNI_W(qpel, v, 64);
PEL_UNI_W(qpel, h, 4);
PEL_UNI_W(qpel, h, 6);
PEL_UNI_W(qpel, h, 8);
PEL_UNI_W(qpel, h, 12);
PEL_UNI_W(qpel, h, 16);
PEL_UNI_W(qpel, h, 24);
PEL_UNI_W(qpel, h, 32);
PEL_UNI_W(qpel, h, 48);
PEL_UNI_W(qpel, h, 64);
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LASX_H

View File

@ -257,6 +257,26 @@ PEL_UNI_W(pel, pixels, 32);
PEL_UNI_W(pel, pixels, 48);
PEL_UNI_W(pel, pixels, 64);
PEL_UNI_W(qpel, v, 4);
PEL_UNI_W(qpel, v, 6);
PEL_UNI_W(qpel, v, 8);
PEL_UNI_W(qpel, v, 12);
PEL_UNI_W(qpel, v, 16);
PEL_UNI_W(qpel, v, 24);
PEL_UNI_W(qpel, v, 32);
PEL_UNI_W(qpel, v, 48);
PEL_UNI_W(qpel, v, 64);
PEL_UNI_W(qpel, h, 4);
PEL_UNI_W(qpel, h, 6);
PEL_UNI_W(qpel, h, 8);
PEL_UNI_W(qpel, h, 12);
PEL_UNI_W(qpel, h, 16);
PEL_UNI_W(qpel, h, 24);
PEL_UNI_W(qpel, h, 32);
PEL_UNI_W(qpel, h, 48);
PEL_UNI_W(qpel, h, 64);
#undef PEL_UNI_W
#endif // #ifndef AVCODEC_LOONGARCH_HEVCDSP_LSX_H