1
0
Fork 0

aarch64: Make the indentation more consistent

Some functions have slightly different indentation styles; try
to match the surrounding code.

libavcodec/aarch64/vc1dsp_neon.S is skipped here, as it intentionally
uses a layered indentation style to visually show how different
unrolled/interleaved phases fit together.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö 2023-10-17 13:47:27 +03:00
parent 93cda5a9c2
commit 7f905f3672
7 changed files with 304 additions and 304 deletions

View File

@ -526,7 +526,7 @@ function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
ld1 {v17.8b}, [x4], x1
ld1 {v19.8b}, [x4], x1
transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
@ -554,7 +554,7 @@ h_loop_filter_chroma420_intra:
ld1 {v17.s}[1], [x4], x1
ld1 {v19.s}[1], [x4], x1
transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
transpose_4x8B v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra
@ -1017,7 +1017,7 @@ function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1
ld1 {v16.8h}, [x4], x1
ld1 {v19.8h}, [x9], x1
transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra_10
@ -1045,7 +1045,7 @@ h_loop_filter_chroma420_intra_10:
ld1 {v19.4h}, [x4], x1
ld1 {v19.d}[1], [x9], x1
transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
transpose_4x8H v18, v16, v17, v19, v26, v27, v28, v29
h264_loop_filter_chroma_intra_10

View File

@ -580,8 +580,8 @@ function \type\()_h264_qpel16_hv_lowpass_l2_neon
endfunc
.endm
h264_qpel16_hv put
h264_qpel16_hv avg
h264_qpel16_hv put
h264_qpel16_hv avg
.macro h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
@ -759,8 +759,8 @@ function ff_\type\()_h264_qpel8_mc33_neon, export=1
endfunc
.endm
h264_qpel8 put
h264_qpel8 avg
h264_qpel8 put
h264_qpel8 avg
.macro h264_qpel16 type
function ff_\type\()_h264_qpel16_mc10_neon, export=1
@ -931,5 +931,5 @@ function ff_\type\()_h264_qpel16_mc33_neon, export=1
endfunc
.endm
h264_qpel16 put
h264_qpel16 avg
h264_qpel16 put
h264_qpel16 avg

View File

@ -239,23 +239,23 @@ function hevc_add_residual_32x32_16_neon, export=0
endfunc
.macro tr_4x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
sshll v20.4s, \in0, #6
sshll v21.4s, \in0, #6
smull v22.4s, \in1, v4.h[1]
smull v23.4s, \in1, v4.h[3]
smlal v20.4s, \in2, v4.h[0] //e0
smlsl v21.4s, \in2, v4.h[0] //e1
smlal v22.4s, \in3, v4.h[3] //o0
smlsl v23.4s, \in3, v4.h[1] //o1
sshll v20.4s, \in0, #6
sshll v21.4s, \in0, #6
smull v22.4s, \in1, v4.h[1]
smull v23.4s, \in1, v4.h[3]
smlal v20.4s, \in2, v4.h[0] //e0
smlsl v21.4s, \in2, v4.h[0] //e1
smlal v22.4s, \in3, v4.h[3] //o0
smlsl v23.4s, \in3, v4.h[1] //o1
add v24.4s, v20.4s, v22.4s
sub v20.4s, v20.4s, v22.4s
add v22.4s, v21.4s, v23.4s
sub v21.4s, v21.4s, v23.4s
sqrshrn \out0, v24.4s, #\shift
sqrshrn \out3, v20.4s, #\shift
sqrshrn \out1, v22.4s, #\shift
sqrshrn \out2, v21.4s, #\shift
add v24.4s, v20.4s, v22.4s
sub v20.4s, v20.4s, v22.4s
add v22.4s, v21.4s, v23.4s
sub v21.4s, v21.4s, v23.4s
sqrshrn \out0, v24.4s, #\shift
sqrshrn \out3, v20.4s, #\shift
sqrshrn \out1, v22.4s, #\shift
sqrshrn \out2, v21.4s, #\shift
.endm
.macro idct_4x4 bitdepth
@ -294,19 +294,19 @@ endfunc
// uses and clobbers v28-v31 as temp registers
.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2
sshll\p1 v28.4s, \in0, #6
mov v29.16b, v28.16b
smull\p1 v30.4s, \in1, v0.h[1]
smull\p1 v31.4s, \in1, v0.h[3]
smlal\p2 v28.4s, \in2, v0.h[0] //e0
smlsl\p2 v29.4s, \in2, v0.h[0] //e1
smlal\p2 v30.4s, \in3, v0.h[3] //o0
smlsl\p2 v31.4s, \in3, v0.h[1] //o1
sshll\p1 v28.4s, \in0, #6
mov v29.16b, v28.16b
smull\p1 v30.4s, \in1, v0.h[1]
smull\p1 v31.4s, \in1, v0.h[3]
smlal\p2 v28.4s, \in2, v0.h[0] //e0
smlsl\p2 v29.4s, \in2, v0.h[0] //e1
smlal\p2 v30.4s, \in3, v0.h[3] //o0
smlsl\p2 v31.4s, \in3, v0.h[1] //o1
add \out0, v28.4s, v30.4s
add \out1, v29.4s, v31.4s
sub \out2, v29.4s, v31.4s
sub \out3, v28.4s, v30.4s
add \out0, v28.4s, v30.4s
add \out1, v29.4s, v31.4s
sub \out2, v29.4s, v31.4s
sub \out3, v28.4s, v30.4s
.endm
.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
@ -362,11 +362,11 @@ endfunc
.macro idct_8x8 bitdepth
function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
//x0 - coeffs
mov x1, x0
mov x1, x0
ld1 {v16.8h-v19.8h}, [x1], #64
ld1 {v20.8h-v23.8h}, [x1]
movrel x1, trans
movrel x1, trans
ld1 {v0.8h}, [x1]
tr_8x4 7, v16,.4h, v17,.4h, v18,.4h, v19,.4h, v20,.4h, v21,.4h, v22,.4h, v23,.4h
@ -379,7 +379,7 @@ function ff_hevc_idct_8x8_\bitdepth\()_neon, export=1
transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23
mov x1, x0
mov x1, x0
st1 {v16.8h-v19.8h}, [x1], #64
st1 {v20.8h-v23.8h}, [x1]
@ -388,8 +388,8 @@ endfunc
.endm
.macro butterfly e, o, tmp_p, tmp_m
add \tmp_p, \e, \o
sub \tmp_m, \e, \o
add \tmp_p, \e, \o
sub \tmp_m, \e, \o
.endm
.macro tr16_8x4 in0, in1, in2, in3, offset
@ -418,7 +418,7 @@ endfunc
butterfly v25.4s, v29.4s, v17.4s, v22.4s
butterfly v26.4s, v30.4s, v18.4s, v21.4s
butterfly v27.4s, v31.4s, v19.4s, v20.4s
add x4, sp, #\offset
add x4, sp, #\offset
st1 {v16.4s-v19.4s}, [x4], #64
st1 {v20.4s-v23.4s}, [x4]
.endm
@ -435,14 +435,14 @@ endfunc
.endm
.macro add_member in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, p
sum_sub v21.4s, \in, \t0, \op0, \p
sum_sub v22.4s, \in, \t1, \op1, \p
sum_sub v23.4s, \in, \t2, \op2, \p
sum_sub v24.4s, \in, \t3, \op3, \p
sum_sub v25.4s, \in, \t4, \op4, \p
sum_sub v26.4s, \in, \t5, \op5, \p
sum_sub v27.4s, \in, \t6, \op6, \p
sum_sub v28.4s, \in, \t7, \op7, \p
sum_sub v21.4s, \in, \t0, \op0, \p
sum_sub v22.4s, \in, \t1, \op1, \p
sum_sub v23.4s, \in, \t2, \op2, \p
sum_sub v24.4s, \in, \t3, \op3, \p
sum_sub v25.4s, \in, \t4, \op4, \p
sum_sub v26.4s, \in, \t5, \op5, \p
sum_sub v27.4s, \in, \t6, \op6, \p
sum_sub v28.4s, \in, \t7, \op7, \p
.endm
.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
@ -528,20 +528,20 @@ endfunc
.macro tr_16x4 name, shift, offset, step
function func_tr_16x4_\name
mov x1, x5
add x3, x5, #(\step * 64)
mov x2, #(\step * 128)
mov x1, x5
add x3, x5, #(\step * 64)
mov x2, #(\step * 128)
load16 v16.d, v17.d, v18.d, v19.d
movrel x1, trans
movrel x1, trans
ld1 {v0.8h}, [x1]
tr16_8x4 v16, v17, v18, v19, \offset
add x1, x5, #(\step * 32)
add x3, x5, #(\step * 3 *32)
mov x2, #(\step * 128)
add x1, x5, #(\step * 32)
add x3, x5, #(\step * 3 *32)
mov x2, #(\step * 128)
load16 v20.d, v17.d, v18.d, v19.d
movrel x1, trans, 16
movrel x1, trans, 16
ld1 {v1.8h}, [x1]
smull v21.4s, v20.4h, v1.h[0]
smull v22.4s, v20.4h, v1.h[1]
@ -560,19 +560,19 @@ function func_tr_16x4_\name
add_member v19.4h, v1.h[6], v1.h[3], v1.h[0], v1.h[2], v1.h[5], v1.h[7], v1.h[4], v1.h[1], +, -, +, -, +, +, -, +
add_member v19.8h, v1.h[7], v1.h[6], v1.h[5], v1.h[4], v1.h[3], v1.h[2], v1.h[1], v1.h[0], +, -, +, -, +, -, +, -, 2
add x4, sp, #\offset
add x4, sp, #\offset
ld1 {v16.4s-v19.4s}, [x4], #64
butterfly16 v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, v24.4s
.if \shift > 0
scale v29, v30, v31, v24, v20.4s, v16.4s, v21.4s, v17.4s, v22.4s, v18.4s, v23.4s, v19.4s, \shift
transpose16_4x4_2 v29, v30, v31, v24, v2, v3, v4, v5, v6, v7
mov x1, x6
add x3, x6, #(24 +3*32)
mov x2, #32
mov x4, #-32
mov x1, x6
add x3, x6, #(24 +3*32)
mov x2, #32
mov x4, #-32
store16 v29.d, v30.d, v31.d, v24.d, x4
.else
store_to_stack \offset, (\offset + 240), v20.4s, v21.4s, v22.4s, v23.4s, v19.4s, v18.4s, v17.4s, v16.4s
store_to_stack \offset, (\offset + 240), v20.4s, v21.4s, v22.4s, v23.4s, v19.4s, v18.4s, v17.4s, v16.4s
.endif
add x4, sp, #(\offset + 64)
@ -582,13 +582,13 @@ function func_tr_16x4_\name
scale v29, v30, v31, v20, v20.4s, v16.4s, v25.4s, v17.4s, v26.4s, v18.4s, v27.4s, v19.4s, \shift
transpose16_4x4_2 v29, v30, v31, v20, v2, v3, v4, v5, v6, v7
add x1, x6, #8
add x3, x6, #(16 + 3 * 32)
mov x2, #32
mov x4, #-32
add x1, x6, #8
add x3, x6, #(16 + 3 * 32)
mov x2, #32
mov x4, #-32
store16 v29.d, v30.d, v31.d, v20.d, x4
.else
store_to_stack (\offset + 64), (\offset + 176), v20.4s, v25.4s, v26.4s, v27.4s, v19.4s, v18.4s, v17.4s, v16.4s
store_to_stack (\offset + 64), (\offset + 176), v20.4s, v25.4s, v26.4s, v27.4s, v19.4s, v18.4s, v17.4s, v16.4s
.endif
ret
@ -601,21 +601,21 @@ function ff_hevc_idct_16x16_\bitdepth\()_neon, export=1
mov x15, x30
// allocate a temp buffer
sub sp, sp, #640
sub sp, sp, #640
.irp i, 0, 1, 2, 3
add x5, x0, #(8 * \i)
add x6, sp, #(8 * \i * 16)
add x5, x0, #(8 * \i)
add x6, sp, #(8 * \i * 16)
bl func_tr_16x4_firstpass
.endr
.irp i, 0, 1, 2, 3
add x5, sp, #(8 * \i)
add x6, x0, #(8 * \i * 16)
add x5, sp, #(8 * \i)
add x6, x0, #(8 * \i * 16)
bl func_tr_16x4_secondpass_\bitdepth
.endr
add sp, sp, #640
add sp, sp, #640
ret x15
endfunc
@ -644,10 +644,10 @@ endfunc
.endm
.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
sum_sub v24.4s, \in, \t0, \op0, \p
sum_sub v25.4s, \in, \t1, \op1, \p
sum_sub v26.4s, \in, \t2, \op2, \p
sum_sub v27.4s, \in, \t3, \op3, \p
sum_sub v24.4s, \in, \t0, \op0, \p
sum_sub v25.4s, \in, \t1, \op1, \p
sum_sub v26.4s, \in, \t2, \op2, \p
sum_sub v27.4s, \in, \t3, \op3, \p
.endm
.macro butterfly32 in0, in1, in2, in3, out
@ -841,85 +841,85 @@ idct_32x32 8
idct_32x32 10
.macro tr4_luma_shift r0, r1, r2, r3, shift
saddl v0.4s, \r0, \r2 // c0 = src0 + src2
saddl v1.4s, \r2, \r3 // c1 = src2 + src3
ssubl v2.4s, \r0, \r3 // c2 = src0 - src3
smull v3.4s, \r1, v21.4h // c3 = 74 * src1
saddl v0.4s, \r0, \r2 // c0 = src0 + src2
saddl v1.4s, \r2, \r3 // c1 = src2 + src3
ssubl v2.4s, \r0, \r3 // c2 = src0 - src3
smull v3.4s, \r1, v21.4h // c3 = 74 * src1
saddl v7.4s, \r0, \r3 // src0 + src3
ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3
mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3)
saddl v7.4s, \r0, \r3 // src0 + src3
ssubw v7.4s, v7.4s, \r2 // src0 - src2 + src3
mul v7.4s, v7.4s, v18.4s // dst2 = 74 * (src0 - src2 + src3)
mul v5.4s, v0.4s, v19.4s // 29 * c0
mul v6.4s, v1.4s, v20.4s // 55 * c1
add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1
add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3
mul v5.4s, v0.4s, v19.4s // 29 * c0
mul v6.4s, v1.4s, v20.4s // 55 * c1
add v5.4s, v5.4s, v6.4s // 29 * c0 + 55 * c1
add v5.4s, v5.4s, v3.4s // dst0 = 29 * c0 + 55 * c1 + c3
mul v1.4s, v1.4s, v19.4s // 29 * c1
mul v6.4s, v2.4s, v20.4s // 55 * c2
sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1
add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3
mul v1.4s, v1.4s, v19.4s // 29 * c1
mul v6.4s, v2.4s, v20.4s // 55 * c2
sub v6.4s, v6.4s, v1.4s // 55 * c2 - 29 * c1
add v6.4s, v6.4s, v3.4s // dst1 = 55 * c2 - 29 * c1 + c3
mul v0.4s, v0.4s, v20.4s // 55 * c0
mul v2.4s, v2.4s, v19.4s // 29 * c2
add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2
sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3
mul v0.4s, v0.4s, v20.4s // 55 * c0
mul v2.4s, v2.4s, v19.4s // 29 * c2
add v0.4s, v0.4s, v2.4s // 55 * c0 + 29 * c2
sub v0.4s, v0.4s, v3.4s // dst3 = 55 * c0 + 29 * c2 - c3
sqrshrn \r0, v5.4s, \shift
sqrshrn \r1, v6.4s, \shift
sqrshrn \r2, v7.4s, \shift
sqrshrn \r3, v0.4s, \shift
sqrshrn \r0, v5.4s, \shift
sqrshrn \r1, v6.4s, \shift
sqrshrn \r2, v7.4s, \shift
sqrshrn \r3, v0.4s, \shift
.endm
function ff_hevc_transform_luma_4x4_neon_8, export=1
ld1 {v28.4h-v31.4h}, [x0]
movi v18.4s, #74
movi v19.4s, #29
movi v20.4s, #55
movi v21.4h, #74
ld1 {v28.4h-v31.4h}, [x0]
movi v18.4s, #74
movi v19.4s, #29
movi v20.4s, #55
movi v21.4h, #74
tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #7
transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
tr4_luma_shift v28.4h, v29.4h, v30.4h, v31.4h, #12
transpose_4x4H v28, v29, v30, v31, v22, v23, v24, v25
st1 {v28.4h-v31.4h}, [x0]
st1 {v28.4h-v31.4h}, [x0]
ret
endfunc
// void ff_hevc_idct_NxN_dc_DEPTH_neon(int16_t *coeffs)
.macro idct_dc size, bitdepth
function ff_hevc_idct_\size\()x\size\()_dc_\bitdepth\()_neon, export=1
ld1r {v4.8h}, [x0]
srshr v4.8h, v4.8h, #1
srshr v0.8h, v4.8h, #(14 - \bitdepth)
srshr v1.8h, v4.8h, #(14 - \bitdepth)
ld1r {v4.8h}, [x0]
srshr v4.8h, v4.8h, #1
srshr v0.8h, v4.8h, #(14 - \bitdepth)
srshr v1.8h, v4.8h, #(14 - \bitdepth)
.if \size > 4
srshr v2.8h, v4.8h, #(14 - \bitdepth)
srshr v3.8h, v4.8h, #(14 - \bitdepth)
srshr v2.8h, v4.8h, #(14 - \bitdepth)
srshr v3.8h, v4.8h, #(14 - \bitdepth)
.if \size > 16 /* dc 32x32 */
mov x2, #4
mov x2, #4
1:
subs x2, x2, #1
subs x2, x2, #1
.endif
add x12, x0, #64
mov x13, #128
.if \size > 8 /* dc 16x16 */
st1 {v0.8h-v3.8h}, [x0], x13
st1 {v0.8h-v3.8h}, [x12], x13
st1 {v0.8h-v3.8h}, [x0], x13
st1 {v0.8h-v3.8h}, [x12], x13
st1 {v0.8h-v3.8h}, [x0], x13
st1 {v0.8h-v3.8h}, [x12], x13
st1 {v0.8h-v3.8h}, [x0], x13
st1 {v0.8h-v3.8h}, [x12], x13
st1 {v0.8h-v3.8h}, [x0], x13
st1 {v0.8h-v3.8h}, [x12], x13
st1 {v0.8h-v3.8h}, [x0], x13
st1 {v0.8h-v3.8h}, [x12], x13
.endif /* dc 8x8 */
st1 {v0.8h-v3.8h}, [x0], x13
st1 {v0.8h-v3.8h}, [x12], x13
st1 {v0.8h-v3.8h}, [x0], x13
st1 {v0.8h-v3.8h}, [x12], x13
.if \size > 16 /* dc 32x32 */
bne 1b
.endif
.else /* dc 4x4 */
st1 {v0.8h-v1.8h}, [x0]
st1 {v0.8h-v1.8h}, [x0]
.endif
ret
endfunc

View File

@ -840,19 +840,19 @@ function ff_hevc_put_hevc_qpel_uni_v16_8_neon, export=1
endfunc
function ff_hevc_put_hevc_qpel_uni_v24_8_neon, export=1
b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
b X(ff_hevc_put_hevc_qpel_uni_v12_8_neon)
endfunc
function ff_hevc_put_hevc_qpel_uni_v32_8_neon, export=1
b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
function ff_hevc_put_hevc_qpel_uni_v48_8_neon, export=1
b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
function ff_hevc_put_hevc_qpel_uni_v64_8_neon, export=1
b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
b X(ff_hevc_put_hevc_qpel_uni_v16_8_neon)
endfunc
function ff_hevc_put_hevc_pel_uni_w_pixels4_8_neon, export=1
@ -1560,21 +1560,21 @@ endfunc
#if HAVE_I8MM
.macro calc_all2
calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
calc v30, v31, v16, v18, v20, v22, v24, v26, v28, v30, v17, v19, v21, v23, v25, v27, v29, v31
b.eq 2f
calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
calc v16, v17, v18, v20, v22, v24, v26, v28, v30, v16, v19, v21, v23, v25, v27, v29, v31, v17
b.eq 2f
calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
calc v18, v19, v20, v22, v24, v26, v28, v30, v16, v18, v21, v23, v25, v27, v29, v31, v17, v19
b.eq 2f
calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
calc v20, v21, v22, v24, v26, v28, v30, v16, v18, v20, v23, v25, v27, v29, v31, v17, v19, v21
b.eq 2f
calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
calc v22, v23, v24, v26, v28, v30, v16, v18, v20, v22, v25, v27, v29, v31, v17, v19, v21, v23
b.eq 2f
calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
calc v24, v25, v26, v28, v30, v16, v18, v20, v22, v24, v27, v29, v31, v17, v19, v21, v23, v25
b.eq 2f
calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
calc v26, v27, v28, v30, v16, v18, v20, v22, v24, v26, v29, v31, v17, v19, v21, v23, v25, v27
b.eq 2f
calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
calc v28, v29, v30, v16, v18, v20, v22, v24, v26, v28, v31, v17, v19, v21, v23, v25, v27, v29
b.hi 1b
.endm

View File

@ -34,13 +34,13 @@ endconst
function ff_opus_deemphasis_neon, export=1
movrel x4, tab_st
ld1 {v4.4s}, [x4]
ld1 {v4.4s}, [x4]
movrel x4, tab_x0
ld1 {v5.4s}, [x4]
ld1 {v5.4s}, [x4]
movrel x4, tab_x1
ld1 {v6.4s}, [x4]
ld1 {v6.4s}, [x4]
movrel x4, tab_x2
ld1 {v7.4s}, [x4]
ld1 {v7.4s}, [x4]
fmul v0.4s, v4.4s, v0.s[0]

View File

@ -330,32 +330,32 @@ endfunc
// v17: hev
// convert to signed value:
eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80
eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80
eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80
movi v20.8h, #3
ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)
mul v19.8h, v19.8h, v20.8h
movi v20.8h, #3
ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0
ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit)
eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80
eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80
mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0)
mul v19.8h, v19.8h, v20.8h
sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
movi v22.16b, #4
movi v23.16b, #3
sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1)
movi v22.16b, #4
movi v23.16b, #3
.if \inner
and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1)
.endif
saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
saddw2 v19.8h, v19.8h, v20.16b
sqxtn v18.8b, v18.8h // narrow result back into v18
sqxtn2 v18.16b, v19.8h
saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1)
saddw2 v19.8h, v19.8h, v20.16b
sqxtn v18.8b, v18.8h // narrow result back into v18
sqxtn2 v18.16b, v19.8h
.if !\inner && !\simple
eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80
eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80
.endif
and v18.16b, v18.16b, v16.16b // w &= normal_limit
and v18.16b, v18.16b, v16.16b // w &= normal_limit
// registers used at this point..
// v0 -> P3 (don't corrupt)
@ -375,44 +375,44 @@ endfunc
// P0 = s2u(PS0 + c2);
.if \simple
sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
sshr v19.16b, v19.16b, #3 // c1 >>= 3
sshr v20.16b, v20.16b, #3 // c2 >>= 3
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
sshr v19.16b, v19.16b, #3 // c1 >>= 3
sshr v20.16b, v20.16b, #3 // c2 >>= 3
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
.elseif \inner
// the !is4tap case of filter_common, only used for inner blocks
// c3 = ((c1&~hev) + 1) >> 1;
// Q1 = s2u(QS1 - c3);
// P1 = s2u(PS1 + c3);
sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
sshr v19.16b, v19.16b, #3 // c1 >>= 3
sshr v20.16b, v20.16b, #3 // c2 >>= 3
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
bic v19.16b, v19.16b, v17.16b // c1 & ~hev
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
srshr v19.16b, v19.16b, #1 // c3 >>= 1
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4)
sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3)
sshr v19.16b, v19.16b, #3 // c1 >>= 3
sshr v20.16b, v20.16b, #3 // c2 >>= 3
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
bic v19.16b, v19.16b, v17.16b // c1 & ~hev
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
srshr v19.16b, v19.16b, #1 // c3 >>= 1
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3)
sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3)
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
.else
and v20.16b, v18.16b, v17.16b // w & hev
sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
sshr v19.16b, v19.16b, #3 // c1 >>= 3
sshr v20.16b, v20.16b, #3 // c2 >>= 3
bic v18.16b, v18.16b, v17.16b // w &= ~hev
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
and v20.16b, v18.16b, v17.16b // w & hev
sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4)
sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3)
sshr v19.16b, v19.16b, #3 // c1 >>= 3
sshr v20.16b, v20.16b, #3 // c2 >>= 3
bic v18.16b, v18.16b, v17.16b // w &= ~hev
sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1)
sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2)
// filter_mbedge:
// a = clamp((27*w + 63) >> 7);
@ -424,35 +424,35 @@ endfunc
// a = clamp((9*w + 63) >> 7);
// Q2 = s2u(QS2 - a);
// P2 = s2u(PS2 + a);
movi v17.8h, #63
sshll v22.8h, v18.8b, #3
sshll2 v23.8h, v18.16b, #3
saddw v22.8h, v22.8h, v18.8b
saddw2 v23.8h, v23.8h, v18.16b
add v16.8h, v17.8h, v22.8h
add v17.8h, v17.8h, v23.8h // 9*w + 63
add v19.8h, v16.8h, v22.8h
add v20.8h, v17.8h, v23.8h // 18*w + 63
add v22.8h, v19.8h, v22.8h
add v23.8h, v20.8h, v23.8h // 27*w + 63
sqshrn v16.8b, v16.8h, #7
sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
sqshrn v19.8b, v19.8h, #7
sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
sqshrn v22.8b, v22.8h, #7
sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
movi v17.8h, #63
sshll v22.8h, v18.8b, #3
sshll2 v23.8h, v18.16b, #3
saddw v22.8h, v22.8h, v18.8b
saddw2 v23.8h, v23.8h, v18.16b
add v16.8h, v17.8h, v22.8h
add v17.8h, v17.8h, v23.8h // 9*w + 63
add v19.8h, v16.8h, v22.8h
add v20.8h, v17.8h, v23.8h // 18*w + 63
add v22.8h, v19.8h, v22.8h
add v23.8h, v20.8h, v23.8h // 27*w + 63
sqshrn v16.8b, v16.8h, #7
sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7)
sqshrn v19.8b, v19.8h, #7
sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7)
sqshrn v22.8b, v22.8h, #7
sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7)
sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a)
sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a)
sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a)
sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a)
sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a)
sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a)
eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80
eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80
eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80
eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80
eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 0x80
eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80
.endif
.endm
@ -507,48 +507,48 @@ function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
sub x0, x0, x2, lsl #2
sub x1, x1, x2, lsl #2
// Load pixels:
ld1 {v0.d}[0], [x0], x2 // P3
ld1 {v0.d}[1], [x1], x2 // P3
ld1 {v1.d}[0], [x0], x2 // P2
ld1 {v1.d}[1], [x1], x2 // P2
ld1 {v2.d}[0], [x0], x2 // P1
ld1 {v2.d}[1], [x1], x2 // P1
ld1 {v3.d}[0], [x0], x2 // P0
ld1 {v3.d}[1], [x1], x2 // P0
ld1 {v4.d}[0], [x0], x2 // Q0
ld1 {v4.d}[1], [x1], x2 // Q0
ld1 {v5.d}[0], [x0], x2 // Q1
ld1 {v5.d}[1], [x1], x2 // Q1
ld1 {v6.d}[0], [x0], x2 // Q2
ld1 {v6.d}[1], [x1], x2 // Q2
ld1 {v7.d}[0], [x0] // Q3
ld1 {v7.d}[1], [x1] // Q3
ld1 {v0.d}[0], [x0], x2 // P3
ld1 {v0.d}[1], [x1], x2 // P3
ld1 {v1.d}[0], [x0], x2 // P2
ld1 {v1.d}[1], [x1], x2 // P2
ld1 {v2.d}[0], [x0], x2 // P1
ld1 {v2.d}[1], [x1], x2 // P1
ld1 {v3.d}[0], [x0], x2 // P0
ld1 {v3.d}[1], [x1], x2 // P0
ld1 {v4.d}[0], [x0], x2 // Q0
ld1 {v4.d}[1], [x1], x2 // Q0
ld1 {v5.d}[0], [x0], x2 // Q1
ld1 {v5.d}[1], [x1], x2 // Q1
ld1 {v6.d}[0], [x0], x2 // Q2
ld1 {v6.d}[1], [x1], x2 // Q2
ld1 {v7.d}[0], [x0] // Q3
ld1 {v7.d}[1], [x1] // Q3
dup v22.16b, w3 // flim_E
dup v23.16b, w4 // flim_I
dup v22.16b, w3 // flim_E
dup v23.16b, w4 // flim_I
vp8_loop_filter inner=\inner, hev_thresh=w5
// back up to P2: u,v -= stride * 6
sub x0, x0, x2, lsl #2
sub x1, x1, x2, lsl #2
sub x0, x0, x2, lsl #1
sub x1, x1, x2, lsl #1
sub x0, x0, x2, lsl #2
sub x1, x1, x2, lsl #2
sub x0, x0, x2, lsl #1
sub x1, x1, x2, lsl #1
// Store pixels:
st1 {v1.d}[0], [x0], x2 // P2
st1 {v1.d}[1], [x1], x2 // P2
st1 {v2.d}[0], [x0], x2 // P1
st1 {v2.d}[1], [x1], x2 // P1
st1 {v3.d}[0], [x0], x2 // P0
st1 {v3.d}[1], [x1], x2 // P0
st1 {v4.d}[0], [x0], x2 // Q0
st1 {v4.d}[1], [x1], x2 // Q0
st1 {v5.d}[0], [x0], x2 // Q1
st1 {v5.d}[1], [x1], x2 // Q1
st1 {v6.d}[0], [x0] // Q2
st1 {v6.d}[1], [x1] // Q2
st1 {v1.d}[0], [x0], x2 // P2
st1 {v1.d}[1], [x1], x2 // P2
st1 {v2.d}[0], [x0], x2 // P1
st1 {v2.d}[1], [x1], x2 // P1
st1 {v3.d}[0], [x0], x2 // P0
st1 {v3.d}[1], [x1], x2 // P0
st1 {v4.d}[0], [x0], x2 // Q0
st1 {v4.d}[1], [x1], x2 // Q0
st1 {v5.d}[0], [x0], x2 // Q1
st1 {v5.d}[1], [x1], x2 // Q1
st1 {v6.d}[0], [x0] // Q2
st1 {v6.d}[1], [x1] // Q2
ret
endfunc
@ -579,7 +579,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
ld1 {v6.d}[1], [x0], x1
ld1 {v7.d}[1], [x0], x1
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
dup v22.16b, w2 // flim_E
.if !\simple
@ -590,7 +590,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1
sub x0, x0, x1, lsl #4 // backup 16 rows
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
// Store pixels:
st1 {v0.d}[0], [x0], x1
@ -624,24 +624,24 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
sub x1, x1, #4
// Load pixels:
ld1 {v0.d}[0], [x0], x2 // load u
ld1 {v0.d}[1], [x1], x2 // load v
ld1 {v1.d}[0], [x0], x2
ld1 {v1.d}[1], [x1], x2
ld1 {v2.d}[0], [x0], x2
ld1 {v2.d}[1], [x1], x2
ld1 {v3.d}[0], [x0], x2
ld1 {v3.d}[1], [x1], x2
ld1 {v4.d}[0], [x0], x2
ld1 {v4.d}[1], [x1], x2
ld1 {v5.d}[0], [x0], x2
ld1 {v5.d}[1], [x1], x2
ld1 {v6.d}[0], [x0], x2
ld1 {v6.d}[1], [x1], x2
ld1 {v7.d}[0], [x0], x2
ld1 {v7.d}[1], [x1], x2
ld1 {v0.d}[0], [x0], x2 // load u
ld1 {v0.d}[1], [x1], x2 // load v
ld1 {v1.d}[0], [x0], x2
ld1 {v1.d}[1], [x1], x2
ld1 {v2.d}[0], [x0], x2
ld1 {v2.d}[1], [x1], x2
ld1 {v3.d}[0], [x0], x2
ld1 {v3.d}[1], [x1], x2
ld1 {v4.d}[0], [x0], x2
ld1 {v4.d}[1], [x1], x2
ld1 {v5.d}[0], [x0], x2
ld1 {v5.d}[1], [x1], x2
ld1 {v6.d}[0], [x0], x2
ld1 {v6.d}[1], [x1], x2
ld1 {v7.d}[0], [x0], x2
ld1 {v7.d}[1], [x1], x2
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
dup v22.16b, w3 // flim_E
dup v23.16b, w4 // flim_I
@ -651,25 +651,25 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
sub x0, x0, x2, lsl #3 // backup u 8 rows
sub x1, x1, x2, lsl #3 // backup v 8 rows
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
// Store pixels:
st1 {v0.d}[0], [x0], x2 // load u
st1 {v0.d}[1], [x1], x2 // load v
st1 {v1.d}[0], [x0], x2
st1 {v1.d}[1], [x1], x2
st1 {v2.d}[0], [x0], x2
st1 {v2.d}[1], [x1], x2
st1 {v3.d}[0], [x0], x2
st1 {v3.d}[1], [x1], x2
st1 {v4.d}[0], [x0], x2
st1 {v4.d}[1], [x1], x2
st1 {v5.d}[0], [x0], x2
st1 {v5.d}[1], [x1], x2
st1 {v6.d}[0], [x0], x2
st1 {v6.d}[1], [x1], x2
st1 {v7.d}[0], [x0]
st1 {v7.d}[1], [x1]
st1 {v0.d}[0], [x0], x2 // load u
st1 {v0.d}[1], [x1], x2 // load v
st1 {v1.d}[0], [x0], x2
st1 {v1.d}[1], [x1], x2
st1 {v2.d}[0], [x0], x2
st1 {v2.d}[1], [x1], x2
st1 {v3.d}[0], [x0], x2
st1 {v3.d}[1], [x1], x2
st1 {v4.d}[0], [x0], x2
st1 {v4.d}[1], [x1], x2
st1 {v5.d}[0], [x0], x2
st1 {v5.d}[1], [x1], x2
st1 {v6.d}[0], [x0], x2
st1 {v6.d}[1], [x1], x2
st1 {v7.d}[0], [x0]
st1 {v7.d}[1], [x1]
ret

View File

@ -729,9 +729,9 @@ FFT16_FN ns_float, 1
.endm
.macro SR_COMBINE_4 len, part, off
add x10, x1, x21
add x11, x1, x21, lsl #1
add x12, x1, x22
add x10, x1, x21
add x11, x1, x21, lsl #1
add x12, x1, x22
ldp q0, q1, [x1, #((0 + \part)*32 + \off)]
ldp q4, q5, [x1, #((2 + \part)*32 + \off)]
@ -759,9 +759,9 @@ FFT16_FN ns_float, 1
.endm
.macro SR_COMBINE_FULL len, off=0
add x10, x1, x21
add x11, x1, x21, lsl #1
add x12, x1, x22
add x10, x1, x21
add x11, x1, x21, lsl #1
add x12, x1, x22
SR_COMBINE_4 \len, 0, \off
SR_COMBINE_4 \len, 1, \off