From dae95b3ffd62ed86cd2e3798c2f281aa67969eca Mon Sep 17 00:00:00 2001
From: Paul B Mahol
Date: Wed, 2 Mar 2022 22:30:40 +0100
Subject: [PATCH] avfilter/vf_maskedmerge: fix rounding when masking

Blend as (base * (max - mask) + mask * overlay + half) / max, with
max = (1 << depth) - 1 and half = max / 2, instead of
base + ((mask * (overlay - base) + half) >> depth).

MaskedMergeContext gains a max field, which is passed to the
maskedmerge callback in place of depth.

In the C template each operand is widened to ctype before it is
multiplied. uint16_t operands promote to int, so widening only the
finished product would let 65535 * 65535 overflow a signed int.

The x86 SSE2 8-bit kernel uses 255/127 as constants and divides by 255
with pmulhuw by 32897 followed by psrlw 7, i.e. (x * 32897) >> 23.
---
NOTE(review): indentation and column alignment of the hunks below were
reconstructed; if a hunk is rejected, retry with
`git apply --ignore-whitespace`.

 libavfilter/maskedmerge.h          |  2 +-
 libavfilter/vf_maskedmerge.c       | 20 ++++++++++++--------
 libavfilter/x86/vf_maskedmerge.asm | 17 ++++++++++-------
 3 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/libavfilter/maskedmerge.h b/libavfilter/maskedmerge.h
index 8e2b1cf676..c1cf8027e4 100644
--- a/libavfilter/maskedmerge.h
+++ b/libavfilter/maskedmerge.h
@@ -30,7 +30,7 @@ typedef struct MaskedMergeContext {
     int linesize[4];
     int nb_planes;
     int planes;
-    int half, depth;
+    int half, depth, max;
     FFFrameSync fs;
 
     void (*maskedmerge)(const uint8_t *bsrc, const uint8_t *osrc,
diff --git a/libavfilter/vf_maskedmerge.c b/libavfilter/vf_maskedmerge.c
index 11492af61f..db0c516938 100644
--- a/libavfilter/vf_maskedmerge.c
+++ b/libavfilter/vf_maskedmerge.c
@@ -96,7 +96,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
                        base->linesize[p], overlay->linesize[p],
                        mask->linesize[p], out->linesize[p],
                        s->width[p], slice_end - slice_start,
-                       s->half, s->depth);
+                       s->half, s->max);
     }
 
     return 0;
@@ -138,13 +138,13 @@ static int process_frame(FFFrameSync *fs)
     return ff_filter_frame(outlink, out);
 }
 
-#define MASKEDMERGE(n, type, half, shift)                                      \
+#define MASKEDMERGE(n, type, ctype, half, max, div)                            \
 static void maskedmerge##n(const uint8_t *bbsrc, const uint8_t *oosrc,         \
                            const uint8_t *mmsrc, uint8_t *ddst,                \
                            ptrdiff_t blinesize, ptrdiff_t olinesize,           \
                            ptrdiff_t mlinesize, ptrdiff_t dlinesize,           \
                            int w, int h,                                       \
-                           int hhalf, int sshift)                              \
+                           int hhalf, int mmax)                                \
 {                                                                              \
     const type *bsrc = (const type *)bbsrc;                                    \
     const type *osrc = (const type *)oosrc;                                    \
@@ -158,7 +158,10 @@ static void maskedmerge##n(const uint8_t *bbsrc, const uint8_t *oosrc, \
                                                                                \
     for (int y = 0; y < h; y++) {                                              \
         for (int x = 0; x < w; x++) {                                          \
-            dst[x] = bsrc[x] + ((msrc[x] * (osrc[x] - bsrc[x]) + half) shift); \
+            const type invm = max - msrc[x];                                   \
+            const ctype r = ((ctype)bsrc[x] * invm +                           \
+                             ((ctype)msrc[x] * osrc[x] + half)) div;           \
+            dst[x] = r;                                                        \
         }                                                                      \
                                                                                \
         dst  += dlinesize;                                                     \
@@ -168,9 +171,9 @@ static void maskedmerge##n(const uint8_t *bbsrc, const uint8_t *oosrc, \
     }                                                                          \
 }
 
-MASKEDMERGE(8, uint8_t, 128, >> 8)
-MASKEDMERGE(16, uint16_t, hhalf, >> sshift)
-MASKEDMERGE(32, float, 0.f, + 0.f)
+MASKEDMERGE(8, uint8_t, uint16_t, 127, 255, / 255)
+MASKEDMERGE(16, uint16_t, uint32_t, hhalf, mmax, / mmax)
+MASKEDMERGE(32, float, float, 0.f, 1.f, + 0.f)
 
 static int config_input(AVFilterLink *inlink)
 {
@@ -189,7 +192,8 @@ static int config_input(AVFilterLink *inlink)
     s->width[0]  = s->width[3]  = inlink->w;
 
     s->depth = desc->comp[0].depth;
-    s->half = (1 << s->depth) / 2;
+    s->max = (1 << s->depth) - 1;
+    s->half = s->max / 2;
 
     if (s->depth == 8)
         s->maskedmerge = maskedmerge8;
diff --git a/libavfilter/x86/vf_maskedmerge.asm b/libavfilter/x86/vf_maskedmerge.asm
index 7e61935b97..1028299087 100644
--- a/libavfilter/x86/vf_maskedmerge.asm
+++ b/libavfilter/x86/vf_maskedmerge.asm
@@ -24,26 +24,28 @@
 
 SECTION_RODATA
 
-pw_128: times 8 dw 128
-pw_256: times 8 dw 256
+pw_127: times 8 dw 127
+pw_255: times 8 dw 255
+pw_32897: times 8 dw 32897
 
 SECTION .text
 
 INIT_XMM sse2
 %if ARCH_X86_64
-cglobal maskedmerge8, 8, 11, 7, bsrc, osrc, msrc, dst, blinesize, olinesize, mlinesize, dlinesize, w, h, x
+cglobal maskedmerge8, 8, 11, 8, bsrc, osrc, msrc, dst, blinesize, olinesize, mlinesize, dlinesize, w, h, x
     mov         wd, dword wm
     mov         hd, dword hm
 %else
-cglobal maskedmerge8, 5, 7, 7, bsrc, osrc, msrc, dst, blinesize, w, x
+cglobal maskedmerge8, 5, 7, 8, bsrc, osrc, msrc, dst, blinesize, w, x
     mov         wd, r8m
 %define olinesizeq r5mp
 %define mlinesizeq r6mp
 %define dlinesizeq r7mp
 %define hd r9mp
 %endif
-    mova        m4, [pw_256]
-    mova        m5, [pw_128]
+    mova        m4, [pw_255]
+    mova        m5, [pw_127]
+    mova        m7, [pw_32897]
     pxor        m6, m6
     add         bsrcq, wq
     add         osrcq, wq
@@ -66,7 +68,8 @@ cglobal maskedmerge8, 5, 7, 7, bsrc, osrc, msrc, dst, blinesize, w, x
         pmullw          m1, m3
         paddw           m1, m2
         paddw           m1, m5
-        psrlw           m1, 8
+        pmulhuw         m1, m7
+        psrlw           m1, 7
         packuswb        m1, m1
         movh    [dstq + xq], m1
         add             xq, mmsize / 2