From dae95b3ffd62ed86cd2e3798c2f281aa67969eca Mon Sep 17 00:00:00 2001
From: Paul B Mahol <onemda@gmail.com>
Date: Wed, 2 Mar 2022 22:30:40 +0100
Subject: [PATCH] avfilter/vf_maskedmerge: fix rounding when masking

---
 libavfilter/maskedmerge.h          |  2 +-
 libavfilter/vf_maskedmerge.c       | 20 ++++++++++++--------
 libavfilter/x86/vf_maskedmerge.asm | 17 ++++++++++-------
 3 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/libavfilter/maskedmerge.h b/libavfilter/maskedmerge.h
index 8e2b1cf676..c1cf8027e4 100644
--- a/libavfilter/maskedmerge.h
+++ b/libavfilter/maskedmerge.h
@@ -30,7 +30,7 @@ typedef struct MaskedMergeContext {
     int linesize[4];
     int nb_planes;
     int planes;
-    int half, depth;
+    int half, depth, max;
     FFFrameSync fs;
 
     void (*maskedmerge)(const uint8_t *bsrc, const uint8_t *osrc,
diff --git a/libavfilter/vf_maskedmerge.c b/libavfilter/vf_maskedmerge.c
index 11492af61f..db0c516938 100644
--- a/libavfilter/vf_maskedmerge.c
+++ b/libavfilter/vf_maskedmerge.c
@@ -96,7 +96,7 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
                        base->linesize[p], overlay->linesize[p],
                        mask->linesize[p], out->linesize[p],
                        s->width[p], slice_end - slice_start,
-                       s->half, s->depth);
+                       s->half, s->max);
     }
 
     return 0;
@@ -138,13 +138,13 @@ static int process_frame(FFFrameSync *fs)
     return ff_filter_frame(outlink, out);
 }
 
-#define MASKEDMERGE(n, type, half, shift)                              \
+#define MASKEDMERGE(n, type, ctype, half, max, div)                    \
 static void maskedmerge##n(const uint8_t *bbsrc, const uint8_t *oosrc, \
                            const uint8_t *mmsrc, uint8_t *ddst,        \
                            ptrdiff_t blinesize, ptrdiff_t olinesize,   \
                            ptrdiff_t mlinesize, ptrdiff_t dlinesize,   \
                            int w, int h,                               \
-                           int hhalf, int sshift)                      \
+                           int hhalf, int mmax)                        \
 {                                                                      \
     const type *bsrc = (const type *)bbsrc;                            \
     const type *osrc = (const type *)oosrc;                            \
@@ -158,7 +158,10 @@ static void maskedmerge##n(const uint8_t *bbsrc, const uint8_t *oosrc, \
                                                                        \
     for (int y = 0; y < h; y++) {                                      \
         for (int x = 0; x < w; x++) {                                  \
-            dst[x] = bsrc[x] + ((msrc[x] * (osrc[x] - bsrc[x]) + half) shift); \
+            const type invm = max - msrc[x];                           \
+            /* widen first: uint16_t * uint16_t overflows int */       \
+            dst[x] = ((ctype)bsrc[x] * invm +                          \
+                      ((ctype)msrc[x] * osrc[x] + half)) div;          \
         }                                                              \
                                                                        \
         dst  += dlinesize;                                             \
@@ -168,9 +171,9 @@ static void maskedmerge##n(const uint8_t *bbsrc, const uint8_t *oosrc, \
     }                                                                  \
 }
 
-MASKEDMERGE(8,  uint8_t, 128, >> 8)
-MASKEDMERGE(16, uint16_t, hhalf, >> sshift)
-MASKEDMERGE(32, float, 0.f, + 0.f)
+MASKEDMERGE(8,  uint8_t,  uint16_t,   127, 255,  / 255)
+MASKEDMERGE(16, uint16_t, uint32_t, hhalf, mmax, / mmax)
+MASKEDMERGE(32, float,    float,      0.f, 1.f,  + 0.f)
 
 static int config_input(AVFilterLink *inlink)
 {
@@ -189,7 +192,8 @@ static int config_input(AVFilterLink *inlink)
     s->width[0]  = s->width[3]  = inlink->w;
 
     s->depth = desc->comp[0].depth;
-    s->half = (1 << s->depth) / 2;
+    s->max  = (1 << s->depth) - 1;
+    s->half = s->max / 2;
 
     if (s->depth == 8)
         s->maskedmerge = maskedmerge8;
diff --git a/libavfilter/x86/vf_maskedmerge.asm b/libavfilter/x86/vf_maskedmerge.asm
index 7e61935b97..1028299087 100644
--- a/libavfilter/x86/vf_maskedmerge.asm
+++ b/libavfilter/x86/vf_maskedmerge.asm
@@ -24,26 +24,28 @@
 
 SECTION_RODATA
 
-pw_128: times 8 dw 128
-pw_256: times 8 dw 256
+pw_127: times 8 dw 127
+pw_255: times 8 dw 255
+pw_32897: times 8 dw 32897
 
 SECTION .text
 
 INIT_XMM sse2
 %if ARCH_X86_64
-cglobal maskedmerge8, 8, 11, 7, bsrc, osrc, msrc, dst, blinesize, olinesize, mlinesize, dlinesize, w, h, x
+cglobal maskedmerge8, 8, 11, 8, bsrc, osrc, msrc, dst, blinesize, olinesize, mlinesize, dlinesize, w, h, x
     mov         wd, dword wm
     mov         hd, dword hm
 %else
-cglobal maskedmerge8, 5, 7, 7, bsrc, osrc, msrc, dst, blinesize, w, x
+cglobal maskedmerge8, 5, 7, 8, bsrc, osrc, msrc, dst, blinesize, w, x
     mov         wd, r8m
 %define olinesizeq r5mp
 %define mlinesizeq r6mp
 %define dlinesizeq r7mp
 %define hd r9mp
 %endif
-    mova        m4, [pw_256]
-    mova        m5, [pw_128]
+    mova        m4, [pw_255]
+    mova        m5, [pw_127]
+    mova        m7, [pw_32897]
     pxor        m6, m6
     add      bsrcq, wq
     add      osrcq, wq
@@ -66,7 +68,8 @@ cglobal maskedmerge8, 5, 7, 7, bsrc, osrc, msrc, dst, blinesize, w, x
         pmullw          m1, m3
         paddw           m1, m2
         paddw           m1, m5
-        psrlw           m1, 8
+        pmulhuw         m1, m7
+        psrlw           m1, 7
         packuswb        m1, m1
         movh   [dstq + xq], m1
         add             xq, mmsize / 2