rendersub: move the blend functions to a separate file

Make it possible to easily swap the blend functions in the future.
2024-11-02 12:54:04 +01:00 · 2024-11-02 12:54:04 +01:00 · 8c032a5863
commit 8c032a5863
parent 7b2076a43f
6 changed files with 1036 additions and 503 deletions
--- a/libhb/blend.c
+++ b/libhb/blend.c
@ -0,0 +1,841 @@
+/* blend.c
+
+   Copyright (c) 2003-2024 HandBrake Team
+   This file is part of the HandBrake source code
+   Homepage: <http://handbrake.fr/>.
+   It may be used under the terms of the GNU General Public License v2.
+   For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+ */
+
+#include "handbrake/handbrake.h"
+#include "libavutil/bswap.h"
+
+// Private state of the blend object, filled in by hb_blend_init().
+struct hb_blend_private_s
+{
+    int hshift; // log2 of the vertical chroma subsampling of the input picture
+    int wshift; // log2 of the horizontal chroma subsampling of the input picture
+    int depth;  // bit depth of the first component of the input picture
+
+    // Chroma smoothing weights: row [0] is indexed by the horizontal and
+    // row [1] by the vertical position of a sample inside its chroma cell
+    unsigned chroma_coeffs[2][4];
+
+    // Blend routine selected for the input/overlay format pair.
+    // It is called with shift = depth - 8.
+    void (*blend)(const struct hb_blend_private_s *pv, hb_buffer_t *dst,
+                  const hb_buffer_t *src, const int shift);
+};
+
+// Forward declarations of the blend object callbacks defined further down
+// in this file.
+
+// Allocates the private state and selects the blend routine.
+static int hb_blend_init(hb_blend_object_t *object,
+                         int in_pix_fmt,
+                         int in_chroma_location,
+                         int overlay_pix_fmt);
+
+// Blends every overlay of the list into the input picture.
+static hb_buffer_t * hb_blend_work(hb_blend_object_t *object,
+                                   hb_buffer_t *in,
+                                   hb_buffer_list_t *overlays);
+
+// Releases the private state.
+static void hb_blend_close(hb_blend_object_t *object);
+
+// Blend object instance: binds the callbacks above to the generic
+// hb_blend_object_t interface.
+hb_blend_object_t hb_blend =
+{
+    .name  = "Blend",
+    .init  = hb_blend_init,
+    .work  = hb_blend_work,
+    .close = hb_blend_close,
+};
+
+// Blends an 8-bit YUVA overlay into a three-plane picture stored in 16-bit
+// samples, for the case where the overlay chroma planes are not subsampled
+// like the picture (the overlay chroma is read at luma resolution).
+//
+// Luma is blended sample by sample. For every chroma sample of the picture,
+// each overlay sample of the matching chroma cell is blended against the
+// current picture chroma, and the results are averaged using the
+// pv->chroma_coeffs weights. Overlay values are scaled up by `shift` bits.
+static void blend_subsample_8on1x(const hb_blend_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
+{
+    const unsigned max_val  = (256 << shift) - 1;
+    const unsigned rounding = max_val >> 1;
+
+    // Size of one chroma cell in luma samples, and the matching alignment masks
+    const int cell_w = 1 << pv->wshift;
+    const int cell_h = 1 << pv->hshift;
+    const int mask_x = cell_w - 1;
+    const int mask_y = cell_h - 1;
+
+    // Position of the overlay inside the picture
+    const int left = src->f.x;
+    const int top  = src->f.y;
+
+    // Picture coordinates of the first chroma cell touched by the overlay
+    const int start_x = left & ~mask_x;
+    const int start_y = top  & ~mask_y;
+
+    // The overlay position cancels out of the clipping test, which leaves
+    // the smaller of the overlay and picture dimensions.
+    // NOTE(review): presumably the caller keeps the overlay inside the picture - confirm
+    const int width  = (src->f.width  <= dst->f.width)  ? src->f.width  : dst->f.width;
+    const int height = (src->f.height <= dst->f.height) ? src->f.height : dst->f.height;
+
+    for (int yy = start_y; yy - top < height; yy++)
+    {
+        // Overlay row for picture row yy. It is negative on rows above the
+        // overlay: the overlay row pointers then start before the plane and
+        // are only read at offsets that land on a row >= 0.
+        const int oy = yy - top;
+        const int chroma_line = (yy & mask_y) == 0;
+
+        uint16_t *y_dst = (uint16_t*)(dst->plane[0].data + yy * dst->plane[0].stride);
+        uint16_t *u_dst = (uint16_t*)(dst->plane[1].data + (yy >> pv->hshift) * dst->plane[1].stride);
+        uint16_t *v_dst = (uint16_t*)(dst->plane[2].data + (yy >> pv->hshift) * dst->plane[2].stride);
+
+        const uint8_t *y_src = src->plane[0].data + oy * src->plane[0].stride;
+        const uint8_t *u_src = src->plane[1].data + oy * src->plane[1].stride;
+        const uint8_t *v_src = src->plane[2].data + oy * src->plane[2].stride;
+        const uint8_t *a_src = src->plane[3].data + oy * src->plane[3].stride;
+
+        for (int xx = start_x; xx - left < width; xx++)
+        {
+            const int ox = xx - left;
+
+            // Luma: only where the picture sample is covered by the overlay
+            if (ox >= 0 && oy >= 0)
+            {
+                const unsigned a = a_src[ox] << shift;
+                y_dst[xx] = ((uint32_t)y_dst[xx] * (max_val - a) + ((uint32_t)y_src[ox] << shift) * a + rounding) / max_val;
+            }
+
+            // Chroma is handled once per cell, on its top-left sample
+            if (!chroma_line || (xx & mask_x) != 0)
+            {
+                continue;
+            }
+
+            // Chroma location aware subsampling and blending
+            const int cx = xx >> pv->wshift;
+            unsigned sum_u = 0, sum_v = 0, sum_w = 0;
+            for (int yz = 0; yz < cell_h && oy + yz < height; yz++)
+            {
+                for (int xz = 0; xz < cell_w && ox + xz < width; xz++)
+                {
+                    const int sx = ox + xz;
+                    const int sy = oy + yz;
+
+                    // Weight of this sample within the chroma cell
+                    const unsigned weight = pv->chroma_coeffs[0][xz] * pv->chroma_coeffs[1][yz];
+                    unsigned u = u_dst[cx];
+                    unsigned v = v_dst[cx];
+
+                    // Blend only where the cell overlaps the overlay bitmap
+                    if (sx >= 0 && sy >= 0 && sx < width && sy < height)
+                    {
+                        const unsigned a = (uint32_t)a_src[sx + yz * src->plane[3].stride] << shift;
+                        u = (u * (max_val - a) + ((uint32_t)u_src[yz * src->plane[1].stride + sx] << shift) * a + rounding) / max_val;
+                        v = (v * (max_val - a) + ((uint32_t)v_src[yz * src->plane[2].stride + sx] << shift) * a + rounding) / max_val;
+                    }
+
+                    sum_u += weight * u;
+                    sum_v += weight * v;
+                    sum_w += weight;
+                }
+            }
+
+            // Weighted average, rounded to nearest
+            if (sum_w)
+            {
+                u_dst[cx] = (sum_u + (sum_w >> 1)) / sum_w;
+                v_dst[cx] = (sum_v + (sum_w >> 1)) / sum_w;
+            }
+        }
+    }
+}
+
+// Blends an 8-bit YUVA overlay into a two-plane picture (luma plane plus
+// interleaved U/V plane) stored in 16-bit samples, for the case where the
+// overlay chroma planes are not subsampled like the picture.
+//
+// Luma is blended sample by sample. For every chroma pair of the picture,
+// each overlay sample of the matching chroma cell is blended against the
+// current picture chroma, and the results are averaged using the
+// pv->chroma_coeffs weights.
+// Overlay colour values go through av_bswap16(), which moves an 8-bit value
+// into the upper byte of the 16-bit sample, while alpha is scaled by `shift`.
+static void blend_subsample_8onbi1x(const hb_blend_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
+{
+    const unsigned max_val  = (256 << shift) - 1;
+    const unsigned rounding = max_val >> 1;
+
+    // Size of one chroma cell in luma samples, and the matching alignment masks
+    const int cell_w = 1 << pv->wshift;
+    const int cell_h = 1 << pv->hshift;
+    const int mask_x = cell_w - 1;
+    const int mask_y = cell_h - 1;
+
+    // Position of the overlay inside the picture
+    const int left = src->f.x;
+    const int top  = src->f.y;
+
+    // Picture coordinates of the first chroma cell touched by the overlay
+    const int start_x = left & ~mask_x;
+    const int start_y = top  & ~mask_y;
+
+    // The overlay position cancels out of the clipping test, which leaves
+    // the smaller of the overlay and picture dimensions.
+    // NOTE(review): presumably the caller keeps the overlay inside the picture - confirm
+    const int width  = (src->f.width  <= dst->f.width)  ? src->f.width  : dst->f.width;
+    const int height = (src->f.height <= dst->f.height) ? src->f.height : dst->f.height;
+
+    for (int yy = start_y; yy - top < height; yy++)
+    {
+        // Overlay row for picture row yy. It is negative on rows above the
+        // overlay: the overlay row pointers then start before the plane and
+        // are only read at offsets that land on a row >= 0.
+        const int oy = yy - top;
+        const int chroma_line = (yy & mask_y) == 0;
+
+        uint16_t *y_dst  = (uint16_t*)(dst->plane[0].data + yy * dst->plane[0].stride);
+        // U and V share plane 1: U at even and V at odd sample positions
+        uint16_t *uv_dst = (uint16_t*)(dst->plane[1].data + (yy >> pv->hshift) * dst->plane[1].stride);
+
+        const uint8_t *y_src = src->plane[0].data + oy * src->plane[0].stride;
+        const uint8_t *u_src = src->plane[1].data + oy * src->plane[1].stride;
+        const uint8_t *v_src = src->plane[2].data + oy * src->plane[2].stride;
+        const uint8_t *a_src = src->plane[3].data + oy * src->plane[3].stride;
+
+        for (int xx = start_x; xx - left < width; xx++)
+        {
+            const int ox = xx - left;
+
+            // Luma: only where the picture sample is covered by the overlay
+            if (ox >= 0 && oy >= 0)
+            {
+                const unsigned a = a_src[ox] << shift;
+                y_dst[xx] = ((uint32_t)y_dst[xx] * (max_val - a) + av_bswap16(y_src[ox]) * a + rounding) / max_val;
+            }
+
+            // Chroma is handled once per cell, on its top-left sample
+            if (!chroma_line || (xx & mask_x) != 0)
+            {
+                continue;
+            }
+
+            // Chroma location aware subsampling and blending
+            const int ui = (xx >> pv->wshift) * 2 + 0;
+            const int vi = (xx >> pv->wshift) * 2 + 1;
+            unsigned sum_u = 0, sum_v = 0, sum_w = 0;
+            for (int yz = 0; yz < cell_h && oy + yz < height; yz++)
+            {
+                for (int xz = 0; xz < cell_w && ox + xz < width; xz++)
+                {
+                    const int sx = ox + xz;
+                    const int sy = oy + yz;
+
+                    // Weight of this sample within the chroma cell
+                    const unsigned weight = pv->chroma_coeffs[0][xz] * pv->chroma_coeffs[1][yz];
+                    unsigned u = uv_dst[ui];
+                    unsigned v = uv_dst[vi];
+
+                    // Blend only where the cell overlaps the overlay bitmap
+                    if (sx >= 0 && sy >= 0 && sx < width && sy < height)
+                    {
+                        const unsigned a = a_src[sx + yz * src->plane[3].stride] << shift;
+                        u = (u * (max_val - a) + av_bswap16(u_src[yz * src->plane[1].stride + sx]) * a + rounding) / max_val;
+                        v = (v * (max_val - a) + av_bswap16(v_src[yz * src->plane[2].stride + sx]) * a + rounding) / max_val;
+                    }
+
+                    sum_u += weight * u;
+                    sum_v += weight * v;
+                    sum_w += weight;
+                }
+            }
+
+            // Weighted average, rounded to nearest
+            if (sum_w)
+            {
+                uv_dst[ui] = (sum_u + (sum_w >> 1)) / sum_w;
+                uv_dst[vi] = (sum_v + (sum_w >> 1)) / sum_w;
+            }
+        }
+    }
+}
+
+// Blends an 8-bit YUVA overlay into an 8-bit three-plane picture, for the
+// case where the overlay chroma planes are not subsampled like the picture
+// (the overlay chroma is read at luma resolution).
+//
+// Luma is blended sample by sample. For every chroma sample of the picture,
+// each overlay sample of the matching chroma cell is blended against the
+// current picture chroma, and the results are averaged using the
+// pv->chroma_coeffs weights. `shift` is not used by this variant.
+static void blend_subsample_8on8(const hb_blend_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
+{
+    // Size of one chroma cell in luma samples, and the matching alignment masks
+    const int cell_w = 1 << pv->wshift;
+    const int cell_h = 1 << pv->hshift;
+    const int mask_x = cell_w - 1;
+    const int mask_y = cell_h - 1;
+
+    // Position of the overlay inside the picture
+    const int left = src->f.x;
+    const int top  = src->f.y;
+
+    // Picture coordinates of the first chroma cell touched by the overlay
+    const int start_x = left & ~mask_x;
+    const int start_y = top  & ~mask_y;
+
+    // The overlay position cancels out of the clipping test, which leaves
+    // the smaller of the overlay and picture dimensions.
+    // NOTE(review): presumably the caller keeps the overlay inside the picture - confirm
+    const int width  = (src->f.width  <= dst->f.width)  ? src->f.width  : dst->f.width;
+    const int height = (src->f.height <= dst->f.height) ? src->f.height : dst->f.height;
+
+    for (int yy = start_y; yy - top < height; yy++)
+    {
+        // Overlay row for picture row yy. It is negative on rows above the
+        // overlay: the overlay row pointers then start before the plane and
+        // are only read at offsets that land on a row >= 0.
+        const int oy = yy - top;
+        const int chroma_line = (yy & mask_y) == 0;
+
+        uint8_t *y_dst = dst->plane[0].data + yy * dst->plane[0].stride;
+        uint8_t *u_dst = dst->plane[1].data + (yy >> pv->hshift) * dst->plane[1].stride;
+        uint8_t *v_dst = dst->plane[2].data + (yy >> pv->hshift) * dst->plane[2].stride;
+
+        const uint8_t *y_src = src->plane[0].data + oy * src->plane[0].stride;
+        const uint8_t *u_src = src->plane[1].data + oy * src->plane[1].stride;
+        const uint8_t *v_src = src->plane[2].data + oy * src->plane[2].stride;
+        const uint8_t *a_src = src->plane[3].data + oy * src->plane[3].stride;
+
+        for (int xx = start_x; xx - left < width; xx++)
+        {
+            const int ox = xx - left;
+
+            // Luma: only where the picture sample is covered by the overlay
+            if (ox >= 0 && oy >= 0)
+            {
+                y_dst[xx] = (y_dst[xx] * (255 - a_src[ox]) + y_src[ox] * a_src[ox] + 127) / 255;
+            }
+
+            // Chroma is handled once per cell, on its top-left sample
+            if (!chroma_line || (xx & mask_x) != 0)
+            {
+                continue;
+            }
+
+            // Chroma location aware subsampling and blending
+            const int cx = xx >> pv->wshift;
+            unsigned sum_u = 0, sum_v = 0, sum_w = 0;
+            for (int yz = 0; yz < cell_h && oy + yz < height; yz++)
+            {
+                for (int xz = 0; xz < cell_w && ox + xz < width; xz++)
+                {
+                    const int sx = ox + xz;
+                    const int sy = oy + yz;
+
+                    // Weight of this sample within the chroma cell
+                    const unsigned weight = pv->chroma_coeffs[0][xz] * pv->chroma_coeffs[1][yz];
+                    unsigned u = u_dst[cx];
+                    unsigned v = v_dst[cx];
+
+                    // Blend only where the cell overlaps the overlay bitmap
+                    if (sx >= 0 && sy >= 0 && sx < width && sy < height)
+                    {
+                        const unsigned a = a_src[sx + yz * src->plane[3].stride];
+                        u = (u * (255 - a) + u_src[yz * src->plane[1].stride + sx] * a + 127) / 255;
+                        v = (v * (255 - a) + v_src[yz * src->plane[2].stride + sx] * a + 127) / 255;
+                    }
+
+                    sum_u += weight * u;
+                    sum_v += weight * v;
+                    sum_w += weight;
+                }
+            }
+
+            // Weighted average, rounded to nearest
+            if (sum_w)
+            {
+                u_dst[cx] = (sum_u + (sum_w >> 1)) / sum_w;
+                v_dst[cx] = (sum_v + (sum_w >> 1)) / sum_w;
+            }
+        }
+    }
+}
+
+// Blends an 8-bit YUVA overlay into an 8-bit two-plane picture (luma plane
+// plus interleaved U/V plane), for the case where the overlay chroma planes
+// are not subsampled like the picture.
+//
+// Luma is blended sample by sample. For every chroma pair of the picture,
+// the whole chroma cell is visited: samples covered by the overlay are
+// blended against the current picture chroma, samples outside of it
+// contribute the unblended picture chroma, and everything is averaged using
+// the pv->chroma_coeffs weights. `shift` is not used by this variant.
+static void blend_subsample_8onbi8(const hb_blend_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
+{
+    // Size of one chroma cell in luma samples, and the matching alignment masks
+    const int cell_w = 1 << pv->wshift;
+    const int cell_h = 1 << pv->hshift;
+    const int mask_x = cell_w - 1;
+    const int mask_y = cell_h - 1;
+
+    // Position of the overlay inside the picture
+    const int left = src->f.x;
+    const int top  = src->f.y;
+
+    // Picture coordinates of the first chroma cell touched by the overlay
+    const int start_x = left & ~mask_x;
+    const int start_y = top  & ~mask_y;
+
+    // The overlay position cancels out of the clipping test, which leaves
+    // the smaller of the overlay and picture dimensions.
+    // NOTE(review): presumably the caller keeps the overlay inside the picture - confirm
+    const int width  = (src->f.width  <= dst->f.width)  ? src->f.width  : dst->f.width;
+    const int height = (src->f.height <= dst->f.height) ? src->f.height : dst->f.height;
+
+    for (int yy = start_y; yy - top < height; yy++)
+    {
+        // Overlay row for picture row yy. It is negative on rows above the
+        // overlay: the overlay row pointers then start before the plane and
+        // are only read at offsets that land on a row >= 0.
+        const int oy = yy - top;
+        const int chroma_line = (yy & mask_y) == 0;
+
+        uint8_t *y_dst  = dst->plane[0].data + yy * dst->plane[0].stride;
+        // U and V share plane 1: U at even and V at odd byte positions
+        uint8_t *uv_dst = dst->plane[1].data + (yy >> pv->hshift) * dst->plane[1].stride;
+
+        const uint8_t *y_src = src->plane[0].data + oy * src->plane[0].stride;
+        const uint8_t *u_src = src->plane[1].data + oy * src->plane[1].stride;
+        const uint8_t *v_src = src->plane[2].data + oy * src->plane[2].stride;
+        const uint8_t *a_src = src->plane[3].data + oy * src->plane[3].stride;
+
+        for (int xx = start_x; xx - left < width; xx++)
+        {
+            const int ox = xx - left;
+
+            // Luma: only where the picture sample is covered by the overlay
+            if (ox >= 0 && oy >= 0)
+            {
+                y_dst[xx] = (y_dst[xx] * (255 - a_src[ox]) + y_src[ox] * a_src[ox] + 127) / 255;
+            }
+
+            // Chroma is handled once per cell, on its top-left sample
+            if (!chroma_line || (xx & mask_x) != 0)
+            {
+                continue;
+            }
+
+            // Chroma location aware subsampling and blending
+            const int ui = (xx >> pv->wshift) * 2 + 0;
+            const int vi = (xx >> pv->wshift) * 2 + 1;
+            unsigned sum_u = 0, sum_v = 0, sum_w = 0;
+            for (int yz = 0; yz < cell_h; yz++)
+            {
+                for (int xz = 0; xz < cell_w; xz++)
+                {
+                    const int sx = ox + xz;
+                    const int sy = oy + yz;
+
+                    // Weight of this sample within the chroma cell
+                    const unsigned weight = pv->chroma_coeffs[0][xz] * pv->chroma_coeffs[1][yz];
+                    unsigned u = uv_dst[ui];
+                    unsigned v = uv_dst[vi];
+
+                    // Blend only where the cell overlaps the overlay bitmap
+                    if (sx >= 0 && sy >= 0 && sx < width && sy < height)
+                    {
+                        const unsigned a = a_src[sx + yz * src->plane[3].stride];
+                        u = (u * (255 - a) + u_src[yz * src->plane[1].stride + sx] * a + 127) / 255;
+                        v = (v * (255 - a) + v_src[yz * src->plane[2].stride + sx] * a + 127) / 255;
+                    }
+
+                    sum_u += weight * u;
+                    sum_v += weight * v;
+                    sum_w += weight;
+                }
+            }
+
+            // Weighted average, rounded to nearest
+            if (sum_w)
+            {
+                uv_dst[ui] = (sum_u + (sum_w >> 1)) / sum_w;
+                uv_dst[vi] = (sum_v + (sum_w >> 1)) / sum_w;
+            }
+        }
+    }
+}
+
+// Blends an 8-bit YUVA4**P overlay (src) into an 8-bit three-plane picture
+// (dst) that uses the same chroma subsampling as the overlay.
+//
+// The overlay is placed at (src->f.x, src->f.y) and clipped against the
+// picture. Divisions by 255 truncate. `pv` and `shift` are not used here.
+static void blend8on8(const hb_blend_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
+{
+    const int left = src->f.x;
+    const int top  = src->f.y;
+
+    // First overlay column/row to blend: skips whatever lies left of or
+    // above the picture
+    const int x0 = (left < 0) ? -left : 0;
+    const int y0 = (top  < 0) ? -top  : 0;
+
+    // One past the last overlay column/row to blend
+    const int ww = (src->f.width  - x0 > dst->f.width  - left) ? (dst->f.width  - left + x0) : src->f.width;
+    const int hh = (src->f.height - y0 > dst->f.height - top)  ? (dst->f.height - top  + y0) : src->f.height;
+
+    // Blend luma
+    for (int row = y0; row < hh; row++)
+    {
+        const uint8_t *y_src = src->plane[0].data + row * src->plane[0].stride;
+        const uint8_t *a_src = src->plane[3].data + row * src->plane[3].stride;
+        uint8_t *y_dst = dst->plane[0].data + (row + top) * dst->plane[0].stride;
+
+        for (int col = x0; col < ww; col++)
+        {
+            const uint8_t a = a_src[col];
+            // Mix the overlay luma into the picture according to alpha
+            y_dst[left + col] = ((uint16_t)y_dst[left + col] * (255 - a) +
+                                 (uint16_t)y_src[col] * a) / 255;
+        }
+    }
+
+    // Blend U & V
+    // Assumes source and dest are the same PIX_FMT; the subsampling is
+    // derived from the picture's plane dimensions
+    const int hshift = (dst->plane[1].height < dst->plane[0].height) ? 1 : 0;
+    const int wshift = (dst->plane[1].width  < dst->plane[0].width)  ? 1 : 0;
+
+    const int c_top   = top  >> hshift;
+    const int c_left  = left >> wshift;
+    const int c_y_end = hh >> hshift;
+    const int c_x_end = ww >> wshift;
+
+    for (int row = y0 >> hshift; row < c_y_end; row++)
+    {
+        const uint8_t *u_src = src->plane[1].data + row * src->plane[1].stride;
+        const uint8_t *v_src = src->plane[2].data + row * src->plane[2].stride;
+        // Alpha stays at luma resolution: read the cell's first row
+        const uint8_t *a_src = src->plane[3].data + (row << hshift) * src->plane[3].stride;
+        uint8_t *u_dst = dst->plane[1].data + (row + c_top) * dst->plane[1].stride;
+        uint8_t *v_dst = dst->plane[2].data + (row + c_top) * dst->plane[2].stride;
+
+        for (int col = x0 >> wshift; col < c_x_end; col++)
+        {
+            const uint8_t a  = a_src[col << wshift];
+            const int     ci = c_left + col;
+
+            // Blend U and alpha
+            u_dst[ci] = ((uint16_t)u_dst[ci] * (255 - a) +
+                         (uint16_t)u_src[col] * a) / 255;
+
+            // Blend V and alpha
+            v_dst[ci] = ((uint16_t)v_dst[ci] * (255 - a) +
+                         (uint16_t)v_src[col] * a) / 255;
+        }
+    }
+}
+
+// Blends an 8-bit YUVA overlay (src) into a three-plane picture stored in
+// 16-bit samples (dst) that uses the same chroma subsampling as the overlay.
+//
+// Overlay values and alpha are scaled up by `shift` bits before blending.
+// The overlay is placed at (src->f.x, src->f.y) and clipped against the
+// picture. Divisions truncate. `pv` is not used here.
+static void blend8on1x(const hb_blend_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
+{
+    const int left = src->f.x;
+    const int top  = src->f.y;
+
+    // First overlay column/row to blend: skips whatever lies left of or
+    // above the picture
+    const int x0 = (left < 0) ? -left : 0;
+    const int y0 = (top  < 0) ? -top  : 0;
+
+    // One past the last overlay column/row to blend
+    const int ww = (src->f.width  - x0 > dst->f.width  - left) ? (dst->f.width  - left + x0) : src->f.width;
+    const int hh = (src->f.height - y0 > dst->f.height - top)  ? (dst->f.height - top  + y0) : src->f.height;
+
+    // Full-scale value for the picture bit depth
+    const int max = (256 << shift) - 1;
+
+    // Blend luma
+    for (int row = y0; row < hh; row++)
+    {
+        const uint8_t *y_src = src->plane[0].data + row * src->plane[0].stride;
+        const uint8_t *a_src = src->plane[3].data + row * src->plane[3].stride;
+        uint16_t *y_dst = (uint16_t*)(dst->plane[0].data + (row + top) * dst->plane[0].stride);
+
+        for (int col = x0; col < ww; col++)
+        {
+            const uint16_t a = a_src[col] << shift;
+            // Mix the overlay luma into the picture according to alpha
+            y_dst[left + col] = ((uint32_t)y_dst[left + col] * (max - a) +
+                                 ((uint32_t)y_src[col] << shift) * a) / max;
+        }
+    }
+
+    // Blend U & V; the subsampling is derived from the picture's plane
+    // dimensions
+    const int hshift = (dst->plane[1].height < dst->plane[0].height) ? 1 : 0;
+    const int wshift = (dst->plane[1].width  < dst->plane[0].width)  ? 1 : 0;
+
+    const int c_top   = top  >> hshift;
+    const int c_left  = left >> wshift;
+    const int c_y_end = hh >> hshift;
+    const int c_x_end = ww >> wshift;
+
+    for (int row = y0 >> hshift; row < c_y_end; row++)
+    {
+        const uint8_t *u_src = src->plane[1].data + row * src->plane[1].stride;
+        const uint8_t *v_src = src->plane[2].data + row * src->plane[2].stride;
+        // Alpha stays at luma resolution: read the cell's first row
+        const uint8_t *a_src = src->plane[3].data + (row << hshift) * src->plane[3].stride;
+        uint16_t *u_dst = (uint16_t*)(dst->plane[1].data + (row + c_top) * dst->plane[1].stride);
+        uint16_t *v_dst = (uint16_t*)(dst->plane[2].data + (row + c_top) * dst->plane[2].stride);
+
+        for (int col = x0 >> wshift; col < c_x_end; col++)
+        {
+            const uint16_t a  = a_src[col << wshift] << shift;
+            const int      ci = c_left + col;
+
+            // Blend U and alpha
+            u_dst[ci] = ((uint32_t)u_dst[ci] * (max - a) +
+                         ((uint32_t)u_src[col] << shift) * a) / max;
+
+            // Blend V and alpha
+            v_dst[ci] = ((uint32_t)v_dst[ci] * (max - a) +
+                         ((uint32_t)v_src[col] << shift) * a) / max;
+        }
+    }
+}
+
+// Blends an 8-bit YUVA overlay (src) into an 8-bit two-plane picture (dst:
+// luma plane plus interleaved U/V plane) that uses the same chroma
+// subsampling as the overlay.
+//
+// The overlay is placed at (src->f.x, src->f.y) and clipped against the
+// picture. Divisions by 255 truncate. `pv` and `shift` are not used here.
+static void blend8onbi8(const hb_blend_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
+{
+    const int left = src->f.x;
+    const int top  = src->f.y;
+
+    // First overlay column/row to blend: skips whatever lies left of or
+    // above the picture
+    const int x0 = (left < 0) ? -left : 0;
+    const int y0 = (top  < 0) ? -top  : 0;
+
+    // One past the last overlay column/row to blend
+    const int ww = (src->f.width  - x0 > dst->f.width  - left) ? (dst->f.width  - left + x0) : src->f.width;
+    const int hh = (src->f.height - y0 > dst->f.height - top)  ? (dst->f.height - top  + y0) : src->f.height;
+
+    // Blend luma
+    for (int row = y0; row < hh; row++)
+    {
+        const uint8_t *y_src = src->plane[0].data + row * src->plane[0].stride;
+        const uint8_t *a_src = src->plane[3].data + row * src->plane[3].stride;
+        uint8_t *y_dst = dst->plane[0].data + (row + top) * dst->plane[0].stride;
+
+        for (int col = x0; col < ww; col++)
+        {
+            const uint8_t a = a_src[col];
+            // Mix the overlay luma into the picture according to alpha
+            y_dst[left + col] = ((uint16_t)y_dst[left + col] * (255 - a) +
+                                 (uint16_t)y_src[col] * a) / 255;
+        }
+    }
+
+    // Blend U & V
+    // Assumes source and dest are the same PIX_FMT; the subsampling is
+    // derived from the picture's plane dimensions
+    const int hshift = (dst->plane[1].height < dst->plane[0].height) ? 1 : 0;
+    const int wshift = (dst->plane[1].width  < dst->plane[0].width)  ? 1 : 0;
+
+    const int c_top   = top  >> hshift;
+    const int c_left  = left >> wshift;
+    const int c_y_end = hh >> hshift;
+    const int c_x_end = ww >> wshift;
+
+    for (int row = y0 >> hshift; row < c_y_end; row++)
+    {
+        const uint8_t *u_src = src->plane[1].data + row * src->plane[1].stride;
+        const uint8_t *v_src = src->plane[2].data + row * src->plane[2].stride;
+        // Alpha stays at luma resolution: read the cell's first row
+        const uint8_t *a_src = src->plane[3].data + (row << hshift) * src->plane[3].stride;
+        // U and V share plane 1: U at even and V at odd byte positions
+        uint8_t *uv_dst = dst->plane[1].data + (row + c_top) * dst->plane[1].stride;
+
+        for (int col = x0 >> wshift; col < c_x_end; col++)
+        {
+            const uint8_t a  = a_src[col << wshift];
+            const int     ui = (c_left + col) * 2;
+            const int     vi = (c_left + col) * 2 + 1;
+
+            // Blend U and alpha
+            uv_dst[ui] = ((uint16_t)uv_dst[ui] * (255 - a) +
+                          (uint16_t)u_src[col] * a) / 255;
+
+            // Blend V and alpha
+            uv_dst[vi] = ((uint16_t)uv_dst[vi] * (255 - a) +
+                          (uint16_t)v_src[col] * a) / 255;
+        }
+    }
+}
+
+// Blends an 8-bit YUVA overlay (src) into a two-plane picture stored in
+// 16-bit samples (dst: luma plane plus interleaved U/V plane) that uses the
+// same chroma subsampling as the overlay.
+//
+// Overlay colour values go through av_bswap16(), which moves an 8-bit value
+// into the upper byte of the 16-bit sample, while alpha is scaled by `shift`.
+// The overlay is placed at (src->f.x, src->f.y) and clipped against the
+// picture. Divisions truncate. `pv` is not used here.
+static void blend8onbi1x(const hb_blend_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
+{
+    const int left = src->f.x;
+    const int top  = src->f.y;
+
+    // First overlay column/row to blend: skips whatever lies left of or
+    // above the picture
+    const int x0 = (left < 0) ? -left : 0;
+    const int y0 = (top  < 0) ? -top  : 0;
+
+    // One past the last overlay column/row to blend
+    const int ww = (src->f.width  - x0 > dst->f.width  - left) ? (dst->f.width  - left + x0) : src->f.width;
+    const int hh = (src->f.height - y0 > dst->f.height - top)  ? (dst->f.height - top  + y0) : src->f.height;
+
+    // Full-scale alpha value for the picture bit depth
+    const int max = (256 << shift) - 1;
+
+    // Blend luma
+    for (int row = y0; row < hh; row++)
+    {
+        const uint8_t *y_src = src->plane[0].data + row * src->plane[0].stride;
+        const uint8_t *a_src = src->plane[3].data + row * src->plane[3].stride;
+        uint16_t *y_dst = (uint16_t*)(dst->plane[0].data + (row + top) * dst->plane[0].stride);
+
+        for (int col = x0; col < ww; col++)
+        {
+            const uint16_t a = a_src[col] << shift;
+            // Mix the overlay luma into the picture according to alpha
+            y_dst[left + col] = ((uint32_t)y_dst[left + col] * (max - a) +
+                                 ((uint32_t)av_bswap16(y_src[col])) * a) / max;
+        }
+    }
+
+    // Blend U & V; the subsampling is derived from the picture's plane
+    // dimensions
+    const int hshift = (dst->plane[1].height < dst->plane[0].height) ? 1 : 0;
+    const int wshift = (dst->plane[1].width  < dst->plane[0].width)  ? 1 : 0;
+
+    const int c_top   = top  >> hshift;
+    const int c_left  = left >> wshift;
+    const int c_y_end = hh >> hshift;
+    const int c_x_end = ww >> wshift;
+
+    for (int row = y0 >> hshift; row < c_y_end; row++)
+    {
+        const uint8_t *u_src = src->plane[1].data + row * src->plane[1].stride;
+        const uint8_t *v_src = src->plane[2].data + row * src->plane[2].stride;
+        // Alpha stays at luma resolution: read the cell's first row
+        const uint8_t *a_src = src->plane[3].data + (row << hshift) * src->plane[3].stride;
+        // U and V share plane 1: U at even and V at odd sample positions
+        uint16_t *uv_dst = (uint16_t *)(dst->plane[1].data + (row + c_top) * dst->plane[1].stride);
+
+        for (int col = x0 >> wshift; col < c_x_end; col++)
+        {
+            const uint16_t a  = a_src[col << wshift] << shift;
+            const int      ui = (c_left + col) * 2;
+            const int      vi = (c_left + col) * 2 + 1;
+
+            // Blend U and alpha
+            uv_dst[ui] = ((uint32_t)uv_dst[ui] * (max - a) +
+                          ((uint32_t)av_bswap16(u_src[col])) * a) / max;
+
+            // Blend V and alpha
+            uv_dst[vi] = ((uint32_t)uv_dst[vi] * (max - a) +
+                          ((uint32_t)av_bswap16(v_src[col])) * a) / max;
+        }
+    }
+}
+
+// Allocates the private state of the blend object and selects the blend
+// routine matching the input picture.
+//
+// in_pix_fmt          pixel format of the pictures to blend into
+// in_chroma_location  chroma sample location of the input, used to compute
+//                     the chroma smoothing weights
+// overlay_pix_fmt     pixel format of the overlays
+//
+// Returns 0 on success. Returns -1 and leaves object->private_data set to
+// NULL when a pixel format is unknown or the allocation fails.
+static int hb_blend_init(hb_blend_object_t *object,
+                         int in_pix_fmt,
+                         int in_chroma_location,
+                         int overlay_pix_fmt)
+{
+    object->private_data = NULL;
+
+    // av_pix_fmt_desc_get() returns NULL for formats it does not know
+    const AVPixFmtDescriptor *in_desc = av_pix_fmt_desc_get(in_pix_fmt);
+    const AVPixFmtDescriptor *overlay_desc = av_pix_fmt_desc_get(overlay_pix_fmt);
+    if (in_desc == NULL || overlay_desc == NULL)
+    {
+        hb_error("blend: unknown pixel format");
+        return -1;
+    }
+
+    hb_blend_private_t *pv = calloc(1, sizeof(*pv));
+    if (pv == NULL)
+    {
+        hb_error("blend: calloc failed");
+        return -1;
+    }
+    object->private_data = pv;
+
+    pv->depth  = in_desc->comp[0].depth;
+    pv->wshift = in_desc->log2_chroma_w;
+    pv->hshift = in_desc->log2_chroma_h;
+
+    hb_compute_chroma_smoothing_coefficient(pv->chroma_coeffs,
+                                            in_pix_fmt,
+                                            in_chroma_location);
+
+    // The subsampling variants are needed whenever the overlay chroma
+    // layout differs from the one of the input picture
+    const int needs_subsample = in_desc->log2_chroma_w != overlay_desc->log2_chroma_w ||
+                                in_desc->log2_chroma_h != overlay_desc->log2_chroma_h;
+    const int planes_count = av_pix_fmt_count_planes(in_pix_fmt);
+
+    // Two planes select the interleaved U/V ("bi") variants, a depth of 8
+    // selects the 8-bit output variants; anything else falls back to the
+    // planar and 16-bit storage variants respectively
+    switch (pv->depth)
+    {
+        case 8:
+            switch (planes_count)
+            {
+                case 2:
+                    pv->blend = needs_subsample ? blend_subsample_8onbi8 : blend8onbi8;
+                    break;
+                default:
+                    pv->blend = needs_subsample ? blend_subsample_8on8 : blend8on8;
+                    break;
+            }
+            break;
+        default:
+            switch (planes_count)
+            {
+                case 2:
+                    pv->blend = needs_subsample ? blend_subsample_8onbi1x : blend8onbi1x;
+                    break;
+                default:
+                    pv->blend = needs_subsample ? blend_subsample_8on1x : blend8on1x;
+                    break;
+            }
+            break;
+    }
+
+    return 0;
+}
+
+// Blends every overlay of the list into the input picture.
+//
+// The picture is modified in place when it is writable. Otherwise it is
+// duplicated, the input buffer is closed and the copy is blended and
+// returned. With an empty overlay list the input is returned untouched.
+//
+// Returns the blended buffer, or NULL when the input had to be duplicated
+// and the duplication failed (the input buffer is closed in that case too).
+static hb_buffer_t * hb_blend_work(hb_blend_object_t *object,
+                                   hb_buffer_t *in,
+                                   hb_buffer_list_t *overlays)
+{
+    hb_blend_private_t *pv = object->private_data;
+    hb_buffer_t *out = in;
+
+    if (hb_buffer_list_count(overlays) == 0)
+    {
+        return out;
+    }
+
+    if (hb_buffer_is_writable(in) == 0)
+    {
+        out = hb_buffer_dup(in);
+        hb_buffer_close(&in);
+        if (out == NULL)
+        {
+            // Do not hand a NULL destination to the blend routine
+            hb_error("blend: failed to duplicate the input buffer");
+            return NULL;
+        }
+    }
+
+    for (hb_buffer_t *overlay = hb_buffer_list_head(overlays); overlay; overlay = overlay->next)
+    {
+        // The blend routines take the bit depth above 8 as scaling shift
+        pv->blend(pv, out, overlay, pv->depth - 8);
+    }
+
+    return out;
+}
+
+// Releases the private state allocated by hb_blend_init().
+// Safe to call when init failed or when close was already called.
+static void hb_blend_close(hb_blend_object_t *object)
+{
+    // free(NULL) is a no-op, so no guard is needed
+    free(object->private_data);
+
+    // Clear the pointer so that a second close or a late work call cannot
+    // touch freed memory
+    object->private_data = NULL;
+}
--- a/libhb/common.c
+++ b/libhb/common.c
@ -6236,6 +6236,45 @@ int hb_rgb2yuv_bt2020(int rgb)
    return (y << 16) | (Cr << 8) | Cb;
 }

+// Computes the weights used to average full resolution samples down to one
+// chroma sample, taking the chroma sample location into account.
+//
+// chroma_coeffs[0] receives the horizontal and chroma_coeffs[1] the vertical
+// weights, indexed by the position of a sample inside its chroma cell.
+// An unknown pixel format, or a subsampling factor the coefficient table
+// cannot represent, yields all-zero weights.
+void hb_compute_chroma_smoothing_coefficient(unsigned chroma_coeffs[2][4], int pix_fmt, int chroma_location)
+{
+    // Symmetric kernel from which a window of 4 weights is cut out
+    static const unsigned base_coefficients[] = {1, 3, 9, 27, 9, 3, 1};
+
+    // av_pix_fmt_desc_get() returns NULL for formats it does not know.
+    // A log2 subsampling above 2 would make the window offsets negative.
+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
+    if (desc == NULL || desc->log2_chroma_w > 2 || desc->log2_chroma_h > 2)
+    {
+        hb_error("chroma smoothing: unsupported pixel format");
+        for (int x = 0; x < 4; x++)
+        {
+            chroma_coeffs[0][x] = 0;
+            chroma_coeffs[1][x] = 0;
+        }
+        return;
+    }
+
+    const int sub_w = 1 << desc->log2_chroma_w;
+    const int sub_h = 1 << desc->log2_chroma_h;
+
+    // Compute chroma smoothing coefficients wrt video chroma location:
+    // wX and wY are the offsets of the window inside the kernel
+    int wX = 4 - sub_w;
+    int wY = 4 - sub_h;
+
+    switch (chroma_location)
+    {
+        case AVCHROMA_LOC_TOPLEFT:
+            wX += sub_w - 1;
+            /* fallthrough */
+        case AVCHROMA_LOC_TOP:
+            wY += sub_h - 1;
+            break;
+        case AVCHROMA_LOC_LEFT:
+            wX += sub_w - 1;
+            break;
+        case AVCHROMA_LOC_BOTTOMLEFT:
+            wX += sub_w - 1;
+            /* fallthrough */
+        case AVCHROMA_LOC_BOTTOM:
+            wY += sub_h - 1;
+            /* fallthrough */
+        case AVCHROMA_LOC_CENTER:
+        default: // Center chroma value for unknown/unsupported
+            break;
+    }
+
+    // If wZ is even, an intermediate value is interpolated for symmetry:
+    // each weight is then the mean of two neighbouring kernel entries.
+    const int even_x = !(wX & 0x1);
+    const int even_y = !(wY & 0x1);
+    for (int x = 0; x < 4; x++)
+    {
+        chroma_coeffs[0][x] = (base_coefficients[x + wX] +
+                               base_coefficients[x + wX + even_x]) >> 1;
+        chroma_coeffs[1][x] = (base_coefficients[x + wY] +
+                               base_coefficients[x + wY + even_y]) >> 1;
+    }
+}
+
 const char * hb_subsource_name( int source )
 {
    switch (source)
--- a/libhb/handbrake/common.h
+++ b/libhb/handbrake/common.h
@ -1511,20 +1511,6 @@ struct hb_filter_object_s
 #endif
 };

-struct hb_motion_metric_object_s
-{
-    char                * name;
-
-#ifdef __LIBHB__
-    int                (* init)       ( hb_motion_metric_object_t *, hb_filter_init_t * );
-    float              (* work)       ( hb_motion_metric_object_t *,
-                                        hb_buffer_t *, hb_buffer_t * );
-    void               (* close)      ( hb_motion_metric_object_t * );
-
-    hb_motion_metric_private_t * private_data;
-#endif
-};
-
 // Update win/CS/HandBrake.Interop/HandBrakeInterop/HbLib/hb_filter_ids.cs when changing this enum
 enum
 {
@ -1591,6 +1577,34 @@ char               * hb_filter_settings_string(int filter_id,
 char               * hb_filter_settings_string_json(int filter_id,
                                                    const char * json);

+struct hb_motion_metric_object_s
+{
+    char                * name;
+
+#ifdef __LIBHB__ // callbacks and private state are declared only when __LIBHB__ is defined
+    int                (* init)       ( hb_motion_metric_object_t *, hb_filter_init_t * );
+    float              (* work)       ( hb_motion_metric_object_t *,
+                                        hb_buffer_t *, hb_buffer_t * ); // presumably the metric between the two buffers -- confirm
+    void               (* close)      ( hb_motion_metric_object_t * );
+
+    hb_motion_metric_private_t * private_data; // implementation-private state
+#endif
+};
+
+struct hb_blend_object_s // a blend implementation: callbacks plus per-instance private state
+{
+    char                * name;
+
+#ifdef __LIBHB__ // callbacks and private state are declared only when __LIBHB__ is defined
+    int                (* init)       ( hb_blend_object_t *, int in_pix_fmt, int in_chroma_location, int sub_pix_fmt ); // non-zero is treated as failure by rendersub
+    hb_buffer_t *      (* work)       ( hb_blend_object_t *,
+                                        hb_buffer_t *, hb_buffer_list_t * ); // (object, input frame, overlay list) -> blended frame
+    void               (* close)      ( hb_blend_object_t * ); // releases private_data; does not free the object itself
+
+    hb_blend_private_t * private_data; // implementation-private state
+#endif
+};
+
 typedef void hb_error_handler_t( const char *errmsg );

 extern void hb_register_error_handler( hb_error_handler_t * handler );
@ -1614,6 +1628,9 @@ int hb_rgb2yuv(int rgb);
 int hb_rgb2yuv_bt709(int rgb);
 int hb_rgb2yuv_bt2020(int rgb);

+void hb_compute_chroma_smoothing_coefficient(unsigned chroma_coeffs[2][4],
+                                             int pix_fmt, int chroma_location);
+
 const char * hb_subsource_name( int source );

 // unparse a set of x264 settings to an HB encopts string
--- a/libhb/handbrake/hbtypes.h
+++ b/libhb/handbrake/hbtypes.h
@ -42,6 +42,8 @@ typedef struct hb_filter_private_s hb_filter_private_t;
 typedef struct hb_filter_object_s  hb_filter_object_t;
 typedef struct hb_motion_metric_private_s  hb_motion_metric_private_t;
 typedef struct hb_motion_metric_object_s  hb_motion_metric_object_t;
+typedef struct hb_blend_private_s  hb_blend_private_t;
+typedef struct hb_blend_object_s  hb_blend_object_t;
 typedef struct hb_buffer_settings_s hb_buffer_settings_t;
 typedef struct hb_image_format_s hb_image_format_t;
 typedef struct hb_fifo_s hb_fifo_t;
--- a/libhb/handbrake/internal.h
+++ b/libhb/handbrake/internal.h
@ -467,6 +467,8 @@ extern hb_motion_metric_object_t hb_motion_metric;
 extern hb_motion_metric_object_t hb_motion_metric_vt;
 #endif

+extern hb_blend_object_t hb_blend;
+
 extern hb_work_object_t * hb_objects;

 #define HB_WORK_IDLE     0
--- a/libhb/rendersub.c
+++ b/libhb/rendersub.c
@ -10,7 +10,6 @@
 #include "handbrake/handbrake.h"
 #include "handbrake/hbffmpeg.h"
 #include "handbrake/extradata.h"
-#include "libavutil/bswap.h"
 #include <ass/ass.h>

 #define ABS(a) ((a) > 0 ? (a) : (-(a)))
@ -31,7 +30,6 @@ struct hb_filter_private_s
 {
    // Common
    int                pix_fmt_alpha;
-    int                depth;
    int                hshift;
    int                wshift;
    int                crop[4];
@ -56,7 +54,7 @@ struct hb_filter_private_s
    int                line;
    hb_buffer_t       *current_sub;

-    void (*blend)(const struct hb_filter_private_s *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift);
+    hb_blend_object_t *blend;
    unsigned           chroma_coeffs[2][4];

    hb_filter_init_t   input;
@ -238,385 +236,6 @@ static void hb_box_vec_close(hb_box_vec_t *vec)
    vec->size = 0;
 }

-static void blend8on1x(const hb_filter_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
-{
-    int x0, y0, x0c, y0c;
-    int xx, yy, ox, oy;
-    int width, height;
-    uint8_t *y_in, *u_in, *v_in, *a_in;
-    uint16_t *y_out, *u_out, *v_out;
-    const unsigned max_val = (256 << shift) - 1;
-
-    const int left = x0 = src->f.x;
-    const int top  = y0 = src->f.y;
-
-    // Coordinates of the first chroma sample affected by the overlay
-    x0c = x0 & ~((1 << pv->wshift) - 1);
-    y0c = y0 & ~((1 << pv->hshift) - 1);
-
-    width  = (src->f.width  - x0 <= dst->f.width - left) ? src->f.width  : (dst->f.width - left + x0);
-    height = (src->f.height - y0 <= dst->f.height - top) ? src->f.height : (dst->f.height - top + y0);
-
-    const unsigned int ovVertShift = NULL == pv->ssa ? 0 : pv->hshift;
-    const unsigned int ovHorzShift = NULL == pv->ssa ? 0 : pv->wshift;
-
-    // This is setting the pointer outside of the array range if y0c < y0
-    oy = y0c - y0;
-
-    unsigned is_chroma_line, resU, resV, alpha;
-    unsigned accuA, accuB, accuC, coeff;
-    for (yy = y0c; oy < height; oy = ++yy - y0)
-    {
-        y_out = (uint16_t*)(dst->plane[0].data + yy * dst->plane[0].stride);
-        u_out = (uint16_t*)(dst->plane[1].data + (yy >> pv->hshift) * dst->plane[1].stride);
-        v_out = (uint16_t*)(dst->plane[2].data + (yy >> pv->hshift) * dst->plane[2].stride);
-
-        y_in = src->plane[0].data + oy * src->plane[0].stride;
-        u_in = src->plane[1].data + (oy >> ovVertShift) * src->plane[1].stride;
-        v_in = src->plane[2].data + (oy >> ovVertShift) * src->plane[2].stride;
-        a_in = src->plane[3].data + oy * src->plane[3].stride;
-
-        ox = x0c - x0;
-        is_chroma_line = yy == (yy & ~((1 << pv->hshift) - 1));
-        for (xx = x0c; ox < width; ox = ++xx - x0)
-        {
-            if (ox >= 0 && oy >= 0)
-            {
-                alpha = a_in[ox] << shift;
-                y_out[xx] = ((uint32_t)y_out[xx] * (max_val - alpha) + ((uint32_t)y_in[ox] << shift) * alpha + (max_val >> 1)) / max_val;
-            }
-
-            if (is_chroma_line && xx == (xx & ~((1 << pv->wshift) - 1)))
-            {
-                // Perform chromaloc-aware subsampling and blending
-                accuA = accuB = accuC = 0;
-                for (int yz = 0, oyz = oy; yz < (1 << (pv->hshift - ovVertShift)) && oy + yz < height; yz++, oyz++)
-                {
-                    for (int xz = 0, oxz = ox; xz < (1 << (pv->wshift - ovHorzShift)) && ox + xz < width; xz++, oxz++)
-                    {
-                        // Weight of the current chroma sample
-                        coeff = pv->chroma_coeffs[0][xz] * pv->chroma_coeffs[1][yz];
-                        resU = u_out[xx >> pv->wshift];
-                        resV = v_out[xx >> pv->wshift];
-
-                        // Chroma sampled area overlap with bitmap
-                        if (oxz >= 0 && oyz >= 0 && ox + xz < width && oy + yz < height)
-                        {
-                            alpha = (uint32_t)a_in[oxz + yz * src->plane[3].stride] << shift;
-                            resU *= (max_val - alpha);
-                            resU = (resU + ((uint32_t)(u_in + (yz >> ovVertShift) * src->plane[1].stride)[oxz >> ovHorzShift] << shift) * alpha + (max_val>>1)) / max_val;
-
-                            resV *= (max_val - alpha);
-                            resV = (resV + ((uint32_t)(v_in + (yz >> ovVertShift) * src->plane[2].stride)[oxz >> ovHorzShift] << shift) * alpha + (max_val>>1)) / max_val;
-                        }
-
-                        // Accumulate
-                        accuA += coeff * resU;
-                        accuB += coeff * resV;
-                        accuC += coeff;
-                    }
-                }
-                if (accuC)
-                {
-                    u_out[xx >> pv->wshift] = (accuA + (accuC>>1))/accuC;
-                    v_out[xx >> pv->wshift] = (accuB + (accuC>>1))/accuC;
-                }
-            }
-        }
-    }
-}
-
-static void blend8onbi1x(const hb_filter_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
-{
-    int x0, y0, x0c, y0c;
-    int xx, yy, ox, oy;
-    int width, height;
-    uint8_t *y_in, *u_in, *v_in, *a_in;
-    uint16_t *y_out, *u_out, *v_out;
-    const unsigned max_val = (256 << shift) - 1;
-
-    const int left = x0 = src->f.x;
-    const int top  = y0 = src->f.y;
-
-    // Coordinates of the first chroma sample affected by the overlay
-    x0c = x0 & ~((1 << pv->wshift) - 1);
-    y0c = y0 & ~((1 << pv->hshift) - 1);
-
-    width  = (src->f.width  - x0 <= dst->f.width - left) ? src->f.width  : (dst->f.width - left + x0);
-    height = (src->f.height - y0 <= dst->f.height - top) ? src->f.height : (dst->f.height - top + y0);
-
-    const unsigned int ovVertShift = NULL == pv->ssa ? 0 : pv->hshift;
-    const unsigned int ovHorzShift = NULL == pv->ssa ? 0 : pv->wshift;
-
-    // This is setting the pointer outside of the array range if y0c < y0
-    oy = y0c - y0;
-
-    unsigned is_chroma_line, resU, resV, alpha;
-    unsigned accuA, accuB, accuC, coeff;
-    for (yy = y0c; oy < height; oy = ++yy - y0)
-    {
-        y_out = (uint16_t*)(dst->plane[0].data + yy * dst->plane[0].stride);
-        u_out = (uint16_t*)(dst->plane[1].data + (yy >> pv->hshift) * dst->plane[1].stride);
-        v_out = u_out;
-
-        y_in = src->plane[0].data + oy * src->plane[0].stride;
-        u_in = src->plane[1].data + (oy >> ovVertShift) * src->plane[1].stride;
-        v_in = src->plane[2].data + (oy >> ovVertShift) * src->plane[2].stride;
-        a_in = src->plane[3].data + oy * src->plane[3].stride;
-
-        ox = x0c - x0;
-        is_chroma_line = yy == (yy & ~((1 << pv->hshift) - 1));
-        for (xx = x0c; ox < width; ox = ++xx - x0)
-        {
-            if (ox >= 0 && oy >= 0)
-            {
-                alpha = a_in[ox] << shift;
-                y_out[xx] = ((uint32_t)y_out[xx] * (max_val - alpha) + av_bswap16(y_in[ox]) * alpha + (max_val >> 1)) / max_val;
-            }
-
-            if (is_chroma_line && xx == (xx & ~((1 << pv->wshift) - 1)))
-            {
-                // Perform chromaloc-aware subsampling and blending
-                accuA = accuB = accuC = 0;
-                for (int yz = 0, oyz = oy; yz < (1 << (pv->hshift - ovVertShift)) && oy + yz < height; yz++, oyz++)
-                {
-                    for (int xz = 0, oxz = ox; xz < (1 << (pv->wshift - ovHorzShift)) && ox + xz < width; xz++, oxz++)
-                    {
-                        // Weight of the current chroma sample
-                        coeff = pv->chroma_coeffs[0][xz] * pv->chroma_coeffs[1][yz];
-                        resU = u_out[(xx >> pv->wshift) * 2 + 0];
-                        resV = v_out[(xx >> pv->wshift) * 2 + 1];
-
-                        // Chroma sampled area overlap with bitmap
-                        if (oxz >= 0 && oyz >= 0 && ox + xz < width && oy + yz < height)
-                        {
-                            alpha = a_in[oxz + yz*src->plane[3].stride] << shift;
-                            resU *= (max_val - alpha);
-                            resU = (resU + av_bswap16((u_in + (yz >> ovVertShift) * src->plane[1].stride)[oxz >> ovHorzShift]) * alpha + (max_val>>1)) / max_val;
-
-                            resV *= (max_val - alpha);
-                            resV = (resV + av_bswap16((v_in + (yz >> ovVertShift) * src->plane[2].stride)[oxz >> ovHorzShift]) * alpha + (max_val>>1)) / max_val;
-                        }
-
-                        // Accumulate
-                        accuA += coeff * resU;
-                        accuB += coeff * resV;
-                        accuC += coeff;
-                    }
-                }
-                if (accuC)
-                {
-                    u_out[(xx >> pv->wshift) * 2 + 0] = (accuA + (accuC>>1))/accuC;
-                    v_out[(xx >> pv->wshift) * 2 + 1] = (accuB + (accuC>>1))/accuC;
-                }
-            }
-        }
-    }
-}
-
-static void blend8on8(const hb_filter_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
-{
-    int x0, y0, x0c, y0c;
-    int xx, yy, ox, oy;
-    int width, height;
-    uint8_t *y_in, *y_out;
-    uint8_t *u_in, *u_out;
-    uint8_t *v_in, *v_out;
-    uint8_t *a_in;
-
-    const int left = x0 = src->f.x;
-    const int top  = y0 = src->f.y;
-
-    // Coordinates of the first chroma sample affected by the overlay
-    x0c = x0 & ~((1 << pv->wshift) - 1);
-    y0c = y0 & ~((1 << pv->hshift) - 1);
-
-    width  = (src->f.width  - x0 <= dst->f.width - left) ? src->f.width  : (dst->f.width - left + x0);
-    height = (src->f.height - y0 <= dst->f.height - top) ? src->f.height : (dst->f.height - top + y0);
-
-    // This is setting the pointer outside of the array range if y0c < y0
-    oy = y0c - y0;
-
-    // ASS overlays are already subsampled to the video pixel format
-    // Adapt condition below to support other formats that are also subsampled.
-    const unsigned int ovVertShift = NULL == pv->ssa ? 0 : pv->hshift;
-    const unsigned int ovHorzShift = NULL == pv->ssa ? 0 : pv->wshift;
-
-    unsigned is_chroma_line, resU, resV, alpha;
-    unsigned accuA, accuB, accuC, coeff;
-    for (yy = y0c; oy < height; oy = ++yy - y0)
-    {
-        y_out = dst->plane[0].data + yy * dst->plane[0].stride;
-        u_out = dst->plane[1].data + (yy >> pv->hshift) * dst->plane[1].stride;
-        v_out = dst->plane[2].data + (yy >> pv->hshift) * dst->plane[2].stride;
-
-        y_in = src->plane[0].data + oy * src->plane[0].stride;
-        u_in = src->plane[1].data + (oy >> ovVertShift) * src->plane[1].stride;
-        v_in = src->plane[2].data + (oy >> ovVertShift) * src->plane[2].stride;
-        a_in = src->plane[3].data + oy * src->plane[3].stride;
-
-        ox = x0c - x0;
-        is_chroma_line = yy == (yy & ~((1 << pv->hshift) - 1));
-        for (xx = x0c; ox < width; ox = ++xx - x0)
-        {
-            if (ox >= 0 && oy >= 0)
-            {
-                y_out[xx] = (y_out[xx] * (255 - a_in[ox]) + y_in[ox] * a_in[ox] + 127) / 255;
-            }
-
-            if (is_chroma_line && xx == (xx & ~((1 << pv->wshift) - 1)))
-            {
-                // Perform chromaloc-aware subsampling and blending
-                accuA = accuB = accuC = 0;
-                for (int yz = 0, oyz = oy; yz < (1 << (pv->hshift - ovVertShift)) && oy + yz < height; yz++, oyz++)
-                {
-                    for (int xz = 0, oxz = ox; xz < (1 << (pv->wshift - ovHorzShift)) && ox + xz < width; xz++, oxz++)
-                    {
-                        // Weight of the current chroma sample
-                        coeff = pv->chroma_coeffs[0][xz] * pv->chroma_coeffs[1][yz];
-                        resU = u_out[xx >> pv->wshift];
-                        resV = v_out[xx >> pv->wshift];
-
-                        // Chroma sampled area overlap with bitmap
-                        if (oxz >= 0 && oyz >= 0 && ox + xz < width && oy + yz < height)
-                        {
-                            alpha = a_in[oxz + yz*src->plane[3].stride];
-                            resU *= (255 - alpha);
-                            resU = (resU + (u_in + (yz >> ovVertShift) * src->plane[1].stride)[oxz >> ovHorzShift] * alpha + 127) / 255;
-
-                            resV *= (255 - alpha);
-                            resV = (resV + (v_in + (yz >> ovVertShift) * src->plane[2].stride)[oxz >> ovHorzShift] * alpha + 127) / 255;
-                        }
-
-                        // Accumulate
-                        accuA += coeff * resU;
-                        accuB += coeff * resV;
-                        accuC += coeff;
-                    }
-                }
-                if (accuC)
-                {
-                    u_out[xx >> pv->wshift] = (accuA + (accuC>>1)) / accuC;
-                    v_out[xx >> pv->wshift] = (accuB + (accuC>>1)) / accuC;
-                }
-            }
-        }
-    }
-}
-
-static void blend8onbi8(const hb_filter_private_t *pv, hb_buffer_t *dst, const hb_buffer_t *src, const int shift)
-{
-    int x0, y0, x0c, y0c;
-    int xx, yy, ox, oy;
-    int width, height;
-    uint8_t *y_in, *y_out;
-    uint8_t *u_in, *u_out;
-    uint8_t *v_in, *v_out;
-    uint8_t *a_in;
-
-    const int left = x0 = src->f.x;
-    const int top = y0 = src->f.y;
-
-    // Coordinates of the first chroma sample affected by the overlay
-    x0c = x0 & ~((1 << pv->wshift) - 1);
-    y0c = y0 & ~((1 << pv->hshift) - 1);
-
-    width  = (src->f.width  - x0 <= dst->f.width - left) ? src->f.width  : (dst->f.width - left + x0);
-    height = (src->f.height - y0 <= dst->f.height - top) ? src->f.height : (dst->f.height - top + y0);
-
-    const unsigned int ovVertShift = NULL == pv->ssa ? 0 : pv->hshift;
-    const unsigned int ovHorzShift = NULL == pv->ssa ? 0 : pv->wshift;
-
-    // This is setting the pointer outside of the array range if y0c < y0
-    oy = y0c - y0;
-
-    unsigned is_chroma_line, resU, resV, alpha;
-    unsigned accuA, accuB, accuC, coeff;
-    for (yy = y0c; oy < height; oy = ++yy - y0)
-    {
-        y_out = dst->plane[0].data + yy * dst->plane[0].stride;
-        u_out = dst->plane[1].data + (yy >> pv->hshift) * dst->plane[1].stride;
-        v_out = u_out;
-
-        y_in = src->plane[0].data + oy * src->plane[0].stride;
-        u_in = src->plane[1].data + (oy >> ovVertShift) * src->plane[1].stride;
-        v_in = src->plane[2].data + (oy >> ovVertShift) * src->plane[2].stride;
-        a_in = src->plane[3].data + oy * src->plane[3].stride;
-
-        ox = x0c - x0;
-        is_chroma_line = yy == (yy & ~((1 << pv->hshift) - 1));
-        for (xx = x0c; ox < width; ox = ++xx - x0)
-        {
-            if (ox >= 0 && oy >= 0)
-            {
-                y_out[xx] = (y_out[xx] * (255 - a_in[ox]) + y_in[ox] * a_in[ox] + 127) / 255;
-            }
-
-            if (is_chroma_line && xx == (xx & ~((1 << pv->wshift) - 1)))
-            {
-                // Perform chromaloc-aware subsampling and blending
-                accuA = accuB = accuC = 0;
-                for (int yz = 0, oyz = oy; yz < (1 << (pv->hshift - ovVertShift)); yz++, oyz++)
-                {
-                    for (int xz = 0, oxz = ox; xz < (1 << (pv->wshift - ovHorzShift)); xz++, oxz++)
-                    {
-                        // Weight of the current chroma sample
-                        coeff = pv->chroma_coeffs[0][xz] * pv->chroma_coeffs[1][yz];
-                        resU = u_out[(xx >> pv->wshift) * 2 + 0];
-                        resV = v_out[(xx >> pv->wshift) * 2 + 1];
-
-                        // Chroma sampled area overlap with bitmap
-                        if (oxz >= 0 && oyz >= 0 && ox + xz < width && oy + yz < height)
-                        {
-                            alpha = a_in[oxz + yz*src->plane[3].stride];
-                            resU *= (255 - alpha);
-                            resU = (resU + (u_in + (yz >> ovVertShift) * src->plane[1].stride)[oxz >> ovHorzShift] * alpha + 127) / 255;
-
-                            resV *= (255 - alpha);
-                            resV = (resV + (v_in + (yz >> ovVertShift) * src->plane[2].stride)[oxz >> ovHorzShift] * alpha + 127) / 255;
-                        }
-
-                        // Accumulate
-                        accuA += coeff*resU;
-                        accuB += coeff*resV;
-                        accuC += coeff;
-                    }
-                }
-                if (accuC)
-                {
-                    u_out[(xx >> pv->wshift) * 2 + 0] = (accuA + (accuC>>1)) / accuC;
-                    v_out[(xx >> pv->wshift) * 2 + 1] = (accuB + (accuC>>1)) / accuC;
-                }
-            }
-        }
-    }
-}
-
-static hb_buffer_t * blend_subs(const hb_filter_private_t *pv, hb_buffer_t *in, hb_buffer_list_t *subs)
-{
-    // Assumes that the input destination buffer has
-    // the same dimensions as the original title dimensions
-    hb_buffer_t *out = in;
-
-    if (hb_buffer_list_count(subs) == 0)
-    {
-        return out;
-    }
-
-    if (hb_buffer_is_writable(in) == 0)
-    {
-        out = hb_buffer_dup(in);
-        hb_buffer_close(&in);
-    }
-
-    for (hb_buffer_t *sub = hb_buffer_list_head(subs); sub; sub = sub->next)
-    {
-        pv->blend(pv, out, sub, pv->depth - 8);
-    }
-
-    return out;
-}
-
 static hb_buffer_t * scale_subtitle(hb_filter_private_t *pv,
                                    hb_buffer_t *sub, hb_buffer_t *buf)
 {
@ -793,6 +412,7 @@ static void render_vobsubs(hb_filter_private_t *pv, hb_buffer_t *buf)

 static int vobsub_post_init(hb_filter_object_t *filter, hb_job_t *job)
 {
+    filter->private_data->pix_fmt_alpha = AV_PIX_FMT_YUVA444P;
    return 0;
 }

@ -841,7 +461,7 @@ static int vobsub_work(hb_filter_object_t *filter,
    render_vobsubs(pv, in);

    *buf_in = NULL;
-    *buf_out = blend_subs(pv, in, &pv->rendered_sub_list);
+    *buf_out = pv->blend->work(pv->blend, in, &pv->rendered_sub_list);

    hb_buffer_list_close(&pv->rendered_sub_list);

@ -1052,6 +672,41 @@ static int ssa_post_init(hb_filter_object_t *filter, hb_job_t *job)
 {
    hb_filter_private_t *pv = filter->private_data;

+    switch (pv->input.pix_fmt)
+    {
+        case AV_PIX_FMT_NV12:
+        case AV_PIX_FMT_P010:
+        case AV_PIX_FMT_P012:
+        case AV_PIX_FMT_P016:
+        case AV_PIX_FMT_YUV420P:
+        case AV_PIX_FMT_YUV420P10:
+        case AV_PIX_FMT_YUV420P12:
+        case AV_PIX_FMT_YUV420P16:
+            pv->pix_fmt_alpha = AV_PIX_FMT_YUVA420P;
+            break;
+        case AV_PIX_FMT_NV16:
+        case AV_PIX_FMT_P210:
+        case AV_PIX_FMT_P212:
+        case AV_PIX_FMT_P216:
+        case AV_PIX_FMT_YUV422P:
+        case AV_PIX_FMT_YUV422P10:
+        case AV_PIX_FMT_YUV422P12:
+        case AV_PIX_FMT_YUV422P16:
+            pv->pix_fmt_alpha = AV_PIX_FMT_YUVA422P;
+            break;
+        case AV_PIX_FMT_NV24:
+        case AV_PIX_FMT_P410:
+        case AV_PIX_FMT_P412:
+        case AV_PIX_FMT_P416:
+        case AV_PIX_FMT_YUV444P:
+        case AV_PIX_FMT_YUV444P10:
+        case AV_PIX_FMT_YUV444P12:
+        case AV_PIX_FMT_YUV444P16:
+        default:
+            pv->pix_fmt_alpha = AV_PIX_FMT_YUVA444P;
+            break;
+    }
+
    pv->ssa = ass_library_init();
    if (!pv->ssa)
    {
@ -1200,7 +855,7 @@ static int ssa_work(hb_filter_object_t *filter,
    render_ssa_subs(pv, in->s.start);

    *buf_in  = NULL;
-    *buf_out = blend_subs(pv, in, &pv->rendered_sub_list);
+    *buf_out = pv->blend->work(pv->blend, in, &pv->rendered_sub_list);

    return HB_FILTER_OK;
 }
@ -1340,7 +995,7 @@ static int textsub_work(hb_filter_object_t *filter,
    render_ssa_subs(pv, in->s.start);

    *buf_in  = NULL;
-    *buf_out = blend_subs(pv, in, &pv->rendered_sub_list);
+    *buf_out = pv->blend->work(pv->blend, in, &pv->rendered_sub_list);

    return HB_FILTER_OK;
 }
@ -1402,6 +1057,7 @@ static void render_pgs_subs(hb_filter_private_t *pv, hb_buffer_t *buf)

 static int pgssub_post_init(hb_filter_object_t *filter, hb_job_t *job)
 {
+    filter->private_data->pix_fmt_alpha = AV_PIX_FMT_YUVA444P;
    return 0;
 }

@ -1450,13 +1106,53 @@ static int pgssub_work(hb_filter_object_t *filter,
    render_pgs_subs(pv, in);

    *buf_in = NULL;
-    *buf_out = blend_subs(pv, in, &pv->rendered_sub_list);
+    *buf_out = pv->blend->work(pv->blend, in, &pv->rendered_sub_list);

    hb_buffer_list_close(&pv->rendered_sub_list);

    return HB_FILTER_OK;
 }

+// Return a heap-allocated, initialized copy of the blend object chosen for hw_pixfmt, or NULL on failure.
+static hb_blend_object_t * hb_blend_init(int hw_pixfmt, int in_pix_fmt, int in_chroma_location, int sub_pix_fmt)
+{
+    const hb_blend_object_t *impl;
+    switch (hw_pixfmt)
+    {
+        default: // every hw_pixfmt currently maps to hb_blend
+            impl = &hb_blend;
+            break;
+    }
+
+    hb_blend_object_t *instance = malloc(sizeof(*instance));
+    if (instance == NULL)
+    {
+        hb_error("render_sub: blend malloc failed");
+        return NULL;
+    }
+
+    *instance = *impl; // start from the template's name and callbacks
+    if (instance->init(instance, in_pix_fmt, in_chroma_location, sub_pix_fmt) == 0)
+    {
+        return instance;
+    }
+    free(instance);
+    hb_error("render_sub: blend init failed");
+    return NULL;
+}
+
+// Close and free a blend object obtained from hb_blend_init, then clear the caller's pointer. File-local, like hb_blend_init.
+static void hb_blend_close(hb_blend_object_t **_b)
+{
+    hb_blend_object_t *b = *_b;
+    if (b != NULL)
+    {
+        b->close(b); // let the implementation release its private state first
+        free(b);
+        *_b = NULL;  // prevent a dangling pointer / double close in the owner
+    }
+}
+
 static int hb_rendersub_init(hb_filter_object_t *filter,
                             hb_filter_init_t *init)
 {
@ -1472,105 +1168,12 @@ static int hb_rendersub_init(hb_filter_object_t *filter,
    pv->input = *init;

    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(init->pix_fmt);
-    pv->depth      = desc->comp[0].depth;
-    pv->wshift     = desc->log2_chroma_w;
-    pv->hshift     = desc->log2_chroma_h;
+    pv->wshift = desc->log2_chroma_w;
+    pv->hshift = desc->log2_chroma_h;

-    //Compute chroma smoothing coefficients wrt video chroma location
-    int wX, wY;
-    wX = 4 - (1 << desc->log2_chroma_w);
-    wY = 4 - (1 << desc->log2_chroma_h);
-
-    switch (init->chroma_location)
-    {
-        case AVCHROMA_LOC_TOPLEFT:
-            wX += (1 << desc->log2_chroma_w) - 1;
-        case AVCHROMA_LOC_TOP:
-            wY += (1 << desc->log2_chroma_h) - 1;
-            break;
-        case AVCHROMA_LOC_LEFT:
-            wX += (1 << desc->log2_chroma_w) - 1;
-            break;
-        case AVCHROMA_LOC_BOTTOMLEFT:
-            wX += (1 << desc->log2_chroma_w) - 1;
-        case AVCHROMA_LOC_BOTTOM:
-            wY += (1 << desc->log2_chroma_h) - 1;
-        case AVCHROMA_LOC_CENTER:
-        default: // Center chroma value for unknown/unsupported
-            break;
-    }
-
-    const unsigned base_coefficients[] = {1, 3, 9, 27, 9, 3, 1};
-    // If wZ is even, an intermediate value is interpolated for symmetry.
-    for (int x = 0; x < 4; x++)
-    {
-        pv->chroma_coeffs[0][x] = (base_coefficients[x + wX] +
-                                   base_coefficients[x + wX + !(wX & 0x1)]) >> 1;
-        pv->chroma_coeffs[1][x] = (base_coefficients[x + wY] +
-                                   base_coefficients[x + wY + !(wY & 0x1)]) >> 1;
-    }
-
-    switch (init->pix_fmt)
-    {
-        case AV_PIX_FMT_NV12:
-        case AV_PIX_FMT_P010:
-        case AV_PIX_FMT_P012:
-        case AV_PIX_FMT_P016:
-        case AV_PIX_FMT_YUV420P:
-        case AV_PIX_FMT_YUV420P10:
-        case AV_PIX_FMT_YUV420P12:
-        case AV_PIX_FMT_YUV420P16:
-            pv->pix_fmt_alpha = AV_PIX_FMT_YUVA420P;
-            break;
-        case AV_PIX_FMT_NV16:
-        case AV_PIX_FMT_P210:
-        case AV_PIX_FMT_P212:
-        case AV_PIX_FMT_P216:
-        case AV_PIX_FMT_YUV422P:
-        case AV_PIX_FMT_YUV422P10:
-        case AV_PIX_FMT_YUV422P12:
-        case AV_PIX_FMT_YUV422P16:
-            pv->pix_fmt_alpha = AV_PIX_FMT_YUVA422P;
-            break;
-        case AV_PIX_FMT_NV24:
-        case AV_PIX_FMT_P410:
-        case AV_PIX_FMT_P412:
-        case AV_PIX_FMT_P416:
-        case AV_PIX_FMT_YUV444P:
-        case AV_PIX_FMT_YUV444P10:
-        case AV_PIX_FMT_YUV444P12:
-        case AV_PIX_FMT_YUV444P16:
-        default:
-            pv->pix_fmt_alpha = AV_PIX_FMT_YUVA444P;
-            break;
-    }
-
-    const int planes_count = av_pix_fmt_count_planes(init->pix_fmt);
-
-    switch (pv->depth)
-    {
-        case 8:
-            switch (planes_count)
-            {
-                case 2:
-                    pv->blend = blend8onbi8;
-                    break;
-                default:
-                    pv->blend = blend8on8;
-                    break;
-            }
-            break;
-        default:
-            switch (planes_count)
-            {
-                case 2:
-                    pv->blend = blend8onbi1x;
-                    break;
-                default:
-                    pv->blend = blend8on1x;
-                    break;
-            }
-    }
+    hb_compute_chroma_smoothing_coefficient(pv->chroma_coeffs,
+                                            init->pix_fmt,
+                                            init->chroma_location);

    // Find the subtitle we need
    for (int ii = 0; ii < hb_list_count(init->job->list_subtitle); ii++)
@ -1596,6 +1199,7 @@ static int hb_rendersub_init(hb_filter_object_t *filter,

 static int hb_rendersub_post_init(hb_filter_object_t *filter, hb_job_t *job)
 {
+    int ret = 0;
    hb_filter_private_t *pv = filter->private_data;

    pv->crop[0] = job->crop[0];
@ -1607,12 +1211,14 @@ static int hb_rendersub_post_init(hb_filter_object_t *filter, hb_job_t *job)
    {
        case VOBSUB:
        {
-            return vobsub_post_init(filter, job);
+            ret = vobsub_post_init(filter, job);
+            break;
        }

        case SSASUB:
        {
-            return ssa_post_init(filter, job);
+            ret =  ssa_post_init(filter, job);
+            break;
        }

        case IMPORTSRT:
@ -1620,18 +1226,21 @@ static int hb_rendersub_post_init(hb_filter_object_t *filter, hb_job_t *job)
        case UTF8SUB:
        case TX3GSUB:
        {
-            return textsub_post_init(filter, job);
+            ret = textsub_post_init(filter, job);
+            break;
        }

        case CC608SUB:
        {
-            return cc608sub_post_init(filter, job);
+            ret = cc608sub_post_init(filter, job);
+            break;
        }

        case DVBSUB:
        case PGSSUB:
        {
-            return pgssub_post_init(filter, job);
+            ret = pgssub_post_init(filter, job);
+            break;
        }

        default:
@ -1640,6 +1249,23 @@ static int hb_rendersub_post_init(hb_filter_object_t *filter, hb_job_t *job)
            return 1;
        }
    }
+
+    if (ret > 0)
+    {
+        return 1;
+    }
+
+    pv->blend = hb_blend_init(pv->input.hw_pix_fmt,
+                              pv->input.pix_fmt, pv->input.chroma_location,
+                              pv->pix_fmt_alpha);
+
+    if (pv->blend == NULL)
+    {
+        hb_log("rendersub: blend initialization failed");
+        return 1;
+    }
+
+    return 0;
 }

 static int hb_rendersub_work(hb_filter_object_t *filter,
@ -1686,12 +1312,18 @@ static void hb_rendersub_close(hb_filter_object_t *filter)
 {
    hb_filter_private_t *pv = filter->private_data;

+    if (pv == NULL)
+    {
+        return;
+    }
+
    if (pv->sws != NULL)
    {
        sws_freeContext(pv->sws);
    }

    hb_buffer_list_close(&pv->rendered_sub_list);
+    hb_blend_close(&pv->blend);

    switch (pv->type)
    {