libhb: add a Metal-accelerated comb detect filter.

Damiano Galassi authored on 2023-11-23 18:05:09 +01:00, committed by Damiano Galassi
parent ad20fe40c7
commit 4e40332d2e
7 changed files with 1089 additions and 0 deletions

View File

@@ -4693,6 +4693,10 @@ hb_filter_object_t * hb_filter_get( int filter_id )
filter = &hb_filter_prefilter_vt;
break;
case HB_FILTER_COMB_DETECT_VT:
filter = &hb_filter_comb_detect_vt;
break;
case HB_FILTER_YADIF_VT:
filter = &hb_filter_yadif_vt;
break;

View File

@@ -1489,6 +1489,7 @@ enum
// First, filters that may change the framerate (drop or dup frames)
HB_FILTER_DETELECINE,
HB_FILTER_COMB_DETECT,
HB_FILTER_COMB_DETECT_VT,
HB_FILTER_DECOMB,
HB_FILTER_YADIF,
HB_FILTER_YADIF_VT,

View File

@@ -496,6 +496,7 @@ extern hb_filter_object_t hb_filter_format;
#if defined(__APPLE__)
extern hb_filter_object_t hb_filter_prefilter_vt;
extern hb_filter_object_t hb_filter_comb_detect_vt;
extern hb_filter_object_t hb_filter_yadif_vt;
extern hb_filter_object_t hb_filter_bwdif_vt;
extern hb_filter_object_t hb_filter_crop_scale_vt;

View File

@@ -0,0 +1,524 @@
/* comb_detect.c
Copyright (c) 2003-2023 HandBrake Team
This file is part of the HandBrake source code
Homepage: <http://handbrake.fr/>.
It may be used under the terms of the GNU General Public License v2.
For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
*/
#include "handbrake/handbrake.h"
#include "cv_utils.h"
#include "metal_utils.h"
#include "vt_common.h"
extern char hb_comb_detect_vt_metallib_data[];
extern unsigned int hb_comb_detect_vt_metallib_len;
struct mtl_comb_detect_params
{
int spatial_metric;
float motion_threshold;
float spatial_threshold;
int block_threshold;
int block_width;
int block_height;
float gamma_motion_threshold;
float gamma_spatial_threshold;
float gamma_spatial_threshold6;
float spatial_threshold_squared;
float spatial_threshold6;
float comb32detect_min;
float comb32detect_max;
bool force_exaustive_check;
};
#define MODE_GAMMA 1 // Scale gamma when decombing
#define MODE_FILTER 2 // Filter combing mask
#define MODE_MASK 4 // Output combing masks instead of pictures
#define MODE_COMPOSITE 8 // Overlay combing mask onto picture
#define FILTER_CLASSIC 1
#define FILTER_ERODE_DILATE 2
struct hb_filter_private_s
{
hb_metal_context_t *mtl;
const AVPixFmtDescriptor *desc;
// comb detect parameters
int mode;
int filter_mode;
int spatial_metric;
float motion_threshold;
float spatial_threshold;
int block_threshold;
int block_width;
int block_height;
// Computed parameters
float gamma_motion_threshold;
float gamma_spatial_threshold;
float gamma_spatial_threshold6;
float spatial_threshold_squared;
float spatial_threshold6;
float comb32detect_min;
float comb32detect_max;
bool force_exaustive_check;
// Mask textures
id<MTLTexture> mask;
id<MTLTexture> temp;
// Comb result
id<MTLBuffer> combed;
hb_buffer_t *ref[3];
hb_buffer_list_t out_list;
// Filter statistics
int comb_heavy;
int comb_light;
int comb_none;
int frames;
};
static int comb_detect_vt_init(hb_filter_object_t *filter,
hb_filter_init_t *init);
static int comb_detect_vt_work(hb_filter_object_t *filter,
hb_buffer_t **buf_in,
hb_buffer_t **buf_out);
static void comb_detect_vt_close(hb_filter_object_t *filter);
static const char comb_detect_vt_template[] =
"mode=^"HB_INT_REG"$:spatial-metric=^([012])$:"
"motion-thresh=^"HB_INT_REG"$:spatial-thresh=^"HB_INT_REG"$:"
"filter-mode=^([012])$:block-thresh=^"HB_INT_REG"$:"
"block-width=^"HB_INT_REG"$:block-height=^"HB_INT_REG"$:"
"disable=^"HB_BOOL_REG"$";
hb_filter_object_t hb_filter_comb_detect_vt =
{
.id = HB_FILTER_COMB_DETECT_VT,
.enforce_order = 1,
.name = "Comb Detect (VideoToolbox)",
.settings = NULL,
.init = comb_detect_vt_init,
.work = comb_detect_vt_work,
.close = comb_detect_vt_close,
.settings_template = comb_detect_vt_template,
};
#define PREV 0
#define CURR 1
#define NEXT 2
static void store_buf(hb_filter_private_t *pv, hb_buffer_t *in)
{
if (pv->ref[PREV])
{
hb_buffer_close(&pv->ref[PREV]);
}
pv->ref[PREV] = pv->ref[CURR];
pv->ref[CURR] = pv->ref[NEXT];
pv->ref[NEXT] = in;
}
static int comb_detect_vt_init(hb_filter_object_t *filter,
hb_filter_init_t *init)
{
filter->private_data = calloc(1, sizeof(struct hb_filter_private_s));
if (filter->private_data == NULL)
{
hb_error("comb_detect_vt: calloc failed");
return -1;
}
hb_filter_private_t *pv = filter->private_data;
hb_buffer_list_clear(&pv->out_list);
pv->desc = av_pix_fmt_desc_get(init->pix_fmt);
pv->frames = 0;
pv->force_exaustive_check = 1;
pv->comb_heavy = 0;
pv->comb_light = 0;
pv->comb_none = 0;
pv->mode = MODE_GAMMA | MODE_FILTER;
pv->filter_mode = FILTER_ERODE_DILATE;
pv->spatial_metric = 2;
pv->motion_threshold = 3;
pv->spatial_threshold = 3;
pv->block_threshold = 40;
pv->block_width = 16;
pv->block_height = 16;
if (filter->settings)
{
// Start from the defaults so any key missing from the dict keeps its value
int motion_threshold = pv->motion_threshold;
int spatial_threshold = pv->spatial_threshold;
int block_threshold = pv->block_threshold;
hb_value_t *dict = filter->settings;
// Get comb detection settings
hb_dict_extract_int(&pv->mode, dict, "mode");
hb_dict_extract_int(&pv->spatial_metric, dict, "spatial-metric");
hb_dict_extract_int(&motion_threshold, dict, "motion-thresh");
hb_dict_extract_int(&spatial_threshold, dict, "spatial-thresh");
hb_dict_extract_int(&pv->filter_mode, dict, "filter-mode");
hb_dict_extract_int(&block_threshold, dict, "block-thresh");
hb_dict_extract_int(&pv->block_width, dict, "block-width");
hb_dict_extract_int(&pv->block_height, dict, "block-height");
pv->motion_threshold = motion_threshold;
pv->spatial_threshold = spatial_threshold;
pv->block_threshold = block_threshold;
}
pv->motion_threshold /= 255.f;
pv->spatial_threshold /= 255.f;
// Compute thresholds
pv->gamma_motion_threshold = pv->motion_threshold;
pv->gamma_spatial_threshold = pv->spatial_threshold;
pv->gamma_spatial_threshold6 = 6 * pv->gamma_spatial_threshold;
pv->spatial_threshold_squared = pv->spatial_threshold * pv->spatial_threshold;
pv->spatial_threshold6 = 6 * pv->spatial_threshold;
pv->comb32detect_min = 10 / 255.f;
pv->comb32detect_max = 15 / 255.f;
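// With the default spatial-thresh of 3: spatial_threshold = 3/255 ≈ 0.0118,
// spatial_threshold6 ≈ 0.0706, spatial_threshold_squared ≈ 1.4e-4;
// comb32detect_min/max are fixed at 10/255 ≈ 0.039 and 15/255 ≈ 0.059.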
if (pv->block_width > 32) { pv->block_width = 32; }
if (pv->block_height > 32) { pv->block_height = 32; }
if (pv->block_width < 8) { pv->block_width = 8; }
if (pv->block_height < 8) { pv->block_height = 8; }
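// hb_metal_context_init() builds the first compute pipeline (index 0) from the
// main detection kernel; the passes added below follow in the order that
// call_kernel() dispatches them.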
pv->mtl = hb_metal_context_init(hb_comb_detect_vt_metallib_data,
hb_comb_detect_vt_metallib_len,
pv->mode & MODE_GAMMA ? "comb_detect_gamma" : "comb_detect",
sizeof(struct mtl_comb_detect_params),
init->geometry.width, init->geometry.height,
init->pix_fmt, init->color_range);
if (pv->mtl == NULL)
{
hb_error("comb_detect_vt: failed to create Metal device");
return -1;
}
struct mtl_comb_detect_params *params = (struct mtl_comb_detect_params *)pv->mtl->params_buffer.contents;
*params = (struct mtl_comb_detect_params) {
.spatial_metric = pv->spatial_metric,
.motion_threshold = pv->motion_threshold,
.spatial_threshold = pv->spatial_threshold,
.block_threshold = pv->block_threshold,
.block_width = pv->block_width,
.block_height = pv->block_height,
.gamma_motion_threshold = pv->gamma_motion_threshold,
.gamma_spatial_threshold = pv->gamma_spatial_threshold,
.gamma_spatial_threshold6 = pv->gamma_spatial_threshold6,
.spatial_threshold_squared = pv->spatial_threshold_squared,
.spatial_threshold6 = pv->spatial_threshold6,
.comb32detect_min = pv->comb32detect_min,
.comb32detect_max = pv->comb32detect_max,
.force_exaustive_check = pv->force_exaustive_check
};
if (hb_metal_add_pipeline(pv->mtl, pv->filter_mode == FILTER_ERODE_DILATE ? "filter_erode_dilate" : "filter_classic",
pv->mtl->pipelines_count))
{
return -1;
}
if (hb_metal_add_pipeline(pv->mtl, "erode_mask", pv->mtl->pipelines_count))
{
return -1;
}
if (hb_metal_add_pipeline(pv->mtl, "dilate_mask", pv->mtl->pipelines_count))
{
return -1;
}
char *check_combing_name = pv->mode & MODE_FILTER ? "check_filtered_combing_mask" : "check_combing_mask";
if (@available(macOS 13, *))
{
if ([pv->mtl->device supportsFamily:MTLGPUFamilyMetal3] &&
((pv->block_width == 16 && pv->block_height == 16) || (pv->block_width == 32 && pv->block_height == 32)))
{
// Use simd_sum() to speed up the final reduction pass
check_combing_name = pv->mode & MODE_FILTER ? "check_filtered_combing_mask_simd" : "check_combing_mask_simd";
}
else if ([pv->mtl->device supportsFamily:MTLGPUFamilyCommon3] &&
(pv->block_width * pv->block_height) % 4 == 0)
{
// Use quad_sum() to speed up the final reduction pass
check_combing_name = pv->mode & MODE_FILTER ? "check_filtered_combing_mask_quad" : "check_combing_mask_quad";
}
}
if (hb_metal_add_pipeline(pv->mtl, check_combing_name, pv->mtl->pipelines_count))
{
return -1;
}
if (hb_metal_add_pipeline(pv->mtl, "apply_mask", pv->mtl->pipelines_count))
{
return -1;
}
// Allocate buffers to store the mask and the comb result
MTLTextureDescriptor *descriptor = [[MTLTextureDescriptor alloc] init];
descriptor.textureType = MTLTextureType2D;
descriptor.pixelFormat = MTLPixelFormatR8Uint;
descriptor.width = init->geometry.width;
descriptor.height = init->geometry.height;
descriptor.depth = 1;
descriptor.storageMode = MTLStorageModePrivate;
pv->mask = [pv->mtl->device newTextureWithDescriptor:descriptor];
pv->temp = [pv->mtl->device newTextureWithDescriptor:descriptor];
[descriptor release];
pv->combed = [pv->mtl->device newBufferWithLength:sizeof(uint32_t) options:MTLResourceStorageModeShared];
return 0;
}
static void comb_detect_vt_close(hb_filter_object_t *filter)
{
hb_filter_private_t *pv = filter->private_data;
if (pv == NULL)
{
return;
}
hb_log("comb detect: heavy %i | light %i | uncombed %i | total %i",
pv->comb_heavy, pv->comb_light, pv->comb_none, pv->frames);
[pv->combed release];
[pv->temp release];
[pv->mask release];
hb_metal_context_close(&pv->mtl);
for (int i = 0; i < 3; i++)
{
hb_buffer_close(&pv->ref[i]);
}
free(pv);
filter->private_data = NULL;
}
static void call_kernel(hb_filter_private_t *pv,
id<MTLTexture> prev,
id<MTLTexture> cur,
id<MTLTexture> next,
id<MTLTexture> dest)
{
id<MTLCommandBuffer> buffer = pv->mtl->queue.commandBuffer;
id<MTLComputeCommandEncoder> encoder = buffer.computeCommandEncoder;
int width = cur.width, height = cur.height;
struct mtl_comb_detect_params *params = (struct mtl_comb_detect_params *)pv->mtl->params_buffer.contents;
params->force_exaustive_check = pv->force_exaustive_check;
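// Pipeline order, as built in comb_detect_vt_init():
// 0: comb_detect / comb_detect_gamma   1: filter_classic / filter_erode_dilate
// 2: erode_mask   3: dilate_mask   4: check_*_combing_mask*   5: apply_mask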
[encoder setTexture:prev atIndex:0];
[encoder setTexture:cur atIndex:1];
[encoder setTexture:next atIndex:2];
[encoder setTexture:pv->mask atIndex:3];
[encoder setTexture:pv->temp atIndex:4];
if (pv->mode & MODE_MASK || pv->mode & MODE_COMPOSITE)
{
[encoder setTexture:dest atIndex:5];
}
[encoder setBuffer:pv->combed offset:0 atIndex:0];
[encoder setBuffer:pv->mtl->params_buffer offset:0 atIndex:1];
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[0], encoder, width, height);
if (pv->mode & MODE_FILTER)
{
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[1], encoder, width, height);
if (pv->filter_mode == FILTER_ERODE_DILATE)
{
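// Erode, dilate, then erode again: drop isolated mask pixels while
// keeping solid combed regions intact.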
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[2], encoder, width, height);
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[3], encoder, width, height);
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[2], encoder, width, height);
}
}
if (pv->mode & MODE_FILTER && pv->filter_mode == FILTER_CLASSIC)
{
[encoder setTexture:pv->temp atIndex:3];
}
hb_metal_compute_encoder_dispatch_fixed_threadgroup_size(pv->mtl->device, pv->mtl->pipelines[4], encoder,
width, height, pv->block_width, pv->block_height);
if (pv->mode & MODE_MASK || pv->mode & MODE_COMPOSITE)
{
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[5], encoder, width, height);
}
[encoder endEncoding];
[buffer commit];
[buffer waitUntilCompleted];
}
static int analyze_frame(hb_filter_private_t *pv, hb_buffer_t **out)
{
CVReturn ret = kCVReturnSuccess;
CVPixelBufferRef cv_dest = NULL;
CVPixelBufferRef cv_prev = pv->ref[PREV] ? hb_cv_get_pixel_buffer(pv->ref[PREV]) : hb_cv_get_pixel_buffer(pv->ref[CURR]);
CVPixelBufferRef cv_cur = hb_cv_get_pixel_buffer(pv->ref[CURR]);
CVPixelBufferRef cv_next = pv->ref[NEXT] ? hb_cv_get_pixel_buffer(pv->ref[NEXT]) : hb_cv_get_pixel_buffer(pv->ref[CURR]);
if (cv_prev == NULL || cv_cur == NULL || cv_next == NULL)
{
hb_log("comb_detect_vt: extract_buf failed");
goto fail;
}
const AVComponentDescriptor *comp = &pv->desc->comp[0];
int channels;
const MTLPixelFormat format = hb_metal_pix_fmt_from_component(comp, &channels);
if (format == MTLPixelFormatInvalid)
{
goto fail;
}
CVMetalTextureRef dest = NULL;
id<MTLTexture> tex_dest = nil;
if (pv->mode & MODE_MASK || pv->mode & MODE_COMPOSITE)
{
ret = CVPixelBufferPoolCreatePixelBuffer(kCFAllocatorDefault, pv->mtl->pool, &cv_dest);
if (ret != kCVReturnSuccess)
{
hb_log("comb_detect_vt: CVPixelBufferPoolCreatePixelBuffer failed");
goto fail;
}
dest = hb_metal_create_texture_from_pixbuf(pv->mtl->cache, cv_dest, 0, format);
tex_dest = CVMetalTextureGetTexture(dest);
}
CVMetalTextureRef prev = hb_metal_create_texture_from_pixbuf(pv->mtl->cache, cv_prev, 0, format);
CVMetalTextureRef cur = hb_metal_create_texture_from_pixbuf(pv->mtl->cache, cv_cur, 0, format);
CVMetalTextureRef next = hb_metal_create_texture_from_pixbuf(pv->mtl->cache, cv_next, 0, format);
id<MTLTexture> tex_prev = CVMetalTextureGetTexture(prev);
id<MTLTexture> tex_cur = CVMetalTextureGetTexture(cur);
id<MTLTexture> tex_next = CVMetalTextureGetTexture(next);
uint32_t *combed = pv->combed.contents;
*combed = HB_COMB_NONE;
call_kernel(pv, tex_prev, tex_cur, tex_next, tex_dest);
CFRelease(prev);
CFRelease(cur);
CFRelease(next);
if (pv->mode & MODE_MASK || pv->mode & MODE_COMPOSITE)
{
CFRelease(dest);
CVBufferPropagateAttachments(cv_cur, cv_dest);
*out = hb_buffer_wrapper_init();
(*out)->storage_type = COREMEDIA;
(*out)->storage = cv_dest;
(*out)->f.width = pv->ref[CURR]->f.width;
(*out)->f.height = pv->ref[CURR]->f.height;
(*out)->f.fmt = pv->ref[CURR]->f.fmt;
(*out)->f.color_prim = pv->ref[CURR]->f.color_prim;
(*out)->f.color_transfer = pv->ref[CURR]->f.color_transfer;
(*out)->f.color_matrix = pv->ref[CURR]->f.color_matrix;
(*out)->f.color_range = pv->ref[CURR]->f.color_range;
(*out)->f.chroma_location = pv->ref[CURR]->f.chroma_location;
hb_buffer_copy_props(*out, pv->ref[CURR]);
}
return *combed;
fail:
return -1;
}
static void process_frame(hb_filter_private_t *pv)
{
int combed = 0;
hb_buffer_t *out = NULL;
@autoreleasepool
{
combed = analyze_frame(pv, &out);
}
switch (combed)
{
case HB_COMB_HEAVY:
pv->comb_heavy++;
break;
case HB_COMB_LIGHT:
pv->comb_light++;
break;
case HB_COMB_NONE:
default:
pv->comb_none++;
break;
}
pv->frames++;
pv->ref[CURR]->s.combed = combed;
if (out)
{
hb_buffer_list_append(&pv->out_list, out);
}
else
{
hb_buffer_list_append(&pv->out_list, hb_vt_buffer_dup(pv->ref[CURR]));
}
pv->force_exaustive_check = 0;
}
static int comb_detect_vt_work(hb_filter_object_t *filter,
hb_buffer_t **buf_in,
hb_buffer_t **buf_out)
{
hb_filter_private_t *pv = filter->private_data;
hb_buffer_t *in = *buf_in;
*buf_in = NULL;
if (in->s.flags & HB_BUF_FLAG_EOF)
{
store_buf(pv, NULL);
pv->force_exaustive_check = 1;
process_frame(pv);
hb_buffer_list_append(&pv->out_list, in);
*buf_out = hb_buffer_list_clear(&pv->out_list);
return HB_FILTER_DONE;
}
store_buf(pv, in);
if (pv->ref[CURR] == NULL)
{
// Wait for next buffer
return HB_FILTER_DELAY;
}
process_frame(pv);
*buf_out = hb_buffer_list_clear(&pv->out_list);
return *buf_out == NULL ? HB_FILTER_FAILED : HB_FILTER_OK;
}
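
The only output of this filter that later stages care about is the per-frame verdict process_frame() stores in s.combed (plus the optional mask/composite output). A minimal sketch of how a downstream deinterlacer might consult that verdict; the helper name and the decision here are illustrative, not taken from yadif_vt or bwdif_vt:

// Illustrative only: decide whether a frame needs deinterlacing based on
// the flag written by comb_detect_vt's process_frame().
static int frame_needs_deinterlace(const hb_buffer_t *in)
{
    switch (in->s.combed)
    {
        case HB_COMB_HEAVY:
        case HB_COMB_LIGHT:
            return 1;   // visible combing detected: deinterlace this frame
        case HB_COMB_NONE:
        default:
            return 0;   // no combing detected: pass the frame through
    }
}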

View File

@@ -0,0 +1,555 @@
/* comb_detect.metal
Copyright (c) 2003-2023 HandBrake Team
This file is part of the HandBrake source code
Homepage: <http://handbrake.fr/>.
It may be used under the terms of the GNU General Public License v2.
For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
*/
#include <metal_stdlib>
#include <metal_integer>
#include <metal_texture>
using namespace metal;
#define HB_COMB_NONE 0
#define HB_COMB_LIGHT 1
#define HB_COMB_HEAVY 2
/*
* Parameters
*/
struct params {
int spatial_metric;
float motion_threshold;
float spatial_threshold;
int block_threshold;
int block_width;
int block_height;
float gamma_motion_threshold;
float gamma_spatial_threshold;
float gamma_spatial_threshold6;
float spatial_threshold_squared;
float spatial_threshold6;
float comb32detect_min;
float comb32detect_max;
bool force_exaustive_check;
};
/*
* Texture access helpers
*/
constexpr sampler s(coord::pixel);
template <typename T>
T tex2D(texture2d<float, access::sample> tex, short2 pos)
{
return tex.sample(s, float2(pos)).x;
}
template <typename T>
T tex2D(texture2d<float, access::sample> tex, ushort2 pos)
{
return tex.sample(s, float2(pos)).x;
}
template <typename T>
T tex2D(texture2d<ushort, access::sample> tex, short2 pos)
{
return tex.sample(s, float2(pos)).x;
}
template <typename T>
T tex2D(texture2d<ushort, access::sample> tex, ushort2 pos)
{
return tex.sample(s, float2(pos)).x;
}
template <typename T>
T tex2D(texture2d<float, access::read> tex, uint x, uint y)
{
return tex.read(uint2(x, y)).x;
}
template <typename T>
T tex2D(texture2d<ushort, access::read> tex, ushort2 pos)
{
return tex.read(pos).x;
}
constexpr sampler szero(coord::pixel, address::clamp_to_zero);
template <typename T>
T tex2Dc(texture2d<ushort, access::sample> tex, short2 pos)
{
return tex.sample(szero, float2(pos)).x;
}
template <typename T>
T tex2Dc(texture2d<ushort, access::sample> tex, ushort2 pos)
{
return tex.sample(szero, float2(pos)).x;
}
/*
* Comb detect helpers
*/
void write_result(
device atomic_int *combed,
uchar block_threshold,
ushort block_score)
{
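// Upgrade the global verdict: any block scoring above block_threshold marks the
// frame HB_COMB_HEAVY; a block scoring at least half the threshold upgrades
// HB_COMB_NONE to HB_COMB_LIGHT, but an existing HB_COMB_HEAVY is never downgraded.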
int current = atomic_load_explicit(combed, memory_order_relaxed);
if (current == HB_COMB_HEAVY) {
return;
}
if (block_score >= (block_threshold / 2)) {
if (block_score > block_threshold) {
atomic_store_explicit(combed, HB_COMB_HEAVY, memory_order_relaxed);
} else {
atomic_compare_exchange_weak_explicit(combed, &current, HB_COMB_LIGHT, memory_order_relaxed, memory_order_relaxed);
}
}
}
template <typename T>
T gamma(T value) {
return pow(value, 2.2f);
}
template <typename T>
void detect_gamma_combed_segment(
texture2d<float, access::sample> prev,
texture2d<float, access::sample> cur,
texture2d<float, access::sample> next,
texture2d<ushort, access::write> mask,
constant params& p,
ushort2 pos)
{
// A mishmash of various comb detection tricks
// picked up from neuron2's Decomb plugin for
// AviSynth and tritical's IsCombedT and
// IsCombedTIVTC plugins.
// Comb scoring algorithm
const float mthresh = p.gamma_motion_threshold;
const float athresh = p.gamma_spatial_threshold;
const float athresh6 = p.gamma_spatial_threshold6;
// These are just to make the buffer locations easier to read.
const short2 up_2 = short2(pos.x, pos.y -2);
const short2 up_1 = short2(pos.x, pos.y -1);
const short2 down_1 = short2(pos.x, pos.y +1);
const short2 down_2 = short2(pos.x, pos.y +2);
const T up_diff = gamma(tex2D<T>(cur, pos)) - gamma(tex2D<T>(cur, up_1));
const T down_diff = gamma(tex2D<T>(cur, pos)) - gamma(tex2D<T>(cur, down_1));
mask.write(0, pos);
if ((up_diff > athresh && down_diff > athresh) ||
(up_diff < -athresh && down_diff < -athresh)) {
// The pixel above and below are different,
// and they change in the same "direction" too.
bool motion = false;
if (mthresh > 0) {
// Make sure there's sufficient motion between frame t-1 and frame t+1.
if (abs(gamma(tex2D<T>(prev, pos)) - gamma(tex2D<T>(cur, pos))) > mthresh &&
abs(gamma(tex2D<T>(cur, up_1)) - gamma(tex2D<T>(next, up_1))) > mthresh &&
abs(gamma(tex2D<T>(cur, down_1)) - gamma(tex2D<T>(next, down_1))) > mthresh) {
motion = true;
}
if (abs(gamma(tex2D<T>(next, pos)) - gamma(tex2D<T>(cur, pos))) > mthresh &&
abs(gamma(tex2D<T>(prev, up_1)) - gamma(tex2D<T>(cur, up_1))) > mthresh &&
abs(gamma(tex2D<T>(prev, down_1)) - gamma(tex2D<T>(cur, down_1))) > mthresh) {
motion = true;
}
} else {
// User doesn't want to check for motion,
// so move on to the spatial check
motion = true;
}
// If motion, or we can't measure motion yet…
if (motion || p.force_exaustive_check) {
// Tritical's noise-resistant combing scorer
// The check is done on a bob+blur convolution
const T combing = abs(gamma(tex2D<T>(cur, up_2))
+ (4 * gamma(tex2D<T>(cur, pos)))
+ gamma(tex2D<T>(cur, down_2))
- (3 * (gamma(tex2D<T>(cur, up_1))
+ gamma(tex2D<T>(cur, down_1)))));
// If the frame is sufficiently combed,
// then mark it down on the mask as 1.
if (combing > athresh6) {
mask.write(1, pos);
}
}
}
}
template <typename T>
void detect_combed_segment(
texture2d<float, access::sample> prev,
texture2d<float, access::sample> cur,
texture2d<float, access::sample> next,
texture2d<ushort, access::write> mask,
constant params& p,
ushort2 pos)
{
// A mishmash of various comb detection tricks
// picked up from neuron2's Decomb plugin for
// AviSynth and tritical's IsCombedT and
// IsCombedTIVTC plugins.
// Comb scoring algorithm
const float mthresh = p.motion_threshold;
const float athresh = p.spatial_threshold;
const float athresh_squared = p.spatial_threshold_squared;
const float athresh6 = p.spatial_threshold6;
// These are just to make the buffer locations easier to read.
const short2 up_2 = short2(pos.x, pos.y -2);
const short2 up_1 = short2(pos.x, pos.y -1);
const short2 down_1 = short2(pos.x, pos.y +1);
const short2 down_2 = short2(pos.x, pos.y +2);
const float up_diff = tex2D<T>(cur, pos) - tex2D<T>(cur, up_1);
const float down_diff = tex2D<T>(cur, pos) - tex2D<T>(cur, down_1);
mask.write(0, pos);
if ((up_diff > athresh && down_diff > athresh) ||
(up_diff < -athresh && down_diff < -athresh)) {
// The pixel above and below are different,
// and they change in the same "direction" too.
bool motion = false;
if (mthresh > 0) {
// Make sure there's sufficient motion between frame t-1 and frame t+1.
if (abs(tex2D<T>(prev, pos) - tex2D<T>(cur, pos)) > mthresh &&
abs(tex2D<T>(cur, up_1) - tex2D<T>(next, up_1)) > mthresh &&
abs(tex2D<T>(cur, down_1) - tex2D<T>(next, down_1)) > mthresh) {
motion = true;
}
if (abs(tex2D<T>(next, pos) - tex2D<T>(cur, pos)) > mthresh &&
abs(tex2D<T>(prev, up_1) - tex2D<T>(cur, up_1)) > mthresh &&
abs(tex2D<T>(prev, down_1) - tex2D<T>(cur, down_1)) > mthresh) {
motion = true;
}
} else {
// User doesn't want to check for motion,
// so move on to the spatial check
motion = true;
}
// If motion, or we can't measure motion yet…
if (motion || p.force_exaustive_check) {
// That means it's time for the spatial check
// We've got several options here
if (p.spatial_metric == 0) {
// Simple 32detect style comb detection.
if ((abs(tex2D<T>(cur, pos) - tex2D<T>(cur, down_2)) < p.comb32detect_min) &&
(abs(tex2D<T>(cur, pos) - tex2D<T>(cur, down_1)) > p.comb32detect_max)) {
mask.write(1, pos);
}
} else if (p.spatial_metric == 1) {
// This, for comparison, is what IsCombed uses
// It's better, but still noise sensitive
const T combing = (tex2D<T>(cur, up_1) - tex2D<T>(cur, pos)) *
(tex2D<T>(cur, down_1) - tex2D<T>(cur, pos));
if (combing > athresh_squared) {
mask.write(1, pos);
}
} else if (p.spatial_metric == 2) {
// Tritical's noise-resistant combing scorer
// The check is done on a bob+blur convolution
const T combing = abs(tex2D<T>(cur, up_2)
+ (4 * tex2D<T>(cur, pos))
+ tex2D<T>(cur, down_2)
- (3 * (tex2D<T>(cur, up_1)
+ tex2D<T>(cur, down_1))));
// If the frame is sufficiently combed,
// then mark it down on the mask as 1.
if (combing > athresh6) {
mask.write(1, pos);
}
}
}
}
}
/*
* Kernel dispatch
*/
kernel void apply_mask(
texture2d<ushort, access::read> mask [[texture(3)]],
texture2d<half, access::write> dst [[texture(5)]],
ushort2 pos [[thread_position_in_grid]])
{
auto value = tex2D<ushort>(mask, pos) ? 1.h : 0.h;
dst.write(value, pos);
}
kernel void check_filtered_combing_mask_simd(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]],
ushort sid [[simdgroup_index_in_threadgroup]],
ushort w [[simdgroups_per_threadgroup]])
{
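// One partial sum per simdgroup: 32 slots cover the 1024-thread threadgroup
// maximum at 32 threads per simdgroup.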
threadgroup ushort partial_score[32];
ushort value = tex2Dc<ushort>(mask, pos);
partial_score[sid] = simd_sum(value);
threadgroup_barrier(mem_flags::mem_threadgroup);
if (sid == 0) {
ushort block_score = 0;
for (uchar i = 0; i < w; i++) {
block_score += partial_score[i];
}
write_result(combed, p.block_threshold, block_score);
}
}
kernel void check_filtered_combing_mask_quad(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]],
ushort qid [[quadgroup_index_in_threadgroup]],
ushort w [[quadgroups_per_threadgroup]])
{
threadgroup ushort partial_score[256];
ushort value = tex2Dc<ushort>(mask, pos);
partial_score[qid] = quad_sum(value);
threadgroup_barrier(mem_flags::mem_threadgroup);
if (qid == 0) {
ushort block_score = 0;
for (uchar i = 0; i < w; i++) {
block_score += partial_score[i];
}
write_result(combed, p.block_threshold, block_score);
}
}
kernel void check_filtered_combing_mask(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]])
{
if (pos.x % p.block_width > 0 || pos.y % p.block_height > 0) {
return;
}
ushort block_score = 0;
for (uchar x = 0; x < p.block_width; x++) {
for (uchar y = 0; y < p.block_height; y++) {
ushort2 block_pos = ushort2(pos.x + x, pos.y + y);
block_score += tex2Dc<ushort>(mask, block_pos);
}
}
write_result(combed, p.block_threshold, block_score);
}
kernel void check_combing_mask_simd(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]],
ushort sid [[simdgroup_index_in_threadgroup]],
ushort w [[simdgroups_per_threadgroup]])
{
threadgroup ushort partial_score[32];
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
ushort value = tex2Dc<ushort>(mask, left) & tex2Dc<ushort>(mask, pos) & tex2Dc<ushort>(mask, right);
partial_score[sid] = simd_sum(value);
threadgroup_barrier(mem_flags::mem_threadgroup);
if (sid == 0) {
ushort block_score = 0;
for (uchar i = 0; i < w; i++) {
block_score += partial_score[i];
}
write_result(combed, p.block_threshold, block_score);
}
}
kernel void check_combing_mask_quad(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]],
ushort qid [[quadgroup_index_in_threadgroup]],
ushort w [[quadgroups_per_threadgroup]])
{
threadgroup ushort partial_score[256];
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
ushort value = tex2Dc<ushort>(mask, left) & tex2Dc<ushort>(mask, pos) & tex2Dc<ushort>(mask, right);
partial_score[qid] = quad_sum(value);
threadgroup_barrier(mem_flags::mem_threadgroup);
if (qid == 0) {
ushort block_score = 0;
for (uchar i = 0; i < w; i++) {
block_score += partial_score[i];
}
write_result(combed, p.block_threshold, block_score);
}
}
kernel void check_combing_mask(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]])
{
if (pos.x % p.block_width > 0 || pos.y % p.block_height > 0) {
return;
}
ushort block_score = 0;
for (uchar x = 0; x < p.block_width; x++) {
for (uchar y = 0; y < p.block_height; y++) {
const ushort2 block_pos = ushort2(pos.x + x, pos.y + y);
const short2 left = short2(pos.x -1 +x, pos.y +y);
const short2 right = short2(pos.x +1 +x, pos.y +y);
block_score += tex2Dc<ushort>(mask, left) & tex2Dc<ushort>(mask, block_pos) & tex2Dc<ushort>(mask, right);
}
}
write_result(combed, p.block_threshold, block_score);
}
kernel void dilate_mask(
texture2d<ushort, access::sample> src [[texture(3)]],
texture2d<ushort, access::write> dst [[texture(4)]],
ushort2 pos [[thread_position_in_grid]])
{
if (tex2D<ushort>(src, pos)) {
dst.write(1, pos);
return;
}
const short2 up = short2(pos.x, pos.y -1);
const short2 up_left = short2(pos.x -1, pos.y -1);
const short2 up_right = short2(pos.x +1, pos.y -1);
const short2 down = short2(pos.x, pos.y +1);
const short2 down_left = short2(pos.x -1, pos.y +1);
const short2 down_right = short2(pos.x +1, pos.y +1);
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
constexpr uchar dilation_threshold = 4;
const uchar count = tex2D<ushort>(src, up_left) + tex2D<ushort>(src, up) + tex2D<ushort>(src, up_right) +
tex2D<ushort>(src, left) + tex2D<ushort>(src, right) +
tex2D<ushort>(src, down_left) + tex2D<ushort>(src, down) + tex2D<ushort>(src, down_right);
dst.write(count >= dilation_threshold, pos);
}
kernel void erode_mask(
texture2d<ushort, access::sample> src [[texture(4)]],
texture2d<ushort, access::write> dst [[texture(3)]],
ushort2 pos [[thread_position_in_grid]])
{
if (tex2D<ushort>(src, pos) == 0) {
dst.write(0, pos);
return;
}
const short2 up = short2(pos.x, pos.y -1);
const short2 up_left = short2(pos.x -1, pos.y -1);
const short2 up_right = short2(pos.x +1, pos.y -1);
const short2 down = short2(pos.x, pos.y +1);
const short2 down_left = short2(pos.x -1, pos.y +1);
const short2 down_right = short2(pos.x +1, pos.y +1);
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
constexpr uchar erosion_threshold = 2;
const uchar count = tex2D<ushort>(src, up_left) + tex2D<ushort>(src, up) + tex2D<ushort>(src, up_right) +
tex2D<ushort>(src, left) + tex2D<ushort>(src, right) +
tex2D<ushort>(src, down_left) + tex2D<ushort>(src, down) + tex2D<ushort>(src, down_right);
dst.write(count >= erosion_threshold, pos);
}
kernel void filter_classic(
texture2d<ushort, access::sample> src [[texture(3)]],
texture2d<ushort, access::write> dst [[texture(4)]],
ushort2 pos [[thread_position_in_grid]])
{
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
const uchar h_count = tex2D<ushort>(src, left) & tex2D<ushort>(src, pos) & tex2D<ushort>(src, right);
dst.write(h_count, pos);
}
kernel void filter_erode_dilate(
texture2d<ushort, access::sample> src [[texture(3)]],
texture2d<ushort, access::write> dst [[texture(4)]],
ushort2 pos [[thread_position_in_grid]])
{
const short2 up = short2(pos.x, pos.y -1);
const short2 down = short2(pos.x, pos.y +1);
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
const uchar h_count = tex2D<ushort>(src, left) & tex2D<ushort>(src, pos) & tex2D<ushort>(src, right);
const uchar v_count = tex2D<ushort>(src, up) & tex2D<ushort>(src, pos) & tex2D<ushort>(src, down);
dst.write(h_count & v_count, pos);
}
kernel void comb_detect(
texture2d<float, access::sample> prev [[texture(0)]],
texture2d<float, access::sample> cur [[texture(1)]],
texture2d<float, access::sample> next [[texture(2)]],
texture2d<ushort, access::write> mask [[texture(3)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]])
{
detect_combed_segment<float>(prev, cur, next, mask, p, pos);
}
kernel void comb_detect_gamma(
texture2d<float, access::sample> prev [[texture(0)]],
texture2d<float, access::sample> cur [[texture(1)]],
texture2d<float, access::sample> next [[texture(2)]],
texture2d<ushort, access::write> mask [[texture(3)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]])
{
detect_gamma_combed_segment<float>(prev, cur, next, mask, p, pos);
}
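
One host-side detail worth noting: the check_*_combing_mask kernels assume each threadgroup spans exactly one block_width x block_height scoring block, which is what the fixed-threadgroup dispatch in call_kernel() arranges. A rough sketch of what hb_metal_compute_encoder_dispatch_fixed_threadgroup_size() is assumed to boil down to (an assumption about that helper, not its actual body):

// Assumed shape of the reduction-pass dispatch: one threadgroup per scoring block,
// so simd_sum()/quad_sum() can reduce a whole block without crossing threadgroups.
MTLSize grid  = MTLSizeMake(width, height, 1);
MTLSize group = MTLSizeMake(block_width, block_height, 1);
[encoder dispatchThreads:grid threadsPerThreadgroup:group];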

View File

@@ -403,6 +403,8 @@ int hb_vt_are_filters_supported(hb_list_t *filters)
switch (filter->id)
{
case HB_FILTER_PRE_VT:
case HB_FILTER_COMB_DETECT:
case HB_FILTER_COMB_DETECT_VT:
case HB_FILTER_YADIF:
case HB_FILTER_YADIF_VT:
case HB_FILTER_BWDIF:
@@ -498,6 +500,7 @@ void hb_vt_setup_hw_filters(hb_job_t *job)
hb_filter_object_t *filter = hb_filter_init(HB_FILTER_PRE_VT);
hb_add_filter(job, filter, NULL);
replace_filter(job, HB_FILTER_COMB_DETECT, HB_FILTER_COMB_DETECT_VT);
replace_filter(job, HB_FILTER_YADIF, HB_FILTER_YADIF_VT);
replace_filter(job, HB_FILTER_BWDIF, HB_FILTER_BWDIF_VT);
replace_filter(job, HB_FILTER_CROP_SCALE, HB_FILTER_CROP_SCALE_VT);

View File

@@ -16,6 +16,7 @@ namespace HandBrake.Interop.Interop.HbLib
// First, filters that may change the framerate (drop or dup frames)
HB_FILTER_DETELECINE,
HB_FILTER_COMB_DETECT,
HB_FILTER_COMB_DETECT_VT,
HB_FILTER_DECOMB,
HB_FILTER_YADIF,
HB_FILTER_YADIF_VT,