HandBrake/libhb/comb_detect.c
2025-01-22 09:11:40 +01:00

1571 lines
49 KiB
C

/* comb_detect.c
Copyright (c) 2003-2025 HandBrake Team
This file is part of the HandBrake source code
Homepage: <http://handbrake.fr/>.
It may be used under the terms of the GNU General Public License v2.
For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
*/
/*****
Parameters:
Mode : Spatial metric : Motion thresh : Spatial thresh : Mask Filter Mode :
Block thresh : Block width : Block height
Defaults:
3:2:3:3:2:40:16:16
Original "Faster" settings:
0:2:6:9:1:80:16:16
*****/
#define MODE_GAMMA 1 // Scale gamma when decombing
#define MODE_FILTER 2 // Filter combing mask
#define MODE_MASK 4 // Output combing masks instead of pictures
#define MODE_COMPOSITE 8 // Overlay combing mask onto picture
#define FILTER_CLASSIC 1
#define FILTER_ERODE_DILATE 2
#include "handbrake/handbrake.h"
#include "handbrake/taskset.h"
#if defined(__aarch64__)
#include <arm_neon.h>
#endif
// Per-worker arguments for the comb-detect tasksets. Each worker operates
// on a horizontal slice of the frame; segment_start/segment_height are
// indexed per plane (0 = luma, 1/2 = chroma).
typedef struct comb_detect_thread_arg_s
{
    taskset_thread_arg_t arg;       // common taskset bookkeeping (segment id, taskset)
    hb_filter_private_t *pv;        // shared filter state
    int segment_start[3];           // first row of this worker's slice, per plane
    int segment_height[3];          // slice height in rows, per plane
} comb_detect_thread_arg_t;
// Private state for the comb-detect filter.
struct hb_filter_private_s
{
    // Input pixel format properties
    int depth;                  // bit depth of the input samples
    int bps;                    // bytes per sample: 1 for 8-bit, 2 for deeper formats
    int max_value;              // (1 << depth) - 1
    int half_value;             // (1 << depth) / 2

    // comb detect parameters
    int mode;                   // bitmask of MODE_* flags
    int filter_mode;            // FILTER_CLASSIC or FILTER_ERODE_DILATE
    int spatial_metric;
    int motion_threshold;
    int spatial_threshold;
    int block_threshold;        // combed-pixel count that flags a block as combed
    int block_width;            // tile width used when scoring the combing mask
    int block_height;           // tile height used when scoring the combing mask

    int *block_score;           // per-check-thread block score (comb_check_nthreads entries)
    int comb_check_complete;    // set once any check thread finds heavy combing (early exit)
    int comb_check_nthreads;    // number of threads in the comb check taskset

    // Computed parameters (derived from the thresholds above and the depth)
    float gamma_motion_threshold;
    float gamma_spatial_threshold;
    float gamma_spatial_threshold6;
    int spatial_threshold_squared;
    int spatial_threshold6;
    int comb32detect_min;
    int comb32detect_max;

    float *gamma_lut;           // gamma (2.2) transfer lookup, max_value + 1 entries

    int comb_detect_ready;      // set once enough reference frames are buffered
    // NOTE: field name keeps the historical "exaustive" spelling;
    // renaming would touch every user.
    int force_exaustive_check;

    hb_buffer_t *ref[3];        // sliding window of reference frames (prev/cur/next)
    int ref_used[3];            // nonzero when the corresponding ref is still owned elsewhere

    // Make buffers to store a comb masks.
    hb_buffer_t *mask;          // raw combing mask
    hb_buffer_t *mask_filtered; // mask after filtering / erode-dilate
    hb_buffer_t *mask_temp;     // scratch mask between filter passes
    int mask_box_x;             // location of the last block that scored
    int mask_box_y;

    int cpu_count;
    int segment_height[3];      // default per-plane slice height for cpu_count workers

    taskset_t comb_detect_filter_taskset; // Threads for comb detection
    taskset_t comb_detect_check_taskset;  // Threads for comb check
    taskset_t mask_filter_taskset;        // Threads for comb detect mask filter
    taskset_t mask_erode_taskset;         // Threads for comb detect mask erode
    taskset_t mask_dilate_taskset;        // Threads for comb detect mask dilate

    // Bit-depth-specific implementations, selected at init time
    void (*detect_gamma_combed_segment)(hb_filter_private_t *pv,
                                        int segment_start, int segment_stop);
    void (*detect_combed_segment)(hb_filter_private_t *pv,
                                  int segment_start, int segment_stop);
    void (*apply_mask)(hb_filter_private_t *pv, hb_buffer_t *b);

    hb_buffer_list_t out_list;

    // Filter statistics
    int comb_heavy;
    int comb_light;
    int comb_none;
    int frames;
};
static int comb_detect_init(hb_filter_object_t *filter,
                            hb_filter_init_t *init);
static int comb_detect_work(hb_filter_object_t *filter,
                            hb_buffer_t **buf_in,
                            hb_buffer_t **buf_out );
static void comb_detect_close(hb_filter_object_t *filter);

// Regex template validating the filter's settings dictionary keys/values.
static const char comb_detect_template[] =
    "mode=^"HB_INT_REG"$:spatial-metric=^([012])$:"
    "motion-thresh=^"HB_INT_REG"$:spatial-thresh=^"HB_INT_REG"$:"
    "filter-mode=^([012])$:block-thresh=^"HB_INT_REG"$:"
    "block-width=^"HB_INT_REG"$:block-height=^"HB_INT_REG"$:"
    "disable=^"HB_BOOL_REG"$";

// Public filter descriptor registered with the HandBrake filter chain.
hb_filter_object_t hb_filter_comb_detect =
{
    .id                = HB_FILTER_COMB_DETECT,
    .enforce_order     = 1,
    .name              = "Comb Detect",
    .settings          = NULL,
    .init              = comb_detect_init,
    .work              = comb_detect_work,
    .close             = comb_detect_close,
    .settings_template = comb_detect_template,
};
#define BIT_DEPTH 8
#include "templates/comb_detect_template.c"
#undef BIT_DEPTH
#define BIT_DEPTH 16
#include "templates/comb_detect_template.c"
#undef BIT_DEPTH
#if defined (__aarch64__)
// NEON variant: scores the *filtered* mask by summing mask bytes directly.
// NOTE(review): vaddvq_u8/vaddv_u8 return uint8_t, so this assumes mask
// entries are 0/1 (a 16-byte sum then fits without wrapping) — confirm
// against the values written by the detect/filter passes.
static void check_filtered_combing_mask(hb_filter_private_t *pv, int segment, int start, int stop)
{
    // Go through the mask in X*Y blocks. If any of these windows
    // have threshold or more combed pixels, consider the whole
    // frame to be combed and send it on to be deinterlaced.

    // Block mask threshold -- The number of pixels
    // in a block_width * block_height window of
    // the mask that need to show combing for the
    // whole frame to be seen as such.
    const int threshold    = pv->block_threshold;
    const int block_width  = pv->block_width;
    const int block_height = pv->block_height;
    const int stride       = pv->mask_filtered->plane[0].stride;
    const int width        = pv->mask_filtered->plane[0].width;

    for (int y = start; y < (stop - block_height + 1); y = y + block_height)
    {
        for (int x = 0; x < (width - block_width); x = x + block_width)
        {
            int block_score = 0;
            for (int block_y = 0; block_y < block_height; block_y++)
            {
                const int my = y + block_y;
                const uint8_t *mask_p = &pv->mask_filtered->plane[0].data[my * stride + x];
                if (block_width == 16)
                {
                    // Fast path: one 16-byte vector covers the whole row.
                    uint8x16_t mask = vld1q_u8(&mask_p[0]);
                    block_score += vaddvq_u8(mask);
                }
                else
                {
                    // Generic width: 8 bytes at a time, then a scalar tail.
                    int block_x = 0;
                    for (; block_x < block_width-7; block_x += 8)
                    {
                        uint8x8_t mask = vld1_u8(&mask_p[block_x]);
                        block_score += vaddv_u8(mask);
                    }
                    for (;block_x < block_width; block_x++)
                    {
                        block_score += mask_p[block_x];
                    }
                }
            }
            if (pv->comb_check_complete)
            {
                // Some other thread found combing before this one
                return;
            }
            if (block_score >= (threshold / 2))
            {
                // Record a candidate block; above threshold means heavy
                // combing and ends the whole check early.
                pv->mask_box_x = x;
                pv->mask_box_y = y;
                pv->block_score[segment] = block_score;
                if (block_score > threshold)
                {
                    pv->comb_check_complete = 1;
                    return;
                }
            }
        }
    }
}
#else
// Scalar fallback: score the filtered combing mask in
// block_width x block_height tiles. A tile whose combed-pixel count
// exceeds block_threshold marks the frame as combed; counts at or above
// half the threshold are recorded so light combing can be reported.
static void check_filtered_combing_mask(hb_filter_private_t *pv, int segment, int start, int stop)
{
    const int threshold    = pv->block_threshold;
    const int block_width  = pv->block_width;
    const int block_height = pv->block_height;
    const int stride       = pv->mask_filtered->plane[0].stride;
    const int width        = pv->mask_filtered->plane[0].width;

    for (int y = start; y + block_height <= stop; y += block_height)
    {
        for (int x = 0; x < width - block_width; x += block_width)
        {
            // Accumulate the mask values over one tile.
            int score = 0;
            for (int yy = 0; yy < block_height; yy++)
            {
                const uint8_t *row = &pv->mask_filtered->plane[0].data[(y + yy) * stride + x];
                for (int xx = 0; xx < block_width; xx++)
                {
                    score += row[xx];
                }
            }
            if (pv->comb_check_complete)
            {
                // Another thread already found combing; stop early.
                return;
            }
            if (score >= threshold / 2)
            {
                // Remember where the combing was seen and how strong it is.
                pv->mask_box_x = x;
                pv->mask_box_y = y;
                pv->block_score[segment] = score;
                if (score > threshold)
                {
                    // Heavy combing: no need to look any further.
                    pv->comb_check_complete = 1;
                    return;
                }
            }
        }
    }
}
#endif
#if defined(__aarch64__)
// NEON variant: scores the *raw* mask. A pixel only counts when its left
// and right neighbors are also flagged, so isolated mask pixels are ignored.
// NOTE(review): the right-edge special case tests `x == width-block_width - 1`,
// but x advances in block_width steps from 0, so this only triggers for
// particular widths; the scalar fallback tests `(x + block_x) == (width - 1)`
// per pixel instead — confirm the divergence is intended.
static void check_combing_mask(hb_filter_private_t *pv, int segment, int start, int stop)
{
    // Go through the mask in X*Y blocks. If any of these windows
    // have threshold or more combed pixels, consider the whole
    // frame to be combed and send it on to be deinterlaced.

    // Block mask threshold -- The number of pixels
    // in a block_width * block_height window of
    // the mask that need to show combing for the
    // whole frame to be seen as such.
    const int threshold    = pv->block_threshold;
    const int block_width  = pv->block_width;
    const int block_height = pv->block_height;
    const int stride       = pv->mask->plane[0].stride;
    const int width        = pv->mask->plane[0].width;

    // All-ones vector substituted for the missing neighbor at the frame
    // border so it does not veto the AND.
    uint8x16_t one_vector = vdupq_n_u8(255);
    for (int y = start; y < (stop - block_height + 1); y = y + block_height)
    {
        for (int x = 0; x < (width - block_width); x = x + block_width)
        {
            int block_score = 0;
            for (int block_y = 0; block_y < block_height; block_y++)
            {
                const int mask_y = y + block_y;
                const uint8_t *mask_p = &pv->mask->plane[0].data[mask_y * stride + x];
                int block_x = 0;
                if (block_width == 16)
                {
                    // Fast path: build left/center/right vectors and AND them.
                    uint8x16_t mask = vld1q_u8(&mask_p[0]);
                    uint8x16_t mask_left, mask_right;
                    if (x == 0)
                    {
                        // No pixel left of column 0: shift in all-ones.
                        mask_left = vextq_u8(one_vector, mask, 15);
                    }
                    else
                    {
                        mask_left = vld1q_u8(&mask_p[-1]);
                    }
                    if (x == width-block_width - 1)
                    {
                        // No pixel right of the last column: shift in all-ones.
                        mask_right = vextq_u8(mask, one_vector, 1);
                    }
                    else
                    {
                        mask_right = vld1q_u8(&mask_p[1]);
                    }
                    uint8x16_t res1 = vandq_u8(vandq_u8(mask_left, mask), mask_right);
                    block_score += vaddvq_u8(res1);
                }
                else
                {
                    // Generic width: handle column 0 separately, vectorize the
                    // middle 8 pixels at a time, then finish with a scalar tail.
                    if ((x + block_x) == 0)
                    {
                        block_score += mask_p[0] & mask_p[1];
                        block_x += 1;
                    }
                    for (; block_x < block_width - 8; block_x += 8)
                    {
                        uint8x8_t mask = vld1_u8(&mask_p[block_x]);
                        uint8x8_t mask_left = vld1_u8(&mask_p[block_x-1]);
                        uint8x8_t mask_right = vld1_u8(&mask_p[block_x+1]);
                        uint8x8_t result = vand_u8(vand_u8(mask_left, mask), mask_right );
                        block_score += vaddv_u8(result);
                    }
                    for (; block_x < block_width; block_x++)
                    {
                        if ((x + block_x) == (width -1))
                        {
                            block_score += mask_p[block_x-1] & mask_p[block_x];
                        }
                        else
                        {
                            block_score += mask_p[block_x-1] & mask_p[block_x] & mask_p[block_x+1];
                        }
                    }
                }
            }
            if (pv->comb_check_complete)
            {
                // Some other thread found combing before this one
                return;
            }
            if (block_score >= (threshold / 2))
            {
                // Record a candidate block; above threshold means heavy
                // combing and ends the whole check early.
                pv->mask_box_x = x;
                pv->mask_box_y = y;
                pv->block_score[segment] = block_score;
                if (block_score > threshold)
                {
                    pv->comb_check_complete = 1;
                    return;
                }
            }
        }
    }
}
#else
// Scalar fallback: score the raw combing mask in
// block_width x block_height tiles. A pixel only counts when its
// horizontal neighbors are also flagged, which suppresses isolated
// mask pixels; the first and last columns of the frame have only one
// neighbor and are handled separately.
static void check_combing_mask(hb_filter_private_t *pv, int segment, int start, int stop)
{
    const int threshold    = pv->block_threshold;
    const int block_width  = pv->block_width;
    const int block_height = pv->block_height;
    const int stride       = pv->mask->plane[0].stride;
    const int width        = pv->mask->plane[0].width;

    for (int y = start; y + block_height <= stop; y += block_height)
    {
        for (int x = 0; x < width - block_width; x += block_width)
        {
            int score = 0;
            for (int yy = 0; yy < block_height; yy++)
            {
                const uint8_t *row = &pv->mask->plane[0].data[(y + yy) * stride];
                for (int xx = 0; xx < block_width; xx++)
                {
                    const int col = x + xx;
                    if (col == 0)
                    {
                        // Leftmost frame column: no left neighbor.
                        score += row[0] & row[1];
                    }
                    else if (col == width - 1)
                    {
                        // Rightmost frame column: no right neighbor.
                        score += row[col - 1] & row[col];
                    }
                    else
                    {
                        score += row[col - 1] & row[col] & row[col + 1];
                    }
                }
            }
            if (pv->comb_check_complete)
            {
                // Another thread already found combing; stop early.
                return;
            }
            if (score >= threshold / 2)
            {
                // Remember where the combing was seen and how strong it is.
                pv->mask_box_x = x;
                pv->mask_box_y = y;
                pv->block_score[segment] = score;
                if (score > threshold)
                {
                    // Heavy combing: no need to look any further.
                    pv->comb_check_complete = 1;
                    return;
                }
            }
        }
    }
}
#endif
#if defined(__aarch64__)
// NEON variant of the mask dilation pass: reads mask_filtered, writes
// mask_temp. A zero pixel becomes 1 when at least dilation_threshold of
// its 8 neighbors are set; a set pixel stays set.
// NOTE(review): the vector loop loads up to cur[xx+14] while xx < width-8,
// i.e. a few bytes past `width`; this relies on the plane stride providing
// enough padding — confirm against the mask buffer allocation.
static void mask_dilate_work(void *thread_args_v)
{
    comb_detect_thread_arg_t *thread_args = thread_args_v;
    hb_filter_private_t *pv = thread_args->pv;

    const int segment_start = thread_args->segment_start[0];
    const int segment_stop = segment_start + thread_args->segment_height[0];
    const int dilation_threshold = 4;

    const int width = pv->mask_filtered->plane[0].width;
    const int height = pv->mask_filtered->plane[0].height;
    const int stride = pv->mask_filtered->plane[0].stride;

    // Clamp to the frame interior: the 3x3 neighborhood needs one pixel
    // of margin on every side. p/c/n are the previous/current/next rows.
    int start, stop, p, c, n;
    if (segment_start == 0)
    {
        start = 1;
        p = 0;
        c = 1;
        n = 2;
    }
    else
    {
        start = segment_start;
        p = segment_start - 1;
        c = segment_start;
        n = segment_start + 1;
    }
    if (segment_stop == height)
    {
        stop = height -1;
    }
    else
    {
        stop = segment_stop;
    }

    uint8_t *curp = &pv->mask_filtered->plane[0].data[p * stride + 1];
    uint8_t *cur = &pv->mask_filtered->plane[0].data[c * stride + 1];
    uint8_t *curn = &pv->mask_filtered->plane[0].data[n * stride + 1];
    uint8_t *dst = &pv->mask_temp->plane[0].data[c * stride + 1];

    uint8x8_t threshold = vdup_n_u8(dilation_threshold);
    uint8x8_t zero_vector = vdup_n_u8(0);
    uint8x8_t result_if_nonzero = vdup_n_u8(1);

    for (int yy = start; yy < stop; yy++)
    {
        int xx = 1;
        for (; xx < width - 8; xx += 8)
        {
            // Build left/center/right views of each of the three rows
            // with unaligned loads plus vext shifts.
            uint8x8_t cur_left = vld1_u8(&cur[xx-1]);
            uint8x8_t curp_left = vld1_u8(&curp[xx-1]);
            uint8x8_t curn_left = vld1_u8(&curn[xx-1]);
            uint8x8_t curp_vec = vext_u8(curp_left, vld1_u8(&curp[xx+7]), 1);
            uint8x8_t curp_right = vext_u8(curp_left, vld1_u8(&curp[xx+7]), 2);
            uint8x8_t cur_vec = vext_u8(cur_left, vld1_u8(&cur[xx+7]), 1);
            uint8x8_t cur_right = vext_u8(cur_left, vld1_u8(&cur[xx+7]), 2);
            uint8x8_t curn_vec = vext_u8(curn_left, vld1_u8(&curn[xx+7]), 1);
            uint8x8_t curn_right = vext_u8(curn_left, vld1_u8(&curn[xx+7]), 2);

            // Sum the 8 neighbors (center row excludes the pixel itself).
            uint8x8_t sum_p = vadd_u8(vadd_u8(curp_left, curp_vec), curp_right);
            uint8x8_t sum_c = vadd_u8(cur_left, cur_right);
            uint8x8_t sum_n = vadd_u8(vadd_u8(curn_left, curn_vec), curn_right);
            uint8x8_t sum = vadd_u8(vadd_u8(sum_p, sum_c), sum_n);

            // 1 where the neighbor count reaches the threshold...
            uint8x8_t result_8 = vcge_u8(sum, threshold);
            uint8x8_t result = vand_u8(result_8, result_if_nonzero);
            // ...and unconditionally 1 where the pixel was already set.
            uint8x8_t nonzero_mask = vcgt_u8(cur_vec, zero_vector);
            result = vbsl_u8(nonzero_mask, result_if_nonzero, result);
            vst1_u8(&dst[xx], result);
        }
        // Scalar tail for the remaining columns.
        for (; xx < width - 1; xx++)
        {
            if (cur[xx])
            {
                dst[xx] = 1;
                continue;
            }
            const int count = curp[xx-1] + curp[xx] + curp[xx+1] +
                              cur[xx-1]  +            cur [xx+1] +
                              curn[xx-1] + curn[xx] + curn[xx+1];
            dst[xx] = count >= dilation_threshold;
        }
        curp += stride;
        cur += stride;
        curn += stride;
        dst += stride;
    }
}
#else
// Scalar mask dilation pass: reads mask_filtered, writes mask_temp.
// A zero pixel becomes 1 when at least dilation_threshold of its 8
// neighbors are set; a set pixel always stays set.
static void mask_dilate_work(void *thread_args_v)
{
    comb_detect_thread_arg_t *thread_args = thread_args_v;
    hb_filter_private_t *pv = thread_args->pv;

    const int segment_start = thread_args->segment_start[0];
    const int segment_stop  = segment_start + thread_args->segment_height[0];
    const int dilation_threshold = 4;

    const int width  = pv->mask_filtered->plane[0].width;
    const int height = pv->mask_filtered->plane[0].height;
    const int stride = pv->mask_filtered->plane[0].stride;

    // The 3x3 neighborhood needs one pixel of margin on every side,
    // so clamp the first and last processed rows to the frame interior.
    const int start = (segment_start == 0) ? 1 : segment_start;
    const int stop  = (segment_stop == height) ? height - 1 : segment_stop;

    const uint8_t *prev = &pv->mask_filtered->plane[0].data[(start - 1) * stride + 1];
    const uint8_t *curr = &pv->mask_filtered->plane[0].data[start * stride + 1];
    const uint8_t *next = &pv->mask_filtered->plane[0].data[(start + 1) * stride + 1];
    uint8_t       *dst  = &pv->mask_temp->plane[0].data[start * stride + 1];

    for (int row = start; row < stop; row++)
    {
        for (int col = 1; col < width - 1; col++)
        {
            if (curr[col])
            {
                // Already set: dilation never clears a pixel.
                dst[col] = 1;
            }
            else
            {
                // Count the 8 neighbors (center pixel excluded).
                const int neighbors = prev[col-1] + prev[col] + prev[col+1] +
                                      curr[col-1] +             curr[col+1] +
                                      next[col-1] + next[col] + next[col+1];
                dst[col] = (neighbors >= dilation_threshold) ? 1 : 0;
            }
        }
        prev += stride;
        curr += stride;
        next += stride;
        dst  += stride;
    }
}
#endif
#if defined (__aarch64__)
// NEON variant of the mask erosion pass: reads mask_temp, writes
// mask_filtered. A set pixel survives only when at least
// erosion_threshold of its 8 neighbors are set; a zero pixel stays zero.
// NOTE(review): like the dilate pass, the vector loop loads a few bytes
// past `width` and relies on stride padding — confirm against the mask
// buffer allocation.
static void mask_erode_work(void *thread_args_v)
{
    comb_detect_thread_arg_t *thread_args = thread_args_v;
    hb_filter_private_t *pv = thread_args->pv;

    const int segment_start = thread_args->segment_start[0];
    const int segment_stop = segment_start + thread_args->segment_height[0];
    const int erosion_threshold = 2;

    const int width = pv->mask_filtered->plane[0].width;
    const int height = pv->mask_filtered->plane[0].height;
    const int stride = pv->mask_filtered->plane[0].stride;

    // Clamp to the frame interior: the 3x3 neighborhood needs one pixel
    // of margin on every side. p/c/n are the previous/current/next rows.
    int start, stop, p, c, n;
    if (segment_start == 0)
    {
        start = 1;
        p = 0;
        c = 1;
        n = 2;
    }
    else
    {
        start = segment_start;
        p = segment_start - 1;
        c = segment_start;
        n = segment_start + 1;
    }
    if (segment_stop == height)
    {
        stop = height -1;
    }
    else
    {
        stop = segment_stop;
    }

    const uint8_t *curp = &pv->mask_temp->plane[0].data[p * stride + 1];
    const uint8_t *cur = &pv->mask_temp->plane[0].data[c * stride + 1];
    const uint8_t *curn = &pv->mask_temp->plane[0].data[n * stride + 1];
    uint8_t *dst = &pv->mask_filtered->plane[0].data[c * stride + 1];

    uint8x8_t threshold = vdup_n_u8(erosion_threshold);
    uint8x8_t result_if_zero = vdup_n_u8(0);
    uint8x8_t conv_vector = vdup_n_u8(1);

    for (int yy = start; yy < stop; yy++)
    {
        int xx = 1;
        for (; xx < width - 8; xx += 8)
        {
            // Build left/center/right views of each of the three rows
            // with unaligned loads plus vext shifts.
            uint8x8_t cur_left = vld1_u8(&cur[xx-1]);
            uint8x8_t curp_left = vld1_u8(&curp[xx-1]);
            uint8x8_t curn_left = vld1_u8(&curn[xx-1]);
            uint8x8_t curp_vec = vext_u8(curp_left, vld1_u8(&curp[xx+7]), 1);
            uint8x8_t curp_right = vext_u8(curp_left, vld1_u8(&curp[xx+7]), 2);
            uint8x8_t cur_vec = vext_u8(cur_left, vld1_u8(&cur[xx+7]), 1);
            uint8x8_t cur_right = vext_u8(cur_left, vld1_u8(&cur[xx+7]), 2);
            uint8x8_t curn_vec = vext_u8(curn_left, vld1_u8(&curn[xx+7]), 1);
            uint8x8_t curn_right = vext_u8(curn_left, vld1_u8(&curn[xx+7]), 2);

            // Sum the 8 neighbors (center row excludes the pixel itself).
            uint8x8_t sum_p = vadd_u8(vadd_u8(curp_left, curp_vec), curp_right);
            uint8x8_t sum_c = vadd_u8(cur_left, cur_right);
            uint8x8_t sum_n = vadd_u8(vadd_u8(curn_left, curn_vec), curn_right);
            uint8x8_t sum = vadd_u8(vadd_u8(sum_p, sum_c), sum_n);

            // 1 where the neighbor count reaches the threshold...
            uint8x8_t result = vcge_u8(sum, threshold);
            result = vand_u8(result, conv_vector);
            // ...forced to 0 where the pixel was not set to begin with.
            uint8x8_t nonzero_mask = vceq_u8(cur_vec, result_if_zero);
            result = vbsl_u8(nonzero_mask, result_if_zero, result);
            vst1_u8(&dst[xx], result);
        }
        // Scalar tail for the remaining columns.
        for (; xx < width - 1; xx++)
        {
            if (cur[xx] == 0)
            {
                dst[xx] = 0;
                continue;
            }
            const int count = curp[xx-1] + curp[xx] + curp[xx+1] +
                              cur [xx-1] +            cur [xx+1] +
                              curn[xx-1] + curn[xx] + curn[xx+1];
            dst[xx] = count >= erosion_threshold;
        }
        curp += stride;
        cur += stride;
        curn += stride;
        dst += stride;
    }
}
#else
// Scalar mask erosion pass: reads mask_temp, writes mask_filtered.
// A set pixel survives only when at least erosion_threshold of its 8
// neighbors are set; a zero pixel always stays zero.
static void mask_erode_work(void *thread_args_v)
{
    comb_detect_thread_arg_t *thread_args = thread_args_v;
    hb_filter_private_t *pv = thread_args->pv;

    const int segment_start = thread_args->segment_start[0];
    const int segment_stop  = segment_start + thread_args->segment_height[0];
    const int erosion_threshold = 2;

    const int width  = pv->mask_filtered->plane[0].width;
    const int height = pv->mask_filtered->plane[0].height;
    const int stride = pv->mask_filtered->plane[0].stride;

    // The 3x3 neighborhood needs one pixel of margin on every side,
    // so clamp the first and last processed rows to the frame interior.
    const int start = (segment_start == 0) ? 1 : segment_start;
    const int stop  = (segment_stop == height) ? height - 1 : segment_stop;

    const uint8_t *prev = &pv->mask_temp->plane[0].data[(start - 1) * stride + 1];
    const uint8_t *curr = &pv->mask_temp->plane[0].data[start * stride + 1];
    const uint8_t *next = &pv->mask_temp->plane[0].data[(start + 1) * stride + 1];
    uint8_t       *dst  = &pv->mask_filtered->plane[0].data[start * stride + 1];

    for (int row = start; row < stop; row++)
    {
        for (int col = 1; col < width - 1; col++)
        {
            if (curr[col] == 0)
            {
                // Not set: erosion never creates a pixel.
                dst[col] = 0;
            }
            else
            {
                // Count the 8 neighbors (center pixel excluded).
                const int neighbors = prev[col-1] + prev[col] + prev[col+1] +
                                      curr[col-1] +             curr[col+1] +
                                      next[col-1] + next[col] + next[col+1];
                dst[col] = (neighbors >= erosion_threshold) ? 1 : 0;
            }
        }
        prev += stride;
        curr += stride;
        next += stride;
        dst  += stride;
    }
}
#endif
#if defined (__aarch64__)
// NEON variant of the mask pre-filter: reads the raw mask and writes to
// mask_filtered (FILTER_CLASSIC) or mask_temp (FILTER_ERODE_DILATE).
// Classic mode keeps a pixel only when its horizontal neighbors are set;
// erode/dilate mode additionally requires the vertical neighbors.
// NOTE(review): the vector loop loads a few bytes past `width` and relies
// on stride padding — confirm against the mask buffer allocation.
static void mask_filter_work(void *thread_args_v)
{
    comb_detect_thread_arg_t *thread_args = thread_args_v;
    hb_filter_private_t *pv = thread_args->pv;

    const int width = pv->mask->plane[0].width;
    const int height = pv->mask->plane[0].height;
    const int stride = pv->mask->plane[0].stride;

    // Clamp to the frame interior: the 3x3 neighborhood needs one pixel
    // of margin on every side. p/c/n are the previous/current/next rows.
    int start, stop, p, c, n;
    int segment_start = thread_args->segment_start[0];
    int segment_stop = segment_start + thread_args->segment_height[0];
    if (segment_start == 0)
    {
        start = 1;
        p = 0;
        c = 1;
        n = 2;
    }
    else
    {
        start = segment_start;
        p = segment_start - 1;
        c = segment_start;
        n = segment_start + 1;
    }
    if (segment_stop == height)
    {
        stop = height - 1;
    }
    else
    {
        stop = segment_stop;
    }

    uint8_t *curp = &pv->mask->plane[0].data[p * stride + 1];
    uint8_t *cur = &pv->mask->plane[0].data[c * stride + 1];
    uint8_t *curn = &pv->mask->plane[0].data[n * stride + 1];
    // Classic mode writes straight to the filtered mask; erode/dilate
    // mode writes to the temp mask the erode pass consumes next.
    uint8_t *dst = (pv->filter_mode == FILTER_CLASSIC) ?
                        &pv->mask_filtered->plane[0].data[c * stride + 1] :
                        &pv->mask_temp->plane[0].data[c * stride + 1] ;

    if (pv->filter_mode == FILTER_CLASSIC)
    {
        // Horizontal-only filter: dst = left & center & right.
        for (int yy = start; yy < stop; yy++)
        {
            int xx = 1;
            for (; xx < width - 8; xx += 8)
            {
                uint8x8_t cur_left = vld1_u8(&cur[xx-1]);
                uint8x8_t cur_vec = vext_u8(cur_left, vld1_u8(&cur[xx+7]), 1);
                uint8x8_t cur_right = vext_u8(cur_left, vld1_u8(&cur[xx+7]), 2);
                uint8x8_t h_count = vand_u8(vand_u8(cur_left, cur_vec), cur_right);
                vst1_u8(&dst[xx], h_count);
            }
            // Scalar tail for the remaining columns.
            for (; xx < width - 1; xx++)
            {
                const int h_count = cur[xx-1] & cur[xx] & cur[xx+1];
                dst[xx] = h_count;
            }
            curp += stride;
            cur += stride;
            curn +=stride;
            dst+=stride;
        }
    }
    else
    {
        // Horizontal AND vertical filter for the erode/dilate pipeline.
        for (int yy = start; yy < stop; yy++)
        {
            int xx = 1;
            for (; xx < width - 8; xx += 8)
            {
                uint8x8_t curp_vec = vld1_u8(&curp[xx]);
                uint8x8_t cur_left = vld1_u8(&cur[xx-1]);
                uint8x8_t curn_vec = vld1_u8(&curn[xx]);
                uint8x8_t cur_vec = vext_u8(cur_left, vld1_u8(&cur[xx+7]), 1);
                uint8x8_t cur_right = vext_u8(cur_left, vld1_u8(&cur[xx+7]), 2);
                uint8x8_t h_count = vand_u8(vand_u8(cur_left, cur_vec), cur_right);
                uint8x8_t v_count = vand_u8(vand_u8(curp_vec, cur_vec), curn_vec);
                uint8x8_t result = vand_u8(h_count, v_count);
                vst1_u8(&dst[xx], result);
            }
            // Scalar tail for the remaining columns.
            for (; xx < width - 1; xx++)
            {
                const int h_count = cur[xx-1] & cur[xx] & cur[xx+1];
                const int v_count = curp[xx] & cur[xx] & curn[xx];
                dst[xx] = h_count & v_count;
            }
            curp += stride;
            cur += stride;
            curn += stride;
            dst += stride;
        }
    }
}
#else
// Scalar mask pre-filter: reads the raw mask and writes to mask_filtered
// (FILTER_CLASSIC) or mask_temp (FILTER_ERODE_DILATE). Classic mode keeps
// a pixel only when its horizontal neighbors are set; erode/dilate mode
// additionally requires the vertical neighbors.
// The filter-mode test was previously evaluated inside the innermost loop
// for every pixel even though it is loop-invariant; it is hoisted here,
// matching the structure of the NEON variant, and the classic branch no
// longer computes the unused vertical term.
static void mask_filter_work(void *thread_args_v)
{
    comb_detect_thread_arg_t *thread_args = thread_args_v;
    hb_filter_private_t *pv = thread_args->pv;

    const int width = pv->mask->plane[0].width;
    const int height = pv->mask->plane[0].height;
    const int stride = pv->mask->plane[0].stride;

    // Clamp to the frame interior: the 3x3 neighborhood needs one pixel
    // of margin on every side. p/c/n are the previous/current/next rows.
    int start, stop, p, c, n;
    int segment_start = thread_args->segment_start[0];
    int segment_stop = segment_start + thread_args->segment_height[0];
    if (segment_start == 0)
    {
        start = 1;
        p = 0;
        c = 1;
        n = 2;
    }
    else
    {
        start = segment_start;
        p = segment_start - 1;
        c = segment_start;
        n = segment_start + 1;
    }
    if (segment_stop == height)
    {
        stop = height - 1;
    }
    else
    {
        stop = segment_stop;
    }

    const uint8_t *curp = &pv->mask->plane[0].data[p * stride + 1];
    const uint8_t *cur  = &pv->mask->plane[0].data[c * stride + 1];
    const uint8_t *curn = &pv->mask->plane[0].data[n * stride + 1];
    // Classic mode writes straight to the filtered mask; erode/dilate
    // mode writes to the temp mask the erode pass consumes next.
    uint8_t *dst = (pv->filter_mode == FILTER_CLASSIC) ?
                        &pv->mask_filtered->plane[0].data[c * stride + 1] :
                        &pv->mask_temp->plane[0].data[c * stride + 1] ;

    if (pv->filter_mode == FILTER_CLASSIC)
    {
        // Horizontal-only filter: dst = left & center & right.
        for (int yy = start; yy < stop; yy++)
        {
            for (int xx = 1; xx < width - 1; xx++)
            {
                dst[xx] = cur[xx-1] & cur[xx] & cur[xx+1];
            }
            cur += stride;
            dst += stride;
        }
    }
    else
    {
        // Horizontal AND vertical filter for the erode/dilate pipeline.
        for (int yy = start; yy < stop; yy++)
        {
            for (int xx = 1; xx < width - 1; xx++)
            {
                const int h_count = cur[xx-1] & cur[xx] & cur[xx+1];
                const int v_count = curp[xx] & cur[xx] & curn[xx];
                dst[xx] = h_count & v_count;
            }
            curp += stride;
            cur  += stride;
            curn += stride;
            dst  += stride;
        }
    }
}
#endif
// Taskset entry point for one comb-check worker: scores this worker's
// slice of either the filtered mask or the raw mask, depending on
// whether mask filtering is enabled.
static void comb_detect_check_work(void *thread_args_v)
{
    comb_detect_thread_arg_t *ta = thread_args_v;
    hb_filter_private_t *pv = ta->pv;

    const int segment = ta->arg.segment;
    const int seg_start = ta->segment_start[0];
    const int seg_stop  = seg_start + ta->segment_height[0];

    // Select the checker that matches the mask this filter produces.
    void (*check)(hb_filter_private_t *, int, int, int) =
        (pv->mode & MODE_FILTER) ? check_filtered_combing_mask
                                 : check_combing_mask;
    check(pv, segment, seg_start, seg_stop);
}
// Taskset entry point for one comb-detect worker: builds the combing mask
// for this worker's slice of the luma plane, using the gamma-corrected
// detector when MODE_GAMMA is enabled.
static void comb_detect_filter_work(void *thread_args_v)
{
    comb_detect_thread_arg_t *ta = thread_args_v;
    hb_filter_private_t *pv = ta->pv;

    // Process segment (for now just from luma)
    const int seg_start = ta->segment_start[0];
    const int seg_stop  = seg_start + ta->segment_height[0];

    // Both detectors are depth-specific function pointers set at init.
    void (*detect)(hb_filter_private_t *, int, int) =
        (pv->mode & MODE_GAMMA) ? pv->detect_gamma_combed_segment
                                : pv->detect_combed_segment;
    detect(pv, seg_start, seg_stop);
}
// Push a new frame into the three-deep reference window, dropping the
// oldest entry. Takes ownership of b.
static void store_ref(hb_filter_private_t *pv, hb_buffer_t *b)
{
    // Free the oldest reference unless something else still holds it.
    if (pv->ref_used[0] == 0)
    {
        hb_buffer_close(&pv->ref[0]);
    }

    // Shift the window down one slot and append the new frame,
    // which starts out unclaimed.
    pv->ref[0]      = pv->ref[1];
    pv->ref[1]      = pv->ref[2];
    pv->ref_used[0] = pv->ref_used[1];
    pv->ref_used[1] = pv->ref_used[2];

    pv->ref[2]      = b;
    pv->ref_used[2] = 0;
}
// Clear the early-exit flag and every per-thread block score before
// running a new comb check.
static void reset_combing_results(hb_filter_private_t *pv)
{
    pv->comb_check_complete = 0;
    for (int seg = pv->comb_check_nthreads; seg-- > 0; )
    {
        pv->block_score[seg] = 0;
    }
}
// Aggregate the per-thread block scores into one verdict: a score above
// block_threshold means heavy combing (returned immediately); a score in
// [block_threshold / 2, block_threshold] means light combing; anything
// lower means none.
static int check_combing_results(hb_filter_private_t *pv)
{
    int verdict = HB_COMB_NONE;

    for (int seg = 0; seg < pv->comb_check_nthreads; seg++)
    {
        const int score = pv->block_score[seg];
        if (score < pv->block_threshold / 2)
        {
            // Below the light-combing floor: ignore this segment.
            continue;
        }
        if (score > pv->block_threshold)
        {
            // Heavy combing anywhere decides the whole frame.
            return HB_COMB_HEAVY;
        }
        verdict = HB_COMB_LIGHT;
    }
    return verdict;
}
// Run one full comb-detection pass over the buffered frames and return
// the combing verdict (HB_COMB_NONE / HB_COMB_LIGHT / HB_COMB_HEAVY).
// The taskset call order below is the pipeline: detect -> (filter ->
// optional erode/dilate/erode) -> check.
static int comb_segmenter(hb_filter_private_t *pv)
{
    /*
     * Now that all data for comb detection is ready for
     * our threads, fire them off and wait for their completion.
     */
    taskset_cycle(&pv->comb_detect_filter_taskset);

    if (pv->mode & MODE_FILTER)
    {
        // Refine the raw combing mask before scoring it.
        taskset_cycle(&pv->mask_filter_taskset);
        if (pv->filter_mode == FILTER_ERODE_DILATE)
        {
            // Erode / dilate / erode: drop isolated pixels, regrow the
            // surviving regions, then trim them again.
            taskset_cycle(&pv->mask_erode_taskset);
            taskset_cycle(&pv->mask_dilate_taskset);
            taskset_cycle(&pv->mask_erode_taskset);
        }
    }
    // Score the (possibly filtered) mask and report the result.
    reset_combing_results(pv);
    taskset_cycle(&pv->comb_detect_check_taskset);
    return check_combing_results(pv);
}
// Precompute the gamma (2.2) transfer curve over the full sample range,
// normalized to [0, 1]. The table has max_value + 1 entries.
static void build_gamma_lut(hb_filter_private_t *pv)
{
    const int max = pv->max_value;
    for (int i = 0; i <= max; i++)
    {
        pv->gamma_lut[i] = pow(((float)i / (float)max), 2.2f);
    }
}
static int comb_detect_init(hb_filter_object_t *filter,
hb_filter_init_t *init)
{
filter->private_data = calloc(1, sizeof(struct hb_filter_private_s));
if (filter->private_data == NULL)
{
hb_error("comb_detect: calloc failed");
return -1;
}
hb_filter_private_t *pv = filter->private_data;
hb_buffer_list_clear(&pv->out_list);
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(init->pix_fmt);
pv->depth = desc->comp[0].depth;
pv->bps = pv->depth > 8 ? 2 : 1;
pv->max_value = (1 << pv->depth) - 1;
pv->half_value = (1 << pv->depth) / 2;
pv->gamma_lut = malloc(sizeof(float) * (pv->max_value + 1));
if (pv->gamma_lut == NULL)
{
hb_error("comb_detect: malloc failed");
return -1;
}
build_gamma_lut(pv);
pv->frames = 0;
pv->force_exaustive_check = 1;
pv->comb_heavy = 0;
pv->comb_light = 0;
pv->comb_none = 0;
pv->comb_detect_ready = 0;
pv->mode = MODE_GAMMA | MODE_FILTER;
pv->filter_mode = FILTER_ERODE_DILATE;
pv->spatial_metric = 2;
pv->motion_threshold = 3;
pv->spatial_threshold = 3;
pv->block_threshold = 40;
pv->block_width = 16;
pv->block_height = 16;
if (filter->settings)
{
hb_value_t *dict = filter->settings;
// Get comb detection settings
hb_dict_extract_int(&pv->mode, dict, "mode");
hb_dict_extract_int(&pv->spatial_metric, dict, "spatial-metric");
hb_dict_extract_int(&pv->motion_threshold, dict, "motion-thresh");
hb_dict_extract_int(&pv->spatial_threshold, dict, "spatial-thresh");
hb_dict_extract_int(&pv->filter_mode, dict, "filter-mode");
hb_dict_extract_int(&pv->block_threshold, dict, "block-thresh");
hb_dict_extract_int(&pv->block_width, dict, "block-width");
hb_dict_extract_int(&pv->block_height, dict, "block-height");
}
if (pv->block_width > init->geometry.width)
{
pv->block_width = init->geometry.width;
}
if (pv->block_height > init->geometry.height)
{
pv->block_height = init->geometry.height;
}
// Scale the thresholds for the current depth
pv->motion_threshold <<= (pv->depth - 8);
pv->spatial_threshold <<= (pv->depth - 8);
// Compute thresholds
pv->gamma_motion_threshold = (float)pv->motion_threshold / (float)pv->max_value;
pv->gamma_spatial_threshold = (float)pv->spatial_threshold / (float)pv->max_value;
pv->gamma_spatial_threshold6 = 6 * pv->gamma_spatial_threshold;
pv->spatial_threshold_squared = pv->spatial_threshold * pv->spatial_threshold;
pv->spatial_threshold6 = 6 * pv->spatial_threshold;
pv->comb32detect_min = 10 << (pv->depth - 8);
pv->comb32detect_max = 15 << (pv->depth - 8);
pv->cpu_count = hb_get_cpu_count();
// Make segment sizes an even number of lines
int height = hb_image_height(init->pix_fmt, init->geometry.height, 0);
// each segment of each plane must begin on an even row.
pv->segment_height[0] = (height / pv->cpu_count) & ~3;
pv->segment_height[1] = hb_image_height(init->pix_fmt, pv->segment_height[0], 1);
pv->segment_height[2] = hb_image_height(init->pix_fmt, pv->segment_height[0], 2);
/* Allocate buffers to store comb masks. */
pv->mask = hb_frame_buffer_init(AV_PIX_FMT_GRAY8,
init->geometry.width, init->geometry.height);
pv->mask_filtered = hb_frame_buffer_init(AV_PIX_FMT_GRAY8,
init->geometry.width, init->geometry.height);
pv->mask_temp = hb_frame_buffer_init(AV_PIX_FMT_GRAY8,
init->geometry.width, init->geometry.height);
memset(pv->mask->data, 0, pv->mask->size);
memset(pv->mask_filtered->data, 0, pv->mask_filtered->size);
memset(pv->mask_temp->data, 0, pv->mask_temp->size);
// Set the functions for the current bit depth
switch (pv->depth)
{
case 8:
pv->detect_gamma_combed_segment = detect_gamma_combed_segment_8;
pv->detect_combed_segment = detect_combed_segment_8;
pv->apply_mask = apply_mask_8;
break;
default:
pv->detect_gamma_combed_segment = detect_gamma_combed_segment_16;
pv->detect_combed_segment = detect_combed_segment_16;
pv->apply_mask = apply_mask_16;
break;
}
/*
* Create comb detection taskset.
*/
if (taskset_init(&pv->comb_detect_filter_taskset, "comb_detect_filter_segment", pv->cpu_count,
sizeof(comb_detect_thread_arg_t), comb_detect_filter_work) == 0)
{
hb_error("comb_detect could not initialize taskset");
return -1;
}
comb_detect_thread_arg_t *comb_detect_prev_thread_args = NULL;
for (int ii = 0; ii < pv->cpu_count; ii++)
{
comb_detect_thread_arg_t *thread_args;
thread_args = taskset_thread_args( &pv->comb_detect_filter_taskset, ii );
thread_args->pv = pv;
thread_args->arg.segment = ii;
thread_args->arg.taskset = &pv->comb_detect_filter_taskset;
for (int pp = 0; pp < 3; pp++)
{
if (comb_detect_prev_thread_args != NULL)
{
thread_args->segment_start[pp] =
comb_detect_prev_thread_args->segment_start[pp] +
comb_detect_prev_thread_args->segment_height[pp];
}
if (ii == pv->cpu_count - 1)
{
/*
* Final segment
*/
thread_args->segment_height[pp] =
hb_image_height(init->pix_fmt, init->geometry.height, pp) -
thread_args->segment_start[pp];
} else {
thread_args->segment_height[pp] = pv->segment_height[pp];
}
}
comb_detect_prev_thread_args = thread_args;
}
pv->comb_check_nthreads = init->geometry.height / pv->block_height;
if (pv->comb_check_nthreads > pv->cpu_count)
{
pv->comb_check_nthreads = pv->cpu_count;
}
pv->block_score = calloc(pv->comb_check_nthreads, sizeof(int));
/*
* Create comb check taskset.
*/
if (taskset_init(&pv->comb_detect_check_taskset, "comb_detect_check_segment", pv->comb_check_nthreads,
sizeof(comb_detect_thread_arg_t), comb_detect_check_work) == 0)
{
hb_error("comb_detect check could not initialize taskset");
return -1;
}
comb_detect_prev_thread_args = NULL;
for (int ii = 0; ii < pv->comb_check_nthreads; ii++)
{
comb_detect_thread_arg_t *thread_args;
thread_args = taskset_thread_args(&pv->comb_detect_check_taskset, ii);
thread_args->pv = pv;
thread_args->arg.segment = ii;
thread_args->arg.taskset = &pv->comb_detect_check_taskset;
for (int pp = 0; pp < 3; pp++)
{
if (comb_detect_prev_thread_args != NULL)
{
thread_args->segment_start[pp] =
comb_detect_prev_thread_args->segment_start[pp] +
comb_detect_prev_thread_args->segment_height[pp];
}
// Make segment height a multiple of block_height
int h = hb_image_height(init->pix_fmt, init->geometry.height, pp) / pv->comb_check_nthreads;
h = h / pv->block_height * pv->block_height;
if (h == 0)
h = pv->block_height;
if (ii == pv->comb_check_nthreads - 1)
{
/*
* Final segment
*/
thread_args->segment_height[pp] =
hb_image_height(init->pix_fmt, init->geometry.height, pp) -
thread_args->segment_start[pp];
} else {
thread_args->segment_height[pp] = h;
}
}
comb_detect_prev_thread_args = thread_args;
}
if (pv->mode & MODE_FILTER)
{
if (taskset_init(&pv->mask_filter_taskset, "mask_filter_segment", pv->cpu_count,
sizeof(comb_detect_thread_arg_t), mask_filter_work) == 0)
{
hb_error( "mask filter could not initialize taskset" );
return -1;
}
comb_detect_prev_thread_args = NULL;
for (int ii = 0; ii < pv->cpu_count; ii++)
{
comb_detect_thread_arg_t *thread_args;
thread_args = taskset_thread_args(&pv->mask_filter_taskset, ii);
thread_args->pv = pv;
thread_args->arg.segment = ii;
thread_args->arg.taskset = &pv->mask_filter_taskset;
for (int pp = 0; pp < 3; pp++)
{
if (comb_detect_prev_thread_args != NULL)
{
thread_args->segment_start[pp] =
comb_detect_prev_thread_args->segment_start[pp] +
comb_detect_prev_thread_args->segment_height[pp];
}
if (ii == pv->cpu_count - 1)
{
/*
* Final segment
*/
thread_args->segment_height[pp] =
hb_image_height(init->pix_fmt, init->geometry.height, pp) -
thread_args->segment_start[pp];
} else {
thread_args->segment_height[pp] = pv->segment_height[pp];
}
}
comb_detect_prev_thread_args = thread_args;
}
if (pv->filter_mode == FILTER_ERODE_DILATE)
{
if (taskset_init(&pv->mask_erode_taskset, "mask_erode_segment", pv->cpu_count,
sizeof(comb_detect_thread_arg_t), mask_erode_work) == 0)
{
hb_error("mask erode could not initialize taskset");
return -1;
}
comb_detect_prev_thread_args = NULL;
for (int ii = 0; ii < pv->cpu_count; ii++)
{
comb_detect_thread_arg_t *thread_args;
thread_args = taskset_thread_args( &pv->mask_erode_taskset, ii );
thread_args->pv = pv;
thread_args->arg.segment = ii;
thread_args->arg.taskset = &pv->mask_erode_taskset;
for (int pp = 0; pp < 3; pp++)
{
if (comb_detect_prev_thread_args != NULL)
{
thread_args->segment_start[pp] =
comb_detect_prev_thread_args->segment_start[pp] +
comb_detect_prev_thread_args->segment_height[pp];
}
if (ii == pv->cpu_count - 1)
{
/*
* Final segment
*/
thread_args->segment_height[pp] =
hb_image_height(init->pix_fmt, init->geometry.height, pp) -
thread_args->segment_start[pp];
} else {
thread_args->segment_height[pp] = pv->segment_height[pp];
}
}
comb_detect_prev_thread_args = thread_args;
}
if (taskset_init(&pv->mask_dilate_taskset, "mask_dilate_segment", pv->cpu_count,
sizeof(comb_detect_thread_arg_t), mask_dilate_work) == 0)
{
hb_error("mask dilate could not initialize taskset");
return -1;
}
comb_detect_prev_thread_args = NULL;
for (int ii = 0; ii < pv->cpu_count; ii++)
{
comb_detect_thread_arg_t *thread_args;
thread_args = taskset_thread_args( &pv->mask_dilate_taskset, ii );
thread_args->pv = pv;
thread_args->arg.segment = ii;
thread_args->arg.taskset = &pv->mask_dilate_taskset;
for (int pp = 0; pp < 3; pp++)
{
if (comb_detect_prev_thread_args != NULL)
{
thread_args->segment_start[pp] =
comb_detect_prev_thread_args->segment_start[pp] +
comb_detect_prev_thread_args->segment_height[pp];
}
if (ii == pv->cpu_count - 1)
{
/*
* Final segment
*/
thread_args->segment_height[pp] =
hb_image_height(init->pix_fmt, init->geometry.height, pp) -
thread_args->segment_start[pp];
} else {
thread_args->segment_height[pp] = pv->segment_height[pp];
}
}
comb_detect_prev_thread_args = thread_args;
}
}
}
return 0;
}
// Tear down the comb detect filter: log detection statistics, stop all
// worker tasksets, and release every buffer the filter still owns.
static void comb_detect_close(hb_filter_object_t *filter)
{
    hb_filter_private_t *pv = filter->private_data;
    if (pv == NULL)
    {
        return;
    }

    hb_log("comb detect: heavy %i | light %i | uncombed %i | total %i",
           pv->comb_heavy, pv->comb_light, pv->comb_none, pv->frames);

    // Shut down the worker tasksets; the mask filter / erode / dilate
    // tasksets only exist when the corresponding modes were enabled.
    taskset_fini(&pv->comb_detect_filter_taskset);
    taskset_fini(&pv->comb_detect_check_taskset);
    if (pv->mode & MODE_FILTER)
    {
        taskset_fini(&pv->mask_filter_taskset);
        if (pv->filter_mode == FILTER_ERODE_DILATE)
        {
            taskset_fini(&pv->mask_erode_taskset);
            taskset_fini(&pv->mask_dilate_taskset);
        }
    }

    hb_buffer_list_close(&pv->out_list);

    // Free reference frames that were never handed downstream; a slot
    // marked "used" was appended to the output list and is owned there.
    for (int slot = 0; slot < 3; slot++)
    {
        if (pv->ref_used[slot])
        {
            continue;
        }
        hb_buffer_close(&pv->ref[slot]);
    }

    // Release the combing masks and lookup tables.
    hb_buffer_close(&pv->mask);
    hb_buffer_close(&pv->mask_filtered);
    hb_buffer_close(&pv->mask_temp);
    free(pv->gamma_lut);
    free(pv->block_score);
    free(pv);
    filter->private_data = NULL;
}
// Run comb detection on the current reference frame (ref[1]), update
// the per-category statistics, and queue either the frame itself or a
// mask-overlaid copy on the output list.
static void process_frame(hb_filter_private_t *pv)
{
    const int combed = comb_segmenter(pv);

    // Tally the detection result for the end-of-job log line.
    if (combed == HB_COMB_HEAVY)
    {
        pv->comb_heavy++;
    }
    else if (combed == HB_COMB_LIGHT)
    {
        pv->comb_light++;
    }
    else
    {
        pv->comb_none++;
    }
    pv->frames++;

    if (((pv->mode & MODE_MASK) || (pv->mode & MODE_COMPOSITE)) && combed)
    {
        // Visualization modes: emit a shallow copy with the combing mask
        // applied, leaving the original frame in the reference list.
        hb_buffer_t *out = hb_buffer_shallow_dup(pv->ref[1]);
        pv->apply_mask(pv, out);
        out->s.combed = combed;
        hb_buffer_list_append(&pv->out_list, out);
    }
    else
    {
        // Normal mode: send the current reference frame downstream and
        // mark the slot so close() does not free the buffer a second time.
        pv->ref_used[1] = 1;
        pv->ref[1]->s.combed = combed;
        hb_buffer_list_append(&pv->out_list, pv->ref[1]);
    }
    pv->force_exaustive_check = 0;
}
// Filter work entry point. Maintains a 3-frame reference window
// (prev/cur/next) and runs comb detection once the window is full.
// Consumes *buf_in unconditionally; emits zero or more buffers through
// *buf_out. Returns HB_FILTER_DELAY while priming, HB_FILTER_OK during
// steady state, and HB_FILTER_DONE on EOF.
static int comb_detect_work(hb_filter_object_t *filter,
                            hb_buffer_t **buf_in,
                            hb_buffer_t **buf_out )
{
    hb_filter_private_t *pv = filter->private_data;
    hb_buffer_t *in = *buf_in;

    // Input buffer is always consumed.
    *buf_in = NULL;
    if (in->s.flags & HB_BUF_FLAG_EOF)
    {
        // Duplicate last frame and process refs
        store_ref(pv, hb_buffer_shallow_dup(pv->ref[2]));
        if (pv->ref[0] != NULL)
        {
            // Last real frame: skip the early-out heuristics so it gets
            // a full comb check before the stream ends.
            pv->force_exaustive_check = 1;
            process_frame(pv);
        }
        // Pass the EOF buffer through and flush everything queued.
        hb_buffer_list_append(&pv->out_list, in);
        *buf_out = hb_buffer_list_clear(&pv->out_list);
        return HB_FILTER_DONE;
    }

    // comb detect requires 3 buffers, prev, cur, and next. For the first
    // frame, there can be no prev, so we duplicate the first frame.
    if (!pv->comb_detect_ready)
    {
        // If not ready, store duplicate ref and return HB_FILTER_DELAY
        store_ref(pv, hb_buffer_shallow_dup(in));
        store_ref(pv, in);
        pv->comb_detect_ready = 1;
        // Wait for next
        return HB_FILTER_DELAY;
    }

    store_ref(pv, in);
    process_frame(pv);

    // Output buffers may also be in comb detect's internal ref list.
    // Since buffers are not reference counted, we must wait until
    // we are certain they are no longer in the ref list before sending
    // down the pipeline where they will ultimately get closed.
    if (hb_buffer_list_count(&pv->out_list) > 3)
    {
        *buf_out = hb_buffer_list_rem_head(&pv->out_list);
    }
    return HB_FILTER_OK;
}