libhb: add a Metal-accelerated comb detect filter.

Damiano Galassi authored on 2023-11-23 18:05:09 +01:00, committed by Damiano Galassi
parent ad20fe40c7
commit 4e40332d2e
7 changed files with 1089 additions and 0 deletions

View File

@@ -4693,6 +4693,10 @@ hb_filter_object_t * hb_filter_get( int filter_id )
filter = &hb_filter_prefilter_vt;
break;
case HB_FILTER_COMB_DETECT_VT:
filter = &hb_filter_comb_detect_vt;
break;
case HB_FILTER_YADIF_VT:
filter = &hb_filter_yadif_vt;
break;

View File

@@ -1489,6 +1489,7 @@ enum
// First, filters that may change the framerate (drop or dup frames)
HB_FILTER_DETELECINE,
HB_FILTER_COMB_DETECT,
HB_FILTER_COMB_DETECT_VT,
HB_FILTER_DECOMB,
HB_FILTER_YADIF,
HB_FILTER_YADIF_VT,

View File

@@ -496,6 +496,7 @@ extern hb_filter_object_t hb_filter_format;
#if defined(__APPLE__)
extern hb_filter_object_t hb_filter_prefilter_vt;
extern hb_filter_object_t hb_filter_comb_detect_vt;
extern hb_filter_object_t hb_filter_yadif_vt;
extern hb_filter_object_t hb_filter_bwdif_vt;
extern hb_filter_object_t hb_filter_crop_scale_vt;

View File

@@ -0,0 +1,524 @@
/* comb_detect.c
Copyright (c) 2003-2023 HandBrake Team
This file is part of the HandBrake source code
Homepage: <http://handbrake.fr/>.
It may be used under the terms of the GNU General Public License v2.
For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
*/
#include "handbrake/handbrake.h"
#include "cv_utils.h"
#include "metal_utils.h"
#include "vt_common.h"
extern char hb_comb_detect_vt_metallib_data[];
extern unsigned int hb_comb_detect_vt_metallib_len;
struct mtl_comb_detect_params
{
int spatial_metric;
float motion_threshold;
float spatial_threshold;
int block_threshold;
int block_width;
int block_height;
float gamma_motion_threshold;
float gamma_spatial_threshold;
float gamma_spatial_threshold6;
float spatial_threshold_squared;
float spatial_threshold6;
float comb32detect_min;
float comb32detect_max;
bool force_exaustive_check;
};
#define MODE_GAMMA 1 // Scale gamma when decombing
#define MODE_FILTER 2 // Filter combing mask
#define MODE_MASK 4 // Output combing masks instead of pictures
#define MODE_COMPOSITE 8 // Overlay combing mask onto picture
#define FILTER_CLASSIC 1
#define FILTER_ERODE_DILATE 2
struct hb_filter_private_s
{
hb_metal_context_t *mtl;
const AVPixFmtDescriptor *desc;
// comb detect parameters
int mode;
int filter_mode;
int spatial_metric;
float motion_threshold;
float spatial_threshold;
int block_threshold;
int block_width;
int block_height;
// Computed parameters
float gamma_motion_threshold;
float gamma_spatial_threshold;
float gamma_spatial_threshold6;
float spatial_threshold_squared;
float spatial_threshold6;
float comb32detect_min;
float comb32detect_max;
bool force_exaustive_check;
// Mask textures
id<MTLTexture> mask;
id<MTLTexture> temp;
// Comb result
id<MTLBuffer> combed;
hb_buffer_t *ref[3];
hb_buffer_list_t out_list;
// Filter statistics
int comb_heavy;
int comb_light;
int comb_none;
int frames;
};
static int comb_detect_vt_init(hb_filter_object_t *filter,
hb_filter_init_t *init);
static int comb_detect_vt_work(hb_filter_object_t *filter,
hb_buffer_t **buf_in,
hb_buffer_t **buf_out);
static void comb_detect_vt_close(hb_filter_object_t *filter);
static const char comb_detect_vt_template[] =
"mode=^"HB_INT_REG"$:spatial-metric=^([012])$:"
"motion-thresh=^"HB_INT_REG"$:spatial-thresh=^"HB_INT_REG"$:"
"filter-mode=^([012])$:block-thresh=^"HB_INT_REG"$:"
"block-width=^"HB_INT_REG"$:block-height=^"HB_INT_REG"$:"
"disable=^"HB_BOOL_REG"$";
hb_filter_object_t hb_filter_comb_detect_vt =
{
.id = HB_FILTER_COMB_DETECT_VT,
.enforce_order = 1,
.name = "Comb Detect (VideoToolbox)",
.settings = NULL,
.init = comb_detect_vt_init,
.work = comb_detect_vt_work,
.close = comb_detect_vt_close,
.settings_template = comb_detect_vt_template,
};
#define PREV 0
#define CURR 1
#define NEXT 2
static void store_buf(hb_filter_private_t *pv, hb_buffer_t *in)
{
if (pv->ref[PREV])
{
hb_buffer_close(&pv->ref[PREV]);
}
pv->ref[PREV] = pv->ref[CURR];
pv->ref[CURR] = pv->ref[NEXT];
pv->ref[NEXT] = in;
}
static int comb_detect_vt_init(hb_filter_object_t *filter,
hb_filter_init_t *init)
{
filter->private_data = calloc(1, sizeof(struct hb_filter_private_s));
if (filter->private_data == NULL)
{
hb_error("comb_detect_vt: calloc failed");
return -1;
}
hb_filter_private_t *pv = filter->private_data;
hb_buffer_list_clear(&pv->out_list);
pv->desc = av_pix_fmt_desc_get(init->pix_fmt);
pv->frames = 0;
pv->force_exaustive_check = 1;
pv->comb_heavy = 0;
pv->comb_light = 0;
pv->comb_none = 0;
pv->mode = MODE_GAMMA | MODE_FILTER;
pv->filter_mode = FILTER_ERODE_DILATE;
pv->spatial_metric = 2;
pv->motion_threshold = 3;
pv->spatial_threshold = 3;
pv->block_threshold = 40;
pv->block_width = 16;
pv->block_height = 16;
if (filter->settings)
{
// Start from the defaults so any key missing from the dict keeps its value
int motion_threshold = pv->motion_threshold;
int spatial_threshold = pv->spatial_threshold;
int block_threshold = pv->block_threshold;
hb_value_t *dict = filter->settings;
// Get comb detection settings
hb_dict_extract_int(&pv->mode, dict, "mode");
hb_dict_extract_int(&pv->spatial_metric, dict, "spatial-metric");
hb_dict_extract_int(&motion_threshold, dict, "motion-thresh");
hb_dict_extract_int(&spatial_threshold, dict, "spatial-thresh");
hb_dict_extract_int(&pv->filter_mode, dict, "filter-mode");
hb_dict_extract_int(&block_threshold, dict, "block-thresh");
hb_dict_extract_int(&pv->block_width, dict, "block-width");
hb_dict_extract_int(&pv->block_height, dict, "block-height");
pv->motion_threshold = motion_threshold;
pv->spatial_threshold = spatial_threshold;
pv->block_threshold = block_threshold;
}
pv->motion_threshold /= 255.f;
pv->spatial_threshold /= 255.f;
// Compute thresholds
pv->gamma_motion_threshold = pv->motion_threshold;
pv->gamma_spatial_threshold = pv->spatial_threshold;
pv->gamma_spatial_threshold6 = 6 * pv->gamma_spatial_threshold;
pv->spatial_threshold_squared = pv->spatial_threshold * pv->spatial_threshold;
pv->spatial_threshold6 = 6 * pv->spatial_threshold;
pv->comb32detect_min = 10 / 255.f;
pv->comb32detect_max = 15 / 255.f;
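// With the default spatial-thresh of 3: spatial_threshold = 3/255 ≈ 0.0118,
// spatial_threshold6 ≈ 0.0706, spatial_threshold_squared ≈ 1.4e-4;
// comb32detect_min/max are fixed at 10/255 ≈ 0.039 and 15/255 ≈ 0.059.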
if (pv->block_width > 32) { pv->block_width = 32; }
if (pv->block_height > 32) { pv->block_height = 32; }
if (pv->block_width < 8) { pv->block_width = 8; }
if (pv->block_height < 8) { pv->block_height = 8; }
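// hb_metal_context_init() builds the first compute pipeline (index 0) from the
// main detection kernel; the passes added below follow in the order that
// call_kernel() dispatches them.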
pv->mtl = hb_metal_context_init(hb_comb_detect_vt_metallib_data,
hb_comb_detect_vt_metallib_len,
pv->mode & MODE_GAMMA ? "comb_detect_gamma" : "comb_detect",
sizeof(struct mtl_comb_detect_params),
init->geometry.width, init->geometry.height,
init->pix_fmt, init->color_range);
if (pv->mtl == NULL)
{
hb_error("comb_detect_vt: failed to create Metal device");
return -1;
}
struct mtl_comb_detect_params *params = (struct mtl_comb_detect_params *)pv->mtl->params_buffer.contents;
*params = (struct mtl_comb_detect_params) {
.spatial_metric = pv->spatial_metric,
.motion_threshold = pv->motion_threshold,
.spatial_threshold = pv->spatial_threshold,
.block_threshold = pv->block_threshold,
.block_width = pv->block_width,
.block_height = pv->block_height,
.gamma_motion_threshold = pv->gamma_motion_threshold,
.gamma_spatial_threshold = pv->gamma_spatial_threshold,
.gamma_spatial_threshold6 = pv->gamma_spatial_threshold6,
.spatial_threshold_squared = pv->spatial_threshold_squared,
.spatial_threshold6 = pv->spatial_threshold6,
.comb32detect_min = pv->comb32detect_min,
.comb32detect_max = pv->comb32detect_max,
.force_exaustive_check = pv->force_exaustive_check
};
if (hb_metal_add_pipeline(pv->mtl, pv->filter_mode == FILTER_ERODE_DILATE ? "filter_erode_dilate" : "filter_classic",
pv->mtl->pipelines_count))
{
return -1;
}
if (hb_metal_add_pipeline(pv->mtl, "erode_mask", pv->mtl->pipelines_count))
{
return -1;
}
if (hb_metal_add_pipeline(pv->mtl, "dilate_mask", pv->mtl->pipelines_count))
{
return -1;
}
char *check_combing_name = pv->mode & MODE_FILTER ? "check_filtered_combing_mask" : "check_combing_mask";
if (@available(macOS 13, *))
{
if ([pv->mtl->device supportsFamily:MTLGPUFamilyMetal3] &&
((pv->block_width == 16 && pv->block_height == 16) || (pv->block_width == 32 && pv->block_height == 32)))
{
// Use simd_sum() to speed up the final reduction pass
check_combing_name = pv->mode & MODE_FILTER ? "check_filtered_combing_mask_simd" : "check_combing_mask_simd";
}
else if ([pv->mtl->device supportsFamily:MTLGPUFamilyCommon3] &&
(pv->block_width * pv->block_height) % 4 == 0)
{
// Use quad_sum() to speed up the final reduction pass
check_combing_name = pv->mode & MODE_FILTER ? "check_filtered_combing_mask_quad" : "check_combing_mask_quad";
}
}
if (hb_metal_add_pipeline(pv->mtl, check_combing_name, pv->mtl->pipelines_count))
{
return -1;
}
if (hb_metal_add_pipeline(pv->mtl, "apply_mask", pv->mtl->pipelines_count))
{
return -1;
}
// Allocate buffers to store the mask and the comb result
MTLTextureDescriptor *descriptor = [[MTLTextureDescriptor alloc] init];
descriptor.textureType = MTLTextureType2D;
descriptor.pixelFormat = MTLPixelFormatR8Uint;
descriptor.width = init->geometry.width;
descriptor.height = init->geometry.height;
descriptor.depth = 1;
descriptor.storageMode = MTLStorageModePrivate;
pv->mask = [pv->mtl->device newTextureWithDescriptor:descriptor];
pv->temp = [pv->mtl->device newTextureWithDescriptor:descriptor];
[descriptor release];
pv->combed = [pv->mtl->device newBufferWithLength:sizeof(uint32_t) options:MTLResourceStorageModeShared];
return 0;
}
static void comb_detect_vt_close(hb_filter_object_t *filter)
{
hb_filter_private_t *pv = filter->private_data;
if (pv == NULL)
{
return;
}
hb_log("comb detect: heavy %i | light %i | uncombed %i | total %i",
pv->comb_heavy, pv->comb_light, pv->comb_none, pv->frames);
[pv->combed release];
[pv->temp release];
[pv->mask release];
hb_metal_context_close(&pv->mtl);
for (int i = 0; i < 3; i++)
{
hb_buffer_close(&pv->ref[i]);
}
free(pv);
filter->private_data = NULL;
}
static void call_kernel(hb_filter_private_t *pv,
id<MTLTexture> prev,
id<MTLTexture> cur,
id<MTLTexture> next,
id<MTLTexture> dest)
{
id<MTLCommandBuffer> buffer = pv->mtl->queue.commandBuffer;
id<MTLComputeCommandEncoder> encoder = buffer.computeCommandEncoder;
int width = cur.width, height = cur.height;
struct mtl_comb_detect_params *params = (struct mtl_comb_detect_params *)pv->mtl->params_buffer.contents;
params->force_exaustive_check = pv->force_exaustive_check;
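// Pipeline order, as built in comb_detect_vt_init():
// 0: comb_detect / comb_detect_gamma   1: filter_classic / filter_erode_dilate
// 2: erode_mask   3: dilate_mask   4: check_*_combing_mask*   5: apply_mask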
[encoder setTexture:prev atIndex:0];
[encoder setTexture:cur atIndex:1];
[encoder setTexture:next atIndex:2];
[encoder setTexture:pv->mask atIndex:3];
[encoder setTexture:pv->temp atIndex:4];
if (pv->mode & MODE_MASK || pv->mode & MODE_COMPOSITE)
{
[encoder setTexture:dest atIndex:5];
}
[encoder setBuffer:pv->combed offset:0 atIndex:0];
[encoder setBuffer:pv->mtl->params_buffer offset:0 atIndex:1];
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[0], encoder, width, height);
if (pv->mode & MODE_FILTER)
{
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[1], encoder, width, height);
if (pv->filter_mode == FILTER_ERODE_DILATE)
{
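// Erode, dilate, then erode again: drop isolated mask pixels while
// keeping solid combed regions intact.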
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[2], encoder, width, height);
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[3], encoder, width, height);
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[2], encoder, width, height);
}
}
if (pv->mode & MODE_FILTER && pv->filter_mode == FILTER_CLASSIC)
{
[encoder setTexture:pv->temp atIndex:3];
}
hb_metal_compute_encoder_dispatch_fixed_threadgroup_size(pv->mtl->device, pv->mtl->pipelines[4], encoder,
width, height, pv->block_width, pv->block_height);
if (pv->mode & MODE_MASK || pv->mode & MODE_COMPOSITE)
{
hb_metal_compute_encoder_dispatch(pv->mtl->device, pv->mtl->pipelines[5], encoder, width, height);
}
[encoder endEncoding];
[buffer commit];
[buffer waitUntilCompleted];
}
static int analyze_frame(hb_filter_private_t *pv, hb_buffer_t **out)
{
CVReturn ret = kCVReturnSuccess;
CVPixelBufferRef cv_dest = NULL;
CVPixelBufferRef cv_prev = pv->ref[PREV] ? hb_cv_get_pixel_buffer(pv->ref[PREV]) : hb_cv_get_pixel_buffer(pv->ref[CURR]);
CVPixelBufferRef cv_cur = hb_cv_get_pixel_buffer(pv->ref[CURR]);
CVPixelBufferRef cv_next = pv->ref[NEXT] ? hb_cv_get_pixel_buffer(pv->ref[NEXT]) : hb_cv_get_pixel_buffer(pv->ref[CURR]);
if (cv_prev == NULL || cv_cur == NULL || cv_next == NULL)
{
hb_log("comb_detect_vt: extract_buf failed");
goto fail;
}
const AVComponentDescriptor *comp = &pv->desc->comp[0];
int channels;
const MTLPixelFormat format = hb_metal_pix_fmt_from_component(comp, &channels);
if (format == MTLPixelFormatInvalid)
{
goto fail;
}
CVMetalTextureRef dest = NULL;
id<MTLTexture> tex_dest = nil;
if (pv->mode & MODE_MASK || pv->mode & MODE_COMPOSITE)
{
ret = CVPixelBufferPoolCreatePixelBuffer(kCFAllocatorDefault, pv->mtl->pool, &cv_dest);
if (ret != kCVReturnSuccess)
{
hb_log("comb_detect_vt: CVPixelBufferPoolCreatePixelBuffer failed");
goto fail;
}
dest = hb_metal_create_texture_from_pixbuf(pv->mtl->cache, cv_dest, 0, format);
tex_dest = CVMetalTextureGetTexture(dest);
}
CVMetalTextureRef prev = hb_metal_create_texture_from_pixbuf(pv->mtl->cache, cv_prev, 0, format);
CVMetalTextureRef cur = hb_metal_create_texture_from_pixbuf(pv->mtl->cache, cv_cur, 0, format);
CVMetalTextureRef next = hb_metal_create_texture_from_pixbuf(pv->mtl->cache, cv_next, 0, format);
id<MTLTexture> tex_prev = CVMetalTextureGetTexture(prev);
id<MTLTexture> tex_cur = CVMetalTextureGetTexture(cur);
id<MTLTexture> tex_next = CVMetalTextureGetTexture(next);
uint32_t *combed = pv->combed.contents;
*combed = HB_COMB_NONE;
call_kernel(pv, tex_prev, tex_cur, tex_next, tex_dest);
CFRelease(prev);
CFRelease(cur);
CFRelease(next);
if (pv->mode & MODE_MASK || pv->mode & MODE_COMPOSITE)
{
CFRelease(dest);
CVBufferPropagateAttachments(cv_cur, cv_dest);
*out = hb_buffer_wrapper_init();
(*out)->storage_type = COREMEDIA;
(*out)->storage = cv_dest;
(*out)->f.width = pv->ref[CURR]->f.width;
(*out)->f.height = pv->ref[CURR]->f.height;
(*out)->f.fmt = pv->ref[CURR]->f.fmt;
(*out)->f.color_prim = pv->ref[CURR]->f.color_prim;
(*out)->f.color_transfer = pv->ref[CURR]->f.color_transfer;
(*out)->f.color_matrix = pv->ref[CURR]->f.color_matrix;
(*out)->f.color_range = pv->ref[CURR]->f.color_range;
(*out)->f.chroma_location = pv->ref[CURR]->f.chroma_location;
hb_buffer_copy_props(*out, pv->ref[CURR]);
}
return *combed;
fail:
return -1;
}
static void process_frame(hb_filter_private_t *pv)
{
int combed = 0;
hb_buffer_t *out = NULL;
@autoreleasepool
{
combed = analyze_frame(pv, &out);
}
switch (combed)
{
case HB_COMB_HEAVY:
pv->comb_heavy++;
break;
case HB_COMB_LIGHT:
pv->comb_light++;
break;
case HB_COMB_NONE:
default:
pv->comb_none++;
break;
}
pv->frames++;
pv->ref[CURR]->s.combed = combed;
if (out)
{
hb_buffer_list_append(&pv->out_list, out);
}
else
{
hb_buffer_list_append(&pv->out_list, hb_vt_buffer_dup(pv->ref[CURR]));
}
pv->force_exaustive_check = 0;
}
static int comb_detect_vt_work(hb_filter_object_t *filter,
hb_buffer_t **buf_in,
hb_buffer_t **buf_out)
{
hb_filter_private_t *pv = filter->private_data;
hb_buffer_t *in = *buf_in;
*buf_in = NULL;
if (in->s.flags & HB_BUF_FLAG_EOF)
{
store_buf(pv, NULL);
pv->force_exaustive_check = 1;
process_frame(pv);
hb_buffer_list_append(&pv->out_list, in);
*buf_out = hb_buffer_list_clear(&pv->out_list);
return HB_FILTER_DONE;
}
store_buf(pv, in);
if (pv->ref[CURR] == NULL)
{
// Wait for next buffer
return HB_FILTER_DELAY;
}
process_frame(pv);
*buf_out = hb_buffer_list_clear(&pv->out_list);
return *buf_out == NULL ? HB_FILTER_FAILED : HB_FILTER_OK;
}
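
The only output of this filter that later stages care about is the per-frame verdict process_frame() stores in s.combed (plus the optional mask/composite output). A minimal sketch of how a downstream deinterlacer might consult that verdict; the helper name and the decision here are illustrative, not taken from yadif_vt or bwdif_vt:

// Illustrative only: decide whether a frame needs deinterlacing based on
// the flag written by comb_detect_vt's process_frame().
static int frame_needs_deinterlace(const hb_buffer_t *in)
{
    switch (in->s.combed)
    {
        case HB_COMB_HEAVY:
        case HB_COMB_LIGHT:
            return 1;   // visible combing detected: deinterlace this frame
        case HB_COMB_NONE:
        default:
            return 0;   // no combing detected: pass the frame through
    }
}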

View File

@@ -0,0 +1,555 @@
/* comb_detect.metal
Copyright (c) 2003-2023 HandBrake Team
This file is part of the HandBrake source code
Homepage: <http://handbrake.fr/>.
It may be used under the terms of the GNU General Public License v2.
For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
*/
#include <metal_stdlib>
#include <metal_integer>
#include <metal_texture>
using namespace metal;
#define HB_COMB_NONE 0
#define HB_COMB_LIGHT 1
#define HB_COMB_HEAVY 2
/*
* Parameters
*/
struct params {
int spatial_metric;
float motion_threshold;
float spatial_threshold;
int block_threshold;
int block_width;
int block_height;
float gamma_motion_threshold;
float gamma_spatial_threshold;
float gamma_spatial_threshold6;
float spatial_threshold_squared;
float spatial_threshold6;
float comb32detect_min;
float comb32detect_max;
bool force_exaustive_check;
};
/*
* Texture access helpers
*/
constexpr sampler s(coord::pixel);
template <typename T>
T tex2D(texture2d<float, access::sample> tex, short2 pos)
{
return tex.sample(s, float2(pos)).x;
}
template <typename T>
T tex2D(texture2d<float, access::sample> tex, ushort2 pos)
{
return tex.sample(s, float2(pos)).x;
}
template <typename T>
T tex2D(texture2d<ushort, access::sample> tex, short2 pos)
{
return tex.sample(s, float2(pos)).x;
}
template <typename T>
T tex2D(texture2d<ushort, access::sample> tex, ushort2 pos)
{
return tex.sample(s, float2(pos)).x;
}
template <typename T>
T tex2D(texture2d<float, access::read> tex, uint x, uint y)
{
return tex.read(uint2(x, y)).x;
}
template <typename T>
T tex2D(texture2d<ushort, access::read> tex, ushort2 pos)
{
return tex.read(pos).x;
}
constexpr sampler szero(coord::pixel, address::clamp_to_zero);
template <typename T>
T tex2Dc(texture2d<ushort, access::sample> tex, short2 pos)
{
return tex.sample(szero, float2(pos)).x;
}
template <typename T>
T tex2Dc(texture2d<ushort, access::sample> tex, ushort2 pos)
{
return tex.sample(szero, float2(pos)).x;
}
/*
* Comb detect helpers
*/
void write_result(
device atomic_int *combed,
uchar block_threshold,
ushort block_score)
{
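// Upgrade the global verdict: any block scoring above block_threshold marks the
// frame HB_COMB_HEAVY; a block scoring at least half the threshold upgrades
// HB_COMB_NONE to HB_COMB_LIGHT, but an existing HB_COMB_HEAVY is never downgraded.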
int current = atomic_load_explicit(combed, memory_order_relaxed);
if (current == HB_COMB_HEAVY) {
return;
}
if (block_score >= (block_threshold / 2)) {
if (block_score > block_threshold) {
atomic_store_explicit(combed, HB_COMB_HEAVY, memory_order_relaxed);
} else {
atomic_compare_exchange_weak_explicit(combed, &current, HB_COMB_LIGHT, memory_order_relaxed, memory_order_relaxed);
}
}
}
template <typename T>
T gamma(T value) {
return pow(value, 2.2f);
}
template <typename T>
void detect_gamma_combed_segment(
texture2d<float, access::sample> prev,
texture2d<float, access::sample> cur,
texture2d<float, access::sample> next,
texture2d<ushort, access::write> mask,
constant params& p,
ushort2 pos)
{
// A mishmash of various comb detection tricks
// picked up from neuron2's Decomb plugin for
// AviSynth and tritical's IsCombedT and
// IsCombedTIVTC plugins.
// Comb scoring algorithm
const float mthresh = p.gamma_motion_threshold;
const float athresh = p.gamma_spatial_threshold;
const float athresh6 = p.gamma_spatial_threshold6;
// These are just to make the buffer locations easier to read.
const short2 up_2 = short2(pos.x, pos.y -2);
const short2 up_1 = short2(pos.x, pos.y -1);
const short2 down_1 = short2(pos.x, pos.y +1);
const short2 down_2 = short2(pos.x, pos.y +2);
const T up_diff = gamma(tex2D<T>(cur, pos)) - gamma(tex2D<T>(cur, up_1));
const T down_diff = gamma(tex2D<T>(cur, pos)) - gamma(tex2D<T>(cur, down_1));
mask.write(0, pos);
if ((up_diff > athresh && down_diff > athresh) ||
(up_diff < -athresh && down_diff < -athresh)) {
// The pixel above and below are different,
// and they change in the same "direction" too.
bool motion = false;
if (mthresh > 0) {
// Make sure there's sufficient motion between frame t-1 and frame t+1.
if (abs(gamma(tex2D<T>(prev, pos)) - gamma(tex2D<T>(cur, pos))) > mthresh &&
abs(gamma(tex2D<T>(cur, up_1)) - gamma(tex2D<T>(next, up_1))) > mthresh &&
abs(gamma(tex2D<T>(cur, down_1)) - gamma(tex2D<T>(next, down_1))) > mthresh) {
motion = true;
}
if (abs(gamma(tex2D<T>(next, pos)) - gamma(tex2D<T>(cur, pos))) > mthresh &&
abs(gamma(tex2D<T>(prev, up_1)) - gamma(tex2D<T>(cur, up_1))) > mthresh &&
abs(gamma(tex2D<T>(prev, down_1)) - gamma(tex2D<T>(cur, down_1))) > mthresh) {
motion = true;
}
} else {
// User doesn't want to check for motion,
// so move on to the spatial check
motion = true;
}
// If motion, or we can't measure motion yet…
if (motion || p.force_exaustive_check) {
// Tritical's noise-resistant combing scorer
// The check is done on a bob+blur convolution
const T combing = abs(gamma(tex2D<T>(cur, up_2))
+ (4 * gamma(tex2D<T>(cur, pos)))
+ gamma(tex2D<T>(cur, down_2))
- (3 * (gamma(tex2D<T>(cur, up_1))
+ gamma(tex2D<T>(cur, down_1)))));
// If the frame is sufficiently combed,
// then mark it down on the mask as 1.
if (combing > athresh6) {
mask.write(1, pos);
}
}
}
}
template <typename T>
void detect_combed_segment(
texture2d<float, access::sample> prev,
texture2d<float, access::sample> cur,
texture2d<float, access::sample> next,
texture2d<ushort, access::write> mask,
constant params& p,
ushort2 pos)
{
// A mishmash of various comb detection tricks
// picked up from neuron2's Decomb plugin for
// AviSynth and tritical's IsCombedT and
// IsCombedTIVTC plugins.
// Comb scoring algorithm
const float mthresh = p.motion_threshold;
const float athresh = p.spatial_threshold;
const float athresh_squared = p.spatial_threshold_squared;
const float athresh6 = p.spatial_threshold6;
// These are just to make the buffer locations easier to read.
const short2 up_2 = short2(pos.x, pos.y -2);
const short2 up_1 = short2(pos.x, pos.y -1);
const short2 down_1 = short2(pos.x, pos.y +1);
const short2 down_2 = short2(pos.x, pos.y +2);
const float up_diff = tex2D<T>(cur, pos) - tex2D<T>(cur, up_1);
const float down_diff = tex2D<T>(cur, pos) - tex2D<T>(cur, down_1);
mask.write(0, pos);
if ((up_diff > athresh && down_diff > athresh) ||
(up_diff < -athresh && down_diff < -athresh)) {
// The pixel above and below are different,
// and they change in the same "direction" too.
bool motion = false;
if (mthresh > 0) {
// Make sure there's sufficient motion between frame t-1 and frame t+1.
if (abs(tex2D<T>(prev, pos) - tex2D<T>(cur, pos)) > mthresh &&
abs(tex2D<T>(cur, up_1) - tex2D<T>(next, up_1)) > mthresh &&
abs(tex2D<T>(cur, down_1) - tex2D<T>(next, down_1)) > mthresh) {
motion = true;
}
if (abs(tex2D<T>(next, pos) - tex2D<T>(cur, pos)) > mthresh &&
abs(tex2D<T>(prev, up_1) - tex2D<T>(cur, up_1)) > mthresh &&
abs(tex2D<T>(prev, down_1) - tex2D<T>(cur, down_1)) > mthresh) {
motion = true;
}
} else {
// User doesn't want to check for motion,
// so move on to the spatial check
motion = true;
}
// If motion, or we can't measure motion yet…
if (motion || p.force_exaustive_check) {
// That means it's time for the spatial check
// We've got several options here
if (p.spatial_metric == 0) {
// Simple 32detect style comb detection.
if ((abs(tex2D<T>(cur, pos) - tex2D<T>(cur, down_2)) < p.comb32detect_min) &&
(abs(tex2D<T>(cur, pos) - tex2D<T>(cur, down_1)) > p.comb32detect_max)) {
mask.write(1, pos);
}
} else if (p.spatial_metric == 1) {
// This, for comparison, is what IsCombed uses
// It's better, but still noise sensitive
const T combing = (tex2D<T>(cur, up_1) - tex2D<T>(cur, pos)) *
(tex2D<T>(cur, down_1) - tex2D<T>(cur, pos));
if (combing > athresh_squared) {
mask.write(1, pos);
}
} else if (p.spatial_metric == 2) {
// Tritical's noise-resistant combing scorer
// The check is done on a bob+blur convolution
const T combing = abs(tex2D<T>(cur, up_2)
+ (4 * tex2D<T>(cur, pos))
+ tex2D<T>(cur, down_2)
- (3 * (tex2D<T>(cur, up_1)
+ tex2D<T>(cur, down_1))));
// If the frame is sufficiently combed,
// then mark it down on the mask as 1.
if (combing > athresh6) {
mask.write(1, pos);
}
}
}
}
}
/*
* Kernel dispatch
*/
kernel void apply_mask(
texture2d<ushort, access::read> mask [[texture(3)]],
texture2d<half, access::write> dst [[texture(5)]],
ushort2 pos [[thread_position_in_grid]])
{
auto value = tex2D<ushort>(mask, pos) ? 1.h : 0.h;
dst.write(value, pos);
}
kernel void check_filtered_combing_mask_simd(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]],
ushort sid [[simdgroup_index_in_threadgroup]],
ushort w [[simdgroups_per_threadgroup]])
{
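// One partial sum per simdgroup: 32 slots cover the 1024-thread threadgroup
// maximum at 32 threads per simdgroup.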
threadgroup ushort partial_score[32];
ushort value = tex2Dc<ushort>(mask, pos);
partial_score[sid] = simd_sum(value);
threadgroup_barrier(mem_flags::mem_threadgroup);
if (sid == 0) {
ushort block_score = 0;
for (uchar i = 0; i < w; i++) {
block_score += partial_score[i];
}
write_result(combed, p.block_threshold, block_score);
}
}
kernel void check_filtered_combing_mask_quad(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]],
ushort qid [[quadgroup_index_in_threadgroup]],
ushort w [[quadgroups_per_threadgroup]])
{
threadgroup ushort partial_score[256];
ushort value = tex2Dc<ushort>(mask, pos);
partial_score[qid] = quad_sum(value);
threadgroup_barrier(mem_flags::mem_threadgroup);
if (qid == 0) {
ushort block_score = 0;
for (uchar i = 0; i < w; i++) {
block_score += partial_score[i];
}
write_result(combed, p.block_threshold, block_score);
}
}
kernel void check_filtered_combing_mask(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]])
{
if (pos.x % p.block_width > 0 || pos.y % p.block_height > 0) {
return;
}
ushort block_score = 0;
for (uchar x = 0; x < p.block_width; x++) {
for (uchar y = 0; y < p.block_height; y++) {
ushort2 block_pos = ushort2(pos.x + x, pos.y + y);
block_score += tex2Dc<ushort>(mask, block_pos);
}
}
write_result(combed, p.block_threshold, block_score);
}
kernel void check_combing_mask_simd(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]],
ushort sid [[simdgroup_index_in_threadgroup]],
ushort w [[simdgroups_per_threadgroup]])
{
threadgroup ushort partial_score[32];
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
ushort value = tex2Dc<ushort>(mask, left) & tex2Dc<ushort>(mask, pos) & tex2Dc<ushort>(mask, right);
partial_score[sid] = simd_sum(value);
threadgroup_barrier(mem_flags::mem_threadgroup);
if (sid == 0) {
ushort block_score = 0;
for (uchar i = 0; i < w; i++) {
block_score += partial_score[i];
}
write_result(combed, p.block_threshold, block_score);
}
}
kernel void check_combing_mask_quad(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]],
ushort qid [[quadgroup_index_in_threadgroup]],
ushort w [[quadgroups_per_threadgroup]])
{
threadgroup ushort partial_score[256];
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
ushort value = tex2Dc<ushort>(mask, left) & tex2Dc<ushort>(mask, pos) & tex2Dc<ushort>(mask, right);
partial_score[qid] = quad_sum(value);
threadgroup_barrier(mem_flags::mem_threadgroup);
if (qid == 0) {
ushort block_score = 0;
for (uchar i = 0; i < w; i++) {
block_score += partial_score[i];
}
write_result(combed, p.block_threshold, block_score);
}
}
kernel void check_combing_mask(
texture2d<ushort, access::sample> mask [[texture(3)]],
device atomic_int *combed [[buffer(0)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]])
{
if (pos.x % p.block_width > 0 || pos.y % p.block_height > 0) {
return;
}
ushort block_score = 0;
for (uchar x = 0; x < p.block_width; x++) {
for (uchar y = 0; y < p.block_height; y++) {
const ushort2 block_pos = ushort2(pos.x + x, pos.y + y);
const short2 left = short2(pos.x -1 +x, pos.y +y);
const short2 right = short2(pos.x +1 +x, pos.y +y);
block_score += tex2Dc<ushort>(mask, left) & tex2Dc<ushort>(mask, block_pos) & tex2Dc<ushort>(mask, right);
}
}
write_result(combed, p.block_threshold, block_score);
}
kernel void dilate_mask(
texture2d<ushort, access::sample> src [[texture(3)]],
texture2d<ushort, access::write> dst [[texture(4)]],
ushort2 pos [[thread_position_in_grid]])
{
if (tex2D<ushort>(src, pos)) {
dst.write(1, pos);
return;
}
const short2 up = short2(pos.x, pos.y -1);
const short2 up_left = short2(pos.x -1, pos.y -1);
const short2 up_right = short2(pos.x +1, pos.y -1);
const short2 down = short2(pos.x, pos.y +1);
const short2 down_left = short2(pos.x -1, pos.y +1);
const short2 down_right = short2(pos.x +1, pos.y +1);
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
constexpr uchar dilation_threshold = 4;
const uchar count = tex2D<ushort>(src, up_left) + tex2D<ushort>(src, up) + tex2D<ushort>(src, up_right) +
tex2D<ushort>(src, left) + tex2D<ushort>(src, right) +
tex2D<ushort>(src, down_left) + tex2D<ushort>(src, down) + tex2D<ushort>(src, down_right);
dst.write(count >= dilation_threshold, pos);
}
kernel void erode_mask(
texture2d<ushort, access::sample> src [[texture(4)]],
texture2d<ushort, access::write> dst [[texture(3)]],
ushort2 pos [[thread_position_in_grid]])
{
if (tex2D<ushort>(src, pos) == 0) {
dst.write(0, pos);
return;
}
const short2 up = short2(pos.x, pos.y -1);
const short2 up_left = short2(pos.x -1, pos.y -1);
const short2 up_right = short2(pos.x +1, pos.y -1);
const short2 down = short2(pos.x, pos.y +1);
const short2 down_left = short2(pos.x -1, pos.y +1);
const short2 down_right = short2(pos.x +1, pos.y +1);
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
constexpr uchar erosion_threshold = 2;
const uchar count = tex2D<ushort>(src, up_left) + tex2D<ushort>(src, up) + tex2D<ushort>(src, up_right) +
tex2D<ushort>(src, left) + tex2D<ushort>(src, right) +
tex2D<ushort>(src, down_left) + tex2D<ushort>(src, down) + tex2D<ushort>(src, down_right);
dst.write(count >= erosion_threshold, pos);
}
kernel void filter_classic(
texture2d<ushort, access::sample> src [[texture(3)]],
texture2d<ushort, access::write> dst [[texture(4)]],
ushort2 pos [[thread_position_in_grid]])
{
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
const uchar h_count = tex2D<ushort>(src, left) & tex2D<ushort>(src, pos) & tex2D<ushort>(src, right);
dst.write(h_count, pos);
}
kernel void filter_erode_dilate(
texture2d<ushort, access::sample> src [[texture(3)]],
texture2d<ushort, access::write> dst [[texture(4)]],
ushort2 pos [[thread_position_in_grid]])
{
const short2 up = short2(pos.x, pos.y -1);
const short2 down = short2(pos.x, pos.y +1);
const short2 left = short2(pos.x -1, pos.y);
const short2 right = short2(pos.x +1, pos.y);
const uchar h_count = tex2D<ushort>(src, left) & tex2D<ushort>(src, pos) & tex2D<ushort>(src, right);
const uchar v_count = tex2D<ushort>(src, up) & tex2D<ushort>(src, pos) & tex2D<ushort>(src, down);
dst.write(h_count & v_count, pos);
}
kernel void comb_detect(
texture2d<float, access::sample> prev [[texture(0)]],
texture2d<float, access::sample> cur [[texture(1)]],
texture2d<float, access::sample> next [[texture(2)]],
texture2d<ushort, access::write> mask [[texture(3)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]])
{
detect_combed_segment<float>(prev, cur, next, mask, p, pos);
}
kernel void comb_detect_gamma(
texture2d<float, access::sample> prev [[texture(0)]],
texture2d<float, access::sample> cur [[texture(1)]],
texture2d<float, access::sample> next [[texture(2)]],
texture2d<ushort, access::write> mask [[texture(3)]],
constant params& p [[buffer(1)]],
ushort2 pos [[thread_position_in_grid]])
{
detect_gamma_combed_segment<float>(prev, cur, next, mask, p, pos);
}
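
One host-side detail worth noting: the check_*_combing_mask kernels assume each threadgroup spans exactly one block_width x block_height scoring block, which is what the fixed-threadgroup dispatch in call_kernel() arranges. A rough sketch of what hb_metal_compute_encoder_dispatch_fixed_threadgroup_size() is assumed to boil down to (an assumption about that helper, not its actual body):

// Assumed shape of the reduction-pass dispatch: one threadgroup per scoring block,
// so simd_sum()/quad_sum() can reduce a whole block without crossing threadgroups.
MTLSize grid  = MTLSizeMake(width, height, 1);
MTLSize group = MTLSizeMake(block_width, block_height, 1);
[encoder dispatchThreads:grid threadsPerThreadgroup:group];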

View File

@@ -403,6 +403,8 @@ int hb_vt_are_filters_supported(hb_list_t *filters)
switch (filter->id)
{
case HB_FILTER_PRE_VT:
case HB_FILTER_COMB_DETECT:
case HB_FILTER_COMB_DETECT_VT:
case HB_FILTER_YADIF:
case HB_FILTER_YADIF_VT:
case HB_FILTER_BWDIF:
@@ -498,6 +500,7 @@ void hb_vt_setup_hw_filters(hb_job_t *job)
hb_filter_object_t *filter = hb_filter_init(HB_FILTER_PRE_VT);
hb_add_filter(job, filter, NULL);
replace_filter(job, HB_FILTER_COMB_DETECT, HB_FILTER_COMB_DETECT_VT);
replace_filter(job, HB_FILTER_YADIF, HB_FILTER_YADIF_VT);
replace_filter(job, HB_FILTER_BWDIF, HB_FILTER_BWDIF_VT);
replace_filter(job, HB_FILTER_CROP_SCALE, HB_FILTER_CROP_SCALE_VT);

View File

@@ -16,6 +16,7 @@ namespace HandBrake.Interop.Interop.HbLib
// First, filters that may change the framerate (drop or dup frames)
HB_FILTER_DETELECINE,
HB_FILTER_COMB_DETECT,
HB_FILTER_COMB_DETECT_VT,
HB_FILTER_DECOMB,
HB_FILTER_YADIF,
HB_FILTER_YADIF_VT,