perf(native): use simd to speedup audio buffer mix (#11717)
Run `cargo bench -p affine_media_capture`

```
test result: ok. 0 passed; 0 failed; 6 ignored; 0 measured; 0 filtered out; finished in 0.00s

     Running benches/mix_audio_samples.rs (target/release/deps/mix_audio_samples-ffbc55dcf90d3468)
audio mix/simd          time:   [98.380 ns 99.339 ns 100.57 ns]
                        change: [−19.199% −16.928% −14.569%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 12 outliers among 100 measurements (12.00%)
  6 (6.00%) high mild
  6 (6.00%) high severe
audio mix/scalar        time:   [123.99 ns 126.11 ns 128.71 ns]
                        change: [+0.2703% +1.2739% +2.5727%] (p = 0.02 < 0.05)
                        Change within noise threshold.
Found 11 outliers among 100 measurements (11.00%)
  4 (4.00%) high mild
  7 (7.00%) high severe
```
This commit is contained in:
parent
e0970daa5a
commit
fe86722845
@ -5,7 +5,7 @@ rustflags = ["-C", "target-feature=+crt-static"]
|
||||
[target.'cfg(target_os = "linux")']
|
||||
rustflags = ["-C", "link-args=-Wl,--warn-unresolved-symbols"]
|
||||
[target.'cfg(target_os = "macos")']
|
||||
rustflags = ["-C", "link-args=-all_load", "-C", "link-args=-weak_framework ScreenCaptureKit"]
|
||||
rustflags = ["-C", "link-args=-Wl,-undefined,dynamic_lookup,-no_fixup_chains", "-C", "link-args=-all_load", "-C", "link-args=-weak_framework ScreenCaptureKit"]
|
||||
# https://sourceware.org/bugzilla/show_bug.cgi?id=21032
|
||||
# https://sourceware.org/bugzilla/show_bug.cgi?id=21031
|
||||
# https://github.com/rust-lang/rust/issues/134820
|
||||
|
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -79,6 +79,7 @@ dependencies = [
|
||||
"block2",
|
||||
"core-foundation",
|
||||
"coreaudio-rs",
|
||||
"criterion2",
|
||||
"dispatch2",
|
||||
"libc",
|
||||
"napi",
|
||||
|
@ -6,6 +6,10 @@ version = "0.0.0"
|
||||
[lib]
|
||||
crate-type = ["cdylib", "rlib"]
|
||||
|
||||
[[bench]]
|
||||
harness = false
|
||||
name = "mix_audio_samples"
|
||||
|
||||
[dependencies]
|
||||
napi = { workspace = true, features = ["napi4"] }
|
||||
napi-derive = { workspace = true, features = ["type-def"] }
|
||||
@ -24,5 +28,8 @@ objc2-foundation = { workspace = true }
|
||||
screencapturekit = { workspace = true }
|
||||
uuid = { workspace = true, features = ["v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
criterion2 = { workspace = true }
|
||||
|
||||
[build-dependencies]
|
||||
napi-build = { workspace = true }
|
||||
|
@ -0,0 +1,55 @@
|
||||
#[cfg(target_os = "macos")]
|
||||
use std::hint::black_box;
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
use affine_media_capture::macos::audio_buffer::{mix_audio_samples, mix_audio_samples_scalar};
|
||||
#[cfg(target_os = "macos")]
|
||||
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
/// Builds a deterministic 1024-sample test signal.
///
/// Sample `i` is evaluated at `t = i / 1024` and is the sum of three partials:
/// a sine at 5 cycles (amplitude 0.008), a cosine at 10 cycles (amplitude
/// 0.004) and a sine at 20 cycles (amplitude 0.002).
fn generate_test_samples() -> [f32; 1024] {
    std::array::from_fn(|i| {
        let t = i as f32 / 1024.0;
        // One full turn scaled by the position inside the buffer; each partial
        // multiplies this by its own cycle count.
        let turn = 2.0 * std::f32::consts::PI * t;

        0.008 * (turn * 5.0).sin() + 0.004 * (turn * 10.0).cos() + 0.002 * (turn * 20.0).sin()
    })
}
|
||||
|
||||
/// Benchmarks the SIMD-accelerated mixer against the scalar reference on two
/// identical 1024-sample buffers.
///
/// Both variants allocate their result buffer inside the timed closure so the
/// two measurements include the same allocation cost.
#[cfg(target_os = "macos")]
fn bench_audio_mix(c: &mut Criterion) {
    let mut group = c.benchmark_group("audio mix");

    let input = generate_test_samples();
    let output = generate_test_samples();

    group.bench_function(BenchmarkId::from_parameter("simd"), |b| {
        b.iter(|| {
            // Hide the inputs from the optimizer so the mix cannot be
            // specialised for (or hoisted out of) the constant test buffers.
            let mixed = mix_audio_samples(black_box(input.as_slice()), black_box(output.as_slice()));
            black_box(mixed);
        });
    });

    group.bench_function(BenchmarkId::from_parameter("scalar"), |b| {
        b.iter(|| {
            let mut mixed = vec![0.0; input.len()];
            mix_audio_samples_scalar(
                black_box(input.as_slice()),
                black_box(output.as_slice()),
                &mut mixed,
                0,
                input.len(),
            );
            black_box(mixed);
        });
    });

    // Close the group explicitly so its report is emitted here rather than
    // whenever the group happens to be dropped.
    group.finish();
}
|
||||
|
||||
#[cfg(target_os = "macos")]
|
||||
criterion_group!(benches, bench_audio_mix);
|
||||
#[cfg(target_os = "macos")]
|
||||
criterion_main!(benches);
|
||||
|
||||
#[cfg(not(target_os = "macos"))]
|
||||
// The benchmark target is declared with `harness = false`, so it must supply
// its own entry point; off macOS there is nothing to measure, hence a no-op.
fn main() {}
|
@ -5,6 +5,141 @@ use objc2::{Encode, Encoding, RefEncode};
|
||||
|
||||
use crate::{error::CoreAudioError, utils::process_audio_frame};
|
||||
|
||||
/// Gain applied to every output-stream sample before it is added to the
/// matching input-stream sample.
pub const AUDIO_MIX_OUTPUT_WEIGHT: f32 = 0.75;

/// Mix audio samples using scalar operations (no SIMD).
///
/// For every index in `start_index..end_index` this stores
/// `input + output * AUDIO_MIX_OUTPUT_WEIGHT` into `mixed_samples`. An index
/// that lies past the end of `input_samples` or `output_samples` contributes
/// `0.0` for that stream. Elements of `mixed_samples` outside the range are
/// left untouched.
///
/// # Arguments
/// * `input_samples` - Samples from the input stream
/// * `output_samples` - Samples from the output stream
/// * `mixed_samples` - Buffer to store the result (must be pre-allocated)
/// * `start_index` - Starting index in the buffers
/// * `end_index` - Ending index in the buffers (exclusive)
///
/// # Panics
/// Panics if the range contains an index that is not below
/// `mixed_samples.len()`.
pub fn mix_audio_samples_scalar(
    input_samples: &[f32],
    output_samples: &[f32],
    mixed_samples: &mut [f32],
    start_index: usize,
    end_index: usize,
) {
    // Deliberately an index loop: the original author measured a 4x slowdown
    // with the iterator form suggested by this lint.
    #[allow(clippy::needless_range_loop)]
    for sample_index in start_index..end_index {
        let sample_in = input_samples.get(sample_index).unwrap_or(&0.0);
        let sample_out = output_samples.get(sample_index).unwrap_or(&0.0);
        mixed_samples[sample_index] = sample_in + sample_out * AUDIO_MIX_OUTPUT_WEIGHT;
    }
}

/// Number of samples consumed per iteration of the NEON loop
/// (four vectors of four `f32` lanes each).
#[cfg(target_arch = "aarch64")]
const NEON_BLOCK_LEN: usize = 16;

/// Mixes as many complete 16-sample blocks as all three slices can supply and
/// returns the number of leading samples that were written.
#[cfg(target_arch = "aarch64")]
fn mix_full_blocks_simd(
    input_samples: &[f32],
    output_samples: &[f32],
    mixed_samples: &mut [f32],
) -> usize {
    use std::arch::aarch64::{vdupq_n_f32, vld1q_f32, vmlaq_f32, vst1q_f32};

    // Only indices present in *every* slice may be touched through raw
    // pointers; anything beyond that is left to the bounds-checked scalar path.
    let shared_length = input_samples
        .len()
        .min(output_samples.len())
        .min(mixed_samples.len());
    let simd_length = shared_length - shared_length % NEON_BLOCK_LEN;

    let input_ptr = input_samples.as_ptr();
    let output_ptr = output_samples.as_ptr();
    let result_ptr = mixed_samples.as_mut_ptr();

    // SAFETY: `simd_length` is a multiple of 16 that does not exceed the
    // length of any of the three slices, and the loop only runs while
    // `offset + 16 <= simd_length`, so every load and store below stays inside
    // its slice. `mixed_samples` is borrowed mutably and therefore cannot
    // alias the two inputs.
    unsafe {
        let output_weight_vec = vdupq_n_f32(AUDIO_MIX_OUTPUT_WEIGHT);

        let mut offset: usize = 0;
        while offset < simd_length {
            // Load 4 vectors of 4 floats from each stream.
            let in_vec1 = vld1q_f32(input_ptr.add(offset));
            let out_vec1 = vld1q_f32(output_ptr.add(offset));
            let in_vec2 = vld1q_f32(input_ptr.add(offset + 4));
            let out_vec2 = vld1q_f32(output_ptr.add(offset + 4));
            let in_vec3 = vld1q_f32(input_ptr.add(offset + 8));
            let out_vec3 = vld1q_f32(output_ptr.add(offset + 8));
            let in_vec4 = vld1q_f32(input_ptr.add(offset + 12));
            let out_vec4 = vld1q_f32(output_ptr.add(offset + 12));

            // Multiply-accumulate per lane: input + output * weight.
            let result1 = vmlaq_f32(in_vec1, out_vec1, output_weight_vec);
            let result2 = vmlaq_f32(in_vec2, out_vec2, output_weight_vec);
            let result3 = vmlaq_f32(in_vec3, out_vec3, output_weight_vec);
            let result4 = vmlaq_f32(in_vec4, out_vec4, output_weight_vec);

            // Store results.
            vst1q_f32(result_ptr.add(offset), result1);
            vst1q_f32(result_ptr.add(offset + 4), result2);
            vst1q_f32(result_ptr.add(offset + 8), result3);
            vst1q_f32(result_ptr.add(offset + 12), result4);

            offset += NEON_BLOCK_LEN;
        }
    }

    simd_length
}

/// Fallback for targets without the NEON path: nothing is mixed here, so the
/// caller processes the whole buffer with the scalar implementation.
#[cfg(not(target_arch = "aarch64"))]
fn mix_full_blocks_simd(
    _input_samples: &[f32],
    _output_samples: &[f32],
    _mixed_samples: &mut [f32],
) -> usize {
    0
}

/// Mix audio samples from the input and output streams.
///
/// Each result sample is `input + output * AUDIO_MIX_OUTPUT_WEIGHT`. On
/// `aarch64` the complete 16-sample blocks shared by both streams are mixed
/// with NEON intrinsics; everything else (short buffers, the tail, and the
/// part of the input that has no matching output) goes through
/// [`mix_audio_samples_scalar`].
///
/// # Arguments
/// * `input_samples` - Samples from the input stream
/// * `output_samples` - Samples from the output stream
///
/// # Returns
/// A vector with exactly `input_samples.len()` mixed samples. Output samples
/// beyond that length are ignored; where the output stream is shorter than
/// the input stream it is treated as `0.0`.
pub fn mix_audio_samples(input_samples: &[f32], output_samples: &[f32]) -> Vec<f32> {
    let mixed_samples_length = input_samples.len();
    let mut mixed_samples = vec![0.0; mixed_samples_length];

    let scalar_start = mix_full_blocks_simd(input_samples, output_samples, &mut mixed_samples);

    mix_audio_samples_scalar(
        input_samples,
        output_samples,
        &mut mixed_samples,
        scalar_start,
        mixed_samples_length,
    );

    mixed_samples
}
|
||||
|
||||
/// [Apple's documentation](https://developer.apple.com/documentation/coreaudiotypes/audiobuffer?language=objc)
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Copy, Debug, PartialEq)]
|
||||
@ -50,6 +185,9 @@ unsafe impl RefEncode for AudioBufferList {
|
||||
pub struct InputAndOutputAudioBufferList(pub AudioBufferList);
|
||||
|
||||
impl InputAndOutputAudioBufferList {
|
||||
/// # Safety
|
||||
///
|
||||
/// The caller must ensure that the input data is a valid AudioBufferList
|
||||
pub unsafe fn from_raw(in_input_data: *mut c_void) -> Result<Self, i32> {
|
||||
let buffer_list: AudioBufferList = unsafe { *in_input_data.cast() };
|
||||
if buffer_list.mNumberBuffers != 2 {
|
||||
@ -93,18 +231,157 @@ impl InputAndOutputAudioBufferList {
|
||||
return Err(CoreAudioError::ProcessAudioFrameFailed("output"));
|
||||
};
|
||||
|
||||
let mixed_samples_length = processed_samples_input
|
||||
.len()
|
||||
.max(processed_samples_output.len());
|
||||
|
||||
let mut mixed_samples = vec![0.0; mixed_samples_length];
|
||||
for (sample_index, mixed_sample) in mixed_samples.iter_mut().enumerate() {
|
||||
let sample_in = processed_samples_input.get(sample_index).unwrap_or(&0.0);
|
||||
let sample_out = processed_samples_output.get(sample_index).unwrap_or(&0.0);
|
||||
|
||||
*mixed_sample = (sample_in * 2.0 + sample_out * 1.5) / 2.0;
|
||||
}
|
||||
// Use the extracted mixing function with the const weights
|
||||
let mixed_samples = mix_audio_samples(&processed_samples_input, &processed_samples_output);
|
||||
|
||||
Ok(mixed_samples)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Two empty streams mix to an empty buffer.
    #[test]
    fn test_mix_audio_samples_empty() {
        let no_input: Vec<f32> = Vec::new();
        let no_output: Vec<f32> = Vec::new();

        assert_eq!(mix_audio_samples(&no_input, &no_output).len(), 0);
    }

    /// Streams of the same length are mixed sample by sample.
    #[test]
    fn test_mix_audio_samples_equal_length() {
        let input = vec![0.1, 0.2, 0.3, 0.4, 0.5];
        let output = vec![0.5, 0.4, 0.3, 0.2, 0.1];

        // Expected mix: input + output * 0.75
        let expected = [
            0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.4 + 0.2 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.5 + 0.1 * AUDIO_MIX_OUTPUT_WEIGHT,
        ];

        let mixed = mix_audio_samples(&input, &output);
        assert_eq!(mixed.len(), 5);

        for (i, actual) in mixed.iter().enumerate() {
            assert!(
                (actual - expected[i]).abs() < 1e-6,
                "Mismatch at index {}: expected {}, got {}",
                i,
                expected[i],
                actual
            );
        }
    }

    /// Where the output stream runs out, the input passes through unchanged.
    #[test]
    fn test_mix_audio_samples_input_longer() {
        let input = vec![0.1, 0.2, 0.3, 0.4, 0.5];
        let output = vec![0.5, 0.4, 0.3];

        let expected = [
            0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.4 + 0.0 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.5 + 0.0 * AUDIO_MIX_OUTPUT_WEIGHT,
        ];

        let mixed = mix_audio_samples(&input, &output);
        assert_eq!(mixed.len(), 5);

        for (i, actual) in mixed.iter().enumerate() {
            assert!(
                (actual - expected[i]).abs() < 1e-6,
                "Mismatch at index {}: expected {}, got {}",
                i,
                expected[i],
                actual
            );
        }
    }

    /// The weight is a constant, so this only checks the mix against values
    /// derived from `AUDIO_MIX_OUTPUT_WEIGHT`; custom weights cannot be passed.
    #[test]
    fn test_mix_audio_samples_custom_weights() {
        let input = vec![0.1, 0.2, 0.3];
        let output = vec![0.5, 0.4, 0.3];

        let expected = [
            0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT,
        ];

        let mixed = mix_audio_samples(&input, &output);

        for (i, actual) in mixed.iter().enumerate() {
            assert!(
                (actual - expected[i]).abs() < 1e-6,
                "Mismatch at index {}: expected {}, got {}",
                i,
                expected[i],
                actual
            );
        }
    }

    /// Buffers long enough for the SIMD loop must agree with the scalar
    /// reference, including lengths that are not a multiple of the block size.
    #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
    #[test]
    fn test_simd_implementation_used() {
        for size in [100usize, 127, 128, 512] {
            let input: Vec<f32> = (0..size).map(|i| i as f32 * 0.01).collect();
            let output: Vec<f32> = (0..size).map(|i| (size - i) as f32 * 0.01).collect();

            let mixed = mix_audio_samples(&input, &output);

            // Reference result from the scalar implementation.
            let mut expected = vec![0.0; input.len()];
            mix_audio_samples_scalar(&input, &output, &mut expected, 0, input.len());

            for (i, actual) in mixed.iter().enumerate() {
                assert!(
                    (actual - expected[i]).abs() < 1e-6,
                    "SIMD and scalar implementations should produce identical results at index {}",
                    i
                );
            }
        }
    }

    /// Buffers below the SIMD block size still produce the correct mix.
    #[test]
    fn test_small_vector_uses_scalar() {
        let input = vec![0.1, 0.2, 0.3];
        let output = vec![0.5, 0.4, 0.3];

        // Hand-computed reference values.
        let expected = [
            0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT,
        ];

        let mixed = mix_audio_samples(&input, &output);

        for (i, actual) in mixed.iter().enumerate() {
            assert!(
                (actual - expected[i]).abs() < 1e-6,
                "Small vector mixing should be correct at index {}",
                i
            );
        }
    }
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
pub(crate) mod audio_buffer;
|
||||
pub mod audio_buffer;
|
||||
pub mod audio_stream_basic_desc;
|
||||
pub mod av_audio_file;
|
||||
pub mod av_audio_format;
|
||||
|
Loading…
x
Reference in New Issue
Block a user