perf(native): use SIMD to speed up audio buffer mixing (#11717)

Run `cargo bench -p affine_media_capture`

```
test result: ok. 0 passed; 0 failed; 6 ignored; 0 measured; 0 filtered out; finished in 0.00s

     Running benches/mix_audio_samples.rs (target/release/deps/mix_audio_samples-ffbc55dcf90d3468)
audio mix/simd          time:   [98.380 ns 99.339 ns 100.57 ns]
                        change: [−19.199% −16.928% −14.569%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 12 outliers among 100 measurements (12.00%)
  6 (6.00%) high mild
  6 (6.00%) high severe
audio mix/scalar        time:   [123.99 ns 126.11 ns 128.71 ns]
                        change: [+0.2703% +1.2739% +2.5727%] (p = 0.02 < 0.05)
                        Change within noise threshold.
Found 11 outliers among 100 measurements (11.00%)
  4 (4.00%) high mild
  7 (7.00%) high severe
```
This commit is contained in:
Brooooooklyn 2025-04-16 00:42:30 +00:00
parent e0970daa5a
commit fe86722845
No known key found for this signature in database
GPG Key ID: 30B1140CE1C07C99
6 changed files with 353 additions and 13 deletions

View File

@ -5,7 +5,7 @@ rustflags = ["-C", "target-feature=+crt-static"]
[target.'cfg(target_os = "linux")'] [target.'cfg(target_os = "linux")']
rustflags = ["-C", "link-args=-Wl,--warn-unresolved-symbols"] rustflags = ["-C", "link-args=-Wl,--warn-unresolved-symbols"]
[target.'cfg(target_os = "macos")'] [target.'cfg(target_os = "macos")']
rustflags = ["-C", "link-args=-all_load", "-C", "link-args=-weak_framework ScreenCaptureKit"] rustflags = ["-C", "link-args=-Wl,-undefined,dynamic_lookup,-no_fixup_chains", "-C", "link-args=-all_load", "-C", "link-args=-weak_framework ScreenCaptureKit"]
# https://sourceware.org/bugzilla/show_bug.cgi?id=21032 # https://sourceware.org/bugzilla/show_bug.cgi?id=21032
# https://sourceware.org/bugzilla/show_bug.cgi?id=21031 # https://sourceware.org/bugzilla/show_bug.cgi?id=21031
# https://github.com/rust-lang/rust/issues/134820 # https://github.com/rust-lang/rust/issues/134820

1
Cargo.lock generated
View File

@ -79,6 +79,7 @@ dependencies = [
"block2", "block2",
"core-foundation", "core-foundation",
"coreaudio-rs", "coreaudio-rs",
"criterion2",
"dispatch2", "dispatch2",
"libc", "libc",
"napi", "napi",

View File

@ -6,6 +6,10 @@ version = "0.0.0"
[lib] [lib]
crate-type = ["cdylib", "rlib"] crate-type = ["cdylib", "rlib"]
[[bench]]
harness = false
name = "mix_audio_samples"
[dependencies] [dependencies]
napi = { workspace = true, features = ["napi4"] } napi = { workspace = true, features = ["napi4"] }
napi-derive = { workspace = true, features = ["type-def"] } napi-derive = { workspace = true, features = ["type-def"] }
@ -24,5 +28,8 @@ objc2-foundation = { workspace = true }
screencapturekit = { workspace = true } screencapturekit = { workspace = true }
uuid = { workspace = true, features = ["v4"] } uuid = { workspace = true, features = ["v4"] }
[dev-dependencies]
criterion2 = { workspace = true }
[build-dependencies] [build-dependencies]
napi-build = { workspace = true } napi-build = { workspace = true }

View File

@ -0,0 +1,55 @@
#[cfg(target_os = "macos")]
use std::hint::black_box;
#[cfg(target_os = "macos")]
use affine_media_capture::macos::audio_buffer::{mix_audio_samples, mix_audio_samples_scalar};
#[cfg(target_os = "macos")]
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
#[cfg(target_os = "macos")]
/// Builds the deterministic 1024-sample waveform used as benchmark input.
///
/// The signal is the sum of three low-amplitude sinusoids (5, 10 and 20
/// cycles across the buffer).
fn generate_test_samples() -> [f32; 1024] {
    let two_pi = 2.0 * std::f32::consts::PI;
    std::array::from_fn(|index| {
        // Position of this sample within the buffer, in the range [0, 1).
        let t = index as f32 / 1024.0;
        let fundamental = 0.008 * (two_pi * t * 5.0).sin();
        let second = 0.004 * (two_pi * t * 10.0).cos();
        let third = 0.002 * (two_pi * t * 20.0).sin();
        fundamental + second + third
    })
}
#[cfg(target_os = "macos")]
/// Benchmarks `mix_audio_samples` ("simd") against `mix_audio_samples_scalar`
/// ("scalar") on two identical 1024-sample buffers.
///
/// Both the inputs and the result are passed through `black_box` so the
/// optimizer cannot specialise the mixed functions on the fixed benchmark
/// data or discard the computed output.
fn bench_audio_mix(c: &mut Criterion) {
    let mut group = c.benchmark_group("audio mix");
    let input = generate_test_samples();
    let output = generate_test_samples();

    group.bench_function(BenchmarkId::from_parameter("simd"), |b| {
        b.iter(|| {
            let mixed = mix_audio_samples(black_box(&input[..]), black_box(&output[..]));
            black_box(mixed);
        });
    });

    group.bench_function(BenchmarkId::from_parameter("scalar"), |b| {
        b.iter(|| {
            let input = black_box(&input[..]);
            let output = black_box(&output[..]);
            // Allocate inside the measured closure, mirroring the allocation
            // `mix_audio_samples` performs on every call.
            let mut mixed = vec![0.0; input.len()];
            mix_audio_samples_scalar(input, output, &mut mixed, 0, input.len());
            black_box(mixed);
        });
    });
}
#[cfg(target_os = "macos")]
criterion_group!(benches, bench_audio_mix);
#[cfg(target_os = "macos")]
criterion_main!(benches);
#[cfg(not(target_os = "macos"))]
fn main() {}

View File

@ -5,6 +5,141 @@ use objc2::{Encode, Encoding, RefEncode};
use crate::{error::CoreAudioError, utils::process_audio_frame}; use crate::{error::CoreAudioError, utils::process_audio_frame};
pub const AUDIO_MIX_OUTPUT_WEIGHT: f32 = 0.75;

/// Mix a range of audio samples one element at a time (no SIMD).
///
/// For every index in `start_index..end_index` the result is
/// `input + output * AUDIO_MIX_OUTPUT_WEIGHT`. An index that lies past the end
/// of `input_samples` or `output_samples` contributes `0.0` for that stream.
///
/// # Arguments
/// * `input_samples` - Samples from the input stream
/// * `output_samples` - Samples from the output stream
/// * `mixed_samples` - Pre-allocated destination buffer
/// * `start_index` - First index to process
/// * `end_index` - One past the last index to process
///
/// # Panics
/// Panics if the processed range reaches past the end of `mixed_samples`.
pub fn mix_audio_samples_scalar(
    input_samples: &[f32],
    output_samples: &[f32],
    mixed_samples: &mut [f32],
    start_index: usize,
    end_index: usize,
) {
    // The explicit index loop is intentional: the original author measured a
    // 4x slowdown with the iterator form suggested by this lint.
    #[allow(clippy::needless_range_loop)]
    for idx in start_index..end_index {
        let from_input = input_samples.get(idx).copied().unwrap_or(0.0);
        let from_output = output_samples.get(idx).copied().unwrap_or(0.0);
        mixed_samples[idx] = from_input + from_output * AUDIO_MIX_OUTPUT_WEIGHT;
    }
}
/// Mix audio samples from input and output streams with specified weights
/// Uses NEON SIMD acceleration on supported platforms
///
/// # Arguments
/// * `input_samples` - Samples from the input stream
/// * `output_samples` - Samples from the output stream
///
/// # Returns
/// A vector of mixed audio samples
pub fn mix_audio_samples(input_samples: &[f32], output_samples: &[f32]) -> Vec<f32> {
let mixed_samples_length = input_samples.len();
let mut mixed_samples = vec![0.0; mixed_samples_length];
// For very small arrays, use scalar implementation
if mixed_samples_length < 16 {
mix_audio_samples_scalar(
input_samples,
output_samples,
&mut mixed_samples,
0,
mixed_samples_length,
);
return mixed_samples;
}
#[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
unsafe {
use std::arch::aarch64::{vdupq_n_f32, vld1q_f32, vmlaq_f32, vst1q_f32};
let output_weight_vec = vdupq_n_f32(AUDIO_MIX_OUTPUT_WEIGHT);
// Process the common length where both arrays have data
let common_length = input_samples.len();
// Main SIMD loop - process each block of 4 samples
let input_ptr = input_samples.as_ptr();
let output_ptr = output_samples.as_ptr();
let result_ptr = mixed_samples.as_mut_ptr();
let mut offset: usize = 0;
let mut remaining_offset: Option<usize> = None;
// Process 16 samples at a time (4 SIMD vectors)
while offset < common_length {
// Load 4 vectors of 4 floats each
let in_vec1 = vld1q_f32(input_ptr.add(offset));
let out_vec1 = vld1q_f32(output_ptr.add(offset));
let in_vec2 = vld1q_f32(input_ptr.add(offset + 4));
let out_vec2 = vld1q_f32(output_ptr.add(offset + 4));
let in_vec3 = vld1q_f32(input_ptr.add(offset + 8));
let out_vec3 = vld1q_f32(output_ptr.add(offset + 8));
let in_vec4 = vld1q_f32(input_ptr.add(offset + 12));
let out_vec4 = vld1q_f32(output_ptr.add(offset + 12));
// Using fused multiply-add: (a * b) + c in one operation
// First multiply input by weight
let result1 = vmlaq_f32(in_vec1, out_vec1, output_weight_vec);
let result2 = vmlaq_f32(in_vec2, out_vec2, output_weight_vec);
let result3 = vmlaq_f32(in_vec3, out_vec3, output_weight_vec);
let result4 = vmlaq_f32(in_vec4, out_vec4, output_weight_vec);
// Store results
vst1q_f32(result_ptr.add(offset), result1);
vst1q_f32(result_ptr.add(offset + 4), result2);
vst1q_f32(result_ptr.add(offset + 8), result3);
vst1q_f32(result_ptr.add(offset + 12), result4);
offset += 16;
// accept clippy lint suggestion would downgrade the performance by 15%
#[allow(clippy::comparison_chain)]
// fast path for aligned length
if offset == common_length {
break;
} else if offset > common_length {
remaining_offset = Some(offset - 16);
} else {
let remaining = common_length - offset;
if remaining < 16 {
remaining_offset = Some(offset);
break;
}
}
}
if let Some(remaining_offset) = remaining_offset {
mix_audio_samples_scalar(
input_samples,
output_samples,
&mut mixed_samples,
remaining_offset,
common_length,
);
}
}
#[cfg(not(any(target_arch = "aarch64", target_arch = "arm")))]
{
// Fallback for non-ARM architectures
mix_audio_samples_scalar(
input_samples,
output_samples,
&mut mixed_samples,
0,
mixed_samples_length,
);
}
mixed_samples
}
/// [Apple's documentation](https://developer.apple.com/documentation/coreaudiotypes/audiobuffer?language=objc) /// [Apple's documentation](https://developer.apple.com/documentation/coreaudiotypes/audiobuffer?language=objc)
#[repr(C)] #[repr(C)]
#[derive(Clone, Copy, Debug, PartialEq)] #[derive(Clone, Copy, Debug, PartialEq)]
@ -50,6 +185,9 @@ unsafe impl RefEncode for AudioBufferList {
pub struct InputAndOutputAudioBufferList(pub AudioBufferList); pub struct InputAndOutputAudioBufferList(pub AudioBufferList);
impl InputAndOutputAudioBufferList { impl InputAndOutputAudioBufferList {
/// # Safety
///
/// The caller must ensure that the input data is a valid AudioBufferList
pub unsafe fn from_raw(in_input_data: *mut c_void) -> Result<Self, i32> { pub unsafe fn from_raw(in_input_data: *mut c_void) -> Result<Self, i32> {
let buffer_list: AudioBufferList = unsafe { *in_input_data.cast() }; let buffer_list: AudioBufferList = unsafe { *in_input_data.cast() };
if buffer_list.mNumberBuffers != 2 { if buffer_list.mNumberBuffers != 2 {
@ -93,18 +231,157 @@ impl InputAndOutputAudioBufferList {
return Err(CoreAudioError::ProcessAudioFrameFailed("output")); return Err(CoreAudioError::ProcessAudioFrameFailed("output"));
}; };
let mixed_samples_length = processed_samples_input // Use the extracted mixing function with the const weights
.len() let mixed_samples = mix_audio_samples(&processed_samples_input, &processed_samples_output);
.max(processed_samples_output.len());
let mut mixed_samples = vec![0.0; mixed_samples_length];
for (sample_index, mixed_sample) in mixed_samples.iter_mut().enumerate() {
let sample_in = processed_samples_input.get(sample_index).unwrap_or(&0.0);
let sample_out = processed_samples_output.get(sample_index).unwrap_or(&0.0);
*mixed_sample = (sample_in * 2.0 + sample_out * 1.5) / 2.0;
}
Ok(mixed_samples) Ok(mixed_samples)
} }
} }
#[cfg(test)]
mod tests {
    use super::*;

    /// Mixing two empty streams yields an empty result.
    #[test]
    fn test_mix_audio_samples_empty() {
        let silence: [f32; 0] = [];
        let mixed = mix_audio_samples(&silence, &silence);
        assert_eq!(mixed.len(), 0);
    }

    /// Streams of equal length are mixed element by element.
    #[test]
    fn test_mix_audio_samples_equal_length() {
        let input: [f32; 5] = [0.1, 0.2, 0.3, 0.4, 0.5];
        let output: [f32; 5] = [0.5, 0.4, 0.3, 0.2, 0.1];

        let mixed = mix_audio_samples(&input, &output);
        assert_eq!(mixed.len(), 5);

        // Each sample is input + output * AUDIO_MIX_OUTPUT_WEIGHT
        let expected: [f32; 5] = [
            0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.4 + 0.2 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.5 + 0.1 * AUDIO_MIX_OUTPUT_WEIGHT,
        ];
        for (i, got) in mixed.iter().enumerate() {
            assert!(
                (got - expected[i]).abs() < 1e-6,
                "Mismatch at index {}: expected {}, got {}",
                i,
                expected[i],
                got
            );
        }
    }

    /// Output samples missing past the end of a shorter output stream count
    /// as silence.
    #[test]
    fn test_mix_audio_samples_input_longer() {
        let input: [f32; 5] = [0.1, 0.2, 0.3, 0.4, 0.5];
        let output: [f32; 3] = [0.5, 0.4, 0.3];

        let mixed = mix_audio_samples(&input, &output);
        assert_eq!(mixed.len(), 5);

        let expected: [f32; 5] = [
            0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.4 + 0.0 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.5 + 0.0 * AUDIO_MIX_OUTPUT_WEIGHT,
        ];
        for (i, got) in mixed.iter().enumerate() {
            assert!(
                (got - expected[i]).abs() < 1e-6,
                "Mismatch at index {}: expected {}, got {}",
                i,
                expected[i],
                got
            );
        }
    }

    /// The weight is a compile-time constant, so this only checks that the
    /// result is expressed in terms of `AUDIO_MIX_OUTPUT_WEIGHT`.
    #[test]
    fn test_mix_audio_samples_custom_weights() {
        let input: [f32; 3] = [0.1, 0.2, 0.3];
        let output: [f32; 3] = [0.5, 0.4, 0.3];

        let mixed = mix_audio_samples(&input, &output);

        let expected: [f32; 3] = [
            0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT,
        ];
        for (i, got) in mixed.iter().enumerate() {
            assert!(
                (got - expected[i]).abs() < 1e-6,
                "Mismatch at index {}: expected {}, got {}",
                i,
                expected[i],
                got
            );
        }
    }

    /// Buffers long enough for the SIMD path must match the scalar reference,
    /// for both block-aligned and unaligned lengths.
    #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
    #[test]
    fn test_simd_implementation_used() {
        const BUFFER_SIZES: [usize; 4] = [100, 127, 128, 512];

        for size in BUFFER_SIZES {
            let input: Vec<f32> = (0..size).map(|i| i as f32 * 0.01).collect();
            let output: Vec<f32> = (0..size).map(|i| (size - i) as f32 * 0.01).collect();

            let mixed = mix_audio_samples(&input, &output);

            // Reference result computed with the scalar implementation
            let mut expected = vec![0.0; input.len()];
            mix_audio_samples_scalar(&input, &output, &mut expected, 0, input.len());

            for (i, got) in mixed.iter().enumerate() {
                assert!(
                    (got - expected[i]).abs() < 1e-6,
                    "SIMD and scalar implementations should produce identical results at index {}",
                    i
                );
            }
        }
    }

    /// Buffers shorter than one SIMD block are still mixed correctly.
    #[test]
    fn test_small_vector_uses_scalar() {
        let input: [f32; 3] = [0.1, 0.2, 0.3];
        let output: [f32; 3] = [0.5, 0.4, 0.3];

        let mixed = mix_audio_samples(&input, &output);

        let expected: [f32; 3] = [
            0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT,
            0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT,
        ];
        for (i, got) in mixed.iter().enumerate() {
            assert!(
                (got - expected[i]).abs() < 1e-6,
                "Small vector mixing should be correct at index {}",
                i
            );
        }
    }
}

View File

@ -1,4 +1,4 @@
pub(crate) mod audio_buffer; pub mod audio_buffer;
pub mod audio_stream_basic_desc; pub mod audio_stream_basic_desc;
pub mod av_audio_file; pub mod av_audio_file;
pub mod av_audio_format; pub mod av_audio_format;