From fe867228451a498e0c9a9062b55087fe892fa69c Mon Sep 17 00:00:00 2001 From: Brooooooklyn Date: Wed, 16 Apr 2025 00:42:30 +0000 Subject: [PATCH] perf(native): use simd to speedup audio buffer mix (#11717) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run `cargo bench -p affine_media_capture` ``` test result: ok. 0 passed; 0 failed; 6 ignored; 0 measured; 0 filtered out; finished in 0.00s Running benches/mix_audio_samples.rs (target/release/deps/mix_audio_samples-ffbc55dcf90d3468) audio mix/simd time: [98.380 ns 99.339 ns 100.57 ns] change: [−19.199% −16.928% −14.569%] (p = 0.00 < 0.05) Performance has improved. Found 12 outliers among 100 measurements (12.00%) 6 (6.00%) high mild 6 (6.00%) high severe audio mix/scalar time: [123.99 ns 126.11 ns 128.71 ns] change: [+0.2703% +1.2739% +2.5727%] (p = 0.02 < 0.05) Change within noise threshold. Found 11 outliers among 100 measurements (11.00%) 4 (4.00%) high mild 7 (7.00%) high severe ``` --- .cargo/config.toml | 2 +- Cargo.lock | 1 + .../frontend/native/media_capture/Cargo.toml | 7 + .../benches/mix_audio_samples.rs | 55 ++++ .../media_capture/src/macos/audio_buffer.rs | 299 +++++++++++++++++- .../native/media_capture/src/macos/mod.rs | 2 +- 6 files changed, 353 insertions(+), 13 deletions(-) create mode 100644 packages/frontend/native/media_capture/benches/mix_audio_samples.rs diff --git a/.cargo/config.toml b/.cargo/config.toml index 0880406da4..bbd24f91f6 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -5,7 +5,7 @@ rustflags = ["-C", "target-feature=+crt-static"] [target.'cfg(target_os = "linux")'] rustflags = ["-C", "link-args=-Wl,--warn-unresolved-symbols"] [target.'cfg(target_os = "macos")'] -rustflags = ["-C", "link-args=-all_load", "-C", "link-args=-weak_framework ScreenCaptureKit"] +rustflags = ["-C", "link-args=-Wl,-undefined,dynamic_lookup,-no_fixup_chains", "-C", "link-args=-all_load", "-C", "link-args=-weak_framework ScreenCaptureKit"] # 
https://sourceware.org/bugzilla/show_bug.cgi?id=21032 # https://sourceware.org/bugzilla/show_bug.cgi?id=21031 # https://github.com/rust-lang/rust/issues/134820 diff --git a/Cargo.lock b/Cargo.lock index 7dfaadd264..57a08b8a9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -79,6 +79,7 @@ dependencies = [ "block2", "core-foundation", "coreaudio-rs", + "criterion2", "dispatch2", "libc", "napi", diff --git a/packages/frontend/native/media_capture/Cargo.toml b/packages/frontend/native/media_capture/Cargo.toml index 2a71d67d3e..25a798a64a 100644 --- a/packages/frontend/native/media_capture/Cargo.toml +++ b/packages/frontend/native/media_capture/Cargo.toml @@ -6,6 +6,10 @@ version = "0.0.0" [lib] crate-type = ["cdylib", "rlib"] +[[bench]] +harness = false +name = "mix_audio_samples" + [dependencies] napi = { workspace = true, features = ["napi4"] } napi-derive = { workspace = true, features = ["type-def"] } @@ -24,5 +28,8 @@ objc2-foundation = { workspace = true } screencapturekit = { workspace = true } uuid = { workspace = true, features = ["v4"] } +[dev-dependencies] +criterion2 = { workspace = true } + [build-dependencies] napi-build = { workspace = true } diff --git a/packages/frontend/native/media_capture/benches/mix_audio_samples.rs b/packages/frontend/native/media_capture/benches/mix_audio_samples.rs new file mode 100644 index 0000000000..35c45742f9 --- /dev/null +++ b/packages/frontend/native/media_capture/benches/mix_audio_samples.rs @@ -0,0 +1,55 @@ +#[cfg(target_os = "macos")] +use std::hint::black_box; + +#[cfg(target_os = "macos")] +use affine_media_capture::macos::audio_buffer::{mix_audio_samples, mix_audio_samples_scalar}; +#[cfg(target_os = "macos")] +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; + +#[cfg(target_os = "macos")] +fn generate_test_samples() -> [f32; 1024] { + let mut samples = [0.0; 1024]; + + // Generate a simple sine wave with some variation + for (i, sample) in samples.iter_mut().enumerate() { + let t = i as f32 / 
1024.0; + // Create a complex waveform with multiple frequencies + let value = 0.008 * (2.0 * std::f32::consts::PI * t * 5.0).sin() + + 0.004 * (2.0 * std::f32::consts::PI * t * 10.0).cos() + + 0.002 * (2.0 * std::f32::consts::PI * t * 20.0).sin(); + *sample = value; + } + + samples +} + +#[cfg(target_os = "macos")] +fn bench_audio_mix(c: &mut Criterion) { + let mut group = c.benchmark_group("audio mix"); + + let input = generate_test_samples(); + let output = generate_test_samples(); + + group.bench_function(BenchmarkId::from_parameter("simd"), |b| { + b.iter(|| { + let mixed = mix_audio_samples(&input, &output); + black_box(mixed); + }); + }); + + group.bench_function(BenchmarkId::from_parameter("scalar"), |b| { + b.iter(|| { + let mut mixed = vec![0.0; 1024]; + mix_audio_samples_scalar(&input, &output, &mut mixed, 0, input.len()); + black_box(mixed); + }); + }); +} + +#[cfg(target_os = "macos")] +criterion_group!(benches, bench_audio_mix); +#[cfg(target_os = "macos")] +criterion_main!(benches); + +#[cfg(not(target_os = "macos"))] +fn main() {} diff --git a/packages/frontend/native/media_capture/src/macos/audio_buffer.rs b/packages/frontend/native/media_capture/src/macos/audio_buffer.rs index d2d3db5098..93e95692cb 100644 --- a/packages/frontend/native/media_capture/src/macos/audio_buffer.rs +++ b/packages/frontend/native/media_capture/src/macos/audio_buffer.rs @@ -5,6 +5,141 @@ use objc2::{Encode, Encoding, RefEncode}; use crate::{error::CoreAudioError, utils::process_audio_frame}; +pub const AUDIO_MIX_OUTPUT_WEIGHT: f32 = 0.75; + +/// Mix audio samples using scalar operations (no SIMD) +/// +/// # Arguments +/// * `input_samples` - Samples from the input stream +/// * `output_samples` - Samples from the output stream +/// * `mixed_samples` - Buffer to store the result (must be pre-allocated) +/// * `start_index` - Starting index in the buffers +/// * `end_index` - Ending index in the buffers (exclusive) +pub fn mix_audio_samples_scalar( + input_samples: &[f32], 
+  output_samples: &[f32],
+  mixed_samples: &mut [f32],
+  start_index: usize,
+  end_index: usize,
+) {
+  // performance downgraded 4x if apply suggestion from this lint rule
+  #[allow(clippy::needless_range_loop)]
+  for sample_index in start_index..end_index {
+    let sample_in = input_samples.get(sample_index).unwrap_or(&0.0);
+    let sample_out = output_samples.get(sample_index).unwrap_or(&0.0);
+    mixed_samples[sample_index] = sample_in + sample_out * AUDIO_MIX_OUTPUT_WEIGHT;
+  }
+}
+
+/// Mix audio samples from input and output streams with specified weights
+/// Uses NEON SIMD acceleration on aarch64 and scalar code everywhere else
+///
+/// The result always has `input_samples.len()` elements. Where the output
+/// stream has no sample for an index it contributes silence (0.0); output
+/// samples past the end of the input are ignored.
+///
+/// # Arguments
+/// * `input_samples` - Samples from the input stream
+/// * `output_samples` - Samples from the output stream
+///
+/// # Returns
+/// A vector of mixed audio samples
+pub fn mix_audio_samples(input_samples: &[f32], output_samples: &[f32]) -> Vec<f32> {
+  let mixed_samples_length = input_samples.len();
+  let mut mixed_samples = vec![0.0; mixed_samples_length];
+
+  // Raw-pointer SIMD loads are only sound for indices that exist in BOTH
+  // slices, so the vectorized range must stay inside the common prefix.
+  let common_length = mixed_samples_length.min(output_samples.len());
+
+  // For very small arrays, use scalar implementation
+  if common_length < 16 {
+    mix_audio_samples_scalar(
+      input_samples,
+      output_samples,
+      &mut mixed_samples,
+      0,
+      mixed_samples_length,
+    );
+    return mixed_samples;
+  }
+
+  // `std::arch::aarch64` is only available when compiling for aarch64, so
+  // the intrinsics must not be enabled for 32-bit `arm` targets.
+  #[cfg(target_arch = "aarch64")]
+  {
+    use std::arch::aarch64::{vdupq_n_f32, vld1q_f32, vmlaq_f32, vst1q_f32};
+
+    // Whole blocks of 16 samples (4 SIMD vectors) inside the common prefix
+    let simd_length = common_length - common_length % 16;
+
+    let input_ptr = input_samples.as_ptr();
+    let output_ptr = output_samples.as_ptr();
+    let result_ptr = mixed_samples.as_mut_ptr();
+
+    // SAFETY: `simd_length` is a multiple of 16 that does not exceed the
+    // length of `input_samples`, `output_samples` or `mixed_samples`, so
+    // every access at `offset..offset + 16` stays inside all three buffers.
+    // The intrinsics also require NEON, which is assumed for the target.
+    unsafe {
+      let output_weight_vec = vdupq_n_f32(AUDIO_MIX_OUTPUT_WEIGHT);
+      let mut offset: usize = 0;
+
+      // Process 16 samples at a time (4 SIMD vectors)
+      while offset < simd_length {
+        // Load 4 vectors of 4 floats each
+        let in_vec1 = vld1q_f32(input_ptr.add(offset));
+        let out_vec1 = vld1q_f32(output_ptr.add(offset));
+        let in_vec2 = vld1q_f32(input_ptr.add(offset + 4));
+        let out_vec2 = vld1q_f32(output_ptr.add(offset + 4));
+        let in_vec3 = vld1q_f32(input_ptr.add(offset + 8));
+        let out_vec3 = vld1q_f32(output_ptr.add(offset + 8));
+        let in_vec4 = vld1q_f32(input_ptr.add(offset + 12));
+        let out_vec4 = vld1q_f32(output_ptr.add(offset + 12));
+
+        // Multiply-accumulate: input + output * weight, lane by lane
+        let result1 = vmlaq_f32(in_vec1, out_vec1, output_weight_vec);
+        let result2 = vmlaq_f32(in_vec2, out_vec2, output_weight_vec);
+        let result3 = vmlaq_f32(in_vec3, out_vec3, output_weight_vec);
+        let result4 = vmlaq_f32(in_vec4, out_vec4, output_weight_vec);
+
+        // Store results
+        vst1q_f32(result_ptr.add(offset), result1);
+        vst1q_f32(result_ptr.add(offset + 4), result2);
+        vst1q_f32(result_ptr.add(offset + 8), result3);
+        vst1q_f32(result_ptr.add(offset + 12), result4);
+        offset += 16;
+      }
+    }
+
+    // Samples the SIMD loop did not cover: the tail of the common prefix
+    // and any input that has no matching output sample
+    if simd_length < mixed_samples_length {
+      mix_audio_samples_scalar(
+        input_samples,
+        output_samples,
+        &mut mixed_samples,
+        simd_length,
+        mixed_samples_length,
+      );
+    }
+  }
+
+  #[cfg(not(target_arch = "aarch64"))]
+  {
+    // Fallback for targets without the NEON path
+    mix_audio_samples_scalar(
+      input_samples,
+      output_samples,
+      &mut mixed_samples,
+      0,
+      mixed_samples_length,
+    );
+  }
+
+  mixed_samples
+}
+
 /// [Apple's documentation](https://developer.apple.com/documentation/coreaudiotypes/audiobuffer?language=objc)
 #[repr(C)]
 #[derive(Clone, Copy, Debug, PartialEq)]
@@ -50,6 +185,9 @@
unsafe impl RefEncode for AudioBufferList { pub struct InputAndOutputAudioBufferList(pub AudioBufferList); impl InputAndOutputAudioBufferList { + /// # Safety + /// + /// The caller must ensure that the input data is a valid AudioBufferList pub unsafe fn from_raw(in_input_data: *mut c_void) -> Result { let buffer_list: AudioBufferList = unsafe { *in_input_data.cast() }; if buffer_list.mNumberBuffers != 2 { @@ -93,18 +231,157 @@ impl InputAndOutputAudioBufferList { return Err(CoreAudioError::ProcessAudioFrameFailed("output")); }; - let mixed_samples_length = processed_samples_input - .len() - .max(processed_samples_output.len()); - - let mut mixed_samples = vec![0.0; mixed_samples_length]; - for (sample_index, mixed_sample) in mixed_samples.iter_mut().enumerate() { - let sample_in = processed_samples_input.get(sample_index).unwrap_or(&0.0); - let sample_out = processed_samples_output.get(sample_index).unwrap_or(&0.0); - - *mixed_sample = (sample_in * 2.0 + sample_out * 1.5) / 2.0; - } + // Use the extracted mixing function with the const weights + let mixed_samples = mix_audio_samples(&processed_samples_input, &processed_samples_output); Ok(mixed_samples) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_mix_audio_samples_empty() { + let input: Vec = vec![]; + let output: Vec = vec![]; + let mixed = mix_audio_samples(&input, &output); + assert_eq!(mixed.len(), 0); + } + + #[test] + fn test_mix_audio_samples_equal_length() { + let input = vec![0.1, 0.2, 0.3, 0.4, 0.5]; + let output = vec![0.5, 0.4, 0.3, 0.2, 0.1]; + let mixed = mix_audio_samples(&input, &output); + + assert_eq!(mixed.len(), 5); + + // Verify calculations: (input + output * 0.75) + let expected = [ + (0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.4 + 0.2 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.5 + 0.1 * AUDIO_MIX_OUTPUT_WEIGHT), + ]; + + for i in 0..mixed.len() { + assert!( + (mixed[i] - expected[i]).abs() < 
1e-6, + "Mismatch at index {}: expected {}, got {}", + i, + expected[i], + mixed[i] + ); + } + } + + #[test] + fn test_mix_audio_samples_input_longer() { + let input = vec![0.1, 0.2, 0.3, 0.4, 0.5]; + let output = vec![0.5, 0.4, 0.3]; + let mixed = mix_audio_samples(&input, &output); + + assert_eq!(mixed.len(), 5); + + // Verify calculations + let expected = [ + (0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.4 + 0.0 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.5 + 0.0 * AUDIO_MIX_OUTPUT_WEIGHT), + ]; + + for i in 0..mixed.len() { + assert!( + (mixed[i] - expected[i]).abs() < 1e-6, + "Mismatch at index {}: expected {}, got {}", + i, + expected[i], + mixed[i] + ); + } + } + + #[test] + fn test_mix_audio_samples_custom_weights() { + // Note: We're using the constant weights so we can't really test custom values + // directly + let input = vec![0.1, 0.2, 0.3]; + let output = vec![0.5, 0.4, 0.3]; + let mixed = mix_audio_samples(&input, &output); + + // Calculate expected values based on the constants + let expected = [ + (0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT), + ]; + + for i in 0..mixed.len() { + assert!( + (mixed[i] - expected[i]).abs() < 1e-6, + "Mismatch at index {}: expected {}, got {}", + i, + expected[i], + mixed[i] + ); + } + } + + #[cfg(any(target_arch = "aarch64", target_arch = "arm"))] + #[test] + fn test_simd_implementation_used() { + const BUFFER_SIZES: [usize; 4] = [100, 127, 128, 512]; + for size in BUFFER_SIZES { + // Create arrays large enough to trigger SIMD path + let input: Vec = (0..size).map(|i| i as f32 * 0.01).collect(); + let output: Vec = (0..size).map(|i| (size - i) as f32 * 0.01).collect(); + + // Mix with standard weights + let mixed = mix_audio_samples(&input, &output); + + // Compute the same mix using scalar implementation for comparison + let mut expected = vec![0.0; input.len()]; + 
mix_audio_samples_scalar(&input, &output, &mut expected, 0, input.len()); + + // Verify results match between SIMD and scalar implementations + for i in 0..mixed.len() { + assert!( + (mixed[i] - expected[i]).abs() < 1e-6, + "SIMD and scalar implementations should produce identical results at index {}", + i + ); + } + } + } + + #[test] + fn test_small_vector_uses_scalar() { + // Create small arrays that should use scalar path even with SIMD available + let input = vec![0.1, 0.2, 0.3]; + let output = vec![0.5, 0.4, 0.3]; + + // Mix with standard weights + let mixed = mix_audio_samples(&input, &output); + + // Calculate expected values manually + let expected = [ + (0.1 + 0.5 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.2 + 0.4 * AUDIO_MIX_OUTPUT_WEIGHT), + (0.3 + 0.3 * AUDIO_MIX_OUTPUT_WEIGHT), + ]; + + // Verify results + for i in 0..mixed.len() { + assert!( + (mixed[i] - expected[i]).abs() < 1e-6, + "Small vector mixing should be correct at index {}", + i + ); + } + } +} diff --git a/packages/frontend/native/media_capture/src/macos/mod.rs b/packages/frontend/native/media_capture/src/macos/mod.rs index 4512317388..f5c4e60d8a 100644 --- a/packages/frontend/native/media_capture/src/macos/mod.rs +++ b/packages/frontend/native/media_capture/src/macos/mod.rs @@ -1,4 +1,4 @@ -pub(crate) mod audio_buffer; +pub mod audio_buffer; pub mod audio_stream_basic_desc; pub mod av_audio_file; pub mod av_audio_format;