205 lines
8.5 KiB
Rust
205 lines
8.5 KiB
Rust
//! Encode/decode throughput benchmarks.
//!
//! Uses the nightly `test` crate harness (`#[bench]`), not criterion. Run
//! with `cargo bench` on a nightly toolchain. Results are wall-clock
//! nanoseconds per iteration; to convert to samples per second, divide the
//! frame sample count by the reported ns/iter and multiply by 10⁹ (for
//! example, a 960-sample frame at 48 000 ns/iter is 2 × 10⁷ samples/s).
//!
//! Representative sizes exercise the encoder's exhaustive search behaviour
//! at different `partition_order` ceilings — 256 and 1024 are power-of-two
//! frames (all seven partition orders available); 960 and 2880 mimic
//! Opus/WebRTC frame sizes (only some partition orders divide evenly, so
//! the inner search is sparser).
|
||
|
||
#![feature(test)]
|
||
|
||
extern crate test;
|
||
|
||
use lac::{decode_frame, encode_frame};
|
||
use test::Bencher;
|
||
|
||
// ── Synthetic-signal benches ────────────────────────────────────────────────
//
// These benches drive the codec without any WAV I/O overhead, so the
// measurement is pure codec work. Four signal shapes cover the space:
//
// - `silence`: order-0 short-circuit path (skips Levinson entirely).
// - `multi_sine`: maximally LPC-friendly — order search converges fast.
// - `pseudo_speech`: AR(2) resonance on LFSR excitation, which is the
//   textbook model of the vocal tract. Residuals are near-Laplacian,
//   exercising the Rice k-search at realistic speech statistics.
// - `filtered_noise`: LFSR through a single-pole IIR low-pass.
//   Broad-spectrum content with no strong tonal structure — the hard case
//   for LPC and a reasonable proxy for music-like workload.
|
||
|
||
/// Returns a frame of `n` samples that are all zero (digital silence).
fn silence(n: usize) -> Vec<i32> {
    let mut frame: Vec<i32> = Vec::with_capacity(n);
    // Growing from empty to `n` fills every slot with the given value.
    frame.resize(n, 0);
    frame
}
|
||
|
||
/// Returns `n` samples of a deterministic three-tone test signal.
///
/// Sample `i` is the sum of three sinusoids evaluated at `t = i`, with
/// angular frequencies 0.11, 0.27 and 0.43 rad/sample and peak amplitudes
/// 3.0e6, 1.5e6 and 7.5e5. The combined peak (at most 5.25e6) stays below
/// the signed 24-bit limit of 2^23 − 1, so no clamping is needed. The
/// signal is generated in memory so benches do not have to read a WAV file.
fn multi_sine(n: usize) -> Vec<i32> {
    // (angular frequency in radians per sample, peak amplitude)
    const PARTIALS: [(f64, f64); 3] = [
        (0.11, 3_000_000.0),
        (0.27, 1_500_000.0),
        (0.43, 750_000.0),
    ];

    let mut frame = Vec::with_capacity(n);
    for index in 0..n {
        let t = index as f64;
        // Accumulate the partials in table order so the floating-point
        // result matches a plain left-to-right `a + b + c`.
        let mut mix = 0.0_f64;
        for &(omega, amplitude) in &PARTIALS {
            mix += (t * omega).sin() * amplitude;
        }
        // Float → integer `as` truncates toward zero.
        frame.push(mix as i32);
    }
    frame
}
|
||
|
||
/// Deterministic pseudo-random noise from a 32-bit Galois LFSR.
///
/// Returns `n` samples in roughly `±2^19`, i.e. one-sixteenth of the signed
/// 24-bit full scale (`±2^23`). The reduced level leaves headroom for
/// filters that amplify this excitation downstream.
///
/// `seed` selects the starting register state. A zero seed is replaced by
/// `0xACE1_ACE1`, because an all-zero register would stay at zero forever.
fn lfsr_noise(n: usize, seed: u32) -> Vec<i32> {
    // Tap mask for x^32 + x^22 + x^2 + x + 1 in right-shifting Galois form.
    // This is a maximal-length polynomial (period 2^32 − 1), far longer
    // than any bench frame.
    const TAPS: u32 = 0x8020_0003;
    const FALLBACK_SEED: u32 = 0xACE1_ACE1;

    let mut register = match seed {
        0 => FALLBACK_SEED,
        nonzero => nonzero,
    };

    let mut samples = Vec::with_capacity(n);
    for _ in 0..n {
        let shifted_out_one = register & 1 == 1;
        register >>= 1;
        if shifted_out_one {
            register ^= TAPS;
        }
        // Reinterpret the register as signed, then arithmetic-shift by 12
        // so only the top 20 bits remain (sign included): about ±2^19.
        samples.push((register as i32) >> 12);
    }
    samples
}
|
||
|
||
/// AR(2) pseudo-speech: LFSR excitation filtered through a single formant
|
||
/// resonance at ~700 Hz / 16 kHz with bandwidth ~100 Hz. The pole pair
|
||
/// gives speech-shaped spectral envelope; residuals are near-Laplacian,
|
||
/// matching the statistical profile of real vowel segments. This is the
|
||
/// content class LPC + Rice is designed for, so it stresses the inner
|
||
/// encoder loops more realistically than multi_sine (which converges
|
||
/// instantly) or white noise (which doesn't benefit from LPC).
|
||
fn pseudo_speech(n: usize) -> Vec<i32> {
|
||
// Q14 coefficients for a pole at r·e^{±jθ} with r = 0.9806, θ = 2π·700/16000.
|
||
// a1 = 2r·cos(θ) · 2^14 ≈ 30933
|
||
// a2 = −r² · 2^14 ≈ −15751
|
||
// The implied real-valued poles live inside the unit circle so the
|
||
// recursion is stable for unbounded input lengths.
|
||
const A1_Q14: i64 = 30_933;
|
||
const A2_Q14: i64 = -15_751;
|
||
let excitation = lfsr_noise(n, 0x5EED);
|
||
let mut out = Vec::with_capacity(n);
|
||
let mut y1: i64 = 0;
|
||
let mut y2: i64 = 0;
|
||
for &e in &excitation {
|
||
// Resonance gain at the peak is ~1/(1−r) ≈ 50, so y magnitudes
|
||
// reach ~2^19 · 50 ≈ 2^25. i64 intermediates prevent the
|
||
// multiply-accumulate from overflowing; the final clamp keeps
|
||
// the output inside the codec's 24-bit input contract.
|
||
let sum = A1_Q14 * y1 + A2_Q14 * y2;
|
||
// Round-to-nearest for the Q14 → integer demotion.
|
||
let ar = (sum + (1 << 13)) >> 14;
|
||
let y = ar + e as i64;
|
||
let clamped = y.clamp(-((1 << 23) - 1), (1 << 23) - 1);
|
||
out.push(clamped as i32);
|
||
y2 = y1;
|
||
y1 = clamped;
|
||
}
|
||
out
|
||
}
|
||
|
||
/// Broadband low-passed noise — LFSR excitation through a simple
|
||
/// single-pole IIR low-pass (pole at 0.9 in Q15). Covers the content
|
||
/// class where LPC cannot predict efficiently: residuals retain most
|
||
/// of the source entropy, so the Rice coder dominates the output bit
|
||
/// budget.
|
||
fn filtered_noise(n: usize) -> Vec<i32> {
|
||
const POLE_Q15: i64 = 29_491; // round(0.9 · 2^15)
|
||
// (1 − pole) in Q15 is the DC gain correction keeping output
|
||
// magnitude near the excitation range.
|
||
const ONE_MINUS_POLE_Q15: i64 = (1 << 15) - POLE_Q15;
|
||
let excitation = lfsr_noise(n, 0xFEED);
|
||
let mut out = Vec::with_capacity(n);
|
||
let mut y: i64 = 0;
|
||
for &e in &excitation {
|
||
// y[n] = pole·y[n−1] + (1−pole)·x[n], all in Q15.
|
||
let sum = POLE_Q15 * y + ONE_MINUS_POLE_Q15 * e as i64;
|
||
y = (sum + (1 << 14)) >> 15;
|
||
out.push(y.clamp(-((1 << 23) - 1), (1 << 23) - 1) as i32);
|
||
}
|
||
out
|
||
}
|
||
|
||
/// Declares a `#[bench]` function that times `encode_frame` on one frame.
///
/// Arguments: the bench function's name, the signal generator to call
/// (one of the `fn(usize) -> Vec<i32>` helpers in this file), and the
/// frame length passed to it.
macro_rules! encode_bench {
    ($bench_fn:ident, $generator:ident, $frame_len:expr) => {
        #[bench]
        fn $bench_fn(bencher: &mut Bencher) {
            // Build the input once, outside the timed closure.
            let frame = $generator($frame_len);
            bencher.iter(|| {
                // `black_box` hides the input from the optimizer so the
                // call cannot be hoisted out or constant-folded.
                let input = test::black_box(&frame);
                encode_frame(input)
            });
        }
    };
}
|
||
|
||
/// Declares a `#[bench]` function that times `decode_frame` on one frame.
///
/// Arguments: the bench function's name, the signal generator to call,
/// and the frame length passed to it. The frame is generated and encoded
/// once during setup; only the decode call runs inside the timed closure.
macro_rules! decode_bench {
    ($bench_fn:ident, $generator:ident, $frame_len:expr) => {
        #[bench]
        fn $bench_fn(bencher: &mut Bencher) {
            let frame = $generator($frame_len);
            let bitstream = encode_frame(&frame);
            bencher.iter(|| {
                let input = test::black_box(&bitstream);
                // The input was produced by `encode_frame` just above, so
                // a decode error here means the codec itself is broken.
                decode_frame(input).unwrap()
            });
        }
    };
}
|
||
|
||
encode_bench!(encode_silence_960, silence, 960);
|
||
encode_bench!(encode_silence_4096, silence, 4096);
|
||
encode_bench!(encode_sine_256, multi_sine, 256);
|
||
encode_bench!(encode_sine_960, multi_sine, 960);
|
||
encode_bench!(encode_sine_1024, multi_sine, 1024);
|
||
encode_bench!(encode_sine_2048, multi_sine, 2048);
|
||
encode_bench!(encode_sine_2880, multi_sine, 2880);
|
||
encode_bench!(encode_sine_4096, multi_sine, 4096);
|
||
encode_bench!(encode_speech_320, pseudo_speech, 320);
|
||
encode_bench!(encode_speech_960, pseudo_speech, 960);
|
||
encode_bench!(encode_speech_2048, pseudo_speech, 2048);
|
||
encode_bench!(encode_music_960, filtered_noise, 960);
|
||
encode_bench!(encode_music_2048, filtered_noise, 2048);
|
||
encode_bench!(encode_music_4096, filtered_noise, 4096);
|
||
|
||
decode_bench!(decode_silence_4096, silence, 4096);
|
||
decode_bench!(decode_sine_960, multi_sine, 960);
|
||
decode_bench!(decode_sine_4096, multi_sine, 4096);
|
||
decode_bench!(decode_speech_960, pseudo_speech, 960);
|
||
decode_bench!(decode_music_2048, filtered_noise, 2048);
|
||
|
||
// ── SIMD dot-product kernel isolation benches ───────────────────────────────
//
// The encode benches above measure end-to-end encode cost, which is
// dominated by per-frame allocations, Rice k-search, and the order
// loop. When evaluating a SIMD kernel change, those costs confound the
// measurement. The benches below exercise just the kernel at realistic LPC
// orders so a change to `compute_residuals` shows up directly.
|
||
|
||
/// Declares a `#[bench]` function that times `lac::compute_residuals`
/// alone.
///
/// Arguments: the bench function's name, the number of predictor
/// coefficients (`$order`), and the input length in samples (`$len`).
///
/// The input is `multi_sine($len)`. The coefficients are synthetic, not a
/// fitted predictor: coefficient `i` is `i·711 − 100` in wrapping `i16`
/// arithmetic. Using wrapping operations for both steps keeps the macro
/// usable at any order; a plain `*` would overflow `i16` from order 47
/// upward and panic when overflow checks are enabled.
macro_rules! compute_residuals_bench {
    ($name:ident, $order:expr, $len:expr) => {
        #[bench]
        fn $name(b: &mut Bencher) {
            let samples = multi_sine($len);
            let coeffs: Vec<i16> = (0..$order)
                .map(|i| (i as i16).wrapping_mul(711).wrapping_sub(100))
                .collect();
            b.iter(|| {
                // NOTE(review): the trailing `1` is presumably a shift or
                // precision argument — confirm against the `lac` API.
                lac::compute_residuals(test::black_box(&samples), test::black_box(&coeffs), 1)
            });
        }
    };
}
|
||
|
||
compute_residuals_bench!(compute_residuals_order_4_n320, 4, 320);
|
||
compute_residuals_bench!(compute_residuals_order_8_n320, 8, 320);
|
||
compute_residuals_bench!(compute_residuals_order_16_n320, 16, 320);
|
||
compute_residuals_bench!(compute_residuals_order_32_n320, 32, 320);
|
||
compute_residuals_bench!(compute_residuals_order_8_n960, 8, 960);
|
||
compute_residuals_bench!(compute_residuals_order_32_n960, 32, 960);
|
||
compute_residuals_bench!(compute_residuals_order_32_n4096, 32, 4096);
|