lac/benches/codec.rs
Kamal Tufekcic 7862cb1d9d
All checks were successful
CI / lint (push) Successful in 5s
CI / fuzz-regression (push) Successful in 14s
CI / build (push) Successful in 4s
CI / test (push) Successful in 6m54s
CI / publish (push) Successful in 8s
initial commit
Signed-off-by: Kamal Tufekcic <kamal@lo.sh>
2026-04-23 14:58:32 +03:00

205 lines
8.5 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Encode/decode throughput benchmarks.
//!
//! Uses the nightly `test` crate harness (`#[bench]`), not criterion. Run
//! with `cargo bench`. Results are wall-clock nanoseconds per iteration; to
//! convert to samples-per-second, divide the frame sample count by the
//! reported ns/iter and multiply by 10⁹.
//!
//! Representative sizes exercise the encoder's exhaustive search behaviour
//! at different `partition_order` ceilings — 256 and 1024 are power-of-two
//! frames (all seven partition orders available); 960 and 2880 mimic
//! Opus/WebRTC frame sizes (only some partition orders divide evenly, so
//! the inner search is sparser).
#![feature(test)]
extern crate test;
use lac::{decode_frame, encode_frame};
use test::Bencher;
// ── Synthetic-signal benches ────────────────────────────────────────────────
//
// These benches drive the encoder without any WAV I/O overhead, so the
// measurement is pure codec work. Four signal shapes cover the space:
//
// - `silence`: order-0 short-circuit path (skips Levinson entirely).
// - `multi_sine`: maximally LPC-friendly — order search converges fast.
// - `pseudo_speech`: AR(2) resonance on LFSR excitation, which is the
// textbook model of the vocal tract. Residuals are near-Laplacian,
// exercising the Rice k-search at realistic speech statistics.
// - `filtered_noise`: LFSR through a single-pole IIR low-pass. Broad-spectrum
// content with no strong tonal structure — the hard case for LPC and
// a reasonable proxy for music-like workload.
/// Digital silence: a frame of `n` samples, every one of them zero.
///
/// Returns an empty vector when `n` is zero.
fn silence(n: usize) -> Vec<i32> {
    std::iter::repeat(0i32).take(n).collect()
}
/// Deterministic tonal test signal: the sum of three sinusoids.
///
/// Each partial is described by an (angular step per sample, amplitude)
/// pair. The amplitudes add up to 5.25 M, which stays below the 24-bit
/// peak of 2^23 − 1, so the float → `i32` cast never has to saturate.
/// Generating the frame in memory keeps WAV I/O out of the bench body.
fn multi_sine(n: usize) -> Vec<i32> {
    // (radians per sample, peak amplitude) — the three steps share no
    // simple common ratio.
    const PARTIALS: [(f64, f64); 3] = [
        (0.11, 3_000_000.0),
        (0.27, 1_500_000.0),
        (0.43, 750_000.0),
    ];

    let mut frame = Vec::with_capacity(n);
    for index in 0..n {
        let t = index as f64;
        // Partials are accumulated in table order, then truncated toward
        // zero by the cast.
        let mixed: f64 = PARTIALS
            .iter()
            .map(|&(step, amplitude)| (t * step).sin() * amplitude)
            .sum();
        frame.push(mixed as i32);
    }
    frame
}
/// Deterministic pseudo-random samples from a 32-bit Galois LFSR.
///
/// The register is stepped once per output sample. Each output is the
/// register reinterpreted as `i32` and arithmetically shifted right by 12,
/// so values lie in `[-2^19, 2^19 - 1]` — one-sixteenth of the ±2^23
/// 24-bit full scale, which leaves headroom for gain applied by the
/// filters that consume this excitation.
///
/// A `seed` of zero is replaced by `0xACE1_ACE1`, because an all-zero
/// register would never leave the zero state.
fn lfsr_noise(n: usize, seed: u32) -> Vec<i32> {
    // Feedback mask for x^32 + x^22 + x^2 + x + 1, a maximal-length
    // polynomial: the period is 2^32 − 1, far longer than any bench frame.
    const TAP_MASK: u32 = 0x8020_0003;
    const FALLBACK_SEED: u32 = 0xACE1_ACE1;

    let mut register = match seed {
        0 => FALLBACK_SEED,
        nonzero => nonzero,
    };

    let mut samples = Vec::with_capacity(n);
    for _ in 0..n {
        // Galois step: shift right, and fold the taps back in whenever
        // the bit that fell off the end was set.
        let shifted_out_one = register & 1 == 1;
        register >>= 1;
        if shifted_out_one {
            register ^= TAP_MASK;
        }
        // `as i32` reinterprets the top bit as the sign; the arithmetic
        // shift then narrows the value to roughly ±2^19.
        samples.push((register as i32) >> 12);
    }
    samples
}
/// Speech-like test signal: LFSR noise driven through a two-pole (AR(2))
/// resonator.
///
/// The resonator models a single formant, nominally near 700 Hz at a
/// 16 kHz sample rate with a bandwidth of about 100 Hz. Its pole pair
/// gives the output a speech-shaped spectral envelope, so the residuals
/// an LPC + Rice coder sees are close to Laplacian — the content class
/// such a coder is built for. This exercises the encoder's inner loops
/// more realistically than `multi_sine` (which converges immediately) or
/// white noise (which LPC cannot help with).
///
/// The excitation is `lfsr_noise(n, 0x5EED)`, so the output is fully
/// deterministic. Every sample is clamped to ±(2^23 − 1).
fn pseudo_speech(n: usize) -> Vec<i32> {
    // Q14 recursion coefficients for a pole pair r·e^{±jθ} with
    // r ≈ 0.98 and θ = 2π·700/16000:
    //   y[n] = a1·y[n−1] + a2·y[n−2] + e[n]
    //   a1 =  2r·cos(θ) · 2^14 ≈  30933
    //   a2 = −r²        · 2^14 ≈ −15751
    // The complex-conjugate poles sit inside the unit circle, so the
    // recursion stays stable however long the input is.
    const A1_Q14: i64 = 30_933;
    const A2_Q14: i64 = -15_751;
    const Q14_SHIFT: u32 = 14;
    // Half an LSB at Q14, added before the shift for round-to-nearest.
    const Q14_HALF: i64 = 1 << (Q14_SHIFT - 1);
    // Largest magnitude allowed by the codec's 24-bit input contract.
    const PEAK: i64 = (1 << 23) - 1;

    // (y[n−1], y[n−2]) — the filter memory, holding *clamped* outputs.
    let mut history: (i64, i64) = (0, 0);

    lfsr_noise(n, 0x5EED)
        .into_iter()
        .map(|excitation| {
            // Gain at the resonance peak is roughly 1/(1 − r) ≈ 50, so the
            // unclamped value can reach about 2^19 · 50 ≈ 2^25. Working in
            // i64 keeps the multiply-accumulate from overflowing.
            let accumulated = A1_Q14 * history.0 + A2_Q14 * history.1;
            let feedback = (accumulated + Q14_HALF) >> Q14_SHIFT;
            let sample = (feedback + i64::from(excitation)).clamp(-PEAK, PEAK);
            history = (sample, history.0);
            sample as i32
        })
        .collect()
}
/// Broadband test signal: LFSR noise smoothed by a one-pole IIR low-pass.
///
/// The pole is 0.9, held in Q15. This stands in for content that LPC
/// predicts poorly: the residual keeps most of the source entropy, so
/// the Rice coder accounts for most of the encoded bits.
///
/// The excitation is `lfsr_noise(n, 0xFEED)`, so the output is fully
/// deterministic. Every emitted sample is clamped to ±(2^23 − 1); the
/// filter state itself is carried forward unclamped.
fn filtered_noise(n: usize) -> Vec<i32> {
    const POLE_Q15: i64 = 29_491; // round(0.9 · 2^15)
    // (1 − pole) in Q15: input weight that gives the filter unity DC
    // gain, keeping the output magnitude near the excitation range.
    const ONE_MINUS_POLE_Q15: i64 = (1 << 15) - POLE_Q15;
    // Largest magnitude allowed by the 24-bit sample range.
    const PEAK: i64 = (1 << 23) - 1;

    let mut state: i64 = 0;

    lfsr_noise(n, 0xFEED)
        .into_iter()
        .map(|excitation| {
            // y[n] = pole·y[n−1] + (1 − pole)·x[n], evaluated in Q15 with
            // round-to-nearest on the way back to integer.
            let accumulated = POLE_Q15 * state + ONE_MINUS_POLE_Q15 * i64::from(excitation);
            state = (accumulated + (1 << 14)) >> 15;
            state.clamp(-PEAK, PEAK) as i32
        })
        .collect()
}
/// Declares one `#[bench]` function that times `encode_frame` on a
/// synthetic frame.
///
/// * `$name`   — identifier of the generated bench function.
/// * `$signal` — generator function, called as `$signal($size)`.
/// * `$size`   — frame length handed to the generator.
///
/// The frame is built once, outside the timed closure, so only the
/// encoder call is measured.
macro_rules! encode_bench {
    ($name:ident, $signal:ident, $size:expr) => {
        #[bench]
        fn $name(bencher: &mut Bencher) {
            let frame = $signal($size);
            bencher.iter(|| {
                // `black_box` hides the input from the optimiser so the
                // encode cannot be hoisted or folded away.
                let opaque_input = test::black_box(&frame);
                encode_frame(opaque_input)
            });
        }
    };
}
/// Declares one `#[bench]` function that times `decode_frame` on a frame
/// produced by `encode_frame`.
///
/// * `$name`   — identifier of the generated bench function.
/// * `$signal` — generator function, called as `$signal($size)`.
/// * `$size`   — frame length handed to the generator.
///
/// Signal generation and encoding both happen once during setup; the
/// timed closure contains only the decode and the `unwrap` of its result.
macro_rules! decode_bench {
    ($name:ident, $signal:ident, $size:expr) => {
        #[bench]
        fn $name(bencher: &mut Bencher) {
            let frame = $signal($size);
            let bitstream = encode_frame(&frame);
            bencher.iter(|| {
                // `black_box` hides the input from the optimiser so the
                // decode cannot be hoisted or folded away.
                let opaque_input = test::black_box(&bitstream);
                decode_frame(opaque_input).unwrap()
            });
        }
    };
}
// Encoder benches. Each line expands to one `#[bench]` function; the
// numeric suffix of the name is the frame length in samples.
//
// Silence frames.
encode_bench!(encode_silence_960, silence, 960);
encode_bench!(encode_silence_4096, silence, 4096);
// Tonal frames (`multi_sine`).
encode_bench!(encode_sine_256, multi_sine, 256);
encode_bench!(encode_sine_960, multi_sine, 960);
encode_bench!(encode_sine_1024, multi_sine, 1024);
encode_bench!(encode_sine_2048, multi_sine, 2048);
encode_bench!(encode_sine_2880, multi_sine, 2880);
encode_bench!(encode_sine_4096, multi_sine, 4096);
// Speech-like frames (`pseudo_speech`).
encode_bench!(encode_speech_320, pseudo_speech, 320);
encode_bench!(encode_speech_960, pseudo_speech, 960);
encode_bench!(encode_speech_2048, pseudo_speech, 2048);
// "Music" benches use `filtered_noise` as the stand-in signal.
encode_bench!(encode_music_960, filtered_noise, 960);
encode_bench!(encode_music_2048, filtered_noise, 2048);
encode_bench!(encode_music_4096, filtered_noise, 4096);
// Decoder benches: the frame is encoded once during setup and only the
// decode is timed. One or two sizes per signal class.
decode_bench!(decode_silence_4096, silence, 4096);
decode_bench!(decode_sine_960, multi_sine, 960);
decode_bench!(decode_sine_4096, multi_sine, 4096);
decode_bench!(decode_speech_960, pseudo_speech, 960);
decode_bench!(decode_music_2048, filtered_noise, 2048);
// ── SIMD dot-product kernel isolation benches ───────────────────────────────
//
// The encode benches above measure end-to-end encode cost, which is
// dominated by per-frame allocations, Rice k-search, and the order
// loop. When evaluating a SIMD kernel change these confound the signal.
// These benches exercise just the kernel at realistic LPC orders so a
// change to `compute_residuals` shows up directly.
/// Declares one `#[bench]` function that times `lac::compute_residuals`
/// in isolation.
///
/// * `$name`  — identifier of the generated bench function.
/// * `$order` — number of coefficients supplied to the kernel.
/// * `$len`   — number of input samples (a `multi_sine` frame).
///
/// The coefficients are an arbitrary deterministic ramp,
/// `k · 711 − 100` for `k` in `0..$order`. They are not the result of an
/// LPC analysis; they only give the kernel non-trivial operands. The
/// third argument to `compute_residuals` is passed as the literal `1`,
/// exactly as the kernel's signature requires at this call site.
macro_rules! compute_residuals_bench {
    ($name:ident, $order:expr, $len:expr) => {
        #[bench]
        fn $name(bencher: &mut Bencher) {
            let signal = multi_sine($len);

            let mut coefficients: Vec<i16> = Vec::new();
            for k in 0..$order {
                coefficients.push(((k as i16) * 711).wrapping_sub(100));
            }

            bencher.iter(|| {
                // Both operands go through `black_box` so the optimiser
                // cannot specialise the kernel on their contents.
                let opaque_signal = test::black_box(&signal);
                let opaque_coefficients = test::black_box(&coefficients);
                lac::compute_residuals(opaque_signal, opaque_coefficients, 1)
            });
        }
    };
}
// Kernel benches. Names follow `compute_residuals_order_<order>_n<len>`:
// `<order>` is the coefficient count and `<len>` the frame length.
//
// Order sweep at a fixed 320-sample frame.
compute_residuals_bench!(compute_residuals_order_4_n320, 4, 320);
compute_residuals_bench!(compute_residuals_order_8_n320, 8, 320);
compute_residuals_bench!(compute_residuals_order_16_n320, 16, 320);
compute_residuals_bench!(compute_residuals_order_32_n320, 32, 320);
// Longer frames at the low and high ends of the order range.
compute_residuals_bench!(compute_residuals_order_8_n960, 8, 960);
compute_residuals_bench!(compute_residuals_order_32_n960, 32, 960);
compute_residuals_bench!(compute_residuals_order_32_n4096, 32, 4096);