initial commit

Signed-off-by: Kamal Tufekcic <kamal@lo.sh>
2026-04-23 14:58:32 +03:00 · 2026-04-23 14:58:32 +03:00 · 7862cb1d9d
commit 7862cb1d9d
2884 changed files with 16797 additions and 0 deletions
--- a/benches/codec.rs
+++ b/benches/codec.rs
@ -0,0 +1,205 @@
+//! Encode/decode throughput benchmarks.
+//!
+//! Uses the nightly `test` crate harness (`#[bench]`), not criterion. Run
+//! with `cargo bench`. Results are wall-clock nanoseconds per iteration; to
+//! convert to samples-per-second, divide the frame sample count by the
+//! reported ns/iter and multiply by 10⁹.
+//!
+//! Representative sizes exercise the encoder's exhaustive search behaviour
+//! at different `partition_order` ceilings — 256 and 1024 are power-of-two
+//! frames (all seven partition orders available); 960 and 2880 mimic
+//! Opus/WebRTC frame sizes (only some partition orders divide evenly, so
+//! the inner search is sparser).
+
+#![feature(test)]
+
+extern crate test;
+
+use lac::{decode_frame, encode_frame};
+use test::Bencher;
+
+// ── Synthetic-signal benches ────────────────────────────────────────────────
+//
+// These benches drive the encoder without any WAV I/O overhead, so the
+// measurement is pure codec work. Four signal shapes cover the space:
+//
+// - `silence`: order-0 short-circuit path (skips Levinson entirely).
+// - `multi_sine`: maximally LPC-friendly — order search converges fast.
+// - `pseudo_speech`: AR(2) resonance on LFSR excitation, which is the
+//   textbook model of the vocal tract. Residuals are near-Laplacian,
+//   exercising the Rice k-search at realistic speech statistics.
+// - `filtered_noise`: LFSR through a single-pole IIR low-pass. Broad-spectrum
+//   content with no strong tonal structure — the hard case for LPC and
+//   a reasonable proxy for music-like workload.
+
+/// Returns a frame of `n` samples, every one of them zero.
+fn silence(n: usize) -> Vec<i32> {
+    std::iter::repeat(0i32).take(n).collect()
+}
+
+/// Returns `n` samples of three superimposed sinusoids.
+///
+/// The partials advance by 0.11, 0.27 and 0.43 radians per sample with peak
+/// amplitudes of 3 000 000, 1 500 000 and 750 000. Their combined peak
+/// (5 250 000) fits in the signed 24-bit range, so the bench body gets a
+/// tonal, LPC-friendly frame without touching any WAV file.
+fn multi_sine(n: usize) -> Vec<i32> {
+    let mut frame = Vec::with_capacity(n);
+    for i in 0..n {
+        let phase = i as f64;
+        let low = 3_000_000.0 * (phase * 0.11).sin();
+        let mid = 1_500_000.0 * (phase * 0.27).sin();
+        let high = 750_000.0 * (phase * 0.43).sin();
+        // Summed left to right, then truncated toward zero by the cast.
+        frame.push((low + mid + high) as i32);
+    }
+    frame
+}
+
+/// Deterministic pseudo-random `i32` sequence from a 32-bit Galois LFSR.
+///
+/// Each sample is the freshly stepped register reinterpreted as `i32` and
+/// shifted right arithmetically by 12, so values lie in `[-2^19, 2^19)` —
+/// one-sixteenth of the signed 24-bit range. That leaves headroom for the
+/// gain of filters applied to the sequence before it would clip.
+///
+/// A `seed` of 0 is replaced by `0xACE1_ACE1`, because an all-zero register
+/// can never leave the zero state.
+fn lfsr_noise(n: usize, seed: u32) -> Vec<i32> {
+    // Feedback mask for the maximal-length polynomial
+    // x^32 + x^22 + x^2 + x + 1 in right-shifting Galois form. The period of
+    // 2^32 − 1 is far longer than any frame generated here.
+    const TAPS: u32 = 0x8020_0003;
+
+    let mut reg = match seed {
+        0 => 0xACE1_ACE1,
+        s => s,
+    };
+    let mut out = Vec::with_capacity(n);
+    for _ in 0..n {
+        let feedback = reg & 1 == 1;
+        reg >>= 1;
+        if feedback {
+            reg ^= TAPS;
+        }
+        // `as i32` reinterprets the bits; the arithmetic shift keeps the sign.
+        out.push((reg as i32) >> 12);
+    }
+    out
+}
+
+/// AR(2) pseudo-speech: LFSR excitation run through one two-pole resonator,
+/// `y[n] = a1·y[n−1] + a2·y[n−2] + e[n]`, evaluated in Q14 fixed point.
+///
+/// The coefficients place a complex-conjugate pole pair at radius r ≈ 0.98
+/// and angle θ ≈ 2π·700/16000, i.e. a single formant near 700 Hz at a 16 kHz
+/// sample rate with roughly 100 Hz of bandwidth. Since r < 1 the pair sits
+/// inside the unit circle, so the recursion is stable for any `n`.
+///
+/// The result has a speech-shaped spectral envelope: content that LPC can
+/// exploit, without converging as trivially as `multi_sine`.
+fn pseudo_speech(n: usize) -> Vec<i32> {
+    // a1 = 2r·cos(θ) and a2 = −r², both scaled by 2^14.
+    const A1_Q14: i64 = 30_933;
+    const A2_Q14: i64 = -15_751;
+    // Half an LSB in Q14, added before the shift for round-to-nearest.
+    const Q14_HALF: i64 = 1 << 13;
+    // Largest magnitude allowed by the symmetric 24-bit clamp.
+    const PEAK_24: i64 = (1 << 23) - 1;
+
+    let mut prev1: i64 = 0;
+    let mut prev2: i64 = 0;
+    lfsr_noise(n, 0x5EED)
+        .into_iter()
+        .map(|e| {
+            // i64 keeps the multiply-accumulate from overflowing: resonant
+            // gain can lift the ±2^19 excitation past the 24-bit range.
+            let feedback = (A1_Q14 * prev1 + A2_Q14 * prev2 + Q14_HALF) >> 14;
+            let bounded = (feedback + i64::from(e)).clamp(-PEAK_24, PEAK_24);
+            // The clamped value (not the raw sum) is what gets fed back.
+            prev2 = prev1;
+            prev1 = bounded;
+            bounded as i32
+        })
+        .collect()
+}
+
+/// Broadband low-passed noise: LFSR excitation through a single-pole IIR
+/// low-pass, `y[n] = p·y[n−1] + (1 − p)·x[n]` with p = 0.9 in Q15.
+///
+/// This stands in for content LPC cannot predict efficiently, where the
+/// residual keeps most of the source entropy and the Rice coder accounts
+/// for most of the output bits.
+fn filtered_noise(n: usize) -> Vec<i32> {
+    const POLE_Q15: i64 = 29_491; // round(0.9 · 2^15)
+    // Input weight (1 − p) in Q15; the two weights sum to 2^15, giving unity
+    // DC gain so the output stays near the excitation's magnitude.
+    const ONE_MINUS_POLE_Q15: i64 = (1 << 15) - POLE_Q15;
+    // Largest magnitude allowed by the symmetric 24-bit clamp.
+    const PEAK_24: i64 = (1 << 23) - 1;
+
+    let mut state: i64 = 0;
+    lfsr_noise(n, 0xFEED)
+        .into_iter()
+        .map(|x| {
+            // Q15 multiply-accumulate, rounded to nearest on the way back down.
+            let acc = POLE_Q15 * state + ONE_MINUS_POLE_Q15 * i64::from(x);
+            state = (acc + (1 << 14)) >> 15;
+            // Only the emitted sample is clamped; the filter state is not.
+            state.clamp(-PEAK_24, PEAK_24) as i32
+        })
+        .collect()
+}
+
+// Declares a `#[bench]` named `$name` that times `encode_frame` on a frame of
+// `$size` samples produced by the generator function `$signal`. The frame is
+// built once, outside the timed closure.
+macro_rules! encode_bench {
+    ($name:ident, $signal:ident, $size:expr) => {
+        #[bench]
+        fn $name(bencher: &mut Bencher) {
+            let frame = $signal($size);
+            bencher.iter(|| {
+                // Hide the input from the optimizer so the encode is not folded away.
+                let input = test::black_box(&frame);
+                encode_frame(input)
+            });
+        }
+    };
+}
+
+// Declares a `#[bench]` named `$name` that times `decode_frame`. A frame of
+// `$size` samples from `$signal` is encoded once during setup; only the
+// decode (including the `unwrap` of its result) runs inside the timed closure.
+macro_rules! decode_bench {
+    ($name:ident, $signal:ident, $size:expr) => {
+        #[bench]
+        fn $name(bencher: &mut Bencher) {
+            let frame = $signal($size);
+            let bitstream = encode_frame(&frame);
+            bencher.iter(|| {
+                let packet = test::black_box(&bitstream);
+                decode_frame(packet).unwrap()
+            });
+        }
+    };
+}
+
+// Encode benches, one per (signal, frame size) pair. Bench names use `sine`
+// for `multi_sine`, `speech` for `pseudo_speech` and `music` for
+// `filtered_noise`; the trailing number is the frame length in samples.
+encode_bench!(encode_silence_960, silence, 960);
+encode_bench!(encode_silence_4096, silence, 4096);
+encode_bench!(encode_sine_256, multi_sine, 256);
+encode_bench!(encode_sine_960, multi_sine, 960);
+encode_bench!(encode_sine_1024, multi_sine, 1024);
+encode_bench!(encode_sine_2048, multi_sine, 2048);
+encode_bench!(encode_sine_2880, multi_sine, 2880);
+encode_bench!(encode_sine_4096, multi_sine, 4096);
+encode_bench!(encode_speech_320, pseudo_speech, 320);
+encode_bench!(encode_speech_960, pseudo_speech, 960);
+encode_bench!(encode_speech_2048, pseudo_speech, 2048);
+encode_bench!(encode_music_960, filtered_noise, 960);
+encode_bench!(encode_music_2048, filtered_noise, 2048);
+encode_bench!(encode_music_4096, filtered_noise, 4096);
+
+// Decode benches: a smaller selection using the same naming scheme.
+decode_bench!(decode_silence_4096, silence, 4096);
+decode_bench!(decode_sine_960, multi_sine, 960);
+decode_bench!(decode_sine_4096, multi_sine, 4096);
+decode_bench!(decode_speech_960, pseudo_speech, 960);
+decode_bench!(decode_music_2048, filtered_noise, 2048);
+
+// ── SIMD dot-product kernel isolation benches ───────────────────────────────
+//
+// The encode benches above measure end-to-end encode cost, which is
+// dominated by per-frame allocations, Rice k-search, and the order
+// loop. When evaluating a SIMD kernel change these confound the signal.
+// These benches exercise just the kernel at realistic LPC orders so a
+// change to `compute_residuals` shows up directly.
+
+// Declares a `#[bench]` named `$name` that times `lac::compute_residuals`
+// alone, on a `multi_sine` frame of `$len` samples with `$order` synthetic
+// coefficients. Frame and coefficients are built once, outside the timed
+// closure.
+macro_rules! compute_residuals_bench {
+    ($name:ident, $order:expr, $len:expr) => {
+        #[bench]
+        fn $name(bencher: &mut Bencher) {
+            let frame = multi_sine($len);
+            // Arbitrary but deterministic taps: 711·k − 100 for k in 0..order.
+            let mut taps = Vec::<i16>::new();
+            for k in 0..$order {
+                taps.push(((k as i16) * 711).wrapping_sub(100));
+            }
+            bencher.iter(|| {
+                let input = test::black_box(&frame);
+                let coeffs = test::black_box(&taps);
+                // NOTE(review): the trailing `1` is presumably the coefficient
+                // shift — confirm against `lac::compute_residuals`.
+                lac::compute_residuals(input, coeffs, 1)
+            });
+        }
+    };
+}
+
+// Kernel benches named `compute_residuals_order_<order>_n<len>`: orders 4 to
+// 32 at a 320-sample frame, plus selected orders at 960 and 4096 samples.
+compute_residuals_bench!(compute_residuals_order_4_n320, 4, 320);
+compute_residuals_bench!(compute_residuals_order_8_n320, 8, 320);
+compute_residuals_bench!(compute_residuals_order_16_n320, 16, 320);
+compute_residuals_bench!(compute_residuals_order_32_n320, 32, 320);
+compute_residuals_bench!(compute_residuals_order_8_n960, 8, 960);
+compute_residuals_bench!(compute_residuals_order_32_n960, 32, 960);
+compute_residuals_bench!(compute_residuals_order_32_n4096, 32, 4096);
--- a/benches/compare-flac.sh
+++ b/benches/compare-flac.sh
@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+#
+# Wall-clock + compressed-size comparison between FLAC and LAC on the
+# local corpus directory. Diagnostic only — not part of CI.
+#
+# Usage:
+#   benches/compare-flac.sh [corpus_dir]
+#
+# Output columns (tab-separated, stable for piping into column -t):
+#
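+#   file   flac_d_ms   flac_d_bytes   flac_b_ms   flac_b_bytes
+#
+# `_d` columns are flac at its default level, `_b` columns are `flac --best`.
+# LAC numbers are not printed by this script.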
+#
+# For the LAC side, run `cargo test --test corpus --release -- --nocapture`
+# and correlate `lac_enc_ms` / `lac=<bytes>` values against the filenames
+# printed here. Two separate invocations because automating the join is
+# more fragile than eyeballing it for the six files the corpus contains.
+
+set -euo pipefail
+
+CORPUS_DIR="${1:-corpus}"
+
+if ! command -v flac > /dev/null 2>&1; then
+    echo "flac CLI not found in PATH; install the flac package" >&2
+    exit 1
+fi
+if [[ ! -d "$CORPUS_DIR" ]]; then
+    echo "corpus directory not found: $CORPUS_DIR" >&2
+    exit 1
+fi
+
+# Header. Columns cover both FLAC modes — default (`-5`, what most
+# production pipelines actually use) and `--best` (`-8`, the ceiling
+# `tests/corpus.rs` asserts its ratios against). `column -t` on the
+# output aligns to the printf format below.
+printf "%-50s\t%12s\t%14s\t%12s\t%14s\n" \
+    "file" "flac_d_ms" "flac_d_bytes" "flac_b_ms" "flac_b_bytes"
+
+# Collect the corpus. `nullglob` makes an empty directory yield an empty
+# array instead of the literal pattern.
+shopt -s nullglob
+files=("$CORPUS_DIR"/*.wav)
+if (( ${#files[@]} == 0 )); then
+    echo "no .wav files found in: $CORPUS_DIR" >&2
+    exit 0
+fi
+
+# Bash already sorts glob results, but by the current locale's collation.
+# Re-sort bytewise so row order is identical on every machine. The list is
+# passed NUL-delimited and read back with `mapfile` (bash >= 4.4) so that
+# filenames are never word-split or re-globbed: with `nullglob` on, an
+# unquoted expansion would silently drop names containing `[`, `*` or `?`.
+mapfile -d '' -t files < <(printf '%s\0' "${files[@]}" | LC_ALL=C sort -z)
+
+# Warm-up invocation against the first file: the very first `flac` exec
+# in a shell session pays dynamic-linker + page-fault costs that aren't
+# representative of steady-state. Subsequent runs don't repay them.
+flac --stdout --best --silent "${files[0]}" > /dev/null 2>&1 || true
+
+for f in "${files[@]}"; do
+    # `date +%s%N` gives nanoseconds since epoch on GNU coreutils. Not
+    # portable to BSD `date`, but this script is Linux-only by design
+    # (matches the CI runner environment).
+    #
+    # Two invocations per file: default (`-5`) first, then `--best`.
+    # Ordering is deliberate: the default pass also warms the OS file
+    # cache, so `--best` sees warm-cache I/O and its time reflects the
+    # compute cost, not disk read.
+    #
+    # flac's stderr is discarded, so name the failing file explicitly;
+    # otherwise `set -e` + `pipefail` would end the run with no message.
+    start_ns=$(date +%s%N)
+    flac_d_bytes=$(flac --stdout --silent "$f" 2> /dev/null | wc -c) \
+        || { echo "flac (default) failed on: $f" >&2; exit 1; }
+    mid_ns=$(date +%s%N)
+    flac_b_bytes=$(flac --stdout --best --silent "$f" 2> /dev/null | wc -c) \
+        || { echo "flac (--best) failed on: $f" >&2; exit 1; }
+    end_ns=$(date +%s%N)
+    flac_d_ms=$(( (mid_ns - start_ns) / 1000000 ))
+    flac_b_ms=$(( (end_ns - mid_ns) / 1000000 ))
+    printf "%-50s\t%12d\t%14d\t%12d\t%14d\n" \
+        "$(basename "$f")" "$flac_d_ms" "$flac_d_bytes" "$flac_b_ms" "$flac_b_bytes"
+done