initial commit
Signed-off-by: Kamal Tufekcic <kamal@lo.sh>
This commit is contained in:
commit
7862cb1d9d
2884 changed files with 16797 additions and 0 deletions
369
tests/conformance.rs
Normal file
369
tests/conformance.rs
Normal file
|
|
@ -0,0 +1,369 @@
|
|||
//! Wire-format conformance fixtures.
|
||||
//!
|
||||
//! Each `DecodeFixture` pins a `(samples, bytes)` pair at the byte
|
||||
//! level. A second implementation of LAC MUST produce `samples` when
|
||||
//! fed `bytes` to its decoder. This test suite is the canonical
|
||||
//! reference for decoder conformance: byte-identical `bytes` across
|
||||
//! implementations aren't required (encoders have latitude in order /
|
||||
//! partition / k selection), but byte-identical decoder output is.
|
||||
//!
|
||||
//! # How this file works
|
||||
//!
|
||||
//! - `DECODE_FIXTURES` holds the pinned vectors.
|
||||
//! - `decode_fixtures` runs each fixture's bytes through `decode_frame`
|
||||
//! and asserts the output matches. This is the conformance test.
|
||||
//! - `encode_matches_fixtures` runs each fixture's samples through the
|
||||
//! reference encoder and asserts the bytes match. This catches
|
||||
//! unintentional drift in the reference's encoder strategy; a
|
||||
//! deliberate change (e.g. adding a new predictor or order) will fail
|
||||
//! this test and require regenerating the fixtures.
|
||||
//! - `generate_vectors` (ignored by default) prints the current
|
||||
//! reference encoder output in a paste-ready format. Run via
|
||||
//! `cargo test --test conformance generate_vectors --
|
||||
//! --ignored --nocapture` to refresh the fixtures after an
|
||||
//! intentional encoder change.
|
||||
//!
|
||||
//! # Rejection fixtures
|
||||
//!
|
||||
//! `REJECT_FIXTURES` pins header-level malformed inputs to their
|
||||
//! expected `DecodeError` variants. These are hand-constructed — the
|
||||
//! encoder never emits them — so they verify decoder rejection paths
|
||||
//! across implementations.
|
||||
|
||||
use lac::{DecodeError, decode_frame, encode_frame};
|
||||
|
||||
// ── Decode / encode fixtures ────────────────────────────────────────────────
|
||||
|
||||
/// One pinned `(samples, bytes)` conformance vector.
struct DecodeFixture {
    // Fixture id, used only in assertion/panic messages.
    name: &'static str,
    // Expected decoder output for `bytes`.
    samples: &'static [i32],
    // Pinned encoded frame, produced by the reference encoder via
    // `generate_vectors`. An empty slice marks a placeholder fixture.
    bytes: &'static [u8],
}
|
||||
|
||||
/// Pinned wire-format vectors. Populated from the reference encoder
/// via `generate_vectors` below. See `ENCODER_PIN` comment at the top
/// of the generator for the rationale on why this doubles as a drift
/// canary for the encoder.
///
/// Every `bytes` entry begins with the two sync bytes `0x1a, 0xcc`
/// followed by the 5-byte header (order, partition_order, shift,
/// reserved, count) — see the header layout exercised by
/// `REJECT_FIXTURES` below.
const DECODE_FIXTURES: &[DecodeFixture] = &[
    // ── Degenerate / smallest frames ───────────────────────────────
    DecodeFixture {
        name: "single_zero",
        samples: &[0],
        bytes: &[0x1a, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x01, 0x04],
    },
    DecodeFixture {
        name: "silence_4",
        samples: &[0, 0, 0, 0],
        bytes: &[0x1a, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x04, 0x07, 0x80],
    },
    DecodeFixture {
        name: "silence_8",
        samples: &[0, 0, 0, 0, 0, 0, 0, 0],
        bytes: &[0x1a, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x08, 0x07, 0xf8],
    },
    // ── Single-sample polarity + magnitude boundaries ──────────────
    DecodeFixture {
        name: "single_pos_one",
        samples: &[1],
        bytes: &[0x1a, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01],
    },
    DecodeFixture {
        name: "single_neg_one",
        samples: &[-1],
        bytes: &[0x1a, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02],
    },
    DecodeFixture {
        name: "single_full_scale_pos",
        samples: &[(1 << 23) - 1],
        bytes: &[
            0x1a, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x01, 0xbb, 0xff, 0xff, 0xf8,
        ],
    },
    DecodeFixture {
        name: "single_full_scale_neg",
        samples: &[-((1 << 23) - 1)],
        bytes: &[
            0x1a, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x01, 0xbb, 0xff, 0xff, 0xf4,
        ],
    },
    // ── DC and near-DC content ─────────────────────────────────────
    DecodeFixture {
        name: "dc_100_4",
        samples: &[100, 100, 100, 100],
        bytes: &[
            0x1a, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x04, 0x3b, 0x21, 0x90, 0xc8, 0x64, 0x00,
        ],
    },
    // ── Alternating polarity (Nyquist-like) ────────────────────────
    DecodeFixture {
        name: "alternating_small_4",
        samples: &[1000, -1000, 1000, -1000],
        bytes: &[
            0x1a, 0xcc, 0x00, 0x00, 0x00, 0x00, 0x04, 0x53, 0xe8, 0x3e, 0x7b, 0xe8, 0x3e, 0x78,
        ],
    },
    // ── Smooth polynomial — fixed predictor territory ──────────────
    DecodeFixture {
        name: "linear_ramp_8",
        samples: &[0, 100, 200, 300, 400, 500, 600, 700],
        bytes: &[
            0x1a, 0xcc, 0x02, 0x02, 0x02, 0x00, 0x08, 0x40, 0x00, 0xe0, 0x00, 0x34, 0x01, 0x20,
            0x18, 0x30, 0x60,
        ],
    },
    // ── 16-sample growing-amplitude (exercises partition + LPC) ────
    DecodeFixture {
        name: "lfsr_noise_16",
        samples: &[
            21, -100, 42, -200, 51, -400, 71, -800, 90, -1600, 110, -3200, 130, -6400, 151, -12800,
        ],
        bytes: &[
            0x1a, 0xcc, 0x00, 0x01, 0x00, 0x00, 0x10, 0x44, 0xab, 0x8f, 0x54, 0x63, 0xec, 0xc2,
            0x3f, 0x8e, 0x02, 0x7e, 0xc8, 0x5a, 0x71, 0xfe, 0x1b, 0x8c, 0x7f, 0xc4, 0x10, 0x47,
            0xfe, 0x25, 0xc0, 0x4f, 0xfc,
        ],
    },
];
|
||||
|
||||
// ── Rejection fixtures ──────────────────────────────────────────────────────
|
||||
|
||||
/// One hand-constructed malformed input pinned to its expected error.
struct RejectFixture {
    // Fixture id, used only in assertion/panic messages.
    name: &'static str,
    // Malformed frame bytes fed to `decode_frame`.
    bytes: &'static [u8],
    // Exact error variant the decoder must return for `bytes`.
    expected: DecodeError,
}
|
||||
|
||||
/// Header-level malformed inputs pinned to the `DecodeError` each one
/// must produce. These are hand-constructed — the reference encoder
/// never emits them — so they exercise rejection paths that round-trip
/// tests cannot reach.
const REJECT_FIXTURES: &[RejectFixture] = &[
    RejectFixture {
        name: "bad_sync_word",
        // Sync byte flipped to 0xFF; rest is a well-formed minimal
        // verbatim header so the decoder only rejects on the first
        // check.
        bytes: &[0xFF, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x01],
        expected: DecodeError::BadSyncWord { got: 0xFFCC },
    },
    RejectFixture {
        name: "prediction_order_above_max",
        // Byte 2 (prediction_order) = 0x21 = 33, above the maximum.
        bytes: &[0x1A, 0xCC, 0x21, 0x00, 0x00, 0x00, 0x01],
        expected: DecodeError::InvalidPredictionOrder { got: 33 },
    },
    RejectFixture {
        name: "partition_order_above_max",
        // Byte 3 (partition_order) = 8, above the maximum of 7.
        bytes: &[0x1A, 0xCC, 0x00, 0x08, 0x00, 0x00, 0x01],
        expected: DecodeError::InvalidPartitionOrder { got: 8 },
    },
    RejectFixture {
        name: "coefficient_shift_above_max",
        // Non-zero prediction_order so the shift is actually used.
        // 2 bytes of (zero) coefficient follow so the header is
        // structurally valid before the shift check fires.
        bytes: &[0x1A, 0xCC, 0x01, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00],
        expected: DecodeError::InvalidCoefficientShift { got: 6 },
    },
    RejectFixture {
        name: "coefficient_shift_without_order",
        // order = 0, shift = 3 — contradictory per spec §3.4.
        bytes: &[0x1A, 0xCC, 0x00, 0x00, 0x03, 0x00, 0x01],
        expected: DecodeError::CoefficientShiftWithoutOrder { shift: 3 },
    },
    RejectFixture {
        name: "zero_frame_sample_count",
        // Final header byte (sample count) = 0.
        bytes: &[0x1A, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x00],
        expected: DecodeError::InvalidParameter,
    },
    RejectFixture {
        name: "frame_count_not_divisible_by_partition_count",
        // partition_order = 3 → 8 partitions, count = 7 doesn't divide.
        bytes: &[0x1A, 0xCC, 0x00, 0x03, 0x00, 0x00, 0x07],
        expected: DecodeError::InvalidParameter,
    },
    RejectFixture {
        name: "truncated_before_header_complete",
        // Only 3 bytes — below the 7-byte fixed header minimum.
        bytes: &[0x1A, 0xCC, 0x00],
        expected: DecodeError::Truncated,
    },
    RejectFixture {
        name: "truncated_before_coefficients",
        // prediction_order = 2 claims 4 trailing coefficient bytes,
        // but only 7 bytes are present.
        bytes: &[0x1A, 0xCC, 0x02, 0x00, 0x00, 0x00, 0x04],
        expected: DecodeError::Truncated,
    },
    RejectFixture {
        name: "truncated_before_rice_bitstream",
        // Fully valid 7-byte header with count=1 and order=0 (no
        // coefficients), then no Rice bytes at all. Decoder reads the
        // header, enters Rice decode, tries to read the 5-bit `k`
        // field, and fails. Covers the third truncation class in
        // spec §6 (Rice-bitstream-level, distinct from header and
        // coefficient-array truncations above).
        bytes: &[0x1A, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x01],
        expected: DecodeError::Truncated,
    },
    RejectFixture {
        name: "rice_k_above_max",
        // Valid 7-byte verbatim header; first Rice byte stores `k=31`
        // in its high 5 bits (31 = 0b11111, left-shifted 3 = 0xF8).
        // The decoder reads `k` and rejects immediately — never
        // proceeds to the codeword — per spec §4.1 (k must be in
        // [0, 23]).
        bytes: &[0x1A, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x01, 0xF8],
        expected: DecodeError::InvalidParameter,
    },
];
|
||||
|
||||
// ── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn decode_fixtures() {
    // The canonical conformance check: every pinned byte vector must
    // decode to exactly the claimed samples. A second implementation
    // whose decoder fails this test is non-conformant.
    //
    // Fixtures with empty `bytes` are placeholders awaiting a
    // `generate_vectors` refresh and are skipped.
    for fixture in DECODE_FIXTURES.iter().filter(|f| !f.bytes.is_empty()) {
        match decode_frame(fixture.bytes) {
            Ok(decoded) => assert_eq!(
                decoded, fixture.samples,
                "fixture {}: decoded samples mismatch",
                fixture.name
            ),
            Err(e) => panic!("fixture {}: decode failed with {e:?}", fixture.name),
        }
    }
}
|
||||
|
||||
#[test]
fn encode_matches_fixtures() {
    // Drift canary: the reference encoder currently emits exactly these
    // bytes for these inputs. A deliberate strategy change (new
    // predictor, different search grid, different tie-break) will trip
    // this test and should be followed by a `generate_vectors` refresh
    // of the pinned `bytes`.
    //
    // Note this is reference-specific: second implementations are free
    // to emit different (still decodable) bytes.
    for fixture in DECODE_FIXTURES {
        if fixture.bytes.is_empty() {
            // Placeholder fixture — nothing pinned yet.
            continue;
        }
        let produced = encode_frame(fixture.samples);
        assert_eq!(
            produced.as_slice(),
            fixture.bytes,
            "fixture {}: encoder output drifted from pinned bytes",
            fixture.name
        );
    }
}
|
||||
|
||||
#[test]
fn reject_fixtures() {
    // Each malformed input must fail with its exact pinned variant;
    // a successful decode is itself a failure.
    for fixture in REJECT_FIXTURES {
        let result = decode_frame(fixture.bytes);
        match result {
            Err(got) => assert_eq!(
                got, fixture.expected,
                "fixture {}: wrong error variant",
                fixture.name
            ),
            Ok(samples) => panic!(
                "fixture {}: expected {:?}, got Ok with {} samples",
                fixture.name,
                fixture.expected,
                samples.len()
            ),
        }
    }
}
|
||||
|
||||
/// Spec §6 rejection class 10: decoder **must** reject any codeword
/// whose unary run length exceeds `u32::MAX >> k` (equivalently
/// `q > (2³² − 1) / 2^k`). Lives here rather than in `REJECT_FIXTURES`
/// because the minimal triggering payload is ~75 bytes of mostly
/// zeros — a const array of that shape is noise, and building it
/// programmatically (`k = 23`, so `q_max = 511`; emit 512 unary zeros;
/// then terminator + zero remainder) documents the intent better than
/// a hex blob.
#[test]
fn reject_unary_run_above_cap() {
    // Header: sync + order=0 + po=0 + shift=0 + count=1. The Rice
    // section begins at byte 7 with no coefficient array in between.
    let mut bytes: Vec<u8> = vec![0x1A, 0xCC, 0x00, 0x00, 0x00, 0x00, 0x01];

    // Minimal MSB-first bit packer, mirroring the bit ordering the
    // codec's own `bit_io` uses. Raw byte arithmetic here would just
    // duplicate that ordering logic less legibly.
    struct Packer {
        out: Vec<u8>,
        acc: u32,
        used: u32,
    }
    impl Packer {
        fn push(&mut self, value: u32, count: u32) {
            for i in (0..count).rev() {
                self.acc = (self.acc << 1) | ((value >> i) & 1);
                self.used += 1;
                if self.used == 8 {
                    self.out.push(self.acc as u8);
                    self.acc = 0;
                    self.used = 0;
                }
            }
        }
        fn finish(mut self) -> Vec<u8> {
            // Flush any partial byte, left-aligned (spec §4.3).
            if self.used > 0 {
                self.out.push((self.acc << (8 - self.used)) as u8);
            }
            self.out
        }
    }

    // Payload structure (541 bits total, padded to 544 = 68 bytes):
    //   5 bits:  k = 23 (triggers q_max = u32::MAX >> 23 = 511)
    // 512 bits:  unary zeros (one past the cap)
    //   1 bit:   terminator = 1
    //  23 bits:  remainder = 0
    let mut packer = Packer {
        out: Vec::with_capacity(68),
        acc: 0,
        used: 0,
    };
    packer.push(23, 5); // k = 23
    for _ in 0..512 {
        packer.push(0, 1);
    }
    packer.push(1, 1); // terminator
    packer.push(0, 23); // remainder

    bytes.extend_from_slice(&packer.finish());
    assert_eq!(bytes.len(), 7 + 68, "unexpected fixture length");

    assert_eq!(
        decode_frame(&bytes),
        Err(DecodeError::InvalidParameter),
        "q=512 with k=23 exceeds u32::MAX >> k = 511; decoder must reject \
         with InvalidParameter per spec §4.2"
    );
}
|
||||
|
||||
#[test]
#[ignore = "helper for refreshing the pinned byte literals"]
fn generate_vectors() {
    // Prints DECODE_FIXTURES entries in paste-ready format. Run with
    //   cargo test --test conformance generate_vectors -- --ignored --nocapture
    // then copy the output over the existing fixture bodies. Intended
    // for use after a deliberate encoder-strategy change; it is ignored
    // in the normal test flow so silent drift can't be accepted by
    // accident.
    for fixture in DECODE_FIXTURES {
        let encoded = encode_frame(fixture.samples);
        let samples: Vec<String> = fixture.samples.iter().map(|s| s.to_string()).collect();
        let bytes: Vec<String> = encoded.iter().map(|b| format!("{b:#04x}")).collect();
        eprintln!(" DecodeFixture {{");
        eprintln!(" name: {:?},", fixture.name);
        eprintln!(" samples: &[{}],", samples.join(", "));
        eprintln!(" bytes: &[{}],", bytes.join(", "));
        eprintln!(" }},");
    }
}
|
||||
351
tests/corpus.rs
Normal file
351
tests/corpus.rs
Normal file
|
|
@ -0,0 +1,351 @@
|
|||
//! Real-audio corpus tests: round-trip, compression ratio, FLAC comparison.
|
||||
//!
|
||||
//! These are integration tests — run with `cargo test --test corpus` to see
|
||||
//! the printed ratio numbers, or `cargo test --test corpus -- --nocapture`
|
||||
//! for verbose output during development.
|
||||
//!
|
||||
//! Each test is gated by the presence of its corpus file so the suite still
|
||||
//! passes on a clean checkout without the WAV data.
|
||||
//!
|
||||
//! # What we measure
|
||||
//!
|
||||
//! - **Round-trip**: encode every frame of the input, decode back, verify
|
||||
//! sample-for-sample equality. Failure here means the codec is broken.
|
||||
//! - **Compression ratio**: `encoded_size / raw_size` for the full file,
|
||||
//! framed at a realistic frame size. Ratios below 0.6 are "good" for a
|
||||
//! lossless codec; ratios above 0.8 suggest something is wrong with the
|
||||
//! LPC or Rice tuning.
|
||||
//! - **FLAC comparison**: invoke `flac` CLI on the same input, compare
|
||||
//! compressed sizes. We expect LAC to be within 10-25% of FLAC size on
|
||||
//! most content; consistently much worse indicates a real codec gap (the
|
||||
//! Q15 coefficient-clamping limitation, most likely).
|
||||
//! - **LAC encode wall-clock**: printed as `lac_enc_ms`. Not asserted —
|
||||
//! CI hardware variance makes any ceiling either useless or flaky. The
|
||||
//! number is visibility-only, meant to be correlated against
|
||||
//! `bench/compare-flac.sh` output for an engineer-side speed sanity
|
||||
//! check. Only the encode hot path is timed (decode is excluded: at
|
||||
//! microsecond scale, decode speed is dominated by allocator noise).
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use hound::WavReader;
|
||||
use lac::{decode_frame, encode_frame};
|
||||
|
||||
/// Directory (relative to the crate root) that holds the corpus WAVs.
/// Tests gate on file presence, so a checkout without this directory
/// still passes.
const CORPUS_DIR: &str = "corpus";

/// Encode-side frame length. 4096 is FLAC's default blocksize at
/// compression levels 1-8 for `≤ 48 kHz` content and sits inside LAC's
/// own archival-default band (README "Offline / archival"). Using the
/// same block size on both sides makes the FLAC comparison
/// apples-to-apples: neither codec is charged for a block-size
/// mismatch with the other.
///
/// Every partition order `0..=7` divides 4096, so the encoder stays on
/// the dense search path throughout.
const FRAME_SIZE: usize = 4096;
|
||||
|
||||
// ── WAV loading ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// Load a WAV file as separated mono channels of i32 samples.
|
||||
///
|
||||
/// Samples are passed through at their native width — 16-bit values stay
|
||||
/// 16-bit, inside the i32 carrier. LAC's only hard constraint is that
|
||||
/// `|sample|` fits in 24 bits (ceiling for autocorrelation overflow
|
||||
/// analysis); narrower inputs compress according to their actual
|
||||
/// magnitudes.
|
||||
///
|
||||
/// The earlier version of this loader left-shifted to "promote" 16-bit to
|
||||
/// 24-bit. That was wrong: Rice coding tracks residual magnitude, so a
|
||||
/// 256× amplification costs 8 extra bits per residual and inflates the
|
||||
/// output size proportionally.
|
||||
fn load_wav_channels(path: &Path) -> Option<Vec<Vec<i32>>> {
|
||||
let mut reader = WavReader::open(path).ok()?;
|
||||
let spec = reader.spec();
|
||||
if spec.sample_format != hound::SampleFormat::Int {
|
||||
return None;
|
||||
}
|
||||
let ch = spec.channels as usize;
|
||||
|
||||
// Sanity check: reject inputs whose values won't fit in 24 bits. In
|
||||
// practice the corpus is 16-bit or 24-bit integer PCM so this is just
|
||||
// a defensive guard.
|
||||
if spec.bits_per_sample > 24 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut channels: Vec<Vec<i32>> = (0..ch).map(|_| Vec::new()).collect();
|
||||
for (i, s) in reader.samples::<i32>().enumerate() {
|
||||
let s = s.ok()?;
|
||||
channels[i % ch].push(s);
|
||||
}
|
||||
Some(channels)
|
||||
}
|
||||
|
||||
fn corpus_path(name: &str) -> PathBuf {
|
||||
Path::new(CORPUS_DIR).join(name)
|
||||
}
|
||||
|
||||
/// Skip a test if its WAV file isn't present, so the suite stays green on
/// a clean checkout. Prints a one-line hint so the operator knows why.
///
/// Expands to an early `return` from the enclosing function, so it is
/// only usable at statement position inside a `()`-returning test fn.
macro_rules! require_corpus {
    ($path:expr) => {
        if !$path.exists() {
            eprintln!("skipping: corpus file not found: {}", $path.display());
            return;
        }
    };
}
|
||||
|
||||
// ── Per-frame round-trip harness ────────────────────────────────────────────
|
||||
|
||||
/// Aggregate per-file measurement: raw byte count, encoded byte count,
/// and wall-clock time spent inside `encode_frame`. Decode time is not
/// reported — it is an order of magnitude smaller than encode for every
/// content class in the corpus, so encode-side is the ratio of
/// interest.
struct Measurement {
    raw_bytes: usize,
    encoded_bytes: usize,
    encode_time: Duration,
}

impl Measurement {
    /// Zeroed accumulator, ready for `add`.
    fn new() -> Measurement {
        Measurement {
            raw_bytes: 0,
            encoded_bytes: 0,
            encode_time: Duration::ZERO,
        }
    }

    /// Fold another measurement into this one, field by field.
    fn add(&mut self, other: &Measurement) {
        self.raw_bytes += other.raw_bytes;
        self.encoded_bytes += other.encoded_bytes;
        self.encode_time += other.encode_time;
    }
}
|
||||
|
||||
/// Encode every `FRAME_SIZE`-sample chunk of `channel`, decode, assert
|
||||
/// equality. The trailing partial chunk (if any) is encoded at whatever
|
||||
/// partition_order divides its length; the encoder's search handles that
|
||||
/// automatically. `encode_time` captures only the encode hot path —
|
||||
/// decode, allocation of the returned `Vec`, and the round-trip assert
|
||||
/// are excluded so the number is directly comparable to `flac`'s wall
|
||||
/// clock at the same input.
|
||||
fn roundtrip_channel(channel: &[i32], bytes_per_sample: usize) -> Measurement {
|
||||
let mut m = Measurement::new();
|
||||
for chunk in channel.chunks(FRAME_SIZE) {
|
||||
let t = Instant::now();
|
||||
let encoded = encode_frame(chunk);
|
||||
m.encode_time += t.elapsed();
|
||||
let decoded = decode_frame(&encoded).expect("decode_frame failed on own output");
|
||||
assert_eq!(
|
||||
decoded,
|
||||
chunk,
|
||||
"round-trip mismatch in frame of {} samples",
|
||||
chunk.len()
|
||||
);
|
||||
m.raw_bytes += chunk.len() * bytes_per_sample;
|
||||
m.encoded_bytes += encoded.len();
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
/// Aggregate round-trip over every channel in a WAV file. Returns a
|
||||
/// `Measurement` whose `raw_bytes` uses the WAV's actual sample width,
|
||||
/// so the ratio corresponds to the over-the-wire comparison a user
|
||||
/// would do against the same file encoded with FLAC.
|
||||
fn roundtrip_wav(path: &Path) -> Measurement {
|
||||
// Probe the spec once to know the bytes-per-sample for the raw-size
|
||||
// denominator. `load_wav_channels` filters unsupported formats, so the
|
||||
// spec read here is guaranteed to be a supported integer format.
|
||||
let spec = WavReader::open(path).expect("open for spec").spec();
|
||||
// Bits per sample rounds up to bytes: 16 → 2, 20 → 3, 24 → 3.
|
||||
let bytes_per_sample = spec.bits_per_sample.div_ceil(8) as usize;
|
||||
|
||||
let channels = load_wav_channels(path).expect("load_wav_channels failed");
|
||||
let mut totals = Measurement::new();
|
||||
for ch in &channels {
|
||||
let m = roundtrip_channel(ch, bytes_per_sample);
|
||||
totals.add(&m);
|
||||
}
|
||||
totals
|
||||
}
|
||||
|
||||
// ── FLAC comparison ─────────────────────────────────────────────────────────
|
||||
|
||||
/// Invoke the `flac` CLI to compress `path`, returning the resulting
/// byte count, or `None` if the tool is not installed or fails.
fn flac_compress_size(path: &Path) -> Option<usize> {
    // `--stdout --silent` writes the compressed stream to stdout so we
    // never touch the filesystem for the output. `-o -` would do the
    // same but isn't universally supported across FLAC versions.
    let out = Command::new("flac")
        .args(["--stdout", "--silent", "--best"])
        .arg(path)
        .output()
        .ok()?;
    out.status.success().then(|| out.stdout.len())
}
|
||||
|
||||
// ── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
fn report_ratio(name: &str, m: &Measurement, flac_size: Option<usize>) {
|
||||
let ratio = m.encoded_bytes as f64 / m.raw_bytes as f64;
|
||||
let enc_ms = m.encode_time.as_secs_f64() * 1000.0;
|
||||
eprint!(
|
||||
"{name:40} raw={:>10} lac={:>10} ratio={ratio:.3} lac_enc_ms={enc_ms:>7.1}",
|
||||
m.raw_bytes, m.encoded_bytes,
|
||||
);
|
||||
if let Some(flac) = flac_size {
|
||||
let flac_ratio = flac as f64 / m.raw_bytes as f64;
|
||||
// `lac_vs_flac` > 1.0 means LAC is bigger than FLAC. Anything
|
||||
// above ~1.3 on typical content points at the Q15-clamping
|
||||
// limitation and motivates adding the coefficient-shift field.
|
||||
let lac_vs_flac = m.encoded_bytes as f64 / flac as f64;
|
||||
eprint!(" flac={flac:>10} flac_ratio={flac_ratio:.3} lac/flac={lac_vs_flac:.3}");
|
||||
}
|
||||
eprintln!();
|
||||
}
|
||||
|
||||
// Music — solo piano from the Open Goldberg Variations project (Kimiko
|
||||
// Ishizaka's recording of J.S. Bach's BWV 988, released CC0 by the
|
||||
// project). 96 kHz / 24-bit / stereo studio masters — genuine lossless
|
||||
// source, redistributable, covering three distinct pianistic content
|
||||
// classes between them.
|
||||
|
||||
/// Common filename prefix shared by every Open Goldberg Variations
/// track in the corpus; individual tests append the track suffix.
const GOLDBERG_PREFIX: &str =
    "Kimiko Ishizaka - J.S. Bach- -Open- Goldberg Variations, BWV 988 (Piano)";
|
||||
|
||||
#[test]
fn bach_aria() {
    // Slow, sustained, lyrical sarabande — long held notes, gentle
    // bass. LPC's best-case piano content: sustained harmonics, minimal
    // transients, smooth melodic motion. 300 s / 96 kHz / 24-bit stereo.
    let path = corpus_path(&format!("{GOLDBERG_PREFIX} - 01 Aria.wav"));
    require_corpus!(path);

    let measured = roundtrip_wav(&path);
    report_ratio("bach_aria (solo piano, tonal)", &measured, flac_compress_size(&path));

    // Measured ~0.483 at FRAME_SIZE=4096 on 96 kHz / 24-bit stereo;
    // the 0.503 ceiling keeps the ~2 pp regression budget used
    // elsewhere in the suite.
    let ratio = measured.encoded_bytes as f64 / measured.raw_bytes as f64;
    assert!(
        ratio < 0.503,
        "bach_aria ratio {} exceeds regression ceiling 0.503",
        ratio
    );
}
|
||||
|
||||
#[test]
fn bach_variatio_4_fughetta() {
    // Short fugal variation: tight polyphonic counterpoint with
    // interleaved voices. Fast melodic runs stress Rice k-selection on
    // richer residual statistics than the Aria. At ~68 s it is the
    // cheapest music test in the suite.
    let path = corpus_path(&format!("{GOLDBERG_PREFIX} - 05 Variatio 4 a 1 Clav..wav"));
    require_corpus!(path);

    let measured = roundtrip_wav(&path);
    report_ratio("bach_variatio_4 (fugal)", &measured, flac_compress_size(&path));

    // Measured ~0.514; the 0.534 ceiling keeps the ~2 pp regression budget.
    let ratio = measured.encoded_bytes as f64 / measured.raw_bytes as f64;
    assert!(
        ratio < 0.534,
        "bach_variatio_4 ratio {} exceeds regression ceiling 0.534",
        ratio
    );
}
|
||||
|
||||
#[test]
fn bach_variatio_16_ouverture() {
    // French-overture style: dotted rhythms, strong attacks, ornamented
    // runs. The transient-heavy content class — residual statistics
    // shift mid-frame as runs punctuate sustained harmonies, which
    // exercises the partition_order search.
    let path = corpus_path(&format!(
        "{GOLDBERG_PREFIX} - 17 Variatio 16 a 1 Clav. Ouverture.wav"
    ));
    require_corpus!(path);

    let measured = roundtrip_wav(&path);
    report_ratio("bach_variatio_16 (ouverture)", &measured, flac_compress_size(&path));

    // Measured ~0.512; the 0.532 ceiling keeps the ~2 pp regression budget.
    let ratio = measured.encoded_bytes as f64 / measured.raw_bytes as f64;
    assert!(
        ratio < 0.532,
        "bach_variatio_16 ratio {} exceeds regression ceiling 0.532",
        ratio
    );
}
|
||||
|
||||
// Speech — AMI meeting corpus. Clean headset mic gives near-ideal speech
|
||||
// conditions; residuals should be near-Laplacian, which is exactly what
|
||||
// Rice coding is optimal for. Expect the best ratios here.
|
||||
|
||||
#[test]
fn ami_headset_speech() {
    // Clean headset mic: near-ideal speech conditions, so residuals
    // should be near-Laplacian — Rice coding's sweet spot. Expect the
    // best ratio in the suite here.
    let path = corpus_path("ES2002a.Headset-0.wav");
    require_corpus!(path);

    let measured = roundtrip_wav(&path);
    report_ratio("ami_headset_speech", &measured, flac_compress_size(&path));

    // Measured ~0.178 at FRAME_SIZE=4096; the 0.195 ceiling gives a
    // ~2 pp regression budget.
    let ratio = measured.encoded_bytes as f64 / measured.raw_bytes as f64;
    assert!(
        ratio < 0.195,
        "headset ratio {} exceeds regression ceiling 0.195",
        ratio
    );
}
|
||||
|
||||
#[test]
fn ami_array_speech() {
    // Tabletop array mic: distant speech plus room acoustics. Less
    // predictable than a headset mic — a useful LPC stress case.
    let path = corpus_path("ES2002a.Array1-01.wav");
    require_corpus!(path);

    let measured = roundtrip_wav(&path);
    report_ratio("ami_array_speech", &measured, flac_compress_size(&path));

    // Measured ~0.375 at FRAME_SIZE=4096; the 0.395 ceiling gives a
    // ~2 pp regression budget.
    let ratio = measured.encoded_bytes as f64 / measured.raw_bytes as f64;
    assert!(
        ratio < 0.395,
        "array speech ratio {} exceeds regression ceiling 0.395",
        ratio
    );
}
|
||||
|
||||
#[test]
fn ami_mixed_meeting() {
    // Mixed headset feed with multiple simultaneous speakers — the
    // highest spectral complexity in the corpus.
    let path = corpus_path("ES2002a.Mix-Headset.wav");
    require_corpus!(path);

    let measured = roundtrip_wav(&path);
    report_ratio("ami_mixed_meeting", &measured, flac_compress_size(&path));

    // Measured ~0.292 at FRAME_SIZE=4096; the 0.312 ceiling gives a
    // ~2 pp regression budget.
    let ratio = measured.encoded_bytes as f64 / measured.raw_bytes as f64;
    assert!(
        ratio < 0.312,
        "mixed meeting ratio {} exceeds regression ceiling 0.312",
        ratio
    );
}
|
||||
|
||||
// The sparse-vs-exhaustive encoder differential lives in
|
||||
// `src/frame.rs::tests::sparse_vs_exhaustive_on_headset_speech` — it
|
||||
// needs `encode_frame_with_grid`, which is `pub(crate)` rather than
|
||||
// part of the semver surface. Same fixture file, same assertions;
|
||||
// moved into the crate-private test module when the grid entry-point
|
||||
// was demoted from `pub #[doc(hidden)]`.
|
||||
103
tests/determinism.rs
Normal file
103
tests/determinism.rs
Normal file
|
|
@ -0,0 +1,103 @@
|
|||
//! Determinism fence: the encoder is contractually bit-exact on the
|
||||
//! same input, and the decoder is contractually bit-exact on the same
|
||||
//! bitstream. This file exists to catch a future change that would
|
||||
//! quietly break either property — a parallel order search with a
|
||||
//! race, a `HashMap` iteration order leaking into state, a
|
||||
//! non-deterministic tie-break. All three would pass the round-trip
|
||||
//! tests but fail here.
|
||||
//!
|
||||
//! Tests drive small deterministic inputs (LFSR noise, sine) so they
|
||||
//! don't depend on corpus presence and run in milliseconds.
|
||||
|
||||
use lac::{decode_frame, encode_frame};
|
||||
|
||||
// ── Deterministic inputs ────────────────────────────────────────────────────
|
||||
|
||||
/// 32-bit Galois LFSR, mirrored from `tests/synthetic.rs`. Seeded
/// deterministically, so repeated calls with the same `seed` yield
/// bit-identical output — the property the determinism tests rely on.
///
/// Values are arithmetic-shifted down to `bit_depth` bits and clamped
/// at the negative side so the range is symmetric:
/// `[-(2^(b-1) - 1), 2^(b-1) - 1]`.
fn lfsr_noise(n: usize, bit_depth: u8, seed: u32) -> Vec<i32> {
    assert!((1..=24).contains(&bit_depth));
    // Seed 0 would lock the LFSR at zero forever; substitute a fixed
    // nonzero state instead.
    let mut state: u32 = if seed == 0 { 0xACE1_ACE1 } else { seed };
    let shift = 32 - u32::from(bit_depth);
    let floor: i32 = -((1i32 << (bit_depth - 1)) - 1);

    let mut out = Vec::with_capacity(n);
    for _ in 0..n {
        let feedback = state & 1 != 0;
        state >>= 1;
        if feedback {
            state ^= 0x8020_0003;
        }
        out.push(((state as i32) >> shift).max(floor));
    }
    out
}
|
||||
|
||||
// ── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn encode_byte_equal_on_same_input_silence() {
    // Simplest determinism probe: two encodes of all-zero input.
    let silence = vec![0i32; 4096];
    let first = encode_frame(&silence);
    let second = encode_frame(&silence);
    assert_eq!(first, second, "encoder produced different bytes for identical input");
}
|
||||
|
||||
#[test]
fn encode_byte_equal_on_same_input_noise() {
    // 16-bit noise — exercises the non-trivial LPC/Rice search path
    // where any latent non-determinism (tie-break, parallelism, hash
    // ordering) would be most likely to surface.
    let samples = lfsr_noise(4096, 16, 0xDEAD);
    let a = encode_frame(&samples);
    let b = encode_frame(&samples);
    // On failure, show the leading bytes of both encodings. The old
    // message formatted `a.len().min(16)` — i.e. it printed the *count*
    // "(first 16 bytes)" without any of the bytes themselves.
    assert_eq!(
        a,
        b,
        "encoder produced different bytes on a noisy input (first bytes: {:?} vs {:?})",
        &a[..a.len().min(16)],
        &b[..b.len().min(16)]
    );
}
|
||||
|
||||
#[test]
fn encode_byte_equal_on_same_input_24bit_full_scale() {
    // Full-scale 24-bit content stresses the autocorrelation
    // accumulator width; any non-determinism in coefficient
    // quantization would surface differently here than in the
    // silence and noise cases.
    let peak = (1 << 23) - 1;
    let samples = vec![peak; 2048];
    assert_eq!(encode_frame(&samples), encode_frame(&samples));
}
|
||||
|
||||
#[test]
fn encode_byte_equal_across_many_repeats() {
    // Ten encodes of the same input — an intermittent race could slip
    // past a two-shot comparison by luck. The LFSR seed is distinct
    // from the other tests' seeds so failure modes stay uncorrelated.
    let samples = lfsr_noise(2048, 20, 0xBEEF);
    let reference = encode_frame(&samples);
    for attempt in 1..10 {
        let current = encode_frame(&samples);
        assert_eq!(
            current, reference,
            "encode #{} differs from the reference encode",
            attempt
        );
    }
}
|
||||
|
||||
#[test]
fn decode_byte_equal_on_same_input() {
    // Decoder determinism: the same bitstream must yield the same
    // samples every time, and those samples must round-trip exactly.
    let samples = lfsr_noise(2048, 16, 0xF00D);
    let bytes = encode_frame(&samples);
    let first = decode_frame(&bytes).expect("decode a");
    let second = decode_frame(&bytes).expect("decode b");
    assert_eq!(
        first, second,
        "decoder produced different samples for identical bytes"
    );
    assert_eq!(first, samples, "decoder output doesn't match original samples");
}
|
||||
376
tests/latency.rs
Normal file
376
tests/latency.rs
Normal file
|
|
@ -0,0 +1,376 @@
|
|||
//! Per-frame latency distribution and heap footprint on real audio.
|
||||
//!
|
||||
//! Answers three questions the aggregate benchmark leaves fuzzy:
|
||||
//!
|
||||
//! 1. **What's the tail of encode latency?** The mean in a bench says
|
||||
//! nothing about how bad the worst frames are. A realtime system needs
|
||||
//! P99 below the frame period; otherwise one slow frame blows the
|
||||
//! entire deadline. We report P50/P95/P99/max on real speech and music.
|
||||
//! 2. **What's decode speed in isolation?** The MCU test bundles
|
||||
//! decode+mix+encode; this test measures decode alone so we know how
|
||||
//! cheap the receive path actually is.
|
||||
//! 3. **Peak heap per frame?** Important if LAC is embedded alongside a
|
||||
//! heavier codec (LVC video) — we want to know how much transient
|
||||
//! allocation each audio frame costs. We wrap the global allocator
|
||||
//! with a simple counter for the duration of the test.
|
||||
//!
|
||||
//! Run with `cargo test --test latency --release -- --nocapture`.
|
||||
//! Tests serialise themselves via a process-wide mutex so the
|
||||
//! tracking-allocator counters stay coherent even under `cargo test`'s
|
||||
//! default multi-threaded runner. `--test-threads=1` is no longer
|
||||
//! required for correctness but still recommended for clean,
|
||||
//! in-order console output.
|
||||
//!
|
||||
//! # Measurement stability
|
||||
//!
|
||||
//! For stable P99 numbers, pin the harness to a fixed core and disable
|
||||
//! frequency scaling before running:
|
||||
//!
|
||||
//! ```text
|
||||
//! sudo cpupower frequency-set -g performance
|
||||
//! taskset -c 0 cargo test --test latency --release -- --nocapture
|
||||
//! ```
|
||||
//!
|
||||
//! On a noisy CI runner the P99 values include scheduler jitter and can
|
||||
//! overstate real-world cost by 2-5×. The P99 hard-deadline asserts
|
||||
//! below use the frame period as the ceiling, which is still a wide
|
||||
//! safety margin (~40× headroom in steady state) so jitter alone won't
|
||||
//! flake the suite.
|
||||
|
||||
use std::alloc::{GlobalAlloc, Layout, System};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Mutex;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use hound::WavReader;
|
||||
use lac::{decode_frame, encode_frame};
|
||||
|
||||
const CORPUS_DIR: &str = "corpus";
|
||||
|
||||
// ── Tracking allocator ──────────────────────────────────────────────────────
|
||||
|
||||
/// Global allocator wrapper tracking current and peak bytes outstanding.
/// Counts are process-global; the `MEASUREMENT_LOCK` guard taken at the
/// top of `run_latency` serialises access so concurrent test threads
/// don't corrupt each other's measurements.
struct TrackingAllocator;

/// Serialises latency tests so the process-global tracking-allocator
/// counters stay coherent under multi-threaded `cargo test`. Each test
/// takes the lock at entry and holds it for the whole measurement
/// window. This is only about allocator-counter coherence; a panic
/// inside a test section will still release the mutex via unwind, so
/// the `PoisonError` path intentionally ignores poison.
static MEASUREMENT_LOCK: Mutex<()> = Mutex::new(());

/// Bytes currently allocated from the tracked allocator. Updated on
/// every alloc/dealloc. Note it is *never* zeroed: `reset_peak` only
/// snapshots its current value into `PEAK_BYTES` as the new window
/// baseline.
static CURRENT_BYTES: AtomicUsize = AtomicUsize::new(0);

/// Peak of `CURRENT_BYTES` observed since the last `reset_peak`.
static PEAK_BYTES: AtomicUsize = AtomicUsize::new(0);

/// Cumulative count of `alloc` calls since the last `reset_peak`. Tracked
/// separately from bytes because a regression can keep peak-bytes flat
/// (same sized buffers, different provenance) while multiplying the call
/// count — e.g. a refactor that replaces one reused `Vec` with a fresh
/// `Vec::new()` per frame.
static CALL_COUNT: AtomicUsize = AtomicUsize::new(0);

unsafe impl GlobalAlloc for TrackingAllocator {
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        // SAFETY: the caller's layout contract is forwarded verbatim to
        // the system allocator.
        let ptr = unsafe { System.alloc(layout) };
        if !ptr.is_null() {
            // `fetch_add` returns the *previous* value, so add the size
            // once more to get the post-allocation total.
            let new = CURRENT_BYTES.fetch_add(layout.size(), Ordering::Relaxed) + layout.size();
            // `fetch_max` updates the peak only if `new` exceeds the stored
            // value; cheap, wait-free, sufficient for single-threaded tests.
            PEAK_BYTES.fetch_max(new, Ordering::Relaxed);
            CALL_COUNT.fetch_add(1, Ordering::Relaxed);
        }
        ptr
    }

    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
        // SAFETY: per the GlobalAlloc contract, `ptr`/`layout` came from
        // a matching `alloc` on this allocator.
        unsafe { System.dealloc(ptr, layout) };
        CURRENT_BYTES.fetch_sub(layout.size(), Ordering::Relaxed);
    }
}

#[global_allocator]
static ALLOC: TrackingAllocator = TrackingAllocator;
|
||||
|
||||
/// Reset the peak to the current allocation level so the next measurement
|
||||
/// window starts fresh. Also clears the per-window allocation counter.
|
||||
/// Call once before the code under test.
|
||||
fn reset_peak() {
|
||||
PEAK_BYTES.store(CURRENT_BYTES.load(Ordering::Relaxed), Ordering::Relaxed);
|
||||
CALL_COUNT.store(0, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
/// Read the maximum bytes outstanding since the last `reset_peak`, minus
|
||||
/// the current baseline — i.e., the peak *transient* heap the code under
|
||||
/// test held. Ignores allocations that were still live at reset time
|
||||
/// (test scaffolding, pre-loaded corpus).
|
||||
fn peak_delta_since_reset() -> usize {
|
||||
let peak = PEAK_BYTES.load(Ordering::Relaxed);
|
||||
let baseline = CURRENT_BYTES.load(Ordering::Relaxed);
|
||||
peak.saturating_sub(baseline)
|
||||
}
|
||||
|
||||
/// Read the number of `alloc` calls made since the last `reset_peak`.
/// Relaxed load is sufficient: tests hold `MEASUREMENT_LOCK`, so no
/// concurrent writer races this read.
fn call_count_since_reset() -> usize {
    CALL_COUNT.load(Ordering::Relaxed)
}
|
||||
|
||||
// ── Corpus loading ──────────────────────────────────────────────────────────
|
||||
|
||||
fn corpus(name: &str) -> PathBuf {
|
||||
Path::new(CORPUS_DIR).join(name)
|
||||
}
|
||||
|
||||
fn load_mono(path: &Path) -> Option<Vec<i32>> {
|
||||
let mut reader = WavReader::open(path).ok()?;
|
||||
let spec = reader.spec();
|
||||
if spec.sample_format != hound::SampleFormat::Int
|
||||
|| spec.channels != 1
|
||||
|| spec.bits_per_sample > 24
|
||||
{
|
||||
return None;
|
||||
}
|
||||
let samples: Result<Vec<i32>, _> = reader.samples::<i32>().collect();
|
||||
samples.ok()
|
||||
}
|
||||
|
||||
/// Early-return from the enclosing test when a corpus file is absent,
/// instead of failing — the corpus is optional input the CI machine
/// may not have. Expands to an `if !exists { eprintln; return; }`
/// guard, so it only works inside a `fn` returning `()`.
macro_rules! require {
    ($path:expr) => {
        if !$path.exists() {
            eprintln!("skipping: corpus file not found: {}", $path.display());
            return;
        }
    };
}
|
||||
|
||||
// ── Latency harness ─────────────────────────────────────────────────────────
|
||||
|
||||
/// Distribution summary computed from a sorted `Vec<Duration>`.
struct Dist {
    /// Number of observations the summary was computed from.
    count: usize,
    p50: Duration,
    p95: Duration,
    p99: Duration,
    max: Duration,
    mean: Duration,
}

/// Sort `samples` and summarise the latency distribution.
///
/// Percentiles use the nearest-rank method on the sorted data:
/// `index = round((count − 1) × fraction)`.
///
/// # Panics
/// Panics on an empty input: there is no meaningful percentile of zero
/// observations, and the explicit assert gives a clearer message than
/// the out-of-bounds index (and zero-division in `mean`) the naive
/// code would otherwise hit.
fn dist_of(mut samples: Vec<Duration>) -> Dist {
    assert!(!samples.is_empty(), "dist_of requires at least one sample");
    samples.sort();
    let count = samples.len();
    // Nearest-rank percentile lookup on the sorted data.
    let p = |frac: f64| samples[((count as f64 - 1.0) * frac).round() as usize];
    let total: Duration = samples.iter().sum();
    Dist {
        count,
        p50: p(0.50),
        p95: p(0.95),
        p99: p(0.99),
        max: *samples.last().unwrap(),
        // `Duration` division takes u32; `count` fits comfortably for
        // any realistic test run.
        mean: total / count as u32,
    }
}
|
||||
|
||||
/// Render a duration as fractional microseconds, e.g. `12.3µs`.
fn fmt_us(d: Duration) -> String {
    let us = d.as_nanos() as f64 / 1000.0;
    format!("{us:.1}µs")
}
|
||||
|
||||
/// Measure encode and decode latency distributions over every
|
||||
/// `frame_size`-sample chunk in `samples`. Also reports peak per-frame
|
||||
/// heap allocation (encoder side only — the decoder footprint is
|
||||
/// measured separately in the per-phase breakdown below).
|
||||
///
|
||||
/// `sample_rate` is used to convert the frame-sample count into a
|
||||
/// real-time frame period, which gates the P99 assertion below.
|
||||
fn run_latency(name: &str, samples: &[i32], frame_size: usize, sample_rate: u32) {
    // Serialise against every other latency-test thread: the tracking
    // allocator is a single pair of atomic counters. The
    // `.unwrap_or_else` handles `PoisonError` so one failed test can't
    // wedge the rest of the suite.
    let _guard = MEASUREMENT_LOCK
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    // `chunks_exact` drops any trailing partial frame, so every
    // measured encode sees exactly `frame_size` samples.
    let frames: Vec<&[i32]> = samples.chunks_exact(frame_size).collect();
    // Warm-up: first few encodes allocate LLVM+allocator arenas that
    // don't reflect steady-state behaviour. Skip them in the measurement.
    // NOTE(review): if the input yields 32 or fewer frames, the
    // measured vectors are empty and `dist_of` will panic; all current
    // callers pass ~60 s of audio, far above that threshold.
    let warmup = 32.min(frames.len());
    for f in &frames[..warmup] {
        std::hint::black_box(encode_frame(f));
    }

    // ── Encode latency and transient heap ────────────────────────────
    let mut encode_times = Vec::with_capacity(frames.len() - warmup);
    reset_peak();
    let mut peak_encode_bytes = 0usize;
    let mut peak_encode_allocs = 0usize;
    let mut encoded_bytes_total = 0usize;
    // Encoded frames are retained so the decode phase below measures
    // decode alone, with no interleaved encode work.
    let mut encoded_frames: Vec<Vec<u8>> = Vec::with_capacity(frames.len() - warmup);
    for f in &frames[warmup..] {
        // Fresh window per frame: peaks/counters are per-frame maxima,
        // not totals across the run.
        reset_peak();
        let t = Instant::now();
        let encoded = encode_frame(f);
        encode_times.push(t.elapsed());
        encoded_bytes_total += encoded.len();
        peak_encode_bytes = peak_encode_bytes.max(peak_delta_since_reset());
        peak_encode_allocs = peak_encode_allocs.max(call_count_since_reset());
        encoded_frames.push(encoded);
    }

    // ── Decode latency and transient heap ────────────────────────────
    let mut decode_times = Vec::with_capacity(encoded_frames.len());
    let mut peak_decode_bytes = 0usize;
    let mut peak_decode_allocs = 0usize;
    for ef in &encoded_frames {
        reset_peak();
        let t = Instant::now();
        let _samples = decode_frame(ef).expect("decode");
        decode_times.push(t.elapsed());
        peak_decode_bytes = peak_decode_bytes.max(peak_delta_since_reset());
        peak_decode_allocs = peak_decode_allocs.max(call_count_since_reset());
    }

    let enc = dist_of(encode_times);
    let dec = dist_of(decode_times);
    // Frame period = frame_size / sample_rate, expressed in nanoseconds
    // as an integer so the subsequent P99 comparison is exact (no float
    // epsilon). Example: 320 samples at 16 kHz → 20_000_000 ns = 20 ms.
    let frame_period =
        Duration::from_nanos((frame_size as u64 * 1_000_000_000) / sample_rate as u64);

    eprintln!();
    eprintln!("== {name} ({frame_size}-sample frames @ {sample_rate} Hz) ==");
    eprintln!(
        " encode latency: p50={} p95={} p99={} max={} mean={}",
        fmt_us(enc.p50),
        fmt_us(enc.p95),
        fmt_us(enc.p99),
        fmt_us(enc.max),
        fmt_us(enc.mean)
    );
    eprintln!(
        " headroom at p99 vs frame period ({:.1}ms): {:.1}×",
        frame_period.as_micros() as f64 / 1000.0,
        frame_period.as_nanos() as f64 / enc.p99.as_nanos() as f64,
    );
    eprintln!(
        " decode latency: p50={} p95={} p99={} max={} mean={}",
        fmt_us(dec.p50),
        fmt_us(dec.p95),
        fmt_us(dec.p99),
        fmt_us(dec.max),
        fmt_us(dec.mean)
    );
    eprintln!(
        " peak heap / frame: encode={}B decode={}B",
        peak_encode_bytes, peak_decode_bytes
    );
    eprintln!(
        " peak allocs / frame: encode={} decode={}",
        peak_encode_allocs, peak_decode_allocs
    );
    eprintln!(
        " throughput: encoded_frames={} total_encoded_bytes={} avg_bytes/frame={:.1}",
        enc.count,
        encoded_bytes_total,
        encoded_bytes_total as f64 / enc.count as f64
    );

    // Real-time invariant: P99 per-frame cost must stay below the frame
    // period. Steady-state headroom is ~40× so a CI runner with heavy
    // scheduler jitter still passes comfortably; a 40× regression
    // (encoder bug, allocator hot-path change) trips this assert.
    assert!(
        enc.p99 < frame_period,
        "encode P99 {} exceeds frame period {} — real-time deadline missed",
        fmt_us(enc.p99),
        fmt_us(frame_period),
    );
    assert!(
        dec.p99 < frame_period,
        "decode P99 {} exceeds frame period {} — real-time deadline missed",
        fmt_us(dec.p99),
        fmt_us(frame_period),
    );
}
|
||||
|
||||
// ── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn latency_headset_speech_320() {
    // 320 samples @ 16 kHz = 20 ms frame — standard voice-chat period.
    let path = corpus("ES2002a.Headset-0.wav");
    require!(path);
    let audio = load_mono(&path).expect("load");
    // Cap to ~60 s of audio so the test doesn't dominate CI time.
    let limit = audio.len().min(16_000 * 60);
    run_latency("headset_speech", &audio[..limit], 320, 16_000);
}
|
||||
|
||||
#[test]
fn latency_headset_speech_160() {
    // 160 samples @ 16 kHz = 10 ms frame — the tighter latency mode
    // used by WebRTC and similar real-time systems.
    let path = corpus("ES2002a.Headset-0.wav");
    require!(path);
    let audio = load_mono(&path).expect("load");
    let limit = audio.len().min(16_000 * 60);
    run_latency("headset_speech_10ms", &audio[..limit], 160, 16_000);
}
|
||||
|
||||
#[test]
fn latency_headset_speech_480() {
    // 480 samples at 16 kHz = 30 ms frame. The same sample count at
    // 48 kHz is WebRTC's 10 ms full-band frame; since the codec only
    // cares about frame sample count (not sample rate) this exercises
    // the same search-grid shape a 48 kHz WebRTC deployment would hit.
    // 480 = 2^5 · 3 · 5, so partition orders 0..=5 are valid; 6 and 7
    // are not, which differs from the 2048-sample dense case.
    let path = corpus("ES2002a.Headset-0.wav");
    require!(path);
    let audio = load_mono(&path).expect("load");
    let limit = audio.len().min(16_000 * 60);
    run_latency("headset_speech_480", &audio[..limit], 480, 16_000);
}
|
||||
|
||||
#[test]
fn latency_headset_speech_prime() {
    // 503 is prime, so only `partition_order = 0` divides it — the
    // encoder skips the partition search entirely and emits a single
    // Rice partition. Covers a code path that power-of-two and
    // smooth-composite frame sizes never reach.
    let path = corpus("ES2002a.Headset-0.wav");
    require!(path);
    let audio = load_mono(&path).expect("load");
    let limit = audio.len().min(16_000 * 60);
    run_latency("headset_speech_prime503", &audio[..limit], 503, 16_000);
}
|
||||
|
||||
#[test]
fn latency_mixed_meeting_320() {
    // Mixed-headset meeting recording at the standard 20 ms frame.
    let path = corpus("ES2002a.Mix-Headset.wav");
    require!(path);
    let audio = load_mono(&path).expect("load");
    let limit = audio.len().min(16_000 * 60);
    run_latency("mixed_meeting", &audio[..limit], 320, 16_000);
}
|
||||
|
||||
#[test]
fn latency_array_speech_320() {
    // Distant mic — residuals are noisier, so encode cost per frame
    // typically rises. Confirms P99 doesn't blow up on less
    // predictable content.
    let path = corpus("ES2002a.Array1-01.wav");
    require!(path);
    let audio = load_mono(&path).expect("load");
    let limit = audio.len().min(16_000 * 60);
    run_latency("array_speech", &audio[..limit], 320, 16_000);
}
|
||||
488
tests/mcu_mix.rs
Normal file
488
tests/mcu_mix.rs
Normal file
|
|
@ -0,0 +1,488 @@
|
|||
//! End-to-end MCU server-side mix workload, on real speech audio.
|
||||
//!
|
||||
//! An **MCU** (Multipoint Control Unit) is a conferencing server that
|
||||
//! decodes every participant's incoming stream, mixes them in PCM, and
|
||||
//! re-encodes a per-receiver output. Contrast with an **SFU** (Selective
|
||||
//! Forwarding Unit) which forwards encoded streams byte-for-byte with no
|
||||
//! decode — lower CPU, higher bandwidth, no mix.
|
||||
//!
|
||||
//! This test simulates the MCU hot loop (decode → mix → encode) and lets
|
||||
//! us answer three concrete questions with real numbers:
|
||||
//!
|
||||
//! 1. **How many concurrent meetings can one CPU core handle?** Reported
|
||||
//! as the realtime ratio — "1000 ms of audio processed in 20 ms wall
|
||||
//! clock" means one core can handle 50 concurrent meetings at that
|
||||
//! configuration.
|
||||
//! 2. **What fraction of MCU time is actually encode?** Breaks the loop
|
||||
//! into decode, mix, and encode phases so a later optimisation can
|
||||
//! target the real bottleneck rather than the suspected one.
|
||||
//! 3. **How much bandwidth does server-side mix save over pure SFU
|
||||
//! fanout?** Compares the MCU's outgoing byte total (one stream per
|
||||
//! receiver, each a leave-one-out mix) against what an SFU would send
|
||||
//! (`P × (P − 1)` stream copies across the meeting).
|
||||
//!
|
||||
//! The same test is the natural baseline for future codec work:
|
||||
//!
|
||||
//! - **Q15 coefficient-shift fix**: should reduce encoded byte totals on
|
||||
//! bass-heavy content (music tests). Compare `bytes_out_mix` before and
|
||||
//! after.
|
||||
//! - **Encoder search optimisation**: should reduce `encode_ns` without
|
||||
//! hurting `bytes_out_mix`. Compare the phase breakdown before and
|
||||
//! after.
|
||||
//!
|
||||
//! Run with `cargo test --test mcu_mix --release -- --nocapture`.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::Instant;
|
||||
|
||||
use hound::WavReader;
|
||||
use lac::{decode_frame, encode_frame};
|
||||
|
||||
const CORPUS_DIR: &str = "corpus";
|
||||
|
||||
/// 20 ms frame at 16 kHz — standard voice-chat frame length. Divides
|
||||
/// cleanly by every partition order so the encoder search stays on the
|
||||
/// dense path.
|
||||
const FRAME_SIZE: usize = 320;
|
||||
|
||||
/// Cap wall-clock runtime by limiting how many frames we actually
|
||||
/// process. At 20 ms/frame, 1500 frames is 30 s of audio per stream —
|
||||
/// enough to average out transient behaviour without making the test take
|
||||
/// minutes in CI.
|
||||
const MAX_FRAMES: usize = 1500;
|
||||
|
||||
// ── WAV loading ─────────────────────────────────────────────────────────────
|
||||
|
||||
fn load_mono(path: &Path) -> Option<Vec<i32>> {
|
||||
let mut reader = WavReader::open(path).ok()?;
|
||||
let spec = reader.spec();
|
||||
if spec.sample_format != hound::SampleFormat::Int
|
||||
|| spec.channels != 1
|
||||
|| spec.bits_per_sample > 24
|
||||
{
|
||||
return None;
|
||||
}
|
||||
let samples: Result<Vec<i32>, _> = reader.samples::<i32>().collect();
|
||||
samples.ok()
|
||||
}
|
||||
|
||||
fn corpus(name: &str) -> PathBuf {
|
||||
Path::new(CORPUS_DIR).join(name)
|
||||
}
|
||||
|
||||
/// Early-return from the enclosing test when a corpus file is absent,
/// instead of failing — the corpus is optional input the CI machine
/// may not have. Expands to an `if !exists { eprintln; return; }`
/// guard, so it only works inside a `fn` returning `()`.
macro_rules! require {
    ($path:expr) => {
        if !$path.exists() {
            eprintln!("skipping: corpus file not found: {}", $path.display());
            return;
        }
    };
}
|
||||
|
||||
// ── MCU mix pipeline ────────────────────────────────────────────────────────
|
||||
|
||||
/// Result of one MCU simulation run.
///
/// All `*_ns` fields are wall-clock accumulations over every frame
/// tick; derived rates (realtime ratio, audio duration) live on the
/// `impl` block. `Default` zeroes everything, which `simulate_meeting`
/// uses via struct-update syntax.
#[derive(Default)]
struct McuStats {
    /// Number of participants in the simulated meeting.
    participants: usize,
    /// Number of frames processed per participant (each frame is `FRAME_SIZE` samples).
    frames: usize,
    /// Total nanoseconds spent in `decode_frame` across all decodes.
    decode_ns: u128,
    /// Total nanoseconds spent doing the leave-one-out mix additions.
    mix_ns: u128,
    /// Total nanoseconds spent in `encode_frame` across all output encodes.
    encode_ns: u128,
    /// Bytes sent *into* the MCU by participants (pre-encoded frames).
    bytes_in: usize,
    /// Bytes sent *out* of the MCU (one stream per receiver, each a
    /// leave-one-out mix).
    bytes_out_mix: usize,
    /// Bytes an SFU would send out for the same meeting: `P × (P − 1)`
    /// stream copies, since every participant receives every other
    /// participant's stream verbatim with no mix.
    bytes_out_fanout: usize,
}
|
||||
|
||||
impl McuStats {
|
||||
fn total_ns(&self) -> u128 {
|
||||
self.decode_ns + self.mix_ns + self.encode_ns
|
||||
}
|
||||
|
||||
/// Audio duration processed, in milliseconds.
|
||||
fn audio_ms(&self) -> f64 {
|
||||
// FRAME_SIZE samples at 16 kHz = (FRAME_SIZE / 16000) seconds per frame.
|
||||
(self.frames as f64) * (FRAME_SIZE as f64) / 16_000.0 * 1000.0
|
||||
}
|
||||
|
||||
/// Realtime multiplier — audio ms per wall-clock ms. A value of 50
|
||||
/// means this workload runs 50× faster than realtime, i.e., one core
|
||||
/// handles 50 concurrent meetings of this configuration.
|
||||
fn realtime_ratio(&self) -> f64 {
|
||||
let wall_ms = (self.total_ns() as f64) / 1_000_000.0;
|
||||
self.audio_ms() / wall_ms
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the decode → mix → encode loop for a simulated MCU meeting with
|
||||
/// the given pre-encoded participant streams. Returns timing and byte
|
||||
/// accounting for a single run.
|
||||
fn simulate_meeting(encoded_streams: &[Vec<Vec<u8>>]) -> McuStats {
    let p = encoded_streams.len();
    assert!(p >= 2, "need at least 2 participants for a meeting");
    // Simulate only as many ticks as the shortest stream provides, so
    // indexing `encoded_streams[i][frame_idx]` below never goes out of
    // bounds, then cap at MAX_FRAMES to bound wall-clock time.
    let n_frames = encoded_streams.iter().map(|s| s.len()).min().unwrap();
    let n_frames = n_frames.min(MAX_FRAMES);

    let mut stats = McuStats {
        participants: p,
        frames: n_frames,
        ..Default::default()
    };

    // Ingress bytes: each participant sends all its encoded frames to the MCU.
    for s in encoded_streams {
        stats.bytes_in += s[..n_frames].iter().map(|f| f.len()).sum::<usize>();
    }

    // Reusable scratch buffer for leave-one-out mixes. The mix for
    // receiver `r` excludes participant `r`'s own voice so they don't
    // hear themselves delayed.
    let mut mix = vec![0i32; FRAME_SIZE];

    // `frame_idx` is a tick index that selects the same position across
    // every participant's encoded-frame vector. Converting this to
    // `iter().zip(...)` across variable-arity participants complicates
    // the loop body for no runtime benefit, so the integer-index form
    // stays; silence the `needless_range_loop` lint explicitly.
    #[allow(clippy::needless_range_loop)]
    for frame_idx in 0..n_frames {
        // ── Phase 1: decode every incoming stream for this frame tick ──
        let t0 = Instant::now();
        let decoded: Vec<Vec<i32>> = (0..p)
            .map(|i| decode_frame(&encoded_streams[i][frame_idx]).expect("decode"))
            .collect();
        stats.decode_ns += t0.elapsed().as_nanos();

        // ── Phase 2 + 3: for each receiver, build its leave-one-out mix and encode ──
        for receiver in 0..p {
            // Mix (i32 additions). Wrapping add is appropriate — i32
            // sample range is ±2²³-1 and summing up to P-1 ≤ 31 streams
            // keeps the running total within i32 (max 2^31-1 / 2^23 ≈ 256
            // summands before saturation).
            // NOTE(review): the `zip` silently truncates to the shorter
            // of `mix` (FRAME_SIZE) and the decoded frame — assumes
            // every decoded frame is FRAME_SIZE samples; confirm against
            // the pre-encode chunking in `run_inner`.
            let t_mix = Instant::now();
            mix.fill(0);
            for (i, stream) in decoded.iter().enumerate() {
                if i == receiver {
                    continue;
                }
                for (m, &s) in mix.iter_mut().zip(stream.iter()) {
                    *m = m.wrapping_add(s);
                }
            }
            stats.mix_ns += t_mix.elapsed().as_nanos();

            // Clamp the mix to the 24-bit input range LAC guarantees. In
            // practice, summing 4 typical speech streams almost never
            // exceeds the range (human voice peaks well under full-scale,
            // and constructive superposition of uncorrelated speakers is
            // rare), but clamp anyway to stay inside the codec contract.
            for m in mix.iter_mut() {
                *m = (*m).clamp(-(1 << 23) + 1, (1 << 23) - 1);
            }

            let t_enc = Instant::now();
            let encoded = encode_frame(&mix);
            stats.encode_ns += t_enc.elapsed().as_nanos();
            stats.bytes_out_mix += encoded.len();
        }

        // Fanout byte accounting: for each frame, each participant's
        // stream is copied to the P-1 other receivers, with no re-encode.
        // That's P × (P-1) copies, each the original encoded frame size.
        for stream in encoded_streams.iter() {
            stats.bytes_out_fanout += stream[frame_idx].len() * (p - 1);
        }
    }

    stats
}
|
||||
|
||||
// ── Reporting ───────────────────────────────────────────────────────────────
|
||||
|
||||
fn report(name: &str, s: &McuStats) {
|
||||
let total_us = (s.total_ns() as f64) / 1000.0;
|
||||
let decode_pct = 100.0 * (s.decode_ns as f64) / (s.total_ns() as f64);
|
||||
let mix_pct = 100.0 * (s.mix_ns as f64) / (s.total_ns() as f64);
|
||||
let encode_pct = 100.0 * (s.encode_ns as f64) / (s.total_ns() as f64);
|
||||
let bandwidth_ratio = s.bytes_out_mix as f64 / s.bytes_out_fanout as f64;
|
||||
|
||||
eprintln!();
|
||||
eprintln!("== {name} ==");
|
||||
eprintln!(
|
||||
" {} participants × {} frames = {:.1} s of audio per stream",
|
||||
s.participants,
|
||||
s.frames,
|
||||
s.audio_ms() / 1000.0
|
||||
);
|
||||
eprintln!(
|
||||
" wall: {:.1} ms ({:.1}× realtime → {:.0} concurrent meetings/core)",
|
||||
total_us / 1000.0,
|
||||
s.realtime_ratio(),
|
||||
s.realtime_ratio(),
|
||||
);
|
||||
eprintln!(
|
||||
" phase: decode {:>5.1}% mix {:>5.1}% encode {:>5.1}%",
|
||||
decode_pct, mix_pct, encode_pct
|
||||
);
|
||||
eprintln!(
|
||||
" bytes: in {} / out_mix {} / out_fanout {} / mix_vs_fanout {:.2}",
|
||||
s.bytes_in, s.bytes_out_mix, s.bytes_out_fanout, bandwidth_ratio
|
||||
);
|
||||
}
|
||||
|
||||
// ── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Pre-encode all participant streams into `FRAME_SIZE`-sample frames and
|
||||
/// hand the result to `simulate_meeting`. Splitting the setup out of the
|
||||
/// simulation keeps the reported `encode_ns` to the *server-side* encode
|
||||
/// work only — participant-side encoding is a separate machine and
|
||||
/// shouldn't contaminate the MCU measurement.
|
||||
fn run(name: &str, stream_names: &[&str]) {
    // Continuous activity is the pessimistic everyone-talking case;
    // dominant-speaker scenarios call `run_inner` directly.
    run_inner(name, stream_names, Activity::Continuous);
}
|
||||
|
||||
/// Turn-taking activity pattern. `Continuous` is every participant
|
||||
/// producing audio every frame (the pessimistic load-bearing case for
|
||||
/// MCU mix/encode cost). `DominantSpeaker { window_frames }` zeroes
|
||||
/// every stream except a single rotating speaker for each
|
||||
/// `window_frames`-long block. This is the realistic meeting behaviour
|
||||
/// — one person talks at a time — under which the MCU compute cost
|
||||
/// drops substantially (most mixes collapse to one voice + silence) but
|
||||
/// the relative byte savings vs SFU shrink (silence forwards for
|
||||
/// essentially free on a pure SFU, so the SFU egress number also falls).
|
||||
enum Activity {
    // Every participant produces audio in every frame.
    Continuous,
    // One rotating speaker per `window_frames`-long block; every other
    // stream is zeroed for that block.
    DominantSpeaker { window_frames: usize },
}
|
||||
|
||||
fn run_inner(name: &str, stream_names: &[&str], activity: Activity) {
    // Skip (not fail) when any corpus file is missing — the corpus is
    // an optional download that CI may not have.
    let paths: Vec<PathBuf> = stream_names.iter().map(|n| corpus(n)).collect();
    for p in &paths {
        if !p.exists() {
            eprintln!("skipping {name}: missing {}", p.display());
            return;
        }
    }

    // Cap each stream to the number of samples we'll actually simulate,
    // so we don't spend time pre-encoding the remaining 30+ minutes of
    // audio that `simulate_meeting` will never touch. The AMI files are
    // multi-minute recordings; MAX_FRAMES = 1500 @ 20 ms/frame = 30 s.
    let max_samples = MAX_FRAMES * FRAME_SIZE;
    let mut streams: Vec<Vec<i32>> = paths
        .iter()
        .map(|p| {
            let mut s = load_mono(p).expect("load_mono");
            s.truncate(max_samples);
            s
        })
        .collect();

    if let Activity::DominantSpeaker { window_frames } = activity {
        apply_dominant_speaker(&mut streams, window_frames);
    }

    // Participant-side pre-encode, deliberately outside the timed
    // simulation so `encode_ns` reflects server-side work only.
    // NOTE(review): `chunks` (not `chunks_exact`) also encodes a
    // trailing partial frame when the stream length isn't a multiple of
    // FRAME_SIZE — confirm `simulate_meeting`'s FRAME_SIZE mix buffer
    // tolerates that short final frame.
    let encoded_streams: Vec<Vec<Vec<u8>>> = streams
        .iter()
        .map(|s| s.chunks(FRAME_SIZE).map(encode_frame).collect())
        .collect();

    let stats = simulate_meeting(&encoded_streams);
    report(name, &stats);

    // Sanity assertion: MCU egress must not exceed SFU egress. Equality
    // holds at P=2 (the leave-one-out mix is just the other participant's
    // voice, which encodes to the same size as forwarding their original
    // stream); strict inequality holds for P ≥ 3 because fanout's byte
    // count grows as `P × (P − 1)` while mix grows as `P`.
    //
    // For dominant-speaker the invariant still holds on the egress side
    // (MCU produces one stream per receiver; SFU forwards P-1 streams
    // per receiver) but the absolute margin narrows because silence
    // compresses to ~1 bit per sample and SFU's egress of N-1 silent
    // streams is essentially free.
    assert!(
        stats.bytes_out_mix <= stats.bytes_out_fanout,
        "MCU mix produced more bytes than SFU fanout: mix={} fanout={}",
        stats.bytes_out_mix,
        stats.bytes_out_fanout
    );
}
|
||||
|
||||
/// Zero every stream except a single rotating speaker per
|
||||
/// `window_frames`-long block. Assumes every stream in `streams` has the
|
||||
/// same length and `FRAME_SIZE` divides it; the runtime calls this only
|
||||
/// with post-truncate stream slices where both hold.
|
||||
///
|
||||
/// The rotation uses the block index modulo the participant count, so
|
||||
/// each participant gets roughly `total_blocks / participants` turns
|
||||
/// spread across the measurement window. At the default
|
||||
/// `window_frames = 100` (2 s per turn at 20 ms frames) and
|
||||
/// `MAX_FRAMES = 1500`, P=8 gives each participant ~1.9 turns — enough
|
||||
/// for the phase breakdown to average over speaker transitions.
|
||||
fn apply_dominant_speaker(streams: &mut [Vec<i32>], window_frames: usize) {
|
||||
let n = streams.len();
|
||||
if n == 0 {
|
||||
return;
|
||||
}
|
||||
// AMI files aren't guaranteed to be identical length even after the
|
||||
// common `truncate(max_samples)` step — a short recording stays
|
||||
// short. Use the shortest stream as the rotation horizon so the
|
||||
// per-stream slice below never runs past a stream's own end.
|
||||
let common_len = streams.iter().map(|s| s.len()).min().unwrap_or(0);
|
||||
let total_frames = common_len / FRAME_SIZE;
|
||||
let window_samples = window_frames * FRAME_SIZE;
|
||||
for block in 0..total_frames.div_ceil(window_frames) {
|
||||
let active = block % n;
|
||||
let start = block * window_samples;
|
||||
let end = (start + window_samples).min(common_len);
|
||||
for (i, s) in streams.iter_mut().enumerate() {
|
||||
if i == active {
|
||||
continue;
|
||||
}
|
||||
for sample in &mut s[start..end] {
|
||||
*sample = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn mcu_mix_1on1_voice() {
    // P=2: the boundary case where MCU and SFU egress coincide — the
    // leave-one-out mix is exactly the other participant's stream.
    require!(corpus("ES2002a.Headset-0.wav"));
    let participants = ["ES2002a.Headset-0.wav", "ES2002a.Headset-1.wav"];
    run("mcu_mix_1on1_voice", &participants);
}
|
||||
|
||||
#[test]
fn mcu_mix_3people_voice() {
    // Smallest meeting where the strict MCU < SFU egress inequality
    // applies (fanout grows as P × (P − 1), mix as P).
    require!(corpus("ES2002a.Headset-0.wav"));
    let participants = [
        "ES2002a.Headset-0.wav",
        "ES2002a.Headset-1.wav",
        "ES2002a.Headset-2.wav",
    ];
    run("mcu_mix_3people_voice", &participants);
}
|
||||
|
||||
#[test]
fn mcu_mix_5people_voice() {
    // Mid-size meeting: four headset channels plus one lapel channel.
    require!(corpus("ES2002a.Headset-0.wav"));
    let participants = [
        "ES2002a.Headset-0.wav",
        "ES2002a.Headset-1.wav",
        "ES2002a.Headset-2.wav",
        "ES2002a.Headset-3.wav",
        "ES2002a.Lapel-0.wav",
    ];
    run("mcu_mix_5people_voice", &participants);
}
|
||||
|
||||
#[test]
fn mcu_mix_8people_voice() {
    // Covers the cross-over where mix bandwidth savings become
    // dramatic — fanout grows quadratically, server-mix stays linear.
    require!(corpus("ES2002a.Headset-0.wav"));
    let participants = [
        "ES2002a.Headset-0.wav",
        "ES2002a.Headset-1.wav",
        "ES2002a.Headset-2.wav",
        "ES2002a.Headset-3.wav",
        "ES2002a.Lapel-0.wav",
        "ES2002a.Lapel-1.wav",
        "ES2002a.Lapel-2.wav",
        "ES2002a.Lapel-3.wav",
    ];
    run("mcu_mix_8people_voice", &participants);
}
|
||||
|
||||
#[test]
fn mcu_mix_8people_dominant_speaker() {
    // Same participant set as `mcu_mix_8people_voice`, but only one
    // stream carries audio at any moment, rotating every 2 seconds
    // (100 frames @ 20 ms each). This is the realistic meeting shape:
    // one person talks while N-1 listen. The continuous-speech test
    // above is the pessimistic workload (everyone talks at once,
    // inflating MCU mix cost and SFU fanout byte count); this test
    // shows what the server actually experiences in production
    // traffic.
    //
    // Expected deltas vs the continuous variant:
    // - Decode phase: wall-clock unchanged (silent frames decode
    //   about as cheap as voice frames — the codec still walks every
    //   residual).
    // - Mix phase: still O(P²) additions per frame, unchanged.
    // - Encode phase: drops sharply. Most mixes collapse to "one
    //   voice + silence + silence + …", which encodes roughly like a
    //   single voice stream instead of a P-1-way sum.
    // - Egress ratio (MCU vs SFU): narrows. The SFU forwards (P-1)
    //   silent streams per receiver nearly for free, so its egress
    //   baseline drops faster than MCU egress does.
    require!(corpus("ES2002a.Headset-0.wav"));
    let participants = [
        "ES2002a.Headset-0.wav",
        "ES2002a.Headset-1.wav",
        "ES2002a.Headset-2.wav",
        "ES2002a.Headset-3.wav",
        "ES2002a.Lapel-0.wav",
        "ES2002a.Lapel-1.wav",
        "ES2002a.Lapel-2.wav",
        "ES2002a.Lapel-3.wav",
    ];
    run_inner(
        "mcu_mix_8people_dominant_speaker",
        &participants,
        Activity::DominantSpeaker { window_frames: 100 },
    );
}
|
||||
|
||||
#[test]
fn mcu_mix_16people_voice() {
    // Past the typical full-mesh MCU sweet spot but still plausible
    // for a mid-size meeting before hierarchical routing kicks in.
    // The mix phase does 16 × 15 = 240 sample-wise additions per
    // frame vs 8 × 7 = 56 at P=8, so the quadratic mix term starts
    // showing up in the phase breakdown. Larger real meetings
    // (P > 20) are typically routed via dominant-speaker selection
    // rather than full-mesh mix, so this is a reasonable ceiling for
    // the full-mesh data point.
    require!(corpus("ES2002a.Headset-0.wav"));
    let participants = [
        "ES2002a.Headset-0.wav",
        "ES2002a.Headset-1.wav",
        "ES2002a.Headset-2.wav",
        "ES2002a.Headset-3.wav",
        "ES2002a.Lapel-0.wav",
        "ES2002a.Lapel-1.wav",
        "ES2002a.Lapel-2.wav",
        "ES2002a.Lapel-3.wav",
        "ES2002a.Array1-01.wav",
        "ES2002a.Array1-02.wav",
        "ES2002a.Array1-03.wav",
        "ES2002a.Array1-04.wav",
        "ES2002a.Array1-05.wav",
        "ES2002a.Array1-06.wav",
        "ES2002a.Array1-07.wav",
        "ES2002a.Array1-08.wav",
    ];
    run("mcu_mix_16people_voice", &participants);
}
|
||||
290
tests/synthetic.rs
Normal file
290
tests/synthetic.rs
Normal file
|
|
@ -0,0 +1,290 @@
|
|||
//! Synthetic round-trip tests covering bit depths and pathological
|
||||
//! content the real-audio corpus doesn't exercise.
|
||||
//!
|
||||
//! # Why these exist
|
||||
//!
|
||||
//! The `corpus.rs` suite is 16-bit PCM only (AMI + music). The spec
|
||||
//! permits any source in `|sample| ≤ 2²³ − 1`, so 8-bit, 20-bit, and
|
||||
//! 24-bit inputs are supported but untested by corpus data alone.
|
||||
//! Likewise, real audio rarely exhibits exact DC, exact full-scale,
|
||||
//! pure Nyquist, or cleanly bounded white noise — conditions the
|
||||
//! numerical-stability paths inside the encoder are expected to
|
||||
//! handle but which deserve explicit regression fences.
|
||||
//!
|
||||
//! Everything here is deterministic and integer-only: a 32-bit LFSR
|
||||
//! drives the noise cases, and fixed constants drive the pathological
|
||||
//! ones. No corpus files needed; the tests always run in CI.
|
||||
//!
|
||||
//! Frame size is 1024 (power-of-two so every partition order is
|
||||
//! available to the encoder search). Every test round-trips through
|
||||
//! `encode_frame`/`decode_frame` and asserts bit-exact recovery — the
|
||||
//! only acceptable outcome for a lossless codec.
|
||||
|
||||
use lac::{decode_frame, encode_frame};
|
||||
|
||||
const FRAME_SIZE: usize = 1024;
|
||||
|
||||
// ── LFSR noise generator ────────────────────────────────────────────────────
|
||||
|
||||
/// Deterministic pseudo-random `i32` noise from a 32-bit Galois LFSR.
///
/// Values fall in `[-(2^{bit_depth-1} − 1), 2^{bit_depth-1} − 1]`:
/// the arithmetic right shift alone would also admit the extra
/// negative value `-2^{bit_depth-1}` (from `i32::MIN >> shift`), but
/// LAC's input contract (spec §1) excludes that value, so it is
/// clamped away. A fixed seed per call keeps tests reproducible
/// across runs and platforms.
fn lfsr_noise(n: usize, bit_depth: u8, seed: u32) -> Vec<i32> {
    assert!((1..=24).contains(&bit_depth));
    // A zero state would lock the LFSR at zero forever, so substitute
    // an arbitrary non-zero default.
    let mut state = if seed == 0 { 0xACE1_ACE1 } else { seed };
    let shift = 32 - u32::from(bit_depth);
    // Symmetric contract bound for this width: ±(2^(bit_depth-1) − 1).
    // At bit_depth=24 this matches LAC's input contract exactly; at
    // narrower depths it matches the symmetric-range PCM convention.
    let bound: i32 = (1i32 << (bit_depth - 1)) - 1;
    let mut out = Vec::with_capacity(n);
    for _ in 0..n {
        // Maximal-length 32-bit Galois step (tap mask 0x8020_0003);
        // the 2^32 − 1 period dwarfs any frame size this suite uses.
        let feedback = state & 1 != 0;
        state >>= 1;
        if feedback {
            state ^= 0x8020_0003;
        }
        // Sign-extend via `as i32`, arithmetic-shift down to the
        // requested bit depth, then clamp the asymmetric lower edge
        // up to the symmetric bound.
        out.push(((state as i32) >> shift).max(-bound));
    }
    out
}
|
||||
|
||||
/// Encode every `FRAME_SIZE`-sample chunk of `samples`, decode, and
|
||||
/// assert exact recovery. Returns `(raw_bytes, encoded_bytes)` under
|
||||
/// the assumption that `bytes_per_sample` reflects the *source* PCM
|
||||
/// width the caller originally packed the signal into — so the
|
||||
/// reported ratio is comparable to what a user would measure when
|
||||
/// running LAC against a file of that depth.
|
||||
fn roundtrip(samples: &[i32], bytes_per_sample: usize) -> (usize, usize) {
|
||||
assert!(!samples.is_empty());
|
||||
let mut raw = 0usize;
|
||||
let mut encoded_total = 0usize;
|
||||
for chunk in samples.chunks(FRAME_SIZE) {
|
||||
let encoded = encode_frame(chunk);
|
||||
let decoded = decode_frame(&encoded).expect("decode_frame rejected its own output");
|
||||
assert_eq!(
|
||||
decoded,
|
||||
chunk,
|
||||
"round-trip mismatch on {}-sample frame",
|
||||
chunk.len()
|
||||
);
|
||||
raw += chunk.len() * bytes_per_sample;
|
||||
encoded_total += encoded.len();
|
||||
}
|
||||
(raw, encoded_total)
|
||||
}
|
||||
|
||||
// ── Bit-depth coverage ──────────────────────────────────────────────────────
|
||||
//
|
||||
// The codec's input contract is `|sample| ≤ 2^23 − 1`, but the spec
|
||||
// emphasises that narrower sources (8/16/20-bit) "compress at the bit
|
||||
// cost of their actual values, not a 24-bit ceiling." These tests
|
||||
// verify that claim holds — round-trip is bit-exact at every width, and
|
||||
// the compressed size stays proportional to the source range, not
|
||||
// inflated to a 24-bit ceiling.
|
||||
|
||||
#[test]
fn roundtrip_8bit_noise() {
    // 8-bit PCM, samples in [-128, 127] — the narrowest format LAC's
    // spec mentions explicitly. Residuals are tiny, so the Rice
    // k-selection should land at very low k (often 0-2).
    let samples = lfsr_noise(4 * FRAME_SIZE, 8, 0x8ACE);
    let (raw, encoded) = roundtrip(&samples, 1);
    let ratio = encoded as f64 / raw as f64;
    eprintln!(
        "roundtrip_8bit_noise raw={} encoded={} ratio={:.3}",
        raw, encoded, ratio,
    );
    // White noise at 8-bit is incompressible in principle — LPC
    // cannot predict i.i.d. values, so Rice coding essentially passes
    // the samples through. Expect a ratio near 1.0; the 1.5× ceiling
    // absorbs fixed-header + per-partition-k overhead at small frames.
    assert!(
        encoded < raw * 3 / 2,
        "8-bit noise inflated by more than 50% (encoded={encoded}, raw={raw})"
    );
}
|
||||
|
||||
#[test]
fn roundtrip_16bit_noise() {
    // 16-bit white noise: same incompressibility reasoning as the
    // 8-bit case.
    let samples = lfsr_noise(4 * FRAME_SIZE, 16, 0x16AC);
    let (raw, encoded) = roundtrip(&samples, 2);
    let ratio = encoded as f64 / raw as f64;
    eprintln!(
        "roundtrip_16bit_noise raw={} encoded={} ratio={:.3}",
        raw, encoded, ratio,
    );
    // Header overhead is proportionally smaller at 16-bit, so the
    // ceiling can be tighter (1.1×).
    assert!(
        encoded < raw * 11 / 10,
        "16-bit noise inflated by more than 10% (encoded={encoded}, raw={raw})"
    );
}
|
||||
|
||||
#[test]
fn roundtrip_20bit_noise() {
    // 20-bit PCM: studio-mastered material. Residual range is wider,
    // so Rice k ends up in the middle of its domain (~18-19).
    let samples = lfsr_noise(4 * FRAME_SIZE, 20, 0x20AC);
    let (raw, encoded) = roundtrip(&samples, 3);
    let ratio = encoded as f64 / raw as f64;
    eprintln!(
        "roundtrip_20bit_noise raw={} encoded={} ratio={:.3}",
        raw, encoded, ratio,
    );
    // 3 bytes packs 24 bits for a 20-bit source, so a ratio below
    // ~1.0 implies the codec is honouring the source width rather
    // than charging 24-bit-ceiling rates.
    assert!(
        encoded < raw,
        "20-bit noise inflated past raw size (encoded={encoded}, raw={raw})"
    );
}
|
||||
|
||||
#[test]
fn roundtrip_24bit_noise() {
    // Full-width 24-bit noise — the widest input the contract allows.
    let samples = lfsr_noise(4 * FRAME_SIZE, 24, 0x24AC);
    let (raw, encoded) = roundtrip(&samples, 3);
    let ratio = encoded as f64 / raw as f64;
    eprintln!(
        "roundtrip_24bit_noise raw={} encoded={} ratio={:.3}",
        raw, encoded, ratio,
    );
    assert!(
        encoded < raw * 11 / 10,
        "24-bit noise inflated by more than 10% (encoded={encoded}, raw={raw})"
    );
}
|
||||
|
||||
#[test]
fn roundtrip_24bit_full_scale() {
    // Every sample pinned to the 24-bit ceiling — the autocorrelation
    // accumulator's worst case: `R[0] = N · (2^23 − 1)^2 ≈ 2^46` for
    // a 1024-sample frame. Comfortably inside i64, but worth a
    // regression fence to catch a future narrowing to i32.
    let ceiling = (1 << 23) - 1;
    let samples = vec![ceiling; 4 * FRAME_SIZE];
    let (_raw, encoded) = roundtrip(&samples, 3);
    eprintln!("roundtrip_24bit_full_scale encoded={}", encoded);
}
|
||||
|
||||
// ── Pathological content ────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn roundtrip_all_zeros() {
    // Degenerate case called out by the spec: prediction_order MUST
    // be 0 because Levinson-Durbin is undefined at R[0] = 0. This is
    // a regression fence on the encoder's order-0 short-circuit.
    let samples = vec![0i32; 4 * FRAME_SIZE];
    let (raw, encoded) = roundtrip(&samples, 2);
    let ratio = encoded as f64 / raw as f64;
    eprintln!(
        "roundtrip_all_zeros raw={} encoded={} ratio={:.3}",
        raw, encoded, ratio,
    );
    // All-zero frames compress to ~header + one bit per sample (k=0
    // unary terminator). At 1024-sample frames the fixed 7-byte
    // header is still a visible fraction of the output. Measured
    // ratio is ~0.066; the 0.15 ceiling absorbs header-overhead
    // variance at other frame sizes and keeps a ~2× regression budget.
    assert!(
        encoded < raw * 3 / 20,
        "all-zero frame compressed poorly (encoded={encoded}, raw={raw})"
    );
}
|
||||
|
||||
#[test]
fn roundtrip_dc_offset() {
    // Constant non-zero sample — `R[0] > 0` but all autocorrelation
    // lags are equal, so the LPC model captures the signal perfectly
    // with order 1 (coefficient = 1.0). Residuals are zero after the
    // warm-up sample.
    let samples = vec![12_345i32; 4 * FRAME_SIZE];
    let (raw, encoded) = roundtrip(&samples, 2);
    let ratio = encoded as f64 / raw as f64;
    eprintln!(
        "roundtrip_dc_offset raw={} encoded={} ratio={:.3}",
        raw, encoded, ratio,
    );
    // Measured ratio is ~0.097: header + one big warm-up residual for
    // the DC level + unary-zero tail. The 0.20 ceiling leaves ~2×
    // regression headroom without flaking on encoder-tuning changes
    // that shift the warm-up residual's Rice k by one.
    assert!(
        encoded < raw / 5,
        "DC-offset frame compressed poorly (encoded={encoded}, raw={raw})"
    );
}
|
||||
|
||||
#[test]
fn roundtrip_nyquist_square() {
    // Pure Nyquist: alternating +A, −A, +A, −A. An order-1 predictor
    // with coefficient −1 would give zero residuals, but the
    // encoder's sparse LPC grid starts at order 2 and the
    // fixed-predictor post-pass ships FLAC-style orders 1-4 whose
    // coefficients do not include the `a = −1` Nyquist match — so
    // this signal is structurally hard for LAC despite its
    // regularity; it compresses only modestly (~52% measured).
    //
    // Kept as a regression fence: a future encoder that extends the
    // grid or adds a Nyquist-aware fixed predictor would dramatically
    // improve this ratio, and the ceiling here shouldn't fight that;
    // meanwhile a regression *worse* than ~60% is real.
    let amp = 1_000_000i32;
    let samples: Vec<i32> = [amp, -amp]
        .iter()
        .copied()
        .cycle()
        .take(4 * FRAME_SIZE)
        .collect();
    let (raw, encoded) = roundtrip(&samples, 3);
    let ratio = encoded as f64 / raw as f64;
    eprintln!(
        "roundtrip_nyquist_square raw={} encoded={} ratio={:.3}",
        raw, encoded, ratio,
    );
    assert!(
        encoded < raw * 3 / 5,
        "Nyquist square compressed poorly (encoded={encoded}, raw={raw})"
    );
}
|
||||
|
||||
#[test]
fn roundtrip_silence_with_click() {
    // Zero everywhere except a single full-scale impulse partway in.
    // One residual is enormous (effectively the click amplitude
    // itself, since its predecessors are zero) while every other
    // residual is zero; the Rice k-search must pick a k that doesn't
    // over-serve the impulse at the cost of the silence.
    let click = (1 << 22) - 1;
    let mut samples = vec![0i32; 4 * FRAME_SIZE];
    samples[FRAME_SIZE / 2] = click;
    let (_raw, encoded) = roundtrip(&samples, 2);
    eprintln!("roundtrip_silence_with_click encoded={}", encoded);
}
|
||||
|
||||
#[test]
fn roundtrip_prime_frame_size() {
    // A prime frame length (509, the largest prime ≤ 512) forces
    // `partition_order = 0`: the Rice bitstream has a single
    // partition and the encoder's partition search is skipped
    // entirely. Exercises the single-partition path independently of
    // the corpus tests, which all use power-of-two frame sizes.
    let samples = lfsr_noise(509, 16, 0x509D);
    let decoded = decode_frame(&encode_frame(&samples)).expect("decode");
    assert_eq!(decoded, samples, "prime-length frame round-trip mismatch");
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue