initial commit

Signed-off-by: Kamal Tufekcic <kamal@lo.sh>
Kamal Tufekcic 2026-04-23 14:58:32 +03:00
commit 7862cb1d9d
2884 changed files with 16797 additions and 0 deletions

tests/mcu_mix.rs

@@ -0,0 +1,488 @@
//! End-to-end MCU server-side mix workload, on real speech audio.
//!
//! An **MCU** (Multipoint Control Unit) is a conferencing server that
//! decodes every participant's incoming stream, mixes them in PCM, and
//! re-encodes a per-receiver output. Contrast with an **SFU** (Selective
//! Forwarding Unit) which forwards encoded streams byte-for-byte with no
//! decode — lower CPU, higher bandwidth, no mix.
//!
//! This test simulates the MCU hot loop (decode → mix → encode) and lets
//! us answer three concrete questions with real numbers:
//!
//! 1. **How many concurrent meetings can one CPU core handle?** Reported
//! as the realtime ratio — "1000 ms of audio processed in 20 ms wall
//! clock" means one core can handle 50 concurrent meetings at that
//! configuration.
//! 2. **What fraction of MCU time is actually encode?** Breaks the loop
//! into decode, mix, and encode phases so a later optimisation can
//! target the real bottleneck rather than the suspected one.
//! 3. **How much bandwidth does server-side mix save over pure SFU
//! fanout?** Compares the MCU's outgoing byte total (one stream per
//! receiver, each a leave-one-out mix) against what an SFU would send
//! (`P × (P-1)` stream copies across the meeting).
//!
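//! For concreteness: at P = 8 an SFU forwards 8 × 7 = 56 encoded-frame
//! copies per frame tick, while the MCU sends 8 re-encoded mixes, one
//! per receiver. That per-tick ratio is what the `mix_vs_fanout` column
//! of the report tracks.
//!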
//! The same test is the natural baseline for future codec work:
//!
//! - **Q15 coefficient-shift fix**: should reduce encoded byte totals on
//! bass-heavy content (music tests). Compare `bytes_out_mix` before and
//! after.
//! - **Encoder search optimisation**: should reduce `encode_ns` without
//! hurting `bytes_out_mix`. Compare the phase breakdown before and
//! after.
//!
//! Run with `cargo test --test mcu_mix --release -- --nocapture`.
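//!
//! `report` prints one block per test. The shape is as below; timing and
//! byte numbers are machine- and corpus-dependent, so they are elided:
//!
//! ```text
//! == mcu_mix_8people_voice ==
//!   8 participants × 1500 frames = 30.0 s of audio per stream
//!   wall: … ms (…× realtime → … concurrent meetings/core)
//!   phase: decode …%  mix …%  encode …%
//!   bytes: in … / out_mix … / out_fanout … / mix_vs_fanout …
//! ```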
use std::path::{Path, PathBuf};
use std::time::Instant;
use hound::WavReader;
use lac::{decode_frame, encode_frame};
const CORPUS_DIR: &str = "corpus";
/// 20 ms frame at 16 kHz — standard voice-chat frame length. Divides
/// cleanly by every partition order so the encoder search stays on the
/// dense path.
const FRAME_SIZE: usize = 320;
/// Cap wall-clock runtime by limiting how many frames we actually
/// process. At 20 ms/frame, 1500 frames is 30 s of audio per stream —
/// enough to average out transient behaviour without making the test take
/// minutes in CI.
const MAX_FRAMES: usize = 1500;
// ── WAV loading ─────────────────────────────────────────────────────────────
fn load_mono(path: &Path) -> Option<Vec<i32>> {
let mut reader = WavReader::open(path).ok()?;
let spec = reader.spec();
if spec.sample_format != hound::SampleFormat::Int
|| spec.channels != 1
|| spec.bits_per_sample > 24
{
return None;
}
let samples: Result<Vec<i32>, _> = reader.samples::<i32>().collect();
samples.ok()
}
fn corpus(name: &str) -> PathBuf {
Path::new(CORPUS_DIR).join(name)
}
macro_rules! require {
($path:expr) => {
if !$path.exists() {
eprintln!("skipping: corpus file not found: {}", $path.display());
return;
}
};
}
// ── MCU mix pipeline ────────────────────────────────────────────────────────
/// Result of one MCU simulation run.
#[derive(Default)]
struct McuStats {
/// Number of participants in the simulated meeting.
participants: usize,
/// Number of frames processed per participant (each frame is `FRAME_SIZE` samples).
frames: usize,
/// Total nanoseconds spent in `decode_frame` across all decodes.
decode_ns: u128,
/// Total nanoseconds spent doing the leave-one-out mix additions.
mix_ns: u128,
/// Total nanoseconds spent in `encode_frame` across all output encodes.
encode_ns: u128,
/// Bytes sent *into* the MCU by participants (pre-encoded frames).
bytes_in: usize,
/// Bytes sent *out* of the MCU (one stream per receiver, each a
/// leave-one-out mix).
bytes_out_mix: usize,
/// Bytes an SFU would send out for the same meeting: `P × (P-1)`
/// stream copies, since every participant receives every other
/// participant's stream verbatim with no mix.
bytes_out_fanout: usize,
}
impl McuStats {
fn total_ns(&self) -> u128 {
self.decode_ns + self.mix_ns + self.encode_ns
}
/// Audio duration processed, in milliseconds.
fn audio_ms(&self) -> f64 {
// FRAME_SIZE samples at 16 kHz = (FRAME_SIZE / 16000) seconds per frame.
(self.frames as f64) * (FRAME_SIZE as f64) / 16_000.0 * 1000.0
}
/// Realtime multiplier — audio ms per wall-clock ms. A value of 50
/// means this workload runs 50× faster than realtime, i.e., one core
/// handles 50 concurrent meetings of this configuration.
fn realtime_ratio(&self) -> f64 {
let wall_ms = (self.total_ns() as f64) / 1_000_000.0;
self.audio_ms() / wall_ms
}
}
/// Run the decode → mix → encode loop for a simulated MCU meeting with
/// the given pre-encoded participant streams. Returns timing and byte
/// accounting for a single run.
fn simulate_meeting(encoded_streams: &[Vec<Vec<u8>>]) -> McuStats {
let p = encoded_streams.len();
assert!(p >= 2, "need at least 2 participants for a meeting");
let n_frames = encoded_streams.iter().map(|s| s.len()).min().unwrap();
let n_frames = n_frames.min(MAX_FRAMES);
let mut stats = McuStats {
participants: p,
frames: n_frames,
..Default::default()
};
// Ingress bytes: each participant sends all its encoded frames to the MCU.
for s in encoded_streams {
stats.bytes_in += s[..n_frames].iter().map(|f| f.len()).sum::<usize>();
}
// Reusable scratch buffer for leave-one-out mixes. The mix for
// receiver `r` excludes participant `r`'s own voice so they don't
// hear themselves delayed.
let mut mix = vec![0i32; FRAME_SIZE];
// `frame_idx` is a tick index that selects the same position across
// every participant's encoded-frame vector. Converting this to
// `iter().zip(...)` across variable-arity participants complicates
// the loop body for no runtime benefit, so the integer-index form
// stays; silence the `needless_range_loop` lint explicitly.
#[allow(clippy::needless_range_loop)]
for frame_idx in 0..n_frames {
// ── Phase 1: decode every incoming stream for this frame tick ──
let t0 = Instant::now();
let decoded: Vec<Vec<i32>> = (0..p)
.map(|i| decode_frame(&encoded_streams[i][frame_idx]).expect("decode"))
.collect();
stats.decode_ns += t0.elapsed().as_nanos();
// ── Phase 2 + 3: for each receiver, build its leave-one-out mix and encode ──
for receiver in 0..p {
// Mix (i32 additions). Wrapping add is safe here: samples are
// 24-bit (within ±(2^23 - 1)), so an i32 accumulator holds up to
// (2^31 - 1) / (2^23 - 1) = 256 full-scale summands before
// overflow, far above the P-1 <= 15 streams any test in this
// file mixes (checked by `mix_headroom_arithmetic` below).
let t_mix = Instant::now();
mix.fill(0);
for (i, stream) in decoded.iter().enumerate() {
if i == receiver {
continue;
}
for (m, &s) in mix.iter_mut().zip(stream.iter()) {
*m = m.wrapping_add(s);
}
}
stats.mix_ns += t_mix.elapsed().as_nanos();
// Clamp the mix to the 24-bit input range LAC guarantees. In
// practice, summing 4 typical speech streams almost never
// exceeds the range (human voice peaks well under full-scale,
// and constructive superposition of uncorrelated speakers is
// rare), but clamp anyway to stay inside the codec contract.
for m in mix.iter_mut() {
*m = (*m).clamp(-(1 << 23) + 1, (1 << 23) - 1);
}
let t_enc = Instant::now();
let encoded = encode_frame(&mix);
stats.encode_ns += t_enc.elapsed().as_nanos();
stats.bytes_out_mix += encoded.len();
}
// Fanout byte accounting: for each frame, each participant's
// stream is copied to the P-1 other receivers, with no re-encode.
// That's P × (P-1) copies, each the original encoded frame size.
for stream in encoded_streams.iter() {
stats.bytes_out_fanout += stream[frame_idx].len() * (p - 1);
}
}
stats
}
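/// Sanity check for the headroom claim in `simulate_meeting`'s mix
/// loop: pure integer arithmetic, no codec calls. An i32 accumulator
/// holds 256 full-scale 24-bit samples before overflow, so the
/// wrapping adds over P-1 <= 15 streams never actually wrap.
#[test]
fn mix_headroom_arithmetic() {
    let full_scale: i64 = (1 << 23) - 1; // largest-magnitude clamped 24-bit sample
    let summands = (i32::MAX as i64) / full_scale;
    assert_eq!(summands, 256);
    // Largest meeting in this file is P = 16, i.e. 15 summands per mix.
    assert!(15 < summands);
}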
// ── Reporting ───────────────────────────────────────────────────────────────
fn report(name: &str, s: &McuStats) {
let total_us = (s.total_ns() as f64) / 1000.0;
let decode_pct = 100.0 * (s.decode_ns as f64) / (s.total_ns() as f64);
let mix_pct = 100.0 * (s.mix_ns as f64) / (s.total_ns() as f64);
let encode_pct = 100.0 * (s.encode_ns as f64) / (s.total_ns() as f64);
let bandwidth_ratio = s.bytes_out_mix as f64 / s.bytes_out_fanout as f64;
eprintln!();
eprintln!("== {name} ==");
eprintln!(
" {} participants × {} frames = {:.1} s of audio per stream",
s.participants,
s.frames,
s.audio_ms() / 1000.0
);
eprintln!(
" wall: {:.1} ms ({:.1}× realtime → {:.0} concurrent meetings/core)",
total_us / 1000.0,
s.realtime_ratio(),
s.realtime_ratio(),
);
eprintln!(
" phase: decode {:>5.1}% mix {:>5.1}% encode {:>5.1}%",
decode_pct, mix_pct, encode_pct
);
eprintln!(
" bytes: in {} / out_mix {} / out_fanout {} / mix_vs_fanout {:.2}",
s.bytes_in, s.bytes_out_mix, s.bytes_out_fanout, bandwidth_ratio
);
}
// ── Tests ───────────────────────────────────────────────────────────────────
/// Pre-encode all participant streams into `FRAME_SIZE`-sample frames and
/// hand the result to `simulate_meeting`. Splitting the setup out of the
/// simulation keeps the reported `encode_ns` to the *server-side* encode
/// work only — participant-side encoding is a separate machine and
/// shouldn't contaminate the MCU measurement.
fn run(name: &str, stream_names: &[&str]) {
run_inner(name, stream_names, Activity::Continuous);
}
/// Turn-taking activity pattern. `Continuous` is every participant
/// producing audio every frame (the pessimistic load-bearing case for
/// MCU mix/encode cost). `DominantSpeaker { window_frames }` zeroes
/// every stream except a single rotating speaker for each
/// `window_frames`-long block. This is the realistic meeting behaviour
/// — one person talks at a time — under which the MCU compute cost
/// drops substantially (most mixes collapse to one voice + silence) but
/// the relative byte savings vs SFU shrink (silence forwards for
/// essentially free on a pure SFU, so the SFU egress number also falls).
enum Activity {
Continuous,
DominantSpeaker { window_frames: usize },
}
fn run_inner(name: &str, stream_names: &[&str], activity: Activity) {
let paths: Vec<PathBuf> = stream_names.iter().map(|n| corpus(n)).collect();
for p in &paths {
if !p.exists() {
eprintln!("skipping {name}: missing {}", p.display());
return;
}
}
// Cap each stream to the number of samples we'll actually simulate,
// so we don't spend time pre-encoding the remaining 30+ minutes of
// audio that `simulate_meeting` will never touch. The AMI files are
// multi-minute recordings; MAX_FRAMES = 1500 @ 20 ms/frame = 30 s.
let max_samples = MAX_FRAMES * FRAME_SIZE;
let mut streams: Vec<Vec<i32>> = paths
.iter()
.map(|p| {
let mut s = load_mono(p).expect("load_mono");
s.truncate(max_samples);
s
})
.collect();
if let Activity::DominantSpeaker { window_frames } = activity {
apply_dominant_speaker(&mut streams, window_frames);
}
let encoded_streams: Vec<Vec<Vec<u8>>> = streams
.iter()
.map(|s| s.chunks(FRAME_SIZE).map(encode_frame).collect())
.collect();
let stats = simulate_meeting(&encoded_streams);
report(name, &stats);
// Sanity assertion: MCU egress must not exceed SFU egress. Equality
// holds at P=2 (the leave-one-out mix is just the other participant's
// voice, which encodes to the same size as forwarding their original
// stream); strict inequality holds for P ≥ 3 because fanout's byte
// count grows as `P × (P-1)` while mix grows as `P`.
//
// For dominant-speaker the invariant still holds on the egress side
// (MCU produces one stream per receiver; SFU forwards P-1 streams
// per receiver) but the absolute margin narrows because silence
// compresses to ~1 bit per sample and SFU's egress of P-1 silent
// streams is essentially free.
assert!(
stats.bytes_out_mix <= stats.bytes_out_fanout,
"MCU mix produced more bytes than SFU fanout: mix={} fanout={}",
stats.bytes_out_mix,
stats.bytes_out_fanout
);
}
/// Zero every stream except a single rotating speaker per
/// `window_frames`-long block. Assumes every stream in `streams` has the
/// same length and `FRAME_SIZE` divides it; the runtime calls this only
/// with post-truncate stream slices where both hold.
///
/// The rotation uses the block index modulo the participant count, so
/// each participant gets roughly `total_blocks / participants` turns
/// spread across the measurement window. At the default
/// `window_frames = 100` (2 s per turn at 20 ms frames) and
/// `MAX_FRAMES = 1500`, P=8 gives each participant ~1.9 turns — enough
/// for the phase breakdown to average over speaker transitions.
fn apply_dominant_speaker(streams: &mut [Vec<i32>], window_frames: usize) {
let n = streams.len();
if n == 0 {
return;
}
// AMI files aren't guaranteed to be identical length even after the
// common `truncate(max_samples)` step — a short recording stays
// short. Use the shortest stream as the rotation horizon so the
// per-stream slice below never runs past a stream's own end.
let common_len = streams.iter().map(|s| s.len()).min().unwrap_or(0);
let total_frames = common_len / FRAME_SIZE;
let window_samples = window_frames * FRAME_SIZE;
for block in 0..total_frames.div_ceil(window_frames) {
let active = block % n;
let start = block * window_samples;
let end = (start + window_samples).min(common_len);
for (i, s) in streams.iter_mut().enumerate() {
if i == active {
continue;
}
for sample in &mut s[start..end] {
*sample = 0;
}
}
}
}
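/// Minimal sketch verifying the rotation contract of
/// `apply_dominant_speaker` on tiny synthetic streams, independent of
/// the corpus: within each window exactly one stream keeps its samples
/// and the rest are zeroed.
#[test]
fn dominant_speaker_rotation_zeroes_inactive_streams() {
    let window_frames = 2;
    let n_frames = 6; // three 2-frame blocks rotating over two streams
    let mut streams = vec![vec![1i32; n_frames * FRAME_SIZE]; 2];
    apply_dominant_speaker(&mut streams, window_frames);
    let window_samples = window_frames * FRAME_SIZE;
    for block in 0..n_frames / window_frames {
        let active = block % streams.len();
        let (start, end) = (block * window_samples, (block + 1) * window_samples);
        for (i, s) in streams.iter().enumerate() {
            let expect = if i == active { 1 } else { 0 };
            assert!(s[start..end].iter().all(|&x| x == expect));
        }
    }
}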
#[test]
fn mcu_mix_1on1_voice() {
require!(corpus("ES2002a.Headset-0.wav"));
run(
"mcu_mix_1on1_voice",
&["ES2002a.Headset-0.wav", "ES2002a.Headset-1.wav"],
);
}
#[test]
fn mcu_mix_3people_voice() {
require!(corpus("ES2002a.Headset-0.wav"));
run(
"mcu_mix_3people_voice",
&[
"ES2002a.Headset-0.wav",
"ES2002a.Headset-1.wav",
"ES2002a.Headset-2.wav",
],
);
}
#[test]
fn mcu_mix_5people_voice() {
require!(corpus("ES2002a.Headset-0.wav"));
run(
"mcu_mix_5people_voice",
&[
"ES2002a.Headset-0.wav",
"ES2002a.Headset-1.wav",
"ES2002a.Headset-2.wav",
"ES2002a.Headset-3.wav",
"ES2002a.Lapel-0.wav",
],
);
}
#[test]
fn mcu_mix_8people_voice() {
// Covers the cross-over where mix bandwidth savings become dramatic —
// fanout grows quadratically, server-mix stays linear.
require!(corpus("ES2002a.Headset-0.wav"));
run(
"mcu_mix_8people_voice",
&[
"ES2002a.Headset-0.wav",
"ES2002a.Headset-1.wav",
"ES2002a.Headset-2.wav",
"ES2002a.Headset-3.wav",
"ES2002a.Lapel-0.wav",
"ES2002a.Lapel-1.wav",
"ES2002a.Lapel-2.wav",
"ES2002a.Lapel-3.wav",
],
);
}
#[test]
fn mcu_mix_8people_dominant_speaker() {
// Same participant set as `mcu_mix_8people_voice`, but only one
// participant has audio at any given moment — rotating every
// 2 seconds (100 frames @ 20 ms each). This is the realistic
// meeting behaviour: one person talks while N-1 listen. The
// continuous-speech test above is the pessimistic workload
// (everyone talks simultaneously, which inflates MCU mix cost and
// SFU fanout byte count); this test shows what the server actually
// experiences in production traffic.
//
// Expected differences vs the continuous variant:
// - Decode phase: roughly unchanged in wall-clock (silent frames
// decode about as cheaply as voice frames; the decoder still
// walks every residual).
// - Mix phase: still O(P²) additions per frame, unchanged.
// - Encode phase: drops sharply. Most mixes collapse to "one
// voice + silence + silence + …", which encodes roughly like
// a single voice stream instead of a P-1-way sum.
// - Egress ratio (MCU vs SFU): narrows. SFU forwards (P-1) silent
// streams per receiver nearly for free, so the SFU egress
// baseline drops faster than MCU egress does.
require!(corpus("ES2002a.Headset-0.wav"));
run_inner(
"mcu_mix_8people_dominant_speaker",
&[
"ES2002a.Headset-0.wav",
"ES2002a.Headset-1.wav",
"ES2002a.Headset-2.wav",
"ES2002a.Headset-3.wav",
"ES2002a.Lapel-0.wav",
"ES2002a.Lapel-1.wav",
"ES2002a.Lapel-2.wav",
"ES2002a.Lapel-3.wav",
],
Activity::DominantSpeaker { window_frames: 100 },
);
}
#[test]
fn mcu_mix_16people_voice() {
// Past the typical full-mesh MCU sweet spot but still plausible for
// a mid-size meeting before hierarchical routing kicks in. The mix
// phase does 16 × 15 = 240 stream additions per frame tick (each a
// FRAME_SIZE-long sample-wise add) vs 8 × 7 = 56 at P=8, so the
// quadratic mix term starts showing up in
// the phase breakdown. Larger real meetings (P > 20) are typically
// routed via dominant-speaker selection rather than full-mesh mix,
// so this is a reasonable ceiling for the full-mesh data point.
require!(corpus("ES2002a.Headset-0.wav"));
run(
"mcu_mix_16people_voice",
&[
"ES2002a.Headset-0.wav",
"ES2002a.Headset-1.wav",
"ES2002a.Headset-2.wav",
"ES2002a.Headset-3.wav",
"ES2002a.Lapel-0.wav",
"ES2002a.Lapel-1.wav",
"ES2002a.Lapel-2.wav",
"ES2002a.Lapel-3.wav",
"ES2002a.Array1-01.wav",
"ES2002a.Array1-02.wav",
"ES2002a.Array1-03.wav",
"ES2002a.Array1-04.wav",
"ES2002a.Array1-05.wav",
"ES2002a.Array1-06.wav",
"ES2002a.Array1-07.wav",
"ES2002a.Array1-08.wav",
],
);
}