From 9f5b63919007f7f797badfd27f7f27e73d8c7eb3 Mon Sep 17 00:00:00 2001 From: mk Date: Mon, 23 Mar 2026 23:34:08 -0300 Subject: [PATCH] fix: switch VAD gate to per-frame probability control MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit onSpeechStart/onSpeechEnd fire at segment boundaries — with constant non-speech noise, onSpeechEnd never fires, so the gate stays open. Switch to onFrameProcessed, which fires every ~96ms, and apply hysteresis (open at >=0.5, close at <0.35) matching Silero's own thresholds. Gate now starts closed and opens only once the first speech frame is confirmed. Co-Authored-By: Claude Sonnet 4.6 --- src/livekit/SileroVADGate.ts | 62 ++++++++++++------- .../CallViewModel/localMember/Publisher.ts | 6 +- 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/src/livekit/SileroVADGate.ts b/src/livekit/SileroVADGate.ts index 357c9c62..e7bd9751 100644 --- a/src/livekit/SileroVADGate.ts +++ b/src/livekit/SileroVADGate.ts @@ -12,27 +12,34 @@ const log = logger.getChild("[SileroVADGate]"); const VAD_BASE_PATH = "/vad/"; +// Hysteresis: probability >= the open threshold opens the gate; < the close threshold closes it. +// vad-web's defaults are positiveSpeechThreshold=0.5, negativeSpeechThreshold=0.35. +// We use those same values so the gate tracks the model's own speech/silence logic. +const SPEECH_OPEN_THRESHOLD = 0.5; +const SPEECH_CLOSE_THRESHOLD = 0.35; + /** - * Wraps @ricky0123/vad-web's MicVAD to feed speech/silence decisions into the - * NoiseGateTransformer's VAD gate. Instead of creating its own microphone - * stream, it receives the existing LiveKit MediaStream so the VAD sees exactly - * the same audio the worklet processes. + * Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability + * decisions into the NoiseGateTransformer's VAD gate. 
* - * Usage: - * const gate = new SileroVADGate(stream, audioContext); - * gate.onSpeechStart = () => transformer.setVADOpen(true); - * gate.onSpeechEnd = () => transformer.setVADOpen(false); - * await gate.start(); - * // later: - * await gate.destroy(); + * Uses onFrameProcessed (fires every ~96ms) rather than the segment-level + * onSpeechStart/onSpeechEnd callbacks. The segment callbacks only fire at + * speech segment boundaries — with purely non-speech noise, onSpeechEnd + * never fires and the gate stays open. Per-frame probability control fixes + * this: the gate closes on the first silent frame. + * + * The gate starts closed (silent) and opens only once the VAD confirms speech. */ export class SileroVADGate { - public onSpeechStart: () => void = () => {}; - public onSpeechEnd: () => void = () => {}; + /** Called each time the gate transitions to open (speech detected). */ + public onOpen: () => void = () => {}; + /** Called each time the gate transitions to closed (silence detected). 
*/ + public onClose: () => void = () => {}; private vad: MicVAD | null = null; private readonly stream: MediaStream; private readonly audioContext: AudioContext; + private gateOpen = false; public constructor(stream: MediaStream, audioContext: AudioContext) { this.stream = stream; @@ -57,21 +64,28 @@ export class SileroVADGate { pauseStream: async (): Promise<void> => {}, // eslint-disable-next-line @typescript-eslint/require-await resumeStream: async (): Promise<MediaStream> => stream, - onSpeechStart: (): void => { - log.debug("speech start"); - this.onSpeechStart(); + onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => { + const p = probabilities.isSpeech; + if (!this.gateOpen && p >= SPEECH_OPEN_THRESHOLD) { + this.gateOpen = true; + log.debug("gate open (isSpeech=", p, ")"); + this.onOpen(); + } else if (this.gateOpen && p < SPEECH_CLOSE_THRESHOLD) { + this.gateOpen = false; + log.debug("gate close (isSpeech=", p, ")"); + this.onClose(); + } }, - onSpeechEnd: (): void => { - log.debug("speech end"); - this.onSpeechEnd(); - }, - onVADMisfire: (): void => { - log.debug("VAD misfire"); - }, - onFrameProcessed: (): void => {}, + onSpeechStart: (): void => {}, + onSpeechEnd: (): void => {}, + onVADMisfire: (): void => {}, onSpeechRealStart: (): void => {}, }); + // Gate starts closed — audio is muted until the first speech frame arrives. 
+ this.gateOpen = false; + this.onClose(); + await this.vad.start(); log.info("MicVAD started"); } diff --git a/src/state/CallViewModel/localMember/Publisher.ts b/src/state/CallViewModel/localMember/Publisher.ts index 521788bb..faed893b 100644 --- a/src/state/CallViewModel/localMember/Publisher.ts +++ b/src/state/CallViewModel/localMember/Publisher.ts @@ -468,8 +468,10 @@ export class Publisher { } const stream = new MediaStream([rawTrack]); vadGate = new SileroVADGate(stream, ctx); - vadGate.onSpeechStart = (): void => transformer?.setVADOpen(true); - vadGate.onSpeechEnd = (): void => transformer?.setVADOpen(false); + // Close the gate immediately — VAD will open it once speech is confirmed. + transformer?.setVADOpen(false); + vadGate.onOpen = (): void => transformer?.setVADOpen(true); + vadGate.onClose = (): void => transformer?.setVADOpen(false); vadGate.start().catch((e: unknown) => { this.logger.error("[VAD] failed to start", e); });