fix: switch VAD gate to per-frame probability control

onSpeechStart/onSpeechEnd fire only at speech-segment boundaries. With
constant non-speech noise, once the gate had been opened onSpeechEnd never
fired, so the gate stayed open.
Switch to onFrameProcessed, which fires every ~96ms, and apply hysteresis
(open at >=0.5, close at <0.35) using vad-web's default thresholds. The gate
now starts closed and opens only once the first speech frame is confirmed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mk
2026-03-23 23:34:08 -03:00
parent 428b76db25
commit 9f5b639190
2 changed files with 42 additions and 26 deletions

View File

@@ -12,27 +12,34 @@ const log = logger.getChild("[SileroVADGate]");
const VAD_BASE_PATH = "/vad/"; const VAD_BASE_PATH = "/vad/";
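// Hysteresis thresholds for the gate: a frame whose speech probability is at
// or above SPEECH_OPEN_THRESHOLD opens it; a frame below SPEECH_CLOSE_THRESHOLD
// closes it. Probabilities in between leave the gate in its current state.
// vad-web's defaults are positiveSpeechThreshold=0.5, negativeSpeechThreshold=0.35.
// We use those same values so the gate tracks the model's own speech/silence logic.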
const SPEECH_OPEN_THRESHOLD = 0.5;
const SPEECH_CLOSE_THRESHOLD = 0.35;
/** /**
* Wraps @ricky0123/vad-web's MicVAD to feed speech/silence decisions into the * Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability
* NoiseGateTransformer's VAD gate. Instead of creating its own microphone * decisions into the NoiseGateTransformer's VAD gate.
* stream, it receives the existing LiveKit MediaStream so the VAD sees exactly
* the same audio the worklet processes.
* *
* Usage: * Uses onFrameProcessed (fires every ~96ms) rather than the segment-level
* const gate = new SileroVADGate(stream, audioContext); * onSpeechStart/onSpeechEnd callbacks. The segment callbacks only fire at
* gate.onSpeechStart = () => transformer.setVADOpen(true); * speech segment boundaries — with purely non-speech noise, onSpeechEnd
* gate.onSpeechEnd = () => transformer.setVADOpen(false); * never fires and the gate stays open. Per-frame probability control fixes
* await gate.start(); * this: the gate closes on the first silent frame.
* // later: *
* await gate.destroy(); * The gate starts closed (silent) and opens only once the VAD confirms speech.
*/ */
export class SileroVADGate { export class SileroVADGate {
public onSpeechStart: () => void = () => {}; /** Called each time the gate transitions to open (speech detected). */
public onSpeechEnd: () => void = () => {}; public onOpen: () => void = () => {};
/** Called each time the gate transitions to closed (silence detected). */
public onClose: () => void = () => {};
private vad: MicVAD | null = null; private vad: MicVAD | null = null;
private readonly stream: MediaStream; private readonly stream: MediaStream;
private readonly audioContext: AudioContext; private readonly audioContext: AudioContext;
private gateOpen = false;
public constructor(stream: MediaStream, audioContext: AudioContext) { public constructor(stream: MediaStream, audioContext: AudioContext) {
this.stream = stream; this.stream = stream;
@@ -57,21 +64,28 @@ export class SileroVADGate {
pauseStream: async (): Promise<void> => {}, pauseStream: async (): Promise<void> => {},
// eslint-disable-next-line @typescript-eslint/require-await // eslint-disable-next-line @typescript-eslint/require-await
resumeStream: async (): Promise<MediaStream> => stream, resumeStream: async (): Promise<MediaStream> => stream,
onSpeechStart: (): void => { onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
log.debug("speech start"); const p = probabilities.isSpeech;
this.onSpeechStart(); if (!this.gateOpen && p >= SPEECH_OPEN_THRESHOLD) {
this.gateOpen = true;
log.debug("gate open (isSpeech=", p, ")");
this.onOpen();
} else if (this.gateOpen && p < SPEECH_CLOSE_THRESHOLD) {
this.gateOpen = false;
log.debug("gate close (isSpeech=", p, ")");
this.onClose();
}
}, },
onSpeechEnd: (): void => { onSpeechStart: (): void => {},
log.debug("speech end"); onSpeechEnd: (): void => {},
this.onSpeechEnd(); onVADMisfire: (): void => {},
},
onVADMisfire: (): void => {
log.debug("VAD misfire");
},
onFrameProcessed: (): void => {},
onSpeechRealStart: (): void => {}, onSpeechRealStart: (): void => {},
}); });
// Gate starts closed — audio is muted until the first speech frame arrives.
this.gateOpen = false;
this.onClose();
await this.vad.start(); await this.vad.start();
log.info("MicVAD started"); log.info("MicVAD started");
} }

View File

@@ -468,8 +468,10 @@ export class Publisher {
} }
const stream = new MediaStream([rawTrack]); const stream = new MediaStream([rawTrack]);
vadGate = new SileroVADGate(stream, ctx); vadGate = new SileroVADGate(stream, ctx);
vadGate.onSpeechStart = (): void => transformer?.setVADOpen(true); // Close the gate immediately — VAD will open it once speech is confirmed.
vadGate.onSpeechEnd = (): void => transformer?.setVADOpen(false); transformer?.setVADOpen(false);
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
vadGate.onClose = (): void => transformer?.setVADOpen(false);
vadGate.start().catch((e: unknown) => { vadGate.start().catch((e: unknown) => {
this.logger.error("[VAD] failed to start", e); this.logger.error("[VAD] failed to start", e);
}); });