fix: switch VAD gate to per-frame probability control

onSpeechStart/onSpeechEnd fire only at speech-segment boundaries — with
constant non-speech noise, onSpeechEnd never fired, so the gate stayed open.
Switch to onFrameProcessed, which fires every ~96ms, and apply hysteresis
(open at >=0.5, close at <0.35) matching Silero's own thresholds. The gate
now starts closed and opens only once the first speech frame is confirmed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mk
2026-03-23 23:34:08 -03:00
parent 428b76db25
commit 9f5b639190
2 changed files with 42 additions and 26 deletions

View File

@@ -12,27 +12,34 @@ const log = logger.getChild("[SileroVADGate]");
const VAD_BASE_PATH = "/vad/";
// Hysteresis thresholds: the gate opens when the speech probability reaches
// SPEECH_OPEN_THRESHOLD and closes when it falls below SPEECH_CLOSE_THRESHOLD.
// vad-web's defaults are positiveSpeechThreshold=0.5, negativeSpeechThreshold=0.35.
// We use those same values so the gate tracks the model's own speech/silence logic.
const SPEECH_OPEN_THRESHOLD = 0.5;
const SPEECH_CLOSE_THRESHOLD = 0.35;
/**
* Wraps @ricky0123/vad-web's MicVAD to feed speech/silence decisions into the
* NoiseGateTransformer's VAD gate. Instead of creating its own microphone
* stream, it receives the existing LiveKit MediaStream so the VAD sees exactly
* the same audio the worklet processes.
* Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability
* decisions into the NoiseGateTransformer's VAD gate.
*
* Usage:
* const gate = new SileroVADGate(stream, audioContext);
* gate.onSpeechStart = () => transformer.setVADOpen(true);
* gate.onSpeechEnd = () => transformer.setVADOpen(false);
* await gate.start();
* // later:
* await gate.destroy();
* Uses onFrameProcessed (fires every ~96ms) rather than the segment-level
* onSpeechStart/onSpeechEnd callbacks. The segment callbacks only fire at
* speech segment boundaries — with purely non-speech noise, onSpeechEnd
* never fires and the gate stays open. Per-frame probability control fixes
* this: the gate closes on the first frame whose speech probability falls
* below the close threshold.
*
* The gate starts closed (silent) and opens only once the VAD confirms speech.
*/
export class SileroVADGate {
public onSpeechStart: () => void = () => {};
public onSpeechEnd: () => void = () => {};
/** Called each time the gate transitions to open (speech detected). */
public onOpen: () => void = () => {};
/** Called each time the gate transitions to closed (silence detected). */
public onClose: () => void = () => {};
private vad: MicVAD | null = null;
private readonly stream: MediaStream;
private readonly audioContext: AudioContext;
private gateOpen = false;
public constructor(stream: MediaStream, audioContext: AudioContext) {
this.stream = stream;
@@ -57,21 +64,28 @@ export class SileroVADGate {
pauseStream: async (): Promise<void> => {},
// eslint-disable-next-line @typescript-eslint/require-await
resumeStream: async (): Promise<MediaStream> => stream,
onSpeechStart: (): void => {
log.debug("speech start");
this.onSpeechStart();
// Per-frame hysteresis: reports only state transitions. A closed gate opens
// once isSpeech reaches SPEECH_OPEN_THRESHOLD; an open gate closes once
// isSpeech drops below SPEECH_CLOSE_THRESHOLD. Values in between leave the
// gate in its current state.
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
  const speechProbability = probabilities.isSpeech;
  if (this.gateOpen) {
    // Already open: the only possible transition is to closed.
    if (speechProbability < SPEECH_CLOSE_THRESHOLD) {
      this.gateOpen = false;
      log.debug("gate close (isSpeech=", speechProbability, ")");
      this.onClose();
    }
    return;
  }
  // Currently closed: the only possible transition is to open.
  if (speechProbability >= SPEECH_OPEN_THRESHOLD) {
    this.gateOpen = true;
    log.debug("gate open (isSpeech=", speechProbability, ")");
    this.onOpen();
  }
},
onSpeechEnd: (): void => {
log.debug("speech end");
this.onSpeechEnd();
},
onVADMisfire: (): void => {
log.debug("VAD misfire");
},
onFrameProcessed: (): void => {},
onSpeechStart: (): void => {},
onSpeechEnd: (): void => {},
onVADMisfire: (): void => {},
onSpeechRealStart: (): void => {},
});
// Gate starts closed — audio is muted until the first speech frame arrives.
this.gateOpen = false;
this.onClose();
await this.vad.start();
log.info("MicVAD started");
}

View File

@@ -468,8 +468,10 @@ export class Publisher {
}
const stream = new MediaStream([rawTrack]);
vadGate = new SileroVADGate(stream, ctx);
vadGate.onSpeechStart = (): void => transformer?.setVADOpen(true);
vadGate.onSpeechEnd = (): void => transformer?.setVADOpen(false);
// Close the gate immediately — VAD will open it once speech is confirmed.
transformer?.setVADOpen(false);
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
vadGate.onClose = (): void => transformer?.setVADOpen(false);
vadGate.start().catch((e: unknown) => {
this.logger.error("[VAD] failed to start", e);
});