fix: switch VAD gate to per-frame probability control
onSpeechStart/onSpeechEnd fire only at speech-segment boundaries — with constant non-speech noise, onSpeechEnd never fires, so the gate stayed open. Switch to onFrameProcessed, which fires every ~96ms, and apply hysteresis (open at >=0.5, close at <0.35) using the same values as vad-web's default positiveSpeechThreshold/negativeSpeechThreshold. The gate now starts closed and opens only once the first speech frame is confirmed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,27 +12,34 @@ const log = logger.getChild("[SileroVADGate]");
|
||||
|
||||
const VAD_BASE_PATH = "/vad/";
|
||||
|
||||
// Hysteresis: a speech probability at or above SPEECH_OPEN_THRESHOLD opens the gate; once open, it closes only when the probability falls below SPEECH_CLOSE_THRESHOLD.
|
||||
// vad-web's defaults are positiveSpeechThreshold=0.5, negativeSpeechThreshold=0.35.
|
||||
// We use those same values so the gate tracks the model's own speech/silence logic.
|
||||
const SPEECH_OPEN_THRESHOLD = 0.5;
|
||||
const SPEECH_CLOSE_THRESHOLD = 0.35;
|
||||
|
||||
/**
|
||||
* Wraps @ricky0123/vad-web's MicVAD to feed speech/silence decisions into the
|
||||
* NoiseGateTransformer's VAD gate. Instead of creating its own microphone
|
||||
* stream, it receives the existing LiveKit MediaStream so the VAD sees exactly
|
||||
* the same audio the worklet processes.
|
||||
* Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability
|
||||
* decisions into the NoiseGateTransformer's VAD gate.
|
||||
*
|
||||
* Usage:
|
||||
* const gate = new SileroVADGate(stream, audioContext);
|
||||
* gate.onSpeechStart = () => transformer.setVADOpen(true);
|
||||
* gate.onSpeechEnd = () => transformer.setVADOpen(false);
|
||||
* await gate.start();
|
||||
* // later:
|
||||
* await gate.destroy();
|
||||
* Uses onFrameProcessed (fires every ~96ms) rather than the segment-level
|
||||
* onSpeechStart/onSpeechEnd callbacks. The segment callbacks only fire at
|
||||
* speech segment boundaries — with purely non-speech noise, onSpeechEnd
|
||||
* never fires and the gate stays open. Per-frame probability control fixes
|
||||
* this: the gate closes on the first frame whose speech probability falls below SPEECH_CLOSE_THRESHOLD.
|
||||
*
|
||||
* The gate starts closed (silent) and opens only once the VAD confirms speech.
|
||||
*/
|
||||
export class SileroVADGate {
|
||||
public onSpeechStart: () => void = () => {};
|
||||
public onSpeechEnd: () => void = () => {};
|
||||
/** Called each time the gate transitions to open (speech detected). */
|
||||
public onOpen: () => void = () => {};
|
||||
/** Called each time the gate transitions to closed (silence detected), and once during startup before the VAD begins running, so the gate starts closed. */
|
||||
public onClose: () => void = () => {};
|
||||
|
||||
private vad: MicVAD | null = null;
|
||||
private readonly stream: MediaStream;
|
||||
private readonly audioContext: AudioContext;
|
||||
private gateOpen = false;
|
||||
|
||||
public constructor(stream: MediaStream, audioContext: AudioContext) {
|
||||
this.stream = stream;
|
||||
@@ -57,21 +64,28 @@ export class SileroVADGate {
|
||||
pauseStream: async (): Promise<void> => {},
|
||||
// eslint-disable-next-line @typescript-eslint/require-await
|
||||
resumeStream: async (): Promise<MediaStream> => stream,
|
||||
onSpeechStart: (): void => {
|
||||
log.debug("speech start");
|
||||
this.onSpeechStart();
|
||||
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
|
||||
const p = probabilities.isSpeech;
|
||||
if (!this.gateOpen && p >= SPEECH_OPEN_THRESHOLD) {
|
||||
this.gateOpen = true;
|
||||
log.debug("gate open (isSpeech=", p, ")");
|
||||
this.onOpen();
|
||||
} else if (this.gateOpen && p < SPEECH_CLOSE_THRESHOLD) {
|
||||
this.gateOpen = false;
|
||||
log.debug("gate close (isSpeech=", p, ")");
|
||||
this.onClose();
|
||||
}
|
||||
},
|
||||
onSpeechEnd: (): void => {
|
||||
log.debug("speech end");
|
||||
this.onSpeechEnd();
|
||||
},
|
||||
onVADMisfire: (): void => {
|
||||
log.debug("VAD misfire");
|
||||
},
|
||||
onFrameProcessed: (): void => {},
|
||||
onSpeechStart: (): void => {},
|
||||
onSpeechEnd: (): void => {},
|
||||
onVADMisfire: (): void => {},
|
||||
onSpeechRealStart: (): void => {},
|
||||
});
|
||||
|
||||
// Gate starts closed — audio is muted until the first speech frame arrives.
|
||||
this.gateOpen = false;
|
||||
this.onClose();
|
||||
|
||||
await this.vad.start();
|
||||
log.info("MicVAD started");
|
||||
}
|
||||
|
||||
@@ -468,8 +468,10 @@ export class Publisher {
|
||||
}
|
||||
const stream = new MediaStream([rawTrack]);
|
||||
vadGate = new SileroVADGate(stream, ctx);
|
||||
vadGate.onSpeechStart = (): void => transformer?.setVADOpen(true);
|
||||
vadGate.onSpeechEnd = (): void => transformer?.setVADOpen(false);
|
||||
// Close the gate immediately — VAD will open it once speech is confirmed.
|
||||
transformer?.setVADOpen(false);
|
||||
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
|
||||
vadGate.onClose = (): void => transformer?.setVADOpen(false);
|
||||
vadGate.start().catch((e: unknown) => {
|
||||
this.logger.error("[VAD] failed to start", e);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user