fix: switch VAD gate to per-frame probability control
onSpeechStart/onSpeechEnd fire only at speech-segment boundaries, so with constant non-speech noise onSpeechEnd never fired and the gate stayed open. Switch to onFrameProcessed, which fires every ~96ms, and apply hysteresis (open at >=0.5, close at <0.35) matching Silero's own thresholds. The gate now starts closed and opens only once the first speech frame is confirmed.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -12,27 +12,34 @@ const log = logger.getChild("[SileroVADGate]");
|
|||||||
|
|
||||||
const VAD_BASE_PATH = "/vad/";
|
const VAD_BASE_PATH = "/vad/";
|
||||||
|
|
||||||
|
// Hysteresis thresholds: the gate opens once the speech probability reaches the open threshold and closes once it drops below the close threshold.
|
||||||
|
// vad-web's defaults are positiveSpeechThreshold=0.5, negativeSpeechThreshold=0.35.
|
||||||
|
// We use those same values so the gate tracks the model's own speech/silence logic.
|
||||||
|
const SPEECH_OPEN_THRESHOLD = 0.5;
|
||||||
|
const SPEECH_CLOSE_THRESHOLD = 0.35;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wraps @ricky0123/vad-web's MicVAD to feed speech/silence decisions into the
|
* Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability
|
||||||
* NoiseGateTransformer's VAD gate. Instead of creating its own microphone
|
* decisions into the NoiseGateTransformer's VAD gate.
|
||||||
* stream, it receives the existing LiveKit MediaStream so the VAD sees exactly
|
|
||||||
* the same audio the worklet processes.
|
|
||||||
*
|
*
|
||||||
* Usage:
|
* Uses onFrameProcessed (fires every ~96ms) rather than the segment-level
|
||||||
* const gate = new SileroVADGate(stream, audioContext);
|
* onSpeechStart/onSpeechEnd callbacks. The segment callbacks only fire at
|
||||||
* gate.onSpeechStart = () => transformer.setVADOpen(true);
|
* speech segment boundaries — with purely non-speech noise, onSpeechEnd
|
||||||
* gate.onSpeechEnd = () => transformer.setVADOpen(false);
|
* never fires and the gate stays open. Per-frame probability control fixes
|
||||||
* await gate.start();
|
* this: the gate closes on the first frame whose speech probability falls below the close threshold.
|
||||||
* // later:
|
*
|
||||||
* await gate.destroy();
|
* The gate starts closed (silent) and opens only once the VAD confirms speech.
|
||||||
*/
|
*/
|
||||||
export class SileroVADGate {
|
export class SileroVADGate {
|
||||||
public onSpeechStart: () => void = () => {};
|
/** Called each time the gate transitions to open (speech detected). */
|
||||||
public onSpeechEnd: () => void = () => {};
|
public onOpen: () => void = () => {};
|
||||||
|
/** Called each time the gate transitions to closed (silence detected). */
|
||||||
|
public onClose: () => void = () => {};
|
||||||
|
|
||||||
private vad: MicVAD | null = null;
|
private vad: MicVAD | null = null;
|
||||||
private readonly stream: MediaStream;
|
private readonly stream: MediaStream;
|
||||||
private readonly audioContext: AudioContext;
|
private readonly audioContext: AudioContext;
|
||||||
|
private gateOpen = false;
|
||||||
|
|
||||||
public constructor(stream: MediaStream, audioContext: AudioContext) {
|
public constructor(stream: MediaStream, audioContext: AudioContext) {
|
||||||
this.stream = stream;
|
this.stream = stream;
|
||||||
@@ -57,21 +64,28 @@ export class SileroVADGate {
|
|||||||
pauseStream: async (): Promise<void> => {},
|
pauseStream: async (): Promise<void> => {},
|
||||||
// eslint-disable-next-line @typescript-eslint/require-await
|
// eslint-disable-next-line @typescript-eslint/require-await
|
||||||
resumeStream: async (): Promise<MediaStream> => stream,
|
resumeStream: async (): Promise<MediaStream> => stream,
|
||||||
onSpeechStart: (): void => {
|
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
|
||||||
log.debug("speech start");
|
const p = probabilities.isSpeech;
|
||||||
this.onSpeechStart();
|
if (!this.gateOpen && p >= SPEECH_OPEN_THRESHOLD) {
|
||||||
|
this.gateOpen = true;
|
||||||
|
log.debug("gate open (isSpeech=", p, ")");
|
||||||
|
this.onOpen();
|
||||||
|
} else if (this.gateOpen && p < SPEECH_CLOSE_THRESHOLD) {
|
||||||
|
this.gateOpen = false;
|
||||||
|
log.debug("gate close (isSpeech=", p, ")");
|
||||||
|
this.onClose();
|
||||||
|
}
|
||||||
},
|
},
|
||||||
onSpeechEnd: (): void => {
|
onSpeechStart: (): void => {},
|
||||||
log.debug("speech end");
|
onSpeechEnd: (): void => {},
|
||||||
this.onSpeechEnd();
|
onVADMisfire: (): void => {},
|
||||||
},
|
|
||||||
onVADMisfire: (): void => {
|
|
||||||
log.debug("VAD misfire");
|
|
||||||
},
|
|
||||||
onFrameProcessed: (): void => {},
|
|
||||||
onSpeechRealStart: (): void => {},
|
onSpeechRealStart: (): void => {},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Gate starts closed — audio is muted until the first speech frame arrives.
|
||||||
|
this.gateOpen = false;
|
||||||
|
this.onClose();
|
||||||
|
|
||||||
await this.vad.start();
|
await this.vad.start();
|
||||||
log.info("MicVAD started");
|
log.info("MicVAD started");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -468,8 +468,10 @@ export class Publisher {
|
|||||||
}
|
}
|
||||||
const stream = new MediaStream([rawTrack]);
|
const stream = new MediaStream([rawTrack]);
|
||||||
vadGate = new SileroVADGate(stream, ctx);
|
vadGate = new SileroVADGate(stream, ctx);
|
||||||
vadGate.onSpeechStart = (): void => transformer?.setVADOpen(true);
|
// Close the gate immediately — VAD will open it once speech is confirmed.
|
||||||
vadGate.onSpeechEnd = (): void => transformer?.setVADOpen(false);
|
transformer?.setVADOpen(false);
|
||||||
|
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
|
||||||
|
vadGate.onClose = (): void => transformer?.setVADOpen(false);
|
||||||
vadGate.start().catch((e: unknown) => {
|
vadGate.start().catch((e: unknown) => {
|
||||||
this.logger.error("[VAD] failed to start", e);
|
this.logger.error("[VAD] failed to start", e);
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user