feat: add Silero VAD toggle to audio pipeline

Integrates @ricky0123/vad-web's MicVAD as an optional voice activity detector
alongside the noise gate. When enabled, the Silero ONNX model classifies each
audio frame as speech or silence; silence frames mute the worklet's output via
a new VAD gate message. VAD is wired into Publisher.ts alongside the existing
noise gate transformer. Vite is configured to copy the worklet bundle, ONNX
model, and ORT WASM files to /vad/ so they're reachable at runtime.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mk
2026-03-23 23:29:43 -03:00
parent 0788e56c51
commit 428b76db25
9 changed files with 386 additions and 6 deletions

View File

@@ -41,11 +41,13 @@ import {
transientSuppressorEnabled,
transientThreshold,
transientRelease,
vadEnabled,
} from "../../../settings/settings.ts";
import {
type NoiseGateParams,
NoiseGateTransformer,
} from "../../../livekit/NoiseGateTransformer.ts";
import { SileroVADGate } from "../../../livekit/SileroVADGate.ts";
import { observeTrackReference$ } from "../../observeTrackReference";
import { type Connection } from "../remoteMembers/Connection.ts";
import { ObservableScope } from "../../ObservableScope.ts";
@@ -435,6 +437,7 @@ export class Publisher {
let transformer: NoiseGateTransformer | null = null;
let audioCtx: AudioContext | null = null;
let vadGate: SileroVADGate | null = null;
const currentParams = (): NoiseGateParams => ({
threshold: noiseGateThreshold.getValue(),
@@ -446,6 +449,32 @@ export class Publisher {
transientReleaseMs: transientRelease.getValue(),
});
/**
 * Tears down the running VAD gate, if there is one, and leaves the
 * transformer's VAD gate open.
 */
const stopVAD = (): void => {
  const activeGate = vadGate;
  if (activeGate !== null) {
    // Fire-and-forget: the result of destroy() is intentionally not awaited.
    void activeGate.destroy();
    vadGate = null;
  }
  // Always reopen the gate so audio keeps flowing when VAD is switched off
  // in the middle of a call.
  transformer?.setVADOpen(true);
};
/**
 * Starts a fresh Silero VAD gate for the given track, replacing any gate that
 * is currently running.
 *
 * The gate's speech-start / speech-end callbacks open and close the noise-gate
 * transformer's VAD gate. Start-up failures are logged rather than thrown.
 *
 * @param track - Local audio track whose underlying MediaStreamTrack feeds the VAD.
 * @param ctx - AudioContext passed to the SileroVADGate constructor.
 */
const startVAD = (track: LocalAudioTrack, ctx: AudioContext): void => {
  // Drop any previous gate first; stopVAD also resets the transformer's VAD
  // gate to open.
  stopVAD();
  // NOTE(review): confirm that `mediaStreamTrack` is the raw microphone track
  // and not the processor's output. If it were the gated output, a closed gate
  // would starve the VAD of audio and it could never reopen — TODO verify
  // against the livekit-client version in use.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const rawTrack: MediaStreamTrack | undefined = (track as any).mediaStreamTrack;
  if (!rawTrack) {
    this.logger.warn("[VAD] no underlying MediaStreamTrack — skipping VAD");
    return;
  }
  const gate = new SileroVADGate(new MediaStream([rawTrack]), ctx);
  // Only the gate that is still current may drive the transformer. Without
  // this check, a gate that has been stopped or replaced could deliver a late
  // speech-end event and close the gate after stopVAD reopened it, leaving
  // the audio muted with nothing left to reopen it.
  const setOpenIfCurrent = (open: boolean): void => {
    if (vadGate === gate) transformer?.setVADOpen(open);
  };
  gate.onSpeechStart = (): void => setOpenIfCurrent(true);
  gate.onSpeechEnd = (): void => setOpenIfCurrent(false);
  vadGate = gate;
  gate.start().catch((e: unknown) => {
    this.logger.error("[VAD] failed to start", e);
  });
};
// Attach / detach processor when enabled state or the track changes.
combineLatest([audioTrack$, noiseGateEnabled.value$])
.pipe(scope.bind())
@@ -459,18 +488,20 @@ export class Publisher {
this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(audioTrack as any).setAudioContext(audioCtx);
audioCtx.resume().then(() => {
audioCtx.resume().then(async () => {
this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state);
return audioTrack
// eslint-disable-next-line @typescript-eslint/no-explicit-any
.setProcessor(transformer as any);
}).then(() => {
this.logger.info("[NoiseGate] setProcessor resolved");
if (vadEnabled.getValue() && audioCtx) startVAD(audioTrack, audioCtx);
}).catch((e: unknown) => {
this.logger.error("[NoiseGate] setProcessor failed", e);
});
} else if (!enabled && audioTrack.getProcessor()) {
this.logger.info("[NoiseGate] removing processor");
stopVAD();
void audioTrack.stopProcessor();
void audioCtx?.close();
audioCtx = null;
@@ -482,6 +513,18 @@ export class Publisher {
}
});
// Keep the VAD gate in sync with its settings toggle and with the current
// audio track.
combineLatest([audioTrack$, vadEnabled.value$])
  .pipe(scope.bind())
  .subscribe(([track, vadOn]) => {
    // Nothing to do until there is both a track and an AudioContext.
    if (!(track && audioCtx)) return;
    if (!vadOn) {
      stopVAD();
      return;
    }
    startVAD(track, audioCtx);
  });
// Push param changes to the live worklet without recreating the processor.
combineLatest([
noiseGateThreshold.value$,