feat: add Silero VAD toggle to audio pipeline
Integrates @ricky0123/vad-web's MicVAD as an optional voice activity detector alongside the noise gate. When enabled, the Silero ONNX model classifies each audio frame as speech or silence; silence frames mute the worklet's output via a new VAD gate message. VAD is wired into Publisher.ts alongside the existing noise gate transformer and reuses its AudioContext, so the VAD toggle only takes effect while the noise gate processor is attached. Vite is configured to copy the worklet bundle, ONNX model, and ORT WASM files to /vad/ so they're reachable at runtime. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -41,11 +41,13 @@ import {
|
||||
transientSuppressorEnabled,
|
||||
transientThreshold,
|
||||
transientRelease,
|
||||
vadEnabled,
|
||||
} from "../../../settings/settings.ts";
|
||||
import {
|
||||
type NoiseGateParams,
|
||||
NoiseGateTransformer,
|
||||
} from "../../../livekit/NoiseGateTransformer.ts";
|
||||
import { SileroVADGate } from "../../../livekit/SileroVADGate.ts";
|
||||
import { observeTrackReference$ } from "../../observeTrackReference";
|
||||
import { type Connection } from "../remoteMembers/Connection.ts";
|
||||
import { ObservableScope } from "../../ObservableScope.ts";
|
||||
@@ -435,6 +437,7 @@ export class Publisher {
|
||||
|
||||
let transformer: NoiseGateTransformer | null = null;
|
||||
let audioCtx: AudioContext | null = null;
|
||||
let vadGate: SileroVADGate | null = null;
|
||||
|
||||
const currentParams = (): NoiseGateParams => ({
|
||||
threshold: noiseGateThreshold.getValue(),
|
||||
@@ -446,6 +449,32 @@ export class Publisher {
|
||||
transientReleaseMs: transientRelease.getValue(),
|
||||
});
|
||||
|
||||
/**
 * Tears down the active Silero VAD gate, if there is one, and then forces the
 * transformer's VAD gate open.
 *
 * Safe to call when no gate is running: the teardown is skipped and only the
 * transformer (when present) is told to open.
 */
const stopVAD = (): void => {
  const activeGate = vadGate;
  if (activeGate !== null) {
    // Fire-and-forget: the result of destroy() is deliberately discarded.
    void activeGate.destroy();
    vadGate = null;
  }
  // Reset gate to open so audio flows if VAD is toggled off mid-call
  transformer?.setVADOpen(true);
};
|
||||
|
||||
/**
 * Starts a Silero VAD gate for `track`, replacing any gate already running.
 *
 * The gate's speech-start / speech-end callbacks open and close the
 * transformer's VAD gate. Start-up failures are logged, not thrown.
 *
 * @param track - Local audio track whose underlying MediaStreamTrack is fed to the VAD.
 * @param ctx - AudioContext handed to the SileroVADGate constructor.
 */
const startVAD = (track: LocalAudioTrack, ctx: AudioContext): void => {
  // Always begin from a clean state: destroys any previous gate and opens the VAD gate.
  stopVAD();

  // NOTE(review): confirm `mediaStreamTrack` is the raw microphone track here and
  // not the processor's output; if it is the gated output, a closed gate would
  // feed silence back into the VAD.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const rawTrack: MediaStreamTrack | undefined = (track as any).mediaStreamTrack;
  if (!rawTrack) {
    this.logger.warn("[VAD] no underlying MediaStreamTrack — skipping VAD");
    return;
  }

  const gate = new SileroVADGate(new MediaStream([rawTrack]), ctx);

  // stopVAD() does not wait for destroy() to finish, so a superseded gate may
  // still emit a late event. Only the gate currently stored in `vadGate` is
  // allowed to drive the transformer; otherwise a stale speech-end could close
  // the gate after VAD was switched off and leave the audio muted.
  const isCurrent = (): boolean => vadGate === gate;
  gate.onSpeechStart = (): void => {
    if (isCurrent()) transformer?.setVADOpen(true);
  };
  gate.onSpeechEnd = (): void => {
    if (isCurrent()) transformer?.setVADOpen(false);
  };

  vadGate = gate;
  gate.start().catch((e: unknown) => {
    this.logger.error("[VAD] failed to start", e);
  });
};
|
||||
|
||||
// Attach / detach processor when enabled state or the track changes.
|
||||
combineLatest([audioTrack$, noiseGateEnabled.value$])
|
||||
.pipe(scope.bind())
|
||||
@@ -459,18 +488,20 @@ export class Publisher {
|
||||
this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(audioTrack as any).setAudioContext(audioCtx);
|
||||
audioCtx.resume().then(() => {
|
||||
audioCtx.resume().then(async () => {
|
||||
this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state);
|
||||
return audioTrack
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
.setProcessor(transformer as any);
|
||||
}).then(() => {
|
||||
this.logger.info("[NoiseGate] setProcessor resolved");
|
||||
if (vadEnabled.getValue() && audioCtx) startVAD(audioTrack, audioCtx);
|
||||
}).catch((e: unknown) => {
|
||||
this.logger.error("[NoiseGate] setProcessor failed", e);
|
||||
});
|
||||
} else if (!enabled && audioTrack.getProcessor()) {
|
||||
this.logger.info("[NoiseGate] removing processor");
|
||||
stopVAD();
|
||||
void audioTrack.stopProcessor();
|
||||
void audioCtx?.close();
|
||||
audioCtx = null;
|
||||
@@ -482,6 +513,18 @@ export class Publisher {
|
||||
}
|
||||
});
|
||||
|
||||
// React to the VAD toggle (and to track changes): start the VAD gate when the
// setting is on, stop it when the setting is off. Nothing happens while there
// is no audio track or no AudioContext (`audioCtx` is nulled when the
// noise-gate processor is removed).
combineLatest([audioTrack$, vadEnabled.value$])
  .pipe(scope.bind())
  .subscribe(([track, vadOn]) => {
    const ctx = audioCtx;
    if (track && ctx) {
      if (!vadOn) {
        stopVAD();
      } else {
        startVAD(track, ctx);
      }
    }
  });
|
||||
|
||||
// Push param changes to the live worklet without recreating the processor.
|
||||
combineLatest([
|
||||
noiseGateThreshold.value$,
|
||||
|
||||
Reference in New Issue
Block a user