feat: add Silero VAD toggle to audio pipeline
Integrates @ricky0123/vad-web's MicVAD as an optional voice activity detector alongside the noise gate. When enabled, the Silero ONNX model classifies the microphone audio as speech or silence; while no speech is detected, the worklet's output is muted via a new VAD gate message. VAD is wired into Publisher.ts alongside the existing noise gate transformer. Vite is configured to copy the worklet bundle, ONNX model, and ORT WASM files to /vad/ so they're reachable at runtime.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,11 @@ interface NoiseGateParams {
|
||||
transientReleaseMs: number; // how quickly suppression fades after transient ends
|
||||
}
|
||||
|
||||
/**
 * Control message posted to the worklet's port to drive the VAD gate.
 * It is told apart from a NoiseGateParams payload by its `type` tag.
 */
interface VADGateMessage {
  /** `true` lets audio through; `false` mutes the worklet output. */
  open: boolean;
  /** Discriminator identifying this payload as a VAD gate command. */
  type: "vad-gate";
}
|
||||
|
||||
/**
 * Converts a level in decibels to a linear amplitude factor.
 *
 * @param db - Level in dB (0 dB maps to a factor of 1).
 * @returns The linear factor `10^(db / 20)`.
 */
function dbToLinear(db: number): number {
  const exponent = db / 20;
  return 10 ** exponent;
}
|
||||
@@ -65,12 +70,19 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
// Exponential smoothing coefficient for background RMS (~200ms time constant)
|
||||
private rmsCoeff = Math.exp(-1.0 / (0.2 * sampleRate));
|
||||
|
||||
// VAD gate state (controlled externally via port message)
|
||||
private vadGateOpen = true; // starts open until VAD sends its first decision
|
||||
|
||||
private logCounter = 0;
|
||||
|
||||
public constructor() {
|
||||
super();
|
||||
this.port.onmessage = (e: MessageEvent<NoiseGateParams>): void => {
|
||||
this.updateParams(e.data);
|
||||
this.port.onmessage = (e: MessageEvent<NoiseGateParams | VADGateMessage>): void => {
|
||||
if ((e.data as VADGateMessage).type === "vad-gate") {
|
||||
this.vadGateOpen = (e.data as VADGateMessage).open;
|
||||
} else {
|
||||
this.updateParams(e.data as NoiseGateParams);
|
||||
}
|
||||
};
|
||||
this.updateParams({
|
||||
threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150,
|
||||
@@ -148,7 +160,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
const gain = this.gateAttenuation * transientGain;
|
||||
const gain = this.gateAttenuation * transientGain * (this.vadGateOpen ? 1.0 : 0.0);
|
||||
|
||||
for (let c = 0; c < output.length; c++) {
|
||||
const inCh = input[c] ?? input[0];
|
||||
|
||||
@@ -119,6 +119,11 @@ export class NoiseGateTransformer implements AudioTrackProcessor {
|
||||
this.sendParams();
|
||||
}
|
||||
|
||||
/**
 * Opens or closes the VAD-controlled gate inside the worklet.
 * Does nothing when no worklet node is currently attached.
 *
 * @param open - `true` to let audio through, `false` to mute it.
 */
public setVADOpen(open: boolean): void {
  const node = this.workletNode;
  if (!node) return;
  node.port.postMessage({ type: "vad-gate", open });
}
|
||||
|
||||
private sendParams(): void {
|
||||
if (!this.workletNode) return;
|
||||
log.debug("sendParams:", this.params);
|
||||
|
||||
85
src/livekit/SileroVADGate.ts
Normal file
85
src/livekit/SileroVADGate.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
/*
|
||||
Copyright 2026 New Vector Ltd.
|
||||
|
||||
SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
|
||||
Please see LICENSE in the repository root for full details.
|
||||
*/
|
||||
|
||||
import { MicVAD, getDefaultRealTimeVADOptions } from "@ricky0123/vad-web";
|
||||
import { logger } from "matrix-js-sdk/lib/logger";
|
||||
|
||||
const log = logger.getChild("[SileroVADGate]");
|
||||
|
||||
const VAD_BASE_PATH = "/vad/";
|
||||
|
||||
/**
 * Wraps @ricky0123/vad-web's MicVAD to feed speech/silence decisions into the
 * NoiseGateTransformer's VAD gate. Instead of creating its own microphone
 * stream, it receives the existing LiveKit MediaStream so the VAD sees exactly
 * the same audio the worklet processes.
 *
 * Lifecycle safety: `destroy()` may be called at any time, including while
 * `start()` is still loading the model. A MicVAD that finishes initialising
 * after it was destroyed (or superseded by a newer `start()`) is torn down
 * immediately and never forwards callbacks.
 *
 * Usage:
 *   const gate = new SileroVADGate(stream, audioContext);
 *   gate.onSpeechStart = () => transformer.setVADOpen(true);
 *   gate.onSpeechEnd = () => transformer.setVADOpen(false);
 *   await gate.start();
 *   // later:
 *   await gate.destroy();
 */
export class SileroVADGate {
  /** Called when MicVAD reports that speech has started. Replace to react. */
  public onSpeechStart: () => void = () => {};
  /** Called when MicVAD reports that speech has ended. Replace to react. */
  public onSpeechEnd: () => void = () => {};

  private vad: MicVAD | null = null;
  private readonly stream: MediaStream;
  private readonly audioContext: AudioContext;

  /**
   * Bumped by every start() and destroy(). Async work captures the value it
   * started with and treats itself as stale once the counter has moved on.
   */
  private session = 0;

  /**
   * @param stream - Existing audio stream the VAD should listen to.
   * @param audioContext - AudioContext handed to MicVAD for its processing.
   */
  public constructor(stream: MediaStream, audioContext: AudioContext) {
    this.stream = stream;
    this.audioContext = audioContext;
  }

  /**
   * Creates the MicVAD instance and starts it.
   *
   * Any instance left over from an earlier start() is destroyed first. If
   * destroy() (or another start()) runs while this call is still awaiting,
   * the freshly created MicVAD is destroyed instead of being started.
   *
   * Rejects if MicVAD fails to initialise or start.
   */
  public async start(): Promise<void> {
    // Claim the session synchronously, before any await, so a destroy()
    // issued right after start() is reliably observed below.
    const session = ++this.session;
    const stream = this.stream;
    const audioContext = this.audioContext;

    const previous = this.vad;
    this.vad = null;
    if (previous) {
      await previous.destroy();
      if (session !== this.session) return;
    }

    log.info("initialising MicVAD, baseAssetPath:", VAD_BASE_PATH);

    const vad = await MicVAD.new({
      ...getDefaultRealTimeVADOptions("legacy"),
      audioContext,
      baseAssetPath: VAD_BASE_PATH,
      onnxWASMBasePath: VAD_BASE_PATH,
      startOnLoad: false,
      // Provide the existing stream instead of calling getUserMedia
      // eslint-disable-next-line @typescript-eslint/require-await
      getStream: async (): Promise<MediaStream> => stream,
      pauseStream: async (): Promise<void> => {},
      // eslint-disable-next-line @typescript-eslint/require-await
      resumeStream: async (): Promise<MediaStream> => stream,
      onSpeechStart: (): void => {
        // Ignore events from an instance that has been destroyed/superseded.
        if (session !== this.session) return;
        log.debug("speech start");
        this.onSpeechStart();
      },
      onSpeechEnd: (): void => {
        if (session !== this.session) return;
        log.debug("speech end");
        this.onSpeechEnd();
      },
      onVADMisfire: (): void => {
        log.debug("VAD misfire");
      },
      onFrameProcessed: (): void => {},
      onSpeechRealStart: (): void => {},
    });

    if (session !== this.session) {
      // destroy() ran while the model was loading; at that point this.vad was
      // still null, so nobody else will ever clean this instance up.
      log.debug("destroyed while initialising, discarding MicVAD");
      await vad.destroy();
      return;
    }

    this.vad = vad;
    await vad.start();
    if (session === this.session) log.info("MicVAD started");
  }

  /**
   * Stops the VAD and releases it. Safe to call repeatedly, before start(),
   * or while start() is still in flight.
   */
  public async destroy(): Promise<void> {
    // Invalidate any in-flight start() and silence late callbacks.
    this.session++;
    const vad = this.vad;
    // Clear the field first so concurrent destroy() calls don't double-destroy.
    this.vad = null;
    if (vad) {
      await vad.destroy();
    }
  }
}
|
||||
@@ -32,6 +32,7 @@ import {
|
||||
transientSuppressorEnabled as transientSuppressorEnabledSetting,
|
||||
transientThreshold as transientThresholdSetting,
|
||||
transientRelease as transientReleaseSetting,
|
||||
vadEnabled as vadEnabledSetting,
|
||||
} from "./settings";
|
||||
import { PreferencesSettingsTab } from "./PreferencesSettingsTab";
|
||||
import { Slider } from "../Slider";
|
||||
@@ -129,6 +130,9 @@ export const SettingsModal: FC<Props> = ({
|
||||
|
||||
const [showAdvancedGate, setShowAdvancedGate] = useState(false);
|
||||
|
||||
// Voice activity detection
|
||||
const [vadActive, setVadActive] = useSetting(vadEnabledSetting);
|
||||
|
||||
// Transient suppressor settings
|
||||
const [transientEnabled, setTransientEnabled] = useSetting(transientSuppressorEnabledSetting);
|
||||
const [transientThreshold, setTransientThreshold] = useSetting(transientThresholdSetting);
|
||||
@@ -310,6 +314,31 @@ export const SettingsModal: FC<Props> = ({
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
<div className={styles.noiseGateSection}>
|
||||
<Heading
|
||||
type="body"
|
||||
weight="semibold"
|
||||
size="sm"
|
||||
as="h4"
|
||||
className={styles.noiseGateHeading}
|
||||
>
|
||||
Voice Activity Detection
|
||||
</Heading>
|
||||
<Separator className={styles.noiseGateSeparator} />
|
||||
<FieldRow>
|
||||
<InputField
|
||||
id="vadEnabled"
|
||||
type="checkbox"
|
||||
label="Enable voice activity detection"
|
||||
description="Uses the Silero VAD model to mute audio when no speech is detected. Requires the noise gate to be enabled."
|
||||
checked={vadActive}
|
||||
onChange={(e: ChangeEvent<HTMLInputElement>): void =>
|
||||
setVadActive(e.target.checked)
|
||||
}
|
||||
disabled={!noiseGateEnabled}
|
||||
/>
|
||||
</FieldRow>
|
||||
</div>
|
||||
<div className={styles.noiseGateSection}>
|
||||
<Heading
|
||||
type="body"
|
||||
|
||||
@@ -145,6 +145,8 @@ export const noiseGateHold = new Setting<number>("noise-gate-hold", 200);
|
||||
// Time in ms for the gate to fully close after hold expires
|
||||
export const noiseGateRelease = new Setting<number>("noise-gate-release", 150);
|
||||
|
||||
export const vadEnabled = new Setting<boolean>("vad-enabled", false);
|
||||
|
||||
export const transientSuppressorEnabled = new Setting<boolean>(
|
||||
"transient-suppressor-enabled",
|
||||
false,
|
||||
|
||||
@@ -41,11 +41,13 @@ import {
|
||||
transientSuppressorEnabled,
|
||||
transientThreshold,
|
||||
transientRelease,
|
||||
vadEnabled,
|
||||
} from "../../../settings/settings.ts";
|
||||
import {
|
||||
type NoiseGateParams,
|
||||
NoiseGateTransformer,
|
||||
} from "../../../livekit/NoiseGateTransformer.ts";
|
||||
import { SileroVADGate } from "../../../livekit/SileroVADGate.ts";
|
||||
import { observeTrackReference$ } from "../../observeTrackReference";
|
||||
import { type Connection } from "../remoteMembers/Connection.ts";
|
||||
import { ObservableScope } from "../../ObservableScope.ts";
|
||||
@@ -435,6 +437,7 @@ export class Publisher {
|
||||
|
||||
let transformer: NoiseGateTransformer | null = null;
|
||||
let audioCtx: AudioContext | null = null;
|
||||
let vadGate: SileroVADGate | null = null;
|
||||
|
||||
const currentParams = (): NoiseGateParams => ({
|
||||
threshold: noiseGateThreshold.getValue(),
|
||||
@@ -446,6 +449,32 @@ export class Publisher {
|
||||
transientReleaseMs: transientRelease.getValue(),
|
||||
});
|
||||
|
||||
/**
 * Tears down the active VAD gate (if any) and re-opens the worklet's VAD
 * gate so audio keeps flowing when VAD is toggled off mid-call.
 */
const stopVAD = (): void => {
  const gate = vadGate;
  vadGate = null;
  if (gate) {
    // destroy() is asynchronous: detach the callbacks first so a late
    // "speech end" from the dying gate cannot close the gate again after
    // the reset below.
    gate.onSpeechStart = (): void => {};
    gate.onSpeechEnd = (): void => {};
    gate.destroy().catch((e: unknown) => {
      this.logger.warn("[VAD] failed to destroy", e);
    });
  }
  // Reset gate to open so audio flows if VAD is toggled off mid-call
  transformer?.setVADOpen(true);
};
|
||||
|
||||
/**
 * (Re)starts voice activity detection for the given track, replacing any
 * gate that is already running. Speech start/end events open/close the
 * transformer's VAD gate. A start failure is logged, not thrown.
 */
const startVAD = (track: LocalAudioTrack, ctx: AudioContext): void => {
  // Only one gate may be live at a time.
  stopVAD();

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const rawTrack: MediaStreamTrack | undefined = (track as any).mediaStreamTrack;
  if (rawTrack) {
    // Feed the VAD the very same track the worklet processes.
    const gate = new SileroVADGate(new MediaStream([rawTrack]), ctx);
    vadGate = gate;
    gate.onSpeechStart = (): void => transformer?.setVADOpen(true);
    gate.onSpeechEnd = (): void => transformer?.setVADOpen(false);
    gate.start().catch((e: unknown) => {
      this.logger.error("[VAD] failed to start", e);
    });
  } else {
    this.logger.warn("[VAD] no underlying MediaStreamTrack — skipping VAD");
  }
};
|
||||
|
||||
// Attach / detach processor when enabled state or the track changes.
|
||||
combineLatest([audioTrack$, noiseGateEnabled.value$])
|
||||
.pipe(scope.bind())
|
||||
@@ -459,18 +488,20 @@ export class Publisher {
|
||||
this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(audioTrack as any).setAudioContext(audioCtx);
|
||||
audioCtx.resume().then(() => {
|
||||
audioCtx.resume().then(async () => {
|
||||
this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state);
|
||||
return audioTrack
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
.setProcessor(transformer as any);
|
||||
}).then(() => {
|
||||
this.logger.info("[NoiseGate] setProcessor resolved");
|
||||
if (vadEnabled.getValue() && audioCtx) startVAD(audioTrack, audioCtx);
|
||||
}).catch((e: unknown) => {
|
||||
this.logger.error("[NoiseGate] setProcessor failed", e);
|
||||
});
|
||||
} else if (!enabled && audioTrack.getProcessor()) {
|
||||
this.logger.info("[NoiseGate] removing processor");
|
||||
stopVAD();
|
||||
void audioTrack.stopProcessor();
|
||||
void audioCtx?.close();
|
||||
audioCtx = null;
|
||||
@@ -482,6 +513,18 @@ export class Publisher {
|
||||
}
|
||||
});
|
||||
|
||||
// React to the VAD toggle (and to track changes): spin the detector up or
// tear it down. Skipped while there is no track or no AudioContext yet.
combineLatest([audioTrack$, vadEnabled.value$])
  .pipe(scope.bind())
  .subscribe(([track, vadOn]) => {
    if (track && audioCtx) {
      if (vadOn) {
        startVAD(track, audioCtx);
      } else {
        stopVAD();
      }
    }
  });
|
||||
|
||||
// Push param changes to the live worklet without recreating the processor.
|
||||
combineLatest([
|
||||
noiseGateThreshold.value$,
|
||||
|
||||
Reference in New Issue
Block a user