feat: add Silero VAD toggle to audio pipeline

Integrates @ricky0123/vad-web's MicVAD as an optional voice activity detector
alongside the noise gate. When enabled, the Silero ONNX model classifies each
audio frame as speech or silence; silence frames mute the worklet's output via
a new VAD gate message. VAD is wired into Publisher.ts alongside the existing
noise gate transformer. Vite is configured to copy the worklet bundle, ONNX
model, and ORT WASM files to /vad/ so they're reachable at runtime.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mk
2026-03-23 23:29:43 -03:00
parent 0788e56c51
commit 428b76db25
9 changed files with 386 additions and 6 deletions

View File

@@ -30,6 +30,11 @@ interface NoiseGateParams {
transientReleaseMs: number; // how quickly suppression fades after transient ends
}
/**
 * Control message posted from the main thread to the worklet's MessagePort to
 * open or close the VAD-driven gate. Distinguished from NoiseGateParams by the
 * "type" discriminant.
 */
interface VADGateMessage {
type: "vad-gate"; // discriminant checked in the worklet's onmessage handler
open: boolean; // true = pass audio through, false = mute output frames
}
/** Convert a decibel value to its linear amplitude equivalent (0 dB → 1.0). */
function dbToLinear(db: number): number {
  return 10 ** (db / 20);
}
@@ -65,12 +70,19 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
// Exponential smoothing coefficient for background RMS (~200ms time constant)
private rmsCoeff = Math.exp(-1.0 / (0.2 * sampleRate));
// VAD gate state (controlled externally via port message)
private vadGateOpen = true; // starts open until VAD sends its first decision
private logCounter = 0;
public constructor() {
super();
this.port.onmessage = (e: MessageEvent<NoiseGateParams>): void => {
this.updateParams(e.data);
this.port.onmessage = (e: MessageEvent<NoiseGateParams | VADGateMessage>): void => {
if ((e.data as VADGateMessage).type === "vad-gate") {
this.vadGateOpen = (e.data as VADGateMessage).open;
} else {
this.updateParams(e.data as NoiseGateParams);
}
};
this.updateParams({
threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150,
@@ -148,7 +160,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
}
}
const gain = this.gateAttenuation * transientGain;
const gain = this.gateAttenuation * transientGain * (this.vadGateOpen ? 1.0 : 0.0);
for (let c = 0; c < output.length; c++) {
const inCh = input[c] ?? input[0];

View File

@@ -119,6 +119,11 @@ export class NoiseGateTransformer implements AudioTrackProcessor {
this.sendParams();
}
/** Tell the worklet to open or close the VAD-controlled gate. */
public setVADOpen(open: boolean): void {
this.workletNode?.port.postMessage({ type: "vad-gate", open });
}
private sendParams(): void {
if (!this.workletNode) return;
log.debug("sendParams:", this.params);

View File

@@ -0,0 +1,85 @@
/*
Copyright 2026 New Vector Ltd.
SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
Please see LICENSE in the repository root for full details.
*/
import { MicVAD, getDefaultRealTimeVADOptions } from "@ricky0123/vad-web";
import { logger } from "matrix-js-sdk/lib/logger";
const log = logger.getChild("[SileroVADGate]");
const VAD_BASE_PATH = "/vad/";
/**
* Wraps @ricky0123/vad-web's MicVAD to feed speech/silence decisions into the
* NoiseGateTransformer's VAD gate. Instead of creating its own microphone
* stream, it receives the existing LiveKit MediaStream so the VAD sees exactly
* the same audio the worklet processes.
*
* Usage:
* const gate = new SileroVADGate(stream, audioContext);
* gate.onSpeechStart = () => transformer.setVADOpen(true);
* gate.onSpeechEnd = () => transformer.setVADOpen(false);
* await gate.start();
* // later:
* await gate.destroy();
*/
export class SileroVADGate {
  /** Called whenever the VAD transitions from silence to speech. */
  public onSpeechStart: () => void = () => {};
  /** Called whenever the VAD transitions from speech to silence. */
  public onSpeechEnd: () => void = () => {};

  private vad: MicVAD | null = null;
  // Set by destroy(); lets a still-in-flight start() know it must tear the
  // freshly-loaded MicVAD down instead of starting it. Without this, a
  // destroy() racing the async model load would see `vad === null`, do
  // nothing, and leak a running VAD with no owner.
  private destroyed = false;

  private readonly stream: MediaStream;
  private readonly audioContext: AudioContext;

  public constructor(stream: MediaStream, audioContext: AudioContext) {
    this.stream = stream;
    this.audioContext = audioContext;
  }

  /**
   * Load the Silero model and begin classifying frames from the supplied
   * stream. Safe to race with destroy(): if destroy() is called while the
   * model is still loading, the VAD is released as soon as loading finishes
   * and is never started.
   */
  public async start(): Promise<void> {
    this.destroyed = false;
    const stream = this.stream;
    const audioContext = this.audioContext;
    log.info("initialising MicVAD, baseAssetPath:", VAD_BASE_PATH);
    const vad = await MicVAD.new({
      ...getDefaultRealTimeVADOptions("legacy"),
      audioContext,
      baseAssetPath: VAD_BASE_PATH,
      onnxWASMBasePath: VAD_BASE_PATH,
      startOnLoad: false,
      // Provide the existing stream instead of calling getUserMedia
      // eslint-disable-next-line @typescript-eslint/require-await
      getStream: async (): Promise<MediaStream> => stream,
      // eslint-disable-next-line @typescript-eslint/require-await
      pauseStream: async (): Promise<void> => {},
      // eslint-disable-next-line @typescript-eslint/require-await
      resumeStream: async (): Promise<MediaStream> => stream,
      onSpeechStart: (): void => {
        log.debug("speech start");
        this.onSpeechStart();
      },
      onSpeechEnd: (): void => {
        log.debug("speech end");
        this.onSpeechEnd();
      },
      onVADMisfire: (): void => {
        log.debug("VAD misfire");
      },
      onFrameProcessed: (): void => {},
      onSpeechRealStart: (): void => {},
    });
    if (this.destroyed) {
      // destroy() won the race while the model was loading — clean up now.
      await vad.destroy();
      return;
    }
    this.vad = vad;
    await vad.start();
    log.info("MicVAD started");
  }

  /** Stop and release the VAD. Idempotent; safe to call while start() is still loading the model. */
  public async destroy(): Promise<void> {
    this.destroyed = true;
    if (this.vad) {
      await this.vad.destroy();
      this.vad = null;
    }
  }
}

View File

@@ -32,6 +32,7 @@ import {
transientSuppressorEnabled as transientSuppressorEnabledSetting,
transientThreshold as transientThresholdSetting,
transientRelease as transientReleaseSetting,
vadEnabled as vadEnabledSetting,
} from "./settings";
import { PreferencesSettingsTab } from "./PreferencesSettingsTab";
import { Slider } from "../Slider";
@@ -129,6 +130,9 @@ export const SettingsModal: FC<Props> = ({
const [showAdvancedGate, setShowAdvancedGate] = useState(false);
// Voice activity detection
const [vadActive, setVadActive] = useSetting(vadEnabledSetting);
// Transient suppressor settings
const [transientEnabled, setTransientEnabled] = useSetting(transientSuppressorEnabledSetting);
const [transientThreshold, setTransientThreshold] = useSetting(transientThresholdSetting);
@@ -310,6 +314,31 @@ export const SettingsModal: FC<Props> = ({
</>
)}
</div>
<div className={styles.noiseGateSection}>
<Heading
type="body"
weight="semibold"
size="sm"
as="h4"
className={styles.noiseGateHeading}
>
Voice Activity Detection
</Heading>
<Separator className={styles.noiseGateSeparator} />
<FieldRow>
<InputField
id="vadEnabled"
type="checkbox"
label="Enable voice activity detection"
description="Uses the Silero VAD model to mute audio when no speech is detected. Requires the noise gate to be enabled."
checked={vadActive}
onChange={(e: ChangeEvent<HTMLInputElement>): void =>
setVadActive(e.target.checked)
}
disabled={!noiseGateEnabled}
/>
</FieldRow>
</div>
<div className={styles.noiseGateSection}>
<Heading
type="body"

View File

@@ -145,6 +145,8 @@ export const noiseGateHold = new Setting<number>("noise-gate-hold", 200);
// Time in ms for the gate to fully close after hold expires
export const noiseGateRelease = new Setting<number>("noise-gate-release", 150);
// Whether the Silero VAD should gate (mute) audio during detected silence
export const vadEnabled = new Setting<boolean>("vad-enabled", false);
export const transientSuppressorEnabled = new Setting<boolean>(
"transient-suppressor-enabled",
false,

View File

@@ -41,11 +41,13 @@ import {
transientSuppressorEnabled,
transientThreshold,
transientRelease,
vadEnabled,
} from "../../../settings/settings.ts";
import {
type NoiseGateParams,
NoiseGateTransformer,
} from "../../../livekit/NoiseGateTransformer.ts";
import { SileroVADGate } from "../../../livekit/SileroVADGate.ts";
import { observeTrackReference$ } from "../../observeTrackReference";
import { type Connection } from "../remoteMembers/Connection.ts";
import { ObservableScope } from "../../ObservableScope.ts";
@@ -435,6 +437,7 @@ export class Publisher {
let transformer: NoiseGateTransformer | null = null;
let audioCtx: AudioContext | null = null;
let vadGate: SileroVADGate | null = null;
const currentParams = (): NoiseGateParams => ({
threshold: noiseGateThreshold.getValue(),
@@ -446,6 +449,32 @@ export class Publisher {
transientReleaseMs: transientRelease.getValue(),
});
// Tear down any running VAD instance and fail open: with no classifier
// running, the worklet's VAD gate must not be left muted.
const stopVAD = (): void => {
if (vadGate) {
void vadGate.destroy();
vadGate = null;
}
// Reset gate to open so audio flows if VAD is toggled off mid-call
transformer?.setVADOpen(true);
};
// Build a SileroVADGate over the track's underlying MediaStreamTrack and wire
// its speech/silence decisions into the noise-gate worklet. Replaces any
// previously running instance.
const startVAD = (track: LocalAudioTrack, ctx: AudioContext): void => {
stopVAD();
// NOTE(review): LocalAudioTrack in livekit-client appears to expose a public
// `mediaStreamTrack` getter — the `as any` cast may be unnecessary; verify
// against the pinned livekit-client version.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const rawTrack: MediaStreamTrack | undefined = (track as any).mediaStreamTrack;
if (!rawTrack) {
this.logger.warn("[VAD] no underlying MediaStreamTrack — skipping VAD");
return;
}
const stream = new MediaStream([rawTrack]);
vadGate = new SileroVADGate(stream, ctx);
vadGate.onSpeechStart = (): void => transformer?.setVADOpen(true);
vadGate.onSpeechEnd = (): void => transformer?.setVADOpen(false);
// Fire-and-forget: model load is async; failure is logged, not fatal.
vadGate.start().catch((e: unknown) => {
this.logger.error("[VAD] failed to start", e);
});
};
// Attach / detach processor when enabled state or the track changes.
combineLatest([audioTrack$, noiseGateEnabled.value$])
.pipe(scope.bind())
@@ -459,18 +488,20 @@ export class Publisher {
this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(audioTrack as any).setAudioContext(audioCtx);
audioCtx.resume().then(() => {
audioCtx.resume().then(async () => {
this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state);
return audioTrack
// eslint-disable-next-line @typescript-eslint/no-explicit-any
.setProcessor(transformer as any);
}).then(() => {
this.logger.info("[NoiseGate] setProcessor resolved");
if (vadEnabled.getValue() && audioCtx) startVAD(audioTrack, audioCtx);
}).catch((e: unknown) => {
this.logger.error("[NoiseGate] setProcessor failed", e);
});
} else if (!enabled && audioTrack.getProcessor()) {
this.logger.info("[NoiseGate] removing processor");
stopVAD();
void audioTrack.stopProcessor();
void audioCtx?.close();
audioCtx = null;
@@ -482,6 +513,18 @@ export class Publisher {
}
});
// Start/stop VAD when its toggle changes.
// NOTE(review): this silently bails when `audioCtx` is null, i.e. whenever the
// noise gate has not (yet) attached its processor. If this subscription fires
// before the noise-gate one, or the noise gate is disabled, enabling the VAD
// toggle has no effect until the gate/track state next changes — confirm this
// ordering dependency is intended.
combineLatest([audioTrack$, vadEnabled.value$])
.pipe(scope.bind())
.subscribe(([audioTrack, enabled]) => {
if (!audioTrack || !audioCtx) return;
if (enabled) {
startVAD(audioTrack, audioCtx);
} else {
stopVAD();
}
});
// Push param changes to the live worklet without recreating the processor.
combineLatest([
noiseGateThreshold.value$,