feat: add Silero VAD toggle to audio pipeline

Integrates @ricky0123/vad-web's MicVAD as an optional voice activity detector
alongside the noise gate. When enabled, the Silero ONNX model classifies each
audio frame as speech or silence; silence frames mute the worklet's output via
a new VAD gate message. VAD is wired into Publisher.ts alongside the existing
noise gate transformer. Vite is configured to copy the worklet bundle, ONNX
model, and ORT WASM files to /vad/ so they're reachable at runtime.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mk
2026-03-23 23:29:43 -03:00
parent 0788e56c51
commit 428b76db25
9 changed files with 386 additions and 6 deletions

View File

@@ -30,6 +30,11 @@ interface NoiseGateParams {
transientReleaseMs: number; // how quickly suppression fades after transient ends
}
/**
 * Control message posted from the main thread to the worklet's MessagePort to
 * open or close the VAD-driven gate. Distinguished from NoiseGateParams by the
 * "type" discriminant.
 */
interface VADGateMessage {
type: "vad-gate"; // discriminant checked in the worklet's onmessage handler
open: boolean; // true = pass audio through, false = mute output frames
}
/** Convert a decibel value to its linear amplitude equivalent (0 dB → 1.0). */
function dbToLinear(db: number): number {
  return 10 ** (db / 20);
}
@@ -65,12 +70,19 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
// Exponential smoothing coefficient for background RMS (~200ms time constant)
private rmsCoeff = Math.exp(-1.0 / (0.2 * sampleRate));
// VAD gate state (controlled externally via port message)
private vadGateOpen = true; // starts open until VAD sends its first decision
private logCounter = 0;
public constructor() {
super();
this.port.onmessage = (e: MessageEvent<NoiseGateParams>): void => {
this.updateParams(e.data);
this.port.onmessage = (e: MessageEvent<NoiseGateParams | VADGateMessage>): void => {
if ((e.data as VADGateMessage).type === "vad-gate") {
this.vadGateOpen = (e.data as VADGateMessage).open;
} else {
this.updateParams(e.data as NoiseGateParams);
}
};
this.updateParams({
threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150,
@@ -148,7 +160,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
}
}
const gain = this.gateAttenuation * transientGain;
const gain = this.gateAttenuation * transientGain * (this.vadGateOpen ? 1.0 : 0.0);
for (let c = 0; c < output.length; c++) {
const inCh = input[c] ?? input[0];

View File

@@ -119,6 +119,11 @@ export class NoiseGateTransformer implements AudioTrackProcessor {
this.sendParams();
}
/** Tell the worklet to open or close the VAD-controlled gate. */
public setVADOpen(open: boolean): void {
this.workletNode?.port.postMessage({ type: "vad-gate", open });
}
private sendParams(): void {
if (!this.workletNode) return;
log.debug("sendParams:", this.params);

View File

@@ -0,0 +1,85 @@
/*
Copyright 2026 New Vector Ltd.
SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
Please see LICENSE in the repository root for full details.
*/
import { MicVAD, getDefaultRealTimeVADOptions } from "@ricky0123/vad-web";
import { logger } from "matrix-js-sdk/lib/logger";
const log = logger.getChild("[SileroVADGate]");
const VAD_BASE_PATH = "/vad/";
/**
* Wraps @ricky0123/vad-web's MicVAD to feed speech/silence decisions into the
* NoiseGateTransformer's VAD gate. Instead of creating its own microphone
* stream, it receives the existing LiveKit MediaStream so the VAD sees exactly
* the same audio the worklet processes.
*
* Usage:
* const gate = new SileroVADGate(stream, audioContext);
* gate.onSpeechStart = () => transformer.setVADOpen(true);
* gate.onSpeechEnd = () => transformer.setVADOpen(false);
* await gate.start();
* // later:
* await gate.destroy();
*/
export class SileroVADGate {
  /** Called whenever the VAD transitions from silence to speech. */
  public onSpeechStart: () => void = () => {};
  /** Called whenever the VAD transitions from speech to silence. */
  public onSpeechEnd: () => void = () => {};

  private vad: MicVAD | null = null;
  // Set by destroy(); lets a still-in-flight start() know it must tear the
  // freshly-loaded MicVAD down instead of starting it. Without this, a
  // destroy() racing the async model load would see `vad === null`, do
  // nothing, and leak a running VAD with no owner.
  private destroyed = false;

  private readonly stream: MediaStream;
  private readonly audioContext: AudioContext;

  public constructor(stream: MediaStream, audioContext: AudioContext) {
    this.stream = stream;
    this.audioContext = audioContext;
  }

  /**
   * Load the Silero model and begin classifying frames from the supplied
   * stream. Safe to race with destroy(): if destroy() is called while the
   * model is still loading, the VAD is released as soon as loading finishes
   * and is never started.
   */
  public async start(): Promise<void> {
    this.destroyed = false;
    const stream = this.stream;
    const audioContext = this.audioContext;
    log.info("initialising MicVAD, baseAssetPath:", VAD_BASE_PATH);
    const vad = await MicVAD.new({
      ...getDefaultRealTimeVADOptions("legacy"),
      audioContext,
      baseAssetPath: VAD_BASE_PATH,
      onnxWASMBasePath: VAD_BASE_PATH,
      startOnLoad: false,
      // Provide the existing stream instead of calling getUserMedia
      // eslint-disable-next-line @typescript-eslint/require-await
      getStream: async (): Promise<MediaStream> => stream,
      // eslint-disable-next-line @typescript-eslint/require-await
      pauseStream: async (): Promise<void> => {},
      // eslint-disable-next-line @typescript-eslint/require-await
      resumeStream: async (): Promise<MediaStream> => stream,
      onSpeechStart: (): void => {
        log.debug("speech start");
        this.onSpeechStart();
      },
      onSpeechEnd: (): void => {
        log.debug("speech end");
        this.onSpeechEnd();
      },
      onVADMisfire: (): void => {
        log.debug("VAD misfire");
      },
      onFrameProcessed: (): void => {},
      onSpeechRealStart: (): void => {},
    });
    if (this.destroyed) {
      // destroy() won the race while the model was loading — clean up now.
      await vad.destroy();
      return;
    }
    this.vad = vad;
    await vad.start();
    log.info("MicVAD started");
  }

  /** Stop and release the VAD. Idempotent; safe to call while start() is still loading the model. */
  public async destroy(): Promise<void> {
    this.destroyed = true;
    if (this.vad) {
      await this.vad.destroy();
      this.vad = null;
    }
  }
}

View File

@@ -32,6 +32,7 @@ import {
transientSuppressorEnabled as transientSuppressorEnabledSetting,
transientThreshold as transientThresholdSetting,
transientRelease as transientReleaseSetting,
vadEnabled as vadEnabledSetting,
} from "./settings";
import { PreferencesSettingsTab } from "./PreferencesSettingsTab";
import { Slider } from "../Slider";
@@ -129,6 +130,9 @@ export const SettingsModal: FC<Props> = ({
const [showAdvancedGate, setShowAdvancedGate] = useState(false);
// Voice activity detection
const [vadActive, setVadActive] = useSetting(vadEnabledSetting);
// Transient suppressor settings
const [transientEnabled, setTransientEnabled] = useSetting(transientSuppressorEnabledSetting);
const [transientThreshold, setTransientThreshold] = useSetting(transientThresholdSetting);
@@ -310,6 +314,31 @@ export const SettingsModal: FC<Props> = ({
</>
)}
</div>
<div className={styles.noiseGateSection}>
<Heading
type="body"
weight="semibold"
size="sm"
as="h4"
className={styles.noiseGateHeading}
>
Voice Activity Detection
</Heading>
<Separator className={styles.noiseGateSeparator} />
<FieldRow>
<InputField
id="vadEnabled"
type="checkbox"
label="Enable voice activity detection"
description="Uses the Silero VAD model to mute audio when no speech is detected. Requires the noise gate to be enabled."
checked={vadActive}
onChange={(e: ChangeEvent<HTMLInputElement>): void =>
setVadActive(e.target.checked)
}
disabled={!noiseGateEnabled}
/>
</FieldRow>
</div>
<div className={styles.noiseGateSection}>
<Heading
type="body"

View File

@@ -145,6 +145,8 @@ export const noiseGateHold = new Setting<number>("noise-gate-hold", 200);
// Time in ms for the gate to fully close after hold expires
export const noiseGateRelease = new Setting<number>("noise-gate-release", 150);
// Whether the Silero VAD should gate (mute) audio during detected silence
export const vadEnabled = new Setting<boolean>("vad-enabled", false);
export const transientSuppressorEnabled = new Setting<boolean>(
"transient-suppressor-enabled",
false,

View File

@@ -41,11 +41,13 @@ import {
transientSuppressorEnabled,
transientThreshold,
transientRelease,
vadEnabled,
} from "../../../settings/settings.ts";
import {
type NoiseGateParams,
NoiseGateTransformer,
} from "../../../livekit/NoiseGateTransformer.ts";
import { SileroVADGate } from "../../../livekit/SileroVADGate.ts";
import { observeTrackReference$ } from "../../observeTrackReference";
import { type Connection } from "../remoteMembers/Connection.ts";
import { ObservableScope } from "../../ObservableScope.ts";
@@ -435,6 +437,7 @@ export class Publisher {
let transformer: NoiseGateTransformer | null = null;
let audioCtx: AudioContext | null = null;
let vadGate: SileroVADGate | null = null;
const currentParams = (): NoiseGateParams => ({
threshold: noiseGateThreshold.getValue(),
@@ -446,6 +449,32 @@ export class Publisher {
transientReleaseMs: transientRelease.getValue(),
});
// Tear down any running VAD instance and fail open: with no classifier
// running, the worklet's VAD gate must not be left muted.
const stopVAD = (): void => {
if (vadGate) {
void vadGate.destroy();
vadGate = null;
}
// Reset gate to open so audio flows if VAD is toggled off mid-call
transformer?.setVADOpen(true);
};
// Build a SileroVADGate over the track's underlying MediaStreamTrack and wire
// its speech/silence decisions into the noise-gate worklet. Replaces any
// previously running instance.
const startVAD = (track: LocalAudioTrack, ctx: AudioContext): void => {
stopVAD();
// NOTE(review): LocalAudioTrack in livekit-client appears to expose a public
// `mediaStreamTrack` getter — the `as any` cast may be unnecessary; verify
// against the pinned livekit-client version.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const rawTrack: MediaStreamTrack | undefined = (track as any).mediaStreamTrack;
if (!rawTrack) {
this.logger.warn("[VAD] no underlying MediaStreamTrack — skipping VAD");
return;
}
const stream = new MediaStream([rawTrack]);
vadGate = new SileroVADGate(stream, ctx);
vadGate.onSpeechStart = (): void => transformer?.setVADOpen(true);
vadGate.onSpeechEnd = (): void => transformer?.setVADOpen(false);
// Fire-and-forget: model load is async; failure is logged, not fatal.
vadGate.start().catch((e: unknown) => {
this.logger.error("[VAD] failed to start", e);
});
};
// Attach / detach processor when enabled state or the track changes.
combineLatest([audioTrack$, noiseGateEnabled.value$])
.pipe(scope.bind())
@@ -459,18 +488,20 @@ export class Publisher {
this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(audioTrack as any).setAudioContext(audioCtx);
audioCtx.resume().then(() => {
audioCtx.resume().then(async () => {
this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state);
return audioTrack
// eslint-disable-next-line @typescript-eslint/no-explicit-any
.setProcessor(transformer as any);
}).then(() => {
this.logger.info("[NoiseGate] setProcessor resolved");
if (vadEnabled.getValue() && audioCtx) startVAD(audioTrack, audioCtx);
}).catch((e: unknown) => {
this.logger.error("[NoiseGate] setProcessor failed", e);
});
} else if (!enabled && audioTrack.getProcessor()) {
this.logger.info("[NoiseGate] removing processor");
stopVAD();
void audioTrack.stopProcessor();
void audioCtx?.close();
audioCtx = null;
@@ -482,6 +513,18 @@ export class Publisher {
}
});
// Start/stop VAD when its toggle changes.
// NOTE(review): this silently bails when `audioCtx` is null, i.e. whenever the
// noise gate has not (yet) attached its processor. If this subscription fires
// before the noise-gate one, or the noise gate is disabled, enabling the VAD
// toggle has no effect until the gate/track state next changes — confirm this
// ordering dependency is intended.
combineLatest([audioTrack$, vadEnabled.value$])
.pipe(scope.bind())
.subscribe(([audioTrack, enabled]) => {
if (!audioTrack || !audioCtx) return;
if (enabled) {
startVAD(audioTrack, audioCtx);
} else {
stopVAD();
}
});
// Push param changes to the live worklet without recreating the processor.
combineLatest([
noiseGateThreshold.value$,