feat: replace Silero VAD with TEN-VAD running inside the AudioWorklet
TEN-VAD (official TEN-framework/ten-vad WASM, no npm dependency) replaces
@ricky0123/vad-web. The WASM module is compiled once on the main thread and
passed to the AudioWorklet via processorOptions, where it is instantiated
synchronously and called every 16 ms with no IPC round-trip.
- Add public/vad/ten_vad.{wasm,js} from official upstream lib/Web/
- NoiseGateProcessor: TenVADRuntime class wraps the Emscripten WASM with
minimal import stubs; 3:1 decimation accumulates 256 Int16 samples @
16 kHz per hop; hysteresis controls vadGateOpen directly in-worklet
- NoiseGateTransformer: fetch+compile WASM once (module-level cache),
pass WebAssembly.Module via processorOptions; remove setVADOpen()
- Publisher: remove all SileroVADGate lifecycle (init/start/stop/destroy,
rawMicTrack capture); VAD params folded into single combineLatest;
fix transient suppressor standalone attach (shouldAttach now includes
transientSuppressorEnabled)
- vite.config.ts: remove viteStaticCopy, serveVadAssets plugin, and all
vad-web/onnxruntime copy targets (public/vad/ served automatically)
- Remove @ricky0123/vad-web, onnxruntime-web deps and resolution
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -8,6 +8,9 @@ Please see LICENSE in the repository root for full details.
|
||||
declare const sampleRate: number;
|
||||
declare class AudioWorkletProcessor {
|
||||
public readonly port: MessagePort;
|
||||
public constructor(options?: {
|
||||
processorOptions?: Record<string, unknown>;
|
||||
});
|
||||
public process(
|
||||
inputs: Float32Array[][],
|
||||
outputs: Float32Array[][],
|
||||
@@ -29,6 +32,10 @@ interface NoiseGateParams {
|
||||
transientEnabled: boolean;
|
||||
transientThresholdDb: number; // dB above background RMS that triggers suppression
|
||||
transientReleaseMs: number; // how quickly suppression fades after transient ends
|
||||
// TEN-VAD params
|
||||
vadEnabled: boolean;
|
||||
vadPositiveThreshold: number; // open gate when prob >= this (0–1)
|
||||
vadNegativeThreshold: number; // close gate when prob < this (0–1)
|
||||
}
|
||||
|
||||
interface VADGateMessage {
|
||||
@@ -41,16 +48,142 @@ function dbToLinear(db: number): number {
|
||||
}
|
||||
|
||||
/**
|
||||
* AudioWorkletProcessor implementing a noise gate and an optional transient
|
||||
* suppressor, both running per-sample in a single pass.
|
||||
* Thin synchronous wrapper around the TEN-VAD Emscripten WASM module.
|
||||
* Instantiated synchronously in the AudioWorklet constructor from a
|
||||
* pre-compiled WebAssembly.Module passed via processorOptions.
|
||||
*/
|
||||
/**
 * Synchronous wrapper around the TEN-VAD Emscripten WASM build.
 *
 * The wrapper instantiates an already-compiled `WebAssembly.Module` with
 * `new WebAssembly.Instance(...)`, so construction never awaits. It owns
 * three heap allocations (audio hop buffer, probability out-param, flag
 * out-param) plus the native VAD handle, all released by {@link destroy}.
 *
 * Typed-array views over the WASM heap are cached and only rebuilt when the
 * heap's ArrayBuffer identity changes, so the steady-state `process()` path
 * performs no allocations.
 */
class TenVADRuntime {
  private readonly mem: WebAssembly.Memory;
  private readonly freeFn: (ptr: number) => void;
  private readonly processFn: (
    handle: number,
    audioPtr: number,
    hopSize: number,
    probPtr: number,
    flagPtr: number,
  ) => number;
  private readonly destroyFn: (handle: number) => number;
  private readonly handle: number;
  private readonly audioBufPtr: number;
  private readonly probPtr: number;
  private readonly flagPtr: number;
  public readonly hopSize: number;

  // Cached heap views; replaced by syncViews() when the heap buffer changes.
  private heapI16: Int16Array;
  private heapF32: Float32Array;
  // Set by destroy() so a second call cannot double-free native memory.
  private destroyed = false;

  /**
   * @param module - Pre-compiled TEN-VAD WASM module.
   * @param hopSize - Number of Int16 samples per processed hop; must be a
   *   positive integer.
   * @param threshold - Threshold forwarded to `ten_vad_create`.
   * @throws RangeError when `hopSize` is not a positive integer.
   * @throws Error when a heap allocation fails or `ten_vad_create` reports
   *   an error. Allocations made so far are released before rethrowing.
   */
  public constructor(
    module: WebAssembly.Module,
    hopSize: number,
    threshold: number,
  ) {
    if (!Number.isInteger(hopSize) || hopSize <= 0) {
      throw new RangeError(`ten_vad: hopSize must be a positive integer, got ${hopSize}`);
    }
    this.hopSize = hopSize;

    // The import callbacks need the exported memory, which only exists after
    // instantiation. They read it through this late-bound holder and treat
    // "not yet bound" as a no-op / failure.
    const late: { memory: WebAssembly.Memory | null } = { memory: null };

    // The import names are the minified identifiers of the Emscripten glue;
    // the role labels below follow the original author's mapping.
    const imports: WebAssembly.Imports = {
      a: {
        // abort
        a: (): never => {
          throw new Error("ten_vad abort");
        },
        // fd_write / proc_exit stub
        b: (): number => 0,
        // emscripten_resize_heap: returns 1 on success, 0 on failure
        c: (reqBytes: number): number => {
          const memory = late.memory;
          if (memory === null) return 0;
          // The size arrives as an i32; reinterpret it as unsigned.
          const wanted = reqBytes >>> 0;
          const current = memory.buffer.byteLength;
          if (current >= wanted) return 1;
          try {
            memory.grow(Math.ceil((wanted - current) / 65536));
            return 1;
          } catch {
            return 0;
          }
        },
        // fd_write stub
        d: (): number => 0,
        // environ stub
        e: (): number => 0,
        // memmove
        f: (dest: number, src: number, len: number): void => {
          const memory = late.memory;
          if (memory === null) return;
          const from = src >>> 0;
          new Uint8Array(memory.buffer).copyWithin(dest >>> 0, from, from + (len >>> 0));
        },
      },
    };

    // Synchronous instantiation from the pre-compiled module.
    const instance = new WebAssembly.Instance(module, imports);
    const asm = instance.exports as {
      g: WebAssembly.Memory; // exported memory
      h: () => void; // __wasm_call_ctors
      i: (n: number) => number; // malloc
      j: (p: number) => void; // free
      k: (handlePtr: number, hopSize: number, threshold: number) => number; // ten_vad_create
      l: (handle: number, audioPtr: number, hopSize: number, probPtr: number, flagPtr: number) => number; // ten_vad_process
      m: (handle: number) => number; // ten_vad_destroy
    };

    late.memory = asm.g;
    this.mem = asm.g;
    this.freeFn = asm.j;
    this.processFn = asm.l;
    this.destroyFn = asm.m;

    // Run Emscripten static constructors before touching the allocator.
    asm.h();

    // Every successful allocation is recorded so a later failure can unwind.
    const owned: number[] = [];
    const alloc = (bytes: number, what: string): number => {
      const ptr = asm.i(bytes) >>> 0;
      if (ptr === 0) {
        throw new Error(`ten_vad: malloc(${bytes}) failed for ${what}`);
      }
      owned.push(ptr);
      return ptr;
    };

    let audioBufPtr: number;
    let probPtr: number;
    let flagPtr: number;
    let handle: number;
    try {
      audioBufPtr = alloc(hopSize * Int16Array.BYTES_PER_ELEMENT, "audio buffer");
      probPtr = alloc(4, "probability out-param"); // float
      flagPtr = alloc(4, "flag out-param"); // int

      // ten_vad_create(void** handle, int hopSize, float threshold)
      const handleSlot = alloc(4, "handle slot");
      const status = asm.k(handleSlot, hopSize, threshold);
      if (status !== 0) throw new Error(`ten_vad_create failed: ${status}`);

      // Read the pointer as unsigned, and only after the call: the call may
      // have grown the heap and replaced the underlying buffer.
      handle = new Uint32Array(asm.g.buffer)[handleSlot >>> 2];
      if (handle === 0) throw new Error("ten_vad_create returned a null handle");

      // The slot was only needed to receive the handle.
      asm.j(handleSlot);
      owned.pop();
    } catch (err) {
      for (const ptr of owned) asm.j(ptr);
      throw err;
    }

    this.audioBufPtr = audioBufPtr;
    this.probPtr = probPtr;
    this.flagPtr = flagPtr;
    this.handle = handle;
    this.heapI16 = new Int16Array(asm.g.buffer);
    this.heapF32 = new Float32Array(asm.g.buffer);
  }

  /** Rebuild the cached heap views if the heap buffer has been replaced. */
  private syncViews(): void {
    const current = this.mem.buffer;
    if (this.heapI16.buffer !== current) {
      this.heapI16 = new Int16Array(current);
      this.heapF32 = new Float32Array(current);
    }
  }

  /**
   * Process one hop of Int16 audio. Returns speech probability [0–1].
   *
   * At most `hopSize` samples are copied into the native buffer; a shorter
   * input is zero-padded so the model never sees stale samples.
   *
   * Fails open: if the runtime has been destroyed or `ten_vad_process`
   * reports an error, 1.0 is returned so a broken detector cannot silence
   * the speaker. This method never throws for those cases.
   */
  public process(samples: Int16Array): number {
    if (this.destroyed) return 1.0;

    this.syncViews();
    const base = this.audioBufPtr >>> 1;
    if (samples.length === this.hopSize) {
      this.heapI16.set(samples, base);
    } else {
      const count = Math.min(samples.length, this.hopSize);
      this.heapI16.set(samples.subarray(0, count), base);
      this.heapI16.fill(0, base + count, base + this.hopSize);
    }

    const status = this.processFn(
      this.handle,
      this.audioBufPtr,
      this.hopSize,
      this.probPtr,
      this.flagPtr,
    );
    if (status !== 0) return 1.0;

    // The native call may have grown the heap; re-validate before reading.
    this.syncViews();
    return this.heapF32[this.probPtr >>> 2];
  }

  /** Release the native handle and heap buffers. Safe to call repeatedly. */
  public destroy(): void {
    if (this.destroyed) return;
    this.destroyed = true;
    this.destroyFn(this.handle);
    this.freeFn(this.audioBufPtr);
    this.freeFn(this.probPtr);
    this.freeFn(this.flagPtr);
  }
}
|
||||
|
||||
/**
|
||||
* AudioWorkletProcessor implementing a noise gate, an optional transient
|
||||
* suppressor, and an optional in-worklet TEN-VAD gate — all running
|
||||
* per-sample in a single pass.
|
||||
*
|
||||
* Noise gate: opens when instantaneous peak exceeds threshold, closes below.
|
||||
* Attack, hold, and release times smooth the attenuation envelope.
|
||||
*
|
||||
* Transient suppressor: tracks a slow-moving RMS background level. When the
|
||||
* instantaneous peak exceeds the background by more than transientThresholdDb,
|
||||
* gain is instantly cut to 0 and releases over transientReleaseMs. This catches
|
||||
* desk hits, mic bumps, and other sudden loud impacts without affecting speech.
|
||||
* gain is instantly cut to 0 and releases over transientReleaseMs.
|
||||
*
|
||||
* TEN-VAD gate: accumulates audio with 3:1 decimation (48 kHz → 16 kHz),
|
||||
* runs the TEN-VAD model synchronously every 256 samples (16 ms), and
|
||||
* controls vadGateOpen with hysteresis. No IPC round-trip required.
|
||||
*/
|
||||
class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
// Noise gate state
|
||||
@@ -65,35 +198,84 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
|
||||
// Transient suppressor state
|
||||
private transientEnabled = false;
|
||||
private transientRatio = dbToLinear(15); // peak must exceed rms by this factor
|
||||
private transientRatio = dbToLinear(15);
|
||||
private transientReleaseRate = 1.0 / (0.08 * sampleRate);
|
||||
private transientAttenuation = 1.0; // 1 = fully open, ramps to 0 on transient
|
||||
private transientAttenuation = 1.0;
|
||||
private slowRms = 0;
|
||||
// Exponential smoothing coefficient for background RMS (~200ms time constant)
|
||||
private rmsCoeff = Math.exp(-1.0 / (0.2 * sampleRate));
|
||||
|
||||
// VAD gate state (controlled externally via port message)
|
||||
private vadGateOpen = true; // starts open until VAD sends its first decision
|
||||
// Smooth ramp so the VAD gate fades rather than cutting instantly (~20ms)
|
||||
// VAD gate state
|
||||
private vadGateOpen = true; // starts open; TEN-VAD closes it on first silent frame
|
||||
private vadAttenuation = 1.0;
|
||||
private readonly vadRampRate = 1.0 / (0.02 * sampleRate);
|
||||
|
||||
// TEN-VAD state
|
||||
private vadEnabled = false;
|
||||
private vadPositiveThreshold = 0.5;
|
||||
private vadNegativeThreshold = 0.3;
|
||||
private tenVadRuntime: TenVADRuntime | null = null;
|
||||
// 3:1 decimation from AudioContext sample rate to 16 kHz
|
||||
private readonly decRatio = Math.max(1, Math.round(sampleRate / 16000));
|
||||
private decPhase = 0;
|
||||
private decAcc = 0;
|
||||
private readonly vadHopBuf = new Int16Array(256);
|
||||
private vadHopCount = 0;
|
||||
|
||||
private logCounter = 0;
|
||||
|
||||
public constructor() {
|
||||
super();
|
||||
this.port.onmessage = (e: MessageEvent<NoiseGateParams | VADGateMessage>): void => {
|
||||
public constructor(options?: {
|
||||
processorOptions?: Record<string, unknown>;
|
||||
}) {
|
||||
super(options);
|
||||
|
||||
// Try to instantiate TEN-VAD from the pre-compiled module passed by the main thread
|
||||
const tenVadModule = options?.processorOptions?.tenVadModule as
|
||||
| WebAssembly.Module
|
||||
| undefined;
|
||||
if (tenVadModule) {
|
||||
try {
|
||||
// hopSize = 256 samples @ 16 kHz = 16 ms; threshold = 0.5 (overridden via params)
|
||||
this.tenVadRuntime = new TenVADRuntime(tenVadModule, 256, 0.5);
|
||||
this.port.postMessage({
|
||||
type: "log",
|
||||
msg: "[NoiseGate worklet] TEN-VAD runtime initialized, decRatio=" + this.decRatio,
|
||||
});
|
||||
} catch (e) {
|
||||
this.port.postMessage({
|
||||
type: "log",
|
||||
msg: "[NoiseGate worklet] TEN-VAD init failed: " + String(e),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
this.port.onmessage = (
|
||||
e: MessageEvent<NoiseGateParams | VADGateMessage>,
|
||||
): void => {
|
||||
if ((e.data as VADGateMessage).type === "vad-gate") {
|
||||
this.vadGateOpen = (e.data as VADGateMessage).open;
|
||||
} else {
|
||||
this.updateParams(e.data as NoiseGateParams);
|
||||
}
|
||||
};
|
||||
|
||||
this.updateParams({
|
||||
noiseGateActive: true, threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150,
|
||||
transientEnabled: false, transientThresholdDb: 15, transientReleaseMs: 80,
|
||||
noiseGateActive: true,
|
||||
threshold: -60,
|
||||
attackMs: 25,
|
||||
holdMs: 200,
|
||||
releaseMs: 150,
|
||||
transientEnabled: false,
|
||||
transientThresholdDb: 15,
|
||||
transientReleaseMs: 80,
|
||||
vadEnabled: false,
|
||||
vadPositiveThreshold: 0.5,
|
||||
vadNegativeThreshold: 0.3,
|
||||
});
|
||||
|
||||
this.port.postMessage({
|
||||
type: "log",
|
||||
msg: "[NoiseGate worklet] constructor called, sampleRate=" + sampleRate,
|
||||
});
|
||||
this.port.postMessage({ type: "log", msg: "[NoiseGate worklet] constructor called, sampleRate=" + sampleRate });
|
||||
}
|
||||
|
||||
private updateParams(p: NoiseGateParams): void {
|
||||
@@ -105,11 +287,17 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
this.transientEnabled = p.transientEnabled;
|
||||
this.transientRatio = dbToLinear(p.transientThresholdDb);
|
||||
this.transientReleaseRate = 1.0 / ((p.transientReleaseMs / 1000) * sampleRate);
|
||||
this.vadEnabled = p.vadEnabled ?? false;
|
||||
this.vadPositiveThreshold = p.vadPositiveThreshold ?? 0.5;
|
||||
this.vadNegativeThreshold = p.vadNegativeThreshold ?? 0.3;
|
||||
// When VAD is disabled, open the gate immediately
|
||||
if (!this.vadEnabled) this.vadGateOpen = true;
|
||||
this.port.postMessage({
|
||||
type: "log",
|
||||
msg: "[NoiseGate worklet] params updated: threshold=" + p.threshold
|
||||
+ " transientEnabled=" + p.transientEnabled
|
||||
+ " transientThresholdDb=" + p.transientThresholdDb,
|
||||
+ " vadEnabled=" + p.vadEnabled
|
||||
+ " vadPos=" + p.vadPositiveThreshold
|
||||
+ " vadNeg=" + p.vadNegativeThreshold,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -132,19 +320,18 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
// --- Transient suppressor ---
|
||||
let transientGain = 1.0;
|
||||
if (this.transientEnabled) {
|
||||
// Update slow RMS background (exponential moving average of energy)
|
||||
this.slowRms = Math.sqrt(
|
||||
this.rmsCoeff * this.slowRms * this.slowRms +
|
||||
(1.0 - this.rmsCoeff) * curLevel * curLevel,
|
||||
);
|
||||
|
||||
const background = Math.max(this.slowRms, 1e-6);
|
||||
if (curLevel > background * this.transientRatio) {
|
||||
// Transient detected — instantly cut gain
|
||||
this.transientAttenuation = 0.0;
|
||||
} else {
|
||||
// Release: ramp back toward 1
|
||||
this.transientAttenuation = Math.min(1.0, this.transientAttenuation + this.transientReleaseRate);
|
||||
this.transientAttenuation = Math.min(
|
||||
1.0,
|
||||
this.transientAttenuation + this.transientReleaseRate,
|
||||
);
|
||||
}
|
||||
transientGain = this.transientAttenuation;
|
||||
}
|
||||
@@ -159,23 +346,66 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
this.isOpen = false;
|
||||
}
|
||||
if (this.isOpen) {
|
||||
this.gateAttenuation = Math.min(1.0, this.gateAttenuation + this.attackRate);
|
||||
this.gateAttenuation = Math.min(
|
||||
1.0,
|
||||
this.gateAttenuation + this.attackRate,
|
||||
);
|
||||
} else {
|
||||
this.heldTime += samplePeriod;
|
||||
if (this.heldTime > this.holdTime) {
|
||||
this.gateAttenuation = Math.max(0.0, this.gateAttenuation - this.releaseRate);
|
||||
this.gateAttenuation = Math.max(
|
||||
0.0,
|
||||
this.gateAttenuation - this.releaseRate,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
this.gateAttenuation = 1.0;
|
||||
}
|
||||
|
||||
// Ramp VAD attenuation toward target to avoid clicks on gate open/close
|
||||
// --- TEN-VAD in-worklet processing ---
|
||||
// Accumulate raw mono samples with decRatio:1 decimation (48 kHz → 16 kHz).
|
||||
// Every 256 output samples (16 ms) run the WASM VAD and update vadGateOpen.
|
||||
if (this.vadEnabled && this.tenVadRuntime !== null) {
|
||||
this.decAcc += input[0]?.[i] ?? 0;
|
||||
this.decPhase++;
|
||||
if (this.decPhase >= this.decRatio) {
|
||||
this.decPhase = 0;
|
||||
const avg = this.decAcc / this.decRatio;
|
||||
this.decAcc = 0;
|
||||
// Float32 [-1,1] → Int16 with clamping
|
||||
const s16 =
|
||||
avg >= 1.0
|
||||
? 32767
|
||||
: avg <= -1.0
|
||||
? -32768
|
||||
: (avg * 32767 + 0.5) | 0;
|
||||
this.vadHopBuf[this.vadHopCount++] = s16;
|
||||
|
||||
if (this.vadHopCount >= 256) {
|
||||
this.vadHopCount = 0;
|
||||
const prob = this.tenVadRuntime.process(this.vadHopBuf);
|
||||
if (!this.vadGateOpen && prob >= this.vadPositiveThreshold) {
|
||||
this.vadGateOpen = true;
|
||||
} else if (this.vadGateOpen && prob < this.vadNegativeThreshold) {
|
||||
this.vadGateOpen = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Ramp VAD attenuation toward target to avoid clicks
|
||||
const vadTarget = this.vadGateOpen ? 1.0 : 0.0;
|
||||
if (this.vadAttenuation < vadTarget) {
|
||||
this.vadAttenuation = Math.min(vadTarget, this.vadAttenuation + this.vadRampRate);
|
||||
this.vadAttenuation = Math.min(
|
||||
vadTarget,
|
||||
this.vadAttenuation + this.vadRampRate,
|
||||
);
|
||||
} else if (this.vadAttenuation > vadTarget) {
|
||||
this.vadAttenuation = Math.max(vadTarget, this.vadAttenuation - this.vadRampRate);
|
||||
this.vadAttenuation = Math.max(
|
||||
vadTarget,
|
||||
this.vadAttenuation - this.vadRampRate,
|
||||
);
|
||||
}
|
||||
|
||||
const gain = this.gateAttenuation * transientGain * this.vadAttenuation;
|
||||
@@ -196,7 +426,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
msg: "[NoiseGate worklet] gateOpen=" + this.isOpen
|
||||
+ " gateAtten=" + this.gateAttenuation.toFixed(3)
|
||||
+ " transientAtten=" + this.transientAttenuation.toFixed(3)
|
||||
+ " slowRms=" + this.slowRms.toFixed(5),
|
||||
+ " vadOpen=" + this.vadGateOpen
|
||||
+ " vadAtten=" + this.vadAttenuation.toFixed(3),
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,10 @@ export interface NoiseGateParams {
|
||||
transientEnabled: boolean;
|
||||
transientThresholdDb: number; // dB above background RMS that triggers suppression
|
||||
transientReleaseMs: number; // ms for suppression to fade after transient ends
|
||||
// TEN-VAD params — processed entirely inside the AudioWorklet
|
||||
vadEnabled: boolean;
|
||||
vadPositiveThreshold: number; // open gate when isSpeech prob >= this (0–1)
|
||||
vadNegativeThreshold: number; // close gate when isSpeech prob < this (0–1)
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -43,13 +47,36 @@ export interface AudioTrackProcessor {
|
||||
destroy(): Promise<void>;
|
||||
}
|
||||
|
||||
// Cached compiled TEN-VAD module — compiled once, reused across processor restarts.
// Holds the in-flight or settled promise; reset to null when loading fails.
let tenVadModulePromise: Promise<WebAssembly.Module> | null = null;

/**
 * Fetch and compile the TEN-VAD WASM binary, memoising the resulting promise.
 *
 * Concurrent and later callers share one fetch/compile. When the server
 * labels the response `application/wasm` and the runtime provides
 * `WebAssembly.compileStreaming`, the module is compiled while it downloads;
 * otherwise the body is buffered and compiled with `WebAssembly.compile`.
 *
 * @returns A promise for the compiled module.
 * @throws (as a rejection) when the HTTP status is not OK or compilation
 *   fails. The cache is cleared first, so the next call retries.
 */
function getTenVADModule(): Promise<WebAssembly.Module> {
  if (tenVadModulePromise !== null) return tenVadModulePromise;

  tenVadModulePromise = fetch("/vad/ten_vad.wasm")
    .then((response) => {
      if (!response.ok) {
        throw new Error(`Failed to fetch ten_vad.wasm: ${response.status}`);
      }
      // compileStreaming rejects any other MIME type, so only take the
      // streaming path when the header matches exactly.
      const mime = response.headers.get("Content-Type")?.trim().toLowerCase();
      if (
        mime === "application/wasm" &&
        typeof WebAssembly.compileStreaming === "function"
      ) {
        return WebAssembly.compileStreaming(response);
      }
      return response.arrayBuffer().then((bytes) => WebAssembly.compile(bytes));
    })
    .catch((e: unknown) => {
      // Clear the cache so a retry is possible on next attach
      tenVadModulePromise = null;
      throw e;
    });

  return tenVadModulePromise;
}
|
||||
|
||||
/**
|
||||
* LiveKit audio track processor that applies the OBS-style noise gate via
|
||||
* AudioWorklet.
|
||||
* LiveKit audio track processor that applies a noise gate, optional transient
|
||||
* suppressor, and optional TEN-VAD gate via AudioWorklet.
|
||||
*
|
||||
* Builds the audio graph: sourceNode → workletNode → destinationNode, then
|
||||
* exposes destinationNode's track as processedTrack for LiveKit to swap into
|
||||
* the WebRTC sender via sender.replaceTrack(processedTrack).
|
||||
* The TEN-VAD WASM module is fetched once, compiled, and passed to the worklet
|
||||
* via processorOptions so it runs synchronously inside the audio thread —
|
||||
* no IPC round-trip, ~16 ms VAD latency.
|
||||
*
|
||||
* Audio graph: sourceNode → workletNode → destinationNode
|
||||
* processedTrack is destinationNode.stream.getAudioTracks()[0]
|
||||
*/
|
||||
export class NoiseGateTransformer implements AudioTrackProcessor {
|
||||
public readonly name = "noise-gate";
|
||||
@@ -69,6 +96,15 @@ export class NoiseGateTransformer implements AudioTrackProcessor {
|
||||
|
||||
log.info("init() called, audioContext state:", audioContext.state, "params:", this.params);
|
||||
|
||||
// Fetch and compile the TEN-VAD WASM module (cached after first call)
|
||||
let tenVadModule: WebAssembly.Module | undefined;
|
||||
try {
|
||||
tenVadModule = await getTenVADModule();
|
||||
log.info("TEN-VAD WASM module compiled");
|
||||
} catch (e) {
|
||||
log.warn("TEN-VAD WASM module unavailable — VAD disabled:", e);
|
||||
}
|
||||
|
||||
const workletUrl = new URL(
|
||||
"./NoiseGateProcessor.worklet.ts",
|
||||
import.meta.url,
|
||||
@@ -80,8 +116,15 @@ export class NoiseGateTransformer implements AudioTrackProcessor {
|
||||
this.workletNode = new AudioWorkletNode(
|
||||
audioContext,
|
||||
"noise-gate-processor",
|
||||
{
|
||||
processorOptions: {
|
||||
tenVadModule,
|
||||
},
|
||||
},
|
||||
);
|
||||
this.workletNode.port.onmessage = (e: MessageEvent<{ type: string; msg: string }>): void => {
|
||||
this.workletNode.port.onmessage = (
|
||||
e: MessageEvent<{ type: string; msg: string }>,
|
||||
): void => {
|
||||
if (e.data?.type === "log") log.debug(e.data.msg);
|
||||
};
|
||||
this.sendParams();
|
||||
@@ -114,17 +157,12 @@ export class NoiseGateTransformer implements AudioTrackProcessor {
|
||||
this.processedTrack = undefined;
|
||||
}
|
||||
|
||||
/** Push updated gate parameters to the running worklet. */
|
||||
/** Push updated gate/VAD parameters to the running worklet. */
|
||||
public updateParams(params: NoiseGateParams): void {
|
||||
this.params = { ...params };
|
||||
this.sendParams();
|
||||
}
|
||||
|
||||
/** Tell the worklet to open or close the VAD-controlled gate. */
|
||||
public setVADOpen(open: boolean): void {
|
||||
this.workletNode?.port.postMessage({ type: "vad-gate", open });
|
||||
}
|
||||
|
||||
private sendParams(): void {
|
||||
if (!this.workletNode) return;
|
||||
log.debug("sendParams:", this.params);
|
||||
|
||||
@@ -1,128 +0,0 @@
|
||||
/*
|
||||
Copyright 2026 New Vector Ltd.
|
||||
|
||||
SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
|
||||
Please see LICENSE in the repository root for full details.
|
||||
*/
|
||||
|
||||
import { MicVAD, getDefaultRealTimeVADOptions } from "@ricky0123/vad-web";
|
||||
// ort is not re-exported from the vad-web index; import from the submodule
|
||||
import { ort } from "@ricky0123/vad-web/dist/real-time-vad.js";
|
||||
import { logger } from "matrix-js-sdk/lib/logger";
|
||||
|
||||
const log = logger.getChild("[SileroVADGate]");
|
||||
|
||||
const VAD_BASE_PATH = "/vad/";
|
||||
|
||||
export interface SileroVADGateOptions {
|
||||
positiveThreshold: number; // open gate when isSpeech >= this (0–1)
|
||||
negativeThreshold: number; // close gate when isSpeech < this (0–1)
|
||||
}
|
||||
|
||||
/**
|
||||
* Wraps @ricky0123/vad-web's MicVAD with a two-phase lifecycle:
|
||||
*
|
||||
* init(audioContext) — loads the ONNX model and ORT WASM (expensive,
|
||||
* call as early as possible for zero-latency enable)
|
||||
* start(stream) — wires the stream and begins per-frame processing
|
||||
* stop() — pauses processing, keeps model loaded
|
||||
* destroy() — full teardown
|
||||
*
|
||||
* Uses onFrameProcessed (fires every ~32ms with v5 model) with hysteresis
|
||||
* to control the gate. Starts OPEN so audio flows immediately; the model
|
||||
* closes it on the first silent frame.
|
||||
*/
|
||||
export class SileroVADGate {
|
||||
public onOpen: () => void = () => {};
|
||||
public onClose: () => void = () => {};
|
||||
|
||||
private vad: MicVAD | null = null;
|
||||
private activeStream: MediaStream | null = null;
|
||||
private options: SileroVADGateOptions;
|
||||
private gateOpen = true;
|
||||
|
||||
public constructor(options: SileroVADGateOptions) {
|
||||
this.options = options;
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 1 — load the model. Call this as early as possible (e.g. when the
|
||||
* AudioContext is first created) so start() is near-instant later.
|
||||
*/
|
||||
public async init(audioContext: AudioContext): Promise<void> {
|
||||
// Avoid requiring SharedArrayBuffer (COOP/COEP headers) by running
|
||||
// single-threaded. Performance is sufficient for 16kHz speech frames.
|
||||
ort.env.wasm.numThreads = 1;
|
||||
|
||||
log.info("pre-warming MicVAD model");
|
||||
|
||||
this.vad = await MicVAD.new({
|
||||
...getDefaultRealTimeVADOptions("v5"),
|
||||
audioContext,
|
||||
baseAssetPath: VAD_BASE_PATH,
|
||||
onnxWASMBasePath: VAD_BASE_PATH,
|
||||
startOnLoad: false,
|
||||
// Stream is provided via activeStream at start() time
|
||||
// eslint-disable-next-line @typescript-eslint/require-await
|
||||
getStream: async (): Promise<MediaStream> => {
|
||||
if (!this.activeStream) throw new Error("[VAD] stream not set — call start() first");
|
||||
return this.activeStream;
|
||||
},
|
||||
// eslint-disable-next-line @typescript-eslint/require-await
|
||||
pauseStream: async (): Promise<void> => {},
|
||||
// eslint-disable-next-line @typescript-eslint/require-await
|
||||
resumeStream: async (): Promise<MediaStream> => {
|
||||
if (!this.activeStream) throw new Error("[VAD] stream not set");
|
||||
return this.activeStream;
|
||||
},
|
||||
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
|
||||
const p = probabilities.isSpeech;
|
||||
if (!this.gateOpen && p >= this.options.positiveThreshold) {
|
||||
this.gateOpen = true;
|
||||
log.debug("gate open (isSpeech=", p, ")");
|
||||
this.onOpen();
|
||||
} else if (this.gateOpen && p < this.options.negativeThreshold) {
|
||||
this.gateOpen = false;
|
||||
log.debug("gate close (isSpeech=", p, ")");
|
||||
this.onClose();
|
||||
}
|
||||
},
|
||||
onSpeechStart: (): void => {},
|
||||
onSpeechEnd: (): void => {},
|
||||
onVADMisfire: (): void => {},
|
||||
onSpeechRealStart: (): void => {},
|
||||
});
|
||||
|
||||
log.info("MicVAD model loaded");
|
||||
}
|
||||
|
||||
/**
|
||||
* Phase 2 — wire the raw mic stream and begin classifying frames.
|
||||
* init() must have completed first.
|
||||
*/
|
||||
public async start(stream: MediaStream): Promise<void> {
|
||||
if (!this.vad) throw new Error("[VAD] call init() before start()");
|
||||
this.activeStream = stream;
|
||||
this.gateOpen = true; // start open — first silent frame will close it
|
||||
await this.vad.start();
|
||||
log.info("MicVAD started");
|
||||
}
|
||||
|
||||
/** Pause frame processing without destroying the model. */
|
||||
public async stop(): Promise<void> {
|
||||
if (this.vad) await this.vad.pause();
|
||||
this.activeStream = null;
|
||||
}
|
||||
|
||||
public updateOptions(options: SileroVADGateOptions): void {
|
||||
this.options = options;
|
||||
}
|
||||
|
||||
public async destroy(): Promise<void> {
|
||||
if (this.vad) {
|
||||
await this.vad.destroy();
|
||||
this.vad = null;
|
||||
}
|
||||
this.activeStream = null;
|
||||
}
|
||||
}
|
||||
@@ -336,7 +336,7 @@ export const SettingsModal: FC<Props> = ({
|
||||
id="vadEnabled"
|
||||
type="checkbox"
|
||||
label="Enable voice activity detection"
|
||||
description="Uses the Silero VAD model to mute audio when no speech is detected."
|
||||
description="Uses TEN-VAD to mute audio when no speech is detected (~16 ms latency)."
|
||||
checked={vadActive}
|
||||
onChange={(e: ChangeEvent<HTMLInputElement>): void =>
|
||||
setVadActive(e.target.checked)
|
||||
|
||||
@@ -49,7 +49,6 @@ import {
|
||||
type NoiseGateParams,
|
||||
NoiseGateTransformer,
|
||||
} from "../../../livekit/NoiseGateTransformer.ts";
|
||||
import { SileroVADGate } from "../../../livekit/SileroVADGate.ts";
|
||||
import { observeTrackReference$ } from "../../observeTrackReference";
|
||||
import { type Connection } from "../remoteMembers/Connection.ts";
|
||||
import { ObservableScope } from "../../ObservableScope.ts";
|
||||
@@ -439,12 +438,6 @@ export class Publisher {
|
||||
|
||||
let transformer: NoiseGateTransformer | null = null;
|
||||
let audioCtx: AudioContext | null = null;
|
||||
// Single VAD gate instance — persists across start/stop to keep model warm
|
||||
let vadGate: SileroVADGate | null = new SileroVADGate({
|
||||
positiveThreshold: vadPositiveThreshold.getValue(),
|
||||
negativeThreshold: vadNegativeThreshold.getValue(),
|
||||
});
|
||||
let rawMicTrack: MediaStreamTrack | null = null;
|
||||
|
||||
const currentParams = (): NoiseGateParams => ({
|
||||
noiseGateActive: noiseGateEnabled.getValue(),
|
||||
@@ -455,98 +448,55 @@ export class Publisher {
|
||||
transientEnabled: transientSuppressorEnabled.getValue(),
|
||||
transientThresholdDb: transientThreshold.getValue(),
|
||||
transientReleaseMs: transientRelease.getValue(),
|
||||
vadEnabled: vadEnabled.getValue(),
|
||||
vadPositiveThreshold: vadPositiveThreshold.getValue(),
|
||||
vadNegativeThreshold: vadNegativeThreshold.getValue(),
|
||||
});
|
||||
|
||||
const stopVAD = (): void => {
|
||||
if (vadGate) {
|
||||
void vadGate.stop();
|
||||
}
|
||||
// Always reopen gate when VAD stops so audio flows without VAD
|
||||
transformer?.setVADOpen(true);
|
||||
};
|
||||
|
||||
const startVAD = (rawTrack: MediaStreamTrack): void => {
|
||||
if (!vadGate) return;
|
||||
const stream = new MediaStream([rawTrack]);
|
||||
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
|
||||
vadGate.onClose = (): void => transformer?.setVADOpen(false);
|
||||
vadGate.start(stream).catch((e: unknown) => {
|
||||
this.logger.error("[VAD] failed to start", e);
|
||||
});
|
||||
};
|
||||
|
||||
// Attach / detach processor when noise gate or VAD enabled state or the track changes.
|
||||
combineLatest([audioTrack$, noiseGateEnabled.value$, vadEnabled.value$])
|
||||
// Attach / detach processor when any processing feature changes or the track changes.
|
||||
combineLatest([audioTrack$, noiseGateEnabled.value$, vadEnabled.value$, transientSuppressorEnabled.value$])
|
||||
.pipe(scope.bind())
|
||||
.subscribe(([audioTrack, ngEnabled, vadActive]) => {
|
||||
.subscribe(([audioTrack, ngEnabled, vadActive, transientActive]) => {
|
||||
if (!audioTrack) return;
|
||||
const shouldAttach = ngEnabled || vadActive;
|
||||
const shouldAttach = ngEnabled || vadActive || transientActive;
|
||||
if (shouldAttach && !audioTrack.getProcessor()) {
|
||||
const params = currentParams();
|
||||
this.logger.info("[NoiseGate] attaching processor, params:", params);
|
||||
// Capture the raw mic track BEFORE setProcessor replaces it
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
rawMicTrack = (audioTrack as any).mediaStreamTrack ?? null;
|
||||
transformer = new NoiseGateTransformer(params);
|
||||
audioCtx = new AudioContext();
|
||||
this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(audioTrack as any).setAudioContext(audioCtx);
|
||||
// Pre-warm VAD model as soon as AudioContext is created
|
||||
if (vadGate && audioCtx) {
|
||||
vadGate.init(audioCtx).catch((e: unknown) => {
|
||||
this.logger.error("[VAD] failed to pre-warm model", e);
|
||||
});
|
||||
}
|
||||
audioCtx.resume().then(async () => {
|
||||
this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state);
|
||||
return audioTrack
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
.setProcessor(transformer as any);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
return audioTrack.setProcessor(transformer as any);
|
||||
}).then(() => {
|
||||
this.logger.info("[NoiseGate] setProcessor resolved");
|
||||
if (vadActive && rawMicTrack) startVAD(rawMicTrack);
|
||||
}).catch((e: unknown) => {
|
||||
this.logger.error("[NoiseGate] setProcessor failed", e);
|
||||
});
|
||||
} else if (!shouldAttach && audioTrack.getProcessor()) {
|
||||
this.logger.info("[NoiseGate] removing processor");
|
||||
stopVAD();
|
||||
void audioTrack.stopProcessor();
|
||||
void audioCtx?.close();
|
||||
audioCtx = null;
|
||||
transformer = null;
|
||||
rawMicTrack = null;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(audioTrack as any).setAudioContext(undefined);
|
||||
} else if (shouldAttach && audioTrack.getProcessor()) {
|
||||
// Processor already attached — push updated params (e.g. noiseGateActive toggled)
|
||||
transformer?.updateParams(currentParams());
|
||||
} else {
|
||||
this.logger.info("[NoiseGate] tick — ngEnabled:", ngEnabled, "vadActive:", vadActive, "hasProcessor:", !!audioTrack.getProcessor());
|
||||
this.logger.info(
|
||||
"[NoiseGate] tick — ngEnabled:", ngEnabled,
|
||||
"vadActive:", vadActive,
|
||||
"hasProcessor:", !!audioTrack.getProcessor(),
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
// Start/stop VAD when its toggle changes.
|
||||
combineLatest([audioTrack$, vadEnabled.value$])
|
||||
.pipe(scope.bind())
|
||||
.subscribe(([, enabled]) => {
|
||||
if (!rawMicTrack) return;
|
||||
if (enabled) {
|
||||
startVAD(rawMicTrack);
|
||||
} else {
|
||||
stopVAD();
|
||||
}
|
||||
});
|
||||
|
||||
// Push VAD threshold changes to the live gate without recreating it.
|
||||
combineLatest([vadPositiveThreshold.value$, vadNegativeThreshold.value$])
|
||||
.pipe(scope.bind())
|
||||
.subscribe(([positiveThreshold, negativeThreshold]) => {
|
||||
vadGate?.updateOptions({ positiveThreshold, negativeThreshold });
|
||||
});
|
||||
|
||||
// Push param changes to the live worklet without recreating the processor.
|
||||
// Push all param changes (noise gate + VAD) to the live worklet.
|
||||
combineLatest([
|
||||
noiseGateEnabled.value$,
|
||||
noiseGateThreshold.value$,
|
||||
@@ -556,23 +506,24 @@ export class Publisher {
|
||||
transientSuppressorEnabled.value$,
|
||||
transientThreshold.value$,
|
||||
transientRelease.value$,
|
||||
vadEnabled.value$,
|
||||
vadPositiveThreshold.value$,
|
||||
vadNegativeThreshold.value$,
|
||||
])
|
||||
.pipe(scope.bind())
|
||||
.subscribe(([noiseGateActive, threshold, attackMs, holdMs, releaseMs,
|
||||
transientEnabled, transientThresholdDb, transientReleaseMs]) => {
|
||||
.subscribe(([
|
||||
noiseGateActive, threshold, attackMs, holdMs, releaseMs,
|
||||
transientEnabled, transientThresholdDb, transientReleaseMs,
|
||||
vadActive, vadPos, vadNeg,
|
||||
]) => {
|
||||
transformer?.updateParams({
|
||||
noiseGateActive, threshold, attackMs, holdMs, releaseMs,
|
||||
transientEnabled, transientThresholdDb, transientReleaseMs,
|
||||
vadEnabled: vadActive,
|
||||
vadPositiveThreshold: vadPos,
|
||||
vadNegativeThreshold: vadNeg,
|
||||
});
|
||||
});
|
||||
|
||||
// Destroy VAD gate when scope ends (processor fully torn down)
|
||||
scope.onEnd(() => {
|
||||
if (vadGate) {
|
||||
void vadGate.destroy();
|
||||
vadGate = null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private observeTrackProcessors(
|
||||
|
||||
Reference in New Issue
Block a user