feat: decouple noise gate and VAD, pre-warm model for instant enable

Noise gate and Silero VAD now work fully independently — the worklet
attaches when either is enabled and, when only VAD is on, bypasses the
amplitude gate (noiseGateActive = false). SileroVADGate gains a two-phase
lifecycle: init(ctx) loads the ONNX model eagerly when the AudioContext
is first created; start(stream) is then near-instant when the user
enables VAD. stop() pauses without unloading the model so re-enabling
is also instant. VAD checkbox no longer requires the noise gate.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mk
2026-03-24 00:15:32 -03:00
parent 325094b54d
commit dbd4eef899
5 changed files with 110 additions and 65 deletions

View File

@@ -21,6 +21,7 @@ declare function registerProcessor(
): void; ): void;
interface NoiseGateParams { interface NoiseGateParams {
noiseGateActive: boolean;
threshold: number; // dBFS — gate opens above this, closes below it threshold: number; // dBFS — gate opens above this, closes below it
attackMs: number; attackMs: number;
holdMs: number; holdMs: number;
@@ -53,6 +54,7 @@ function dbToLinear(db: number): number {
*/ */
class NoiseGateProcessor extends AudioWorkletProcessor { class NoiseGateProcessor extends AudioWorkletProcessor {
// Noise gate state // Noise gate state
private noiseGateActive = true;
private threshold = dbToLinear(-60); private threshold = dbToLinear(-60);
private attackRate = 1.0 / (0.025 * sampleRate); private attackRate = 1.0 / (0.025 * sampleRate);
private releaseRate = 1.0 / (0.15 * sampleRate); private releaseRate = 1.0 / (0.15 * sampleRate);
@@ -88,13 +90,14 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
} }
}; };
this.updateParams({ this.updateParams({
threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150, noiseGateActive: true, threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150,
transientEnabled: false, transientThresholdDb: 15, transientReleaseMs: 80, transientEnabled: false, transientThresholdDb: 15, transientReleaseMs: 80,
}); });
this.port.postMessage({ type: "log", msg: "[NoiseGate worklet] constructor called, sampleRate=" + sampleRate }); this.port.postMessage({ type: "log", msg: "[NoiseGate worklet] constructor called, sampleRate=" + sampleRate });
} }
private updateParams(p: NoiseGateParams): void { private updateParams(p: NoiseGateParams): void {
this.noiseGateActive = p.noiseGateActive ?? true;
this.threshold = dbToLinear(p.threshold); this.threshold = dbToLinear(p.threshold);
this.attackRate = 1.0 / ((p.attackMs / 1000) * sampleRate); this.attackRate = 1.0 / ((p.attackMs / 1000) * sampleRate);
this.releaseRate = 1.0 / ((p.releaseMs / 1000) * sampleRate); this.releaseRate = 1.0 / ((p.releaseMs / 1000) * sampleRate);
@@ -147,20 +150,24 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
} }
// --- Noise gate --- // --- Noise gate ---
if (curLevel > this.threshold && !this.isOpen) { if (this.noiseGateActive) {
this.isOpen = true; if (curLevel > this.threshold && !this.isOpen) {
} this.isOpen = true;
if (curLevel <= this.threshold && this.isOpen) {
this.heldTime = 0;
this.isOpen = false;
}
if (this.isOpen) {
this.gateAttenuation = Math.min(1.0, this.gateAttenuation + this.attackRate);
} else {
this.heldTime += samplePeriod;
if (this.heldTime > this.holdTime) {
this.gateAttenuation = Math.max(0.0, this.gateAttenuation - this.releaseRate);
} }
if (curLevel <= this.threshold && this.isOpen) {
this.heldTime = 0;
this.isOpen = false;
}
if (this.isOpen) {
this.gateAttenuation = Math.min(1.0, this.gateAttenuation + this.attackRate);
} else {
this.heldTime += samplePeriod;
if (this.heldTime > this.holdTime) {
this.gateAttenuation = Math.max(0.0, this.gateAttenuation - this.releaseRate);
}
}
} else {
this.gateAttenuation = 1.0;
} }
// Ramp VAD attenuation toward target to avoid clicks on gate open/close // Ramp VAD attenuation toward target to avoid clicks on gate open/close

View File

@@ -11,6 +11,7 @@ import { logger } from "matrix-js-sdk/lib/logger";
const log = logger.getChild("[NoiseGateTransformer]"); const log = logger.getChild("[NoiseGateTransformer]");
export interface NoiseGateParams { export interface NoiseGateParams {
noiseGateActive: boolean;
threshold: number; // dBFS — gate opens above this, closes below it threshold: number; // dBFS — gate opens above this, closes below it
attackMs: number; attackMs: number;
holdMs: number; holdMs: number;

View File

@@ -20,60 +20,61 @@ export interface SileroVADGateOptions {
} }
/** /**
* Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability * Wraps @ricky0123/vad-web's MicVAD with a two-phase lifecycle:
* decisions into the NoiseGateTransformer's VAD gate.
* *
* Uses onFrameProcessed (fires every ~96ms) rather than the segment-level * init(audioContext) — loads the ONNX model and ORT WASM (expensive,
* onSpeechStart/onSpeechEnd callbacks — those only fire at segment boundaries * call as early as possible for zero-latency enable)
* so non-speech noise never triggers onSpeechEnd, keeping the gate open. * start(stream) — wires the stream and begins per-frame processing
* Per-frame probability control with hysteresis fixes this. * stop() — pauses processing, keeps model loaded
* destroy() — full teardown
* *
* The gate starts OPEN (fail-safe): audio flows immediately and the model * Uses onFrameProcessed (fires every ~32ms with v5 model) with hysteresis
* closes it on the first silent frame. A failed model load therefore * to control the gate. Starts OPEN so audio flows immediately; the model
* degrades gracefully instead of permanently muting the user. * closes it on the first silent frame.
*/ */
export class SileroVADGate { export class SileroVADGate {
/** Called each time the gate transitions to open (speech detected). */
public onOpen: () => void = () => {}; public onOpen: () => void = () => {};
/** Called each time the gate transitions to closed (silence detected). */
public onClose: () => void = () => {}; public onClose: () => void = () => {};
private vad: MicVAD | null = null; private vad: MicVAD | null = null;
private readonly stream: MediaStream; private activeStream: MediaStream | null = null;
private readonly audioContext: AudioContext;
private options: SileroVADGateOptions; private options: SileroVADGateOptions;
private gateOpen = true; private gateOpen = true;
public constructor(stream: MediaStream, audioContext: AudioContext, options: SileroVADGateOptions) { public constructor(options: SileroVADGateOptions) {
this.stream = stream;
this.audioContext = audioContext;
this.options = options; this.options = options;
} }
public async start(): Promise<void> { /**
const stream = this.stream; * Phase 1 — load the model. Call this as early as possible (e.g. when the
const audioContext = this.audioContext; * AudioContext is first created) so start() is near-instant later.
*/
log.info("initialising MicVAD, baseAssetPath:", VAD_BASE_PATH); public async init(audioContext: AudioContext): Promise<void> {
// Avoid requiring SharedArrayBuffer (COOP/COEP headers) by running // Avoid requiring SharedArrayBuffer (COOP/COEP headers) by running
// single-threaded. Performance is sufficient for 16kHz speech frames. // single-threaded. Performance is sufficient for 16kHz speech frames.
ort.env.wasm.numThreads = 1; ort.env.wasm.numThreads = 1;
log.info("pre-warming MicVAD model");
this.vad = await MicVAD.new({ this.vad = await MicVAD.new({
// v5 model uses 512-sample frames (32ms) vs legacy's fixed 1536 (96ms),
// giving 3× faster gate response at the cost of a slightly larger model file.
...getDefaultRealTimeVADOptions("v5"), ...getDefaultRealTimeVADOptions("v5"),
audioContext, audioContext,
baseAssetPath: VAD_BASE_PATH, baseAssetPath: VAD_BASE_PATH,
onnxWASMBasePath: VAD_BASE_PATH, onnxWASMBasePath: VAD_BASE_PATH,
startOnLoad: false, startOnLoad: false,
// Provide the existing stream instead of calling getUserMedia // Stream is provided via activeStream at start() time
// eslint-disable-next-line @typescript-eslint/require-await
getStream: async (): Promise<MediaStream> => {
if (!this.activeStream) throw new Error("[VAD] stream not set — call start() first");
return this.activeStream;
},
// eslint-disable-next-line @typescript-eslint/require-await // eslint-disable-next-line @typescript-eslint/require-await
getStream: async (): Promise<MediaStream> => stream,
pauseStream: async (): Promise<void> => {}, pauseStream: async (): Promise<void> => {},
// eslint-disable-next-line @typescript-eslint/require-await // eslint-disable-next-line @typescript-eslint/require-await
resumeStream: async (): Promise<MediaStream> => stream, resumeStream: async (): Promise<MediaStream> => {
if (!this.activeStream) throw new Error("[VAD] stream not set");
return this.activeStream;
},
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => { onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
const p = probabilities.isSpeech; const p = probabilities.isSpeech;
if (!this.gateOpen && p >= this.options.positiveThreshold) { if (!this.gateOpen && p >= this.options.positiveThreshold) {
@@ -92,10 +93,27 @@ export class SileroVADGate {
onSpeechRealStart: (): void => {}, onSpeechRealStart: (): void => {},
}); });
log.info("MicVAD model loaded");
}
/**
* Phase 2 — wire the raw mic stream and begin classifying frames.
* init() must have completed first.
*/
public async start(stream: MediaStream): Promise<void> {
if (!this.vad) throw new Error("[VAD] call init() before start()");
this.activeStream = stream;
this.gateOpen = true; // start open — first silent frame will close it
await this.vad.start(); await this.vad.start();
log.info("MicVAD started"); log.info("MicVAD started");
} }
/** Pause frame processing without destroying the model. */
public async stop(): Promise<void> {
if (this.vad) await this.vad.pause();
this.activeStream = null;
}
public updateOptions(options: SileroVADGateOptions): void { public updateOptions(options: SileroVADGateOptions): void {
this.options = options; this.options = options;
} }
@@ -105,5 +123,6 @@ export class SileroVADGate {
await this.vad.destroy(); await this.vad.destroy();
this.vad = null; this.vad = null;
} }
this.activeStream = null;
} }
} }

View File

@@ -336,12 +336,11 @@ export const SettingsModal: FC<Props> = ({
id="vadEnabled" id="vadEnabled"
type="checkbox" type="checkbox"
label="Enable voice activity detection" label="Enable voice activity detection"
description="Uses the Silero VAD model to mute audio when no speech is detected. Requires the noise gate to be enabled." description="Uses the Silero VAD model to mute audio when no speech is detected."
checked={vadActive} checked={vadActive}
onChange={(e: ChangeEvent<HTMLInputElement>): void => onChange={(e: ChangeEvent<HTMLInputElement>): void =>
setVadActive(e.target.checked) setVadActive(e.target.checked)
} }
disabled={!noiseGateEnabled}
/> />
</FieldRow> </FieldRow>
{vadActive && ( {vadActive && (

View File

@@ -439,10 +439,15 @@ export class Publisher {
let transformer: NoiseGateTransformer | null = null; let transformer: NoiseGateTransformer | null = null;
let audioCtx: AudioContext | null = null; let audioCtx: AudioContext | null = null;
let vadGate: SileroVADGate | null = null; // Single VAD gate instance — persists across start/stop to keep model warm
let vadGate: SileroVADGate | null = new SileroVADGate({
positiveThreshold: vadPositiveThreshold.getValue(),
negativeThreshold: vadNegativeThreshold.getValue(),
});
let rawMicTrack: MediaStreamTrack | null = null; let rawMicTrack: MediaStreamTrack | null = null;
const currentParams = (): NoiseGateParams => ({ const currentParams = (): NoiseGateParams => ({
noiseGateActive: noiseGateEnabled.getValue(),
threshold: noiseGateThreshold.getValue(), threshold: noiseGateThreshold.getValue(),
attackMs: noiseGateAttack.getValue(), attackMs: noiseGateAttack.getValue(),
holdMs: noiseGateHold.getValue(), holdMs: noiseGateHold.getValue(),
@@ -454,33 +459,29 @@ export class Publisher {
const stopVAD = (): void => { const stopVAD = (): void => {
if (vadGate) { if (vadGate) {
void vadGate.destroy(); void vadGate.stop();
vadGate = null;
} }
// Always reopen gate when VAD stops so audio flows without VAD // Always reopen gate when VAD stops so audio flows without VAD
transformer?.setVADOpen(true); transformer?.setVADOpen(true);
}; };
const startVAD = (rawTrack: MediaStreamTrack, ctx: AudioContext): void => { const startVAD = (rawTrack: MediaStreamTrack): void => {
stopVAD(); if (!vadGate) return;
const stream = new MediaStream([rawTrack]); const stream = new MediaStream([rawTrack]);
vadGate = new SileroVADGate(stream, ctx, {
positiveThreshold: vadPositiveThreshold.getValue(),
negativeThreshold: vadNegativeThreshold.getValue(),
});
vadGate.onOpen = (): void => transformer?.setVADOpen(true); vadGate.onOpen = (): void => transformer?.setVADOpen(true);
vadGate.onClose = (): void => transformer?.setVADOpen(false); vadGate.onClose = (): void => transformer?.setVADOpen(false);
vadGate.start().catch((e: unknown) => { vadGate.start(stream).catch((e: unknown) => {
this.logger.error("[VAD] failed to start", e); this.logger.error("[VAD] failed to start", e);
}); });
}; };
// Attach / detach processor when enabled state or the track changes. // Attach / detach processor when noise gate or VAD enabled state or the track changes.
combineLatest([audioTrack$, noiseGateEnabled.value$]) combineLatest([audioTrack$, noiseGateEnabled.value$, vadEnabled.value$])
.pipe(scope.bind()) .pipe(scope.bind())
.subscribe(([audioTrack, enabled]) => { .subscribe(([audioTrack, ngEnabled, vadActive]) => {
if (!audioTrack) return; if (!audioTrack) return;
if (enabled && !audioTrack.getProcessor()) { const shouldAttach = ngEnabled || vadActive;
if (shouldAttach && !audioTrack.getProcessor()) {
const params = currentParams(); const params = currentParams();
this.logger.info("[NoiseGate] attaching processor, params:", params); this.logger.info("[NoiseGate] attaching processor, params:", params);
// Capture the raw mic track BEFORE setProcessor replaces it // Capture the raw mic track BEFORE setProcessor replaces it
@@ -491,6 +492,12 @@ export class Publisher {
this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state); this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state);
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
(audioTrack as any).setAudioContext(audioCtx); (audioTrack as any).setAudioContext(audioCtx);
// Pre-warm VAD model as soon as AudioContext is created
if (vadGate && audioCtx) {
vadGate.init(audioCtx).catch((e: unknown) => {
this.logger.error("[VAD] failed to pre-warm model", e);
});
}
audioCtx.resume().then(async () => { audioCtx.resume().then(async () => {
this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state); this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state);
return audioTrack return audioTrack
@@ -498,11 +505,11 @@ export class Publisher {
.setProcessor(transformer as any); .setProcessor(transformer as any);
}).then(() => { }).then(() => {
this.logger.info("[NoiseGate] setProcessor resolved"); this.logger.info("[NoiseGate] setProcessor resolved");
if (vadEnabled.getValue() && audioCtx && rawMicTrack) startVAD(rawMicTrack, audioCtx); if (vadActive && rawMicTrack) startVAD(rawMicTrack);
}).catch((e: unknown) => { }).catch((e: unknown) => {
this.logger.error("[NoiseGate] setProcessor failed", e); this.logger.error("[NoiseGate] setProcessor failed", e);
}); });
} else if (!enabled && audioTrack.getProcessor()) { } else if (!shouldAttach && audioTrack.getProcessor()) {
this.logger.info("[NoiseGate] removing processor"); this.logger.info("[NoiseGate] removing processor");
stopVAD(); stopVAD();
void audioTrack.stopProcessor(); void audioTrack.stopProcessor();
@@ -512,18 +519,21 @@ export class Publisher {
rawMicTrack = null; rawMicTrack = null;
// eslint-disable-next-line @typescript-eslint/no-explicit-any // eslint-disable-next-line @typescript-eslint/no-explicit-any
(audioTrack as any).setAudioContext(undefined); (audioTrack as any).setAudioContext(undefined);
} else if (shouldAttach && audioTrack.getProcessor()) {
// Processor already attached — push updated params (e.g. noiseGateActive toggled)
transformer?.updateParams(currentParams());
} else { } else {
this.logger.info("[NoiseGate] tick — enabled:", enabled, "hasProcessor:", !!audioTrack.getProcessor()); this.logger.info("[NoiseGate] tick — ngEnabled:", ngEnabled, "vadActive:", vadActive, "hasProcessor:", !!audioTrack.getProcessor());
} }
}); });
// Start/stop VAD when its toggle changes. // Start/stop VAD when its toggle changes.
combineLatest([audioTrack$, vadEnabled.value$]) combineLatest([audioTrack$, vadEnabled.value$])
.pipe(scope.bind()) .pipe(scope.bind())
.subscribe(([audioTrack, enabled]) => { .subscribe(([, enabled]) => {
if (!audioCtx || !rawMicTrack) return; if (!rawMicTrack) return;
if (enabled) { if (enabled) {
startVAD(rawMicTrack, audioCtx); startVAD(rawMicTrack);
} else { } else {
stopVAD(); stopVAD();
} }
@@ -538,6 +548,7 @@ export class Publisher {
// Push param changes to the live worklet without recreating the processor. // Push param changes to the live worklet without recreating the processor.
combineLatest([ combineLatest([
noiseGateEnabled.value$,
noiseGateThreshold.value$, noiseGateThreshold.value$,
noiseGateAttack.value$, noiseGateAttack.value$,
noiseGateHold.value$, noiseGateHold.value$,
@@ -547,13 +558,21 @@ export class Publisher {
transientRelease.value$, transientRelease.value$,
]) ])
.pipe(scope.bind()) .pipe(scope.bind())
.subscribe(([threshold, attackMs, holdMs, releaseMs, .subscribe(([noiseGateActive, threshold, attackMs, holdMs, releaseMs,
transientEnabled, transientThresholdDb, transientReleaseMs]) => { transientEnabled, transientThresholdDb, transientReleaseMs]) => {
transformer?.updateParams({ transformer?.updateParams({
threshold, attackMs, holdMs, releaseMs, noiseGateActive, threshold, attackMs, holdMs, releaseMs,
transientEnabled, transientThresholdDb, transientReleaseMs, transientEnabled, transientThresholdDb, transientReleaseMs,
}); });
}); });
// Destroy VAD gate when scope ends (processor fully torn down)
scope.onEnd(() => {
if (vadGate) {
void vadGate.destroy();
vadGate = null;
}
});
} }
private observeTrackProcessors( private observeTrackProcessors(