feat: decouple noise gate and VAD, pre-warm model for instant enable

Noise gate and Silero VAD now work fully independently — the worklet
attaches when either is enabled and bypasses the amplitude gate when
only VAD is on (noiseGateActive flag). SileroVADGate gains a two-phase
lifecycle: init(ctx) loads the ONNX model eagerly when the AudioContext
is first created; start(stream) is then near-instant when the user
enables VAD. stop() pauses without unloading the model so re-enabling
is also instant. VAD checkbox no longer requires the noise gate.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mk
2026-03-24 00:15:32 -03:00
parent 325094b54d
commit dbd4eef899
5 changed files with 110 additions and 65 deletions

View File

@@ -21,6 +21,7 @@ declare function registerProcessor(
): void;
interface NoiseGateParams {
noiseGateActive: boolean;
threshold: number; // dBFS — gate opens above this, closes below it
attackMs: number;
holdMs: number;
@@ -53,6 +54,7 @@ function dbToLinear(db: number): number {
*/
class NoiseGateProcessor extends AudioWorkletProcessor {
// Noise gate state
private noiseGateActive = true;
private threshold = dbToLinear(-60);
private attackRate = 1.0 / (0.025 * sampleRate);
private releaseRate = 1.0 / (0.15 * sampleRate);
@@ -88,13 +90,14 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
}
};
this.updateParams({
threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150,
noiseGateActive: true, threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150,
transientEnabled: false, transientThresholdDb: 15, transientReleaseMs: 80,
});
this.port.postMessage({ type: "log", msg: "[NoiseGate worklet] constructor called, sampleRate=" + sampleRate });
}
private updateParams(p: NoiseGateParams): void {
this.noiseGateActive = p.noiseGateActive ?? true;
this.threshold = dbToLinear(p.threshold);
this.attackRate = 1.0 / ((p.attackMs / 1000) * sampleRate);
this.releaseRate = 1.0 / ((p.releaseMs / 1000) * sampleRate);
@@ -147,6 +150,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
}
// --- Noise gate ---
if (this.noiseGateActive) {
if (curLevel > this.threshold && !this.isOpen) {
this.isOpen = true;
}
@@ -162,6 +166,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
this.gateAttenuation = Math.max(0.0, this.gateAttenuation - this.releaseRate);
}
}
} else {
this.gateAttenuation = 1.0;
}
// Ramp VAD attenuation toward target to avoid clicks on gate open/close
const vadTarget = this.vadGateOpen ? 1.0 : 0.0;

View File

@@ -11,6 +11,7 @@ import { logger } from "matrix-js-sdk/lib/logger";
const log = logger.getChild("[NoiseGateTransformer]");
export interface NoiseGateParams {
noiseGateActive: boolean;
threshold: number; // dBFS — gate opens above this, closes below it
attackMs: number;
holdMs: number;

View File

@@ -20,60 +20,61 @@ export interface SileroVADGateOptions {
}
/**
* Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability
* decisions into the NoiseGateTransformer's VAD gate.
* Wraps @ricky0123/vad-web's MicVAD with a two-phase lifecycle:
*
* Uses onFrameProcessed (fires every ~96ms) rather than the segment-level
* onSpeechStart/onSpeechEnd callbacks — those only fire at segment boundaries
* so non-speech noise never triggers onSpeechEnd, keeping the gate open.
* Per-frame probability control with hysteresis fixes this.
* init(audioContext) — loads the ONNX model and ORT WASM (expensive,
* call as early as possible for zero-latency enable)
* start(stream) — wires the stream and begins per-frame processing
* stop() — pauses processing, keeps model loaded
* destroy() — full teardown
*
* The gate starts OPEN (fail-safe): audio flows immediately and the model
* closes it on the first silent frame. A failed model load therefore
* degrades gracefully instead of permanently muting the user.
* Uses onFrameProcessed (fires every ~32ms with v5 model) with hysteresis
* to control the gate. Starts OPEN so audio flows immediately; the model
* closes it on the first silent frame.
*/
export class SileroVADGate {
/** Called each time the gate transitions to open (speech detected). */
public onOpen: () => void = () => {};
/** Called each time the gate transitions to closed (silence detected). */
public onClose: () => void = () => {};
private vad: MicVAD | null = null;
private readonly stream: MediaStream;
private readonly audioContext: AudioContext;
private activeStream: MediaStream | null = null;
private options: SileroVADGateOptions;
private gateOpen = true;
public constructor(stream: MediaStream, audioContext: AudioContext, options: SileroVADGateOptions) {
this.stream = stream;
this.audioContext = audioContext;
public constructor(options: SileroVADGateOptions) {
this.options = options;
}
public async start(): Promise<void> {
const stream = this.stream;
const audioContext = this.audioContext;
log.info("initialising MicVAD, baseAssetPath:", VAD_BASE_PATH);
/**
* Phase 1 — load the model. Call this as early as possible (e.g. when the
* AudioContext is first created) so start() is near-instant later.
*/
public async init(audioContext: AudioContext): Promise<void> {
// Avoid requiring SharedArrayBuffer (COOP/COEP headers) by running
// single-threaded. Performance is sufficient for 16kHz speech frames.
ort.env.wasm.numThreads = 1;
log.info("pre-warming MicVAD model");
this.vad = await MicVAD.new({
// v5 model uses 512-sample frames (32ms) vs legacy's fixed 1536 (96ms),
// giving 3× faster gate response at the cost of a slightly larger model file.
...getDefaultRealTimeVADOptions("v5"),
audioContext,
baseAssetPath: VAD_BASE_PATH,
onnxWASMBasePath: VAD_BASE_PATH,
startOnLoad: false,
// Provide the existing stream instead of calling getUserMedia
// Stream is provided via activeStream at start() time
// eslint-disable-next-line @typescript-eslint/require-await
getStream: async (): Promise<MediaStream> => {
if (!this.activeStream) throw new Error("[VAD] stream not set — call start() first");
return this.activeStream;
},
// eslint-disable-next-line @typescript-eslint/require-await
getStream: async (): Promise<MediaStream> => stream,
pauseStream: async (): Promise<void> => {},
// eslint-disable-next-line @typescript-eslint/require-await
resumeStream: async (): Promise<MediaStream> => stream,
resumeStream: async (): Promise<MediaStream> => {
if (!this.activeStream) throw new Error("[VAD] stream not set");
return this.activeStream;
},
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
const p = probabilities.isSpeech;
if (!this.gateOpen && p >= this.options.positiveThreshold) {
@@ -92,10 +93,27 @@ export class SileroVADGate {
onSpeechRealStart: (): void => {},
});
log.info("MicVAD model loaded");
}
/**
 * Phase 2 — attach the raw microphone stream and begin per-frame
 * classification. init() must have finished beforehand, otherwise
 * this throws.
 *
 * @param stream - stream exposed to MicVAD through the getStream /
 *   resumeStream hooks
 */
public async start(stream: MediaStream): Promise<void> {
  const vad = this.vad;
  if (vad === null) {
    throw new Error("[VAD] call init() before start()");
  }
  // Begin with the gate open; the first silent frame closes it.
  this.gateOpen = true;
  this.activeStream = stream;
  await vad.start();
  log.info("MicVAD started");
}
/**
 * Pause frame processing without destroying the model, so a later
 * start() does not have to load it again.
 *
 * The stream reference is dropped BEFORE awaiting pause(): callers may
 * invoke start() while the pause is still pending, and the stream that
 * call assigns must not be overwritten with null once this method
 * resumes. Clearing first also releases the reference when pause()
 * rejects.
 */
public async stop(): Promise<void> {
  this.activeStream = null;
  if (this.vad) await this.vad.pause();
}
/**
 * Replace the current options object. The frame callback reads
 * this.options (e.g. positiveThreshold) each time it runs, so the new
 * values are picked up on the next processed frame.
 */
public updateOptions(options: SileroVADGateOptions): void {
this.options = options;
}
@@ -105,5 +123,6 @@ export class SileroVADGate {
await this.vad.destroy();
this.vad = null;
}
this.activeStream = null;
}
}

View File

@@ -336,12 +336,11 @@ export const SettingsModal: FC<Props> = ({
id="vadEnabled"
type="checkbox"
label="Enable voice activity detection"
description="Uses the Silero VAD model to mute audio when no speech is detected. Requires the noise gate to be enabled."
description="Uses the Silero VAD model to mute audio when no speech is detected."
checked={vadActive}
onChange={(e: ChangeEvent<HTMLInputElement>): void =>
setVadActive(e.target.checked)
}
disabled={!noiseGateEnabled}
/>
</FieldRow>
{vadActive && (

View File

@@ -439,10 +439,15 @@ export class Publisher {
let transformer: NoiseGateTransformer | null = null;
let audioCtx: AudioContext | null = null;
let vadGate: SileroVADGate | null = null;
// Single VAD gate instance — persists across start/stop to keep model warm
let vadGate: SileroVADGate | null = new SileroVADGate({
positiveThreshold: vadPositiveThreshold.getValue(),
negativeThreshold: vadNegativeThreshold.getValue(),
});
let rawMicTrack: MediaStreamTrack | null = null;
const currentParams = (): NoiseGateParams => ({
noiseGateActive: noiseGateEnabled.getValue(),
threshold: noiseGateThreshold.getValue(),
attackMs: noiseGateAttack.getValue(),
holdMs: noiseGateHold.getValue(),
@@ -454,33 +459,29 @@ export class Publisher {
const stopVAD = (): void => {
if (vadGate) {
void vadGate.destroy();
vadGate = null;
void vadGate.stop();
}
// Always reopen gate when VAD stops so audio flows without VAD
transformer?.setVADOpen(true);
};
const startVAD = (rawTrack: MediaStreamTrack, ctx: AudioContext): void => {
stopVAD();
const startVAD = (rawTrack: MediaStreamTrack): void => {
if (!vadGate) return;
const stream = new MediaStream([rawTrack]);
vadGate = new SileroVADGate(stream, ctx, {
positiveThreshold: vadPositiveThreshold.getValue(),
negativeThreshold: vadNegativeThreshold.getValue(),
});
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
vadGate.onClose = (): void => transformer?.setVADOpen(false);
vadGate.start().catch((e: unknown) => {
vadGate.start(stream).catch((e: unknown) => {
this.logger.error("[VAD] failed to start", e);
});
};
// Attach / detach processor when enabled state or the track changes.
combineLatest([audioTrack$, noiseGateEnabled.value$])
// Attach / detach processor when noise gate or VAD enabled state or the track changes.
combineLatest([audioTrack$, noiseGateEnabled.value$, vadEnabled.value$])
.pipe(scope.bind())
.subscribe(([audioTrack, enabled]) => {
.subscribe(([audioTrack, ngEnabled, vadActive]) => {
if (!audioTrack) return;
if (enabled && !audioTrack.getProcessor()) {
const shouldAttach = ngEnabled || vadActive;
if (shouldAttach && !audioTrack.getProcessor()) {
const params = currentParams();
this.logger.info("[NoiseGate] attaching processor, params:", params);
// Capture the raw mic track BEFORE setProcessor replaces it
@@ -491,6 +492,12 @@ export class Publisher {
this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state);
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(audioTrack as any).setAudioContext(audioCtx);
// Pre-warm VAD model as soon as AudioContext is created
if (vadGate && audioCtx) {
vadGate.init(audioCtx).catch((e: unknown) => {
this.logger.error("[VAD] failed to pre-warm model", e);
});
}
audioCtx.resume().then(async () => {
this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state);
return audioTrack
@@ -498,11 +505,11 @@ export class Publisher {
.setProcessor(transformer as any);
}).then(() => {
this.logger.info("[NoiseGate] setProcessor resolved");
if (vadEnabled.getValue() && audioCtx && rawMicTrack) startVAD(rawMicTrack, audioCtx);
if (vadActive && rawMicTrack) startVAD(rawMicTrack);
}).catch((e: unknown) => {
this.logger.error("[NoiseGate] setProcessor failed", e);
});
} else if (!enabled && audioTrack.getProcessor()) {
} else if (!shouldAttach && audioTrack.getProcessor()) {
this.logger.info("[NoiseGate] removing processor");
stopVAD();
void audioTrack.stopProcessor();
@@ -512,18 +519,21 @@ export class Publisher {
rawMicTrack = null;
// eslint-disable-next-line @typescript-eslint/no-explicit-any
(audioTrack as any).setAudioContext(undefined);
} else if (shouldAttach && audioTrack.getProcessor()) {
// Processor already attached — push updated params (e.g. noiseGateActive toggled)
transformer?.updateParams(currentParams());
} else {
this.logger.info("[NoiseGate] tick — enabled:", enabled, "hasProcessor:", !!audioTrack.getProcessor());
this.logger.info("[NoiseGate] tick — ngEnabled:", ngEnabled, "vadActive:", vadActive, "hasProcessor:", !!audioTrack.getProcessor());
}
});
// Start/stop VAD when its toggle changes.
combineLatest([audioTrack$, vadEnabled.value$])
.pipe(scope.bind())
.subscribe(([audioTrack, enabled]) => {
if (!audioCtx || !rawMicTrack) return;
.subscribe(([, enabled]) => {
if (!rawMicTrack) return;
if (enabled) {
startVAD(rawMicTrack, audioCtx);
startVAD(rawMicTrack);
} else {
stopVAD();
}
@@ -538,6 +548,7 @@ export class Publisher {
// Push param changes to the live worklet without recreating the processor.
combineLatest([
noiseGateEnabled.value$,
noiseGateThreshold.value$,
noiseGateAttack.value$,
noiseGateHold.value$,
@@ -547,13 +558,21 @@ export class Publisher {
transientRelease.value$,
])
.pipe(scope.bind())
.subscribe(([threshold, attackMs, holdMs, releaseMs,
.subscribe(([noiseGateActive, threshold, attackMs, holdMs, releaseMs,
transientEnabled, transientThresholdDb, transientReleaseMs]) => {
transformer?.updateParams({
threshold, attackMs, holdMs, releaseMs,
noiseGateActive, threshold, attackMs, holdMs, releaseMs,
transientEnabled, transientThresholdDb, transientReleaseMs,
});
});
// Destroy the VAD gate when the scope ends (processor fully torn down).
// destroy() is async, so a rejection is logged rather than discarded,
// matching how the other VAD calls in this method report failures.
scope.onEnd(() => {
  const gate = vadGate;
  vadGate = null;
  if (gate) {
    gate.destroy().catch((e: unknown) => {
      this.logger.error("[VAD] failed to destroy", e);
    });
  }
});
}
private observeTrackProcessors(