feat: decouple noise gate and VAD, pre-warm model for instant enable
Noise gate and Silero VAD now work fully independently — the worklet attaches when either is enabled and bypasses the amplitude gate when only VAD is on (noiseGateActive flag).

SileroVADGate gains a two-phase lifecycle: init(ctx) loads the ONNX model eagerly when the AudioContext is first created, so start(stream) is near-instant when the user enables VAD. stop() pauses without unloading the model, so re-enabling is also instant. The VAD checkbox no longer requires the noise gate.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,7 @@ declare function registerProcessor(
|
||||
): void;
|
||||
|
||||
interface NoiseGateParams {
|
||||
noiseGateActive: boolean;
|
||||
threshold: number; // dBFS — gate opens above this, closes below it
|
||||
attackMs: number;
|
||||
holdMs: number;
|
||||
@@ -53,6 +54,7 @@ function dbToLinear(db: number): number {
|
||||
*/
|
||||
class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
// Noise gate state
|
||||
private noiseGateActive = true;
|
||||
private threshold = dbToLinear(-60);
|
||||
private attackRate = 1.0 / (0.025 * sampleRate);
|
||||
private releaseRate = 1.0 / (0.15 * sampleRate);
|
||||
@@ -88,13 +90,14 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
}
|
||||
};
|
||||
this.updateParams({
|
||||
threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150,
|
||||
noiseGateActive: true, threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150,
|
||||
transientEnabled: false, transientThresholdDb: 15, transientReleaseMs: 80,
|
||||
});
|
||||
this.port.postMessage({ type: "log", msg: "[NoiseGate worklet] constructor called, sampleRate=" + sampleRate });
|
||||
}
|
||||
|
||||
private updateParams(p: NoiseGateParams): void {
|
||||
this.noiseGateActive = p.noiseGateActive ?? true;
|
||||
this.threshold = dbToLinear(p.threshold);
|
||||
this.attackRate = 1.0 / ((p.attackMs / 1000) * sampleRate);
|
||||
this.releaseRate = 1.0 / ((p.releaseMs / 1000) * sampleRate);
|
||||
@@ -147,6 +150,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
}
|
||||
|
||||
// --- Noise gate ---
|
||||
if (this.noiseGateActive) {
|
||||
if (curLevel > this.threshold && !this.isOpen) {
|
||||
this.isOpen = true;
|
||||
}
|
||||
@@ -162,6 +166,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
this.gateAttenuation = Math.max(0.0, this.gateAttenuation - this.releaseRate);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
this.gateAttenuation = 1.0;
|
||||
}
|
||||
|
||||
// Ramp VAD attenuation toward target to avoid clicks on gate open/close
|
||||
const vadTarget = this.vadGateOpen ? 1.0 : 0.0;
|
||||
|
||||
@@ -11,6 +11,7 @@ import { logger } from "matrix-js-sdk/lib/logger";
|
||||
const log = logger.getChild("[NoiseGateTransformer]");
|
||||
|
||||
export interface NoiseGateParams {
|
||||
noiseGateActive: boolean;
|
||||
threshold: number; // dBFS — gate opens above this, closes below it
|
||||
attackMs: number;
|
||||
holdMs: number;
|
||||
|
||||
@@ -20,60 +20,61 @@ export interface SileroVADGateOptions {
|
||||
}
|
||||
|
||||
/**
 * Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability
 * decisions into the NoiseGateTransformer's VAD gate, with a two-phase
 * lifecycle:
 *
 *   init(audioContext) — loads the ONNX model and ORT WASM (expensive,
 *                        call as early as possible for zero-latency enable)
 *   start(stream)      — wires the stream and begins per-frame processing
 *   stop()             — pauses processing, keeps model loaded
 *   destroy()          — full teardown
 *
 * Uses onFrameProcessed (fires every ~32ms with the v5 model) with hysteresis
 * to control the gate, rather than the segment-level onSpeechStart/onSpeechEnd
 * callbacks — those only fire at segment boundaries, so non-speech noise never
 * triggers onSpeechEnd and the gate would stay open.
 *
 * The gate starts OPEN (fail-safe): audio flows immediately and the model
 * closes it on the first silent frame. A failed model load therefore
 * degrades gracefully instead of permanently muting the user.
 */
|
||||
export class SileroVADGate {
|
||||
/** Called each time the gate transitions to open (speech detected). */
|
||||
public onOpen: () => void = () => {};
|
||||
/** Called each time the gate transitions to closed (silence detected). */
|
||||
public onClose: () => void = () => {};
|
||||
|
||||
private vad: MicVAD | null = null;
|
||||
private readonly stream: MediaStream;
|
||||
private readonly audioContext: AudioContext;
|
||||
private activeStream: MediaStream | null = null;
|
||||
private options: SileroVADGateOptions;
|
||||
private gateOpen = true;
|
||||
|
||||
public constructor(stream: MediaStream, audioContext: AudioContext, options: SileroVADGateOptions) {
|
||||
this.stream = stream;
|
||||
this.audioContext = audioContext;
|
||||
public constructor(options: SileroVADGateOptions) {
|
||||
this.options = options;
|
||||
}
|
||||
|
||||
public async start(): Promise<void> {
|
||||
const stream = this.stream;
|
||||
const audioContext = this.audioContext;
|
||||
|
||||
log.info("initialising MicVAD, baseAssetPath:", VAD_BASE_PATH);
|
||||
|
||||
/**
|
||||
* Phase 1 — load the model. Call this as early as possible (e.g. when the
|
||||
* AudioContext is first created) so start() is near-instant later.
|
||||
*/
|
||||
public async init(audioContext: AudioContext): Promise<void> {
|
||||
// Avoid requiring SharedArrayBuffer (COOP/COEP headers) by running
|
||||
// single-threaded. Performance is sufficient for 16kHz speech frames.
|
||||
ort.env.wasm.numThreads = 1;
|
||||
|
||||
log.info("pre-warming MicVAD model");
|
||||
|
||||
this.vad = await MicVAD.new({
|
||||
// v5 model uses 512-sample frames (32ms) vs legacy's fixed 1536 (96ms),
|
||||
// giving 3× faster gate response at the cost of a slightly larger model file.
|
||||
...getDefaultRealTimeVADOptions("v5"),
|
||||
audioContext,
|
||||
baseAssetPath: VAD_BASE_PATH,
|
||||
onnxWASMBasePath: VAD_BASE_PATH,
|
||||
startOnLoad: false,
|
||||
// Provide the existing stream instead of calling getUserMedia
|
||||
// Stream is provided via activeStream at start() time
|
||||
// eslint-disable-next-line @typescript-eslint/require-await
|
||||
getStream: async (): Promise<MediaStream> => {
|
||||
if (!this.activeStream) throw new Error("[VAD] stream not set — call start() first");
|
||||
return this.activeStream;
|
||||
},
|
||||
// eslint-disable-next-line @typescript-eslint/require-await
|
||||
getStream: async (): Promise<MediaStream> => stream,
|
||||
pauseStream: async (): Promise<void> => {},
|
||||
// eslint-disable-next-line @typescript-eslint/require-await
|
||||
resumeStream: async (): Promise<MediaStream> => stream,
|
||||
resumeStream: async (): Promise<MediaStream> => {
|
||||
if (!this.activeStream) throw new Error("[VAD] stream not set");
|
||||
return this.activeStream;
|
||||
},
|
||||
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
|
||||
const p = probabilities.isSpeech;
|
||||
if (!this.gateOpen && p >= this.options.positiveThreshold) {
|
||||
@@ -92,10 +93,27 @@ export class SileroVADGate {
|
||||
onSpeechRealStart: (): void => {},
|
||||
});
|
||||
|
||||
log.info("MicVAD model loaded");
|
||||
}
|
||||
|
||||
/**
 * Phase 2 of the lifecycle: attach the raw microphone stream and begin
 * classifying frames.
 *
 * The stream is stored so the getStream/resumeStream callbacks given to
 * MicVAD can return it, and the gate is reset to open so audio flows until
 * the first silent frame closes it.
 *
 * NOTE(review): this throws when the model has not been loaded yet. Callers
 * that trigger init() without awaiting it may reach this method before
 * loading finishes — confirm that ordering is acceptable.
 *
 * @param stream - the microphone stream to run voice activity detection on
 * @throws Error if init() has not produced a MicVAD instance yet
 */
public async start(stream: MediaStream): Promise<void> {
  const vad = this.vad;
  if (!vad) {
    throw new Error("[VAD] call init() before start()");
  }

  // Start open — the first silent frame will close the gate.
  this.gateOpen = true;
  this.activeStream = stream;

  await vad.start();
  log.info("MicVAD started");
}
|
||||
|
||||
/**
 * Pause frame processing while keeping the MicVAD instance (and therefore
 * the loaded model) around, then drop the reference to the active stream.
 * Does nothing but clear the stream reference when no model is loaded.
 */
public async stop(): Promise<void> {
  const vad = this.vad;
  if (vad) {
    await vad.pause();
  }
  this.activeStream = null;
}
|
||||
|
||||
/**
 * Swap in a new set of gate options. The frame callback reads the
 * thresholds from the stored options each time it runs, so the new values
 * apply from the next processed frame.
 *
 * @param options - replacement options; stored by reference, not copied
 */
public updateOptions(options: SileroVADGateOptions): void {
  this.options = options;
}
|
||||
@@ -105,5 +123,6 @@ export class SileroVADGate {
|
||||
await this.vad.destroy();
|
||||
this.vad = null;
|
||||
}
|
||||
this.activeStream = null;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -336,12 +336,11 @@ export const SettingsModal: FC<Props> = ({
|
||||
id="vadEnabled"
|
||||
type="checkbox"
|
||||
label="Enable voice activity detection"
|
||||
description="Uses the Silero VAD model to mute audio when no speech is detected. Requires the noise gate to be enabled."
|
||||
description="Uses the Silero VAD model to mute audio when no speech is detected."
|
||||
checked={vadActive}
|
||||
onChange={(e: ChangeEvent<HTMLInputElement>): void =>
|
||||
setVadActive(e.target.checked)
|
||||
}
|
||||
disabled={!noiseGateEnabled}
|
||||
/>
|
||||
</FieldRow>
|
||||
{vadActive && (
|
||||
|
||||
@@ -439,10 +439,15 @@ export class Publisher {
|
||||
|
||||
let transformer: NoiseGateTransformer | null = null;
|
||||
let audioCtx: AudioContext | null = null;
|
||||
let vadGate: SileroVADGate | null = null;
|
||||
// Single VAD gate instance — persists across start/stop to keep model warm
|
||||
let vadGate: SileroVADGate | null = new SileroVADGate({
|
||||
positiveThreshold: vadPositiveThreshold.getValue(),
|
||||
negativeThreshold: vadNegativeThreshold.getValue(),
|
||||
});
|
||||
let rawMicTrack: MediaStreamTrack | null = null;
|
||||
|
||||
const currentParams = (): NoiseGateParams => ({
|
||||
noiseGateActive: noiseGateEnabled.getValue(),
|
||||
threshold: noiseGateThreshold.getValue(),
|
||||
attackMs: noiseGateAttack.getValue(),
|
||||
holdMs: noiseGateHold.getValue(),
|
||||
@@ -454,33 +459,29 @@ export class Publisher {
|
||||
|
||||
const stopVAD = (): void => {
|
||||
if (vadGate) {
|
||||
void vadGate.destroy();
|
||||
vadGate = null;
|
||||
void vadGate.stop();
|
||||
}
|
||||
// Always reopen gate when VAD stops so audio flows without VAD
|
||||
transformer?.setVADOpen(true);
|
||||
};
|
||||
|
||||
const startVAD = (rawTrack: MediaStreamTrack, ctx: AudioContext): void => {
|
||||
stopVAD();
|
||||
const startVAD = (rawTrack: MediaStreamTrack): void => {
|
||||
if (!vadGate) return;
|
||||
const stream = new MediaStream([rawTrack]);
|
||||
vadGate = new SileroVADGate(stream, ctx, {
|
||||
positiveThreshold: vadPositiveThreshold.getValue(),
|
||||
negativeThreshold: vadNegativeThreshold.getValue(),
|
||||
});
|
||||
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
|
||||
vadGate.onClose = (): void => transformer?.setVADOpen(false);
|
||||
vadGate.start().catch((e: unknown) => {
|
||||
vadGate.start(stream).catch((e: unknown) => {
|
||||
this.logger.error("[VAD] failed to start", e);
|
||||
});
|
||||
};
|
||||
|
||||
// Attach / detach processor when enabled state or the track changes.
|
||||
combineLatest([audioTrack$, noiseGateEnabled.value$])
|
||||
// Attach / detach processor when noise gate or VAD enabled state or the track changes.
|
||||
combineLatest([audioTrack$, noiseGateEnabled.value$, vadEnabled.value$])
|
||||
.pipe(scope.bind())
|
||||
.subscribe(([audioTrack, enabled]) => {
|
||||
.subscribe(([audioTrack, ngEnabled, vadActive]) => {
|
||||
if (!audioTrack) return;
|
||||
if (enabled && !audioTrack.getProcessor()) {
|
||||
const shouldAttach = ngEnabled || vadActive;
|
||||
if (shouldAttach && !audioTrack.getProcessor()) {
|
||||
const params = currentParams();
|
||||
this.logger.info("[NoiseGate] attaching processor, params:", params);
|
||||
// Capture the raw mic track BEFORE setProcessor replaces it
|
||||
@@ -491,6 +492,12 @@ export class Publisher {
|
||||
this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state);
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(audioTrack as any).setAudioContext(audioCtx);
|
||||
// Pre-warm VAD model as soon as AudioContext is created
|
||||
if (vadGate && audioCtx) {
|
||||
vadGate.init(audioCtx).catch((e: unknown) => {
|
||||
this.logger.error("[VAD] failed to pre-warm model", e);
|
||||
});
|
||||
}
|
||||
audioCtx.resume().then(async () => {
|
||||
this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state);
|
||||
return audioTrack
|
||||
@@ -498,11 +505,11 @@ export class Publisher {
|
||||
.setProcessor(transformer as any);
|
||||
}).then(() => {
|
||||
this.logger.info("[NoiseGate] setProcessor resolved");
|
||||
if (vadEnabled.getValue() && audioCtx && rawMicTrack) startVAD(rawMicTrack, audioCtx);
|
||||
if (vadActive && rawMicTrack) startVAD(rawMicTrack);
|
||||
}).catch((e: unknown) => {
|
||||
this.logger.error("[NoiseGate] setProcessor failed", e);
|
||||
});
|
||||
} else if (!enabled && audioTrack.getProcessor()) {
|
||||
} else if (!shouldAttach && audioTrack.getProcessor()) {
|
||||
this.logger.info("[NoiseGate] removing processor");
|
||||
stopVAD();
|
||||
void audioTrack.stopProcessor();
|
||||
@@ -512,18 +519,21 @@ export class Publisher {
|
||||
rawMicTrack = null;
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
(audioTrack as any).setAudioContext(undefined);
|
||||
} else if (shouldAttach && audioTrack.getProcessor()) {
|
||||
// Processor already attached — push updated params (e.g. noiseGateActive toggled)
|
||||
transformer?.updateParams(currentParams());
|
||||
} else {
|
||||
this.logger.info("[NoiseGate] tick — enabled:", enabled, "hasProcessor:", !!audioTrack.getProcessor());
|
||||
this.logger.info("[NoiseGate] tick — ngEnabled:", ngEnabled, "vadActive:", vadActive, "hasProcessor:", !!audioTrack.getProcessor());
|
||||
}
|
||||
});
|
||||
|
||||
// Start/stop VAD when its toggle changes.
|
||||
combineLatest([audioTrack$, vadEnabled.value$])
|
||||
.pipe(scope.bind())
|
||||
.subscribe(([audioTrack, enabled]) => {
|
||||
if (!audioCtx || !rawMicTrack) return;
|
||||
.subscribe(([, enabled]) => {
|
||||
if (!rawMicTrack) return;
|
||||
if (enabled) {
|
||||
startVAD(rawMicTrack, audioCtx);
|
||||
startVAD(rawMicTrack);
|
||||
} else {
|
||||
stopVAD();
|
||||
}
|
||||
@@ -538,6 +548,7 @@ export class Publisher {
|
||||
|
||||
// Push param changes to the live worklet without recreating the processor.
|
||||
combineLatest([
|
||||
noiseGateEnabled.value$,
|
||||
noiseGateThreshold.value$,
|
||||
noiseGateAttack.value$,
|
||||
noiseGateHold.value$,
|
||||
@@ -547,13 +558,21 @@ export class Publisher {
|
||||
transientRelease.value$,
|
||||
])
|
||||
.pipe(scope.bind())
|
||||
.subscribe(([threshold, attackMs, holdMs, releaseMs,
|
||||
.subscribe(([noiseGateActive, threshold, attackMs, holdMs, releaseMs,
|
||||
transientEnabled, transientThresholdDb, transientReleaseMs]) => {
|
||||
transformer?.updateParams({
|
||||
threshold, attackMs, holdMs, releaseMs,
|
||||
noiseGateActive, threshold, attackMs, holdMs, releaseMs,
|
||||
transientEnabled, transientThresholdDb, transientReleaseMs,
|
||||
});
|
||||
});
|
||||
|
||||
// Destroy VAD gate when scope ends (processor fully torn down)
|
||||
scope.onEnd(() => {
|
||||
if (vadGate) {
|
||||
void vadGate.destroy();
|
||||
vadGate = null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private observeTrackProcessors(
|
||||
|
||||
Reference in New Issue
Block a user