diff --git a/src/livekit/NoiseGateProcessor.worklet.ts b/src/livekit/NoiseGateProcessor.worklet.ts index 6c1981b7..e9204568 100644 --- a/src/livekit/NoiseGateProcessor.worklet.ts +++ b/src/livekit/NoiseGateProcessor.worklet.ts @@ -21,6 +21,7 @@ declare function registerProcessor( ): void; interface NoiseGateParams { + noiseGateActive: boolean; threshold: number; // dBFS — gate opens above this, closes below it attackMs: number; holdMs: number; @@ -53,6 +54,7 @@ function dbToLinear(db: number): number { */ class NoiseGateProcessor extends AudioWorkletProcessor { // Noise gate state + private noiseGateActive = true; private threshold = dbToLinear(-60); private attackRate = 1.0 / (0.025 * sampleRate); private releaseRate = 1.0 / (0.15 * sampleRate); @@ -88,13 +90,14 @@ class NoiseGateProcessor extends AudioWorkletProcessor { } }; this.updateParams({ - threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150, + noiseGateActive: true, threshold: -60, attackMs: 25, holdMs: 200, releaseMs: 150, transientEnabled: false, transientThresholdDb: 15, transientReleaseMs: 80, }); this.port.postMessage({ type: "log", msg: "[NoiseGate worklet] constructor called, sampleRate=" + sampleRate }); } private updateParams(p: NoiseGateParams): void { + this.noiseGateActive = p.noiseGateActive ?? true; this.threshold = dbToLinear(p.threshold); this.attackRate = 1.0 / ((p.attackMs / 1000) * sampleRate); this.releaseRate = 1.0 / ((p.releaseMs / 1000) * sampleRate); @@ -147,20 +150,24 @@ class NoiseGateProcessor extends AudioWorkletProcessor { } // --- Noise gate --- - if (curLevel > this.threshold && !this.isOpen) { - this.isOpen = true; - } - if (curLevel <= this.threshold && this.isOpen) { - this.heldTime = 0; - this.isOpen = false; - } - if (this.isOpen) { - this.gateAttenuation = Math.min(1.0, this.gateAttenuation + this.attackRate); - } else { - this.heldTime += samplePeriod; - if (this.heldTime > this.holdTime) { - this.gateAttenuation = Math.max(0.0, this.gateAttenuation - this.releaseRate); + if (this.noiseGateActive) { + if (curLevel > this.threshold && !this.isOpen) { + this.isOpen = true; } + if (curLevel <= this.threshold && this.isOpen) { + this.heldTime = 0; + this.isOpen = false; + } + if (this.isOpen) { + this.gateAttenuation = Math.min(1.0, this.gateAttenuation + this.attackRate); + } else { + this.heldTime += samplePeriod; + if (this.heldTime > this.holdTime) { + this.gateAttenuation = Math.max(0.0, this.gateAttenuation - this.releaseRate); + } + } + } else { + this.gateAttenuation = 1.0; } // Ramp VAD attenuation toward target to avoid clicks on gate open/close diff --git a/src/livekit/NoiseGateTransformer.ts b/src/livekit/NoiseGateTransformer.ts index 33269e3b..74d34953 100644 --- a/src/livekit/NoiseGateTransformer.ts +++ b/src/livekit/NoiseGateTransformer.ts @@ -11,6 +11,7 @@ import { logger } from "matrix-js-sdk/lib/logger"; const log = logger.getChild("[NoiseGateTransformer]"); export interface NoiseGateParams { + noiseGateActive: boolean; threshold: number; // dBFS — gate opens above this, closes below it attackMs: number; holdMs: number; diff --git a/src/livekit/SileroVADGate.ts b/src/livekit/SileroVADGate.ts index 5a951d5e..bc022166 100644 --- a/src/livekit/SileroVADGate.ts +++ b/src/livekit/SileroVADGate.ts @@ -20,60 +20,61 @@ export interface SileroVADGateOptions { } /** - * Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability - * decisions into the NoiseGateTransformer's VAD gate. + * Wraps @ricky0123/vad-web's MicVAD with a two-phase lifecycle: * - * Uses onFrameProcessed (fires every ~96ms) rather than the segment-level - * onSpeechStart/onSpeechEnd callbacks — those only fire at segment boundaries - * so non-speech noise never triggers onSpeechEnd, keeping the gate open. - * Per-frame probability control with hysteresis fixes this. + * init(audioContext) — loads the ONNX model and ORT WASM (expensive, + * call as early as possible for zero-latency enable) + * start(stream) — wires the stream and begins per-frame processing + * stop() — pauses processing, keeps model loaded + * destroy() — full teardown * - * The gate starts OPEN (fail-safe): audio flows immediately and the model - * closes it on the first silent frame. A failed model load therefore - * degrades gracefully instead of permanently muting the user. + * Uses onFrameProcessed (fires every ~32ms with v5 model) with hysteresis + * to control the gate. Starts OPEN so audio flows immediately; the model + * closes it on the first silent frame. */ export class SileroVADGate { - /** Called each time the gate transitions to open (speech detected). */ public onOpen: () => void = () => {}; - /** Called each time the gate transitions to closed (silence detected). */ public onClose: () => void = () => {}; private vad: MicVAD | null = null; - private readonly stream: MediaStream; - private readonly audioContext: AudioContext; + private activeStream: MediaStream | null = null; private options: SileroVADGateOptions; private gateOpen = true; - public constructor(stream: MediaStream, audioContext: AudioContext, options: SileroVADGateOptions) { - this.stream = stream; - this.audioContext = audioContext; + public constructor(options: SileroVADGateOptions) { this.options = options; } - public async start(): Promise { - const stream = this.stream; - const audioContext = this.audioContext; - - log.info("initialising MicVAD, baseAssetPath:", VAD_BASE_PATH); - + /** + * Phase 1 — load the model. Call this as early as possible (e.g. when the + * AudioContext is first created) so start() is near-instant later. + */ + public async init(audioContext: AudioContext): Promise { // Avoid requiring SharedArrayBuffer (COOP/COEP headers) by running // single-threaded. Performance is sufficient for 16kHz speech frames. ort.env.wasm.numThreads = 1; + log.info("pre-warming MicVAD model"); + this.vad = await MicVAD.new({ - // v5 model uses 512-sample frames (32ms) vs legacy's fixed 1536 (96ms), - // giving 3× faster gate response at the cost of a slightly larger model file. ...getDefaultRealTimeVADOptions("v5"), audioContext, baseAssetPath: VAD_BASE_PATH, onnxWASMBasePath: VAD_BASE_PATH, startOnLoad: false, - // Provide the existing stream instead of calling getUserMedia + // Stream is provided via activeStream at start() time + // eslint-disable-next-line @typescript-eslint/require-await + getStream: async (): Promise => { + if (!this.activeStream) throw new Error("[VAD] stream not set — call start() first"); + return this.activeStream; + }, // eslint-disable-next-line @typescript-eslint/require-await - getStream: async (): Promise => stream, pauseStream: async (): Promise => {}, // eslint-disable-next-line @typescript-eslint/require-await - resumeStream: async (): Promise => stream, + resumeStream: async (): Promise => { + if (!this.activeStream) throw new Error("[VAD] stream not set"); + return this.activeStream; + }, onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => { const p = probabilities.isSpeech; if (!this.gateOpen && p >= this.options.positiveThreshold) { @@ -92,10 +93,27 @@ export class SileroVADGate { onSpeechRealStart: (): void => {}, }); + log.info("MicVAD model loaded"); + } + + /** + * Phase 2 — wire the raw mic stream and begin classifying frames. + * init() must have completed first. + */ + public async start(stream: MediaStream): Promise { + if (!this.vad) throw new Error("[VAD] call init() before start()"); + this.activeStream = stream; + this.gateOpen = true; // start open — first silent frame will close it await this.vad.start(); log.info("MicVAD started"); } + /** Pause frame processing without destroying the model. */ + public async stop(): Promise { + if (this.vad) await this.vad.pause(); + this.activeStream = null; + } + public updateOptions(options: SileroVADGateOptions): void { this.options = options; } @@ -105,5 +123,6 @@ export class SileroVADGate { await this.vad.destroy(); this.vad = null; } + this.activeStream = null; } } diff --git a/src/settings/SettingsModal.tsx b/src/settings/SettingsModal.tsx index e01139a3..e9d9daba 100644 --- a/src/settings/SettingsModal.tsx +++ b/src/settings/SettingsModal.tsx @@ -336,12 +336,11 @@ export const SettingsModal: FC = ({ id="vadEnabled" type="checkbox" label="Enable voice activity detection" - description="Uses the Silero VAD model to mute audio when no speech is detected. Requires the noise gate to be enabled." + description="Uses the Silero VAD model to mute audio when no speech is detected." checked={vadActive} onChange={(e: ChangeEvent): void => setVadActive(e.target.checked) } - disabled={!noiseGateEnabled} /> {vadActive && ( diff --git a/src/state/CallViewModel/localMember/Publisher.ts b/src/state/CallViewModel/localMember/Publisher.ts index fd82b767..d3c3f4cc 100644 --- a/src/state/CallViewModel/localMember/Publisher.ts +++ b/src/state/CallViewModel/localMember/Publisher.ts @@ -439,10 +439,15 @@ export class Publisher { let transformer: NoiseGateTransformer | null = null; let audioCtx: AudioContext | null = null; - let vadGate: SileroVADGate | null = null; + // Single VAD gate instance — persists across start/stop to keep model warm + let vadGate: SileroVADGate | null = new SileroVADGate({ + positiveThreshold: vadPositiveThreshold.getValue(), + negativeThreshold: vadNegativeThreshold.getValue(), + }); let rawMicTrack: MediaStreamTrack | null = null; const currentParams = (): NoiseGateParams => ({ + noiseGateActive: noiseGateEnabled.getValue(), threshold: noiseGateThreshold.getValue(), attackMs: noiseGateAttack.getValue(), holdMs: noiseGateHold.getValue(), @@ -454,33 +459,29 @@ export class Publisher { const stopVAD = (): void => { if (vadGate) { - void vadGate.destroy(); - vadGate = null; + void vadGate.stop(); } // Always reopen gate when VAD stops so audio flows without VAD transformer?.setVADOpen(true); }; - const startVAD = (rawTrack: MediaStreamTrack, ctx: AudioContext): void => { - stopVAD(); + const startVAD = (rawTrack: MediaStreamTrack): void => { + if (!vadGate) return; const stream = new MediaStream([rawTrack]); - vadGate = new SileroVADGate(stream, ctx, { - positiveThreshold: vadPositiveThreshold.getValue(), - negativeThreshold: vadNegativeThreshold.getValue(), - }); vadGate.onOpen = (): void => transformer?.setVADOpen(true); vadGate.onClose = (): void => transformer?.setVADOpen(false); - vadGate.start().catch((e: unknown) => { + vadGate.start(stream).catch((e: unknown) => { this.logger.error("[VAD] failed to start", e); }); }; - // Attach / detach processor when enabled state or the track changes. - combineLatest([audioTrack$, noiseGateEnabled.value$]) + // Attach / detach processor when noise gate or VAD enabled state or the track changes. + combineLatest([audioTrack$, noiseGateEnabled.value$, vadEnabled.value$]) .pipe(scope.bind()) - .subscribe(([audioTrack, enabled]) => { + .subscribe(([audioTrack, ngEnabled, vadActive]) => { if (!audioTrack) return; - if (enabled && !audioTrack.getProcessor()) { + const shouldAttach = ngEnabled || vadActive; + if (shouldAttach && !audioTrack.getProcessor()) { const params = currentParams(); this.logger.info("[NoiseGate] attaching processor, params:", params); // Capture the raw mic track BEFORE setProcessor replaces it @@ -491,6 +492,12 @@ export class Publisher { this.logger.info("[NoiseGate] AudioContext state before resume:", audioCtx.state); // eslint-disable-next-line @typescript-eslint/no-explicit-any (audioTrack as any).setAudioContext(audioCtx); + // Pre-warm VAD model as soon as AudioContext is created + if (vadGate && audioCtx) { + vadGate.init(audioCtx).catch((e: unknown) => { + this.logger.error("[VAD] failed to pre-warm model", e); + }); + } audioCtx.resume().then(async () => { this.logger.info("[NoiseGate] AudioContext state after resume:", audioCtx?.state); return audioTrack @@ -498,11 +505,11 @@ export class Publisher { .setProcessor(transformer as any); }).then(() => { this.logger.info("[NoiseGate] setProcessor resolved"); - if (vadEnabled.getValue() && audioCtx && rawMicTrack) startVAD(rawMicTrack, audioCtx); + if (vadActive && rawMicTrack) startVAD(rawMicTrack); }).catch((e: unknown) => { this.logger.error("[NoiseGate] setProcessor failed", e); }); - } else if (!enabled && audioTrack.getProcessor()) { + } else if (!shouldAttach && audioTrack.getProcessor()) { this.logger.info("[NoiseGate] removing processor"); stopVAD(); void audioTrack.stopProcessor(); @@ -512,18 +519,21 @@ export class Publisher { rawMicTrack = null; // eslint-disable-next-line @typescript-eslint/no-explicit-any (audioTrack as any).setAudioContext(undefined); + } else if (shouldAttach && audioTrack.getProcessor()) { + // Processor already attached — push updated params (e.g. noiseGateActive toggled) + transformer?.updateParams(currentParams()); } else { - this.logger.info("[NoiseGate] tick — enabled:", enabled, "hasProcessor:", !!audioTrack.getProcessor()); + this.logger.info("[NoiseGate] tick — ngEnabled:", ngEnabled, "vadActive:", vadActive, "hasProcessor:", !!audioTrack.getProcessor()); } }); // Start/stop VAD when its toggle changes. combineLatest([audioTrack$, vadEnabled.value$]) .pipe(scope.bind()) - .subscribe(([audioTrack, enabled]) => { - if (!audioCtx || !rawMicTrack) return; + .subscribe(([, enabled]) => { + if (!rawMicTrack) return; if (enabled) { - startVAD(rawMicTrack, audioCtx); + startVAD(rawMicTrack); } else { stopVAD(); } @@ -538,6 +548,7 @@ export class Publisher { // Push param changes to the live worklet without recreating the processor. combineLatest([ + noiseGateEnabled.value$, noiseGateThreshold.value$, noiseGateAttack.value$, noiseGateHold.value$, @@ -547,13 +558,21 @@ export class Publisher { transientRelease.value$, ]) .pipe(scope.bind()) - .subscribe(([threshold, attackMs, holdMs, releaseMs, + .subscribe(([noiseGateActive, threshold, attackMs, holdMs, releaseMs, transientEnabled, transientThresholdDb, transientReleaseMs]) => { transformer?.updateParams({ - threshold, attackMs, holdMs, releaseMs, + noiseGateActive, threshold, attackMs, holdMs, releaseMs, transientEnabled, transientThresholdDb, transientReleaseMs, }); }); + + // Destroy VAD gate when scope ends (processor fully torn down) + scope.onEnd(() => { + if (vadGate) { + void vadGate.destroy(); + vadGate = null; + } + }); } private observeTrackProcessors(