From 859db651e0dcc3cb6286947f5be750d83e88c31a Mon Sep 17 00:00:00 2001 From: mk Date: Mon, 23 Mar 2026 23:57:35 -0300 Subject: [PATCH] feat: add VAD threshold controls and smooth gate ramp Replace the hard 0/1 VAD gate with a 20ms ramp in the worklet to prevent clicks on open/close transitions. Expose positive and negative speech probability thresholds as user-adjustable settings (defaults 0.5/0.35). Sliders with restore-defaults button added to the VAD section of the audio settings tab. Co-Authored-By: Claude Sonnet 4.6 --- src/livekit/NoiseGateProcessor.worklet.ts | 13 ++++- src/livekit/SileroVADGate.ts | 39 +++++++------- src/settings/SettingsModal.tsx | 52 +++++++++++++++++++ src/settings/settings.ts | 4 ++ .../CallViewModel/localMember/Publisher.ts | 14 ++++- 5 files changed, 101 insertions(+), 21 deletions(-) diff --git a/src/livekit/NoiseGateProcessor.worklet.ts b/src/livekit/NoiseGateProcessor.worklet.ts index ae624fb4..6c1981b7 100644 --- a/src/livekit/NoiseGateProcessor.worklet.ts +++ b/src/livekit/NoiseGateProcessor.worklet.ts @@ -72,6 +72,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor { // VAD gate state (controlled externally via port message) private vadGateOpen = true; // starts open until VAD sends its first decision + // Smooth ramp so the VAD gate fades rather than cutting instantly (~20ms) + private vadAttenuation = 1.0; + private readonly vadRampRate = 1.0 / (0.02 * sampleRate); private logCounter = 0; @@ -160,7 +163,15 @@ class NoiseGateProcessor extends AudioWorkletProcessor { } } - const gain = this.gateAttenuation * transientGain * (this.vadGateOpen ? 1.0 : 0.0); + // Ramp VAD attenuation toward target to avoid clicks on gate open/close + const vadTarget = this.vadGateOpen ? 1.0 : 0.0; + if (this.vadAttenuation < vadTarget) { + this.vadAttenuation = Math.min(vadTarget, this.vadAttenuation + this.vadRampRate); + } else if (this.vadAttenuation > vadTarget) { + this.vadAttenuation = Math.max(vadTarget, this.vadAttenuation - this.vadRampRate); + } + + const gain = this.gateAttenuation * transientGain * this.vadAttenuation; for (let c = 0; c < output.length; c++) { const inCh = input[c] ?? input[0]; diff --git a/src/livekit/SileroVADGate.ts b/src/livekit/SileroVADGate.ts index bfcbd15b..101f8f1f 100644 --- a/src/livekit/SileroVADGate.ts +++ b/src/livekit/SileroVADGate.ts @@ -14,23 +14,23 @@ const log = logger.getChild("[SileroVADGate]"); const VAD_BASE_PATH = "/vad/"; -// Speech probability above this value opens the gate; below it closes it. -// vad-web's defaults are positiveSpeechThreshold=0.5, negativeSpeechThreshold=0.35. -// We use those same values so the gate tracks the model's own speech/silence logic. -const SPEECH_OPEN_THRESHOLD = 0.5; -const SPEECH_CLOSE_THRESHOLD = 0.35; +export interface SileroVADGateOptions { + positiveThreshold: number; // open gate when isSpeech >= this (0–1) + negativeThreshold: number; // close gate when isSpeech < this (0–1) +} /** * Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability * decisions into the NoiseGateTransformer's VAD gate. * * Uses onFrameProcessed (fires every ~96ms) rather than the segment-level - * onSpeechStart/onSpeechEnd callbacks. The segment callbacks only fire at - * speech segment boundaries — with purely non-speech noise, onSpeechEnd - * never fires and the gate stays open. Per-frame probability control fixes - * this: the gate closes on the first silent frame. + * onSpeechStart/onSpeechEnd callbacks — those only fire at segment boundaries + * so non-speech noise never triggers onSpeechEnd, keeping the gate open. + * Per-frame probability control with hysteresis fixes this. * - * The gate starts closed (silent) and opens only once the VAD confirms speech. + * The gate starts OPEN (fail-safe): audio flows immediately and the model + * closes it on the first silent frame. A failed model load therefore + * degrades gracefully instead of permanently muting the user. */ export class SileroVADGate { /** Called each time the gate transitions to open (speech detected). */ @@ -41,11 +41,13 @@ export class SileroVADGate { private vad: MicVAD | null = null; private readonly stream: MediaStream; private readonly audioContext: AudioContext; - private gateOpen = false; + private options: SileroVADGateOptions; + private gateOpen = true; - public constructor(stream: MediaStream, audioContext: AudioContext) { + public constructor(stream: MediaStream, audioContext: AudioContext, options: SileroVADGateOptions) { this.stream = stream; this.audioContext = audioContext; + this.options = options; } public async start(): Promise { @@ -72,11 +74,11 @@ export class SileroVADGate { resumeStream: async (): Promise => stream, onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => { const p = probabilities.isSpeech; - if (!this.gateOpen && p >= SPEECH_OPEN_THRESHOLD) { + if (!this.gateOpen && p >= this.options.positiveThreshold) { this.gateOpen = true; log.debug("gate open (isSpeech=", p, ")"); this.onOpen(); - } else if (this.gateOpen && p < SPEECH_CLOSE_THRESHOLD) { + } else if (this.gateOpen && p < this.options.negativeThreshold) { this.gateOpen = false; log.debug("gate close (isSpeech=", p, ")"); this.onClose(); @@ -88,15 +90,14 @@ export class SileroVADGate { onSpeechRealStart: (): void => {}, }); - // Gate starts OPEN so audio flows immediately. The first silence frame - // will close it. This also means a failed model load degrades gracefully - // (audio still flows) rather than permanently muting the user. - this.gateOpen = true; - await this.vad.start(); log.info("MicVAD started"); } + public updateOptions(options: SileroVADGateOptions): void { + this.options = options; + } + public async destroy(): Promise { if (this.vad) { await this.vad.destroy(); diff --git a/src/settings/SettingsModal.tsx b/src/settings/SettingsModal.tsx index b01ee45e..e01139a3 100644 --- a/src/settings/SettingsModal.tsx +++ b/src/settings/SettingsModal.tsx @@ -33,6 +33,8 @@ import { transientThreshold as transientThresholdSetting, transientRelease as transientReleaseSetting, vadEnabled as vadEnabledSetting, + vadPositiveThreshold as vadPositiveThresholdSetting, + vadNegativeThreshold as vadNegativeThresholdSetting, } from "./settings"; import { PreferencesSettingsTab } from "./PreferencesSettingsTab"; import { Slider } from "../Slider"; @@ -132,6 +134,10 @@ export const SettingsModal: FC = ({ // Voice activity detection const [vadActive, setVadActive] = useSetting(vadEnabledSetting); + const [vadPositiveThreshold, setVadPositiveThreshold] = useSetting(vadPositiveThresholdSetting); + const [vadPositiveThresholdRaw, setVadPositiveThresholdRaw] = useState(vadPositiveThreshold); + const [vadNegativeThreshold, setVadNegativeThreshold] = useSetting(vadNegativeThresholdSetting); + const [vadNegativeThresholdRaw, setVadNegativeThresholdRaw] = useState(vadNegativeThreshold); // Transient suppressor settings const [transientEnabled, setTransientEnabled] = useSetting(transientSuppressorEnabledSetting); @@ -338,6 +344,52 @@ export const SettingsModal: FC = ({ disabled={!noiseGateEnabled} /> + {vadActive && ( + <> +
+ Open threshold: {Math.round(vadPositiveThresholdRaw * 100)}% +

How confident the model must be before opening the gate.

+ +
+
+ Close threshold: {Math.round(vadNegativeThresholdRaw * 100)}% +

How low the probability must drop before closing the gate.

+ +
+
+ +
+ + )}
("noise-gate-hold", 200); export const noiseGateRelease = new Setting("noise-gate-release", 150); export const vadEnabled = new Setting("vad-enabled", false); +// Probability above which the VAD opens the gate (0–1) +export const vadPositiveThreshold = new Setting("vad-positive-threshold", 0.5); +// Probability below which the VAD closes the gate (0–1) +export const vadNegativeThreshold = new Setting("vad-negative-threshold", 0.35); export const transientSuppressorEnabled = new Setting( "transient-suppressor-enabled", diff --git a/src/state/CallViewModel/localMember/Publisher.ts b/src/state/CallViewModel/localMember/Publisher.ts index 77e8e419..3b6b8b8a 100644 --- a/src/state/CallViewModel/localMember/Publisher.ts +++ b/src/state/CallViewModel/localMember/Publisher.ts @@ -42,6 +42,8 @@ import { transientThreshold, transientRelease, vadEnabled, + vadPositiveThreshold, + vadNegativeThreshold, } from "../../../settings/settings.ts"; import { type NoiseGateParams, @@ -467,7 +469,10 @@ export class Publisher { return; } const stream = new MediaStream([rawTrack]); - vadGate = new SileroVADGate(stream, ctx); + vadGate = new SileroVADGate(stream, ctx, { + positiveThreshold: vadPositiveThreshold.getValue(), + negativeThreshold: vadNegativeThreshold.getValue(), + }); vadGate.onOpen = (): void => transformer?.setVADOpen(true); vadGate.onClose = (): void => transformer?.setVADOpen(false); vadGate.start().catch((e: unknown) => { @@ -525,6 +530,13 @@ export class Publisher { } }); + // Push VAD threshold changes to the live gate without recreating it. + combineLatest([vadPositiveThreshold.value$, vadNegativeThreshold.value$]) + .pipe(scope.bind()) + .subscribe(([positiveThreshold, negativeThreshold]) => { + vadGate?.updateOptions({ positiveThreshold, negativeThreshold }); + }); + // Push param changes to the live worklet without recreating the processor. combineLatest([ noiseGateThreshold.value$,