feat: add VAD threshold controls and smooth gate ramp
Replace the hard 0/1 VAD gate with a 20 ms ramp in the worklet to prevent clicks on open/close transitions. Expose the positive (gate-open) and negative (gate-close) speech-probability thresholds as user-adjustable settings (defaults 0.5/0.35). Add sliders and a restore-defaults button to the VAD section of the audio settings tab.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -72,6 +72,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
|||||||
|
|
||||||
// VAD gate state (controlled externally via port message)
|
// VAD gate state (controlled externally via port message)
|
||||||
private vadGateOpen = true; // starts open until VAD sends its first decision
|
private vadGateOpen = true; // starts open until VAD sends its first decision
|
||||||
|
// Smooth ramp so the VAD gate fades rather than cutting instantly
// (~20 ms for a full 0-to-1 swing, provided the ramp step is applied once per sample)
|
||||||
|
private vadAttenuation = 1.0;
|
||||||
|
private readonly vadRampRate = 1.0 / (0.02 * sampleRate);
|
||||||
|
|
||||||
private logCounter = 0;
|
private logCounter = 0;
|
||||||
|
|
||||||
@@ -160,7 +163,15 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const gain = this.gateAttenuation * transientGain * (this.vadGateOpen ? 1.0 : 0.0);
|
// Ramp VAD attenuation toward target to avoid clicks on gate open/close
|
||||||
|
const vadTarget = this.vadGateOpen ? 1.0 : 0.0;
|
||||||
|
if (this.vadAttenuation < vadTarget) {
|
||||||
|
this.vadAttenuation = Math.min(vadTarget, this.vadAttenuation + this.vadRampRate);
|
||||||
|
} else if (this.vadAttenuation > vadTarget) {
|
||||||
|
this.vadAttenuation = Math.max(vadTarget, this.vadAttenuation - this.vadRampRate);
|
||||||
|
}
|
||||||
|
|
||||||
|
const gain = this.gateAttenuation * transientGain * this.vadAttenuation;
|
||||||
|
|
||||||
for (let c = 0; c < output.length; c++) {
|
for (let c = 0; c < output.length; c++) {
|
||||||
const inCh = input[c] ?? input[0];
|
const inCh = input[c] ?? input[0];
|
||||||
|
|||||||
@@ -14,23 +14,23 @@ const log = logger.getChild("[SileroVADGate]");
|
|||||||
|
|
||||||
const VAD_BASE_PATH = "/vad/";
|
const VAD_BASE_PATH = "/vad/";
|
||||||
|
|
||||||
// Speech probability above this value opens the gate; below it closes it.
|
export interface SileroVADGateOptions {
|
||||||
// vad-web's defaults are positiveSpeechThreshold=0.5, negativeSpeechThreshold=0.35.
|
positiveThreshold: number; // open gate when isSpeech >= this (0–1)
|
||||||
// We use those same values so the gate tracks the model's own speech/silence logic.
|
negativeThreshold: number; // close gate when isSpeech < this (0–1)
|
||||||
const SPEECH_OPEN_THRESHOLD = 0.5;
|
}
|
||||||
const SPEECH_CLOSE_THRESHOLD = 0.35;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability
|
* Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability
|
||||||
* decisions into the NoiseGateTransformer's VAD gate.
|
* decisions into the NoiseGateTransformer's VAD gate.
|
||||||
*
|
*
|
||||||
* Uses onFrameProcessed (fires every ~96ms) rather than the segment-level
|
* Uses onFrameProcessed (fires every ~96ms) rather than the segment-level
|
||||||
* onSpeechStart/onSpeechEnd callbacks. The segment callbacks only fire at
|
* onSpeechStart/onSpeechEnd callbacks — those only fire at segment boundaries
|
||||||
* speech segment boundaries — with purely non-speech noise, onSpeechEnd
|
* so non-speech noise never triggers onSpeechEnd, keeping the gate open.
|
||||||
* never fires and the gate stays open. Per-frame probability control fixes
|
* Per-frame probability control with hysteresis fixes this.
|
||||||
* this: the gate closes on the first silent frame.
|
|
||||||
*
|
*
|
||||||
* The gate starts closed (silent) and opens only once the VAD confirms speech.
|
* The gate starts OPEN (fail-safe): audio flows immediately and the model
|
||||||
|
* closes it on the first silent frame. A failed model load therefore
|
||||||
|
* degrades gracefully instead of permanently muting the user.
|
||||||
*/
|
*/
|
||||||
export class SileroVADGate {
|
export class SileroVADGate {
|
||||||
/** Called each time the gate transitions to open (speech detected). */
|
/** Called each time the gate transitions to open (speech detected). */
|
||||||
@@ -41,11 +41,13 @@ export class SileroVADGate {
|
|||||||
private vad: MicVAD | null = null;
|
private vad: MicVAD | null = null;
|
||||||
private readonly stream: MediaStream;
|
private readonly stream: MediaStream;
|
||||||
private readonly audioContext: AudioContext;
|
private readonly audioContext: AudioContext;
|
||||||
private gateOpen = false;
|
private options: SileroVADGateOptions;
|
||||||
|
private gateOpen = true;
|
||||||
|
|
||||||
public constructor(stream: MediaStream, audioContext: AudioContext) {
|
public constructor(stream: MediaStream, audioContext: AudioContext, options: SileroVADGateOptions) {
|
||||||
this.stream = stream;
|
this.stream = stream;
|
||||||
this.audioContext = audioContext;
|
this.audioContext = audioContext;
|
||||||
|
this.options = options;
|
||||||
}
|
}
|
||||||
|
|
||||||
public async start(): Promise<void> {
|
public async start(): Promise<void> {
|
||||||
@@ -72,11 +74,11 @@ export class SileroVADGate {
|
|||||||
resumeStream: async (): Promise<MediaStream> => stream,
|
resumeStream: async (): Promise<MediaStream> => stream,
|
||||||
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
|
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
|
||||||
const p = probabilities.isSpeech;
|
const p = probabilities.isSpeech;
|
||||||
if (!this.gateOpen && p >= SPEECH_OPEN_THRESHOLD) {
|
if (!this.gateOpen && p >= this.options.positiveThreshold) {
|
||||||
this.gateOpen = true;
|
this.gateOpen = true;
|
||||||
log.debug("gate open (isSpeech=", p, ")");
|
log.debug("gate open (isSpeech=", p, ")");
|
||||||
this.onOpen();
|
this.onOpen();
|
||||||
} else if (this.gateOpen && p < SPEECH_CLOSE_THRESHOLD) {
|
} else if (this.gateOpen && p < this.options.negativeThreshold) {
|
||||||
this.gateOpen = false;
|
this.gateOpen = false;
|
||||||
log.debug("gate close (isSpeech=", p, ")");
|
log.debug("gate close (isSpeech=", p, ")");
|
||||||
this.onClose();
|
this.onClose();
|
||||||
@@ -88,15 +90,14 @@ export class SileroVADGate {
|
|||||||
onSpeechRealStart: (): void => {},
|
onSpeechRealStart: (): void => {},
|
||||||
});
|
});
|
||||||
|
|
||||||
// Gate starts OPEN so audio flows immediately. The first silence frame
|
|
||||||
// will close it. This also means a failed model load degrades gracefully
|
|
||||||
// (audio still flows) rather than permanently muting the user.
|
|
||||||
this.gateOpen = true;
|
|
||||||
|
|
||||||
await this.vad.start();
|
await this.vad.start();
|
||||||
log.info("MicVAD started");
|
log.info("MicVAD started");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public updateOptions(options: SileroVADGateOptions): void {
|
||||||
|
this.options = options;
|
||||||
|
}
|
||||||
|
|
||||||
public async destroy(): Promise<void> {
|
public async destroy(): Promise<void> {
|
||||||
if (this.vad) {
|
if (this.vad) {
|
||||||
await this.vad.destroy();
|
await this.vad.destroy();
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ import {
|
|||||||
transientThreshold as transientThresholdSetting,
|
transientThreshold as transientThresholdSetting,
|
||||||
transientRelease as transientReleaseSetting,
|
transientRelease as transientReleaseSetting,
|
||||||
vadEnabled as vadEnabledSetting,
|
vadEnabled as vadEnabledSetting,
|
||||||
|
vadPositiveThreshold as vadPositiveThresholdSetting,
|
||||||
|
vadNegativeThreshold as vadNegativeThresholdSetting,
|
||||||
} from "./settings";
|
} from "./settings";
|
||||||
import { PreferencesSettingsTab } from "./PreferencesSettingsTab";
|
import { PreferencesSettingsTab } from "./PreferencesSettingsTab";
|
||||||
import { Slider } from "../Slider";
|
import { Slider } from "../Slider";
|
||||||
@@ -132,6 +134,10 @@ export const SettingsModal: FC<Props> = ({
|
|||||||
|
|
||||||
// Voice activity detection
|
// Voice activity detection
|
||||||
const [vadActive, setVadActive] = useSetting(vadEnabledSetting);
|
const [vadActive, setVadActive] = useSetting(vadEnabledSetting);
|
||||||
|
const [vadPositiveThreshold, setVadPositiveThreshold] = useSetting(vadPositiveThresholdSetting);
|
||||||
|
const [vadPositiveThresholdRaw, setVadPositiveThresholdRaw] = useState(vadPositiveThreshold);
|
||||||
|
const [vadNegativeThreshold, setVadNegativeThreshold] = useSetting(vadNegativeThresholdSetting);
|
||||||
|
const [vadNegativeThresholdRaw, setVadNegativeThresholdRaw] = useState(vadNegativeThreshold);
|
||||||
|
|
||||||
// Transient suppressor settings
|
// Transient suppressor settings
|
||||||
const [transientEnabled, setTransientEnabled] = useSetting(transientSuppressorEnabledSetting);
|
const [transientEnabled, setTransientEnabled] = useSetting(transientSuppressorEnabledSetting);
|
||||||
@@ -338,6 +344,52 @@ export const SettingsModal: FC<Props> = ({
|
|||||||
disabled={!noiseGateEnabled}
|
disabled={!noiseGateEnabled}
|
||||||
/>
|
/>
|
||||||
</FieldRow>
|
</FieldRow>
|
||||||
|
{vadActive && (
|
||||||
|
<>
|
||||||
|
<div className={`${styles.volumeSlider} ${styles.thresholdSlider}`}>
|
||||||
|
<span className={styles.sliderLabel}>Open threshold: {Math.round(vadPositiveThresholdRaw * 100)}%</span>
|
||||||
|
<p>How confident the model must be before opening the gate.</p>
|
||||||
|
<Slider
|
||||||
|
label="VAD open threshold"
|
||||||
|
value={vadPositiveThresholdRaw}
|
||||||
|
onValueChange={setVadPositiveThresholdRaw}
|
||||||
|
onValueCommit={setVadPositiveThreshold}
|
||||||
|
min={0.1}
|
||||||
|
max={0.9}
|
||||||
|
step={0.05}
|
||||||
|
tooltip={false}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<div className={styles.volumeSlider}>
|
||||||
|
<span className={styles.sliderLabel}>Close threshold: {Math.round(vadNegativeThresholdRaw * 100)}%</span>
|
||||||
|
<p>How low the probability must drop before closing the gate.</p>
|
||||||
|
<Slider
|
||||||
|
label="VAD close threshold"
|
||||||
|
value={vadNegativeThresholdRaw}
|
||||||
|
onValueChange={setVadNegativeThresholdRaw}
|
||||||
|
onValueCommit={setVadNegativeThreshold}
|
||||||
|
min={0.05}
|
||||||
|
max={0.7}
|
||||||
|
step={0.05}
|
||||||
|
tooltip={false}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
<div className={styles.restoreDefaults}>
|
||||||
|
<Button
|
||||||
|
kind="secondary"
|
||||||
|
size="sm"
|
||||||
|
onClick={(): void => {
|
||||||
|
const pos = vadPositiveThresholdSetting.defaultValue;
|
||||||
|
const neg = vadNegativeThresholdSetting.defaultValue;
|
||||||
|
setVadPositiveThreshold(pos); setVadPositiveThresholdRaw(pos);
|
||||||
|
setVadNegativeThreshold(neg); setVadNegativeThresholdRaw(neg);
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
Restore defaults
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
</>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
<div className={styles.noiseGateSection}>
|
<div className={styles.noiseGateSection}>
|
||||||
<Heading
|
<Heading
|
||||||
|
|||||||
@@ -146,6 +146,10 @@ export const noiseGateHold = new Setting<number>("noise-gate-hold", 200);
|
|||||||
export const noiseGateRelease = new Setting<number>("noise-gate-release", 150);
|
export const noiseGateRelease = new Setting<number>("noise-gate-release", 150);
|
||||||
|
|
||||||
export const vadEnabled = new Setting<boolean>("vad-enabled", false);
|
export const vadEnabled = new Setting<boolean>("vad-enabled", false);
|
||||||
|
// Probability at or above which the VAD opens the gate (0–1)
|
||||||
|
export const vadPositiveThreshold = new Setting<number>("vad-positive-threshold", 0.5);
|
||||||
|
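// Probability below which the VAD closes the gate (0–1). Keep this at or below
// vad-positive-threshold: if it is higher, any probability between the two values
// reopens a closed gate and re-closes an open one, so the gate flips on every frame.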
|
||||||
|
export const vadNegativeThreshold = new Setting<number>("vad-negative-threshold", 0.35);
|
||||||
|
|
||||||
export const transientSuppressorEnabled = new Setting<boolean>(
|
export const transientSuppressorEnabled = new Setting<boolean>(
|
||||||
"transient-suppressor-enabled",
|
"transient-suppressor-enabled",
|
||||||
|
|||||||
@@ -42,6 +42,8 @@ import {
|
|||||||
transientThreshold,
|
transientThreshold,
|
||||||
transientRelease,
|
transientRelease,
|
||||||
vadEnabled,
|
vadEnabled,
|
||||||
|
vadPositiveThreshold,
|
||||||
|
vadNegativeThreshold,
|
||||||
} from "../../../settings/settings.ts";
|
} from "../../../settings/settings.ts";
|
||||||
import {
|
import {
|
||||||
type NoiseGateParams,
|
type NoiseGateParams,
|
||||||
@@ -467,7 +469,10 @@ export class Publisher {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const stream = new MediaStream([rawTrack]);
|
const stream = new MediaStream([rawTrack]);
|
||||||
vadGate = new SileroVADGate(stream, ctx);
|
vadGate = new SileroVADGate(stream, ctx, {
|
||||||
|
positiveThreshold: vadPositiveThreshold.getValue(),
|
||||||
|
negativeThreshold: vadNegativeThreshold.getValue(),
|
||||||
|
});
|
||||||
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
|
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
|
||||||
vadGate.onClose = (): void => transformer?.setVADOpen(false);
|
vadGate.onClose = (): void => transformer?.setVADOpen(false);
|
||||||
vadGate.start().catch((e: unknown) => {
|
vadGate.start().catch((e: unknown) => {
|
||||||
@@ -525,6 +530,13 @@ export class Publisher {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Push VAD threshold changes to the live gate without recreating it.
|
||||||
|
combineLatest([vadPositiveThreshold.value$, vadNegativeThreshold.value$])
|
||||||
|
.pipe(scope.bind())
|
||||||
|
.subscribe(([positiveThreshold, negativeThreshold]) => {
|
||||||
|
vadGate?.updateOptions({ positiveThreshold, negativeThreshold });
|
||||||
|
});
|
||||||
|
|
||||||
// Push param changes to the live worklet without recreating the processor.
|
// Push param changes to the live worklet without recreating the processor.
|
||||||
combineLatest([
|
combineLatest([
|
||||||
noiseGateThreshold.value$,
|
noiseGateThreshold.value$,
|
||||||
|
|||||||
Reference in New Issue
Block a user