feat: add VAD threshold controls and smooth gate ramp
Replace the hard 0/1 VAD gate with a 20ms ramp in the worklet to prevent clicks on open/close transitions. Expose positive and negative speech probability thresholds as user-adjustable settings (defaults 0.5/0.35). Add sliders and a restore-defaults button to the VAD section of the audio settings tab.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -72,6 +72,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
|
||||
// VAD gate state (controlled externally via port message)
|
||||
private vadGateOpen = true; // starts open until VAD sends its first decision
|
||||
// Smooth ramp so the VAD gate fades rather than cutting instantly (~20ms)
|
||||
private vadAttenuation = 1.0;
|
||||
private readonly vadRampRate = 1.0 / (0.02 * sampleRate);
|
||||
|
||||
private logCounter = 0;
|
||||
|
||||
@@ -160,7 +163,15 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
const gain = this.gateAttenuation * transientGain * (this.vadGateOpen ? 1.0 : 0.0);
|
||||
// Ramp VAD attenuation toward target to avoid clicks on gate open/close
|
||||
const vadTarget = this.vadGateOpen ? 1.0 : 0.0;
|
||||
if (this.vadAttenuation < vadTarget) {
|
||||
this.vadAttenuation = Math.min(vadTarget, this.vadAttenuation + this.vadRampRate);
|
||||
} else if (this.vadAttenuation > vadTarget) {
|
||||
this.vadAttenuation = Math.max(vadTarget, this.vadAttenuation - this.vadRampRate);
|
||||
}
|
||||
|
||||
const gain = this.gateAttenuation * transientGain * this.vadAttenuation;
|
||||
|
||||
for (let c = 0; c < output.length; c++) {
|
||||
const inCh = input[c] ?? input[0];
|
||||
|
||||
@@ -14,23 +14,23 @@ const log = logger.getChild("[SileroVADGate]");
|
||||
|
||||
const VAD_BASE_PATH = "/vad/";
|
||||
|
||||
// Speech probability above this value opens the gate; below it closes it.
|
||||
// vad-web's defaults are positiveSpeechThreshold=0.5, negativeSpeechThreshold=0.35.
|
||||
// We use those same values so the gate tracks the model's own speech/silence logic.
|
||||
const SPEECH_OPEN_THRESHOLD = 0.5;
|
||||
const SPEECH_CLOSE_THRESHOLD = 0.35;
|
||||
/**
 * Hysteresis thresholds applied to the per-frame speech probability
 * (`isSpeech`, in the range 0–1) reported by the VAD.
 *
 * A closed gate opens when `isSpeech >= positiveThreshold`; an open gate
 * closes when `isSpeech < negativeThreshold`. Keep `negativeThreshold` at or
 * below `positiveThreshold`, otherwise a steady probability lying between the
 * two values satisfies both conditions alternately and the gate toggles on
 * every frame.
 */
export interface SileroVADGateOptions {
|
||||
positiveThreshold: number; // gate opens once isSpeech is at or above this value (0–1)
|
||||
negativeThreshold: number; // gate closes once isSpeech falls below this value (0–1)
|
||||
}
|
||||
|
||||
/**
|
||||
* Wraps @ricky0123/vad-web's MicVAD to feed per-frame speech probability
|
||||
* decisions into the NoiseGateTransformer's VAD gate.
|
||||
*
|
||||
* Uses onFrameProcessed (fires every ~96ms) rather than the segment-level
|
||||
* onSpeechStart/onSpeechEnd callbacks. The segment callbacks only fire at
|
||||
* speech segment boundaries — with purely non-speech noise, onSpeechEnd
|
||||
* never fires and the gate stays open. Per-frame probability control fixes
|
||||
* this: the gate closes on the first silent frame.
|
||||
* onSpeechStart/onSpeechEnd callbacks — those only fire at segment boundaries
|
||||
* so non-speech noise never triggers onSpeechEnd, keeping the gate open.
|
||||
* Per-frame probability control with hysteresis fixes this.
|
||||
*
|
||||
* The gate starts closed (silent) and opens only once the VAD confirms speech.
|
||||
* The gate starts OPEN (fail-safe): audio flows immediately and the model
|
||||
* closes it on the first silent frame. A failed model load therefore
|
||||
* degrades gracefully instead of permanently muting the user.
|
||||
*/
|
||||
export class SileroVADGate {
|
||||
/** Called each time the gate transitions to open (speech detected). */
|
||||
@@ -41,11 +41,13 @@ export class SileroVADGate {
|
||||
private vad: MicVAD | null = null;
|
||||
private readonly stream: MediaStream;
|
||||
private readonly audioContext: AudioContext;
|
||||
private gateOpen = false;
|
||||
private options: SileroVADGateOptions;
|
||||
private gateOpen = true;
|
||||
|
||||
public constructor(stream: MediaStream, audioContext: AudioContext) {
|
||||
public constructor(stream: MediaStream, audioContext: AudioContext, options: SileroVADGateOptions) {
|
||||
this.stream = stream;
|
||||
this.audioContext = audioContext;
|
||||
this.options = options;
|
||||
}
|
||||
|
||||
public async start(): Promise<void> {
|
||||
@@ -72,11 +74,11 @@ export class SileroVADGate {
|
||||
resumeStream: async (): Promise<MediaStream> => stream,
|
||||
onFrameProcessed: (probabilities: { isSpeech: number; notSpeech: number }): void => {
|
||||
const p = probabilities.isSpeech;
|
||||
if (!this.gateOpen && p >= SPEECH_OPEN_THRESHOLD) {
|
||||
if (!this.gateOpen && p >= this.options.positiveThreshold) {
|
||||
this.gateOpen = true;
|
||||
log.debug("gate open (isSpeech=", p, ")");
|
||||
this.onOpen();
|
||||
} else if (this.gateOpen && p < SPEECH_CLOSE_THRESHOLD) {
|
||||
} else if (this.gateOpen && p < this.options.negativeThreshold) {
|
||||
this.gateOpen = false;
|
||||
log.debug("gate close (isSpeech=", p, ")");
|
||||
this.onClose();
|
||||
@@ -88,15 +90,14 @@ export class SileroVADGate {
|
||||
onSpeechRealStart: (): void => {},
|
||||
});
|
||||
|
||||
// Gate starts OPEN so audio flows immediately. The first silence frame
|
||||
// will close it. This also means a failed model load degrades gracefully
|
||||
// (audio still flows) rather than permanently muting the user.
|
||||
this.gateOpen = true;
|
||||
|
||||
await this.vad.start();
|
||||
log.info("MicVAD started");
|
||||
}
|
||||
|
||||
/**
 * Replaces the thresholds used by the running gate without recreating the
 * VAD. The new values apply from the next processed frame, because the
 * frame callback reads `this.options` on every invocation.
 *
 * The close threshold is capped at the open threshold. With the close
 * threshold above the open one, any probability between the two would open
 * the gate on one frame and close it on the next, producing audible chatter.
 *
 * @param options - New open (positive) and close (negative) thresholds, 0–1.
 */
public updateOptions(options: SileroVADGateOptions): void {
  const { positiveThreshold, negativeThreshold } = options;
  // Store a fresh object so later mutation of the caller's object cannot
  // change the live thresholds behind our back.
  this.options = {
    positiveThreshold,
    negativeThreshold: Math.min(negativeThreshold, positiveThreshold),
  };
}
|
||||
|
||||
public async destroy(): Promise<void> {
|
||||
if (this.vad) {
|
||||
await this.vad.destroy();
|
||||
|
||||
@@ -33,6 +33,8 @@ import {
|
||||
transientThreshold as transientThresholdSetting,
|
||||
transientRelease as transientReleaseSetting,
|
||||
vadEnabled as vadEnabledSetting,
|
||||
vadPositiveThreshold as vadPositiveThresholdSetting,
|
||||
vadNegativeThreshold as vadNegativeThresholdSetting,
|
||||
} from "./settings";
|
||||
import { PreferencesSettingsTab } from "./PreferencesSettingsTab";
|
||||
import { Slider } from "../Slider";
|
||||
@@ -132,6 +134,10 @@ export const SettingsModal: FC<Props> = ({
|
||||
|
||||
// Voice activity detection
|
||||
const [vadActive, setVadActive] = useSetting(vadEnabledSetting);
|
||||
const [vadPositiveThreshold, setVadPositiveThreshold] = useSetting(vadPositiveThresholdSetting);
|
||||
const [vadPositiveThresholdRaw, setVadPositiveThresholdRaw] = useState(vadPositiveThreshold);
|
||||
const [vadNegativeThreshold, setVadNegativeThreshold] = useSetting(vadNegativeThresholdSetting);
|
||||
const [vadNegativeThresholdRaw, setVadNegativeThresholdRaw] = useState(vadNegativeThreshold);
|
||||
|
||||
// Transient suppressor settings
|
||||
const [transientEnabled, setTransientEnabled] = useSetting(transientSuppressorEnabledSetting);
|
||||
@@ -338,6 +344,52 @@ export const SettingsModal: FC<Props> = ({
|
||||
disabled={!noiseGateEnabled}
|
||||
/>
|
||||
</FieldRow>
|
||||
{vadActive && (
|
||||
<>
|
||||
<div className={`${styles.volumeSlider} ${styles.thresholdSlider}`}>
|
||||
<span className={styles.sliderLabel}>Open threshold: {Math.round(vadPositiveThresholdRaw * 100)}%</span>
|
||||
<p>How confident the model must be before opening the gate.</p>
|
||||
<Slider
|
||||
label="VAD open threshold"
|
||||
value={vadPositiveThresholdRaw}
|
||||
onValueChange={setVadPositiveThresholdRaw}
|
||||
onValueCommit={setVadPositiveThreshold}
|
||||
min={0.1}
|
||||
max={0.9}
|
||||
step={0.05}
|
||||
tooltip={false}
|
||||
/>
|
||||
</div>
|
||||
<div className={styles.volumeSlider}>
|
||||
<span className={styles.sliderLabel}>Close threshold: {Math.round(vadNegativeThresholdRaw * 100)}%</span>
|
||||
<p>How low the probability must drop before closing the gate.</p>
|
||||
<Slider
|
||||
label="VAD close threshold"
|
||||
value={vadNegativeThresholdRaw}
|
||||
onValueChange={setVadNegativeThresholdRaw}
|
||||
onValueCommit={setVadNegativeThreshold}
|
||||
min={0.05}
|
||||
max={0.7}
|
||||
step={0.05}
|
||||
tooltip={false}
|
||||
/>
|
||||
</div>
|
||||
<div className={styles.restoreDefaults}>
|
||||
<Button
|
||||
kind="secondary"
|
||||
size="sm"
|
||||
onClick={(): void => {
|
||||
const pos = vadPositiveThresholdSetting.defaultValue;
|
||||
const neg = vadNegativeThresholdSetting.defaultValue;
|
||||
setVadPositiveThreshold(pos); setVadPositiveThresholdRaw(pos);
|
||||
setVadNegativeThreshold(neg); setVadNegativeThresholdRaw(neg);
|
||||
}}
|
||||
>
|
||||
Restore defaults
|
||||
</Button>
|
||||
</div>
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
<div className={styles.noiseGateSection}>
|
||||
<Heading
|
||||
|
||||
@@ -146,6 +146,10 @@ export const noiseGateHold = new Setting<number>("noise-gate-hold", 200);
|
||||
export const noiseGateRelease = new Setting<number>("noise-gate-release", 150);
|
||||
|
||||
export const vadEnabled = new Setting<boolean>("vad-enabled", false);
|
||||
// Probability at or above which the VAD opens the gate (0–1)
|
||||
export const vadPositiveThreshold = new Setting<number>("vad-positive-threshold", 0.5);
|
||||
// Probability below which the VAD closes the gate (0–1)
|
||||
export const vadNegativeThreshold = new Setting<number>("vad-negative-threshold", 0.35);
|
||||
|
||||
export const transientSuppressorEnabled = new Setting<boolean>(
|
||||
"transient-suppressor-enabled",
|
||||
|
||||
@@ -42,6 +42,8 @@ import {
|
||||
transientThreshold,
|
||||
transientRelease,
|
||||
vadEnabled,
|
||||
vadPositiveThreshold,
|
||||
vadNegativeThreshold,
|
||||
} from "../../../settings/settings.ts";
|
||||
import {
|
||||
type NoiseGateParams,
|
||||
@@ -467,7 +469,10 @@ export class Publisher {
|
||||
return;
|
||||
}
|
||||
const stream = new MediaStream([rawTrack]);
|
||||
vadGate = new SileroVADGate(stream, ctx);
|
||||
vadGate = new SileroVADGate(stream, ctx, {
|
||||
positiveThreshold: vadPositiveThreshold.getValue(),
|
||||
negativeThreshold: vadNegativeThreshold.getValue(),
|
||||
});
|
||||
vadGate.onOpen = (): void => transformer?.setVADOpen(true);
|
||||
vadGate.onClose = (): void => transformer?.setVADOpen(false);
|
||||
vadGate.start().catch((e: unknown) => {
|
||||
@@ -525,6 +530,13 @@ export class Publisher {
|
||||
}
|
||||
});
|
||||
|
||||
// Push VAD threshold changes to the live gate without recreating it.
|
||||
combineLatest([vadPositiveThreshold.value$, vadNegativeThreshold.value$])
|
||||
.pipe(scope.bind())
|
||||
.subscribe(([positiveThreshold, negativeThreshold]) => {
|
||||
vadGate?.updateOptions({ positiveThreshold, negativeThreshold });
|
||||
});
|
||||
|
||||
// Push param changes to the live worklet without recreating the processor.
|
||||
combineLatest([
|
||||
noiseGateThreshold.value$,
|
||||
|
||||
Reference in New Issue
Block a user