fix: use Silero v5 model for 32ms frames and lower default thresholds

The legacy model is hardcoded to 1536-sample frames (96 ms); v5 uses 512-sample frames (32 ms), reducing gate-open latency by 3×. Also lower the default positive/negative thresholds to 0.2/0.1 so the gate opens at the first sign of speech rather than waiting for high model confidence.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 00:02:17 -03:00
parent 859db651e0
commit aff09d0e49
3 changed files with 9 additions and 3 deletions
--- a/src/livekit/SileroVADGate.ts
+++ b/src/livekit/SileroVADGate.ts
@@ -61,7 +61,9 @@ export class SileroVADGate {
    ort.env.wasm.numThreads = 1;

    this.vad = await MicVAD.new({
-      ...getDefaultRealTimeVADOptions("legacy"),
+      // v5 model uses 512-sample frames (32ms) vs legacy's fixed 1536 (96ms),
+      // giving 3× faster gate response at the cost of a slightly larger model file.
+      ...getDefaultRealTimeVADOptions("v5"),
      audioContext,
      baseAssetPath: VAD_BASE_PATH,
      onnxWASMBasePath: VAD_BASE_PATH,
--- a/src/settings/settings.ts
+++ b/src/settings/settings.ts
@@ -147,9 +147,9 @@ export const noiseGateRelease = new Setting<number>("noise-gate-release", 150);

 export const vadEnabled = new Setting<boolean>("vad-enabled", false);
 // Probability above which the VAD opens the gate (0–1)
-export const vadPositiveThreshold = new Setting<number>("vad-positive-threshold", 0.5);
+export const vadPositiveThreshold = new Setting<number>("vad-positive-threshold", 0.2);
 // Probability below which the VAD closes the gate (0–1)
-export const vadNegativeThreshold = new Setting<number>("vad-negative-threshold", 0.35);
+export const vadNegativeThreshold = new Setting<number>("vad-negative-threshold", 0.1);

 export const transientSuppressorEnabled = new Setting<boolean>(
  "transient-suppressor-enabled",