diff --git a/src/livekit/NoiseGateProcessor.worklet.ts b/src/livekit/NoiseGateProcessor.worklet.ts index 7b53d2cf..1abec665 100644 --- a/src/livekit/NoiseGateProcessor.worklet.ts +++ b/src/livekit/NoiseGateProcessor.worklet.ts @@ -182,8 +182,10 @@ class TenVADRuntime { * gain is instantly cut to 0 and releases over transientReleaseMs. * * TEN-VAD gate: accumulates audio with 3:1 decimation (48 kHz → 16 kHz), - * runs the TEN-VAD model synchronously every 256 samples (16 ms), and + * runs the TEN-VAD model synchronously every 160 samples (10 ms), and * controls vadGateOpen with hysteresis. No IPC round-trip required. + * Asymmetric ramp: 5 ms open (minimise speech onset masking), 20 ms close + * (de-click on silence). */ class NoiseGateProcessor extends AudioWorkletProcessor { // Noise gate state @@ -207,7 +209,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor { // VAD gate state private vadGateOpen = true; // starts open; TEN-VAD closes it on first silent frame private vadAttenuation = 1.0; - private readonly vadRampRate = 1.0 / (0.02 * sampleRate); + // Asymmetric ramp: fast open to avoid masking speech onset, slow close to de-click + private readonly vadOpenRampRate = 1.0 / (0.005 * sampleRate); // 5 ms + private readonly vadCloseRampRate = 1.0 / (0.02 * sampleRate); // 20 ms // TEN-VAD state private vadEnabled = false; @@ -218,7 +222,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor { private readonly decRatio = Math.max(1, Math.round(sampleRate / 16000)); private decPhase = 0; private decAcc = 0; - private readonly vadHopBuf = new Int16Array(256); + // 160-sample hop = 10 ms @ 16 kHz (minimum supported by TEN-VAD) + private readonly vadHopBuf = new Int16Array(160); private vadHopCount = 0; private logCounter = 0; @@ -234,8 +239,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor { | undefined; if (tenVadModule) { try { - // hopSize = 256 samples @ 16 kHz = 16 ms; threshold = 0.5 (overridden via params) - this.tenVadRuntime = new TenVADRuntime(tenVadModule, 256, 0.5); + // hopSize = 160 samples @ 16 kHz = 10 ms; threshold = 0.5 (overridden via params) + this.tenVadRuntime = new TenVADRuntime(tenVadModule, 160, 0.5); this.port.postMessage({ type: "log", msg: "[NoiseGate worklet] TEN-VAD runtime initialized, decRatio=" + this.decRatio, @@ -382,7 +387,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor { : (avg * 32767 + 0.5) | 0; this.vadHopBuf[this.vadHopCount++] = s16; - if (this.vadHopCount >= 256) { + if (this.vadHopCount >= 160) { this.vadHopCount = 0; const prob = this.tenVadRuntime.process(this.vadHopBuf); if (!this.vadGateOpen && prob >= this.vadPositiveThreshold) { @@ -394,17 +399,18 @@ class NoiseGateProcessor extends AudioWorkletProcessor { } } - // Ramp VAD attenuation toward target to avoid clicks + // Asymmetric ramp: fast open (5 ms) to minimise speech onset masking, + // slow close (20 ms) to de-click on silence transitions. const vadTarget = this.vadGateOpen ? 1.0 : 0.0; if (this.vadAttenuation < vadTarget) { this.vadAttenuation = Math.min( vadTarget, - this.vadAttenuation + this.vadRampRate, + this.vadAttenuation + this.vadOpenRampRate, ); } else if (this.vadAttenuation > vadTarget) { this.vadAttenuation = Math.max( vadTarget, - this.vadAttenuation - this.vadRampRate, + this.vadAttenuation - this.vadCloseRampRate, ); }