perf: reduce TEN-VAD latency from 16 ms to 10 ms, asymmetric gate ramp

- Hop size 256 → 160 samples @ 16 kHz (the minimum TEN-VAD supports):
  VAD decision every 10 ms instead of 16 ms
- Asymmetric VAD ramp: 5 ms open (was 20 ms) to avoid masking speech onset;
  20 ms close retained to de-click transitions to silence

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mk
2026-03-24 07:44:47 -03:00
parent dc1f30b84f
commit 025735c490

View File

@@ -182,8 +182,10 @@ class TenVADRuntime {
* gain is instantly cut to 0 and releases over transientReleaseMs.
*
* TEN-VAD gate: accumulates audio with 3:1 decimation (48 kHz → 16 kHz),
* runs the TEN-VAD model synchronously every 256 samples (16 ms), and
* runs the TEN-VAD model synchronously every 160 samples (10 ms), and
* controls vadGateOpen with hysteresis. No IPC round-trip required.
* Asymmetric ramp: 5 ms open (minimise speech onset masking), 20 ms close
* (de-click on silence).
*/
class NoiseGateProcessor extends AudioWorkletProcessor {
// Noise gate state
@@ -207,7 +209,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
// VAD gate state
private vadGateOpen = true; // starts open; TEN-VAD closes it on first silent frame
private vadAttenuation = 1.0;
private readonly vadRampRate = 1.0 / (0.02 * sampleRate);
// Asymmetric ramp: fast open to avoid masking speech onset, slow close to de-click
private readonly vadOpenRampRate = 1.0 / (0.005 * sampleRate); // 5 ms
private readonly vadCloseRampRate = 1.0 / (0.02 * sampleRate); // 20 ms
// TEN-VAD state
private vadEnabled = false;
@@ -218,7 +222,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
private readonly decRatio = Math.max(1, Math.round(sampleRate / 16000));
private decPhase = 0;
private decAcc = 0;
private readonly vadHopBuf = new Int16Array(256);
// 160-sample hop = 10 ms @ 16 kHz (minimum supported by TEN-VAD)
private readonly vadHopBuf = new Int16Array(160);
private vadHopCount = 0;
private logCounter = 0;
@@ -234,8 +239,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
| undefined;
if (tenVadModule) {
try {
// hopSize = 256 samples @ 16 kHz = 16 ms; threshold = 0.5 (overridden via params)
this.tenVadRuntime = new TenVADRuntime(tenVadModule, 256, 0.5);
// hopSize = 160 samples @ 16 kHz = 10 ms; threshold = 0.5 (overridden via params)
this.tenVadRuntime = new TenVADRuntime(tenVadModule, 160, 0.5);
this.port.postMessage({
type: "log",
msg: "[NoiseGate worklet] TEN-VAD runtime initialized, decRatio=" + this.decRatio,
@@ -382,7 +387,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
: (avg * 32767 + 0.5) | 0;
this.vadHopBuf[this.vadHopCount++] = s16;
if (this.vadHopCount >= 256) {
if (this.vadHopCount >= 160) {
this.vadHopCount = 0;
const prob = this.tenVadRuntime.process(this.vadHopBuf);
if (!this.vadGateOpen && prob >= this.vadPositiveThreshold) {
@@ -394,17 +399,18 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
}
}
// Ramp VAD attenuation toward target to avoid clicks
// Asymmetric ramp: fast open (5 ms) to minimise speech onset masking,
// slow close (20 ms) to de-click on silence transitions.
const vadTarget = this.vadGateOpen ? 1.0 : 0.0;
if (this.vadAttenuation < vadTarget) {
this.vadAttenuation = Math.min(
vadTarget,
this.vadAttenuation + this.vadRampRate,
this.vadAttenuation + this.vadOpenRampRate,
);
} else if (this.vadAttenuation > vadTarget) {
this.vadAttenuation = Math.max(
vadTarget,
this.vadAttenuation - this.vadRampRate,
this.vadAttenuation - this.vadCloseRampRate,
);
}