perf: reduce TEN-VAD latency from 16 ms to 10 ms, asymmetric gate ramp

- Hop size 256 → 160 samples @ 16 kHz (160 samples is the minimum hop
  supported by TEN-VAD): a VAD decision every 10 ms instead of every 16 ms
- Asymmetric VAD gate ramp: 5 ms to open (was 20 ms) to avoid masking speech
  onset; the 20 ms close ramp is retained to de-click transitions to silence

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
mk
2026-03-24 07:44:47 -03:00
parent dc1f30b84f
commit 025735c490

View File

@@ -182,8 +182,10 @@ class TenVADRuntime {
* gain is instantly cut to 0 and releases over transientReleaseMs. * gain is instantly cut to 0 and releases over transientReleaseMs.
* *
* TEN-VAD gate: accumulates audio with 3:1 decimation (48 kHz → 16 kHz), * TEN-VAD gate: accumulates audio with 3:1 decimation (48 kHz → 16 kHz),
* runs the TEN-VAD model synchronously every 256 samples (16 ms), and * runs the TEN-VAD model synchronously every 160 samples (10 ms), and
* controls vadGateOpen with hysteresis. No IPC round-trip required. * controls vadGateOpen with hysteresis. No IPC round-trip required.
* Asymmetric ramp: 5 ms open (minimise speech onset masking), 20 ms close
* (de-click on silence).
*/ */
class NoiseGateProcessor extends AudioWorkletProcessor { class NoiseGateProcessor extends AudioWorkletProcessor {
// Noise gate state // Noise gate state
@@ -207,7 +209,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
// VAD gate state // VAD gate state
private vadGateOpen = true; // starts open; TEN-VAD closes it on first silent frame private vadGateOpen = true; // starts open; TEN-VAD closes it on first silent frame
private vadAttenuation = 1.0; private vadAttenuation = 1.0;
private readonly vadRampRate = 1.0 / (0.02 * sampleRate); // Asymmetric ramp: fast open to avoid masking speech onset, slow close to de-click
private readonly vadOpenRampRate = 1.0 / (0.005 * sampleRate); // 5 ms
private readonly vadCloseRampRate = 1.0 / (0.02 * sampleRate); // 20 ms
// TEN-VAD state // TEN-VAD state
private vadEnabled = false; private vadEnabled = false;
@@ -218,7 +222,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
private readonly decRatio = Math.max(1, Math.round(sampleRate / 16000)); private readonly decRatio = Math.max(1, Math.round(sampleRate / 16000));
private decPhase = 0; private decPhase = 0;
private decAcc = 0; private decAcc = 0;
private readonly vadHopBuf = new Int16Array(256); // 160-sample hop = 10 ms @ 16 kHz (minimum supported by TEN-VAD)
private readonly vadHopBuf = new Int16Array(160);
private vadHopCount = 0; private vadHopCount = 0;
private logCounter = 0; private logCounter = 0;
@@ -234,8 +239,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
| undefined; | undefined;
if (tenVadModule) { if (tenVadModule) {
try { try {
// hopSize = 256 samples @ 16 kHz = 16 ms; threshold = 0.5 (overridden via params) // hopSize = 160 samples @ 16 kHz = 10 ms; threshold = 0.5 (overridden via params)
this.tenVadRuntime = new TenVADRuntime(tenVadModule, 256, 0.5); this.tenVadRuntime = new TenVADRuntime(tenVadModule, 160, 0.5);
this.port.postMessage({ this.port.postMessage({
type: "log", type: "log",
msg: "[NoiseGate worklet] TEN-VAD runtime initialized, decRatio=" + this.decRatio, msg: "[NoiseGate worklet] TEN-VAD runtime initialized, decRatio=" + this.decRatio,
@@ -382,7 +387,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
: (avg * 32767 + 0.5) | 0; : (avg * 32767 + 0.5) | 0;
this.vadHopBuf[this.vadHopCount++] = s16; this.vadHopBuf[this.vadHopCount++] = s16;
if (this.vadHopCount >= 256) { if (this.vadHopCount >= 160) {
this.vadHopCount = 0; this.vadHopCount = 0;
const prob = this.tenVadRuntime.process(this.vadHopBuf); const prob = this.tenVadRuntime.process(this.vadHopBuf);
if (!this.vadGateOpen && prob >= this.vadPositiveThreshold) { if (!this.vadGateOpen && prob >= this.vadPositiveThreshold) {
@@ -394,17 +399,18 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
} }
} }
// Ramp VAD attenuation toward target to avoid clicks // Asymmetric ramp: fast open (5 ms) to minimise speech onset masking,
// slow close (20 ms) to de-click on silence transitions.
const vadTarget = this.vadGateOpen ? 1.0 : 0.0; const vadTarget = this.vadGateOpen ? 1.0 : 0.0;
if (this.vadAttenuation < vadTarget) { if (this.vadAttenuation < vadTarget) {
this.vadAttenuation = Math.min( this.vadAttenuation = Math.min(
vadTarget, vadTarget,
this.vadAttenuation + this.vadRampRate, this.vadAttenuation + this.vadOpenRampRate,
); );
} else if (this.vadAttenuation > vadTarget) { } else if (this.vadAttenuation > vadTarget) {
this.vadAttenuation = Math.max( this.vadAttenuation = Math.max(
vadTarget, vadTarget,
this.vadAttenuation - this.vadRampRate, this.vadAttenuation - this.vadCloseRampRate,
); );
} }