perf: reduce TEN-VAD latency from 16 ms to 10 ms, asymmetric gate ramp
- Hop size 256 → 160 samples @ 16 kHz: VAD decision every 10 ms instead of 16 ms (10 ms is the minimum hop supported by TEN-VAD).
- Asymmetric VAD ramp: 5 ms open (was 20 ms) to avoid masking speech onset; 20 ms close retained for de-click on silence.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -182,8 +182,10 @@ class TenVADRuntime {
|
||||
* gain is instantly cut to 0 and releases over transientReleaseMs.
|
||||
*
|
||||
* TEN-VAD gate: accumulates audio with 3:1 decimation (48 kHz → 16 kHz),
|
||||
* runs the TEN-VAD model synchronously every 256 samples (16 ms), and
|
||||
* runs the TEN-VAD model synchronously every 160 samples (10 ms), and
|
||||
* controls vadGateOpen with hysteresis. No IPC round-trip required.
|
||||
* Asymmetric ramp: 5 ms open (minimise speech onset masking), 20 ms close
|
||||
* (de-click on silence).
|
||||
*/
|
||||
class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
// Noise gate state
|
||||
@@ -207,7 +209,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
// VAD gate state
|
||||
private vadGateOpen = true; // starts open; TEN-VAD closes it on first silent frame
|
||||
private vadAttenuation = 1.0;
|
||||
private readonly vadRampRate = 1.0 / (0.02 * sampleRate);
|
||||
// Asymmetric ramp: fast open to avoid masking speech onset, slow close to de-click
|
||||
private readonly vadOpenRampRate = 1.0 / (0.005 * sampleRate); // 5 ms
|
||||
private readonly vadCloseRampRate = 1.0 / (0.02 * sampleRate); // 20 ms
|
||||
|
||||
// TEN-VAD state
|
||||
private vadEnabled = false;
|
||||
@@ -218,7 +222,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
private readonly decRatio = Math.max(1, Math.round(sampleRate / 16000));
|
||||
private decPhase = 0;
|
||||
private decAcc = 0;
|
||||
private readonly vadHopBuf = new Int16Array(256);
|
||||
// 160-sample hop = 10 ms @ 16 kHz (minimum supported by TEN-VAD)
|
||||
private readonly vadHopBuf = new Int16Array(160);
|
||||
private vadHopCount = 0;
|
||||
|
||||
private logCounter = 0;
|
||||
@@ -234,8 +239,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
| undefined;
|
||||
if (tenVadModule) {
|
||||
try {
|
||||
// hopSize = 256 samples @ 16 kHz = 16 ms; threshold = 0.5 (overridden via params)
|
||||
this.tenVadRuntime = new TenVADRuntime(tenVadModule, 256, 0.5);
|
||||
// hopSize = 160 samples @ 16 kHz = 10 ms; threshold = 0.5 (overridden via params)
|
||||
this.tenVadRuntime = new TenVADRuntime(tenVadModule, 160, 0.5);
|
||||
this.port.postMessage({
|
||||
type: "log",
|
||||
msg: "[NoiseGate worklet] TEN-VAD runtime initialized, decRatio=" + this.decRatio,
|
||||
@@ -382,7 +387,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
: (avg * 32767 + 0.5) | 0;
|
||||
this.vadHopBuf[this.vadHopCount++] = s16;
|
||||
|
||||
if (this.vadHopCount >= 256) {
|
||||
if (this.vadHopCount >= 160) {
|
||||
this.vadHopCount = 0;
|
||||
const prob = this.tenVadRuntime.process(this.vadHopBuf);
|
||||
if (!this.vadGateOpen && prob >= this.vadPositiveThreshold) {
|
||||
@@ -394,17 +399,18 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
// Ramp VAD attenuation toward target to avoid clicks
|
||||
// Asymmetric ramp: fast open (5 ms) to minimise speech onset masking,
|
||||
// slow close (20 ms) to de-click on silence transitions.
|
||||
const vadTarget = this.vadGateOpen ? 1.0 : 0.0;
|
||||
if (this.vadAttenuation < vadTarget) {
|
||||
this.vadAttenuation = Math.min(
|
||||
vadTarget,
|
||||
this.vadAttenuation + this.vadRampRate,
|
||||
this.vadAttenuation + this.vadOpenRampRate,
|
||||
);
|
||||
} else if (this.vadAttenuation > vadTarget) {
|
||||
this.vadAttenuation = Math.max(
|
||||
vadTarget,
|
||||
this.vadAttenuation - this.vadRampRate,
|
||||
this.vadAttenuation - this.vadCloseRampRate,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user