perf: reduce TEN-VAD latency from 16 ms to 10 ms, asymmetric gate ramp
- Hop size 256 → 160 samples @ 16 kHz: VAD decision every 10 ms instead of 16 ms (10 ms is the minimum hop supported by TEN-VAD)
- Asymmetric VAD ramp: 5 ms open (was 20 ms) to avoid masking speech onset; 20 ms close retained to de-click on transitions to silence

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -182,8 +182,10 @@ class TenVADRuntime {
|
|||||||
* gain is instantly cut to 0 and releases over transientReleaseMs.
|
* gain is instantly cut to 0 and releases over transientReleaseMs.
|
||||||
*
|
*
|
||||||
* TEN-VAD gate: accumulates audio with 3:1 decimation (48 kHz → 16 kHz),
|
* TEN-VAD gate: accumulates audio with 3:1 decimation (48 kHz → 16 kHz),
|
||||||
* runs the TEN-VAD model synchronously every 256 samples (16 ms), and
|
* runs the TEN-VAD model synchronously every 160 samples (10 ms), and
|
||||||
* controls vadGateOpen with hysteresis. No IPC round-trip required.
|
* controls vadGateOpen with hysteresis. No IPC round-trip required.
|
||||||
|
* Asymmetric ramp: 5 ms open (minimise speech onset masking), 20 ms close
|
||||||
|
* (de-click on silence).
|
||||||
*/
|
*/
|
||||||
class NoiseGateProcessor extends AudioWorkletProcessor {
|
class NoiseGateProcessor extends AudioWorkletProcessor {
|
||||||
// Noise gate state
|
// Noise gate state
|
||||||
@@ -207,7 +209,9 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
|||||||
// VAD gate state
|
// VAD gate state
|
||||||
private vadGateOpen = true; // starts open; TEN-VAD closes it on first silent frame
|
private vadGateOpen = true; // starts open; TEN-VAD closes it on first silent frame
|
||||||
private vadAttenuation = 1.0;
|
private vadAttenuation = 1.0;
|
||||||
private readonly vadRampRate = 1.0 / (0.02 * sampleRate);
|
// Asymmetric ramp: fast open to avoid masking speech onset, slow close to de-click
|
||||||
|
private readonly vadOpenRampRate = 1.0 / (0.005 * sampleRate); // 5 ms
|
||||||
|
private readonly vadCloseRampRate = 1.0 / (0.02 * sampleRate); // 20 ms
|
||||||
|
|
||||||
// TEN-VAD state
|
// TEN-VAD state
|
||||||
private vadEnabled = false;
|
private vadEnabled = false;
|
||||||
@@ -218,7 +222,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
|||||||
private readonly decRatio = Math.max(1, Math.round(sampleRate / 16000));
|
private readonly decRatio = Math.max(1, Math.round(sampleRate / 16000));
|
||||||
private decPhase = 0;
|
private decPhase = 0;
|
||||||
private decAcc = 0;
|
private decAcc = 0;
|
||||||
private readonly vadHopBuf = new Int16Array(256);
|
// 160-sample hop = 10 ms @ 16 kHz (minimum supported by TEN-VAD)
|
||||||
|
private readonly vadHopBuf = new Int16Array(160);
|
||||||
private vadHopCount = 0;
|
private vadHopCount = 0;
|
||||||
|
|
||||||
private logCounter = 0;
|
private logCounter = 0;
|
||||||
@@ -234,8 +239,8 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
|||||||
| undefined;
|
| undefined;
|
||||||
if (tenVadModule) {
|
if (tenVadModule) {
|
||||||
try {
|
try {
|
||||||
// hopSize = 256 samples @ 16 kHz = 16 ms; threshold = 0.5 (overridden via params)
|
// hopSize = 160 samples @ 16 kHz = 10 ms; threshold = 0.5 (overridden via params)
|
||||||
this.tenVadRuntime = new TenVADRuntime(tenVadModule, 256, 0.5);
|
this.tenVadRuntime = new TenVADRuntime(tenVadModule, 160, 0.5);
|
||||||
this.port.postMessage({
|
this.port.postMessage({
|
||||||
type: "log",
|
type: "log",
|
||||||
msg: "[NoiseGate worklet] TEN-VAD runtime initialized, decRatio=" + this.decRatio,
|
msg: "[NoiseGate worklet] TEN-VAD runtime initialized, decRatio=" + this.decRatio,
|
||||||
@@ -382,7 +387,7 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
|||||||
: (avg * 32767 + 0.5) | 0;
|
: (avg * 32767 + 0.5) | 0;
|
||||||
this.vadHopBuf[this.vadHopCount++] = s16;
|
this.vadHopBuf[this.vadHopCount++] = s16;
|
||||||
|
|
||||||
if (this.vadHopCount >= 256) {
|
if (this.vadHopCount >= 160) {
|
||||||
this.vadHopCount = 0;
|
this.vadHopCount = 0;
|
||||||
const prob = this.tenVadRuntime.process(this.vadHopBuf);
|
const prob = this.tenVadRuntime.process(this.vadHopBuf);
|
||||||
if (!this.vadGateOpen && prob >= this.vadPositiveThreshold) {
|
if (!this.vadGateOpen && prob >= this.vadPositiveThreshold) {
|
||||||
@@ -394,17 +399,18 @@ class NoiseGateProcessor extends AudioWorkletProcessor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ramp VAD attenuation toward target to avoid clicks
|
// Asymmetric ramp: fast open (5 ms) to minimise speech onset masking,
|
||||||
|
// slow close (20 ms) to de-click on silence transitions.
|
||||||
const vadTarget = this.vadGateOpen ? 1.0 : 0.0;
|
const vadTarget = this.vadGateOpen ? 1.0 : 0.0;
|
||||||
if (this.vadAttenuation < vadTarget) {
|
if (this.vadAttenuation < vadTarget) {
|
||||||
this.vadAttenuation = Math.min(
|
this.vadAttenuation = Math.min(
|
||||||
vadTarget,
|
vadTarget,
|
||||||
this.vadAttenuation + this.vadRampRate,
|
this.vadAttenuation + this.vadOpenRampRate,
|
||||||
);
|
);
|
||||||
} else if (this.vadAttenuation > vadTarget) {
|
} else if (this.vadAttenuation > vadTarget) {
|
||||||
this.vadAttenuation = Math.max(
|
this.vadAttenuation = Math.max(
|
||||||
vadTarget,
|
vadTarget,
|
||||||
this.vadAttenuation - this.vadRampRate,
|
this.vadAttenuation - this.vadCloseRampRate,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user