1 /* libSoX effect: Voice Activity Detector (c) 2009 robs@users.sourceforge.net
2 *
3 * This library is free software; you can redistribute it and/or modify it
4 * under the terms of the GNU Lesser General Public License as published by
5 * the Free Software Foundation; either version 2.1 of the License, or (at
6 * your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public License
14 * along with this library; if not, write to the Free Software Foundation,
15 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18 #include "sox_i.h"
19 #include <string.h>
20
/* Detector state kept separately for every input channel. */
typedef struct {
  double * dftBuf;         /* work buffer transformed in place by measure() */
  double * noiseSpectrum;  /* per-bin running noise estimate */
  double * spectrum;       /* per-bin time-smoothed magnitude */
  double * measures;       /* ring of the latest measure() results */
  double meanMeas;         /* smoothed measure compared with triggerLevel */
} chan_t;
24
25 typedef struct { /* Configuration parameters: */
26 double bootTime, noiseTcUp, noiseTcDown, noiseReductionAmount;
27 double measureFreq, measureDuration, measureTc, preTriggerTime;
28 double hpFilterFreq, lpFilterFreq, hpLifterFreq, lpLifterFreq;
29 double triggerTc, triggerLevel, searchTime, gapTime;
30 /* Working variables: */
31 sox_sample_t * samples;
32 unsigned dftLen_ws, samplesLen_ns, samplesIndex_ns, flushedLen_ns, gapLen;
33 unsigned measurePeriod_ns, measuresLen, measuresIndex;
34 unsigned measureTimer_ns, measureLen_ws, measureLen_ns;
35 unsigned spectrumStart, spectrumEnd, cepstrumStart, cepstrumEnd; /* bins */
36 int bootCountMax, bootCount;
37 double noiseTcUpMult, noiseTcDownMult;
38 double measureTcMult, triggerMeasTcMult;
39 double * spectrumWindow, * cepstrumWindow;
40 chan_t * channels;
41 } priv_t;
42
/* Expands to one `case' of the option switch in create(): stores the option
 * argument, parsed by lsx_parse_frequency(), in p->name and returns
 * lsx_usage(effp) if the value is below `min' or unparsed characters remain.
 * The expansion site must have `p', `effp' and `char * parseIndex' in scope.
 * Macro arguments used in expressions are parenthesised so that compound
 * expressions can be passed safely. */
#define GETOPT_FREQ(optstate, c, name, min) \
  case c: p->name = lsx_parse_frequency((optstate).arg, &parseIndex); \
    if (p->name < (min) || *parseIndex) return lsx_usage(effp); \
    break;
47
/* Parse the effect's command-line options into the private state.
 *
 * Every parameter is first given its default; options then override them,
 * each one range-checked by its GETOPT_* macro.  The measurement duration
 * defaults to twice the measurement period (50% overlap), so it is derived
 * only after parsing: that way `-f' on its own keeps the overlap instead of
 * leaving the duration stuck at the value that matches the default 20 Hz.
 *
 * Returns SOX_SUCCESS, or the result of lsx_usage() on an invalid option,
 * an out-of-range value, or left-over non-option arguments. */
static int create(sox_effect_t * effp, int argc, char * * argv)
{
  priv_t * p = (priv_t *)effp->priv;
  #define opt_str "+b:N:n:r:f:m:M:h:l:H:L:T:t:s:g:p:"
  int c;
  lsx_getopt_t optstate;
  lsx_getopt_init(argc, argv, opt_str, NULL, lsx_getopt_flag_none, 1, &optstate);

  p->bootTime = .35;
  p->noiseTcUp = .1;
  p->noiseTcDown = .01;
  p->noiseReductionAmount = 1.35;

  p->measureFreq = 20;
  p->measureDuration = -1; /* "not given"; -m only accepts .01 to 1 */
  p->measureTc = .4;

  p->hpFilterFreq = 50;
  p->lpFilterFreq = 6000;
  p->hpLifterFreq = 150;
  p->lpLifterFreq = 2000;

  p->triggerTc = .25;
  p->triggerLevel = 7;

  p->searchTime = 1;
  p->gapTime = .25;
  p->preTriggerTime = 0;

  while ((c = lsx_getopt(&optstate)) != -1) switch (c) {
    char * parseIndex;
    GETOPT_NUMERIC(optstate, 'b', bootTime , .1 , 10)
    GETOPT_NUMERIC(optstate, 'N', noiseTcUp , .1 , 10)
    GETOPT_NUMERIC(optstate, 'n', noiseTcDown ,.001 , .1)
    GETOPT_NUMERIC(optstate, 'r', noiseReductionAmount,0 , 2)
    GETOPT_NUMERIC(optstate, 'f', measureFreq , 5 , 50)
    GETOPT_NUMERIC(optstate, 'm', measureDuration, .01 , 1)
    GETOPT_NUMERIC(optstate, 'M', measureTc , .1 , 1)
    GETOPT_FREQ( optstate, 'h', hpFilterFreq , 10)
    GETOPT_FREQ( optstate, 'l', lpFilterFreq , 1000)
    GETOPT_FREQ( optstate, 'H', hpLifterFreq , 10)
    GETOPT_FREQ( optstate, 'L', lpLifterFreq , 1000)
    GETOPT_NUMERIC(optstate, 'T', triggerTc , .01 , 1)
    GETOPT_NUMERIC(optstate, 't', triggerLevel , 0 , 20)
    GETOPT_NUMERIC(optstate, 's', searchTime , .1 , 4)
    GETOPT_NUMERIC(optstate, 'g', gapTime , .1 , 1)
    GETOPT_NUMERIC(optstate, 'p', preTriggerTime, 0 , 4)
    default: lsx_fail("invalid option `-%c'", optstate.opt); return lsx_usage(effp);
  }

  if (p->measureDuration < 0)
    p->measureDuration = 2 / p->measureFreq; /* 50% overlap */

  return optstate.ind !=argc? lsx_usage(effp) : SOX_SUCCESS;
}
98
/* Derive every sample and bin count from the configured times/frequencies
 * and the input signal, then allocate and initialise the working buffers.
 *
 * The spectrum and cepstrum bin ranges are validated BEFORE anything is
 * allocated: an empty or inverted spectrum range (e.g. high-pass filter at
 * or above the low-pass filter) would otherwise wrap the unsigned length
 * `spectrumEnd - spectrumStart', and an early failure return would leave
 * the buffers allocated.
 * NOTE(review): this assumes stop() is not invoked after start() has
 * failed - confirm against the effects-chain code.
 *
 * Returns SOX_SUCCESS, or SOX_EOF if either bin range is empty. */
static int start(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;
  unsigned i, fixedPreTriggerLen_ns, searchPreTriggerLen_ns;
  unsigned cepstrumWindowLen;

  fixedPreTriggerLen_ns = p->preTriggerTime * effp->in_signal.rate + .5;
  fixedPreTriggerLen_ns *= effp->in_signal.channels;

  p->measureLen_ws = effp->in_signal.rate * p->measureDuration + .5;
  p->measureLen_ns = p->measureLen_ws * effp->in_signal.channels;
  /* Smallest power of two, at least 16, that holds one measurement: */
  for (p->dftLen_ws = 16; p->dftLen_ws < p->measureLen_ws; p->dftLen_ws <<= 1);
  lsx_debug("dftLen_ws=%u measureLen_ws=%u", p->dftLen_ws, p->measureLen_ws);

  p->measurePeriod_ns = effp->in_signal.rate / p->measureFreq + .5;
  p->measurePeriod_ns *= effp->in_signal.channels;
  p->measuresLen = ceil(p->searchTime * p->measureFreq);
  searchPreTriggerLen_ns = p->measuresLen * p->measurePeriod_ns;
  p->gapLen = p->gapTime * p->measureFreq + .5;

  /* Spectrum bins kept by the high-/low-pass filters; bin 0 is never used
   * and the upper edge is capped at half the DFT length. */
  p->spectrumStart = p->hpFilterFreq / effp->in_signal.rate * p->dftLen_ws + .5;
  p->spectrumStart = max(p->spectrumStart, 1);
  p->spectrumEnd = p->lpFilterFreq / effp->in_signal.rate * p->dftLen_ws + .5;
  p->spectrumEnd = min(p->spectrumEnd, p->dftLen_ws / 2);
  if (p->spectrumEnd <= p->spectrumStart) {
    lsx_fail("no spectrum bins between the high-pass (-h) and low-pass (-l) filter frequencies");
    return SOX_EOF;
  }
  cepstrumWindowLen = p->spectrumEnd - p->spectrumStart;

  /* Cepstrum bins kept by the lifters, capped at a quarter of the DFT length. */
  p->cepstrumStart = ceil(effp->in_signal.rate * .5 / p->lpLifterFreq);
  p->cepstrumEnd = floor(effp->in_signal.rate * .5 / p->hpLifterFreq);
  p->cepstrumEnd = min(p->cepstrumEnd, p->dftLen_ws / 4);
  if (p->cepstrumEnd <= p->cepstrumStart) {
    lsx_fail("no cepstrum bins between the low-pass (-L) and high-pass (-H) lifter frequencies");
    return SOX_EOF;
  }

  /* The sample buffer holds the fixed pre-trigger, the search history and
   * one measurement's worth of samples. */
  p->samplesLen_ns =
      fixedPreTriggerLen_ns + searchPreTriggerLen_ns + p->measureLen_ns;
  lsx_Calloc(p->samples, p->samplesLen_ns);

  lsx_Calloc(p->channels, effp->in_signal.channels);
  for (i = 0; i < effp->in_signal.channels; ++i) {
    chan_t * c = &p->channels[i];
    lsx_Calloc(c->dftBuf, p->dftLen_ws);
    lsx_Calloc(c->spectrum, p->dftLen_ws);
    lsx_Calloc(c->noiseSpectrum, p->dftLen_ws);
    lsx_Calloc(c->measures, p->measuresLen);
  }

  /* Constant-valued window, shaped by lsx_apply_hann(); the constant folds
   * the sample scaling and the 1/sqrt(length) factor into the window. */
  lsx_Calloc(p->spectrumWindow, p->measureLen_ws);
  for (i = 0; i < p->measureLen_ws; ++i)
    p->spectrumWindow[i] = -2./ SOX_SAMPLE_MIN / sqrt((double)p->measureLen_ws);
  lsx_apply_hann(p->spectrumWindow, (int)p->measureLen_ws);

  lsx_Calloc(p->cepstrumWindow, cepstrumWindowLen);
  for (i = 0; i < cepstrumWindowLen; ++i)
    p->cepstrumWindow[i] = 2 / sqrt((double)cepstrumWindowLen);
  lsx_apply_hann(p->cepstrumWindow, (int)cepstrumWindowLen);

  /* Per-measurement smoothing multipliers for each time constant: */
  p->noiseTcUpMult = exp(-1 / (p->noiseTcUp * p->measureFreq));
  p->noiseTcDownMult = exp(-1 / (p->noiseTcDown * p->measureFreq));
  p->measureTcMult = exp(-1 / (p->measureTc * p->measureFreq));
  p->triggerMeasTcMult = exp(-1 / (p->triggerTc * p->measureFreq));

  p->bootCountMax = p->bootTime * p->measureFreq - .5;
  p->measureTimer_ns = p->measureLen_ns;
  p->bootCount = p->measuresIndex = p->flushedLen_ns = p->samplesIndex_ns = 0;

  effp->out_signal.length = SOX_UNKNOWN_LEN; /* depends on input data */
  return SOX_SUCCESS;
}
164
/* Flow handler used after the detector has triggered.
 *
 * Writes the not-yet-flushed part of the circular sample buffer to obuf,
 * starting at samplesIndex_ns and wrapping at the end of the buffer.  While
 * buffered samples remain, no input is consumed (*ilen is set to 0).  Once
 * the whole buffer has been accounted for, the effect's flow handler is
 * switched to lsx_flow_copy, which is also called immediately for the rest
 * of this call's output space.  *olen receives the samples written. */
static int flowFlush(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  /* Samples to emit now, split into the part up to the end of the buffer
   * (head) and the part that continues from its start (tail). */
  size_t total = min(p->samplesLen_ns - p->flushedLen_ns, *olen);
  size_t head = min(total, p->samplesLen_ns - p->samplesIndex_ns);
  size_t tail = total - head;

  memcpy(obuf, p->samples + p->samplesIndex_ns, head * sizeof(*obuf));
  p->samplesIndex_ns += head;
  if (p->samplesIndex_ns == p->samplesLen_ns) {
    memcpy(obuf + head, p->samples, tail * sizeof(*obuf));
    p->samplesIndex_ns = tail;
  }

  p->flushedLen_ns += total;
  if (p->flushedLen_ns != p->samplesLen_ns)
    *ilen = 0;
  else {
    size_t copied = *olen - total;
    effp->handler.flow = lsx_flow_copy;
    lsx_flow_copy(effp, ibuf, obuf + total, ilen, &copied);
    total += copied;
  }

  *olen = total;
  return SOX_SUCCESS;
}
186
/* Take one measurement for channel state `c'.
 *
 * Reads measureLen_ws samples from the circular buffer, starting at
 * index_ns and advancing step_ns each time, windows them and transforms
 * them with lsx_safe_rdft().  For each bin in [spectrumStart, spectrumEnd)
 * the magnitude is smoothed over time, the noise estimate is updated and a
 * scaled multiple of it subtracted; the windowed result is transformed a
 * second time.  The return value is 21 plus the log of the mean squared
 * magnitude over [cepstrumStart, cepstrumEnd), floored at 0.
 *
 * bootCount >= 0 selects boot behaviour: the spectrum is averaged with
 * weight bootCount / (1 + bootCount) and the noise estimate simply tracks
 * the current value; otherwise the configured time constants are used. */
static double measure(
    priv_t * p, chan_t * c, size_t index_ns, unsigned step_ns, int bootCount)
{
  double * const buf = c->dftBuf;
  /* Spectrum smoothing factor; it does not vary from bin to bin. */
  double const spectrumMult =
      bootCount >= 0? bootCount / (1. + bootCount) : p->measureTcMult;
  double energy = 0;
  size_t n, bin;

  /* Windowed samples, zero-padded up to the DFT length. */
  for (n = 0; n < p->measureLen_ws; ++n) {
    buf[n] = p->samples[index_ns] * p->spectrumWindow[n];
    index_ns = (index_ns + step_ns) % p->samplesLen_ns;
  }
  memset(buf + n, 0, (p->dftLen_ws - n) * sizeof(*buf));
  lsx_safe_rdft((int)p->dftLen_ws, 1, buf);

  /* The result for bin `bin' overwrites buf[bin]; elements 2*bin and
   * 2*bin+1, read as a pair, always lie at or beyond it, so no value is
   * overwritten before it has been read. */
  memset(buf, 0, p->spectrumStart * sizeof(*buf));
  for (bin = p->spectrumStart; bin < p->spectrumEnd; ++bin) {
    double const first = buf[2 * bin], second = buf[2 * bin + 1];
    double const magnitude = sqrt(sqr(first) + sqr(second));
    double power, noiseMult;

    c->spectrum[bin] = c->spectrum[bin] * spectrumMult + magnitude * (1 - spectrumMult);
    power = sqr(c->spectrum[bin]);

    if (bootCount >= 0)
      noiseMult = 0;
    else if (power > c->noiseSpectrum[bin])
      noiseMult = p->noiseTcUpMult;
    else
      noiseMult = p->noiseTcDownMult;
    c->noiseSpectrum[bin] = c->noiseSpectrum[bin] * noiseMult + power * (1 - noiseMult);

    power = sqrt(max(0, power - p->noiseReductionAmount * c->noiseSpectrum[bin]));
    buf[bin] = power * p->cepstrumWindow[bin - p->spectrumStart];
  }
  memset(buf + bin, 0, ((p->dftLen_ws >> 1) - bin) * sizeof(*buf));
  lsx_safe_rdft((int)p->dftLen_ws >> 1, 1, buf);

  for (bin = p->cepstrumStart; bin < p->cepstrumEnd; ++bin)
    energy += sqr(buf[2 * bin]) + sqr(buf[2 * bin + 1]);
  energy = log(energy / (p->cepstrumEnd - p->cepstrumStart));
  return max(0, 21 + energy);
}
218
/* Initial flow handler: buffers input and looks for the trigger point.
 *
 * Input samples are stored in the circular buffer one frame (one sample per
 * channel) at a time; nothing is output while searching (*olen is set to 0).
 * Whenever measureTimer_ns reaches zero a measurement is taken for every
 * channel and folded into that channel's smoothed meanMeas.  When any
 * channel's meanMeas reaches triggerLevel, the stored measures are searched
 * backwards to decide how much buffered audio to keep, the flow handler is
 * switched to flowFlush, and flowFlush is called for the rest of this call.
 * *ilen receives the number of input samples consumed. */
static int flowTrigger(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  sox_bool hasTriggered = sox_false;
  size_t i, idone = 0, numMeasuresToFlush = 0;

  while (idone < *ilen && !hasTriggered) {
    /* Count down by one frame; start() makes both the initial value and the
     * reload value a multiple of the channel count. */
    p->measureTimer_ns -= effp->in_signal.channels;
    for (i = 0; i < effp->in_signal.channels; ++i, ++idone) {
      chan_t * c = &p->channels[i];
      p->samples[p->samplesIndex_ns++] = *ibuf++;
      if (!p->measureTimer_ns) {
        /* Start of the measurement window: measureLen_ns samples behind the
         * (already advanced) buffer position, modulo the buffer length.
         * NOTE(review): with more than one channel this window appears to
         * end on the interleave slot after channel i's, one frame back,
         * rather than on the sample just stored - confirm this is intended. */
        size_t x = (p->samplesIndex_ns + p->samplesLen_ns - p->measureLen_ns) % p->samplesLen_ns;
        double meas = measure(p, c, x, effp->in_signal.channels, p->bootCount);
        c->measures[p->measuresIndex] = meas;
        c->meanMeas = c->meanMeas * p->triggerMeasTcMult +
            meas *(1 - p->triggerMeasTcMult);

        /* `|=' latches: once one channel has triggered, the search below
         * also runs for each remaining channel of this frame. */
        if (hasTriggered |= c->meanMeas >= p->triggerLevel) {
          /* Walk the measures ring from the newest entry (j = 0) to the
           * oldest.  jTrigger is the oldest entry at or above triggerLevel
           * that lies within gapLen entries of the previously accepted one
           * (the first such entry is accepted wherever it is); jZero is the
           * first zero-valued entry found beyond jTrigger, if any. */
          unsigned n = p->measuresLen, k = p->measuresIndex;
          unsigned j, jTrigger = n, jZero = n;
          for (j = 0; j < n; ++j, k = (k + n - 1) % n)
            if (c->measures[k] >= p->triggerLevel && j <= jTrigger + p->gapLen)
              jZero = jTrigger = j;
            else if (!c->measures[k] && jTrigger >= jZero)
              jZero = j;
          /* j equals n after the loop, so this selects jZero (at most n). */
          j = min(j, jZero);
          /* presumably range_limit clamps j to [numMeasuresToFlush, n], i.e.
           * the largest value over the channels is kept - verify against
           * its definition */
          numMeasuresToFlush = range_limit(j, numMeasuresToFlush, n);
        }
        lsx_debug_more("%12g %12g %u",
            meas, c->meanMeas, (unsigned)numMeasuresToFlush);
      }
    }
    /* Wrap the buffer position at a frame boundary. */
    if (p->samplesIndex_ns == p->samplesLen_ns)
      p->samplesIndex_ns = 0;
    if (!p->measureTimer_ns) {
      /* A measurement was taken for this frame: re-arm the timer, advance
       * the measures ring and step the boot counter (-1 ends the boot). */
      p->measureTimer_ns = p->measurePeriod_ns;
      ++p->measuresIndex;
      p->measuresIndex %= p->measuresLen;
      if (p->bootCount >= 0)
        p->bootCount = p->bootCount == p->bootCountMax? -1 : p->bootCount + 1;
    }
  }
  if (hasTriggered) {
    size_t ilen1 = *ilen - idone;
    /* Skip the measurement periods that are not being kept: count them as
     * already flushed and advance the buffer position past them. */
    p->flushedLen_ns = (p->measuresLen - numMeasuresToFlush) * p->measurePeriod_ns;
    p->samplesIndex_ns = (p->samplesIndex_ns + p->flushedLen_ns) % p->samplesLen_ns;
    (effp->handler.flow = flowFlush)(effp, ibuf, obuf, &ilen1, olen);
    idone += ilen1;
  }
  else *olen = 0;
  *ilen = idone;
  return SOX_SUCCESS;
}
274
/* Drain handler: calls whichever flow handler is currently installed with
 * no input (NULL buffer, zero length), letting it write any pending output
 * to obuf and report the count through *olen. */
static int drain(sox_effect_t * effp, sox_sample_t * obuf, size_t * olen)
{
  size_t noInput = 0;

  return (*effp->handler.flow)(effp, NULL, obuf, &noInput, olen);
}
280
/* Release every buffer allocated by start(): the per-channel arrays, the
 * channel table, both windows and the sample buffer.  Always succeeds. */
static int stop(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;
  unsigned remaining = effp->in_signal.channels;

  while (remaining--) {
    chan_t * chan = &p->channels[remaining];
    free(chan->dftBuf);
    free(chan->spectrum);
    free(chan->noiseSpectrum);
    free(chan->measures);
  }
  free(p->channels);

  free(p->samples);
  free(p->spectrumWindow);
  free(p->cepstrumWindow);
  return SOX_SUCCESS;
}
299
lsx_vad_effect_fn(void)300 sox_effect_handler_t const * lsx_vad_effect_fn(void)
301 {
302 static sox_effect_handler_t handler = {"vad", NULL,
303 SOX_EFF_MCHAN | SOX_EFF_LENGTH | SOX_EFF_MODIFY,
304 create, start, flowTrigger, drain, stop, NULL, sizeof(priv_t)
305 };
306 static char const * lines[] = {
307 "[options]",
308 "\t-t trigger-level (7)",
309 "\t-T trigger-time-constant (0.25 s)",
310 "\t-s search-time (1 s)",
311 "\t-g allowed-gap (0.25 s)",
312 "\t-p pre-trigger-time (0 s)",
313 "Advanced options:",
314 "\t-b noise-est-boot-time (0.35 s)",
315 "\t-N noise-est-time-constant-up (0.1 s)",
316 "\t-n noise-est-time-constant-down (0.01 s)",
317 "\t-r noise-reduction-amount (1.35)",
318 "\t-f measurement-frequency (20 Hz)",
319 "\t-m measurement-duration (0.1 s)",
320 "\t-M measurement-time-constant (0.4 s)",
321 "\t-h high-pass-filter (50 Hz)",
322 "\t-l low-pass-filter (6000 Hz)",
323 "\t-H high-pass-lifter (150 Hz)",
324 "\t-L low-pass-lifter (2000 Hz)",
325 };
326 static char * usage;
327 handler.usage = lsx_usage_lines(&usage, lines, array_length(lines));
328 return &handler;
329 }
330