1 /* libSoX effect: Voice Activity Detector  (c) 2009 robs@users.sourceforge.net
2  *
3  * This library is free software; you can redistribute it and/or modify it
4  * under the terms of the GNU Lesser General Public License as published by
5  * the Free Software Foundation; either version 2.1 of the License, or (at
6  * your option) any later version.
7  *
8  * This library is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU Lesser General Public License
14  * along with this library; if not, write to the Free Software Foundation,
15  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
16  */
17 
18 #include "sox_i.h"
19 #include <string.h>
20 
/* Analysis state kept separately for every input channel. */
typedef struct {
  double * dftBuf;        /* transform work buffer (dftLen_ws elements) */
  double * noiseSpectrum; /* running per-bin noise estimate */
  double * spectrum;      /* time-smoothed per-bin magnitude */
  double * measures;      /* ring of the latest measuresLen measurements */
  double   meanMeas;      /* smoothed measurement tested against triggerLevel */
} chan_t;
24 
25 typedef struct {                /* Configuration parameters: */
26   double    bootTime, noiseTcUp, noiseTcDown, noiseReductionAmount;
27   double    measureFreq, measureDuration, measureTc, preTriggerTime;
28   double    hpFilterFreq, lpFilterFreq, hpLifterFreq, lpLifterFreq;
29   double    triggerTc, triggerLevel, searchTime, gapTime;
30                                 /* Working variables: */
31   sox_sample_t  * samples;
32   unsigned  dftLen_ws, samplesLen_ns, samplesIndex_ns, flushedLen_ns, gapLen;
33   unsigned  measurePeriod_ns, measuresLen, measuresIndex;
34   unsigned  measureTimer_ns, measureLen_ws, measureLen_ns;
35   unsigned  spectrumStart, spectrumEnd, cepstrumStart, cepstrumEnd; /* bins */
36   int       bootCountMax, bootCount;
37   double    noiseTcUpMult, noiseTcDownMult;
38   double    measureTcMult, triggerMeasTcMult;
39   double    * spectrumWindow, * cepstrumWindow;
40   chan_t    * channels;
41 } priv_t;
42 
/* Expands to the `case c:' of a getopt switch for a frequency-valued option.
 *
 * The option argument is parsed with lsx_parse_frequency() and stored in
 * p->name.  The enclosing function returns lsx_usage(effp) when the value is
 * below `min' or when the character left at *parseIndex is not NUL
 * (presumably meaning unparsed trailing text -- confirm against
 * lsx_parse_frequency).
 *
 * The expansion relies on `p', `effp' and a `char * parseIndex' being in
 * scope where the macro is used.  Macro arguments used in expressions are
 * parenthesised so that compound expressions expand safely. */
#define GETOPT_FREQ(optstate, c, name, min) \
    case c: p->name = lsx_parse_frequency((optstate).arg, &parseIndex); \
      if (p->name < (min) || *parseIndex) return lsx_usage(effp); \
      break;
47 
/* Option parser for the vad effect.
 *
 * Fills the configuration part of the private state with defaults and then
 * overrides them from the command-line options in argv.
 *
 * Returns SOX_SUCCESS when every argument was consumed as a valid option;
 * otherwise returns the result of lsx_usage(effp).
 */
static int create(sox_effect_t * effp, int argc, char * * argv)
{
  priv_t * p = (priv_t *)effp->priv;
  #define opt_str "+b:N:n:r:f:m:M:h:l:H:L:T:t:s:g:p:"
  int c;
  lsx_getopt_t optstate;
  lsx_getopt_init(argc, argv, opt_str, NULL, lsx_getopt_flag_none, 1, &optstate);

  p->bootTime        = .35;
  p->noiseTcUp       = .1;
  p->noiseTcDown     = .01;
  p->noiseReductionAmount = 1.35;

  p->measureFreq     = 20;
  /* Derived from the default measureFreq only: a later -f option does not
   * rescale this value. */
  p->measureDuration = 2 / p->measureFreq; /* 50% overlap */
  p->measureTc       = .4;

  p->hpFilterFreq    = 50;
  p->lpFilterFreq    = 6000;
  p->hpLifterFreq    = 150;
  p->lpLifterFreq    = 2000;

  p->triggerTc       = .25;
  p->triggerLevel    = 7;

  p->searchTime      = 1;
  p->gapTime         = .25;
  /* Set explicitly (matching the documented default of 0 s) instead of
   * depending on the private state having been zero-filled by the caller. */
  p->preTriggerTime  = 0;

  while ((c = lsx_getopt(&optstate)) != -1) switch (c) {
    char * parseIndex;  /* written by the GETOPT_FREQ cases */
    GETOPT_NUMERIC(optstate, 'b', bootTime      ,  .1 , 10)
    GETOPT_NUMERIC(optstate, 'N', noiseTcUp     ,  .1 , 10)
    GETOPT_NUMERIC(optstate, 'n', noiseTcDown   ,.001 , .1)
    GETOPT_NUMERIC(optstate, 'r', noiseReductionAmount,0 , 2)
    GETOPT_NUMERIC(optstate, 'f', measureFreq   ,   5 , 50)
    GETOPT_NUMERIC(optstate, 'm', measureDuration, .01 , 1)
    GETOPT_NUMERIC(optstate, 'M', measureTc     ,  .1 , 1)
    GETOPT_FREQ(   optstate, 'h', hpFilterFreq  ,  10)
    GETOPT_FREQ(   optstate, 'l', lpFilterFreq  ,  1000)
    GETOPT_FREQ(   optstate, 'H', hpLifterFreq  ,  10)
    GETOPT_FREQ(   optstate, 'L', lpLifterFreq  ,  1000)
    GETOPT_NUMERIC(optstate, 'T', triggerTc     , .01 , 1)
    GETOPT_NUMERIC(optstate, 't', triggerLevel  ,   0 , 20)
    GETOPT_NUMERIC(optstate, 's', searchTime    ,  .1 , 4)
    GETOPT_NUMERIC(optstate, 'g', gapTime       ,  .1 , 1)
    GETOPT_NUMERIC(optstate, 'p', preTriggerTime,   0 , 4)
    default: lsx_fail("invalid option `-%c'", optstate.opt); return lsx_usage(effp);
  }
  /* Anything left over after the options is an error. */
  return optstate.ind !=argc? lsx_usage(effp) : SOX_SUCCESS;
}
98 
/* Prepares the effect for a run: converts the configured times and
 * frequencies into sample counts and transform bins for the input signal,
 * allocates the working buffers and resets the running state.
 *
 * `_ws' lengths count samples of one channel; `_ns' lengths are the same
 * quantity multiplied by the channel count.
 *
 * All derived bin ranges are validated BEFORE any buffer is allocated, so a
 * failing start leaves nothing allocated.  In particular an empty or inverted
 * spectrum range is rejected: the range length is unsigned, so an inverted
 * range would otherwise wrap round to a huge allocation size.
 *
 * Returns SOX_SUCCESS, or SOX_EOF (after reporting with lsx_fail) when the
 * filter or lifter frequencies leave no bins to analyse at this sample rate.
 */
static int start(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;
  unsigned i, fixedPreTriggerLen_ns, searchPreTriggerLen_ns, spectrumLen;
  double spectrumGain, cepstrumGain;

  fixedPreTriggerLen_ns = p->preTriggerTime * effp->in_signal.rate + .5;
  fixedPreTriggerLen_ns *= effp->in_signal.channels;

  p->measureLen_ws = effp->in_signal.rate * p->measureDuration + .5;
  p->measureLen_ns = p->measureLen_ws * effp->in_signal.channels;
  /* Transform length: smallest power of two >= max(16, measureLen_ws). */
  for (p->dftLen_ws = 16; p->dftLen_ws < p->measureLen_ws; p->dftLen_ws <<= 1);
  lsx_debug("dftLen_ws=%u measureLen_ws=%u", p->dftLen_ws, p->measureLen_ws);

  p->measurePeriod_ns = effp->in_signal.rate / p->measureFreq + .5;
  p->measurePeriod_ns *= effp->in_signal.channels;
  p->measuresLen = ceil(p->searchTime * p->measureFreq);
  searchPreTriggerLen_ns = p->measuresLen * p->measurePeriod_ns;
  p->gapLen = p->gapTime * p->measureFreq + .5;

  p->samplesLen_ns =
    fixedPreTriggerLen_ns + searchPreTriggerLen_ns + p->measureLen_ns;

  /* Spectrum bins kept by measure(): never bin 0, never past dftLen_ws / 2. */
  p->spectrumStart = p->hpFilterFreq / effp->in_signal.rate * p->dftLen_ws + .5;
  p->spectrumStart = max(p->spectrumStart, 1);
  p->spectrumEnd = p->lpFilterFreq / effp->in_signal.rate * p->dftLen_ws + .5;
  p->spectrumEnd = min(p->spectrumEnd, p->dftLen_ws / 2);
  if (p->spectrumEnd <= p->spectrumStart) {
    lsx_fail("filter frequencies leave an empty analysis band at this sample rate");
    return SOX_EOF;
  }
  spectrumLen = p->spectrumEnd - p->spectrumStart;

  /* Cepstrum bins summed by measure(): never past dftLen_ws / 4. */
  p->cepstrumStart = ceil(effp->in_signal.rate * .5 / p->lpLifterFreq);
  p->cepstrumEnd  = floor(effp->in_signal.rate * .5 / p->hpLifterFreq);
  p->cepstrumEnd = min(p->cepstrumEnd, p->dftLen_ws / 4);
  if (p->cepstrumEnd <= p->cepstrumStart) {
    lsx_fail("lifter frequencies leave an empty cepstrum range at this sample rate");
    return SOX_EOF;
  }

  /* Every size is now known to be usable; allocate the working buffers. */
  lsx_Calloc(p->samples, p->samplesLen_ns);

  lsx_Calloc(p->channels, effp->in_signal.channels);
  for (i = 0; i < effp->in_signal.channels; ++i) {
    chan_t * c = &p->channels[i];
    lsx_Calloc(c->dftBuf, p->dftLen_ws);
    lsx_Calloc(c->spectrum, p->dftLen_ws);
    lsx_Calloc(c->noiseSpectrum, p->dftLen_ws);
    lsx_Calloc(c->measures, p->measuresLen);
  }

  /* Window for the time-domain samples: a constant gain shaped by
   * lsx_apply_hann. */
  spectrumGain = -2./ SOX_SAMPLE_MIN / sqrt((double)p->measureLen_ws);
  lsx_Calloc(p->spectrumWindow, p->measureLen_ws);
  for (i = 0; i < p->measureLen_ws; ++i)
    p->spectrumWindow[i] = spectrumGain;
  lsx_apply_hann(p->spectrumWindow, (int)p->measureLen_ws);

  /* Window for the retained spectrum bins, built the same way. */
  cepstrumGain = 2 / sqrt((double)p->spectrumEnd - p->spectrumStart);
  lsx_Calloc(p->cepstrumWindow, spectrumLen);
  for (i = 0; i < spectrumLen; ++i)
    p->cepstrumWindow[i] = cepstrumGain;
  lsx_apply_hann(p->cepstrumWindow,(int)spectrumLen);

  /* Per-measurement smoothing multipliers derived from the time constants. */
  p->noiseTcUpMult     = exp(-1 / (p->noiseTcUp   * p->measureFreq));
  p->noiseTcDownMult   = exp(-1 / (p->noiseTcDown * p->measureFreq));
  p->measureTcMult     = exp(-1 / (p->measureTc   * p->measureFreq));
  p->triggerMeasTcMult = exp(-1 / (p->triggerTc   * p->measureFreq));

  p->bootCountMax = p->bootTime * p->measureFreq - .5;
  /* The first measurement is taken once a full window has been read. */
  p->measureTimer_ns = p->measureLen_ns;
  p->bootCount = p->measuresIndex = p->flushedLen_ns = p->samplesIndex_ns = 0;

  effp->out_signal.length = SOX_UNKNOWN_LEN; /* depends on input data */
  return SOX_SUCCESS;
}
164 
/* Flow handler used once the detector has triggered: writes the not yet
 * flushed part of the circular sample buffer to obuf, unwrapping it at the
 * end of the buffer if necessary.
 *
 * While buffered samples remain, no input is consumed (*ilen is set to 0).
 * When the buffer has been emptied, the effect's flow handler is replaced by
 * lsx_flow_copy, which is also called straight away with whatever room is
 * left in obuf.
 *
 * On return *olen holds the number of samples written.  Always returns
 * SOX_SUCCESS.
 */
static int flowFlush(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  size_t const pending = p->samplesLen_ns - p->flushedLen_ns;
  size_t const toBufferEnd = p->samplesLen_ns - p->samplesIndex_ns;
  size_t written = min(pending, *olen);
  size_t const firstPart = min(written, toBufferEnd);

  /* First part: from the current position towards the end of the buffer. */
  memcpy(obuf, p->samples + p->samplesIndex_ns, firstPart * sizeof(*obuf));
  p->samplesIndex_ns += firstPart;

  if (p->samplesIndex_ns == p->samplesLen_ns) {
    /* Reached the end: carry on from the start of the buffer. */
    size_t const secondPart = written - firstPart;
    memcpy(obuf + firstPart, p->samples, secondPart * sizeof(*obuf));
    p->samplesIndex_ns = secondPart;
  }

  p->flushedLen_ns += written;
  if (p->flushedLen_ns != p->samplesLen_ns) {
    *ilen = 0;  /* more buffered audio to emit before taking new input */
  } else {
    /* Buffer drained: hand over to plain copying from now on. */
    size_t copied = *olen - written;
    effp->handler.flow = lsx_flow_copy;
    effp->handler.flow(effp, ibuf, obuf +written, ilen, &copied);
    written += copied;
  }

  *olen = written;
  return SOX_SUCCESS;
}
186 
/* Takes one measurement for channel state c.
 *
 * Steps, all performed in c->dftBuf:
 *  1. Read measureLen_ws samples from the circular buffer, starting at
 *     index_ns and advancing by step_ns, multiply them by spectrumWindow,
 *     zero-pad to dftLen_ws and transform with lsx_safe_rdft.
 *  2. For bins spectrumStart..spectrumEnd-1: smooth the magnitude into
 *     c->spectrum, update c->noiseSpectrum from the squared smoothed value,
 *     subtract noiseReductionAmount times the noise estimate (clamped at 0),
 *     take the square root and apply cepstrumWindow.
 *  3. Zero the remaining bins up to dftLen_ws / 2, transform that half-length
 *     buffer again and average the squared magnitudes of bins
 *     cepstrumStart..cepstrumEnd-1.
 *
 * bootCount >= 0 means the noise estimate is still booting: the spectrum is
 * then averaged with weight bootCount / (1 + bootCount) and the noise
 * estimate simply follows the squared spectrum.
 *
 * Returns max(0, 21 + log(average)).
 */
static double measure(
    priv_t * p, chan_t * c, size_t index_ns, unsigned step_ns, int bootCount)
{
  double * const buf = c->dftBuf;
  /* The spectrum smoothing weight does not depend on the bin. */
  double const spectrumMult =
      bootCount >= 0? bootCount / (1. + bootCount) : p->measureTcMult;
  double noiseMult, sum = 0;
  size_t n, bin;

  /* 1. Windowed, zero-padded transform of the newest samples. */
  n = 0;
  while (n < p->measureLen_ws) {
    buf[n] = p->samples[index_ns] * p->spectrumWindow[n];
    index_ns = (index_ns + step_ns) % p->samplesLen_ns;
    ++n;
  }
  memset(buf + n, 0, (p->dftLen_ws - n) * sizeof(*buf));
  lsx_safe_rdft((int)p->dftLen_ws, 1, buf);

  /* 2. Smoothed, noise-reduced magnitudes; each is written to buf[bin] only
   * after buf[2 * bin] and buf[2 * bin + 1] have been read. */
  memset(buf, 0, p->spectrumStart * sizeof(*buf));
  for (bin = p->spectrumStart; bin < p->spectrumEnd; ++bin) {
    double d = sqrt(sqr(buf[2 * bin]) + sqr(buf[2 * bin + 1]));

    c->spectrum[bin] = c->spectrum[bin] * spectrumMult + d * (1 - spectrumMult);
    d = sqr(c->spectrum[bin]);

    if (bootCount >= 0)
      noiseMult = 0;
    else if (d > c->noiseSpectrum[bin])
      noiseMult = p->noiseTcUpMult;
    else
      noiseMult = p->noiseTcDownMult;
    c->noiseSpectrum[bin] = c->noiseSpectrum[bin] * noiseMult + d * (1 - noiseMult);

    d = sqrt(max(0, d - p->noiseReductionAmount * c->noiseSpectrum[bin]));
    buf[bin] = d * p->cepstrumWindow[bin - p->spectrumStart];
  }

  /* 3. Second transform over the first half of the buffer. */
  memset(buf + bin, 0, ((p->dftLen_ws >> 1) - bin) * sizeof(*buf));
  lsx_safe_rdft((int)p->dftLen_ws >> 1, 1, buf);

  for (bin = p->cepstrumStart; bin < p->cepstrumEnd; ++bin)
    sum += sqr(buf[2 * bin]) + sqr(buf[2 * bin + 1]);
  sum = log(sum / (p->cepstrumEnd - p->cepstrumStart));
  return max(0, 21 + sum);
}
218 
/* Initial flow handler: stores input in the circular buffer and takes a
 * measurement per channel each time the measurement timer expires, until the
 * smoothed measurement of some channel reaches triggerLevel.
 *
 * Nothing is output before the trigger (*olen is set to 0).  On trigger, the
 * stored measurements are searched backwards to decide how many measurement
 * periods of buffered audio to keep, the effect's flow handler is switched to
 * flowFlush, and flowFlush is called immediately with the unread input.
 *
 * On return *ilen holds the number of input samples consumed.  Always returns
 * SOX_SUCCESS.
 */
static int flowTrigger(sox_effect_t * effp, sox_sample_t const * ibuf,
    sox_sample_t * obuf, size_t * ilen, size_t * olen)
{
  priv_t * p = (priv_t *)effp->priv;
  sox_bool triggered = sox_false;
  size_t ch, consumed = 0, keepMeasures = 0;

  while (consumed < *ilen && !triggered) {
    /* One iteration handles one sample of every channel. */
    p->measureTimer_ns -= effp->in_signal.channels;

    for (ch = 0; ch < effp->in_signal.channels; ++ch, ++consumed) {
      chan_t * chan = &p->channels[ch];
      size_t from;
      double meas;

      p->samples[p->samplesIndex_ns++] = *ibuf++;
      if (p->measureTimer_ns)
        continue;  /* no measurement due yet */

      /* Start of the window that ends at the current buffer position.
       * NOTE(review): samplesIndex_ns has already been advanced past this
       * channel's sample, so when the buffer lengths and index are multiples
       * of the channel count, `from' falls on interleave slot
       * (ch + 1) % channels rather than slot ch -- confirm this is intended
       * for multi-channel input. */
      from = (p->samplesIndex_ns + p->samplesLen_ns - p->measureLen_ns) % p->samplesLen_ns;
      meas = measure(p, chan, from, effp->in_signal.channels, p->bootCount);
      chan->measures[p->measuresIndex] = meas;
      chan->meanMeas = chan->meanMeas * p->triggerMeasTcMult +
          meas *(1 - p->triggerMeasTcMult);

      if (chan->meanMeas >= p->triggerLevel)
        triggered = sox_true;

      /* Once any channel has triggered, this search also runs for the
       * channels that follow it in the same frame. */
      if (triggered) {
        unsigned const count = p->measuresLen;
        unsigned slot = p->measuresIndex;
        unsigned age = 0, lastTrigger = count, lastZero = count;

        /* Walk the ring from the newest measurement to the oldest. */
        while (age < count) {
          double const m = chan->measures[slot];
          if (m >= p->triggerLevel && age <= lastTrigger + p->gapLen)
            lastZero = lastTrigger = age;
          else if (!m && lastTrigger >= lastZero)
            lastZero = age;
          ++age;
          slot = (slot + count - 1) % count;
        }
        age = min(age, lastZero);
        /* Keep the largest count found over the channels, at most `count'. */
        keepMeasures = range_limit(age, keepMeasures, count);
      }
      lsx_debug_more("%12g %12g %u",
          meas, chan->meanMeas, (unsigned)keepMeasures);
    }

    if (p->samplesIndex_ns == p->samplesLen_ns)
      p->samplesIndex_ns = 0;  /* wrap the circular buffer */

    if (!p->measureTimer_ns) {
      /* A measurement was just taken: re-arm the timer, advance the ring. */
      p->measureTimer_ns = p->measurePeriod_ns;
      p->measuresIndex = (p->measuresIndex + 1) % p->measuresLen;
      if (p->bootCount >= 0) {
        if (p->bootCount == p->bootCountMax)
          p->bootCount = -1;  /* booting finished */
        else
          ++p->bootCount;
      }
    }
  }

  if (!triggered)
    *olen = 0;
  else {
    size_t unread = *ilen - consumed;
    /* Treat the measurement periods that are not kept as already flushed and
     * move the buffer position past them. */
    p->flushedLen_ns = (p->measuresLen - keepMeasures) * p->measurePeriod_ns;
    p->samplesIndex_ns = (p->samplesIndex_ns + p->flushedLen_ns) % p->samplesLen_ns;
    effp->handler.flow = flowFlush;
    effp->handler.flow(effp, ibuf, obuf, &unread, olen);
    consumed += unread;
  }
  *ilen = consumed;
  return SOX_SUCCESS;
}
274 
/* Drain handler: invokes whichever flow handler is currently installed on
 * the effect with no input (NULL buffer, zero length), so that it can write
 * any remaining output to obuf.  Returns that handler's result.
 */
static int drain(sox_effect_t * effp, sox_sample_t * obuf, size_t * olen)
{
  sox_sample_t const * const noInput = NULL;
  size_t noInputLen = 0;

  return effp->handler.flow(effp, noInput, obuf, &noInputLen, olen);
}
280 
/* Releases every buffer allocated by start().
 *
 * Each pointer is cleared after it has been freed and the per-channel loop is
 * skipped when no channel array exists, so calling stop() a second time, or
 * after a start() that allocated nothing, is harmless (this presumes the
 * pointers were NULL before start() ran).
 *
 * Always returns SOX_SUCCESS.
 */
static int stop(sox_effect_t * effp)
{
  priv_t * p = (priv_t *)effp->priv;
  unsigned i;

  if (p->channels) {
    for (i = 0; i < effp->in_signal.channels; ++i) {
      chan_t * c = &p->channels[i];
      free(c->measures);
      free(c->noiseSpectrum);
      free(c->spectrum);
      free(c->dftBuf);
    }
    free(p->channels);
    p->channels = NULL;
  }

  free(p->cepstrumWindow);
  p->cepstrumWindow = NULL;
  free(p->spectrumWindow);
  p->spectrumWindow = NULL;
  free(p->samples);
  p->samples = NULL;
  return SOX_SUCCESS;
}
299 
lsx_vad_effect_fn(void)300 sox_effect_handler_t const * lsx_vad_effect_fn(void)
301 {
302   static sox_effect_handler_t handler = {"vad", NULL,
303     SOX_EFF_MCHAN | SOX_EFF_LENGTH | SOX_EFF_MODIFY,
304     create, start, flowTrigger, drain, stop, NULL, sizeof(priv_t)
305   };
306   static char const * lines[] = {
307     "[options]",
308     "\t-t trigger-level                (7)",
309     "\t-T trigger-time-constant        (0.25 s)",
310     "\t-s search-time                  (1 s)",
311     "\t-g allowed-gap                  (0.25 s)",
312     "\t-p pre-trigger-time             (0 s)",
313     "Advanced options:",
314     "\t-b noise-est-boot-time          (0.35 s)",
315     "\t-N noise-est-time-constant-up   (0.1 s)",
316     "\t-n noise-est-time-constant-down (0.01 s)",
317     "\t-r noise-reduction-amount       (1.35)",
318     "\t-f measurement-frequency        (20 Hz)",
319     "\t-m measurement-duration         (0.1 s)",
320     "\t-M measurement-time-constant    (0.4 s)",
321     "\t-h high-pass-filter             (50 Hz)",
322     "\t-l low-pass-filter              (6000 Hz)",
323     "\t-H high-pass-lifter             (150 Hz)",
324     "\t-L low-pass-lifter              (2000 Hz)",
325   };
326   static char * usage;
327   handler.usage = lsx_usage_lines(&usage, lines, array_length(lines));
328   return &handler;
329 }
330