Adding support for 48 kHz input to VAD.
This CL adds support for 48 kHz sampling frequency in the VAD, by adding downsampling from 48 to 8 kHz. BUG= TEST=vad_unittest Review URL: https://webrtc-codereview.appspot.com/855010 git-svn-id: http://webrtc.googlecode.com/svn/trunk@2926 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
@ -504,6 +504,9 @@ int WebRtcVad_InitCore(VadInstT* self) {
|
|||||||
memset(self->downsampling_filter_states, 0,
|
memset(self->downsampling_filter_states, 0,
|
||||||
sizeof(self->downsampling_filter_states));
|
sizeof(self->downsampling_filter_states));
|
||||||
|
|
||||||
|
// Initialization of 48 to 8 kHz downsampling.
|
||||||
|
WebRtcSpl_ResetResample48khzTo8khz(&self->state_48_to_8);
|
||||||
|
|
||||||
// Read initial PDF parameters.
|
// Read initial PDF parameters.
|
||||||
for (i = 0; i < kTableSize; i++) {
|
for (i = 0; i < kTableSize; i++) {
|
||||||
self->noise_means[i] = kNoiseDataMeans[i];
|
self->noise_means[i] = kNoiseDataMeans[i];
|
||||||
@ -600,6 +603,31 @@ int WebRtcVad_set_mode_core(VadInstT* self, int mode) {
|
|||||||
// Calculate VAD decision by first extracting feature values and then calculate
|
// Calculate VAD decision by first extracting feature values and then calculate
|
||||||
// probability for both speech and background noise.
|
// probability for both speech and background noise.
|
||||||
|
|
||||||
|
// Calculates a VAD decision for a 48 kHz input frame. The frame is first
// downsampled to 8 kHz, one 10 ms chunk at a time, and the resulting
// narrowband signal is then passed to WebRtcVad_CalcVad8khz().
//
// - inst          : VAD instance; |inst->state_48_to_8| holds the resampler
//                   state and is updated by every chunk that is processed.
// - speech_frame  : Input frame sampled at 48 kHz.
// - frame_length  : Number of samples in |speech_frame|. Only whole 10 ms
//                   chunks (multiples of 480 samples) are resampled.
//
// Returns the value produced by WebRtcVad_CalcVad8khz().
//
// NOTE(review): |speech_nb| holds at most 240 samples (30 ms at 8 kHz), so
// |frame_length| must not exceed 1440. This function does not check that;
// presumably the caller validates rate and frame length first - confirm.
int WebRtcVad_CalcVad48khz(VadInstT* inst, int16_t* speech_frame,
                           int frame_length) {
  int vad;
  int i;
  int16_t speech_nb[240];  // 30 ms in 8 kHz.
  // |tmp_mem| is a temporary memory used by resample function, length is
  // frame length in 10 ms (480 samples) + 256 extra.
  int32_t tmp_mem[480 + 256] = { 0 };
  const int kFrameLen10ms48khz = 480;
  const int kFrameLen10ms8khz = 80;
  int num_10ms_frames = frame_length / kFrameLen10ms48khz;

  for (i = 0; i < num_10ms_frames; i++) {
    // Step through the input as well as the output. Without the input offset
    // every iteration would resample the first 10 ms of |speech_frame| again
    // and the remainder of a 20 or 30 ms frame would never be looked at.
    WebRtcSpl_Resample48khzTo8khz(&speech_frame[i * kFrameLen10ms48khz],
                                  &speech_nb[i * kFrameLen10ms8khz],
                                  &inst->state_48_to_8,
                                  tmp_mem);
  }

  // Do VAD on an 8 kHz signal
  vad = WebRtcVad_CalcVad8khz(inst, speech_nb, frame_length / 6);

  return vad;
}
|
||||||
|
|
||||||
int WebRtcVad_CalcVad32khz(VadInstT* inst, int16_t* speech_frame,
|
int WebRtcVad_CalcVad32khz(VadInstT* inst, int16_t* speech_frame,
|
||||||
int frame_length)
|
int frame_length)
|
||||||
{
|
{
|
||||||
|
@ -16,6 +16,7 @@
|
|||||||
#ifndef WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
#ifndef WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
||||||
#define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
#define WEBRTC_COMMON_AUDIO_VAD_VAD_CORE_H_
|
||||||
|
|
||||||
|
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
||||||
#include "typedefs.h"
|
#include "typedefs.h"
|
||||||
|
|
||||||
enum { kNumChannels = 6 }; // Number of frequency bands (named channels).
|
enum { kNumChannels = 6 }; // Number of frequency bands (named channels).
|
||||||
@ -28,6 +29,7 @@ typedef struct VadInstT_
|
|||||||
|
|
||||||
int vad;
|
int vad;
|
||||||
int32_t downsampling_filter_states[4];
|
int32_t downsampling_filter_states[4];
|
||||||
|
WebRtcSpl_State48khzTo8khz state_48_to_8;
|
||||||
int16_t noise_means[kTableSize];
|
int16_t noise_means[kTableSize];
|
||||||
int16_t speech_means[kTableSize];
|
int16_t speech_means[kTableSize];
|
||||||
int16_t noise_stds[kTableSize];
|
int16_t noise_stds[kTableSize];
|
||||||
@ -82,6 +84,7 @@ int WebRtcVad_InitCore(VadInstT* self);
|
|||||||
int WebRtcVad_set_mode_core(VadInstT* self, int mode);
|
int WebRtcVad_set_mode_core(VadInstT* self, int mode);
|
||||||
|
|
||||||
/****************************************************************************
|
/****************************************************************************
|
||||||
|
* WebRtcVad_CalcVad48khz(...)
|
||||||
* WebRtcVad_CalcVad32khz(...)
|
* WebRtcVad_CalcVad32khz(...)
|
||||||
* WebRtcVad_CalcVad16khz(...)
|
* WebRtcVad_CalcVad16khz(...)
|
||||||
* WebRtcVad_CalcVad8khz(...)
|
* WebRtcVad_CalcVad8khz(...)
|
||||||
@ -100,6 +103,8 @@ int WebRtcVad_set_mode_core(VadInstT* self, int mode);
|
|||||||
* 0 - No active speech
|
* 0 - No active speech
|
||||||
* 1-6 - Active speech
|
* 1-6 - Active speech
|
||||||
*/
|
*/
|
||||||
|
int WebRtcVad_CalcVad48khz(VadInstT* inst, int16_t* speech_frame,
|
||||||
|
int frame_length);
|
||||||
int WebRtcVad_CalcVad32khz(VadInstT* inst, int16_t* speech_frame,
|
int WebRtcVad_CalcVad32khz(VadInstT* inst, int16_t* speech_frame,
|
||||||
int frame_length);
|
int frame_length);
|
||||||
int WebRtcVad_CalcVad16khz(VadInstT* inst, int16_t* speech_frame,
|
int WebRtcVad_CalcVad16khz(VadInstT* inst, int16_t* speech_frame,
|
||||||
|
@ -75,6 +75,9 @@ TEST_F(VadTest, CalcVad) {
|
|||||||
if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) {
|
if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) {
|
||||||
EXPECT_EQ(0, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j]));
|
EXPECT_EQ(0, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j]));
|
||||||
}
|
}
|
||||||
|
if (ValidRatesAndFrameLengths(48000, kFrameLengths[j])) {
|
||||||
|
EXPECT_EQ(0, WebRtcVad_CalcVad48khz(self, speech, kFrameLengths[j]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Construct a speech signal that will trigger the VAD in all modes. It is
|
// Construct a speech signal that will trigger the VAD in all modes. It is
|
||||||
@ -92,6 +95,9 @@ TEST_F(VadTest, CalcVad) {
|
|||||||
if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) {
|
if (ValidRatesAndFrameLengths(32000, kFrameLengths[j])) {
|
||||||
EXPECT_EQ(1, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j]));
|
EXPECT_EQ(1, WebRtcVad_CalcVad32khz(self, speech, kFrameLengths[j]));
|
||||||
}
|
}
|
||||||
|
if (ValidRatesAndFrameLengths(48000, kFrameLengths[j])) {
|
||||||
|
EXPECT_EQ(1, WebRtcVad_CalcVad48khz(self, speech, kFrameLengths[j]));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
free(self);
|
free(self);
|
||||||
|
@ -23,10 +23,11 @@ namespace {
|
|||||||
|
|
||||||
TEST_F(VadTest, vad_sp) {
|
TEST_F(VadTest, vad_sp) {
|
||||||
VadInstT* self = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
|
VadInstT* self = reinterpret_cast<VadInstT*>(malloc(sizeof(VadInstT)));
|
||||||
int16_t zeros[kMaxFrameLength] = { 0 };
|
const int kMaxFrameLenSp = 960; // Maximum frame length in this unittest.
|
||||||
|
int16_t zeros[kMaxFrameLenSp] = { 0 };
|
||||||
int32_t state[2] = { 0 };
|
int32_t state[2] = { 0 };
|
||||||
int16_t data_in[kMaxFrameLength];
|
int16_t data_in[kMaxFrameLenSp];
|
||||||
int16_t data_out[kMaxFrameLength];
|
int16_t data_out[kMaxFrameLenSp];
|
||||||
|
|
||||||
// We expect the first value to be 1600 as long as |frame_counter| is zero,
|
// We expect the first value to be 1600 as long as |frame_counter| is zero,
|
||||||
// which is true for the first iteration.
|
// which is true for the first iteration.
|
||||||
@ -39,20 +40,18 @@ TEST_F(VadTest, vad_sp) {
|
|||||||
|
|
||||||
// Construct a speech signal that will trigger the VAD in all modes. It is
|
// Construct a speech signal that will trigger the VAD in all modes. It is
|
||||||
// known that (i * i) will wrap around, but that doesn't matter in this case.
|
// known that (i * i) will wrap around, but that doesn't matter in this case.
|
||||||
for (int16_t i = 0; i < kMaxFrameLength; ++i) {
|
for (int16_t i = 0; i < kMaxFrameLenSp; ++i) {
|
||||||
data_in[i] = (i * i);
|
data_in[i] = (i * i);
|
||||||
}
|
}
|
||||||
// Input values all zeros, expect all zeros out.
|
// Input values all zeros, expect all zeros out.
|
||||||
WebRtcVad_Downsampling(zeros, data_out, state,
|
WebRtcVad_Downsampling(zeros, data_out, state, kMaxFrameLenSp);
|
||||||
static_cast<int>(kMaxFrameLength));
|
|
||||||
EXPECT_EQ(0, state[0]);
|
EXPECT_EQ(0, state[0]);
|
||||||
EXPECT_EQ(0, state[1]);
|
EXPECT_EQ(0, state[1]);
|
||||||
for (int16_t i = 0; i < kMaxFrameLength / 2; ++i) {
|
for (int16_t i = 0; i < kMaxFrameLenSp / 2; ++i) {
|
||||||
EXPECT_EQ(0, data_out[i]);
|
EXPECT_EQ(0, data_out[i]);
|
||||||
}
|
}
|
||||||
// Make a simple non-zero data test.
|
// Make a simple non-zero data test.
|
||||||
WebRtcVad_Downsampling(data_in, data_out, state,
|
WebRtcVad_Downsampling(data_in, data_out, state, kMaxFrameLenSp);
|
||||||
static_cast<int>(kMaxFrameLength));
|
|
||||||
EXPECT_EQ(207, state[0]);
|
EXPECT_EQ(207, state[0]);
|
||||||
EXPECT_EQ(2270, state[1]);
|
EXPECT_EQ(2270, state[1]);
|
||||||
|
|
||||||
|
@ -36,12 +36,16 @@ bool VadTest::ValidRatesAndFrameLengths(int rate, int frame_length) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
} else if (rate == 32000) {
|
||||||
if (rate == 32000) {
|
|
||||||
if (frame_length == 320 || frame_length == 640 || frame_length == 960) {
|
if (frame_length == 320 || frame_length == 640 || frame_length == 960) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
} else if (rate == 48000) {
|
||||||
|
if (frame_length == 480 || frame_length == 960 || frame_length == 1440) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
@ -122,15 +126,26 @@ TEST_F(VadTest, ApiTest) {
|
|||||||
|
|
||||||
TEST_F(VadTest, ValidRatesFrameLengths) {
|
TEST_F(VadTest, ValidRatesFrameLengths) {
|
||||||
// This test verifies valid and invalid rate/frame_length combinations. We
|
// This test verifies valid and invalid rate/frame_length combinations. We
|
||||||
// loop through sampling rates and frame lengths from negative values to
|
// loop through some sampling rates and frame lengths from negative values to
|
||||||
// values larger than possible.
|
// values larger than possible.
|
||||||
for (int16_t rate = -1; rate <= kRates[kRatesSize - 1] + 1; rate++) {
|
const int kNumRates = 12;
|
||||||
for (int16_t frame_length = -1; frame_length <= kMaxFrameLength + 1;
|
const int kRates[kNumRates] = {
|
||||||
frame_length++) {
|
-8000, -4000, 0, 4000, 8000, 8001, 15999, 16000, 32000, 48000, 48001, 96000
|
||||||
if (ValidRatesAndFrameLengths(rate, frame_length)) {
|
};
|
||||||
EXPECT_EQ(0, WebRtcVad_ValidRateAndFrameLength(rate, frame_length));
|
|
||||||
|
const int kNumFrameLengths = 13;
|
||||||
|
const int kFrameLengths[kNumFrameLengths] = {
|
||||||
|
-10, 0, 80, 81, 159, 160, 240, 320, 480, 640, 960, 1440, 2000
|
||||||
|
};
|
||||||
|
|
||||||
|
for (int i = 0; i < kNumRates; i++) {
|
||||||
|
for (int j = 0; j < kNumFrameLengths; j++) {
|
||||||
|
if (ValidRatesAndFrameLengths(kRates[i], kFrameLengths[j])) {
|
||||||
|
EXPECT_EQ(0, WebRtcVad_ValidRateAndFrameLength(kRates[i],
|
||||||
|
kFrameLengths[j]));
|
||||||
} else {
|
} else {
|
||||||
EXPECT_EQ(-1, WebRtcVad_ValidRateAndFrameLength(rate, frame_length));
|
EXPECT_EQ(-1, WebRtcVad_ValidRateAndFrameLength(kRates[i],
|
||||||
|
kFrameLengths[j]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -24,12 +24,12 @@ const int kModes[] = { 0, 1, 2, 3 };
|
|||||||
const size_t kModesSize = sizeof(kModes) / sizeof(*kModes);
|
const size_t kModesSize = sizeof(kModes) / sizeof(*kModes);
|
||||||
|
|
||||||
// Rates we support.
|
// Rates we support.
|
||||||
const int kRates[] = { 8000, 12000, 16000, 24000, 32000 };
|
const int kRates[] = { 8000, 12000, 16000, 24000, 32000, 48000 };
|
||||||
const size_t kRatesSize = sizeof(kRates) / sizeof(*kRates);
|
const size_t kRatesSize = sizeof(kRates) / sizeof(*kRates);
|
||||||
|
|
||||||
// Frame lengths we support.
|
// Frame lengths we support.
|
||||||
const int kMaxFrameLength = 960;
|
const int kMaxFrameLength = 1440;
|
||||||
const int kFrameLengths[] = { 80, 120, 160, 240, 320, 480, 640,
|
const int kFrameLengths[] = { 80, 120, 160, 240, 320, 480, 640, 960,
|
||||||
kMaxFrameLength };
|
kMaxFrameLength };
|
||||||
const size_t kFrameLengthsSize = sizeof(kFrameLengths) / sizeof(*kFrameLengths);
|
const size_t kFrameLengthsSize = sizeof(kFrameLengths) / sizeof(*kFrameLengths);
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
#include "typedefs.h"
|
#include "typedefs.h"
|
||||||
|
|
||||||
static const int kInitCheck = 42;
|
static const int kInitCheck = 42;
|
||||||
static const int kValidRates[] = { 8000, 16000, 32000 };
|
static const int kValidRates[] = { 8000, 16000, 32000, 48000 };
|
||||||
static const size_t kRatesSize = sizeof(kValidRates) / sizeof(*kValidRates);
|
static const size_t kRatesSize = sizeof(kValidRates) / sizeof(*kValidRates);
|
||||||
static const int kMaxFrameLengthMs = 30;
|
static const int kMaxFrameLengthMs = 30;
|
||||||
|
|
||||||
@ -93,7 +93,9 @@ int WebRtcVad_Process(VadInst* handle, int fs, int16_t* audio_frame,
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fs == 32000) {
|
if (fs == 48000) {
|
||||||
|
vad = WebRtcVad_CalcVad48khz(self, audio_frame, frame_length);
|
||||||
|
} else if (fs == 32000) {
|
||||||
vad = WebRtcVad_CalcVad32khz(self, audio_frame, frame_length);
|
vad = WebRtcVad_CalcVad32khz(self, audio_frame, frame_length);
|
||||||
} else if (fs == 16000) {
|
} else if (fs == 16000) {
|
||||||
vad = WebRtcVad_CalcVad16khz(self, audio_frame, frame_length);
|
vad = WebRtcVad_CalcVad16khz(self, audio_frame, frame_length);
|
||||||
|
Reference in New Issue
Block a user