From 518c683f3e413523a458a94b533274bd7f29992d Mon Sep 17 00:00:00 2001 From: aluebs Date: Wed, 24 Jun 2015 18:45:58 -0700 Subject: [PATCH] Pull the Voice Activity Detector out from the AGC This change generates the same bit-exact values as before when running through audioproc_f. This change was originally uploaded here: * https://codereview.webrtc.org/1181933002/ * https://codereview.webrtc.org/1177043017/ And reverted because of an ASAN problem in Chrome here: * https://codereview.webrtc.org/1192863006/ * https://codereview.webrtc.org/1194963003/ TBR=andrew@webrtc.org Review URL: https://codereview.webrtc.org/1211563003 Cr-Commit-Position: refs/heads/master@{#9502} --- webrtc/modules/audio_processing/BUILD.gn | 38 ++-- webrtc/modules/audio_processing/agc/agc.cc | 77 ++------ webrtc/modules/audio_processing/agc/agc.h | 19 +- .../agc/agc_audio_proc_internal.h | 81 --------- .../agc/agc_manager_direct.cc | 2 +- .../audio_processing/agc/noise_gmm_tables.h | 77 -------- .../audio_processing/agc/voice_gmm_tables.h | 77 -------- .../audio_processing/audio_processing.gypi | 38 ++-- .../audio_processing/{agc => vad}/common.h | 6 +- .../audio_processing/{agc => vad}/gmm.cc | 13 +- .../audio_processing/{agc => vad}/gmm.h | 6 +- .../{agc => vad}/gmm_unittest.cc | 6 +- .../audio_processing/vad/noise_gmm_tables.h | 85 +++++++++ .../{agc => vad}/pitch_based_vad.cc | 20 ++- .../{agc => vad}/pitch_based_vad.h | 15 +- .../{agc => vad}/pitch_based_vad_unittest.cc | 26 +-- .../{agc => vad}/pitch_internal.cc | 3 +- .../{agc => vad}/pitch_internal.h | 6 +- .../{agc => vad}/pitch_internal_unittest.cc | 12 +- .../{agc => vad}/pole_zero_filter.cc | 27 ++- .../{agc => vad}/pole_zero_filter.h | 6 +- .../{agc => vad}/pole_zero_filter_unittest.cc | 49 ++--- .../{agc => vad}/standalone_vad.cc | 9 +- .../{agc => vad}/standalone_vad.h | 2 +- .../{agc => vad}/standalone_vad_unittest.cc | 10 +- .../vad_audio_proc.cc} | 81 +++++---- .../agc_audio_proc.h => vad/vad_audio_proc.h} | 17 +- 
.../vad/vad_audio_proc_internal.h | 94 ++++++++++ .../vad_audio_proc_unittest.cc} | 12 +- .../vad_circular_buffer.cc} | 32 ++-- .../vad_circular_buffer.h} | 14 +- .../vad_circular_buffer_unittest.cc} | 41 ++--- .../vad/voice_activity_detector.cc | 87 +++++++++ .../vad/voice_activity_detector.h | 70 ++++++++ .../vad/voice_activity_detector_unittest.cc | 168 ++++++++++++++++++ .../audio_processing/vad/voice_gmm_tables.h | 85 +++++++++ webrtc/modules/modules.gyp | 15 +- webrtc/tools/agc/activity_metric.cc | 12 +- 38 files changed, 873 insertions(+), 565 deletions(-) delete mode 100644 webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h delete mode 100644 webrtc/modules/audio_processing/agc/noise_gmm_tables.h delete mode 100644 webrtc/modules/audio_processing/agc/voice_gmm_tables.h rename webrtc/modules/audio_processing/{agc => vad}/common.h (81%) rename webrtc/modules/audio_processing/{agc => vad}/gmm.cc (81%) rename webrtc/modules/audio_processing/{agc => vad}/gmm.h (91%) rename webrtc/modules/audio_processing/{agc => vad}/gmm_unittest.cc (91%) create mode 100644 webrtc/modules/audio_processing/vad/noise_gmm_tables.h rename webrtc/modules/audio_processing/{agc => vad}/pitch_based_vad.cc (87%) rename webrtc/modules/audio_processing/{agc => vad}/pitch_based_vad.h (80%) rename webrtc/modules/audio_processing/{agc => vad}/pitch_based_vad_unittest.cc (73%) rename webrtc/modules/audio_processing/{agc => vad}/pitch_internal.cc (96%) rename webrtc/modules/audio_processing/{agc => vad}/pitch_internal.h (84%) rename webrtc/modules/audio_processing/{agc => vad}/pitch_internal_unittest.cc (82%) rename webrtc/modules/audio_processing/{agc => vad}/pole_zero_filter.cc (83%) rename webrtc/modules/audio_processing/{agc => vad}/pole_zero_filter.h (87%) rename webrtc/modules/audio_processing/{agc => vad}/pole_zero_filter_unittest.cc (50%) rename webrtc/modules/audio_processing/{agc => vad}/standalone_vad.cc (93%) rename webrtc/modules/audio_processing/{agc => 
vad}/standalone_vad.h (97%) rename webrtc/modules/audio_processing/{agc => vad}/standalone_vad_unittest.cc (94%) rename webrtc/modules/audio_processing/{agc/agc_audio_proc.cc => vad/vad_audio_proc.cc} (78%) rename webrtc/modules/audio_processing/{agc/agc_audio_proc.h => vad/vad_audio_proc.h} (87%) create mode 100644 webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h rename webrtc/modules/audio_processing/{agc/agc_audio_proc_unittest.cc => vad/vad_audio_proc_unittest.cc} (88%) rename webrtc/modules/audio_processing/{agc/circular_buffer.cc => vad/vad_circular_buffer.cc} (75%) rename webrtc/modules/audio_processing/{agc/circular_buffer.h => vad/vad_circular_buffer.h} (86%) rename webrtc/modules/audio_processing/{agc/circular_buffer_unittest.cc => vad/vad_circular_buffer_unittest.cc} (76%) create mode 100644 webrtc/modules/audio_processing/vad/voice_activity_detector.cc create mode 100644 webrtc/modules/audio_processing/vad/voice_activity_detector.h create mode 100644 webrtc/modules/audio_processing/vad/voice_activity_detector_unittest.cc create mode 100644 webrtc/modules/audio_processing/vad/voice_gmm_tables.h diff --git a/webrtc/modules/audio_processing/BUILD.gn b/webrtc/modules/audio_processing/BUILD.gn index 907f22b655..dd474293a7 100644 --- a/webrtc/modules/audio_processing/BUILD.gn +++ b/webrtc/modules/audio_processing/BUILD.gn @@ -38,17 +38,9 @@ source_set("audio_processing") { "aecm/include/echo_control_mobile.h", "agc/agc.cc", "agc/agc.h", - "agc/agc_audio_proc.cc", - "agc/agc_audio_proc.h", - "agc/agc_audio_proc_internal.h", "agc/agc_manager_direct.cc", "agc/agc_manager_direct.h", - "agc/circular_buffer.cc", - "agc/circular_buffer.h", - "agc/common.h", "agc/gain_map_internal.h", - "agc/gmm.cc", - "agc/gmm.h", "agc/histogram.cc", "agc/histogram.h", "agc/legacy/analog_agc.c", @@ -56,18 +48,8 @@ source_set("audio_processing") { "agc/legacy/digital_agc.c", "agc/legacy/digital_agc.h", "agc/legacy/gain_control.h", - "agc/noise_gmm_tables.h", - 
"agc/pitch_based_vad.cc", - "agc/pitch_based_vad.h", - "agc/pitch_internal.cc", - "agc/pitch_internal.h", - "agc/pole_zero_filter.cc", - "agc/pole_zero_filter.h", - "agc/standalone_vad.cc", - "agc/standalone_vad.h", "agc/utility.cc", "agc/utility.h", - "agc/voice_gmm_tables.h", "audio_buffer.cc", "audio_buffer.h", "audio_processing_impl.cc", @@ -125,6 +107,26 @@ source_set("audio_processing") { "utility/delay_estimator_internal.h", "utility/delay_estimator_wrapper.c", "utility/delay_estimator_wrapper.h", + "vad/common.h", + "vad/gmm.cc", + "vad/gmm.h", + "vad/noise_gmm_tables.h", + "vad/pitch_based_vad.cc", + "vad/pitch_based_vad.h", + "vad/pitch_internal.cc", + "vad/pitch_internal.h", + "vad/pole_zero_filter.cc", + "vad/pole_zero_filter.h", + "vad/standalone_vad.cc", + "vad/standalone_vad.h", + "vad/vad_audio_proc.cc", + "vad/vad_audio_proc.h", + "vad/vad_audio_proc_internal.h", + "vad/vad_circular_buffer.cc", + "vad/vad_circular_buffer.h", + "vad/voice_activity_detector.cc", + "vad/voice_activity_detector.h", + "vad/voice_gmm_tables.h", "voice_detection_impl.cc", "voice_detection_impl.h", ] diff --git a/webrtc/modules/audio_processing/agc/agc.cc b/webrtc/modules/audio_processing/agc/agc.cc index 6041435bd9..80c3e1fe72 100644 --- a/webrtc/modules/audio_processing/agc/agc.cc +++ b/webrtc/modules/audio_processing/agc/agc.cc @@ -14,13 +14,10 @@ #include #include +#include -#include "webrtc/common_audio/resampler/include/resampler.h" -#include "webrtc/modules/audio_processing/agc/agc_audio_proc.h" -#include "webrtc/modules/audio_processing/agc/common.h" +#include "webrtc/base/checks.h" #include "webrtc/modules/audio_processing/agc/histogram.h" -#include "webrtc/modules/audio_processing/agc/pitch_based_vad.h" -#include "webrtc/modules/audio_processing/agc/standalone_vad.h" #include "webrtc/modules/audio_processing/agc/utility.h" #include "webrtc/modules/interface/module_common_types.h" @@ -28,7 +25,6 @@ namespace webrtc { namespace { const int kDefaultLevelDbfs = -18; 
-const double kDefaultVoiceValue = 1.0; const int kNumAnalysisFrames = 100; const double kActivityThreshold = 0.3; @@ -36,16 +32,9 @@ const double kActivityThreshold = 0.3; Agc::Agc() : target_level_loudness_(Dbfs2Loudness(kDefaultLevelDbfs)), - last_voice_probability_(kDefaultVoiceValue), target_level_dbfs_(kDefaultLevelDbfs), - standalone_vad_enabled_(true), histogram_(Histogram::Create(kNumAnalysisFrames)), - inactive_histogram_(Histogram::Create()), - audio_processing_(new AgcAudioProc()), - pitch_based_vad_(new PitchBasedVad()), - standalone_vad_(StandaloneVad::Create()), - // Initialize to the most common resampling situation. - resampler_(new Resampler(32000, kSampleRateHz, 1)) { + inactive_histogram_(Histogram::Create()) { } Agc::~Agc() {} @@ -61,55 +50,13 @@ float Agc::AnalyzePreproc(const int16_t* audio, int length) { } int Agc::Process(const int16_t* audio, int length, int sample_rate_hz) { - assert(length == sample_rate_hz / 100); - if (sample_rate_hz > 32000) { - return -1; - } - // Resample to the required rate. - int16_t resampled[kLength10Ms]; - const int16_t* resampled_ptr = audio; - if (sample_rate_hz != kSampleRateHz) { - if (resampler_->ResetIfNeeded(sample_rate_hz, kSampleRateHz, 1) != 0) { - return -1; - } - resampler_->Push(audio, length, resampled, kLength10Ms, length); - resampled_ptr = resampled; - } - assert(length == kLength10Ms); - - if (standalone_vad_enabled_) { - if (standalone_vad_->AddAudio(resampled_ptr, length) != 0) - return -1; - } - - AudioFeatures features; - audio_processing_->ExtractFeatures(resampled_ptr, length, &features); - if (features.num_frames > 0) { - if (features.silence) { - // The other features are invalid, so update the histogram with an - // arbitrary low value. - for (int n = 0; n < features.num_frames; ++n) - histogram_->Update(features.rms[n], 0.01); - return 0; - } - - // Initialize to 0.5 which is a neutral value for combining probabilities, - // in case the standalone-VAD is not enabled. 
- double p_combined[] = {0.5, 0.5, 0.5, 0.5}; - static_assert(sizeof(p_combined) / sizeof(p_combined[0]) == kMaxNumFrames, - "combined probability incorrect size"); - if (standalone_vad_enabled_) { - if (standalone_vad_->GetActivity(p_combined, kMaxNumFrames) < 0) - return -1; - } - // If any other VAD is enabled it must be combined before calling the - // pitch-based VAD. - if (pitch_based_vad_->VoicingProbability(features, p_combined) < 0) - return -1; - for (int n = 0; n < features.num_frames; n++) { - histogram_->Update(features.rms[n], p_combined[n]); - last_voice_probability_ = p_combined[n]; - } + vad_.ProcessChunk(audio, length, sample_rate_hz); + const std::vector& rms = vad_.chunkwise_rms(); + const std::vector& probabilities = + vad_.chunkwise_voice_probabilities(); + DCHECK_EQ(rms.size(), probabilities.size()); + for (size_t i = 0; i < rms.size(); ++i) { + histogram_->Update(rms[i], probabilities[i]); } return 0; } @@ -151,8 +98,4 @@ int Agc::set_target_level_dbfs(int level) { return 0; } -void Agc::EnableStandaloneVad(bool enable) { - standalone_vad_enabled_ = enable; -} - } // namespace webrtc diff --git a/webrtc/modules/audio_processing/agc/agc.h b/webrtc/modules/audio_processing/agc/agc.h index 1ecdab1166..dd4605e812 100644 --- a/webrtc/modules/audio_processing/agc/agc.h +++ b/webrtc/modules/audio_processing/agc/agc.h @@ -12,16 +12,13 @@ #define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_AGC_H_ #include "webrtc/base/scoped_ptr.h" +#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h" #include "webrtc/typedefs.h" namespace webrtc { class AudioFrame; -class AgcAudioProc; class Histogram; -class PitchBasedVad; -class Resampler; -class StandaloneVad; class Agc { public: @@ -44,24 +41,16 @@ class Agc { virtual int set_target_level_dbfs(int level); virtual int target_level_dbfs() const { return target_level_dbfs_; } - virtual void EnableStandaloneVad(bool enable); - virtual bool standalone_vad_enabled() const { - return 
standalone_vad_enabled_; + virtual float voice_probability() const { + return vad_.last_voice_probability(); } - virtual double voice_probability() const { return last_voice_probability_; } - private: double target_level_loudness_; - double last_voice_probability_; int target_level_dbfs_; - bool standalone_vad_enabled_; rtc::scoped_ptr histogram_; rtc::scoped_ptr inactive_histogram_; - rtc::scoped_ptr audio_processing_; - rtc::scoped_ptr pitch_based_vad_; - rtc::scoped_ptr standalone_vad_; - rtc::scoped_ptr resampler_; + VoiceActivityDetector vad_; }; } // namespace webrtc diff --git a/webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h b/webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h deleted file mode 100644 index f3b7fd1e93..0000000000 --- a/webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_AGC_AUDIO_PROC_INTERNAL_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_AGC_AUDIO_PROC_INTERNAL_H_ - -namespace webrtc { - -// These values should match MATLAB counterparts for unit-tests to pass. 
-static const double kCorrWeight[] = { - 1.000000, 0.985000, 0.970225, 0.955672, 0.941337, 0.927217, 0.913308, - 0.899609, 0.886115, 0.872823, 0.859730, 0.846834, 0.834132, 0.821620, - 0.809296, 0.797156, 0.785199 -}; - -static const double kLpcAnalWin[] = { - 0.00000000, 0.01314436, 0.02628645, 0.03942400, 0.05255473, 0.06567639, - 0.07878670, 0.09188339, 0.10496421, 0.11802689, 0.13106918, 0.14408883, - 0.15708358, 0.17005118, 0.18298941, 0.19589602, 0.20876878, 0.22160547, - 0.23440387, 0.24716177, 0.25987696, 0.27254725, 0.28517045, 0.29774438, - 0.31026687, 0.32273574, 0.33514885, 0.34750406, 0.35979922, 0.37203222, - 0.38420093, 0.39630327, 0.40833713, 0.42030043, 0.43219112, 0.44400713, - 0.45574642, 0.46740697, 0.47898676, 0.49048379, 0.50189608, 0.51322164, - 0.52445853, 0.53560481, 0.54665854, 0.55761782, 0.56848075, 0.57924546, - 0.58991008, 0.60047278, 0.61093173, 0.62128512, 0.63153117, 0.64166810, - 0.65169416, 0.66160761, 0.67140676, 0.68108990, 0.69065536, 0.70010148, - 0.70942664, 0.71862923, 0.72770765, 0.73666033, 0.74548573, 0.75418233, - 0.76274862, 0.77118312, 0.77948437, 0.78765094, 0.79568142, 0.80357442, - 0.81132858, 0.81894256, 0.82641504, 0.83374472, 0.84093036, 0.84797069, - 0.85486451, 0.86161063, 0.86820787, 0.87465511, 0.88095122, 0.88709512, - 0.89308574, 0.89892206, 0.90460306, 0.91012776, 0.91549520, 0.92070447, - 0.92575465, 0.93064488, 0.93537432, 0.93994213, 0.94434755, 0.94858979, - 0.95266814, 0.95658189, 0.96033035, 0.96391289, 0.96732888, 0.97057773, - 0.97365889, 0.97657181, 0.97931600, 0.98189099, 0.98429632, 0.98653158, - 0.98859639, 0.99049038, 0.99221324, 0.99376466, 0.99514438, 0.99635215, - 0.99738778, 0.99825107, 0.99894188, 0.99946010, 0.99980562, 0.99997840, - 0.99997840, 0.99980562, 0.99946010, 0.99894188, 0.99825107, 0.99738778, - 0.99635215, 0.99514438, 0.99376466, 0.99221324, 0.99049038, 0.98859639, - 0.98653158, 0.98429632, 0.98189099, 0.97931600, 0.97657181, 0.97365889, - 0.97057773, 0.96732888, 0.96391289, 
0.96033035, 0.95658189, 0.95266814, - 0.94858979, 0.94434755, 0.93994213, 0.93537432, 0.93064488, 0.92575465, - 0.92070447, 0.91549520, 0.91012776, 0.90460306, 0.89892206, 0.89308574, - 0.88709512, 0.88095122, 0.87465511, 0.86820787, 0.86161063, 0.85486451, - 0.84797069, 0.84093036, 0.83374472, 0.82641504, 0.81894256, 0.81132858, - 0.80357442, 0.79568142, 0.78765094, 0.77948437, 0.77118312, 0.76274862, - 0.75418233, 0.74548573, 0.73666033, 0.72770765, 0.71862923, 0.70942664, - 0.70010148, 0.69065536, 0.68108990, 0.67140676, 0.66160761, 0.65169416, - 0.64166810, 0.63153117, 0.62128512, 0.61093173, 0.60047278, 0.58991008, - 0.57924546, 0.56848075, 0.55761782, 0.54665854, 0.53560481, 0.52445853, - 0.51322164, 0.50189608, 0.49048379, 0.47898676, 0.46740697, 0.45574642, - 0.44400713, 0.43219112, 0.42030043, 0.40833713, 0.39630327, 0.38420093, - 0.37203222, 0.35979922, 0.34750406, 0.33514885, 0.32273574, 0.31026687, - 0.29774438, 0.28517045, 0.27254725, 0.25987696, 0.24716177, 0.23440387, - 0.22160547, 0.20876878, 0.19589602, 0.18298941, 0.17005118, 0.15708358, - 0.14408883, 0.13106918, 0.11802689, 0.10496421, 0.09188339, 0.07878670, - 0.06567639, 0.05255473, 0.03942400, 0.02628645, 0.01314436, 0.00000000 -}; - -static const int kFilterOrder = 2; -static const float kCoeffNumerator[kFilterOrder + 1] = {0.974827f, -1.949650f, - 0.974827f}; -static const float kCoeffDenominator[kFilterOrder + 1] = {1.0f, -1.971999f, - 0.972457f}; - -static_assert(kFilterOrder + 1 == - sizeof(kCoeffNumerator) / sizeof(kCoeffNumerator[0]), - "numerator coefficients incorrect size"); -static_assert(kFilterOrder + 1 == - sizeof(kCoeffDenominator) / sizeof(kCoeffDenominator[0]), - "denominator coefficients incorrect size"); - -} // namespace webrtc - -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_AUDIO_PROCESSING_H_ diff --git a/webrtc/modules/audio_processing/agc/agc_manager_direct.cc b/webrtc/modules/audio_processing/agc/agc_manager_direct.cc index 573d48cdb9..74f55407a4 100644 --- 
a/webrtc/modules/audio_processing/agc/agc_manager_direct.cc +++ b/webrtc/modules/audio_processing/agc/agc_manager_direct.cc @@ -321,7 +321,7 @@ void AgcManagerDirect::SetCaptureMuted(bool muted) { } float AgcManagerDirect::voice_probability() { - return static_cast(agc_->voice_probability()); + return agc_->voice_probability(); } int AgcManagerDirect::CheckVolumeAndReset() { diff --git a/webrtc/modules/audio_processing/agc/noise_gmm_tables.h b/webrtc/modules/audio_processing/agc/noise_gmm_tables.h deleted file mode 100644 index 779fd8c368..0000000000 --- a/webrtc/modules/audio_processing/agc/noise_gmm_tables.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// GMM tables for inactive segments. Generated by MakeGmmTables.m. 
- -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_NOISE_GMM_TABLES_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_NOISE_GMM_TABLES_H_ - -static const int kNoiseGmmNumMixtures = 12; -static const int kNoiseGmmDim = 3; - -static const double kNoiseGmmCovarInverse[kNoiseGmmNumMixtures] - [kNoiseGmmDim][kNoiseGmmDim] = { - {{ 7.36219567592941e+00, 4.83060785179861e-03, 1.23335151497610e-02}, - { 4.83060785179861e-03, 1.65289507047817e-04, -2.41490588169997e-04}, - { 1.23335151497610e-02, -2.41490588169997e-04, 6.59472060689382e-03}}, - {{ 8.70265239309140e+00, -5.30636201431086e-04, 5.44014966585347e-03}, - {-5.30636201431086e-04, 3.11095453521008e-04, -1.86287206836035e-04}, - { 5.44014966585347e-03, -1.86287206836035e-04, 6.29493388790744e-04}}, - {{ 4.53467851955055e+00, -3.92977536695197e-03, -2.46521420693317e-03}, - {-3.92977536695197e-03, 4.94650752632750e-05, -1.08587438501826e-05}, - {-2.46521420693317e-03, -1.08587438501826e-05, 9.28793975422261e-05}}, - {{ 9.26817997114275e-01, -4.03976069276753e-04, -3.56441427392165e-03}, - {-4.03976069276753e-04, 2.51976251631430e-06, 1.46914206734572e-07}, - {-3.56441427392165e-03, 1.46914206734572e-07, 8.19914567685373e-05}}, - {{ 7.61715986787441e+00, -1.54889041216888e-04, 2.41756280071656e-02}, - {-1.54889041216888e-04, 3.50282550461672e-07, -6.27251196972490e-06}, - { 2.41756280071656e-02, -6.27251196972490e-06, 1.45061847649872e-02}}, - {{ 8.31193642663158e+00, -3.84070508164323e-04, -3.09750630821876e-02}, - {-3.84070508164323e-04, 3.80433432277336e-07, -1.14321142836636e-06}, - {-3.09750630821876e-02, -1.14321142836636e-06, 8.35091486289997e-04}}, - {{ 9.67283151270894e-01, 5.82465812445039e-05, -3.18350798617053e-03}, - { 5.82465812445039e-05, 2.23762672000318e-07, -7.74196587408623e-07}, - {-3.18350798617053e-03, -7.74196587408623e-07, 3.85120938338325e-04}}, - {{ 8.28066236985388e+00, 5.87634508319763e-05, 6.99303090891743e-03}, - { 5.87634508319763e-05, 2.93746018618058e-07, 3.40843332882272e-07}, - { 
6.99303090891743e-03, 3.40843332882272e-07, 1.99379171190344e-04}}, - {{ 6.07488998675646e+00, -1.11494526618473e-02, 5.10013111123381e-03}, - {-1.11494526618473e-02, 6.99238879921751e-04, 5.36718550370870e-05}, - { 5.10013111123381e-03, 5.36718550370870e-05, 5.26909853276753e-04}}, - {{ 6.90492021419175e+00, 4.20639355257863e-04, -2.38612752336481e-03}, - { 4.20639355257863e-04, 3.31246767338153e-06, -2.42052288150859e-08}, - {-2.38612752336481e-03, -2.42052288150859e-08, 4.46608368363412e-04}}, - {{ 1.31069150869715e+01, -1.73718583865670e-04, -1.97591814508578e-02}, - {-1.73718583865670e-04, 2.80451716300124e-07, 9.96570755379865e-07}, - {-1.97591814508578e-02, 9.96570755379865e-07, 2.41361900868847e-03}}, - {{ 4.69566344239814e+00, -2.61077567563690e-04, 5.26359000761433e-03}, - {-2.61077567563690e-04, 1.82420859823767e-06, -7.83645887541601e-07}, - { 5.26359000761433e-03, -7.83645887541601e-07, 1.33586288288802e-02}}}; - -static const double kNoiseGmmMean[kNoiseGmmNumMixtures][kNoiseGmmDim] = { - {-2.01386094766163e+00, 1.69702162045397e+02, 7.41715804872181e+01}, - {-1.94684591777290e+00, 1.42398396732668e+02, 1.64186321157831e+02}, - {-2.29319297562437e+00, 3.86415425589868e+02, 2.13452215267125e+02}, - {-3.25487177070268e+00, 1.08668712553616e+03, 2.33119949467419e+02}, - {-2.13159632447467e+00, 4.83821702557717e+03, 6.86786166673740e+01}, - {-2.26171410780526e+00, 4.79420193982422e+03, 1.53222513286450e+02}, - {-3.32166740703185e+00, 4.35161135834358e+03, 1.33206448431316e+02}, - {-2.19290322814343e+00, 3.98325506609408e+03, 2.13249167359934e+02}, - {-2.02898459255404e+00, 7.37039893155007e+03, 1.12518527491926e+02}, - {-2.26150236399500e+00, 1.54896745196145e+03, 1.49717357868579e+02}, - {-2.00417668301790e+00, 3.82434760310304e+03, 1.07438913004312e+02}, - {-2.30193040814533e+00, 1.43953696546439e+03, 7.04085275122649e+01}}; - -static const double kNoiseGmmWeights[kNoiseGmmNumMixtures] = { - -1.09422832086193e+01, -1.10847897513425e+01, 
-1.36767587732187e+01, - -1.79789356118641e+01, -1.42830169160894e+01, -1.56500228061379e+01, - -1.83124990950113e+01, -1.69979436177477e+01, -1.12329424387828e+01, - -1.41311785780639e+01, -1.47171861448585e+01, -1.35963362781839e+01}; -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_NOISE_GMM_TABLES_H_ diff --git a/webrtc/modules/audio_processing/agc/voice_gmm_tables.h b/webrtc/modules/audio_processing/agc/voice_gmm_tables.h deleted file mode 100644 index 9a490a47e0..0000000000 --- a/webrtc/modules/audio_processing/agc/voice_gmm_tables.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -// GMM tables for active segments. Generated by MakeGmmTables.m. 
- -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_VOICE_GMM_TABLES_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_VOICE_GMM_TABLES_H_ - -static const int kVoiceGmmNumMixtures = 12; -static const int kVoiceGmmDim = 3; - -static const double kVoiceGmmCovarInverse[kVoiceGmmNumMixtures] - [kVoiceGmmDim][kVoiceGmmDim] = { - {{ 1.83673825579513e+00, -8.09791637570095e-04, 4.60106414365986e-03}, - {-8.09791637570095e-04, 8.89351738394608e-04, -9.80188953277734e-04}, - { 4.60106414365986e-03, -9.80188953277734e-04, 1.38706060206582e-03}}, - {{ 6.76228912850703e+01, -1.98893120119660e-02, -3.53548357253551e-03}, - {-1.98893120119660e-02, 3.96216858500530e-05, -4.08492938394097e-05}, - {-3.53548357253551e-03, -4.08492938394097e-05, 9.31864352856416e-04}}, - {{ 9.98612435944558e+00, -5.27880954316893e-03, -6.30342541619017e-03}, - {-5.27880954316893e-03, 4.54359480225226e-05, 6.30804591626044e-05}, - {-6.30342541619017e-03, 6.30804591626044e-05, 5.36466441382942e-04}}, - {{ 3.39917474216349e+01, -1.56213579433191e-03, -4.01459014990225e-02}, - {-1.56213579433191e-03, 6.40415424897724e-05, 6.20076342427833e-05}, - {-4.01459014990225e-02, 6.20076342427833e-05, 3.51199070103063e-03}}, - {{ 1.34545062271428e+01, -7.94513610147144e-03, -5.34401019341728e-02}, - {-7.94513610147144e-03, 1.16511820098649e-04, 4.66063702069293e-05}, - {-5.34401019341728e-02, 4.66063702069293e-05, 2.72354323774163e-03}}, - {{ 1.08557844314806e+02, -1.54885805673668e-02, -1.88029692674851e-02}, - {-1.54885805673668e-02, 1.16404042786406e-04, 6.45579292702802e-06}, - {-1.88029692674851e-02, 6.45579292702802e-06, 4.32330478391416e-04}}, - {{ 8.22940066541450e+01, -1.15903110231303e-02, -4.92166764865343e-02}, - {-1.15903110231303e-02, 7.42510742165261e-05, 3.73007314191290e-06}, - {-4.92166764865343e-02, 3.73007314191290e-06, 3.64005221593244e-03}}, - {{ 2.31133605685660e+00, -7.83261568950254e-04, 7.45744012346313e-04}, - {-7.83261568950254e-04, 1.29460648214142e-05, -2.22774455093730e-06}, - { 
7.45744012346313e-04, -2.22774455093730e-06, 1.05117294093010e-04}}, - {{ 3.78767849189611e+02, 1.57759761011568e-03, -2.08551217988774e-02}, - { 1.57759761011568e-03, 4.76066236886865e-05, -2.33977412299324e-05}, - {-2.08551217988774e-02, -2.33977412299324e-05, 5.24261005371196e-04}}, - {{ 6.98580096506135e-01, -5.13850255217378e-04, -4.01124551717056e-04}, - {-5.13850255217378e-04, 1.40501021984840e-06, -2.09496928716569e-06}, - {-4.01124551717056e-04, -2.09496928716569e-06, 2.82879357740037e-04}}, - {{ 2.62770945162399e+00, -2.31825753241430e-03, -5.30447217466318e-03}, - {-2.31825753241430e-03, 4.59108572227649e-05, 7.67631886355405e-05}, - {-5.30447217466318e-03, 7.67631886355405e-05, 2.28521601674098e-03}}, - {{ 1.89940391362152e+02, -4.23280856852379e-03, -2.70608873541399e-02}, - {-4.23280856852379e-03, 6.77547582742563e-05, 2.69154203800467e-05}, - {-2.70608873541399e-02, 2.69154203800467e-05, 3.88574543373470e-03}}}; - -static const double kVoiceGmmMean[kVoiceGmmNumMixtures][kVoiceGmmDim] = { - {-2.15020241646536e+00, 4.97079062999877e+02, 4.77078119504505e+02}, - {-8.92097680029190e-01, 5.92064964199921e+02, 1.81045145941059e+02}, - {-1.29435784144398e+00, 4.98450293410611e+02, 1.71991263804064e+02}, - {-1.03925228397884e+00, 4.99511274321571e+02, 1.05838336539105e+02}, - {-1.29229047206129e+00, 4.15026762566707e+02, 1.12861119017125e+02}, - {-7.88748114599810e-01, 4.48739336688113e+02, 1.89784216956337e+02}, - {-8.77777402332642e-01, 4.86620285054533e+02, 1.13477708016491e+02}, - {-2.06465957063057e+00, 6.33385049870607e+02, 2.32758546796149e+02}, - {-6.98893789231685e-01, 5.93622051503385e+02, 1.92536982473203e+02}, - {-2.55901217508894e+00, 1.55914919756205e+03, 1.39769980835570e+02}, - {-1.92070024165837e+00, 4.87983940444185e+02, 1.02745468128289e+02}, - {-7.29187507662854e-01, 5.22717685022855e+02, 1.16377942283991e+02}}; - -static const double kVoiceGmmWeights[kVoiceGmmNumMixtures] = { - -1.39789694361035e+01, -1.19527720202104e+01, 
-1.32396317929055e+01, - -1.09436815209238e+01, -1.13440027478149e+01, -1.12200721834504e+01, - -1.02537324043693e+01, -1.60789861938302e+01, -1.03394494048344e+01, - -1.83207938586818e+01, -1.31186044948288e+01, -9.52479998673554e+00}; -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_VOICE_GMM_TABLES_H_ diff --git a/webrtc/modules/audio_processing/audio_processing.gypi b/webrtc/modules/audio_processing/audio_processing.gypi index f0d56691c0..a9c3ebbd96 100644 --- a/webrtc/modules/audio_processing/audio_processing.gypi +++ b/webrtc/modules/audio_processing/audio_processing.gypi @@ -48,17 +48,9 @@ 'aecm/include/echo_control_mobile.h', 'agc/agc.cc', 'agc/agc.h', - 'agc/agc_audio_proc.cc', - 'agc/agc_audio_proc.h', - 'agc/agc_audio_proc_internal.h', 'agc/agc_manager_direct.cc', 'agc/agc_manager_direct.h', - 'agc/circular_buffer.cc', - 'agc/circular_buffer.h', - 'agc/common.h', 'agc/gain_map_internal.h', - 'agc/gmm.cc', - 'agc/gmm.h', 'agc/histogram.cc', 'agc/histogram.h', 'agc/legacy/analog_agc.c', @@ -66,18 +58,8 @@ 'agc/legacy/digital_agc.c', 'agc/legacy/digital_agc.h', 'agc/legacy/gain_control.h', - 'agc/noise_gmm_tables.h', - 'agc/pitch_based_vad.cc', - 'agc/pitch_based_vad.h', - 'agc/pitch_internal.cc', - 'agc/pitch_internal.h', - 'agc/pole_zero_filter.cc', - 'agc/pole_zero_filter.h', - 'agc/standalone_vad.cc', - 'agc/standalone_vad.h', 'agc/utility.cc', 'agc/utility.h', - 'agc/voice_gmm_tables.h', 'audio_buffer.cc', 'audio_buffer.h', 'audio_processing_impl.cc', @@ -135,6 +117,26 @@ 'utility/delay_estimator_internal.h', 'utility/delay_estimator_wrapper.c', 'utility/delay_estimator_wrapper.h', + 'vad/common.h', + 'vad/gmm.cc', + 'vad/gmm.h', + 'vad/noise_gmm_tables.h', + 'vad/pitch_based_vad.cc', + 'vad/pitch_based_vad.h', + 'vad/pitch_internal.cc', + 'vad/pitch_internal.h', + 'vad/pole_zero_filter.cc', + 'vad/pole_zero_filter.h', + 'vad/standalone_vad.cc', + 'vad/standalone_vad.h', + 'vad/vad_audio_proc.cc', + 'vad/vad_audio_proc.h', + 
'vad/vad_audio_proc_internal.h', + 'vad/vad_circular_buffer.cc', + 'vad/vad_circular_buffer.h', + 'vad/voice_activity_detector.cc', + 'vad/voice_activity_detector.h', + 'vad/voice_gmm_tables.h', 'voice_detection_impl.cc', 'voice_detection_impl.h', ], diff --git a/webrtc/modules/audio_processing/agc/common.h b/webrtc/modules/audio_processing/vad/common.h similarity index 81% rename from webrtc/modules/audio_processing/agc/common.h rename to webrtc/modules/audio_processing/vad/common.h index e9ed1edadd..0772d55489 100644 --- a/webrtc/modules/audio_processing/agc/common.h +++ b/webrtc/modules/audio_processing/vad/common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_COMMON_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_COMMON_H_ +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_COMMON_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_COMMON_H_ static const int kSampleRateHz = 16000; static const int kLength10Ms = kSampleRateHz / 100; @@ -24,4 +24,4 @@ struct AudioFeatures { bool silence; }; -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_COMMON_H_ +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_COMMON_H_ diff --git a/webrtc/modules/audio_processing/agc/gmm.cc b/webrtc/modules/audio_processing/vad/gmm.cc similarity index 81% rename from webrtc/modules/audio_processing/agc/gmm.cc rename to webrtc/modules/audio_processing/vad/gmm.cc index 9ad8ef95ae..9651975913 100644 --- a/webrtc/modules/audio_processing/agc/gmm.cc +++ b/webrtc/modules/audio_processing/vad/gmm.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "webrtc/modules/audio_processing/agc/gmm.h" +#include "webrtc/modules/audio_processing/vad/gmm.h" #include #include @@ -19,13 +19,16 @@ namespace webrtc { static const int kMaxDimension = 10; -static void RemoveMean(const double* in, const double* mean_vec, - int dimension, double* out) { +static void RemoveMean(const double* in, + const double* mean_vec, + int dimension, + double* out) { for (int n = 0; n < dimension; ++n) out[n] = in[n] - mean_vec[n]; } -static double ComputeExponent(const double* in, const double* covar_inv, +static double ComputeExponent(const double* in, + const double* covar_inv, int dimension) { double q = 0; for (int i = 0; i < dimension; ++i) { @@ -50,7 +53,7 @@ double EvaluateGmm(const double* x, const GmmParameters& gmm_parameters) { for (int n = 0; n < gmm_parameters.num_mixtures; n++) { RemoveMean(x, mean_vec, gmm_parameters.dimension, v); double q = ComputeExponent(v, covar_inv, gmm_parameters.dimension) + - gmm_parameters.weight[n]; + gmm_parameters.weight[n]; f += exp(q); mean_vec += gmm_parameters.dimension; covar_inv += gmm_parameters.dimension * gmm_parameters.dimension; diff --git a/webrtc/modules/audio_processing/agc/gmm.h b/webrtc/modules/audio_processing/vad/gmm.h similarity index 91% rename from webrtc/modules/audio_processing/agc/gmm.h rename to webrtc/modules/audio_processing/vad/gmm.h index 90ce95d4dd..9f3e578fef 100644 --- a/webrtc/modules/audio_processing/agc/gmm.h +++ b/webrtc/modules/audio_processing/vad/gmm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_GMM_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_GMM_H_ +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_GMM_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_GMM_H_ namespace webrtc { @@ -42,4 +42,4 @@ struct GmmParameters { double EvaluateGmm(const double* x, const GmmParameters& gmm_parameters); } // namespace webrtc -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_GMM_H_ +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_GMM_H_ diff --git a/webrtc/modules/audio_processing/agc/gmm_unittest.cc b/webrtc/modules/audio_processing/vad/gmm_unittest.cc similarity index 91% rename from webrtc/modules/audio_processing/agc/gmm_unittest.cc rename to webrtc/modules/audio_processing/vad/gmm_unittest.cc index 4ca658d732..f8e1bde776 100644 --- a/webrtc/modules/audio_processing/agc/gmm_unittest.cc +++ b/webrtc/modules/audio_processing/vad/gmm_unittest.cc @@ -8,13 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "webrtc/modules/audio_processing/agc/gmm.h" +#include "webrtc/modules/audio_processing/vad/gmm.h" #include #include "testing/gtest/include/gtest/gtest.h" -#include "webrtc/modules/audio_processing/agc/noise_gmm_tables.h" -#include "webrtc/modules/audio_processing/agc/voice_gmm_tables.h" +#include "webrtc/modules/audio_processing/vad/noise_gmm_tables.h" +#include "webrtc/modules/audio_processing/vad/voice_gmm_tables.h" namespace webrtc { diff --git a/webrtc/modules/audio_processing/vad/noise_gmm_tables.h b/webrtc/modules/audio_processing/vad/noise_gmm_tables.h new file mode 100644 index 0000000000..293af57a2a --- /dev/null +++ b/webrtc/modules/audio_processing/vad/noise_gmm_tables.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// GMM tables for inactive segments. Generated by MakeGmmTables.m. + +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_NOISE_GMM_TABLES_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_NOISE_GMM_TABLES_H_ + +static const int kNoiseGmmNumMixtures = 12; +static const int kNoiseGmmDim = 3; + +static const double + kNoiseGmmCovarInverse[kNoiseGmmNumMixtures][kNoiseGmmDim][kNoiseGmmDim] = { + {{7.36219567592941e+00, 4.83060785179861e-03, 1.23335151497610e-02}, + {4.83060785179861e-03, 1.65289507047817e-04, -2.41490588169997e-04}, + {1.23335151497610e-02, -2.41490588169997e-04, 6.59472060689382e-03}}, + {{8.70265239309140e+00, -5.30636201431086e-04, 5.44014966585347e-03}, + {-5.30636201431086e-04, 3.11095453521008e-04, -1.86287206836035e-04}, + {5.44014966585347e-03, -1.86287206836035e-04, 6.29493388790744e-04}}, + {{4.53467851955055e+00, -3.92977536695197e-03, -2.46521420693317e-03}, + {-3.92977536695197e-03, 4.94650752632750e-05, -1.08587438501826e-05}, + {-2.46521420693317e-03, -1.08587438501826e-05, 9.28793975422261e-05}}, + {{9.26817997114275e-01, -4.03976069276753e-04, -3.56441427392165e-03}, + {-4.03976069276753e-04, 2.51976251631430e-06, 1.46914206734572e-07}, + {-3.56441427392165e-03, 1.46914206734572e-07, 8.19914567685373e-05}}, + {{7.61715986787441e+00, -1.54889041216888e-04, 2.41756280071656e-02}, + {-1.54889041216888e-04, 3.50282550461672e-07, -6.27251196972490e-06}, + {2.41756280071656e-02, -6.27251196972490e-06, 1.45061847649872e-02}}, + {{8.31193642663158e+00, -3.84070508164323e-04, -3.09750630821876e-02}, + {-3.84070508164323e-04, 3.80433432277336e-07, -1.14321142836636e-06}, + {-3.09750630821876e-02, -1.14321142836636e-06, 8.35091486289997e-04}}, + {{9.67283151270894e-01, 5.82465812445039e-05, -3.18350798617053e-03}, + {5.82465812445039e-05, 2.23762672000318e-07, 
-7.74196587408623e-07}, + {-3.18350798617053e-03, -7.74196587408623e-07, 3.85120938338325e-04}}, + {{8.28066236985388e+00, 5.87634508319763e-05, 6.99303090891743e-03}, + {5.87634508319763e-05, 2.93746018618058e-07, 3.40843332882272e-07}, + {6.99303090891743e-03, 3.40843332882272e-07, 1.99379171190344e-04}}, + {{6.07488998675646e+00, -1.11494526618473e-02, 5.10013111123381e-03}, + {-1.11494526618473e-02, 6.99238879921751e-04, 5.36718550370870e-05}, + {5.10013111123381e-03, 5.36718550370870e-05, 5.26909853276753e-04}}, + {{6.90492021419175e+00, 4.20639355257863e-04, -2.38612752336481e-03}, + {4.20639355257863e-04, 3.31246767338153e-06, -2.42052288150859e-08}, + {-2.38612752336481e-03, -2.42052288150859e-08, 4.46608368363412e-04}}, + {{1.31069150869715e+01, -1.73718583865670e-04, -1.97591814508578e-02}, + {-1.73718583865670e-04, 2.80451716300124e-07, 9.96570755379865e-07}, + {-1.97591814508578e-02, 9.96570755379865e-07, 2.41361900868847e-03}}, + {{4.69566344239814e+00, -2.61077567563690e-04, 5.26359000761433e-03}, + {-2.61077567563690e-04, 1.82420859823767e-06, -7.83645887541601e-07}, + {5.26359000761433e-03, -7.83645887541601e-07, 1.33586288288802e-02}}}; + +static const double kNoiseGmmMean[kNoiseGmmNumMixtures][kNoiseGmmDim] = { + {-2.01386094766163e+00, 1.69702162045397e+02, 7.41715804872181e+01}, + {-1.94684591777290e+00, 1.42398396732668e+02, 1.64186321157831e+02}, + {-2.29319297562437e+00, 3.86415425589868e+02, 2.13452215267125e+02}, + {-3.25487177070268e+00, 1.08668712553616e+03, 2.33119949467419e+02}, + {-2.13159632447467e+00, 4.83821702557717e+03, 6.86786166673740e+01}, + {-2.26171410780526e+00, 4.79420193982422e+03, 1.53222513286450e+02}, + {-3.32166740703185e+00, 4.35161135834358e+03, 1.33206448431316e+02}, + {-2.19290322814343e+00, 3.98325506609408e+03, 2.13249167359934e+02}, + {-2.02898459255404e+00, 7.37039893155007e+03, 1.12518527491926e+02}, + {-2.26150236399500e+00, 1.54896745196145e+03, 1.49717357868579e+02}, + {-2.00417668301790e+00, 
3.82434760310304e+03, 1.07438913004312e+02}, + {-2.30193040814533e+00, 1.43953696546439e+03, 7.04085275122649e+01}}; + +static const double kNoiseGmmWeights[kNoiseGmmNumMixtures] = { + -1.09422832086193e+01, + -1.10847897513425e+01, + -1.36767587732187e+01, + -1.79789356118641e+01, + -1.42830169160894e+01, + -1.56500228061379e+01, + -1.83124990950113e+01, + -1.69979436177477e+01, + -1.12329424387828e+01, + -1.41311785780639e+01, + -1.47171861448585e+01, + -1.35963362781839e+01}; +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_NOISE_GMM_TABLES_H_ diff --git a/webrtc/modules/audio_processing/agc/pitch_based_vad.cc b/webrtc/modules/audio_processing/vad/pitch_based_vad.cc similarity index 87% rename from webrtc/modules/audio_processing/agc/pitch_based_vad.cc rename to webrtc/modules/audio_processing/vad/pitch_based_vad.cc index 0cfa52a010..91638d007e 100644 --- a/webrtc/modules/audio_processing/agc/pitch_based_vad.cc +++ b/webrtc/modules/audio_processing/vad/pitch_based_vad.cc @@ -8,16 +8,16 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "webrtc/modules/audio_processing/agc/pitch_based_vad.h" +#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h" #include #include #include -#include "webrtc/modules/audio_processing/agc/circular_buffer.h" -#include "webrtc/modules/audio_processing/agc/common.h" -#include "webrtc/modules/audio_processing/agc/noise_gmm_tables.h" -#include "webrtc/modules/audio_processing/agc/voice_gmm_tables.h" +#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h" +#include "webrtc/modules/audio_processing/vad/common.h" +#include "webrtc/modules/audio_processing/vad/noise_gmm_tables.h" +#include "webrtc/modules/audio_processing/vad/voice_gmm_tables.h" #include "webrtc/modules/interface/module_common_types.h" namespace webrtc { @@ -44,7 +44,7 @@ static double LimitProbability(double p) { PitchBasedVad::PitchBasedVad() : p_prior_(kInitialPriorProbability), - circular_buffer_(AgcCircularBuffer::Create(kPosteriorHistorySize)) { + circular_buffer_(VadCircularBuffer::Create(kPosteriorHistorySize)) { // Setup noise GMM. 
noise_gmm_.dimension = kNoiseGmmDim; noise_gmm_.num_mixtures = kNoiseGmmNumMixtures; @@ -60,7 +60,8 @@ PitchBasedVad::PitchBasedVad() voice_gmm_.covar_inverse = &kVoiceGmmCovarInverse[0][0][0]; } -PitchBasedVad::~PitchBasedVad() {} +PitchBasedVad::~PitchBasedVad() { +} int PitchBasedVad::VoicingProbability(const AudioFeatures& features, double* p_combined) { @@ -90,8 +91,9 @@ int PitchBasedVad::VoicingProbability(const AudioFeatures& features, pdf_features_given_noise = kEps * pdf_features_given_voice; } - p = p_prior_ * pdf_features_given_voice / (pdf_features_given_voice * - p_prior_ + pdf_features_given_noise * (1 - p_prior_)); + p = p_prior_ * pdf_features_given_voice / + (pdf_features_given_voice * p_prior_ + + pdf_features_given_noise * (1 - p_prior_)); p = LimitProbability(p); diff --git a/webrtc/modules/audio_processing/agc/pitch_based_vad.h b/webrtc/modules/audio_processing/vad/pitch_based_vad.h similarity index 80% rename from webrtc/modules/audio_processing/agc/pitch_based_vad.h rename to webrtc/modules/audio_processing/vad/pitch_based_vad.h index 2295505cc3..c502184aea 100644 --- a/webrtc/modules/audio_processing/agc/pitch_based_vad.h +++ b/webrtc/modules/audio_processing/vad/pitch_based_vad.h @@ -8,18 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_PITCH_BASED_VAD_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_PITCH_BASED_VAD_H_ +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_BASED_VAD_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_BASED_VAD_H_ #include "webrtc/base/scoped_ptr.h" -#include "webrtc/modules/audio_processing/agc/common.h" -#include "webrtc/modules/audio_processing/agc/gmm.h" +#include "webrtc/modules/audio_processing/vad/common.h" +#include "webrtc/modules/audio_processing/vad/gmm.h" #include "webrtc/typedefs.h" namespace webrtc { class AudioFrame; -class AgcCircularBuffer; +class VadCircularBuffer; // Computes the probability of the input audio frame to be active given // the corresponding pitch-gain and lag of the frame. @@ -37,6 +37,7 @@ class PitchBasedVad { // then, computes the voicing probabilities and combine them // with the given values. The result are returned in |p|. int VoicingProbability(const AudioFeatures& features, double* p_combined); + private: int UpdatePrior(double p); @@ -49,8 +50,8 @@ class PitchBasedVad { double p_prior_; - rtc::scoped_ptr circular_buffer_; + rtc::scoped_ptr circular_buffer_; }; } // namespace webrtc -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_PITCH_BASED_VAD_H_ +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_BASED_VAD_H_ diff --git a/webrtc/modules/audio_processing/agc/pitch_based_vad_unittest.cc b/webrtc/modules/audio_processing/vad/pitch_based_vad_unittest.cc similarity index 73% rename from webrtc/modules/audio_processing/agc/pitch_based_vad_unittest.cc rename to webrtc/modules/audio_processing/vad/pitch_based_vad_unittest.cc index 3ec0baac95..04ddcab5cb 100644 --- a/webrtc/modules/audio_processing/agc/pitch_based_vad_unittest.cc +++ b/webrtc/modules/audio_processing/vad/pitch_based_vad_unittest.cc @@ -8,20 +8,21 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "webrtc/modules/audio_processing/agc/pitch_based_vad.h" +#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h" #include #include -#include -#include "gtest/gtest.h" +#include + +#include "testing/gtest/include/gtest/gtest.h" #include "webrtc/test/testsupport/fileutils.h" namespace webrtc { TEST(PitchBasedVadTest, VoicingProbabilityTest) { - std::string spectral_peak_file_name = test::ResourcePath( - "audio_processing/agc/agc_spectral_peak", "dat"); + std::string spectral_peak_file_name = + test::ResourcePath("audio_processing/agc/agc_spectral_peak", "dat"); FILE* spectral_peak_file = fopen(spectral_peak_file_name.c_str(), "rb"); ASSERT_TRUE(spectral_peak_file != NULL); @@ -51,12 +52,15 @@ TEST(PitchBasedVadTest, VoicingProbabilityTest) { sizeof(audio_features.spectral_peak[0]), 1, spectral_peak_file) == 1u) { double p; - ASSERT_EQ(1u, fread(audio_features.log_pitch_gain, sizeof( - audio_features.log_pitch_gain[0]), 1, pitch_gain_file)); - ASSERT_EQ(1u, fread(audio_features.pitch_lag_hz, sizeof( - audio_features.pitch_lag_hz[0]), 1, pitch_lag_file)); - ASSERT_EQ(1u, fread(&reference_activity_probability, sizeof( - reference_activity_probability), 1, voicing_prob_file)); + ASSERT_EQ(1u, fread(audio_features.log_pitch_gain, + sizeof(audio_features.log_pitch_gain[0]), 1, + pitch_gain_file)); + ASSERT_EQ(1u, + fread(audio_features.pitch_lag_hz, + sizeof(audio_features.pitch_lag_hz[0]), 1, pitch_lag_file)); + ASSERT_EQ(1u, fread(&reference_activity_probability, + sizeof(reference_activity_probability), 1, + voicing_prob_file)); p = 0.5; // Initialize to the neutral value for combining probabilities. 
EXPECT_EQ(0, vad_.VoicingProbability(audio_features, &p)); diff --git a/webrtc/modules/audio_processing/agc/pitch_internal.cc b/webrtc/modules/audio_processing/vad/pitch_internal.cc similarity index 96% rename from webrtc/modules/audio_processing/agc/pitch_internal.cc rename to webrtc/modules/audio_processing/vad/pitch_internal.cc index b394074bd3..309b45acf5 100644 --- a/webrtc/modules/audio_processing/agc/pitch_internal.cc +++ b/webrtc/modules/audio_processing/vad/pitch_internal.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "webrtc/modules/audio_processing/agc/pitch_internal.h" +#include "webrtc/modules/audio_processing/vad/pitch_internal.h" #include @@ -25,7 +25,6 @@ static void PitchInterpolation(double old_val, const double* in, double* out) { out[2] = 0.5 * in[2] + 0.5 * in[3]; } - void GetSubframesPitchParameters(int sampling_rate_hz, double* gains, double* lags, diff --git a/webrtc/modules/audio_processing/agc/pitch_internal.h b/webrtc/modules/audio_processing/vad/pitch_internal.h similarity index 84% rename from webrtc/modules/audio_processing/agc/pitch_internal.h rename to webrtc/modules/audio_processing/vad/pitch_internal.h index ed73760e3a..b25b1a82a2 100644 --- a/webrtc/modules/audio_processing/agc/pitch_internal.h +++ b/webrtc/modules/audio_processing/vad/pitch_internal.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_PITCH_INTERNAL_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_PITCH_INTERNAL_H_ +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_INTERNAL_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_INTERNAL_H_ // TODO(turajs): Write a description of this function. Also be consistent with // usage of |sampling_rate_hz| vs |kSamplingFreqHz|. 
@@ -23,4 +23,4 @@ void GetSubframesPitchParameters(int sampling_rate_hz, double* log_pitch_gain, double* pitch_lag_hz); -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_PITCH_INTERNAL_H_ +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_PITCH_INTERNAL_H_ diff --git a/webrtc/modules/audio_processing/agc/pitch_internal_unittest.cc b/webrtc/modules/audio_processing/vad/pitch_internal_unittest.cc similarity index 82% rename from webrtc/modules/audio_processing/agc/pitch_internal_unittest.cc rename to webrtc/modules/audio_processing/vad/pitch_internal_unittest.cc index 8998f9014b..8b5959d03e 100644 --- a/webrtc/modules/audio_processing/agc/pitch_internal_unittest.cc +++ b/webrtc/modules/audio_processing/vad/pitch_internal_unittest.cc @@ -8,11 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "webrtc/modules/audio_processing/agc/pitch_internal.h" +#include "webrtc/modules/audio_processing/vad/pitch_internal.h" #include -#include "gtest/gtest.h" +#include "testing/gtest/include/gtest/gtest.h" TEST(PitchInternalTest, test) { const int kSamplingRateHz = 8000; @@ -26,12 +26,12 @@ TEST(PitchInternalTest, test) { double lags[] = {90, 111, 122, 50}; // Expected outputs - double expected_log_pitch_gain[] = {-0.541212549898316, -1.45672279045507, - -0.80471895621705}; + double expected_log_pitch_gain[] = { + -0.541212549898316, -1.45672279045507, -0.80471895621705}; double expected_log_old_gain = log(gains[kNumInputParameters - 1]); - double expected_pitch_lag_hz[] = {92.3076923076923, 70.9010339734121, - 93.0232558139535}; + double expected_pitch_lag_hz[] = { + 92.3076923076923, 70.9010339734121, 93.0232558139535}; double expected_old_lag = lags[kNumInputParameters - 1]; double log_pitch_gain[kNumOutputParameters]; diff --git a/webrtc/modules/audio_processing/agc/pole_zero_filter.cc b/webrtc/modules/audio_processing/vad/pole_zero_filter.cc similarity index 83% rename from webrtc/modules/audio_processing/agc/pole_zero_filter.cc rename to 
webrtc/modules/audio_processing/vad/pole_zero_filter.cc index 3c41e33dd6..84d0739d8c 100644 --- a/webrtc/modules/audio_processing/agc/pole_zero_filter.cc +++ b/webrtc/modules/audio_processing/vad/pole_zero_filter.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "webrtc/modules/audio_processing/agc/pole_zero_filter.h" +#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h" #include #include @@ -20,13 +20,10 @@ PoleZeroFilter* PoleZeroFilter::Create(const float* numerator_coefficients, int order_numerator, const float* denominator_coefficients, int order_denominator) { - if (order_numerator < 0 || - order_denominator < 0 || + if (order_numerator < 0 || order_denominator < 0 || order_numerator > kMaxFilterOrder || - order_denominator > kMaxFilterOrder || - denominator_coefficients[0] == 0 || - numerator_coefficients == NULL || - denominator_coefficients == NULL) + order_denominator > kMaxFilterOrder || denominator_coefficients[0] == 0 || + numerator_coefficients == NULL || denominator_coefficients == NULL) return NULL; return new PoleZeroFilter(numerator_coefficients, order_numerator, denominator_coefficients, order_denominator); @@ -57,8 +54,7 @@ PoleZeroFilter::PoleZeroFilter(const float* numerator_coefficients, } template -static float FilterArPast(const T* past, int order, - const float* coefficients) { +static float FilterArPast(const T* past, int order, const float* coefficients) { float sum = 0.0f; int past_index = order - 1; for (int k = 1; k <= order; k++, past_index--) @@ -87,8 +83,8 @@ int PoleZeroFilter::Filter(const int16_t* in, if (highest_order_ < num_input_samples) { for (int m = 0; n < num_input_samples; n++, m++) { output[n] = in[n] * numerator_coefficients_[0]; - output[n] += FilterArPast(&in[m], order_numerator_, - numerator_coefficients_); + output[n] += + FilterArPast(&in[m], order_numerator_, numerator_coefficients_); output[n] -= FilterArPast(&output[m], order_denominator_, 
denominator_coefficients_); } @@ -99,13 +95,12 @@ int PoleZeroFilter::Filter(const int16_t* in, sizeof(output[0]) * order_denominator_); } else { // Odd case that the length of the input is shorter that filter order. - memmove(past_input_, &past_input_[num_input_samples], order_numerator_ * - sizeof(past_input_[0])); - memmove(past_output_, &past_output_[num_input_samples], order_denominator_ * - sizeof(past_output_[0])); + memmove(past_input_, &past_input_[num_input_samples], + order_numerator_ * sizeof(past_input_[0])); + memmove(past_output_, &past_output_[num_input_samples], + order_denominator_ * sizeof(past_output_[0])); } return 0; } } // namespace webrtc - diff --git a/webrtc/modules/audio_processing/agc/pole_zero_filter.h b/webrtc/modules/audio_processing/vad/pole_zero_filter.h similarity index 87% rename from webrtc/modules/audio_processing/agc/pole_zero_filter.h rename to webrtc/modules/audio_processing/vad/pole_zero_filter.h index c9d96fdd42..038d801a1b 100644 --- a/webrtc/modules/audio_processing/agc/pole_zero_filter.h +++ b/webrtc/modules/audio_processing/vad/pole_zero_filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_POLE_ZERO_FILTER_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_POLE_ZERO_FILTER_H_ +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_POLE_ZERO_FILTER_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_POLE_ZERO_FILTER_H_ #include "webrtc/typedefs.h" @@ -47,4 +47,4 @@ class PoleZeroFilter { } // namespace webrtc -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_POLE_ZERO_FILTER_H_ +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_POLE_ZERO_FILTER_H_ diff --git a/webrtc/modules/audio_processing/agc/pole_zero_filter_unittest.cc b/webrtc/modules/audio_processing/vad/pole_zero_filter_unittest.cc similarity index 50% rename from webrtc/modules/audio_processing/agc/pole_zero_filter_unittest.cc rename to webrtc/modules/audio_processing/vad/pole_zero_filter_unittest.cc index b198b0eed1..492c3f0c94 100644 --- a/webrtc/modules/audio_processing/agc/pole_zero_filter_unittest.cc +++ b/webrtc/modules/audio_processing/vad/pole_zero_filter_unittest.cc @@ -8,44 +8,49 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "webrtc/modules/audio_processing/agc/pole_zero_filter.h" +#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h" #include #include -#include "gtest/gtest.h" +#include "testing/gtest/include/gtest/gtest.h" #include "webrtc/base/scoped_ptr.h" -#include "webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h" +#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h" #include "webrtc/test/testsupport/fileutils.h" namespace webrtc { static const int kInputSamples = 50; -static const int16_t kInput[kInputSamples] = {-2136, -7116, 10715, 2464, 3164, - 8139, 11393, 24013, -32117, -5544, -27740, 10181, 14190, -24055, -15912, - 17393, 6359, -9950, -13894, 32432, -23944, 3437, -8381, 19768, 3087, -19795, - -5920, 13310, 1407, 3876, 4059, 3524, -23130, 19121, -27900, -24840, 4089, - 21422, -3625, 3015, -11236, 28856, 13424, 6571, -19761, -6361, 15821, -9469, - 29727, 32229}; +static const int16_t kInput[kInputSamples] = { + -2136, -7116, 10715, 2464, 3164, 8139, 11393, 24013, -32117, -5544, + -27740, 10181, 14190, -24055, -15912, 17393, 6359, -9950, -13894, 32432, + -23944, 3437, -8381, 19768, 3087, -19795, -5920, 13310, 1407, 3876, + 4059, 3524, -23130, 19121, -27900, -24840, 4089, 21422, -3625, 3015, + -11236, 28856, 13424, 6571, -19761, -6361, 15821, -9469, 29727, 32229}; -static const float kReferenceOutput[kInputSamples] = {-2082.230472f, - -6878.572941f, 10697.090871f, 2358.373952f, 2973.936512f, 7738.580650f, - 10690.803213f, 22687.091576f, -32676.684717f, -5879.621684f, -27359.297432f, - 10368.735888f, 13994.584604f, -23676.126249f, -15078.250390f, 17818.253338f, - 6577.743123f, -9498.369315f, -13073.651079f, 32460.026588f, -23391.849347f, - 3953.805667f, -7667.761363f, 19995.153447f, 3185.575477f, -19207.365160f, - -5143.103201f, 13756.317237f, 1779.654794f, 4142.269755f, 4209.475034f, - 3572.991789f, -22509.089546f, 19307.878964f, -27060.439759f, -23319.042810f, - 5547.685267f, 22312.718676f, -2707.309027f, 3852.358490f, 
-10135.510093f, - 29241.509970f, 13394.397233f, 6340.721417f, -19510.207905f, -5908.442086f, - 15882.301634f, -9211.335255f, 29253.056735f, 30874.443046f}; +static const float kReferenceOutput[kInputSamples] = { + -2082.230472f, -6878.572941f, 10697.090871f, 2358.373952f, + 2973.936512f, 7738.580650f, 10690.803213f, 22687.091576f, + -32676.684717f, -5879.621684f, -27359.297432f, 10368.735888f, + 13994.584604f, -23676.126249f, -15078.250390f, 17818.253338f, + 6577.743123f, -9498.369315f, -13073.651079f, 32460.026588f, + -23391.849347f, 3953.805667f, -7667.761363f, 19995.153447f, + 3185.575477f, -19207.365160f, -5143.103201f, 13756.317237f, + 1779.654794f, 4142.269755f, 4209.475034f, 3572.991789f, + -22509.089546f, 19307.878964f, -27060.439759f, -23319.042810f, + 5547.685267f, 22312.718676f, -2707.309027f, 3852.358490f, + -10135.510093f, 29241.509970f, 13394.397233f, 6340.721417f, + -19510.207905f, -5908.442086f, 15882.301634f, -9211.335255f, + 29253.056735f, 30874.443046f}; class PoleZeroFilterTest : public ::testing::Test { protected: PoleZeroFilterTest() - : my_filter_(PoleZeroFilter::Create( - kCoeffNumerator, kFilterOrder, kCoeffDenominator, kFilterOrder)) {} + : my_filter_(PoleZeroFilter::Create(kCoeffNumerator, + kFilterOrder, + kCoeffDenominator, + kFilterOrder)) {} ~PoleZeroFilterTest() {} diff --git a/webrtc/modules/audio_processing/agc/standalone_vad.cc b/webrtc/modules/audio_processing/vad/standalone_vad.cc similarity index 93% rename from webrtc/modules/audio_processing/agc/standalone_vad.cc rename to webrtc/modules/audio_processing/vad/standalone_vad.cc index e859325454..783785184d 100644 --- a/webrtc/modules/audio_processing/agc/standalone_vad.cc +++ b/webrtc/modules/audio_processing/vad/standalone_vad.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "webrtc/modules/audio_processing/agc/standalone_vad.h" +#include "webrtc/modules/audio_processing/vad/standalone_vad.h" #include @@ -21,10 +21,8 @@ namespace webrtc { static const int kDefaultStandaloneVadMode = 3; StandaloneVad::StandaloneVad(VadInst* vad) - : vad_(vad), - buffer_(), - index_(0), - mode_(kDefaultStandaloneVadMode) {} + : vad_(vad), buffer_(), index_(0), mode_(kDefaultStandaloneVadMode) { +} StandaloneVad::~StandaloneVad() { WebRtcVad_Free(vad_); @@ -93,4 +91,3 @@ int StandaloneVad::set_mode(int mode) { } } // namespace webrtc - diff --git a/webrtc/modules/audio_processing/agc/standalone_vad.h b/webrtc/modules/audio_processing/vad/standalone_vad.h similarity index 97% rename from webrtc/modules/audio_processing/agc/standalone_vad.h rename to webrtc/modules/audio_processing/vad/standalone_vad.h index 3cace01286..4017a72c60 100644 --- a/webrtc/modules/audio_processing/agc/standalone_vad.h +++ b/webrtc/modules/audio_processing/vad/standalone_vad.h @@ -12,8 +12,8 @@ #define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_STANDALONE_VAD_H_ #include "webrtc/base/scoped_ptr.h" +#include "webrtc/modules/audio_processing/vad/common.h" #include "webrtc/common_audio/vad/include/webrtc_vad.h" -#include "webrtc/modules/audio_processing/agc/common.h" #include "webrtc/typedefs.h" namespace webrtc { diff --git a/webrtc/modules/audio_processing/agc/standalone_vad_unittest.cc b/webrtc/modules/audio_processing/vad/standalone_vad_unittest.cc similarity index 94% rename from webrtc/modules/audio_processing/agc/standalone_vad_unittest.cc rename to webrtc/modules/audio_processing/vad/standalone_vad_unittest.cc index a8caaae9c5..404a66f303 100644 --- a/webrtc/modules/audio_processing/agc/standalone_vad_unittest.cc +++ b/webrtc/modules/audio_processing/vad/standalone_vad_unittest.cc @@ -8,11 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "webrtc/modules/audio_processing/agc/standalone_vad.h" +#include "webrtc/modules/audio_processing/vad/standalone_vad.h" #include -#include "gtest/gtest.h" +#include "testing/gtest/include/gtest/gtest.h" #include "webrtc/base/scoped_ptr.h" #include "webrtc/modules/interface/module_common_types.h" #include "webrtc/test/testsupport/fileutils.h" @@ -22,7 +22,7 @@ namespace webrtc { TEST(StandaloneVadTest, Api) { rtc::scoped_ptr vad(StandaloneVad::Create()); - int16_t data[kLength10Ms] = { 0 }; + int16_t data[kLength10Ms] = {0}; // Valid frame length (for 32 kHz rate), but not what the VAD is expecting. EXPECT_EQ(-1, vad->AddAudio(data, 320)); @@ -58,7 +58,7 @@ TEST(StandaloneVadTest, Api) { TEST(StandaloneVadTest, DISABLED_ON_IOS(ActivityDetection)) { rtc::scoped_ptr vad(StandaloneVad::Create()); const size_t kDataLength = kLength10Ms; - int16_t data[kDataLength] = { 0 }; + int16_t data[kDataLength] = {0}; FILE* pcm_file = fopen(test::ResourcePath("audio_processing/agc/agc_audio", "pcm").c_str(), @@ -101,4 +101,4 @@ TEST(StandaloneVadTest, DISABLED_ON_IOS(ActivityDetection)) { fclose(reference_file); fclose(pcm_file); } -} +} // namespace webrtc diff --git a/webrtc/modules/audio_processing/agc/agc_audio_proc.cc b/webrtc/modules/audio_processing/vad/vad_audio_proc.cc similarity index 78% rename from webrtc/modules/audio_processing/agc/agc_audio_proc.cc rename to webrtc/modules/audio_processing/vad/vad_audio_proc.cc index dc4a5a711c..e8f27f802d 100644 --- a/webrtc/modules/audio_processing/agc/agc_audio_proc.cc +++ b/webrtc/modules/audio_processing/vad/vad_audio_proc.cc @@ -8,15 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "webrtc/modules/audio_processing/agc/agc_audio_proc.h" +#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" #include #include #include "webrtc/common_audio/fft4g.h" -#include "webrtc/modules/audio_processing/agc/agc_audio_proc_internal.h" -#include "webrtc/modules/audio_processing/agc/pitch_internal.h" -#include "webrtc/modules/audio_processing/agc/pole_zero_filter.h" +#include "webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h" +#include "webrtc/modules/audio_processing/vad/pitch_internal.h" +#include "webrtc/modules/audio_processing/vad/pole_zero_filter.h" extern "C" { #include "webrtc/modules/audio_coding/codecs/isac/main/source/codec.h" #include "webrtc/modules/audio_coding/codecs/isac/main/source/lpc_analysis.h" @@ -29,23 +29,25 @@ namespace webrtc { // The following structures are declared anonymous in iSAC's structs.h. To // forward declare them, we use this derived class trick. -struct AgcAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {}; -struct AgcAudioProc::PreFiltBankstr : public ::PreFiltBankstr {}; +struct VadAudioProc::PitchAnalysisStruct : public ::PitchAnalysisStruct {}; +struct VadAudioProc::PreFiltBankstr : public ::PreFiltBankstr {}; -static const float kFrequencyResolution = kSampleRateHz / - static_cast(AgcAudioProc::kDftSize); +static const float kFrequencyResolution = + kSampleRateHz / static_cast(VadAudioProc::kDftSize); static const int kSilenceRms = 5; -// TODO(turajs): Make a Create or Init for AgcAudioProc. -AgcAudioProc::AgcAudioProc() +// TODO(turajs): Make a Create or Init for VadAudioProc. +VadAudioProc::VadAudioProc() : audio_buffer_(), num_buffer_samples_(kNumPastSignalSamples), log_old_gain_(-2), old_lag_(50), // Arbitrary but valid as pitch-lag (in samples). 
pitch_analysis_handle_(new PitchAnalysisStruct), pre_filter_handle_(new PreFiltBankstr), - high_pass_filter_(PoleZeroFilter::Create( - kCoeffNumerator, kFilterOrder, kCoeffDenominator, kFilterOrder)) { + high_pass_filter_(PoleZeroFilter::Create(kCoeffNumerator, + kFilterOrder, + kCoeffDenominator, + kFilterOrder)) { static_assert(kNumPastSignalSamples + kNumSubframeSamples == sizeof(kLpcAnalWin) / sizeof(kLpcAnalWin[0]), "lpc analysis window incorrect size"); @@ -64,15 +66,16 @@ AgcAudioProc::AgcAudioProc() WebRtcIsac_InitPitchAnalysis(pitch_analysis_handle_.get()); } -AgcAudioProc::~AgcAudioProc() {} +VadAudioProc::~VadAudioProc() { +} -void AgcAudioProc::ResetBuffer() { +void VadAudioProc::ResetBuffer() { memcpy(audio_buffer_, &audio_buffer_[kNumSamplesToProcess], sizeof(audio_buffer_[0]) * kNumPastSignalSamples); num_buffer_samples_ = kNumPastSignalSamples; } -int AgcAudioProc::ExtractFeatures(const int16_t* frame, +int VadAudioProc::ExtractFeatures(const int16_t* frame, int length, AudioFeatures* features) { features->num_frames = 0; @@ -85,7 +88,7 @@ int AgcAudioProc::ExtractFeatures(const int16_t* frame, // classification. if (high_pass_filter_->Filter(frame, kNumSubframeSamples, &audio_buffer_[num_buffer_samples_]) != 0) { - return -1; + return -1; } num_buffer_samples_ += kNumSubframeSamples; @@ -115,7 +118,8 @@ int AgcAudioProc::ExtractFeatures(const int16_t* frame, } // Computes |kLpcOrder + 1| correlation coefficients. 
-void AgcAudioProc::SubframeCorrelation(double* corr, int length_corr, +void VadAudioProc::SubframeCorrelation(double* corr, + int length_corr, int subframe_index) { assert(length_corr >= kLpcOrder + 1); double windowed_audio[kNumSubframeSamples + kNumPastSignalSamples]; @@ -124,20 +128,20 @@ void AgcAudioProc::SubframeCorrelation(double* corr, int length_corr, for (int n = 0; n < kNumSubframeSamples + kNumPastSignalSamples; n++) windowed_audio[n] = audio_buffer_[buffer_index++] * kLpcAnalWin[n]; - WebRtcIsac_AutoCorr(corr, windowed_audio, kNumSubframeSamples + - kNumPastSignalSamples, kLpcOrder); + WebRtcIsac_AutoCorr(corr, windowed_audio, + kNumSubframeSamples + kNumPastSignalSamples, kLpcOrder); } // Compute |kNum10msSubframes| sets of LPC coefficients, one per 10 ms input. // The analysis window is 15 ms long and it is centered on the first half of // each 10ms sub-frame. This is equivalent to computing LPC coefficients for the // first half of each 10 ms subframe. -void AgcAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) { +void VadAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) { assert(length_lpc >= kNum10msSubframes * (kLpcOrder + 1)); double corr[kLpcOrder + 1]; double reflec_coeff[kLpcOrder]; for (int i = 0, offset_lpc = 0; i < kNum10msSubframes; - i++, offset_lpc += kLpcOrder + 1) { + i++, offset_lpc += kLpcOrder + 1) { SubframeCorrelation(corr, kLpcOrder + 1, i); corr[0] *= 1.0001; // This makes Lev-Durb a bit more stable. @@ -150,7 +154,8 @@ void AgcAudioProc::GetLpcPolynomials(double* lpc, int length_lpc) { // Fit a second order curve to these 3 points and find the location of the // extremum. The points are inverted before curve fitting. -static float QuadraticInterpolation(float prev_val, float curr_val, +static float QuadraticInterpolation(float prev_val, + float curr_val, float next_val) { // Doing the interpolation in |1 / A(z)|^2. 
float fractional_index = 0; @@ -158,8 +163,8 @@ static float QuadraticInterpolation(float prev_val, float curr_val, prev_val = 1.0f / prev_val; curr_val = 1.0f / curr_val; - fractional_index = -(next_val - prev_val) * 0.5f / (next_val + prev_val - - 2.f * curr_val); + fractional_index = + -(next_val - prev_val) * 0.5f / (next_val + prev_val - 2.f * curr_val); assert(fabs(fractional_index) < 1); return fractional_index; } @@ -169,7 +174,7 @@ static float QuadraticInterpolation(float prev_val, float curr_val, // with the local minimum of A(z). It saves complexity, as we save one // inversion. Furthermore, we find the first local maximum of magnitude squared, // to save on one square root. -void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) { +void VadAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) { assert(length_f_peak >= kNum10msSubframes); double lpc[kNum10msSubframes * (kLpcOrder + 1)]; // For all sub-frames. @@ -193,8 +198,8 @@ void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) { float next_magn_sqr; bool found_peak = false; for (int n = 2; n < kNumDftCoefficients - 1; n++) { - next_magn_sqr = data[2 * n] * data[2 * n] + - data[2 * n + 1] * data[2 * n + 1]; + next_magn_sqr = + data[2 * n] * data[2 * n] + data[2 * n + 1] * data[2 * n + 1]; if (curr_magn_sqr < prev_magn_sqr && curr_magn_sqr < next_magn_sqr) { found_peak = true; index_peak = n - 1; @@ -213,15 +218,16 @@ void AgcAudioProc::FindFirstSpectralPeaks(double* f_peak, int length_f_peak) { } else { // A peak is found, do a simple quadratic interpolation to get a more // accurate estimate of the peak location. - fractional_index = QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, - next_magn_sqr); + fractional_index = + QuadraticInterpolation(prev_magn_sqr, curr_magn_sqr, next_magn_sqr); } f_peak[i] = (index_peak + fractional_index) * kFrequencyResolution; } } // Using iSAC functions to estimate pitch gains & lags. 
-void AgcAudioProc::PitchAnalysis(double* log_pitch_gains, double* pitch_lags_hz, +void VadAudioProc::PitchAnalysis(double* log_pitch_gains, + double* pitch_lags_hz, int length) { // TODO(turajs): This can be "imported" from iSAC & and the next two // constants. @@ -241,28 +247,27 @@ void AgcAudioProc::PitchAnalysis(double* log_pitch_gains, double* pitch_lags_hz, kNumLookaheadSamples]; // Split signal to lower and upper bands - WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], - lower, upper, lower_lookahead, upper_lookahead, + WebRtcIsac_SplitAndFilterFloat(&audio_buffer_[kNumPastSignalSamples], lower, + upper, lower_lookahead, upper_lookahead, pre_filter_handle_.get()); WebRtcIsac_PitchAnalysis(lower_lookahead, lower_lookahead_pre_filter, pitch_analysis_handle_.get(), lags, gains); // Lags are computed on lower-band signal with sampling rate half of the // input signal. - GetSubframesPitchParameters(kSampleRateHz / 2, gains, lags, - kNumPitchSubframes, kNum10msSubframes, - &log_old_gain_, &old_lag_, - log_pitch_gains, pitch_lags_hz); + GetSubframesPitchParameters( + kSampleRateHz / 2, gains, lags, kNumPitchSubframes, kNum10msSubframes, + &log_old_gain_, &old_lag_, log_pitch_gains, pitch_lags_hz); } -void AgcAudioProc::Rms(double* rms, int length_rms) { +void VadAudioProc::Rms(double* rms, int length_rms) { assert(length_rms >= kNum10msSubframes); int offset = kNumPastSignalSamples; for (int i = 0; i < kNum10msSubframes; i++) { rms[i] = 0; for (int n = 0; n < kNumSubframeSamples; n++, offset++) rms[i] += audio_buffer_[offset] * audio_buffer_[offset]; - rms[i] = sqrt(rms[i] / kNumSubframeSamples); + rms[i] = sqrt(rms[i] / kNumSubframeSamples); } } diff --git a/webrtc/modules/audio_processing/agc/agc_audio_proc.h b/webrtc/modules/audio_processing/vad/vad_audio_proc.h similarity index 87% rename from webrtc/modules/audio_processing/agc/agc_audio_proc.h rename to webrtc/modules/audio_processing/vad/vad_audio_proc.h index e5eb390170..6cf3937f79 
100644 --- a/webrtc/modules/audio_processing/agc/agc_audio_proc.h +++ b/webrtc/modules/audio_processing/vad/vad_audio_proc.h @@ -8,11 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_AGC_AUDIO_PROC_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_AGC_AUDIO_PROC_H_ +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_H_ #include "webrtc/base/scoped_ptr.h" -#include "webrtc/modules/audio_processing/agc/common.h" +#include "webrtc/modules/audio_processing/vad/common.h" #include "webrtc/typedefs.h" namespace webrtc { @@ -20,14 +20,14 @@ namespace webrtc { class AudioFrame; class PoleZeroFilter; -class AgcAudioProc { +class VadAudioProc { public: // Forward declare iSAC structs. struct PitchAnalysisStruct; struct PreFiltBankstr; - AgcAudioProc(); - ~AgcAudioProc(); + VadAudioProc(); + ~VadAudioProc(); int ExtractFeatures(const int16_t* audio_frame, int length, @@ -55,7 +55,8 @@ class AgcAudioProc { static const int kNum10msSubframes = 3; static const int kNumSubframeSamples = kSampleRateHz / 100; - static const int kNumSamplesToProcess = kNum10msSubframes * + static const int kNumSamplesToProcess = + kNum10msSubframes * kNumSubframeSamples; // Samples in 30 ms @ given sampling rate. static const int kBufferLength = kNumPastSignalSamples + kNumSamplesToProcess; static const int kIpLength = kDftSize >> 1; @@ -80,4 +81,4 @@ class AgcAudioProc { } // namespace webrtc -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_AGC_AUDIO_PROC_H_ +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_H_ diff --git a/webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h b/webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h new file mode 100644 index 0000000000..4486879df4 --- /dev/null +++ b/webrtc/modules/audio_processing/vad/vad_audio_proc_internal.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_INTERNAL_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_INTERNAL_H_ + +namespace webrtc { + +// These values should match MATLAB counterparts for unit-tests to pass. +static const double kCorrWeight[] = {1.000000, + 0.985000, + 0.970225, + 0.955672, + 0.941337, + 0.927217, + 0.913308, + 0.899609, + 0.886115, + 0.872823, + 0.859730, + 0.846834, + 0.834132, + 0.821620, + 0.809296, + 0.797156, + 0.785199}; + +static const double kLpcAnalWin[] = { + 0.00000000, 0.01314436, 0.02628645, 0.03942400, 0.05255473, 0.06567639, + 0.07878670, 0.09188339, 0.10496421, 0.11802689, 0.13106918, 0.14408883, + 0.15708358, 0.17005118, 0.18298941, 0.19589602, 0.20876878, 0.22160547, + 0.23440387, 0.24716177, 0.25987696, 0.27254725, 0.28517045, 0.29774438, + 0.31026687, 0.32273574, 0.33514885, 0.34750406, 0.35979922, 0.37203222, + 0.38420093, 0.39630327, 0.40833713, 0.42030043, 0.43219112, 0.44400713, + 0.45574642, 0.46740697, 0.47898676, 0.49048379, 0.50189608, 0.51322164, + 0.52445853, 0.53560481, 0.54665854, 0.55761782, 0.56848075, 0.57924546, + 0.58991008, 0.60047278, 0.61093173, 0.62128512, 0.63153117, 0.64166810, + 0.65169416, 0.66160761, 0.67140676, 0.68108990, 0.69065536, 0.70010148, + 0.70942664, 0.71862923, 0.72770765, 0.73666033, 0.74548573, 0.75418233, + 0.76274862, 0.77118312, 0.77948437, 0.78765094, 0.79568142, 0.80357442, + 0.81132858, 0.81894256, 0.82641504, 0.83374472, 0.84093036, 0.84797069, + 0.85486451, 0.86161063, 0.86820787, 0.87465511, 0.88095122, 0.88709512, + 0.89308574, 0.89892206, 0.90460306, 0.91012776, 0.91549520, 0.92070447, + 
0.92575465, 0.93064488, 0.93537432, 0.93994213, 0.94434755, 0.94858979, + 0.95266814, 0.95658189, 0.96033035, 0.96391289, 0.96732888, 0.97057773, + 0.97365889, 0.97657181, 0.97931600, 0.98189099, 0.98429632, 0.98653158, + 0.98859639, 0.99049038, 0.99221324, 0.99376466, 0.99514438, 0.99635215, + 0.99738778, 0.99825107, 0.99894188, 0.99946010, 0.99980562, 0.99997840, + 0.99997840, 0.99980562, 0.99946010, 0.99894188, 0.99825107, 0.99738778, + 0.99635215, 0.99514438, 0.99376466, 0.99221324, 0.99049038, 0.98859639, + 0.98653158, 0.98429632, 0.98189099, 0.97931600, 0.97657181, 0.97365889, + 0.97057773, 0.96732888, 0.96391289, 0.96033035, 0.95658189, 0.95266814, + 0.94858979, 0.94434755, 0.93994213, 0.93537432, 0.93064488, 0.92575465, + 0.92070447, 0.91549520, 0.91012776, 0.90460306, 0.89892206, 0.89308574, + 0.88709512, 0.88095122, 0.87465511, 0.86820787, 0.86161063, 0.85486451, + 0.84797069, 0.84093036, 0.83374472, 0.82641504, 0.81894256, 0.81132858, + 0.80357442, 0.79568142, 0.78765094, 0.77948437, 0.77118312, 0.76274862, + 0.75418233, 0.74548573, 0.73666033, 0.72770765, 0.71862923, 0.70942664, + 0.70010148, 0.69065536, 0.68108990, 0.67140676, 0.66160761, 0.65169416, + 0.64166810, 0.63153117, 0.62128512, 0.61093173, 0.60047278, 0.58991008, + 0.57924546, 0.56848075, 0.55761782, 0.54665854, 0.53560481, 0.52445853, + 0.51322164, 0.50189608, 0.49048379, 0.47898676, 0.46740697, 0.45574642, + 0.44400713, 0.43219112, 0.42030043, 0.40833713, 0.39630327, 0.38420093, + 0.37203222, 0.35979922, 0.34750406, 0.33514885, 0.32273574, 0.31026687, + 0.29774438, 0.28517045, 0.27254725, 0.25987696, 0.24716177, 0.23440387, + 0.22160547, 0.20876878, 0.19589602, 0.18298941, 0.17005118, 0.15708358, + 0.14408883, 0.13106918, 0.11802689, 0.10496421, 0.09188339, 0.07878670, + 0.06567639, 0.05255473, 0.03942400, 0.02628645, 0.01314436, 0.00000000}; + +static const int kFilterOrder = 2; +static const float kCoeffNumerator[kFilterOrder + 1] = {0.974827f, + -1.949650f, + 0.974827f}; +static const 
float kCoeffDenominator[kFilterOrder + 1] = {1.0f, + -1.971999f, + 0.972457f}; + +static_assert(kFilterOrder + 1 == + sizeof(kCoeffNumerator) / sizeof(kCoeffNumerator[0]), + "numerator coefficients incorrect size"); +static_assert(kFilterOrder + 1 == + sizeof(kCoeffDenominator) / sizeof(kCoeffDenominator[0]), + "denominator coefficients incorrect size"); + +} // namespace webrtc + +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_AUDIO_PROC_INTERNAL_H_ diff --git a/webrtc/modules/audio_processing/agc/agc_audio_proc_unittest.cc b/webrtc/modules/audio_processing/vad/vad_audio_proc_unittest.cc similarity index 88% rename from webrtc/modules/audio_processing/agc/agc_audio_proc_unittest.cc rename to webrtc/modules/audio_processing/vad/vad_audio_proc_unittest.cc index 9534aec2ec..675af70b45 100644 --- a/webrtc/modules/audio_processing/agc/agc_audio_proc_unittest.cc +++ b/webrtc/modules/audio_processing/vad/vad_audio_proc_unittest.cc @@ -12,20 +12,22 @@ // routines. However, interpolation of pitch-gain and lags is in a separate // class and has its own unit-test. -#include "webrtc/modules/audio_processing/agc/agc_audio_proc.h" +#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" #include #include -#include "gtest/gtest.h" -#include "webrtc/modules/audio_processing/agc/common.h" +#include + +#include "testing/gtest/include/gtest/gtest.h" +#include "webrtc/modules/audio_processing/vad/common.h" #include "webrtc/modules/interface/module_common_types.h" #include "webrtc/test/testsupport/fileutils.h" namespace webrtc { TEST(AudioProcessingTest, DISABLED_ComputingFirstSpectralPeak) { - AgcAudioProc audioproc; + VadAudioProc audioproc; std::string peak_file_name = test::ResourcePath("audio_processing/agc/agc_spectral_peak", "dat"); @@ -39,7 +41,7 @@ TEST(AudioProcessingTest, DISABLED_ComputingFirstSpectralPeak) { // Read 10 ms audio in each iteration.
const size_t kDataLength = kLength10Ms; - int16_t data[kDataLength] = { 0 }; + int16_t data[kDataLength] = {0}; AudioFeatures features; double sp[kMaxNumFrames]; while (fread(data, sizeof(int16_t), kDataLength, pcm_file) == kDataLength) { diff --git a/webrtc/modules/audio_processing/agc/circular_buffer.cc b/webrtc/modules/audio_processing/vad/vad_circular_buffer.cc similarity index 75% rename from webrtc/modules/audio_processing/agc/circular_buffer.cc rename to webrtc/modules/audio_processing/vad/vad_circular_buffer.cc index 8ecb76008f..d337893c45 100644 --- a/webrtc/modules/audio_processing/agc/circular_buffer.cc +++ b/webrtc/modules/audio_processing/vad/vad_circular_buffer.cc @@ -8,42 +8,44 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "webrtc/modules/audio_processing/agc/circular_buffer.h" +#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h" #include #include namespace webrtc { -AgcCircularBuffer::AgcCircularBuffer(int buffer_size) +VadCircularBuffer::VadCircularBuffer(int buffer_size) : buffer_(new double[buffer_size]), is_full_(false), index_(0), buffer_size_(buffer_size), - sum_(0) {} + sum_(0) { +} -AgcCircularBuffer::~AgcCircularBuffer() {} +VadCircularBuffer::~VadCircularBuffer() { +} -void AgcCircularBuffer::Reset() { +void VadCircularBuffer::Reset() { is_full_ = false; index_ = 0; sum_ = 0; } -AgcCircularBuffer* AgcCircularBuffer::Create(int buffer_size) { +VadCircularBuffer* VadCircularBuffer::Create(int buffer_size) { if (buffer_size <= 0) return NULL; - return new AgcCircularBuffer(buffer_size); + return new VadCircularBuffer(buffer_size); } -double AgcCircularBuffer::Oldest() const { +double VadCircularBuffer::Oldest() const { if (!is_full_) return buffer_[0]; else return buffer_[index_]; } -double AgcCircularBuffer::Mean() { +double VadCircularBuffer::Mean() { double m; if (is_full_) { m = sum_ / buffer_size_; @@ -56,7 +58,7 @@ double AgcCircularBuffer::Mean() { return m; } -void 
AgcCircularBuffer::Insert(double value) { +void VadCircularBuffer::Insert(double value) { if (is_full_) { sum_ -= buffer_[index_]; } @@ -68,13 +70,13 @@ void AgcCircularBuffer::Insert(double value) { index_ = 0; } } -int AgcCircularBuffer::BufferLevel() { +int VadCircularBuffer::BufferLevel() { if (is_full_) return buffer_size_; return index_; } -int AgcCircularBuffer::Get(int index, double* value) const { +int VadCircularBuffer::Get(int index, double* value) const { int err = ConvertToLinearIndex(&index); if (err < 0) return -1; @@ -82,7 +84,7 @@ int AgcCircularBuffer::Get(int index, double* value) const { return 0; } -int AgcCircularBuffer::Set(int index, double value) { +int VadCircularBuffer::Set(int index, double value) { int err = ConvertToLinearIndex(&index); if (err < 0) return -1; @@ -93,7 +95,7 @@ int AgcCircularBuffer::Set(int index, double value) { return 0; } -int AgcCircularBuffer::ConvertToLinearIndex(int* index) const { +int VadCircularBuffer::ConvertToLinearIndex(int* index) const { if (*index < 0 || *index >= buffer_size_) return -1; @@ -106,7 +108,7 @@ int AgcCircularBuffer::ConvertToLinearIndex(int* index) const { return 0; } -int AgcCircularBuffer::RemoveTransient(int width_threshold, +int VadCircularBuffer::RemoveTransient(int width_threshold, double val_threshold) { if (!is_full_ && index_ < width_threshold + 2) return 0; diff --git a/webrtc/modules/audio_processing/agc/circular_buffer.h b/webrtc/modules/audio_processing/vad/vad_circular_buffer.h similarity index 86% rename from webrtc/modules/audio_processing/agc/circular_buffer.h rename to webrtc/modules/audio_processing/vad/vad_circular_buffer.h index eee60977d1..5238f77257 100644 --- a/webrtc/modules/audio_processing/agc/circular_buffer.h +++ b/webrtc/modules/audio_processing/vad/vad_circular_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AGC_CIRCULAR_BUFFER_H_ -#define WEBRTC_MODULES_AUDIO_PROCESSING_AGC_CIRCULAR_BUFFER_H_ +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_CIRCULAR_BUFFER_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_CIRCULAR_BUFFER_H_ #include "webrtc/base/scoped_ptr.h" @@ -21,10 +21,10 @@ namespace webrtc { // It is used in class "PitchBasedActivity" to keep track of posterior // probabilities in the past few seconds. The posterior probabilities are used // to recursively update prior probabilities. -class AgcCircularBuffer { +class VadCircularBuffer { public: - static AgcCircularBuffer* Create(int buffer_size); - ~AgcCircularBuffer(); + static VadCircularBuffer* Create(int buffer_size); + ~VadCircularBuffer(); // If buffer is wrapped around. bool is_full() const { return is_full_; } @@ -44,7 +44,7 @@ class AgcCircularBuffer { int RemoveTransient(int width_threshold, double val_threshold); private: - explicit AgcCircularBuffer(int buffer_size); + explicit VadCircularBuffer(int buffer_size); // Get previous values. |index = 0| corresponds to the most recent // insertion. |index = 1| is the one before the most recent insertion, and // so on. @@ -66,4 +66,4 @@ class AgcCircularBuffer { }; } // namespace webrtc -#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AGC_CIRCULAR_BUFFER_H_ +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VAD_CIRCULAR_BUFFER_H_ diff --git a/webrtc/modules/audio_processing/agc/circular_buffer_unittest.cc b/webrtc/modules/audio_processing/vad/vad_circular_buffer_unittest.cc similarity index 76% rename from webrtc/modules/audio_processing/agc/circular_buffer_unittest.cc rename to webrtc/modules/audio_processing/vad/vad_circular_buffer_unittest.cc index e80a5d0fa1..11945e042c 100644 --- a/webrtc/modules/audio_processing/agc/circular_buffer_unittest.cc +++ b/webrtc/modules/audio_processing/vad/vad_circular_buffer_unittest.cc @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "webrtc/modules/audio_processing/agc/circular_buffer.h" +#include "webrtc/modules/audio_processing/vad/vad_circular_buffer.h" #include @@ -22,7 +22,7 @@ static const double kValThreshold = 1.0; static const int kLongBuffSize = 100; static const int kShortBuffSize = 10; -static void InsertSequentially(int k, AgcCircularBuffer* circular_buffer) { +static void InsertSequentially(int k, VadCircularBuffer* circular_buffer) { double mean_val; for (int n = 1; n <= k; n++) { EXPECT_TRUE(!circular_buffer->is_full()); @@ -32,19 +32,20 @@ static void InsertSequentially(int k, AgcCircularBuffer* circular_buffer) { } } -static void Insert(double value, int num_insertion, - AgcCircularBuffer* circular_buffer) { +static void Insert(double value, + int num_insertion, + VadCircularBuffer* circular_buffer) { for (int n = 0; n < num_insertion; n++) circular_buffer->Insert(value); } -static void InsertZeros(int num_zeros, AgcCircularBuffer* circular_buffer) { +static void InsertZeros(int num_zeros, VadCircularBuffer* circular_buffer) { Insert(0.0, num_zeros, circular_buffer); } -TEST(AgcCircularBufferTest, GeneralTest) { - rtc::scoped_ptr circular_buffer( - AgcCircularBuffer::Create(kShortBuffSize)); +TEST(VadCircularBufferTest, GeneralTest) { + rtc::scoped_ptr circular_buffer( + VadCircularBuffer::Create(kShortBuffSize)); double mean_val; // Mean should return zero if nothing is inserted. @@ -70,9 +71,9 @@ TEST(AgcCircularBufferTest, GeneralTest) { EXPECT_TRUE(circular_buffer->is_full()); } -TEST(AgcCircularBufferTest, TransientsRemoval) { - rtc::scoped_ptr circular_buffer( - AgcCircularBuffer::Create(kLongBuffSize)); +TEST(VadCircularBufferTest, TransientsRemoval) { + rtc::scoped_ptr circular_buffer( + VadCircularBuffer::Create(kLongBuffSize)); // Let the first transient be in wrap-around. 
InsertZeros(kLongBuffSize - kWidthThreshold / 2, circular_buffer.get()); @@ -89,9 +90,9 @@ TEST(AgcCircularBufferTest, TransientsRemoval) { } } -TEST(AgcCircularBufferTest, TransientDetection) { - rtc::scoped_ptr circular_buffer( - AgcCircularBuffer::Create(kLongBuffSize)); +TEST(VadCircularBufferTest, TransientDetection) { + rtc::scoped_ptr circular_buffer( + VadCircularBuffer::Create(kLongBuffSize)); // Let the first transient be in wrap-around. int num_insertion = kLongBuffSize - kWidthThreshold / 2; InsertZeros(num_insertion, circular_buffer.get()); @@ -104,8 +105,8 @@ TEST(AgcCircularBufferTest, TransientDetection) { double mean_val = circular_buffer->Mean(); EXPECT_DOUBLE_EQ(num_non_zero_elements * push_val / kLongBuffSize, mean_val); circular_buffer->Insert(0); - EXPECT_EQ(0, circular_buffer->RemoveTransient(kWidthThreshold, - kValThreshold)); + EXPECT_EQ(0, + circular_buffer->RemoveTransient(kWidthThreshold, kValThreshold)); mean_val = circular_buffer->Mean(); EXPECT_DOUBLE_EQ(num_non_zero_elements * push_val / kLongBuffSize, mean_val); @@ -114,8 +115,8 @@ TEST(AgcCircularBufferTest, TransientDetection) { num_insertion = 3; Insert(push_val, num_insertion, circular_buffer.get()); circular_buffer->Insert(0); - EXPECT_EQ(0, circular_buffer->RemoveTransient(kWidthThreshold, - kValThreshold)); + EXPECT_EQ(0, + circular_buffer->RemoveTransient(kWidthThreshold, kValThreshold)); mean_val = circular_buffer->Mean(); EXPECT_DOUBLE_EQ(num_non_zero_elements * push_val / kLongBuffSize, mean_val); @@ -123,8 +124,8 @@ TEST(AgcCircularBufferTest, TransientDetection) { // it shouldn't be considered transient. 
Insert(push_val, num_insertion, circular_buffer.get()); num_non_zero_elements += num_insertion; - EXPECT_EQ(0, circular_buffer->RemoveTransient(kWidthThreshold, - kValThreshold)); + EXPECT_EQ(0, + circular_buffer->RemoveTransient(kWidthThreshold, kValThreshold)); mean_val = circular_buffer->Mean(); EXPECT_DOUBLE_EQ(num_non_zero_elements * push_val / kLongBuffSize, mean_val); } diff --git a/webrtc/modules/audio_processing/vad/voice_activity_detector.cc b/webrtc/modules/audio_processing/vad/voice_activity_detector.cc new file mode 100644 index 0000000000..79928d1aa0 --- /dev/null +++ b/webrtc/modules/audio_processing/vad/voice_activity_detector.cc @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h" + +#include + +#include "webrtc/base/checks.h" + +namespace webrtc { +namespace { + +const int kMaxLength = 320; +const int kNumChannels = 1; + +const double kDefaultVoiceValue = 1.0; +const double kNeutralProbability = 0.5; +const double kLowProbability = 0.01; + +} // namespace + +VoiceActivityDetector::VoiceActivityDetector() + : last_voice_probability_(kDefaultVoiceValue), + // Initialize to the most common resampling situation. + resampler_(kMaxLength, kLength10Ms, kNumChannels), + standalone_vad_(StandaloneVad::Create()) { +} + +// Because ISAC has a different chunk length, it updates +// |chunkwise_voice_probabilities_| and |chunkwise_rms_| when there is new data. +// Otherwise it clears them. 
+void VoiceActivityDetector::ProcessChunk(const int16_t* audio, + int length, + int sample_rate_hz) { + DCHECK_EQ(length, sample_rate_hz / 100); + DCHECK_LE(length, kMaxLength); + // Resample to the required rate. + const int16_t* resampled_ptr = audio; + if (sample_rate_hz != kSampleRateHz) { + CHECK_EQ( + resampler_.ResetIfNeeded(sample_rate_hz, kSampleRateHz, kNumChannels), + 0); + resampler_.Push(audio, length, resampled_, kLength10Ms, length); + resampled_ptr = resampled_; + } + DCHECK_EQ(length, kLength10Ms); + + // Each chunk needs to be passed into |standalone_vad_|, because internally it + // buffers the audio and processes it all at once when GetActivity() is + // called. The resampled data is used so that it matches |length|. + CHECK_EQ(standalone_vad_->AddAudio(resampled_ptr, length), 0); + + audio_processing_.ExtractFeatures(resampled_ptr, length, &features_); + + chunkwise_voice_probabilities_.resize(features_.num_frames); + chunkwise_rms_.resize(features_.num_frames); + std::copy(features_.rms, features_.rms + chunkwise_rms_.size(), + chunkwise_rms_.begin()); + if (features_.num_frames > 0) { + if (features_.silence) { + // The other features are invalid, so set the voice probabilities to an + // arbitrary low value.
+ std::fill(chunkwise_voice_probabilities_.begin(), + chunkwise_voice_probabilities_.end(), kLowProbability); + } else { + std::fill(chunkwise_voice_probabilities_.begin(), + chunkwise_voice_probabilities_.end(), kNeutralProbability); + CHECK_GE( + standalone_vad_->GetActivity(&chunkwise_voice_probabilities_[0], + chunkwise_voice_probabilities_.size()), + 0); + CHECK_GE(pitch_based_vad_.VoicingProbability( + features_, &chunkwise_voice_probabilities_[0]), + 0); + } + last_voice_probability_ = chunkwise_voice_probabilities_.back(); + } +} + +} // namespace webrtc diff --git a/webrtc/modules/audio_processing/vad/voice_activity_detector.h b/webrtc/modules/audio_processing/vad/voice_activity_detector.h new file mode 100644 index 0000000000..aedd6ed324 --- /dev/null +++ b/webrtc/modules/audio_processing/vad/voice_activity_detector.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_ACTIVITY_DETECTOR_H_ +#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_ACTIVITY_DETECTOR_H_ + +#include + +#include "webrtc/base/scoped_ptr.h" +#include "webrtc/common_audio/resampler/include/resampler.h" +#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h" +#include "webrtc/modules/audio_processing/vad/common.h" +#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h" +#include "webrtc/modules/audio_processing/vad/standalone_vad.h" + +namespace webrtc { + +// A Voice Activity Detector (VAD) that combines the voice probability from the +// StandaloneVad and PitchBasedVad to get a more robust estimation.
+class VoiceActivityDetector { + public: + VoiceActivityDetector(); + + // Processes each audio chunk and estimates the voice probability. The maximum + // supported sample rate is 32kHz. + // TODO(aluebs): Change |length| to size_t. + void ProcessChunk(const int16_t* audio, int length, int sample_rate_hz); + + // Returns a vector of voice probabilities for each chunk. It can be empty for + // some chunks, but it catches up afterwards returning multiple values at + // once. + const std::vector& chunkwise_voice_probabilities() const { + return chunkwise_voice_probabilities_; + } + + // Returns a vector of RMS values for each chunk. It has the same length as + // chunkwise_voice_probabilities(). + const std::vector& chunkwise_rms() const { return chunkwise_rms_; } + + // Returns the last voice probability, regardless of the internal + // implementation, although it has a few chunks of delay. + float last_voice_probability() const { return last_voice_probability_; } + + private: + // TODO(aluebs): Change these to float. + std::vector chunkwise_voice_probabilities_; + std::vector chunkwise_rms_; + + float last_voice_probability_; + + Resampler resampler_; + VadAudioProc audio_processing_; + + rtc::scoped_ptr standalone_vad_; + PitchBasedVad pitch_based_vad_; + + int16_t resampled_[kLength10Ms]; + AudioFeatures features_; +}; + +} // namespace webrtc + +#endif // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_ACTIVITY_DETECTOR_H_ diff --git a/webrtc/modules/audio_processing/vad/voice_activity_detector_unittest.cc b/webrtc/modules/audio_processing/vad/voice_activity_detector_unittest.cc new file mode 100644 index 0000000000..f4ee17760e --- /dev/null +++ b/webrtc/modules/audio_processing/vad/voice_activity_detector_unittest.cc @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2015 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "webrtc/modules/audio_processing/vad/voice_activity_detector.h" + +#include +#include + +#include "testing/gtest/include/gtest/gtest.h" +#include "webrtc/test/testsupport/fileutils.h" + +namespace webrtc { +namespace { + +const int kStartTimeSec = 16; +const float kMeanSpeechProbability = 0.3f; +const float kMaxNoiseProbability = 0.1f; +const size_t kNumChunks = 300u; +const size_t kNumChunksPerIsacBlock = 3; + +void GenerateNoise(std::vector* data) { + for (size_t i = 0; i < data->size(); ++i) { + // std::rand returns between 0 and RAND_MAX, but this will work because it + // wraps into some random place. + (*data)[i] = std::rand(); + } +} + +} // namespace + +TEST(VoiceActivityDetectorTest, ConstructorSetsDefaultValues) { + const float kDefaultVoiceValue = 1.f; + + VoiceActivityDetector vad; + + std::vector p = vad.chunkwise_voice_probabilities(); + std::vector rms = vad.chunkwise_rms(); + + EXPECT_EQ(p.size(), 0u); + EXPECT_EQ(rms.size(), 0u); + + EXPECT_FLOAT_EQ(vad.last_voice_probability(), kDefaultVoiceValue); +} + +TEST(VoiceActivityDetectorTest, Speech16kHzHasHighVoiceProbabilities) { + const int kSampleRateHz = 16000; + const int kLength10Ms = kSampleRateHz / 100; + + VoiceActivityDetector vad; + + std::vector data(kLength10Ms); + float mean_probability = 0.f; + + FILE* pcm_file = + fopen(test::ResourcePath("audio_processing/transient/audio16kHz", "pcm") + .c_str(), + "rb"); + ASSERT_TRUE(pcm_file != nullptr); + // The silences in the file are skipped to get a more robust voice probability + // for speech. 
+ ASSERT_EQ(fseek(pcm_file, kStartTimeSec * kSampleRateHz * sizeof(data[0]), + SEEK_SET), + 0); + + size_t num_chunks = 0; + while (fread(&data[0], sizeof(data[0]), data.size(), pcm_file) == + data.size()) { + vad.ProcessChunk(&data[0], data.size(), kSampleRateHz); + + mean_probability += vad.last_voice_probability(); + + ++num_chunks; + } + + mean_probability /= num_chunks; + + EXPECT_GT(mean_probability, kMeanSpeechProbability); +} + +TEST(VoiceActivityDetectorTest, Speech32kHzHasHighVoiceProbabilities) { + const int kSampleRateHz = 32000; + const int kLength10Ms = kSampleRateHz / 100; + + VoiceActivityDetector vad; + + std::vector data(kLength10Ms); + float mean_probability = 0.f; + + FILE* pcm_file = + fopen(test::ResourcePath("audio_processing/transient/audio32kHz", "pcm") + .c_str(), + "rb"); + ASSERT_TRUE(pcm_file != nullptr); + // The silences in the file are skipped to get a more robust voice probability + // for speech. + ASSERT_EQ(fseek(pcm_file, kStartTimeSec * kSampleRateHz * sizeof(data[0]), + SEEK_SET), + 0); + + size_t num_chunks = 0; + while (fread(&data[0], sizeof(data[0]), data.size(), pcm_file) == + data.size()) { + vad.ProcessChunk(&data[0], data.size(), kSampleRateHz); + + mean_probability += vad.last_voice_probability(); + + ++num_chunks; + } + + mean_probability /= num_chunks; + + EXPECT_GT(mean_probability, kMeanSpeechProbability); +} + +TEST(VoiceActivityDetectorTest, Noise16kHzHasLowVoiceProbabilities) { + VoiceActivityDetector vad; + + std::vector data(kLength10Ms); + float max_probability = 0.f; + + std::srand(42); + + for (size_t i = 0; i < kNumChunks; ++i) { + GenerateNoise(&data); + + vad.ProcessChunk(&data[0], data.size(), kSampleRateHz); + + // Before the |vad| has enough data to process an ISAC block it will return + // the default value, 1.f, which would ruin the |max_probability| value.
+ if (i > kNumChunksPerIsacBlock) { + max_probability = std::max(max_probability, vad.last_voice_probability()); + } + } + + EXPECT_LT(max_probability, kMaxNoiseProbability); +} + +TEST(VoiceActivityDetectorTest, Noise32kHzHasLowVoiceProbabilities) { + VoiceActivityDetector vad; + + std::vector data(2 * kLength10Ms); + float max_probability = 0.f; + + std::srand(42); + + for (size_t i = 0; i < kNumChunks; ++i) { + GenerateNoise(&data); + + vad.ProcessChunk(&data[0], data.size(), 2 * kSampleRateHz); + + // Before the |vad| has enough data to process an ISAC block it will return + // the default value, 1.f, which would ruin the |max_probability| value. + if (i > kNumChunksPerIsacBlock) { + max_probability = std::max(max_probability, vad.last_voice_probability()); + } + } + + EXPECT_LT(max_probability, kMaxNoiseProbability); +} + +} // namespace webrtc diff --git a/webrtc/modules/audio_processing/vad/voice_gmm_tables.h b/webrtc/modules/audio_processing/vad/voice_gmm_tables.h new file mode 100644 index 0000000000..2f247c3798 --- /dev/null +++ b/webrtc/modules/audio_processing/vad/voice_gmm_tables.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +// GMM tables for active segments. Generated by MakeGmmTables.m.
+
+#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_GMM_TABLES_H_
+#define WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_GMM_TABLES_H_
+// Dimensions shared by the three tables below.
+static const int kVoiceGmmNumMixtures = 12;  // Entries in every table.
+static const int kVoiceGmmDim = 3;  // Length of each mean vector.
+// Inverse covariances (per the name): one symmetric 3x3 matrix per mixture.
+static const double
+    kVoiceGmmCovarInverse[kVoiceGmmNumMixtures][kVoiceGmmDim][kVoiceGmmDim] = {
+        {{1.83673825579513e+00, -8.09791637570095e-04, 4.60106414365986e-03},
+         {-8.09791637570095e-04, 8.89351738394608e-04, -9.80188953277734e-04},
+         {4.60106414365986e-03, -9.80188953277734e-04, 1.38706060206582e-03}},
+        {{6.76228912850703e+01, -1.98893120119660e-02, -3.53548357253551e-03},
+         {-1.98893120119660e-02, 3.96216858500530e-05, -4.08492938394097e-05},
+         {-3.53548357253551e-03, -4.08492938394097e-05, 9.31864352856416e-04}},
+        {{9.98612435944558e+00, -5.27880954316893e-03, -6.30342541619017e-03},
+         {-5.27880954316893e-03, 4.54359480225226e-05, 6.30804591626044e-05},
+         {-6.30342541619017e-03, 6.30804591626044e-05, 5.36466441382942e-04}},
+        {{3.39917474216349e+01, -1.56213579433191e-03, -4.01459014990225e-02},
+         {-1.56213579433191e-03, 6.40415424897724e-05, 6.20076342427833e-05},
+         {-4.01459014990225e-02, 6.20076342427833e-05, 3.51199070103063e-03}},
+        {{1.34545062271428e+01, -7.94513610147144e-03, -5.34401019341728e-02},
+         {-7.94513610147144e-03, 1.16511820098649e-04, 4.66063702069293e-05},
+         {-5.34401019341728e-02, 4.66063702069293e-05, 2.72354323774163e-03}},
+        {{1.08557844314806e+02, -1.54885805673668e-02, -1.88029692674851e-02},
+         {-1.54885805673668e-02, 1.16404042786406e-04, 6.45579292702802e-06},
+         {-1.88029692674851e-02, 6.45579292702802e-06, 4.32330478391416e-04}},
+        {{8.22940066541450e+01, -1.15903110231303e-02, -4.92166764865343e-02},
+         {-1.15903110231303e-02, 7.42510742165261e-05, 3.73007314191290e-06},
+         {-4.92166764865343e-02, 3.73007314191290e-06, 3.64005221593244e-03}},
+        {{2.31133605685660e+00, -7.83261568950254e-04, 7.45744012346313e-04},
+         {-7.83261568950254e-04, 1.29460648214142e-05, -2.22774455093730e-06},
+         {7.45744012346313e-04, -2.22774455093730e-06, 1.05117294093010e-04}},
+        {{3.78767849189611e+02, 1.57759761011568e-03, -2.08551217988774e-02},
+         {1.57759761011568e-03, 4.76066236886865e-05, -2.33977412299324e-05},
+         {-2.08551217988774e-02, -2.33977412299324e-05, 5.24261005371196e-04}},
+        {{6.98580096506135e-01, -5.13850255217378e-04, -4.01124551717056e-04},
+         {-5.13850255217378e-04, 1.40501021984840e-06, -2.09496928716569e-06},
+         {-4.01124551717056e-04, -2.09496928716569e-06, 2.82879357740037e-04}},
+        {{2.62770945162399e+00, -2.31825753241430e-03, -5.30447217466318e-03},
+         {-2.31825753241430e-03, 4.59108572227649e-05, 7.67631886355405e-05},
+         {-5.30447217466318e-03, 7.67631886355405e-05, 2.28521601674098e-03}},
+        {{1.89940391362152e+02, -4.23280856852379e-03, -2.70608873541399e-02},
+         {-4.23280856852379e-03, 6.77547582742563e-05, 2.69154203800467e-05},
+         {-2.70608873541399e-02, 2.69154203800467e-05, 3.88574543373470e-03}}};
+// Mean vectors: one row of kVoiceGmmDim entries per mixture.
+static const double kVoiceGmmMean[kVoiceGmmNumMixtures][kVoiceGmmDim] = {
+    {-2.15020241646536e+00, 4.97079062999877e+02, 4.77078119504505e+02},
+    {-8.92097680029190e-01, 5.92064964199921e+02, 1.81045145941059e+02},
+    {-1.29435784144398e+00, 4.98450293410611e+02, 1.71991263804064e+02},
+    {-1.03925228397884e+00, 4.99511274321571e+02, 1.05838336539105e+02},
+    {-1.29229047206129e+00, 4.15026762566707e+02, 1.12861119017125e+02},
+    {-7.88748114599810e-01, 4.48739336688113e+02, 1.89784216956337e+02},
+    {-8.77777402332642e-01, 4.86620285054533e+02, 1.13477708016491e+02},
+    {-2.06465957063057e+00, 6.33385049870607e+02, 2.32758546796149e+02},
+    {-6.98893789231685e-01, 5.93622051503385e+02, 1.92536982473203e+02},
+    {-2.55901217508894e+00, 1.55914919756205e+03, 1.39769980835570e+02},
+    {-1.92070024165837e+00, 4.87983940444185e+02, 1.02745468128289e+02},
+    {-7.29187507662854e-01, 5.22717685022855e+02, 1.16377942283991e+02}};
+// One weight per mixture. NOTE(review): all negative; log-domain? Confirm.
+static const double kVoiceGmmWeights[kVoiceGmmNumMixtures] = {
+    -1.39789694361035e+01,
+    -1.19527720202104e+01,
+    -1.32396317929055e+01,
+    -1.09436815209238e+01,
+    -1.13440027478149e+01,
+    -1.12200721834504e+01,
+    -1.02537324043693e+01,
+    -1.60789861938302e+01,
+    -1.03394494048344e+01,
+    -1.83207938586818e+01,
+    -1.31186044948288e+01,
+    -9.52479998673554e+00};
+#endif  // WEBRTC_MODULES_AUDIO_PROCESSING_VAD_VOICE_GMM_TABLES_H_
diff --git a/webrtc/modules/modules.gyp b/webrtc/modules/modules.gyp
index 150ee8e575..fc0673acec 100644
--- a/webrtc/modules/modules.gyp
+++ b/webrtc/modules/modules.gyp
@@ -161,15 +161,8 @@
             'audio_processing/aec/system_delay_unittest.cc',
             # TODO(ajm): Fix to match new interface.
             # 'audio_processing/agc/agc_unittest.cc',
-            'audio_processing/agc/agc_audio_proc_unittest.cc',
-            'audio_processing/agc/circular_buffer_unittest.cc',
-            'audio_processing/agc/gmm_unittest.cc',
             'audio_processing/agc/histogram_unittest.cc',
             'audio_processing/agc/mock_agc.h',
-            'audio_processing/agc/pitch_based_vad_unittest.cc',
-            'audio_processing/agc/pitch_internal_unittest.cc',
-            'audio_processing/agc/pole_zero_filter_unittest.cc',
-            'audio_processing/agc/standalone_vad_unittest.cc',
             'audio_processing/beamformer/complex_matrix_unittest.cc',
             'audio_processing/beamformer/covariance_matrix_generator_unittest.cc',
             'audio_processing/beamformer/matrix_unittest.cc',
@@ -187,6 +180,14 @@
             'audio_processing/transient/wpd_node_unittest.cc',
             'audio_processing/transient/wpd_tree_unittest.cc',
             'audio_processing/utility/delay_estimator_unittest.cc',
+            'audio_processing/vad/gmm_unittest.cc',
+            'audio_processing/vad/pitch_based_vad_unittest.cc',
+            'audio_processing/vad/pitch_internal_unittest.cc',
+            'audio_processing/vad/pole_zero_filter_unittest.cc',
+            'audio_processing/vad/standalone_vad_unittest.cc',
+            'audio_processing/vad/vad_audio_proc_unittest.cc',
+            'audio_processing/vad/vad_circular_buffer_unittest.cc',
+            'audio_processing/vad/voice_activity_detector_unittest.cc',
             'bitrate_controller/bitrate_allocator_unittest.cc',
             'bitrate_controller/bitrate_controller_unittest.cc',
'bitrate_controller/send_side_bandwidth_estimation_unittest.cc',
diff --git a/webrtc/tools/agc/activity_metric.cc b/webrtc/tools/agc/activity_metric.cc
index 57e2ad615f..fb50daf2ba 100644
--- a/webrtc/tools/agc/activity_metric.cc
+++ b/webrtc/tools/agc/activity_metric.cc
@@ -18,12 +18,12 @@
 #include "gflags/gflags.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "webrtc/modules/audio_processing/agc/agc.h"
-#include "webrtc/modules/audio_processing/agc/agc_audio_proc.h"
-#include "webrtc/modules/audio_processing/agc/common.h"
 #include "webrtc/modules/audio_processing/agc/histogram.h"
-#include "webrtc/modules/audio_processing/agc/pitch_based_vad.h"
-#include "webrtc/modules/audio_processing/agc/standalone_vad.h"
 #include "webrtc/modules/audio_processing/agc/utility.h"
+#include "webrtc/modules/audio_processing/vad/vad_audio_proc.h"
+#include "webrtc/modules/audio_processing/vad/common.h"
+#include "webrtc/modules/audio_processing/vad/pitch_based_vad.h"
+#include "webrtc/modules/audio_processing/vad/standalone_vad.h"
 #include "webrtc/modules/interface/module_common_types.h"
 
 static const int kAgcAnalWindowSamples = 100;
@@ -75,7 +75,7 @@ class AgcStat {
       : video_index_(0),
         activity_threshold_(kDefaultActivityThreshold),
         audio_content_(Histogram::Create(kAgcAnalWindowSamples)),
-        audio_processing_(new AgcAudioProc()),
+        audio_processing_(new VadAudioProc()),
         vad_(new PitchBasedVad()),
         standalone_vad_(StandaloneVad::Create()),
         audio_content_fid_(NULL) {
@@ -155,7 +155,7 @@ class AgcStat {
   double activity_threshold_;
   double video_vad_[kMaxNumFrames];
   rtc::scoped_ptr<Histogram> audio_content_;
-  rtc::scoped_ptr<AgcAudioProc> audio_processing_;
+  rtc::scoped_ptr<VadAudioProc> audio_processing_;
   rtc::scoped_ptr<PitchBasedVad> vad_;
   rtc::scoped_ptr<StandaloneVad> standalone_vad_;