Delete voice_detection() pointer to submodule

The new configuration path is via AudioProcessing::ApplyConfig and
AudioProcessing::GetStatistics.

ApmTest.Process passes with unchanged reference files if
audio_processing_impl initializes the VAD with
VoiceDetection::kLowLikelihood instead of kVeryLowLikelihood.
This was verified by testing this CL with that modification.

Bug: webrtc:9878
Change-Id: I4d08df37a07e5c72feeec02a07d6b9435f917d72
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/155445
Commit-Queue: Sam Zackrisson <saza@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#29395}
This commit is contained in:
Sam Zackrisson
2019-10-07 14:03:56 +02:00
committed by Commit Bot
parent 24d251f796
commit 0824c6f61a
19 changed files with 223 additions and 511 deletions

View File

@ -156,8 +156,8 @@ rtc_static_library("audio_processing") {
"transient/wpd_tree.h",
"typing_detection.cc",
"typing_detection.h",
"voice_detection_impl.cc",
"voice_detection_impl.h",
"voice_detection.cc",
"voice_detection.h",
]
defines = []

View File

@ -40,7 +40,7 @@
#include "modules/audio_processing/noise_suppression_proxy.h"
#include "modules/audio_processing/residual_echo_detector.h"
#include "modules/audio_processing/transient/transient_suppressor.h"
#include "modules/audio_processing/voice_detection_impl.h"
#include "modules/audio_processing/voice_detection.h"
#include "rtc_base/atomic_ops.h"
#include "rtc_base/checks.h"
#include "rtc_base/constructor_magic.h"
@ -165,8 +165,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
bool gain_controller2_enabled,
bool pre_amplifier_enabled,
bool echo_controller_enabled,
bool voice_activity_detector_enabled,
bool private_voice_detector_enabled,
bool voice_detector_enabled,
bool level_estimator_enabled,
bool transient_suppressor_enabled) {
bool changed = false;
@ -183,10 +182,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
changed |= (pre_amplifier_enabled_ != pre_amplifier_enabled);
changed |= (echo_controller_enabled != echo_controller_enabled_);
changed |= (level_estimator_enabled != level_estimator_enabled_);
changed |=
(voice_activity_detector_enabled != voice_activity_detector_enabled_);
changed |=
(private_voice_detector_enabled != private_voice_detector_enabled_);
changed |= (voice_detector_enabled != voice_detector_enabled_);
changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
if (changed) {
high_pass_filter_enabled_ = high_pass_filter_enabled;
@ -199,8 +195,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
pre_amplifier_enabled_ = pre_amplifier_enabled;
echo_controller_enabled_ = echo_controller_enabled;
level_estimator_enabled_ = level_estimator_enabled;
voice_activity_detector_enabled_ = voice_activity_detector_enabled;
private_voice_detector_enabled_ = private_voice_detector_enabled;
voice_detector_enabled_ = voice_detector_enabled;
transient_suppressor_enabled_ = transient_suppressor_enabled;
}
@ -211,8 +206,7 @@ bool AudioProcessingImpl::ApmSubmoduleStates::Update(
bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandSubModulesActive()
const {
return CaptureMultiBandProcessingActive() ||
voice_activity_detector_enabled_ || private_voice_detector_enabled_;
return CaptureMultiBandProcessingActive() || voice_detector_enabled_;
}
bool AudioProcessingImpl::ApmSubmoduleStates::CaptureMultiBandProcessingActive()
@ -263,7 +257,6 @@ struct AudioProcessingImpl::ApmPublicSubmodules {
std::unique_ptr<LevelEstimatorImpl> level_estimator;
std::unique_ptr<NoiseSuppressionImpl> noise_suppression;
std::unique_ptr<NoiseSuppressionProxy> noise_suppression_proxy;
std::unique_ptr<VoiceDetectionImpl> voice_detection;
std::unique_ptr<GainControlImpl> gain_control;
std::unique_ptr<GainControlForExperimentalAgc>
gain_control_for_experimental_agc;
@ -295,7 +288,7 @@ struct AudioProcessingImpl::ApmPrivateSubmodules {
std::unique_ptr<GainApplier> pre_amplifier;
std::unique_ptr<CustomAudioAnalyzer> capture_analyzer;
std::unique_ptr<LevelEstimatorImpl> output_level_estimator;
std::unique_ptr<VoiceDetectionImpl> voice_detector;
std::unique_ptr<VoiceDetection> voice_detector;
};
AudioProcessingBuilder::AudioProcessingBuilder() = default;
@ -415,8 +408,6 @@ AudioProcessingImpl::AudioProcessingImpl(
new NoiseSuppressionImpl(&crit_capture_));
public_submodules_->noise_suppression_proxy.reset(new NoiseSuppressionProxy(
this, public_submodules_->noise_suppression.get()));
public_submodules_->voice_detection.reset(
new VoiceDetectionImpl(&crit_capture_));
public_submodules_->gain_control_for_experimental_agc.reset(
new GainControlForExperimentalAgc(
public_submodules_->gain_control.get()));
@ -556,11 +547,7 @@ int AudioProcessingImpl::InitializeLocked() {
InitializeHighPassFilter();
public_submodules_->noise_suppression->Initialize(num_proc_channels(),
proc_sample_rate_hz());
public_submodules_->voice_detection->Initialize(proc_split_sample_rate_hz());
if (private_submodules_->voice_detector) {
private_submodules_->voice_detector->Initialize(
proc_split_sample_rate_hz());
}
InitializeVoiceDetector();
public_submodules_->level_estimator->Initialize();
InitializeResidualEchoDetector();
InitializeEchoController();
@ -702,6 +689,9 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
config_.gain_controller1.analog_level_maximum !=
config.gain_controller1.analog_level_maximum;
const bool voice_detection_config_changed =
config_.voice_detection.enabled != config.voice_detection.enabled;
config_ = config;
if (aec_config_changed) {
@ -745,14 +735,8 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
private_submodules_->output_level_estimator->Enable(true);
}
if (config_.voice_detection.enabled && !private_submodules_->voice_detector) {
private_submodules_->voice_detector.reset(
new VoiceDetectionImpl(&crit_capture_));
private_submodules_->voice_detector->Enable(true);
private_submodules_->voice_detector->set_likelihood(
VoiceDetection::kVeryLowLikelihood);
private_submodules_->voice_detector->Initialize(
proc_split_sample_rate_hz());
if (voice_detection_config_changed) {
InitializeVoiceDetector();
}
// Reinitialization must happen after all submodule configuration to avoid
@ -1276,14 +1260,17 @@ int AudioProcessingImpl::ProcessStream(AudioFrame* frame) {
RecordUnprocessedCaptureStream(*frame);
}
capture_.vad_activity = frame->vad_activity_;
capture_.capture_audio->CopyFrom(frame);
RETURN_ON_ERR(ProcessCaptureStreamLocked());
if (submodule_states_.CaptureMultiBandProcessingActive() ||
submodule_states_.CaptureFullBandProcessingActive()) {
capture_.capture_audio->CopyTo(frame);
}
frame->vad_activity_ = capture_.vad_activity;
if (capture_.stats.voice_detected) {
frame->vad_activity_ = *capture_.stats.voice_detected
? AudioFrame::kVadActive
: AudioFrame::kVadPassive;
}
if (aec_dump_) {
RecordProcessedCaptureStream(*frame);
@ -1432,19 +1419,10 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
public_submodules_->noise_suppression->ProcessCaptureAudio(capture_buffer);
}
if (public_submodules_->voice_detection->is_enabled() &&
!public_submodules_->voice_detection->using_external_vad()) {
bool voice_active =
public_submodules_->voice_detection->ProcessCaptureAudio(
capture_buffer);
capture_.vad_activity =
voice_active ? AudioFrame::kVadActive : AudioFrame::kVadPassive;
}
if (config_.voice_detection.enabled) {
private_submodules_->voice_detector->ProcessCaptureAudio(capture_buffer);
capture_.stats.voice_detected =
private_submodules_->voice_detector->stream_has_voice();
private_submodules_->voice_detector->ProcessCaptureAudio(
capture_buffer);
} else {
capture_.stats.voice_detected = absl::nullopt;
}
@ -1817,10 +1795,6 @@ NoiseSuppression* AudioProcessingImpl::noise_suppression() const {
return public_submodules_->noise_suppression_proxy.get();
}
VoiceDetection* AudioProcessingImpl::voice_detection() const {
return public_submodules_->voice_detection.get();
}
void AudioProcessingImpl::MutateConfig(
rtc::FunctionView<void(AudioProcessing::Config*)> mutator) {
rtc::CritScope cs_render(&crit_render_);
@ -1845,7 +1819,6 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() {
public_submodules_->gain_control->is_enabled(),
config_.gain_controller2.enabled, config_.pre_amplifier.enabled,
capture_nonlocked_.echo_controller_enabled,
public_submodules_->voice_detection->is_enabled(),
config_.voice_detection.enabled,
public_submodules_->level_estimator->is_enabled(),
capture_.transient_suppressor_enabled);
@ -1871,6 +1844,14 @@ void AudioProcessingImpl::InitializeHighPassFilter() {
}
}
void AudioProcessingImpl::InitializeVoiceDetector() {
if (config_.voice_detection.enabled) {
private_submodules_->voice_detector = std::make_unique<VoiceDetection>(
proc_split_sample_rate_hz(), VoiceDetection::kVeryLowLikelihood);
} else {
private_submodules_->voice_detector.reset();
}
}
void AudioProcessingImpl::InitializeEchoController() {
bool use_echo_controller =
echo_control_factory_ ||

View File

@ -122,7 +122,6 @@ class AudioProcessingImpl : public AudioProcessing {
GainControl* gain_control() const override;
LevelEstimator* level_estimator() const override;
NoiseSuppression* noise_suppression() const override;
VoiceDetection* voice_detection() const override;
// TODO(peah): Remove MutateConfig once the new API allows that.
void MutateConfig(rtc::FunctionView<void(AudioProcessing::Config*)> mutator);
@ -182,8 +181,7 @@ class AudioProcessingImpl : public AudioProcessing {
bool gain_controller2_enabled,
bool pre_amplifier_enabled,
bool echo_controller_enabled,
bool voice_activity_detector_enabled,
bool private_voice_detector_enabled,
bool voice_detector_enabled,
bool level_estimator_enabled,
bool transient_suppressor_enabled);
bool CaptureMultiBandSubModulesActive() const;
@ -209,8 +207,7 @@ class AudioProcessingImpl : public AudioProcessing {
bool pre_amplifier_enabled_ = false;
bool echo_controller_enabled_ = false;
bool level_estimator_enabled_ = false;
bool voice_activity_detector_enabled_ = false;
bool private_voice_detector_enabled_ = false;
bool voice_detector_enabled_ = false;
bool transient_suppressor_enabled_ = false;
bool first_update_ = true;
};
@ -239,6 +236,7 @@ class AudioProcessingImpl : public AudioProcessing {
void InitializeResidualEchoDetector()
RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_render_, crit_capture_);
void InitializeHighPassFilter() RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_capture_);
void InitializeVoiceDetector() RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_capture_);
void InitializeEchoController()
RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_render_, crit_capture_);
void InitializeGainController2() RTC_EXCLUSIVE_LOCKS_REQUIRED(crit_capture_);
@ -405,7 +403,6 @@ class AudioProcessingImpl : public AudioProcessing {
size_t num_keyboard_frames = 0;
const float* keyboard_data = nullptr;
} keyboard_info;
AudioFrame::VADActivity vad_activity = AudioFrame::kVadUnknown;
} capture_ RTC_GUARDED_BY(crit_capture_);
struct ApmCaptureNonLockedState {

View File

@ -595,7 +595,6 @@ void StatsProcessor::Process() {
// The below return values are not testable.
apm_->noise_suppression()->speech_probability();
apm_->voice_detection()->is_enabled();
apm_->GetStatistics(/*has_remote_tracks=*/true);
}

View File

@ -193,12 +193,11 @@ void EnableAllAPComponents(AudioProcessing* ap) {
apm_config.high_pass_filter.enabled = true;
apm_config.level_estimation.enabled = true;
apm_config.voice_detection.enabled = true;
ap->ApplyConfig(apm_config);
EXPECT_NOERR(ap->level_estimator()->Enable(true));
EXPECT_NOERR(ap->noise_suppression()->Enable(true));
EXPECT_NOERR(ap->voice_detection()->Enable(true));
}
// These functions are only used by ApmTest.Process.
@ -1114,63 +1113,6 @@ TEST_F(ApmTest, LevelEstimator) {
EXPECT_EQ(90, apm_->level_estimator()->RMS());
}
TEST_F(ApmTest, VoiceDetection) {
// Test external VAD
EXPECT_EQ(apm_->kNoError,
apm_->voice_detection()->set_stream_has_voice(true));
EXPECT_TRUE(apm_->voice_detection()->stream_has_voice());
EXPECT_EQ(apm_->kNoError,
apm_->voice_detection()->set_stream_has_voice(false));
EXPECT_FALSE(apm_->voice_detection()->stream_has_voice());
// Test valid likelihoods
VoiceDetection::Likelihood likelihood[] = {
VoiceDetection::kVeryLowLikelihood, VoiceDetection::kLowLikelihood,
VoiceDetection::kModerateLikelihood, VoiceDetection::kHighLikelihood};
for (size_t i = 0; i < arraysize(likelihood); i++) {
EXPECT_EQ(apm_->kNoError,
apm_->voice_detection()->set_likelihood(likelihood[i]));
EXPECT_EQ(likelihood[i], apm_->voice_detection()->likelihood());
}
/* TODO(bjornv): Enable once VAD supports other frame lengths than 10 ms
// Test invalid frame sizes
EXPECT_EQ(apm_->kBadParameterError,
apm_->voice_detection()->set_frame_size_ms(12));
// Test valid frame sizes
for (int i = 10; i <= 30; i += 10) {
EXPECT_EQ(apm_->kNoError,
apm_->voice_detection()->set_frame_size_ms(i));
EXPECT_EQ(i, apm_->voice_detection()->frame_size_ms());
}
*/
// Turn VAD on/off
EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
EXPECT_TRUE(apm_->voice_detection()->is_enabled());
EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
EXPECT_FALSE(apm_->voice_detection()->is_enabled());
// Test that AudioFrame activity is maintained when VAD is disabled.
EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
AudioFrame::VADActivity activity[] = {
AudioFrame::kVadActive, AudioFrame::kVadPassive, AudioFrame::kVadUnknown};
for (size_t i = 0; i < arraysize(activity); i++) {
frame_->vad_activity_ = activity[i];
EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
EXPECT_EQ(activity[i], frame_->vad_activity_);
}
// Test that AudioFrame activity is set when VAD is enabled.
EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
frame_->vad_activity_ = AudioFrame::kVadUnknown;
EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
EXPECT_NE(AudioFrame::kVadUnknown, frame_->vad_activity_);
// TODO(bjornv): Add tests for streamed voice; stream_has_voice()
}
TEST_F(ApmTest, AllProcessingDisabledByDefault) {
AudioProcessing::Config config = apm_->GetConfig();
EXPECT_FALSE(config.echo_canceller.enabled);
@ -1180,7 +1122,6 @@ TEST_F(ApmTest, AllProcessingDisabledByDefault) {
EXPECT_FALSE(apm_->gain_control()->is_enabled());
EXPECT_FALSE(apm_->level_estimator()->is_enabled());
EXPECT_FALSE(apm_->noise_suppression()->is_enabled());
EXPECT_FALSE(apm_->voice_detection()->is_enabled());
}
TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) {
@ -1282,16 +1223,7 @@ TEST_F(ApmTest, SplittingFilter) {
EXPECT_TRUE(FrameDataAreEqual(*frame_, frame_copy));
EXPECT_EQ(apm_->kNoError, apm_->level_estimator()->Enable(false));
// 3. Only VAD is enabled...
SetFrameTo(frame_, 1000);
frame_copy.CopyFrom(*frame_);
EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
EXPECT_TRUE(FrameDataAreEqual(*frame_, frame_copy));
EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
// 4. Only GetStatistics-reporting VAD is enabled...
// 3. Only GetStatistics-reporting VAD is enabled...
SetFrameTo(frame_, 1000);
frame_copy.CopyFrom(*frame_);
auto apm_config = apm_->GetConfig();
@ -1303,18 +1235,16 @@ TEST_F(ApmTest, SplittingFilter) {
apm_config.voice_detection.enabled = false;
apm_->ApplyConfig(apm_config);
// 5. Both VADs and the level estimator are enabled...
// 4. Both the VAD and the level estimator are enabled...
SetFrameTo(frame_, 1000);
frame_copy.CopyFrom(*frame_);
EXPECT_EQ(apm_->kNoError, apm_->level_estimator()->Enable(true));
EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(true));
apm_config.voice_detection.enabled = true;
apm_->ApplyConfig(apm_config);
EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
EXPECT_EQ(apm_->kNoError, apm_->ProcessStream(frame_));
EXPECT_TRUE(FrameDataAreEqual(*frame_, frame_copy));
EXPECT_EQ(apm_->kNoError, apm_->level_estimator()->Enable(false));
EXPECT_EQ(apm_->kNoError, apm_->voice_detection()->Enable(false));
apm_config.voice_detection.enabled = false;
apm_->ApplyConfig(apm_config);
@ -1652,18 +1582,15 @@ TEST_F(ApmTest, Process) {
if (apm_->gain_control()->stream_is_saturated()) {
is_saturated_count++;
}
if (apm_->voice_detection()->stream_has_voice()) {
has_voice_count++;
EXPECT_EQ(AudioFrame::kVadActive, frame_->vad_activity_);
} else {
EXPECT_EQ(AudioFrame::kVadPassive, frame_->vad_activity_);
}
ns_speech_prob_average += apm_->noise_suppression()->speech_probability();
AudioProcessingStats stats =
apm_->GetStatistics(/*has_remote_tracks=*/false);
EXPECT_TRUE(stats.voice_detected);
EXPECT_TRUE(stats.output_rms_dbfs);
has_voice_count += *stats.voice_detected ? 1 : 0;
rms_dbfs_average += *stats.output_rms_dbfs;
ns_speech_prob_average += apm_->noise_suppression()->speech_probability();
size_t frame_size = frame_->samples_per_channel_ * frame_->num_channels_;
size_t write_count =
fwrite(frame_->data(), sizeof(int16_t), frame_size, out_file_);
@ -2566,7 +2493,6 @@ std::unique_ptr<AudioProcessing> CreateApm(bool mobile_aec) {
EXPECT_EQ(apm->gain_control()->Enable(false), 0);
EXPECT_EQ(apm->level_estimator()->Enable(false), 0);
EXPECT_EQ(apm->noise_suppression()->Enable(false), 0);
EXPECT_EQ(apm->voice_detection()->Enable(false), 0);
return apm;
}

View File

@ -53,7 +53,6 @@ class LevelEstimator;
class NoiseSuppression;
class CustomAudioAnalyzer;
class CustomProcessing;
class VoiceDetection;
// Use to enable the extended filter mode in the AEC, along with robustness
// measures around the reported system delays. It comes with a significant
@ -287,7 +286,10 @@ class AudioProcessing : public rtc::RefCountInterface {
Level level = kModerate;
} noise_suppression;
// Enables reporting of |has_voice| in webrtc::AudioProcessingStats.
// Enables reporting of |voice_detected| in webrtc::AudioProcessingStats.
// In addition to |voice_detected|, VAD decision is provided through the
// |AudioFrame| passed to |ProcessStream()|. The |vad_activity_| member will
// be modified to reflect the current decision.
struct VoiceDetection {
bool enabled = false;
} voice_detection;
@ -685,7 +687,6 @@ class AudioProcessing : public rtc::RefCountInterface {
virtual GainControl* gain_control() const = 0;
virtual LevelEstimator* level_estimator() const = 0;
virtual NoiseSuppression* noise_suppression() const = 0;
virtual VoiceDetection* voice_detection() const = 0;
// Returns the last applied configuration.
virtual AudioProcessing::Config GetConfig() const = 0;
@ -981,56 +982,6 @@ class EchoDetector : public rtc::RefCountInterface {
virtual Metrics GetMetrics() const = 0;
};
// The voice activity detection (VAD) component analyzes the stream to
// determine if voice is present. A facility is also provided to pass in an
// external VAD decision.
//
// In addition to |stream_has_voice()| the VAD decision is provided through the
// |AudioFrame| passed to |ProcessStream()|. The |vad_activity_| member will be
// modified to reflect the current decision.
class VoiceDetection {
public:
virtual int Enable(bool enable) = 0;
virtual bool is_enabled() const = 0;
// Returns true if voice is detected in the current frame. Should be called
// after |ProcessStream()|.
virtual bool stream_has_voice() const = 0;
// Some of the APM functionality requires a VAD decision. In the case that
// a decision is externally available for the current frame, it can be passed
// in here, before |ProcessStream()| is called.
//
// VoiceDetection does _not_ need to be enabled to use this. If it happens to
// be enabled, detection will be skipped for any frame in which an external
// VAD decision is provided.
virtual int set_stream_has_voice(bool has_voice) = 0;
// Specifies the likelihood that a frame will be declared to contain voice.
// A higher value makes it more likely that speech will not be clipped, at
// the expense of more noise being detected as voice.
enum Likelihood {
kVeryLowLikelihood,
kLowLikelihood,
kModerateLikelihood,
kHighLikelihood
};
virtual int set_likelihood(Likelihood likelihood) = 0;
virtual Likelihood likelihood() const = 0;
// Sets the |size| of the frames in ms on which the VAD will operate. Larger
// frames will improve detection accuracy, but reduce the frequency of
// updates.
//
// This does not impact the size of frames passed to |ProcessStream()|.
virtual int set_frame_size_ms(int size) = 0;
virtual int frame_size_ms() const = 0;
protected:
virtual ~VoiceDetection() {}
};
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_INCLUDE_AUDIO_PROCESSING_H_

View File

@ -91,26 +91,12 @@ class MockEchoControl : public EchoControl {
MOCK_METHOD1(SetAudioBufferDelay, void(size_t delay_ms));
};
class MockVoiceDetection : public VoiceDetection {
public:
virtual ~MockVoiceDetection() {}
MOCK_METHOD1(Enable, int(bool enable));
MOCK_CONST_METHOD0(is_enabled, bool());
MOCK_CONST_METHOD0(stream_has_voice, bool());
MOCK_METHOD1(set_stream_has_voice, int(bool has_voice));
MOCK_METHOD1(set_likelihood, int(Likelihood likelihood));
MOCK_CONST_METHOD0(likelihood, Likelihood());
MOCK_METHOD1(set_frame_size_ms, int(int size));
MOCK_CONST_METHOD0(frame_size_ms, int());
};
class MockAudioProcessing : public ::testing::NiceMock<AudioProcessing> {
public:
MockAudioProcessing()
: gain_control_(new ::testing::NiceMock<MockGainControl>()),
level_estimator_(new ::testing::NiceMock<MockLevelEstimator>()),
noise_suppression_(new ::testing::NiceMock<MockNoiseSuppression>()),
voice_detection_(new ::testing::NiceMock<MockVoiceDetection>()) {}
noise_suppression_(new ::testing::NiceMock<MockNoiseSuppression>()) {}
virtual ~MockAudioProcessing() {}
@ -183,9 +169,6 @@ class MockAudioProcessing : public ::testing::NiceMock<AudioProcessing> {
virtual MockNoiseSuppression* noise_suppression() const {
return noise_suppression_.get();
}
virtual MockVoiceDetection* voice_detection() const {
return voice_detection_.get();
}
MOCK_CONST_METHOD0(GetConfig, AudioProcessing::Config());
@ -193,7 +176,6 @@ class MockAudioProcessing : public ::testing::NiceMock<AudioProcessing> {
std::unique_ptr<MockGainControl> gain_control_;
std::unique_ptr<MockLevelEstimator> level_estimator_;
std::unique_ptr<MockNoiseSuppression> noise_suppression_;
std::unique_ptr<MockVoiceDetection> voice_detection_;
};
} // namespace test

View File

@ -455,6 +455,10 @@ void AudioProcessingSimulator::CreateAudioProcessor() {
apm_config.high_pass_filter.enabled = *settings_.use_hpf;
}
if (settings_.use_vad) {
apm_config.voice_detection.enabled = *settings_.use_vad;
}
if (settings_.use_refined_adaptive_filter) {
config.Set<RefinedAdaptiveFilter>(
new RefinedAdaptiveFilter(*settings_.use_refined_adaptive_filter));
@ -502,10 +506,6 @@ void AudioProcessingSimulator::CreateAudioProcessor() {
RTC_CHECK_EQ(AudioProcessing::kNoError,
ap_->level_estimator()->Enable(*settings_.use_le));
}
if (settings_.use_vad) {
RTC_CHECK_EQ(AudioProcessing::kNoError,
ap_->voice_detection()->Enable(*settings_.use_vad));
}
if (settings_.use_agc_limiter) {
RTC_CHECK_EQ(AudioProcessing::kNoError, ap_->gain_control()->enable_limiter(
*settings_.use_agc_limiter));
@ -526,13 +526,6 @@ void AudioProcessingSimulator::CreateAudioProcessor() {
ap_->gain_control()->set_mode(
static_cast<webrtc::GainControl::Mode>(*settings_.agc_mode)));
}
if (settings_.vad_likelihood) {
RTC_CHECK_EQ(AudioProcessing::kNoError,
ap_->voice_detection()->set_likelihood(
static_cast<webrtc::VoiceDetection::Likelihood>(
*settings_.vad_likelihood)));
}
if (settings_.ns_level) {
RTC_CHECK_EQ(
AudioProcessing::kNoError,

View File

@ -79,7 +79,6 @@ struct SimulationSettings {
AudioProcessing::Config::GainController2::LevelEstimator
agc2_adaptive_level_estimator;
absl::optional<float> pre_amplifier_gain_factor;
absl::optional<int> vad_likelihood;
absl::optional<int> ns_level;
absl::optional<int> maximum_internal_processing_rate;
absl::optional<bool> use_refined_adaptive_filter;

View File

@ -185,10 +185,6 @@ ABSL_FLAG(float,
pre_amplifier_gain_factor,
kParameterNotSpecifiedValue,
"Pre-amplifier gain factor (linear) to apply");
ABSL_FLAG(int,
vad_likelihood,
kParameterNotSpecifiedValue,
"Specify the VAD likelihood (0-3)");
ABSL_FLAG(int,
ns_level,
kParameterNotSpecifiedValue,
@ -423,8 +419,6 @@ SimulationSettings CreateSettings() {
absl::GetFlag(FLAGS_agc2_adaptive_level_estimator));
SetSettingIfSpecified(absl::GetFlag(FLAGS_pre_amplifier_gain_factor),
&settings.pre_amplifier_gain_factor);
SetSettingIfSpecified(absl::GetFlag(FLAGS_vad_likelihood),
&settings.vad_likelihood);
SetSettingIfSpecified(absl::GetFlag(FLAGS_ns_level), &settings.ns_level);
SetSettingIfSpecified(absl::GetFlag(FLAGS_maximum_internal_processing_rate),
&settings.maximum_internal_processing_rate);
@ -555,11 +549,6 @@ void PerformBasicParameterSanityChecks(const SimulationSettings& settings) {
(*settings.agc2_fixed_gain_db) > 90),
"Error: --agc2_fixed_gain_db must be specified between 0 and 90.\n");
ReportConditionalErrorAndExit(
settings.vad_likelihood &&
((*settings.vad_likelihood) < 0 || (*settings.vad_likelihood) > 3),
"Error: --vad_likelihood must be specified between 0 and 3.\n");
ReportConditionalErrorAndExit(
settings.ns_level &&
((*settings.ns_level) < 0 || (*settings.ns_level) > 3),

View File

@ -0,0 +1,93 @@
/*
* Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/voice_detection.h"
#include "api/audio/audio_frame.h"
#include "common_audio/vad/include/webrtc_vad.h"
#include "modules/audio_processing/audio_buffer.h"
#include "rtc_base/checks.h"
namespace webrtc {
// RAII wrapper around the C WebRtcVad handle: creates and initializes the
// state on construction and frees it on destruction. Non-copyable.
class VoiceDetection::Vad {
public:
Vad() {
state_ = WebRtcVad_Create();
// Allocation failure is fatal; a null handle would be dereferenced later.
RTC_CHECK(state_);
int error = WebRtcVad_Init(state_);
RTC_DCHECK_EQ(0, error);
}
~Vad() { WebRtcVad_Free(state_); }
Vad(Vad&) = delete;
Vad& operator=(Vad&) = delete;
// Raw handle passed to the WebRtcVad_* C API calls.
VadInst* state() { return state_; }
private:
VadInst* state_ = nullptr;
};
// Constructs the detector for 10 ms frames (sample_rate_hz / 100 samples)
// and maps the requested likelihood onto the WebRtcVad aggressiveness mode.
// Note the inverse mapping: a lower likelihood of declaring voice
// corresponds to a higher (more aggressive) VAD mode.
VoiceDetection::VoiceDetection(int sample_rate_hz, Likelihood likelihood)
: sample_rate_hz_(sample_rate_hz),
frame_size_samples_(static_cast<size_t>(sample_rate_hz_ / 100)),
likelihood_(likelihood),
vad_(new Vad()) {
int mode = 2;
switch (likelihood) {
case VoiceDetection::kVeryLowLikelihood:
mode = 3;
break;
case VoiceDetection::kLowLikelihood:
mode = 2;
break;
case VoiceDetection::kModerateLikelihood:
mode = 1;
break;
case VoiceDetection::kHighLikelihood:
mode = 0;
break;
default:
RTC_NOTREACHED();
break;
}
int error = WebRtcVad_set_mode(vad_->state(), mode);
RTC_DCHECK_EQ(0, error);
}
VoiceDetection::~VoiceDetection() {}
// Runs the VAD on the lowest band of |audio| and returns true if voice is
// detected in the current frame. Multi-channel input is averaged down to a
// mono int16 signal before being handed to the C VAD.
bool VoiceDetection::ProcessCaptureAudio(AudioBuffer* audio) {
RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength,
audio->num_frames_per_band());
std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
rtc::ArrayView<const int16_t> mixed_low_pass(mixed_low_pass_data.data(),
audio->num_frames_per_band());
if (audio->num_channels() == 1) {
// Single channel: convert the 0-8 kHz band straight to int16.
FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz],
audio->num_frames_per_band(), mixed_low_pass_data.data());
} else {
// Multiple channels: average the per-channel samples of the 0-8 kHz band.
const int num_channels = static_cast<int>(audio->num_channels());
for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
int32_t value =
FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[0][i]);
for (int j = 1; j < num_channels; ++j) {
value += FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[j][i]);
}
mixed_low_pass_data[i] = value / num_channels;
}
}
int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
mixed_low_pass.data(), frame_size_samples_);
// WebRtcVad_Process returns 1 for voice, 0 for no voice; anything else is
// an error, which is a programming bug here (inputs are validated above).
RTC_DCHECK(vad_ret == 0 || vad_ret == 1);
return vad_ret == 0 ? false : true;
}
} // namespace webrtc

View File

@ -0,0 +1,59 @@
/*
* Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
#include <stddef.h>
#include <memory>
#include "modules/audio_processing/include/audio_processing.h"
namespace webrtc {
class AudioBuffer;
// The voice activity detection (VAD) component analyzes the stream to
// determine if voice is present.
// The voice activity detection (VAD) component analyzes the stream to
// determine if voice is present.
class VoiceDetection {
public:
// Specifies the likelihood that a frame will be declared to contain voice.
// A higher value makes it more likely that speech will not be clipped, at
// the expense of more noise being detected as voice.
enum Likelihood {
kVeryLowLikelihood,
kLowLikelihood,
kModerateLikelihood,
kHighLikelihood
};
// |sample_rate_hz| is the rate of the audio passed to ProcessCaptureAudio();
// the detector operates on 10 ms frames at that rate.
VoiceDetection(int sample_rate_hz, Likelihood likelihood);
~VoiceDetection();
// Non-copyable: owns the underlying VAD state.
VoiceDetection(VoiceDetection&) = delete;
VoiceDetection& operator=(VoiceDetection&) = delete;
// Returns true if voice is detected in the current frame.
bool ProcessCaptureAudio(AudioBuffer* audio);
// Likelihood chosen at construction time; fixed for the object's lifetime.
Likelihood likelihood() const { return likelihood_; }
private:
// Forward-declared RAII wrapper around the C VAD handle (defined in the
// .cc file to keep the C API out of this header).
class Vad;
int sample_rate_hz_;
size_t frame_size_samples_;
Likelihood likelihood_;
std::unique_ptr<Vad> vad_;
};
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_

View File

@ -1,168 +0,0 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "modules/audio_processing/voice_detection_impl.h"
#include "api/audio/audio_frame.h"
#include "common_audio/vad/include/webrtc_vad.h"
#include "modules/audio_processing/audio_buffer.h"
#include "rtc_base/checks.h"
#include "rtc_base/constructor_magic.h"
namespace webrtc {
class VoiceDetectionImpl::Vad {
public:
Vad() {
state_ = WebRtcVad_Create();
RTC_CHECK(state_);
int error = WebRtcVad_Init(state_);
RTC_DCHECK_EQ(0, error);
}
~Vad() { WebRtcVad_Free(state_); }
VadInst* state() { return state_; }
private:
VadInst* state_ = nullptr;
RTC_DISALLOW_COPY_AND_ASSIGN(Vad);
};
VoiceDetectionImpl::VoiceDetectionImpl(rtc::CriticalSection* crit)
: crit_(crit) {
RTC_DCHECK(crit);
}
VoiceDetectionImpl::~VoiceDetectionImpl() {}
void VoiceDetectionImpl::Initialize(int sample_rate_hz) {
rtc::CritScope cs(crit_);
sample_rate_hz_ = sample_rate_hz;
std::unique_ptr<Vad> new_vad;
if (enabled_) {
new_vad.reset(new Vad());
}
vad_.swap(new_vad);
using_external_vad_ = false;
frame_size_samples_ =
static_cast<size_t>(frame_size_ms_ * sample_rate_hz_) / 1000;
set_likelihood(likelihood_);
}
// Runs the WebRTC VAD on the lowest split band (0-8 kHz) of |audio|, caches
// the decision in |stream_has_voice_|, and returns it. Multi-channel input is
// downmixed to mono by averaging. Must only be called while enabled.
bool VoiceDetectionImpl::ProcessCaptureAudio(AudioBuffer* audio) {
  rtc::CritScope cs(crit_);
  RTC_DCHECK(enabled_);
  RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength,
                audio->num_frames_per_band());
  std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
  rtc::ArrayView<const int16_t> mixed_low_pass(mixed_low_pass_data.data(),
                                               audio->num_frames_per_band());
  if (audio->num_channels() == 1) {
    // Mono: convert the low band straight to 16-bit samples.
    FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz],
                  audio->num_frames_per_band(), mixed_low_pass_data.data());
  } else {
    // Multi-channel: downmix by averaging all channels sample by sample.
    // The sum is accumulated in 32 bits to avoid intermediate overflow.
    const int num_channels = static_cast<int>(audio->num_channels());
    for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
      int32_t value =
          FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[0][i]);
      for (int j = 1; j < num_channels; ++j) {
        value += FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[j][i]);
      }
      mixed_low_pass_data[i] = value / num_channels;
    }
  }
  // WebRtcVad_Process returns 1 for voice, 0 for no voice, -1 on error.
  int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
                                  mixed_low_pass.data(), frame_size_samples_);
  if (vad_ret == 0) {
    stream_has_voice_ = false;
    return false;
  } else if (vad_ret == 1) {
    stream_has_voice_ = true;
  } else {
    RTC_NOTREACHED();
  }
  return stream_has_voice_;
}
// Enables or disables the detector; re-initializes only on an actual state
// transition. Always reports AudioProcessing::kNoError.
int VoiceDetectionImpl::Enable(bool enable) {
  rtc::CritScope cs(crit_);
  const bool state_changed = (enabled_ != enable);
  if (state_changed) {
    enabled_ = enable;
    Initialize(sample_rate_hz_);
  }
  return AudioProcessing::kNoError;
}
// Returns whether the detector is currently enabled.
bool VoiceDetectionImpl::is_enabled() const {
  rtc::CritScope cs(crit_);
  return enabled_;
}
// Injects an externally computed VAD decision, bypassing the internal VAD.
// Marks the detector as driven by an external VAD from now on.
int VoiceDetectionImpl::set_stream_has_voice(bool has_voice) {
  rtc::CritScope cs(crit_);
  using_external_vad_ = true;
  stream_has_voice_ = has_voice;
  return AudioProcessing::kNoError;
}
// Returns the most recent VAD decision, whether produced internally by
// ProcessCaptureAudio() or injected via set_stream_has_voice().
bool VoiceDetectionImpl::stream_has_voice() const {
  rtc::CritScope cs(crit_);
  // TODO(ajm): enable this assertion?
  // RTC_DCHECK(using_external_vad_ || is_component_enabled());
  return stream_has_voice_;
}
// Stores |likelihood| and, when enabled, maps it onto the WebRtcVad
// aggressiveness mode (a lower required likelihood of voice corresponds to a
// more aggressive VAD mode). Always reports AudioProcessing::kNoError.
int VoiceDetectionImpl::set_likelihood(VoiceDetection::Likelihood likelihood) {
  rtc::CritScope cs(crit_);
  likelihood_ = likelihood;
  if (!enabled_) {
    return AudioProcessing::kNoError;
  }
  int mode;
  switch (likelihood) {
    case VoiceDetection::kVeryLowLikelihood:
      mode = 3;
      break;
    case VoiceDetection::kLowLikelihood:
      mode = 2;
      break;
    case VoiceDetection::kModerateLikelihood:
      mode = 1;
      break;
    case VoiceDetection::kHighLikelihood:
      mode = 0;
      break;
    default:
      RTC_NOTREACHED();
      mode = 2;
      break;
  }
  const int error = WebRtcVad_set_mode(vad_->state(), mode);
  RTC_DCHECK_EQ(0, error);
  return AudioProcessing::kNoError;
}
// Returns the currently configured detection likelihood.
VoiceDetection::Likelihood VoiceDetectionImpl::likelihood() const {
  rtc::CritScope cs(crit_);
  return likelihood_;
}
// Sets the VAD frame size in milliseconds and re-initializes. Only 10 ms is
// currently supported (enforced by the DCHECK below).
int VoiceDetectionImpl::set_frame_size_ms(int size) {
  rtc::CritScope cs(crit_);
  RTC_DCHECK_EQ(10, size);  // TODO(ajm): remove when supported.
  frame_size_ms_ = size;
  Initialize(sample_rate_hz_);
  return AudioProcessing::kNoError;
}
// Returns the configured VAD frame size in milliseconds.
int VoiceDetectionImpl::frame_size_ms() const {
  rtc::CritScope cs(crit_);
  return frame_size_ms_;
}
} // namespace webrtc

View File

@ -1,69 +0,0 @@
/*
* Copyright (c) 2012 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_IMPL_H_
#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_IMPL_H_
#include <stddef.h>
#include <memory>
#include "modules/audio_processing/include/audio_processing.h"
#include "rtc_base/constructor_magic.h"
#include "rtc_base/critical_section.h"
#include "rtc_base/thread_annotations.h"
namespace webrtc {
class AudioBuffer;
// Implementation of the VoiceDetection submodule. Wraps the WebRtcVad C API
// and caches a per-frame voice decision. All state is guarded by the
// externally owned |crit_| lock.
class VoiceDetectionImpl : public VoiceDetection {
 public:
  explicit VoiceDetectionImpl(rtc::CriticalSection* crit);
  ~VoiceDetectionImpl() override;

  // TODO(peah): Fold into ctor, once public API is removed.
  void Initialize(int sample_rate_hz);

  // Runs the VAD on the low band of |audio| and returns the voice decision.
  bool ProcessCaptureAudio(AudioBuffer* audio);

  // True once set_stream_has_voice() has injected an external VAD decision
  // (reset on Initialize()).
  bool using_external_vad() const {
    rtc::CritScope cs(crit_);
    return using_external_vad_;
  }

  // VoiceDetection implementation.
  int Enable(bool enable) override;
  bool is_enabled() const override;
  int set_stream_has_voice(bool has_voice) override;
  bool stream_has_voice() const override;
  int set_likelihood(Likelihood likelihood) override;
  Likelihood likelihood() const override;
  int set_frame_size_ms(int size) override;
  int frame_size_ms() const override;

 private:
  class Vad;  // RAII wrapper for the WebRtcVad handle; defined in the .cc.
  rtc::CriticalSection* const crit_;
  bool enabled_ RTC_GUARDED_BY(crit_) = false;
  bool stream_has_voice_ RTC_GUARDED_BY(crit_) = false;
  bool using_external_vad_ RTC_GUARDED_BY(crit_) = false;
  Likelihood likelihood_ RTC_GUARDED_BY(crit_) = kLowLikelihood;
  int frame_size_ms_ RTC_GUARDED_BY(crit_) = 10;
  size_t frame_size_samples_ RTC_GUARDED_BY(crit_) = 0;
  int sample_rate_hz_ RTC_GUARDED_BY(crit_) = 0;
  std::unique_ptr<Vad> vad_ RTC_GUARDED_BY(crit_);
  RTC_DISALLOW_IMPLICIT_CONSTRUCTORS(VoiceDetectionImpl);
};
} // namespace webrtc
#endif // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_IMPL_H_

View File

@ -13,7 +13,7 @@
#include "modules/audio_processing/audio_buffer.h"
#include "modules/audio_processing/test/audio_buffer_tools.h"
#include "modules/audio_processing/test/bitexactness_tools.h"
#include "modules/audio_processing/voice_detection_impl.h"
#include "modules/audio_processing/voice_detection.h"
#include "test/gtest.h"
namespace webrtc {
@ -22,27 +22,24 @@ namespace {
const int kNumFramesToProcess = 1000;
// Process one frame of data and produce the output.
void ProcessOneFrame(int sample_rate_hz,
bool ProcessOneFrame(int sample_rate_hz,
AudioBuffer* audio_buffer,
VoiceDetectionImpl* voice_detection) {
VoiceDetection* voice_detection) {
if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) {
audio_buffer->SplitIntoFrequencyBands();
}
voice_detection->ProcessCaptureAudio(audio_buffer);
return voice_detection->ProcessCaptureAudio(audio_buffer);
}
// Processes a specified amount of frames, verifies the results and reports
// any errors.
void RunBitexactnessTest(int sample_rate_hz,
size_t num_channels,
int frame_size_ms_reference,
bool stream_has_voice_reference,
VoiceDetection::Likelihood likelihood_reference) {
rtc::CriticalSection crit_capture;
VoiceDetectionImpl voice_detection(&crit_capture);
voice_detection.Initialize(sample_rate_hz > 16000 ? 16000 : sample_rate_hz);
voice_detection.Enable(true);
bool stream_has_voice_reference) {
int sample_rate_to_use = std::min(sample_rate_hz, 16000);
VoiceDetection voice_detection(sample_rate_to_use,
VoiceDetection::kLowLikelihood);
int samples_per_channel = rtc::CheckedDivExact(sample_rate_hz, 100);
const StreamConfig capture_config(sample_rate_hz, num_channels, false);
@ -53,6 +50,7 @@ void RunBitexactnessTest(int sample_rate_hz,
test::InputAudioFile capture_file(
test::GetApmCaptureTestVectorFileName(sample_rate_hz));
std::vector<float> capture_input(samples_per_channel * num_channels);
bool stream_has_voice = false;
for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) {
ReadFloatSamplesFromStereoFile(samples_per_channel, num_channels,
&capture_file, capture_input);
@ -60,64 +58,47 @@ void RunBitexactnessTest(int sample_rate_hz,
test::CopyVectorToAudioBuffer(capture_config, capture_input,
&capture_buffer);
stream_has_voice =
ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection);
}
int frame_size_ms = voice_detection.frame_size_ms();
bool stream_has_voice = voice_detection.stream_has_voice();
VoiceDetection::Likelihood likelihood = voice_detection.likelihood();
// Compare the outputs to the references.
EXPECT_EQ(frame_size_ms_reference, frame_size_ms);
EXPECT_EQ(stream_has_voice_reference, stream_has_voice);
EXPECT_EQ(likelihood_reference, likelihood);
}
const int kFrameSizeMsReference = 10;
const bool kStreamHasVoiceReference = true;
const VoiceDetection::Likelihood kLikelihoodReference =
VoiceDetection::kLowLikelihood;
} // namespace
TEST(VoiceDetectionBitExactnessTest, Mono8kHz) {
RunBitexactnessTest(8000, 1, kFrameSizeMsReference, kStreamHasVoiceReference,
kLikelihoodReference);
RunBitexactnessTest(8000, 1, kStreamHasVoiceReference);
}
TEST(VoiceDetectionBitExactnessTest, Mono16kHz) {
RunBitexactnessTest(16000, 1, kFrameSizeMsReference, kStreamHasVoiceReference,
kLikelihoodReference);
RunBitexactnessTest(16000, 1, kStreamHasVoiceReference);
}
TEST(VoiceDetectionBitExactnessTest, Mono32kHz) {
RunBitexactnessTest(32000, 1, kFrameSizeMsReference, kStreamHasVoiceReference,
kLikelihoodReference);
RunBitexactnessTest(32000, 1, kStreamHasVoiceReference);
}
TEST(VoiceDetectionBitExactnessTest, Mono48kHz) {
RunBitexactnessTest(48000, 1, kFrameSizeMsReference, kStreamHasVoiceReference,
kLikelihoodReference);
RunBitexactnessTest(48000, 1, kStreamHasVoiceReference);
}
TEST(VoiceDetectionBitExactnessTest, Stereo8kHz) {
RunBitexactnessTest(8000, 2, kFrameSizeMsReference, kStreamHasVoiceReference,
kLikelihoodReference);
RunBitexactnessTest(8000, 2, kStreamHasVoiceReference);
}
TEST(VoiceDetectionBitExactnessTest, Stereo16kHz) {
RunBitexactnessTest(16000, 2, kFrameSizeMsReference, kStreamHasVoiceReference,
kLikelihoodReference);
RunBitexactnessTest(16000, 2, kStreamHasVoiceReference);
}
TEST(VoiceDetectionBitExactnessTest, Stereo32kHz) {
RunBitexactnessTest(32000, 2, kFrameSizeMsReference, kStreamHasVoiceReference,
kLikelihoodReference);
RunBitexactnessTest(32000, 2, kStreamHasVoiceReference);
}
TEST(VoiceDetectionBitExactnessTest, Stereo48kHz) {
RunBitexactnessTest(48000, 2, kFrameSizeMsReference, kStreamHasVoiceReference,
kLikelihoodReference);
RunBitexactnessTest(48000, 2, kStreamHasVoiceReference);
}
} // namespace webrtc

View File

@ -1 +1 @@
e540fa8940b41d0cda26cdef937be3a455a04be7
e9569d846d21e027bfdcae76a40146bc10d49d54

View File

@ -1 +1 @@
2811f534082857ac9b9447a3e53028ef11851052
53dd63154cc2694a3425596d9a8300fa2c66215d

View File

@ -1 +1 @@
cc82c345f1e7ef17b12c2da41a0a9f73b09ca8f6
2b31852bbce2b0b19ee36c47b18352e035cb08c5

View File

@ -145,7 +145,6 @@ std::unique_ptr<AudioProcessing> CreateApm(test::FuzzDataHelper* fuzz_data,
apm->ApplyConfig(apm_config);
apm->level_estimator()->Enable(use_le);
apm->voice_detection()->Enable(use_vad);
return apm;
}