Revert "Reland "Remove unused APM voice activity detection sub-module""

This reverts commit 54d1344d985b00d4d1580dd18057d4618c11ad1f. Reason for revert: Breaks chromium roll, see https://ci.chromium.org/ui/p/chromium/builders/try/linux_chromium_tsan_rel_ng/1080583/overview https://chromium-review.googlesource.com/c/chromium/src/+/3461512 Original change's description: > Reland "Remove unused APM voice activity detection sub-module" > > This reverts commit a751f167c68343f76528436defdbc61600a8d7b3. > > Reason for revert: dependency in a downstream project removed > > Original change's description: > > Revert "Remove unused APM voice activity detection sub-module" > > > > This reverts commit b4e06d032e6f82a65c52ed0c5364ae9e7c0a0215. > > > > Reason for revert: breaking downstream projects > > > > Original change's description: > > > Remove unused APM voice activity detection sub-module > > > > > > API changes: > > > - webrtc::AudioProcessing::Config::VoiceDetection removed > > > - webrtc::AudioProcessingStats::voice_detected deprecated > > > - cricket::AudioOptions::typing_detection deprecated > > > - webrtc::StatsReport::StatsValueName:: > > > kStatsValueNameTypingNoiseState deprecated > > > > > > PSA: https://groups.google.com/g/discuss-webrtc/c/7X6uwmJarE0 > > > > > > Bug: webrtc:11226,webrtc:11292 > > > Change-Id: I8d008b56708cf62961b9857ec052b59fda3b41bf > > > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/250666 > > > Reviewed-by: Harald Alvestrand <hta@webrtc.org> > > > Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org> > > > Reviewed-by: Sam Zackrisson <saza@webrtc.org> > > > Reviewed-by: Björn Terelius <terelius@webrtc.org> > > > Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> > > > Cr-Commit-Position: refs/heads/main@{#35975} > > > > TBR=gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com > > > > Change-Id: Iee01fdb874b4e0331277f3ffe60dacaabc3859a2 > > No-Presubmit: true > > No-Tree-Checks: true > > No-Try: true > > Bug: 
webrtc:11226,webrtc:11292 > > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251600 > > Reviewed-by: Harald Alvestrand <hta@webrtc.org> > > Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org> > > Commit-Queue: Mirko Bonadei <mbonadei@webrtc.org> > > Cr-Commit-Position: refs/heads/main@{#35977} > > # Not skipping CQ checks because this is a reland. > > Bug: webrtc:11226,webrtc:11292 > Change-Id: I2fcbc5fdade16bfe6a0f0a02841a33a598d4f2ad > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251660 > Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> > Reviewed-by: Harald Alvestrand <hta@webrtc.org> > Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> > Cr-Commit-Position: refs/heads/main@{#35984} TBR=mbonadei@webrtc.org,gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com Change-Id: Ib308a3af2dcce85a0074ef5a4680ccec3f82712f No-Presubmit: true No-Tree-Checks: true No-Try: true Bug: webrtc:11226,webrtc:11292 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251688 Reviewed-by: Henrik Boström <hbos@webrtc.org> Bot-Commit: rubber-stamper@appspot.gserviceaccount.com <rubber-stamper@appspot.gserviceaccount.com> Auto-Submit: Henrik Boström <hbos@webrtc.org> Reviewed-by: Harald Alvestrand <hta@webrtc.org> Commit-Queue: Harald Alvestrand <hta@webrtc.org> Cr-Commit-Position: refs/heads/main@{#35990}
2022-02-14 12:02:45 +00:00
parent eb6c6fcf27
commit 09aaf6f7bc
23 changed files with 483 additions and 18 deletions
--- a/api/audio_options.h
+++ b/api/audio_options.h
@ -60,8 +60,6 @@ struct RTC_EXPORT AudioOptions {
  absl::optional<int> audio_jitter_buffer_min_delay_ms;
  // Audio receiver jitter buffer (NetEq) should handle retransmitted packets.
  absl::optional<bool> audio_jitter_buffer_enable_rtx_handling;
  // Deprecated.
  // TODO(bugs.webrtc.org/11226): Remove.
  // Audio processing to detect typing.
  absl::optional<bool> typing_detection;
  // TODO(bugs.webrtc.org/11539): Deprecated, replaced by
--- a/api/stats_types.cc
+++ b/api/stats_types.cc
@ -648,7 +648,6 @@ const char* StatsReport::Value::display_name() const {
      return "googTrackId";
    case kStatsValueNameTimingFrameInfo:
      return "googTimingFrameInfo";
    // TODO(bugs.webrtc.org/11226): Remove.
    case kStatsValueNameTypingNoiseState:
      return "googTypingNoiseState";
    case kStatsValueNameWritable:
--- a/api/stats_types.h
+++ b/api/stats_types.h
@ -235,7 +235,6 @@ class RTC_EXPORT StatsReport {
    kStatsValueNameTrackId,
    kStatsValueNameTransmitBitrate,
    kStatsValueNameTransportType,
    // TODO(bugs.webrtc.org/11226): Remove.
    kStatsValueNameTypingNoiseState,
    kStatsValueNameWritable,
    kStatsValueNameAudioDeviceUnderrunCounter,
--- a/audio/audio_transport_impl.cc
+++ b/audio/audio_transport_impl.cc
@ -165,6 +165,24 @@ int32_t AudioTransportImpl::RecordedDataIsAvailable(
                      audio_frame.get());
  audio_frame->set_absolute_capture_timestamp_ms(estimated_capture_time_ns /
                                                 1000000);
  // Typing detection (utilizes the APM/VAD decision). We let the VAD determine
  // if we're using this feature or not.
  // TODO(solenberg): GetConfig() takes a lock. Work around that.
  bool typing_detected = false;
  if (audio_processing_ &&
      audio_processing_->GetConfig().voice_detection.enabled) {
    if (audio_frame->vad_activity_ != AudioFrame::kVadUnknown) {
      bool vad_active = audio_frame->vad_activity_ == AudioFrame::kVadActive;
      typing_detected = typing_detection_.Process(key_pressed, vad_active);
    }
  }
  // Copy frame and push to each sending stream. The copy is required since an
  // encoding task will be posted internally to each stream.
  {
    MutexLock lock(&capture_lock_);
    typing_noise_detected_ = typing_detected;
  }
  RTC_DCHECK_GT(audio_frame->samples_per_channel_, 0);
  if (async_audio_processing_)
@ -272,4 +290,8 @@ void AudioTransportImpl::SetStereoChannelSwapping(bool enable) {
  swap_stereo_channels_ = enable;
 }
 // Returns the typing-noise flag most recently stored by
 // RecordedDataIsAvailable(). The flag is read while holding `capture_lock_`.
 bool AudioTransportImpl::typing_noise_detected() const {
  MutexLock lock(&capture_lock_);
  const bool detected = typing_noise_detected_;
  return detected;
 }
 }  // namespace webrtc
--- a/audio/audio_transport_impl.h
+++ b/audio/audio_transport_impl.h
@ -86,9 +86,7 @@ class AudioTransportImpl : public AudioTransport {
                          int send_sample_rate_hz,
                          size_t send_num_channels);
  void SetStereoChannelSwapping(bool enable);
-  // Deprecated.
+  bool typing_noise_detected() const;
  // TODO(bugs.webrtc.org/11226): Remove.
  bool typing_noise_detected() const { return false; }
 private:
  void SendProcessedData(std::unique_ptr<AudioFrame> audio_frame);
@ -105,6 +103,7 @@ class AudioTransportImpl : public AudioTransport {
  std::vector<AudioSender*> audio_senders_ RTC_GUARDED_BY(capture_lock_);
  int send_sample_rate_hz_ RTC_GUARDED_BY(capture_lock_) = 8000;
  size_t send_num_channels_ RTC_GUARDED_BY(capture_lock_) = 1;
  bool typing_noise_detected_ RTC_GUARDED_BY(capture_lock_) = false;
  bool swap_stereo_channels_ RTC_GUARDED_BY(capture_lock_) = false;
  PushResampler<int16_t> capture_resampler_;
  TypingDetection typing_detection_;
--- a/media/engine/webrtc_voice_engine.cc
+++ b/media/engine/webrtc_voice_engine.cc
@ -609,7 +609,9 @@ bool WebRtcVoiceEngine::ApplyOptions(const AudioOptions& options_in) {
  }
  if (options.typing_detection) {
-    RTC_LOG(LS_WARNING) << "Typing detection is requested, but unsupported.";
+    RTC_LOG(LS_INFO) << "Typing detection is enabled? "
                     << *options.typing_detection;
    apm_config.voice_detection.enabled = *options.typing_detection;
  }
  ap->ApplyConfig(apm_config);
--- a/media/engine/webrtc_voice_engine_unittest.cc
+++ b/media/engine/webrtc_voice_engine_unittest.cc
@ -221,6 +221,11 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam<bool> {
      // Default Options.
      VerifyEchoCancellationSettings(/*enabled=*/true);
      EXPECT_TRUE(IsHighPassFilterEnabled());
 #if defined(WEBRTC_ANDROID)
      EXPECT_FALSE(IsTypingDetectionEnabled());
 #else
      EXPECT_TRUE(IsTypingDetectionEnabled());
 #endif
      EXPECT_TRUE(apm_config_.noise_suppression.enabled);
      EXPECT_EQ(apm_config_.noise_suppression.level, kDefaultNsLevel);
      VerifyGainControlEnabledCorrectly();
@ -788,6 +793,10 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam<bool> {
    return apm_config_.high_pass_filter.enabled;
  }
  // Reports whether voice detection is enabled in `apm_config_`; the voice
  // engine drives that flag from the `typing_detection` audio option.
  bool IsTypingDetectionEnabled() {
    const auto& voice_detection = apm_config_.voice_detection;
    return voice_detection.enabled;
  }
 protected:
  const bool use_null_apm_;
  std::unique_ptr<webrtc::TaskQueueFactory> task_queue_factory_;
@ -2980,10 +2989,40 @@ TEST_P(WebRtcVoiceEngineTestFake, SetAudioOptions) {
  if (!use_null_apm_) {
    VerifyEchoCancellationSettings(/*enabled=*/true);
    EXPECT_TRUE(IsHighPassFilterEnabled());
 #if defined(WEBRTC_ANDROID)
    EXPECT_FALSE(IsTypingDetectionEnabled());
 #else
    EXPECT_TRUE(IsTypingDetectionEnabled());
 #endif
  }
  EXPECT_EQ(200u, GetRecvStreamConfig(kSsrcY).jitter_buffer_max_packets);
  EXPECT_FALSE(GetRecvStreamConfig(kSsrcY).jitter_buffer_fast_accelerate);
  // Turn typing detection off.
  send_parameters_.options.typing_detection = false;
  SetSendParameters(send_parameters_);
  if (!use_null_apm_) {
    EXPECT_FALSE(IsTypingDetectionEnabled());
  }
  // Leave typing detection unchanged, but non-default.
  send_parameters_.options.typing_detection = absl::nullopt;
  SetSendParameters(send_parameters_);
  if (!use_null_apm_) {
    EXPECT_FALSE(IsTypingDetectionEnabled());
  }
  // Turn typing detection on.
  send_parameters_.options.typing_detection = true;
  SetSendParameters(send_parameters_);
  if (!use_null_apm_) {
 #if defined(WEBRTC_ANDROID)
    EXPECT_FALSE(IsTypingDetectionEnabled());
 #else
    EXPECT_TRUE(IsTypingDetectionEnabled());
 #endif
  }
  // Turn echo cancellation off
  send_parameters_.options.echo_cancellation = false;
  SetSendParameters(send_parameters_);
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn
@ -168,6 +168,7 @@ rtc_library("audio_processing") {
    ":high_pass_filter",
    ":optionally_built_submodule_creators",
    ":rms_level",
    ":voice_detection",
    "../../api:array_view",
    "../../api:function_view",
    "../../api/audio:aec3_config",
@ -217,6 +218,20 @@ rtc_library("audio_processing") {
  }
 }
 # Voice detection sub-module (voice_detection.cc/.h). It is a dependency of
 # the ":audio_processing" target.
 rtc_library("voice_detection") {
  sources = [
    "voice_detection.cc",
    "voice_detection.h",
  ]
  deps = [
    ":api",
    ":audio_buffer",
    "../../api/audio:audio_frame_api",
    "../../common_audio:common_audio_c",
    "../../rtc_base:checks",
  ]
 }
 rtc_library("residual_echo_detector") {
  poisonous = [ "default_echo_detector" ]
  configs += [ ":apm_debug_dump" ]
@ -364,6 +379,7 @@ if (rtc_include_tests) {
        ":gain_controller2",
        ":high_pass_filter",
        ":mocks",
        ":voice_detection",
        "../../api:array_view",
        "../../api:scoped_refptr",
        "../../api/audio:aec3_config",
@ -458,6 +474,7 @@ if (rtc_include_tests) {
          "test/echo_canceller_test_tools_unittest.cc",
          "test/echo_control_mock.h",
          "test/test_utils.h",
          "voice_detection_unittest.cc",
        ]
      }
    }
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc
@ -141,6 +141,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
    bool gain_controller2_enabled,
    bool gain_adjustment_enabled,
    bool echo_controller_enabled,
    bool voice_detector_enabled,
    bool transient_suppressor_enabled) {
  bool changed = false;
  changed |= (high_pass_filter_enabled != high_pass_filter_enabled_);
@ -152,6 +153,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
  changed |= (gain_controller2_enabled != gain_controller2_enabled_);
  changed |= (gain_adjustment_enabled != gain_adjustment_enabled_);
  changed |= (echo_controller_enabled != echo_controller_enabled_);
  changed |= (voice_detector_enabled != voice_detector_enabled_);
  changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
  if (changed) {
    high_pass_filter_enabled_ = high_pass_filter_enabled;
@ -161,6 +163,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
    gain_controller2_enabled_ = gain_controller2_enabled;
    gain_adjustment_enabled_ = gain_adjustment_enabled;
    echo_controller_enabled_ = echo_controller_enabled;
    voice_detector_enabled_ = voice_detector_enabled;
    transient_suppressor_enabled_ = transient_suppressor_enabled;
  }
@ -171,7 +174,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
 bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandSubModulesActive()
    const {
-  return CaptureMultiBandProcessingPresent();
+  return CaptureMultiBandProcessingPresent() || voice_detector_enabled_;
 }
 bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandProcessingPresent()
@ -368,6 +371,7 @@ void AudioProcessingImpl::InitializeLocked() {
  InitializeGainController1();
  InitializeTransientSuppressor();
  InitializeHighPassFilter(true);
  InitializeVoiceDetector();
  InitializeResidualEchoDetector();
  InitializeEchoController();
  InitializeGainController2(/*config_has_changed=*/true);
@ -502,6 +506,9 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
  const bool agc2_config_changed =
      config_.gain_controller2 != config.gain_controller2;
  const bool voice_detection_config_changed =
      config_.voice_detection.enabled != config.voice_detection.enabled;
  const bool ns_config_changed =
      config_.noise_suppression.enabled != config.noise_suppression.enabled ||
      config_.noise_suppression.level != config.noise_suppression.level;
@ -550,6 +557,10 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
    InitializeCaptureLevelsAdjuster();
  }
  if (voice_detection_config_changed) {
    InitializeVoiceDetector();
  }
  // Reinitialization must happen after all submodule configuration to avoid
  // additional reinitializations on the next capture / render processing call.
  if (pipeline_config_changed) {
@ -1204,6 +1215,13 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
    }
  }
  if (config_.voice_detection.enabled) {
    capture_.stats.voice_detected =
        submodules_.voice_detector->ProcessCaptureAudio(capture_buffer);
  } else {
    capture_.stats.voice_detected = absl::nullopt;
  }
  if (submodules_.agc_manager) {
    submodules_.agc_manager->Process(capture_buffer);
@ -1664,7 +1682,7 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() {
      !!submodules_.gain_controller2,
      config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled,
      capture_nonlocked_.echo_controller_enabled,
-      !!submodules_.transient_suppressor);
+      config_.voice_detection.enabled, !!submodules_.transient_suppressor);
 }
 void AudioProcessingImpl::InitializeTransientSuppressor() {
@ -1714,6 +1732,14 @@ void AudioProcessingImpl::InitializeHighPassFilter(bool forced_reset) {
  }
 }
 // (Re)creates or destroys the voice detector sub-module so that it matches
 // `config_.voice_detection.enabled`.
 void AudioProcessingImpl::InitializeVoiceDetector() {
  if (!config_.voice_detection.enabled) {
    // Disabled: drop any previously created detector.
    submodules_.voice_detector.reset();
    return;
  }
  // Enabled: always build a fresh detector at the current split-band rate.
  submodules_.voice_detector = std::make_unique<VoiceDetection>(
      proc_split_sample_rate_hz(), VoiceDetection::kVeryLowLikelihood);
 }
 void AudioProcessingImpl::InitializeEchoController() {
  bool use_echo_controller =
      echo_control_factory_ ||
--- a/modules/audio_processing/audio_processing_impl.h
+++ b/modules/audio_processing/audio_processing_impl.h
@ -39,6 +39,7 @@
 #include "modules/audio_processing/render_queue_item_verifier.h"
 #include "modules/audio_processing/rms_level.h"
 #include "modules/audio_processing/transient/transient_suppressor.h"
 #include "modules/audio_processing/voice_detection.h"
 #include "rtc_base/gtest_prod_util.h"
 #include "rtc_base/ignore_wundef.h"
 #include "rtc_base/swap_queue.h"
@ -207,6 +208,7 @@ class AudioProcessingImpl : public AudioProcessing {
                bool gain_controller2_enabled,
                bool gain_adjustment_enabled,
                bool echo_controller_enabled,
                bool voice_detector_enabled,
                bool transient_suppressor_enabled);
    bool CaptureMultiBandSubModulesActive() const;
    bool CaptureMultiBandProcessingPresent() const;
@ -229,6 +231,7 @@ class AudioProcessingImpl : public AudioProcessing {
    bool gain_controller2_enabled_ = false;
    bool gain_adjustment_enabled_ = false;
    bool echo_controller_enabled_ = false;
    bool voice_detector_enabled_ = false;
    bool transient_suppressor_enabled_ = false;
    bool first_update_ = true;
  };
@ -264,6 +267,7 @@ class AudioProcessingImpl : public AudioProcessing {
  // already acquired.
  void InitializeHighPassFilter(bool forced_reset)
      RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
  void InitializeVoiceDetector() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
  void InitializeGainController1() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
  void InitializeTransientSuppressor()
      RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
@ -396,6 +400,7 @@ class AudioProcessingImpl : public AudioProcessing {
    std::unique_ptr<EchoControlMobileImpl> echo_control_mobile;
    std::unique_ptr<NoiseSuppressor> noise_suppressor;
    std::unique_ptr<TransientSuppressor> transient_suppressor;
    std::unique_ptr<VoiceDetection> voice_detector;
    std::unique_ptr<CaptureLevelsAdjuster> capture_levels_adjuster;
  } submodules_;
--- a/modules/audio_processing/audio_processing_impl_locking_unittest.cc
+++ b/modules/audio_processing/audio_processing_impl_locking_unittest.cc
@ -483,6 +483,7 @@ AudioProcessing::Config GetApmTestConfig(AecType aec_type) {
  apm_config.gain_controller1.mode =
      AudioProcessing::Config::GainController1::kAdaptiveDigital;
  apm_config.noise_suppression.enabled = true;
  apm_config.voice_detection.enabled = true;
  return apm_config;
 }
--- a/modules/audio_processing/audio_processing_performance_unittest.cc
+++ b/modules/audio_processing/audio_processing_performance_unittest.cc
@ -441,6 +441,7 @@ class CallSimulator : public ::testing::TestWithParam<SimulationConfig> {
      apm_config.gain_controller1.enabled = true;
      apm_config.gain_controller1.mode =
          AudioProcessing::Config::GainController1::kAdaptiveDigital;
      apm_config.voice_detection.enabled = true;
      apm->ApplyConfig(apm_config);
    };
@ -452,6 +453,7 @@ class CallSimulator : public ::testing::TestWithParam<SimulationConfig> {
      apm_config.noise_suppression.enabled = true;
      apm_config.gain_controller1.mode =
          AudioProcessing::Config::GainController1::kAdaptiveDigital;
      apm_config.voice_detection.enabled = true;
      apm->ApplyConfig(apm_config);
    };
@ -462,6 +464,7 @@ class CallSimulator : public ::testing::TestWithParam<SimulationConfig> {
      apm_config.echo_canceller.enabled = false;
      apm_config.gain_controller1.enabled = false;
      apm_config.noise_suppression.enabled = false;
      apm_config.voice_detection.enabled = false;
      apm->ApplyConfig(apm_config);
    };
--- a/modules/audio_processing/audio_processing_unittest.cc
+++ b/modules/audio_processing/audio_processing_unittest.cc
@ -190,6 +190,7 @@ void EnableAllAPComponents(AudioProcessing* ap) {
  apm_config.noise_suppression.enabled = true;
  apm_config.high_pass_filter.enabled = true;
  apm_config.voice_detection.enabled = true;
  apm_config.pipeline.maximum_internal_processing_rate = 48000;
  ap->ApplyConfig(apm_config);
 }
@ -1225,6 +1226,7 @@ TEST_F(ApmTest, AllProcessingDisabledByDefault) {
  EXPECT_FALSE(config.high_pass_filter.enabled);
  EXPECT_FALSE(config.gain_controller1.enabled);
  EXPECT_FALSE(config.noise_suppression.enabled);
  EXPECT_FALSE(config.voice_detection.enabled);
 }
 TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) {
@ -1365,6 +1367,48 @@ TEST_F(ApmTest, SplittingFilter) {
  EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
  apm_->ApplyConfig(apm_config);
  // 3. Only GetStatistics-reporting VAD is enabled...
  SetFrameTo(&frame_, 1000);
  frame_copy.CopyFrom(frame_);
  apm_config.voice_detection.enabled = true;
  apm_->ApplyConfig(apm_config);
  EXPECT_EQ(apm_->kNoError,
            apm_->ProcessStream(
                frame_.data.data(),
                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
                frame_.data.data()));
  EXPECT_EQ(apm_->kNoError,
            apm_->ProcessStream(
                frame_.data.data(),
                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
                frame_.data.data()));
  EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
  apm_config.voice_detection.enabled = false;
  apm_->ApplyConfig(apm_config);
  // 4. The VAD is enabled...
  SetFrameTo(&frame_, 1000);
  frame_copy.CopyFrom(frame_);
  apm_config.voice_detection.enabled = true;
  apm_->ApplyConfig(apm_config);
  EXPECT_EQ(apm_->kNoError,
            apm_->ProcessStream(
                frame_.data.data(),
                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
                frame_.data.data()));
  EXPECT_EQ(apm_->kNoError,
            apm_->ProcessStream(
                frame_.data.data(),
                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
                frame_.data.data()));
  EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
  apm_config.voice_detection.enabled = false;
  apm_->ApplyConfig(apm_config);
  // Check the test is valid. We should have distortion from the filter
  // when AEC is enabled (which won't affect the audio).
  apm_config.echo_canceller.enabled = true;
@ -1692,6 +1736,7 @@ TEST_F(ApmTest, Process) {
         static_cast<size_t>(test->num_reverse_channels()), true);
    int frame_count = 0;
    int has_voice_count = 0;
    int analog_level = 127;
    int analog_level_average = 0;
    int max_output_average = 0;
@ -1727,6 +1772,8 @@ TEST_F(ApmTest, Process) {
      analog_level = apm_->recommended_stream_analog_level();
      analog_level_average += analog_level;
      AudioProcessingStats stats = apm_->GetStatistics();
      EXPECT_TRUE(stats.voice_detected);
      has_voice_count += *stats.voice_detected ? 1 : 0;
      size_t frame_size = frame_.samples_per_channel * frame_.num_channels;
      size_t write_count =
@ -1782,23 +1829,33 @@ TEST_F(ApmTest, Process) {
    if (!absl::GetFlag(FLAGS_write_apm_ref_data)) {
      const int kIntNear = 1;
-      // All numbers being consistently higher on N7 compare to the reference
+      // When running the test on a N7 we get a {2, 6} difference of
-      // data.
+      // `has_voice_count` and `max_output_average` is up to 18 higher.
      // All numbers being consistently higher on N7 compare to ref_data.
      // TODO(bjornv): If we start getting more of these offsets on Android we
      // should consider a different approach. Either using one slack for all,
      // or generate a separate android reference.
 #if defined(WEBRTC_ANDROID) || defined(WEBRTC_IOS)
      const int kHasVoiceCountOffset = 3;
      const int kHasVoiceCountNear = 8;
      const int kMaxOutputAverageOffset = 9;
      const int kMaxOutputAverageNear = 26;
 #else
      const int kHasVoiceCountOffset = 0;
      const int kHasVoiceCountNear = kIntNear;
      const int kMaxOutputAverageOffset = 0;
      const int kMaxOutputAverageNear = kIntNear;
 #endif
      EXPECT_NEAR(test->has_voice_count(),
                  has_voice_count - kHasVoiceCountOffset, kHasVoiceCountNear);
      EXPECT_NEAR(test->analog_level_average(), analog_level_average, kIntNear);
      EXPECT_NEAR(test->max_output_average(),
                  max_output_average - kMaxOutputAverageOffset,
                  kMaxOutputAverageNear);
    } else {
      test->set_has_voice_count(has_voice_count);
      test->set_analog_level_average(analog_level_average);
      test->set_max_output_average(max_output_average);
    }
@ -2628,6 +2685,7 @@ rtc::scoped_refptr<AudioProcessing> CreateApm(bool mobile_aec) {
  apm_config.echo_canceller.enabled = true;
  apm_config.echo_canceller.mobile_mode = mobile_aec;
  apm_config.noise_suppression.enabled = false;
  apm_config.voice_detection.enabled = false;
  apm->ApplyConfig(apm_config);
  return apm;
 }
@ -2736,9 +2794,10 @@ TEST(MAYBE_ApmStatistics, AECMEnabledTest) {
  EXPECT_FALSE(stats.echo_return_loss_enhancement.has_value());
 }
-TEST(ApmStatistics, DoNotReportVoiceDetectedStat) {
+TEST(ApmStatistics, ReportHasVoice) {
  ProcessingConfig processing_config = {
      {{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}};
  AudioProcessing::Config config;
  // Set up an audioframe.
  Int16FrameData frame;
@ -2755,14 +2814,37 @@ TEST(ApmStatistics, DoNotReportVoiceDetectedStat) {
      AudioProcessingBuilderForTesting().Create();
  apm->Initialize(processing_config);
-  // No metric should be reported.
+  // If not enabled, no metric should be reported.
  EXPECT_EQ(
      apm->ProcessStream(frame.data.data(),
                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
                         frame.data.data()),
      0);
-  EXPECT_FALSE(apm->GetStatistics().voice_detected.has_value());
+  EXPECT_FALSE(apm->GetStatistics().voice_detected);
  // If enabled, metrics should be reported.
  config.voice_detection.enabled = true;
  apm->ApplyConfig(config);
  EXPECT_EQ(
      apm->ProcessStream(frame.data.data(),
                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
                         frame.data.data()),
      0);
  auto stats = apm->GetStatistics();
  EXPECT_TRUE(stats.voice_detected);
  // If re-disabled, the value is again not reported.
  config.voice_detection.enabled = false;
  apm->ApplyConfig(config);
  EXPECT_EQ(
      apm->ProcessStream(frame.data.data(),
                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
                         frame.data.data()),
      0);
  EXPECT_FALSE(apm->GetStatistics().voice_detected);
 }
 TEST(ApmStatistics, GetStatisticsReportsNoEchoDetectorStatsWhenDisabled) {
--- a/modules/audio_processing/include/audio_processing.cc
+++ b/modules/audio_processing/include/audio_processing.cc
@ -145,6 +145,7 @@ std::string AudioProcessing::Config::ToString() const {
          << NoiseSuppressionLevelToString(noise_suppression.level)
          << " }, transient_suppression: { enabled: "
          << transient_suppression.enabled
          << " }, voice_detection: { enabled: " << voice_detection.enabled
          << " }, gain_controller1: { enabled: " << gain_controller1.enabled
          << ", mode: " << GainController1ModeToString(gain_controller1.mode)
          << ", target_level_dbfs: " << gain_controller1.target_level_dbfs
--- a/modules/audio_processing/include/audio_processing.h
+++ b/modules/audio_processing/include/audio_processing.h
@ -113,6 +113,8 @@ static constexpr int kClippedLevelMin = 70;
 //
 // config.high_pass_filter.enabled = true;
 //
 // config.voice_detection.enabled = true;
 //
 // apm->ApplyConfig(config)
 //
 // apm->noise_reduction()->set_level(kHighSuppression);
@ -230,6 +232,11 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface {
      bool enabled = false;
    } transient_suppression;
    // Enables reporting of `voice_detected` in webrtc::AudioProcessingStats.
    struct VoiceDetection {
      bool enabled = false;
    } voice_detection;
    // Enables automatic gain control (AGC) functionality.
    // The automatic gain control (AGC) component brings the signal to an
    // appropriate range. This is done by applying a digital gain directly and,
--- a/modules/audio_processing/include/audio_processing_statistics.h
+++ b/modules/audio_processing/include/audio_processing_statistics.h
@ -24,8 +24,6 @@ struct RTC_EXPORT AudioProcessingStats {
  AudioProcessingStats(const AudioProcessingStats& other);
  ~AudioProcessingStats();
  // Deprecated.
  // TODO(bugs.webrtc.org/11226): Remove.
  // True if voice is detected in the last capture frame, after processing.
  // It is conservative in flagging audio as speech, with low likelihood of
  // incorrectly flagging a frame as voice.
--- a/modules/audio_processing/test/audio_processing_simulator.cc
+++ b/modules/audio_processing/test/audio_processing_simulator.cc
@ -543,6 +543,10 @@ void AudioProcessingSimulator::ConfigureAudioProcessor() {
    apm_config.high_pass_filter.enabled = *settings_.use_hpf;
  }
  if (settings_.use_vad) {
    apm_config.voice_detection.enabled = *settings_.use_vad;
  }
  if (settings_.use_agc) {
    apm_config.gain_controller1.enabled = *settings_.use_agc;
  }
--- a/modules/audio_processing/test/audio_processing_simulator.h
+++ b/modules/audio_processing/test/audio_processing_simulator.h
@ -105,6 +105,7 @@ struct SimulationSettings {
  absl::optional<bool> use_ns;
  absl::optional<int> use_ts;
  absl::optional<bool> use_analog_agc;
  absl::optional<bool> use_vad;
  absl::optional<bool> use_all;
  absl::optional<bool> analog_agc_disable_digital_adaptive;
  absl::optional<int> agc_mode;
--- a/modules/audio_processing/test/audioproc_float_impl.cc
+++ b/modules/audio_processing/test/audioproc_float_impl.cc
@ -117,6 +117,10 @@ ABSL_FLAG(int,
          analog_agc,
          kParameterNotSpecifiedValue,
          "Activate (1) or deactivate (0) the analog AGC");
 ABSL_FLAG(int,
          vad,
          kParameterNotSpecifiedValue,
          "Activate (1) or deactivate (0) the voice activity detector");
 ABSL_FLAG(bool,
          all_default,
          false,
@ -361,6 +365,7 @@ void SetSettingIfFlagSet(int32_t flag, absl::optional<bool>* parameter) {
 SimulationSettings CreateSettings() {
  SimulationSettings settings;
  if (absl::GetFlag(FLAGS_all_default)) {
    settings.use_vad = true;
    settings.use_ts = true;
    settings.use_analog_agc = true;
    settings.use_ns = true;
@ -412,6 +417,7 @@ SimulationSettings CreateSettings() {
  SetSettingIfSpecified(absl::GetFlag(FLAGS_ts), &settings.use_ts);
  SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc),
                      &settings.use_analog_agc);
  SetSettingIfFlagSet(absl::GetFlag(FLAGS_vad), &settings.use_vad);
  SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc_disable_digital_adaptive),
                      &settings.analog_agc_disable_digital_adaptive);
  SetSettingIfSpecified(absl::GetFlag(FLAGS_agc_mode), &settings.agc_mode);
--- a/modules/audio_processing/voice_detection.cc
+++ b/modules/audio_processing/voice_detection.cc
@ -0,0 +1,92 @@
 /*
 *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "modules/audio_processing/voice_detection.h"
 #include "common_audio/vad/include/webrtc_vad.h"
 #include "modules/audio_processing/audio_buffer.h"
 #include "rtc_base/checks.h"
 namespace webrtc {
 // Owns one `VadInst` handle: the handle is obtained from WebRtcVad_Create()
 // and initialized with WebRtcVad_Init() on construction, and handed to
 // WebRtcVad_Free() on destruction.
 class VoiceDetection::Vad {
 public:
  Vad() : state_(WebRtcVad_Create()) {
    // A null handle cannot be used by any later call, so this is checked with
    // RTC_CHECK rather than RTC_DCHECK.
    RTC_CHECK(state_);
    const int error = WebRtcVad_Init(state_);
    RTC_DCHECK_EQ(0, error);
  }
  ~Vad() { WebRtcVad_Free(state_); }
  // Not copyable: a copy would hand the same handle to WebRtcVad_Free() twice.
  Vad(const Vad&) = delete;
  Vad& operator=(const Vad&) = delete;
  // Returns the wrapped handle; ownership stays with this object.
  VadInst* state() { return state_; }
 private:
  VadInst* state_ = nullptr;
 };
 // Creates a detector for audio sampled at `sample_rate_hz` and configures the
 // wrapped VAD with the mode that corresponds to `likelihood`.
 VoiceDetection::VoiceDetection(int sample_rate_hz, Likelihood likelihood)
     : sample_rate_hz_(sample_rate_hz),
       // One frame holds 1/100 of a second worth of samples.
       frame_size_samples_(static_cast<size_t>(sample_rate_hz_ / 100)),
       likelihood_(likelihood),
       vad_(new Vad()) {
   // The lower the requested likelihood, the higher the VAD mode number.
   const int vad_mode = [likelihood] {
     switch (likelihood) {
       case VoiceDetection::kVeryLowLikelihood:
         return 3;
       case VoiceDetection::kLowLikelihood:
         return 2;
       case VoiceDetection::kModerateLikelihood:
         return 1;
       case VoiceDetection::kHighLikelihood:
         return 0;
     }
     // Value outside the enum: flagged in debug builds, mode 2 otherwise.
     RTC_DCHECK_NOTREACHED();
     return 2;
   }();
   const int set_mode_result = WebRtcVad_set_mode(vad_->state(), vad_mode);
   RTC_DCHECK_EQ(0, set_mode_result);
 }
 // Defined in this file, where `Vad` is a complete type, so that the
 // std::unique_ptr<Vad> member can be destroyed.
 VoiceDetection::~VoiceDetection() = default;
 // Runs the VAD on the kBand0To8kHz band of `audio` and returns true when the
 // VAD result is non-zero. Input with more than one channel is first averaged
 // sample by sample into a single 16-bit channel.
 bool VoiceDetection::ProcessCaptureAudio(AudioBuffer* audio) {
   const auto frames_per_band = audio->num_frames_per_band();
   RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength, frames_per_band);
   // Scratch buffer for the 16-bit samples handed to the VAD; only the first
   // `frames_per_band` entries are written.
   std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> vad_samples;
   rtc::ArrayView<const int16_t> vad_input(vad_samples.data(), frames_per_band);
   if (audio->num_channels() != 1) {
     const int channel_count = static_cast<int>(audio->num_channels());
     const auto& low_band = audio->split_channels_const(kBand0To8kHz);
     for (size_t n = 0; n < frames_per_band; ++n) {
       // Accumulate in 32 bits so that the per-channel sum cannot wrap before
       // the division.
       int32_t sum = FloatS16ToS16(low_band[0][n]);
       for (int ch = 1; ch < channel_count; ++ch) {
         sum += FloatS16ToS16(low_band[ch][n]);
       }
       vad_samples[n] = sum / channel_count;
     }
   } else {
     FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz], frames_per_band,
                   vad_samples.data());
   }
   // NOTE(review): the length passed here is frame_size_samples_ (derived from
   // the constructor's sample rate), not `frames_per_band`; presumably callers
   // keep the two equal - confirm.
   const int vad_result = WebRtcVad_Process(
       vad_->state(), sample_rate_hz_, vad_input.data(), frame_size_samples_);
   RTC_DCHECK(vad_result == 0 || vad_result == 1);
   return vad_result != 0;
 }
 }  // namespace webrtc
--- a/modules/audio_processing/voice_detection.h
+++ b/modules/audio_processing/voice_detection.h
@ -0,0 +1,59 @@
 /*
 *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
 #define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
 #include <stddef.h>
 #include <memory>
 #include "modules/audio_processing/include/audio_processing.h"
 namespace webrtc {
 class AudioBuffer;
 // The voice activity detection (VAD) component analyzes the stream to
 // determine if voice is present.
 class VoiceDetection {
 public:
  // Specifies the likelihood that a frame will be declared to contain voice.
  // A higher value makes it more likely that speech will not be clipped, at
  // the expense of more noise being detected as voice.
  enum Likelihood {
    kVeryLowLikelihood,
    kLowLikelihood,
    kModerateLikelihood,
    kHighLikelihood
  };
  // Creates a detector for audio sampled at `sample_rate_hz`, configured for
  // the given `likelihood`.
  VoiceDetection(int sample_rate_hz, Likelihood likelihood);
  ~VoiceDetection();
  // Not copyable.
  VoiceDetection(const VoiceDetection&) = delete;
  VoiceDetection& operator=(const VoiceDetection&) = delete;
  // Returns true if voice is detected in the current frame.
  bool ProcessCaptureAudio(AudioBuffer* audio);
  // Returns the likelihood setting passed to the constructor.
  Likelihood likelihood() const { return likelihood_; }
 private:
  // Wrapper around the underlying VAD instance; defined in the .cc file.
  class Vad;
  // Declaration order matters: frame_size_samples_ is initialized from
  // sample_rate_hz_ in the constructor's initializer list.
  int sample_rate_hz_;
  size_t frame_size_samples_;
  Likelihood likelihood_;
  std::unique_ptr<Vad> vad_;
 };
 }  // namespace webrtc
 #endif  // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
--- a/modules/audio_processing/voice_detection_unittest.cc
+++ b/modules/audio_processing/voice_detection_unittest.cc
@ -0,0 +1,104 @@
 /*
 *  Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include <vector>
 #include "api/array_view.h"
 #include "modules/audio_processing/audio_buffer.h"
 #include "modules/audio_processing/test/audio_buffer_tools.h"
 #include "modules/audio_processing/test/bitexactness_tools.h"
 #include "modules/audio_processing/voice_detection.h"
 #include "test/gtest.h"
 namespace webrtc {
 namespace {
 // Number of frames pushed through the detector in each test run.
 constexpr int kNumFramesToProcess = 1000;
 // Feeds one frame held in `audio_buffer` to `voice_detection` and returns the
 // detector's decision. For sample rates above
 // AudioProcessing::kSampleRate16kHz the buffer is split into frequency bands
 // first.
 bool ProcessOneFrame(int sample_rate_hz,
                     AudioBuffer* audio_buffer,
                     VoiceDetection* voice_detection) {
   const bool needs_band_split =
       sample_rate_hz > AudioProcessing::kSampleRate16kHz;
   if (needs_band_split) {
     audio_buffer->SplitIntoFrequencyBands();
   }
   const bool voice_detected =
       voice_detection->ProcessCaptureAudio(audio_buffer);
   return voice_detected;
 }
 // Processes a specified amount of frames, verifies the results and reports
 // any errors.
 void RunBitexactnessTest(int sample_rate_hz,
                         size_t num_channels,
                         bool stream_has_voice_reference) {
  int sample_rate_to_use = std::min(sample_rate_hz, 16000);
  VoiceDetection voice_detection(sample_rate_to_use,
                                 VoiceDetection::kLowLikelihood);
  int samples_per_channel = rtc::CheckedDivExact(sample_rate_hz, 100);
  const StreamConfig capture_config(sample_rate_hz, num_channels);
  AudioBuffer capture_buffer(
      capture_config.sample_rate_hz(), capture_config.num_channels(),
      capture_config.sample_rate_hz(), capture_config.num_channels(),
      capture_config.sample_rate_hz(), capture_config.num_channels());
  test::InputAudioFile capture_file(
      test::GetApmCaptureTestVectorFileName(sample_rate_hz));
  std::vector<float> capture_input(samples_per_channel * num_channels);
  bool stream_has_voice = false;
  for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) {
    ReadFloatSamplesFromStereoFile(samples_per_channel, num_channels,
                                   &capture_file, capture_input);
    test::CopyVectorToAudioBuffer(capture_config, capture_input,
                                  &capture_buffer);
    stream_has_voice =
        ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection);
  }
  EXPECT_EQ(stream_has_voice_reference, stream_has_voice);
 }
 // Reference detector decision shared by all test cases in this file.
 constexpr bool kStreamHasVoiceReference = true;
 }  // namespace
 // Each case runs the bit-exactness check for one sample rate / channel count
 // combination against the shared reference decision.
 TEST(VoiceDetectionBitExactnessTest, Mono8kHz) {
   constexpr int kSampleRateHz = 8000;
   constexpr size_t kNumChannels = 1;
   RunBitexactnessTest(kSampleRateHz, kNumChannels, kStreamHasVoiceReference);
 }
 TEST(VoiceDetectionBitExactnessTest, Mono16kHz) {
   constexpr int kSampleRateHz = 16000;
   constexpr size_t kNumChannels = 1;
   RunBitexactnessTest(kSampleRateHz, kNumChannels, kStreamHasVoiceReference);
 }
 TEST(VoiceDetectionBitExactnessTest, Mono32kHz) {
   constexpr int kSampleRateHz = 32000;
   constexpr size_t kNumChannels = 1;
   RunBitexactnessTest(kSampleRateHz, kNumChannels, kStreamHasVoiceReference);
 }
 TEST(VoiceDetectionBitExactnessTest, Mono48kHz) {
   constexpr int kSampleRateHz = 48000;
   constexpr size_t kNumChannels = 1;
   RunBitexactnessTest(kSampleRateHz, kNumChannels, kStreamHasVoiceReference);
 }
 TEST(VoiceDetectionBitExactnessTest, Stereo8kHz) {
   constexpr int kSampleRateHz = 8000;
   constexpr size_t kNumChannels = 2;
   RunBitexactnessTest(kSampleRateHz, kNumChannels, kStreamHasVoiceReference);
 }
 TEST(VoiceDetectionBitExactnessTest, Stereo16kHz) {
   constexpr int kSampleRateHz = 16000;
   constexpr size_t kNumChannels = 2;
   RunBitexactnessTest(kSampleRateHz, kNumChannels, kStreamHasVoiceReference);
 }
 TEST(VoiceDetectionBitExactnessTest, Stereo32kHz) {
   constexpr int kSampleRateHz = 32000;
   constexpr size_t kNumChannels = 2;
   RunBitexactnessTest(kSampleRateHz, kNumChannels, kStreamHasVoiceReference);
 }
 TEST(VoiceDetectionBitExactnessTest, Stereo48kHz) {
   constexpr int kSampleRateHz = 48000;
   constexpr size_t kNumChannels = 2;
   RunBitexactnessTest(kSampleRateHz, kNumChannels, kStreamHasVoiceReference);
 }
 }  // namespace webrtc
--- a/test/fuzzers/audio_processing_configs_fuzzer.cc
+++ b/test/fuzzers/audio_processing_configs_fuzzer.cc
@ -54,7 +54,7 @@ rtc::scoped_refptr<AudioProcessing> CreateApm(test::FuzzDataHelper* fuzz_data,
  bool use_agc = fuzz_data->ReadOrDefaultValue(true);
  bool use_ns = fuzz_data->ReadOrDefaultValue(true);
  static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
-  static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
+  bool use_vad = fuzz_data->ReadOrDefaultValue(true);
  bool use_agc_limiter = fuzz_data->ReadOrDefaultValue(true);
  bool use_agc2 = fuzz_data->ReadOrDefaultValue(true);
@ -114,6 +114,7 @@ rtc::scoped_refptr<AudioProcessing> CreateApm(test::FuzzDataHelper* fuzz_data,
      use_agc2_adaptive_digital;
  apm_config.noise_suppression.enabled = use_ns;
  apm_config.transient_suppression.enabled = use_ts;
  apm_config.voice_detection.enabled = use_vad;
  rtc::scoped_refptr<AudioProcessing> apm =
      AudioProcessingBuilderForTesting()