From bf28277774c916731f9d60d65b0d69d16d9d4006 Mon Sep 17 00:00:00 2001
From: Hanna Silen <silen@webrtc.org>
Date: Fri, 18 Nov 2022 19:36:34 +0100
Subject: [PATCH] InputVolumeController: Add configurable speech probability
 aggregation

Make speech probability threshold configurable by replacing
kSpeechProbabilitySilenceThreshold with speech_probability_threshold in
InputVolumeController::Config.

Make the processing more robust against outliers in speech probability
estimation by computing an aggregate speech activity over a speech
segment. In MonoInputVolumeController::Process(), use the passed
non-empty speech probabilities to compute the speech activity over the
speech segment and only allow updates for segments with a high enough
ratio of speech frames. Pass RMS error and speech probability for every
frame in Process(): If rms_error_dbfs is empty, volume updates are not
allowed; if speech_probability is empty, the frame counts as a non-
speech frame.

Remove startup_min_volume from the config since it's no longer used
after https://webrtc-review.googlesource.com/c/src/+/282821.

Bug: webrtc:7494
Change-Id: I0ab81b03371496315348f552133aa9909bd36f26
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/283523
Commit-Queue: Hanna Silen <silen@webrtc.org>
Reviewed-by: Alessio Bazzica <alessiob@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#38685}
---
 .../agc2/input_volume_controller.cc           |  93 +++++----
 .../agc2/input_volume_controller.h            |  43 +++--
 .../agc2/input_volume_controller_unittest.cc  | 176 ++++++++++++++----
 3 files changed, 232 insertions(+), 80 deletions(-)
diff --git a/modules/audio_processing/agc2/input_volume_controller.cc b/modules/audio_processing/agc2/input_volume_controller.cc
index 2e5b9e3083..fecb090fbc 100644
--- a/modules/audio_processing/agc2/input_volume_controller.cc
+++ b/modules/audio_processing/agc2/input_volume_controller.cc
@@ -37,11 +37,6 @@ constexpr int kMinMicLevel = 12;
 // Prevent very large microphone level changes.
 constexpr int kMaxResidualGainChange = 15;
 
-// Target speech level (dBFs) and speech probability threshold used to compute
-// the RMS error in `GetSpeechLevelErrorDb()`.
-// TODO(webrtc:7494): Move this to a config and pass in the ctor.
-constexpr float kSpeechProbabilitySilenceThreshold = 0.5f;
-
 using Agc1ClippingPredictorConfig = AudioProcessing::Config::GainController1::
     AnalogGainController::ClippingPredictor;
 
@@ -128,25 +123,16 @@ void LogClippingMetrics(int clipping_rate) {
 }
 
 // Computes the speech level error in dB. The value of `speech_level_dbfs` is
-// required to be in the range [-90.0f, 30.0f] and `speech_probability` in the
-// range [0.0f, 1.0f]. Returns a positive value when the speech level is below
-// the target range and a negative value when the speech level is above the
-// target range.
+// required to be in the range [-90.0f, 30.0f]. Returns a positive value when
+// the speech level is below the target range and a negative value when the
+// speech level is above the target range.
 int GetSpeechLevelErrorDb(float speech_level_dbfs,
-                          float speech_probability,
                           int target_range_min_dbfs,
                           int target_range_max_dbfs) {
   constexpr float kMinSpeechLevelDbfs = -90.0f;
   constexpr float kMaxSpeechLevelDbfs = 30.0f;
   RTC_DCHECK_GE(speech_level_dbfs, kMinSpeechLevelDbfs);
   RTC_DCHECK_LE(speech_level_dbfs, kMaxSpeechLevelDbfs);
-  RTC_DCHECK_GE(speech_probability, 0.0f);
-  RTC_DCHECK_LE(speech_probability, 1.0f);
-
-  // TODO(webrtc:7494): Replace with the use of `SpeechProbabilityBuffer`.
-  if (speech_probability < kSpeechProbabilitySilenceThreshold) {
-    return 0;
-  }
 
   // Ensure the speech level is in the range [-90.0f, 30.0f].
   speech_level_dbfs = rtc::SafeClamp<float>(
@@ -169,11 +155,26 @@ int GetSpeechLevelErrorDb(float speech_level_dbfs,
 MonoInputVolumeController::MonoInputVolumeController(
     int clipped_level_min,
     int min_mic_level,
-    int update_input_volume_wait_frames)
+    int update_input_volume_wait_frames,
+    float speech_probability_threshold,
+    float speech_ratio_threshold)
     : min_mic_level_(min_mic_level),
       max_level_(kMaxMicLevel),
       clipped_level_min_(clipped_level_min),
-      update_input_volume_wait_frames_(update_input_volume_wait_frames) {}
+      update_input_volume_wait_frames_(
+          std::max(update_input_volume_wait_frames, 1)),
+      speech_probability_threshold_(speech_probability_threshold),
+      speech_ratio_threshold_(speech_ratio_threshold) {
+  RTC_DCHECK_GE(clipped_level_min_, 0);
+  RTC_DCHECK_LE(clipped_level_min_, 255);
+  RTC_DCHECK_GE(min_mic_level_, 0);
+  RTC_DCHECK_LE(min_mic_level_, 255);
+  RTC_DCHECK_GE(update_input_volume_wait_frames_, 0);
+  RTC_DCHECK_GE(speech_probability_threshold_, 0.0f);
+  RTC_DCHECK_LE(speech_probability_threshold_, 1.0f);
+  RTC_DCHECK_GE(speech_ratio_threshold_, 0.0f);
+  RTC_DCHECK_LE(speech_ratio_threshold_, 1.0f);
+}
 
 MonoInputVolumeController::~MonoInputVolumeController() = default;
 
@@ -182,10 +183,18 @@ void MonoInputVolumeController::Initialize() {
   capture_output_used_ = true;
   check_volume_on_next_process_ = true;
   frames_since_update_input_volume_ = 0;
+  speech_frames_since_update_input_volume_ = 0;
   is_first_frame_ = true;
 }
 
-void MonoInputVolumeController::Process(absl::optional<int> rms_error_dbfs) {
+// A speech segment is considered active if at least
+// `update_input_volume_wait_frames_` new frames have been processed since the
+// previous update and the ratio of non-silence frames (i.e., frames with a
+// non-empty `speech_probability` value not below
+// `speech_probability_threshold_`) is at least `speech_ratio_threshold_`.
+void MonoInputVolumeController::Process(
+    absl::optional<int> rms_error_dbfs,
+    absl::optional<float> speech_probability) {
   if (check_volume_on_next_process_) {
     check_volume_on_next_process_ = false;
     // We have to wait until the first process call to check the volume,
@@ -193,9 +202,29 @@ void MonoInputVolumeController::Process(absl::optional<int> rms_error_dbfs) {
     CheckVolumeAndReset();
   }
 
-  if (++frames_since_update_input_volume_ >= update_input_volume_wait_frames_ &&
-      rms_error_dbfs.has_value() && !is_first_frame_) {
-    UpdateInputVolume(*rms_error_dbfs);
+  // Count frames with a high speech probability as speech.
+  if (speech_probability.has_value() &&
+      *speech_probability >= speech_probability_threshold_) {
+    ++speech_frames_since_update_input_volume_;
+  }
+
+  // Reset the counters and maybe update the input volume.
+  if (++frames_since_update_input_volume_ >= update_input_volume_wait_frames_) {
+    const float speech_ratio =
+        static_cast<float>(speech_frames_since_update_input_volume_) /
+        static_cast<float>(update_input_volume_wait_frames_);
+
+    // Always reset the counters regardless of whether the volume changes or
+    // not.
+    frames_since_update_input_volume_ = 0;
+    speech_frames_since_update_input_volume_ = 0;
+
+    // Update the input volume if allowed.
+    if (!is_first_frame_ && speech_ratio >= speech_ratio_threshold_) {
+      if (rms_error_dbfs.has_value()) {
+        UpdateInputVolume(*rms_error_dbfs);
+      }
+    }
   }
 
   is_first_frame_ = false;
@@ -216,6 +245,7 @@ void MonoInputVolumeController::HandleClipping(int clipped_level_step) {
     // will still not react until the postproc updates the level.
     SetLevel(std::max(clipped_level_min_, level_ - clipped_level_step));
     frames_since_update_input_volume_ = 0;
+    speech_frames_since_update_input_volume_ = 0;
     is_first_frame_ = false;
   }
 }
@@ -250,6 +280,7 @@ void MonoInputVolumeController::SetLevel(int new_level) {
     // Take no action in this case, since we can't be sure when the volume
     // was manually adjusted.
     frames_since_update_input_volume_ = 0;
+    speech_frames_since_update_input_volume_ = 0;
     is_first_frame_ = false;
     return;
   }
@@ -311,16 +342,13 @@ int MonoInputVolumeController::CheckVolumeAndReset() {
   level_ = level;
   startup_ = false;
   frames_since_update_input_volume_ = 0;
+  speech_frames_since_update_input_volume_ = 0;
   is_first_frame_ = true;
 
   return 0;
 }
 
 void MonoInputVolumeController::UpdateInputVolume(int rms_error_dbfs) {
-  // Always reset the counter regardless of whether the gain is changed
-  // or not.
-  frames_since_update_input_volume_ = 0;
-
   const int residual_gain = rtc::SafeClamp(
       rms_error_dbfs, -kMaxResidualGainChange, kMaxResidualGainChange);
 
@@ -367,7 +395,8 @@ InputVolumeController::InputVolumeController(int num_capture_channels,
   for (auto& controller : channel_controllers_) {
     controller = std::make_unique<MonoInputVolumeController>(
         config.clipped_level_min, min_mic_level,
-        config.update_input_volume_wait_frames);
+        config.update_input_volume_wait_frames,
+        config.speech_probability_threshold, config.speech_ratio_threshold);
   }
 
   RTC_DCHECK(!channel_controllers_.empty());
@@ -481,13 +510,13 @@ void InputVolumeController::Process(absl::optional<float> speech_probability,
 
   absl::optional<int> rms_error_dbfs;
   if (speech_probability.has_value() && speech_level_dbfs.has_value()) {
-    rms_error_dbfs =
-        GetSpeechLevelErrorDb(*speech_level_dbfs, *speech_probability,
-                              target_range_min_dbfs_, target_range_max_dbfs_);
+    // Compute the error for all frames (both speech and non-speech frames).
+    rms_error_dbfs = GetSpeechLevelErrorDb(
+        *speech_level_dbfs, target_range_min_dbfs_, target_range_max_dbfs_);
   }
 
   for (auto& controller : channel_controllers_) {
-    controller->Process(rms_error_dbfs);
+    controller->Process(rms_error_dbfs, speech_probability);
   }
 
   AggregateChannelLevels();
diff --git a/modules/audio_processing/agc2/input_volume_controller.h b/modules/audio_processing/agc2/input_volume_controller.h
index 8c1bac7ccc..d2f3970c81 100644
--- a/modules/audio_processing/agc2/input_volume_controller.h
+++ b/modules/audio_processing/agc2/input_volume_controller.h
@@ -36,16 +36,13 @@ class InputVolumeController final {
   // Config for the constructor.
   struct Config {
     bool enabled = false;
-    // TODO(bugs.webrtc.org/1275566): Describe `startup_min_volume`.
-    int startup_min_volume = 0;
-    // Lowest analog microphone level that will be applied in response to
-    // clipping.
+    // Lowest input volume level that will be applied in response to clipping.
     int clipped_level_min = 70;
-    // Amount the microphone level is lowered with every clipping event.
-    // Limited to (0, 255].
+    // Amount input volume level is lowered with every clipping event. Limited
+    // to (0, 255].
     int clipped_level_step = 15;
     // Proportion of clipped samples required to declare a clipping event.
-    // Limited to (0.f, 1.f).
+    // Limited to (0.0f, 1.0f).
     float clipped_ratio_threshold = 0.1f;
     // Time in frames to wait after a clipping event before checking again.
     // Limited to values higher than 0.
@@ -65,6 +62,12 @@ class InputVolumeController final {
     int target_range_min_dbfs = -48;
     // Number of wait frames between the recommended input volume updates.
     int update_input_volume_wait_frames = 100;
+    // Speech probability threshold: speech probabilities below the threshold
+    // are considered silence. Limited to [0.0f, 1.0f].
+    float speech_probability_threshold = 0.7f;
+    // Minimum speech frame ratio for volume updates to be allowed. Limited to
+    // [0.0f, 1.0f].
+    float speech_ratio_threshold = 0.9f;
   };
 
   // Ctor. `num_capture_channels` specifies the number of channels for the audio
@@ -90,6 +93,7 @@ class InputVolumeController final {
   // prediction (if enabled). Must be called after `set_stream_analog_level()`.
   void AnalyzePreProcess(const AudioBuffer& audio_buffer);
 
+  // TODO(bugs.webrtc.org/7494): Rename, audio not passed to the method anymore.
   // Adjusts the recommended input volume upwards/downwards based on
   // `speech_level_dbfs`. Must be called after `AnalyzePreProcess()`. The value
   // of `speech_probability` is expected to be in the range [0.0f, 1.0f] and
@@ -185,7 +189,9 @@ class MonoInputVolumeController {
  public:
   MonoInputVolumeController(int clipped_level_min,
                             int min_mic_level,
-                            int update_input_volume_wait_frames);
+                            int update_input_volume_wait_frames,
+                            float speech_probability_threshold,
+                            float speech_ratio_threshold);
   ~MonoInputVolumeController();
   MonoInputVolumeController(const MonoInputVolumeController&) = delete;
   MonoInputVolumeController& operator=(const MonoInputVolumeController&) =
@@ -202,10 +208,13 @@ class MonoInputVolumeController {
   // `set_stream_analog_level()`.
   void HandleClipping(int clipped_level_step);
 
-  // Adjusts the recommended input volume upwards/downwards depending on whether
-  // `rms_error_dbfs` is positive or negative. Must be called after
-  // `HandleClipping()`.
-  void Process(absl::optional<int> rms_error_dbfs);
+  // TODO(bugs.webrtc.org/7494): Rename, audio not passed to the method anymore.
+  // Adjusts the recommended input volume upwards/downwards depending on
+  // whether `rms_error_dbfs` is positive or negative. Updates are only allowed
+  // for active speech segments and when `rms_error_dbfs` is not empty. Must be
+  // called after `HandleClipping()`.
+  void Process(absl::optional<int> rms_error_dbfs,
+               absl::optional<float> speech_probability);
 
   // Returns the recommended input volume. Must be called after `Process()`.
   int recommended_analog_level() const { return recommended_input_volume_; }
@@ -254,10 +263,18 @@ class MonoInputVolumeController {
 
   const int clipped_level_min_;
 
-  // Number of frames waited between the calls to `UpdateInputVolume()`.
+  // Counters for frames and speech frames since the last update in the
+  // recommended input volume.
   const int update_input_volume_wait_frames_;
   int frames_since_update_input_volume_ = 0;
+  int speech_frames_since_update_input_volume_ = 0;
   bool is_first_frame_ = true;
+
+  // Speech probability threshold for a frame to be considered speech (instead
+  // of silence). Limited to [0.0f, 1.0f].
+  const float speech_probability_threshold_;
+  // Minimum ratio of speech frames. Limited to [0.0f, 1.0f].
+  const float speech_ratio_threshold_;
 };
 
 }  // namespace webrtc
diff --git a/modules/audio_processing/agc2/input_volume_controller_unittest.cc b/modules/audio_processing/agc2/input_volume_controller_unittest.cc
index 489d99f158..e9be177a7f 100644
--- a/modules/audio_processing/agc2/input_volume_controller_unittest.cc
+++ b/modules/audio_processing/agc2/input_volume_controller_unittest.cc
@@ -44,6 +44,8 @@ constexpr float kClippedRatioThreshold = 0.1f;
 constexpr int kClippedWaitFrames = 300;
 constexpr float kHighSpeechProbability = 0.7f;
 constexpr float kSpeechLevel = -25.0f;
+constexpr float kSpeechProbabilityThreshold = 0.5f;
+constexpr float kSpeechRatioThreshold = 0.8f;
 
 constexpr float kMinSample = std::numeric_limits<int16_t>::min();
 constexpr float kMaxSample = std::numeric_limits<int16_t>::max();
@@ -57,7 +59,6 @@ constexpr InputVolumeControllerConfig kDefaultInputVolumeControllerConfig{};
 constexpr ClippingPredictorConfig kDefaultClippingPredictorConfig{};
 
 std::unique_ptr<InputVolumeController> CreateInputVolumeController(
-    int startup_min_volume,
     int clipped_level_step,
     float clipped_ratio_threshold,
     int clipped_wait_frames,
@@ -65,7 +66,6 @@ std::unique_ptr<InputVolumeController> CreateInputVolumeController(
     int update_input_volume_wait_frames = 0) {
   InputVolumeControllerConfig config{
       .enabled = true,
-      .startup_min_volume = startup_min_volume,
       .clipped_level_min = kClippedMin,
       .clipped_level_step = clipped_level_step,
       .clipped_ratio_threshold = clipped_ratio_threshold,
@@ -74,6 +74,8 @@ std::unique_ptr<InputVolumeController> CreateInputVolumeController(
       .target_range_max_dbfs = -18,
       .target_range_min_dbfs = -30,
       .update_input_volume_wait_frames = update_input_volume_wait_frames,
+      .speech_probability_threshold = kSpeechProbabilityThreshold,
+      .speech_ratio_threshold = kSpeechRatioThreshold,
   };
 
   return std::make_unique<InputVolumeController>(/*num_capture_channels=*/1,
@@ -258,7 +260,6 @@ class SpeechSamplesReader {
 constexpr InputVolumeControllerConfig GetInputVolumeControllerTestConfig() {
   InputVolumeControllerConfig config{
       .enabled = true,
-      .startup_min_volume = kInitialInputVolume,
       .clipped_level_min = kClippedMin,
       .clipped_level_step = kClippedLevelStep,
       .clipped_ratio_threshold = kClippedRatioThreshold,
@@ -267,6 +268,8 @@ constexpr InputVolumeControllerConfig GetInputVolumeControllerTestConfig() {
       .target_range_max_dbfs = -18,
       .target_range_min_dbfs = -30,
       .update_input_volume_wait_frames = 0,
+      .speech_probability_threshold = 0.5f,
+      .speech_ratio_threshold = 1.0f,
   };
   return config;
 }
@@ -946,9 +949,8 @@ TEST_P(InputVolumeControllerParametrizedTest,
 }
 
 TEST(InputVolumeControllerTest, AgcMinMicLevelExperimentDefault) {
-  std::unique_ptr<InputVolumeController> manager =
-      CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                  kClippedRatioThreshold, kClippedWaitFrames);
+  std::unique_ptr<InputVolumeController> manager = CreateInputVolumeController(
+      kClippedLevelStep, kClippedRatioThreshold, kClippedWaitFrames);
   EXPECT_EQ(manager->channel_controllers_[0]->min_mic_level(), kMinMicLevel);
 }
 
@@ -957,8 +959,8 @@ TEST(InputVolumeControllerTest, AgcMinMicLevelExperimentDisabled) {
     test::ScopedFieldTrials field_trial(
         GetAgcMinMicLevelExperimentFieldTrial("Disabled" + field_trial_suffix));
     std::unique_ptr<InputVolumeController> manager =
-        CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                    kClippedRatioThreshold, kClippedWaitFrames);
+        CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                    kClippedWaitFrames);
 
     EXPECT_EQ(manager->channel_controllers_[0]->min_mic_level(), kMinMicLevel);
   }
@@ -969,9 +971,8 @@ TEST(InputVolumeControllerTest, AgcMinMicLevelExperimentDisabled) {
 TEST(InputVolumeControllerTest, AgcMinMicLevelExperimentOutOfRangeAbove) {
   test::ScopedFieldTrials field_trial(
       GetAgcMinMicLevelExperimentFieldTrial("Enabled-256"));
-  std::unique_ptr<InputVolumeController> manager =
-      CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                  kClippedRatioThreshold, kClippedWaitFrames);
+  std::unique_ptr<InputVolumeController> manager = CreateInputVolumeController(
+      kClippedLevelStep, kClippedRatioThreshold, kClippedWaitFrames);
   EXPECT_EQ(manager->channel_controllers_[0]->min_mic_level(), kMinMicLevel);
 }
 
@@ -980,9 +981,8 @@ TEST(InputVolumeControllerTest, AgcMinMicLevelExperimentOutOfRangeAbove) {
 TEST(InputVolumeControllerTest, AgcMinMicLevelExperimentOutOfRangeBelow) {
   test::ScopedFieldTrials field_trial(
       GetAgcMinMicLevelExperimentFieldTrial("Enabled--1"));
-  std::unique_ptr<InputVolumeController> manager =
-      CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                  kClippedRatioThreshold, kClippedWaitFrames);
+  std::unique_ptr<InputVolumeController> manager = CreateInputVolumeController(
+      kClippedLevelStep, kClippedRatioThreshold, kClippedWaitFrames);
   EXPECT_EQ(manager->channel_controllers_[0]->min_mic_level(), kMinMicLevel);
 }
 
@@ -997,8 +997,8 @@ TEST(InputVolumeControllerTest, AgcMinMicLevelExperimentEnabled50) {
         GetAgcMinMicLevelExperimentFieldTrialEnabled(kMinMicLevelOverride,
                                                      field_trial_suffix));
     std::unique_ptr<InputVolumeController> manager =
-        CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                    kClippedRatioThreshold, kClippedWaitFrames);
+        CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                    kClippedWaitFrames);
 
     EXPECT_EQ(manager->channel_controllers_[0]->min_mic_level(),
               kMinMicLevelOverride);
@@ -1016,8 +1016,8 @@ TEST(InputVolumeControllerTest,
   // relevant field trial.
   const auto factory = []() {
     std::unique_ptr<InputVolumeController> manager =
-        CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                    kClippedRatioThreshold, kClippedWaitFrames);
+        CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                    kClippedWaitFrames);
     manager->Initialize();
     manager->set_stream_analog_level(kInitialInputVolume);
     return manager;
@@ -1071,8 +1071,8 @@ TEST(InputVolumeControllerTest,
   // relevant field trial.
   const auto factory = []() {
     std::unique_ptr<InputVolumeController> manager =
-        CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                    kClippedRatioThreshold, kClippedWaitFrames);
+        CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                    kClippedWaitFrames);
     manager->Initialize();
     manager->set_stream_analog_level(kInitialInputVolume);
     return manager;
@@ -1127,7 +1127,6 @@ TEST(InputVolumeControllerTest,
     // with clipping.
     InputVolumeControllerConfig config = kDefaultInputVolumeControllerConfig;
     config.enabled = true;
-    config.startup_min_volume = kInitialInputVolume;
     config.clipped_level_step = 64;
     config.clipped_ratio_threshold = kClippedRatioThreshold;
     config.clipped_wait_frames = kClippedWaitFrames;
@@ -1193,7 +1192,6 @@ TEST(InputVolumeControllerTest,
     // with clipping.
     InputVolumeControllerConfig config = kDefaultInputVolumeControllerConfig;
     config.enabled = true;
-    config.startup_min_volume = kInitialInputVolume;
     config.clipped_level_step = 64;
     config.clipped_ratio_threshold = kClippedRatioThreshold;
     config.clipped_wait_frames = kClippedWaitFrames;
@@ -1252,16 +1250,14 @@ TEST_P(InputVolumeControllerParametrizedTest, ClippingParametersVerified) {
     GTEST_SKIP() << "Skipped. RMS error does not affect the test.";
   }
 
-  std::unique_ptr<InputVolumeController> manager =
-      CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                  kClippedRatioThreshold, kClippedWaitFrames);
+  std::unique_ptr<InputVolumeController> manager = CreateInputVolumeController(
+      kClippedLevelStep, kClippedRatioThreshold, kClippedWaitFrames);
   manager->Initialize();
   EXPECT_EQ(manager->clipped_level_step_, kClippedLevelStep);
   EXPECT_EQ(manager->clipped_ratio_threshold_, kClippedRatioThreshold);
   EXPECT_EQ(manager->clipped_wait_frames_, kClippedWaitFrames);
   std::unique_ptr<InputVolumeController> manager_custom =
-      CreateInputVolumeController(kInitialInputVolume,
-                                  /*clipped_level_step=*/10,
+      CreateInputVolumeController(/*clipped_level_step=*/10,
                                   /*clipped_ratio_threshold=*/0.2f,
                                   /*clipped_wait_frames=*/50);
   manager_custom->Initialize();
@@ -1277,8 +1273,8 @@ TEST_P(InputVolumeControllerParametrizedTest,
   }
 
   std::unique_ptr<InputVolumeController> manager = CreateInputVolumeController(
-      kInitialInputVolume, kClippedLevelStep, kClippedRatioThreshold,
-      kClippedWaitFrames, /*enable_clipping_predictor=*/false);
+      kClippedLevelStep, kClippedRatioThreshold, kClippedWaitFrames,
+      /*enable_clipping_predictor=*/false);
   manager->Initialize();
 
   EXPECT_FALSE(manager->clipping_predictor_enabled());
@@ -1302,8 +1298,8 @@ TEST_P(InputVolumeControllerParametrizedTest,
   }
 
   std::unique_ptr<InputVolumeController> manager = CreateInputVolumeController(
-      kInitialInputVolume, kClippedLevelStep, kClippedRatioThreshold,
-      kClippedWaitFrames, /*enable_clipping_predictor=*/true);
+      kClippedLevelStep, kClippedRatioThreshold, kClippedWaitFrames,
+      /*enable_clipping_predictor=*/true);
   manager->Initialize();
 
   EXPECT_TRUE(manager->clipping_predictor_enabled());
@@ -1469,13 +1465,13 @@ TEST_P(InputVolumeControllerParametrizedTest, EmptyRmsErrorHasNoEffect) {
 TEST(InputVolumeControllerTest, UpdateInputVolumeWaitFramesIsEffective) {
   constexpr int kInputVolume = kInitialInputVolume;
   std::unique_ptr<InputVolumeController> controller_wait_0 =
-      CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                  kClippedRatioThreshold, kClippedWaitFrames,
+      CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                  kClippedWaitFrames,
                                   /*enable_clipping_predictor=*/false,
                                   /*update_input_volume_wait_frames=*/0);
   std::unique_ptr<InputVolumeController> controller_wait_100 =
-      CreateInputVolumeController(kInitialInputVolume, kClippedLevelStep,
-                                  kClippedRatioThreshold, kClippedWaitFrames,
+      CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                  kClippedWaitFrames,
                                   /*enable_clipping_predictor=*/false,
                                   /*update_input_volume_wait_frames=*/100);
   controller_wait_0->Initialize();
@@ -1504,4 +1500,114 @@ TEST(InputVolumeControllerTest, UpdateInputVolumeWaitFramesIsEffective) {
   ASSERT_GT(controller_wait_100->recommended_analog_level(), kInputVolume);
 }
 
+TEST(InputVolumeControllerTest, SpeechRatioThresholdIsEffective) {
+  constexpr int kInputVolume = kInitialInputVolume;
+  // Create two input volume controllers with 10 frames between volume updates
+  // and the minimum speech ratio of 0.8 and speech probability threshold 0.5.
+  std::unique_ptr<InputVolumeController> controller_1 =
+      CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                  kClippedWaitFrames,
+                                  /*enable_clipping_predictor=*/false,
+                                  /*update_input_volume_wait_frames=*/10);
+  std::unique_ptr<InputVolumeController> controller_2 =
+      CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                  kClippedWaitFrames,
+                                  /*enable_clipping_predictor=*/false,
+                                  /*update_input_volume_wait_frames=*/10);
+  controller_1->Initialize();
+  controller_2->Initialize();
+  controller_1->set_stream_analog_level(kInputVolume);
+  controller_2->set_stream_analog_level(kInputVolume);
+
+  SpeechSamplesReader reader_1;
+  SpeechSamplesReader reader_2;
+
+  reader_1.Feed(/*num_frames=*/1, /*gain_db=*/0,
+                /*speech_probability=*/0.7f, /*speech_level=*/-42.0f,
+                *controller_1);
+  reader_2.Feed(/*num_frames=*/1, /*gain_db=*/0,
+                /*speech_probability=*/0.4f, /*speech_level=*/-42.0f,
+                *controller_2);
+
+  ASSERT_EQ(controller_1->recommended_analog_level(), kInputVolume);
+  ASSERT_EQ(controller_2->recommended_analog_level(), kInputVolume);
+
+  reader_1.Feed(/*num_frames=*/2, /*gain_db=*/0,
+                /*speech_probability=*/0.4f, /*speech_level=*/-42.0f,
+                *controller_1);
+  reader_2.Feed(/*num_frames=*/2, /*gain_db=*/0,
+                /*speech_probability=*/0.4f, /*speech_level=*/-42.0f,
+                *controller_2);
+
+  ASSERT_EQ(controller_1->recommended_analog_level(), kInputVolume);
+  ASSERT_EQ(controller_2->recommended_analog_level(), kInputVolume);
+
+  reader_1.Feed(/*num_frames=*/7, /*gain_db=*/0,
+                /*speech_probability=*/0.7f, /*speech_level=*/-42.0f,
+                *controller_1);
+  reader_2.Feed(/*num_frames=*/7, /*gain_db=*/0,
+                /*speech_probability=*/0.7f, /*speech_level=*/-42.0f,
+                *controller_2);
+
+  ASSERT_GT(controller_1->recommended_analog_level(), kInputVolume);
+  ASSERT_EQ(controller_2->recommended_analog_level(), kInputVolume);
+}
+
+TEST(InputVolumeControllerTest, SpeechProbabilityThresholdIsEffective) {
+  constexpr int kInputVolume = kInitialInputVolume;
+  // Create two input volume controllers with the exact same settings and
+  // 10 frames between volume updates.
+  std::unique_ptr<InputVolumeController> controller_1 =
+      CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                  kClippedWaitFrames,
+                                  /*enable_clipping_predictor=*/false,
+                                  /*update_input_volume_wait_frames=*/10);
+  std::unique_ptr<InputVolumeController> controller_2 =
+      CreateInputVolumeController(kClippedLevelStep, kClippedRatioThreshold,
+                                  kClippedWaitFrames,
+                                  /*enable_clipping_predictor=*/false,
+                                  /*update_input_volume_wait_frames=*/10);
+  controller_1->Initialize();
+  controller_2->Initialize();
+  controller_1->set_stream_analog_level(kInputVolume);
+  controller_2->set_stream_analog_level(kInputVolume);
+
+  SpeechSamplesReader reader_1;
+  SpeechSamplesReader reader_2;
+
+  // Process with two sets of inputs: Use `reader_1` to process inputs
+  // that cause the volume to be adjusted after enough frames have been
+  // processed and `reader_2` to process inputs that won't cause the volume
+  // to be adjusted.
+  reader_1.Feed(/*num_frames=*/1, /*gain_db=*/0,
+                /*speech_probability=*/0.5f, /*speech_level=*/-42.0f,
+                *controller_1);
+  reader_2.Feed(/*num_frames=*/1, /*gain_db=*/0,
+                /*speech_probability=*/0.49f, /*speech_level=*/-42.0f,
+                *controller_2);
+
+  ASSERT_EQ(controller_1->recommended_analog_level(), kInputVolume);
+  ASSERT_EQ(controller_2->recommended_analog_level(), kInputVolume);
+
+  reader_1.Feed(/*num_frames=*/2, /*gain_db=*/0,
+                /*speech_probability=*/0.49f, /*speech_level=*/-42.0f,
+                *controller_1);
+  reader_2.Feed(/*num_frames=*/2, /*gain_db=*/0,
+                /*speech_probability=*/0.49f, /*speech_level=*/-42.0f,
+                *controller_2);
+
+  ASSERT_EQ(controller_1->recommended_analog_level(), kInputVolume);
+  ASSERT_EQ(controller_2->recommended_analog_level(), kInputVolume);
+
+  reader_1.Feed(/*num_frames=*/7, /*gain_db=*/0,
+                /*speech_probability=*/0.5f, /*speech_level=*/-42.0f,
+                *controller_1);
+  reader_2.Feed(/*num_frames=*/7, /*gain_db=*/0,
+                /*speech_probability=*/0.5f, /*speech_level=*/-42.0f,
+                *controller_2);
+
+  ASSERT_GT(controller_1->recommended_analog_level(), kInputVolume);
+  ASSERT_EQ(controller_2->recommended_analog_level(), kInputVolume);
+}
+
 }  // namespace webrtc