Improve buffer level estimation with DTX and add CNG time stretching.

The functionality is hidden behind a field trial for experimentation.

Bug: webrtc:10736
Change-Id: I1daf60966717c3ea43bf6ee16d190290ab740ce7
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/144059
Commit-Queue: Jakob Ivarsson <jakobi@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#28474}
2019-07-03 16:00:30 +02:00
parent 3d642f8442
commit 46dda83bcb
5 changed files with 100 additions and 35 deletions
--- a/modules/audio_coding/BUILD.gn
+++ b/modules/audio_coding/BUILD.gn
@ -1029,6 +1029,7 @@ rtc_static_library("neteq") {
    "../../rtc_base:rtc_base_approved",
    "../../rtc_base:safe_minmax",
    "../../rtc_base:sanitizer",
+    "../../rtc_base/experiments:field_trial_parser",
    "../../rtc_base/system:fallthrough",
    "../../system_wrappers",
    "../../system_wrappers:field_trial",
--- a/modules/audio_coding/neteq/decision_logic.cc
+++ b/modules/audio_coding/neteq/decision_logic.cc
@ -14,6 +14,7 @@
 #include <stdio.h>
 #include <string>

+#include "absl/types/optional.h"
 #include "modules/audio_coding/neteq/buffer_level_filter.h"
 #include "modules/audio_coding/neteq/decoder_database.h"
 #include "modules/audio_coding/neteq/delay_manager.h"
@ -21,12 +22,15 @@
 #include "modules/audio_coding/neteq/packet_buffer.h"
 #include "modules/audio_coding/neteq/sync_buffer.h"
 #include "rtc_base/checks.h"
+#include "rtc_base/experiments/field_trial_parser.h"
 #include "rtc_base/logging.h"
 #include "rtc_base/numerics/safe_conversions.h"
+#include "system_wrappers/include/field_trial.h"

 namespace {

 constexpr int kPostponeDecodingLevel = 50;
+constexpr int kDefaultTargetLevelWindowMs = 100;

 }  // namespace

@ -65,8 +69,24 @@ DecisionLogic::DecisionLogic(int fs_hz,
      disallow_time_stretching_(disallow_time_stretching),
      timescale_countdown_(
          tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1)),
-      num_consecutive_expands_(0) {
+      num_consecutive_expands_(0),
+      time_stretched_cn_samples_(0),
+      estimate_dtx_delay_("estimate_dtx_delay", false),
+      time_stretch_cn_("time_stretch_cn", false),
+      target_level_window_ms_("target_level_window",
+                              kDefaultTargetLevelWindowMs,
+                              0,
+                              absl::nullopt) {
  SetSampleRate(fs_hz, output_size_samples);
+  const std::string field_trial_name =
+      field_trial::FindFullName("WebRTC-Audio-NetEqDecisionLogicSettings");
+  ParseFieldTrial(
+      {&estimate_dtx_delay_, &time_stretch_cn_, &target_level_window_ms_},
+      field_trial_name);
+  RTC_LOG(LS_INFO) << "NetEq decision logic settings:"
+                   << " estimate_dtx_delay=" << estimate_dtx_delay_
+                   << " time_stretch_cn=" << time_stretch_cn_
+                   << " target_level_window_ms=" << target_level_window_ms_;
 }

 DecisionLogic::~DecisionLogic() = default;
@ -79,6 +99,7 @@ void DecisionLogic::Reset() {
  prev_time_scale_ = false;
  timescale_countdown_.reset();
  num_consecutive_expands_ = 0;
+  time_stretched_cn_samples_ = 0;
 }

 void DecisionLogic::SoftReset() {
@ -87,12 +108,13 @@ void DecisionLogic::SoftReset() {
  prev_time_scale_ = false;
  timescale_countdown_ =
      tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1);
+  time_stretched_cn_samples_ = 0;
 }

 void DecisionLogic::SetSampleRate(int fs_hz, size_t output_size_samples) {
  // TODO(hlundin): Change to an enumerator and skip assert.
  assert(fs_hz == 8000 || fs_hz == 16000 || fs_hz == 32000 || fs_hz == 48000);
-  fs_mult_ = fs_hz / 8000;
+  sample_rate_ = fs_hz;
  output_size_samples_ = output_size_samples;
 }

@ -113,9 +135,11 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
    cng_state_ = kCngInternalOn;
  }

-  // TODO(jakobi): Use buffer span instead of num samples.
-  const size_t cur_size_samples =
-      packet_buffer_.NumSamplesInBuffer(decoder_frame_length);
+  size_t cur_size_samples =
+      estimate_dtx_delay_
+          ? packet_buffer_.GetSpanSamples(decoder_frame_length, sample_rate_,
+                                          true)
+          : packet_buffer_.NumSamplesInBuffer(decoder_frame_length);

  prev_time_scale_ =
      prev_time_scale_ && (prev_mode == kModeAccelerateSuccess ||
@ -125,9 +149,9 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,

  // Do not update buffer history if currently playing CNG since it will bias
  // the filtered buffer level.
-  if ((prev_mode != kModeRfc3389Cng) && (prev_mode != kModeCodecInternalCng) &&
+  if (prev_mode != kModeRfc3389Cng && prev_mode != kModeCodecInternalCng &&
      !(next_packet && next_packet->frame &&
-        next_packet->frame->IsDtxPacket())) {
+        next_packet->frame->IsDtxPacket() && !estimate_dtx_delay_)) {
    FilterBufferLevel(cur_size_samples);
  }

@ -173,7 +197,8 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
  // if the mute factor is low enough (otherwise the expansion was short enough
  // to not be noticable).
  // Note that the MuteFactor is in Q14, so a value of 16384 corresponds to 1.
-  size_t current_span = packet_buffer_.GetSpanSamples(decoder_frame_length);
+  size_t current_span = packet_buffer_.GetSpanSamples(
+      decoder_frame_length, sample_rate_, estimate_dtx_delay_);
  if ((prev_mode == kModeExpand || prev_mode == kModeCodecPlc) &&
      expand.MuteFactor(0) < 16384 / 2 &&
      current_span < static_cast<size_t>(delay_manager_->TargetLevel() *
@ -183,8 +208,7 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
    return kExpand;
  }

-  const uint32_t five_seconds_samples =
-      static_cast<uint32_t>(5 * 8000 * fs_mult_);
+  const uint32_t five_seconds_samples = static_cast<uint32_t>(5 * sample_rate_);
  // Check if the required packet is available.
  if (target_timestamp == available_timestamp) {
    return ExpectedPacketAvailable(prev_mode, play_dtmf);
@ -212,14 +236,15 @@ void DecisionLogic::FilterBufferLevel(size_t buffer_size_samples) {
  buffer_level_filter_->SetTargetBufferLevel(
      delay_manager_->base_target_level());

-  int sample_memory_local = 0;
+  int time_stretched_samples = time_stretched_cn_samples_;
  if (prev_time_scale_) {
-    sample_memory_local = sample_memory_;
+    time_stretched_samples += sample_memory_;
    timescale_countdown_ = tick_timer_->GetNewCountdown(kMinTimescaleInterval);
  }

-  buffer_level_filter_->Update(buffer_size_samples, sample_memory_local);
+  buffer_level_filter_->Update(buffer_size_samples, time_stretched_samples);
  prev_time_scale_ = false;
+  time_stretched_cn_samples_ = 0;
 }

 Operations DecisionLogic::CngOperation(Modes prev_mode,
@ -323,30 +348,53 @@ Operations DecisionLogic::FuturePacketAvailable(
    return kNormal;
  }

-  const size_t cur_size_samples =
-      packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
-
  // If previous was comfort noise, then no merge is needed.
  if (prev_mode == kModeRfc3389Cng || prev_mode == kModeCodecInternalCng) {
-    // Keep the same delay as before the CNG, but make sure that the number of
-    // samples in buffer is no higher than 4 times the optimal level. (Note that
-    // TargetLevel() is in Q8.)
-    if (static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
-            available_timestamp ||
-        cur_size_samples >
-            ((delay_manager_->TargetLevel() * packet_length_samples_) >> 8) *
-                4) {
-      // Time to play this new packet.
-      return kNormal;
+    size_t cur_size_samples =
+        estimate_dtx_delay_
+            ? cur_size_samples = packet_buffer_.GetSpanSamples(
+                  decoder_frame_length, sample_rate_, true)
+            : packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
+    // Target level is in number of packets in Q8.
+    const size_t target_level_samples =
+        (delay_manager_->TargetLevel() * packet_length_samples_) >> 8;
+    const bool generated_enough_noise =
+        static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
+        available_timestamp;
+
+    if (time_stretch_cn_) {
+      const size_t target_threshold_samples =
+          target_level_window_ms_ / 2 * (sample_rate_ / 1000);
+      const bool above_target_window =
+          cur_size_samples > target_level_samples + target_threshold_samples;
+      const bool below_target_window =
+          target_level_samples > target_threshold_samples &&
+          cur_size_samples < target_level_samples - target_threshold_samples;
+      // Keep the delay same as before CNG, but make sure that it is within the
+      // target window.
+      if ((generated_enough_noise && !below_target_window) ||
+          above_target_window) {
+        time_stretched_cn_samples_ = timestamp_leap - generated_noise_samples;
+        return kNormal;
+      }
    } else {
-      // Too early to play this new packet; keep on playing comfort noise.
-      if (prev_mode == kModeRfc3389Cng) {
-        return kRfc3389CngNoPacket;
-      } else {  // prevPlayMode == kModeCodecInternalCng.
-        return kCodecInternalCng;
+      // Keep the same delay as before the CNG, but make sure that the number of
+      // samples in buffer is no higher than 4 times the optimal level.
+      if (generated_enough_noise ||
+          cur_size_samples > target_level_samples * 4) {
+        // Time to play this new packet.
+        return kNormal;
      }
    }
+
+    // Too early to play this new packet; keep on playing comfort noise.
+    if (prev_mode == kModeRfc3389Cng) {
+      return kRfc3389CngNoPacket;
+    }
+    // prevPlayMode == kModeCodecInternalCng.
+    return kCodecInternalCng;
  }
+
  // Do not merge unless we have done an expand before.
  if (prev_mode == kModeExpand) {
    return kMerge;
--- a/modules/audio_coding/neteq/decision_logic.h
+++ b/modules/audio_coding/neteq/decision_logic.h
@ -14,6 +14,7 @@
 #include "modules/audio_coding/neteq/defines.h"
 #include "modules/audio_coding/neteq/tick_timer.h"
 #include "rtc_base/constructor_magic.h"
+#include "rtc_base/experiments/field_trial_parser.h"

 namespace webrtc {

@ -167,7 +168,7 @@ class DecisionLogic final {
  DelayManager* delay_manager_;
  BufferLevelFilter* buffer_level_filter_;
  const TickTimer* tick_timer_;
-  int fs_mult_;
+  int sample_rate_;
  size_t output_size_samples_;
  CngState cng_state_;  // Remember if comfort noise is interrupted by other
                        // event (e.g., DTMF).
@ -178,6 +179,10 @@ class DecisionLogic final {
  bool disallow_time_stretching_;
  std::unique_ptr<TickTimer::Countdown> timescale_countdown_;
  int num_consecutive_expands_;
+  int time_stretched_cn_samples_;
+  FieldTrialParameter<bool> estimate_dtx_delay_;
+  FieldTrialParameter<bool> time_stretch_cn_;
+  FieldTrialConstrained<int> target_level_window_ms_;

  RTC_DISALLOW_COPY_AND_ASSIGN(DecisionLogic);
 };
--- a/modules/audio_coding/neteq/packet_buffer.cc
+++ b/modules/audio_coding/neteq/packet_buffer.cc
@ -26,6 +26,7 @@
 #include "modules/audio_coding/neteq/tick_timer.h"
 #include "rtc_base/checks.h"
 #include "rtc_base/logging.h"
+#include "rtc_base/numerics/safe_conversions.h"

 namespace webrtc {
 namespace {
@ -287,14 +288,22 @@ size_t PacketBuffer::NumSamplesInBuffer(size_t last_decoded_length) const {
  return num_samples;
 }

-size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length) const {
+size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length,
+                                    size_t sample_rate,
+                                    bool count_dtx_waiting_time) const {
  if (buffer_.size() == 0) {
    return 0;
  }

  size_t span = buffer_.back().timestamp - buffer_.front().timestamp;
  if (buffer_.back().frame && buffer_.back().frame->Duration() > 0) {
-    span += buffer_.back().frame->Duration();
+    size_t duration = buffer_.back().frame->Duration();
+    if (count_dtx_waiting_time && buffer_.back().frame->IsDtxPacket()) {
+      size_t waiting_time_samples = rtc::dchecked_cast<size_t>(
+          buffer_.back().waiting_time->ElapsedMs() * (sample_rate / 1000));
+      duration = std::max(duration, waiting_time_samples);
+    }
+    span += duration;
  } else {
    span += last_decoded_length;
  }
--- a/modules/audio_coding/neteq/packet_buffer.h
+++ b/modules/audio_coding/neteq/packet_buffer.h
@ -123,7 +123,9 @@ class PacketBuffer {

  // Returns the total duration in samples that the packets in the buffer spans
  // across.
-  virtual size_t GetSpanSamples(size_t last_decoded_length) const;
+  virtual size_t GetSpanSamples(size_t last_decoded_length,
+                                size_t sample_rate,
+                                bool count_dtx_waiting_time) const;

  // Returns true if the packet buffer contains any DTX or CNG packets.
  virtual bool ContainsDtxOrCngPacket(