Avoid noise pumping during DTX regions by forwarding only those refresh DTX packets that decrease the comfort noise level at the decoder.

Bug: webrtc:12380 Change-Id: I60e4684150cb4880224f402a9bf42a72811863b3 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/202920 Commit-Queue: Jesus de Vicente Pena <devicentepena@webrtc.org> Reviewed-by: Minyue Li <minyue@webrtc.org> Reviewed-by: Henrik Lundin <henrik.lundin@webrtc.org> Cr-Commit-Position: refs/heads/master@{#33174}
2021-02-05 09:05:46 +01:00
parent 483b31c231
commit 3b9abd8dee
5 changed files with 180 additions and 3 deletions
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus_unittest.cc
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus_unittest.cc
@ -809,4 +809,90 @@ TEST_P(AudioEncoderOpusTest, OpusFlagDtxAsNonSpeech) {
  EXPECT_GT(max_nonspeech_frames, 15);
 }

+// Verifies that, with the "WebRTC-Audio-OpusAvoidNoisePumpingDuringDtx" field
+// trial enabled, an encoder that has entered DTX only emits non-empty
+// (refresh) packets while the input level decreases and none while the input
+// level increases.
+TEST(AudioEncoderOpusTest, OpusDtxFilteringHighEnergyRefreshPackets) {
+  test::ScopedFieldTrials override_field_trials(
+      "WebRTC-Audio-OpusAvoidNoisePumpingDuringDtx/Enabled/");
+  const std::string kInputFileName =
+      webrtc::test::ResourcePath("audio_coding/testfile16kHz", "pcm");
+  constexpr int kSampleRateHz = 16000;
+  AudioEncoderOpusConfig config;
+  config.dtx_enabled = true;
+  config.sample_rate_hz = kSampleRateHz;
+  constexpr int kPayloadType = 17;
+  const auto encoder = AudioEncoderOpus::MakeAudioEncoder(config, kPayloadType);
+  test::AudioLoop audio_loop;
+  constexpr size_t kMaxLoopLengthSamples = kSampleRateHz * 11.6f;
+  constexpr size_t kInputBlockSizeSamples = kSampleRateHz / 100;
+  // Nothing below is meaningful without input audio, so stop here on failure.
+  ASSERT_TRUE(audio_loop.Init(kInputFileName, kMaxLoopLengthSamples,
+                              kInputBlockSizeSamples));
+  AudioEncoder::EncodedInfo info;
+  rtc::Buffer encoded(500);
+  // Encode the audio file and store the last part that corresponds to silence.
+  constexpr size_t kSilenceDurationSamples = kSampleRateHz * 0.2f;
+  // Value-initialized so that no indeterminate samples can ever be read.
+  std::array<int16_t, kSilenceDurationSamples> silence{};
+  uint32_t rtp_timestamp = 0;
+  bool opus_entered_dtx = false;
+  bool silence_filled = false;
+  size_t timestamp_start_silence = 0;
+  while (!silence_filled && rtp_timestamp < kMaxLoopLengthSamples) {
+    encoded.Clear();
+    // Every second call to the encoder will generate an Opus packet.
+    for (int j = 0; j < 2; j++) {
+      auto next_frame = audio_loop.GetNextBlock();
+      info = encoder->Encode(rtp_timestamp, next_frame, &encoded);
+      if (opus_entered_dtx) {
+        // Offset of this block relative to the start of the DTX region.
+        size_t silence_frame_start = rtp_timestamp - timestamp_start_silence;
+        silence_filled = silence_frame_start >= kSilenceDurationSamples;
+        if (!silence_filled) {
+          std::copy(next_frame.begin(), next_frame.end(),
+                    silence.begin() + silence_frame_start);
+        }
+      }
+      rtp_timestamp += kInputBlockSizeSamples;
+    }
+    // A packet shorter than 2 bytes is taken as the encoder being in DTX; the
+    // silence capture (re)starts at the first such packet.
+    if (info.encoded_bytes < 2 && !opus_entered_dtx) {
+      timestamp_start_silence = rtp_timestamp;
+    }
+    opus_entered_dtx = info.encoded_bytes < 2;
+  }
+
+  // The bursts below are built from `silence`; abort if it was not captured.
+  ASSERT_TRUE(silence_filled);
+  // The copied 200 ms of silence is used for creating 6 bursts that are fed to
+  // the encoder, the first three with a larger energy (gain 1.4) and the last
+  // three with a lower energy (gain 0). This test verifies that the encoder
+  // only sends refresh DTX packets during the last bursts.
+  int number_non_empty_packets_during_increase = 0;
+  int number_non_empty_packets_during_decrease = 0;
+  for (size_t burst = 0; burst < 6; ++burst) {
+    uint32_t rtp_timestamp_start = rtp_timestamp;
+    const bool increase_noise = burst < 3;
+    const float gain = increase_noise ? 1.4f : 0.0f;
+    while (rtp_timestamp < rtp_timestamp_start + kSilenceDurationSamples) {
+      encoded.Clear();
+      // Every second call to the encoder will generate an Opus packet.
+      for (int j = 0; j < 2; j++) {
+        std::array<int16_t, kInputBlockSizeSamples> silence_frame;
+        size_t silence_frame_start = rtp_timestamp - rtp_timestamp_start;
+        std::transform(
+            silence.begin() + silence_frame_start,
+            silence.begin() + silence_frame_start + kInputBlockSizeSamples,
+            silence_frame.begin(),
+            [gain](int16_t s) { return static_cast<int16_t>(gain * s); });
+        info = encoder->Encode(rtp_timestamp, silence_frame, &encoded);
+        rtp_timestamp += kInputBlockSizeSamples;
+      }
+      // Tracking the number of non empty packets.
+      if (increase_noise && info.encoded_bytes > 2) {
+        number_non_empty_packets_during_increase++;
+      }
+      if (!increase_noise && info.encoded_bytes > 2) {
+        number_non_empty_packets_during_decrease++;
+      }
+    }
+  }
+  // Check that the refresh DTX packets are just sent during the decrease energy
+  // region.
+  EXPECT_EQ(number_non_empty_packets_during_increase, 0);
+  EXPECT_GT(number_non_empty_packets_during_decrease, 0);
+}
+
 }  // namespace webrtc
--- a/modules/audio_coding/codecs/opus/opus_inst.h
+++ b/modules/audio_coding/codecs/opus/opus_inst.h
@ -25,6 +25,9 @@ struct WebRtcOpusEncInst {
  OpusMSEncoder* multistream_encoder;
  size_t channels;
  int in_dtx_mode;
+  bool avoid_noise_pumping_during_dtx;
+  int sample_rate_hz;
+  float smooth_energy_non_active_frames;
 };

 struct WebRtcOpusDecInst {
--- a/modules/audio_coding/codecs/opus/opus_interface.cc
+++ b/modules/audio_coding/codecs/opus/opus_interface.cc
@ -12,6 +12,9 @@

 #include <cstdlib>

+#include <numeric>
+
+#include "api/array_view.h"
 #include "rtc_base/checks.h"
 #include "system_wrappers/include/field_trial.h"

@ -36,6 +39,9 @@ enum {
 constexpr char kPlcUsePrevDecodedSamplesFieldTrial[] =
    "WebRTC-Audio-OpusPlcUsePrevDecodedSamples";

+// Field trial that, when enabled, makes encoders created through
+// WebRtcOpus_EncoderCreate() drop high energy refresh DTX packets while DTX is
+// in use.
+constexpr char kAvoidNoisePumpingDuringDtxFieldTrial[] =
+    "WebRTC-Audio-OpusAvoidNoisePumpingDuringDtx";
+
 static int FrameSizePerChannel(int frame_size_ms, int sample_rate_hz) {
  RTC_DCHECK_GT(frame_size_ms, 0);
  RTC_DCHECK_EQ(frame_size_ms % 10, 0);
@ -54,6 +60,46 @@ static int DefaultFrameSizePerChannel(int sample_rate_hz) {
  return FrameSizePerChannel(20, sample_rate_hz);
 }

+// Tells whether `encoded` is a refresh DTX packet whose input `frame` carries
+// more energy than expected for non-active audio.
+//
+// Payloads of at most 2 bytes, inputs shorter than one default-sized frame and
+// payloads for which WebRtcOpus_PacketHasVoiceActivity() does not return 0 are
+// never flagged and leave the encoder state untouched. For any other payload
+// that is not flagged, `inst->smooth_energy_non_active_frames` is updated, so
+// this function is not a pure query.
+static bool WebRtcOpus_IsHighEnergyRefreshDtxPacket(
+    OpusEncInst* inst,
+    rtc::ArrayView<const int16_t> frame,
+    rtc::ArrayView<const uint8_t> encoded) {
+  if (encoded.size() <= 2) {
+    return false;
+  }
+  const int frames_in_input =
+      frame.size() / DefaultFrameSizePerChannel(inst->sample_rate_hz);
+  if (frames_in_input <= 0) {
+    return false;
+  }
+  const int vad_result =
+      WebRtcOpus_PacketHasVoiceActivity(encoded.data(), encoded.size());
+  if (vad_result != 0) {
+    return false;
+  }
+
+  // Sum of squared samples, normalized by the number of frames in the input.
+  float energy_sum = 0.0f;
+  for (const int32_t sample : frame) {
+    energy_sum = energy_sum + sample * sample;
+  }
+  const float average_frame_energy = energy_sum / frames_in_input;
+
+  // Half of the tracked energy, i.e. a 3 dB negative margin.
+  const float energy_threshold = inst->smooth_energy_non_active_frames * 0.5f;
+  const bool encoder_in_dtx = WebRtcOpus_GetInDtx(inst) == 1;
+  if (encoder_in_dtx && average_frame_energy >= energy_threshold) {
+    // The encoder is in DTX and still produced a payload > 2 bytes, so this is
+    // a refresh DTX packet. Its energy is not below the threshold and,
+    // therefore, it is flagged as a high energy refresh DTX packet.
+    return true;
+  }
+
+  // The average energy is tracked in a similar way as the modeling of the
+  // comfort noise in the Silk decoder in Opus
+  // (third_party/opus/src/silk/CNG.c): drops below the threshold are adopted
+  // immediately, anything else is smoothed in with a factor of 0.25.
+  inst->smooth_energy_non_active_frames =
+      average_frame_energy < energy_threshold
+          ? average_frame_energy
+          : inst->smooth_energy_non_active_frames +
+                (average_frame_energy -
+                 inst->smooth_energy_non_active_frames) *
+                    0.25f;
+  return false;
+}
+
 int16_t WebRtcOpus_EncoderCreate(OpusEncInst** inst,
                                 size_t channels,
                                 int32_t application,
@ -88,6 +134,10 @@ int16_t WebRtcOpus_EncoderCreate(OpusEncInst** inst,

  state->in_dtx_mode = 0;
  state->channels = channels;
+  state->sample_rate_hz = sample_rate_hz;
+  state->smooth_energy_non_active_frames = 0.0f;
+  state->avoid_noise_pumping_during_dtx =
+      webrtc::field_trial::IsEnabled(kAvoidNoisePumpingDuringDtxFieldTrial);

  *inst = state;
  return 0;
@ -120,9 +170,10 @@ int16_t WebRtcOpus_MultistreamEncoderCreate(
  RTC_DCHECK(state);

  int error;
-  state->multistream_encoder =
-      opus_multistream_encoder_create(48000, channels, streams, coupled_streams,
-                                      channel_mapping, opus_app, &error);
+  const int sample_rate_hz = 48000;
+  state->multistream_encoder = opus_multistream_encoder_create(
+      sample_rate_hz, channels, streams, coupled_streams, channel_mapping,
+      opus_app, &error);

  if (error != OPUS_OK || (!state->encoder && !state->multistream_encoder)) {
    WebRtcOpus_EncoderFree(state);
@ -131,6 +182,9 @@ int16_t WebRtcOpus_MultistreamEncoderCreate(

  state->in_dtx_mode = 0;
  state->channels = channels;
+  state->sample_rate_hz = sample_rate_hz;
+  state->smooth_energy_non_active_frames = 0.0f;
+  state->avoid_noise_pumping_during_dtx = false;

  *inst = state;
  return 0;
@ -188,6 +242,15 @@ int WebRtcOpus_Encode(OpusEncInst* inst,
    }
  }

+  if (inst->avoid_noise_pumping_during_dtx && WebRtcOpus_GetUseDtx(inst) == 1 &&
+      WebRtcOpus_IsHighEnergyRefreshDtxPacket(
+          inst, rtc::MakeArrayView(audio_in, samples),
+          rtc::MakeArrayView(encoded, res))) {
+    // This packet is a high energy refresh DTX packet. For avoiding an increase
+    // of the energy in the DTX region at the decoder, this packet is dropped.
+    inst->in_dtx_mode = 0;
+    return 0;
+  }
  inst->in_dtx_mode = 0;
  return res;
 }
@ -316,6 +379,16 @@ int16_t WebRtcOpus_DisableDtx(OpusEncInst* inst) {
  }
 }

+// Reads the DTX setting from the encoder behind `inst`. Returns the value
+// reported by OPUS_GET_DTX, or -1 when `inst` is null or the query fails.
+int16_t WebRtcOpus_GetUseDtx(OpusEncInst* inst) {
+  if (!inst) {
+    return -1;
+  }
+  opus_int32 dtx_setting;
+  const int status = ENCODER_CTL(inst, OPUS_GET_DTX(&dtx_setting));
+  if (status != 0) {
+    return -1;
+  }
+  return dtx_setting;
+}
+
 int16_t WebRtcOpus_EnableCbr(OpusEncInst* inst) {
  if (inst) {
    return ENCODER_CTL(inst, OPUS_SET_VBR(0));
--- a/modules/audio_coding/codecs/opus/opus_interface.h
+++ b/modules/audio_coding/codecs/opus/opus_interface.h
@ -231,6 +231,20 @@ int16_t WebRtcOpus_EnableDtx(OpusEncInst* inst);
 */
 int16_t WebRtcOpus_DisableDtx(OpusEncInst* inst);

+/****************************************************************************
+ * WebRtcOpus_GetUseDtx()
+ *
+ * This function queries the encoder for the DTX configuration it currently uses.
+ *
+ * Input:
+ *      - inst               : Encoder context
+ *
+ * Return value              :  0 - Encoder does not use DTX.
+ *                              1 - Encoder uses DTX.
+ *                             -1 - Error (null `inst` or the query failed).
+ */
+int16_t WebRtcOpus_GetUseDtx(OpusEncInst* inst);
+
 /****************************************************************************
 * WebRtcOpus_EnableCbr()
 *