Avoid flagging Opus DTX frames as speech.

Background: After 20 consecutive DTX frames, Opus encodes the background noise in a normal frame and then goes back to outputting DTX frames. Currently all Opus frames are flagged as containing speech. This CL has two effects on outgoing Opus packets: 1. DTX frames are flagged as non-speech. 2. A non-DTX frame that follows 20 consecutive DTX frames is flagged as non-speech. Bug: webrtc:8088 Change-Id: Ic36cf8c9d0a34f55ed4e57858362ad91e3897dda Reviewed-on: https://webrtc-review.googlesource.com/23760 Commit-Queue: Gustaf Ullberg <gustaf@webrtc.org> Reviewed-by: Henrik Lundin <henrik.lundin@webrtc.org> Cr-Commit-Position: refs/heads/master@{#20794}
2017-11-20 14:55:41 +01:00
parent e5b5f4638d
commit 36de62e830
4 changed files with 78 additions and 2 deletions
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
@ -380,7 +380,8 @@ AudioEncoderOpusImpl::AudioEncoderOpusImpl(
      inst_(nullptr),
      packet_loss_fraction_smoother_(new PacketLossFractionSmoother()),
      audio_network_adaptor_creator_(audio_network_adaptor_creator),
-      bitrate_smoother_(std::move(bitrate_smoother)) {
+      bitrate_smoother_(std::move(bitrate_smoother)),
+      consecutive_dtx_frames_(0) {
  RTC_DCHECK(0 <= payload_type && payload_type <= 127);

  // Sanity check of the redundant payload type field that we want to get rid
@ -603,14 +604,23 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl(
          });
  input_buffer_.clear();

+  bool dtx_frame = (info.encoded_bytes <= 2);
+
  // Will use new packet size for next encoding.
  config_.frame_size_ms = next_frame_length_ms_;

  info.encoded_timestamp = first_timestamp_in_buffer_;
  info.payload_type = payload_type_;
  info.send_even_if_empty = true;  // Allows Opus to send empty packets.
-  info.speech = (info.encoded_bytes > 0);
+  // After 20 DTX frames (MAX_CONSECUTIVE_DTX) Opus will send a frame
+  // coding the background noise. Avoid flagging this frame as speech
+  // (even though there is a probability of the frame being speech).
+  info.speech = !dtx_frame && (consecutive_dtx_frames_ != 20);
  info.encoder_type = CodecType::kOpus;
+
+  // Increase or reset DTX counter.
+  consecutive_dtx_frames_ = (dtx_frame) ? (consecutive_dtx_frames_ + 1) : (0);
+
  return info;
 }

--- a/modules/audio_coding/codecs/opus/audio_encoder_opus.h
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.h
@ -161,6 +161,7 @@ class AudioEncoderOpusImpl final : public AudioEncoder {
  rtc::Optional<size_t> overhead_bytes_per_packet_;
  const std::unique_ptr<SmoothingFilter> bitrate_smoother_;
  rtc::Optional<int64_t> bitrate_smoother_last_update_time_;
+  int consecutive_dtx_frames_;

  friend struct AudioEncoderOpus;
  RTC_DISALLOW_COPY_AND_ASSIGN(AudioEncoderOpusImpl);
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus_unittest.cc
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus_unittest.cc
@ -753,4 +753,66 @@ TEST(AudioEncoderOpusTest, SetMaxPlaybackRateFb) {
  EXPECT_EQ(64000, config.bitrate_bps);
 }

+TEST(AudioEncoderOpusTest, OpusFlagDtxAsNonSpeech) {
+  // Create encoder with DTX enabled. The payload type is not checked below.
+  AudioEncoderOpusConfig config;
+  config.dtx_enabled = true;
+  constexpr int payload_type = 17;
+  const auto encoder = AudioEncoderOpus::MakeAudioEncoder(config, payload_type);
+
+  // Locate the input file, which contains speech and silence.
+  const std::string kInputFileName =
+      webrtc::test::ResourcePath("audio_coding/testfile32kHz", "pcm");
+  test::AudioLoop audio_loop;
+  // Use the 32 kHz file as if it were sampled at 48 kHz (the encoder's rate).
+  constexpr int kSampleRateHz = 48000;
+  EXPECT_EQ(kSampleRateHz, encoder->SampleRateHz());
+  constexpr size_t kMaxLoopLengthSamples =
+      kSampleRateHz * 10;  // Max 10 second loop.
+  constexpr size_t kInputBlockSizeSamples =
+      10 * kSampleRateHz / 1000;  // 10 ms per block.
+  EXPECT_TRUE(audio_loop.Init(kInputFileName, kMaxLoopLengthSamples,
+                              kInputBlockSizeSamples));
+
+  // Encode, tracking the longest runs of DTX and of non-speech packets.
+  AudioEncoder::EncodedInfo info;
+  rtc::Buffer encoded(500);
+  int nonspeech_frames = 0;
+  int max_nonspeech_frames = 0;
+  int dtx_frames = 0;
+  int max_dtx_frames = 0;
+  uint32_t rtp_timestamp = 0u;
+  for (size_t i = 0; i < 500; ++i) {
+    encoded.Clear();
+
+    // Each Opus packet takes two Encode() calls; info holds the second result.
+    for (int j = 0; j < 2; j++) {
+      info =
+          encoder->Encode(rtp_timestamp, audio_loop.GetNextBlock(), &encoded);
+      rtp_timestamp += kInputBlockSizeSamples;
+    }
+
+    // Track runs of DTX packets (<= 2 bytes). max_dtx_frames is not asserted.
+    if (info.encoded_bytes <= 2) {
+      ++dtx_frames;
+    } else {
+      if (dtx_frames > max_dtx_frames)
+        max_dtx_frames = dtx_frames;
+      dtx_frames = 0;
+    }
+
+    // Track runs of non-speech packets; a run is recorded when speech resumes.
+    if (info.speech == 0) {
+      ++nonspeech_frames;
+    } else {
+      if (nonspeech_frames > max_nonspeech_frames)
+        max_nonspeech_frames = nonspeech_frames;
+      nonspeech_frames = 0;
+    }
+  }
+
+  // The noise frame after 20 DTX frames is also non-speech, so expect > 20.
+  EXPECT_GT(max_nonspeech_frames, 20);
+}
+
 }  // namespace webrtc