opus: take SILK VAD result into account for voice detection

BUG=webrtc:11643
Change-Id: Idc3a9b6bb7bd1a33f905843e5d6067ae19d5172c
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176508
Commit-Queue: Minyue Li <minyue@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#31743}
2020-07-16 09:47:24 +02:00
parent 3592839896
commit 686a3709ac
5 changed files with 60 additions and 23 deletions
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
@@ -367,8 +367,7 @@ AudioEncoderOpusImpl::AudioEncoderOpusImpl(
      inst_(nullptr),
      packet_loss_fraction_smoother_(new PacketLossFractionSmoother()),
      audio_network_adaptor_creator_(audio_network_adaptor_creator),
-      bitrate_smoother_(std::move(bitrate_smoother)),
-      consecutive_dtx_frames_(0) {
+      bitrate_smoother_(std::move(bitrate_smoother)) {
  RTC_DCHECK(0 <= payload_type && payload_type <= 127);

  // Sanity check of the redundant payload type field that we want to get rid
@@ -590,6 +589,7 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl(
               Num10msFramesPerPacket() * SamplesPer10msFrame());

  const size_t max_encoded_bytes = SufficientOutputBufferSize();
+  const size_t start_offset_bytes = encoded->size();
  EncodedInfo info;
  info.encoded_bytes = encoded->AppendData(
      max_encoded_bytes, [&](rtc::ArrayView<uint8_t> encoded) {
@@ -604,8 +604,6 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl(
      });
  input_buffer_.clear();

-  bool dtx_frame = (info.encoded_bytes <= 2);
-
  // Will use new packet size for next encoding.
  config_.frame_size_ms = next_frame_length_ms_;

@@ -620,14 +618,18 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl(
  info.encoded_timestamp = first_timestamp_in_buffer_;
  info.payload_type = payload_type_;
  info.send_even_if_empty = true;  // Allows Opus to send empty packets.
-  // After 20 DTX frames (MAX_CONSECUTIVE_DTX) Opus will send a frame
-  // coding the background noise. Avoid flagging this frame as speech
-  // (even though there is a probability of the frame being speech).
-  info.speech = !dtx_frame && (consecutive_dtx_frames_ != 20);
  info.encoder_type = CodecType::kOpus;

-  // Increase or reset DTX counter.
-  consecutive_dtx_frames_ = (dtx_frame) ? (consecutive_dtx_frames_ + 1) : (0);
+  // Extract the VAD result from the encoded packet.
+  int has_voice = WebRtcOpus_PacketHasVoiceActivity(
+      &encoded->data()[start_offset_bytes], info.encoded_bytes);
+  if (has_voice == -1) {
+    // CELT mode packet or there was an error. This had set the speech flag to
+    // true historically.
+    info.speech = true;
+  } else {
+    info.speech = has_voice;
+  }

  return info;
 }