Improve buffer level estimation with DTX and add CNG time stretching.

The functionality is hidden behind field trial for experimentation.

Bug: webrtc:10736
Change-Id: I1daf60966717c3ea43bf6ee16d190290ab740ce7
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/144059
Commit-Queue: Jakob Ivarsson <jakobi@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#28474}
This commit is contained in:
Jakob Ivarsson
2019-07-03 16:00:30 +02:00
committed by Commit Bot
parent 3d642f8442
commit 46dda83bcb
5 changed files with 100 additions and 35 deletions

View File

@ -1029,6 +1029,7 @@ rtc_static_library("neteq") {
"../../rtc_base:rtc_base_approved",
"../../rtc_base:safe_minmax",
"../../rtc_base:sanitizer",
"../../rtc_base/experiments:field_trial_parser",
"../../rtc_base/system:fallthrough",
"../../system_wrappers",
"../../system_wrappers:field_trial",

View File

@ -14,6 +14,7 @@
#include <stdio.h>
#include <string>
#include "absl/types/optional.h"
#include "modules/audio_coding/neteq/buffer_level_filter.h"
#include "modules/audio_coding/neteq/decoder_database.h"
#include "modules/audio_coding/neteq/delay_manager.h"
@ -21,12 +22,15 @@
#include "modules/audio_coding/neteq/packet_buffer.h"
#include "modules/audio_coding/neteq/sync_buffer.h"
#include "rtc_base/checks.h"
#include "rtc_base/experiments/field_trial_parser.h"
#include "rtc_base/logging.h"
#include "rtc_base/numerics/safe_conversions.h"
#include "system_wrappers/include/field_trial.h"
namespace {
constexpr int kPostponeDecodingLevel = 50;
constexpr int kDefaultTargetLevelWindowMs = 100;
} // namespace
@ -65,8 +69,24 @@ DecisionLogic::DecisionLogic(int fs_hz,
disallow_time_stretching_(disallow_time_stretching),
timescale_countdown_(
tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1)),
num_consecutive_expands_(0) {
num_consecutive_expands_(0),
time_stretched_cn_samples_(0),
estimate_dtx_delay_("estimate_dtx_delay", false),
time_stretch_cn_("time_stretch_cn", false),
target_level_window_ms_("target_level_window",
kDefaultTargetLevelWindowMs,
0,
absl::nullopt) {
SetSampleRate(fs_hz, output_size_samples);
const std::string field_trial_name =
field_trial::FindFullName("WebRTC-Audio-NetEqDecisionLogicSettings");
ParseFieldTrial(
{&estimate_dtx_delay_, &time_stretch_cn_, &target_level_window_ms_},
field_trial_name);
RTC_LOG(LS_INFO) << "NetEq decision logic settings:"
<< " estimate_dtx_delay=" << estimate_dtx_delay_
<< " time_stretch_cn=" << time_stretch_cn_
<< " target_level_window_ms=" << target_level_window_ms_;
}
DecisionLogic::~DecisionLogic() = default;
@ -79,6 +99,7 @@ void DecisionLogic::Reset() {
prev_time_scale_ = false;
timescale_countdown_.reset();
num_consecutive_expands_ = 0;
time_stretched_cn_samples_ = 0;
}
void DecisionLogic::SoftReset() {
@ -87,12 +108,13 @@ void DecisionLogic::SoftReset() {
prev_time_scale_ = false;
timescale_countdown_ =
tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1);
time_stretched_cn_samples_ = 0;
}
void DecisionLogic::SetSampleRate(int fs_hz, size_t output_size_samples) {
// TODO(hlundin): Change to an enumerator and skip assert.
assert(fs_hz == 8000 || fs_hz == 16000 || fs_hz == 32000 || fs_hz == 48000);
fs_mult_ = fs_hz / 8000;
sample_rate_ = fs_hz;
output_size_samples_ = output_size_samples;
}
@ -113,9 +135,11 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
cng_state_ = kCngInternalOn;
}
// TODO(jakobi): Use buffer span instead of num samples.
const size_t cur_size_samples =
packet_buffer_.NumSamplesInBuffer(decoder_frame_length);
size_t cur_size_samples =
estimate_dtx_delay_
? packet_buffer_.GetSpanSamples(decoder_frame_length, sample_rate_,
true)
: packet_buffer_.NumSamplesInBuffer(decoder_frame_length);
prev_time_scale_ =
prev_time_scale_ && (prev_mode == kModeAccelerateSuccess ||
@ -125,9 +149,9 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
// Do not update buffer history if currently playing CNG since it will bias
// the filtered buffer level.
if ((prev_mode != kModeRfc3389Cng) && (prev_mode != kModeCodecInternalCng) &&
if (prev_mode != kModeRfc3389Cng && prev_mode != kModeCodecInternalCng &&
!(next_packet && next_packet->frame &&
next_packet->frame->IsDtxPacket())) {
next_packet->frame->IsDtxPacket() && !estimate_dtx_delay_)) {
FilterBufferLevel(cur_size_samples);
}
@ -173,7 +197,8 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
// if the mute factor is low enough (otherwise the expansion was short enough
// to not be noticable).
// Note that the MuteFactor is in Q14, so a value of 16384 corresponds to 1.
size_t current_span = packet_buffer_.GetSpanSamples(decoder_frame_length);
size_t current_span = packet_buffer_.GetSpanSamples(
decoder_frame_length, sample_rate_, estimate_dtx_delay_);
if ((prev_mode == kModeExpand || prev_mode == kModeCodecPlc) &&
expand.MuteFactor(0) < 16384 / 2 &&
current_span < static_cast<size_t>(delay_manager_->TargetLevel() *
@ -183,8 +208,7 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
return kExpand;
}
const uint32_t five_seconds_samples =
static_cast<uint32_t>(5 * 8000 * fs_mult_);
const uint32_t five_seconds_samples = static_cast<uint32_t>(5 * sample_rate_);
// Check if the required packet is available.
if (target_timestamp == available_timestamp) {
return ExpectedPacketAvailable(prev_mode, play_dtmf);
@ -212,14 +236,15 @@ void DecisionLogic::FilterBufferLevel(size_t buffer_size_samples) {
buffer_level_filter_->SetTargetBufferLevel(
delay_manager_->base_target_level());
int sample_memory_local = 0;
int time_stretched_samples = time_stretched_cn_samples_;
if (prev_time_scale_) {
sample_memory_local = sample_memory_;
time_stretched_samples += sample_memory_;
timescale_countdown_ = tick_timer_->GetNewCountdown(kMinTimescaleInterval);
}
buffer_level_filter_->Update(buffer_size_samples, sample_memory_local);
buffer_level_filter_->Update(buffer_size_samples, time_stretched_samples);
prev_time_scale_ = false;
time_stretched_cn_samples_ = 0;
}
Operations DecisionLogic::CngOperation(Modes prev_mode,
@ -323,30 +348,53 @@ Operations DecisionLogic::FuturePacketAvailable(
return kNormal;
}
const size_t cur_size_samples =
packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
// If previous was comfort noise, then no merge is needed.
if (prev_mode == kModeRfc3389Cng || prev_mode == kModeCodecInternalCng) {
// Keep the same delay as before the CNG, but make sure that the number of
// samples in buffer is no higher than 4 times the optimal level. (Note that
// TargetLevel() is in Q8.)
if (static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
available_timestamp ||
cur_size_samples >
((delay_manager_->TargetLevel() * packet_length_samples_) >> 8) *
4) {
// Time to play this new packet.
return kNormal;
size_t cur_size_samples =
estimate_dtx_delay_
? cur_size_samples = packet_buffer_.GetSpanSamples(
decoder_frame_length, sample_rate_, true)
: packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
// Target level is in number of packets in Q8.
const size_t target_level_samples =
(delay_manager_->TargetLevel() * packet_length_samples_) >> 8;
const bool generated_enough_noise =
static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
available_timestamp;
if (time_stretch_cn_) {
const size_t target_threshold_samples =
target_level_window_ms_ / 2 * (sample_rate_ / 1000);
const bool above_target_window =
cur_size_samples > target_level_samples + target_threshold_samples;
const bool below_target_window =
target_level_samples > target_threshold_samples &&
cur_size_samples < target_level_samples - target_threshold_samples;
// Keep the delay same as before CNG, but make sure that it is within the
// target window.
if ((generated_enough_noise && !below_target_window) ||
above_target_window) {
time_stretched_cn_samples_ = timestamp_leap - generated_noise_samples;
return kNormal;
}
} else {
// Too early to play this new packet; keep on playing comfort noise.
if (prev_mode == kModeRfc3389Cng) {
return kRfc3389CngNoPacket;
} else { // prevPlayMode == kModeCodecInternalCng.
return kCodecInternalCng;
// Keep the same delay as before the CNG, but make sure that the number of
// samples in buffer is no higher than 4 times the optimal level.
if (generated_enough_noise ||
cur_size_samples > target_level_samples * 4) {
// Time to play this new packet.
return kNormal;
}
}
// Too early to play this new packet; keep on playing comfort noise.
if (prev_mode == kModeRfc3389Cng) {
return kRfc3389CngNoPacket;
}
// prevPlayMode == kModeCodecInternalCng.
return kCodecInternalCng;
}
// Do not merge unless we have done an expand before.
if (prev_mode == kModeExpand) {
return kMerge;

View File

@ -14,6 +14,7 @@
#include "modules/audio_coding/neteq/defines.h"
#include "modules/audio_coding/neteq/tick_timer.h"
#include "rtc_base/constructor_magic.h"
#include "rtc_base/experiments/field_trial_parser.h"
namespace webrtc {
@ -167,7 +168,7 @@ class DecisionLogic final {
DelayManager* delay_manager_;
BufferLevelFilter* buffer_level_filter_;
const TickTimer* tick_timer_;
int fs_mult_;
int sample_rate_;
size_t output_size_samples_;
CngState cng_state_; // Remember if comfort noise is interrupted by other
// event (e.g., DTMF).
@ -178,6 +179,10 @@ class DecisionLogic final {
bool disallow_time_stretching_;
std::unique_ptr<TickTimer::Countdown> timescale_countdown_;
int num_consecutive_expands_;
int time_stretched_cn_samples_;
FieldTrialParameter<bool> estimate_dtx_delay_;
FieldTrialParameter<bool> time_stretch_cn_;
FieldTrialConstrained<int> target_level_window_ms_;
RTC_DISALLOW_COPY_AND_ASSIGN(DecisionLogic);
};

View File

@ -26,6 +26,7 @@
#include "modules/audio_coding/neteq/tick_timer.h"
#include "rtc_base/checks.h"
#include "rtc_base/logging.h"
#include "rtc_base/numerics/safe_conversions.h"
namespace webrtc {
namespace {
@ -287,14 +288,22 @@ size_t PacketBuffer::NumSamplesInBuffer(size_t last_decoded_length) const {
return num_samples;
}
size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length) const {
size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length,
size_t sample_rate,
bool count_dtx_waiting_time) const {
if (buffer_.size() == 0) {
return 0;
}
size_t span = buffer_.back().timestamp - buffer_.front().timestamp;
if (buffer_.back().frame && buffer_.back().frame->Duration() > 0) {
span += buffer_.back().frame->Duration();
size_t duration = buffer_.back().frame->Duration();
if (count_dtx_waiting_time && buffer_.back().frame->IsDtxPacket()) {
size_t waiting_time_samples = rtc::dchecked_cast<size_t>(
buffer_.back().waiting_time->ElapsedMs() * (sample_rate / 1000));
duration = std::max(duration, waiting_time_samples);
}
span += duration;
} else {
span += last_decoded_length;
}

View File

@ -123,7 +123,9 @@ class PacketBuffer {
// Returns the total duration in samples that the packets in the buffer spans
// across.
virtual size_t GetSpanSamples(size_t last_decoded_length) const;
virtual size_t GetSpanSamples(size_t last_decoded_length,
size_t sample_rate,
bool count_dtx_waiting_time) const;
// Returns true if the packet buffer contains any DTX or CNG packets.
virtual bool ContainsDtxOrCngPacket(