Improve buffer level estimation with DTX and add CNG time stretching.
The functionality is hidden behind a field trial for experimentation. Bug: webrtc:10736 Change-Id: I1daf60966717c3ea43bf6ee16d190290ab740ce7 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/144059 Commit-Queue: Jakob Ivarsson <jakobi@webrtc.org> Reviewed-by: Minyue Li <minyue@webrtc.org> Cr-Commit-Position: refs/heads/master@{#28474}
This commit is contained in:
committed by
Commit Bot
parent
3d642f8442
commit
46dda83bcb
@ -1029,6 +1029,7 @@ rtc_static_library("neteq") {
|
||||
"../../rtc_base:rtc_base_approved",
|
||||
"../../rtc_base:safe_minmax",
|
||||
"../../rtc_base:sanitizer",
|
||||
"../../rtc_base/experiments:field_trial_parser",
|
||||
"../../rtc_base/system:fallthrough",
|
||||
"../../system_wrappers",
|
||||
"../../system_wrappers:field_trial",
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
|
||||
#include "absl/types/optional.h"
|
||||
#include "modules/audio_coding/neteq/buffer_level_filter.h"
|
||||
#include "modules/audio_coding/neteq/decoder_database.h"
|
||||
#include "modules/audio_coding/neteq/delay_manager.h"
|
||||
@ -21,12 +22,15 @@
|
||||
#include "modules/audio_coding/neteq/packet_buffer.h"
|
||||
#include "modules/audio_coding/neteq/sync_buffer.h"
|
||||
#include "rtc_base/checks.h"
|
||||
#include "rtc_base/experiments/field_trial_parser.h"
|
||||
#include "rtc_base/logging.h"
|
||||
#include "rtc_base/numerics/safe_conversions.h"
|
||||
#include "system_wrappers/include/field_trial.h"
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr int kPostponeDecodingLevel = 50;
|
||||
constexpr int kDefaultTargetLevelWindowMs = 100;
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -65,8 +69,24 @@ DecisionLogic::DecisionLogic(int fs_hz,
|
||||
disallow_time_stretching_(disallow_time_stretching),
|
||||
timescale_countdown_(
|
||||
tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1)),
|
||||
num_consecutive_expands_(0) {
|
||||
num_consecutive_expands_(0),
|
||||
time_stretched_cn_samples_(0),
|
||||
estimate_dtx_delay_("estimate_dtx_delay", false),
|
||||
time_stretch_cn_("time_stretch_cn", false),
|
||||
target_level_window_ms_("target_level_window",
|
||||
kDefaultTargetLevelWindowMs,
|
||||
0,
|
||||
absl::nullopt) {
|
||||
SetSampleRate(fs_hz, output_size_samples);
|
||||
const std::string field_trial_name =
|
||||
field_trial::FindFullName("WebRTC-Audio-NetEqDecisionLogicSettings");
|
||||
ParseFieldTrial(
|
||||
{&estimate_dtx_delay_, &time_stretch_cn_, &target_level_window_ms_},
|
||||
field_trial_name);
|
||||
RTC_LOG(LS_INFO) << "NetEq decision logic settings:"
|
||||
<< " estimate_dtx_delay=" << estimate_dtx_delay_
|
||||
<< " time_stretch_cn=" << time_stretch_cn_
|
||||
<< " target_level_window_ms=" << target_level_window_ms_;
|
||||
}
|
||||
|
||||
DecisionLogic::~DecisionLogic() = default;
|
||||
@ -79,6 +99,7 @@ void DecisionLogic::Reset() {
|
||||
prev_time_scale_ = false;
|
||||
timescale_countdown_.reset();
|
||||
num_consecutive_expands_ = 0;
|
||||
time_stretched_cn_samples_ = 0;
|
||||
}
|
||||
|
||||
void DecisionLogic::SoftReset() {
|
||||
@ -87,12 +108,13 @@ void DecisionLogic::SoftReset() {
|
||||
prev_time_scale_ = false;
|
||||
timescale_countdown_ =
|
||||
tick_timer_->GetNewCountdown(kMinTimescaleInterval + 1);
|
||||
time_stretched_cn_samples_ = 0;
|
||||
}
|
||||
|
||||
void DecisionLogic::SetSampleRate(int fs_hz, size_t output_size_samples) {
|
||||
// TODO(hlundin): Change to an enumerator and skip assert.
|
||||
assert(fs_hz == 8000 || fs_hz == 16000 || fs_hz == 32000 || fs_hz == 48000);
|
||||
fs_mult_ = fs_hz / 8000;
|
||||
sample_rate_ = fs_hz;
|
||||
output_size_samples_ = output_size_samples;
|
||||
}
|
||||
|
||||
@ -113,9 +135,11 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
|
||||
cng_state_ = kCngInternalOn;
|
||||
}
|
||||
|
||||
// TODO(jakobi): Use buffer span instead of num samples.
|
||||
const size_t cur_size_samples =
|
||||
packet_buffer_.NumSamplesInBuffer(decoder_frame_length);
|
||||
size_t cur_size_samples =
|
||||
estimate_dtx_delay_
|
||||
? packet_buffer_.GetSpanSamples(decoder_frame_length, sample_rate_,
|
||||
true)
|
||||
: packet_buffer_.NumSamplesInBuffer(decoder_frame_length);
|
||||
|
||||
prev_time_scale_ =
|
||||
prev_time_scale_ && (prev_mode == kModeAccelerateSuccess ||
|
||||
@ -125,9 +149,9 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
|
||||
|
||||
// Do not update buffer history if currently playing CNG since it will bias
|
||||
// the filtered buffer level.
|
||||
if ((prev_mode != kModeRfc3389Cng) && (prev_mode != kModeCodecInternalCng) &&
|
||||
if (prev_mode != kModeRfc3389Cng && prev_mode != kModeCodecInternalCng &&
|
||||
!(next_packet && next_packet->frame &&
|
||||
next_packet->frame->IsDtxPacket())) {
|
||||
next_packet->frame->IsDtxPacket() && !estimate_dtx_delay_)) {
|
||||
FilterBufferLevel(cur_size_samples);
|
||||
}
|
||||
|
||||
@ -173,7 +197,8 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
|
||||
// if the mute factor is low enough (otherwise the expansion was short enough
|
||||
// to not be noticeable).
|
||||
// Note that the MuteFactor is in Q14, so a value of 16384 corresponds to 1.
|
||||
size_t current_span = packet_buffer_.GetSpanSamples(decoder_frame_length);
|
||||
size_t current_span = packet_buffer_.GetSpanSamples(
|
||||
decoder_frame_length, sample_rate_, estimate_dtx_delay_);
|
||||
if ((prev_mode == kModeExpand || prev_mode == kModeCodecPlc) &&
|
||||
expand.MuteFactor(0) < 16384 / 2 &&
|
||||
current_span < static_cast<size_t>(delay_manager_->TargetLevel() *
|
||||
@ -183,8 +208,7 @@ Operations DecisionLogic::GetDecision(const SyncBuffer& sync_buffer,
|
||||
return kExpand;
|
||||
}
|
||||
|
||||
const uint32_t five_seconds_samples =
|
||||
static_cast<uint32_t>(5 * 8000 * fs_mult_);
|
||||
const uint32_t five_seconds_samples = static_cast<uint32_t>(5 * sample_rate_);
|
||||
// Check if the required packet is available.
|
||||
if (target_timestamp == available_timestamp) {
|
||||
return ExpectedPacketAvailable(prev_mode, play_dtmf);
|
||||
@ -212,14 +236,15 @@ void DecisionLogic::FilterBufferLevel(size_t buffer_size_samples) {
|
||||
buffer_level_filter_->SetTargetBufferLevel(
|
||||
delay_manager_->base_target_level());
|
||||
|
||||
int sample_memory_local = 0;
|
||||
int time_stretched_samples = time_stretched_cn_samples_;
|
||||
if (prev_time_scale_) {
|
||||
sample_memory_local = sample_memory_;
|
||||
time_stretched_samples += sample_memory_;
|
||||
timescale_countdown_ = tick_timer_->GetNewCountdown(kMinTimescaleInterval);
|
||||
}
|
||||
|
||||
buffer_level_filter_->Update(buffer_size_samples, sample_memory_local);
|
||||
buffer_level_filter_->Update(buffer_size_samples, time_stretched_samples);
|
||||
prev_time_scale_ = false;
|
||||
time_stretched_cn_samples_ = 0;
|
||||
}
|
||||
|
||||
Operations DecisionLogic::CngOperation(Modes prev_mode,
|
||||
@ -323,30 +348,53 @@ Operations DecisionLogic::FuturePacketAvailable(
|
||||
return kNormal;
|
||||
}
|
||||
|
||||
const size_t cur_size_samples =
|
||||
packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
|
||||
|
||||
// If previous was comfort noise, then no merge is needed.
|
||||
if (prev_mode == kModeRfc3389Cng || prev_mode == kModeCodecInternalCng) {
|
||||
// Keep the same delay as before the CNG, but make sure that the number of
|
||||
// samples in buffer is no higher than 4 times the optimal level. (Note that
|
||||
// TargetLevel() is in Q8.)
|
||||
if (static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
|
||||
available_timestamp ||
|
||||
cur_size_samples >
|
||||
((delay_manager_->TargetLevel() * packet_length_samples_) >> 8) *
|
||||
4) {
|
||||
// Time to play this new packet.
|
||||
return kNormal;
|
||||
size_t cur_size_samples =
|
||||
estimate_dtx_delay_
|
||||
? cur_size_samples = packet_buffer_.GetSpanSamples(
|
||||
decoder_frame_length, sample_rate_, true)
|
||||
: packet_buffer_.NumPacketsInBuffer() * decoder_frame_length;
|
||||
// Target level is in number of packets in Q8.
|
||||
const size_t target_level_samples =
|
||||
(delay_manager_->TargetLevel() * packet_length_samples_) >> 8;
|
||||
const bool generated_enough_noise =
|
||||
static_cast<uint32_t>(generated_noise_samples + target_timestamp) >=
|
||||
available_timestamp;
|
||||
|
||||
if (time_stretch_cn_) {
|
||||
const size_t target_threshold_samples =
|
||||
target_level_window_ms_ / 2 * (sample_rate_ / 1000);
|
||||
const bool above_target_window =
|
||||
cur_size_samples > target_level_samples + target_threshold_samples;
|
||||
const bool below_target_window =
|
||||
target_level_samples > target_threshold_samples &&
|
||||
cur_size_samples < target_level_samples - target_threshold_samples;
|
||||
// Keep the delay same as before CNG, but make sure that it is within the
|
||||
// target window.
|
||||
if ((generated_enough_noise && !below_target_window) ||
|
||||
above_target_window) {
|
||||
time_stretched_cn_samples_ = timestamp_leap - generated_noise_samples;
|
||||
return kNormal;
|
||||
}
|
||||
} else {
|
||||
// Too early to play this new packet; keep on playing comfort noise.
|
||||
if (prev_mode == kModeRfc3389Cng) {
|
||||
return kRfc3389CngNoPacket;
|
||||
} else { // prevPlayMode == kModeCodecInternalCng.
|
||||
return kCodecInternalCng;
|
||||
// Keep the same delay as before the CNG, but make sure that the number of
|
||||
// samples in buffer is no higher than 4 times the optimal level.
|
||||
if (generated_enough_noise ||
|
||||
cur_size_samples > target_level_samples * 4) {
|
||||
// Time to play this new packet.
|
||||
return kNormal;
|
||||
}
|
||||
}
|
||||
|
||||
// Too early to play this new packet; keep on playing comfort noise.
|
||||
if (prev_mode == kModeRfc3389Cng) {
|
||||
return kRfc3389CngNoPacket;
|
||||
}
|
||||
// prev_mode == kModeCodecInternalCng.
|
||||
return kCodecInternalCng;
|
||||
}
|
||||
|
||||
// Do not merge unless we have done an expand before.
|
||||
if (prev_mode == kModeExpand) {
|
||||
return kMerge;
|
||||
|
||||
@ -14,6 +14,7 @@
|
||||
#include "modules/audio_coding/neteq/defines.h"
|
||||
#include "modules/audio_coding/neteq/tick_timer.h"
|
||||
#include "rtc_base/constructor_magic.h"
|
||||
#include "rtc_base/experiments/field_trial_parser.h"
|
||||
|
||||
namespace webrtc {
|
||||
|
||||
@ -167,7 +168,7 @@ class DecisionLogic final {
|
||||
DelayManager* delay_manager_;
|
||||
BufferLevelFilter* buffer_level_filter_;
|
||||
const TickTimer* tick_timer_;
|
||||
int fs_mult_;
|
||||
int sample_rate_;
|
||||
size_t output_size_samples_;
|
||||
CngState cng_state_; // Remember if comfort noise is interrupted by other
|
||||
// event (e.g., DTMF).
|
||||
@ -178,6 +179,10 @@ class DecisionLogic final {
|
||||
bool disallow_time_stretching_;
|
||||
std::unique_ptr<TickTimer::Countdown> timescale_countdown_;
|
||||
int num_consecutive_expands_;
|
||||
int time_stretched_cn_samples_;
|
||||
FieldTrialParameter<bool> estimate_dtx_delay_;
|
||||
FieldTrialParameter<bool> time_stretch_cn_;
|
||||
FieldTrialConstrained<int> target_level_window_ms_;
|
||||
|
||||
RTC_DISALLOW_COPY_AND_ASSIGN(DecisionLogic);
|
||||
};
|
||||
|
||||
@ -26,6 +26,7 @@
|
||||
#include "modules/audio_coding/neteq/tick_timer.h"
|
||||
#include "rtc_base/checks.h"
|
||||
#include "rtc_base/logging.h"
|
||||
#include "rtc_base/numerics/safe_conversions.h"
|
||||
|
||||
namespace webrtc {
|
||||
namespace {
|
||||
@ -287,14 +288,22 @@ size_t PacketBuffer::NumSamplesInBuffer(size_t last_decoded_length) const {
|
||||
return num_samples;
|
||||
}
|
||||
|
||||
size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length) const {
|
||||
size_t PacketBuffer::GetSpanSamples(size_t last_decoded_length,
|
||||
size_t sample_rate,
|
||||
bool count_dtx_waiting_time) const {
|
||||
if (buffer_.size() == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t span = buffer_.back().timestamp - buffer_.front().timestamp;
|
||||
if (buffer_.back().frame && buffer_.back().frame->Duration() > 0) {
|
||||
span += buffer_.back().frame->Duration();
|
||||
size_t duration = buffer_.back().frame->Duration();
|
||||
if (count_dtx_waiting_time && buffer_.back().frame->IsDtxPacket()) {
|
||||
size_t waiting_time_samples = rtc::dchecked_cast<size_t>(
|
||||
buffer_.back().waiting_time->ElapsedMs() * (sample_rate / 1000));
|
||||
duration = std::max(duration, waiting_time_samples);
|
||||
}
|
||||
span += duration;
|
||||
} else {
|
||||
span += last_decoded_length;
|
||||
}
|
||||
|
||||
@ -123,7 +123,9 @@ class PacketBuffer {
|
||||
|
||||
// Returns the total duration in samples that the packets in the buffer span
|
||||
// across.
|
||||
virtual size_t GetSpanSamples(size_t last_decoded_length) const;
|
||||
virtual size_t GetSpanSamples(size_t last_decoded_length,
|
||||
size_t sample_rate,
|
||||
bool count_dtx_waiting_time) const;
|
||||
|
||||
// Returns true if the packet buffer contains any DTX or CNG packets.
|
||||
virtual bool ContainsDtxOrCngPacket(
|
||||
|
||||
Reference in New Issue
Block a user