Avoid noise pumping during DTX regions by forwarding only those refresh DTX packets that decrease the comfort noise level at the decoder.

Bug: webrtc:12380
Change-Id: I60e4684150cb4880224f402a9bf42a72811863b3
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/202920
Commit-Queue: Jesus de Vicente Pena <devicentepena@webrtc.org>
Reviewed-by: Minyue Li <minyue@webrtc.org>
Reviewed-by: Henrik Lundin <henrik.lundin@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#33174}
This commit is contained in:
Jesús de Vicente Peña
2021-02-05 09:05:46 +01:00
committed by Commit Bot
parent 483b31c231
commit 3b9abd8dee
5 changed files with 180 additions and 3 deletions

View File

@ -809,4 +809,90 @@ TEST_P(AudioEncoderOpusTest, OpusFlagDtxAsNonSpeech) {
EXPECT_GT(max_nonspeech_frames, 15);
}
// Checks the DTX refresh-packet filtering that is switched on by the
// "WebRTC-Audio-OpusAvoidNoisePumpingDuringDtx" field trial: bursts played
// with an increased gain must not produce any non-empty packet, whereas
// bursts played with a decreased gain must produce at least one.
TEST(AudioEncoderOpusTest, OpusDtxFilteringHighEnergyRefreshPackets) {
  test::ScopedFieldTrials override_field_trials(
      "WebRTC-Audio-OpusAvoidNoisePumpingDuringDtx/Enabled/");
  const std::string kInputFileName =
      webrtc::test::ResourcePath("audio_coding/testfile16kHz", "pcm");
  constexpr int kSampleRateHz = 16000;
  AudioEncoderOpusConfig config;
  config.dtx_enabled = true;
  config.sample_rate_hz = kSampleRateHz;
  constexpr int payload_type = 17;
  const auto encoder = AudioEncoderOpus::MakeAudioEncoder(config, payload_type);
  test::AudioLoop audio_loop;
  constexpr size_t kMaxLoopLengthSamples = kSampleRateHz * 11.6f;
  // The encoder is fed 10 ms blocks (sample rate / 100).
  constexpr size_t kInputBlockSizeSamples = kSampleRateHz / 100;
  // Nothing below is meaningful without input audio, so abort the test here
  // instead of continuing with an uninitialized loop.
  ASSERT_TRUE(audio_loop.Init(kInputFileName, kMaxLoopLengthSamples,
                              kInputBlockSizeSamples));
  AudioEncoder::EncodedInfo info;
  rtc::Buffer encoded(500);

  // Encode the audio file and store the last part that corresponds to silence.
  constexpr size_t kSilenceDurationSamples = kSampleRateHz * 0.2f;
  // The replay loop further down reads `silence` two input blocks at a time
  // without a bounds check, so the buffer must hold a whole number of pairs.
  static_assert(kSilenceDurationSamples % (2 * kInputBlockSizeSamples) == 0,
                "The silence buffer must hold a whole number of packets.");
  // Zero-initialized so that no indeterminate sample can ever be read.
  std::array<int16_t, kSilenceDurationSamples> silence{};
  uint32_t rtp_timestamp = 0;
  bool opus_entered_dtx = false;
  bool silence_filled = false;
  size_t timestamp_start_silence = 0;
  while (!silence_filled && rtp_timestamp < kMaxLoopLengthSamples) {
    encoded.Clear();
    // Every second call to the encoder will generate an Opus packet.
    for (int j = 0; j < 2; j++) {
      auto next_frame = audio_loop.GetNextBlock();
      info = encoder->Encode(rtp_timestamp, next_frame, &encoded);
      if (opus_entered_dtx) {
        // The previous packet was a DTX packet: keep a copy of the input that
        // is being fed while the encoder stays in DTX.
        size_t silence_frame_start = rtp_timestamp - timestamp_start_silence;
        silence_filled = silence_frame_start >= kSilenceDurationSamples;
        if (!silence_filled) {
          std::copy(next_frame.begin(), next_frame.end(),
                    silence.begin() + silence_frame_start);
        }
      }
      rtp_timestamp += kInputBlockSizeSamples;
    }
    // A payload shorter than 2 bytes is treated as a DTX packet. Remember the
    // timestamp at which the encoder switched into DTX.
    if (info.encoded_bytes < 2 && !opus_entered_dtx) {
      timestamp_start_silence = rtp_timestamp;
    }
    opus_entered_dtx = info.encoded_bytes < 2;
  }
  // The bursts below are built from `silence`; without a complete capture the
  // rest of the test would not exercise what it claims to.
  ASSERT_TRUE(silence_filled);

  // The copied 200 ms of silence is used for creating 6 bursts that are fed to
  // the encoder, the first three ones with a larger energy (gain 1.4) and the
  // last three with a lower energy (gain 0). This test verifies that the
  // encoder just sends refresh DTX packets during the last bursts.
  int number_non_empty_packets_during_increase = 0;
  int number_non_empty_packets_during_decrease = 0;
  for (size_t burst = 0; burst < 6; ++burst) {
    uint32_t rtp_timestamp_start = rtp_timestamp;
    const bool increase_noise = burst < 3;
    const float gain = increase_noise ? 1.4f : 0.0f;
    while (rtp_timestamp < rtp_timestamp_start + kSilenceDurationSamples) {
      encoded.Clear();
      // Every second call to the encoder will generate an Opus packet.
      for (int j = 0; j < 2; j++) {
        std::array<int16_t, kInputBlockSizeSamples> silence_frame;
        size_t silence_frame_start = rtp_timestamp - rtp_timestamp_start;
        // The scaled sample is truncated back to int16_t explicitly; this is
        // the same conversion the element assignment would apply implicitly.
        std::transform(
            silence.begin() + silence_frame_start,
            silence.begin() + silence_frame_start + kInputBlockSizeSamples,
            silence_frame.begin(), [gain](int16_t sample) {
              return static_cast<int16_t>(gain * sample);
            });
        info = encoder->Encode(rtp_timestamp, silence_frame, &encoded);
        rtp_timestamp += kInputBlockSizeSamples;
      }
      // Tracking the number of non empty packets (payloads above 2 bytes).
      if (increase_noise && info.encoded_bytes > 2) {
        number_non_empty_packets_during_increase++;
      }
      if (!increase_noise && info.encoded_bytes > 2) {
        number_non_empty_packets_during_decrease++;
      }
    }
  }
  // Check that the refresh DTX packets are just sent during the decrease energy
  // region.
  EXPECT_EQ(number_non_empty_packets_during_increase, 0);
  EXPECT_GT(number_non_empty_packets_during_decrease, 0);
}
} // namespace webrtc

View File

@ -25,6 +25,9 @@ struct WebRtcOpusEncInst {
OpusMSEncoder* multistream_encoder;
size_t channels;
int in_dtx_mode;
bool avoid_noise_pumping_during_dtx;
int sample_rate_hz;
float smooth_energy_non_active_frames;
};
struct WebRtcOpusDecInst {

View File

@ -12,6 +12,9 @@
#include <cstdlib>
#include <numeric>
#include "api/array_view.h"
#include "rtc_base/checks.h"
#include "system_wrappers/include/field_trial.h"
@ -36,6 +39,9 @@ enum {
constexpr char kPlcUsePrevDecodedSamplesFieldTrial[] =
"WebRTC-Audio-OpusPlcUsePrevDecodedSamples";
constexpr char kAvoidNoisePumpingDuringDtxFieldTrial[] =
"WebRTC-Audio-OpusAvoidNoisePumpingDuringDtx";
static int FrameSizePerChannel(int frame_size_ms, int sample_rate_hz) {
RTC_DCHECK_GT(frame_size_ms, 0);
RTC_DCHECK_EQ(frame_size_ms % 10, 0);
@ -54,6 +60,46 @@ static int DefaultFrameSizePerChannel(int sample_rate_hz) {
return FrameSizePerChannel(20, sample_rate_hz);
}
// Tells whether `encoded` is a refresh DTX packet whose input energy is too
// high compared with the energy tracked for non-active frames.
//
// Returns false right away when the payload has at most 2 bytes, when `frame`
// holds less than one default-sized frame, or when
// WebRtcOpus_PacketHasVoiceActivity() does not return 0 for the payload.
// Otherwise the result is true if the encoder reports being in DTX and the
// average frame energy reaches at least half of the smoothed energy stored in
// `inst`. In all remaining cases the smoothed energy in `inst` is updated and
// false is returned.
static bool WebRtcOpus_IsHighEnergyRefreshDtxPacket(
    OpusEncInst* inst,
    rtc::ArrayView<const int16_t> frame,
    rtc::ArrayView<const uint8_t> encoded) {
  if (encoded.size() <= 2) {
    return false;
  }
  const int frame_count =
      frame.size() / DefaultFrameSizePerChannel(inst->sample_rate_hz);
  if (frame_count <= 0) {
    return false;
  }
  if (WebRtcOpus_PacketHasVoiceActivity(encoded.data(), encoded.size()) != 0) {
    return false;
  }

  // Sum of the squared samples, averaged over the number of frames.
  float energy_sum = 0.0f;
  for (const int32_t sample : frame) {
    energy_sum = energy_sum + sample * sample;
  }
  const float energy_per_frame = energy_sum / frame_count;

  // Half of the tracked energy, i.e. a 3 dB negative margin.
  const float energy_threshold = inst->smooth_energy_non_active_frames * 0.5f;

  const bool encoder_in_dtx = WebRtcOpus_GetInDtx(inst) == 1;
  if (encoder_in_dtx && energy_per_frame >= energy_threshold) {
    // The encoder is in DTX and still produced a payload > 2 bytes, so this is
    // a refresh DTX packet. Its energy is not below the margin, hence it is
    // reported as a high energy refresh DTX packet. The tracked energy is left
    // untouched in this case.
    return true;
  }

  // Energy tracking that resembles the comfort noise modeling of the Silk
  // decoder in Opus (third_party/opus/src/silk/CNG.c): drops below the margin
  // are followed immediately, everything else is smoothed.
  if (energy_per_frame < energy_threshold) {
    inst->smooth_energy_non_active_frames = energy_per_frame;
  } else {
    inst->smooth_energy_non_active_frames +=
        (energy_per_frame - inst->smooth_energy_non_active_frames) * 0.25f;
  }
  return false;
}
int16_t WebRtcOpus_EncoderCreate(OpusEncInst** inst,
size_t channels,
int32_t application,
@ -88,6 +134,10 @@ int16_t WebRtcOpus_EncoderCreate(OpusEncInst** inst,
state->in_dtx_mode = 0;
state->channels = channels;
state->sample_rate_hz = sample_rate_hz;
state->smooth_energy_non_active_frames = 0.0f;
state->avoid_noise_pumping_during_dtx =
webrtc::field_trial::IsEnabled(kAvoidNoisePumpingDuringDtxFieldTrial);
*inst = state;
return 0;
@ -120,9 +170,10 @@ int16_t WebRtcOpus_MultistreamEncoderCreate(
RTC_DCHECK(state);
int error;
state->multistream_encoder =
opus_multistream_encoder_create(48000, channels, streams, coupled_streams,
channel_mapping, opus_app, &error);
const int sample_rate_hz = 48000;
state->multistream_encoder = opus_multistream_encoder_create(
sample_rate_hz, channels, streams, coupled_streams, channel_mapping,
opus_app, &error);
if (error != OPUS_OK || (!state->encoder && !state->multistream_encoder)) {
WebRtcOpus_EncoderFree(state);
@ -131,6 +182,9 @@ int16_t WebRtcOpus_MultistreamEncoderCreate(
state->in_dtx_mode = 0;
state->channels = channels;
state->sample_rate_hz = sample_rate_hz;
state->smooth_energy_non_active_frames = 0.0f;
state->avoid_noise_pumping_during_dtx = false;
*inst = state;
return 0;
@ -188,6 +242,15 @@ int WebRtcOpus_Encode(OpusEncInst* inst,
}
}
if (inst->avoid_noise_pumping_during_dtx && WebRtcOpus_GetUseDtx(inst) == 1 &&
WebRtcOpus_IsHighEnergyRefreshDtxPacket(
inst, rtc::MakeArrayView(audio_in, samples),
rtc::MakeArrayView(encoded, res))) {
// This packet is a high energy refresh DTX packet. For avoiding an increase
// of the energy in the DTX region at the decoder, this packet is dropped.
inst->in_dtx_mode = 0;
return 0;
}
inst->in_dtx_mode = 0;
return res;
}
@ -316,6 +379,16 @@ int16_t WebRtcOpus_DisableDtx(OpusEncInst* inst) {
}
}
// Reads the DTX setting back from the encoder. Yields the value reported
// through OPUS_GET_DTX when the control call returns 0, and -1 when `inst` is
// null or the control call fails.
int16_t WebRtcOpus_GetUseDtx(OpusEncInst* inst) {
  if (!inst) {
    return -1;
  }
  opus_int32 dtx_setting;
  const bool query_ok = ENCODER_CTL(inst, OPUS_GET_DTX(&dtx_setting)) == 0;
  // `dtx_setting` is only read when the query succeeded.
  return query_ok ? dtx_setting : -1;
}
int16_t WebRtcOpus_EnableCbr(OpusEncInst* inst) {
if (inst) {
return ENCODER_CTL(inst, OPUS_SET_VBR(0));

View File

@ -231,6 +231,20 @@ int16_t WebRtcOpus_EnableDtx(OpusEncInst* inst);
*/
int16_t WebRtcOpus_DisableDtx(OpusEncInst* inst);
/****************************************************************************
 * WebRtcOpus_GetUseDtx()
 *
 * This function gets the DTX configuration used for encoding.
 *
 * Input:
 *      - inst               : Encoder context
 *
 * Return value              :  0 - Encoder does not use DTX.
 *                              1 - Encoder uses DTX.
 *                             -1 - Error: `inst` is null or the DTX setting
 *                                  could not be read from the encoder.
 */
int16_t WebRtcOpus_GetUseDtx(OpusEncInst* inst);
/****************************************************************************
* WebRtcOpus_EnableCbr()
*