Moved code into the lowest level of EchoSuppression

to simplify future refactoring and development.

In more detail:
1) Moved the updating of eBuf from the EchoSubtraction method
   to the EchoSuppression method as it is only used in the latter.
2) Moved the computation of efw and dfw from the SubbandCoherence method
   as those are actually the analysis filterbank computation that is not
   directly related to the coherence.
3) As a consequence of 2) 3 functions needed to be replaced by the
   generic function pointer scheme used in WebRTCAec as they have
   optimized versions for SSE2 and NEON (which before were local to each
   of the aec_core*.c files.

Motivation:
Apart from making sense from a logical point of view, the changes will
a) Allow eBuf stored in half the size on the state.
b) Allow simpler switching between using the the microphone signal
   and echo subtractor output in the echo suppressor.
c) Allow further refactoring that move all the changes to eBuf to one method
   (currently those are happening in at least 4 different methods.

Drawbacks:
i) dfw is moved to EchoSuppression which increases the stack usage for that
 method. This will, however, be improved once further refactoring can be done.

The changes have been tested for bitexactness on Linux using a quite extensive dataset.

BUG=webrtc:5201

Review URL: https://codereview.webrtc.org/1494563002

Cr-Commit-Position: refs/heads/master@{#10954}
This commit is contained in:
peah
2015-12-09 08:50:24 -08:00
committed by Commit bot
parent d1590b2571
commit afeb43897a
4 changed files with 66 additions and 71 deletions

View File

@ -135,6 +135,9 @@ WebRtcAecFilterAdaptation WebRtcAec_FilterAdaptation;
WebRtcAecOverdriveAndSuppress WebRtcAec_OverdriveAndSuppress;
WebRtcAecComfortNoise WebRtcAec_ComfortNoise;
WebRtcAecSubBandCoherence WebRtcAec_SubbandCoherence;
WebRtcAecStoreAsComplex WebRtcAec_StoreAsComplex;
WebRtcAecPartitionDelay WebRtcAec_PartitionDelay;
WebRtcAecWindowData WebRtcAec_WindowData;
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
return aRe * bRe - aIm * bIm;
@ -407,31 +410,13 @@ __inline static void StoreAsComplex(const float* data,
static void SubbandCoherence(AecCore* aec,
float efw[2][PART_LEN1],
float dfw[2][PART_LEN1],
float xfw[2][PART_LEN1],
float* fft,
float* cohde,
float* cohxd) {
float dfw[2][PART_LEN1];
int i;
if (aec->delayEstCtr == 0)
aec->delayIdx = PartitionDelay(aec);
// Use delayed far.
memcpy(xfw,
aec->xfwBuf + aec->delayIdx * PART_LEN1,
sizeof(xfw[0][0]) * 2 * PART_LEN1);
// Windowed near fft
WindowData(fft, aec->dBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, dfw);
// Windowed error fft
WindowData(fft, aec->eBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, efw);
SmoothedPSD(aec, efw, dfw, xfw);
// Subband coherence
@ -1011,9 +996,12 @@ static void EchoSubtraction(
static void EchoSuppression(AecCore* aec,
float* echo_subtractor_output,
float* output,
float* const* outputH) {
float efw[2][PART_LEN1], xfw[2][PART_LEN1];
float efw[2][PART_LEN1];
float xfw[2][PART_LEN1];
float dfw[2][PART_LEN1];
complex_t comfortNoiseHband[PART_LEN1];
float fft[PART_LEN2];
float scale, dtmp;
@ -1040,6 +1028,22 @@ static void EchoSuppression(AecCore* aec,
float* xfw_ptr = NULL;
// Update eBuf with echo subtractor output.
memcpy(aec->eBuf + PART_LEN,
echo_subtractor_output,
sizeof(float) * PART_LEN);
// Analysis filter banks for the echo suppressor.
// Windowed near-end ffts.
WindowData(fft, aec->dBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, dfw);
// Windowed echo suppressor output ffts.
WindowData(fft, aec->eBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, efw);
aec->delayEstCtr++;
if (aec->delayEstCtr == delayEstInterval) {
aec->delayEstCtr = 0;
@ -1060,7 +1064,15 @@ static void EchoSuppression(AecCore* aec,
// Buffer far.
memcpy(aec->xfwBuf, xfw_ptr, sizeof(float) * 2 * PART_LEN1);
WebRtcAec_SubbandCoherence(aec, efw, xfw, fft, cohde, cohxd);
if (aec->delayEstCtr == 0)
aec->delayIdx = WebRtcAec_PartitionDelay(aec);
// Use delayed far.
memcpy(xfw,
aec->xfwBuf + aec->delayIdx * PART_LEN1,
sizeof(xfw[0][0]) * 2 * PART_LEN1);
WebRtcAec_SubbandCoherence(aec, efw, dfw, xfw, fft, cohde, cohxd);
hNlXdAvg = 0;
for (i = minPrefBand; i < prefBandSize + minPrefBand; i++) {
@ -1399,10 +1411,7 @@ static void ProcessBlock(AecCore* aec) {
RTC_AEC_DEBUG_WAV_WRITE(aec->outLinearFile, echo_subtractor_output, PART_LEN);
// Perform echo suppression.
memcpy(aec->eBuf + PART_LEN,
echo_subtractor_output,
sizeof(float) * PART_LEN);
EchoSuppression(aec, output, outputH_ptr);
EchoSuppression(aec, echo_subtractor_output, output, outputH_ptr);
if (aec->metricsMode == 1) {
// Update power levels and echo metrics
@ -1511,6 +1520,10 @@ AecCore* WebRtcAec_CreateAec() {
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppress;
WebRtcAec_ComfortNoise = ComfortNoise;
WebRtcAec_SubbandCoherence = SubbandCoherence;
WebRtcAec_StoreAsComplex = StoreAsComplex;
WebRtcAec_PartitionDelay = PartitionDelay;
WebRtcAec_WindowData = WindowData;
#if defined(WEBRTC_ARCH_X86_FAMILY)
if (WebRtc_GetCPUInfo(kSSE2)) {

View File

@ -205,10 +205,21 @@ extern WebRtcAecComfortNoise WebRtcAec_ComfortNoise;
typedef void (*WebRtcAecSubBandCoherence)(AecCore* aec,
float efw[2][PART_LEN1],
float dfw[2][PART_LEN1],
float xfw[2][PART_LEN1],
float* fft,
float* cohde,
float* cohxd);
extern WebRtcAecSubBandCoherence WebRtcAec_SubbandCoherence;
typedef int (*WebRtcAecPartitionDelay)(const AecCore* aec);
extern WebRtcAecPartitionDelay WebRtcAec_PartitionDelay;
typedef void (*WebRtcAecStoreAsComplex)(const float* data,
float data_complex[2][PART_LEN1]);
extern WebRtcAecStoreAsComplex WebRtcAec_StoreAsComplex;
typedef void (*WebRtcAecWindowData)(float* x_windowed, const float* x);
extern WebRtcAecWindowData WebRtcAec_WindowData;
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_

View File

@ -453,7 +453,7 @@ static void OverdriveAndSuppressNEON(AecCore* aec,
}
}
static int PartitionDelay(const AecCore* aec) {
static int PartitionDelayNEON(const AecCore* aec) {
// Measures the energy in each filter partition and returns the partition with
// highest energy.
// TODO(bjornv): Spread computational cost by computing one partition per
@ -638,7 +638,7 @@ static void SmoothedPSD(AecCore* aec,
}
// Window time domain data to be used by the fft.
__inline static void WindowData(float* x_windowed, const float* x) {
static void WindowDataNEON(float* x_windowed, const float* x) {
int i;
for (i = 0; i < PART_LEN; i += 4) {
const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
@ -659,8 +659,8 @@ __inline static void WindowData(float* x_windowed, const float* x) {
}
// Puts fft output data into a complex valued array.
__inline static void StoreAsComplex(const float* data,
float data_complex[2][PART_LEN1]) {
static void StoreAsComplexNEON(const float* data,
float data_complex[2][PART_LEN1]) {
int i;
for (i = 0; i < PART_LEN; i += 4) {
const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]);
@ -676,31 +676,13 @@ __inline static void StoreAsComplex(const float* data,
static void SubbandCoherenceNEON(AecCore* aec,
float efw[2][PART_LEN1],
float dfw[2][PART_LEN1],
float xfw[2][PART_LEN1],
float* fft,
float* cohde,
float* cohxd) {
float dfw[2][PART_LEN1];
int i;
if (aec->delayEstCtr == 0)
aec->delayIdx = PartitionDelay(aec);
// Use delayed far.
memcpy(xfw,
aec->xfwBuf + aec->delayIdx * PART_LEN1,
sizeof(xfw[0][0]) * 2 * PART_LEN1);
// Windowed near fft
WindowData(fft, aec->dBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, dfw);
// Windowed error fft
WindowData(fft, aec->eBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, efw);
SmoothedPSD(aec, efw, dfw, xfw);
{
@ -743,4 +725,7 @@ void WebRtcAec_InitAec_neon(void) {
WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
WebRtcAec_SubbandCoherence = SubbandCoherenceNEON;
WebRtcAec_StoreAsComplex = StoreAsComplexNEON;
WebRtcAec_PartitionDelay = PartitionDelayNEON;
WebRtcAec_WindowData = WindowDataNEON;
}

View File

@ -439,7 +439,8 @@ __inline static void _mm_add_ps_4x1(__m128 sum, float *dst) {
sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
_mm_store_ss(dst, sum);
}
static int PartitionDelay(const AecCore* aec) {
static int PartitionDelaySSE2(const AecCore* aec) {
// Measures the energy in each filter partition and returns the partition with
// highest energy.
// TODO(bjornv): Spread computational cost by computing one partition per
@ -619,7 +620,7 @@ static void SmoothedPSD(AecCore* aec,
}
// Window time domain data to be used by the fft.
__inline static void WindowData(float* x_windowed, const float* x) {
static void WindowDataSSE2(float* x_windowed, const float* x) {
int i;
for (i = 0; i < PART_LEN; i += 4) {
const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]);
@ -639,8 +640,8 @@ __inline static void WindowData(float* x_windowed, const float* x) {
}
// Puts fft output data into a complex valued array.
__inline static void StoreAsComplex(const float* data,
float data_complex[2][PART_LEN1]) {
static void StoreAsComplexSSE2(const float* data,
float data_complex[2][PART_LEN1]) {
int i;
for (i = 0; i < PART_LEN; i += 4) {
const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]);
@ -661,31 +662,13 @@ __inline static void StoreAsComplex(const float* data,
static void SubbandCoherenceSSE2(AecCore* aec,
float efw[2][PART_LEN1],
float dfw[2][PART_LEN1],
float xfw[2][PART_LEN1],
float* fft,
float* cohde,
float* cohxd) {
float dfw[2][PART_LEN1];
int i;
if (aec->delayEstCtr == 0)
aec->delayIdx = PartitionDelay(aec);
// Use delayed far.
memcpy(xfw,
aec->xfwBuf + aec->delayIdx * PART_LEN1,
sizeof(xfw[0][0]) * 2 * PART_LEN1);
// Windowed near fft
WindowData(fft, aec->dBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, dfw);
// Windowed error fft
WindowData(fft, aec->eBuf);
aec_rdft_forward_128(fft);
StoreAsComplex(fft, efw);
SmoothedPSD(aec, efw, dfw, xfw);
{
@ -740,4 +723,7 @@ void WebRtcAec_InitAec_SSE2(void) {
WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
WebRtcAec_SubbandCoherence = SubbandCoherenceSSE2;
WebRtcAec_StoreAsComplex = StoreAsComplexSSE2;
WebRtcAec_PartitionDelay = PartitionDelaySSE2;
WebRtcAec_WindowData = WindowDataSSE2;
}