-Removed the state as an input to the FilterAdaptation function.

-Renamed the TimeToFrequency and FrequencyToTime functions to Fft and InverseFft.
-Moved the windowing out of the TimeToFrequency function; callers that need a windowed transform now window the data (via WindowData) before calling Fft, as sketched below.
-Simplified the EchoSubtraction function.
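As context for the rename and the windowing move, here is a minimal sketch of the new split, assuming PART_LEN = 64 and a square-root Hann table similar to WebRtcAec_sqrtHanning; the WindowData helper mirrors the call site further down in the diff, while the table initialization here is purely illustrative:

#include <math.h>

#define PART_LEN 64               /* assumed block size, as in aec_core */
#define PART_LEN2 (2 * PART_LEN)  /* two-block (extended) buffer length */

/* Illustrative square-root Hann table standing in for WebRtcAec_sqrtHanning. */
static float sqrt_hanning[PART_LEN + 1];

static void InitSqrtHanning(void) {
  const float kPi = 3.14159265f;
  int i;
  for (i = 0; i <= PART_LEN; i++) {
    sqrt_hanning[i] = sqrtf(0.5f * (1.0f - cosf(kPi * i / PART_LEN)));
  }
}

/* With the windowing moved out of the FFT wrapper, a caller that needs a
 * windowed transform windows the PART_LEN2-sample block itself and then
 * calls the plain Fft(); unwindowed callers simply skip this step. */
static void WindowData(float* x_windowed, const float* x) {
  int i;
  for (i = 0; i < PART_LEN; i++) {
    x_windowed[i] = x[i] * sqrt_hanning[i];
    x_windowed[PART_LEN + i] = x[PART_LEN + i] * sqrt_hanning[PART_LEN - i];
  }
}

This is the same split visible in the WebRtcAec_BufferFarendPartition hunk near the end of the first file: a plain Fft for the unwindowed far buffer, and WindowData followed by Fft for the windowed one.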

Note that the AEC state is still an input to the EchoSubtraction function; it currently needs to be, in order to support writing the debug file output. The longer-term goal, however, is to organize the state into substates. That will simplify the parameter lists of functions such as EchoCancellation, with the substates eventually replacing the AEC state as a parameter (a sketch of that direction follows below).
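A hedged sketch of that substate direction, with an illustrative ExampleFilterSubstate (not a real WebRTC type) standing in for whatever grouping is eventually chosen; the explicit-parameter form matches the new FilterAdaptation signature in the diff below, and the constants are assumptions:

#define PART_LEN1 65               /* assumed: PART_LEN + 1 frequency bins */
#define kExtendedNumPartitions 32  /* assumed upper bound on partitions */

/* Hypothetical substate grouping the adaptive-filter fields of AecCore.
 * The name and exact field set are illustrative only. */
typedef struct {
  int num_partitions;
  int x_fft_buf_block_pos;
  float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1];
  float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1];
} ExampleFilterSubstate;

/* Today: each field is passed explicitly (cf. the new FilterAdaptation
 * signature below), so the function no longer depends on AecCore. */
static void AdaptExplicit(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float e_fft[2][PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
  (void)num_partitions;
  (void)x_fft_buf_block_pos;
  (void)x_fft_buf;
  (void)e_fft;
  (void)h_fft_buf;
  /* ... adaptation work would go here ... */
}

/* Later: the same call could take one small substate instead, shrinking the
 * long parameter lists without reintroducing the monolithic state. */
static void AdaptWithSubstate(ExampleFilterSubstate* filter,
                              float e_fft[2][PART_LEN1]) {
  AdaptExplicit(filter->num_partitions, filter->x_fft_buf_block_pos,
                filter->x_fft_buf, e_fft, filter->h_fft_buf);
}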

BUG=webrtc:5201

Review URL: https://codereview.webrtc.org/1456123003

Cr-Commit-Position: refs/heads/master@{#10830}
peah
2015-11-27 15:24:27 -08:00
committed by Commit bot
parent 19822d63c1
commit 7e43138c08
5 changed files with 297 additions and 275 deletions


@@ -151,30 +151,31 @@ static int CmpFloat(const void* a, const void* b) {
return (*da > *db) - (*da < *db);
}
static void FilterFar(int num_partitions,
int x_fft_buffer_block_pos,
float x_fft_buffer[2][kExtendedNumPartitions * PART_LEN1],
float h_fft_buffer[2][kExtendedNumPartitions * PART_LEN1],
float y_fft[2][PART_LEN1]) {
static void FilterFar(
int num_partitions,
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float y_fft[2][PART_LEN1]) {
int i;
for (i = 0; i < num_partitions; i++) {
int j;
int x_pos = (i + x_fft_buffer_block_pos) * PART_LEN1;
int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
int pos = i * PART_LEN1;
// Check for wrapped buffer.
if (i + x_fft_buffer_block_pos >= num_partitions) {
x_pos -= num_partitions * (PART_LEN1);
// Check for wrap
if (i + x_fft_buf_block_pos >= num_partitions) {
xPos -= num_partitions * (PART_LEN1);
}
for (j = 0; j < PART_LEN1; j++) {
y_fft[0][j] += MulRe(x_fft_buffer[0][x_pos + j],
x_fft_buffer[1][x_pos + j],
h_fft_buffer[0][pos + j],
h_fft_buffer[1][pos + j]);
y_fft[1][j] += MulIm(x_fft_buffer[0][x_pos + j],
x_fft_buffer[1][x_pos + j],
h_fft_buffer[0][pos + j],
h_fft_buffer[1][pos + j]);
y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j],
x_fft_buf[1][xPos + j],
h_fft_buf[0][pos + j],
h_fft_buf[1][pos + j]);
y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j],
x_fft_buf[1][xPos + j],
h_fft_buf[0][pos + j],
h_fft_buf[1][pos + j]);
}
}
}
@@ -182,7 +183,7 @@ static void FilterFar(int num_partitions,
static void ScaleErrorSignal(int extended_filter_enabled,
float normal_mu,
float normal_error_threshold,
float *x_pow,
float x_pow[PART_LEN1],
float ef[2][PART_LEN1]) {
const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
const float error_threshold = extended_filter_enabled
@@ -207,59 +208,40 @@ static void ScaleErrorSignal(int extended_filter_enabled,
}
}
// Time-unconstrained filter adaptation.
// TODO(andrew): consider for a low-complexity mode.
// static void FilterAdaptationUnconstrained(AecCore* aec, float *fft,
// float ef[2][PART_LEN1]) {
// int i, j;
// for (i = 0; i < aec->num_partitions; i++) {
// int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
// int pos;
// // Check for wrap
// if (i + aec->xfBufBlockPos >= aec->num_partitions) {
// xPos -= aec->num_partitions * PART_LEN1;
// }
//
// pos = i * PART_LEN1;
//
// for (j = 0; j < PART_LEN1; j++) {
// aec->wfBuf[0][pos + j] += MulRe(aec->xfBuf[0][xPos + j],
// -aec->xfBuf[1][xPos + j],
// ef[0][j], ef[1][j]);
// aec->wfBuf[1][pos + j] += MulIm(aec->xfBuf[0][xPos + j],
// -aec->xfBuf[1][xPos + j],
// ef[0][j], ef[1][j]);
// }
// }
//}
static void FilterAdaptation(AecCore* aec, float* fft, float ef[2][PART_LEN1]) {
static void FilterAdaptation(
int num_partitions,
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float e_fft[2][PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
int i, j;
for (i = 0; i < aec->num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
float fft[PART_LEN2];
for (i = 0; i < num_partitions; i++) {
int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1);
int pos;
// Check for wrap
if (i + aec->xfBufBlockPos >= aec->num_partitions) {
xPos -= aec->num_partitions * PART_LEN1;
if (i + x_fft_buf_block_pos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
pos = i * PART_LEN1;
for (j = 0; j < PART_LEN; j++) {
fft[2 * j] = MulRe(aec->xfBuf[0][xPos + j],
-aec->xfBuf[1][xPos + j],
ef[0][j],
ef[1][j]);
fft[2 * j + 1] = MulIm(aec->xfBuf[0][xPos + j],
-aec->xfBuf[1][xPos + j],
ef[0][j],
ef[1][j]);
fft[2 * j] = MulRe(x_fft_buf[0][xPos + j],
-x_fft_buf[1][xPos + j],
e_fft[0][j],
e_fft[1][j]);
fft[2 * j + 1] = MulIm(x_fft_buf[0][xPos + j],
-x_fft_buf[1][xPos + j],
e_fft[0][j],
e_fft[1][j]);
}
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
-aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN],
ef[1][PART_LEN]);
fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN],
-x_fft_buf[1][xPos + PART_LEN],
e_fft[0][PART_LEN],
e_fft[1][PART_LEN]);
aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
@@ -273,12 +255,12 @@ static void FilterAdaptation(AecCore* aec, float* fft, float ef[2][PART_LEN1]) {
}
aec_rdft_forward_128(fft);
aec->wfBuf[0][pos] += fft[0];
aec->wfBuf[0][pos + PART_LEN] += fft[1];
h_fft_buf[0][pos] += fft[0];
h_fft_buf[0][pos + PART_LEN] += fft[1];
for (j = 1; j < PART_LEN; j++) {
aec->wfBuf[0][pos + j] += fft[2 * j];
aec->wfBuf[1][pos + j] += fft[2 * j + 1];
h_fft_buf[0][pos + j] += fft[2 * j];
h_fft_buf[1][pos + j] += fft[2 * j + 1];
}
}
}
@@ -845,34 +827,26 @@ static void UpdateDelayMetrics(AecCore* self) {
return;
}
static void FrequencyToTime(float freq_data[2][PART_LEN1],
float time_data[PART_LEN2]) {
static void InverseFft(float freq_data[2][PART_LEN1],
float time_data[PART_LEN2]) {
int i;
time_data[0] = freq_data[0][0];
time_data[1] = freq_data[0][PART_LEN];
const float scale = 1.0f / PART_LEN2;
time_data[0] = freq_data[0][0] * scale;
time_data[1] = freq_data[0][PART_LEN] * scale;
for (i = 1; i < PART_LEN; i++) {
time_data[2 * i] = freq_data[0][i];
time_data[2 * i + 1] = freq_data[1][i];
time_data[2 * i] = freq_data[0][i] * scale;
time_data[2 * i + 1] = freq_data[1][i] * scale;
}
aec_rdft_inverse_128(time_data);
}
static void TimeToFrequency(float time_data[PART_LEN2],
float freq_data[2][PART_LEN1],
int window) {
int i = 0;
// TODO(bjornv): Should we have a different function/wrapper for windowed FFT?
if (window) {
for (i = 0; i < PART_LEN; i++) {
time_data[i] *= WebRtcAec_sqrtHanning[i];
time_data[PART_LEN + i] *= WebRtcAec_sqrtHanning[PART_LEN - i];
}
}
static void Fft(float time_data[PART_LEN2],
float freq_data[2][PART_LEN1]) {
int i;
aec_rdft_forward_128(time_data);
// Reorder.
// Reorder fft output data.
freq_data[1][0] = 0;
freq_data[1][PART_LEN] = 0;
freq_data[0][0] = time_data[0];
@@ -963,61 +937,76 @@ static int SignalBasedDelayCorrection(AecCore* self) {
return delay_correction;
}
static void EchoSubtraction(AecCore* aec,
float* nearend_ptr) {
float yf[2][PART_LEN1];
float fft[PART_LEN2];
float y[PART_LEN];
static void EchoSubtraction(
AecCore* aec,
int num_partitions,
int x_fft_buf_block_pos,
int metrics_mode,
int extended_filter_enabled,
float normal_mu,
float normal_error_threshold,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float* const y,
float x_pow[PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
PowerLevel* linout_level,
float echo_subtractor_output[PART_LEN]) {
float s_fft[2][PART_LEN1];
float e_extended[PART_LEN2];
float s_extended[PART_LEN2];
float *s;
float e[PART_LEN];
float ef[2][PART_LEN1];
float scale;
float e_fft[2][PART_LEN1];
int i;
memset(yf, 0, sizeof(yf));
memset(s_fft, 0, sizeof(s_fft));
// Produce frequency domain echo estimate.
WebRtcAec_FilterFar(aec->num_partitions,
aec->xfBufBlockPos,
aec->xfBuf,
aec->wfBuf,
yf);
// Produce echo estimate s_fft.
WebRtcAec_FilterFar(num_partitions,
x_fft_buf_block_pos,
x_fft_buf,
h_fft_buf,
s_fft);
// Inverse fft to obtain echo estimate and error.
FrequencyToTime(yf, fft);
// Extract the output signal and compute the time-domain error.
scale = 2.0f / PART_LEN2;
// Compute the time-domain echo estimate s.
InverseFft(s_fft, s_extended);
s = &s_extended[PART_LEN];
for (i = 0; i < PART_LEN; ++i) {
y[i] = fft[PART_LEN + i] * scale; // fft scaling.
e[i] = nearend_ptr[i] - y[i];
s[i] *= 2.0f;
}
// Error fft
memcpy(aec->eBuf + PART_LEN, e, sizeof(float) * PART_LEN);
memset(fft, 0, sizeof(float) * PART_LEN);
memcpy(fft + PART_LEN, e, sizeof(float) * PART_LEN);
TimeToFrequency(fft, ef, 0);
// Compute the time-domain echo prediction error.
for (i = 0; i < PART_LEN; ++i) {
e[i] = y[i] - s[i];
}
// Compute the frequency domain echo prediction error.
memset(e_extended, 0, sizeof(float) * PART_LEN);
memcpy(e_extended + PART_LEN, e, sizeof(float) * PART_LEN);
Fft(e_extended, e_fft);
RTC_AEC_DEBUG_RAW_WRITE(aec->e_fft_file,
&ef[0][0],
sizeof(ef[0][0]) * PART_LEN1 * 2);
&e_fft[0][0],
sizeof(e_fft[0][0]) * PART_LEN1 * 2);
if (aec->metricsMode == 1) {
if (metrics_mode == 1) {
// Note that the first PART_LEN samples in fft (before transformation) are
// zero. Hence, the scaling by two in UpdateLevel() should not be
// performed. That scaling is taken care of in UpdateMetrics() instead.
UpdateLevel(&aec->linoutlevel, ef);
UpdateLevel(linout_level, e_fft);
}
// Scale error signal inversely with far power.
WebRtcAec_ScaleErrorSignal(aec->extended_filter_enabled,
aec->normal_mu,
aec->normal_error_threshold,
aec->xPow,
ef);
WebRtcAec_FilterAdaptation(aec, fft, ef);
RTC_AEC_DEBUG_WAV_WRITE(aec->outLinearFile, e, PART_LEN);
WebRtcAec_ScaleErrorSignal(extended_filter_enabled,
normal_mu,
normal_error_threshold,
x_pow,
e_fft);
WebRtcAec_FilterAdaptation(num_partitions,
x_fft_buf_block_pos,
x_fft_buf,
e_fft,
h_fft_buf);
memcpy(echo_subtractor_output, e, sizeof(float) * PART_LEN);
}
@@ -1274,6 +1263,7 @@ static void ProcessBlock(AecCore* aec) {
const float gInitNoise[2] = {0.999f, 0.001f};
float nearend[PART_LEN];
float echo_subtractor_output[PART_LEN];
float* nearend_ptr = NULL;
float output[PART_LEN];
float outputH[NUM_HIGH_BANDS_MAX][PART_LEN];
@@ -1313,7 +1303,7 @@ static void ProcessBlock(AecCore* aec) {
// Near fft
memcpy(fft, aec->dBuf, sizeof(float) * PART_LEN2);
TimeToFrequency(fft, df, 0);
Fft(fft, df);
// Power smoothing
for (i = 0; i < PART_LEN1; i++) {
@@ -1392,9 +1382,26 @@ static void ProcessBlock(AecCore* aec) {
sizeof(float) * PART_LEN1);
// Perform echo subtraction.
EchoSubtraction(aec, nearend_ptr);
EchoSubtraction(aec,
aec->num_partitions,
aec->xfBufBlockPos,
aec->metricsMode,
aec->extended_filter_enabled,
aec->normal_mu,
aec->normal_error_threshold,
aec->xfBuf,
nearend_ptr,
aec->xPow,
aec->wfBuf,
&aec->linoutlevel,
echo_subtractor_output);
RTC_AEC_DEBUG_WAV_WRITE(aec->outLinearFile, echo_subtractor_output, PART_LEN);
// Perform echo suppression.
memcpy(aec->eBuf + PART_LEN,
echo_subtractor_output,
sizeof(float) * PART_LEN);
EchoSuppression(aec, output, outputH_ptr);
if (aec->metricsMode == 1) {
@@ -1737,12 +1744,12 @@ void WebRtcAec_BufferFarendPartition(AecCore* aec, const float* farend) {
}
// Convert far-end partition to the frequency domain without windowing.
memcpy(fft, farend, sizeof(float) * PART_LEN2);
TimeToFrequency(fft, xf, 0);
Fft(fft, xf);
WebRtc_WriteBuffer(aec->far_buf, &xf[0][0], 1);
// Convert far-end partition to the frequency domain with windowing.
memcpy(fft, farend, sizeof(float) * PART_LEN2);
TimeToFrequency(fft, xf, 1);
WindowData(fft, farend);
Fft(fft, xf);
WebRtc_WriteBuffer(aec->far_buf_windowed, &xf[0][0], 1);
}


@@ -172,20 +172,23 @@ struct AecCore {
typedef void (*WebRtcAecFilterFar)(
int num_partitions,
int x_fft_buffer_block_pos,
float x_fft_buffer[2][kExtendedNumPartitions * PART_LEN1],
float h_fft_buffer[2][kExtendedNumPartitions * PART_LEN1],
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float y_fft[2][PART_LEN1]);
extern WebRtcAecFilterFar WebRtcAec_FilterFar;
typedef void (*WebRtcAecScaleErrorSignal)(int extended_filter_enabled,
float normal_mu,
float normal_error_threshold,
float* xPow,
float x_pow[PART_LEN1],
float ef[2][PART_LEN1]);
extern WebRtcAecScaleErrorSignal WebRtcAec_ScaleErrorSignal;
typedef void (*WebRtcAecFilterAdaptation)(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]);
typedef void (*WebRtcAecFilterAdaptation)(
int num_partitions,
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float e_fft[2][PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]);
extern WebRtcAecFilterAdaptation WebRtcAec_FilterAdaptation;
typedef void (*WebRtcAecOverdriveAndSuppress)(AecCore* aec,
float hNl[PART_LEN1],


@@ -322,24 +322,24 @@ void WebRtcAec_ComfortNoise_mips(AecCore* aec,
void WebRtcAec_FilterFar_mips(
int num_partitions,
int xfBufBlockPos,
float xfBuf[2][kExtendedNumPartitions * PART_LEN1],
float wfBuf[2][kExtendedNumPartitions * PART_LEN1],
float yf[2][PART_LEN1]) {
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float y_fft[2][PART_LEN1]) {
int i;
for (i = 0; i < num_partitions; i++) {
int xPos = (i + xfBufBlockPos) * PART_LEN1;
int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
int pos = i * PART_LEN1;
// Check for wrap
if (i + xfBufBlockPos >= num_partitions) {
if (i + x_fft_buf_block_pos >= num_partitions) {
xPos -= num_partitions * (PART_LEN1);
}
float* yf0 = yf[0];
float* yf1 = yf[1];
float* aRe = xfBuf[0] + xPos;
float* aIm = xfBuf[1] + xPos;
float* bRe = wfBuf[0] + pos;
float* bIm = wfBuf[1] + pos;
float* yf0 = y_fft[0];
float* yf1 = y_fft[1];
float* aRe = x_fft_buf[0] + xPos;
float* aIm = x_fft_buf[1] + xPos;
float* bRe = h_fft_buf[0] + pos;
float* bIm = h_fft_buf[1] + pos;
float f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13;
int len = PART_LEN1 >> 1;
@@ -437,23 +437,27 @@ void WebRtcAec_FilterFar_mips(
}
}
void WebRtcAec_FilterAdaptation_mips(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]) {
void WebRtcAec_FilterAdaptation_mips(
int num_partitions,
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float e_fft[2][PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
float fft[PART_LEN2];
int i;
for (i = 0; i < aec->num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1);
for (i = 0; i < num_partitions; i++) {
int xPos = (i + x_fft_buf_block_pos)*(PART_LEN1);
int pos;
// Check for wrap
if (i + aec->xfBufBlockPos >= aec->num_partitions) {
xPos -= aec->num_partitions * PART_LEN1;
if (i + x_fft_buf_block_pos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
pos = i * PART_LEN1;
float* aRe = aec->xfBuf[0] + xPos;
float* aIm = aec->xfBuf[1] + xPos;
float* bRe = ef[0];
float* bIm = ef[1];
float* aRe = x_fft_buf[0] + xPos;
float* aIm = x_fft_buf[1] + xPos;
float* bRe = e_fft[0];
float* bIm = e_fft[1];
float* fft_tmp;
float f0, f1, f2, f3, f4, f5, f6 ,f7, f8, f9, f10, f11, f12;
@@ -578,8 +582,8 @@ void WebRtcAec_FilterAdaptation_mips(AecCore* aec,
);
}
aec_rdft_forward_128(fft);
aRe = aec->wfBuf[0] + pos;
aIm = aec->wfBuf[1] + pos;
aRe = h_fft_buf[0] + pos;
aIm = h_fft_buf[1] + pos;
__asm __volatile (
".set push \n\t"
".set noreorder \n\t"
@@ -707,7 +711,7 @@ void WebRtcAec_OverdriveAndSuppress_mips(AecCore* aec,
void WebRtcAec_ScaleErrorSignal_mips(int extended_filter_enabled,
float normal_mu,
float normal_error_threshold,
float *x_pow,
float x_pow[PART_LEN1],
float ef[2][PART_LEN1]) {
const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
const float error_threshold = extended_filter_enabled


@@ -34,48 +34,49 @@ __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
return aRe * bIm + aIm * bRe;
}
static void FilterFarNEON(int num_partitions,
int xfBufBlockPos,
float xfBuf[2][kExtendedNumPartitions * PART_LEN1],
float wfBuf[2][kExtendedNumPartitions * PART_LEN1],
float yf[2][PART_LEN1]) {
static void FilterFarNEON(
int num_partitions,
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float y_fft[2][PART_LEN1]) {
int i;
for (i = 0; i < num_partitions; i++) {
int j;
int xPos = (i + xfBufBlockPos) * PART_LEN1;
int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
int pos = i * PART_LEN1;
// Check for wrap
if (i + xfBufBlockPos >= num_partitions) {
if (i + x_fft_buf_block_pos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
// vectorized code (four at once)
for (j = 0; j + 3 < PART_LEN1; j += 4) {
const float32x4_t xfBuf_re = vld1q_f32(&xfBuf[0][xPos + j]);
const float32x4_t xfBuf_im = vld1q_f32(&xfBuf[1][xPos + j]);
const float32x4_t wfBuf_re = vld1q_f32(&wfBuf[0][pos + j]);
const float32x4_t wfBuf_im = vld1q_f32(&wfBuf[1][pos + j]);
const float32x4_t yf_re = vld1q_f32(&yf[0][j]);
const float32x4_t yf_im = vld1q_f32(&yf[1][j]);
const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re);
const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im);
const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im);
const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re);
const float32x4_t g = vaddq_f32(yf_re, e);
const float32x4_t h = vaddq_f32(yf_im, f);
vst1q_f32(&yf[0][j], g);
vst1q_f32(&yf[1][j], h);
const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]);
const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]);
const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re);
const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im);
const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im);
const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re);
const float32x4_t g = vaddq_f32(y_fft_re, e);
const float32x4_t h = vaddq_f32(y_fft_im, f);
vst1q_f32(&y_fft[0][j], g);
vst1q_f32(&y_fft[1][j], h);
}
// scalar code for the remaining items.
for (; j < PART_LEN1; j++) {
yf[0][j] += MulRe(xfBuf[0][xPos + j],
xfBuf[1][xPos + j],
wfBuf[0][pos + j],
wfBuf[1][pos + j]);
yf[1][j] += MulIm(xfBuf[0][xPos + j],
xfBuf[1][xPos + j],
wfBuf[0][pos + j],
wfBuf[1][pos + j]);
y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j],
x_fft_buf[1][xPos + j],
h_fft_buf[0][pos + j],
h_fft_buf[1][pos + j]);
y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j],
x_fft_buf[1][xPos + j],
h_fft_buf[0][pos + j],
h_fft_buf[1][pos + j]);
}
}
}
@@ -128,7 +129,7 @@ static float32x4_t vsqrtq_f32(float32x4_t s) {
static void ScaleErrorSignalNEON(int extended_filter_enabled,
float normal_mu,
float normal_error_threshold,
float *x_pow,
float x_pow[PART_LEN1],
float ef[2][PART_LEN1]) {
const float mu = extended_filter_enabled ? kExtendedMu : normal_mu;
const float error_threshold = extended_filter_enabled ?
@@ -185,34 +186,37 @@ static void ScaleErrorSignalNEON(int extended_filter_enabled,
}
}
static void FilterAdaptationNEON(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]) {
static void FilterAdaptationNEON(
int num_partitions,
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float e_fft[2][PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
float fft[PART_LEN2];
int i;
const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos) * PART_LEN1;
int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
int pos = i * PART_LEN1;
int j;
// Check for wrap
if (i + aec->xfBufBlockPos >= num_partitions) {
if (i + x_fft_buf_block_pos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
// Process the whole array...
for (j = 0; j < PART_LEN; j += 4) {
// Load xfBuf and ef.
const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]);
const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]);
const float32x4_t ef_re = vld1q_f32(&ef[0][j]);
const float32x4_t ef_im = vld1q_f32(&ef[1][j]);
// Calculate the product of conjugate(xfBuf) by ef.
// Load x_fft_buf and e_fft.
const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
const float32x4_t e_fft_re = vld1q_f32(&e_fft[0][j]);
const float32x4_t e_fft_im = vld1q_f32(&e_fft[1][j]);
// Calculate the product of conjugate(x_fft_buf) by e_fft.
// re(conjugate(a) * b) = aRe * bRe + aIm * bIm
// im(conjugate(a) * b)= aRe * bIm - aIm * bRe
const float32x4_t a = vmulq_f32(xfBuf_re, ef_re);
const float32x4_t e = vmlaq_f32(a, xfBuf_im, ef_im);
const float32x4_t c = vmulq_f32(xfBuf_re, ef_im);
const float32x4_t f = vmlsq_f32(c, xfBuf_im, ef_re);
const float32x4_t a = vmulq_f32(x_fft_buf_re, e_fft_re);
const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im);
const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im);
const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re);
// Interleave real and imaginary parts.
const float32x4x2_t g_n_h = vzipq_f32(e, f);
// Store
@@ -220,10 +224,10 @@ static void FilterAdaptationNEON(AecCore* aec,
vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
}
// ... and fixup the first imaginary entry.
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
-aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN],
ef[1][PART_LEN]);
fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN],
-x_fft_buf[1][xPos + PART_LEN],
e_fft[0][PART_LEN],
e_fft[1][PART_LEN]);
aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
@@ -241,21 +245,21 @@ static void FilterAdaptationNEON(AecCore* aec,
aec_rdft_forward_128(fft);
{
const float wt1 = aec->wfBuf[1][pos];
aec->wfBuf[0][pos + PART_LEN] += fft[1];
const float wt1 = h_fft_buf[1][pos];
h_fft_buf[0][pos + PART_LEN] += fft[1];
for (j = 0; j < PART_LEN; j += 4) {
float32x4_t wtBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]);
float32x4_t wtBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]);
float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);
vst1q_f32(&aec->wfBuf[0][pos + j], wtBuf_re);
vst1q_f32(&aec->wfBuf[1][pos + j], wtBuf_im);
vst1q_f32(&h_fft_buf[0][pos + j], wtBuf_re);
vst1q_f32(&h_fft_buf[1][pos + j], wtBuf_im);
}
aec->wfBuf[1][pos] = wt1;
h_fft_buf[1][pos] = wt1;
}
}
}


@@ -29,51 +29,52 @@ __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
return aRe * bIm + aIm * bRe;
}
static void FilterFarSSE2(int num_partitions,
int xfBufBlockPos,
float xfBuf[2][kExtendedNumPartitions * PART_LEN1],
float wfBuf[2][kExtendedNumPartitions * PART_LEN1],
float yf[2][PART_LEN1]) {
static void FilterFarSSE2(
int num_partitions,
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float y_fft[2][PART_LEN1]) {
int i;
for (i = 0; i < num_partitions; i++) {
int j;
int xPos = (i + xfBufBlockPos) * PART_LEN1;
int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
int pos = i * PART_LEN1;
// Check for wrap
if (i + xfBufBlockPos >= num_partitions) {
if (i + x_fft_buf_block_pos >= num_partitions) {
xPos -= num_partitions * (PART_LEN1);
}
// vectorized code (four at once)
for (j = 0; j + 3 < PART_LEN1; j += 4) {
const __m128 xfBuf_re = _mm_loadu_ps(&xfBuf[0][xPos + j]);
const __m128 xfBuf_im = _mm_loadu_ps(&xfBuf[1][xPos + j]);
const __m128 wfBuf_re = _mm_loadu_ps(&wfBuf[0][pos + j]);
const __m128 wfBuf_im = _mm_loadu_ps(&wfBuf[1][pos + j]);
const __m128 yf_re = _mm_loadu_ps(&yf[0][j]);
const __m128 yf_im = _mm_loadu_ps(&yf[1][j]);
const __m128 a = _mm_mul_ps(xfBuf_re, wfBuf_re);
const __m128 b = _mm_mul_ps(xfBuf_im, wfBuf_im);
const __m128 c = _mm_mul_ps(xfBuf_re, wfBuf_im);
const __m128 d = _mm_mul_ps(xfBuf_im, wfBuf_re);
const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]);
const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]);
const __m128 h_fft_buf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]);
const __m128 h_fft_buf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]);
const __m128 y_fft_re = _mm_loadu_ps(&y_fft[0][j]);
const __m128 y_fft_im = _mm_loadu_ps(&y_fft[1][j]);
const __m128 a = _mm_mul_ps(x_fft_buf_re, h_fft_buf_re);
const __m128 b = _mm_mul_ps(x_fft_buf_im, h_fft_buf_im);
const __m128 c = _mm_mul_ps(x_fft_buf_re, h_fft_buf_im);
const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re);
const __m128 e = _mm_sub_ps(a, b);
const __m128 f = _mm_add_ps(c, d);
const __m128 g = _mm_add_ps(yf_re, e);
const __m128 h = _mm_add_ps(yf_im, f);
_mm_storeu_ps(&yf[0][j], g);
_mm_storeu_ps(&yf[1][j], h);
const __m128 g = _mm_add_ps(y_fft_re, e);
const __m128 h = _mm_add_ps(y_fft_im, f);
_mm_storeu_ps(&y_fft[0][j], g);
_mm_storeu_ps(&y_fft[1][j], h);
}
// scalar code for the remaining items.
for (; j < PART_LEN1; j++) {
yf[0][j] += MulRe(xfBuf[0][xPos + j],
xfBuf[1][xPos + j],
wfBuf[0][pos + j],
wfBuf[1][pos + j]);
yf[1][j] += MulIm(xfBuf[0][xPos + j],
xfBuf[1][xPos + j],
wfBuf[0][pos + j],
wfBuf[1][pos + j]);
y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j],
x_fft_buf[1][xPos + j],
h_fft_buf[0][pos + j],
h_fft_buf[1][pos + j]);
y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j],
x_fft_buf[1][xPos + j],
h_fft_buf[0][pos + j],
h_fft_buf[1][pos + j]);
}
}
}
@@ -81,7 +82,7 @@ static void FilterFarSSE2(int num_partitions,
static void ScaleErrorSignalSSE2(int extended_filter_enabled,
float normal_mu,
float normal_error_threshold,
float *x_pow,
float x_pow[PART_LEN1],
float ef[2][PART_LEN1]) {
const __m128 k1e_10f = _mm_set1_ps(1e-10f);
const __m128 kMu = extended_filter_enabled ? _mm_set1_ps(kExtendedMu)
@@ -147,33 +148,36 @@ static void ScaleErrorSignalSSE2(int extended_filter_enabled,
}
}
static void FilterAdaptationSSE2(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]) {
static void FilterAdaptationSSE2(
int num_partitions,
int x_fft_buf_block_pos,
float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
float e_fft[2][PART_LEN1],
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
float fft[PART_LEN2];
int i, j;
const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1);
int pos = i * PART_LEN1;
// Check for wrap
if (i + aec->xfBufBlockPos >= num_partitions) {
if (i + x_fft_buf_block_pos >= num_partitions) {
xPos -= num_partitions * PART_LEN1;
}
// Process the whole array...
for (j = 0; j < PART_LEN; j += 4) {
// Load xfBuf and ef.
const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
const __m128 ef_re = _mm_loadu_ps(&ef[0][j]);
const __m128 ef_im = _mm_loadu_ps(&ef[1][j]);
// Calculate the product of conjugate(xfBuf) by ef.
// Load x_fft_buf and e_fft.
const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]);
const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]);
const __m128 e_fft_re = _mm_loadu_ps(&e_fft[0][j]);
const __m128 e_fft_im = _mm_loadu_ps(&e_fft[1][j]);
// Calculate the product of conjugate(x_fft_buf) by e_fft.
// re(conjugate(a) * b) = aRe * bRe + aIm * bIm
// im(conjugate(a) * b)= aRe * bIm - aIm * bRe
const __m128 a = _mm_mul_ps(xfBuf_re, ef_re);
const __m128 b = _mm_mul_ps(xfBuf_im, ef_im);
const __m128 c = _mm_mul_ps(xfBuf_re, ef_im);
const __m128 d = _mm_mul_ps(xfBuf_im, ef_re);
const __m128 a = _mm_mul_ps(x_fft_buf_re, e_fft_re);
const __m128 b = _mm_mul_ps(x_fft_buf_im, e_fft_im);
const __m128 c = _mm_mul_ps(x_fft_buf_re, e_fft_im);
const __m128 d = _mm_mul_ps(x_fft_buf_im, e_fft_re);
const __m128 e = _mm_add_ps(a, b);
const __m128 f = _mm_sub_ps(c, d);
// Interleave real and imaginary parts.
@@ -184,10 +188,10 @@ static void FilterAdaptationSSE2(AecCore* aec,
_mm_storeu_ps(&fft[2 * j + 4], h);
}
// ... and fixup the first imaginary entry.
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
-aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN],
ef[1][PART_LEN]);
fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN],
-x_fft_buf[1][xPos + PART_LEN],
e_fft[0][PART_LEN],
e_fft[1][PART_LEN]);
aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
@@ -205,11 +209,11 @@ static void FilterAdaptationSSE2(AecCore* aec,
aec_rdft_forward_128(fft);
{
float wt1 = aec->wfBuf[1][pos];
aec->wfBuf[0][pos + PART_LEN] += fft[1];
float wt1 = h_fft_buf[1][pos];
h_fft_buf[0][pos + PART_LEN] += fft[1];
for (j = 0; j < PART_LEN; j += 4) {
__m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
__m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
__m128 wtBuf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]);
__m128 wtBuf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]);
const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
const __m128 fft_re =
@@ -218,10 +222,10 @@ static void FilterAdaptationSSE2(AecCore* aec,
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
_mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
_mm_storeu_ps(&aec->wfBuf[1][pos + j], wtBuf_im);
_mm_storeu_ps(&h_fft_buf[0][pos + j], wtBuf_re);
_mm_storeu_ps(&h_fft_buf[1][pos + j], wtBuf_im);
}
aec->wfBuf[1][pos] = wt1;
h_fft_buf[1][pos] = wt1;
}
}
}