Made the method PartitionDelay independent of the AEC state.
This CL is a step towards simplifying the AEC code, making it more modifiable and modular. The changes should be bit-exact. BUG=webrtc:5201, webrtc:5298 Review-Url: https://codereview.webrtc.org/1936203002 Cr-Commit-Position: refs/heads/master@{#12654}
This commit is contained in:
@ -329,7 +329,9 @@ static void OverdriveAndSuppress(float overdrive_scaling,
|
||||
}
|
||||
}
|
||||
|
||||
static int PartitionDelay(const AecCore* aec) {
|
||||
static int PartitionDelay(int num_partitions,
|
||||
float h_fft_buf[2]
|
||||
[kExtendedNumPartitions * PART_LEN1]) {
|
||||
// Measures the energy in each filter partition and returns the partition with
|
||||
// highest energy.
|
||||
// TODO(bjornv): Spread computational cost by computing one partition per
|
||||
@ -338,13 +340,13 @@ static int PartitionDelay(const AecCore* aec) {
|
||||
int i;
|
||||
int delay = 0;
|
||||
|
||||
for (i = 0; i < aec->num_partitions; i++) {
|
||||
for (i = 0; i < num_partitions; i++) {
|
||||
int j;
|
||||
int pos = i * PART_LEN1;
|
||||
float wfEn = 0;
|
||||
for (j = 0; j < PART_LEN1; j++) {
|
||||
wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
|
||||
aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
|
||||
wfEn += h_fft_buf[0][pos + j] * h_fft_buf[0][pos + j] +
|
||||
h_fft_buf[1][pos + j] * h_fft_buf[1][pos + j];
|
||||
}
|
||||
|
||||
if (wfEn > wfEnMax) {
|
||||
@ -1053,7 +1055,7 @@ static void EchoSuppression(AecCore* aec,
|
||||
aec->delayEstCtr++;
|
||||
if (aec->delayEstCtr == delayEstInterval) {
|
||||
aec->delayEstCtr = 0;
|
||||
aec->delayIdx = WebRtcAec_PartitionDelay(aec);
|
||||
aec->delayIdx = WebRtcAec_PartitionDelay(aec->num_partitions, aec->wfBuf);
|
||||
}
|
||||
|
||||
// Use delayed far.
|
||||
|
||||
@ -234,7 +234,9 @@ typedef void (*WebRtcAecSubBandCoherence)(int mult,
|
||||
int* extreme_filter_divergence);
|
||||
extern WebRtcAecSubBandCoherence WebRtcAec_SubbandCoherence;
|
||||
|
||||
typedef int (*WebRtcAecPartitionDelay)(const AecCore* aec);
|
||||
typedef int (*WebRtcAecPartitionDelay)(
|
||||
int num_partitions,
|
||||
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]);
|
||||
extern WebRtcAecPartitionDelay WebRtcAec_PartitionDelay;
|
||||
|
||||
typedef void (*WebRtcAecStoreAsComplex)(const float* data,
|
||||
|
||||
@ -448,7 +448,9 @@ static void OverdriveAndSuppressNEON(float overdrive_scaling,
|
||||
}
|
||||
}
|
||||
|
||||
static int PartitionDelayNEON(const AecCore* aec) {
|
||||
static int PartitionDelayNEON(
|
||||
int num_partitions,
|
||||
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
|
||||
// Measures the energy in each filter partition and returns the partition with
|
||||
// highest energy.
|
||||
// TODO(bjornv): Spread computational cost by computing one partition per
|
||||
@ -457,15 +459,15 @@ static int PartitionDelayNEON(const AecCore* aec) {
|
||||
int i;
|
||||
int delay = 0;
|
||||
|
||||
for (i = 0; i < aec->num_partitions; i++) {
|
||||
for (i = 0; i < num_partitions; i++) {
|
||||
int j;
|
||||
int pos = i * PART_LEN1;
|
||||
float wfEn = 0;
|
||||
float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
|
||||
// vectorized code (four at once)
|
||||
for (j = 0; j + 3 < PART_LEN1; j += 4) {
|
||||
const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
|
||||
const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
|
||||
const float32x4_t vec_wfBuf0 = vld1q_f32(&h_fft_buf[0][pos + j]);
|
||||
const float32x4_t vec_wfBuf1 = vld1q_f32(&h_fft_buf[1][pos + j]);
|
||||
vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
|
||||
vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
|
||||
}
|
||||
@ -481,8 +483,8 @@ static int PartitionDelayNEON(const AecCore* aec) {
|
||||
|
||||
// scalar code for the remaining items.
|
||||
for (; j < PART_LEN1; j++) {
|
||||
wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
|
||||
aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
|
||||
wfEn += h_fft_buf[0][pos + j] * h_fft_buf[0][pos + j] +
|
||||
h_fft_buf[1][pos + j] * h_fft_buf[1][pos + j];
|
||||
}
|
||||
|
||||
if (wfEn > wfEnMax) {
|
||||
|
||||
@ -449,7 +449,9 @@ __inline static void _mm_add_ps_4x1(__m128 sum, float* dst) {
|
||||
_mm_store_ss(dst, sum);
|
||||
}
|
||||
|
||||
static int PartitionDelaySSE2(const AecCore* aec) {
|
||||
static int PartitionDelaySSE2(
|
||||
int num_partitions,
|
||||
float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
|
||||
// Measures the energy in each filter partition and returns the partition with
|
||||
// highest energy.
|
||||
// TODO(bjornv): Spread computational cost by computing one partition per
|
||||
@ -458,15 +460,15 @@ static int PartitionDelaySSE2(const AecCore* aec) {
|
||||
int i;
|
||||
int delay = 0;
|
||||
|
||||
for (i = 0; i < aec->num_partitions; i++) {
|
||||
for (i = 0; i < num_partitions; i++) {
|
||||
int j;
|
||||
int pos = i * PART_LEN1;
|
||||
float wfEn = 0;
|
||||
__m128 vec_wfEn = _mm_set1_ps(0.0f);
|
||||
// vectorized code (four at once)
|
||||
for (j = 0; j + 3 < PART_LEN1; j += 4) {
|
||||
const __m128 vec_wfBuf0 = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
|
||||
const __m128 vec_wfBuf1 = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
|
||||
const __m128 vec_wfBuf0 = _mm_loadu_ps(&h_fft_buf[0][pos + j]);
|
||||
const __m128 vec_wfBuf1 = _mm_loadu_ps(&h_fft_buf[1][pos + j]);
|
||||
vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0));
|
||||
vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1));
|
||||
}
|
||||
@ -474,8 +476,8 @@ static int PartitionDelaySSE2(const AecCore* aec) {
|
||||
|
||||
// scalar code for the remaining items.
|
||||
for (; j < PART_LEN1; j++) {
|
||||
wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
|
||||
aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
|
||||
wfEn += h_fft_buf[0][pos + j] * h_fft_buf[0][pos + j] +
|
||||
h_fft_buf[1][pos + j] * h_fft_buf[1][pos + j];
|
||||
}
|
||||
|
||||
if (wfEn > wfEnMax) {
|
||||
|
||||
Reference in New Issue
Block a user