Neon version of SubbandCoherence()
The performance gain on a Nexus 7 reported by audioproc is ~1.4% The output is NOT bit exact. Any difference seen is +-1. BUG=3131 R=bjornv@webrtc.org, cd@webrtc.org Review URL: https://webrtc-codereview.appspot.com/17839005 Patch from Scott LaVarnway <slavarnw@gmail.com>. git-svn-id: http://webrtc.googlecode.com/svn/trunk@6647 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
32
webrtc/modules/audio_processing/aec/aec_common.h
Normal file
32
webrtc/modules/audio_processing/aec/aec_common.h
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
|
||||||
|
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
|
||||||
|
|
||||||
|
#include "webrtc/typedefs.h"
|
||||||
|
|
||||||
|
#ifdef _MSC_VER /* visual c++ */
|
||||||
|
#define ALIGN16_BEG __declspec(align(16))
|
||||||
|
#define ALIGN16_END
|
||||||
|
#else /* gcc or icc */
|
||||||
|
#define ALIGN16_BEG
|
||||||
|
#define ALIGN16_END __attribute__((aligned(16)))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_sqrtHanning[65];
|
||||||
|
extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_weightCurve[65];
|
||||||
|
extern ALIGN16_BEG const float ALIGN16_END WebRtcAec_overDriveCurve[65];
|
||||||
|
extern const float WebRtcAec_kExtendedSmoothingCoefficients[2][2];
|
||||||
|
extern const float WebRtcAec_kNormalSmoothingCoefficients[2][2];
|
||||||
|
extern const float WebRtcAec_kMinFarendPSD;
|
||||||
|
|
||||||
|
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_COMMON_H_
|
||||||
|
|
@ -21,6 +21,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
|
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
|
||||||
|
#include "webrtc/modules/audio_processing/aec/aec_common.h"
|
||||||
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
|
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
|
||||||
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
||||||
#include "webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h"
|
#include "webrtc/modules/audio_processing/utility/delay_estimator_wrapper.h"
|
||||||
@ -45,7 +46,7 @@ static const int freqAvgIc = PART_LEN / 2;
|
|||||||
// Matlab code to produce table:
|
// Matlab code to produce table:
|
||||||
// win = sqrt(hanning(63)); win = [0 ; win(1:32)];
|
// win = sqrt(hanning(63)); win = [0 ; win(1:32)];
|
||||||
// fprintf(1, '\t%.14f, %.14f, %.14f,\n', win);
|
// fprintf(1, '\t%.14f, %.14f, %.14f,\n', win);
|
||||||
static const float sqrtHanning[65] = {
|
ALIGN16_BEG const float ALIGN16_END WebRtcAec_sqrtHanning[65] = {
|
||||||
0.00000000000000f, 0.02454122852291f, 0.04906767432742f, 0.07356456359967f,
|
0.00000000000000f, 0.02454122852291f, 0.04906767432742f, 0.07356456359967f,
|
||||||
0.09801714032956f, 0.12241067519922f, 0.14673047445536f, 0.17096188876030f,
|
0.09801714032956f, 0.12241067519922f, 0.14673047445536f, 0.17096188876030f,
|
||||||
0.19509032201613f, 0.21910124015687f, 0.24298017990326f, 0.26671275747490f,
|
0.19509032201613f, 0.21910124015687f, 0.24298017990326f, 0.26671275747490f,
|
||||||
@ -99,10 +100,10 @@ static const float kTargetSupp[3] = {-6.9f, -11.5f, -18.4f};
|
|||||||
// Two sets of parameters, one for the extended filter mode.
|
// Two sets of parameters, one for the extended filter mode.
|
||||||
static const float kExtendedMinOverDrive[3] = {3.0f, 6.0f, 15.0f};
|
static const float kExtendedMinOverDrive[3] = {3.0f, 6.0f, 15.0f};
|
||||||
static const float kNormalMinOverDrive[3] = {1.0f, 2.0f, 5.0f};
|
static const float kNormalMinOverDrive[3] = {1.0f, 2.0f, 5.0f};
|
||||||
static const float kExtendedSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
|
const float WebRtcAec_kExtendedSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
|
||||||
{0.92f, 0.08f}};
|
{0.92f, 0.08f}};
|
||||||
static const float kNormalSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
|
const float WebRtcAec_kNormalSmoothingCoefficients[2][2] = {{0.9f, 0.1f},
|
||||||
{0.93f, 0.07f}};
|
{0.93f, 0.07f}};
|
||||||
|
|
||||||
// Number of partitions forming the NLP's "preferred" bands.
|
// Number of partitions forming the NLP's "preferred" bands.
|
||||||
enum {
|
enum {
|
||||||
@ -442,7 +443,7 @@ static int PartitionDelay(const AecCore* aec) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Threshold to protect against the ill-effects of a zero far-end.
|
// Threshold to protect against the ill-effects of a zero far-end.
|
||||||
static const float kMinFarendPSD = 15;
|
const float WebRtcAec_kMinFarendPSD = 15;
|
||||||
|
|
||||||
// Updates the following smoothed Power Spectral Densities (PSD):
|
// Updates the following smoothed Power Spectral Densities (PSD):
|
||||||
// - sd : near-end
|
// - sd : near-end
|
||||||
@ -459,8 +460,8 @@ static void SmoothedPSD(AecCore* aec,
|
|||||||
float xfw[2][PART_LEN1]) {
|
float xfw[2][PART_LEN1]) {
|
||||||
// Power estimate smoothing coefficients.
|
// Power estimate smoothing coefficients.
|
||||||
const float* ptrGCoh = aec->extended_filter_enabled
|
const float* ptrGCoh = aec->extended_filter_enabled
|
||||||
? kExtendedSmoothingCoefficients[aec->mult - 1]
|
? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
|
||||||
: kNormalSmoothingCoefficients[aec->mult - 1];
|
: WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
|
||||||
int i;
|
int i;
|
||||||
float sdSum = 0, seSum = 0;
|
float sdSum = 0, seSum = 0;
|
||||||
|
|
||||||
@ -476,7 +477,8 @@ static void SmoothedPSD(AecCore* aec,
|
|||||||
aec->sx[i] =
|
aec->sx[i] =
|
||||||
ptrGCoh[0] * aec->sx[i] +
|
ptrGCoh[0] * aec->sx[i] +
|
||||||
ptrGCoh[1] * WEBRTC_SPL_MAX(
|
ptrGCoh[1] * WEBRTC_SPL_MAX(
|
||||||
xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i], kMinFarendPSD);
|
xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
|
||||||
|
WebRtcAec_kMinFarendPSD);
|
||||||
|
|
||||||
aec->sde[i][0] =
|
aec->sde[i][0] =
|
||||||
ptrGCoh[0] * aec->sde[i][0] +
|
ptrGCoh[0] * aec->sde[i][0] +
|
||||||
@ -511,8 +513,9 @@ static void SmoothedPSD(AecCore* aec,
|
|||||||
__inline static void WindowData(float* x_windowed, const float* x) {
|
__inline static void WindowData(float* x_windowed, const float* x) {
|
||||||
int i;
|
int i;
|
||||||
for (i = 0; i < PART_LEN; i++) {
|
for (i = 0; i < PART_LEN; i++) {
|
||||||
x_windowed[i] = x[i] * sqrtHanning[i];
|
x_windowed[i] = x[i] * WebRtcAec_sqrtHanning[i];
|
||||||
x_windowed[PART_LEN + i] = x[PART_LEN + i] * sqrtHanning[PART_LEN - i];
|
x_windowed[PART_LEN + i] =
|
||||||
|
x[PART_LEN + i] * WebRtcAec_sqrtHanning[PART_LEN - i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1347,10 +1350,10 @@ static void NonLinearProcessing(AecCore* aec, float* output, float* outputH) {
|
|||||||
scale = 2.0f / PART_LEN2;
|
scale = 2.0f / PART_LEN2;
|
||||||
for (i = 0; i < PART_LEN; i++) {
|
for (i = 0; i < PART_LEN; i++) {
|
||||||
fft[i] *= scale; // fft scaling
|
fft[i] *= scale; // fft scaling
|
||||||
fft[i] = fft[i] * sqrtHanning[i] + aec->outBuf[i];
|
fft[i] = fft[i] * WebRtcAec_sqrtHanning[i] + aec->outBuf[i];
|
||||||
|
|
||||||
fft[PART_LEN + i] *= scale; // fft scaling
|
fft[PART_LEN + i] *= scale; // fft scaling
|
||||||
aec->outBuf[i] = fft[PART_LEN + i] * sqrtHanning[PART_LEN - i];
|
aec->outBuf[i] = fft[PART_LEN + i] * WebRtcAec_sqrtHanning[PART_LEN - i];
|
||||||
|
|
||||||
// Saturate output to keep it in the allowed range.
|
// Saturate output to keep it in the allowed range.
|
||||||
output[i] = WEBRTC_SPL_SAT(
|
output[i] = WEBRTC_SPL_SAT(
|
||||||
@ -1737,8 +1740,8 @@ static void TimeToFrequency(float time_data[PART_LEN2],
|
|||||||
// TODO(bjornv): Should we have a different function/wrapper for windowed FFT?
|
// TODO(bjornv): Should we have a different function/wrapper for windowed FFT?
|
||||||
if (window) {
|
if (window) {
|
||||||
for (i = 0; i < PART_LEN; i++) {
|
for (i = 0; i < PART_LEN; i++) {
|
||||||
time_data[i] *= sqrtHanning[i];
|
time_data[i] *= WebRtcAec_sqrtHanning[i];
|
||||||
time_data[PART_LEN + i] *= sqrtHanning[PART_LEN - i];
|
time_data[PART_LEN + i] *= WebRtcAec_sqrtHanning[PART_LEN - i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include "webrtc/modules/audio_processing/aec/aec_common.h"
|
||||||
#include "webrtc/modules/audio_processing/aec/aec_core.h"
|
#include "webrtc/modules/audio_processing/aec/aec_core.h"
|
||||||
#include "webrtc/modules/audio_processing/utility/ring_buffer.h"
|
#include "webrtc/modules/audio_processing/utility/ring_buffer.h"
|
||||||
#include "webrtc/typedefs.h"
|
#include "webrtc/typedefs.h"
|
||||||
|
@ -14,12 +14,12 @@
|
|||||||
* Based on aec_core_sse2.c.
|
* Based on aec_core_sse2.c.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "webrtc/modules/audio_processing/aec/aec_core.h"
|
|
||||||
|
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <string.h> // memset
|
#include <string.h> // memset
|
||||||
|
|
||||||
|
#include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
|
||||||
|
#include "webrtc/modules/audio_processing/aec/aec_common.h"
|
||||||
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
|
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
|
||||||
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
#include "webrtc/modules/audio_processing/aec/aec_rdft.h"
|
||||||
|
|
||||||
@ -250,9 +250,6 @@ static void FilterAdaptationNEON(AecCore* aec,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern const float WebRtcAec_weightCurve[65];
|
|
||||||
extern const float WebRtcAec_overDriveCurve[65];
|
|
||||||
|
|
||||||
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
|
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
|
||||||
// a^b = exp2(b * log2(a))
|
// a^b = exp2(b * log2(a))
|
||||||
// exp2(x) and log2(x) are calculated using polynomial approximations.
|
// exp2(x) and log2(x) are calculated using polynomial approximations.
|
||||||
@ -442,10 +439,295 @@ static void OverdriveAndSuppressNEON(AecCore* aec,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int PartitionDelay(const AecCore* aec) {
|
||||||
|
// Measures the energy in each filter partition and returns the partition with
|
||||||
|
// highest energy.
|
||||||
|
// TODO(bjornv): Spread computational cost by computing one partition per
|
||||||
|
// block?
|
||||||
|
float wfEnMax = 0;
|
||||||
|
int i;
|
||||||
|
int delay = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < aec->num_partitions; i++) {
|
||||||
|
int j;
|
||||||
|
int pos = i * PART_LEN1;
|
||||||
|
float wfEn = 0;
|
||||||
|
float32x4_t vec_wfEn = vdupq_n_f32(0.0f);
|
||||||
|
// vectorized code (four at once)
|
||||||
|
for (j = 0; j + 3 < PART_LEN1; j += 4) {
|
||||||
|
const float32x4_t vec_wfBuf0 = vld1q_f32(&aec->wfBuf[0][pos + j]);
|
||||||
|
const float32x4_t vec_wfBuf1 = vld1q_f32(&aec->wfBuf[1][pos + j]);
|
||||||
|
vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf0, vec_wfBuf0);
|
||||||
|
vec_wfEn = vmlaq_f32(vec_wfEn, vec_wfBuf1, vec_wfBuf1);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
float32x2_t vec_total;
|
||||||
|
// A B C D
|
||||||
|
vec_total = vpadd_f32(vget_low_f32(vec_wfEn), vget_high_f32(vec_wfEn));
|
||||||
|
// A+B C+D
|
||||||
|
vec_total = vpadd_f32(vec_total, vec_total);
|
||||||
|
// A+B+C+D A+B+C+D
|
||||||
|
wfEn = vget_lane_f32(vec_total, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// scalar code for the remaining items.
|
||||||
|
for (; j < PART_LEN1; j++) {
|
||||||
|
wfEn += aec->wfBuf[0][pos + j] * aec->wfBuf[0][pos + j] +
|
||||||
|
aec->wfBuf[1][pos + j] * aec->wfBuf[1][pos + j];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wfEn > wfEnMax) {
|
||||||
|
wfEnMax = wfEn;
|
||||||
|
delay = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return delay;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Updates the following smoothed Power Spectral Densities (PSD):
|
||||||
|
// - sd : near-end
|
||||||
|
// - se : residual echo
|
||||||
|
// - sx : far-end
|
||||||
|
// - sde : cross-PSD of near-end and residual echo
|
||||||
|
// - sxd : cross-PSD of near-end and far-end
|
||||||
|
//
|
||||||
|
// In addition to updating the PSDs, also the filter diverge state is determined
|
||||||
|
// upon actions are taken.
|
||||||
|
static void SmoothedPSD(AecCore* aec,
|
||||||
|
float efw[2][PART_LEN1],
|
||||||
|
float dfw[2][PART_LEN1],
|
||||||
|
float xfw[2][PART_LEN1]) {
|
||||||
|
// Power estimate smoothing coefficients.
|
||||||
|
const float* ptrGCoh = aec->extended_filter_enabled
|
||||||
|
? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
|
||||||
|
: WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
|
||||||
|
int i;
|
||||||
|
float sdSum = 0, seSum = 0;
|
||||||
|
const float32x4_t vec_15 = vdupq_n_f32(WebRtcAec_kMinFarendPSD);
|
||||||
|
float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
|
||||||
|
float32x4_t vec_seSum = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
for (i = 0; i + 3 < PART_LEN1; i += 4) {
|
||||||
|
const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
|
||||||
|
const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
|
||||||
|
const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
|
||||||
|
const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
|
||||||
|
const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
|
||||||
|
const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
|
||||||
|
float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
|
||||||
|
float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
|
||||||
|
float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
|
||||||
|
float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
|
||||||
|
float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
|
||||||
|
float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);
|
||||||
|
|
||||||
|
vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
|
||||||
|
vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
|
||||||
|
vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
|
||||||
|
vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
|
||||||
|
vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
|
||||||
|
vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
|
||||||
|
vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);
|
||||||
|
|
||||||
|
vst1q_f32(&aec->sd[i], vec_sd);
|
||||||
|
vst1q_f32(&aec->se[i], vec_se);
|
||||||
|
vst1q_f32(&aec->sx[i], vec_sx);
|
||||||
|
|
||||||
|
{
|
||||||
|
float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
|
||||||
|
float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
|
||||||
|
float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
|
||||||
|
vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
|
||||||
|
vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
|
||||||
|
vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
|
||||||
|
vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
|
||||||
|
vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
|
||||||
|
vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
|
||||||
|
vst2q_f32(&aec->sde[i][0], vec_sde);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
|
||||||
|
float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
|
||||||
|
float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
|
||||||
|
vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
|
||||||
|
vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
|
||||||
|
vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
|
||||||
|
vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
|
||||||
|
vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
|
||||||
|
vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
|
||||||
|
vst2q_f32(&aec->sxd[i][0], vec_sxd);
|
||||||
|
}
|
||||||
|
|
||||||
|
vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
|
||||||
|
vec_seSum = vaddq_f32(vec_seSum, vec_se);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
float32x2_t vec_sdSum_total;
|
||||||
|
float32x2_t vec_seSum_total;
|
||||||
|
// A B C D
|
||||||
|
vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
|
||||||
|
vget_high_f32(vec_sdSum));
|
||||||
|
vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
|
||||||
|
vget_high_f32(vec_seSum));
|
||||||
|
// A+B C+D
|
||||||
|
vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
|
||||||
|
vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
|
||||||
|
// A+B+C+D A+B+C+D
|
||||||
|
sdSum = vget_lane_f32(vec_sdSum_total, 0);
|
||||||
|
seSum = vget_lane_f32(vec_seSum_total, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// scalar code for the remaining items.
|
||||||
|
for (; i < PART_LEN1; i++) {
|
||||||
|
aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
|
||||||
|
ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
|
||||||
|
aec->se[i] = ptrGCoh[0] * aec->se[i] +
|
||||||
|
ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
|
||||||
|
// We threshold here to protect against the ill-effects of a zero farend.
|
||||||
|
// The threshold is not arbitrarily chosen, but balances protection and
|
||||||
|
// adverse interaction with the algorithm's tuning.
|
||||||
|
// TODO(bjornv): investigate further why this is so sensitive.
|
||||||
|
aec->sx[i] =
|
||||||
|
ptrGCoh[0] * aec->sx[i] +
|
||||||
|
ptrGCoh[1] * WEBRTC_SPL_MAX(
|
||||||
|
xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
|
||||||
|
WebRtcAec_kMinFarendPSD);
|
||||||
|
|
||||||
|
aec->sde[i][0] =
|
||||||
|
ptrGCoh[0] * aec->sde[i][0] +
|
||||||
|
ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
|
||||||
|
aec->sde[i][1] =
|
||||||
|
ptrGCoh[0] * aec->sde[i][1] +
|
||||||
|
ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
|
||||||
|
|
||||||
|
aec->sxd[i][0] =
|
||||||
|
ptrGCoh[0] * aec->sxd[i][0] +
|
||||||
|
ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
|
||||||
|
aec->sxd[i][1] =
|
||||||
|
ptrGCoh[0] * aec->sxd[i][1] +
|
||||||
|
ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
|
||||||
|
|
||||||
|
sdSum += aec->sd[i];
|
||||||
|
seSum += aec->se[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Divergent filter safeguard.
|
||||||
|
aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;
|
||||||
|
|
||||||
|
if (aec->divergeState)
|
||||||
|
memcpy(efw, dfw, sizeof(efw[0][0]) * 2 * PART_LEN1);
|
||||||
|
|
||||||
|
// Reset if error is significantly larger than nearend (13 dB).
|
||||||
|
if (!aec->extended_filter_enabled && seSum > (19.95f * sdSum))
|
||||||
|
memset(aec->wfBuf, 0, sizeof(aec->wfBuf));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Window time domain data to be used by the fft.
|
||||||
|
__inline static void WindowData(float* x_windowed, const float* x) {
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < PART_LEN; i += 4) {
|
||||||
|
const float32x4_t vec_Buf1 = vld1q_f32(&x[i]);
|
||||||
|
const float32x4_t vec_Buf2 = vld1q_f32(&x[PART_LEN + i]);
|
||||||
|
const float32x4_t vec_sqrtHanning = vld1q_f32(&WebRtcAec_sqrtHanning[i]);
|
||||||
|
// A B C D
|
||||||
|
float32x4_t vec_sqrtHanning_rev =
|
||||||
|
vld1q_f32(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
|
||||||
|
// B A D C
|
||||||
|
vec_sqrtHanning_rev = vrev64q_f32(vec_sqrtHanning_rev);
|
||||||
|
// D C B A
|
||||||
|
vec_sqrtHanning_rev = vcombine_f32(vget_high_f32(vec_sqrtHanning_rev),
|
||||||
|
vget_low_f32(vec_sqrtHanning_rev));
|
||||||
|
vst1q_f32(&x_windowed[i], vmulq_f32(vec_Buf1, vec_sqrtHanning));
|
||||||
|
vst1q_f32(&x_windowed[PART_LEN + i],
|
||||||
|
vmulq_f32(vec_Buf2, vec_sqrtHanning_rev));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Puts fft output data into a complex valued array.
|
||||||
|
__inline static void StoreAsComplex(const float* data,
|
||||||
|
float data_complex[2][PART_LEN1]) {
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < PART_LEN; i += 4) {
|
||||||
|
const float32x4x2_t vec_data = vld2q_f32(&data[2 * i]);
|
||||||
|
vst1q_f32(&data_complex[0][i], vec_data.val[0]);
|
||||||
|
vst1q_f32(&data_complex[1][i], vec_data.val[1]);
|
||||||
|
}
|
||||||
|
// fix beginning/end values
|
||||||
|
data_complex[1][0] = 0;
|
||||||
|
data_complex[1][PART_LEN] = 0;
|
||||||
|
data_complex[0][0] = data[0];
|
||||||
|
data_complex[0][PART_LEN] = data[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void SubbandCoherenceNEON(AecCore* aec,
|
||||||
|
float efw[2][PART_LEN1],
|
||||||
|
float xfw[2][PART_LEN1],
|
||||||
|
float* fft,
|
||||||
|
float* cohde,
|
||||||
|
float* cohxd) {
|
||||||
|
float dfw[2][PART_LEN1];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
if (aec->delayEstCtr == 0)
|
||||||
|
aec->delayIdx = PartitionDelay(aec);
|
||||||
|
|
||||||
|
// Use delayed far.
|
||||||
|
memcpy(xfw,
|
||||||
|
aec->xfwBuf + aec->delayIdx * PART_LEN1,
|
||||||
|
sizeof(xfw[0][0]) * 2 * PART_LEN1);
|
||||||
|
|
||||||
|
// Windowed near fft
|
||||||
|
WindowData(fft, aec->dBuf);
|
||||||
|
aec_rdft_forward_128(fft);
|
||||||
|
StoreAsComplex(fft, dfw);
|
||||||
|
|
||||||
|
// Windowed error fft
|
||||||
|
WindowData(fft, aec->eBuf);
|
||||||
|
aec_rdft_forward_128(fft);
|
||||||
|
StoreAsComplex(fft, efw);
|
||||||
|
|
||||||
|
SmoothedPSD(aec, efw, dfw, xfw);
|
||||||
|
|
||||||
|
{
|
||||||
|
const float32x4_t vec_1eminus10 = vdupq_n_f32(1e-10f);
|
||||||
|
|
||||||
|
// Subband coherence
|
||||||
|
for (i = 0; i + 3 < PART_LEN1; i += 4) {
|
||||||
|
const float32x4_t vec_sd = vld1q_f32(&aec->sd[i]);
|
||||||
|
const float32x4_t vec_se = vld1q_f32(&aec->se[i]);
|
||||||
|
const float32x4_t vec_sx = vld1q_f32(&aec->sx[i]);
|
||||||
|
const float32x4_t vec_sdse = vmlaq_f32(vec_1eminus10, vec_sd, vec_se);
|
||||||
|
const float32x4_t vec_sdsx = vmlaq_f32(vec_1eminus10, vec_sd, vec_sx);
|
||||||
|
float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
|
||||||
|
float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
|
||||||
|
float32x4_t vec_cohde = vmulq_f32(vec_sde.val[0], vec_sde.val[0]);
|
||||||
|
float32x4_t vec_cohxd = vmulq_f32(vec_sxd.val[0], vec_sxd.val[0]);
|
||||||
|
vec_cohde = vmlaq_f32(vec_cohde, vec_sde.val[1], vec_sde.val[1]);
|
||||||
|
vec_cohde = vdivq_f32(vec_cohde, vec_sdse);
|
||||||
|
vec_cohxd = vmlaq_f32(vec_cohxd, vec_sxd.val[1], vec_sxd.val[1]);
|
||||||
|
vec_cohxd = vdivq_f32(vec_cohxd, vec_sdsx);
|
||||||
|
|
||||||
|
vst1q_f32(&cohde[i], vec_cohde);
|
||||||
|
vst1q_f32(&cohxd[i], vec_cohxd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// scalar code for the remaining items.
|
||||||
|
for (; i < PART_LEN1; i++) {
|
||||||
|
cohde[i] =
|
||||||
|
(aec->sde[i][0] * aec->sde[i][0] + aec->sde[i][1] * aec->sde[i][1]) /
|
||||||
|
(aec->sd[i] * aec->se[i] + 1e-10f);
|
||||||
|
cohxd[i] =
|
||||||
|
(aec->sxd[i][0] * aec->sxd[i][0] + aec->sxd[i][1] * aec->sxd[i][1]) /
|
||||||
|
(aec->sx[i] * aec->sd[i] + 1e-10f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void WebRtcAec_InitAec_neon(void) {
|
void WebRtcAec_InitAec_neon(void) {
|
||||||
WebRtcAec_FilterFar = FilterFarNEON;
|
WebRtcAec_FilterFar = FilterFarNEON;
|
||||||
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON;
|
WebRtcAec_ScaleErrorSignal = ScaleErrorSignalNEON;
|
||||||
WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
|
WebRtcAec_FilterAdaptation = FilterAdaptationNEON;
|
||||||
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
|
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressNEON;
|
||||||
|
WebRtcAec_SubbandCoherence = SubbandCoherenceNEON;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,6 +11,8 @@
|
|||||||
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
|
#ifndef WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
|
||||||
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
|
#define WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_
|
||||||
|
|
||||||
|
#include "webrtc/modules/audio_processing/aec/aec_common.h"
|
||||||
|
|
||||||
// These intrinsics were unavailable before VS 2008.
|
// These intrinsics were unavailable before VS 2008.
|
||||||
// TODO(andrew): move to a common file.
|
// TODO(andrew): move to a common file.
|
||||||
#if defined(_MSC_VER) && _MSC_VER < 1500
|
#if defined(_MSC_VER) && _MSC_VER < 1500
|
||||||
@ -19,14 +21,6 @@ static __inline __m128 _mm_castsi128_ps(__m128i a) { return *(__m128*)&a; }
|
|||||||
static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
|
static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _MSC_VER /* visual c++ */
|
|
||||||
#define ALIGN16_BEG __declspec(align(16))
|
|
||||||
#define ALIGN16_END
|
|
||||||
#else /* gcc or icc */
|
|
||||||
#define ALIGN16_BEG
|
|
||||||
#define ALIGN16_END __attribute__((aligned(16)))
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// constants shared by all paths (C, SSE2).
|
// constants shared by all paths (C, SSE2).
|
||||||
extern float rdft_w[64];
|
extern float rdft_w[64];
|
||||||
// constants used by the C path.
|
// constants used by the C path.
|
||||||
|
Reference in New Issue
Block a user