clang-format audio_processing/aec/*

TBR=bjornv
TESTED=trybots

Review URL: https://webrtc-codereview.appspot.com/2373004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@4944 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org
2013-10-08 23:41:42 +00:00
parent d241718e17
commit 13b2d46593
12 changed files with 2023 additions and 1972 deletions

File diff suppressed because it is too large Load Diff

View File

@ -18,14 +18,20 @@
#include "webrtc/typedefs.h" #include "webrtc/typedefs.h"
#define FRAME_LEN 80 #define FRAME_LEN 80
#define PART_LEN 64 // Length of partition #define PART_LEN 64 // Length of partition
#define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients #define PART_LEN1 (PART_LEN + 1) // Unique fft coefficients
#define PART_LEN2 (PART_LEN * 2) // Length of partition * 2 #define PART_LEN2 (PART_LEN * 2) // Length of partition * 2
// Delay estimator constants, used for logging. // Delay estimator constants, used for logging.
enum { kMaxDelayBlocks = 60 }; enum {
enum { kLookaheadBlocks = 15 }; kMaxDelayBlocks = 60
enum { kHistorySizeBlocks = kMaxDelayBlocks + kLookaheadBlocks }; };
enum {
kLookaheadBlocks = 15
};
enum {
kHistorySizeBlocks = kMaxDelayBlocks + kLookaheadBlocks
};
typedef float complex_t[2]; typedef float complex_t[2];
// For performance reasons, some arrays of complex numbers are replaced by twice // For performance reasons, some arrays of complex numbers are replaced by twice
@ -37,7 +43,9 @@ typedef float complex_t[2];
// compile time. // compile time.
// Metrics // Metrics
enum { kOffsetLevel = -100 }; enum {
kOffsetLevel = -100
};
typedef struct Stats { typedef struct Stats {
float instant; float instant;
@ -79,14 +87,18 @@ int WebRtcAec_GetDelayMetricsCore(AecCore* self, int* median, int* std);
int WebRtcAec_echo_state(AecCore* self); int WebRtcAec_echo_state(AecCore* self);
// Gets statistics of the echo metrics ERL, ERLE, A_NLP. // Gets statistics of the echo metrics ERL, ERLE, A_NLP.
void WebRtcAec_GetEchoStats(AecCore* self, Stats* erl, Stats* erle, void WebRtcAec_GetEchoStats(AecCore* self,
Stats* erl,
Stats* erle,
Stats* a_nlp); Stats* a_nlp);
#ifdef WEBRTC_AEC_DEBUG_DUMP #ifdef WEBRTC_AEC_DEBUG_DUMP
void* WebRtcAec_far_time_buf(AecCore* self); void* WebRtcAec_far_time_buf(AecCore* self);
#endif #endif
// Sets local configuration modes. // Sets local configuration modes.
void WebRtcAec_SetConfigCore(AecCore* self, int nlp_mode, int metrics_mode, void WebRtcAec_SetConfigCore(AecCore* self,
int nlp_mode,
int metrics_mode,
int delay_logging); int delay_logging);
// We now interpret delay correction to mean an extended filter length feature. // We now interpret delay correction to mean an extended filter length feature.

View File

@ -21,7 +21,9 @@
// Number of partitions for the extended filter mode. The first one is an enum // Number of partitions for the extended filter mode. The first one is an enum
// to be used in array declarations, as it represents the maximum filter length. // to be used in array declarations, as it represents the maximum filter length.
enum { kExtendedNumPartitions = 32 }; enum {
kExtendedNumPartitions = 32
};
static const int kNormalNumPartitions = 12; static const int kNormalNumPartitions = 12;
// Extended filter adaptation parameters. // Extended filter adaptation parameters.
@ -61,7 +63,7 @@ struct AecCore {
float dPow[PART_LEN1]; float dPow[PART_LEN1];
float dMinPow[PART_LEN1]; float dMinPow[PART_LEN1];
float dInitMinPow[PART_LEN1]; float dInitMinPow[PART_LEN1];
float *noisePow; float* noisePow;
float xfBuf[2][kExtendedNumPartitions * PART_LEN1]; // farend fft buffer float xfBuf[2][kExtendedNumPartitions * PART_LEN1]; // farend fft buffer
float wfBuf[2][kExtendedNumPartitions * PART_LEN1]; // filter fft float wfBuf[2][kExtendedNumPartitions * PART_LEN1]; // filter fft
@ -93,7 +95,7 @@ struct AecCore {
int sampFreq; int sampFreq;
uint32_t seed; uint32_t seed;
float normal_mu; // stepsize float normal_mu; // stepsize
float normal_error_threshold; // error threshold float normal_error_threshold; // error threshold
int noiseEstCtr; int noiseEstCtr;
@ -111,8 +113,8 @@ struct AecCore {
Stats rerl; Stats rerl;
// Quantities to control H band scaling for SWB input // Quantities to control H band scaling for SWB input
int freq_avg_ic; // initial bin for averaging nlp gain int freq_avg_ic; // initial bin for averaging nlp gain
int flag_Hband_cn; // for comfort noise int flag_Hband_cn; // for comfort noise
float cn_scale_Hband; // scale for comfort noise in H band float cn_scale_Hband; // scale for comfort noise in H band
int delay_histogram[kHistorySizeBlocks]; int delay_histogram[kHistorySizeBlocks];
@ -127,24 +129,26 @@ struct AecCore {
#ifdef WEBRTC_AEC_DEBUG_DUMP #ifdef WEBRTC_AEC_DEBUG_DUMP
RingBuffer* far_time_buf; RingBuffer* far_time_buf;
FILE *farFile; FILE* farFile;
FILE *nearFile; FILE* nearFile;
FILE *outFile; FILE* outFile;
FILE *outLinearFile; FILE* outLinearFile;
#endif #endif
}; };
typedef void (*WebRtcAec_FilterFar_t)(AecCore* aec, float yf[2][PART_LEN1]); typedef void (*WebRtcAec_FilterFar_t)(AecCore* aec, float yf[2][PART_LEN1]);
extern WebRtcAec_FilterFar_t WebRtcAec_FilterFar; extern WebRtcAec_FilterFar_t WebRtcAec_FilterFar;
typedef void (*WebRtcAec_ScaleErrorSignal_t) typedef void (*WebRtcAec_ScaleErrorSignal_t)(AecCore* aec,
(AecCore* aec, float ef[2][PART_LEN1]); float ef[2][PART_LEN1]);
extern WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal; extern WebRtcAec_ScaleErrorSignal_t WebRtcAec_ScaleErrorSignal;
typedef void (*WebRtcAec_FilterAdaptation_t) typedef void (*WebRtcAec_FilterAdaptation_t)(AecCore* aec,
(AecCore* aec, float *fft, float ef[2][PART_LEN1]); float* fft,
float ef[2][PART_LEN1]);
extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation; extern WebRtcAec_FilterAdaptation_t WebRtcAec_FilterAdaptation;
typedef void (*WebRtcAec_OverdriveAndSuppress_t) typedef void (*WebRtcAec_OverdriveAndSuppress_t)(AecCore* aec,
(AecCore* aec, float hNl[PART_LEN1], const float hNlFb, float hNl[PART_LEN1],
float efw[2][PART_LEN1]); const float hNlFb,
float efw[2][PART_LEN1]);
extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress; extern WebRtcAec_OverdriveAndSuppress_t WebRtcAec_OverdriveAndSuppress;
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_ #endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_CORE_INTERNAL_H_

View File

@ -21,18 +21,15 @@
#include "webrtc/modules/audio_processing/aec/aec_core_internal.h" #include "webrtc/modules/audio_processing/aec/aec_core_internal.h"
#include "webrtc/modules/audio_processing/aec/aec_rdft.h" #include "webrtc/modules/audio_processing/aec/aec_rdft.h"
__inline static float MulRe(float aRe, float aIm, float bRe, float bIm) __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
{
return aRe * bRe - aIm * bIm; return aRe * bRe - aIm * bIm;
} }
__inline static float MulIm(float aRe, float aIm, float bRe, float bIm) __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
{
return aRe * bIm + aIm * bRe; return aRe * bIm + aIm * bRe;
} }
static void FilterFarSSE2(AecCore* aec, float yf[2][PART_LEN1]) static void FilterFarSSE2(AecCore* aec, float yf[2][PART_LEN1]) {
{
int i; int i;
const int num_partitions = aec->num_partitions; const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) { for (i = 0; i < num_partitions; i++) {
@ -41,7 +38,7 @@ static void FilterFarSSE2(AecCore* aec, float yf[2][PART_LEN1])
int pos = i * PART_LEN1; int pos = i * PART_LEN1;
// Check for wrap // Check for wrap
if (i + aec->xfBufBlockPos >= num_partitions) { if (i + aec->xfBufBlockPos >= num_partitions) {
xPos -= num_partitions*(PART_LEN1); xPos -= num_partitions * (PART_LEN1);
} }
// vectorized code (four at once) // vectorized code (four at once)
@ -65,22 +62,25 @@ static void FilterFarSSE2(AecCore* aec, float yf[2][PART_LEN1])
} }
// scalar code for the remaining items. // scalar code for the remaining items.
for (; j < PART_LEN1; j++) { for (; j < PART_LEN1; j++) {
yf[0][j] += MulRe(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j], yf[0][j] += MulRe(aec->xfBuf[0][xPos + j],
aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]); aec->xfBuf[1][xPos + j],
yf[1][j] += MulIm(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j], aec->wfBuf[0][pos + j],
aec->wfBuf[0][ pos + j], aec->wfBuf[1][ pos + j]); aec->wfBuf[1][pos + j]);
yf[1][j] += MulIm(aec->xfBuf[0][xPos + j],
aec->xfBuf[1][xPos + j],
aec->wfBuf[0][pos + j],
aec->wfBuf[1][pos + j]);
} }
} }
} }
static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1]) static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1]) {
{
const __m128 k1e_10f = _mm_set1_ps(1e-10f); const __m128 k1e_10f = _mm_set1_ps(1e-10f);
const __m128 kMu = aec->extended_filter_enabled ? const __m128 kMu = aec->extended_filter_enabled ? _mm_set1_ps(kExtendedMu)
_mm_set1_ps(kExtendedMu) : _mm_set1_ps(aec->normal_mu); : _mm_set1_ps(aec->normal_mu);
const __m128 kThresh = aec->extended_filter_enabled ? const __m128 kThresh = aec->extended_filter_enabled
_mm_set1_ps(kExtendedErrorThreshold) : ? _mm_set1_ps(kExtendedErrorThreshold)
_mm_set1_ps(aec->normal_error_threshold); : _mm_set1_ps(aec->normal_error_threshold);
int i; int i;
// vectorized code (four at once) // vectorized code (four at once)
@ -115,12 +115,13 @@ static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1])
} }
// scalar code for the remaining items. // scalar code for the remaining items.
{ {
const float mu = aec->extended_filter_enabled ? const float mu =
kExtendedMu : aec->normal_mu; aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
const float error_threshold = aec->extended_filter_enabled ? const float error_threshold = aec->extended_filter_enabled
kExtendedErrorThreshold : aec->normal_error_threshold; ? kExtendedErrorThreshold
: aec->normal_error_threshold;
for (; i < (PART_LEN1); i++) { for (; i < (PART_LEN1); i++) {
float abs_ef; float abs_ef;
ef[0][i] /= (aec->xPow[i] + 1e-10f); ef[0][i] /= (aec->xPow[i] + 1e-10f);
ef[1][i] /= (aec->xPow[i] + 1e-10f); ef[1][i] /= (aec->xPow[i] + 1e-10f);
abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
@ -138,11 +139,13 @@ static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1])
} }
} }
static void FilterAdaptationSSE2(AecCore* aec, float *fft, float ef[2][PART_LEN1]) { static void FilterAdaptationSSE2(AecCore* aec,
float* fft,
float ef[2][PART_LEN1]) {
int i, j; int i, j;
const int num_partitions = aec->num_partitions; const int num_partitions = aec->num_partitions;
for (i = 0; i < num_partitions; i++) { for (i = 0; i < num_partitions; i++) {
int xPos = (i + aec->xfBufBlockPos)*(PART_LEN1); int xPos = (i + aec->xfBufBlockPos) * (PART_LEN1);
int pos = i * PART_LEN1; int pos = i * PART_LEN1;
// Check for wrap // Check for wrap
if (i + aec->xfBufBlockPos >= num_partitions) { if (i + aec->xfBufBlockPos >= num_partitions) {
@ -150,7 +153,7 @@ static void FilterAdaptationSSE2(AecCore* aec, float *fft, float ef[2][PART_LEN1
} }
// Process the whole array... // Process the whole array...
for (j = 0; j < PART_LEN; j+= 4) { for (j = 0; j < PART_LEN; j += 4) {
// Load xfBuf and ef. // Load xfBuf and ef.
const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]); const __m128 xfBuf_re = _mm_loadu_ps(&aec->xfBuf[0][xPos + j]);
const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]); const __m128 xfBuf_im = _mm_loadu_ps(&aec->xfBuf[1][xPos + j]);
@ -169,22 +172,23 @@ static void FilterAdaptationSSE2(AecCore* aec, float *fft, float ef[2][PART_LEN1
const __m128 g = _mm_unpacklo_ps(e, f); const __m128 g = _mm_unpacklo_ps(e, f);
const __m128 h = _mm_unpackhi_ps(e, f); const __m128 h = _mm_unpackhi_ps(e, f);
// Store // Store
_mm_storeu_ps(&fft[2*j + 0], g); _mm_storeu_ps(&fft[2 * j + 0], g);
_mm_storeu_ps(&fft[2*j + 4], h); _mm_storeu_ps(&fft[2 * j + 4], h);
} }
// ... and fixup the first imaginary entry. // ... and fixup the first imaginary entry.
fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN], fft[1] = MulRe(aec->xfBuf[0][xPos + PART_LEN],
-aec->xfBuf[1][xPos + PART_LEN], -aec->xfBuf[1][xPos + PART_LEN],
ef[0][PART_LEN], ef[1][PART_LEN]); ef[0][PART_LEN],
ef[1][PART_LEN]);
aec_rdft_inverse_128(fft); aec_rdft_inverse_128(fft);
memset(fft + PART_LEN, 0, sizeof(float)*PART_LEN); memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
// fft scaling // fft scaling
{ {
float scale = 2.0f / PART_LEN2; float scale = 2.0f / PART_LEN2;
const __m128 scale_ps = _mm_load_ps1(&scale); const __m128 scale_ps = _mm_load_ps1(&scale);
for (j = 0; j < PART_LEN; j+=4) { for (j = 0; j < PART_LEN; j += 4) {
const __m128 fft_ps = _mm_loadu_ps(&fft[j]); const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps); const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
_mm_storeu_ps(&fft[j], fft_scale); _mm_storeu_ps(&fft[j], fft_scale);
@ -195,13 +199,15 @@ static void FilterAdaptationSSE2(AecCore* aec, float *fft, float ef[2][PART_LEN1
{ {
float wt1 = aec->wfBuf[1][pos]; float wt1 = aec->wfBuf[1][pos];
aec->wfBuf[0][pos + PART_LEN] += fft[1]; aec->wfBuf[0][pos + PART_LEN] += fft[1];
for (j = 0; j < PART_LEN; j+= 4) { for (j = 0; j < PART_LEN; j += 4) {
__m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]); __m128 wtBuf_re = _mm_loadu_ps(&aec->wfBuf[0][pos + j]);
__m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]); __m128 wtBuf_im = _mm_loadu_ps(&aec->wfBuf[1][pos + j]);
const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]); const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]); const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
const __m128 fft_re = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2 ,0)); const __m128 fft_re =
const __m128 fft_im = _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3 ,1)); _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
const __m128 fft_im =
_mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
wtBuf_re = _mm_add_ps(wtBuf_re, fft_re); wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
wtBuf_im = _mm_add_ps(wtBuf_im, fft_im); wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
_mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re); _mm_storeu_ps(&aec->wfBuf[0][pos + j], wtBuf_re);
@ -212,8 +218,7 @@ static void FilterAdaptationSSE2(AecCore* aec, float *fft, float ef[2][PART_LEN1
} }
} }
static __m128 mm_pow_ps(__m128 a, __m128 b) static __m128 mm_pow_ps(__m128 a, __m128 b) {
{
// a^b = exp2(b * log2(a)) // a^b = exp2(b * log2(a))
// exp2(x) and log2(x) are calculated using polynomial approximations. // exp2(x) and log2(x) are calculated using polynomial approximations.
__m128 log2_a, b_log2_a, a_exp_b; __m128 log2_a, b_log2_a, a_exp_b;
@ -238,55 +243,55 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
// compensate the fact that the exponent has been shifted in the top/ // compensate the fact that the exponent has been shifted in the top/
// fractional part and finally getting rid of the implicit leading one // fractional part and finally getting rid of the implicit leading one
// from the mantissa by subtracting it out. // from the mantissa by subtracting it out.
static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {
{0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; 0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {
{0x43800000, 0x43800000, 0x43800000, 0x43800000}; 0x43800000, 0x43800000, 0x43800000, 0x43800000};
static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {
{0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000}; 0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
static const int shift_exponent_into_top_mantissa = 8; static const int shift_exponent_into_top_mantissa = 8;
const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask)); const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask));
const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(_mm_castps_si128(two_n), const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
shift_exponent_into_top_mantissa)); _mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
const __m128 n_0 = _mm_or_ps(n_1, *((__m128 *)eight_biased_exponent)); const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent));
const __m128 n = _mm_sub_ps(n_0, *((__m128 *)implicit_leading_one)); const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one));
// Compute y. // Compute y.
static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {
{0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; 0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {
{0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask)); const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask));
const __m128 y = _mm_or_ps( const __m128 y =
mantissa, *((__m128 *)zero_biased_exponent_is_one)); _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one));
// Approximate log2(y) ~= (y - 1) * pol5(y). // Approximate log2(y) ~= (y - 1) * pol5(y).
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 // pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
static const ALIGN16_BEG float ALIGN16_END C5[4] = static const ALIGN16_BEG float ALIGN16_END C5[4] = {
{-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
static const ALIGN16_BEG float ALIGN16_END C4[4] = static const ALIGN16_BEG float ALIGN16_END
{3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
static const ALIGN16_BEG float ALIGN16_END C3[4] = static const ALIGN16_BEG float ALIGN16_END
{-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
static const ALIGN16_BEG float ALIGN16_END C2[4] = static const ALIGN16_BEG float ALIGN16_END
{2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f}; C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
static const ALIGN16_BEG float ALIGN16_END C1[4] = static const ALIGN16_BEG float ALIGN16_END
{-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
static const ALIGN16_BEG float ALIGN16_END C0[4] = static const ALIGN16_BEG float ALIGN16_END
{3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f}; C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128 *)C5)); const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5));
const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4)); const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4));
const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3)); const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3));
const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2)); const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2));
const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1)); const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1));
const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128 *)C0)); const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0));
const __m128 y_minus_one = _mm_sub_ps( const __m128 y_minus_one =
y, *((__m128 *)zero_biased_exponent_is_one)); _mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one));
const __m128 log2_y = _mm_mul_ps(y_minus_one , pol5_y); const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);
// Combine parts. // Combine parts.
log2_a = _mm_add_ps(n, log2_y); log2_a = _mm_add_ps(n, log2_y);
@ -310,38 +315,38 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
// maximum relative error of 0.17%. // maximum relative error of 0.17%.
// To avoid over/underflow, we reduce the range of input to ]-127, 129]. // To avoid over/underflow, we reduce the range of input to ]-127, 129].
static const ALIGN16_BEG float max_input[4] ALIGN16_END = static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f,
{129.f, 129.f, 129.f, 129.f}; 129.f, 129.f};
static const ALIGN16_BEG float min_input[4] ALIGN16_END = static const ALIGN16_BEG float min_input[4] ALIGN16_END = {
{-126.99999f, -126.99999f, -126.99999f, -126.99999f}; -126.99999f, -126.99999f, -126.99999f, -126.99999f};
const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input)); const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input));
const __m128 x_max = _mm_max_ps(x_min, *((__m128 *)min_input)); const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input));
// Compute n. // Compute n.
static const ALIGN16_BEG float half[4] ALIGN16_END = static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f,
{0.5f, 0.5f, 0.5f, 0.5f}; 0.5f, 0.5f};
const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half)); const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half));
const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
// Compute 2^n. // Compute 2^n.
static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {
{127, 127, 127, 127}; 127, 127, 127, 127};
static const int float_exponent_shift = 23; static const int float_exponent_shift = 23;
const __m128i two_n_exponent = _mm_add_epi32( const __m128i two_n_exponent =
x_minus_half_floor, *((__m128i *)float_exponent_bias)); _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias));
const __m128 two_n = _mm_castsi128_ps(_mm_slli_epi32( const __m128 two_n =
two_n_exponent, float_exponent_shift)); _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift));
// Compute y. // Compute y.
const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor)); const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
// Approximate 2^y ~= C2 * y^2 + C1 * y + C0. // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
static const ALIGN16_BEG float C2[4] ALIGN16_END = static const ALIGN16_BEG float C2[4] ALIGN16_END = {
{3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f}; 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
static const ALIGN16_BEG float C1[4] ALIGN16_END = static const ALIGN16_BEG float C1[4] ALIGN16_END = {
{6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f}; 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
static const ALIGN16_BEG float C0[4] ALIGN16_END = static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f,
{1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f}; 1.0017247f, 1.0017247f};
const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128 *)C2)); const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2));
const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1)); const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1));
const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128 *)C0)); const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0));
// Combine parts. // Combine parts.
a_exp_b = _mm_mul_ps(exp2_y, two_n); a_exp_b = _mm_mul_ps(exp2_y, two_n);
@ -352,7 +357,8 @@ static __m128 mm_pow_ps(__m128 a, __m128 b)
extern const float WebRtcAec_weightCurve[65]; extern const float WebRtcAec_weightCurve[65];
extern const float WebRtcAec_overDriveCurve[65]; extern const float WebRtcAec_overDriveCurve[65];
static void OverdriveAndSuppressSSE2(AecCore* aec, float hNl[PART_LEN1], static void OverdriveAndSuppressSSE2(AecCore* aec,
float hNl[PART_LEN1],
const float hNlFb, const float hNlFb,
float efw[2][PART_LEN1]) { float efw[2][PART_LEN1]) {
int i; int i;
@ -361,26 +367,25 @@ static void OverdriveAndSuppressSSE2(AecCore* aec, float hNl[PART_LEN1],
const __m128 vec_minus_one = _mm_set1_ps(-1.0f); const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm); const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
// vectorized code (four at once) // vectorized code (four at once)
for (i = 0; i + 3 < PART_LEN1; i+=4) { for (i = 0; i + 3 < PART_LEN1; i += 4) {
// Weight subbands // Weight subbands
__m128 vec_hNl = _mm_loadu_ps(&hNl[i]); __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]); const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb); const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
const __m128 vec_weightCurve_hNlFb = _mm_mul_ps( const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb);
vec_weightCurve, vec_hNlFb);
const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve); const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
const __m128 vec_one_weightCurve_hNl = _mm_mul_ps( const __m128 vec_one_weightCurve_hNl =
vec_one_weightCurve, vec_hNl); _mm_mul_ps(vec_one_weightCurve, vec_hNl);
const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl); const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
const __m128 vec_if1 = _mm_and_ps( const __m128 vec_if1 = _mm_and_ps(
bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl)); bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
vec_hNl = _mm_or_ps(vec_if0, vec_if1); vec_hNl = _mm_or_ps(vec_if0, vec_if1);
{ {
const __m128 vec_overDriveCurve = _mm_loadu_ps( const __m128 vec_overDriveCurve =
&WebRtcAec_overDriveCurve[i]); _mm_loadu_ps(&WebRtcAec_overDriveCurve[i]);
const __m128 vec_overDriveSm_overDriveCurve = _mm_mul_ps( const __m128 vec_overDriveSm_overDriveCurve =
vec_overDriveSm, vec_overDriveCurve); _mm_mul_ps(vec_overDriveSm, vec_overDriveCurve);
vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve); vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
_mm_storeu_ps(&hNl[i], vec_hNl); _mm_storeu_ps(&hNl[i], vec_hNl);
} }
@ -404,7 +409,7 @@ static void OverdriveAndSuppressSSE2(AecCore* aec, float hNl[PART_LEN1],
// Weight subbands // Weight subbands
if (hNl[i] > hNlFb) { if (hNl[i] > hNlFb) {
hNl[i] = WebRtcAec_weightCurve[i] * hNlFb + hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
(1 - WebRtcAec_weightCurve[i]) * hNl[i]; (1 - WebRtcAec_weightCurve[i]) * hNl[i];
} }
hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]); hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);
@ -424,4 +429,3 @@ void WebRtcAec_InitAec_SSE2(void) {
WebRtcAec_FilterAdaptation = FilterAdaptationSSE2; WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2; WebRtcAec_OverdriveAndSuppress = OverdriveAndSuppressSSE2;
} }

View File

@ -42,7 +42,7 @@ ALIGN16_BEG float ALIGN16_END cftmdl_wk1r[4];
static int ip[16]; static int ip[16];
static void bitrv2_32(int *ip, float *a) { static void bitrv2_32(int* ip, float* a) {
const int n = 32; const int n = 32;
int j, j1, k, k1, m, m2; int j, j1, k, k1, m, m2;
float xr, xi, yr, yi; float xr, xi, yr, yi;
@ -116,7 +116,7 @@ static void bitrv2_32(int *ip, float *a) {
} }
} }
static void bitrv2_128(float *a) { static void bitrv2_128(float* a) {
/* /*
Following things have been attempted but are no faster: Following things have been attempted but are no faster:
(a) Storing the swap indexes in a LUT (index calculations are done (a) Storing the swap indexes in a LUT (index calculations are done
@ -146,7 +146,7 @@ static void bitrv2_128(float *a) {
a[j1 + 1] = yi; a[j1 + 1] = yi;
a[k1 + 0] = xr; a[k1 + 0] = xr;
a[k1 + 1] = xi; a[k1 + 1] = xi;
j1 += 8; j1 += 8;
k1 += 16; k1 += 16;
xr = a[j1 + 0]; xr = a[j1 + 0];
xi = a[j1 + 1]; xi = a[j1 + 1];
@ -166,7 +166,7 @@ static void bitrv2_128(float *a) {
a[j1 + 1] = yi; a[j1 + 1] = yi;
a[k1 + 0] = xr; a[k1 + 0] = xr;
a[k1 + 1] = xi; a[k1 + 1] = xi;
j1 += 8; j1 += 8;
k1 += 16; k1 += 16;
xr = a[j1 + 0]; xr = a[j1 + 0];
xi = a[j1 + 1]; xi = a[j1 + 1];
@ -265,7 +265,7 @@ static void makewt_32(void) {
} }
static void makect_32(void) { static void makect_32(void) {
float *c = rdft_w + 32; float* c = rdft_w + 32;
const int nc = 32; const int nc = 32;
int j, nch; int j, nch;
float delta; float delta;
@ -281,7 +281,7 @@ static void makect_32(void) {
} }
} }
static void cft1st_128_C(float *a) { static void cft1st_128_C(float* a) {
const int n = 128; const int n = 128;
int j, k1, k2; int j, k1, k2;
float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i; float wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
@ -385,7 +385,7 @@ static void cft1st_128_C(float *a) {
} }
} }
static void cftmdl_128_C(float *a) { static void cftmdl_128_C(float* a) {
const int l = 8; const int l = 8;
const int n = 128; const int n = 128;
const int m = 32; const int m = 32;
@ -394,7 +394,7 @@ static void cftmdl_128_C(float *a) {
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
for (j0 = 0; j0 < l; j0 += 2) { for (j0 = 0; j0 < l; j0 += 2) {
j1 = j0 + 8; j1 = j0 + 8;
j2 = j0 + 16; j2 = j0 + 16;
j3 = j0 + 24; j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0]; x0r = a[j0 + 0] + a[j1 + 0];
@ -416,7 +416,7 @@ static void cftmdl_128_C(float *a) {
} }
wk1r = rdft_w[2]; wk1r = rdft_w[2];
for (j0 = m; j0 < l + m; j0 += 2) { for (j0 = m; j0 < l + m; j0 += 2) {
j1 = j0 + 8; j1 = j0 + 8;
j2 = j0 + 16; j2 = j0 + 16;
j3 = j0 + 24; j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0]; x0r = a[j0 + 0] + a[j1 + 0];
@ -452,7 +452,7 @@ static void cftmdl_128_C(float *a) {
wk3r = rdft_wk3ri_first[k1 + 0]; wk3r = rdft_wk3ri_first[k1 + 0];
wk3i = rdft_wk3ri_first[k1 + 1]; wk3i = rdft_wk3ri_first[k1 + 1];
for (j0 = k; j0 < l + k; j0 += 2) { for (j0 = k; j0 < l + k; j0 += 2) {
j1 = j0 + 8; j1 = j0 + 8;
j2 = j0 + 16; j2 = j0 + 16;
j3 = j0 + 24; j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0]; x0r = a[j0 + 0] + a[j1 + 0];
@ -483,7 +483,7 @@ static void cftmdl_128_C(float *a) {
wk3r = rdft_wk3ri_second[k1 + 0]; wk3r = rdft_wk3ri_second[k1 + 0];
wk3i = rdft_wk3ri_second[k1 + 1]; wk3i = rdft_wk3ri_second[k1 + 1];
for (j0 = k + m; j0 < l + (k + m); j0 += 2) { for (j0 = k + m; j0 < l + (k + m); j0 += 2) {
j1 = j0 + 8; j1 = j0 + 8;
j2 = j0 + 16; j2 = j0 + 16;
j3 = j0 + 24; j3 = j0 + 24;
x0r = a[j0 + 0] + a[j1 + 0]; x0r = a[j0 + 0] + a[j1 + 0];
@ -512,7 +512,7 @@ static void cftmdl_128_C(float *a) {
} }
} }
static void cftfsub_128(float *a) { static void cftfsub_128(float* a) {
int j, j1, j2, j3, l; int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
@ -542,7 +542,7 @@ static void cftfsub_128(float *a) {
} }
} }
static void cftbsub_128(float *a) { static void cftbsub_128(float* a) {
int j, j1, j2, j3, l; int j, j1, j2, j3, l;
float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
@ -573,14 +573,14 @@ static void cftbsub_128(float *a) {
} }
} }
static void rftfsub_128_C(float *a) { static void rftfsub_128_C(float* a) {
const float *c = rdft_w + 32; const float* c = rdft_w + 32;
int j1, j2, k1, k2; int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi; float wkr, wki, xr, xi, yr, yi;
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) { for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2; k2 = 128 - j2;
k1 = 32 - j1; k1 = 32 - j1;
wkr = 0.5f - c[k1]; wkr = 0.5f - c[k1];
wki = c[j1]; wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0]; xr = a[j2 + 0] - a[k2 + 0];
@ -594,15 +594,15 @@ static void rftfsub_128_C(float *a) {
} }
} }
static void rftbsub_128_C(float *a) { static void rftbsub_128_C(float* a) {
const float *c = rdft_w + 32; const float* c = rdft_w + 32;
int j1, j2, k1, k2; int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi; float wkr, wki, xr, xi, yr, yi;
a[1] = -a[1]; a[1] = -a[1];
for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) { for (j1 = 1, j2 = 2; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2; k2 = 128 - j2;
k1 = 32 - j1; k1 = 32 - j1;
wkr = 0.5f - c[k1]; wkr = 0.5f - c[k1];
wki = c[j1]; wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0]; xr = a[j2 + 0] - a[k2 + 0];
@ -617,7 +617,7 @@ static void rftbsub_128_C(float *a) {
a[65] = -a[65]; a[65] = -a[65];
} }
void aec_rdft_forward_128(float *a) { void aec_rdft_forward_128(float* a) {
float xi; float xi;
bitrv2_128(a); bitrv2_128(a);
cftfsub_128(a); cftfsub_128(a);
@ -627,7 +627,7 @@ void aec_rdft_forward_128(float *a) {
a[1] = xi; a[1] = xi;
} }
void aec_rdft_inverse_128(float *a) { void aec_rdft_inverse_128(float* a) {
a[1] = 0.5f * (a[0] - a[1]); a[1] = 0.5f * (a[0] - a[1]);
a[0] -= a[1]; a[0] -= a[1];
rftbsub_128(a); rftbsub_128(a);

View File

@ -20,11 +20,11 @@ static __inline __m128i _mm_castps_si128(__m128 a) { return *(__m128i*)&a; }
#endif #endif
#ifdef _MSC_VER /* visual c++ */ #ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16)) #define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END #define ALIGN16_END
#else /* gcc or icc */ #else /* gcc or icc */
# define ALIGN16_BEG #define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16))) #define ALIGN16_END __attribute__((aligned(16)))
#endif #endif
// constants shared by all paths (C, SSE2). // constants shared by all paths (C, SSE2).
@ -42,7 +42,7 @@ extern float rdft_wk3i[32];
extern float cftmdl_wk1r[4]; extern float cftmdl_wk1r[4];
// code path selection function pointers // code path selection function pointers
typedef void (*rft_sub_128_t)(float *a); typedef void (*rft_sub_128_t)(float* a);
extern rft_sub_128_t rftfsub_128; extern rft_sub_128_t rftfsub_128;
extern rft_sub_128_t rftbsub_128; extern rft_sub_128_t rftbsub_128;
extern rft_sub_128_t cft1st_128; extern rft_sub_128_t cft1st_128;
@ -51,7 +51,7 @@ extern rft_sub_128_t cftmdl_128;
// entry points // entry points
void aec_rdft_init(void); void aec_rdft_init(void);
void aec_rdft_init_sse2(void); void aec_rdft_init_sse2(void);
void aec_rdft_forward_128(float *a); void aec_rdft_forward_128(float* a);
void aec_rdft_inverse_128(float *a); void aec_rdft_inverse_128(float* a);
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_ #endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_MAIN_SOURCE_AEC_RDFT_H_

View File

@ -12,165 +12,164 @@
#include <emmintrin.h> #include <emmintrin.h>
static const ALIGN16_BEG float ALIGN16_END k_swap_sign[4] = static const ALIGN16_BEG float ALIGN16_END
{-1.f, 1.f, -1.f, 1.f}; k_swap_sign[4] = {-1.f, 1.f, -1.f, 1.f};
static void cft1st_128_SSE2(float *a) { static void cft1st_128_SSE2(float* a) {
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
int j, k2; int j, k2;
for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
__m128 a00v = _mm_loadu_ps(&a[j + 0]); __m128 a00v = _mm_loadu_ps(&a[j + 0]);
__m128 a04v = _mm_loadu_ps(&a[j + 4]); __m128 a04v = _mm_loadu_ps(&a[j + 4]);
__m128 a08v = _mm_loadu_ps(&a[j + 8]); __m128 a08v = _mm_loadu_ps(&a[j + 8]);
__m128 a12v = _mm_loadu_ps(&a[j + 12]); __m128 a12v = _mm_loadu_ps(&a[j + 12]);
__m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1 ,0)); __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
__m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3 ,2)); __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
__m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1 ,0)); __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
__m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3 ,2)); __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]); const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]); const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]); const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]); const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]); const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]); const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
__m128 x0v = _mm_add_ps(a01v, a23v); __m128 x0v = _mm_add_ps(a01v, a23v);
const __m128 x1v = _mm_sub_ps(a01v, a23v); const __m128 x1v = _mm_sub_ps(a01v, a23v);
const __m128 x2v = _mm_add_ps(a45v, a67v); const __m128 x2v = _mm_add_ps(a45v, a67v);
const __m128 x3v = _mm_sub_ps(a45v, a67v); const __m128 x3v = _mm_sub_ps(a45v, a67v);
__m128 x0w; __m128 x0w;
a01v = _mm_add_ps(x0v, x2v); a01v = _mm_add_ps(x0v, x2v);
x0v = _mm_sub_ps(x0v, x2v); x0v = _mm_sub_ps(x0v, x2v);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1)); x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
{ {
const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v); const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w); const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
a45v = _mm_add_ps(a45_0v, a45_1v); a45v = _mm_add_ps(a45_0v, a45_1v);
} }
{ {
__m128 a23_0v, a23_1v; __m128 a23_0v, a23_1v;
const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0 ,1)); const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w); const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
x0v = _mm_add_ps(x1v, x3s); x0v = _mm_add_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1)); x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
a23_0v = _mm_mul_ps(wk1rv, x0v); a23_0v = _mm_mul_ps(wk1rv, x0v);
a23_1v = _mm_mul_ps(wk1iv, x0w); a23_1v = _mm_mul_ps(wk1iv, x0w);
a23v = _mm_add_ps(a23_0v, a23_1v); a23v = _mm_add_ps(a23_0v, a23_1v);
x0v = _mm_sub_ps(x1v, x3s); x0v = _mm_sub_ps(x1v, x3s);
x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0 ,1)); x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
} }
{ {
const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v); const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w); const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
a67v = _mm_add_ps(a67_0v, a67_1v); a67v = _mm_add_ps(a67_0v, a67_1v);
} }
a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1 ,0)); a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1 ,0)); a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3 ,2)); a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3 ,2)); a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
_mm_storeu_ps(&a[j + 0], a00v); _mm_storeu_ps(&a[j + 0], a00v);
_mm_storeu_ps(&a[j + 4], a04v); _mm_storeu_ps(&a[j + 4], a04v);
_mm_storeu_ps(&a[j + 8], a08v); _mm_storeu_ps(&a[j + 8], a08v);
_mm_storeu_ps(&a[j + 12], a12v); _mm_storeu_ps(&a[j + 12], a12v);
} }
} }
static void cftmdl_128_SSE2(float *a) { static void cftmdl_128_SSE2(float* a) {
const int l = 8; const int l = 8;
const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign); const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
int j0; int j0;
__m128 wk1rv = _mm_load_ps(cftmdl_wk1r); __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
for (j0 = 0; j0 < l; j0 += 2) { for (j0 = 0; j0 < l; j0 += 2) {
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
_mm_castsi128_ps(a_32), _mm_castsi128_ps(a_32),
_MM_SHUFFLE(1, 0, 1 ,0)); _MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
_mm_castsi128_ps(a_40), _mm_castsi128_ps(a_40),
_MM_SHUFFLE(1, 0, 1 ,0)); _MM_SHUFFLE(1, 0, 1, 0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]); const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]); const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
_mm_castsi128_ps(a_48), _mm_castsi128_ps(a_48),
_MM_SHUFFLE(1, 0, 1 ,0)); _MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
_mm_castsi128_ps(a_56), _mm_castsi128_ps(a_56),
_MM_SHUFFLE(1, 0, 1 ,0)); _MM_SHUFFLE(1, 0, 1, 0));
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps( const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
_mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1), _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
_MM_SHUFFLE(2, 3, 0, 1))); const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 yy0 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, const __m128 yy0 =
_MM_SHUFFLE(2, 2, 2 ,2)); _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
const __m128 yy1 = _mm_shuffle_ps(x1_x3_add, x1_x3_sub, const __m128 yy1 =
_MM_SHUFFLE(3, 3, 3 ,3)); _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1); const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
const __m128 yy3 = _mm_add_ps(yy0, yy2); const __m128 yy3 = _mm_add_ps(yy0, yy2);
const __m128 yy4 = _mm_mul_ps(wk1rv, yy3); const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0)); _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
_mm_storel_epi64((__m128i*)&a[j0 + 32], _mm_storel_epi64(
_mm_shuffle_epi32(_mm_castps_si128(xx0), (__m128i*)&a[j0 + 32],
_MM_SHUFFLE(3, 2, 3, 2))); _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1)); _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
_mm_storel_epi64((__m128i*)&a[j0 + 48], _mm_storel_epi64(
_mm_shuffle_epi32(_mm_castps_si128(xx1), (__m128i*)&a[j0 + 48],
_MM_SHUFFLE(2, 3, 2, 3))); _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
a[j0 + 48] = -a[j0 + 48]; a[j0 + 48] = -a[j0 + 48];
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add)); _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub)); _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
_mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4)); _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
_mm_storel_epi64((__m128i*)&a[j0 + 56], _mm_storel_epi64(
_mm_shuffle_epi32(_mm_castps_si128(yy4), (__m128i*)&a[j0 + 56],
_MM_SHUFFLE(2, 3, 2, 3))); _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
} }
{ {
int k = 64; int k = 64;
int k1 = 2; int k1 = 2;
int k2 = 2 * k1; int k2 = 2 * k1;
const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2+0]); const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2+0]); const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2+0]); const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2+0]); const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2+0]); const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
wk1rv = _mm_load_ps(&rdft_wk1r[k2+0]); wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
for (j0 = k; j0 < l + k; j0 += 2) { for (j0 = k; j0 < l + k; j0 += 2) {
const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]); const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]); const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]); const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]); const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00), const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
_mm_castsi128_ps(a_32), _mm_castsi128_ps(a_32),
_MM_SHUFFLE(1, 0, 1 ,0)); _MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08), const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
_mm_castsi128_ps(a_40), _mm_castsi128_ps(a_40),
_MM_SHUFFLE(1, 0, 1 ,0)); _MM_SHUFFLE(1, 0, 1, 0));
__m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40); __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40); const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]); const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
@ -179,100 +178,102 @@ static void cftmdl_128_SSE2(float *a) {
const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]); const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16), const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
_mm_castsi128_ps(a_48), _mm_castsi128_ps(a_48),
_MM_SHUFFLE(1, 0, 1 ,0)); _MM_SHUFFLE(1, 0, 1, 0));
const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24), const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
_mm_castsi128_ps(a_56), _mm_castsi128_ps(a_56),
_MM_SHUFFLE(1, 0, 1 ,0)); _MM_SHUFFLE(1, 0, 1, 0));
const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56); const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56); const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
const __m128 xx2 = _mm_mul_ps(xx1 , wk2rv); const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
const __m128 xx3 = _mm_mul_ps(wk2iv, const __m128 xx3 =
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1), _mm_mul_ps(wk2iv,
_MM_SHUFFLE(2, 3, 0, 1)))); _mm_castsi128_ps(_mm_shuffle_epi32(
_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx4 = _mm_add_ps(xx2, xx3); const __m128 xx4 = _mm_add_ps(xx2, xx3);
const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps( const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
_mm_shuffle_epi32(_mm_castps_si128(x3r0_3i0_3r1_x3i1), _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
_MM_SHUFFLE(2, 3, 0, 1))); const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1); const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped); const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv); const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
const __m128 xx11 = _mm_mul_ps(wk1iv, const __m128 xx11 = _mm_mul_ps(
wk1iv,
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add), _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
_MM_SHUFFLE(2, 3, 0, 1)))); _MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx12 = _mm_add_ps(xx10, xx11); const __m128 xx12 = _mm_add_ps(xx10, xx11);
const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv); const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
const __m128 xx21 = _mm_mul_ps(wk3iv, const __m128 xx21 = _mm_mul_ps(
wk3iv,
_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub), _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
_MM_SHUFFLE(2, 3, 0, 1)))); _MM_SHUFFLE(2, 3, 0, 1))));
const __m128 xx22 = _mm_add_ps(xx20, xx21); const __m128 xx22 = _mm_add_ps(xx20, xx21);
_mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx)); _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
_mm_storel_epi64((__m128i*)&a[j0 + 32], _mm_storel_epi64(
_mm_shuffle_epi32(_mm_castps_si128(xx), (__m128i*)&a[j0 + 32],
_MM_SHUFFLE(3, 2, 3, 2))); _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4)); _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
_mm_storel_epi64((__m128i*)&a[j0 + 48], _mm_storel_epi64(
_mm_shuffle_epi32(_mm_castps_si128(xx4), (__m128i*)&a[j0 + 48],
_MM_SHUFFLE(3, 2, 3, 2))); _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12)); _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
_mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_storel_epi64(
_mm_shuffle_epi32(_mm_castps_si128(xx12), (__m128i*)&a[j0 + 40],
_MM_SHUFFLE(3, 2, 3, 2))); _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22)); _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
_mm_storel_epi64((__m128i*)&a[j0 + 56], _mm_storel_epi64(
_mm_shuffle_epi32(_mm_castps_si128(xx22), (__m128i*)&a[j0 + 56],
_MM_SHUFFLE(3, 2, 3, 2))); _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
} }
} }
} }
static void rftfsub_128_SSE2(float *a) { static void rftfsub_128_SSE2(float* a) {
const float *c = rdft_w + 32; const float* c = rdft_w + 32;
int j1, j2, k1, k2; int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi; float wkr, wki, xr, xi, yr, yi;
static const ALIGN16_BEG float ALIGN16_END k_half[4] = static const ALIGN16_BEG float ALIGN16_END
{0.5f, 0.5f, 0.5f, 0.5f}; k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
const __m128 mm_half = _mm_load_ps(k_half); const __m128 mm_half = _mm_load_ps(k_half);
// Vectorized code (four at once). // Vectorized code (four at once).
// Note: commented number are indexes for the first iteration of the loop. // Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'. // Load 'wk'.
const __m128 c_j1 = _mm_loadu_ps(&c[ j1]); // 1, 2, 3, 4, const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 wkr_ = const __m128 wkr_ =
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28, _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4, const __m128 wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'. // Load and shuffle 'a'.
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5, const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9, const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123, const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127, const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4, const __m128 a_j2_p0 = _mm_shuffle_ps(
_MM_SHUFFLE(2, 0, 2 ,0)); // 2, 4, 6, 8, a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4, const __m128 a_j2_p1 = _mm_shuffle_ps(
_MM_SHUFFLE(3, 1, 3 ,1)); // 3, 5, 7, 9, a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0, const __m128 a_k2_p0 = _mm_shuffle_ps(
_MM_SHUFFLE(0, 2, 0 ,2)); // 126, 124, 122, 120, a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0, const __m128 a_k2_p1 = _mm_shuffle_ps(
_MM_SHUFFLE(1, 3, 1 ,3)); // 127, 125, 123, 121, a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
// Calculate 'x'. // Calculate 'x'.
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0); const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
// 2-126, 4-124, 6-122, 8-120, // 2-126, 4-124, 6-122, 8-120,
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1); const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
// 3-127, 5-125, 7-123, 9-121, // 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'. // Calculate product into 'y'.
// yr = wkr * xr - wki * xi; // yr = wkr * xr - wki * xi;
// yi = wkr * xi + wki * xr; // yi = wkr * xi + wki * xr;
@ -280,12 +281,12 @@ static void rftfsub_128_SSE2(float *a) {
const __m128 b_ = _mm_mul_ps(wki_, xi_); const __m128 b_ = _mm_mul_ps(wki_, xi_);
const __m128 c_ = _mm_mul_ps(wkr_, xi_); const __m128 c_ = _mm_mul_ps(wkr_, xi_);
const __m128 d_ = _mm_mul_ps(wki_, xr_); const __m128 d_ = _mm_mul_ps(wki_, xr_);
const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120, const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121, const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'. // Update 'a'.
// a[j2 + 0] -= yr; // a[j2 + 0] -= yr;
// a[j2 + 1] -= yi; // a[j2 + 1] -= yi;
// a[k2 + 0] += yr; // a[k2 + 0] += yr;
// a[k2 + 1] -= yi; // a[k2 + 1] -= yi;
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8, const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9, const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9,
@ -293,26 +294,26 @@ static void rftfsub_128_SSE2(float *a) {
const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121, const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121,
// Shuffle in right order and store. // Shuffle in right order and store.
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
// 2, 3, 4, 5, // 2, 3, 4, 5,
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
// 6, 7, 8, 9, // 6, 7, 8, 9,
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n); const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
// 122, 123, 120, 121, // 122, 123, 120, 121,
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n); const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
// 126, 127, 124, 125, // 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt, const __m128 a_k2_0n = _mm_shuffle_ps(
_MM_SHUFFLE(1, 0, 3 ,2)); // 120, 121, 122, 123, a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt, const __m128 a_k2_4n = _mm_shuffle_ps(
_MM_SHUFFLE(1, 0, 3 ,2)); // 124, 125, 126, 127, a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n); _mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n); _mm_storeu_ps(&a[4 + j2], a_j2_4n);
_mm_storeu_ps(&a[122 - j2], a_k2_0n); _mm_storeu_ps(&a[122 - j2], a_k2_0n);
_mm_storeu_ps(&a[126 - j2], a_k2_4n); _mm_storeu_ps(&a[126 - j2], a_k2_4n);
} }
// Scalar code for the remaining items. // Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) { for (; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2; k2 = 128 - j2;
k1 = 32 - j1; k1 = 32 - j1;
wkr = 0.5f - c[k1]; wkr = 0.5f - c[k1];
wki = c[j1]; wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0]; xr = a[j2 + 0] - a[k2 + 0];
@ -326,13 +327,13 @@ static void rftfsub_128_SSE2(float *a) {
} }
} }
static void rftbsub_128_SSE2(float *a) { static void rftbsub_128_SSE2(float* a) {
const float *c = rdft_w + 32; const float* c = rdft_w + 32;
int j1, j2, k1, k2; int j1, j2, k1, k2;
float wkr, wki, xr, xi, yr, yi; float wkr, wki, xr, xi, yr, yi;
static const ALIGN16_BEG float ALIGN16_END k_half[4] = static const ALIGN16_BEG float ALIGN16_END
{0.5f, 0.5f, 0.5f, 0.5f}; k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f};
const __m128 mm_half = _mm_load_ps(k_half); const __m128 mm_half = _mm_load_ps(k_half);
a[1] = -a[1]; a[1] = -a[1];
@ -340,30 +341,30 @@ static void rftbsub_128_SSE2(float *a) {
// Note: commented number are indexes for the first iteration of the loop. // Note: commented number are indexes for the first iteration of the loop.
for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
// Load 'wk'. // Load 'wk'.
const __m128 c_j1 = _mm_loadu_ps(&c[ j1]); // 1, 2, 3, 4, const __m128 c_j1 = _mm_loadu_ps(&c[j1]); // 1, 2, 3, 4,
const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31,
const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31,
const __m128 wkr_ = const __m128 wkr_ =
_mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28, _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
const __m128 wki_ = c_j1; // 1, 2, 3, 4, const __m128 wki_ = c_j1; // 1, 2, 3, 4,
// Load and shuffle 'a'. // Load and shuffle 'a'.
const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5, const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5,
const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9, const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9,
const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123, const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123,
const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127, const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127,
const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4, const __m128 a_j2_p0 = _mm_shuffle_ps(
_MM_SHUFFLE(2, 0, 2 ,0)); // 2, 4, 6, 8, a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0)); // 2, 4, 6, 8,
const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4, const __m128 a_j2_p1 = _mm_shuffle_ps(
_MM_SHUFFLE(3, 1, 3 ,1)); // 3, 5, 7, 9, a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1)); // 3, 5, 7, 9,
const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0, const __m128 a_k2_p0 = _mm_shuffle_ps(
_MM_SHUFFLE(0, 2, 0 ,2)); // 126, 124, 122, 120, a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2)); // 126, 124, 122, 120,
const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0, const __m128 a_k2_p1 = _mm_shuffle_ps(
_MM_SHUFFLE(1, 3, 1 ,3)); // 127, 125, 123, 121, a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3)); // 127, 125, 123, 121,
// Calculate 'x'. // Calculate 'x'.
const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0); const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
// 2-126, 4-124, 6-122, 8-120, // 2-126, 4-124, 6-122, 8-120,
const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1); const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
// 3-127, 5-125, 7-123, 9-121, // 3-127, 5-125, 7-123, 9-121,
// Calculate product into 'y'. // Calculate product into 'y'.
// yr = wkr * xr + wki * xi; // yr = wkr * xr + wki * xi;
// yi = wkr * xi - wki * xr; // yi = wkr * xi - wki * xr;
@ -371,12 +372,12 @@ static void rftbsub_128_SSE2(float *a) {
const __m128 b_ = _mm_mul_ps(wki_, xi_); const __m128 b_ = _mm_mul_ps(wki_, xi_);
const __m128 c_ = _mm_mul_ps(wkr_, xi_); const __m128 c_ = _mm_mul_ps(wkr_, xi_);
const __m128 d_ = _mm_mul_ps(wki_, xr_); const __m128 d_ = _mm_mul_ps(wki_, xr_);
const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120, const __m128 yr_ = _mm_add_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120,
const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121, const __m128 yi_ = _mm_sub_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121,
// Update 'a'. // Update 'a'.
// a[j2 + 0] = a[j2 + 0] - yr; // a[j2 + 0] = a[j2 + 0] - yr;
// a[j2 + 1] = yi - a[j2 + 1]; // a[j2 + 1] = yi - a[j2 + 1];
// a[k2 + 0] = yr + a[k2 + 0]; // a[k2 + 0] = yr + a[k2 + 0];
// a[k2 + 1] = yi - a[k2 + 1]; // a[k2 + 1] = yi - a[k2 + 1];
const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8, const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8,
const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9, const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1); // 3, 5, 7, 9,
@ -384,26 +385,26 @@ static void rftbsub_128_SSE2(float *a) {
const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121, const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1); // 127, 125, 123, 121,
// Shuffle in right order and store. // Shuffle in right order and store.
const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
// 2, 3, 4, 5, // 2, 3, 4, 5,
const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
// 6, 7, 8, 9, // 6, 7, 8, 9,
const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n); const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
// 122, 123, 120, 121, // 122, 123, 120, 121,
const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n); const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
// 126, 127, 124, 125, // 126, 127, 124, 125,
const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt, const __m128 a_k2_0n = _mm_shuffle_ps(
_MM_SHUFFLE(1, 0, 3 ,2)); // 120, 121, 122, 123, a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2)); // 120, 121, 122, 123,
const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt, const __m128 a_k2_4n = _mm_shuffle_ps(
_MM_SHUFFLE(1, 0, 3 ,2)); // 124, 125, 126, 127, a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2)); // 124, 125, 126, 127,
_mm_storeu_ps(&a[0 + j2], a_j2_0n); _mm_storeu_ps(&a[0 + j2], a_j2_0n);
_mm_storeu_ps(&a[4 + j2], a_j2_4n); _mm_storeu_ps(&a[4 + j2], a_j2_4n);
_mm_storeu_ps(&a[122 - j2], a_k2_0n); _mm_storeu_ps(&a[122 - j2], a_k2_0n);
_mm_storeu_ps(&a[126 - j2], a_k2_4n); _mm_storeu_ps(&a[126 - j2], a_k2_4n);
} }
// Scalar code for the remaining items. // Scalar code for the remaining items.
for (; j2 < 64; j1 += 1, j2 += 2) { for (; j2 < 64; j1 += 1, j2 += 2) {
k2 = 128 - j2; k2 = 128 - j2;
k1 = 32 - j1; k1 = 32 - j1;
wkr = 0.5f - c[k1]; wkr = 0.5f - c[k1];
wki = c[j1]; wki = c[j1];
xr = a[j2 + 0] - a[k2 + 0]; xr = a[j2 + 0] - a[k2 + 0];

View File

@ -8,7 +8,8 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
/* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for clock /* Resamples a signal to an arbitrary rate. Used by the AEC to compensate for
* clock
* skew by resampling the farend signal. * skew by resampling the farend signal.
*/ */
@ -21,214 +22,205 @@
#include "webrtc/modules/audio_processing/aec/aec_core.h" #include "webrtc/modules/audio_processing/aec/aec_core.h"
enum { kEstimateLengthFrames = 400 }; enum {
kEstimateLengthFrames = 400
};
typedef struct { typedef struct {
short buffer[kResamplerBufferSize]; short buffer[kResamplerBufferSize];
float position; float position;
int deviceSampleRateHz; int deviceSampleRateHz;
int skewData[kEstimateLengthFrames]; int skewData[kEstimateLengthFrames];
int skewDataIndex; int skewDataIndex;
float skewEstimate; float skewEstimate;
} resampler_t; } resampler_t;
static int EstimateSkew(const int* rawSkew, static int EstimateSkew(const int* rawSkew,
int size, int size,
int absLimit, int absLimit,
float *skewEst); float* skewEst);
int WebRtcAec_CreateResampler(void **resampInst) int WebRtcAec_CreateResampler(void** resampInst) {
{ resampler_t* obj = malloc(sizeof(resampler_t));
resampler_t *obj = malloc(sizeof(resampler_t)); *resampInst = obj;
*resampInst = obj; if (obj == NULL) {
if (obj == NULL) { return -1;
return -1; }
}
return 0; return 0;
} }
int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz) int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz) {
{ resampler_t* obj = (resampler_t*)resampInst;
resampler_t *obj = (resampler_t*) resampInst; memset(obj->buffer, 0, sizeof(obj->buffer));
memset(obj->buffer, 0, sizeof(obj->buffer)); obj->position = 0.0;
obj->position = 0.0;
obj->deviceSampleRateHz = deviceSampleRateHz; obj->deviceSampleRateHz = deviceSampleRateHz;
memset(obj->skewData, 0, sizeof(obj->skewData)); memset(obj->skewData, 0, sizeof(obj->skewData));
obj->skewDataIndex = 0; obj->skewDataIndex = 0;
obj->skewEstimate = 0.0; obj->skewEstimate = 0.0;
return 0; return 0;
} }
int WebRtcAec_FreeResampler(void *resampInst) int WebRtcAec_FreeResampler(void* resampInst) {
{ resampler_t* obj = (resampler_t*)resampInst;
resampler_t *obj = (resampler_t*) resampInst; free(obj);
free(obj);
return 0; return 0;
} }
void WebRtcAec_ResampleLinear(void *resampInst, void WebRtcAec_ResampleLinear(void* resampInst,
const short *inspeech, const short* inspeech,
int size, int size,
float skew, float skew,
short *outspeech, short* outspeech,
int *size_out) int* size_out) {
{ resampler_t* obj = (resampler_t*)resampInst;
resampler_t *obj = (resampler_t*) resampInst;
short *y; short* y;
float be, tnew, interp; float be, tnew, interp;
int tn, mm; int tn, mm;
assert(!(size < 0 || size > 2 * FRAME_LEN)); assert(!(size < 0 || size > 2 * FRAME_LEN));
assert(resampInst != NULL); assert(resampInst != NULL);
assert(inspeech != NULL); assert(inspeech != NULL);
assert(outspeech != NULL); assert(outspeech != NULL);
assert(size_out != NULL); assert(size_out != NULL);
// Add new frame data in lookahead // Add new frame data in lookahead
memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay], memcpy(&obj->buffer[FRAME_LEN + kResamplingDelay],
inspeech, inspeech,
size * sizeof(short)); size * sizeof(short));
// Sample rate ratio // Sample rate ratio
be = 1 + skew; be = 1 + skew;
// Loop over input frame // Loop over input frame
mm = 0; mm = 0;
y = &obj->buffer[FRAME_LEN]; // Point at current frame y = &obj->buffer[FRAME_LEN]; // Point at current frame
tnew = be * mm + obj->position;
tn = (int)tnew;
while (tn < size) {
// Interpolation
interp = y[tn] + (tnew - tn) * (y[tn + 1] - y[tn]);
if (interp > 32767) {
interp = 32767;
} else if (interp < -32768) {
interp = -32768;
}
outspeech[mm] = (short)interp;
mm++;
tnew = be * mm + obj->position; tnew = be * mm + obj->position;
tn = (int) tnew; tn = (int)tnew;
}
while (tn < size) { *size_out = mm;
obj->position += (*size_out) * be - size;
// Interpolation // Shift buffer
interp = y[tn] + (tnew - tn) * (y[tn+1] - y[tn]); memmove(obj->buffer,
&obj->buffer[size],
if (interp > 32767) { (kResamplerBufferSize - size) * sizeof(short));
interp = 32767;
}
else if (interp < -32768) {
interp = -32768;
}
outspeech[mm] = (short) interp;
mm++;
tnew = be * mm + obj->position;
tn = (int) tnew;
}
*size_out = mm;
obj->position += (*size_out) * be - size;
// Shift buffer
memmove(obj->buffer,
&obj->buffer[size],
(kResamplerBufferSize - size) * sizeof(short));
} }
int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst) int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst) {
{ resampler_t* obj = (resampler_t*)resampInst;
resampler_t *obj = (resampler_t*)resampInst; int err = 0;
int err = 0;
if (obj->skewDataIndex < kEstimateLengthFrames) { if (obj->skewDataIndex < kEstimateLengthFrames) {
obj->skewData[obj->skewDataIndex] = rawSkew; obj->skewData[obj->skewDataIndex] = rawSkew;
obj->skewDataIndex++; obj->skewDataIndex++;
} } else if (obj->skewDataIndex == kEstimateLengthFrames) {
else if (obj->skewDataIndex == kEstimateLengthFrames) { err = EstimateSkew(
err = EstimateSkew(obj->skewData, obj->skewData, kEstimateLengthFrames, obj->deviceSampleRateHz, skewEst);
kEstimateLengthFrames, obj->skewEstimate = *skewEst;
obj->deviceSampleRateHz, obj->skewDataIndex++;
skewEst); } else {
obj->skewEstimate = *skewEst; *skewEst = obj->skewEstimate;
obj->skewDataIndex++; }
}
else {
*skewEst = obj->skewEstimate;
}
return err; return err;
} }
int EstimateSkew(const int* rawSkew, int EstimateSkew(const int* rawSkew,
int size, int size,
int deviceSampleRateHz, int deviceSampleRateHz,
float *skewEst) float* skewEst) {
{ const int absLimitOuter = (int)(0.04f * deviceSampleRateHz);
const int absLimitOuter = (int)(0.04f * deviceSampleRateHz); const int absLimitInner = (int)(0.0025f * deviceSampleRateHz);
const int absLimitInner = (int)(0.0025f * deviceSampleRateHz); int i = 0;
int i = 0; int n = 0;
int n = 0; float rawAvg = 0;
float rawAvg = 0; float err = 0;
float err = 0; float rawAbsDev = 0;
float rawAbsDev = 0; int upperLimit = 0;
int upperLimit = 0; int lowerLimit = 0;
int lowerLimit = 0; float cumSum = 0;
float cumSum = 0; float x = 0;
float x = 0; float x2 = 0;
float x2 = 0; float y = 0;
float y = 0; float xy = 0;
float xy = 0; float xAvg = 0;
float xAvg = 0; float denom = 0;
float denom = 0; float skew = 0;
float skew = 0;
*skewEst = 0; // Set in case of error below. *skewEst = 0; // Set in case of error below.
for (i = 0; i < size; i++) { for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) { if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
n++; n++;
rawAvg += rawSkew[i]; rawAvg += rawSkew[i];
}
} }
}
if (n == 0) { if (n == 0) {
return -1; return -1;
}
assert(n > 0);
rawAvg /= n;
for (i = 0; i < size; i++) {
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) {
err = rawSkew[i] - rawAvg;
rawAbsDev += err >= 0 ? err : -err;
} }
assert(n > 0); }
rawAvg /= n; assert(n > 0);
rawAbsDev /= n;
upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1); // +1 for ceiling.
lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1); // -1 for floor.
for (i = 0; i < size; i++) { n = 0;
if ((rawSkew[i] < absLimitOuter && rawSkew[i] > -absLimitOuter)) { for (i = 0; i < size; i++) {
err = rawSkew[i] - rawAvg; if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) ||
rawAbsDev += err >= 0 ? err : -err; (rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) {
} n++;
cumSum += rawSkew[i];
x += n;
x2 += n * n;
y += cumSum;
xy += n * cumSum;
} }
assert(n > 0); }
rawAbsDev /= n;
upperLimit = (int)(rawAvg + 5 * rawAbsDev + 1); // +1 for ceiling.
lowerLimit = (int)(rawAvg - 5 * rawAbsDev - 1); // -1 for floor.
n = 0; if (n == 0) {
for (i = 0; i < size; i++) { return -1;
if ((rawSkew[i] < absLimitInner && rawSkew[i] > -absLimitInner) || }
(rawSkew[i] < upperLimit && rawSkew[i] > lowerLimit)) { assert(n > 0);
n++; xAvg = x / n;
cumSum += rawSkew[i]; denom = x2 - xAvg * x;
x += n;
x2 += n*n;
y += cumSum;
xy += n * cumSum;
}
}
if (n == 0) { if (denom != 0) {
return -1; skew = (xy - xAvg * y) / denom;
} }
assert(n > 0);
xAvg = x / n;
denom = x2 - xAvg*x;
if (denom != 0) { *skewEst = skew;
skew = (xy - xAvg*y) / denom; return 0;
}
*skewEst = skew;
return 0;
} }

View File

@ -13,23 +13,27 @@
#include "webrtc/modules/audio_processing/aec/aec_core.h" #include "webrtc/modules/audio_processing/aec/aec_core.h"
enum { kResamplingDelay = 1 }; enum {
enum { kResamplerBufferSize = FRAME_LEN * 4 }; kResamplingDelay = 1
};
enum {
kResamplerBufferSize = FRAME_LEN * 4
};
// Unless otherwise specified, functions return 0 on success and -1 on error // Unless otherwise specified, functions return 0 on success and -1 on error
int WebRtcAec_CreateResampler(void **resampInst); int WebRtcAec_CreateResampler(void** resampInst);
int WebRtcAec_InitResampler(void *resampInst, int deviceSampleRateHz); int WebRtcAec_InitResampler(void* resampInst, int deviceSampleRateHz);
int WebRtcAec_FreeResampler(void *resampInst); int WebRtcAec_FreeResampler(void* resampInst);
// Estimates skew from raw measurement. // Estimates skew from raw measurement.
int WebRtcAec_GetSkew(void *resampInst, int rawSkew, float *skewEst); int WebRtcAec_GetSkew(void* resampInst, int rawSkew, float* skewEst);
// Resamples input using linear interpolation. // Resamples input using linear interpolation.
void WebRtcAec_ResampleLinear(void *resampInst, void WebRtcAec_ResampleLinear(void* resampInst,
const short *inspeech, const short* inspeech,
int size, int size,
float skew, float skew,
short *outspeech, short* outspeech,
int *size_out); int* size_out);
#endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_ #endif // WEBRTC_MODULES_AUDIO_PROCESSING_AEC_AEC_RESAMPLER_H_

View File

@ -90,7 +90,7 @@ static const int kMaxTrustedDelayMs = 500;
#define MAX_RESAMP_LEN (5 * FRAME_LEN) #define MAX_RESAMP_LEN (5 * FRAME_LEN)
static const int kMaxBufSizeStart = 62; // In partitions static const int kMaxBufSizeStart = 62; // In partitions
static const int sampMsNb = 8; // samples per ms in nb static const int sampMsNb = 8; // samples per ms in nb
static const int initCheck = 42; static const int initCheck = 42;
#ifdef WEBRTC_AEC_DEBUG_DUMP #ifdef WEBRTC_AEC_DEBUG_DUMP
@ -99,334 +99,351 @@ int webrtc_aec_instance_count = 0;
// Estimates delay to set the position of the far-end buffer read pointer // Estimates delay to set the position of the far-end buffer read pointer
// (controlled by knownDelay) // (controlled by knownDelay)
static void EstBufDelayNormal(aecpc_t *aecInst); static void EstBufDelayNormal(aecpc_t* aecInst);
static void EstBufDelayExtended(aecpc_t *aecInst); static void EstBufDelayExtended(aecpc_t* aecInst);
static int ProcessNormal(aecpc_t* self, const int16_t* near, static int ProcessNormal(aecpc_t* self,
const int16_t* near_high, int16_t* out, int16_t* out_high, const int16_t* near,
int16_t num_samples, int16_t reported_delay_ms, int32_t skew); const int16_t* near_high,
static void ProcessExtended(aecpc_t* self, const int16_t* near, int16_t* out,
const int16_t* near_high, int16_t* out, int16_t* out_high, int16_t* out_high,
int16_t num_samples, int16_t reported_delay_ms, int32_t skew); int16_t num_samples,
int16_t reported_delay_ms,
int32_t skew);
static void ProcessExtended(aecpc_t* self,
const int16_t* near,
const int16_t* near_high,
int16_t* out,
int16_t* out_high,
int16_t num_samples,
int16_t reported_delay_ms,
int32_t skew);
int32_t WebRtcAec_Create(void **aecInst) int32_t WebRtcAec_Create(void** aecInst) {
{ aecpc_t* aecpc;
aecpc_t *aecpc; if (aecInst == NULL) {
if (aecInst == NULL) { return -1;
return -1; }
}
aecpc = malloc(sizeof(aecpc_t)); aecpc = malloc(sizeof(aecpc_t));
*aecInst = aecpc; *aecInst = aecpc;
if (aecpc == NULL) { if (aecpc == NULL) {
return -1; return -1;
} }
if (WebRtcAec_CreateAec(&aecpc->aec) == -1) { if (WebRtcAec_CreateAec(&aecpc->aec) == -1) {
WebRtcAec_Free(aecpc); WebRtcAec_Free(aecpc);
aecpc = NULL; aecpc = NULL;
return -1; return -1;
} }
if (WebRtcAec_CreateResampler(&aecpc->resampler) == -1) { if (WebRtcAec_CreateResampler(&aecpc->resampler) == -1) {
WebRtcAec_Free(aecpc); WebRtcAec_Free(aecpc);
aecpc = NULL; aecpc = NULL;
return -1; return -1;
} }
// Create far-end pre-buffer. The buffer size has to be large enough for // Create far-end pre-buffer. The buffer size has to be large enough for
// largest possible drift compensation (kResamplerBufferSize) + "almost" an // largest possible drift compensation (kResamplerBufferSize) + "almost" an
// FFT buffer (PART_LEN2 - 1). // FFT buffer (PART_LEN2 - 1).
aecpc->far_pre_buf = WebRtc_CreateBuffer(PART_LEN2 + kResamplerBufferSize, aecpc->far_pre_buf =
sizeof(float)); WebRtc_CreateBuffer(PART_LEN2 + kResamplerBufferSize, sizeof(float));
if (!aecpc->far_pre_buf) { if (!aecpc->far_pre_buf) {
WebRtcAec_Free(aecpc); WebRtcAec_Free(aecpc);
aecpc = NULL; aecpc = NULL;
return -1; return -1;
} }
aecpc->initFlag = 0; aecpc->initFlag = 0;
aecpc->lastError = 0; aecpc->lastError = 0;
#ifdef WEBRTC_AEC_DEBUG_DUMP #ifdef WEBRTC_AEC_DEBUG_DUMP
aecpc->far_pre_buf_s16 = WebRtc_CreateBuffer( aecpc->far_pre_buf_s16 =
PART_LEN2 + kResamplerBufferSize, sizeof(int16_t)); WebRtc_CreateBuffer(PART_LEN2 + kResamplerBufferSize, sizeof(int16_t));
if (!aecpc->far_pre_buf_s16) { if (!aecpc->far_pre_buf_s16) {
WebRtcAec_Free(aecpc); WebRtcAec_Free(aecpc);
aecpc = NULL; aecpc = NULL;
return -1; return -1;
} }
{ {
char filename[64]; char filename[64];
sprintf(filename, "aec_buf%d.dat", webrtc_aec_instance_count); sprintf(filename, "aec_buf%d.dat", webrtc_aec_instance_count);
aecpc->bufFile = fopen(filename, "wb"); aecpc->bufFile = fopen(filename, "wb");
sprintf(filename, "aec_skew%d.dat", webrtc_aec_instance_count); sprintf(filename, "aec_skew%d.dat", webrtc_aec_instance_count);
aecpc->skewFile = fopen(filename, "wb"); aecpc->skewFile = fopen(filename, "wb");
sprintf(filename, "aec_delay%d.dat", webrtc_aec_instance_count); sprintf(filename, "aec_delay%d.dat", webrtc_aec_instance_count);
aecpc->delayFile = fopen(filename, "wb"); aecpc->delayFile = fopen(filename, "wb");
webrtc_aec_instance_count++; webrtc_aec_instance_count++;
} }
#endif #endif
return 0; return 0;
} }
int32_t WebRtcAec_Free(void *aecInst) int32_t WebRtcAec_Free(void* aecInst) {
{ aecpc_t* aecpc = aecInst;
aecpc_t *aecpc = aecInst;
if (aecpc == NULL) { if (aecpc == NULL) {
return -1; return -1;
} }
WebRtc_FreeBuffer(aecpc->far_pre_buf); WebRtc_FreeBuffer(aecpc->far_pre_buf);
#ifdef WEBRTC_AEC_DEBUG_DUMP #ifdef WEBRTC_AEC_DEBUG_DUMP
WebRtc_FreeBuffer(aecpc->far_pre_buf_s16); WebRtc_FreeBuffer(aecpc->far_pre_buf_s16);
fclose(aecpc->bufFile); fclose(aecpc->bufFile);
fclose(aecpc->skewFile); fclose(aecpc->skewFile);
fclose(aecpc->delayFile); fclose(aecpc->delayFile);
#endif #endif
WebRtcAec_FreeAec(aecpc->aec); WebRtcAec_FreeAec(aecpc->aec);
WebRtcAec_FreeResampler(aecpc->resampler); WebRtcAec_FreeResampler(aecpc->resampler);
free(aecpc); free(aecpc);
return 0; return 0;
} }
int32_t WebRtcAec_Init(void *aecInst, int32_t sampFreq, int32_t scSampFreq) int32_t WebRtcAec_Init(void* aecInst, int32_t sampFreq, int32_t scSampFreq) {
{ aecpc_t* aecpc = aecInst;
aecpc_t *aecpc = aecInst; AecConfig aecConfig;
AecConfig aecConfig;
if (sampFreq != 8000 && sampFreq != 16000 && sampFreq != 32000) { if (sampFreq != 8000 && sampFreq != 16000 && sampFreq != 32000) {
aecpc->lastError = AEC_BAD_PARAMETER_ERROR; aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
return -1; return -1;
} }
aecpc->sampFreq = sampFreq; aecpc->sampFreq = sampFreq;
if (scSampFreq < 1 || scSampFreq > 96000) { if (scSampFreq < 1 || scSampFreq > 96000) {
aecpc->lastError = AEC_BAD_PARAMETER_ERROR; aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
return -1; return -1;
} }
aecpc->scSampFreq = scSampFreq; aecpc->scSampFreq = scSampFreq;
// Initialize echo canceller core // Initialize echo canceller core
if (WebRtcAec_InitAec(aecpc->aec, aecpc->sampFreq) == -1) { if (WebRtcAec_InitAec(aecpc->aec, aecpc->sampFreq) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR; aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1; return -1;
} }
if (WebRtcAec_InitResampler(aecpc->resampler, aecpc->scSampFreq) == -1) { if (WebRtcAec_InitResampler(aecpc->resampler, aecpc->scSampFreq) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR; aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1; return -1;
} }
if (WebRtc_InitBuffer(aecpc->far_pre_buf) == -1) { if (WebRtc_InitBuffer(aecpc->far_pre_buf) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR; aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1; return -1;
} }
WebRtc_MoveReadPtr(aecpc->far_pre_buf, -PART_LEN); // Start overlap. WebRtc_MoveReadPtr(aecpc->far_pre_buf, -PART_LEN); // Start overlap.
aecpc->initFlag = initCheck; // indicates that initialization has been done aecpc->initFlag = initCheck; // indicates that initialization has been done
if (aecpc->sampFreq == 32000) { if (aecpc->sampFreq == 32000) {
aecpc->splitSampFreq = 16000; aecpc->splitSampFreq = 16000;
} } else {
else { aecpc->splitSampFreq = sampFreq;
aecpc->splitSampFreq = sampFreq; }
}
aecpc->delayCtr = 0; aecpc->delayCtr = 0;
aecpc->sampFactor = (aecpc->scSampFreq * 1.0f) / aecpc->splitSampFreq; aecpc->sampFactor = (aecpc->scSampFreq * 1.0f) / aecpc->splitSampFreq;
// Sampling frequency multiplier (SWB is processed as 160 frame size). // Sampling frequency multiplier (SWB is processed as 160 frame size).
aecpc->rate_factor = aecpc->splitSampFreq / 8000; aecpc->rate_factor = aecpc->splitSampFreq / 8000;
aecpc->sum = 0; aecpc->sum = 0;
aecpc->counter = 0; aecpc->counter = 0;
aecpc->checkBuffSize = 1; aecpc->checkBuffSize = 1;
aecpc->firstVal = 0; aecpc->firstVal = 0;
aecpc->startup_phase = 1; aecpc->startup_phase = 1;
aecpc->bufSizeStart = 0; aecpc->bufSizeStart = 0;
aecpc->checkBufSizeCtr = 0; aecpc->checkBufSizeCtr = 0;
aecpc->msInSndCardBuf = 0; aecpc->msInSndCardBuf = 0;
aecpc->filtDelay = -1; // -1 indicates an initialized state. aecpc->filtDelay = -1; // -1 indicates an initialized state.
aecpc->timeForDelayChange = 0; aecpc->timeForDelayChange = 0;
aecpc->knownDelay = 0; aecpc->knownDelay = 0;
aecpc->lastDelayDiff = 0; aecpc->lastDelayDiff = 0;
aecpc->skewFrCtr = 0; aecpc->skewFrCtr = 0;
aecpc->resample = kAecFalse; aecpc->resample = kAecFalse;
aecpc->highSkewCtr = 0; aecpc->highSkewCtr = 0;
aecpc->skew = 0; aecpc->skew = 0;
aecpc->farend_started = 0; aecpc->farend_started = 0;
// Default settings. // Default settings.
aecConfig.nlpMode = kAecNlpModerate; aecConfig.nlpMode = kAecNlpModerate;
aecConfig.skewMode = kAecFalse; aecConfig.skewMode = kAecFalse;
aecConfig.metricsMode = kAecFalse; aecConfig.metricsMode = kAecFalse;
aecConfig.delay_logging = kAecFalse; aecConfig.delay_logging = kAecFalse;
if (WebRtcAec_set_config(aecpc, aecConfig) == -1) { if (WebRtcAec_set_config(aecpc, aecConfig) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR; aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1; return -1;
} }
#ifdef WEBRTC_AEC_DEBUG_DUMP #ifdef WEBRTC_AEC_DEBUG_DUMP
if (WebRtc_InitBuffer(aecpc->far_pre_buf_s16) == -1) { if (WebRtc_InitBuffer(aecpc->far_pre_buf_s16) == -1) {
aecpc->lastError = AEC_UNSPECIFIED_ERROR; aecpc->lastError = AEC_UNSPECIFIED_ERROR;
return -1; return -1;
} }
WebRtc_MoveReadPtr(aecpc->far_pre_buf_s16, -PART_LEN); // Start overlap. WebRtc_MoveReadPtr(aecpc->far_pre_buf_s16, -PART_LEN); // Start overlap.
#endif #endif
return 0; return 0;
} }
// only buffer L band for farend // only buffer L band for farend
int32_t WebRtcAec_BufferFarend(void *aecInst, const int16_t *farend, int32_t WebRtcAec_BufferFarend(void* aecInst,
int16_t nrOfSamples) const int16_t* farend,
{ int16_t nrOfSamples) {
aecpc_t *aecpc = aecInst; aecpc_t* aecpc = aecInst;
int32_t retVal = 0; int32_t retVal = 0;
int newNrOfSamples = (int) nrOfSamples; int newNrOfSamples = (int)nrOfSamples;
short newFarend[MAX_RESAMP_LEN]; short newFarend[MAX_RESAMP_LEN];
const int16_t* farend_ptr = farend; const int16_t* farend_ptr = farend;
float tmp_farend[MAX_RESAMP_LEN]; float tmp_farend[MAX_RESAMP_LEN];
const float* farend_float = tmp_farend; const float* farend_float = tmp_farend;
float skew; float skew;
int i = 0; int i = 0;
if (farend == NULL) { if (farend == NULL) {
aecpc->lastError = AEC_NULL_POINTER_ERROR; aecpc->lastError = AEC_NULL_POINTER_ERROR;
return -1; return -1;
} }
if (aecpc->initFlag != initCheck) { if (aecpc->initFlag != initCheck) {
aecpc->lastError = AEC_UNINITIALIZED_ERROR; aecpc->lastError = AEC_UNINITIALIZED_ERROR;
return -1; return -1;
} }
// number of samples == 160 for SWB input // number of samples == 160 for SWB input
if (nrOfSamples != 80 && nrOfSamples != 160) { if (nrOfSamples != 80 && nrOfSamples != 160) {
aecpc->lastError = AEC_BAD_PARAMETER_ERROR; aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
return -1; return -1;
} }
skew = aecpc->skew; skew = aecpc->skew;
if (aecpc->skewMode == kAecTrue && aecpc->resample == kAecTrue) { if (aecpc->skewMode == kAecTrue && aecpc->resample == kAecTrue) {
// Resample and get a new number of samples // Resample and get a new number of samples
WebRtcAec_ResampleLinear(aecpc->resampler, farend, nrOfSamples, skew, WebRtcAec_ResampleLinear(aecpc->resampler,
newFarend, &newNrOfSamples); farend,
farend_ptr = (const int16_t*) newFarend; nrOfSamples,
} skew,
newFarend,
&newNrOfSamples);
farend_ptr = (const int16_t*)newFarend;
}
aecpc->farend_started = 1; aecpc->farend_started = 1;
WebRtcAec_SetSystemDelay(aecpc->aec, WebRtcAec_system_delay(aecpc->aec) + WebRtcAec_SetSystemDelay(aecpc->aec,
newNrOfSamples); WebRtcAec_system_delay(aecpc->aec) + newNrOfSamples);
#ifdef WEBRTC_AEC_DEBUG_DUMP #ifdef WEBRTC_AEC_DEBUG_DUMP
WebRtc_WriteBuffer(aecpc->far_pre_buf_s16, farend_ptr, WebRtc_WriteBuffer(
(size_t) newNrOfSamples); aecpc->far_pre_buf_s16, farend_ptr, (size_t)newNrOfSamples);
#endif #endif
// Cast to float and write the time-domain data to |far_pre_buf|. // Cast to float and write the time-domain data to |far_pre_buf|.
for (i = 0; i < newNrOfSamples; i++) { for (i = 0; i < newNrOfSamples; i++) {
tmp_farend[i] = (float) farend_ptr[i]; tmp_farend[i] = (float)farend_ptr[i];
} }
WebRtc_WriteBuffer(aecpc->far_pre_buf, farend_float, WebRtc_WriteBuffer(aecpc->far_pre_buf, farend_float, (size_t)newNrOfSamples);
(size_t) newNrOfSamples);
// Transform to frequency domain if we have enough data. // Transform to frequency domain if we have enough data.
while (WebRtc_available_read(aecpc->far_pre_buf) >= PART_LEN2) { while (WebRtc_available_read(aecpc->far_pre_buf) >= PART_LEN2) {
// We have enough data to pass to the FFT, hence read PART_LEN2 samples. // We have enough data to pass to the FFT, hence read PART_LEN2 samples.
WebRtc_ReadBuffer(aecpc->far_pre_buf, (void**) &farend_float, tmp_farend, WebRtc_ReadBuffer(
PART_LEN2); aecpc->far_pre_buf, (void**)&farend_float, tmp_farend, PART_LEN2);
WebRtcAec_BufferFarendPartition(aecpc->aec, farend_float); WebRtcAec_BufferFarendPartition(aecpc->aec, farend_float);
// Rewind |far_pre_buf| PART_LEN samples for overlap before continuing. // Rewind |far_pre_buf| PART_LEN samples for overlap before continuing.
WebRtc_MoveReadPtr(aecpc->far_pre_buf, -PART_LEN); WebRtc_MoveReadPtr(aecpc->far_pre_buf, -PART_LEN);
#ifdef WEBRTC_AEC_DEBUG_DUMP #ifdef WEBRTC_AEC_DEBUG_DUMP
WebRtc_ReadBuffer(aecpc->far_pre_buf_s16, (void**) &farend_ptr, newFarend, WebRtc_ReadBuffer(
PART_LEN2); aecpc->far_pre_buf_s16, (void**)&farend_ptr, newFarend, PART_LEN2);
WebRtc_WriteBuffer(WebRtcAec_far_time_buf(aecpc->aec), WebRtc_WriteBuffer(
&farend_ptr[PART_LEN], 1); WebRtcAec_far_time_buf(aecpc->aec), &farend_ptr[PART_LEN], 1);
WebRtc_MoveReadPtr(aecpc->far_pre_buf_s16, -PART_LEN); WebRtc_MoveReadPtr(aecpc->far_pre_buf_s16, -PART_LEN);
#endif #endif
} }
return retVal; return retVal;
} }
int32_t WebRtcAec_Process(void *aecInst, const int16_t *nearend, int32_t WebRtcAec_Process(void* aecInst,
const int16_t *nearendH, int16_t *out, int16_t *outH, const int16_t* nearend,
int16_t nrOfSamples, int16_t msInSndCardBuf, const int16_t* nearendH,
int32_t skew) int16_t* out,
{ int16_t* outH,
aecpc_t *aecpc = aecInst; int16_t nrOfSamples,
int32_t retVal = 0; int16_t msInSndCardBuf,
if (nearend == NULL) { int32_t skew) {
aecpc->lastError = AEC_NULL_POINTER_ERROR; aecpc_t* aecpc = aecInst;
return -1; int32_t retVal = 0;
} if (nearend == NULL) {
aecpc->lastError = AEC_NULL_POINTER_ERROR;
return -1;
}
if (out == NULL) { if (out == NULL) {
aecpc->lastError = AEC_NULL_POINTER_ERROR; aecpc->lastError = AEC_NULL_POINTER_ERROR;
return -1; return -1;
} }
if (aecpc->initFlag != initCheck) { if (aecpc->initFlag != initCheck) {
aecpc->lastError = AEC_UNINITIALIZED_ERROR; aecpc->lastError = AEC_UNINITIALIZED_ERROR;
return -1; return -1;
} }
// number of samples == 160 for SWB input // number of samples == 160 for SWB input
if (nrOfSamples != 80 && nrOfSamples != 160) { if (nrOfSamples != 80 && nrOfSamples != 160) {
aecpc->lastError = AEC_BAD_PARAMETER_ERROR; aecpc->lastError = AEC_BAD_PARAMETER_ERROR;
return -1; return -1;
} }
// Check for valid pointers based on sampling rate // Check for valid pointers based on sampling rate
if (aecpc->sampFreq == 32000 && nearendH == NULL) { if (aecpc->sampFreq == 32000 && nearendH == NULL) {
aecpc->lastError = AEC_NULL_POINTER_ERROR; aecpc->lastError = AEC_NULL_POINTER_ERROR;
return -1; return -1;
} }
if (msInSndCardBuf < 0) { if (msInSndCardBuf < 0) {
msInSndCardBuf = 0; msInSndCardBuf = 0;
aecpc->lastError = AEC_BAD_PARAMETER_WARNING; aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
retVal = -1; retVal = -1;
} } else if (msInSndCardBuf > kMaxTrustedDelayMs) {
else if (msInSndCardBuf > kMaxTrustedDelayMs) { // The clamping is now done in ProcessExtended/Normal().
// The clamping is now done in ProcessExtended/Normal(). aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
aecpc->lastError = AEC_BAD_PARAMETER_WARNING; retVal = -1;
retVal = -1; }
}
// This returns the value of aec->extended_filter_enabled. // This returns the value of aec->extended_filter_enabled.
if (WebRtcAec_delay_correction_enabled(aecpc->aec)) { if (WebRtcAec_delay_correction_enabled(aecpc->aec)) {
ProcessExtended(aecpc, nearend, nearendH, out, outH, nrOfSamples, ProcessExtended(
msInSndCardBuf, skew); aecpc, nearend, nearendH, out, outH, nrOfSamples, msInSndCardBuf, skew);
} else { } else {
if (ProcessNormal(aecpc, nearend, nearendH, out, outH, nrOfSamples, if (ProcessNormal(aecpc,
msInSndCardBuf, skew) != 0) { nearend,
retVal = -1; nearendH,
} out,
outH,
nrOfSamples,
msInSndCardBuf,
skew) != 0) {
retVal = -1;
} }
}
#ifdef WEBRTC_AEC_DEBUG_DUMP #ifdef WEBRTC_AEC_DEBUG_DUMP
{ {
int16_t far_buf_size_ms = (int16_t)(WebRtcAec_system_delay(aecpc->aec) / int16_t far_buf_size_ms = (int16_t)(WebRtcAec_system_delay(aecpc->aec) /
(sampMsNb * aecpc->rate_factor)); (sampMsNb * aecpc->rate_factor));
(void)fwrite(&far_buf_size_ms, 2, 1, aecpc->bufFile); (void)fwrite(&far_buf_size_ms, 2, 1, aecpc->bufFile);
(void)fwrite(&aecpc->knownDelay, sizeof(aecpc->knownDelay), 1, (void)fwrite(
aecpc->delayFile); &aecpc->knownDelay, sizeof(aecpc->knownDelay), 1, aecpc->delayFile);
} }
#endif #endif
return retVal; return retVal;
} }
int WebRtcAec_set_config(void* handle, AecConfig config) { int WebRtcAec_set_config(void* handle, AecConfig config) {
@ -442,8 +459,9 @@ int WebRtcAec_set_config(void* handle, AecConfig config) {
} }
self->skewMode = config.skewMode; self->skewMode = config.skewMode;
if (config.nlpMode != kAecNlpConservative && config.nlpMode != kAecNlpModerate if (config.nlpMode != kAecNlpConservative &&
&& config.nlpMode != kAecNlpAggressive) { config.nlpMode != kAecNlpModerate &&
config.nlpMode != kAecNlpAggressive) {
self->lastError = AEC_BAD_PARAMETER_ERROR; self->lastError = AEC_BAD_PARAMETER_ERROR;
return -1; return -1;
} }
@ -458,14 +476,14 @@ int WebRtcAec_set_config(void* handle, AecConfig config) {
return -1; return -1;
} }
WebRtcAec_SetConfigCore(self->aec, config.nlpMode, config.metricsMode, WebRtcAec_SetConfigCore(
config.delay_logging); self->aec, config.nlpMode, config.metricsMode, config.delay_logging);
return 0; return 0;
} }
int WebRtcAec_get_echo_status(void* handle, int* status) { int WebRtcAec_get_echo_status(void* handle, int* status) {
aecpc_t* self = (aecpc_t*)handle; aecpc_t* self = (aecpc_t*)handle;
if (status == NULL ) { if (status == NULL) {
self->lastError = AEC_NULL_POINTER_ERROR; self->lastError = AEC_NULL_POINTER_ERROR;
return -1; return -1;
} }
@ -488,10 +506,10 @@ int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics) {
Stats erle; Stats erle;
Stats a_nlp; Stats a_nlp;
if (handle == NULL ) { if (handle == NULL) {
return -1; return -1;
} }
if (metrics == NULL ) { if (metrics == NULL) {
self->lastError = AEC_NULL_POINTER_ERROR; self->lastError = AEC_NULL_POINTER_ERROR;
return -1; return -1;
} }
@ -503,46 +521,46 @@ int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics) {
WebRtcAec_GetEchoStats(self->aec, &erl, &erle, &a_nlp); WebRtcAec_GetEchoStats(self->aec, &erl, &erle, &a_nlp);
// ERL // ERL
metrics->erl.instant = (int) erl.instant; metrics->erl.instant = (int)erl.instant;
if ((erl.himean > kOffsetLevel) && (erl.average > kOffsetLevel)) { if ((erl.himean > kOffsetLevel) && (erl.average > kOffsetLevel)) {
// Use a mix between regular average and upper part average. // Use a mix between regular average and upper part average.
dtmp = kUpWeight * erl.himean + (1 - kUpWeight) * erl.average; dtmp = kUpWeight * erl.himean + (1 - kUpWeight) * erl.average;
metrics->erl.average = (int) dtmp; metrics->erl.average = (int)dtmp;
} else { } else {
metrics->erl.average = kOffsetLevel; metrics->erl.average = kOffsetLevel;
} }
metrics->erl.max = (int) erl.max; metrics->erl.max = (int)erl.max;
if (erl.min < (kOffsetLevel * (-1))) { if (erl.min < (kOffsetLevel * (-1))) {
metrics->erl.min = (int) erl.min; metrics->erl.min = (int)erl.min;
} else { } else {
metrics->erl.min = kOffsetLevel; metrics->erl.min = kOffsetLevel;
} }
// ERLE // ERLE
metrics->erle.instant = (int) erle.instant; metrics->erle.instant = (int)erle.instant;
if ((erle.himean > kOffsetLevel) && (erle.average > kOffsetLevel)) { if ((erle.himean > kOffsetLevel) && (erle.average > kOffsetLevel)) {
// Use a mix between regular average and upper part average. // Use a mix between regular average and upper part average.
dtmp = kUpWeight * erle.himean + (1 - kUpWeight) * erle.average; dtmp = kUpWeight * erle.himean + (1 - kUpWeight) * erle.average;
metrics->erle.average = (int) dtmp; metrics->erle.average = (int)dtmp;
} else { } else {
metrics->erle.average = kOffsetLevel; metrics->erle.average = kOffsetLevel;
} }
metrics->erle.max = (int) erle.max; metrics->erle.max = (int)erle.max;
if (erle.min < (kOffsetLevel * (-1))) { if (erle.min < (kOffsetLevel * (-1))) {
metrics->erle.min = (int) erle.min; metrics->erle.min = (int)erle.min;
} else { } else {
metrics->erle.min = kOffsetLevel; metrics->erle.min = kOffsetLevel;
} }
// RERL // RERL
if ((metrics->erl.average > kOffsetLevel) if ((metrics->erl.average > kOffsetLevel) &&
&& (metrics->erle.average > kOffsetLevel)) { (metrics->erle.average > kOffsetLevel)) {
stmp = metrics->erl.average + metrics->erle.average; stmp = metrics->erl.average + metrics->erle.average;
} else { } else {
stmp = kOffsetLevel; stmp = kOffsetLevel;
@ -555,20 +573,20 @@ int WebRtcAec_GetMetrics(void* handle, AecMetrics* metrics) {
metrics->rerl.min = stmp; metrics->rerl.min = stmp;
// A_NLP // A_NLP
metrics->aNlp.instant = (int) a_nlp.instant; metrics->aNlp.instant = (int)a_nlp.instant;
if ((a_nlp.himean > kOffsetLevel) && (a_nlp.average > kOffsetLevel)) { if ((a_nlp.himean > kOffsetLevel) && (a_nlp.average > kOffsetLevel)) {
// Use a mix between regular average and upper part average. // Use a mix between regular average and upper part average.
dtmp = kUpWeight * a_nlp.himean + (1 - kUpWeight) * a_nlp.average; dtmp = kUpWeight * a_nlp.himean + (1 - kUpWeight) * a_nlp.average;
metrics->aNlp.average = (int) dtmp; metrics->aNlp.average = (int)dtmp;
} else { } else {
metrics->aNlp.average = kOffsetLevel; metrics->aNlp.average = kOffsetLevel;
} }
metrics->aNlp.max = (int) a_nlp.max; metrics->aNlp.max = (int)a_nlp.max;
if (a_nlp.min < (kOffsetLevel * (-1))) { if (a_nlp.min < (kOffsetLevel * (-1))) {
metrics->aNlp.min = (int) a_nlp.min; metrics->aNlp.min = (int)a_nlp.min;
} else { } else {
metrics->aNlp.min = kOffsetLevel; metrics->aNlp.min = kOffsetLevel;
} }
@ -599,22 +617,25 @@ int WebRtcAec_GetDelayMetrics(void* handle, int* median, int* std) {
return 0; return 0;
} }
int32_t WebRtcAec_get_error_code(void *aecInst) int32_t WebRtcAec_get_error_code(void* aecInst) {
{ aecpc_t* aecpc = aecInst;
aecpc_t *aecpc = aecInst; return aecpc->lastError;
return aecpc->lastError;
} }
AecCore* WebRtcAec_aec_core(void* handle) { AecCore* WebRtcAec_aec_core(void* handle) {
if (!handle) { if (!handle) {
return NULL; return NULL;
} }
return ((aecpc_t*) handle)->aec; return ((aecpc_t*)handle)->aec;
} }
static int ProcessNormal(aecpc_t *aecpc, const int16_t *nearend, static int ProcessNormal(aecpc_t* aecpc,
const int16_t *nearendH, int16_t *out, int16_t *outH, const int16_t* nearend,
int16_t nrOfSamples, int16_t msInSndCardBuf, const int16_t* nearendH,
int16_t* out,
int16_t* outH,
int16_t nrOfSamples,
int16_t msInSndCardBuf,
int32_t skew) { int32_t skew) {
int retVal = 0; int retVal = 0;
short i; short i;
@ -624,8 +645,8 @@ static int ProcessNormal(aecpc_t *aecpc, const int16_t *nearend,
const float minSkewEst = -0.5f; const float minSkewEst = -0.5f;
const float maxSkewEst = 1.0f; const float maxSkewEst = 1.0f;
msInSndCardBuf = msInSndCardBuf > kMaxTrustedDelayMs ? msInSndCardBuf =
kMaxTrustedDelayMs : msInSndCardBuf; msInSndCardBuf > kMaxTrustedDelayMs ? kMaxTrustedDelayMs : msInSndCardBuf;
// TODO(andrew): we need to investigate if this +10 is really wanted. // TODO(andrew): we need to investigate if this +10 is really wanted.
msInSndCardBuf += 10; msInSndCardBuf += 10;
aecpc->msInSndCardBuf = msInSndCardBuf; aecpc->msInSndCardBuf = msInSndCardBuf;
@ -633,27 +654,24 @@ static int ProcessNormal(aecpc_t *aecpc, const int16_t *nearend,
if (aecpc->skewMode == kAecTrue) { if (aecpc->skewMode == kAecTrue) {
if (aecpc->skewFrCtr < 25) { if (aecpc->skewFrCtr < 25) {
aecpc->skewFrCtr++; aecpc->skewFrCtr++;
} } else {
else {
retVal = WebRtcAec_GetSkew(aecpc->resampler, skew, &aecpc->skew); retVal = WebRtcAec_GetSkew(aecpc->resampler, skew, &aecpc->skew);
if (retVal == -1) { if (retVal == -1) {
aecpc->skew = 0; aecpc->skew = 0;
aecpc->lastError = AEC_BAD_PARAMETER_WARNING; aecpc->lastError = AEC_BAD_PARAMETER_WARNING;
} }
aecpc->skew /= aecpc->sampFactor*nrOfSamples; aecpc->skew /= aecpc->sampFactor * nrOfSamples;
if (aecpc->skew < 1.0e-3 && aecpc->skew > -1.0e-3) { if (aecpc->skew < 1.0e-3 && aecpc->skew > -1.0e-3) {
aecpc->resample = kAecFalse; aecpc->resample = kAecFalse;
} } else {
else {
aecpc->resample = kAecTrue; aecpc->resample = kAecTrue;
} }
if (aecpc->skew < minSkewEst) { if (aecpc->skew < minSkewEst) {
aecpc->skew = minSkewEst; aecpc->skew = minSkewEst;
} } else if (aecpc->skew > maxSkewEst) {
else if (aecpc->skew > maxSkewEst) {
aecpc->skew = maxSkewEst; aecpc->skew = maxSkewEst;
} }
@ -692,11 +710,10 @@ static int ProcessNormal(aecpc_t *aecpc, const int16_t *nearend,
} }
if (abs(aecpc->firstVal - aecpc->msInSndCardBuf) < if (abs(aecpc->firstVal - aecpc->msInSndCardBuf) <
WEBRTC_SPL_MAX(0.2 * aecpc->msInSndCardBuf, sampMsNb)) { WEBRTC_SPL_MAX(0.2 * aecpc->msInSndCardBuf, sampMsNb)) {
aecpc->sum += aecpc->msInSndCardBuf; aecpc->sum += aecpc->msInSndCardBuf;
aecpc->counter++; aecpc->counter++;
} } else {
else {
aecpc->counter = 0; aecpc->counter = 0;
} }
@ -704,9 +721,10 @@ static int ProcessNormal(aecpc_t *aecpc, const int16_t *nearend,
// The far-end buffer size is determined in partitions of // The far-end buffer size is determined in partitions of
// PART_LEN samples. Use 75% of the average value of the system // PART_LEN samples. Use 75% of the average value of the system
// delay as buffer size to start with. // delay as buffer size to start with.
aecpc->bufSizeStart = WEBRTC_SPL_MIN((3 * aecpc->sum * aecpc->bufSizeStart =
aecpc->rate_factor * 8) / (4 * aecpc->counter * PART_LEN), WEBRTC_SPL_MIN((3 * aecpc->sum * aecpc->rate_factor * 8) /
kMaxBufSizeStart); (4 * aecpc->counter * PART_LEN),
kMaxBufSizeStart);
// Buffer size has now been determined. // Buffer size has now been determined.
aecpc->checkBuffSize = 0; aecpc->checkBuffSize = 0;
} }
@ -714,8 +732,9 @@ static int ProcessNormal(aecpc_t *aecpc, const int16_t *nearend,
if (aecpc->checkBufSizeCtr * nBlocks10ms > 50) { if (aecpc->checkBufSizeCtr * nBlocks10ms > 50) {
// For really bad systems, don't disable the echo canceller for // For really bad systems, don't disable the echo canceller for
// more than 0.5 sec. // more than 0.5 sec.
aecpc->bufSizeStart = WEBRTC_SPL_MIN((aecpc->msInSndCardBuf * aecpc->bufSizeStart = WEBRTC_SPL_MIN(
aecpc->rate_factor * 3) / 40, kMaxBufSizeStart); (aecpc->msInSndCardBuf * aecpc->rate_factor * 3) / 40,
kMaxBufSizeStart);
aecpc->checkBuffSize = 0; aecpc->checkBuffSize = 0;
} }
} }
@ -765,9 +784,14 @@ static int ProcessNormal(aecpc_t *aecpc, const int16_t *nearend,
return retVal; return retVal;
} }
static void ProcessExtended(aecpc_t* self, const int16_t* near, static void ProcessExtended(aecpc_t* self,
const int16_t* near_high, int16_t* out, int16_t* out_high, const int16_t* near,
int16_t num_samples, int16_t reported_delay_ms, int32_t skew) { const int16_t* near_high,
int16_t* out,
int16_t* out_high,
int16_t num_samples,
int16_t reported_delay_ms,
int32_t skew) {
int i; int i;
const int num_frames = num_samples / FRAME_LEN; const int num_frames = num_samples / FRAME_LEN;
#if defined(WEBRTC_UNTRUSTED_DELAY) #if defined(WEBRTC_UNTRUSTED_DELAY)
@ -779,14 +803,16 @@ static void ProcessExtended(aecpc_t* self, const int16_t* near,
// Due to the longer filter, we no longer add 10 ms to the reported delay // Due to the longer filter, we no longer add 10 ms to the reported delay
// to reduce chance of non-causality. Instead we apply a minimum here to avoid // to reduce chance of non-causality. Instead we apply a minimum here to avoid
// issues with the read pointer jumping around needlessly. // issues with the read pointer jumping around needlessly.
reported_delay_ms = reported_delay_ms < kMinTrustedDelayMs ? reported_delay_ms = reported_delay_ms < kMinTrustedDelayMs
kMinTrustedDelayMs : reported_delay_ms; ? kMinTrustedDelayMs
: reported_delay_ms;
// If the reported delay appears to be bogus, we attempt to recover by using // If the reported delay appears to be bogus, we attempt to recover by using
// the measured fixed delay values. We use >= here because higher layers // the measured fixed delay values. We use >= here because higher layers
// may already clamp to this maximum value, and we would otherwise not // may already clamp to this maximum value, and we would otherwise not
// detect it here. // detect it here.
reported_delay_ms = reported_delay_ms >= kMaxTrustedDelayMs ? reported_delay_ms = reported_delay_ms >= kMaxTrustedDelayMs
kFixedDelayMs : reported_delay_ms; ? kFixedDelayMs
: reported_delay_ms;
#endif #endif
self->msInSndCardBuf = reported_delay_ms; self->msInSndCardBuf = reported_delay_ms;
@ -805,10 +831,11 @@ static void ProcessExtended(aecpc_t* self, const int16_t* near,
// action on the first frame. In the trusted delay case, we'll take the // action on the first frame. In the trusted delay case, we'll take the
// current reported delay, unless it's less than our conservative // current reported delay, unless it's less than our conservative
// measurement. // measurement.
int startup_size_ms = reported_delay_ms < kFixedDelayMs ? int startup_size_ms =
kFixedDelayMs : reported_delay_ms; reported_delay_ms < kFixedDelayMs ? kFixedDelayMs : reported_delay_ms;
int overhead_elements = (WebRtcAec_system_delay(self->aec) - int overhead_elements = (WebRtcAec_system_delay(self->aec) -
startup_size_ms / 2 * self->rate_factor * 8) / PART_LEN; startup_size_ms / 2 * self->rate_factor * 8) /
PART_LEN;
WebRtcAec_MoveFarReadPtr(self->aec, overhead_elements); WebRtcAec_MoveFarReadPtr(self->aec, overhead_elements);
self->startup_phase = 0; self->startup_phase = 0;
} }
@ -823,9 +850,12 @@ static void ProcessExtended(aecpc_t* self, const int16_t* near,
WEBRTC_SPL_MAX(0, self->knownDelay + delay_diff_offset); WEBRTC_SPL_MAX(0, self->knownDelay + delay_diff_offset);
for (i = 0; i < num_frames; ++i) { for (i = 0; i < num_frames; ++i) {
WebRtcAec_ProcessFrame(self->aec, &near[FRAME_LEN * i], WebRtcAec_ProcessFrame(self->aec,
&near_high[FRAME_LEN * i], adjusted_known_delay, &near[FRAME_LEN * i],
&out[FRAME_LEN * i], &out_high[FRAME_LEN * i]); &near_high[FRAME_LEN * i],
adjusted_known_delay,
&out[FRAME_LEN * i],
&out_high[FRAME_LEN * i]);
} }
} }
} }
@ -857,8 +887,8 @@ static void EstBufDelayNormal(aecpc_t* aecpc) {
// We use -1 to signal an initialized state in the "extended" implementation; // We use -1 to signal an initialized state in the "extended" implementation;
// compensate for that. // compensate for that.
aecpc->filtDelay = aecpc->filtDelay < 0 ? 0 : aecpc->filtDelay; aecpc->filtDelay = aecpc->filtDelay < 0 ? 0 : aecpc->filtDelay;
aecpc->filtDelay = WEBRTC_SPL_MAX(0, (short) (0.8 * aecpc->filtDelay + aecpc->filtDelay =
0.2 * current_delay)); WEBRTC_SPL_MAX(0, (short)(0.8 * aecpc->filtDelay + 0.2 * current_delay));
delay_difference = aecpc->filtDelay - aecpc->knownDelay; delay_difference = aecpc->filtDelay - aecpc->knownDelay;
if (delay_difference > 224) { if (delay_difference > 224) {
@ -879,7 +909,7 @@ static void EstBufDelayNormal(aecpc_t* aecpc) {
aecpc->lastDelayDiff = delay_difference; aecpc->lastDelayDiff = delay_difference;
if (aecpc->timeForDelayChange > 25) { if (aecpc->timeForDelayChange > 25) {
aecpc->knownDelay = WEBRTC_SPL_MAX((int) aecpc->filtDelay - 160, 0); aecpc->knownDelay = WEBRTC_SPL_MAX((int)aecpc->filtDelay - 160, 0);
} }
} }
@ -910,8 +940,8 @@ static void EstBufDelayExtended(aecpc_t* self) {
if (self->filtDelay == -1) { if (self->filtDelay == -1) {
self->filtDelay = WEBRTC_SPL_MAX(0, 0.5 * current_delay); self->filtDelay = WEBRTC_SPL_MAX(0, 0.5 * current_delay);
} else { } else {
self->filtDelay = WEBRTC_SPL_MAX(0, (short) (0.95 * self->filtDelay + self->filtDelay = WEBRTC_SPL_MAX(
0.05 * current_delay)); 0, (short)(0.95 * self->filtDelay + 0.05 * current_delay));
} }
delay_difference = self->filtDelay - self->knownDelay; delay_difference = self->filtDelay - self->knownDelay;
@ -933,6 +963,6 @@ static void EstBufDelayExtended(aecpc_t* self) {
self->lastDelayDiff = delay_difference; self->lastDelayDiff = delay_difference;
if (self->timeForDelayChange > 25) { if (self->timeForDelayChange > 25) {
self->knownDelay = WEBRTC_SPL_MAX((int) self->filtDelay - 256, 0); self->knownDelay = WEBRTC_SPL_MAX((int)self->filtDelay - 256, 0);
} }
} }

View File

@ -14,32 +14,32 @@
#include "webrtc/typedefs.h" #include "webrtc/typedefs.h"
// Errors // Errors
#define AEC_UNSPECIFIED_ERROR 12000 #define AEC_UNSPECIFIED_ERROR 12000
#define AEC_UNSUPPORTED_FUNCTION_ERROR 12001 #define AEC_UNSUPPORTED_FUNCTION_ERROR 12001
#define AEC_UNINITIALIZED_ERROR 12002 #define AEC_UNINITIALIZED_ERROR 12002
#define AEC_NULL_POINTER_ERROR 12003 #define AEC_NULL_POINTER_ERROR 12003
#define AEC_BAD_PARAMETER_ERROR 12004 #define AEC_BAD_PARAMETER_ERROR 12004
// Warnings // Warnings
#define AEC_BAD_PARAMETER_WARNING 12050 #define AEC_BAD_PARAMETER_WARNING 12050
enum { enum {
kAecNlpConservative = 0, kAecNlpConservative = 0,
kAecNlpModerate, kAecNlpModerate,
kAecNlpAggressive kAecNlpAggressive
}; };
enum { enum {
kAecFalse = 0, kAecFalse = 0,
kAecTrue kAecTrue
}; };
typedef struct { typedef struct {
int16_t nlpMode; // default kAecNlpModerate int16_t nlpMode; // default kAecNlpModerate
int16_t skewMode; // default kAecFalse int16_t skewMode; // default kAecFalse
int16_t metricsMode; // default kAecFalse int16_t metricsMode; // default kAecFalse
int delay_logging; // default kAecFalse int delay_logging; // default kAecFalse
//float realSkew; // float realSkew;
} AecConfig; } AecConfig;
typedef struct { typedef struct {
@ -50,10 +50,10 @@ typedef struct {
} AecLevel; } AecLevel;
typedef struct { typedef struct {
AecLevel rerl; AecLevel rerl;
AecLevel erl; AecLevel erl;
AecLevel erle; AecLevel erle;
AecLevel aNlp; AecLevel aNlp;
} AecMetrics; } AecMetrics;
struct AecCore; struct AecCore;
@ -76,7 +76,7 @@ extern "C" {
* int32_t return 0: OK * int32_t return 0: OK
* -1: error * -1: error
*/ */
int32_t WebRtcAec_Create(void **aecInst); int32_t WebRtcAec_Create(void** aecInst);
/* /*
* This function releases the memory allocated by WebRtcAec_Create(). * This function releases the memory allocated by WebRtcAec_Create().
@ -90,7 +90,7 @@ int32_t WebRtcAec_Create(void **aecInst);
* int32_t return 0: OK * int32_t return 0: OK
* -1: error * -1: error
*/ */
int32_t WebRtcAec_Free(void *aecInst); int32_t WebRtcAec_Free(void* aecInst);
/* /*
* Initializes an AEC instance. * Initializes an AEC instance.
@ -106,7 +106,7 @@ int32_t WebRtcAec_Free(void *aecInst);
* int32_t return 0: OK * int32_t return 0: OK
* -1: error * -1: error
*/ */
int32_t WebRtcAec_Init(void *aecInst, int32_t sampFreq, int32_t scSampFreq); int32_t WebRtcAec_Init(void* aecInst, int32_t sampFreq, int32_t scSampFreq);
/* /*
* Inserts an 80 or 160 sample block of data into the farend buffer. * Inserts an 80 or 160 sample block of data into the farend buffer.
@ -123,8 +123,8 @@ int32_t WebRtcAec_Init(void *aecInst, int32_t sampFreq, int32_t scSampFreq);
* int32_t return 0: OK * int32_t return 0: OK
* -1: error * -1: error
*/ */
int32_t WebRtcAec_BufferFarend(void *aecInst, int32_t WebRtcAec_BufferFarend(void* aecInst,
const int16_t *farend, const int16_t* farend,
int16_t nrOfSamples); int16_t nrOfSamples);
/* /*
@ -153,11 +153,11 @@ int32_t WebRtcAec_BufferFarend(void *aecInst,
* int32_t return 0: OK * int32_t return 0: OK
* -1: error * -1: error
*/ */
int32_t WebRtcAec_Process(void *aecInst, int32_t WebRtcAec_Process(void* aecInst,
const int16_t *nearend, const int16_t* nearend,
const int16_t *nearendH, const int16_t* nearendH,
int16_t *out, int16_t* out,
int16_t *outH, int16_t* outH,
int16_t nrOfSamples, int16_t nrOfSamples,
int16_t msInSndCardBuf, int16_t msInSndCardBuf,
int32_t skew); int32_t skew);
@ -238,7 +238,7 @@ int WebRtcAec_GetDelayMetrics(void* handle, int* median, int* std);
* ------------------------------------------------------------------- * -------------------------------------------------------------------
* int32_t return 12000-12050: error code * int32_t return 12000-12050: error code
*/ */
int32_t WebRtcAec_get_error_code(void *aecInst); int32_t WebRtcAec_get_error_code(void* aecInst);
// Returns a pointer to the low level AEC handle. // Returns a pointer to the low level AEC handle.
// //

View File

@ -52,9 +52,7 @@ class SystemDelayTest : public ::testing::Test {
}; };
SystemDelayTest::SystemDelayTest() SystemDelayTest::SystemDelayTest()
: handle_(NULL), : handle_(NULL), self_(NULL), samples_per_frame_(0) {
self_(NULL),
samples_per_frame_(0) {
// Dummy input data are set with more or less arbitrary non-zero values. // Dummy input data are set with more or less arbitrary non-zero values.
memset(far_, 1, sizeof(far_)); memset(far_, 1, sizeof(far_));
memset(near_, 2, sizeof(near_)); memset(near_, 2, sizeof(near_));
@ -74,7 +72,7 @@ void SystemDelayTest::TearDown() {
// In SWB mode nothing is added to the buffer handling with respect to // In SWB mode nothing is added to the buffer handling with respect to
// functionality compared to WB. We therefore only verify behavior in NB and WB. // functionality compared to WB. We therefore only verify behavior in NB and WB.
static const int kSampleRateHz[] = { 8000, 16000 }; static const int kSampleRateHz[] = {8000, 16000};
static const size_t kNumSampleRates = static const size_t kNumSampleRates =
sizeof(kSampleRateHz) / sizeof(*kSampleRateHz); sizeof(kSampleRateHz) / sizeof(*kSampleRateHz);
@ -100,8 +98,15 @@ void SystemDelayTest::Init(int sample_rate_hz) {
void SystemDelayTest::RenderAndCapture(int device_buffer_ms) { void SystemDelayTest::RenderAndCapture(int device_buffer_ms) {
EXPECT_EQ(0, WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_)); EXPECT_EQ(0, WebRtcAec_BufferFarend(handle_, far_, samples_per_frame_));
EXPECT_EQ(0, WebRtcAec_Process(handle_, near_, NULL, out_, NULL, EXPECT_EQ(0,
samples_per_frame_, device_buffer_ms, 0)); WebRtcAec_Process(handle_,
near_,
NULL,
out_,
NULL,
samples_per_frame_,
device_buffer_ms,
0));
} }
int SystemDelayTest::BufferFillUp() { int SystemDelayTest::BufferFillUp() {
@ -254,8 +259,15 @@ TEST_F(SystemDelayTest, CorrectDelayAfterStableBufferBuildUp) {
// can make that assumption since we have a separate stability test. // can make that assumption since we have a separate stability test.
int process_time_ms = 0; int process_time_ms = 0;
for (; process_time_ms < kStableConvergenceMs; process_time_ms += 10) { for (; process_time_ms < kStableConvergenceMs; process_time_ms += 10) {
EXPECT_EQ(0, WebRtcAec_Process(handle_, near_, NULL, out_, NULL, EXPECT_EQ(0,
samples_per_frame_, kDeviceBufMs, 0)); WebRtcAec_Process(handle_,
near_,
NULL,
out_,
NULL,
samples_per_frame_,
kDeviceBufMs,
0));
} }
// Verify that a buffer size has been established. // Verify that a buffer size has been established.
EXPECT_EQ(0, self_->checkBuffSize); EXPECT_EQ(0, self_->checkBuffSize);
@ -301,8 +313,15 @@ TEST_F(SystemDelayTest, CorrectDelayWhenBufferUnderrun) {
// |kStableConvergenceMs| in the buffer. Keep on calling Process() until // |kStableConvergenceMs| in the buffer. Keep on calling Process() until
// we run out of data and verify that the system delay is non-negative. // we run out of data and verify that the system delay is non-negative.
for (int j = 0; j <= kStableConvergenceMs; j += 10) { for (int j = 0; j <= kStableConvergenceMs; j += 10) {
EXPECT_EQ(0, WebRtcAec_Process(handle_, near_, NULL, out_, NULL, EXPECT_EQ(0,
samples_per_frame_, kDeviceBufMs, 0)); WebRtcAec_Process(handle_,
near_,
NULL,
out_,
NULL,
samples_per_frame_,
kDeviceBufMs,
0));
EXPECT_LE(0, WebRtcAec_system_delay(self_->aec)); EXPECT_LE(0, WebRtcAec_system_delay(self_->aec));
} }
} }