Formalized Real 16-bit FFT for APM.

This change also prepares for introducing the Real 16-bit FFT Neon code from OpenMAX into SPL. CL https://webrtc-codereview.appspot.com/1819004/ does that work; this CL is a prerequisite for it.
Tested audioproc with an offline file; results are bit-exact.

R=andrew@webrtc.org, rtoy@google.com

Review URL: https://webrtc-codereview.appspot.com/1830004

git-svn-id: http://webrtc.googlecode.com/svn/trunk@4390 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
kma@webrtc.org
2013-07-24 17:38:23 +00:00
parent b63c29f48c
commit fc8aaf02e1
10 changed files with 285 additions and 473 deletions

View File

@ -244,8 +244,6 @@ static const uint16_t* AlignedFarend(AecmCore_t* self, int* far_q, int delay) {
CalcLinearEnergies WebRtcAecm_CalcLinearEnergies;
StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
WindowAndFFT WebRtcAecm_WindowAndFFT;
InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
int WebRtcAecm_CreateCore(AecmCore_t **aecmInst)
{
@ -351,41 +349,36 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const int16_t* echo_path)
aecm->mseChannelCount = 0;
}
static void WindowAndFFTC(AecmCore_t* aecm,
static void WindowAndFFT(AecmCore_t* aecm,
int16_t* fft,
const int16_t* time_signal,
complex16_t* freq_signal,
int time_signal_scaling)
{
int i, j;
int time_signal_scaling) {
int i = 0;
memset(fft, 0, sizeof(int16_t) * PART_LEN4);
// FFT of signal
for (i = 0, j = 0; i < PART_LEN; i++, j += 2)
{
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[j] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[i],
14);
fft[PART_LEN2 + j] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i + PART_LEN] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
// Inserting zeros in imaginary parts not necessary since we
// initialized the array with all zeros
}
// FFT of signal
for (i = 0; i < PART_LEN; i++) {
// Window time domain signal and insert into real part of
// transformation array |fft|
fft[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[i],
14);
fft[PART_LEN + i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(
(time_signal[i + PART_LEN] << time_signal_scaling),
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
}
// Do forward FFT, then take only the first PART_LEN complex samples,
// and change signs of the imaginary parts.
WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal);
for (i = 0; i < PART_LEN; i++) {
freq_signal[i].imag = -freq_signal[i].imag;
}
// Do forward FFT, then take only the first PART_LEN complex samples,
// and change signs of the imaginary parts.
WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal);
for (i = 0; i < PART_LEN; i++) {
freq_signal[i].imag = -freq_signal[i].imag;
}
}
static void InverseFFTAndWindowC(AecmCore_t* aecm,
static void InverseFFTAndWindow(AecmCore_t* aecm,
int16_t* fft,
complex16_t* efw,
int16_t* output,
@ -395,17 +388,9 @@ static void InverseFFTAndWindowC(AecmCore_t* aecm,
int32_t tmp32no1;
// Synthesis
for (i = 1; i < PART_LEN; i++)
{
j = WEBRTC_SPL_LSHIFT_W32(i, 1);
fft[j] = efw[i].real;
// mirrored data, even
fft[PART_LEN4 - j] = efw[i].real;
fft[j + 1] = -efw[i].imag;
//mirrored data, odd
fft[PART_LEN4 - (j - 1)] = efw[i].imag;
for (i = 1, j = 2; i < PART_LEN; i += 1, j += 2) {
fft[j] = efw[i].real;
fft[j + 1] = -efw[i].imag;
}
fft[0] = efw[0].real;
fft[1] = -efw[0].imag;
@ -413,31 +398,23 @@ static void InverseFFTAndWindowC(AecmCore_t* aecm,
fft[PART_LEN2] = efw[PART_LEN].real;
fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
// Inverse FFT. Then take only the real values, and keep outCFFT
// to scale the samples in the next block.
outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw);
for (i = 0; i < PART_LEN; i++) {
efw[i].real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
efw[i].real,
WebRtcAecm_kSqrtHanning[i],
14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((int32_t)efw[i].real,
outCFFT - aecm->dfaCleanQDomain);
efw[i].real = (int16_t)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
tmp32no1 + aecm->outBuf[i],
WEBRTC_SPL_WORD16_MIN);
output[i] = efw[i].real;
// Inverse FFT. Keep outCFFT to scale the samples in the next block.
outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, output);
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(
efw[PART_LEN + i].real,
WebRtcAecm_kSqrtHanning[PART_LEN - i],
14);
for (i = 0; i < PART_LEN; i++) {
output[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
output[i], WebRtcAecm_kSqrtHanning[i], 14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32((int32_t)output[i],
outCFFT - aecm->dfaCleanQDomain);
output[i] = (int16_t)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX,
tmp32no1 + aecm->outBuf[i], WEBRTC_SPL_WORD16_MIN);
tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT(output[PART_LEN + i],
WebRtcAecm_kSqrtHanning[PART_LEN - i], 14);
tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1,
outCFFT - aecm->dfaCleanQDomain);
outCFFT - aecm->dfaCleanQDomain);
aecm->outBuf[i] = (int16_t)WEBRTC_SPL_SAT(
WEBRTC_SPL_WORD16_MAX,
tmp32no1,
WEBRTC_SPL_WORD16_MIN);
WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN);
}
// Copy the current block to the old position (aecm->outBuf is shifted elsewhere)
@ -522,9 +499,6 @@ static void ResetAdaptiveChannelC(AecmCore_t* aecm)
#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
static void WebRtcAecm_InitNeon(void)
{
// TODO(kma): Check why WebRtcAecm_InverseFFTAndWindowNeon() doesn't work.
WebRtcAecm_WindowAndFFT = WebRtcAecm_WindowAndFFTNeon;
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC;
WebRtcAecm_StoreAdaptiveChannel = WebRtcAecm_StoreAdaptiveChannelNeon;
WebRtcAecm_ResetAdaptiveChannel = WebRtcAecm_ResetAdaptiveChannelNeon;
WebRtcAecm_CalcLinearEnergies = WebRtcAecm_CalcLinearEnergiesNeon;
@ -654,8 +628,6 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq)
COMPILE_ASSERT(PART_LEN % 16 == 0);
// Initialize function pointers.
WebRtcAecm_WindowAndFFT = WindowAndFFTC;
WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC;
WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesC;
WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelC;
WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelC;
@ -1403,7 +1375,7 @@ static int TimeToFrequencyDomain(AecmCore_t* aecm,
time_signal_scaling = WebRtcSpl_NormW16(tmp16no1);
#endif
WebRtcAecm_WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling);
WindowAndFFT(aecm, fft, time_signal, freq_signal, time_signal_scaling);
// Extract imaginary and real part, calculate the magnitude for all frequency bins
freq_signal[0].imag = 0;
@ -1843,7 +1815,7 @@ int WebRtcAecm_ProcessBlock(AecmCore_t * aecm,
ComfortNoise(aecm, ptrDfaClean, efw, hnl);
}
WebRtcAecm_InverseFFTAndWindow(aecm, fft, efw, output, nearendClean);
InverseFFTAndWindow(aecm, fft, efw, output, nearendClean);
return 0;
}

View File

@ -294,37 +294,10 @@ extern StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel;
typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm);
extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel;
typedef void (*WindowAndFFT)(
AecmCore_t* aecm,
int16_t* fft,
const int16_t* time_signal,
complex16_t* freq_signal,
int time_signal_scaling);
extern WindowAndFFT WebRtcAecm_WindowAndFFT;
typedef void (*InverseFFTAndWindow)(
AecmCore_t* aecm,
int16_t* fft, complex16_t* efw,
int16_t* output,
const int16_t* nearendClean);
extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow;
// For the above function pointers, functions for generic platforms are declared
// and defined as static in file aecm_core.c, while those for ARM Neon platforms
// are declared below and defined in file aecm_core_neon.s.
#if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON)
void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
int16_t* fft,
const int16_t* time_signal,
complex16_t* freq_signal,
int time_signal_scaling);
void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
int16_t* fft,
complex16_t* efw,
int16_t* output,
const int16_t* nearendClean);
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
const uint16_t* far_spectrum,
int32_t* echo_est,

View File

@ -17,185 +17,10 @@
#include "webrtc/system_wrappers/interface/asm_defines.h"
GLOBAL_LABEL WebRtcAecm_kSqrtHanning
GLOBAL_FUNCTION WebRtcAecm_WindowAndFFTNeon
GLOBAL_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
GLOBAL_FUNCTION WebRtcAecm_CalcLinearEnergiesNeon
GLOBAL_FUNCTION WebRtcAecm_StoreAdaptiveChannelNeon
GLOBAL_FUNCTION WebRtcAecm_ResetAdaptiveChannelNeon
@ void WebRtcAecm_WindowAndFFTNeon(AecmCore_t* aecm,
@ int16_t* fft,
@ const int16_t* time_signal,
@ complex16_t* freq_signal,
@ int time_signal_scaling);
@
@ Scales and windows 2 * PART_LEN time-domain samples into |fft| (each sample
@ followed by a zero imaginary part), calls WebRtcSpl_RealForwardFFTNeon with
@ |freq_signal| as output, then negates the imaginary parts in |freq_signal|.
@ On entry: r0 = aecm, r1 = fft, r2 = time_signal, r3 = freq_signal; the fifth
@ argument is read from the stack.
.align 2
DEFINE_FUNCTION WebRtcAecm_WindowAndFFTNeon
push {r4, r5, r6, lr}
@ Four registers (16 bytes) were pushed, so the first stack argument is at
@ [sp, #16].
ldr r12, [sp, #16] @ time_signal_scaling
@ Broadcast the shift count to all four 16-bit lanes for vshl below.
vdup.16 d16, r12
vmov.i16 d21, #0 @ For imaginary parts of |fft|.
vmov.i16 d27, #0 @ For imaginary parts of |fft|.
@ r5 walks the window table forwards; lr walks kSqrtHanningReversed, which is
@ applied to the second half of the input.
adr r5, WebRtcAecm_kSqrtHanning
adr lr, kSqrtHanningReversed
add r4, r1, #(PART_LEN2 * 2) @ &fft[PART_LEN2]
add r12, r2, #(PART_LEN * 2) @ time_signal[PART_LEN]
mov r6, #(PART_LEN / 4) @ Loop counter, unrolled by 4
LOOP_PART_LEN:
vld1.16 d0, [r2, :64]! @ time_signal[i]
vld1.16 d22, [r12, :64]! @ time_signal[i + PART_LEN]
vld1.16 d17, [r5, :64]! @ WebRtcAecm_kSqrtHanning[i]
vld1.16 d23, [lr, :64]! @ kSqrtHanningReversed[i]
@ Shift both halves by time_signal_scaling.
vshl.s16 d18, d0, d16
vshl.s16 d22, d22, d16
@ Widening multiply by the window coefficients (16x16 -> 32 bits).
vmull.s16 q9, d18, d17
vmull.s16 q12, d22, d23
subs r6, #1
@ Shift right by 14 and narrow back to 16 bits.
vshrn.i32 d20, q9, #14
vshrn.i32 d26, q12, #14
@ vst2 interleaves each windowed sample with a zero from d21/d27.
vst2.16 {d20, d21}, [r1, :128]! @ fft[j]
vst2.16 {d26, d27}, [r4, :128]! @ fft[PART_LEN2 + j]
bgt LOOP_PART_LEN
@ WebRtcSpl_RealForwardFFT(aecm->real_fft, fft, (int16_t*)freq_signal);
movw r12, #offset_aecm_real_fft
sub r1, #(PART_LEN * 4) @ Get r1 back to &fft[0].
mov r2, r3 @ freq_signal
@ Second copy of freq_signal, used by the negation loop after the call.
mov r4, r3
ldr r0, [r0, r12] @ aecm->real_fft
CALL_FUNCTION WebRtcSpl_RealForwardFFTNeon
mov r12, #(PART_LEN * 2 / 16) @ Loop counter, unrolled by 16.
LOOP_PART_LEN2:
@ freq_signal[i].imag = - freq_signal[i].imag;
@ vld2 de-interleaves: d20/d21 receive the real parts, d22/d23 the imaginary
@ parts. Only the imaginary parts are negated before re-interleaving.
vld2.16 {d20, d21, d22, d23}, [r4, :256]
subs r12, #1
vneg.s16 d22, d22
vneg.s16 d23, d23
vst2.16 {d20, d21, d22, d23}, [r4, :256]!
bgt LOOP_PART_LEN2
pop {r4, r5, r6, pc}
@ void WebRtcAecm_InverseFFTAndWindowNeon(AecmCore_t* aecm,
@ int16_t* fft,
@ complex16_t* efw,
@ int16_t* output,
@ const int16_t* nearendClean);
@
@ Builds the inverse-FFT input in |fft| from |efw| (imaginary parts negated),
@ calls WebRtcSpl_RealInverseFFTNeon with |efw| as the output buffer, windows
@ and rescales the result into |output| and aecm->outBuf, and finally moves
@ the second halves of aecm->xBuf, aecm->dBufNoisy and (conditionally)
@ aecm->dBufClean to their first halves.
@ On entry: r0 = aecm, r1 = fft, r2 = efw, r3 = output.
.align 2
DEFINE_FUNCTION WebRtcAecm_InverseFFTAndWindowNeon
push {r4-r8, lr}
@ Keep fft (r1), aecm (r0) and output (r3) in r4, r5 and r7, since r1 and r3
@ are reused as running pointers below and r0 receives the call's result.
@ (The original note here named WebRtcSpl_ComplexIFFT and
@ WebRtcSpl_ComplexBitReverse; the only call in this function is
@ WebRtcSpl_RealInverseFFTNeon.)
mov r4, r1
mov r5, r0
mov r7, r3
add r3, r1, #((PART_LEN4 - 6) * 2) @ &fft[PART_LEN4 - 6]
mov r6, #(PART_LEN / 4) @ Loop counter, unrolled by 4
add r12, r2, #(PART_LEN * 4) @ &efw[PART_LEN]
@ Post-index step for the mirrored store: r3 moves back 16 bytes per pass.
mov r8, #-16
LOOP_PRE_IFFT:
@ De-interleave four complex values: d20 = real parts, d21 = imaginary parts.
vld2.16 {q10}, [r2, :128]!
vmov q11, q10
@ Negate the imaginary parts in the copy and store it re-interleaved.
vneg.s16 d23, d23
vst2.16 {d22, d23}, [r1, :128]!
@ Reverse the 16-bit lane order of the un-negated values and store them at
@ the descending pointer r3 (mirrored data).
vrev64.16 q10, q10
subs r6, #1
vst2.16 {q10}, [r3], r8
bgt LOOP_PRE_IFFT
@ fft[PART_LEN2] = efw[PART_LEN].real;
@ fft[PART_LEN2 + 1] = -efw[PART_LEN].imag;
@ One 32-bit load fetches both halfwords of efw[PART_LEN].
ldr r8, [r12]
@ r6 has counted down to zero above, so this computes 0 - x per halfword.
ssub16 r12, r6, r8
mov r3, #(PART_LEN2 * 2)
@ Bottom halfword from r8 (unchanged), top halfword from r12 (negated).
pkhbt r8, r8, r12
str r8, [r4, r3]
@ outCFFT = WebRtcSpl_RealInverseFFT(aecm->real_fft, fft, (int16_t*)efw);
movw r12, #offset_aecm_real_fft
sub r1, #(PART_LEN * 4) @ Get r1 back to &fft[0].
sub r2, #(PART_LEN * 4) @ Get r2 back to &efw[0].
mov r4, r2 @ Keep efw in r4.
ldr r0, [r0, r12] @ aecm->real_fft
CALL_FUNCTION WebRtcSpl_RealInverseFFTNeon
@ r0 now holds the call's return value (outCFFT).
movw r6, #offset_aecm_outBuf
movw r12, #offset_aecm_dfaCleanQDomain
ldr r8, [r5, r6] @ &aecm->outBuf[0]
ldrsh r2, [r5, r12] @ aecm->dfaCleanQDomain (sign-extended halfword value)
adr r12, kSqrtHanningReversed
adr r6, WebRtcAecm_kSqrtHanning
rsb r0, r2, r0 @ outCFFT - aecm->dfaCleanQDomain
@ Broadcast the shift amount to all four 32-bit lanes.
vdup.32 q9, r0
add r0, r4, #(PART_LEN * 4) @ &efw[PART_LEN]
mov r3, #(PART_LEN / 4) @ Loop counter, unrolled by 4
LOOP_POST_IFFT:
vld2.16 {d4, d5}, [r4, :128] @ &efw[i];
vld1.16 d17, [r6, :64]! @ WebRtcAecm_kSqrtHanning[i]
vld1.16 d20, [r8, :64] @ aecm->outBuf[i]
@ First half: window d4 (the de-interleaved first lanes), rounding shift by
@ 14, rescale, add outBuf[i], then saturate to 16 bits.
vmull.s16 q8, d4, d17
vmovl.s16 q10, d20
vrshr.s32 q8, q8, #14
@ NOTE(review): this reads four consecutive 16-bit values, whereas the first
@ half above is read de-interleaved with vld2.16 -- confirm the intended
@ layout of the second half.
vld1.16 d0, [r0, :64]! @ &efw[PART_LEN + i]
vshl.s32 q8, q8, q9
vld1.16 d1, [r12, :64]! @ kSqrtHanningReversed[i]
vadd.i32 q8, q10
@ Second half: window, plain (non-rounding) shift by 14, rescale, saturate.
vmull.s16 q0, d0, d1
vqmovn.s32 d16, q8
vshr.s32 q0, q0, #14
@ d4/d5 are unmodified since the load, so this store rewrites the same
@ values and serves to advance r4.
vst2.16 {d4, d5}, [r4, :128]! @ &efw[i];
vshl.s32 q0, q0, q9
vst1.16 d16, [r7, :64]! @ output[i]
vqmovn.s32 d0, q0
subs r3, #1
vst1.16 d0, [r8, :64]! @ aecm->outBuf[i]
bgt LOOP_POST_IFFT
@ Move the second halves of xBuf and dBufNoisy to their first halves.
movw r3, #offset_aecm_xBuf
movw r12, #offset_aecm_dBufNoisy
ldr r3, [r5, r3] @ &aecm->xBuf[0]
ldr r1, [r5, r12] @ &aecm->dBufNoisy[0]
add r2, r3, #(PART_LEN * 2) @ &aecm->xBuf[PART_LEN]
add r0, r1, #(PART_LEN * 2) @ &aecm->dBufNoisy[PART_LEN]
mov r4, #(PART_LEN / 16) @ Loop counter, unrolled by 16.
LOOP_COPY:
vld1.16 {q10, q11}, [r2, :256]!
vld1.16 {q12, q13}, [r0, :256]!
subs r4, #1
vst1.16 {q10, q11}, [r3, :256]!
vst1.16 {q12, q13}, [r1, :256]!
bgt LOOP_COPY
@ NOTE(review): six registers (24 bytes) were pushed on entry, so [sp, #16]
@ addresses a saved-register slot rather than the first stack argument;
@ WebRtcAecm_WindowAndFFTNeon uses #16 after pushing only four registers.
@ Verify this offset.
ldr r2, [sp, #16]
cmp r2, #0 @ Check if (nearendClean != NULL).
beq END
@ Same half-buffer move for dBufClean, fully unrolled as four 32-byte
@ load/store pairs.
movw r4, #offset_aecm_dBufClean
ldr r1, [r5, r4] @ &aecm->dBufClean[0]
add r0, r1, #(PART_LEN * 2) @ &aecm->dBufClean[PART_LEN]
vld1.16 {q10, q11}, [r0, :256]!
vld1.16 {q12, q13}, [r0, :256]!
vst1.16 {q10, q11}, [r1, :256]!
vst1.16 {q12, q13}, [r1, :256]!
vld1.16 {q10, q11}, [r0, :256]!
vld1.16 {q12, q13}, [r0, :256]!
vst1.16 {q10, q11}, [r1, :256]!
vst1.16 {q12, q13}, [r1, :256]!
END:
pop {r4-r8, pc}
@ void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore_t* aecm,
@ const uint16_t* far_spectrum,
@ int32_t* echo_est,

View File

@ -12,7 +12,6 @@
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -436,26 +435,6 @@ static const int16_t kDeterminantEstMatrix[66] = {
355, 330
};
// Declare function pointers.
NoiseEstimation WebRtcNsx_NoiseEstimation;
PrepareSpectrum WebRtcNsx_PrepareSpectrum;
SynthesisUpdate WebRtcNsx_SynthesisUpdate;
AnalysisUpdate WebRtcNsx_AnalysisUpdate;
Denormalize WebRtcNsx_Denormalize;
CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
// Redirect every NSX dispatch pointer to its ARM Neon implementation.
// The assignments are independent of one another, so their order is
// irrelevant.
static void WebRtcNsx_InitNeon(void) {
  WebRtcNsx_CreateComplexBuffer = WebRtcNsx_CreateComplexBufferNeon;
  WebRtcNsx_Denormalize = WebRtcNsx_DenormalizeNeon;
  WebRtcNsx_AnalysisUpdate = WebRtcNsx_AnalysisUpdateNeon;
  WebRtcNsx_SynthesisUpdate = WebRtcNsx_SynthesisUpdateNeon;
  WebRtcNsx_PrepareSpectrum = WebRtcNsx_PrepareSpectrumNeon;
  WebRtcNsx_NoiseEstimation = WebRtcNsx_NoiseEstimationNeon;
}
#endif
// Update the noise estimation information.
static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) {
int32_t tmp32no1 = 0;
@ -614,7 +593,6 @@ static void NoiseEstimationC(NsxInst_t* inst,
// Filter the data in the frequency domain, and create spectrum.
static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) {
int i = 0, j = 0;
int16_t tmp16 = 0;
for (i = 0; i < inst->magnLen; i++) {
inst->real[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i],
@ -626,22 +604,19 @@ static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) {
freq_buf[0] = inst->real[0];
freq_buf[1] = -inst->imag[0];
for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
tmp16 = (inst->anaLen << 1) - j;
freq_buf[j] = inst->real[i];
freq_buf[j + 1] = -inst->imag[i];
freq_buf[tmp16] = inst->real[i];
freq_buf[tmp16 + 1] = inst->imag[i];
}
freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
}
// Denormalize the input buffer.
static __inline void DenormalizeC(NsxInst_t* inst, int16_t* in, int factor) {
int i = 0, j = 0;
// Denormalize the real-valued signal |in|, the output from inverse FFT.
static __inline void Denormalize(NsxInst_t* inst, int16_t* in, int factor) {
int i = 0;
int32_t tmp32 = 0;
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[j],
for (i = 0; i < inst->anaLen; i += 1) {
tmp32 = WEBRTC_SPL_SHIFT_W32((int32_t)in[i],
factor - inst->normData);
inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0
}
@ -701,18 +676,32 @@ static void AnalysisUpdateC(NsxInst_t* inst,
}
}
// Create a complex number buffer (out[]) as the input (in[]) interleaved with
// zeros, and normalize it.
static __inline void CreateComplexBufferC(NsxInst_t* inst,
int16_t* in,
int16_t* out) {
int i = 0, j = 0;
for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
out[j + 1] = 0; // Insert zeros in imaginary part
// Normalizes the real-valued signal |in| for the forward FFT: each of the
// first |inst->anaLen| samples is passed through WEBRTC_SPL_LSHIFT_W16 with
// |inst->normData| as the shift and written to the same index of |out|.
static __inline void NormalizeRealBuffer(NsxInst_t* inst,
                                         const int16_t* in,
                                         int16_t* out) {
  int k = 0;
  while (k < inst->anaLen) {
    out[k] = WEBRTC_SPL_LSHIFT_W16(in[k], inst->normData);  // Q(normData)
    k += 1;
  }
}
// Declare function pointers.
NoiseEstimation WebRtcNsx_NoiseEstimation;
PrepareSpectrum WebRtcNsx_PrepareSpectrum;
SynthesisUpdate WebRtcNsx_SynthesisUpdate;
AnalysisUpdate WebRtcNsx_AnalysisUpdate;
#if (defined WEBRTC_DETECT_ARM_NEON || defined WEBRTC_ARCH_ARM_NEON)
// Redirect every NSX dispatch pointer to its ARM Neon implementation.
// The assignments are independent of one another, so their order is
// irrelevant.
static void WebRtcNsx_InitNeon(void) {
  WebRtcNsx_AnalysisUpdate = WebRtcNsx_AnalysisUpdateNeon;
  WebRtcNsx_SynthesisUpdate = WebRtcNsx_SynthesisUpdateNeon;
  WebRtcNsx_PrepareSpectrum = WebRtcNsx_PrepareSpectrumNeon;
  WebRtcNsx_NoiseEstimation = WebRtcNsx_NoiseEstimationNeon;
}
#endif
void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst,
int16_t pink_noise_exp_avg,
int32_t pink_noise_num_avg,
@ -900,17 +889,14 @@ int32_t WebRtcNsx_InitCore(NsxInst_t* inst, uint32_t fs) {
WebRtcNsx_PrepareSpectrum = PrepareSpectrumC;
WebRtcNsx_SynthesisUpdate = SynthesisUpdateC;
WebRtcNsx_AnalysisUpdate = AnalysisUpdateC;
WebRtcNsx_Denormalize = DenormalizeC;
WebRtcNsx_CreateComplexBuffer = CreateComplexBufferC;
#ifdef WEBRTC_DETECT_ARM_NEON
uint64_t features = WebRtc_GetCPUFeaturesARM();
if ((features & kCPUFeatureNEON) != 0)
{
WebRtcNsx_InitNeon();
}
uint64_t features = WebRtc_GetCPUFeaturesARM();
if ((features & kCPUFeatureNEON) != 0) {
WebRtcNsx_InitNeon();
}
#elif defined(WEBRTC_ARCH_ARM_NEON)
WebRtcNsx_InitNeon();
WebRtcNsx_InitNeon();
#endif
inst->initFlag = 1;
@ -1606,7 +1592,7 @@ void WebRtcNsx_DataAnalysis(NsxInst_t* inst, short* speechFrame, uint16_t* magnU
right_shifts_in_magnU16 = WEBRTC_SPL_MAX(right_shifts_in_magnU16, 0);
// Normalize winData into realImag, the real-valued input to the forward FFT.
WebRtcNsx_CreateComplexBuffer(inst, winData, realImag);
NormalizeRealBuffer(inst, winData, realImag);
// FFT output will be in winData[].
WebRtcSpl_RealForwardFFT(inst->real_fft, realImag, winData);
@ -1838,8 +1824,7 @@ void WebRtcNsx_DataSynthesis(NsxInst_t* inst, short* outFrame) {
// Inverse FFT output will be in rfft_out[].
outCIFFT = WebRtcSpl_RealInverseFFT(inst->real_fft, realImag, rfft_out);
// Denormalize.
WebRtcNsx_Denormalize(inst, rfft_out, outCIFFT);
Denormalize(inst, rfft_out, outCIFFT);
//scale factor: only do it after END_STARTUP_LONG time
gainFactor = 8192; // 8192 = Q13(1.0)

View File

@ -201,19 +201,6 @@ typedef void (*AnalysisUpdate)(NsxInst_t* inst,
int16_t* new_speech);
extern AnalysisUpdate WebRtcNsx_AnalysisUpdate;
// Denormalize the input buffer.
typedef void (*Denormalize)(NsxInst_t* inst,
int16_t* in,
int factor);
extern Denormalize WebRtcNsx_Denormalize;
// Create a complex number buffer, as the input interleaved with zeros,
// and normalize it.
typedef void (*CreateComplexBuffer)(NsxInst_t* inst,
int16_t* in,
int16_t* out);
extern CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
#if (defined WEBRTC_DETECT_ARM_NEON) || defined (WEBRTC_ARCH_ARM_NEON)
// For the above function pointers, functions for generic platforms are declared
// and defined as static in file nsx_core.c, while those for ARM Neon platforms
@ -222,16 +209,12 @@ void WebRtcNsx_NoiseEstimationNeon(NsxInst_t* inst,
uint16_t* magn,
uint32_t* noise,
int16_t* q_noise);
void WebRtcNsx_CreateComplexBufferNeon(NsxInst_t* inst,
int16_t* in,
int16_t* out);
void WebRtcNsx_SynthesisUpdateNeon(NsxInst_t* inst,
int16_t* out_frame,
int16_t gain_factor);
void WebRtcNsx_AnalysisUpdateNeon(NsxInst_t* inst,
int16_t* out,
int16_t* new_speech);
void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor);
void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buff);
#endif

View File

@ -20,8 +20,6 @@ GLOBAL_FUNCTION WebRtcNsx_NoiseEstimationNeon
GLOBAL_FUNCTION WebRtcNsx_PrepareSpectrumNeon
GLOBAL_FUNCTION WebRtcNsx_SynthesisUpdateNeon
GLOBAL_FUNCTION WebRtcNsx_AnalysisUpdateNeon
GLOBAL_FUNCTION WebRtcNsx_DenormalizeNeon
GLOBAL_FUNCTION WebRtcNsx_CreateComplexBufferNeon
GLOBAL_LABEL WebRtcNsx_kLogTable
GLOBAL_LABEL WebRtcNsx_kCounterDiv
GLOBAL_LABEL WebRtcNsx_kLogTableFrac
@ -426,6 +424,7 @@ POST_LOOP_MAGNLEN:
pop {r4, r5, r6, pc}
@ TODO(kma): Remove copying to 2nd half of freq_buf, for real FFT interface.
@ void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf);
.align 2
DEFINE_FUNCTION WebRtcNsx_PrepareSpectrumNeon
@ -542,35 +541,6 @@ LOOP_ANALEN2:
pop {r4-r9}
bx r14
@ void WebRtcNsx_DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor);
@
@ Takes every second 16-bit element of |in|, shifts it by
@ (factor - inst->normData), saturates to 16 bits and stores the results
@ consecutively in inst->real[], until inst->anaLen results are written.
@ On entry: r0 = inst, r1 = in, r2 = factor.
.align 2
DEFINE_FUNCTION WebRtcNsx_DenormalizeNeon
movw r12, #offset_nsx_normData
movw r3, #offset_nsx_real
ldr r12, [r0, r12] @ inst->normData
add r3, r0 @ &inst->real[0]
@ r2 = factor - inst->normData, broadcast to four 32-bit lanes.
sub r2, r12
vdup.32 q10, r2
movw r2, #offset_nsx_anaLen
ldrsh r2, [r0, r2] @ inst->anaLen
@ End pointer for the loop: anaLen 16-bit elements past &inst->real[0].
add r0, r3, r2, lsl #1 @ &inst->real[inst->anaLen]
LOOP_ANALEN:
@ vld2 de-interleaves: d0/d2 receive the even-indexed elements, d1/d3 the
@ odd-indexed ones, which are not used below.
vld2.16 {d0, d1}, [r1]! @ &in[]
vld2.16 {d2, d3}, [r1]! @ &in[]
@ Sign-extend to 32 bits before shifting.
vmovl.s16 q2, d0
vmovl.s16 q3, d2
vshl.s32 q2, q10
vshl.s32 q3, q10
@ Saturating narrow back to 16 bits.
vqmovn.s32 d0, q2
vqmovn.s32 d1, q3
vst1.16 {d0, d1}, [r3]! @ inst->real[]
cmp r3, r0
blt LOOP_ANALEN
bx r14
@ void SynthesisUpdateNeon(NsxInst_t* inst,
@ int16_t* out_frame,
@ int16_t gain_factor);
@ -704,33 +674,3 @@ LOOP_WINDOW_DATA:
POST_LOOP_WINDOW_DATA:
pop {r4-r6}
bx r14
@ void CreateComplexBufferNeon(NsxInst_t* inst, int16_t* in, int16_t* out);
@
@ Shifts each of the inst->anaLen 16-bit elements of |in| by inst->normData
@ and writes it to |out| followed by a zero (the imaginary part).
@ On entry: r0 = inst, r1 = in, r2 = out.
.align 2
DEFINE_FUNCTION WebRtcNsx_CreateComplexBufferNeon
movw r3, #offset_nsx_anaLen
movw r12, #offset_nsx_normData
ldrsh r3, [r0, r3] @ inst->anaLen
ldr r12, [r0, r12] @ inst->normData
@ End pointer for the loop.
add r3, r1, r3, lsl #1 @ &in[inst->anaLen]
vmov.i16 d7, #0 @ For writing to imaginary parts.
vmov.i16 d5, #0 @ For writing to imaginary parts.
@ Broadcast the shift count to all eight 16-bit lanes.
vdup.i16 q10, r12
LOOP_CREATE_COMPLEX_BUFFER: @ Unrolled by 16.
vld1.16 {d0, d1, d2, d3}, [r1]! @ in[]
cmp r1, r3
vshl.s16 q0, q10
vshl.s16 q1, q10
@ Rearrange so that each vst2 pair is {data, zeros}: d1 and d3 are moved out
@ to d4 and d6, then overwritten with the zero registers d5 and d7.
vmov d4, d1
vmov d1, d5
vmov d6, d3
vmov d3, d7
@ Each vst2 interleaves four shifted samples with four zeros, in input order
@ (d0, then d4 = old d1, then d2, then d6 = old d3).
vst2.16 {d0, d1}, [r2]!
vst2.16 {d4, d5}, [r2]!
vst2.16 {d2, d3}, [r2]!
vst2.16 {d6, d7}, [r2]!
@ The flags tested here come from the cmp above; the intervening NEON
@ instructions sit between the compare and the branch.
blt LOOP_CREATE_COMPLEX_BUFFER
bx r14