From b59c0316606017524befe6631f75add61b634e37 Mon Sep 17 00:00:00 2001 From: "kma@webrtc.org" Date: Sat, 3 Dec 2011 18:34:50 +0000 Subject: [PATCH] For Android ARMv7 platforms, added a feature of dynamically detecting the existence of Neon, and when it's present, switch to some functions optimized for Neon at run time. Review URL: http://webrtc-codereview.appspot.com/268002 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1096 4adac7df-926f-26a2-2b94-8c16560cd09d --- Android.mk | 12 + android-webrtc.mk | 3 +- src/modules/audio_processing/aecm/Android.mk | 46 +- src/modules/audio_processing/aecm/aecm_core.c | 406 ++++++------- src/modules/audio_processing/aecm/aecm_core.h | 54 +- .../audio_processing/aecm/aecm_core_neon.c | 475 ++++++++------- src/modules/audio_processing/ns/Android.mk | 47 +- src/modules/audio_processing/ns/nsx_core.c | 543 +++++++++--------- src/modules/audio_processing/ns/nsx_core.h | 53 +- .../audio_processing/ns/nsx_core_neon.c | 80 +-- .../interface/cpu_features_wrapper.h | 17 +- src/system_wrappers/source/Android.mk | 1 + src/system_wrappers/source/cpu_features_arm.c | 333 +++++++++++ 13 files changed, 1274 insertions(+), 796 deletions(-) create mode 100644 src/system_wrappers/source/cpu_features_arm.c diff --git a/Android.mk b/Android.mk index bd050e6af0..245c4249f9 100644 --- a/Android.mk +++ b/Android.mk @@ -54,6 +54,7 @@ include $(MY_WEBRTC_ROOT_PATH)/libvpx.mk LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) +include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk LOCAL_ARM_MODE := arm LOCAL_MODULE := libwebrtc_audio_preprocessing @@ -71,6 +72,17 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libwebrtc_aecm \ libwebrtc_system_wrappers +# Add Neon libraries. +ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS))) +LOCAL_WHOLE_STATIC_LIBRARIES += \ + libwebrtc_aecm_neon \ + libwebrtc_ns_neon +else ifeq ($(ARCH_ARM_HAVE_NEON),true) +LOCAL_WHOLE_STATIC_LIBRARIES += \ + libwebrtc_aecm_neon \ + libwebrtc_ns_neon +endif + LOCAL_STATIC_LIBRARIES := \ libprotobuf-cpp-2.3.0-lite diff --git a/android-webrtc.mk b/android-webrtc.mk index 9a8c861b73..eb620bb3b4 100644 --- a/android-webrtc.mk +++ b/android-webrtc.mk @@ -21,8 +21,9 @@ MY_WEBRTC_COMMON_DEFS := \ # '-DWEBRTC_MODULE_UTILITY_VIDEO' [module media_file] [module utility] ifeq ($(TARGET_ARCH),arm) MY_WEBRTC_COMMON_DEFS += \ - '-DWEBRTC_ARM_INLINE_CALLS' \ '-DWEBRTC_ARCH_ARM' +# '-DWEBRTC_DETECT_ARM_NEON' # only used in a build configuration without Neon +# TODO(kma): figure out if the above define could be moved to NDK build only. # TODO(kma): test if the code under next two macros works with generic GCC compilers ifeq ($(ARCH_ARM_HAVE_NEON),true) diff --git a/src/modules/audio_processing/aecm/Android.mk b/src/modules/audio_processing/aecm/Android.mk index 916c5a8635..c33a957e5f 100644 --- a/src/modules/audio_processing/aecm/Android.mk +++ b/src/modules/audio_processing/aecm/Android.mk @@ -6,6 +6,9 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. +############################# +# Build the non-neon library. + LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) @@ -21,21 +24,16 @@ LOCAL_SRC_FILES := \ aecm_core.c # Flags passed to both C and C++ files. 
-LOCAL_CFLAGS := \ - $(MY_WEBRTC_COMMON_DEFS) - -ifeq ($(ARCH_ARM_HAVE_NEON),true) -LOCAL_SRC_FILES += \ - aecm_core_neon.c -LOCAL_CFLAGS += \ - $(MY_ARM_CFLAGS_NEON) -endif +LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS) LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/interface \ $(LOCAL_PATH)/../utility \ $(LOCAL_PATH)/../../.. \ - $(LOCAL_PATH)/../../../common_audio/signal_processing/include + $(LOCAL_PATH)/../../../common_audio/signal_processing/include \ + $(LOCAL_PATH)/../../../system_wrappers/interface + +LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers LOCAL_SHARED_LIBRARIES := \ libcutils \ @@ -46,3 +44,31 @@ ifndef NDK_ROOT include external/stlport/libstlport.mk endif include $(BUILD_STATIC_LIBRARY) + +######################### +# Build the neon library. + +include $(CLEAR_VARS) + +LOCAL_ARM_MODE := arm +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +LOCAL_MODULE := libwebrtc_aecm_neon +LOCAL_MODULE_TAGS := optional + +LOCAL_SRC_FILES := aecm_core_neon.c + +# Flags passed to both C and C++ files. +LOCAL_CFLAGS := \ + $(MY_WEBRTC_COMMON_DEFS) \ + -mfpu=neon \ + -flax-vector-conversions + +LOCAL_C_INCLUDES := \ + $(LOCAL_PATH)/interface \ + $(LOCAL_PATH)/../../.. \ + $(LOCAL_PATH)/../../../common_audio/signal_processing/include + +ifndef NDK_ROOT +include external/stlport/libstlport.mk +endif +include $(BUILD_STATIC_LIBRARY) diff --git a/src/modules/audio_processing/aecm/aecm_core.c b/src/modules/audio_processing/aecm/aecm_core.c index 4ad705e506..f2e4683288 100644 --- a/src/modules/audio_processing/aecm/aecm_core.c +++ b/src/modules/audio_processing/aecm/aecm_core.c @@ -13,8 +13,9 @@ #include #include -#include "echo_control_mobile.h" +#include "cpu_features_wrapper.h" #include "delay_estimator_wrapper.h" +#include "echo_control_mobile.h" #include "ring_buffer.h" #include "typedefs.h" @@ -263,6 +264,13 @@ static const uint16_t* AlignedFarend(AecmCore_t* self, int* far_q, int delay) { HANDLE logFile = NULL; #endif +// Declare function pointers. 
+CalcLinearEnergies WebRtcAecm_CalcLinearEnergies; +StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel; +ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel; +WindowAndFFT WebRtcAecm_WindowAndFFT; +InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow; + int WebRtcAecm_CreateCore(AecmCore_t **aecmInst) { AecmCore_t *aecm = malloc(sizeof(AecmCore_t)); @@ -346,6 +354,194 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const WebRtc_Word16* echo_pat aecm->mseChannelCount = 0; } +static void WindowAndFFTC(WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + complex16_t* freq_signal, + int time_signal_scaling) +{ + int i, j; + + memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); + // FFT of signal + for (i = 0, j = 0; i < PART_LEN; i++, j += 2) + { + // Window time domain signal and insert into real part of + // transformation array |fft| + fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + (time_signal[i] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[i], + 14); + fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + (time_signal[i + PART_LEN] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[PART_LEN - i], + 14); + // Inserting zeros in imaginary parts not necessary since we + // initialized the array with all zeros + } + + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); + + // Take only the first PART_LEN2 samples + for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2) + { + freq_signal[i].real = fft[j]; + + // The imaginary part has to switch sign + freq_signal[i].imag = - fft[j+1]; + } +} + +static void InverseFFTAndWindowC(AecmCore_t* aecm, + WebRtc_Word16* fft, + complex16_t* efw, + WebRtc_Word16* output, + const WebRtc_Word16* nearendClean) +{ + int i, j, outCFFT; + WebRtc_Word32 tmp32no1; + + // Synthesis + for (i = 1; i < PART_LEN; i++) + { + j = WEBRTC_SPL_LSHIFT_W32(i, 1); + fft[j] = efw[i].real; + + // mirrored data, even + fft[PART_LEN4 - j] = efw[i].real; + fft[j + 1] = -efw[i].imag; + + //mirrored data, odd + fft[PART_LEN4 - (j - 1)] = efw[i].imag; + } + fft[0] = efw[0].real; + fft[1] = -efw[0].imag; + + fft[PART_LEN2] = efw[PART_LEN].real; + fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; + + // inverse FFT, result should be scaled with outCFFT + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); + + //take only the real values and scale with outCFFT + for (i = 0; i < PART_LEN2; i++) + { + j = WEBRTC_SPL_LSHIFT_W32(i, 1); + fft[i] = fft[j]; + } + + for (i = 0; i < PART_LEN; i++) + { + fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + fft[i], + WebRtcAecm_kSqrtHanning[i], + 14); + tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], + outCFFT - aecm->dfaCleanQDomain); + fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, + tmp32no1 + aecm->outBuf[i], + WEBRTC_SPL_WORD16_MIN); + output[i] = fft[i]; + + tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( + fft[PART_LEN + i], + WebRtcAecm_kSqrtHanning[PART_LEN - i], + 14); + tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, + outCFFT - aecm->dfaCleanQDomain); + aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( + WEBRTC_SPL_WORD16_MAX, + tmp32no1, + WEBRTC_SPL_WORD16_MIN); + } + +#ifdef ARM_WINM_LOG_ + // measure tick end + QueryPerformanceCounter((LARGE_INTEGER*)&end); + diff__ = ((end - start) * 1000) / (freq/1000); + milliseconds = (unsigned int)(diff__ & 0xffffffff); + WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL); +#endif + + // Copy the current block to the old position 
(aecm->outBuf is shifted elsewhere) + memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); + memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); + if (nearendClean != NULL) + { + memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); + } +} + +static void CalcLinearEnergiesC(AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est, + WebRtc_UWord32* far_energy, + WebRtc_UWord32* echo_energy_adapt, + WebRtc_UWord32* echo_energy_stored) +{ + int i; + + // Get energy for the delayed far end signal and estimated + // echo using both stored and adapted channels. + for (i = 0; i < PART_LEN1; i++) + { + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); + (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]); + (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], + far_spectrum[i]); + (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i]; + } +} + +static void StoreAdaptiveChannelC(AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est) +{ + int i; + + // During startup we store the channel every block. + memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); + // Recalculate echo estimate + for (i = 0; i < PART_LEN; i += 4) + { + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); + echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], + far_spectrum[i + 1]); + echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], + far_spectrum[i + 2]); + echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], + far_spectrum[i + 3]); + } + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); +} + +static void ResetAdaptiveChannelC(AecmCore_t* aecm) +{ + int i; + + // The stored channel has a significantly lower MSE than the adaptive one for + // two consecutive calculations. Reset the adaptive channel. + memcpy(aecm->channelAdapt16, aecm->channelStored, + sizeof(WebRtc_Word16) * PART_LEN1); + // Restore the W32 channel + for (i = 0; i < PART_LEN; i += 4) + { + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i], 16); + aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 1], 16); + aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 2], 16); + aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 3], 16); + } + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); +} + // WebRtcAecm_InitCore(...) // // This function initializes the AECM instant created with WebRtcAecm_CreateCore(...) @@ -463,6 +659,23 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq) assert(PART_LEN % 16 == 0); + // Initialize function pointers. 
+ WebRtcAecm_WindowAndFFT = WindowAndFFTC; + WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC; + WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesC; + WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelC; + WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelC; + +#ifdef WEBRTC_DETECT_ARM_NEON + uint64_t features = WebRtc_GetCPUFeaturesARM(); + if ((features & kCPUFeatureNEON) != 0) + { + WebRtcAecm_InitNeon(); + } +#elif defined(WEBRTC_ARCH_ARM_NEON) + WebRtcAecm_InitNeon(); +#endif + return 0; } @@ -1890,194 +2103,3 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far aecm->farBufReadPos += readLen; } -#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) - -void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft, - const WebRtc_Word16* time_signal, - complex16_t* freq_signal, - int time_signal_scaling) -{ - int i, j; - - memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); - // FFT of signal - for (i = 0, j = 0; i < PART_LEN; i++, j += 2) - { - // Window time domain signal and insert into real part of - // transformation array |fft| - fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[i] << time_signal_scaling), - WebRtcAecm_kSqrtHanning[i], - 14); - fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[i + PART_LEN] << time_signal_scaling), - WebRtcAecm_kSqrtHanning[PART_LEN - i], - 14); - // Inserting zeros in imaginary parts not necessary since we - // initialized the array with all zeros - } - - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); - - // Take only the first PART_LEN2 samples - for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2) - { - freq_signal[i].real = fft[j]; - - // The imaginary part has to switch sign - freq_signal[i].imag = - fft[j+1]; - } -} - -void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm, - WebRtc_Word16* fft, - complex16_t* efw, - WebRtc_Word16* output, - const WebRtc_Word16* nearendClean) -{ - int i, j, outCFFT; - WebRtc_Word32 tmp32no1; - - // Synthesis - for (i = 1; i < PART_LEN; i++) - { - j = WEBRTC_SPL_LSHIFT_W32(i, 1); - fft[j] = efw[i].real; - - // mirrored data, even - fft[PART_LEN4 - j] = efw[i].real; - fft[j + 1] = -efw[i].imag; - - //mirrored data, odd - fft[PART_LEN4 - (j - 1)] = efw[i].imag; - } - fft[0] = efw[0].real; - fft[1] = -efw[0].imag; - - fft[PART_LEN2] = efw[PART_LEN].real; - fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; - - // inverse FFT, result should be scaled with outCFFT - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); - - //take only the real values and scale with outCFFT - for (i = 0; i < PART_LEN2; i++) - { - j = WEBRTC_SPL_LSHIFT_W32(i, 1); - fft[i] = fft[j]; - } - - for (i = 0; i < PART_LEN; i++) - { - fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - fft[i], - WebRtcAecm_kSqrtHanning[i], - 14); - tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], - outCFFT - aecm->dfaCleanQDomain); - fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, - tmp32no1 + aecm->outBuf[i], - WEBRTC_SPL_WORD16_MIN); - output[i] = fft[i]; - - tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( - fft[PART_LEN + i], - WebRtcAecm_kSqrtHanning[PART_LEN - i], - 14); - tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, - outCFFT - aecm->dfaCleanQDomain); - aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( - WEBRTC_SPL_WORD16_MAX, - tmp32no1, - WEBRTC_SPL_WORD16_MIN); - } - -#ifdef ARM_WINM_LOG_ - // measure tick end - QueryPerformanceCounter((LARGE_INTEGER*)&end); 
- diff__ = ((end - start) * 1000) / (freq/1000); - milliseconds = (unsigned int)(diff__ & 0xffffffff); - WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL); -#endif - - // Copy the current block to the old position (aecm->outBuf is shifted elsewhere) - memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); - memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); - if (nearendClean != NULL) - { - memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); - } -} - -void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est, - WebRtc_UWord32* far_energy, - WebRtc_UWord32* echo_energy_adapt, - WebRtc_UWord32* echo_energy_stored) -{ - int i; - - // Get energy for the delayed far end signal and estimated - // echo using both stored and adapted channels. - for (i = 0; i < PART_LEN1; i++) - { - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); - (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]); - (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], - far_spectrum[i]); - (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i]; - } -} - -void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est) -{ - int i; - - // During startup we store the channel every block. - memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); - // Recalculate echo estimate - for (i = 0; i < PART_LEN; i += 4) - { - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); - echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], - far_spectrum[i + 1]); - echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], - far_spectrum[i + 2]); - echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], - far_spectrum[i + 3]); - } - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); -} - -void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm) -{ - int i; - - // The stored channel has a significantly lower MSE than the adaptive one for - // two consecutive calculations. Reset the adaptive channel. 
- memcpy(aecm->channelAdapt16, aecm->channelStored, - sizeof(WebRtc_Word16) * PART_LEN1); - // Restore the W32 channel - for (i = 0; i < PART_LEN; i += 4) - { - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i], 16); - aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 1], 16); - aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 2], 16); - aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 3], 16); - } - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); -} - -#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) diff --git a/src/modules/audio_processing/aecm/aecm_core.h b/src/modules/audio_processing/aecm/aecm_core.h index dede6d39be..0ec62ec24d 100644 --- a/src/modules/audio_processing/aecm/aecm_core.h +++ b/src/modules/audio_processing/aecm/aecm_core.h @@ -332,32 +332,44 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend, const int farLen, const int knownDelay); -/////////////////////////////////////////////////////////////////////////////////////////////// -// Some internal functions shared by ARM NEON and generic C code: +/////////////////////////////////////////////////////////////////////////////// +// Some function pointers, for internal functions shared by ARM NEON and +// generic C code. // +typedef void (*CalcLinearEnergies)( + AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echoEst, + WebRtc_UWord32* far_energy, + WebRtc_UWord32* echo_energy_adapt, + WebRtc_UWord32* echo_energy_stored); +extern CalcLinearEnergies WebRtcAecm_CalcLinearEnergies; -void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echoEst, - WebRtc_UWord32* far_energy, - WebRtc_UWord32* echo_energy_adapt, - WebRtc_UWord32* echo_energy_stored); +typedef void (*StoreAdaptiveChannel)( + AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est); +extern StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel; -void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est); +typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm); +extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel; -void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm); +typedef void (*WindowAndFFT)( + WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + complex16_t* freq_signal, + int time_signal_scaling); +extern WindowAndFFT WebRtcAecm_WindowAndFFT; -void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft, - const WebRtc_Word16* time_signal, - complex16_t* freq_signal, - int time_signal_scaling); +typedef void (*InverseFFTAndWindow)( + AecmCore_t* aecm, + WebRtc_Word16* fft, complex16_t* efw, + WebRtc_Word16* output, + const WebRtc_Word16* nearendClean); +extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow; + +// Initialization of the above function pointers for ARM Neon. 
+void WebRtcAecm_InitNeon(void); -void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm, - WebRtc_Word16* fft, - complex16_t* efw, - WebRtc_Word16* output, - const WebRtc_Word16* nearendClean); #endif diff --git a/src/modules/audio_processing/aecm/aecm_core_neon.c b/src/modules/audio_processing/aecm/aecm_core_neon.c index 86ced1ed3b..ab448b48da 100644 --- a/src/modules/audio_processing/aecm/aecm_core_neon.c +++ b/src/modules/audio_processing/aecm/aecm_core_neon.c @@ -7,7 +7,6 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON) #include "aecm_core.h" @@ -16,299 +15,289 @@ // Square root of Hanning window in Q14. -static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = { - 16384, 16373, 16354, 16325, - 16286, 16237, 16179, 16111, - 16034, 15947, 15851, 15746, - 15631, 15506, 15373, 15231, - 15079, 14918, 14749, 14571, - 14384, 14189, 13985, 13773, - 13553, 13325, 13089, 12845, - 12594, 12335, 12068, 11795, - 11514, 11227, 10933, 10633, - 10326, 10013, 9695, 9370, - 9040, 8705, 8364, 8019, - 7668, 7313, 6954, 6591, - 6224, 5853, 5478, 5101, - 4720, 4337, 3951, 3562, - 3172, 2780, 2386, 1990, - 1594, 1196, 798, 399 +static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = { + 16384, 16373, 16354, 16325, + 16286, 16237, 16179, 16111, + 16034, 15947, 15851, 15746, + 15631, 15506, 15373, 15231, + 15079, 14918, 14749, 14571, + 14384, 14189, 13985, 13773, + 13553, 13325, 13089, 12845, + 12594, 12335, 12068, 11795, + 11514, 11227, 10933, 10633, + 10326, 10013, 9695, 9370, + 9040, 8705, 8364, 8019, + 7668, 7313, 6954, 6591, + 6224, 5853, 5478, 5101, + 4720, 4337, 3951, 3562, + 3172, 2780, 2386, 1990, + 1594, 1196, 798, 399 }; -void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft, +static void WindowAndFFTNeon(WebRtc_Word16* fft, const WebRtc_Word16* time_signal, complex16_t* freq_signal, - int time_signal_scaling) -{ - int i, j; + int time_signal_scaling) { + int i, j; - int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); - __asm__("vmov.i16 d21, #0" ::: "d21"); + int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); + __asm__("vmov.i16 d21, #0" ::: "d21"); - for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8) - { - int16x4_t tmp16x4_0; - int16x4_t tmp16x4_1; - int32x4_t tmp32x4_0; + for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) { + int16x4_t tmp16x4_0; + int16x4_t tmp16x4_1; + int32x4_t tmp32x4_0; - /* Window near end */ - // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] - // << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); - tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); + /* Window near end */ + // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] + // << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); + tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); - tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); + tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); - __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); - __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); + __asm__("vshrn.i32 d20, %q0, #14" : : 
"w"(tmp32x4_0) : "d20"); + __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); - // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - // (time_signal[PART_LEN + i] << time_signal_scaling), - // WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN])); - tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); + // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + // (time_signal[PART_LEN + i] << time_signal_scaling), + // WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN])); + tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); - tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); + tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); - __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); - __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); - } + __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); + __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); + } - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); - // Take only the first PART_LEN2 samples, and switch the sign of the imaginary part. - for(i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) - { - __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); - __asm__("vneg.s16 d22, d22" : : : "q10"); - __asm__("vneg.s16 d23, d23" : : : "q11"); - __asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : : + // Take only the first PART_LEN2 samples, and switch the sign of the imaginary part. + for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) { + __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); + __asm__("vneg.s16 d22, d22" : : : "q10"); + __asm__("vneg.s16 d23, d23" : : : "q11"); + __asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&freq_signal[i].real): "q10", "q11"); - } + } } -void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm, - WebRtc_Word16* fft, - complex16_t* efw, - WebRtc_Word16* output, - const WebRtc_Word16* nearendClean) -{ - int i, j, outCFFT; - WebRtc_Word32 tmp32no1; +static void InverseFFTAndWindowNeon(AecmCore_t* aecm, + WebRtc_Word16* fft, + complex16_t* efw, + WebRtc_Word16* output, + const WebRtc_Word16* nearendClean) { + int i, j, outCFFT; + WebRtc_Word32 tmp32no1; - // Synthesis - for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8) - { - // We overwrite two more elements in fft[], but it's ok. - __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10"); - __asm__("vmov q11, q10" : : : "q10", "q11"); + // Synthesis + for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) { + // We overwrite two more elements in fft[], but it's ok. 
+ __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10"); + __asm__("vmov q11, q10" : : : "q10", "q11"); - __asm__("vneg.s16 d23, d23" : : : "q11"); - __asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11"); + __asm__("vneg.s16 d23, d23" : : : "q11"); + __asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11"); - __asm__("vrev64.16 q10, q10" : : : "q10"); - __asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10"); - } + __asm__("vrev64.16 q10, q10" : : : "q10"); + __asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10"); + } - fft[PART_LEN2] = efw[PART_LEN].real; - fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; + fft[PART_LEN2] = efw[PART_LEN].real; + fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; - // Inverse FFT, result should be scaled with outCFFT. - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); + // Inverse FFT, result should be scaled with outCFFT. + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); - // Take only the real values and scale with outCFFT. - for (i = 0, j = 0; i < PART_LEN2; i += 8, j+= 16) - { - __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); - __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10"); - } + // Take only the real values and scale with outCFFT. + for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) { + __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); + __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10"); + } - int32x4_t tmp32x4_2; - __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32) - (outCFFT - aecm->dfaCleanQDomain))); - for (i = 0; i < PART_LEN; i += 4) - { - int16x4_t tmp16x4_0; - int16x4_t tmp16x4_1; - int32x4_t tmp32x4_0; - int32x4_t tmp32x4_1; + int32x4_t tmp32x4_2; + __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32) + (outCFFT - aecm->dfaCleanQDomain))); + for (i = 0; i < PART_LEN; i += 4) { + int16x4_t tmp16x4_0; + int16x4_t tmp16x4_1; + int32x4_t tmp32x4_0; + int32x4_t tmp32x4_1; - // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - // fft[i], WebRtcAecm_kSqrtHanning[i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i])); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); - __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); - __asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); + // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + // fft[i], WebRtcAecm_kSqrtHanning[i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i])); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); + __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); + __asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); - // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], - // outCFFT - aecm->dfaCleanQDomain); - __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); + // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], + // outCFFT - aecm->dfaCleanQDomain); + __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); - // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, - // tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN); - // output[i] = 
fft[i]; - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); - __asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); - __asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); - __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); - __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i])); - __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); + // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, + // tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN); + // output[i] = fft[i]; + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); + __asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); + __asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); + __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i])); + __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); - // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( - // fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i])); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); - __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); - __asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); + // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( + // fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i])); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); + __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); + __asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); - // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain); - __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); - // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( - // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); - __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); - __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); - } + // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain); + __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); + // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( + // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); + __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); + } - // Copy the current block to the old position (outBuf is shifted elsewhere). - for (i = 0; i < PART_LEN; i += 16) - { - __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + // Copy the current block to the old position (outBuf is shifted elsewhere). 
+ for (i = 0; i < PART_LEN; i += 16) { + __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i + PART_LEN]) : "q10"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10"); - } - for (i = 0; i < PART_LEN; i += 16) - { - __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10"); + } + for (i = 0; i < PART_LEN; i += 16) { + __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->dBufNoisy[i]): "q10"); + } + if (nearendClean != NULL) { + for (i = 0; i < PART_LEN; i += 16) { + __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->dBufClean[i + PART_LEN]) : "q10"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->dBufClean[i]): "q10"); } - if (nearendClean != NULL) { - for (i = 0; i < PART_LEN; i += 16) - { - __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : - "r"(&aecm->dBufClean[i + PART_LEN]) : "q10"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : - "r"(&aecm->dBufClean[i]): "q10"); - } - } + } } -void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm, +static void CalcLinearEnergiesNeon(AecmCore_t* aecm, const WebRtc_UWord16* far_spectrum, WebRtc_Word32* echo_est, WebRtc_UWord32* far_energy, WebRtc_UWord32* echo_energy_adapt, - WebRtc_UWord32* echo_energy_stored) -{ - int i; + WebRtc_UWord32* echo_energy_stored) { + int i; - register WebRtc_UWord32 far_energy_r; - register WebRtc_UWord32 echo_energy_stored_r; - register WebRtc_UWord32 echo_energy_adapt_r; - uint32x4_t tmp32x4_0; + register WebRtc_UWord32 far_energy_r; + register WebRtc_UWord32 echo_energy_stored_r; + register WebRtc_UWord32 echo_energy_adapt_r; + uint32x4_t tmp32x4_0; - __asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy - __asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored - __asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt + __asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy + __asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored + __asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt - for(i = 0; i < PART_LEN -7; i += 8) - { - // far_energy += (WebRtc_UWord32)(far_spectrum[i]); - __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); - __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); - __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); - - // Get estimated echo energies for adaptive channel and stored channel. 
- // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); - __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): - "q10", "q11"); - - // echo_energy_stored += (WebRtc_UWord32)echoEst[i]; - __asm__("vadd.u32 q8, q10" : : : "q10", "q8"); - __asm__("vadd.u32 q8, q11" : : : "q11", "q8"); - - // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16( - // aecm->channelAdapt16[i], far_spectrum[i]); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vadd.u32 q9, q10" : : : "q9", "q15"); - __asm__("vadd.u32 q9, q11" : : : "q9", "q11"); - } - - __asm__("vadd.u32 d28, d29" : : : "q14"); - __asm__("vpadd.u32 d28, d28" : : : "q14"); - __asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); - - __asm__("vadd.u32 d18, d19" : : : "q9"); - __asm__("vpadd.u32 d18, d18" : : : "q9"); - __asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); - - __asm__("vadd.u32 d16, d17" : : : "q8"); - __asm__("vpadd.u32 d16, d16" : : : "q8"); - __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); + for (i = 0; i < PART_LEN - 7; i += 8) { + // far_energy += (WebRtc_UWord32)(far_spectrum[i]); + __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); + __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); + __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); // Get estimated echo energies for adaptive channel and stored channel. 
- echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i]; - *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]); - *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16( - aecm->channelAdapt16[i], far_spectrum[i]); + // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); + __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): + "q10", "q11"); + + // echo_energy_stored += (WebRtc_UWord32)echoEst[i]; + __asm__("vadd.u32 q8, q10" : : : "q10", "q8"); + __asm__("vadd.u32 q8, q11" : : : "q11", "q8"); + + // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16( + // aecm->channelAdapt16[i], far_spectrum[i]); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm__("vadd.u32 q9, q10" : : : "q9", "q15"); + __asm__("vadd.u32 q9, q11" : : : "q9", "q11"); + } + + __asm__("vadd.u32 d28, d29" : : : "q14"); + __asm__("vpadd.u32 d28, d28" : : : "q14"); + __asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); + + __asm__("vadd.u32 d18, d19" : : : "q9"); + __asm__("vpadd.u32 d18, d18" : : : "q9"); + __asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); + + __asm__("vadd.u32 d16, d17" : : : "q8"); + __asm__("vpadd.u32 d16, d16" : : : "q8"); + __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); + + // Get estimated echo energies for adaptive channel and stored channel. + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i]; + *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]); + *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16( + aecm->channelAdapt16[i], far_spectrum[i]); } -void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, +static void StoreAdaptiveChannelNeon(AecmCore_t* aecm, const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est) -{ - int i; + WebRtc_Word32* echo_est) { + int i; - // During startup we store the channel every block. - // Recalculate echo estimate. - for(i = 0; i < PART_LEN -7; i += 8) - { - // aecm->channelStored[i] = acem->channelAdapt16[i]; - // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); - __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : - "r"(&echo_est[i]) : "q10", "q11"); - } - aecm->channelStored[i] = aecm->channelAdapt16[i]; - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + // During startup we store the channel every block. + // Recalculate echo estimate. 
+ for (i = 0; i < PART_LEN - 7; i += 8) { + // aecm->channelStored[i] = acem->channelAdapt16[i]; + // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); + __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&echo_est[i]) : "q10", "q11"); + } + aecm->channelStored[i] = aecm->channelAdapt16[i]; + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); } -void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm) -{ - int i; +static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) { + int i; - for(i = 0; i < PART_LEN -7; i += 8) - { - // aecm->channelAdapt16[i] = aecm->channelStored[i]; - // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32) - // aecm->channelStored[i], 16); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : - "r"(&aecm->channelStored[i]) : "q12"); - __asm__("vst1.16 {d24, d25}, [%0, :128]" : : - "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); - __asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : - "r"(&aecm->channelAdapt32[i]): "q10", "q11"); - } - aecm->channelAdapt16[i] = aecm->channelStored[i]; - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i], 16); + for (i = 0; i < PART_LEN - 7; i += 8) { + // aecm->channelAdapt16[i] = aecm->channelStored[i]; + // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32) + // aecm->channelStored[i], 16); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : + "r"(&aecm->channelStored[i]) : "q12"); + __asm__("vst1.16 {d24, d25}, [%0, :128]" : : + "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); + __asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->channelAdapt32[i]): "q10", "q11"); + } + aecm->channelAdapt16[i] = aecm->channelStored[i]; + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i], 16); } -#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON) +void WebRtcAecm_InitNeon(void) { + WebRtcAecm_WindowAndFFT = WindowAndFFTNeon; + WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon; + WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon; + WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon; + WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon; +} diff --git a/src/modules/audio_processing/ns/Android.mk b/src/modules/audio_processing/ns/Android.mk index 1363a93657..aba95e1d9f 100644 --- a/src/modules/audio_processing/ns/Android.mk +++ b/src/modules/audio_processing/ns/Android.mk @@ -6,6 +6,8 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. +############################# +# Build the non-neon library. LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) @@ -20,25 +22,20 @@ LOCAL_SRC_FILES := \ noise_suppression_x.c \ nsx_core.c -# floating point +# Files for floating point. # noise_suppression.c ns_core.c # Flags passed to both C and C++ files. 
-LOCAL_CFLAGS := \ - $(MY_WEBRTC_COMMON_DEFS) - -ifeq ($(ARCH_ARM_HAVE_NEON),true) -LOCAL_SRC_FILES += \ - nsx_core_neon.c -LOCAL_CFLAGS += \ - $(MY_ARM_CFLAGS_NEON) -endif +LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS) LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/interface \ $(LOCAL_PATH)/../utility \ $(LOCAL_PATH)/../../.. \ - $(LOCAL_PATH)/../../../common_audio/signal_processing/include + $(LOCAL_PATH)/../../../common_audio/signal_processing/include \ + $(LOCAL_PATH)/../../../system_wrappers/interface + +LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers LOCAL_SHARED_LIBRARIES := \ libcutils \ @@ -49,3 +46,31 @@ ifndef NDK_ROOT include external/stlport/libstlport.mk endif include $(BUILD_STATIC_LIBRARY) + +############################# +# Build the neon library. + +include $(CLEAR_VARS) + +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +LOCAL_MODULE := libwebrtc_ns_neon +LOCAL_MODULE_TAGS := optional +LOCAL_GENERATED_SOURCES := + +LOCAL_SRC_FILES := nsx_core_neon.c + +# Flags passed to both C and C++ files. +LOCAL_CFLAGS := \ + $(MY_WEBRTC_COMMON_DEFS) \ + -mfpu=neon \ + -flax-vector-conversions + +LOCAL_C_INCLUDES := \ + $(LOCAL_PATH)/interface \ + $(LOCAL_PATH)/../../.. \ + $(LOCAL_PATH)/../../../common_audio/signal_processing/include + +ifndef NDK_ROOT +include external/stlport/libstlport.mk +endif +include $(BUILD_STATIC_LIBRARY) diff --git a/src/modules/audio_processing/ns/nsx_core.c b/src/modules/audio_processing/ns/nsx_core.c index 66c49134f2..3879161e8b 100644 --- a/src/modules/audio_processing/ns/nsx_core.c +++ b/src/modules/audio_processing/ns/nsx_core.c @@ -16,6 +16,7 @@ #include #include +#include "cpu_features_wrapper.h" #include "nsx_core.h" // Skip first frequency bins during estimation. (0 <= value < 64) @@ -426,6 +427,271 @@ static const WebRtc_Word16 kDeterminantEstMatrix[66] = { 355, 330 }; +// Declare function pointers. +NoiseEstimation WebRtcNsx_NoiseEstimation; +PrepareSpectrum WebRtcNsx_PrepareSpectrum; +SynthesisUpdate WebRtcNsx_SynthesisUpdate; +AnalysisUpdate WebRtcNsx_AnalysisUpdate; +Denormalize WebRtcNsx_Denormalize; +CreateComplexBuffer WebRtcNsx_CreateComplexBuffer; + +// Update the noise estimation information. 
+static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { + WebRtc_Word32 tmp32no1 = 0; + WebRtc_Word32 tmp32no2 = 0; + WebRtc_Word16 tmp16 = 0; + const WebRtc_Word16 kExp2Const = 11819; // Q13 + + int i = 0; + + tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, + inst->magnLen); + // Guarantee a Q-domain as high as possible and still fit in int16 + inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + kExp2Const, tmp16, 21); + for (i = 0; i < inst->magnLen; i++) { + // inst->quantile[i]=exp(inst->lquantile[offset+i]); + // in Q21 + tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, + inst->noiseEstLogQuantile[offset + i]); + tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac + tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21); + tmp16 -= 21;// shift 21 to get result in Q0 + tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise) + if (tmp16 < 0) { + tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16); + } else { + tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16); + } + inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1); + } +} + +// Noise Estimation +static void NoiseEstimationC(NsxInst_t* inst, + uint16_t* magn, + uint32_t* noise, + int16_t* q_noise) { + WebRtc_Word32 numerator = FACTOR_Q16; + WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv; + WebRtc_Word16 countProd, delta, zeros, frac; + WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2; + const int16_t log2_const = 22713; // Q15 + const int16_t width_factor = 21845; + + int i, s, offset; + + tabind = inst->stages - inst->normData; + assert(tabind < 9); + assert(tabind > -9); + if (tabind < 0) { + logval = -WebRtcNsx_kLogTable[-tabind]; + } else { + logval = WebRtcNsx_kLogTable[tabind]; + } + + // lmagn(i)=log(magn(i))=log(2)*log2(magn(i)) + // magn is in Q(-stages), and the real lmagn values are: + // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages) + // lmagn in Q8 + for (i = 0; i < inst->magnLen; i++) { + if (magn[i]) { + zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]); + frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros) + & 0x7FFFFFFF) >> 23); + // log2(magn(i)) + assert(frac < 256); + log2 = (WebRtc_Word16)(((31 - zeros) << 8) + + WebRtcNsx_kLogTableFrac[frac]); + // log2(magn(i))*log(2) + lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15); + // + log(2^stages) + lmagn[i] += logval; + } else { + lmagn[i] = logval;//0; + } + } + + // loop over simultaneous estimates + for (s = 0; s < SIMULT; s++) { + offset = s * inst->magnLen; + + // Get counter values from state + counter = inst->noiseEstCounter[s]; + assert(counter < 201); + countDiv = WebRtcNsx_kCounterDiv[counter]; + countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv); + + // quant_est(...) + for (i = 0; i < inst->magnLen; i++) { + // compute delta + if (inst->noiseEstDensity[offset + i] > 512) { + delta = WebRtcSpl_DivW32W16ResW16(numerator, + inst->noiseEstDensity[offset + i]); + } else { + delta = FACTOR_Q7; + if (inst->blockIndex < END_STARTUP_LONG) { + // Smaller step size during startup. This prevents from using + // unrealistic values causing overflow. 
+ delta = FACTOR_Q7_STARTUP; + } + } + + // update log quantile estimate + tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14); + if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) { + // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2 + // CounterDiv=1/(inst->counter[s]+1) in Q15 + tmp16 += 2; + tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2); + inst->noiseEstLogQuantile[offset + i] += tmp16no1; + } else { + tmp16 += 1; + tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1); + // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2 + tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1); + inst->noiseEstLogQuantile[offset + i] -= tmp16no2; + if (inst->noiseEstLogQuantile[offset + i] < logval) { + // This is the smallest fixed point representation we can + // have, hence we limit the output. + inst->noiseEstLogQuantile[offset + i] = logval; + } + } + + // update density estimate + if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i]) + < WIDTH_Q8) { + tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + inst->noiseEstDensity[offset + i], countProd, 15); + tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + width_factor, countDiv, 15); + inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2; + } + } // end loop over magnitude spectrum + + if (counter >= END_STARTUP_LONG) { + inst->noiseEstCounter[s] = 0; + if (inst->blockIndex >= END_STARTUP_LONG) { + UpdateNoiseEstimate(inst, offset); + } + } + inst->noiseEstCounter[s]++; + + } // end loop over simultaneous estimates + + // Sequentially update the noise during startup + if (inst->blockIndex < END_STARTUP_LONG) { + UpdateNoiseEstimate(inst, offset); + } + + for (i = 0; i < inst->magnLen; i++) { + noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise) + } + (*q_noise) = (WebRtc_Word16)inst->qNoise; +} + +// Filter the data in the frequency domain, and create spectrum. +static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) { + int i = 0, j = 0; + int16_t tmp16 = 0; + + for (i = 0; i < inst->magnLen; i++) { + inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i], + (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) + inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i], + (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) + } + + freq_buf[0] = inst->real[0]; + freq_buf[1] = -inst->imag[0]; + for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { + tmp16 = (inst->anaLen << 1) - j; + freq_buf[j] = inst->real[i]; + freq_buf[j + 1] = -inst->imag[i]; + freq_buf[tmp16] = inst->real[i]; + freq_buf[tmp16 + 1] = inst->imag[i]; + } + freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; + freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; +} + +// Denormalize the input buffer. +static __inline void DenormalizeC(NsxInst_t* inst, int16_t* in, int factor) { + int i = 0, j = 0; + int32_t tmp32 = 0; + for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) { + tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j], + factor - inst->normData); + inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0 + } +} + +// For the noise supression process, synthesis, read out fully processed +// segment, and update synthesis buffer. 
+static void SynthesisUpdateC(NsxInst_t* inst, + int16_t* out_frame, + int16_t gain_factor) { + int i = 0; + int16_t tmp16a = 0; + int16_t tmp16b = 0; + int32_t tmp32 = 0; + + // synthesis + for (i = 0; i < inst->anaLen; i++) { + tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + inst->window[i], inst->real[i], 14); // Q0, window in Q14 + tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0 + // Down shift with rounding + tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0 + inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i], + tmp16b); // Q0 + } + + // read out fully processed segment + for (i = 0; i < inst->blockLen10ms; i++) { + out_frame[i] = inst->synthesisBuffer[i]; // Q0 + } + + // update synthesis buffer + WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer, + inst->synthesisBuffer + inst->blockLen10ms, + inst->anaLen - inst->blockLen10ms); + WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer + + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms); +} + +// Update analysis buffer for lower band, and window data before FFT. +static void AnalysisUpdateC(NsxInst_t* inst, + int16_t* out, + int16_t* new_speech) { + int i = 0; + + // For lower band update analysis buffer. + WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer, + inst->analysisBuffer + inst->blockLen10ms, + inst->anaLen - inst->blockLen10ms); + WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer + + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms); + + // Window data before FFT. + for (i = 0; i < inst->anaLen; i++) { + out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + inst->window[i], inst->analysisBuffer[i], 14); // Q0 + } +} + +// Create a complex number buffer (out[]) as the intput (in[]) interleaved with +// zeros, and normalize it. +static __inline void CreateComplexBufferC(NsxInst_t* inst, + int16_t* in, + int16_t* out) { + int i = 0, j = 0; + for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) { + out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData) + out[j + 1] = 0; // Insert zeros in imaginary part + } +} + void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst, WebRtc_Word16 pink_noise_exp_avg, WebRtc_Word32 pink_noise_num_avg, @@ -600,6 +866,24 @@ WebRtc_Word32 WebRtcNsx_InitCore(NsxInst_t* inst, WebRtc_UWord32 fs) { inst->file5 = fopen("file5.pcm", "wb"); #endif + // Initialize function pointers. + WebRtcNsx_NoiseEstimation = NoiseEstimationC; + WebRtcNsx_PrepareSpectrum = PrepareSpectrumC; + WebRtcNsx_SynthesisUpdate = SynthesisUpdateC; + WebRtcNsx_AnalysisUpdate = AnalysisUpdateC; + WebRtcNsx_Denormalize = DenormalizeC; + WebRtcNsx_CreateComplexBuffer = CreateComplexBufferC; + +#ifdef WEBRTC_DETECT_ARM_NEON + uint64_t features = WebRtc_GetCPUFeaturesARM(); + if ((features & kCPUFeatureNEON) != 0) + { + WebRtcNsx_InitNeon(); + } +#elif defined(WEBRTC_ARCH_ARM_NEON) + WebRtcNsx_InitNeon(); +#endif + inst->initFlag = 1; return 0; @@ -2157,263 +2441,4 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram return 0; } -#if !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)) -// Update the noise estimation information. 
-static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { - WebRtc_Word32 tmp32no1 = 0; - WebRtc_Word32 tmp32no2 = 0; - WebRtc_Word16 tmp16 = 0; - const WebRtc_Word16 kExp2Const = 11819; // Q13 - - int i = 0; - - tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, - inst->magnLen); - // Guarantee a Q-domain as high as possible and still fit in int16 - inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - kExp2Const, tmp16, 21); - for (i = 0; i < inst->magnLen; i++) { - // inst->quantile[i]=exp(inst->lquantile[offset+i]); - // in Q21 - tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, - inst->noiseEstLogQuantile[offset + i]); - tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac - tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21); - tmp16 -= 21;// shift 21 to get result in Q0 - tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise) - if (tmp16 < 0) { - tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16); - } else { - tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16); - } - inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1); - } -} - -// Noise Estimation -void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, - uint16_t* magn, - uint32_t* noise, - int16_t* q_noise) { - WebRtc_Word32 numerator = FACTOR_Q16; - WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv; - WebRtc_Word16 countProd, delta, zeros, frac; - WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2; - const int16_t log2_const = 22713; // Q15 - const int16_t width_factor = 21845; - - int i, s, offset; - - tabind = inst->stages - inst->normData; - assert(tabind < 9); - assert(tabind > -9); - if (tabind < 0) { - logval = -WebRtcNsx_kLogTable[-tabind]; - } else { - logval = WebRtcNsx_kLogTable[tabind]; - } - - // lmagn(i)=log(magn(i))=log(2)*log2(magn(i)) - // magn is in Q(-stages), and the real lmagn values are: - // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages) - // lmagn in Q8 - for (i = 0; i < inst->magnLen; i++) { - if (magn[i]) { - zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]); - frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros) - & 0x7FFFFFFF) >> 23); - // log2(magn(i)) - assert(frac < 256); - log2 = (WebRtc_Word16)(((31 - zeros) << 8) - + WebRtcNsx_kLogTableFrac[frac]); - // log2(magn(i))*log(2) - lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15); - // + log(2^stages) - lmagn[i] += logval; - } else { - lmagn[i] = logval;//0; - } - } - - // loop over simultaneous estimates - for (s = 0; s < SIMULT; s++) { - offset = s * inst->magnLen; - - // Get counter values from state - counter = inst->noiseEstCounter[s]; - assert(counter < 201); - countDiv = WebRtcNsx_kCounterDiv[counter]; - countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv); - - // quant_est(...) - for (i = 0; i < inst->magnLen; i++) { - // compute delta - if (inst->noiseEstDensity[offset + i] > 512) { - delta = WebRtcSpl_DivW32W16ResW16(numerator, - inst->noiseEstDensity[offset + i]); - } else { - delta = FACTOR_Q7; - if (inst->blockIndex < END_STARTUP_LONG) { - // Smaller step size during startup. This prevents from using - // unrealistic values causing overflow. 
- delta = FACTOR_Q7_STARTUP; - } - } - - // update log quantile estimate - tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14); - if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) { - // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2 - // CounterDiv=1/(inst->counter[s]+1) in Q15 - tmp16 += 2; - tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2); - inst->noiseEstLogQuantile[offset + i] += tmp16no1; - } else { - tmp16 += 1; - tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1); - // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2 - tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1); - inst->noiseEstLogQuantile[offset + i] -= tmp16no2; - if (inst->noiseEstLogQuantile[offset + i] < logval) { - // This is the smallest fixed point representation we can - // have, hence we limit the output. - inst->noiseEstLogQuantile[offset + i] = logval; - } - } - - // update density estimate - if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i]) - < WIDTH_Q8) { - tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - inst->noiseEstDensity[offset + i], countProd, 15); - tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - width_factor, countDiv, 15); - inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2; - } - } // end loop over magnitude spectrum - - if (counter >= END_STARTUP_LONG) { - inst->noiseEstCounter[s] = 0; - if (inst->blockIndex >= END_STARTUP_LONG) { - UpdateNoiseEstimate(inst, offset); - } - } - inst->noiseEstCounter[s]++; - - } // end loop over simultaneous estimates - - // Sequentially update the noise during startup - if (inst->blockIndex < END_STARTUP_LONG) { - UpdateNoiseEstimate(inst, offset); - } - - for (i = 0; i < inst->magnLen; i++) { - noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise) - } - (*q_noise) = (WebRtc_Word16)inst->qNoise; -} - -// Filter the data in the frequency domain, and create spectrum. -void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { - int i = 0, j = 0; - int16_t tmp16 = 0; - - for (i = 0; i < inst->magnLen; i++) { - inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i], - (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) - inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i], - (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) - } - - freq_buf[0] = inst->real[0]; - freq_buf[1] = -inst->imag[0]; - for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { - tmp16 = (inst->anaLen << 1) - j; - freq_buf[j] = inst->real[i]; - freq_buf[j + 1] = -inst->imag[i]; - freq_buf[tmp16] = inst->real[i]; - freq_buf[tmp16 + 1] = inst->imag[i]; - } - freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; - freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; -} - -// Denormalize the input buffer. -__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) { - int i = 0, j = 0; - int32_t tmp32 = 0; - for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) { - tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j], - factor - inst->normData); - inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0 - } -} - -// For the noise supression process, synthesis, read out fully processed -// segment, and update synthesis buffer. 
-void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
-                               int16_t* out_frame,
-                               int16_t gain_factor) {
-  int i = 0;
-  int16_t tmp16a = 0;
-  int16_t tmp16b = 0;
-  int32_t tmp32 = 0;
-
-  // synthesis
-  for (i = 0; i < inst->anaLen; i++) {
-    tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
-                 inst->window[i], inst->real[i], 14); // Q0, window in Q14
-    tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
-    // Down shift with rounding
-    tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
-    inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
-                                                      tmp16b); // Q0
-  }
-
-  // read out fully processed segment
-  for (i = 0; i < inst->blockLen10ms; i++) {
-    out_frame[i] = inst->synthesisBuffer[i]; // Q0
-  }
-
-  // update synthesis buffer
-  WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
-                        inst->synthesisBuffer + inst->blockLen10ms,
-                        inst->anaLen - inst->blockLen10ms);
-  WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
-      + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
-}
-
-// Update analysis buffer for lower band, and window data before FFT.
-void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
-                              int16_t* out,
-                              int16_t* new_speech) {
-  int i = 0;
-
-  // For lower band update analysis buffer.
-  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
-                        inst->analysisBuffer + inst->blockLen10ms,
-                        inst->anaLen - inst->blockLen10ms);
-  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
-      + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
-
-  // Window data before FFT.
-  for (i = 0; i < inst->anaLen; i++) {
-    out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
-                 inst->window[i], inst->analysisBuffer[i], 14); // Q0
-  }
-}
-
-// Create a complex number buffer (out[]) as the intput (in[]) interleaved with
-// zeros, and normalize it.
-__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
-                                            int16_t* in,
-                                            int16_t* out) {
-  int i = 0, j = 0;
-  for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
-    out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
-    out[j + 1] = 0; // Insert zeros in imaginary part
-  }
-}
-
-#endif // !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
diff --git a/src/modules/audio_processing/ns/nsx_core.h b/src/modules/audio_processing/ns/nsx_core.h
index 990dfcb3a4..0a0faf98f8 100644
--- a/src/modules/audio_processing/ns/nsx_core.h
+++ b/src/modules/audio_processing/ns/nsx_core.h
@@ -165,40 +165,51 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst,
                           short* outFrameHigh);
 
 /****************************************************************************
- * Internal functions and variable declarations shared with optimized code.
+ * Function pointers for internal functions, shared by the ARM Neon and
+ * generic C implementations.
  */
-
 // Noise Estimation.
-void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
-                               uint16_t* magn,
-                               uint32_t* noise,
-                               int16_t* q_noise);
+typedef void (*NoiseEstimation)(NsxInst_t* inst,
+                                uint16_t* magn,
+                                uint32_t* noise,
+                                int16_t* q_noise);
+extern NoiseEstimation WebRtcNsx_NoiseEstimation;
 
 // Filter the data in the frequency domain, and create spectrum.
-void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst,
-                               int16_t* freq_buff);
+typedef void (*PrepareSpectrum)(NsxInst_t* inst,
+                                int16_t* freq_buff);
+extern PrepareSpectrum WebRtcNsx_PrepareSpectrum;
 
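A minimal single-file sketch of the pattern these declarations set up (names here are hypothetical, not part of the patch): each operation is a typedef'd function pointer that defaults to the portable C routine and is re-pointed at the Neon routine once at init time, so call sites never change:

    typedef void (*ProcessFn)(int16_t* buf, int len);
    static void ProcessC(int16_t* buf, int len)    { /* portable path */ }
    static void ProcessNeon(int16_t* buf, int len) { /* Neon path */ }
    ProcessFn Process = ProcessC;   // default binding at load time

    void InitNeonDispatch(void) {
      Process = ProcessNeon;        // rebound exactly once, during init
    }

The indirection costs one pointer load per call but lets a single binary serve both Neon and non-Neon devices, which is the point of this change.
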
 // For the noise supression process, synthesis, read out fully processed
 // segment, and update synthesis buffer.
-void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
-                               int16_t* out_frame,
-                               int16_t gain_factor);
+typedef void (*SynthesisUpdate)(NsxInst_t* inst,
+                                int16_t* out_frame,
+                                int16_t gain_factor);
+extern SynthesisUpdate WebRtcNsx_SynthesisUpdate;
 
 // Update analysis buffer for lower band, and window data before FFT.
-void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
-                              int16_t* out,
-                              int16_t* new_speech);
+typedef void (*AnalysisUpdate)(NsxInst_t* inst,
+                               int16_t* out,
+                               int16_t* new_speech);
+extern AnalysisUpdate WebRtcNsx_AnalysisUpdate;
 
 // Denormalize the input buffer.
-__inline void WebRtcNsx_Denormalize(NsxInst_t* inst,
-                                    int16_t* in,
-                                    int factor);
+typedef void (*Denormalize)(NsxInst_t* inst,
+                            int16_t* in,
+                            int factor);
+extern Denormalize WebRtcNsx_Denormalize;
 
 // Create a complex number buffer, as the intput interleaved with zeros,
 // and normalize it.
-__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
-                                            int16_t* in,
-                                            int16_t* out);
+typedef void (*CreateComplexBuffer)(NsxInst_t* inst,
+                                    int16_t* in,
+                                    int16_t* out);
+extern CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
+
+/****************************************************************************
+ * Initialization of the above function pointers for ARM Neon.
+ */
+void WebRtcNsx_InitNeon(void);
 
 extern const WebRtc_Word16 WebRtcNsx_kLogTable[9];
 extern const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256];
@@ -208,4 +219,4 @@ extern const WebRtc_Word16 WebRtcNsx_kCounterDiv[201];
 }
 #endif
 
-#endif // WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_
diff --git a/src/modules/audio_processing/ns/nsx_core_neon.c b/src/modules/audio_processing/ns/nsx_core_neon.c
index d01ba3b97d..675b65220c 100644
--- a/src/modules/audio_processing/ns/nsx_core_neon.c
+++ b/src/modules/audio_processing/ns/nsx_core_neon.c
@@ -8,15 +8,13 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#if defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
-
 #include "nsx_core.h"
 #include <arm_neon.h>
 #include <assert.h>
 
 // Update the noise estimation information.
-static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { +static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) { int i = 0; const int16_t kExp2Const = 11819; // Q13 int16_t* ptr_noiseEstLogQuantile = NULL; @@ -75,7 +73,7 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { } // Last iteration: - + // inst->quantile[i]=exp(inst->lquantile[offset+i]); // in Q21 int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, @@ -94,10 +92,10 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { } // Noise Estimation -void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, - uint16_t* magn, - uint32_t* noise, - int16_t* q_noise) { +static void NoiseEstimationNeon(NsxInst_t* inst, + uint16_t* magn, + uint32_t* noise, + int16_t* q_noise) { int32_t numerator = FACTOR_Q16; int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv; int16_t countProd, delta, zeros, frac; @@ -126,11 +124,11 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, if (magn[i]) { zeros = WebRtcSpl_NormU32((uint32_t)magn[i]); frac = (int16_t)((((uint32_t)magn[i] << zeros) - & 0x7FFFFFFF) >> 23); + & 0x7FFFFFFF) >> 23); assert(frac < 256); // log2(magn(i)) log2 = (int16_t)(((31 - zeros) << 8) - + WebRtcNsx_kLogTableFrac[frac]); + + WebRtcNsx_kLogTableFrac[frac]); // log2(magn(i))*log(2) lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15); // + log(2^stages) @@ -302,7 +300,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, if (counter >= END_STARTUP_LONG) { inst->noiseEstCounter[s] = 0; if (inst->blockIndex >= END_STARTUP_LONG) { - UpdateNoiseEstimate(inst, offset); + UpdateNoiseEstimateNeon(inst, offset); } } inst->noiseEstCounter[s]++; @@ -311,7 +309,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, // Sequentially update the noise during startup if (inst->blockIndex < END_STARTUP_LONG) { - UpdateNoiseEstimate(inst, offset); + UpdateNoiseEstimateNeon(inst, offset); } for (i = 0; i < inst->magnLen; i++) { @@ -321,7 +319,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, } // Filter the data in the frequency domain, and create spectrum. -void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { +static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) { // (1) Filtering. @@ -338,7 +336,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0]; // Filter the rest in the frequency domain. - for (; ptr_real < &inst->real[inst->magnLen - 1]; ) { + for (; ptr_real < &inst->real[inst->magnLen - 1];) { // Loop unrolled once. Both pointers are incremented by 4 twice. __asm__ __volatile__( "vld1.16 d20, [%[ptr_real]]\n\t" @@ -368,7 +366,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { : :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "q9", "q10", "q11", "q12" - ); + ); } // Filter the last pair of elements in the frequency domain. @@ -400,7 +398,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8]; ptr_real = &inst->real[1]; ptr_imag = &inst->imag[1]; - for (; ptr_real < &inst->real[inst->anaLen2 - 11]; ) { + for (; ptr_real < &inst->real[inst->anaLen2 - 11];) { // Loop unrolled once. All pointers are incremented twice. __asm__ __volatile__( "vld1.16 d22, [%[ptr_real]]!\n\t" @@ -456,13 +454,13 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { } // Denormalize the input buffer. 
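The hunks in this file keep the original GCC inline-assembly style. As an aid to reading the Denormalize routine that follows, here is a rough NEON-intrinsics rendering of what one unrolled iteration appears to do (a sketch under the assumption that the vshl.s32-based assembly mirrors the C path's shift-and-saturate; the helper name is hypothetical):

    #include <arm_neon.h>
    // De-interleave 8 complex int16 pairs, widen the real parts to 32 bits,
    // shift by (factor - normData) -- vshlq_s32 shifts right for negative
    // counts -- then saturate back to int16. This mirrors the C path's
    // WEBRTC_SPL_SHIFT_W32 followed by WebRtcSpl_SatW32ToW16.
    static void denormalize8(const int16_t* in_complex, int16_t* out_real,
                             int shift) {
      int32x4_t counts = vdupq_n_s32(shift);
      int16x8x2_t pair = vld2q_s16(in_complex);  // val[0] holds real parts
      int32x4_t lo = vmovl_s16(vget_low_s16(pair.val[0]));
      int32x4_t hi = vmovl_s16(vget_high_s16(pair.val[0]));
      lo = vshlq_s32(lo, counts);
      hi = vshlq_s32(hi, counts);
      vst1q_s16(out_real, vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi)));
    }
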
-__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) { +static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) { int16_t* ptr_real = &inst->real[0]; int16_t* ptr_in = &in[0]; __asm__ __volatile__("vdup.32 q10, %0" :: "r"((int32_t)(factor - inst->normData)) : "q10"); - for (; ptr_real < &inst->real[inst->anaLen]; ) { + for (; ptr_real < &inst->real[inst->anaLen];) { // Loop unrolled once. Both pointers are incremented. __asm__ __volatile__( @@ -495,9 +493,9 @@ __inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) { // For the noise supress process, synthesis, read out fully processed segment, // and update synthesis buffer. -void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, - int16_t* out_frame, - int16_t gain_factor) { +static void SynthesisUpdateNeon(NsxInst_t* inst, + int16_t* out_frame, + int16_t gain_factor) { int16_t* ptr_real = &inst->real[0]; int16_t* ptr_syn = &inst->synthesisBuffer[0]; int16_t* ptr_window = &inst->window[0]; @@ -505,7 +503,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, // synthesis __asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24"); // Loop unrolled once. All pointers are incremented in the assembly code. - for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) { + for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) { __asm__ __volatile__( // Load variables. "vld1.16 d22, [%[ptr_real]]!\n\t" @@ -553,7 +551,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, int16_t* ptr_out = &out_frame[0]; ptr_syn = &inst->synthesisBuffer[0]; // read out fully processed segment - for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms]; ) { + for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) { // Loop unrolled once. Both pointers are incremented in the assembly code. __asm__ __volatile__( // out_frame[i] = inst->synthesisBuffer[i]; // Q0 @@ -575,7 +573,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, // inst->anaLen - inst->blockLen10ms); ptr_out = &inst->synthesisBuffer[0], ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms]; - for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) { + for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) { // Loop unrolled once. Both pointers are incremented in the assembly code. __asm__ __volatile__( "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t" @@ -593,7 +591,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, // WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer // + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms); __asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10"); - for (; ptr_out < &inst->synthesisBuffer[inst->anaLen]; ) { + for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) { // Loop unrolled once. Pointer is incremented in the assembly code. __asm__ __volatile__( "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" @@ -606,9 +604,9 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, } // Update analysis buffer for lower band, and window data before FFT. 
-void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
-                              int16_t* out,
-                              int16_t* new_speech) {
+static void AnalysisUpdateNeon(NsxInst_t* inst,
+                               int16_t* out,
+                               int16_t* new_speech) {
   int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
   int16_t* ptr_out = &inst->analysisBuffer[0];
 
@@ -617,7 +615,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
   //                      inst->analysisBuffer + inst->blockLen10ms,
   //                      inst->anaLen - inst->blockLen10ms);
-  for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms]; ) {
+  for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
     // Loop unrolled once, so both pointers are incremented by 8 twice.
     __asm__ __volatile__(
       "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
@@ -633,7 +631,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
 
   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
   //    + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
-  for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen]; ) {
+  for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
     // Loop unrolled once, so both pointers are incremented by 8 twice.
     __asm__ __volatile__(
       "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
@@ -651,7 +649,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
   int16_t* ptr_window = &inst->window[0];
   ptr_out = &out[0];
   ptr_ana = &inst->analysisBuffer[0];
-  for (; ptr_out < &out[inst->anaLen]; ) {
+  for (; ptr_out < &out[inst->anaLen];) {
 
     // Loop unrolled once, so all pointers are incremented by 4 twice.
     __asm__ __volatile__(
@@ -683,17 +681,17 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
 
 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with
 // zeros, and normalize it.
-__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
-                                            int16_t* in,
-                                            int16_t* out) {
+static __inline void CreateComplexBufferNeon(NsxInst_t* inst,
+                                             int16_t* in,
+                                             int16_t* out) {
   int16_t* ptr_out = &out[0];
   int16_t* ptr_in = &in[0];
 
   __asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
   __asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
-  for (; ptr_in < &in[inst->anaLen]; ) {
+  for (; ptr_in < &in[inst->anaLen];) {
 
-    // Loop unrolled once, so ptr_in is incremented by 8 twice, 
+    // Loop unrolled once, so ptr_in is incremented by 8 twice,
     // and ptr_out is incremented by 8 four times.
     __asm__ __volatile__(
       // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
@@ -724,4 +722,12 @@ __inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
     );
   }
 }
-#endif // defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
+
+void WebRtcNsx_InitNeon(void) {
+  WebRtcNsx_NoiseEstimation = NoiseEstimationNeon;
+  WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon;
+  WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon;
+  WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon;
+  WebRtcNsx_Denormalize = DenormalizeNeon;
+  WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon;
+}
diff --git a/src/system_wrappers/interface/cpu_features_wrapper.h b/src/system_wrappers/interface/cpu_features_wrapper.h
index 5d8a828a7a..d949592197 100644
--- a/src/system_wrappers/interface/cpu_features_wrapper.h
+++ b/src/system_wrappers/interface/cpu_features_wrapper.h
@@ -15,18 +15,33 @@
 extern "C" {
 #endif
 
-// list of features.
+#include <stdint.h>
+
+// List of features in x86.
 typedef enum {
   kSSE2,
   kSSE3
 } CPUFeature;
 
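Unlike the x86 path, which queries one feature at a time through WebRtc_GetCPUInfo, the ARM flags declared below come back together as a bitmask, so a caller tests bits directly. A short sketch of the intended use, mirroring the check added to WebRtcNsx_InitCore earlier in this patch:

    uint64_t features = WebRtc_GetCPUFeaturesARM();
    if ((features & kCPUFeatureNEON) != 0) {
      /* install Neon-optimized function pointers */
    }
    int is_v7 = (features & kCPUFeatureARMv7) != 0;  /* other bits likewise */
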
+// List of features in ARM.
+enum {
+  kCPUFeatureARMv7       = (1 << 0),
+  kCPUFeatureVFPv3       = (1 << 1),
+  kCPUFeatureNEON        = (1 << 2),
+  kCPUFeatureLDREXSTREX  = (1 << 3)
+};
+
 typedef int (*WebRtc_CPUInfo)(CPUFeature feature);
 // returns true if the CPU supports the feature.
 extern WebRtc_CPUInfo WebRtc_GetCPUInfo;
 // No CPU feature is available => straight C path.
 extern WebRtc_CPUInfo WebRtc_GetCPUInfoNoASM;
 
+// Returns the features in an ARM device.
+// It detects the features in the hardware platform, and returns the
+// supported values in the above enum definition as a bitmask.
+extern uint64_t WebRtc_GetCPUFeaturesARM(void);
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
diff --git a/src/system_wrappers/source/Android.mk b/src/system_wrappers/source/Android.mk
index a5d1439a2a..00a69ce009 100644
--- a/src/system_wrappers/source/Android.mk
+++ b/src/system_wrappers/source/Android.mk
@@ -25,6 +25,7 @@ LOCAL_SRC_FILES := \
     condition_variable.cc \
     cpu_dummy.cc \
     cpu_features.cc \
+    cpu_features_arm.c \
     cpu_info.cc \
     critical_section.cc \
     event.cc \
diff --git a/src/system_wrappers/source/cpu_features_arm.c b/src/system_wrappers/source/cpu_features_arm.c
new file mode 100644
index 0000000000..106511852c
--- /dev/null
+++ b/src/system_wrappers/source/cpu_features_arm.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is derived from Android's NDK package r7, located at
+// <ndk-root>/sources/android/cpufeatures/ (downloadable from
+// http://developer.android.com/sdk/ndk/index.html).
+
+#include "cpu_features_wrapper.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+// Define CPU family.
+typedef enum {
+  CPU_FAMILY_UNKNOWN = 0,
+  CPU_FAMILY_ARM,
+  CPU_FAMILY_X86,
+  CPU_FAMILY_MAX  // Do not remove.
+} CpuFamily;
+
+static pthread_once_t g_once = PTHREAD_ONCE_INIT;
+static CpuFamily g_cpuFamily;
+static uint64_t g_cpuFeatures;
+static int g_cpuCount;
+
+static const int cpufeatures_debug = 0;
+
+#ifdef __arm__
+# define DEFAULT_CPU_FAMILY CPU_FAMILY_ARM
+#elif defined __i386__
+# define DEFAULT_CPU_FAMILY CPU_FAMILY_X86
+#else
+# define DEFAULT_CPU_FAMILY CPU_FAMILY_UNKNOWN
+#endif
+
+#define D(...) \
+  do { \
+    if (cpufeatures_debug) { \
+      printf(__VA_ARGS__); fflush(stdout); \
+    } \
+  } while (0)
+
+/* Read the content of /proc/cpuinfo into a user-provided buffer.
+ * Return the length of the data, or -1 on error. Does *not*
+ * zero-terminate the content. Will not read more than 'buffsize' bytes.
+ */
+static int read_file(const char* pathname, char* buffer, size_t buffsize) {
+  int fd, len;
+
+  fd = open(pathname, O_RDONLY);
+  if (fd < 0)
+    return -1;
+
+  do {
+    len = read(fd, buffer, buffsize);
+  } while (len < 0 && errno == EINTR);
+
+  close(fd);
+
+  return len;
+}
+
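The helpers that follow are meant to be composed; as a sketch of the flow cpuInit() uses further down (error handling trimmed, buffer size matching cpuInit's):

    char buf[4096];
    int len = read_file("/proc/cpuinfo", buf, sizeof(buf));
    if (len > 0) {
      char* features = extract_cpuinfo_field(buf, len, "Features");
      if (features != NULL) {
        int has_neon = has_list_item(features, "neon");  /* 1 if present */
        free(features);  /* extract_cpuinfo_field() heap-allocates */
      }
    }
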
+/* Extract the content of the first occurrence of a given field in
+ * the content of /proc/cpuinfo and return it as a heap-allocated
+ * string that must be freed by the caller.
+ *
+ * Return NULL if not found.
+ */
+static char* extract_cpuinfo_field(char* buffer, int buflen, const char* field) {
+  int fieldlen = strlen(field);
+  char* bufend = buffer + buflen;
+  char* result = NULL;
+  int len;
+  const char* p, *q;
+
+  /* Look for the first field occurrence, and ensure it starts the line.
+   */
+  p = buffer;
+  bufend = buffer + buflen;
+  for (;;) {
+    p = memmem(p, bufend - p, field, fieldlen);
+    if (p == NULL)
+      goto EXIT;
+
+    if (p == buffer || p[-1] == '\n')
+      break;
+
+    p += fieldlen;
+  }
+
+  /* Skip to the first colon followed by a space */
+  p += fieldlen;
+  p = memchr(p, ':', bufend - p);
+  if (p == NULL || p[1] != ' ')
+    goto EXIT;
+
+  /* Find the end of the line */
+  p += 2;
+  q = memchr(p, '\n', bufend - p);
+  if (q == NULL)
+    q = bufend;
+
+  /* Copy the line into a heap-allocated buffer */
+  len = q - p;
+  result = malloc(len + 1);
+  if (result == NULL)
+    goto EXIT;
+
+  memcpy(result, p, len);
+  result[len] = '\0';
+
+EXIT:
+  return result;
+}
+
+/* Count the number of occurrences of a given field prefix in /proc/cpuinfo.
+ */
+static int count_cpuinfo_field(char* buffer, int buflen, const char* field) {
+  int fieldlen = strlen(field);
+  const char* p = buffer;
+  const char* bufend = buffer + buflen;
+  int count = 0;
+
+  for (;;) {
+    const char* q;
+
+    p = memmem(p, bufend - p, field, fieldlen);
+    if (p == NULL)
+      break;
+
+    /* Ensure that the field is at the start of a line */
+    if (p > buffer && p[-1] != '\n') {
+      p += fieldlen;
+      continue;
+    }
+
+    /* skip any whitespace */
+    q = p + fieldlen;
+    while (q < bufend && (*q == ' ' || *q == '\t'))
+      q++;
+
+    /* we must have a colon now */
+    if (q < bufend && *q == ':') {
+      count += 1;
+      q++;
+    }
+    p = q;
+  }
+
+  return count;
+}
+
+/* Like strlen(), but for constant string literals */
+#define STRLEN_CONST(x) (sizeof(x) - 1)
+
+/* Checks that a space-separated list of items contains one given 'item'.
+ * Returns 1 if found, 0 otherwise.
+ */
+static int has_list_item(const char* list, const char* item) {
+  const char* p = list;
+  int itemlen = strlen(item);
+
+  if (list == NULL)
+    return 0;
+
+  while (*p) {
+    const char* q;
+
+    /* skip spaces */
+    while (*p == ' ' || *p == '\t')
+      p++;
+
+    /* find end of current list item */
+    q = p;
+    while (*q && *q != ' ' && *q != '\t')
+      q++;
+
+    if (itemlen == q - p && !memcmp(p, item, itemlen))
+      return 1;
+
+    /* skip to next item */
+    p = q;
+  }
+  return 0;
+}
+
+static void cpuInit(void) {
+  char cpuinfo[4096];
+  int cpuinfo_len;
+
+  g_cpuFamily = DEFAULT_CPU_FAMILY;
+  g_cpuFeatures = 0;
+  g_cpuCount = 1;
+
+  cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, sizeof cpuinfo);
+  D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
+    cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
+
+  if (cpuinfo_len < 0) { /* should not happen */
+    return;
+  }
+
+  /* Count the CPU cores; the value may be 0 for single-core CPUs */
+  g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "processor");
+  if (g_cpuCount == 0) {
+    g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "Processor");
+    if (g_cpuCount == 0) {
+      g_cpuCount = 1;
+    }
+  }
+
+  D("found cpuCount = %d\n", g_cpuCount);
+
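The ARM-specific parsing below keys on three /proc/cpuinfo fields. For orientation, representative lines from an ARMv7 device look roughly like the following (exact values vary by kernel and SoC; shown for illustration only):

    Processor       : ARMv7 Processor rev 1 (v7l)
    Features        : swp half thumb fastmult vfp edsp neon vfpv3
    CPU architecture: 7
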
+#ifdef __arm__
+  {
+    /* Extract architecture from the "CPU Architecture" field.
+     * The list is well-known, unlike the output of
+     * the 'Processor' field, which can vary greatly.
+     *
+     * See the definition of the 'proc_arch' array in
+     * $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
+     * the same file.
+     */
+    char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+                                          "CPU architecture");
+
+    if (cpuArch != NULL) {
+      char* end;
+      long archNumber;
+      int hasARMv7 = 0;
+
+      D("found cpuArch = '%s'\n", cpuArch);
+
+      /* read the initial decimal number, ignore the rest */
+      archNumber = strtol(cpuArch, &end, 10);
+
+      /* Here we assume that ARMv8 will be upwards compatible with v7
+       * in the future. Unfortunately, there is no 'Features' field to
+       * indicate that Thumb-2 is supported.
+       */
+      if (end > cpuArch && archNumber >= 7) {
+        hasARMv7 = 1;
+      }
+
+      /* Unfortunately, it seems that certain ARMv6-based CPUs
+       * report an incorrect architecture number of 7!
+       *
+       * We try to correct this by looking at the 'elf_format'
+       * field reported in the 'Processor' field, which is of the
+       * form "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
+       * an ARMv6 one.
+       */
+      if (hasARMv7) {
+        char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+                                              "Processor");
+        if (cpuProc != NULL) {
+          D("found cpuProc = '%s'\n", cpuProc);
+          if (has_list_item(cpuProc, "(v6l)")) {
+            D("CPU processor and architecture mismatch!!\n");
+            hasARMv7 = 0;
+          }
+          free(cpuProc);
+        }
+      }
+
+      if (hasARMv7) {
+        g_cpuFeatures |= kCPUFeatureARMv7;
+      }
+
+      /* The LDREX / STREX instructions are available from ARMv6 */
+      if (archNumber >= 6) {
+        g_cpuFeatures |= kCPUFeatureLDREXSTREX;
+      }
+
+      free(cpuArch);
+    }
+
+    /* Extract the list of CPU features from the 'Features' field */
+    char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+                                              "Features");
+
+    if (cpuFeatures != NULL) {
+      D("found cpuFeatures = '%s'\n", cpuFeatures);
+
+      if (has_list_item(cpuFeatures, "vfpv3"))
+        g_cpuFeatures |= kCPUFeatureVFPv3;
+      else if (has_list_item(cpuFeatures, "vfpv3d16"))
+        g_cpuFeatures |= kCPUFeatureVFPv3;
+
+      if (has_list_item(cpuFeatures, "neon")) {
+        /* Note: Certain kernels only report neon but not vfpv3
+         * in their features list. However, ARM mandates that if
+         * Neon is implemented, VFPv3 must be as well, so always
+         * set the flag.
+         */
+        g_cpuFeatures |= kCPUFeatureNEON |
+                         kCPUFeatureVFPv3;
+      }
+      free(cpuFeatures);
+    }
+  }
+#endif  // __arm__
+
+#ifdef __i386__
+  g_cpuFamily = CPU_FAMILY_X86;
+#endif
+}
+
+uint64_t WebRtc_GetCPUFeaturesARM(void) {
+  pthread_once(&g_once, cpuInit);
+  return g_cpuFeatures;
+}
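Taken together, a consumer of this detection path looks like the following sketch (the module and function names here are illustrative; the real binding for the NS module is the WebRtcNsx_InitCore hunk earlier in this patch). pthread_once() guarantees cpuInit() runs exactly once, so repeated calls amount to a cheap read of the cached bitmask:

    #include "cpu_features_wrapper.h"

    void MyModuleInit(void) {  /* hypothetical module init */
    #ifdef WEBRTC_DETECT_ARM_NEON
      if (WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) {
        /* bind Neon-optimized function pointers here */
      }
    #endif
    }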