From b59c0316606017524befe6631f75add61b634e37 Mon Sep 17 00:00:00 2001 From: "kma@webrtc.org" Date: Sat, 3 Dec 2011 18:34:50 +0000 Subject: [PATCH] For Android ARMv7 platforms, added a feature of dynamically detecting the existence of Neon, and when it's present, switch to some functions optimized for Neon at run time. Review URL: http://webrtc-codereview.appspot.com/268002 git-svn-id: http://webrtc.googlecode.com/svn/trunk@1096 4adac7df-926f-26a2-2b94-8c16560cd09d --- Android.mk | 12 + android-webrtc.mk | 3 +- src/modules/audio_processing/aecm/Android.mk | 46 +- src/modules/audio_processing/aecm/aecm_core.c | 406 ++++++------- src/modules/audio_processing/aecm/aecm_core.h | 54 +- .../audio_processing/aecm/aecm_core_neon.c | 475 ++++++++------- src/modules/audio_processing/ns/Android.mk | 47 +- src/modules/audio_processing/ns/nsx_core.c | 543 +++++++++--------- src/modules/audio_processing/ns/nsx_core.h | 53 +- .../audio_processing/ns/nsx_core_neon.c | 80 +-- .../interface/cpu_features_wrapper.h | 17 +- src/system_wrappers/source/Android.mk | 1 + src/system_wrappers/source/cpu_features_arm.c | 333 +++++++++++ 13 files changed, 1274 insertions(+), 796 deletions(-) create mode 100644 src/system_wrappers/source/cpu_features_arm.c diff --git a/Android.mk b/Android.mk index bd050e6af0..245c4249f9 100644 --- a/Android.mk +++ b/Android.mk @@ -54,6 +54,7 @@ include $(MY_WEBRTC_ROOT_PATH)/libvpx.mk LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) +include $(LOCAL_PATH)/../../external/webrtc/android-webrtc.mk LOCAL_ARM_MODE := arm LOCAL_MODULE := libwebrtc_audio_preprocessing @@ -71,6 +72,17 @@ LOCAL_WHOLE_STATIC_LIBRARIES := \ libwebrtc_aecm \ libwebrtc_system_wrappers +# Add Neon libraries. +ifneq (,$(filter '-DWEBRTC_DETECT_ARM_NEON',$(MY_WEBRTC_COMMON_DEFS))) +LOCAL_WHOLE_STATIC_LIBRARIES += \ + libwebrtc_aecm_neon \ + libwebrtc_ns_neon +else ifeq ($(ARCH_ARM_HAVE_NEON),true) +LOCAL_WHOLE_STATIC_LIBRARIES += \ + libwebrtc_aecm_neon \ + libwebrtc_ns_neon +endif + LOCAL_STATIC_LIBRARIES := \ libprotobuf-cpp-2.3.0-lite diff --git a/android-webrtc.mk b/android-webrtc.mk index 9a8c861b73..eb620bb3b4 100644 --- a/android-webrtc.mk +++ b/android-webrtc.mk @@ -21,8 +21,9 @@ MY_WEBRTC_COMMON_DEFS := \ # '-DWEBRTC_MODULE_UTILITY_VIDEO' [module media_file] [module utility] ifeq ($(TARGET_ARCH),arm) MY_WEBRTC_COMMON_DEFS += \ - '-DWEBRTC_ARM_INLINE_CALLS' \ '-DWEBRTC_ARCH_ARM' +# '-DWEBRTC_DETECT_ARM_NEON' # only used in a build configuration without Neon +# TODO(kma): figure out if the above define could be moved to NDK build only. # TODO(kma): test if the code under next two macros works with generic GCC compilers ifeq ($(ARCH_ARM_HAVE_NEON),true) diff --git a/src/modules/audio_processing/aecm/Android.mk b/src/modules/audio_processing/aecm/Android.mk index 916c5a8635..c33a957e5f 100644 --- a/src/modules/audio_processing/aecm/Android.mk +++ b/src/modules/audio_processing/aecm/Android.mk @@ -6,6 +6,9 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. +############################# +# Build the non-neon library. + LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) @@ -21,21 +24,16 @@ LOCAL_SRC_FILES := \ aecm_core.c # Flags passed to both C and C++ files. 
-LOCAL_CFLAGS := \ - $(MY_WEBRTC_COMMON_DEFS) - -ifeq ($(ARCH_ARM_HAVE_NEON),true) -LOCAL_SRC_FILES += \ - aecm_core_neon.c -LOCAL_CFLAGS += \ - $(MY_ARM_CFLAGS_NEON) -endif +LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS) LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/interface \ $(LOCAL_PATH)/../utility \ $(LOCAL_PATH)/../../.. \ - $(LOCAL_PATH)/../../../common_audio/signal_processing/include + $(LOCAL_PATH)/../../../common_audio/signal_processing/include \ + $(LOCAL_PATH)/../../../system_wrappers/interface + +LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers LOCAL_SHARED_LIBRARIES := \ libcutils \ @@ -46,3 +44,31 @@ ifndef NDK_ROOT include external/stlport/libstlport.mk endif include $(BUILD_STATIC_LIBRARY) + +######################### +# Build the neon library. + +include $(CLEAR_VARS) + +LOCAL_ARM_MODE := arm +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +LOCAL_MODULE := libwebrtc_aecm_neon +LOCAL_MODULE_TAGS := optional + +LOCAL_SRC_FILES := aecm_core_neon.c + +# Flags passed to both C and C++ files. +LOCAL_CFLAGS := \ + $(MY_WEBRTC_COMMON_DEFS) \ + -mfpu=neon \ + -flax-vector-conversions + +LOCAL_C_INCLUDES := \ + $(LOCAL_PATH)/interface \ + $(LOCAL_PATH)/../../.. \ + $(LOCAL_PATH)/../../../common_audio/signal_processing/include + +ifndef NDK_ROOT +include external/stlport/libstlport.mk +endif +include $(BUILD_STATIC_LIBRARY) diff --git a/src/modules/audio_processing/aecm/aecm_core.c b/src/modules/audio_processing/aecm/aecm_core.c index 4ad705e506..f2e4683288 100644 --- a/src/modules/audio_processing/aecm/aecm_core.c +++ b/src/modules/audio_processing/aecm/aecm_core.c @@ -13,8 +13,9 @@ #include #include -#include "echo_control_mobile.h" +#include "cpu_features_wrapper.h" #include "delay_estimator_wrapper.h" +#include "echo_control_mobile.h" #include "ring_buffer.h" #include "typedefs.h" @@ -263,6 +264,13 @@ static const uint16_t* AlignedFarend(AecmCore_t* self, int* far_q, int delay) { HANDLE logFile = NULL; #endif +// Declare function pointers. 
+CalcLinearEnergies WebRtcAecm_CalcLinearEnergies; +StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel; +ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel; +WindowAndFFT WebRtcAecm_WindowAndFFT; +InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow; + int WebRtcAecm_CreateCore(AecmCore_t **aecmInst) { AecmCore_t *aecm = malloc(sizeof(AecmCore_t)); @@ -346,6 +354,194 @@ void WebRtcAecm_InitEchoPathCore(AecmCore_t* aecm, const WebRtc_Word16* echo_pat aecm->mseChannelCount = 0; } +static void WindowAndFFTC(WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + complex16_t* freq_signal, + int time_signal_scaling) +{ + int i, j; + + memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); + // FFT of signal + for (i = 0, j = 0; i < PART_LEN; i++, j += 2) + { + // Window time domain signal and insert into real part of + // transformation array |fft| + fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + (time_signal[i] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[i], + 14); + fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + (time_signal[i + PART_LEN] << time_signal_scaling), + WebRtcAecm_kSqrtHanning[PART_LEN - i], + 14); + // Inserting zeros in imaginary parts not necessary since we + // initialized the array with all zeros + } + + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); + + // Take only the first PART_LEN2 samples + for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2) + { + freq_signal[i].real = fft[j]; + + // The imaginary part has to switch sign + freq_signal[i].imag = - fft[j+1]; + } +} + +static void InverseFFTAndWindowC(AecmCore_t* aecm, + WebRtc_Word16* fft, + complex16_t* efw, + WebRtc_Word16* output, + const WebRtc_Word16* nearendClean) +{ + int i, j, outCFFT; + WebRtc_Word32 tmp32no1; + + // Synthesis + for (i = 1; i < PART_LEN; i++) + { + j = WEBRTC_SPL_LSHIFT_W32(i, 1); + fft[j] = efw[i].real; + + // mirrored data, even + fft[PART_LEN4 - j] = efw[i].real; + fft[j + 1] = -efw[i].imag; + + //mirrored data, odd + fft[PART_LEN4 - (j - 1)] = efw[i].imag; + } + fft[0] = efw[0].real; + fft[1] = -efw[0].imag; + + fft[PART_LEN2] = efw[PART_LEN].real; + fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; + + // inverse FFT, result should be scaled with outCFFT + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); + + //take only the real values and scale with outCFFT + for (i = 0; i < PART_LEN2; i++) + { + j = WEBRTC_SPL_LSHIFT_W32(i, 1); + fft[i] = fft[j]; + } + + for (i = 0; i < PART_LEN; i++) + { + fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + fft[i], + WebRtcAecm_kSqrtHanning[i], + 14); + tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], + outCFFT - aecm->dfaCleanQDomain); + fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, + tmp32no1 + aecm->outBuf[i], + WEBRTC_SPL_WORD16_MIN); + output[i] = fft[i]; + + tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( + fft[PART_LEN + i], + WebRtcAecm_kSqrtHanning[PART_LEN - i], + 14); + tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, + outCFFT - aecm->dfaCleanQDomain); + aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( + WEBRTC_SPL_WORD16_MAX, + tmp32no1, + WEBRTC_SPL_WORD16_MIN); + } + +#ifdef ARM_WINM_LOG_ + // measure tick end + QueryPerformanceCounter((LARGE_INTEGER*)&end); + diff__ = ((end - start) * 1000) / (freq/1000); + milliseconds = (unsigned int)(diff__ & 0xffffffff); + WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL); +#endif + + // Copy the current block to the old position 
(aecm->outBuf is shifted elsewhere) + memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); + memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); + if (nearendClean != NULL) + { + memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); + } +} + +static void CalcLinearEnergiesC(AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est, + WebRtc_UWord32* far_energy, + WebRtc_UWord32* echo_energy_adapt, + WebRtc_UWord32* echo_energy_stored) +{ + int i; + + // Get energy for the delayed far end signal and estimated + // echo using both stored and adapted channels. + for (i = 0; i < PART_LEN1; i++) + { + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); + (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]); + (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], + far_spectrum[i]); + (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i]; + } +} + +static void StoreAdaptiveChannelC(AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est) +{ + int i; + + // During startup we store the channel every block. + memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); + // Recalculate echo estimate + for (i = 0; i < PART_LEN; i += 4) + { + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); + echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], + far_spectrum[i + 1]); + echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], + far_spectrum[i + 2]); + echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], + far_spectrum[i + 3]); + } + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], + far_spectrum[i]); +} + +static void ResetAdaptiveChannelC(AecmCore_t* aecm) +{ + int i; + + // The stored channel has a significantly lower MSE than the adaptive one for + // two consecutive calculations. Reset the adaptive channel. + memcpy(aecm->channelAdapt16, aecm->channelStored, + sizeof(WebRtc_Word16) * PART_LEN1); + // Restore the W32 channel + for (i = 0; i < PART_LEN; i += 4) + { + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i], 16); + aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 1], 16); + aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 2], 16); + aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i + 3], 16); + } + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); +} + // WebRtcAecm_InitCore(...) // // This function initializes the AECM instant created with WebRtcAecm_CreateCore(...) @@ -463,6 +659,23 @@ int WebRtcAecm_InitCore(AecmCore_t * const aecm, int samplingFreq) assert(PART_LEN % 16 == 0); + // Initialize function pointers. 
+ WebRtcAecm_WindowAndFFT = WindowAndFFTC; + WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowC; + WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesC; + WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelC; + WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelC; + +#ifdef WEBRTC_DETECT_ARM_NEON + uint64_t features = WebRtc_GetCPUFeaturesARM(); + if ((features & kCPUFeatureNEON) != 0) + { + WebRtcAecm_InitNeon(); + } +#elif defined(WEBRTC_ARCH_ARM_NEON) + WebRtcAecm_InitNeon(); +#endif + return 0; } @@ -1890,194 +2103,3 @@ void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const far aecm->farBufReadPos += readLen; } -#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) - -void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft, - const WebRtc_Word16* time_signal, - complex16_t* freq_signal, - int time_signal_scaling) -{ - int i, j; - - memset(fft, 0, sizeof(WebRtc_Word16) * PART_LEN4); - // FFT of signal - for (i = 0, j = 0; i < PART_LEN; i++, j += 2) - { - // Window time domain signal and insert into real part of - // transformation array |fft| - fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[i] << time_signal_scaling), - WebRtcAecm_kSqrtHanning[i], - 14); - fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - (time_signal[i + PART_LEN] << time_signal_scaling), - WebRtcAecm_kSqrtHanning[PART_LEN - i], - 14); - // Inserting zeros in imaginary parts not necessary since we - // initialized the array with all zeros - } - - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); - - // Take only the first PART_LEN2 samples - for (i = 0, j = 0; j < PART_LEN2; i += 1, j += 2) - { - freq_signal[i].real = fft[j]; - - // The imaginary part has to switch sign - freq_signal[i].imag = - fft[j+1]; - } -} - -void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm, - WebRtc_Word16* fft, - complex16_t* efw, - WebRtc_Word16* output, - const WebRtc_Word16* nearendClean) -{ - int i, j, outCFFT; - WebRtc_Word32 tmp32no1; - - // Synthesis - for (i = 1; i < PART_LEN; i++) - { - j = WEBRTC_SPL_LSHIFT_W32(i, 1); - fft[j] = efw[i].real; - - // mirrored data, even - fft[PART_LEN4 - j] = efw[i].real; - fft[j + 1] = -efw[i].imag; - - //mirrored data, odd - fft[PART_LEN4 - (j - 1)] = efw[i].imag; - } - fft[0] = efw[0].real; - fft[1] = -efw[0].imag; - - fft[PART_LEN2] = efw[PART_LEN].real; - fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; - - // inverse FFT, result should be scaled with outCFFT - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); - - //take only the real values and scale with outCFFT - for (i = 0; i < PART_LEN2; i++) - { - j = WEBRTC_SPL_LSHIFT_W32(i, 1); - fft[i] = fft[j]; - } - - for (i = 0; i < PART_LEN; i++) - { - fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - fft[i], - WebRtcAecm_kSqrtHanning[i], - 14); - tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], - outCFFT - aecm->dfaCleanQDomain); - fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, - tmp32no1 + aecm->outBuf[i], - WEBRTC_SPL_WORD16_MIN); - output[i] = fft[i]; - - tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( - fft[PART_LEN + i], - WebRtcAecm_kSqrtHanning[PART_LEN - i], - 14); - tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, - outCFFT - aecm->dfaCleanQDomain); - aecm->outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( - WEBRTC_SPL_WORD16_MAX, - tmp32no1, - WEBRTC_SPL_WORD16_MIN); - } - -#ifdef ARM_WINM_LOG_ - // measure tick end - QueryPerformanceCounter((LARGE_INTEGER*)&end); 
- diff__ = ((end - start) * 1000) / (freq/1000); - milliseconds = (unsigned int)(diff__ & 0xffffffff); - WriteFile (logFile, &milliseconds, sizeof(unsigned int), &temp, NULL); -#endif - - // Copy the current block to the old position (aecm->outBuf is shifted elsewhere) - memcpy(aecm->xBuf, aecm->xBuf + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); - memcpy(aecm->dBufNoisy, aecm->dBufNoisy + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); - if (nearendClean != NULL) - { - memcpy(aecm->dBufClean, aecm->dBufClean + PART_LEN, sizeof(WebRtc_Word16) * PART_LEN); - } -} - -void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est, - WebRtc_UWord32* far_energy, - WebRtc_UWord32* echo_energy_adapt, - WebRtc_UWord32* echo_energy_stored) -{ - int i; - - // Get energy for the delayed far end signal and estimated - // echo using both stored and adapted channels. - for (i = 0; i < PART_LEN1; i++) - { - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); - (*far_energy) += (WebRtc_UWord32)(far_spectrum[i]); - (*echo_energy_adapt) += WEBRTC_SPL_UMUL_16_16(aecm->channelAdapt16[i], - far_spectrum[i]); - (*echo_energy_stored) += (WebRtc_UWord32)echo_est[i]; - } -} - -void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est) -{ - int i; - - // During startup we store the channel every block. - memcpy(aecm->channelStored, aecm->channelAdapt16, sizeof(WebRtc_Word16) * PART_LEN1); - // Recalculate echo estimate - for (i = 0; i < PART_LEN; i += 4) - { - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); - echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], - far_spectrum[i + 1]); - echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], - far_spectrum[i + 2]); - echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], - far_spectrum[i + 3]); - } - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], - far_spectrum[i]); -} - -void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm) -{ - int i; - - // The stored channel has a significantly lower MSE than the adaptive one for - // two consecutive calculations. Reset the adaptive channel. 
- memcpy(aecm->channelAdapt16, aecm->channelStored, - sizeof(WebRtc_Word16) * PART_LEN1); - // Restore the W32 channel - for (i = 0; i < PART_LEN; i += 4) - { - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i], 16); - aecm->channelAdapt32[i + 1] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 1], 16); - aecm->channelAdapt32[i + 2] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 2], 16); - aecm->channelAdapt32[i + 3] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i + 3], 16); - } - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)aecm->channelStored[i], 16); -} - -#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON)) diff --git a/src/modules/audio_processing/aecm/aecm_core.h b/src/modules/audio_processing/aecm/aecm_core.h index dede6d39be..0ec62ec24d 100644 --- a/src/modules/audio_processing/aecm/aecm_core.h +++ b/src/modules/audio_processing/aecm/aecm_core.h @@ -332,32 +332,44 @@ void WebRtcAecm_BufferFarFrame(AecmCore_t * const aecm, const WebRtc_Word16 * co void WebRtcAecm_FetchFarFrame(AecmCore_t * const aecm, WebRtc_Word16 * const farend, const int farLen, const int knownDelay); -/////////////////////////////////////////////////////////////////////////////////////////////// -// Some internal functions shared by ARM NEON and generic C code: +/////////////////////////////////////////////////////////////////////////////// +// Some function pointers, for internal functions shared by ARM NEON and +// generic C code. // +typedef void (*CalcLinearEnergies)( + AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echoEst, + WebRtc_UWord32* far_energy, + WebRtc_UWord32* echo_energy_adapt, + WebRtc_UWord32* echo_energy_stored); +extern CalcLinearEnergies WebRtcAecm_CalcLinearEnergies; -void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echoEst, - WebRtc_UWord32* far_energy, - WebRtc_UWord32* echo_energy_adapt, - WebRtc_UWord32* echo_energy_stored); +typedef void (*StoreAdaptiveChannel)( + AecmCore_t* aecm, + const WebRtc_UWord16* far_spectrum, + WebRtc_Word32* echo_est); +extern StoreAdaptiveChannel WebRtcAecm_StoreAdaptiveChannel; -void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, - const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est); +typedef void (*ResetAdaptiveChannel)(AecmCore_t* aecm); +extern ResetAdaptiveChannel WebRtcAecm_ResetAdaptiveChannel; -void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t *aecm); +typedef void (*WindowAndFFT)( + WebRtc_Word16* fft, + const WebRtc_Word16* time_signal, + complex16_t* freq_signal, + int time_signal_scaling); +extern WindowAndFFT WebRtcAecm_WindowAndFFT; -void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft, - const WebRtc_Word16* time_signal, - complex16_t* freq_signal, - int time_signal_scaling); +typedef void (*InverseFFTAndWindow)( + AecmCore_t* aecm, + WebRtc_Word16* fft, complex16_t* efw, + WebRtc_Word16* output, + const WebRtc_Word16* nearendClean); +extern InverseFFTAndWindow WebRtcAecm_InverseFFTAndWindow; + +// Initialization of the above function pointers for ARM Neon. 
+void WebRtcAecm_InitNeon(void); -void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm, - WebRtc_Word16* fft, - complex16_t* efw, - WebRtc_Word16* output, - const WebRtc_Word16* nearendClean); #endif diff --git a/src/modules/audio_processing/aecm/aecm_core_neon.c b/src/modules/audio_processing/aecm/aecm_core_neon.c index 86ced1ed3b..ab448b48da 100644 --- a/src/modules/audio_processing/aecm/aecm_core_neon.c +++ b/src/modules/audio_processing/aecm/aecm_core_neon.c @@ -7,7 +7,6 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON) #include "aecm_core.h" @@ -16,299 +15,289 @@ // Square root of Hanning window in Q14. -static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__ ((aligned (8))) = { - 16384, 16373, 16354, 16325, - 16286, 16237, 16179, 16111, - 16034, 15947, 15851, 15746, - 15631, 15506, 15373, 15231, - 15079, 14918, 14749, 14571, - 14384, 14189, 13985, 13773, - 13553, 13325, 13089, 12845, - 12594, 12335, 12068, 11795, - 11514, 11227, 10933, 10633, - 10326, 10013, 9695, 9370, - 9040, 8705, 8364, 8019, - 7668, 7313, 6954, 6591, - 6224, 5853, 5478, 5101, - 4720, 4337, 3951, 3562, - 3172, 2780, 2386, 1990, - 1594, 1196, 798, 399 +static const WebRtc_Word16 kSqrtHanningReversed[] __attribute__((aligned(8))) = { + 16384, 16373, 16354, 16325, + 16286, 16237, 16179, 16111, + 16034, 15947, 15851, 15746, + 15631, 15506, 15373, 15231, + 15079, 14918, 14749, 14571, + 14384, 14189, 13985, 13773, + 13553, 13325, 13089, 12845, + 12594, 12335, 12068, 11795, + 11514, 11227, 10933, 10633, + 10326, 10013, 9695, 9370, + 9040, 8705, 8364, 8019, + 7668, 7313, 6954, 6591, + 6224, 5853, 5478, 5101, + 4720, 4337, 3951, 3562, + 3172, 2780, 2386, 1990, + 1594, 1196, 798, 399 }; -void WebRtcAecm_WindowAndFFT(WebRtc_Word16* fft, +static void WindowAndFFTNeon(WebRtc_Word16* fft, const WebRtc_Word16* time_signal, complex16_t* freq_signal, - int time_signal_scaling) -{ - int i, j; + int time_signal_scaling) { + int i, j; - int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); - __asm__("vmov.i16 d21, #0" ::: "d21"); + int16x4_t tmp16x4_scaling = vdup_n_s16(time_signal_scaling); + __asm__("vmov.i16 d21, #0" ::: "d21"); - for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8) - { - int16x4_t tmp16x4_0; - int16x4_t tmp16x4_1; - int32x4_t tmp32x4_0; + for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) { + int16x4_t tmp16x4_0; + int16x4_t tmp16x4_1; + int32x4_t tmp32x4_0; - /* Window near end */ - // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] - // << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); - tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); + /* Window near end */ + // fft[j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT((time_signal[i] + // << time_signal_scaling), WebRtcAecm_kSqrtHanning[i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i])); + tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); - tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); + tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); - __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); - __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); + __asm__("vshrn.i32 d20, %q0, #14" : : 
"w"(tmp32x4_0) : "d20"); + __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[j]) : "q10"); - // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( - // (time_signal[PART_LEN + i] << time_signal_scaling), - // WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN])); - tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); + // fft[PART_LEN2 + j] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT( + // (time_signal[PART_LEN + i] << time_signal_scaling), + // WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&time_signal[i + PART_LEN])); + tmp16x4_0 = vshl_s16(tmp16x4_0, tmp16x4_scaling); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); - tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); + tmp32x4_0 = vmull_s16(tmp16x4_0, tmp16x4_1); - __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); - __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); - } + __asm__("vshrn.i32 d20, %q0, #14" : : "w"(tmp32x4_0) : "d20"); + __asm__("vst2.16 {d20, d21}, [%0, :128]" : : "r"(&fft[PART_LEN2 + j]) : "q10"); + } - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + WebRtcSpl_ComplexFFT(fft, PART_LEN_SHIFT, 1); - // Take only the first PART_LEN2 samples, and switch the sign of the imaginary part. - for(i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) - { - __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); - __asm__("vneg.s16 d22, d22" : : : "q10"); - __asm__("vneg.s16 d23, d23" : : : "q11"); - __asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : : + // Take only the first PART_LEN2 samples, and switch the sign of the imaginary part. + for (i = 0, j = 0; j < PART_LEN2; i += 8, j += 16) { + __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); + __asm__("vneg.s16 d22, d22" : : : "q10"); + __asm__("vneg.s16 d23, d23" : : : "q11"); + __asm__("vst2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&freq_signal[i].real): "q10", "q11"); - } + } } -void WebRtcAecm_InverseFFTAndWindow(AecmCore_t* aecm, - WebRtc_Word16* fft, - complex16_t* efw, - WebRtc_Word16* output, - const WebRtc_Word16* nearendClean) -{ - int i, j, outCFFT; - WebRtc_Word32 tmp32no1; +static void InverseFFTAndWindowNeon(AecmCore_t* aecm, + WebRtc_Word16* fft, + complex16_t* efw, + WebRtc_Word16* output, + const WebRtc_Word16* nearendClean) { + int i, j, outCFFT; + WebRtc_Word32 tmp32no1; - // Synthesis - for(i = 0, j = 0; i < PART_LEN; i += 4, j += 8) - { - // We overwrite two more elements in fft[], but it's ok. - __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10"); - __asm__("vmov q11, q10" : : : "q10", "q11"); + // Synthesis + for (i = 0, j = 0; i < PART_LEN; i += 4, j += 8) { + // We overwrite two more elements in fft[], but it's ok. 
+ __asm__("vld2.16 {d20, d21}, [%0, :128]" : : "r"(&(efw[i].real)) : "q10"); + __asm__("vmov q11, q10" : : : "q10", "q11"); - __asm__("vneg.s16 d23, d23" : : : "q11"); - __asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11"); + __asm__("vneg.s16 d23, d23" : : : "q11"); + __asm__("vst2.16 {d22, d23}, [%0, :128]" : : "r"(&fft[j]): "q11"); - __asm__("vrev64.16 q10, q10" : : : "q10"); - __asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10"); - } + __asm__("vrev64.16 q10, q10" : : : "q10"); + __asm__("vst2.16 {d20, d21}, [%0]" : : "r"(&fft[PART_LEN4 - j - 6]): "q10"); + } - fft[PART_LEN2] = efw[PART_LEN].real; - fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; + fft[PART_LEN2] = efw[PART_LEN].real; + fft[PART_LEN2 + 1] = -efw[PART_LEN].imag; - // Inverse FFT, result should be scaled with outCFFT. - WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); - outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); + // Inverse FFT, result should be scaled with outCFFT. + WebRtcSpl_ComplexBitReverse(fft, PART_LEN_SHIFT); + outCFFT = WebRtcSpl_ComplexIFFT(fft, PART_LEN_SHIFT, 1); - // Take only the real values and scale with outCFFT. - for (i = 0, j = 0; i < PART_LEN2; i += 8, j+= 16) - { - __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); - __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10"); - } + // Take only the real values and scale with outCFFT. + for (i = 0, j = 0; i < PART_LEN2; i += 8, j += 16) { + __asm__("vld2.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&fft[j]) : "q10", "q11"); + __asm__("vst1.16 {d20, d21}, [%0, :128]" : : "r"(&fft[i]): "q10"); + } - int32x4_t tmp32x4_2; - __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32) - (outCFFT - aecm->dfaCleanQDomain))); - for (i = 0; i < PART_LEN; i += 4) - { - int16x4_t tmp16x4_0; - int16x4_t tmp16x4_1; - int32x4_t tmp32x4_0; - int32x4_t tmp32x4_1; + int32x4_t tmp32x4_2; + __asm__("vdup.32 %q0, %1" : "=w"(tmp32x4_2) : "r"((WebRtc_Word32) + (outCFFT - aecm->dfaCleanQDomain))); + for (i = 0; i < PART_LEN; i += 4) { + int16x4_t tmp16x4_0; + int16x4_t tmp16x4_1; + int32x4_t tmp32x4_0; + int32x4_t tmp32x4_1; - // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - // fft[i], WebRtcAecm_kSqrtHanning[i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i])); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); - __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); - __asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); + // fft[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + // fft[i], WebRtcAecm_kSqrtHanning[i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[i])); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&WebRtcAecm_kSqrtHanning[i])); + __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); + __asm__("vrshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); - // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], - // outCFFT - aecm->dfaCleanQDomain); - __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); + // tmp32no1 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)fft[i], + // outCFFT - aecm->dfaCleanQDomain); + __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); - // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, - // tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN); - // output[i] = 
fft[i]; - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); - __asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); - __asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); - __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); - __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i])); - __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); + // fft[i] = (WebRtc_Word16)WEBRTC_SPL_SAT(WEBRTC_SPL_WORD16_MAX, + // tmp32no1 + outBuf[i], WEBRTC_SPL_WORD16_MIN); + // output[i] = fft[i]; + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&aecm->outBuf[i])); + __asm__("vmovl.s16 %q0, %P1" : "=w"(tmp32x4_1) : "w"(tmp16x4_0)); + __asm__("vadd.i32 %q0, %q1" : : "w"(tmp32x4_0), "w"(tmp32x4_1)); + __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&fft[i])); + __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&output[i])); - // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( - // fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i])); - __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); - __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); - __asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); + // tmp32no1 = WEBRTC_SPL_MUL_16_16_RSFT( + // fft[PART_LEN + i], WebRtcAecm_kSqrtHanning[PART_LEN - i], 14); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_0) : "r"(&fft[PART_LEN + i])); + __asm__("vld1.16 %P0, [%1, :64]" : "=w"(tmp16x4_1) : "r"(&kSqrtHanningReversed[i])); + __asm__("vmull.s16 %q0, %P1, %P2" : "=w"(tmp32x4_0) : "w"(tmp16x4_0), "w"(tmp16x4_1)); + __asm__("vshr.s32 %q0, %q1, #14" : "=w"(tmp32x4_0) : "0"(tmp32x4_0)); - // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain); - __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); - // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( - // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); - __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); - __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); - } + // tmp32no1 = WEBRTC_SPL_SHIFT_W32(tmp32no1, outCFFT - aecm->dfaCleanQDomain); + __asm__("vshl.s32 %q0, %q1, %q2" : "=w"(tmp32x4_0) : "0"(tmp32x4_0), "w"(tmp32x4_2)); + // outBuf[i] = (WebRtc_Word16)WEBRTC_SPL_SAT( + // WEBRTC_SPL_WORD16_MAX, tmp32no1, WEBRTC_SPL_WORD16_MIN); + __asm__("vqshrn.s32 %P0, %q1, #0" : "=w"(tmp16x4_0) : "w"(tmp32x4_0)); + __asm__("vst1.16 %P0, [%1, :64]" : : "w"(tmp16x4_0), "r"(&aecm->outBuf[i])); + } - // Copy the current block to the old position (outBuf is shifted elsewhere). - for (i = 0; i < PART_LEN; i += 16) - { - __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + // Copy the current block to the old position (outBuf is shifted elsewhere). 
+ for (i = 0; i < PART_LEN; i += 16) { + __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i + PART_LEN]) : "q10"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10"); - } - for (i = 0; i < PART_LEN; i += 16) - { - __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->xBuf[i]): "q10"); + } + for (i = 0; i < PART_LEN; i += 16) { + __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->dBufNoisy[i + PART_LEN]) : "q10"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&aecm->dBufNoisy[i]): "q10"); + } + if (nearendClean != NULL) { + for (i = 0; i < PART_LEN; i += 16) { + __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->dBufClean[i + PART_LEN]) : "q10"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->dBufClean[i]): "q10"); } - if (nearendClean != NULL) { - for (i = 0; i < PART_LEN; i += 16) - { - __asm__("vld1.16 {d20, d21, d22, d23}, [%0, :256]" : : - "r"(&aecm->dBufClean[i + PART_LEN]) : "q10"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : - "r"(&aecm->dBufClean[i]): "q10"); - } - } + } } -void WebRtcAecm_CalcLinearEnergies(AecmCore_t* aecm, +static void CalcLinearEnergiesNeon(AecmCore_t* aecm, const WebRtc_UWord16* far_spectrum, WebRtc_Word32* echo_est, WebRtc_UWord32* far_energy, WebRtc_UWord32* echo_energy_adapt, - WebRtc_UWord32* echo_energy_stored) -{ - int i; + WebRtc_UWord32* echo_energy_stored) { + int i; - register WebRtc_UWord32 far_energy_r; - register WebRtc_UWord32 echo_energy_stored_r; - register WebRtc_UWord32 echo_energy_adapt_r; - uint32x4_t tmp32x4_0; + register WebRtc_UWord32 far_energy_r; + register WebRtc_UWord32 echo_energy_stored_r; + register WebRtc_UWord32 echo_energy_adapt_r; + uint32x4_t tmp32x4_0; - __asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy - __asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored - __asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt + __asm__("vmov.i32 q14, #0" : : : "q14"); // far_energy + __asm__("vmov.i32 q8, #0" : : : "q8"); // echo_energy_stored + __asm__("vmov.i32 q9, #0" : : : "q9"); // echo_energy_adapt - for(i = 0; i < PART_LEN -7; i += 8) - { - // far_energy += (WebRtc_UWord32)(far_spectrum[i]); - __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); - __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); - __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); - - // Get estimated echo energies for adaptive channel and stored channel. 
- // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); - __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): - "q10", "q11"); - - // echo_energy_stored += (WebRtc_UWord32)echoEst[i]; - __asm__("vadd.u32 q8, q10" : : : "q10", "q8"); - __asm__("vadd.u32 q8, q11" : : : "q11", "q8"); - - // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16( - // aecm->channelAdapt16[i], far_spectrum[i]); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vadd.u32 q9, q10" : : : "q9", "q15"); - __asm__("vadd.u32 q9, q11" : : : "q9", "q11"); - } - - __asm__("vadd.u32 d28, d29" : : : "q14"); - __asm__("vpadd.u32 d28, d28" : : : "q14"); - __asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); - - __asm__("vadd.u32 d18, d19" : : : "q9"); - __asm__("vpadd.u32 d18, d18" : : : "q9"); - __asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); - - __asm__("vadd.u32 d16, d17" : : : "q8"); - __asm__("vpadd.u32 d16, d16" : : : "q8"); - __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); + for (i = 0; i < PART_LEN - 7; i += 8) { + // far_energy += (WebRtc_UWord32)(far_spectrum[i]); + __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); + __asm__("vaddw.u16 q14, q14, d26" : : : "q14", "q13"); + __asm__("vaddw.u16 q14, q14, d27" : : : "q14", "q13"); // Get estimated echo energies for adaptive channel and stored channel. 
- echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i]; - *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]); - *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16( - aecm->channelAdapt16[i], far_spectrum[i]); + // echoEst[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); + __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm__("vst1.32 {d20, d21, d22, d23}, [%0, :256]" : : "r"(&echo_est[i]): + "q10", "q11"); + + // echo_energy_stored += (WebRtc_UWord32)echoEst[i]; + __asm__("vadd.u32 q8, q10" : : : "q10", "q8"); + __asm__("vadd.u32 q8, q11" : : : "q11", "q8"); + + // echo_energy_adapt += WEBRTC_SPL_UMUL_16_16( + // aecm->channelAdapt16[i], far_spectrum[i]); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm__("vadd.u32 q9, q10" : : : "q9", "q15"); + __asm__("vadd.u32 q9, q11" : : : "q9", "q11"); + } + + __asm__("vadd.u32 d28, d29" : : : "q14"); + __asm__("vpadd.u32 d28, d28" : : : "q14"); + __asm__("vmov.32 %0, d28[0]" : "=r"(far_energy_r): : "q14"); + + __asm__("vadd.u32 d18, d19" : : : "q9"); + __asm__("vpadd.u32 d18, d18" : : : "q9"); + __asm__("vmov.32 %0, d18[0]" : "=r"(echo_energy_adapt_r): : "q9"); + + __asm__("vadd.u32 d16, d17" : : : "q8"); + __asm__("vpadd.u32 d16, d16" : : : "q8"); + __asm__("vmov.32 %0, d16[0]" : "=r"(echo_energy_stored_r): : "q8"); + + // Get estimated echo energies for adaptive channel and stored channel. + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + *echo_energy_stored = echo_energy_stored_r + (WebRtc_UWord32)echo_est[i]; + *far_energy = far_energy_r + (WebRtc_UWord32)(far_spectrum[i]); + *echo_energy_adapt = echo_energy_adapt_r + WEBRTC_SPL_UMUL_16_16( + aecm->channelAdapt16[i], far_spectrum[i]); } -void WebRtcAecm_StoreAdaptiveChannel(AecmCore_t* aecm, +static void StoreAdaptiveChannelNeon(AecmCore_t* aecm, const WebRtc_UWord16* far_spectrum, - WebRtc_Word32* echo_est) -{ - int i; + WebRtc_Word32* echo_est) { + int i; - // During startup we store the channel every block. - // Recalculate echo estimate. - for(i = 0; i < PART_LEN -7; i += 8) - { - // aecm->channelStored[i] = acem->channelAdapt16[i]; - // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); - __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); - __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); - __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : - "r"(&echo_est[i]) : "q10", "q11"); - } - aecm->channelStored[i] = aecm->channelAdapt16[i]; - echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + // During startup we store the channel every block. + // Recalculate echo estimate. 
+ for (i = 0; i < PART_LEN - 7; i += 8) { + // aecm->channelStored[i] = acem->channelAdapt16[i]; + // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); + __asm__("vld1.16 {d26, d27}, [%0]" : : "r"(&far_spectrum[i]) : "q13"); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm__("vst1.16 {d24, d25}, [%0, :128]" : : "r"(&aecm->channelStored[i]) : "q12"); + __asm__("vmull.u16 q10, d26, d24" : : : "q12", "q13", "q10"); + __asm__("vmull.u16 q11, d27, d25" : : : "q12", "q13", "q11"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&echo_est[i]) : "q10", "q11"); + } + aecm->channelStored[i] = aecm->channelAdapt16[i]; + echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], far_spectrum[i]); } -void WebRtcAecm_ResetAdaptiveChannel(AecmCore_t* aecm) -{ - int i; +static void ResetAdaptiveChannelNeon(AecmCore_t* aecm) { + int i; - for(i = 0; i < PART_LEN -7; i += 8) - { - // aecm->channelAdapt16[i] = aecm->channelStored[i]; - // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32) - // aecm->channelStored[i], 16); - __asm__("vld1.16 {d24, d25}, [%0, :128]" : : - "r"(&aecm->channelStored[i]) : "q12"); - __asm__("vst1.16 {d24, d25}, [%0, :128]" : : - "r"(&aecm->channelAdapt16[i]) : "q12"); - __asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); - __asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); - __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : - "r"(&aecm->channelAdapt32[i]): "q10", "q11"); - } - aecm->channelAdapt16[i] = aecm->channelStored[i]; - aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( - (WebRtc_Word32)aecm->channelStored[i], 16); + for (i = 0; i < PART_LEN - 7; i += 8) { + // aecm->channelAdapt16[i] = aecm->channelStored[i]; + // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32) + // aecm->channelStored[i], 16); + __asm__("vld1.16 {d24, d25}, [%0, :128]" : : + "r"(&aecm->channelStored[i]) : "q12"); + __asm__("vst1.16 {d24, d25}, [%0, :128]" : : + "r"(&aecm->channelAdapt16[i]) : "q12"); + __asm__("vshll.s16 q10, d24, #16" : : : "q12", "q13", "q10"); + __asm__("vshll.s16 q11, d25, #16" : : : "q12", "q13", "q11"); + __asm__("vst1.16 {d20, d21, d22, d23}, [%0, :256]" : : + "r"(&aecm->channelAdapt32[i]): "q10", "q11"); + } + aecm->channelAdapt16[i] = aecm->channelStored[i]; + aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( + (WebRtc_Word32)aecm->channelStored[i], 16); } -#endif // #if defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON) +void WebRtcAecm_InitNeon(void) { + WebRtcAecm_WindowAndFFT = WindowAndFFTNeon; + WebRtcAecm_InverseFFTAndWindow = InverseFFTAndWindowNeon; + WebRtcAecm_CalcLinearEnergies = CalcLinearEnergiesNeon; + WebRtcAecm_StoreAdaptiveChannel = StoreAdaptiveChannelNeon; + WebRtcAecm_ResetAdaptiveChannel = ResetAdaptiveChannelNeon; +} diff --git a/src/modules/audio_processing/ns/Android.mk b/src/modules/audio_processing/ns/Android.mk index 1363a93657..aba95e1d9f 100644 --- a/src/modules/audio_processing/ns/Android.mk +++ b/src/modules/audio_processing/ns/Android.mk @@ -6,6 +6,8 @@ # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. +############################# +# Build the non-neon library. LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) @@ -20,25 +22,20 @@ LOCAL_SRC_FILES := \ noise_suppression_x.c \ nsx_core.c -# floating point +# Files for floating point. # noise_suppression.c ns_core.c # Flags passed to both C and C++ files. 
-LOCAL_CFLAGS := \ - $(MY_WEBRTC_COMMON_DEFS) - -ifeq ($(ARCH_ARM_HAVE_NEON),true) -LOCAL_SRC_FILES += \ - nsx_core_neon.c -LOCAL_CFLAGS += \ - $(MY_ARM_CFLAGS_NEON) -endif +LOCAL_CFLAGS := $(MY_WEBRTC_COMMON_DEFS) LOCAL_C_INCLUDES := \ $(LOCAL_PATH)/interface \ $(LOCAL_PATH)/../utility \ $(LOCAL_PATH)/../../.. \ - $(LOCAL_PATH)/../../../common_audio/signal_processing/include + $(LOCAL_PATH)/../../../common_audio/signal_processing/include \ + $(LOCAL_PATH)/../../../system_wrappers/interface + +LOCAL_STATIC_LIBRARIES += libwebrtc_system_wrappers LOCAL_SHARED_LIBRARIES := \ libcutils \ @@ -49,3 +46,31 @@ ifndef NDK_ROOT include external/stlport/libstlport.mk endif include $(BUILD_STATIC_LIBRARY) + +############################# +# Build the neon library. + +include $(CLEAR_VARS) + +LOCAL_MODULE_CLASS := STATIC_LIBRARIES +LOCAL_MODULE := libwebrtc_ns_neon +LOCAL_MODULE_TAGS := optional +LOCAL_GENERATED_SOURCES := + +LOCAL_SRC_FILES := nsx_core_neon.c + +# Flags passed to both C and C++ files. +LOCAL_CFLAGS := \ + $(MY_WEBRTC_COMMON_DEFS) \ + -mfpu=neon \ + -flax-vector-conversions + +LOCAL_C_INCLUDES := \ + $(LOCAL_PATH)/interface \ + $(LOCAL_PATH)/../../.. \ + $(LOCAL_PATH)/../../../common_audio/signal_processing/include + +ifndef NDK_ROOT +include external/stlport/libstlport.mk +endif +include $(BUILD_STATIC_LIBRARY) diff --git a/src/modules/audio_processing/ns/nsx_core.c b/src/modules/audio_processing/ns/nsx_core.c index 66c49134f2..3879161e8b 100644 --- a/src/modules/audio_processing/ns/nsx_core.c +++ b/src/modules/audio_processing/ns/nsx_core.c @@ -16,6 +16,7 @@ #include #include +#include "cpu_features_wrapper.h" #include "nsx_core.h" // Skip first frequency bins during estimation. (0 <= value < 64) @@ -426,6 +427,271 @@ static const WebRtc_Word16 kDeterminantEstMatrix[66] = { 355, 330 }; +// Declare function pointers. +NoiseEstimation WebRtcNsx_NoiseEstimation; +PrepareSpectrum WebRtcNsx_PrepareSpectrum; +SynthesisUpdate WebRtcNsx_SynthesisUpdate; +AnalysisUpdate WebRtcNsx_AnalysisUpdate; +Denormalize WebRtcNsx_Denormalize; +CreateComplexBuffer WebRtcNsx_CreateComplexBuffer; + +// Update the noise estimation information. 
+static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { + WebRtc_Word32 tmp32no1 = 0; + WebRtc_Word32 tmp32no2 = 0; + WebRtc_Word16 tmp16 = 0; + const WebRtc_Word16 kExp2Const = 11819; // Q13 + + int i = 0; + + tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, + inst->magnLen); + // Guarantee a Q-domain as high as possible and still fit in int16 + inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + kExp2Const, tmp16, 21); + for (i = 0; i < inst->magnLen; i++) { + // inst->quantile[i]=exp(inst->lquantile[offset+i]); + // in Q21 + tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, + inst->noiseEstLogQuantile[offset + i]); + tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac + tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21); + tmp16 -= 21;// shift 21 to get result in Q0 + tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise) + if (tmp16 < 0) { + tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16); + } else { + tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16); + } + inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1); + } +} + +// Noise Estimation +static void NoiseEstimationC(NsxInst_t* inst, + uint16_t* magn, + uint32_t* noise, + int16_t* q_noise) { + WebRtc_Word32 numerator = FACTOR_Q16; + WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv; + WebRtc_Word16 countProd, delta, zeros, frac; + WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2; + const int16_t log2_const = 22713; // Q15 + const int16_t width_factor = 21845; + + int i, s, offset; + + tabind = inst->stages - inst->normData; + assert(tabind < 9); + assert(tabind > -9); + if (tabind < 0) { + logval = -WebRtcNsx_kLogTable[-tabind]; + } else { + logval = WebRtcNsx_kLogTable[tabind]; + } + + // lmagn(i)=log(magn(i))=log(2)*log2(magn(i)) + // magn is in Q(-stages), and the real lmagn values are: + // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages) + // lmagn in Q8 + for (i = 0; i < inst->magnLen; i++) { + if (magn[i]) { + zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]); + frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros) + & 0x7FFFFFFF) >> 23); + // log2(magn(i)) + assert(frac < 256); + log2 = (WebRtc_Word16)(((31 - zeros) << 8) + + WebRtcNsx_kLogTableFrac[frac]); + // log2(magn(i))*log(2) + lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15); + // + log(2^stages) + lmagn[i] += logval; + } else { + lmagn[i] = logval;//0; + } + } + + // loop over simultaneous estimates + for (s = 0; s < SIMULT; s++) { + offset = s * inst->magnLen; + + // Get counter values from state + counter = inst->noiseEstCounter[s]; + assert(counter < 201); + countDiv = WebRtcNsx_kCounterDiv[counter]; + countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv); + + // quant_est(...) + for (i = 0; i < inst->magnLen; i++) { + // compute delta + if (inst->noiseEstDensity[offset + i] > 512) { + delta = WebRtcSpl_DivW32W16ResW16(numerator, + inst->noiseEstDensity[offset + i]); + } else { + delta = FACTOR_Q7; + if (inst->blockIndex < END_STARTUP_LONG) { + // Smaller step size during startup. This prevents from using + // unrealistic values causing overflow. 
+ delta = FACTOR_Q7_STARTUP; + } + } + + // update log quantile estimate + tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14); + if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) { + // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2 + // CounterDiv=1/(inst->counter[s]+1) in Q15 + tmp16 += 2; + tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2); + inst->noiseEstLogQuantile[offset + i] += tmp16no1; + } else { + tmp16 += 1; + tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1); + // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2 + tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1); + inst->noiseEstLogQuantile[offset + i] -= tmp16no2; + if (inst->noiseEstLogQuantile[offset + i] < logval) { + // This is the smallest fixed point representation we can + // have, hence we limit the output. + inst->noiseEstLogQuantile[offset + i] = logval; + } + } + + // update density estimate + if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i]) + < WIDTH_Q8) { + tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + inst->noiseEstDensity[offset + i], countProd, 15); + tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + width_factor, countDiv, 15); + inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2; + } + } // end loop over magnitude spectrum + + if (counter >= END_STARTUP_LONG) { + inst->noiseEstCounter[s] = 0; + if (inst->blockIndex >= END_STARTUP_LONG) { + UpdateNoiseEstimate(inst, offset); + } + } + inst->noiseEstCounter[s]++; + + } // end loop over simultaneous estimates + + // Sequentially update the noise during startup + if (inst->blockIndex < END_STARTUP_LONG) { + UpdateNoiseEstimate(inst, offset); + } + + for (i = 0; i < inst->magnLen; i++) { + noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise) + } + (*q_noise) = (WebRtc_Word16)inst->qNoise; +} + +// Filter the data in the frequency domain, and create spectrum. +static void PrepareSpectrumC(NsxInst_t* inst, int16_t* freq_buf) { + int i = 0, j = 0; + int16_t tmp16 = 0; + + for (i = 0; i < inst->magnLen; i++) { + inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i], + (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) + inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i], + (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) + } + + freq_buf[0] = inst->real[0]; + freq_buf[1] = -inst->imag[0]; + for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { + tmp16 = (inst->anaLen << 1) - j; + freq_buf[j] = inst->real[i]; + freq_buf[j + 1] = -inst->imag[i]; + freq_buf[tmp16] = inst->real[i]; + freq_buf[tmp16 + 1] = inst->imag[i]; + } + freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; + freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; +} + +// Denormalize the input buffer. +static __inline void DenormalizeC(NsxInst_t* inst, int16_t* in, int factor) { + int i = 0, j = 0; + int32_t tmp32 = 0; + for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) { + tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j], + factor - inst->normData); + inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0 + } +} + +// For the noise supression process, synthesis, read out fully processed +// segment, and update synthesis buffer. 
+static void SynthesisUpdateC(NsxInst_t* inst, + int16_t* out_frame, + int16_t gain_factor) { + int i = 0; + int16_t tmp16a = 0; + int16_t tmp16b = 0; + int32_t tmp32 = 0; + + // synthesis + for (i = 0; i < inst->anaLen; i++) { + tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + inst->window[i], inst->real[i], 14); // Q0, window in Q14 + tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0 + // Down shift with rounding + tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0 + inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i], + tmp16b); // Q0 + } + + // read out fully processed segment + for (i = 0; i < inst->blockLen10ms; i++) { + out_frame[i] = inst->synthesisBuffer[i]; // Q0 + } + + // update synthesis buffer + WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer, + inst->synthesisBuffer + inst->blockLen10ms, + inst->anaLen - inst->blockLen10ms); + WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer + + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms); +} + +// Update analysis buffer for lower band, and window data before FFT. +static void AnalysisUpdateC(NsxInst_t* inst, + int16_t* out, + int16_t* new_speech) { + int i = 0; + + // For lower band update analysis buffer. + WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer, + inst->analysisBuffer + inst->blockLen10ms, + inst->anaLen - inst->blockLen10ms); + WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer + + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms); + + // Window data before FFT. + for (i = 0; i < inst->anaLen; i++) { + out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( + inst->window[i], inst->analysisBuffer[i], 14); // Q0 + } +} + +// Create a complex number buffer (out[]) as the intput (in[]) interleaved with +// zeros, and normalize it. +static __inline void CreateComplexBufferC(NsxInst_t* inst, + int16_t* in, + int16_t* out) { + int i = 0, j = 0; + for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) { + out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData) + out[j + 1] = 0; // Insert zeros in imaginary part + } +} + void WebRtcNsx_CalcParametricNoiseEstimate(NsxInst_t* inst, WebRtc_Word16 pink_noise_exp_avg, WebRtc_Word32 pink_noise_num_avg, @@ -600,6 +866,24 @@ WebRtc_Word32 WebRtcNsx_InitCore(NsxInst_t* inst, WebRtc_UWord32 fs) { inst->file5 = fopen("file5.pcm", "wb"); #endif + // Initialize function pointers. + WebRtcNsx_NoiseEstimation = NoiseEstimationC; + WebRtcNsx_PrepareSpectrum = PrepareSpectrumC; + WebRtcNsx_SynthesisUpdate = SynthesisUpdateC; + WebRtcNsx_AnalysisUpdate = AnalysisUpdateC; + WebRtcNsx_Denormalize = DenormalizeC; + WebRtcNsx_CreateComplexBuffer = CreateComplexBufferC; + +#ifdef WEBRTC_DETECT_ARM_NEON + uint64_t features = WebRtc_GetCPUFeaturesARM(); + if ((features & kCPUFeatureNEON) != 0) + { + WebRtcNsx_InitNeon(); + } +#elif defined(WEBRTC_ARCH_ARM_NEON) + WebRtcNsx_InitNeon(); +#endif + inst->initFlag = 1; return 0; @@ -2157,263 +2441,4 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst, short* speechFrame, short* speechFram return 0; } -#if !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)) -// Update the noise estimation information. 
-static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { - WebRtc_Word32 tmp32no1 = 0; - WebRtc_Word32 tmp32no2 = 0; - WebRtc_Word16 tmp16 = 0; - const WebRtc_Word16 kExp2Const = 11819; // Q13 - - int i = 0; - - tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset, - inst->magnLen); - // Guarantee a Q-domain as high as possible and still fit in int16 - inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - kExp2Const, tmp16, 21); - for (i = 0; i < inst->magnLen; i++) { - // inst->quantile[i]=exp(inst->lquantile[offset+i]); - // in Q21 - tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, - inst->noiseEstLogQuantile[offset + i]); - tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac - tmp16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32no2, 21); - tmp16 -= 21;// shift 21 to get result in Q0 - tmp16 += (WebRtc_Word16) inst->qNoise; //shift to get result in Q(qNoise) - if (tmp16 < 0) { - tmp32no1 = WEBRTC_SPL_RSHIFT_W32(tmp32no1, -tmp16); - } else { - tmp32no1 = WEBRTC_SPL_LSHIFT_W32(tmp32no1, tmp16); - } - inst->noiseEstQuantile[i] = WebRtcSpl_SatW32ToW16(tmp32no1); - } -} - -// Noise Estimation -void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, - uint16_t* magn, - uint32_t* noise, - int16_t* q_noise) { - WebRtc_Word32 numerator = FACTOR_Q16; - WebRtc_Word16 lmagn[HALF_ANAL_BLOCKL], counter, countDiv; - WebRtc_Word16 countProd, delta, zeros, frac; - WebRtc_Word16 log2, tabind, logval, tmp16, tmp16no1, tmp16no2; - const int16_t log2_const = 22713; // Q15 - const int16_t width_factor = 21845; - - int i, s, offset; - - tabind = inst->stages - inst->normData; - assert(tabind < 9); - assert(tabind > -9); - if (tabind < 0) { - logval = -WebRtcNsx_kLogTable[-tabind]; - } else { - logval = WebRtcNsx_kLogTable[tabind]; - } - - // lmagn(i)=log(magn(i))=log(2)*log2(magn(i)) - // magn is in Q(-stages), and the real lmagn values are: - // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages) - // lmagn in Q8 - for (i = 0; i < inst->magnLen; i++) { - if (magn[i]) { - zeros = WebRtcSpl_NormU32((WebRtc_UWord32)magn[i]); - frac = (WebRtc_Word16)((((WebRtc_UWord32)magn[i] << zeros) - & 0x7FFFFFFF) >> 23); - // log2(magn(i)) - assert(frac < 256); - log2 = (WebRtc_Word16)(((31 - zeros) << 8) - + WebRtcNsx_kLogTableFrac[frac]); - // log2(magn(i))*log(2) - lmagn[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15); - // + log(2^stages) - lmagn[i] += logval; - } else { - lmagn[i] = logval;//0; - } - } - - // loop over simultaneous estimates - for (s = 0; s < SIMULT; s++) { - offset = s * inst->magnLen; - - // Get counter values from state - counter = inst->noiseEstCounter[s]; - assert(counter < 201); - countDiv = WebRtcNsx_kCounterDiv[counter]; - countProd = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(counter, countDiv); - - // quant_est(...) - for (i = 0; i < inst->magnLen; i++) { - // compute delta - if (inst->noiseEstDensity[offset + i] > 512) { - delta = WebRtcSpl_DivW32W16ResW16(numerator, - inst->noiseEstDensity[offset + i]); - } else { - delta = FACTOR_Q7; - if (inst->blockIndex < END_STARTUP_LONG) { - // Smaller step size during startup. This prevents from using - // unrealistic values causing overflow. 
- delta = FACTOR_Q7_STARTUP; - } - } - - // update log quantile estimate - tmp16 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(delta, countDiv, 14); - if (lmagn[i] > inst->noiseEstLogQuantile[offset + i]) { - // +=QUANTILE*delta/(inst->counter[s]+1) QUANTILE=0.25, =1 in Q2 - // CounterDiv=1/(inst->counter[s]+1) in Q15 - tmp16 += 2; - tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 2); - inst->noiseEstLogQuantile[offset + i] += tmp16no1; - } else { - tmp16 += 1; - tmp16no1 = WEBRTC_SPL_RSHIFT_W16(tmp16, 1); - // *(1-QUANTILE), in Q2 QUANTILE=0.25, 1-0.25=0.75=3 in Q2 - tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(tmp16no1, 3, 1); - inst->noiseEstLogQuantile[offset + i] -= tmp16no2; - if (inst->noiseEstLogQuantile[offset + i] < logval) { - // This is the smallest fixed point representation we can - // have, hence we limit the output. - inst->noiseEstLogQuantile[offset + i] = logval; - } - } - - // update density estimate - if (WEBRTC_SPL_ABS_W16(lmagn[i] - inst->noiseEstLogQuantile[offset + i]) - < WIDTH_Q8) { - tmp16no1 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - inst->noiseEstDensity[offset + i], countProd, 15); - tmp16no2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND( - width_factor, countDiv, 15); - inst->noiseEstDensity[offset + i] = tmp16no1 + tmp16no2; - } - } // end loop over magnitude spectrum - - if (counter >= END_STARTUP_LONG) { - inst->noiseEstCounter[s] = 0; - if (inst->blockIndex >= END_STARTUP_LONG) { - UpdateNoiseEstimate(inst, offset); - } - } - inst->noiseEstCounter[s]++; - - } // end loop over simultaneous estimates - - // Sequentially update the noise during startup - if (inst->blockIndex < END_STARTUP_LONG) { - UpdateNoiseEstimate(inst, offset); - } - - for (i = 0; i < inst->magnLen; i++) { - noise[i] = (WebRtc_UWord32)(inst->noiseEstQuantile[i]); // Q(qNoise) - } - (*q_noise) = (WebRtc_Word16)inst->qNoise; -} - -// Filter the data in the frequency domain, and create spectrum. -void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { - int i = 0, j = 0; - int16_t tmp16 = 0; - - for (i = 0; i < inst->magnLen; i++) { - inst->real[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->real[i], - (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) - inst->imag[i] = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16_RSFT(inst->imag[i], - (WebRtc_Word16)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) - } - - freq_buf[0] = inst->real[0]; - freq_buf[1] = -inst->imag[0]; - for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { - tmp16 = (inst->anaLen << 1) - j; - freq_buf[j] = inst->real[i]; - freq_buf[j + 1] = -inst->imag[i]; - freq_buf[tmp16] = inst->real[i]; - freq_buf[tmp16 + 1] = inst->imag[i]; - } - freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; - freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; -} - -// Denormalize the input buffer. -__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) { - int i = 0, j = 0; - int32_t tmp32 = 0; - for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) { - tmp32 = WEBRTC_SPL_SHIFT_W32((WebRtc_Word32)in[j], - factor - inst->normData); - inst->real[i] = WebRtcSpl_SatW32ToW16(tmp32); // Q0 - } -} - -// For the noise supression process, synthesis, read out fully processed -// segment, and update synthesis buffer. 
-void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
-                               int16_t* out_frame,
-                               int16_t gain_factor) {
-  int i = 0;
-  int16_t tmp16a = 0;
-  int16_t tmp16b = 0;
-  int32_t tmp32 = 0;
-
-  // synthesis
-  for (i = 0; i < inst->anaLen; i++) {
-    tmp16a = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
-                 inst->window[i], inst->real[i], 14); // Q0, window in Q14
-    tmp32 = WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(tmp16a, gain_factor, 13); // Q0
-    // Down shift with rounding
-    tmp16b = WebRtcSpl_SatW32ToW16(tmp32); // Q0
-    inst->synthesisBuffer[i] = WEBRTC_SPL_ADD_SAT_W16(inst->synthesisBuffer[i],
-                                                      tmp16b); // Q0
-  }
-
-  // read out fully processed segment
-  for (i = 0; i < inst->blockLen10ms; i++) {
-    out_frame[i] = inst->synthesisBuffer[i]; // Q0
-  }
-
-  // update synthesis buffer
-  WEBRTC_SPL_MEMCPY_W16(inst->synthesisBuffer,
-                        inst->synthesisBuffer + inst->blockLen10ms,
-                        inst->anaLen - inst->blockLen10ms);
-  WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer
-      + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms);
-}
-
-// Update analysis buffer for lower band, and window data before FFT.
-void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
-                              int16_t* out,
-                              int16_t* new_speech) {
-  int i = 0;
-
-  // For lower band update analysis buffer.
-  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
-                        inst->analysisBuffer + inst->blockLen10ms,
-                        inst->anaLen - inst->blockLen10ms);
-  WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
-      + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
-
-  // Window data before FFT.
-  for (i = 0; i < inst->anaLen; i++) {
-    out[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
-                 inst->window[i], inst->analysisBuffer[i], 14); // Q0
-  }
-}
-
-// Create a complex number buffer (out[]) as the intput (in[]) interleaved with
-// zeros, and normalize it.
-__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
-                                            int16_t* in,
-                                            int16_t* out) {
-  int i = 0, j = 0;
-  for (i = 0, j = 0; i < inst->anaLen; i += 1, j += 2) {
-    out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
-    out[j + 1] = 0; // Insert zeros in imaginary part
-  }
-}
-
-#endif // !(defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID))
diff --git a/src/modules/audio_processing/ns/nsx_core.h b/src/modules/audio_processing/ns/nsx_core.h
index 990dfcb3a4..0a0faf98f8 100644
--- a/src/modules/audio_processing/ns/nsx_core.h
+++ b/src/modules/audio_processing/ns/nsx_core.h
@@ -165,40 +165,51 @@ int WebRtcNsx_ProcessCore(NsxInst_t* inst,
                           short* outFrameHigh);
 
 /****************************************************************************
- * Internal functions and variable declarations shared with optimized code.
+ * Function pointers for internal functions, shared by the ARM Neon and
+ * generic C implementations.
  */
-
 // Noise Estimation.
-void WebRtcNsx_NoiseEstimation(NsxInst_t* inst,
-                               uint16_t* magn,
-                               uint32_t* noise,
-                               int16_t* q_noise);
+typedef void (*NoiseEstimation)(NsxInst_t* inst,
+                                uint16_t* magn,
+                                uint32_t* noise,
+                                int16_t* q_noise);
+extern NoiseEstimation WebRtcNsx_NoiseEstimation;
 
 // Filter the data in the frequency domain, and create spectrum.
-void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst,
-                               int16_t* freq_buff);
+typedef void (*PrepareSpectrum)(NsxInst_t* inst,
+                                int16_t* freq_buff);
+extern PrepareSpectrum WebRtcNsx_PrepareSpectrum;
 
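A minimal single-file sketch of the pattern these declarations set up (names here are hypothetical, not part of the patch): each operation is a typedef'd function pointer that defaults to the portable C routine and is re-pointed at the Neon routine once at init time, so call sites never change:

    typedef void (*ProcessFn)(int16_t* buf, int len);
    static void ProcessC(int16_t* buf, int len)    { /* portable path */ }
    static void ProcessNeon(int16_t* buf, int len) { /* Neon path */ }
    ProcessFn Process = ProcessC;   // default binding at load time

    void InitNeonDispatch(void) {
      Process = ProcessNeon;        // rebound exactly once, during init
    }

The indirection costs one pointer load per call but lets a single binary serve both Neon and non-Neon devices, which is the point of this change.
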
 // For the noise supression process, synthesis, read out fully processed
 // segment, and update synthesis buffer.
-void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst,
-                               int16_t* out_frame,
-                               int16_t gain_factor);
+typedef void (*SynthesisUpdate)(NsxInst_t* inst,
+                                int16_t* out_frame,
+                                int16_t gain_factor);
+extern SynthesisUpdate WebRtcNsx_SynthesisUpdate;
 
 // Update analysis buffer for lower band, and window data before FFT.
-void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
-                              int16_t* out,
-                              int16_t* new_speech);
+typedef void (*AnalysisUpdate)(NsxInst_t* inst,
+                               int16_t* out,
+                               int16_t* new_speech);
+extern AnalysisUpdate WebRtcNsx_AnalysisUpdate;
 
 // Denormalize the input buffer.
-__inline void WebRtcNsx_Denormalize(NsxInst_t* inst,
-                                    int16_t* in,
-                                    int factor);
+typedef void (*Denormalize)(NsxInst_t* inst,
+                            int16_t* in,
+                            int factor);
+extern Denormalize WebRtcNsx_Denormalize;
 
 // Create a complex number buffer, as the intput interleaved with zeros,
 // and normalize it.
-__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
-                                            int16_t* in,
-                                            int16_t* out);
+typedef void (*CreateComplexBuffer)(NsxInst_t* inst,
+                                    int16_t* in,
+                                    int16_t* out);
+extern CreateComplexBuffer WebRtcNsx_CreateComplexBuffer;
+
+/****************************************************************************
+ * Initialization of the above function pointers for ARM Neon.
+ */
+void WebRtcNsx_InitNeon(void);
 
 extern const WebRtc_Word16 WebRtcNsx_kLogTable[9];
 extern const WebRtc_Word16 WebRtcNsx_kLogTableFrac[256];
@@ -208,4 +219,4 @@ extern const WebRtc_Word16 WebRtcNsx_kCounterDiv[201];
 }
 #endif
 
-#endif // WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_
+#endif // WEBRTC_MODULES_AUDIO_PROCESSING_NS_MAIN_SOURCE_NSX_CORE_H_
diff --git a/src/modules/audio_processing/ns/nsx_core_neon.c b/src/modules/audio_processing/ns/nsx_core_neon.c
index d01ba3b97d..675b65220c 100644
--- a/src/modules/audio_processing/ns/nsx_core_neon.c
+++ b/src/modules/audio_processing/ns/nsx_core_neon.c
@@ -8,15 +8,13 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#if defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
-
 #include "nsx_core.h"
 #include <arm_neon.h>
 #include <assert.h>
 
 // Update the noise estimation information.
-static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { +static void UpdateNoiseEstimateNeon(NsxInst_t* inst, int offset) { int i = 0; const int16_t kExp2Const = 11819; // Q13 int16_t* ptr_noiseEstLogQuantile = NULL; @@ -75,7 +73,7 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { } // Last iteration: - + // inst->quantile[i]=exp(inst->lquantile[offset+i]); // in Q21 int32_t tmp32no2 = WEBRTC_SPL_MUL_16_16(kExp2Const, @@ -94,10 +92,10 @@ static void UpdateNoiseEstimate(NsxInst_t* inst, int offset) { } // Noise Estimation -void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, - uint16_t* magn, - uint32_t* noise, - int16_t* q_noise) { +static void NoiseEstimationNeon(NsxInst_t* inst, + uint16_t* magn, + uint32_t* noise, + int16_t* q_noise) { int32_t numerator = FACTOR_Q16; int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv; int16_t countProd, delta, zeros, frac; @@ -126,11 +124,11 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, if (magn[i]) { zeros = WebRtcSpl_NormU32((uint32_t)magn[i]); frac = (int16_t)((((uint32_t)magn[i] << zeros) - & 0x7FFFFFFF) >> 23); + & 0x7FFFFFFF) >> 23); assert(frac < 256); // log2(magn(i)) log2 = (int16_t)(((31 - zeros) << 8) - + WebRtcNsx_kLogTableFrac[frac]); + + WebRtcNsx_kLogTableFrac[frac]); // log2(magn(i))*log(2) lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15); // + log(2^stages) @@ -302,7 +300,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, if (counter >= END_STARTUP_LONG) { inst->noiseEstCounter[s] = 0; if (inst->blockIndex >= END_STARTUP_LONG) { - UpdateNoiseEstimate(inst, offset); + UpdateNoiseEstimateNeon(inst, offset); } } inst->noiseEstCounter[s]++; @@ -311,7 +309,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, // Sequentially update the noise during startup if (inst->blockIndex < END_STARTUP_LONG) { - UpdateNoiseEstimate(inst, offset); + UpdateNoiseEstimateNeon(inst, offset); } for (i = 0; i < inst->magnLen; i++) { @@ -321,7 +319,7 @@ void WebRtcNsx_NoiseEstimation(NsxInst_t* inst, } // Filter the data in the frequency domain, and create spectrum. -void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { +static void PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) { // (1) Filtering. @@ -338,7 +336,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0]; // Filter the rest in the frequency domain. - for (; ptr_real < &inst->real[inst->magnLen - 1]; ) { + for (; ptr_real < &inst->real[inst->magnLen - 1];) { // Loop unrolled once. Both pointers are incremented by 4 twice. __asm__ __volatile__( "vld1.16 d20, [%[ptr_real]]\n\t" @@ -368,7 +366,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { : :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "q9", "q10", "q11", "q12" - ); + ); } // Filter the last pair of elements in the frequency domain. @@ -400,7 +398,7 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8]; ptr_real = &inst->real[1]; ptr_imag = &inst->imag[1]; - for (; ptr_real < &inst->real[inst->anaLen2 - 11]; ) { + for (; ptr_real < &inst->real[inst->anaLen2 - 11];) { // Loop unrolled once. All pointers are incremented twice. __asm__ __volatile__( "vld1.16 d22, [%[ptr_real]]!\n\t" @@ -456,13 +454,13 @@ void WebRtcNsx_PrepareSpectrum(NsxInst_t* inst, int16_t* freq_buf) { } // Denormalize the input buffer. 
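The hunks in this file keep the original GCC inline-assembly style. As an aid to reading the Denormalize routine that follows, here is a rough NEON-intrinsics rendering of what one unrolled iteration appears to do (a sketch under the assumption that the vshl.s32-based assembly mirrors the C path's shift-and-saturate; the helper name is hypothetical):

    #include <arm_neon.h>
    // De-interleave 8 complex int16 pairs, widen the real parts to 32 bits,
    // shift by (factor - normData) -- vshlq_s32 shifts right for negative
    // counts -- then saturate back to int16. This mirrors the C path's
    // WEBRTC_SPL_SHIFT_W32 followed by WebRtcSpl_SatW32ToW16.
    static void denormalize8(const int16_t* in_complex, int16_t* out_real,
                             int shift) {
      int32x4_t counts = vdupq_n_s32(shift);
      int16x8x2_t pair = vld2q_s16(in_complex);  // val[0] holds real parts
      int32x4_t lo = vmovl_s16(vget_low_s16(pair.val[0]));
      int32x4_t hi = vmovl_s16(vget_high_s16(pair.val[0]));
      lo = vshlq_s32(lo, counts);
      hi = vshlq_s32(hi, counts);
      vst1q_s16(out_real, vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi)));
    }
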
-__inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) { +static __inline void DenormalizeNeon(NsxInst_t* inst, int16_t* in, int factor) { int16_t* ptr_real = &inst->real[0]; int16_t* ptr_in = &in[0]; __asm__ __volatile__("vdup.32 q10, %0" :: "r"((int32_t)(factor - inst->normData)) : "q10"); - for (; ptr_real < &inst->real[inst->anaLen]; ) { + for (; ptr_real < &inst->real[inst->anaLen];) { // Loop unrolled once. Both pointers are incremented. __asm__ __volatile__( @@ -495,9 +493,9 @@ __inline void WebRtcNsx_Denormalize(NsxInst_t* inst, int16_t* in, int factor) { // For the noise supress process, synthesis, read out fully processed segment, // and update synthesis buffer. -void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, - int16_t* out_frame, - int16_t gain_factor) { +static void SynthesisUpdateNeon(NsxInst_t* inst, + int16_t* out_frame, + int16_t gain_factor) { int16_t* ptr_real = &inst->real[0]; int16_t* ptr_syn = &inst->synthesisBuffer[0]; int16_t* ptr_window = &inst->window[0]; @@ -505,7 +503,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, // synthesis __asm__ __volatile__("vdup.16 d24, %0" : : "r"(gain_factor) : "d24"); // Loop unrolled once. All pointers are incremented in the assembly code. - for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) { + for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) { __asm__ __volatile__( // Load variables. "vld1.16 d22, [%[ptr_real]]!\n\t" @@ -553,7 +551,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, int16_t* ptr_out = &out_frame[0]; ptr_syn = &inst->synthesisBuffer[0]; // read out fully processed segment - for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms]; ) { + for (; ptr_syn < &inst->synthesisBuffer[inst->blockLen10ms];) { // Loop unrolled once. Both pointers are incremented in the assembly code. __asm__ __volatile__( // out_frame[i] = inst->synthesisBuffer[i]; // Q0 @@ -575,7 +573,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, // inst->anaLen - inst->blockLen10ms); ptr_out = &inst->synthesisBuffer[0], ptr_syn = &inst->synthesisBuffer[inst->blockLen10ms]; - for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen]; ) { + for (; ptr_syn < &inst->synthesisBuffer[inst->anaLen];) { // Loop unrolled once. Both pointers are incremented in the assembly code. __asm__ __volatile__( "vld1.16 {d22, d23}, [%[ptr_syn]]!\n\t" @@ -593,7 +591,7 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, // WebRtcSpl_ZerosArrayW16(inst->synthesisBuffer // + inst->anaLen - inst->blockLen10ms, inst->blockLen10ms); __asm__ __volatile__("vdup.16 q10, %0" : : "r"(0) : "q10"); - for (; ptr_out < &inst->synthesisBuffer[inst->anaLen]; ) { + for (; ptr_out < &inst->synthesisBuffer[inst->anaLen];) { // Loop unrolled once. Pointer is incremented in the assembly code. __asm__ __volatile__( "vst1.16 {d20, d21}, [%[ptr_out]]!\n\t" @@ -606,9 +604,9 @@ void WebRtcNsx_SynthesisUpdate(NsxInst_t* inst, } // Update analysis buffer for lower band, and window data before FFT. 
-void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
-                              int16_t* out,
-                              int16_t* new_speech) {
+static void AnalysisUpdateNeon(NsxInst_t* inst,
+                               int16_t* out,
+                               int16_t* new_speech) {
   int16_t* ptr_ana = &inst->analysisBuffer[inst->blockLen10ms];
   int16_t* ptr_out = &inst->analysisBuffer[0];
 
@@ -617,7 +615,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer,
   //                      inst->analysisBuffer + inst->blockLen10ms,
   //                      inst->anaLen - inst->blockLen10ms);
-  for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms]; ) {
+  for (; ptr_out < &inst->analysisBuffer[inst->anaLen - inst->blockLen10ms];) {
     // Loop unrolled once, so both pointers are incremented by 8 twice.
     __asm__ __volatile__(
       "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
@@ -633,7 +631,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
 
   // WEBRTC_SPL_MEMCPY_W16(inst->analysisBuffer
   //    + inst->anaLen - inst->blockLen10ms, new_speech, inst->blockLen10ms);
-  for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen]; ) {
+  for (ptr_ana = new_speech; ptr_out < &inst->analysisBuffer[inst->anaLen];) {
     // Loop unrolled once, so both pointers are incremented by 8 twice.
     __asm__ __volatile__(
       "vld1.16 {d20, d21}, [%[ptr_ana]]!\n\t"
@@ -651,7 +649,7 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
   int16_t* ptr_window = &inst->window[0];
   ptr_out = &out[0];
   ptr_ana = &inst->analysisBuffer[0];
-  for (; ptr_out < &out[inst->anaLen]; ) {
+  for (; ptr_out < &out[inst->anaLen];) {
 
     // Loop unrolled once, so all pointers are incremented by 4 twice.
     __asm__ __volatile__(
@@ -683,17 +681,17 @@ void WebRtcNsx_AnalysisUpdate(NsxInst_t* inst,
 
 // Create a complex number buffer (out[]) as the intput (in[]) interleaved with
 // zeros, and normalize it.
-__inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
-                                            int16_t* in,
-                                            int16_t* out) {
+static __inline void CreateComplexBufferNeon(NsxInst_t* inst,
+                                             int16_t* in,
+                                             int16_t* out) {
   int16_t* ptr_out = &out[0];
   int16_t* ptr_in = &in[0];
 
   __asm__ __volatile__("vdup.16 d25, %0" : : "r"(0) : "d25");
   __asm__ __volatile__("vdup.16 q10, %0" : : "r"(inst->normData) : "q10");
-  for (; ptr_in < &in[inst->anaLen]; ) {
+  for (; ptr_in < &in[inst->anaLen];) {
 
-    // Loop unrolled once, so ptr_in is incremented by 8 twice, 
+    // Loop unrolled once, so ptr_in is incremented by 8 twice,
     // and ptr_out is incremented by 8 four times.
     __asm__ __volatile__(
       // out[j] = WEBRTC_SPL_LSHIFT_W16(in[i], inst->normData); // Q(normData)
@@ -724,4 +722,12 @@ __inline void WebRtcNsx_CreateComplexBuffer(NsxInst_t* inst,
     );
   }
 }
-#endif // defined(WEBRTC_ARCH_ARM_NEON) && defined(WEBRTC_ANDROID)
+
+void WebRtcNsx_InitNeon(void) {
+  WebRtcNsx_NoiseEstimation = NoiseEstimationNeon;
+  WebRtcNsx_PrepareSpectrum = PrepareSpectrumNeon;
+  WebRtcNsx_SynthesisUpdate = SynthesisUpdateNeon;
+  WebRtcNsx_AnalysisUpdate = AnalysisUpdateNeon;
+  WebRtcNsx_Denormalize = DenormalizeNeon;
+  WebRtcNsx_CreateComplexBuffer = CreateComplexBufferNeon;
+}
diff --git a/src/system_wrappers/interface/cpu_features_wrapper.h b/src/system_wrappers/interface/cpu_features_wrapper.h
index 5d8a828a7a..d949592197 100644
--- a/src/system_wrappers/interface/cpu_features_wrapper.h
+++ b/src/system_wrappers/interface/cpu_features_wrapper.h
@@ -15,18 +15,33 @@
 extern "C" {
 #endif
 
-// list of features.
+#include <stdint.h>
+
+// List of features in x86.
 typedef enum {
   kSSE2,
   kSSE3
 } CPUFeature;
 
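Unlike the x86 path, which queries one feature at a time through WebRtc_GetCPUInfo, the ARM flags declared below come back together as a bitmask, so a caller tests bits directly. A short sketch of the intended use, mirroring the check added to WebRtcNsx_InitCore earlier in this patch:

    uint64_t features = WebRtc_GetCPUFeaturesARM();
    if ((features & kCPUFeatureNEON) != 0) {
      /* install Neon-optimized function pointers */
    }
    int is_v7 = (features & kCPUFeatureARMv7) != 0;  /* other bits likewise */
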
+// List of features in ARM.
+enum {
+  kCPUFeatureARMv7       = (1 << 0),
+  kCPUFeatureVFPv3       = (1 << 1),
+  kCPUFeatureNEON        = (1 << 2),
+  kCPUFeatureLDREXSTREX  = (1 << 3)
+};
+
 typedef int (*WebRtc_CPUInfo)(CPUFeature feature);
 // returns true if the CPU supports the feature.
 extern WebRtc_CPUInfo WebRtc_GetCPUInfo;
 // No CPU feature is available => straight C path.
 extern WebRtc_CPUInfo WebRtc_GetCPUInfoNoASM;
 
+// Returns the features in an ARM device.
+// It detects the features in the hardware platform, and returns the
+// supported values in the above enum definition as a bitmask.
+extern uint64_t WebRtc_GetCPUFeaturesARM(void);
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }    // extern "C"
 #endif
diff --git a/src/system_wrappers/source/Android.mk b/src/system_wrappers/source/Android.mk
index a5d1439a2a..00a69ce009 100644
--- a/src/system_wrappers/source/Android.mk
+++ b/src/system_wrappers/source/Android.mk
@@ -25,6 +25,7 @@ LOCAL_SRC_FILES := \
     condition_variable.cc \
     cpu_dummy.cc \
     cpu_features.cc \
+    cpu_features_arm.c \
     cpu_info.cc \
     critical_section.cc \
     event.cc \
diff --git a/src/system_wrappers/source/cpu_features_arm.c b/src/system_wrappers/source/cpu_features_arm.c
new file mode 100644
index 0000000000..106511852c
--- /dev/null
+++ b/src/system_wrappers/source/cpu_features_arm.c
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// This file is derived from Android's NDK package r7, located at
+// <ndk-root>/sources/android/cpufeatures/ (downloadable from
+// http://developer.android.com/sdk/ndk/index.html).
+
+#include "cpu_features_wrapper.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+// Define CPU family.
+typedef enum {
+  CPU_FAMILY_UNKNOWN = 0,
+  CPU_FAMILY_ARM,
+  CPU_FAMILY_X86,
+  CPU_FAMILY_MAX  // Do not remove.
+} CpuFamily;
+
+static pthread_once_t g_once = PTHREAD_ONCE_INIT;
+static CpuFamily g_cpuFamily;
+static uint64_t g_cpuFeatures;
+static int g_cpuCount;
+
+static const int cpufeatures_debug = 0;
+
+#ifdef __arm__
+# define DEFAULT_CPU_FAMILY CPU_FAMILY_ARM
+#elif defined __i386__
+# define DEFAULT_CPU_FAMILY CPU_FAMILY_X86
+#else
+# define DEFAULT_CPU_FAMILY CPU_FAMILY_UNKNOWN
+#endif
+
+#define D(...) \
+  do { \
+    if (cpufeatures_debug) { \
+      printf(__VA_ARGS__); fflush(stdout); \
+    } \
+  } while (0)
+
+/* Read the content of /proc/cpuinfo into a user-provided buffer.
+ * Return the length of the data, or -1 on error. Does *not*
+ * zero-terminate the content. Will not read more than 'buffsize' bytes.
+ */
+static int read_file(const char* pathname, char* buffer, size_t buffsize) {
+  int fd, len;
+
+  fd = open(pathname, O_RDONLY);
+  if (fd < 0)
+    return -1;
+
+  do {
+    len = read(fd, buffer, buffsize);
+  } while (len < 0 && errno == EINTR);
+
+  close(fd);
+
+  return len;
+}
+
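The helpers that follow are meant to be composed; as a sketch of the flow cpuInit() uses further down (error handling trimmed, buffer size matching cpuInit's):

    char buf[4096];
    int len = read_file("/proc/cpuinfo", buf, sizeof(buf));
    if (len > 0) {
      char* features = extract_cpuinfo_field(buf, len, "Features");
      if (features != NULL) {
        int has_neon = has_list_item(features, "neon");  /* 1 if present */
        free(features);  /* extract_cpuinfo_field() heap-allocates */
      }
    }
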
+/* Extract the content of the first occurrence of a given field in
+ * the content of /proc/cpuinfo and return it as a heap-allocated
+ * string that must be freed by the caller.
+ *
+ * Return NULL if not found.
+ */
+static char* extract_cpuinfo_field(char* buffer, int buflen, const char* field) {
+  int fieldlen = strlen(field);
+  char* bufend = buffer + buflen;
+  char* result = NULL;
+  int len;
+  const char* p, *q;
+
+  /* Look for the first field occurrence, and ensure it starts the line.
+   */
+  p = buffer;
+  bufend = buffer + buflen;
+  for (;;) {
+    p = memmem(p, bufend - p, field, fieldlen);
+    if (p == NULL)
+      goto EXIT;
+
+    if (p == buffer || p[-1] == '\n')
+      break;
+
+    p += fieldlen;
+  }
+
+  /* Skip to the first colon followed by a space */
+  p += fieldlen;
+  p = memchr(p, ':', bufend - p);
+  if (p == NULL || p[1] != ' ')
+    goto EXIT;
+
+  /* Find the end of the line */
+  p += 2;
+  q = memchr(p, '\n', bufend - p);
+  if (q == NULL)
+    q = bufend;
+
+  /* Copy the line into a heap-allocated buffer */
+  len = q - p;
+  result = malloc(len + 1);
+  if (result == NULL)
+    goto EXIT;
+
+  memcpy(result, p, len);
+  result[len] = '\0';
+
+EXIT:
+  return result;
+}
+
+/* Count the number of occurrences of a given field prefix in /proc/cpuinfo.
+ */
+static int count_cpuinfo_field(char* buffer, int buflen, const char* field) {
+  int fieldlen = strlen(field);
+  const char* p = buffer;
+  const char* bufend = buffer + buflen;
+  int count = 0;
+
+  for (;;) {
+    const char* q;
+
+    p = memmem(p, bufend - p, field, fieldlen);
+    if (p == NULL)
+      break;
+
+    /* Ensure that the field is at the start of a line */
+    if (p > buffer && p[-1] != '\n') {
+      p += fieldlen;
+      continue;
+    }
+
+    /* skip any whitespace */
+    q = p + fieldlen;
+    while (q < bufend && (*q == ' ' || *q == '\t'))
+      q++;
+
+    /* we must have a colon now */
+    if (q < bufend && *q == ':') {
+      count += 1;
+      q++;
+    }
+    p = q;
+  }
+
+  return count;
+}
+
+/* Like strlen(), but for constant string literals */
+#define STRLEN_CONST(x) (sizeof(x) - 1)
+
+/* Checks that a space-separated list of items contains one given 'item'.
+ * Returns 1 if found, 0 otherwise.
+ */
+static int has_list_item(const char* list, const char* item) {
+  const char* p = list;
+  int itemlen = strlen(item);
+
+  if (list == NULL)
+    return 0;
+
+  while (*p) {
+    const char* q;
+
+    /* skip spaces */
+    while (*p == ' ' || *p == '\t')
+      p++;
+
+    /* find end of current list item */
+    q = p;
+    while (*q && *q != ' ' && *q != '\t')
+      q++;
+
+    if (itemlen == q - p && !memcmp(p, item, itemlen))
+      return 1;
+
+    /* skip to next item */
+    p = q;
+  }
+  return 0;
+}
+
+static void cpuInit(void) {
+  char cpuinfo[4096];
+  int cpuinfo_len;
+
+  g_cpuFamily = DEFAULT_CPU_FAMILY;
+  g_cpuFeatures = 0;
+  g_cpuCount = 1;
+
+  cpuinfo_len = read_file("/proc/cpuinfo", cpuinfo, sizeof cpuinfo);
+  D("cpuinfo_len is (%d):\n%.*s\n", cpuinfo_len,
+    cpuinfo_len >= 0 ? cpuinfo_len : 0, cpuinfo);
+
+  if (cpuinfo_len < 0) { /* should not happen */
+    return;
+  }
+
+  /* Count the CPU cores; the value may be 0 for single-core CPUs */
+  g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "processor");
+  if (g_cpuCount == 0) {
+    g_cpuCount = count_cpuinfo_field(cpuinfo, cpuinfo_len, "Processor");
+    if (g_cpuCount == 0) {
+      g_cpuCount = 1;
+    }
+  }
+
+  D("found cpuCount = %d\n", g_cpuCount);
+
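The ARM-specific parsing below keys on three /proc/cpuinfo fields. For orientation, representative lines from an ARMv7 device look roughly like the following (exact values vary by kernel and SoC; shown for illustration only):

    Processor       : ARMv7 Processor rev 1 (v7l)
    Features        : swp half thumb fastmult vfp edsp neon vfpv3
    CPU architecture: 7
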
+#ifdef __arm__
+  {
+    /* Extract architecture from the "CPU Architecture" field.
+     * The list is well-known, unlike the output of
+     * the 'Processor' field, which can vary greatly.
+     *
+     * See the definition of the 'proc_arch' array in
+     * $KERNEL/arch/arm/kernel/setup.c and the 'c_show' function in
+     * the same file.
+     */
+    char* cpuArch = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+                                          "CPU architecture");
+
+    if (cpuArch != NULL) {
+      char* end;
+      long archNumber;
+      int hasARMv7 = 0;
+
+      D("found cpuArch = '%s'\n", cpuArch);
+
+      /* read the initial decimal number, ignore the rest */
+      archNumber = strtol(cpuArch, &end, 10);
+
+      /* Here we assume that ARMv8 will be upwards compatible with v7
+       * in the future. Unfortunately, there is no 'Features' field to
+       * indicate that Thumb-2 is supported.
+       */
+      if (end > cpuArch && archNumber >= 7) {
+        hasARMv7 = 1;
+      }
+
+      /* Unfortunately, it seems that certain ARMv6-based CPUs
+       * report an incorrect architecture number of 7!
+       *
+       * We try to correct this by looking at the 'elf_format'
+       * field reported in the 'Processor' field, which is of the
+       * form "(v7l)" for an ARMv7-based CPU, and "(v6l)" for
+       * an ARMv6 one.
+       */
+      if (hasARMv7) {
+        char* cpuProc = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+                                              "Processor");
+        if (cpuProc != NULL) {
+          D("found cpuProc = '%s'\n", cpuProc);
+          if (has_list_item(cpuProc, "(v6l)")) {
+            D("CPU processor and architecture mismatch!!\n");
+            hasARMv7 = 0;
+          }
+          free(cpuProc);
+        }
+      }
+
+      if (hasARMv7) {
+        g_cpuFeatures |= kCPUFeatureARMv7;
+      }
+
+      /* The LDREX / STREX instructions are available from ARMv6 */
+      if (archNumber >= 6) {
+        g_cpuFeatures |= kCPUFeatureLDREXSTREX;
+      }
+
+      free(cpuArch);
+    }
+
+    /* Extract the list of CPU features from the 'Features' field */
+    char* cpuFeatures = extract_cpuinfo_field(cpuinfo, cpuinfo_len,
+                                              "Features");
+
+    if (cpuFeatures != NULL) {
+      D("found cpuFeatures = '%s'\n", cpuFeatures);
+
+      if (has_list_item(cpuFeatures, "vfpv3"))
+        g_cpuFeatures |= kCPUFeatureVFPv3;
+      else if (has_list_item(cpuFeatures, "vfpv3d16"))
+        g_cpuFeatures |= kCPUFeatureVFPv3;
+
+      if (has_list_item(cpuFeatures, "neon")) {
+        /* Note: Certain kernels only report neon but not vfpv3
+         * in their features list. However, ARM mandates that if
+         * Neon is implemented, VFPv3 must be as well, so always
+         * set the flag.
+         */
+        g_cpuFeatures |= kCPUFeatureNEON |
+                         kCPUFeatureVFPv3;
+      }
+      free(cpuFeatures);
+    }
+  }
+#endif  // __arm__
+
+#ifdef __i386__
+  g_cpuFamily = CPU_FAMILY_X86;
+#endif
+}
+
+uint64_t WebRtc_GetCPUFeaturesARM(void) {
+  pthread_once(&g_once, cpuInit);
+  return g_cpuFeatures;
+}
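Taken together, a consumer of this detection path looks like the following sketch (the module and function names here are illustrative; the real binding for the NS module is the WebRtcNsx_InitCore hunk earlier in this patch). pthread_once() guarantees cpuInit() runs exactly once, so repeated calls amount to a cheap read of the cached bitmask:

    #include "cpu_features_wrapper.h"

    void MyModuleInit(void) {  /* hypothetical module init */
    #ifdef WEBRTC_DETECT_ARM_NEON
      if (WebRtc_GetCPUFeaturesARM() & kCPUFeatureNEON) {
        /* bind Neon-optimized function pointers here */
      }
    #endif
    }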