Optimized WebRtcIsacfix_AutocorrFix() function for iSAC fix.

(1) For generic platforms, code was changed to remove the shifting within loops. Basically, it's just change a loop from for() { sum += (a*b) >> scale; } to: for() { sum += (a*b); } sum >> scale; Type int64_t is used for sum to make sure no information is not lost. Performance is about the same as before the change. Bits are not exact, although in theory the change should have preserved more information. The purpose of this change is to make the generic code and ARM code bit exact, simpify the code, while keep the speech quality at least not lower. (Some speech tests might be good.) (2) For ARM platform, used assembly to optimize the performance. iSAC runs faster with this change. (Reduced run time of an offline file test from 10.16ms to 8.81ms) Review URL: http://webrtc-codereview.appspot.com/267014 git-svn-id: http://webrtc.googlecode.com/svn/trunk@972 4adac7df-926f-26a2-2b94-8c16560cd09d
2011-11-18 02:50:55 +00:00
parent f556b9d1f4
commit fa9b016fb5
3 changed files with 210 additions and 40 deletions
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/Android.mk
@ -42,6 +42,14 @@ LOCAL_SRC_FILES := \
    spectrum_ar_model_tables.c \
    transform.c

+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+LOCAL_SRC_FILES += \
+    filters_neon.c
+#    lattice_neon.c
+LOCAL_CFLAGS += \
+    $(MY_ARM_CFLAGS_NEON)
+endif
+
 # Flags passed to both C and C++ files.
 LOCAL_CFLAGS := \
    $(MY_WEBRTC_COMMON_DEFS)
@ -49,7 +57,7 @@ LOCAL_CFLAGS := \
 LOCAL_C_INCLUDES := \
    $(LOCAL_PATH)/../interface \
    $(LOCAL_PATH)/../../../../../.. \
-    $(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include 
+    $(LOCAL_PATH)/../../../../../../common_audio/signal_processing/include

 LOCAL_SHARED_LIBRARIES := \
    libcutils \
@ -61,8 +69,8 @@ include external/stlport/libstlport.mk
 endif
 include $(BUILD_STATIC_LIBRARY)

-
 # isac test app
+
 include $(CLEAR_VARS)

 LOCAL_MODULE_TAGS := tests
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters.c
@ -22,50 +22,44 @@
 #include "lpc_masking_model.h"
 #include "codec.h"

-
-/* Autocorrelation function in fixed point. NOTE! Different from SPLIB-version in how it scales the signal. */
+#if !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))
+// Autocorrelation function in fixed point.
+// NOTE! Different from SPLIB-version in how it scales the signal.
 int WebRtcIsacfix_AutocorrFix(
-    WebRtc_Word32          *r,
-    const WebRtc_Word16 *x,
-    WebRtc_Word16          N,
-    WebRtc_Word16          order,
-    WebRtc_Word16          *scale)
-{
-  int  j, i;
-  WebRtc_Word16  scaling;
-  WebRtc_Word32 sum, prod, newsum;
-  G_CONST WebRtc_Word16    *xptr1;
-  G_CONST WebRtc_Word16    *xptr2;
+    WebRtc_Word32* __restrict__ r,
+    const WebRtc_Word16* __restrict__ x,
+    WebRtc_Word16 N,
+    WebRtc_Word16 order,
+    WebRtc_Word16* __restrict__ scale) {

-  sum=0;
-  scaling=0;
-  /* Calculate r[0] and how much scaling is needed */
-  for (i=0; i < N; i++) {
-    prod = WEBRTC_SPL_MUL_16_16_RSFT(x[i],x[i],scaling);
-    newsum = sum+prod;
-    /* If sum gets less than 0 we have overflow and need to scale the signal */
-    if(newsum<0) {
-      scaling++;
-      sum=WEBRTC_SPL_RSHIFT_W32(sum, 1);
-      prod=WEBRTC_SPL_RSHIFT_W32(prod, 1);
-    }
-    sum += prod;
+  int i = 0;
+  int j = 0;
+  int16_t scaling = 0;
+  int32_t sum = 0;
+  uint32_t temp = 0;
+  int64_t prod = 0;
+
+  // Calculate r[0].
+  for (i = 0; i < N; i++) {
+    prod += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
  }
-  r[0]=sum;

-  /* Perform the actual correlation calculation */
-  for (i = 1; i < order + 1; i++)
-  {
-    int loops=(N-i);
-    sum = 0;
-    xptr1=(G_CONST WebRtc_Word16 *)x;
-    xptr2=(G_CONST WebRtc_Word16 *)&x[i];
+  // Calculate scaling (the value of shifting).
+  temp = (uint32_t)(prod >> 31);
+  if(temp == 0) {
+    scaling = 0;
+  } else {
+    scaling = 32 - WebRtcSpl_NormU32(temp);
+  }
+  r[0] = (int32_t)(prod >> scaling);

-    for (j = loops;j > 0; j--)
-    {
-      sum += WEBRTC_SPL_MUL_16_16_RSFT(*xptr1++,*xptr2++,scaling);
+  // Perform the actual correlation calculation.
+  for (i = 1; i < order + 1; i++) {
+    prod = 0;
+    for (j = 0; j < N - i; j++) {
+      prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]);
    }
-
+    sum = (int32_t)(prod >> scaling);
    r[i] = sum;
  }

@ -73,6 +67,7 @@ int WebRtcIsacfix_AutocorrFix(

  return(order + 1);
 }
+#endif // !(defined(WEBRTC_ANDROID) && defined(WEBRTC_ARCH_ARM_NEON))

 static const WebRtc_Word32 kApUpperQ15[ALLPASSSECTIONS] = { 1137, 12537 };
 static const WebRtc_Word32 kApLowerQ15[ALLPASSSECTIONS] = { 5059, 24379 };
--- a/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
+++ b/src/modules/audio_coding/codecs/iSAC/fix/source/filters_neon.c
@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*
+ * filters_neon.c
+ *
+ * This file contains function WebRtcIsacfix_AutocorrFix, optimized for
+ * ARM Neon platform.
+ *
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "codec.h"
+
+// Autocorrelation function in fixed point.
+// NOTE! Different from SPLIB-version in how it scales the signal.
+int WebRtcIsacfix_AutocorrFix(
+    WebRtc_Word32* __restrict__ r,
+    const WebRtc_Word16* __restrict__ x,
+    WebRtc_Word16 N,
+    WebRtc_Word16 order,
+    WebRtc_Word16* __restrict__ scale) {
+
+  // The 1st for loop assumed N % 4 == 0.
+  assert(N % 4 == 0);
+
+  int i = 0;
+  int zeros_low = 0;
+  int zeros_high = 0;
+  int16_t scaling = 0;
+  int32_t sum = 0;
+
+  // Step 1, calculate r[0] and how much scaling is needed.
+
+  int16x4_t reg16x4;
+  int64x1_t reg64x1a;
+  int64x1_t reg64x1b;
+  int32x4_t reg32x4;
+  int64x2_t reg64x2 = vdupq_n_s64(0); // zeros
+
+  // Loop over the samples and do:
+  // sum += WEBRTC_SPL_MUL_16_16(x[i], x[i]);
+  for (i = 0; i < N; i += 4) {
+    reg16x4 = vld1_s16(&x[i]);
+    reg32x4 = vmull_s16(reg16x4, reg16x4);
+    reg64x2 = vpadalq_s32(reg64x2, reg32x4);
+  }
+  reg64x1a = vget_low_s64(reg64x2);
+  reg64x1b = vget_high_s64(reg64x2);
+  reg64x1a = vadd_s64(reg64x1a, reg64x1b);
+
+  // Calculate the value of shifting (scaling).
+  __asm__ __volatile__(
+    "vmov %[z_l], %[z_h], %P[reg]\n\t"
+    "clz %[z_l], %[z_l]\n\t"
+    "clz %[z_h], %[z_h]\n\t"
+    :[z_l]"+r"(zeros_low),
+     [z_h]"+r"(zeros_high)
+    :[reg]"w"(reg64x1a)
+  );
+  if (zeros_high != 32) {
+    scaling = (32 - zeros_high + 1);
+  } else if (zeros_low == 0) {
+    scaling = 1;
+  }
+  reg64x1b = -scaling;
+  reg64x1a = vshl_s64(reg64x1a, reg64x1b);
+
+  // Record the result.
+  r[0] = (int32_t)vget_lane_s64(reg64x1a, 0);
+
+
+  // Step 2, perform the actual correlation calculation.
+
+  /* Original C code (for the rest of the function):
+  for (i = 1; i < order + 1; i++)  {
+    prod = 0;
+    for (j = 0; j < N - i; j++) {
+      prod += WEBRTC_SPL_MUL_16_16(x[j], x[i + j]);
+    }
+    sum = (int32_t)(prod >> scaling);
+    r[i] = sum;
+  }
+  */
+
+  for (i = 1; i < order + 1; i++) {
+    int32_t prod_lower = 0;
+    int32_t prod_upper = 0;
+    int16_t* ptr0 = &x[0];
+    int16_t* ptr1 = &x[i];
+    int32_t tmp = 0;
+
+    // Initialize the sum (q9) to zero.
+    __asm__ __volatile__("vmov.i32 q9, #0\n\t":::"q9");
+
+    // Calculate the major block of the samples (a multiple of 8).
+    for (; ptr0 < &x[N - i - 7];) {
+      __asm__ __volatile__(
+        "vld1.16 {d20, d21}, [%[ptr0]]!\n\t"
+        "vld1.16 {d22, d23}, [%[ptr1]]!\n\t"
+        "vmull.s16 q12, d20, d22\n\t"
+        "vmull.s16 q13, d21, d23\n\t"
+        "vpadal.s32 q9, q12\n\t"
+        "vpadal.s32 q9, q13\n\t"
+
+        // Specify constraints.
+        :[ptr0]"+r"(ptr0),
+        [ptr1]"+r"(ptr1)
+        :
+        :"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27"
+      );
+    }
+
+    // Calculate the rest of the samples.
+    for (; ptr0 < &x[N - i]; ptr0++, ptr1++) {
+      __asm__ __volatile__(
+        "smulbb %[tmp], %[ptr0], %[ptr1]\n\t"
+        "adds %[prod_lower], %[prod_lower], %[tmp]\n\t"
+        "adc %[prod_upper], %[prod_upper], %[tmp], asr #31\n\t"
+
+        // Specify constraints.
+        :[prod_lower]"+r"(prod_lower),
+        [prod_upper]"+r"(prod_upper),
+        [tmp]"+r"(tmp)
+        :[ptr0]"r"(*ptr0),
+        [ptr1]"r"(*ptr1)
+      );
+    }
+
+    // Sum the results up, and do shift.
+    __asm__ __volatile__(
+      "vadd.i64 d18, d19\n\t"
+      "vmov.32 d17[0], %[prod_lower]\n\t"
+      "vmov.32 d17[1], %[prod_upper]\n\t"
+      "vadd.i64 d17, d18\n\t"
+      "mov %[tmp], %[scaling], asr #31\n\t"
+      "vmov.32 d16, %[scaling], %[tmp]\n\t"
+      "vshl.s64 d17, d16\n\t"
+      "vmov.32 %[sum], d17[0]\n\t"
+
+      // Specify constraints.
+      :[sum]"=r"(sum),
+      [tmp]"+r"(tmp)
+      :[prod_upper]"r"(prod_upper),
+      [prod_lower]"r"(prod_lower),
+      [scaling]"r"(-scaling)
+      :"d16", "d17", "d18", "d19"
+    );
+
+    // Record the result.
+    r[i] = sum;
+  }
+
+  // Record the result.
+  *scale = scaling;
+
+  return(order + 1);
+}