Fix bug in WebRtcIsacfix_FilterMaLoopNeon.

Pass content_browsertests in Chromium. Performance test result (lower is better): C version: 100% old intrinsics Neon version (with bug): 16.5% new intrinsics Neon version: 18.0% asm Neon version: 23.3% BUG=4002 R=andrew@webrtc.org, jridges@masque.com Change-Id: Ia0a96ac237216b635fc528f67d39319cdf246281 Review URL: https://webrtc-codereview.appspot.com/46739004 Cr-Commit-Position: refs/heads/master@{#8907}
2015-04-01 17:43:00 +08:00
parent 9cb1f3002f
commit f809b9b38d
1 changed files with 31 additions and 9 deletions
--- a/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c
+++ b/webrtc/modules/audio_coding/codecs/isac/fix/source/lattice_neon.c
@ -17,7 +17,7 @@
 // filter routine for iSAC codec, optimized for ARM Neon platform.
 // It does:
 //  for 0 <= n < HALF_SUBFRAMELEN - 1:
-//    *ptr2 = input2 * (*ptr2) + input0 * (*ptr0));
+//    *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
 //    *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
 // Output is not bit-exact with the reference C code, due to the replacement
 // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
@ -41,6 +41,7 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
  int32x4_t ptr0va, ptr1va, ptr2va;
  int32x4_t ptr0vb, ptr1vb, ptr2vb;

+  int64x2_t tmp2al_low, tmp2al_high, tmp2bl_low, tmp2bl_high;
  // Unroll to process 8 samples at once.
  for (n = 0; n < loop; n++) {
    ptr0va = vld1q_s32(ptr0);
@ -61,12 +62,25 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
    tmp2b = vaddq_s32(tmp0b, ptr2vb);
-    tmp2a = vshlq_n_s32(tmp2a, 15);
-    tmp2b = vshlq_n_s32(tmp2b, 15);

    // Calculate *ptr2 = input2 * tmp2.
-    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
-    ptr2vb = vqrdmulhq_s32(tmp2b, input2_v);
+    tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
+#if defined(WEBRTC_ARCH_ARM64)
+    tmp2al_high = vmull_high_s32(tmp2a, input2_v);
+#else
+    tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
+#endif
+    ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
+                          vrshrn_n_s64(tmp2al_high, 16));
+
+    tmp2bl_low = vmull_s32(vget_low_s32(tmp2b), vget_low_s32(input2_v));
+#if defined(WEBRTC_ARCH_ARM64)
+    tmp2bl_high = vmull_high_s32(tmp2b, input2_v);
+#else
+    tmp2bl_high = vmull_s32(vget_high_s32(tmp2b), vget_high_s32(input2_v));
+#endif
+    ptr2vb = vcombine_s32(vrshrn_n_s64(tmp2bl_low, 16),
+                          vrshrn_n_s64(tmp2bl_high, 16));

    vst1q_s32(ptr2, ptr2va);
    vst1q_s32(ptr2 + 4, ptr2vb);
@ -99,10 +113,17 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
-    tmp2a = vshlq_n_s32(tmp2a, 15);

    // Calculate *ptr2 = input2 * tmp2.
-    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
+    tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
+
+#if defined(WEBRTC_ARCH_ARM64)
+    tmp2al_high = vmull_high_s32(tmp2a, input2_v);
+#else
+    tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
+#endif
+    ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
+                          vrshrn_n_s64(tmp2al_high, 16));

    vst1q_s32(ptr2, ptr2va);
    ptr2 += 4;
@ -121,6 +142,7 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
  if (loop_tail & 0x2) {
    int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
    int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
+    int64x2_t tmp2l_tail;
    ptr0v_tail = vld1_s32(ptr0);
    ptr2v_tail = vld1_s32(ptr2);
    ptr0 += 2;
@ -133,10 +155,10 @@ void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
-    tmp2_tail = vshl_n_s32(tmp2_tail, 15);

    // Calculate *ptr2 = input2 * tmp2.
-    ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v));
+    tmp2l_tail = vmull_s32(tmp2_tail, vget_low_s32(input2_v));
+    ptr2v_tail = vrshrn_n_s64(tmp2l_tail, 16);

    vst1_s32(ptr2, ptr2v_tail);
    ptr2 += 2;