add WebRtcIsacfix_AutocorrNeon's intrinsics version

The modification only uses the unique part of the
 WebRtcIsacfix_AutocorrC function. Pass FiltersTest.AutocorrFixTest test
 on both ARMv7 and ARM64, and the single function performance is similar
 with original assembly version on different platforms. If not
 specified, the code is compiled by GCC 4.6. The result is the "X
 version / C version" ratio, and the less is better.

| run 100k times             | cortex-a7 | cortex-a15 |
| use C as the base on each  |  (1.2Ghz) |   (1.7Ghz) |
| CPU target                 |           |            |
|----------------------------+-----------+------------|
| Neon asm                   |       24% |        23% |
| Neon intrinsics (GCC 4.6)  |       33% |        32% |
| Neon intrinsics (GCC 4.8)  |       27% |        27% |

BUG=3850
R=andrew@webrtc.org, jridges@masque.com

Change-Id: Id6cd0671502fadbebd10b1f5493f5b16c988286f

Review URL: https://webrtc-codereview.appspot.com/27999004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7802 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org
2014-12-03 21:58:18 +00:00
parent 8dc21dc238
commit 3a52458237

View File

@ -0,0 +1,114 @@
/*
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <assert.h>
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
// Autocorrelation function in fixed point.
// NOTE! Different from SPLIB-version in how it scales the signal.
int WebRtcIsacfix_AutocorrNeon(int32_t* __restrict r,
const int16_t* x,
int16_t n,
int16_t order,
int16_t* __restrict scale) {
int i = 0;
int16_t scaling = 0;
uint32_t temp = 0;
int64_t prod = 0;
int64_t prod_tail = 0;
assert(n % 4 == 0);
assert(n >= 8);
// Calculate r[0].
int16x4_t x0_v;
int32x4_t tmpa0_v;
int64x2_t tmpb_v;
tmpb_v = vdupq_n_s64(0);
const int16_t* x_start = x;
const int16_t* x_end0 = x_start + n;
while (x_start < x_end0) {
x0_v = vld1_s16(x_start);
tmpa0_v = vmull_s16(x0_v, x0_v);
tmpb_v = vpadalq_s32(tmpb_v, tmpa0_v);
x_start += 4;
}
#ifdef WEBRTC_ARCH_ARM64
prod = vaddvq_s64(tmpb_v);
#else
prod = vget_lane_s64(vadd_s64(vget_low_s64(tmpb_v), vget_high_s64(tmpb_v)),
0);
#endif
// Calculate scaling (the value of shifting).
temp = (uint32_t)(prod >> 31);
scaling = temp ? 32 - WebRtcSpl_NormU32(temp) : 0;
r[0] = (int32_t)(prod >> scaling);
int16x8_t x1_v;
int16x8_t y_v;
int32x4_t tmpa1_v;
// Perform the actual correlation calculation.
for (i = 1; i < order + 1; i++) {
tmpb_v = vdupq_n_s64(0);
int rest = (n - i) % 8;
x_start = x;
x_end0 = x_start + n - i - rest;
const int16_t* y_start = x_start + i;
while (x_start < x_end0) {
x1_v = vld1q_s16(x_start);
y_v = vld1q_s16(y_start);
tmpa0_v = vmull_s16(vget_low_s16(x1_v), vget_low_s16(y_v));
#ifdef WEBRTC_ARCH_ARM64
tmpa1_v = vmull_high_s16(x1_v, y_v);
#else
tmpa1_v = vmull_s16(vget_high_s16(x1_v), vget_high_s16(y_v));
#endif
tmpb_v = vpadalq_s32(tmpb_v, tmpa0_v);
tmpb_v = vpadalq_s32(tmpb_v, tmpa1_v);
x_start += 8;
y_start += 8;
}
// The remaining calculation.
const int16_t* x_end1 = x + n - i;
if (rest >= 4) {
int16x4_t x2_v = vld1_s16(x_start);
int16x4_t y2_v = vld1_s16(y_start);
tmpa0_v = vmull_s16(x2_v, y2_v);
tmpb_v = vpadalq_s32(tmpb_v, tmpa0_v);
x_start += 4;
y_start += 4;
}
#ifdef WEBRTC_ARCH_ARM64
prod = vaddvq_s64(tmpb_v);
#else
prod = vget_lane_s64(vadd_s64(vget_low_s64(tmpb_v), vget_high_s64(tmpb_v)),
0);
#endif
prod_tail = 0;
while (x_start < x_end1) {
prod_tail += WEBRTC_SPL_MUL_16_16(*x_start, *y_start);
++x_start;
++y_start;
}
r[i] = (int32_t)((prod + prod_tail) >> scaling);
}
*scale = scaling;
return order + 1;
}