Add an intrinsics version of WebRtcIsacfix_AutocorrNeon
The modification only uses the unique part of the WebRtcIsacfix_AutocorrC function. It passes the FiltersTest.AutocorrFixTest test on both ARMv7 and ARM64, and the single-function performance is similar to the original assembly version on different platforms.

Unless otherwise specified, the code is compiled with GCC 4.6. The result is the "X version / C version" ratio; lower is better.

| run 100k times (C is the base on each CPU target) | cortex-a7 (1.2Ghz) | cortex-a15 (1.7Ghz) |
|----------------------------|-----------|------------|
| Neon asm                   | 24%       | 23%        |
| Neon intrinsics (GCC 4.6)  | 33%       | 32%        |
| Neon intrinsics (GCC 4.8)  | 27%       | 27%        |

BUG=3850
R=andrew@webrtc.org, jridges@masque.com

Change-Id: Id6cd0671502fadbebd10b1f5493f5b16c988286f
Review URL: https://webrtc-codereview.appspot.com/27999004

Patch from Zhongwei Yao <zhongwei.yao@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@7802 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "webrtc/modules/audio_coding/codecs/isac/fix/source/codec.h"
|
||||
|
||||
// Adds the two 64-bit lanes of |v| together and returns the scalar sum.
static inline int64_t SumLanesS64(int64x2_t v) {
#ifdef WEBRTC_ARCH_ARM64
  return vaddvq_s64(v);
#else
  return vget_lane_s64(vadd_s64(vget_low_s64(v), vget_high_s64(v)), 0);
#endif
}

// Fixed-point autocorrelation of |x| for lags 0..|order|, NEON intrinsics
// version.
// NOTE! Different from SPLIB-version in how it scales the signal.
//
// Input:
//   - x      : samples to correlate; x[0..n-1] are read.
//   - n      : number of samples; asserted to be a multiple of 4 and >= 8.
//   - order  : highest lag to compute.
// Output:
//   - r      : r[0..order] receive the correlation sums, each shifted right
//              by |*scale| bits.
//   - scale  : the right shift that was applied, derived from the lag-0 sum.
// Return value: order + 1 (the number of values written to |r|).
int WebRtcIsacfix_AutocorrNeon(int32_t* __restrict r,
                               const int16_t* x,
                               int16_t n,
                               int16_t order,
                               int16_t* __restrict scale) {
  assert(n % 4 == 0);
  assert(n >= 8);

  // Lag 0: sum of squares, four samples per iteration. Each pair of 32-bit
  // products is widened and accumulated into a 64-bit lane.
  int64x2_t acc = vdupq_n_s64(0);
  const int16_t* const energy_end = x + n;
  for (const int16_t* p = x; p < energy_end; p += 4) {
    const int16x4_t samples = vld1_s16(p);
    acc = vpadalq_s32(acc, vmull_s16(samples, samples));
  }
  const int64_t energy = SumLanesS64(acc);

  // Derive the common right shift from whatever part of the lag-0 sum lies
  // at or above bit 31. Presumably WebRtcSpl_NormU32 returns the number of
  // leading zero bits, which would make r[0] fit in 31 bits -- confirm
  // against its declaration.
  const uint32_t high_bits = (uint32_t)(energy >> 31);
  int16_t shift = 0;
  if (high_bits != 0) {
    shift = (int16_t)(32 - WebRtcSpl_NormU32(high_bits));
  }
  r[0] = (int32_t)(energy >> shift);

  // Lags 1..order: correlate x[k] with x[k + lag] over the overlapping part.
  for (int lag = 1; lag <= order; ++lag) {
    const int overlap = n - lag;       // Number of product terms.
    const int leftover = overlap % 8;  // Terms not covered by 8-wide steps.
    const int16_t* const wide_end = x + overlap - leftover;
    const int16_t* const lag_end = x + overlap;
    const int16_t* a = x;
    const int16_t* b = x + lag;

    // Eight products per iteration.
    acc = vdupq_n_s64(0);
    for (; a < wide_end; a += 8, b += 8) {
      const int16x8_t va = vld1q_s16(a);
      const int16x8_t vb = vld1q_s16(b);
      const int32x4_t lo = vmull_s16(vget_low_s16(va), vget_low_s16(vb));
#ifdef WEBRTC_ARCH_ARM64
      const int32x4_t hi = vmull_high_s16(va, vb);
#else
      const int32x4_t hi = vmull_s16(vget_high_s16(va), vget_high_s16(vb));
#endif
      acc = vpadalq_s32(acc, lo);
      acc = vpadalq_s32(acc, hi);
    }

    // One more group of four products if at least four terms remain.
    if (leftover >= 4) {
      acc = vpadalq_s32(acc, vmull_s16(vld1_s16(a), vld1_s16(b)));
      a += 4;
      b += 4;
    }

    // Fold the vector lanes, then add the last (at most three) scalar terms.
    int64_t sum = SumLanesS64(acc);
    for (; a < lag_end; ++a, ++b) {
      sum += WEBRTC_SPL_MUL_16_16(*a, *b);
    }

    r[lag] = (int32_t)(sum >> shift);
  }

  *scale = shift;

  return order + 1;
}
|
||||
|
||||
Reference in New Issue
Block a user