replace inline assembly WebRtcNsx_PrepareSpectrumNeon by intrinsics.

The modification only uses the unique part of the spectrum (as is done for the C and asm code). It passes
byte to byte conformance test, and the single function performance
(if not specified, the code is compiled by GCC 4.6) on different
platforms:

| run 100k times             | cortex-a7 | cortex-a9 | cortex-a15 |
| use C as the base on each  |  (1.2Ghz) |  (1.0Ghz) |   (1.7Ghz) |
| CPU target                 |           |           |            |
|----------------------------+-----------+-----------+------------|
| C                          |      100% |      100% |       100% |
| Neon asm                   |       18% |       14% |        19% |
| Neon inline asm            |       31% |       25% |        27% |
| Neon intrinsic (GCC 4.6)   |       33% |       27% |        42% |
| Neon intrinscis (GCC 4.8)  |       17% |       14% |        19% |
| Neon intrinsics (LLVM 3.3) |       15% |       13% |        18% |

BUG=
R=andrew@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/13739004

Patch from Joe Yu <joe.yu@arm.com>.

git-svn-id: http://webrtc.googlecode.com/svn/trunk@6920 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
andrew@webrtc.org
2014-08-18 17:46:45 +00:00
parent f86b262588
commit d798095a37
2 changed files with 52 additions and 98 deletions

View File

@ -24,3 +24,4 @@ Mozilla Foundation
Opera Software ASA Opera Software ASA
Vonage Holdings Corp. Vonage Holdings Corp.
Temasys Communications Temasys Communications
ARM Holdings

View File

@ -363,6 +363,8 @@ void WebRtcNsx_NoiseEstimationNeon(NsxInst_t* inst,
// Filter the data in the frequency domain, and create spectrum. // Filter the data in the frequency domain, and create spectrum.
void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) { void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
assert(inst->magnLen % 8 == 1);
assert(inst->anaLen2 % 16 == 0);
// (1) Filtering. // (1) Filtering.
@ -374,49 +376,38 @@ void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
// (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages) // (int16_t)(inst->noiseSupFilter[i]), 14); // Q(normData-stages)
// } // }
int16_t* ptr_real = &inst->real[0]; int16_t* preal = &inst->real[0];
int16_t* ptr_imag = &inst->imag[0]; int16_t* pimag = &inst->imag[0];
uint16_t* ptr_noiseSupFilter = &inst->noiseSupFilter[0]; int16_t* pns_filter = (int16_t*)&inst->noiseSupFilter[0];
int16_t* pimag_end = pimag + inst->magnLen - 4;
// Filter the rest in the frequency domain. while (pimag < pimag_end) {
for (; ptr_real < &inst->real[inst->magnLen - 1];) { int16x8_t real = vld1q_s16(preal);
// Loop unrolled once. Both pointers are incremented by 4 twice. int16x8_t imag = vld1q_s16(pimag);
__asm__ __volatile__( int16x8_t ns_filter = vld1q_s16(pns_filter);
"vld1.16 d20, [%[ptr_real]]\n\t"
"vld1.16 d22, [%[ptr_imag]]\n\t"
"vld1.16 d23, [%[ptr_noiseSupFilter]]!\n\t"
"vmull.s16 q10, d20, d23\n\t"
"vmull.s16 q11, d22, d23\n\t"
"vshrn.s32 d20, q10, #14\n\t"
"vshrn.s32 d22, q11, #14\n\t"
"vst1.16 d20, [%[ptr_real]]!\n\t"
"vst1.16 d22, [%[ptr_imag]]!\n\t"
"vld1.16 d18, [%[ptr_real]]\n\t" int32x4_t tmp_r_0 = vmull_s16(vget_low_s16(real), vget_low_s16(ns_filter));
"vld1.16 d24, [%[ptr_imag]]\n\t" int32x4_t tmp_i_0 = vmull_s16(vget_low_s16(imag), vget_low_s16(ns_filter));
"vld1.16 d25, [%[ptr_noiseSupFilter]]!\n\t" int32x4_t tmp_r_1 = vmull_s16(vget_high_s16(real),
"vmull.s16 q9, d18, d25\n\t" vget_high_s16(ns_filter));
"vmull.s16 q12, d24, d25\n\t" int32x4_t tmp_i_1 = vmull_s16(vget_high_s16(imag),
"vshrn.s32 d18, q9, #14\n\t" vget_high_s16(ns_filter));
"vshrn.s32 d24, q12, #14\n\t"
"vst1.16 d18, [%[ptr_real]]!\n\t"
"vst1.16 d24, [%[ptr_imag]]!\n\t"
// Specify constraints. int16x4_t result_r_0 = vshrn_n_s32(tmp_r_0, 14);
:[ptr_imag]"+r"(ptr_imag), int16x4_t result_i_0 = vshrn_n_s32(tmp_i_0, 14);
[ptr_real]"+r"(ptr_real), int16x4_t result_r_1 = vshrn_n_s32(tmp_r_1, 14);
[ptr_noiseSupFilter]"+r"(ptr_noiseSupFilter) int16x4_t result_i_1 = vshrn_n_s32(tmp_i_1, 14);
:
:"d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", vst1q_s16(preal, vcombine_s16(result_r_0, result_r_1));
"q9", "q10", "q11", "q12" vst1q_s16(pimag, vcombine_s16(result_i_0, result_i_1));
); preal += 8;
pimag += 8;
pns_filter += 8;
} }
// Filter the last pair of elements in the frequency domain. // Filter the last element
*ptr_real = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_real, *preal = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*preal, *pns_filter, 14);
(int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages) *pimag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*pimag, *pns_filter, 14);
*ptr_imag = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(*ptr_imag,
(int16_t)(*ptr_noiseSupFilter), 14); // Q(normData-stages)
// (2) Create spectrum. // (2) Create spectrum.
@ -424,74 +415,36 @@ void WebRtcNsx_PrepareSpectrumNeon(NsxInst_t* inst, int16_t* freq_buf) {
// freq_buf[0] = inst->real[0]; // freq_buf[0] = inst->real[0];
// freq_buf[1] = -inst->imag[0]; // freq_buf[1] = -inst->imag[0];
// for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) { // for (i = 1, j = 2; i < inst->anaLen2; i += 1, j += 2) {
// tmp16 = (inst->anaLen << 1) - j;
// freq_buf[j] = inst->real[i]; // freq_buf[j] = inst->real[i];
// freq_buf[j + 1] = -inst->imag[i]; // freq_buf[j + 1] = -inst->imag[i];
// freq_buf[tmp16] = inst->real[i];
// freq_buf[tmp16 + 1] = inst->imag[i];
// } // }
// freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; // freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
// freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; // freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
freq_buf[0] = inst->real[0]; preal = &inst->real[0];
freq_buf[1] = -inst->imag[0]; pimag = &inst->imag[0];
pimag_end = pimag + inst->anaLen2;
int16_t * freq_buf_start = freq_buf;
while (pimag < pimag_end) {
// loop unroll
int16x8x2_t real_imag_0;
int16x8x2_t real_imag_1;
real_imag_0.val[1] = vld1q_s16(pimag);
real_imag_0.val[0] = vld1q_s16(preal);
preal += 8;
pimag += 8;
real_imag_1.val[1] = vld1q_s16(pimag);
real_imag_1.val[0] = vld1q_s16(preal);
preal += 8;
pimag += 8;
int offset = -16; real_imag_0.val[1] = vnegq_s16(real_imag_0.val[1]);
int16_t* ptr_realImag1 = &freq_buf[2]; real_imag_1.val[1] = vnegq_s16(real_imag_1.val[1]);
int16_t* ptr_realImag2 = ptr_realImag2 = &freq_buf[(inst->anaLen << 1) - 8]; vst2q_s16(freq_buf_start, real_imag_0);
ptr_real = &inst->real[1]; freq_buf_start += 16;
ptr_imag = &inst->imag[1]; vst2q_s16(freq_buf_start, real_imag_1);
for (; ptr_real < &inst->real[inst->anaLen2 - 11];) { freq_buf_start += 16;
// Loop unrolled once. All pointers are incremented twice.
__asm__ __volatile__(
"vld1.16 d22, [%[ptr_real]]!\n\t"
"vld1.16 d23, [%[ptr_imag]]!\n\t"
// Negate and interleave:
"vmov.s16 d20, d22\n\t"
"vneg.s16 d21, d23\n\t"
"vzip.16 d20, d21\n\t"
// Write 8 elements to &freq_buf[j]
"vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
// Interleave and reverse elements:
"vzip.16 d22, d23\n\t"
"vrev64.32 d18, d23\n\t"
"vrev64.32 d19, d22\n\t"
// Write 8 elements to &freq_buf[tmp16]
"vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
"vld1.16 d22, [%[ptr_real]]!\n\t"
"vld1.16 d23, [%[ptr_imag]]!\n\t"
// Negate and interleave:
"vmov.s16 d20, d22\n\t"
"vneg.s16 d21, d23\n\t"
"vzip.16 d20, d21\n\t"
// Write 8 elements to &freq_buf[j]
"vst1.16 {d20, d21}, [%[ptr_realImag1]]!\n\t"
// Interleave and reverse elements:
"vzip.16 d22, d23\n\t"
"vrev64.32 d18, d23\n\t"
"vrev64.32 d19, d22\n\t"
// Write 8 elements to &freq_buf[tmp16]
"vst1.16 {d18, d19}, [%[ptr_realImag2]], %[offset]\n\t"
// Specify constraints.
:[ptr_imag]"+r"(ptr_imag),
[ptr_real]"+r"(ptr_real),
[ptr_realImag1]"+r"(ptr_realImag1),
[ptr_realImag2]"+r"(ptr_realImag2)
:[offset]"r"(offset)
:"d18", "d19", "d20", "d21", "d22", "d23"
);
} }
for (ptr_realImag2 += 6;
ptr_real <= &inst->real[inst->anaLen2];
ptr_real += 1, ptr_imag += 1, ptr_realImag1 += 2, ptr_realImag2 -= 2) {
*ptr_realImag1 = *ptr_real;
*(ptr_realImag1 + 1) = -(*ptr_imag);
*ptr_realImag2 = *ptr_real;
*(ptr_realImag2 + 1) = *ptr_imag;
}
freq_buf[inst->anaLen] = inst->real[inst->anaLen2]; freq_buf[inst->anaLen] = inst->real[inst->anaLen2];
freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2]; freq_buf[inst->anaLen + 1] = -inst->imag[inst->anaLen2];
} }