diff --git a/src/common_audio/signal_processing/Android.mk b/src/common_audio/signal_processing/Android.mk index 3a2ddc71f5..3ff066c2af 100644 --- a/src/common_audio/signal_processing/Android.mk +++ b/src/common_audio/signal_processing/Android.mk @@ -20,7 +20,6 @@ LOCAL_SRC_FILES := \ auto_corr_to_refl_coef.c \ auto_correlation.c \ complex_fft.c \ - complex_bit_reverse.c \ copy_set_operations.c \ division_operations.c \ dot_product_with_scale.c \ @@ -77,9 +76,11 @@ endif ifeq ($(TARGET_ARCH),arm) LOCAL_SRC_FILES += \ + complex_bit_reverse_arm.s \ spl_sqrt_floor.s else LOCAL_SRC_FILES += \ + complex_bit_reverse.c \ spl_sqrt_floor.c endif diff --git a/src/common_audio/signal_processing/complex_bit_reverse.c b/src/common_audio/signal_processing/complex_bit_reverse.c index 85c76f8283..02fde1e91c 100644 --- a/src/common_audio/signal_processing/complex_bit_reverse.c +++ b/src/common_audio/signal_processing/complex_bit_reverse.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved. + * Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,44 +8,102 @@ * be found in the AUTHORS file in the root of the source tree. */ - -/* - * This file contains the function WebRtcSpl_ComplexBitReverse(). - * The description header can be found in signal_processing_library.h - * - */ - #include "signal_processing_library.h" -void WebRtcSpl_ComplexBitReverse(WebRtc_Word16 frfi[], int stages) -{ - int mr, nn, n, l, m; - WebRtc_Word16 tr, ti; +/* Tables for data buffer indexes that are bit reversed and thus need to be + * swapped. Note that, index_7[{0, 2, 4, ...}] are for the left side of the swap + * operations, while index_7[{1, 3, 5, ...}] are for the right side of the + * operation. Same for index_8. + */ - n = 1 << stages; +/* Indexes for the case of stages == 7. 
*/ +static const int16_t index_7[112] = { + 1, 64, 2, 32, 3, 96, 4, 16, 5, 80, 6, 48, 7, 112, 9, 72, 10, 40, 11, 104, + 12, 24, 13, 88, 14, 56, 15, 120, 17, 68, 18, 36, 19, 100, 21, 84, 22, 52, + 23, 116, 25, 76, 26, 44, 27, 108, 29, 92, 30, 60, 31, 124, 33, 66, 35, 98, + 37, 82, 38, 50, 39, 114, 41, 74, 43, 106, 45, 90, 46, 58, 47, 122, 49, 70, + 51, 102, 53, 86, 55, 118, 57, 78, 59, 110, 61, 94, 63, 126, 67, 97, 69, + 81, 71, 113, 75, 105, 77, 89, 79, 121, 83, 101, 87, 117, 91, 109, 95, 125, + 103, 115, 111, 123 +}; - mr = 0; - nn = n - 1; +/* Indexes for the case of stages == 8: 120 (left, right) swap pairs. */ +static const int16_t index_8[240] = { + 1, 128, 2, 64, 3, 192, 4, 32, 5, 160, 6, 96, 7, 224, 8, 16, 9, 144, 10, 80, + 11, 208, 12, 48, 13, 176, 14, 112, 15, 240, 17, 136, 18, 72, 19, 200, 20, + 40, 21, 168, 22, 104, 23, 232, 25, 152, 26, 88, 27, 216, 28, 56, 29, 184, + 30, 120, 31, 248, 33, 132, 34, 68, 35, 196, 37, 164, 38, 100, 39, 228, 41, + 148, 42, 84, 43, 212, 44, 52, 45, 180, 46, 116, 47, 244, 49, 140, 50, 76, + 51, 204, 53, 172, 54, 108, 55, 236, 57, 156, 58, 92, 59, 220, 61, 188, 62, + 124, 63, 252, 65, 130, 67, 194, 69, 162, 70, 98, 71, 226, 73, 146, 74, 82, + 75, 210, 77, 178, 78, 114, 79, 242, 81, 138, 83, 202, 85, 170, 86, 106, 87, + 234, 89, 154, 91, 218, 93, 186, 94, 122, 95, 250, 97, 134, 99, 198, 101, + 166, 103, 230, 105, 150, 107, 214, 109, 182, 110, 118, 111, 246, 113, 142, + 115, 206, 117, 174, 119, 238, 121, 158, 123, 222, 125, 190, 127, 254, 131, + 193, 133, 161, 135, 225, 137, 145, 139, 209, 141, 177, 143, 241, 147, 201, + 149, 169, 151, 233, 155, 217, 157, 185, 159, 249, 163, 197, 167, 229, 171, + 213, 173, 181, 175, 245, 179, 205, 183, 237, 187, 221, 191, 253, 199, 227, + 203, 211, 207, 243, 215, 235, 223, 251, 239, 247 +}; - // decimation in time - re-order data - for (m = 1; m <= nn; ++m) - { - l = n; - do - { - l >>= 1; - } while (mr + l > nn); - mr = (mr & (l - 1)) + l; +void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages) { + 
/* Swap, in place, the complex elements whose indexes are bit reverses of each other. For any specific value of stages, we know exactly the indexes that are + * bit reversed. Currently (Feb. 2012) in WebRTC the only possible values of + * stages are 7 and 8, so we use tables to save unnecessary iterations and + * calculations for these two cases. + */ + if (stages == 7 || stages == 8) { + int m = 0; + int length = 112; + const int16_t* index = index_7; - if (mr <= m) - continue; - - tr = frfi[2 * m]; - frfi[2 * m] = frfi[2 * mr]; - frfi[2 * mr] = tr; - - ti = frfi[2 * m + 1]; - frfi[2 * m + 1] = frfi[2 * mr + 1]; - frfi[2 * mr + 1] = ti; + if (stages == 8) { + length = 240; + index = index_8; } + + /* Decimation in time. Swap the elements with bit-reversed indexes. */ + for (m = 0; m < length; m += 2) { + /* We declare an int32_t* pointer, to load both the 16-bit real + * and imaginary elements from complex_data in one instruction, reducing + * complexity. NOTE(review): the cast assumes complex_data is suitably aligned for int32_t access and reads int16_t storage through an int32_t lvalue - confirm this is acceptable for the target compilers (strict aliasing). + */ + int32_t* complex_data_ptr = (int32_t*)complex_data; + int32_t temp = 0; + + temp = complex_data_ptr[index[m]]; /* Real and imaginary */ + complex_data_ptr[index[m]] = complex_data_ptr[index[m + 1]]; + complex_data_ptr[index[m + 1]] = temp; + } + } + else { + int m = 0, mr = 0, l = 0; + int n = 1 << stages; + int nn = n - 1; + + /* Decimation in time - re-order data. Generic path for any other value of stages. */ + for (m = 1; m <= nn; ++m) { + int32_t* complex_data_ptr = (int32_t*)complex_data; + int32_t temp = 0; + + /* Find out indexes that are bit-reversed: advance mr to the bit-reversed counterpart of m. */ + l = n; + do { + l >>= 1; + } while (l > nn - mr); + mr = (mr & (l - 1)) + l; + + if (mr <= m) { /* Pair already swapped, or m is its own bit reverse. */ + continue; + } + + /* Swap the elements with bit-reversed indexes. + * This is similar to the loop in the stages == 7 or 8 cases. 
*/ + temp = complex_data_ptr[m]; /* Real and imaginary */ + complex_data_ptr[m] = complex_data_ptr[mr]; + complex_data_ptr[mr] = temp; + } + } } + diff --git a/src/common_audio/signal_processing/complex_bit_reverse_arm.s b/src/common_audio/signal_processing/complex_bit_reverse_arm.s new file mode 100644 index 0000000000..4828077800 --- /dev/null +++ b/src/common_audio/signal_processing/complex_bit_reverse_arm.s @@ -0,0 +1,126 @@ +@ +@ Copyright (c) 2012 The WebRTC project authors. All Rights Reserved. +@ +@ Use of this source code is governed by a BSD-style license +@ that can be found in the LICENSE file in the root of the source +@ tree. An additional intellectual property rights grant can be found +@ in the file PATENTS. All contributing project authors may +@ be found in the AUTHORS file in the root of the source tree. +@ + +@ This file contains the function WebRtcSpl_ComplexBitReverse(), optimized +@ for ARMv5 platforms. +@ Reference C code is in file complex_bit_reverse.c. Bit-exact. + +.arch armv5 + +.global WebRtcSpl_ComplexBitReverse + +.align 2 + +WebRtcSpl_ComplexBitReverse: +.fnstart + + push {r4-r7} + + cmp r1, #7 + adr r3, index_7 @ Table pointer. + mov r4, #112 @ Number of index table entries. + beq PRE_LOOP_STAGES_7_OR_8 + + cmp r1, #8 + adr r3, index_8 @ Table pointer. + mov r4, #240 @ Number of index table entries. + beq PRE_LOOP_STAGES_7_OR_8 + + mov r3, #1 @ Initialize m. + mov r1, r3, asl r1 @ n = 1 << stages; + subs r6, r1, #1 @ nn = n - 1; + ble END + + mov r5, r0 @ &complex_data + mov r4, #0 @ mr + +LOOP_GENERIC: + rsb r12, r4, r6 @ l > nn - mr + mov r2, r1 @ n + +LOOP_SHIFT: + asr r2, #1 @ l >>= 1; + cmp r2, r12 + bgt LOOP_SHIFT + + sub r12, r2, #1 + and r4, r12, r4 + add r4, r2 @ mr = (mr & (l - 1)) + l; + cmp r4, r3 @ mr <= m ? + ble UPDATE_REGISTERS + + mov r12, r4, asl #2 + ldr r7, [r5, #4] @ complex_data[2 * m, 2 * m + 1]. + @ Offset 4 due to m incrementing from 1. + ldr r2, [r0, r12] @ complex_data[2 * mr, 2 * mr + 1]. 
+ str r7, [r0, r12] + str r2, [r5, #4] + +UPDATE_REGISTERS: + add r3, r3, #1 + add r5, #4 + cmp r3, r1 + bne LOOP_GENERIC + + b END + +PRE_LOOP_STAGES_7_OR_8: + add r4, r3, r4, asl #1 + +LOOP_STAGES_7_OR_8: + ldrsh r2, [r3], #2 @ index[m] + ldrsh r5, [r3], #2 @ index[m + 1] + ldr r1, [r0, r2] @ complex_data[index[m], index[m] + 1] + ldr r12, [r0, r5] @ complex_data[index[m + 1], index[m + 1] + 1] + cmp r3, r4 + str r1, [r0, r5] + str r12, [r0, r2] + bne LOOP_STAGES_7_OR_8 + +END: + pop {r4-r7} + bx lr + +.fnend + + +@ The index tables. Note the values are doubles of the actual indexes for 16-bit +@ elements, different from the generic C code. It actually provides byte offsets +@ for the indexes. + +.align 2 +index_7: @ Indexes for stages == 7. + .hword 4, 256, 8, 128, 12, 384, 16, 64, 20, 320, 24, 192, 28, 448, 36, 288 + .hword 40, 160, 44, 416, 48, 96, 52, 352, 56, 224, 60, 480, 68, 272, 72, 144 + .hword 76, 400, 84, 336, 88, 208, 92, 464, 100, 304, 104, 176, 108, 432, 116 + .hword 368, 120, 240, 124, 496, 132, 264, 140, 392, 148, 328, 152, 200, 156 + .hword 456, 164, 296, 172, 424, 180, 360, 184, 232, 188, 488, 196, 280, 204 + .hword 408, 212, 344, 220, 472, 228, 312, 236, 440, 244, 376, 252, 504, 268 + .hword 388, 276, 324, 284, 452, 300, 420, 308, 356, 316, 484, 332, 404, 348 + .hword 468, 364, 436, 380, 500, 412, 460, 444, 492 + +index_8: @ Indexes for stages == 8. 
+ .hword 4, 512, 8, 256, 12, 768, 16, 128, 20, 640, 24, 384, 28, 896, 32, 64 + .hword 36, 576, 40, 320, 44, 832, 48, 192, 52, 704, 56, 448, 60, 960, 68, 544 + .hword 72, 288, 76, 800, 80, 160, 84, 672, 88, 416, 92, 928, 100, 608, 104 + .hword 352, 108, 864, 112, 224, 116, 736, 120, 480, 124, 992, 132, 528, 136 + .hword 272, 140, 784, 148, 656, 152, 400, 156, 912, 164, 592, 168, 336, 172 + .hword 848, 176, 208, 180, 720, 184, 464, 188, 976, 196, 560, 200, 304, 204 + .hword 816, 212, 688, 216, 432, 220, 944, 228, 624, 232, 368, 236, 880, 244 + .hword 752, 248, 496, 252, 1008, 260, 520, 268, 776, 276, 648, 280, 392, 284 + .hword 904, 292, 584, 296, 328, 300, 840, 308, 712, 312, 456, 316, 968, 324 + .hword 552, 332, 808, 340, 680, 344, 424, 348, 936, 356, 616, 364, 872, 372 + .hword 744, 376, 488, 380, 1000, 388, 536, 396, 792, 404, 664, 412, 920, 420 + .hword 600, 428, 856, 436, 728, 440, 472, 444, 984, 452, 568, 460, 824, 468 + .hword 696, 476, 952, 484, 632, 492, 888, 500, 760, 508, 1016, 524, 772, 532 + .hword 644, 540, 900, 548, 580, 556, 836, 564, 708, 572, 964, 588, 804, 596 + .hword 676, 604, 932, 620, 868, 628, 740, 636, 996, 652, 788, 668, 916, 684 + .hword 852, 692, 724, 700, 980, 716, 820, 732, 948, 748, 884, 764, 1012, 796 + .hword 908, 812, 844, 828, 972, 860, 940, 892, 1004, 956, 988 diff --git a/src/common_audio/signal_processing/include/signal_processing_library.h b/src/common_audio/signal_processing/include/signal_processing_library.h index d9008c1460..87b8f2936a 100644 --- a/src/common_audio/signal_processing/include/signal_processing_library.h +++ b/src/common_audio/signal_processing/include/signal_processing_library.h @@ -429,9 +429,26 @@ int WebRtcSpl_DownsampleFast(const int16_t* data_in, // End: Filter operations. 
// FFT operations + int WebRtcSpl_ComplexFFT(WebRtc_Word16 vector[], int stages, int mode); int WebRtcSpl_ComplexIFFT(WebRtc_Word16 vector[], int stages, int mode); -void WebRtcSpl_ComplexBitReverse(WebRtc_Word16 vector[], int stages); + +// Treat a 16-bit complex data buffer |complex_data| as an array of 32-bit +// values, and swap elements whose indexes are bit-reverses of each other. +// +// Input: +// - complex_data : Complex data buffer containing 2^|stages| real +// elements interleaved with 2^|stages| imaginary +// elements: [Re Im Re Im Re Im....] +// - stages : Number of FFT stages. Must be at least 3 and at most +// 10, since the table WebRtcSpl_kSinTable1024[] is 1024 +// elements long. +// +// Output: +// - complex_data : The complex data buffer. + +void WebRtcSpl_ComplexBitReverse(int16_t* __restrict complex_data, int stages); + // End: FFT operations /************************************************************ @@ -1573,31 +1590,6 @@ void WebRtcSpl_SynthesisQMF(const WebRtc_Word16* low_band, // which returns a scale value of -1, indicating error. // -// -// WebRtcSpl_ComplexBitReverse(...) -// -// Complex Bit Reverse -// -// This function bit-reverses the position of elements in the complex input -// vector into the output vector. -// -// If you bit-reverse a linear-order array, you obtain a bit-reversed order -// array. If you bit-reverse a bit-reversed order array, you obtain a -// linear-order array. -// -// Input: -// - vector : In pointer to complex vector containing 2^|stages| real -// elements interleaved with 2^|stages| imaginary elements. -// [ReImReImReIm....] -// - stages : Number of FFT stages. Must be at least 3 and at most 10, -// since the table WebRtcSpl_kSinTable1024[] is 1024 -// elements long. -// -// Output: -// - vector : Out pointer to complex vector in bit-reversed order. -// The input vector is over written. -// - // // WebRtcSpl_AnalysisQMF(...) 
// diff --git a/src/common_audio/signal_processing/include/spl_inl_armv7.h b/src/common_audio/signal_processing/include/spl_inl_armv7.h index 689c2baeea..5b19c2c1ff 100644 --- a/src/common_audio/signal_processing/include/spl_inl_armv7.h +++ b/src/common_audio/signal_processing/include/spl_inl_armv7.h @@ -15,6 +15,9 @@ #ifndef WEBRTC_SPL_SPL_INL_ARMV7_H_ #define WEBRTC_SPL_SPL_INL_ARMV7_H_ +// TODO(kma): Replace some assembly code with GCC intrinsics +// (e.g. __builtin_clz). + static __inline WebRtc_Word32 WEBRTC_SPL_MUL_16_32_RSFT16(WebRtc_Word16 a, WebRtc_Word32 b) { WebRtc_Word32 tmp;