Optimized sqrt() for ARM.

Merged the optimized sqrt() function from WebRTC svn rev r1627
to improve performance on ARM.

Change-Id: Ie9c57e2a3b8a0786c0169028c0940184b7c9db5e
This commit is contained in:
Eric Laurent
2012-02-17 10:26:11 -08:00
parent c55a963834
commit 5870e071aa
4 changed files with 168 additions and 36 deletions

36
NOTICE
View File

@ -164,3 +164,39 @@ Scott McMurray
// release() added in by Google. Use this to conditionally
// transfer ownership of a heap-allocated object to the caller, usually on
// method success.
===============================================================================
/*
* Written by Wilco Dijkstra, 1996.
* Refer to NOTICE file at the root of git project.
*
* Minor modifications in code style for WebRTC, 2012.
*/
// The following email record is related to source files spl_sqrt_floor.c
// and spl_sqrt_floor.s in trunk/src/common_audio/signal_processing/.
//
//
// From: Wilco Dijkstra <Wilco.Dijkstra@ntlworld.com>
// Date: Fri, Jun 24, 2011 at 3:20 AM
// Subject: Re: sqrt routine
// To: Kevin Ma <kma@google.com>
// Hi Kevin,
// Thanks for asking. Those routines are public domain (originally posted to
// comp.sys.arm a long time ago), so you can use them freely for any purpose.
// Cheers,
// Wilco
//
// ----- Original Message -----
// From: "Kevin Ma" <kma@google.com>
// To: <Wilco.Dijkstra@ntlworld.com>
// Sent: Thursday, June 23, 2011 11:44 PM
// Subject: Fwd: sqrt routine
// Hi Wilco,
// I saw your sqrt routine from several web sites, including
// http://www.finesse.demon.co.uk/steven/sqrt.html.
// Just wonder if there's any copyright information with your Successive
// approximation routines, or if I can freely use it for any purpose.
// Thanks.
// Kevin

View File

@ -44,7 +44,6 @@ LOCAL_SRC_FILES := \
resample_by_2_internal.c \
resample_fractional.c \
spl_sqrt.c \
spl_sqrt_floor.c \
spl_version.c \
splitting_filter.c \
sqrt_of_one_minus_x_squared.c \
@ -65,6 +64,14 @@ LOCAL_CFLAGS += \
$(MY_ARM_CFLAGS_NEON)
endif
ifeq ($(TARGET_ARCH),arm)
LOCAL_SRC_FILES += \
spl_sqrt_floor.s
else
LOCAL_SRC_FILES += \
spl_sqrt_floor.c
endif
LOCAL_SHARED_LIBRARIES := libstlport
ifeq ($(TARGET_OS)-$(TARGET_SIMULATOR),linux-true)

View File

@ -1,21 +1,26 @@
/*
* Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/*
* This file contains the function WebRtcSpl_SqrtFloor().
* The description header can be found in signal_processing_library.h
* Written by Wilco Dijkstra, 1996.
* Refer to NOTICE file at the root of git project.
*
* Minor modifications in code style for WebRTC, 2012.
*/
#include "signal_processing_library.h"
/*
* Algorithm:
* Successive approximation of the equation (root + delta) ^ 2 = N
* until delta < 1. If delta < 1 we have the integer part of SQRT (N).
* Use delta = 2^i for i = 15 .. 0.
*
* Output precision is 16 bits. Note that for large input values (close to
* 0x7FFFFFFF), bit 15 (the highest bit of the low 16-bit half word)
* carries MSB information (it is a value bit, not a sign bit). Take care
* if you need to cast the output to the int16_t type.
*
* If the input value is negative, it returns 0.
*/
#define WEBRTC_SPL_SQRT_ITER(N) \
try1 = root + (1 << (N)); \
if (value >= try1 << (N)) \
@ -24,13 +29,9 @@
root |= 2 << (N); \
}
// (out) Square root of input parameter
WebRtc_Word32 WebRtcSpl_SqrtFloor(WebRtc_Word32 value)
int32_t WebRtcSpl_SqrtFloor(int32_t value)
{
// new routine for performance, 4 cycles/bit in ARM
// output precision is 16 bits
WebRtc_Word32 root = 0, try1;
int32_t root = 0, try1;
WEBRTC_SPL_SQRT_ITER (15);
WEBRTC_SPL_SQRT_ITER (14);

View File

@ -0,0 +1,88 @@
@ Written by Wilco Dijkstra, 1996.
@ Refer to NOTICE file at the root of git project.
@
@ Minor modifications in code style for WebRTC, 2012.
@
@ WebRtcSpl_SqrtFloor: integer square root by successive approximation,
@ fully unrolled into 16 steps of three instructions (one result bit per
@ step, most significant bit first).
@
@ Output is bit-exact with the reference C code in spl_sqrt_floor.c for
@ non-negative inputs (0 .. 0x7FFFFFFF). NOTE: this routine compares with
@ unsigned conditions (HS), so an input with bit 31 set is processed as a
@ large unsigned value, whereas the C reference documents a result of 0 for
@ negative input.
@
@ Syntax: GNU as, ARM (A32) instruction set.
@ Input : r0 32 bit unsigned integer
@ Output: r0 = INT (SQRT (r0)), precision is 16 bits
@ Registers touched: r0, r1, r2 and the condition flags.
@ No stack or memory access.
@
@ Register roles:
@   r0 = remainder (input minus the square of the root bits accepted so far)
@   r1 = constant 3 << 30, re-creates the marker in r2 after each shift
@   r2 = bits 31:30 hold the marker 0b01, the low bits collect the root;
@        rotating r2 right by 2 * i yields the trial value for step i

        .text
        .arm                            @ conditional data ops below are A32 code
        .global WebRtcSpl_SqrtFloor
        @ Mark the symbol as a function so the linker can apply ARM/Thumb
        @ interworking to calls that target it.
        .type   WebRtcSpl_SqrtFloor, %function
        .align  2
WebRtcSpl_SqrtFloor:
        .fnstart
        mov     r1, #3 << 30
        mov     r2, #1 << 30

        @ unroll for i = 0 .. 15
        @ Each step: compare the remainder with the trial value, subtract it
        @ when it fits (C set), then shift the carry in as the next root bit:
        @ r2 = r1 + (r2 << 1) + C. SUBHS leaves the flags from CMP untouched.

        cmp     r0, r2, ror #2 * 0
        subhs   r0, r0, r2, ror #2 * 0
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 1
        subhs   r0, r0, r2, ror #2 * 1
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 2
        subhs   r0, r0, r2, ror #2 * 2
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 3
        subhs   r0, r0, r2, ror #2 * 3
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 4
        subhs   r0, r0, r2, ror #2 * 4
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 5
        subhs   r0, r0, r2, ror #2 * 5
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 6
        subhs   r0, r0, r2, ror #2 * 6
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 7
        subhs   r0, r0, r2, ror #2 * 7
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 8
        subhs   r0, r0, r2, ror #2 * 8
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 9
        subhs   r0, r0, r2, ror #2 * 9
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 10
        subhs   r0, r0, r2, ror #2 * 10
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 11
        subhs   r0, r0, r2, ror #2 * 11
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 12
        subhs   r0, r0, r2, ror #2 * 12
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 13
        subhs   r0, r0, r2, ror #2 * 13
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 14
        subhs   r0, r0, r2, ror #2 * 14
        adc     r2, r1, r2, lsl #1

        cmp     r0, r2, ror #2 * 15
        subhs   r0, r0, r2, ror #2 * 15
        adc     r2, r1, r2, lsl #1

        @ Strip the two marker bits; what remains in r2 is the 16-bit root.
        bic     r0, r2, #3 << 30        @ for rounding add: cmp r0, r2 adc r2, #1
        bx      lr
        .fnend
        .size   WebRtcSpl_SqrtFloor, . - WebRtcSpl_SqrtFloor