1.开根号浮点优化
inline float dsps_sqrtf_f32_ansi(float f)
{
int result;
int* f_ptr = (int*)&f;
result = 0x1fbb4000 + (*f_ptr >> 1);
const int *p = &result;
float* f_result = (float*)p;
return *f_result;
}
2.根号除法优化 x^(-1/2)
float dsps_inverted_sqrtf_f32_ansi(float data )
{
const float x2 = data * 0.5F;
const float threehalfs = 1.5F;
union {
float f;
uint32_t i;
} conv = {data}; // member 'f' set to value of 'data'.
conv.i = 0x5f3759df - ( conv.i >> 1 );
conv.f *= ( threehalfs - ( x2 * conv.f * conv.f ) );
return conv.f;
}
// Fast InvSqrt approxumation, with an error of less than 4%.
// This approximation is attributed to Greg Walsh.
inline void InvSqrt(float *x) { *(int *)x = 0x5f3759df - (*(int *)x >> 1); }
inline float SqrtSqrt(float x) { InvSqrt(&x); InvSqrt(&x); return x; }
3.float 乘法(数组相乘 vector)
.text
.align 4
.global dsps_mul_f32_ae32
.type dsps_mul_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dsps_mul_f32_ansi(const float *input1, const float *input2, float *output, int len, int step1, int step2, int step_out)
// {
// for (int i = 0 ; i < len ; i++) {
// output[i * step_out] = input1[i * step1] * input2[i * step2];
// }
// return ESP_OK;
// }
dsps_mul_f32_ae32:
// input1 - a2
// input2 - a3
// output - a4
// len - a5
// step1 - a6
// step2 - a7
// step_out - stack (a8)
entry a1, 16
l32i.n a8, a1, 16 // Load step_out to the a8 register
slli a6, a6, 2 // a6 - step1<<2
slli a7, a7, 2 // a7 - step2<<2
slli a8, a8, 2 // a8 - step_out<<2
lsi f0, a2, 0
add.n a2, a2, a6 // input1_ptr+=step1;
loopnez a5, loop_end_mul_f32_ae32
lsi f1, a3, 0
add.n a3, a3, a7 // input2_ptr+=step2;
mul.s f2, f1, f0 // f2 = f1*f0
lsi f0, a2, 0
add.n a2, a2, a6 // input1_ptr+=step1;
ssi f2, a4, 0
add.n a4, a4, a8 // input2_ptr+=step2;
loop_end_mul_f32_ae32:
movi.n a2, 0 // return status ESP_OK
retw.n
4.float数组乘固定数
.text
.align 4
.global dsps_mulc_f32_ae32
.type dsps_mulc_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dsps_mulc_f32_ansi(const float *input, float *output, int len, float C, int step_in, int step_out)
// {
// for (int i = 0 ; i < len ; i++) {
// output[i * step_out] = input[i * step_in] * C;
// }
// return ESP_OK;
// }
dsps_mulc_f32_ae32:
// input - a2
// output - a3
// len - a4
// C - a5
// step_in - a6
// step_out - a7
entry a1, 16
slli a6, a6, 2 // a6 - step_in<<2
slli a7, a7, 2 // a7 - step_out<<2
wfr f0, a5 // a5 - load to the f0
loopnez a4, loop_end_mulc_f32_ae32
lsi f1, a2, 0
mul.s f2, f1, f0 // f2 = f1 * f0
add.n a2, a2, a6 // input1_ptr+=step_in;
ssi f2, a3, 0
add.n a3, a3, a7 // output+=step_out;
loop_end_mulc_f32_ae32:
movi.n a2, 0 // return status ESP_OK
retw.n
5.float减法
.text
.align 4
.global dsps_sub_f32_ae32
.type dsps_sub_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dsps_sub_f32_ae32(const float *input1, const float *input2, float *output, int len, int step1, int step2, int step_out)
// {
// for (int i = 0 ; i < len ; i++) {
// output[i * step_out] = input1[i * step1] - input2[i * step2];
// }
// return ESP_OK;
// }
dsps_sub_f32_ae32:
// input1 - a2
// input2 - a3
// output - a4
// len - a5
// step1 - a6
// step2 - a7
// step_out - stack (a8)
entry a1, 16
l32i.n a8, a1, 16 // Load step_out to the a8 register
slli a6, a6, 2 // a6 - step1<<2
slli a7, a7, 2 // a7 - step2<<2
slli a8, a8, 2 // a8 - step_out<<2
lsi f0, a2, 0
add.n a2, a2, a6 // input1_ptr+=step1;
loopnez a5, loop_end_sub_f32_ae32
lsi f1, a3, 0
add.n a3, a3, a7 // input2_ptr+=step2;
sub.s f2, f0, f1 // f2 = f0 - f1
lsi f0, a2, 0
add.n a2, a2, a6 // input1_ptr+=step1;
ssi f2, a4, 0
add.n a4, a4, a8 // input2_ptr+=step2;
loop_end_sub_f32_ae32:
movi.n a2, 0 // return status ESP_OK
retw.n
6.float 加法
.text
.align 4
.global dsps_add_f32_ae32
.type dsps_add_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dsps_add_f32_ansi(const float *input1, const float *input2, float *output, int len, int step1, int step2, int step_out)
// {
// for (int i = 0 ; i < len ; i++) {
// output[i * step_out] = input1[i * step1] + input2[i * step2];
// }
// return ESP_OK;
// }
dsps_add_f32_ae32:
// input1 - a2
// input2 - a3
// output - a4
// len - a5
// step1 - a6
// step2 - a7
// step_out - stack (a8)
entry a1, 16
l32i.n a8, a1, 16 // Load step_out to the a8 register
slli a6, a6, 2 // a6 - step1<<2
slli a7, a7, 2 // a7 - step2<<2
slli a8, a8, 2 // a8 - step_out<<2
lsi f0, a2, 0
add.n a2, a2, a6 // input1_ptr+=step1;
loopnez a5, loop_end_add_f32_ae32
lsi f1, a3, 0
add.n a3, a3, a7 // input2_ptr+=step2;
add.s f2, f1, f0 // f2 = f1 + f0
lsi f0, a2, 0
add.n a2, a2, a6 // input1_ptr+=step1;
ssi f2, a4, 0
add.n a4, a4, a8 // input2_ptr+=step2;
loop_end_add_f32_ae32:
movi.n a2, 0 // return status ESP_OK
retw.n
7.矩阵乘法
7.1 常规矩阵乘法
// Matrinx A(m,n), m - amount or rows, n - amount of columns
// C(m,k) = A(m,n)*B(n,k)
// c(i,j) = sum(a(i,s)*b(s,j)) , s=1..n
esp_err_t dspm_mult_f32_ansi(const float *A, const float *B, float *C, int m, int n, int k)
{
for (int i = 0 ; i < m ; i++) {
for (int j = 0 ; j < k ; j++) {
C[i * k + j] = A[i * n] * B[j];
for (int s = 1; s < n ; s++) {
C[i * k + j] += A[i * n + s] * B[s * k + j];
}
}
}
return ESP_OK;
}
///汇编优化
.text
.align 4
.global dspm_mult_f32_ae32
.global .dspm_mult_f32_ae32_body
.type dspm_mult_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dspm_mult_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_f32_ae32:
// A - a2
// B - a3
// C - a4
// m - a5
// n - a6
// k - a7
// a8 = n*4
// a10 = 4
// a9 - counter loop1: 0..m
// a11 - counter loop2: 0..k
// a12 - A
// a13 - B
// a4 - C
entry a1, 16
// Array increment for floating point data should be 4
.dspm_mult_f32_ae32_body:
slli a8, a6, 2 // Pointer increment for A
slli a15,a7, 2 // Pointer increment for B
movi.n a14, 0 // Innitial state of accumulator f1
movi.n a10, 4 // Increment = 4
movi.n a9, 0 // counter loop1
.dpf_loop1:
movi.n a11, 0 // reset counter for loop2
.dpf_loop2:
// Clear initial state of the result register
// a2 - A
// a3 - B
// a6 - n
// a10 - step == 4 bytes
// a8 - step n*4
mov a12, a2 // load A
slli a13, a11, 2 // loop count to pointer value
add.n a13, a3, a13 // load A
wfr f1, a14 // reset f1
// Calculating dotproduct...
dotprode_f32_ae32 a12, a13, a6, a10, a15;
ssi f1, a4, 0 // Store result from f1 to memory at a4
addi a4, a4, 4 // increment a4 for next time
// check loop 2
addi a11, a11, 1 // Increment loop2 counter
blt a11, a7, .dpf_loop2
// check loop 1
add.n a2, a2, a8 // Increment A, A = A[i*n]
addi a9, a9, 1 // Increment loop1 counter
blt a9, a5, .dpf_loop1
movi.n a2, 0 // return status ESP_OK
retw.n
7.2 C[1][3] = A[3][3] * B[3][1] 型乘法优化
.text
.align 4
.global dspm_mult_3x3x1_f32_ae32
.type dspm_mult_3x3x1_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_3x3x1_f32_ae32:
// A - a2
// B - a3
// C - a4
// a5 - 0
// a6 - 3
entry a1, 16
movi a5, 0
movi a6, 3
lsi f13,a3, 0 // B[0]
lsi f14,a3, 4 // B[1]
lsi f15,a3, 8 // B[2]
// addi a2, a2, -12 // To compensate first increment
loopnez a6, loop_mac_3x3x1_end_m_ae32
wfr f0, a5
lsi f2, a2, 0
madd.s f0, f2, f13
lsi f3, a2, 4
madd.s f0, f3, f14
lsi f4, a2, 8
madd.s f0, f4, f15
addi a2, a2, 12
ssi f0, a4, 0
addi a4, a4, 4
loop_mac_3x3x1_end_m_ae32:
movi.n a2, 0 // return status ESP_OK
retw.n
7.3 C[3][3] = A[3][3] * B[3][3] 型矩阵优化
.text
.align 4
.global dspm_mult_3x3x3_f32_ae32
.type dspm_mult_3x3x3_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_3x3x3_f32_ae32:
// A - a2
// B - a3
// C - a4
// a5 - 0
// a6 - 3 - internal loop for n
// a7 - 3 - external loop for M
entry a1, 16
movi a5, 0
movi a6, 3
movi a7, 3 // loop ccount
m_loop_3x3x3:
mov a12, a2 // A
mov a14, a4 // output pointer
lsi f12, a3, 0 // B[0][0]
lsi f13, a3, 12 // B[1][0]
lsi f14, a3, 24 // B[2][0]
loopnez a6, loop_mac_3x3x3_end_m_ae32
wfr f0, a5
lsi f2, a12, 0
madd.s f0, f2, f12
lsi f3, a12, 4
madd.s f0, f3, f13
lsi f4, a12, 8
madd.s f0, f4, f14
addi a12, a12, 12
ssi f0, a14, 0
addi a14, a14, 12
loop_mac_3x3x3_end_m_ae32:
addi a3, a3, 4 // increment input pointer B
addi a4, a4, 4
addi a7, a7, -1
bnez a7, m_loop_3x3x3
movi.n a2, 0 // return status ESP_OK
retw.n
7.4 C[1][4] = A[4][4] * B[4][1] 矩阵乘法优化
.text
.align 4
.global dspm_mult_4x4x1_f32_ae32
.type dspm_mult_4x4x1_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_4x4x1_f32_ae32:
// A - a2
// B - a3
// C - a4
// a5 - 0
// a6 - 3
entry a1, 16
movi a5, 0
movi a6, 4
lsi f12,a3, 0 // B[0]
lsi f13,a3, 4 // B[1]
lsi f14,a3, 8 // B[2]
lsi f15,a3, 12 // B[3]
loopnez a6, loop_mac_4x4x1_end_m_ae32
wfr f0, a5
lsi f2, a2, 0
madd.s f0, f2, f12
lsi f3, a2, 4
madd.s f0, f3, f13
lsi f4, a2, 8
madd.s f0, f4, f14
lsi f5, a2, 12
madd.s f0, f5, f15
addi a2, a2, 16
ssi f0, a4, 0
addi a4, a4, 4
loop_mac_4x4x1_end_m_ae32:
movi.n a2, 0 // return status ESP_OK
retw.n
7.5 C[4][4] = A[4][4] * B[4][4] 型乘法优化
.text
.align 4
.global dspm_mult_4x4x4_f32_ae32
.type dspm_mult_4x4x4_f32_ae32,@function
// The function implements the following C code:
// esp_err_t dspm_mult_3x3x1_f32_ansi(const float* A, const float* B, float* C, int m, int n, int k)
// {
// for (int i=0 ; i< m ; i++)
// {
// for (int j=0 ; j< k ; j++)
// {
// C[i*k + j] = A[i*n]*B[j];
// for (int s=1; s< n ; s++)
// {
// C[i*k + j] += A[i*n + s]*B[s*k + j];
// }
// }
// }
// return ESP_OK;
// }
dspm_mult_4x4x4_f32_ae32:
// A - a2
// B - a3
// C - a4
// a5 - 0
// a6 - 4 - internal loop for n
// a7 - 4 - external loop for M
entry a1, 16
movi a5, 0
movi a6, 4
movi a7, 4 // loop ccount
m_loop_4x4x4:
mov a12, a2 // A
mov a14, a4 // output pointer
lsi f12, a3, 0 // B[0][0]
lsi f13, a3, 16 // B[1][0]
lsi f14, a3, 32 // B[2][0]
lsi f15, a3, 48 // B[3][0]
loopnez a6, loop_mac_4x4x4_end_m_ae32
wfr f0, a5
lsi f2, a12, 0
madd.s f0, f2, f12
lsi f3, a12, 4
madd.s f0, f3, f13
lsi f4, a12, 8
madd.s f0, f4, f14
lsi f5, a12, 12
madd.s f0, f5, f15
addi a12, a12, 16
ssi f0, a14, 0
addi a14, a14, 16
loop_mac_4x4x4_end_m_ae32:
addi a3, a3, 4 // increment input pointer B
addi a4, a4, 4
addi a7, a7, -1
bnez a7, m_loop_4x4x4
movi.n a2, 0 // return status ESP_OK
retw.n
8.sin cos 生成优化
用泰勒级数来求近似值。由于上面提到精度足够即可,在具体实现中我只迭代了 10 次。具体实现如下:
// sin(x) = x - (x^3 / 3!) + (x^5 / 5!) - (x^7 / 7!) + ...
template <typename T>
T f_sin(const T& x)
{
const T x2 = x * x;
T power = x;
T facter = 1;
T sign = 1;
T sum = 0;
const int loop = 22; // 10 times loop
for (int i = 3; i < loop; i += 2) {
sign *= -1;
power *= x2;
facter *= i * (i - 1);
sum += sign * power / facter;
}
return sum + x;
}
// cos(x) = 1 - (x^2 / 2!) + (x^4 / 4!) - (x^6 / 6!) + ...
template <typename T>
T f_cos(const T& x)
{
const T x2 = x * x;
T power = 1;
T facter = 1;
T sign = 1;
T sum = 0;
const int loop = 21; // 10 times loop
for (int i = 2; i < loop; i += 2) {
sign *= -1;
power *= x2;
facter *= i * (i - 1);
sum += sign * power / facter;
}
return sum + 1;
}
//随机数产生
#include <ctime>
typedef unsigned int u32;
class LCG {
public:
LCG(u32 seed) : mSeed(seed) {}
u32 operator()() {
mSeed = mSeed * 214013U + 2531011U;
return (mSeed >> 16U) & 0x7FFFU;
}
private:
u32 mSeed;
};
9.POW10的快速生成
// Fast power-of-10 approximation, with RMS error of 1.77%.
// This approximation developed by Nicol Schraudolph (Neural Computation vol 11, 1999).
// Adapted for 32-bit floats by Lippold Haken of Haken Audio, April 2010.
// Set float variable's bits to integer expression.
// f=b^f is approximated by
// (int)f = f*0x00800000*log(b)/log(2) + 0x3F800000-60801*8
// f=10^f is approximated by
// (int)f = f*27866352.6 + 1064866808.0
inline void Pow10(float *f) { *(int *)f = *f * 27866352.6 + 1064866808.0; };
10.(A*A+B*B)^(1/2) 优化
double
pythag(double a, double b)
// return sqrt(a*a+b*b) avoiding overflow.
// Shouldn't be necessary with type double and sane algorithms/code
{
a = fabs(a);
b = fabs(b);
if ( a>b )
{
double v = b / a;
return a * sqrt(1.0+v*v);
}
else
{
if ( b==0.0 ) return 0.0;
double v = a / b;
return b * sqrt(1.0+v*v);
}
}
11.二项式系数
inline ulong binomial(ulong n, ulong k)
{
if ( k>n ) return 0;
if ( (k==0) || (k==n) ) return 1;
// if ( 2*k > n ) // use symmetry
if ( k > n / 2 ) k = n-k; // use symmetry
ulong b = n - k + 1;
ulong f = b;
for (ulong j=2; j<=k; ++j)
{
++f;
b *= f;
b /= j;
}
return b;
}
12.算数平均(A+B)/2
static inline ulong floor_average(ulong x, ulong y)
// Return floor( (x+y)/2 )
// Result is correct even if (x+y) wouldn't fit into a ulong
// Use: x+y == ((x&y)<<1) + (x^y)
// that is: sum == carries + sum_without_carries
{
return (x & y) + ((x ^ y) >> 1);
// return y + ((x-y)>>1); // works if x>=y
}
// -------------------------
static inline ulong ceil_average(ulong x, ulong y)
// Return ceil( (x+y)/2 )
// Result is correct even if (x+y) wouldn't fit into a ulong
// Use: x+y == ((x|y)<<1) - (x^y)
// ceil_average(x,y) == average(x,y) + ((x^y)&1))
{
return (x | y) - ((x ^ y) >> 1);
}
// -------------------------
/*
The following assembler implementations were suggested
by Stefan Kanthak (March 2019):
floor_average:
movl x, %eax
addl y, %eax
rcrl $1, %eax
ret
ceil_average:
movl x, %eax
addl y, %eax
rcrl $1, %eax
adcl $0, %eax
ret
*/
13.使用TI的IQmath 库
参考https://www.armbbs.cn/forum.php?mod=viewthread&tid=94660