361 lines
11 KiB
C
361 lines
11 KiB
C
|
/*
|
||
|
sse_mathfun_extension.h - zlib license
|
||
|
Written by Tolga Mizrak 2016
|
||
|
Extension of sse_mathfun.h, which is written by Julien Pommier
|
||
|
|
||
|
Based on the corresponding algorithms of the cephes math library
|
||
|
|
||
|
This is written as an extension to sse_mathfun.h instead of modifying it, just because I didn't want
|
||
|
to maintain a modified version of the original library. This way switching to a newer version of the
|
||
|
library won't be a hassle.
|
||
|
|
||
|
Note that non SSE2 implementations of tan_ps, atan_ps, cot_ps and atan2_ps are not implemented yet.
|
||
|
As such, currently you need to #define USE_SSE2 to compile.
|
||
|
|
||
|
With tan_ps, cot_ps you get good precision on input ranges that are further away from the domain
|
||
|
borders (-PI/2, PI/2 for tan and 0, 1 for cot). See the results on the deviations for these
|
||
|
functions on my machine:
|
||
|
checking tan on [-0.25*Pi, 0.25*Pi]
|
||
|
max deviation from tanf(x): 1.19209e-07 at 0.250000006957*Pi, max deviation from cephes_tan(x):
|
||
|
5.96046e-08
|
||
|
->> precision OK for the tan_ps <<-
|
||
|
|
||
|
checking tan on [-0.49*Pi, 0.49*Pi]
|
||
|
max deviation from tanf(x): 3.8147e-06 at -0.490000009841*Pi, max deviation from cephes_tan(x):
|
||
|
9.53674e-07
|
||
|
->> precision OK for the tan_ps <<-
|
||
|
|
||
|
checking cot on [0.2*Pi, 0.7*Pi]
|
||
|
max deviation from cotf(x): 1.19209e-07 at 0.204303119606*Pi, max deviation from cephes_cot(x):
|
||
|
1.19209e-07
|
||
|
->> precision OK for the cot_ps <<-
|
||
|
|
||
|
checking cot on [0.01*Pi, 0.99*Pi]
|
||
|
max deviation from cotf(x): 3.8147e-06 at 0.987876517942*Pi, max deviation from cephes_cot(x):
|
||
|
9.53674e-07
|
||
|
->> precision OK for the cot_ps <<-
|
||
|
|
||
|
With atan_ps and atan2_ps you get pretty good precision, atan_ps max deviation is < 2e-7 and
|
||
|
atan2_ps max deviation is < 2.5e-7
|
||
|
*/
|
||
|
|
||
|
/* Copyright (C) 2016 Tolga Mizrak
|
||
|
|
||
|
This software is provided 'as-is', without any express or implied
|
||
|
warranty. In no event will the authors be held liable for any damages
|
||
|
arising from the use of this software.
|
||
|
|
||
|
Permission is granted to anyone to use this software for any purpose,
|
||
|
including commercial applications, and to alter it and redistribute it
|
||
|
freely, subject to the following restrictions:
|
||
|
|
||
|
1. The origin of this software must not be misrepresented; you must not
|
||
|
claim that you wrote the original software. If you use this software
|
||
|
in a product, an acknowledgment in the product documentation would be
|
||
|
appreciated but is not required.
|
||
|
2. Altered source versions must be plainly marked as such, and must not be
|
||
|
misrepresented as being the original software.
|
||
|
3. This notice may not be removed or altered from any source distribution.
|
||
|
|
||
|
(this is the zlib license)
|
||
|
*/
|
||
|
|
||
|
#pragma once
|
||
|
|
||
|
#ifndef _SSE_MATHFUN_EXTENSION_H_INCLUDED_
|
||
|
#define _SSE_MATHFUN_EXTENSION_H_INCLUDED_
|
||
|
|
||
|
#ifndef USE_SSE2
|
||
|
#error sse1 & mmx version not implemented
|
||
|
#endif
|
||
|
|
||
|
#ifdef _MSC_VER
|
||
|
#pragma warning( push )
|
||
|
/* warning C4838: conversion from 'double' to 'const float' requires a narrowing conversion */
|
||
|
#pragma warning( disable : 4838 )
|
||
|
/* warning C4305: 'initializing': truncation from 'double' to 'const float' */
|
||
|
#pragma warning( disable : 4305 )
|
||
|
#endif
|
||
|
|
||
|
#include "sse_mathfun.h"
|
||
|
|
||
|
_PS_CONST( 0, 0 );
|
||
|
_PS_CONST( 2, 2 );
|
||
|
_PI32_CONST( neg1, 1 );
|
||
|
|
||
|
_PS_CONST( tancof_p0, 9.38540185543E-3 );
|
||
|
_PS_CONST( tancof_p1, 3.11992232697E-3 );
|
||
|
_PS_CONST( tancof_p2, 2.44301354525E-2 );
|
||
|
_PS_CONST( tancof_p3, 5.34112807005E-2 );
|
||
|
_PS_CONST( tancof_p4, 1.33387994085E-1 );
|
||
|
_PS_CONST( tancof_p5, 3.33331568548E-1 );
|
||
|
|
||
|
_PS_CONST( tancot_eps, 1.0e-4 );
|
||
|
|
||
|
v4sf tancot_ps( v4sf x, int cotFlag )
|
||
|
{
|
||
|
v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
|
||
|
|
||
|
#ifdef USE_SSE2
|
||
|
v4si emm2;
|
||
|
#else
|
||
|
#endif
|
||
|
sign_bit = x;
|
||
|
/* take the absolute value */
|
||
|
x = _mm_and_ps( x, *(v4sf*)_ps_inv_sign_mask );
|
||
|
/* extract the sign bit (upper one) */
|
||
|
sign_bit = _mm_and_ps( sign_bit, *(v4sf*)_ps_sign_mask );
|
||
|
|
||
|
/* scale by 4/Pi */
|
||
|
y = _mm_mul_ps( x, *(v4sf*)_ps_cephes_FOPI );
|
||
|
|
||
|
#ifdef USE_SSE2
|
||
|
/* store the integer part of y in mm0 */
|
||
|
emm2 = _mm_cvttps_epi32( y );
|
||
|
/* j=(j+1) & (~1) (see the cephes sources) */
|
||
|
emm2 = _mm_add_epi32( emm2, *(v4si*)_pi32_1 );
|
||
|
emm2 = _mm_and_si128( emm2, *(v4si*)_pi32_inv1 );
|
||
|
y = _mm_cvtepi32_ps( emm2 );
|
||
|
|
||
|
emm2 = _mm_and_si128( emm2, *(v4si*)_pi32_2 );
|
||
|
emm2 = _mm_cmpeq_epi32( emm2, _mm_setzero_si128() );
|
||
|
|
||
|
v4sf poly_mask = _mm_castsi128_ps( emm2 );
|
||
|
#else
|
||
|
#endif
|
||
|
/* The magic pass: "Extended precision modular arithmetic"
|
||
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||
|
xmm1 = *(v4sf*)_ps_minus_cephes_DP1;
|
||
|
xmm2 = *(v4sf*)_ps_minus_cephes_DP2;
|
||
|
xmm3 = *(v4sf*)_ps_minus_cephes_DP3;
|
||
|
xmm1 = _mm_mul_ps( y, xmm1 );
|
||
|
xmm2 = _mm_mul_ps( y, xmm2 );
|
||
|
xmm3 = _mm_mul_ps( y, xmm3 );
|
||
|
v4sf z = _mm_add_ps( x, xmm1 );
|
||
|
z = _mm_add_ps( z, xmm2 );
|
||
|
z = _mm_add_ps( z, xmm3 );
|
||
|
|
||
|
v4sf zz = _mm_mul_ps( z, z );
|
||
|
|
||
|
y = *(v4sf*)_ps_tancof_p0;
|
||
|
y = _mm_mul_ps( y, zz );
|
||
|
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p1 );
|
||
|
y = _mm_mul_ps( y, zz );
|
||
|
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p2 );
|
||
|
y = _mm_mul_ps( y, zz );
|
||
|
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p3 );
|
||
|
y = _mm_mul_ps( y, zz );
|
||
|
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p4 );
|
||
|
y = _mm_mul_ps( y, zz );
|
||
|
y = _mm_add_ps( y, *(v4sf*)_ps_tancof_p5 );
|
||
|
y = _mm_mul_ps( y, zz );
|
||
|
y = _mm_mul_ps( y, z );
|
||
|
y = _mm_add_ps( y, z );
|
||
|
|
||
|
v4sf y2;
|
||
|
if( cotFlag ) {
|
||
|
y2 = _mm_xor_ps( y, *(v4sf*)_ps_sign_mask );
|
||
|
/* y = _mm_rcp_ps( y ); */
|
||
|
/* using _mm_rcp_ps here loses on way too much precision, better to do a div */
|
||
|
y = _mm_div_ps( *(v4sf*)_ps_1, y );
|
||
|
} else {
|
||
|
/* y2 = _mm_rcp_ps( y ); */
|
||
|
/* using _mm_rcp_ps here loses on way too much precision, better to do a div */
|
||
|
y2 = _mm_div_ps( *(v4sf*)_ps_1, y );
|
||
|
y2 = _mm_xor_ps( y2, *(v4sf*)_ps_sign_mask );
|
||
|
}
|
||
|
|
||
|
/* select the correct result from the two polynoms */
|
||
|
xmm3 = poly_mask;
|
||
|
y = _mm_and_ps( xmm3, y );
|
||
|
y2 = _mm_andnot_ps( xmm3, y2 );
|
||
|
y = _mm_or_ps( y, y2 );
|
||
|
|
||
|
/* update the sign */
|
||
|
y = _mm_xor_ps( y, sign_bit );
|
||
|
|
||
|
return y;
|
||
|
}
|
||
|
|
||
|
v4sf tan_ps( v4sf x ) { return tancot_ps( x, 0 ); }
|
||
|
|
||
|
v4sf cot_ps( v4sf x ) { return tancot_ps( x, 1 ); }
|
||
|
|
||
|
_PS_CONST( atanrange_hi, 2.414213562373095 );
|
||
|
_PS_CONST( atanrange_lo, 0.4142135623730950 );
|
||
|
const float PIF = 3.141592653589793238;
|
||
|
const float PIO2F = 1.5707963267948966192;
|
||
|
_PS_CONST( cephes_PIF, 3.141592653589793238 );
|
||
|
_PS_CONST( cephes_PIO2F, 1.5707963267948966192 );
|
||
|
_PS_CONST( cephes_PIO4F, 0.7853981633974483096 );
|
||
|
|
||
|
_PS_CONST( atancof_p0, 8.05374449538e-2 );
|
||
|
_PS_CONST( atancof_p1, 1.38776856032E-1 );
|
||
|
_PS_CONST( atancof_p2, 1.99777106478E-1 );
|
||
|
_PS_CONST( atancof_p3, 3.33329491539E-1 );
|
||
|
|
||
|
v4sf atan_ps( v4sf x )
|
||
|
{
|
||
|
v4sf sign_bit, y;
|
||
|
|
||
|
sign_bit = x;
|
||
|
/* take the absolute value */
|
||
|
x = _mm_and_ps( x, *(v4sf*)_ps_inv_sign_mask );
|
||
|
/* extract the sign bit (upper one) */
|
||
|
sign_bit = _mm_and_ps( sign_bit, *(v4sf*)_ps_sign_mask );
|
||
|
|
||
|
/* range reduction, init x and y depending on range */
|
||
|
#ifdef USE_SSE2
|
||
|
/* x > 2.414213562373095 */
|
||
|
v4sf cmp0 = _mm_cmpgt_ps( x, *(v4sf*)_ps_atanrange_hi );
|
||
|
/* x > 0.4142135623730950 */
|
||
|
v4sf cmp1 = _mm_cmpgt_ps( x, *(v4sf*)_ps_atanrange_lo );
|
||
|
|
||
|
/* x > 0.4142135623730950 && !( x > 2.414213562373095 ) */
|
||
|
v4sf cmp2 = _mm_andnot_ps( cmp0, cmp1 );
|
||
|
|
||
|
/* -( 1.0/x ) */
|
||
|
v4sf y0 = _mm_and_ps( cmp0, *(v4sf*)_ps_cephes_PIO2F );
|
||
|
v4sf x0 = _mm_div_ps( *(v4sf*)_ps_1, x );
|
||
|
x0 = _mm_xor_ps( x0, *(v4sf*)_ps_sign_mask );
|
||
|
|
||
|
v4sf y1 = _mm_and_ps( cmp2, *(v4sf*)_ps_cephes_PIO4F );
|
||
|
/* (x-1.0)/(x+1.0) */
|
||
|
v4sf x1_o = _mm_sub_ps( x, *(v4sf*)_ps_1 );
|
||
|
v4sf x1_u = _mm_add_ps( x, *(v4sf*)_ps_1 );
|
||
|
v4sf x1 = _mm_div_ps( x1_o, x1_u );
|
||
|
|
||
|
v4sf x2 = _mm_and_ps( cmp2, x1 );
|
||
|
x0 = _mm_and_ps( cmp0, x0 );
|
||
|
x2 = _mm_or_ps( x2, x0 );
|
||
|
cmp1 = _mm_or_ps( cmp0, cmp2 );
|
||
|
x2 = _mm_and_ps( cmp1, x2 );
|
||
|
x = _mm_andnot_ps( cmp1, x );
|
||
|
x = _mm_or_ps( x2, x );
|
||
|
|
||
|
y = _mm_or_ps( y0, y1 );
|
||
|
#else
|
||
|
#error sse1 & mmx version not implemented
|
||
|
#endif
|
||
|
|
||
|
v4sf zz = _mm_mul_ps( x, x );
|
||
|
v4sf acc = *(v4sf*)_ps_atancof_p0;
|
||
|
acc = _mm_mul_ps( acc, zz );
|
||
|
acc = _mm_sub_ps( acc, *(v4sf*)_ps_atancof_p1 );
|
||
|
acc = _mm_mul_ps( acc, zz );
|
||
|
acc = _mm_add_ps( acc, *(v4sf*)_ps_atancof_p2 );
|
||
|
acc = _mm_mul_ps( acc, zz );
|
||
|
acc = _mm_sub_ps( acc, *(v4sf*)_ps_atancof_p3 );
|
||
|
acc = _mm_mul_ps( acc, zz );
|
||
|
acc = _mm_mul_ps( acc, x );
|
||
|
acc = _mm_add_ps( acc, x );
|
||
|
y = _mm_add_ps( y, acc );
|
||
|
|
||
|
/* update the sign */
|
||
|
y = _mm_xor_ps( y, sign_bit );
|
||
|
|
||
|
return y;
|
||
|
}
|
||
|
|
||
|
v4sf atan2_ps( v4sf y, v4sf x )
|
||
|
{
|
||
|
v4sf x_eq_0 = _mm_cmpeq_ps( x, *(v4sf*)_ps_0 );
|
||
|
v4sf x_gt_0 = _mm_cmpgt_ps( x, *(v4sf*)_ps_0 );
|
||
|
v4sf x_le_0 = _mm_cmple_ps( x, *(v4sf*)_ps_0 );
|
||
|
v4sf y_eq_0 = _mm_cmpeq_ps( y, *(v4sf*)_ps_0 );
|
||
|
v4sf x_lt_0 = _mm_cmplt_ps( x, *(v4sf*)_ps_0 );
|
||
|
v4sf y_lt_0 = _mm_cmplt_ps( y, *(v4sf*)_ps_0 );
|
||
|
|
||
|
v4sf zero_mask = _mm_and_ps( x_eq_0, y_eq_0 );
|
||
|
v4sf zero_mask_other_case = _mm_and_ps( y_eq_0, x_gt_0 );
|
||
|
zero_mask = _mm_or_ps( zero_mask, zero_mask_other_case );
|
||
|
|
||
|
v4sf pio2_mask = _mm_andnot_ps( y_eq_0, x_eq_0 );
|
||
|
v4sf pio2_mask_sign = _mm_and_ps( y_lt_0, *(v4sf*)_ps_sign_mask );
|
||
|
v4sf pio2_result = *(v4sf*)_ps_cephes_PIO2F;
|
||
|
pio2_result = _mm_xor_ps( pio2_result, pio2_mask_sign );
|
||
|
pio2_result = _mm_and_ps( pio2_mask, pio2_result );
|
||
|
|
||
|
v4sf pi_mask = _mm_and_ps( y_eq_0, x_le_0 );
|
||
|
v4sf pi = *(v4sf*)_ps_cephes_PIF;
|
||
|
v4sf pi_result = _mm_and_ps( pi_mask, pi );
|
||
|
|
||
|
v4sf swap_sign_mask_offset = _mm_and_ps( x_lt_0, y_lt_0 );
|
||
|
swap_sign_mask_offset = _mm_and_ps( swap_sign_mask_offset, *(v4sf*)_ps_sign_mask );
|
||
|
|
||
|
v4sf offset0 = _mm_setzero_ps();
|
||
|
v4sf offset1 = *(v4sf*)_ps_cephes_PIF;
|
||
|
offset1 = _mm_xor_ps( offset1, swap_sign_mask_offset );
|
||
|
|
||
|
v4sf offset = _mm_andnot_ps( x_lt_0, offset0 );
|
||
|
offset = _mm_and_ps( x_lt_0, offset1 );
|
||
|
|
||
|
v4sf arg = _mm_div_ps( y, x );
|
||
|
v4sf atan_result = atan_ps( arg );
|
||
|
atan_result = _mm_add_ps( atan_result, offset );
|
||
|
|
||
|
/* select between zero_result, pio2_result and atan_result */
|
||
|
|
||
|
v4sf result = _mm_andnot_ps( zero_mask, pio2_result );
|
||
|
atan_result = _mm_andnot_ps( pio2_mask, atan_result );
|
||
|
atan_result = _mm_andnot_ps( pio2_mask, atan_result);
|
||
|
result = _mm_or_ps( result, atan_result );
|
||
|
result = _mm_or_ps( result, pi_result );
|
||
|
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
/* for convenience of calling simd sqrt */
|
||
|
float sqrt_ps( float x )
|
||
|
{
|
||
|
v4sf sse_value = _mm_set_ps1( x );
|
||
|
sse_value = _mm_sqrt_ps( sse_value );
|
||
|
return _mm_cvtss_f32( sse_value );
|
||
|
}
|
||
|
float rsqrt_ps( float x )
|
||
|
{
|
||
|
v4sf sse_value = _mm_set_ps1( x );
|
||
|
sse_value = _mm_rsqrt_ps( sse_value );
|
||
|
return _mm_cvtss_f32( sse_value );
|
||
|
}
|
||
|
|
||
|
/* atan2 implementation using atan, used as a reference to implement atan2_ps */
|
||
|
float atan2_ref( float y, float x )
|
||
|
{
|
||
|
if( x == 0.0f ) {
|
||
|
if( y == 0.0f ) {
|
||
|
return 0.0f;
|
||
|
}
|
||
|
float result = _ps_cephes_PIO2F[0];
|
||
|
if( y < 0.0f ) {
|
||
|
result = -result;
|
||
|
}
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
if( y == 0.0f ) {
|
||
|
if( x > 0.0f ) {
|
||
|
return 0.0f;
|
||
|
}
|
||
|
return PIF;
|
||
|
}
|
||
|
|
||
|
float offset = 0;
|
||
|
if( x < 0.0f ) {
|
||
|
offset = PIF;
|
||
|
if( y < 0.0f ) {
|
||
|
offset = -offset;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
v4sf val = _mm_set_ps1( y / x );
|
||
|
val = atan_ps( val );
|
||
|
return offset + _mm_cvtss_f32( val );
|
||
|
}
|
||
|
|
||
|
#ifdef _MSC_VER
|
||
|
#pragma warning( pop )
|
||
|
#endif
|
||
|
|
||
|
#endif
|