I was hunting around for fast sin/cos functions and couldn’t find anything suitably packaged and ready to go. The most useful I found was in this discussion:

http://devmaster.net/forums/topic/4648-fast-and-accurate-sinecosine/

Looks good, but it needed a bit of work. I decided to implement it with SSE intrinsics and calculate sin and cos together to save even more processor time.

I ended up with this. It is accurate to within 0.2% over the range -2 PI to +2 PI and about 3 times faster than the standard sin/cos if you calculate both together.


// Returns the absolute value of each of the four packed floats in m.
// Works purely on the bit pattern: every bit except the sign bit (bit 31)
// of each lane is kept, so -0.0f becomes +0.0f and NaN payloads survive.
__m128 Abs(__m128 m)
{
    const __m128 keepMagnitude = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
    return _mm_and_ps(m, keepMagnitude);
}
// Approximates the sine of each of the four packed floats in m_x.
//
// Scalar form of the computation:
//     y = B * x + C * x * abs(x);
//     y = P * (y * abs(y) - y) + y;
//
// Before evaluating the formula each lane is wrapped towards [-PI, PI]:
// 2*PI is subtracted once from lanes that are not below PI, then 2*PI is
// added once to lanes that are not above -PI. Only one correction per
// direction is applied, so inputs further out are not fully reduced.
__m128 Sin(__m128 m_x)
{
    // Coefficients of the parabola and the weight of the refinement step.
    const float kLinear = 4.f / PI;
    const float kQuadratic = -4.f / (PI * PI);
    const float kBlend = 0.225f;

    const __m128 pi = _mm_set1_ps(PI);
    const __m128 negPi = _mm_set1_ps(-PI);
    const __m128 twoPi = _mm_set1_ps(PI * 2);

    // The comparison yields an all-ones mask per matching lane; AND-ing it
    // with 2*PI gives either 2*PI or 0.0f, making the wrap branch-free.
    const __m128 tooHigh = _mm_cmpnlt_ps(m_x, pi);
    m_x = _mm_sub_ps(m_x, _mm_and_ps(tooHigh, twoPi));

    const __m128 tooLow = _mm_cmpngt_ps(m_x, negPi);
    m_x = _mm_add_ps(m_x, _mm_and_ps(tooLow, twoPi));

    // First pass: y = (abs(x) * C + B) * x
    const __m128 slope = _mm_add_ps(_mm_mul_ps(Abs(m_x), _mm_set1_ps(kQuadratic)),
                                    _mm_set1_ps(kLinear));
    const __m128 coarse = _mm_mul_ps(slope, m_x);

    // Refinement: y + (abs(y) * y - y) * P
    const __m128 delta = _mm_sub_ps(_mm_mul_ps(Abs(coarse), coarse), coarse);
    return _mm_add_ps(_mm_mul_ps(delta, _mm_set1_ps(kBlend)), coarse);
}
// Scalar convenience wrapper: broadcasts x to all four lanes, runs the
// packed Sin() and returns the result from the lowest lane.
float Sin(float x)
{
    return _mm_cvtss_f32(Sin(_mm_set1_ps(x)));
}
// Approximates cos(x) as the packed Sin() evaluated at x + PI/2.
// The shifted angle is broadcast to all four lanes and the lowest lane of
// the result is returned.
float Cos(float x)
{
    const __m128 shifted = _mm_set1_ps(x + PI / 2.f);
    return _mm_cvtss_f32(Sin(shifted));
}
// Computes approximations of sin(x) and cos(x) with a single packed Sin()
// call and stores them through s and c respectively.
//
// Lane 0 carries x (sine) and lane 1 carries x + PI/2 (cosine); the two
// upper lanes are filled with 0.f and their results are ignored.
// s and c are dereferenced unconditionally; *s is written before *c.
void SinCos(float x, float* s, float* c)
{
    const __m128 angles = _mm_set_ps(0.f, 0.f, x + PI / 2.f, x);
    const __m128 result = Sin(angles);

    // Bring lane 1 down to lane 0 so it can be read as a scalar.
    const __m128 cosLane = _mm_shuffle_ps(result, result, _MM_SHUFFLE(1, 1, 1, 1));

    *s = _mm_cvtss_f32(result);
    *c = _mm_cvtss_f32(cosLane);
}

About these ads