I was hunting around for fast sin/cos functions and couldn’t find anything suitably packaged and ready to go. The most useful I found was in this discussion:

http://devmaster.net/forums/topic/4648-fast-and-accurate-sinecosine/

Looks good, but it needed a bit of work. I decided to implement it with SSE intrinsics and calculate sin and cos together to save even more processor time.

I ended up with this. Accurate to within 0.2% over +2 PI to -2 PI and about 3 times faster than standard sin/cos if you calculate both together.

// Lane-wise absolute value of four packed floats.
__m128 Abs(__m128 m)
{
    // Every bit except the sign bit (bit 31) is set in each 32-bit lane, so
    // AND-ing with this mask keeps the magnitude and clears the sign.
    const __m128 magnitude_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
    return _mm_and_ps(m, magnitude_mask);
}

// Approximates sine for four packed floats at once.
//
// Scalar form of the computation:
//     y = B * x + C * x * abs(x);
//     y = P * (y * abs(y) - y) + y;
// with B = 4 / PI, C = -4 / (PI * PI) and P = 0.225.
//
// Range handling: a lane that is not less than PI has 2 * PI subtracted once;
// afterwards a lane that is not greater than -PI has 2 * PI added once. No
// further wrapping is done, so an input is only brought into [-PI, PI] when it
// is at most one such step away.
__m128 Sin(__m128 m_x)
{
    const float linear_coeff    = 4.f / PI;
    const float quadratic_coeff = -4.f / (PI * PI);
    const float refine_weight   = 0.225f;

    const __m128 pi      = _mm_set1_ps(PI);
    const __m128 neg_pi  = _mm_set1_ps(-PI);
    const __m128 two_pi  = _mm_set1_ps(PI * 2);
    const __m128 coeff_b = _mm_set1_ps(linear_coeff);
    const __m128 coeff_c = _mm_set1_ps(quadratic_coeff);
    const __m128 coeff_p = _mm_set1_ps(refine_weight);

    // The compare yields an all-ones mask in the selected lanes; AND-ing it
    // with 2 * PI gives 2 * PI in those lanes and 0 in the others.
    const __m128 wrap_down = _mm_and_ps(_mm_cmpnlt_ps(m_x, pi), two_pi);
    m_x = _mm_sub_ps(m_x, wrap_down);

    const __m128 wrap_up = _mm_and_ps(_mm_cmpngt_ps(m_x, neg_pi), two_pi);
    m_x = _mm_add_ps(m_x, wrap_up);

    // First pass: y = (|x| * C + B) * x
    const __m128 slope = _mm_add_ps(_mm_mul_ps(Abs(m_x), coeff_c), coeff_b);
    const __m128 rough = _mm_mul_ps(slope, m_x);

    // Refinement: y = (|y| * y - y) * P + y
    const __m128 y_abs_y    = _mm_mul_ps(Abs(rough), rough);
    const __m128 correction = _mm_mul_ps(_mm_sub_ps(y_abs_y, rough), coeff_p);
    return _mm_add_ps(correction, rough);
}

// Scalar wrapper: broadcasts x into all four lanes, evaluates the packed Sin,
// and returns the value from the lowest lane.
float Sin(float x)
{
    return _mm_cvtss_f32(Sin(_mm_set1_ps(x)));
}

// Scalar cosine computed as the packed Sin of (x + PI / 2); the shifted
// argument is broadcast to all four lanes and the lowest lane is returned.
float Cos(float x)
{
    const __m128 shifted = _mm_set1_ps(x + PI / 2.f);
    return _mm_cvtss_f32(Sin(shifted));
}

// Computes sine and cosine of x with a single call to the packed Sin.
//   x - input angle
//   s - receives the sine approximation
//   c - receives the cosine approximation (the packed Sin of x + PI / 2)
void SinCos(float x, float* s, float* c)
{
    // _mm_set_ps takes lanes from highest to lowest, so lane 0 holds x and
    // lane 1 holds x + PI / 2; the two upper lanes are unused zeros.
    const __m128 angles = _mm_set_ps(0.f, 0.f, x + PI / 2.f, x);
    const __m128 result = Sin(angles);

    // Copy lane 1 into lane 0 so it can be read out as a scalar.
    const __m128 lane1 = _mm_shuffle_ps(result, result, _MM_SHUFFLE(1, 1, 1, 1));

    *s = _mm_cvtss_f32(result);
    *c = _mm_cvtss_f32(lane1);
}