I was hunting around for fast sin/cos functions and couldn’t find anything suitably packaged and ready to go. The most useful I found was in this discussion:

http://devmaster.net/forums/topic/4648-fast-and-accurate-sinecosine/

Looks good, but it needed a bit of work. I decided to implement it with SSE intrinsics and calculate sin and cos together to save even more processor time.

I ended up with this. It is accurate to within 0.2% over the range -2 PI to +2 PI and about 3 times faster than the standard sin/cos if you calculate both together.

/*
 * Lane-wise absolute value of four packed floats.
 *
 * Every bit except the top (sign) bit of each 32-bit lane is kept, so the
 * magnitude bits pass through unchanged and the sign bit is cleared.
 */
__m128 Abs(__m128 m)
{
    /* 0x7fffffff in every lane: all bits set except the sign bit. */
    const __m128 keep_magnitude = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));

    return _mm_and_ps(m, keep_magnitude);
}

/*
 * Approximate sine of four packed floats.
 *
 * Range handling: 2*PI is subtracted from every lane that is not less than
 * PI, then 2*PI is added to every lane that is not greater than -PI. Each
 * adjustment is applied at most once, so inputs more than one period outside
 * [-PI, PI] are not fully wrapped.
 *
 * Approximation (per lane, on the wrapped value x):
 *     y = x * (B + C * |x|)
 *     y = P * (y * |y| - y) + y
 * with B = 4/PI, C = -4/(PI*PI), P = 0.225.
 */
__m128 Sin(__m128 m_x)
{
    const float B = 4.f / PI;
    const float C = -4.f / (PI * PI);
    const float P = 0.225f;

    const __m128 two_pi = _mm_set1_ps(PI * 2);

    /* The compare yields an all-ones lane mask; AND-ing it with 2*PI gives
       2*PI in lanes that need shifting down and 0.0 in the rest. */
    __m128 shift = _mm_and_ps(_mm_cmpnlt_ps(m_x, _mm_set1_ps(PI)), two_pi);
    m_x = _mm_sub_ps(m_x, shift);

    /* Same trick in the other direction for lanes at or below -PI. */
    shift = _mm_and_ps(_mm_cmpngt_ps(m_x, _mm_set1_ps(-PI)), two_pi);
    m_x = _mm_add_ps(m_x, shift);

    /* First pass: y = (|x| * C + B) * x */
    const __m128 slope = _mm_add_ps(_mm_mul_ps(Abs(m_x), _mm_set1_ps(C)),
                                    _mm_set1_ps(B));
    const __m128 y = _mm_mul_ps(slope, m_x);

    /* Refinement: y + (|y| * y - y) * P */
    const __m128 delta = _mm_sub_ps(_mm_mul_ps(Abs(y), y), y);
    const __m128 correction = _mm_mul_ps(delta, _mm_set1_ps(P));

    return _mm_add_ps(correction, y);
}

/*
 * Scalar convenience wrapper around the packed Sin().
 *
 * x is copied into all four lanes, the packed routine is run, and the
 * lowest lane of the result is returned.
 */
float Sin(float x)
{
    return _mm_cvtss_f32(Sin(_mm_set1_ps(x)));
}

/*
 * Scalar approximate cosine, computed as the packed Sin() of (x + PI/2).
 *
 * The shifted angle is copied into all four lanes and the lowest lane of
 * the result is returned.
 */
float Cos(float x)
{
    const __m128 shifted = _mm_set1_ps(x + PI / 2.f);

    return _mm_cvtss_f32(Sin(shifted));
}

/*
 * Approximates the sine and cosine of x with a single packed Sin() call.
 *
 * Lane 0 carries x and lane 1 carries x + PI/2, so lane 1 of the result is
 * the cosine. The two upper lanes are set to zero and their results are
 * ignored.
 *
 * s  receives the sine (lane 0).
 * c  receives the cosine (lane 1).
 * Both pointers are written unconditionally.
 */
void SinCos(float x, float* s, float* c)
{
    const __m128 angles = _mm_setr_ps(x, x + PI / 2.f, 0.f, 0.f);
    const __m128 result = Sin(angles);

    *s = _mm_cvtss_f32(result);

    /* Move lane 1 down into lane 0 so it can be extracted as a scalar. */
    *c = _mm_cvtss_f32(_mm_shuffle_ps(result, result, _MM_SHUFFLE(1, 1, 1, 1)));
}

A long time ago, when computers were really slow,

programmers used precomputed arrays of trigonometric functions; that

would probably be the fastest way if you don’t need precision in the calculation.

By the way, nice blog; keep writing.