I was hunting around for fast sin/cos functions and couldn’t find anything suitably packaged and ready to go. The most useful I found was in this discussion:

http://devmaster.net/forums/topic/4648-fast-and-accurate-sinecosine/

Looks good, but it needed a bit of work. I decided to implement it with SSE intrinsics and calculate sin and cos together to save even more processor time.

I ended up with this. It is accurate to within 0.2% over the range -2 PI to +2 PI and about 3 times faster than the standard sin/cos if you calculate both together.

```
/**
 * Lane-wise absolute value of four packed floats.
 *
 * @param m  four packed single-precision values
 * @return   m with the sign bit of every lane cleared
 */
__m128 Abs(__m128 m)
{
    // -0.0f has only the sign bit set, so it doubles as the sign mask and
    // avoids pushing the out-of-int-range literal 0x80000000 through an int.
    const __m128 sign = _mm_set1_ps(-0.0f);
    return _mm_andnot_ps(sign, m);   // (~sign) & m
}

/**
 * Fast approximate sine of four packed floats (angles in radians).
 *
 * Each lane is first reduced to roughly [-PI, PI] and then evaluated with
 * a parabola plus one correction step:
 *
 *     y = B * x + C * x * abs(x);
 *     y = P * (y * abs(y) - y) + y;
 *
 * The reduction subtracts the nearest whole number of turns, so inputs are
 * not limited to a single wrap on either side. It relies on
 * _mm_cvtps_epi32, which means x / (2 * PI) must fit in a signed 32-bit
 * integer and the MXCSR rounding mode is assumed to be the default
 * round-to-nearest. NaN and out-of-range lanes produce unspecified values.
 *
 * @param m_x  four angles in radians
 * @return     four approximate sine values
 */
__m128 Sin(__m128 m_x)
{
    const float B = 4.f / PI;
    const float C = -4.f / (PI * PI);
    const float P = 0.225f;
    const float INV_2PI = 1.f / (PI * 2);

    const __m128 m_2pi    = _mm_set1_ps(PI * 2);
    const __m128 m_inv2pi = _mm_set1_ps(INV_2PI);
    const __m128 m_B      = _mm_set1_ps(B);
    const __m128 m_C      = _mm_set1_ps(C);
    const __m128 m_P      = _mm_set1_ps(P);

    // Range reduction: x -= 2*PI * round(x / (2*PI)).
    // Lanes strictly between -PI and PI round to zero turns and pass
    // through unchanged.
    const __m128 m_turns =
        _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_mul_ps(m_x, m_inv2pi)));
    m_x = _mm_sub_ps(m_x, _mm_mul_ps(m_turns, m_2pi));

    // First pass: y = x * (B + C * |x|)
    __m128 m1 = _mm_mul_ps(Abs(m_x), m_C);
    m1 = _mm_add_ps(m1, m_B);
    __m128 m_y = _mm_mul_ps(m1, m_x);

    // Correction: y = P * (y * |y| - y) + y
    m1 = _mm_mul_ps(Abs(m_y), m_y);
    m1 = _mm_sub_ps(m1, m_y);
    m1 = _mm_mul_ps(m1, m_P);
    return _mm_add_ps(m1, m_y);
}

/**
 * Fast approximate sine of a single angle.
 *
 * @param x  angle in radians
 * @return   approximate sin(x), taken from lane 0 of the packed Sin
 */
float Sin(float x)
{
    const __m128 m_sin = Sin(_mm_set1_ps(x));
    return _mm_cvtss_f32(m_sin);
}

/**
 * Fast approximate cosine of a single angle, computed as Sin(x + PI / 2).
 *
 * @param x  angle in radians
 * @return   approximate cos(x)
 */
float Cos(float x)
{
    const __m128 m_cos = Sin(_mm_set1_ps(x + PI / 2.f));
    return _mm_cvtss_f32(m_cos);
}

/**
 * Computes approximate sine and cosine of one angle with a single packed
 * Sin evaluation.
 *
 * @param x  angle in radians
 * @param s  receives approximate sin(x); must point to writable storage
 * @param c  receives approximate cos(x); must point to writable storage
 */
void SinCos(float x, float* s, float* c)
{
    // _mm_set_ps lists lanes from highest to lowest:
    // lane 0 = x (sine), lane 1 = x + PI/2 (cosine), lanes 2 and 3 unused.
    const __m128 m_both = _mm_set_ps(0.f, 0.f, x + PI / 2.f, x);
    const __m128 m_sincos = Sin(m_both);

    // Bring lane 1 down to lane 0 so it can be extracted as a scalar.
    const __m128 m_cos = _mm_shuffle_ps(m_sincos, m_sincos, _MM_SHUFFLE(0, 0, 0, 1));

    *s = _mm_cvtss_f32(m_sincos);
    *c = _mm_cvtss_f32(m_cos);
}
```