/*
  tanh_sse.h

  example for approximation of hyperbolic tangent from Lambert's (Gauss)
  continued fraction.
  http://en.wikipedia.org/wiki/Gauss's_continued_fraction
  http://maths.ashwyninnovations.com/lambert.pdf

  plain c and sse intrinsic versions for msvc and gcc

  compile with:
    gcc -msse -mfpmath=387 -W -Wall
    gcc -msse -mfpmath=387,sse -W -Wall
    cl /arch:SSE /W4

  notes:
    - the plain c version also ends up quite fast with gcc's fpu optimizations
    - _TANH_RANGE could be brought down, so that the "clamp" engages for lower
      values. error starts to propagate near 5,6 or -5,-6 in the 0.00001 range
    - if _TANH_CLAMP_INDIVIDUAL is not set the entire vector will be clamped
    - speed comparison against libm on amd althon xp with gcc 4.x for one and
      the same value in the vector:
        flags:          -O3 -msse -mfpmath=387
        iterations:     1E+6
        fast_tanh_sse:  37 ms
        libm:           172 ms
    - orders of 5/6 are used:
      tanh(x) = (21*x^5 + 1260*x^3 + 10395*x) /
                (x^6 + 210*x^4 + 4725*x^2 + 10395)
    - define _TANH_FAST_DIV for less accurate (~14% faster) division with
      ~(21 - 22) bits of mantissa.

  contact:
    lubomir i. ivanov, neolit123 [at] gmail
*/

#ifndef _INCLUDE_TANH_SSE_
#define _INCLUDE_TANH_SSE_

/* inline */
#ifdef _TANH_USE_INLINE
  #define _TANH_INLINE    inline
#else
  #define _TANH_INLINE
#endif

/* constants */
#define _TANH_RANGE   5.f
#define _TANH_CLAMP   1.f
#define _TANH_K0      21.f
#define _TANH_K1      210.f
#define _TANH_K2      1260.f
#define _TANH_K3      4725.f
#define _TANH_K4      10395.f

/* types */
#define v4sf          __m128

/* portable c version */
_TANH_INLINE
float
fast_tanh(const float x)
{
  const     float s = x*x;
  register  float d;

  if      (x < -_TANH_RANGE)
    return -_TANH_CLAMP;
  else if (x > _TANH_RANGE)
    return _TANH_CLAMP;

  d =     (s*(s*(s + _TANH_K1) + _TANH_K3) + _TANH_K4);
  return  (x*(s*(_TANH_K0*s + _TANH_K2) + _TANH_K4)) / d;
}

/* sse intrinsic version */
#ifdef __SSE__  // jklontz 1/17/2012
#include "xmmintrin.h"

_TANH_INLINE
v4sf
fast_tanh_sse(const v4sf x)
{
  v4sf    y, s, d;
  #ifdef _TANH_FAST_DIV
    v4sf  i_d;
  #endif

  /* check each value in the vector */
  #ifdef _TANH_CLAMP_INDIVIDUAL
    register short i;
    union
    {
      v4sf    v;
      float   f[4];
    } u;

    i = 0;
    u.v = x;
    while (i < 4)
    {
      if (u.f[i] < -_TANH_RANGE)
        u.f[i] = -_TANH_CLAMP;
      if (u.f[i] > _TANH_RANGE)
        u.f[i] = _TANH_CLAMP;
      i++;
    }
  /* clamp entire vector */
  #else
    if      (_mm_movemask_ps(_mm_cmplt_ps(x, _mm_set1_ps(-_TANH_RANGE))))
      return _mm_set1_ps(-_TANH_CLAMP);
    else if (_mm_movemask_ps(_mm_cmpgt_ps(x, _mm_set1_ps( _TANH_RANGE))))
      return _mm_set1_ps( _TANH_CLAMP);
  #endif

  s = _mm_mul_ps(x, x);

  /* denominator */
  d = _mm_add_ps(s, _mm_set1_ps(_TANH_K1));
  d = _mm_mul_ps(d, s);
  d = _mm_add_ps(d, _mm_set1_ps(_TANH_K3));
  d = _mm_mul_ps(d, s);
  d = _mm_add_ps(d, _mm_set1_ps(_TANH_K4));

  /* numerator */
  y = _mm_mul_ps(s, _mm_set1_ps(_TANH_K0));
  y = _mm_add_ps(y, _mm_set1_ps(_TANH_K2));
  y = _mm_mul_ps(y, s);
  y = _mm_add_ps(y, _mm_set1_ps(_TANH_K4));
  y = _mm_mul_ps(y, x);

  #ifdef _TANH_FAST_DIV
    /* reciprocal of the denominator with one newton iteration */
    i_d = _mm_rcp_ps(d);
    i_d = _mm_sub_ps( _mm_add_ps(i_d, i_d),
                      _mm_mul_ps(d, _mm_mul_ps(i_d, i_d)));
    return _mm_mul_ps(y, i_d);
  #else
    return _mm_div_ps(y, d);
  #endif
}
#endif

#endif /* _INCLUDE_TANH_SSE_ */