TnFOX: FXMaths.h Source File

00001 /********************************************************************************
00002 *                                                                               *
00003 *                                 Tools for maths                               *
00004 *                                                                               *
00005 *********************************************************************************
00006 *        Copyright (C) 2006-2009 by Niall Douglas.   All Rights Reserved.       *
00007 *       NOTE THAT I DO NOT PERMIT ANY OF MY CODE TO BE PROMOTED TO THE GPL      *
00008 *********************************************************************************
00009 * This code is free software; you can redistribute it and/or modify it under    *
00010 * the terms of the GNU Library General Public License v2.1 as published by the  *
00011 * Free Software Foundation EXCEPT that clause 3 does not apply ie; you may not  *
00012 * "upgrade" this code to the GPL without my prior written permission.           *
00013 * Please consult the file "License_Addendum2.txt" accompanying this file.       *
00014 *                                                                               *
00015 * This code is distributed in the hope that it will be useful,                  *
00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of                *
00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                          *
00018 *********************************************************************************
00019 * $Id:                                                                          *
00020 ********************************************************************************/
00021 
00022 #ifndef FXMATHS_H
00023 #define FXMATHS_H
00024 
00025 #include "FXProcess.h"
00026 #include "QThread.h"
00027 #include "FXStream.h"
00028 #include "qmemarray.h"
00029 #include <math.h>
00030 #include "FXVec2f.h"
00031 #include "FXVec2d.h"
00032 #include "FXVec3f.h"
00033 #include "FXVec3d.h"
00034 #include "FXMat3f.h"
00035 #include "FXMat3d.h"
00036 #include "FXVec4f.h"
00037 #include "FXVec4d.h"
00038 #include "FXMat4f.h"
00039 #include "FXMat4d.h"
00040 #if _M_IX86_FP>=3 || defined(__SSE3__)
00041 #include "pmmintrin.h"
00042 #endif
00043 #if _M_IX86_FP>=4 || defined(__SSE4__)
00044 #include "smmintrin.h"
00045 //#include "nmmintrin.h"    // We only use the SSE4.1 dot product so this is unneeded
00046 #endif
00047 
00048 namespace FX {
00049 
00054 namespace Maths {
00079 
00080     template<typename type> inline const type &min(const type &a, const type &b) { return (a<b) ? a : b; }
00082     template<typename type> inline const type &max(const type &a, const type &b) { return (a>b) ? a : b; }
00084     template<typename type> inline type sqrt(const type &v) { return ::sqrt(v); }
00085     template<> inline float sqrt<float>(const float &v) { return ::sqrtf(v); }
00087     template<typename type> inline type rcp(const type &v) { return 1/v; }
00089     template<typename type> inline type rsqrt(const type &v) { return 1/sqrt(v); }
00090 
00091     namespace Impl
00092     {
00093         template<typename type, bool minus> struct CalcInfinity;
00094         template<bool minus> struct CalcInfinity<float, minus>
00095         {
00096             union
00097             {
00098                 FXuint integer;
00099                 float floatingpoint;
00100             };
00101             CalcInfinity()
00102             {   // Directly poke in our value
00103                 integer=minus ? 0xff800000 : 0x7f800000;
00104             }
00105             operator const float &()
00106             {
00107                 return floatingpoint;
00108             }
00109         };
00110         template<bool minus> struct CalcInfinity<double, minus>
00111         {
00112             union
00113             {
00114                 FXulong integer;
00115                 double floatingpoint;
00116             };
00117             CalcInfinity()
00118             {   // Directly poke in our value
00119                 integer=minus ? 0xfff0000000000000ULL : 0x7ff0000000000000ULL;
00120             }
00121             operator const double &()
00122             {
00123                 return floatingpoint;
00124             }
00125         };
00126         template<typename type, bool minus> struct InfinityValue
00127         {
00128             static const CalcInfinity<type, minus> &value()
00129             {
00130                 static const CalcInfinity<type, minus> foo;
00131                 return foo;
00132             }
00133         };
00134 
00135         template<typename type> struct CalcNaN;
00136         template<> struct CalcNaN<float>
00137         {
00138             union
00139             {
00140                 FXuint integer;
00141                 float floatingpoint;
00142             };
00143             CalcNaN()
00144             {   // Directly poke in our value
00145                 integer=0x7fffffff;
00146             }
00147             operator const float &() const
00148             {
00149                 return floatingpoint;
00150             }
00151         };
00152         template<> struct CalcNaN<double>
00153         {
00154             union
00155             {
00156                 FXulong integer;
00157                 double floatingpoint;
00158             };
00159             CalcNaN()
00160             {   // Directly poke in our value
00161                 integer=0x7fffffffffffffffULL;
00162             }
00163             operator const double &() const
00164             {
00165                 return floatingpoint;
00166             }
00167         };
00168         template<typename type> struct NaNValue
00169         {
00170             static const CalcNaN<type> &value()
00171             {
00172                 static const CalcNaN<type> foo;
00173                 return foo;
00174             }
00175         };
00176         template<unsigned int size> struct TwoPowerMemAligner { };
00177 #if defined(_MSC_VER)
00178         // Don't assert on compilers not supporting memory alignment
00179         template<> struct FXMEMALIGNED(4)   TwoPowerMemAligner<4>   { TwoPowerMemAligner() { assert(!((FXuval)this & 3)); } };
00180         template<> struct FXMEMALIGNED(8)   TwoPowerMemAligner<8>   { TwoPowerMemAligner() { assert(!((FXuval)this & 7)); } };
00181         template<> struct FXMEMALIGNED(16)  TwoPowerMemAligner<16>  { TwoPowerMemAligner() { assert(!((FXuval)this & 15)); } };
00182         template<> struct FXMEMALIGNED(32)  TwoPowerMemAligner<32>  { TwoPowerMemAligner() { assert(!((FXuval)this & 31)); } };
00183         template<> struct FXMEMALIGNED(64)  TwoPowerMemAligner<64>  { TwoPowerMemAligner() { assert(!((FXuval)this & 63)); } };
00184         template<> struct FXMEMALIGNED(128) TwoPowerMemAligner<128> { TwoPowerMemAligner() { assert(!((FXuval)this & 127)); } };
00185         template<> struct FXMEMALIGNED(256) TwoPowerMemAligner<256> { TwoPowerMemAligner() { assert(!((FXuval)this & 255)); } };
00186         template<> struct FXMEMALIGNED(512) TwoPowerMemAligner<512> { TwoPowerMemAligner() { assert(!((FXuval)this & 511)); } };
00187         template<> struct FXMEMALIGNED(1024) TwoPowerMemAligner<1024> { TwoPowerMemAligner() { assert(!((FXuval)this & 1023)); } };
00188 #elif defined(__GNUC__)
00189 #if __GNUC__<4 || (__GNUC__==4 && __GNUC_MINOR__<3)
00190 #warning Before v4.3 GCC will not allow selective structure alignment - FX::Maths::Vector will not be aligned!
00191 #define FXVECTOR_BUGGYGCCALIGNMENTHACK FXMEMALIGNED(16)
00192 #else
00193         // GCC's alignment support on x86 and x64 is nearly useless
00194         // as it won't go above 16 bytes :(
00195         template<> struct FXMEMALIGNED(4)   TwoPowerMemAligner<4>   { TwoPowerMemAligner() { assert(!((FXuval)this & 3)); } };
00196         template<> struct FXMEMALIGNED(8)   TwoPowerMemAligner<8>   { TwoPowerMemAligner() { assert(!((FXuval)this & 7)); } };
00197         template<> struct FXMEMALIGNED(16)  TwoPowerMemAligner<16>  { TwoPowerMemAligner() { assert(!((FXuval)this & 15)); } };
00198         template<> struct FXMEMALIGNED(32)  TwoPowerMemAligner<32>  { TwoPowerMemAligner() { assert(!((FXuval)this & 15)); } };
00199         template<> struct FXMEMALIGNED(64)  TwoPowerMemAligner<64>  { TwoPowerMemAligner() { assert(!((FXuval)this & 15)); } };
00200         template<> struct FXMEMALIGNED(128) TwoPowerMemAligner<128> { TwoPowerMemAligner() { assert(!((FXuval)this & 15)); } };
00201         template<> struct FXMEMALIGNED(256) TwoPowerMemAligner<256> { TwoPowerMemAligner() { assert(!((FXuval)this & 15)); } };
00202         template<> struct FXMEMALIGNED(512) TwoPowerMemAligner<512> { TwoPowerMemAligner() { assert(!((FXuval)this & 15)); } };
00203         template<> struct FXMEMALIGNED(1024) TwoPowerMemAligner<1024> { TwoPowerMemAligner() { assert(!((FXuval)this & 15)); } };
00204 #endif
00205 #endif
00206 #ifndef FXVECTOR_BUGGYGCCALIGNMENTHACK
00207 #define FXVECTOR_BUGGYGCCALIGNMENTHACK
00208 #endif
00209         template<typename type, unsigned int A, class supertype, bool _isArithmetic, bool _isInteger, typename SIMDType=char> class VectorBase
00210         {
00211         protected:
00212             union
00213             {
00214                 type data[A];
00215                 SIMDType v;
00216             };
00217         public:
00219             typedef type TYPE;
00221             static const unsigned int DIMENSION=A;
00223             static const bool isArithmetic=_isArithmetic;
00225             static const bool isInteger=_isInteger;
00226             VectorBase() { }
00227             operator const supertype &() const { return static_cast<const supertype &>(*this); }
00228             VectorBase(const VectorBase &o) { for(FXuint n=0; n<A; n++) data[n]=o.data[n]; }
00229             VectorBase &operator=(const VectorBase &o) { for(FXuint n=0; n<A; n++) data[n]=o.data[n]; return *this; }
00230             explicit VectorBase(const type *d) { for(FXuint n=0; n<A; n++) data[n]=d ? d[n] : 0; }
00231             explicit VectorBase(const type &d) { for(FXuint n=0; n<A; n++) data[n]=d; }
00233             type operator[](unsigned int i) const { assert(i<A); return data[i]; }
00235             VectorBase &set(unsigned int i, const type &d) { assert(i<A); data[i]=d; return *this; }
00236         };
00237 #define VECTOR1OP(op)  VectorBase operator op () const                    { VectorBase ret; for(FXuint n=0; n<A; n++) ret.data[n]=op Base::data[n]; return ret; }
00238 #define VECTOR2OP(op)  VectorBase operator op (const VectorBase &o) const { VectorBase ret; for(FXuint n=0; n<A; n++) ret.data[n]=Base::data[n] op o.data[n]; return ret; }
00239 #define VECTORP2OP(op) VectorBase &operator op (const VectorBase &o)      {                 for(FXuint n=0; n<A; n++) Base::data[n] op o.data[n]; return *this; }
00240 #define VECTORFUNC(op) friend VectorBase op(const VectorBase &o)          { VectorBase ret; for(FXuint n=0; n<A; n++) ret.data[n]= Maths::op (o.data[n]); return ret; }
00241 #define VECTOR2FUNC(op) friend VectorBase op(const VectorBase &a, const VectorBase &b) { VectorBase ret; for(FXuint n=0; n<A; n++) ret.data[n]= op (a.data[n], b.data[n]); return ret; }
00242         template<typename type, unsigned int A, class supertype, bool isInteger, typename SIMDType> class VectorBase<type, A, supertype, true, isInteger, SIMDType> : private TwoPowerMemAligner<sizeof(type)*A>, public VectorBase<type, A, supertype, false, false, SIMDType>
00243         {
00244             typedef VectorBase<type, A, supertype, false, false, SIMDType> Base;
00245         public:
00246             static const bool isArithmetic=true;
00247             VectorBase() { }
00248             explicit VectorBase(const type *d) : Base(d) { }
00249             explicit VectorBase(const type &d) : Base(d) { }
00250             // Arithmetic (int & fp)
00251             VECTOR1OP(+) VECTOR1OP(-) VECTOR1OP(!)
00252              VECTOR2OP(+)    VECTOR2OP(-)   VECTOR2OP(*)   VECTOR2OP(/)
00253             VECTORP2OP(+=) VECTORP2OP(-=) VECTORP2OP(*=) VECTORP2OP(/=)
00254             // Logical (int & fp)
00255             VECTOR2OP( == ) VECTOR2OP( != ) VECTOR2OP( < )  VECTOR2OP( <= ) VECTOR2OP( > )  VECTOR2OP( >= )
00256             VECTOR2OP( && ) VECTOR2OP( || )
00257 
00258             VECTORFUNC(sqrt) VECTORFUNC(rcp) VECTORFUNC(rsqrt)
00259             VECTOR2FUNC(min) VECTOR2FUNC(max)
00261             friend bool isZero(const VectorBase &a)
00262             {
00263                 bool iszero=true;
00264                 for(FXuint n=0; n<A && iszero; n++)
00265                     iszero=iszero && !a.data[n];
00266                 return iszero;
00267             }
00269             friend type sum(const VectorBase &a)
00270             {
00271                 type ret=0;
00272                 for(FXuint n=0; n<A; n++)
00273                     ret+=a.data[n];
00274                 return ret;
00275             }
00277             friend type dot(const VectorBase &a, const VectorBase &b)
00278             {
00279                 type ret=0;
00280                 for(FXuint n=0; n<A; n++)
00281                     ret+=a.data[n]*b.data[n];
00282                 return ret;
00283             }
00284         };
00285         template<typename type, unsigned int A, class supertype, typename SIMDType> class VectorBase<type, A, supertype, true, true, SIMDType> : public VectorBase<type, A, supertype, true, false, SIMDType>
00286         {
00287             typedef VectorBase<type, A, supertype, true, false, SIMDType> Base;
00288         public:
00289             static const bool isArithmetic=true;
00290             static const bool isInteger=true;
00291             VectorBase() { }
00292             explicit VectorBase(const type *d) : Base(d) { }
00293             explicit VectorBase(const type &d) : Base(d) { }
00294 
00295             // Arithmetic (int only)
00296              VECTOR2OP(%)   VECTOR2OP(&)   VECTOR2OP(|)   VECTOR2OP(^)   VECTOR2OP(<<)   VECTOR2OP(>>)  VECTOR1OP(~)
00297             VECTORP2OP(%=) VECTORP2OP(&=) VECTORP2OP(|=) VECTORP2OP(^=) VECTORP2OP(<<=) VECTORP2OP(>>=)
00298 
00300             friend VectorBase lshiftvec(const VectorBase &a, int shift)
00301             {
00302                 VectorBase ret;
00303                 FXuint offset=(shift/8)/sizeof(type);
00304                 shift-=(offset*sizeof(type))*8;
00305                 for(FXint n=A-1; n>=0; n--)
00306                     ret.data[n]=(a.data[n-offset]<<shift)|((0==n) ? 0 : (a.data[n-offset-1]>>(8*sizeof(type)-shift)));
00307                 return ret;
00308             }
00310             friend VectorBase rshiftvec(const VectorBase &a, int shift)
00311             {
00312                 VectorBase ret;
00313                 FXuint offset=(shift/8)/sizeof(type);
00314                 shift-=(offset*sizeof(type))*8;
00315                 for(FXuint n=0; n<A; n++)
00316                     ret.data[n]=(a.data[n+offset]>>shift)|((A==n+offset+1) ? 0 : (a.data[n+offset+1]<<(8*sizeof(type)-shift)));
00317                 return ret;
00318             }
00319         };
00320 #undef VECTOR1OP
00321 #undef VECTOR2OP
00322 #undef VECTORP2OP
00323 #undef VECTORFUNC
00324 #undef VECTOR2FUNC
00325 
00326         // Used to provide implicit conversions between these and FOX classes
00327         template<class base, typename type, class equivtype> class EquivType : public base
00328         {
00329         public:
00330             EquivType() { }
00331             template<typename F> explicit EquivType(const F &d) : base(d) { }
00332             EquivType &operator=(const equivtype &o) { return *this=*((const EquivType *)&o); }
00333             operator equivtype &() { return *((equivtype *)this); }
00334             operator const equivtype &() const { return *((const equivtype *)this); }
00335         };
00336         template<class base, typename type> class EquivType<base, type, void> : public base
00337         {
00338         public:
00339             EquivType() { }
00340             template<typename F> explicit EquivType(const F &d) : base(d) { }
00341         };
00342 
00343         // Used to conglomerate multiple SIMD vector ops
00344         template<class vectortype, unsigned int N, class supertype, bool _isArithmetic=vectortype::isArithmetic, bool _isInteger=vectortype::isInteger> class VectorOfVectors
00345         {
00346         protected:
00347             vectortype vectors[N];
00348         public:
00350             typedef typename vectortype::TYPE TYPE;
00352             static const unsigned int DIMENSION=vectortype::DIMENSION*N;
00354             static const bool isArithmetic=_isArithmetic;
00356             static const bool isInteger=_isInteger;
00357             VectorOfVectors() { }
00358             operator const supertype &() const { return static_cast<const supertype &>(*this); }
00359             VectorOfVectors(const VectorOfVectors &o)
00360             {
00361                 for(FXuint n=0; n<N; n++)
00362                     vectors[n]=o.vectors[n];
00363             }
00364             VectorOfVectors &operator=(const VectorOfVectors &o)
00365             {
00366                 for(FXuint n=0; n<N; n++)
00367                     vectors[n]=o.vectors[n];
00368                 return *this;
00369             }
00370             explicit VectorOfVectors(const TYPE *d)
00371             {
00372                 for(FXuint n=0; n<N; n++)
00373                     vectors[n]=vectortype(FXOFFSETPTR(d, n*sizeof(vectortype)));
00374             }
00375             explicit VectorOfVectors(const TYPE &d)
00376             {
00377                 for(FXuint n=0; n<N; n++)
00378                     vectors[n]=vectortype(d);
00379             }
00381             TYPE operator[](unsigned int i) const { assert(i<DIMENSION); return vectors[i/vectortype::DIMENSION][i%vectortype::DIMENSION]; }
00383             VectorOfVectors &set(unsigned int i, const TYPE &d) { assert(i<DIMENSION); vectors[i/vectortype::DIMENSION].set(i%vectortype::DIMENSION, d); return *this; }
00384         };
00385 #define VECTOR1OP(op)  VectorOfVectors operator op () const                         { VectorOfVectors ret; for(FXuint n=0; n<N; n++) ret.vectors[n]=op Base::vectors[n]; return ret; }
00386 #define VECTOR2OP(op)  VectorOfVectors operator op (const VectorOfVectors &o) const { VectorOfVectors ret; for(FXuint n=0; n<N; n++) ret.vectors[n]=Base::vectors[n] op o.vectors[n]; return ret; }
00387 #define VECTORP2OP(op) VectorOfVectors &operator op (const VectorOfVectors &o)      {                      for(FXuint n=0; n<N; n++) Base::vectors[n] op o.vectors[n]; return *this; }
00388 #define VECTORFUNC(op) friend VectorOfVectors op(const VectorOfVectors &o)          { VectorOfVectors ret; for(FXuint n=0; n<N; n++) ret.vectors[n]= op (o.vectors[n]); return ret; }
00389 #define VECTOR2FUNC(op) friend VectorOfVectors op(const VectorOfVectors &a, const VectorOfVectors &b) { VectorOfVectors ret; for(FXuint n=0; n<N; n++) ret.vectors[n]= op (a.vectors[n], b.vectors[n]); return ret; }
00390         template<class vectortype, unsigned int N, class supertype, bool isInteger> class VectorOfVectors<vectortype, N, supertype, true, isInteger> : public VectorOfVectors<vectortype, N, supertype, false, false>
00391         {
00392         protected:
00393             typedef VectorOfVectors<vectortype, N, supertype, false, false> Base;
00394         public:
00395             typedef typename vectortype::TYPE TYPE;
00396             VectorOfVectors() { }
00397             explicit VectorOfVectors(const TYPE *d) : Base(d) { }
00398             explicit VectorOfVectors(const TYPE &d) : Base(d) { }
00399             // Arithmetic (int & fp)
00400             VECTOR1OP(+) VECTOR1OP(-) VECTOR1OP(!)
00401              VECTOR2OP(+)    VECTOR2OP(-)   VECTOR2OP(*)   VECTOR2OP(/)
00402             VECTORP2OP(+=) VECTORP2OP(-=) VECTORP2OP(*=) VECTORP2OP(/=)
00403             // Logical (int & fp)
00404             VECTOR2OP( == ) VECTOR2OP( != ) VECTOR2OP( < )  VECTOR2OP( <= ) VECTOR2OP( > )  VECTOR2OP( >= )
00405             VECTOR2OP( && ) VECTOR2OP( || )
00406 
00407             VECTORFUNC(sqrt) VECTORFUNC(rcp) VECTORFUNC(rsqrt)
00408             VECTOR2FUNC(min) VECTOR2FUNC(max)
00410             friend bool isZero(const VectorOfVectors &a)
00411             {
00412                 bool iszero=true;
00413                 for(FXuint n=0; n<N && iszero; n++)
00414                     iszero&=isZero(a.vectors[n]);
00415                 return iszero;
00416             }
00418             friend TYPE sum(const VectorOfVectors &a)
00419             {
00420                 TYPE ret=0;
00421                 for(FXuint n=0; n<N; n++)
00422                     ret+=sum(a.vectors[n]);
00423                 return ret;
00424             }
00426             friend TYPE dot(const VectorOfVectors &a, const VectorOfVectors &b)
00427             {
00428                 TYPE ret=0;
00429                 for(FXuint n=0; n<N; n++)
00430                     ret+=dot(a.vectors[n], b.vectors[n]);
00431                 return ret;
00432             }
00433         };
00434         template<class vectortype, unsigned int N, class supertype> class VectorOfVectors<vectortype, N, supertype, true, true> : public VectorOfVectors<vectortype, N, supertype, true, false>
00435         {
00436         protected:
00437             typedef VectorOfVectors<vectortype, N, supertype, true, false> Base;
00438         public:
00439             typedef typename vectortype::TYPE TYPE;
00440             VectorOfVectors() { }
00441             explicit VectorOfVectors(const TYPE *d) : Base(d) { }
00442             explicit VectorOfVectors(const TYPE &d) : Base(d) { }
00443 
00444             // Arithmetic (int only)
00445              VECTOR2OP(%)   VECTOR2OP(&)   VECTOR2OP(|)   VECTOR2OP(^)   VECTOR2OP(<<)   VECTOR2OP(>>)  VECTOR1OP(~)
00446             VECTORP2OP(%=) VECTORP2OP(&=) VECTORP2OP(|=) VECTORP2OP(^=) VECTORP2OP(<<=) VECTORP2OP(>>=)
00447 
00448             VECTOR2FUNC(lshiftvec) VECTOR2FUNC(rshiftvec)
00449         };
00450 #undef VECTOR1OP
00451 #undef VECTOR2OP
00452 #undef VECTORP2OP
00453 #undef VECTORFUNC
00454 #undef VECTOR2FUNC
00455     }
00459     template<typename type, bool minus=false> struct InfinityValue : public Impl::InfinityValue<type, minus> { };
00463     template<typename type> struct NaNValue : public Impl::NaNValue<type> { };
00465     template<typename type> inline bool isNaN(type val) throw();
00466     template<> inline bool isNaN<float>(float val) throw() { union { float f; FXuint i; } v; v.f=val; return v.i==NaNValue<float>::value().integer; }
00467     template<> inline bool isNaN<double>(double val) throw() { union { double f; FXulong i; } v; v.f=val; return v.i==NaNValue<double>::value().integer; }
00468 
00549     template<typename type, unsigned int A> class Vector : public Impl::EquivType<Impl::VectorBase<type, A, Vector<type, A>, Generic::Traits<type>::isArithmetical, Generic::Traits<type>::isInt>, type, void>
00550     {
00551         typedef Impl::EquivType<Impl::VectorBase<type, A, Vector<type, A>, Generic::Traits<type>::isArithmetical, Generic::Traits<type>::isInt>, type, void> Base;
00552     public:
00553         Vector() { }
00555         explicit Vector(const type *d) : Base(d) { }
00557         explicit Vector(const type &d) : Base(d) { }
00558     };
00559     // Map to FOX types
00560 #define DEFINEVECTOREQUIV(type, A, equivtype) \
00561     template<> class Vector<type, A> : public Impl::EquivType<Impl::VectorBase<type, A, Vector<type, A>, Generic::Traits<type>::isArithmetical, Generic::Traits<type>::isInt>, type, void> \
00562     { \
00563         typedef Impl::EquivType<Impl::VectorBase<type, A, Vector<type, A>, Generic::Traits<type>::isArithmetical, Generic::Traits<type>::isInt>, type, void> Base; \
00564     public: \
00565         Vector() { } \
00566         explicit Vector(const type *d) : Base(d) { } \
00567         explicit Vector(const type &d) : Base(d) { } \
00568     };
00569     DEFINEVECTOREQUIV(FXuchar,  8, FXulong)
00570     DEFINEVECTOREQUIV(FXushort, 4, FXulong)
00571     DEFINEVECTOREQUIV(FXuint,   2, FXulong)
00572     DEFINEVECTOREQUIV(FXuchar,  4, FXuint)
00573     DEFINEVECTOREQUIV(FXushort, 2, FXuint)
00574     DEFINEVECTOREQUIV(FXuchar,  2, FXushort)
00575     DEFINEVECTOREQUIV(float, 2, FXVec2f)
00576     DEFINEVECTOREQUIV(float, 3, FXVec3f)
00577     typedef Vector<float, 2> Vector2f;
00578     typedef Vector<float, 3> Vector3f;
00579     typedef Vector<float, 4> Vector4f;
00580     DEFINEVECTOREQUIV(double, 3, FXVec3d)
00581     typedef Vector<double, 2> Vector2d;
00582     typedef Vector<double, 3> Vector3d;
00583     typedef Vector<double, 4> Vector4d;
00584 
00585     template<typename type, unsigned int A> FXStream &operator<<(FXStream &s, const Vector<type, A> &v)
00586     {
00587         for(FXuint n=0; n<A; n++) s << v[n];
00588         return s;
00589     }
00590     template<typename type, unsigned int A> FXStream &operator>>(FXStream &s, Vector<type, A> &v)
00591     {
00592         type t;
00593         for(FXuint n=0; n<A; n++) { s >> t; v.set(n, t); }
00594         return s;
00595     }
00597 #define FXVECTOROFVECTORS(VECTORTYPE, ELEMENTS, equivtype) template<> class Vector<VECTORTYPE::TYPE, ELEMENTS> : public Impl::EquivType<Impl::VectorOfVectors<VECTORTYPE, ELEMENTS/VECTORTYPE::DIMENSION, Vector<VECTORTYPE::TYPE, ELEMENTS> >, VECTORTYPE::TYPE, equivtype> \
00598     { \
00599         typedef Impl::EquivType<Impl::VectorOfVectors<VECTORTYPE, ELEMENTS/VECTORTYPE::DIMENSION, Vector<VECTORTYPE::TYPE, ELEMENTS> >, VECTORTYPE::TYPE, equivtype> Base; \
00600     public: \
00601         Vector() { } \
00602         explicit Vector(const VECTORTYPE::TYPE *d) : Base(d) { } \
00603         explicit Vector(const VECTORTYPE::TYPE &d) : Base(d) { } \
00604     };
00605 
00606 
00607 #if 1   // Use to disable SIMD optimised versions
00608 #if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)))
00609     // The x86 and x64 SSE specialisations
00610 #if defined(_M_X64) || defined(__x86_64__) || (defined(_M_IX86) && _M_IX86_FP>=1) || (defined(__i386__) && defined(__SSE__))
00611 #define FXVECTOR_SPECIALISEDFLOAT4
00612     template<> class FXVECTOR_BUGGYGCCALIGNMENTHACK Vector<float, 4> : private Impl::TwoPowerMemAligner<16>
00613     {
00614     public:
00615         typedef float TYPE;
00616         static const unsigned int DIMENSION=4;
00617         static const bool isArithmetic=true;
00618         static const bool isInteger=false;
00619     private:
00620         typedef __m128 SSETYPE;
00621         SSETYPE v;
00622         static TYPE int_extract(SSETYPE v, unsigned int i)
00623         {
00624             SSETYPE t;
00625             switch(i)
00626             {
00627             case 0:
00628                 t=v; /*t=_mm_shuffle_ps(v, v, 0);*/ break;
00629             case 1:
00630                 t=_mm_shuffle_ps(v, v, 1); break;
00631             case 2:
00632                 t=_mm_shuffle_ps(v, v, 2); break;
00633             case 3:
00634                 t=_mm_shuffle_ps(v, v, 3); break;
00635             }
00636             TYPE ret;
00637             _mm_store_ss(&ret, t);
00638             return ret;
00639         }
00640         static SSETYPE int_ones()
00641         {
00642             static SSETYPE v=_mm_set1_ps(1);
00643             return v;
00644         }
00645         static SSETYPE int_not()
00646         {
00647             static SSETYPE v=_mm_cmpeq_ps(int_ones(), int_ones());
00648             return v;
00649         }
00650         static SSETYPE int_notones()
00651         {
00652             static SSETYPE v=_mm_xor_ps(int_ones(), int_not());
00653             return v;
00654         }
00655     public:
00656         Vector() { }
00657         Vector(const SSETYPE &_v) { v=_v; }
00658         Vector(const Vector &o) { v=o.v; }
00659         Vector &operator=(const Vector &o) { v=o.v; return *this; }
00660 
00661         Vector(const FXVec4f &o) { v=_mm_loadu_ps((float*)&o); }
00662         Vector &operator=(const FXVec4f &o) { v=_mm_loadu_ps((float*)&o); return *this; }
00663         operator FXVec4f &() { return *((FXVec4f *)this); }
00664         operator const FXVec4f &() const { return *((const FXVec4f *)this); }
00665 
00666         explicit Vector(const TYPE *d)
00667         {
00668             if(!d) v=_mm_setzero_ps();
00669             else v=_mm_loadu_ps(d);
00670         }
00671         explicit Vector(const TYPE &d)
00672         {
00673             v=_mm_set1_ps(d);
00674         }
00675         TYPE operator[](unsigned int i) const
00676         {
00677             return int_extract(v, i);
00678         }
00679         Vector &set(unsigned int i, const TYPE &d)
00680         {
00681             union { TYPE f[DIMENSION]; SSETYPE v; } t;
00682             assert(i<DIMENSION);
00683             t.v=v;
00684             t.f[i]=d;
00685             v=t.v;
00686             return *this;
00687         }
00688         Vector operator +() const { return *this; }
00689         Vector operator -() const { Vector ret((TYPE *)0); ret-=*this; return ret; }
00690         Vector operator !() const { return _mm_and_ps(int_ones(), _mm_cmpneq_ps(_mm_setzero_ps(), v)); }
00691 #define VECTOR2OP(op, sseop)  Vector operator op (const Vector &o) const { return _mm_ ##sseop## _ps(v, o.v); }
00692 #define VECTORP2OP(op, sseop) Vector &operator op (const Vector &o)      { v=_mm_ ##sseop## _ps(v, o.v); return *this; }
00693 #define VECTORFUNC(op) friend Vector op(const Vector &o)                 { return _mm_ ##op## _ps(o.v); }
00694 #define VECTOR2FUNC(op) friend Vector op(const Vector &a, const Vector &b) { return _mm_ ##op## _ps(a.v, b.v); }
00695          VECTOR2OP(+,  add)  VECTOR2OP(-,  sub)  VECTOR2OP(*,  mul)  VECTOR2OP(/,  div)
00696         VECTORP2OP(+=, add) VECTORP2OP(-=, sub) VECTORP2OP(*=, mul) VECTORP2OP(/=, div)
00697 
00698 #undef VECTOR2OP
00699 #define VECTOR2OP(op, sseop)  Vector operator op (const Vector &o) const { return _mm_and_ps(int_ones(), _mm_cmp ##sseop## _ps(v, o.v)); }
00700         VECTOR2OP( == , eq) VECTOR2OP( != , neq) VECTOR2OP( < , lt)  VECTOR2OP( <= , le) VECTOR2OP( > , gt)  VECTOR2OP( >= , ge)
00701 #undef VECTOR2OP
00702 #define VECTOR2OP(op, sseop)  Vector operator op (const Vector &o) const { return _mm_and_ps(int_ones(), _mm_cmpneq_ps(_mm_setzero_ps(), _mm_ ##sseop## _ps(v, o.v))); }
00703         VECTOR2OP( && , and) VECTOR2OP( || , or)
00704 
00705         VECTORFUNC(sqrt) VECTORFUNC(rcp) VECTORFUNC(rsqrt)
00706         VECTOR2FUNC(min) VECTOR2FUNC(max)
00707 #undef VECTOR1OP
00708 #undef VECTOR2OP
00709 #undef VECTORP2OP
00710 #undef VECTORFUNC
00711 #undef VECTOR2FUNC
00712         friend bool isZero(const Vector &a)
00713         {   // We can use a cunning trick here
00714             return !_mm_movemask_ps(a.v)&&!_mm_movemask_ps(_mm_sub_ps(_mm_setzero_ps(), a.v));
00715         }
00716         friend TYPE sum(const Vector &a)
00717         {
00718 #if _M_IX86_FP>=3 || defined(__SSE3__)
00719             SSETYPE v=_mm_hadd_ps(a.v, a.v);
00720             return int_extract(_mm_hadd_ps(v, v), 0);
00721 #elif 1
00722             // This is actually the same speed as two hadd's on my machine, but
00723             // hadd is supposed to be quicker on newer processors
00724             SSETYPE tempA = _mm_shuffle_ps(a.v,a.v, _MM_SHUFFLE(2,0,2,0));
00725             SSETYPE tempB = _mm_shuffle_ps(a.v,a.v, _MM_SHUFFLE(3,1,3,1));
00726             SSETYPE tempC = _mm_add_ps(tempB, tempA);
00727             tempA = _mm_shuffle_ps(tempC,tempC, _MM_SHUFFLE(2,0,2,0));
00728             tempB = _mm_shuffle_ps(tempC,tempC, _MM_SHUFFLE(3,1,3,1));
00729             tempC = _mm_add_ss(tempB, tempA);
00730             return int_extract(tempC, 0);
00731 #else
00732             FXMEMALIGNED(16) TYPE f[DIMENSION];
00733             _mm_store_ps(f, a.v);
00734             return f[0]+f[1]+f[2]+f[3];
00735 #endif
00736         }
00737         friend TYPE dot(const Vector &a, const Vector &b)
00738         {
00739 #if _M_IX86_FP>=4 || defined(__SSE4__)
00740             // Use the SSE4.1 instruction
00741             return int_extract(_mm_dp_ps(a.v, b.v, 0xf1), 0);
00742 #else
00743             // SSE implementation
00744             return sum(a*b);
00745 #endif
00746         }
00747         friend inline SSETYPE &GetSSEVal(Vector &a) { return a.v; }
00748         friend inline const SSETYPE &GetSSEVal(const Vector &a) { return a.v; }
00749     };
00750     typedef Vector<float, 4> int_SSEOptimised_float4;   // Needed as macros don't understand template types :(
00751     FXVECTOROFVECTORS(int_SSEOptimised_float4, 8, void);
00752     FXVECTOROFVECTORS(int_SSEOptimised_float4, 16, void);
00753     FXVECTOROFVECTORS(int_SSEOptimised_float4, 32, void);
00754     FXVECTOROFVECTORS(int_SSEOptimised_float4, 64, void);
00755     FXVECTOROFVECTORS(int_SSEOptimised_float4, 128, void);
00756     FXVECTOROFVECTORS(int_SSEOptimised_float4, 256, void);
00757 #endif
00758 #if defined(_M_X64) || defined(__x86_64__) || (defined(_M_IX86) && _M_IX86_FP>=2) || (defined(__i386__) && defined(__SSE2__))
00759 #define FXVECTOR_SPECIALISEDDOUBLE2
00760     template<> class FXVECTOR_BUGGYGCCALIGNMENTHACK Vector<double, 2> : private Impl::TwoPowerMemAligner<16>
00761     {
00762     public:
00763         typedef double TYPE;
00764         static const unsigned int DIMENSION=2;
00765         static const bool isArithmetic=true;
00766         static const bool isInteger=false;
00767     private:
00768         typedef __m128d SSETYPE;
00769         SSETYPE v;
00770         static TYPE int_extract(SSETYPE v, unsigned int i)
00771         {
00772             SSETYPE t;
00773             switch(i)
00774             {
00775             case 0:
00776                 t=v; /*_mm_shuffle_pd(v, v, 0);*/ break;
00777             case 1:
00778                 t=_mm_shuffle_pd(v, v, 1); break;
00779             }
00780             TYPE ret;
00781             _mm_store_sd(&ret, t);
00782             return ret;
00783         }
00784         static SSETYPE int_ones()
00785         {
00786             static SSETYPE v=_mm_set1_pd(1);
00787             return v;
00788         }
00789         static SSETYPE int_not()
00790         {
00791             static SSETYPE v=_mm_cmpeq_pd(int_ones(), int_ones());
00792             return v;
00793         }
00794         static SSETYPE int_notones()
00795         {
00796             static SSETYPE v=_mm_xor_pd(int_ones(), int_not());
00797             return v;
00798         }
00799     public:
00800         Vector() { }
00801         Vector(const SSETYPE &_v) { v=_v; }
00802         Vector(const Vector &o) { v=o.v; }
00803         Vector &operator=(const Vector &o) { v=o.v; return *this; }
00804 
00805         Vector(const FXVec2d &o) { v=_mm_loadu_pd((double*)&o); }
00806         Vector &operator=(const FXVec2d &o) { v=_mm_loadu_pd((double*)&o); return *this; }
00807         operator FXVec2d &() { return *((FXVec2d *)this); }
00808         operator const FXVec2d &() const { return *((const FXVec2d *)this); }
00809 
00810         explicit Vector(const TYPE *d)
00811         {
00812             if(!d) v=_mm_setzero_pd();
00813             else v=_mm_loadu_pd(d);
00814         }
00815         explicit Vector(const TYPE &d)
00816         {
00817             v=_mm_set1_pd(d);
00818         }
00819         TYPE operator[](unsigned int i) const
00820         {
00821             return int_extract(v, i);
00822         }
00823         Vector &set(unsigned int i, const TYPE &d)
00824         {
00825             union { TYPE f[DIMENSION]; SSETYPE v; } t;
00826             assert(i<DIMENSION);
00827             t.v=v;
00828             t.f[i]=d;
00829             v=t.v;
00830             return *this;
00831         }
00832         Vector operator +() const { return *this; }
00833         Vector operator -() const { Vector ret((TYPE *)0); ret-=*this; return ret; }
00834         Vector operator !() const { return _mm_and_pd(int_ones(), _mm_cmpneq_pd(_mm_setzero_pd(), v)); }
00835 #define VECTOR2OP(op, sseop)  Vector operator op (const Vector &o) const { return _mm_ ##sseop## _pd(v, o.v); }
00836 #define VECTORP2OP(op, sseop) Vector &operator op (const Vector &o)      { v=_mm_ ##sseop## _pd(v, o.v); return *this; }
00837 #define VECTORFUNC(op) friend Vector op(const Vector &o)                 { return _mm_ ##op## _pd(o.v); }
00838 #define VECTOR2FUNC(op) friend Vector op(const Vector &a, const Vector &b) { return _mm_ ##op## _pd(a.v, b.v); }
00839          VECTOR2OP(+,  add)  VECTOR2OP(-,  sub)  VECTOR2OP(*,  mul)  VECTOR2OP(/,  div)
00840         VECTORP2OP(+=, add) VECTORP2OP(-=, sub) VECTORP2OP(*=, mul) VECTORP2OP(/=, div)
00841 
00842 #undef VECTOR2OP
00843 #define VECTOR2OP(op, sseop)  Vector operator op (const Vector &o) const { return _mm_and_pd(int_ones(), _mm_cmp ##sseop## _pd(v, o.v)); }
00844         VECTOR2OP( == , eq) VECTOR2OP( != , neq) VECTOR2OP( < , lt)  VECTOR2OP( <= , le) VECTOR2OP( > , gt)  VECTOR2OP( >= , ge)
00845 #undef VECTOR2OP
00846 #define VECTOR2OP(op, sseop)  Vector operator op (const Vector &o) const { return _mm_and_pd(int_ones(), _mm_cmpneq_pd(_mm_setzero_pd(), _mm_ ##sseop## _pd(v, o.v))); }
00847         VECTOR2OP( && , and) VECTOR2OP( || , or)
00848 
00849         VECTORFUNC(sqrt)
00850         VECTOR2FUNC(min) VECTOR2FUNC(max)
00851 #undef VECTOR1OP
00852 #undef VECTOR2OP
00853 #undef VECTORP2OP
00854 #undef VECTORFUNC
00855 #undef VECTOR2FUNC
00856         friend Vector rcp(const Vector &o) { return _mm_div_pd(_mm_set1_pd(1), o.v); }
00857         friend Vector rsqrt(const Vector &o) { return _mm_div_pd(_mm_set1_pd(1), _mm_sqrt_pd(o.v)); }
00858         friend bool isZero(const Vector &a)
00859         {   // We can use a cunning trick here
00860             return !_mm_movemask_pd(a.v)&&!_mm_movemask_pd(_mm_sub_pd(_mm_setzero_pd(), a.v));
00861         }
00862         friend TYPE sum(const Vector &a)
00863         {
00864 #if _M_IX86_FP>=3 || defined(__SSE3__)
00865             return int_extract(_mm_hadd_pd(a.v, a.v), 0);
00866 #elif 1
00867             SSETYPE tempA = _mm_shuffle_pd(a.v,a.v, _MM_SHUFFLE2(0,0));
00868             SSETYPE tempB = _mm_shuffle_pd(a.v,a.v, _MM_SHUFFLE2(1,1));
00869             return int_extract(_mm_add_sd(tempB, tempA), 0);
00870 #else
00871             FXMEMALIGNED(16) TYPE f[DIMENSION];
00872             _mm_store_pd(f, a.v);
00873             return f[0]+f[1];
00874 #endif
00875         }
00876         friend TYPE dot(const Vector &a, const Vector &b)
00877         {
00878 #if _M_IX86_FP>=4 || defined(__SSE4__)
00879             // Use the SSE4.1 instruction
00880             return int_extract(_mm_dp_pd(a.v, b.v, 0xf1), 0);
00881 #else
00882             // SSE implementation
00883             return sum(a*b);
00884 #endif
00885         }
00886         friend inline SSETYPE &GetSSEVal(Vector &a) { return a.v; }
00887         friend inline const SSETYPE &GetSSEVal(const Vector &a) { return a.v; }
00888     };
00889     typedef Vector<double, 2> int_SSEOptimised_double2; // Needed as macros don't understand template types :(
00890     FXVECTOROFVECTORS(int_SSEOptimised_double2, 4, FXVec4d);
00891     FXVECTOROFVECTORS(int_SSEOptimised_double2, 8, void);
00892     FXVECTOROFVECTORS(int_SSEOptimised_double2, 16, void);
00893     FXVECTOROFVECTORS(int_SSEOptimised_double2, 32, void);
00894     FXVECTOROFVECTORS(int_SSEOptimised_double2, 64, void);
00895     FXVECTOROFVECTORS(int_SSEOptimised_double2, 128, void);
00896 
00897 
00898     // Now for the integer vectors, we can save ourselves some work by reusing Impl::VectorBase
00899 #define VECTOR2OP_A(op, sseop, sseending)  Vector operator op (const Vector &o) const { return _mm_ ##sseop## _ ##sseending (v, o.v); }
00900 #define VECTORP2OP_A(op, sseop, sseending) Vector &operator op (const Vector &o)      { v=_mm_ ##sseop## _ ##sseending (v, o.v); return *this; }
00901 #define VECTOR2OP_A2(op, sseop, sseending)  Vector operator op (const Vector &o) const { return _mm_ ##sseop## _ ##sseending (v, _mm_cvtsi32_si128(_mm_cvtsi128_si32(o.v))); }
00902 #define VECTORP2OP_A2(op, sseop, sseending) Vector &operator op (const Vector &o)      { v=_mm_ ##sseop## _ ##sseending (v, _mm_cvtsi32_si128(_mm_cvtsi128_si32(o.v))); return *this; }
00903 #define VECTOR2OP_B(op, sseop, sseending)  Vector operator op (const Vector &o) const { return _mm_ ##sseop## _si128(v, o.v); }
00904 #define VECTORP2OP_B(op, sseop, sseending) Vector &operator op (const Vector &o)      { v=_mm_ ##sseop## _si128(v, o.v); return *this; }
00905 #define VECTOR2OP_C(op, sseop, sseending)  Vector operator op (const Vector &o) const { return _mm_and_si128(int_ones(), _mm_cmp ##sseop## _ ##sseending (v, o.v)); }
00906 #define VECTOR2OP_D(op, sseop, sseending)  Vector operator op (const Vector &o) const { return _mm_andnot_si128(_mm_cmp ##sseop## _ ##sseending (v, o.v), int_ones()); }
00907 #define VECTOR2OP_E(op, sseop, sseending)  Vector operator op (const Vector &o) const { return _mm_xor_si128(int_ones(), _mm_cmpeq_ ##sseending (_mm_setzero_si128(), _mm_ ##sseop## _si128(v, o.v))); }
00908 #define VECTOR2FUNC(op, sseop, sseending)  friend Vector op(const Vector &a, const Vector &b) { return _mm_ ##sseop## _ ##sseending (a.v, b.v); }
00909 #if _M_IX86_FP>=4 || defined(__SSE4__)
00910 #define VECTOR2OP_A_SSE4ONLY(op, sseop, sseending)  VECTOR2OP_A(op, sseop, sseending)
00911 #define VECTORP2OP_A_SSE4ONLY(op, sseop, sseending) VECTORP2OP_A(op, sseop, sseending)
00912 #define VECTOR2FUNC_SSE4ONLY(op, sseop, sseending) VECTOR2FUNC(op, sseop, sseending)
00913 #define VECTORISZERO 1==_mm_testc_si128(a.v, _mm_setzero_si128())
00914 #else
00915 #define VECTOR2OP_A_SSE4ONLY(op, sseop, sseending)
00916 #define VECTORP2OP_A_SSE4ONLY(op, sseop, sseending)
00917 #define VECTOR2FUNC_SSE4ONLY(op, sseop, sseending)
00918 #define VECTORISZERO 65535==_mm_movemask_epi8(_mm_cmpeq_epi8(_mm_setzero_si128(), a.v))
00919 #endif
00920 #define VECTORINTEGER(vint, vsize, sseending, signage1, signage2) \
00921     template<> class FXVECTOR_BUGGYGCCALIGNMENTHACK Vector<vint, vsize> : public Impl::VectorBase<vint, vsize, Vector<vint, vsize>, true, true, __m128i> \
00922     { \
00923     private: \
00924         typedef Impl::VectorBase<vint, vsize, Vector<vint, vsize>, true, true, __m128i> Base; \
00925         typedef __m128i SSETYPE; \
00926         static SSETYPE int_ones() \
00927         { \
00928             static SSETYPE v=_mm_set1_ ##sseending (1); \
00929             return v; \
00930         } \
00931         static SSETYPE int_not() \
00932         { \
00933             static SSETYPE v=_mm_cmpeq_ ##sseending (int_ones(), int_ones()); \
00934             return v; \
00935         } \
00936         static SSETYPE int_notones() \
00937         { \
00938             static SSETYPE v=_mm_xor_si128(int_ones(), int_not()); \
00939             return v; \
00940         } \
00941     public: \
00942         Vector() { } \
00943         Vector(const SSETYPE &_v) { v=_v; } \
00944         Vector(const Vector &o) { v=o.v; } \
00945         Vector &operator=(const Vector &o) { v=o.v; return *this; } \
00946         explicit Vector(const TYPE *d) \
00947         { \
00948             if(!d) v=_mm_setzero_si128(); \
00949             else v=_mm_loadu_si128((SSETYPE *) d); \
00950         } \
00951         explicit Vector(const TYPE &d) \
00952         { \
00953             v=_mm_set1_ ##sseending (d); \
00954         } \
00955         Vector operator +() const { return *this; } \
00956         Vector operator -() const { Vector ret((TYPE *)0); ret-=*this; return ret; } \
00957         Vector operator !() const { return _mm_andnot_si128(_mm_cmpeq_ ##sseending (_mm_setzero_si128(), v), int_ones()); } \
00958         Vector operator ~() const { return _mm_xor_si128(int_not(), v); } \
00959         VECTOR2OP_A(+,  add, sseending)  VECTOR2OP_A(-,  sub, sseending)   VECTOR2OP_A2(<<,  sll, sseending)  VECTOR2OP_A2(>>,  sr ## signage1, sseending) \
00960         VECTORP2OP_A(+=, add, sseending) VECTORP2OP_A(-=, sub, sseending) VECTORP2OP_A2(<<=, sll, sseending) VECTORP2OP_A2(>>=, sr ## signage1, sseending) \
00961         VECTOR2OP_A_SSE4ONLY(*, mullo, sseending) \
00962         VECTORP2OP_A_SSE4ONLY(*, mullo, sseending) \
00963  \
00964          VECTOR2OP_B( & ,  and, sseending)   VECTOR2OP_B( | ,  or, sseending)   VECTOR2OP_B( ^ ,  xor, sseending) \
00965         VECTORP2OP_B( &= , and, sseending)  VECTORP2OP_B( |= , or, sseending)  VECTORP2OP_B( ^= , xor, sseending)  \
00966  \
00967         VECTOR2OP_C( == , eq, sseending)  VECTOR2OP_C( < , lt, sseending)  VECTOR2OP_C( > , gt, sseending)   \
00968         VECTOR2OP_D( != , eq, sseending)  VECTOR2OP_D( <= , gt, sseending) VECTOR2OP_D( >= , lt, sseending) \
00969         VECTOR2OP_E( && , and, sseending) VECTOR2OP_E( || , or, sseending) \
00970  \
00971         VECTOR2FUNC_SSE4ONLY(min, min, signage2) VECTOR2FUNC_SSE4ONLY(max, max, signage2) \
00972         friend bool isZero(const Vector &a) \
00973         { \
00974             return VECTORISZERO; \
00975         } \
00976         friend Vector lshiftvec(const Vector &a, int shift) \
00977         { \
00978             if(shift&7) \
00979                 return lshiftvec(static_cast<const Vector::Base &>(a), shift); \
00980             else switch(shift/8) \
00981             { \
00982                 case 0:  return a; \
00983                 case 1:  return _mm_slli_si128(a.v, 1); \
00984                 case 2:  return _mm_slli_si128(a.v, 2); \
00985                 case 3:  return _mm_slli_si128(a.v, 3); \
00986                 case 4:  return _mm_slli_si128(a.v, 4); \
00987                 case 5:  return _mm_slli_si128(a.v, 5); \
00988                 case 6:  return _mm_slli_si128(a.v, 6); \
00989                 case 7:  return _mm_slli_si128(a.v, 7); \
00990                 case 8:  return _mm_slli_si128(a.v, 8); \
00991                 case 9:  return _mm_slli_si128(a.v, 9); \
00992                 case 10: return _mm_slli_si128(a.v, 10); \
00993                 case 11: return _mm_slli_si128(a.v, 11); \
00994                 case 12: return _mm_slli_si128(a.v, 12); \
00995                 case 13: return _mm_slli_si128(a.v, 13); \
00996                 case 14: return _mm_slli_si128(a.v, 14); \
00997                 case 15: return _mm_slli_si128(a.v, 15); \
00998             } \
00999             return _mm_setzero_si128(); \
01000         } \
01001         friend Vector rshiftvec(const Vector &a, int shift) \
01002         { \
01003             if(shift&7) \
01004                 return rshiftvec(static_cast<const Vector::Base &>(a), shift); \
01005             else switch(shift/8) \
01006             { \
01007                 case 0:  return a; \
01008                 case 1:  return _mm_srli_si128(a.v, 1); \
01009                 case 2:  return _mm_srli_si128(a.v, 2); \
01010                 case 3:  return _mm_srli_si128(a.v, 3); \
01011                 case 4:  return _mm_srli_si128(a.v, 4); \
01012                 case 5:  return _mm_srli_si128(a.v, 5); \
01013                 case 6:  return _mm_srli_si128(a.v, 6); \
01014                 case 7:  return _mm_srli_si128(a.v, 7); \
01015                 case 8:  return _mm_srli_si128(a.v, 8); \
01016                 case 9:  return _mm_srli_si128(a.v, 9); \
01017                 case 10: return _mm_srli_si128(a.v, 10); \
01018                 case 11: return _mm_srli_si128(a.v, 11); \
01019                 case 12: return _mm_srli_si128(a.v, 12); \
01020                 case 13: return _mm_srli_si128(a.v, 13); \
01021                 case 14: return _mm_srli_si128(a.v, 14); \
01022                 case 15: return _mm_srli_si128(a.v, 15); \
01023             } \
01024             return _mm_setzero_si128(); \
01025         } \
01026         friend inline SSETYPE &GetSSEVal(Vector &a) { return a.v; } \
01027         friend inline const SSETYPE &GetSSEVal(const Vector &a) { return a.v; } \
01028     };
01029 //VECTORINTEGER(FXuchar, 16, epi8)
01030 //VECTORINTEGER(FXchar,  16, epi8)
01031 VECTORINTEGER(FXushort, 8, epi16, l, epu16)
01032 VECTORINTEGER(FXshort,  8, epi16, a, epi16)
01033 VECTORINTEGER(FXuint,   4, epi32, l, epu32)
01034 VECTORINTEGER(FXint,    4, epi32, a, epi32)
01035 //VECTORINTEGER(FXulong,  2, epi64)
01036 //VECTORINTEGER(FXlong,   2, epi64)
01037 #undef VECTOR2OP_A
01038 #undef VECTORP2OP_A
01039 #undef VECTOR2OP_A2
01040 #undef VECTORP2OP_A2
01041 #undef VECTOR2OP_B
01042 #undef VECTORP2OP_B
01043 #undef VECTOR2OP_C
01044 #undef VECTOR2OP_D
01045 #undef VECTOR2OP_E
01046 #undef VECTOR2FUNC
01047 #undef VECTOR2OP_A_SSE4ONLY
01048 #undef VECTORP2OP_A_SSE4ONLY
01049 #undef VECTOR2FUNC_SSE4ONLY
01050 #undef VECTORISZERO
01051 #undef VECTORINTEGER
01052 #define VECTORINTEGER(vint, vsize) \
01053     typedef Vector<vint, vsize> int_SSEOptimised_##vint; \
01054     FXVECTOROFVECTORS(int_SSEOptimised_##vint, vsize*2, void); \
01055     FXVECTOROFVECTORS(int_SSEOptimised_##vint, vsize*4, void); \
01056     FXVECTOROFVECTORS(int_SSEOptimised_##vint, vsize*8, void); \
01057     FXVECTOROFVECTORS(int_SSEOptimised_##vint, vsize*16, void); \
01058     FXVECTOROFVECTORS(int_SSEOptimised_##vint, vsize*32, void); \
01059     FXVECTOROFVECTORS(int_SSEOptimised_##vint, vsize*64, void);
01060 VECTORINTEGER(FXushort, 8)
01061 VECTORINTEGER(FXshort,  8)
01062 VECTORINTEGER(FXuint,   4)
01063 VECTORINTEGER(FXint,    4)
01064 #undef VECTORINTEGER
01065 #endif
01066 #endif
01067 #endif
01068 
01069 #ifndef FXVECTOR_SPECIALISEDFLOAT4
01070     DEFINEVECTOREQUIV(float, 4, FXVec4f)
01071 #endif
01072 #ifndef FXVECTOR_SPECIALISEDDOUBLE2
01073     DEFINEVECTOREQUIV(double, 2, FXVec2d)
01074     DEFINEVECTOREQUIV(double, 4, FXVec4d)
01075 #endif
01076 #undef DEFINEVECTOREQUIV
01077 
01087     template<typename type, unsigned int A> class Array
01088     {
01089         type data[A];
01090     public:
01091         typedef type value_type;
01092         typedef value_type &reference;
01093         typedef const value_type &const_reference;
01094         typedef value_type *iterator;
01095         typedef const value_type *const_iterator;
01096         size_t max_size() const { return A; }
01097 
01098         Array() { }
01099         explicit Array(const type *d) { for(FXuint b=0; b<A; b++) data[b]=d ? type(d[b]) : type(); }
01100         bool operator==(const Array &o) const { for(FXuint b=0; b<A; b++) if(data[b]!=o.data[b]) return false; return true; }
01101         bool operator!=(const Array &o) const { for(FXuint b=0; b<A; b++) if(data[b]!=o.data[b]) return true; return false; }
01102 
01103         reference at(int i) { assert(i<A); return data[i]; }
01104         const_reference at(int i) const { assert(i<A); return data[i]; }
01105         reference operator[](int i) { return at(i); }
01106         const_reference operator[](int i) const { return at(i); }
01107 
01108         reference front() { return data[0]; }
01109         const_reference front() const { return data[0]; }
01110         reference back() { return data[A-1]; }
01111         const_reference back() const { return data[A-1]; }
01112         iterator begin() { return &data[0]; }
01113         const_iterator begin() const { return &data[0]; }
01114         iterator end() { return &data[A]; }
01115         const_iterator end() const { return &data[A]; }
01116     };
01117 
01128     namespace Impl {
01129         template<typename type, unsigned int A, unsigned int B, bool isFP> class MatrixIt;
01130         // Yet to be implemented
01131         template<typename type, unsigned int N, unsigned int A, unsigned int B> class MatrixIt<Vector<type, N>, A, B, false>;
01132         // The floating point specialisation
01133         template<typename type, unsigned int A, unsigned int B> class MatrixIt<type, A, B, true>
01134         {
01135         protected:
01136             Vector<type, A> data[B];
01137         public:
01138             typedef Vector<type, A> VECTORTYPE;
01139             static const unsigned int WIDTH=A, HEIGHT=B;
01140             MatrixIt() { }
01141             explicit MatrixIt(const type *d)
01142             {
01143                 for(FXuint b=0; b<B; b++)
01144                     data[b]=VECTORTYPE(d ? d+b*A : 0);
01145             }
01146             explicit MatrixIt(const type (*d)[A])
01147             {
01148                 for(FXuint b=0; b<B; b++)
01149                     data[b]=VECTORTYPE(d ? d[b] : 0);
01150             }
01151             bool operator==(const MatrixIt &o) const
01152             {
01153                 for(FXuint b=0; b<B; b++)
01154                     if(isZero(data[b]==o.data[b]))
01155                         return false;
01156                 return true;
01157             }
01158             bool operator!=(const MatrixIt &o) const
01159             {
01160                 for(FXuint b=0; b<B; b++)
01161                     if(isZero(data[b]==o.data[b]))
01162                         return true;
01163                 return false;
01164             }
01165             MatrixIt inline operator *(const MatrixIt &_x) const;
01166             template<typename type2, unsigned int A2, unsigned int B2> friend inline MatrixIt<type2, A2, B2, true> transpose(const MatrixIt<type2, A2, B2, true> &v);
01167         };
01168         template<typename type, unsigned int A, unsigned int B> MatrixIt<type, A, B, true> inline MatrixIt<type, A, B, true>::operator *(const MatrixIt<type, A, B, true> &_x) const
01169         {   // We can make heavy use of the dot product here
01170             MatrixIt<type, A, B, true> ret, x(transpose(_x));
01171             for(FXuint b=0; b<B; b++)
01172                 for(FXuint a=0; a<A; a++)
01173                     ret.data[b].set(a, dot(data[b], x.data[a]));
01174             return ret;
01175         }
01176         template<typename type, unsigned int A, unsigned int B> inline MatrixIt<type, A, B, true> transpose(const MatrixIt<type, A, B, true> &v)
01177         {   // No SIMD available to swap bytes around and endian conversion isn't big enough :(
01178             MatrixIt<type, A, B, true> ret;
01179             for(FXuint b=0; b<B; b++)
01180                 for(FXuint a=0; a<A; a++)
01181                     ret.data[b].set(a, v.data[a][b]);
01182             return ret;
01183         }
01184 #ifdef FXVECTOR_SPECIALISEDFLOAT4
01185         template<> inline MatrixIt<float, 4, 4, true> transpose(const MatrixIt<float, 4, 4, true> &v)
01186         {
01187             MatrixIt<float, 4, 4, true> ret(v);
01188             _MM_TRANSPOSE4_PS(GetSSEVal(ret.data[0]), GetSSEVal(ret.data[1]), GetSSEVal(ret.data[2]), GetSSEVal(ret.data[3]));
01189             return ret;
01190         }
01191 #if _M_IX86_FP>=4 || defined(__SSE4__)
01192         template<> MatrixIt<float, 4, 4, true> inline MatrixIt<float, 4, 4, true>::operator *(const MatrixIt<float, 4, 4, true> &_x) const
01193         {   // The SSE4 fast dot() makes the normal algorithm worthwhile
01194             MatrixIt<float, 4, 4, true> ret;
01195             __m128 x[4];
01196             for(FXuint b=0; b<4; b++)
01197                 x[b]=GetSSEVal(_x.data[b]);
01198             _MM_TRANSPOSE4_PS(x[0], x[1], x[2], x[3]);
01199             for(FXuint b=0; b<4; b++)
01200                 for(FXuint a=0; a<4; a++)
01201                     ret.data[b].set(a, dot(data[b], x[a]));
01202             return ret;
01203         }
01204 #else
01205         template<> MatrixIt<float, 4, 4, true> inline MatrixIt<float, 4, 4, true>::operator *(const MatrixIt<float, 4, 4, true> &_x) const
01206         {   // In pure SSE
01207             MatrixIt<float, 4, 4, true> ret;
01208             __m128 acc, temp, row, x[4];
01209             for(FXuint b=0; b<4; b++)
01210                 x[b]=GetSSEVal(_x.data[b]);
01211             // To avoid horizontal adding, accumulate vertically
01212             for(FXuint b=0; b<4; b++)
01213             {
01214                 row=GetSSEVal(data[b]);
01215                 temp=_mm_shuffle_ps(row, row, _MM_SHUFFLE(0,0,0,0)); acc=_mm_mul_ps(temp, x[0]);
01216                 temp=_mm_shuffle_ps(row, row, _MM_SHUFFLE(1,1,1,1)); acc=_mm_add_ps(acc, _mm_mul_ps(temp, x[1]));
01217                 temp=_mm_shuffle_ps(row, row, _MM_SHUFFLE(2,2,2,2)); acc=_mm_add_ps(acc, _mm_mul_ps(temp, x[2]));
01218                 temp=_mm_shuffle_ps(row, row, _MM_SHUFFLE(3,3,3,3)); acc=_mm_add_ps(acc, _mm_mul_ps(temp, x[3]));
01219                 ret.data[b]=acc;
01220             }
01221             return ret;
01222         }
01223 #endif
01224 #endif
01225         template<typename type, unsigned int A, unsigned int B> class MatrixI : public MatrixIt<type, A, B, Generic::Traits<type>::isFloat>
01226         {
01227         protected:
01228             typedef MatrixIt<type, A, B, Generic::Traits<type>::isFloat> Base;
01229         public:
01230             typedef type value_type;
01231             typedef value_type &reference;
01232             typedef const value_type &const_reference;
01233             typedef value_type *iterator;
01234             typedef const value_type *const_iterator;
01235             size_t max_size() const { return A*B; }
01236 
01237             MatrixI() { }
01238             MatrixI(const Base &o) : Base(o) { }
01239             explicit MatrixI(const type *d) : Base(d) { }
01240             explicit MatrixI(const type (*d)[A]) : Base(d) { }
01241 
01242             value_type at(int a, int b) { assert(a<A && b<B); return Base::data[b][a]; }
01243             value_type at(int a, int b) const { assert(a<A && b<B); return Base::data[b][a]; }
01244             value_type operator[](int i) { return at(i%A, i/A); }
01245             value_type operator[](int i) const { return at(i%A, i/A); }
01246 
01247             reference front() { return Base::data[0]; }
01248             const_reference front() const { return Base::data[0]; }
01249             reference back() { return Base::data[B-1][A-1]; }
01250             const_reference back() const { return Base::data[B-1][A-1]; }
01251             iterator begin() { return &Base::data[0]; }
01252             const_iterator begin() const { return &Base::data[0]; }
01253             iterator end() { return &Base::data[B][0]; }
01254             const_iterator end() const { return &Base::data[B][0]; }
01255         };
01256     }
01257     template<typename type, unsigned int A, unsigned int B> class Matrix : public Impl::EquivType<Impl::MatrixI<type, A, B>, type, void>
01258     {
01259     protected:
01260         typedef Impl::EquivType<Impl::MatrixI<type, A, B>, type, void> Base;
01261     public:
01262         Matrix() { }
01263         Matrix(const Base &o) : Base(o) { }
01264         Matrix(const typename Base::Base &o) : Base(o) { }
01265         explicit Matrix(const type *d) : Base(d) { }
01266         explicit Matrix(const type (*d)[A]) : Base(d) { }
01267     };
01268 #define DEFINEMATRIXEQUIV(type, no, equivtype) \
01269     template<> class Matrix<type, no, no> : public Impl::EquivType<Impl::MatrixI<type, no, no>, type, equivtype> \
01270     { \
01271         typedef Impl::EquivType<Impl::MatrixI<type, no, no>, type, equivtype> Base; \
01272     public: \
01273         Matrix() { } \
01274         Matrix(const Base &o) : Base(o) { } \
01275         Matrix(const Base::Base &o) : Base(o) { } \
01276         explicit Matrix(const type *d) : Base(d) { } \
01277         explicit Matrix(const type (*d)[no]) : Base(d) { } \
01278         Matrix(const equivtype &o) : Base((const type *)&o) { } \
01279     };
01280     DEFINEMATRIXEQUIV(float, 3, FXMat3f)
01281     DEFINEMATRIXEQUIV(float, 4, FXMat4f)
01282     typedef Matrix<float, 3, 3> Matrix3f;
01283     typedef Matrix<float, 4, 4> Matrix4f;
01284     DEFINEMATRIXEQUIV(double, 3, FXMat3d)
01285     DEFINEMATRIXEQUIV(double, 4, FXMat4d)
01286     typedef Matrix<double, 3, 3> Matrix3d;
01287     typedef Matrix<double, 4, 4> Matrix4d;
01288 #undef DEFINEMATRIXEQUIV
01289 
01318 #if FOX_BIGENDIAN || (defined(_M_IX86) && _M_IX86_FP==0) || (defined(__i386__) && !defined(__SSE__))
01319     /*
01320    A C-program for MT19937-64 (2004/9/29 version).
01321    Coded by Takuji Nishimura and Makoto Matsumoto.
01322 
01323    This is a 64-bit version of Mersenne Twister pseudorandom number
01324    generator.
01325 
01326    Before using, initialize the state by using init_genrand64(seed)
01327    or init_by_array64(init_key, key_length).
01328 
01329    Copyright (C) 2004, Makoto Matsumoto and Takuji Nishimura,
01330    All rights reserved.
01331 
01332    Redistribution and use in source and binary forms, with or without
01333    modification, are permitted provided that the following conditions
01334    are met:
01335 
01336      1. Redistributions of source code must retain the above copyright
01337         notice, this list of conditions and the following disclaimer.
01338 
01339      2. Redistributions in binary form must reproduce the above copyright
01340         notice, this list of conditions and the following disclaimer in the
01341         documentation and/or other materials provided with the distribution.
01342 
01343      3. The names of its contributors may not be used to endorse or promote
01344         products derived from this software without specific prior written
01345         permission.
01346 
01347    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
01348    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
01349    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
01350    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
01351    CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
01352    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
01353    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
01354    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
01355    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
01356    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
01357    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
01358 
01359    References:
01360    T. Nishimura, ``Tables of 64-bit Mersenne Twisters''
01361      ACM Transactions on Modeling and
01362      Computer Simulation 10. (2000) 348--357.
01363    M. Matsumoto and T. Nishimura,
01364      ``Mersenne Twister: a 623-dimensionally equidistributed
01365        uniform pseudorandom number generator''
01366      ACM Transactions on Modeling and
01367      Computer Simulation 8. (Jan. 1998) 3--30.
01368 
01369    Any feedback is very welcome.
01370    http://www.math.hiroshima-u.ac.jp/~m-mat/MT/emt.html
01371    email: m-mat @ math.sci.hiroshima-u.ac.jp (remove spaces)
01372     */
01373     class FRandomness
01374     {
01375         static const FXint NN=312;
01376         static const FXint MM=156;
01377         static const FXulong MATRIX_A=0xB5026F5AA96619E9ULL;
01378         static const FXulong UM=0xFFFFFFFF80000000ULL;  /* Most significant 33 bits */
01379         static const FXulong LM=0x7FFFFFFFULL;          /* Least significant 31 bits */
01380 
01381         FXulong mt[NN];                                 /* The array for the state vector */
01382         int mti;
01383     public:
01385         static const bool usingSIMD=false;
01387         FRandomness(FXulong seed) throw() : mti(NN+1)
01388         {
01389             mt[0] = seed;
01390             for (mti=1; mti<NN; mti++)
01391                 mt[mti] =  (6364136223846793005ULL * (mt[mti-1] ^ (mt[mti-1] >> 62)) + mti);
01392         }
01393 
01394         FRandomness(FXuchar *_seed, FXuval len) throw() : mti(NN+1)
01395         {
01396             FXulong *init_key=(FXulong *) _seed, key_length=len;
01397             unsigned long long i, j, k;
01398             mt[0] = 19650218ULL;
01399             for (mti=1; mti<NN; mti++)
01400                 mt[mti] =  (6364136223846793005ULL * (mt[mti-1] ^ (mt[mti-1] >> 62)) + mti);
01401             i=1; j=0;
01402             k = (NN>key_length ? NN : key_length);
01403             for (; k; k--) {
01404                 mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 62)) * 3935559000370003845ULL))
01405                     + init_key[j] + j; /* non linear */
01406                 i++; j++;
01407                 if (i>=NN) { mt[0] = mt[NN-1]; i=1; }
01408                 if (j>=key_length) j=0;
01409             }
01410             for (k=NN-1; k; k--) {
01411                 mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 62)) * 2862933555777941757ULL))
01412                     - i; /* non linear */
01413                 i++;
01414                 if (i>=NN) { mt[0] = mt[NN-1]; i=1; }
01415             }
01416 
01417             mt[0] = 1ULL << 63; /* MSB is 1; assuring non-zero initial array */
01418         }
01419 
01421         FXulong int64() throw()
01422         {
01423             int i;
01424             FXulong x;
01425             static FXulong mag01[2]={0ULL, MATRIX_A};
01426 
01427             if (mti >= NN) { /* generate NN words at one time */
01428 
01429                 for (i=0;i<NN-MM;i++) {
01430                     x = (mt[i]&UM)|(mt[i+1]&LM);
01431                     mt[i] = mt[i+MM] ^ (x>>1) ^ mag01[(int)(x&1ULL)];
01432                 }
01433                 for (;i<NN-1;i++) {
01434                     x = (mt[i]&UM)|(mt[i+1]&LM);
01435                     mt[i] = mt[i+(MM-NN)] ^ (x>>1) ^ mag01[(int)(x&1ULL)];
01436                 }
01437                 x = (mt[NN-1]&UM)|(mt[0]&LM);
01438                 mt[NN-1] = mt[MM-1] ^ (x>>1) ^ mag01[(int)(x&1ULL)];
01439 
01440                 mti = 0;
01441             }
01442 
01443             x = mt[mti++];
01444 
01445             x ^= (x >> 29) & 0x5555555555555555ULL;
01446             x ^= (x << 17) & 0x71D67FFFEDA60000ULL;
01447             x ^= (x << 37) & 0xFFF7EEE000000000ULL;
01448             x ^= (x >> 43);
01449 
01450             return x;
01451         }
01452 
01454         void fill(FXuchar *d, FXuval len) throw()
01455         {
01456             for(FXuint n=0; n<(FXuint)len/8; n++)
01457                 ((FXulong *)d)[n]=int64();
01458         }
01459 
01461         double real1() throw()
01462         {
01463             return (int64() >> 11) * (1.0/9007199254740991.0);
01464         }
01465 
01467         double real2() throw()
01468         {
01469             return (int64() >> 11) * (1.0/9007199254740992.0);
01470         }
01471 
01473         double real3() throw()
01474         {
01475             return ((int64() >> 12) + 0.5) * (1.0/4503599627370496.0);
01476         }
01477     };
01478 #else
01479     class FRandomness
01480     {
01481         static const int MEXP=19937;
01482         static const int N=(MEXP / 128 + 1);    // = 156
01483         typedef Vector<FXuint, 4> w128_t;
01484         // From SFMT=params19937.h
01485         static const int N32=(N * 4);
01486         static const int POS1=122, SL1=18, SR1=11, SL2=1, SR2=1;
01487         static const FXuint *MASK() throw() { static const FXMEMALIGNED(16) FXuint d[4]={0xdfffffefU, 0xddfecb7fU, 0xbffaffffU, 0xbffffff6U}; return d; }
01488         static const FXuint *PARITY() throw() { static const FXMEMALIGNED(16) FXuint d[4]={0x00000001U, 0x00000000U, 0x00000000U, 0x13c9e684U}; return d; }
01489 
01490         w128_t sfmt[N];
01491         FXuval idx;
01492         FXuint *psfmt32;
01493         FXulong *psfmt64;
01494 
01495         inline w128_t do_recursion(const w128_t &a, const w128_t &b, const w128_t &c, const w128_t &d) throw()
01496         {
01497             w128_t z(rshiftvec(c, SR2*8));
01498             z^=a;
01499             z^=d<<w128_t(SL1);
01500             z^=lshiftvec(a, SL2*8);
01501             w128_t y(b>>w128_t(SR1));
01502             y&=w128_t(MASK());
01503             z^=y;
01504             return z;
01505         }
01506         inline void gen_rand_all() throw()
01507         {
01508             int i;
01509             w128_t r, r1, r2;
01510 
01511             r1 = sfmt[N - 2];
01512             r2 = sfmt[N - 1];
01513             for (i = 0; i < N - POS1; i++)
01514             {
01515                 sfmt[i] = r = do_recursion(sfmt[i], sfmt[i + POS1], r1, r2);
01516                 r1 = r2;
01517                 r2 = r;
01518             }
01519             for (; i < N; i++)
01520             {
01521                 sfmt[i] = r = do_recursion(sfmt[i], sfmt[i + POS1 - N], r1, r2);
01522                 r1 = r2;
01523                 r2 = r;
01524             }
01525         }
01526         inline void gen_rand_array(w128_t *array, FXuint size) throw()
01527         {
01528             FXuint i, j;
01529             w128_t r, r1, r2;
01530 
01531             r1 = sfmt[N - 2];
01532             r2 = sfmt[N - 1];
01533             for (i = 0; i < N - POS1; i++)
01534             {
01535                 array[i] = r = do_recursion(sfmt[i], sfmt[i + POS1], r1, r2);
01536                 r1 = r2;
01537                 r2 = r;
01538             }
01539             for (; i < (FXuint) N; i++)
01540             {
01541                 array[i] = r = do_recursion(sfmt[i], array[i + POS1 - N], r1, r2);
01542                 r1 = r2;
01543                 r2 = r;
01544             }
01545             /* main loop */
01546             for (; i < size - N; i++)
01547             {
01548                 array[i] = r = do_recursion(array[i - N], array[i + POS1 - N], r1, r2);
01549                 r1 = r2;
01550                 r2 = r;
01551             }
01552             int limit=2*N-size;
01553             for(j = 0; (int) j < limit; j++)
01554             {
01555                 sfmt[j]=array[j + size - N];
01556             }
01557             for (; i < size; i++, j++)
01558             {
01559                 sfmt[j] = array[i] = r = do_recursion(array[i - N], array[i + POS1 - N], r1, r2);
01560                 r1 = r2;
01561                 r2 = r;
01562             }
01563         }
01564         void period_certification() throw()
01565         {
01566             int inner = 0;
01567             int i, j;
01568             FXuint work;
01569             static const w128_t parity(PARITY());
01570 
01571             for (i = 0; i < 4; i++)
01572                 inner ^= psfmt32[i] & parity[i];
01573             for (i = 16; i > 0; i >>= 1)
01574                 inner ^= inner >> i;
01575             inner &= 1;
01576             /* check OK */
01577             if (inner == 1) {
01578                 return;
01579             }
01580             /* check NG, and modification */
01581             for (i = 0; i < 4; i++) {
01582                 work = 1;
01583                 for (j = 0; j < 32; j++) {
01584                     if ((work & parity[i]) != 0) {
01585                         psfmt32[i] ^= work;
01586                         return;
01587                     }
01588                     work = work << 1;
01589                 }
01590             }
01591         }
01592         inline FXuint func1(FXuint x) throw() {
01593             return (x ^ (x >> 27)) * (FXuint)1664525UL;
01594         }
01595         inline FXuint func2(FXuint x) throw() {
01596             return (x ^ (x >> 27)) * (FXuint)1566083941UL;
01597         }
01598 
01599         void init_by_array(FXuint *init_key, FXuint key_length) throw()
01600         {
01601             FXuint i, j, count;
01602             FXuint r;
01603             int lag;
01604             int mid;
01605             int size = N * 4;
01606 
01607             if (size >= 623) {
01608                 lag = 11;
01609             } else if (size >= 68) {
01610                 lag = 7;
01611             } else if (size >= 39) {
01612                 lag = 5;
01613             } else {
01614                 lag = 3;
01615             }
01616             mid = (size - lag) / 2;
01617 
01618             memset(sfmt, 0x8b, sizeof(sfmt));
01619             if (key_length + 1 > (FXuint) N32) {
01620                 count = key_length + 1;
01621             } else {
01622                 count = N32;
01623             }
01624             r = func1(psfmt32[0] ^ psfmt32[mid] ^ psfmt32[N32 - 1]);
01625             psfmt32[mid] += r;
01626             r += key_length;
01627             psfmt32[mid + lag] += r;
01628             psfmt32[0] = r;
01629 
01630             count--;
01631             for (i = 1, j = 0; (j < count) && (j < key_length); j++) {
01632                 r = func1(psfmt32[i] ^ psfmt32[(i + mid) % N32] ^ psfmt32[(i + N32 - 1) % N32]);
01633                 psfmt32[(i + mid) % N32] += r;
01634                 r += init_key[j] + i;
01635                 psfmt32[(i + mid + lag) % N32] += r;
01636                 psfmt32[i] = r;
01637                 i = (i + 1) % N32;
01638             }
01639             for (; j < count; j++) {
01640                 r = func1(psfmt32[i] ^ psfmt32[(i + mid) % N32] ^ psfmt32[(i + N32 - 1) % N32]);
01641                 psfmt32[(i + mid) % N32] += r;
01642                 r += i;
01643                 psfmt32[(i + mid + lag) % N32] += r;
01644                 psfmt32[i] = r;
01645                 i = (i + 1) % N32;
01646             }
01647             for (j = 0; j < (FXuint) N32; j++) {
01648                 r = func2(psfmt32[i] + psfmt32[(i + mid) % N32] + psfmt32[(i + N32 - 1) % N32]);
01649                 psfmt32[(i + mid) % N32] ^= r;
01650                 r -= i;
01651                 psfmt32[(i + mid + lag) % N32] ^= r;
01652                 psfmt32[i] = r;
01653                 i = (i + 1) % N32;
01654             }
01655 
01656             idx = N32;
01657             period_certification();
01658         }
01659     public:
01661         static const bool usingSIMD=true;
01663         FRandomness(FXulong seed) throw() : idx(0), psfmt32((FXuint *)&sfmt), psfmt64((FXulong *)&sfmt)
01664         {
01665             init_by_array((FXuint*)&seed, 2);
01666             /*int i;
01667 
01668             psfmt32[0] = (FXuint) seed;
01669             for (i = 1; i < N32; i++) {
01670                 psfmt32[i] = 1812433253UL * (psfmt32[i - 1] ^ (psfmt32[i - 1] >> 30)) + i;
01671             }
01672             idx = N32;
01673             period_certification();*/
01674         }
01676         FRandomness(FXuchar *seed, FXuval len) throw() : idx(0), psfmt32((FXuint *)&sfmt), psfmt64((FXulong *)&sfmt)
01677         {
01678             init_by_array((FXuint*)seed, (FXuint)(len/4));
01679         }
01680 
01682         FXulong int64() throw()
01683         {
01684             FXulong r;
01685             assert(idx % 2 == 0);
01686 
01687             if (idx >= (FXuint) N32)
01688             {
01689                 gen_rand_all();
01690                 idx = 0;
01691             }
01692             r = psfmt64[idx / 2];
01693             idx += 2;
01694             return r;
01695         }
01696 
01698         void fill(FXuchar *d, FXuval len) throw()
01699         {
01700             gen_rand_array((w128_t *) d, (FXuint)(len/sizeof(w128_t)));
01701         }
01702 
01704         double real1() throw()
01705         {
01706             return (int64() >> 11) * (1.0/9007199254740991.0);
01707         }
01708 
01710         double real2() throw()
01711         {
01712             return (int64() >> 11) * (1.0/9007199254740992.0);
01713         }
01714 
01716         double real3() throw()
01717         {
01718             return ((int64() >> 12) + 0.5) * (1.0/4503599627370496.0);
01719         }
01720     };
01721 #endif
01722 
01729     class SysRandomness : protected QMutex, protected FRandomness
01730     {
01731     public:
01732         SysRandomness() : FRandomness(FXProcess::getNsCount()) { }
01733         FXulong int64()
01734         {
01735             QMtxHold h(this);
01736             return FRandomness::int64();
01737         }
01738         double real1()
01739         {
01740             QMtxHold h(this);
01741             return FRandomness::real1();
01742         }
01743         double real2()
01744         {
01745             QMtxHold h(this);
01746             return FRandomness::real2();
01747         }
01748         double real3()
01749         {
01750             QMtxHold h(this);
01751             return FRandomness::real3();
01752         }
01753     };
01754     extern FXAPI SysRandomness SysRandSrc;
01755 
01757     inline double normalrand(FRandomness &src, double stddevs) throw()
01758     {
01759         double x, y, r2;
01760         do
01761         {
01762             x=-1+2*src.real3();
01763             y=-1+2*src.real3();
01764             r2=x*x+y*y;
01765         }
01766         while(r2>1.0 || r2==0);
01767         return stddevs*y*::sqrt(-2.0*log(r2)/r2);
01768     }
01769     inline double normalrand(SysRandomness &src, double stddevs) throw()
01770     {
01771         double x, y, r2;
01772         do
01773         {
01774             x=-1+2*src.real3();
01775             y=-1+2*src.real3();
01776             r2=x*x+y*y;
01777         }
01778         while(r2>1.0 || r2==0);
01779         return stddevs*y*::sqrt(-2.0*log(r2)/r2);
01780     }
01781 
01783     template<typename type> inline type normaldist(type x, type stddevs) throw()
01784     {
01785         type u=x/fabs(stddevs);
01786         return (1/(sqrt(2*(type) PI)*fabs(stddevs)))*exp(-u*u/2);
01787     }
01788 
01790     template<typename type> inline type mean(const type *FXRESTRICT array, FXuval len, FXuint stride=1, type *FXRESTRICT min=0, type *FXRESTRICT max=0, type *FXRESTRICT mode=0) throw()
01791     {
01792         type m=0;
01793         if(min) *min=Generic::BiggestValue<type>::value;
01794         if(max) *max=Generic::BiggestValue<type, true>::value;
01795         if(mode) *mode=0;
01796         for(FXuval n=0; n<len; n+=stride)
01797         {
01798             m+=array[n];
01799             if(min && array[n]<*min) *min=array[n];
01800             if(max && array[n]>*max) *max=array[n];
01801             if(mode && (len/2==n || (len+1)/2==n)) *mode=*mode ? (*mode+array[n])/2 : array[n];
01802         }
01803         return m/len;
01804     }
01805     template<typename type, class allocator> inline type mean(const QMemArray<type, allocator> &array, FXuint stride=1, type *FXRESTRICT min=0, type *FXRESTRICT max=0, type *FXRESTRICT mode=0) throw()
01806     {
01807         return mean(array.data(), array.count(), stride, max, min, mode);
01808     }
01809 
01811     template<typename type> inline type variance(const type *FXRESTRICT array, FXuval len, FXuint stride=1, const type *FXRESTRICT _mean=0) throw()
01812     {
01813         type v=0, m=_mean ? *_mean : mean(array, len, stride);
01814         for(FXuval n=0; n<len; n+=stride)
01815         {
01816             const type d=array[n]-m;
01817             v+=d*d;
01818         }
01819         return v/((len/stride)-1);
01820     }
01821     template<typename type, class allocator> inline type variance(const QMemArray<type, allocator> &array, FXuint stride=1, const type *FXRESTRICT _mean=0) throw()
01822     {
01823         return variance(array.data(), array.count(), stride, _mean);
01824     }
01825 
01827     template<typename type> inline type stddev(const type *FXRESTRICT array, FXuval len, FXuint stride=1, const type *FXRESTRICT _mean=0) throw()
01828     {
01829         return sqrt(variance(array, len, stride, _mean));
01830     }
01831     template<typename type, class allocator> inline type stddev(const QMemArray<type, allocator> &array, FXuint stride=1, const type *FXRESTRICT _mean=0) throw()
01832     {
01833         return stddev(array.data(), array.count(), stride, _mean);
01834     }
01835 
01837     template<unsigned int buckets, typename type> inline Vector<type, buckets+3> distribution(const type *FXRESTRICT array, FXuval len, FXuint stride=1, const type *FXRESTRICT min=0, const type *FXRESTRICT max=0) throw()
01838     {
01839         Vector<type, buckets+3> ret;
01840         const type &_min=(min && max) ? *min : ret[0], &_max=(min && max) ? *max : ret[1];
01841         type &div=ret[2];
01842         if(!min || !max)
01843             mean(array, len, stride, &ret[0], &ret[1]);
01844         div=(_max-_min)/buckets;
01845         for(FXuval n=0; n<len; n+=stride)
01846         {
01847             for(FXuval bucket=0; bucket<buckets; bucket++)
01848             {
01849                 if(array[n]>=_min+div*bucket && array[n]<_min+div*(bucket+1))
01850                     ++ret[3+bucket];
01851             }
01852         }
01853         return ret;
01854     }
01855     template<unsigned int buckets, typename type, class allocator> inline Vector<type, buckets+3> distribution(const QMemArray<type, allocator> &array, FXuint stride=1, const type *FXRESTRICT min=0, const type *FXRESTRICT max=0) throw()
01856     {
01857         return distribution<type, buckets>(array.data(), array.count(), stride, min, max);
01858     }
01859 
01860 } // namespace
01861 
01862 } // namespace
01863 
01864 #endif