00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef FXASSEMBLEROPS_H
00023 #define FXASSEMBLEROPS_H
00024
00042 #if defined(_MSC_VER) && ((defined(_M_IX86) && _M_IX86>=400) || (defined(_M_AMD64) || defined(_M_X64)))
00043
00044 #if defined(_M_X64) || _M_IX86_FP>=1
00045 #include "xmmintrin.h"
00046 #endif
00047 #if defined(_M_X64) || _M_IX86_FP>=2
00048 #include "emmintrin.h"
00049 #endif
00050 #ifndef BitScanForward // Try to avoid pulling in WinNT.h
00051 extern "C" unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
00052 extern "C" unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
00053 #define BitScanForward _BitScanForward
00054 #define BitScanReverse _BitScanReverse
00055 #pragma intrinsic(_BitScanForward)
00056 #pragma intrinsic(_BitScanReverse)
00057
00058 #if defined(_M_AMD64) || defined(_M_X64)
00059 extern "C" unsigned char _BitScanForward64(unsigned long *index, unsigned __int64 mask);
00060 extern "C" unsigned char _BitScanReverse64(unsigned long *index, unsigned __int64 mask);
00061 #define BitScanForward64 _BitScanForward64
00062 #define BitScanReverse64 _BitScanReverse64
00063 #pragma intrinsic(_BitScanForward64)
00064 #pragma intrinsic(_BitScanReverse64)
00065 #endif
00066
00067 #endif
00068 #include <stdlib.h>
00069
00070
00071
00072 #pragma intrinsic(_byteswap_ushort)
00073 #pragma intrinsic(_byteswap_ulong)
00074 #pragma intrinsic(_byteswap_uint64)
00075
00076 namespace FX {
00077
/*! Issues an SSE prefetch with the \c _MM_HINT_T2 hint for the memory at
\c ptr. Compiles to nothing when the target has no SSE support
(neither x64 nor <tt>_M_IX86_FP>=1</tt>).
\param ptr Address to prefetch; it is never dereferenced by this function.
*/
inline void fxprefetchmemT(const void *ptr) throw()
{
#if defined(_M_X64) || _M_IX86_FP>=1
  // _mm_prefetch() wants a char pointer
  const char *const address=static_cast<const char *>(ptr);
  _mm_prefetch(address, _MM_HINT_T2);
#endif
}
/*! Issues an SSE prefetch with the \c _MM_HINT_NTA (non-temporal) hint for
the memory at \c ptr. Compiles to nothing when the target has no SSE
support (neither x64 nor <tt>_M_IX86_FP>=1</tt>).
\param ptr Address to prefetch; it is never dereferenced by this function.
*/
inline void fxprefetchmemNT(const void *ptr) throw()
{
#if defined(_M_X64) || _M_IX86_FP>=1
  // _mm_prefetch() wants a char pointer
  const char *const address=static_cast<const char *>(ptr);
  _mm_prefetch(address, _MM_HINT_NTA);
#endif
}
00090 inline FXuint fxbitscan(FXuint x) throw()
00091 {
00092 FXuint m;
00093 #if defined(BitScanForward)
00094 unsigned long _m;
00095 BitScanForward(&_m, x);
00096 m=(unsigned int) _m;
00097 #elif defined(_M_IX86)
00098 __asm
00099 {
00100 bsf eax, [x]
00101 mov [m], eax
00102 }
00103 #else
00104 #error Unknown implementation
00105 #endif
00106 return m;
00107 }
00108 inline FXuint fxbitscan(FXulong x) throw()
00109 {
00110 FXuint m;
00111 #if defined(BitScanForward64)
00112 unsigned long _m;
00113 BitScanForward64(&_m, x);
00114 m=(unsigned int) _m;
00115 #else
00116 FXuint *_x=(FXuint *) &x;
00117 m=fxbitscan(_x[0]);
00118 if(32==m) m=32+fxbitscan(_x[1]);
00119 #endif
00120 return m;
00121 }
00122 inline FXuint fxbitscanrev(FXuint x) throw()
00123 {
00124 FXuint m;
00125 #if defined(BitScanReverse)
00126 unsigned long _m;
00127 BitScanReverse(&_m, x);
00128 m=(unsigned int) _m;
00129 #elif defined(_M_IX86)
00130 __asm
00131 {
00132 bsr eax, [x]
00133 mov [m], eax
00134 }
00135 #else
00136 #error Unknown implementation
00137 #endif
00138 return m;
00139 }
00140 inline FXuint fxbitscanrev(FXulong x) throw()
00141 {
00142 FXuint m;
00143 #if defined(BitScanReverse64)
00144 unsigned long _m;
00145 BitScanReverse64(&_m, x);
00146 m=(unsigned int) _m;
00147 #else
00148 FXuint *_x=(FXuint *) &x;
00149 m=32+fxbitscanrev(_x[1]);
00150 if(64==m) { m=fxbitscanrev(_x[0]); if(32==m) m=64; }
00151 #endif
00152 return m;
00153 }
00154 inline void fxendianswap(FXushort &v) throw()
00155 {
00156 v=((v & 0xff)<<8)|(v>>8);
00157 }
00158 inline void fxendianswap(FXuint &v) throw()
00159 {
00160 v=_byteswap_ulong(v);
00161 }
00162 inline void fxendianswap(FXulong &v) throw()
00163 {
00164 v=_byteswap_uint64(v);
00165 }
00166 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
00167 #if defined(__x86_64__) || defined(__SSE__)
00168 #include "xmmintrin.h"
00169 #endif
00170 #if defined(__x86_64__) || defined(__SSE2__)
00171 #include "emmintrin.h"
00172 #endif
00173
00174 namespace FX {
00175
/*! Asks the compiler to emit a prefetch for the memory at \c ptr.
\param ptr Address to prefetch; it is never dereferenced by this function.
*/
inline void fxprefetchmemT(const void *ptr) throw()
{
  // Second argument 0 = prefetch for reading, third argument 3 = locality hint
  __builtin_prefetch(ptr,
                     0,
                     3);
}
/*! Asks the compiler to emit a prefetch for the memory at \c ptr, using the
lowest locality hint.
\param ptr Address to prefetch; it is never dereferenced by this function.
*/
inline void fxprefetchmemNT(const void *ptr) throw()
{
  // Second argument 0 = prefetch for reading, third argument 0 = locality hint
  __builtin_prefetch(ptr,
                     0,
                     0);
}
00184 inline FXuint fxbitscan(FXuint x) throw()
00185 {
00186 FXuint m;
00187 __asm__("bsfl %1,%0\n\t"
00188 : "=r" (m)
00189 : "rm" (x));
00190 return m;
00191 }
00192 inline FXuint fxbitscan(FXulong x) throw()
00193 {
00194 FXulong m;
00195 #if defined(__x86_64__)
00196 __asm__("bsfq %1,%0\n\t"
00197 : "=r" (m)
00198 : "rm" (x));
00199 #else
00200 union
00201 {
00202 FXulong l;
00203 FXuint i[2];
00204 } _x;
00205 _x.l=x;
00206 m=fxbitscan(_x.i[!FOX_BIGENDIAN]);
00207 if(32==m) m=32+fxbitscan(_x.i[!!FOX_BIGENDIAN]);
00208 #endif
00209 return (FXuint) m;
00210 }
00211 inline FXuint fxbitscanrev(FXuint x) throw()
00212 {
00213 FXuint m;
00214 __asm__("bsrl %1,%0\n\t"
00215 : "=r" (m)
00216 : "rm" (x));
00217 return m;
00218 }
00219 inline FXuint fxbitscanrev(FXulong x) throw()
00220 {
00221 FXulong m;
00222 #if defined(__x86_64__)
00223 __asm__("bsrq %1,%0\n\t"
00224 : "=r" (m)
00225 : "rm" (x));
00226 #else
00227 union
00228 {
00229 FXulong l;
00230 FXuint i[2];
00231 } _x;
00232 _x.l=x;
00233 m=32+fxbitscanrev(_x.i[!!FOX_BIGENDIAN]);
00234 if(64==m) { m=fxbitscanrev(_x.i[!FOX_BIGENDIAN]); if(32==m) m=64; }
00235 #endif
00236 return (FXuint) m;
00237 }
00238 inline void fxendianswap(FXushort &v) throw()
00239 {
00240 v=((v & 0xff)<<8)|(v>>8);
00241 }
00242 inline void fxendianswap(FXuint &v) throw()
00243 {
00244 __asm__("bswapl %0\n\t"
00245 : "=r" (v)
00246 : "0" (v));
00247 }
00248 inline void fxendianswap(FXulong &v) throw()
00249 {
00250 #if defined(__x86_64__)
00251 __asm__("bswapq %0\n\t"
00252 : "=r" (v)
00253 : "0" (v));
00254 #else
00255 __asm__("bswapl %%eax\n\t"
00256 "bswapl %%edx\n\t"
00257 "movl %%eax, %%ecx\n\t"
00258 "movl %%edx, %%eax\n\t"
00259 "movl %%ecx, %%edx\n\t"
00260 : "=A" (v)
00261 : "0" (v)
00262 : "%ecx");
00263 #endif
00264 }
00265 #else
00266 namespace FX {
00267
/*! Prefetch hint. This implementation, used when neither the MSVC nor the
GCC x86 variant is selected, does nothing.
\param ptr Ignored.
*/
inline void fxprefetchmemT(const void *ptr) throw()
{
  (void) ptr;   // deliberately unused
}
/*! Non-temporal prefetch hint. This implementation, used when neither the
MSVC nor the GCC x86 variant is selected, does nothing.
\param ptr Ignored.
*/
inline void fxprefetchmemNT(const void *ptr) throw()
{
  (void) ptr;   // deliberately unused
}
00284 inline FXuint fxbitscan(FXuint x) throw()
00285 {
00286 x = ~x & (x - 1);
00287 x = x - ((x >> 1) & 0x55555555);
00288 x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
00289 x = (x + (x >> 4)) & 0x0F0F0F0F;
00290 x = x + (x << 8);
00291 x = x + (x << 16);
00292 return x >> 24;
00293 }
00294 inline FXuint fxbitscan(FXulong x) throw()
00295 {
00296 FXuint m;
00297 union
00298 {
00299 FXulong l;
00300 FXuint i[2];
00301 } _x;
00302 _x.l=x;
00303 m=fxbitscan(_x.i[!FOX_BIGENDIAN]);
00304 if(32==m) m=32+fxbitscan(_x.i[!!FOX_BIGENDIAN]);
00305 return m;
00306 }
00315 inline FXuint fxbitscanrev(FXuint x) throw()
00316 {
00317 #if 1
00318 union {
00319 unsigned asInt[2];
00320 double asDouble;
00321 };
00322 int n;
00323
00324 asDouble = (double)x + 0.5;
00325 n = (asInt[!FOX_BIGENDIAN] >> 20) - 1023;
00326 return n;
00327 #else
00328 x = x | (x >> 1);
00329 x = x | (x >> 2);
00330 x = x | (x >> 4);
00331 x = x | (x >> 8);
00332 x = x | (x >>16);
00333 x = ~x;
00334 x = x - ((x >> 1) & 0x55555555);
00335 x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
00336 x = (x + (x >> 4)) & 0x0F0F0F0F;
00337 x = x + (x << 8);
00338 x = x + (x << 16);
00339 return x >> 24;
00340 #endif
00341 }
00342 inline FXuint fxbitscanrev(FXulong x) throw()
00343 {
00344 FXuint m;
00345 union
00346 {
00347 FXulong l;
00348 FXuint i[2];
00349 } _x;
00350 _x.l=x;
00351 m=32+fxbitscanrev(_x.i[!!FOX_BIGENDIAN]);
00352 if(64==m) { m=fxbitscanrev(_x.i[!FOX_BIGENDIAN]); if(32==m) m=64; }
00353 return m;
00354 }
00355
00359 inline void fxendianswap(FXushort &v) throw()
00360 {
00361 v=((v & 0xff)<<8)|(v>>8);
00362 }
00363
00367 inline void fxendianswap(FXuint &v) throw()
00368 {
00369 FXuchar *p=(FXuchar *) &v, t;
00370 t=p[0]; p[0]=p[3]; p[3]=t;
00371 t=p[1]; p[1]=p[2]; p[2]=t;
00372 }
00373
00377 inline void fxendianswap(FXulong &v) throw()
00378 {
00379 FXuchar *p=(FXuchar *) &v, t;
00380 t=p[0]; p[0]=p[7]; p[7]=t;
00381 t=p[1]; p[1]=p[6]; p[6]=t;
00382 t=p[2]; p[2]=p[5]; p[5]=t;
00383 t=p[3]; p[3]=p[4]; p[4]=t;
00384 }
00385 #endif
00386
00387 }
00388
00389 #endif