fxassemblerops.h

Go to the documentation of this file.
00001 /********************************************************************************
00002 *                                                                               *
00003 *                        Assembler optimised operations                         *
00004 *                                                                               *
00005 *********************************************************************************
00006 *        Copyright (C) 2005-2007 by Niall Douglas.   All Rights Reserved.       *
00007 *       NOTE THAT I DO NOT PERMIT ANY OF MY CODE TO BE PROMOTED TO THE GPL      *
00008 *********************************************************************************
00009 * This code is free software; you can redistribute it and/or modify it under    *
00010 * the terms of the GNU Library General Public License v2.1 as published by the  *
00011 * Free Software Foundation EXCEPT that clause 3 does not apply ie; you may not  *
00012 * "upgrade" this code to the GPL without my prior written permission.           *
00013 * Please consult the file "License_Addendum2.txt" accompanying this file.       *
00014 *                                                                               *
00015 * This code is distributed in the hope that it will be useful,                  *
00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of                *
00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.                          *
00018 *********************************************************************************
00019 * $Id:                                                                          *
00020 ********************************************************************************/
00021 
00022 #ifndef FXASSEMBLEROPS_H
00023 #define FXASSEMBLEROPS_H
00024 
00042 #if defined(_MSC_VER) && ((defined(_M_IX86) && _M_IX86>=400) || (defined(_M_AMD64) || defined(_M_X64)))
00043 // Get the intrinsic definitions
00044 #if defined(_M_X64) || _M_IX86_FP>=1
00045 #include "xmmintrin.h"  // For mm_prefetch
00046 #endif
00047 #if defined(_M_X64) || _M_IX86_FP>=2
00048 #include "emmintrin.h"
00049 #endif
00050 #ifndef BitScanForward  // Try to avoid pulling in WinNT.h
00051 extern "C" unsigned char _BitScanForward(unsigned long *index, unsigned long mask);
00052 extern "C" unsigned char _BitScanReverse(unsigned long *index, unsigned long mask);
00053 #define BitScanForward _BitScanForward
00054 #define BitScanReverse _BitScanReverse
00055 #pragma intrinsic(_BitScanForward)
00056 #pragma intrinsic(_BitScanReverse)
00057 
00058 #if defined(_M_AMD64) || defined(_M_X64)
00059 extern "C" unsigned char _BitScanForward64(unsigned long *index, unsigned __int64 mask);
00060 extern "C" unsigned char _BitScanReverse64(unsigned long *index, unsigned __int64 mask);
00061 #define BitScanForward64 _BitScanForward64
00062 #define BitScanReverse64 _BitScanReverse64
00063 #pragma intrinsic(_BitScanForward64)
00064 #pragma intrinsic(_BitScanReverse64)
00065 #endif
00066 
00067 #endif
00068 #include <stdlib.h>     // For byteswap
00069 //        unsigned short __cdecl _byteswap_ushort(unsigned short);
00070 //        unsigned long  __cdecl _byteswap_ulong (unsigned long);
00071 //        unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64);
00072 #pragma intrinsic(_byteswap_ushort)
00073 #pragma intrinsic(_byteswap_ulong)
00074 #pragma intrinsic(_byteswap_uint64)
00075 
00076 namespace FX {
00077 /* One has a choice of increments: 32 for P6, 64 for Athlon and 128 for P4, so we choose 64 */
/*! Asks the processor to start loading the memory at \c ptr into cache.
Emits \c _mm_prefetch with the \c _MM_HINT_T2 hint when building for x64 or
with SSE code generation enabled (\c _M_IX86_FP>=1); otherwise does nothing. */
inline void fxprefetchmemT(const void *ptr) throw()
{
#if defined(_M_X64) || _M_IX86_FP>=1
    const char *const addr=(const char *) ptr;
    _mm_prefetch(addr, _MM_HINT_T2);
#endif
}
/*! Asks the processor to start loading the memory at \c ptr using the
non-temporal hint. Emits \c _mm_prefetch with \c _MM_HINT_NTA when building
for x64 or with SSE code generation enabled (\c _M_IX86_FP>=1); otherwise
does nothing. */
inline void fxprefetchmemNT(const void *ptr) throw()
{
#if defined(_M_X64) || _M_IX86_FP>=1
    const char *const addr=(const char *) ptr;
    _mm_prefetch(addr, _MM_HINT_NTA);
#endif
}
00090 inline FXuint fxbitscan(FXuint x) throw()
00091 {
00092     FXuint m;
00093 #if defined(BitScanForward)
00094     unsigned long _m;
00095     BitScanForward(&_m, x);
00096     m=(unsigned int) _m;
00097 #elif defined(_M_IX86)
00098     __asm
00099     {
00100         bsf eax, [x]
00101         mov [m], eax
00102     }
00103 #else
00104 #error Unknown implementation
00105 #endif
00106     return m;
00107 }
00108 inline FXuint fxbitscan(FXulong x) throw()
00109 {
00110     FXuint m;
00111 #if defined(BitScanForward64)
00112     unsigned long _m;
00113     BitScanForward64(&_m, x);
00114     m=(unsigned int) _m;
00115 #else
00116     FXuint *_x=(FXuint *) &x;
00117     m=fxbitscan(_x[0]);
00118     if(32==m) m=32+fxbitscan(_x[1]);
00119 #endif
00120     return m;
00121 }
00122 inline FXuint fxbitscanrev(FXuint x) throw()
00123 {
00124     FXuint m;
00125 #if defined(BitScanReverse)
00126     unsigned long _m;
00127     BitScanReverse(&_m, x);
00128     m=(unsigned int) _m;
00129 #elif defined(_M_IX86)
00130     __asm
00131     {
00132         bsr eax, [x]
00133         mov [m], eax
00134     }
00135 #else
00136 #error Unknown implementation
00137 #endif
00138     return m;
00139 }
00140 inline FXuint fxbitscanrev(FXulong x) throw()
00141 {
00142     FXuint m;
00143 #if defined(BitScanReverse64)
00144     unsigned long _m;
00145     BitScanReverse64(&_m, x);
00146     m=(unsigned int) _m;
00147 #else
00148     FXuint *_x=(FXuint *) &x;
00149     m=32+fxbitscanrev(_x[1]);
00150     if(64==m) { m=fxbitscanrev(_x[0]); if(32==m) m=64; }
00151 #endif
00152     return m;
00153 }
00154 inline void fxendianswap(FXushort &v) throw()
00155 {   // Can't improve on this
00156     v=((v & 0xff)<<8)|(v>>8);
00157 }
00158 inline void fxendianswap(FXuint &v) throw()
00159 {
00160     v=_byteswap_ulong(v);           // Invokes bswap x86 instruction
00161 }
00162 inline void fxendianswap(FXulong &v) throw()
00163 {
00164     v=_byteswap_uint64(v);          // Invokes bswap x86 instruction
00165 }
00166 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
00167 #if defined(__x86_64__) || defined(__SSE__)
00168 #include "xmmintrin.h"
00169 #endif
00170 #if defined(__x86_64__) || defined(__SSE2__)
00171 #include "emmintrin.h"
00172 #endif
00173 
00174 namespace FX {
00175 
/*! Asks the processor to start loading the memory at \c ptr into cache.
Issues \c __builtin_prefetch for reading (rw=0) with locality 3. */
inline void fxprefetchmemT(const void *ptr) throw()
{
    __builtin_prefetch(ptr, /* rw */ 0, /* locality */ 3);
}
/*! Asks the processor to start loading the memory at \c ptr without
expecting it to be reused. Issues \c __builtin_prefetch for reading (rw=0)
with locality 0. */
inline void fxprefetchmemNT(const void *ptr) throw()
{
    __builtin_prefetch(ptr, /* rw */ 0, /* locality */ 0);
}
00184 inline FXuint fxbitscan(FXuint x) throw()
00185 {
00186     FXuint m;
00187     __asm__("bsfl %1,%0\n\t"
00188             : "=r" (m) 
00189             : "rm" (x));
00190     return m;
00191 }
00192 inline FXuint fxbitscan(FXulong x) throw()
00193 {
00194     FXulong m;
00195 #if defined(__x86_64__)
00196     __asm__("bsfq %1,%0\n\t"
00197             : "=r" (m) 
00198             : "rm" (x));
00199 #else
00200     union
00201     {
00202         FXulong l;
00203         FXuint i[2];
00204     } _x;
00205     _x.l=x;
00206     m=fxbitscan(_x.i[!FOX_BIGENDIAN]);
00207     if(32==m) m=32+fxbitscan(_x.i[!!FOX_BIGENDIAN]);
00208 #endif
00209     return (FXuint) m;
00210 }
00211 inline FXuint fxbitscanrev(FXuint x) throw()
00212 {
00213     FXuint m;
00214     __asm__("bsrl %1,%0\n\t"
00215             : "=r" (m) 
00216             : "rm" (x));
00217     return m;
00218 }
00219 inline FXuint fxbitscanrev(FXulong x) throw()
00220 {
00221     FXulong m;
00222 #if defined(__x86_64__)
00223     __asm__("bsrq %1,%0\n\t"
00224             : "=r" (m) 
00225             : "rm" (x));
00226 #else
00227     union
00228     {
00229         FXulong l;
00230         FXuint i[2];
00231     } _x;
00232     _x.l=x;
00233     m=32+fxbitscanrev(_x.i[!!FOX_BIGENDIAN]);
00234     if(64==m) { m=fxbitscanrev(_x.i[!FOX_BIGENDIAN]); if(32==m) m=64; }
00235 #endif
00236     return (FXuint) m;
00237 }
00238 inline void fxendianswap(FXushort &v) throw()
00239 {   // Can't improve on this
00240     v=((v & 0xff)<<8)|(v>>8);
00241 }
00242 inline void fxendianswap(FXuint &v) throw()
00243 {
00244     __asm__("bswapl %0\n\t"
00245             : "=r" (v)
00246             : "0"  (v));
00247 }
00248 inline void fxendianswap(FXulong &v) throw()
00249 {
00250 #if defined(__x86_64__)
00251     __asm__("bswapq %0\n\t"
00252             : "=r" (v)
00253             : "0"  (v));
00254 #else
00255     __asm__("bswapl %%eax\n\t"
00256             "bswapl %%edx\n\t"
00257             "movl %%eax, %%ecx\n\t"
00258             "movl %%edx, %%eax\n\t"
00259             "movl %%ecx, %%edx\n\t"
00260             : "=A" (v)
00261             : "0"  (v)
00262             : "%ecx");
00263 #endif
00264 }
00265 #else
00266 namespace FX {
00267 
/*! Prefetch hint for memory at \c ptr. This fallback has no prefetch
instruction available, so it deliberately does nothing. */
inline void fxprefetchmemT(const void *ptr) throw()
{
    (void) ptr;     // intentionally unused
}
/*! Non-temporal prefetch hint for memory at \c ptr. This fallback has no
prefetch instruction available, so it deliberately does nothing. */
inline void fxprefetchmemNT(const void *ptr) throw()
{
    (void) ptr;     // intentionally unused
}
00284 inline FXuint fxbitscan(FXuint x) throw()
00285 {
00286    x = ~x & (x - 1);
00287    x = x - ((x >> 1) & 0x55555555);
00288    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
00289    x = (x + (x >> 4)) & 0x0F0F0F0F;
00290    x = x + (x << 8);
00291    x = x + (x << 16);
00292    return x >> 24;
00293 }
00294 inline FXuint fxbitscan(FXulong x) throw()
00295 {
00296     FXuint m;
00297     union
00298     {
00299         FXulong l;
00300         FXuint i[2];
00301     } _x;
00302     _x.l=x;
00303     m=fxbitscan(_x.i[!FOX_BIGENDIAN]);
00304     if(32==m) m=32+fxbitscan(_x.i[!!FOX_BIGENDIAN]);
00305     return m;
00306 }
00315 inline FXuint fxbitscanrev(FXuint x) throw()
00316 {
00317 #if 1
00318     union {
00319         unsigned asInt[2];
00320         double asDouble;
00321     };
00322     int n;
00323 
00324     asDouble = (double)x + 0.5;
00325     n = (asInt[!FOX_BIGENDIAN] >> 20) - 1023;
00326     return n;
00327 #else
00328     x = x | (x >> 1);
00329     x = x | (x >> 2);
00330     x = x | (x >> 4);
00331     x = x | (x >> 8);
00332     x = x | (x >>16);
00333     x = ~x;
00334     x = x - ((x >> 1) & 0x55555555);
00335     x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
00336     x = (x + (x >> 4)) & 0x0F0F0F0F;
00337     x = x + (x << 8);
00338     x = x + (x << 16);
00339     return x >> 24;
00340 #endif
00341 }
00342 inline FXuint fxbitscanrev(FXulong x) throw()
00343 {
00344     FXuint m;
00345     union
00346     {
00347         FXulong l;
00348         FXuint i[2];
00349     } _x;
00350     _x.l=x;
00351     m=32+fxbitscanrev(_x.i[!!FOX_BIGENDIAN]);
00352     if(64==m) { m=fxbitscanrev(_x.i[!FOX_BIGENDIAN]); if(32==m) m=64; }
00353     return m;
00354 }
00355 
00359 inline void fxendianswap(FXushort &v) throw()
00360 {   // Can't improve on this
00361     v=((v & 0xff)<<8)|(v>>8);
00362 }
00363 
00367 inline void fxendianswap(FXuint &v) throw()
00368 {
00369     FXuchar *p=(FXuchar *) &v, t;
00370     t=p[0]; p[0]=p[3]; p[3]=t;
00371     t=p[1]; p[1]=p[2]; p[2]=t;
00372 }
00373 
00377 inline void fxendianswap(FXulong &v) throw()
00378 {
00379     FXuchar *p=(FXuchar *) &v, t;
00380     t=p[0]; p[0]=p[7]; p[7]=t;
00381     t=p[1]; p[1]=p[6]; p[6]=t;
00382     t=p[2]; p[2]=p[5]; p[5]=t;
00383     t=p[3]; p[3]=p[4]; p[4]=t;
00384 }
00385 #endif
00386 
00387 } // namespace
00388 
00389 #endif

(C) 2002-2009 Niall Douglas. Some parts (C) to assorted authors.
Generated on Fri Nov 20 18:31:19 2009 for TnFOX by doxygen v1.4.7