mdcore: /home/pedro/work/mdcore/src/fptype.h Source File

Go to the documentation of this file.
00001 /*******************************************************************************
00002  * This file is part of mdcore.
00003  * Coypright (c) 2010 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
00004  * 
00005  * This program is free software: you can redistribute it and/or modify
00006  * it under the terms of the GNU Lesser General Public License as published
00007  * by the Free Software Foundation, either version 3 of the License, or
00008  * (at your option) any later version.
00009  * 
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  * 
00015  * You should have received a copy of the GNU Lesser General Public License
00016  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
00017  * 
00018  ******************************************************************************/
00019 
00020 /* Global defines. */
00021 #ifndef FPTYPE_DEFINED
00022     #ifdef FPTYPE_SINGLE
00023 
00024         typedef float FPTYPE;
00025         #define FPTYPE_EPSILON FLT_EPSILON
00026         #define FPTYPE_ONE 1.0f
00027         #define FPTYPE_ZERO 0.0f
00028         #define FPTYPE_TWO 2.0f
00029         #define FPTYPE_SQRT sqrtf
00030         #define FPTYPE_FMAX fmaxf
00031         #define FPTYPE_FMIN fminf
00032     #else
00033 
00034         typedef double FPTYPE;
00035         #define FPTYPE_EPSILON DBL_EPSILON
00036         #define FPTYPE_DOUBLE
00037         #define FPTYPE_ONE 1.0
00038         #define FPTYPE_TWO 2.0
00039         #define FPTYPE_ZERO 0.0
00040         #define FPTYPE_SQRT sqrt
00041         #define FPTYPE_FMAX fmax
00042         #define FPTYPE_FMIN fmin
00043     #endif
00044     #define FPTYPE_DEFINED
00045 #endif
00046 
00047 
00048 /* Get the inlining right. */
00049 #ifndef INLINE
00050 # if __GNUC__ && !__GNUC_STDC_INLINE__
00051 #  define INLINE extern inline
00052 # else
00053 #  define INLINE inline
00054 # endif
00055 #endif
00056 
00057     
00058 /* Define some macros for single/double precision vector operations. */
00059 #if ( defined(__AVX__) && defined(FPTYPE_SINGLE) )
00060     #define VEC_SINGLE
00061     #define VEC_SIZE 8
00062     #define VECTORIZE
00063 #elif ( (defined(__SSE__) || defined(__ALTIVEC__)) && defined(FPTYPE_SINGLE) )
00064     #define VEC_SINGLE
00065     #define VEC_SIZE 4
00066     #define VECTORIZE
00067 #elif ( (defined(__SSE2__) || defined(__AVX__)) && defined(FPTYPE_DOUBLE) )
00068     #define VEC_DOUBLE
00069     #define VEC_SIZE 4
00070     #define VECTORIZE
00071 #endif
00072 
00073 
00074 /* Get headers for intrinsic functions. */
00075 #ifdef __SSE__
00076     #include <xmmintrin.h>
00077 #endif
00078 #ifdef __SSE2__
00079     #include <emmintrin.h>
00080 #endif
00081 #ifdef __SSE3__
00082     #include <pmmintrin.h>
00083 #endif
00084 #ifdef __SSE4_1__
00085     #include <smmintrin.h>
00086 #endif
00087 
00089 #define vector(elcount, type)  __attribute__((vector_size((elcount)*sizeof(type)))) type
00090 
00091 
00092 /* Some extra functions function for Alti-Vec instruction set. */
00093 #ifdef __ALTIVEC__
00094     #include <altivec.h>
00095     __attribute__ ((always_inline)) INLINE vector float vec_sqrt( vector float a ) {
00096         vector float z = ( vector float ){ 0.0f };
00097         vector float estimate = vec_rsqrte( a );
00098         vector float estimateSquared = vec_madd( estimate, estimate, z );
00099         vector float halfEstimate = vec_madd( estimate, (vector float){0.5}, z );
00100         return vec_madd( a, vec_madd( vec_nmsub( a, estimateSquared, (vector float){1.0} ), halfEstimate, estimate ), z);
00101         }
00102     /* inline static vector float vec_load4 ( float a , float b , float c , float d ) {
00103         return vec_mergeh( vec_mergeh( vec_promote(a,0) , vec_promote(c,0) ) , vec_mergeh( vec_promote(b,0) , vec_promote(d,0) ) );
00104         } */
00105     #define vec_load4(a,b,c,d) vec_mergeh( vec_mergeh( vec_promote((a),0) , vec_promote((c),0) ) , vec_mergeh( vec_promote((b),0) , vec_promote((d),0) ) )
00106     #define vec_mul(a,b) vec_madd((a),(b),(vector float){0.0f})
00107 #endif
00108 
00109 
00123 __attribute__ ((always_inline)) INLINE FPTYPE fptype_r2 ( FPTYPE *x1 , FPTYPE *x2 , FPTYPE *dx ) {
00124 
00125 #if defined(FPTYPE_SINGLE) && defined(__SSE4_1__)
00126     union {
00127         vector(4,float) v;
00128         float f[4];
00129         } a, b, c, d;
00130         
00131     /* Load x1 and x2 into a and b. */
00132     a.v = _mm_load_ps( x1 );
00133     b.v = _mm_load_ps( x2 );
00134     
00135     /* Compute the difference and store in dx. */
00136     c.v = a.v - b.v;
00137     _mm_store_ps( dx , c.v );
00138     
00139     /* Use the built-in dot-product instruction. */
00140     d.v = _mm_dp_ps( c.v , c.v , 0x71 );
00141     
00142     /* Return the sum of squares. */
00143     return d.f[0];
00144 #elif defined(FPTYPE_SINGLE) && defined(__SSE3__)
00145     union {
00146         vector(4,float) v;
00147         float f[4];
00148         } a, b, c, d;
00149         
00150     /* Load x1 and x2 into a and b. */
00151     a.v = _mm_load_ps( x1 );
00152     b.v = _mm_load_ps( x2 );
00153     
00154     /* Compute the difference and store in dx. */
00155     c.v = a.v - b.v;
00156     _mm_store_ps( dx , c.v );
00157     
00158     /* Square the entries (use a different register so that c can be stored). */
00159     d.v = c.v * c.v;
00160     
00161     /* Add horizontally twice to get the sum of the four entries
00162        in the lowest float. */
00163     d.v = _mm_hadd_ps( d.v , d.v );
00164     d.v = _mm_hadd_ps( d.v , d.v );
00165     
00166     /* Return the sum of squares. */
00167     return d.f[0];
00168 #elif defined(FPTYPE_DOUBLE) && defined(__AVX__)
00169     union {
00170         vector(4,double) v;
00171         double f[4];
00172         } a, b, c, d;
00173         
00174     /* Load x1 and x2 into a and b. */
00175     a.v = _m256_mm_load_pd( x1 );
00176     b.v = _m256_mm_load_pd( x2 );
00177     
00178     /* Compute the difference and store in dx. */
00179     c.v = a.v - b.v;
00180     _mm256_store_pd( dx , c.v );
00181     
00182     /* Square the entries (use a different register so that c can be stored). */
00183     d.v = c.v * c.v;
00184     
00185     /* Add horizontally twice to get the sum of the four entries
00186        in the lowest double. */
00187     d.v = _mm256_hadd_ps( d.v , d.v );
00188     d.v = _mm256_hadd_ps( d.v , d.v );
00189     
00190     /* Return the sum of squares. */
00191     return d.f[0];
00192 #elif defined(FPTYPE_DOUBLE) && defined(__SSE4_1__)
00193     union {
00194         vector(2,double) v;
00195         double f[2];
00196         } a1, a2, b1, b2, c1, c2, d1, d2;
00197         
00198     /* Load x1 and x2 into a and b. */
00199     a1.v = _mm_load_pd( x1 );
00200     b1.v = _mm_load_pd( x2 );
00201     a2.v = _mm_load_pd( &x1[2] );
00202     b2.v = _mm_load_pd( &x2[2] );
00203     
00204     /* Compute the difference and store in dx. */
00205     c1.v = a1.v - b1.v;
00206     c2.v = a2.v - b2.v;
00207     _mm_store_pd( dx , c1.v );
00208     _mm_store_pd( &dx[2] , c1.v );
00209     
00210     /* Use the built-in dot-product instruction. */
00211     d1.v = _mm_dp_pd( c1.v , c1.v , 1+4+8 );
00212     d2.v = c2.v * c2.v;
00213     
00214     /* Return the sum of squares. */
00215     return d1.f[0] + d2.f[0];
00216 #elif defined(FPTYPE_DOUBLE) && defined(__SSE3__)
00217     union {
00218         vector(2,double) v;
00219         double f[2];
00220         } a1, a2, b1, b2, c1, c2, d1, d2;
00221         
00222     /* Load x1 and x2 into a and b. */
00223     a1.v = _mm_load_pd( x1 );
00224     b1.v = _mm_load_pd( x2 );
00225     a2.v = _mm_load_pd( &x1[2] );
00226     b2.v = _mm_load_pd( &x2[2] );
00227     
00228     /* Compute the difference and store in dx. */
00229     c1.v = a1.v - b1.v;
00230     c2.v = a2.v - b2.v;
00231     _mm_store_pd( dx , c1.v );
00232     _mm_store_pd( &dx[2] , c1.v );
00233     
00234     /* Square the entries (use a different register so that c can be stored). */
00235     d1.v = c1.v * c1.v;
00236     d2.v = c2.v * c2.v;
00237     
00238     /* Add horizontally twice to get the sum of the four entries
00239        in the lowest double. */
00240     d1.v = _mm_hadd_pd( d1.v , d2.v );
00241     d1.v = _mm_hadd_pd( d1.v , d1.v );
00242     
00243     /* Return the sum of squares. */
00244     return d1.f[0];
00245 #else
00246     dx[0] = x1[0] - x2[0];
00247     dx[1] = x1[1] - x2[1];
00248     dx[2] = x1[2] - x2[2];
00249     return dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2];
00250 #endif
00251 
00252     }
00253     
00254