mdcore
0.1.5
|
00001 /******************************************************************************* 00002 * This file is part of mdcore. 00003 * Coypright (c) 2010 Pedro Gonnet (pedro.gonnet@durham.ac.uk) 00004 * 00005 * This program is free software: you can redistribute it and/or modify 00006 * it under the terms of the GNU Lesser General Public License as published 00007 * by the Free Software Foundation, either version 3 of the License, or 00008 * (at your option) any later version. 00009 * 00010 * This program is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 * GNU General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU Lesser General Public License 00016 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00017 * 00018 ******************************************************************************/ 00019 00020 /* Global defines. */ 00021 #ifndef FPTYPE_DEFINED 00022 #ifdef FPTYPE_SINGLE 00023 00024 typedef float FPTYPE; 00025 #define FPTYPE_EPSILON FLT_EPSILON 00026 #define FPTYPE_ONE 1.0f 00027 #define FPTYPE_ZERO 0.0f 00028 #define FPTYPE_TWO 2.0f 00029 #define FPTYPE_SQRT sqrtf 00030 #define FPTYPE_FMAX fmaxf 00031 #define FPTYPE_FMIN fminf 00032 #else 00033 00034 typedef double FPTYPE; 00035 #define FPTYPE_EPSILON DBL_EPSILON 00036 #define FPTYPE_DOUBLE 00037 #define FPTYPE_ONE 1.0 00038 #define FPTYPE_TWO 2.0 00039 #define FPTYPE_ZERO 0.0 00040 #define FPTYPE_SQRT sqrt 00041 #define FPTYPE_FMAX fmax 00042 #define FPTYPE_FMIN fmin 00043 #endif 00044 #define FPTYPE_DEFINED 00045 #endif 00046 00047 00048 /* Get the inlining right. */ 00049 #ifndef INLINE 00050 # if __GNUC__ && !__GNUC_STDC_INLINE__ 00051 # define INLINE extern inline 00052 # else 00053 # define INLINE inline 00054 # endif 00055 #endif 00056 00057 00058 /* Define some macros for single/double precision vector operations. */ 00059 #if ( defined(__AVX__) && defined(FPTYPE_SINGLE) ) 00060 #define VEC_SINGLE 00061 #define VEC_SIZE 8 00062 #define VECTORIZE 00063 #elif ( (defined(__SSE__) || defined(__ALTIVEC__)) && defined(FPTYPE_SINGLE) ) 00064 #define VEC_SINGLE 00065 #define VEC_SIZE 4 00066 #define VECTORIZE 00067 #elif ( (defined(__SSE2__) || defined(__AVX__)) && defined(FPTYPE_DOUBLE) ) 00068 #define VEC_DOUBLE 00069 #define VEC_SIZE 4 00070 #define VECTORIZE 00071 #endif 00072 00073 00074 /* Get headers for intrinsic functions. */ 00075 #ifdef __SSE__ 00076 #include <xmmintrin.h> 00077 #endif 00078 #ifdef __SSE2__ 00079 #include <emmintrin.h> 00080 #endif 00081 #ifdef __SSE3__ 00082 #include <pmmintrin.h> 00083 #endif 00084 #ifdef __SSE4_1__ 00085 #include <smmintrin.h> 00086 #endif 00087 00089 #define vector(elcount, type) __attribute__((vector_size((elcount)*sizeof(type)))) type 00090 00091 00092 /* Some extra functions function for Alti-Vec instruction set. */ 00093 #ifdef __ALTIVEC__ 00094 #include <altivec.h> 00095 __attribute__ ((always_inline)) INLINE vector float vec_sqrt( vector float a ) { 00096 vector float z = ( vector float ){ 0.0f }; 00097 vector float estimate = vec_rsqrte( a ); 00098 vector float estimateSquared = vec_madd( estimate, estimate, z ); 00099 vector float halfEstimate = vec_madd( estimate, (vector float){0.5}, z ); 00100 return vec_madd( a, vec_madd( vec_nmsub( a, estimateSquared, (vector float){1.0} ), halfEstimate, estimate ), z); 00101 } 00102 /* inline static vector float vec_load4 ( float a , float b , float c , float d ) { 00103 return vec_mergeh( vec_mergeh( vec_promote(a,0) , vec_promote(c,0) ) , vec_mergeh( vec_promote(b,0) , vec_promote(d,0) ) ); 00104 } */ 00105 #define vec_load4(a,b,c,d) vec_mergeh( vec_mergeh( vec_promote((a),0) , vec_promote((c),0) ) , vec_mergeh( vec_promote((b),0) , vec_promote((d),0) ) ) 00106 #define vec_mul(a,b) vec_madd((a),(b),(vector float){0.0f}) 00107 #endif 00108 00109 00123 __attribute__ ((always_inline)) INLINE FPTYPE fptype_r2 ( FPTYPE *x1 , FPTYPE *x2 , FPTYPE *dx ) { 00124 00125 #if defined(FPTYPE_SINGLE) && defined(__SSE4_1__) 00126 union { 00127 vector(4,float) v; 00128 float f[4]; 00129 } a, b, c, d; 00130 00131 /* Load x1 and x2 into a and b. */ 00132 a.v = _mm_load_ps( x1 ); 00133 b.v = _mm_load_ps( x2 ); 00134 00135 /* Compute the difference and store in dx. */ 00136 c.v = a.v - b.v; 00137 _mm_store_ps( dx , c.v ); 00138 00139 /* Use the built-in dot-product instruction. */ 00140 d.v = _mm_dp_ps( c.v , c.v , 0x71 ); 00141 00142 /* Return the sum of squares. */ 00143 return d.f[0]; 00144 #elif defined(FPTYPE_SINGLE) && defined(__SSE3__) 00145 union { 00146 vector(4,float) v; 00147 float f[4]; 00148 } a, b, c, d; 00149 00150 /* Load x1 and x2 into a and b. */ 00151 a.v = _mm_load_ps( x1 ); 00152 b.v = _mm_load_ps( x2 ); 00153 00154 /* Compute the difference and store in dx. */ 00155 c.v = a.v - b.v; 00156 _mm_store_ps( dx , c.v ); 00157 00158 /* Square the entries (use a different register so that c can be stored). */ 00159 d.v = c.v * c.v; 00160 00161 /* Add horizontally twice to get the sum of the four entries 00162 in the lowest float. */ 00163 d.v = _mm_hadd_ps( d.v , d.v ); 00164 d.v = _mm_hadd_ps( d.v , d.v ); 00165 00166 /* Return the sum of squares. */ 00167 return d.f[0]; 00168 #elif defined(FPTYPE_DOUBLE) && defined(__AVX__) 00169 union { 00170 vector(4,double) v; 00171 double f[4]; 00172 } a, b, c, d; 00173 00174 /* Load x1 and x2 into a and b. */ 00175 a.v = _m256_mm_load_pd( x1 ); 00176 b.v = _m256_mm_load_pd( x2 ); 00177 00178 /* Compute the difference and store in dx. */ 00179 c.v = a.v - b.v; 00180 _mm256_store_pd( dx , c.v ); 00181 00182 /* Square the entries (use a different register so that c can be stored). */ 00183 d.v = c.v * c.v; 00184 00185 /* Add horizontally twice to get the sum of the four entries 00186 in the lowest double. */ 00187 d.v = _mm256_hadd_ps( d.v , d.v ); 00188 d.v = _mm256_hadd_ps( d.v , d.v ); 00189 00190 /* Return the sum of squares. */ 00191 return d.f[0]; 00192 #elif defined(FPTYPE_DOUBLE) && defined(__SSE4_1__) 00193 union { 00194 vector(2,double) v; 00195 double f[2]; 00196 } a1, a2, b1, b2, c1, c2, d1, d2; 00197 00198 /* Load x1 and x2 into a and b. */ 00199 a1.v = _mm_load_pd( x1 ); 00200 b1.v = _mm_load_pd( x2 ); 00201 a2.v = _mm_load_pd( &x1[2] ); 00202 b2.v = _mm_load_pd( &x2[2] ); 00203 00204 /* Compute the difference and store in dx. */ 00205 c1.v = a1.v - b1.v; 00206 c2.v = a2.v - b2.v; 00207 _mm_store_pd( dx , c1.v ); 00208 _mm_store_pd( &dx[2] , c1.v ); 00209 00210 /* Use the built-in dot-product instruction. */ 00211 d1.v = _mm_dp_pd( c1.v , c1.v , 1+4+8 ); 00212 d2.v = c2.v * c2.v; 00213 00214 /* Return the sum of squares. */ 00215 return d1.f[0] + d2.f[0]; 00216 #elif defined(FPTYPE_DOUBLE) && defined(__SSE3__) 00217 union { 00218 vector(2,double) v; 00219 double f[2]; 00220 } a1, a2, b1, b2, c1, c2, d1, d2; 00221 00222 /* Load x1 and x2 into a and b. */ 00223 a1.v = _mm_load_pd( x1 ); 00224 b1.v = _mm_load_pd( x2 ); 00225 a2.v = _mm_load_pd( &x1[2] ); 00226 b2.v = _mm_load_pd( &x2[2] ); 00227 00228 /* Compute the difference and store in dx. */ 00229 c1.v = a1.v - b1.v; 00230 c2.v = a2.v - b2.v; 00231 _mm_store_pd( dx , c1.v ); 00232 _mm_store_pd( &dx[2] , c1.v ); 00233 00234 /* Square the entries (use a different register so that c can be stored). */ 00235 d1.v = c1.v * c1.v; 00236 d2.v = c2.v * c2.v; 00237 00238 /* Add horizontally twice to get the sum of the four entries 00239 in the lowest double. */ 00240 d1.v = _mm_hadd_pd( d1.v , d2.v ); 00241 d1.v = _mm_hadd_pd( d1.v , d1.v ); 00242 00243 /* Return the sum of squares. */ 00244 return d1.f[0]; 00245 #else 00246 dx[0] = x1[0] - x2[0]; 00247 dx[1] = x1[1] - x2[1]; 00248 dx[2] = x1[2] - x2[2]; 00249 return dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2]; 00250 #endif 00251 00252 } 00253 00254