mdcore
0.1.5
|
00001 00002 /******************************************************************************* 00003 * This file is part of mdcore. 00004 * Coypright (c) 2010 Pedro Gonnet (pedro.gonnet@durham.ac.uk) 00005 * 00006 * This program is free software: you can redistribute it and/or modify 00007 * it under the terms of the GNU Lesser General Public License as published 00008 * by the Free Software Foundation, either version 3 of the License, or 00009 * (at your option) any later version. 00010 * 00011 * This program is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 * GNU General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public License 00017 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00018 * 00019 ******************************************************************************/ 00020 00021 00022 /* This file contains the potential evaluation function als "extern inline", 00023 such that they can be inlined in the respective modules. 00024 00025 If your code wants to call any potential_eval functions, you must include 00026 this file. 00027 */ 00028 00029 /* Function prototypes. */ 00030 /* void potential_eval ( struct potential *p , FPTYPE r2 , FPTYPE *e , FPTYPE *f ); 00031 void potential_eval_expl ( struct potential *p , FPTYPE r2 , FPTYPE *e , FPTYPE *f ); 00032 void potential_eval_vec_4single ( struct potential *p[4] , float *r2 , float *e , float *f ); 00033 void potential_eval_vec_4single_r ( struct potential *p[4] , float *r_in , float *e , float *f ); 00034 void potential_eval_vec_8single ( struct potential *p[4] , float *r2 , float *e , float *f ); 00035 void potential_eval_vec_2double ( struct potential *p[4] , FPTYPE *r2 , FPTYPE *e , FPTYPE *f ); 00036 void potential_eval_vec_4double ( struct potential *p[4] , FPTYPE *r2 , FPTYPE *e , FPTYPE *f ); 00037 void potential_eval_vec_4double_r ( struct potential *p[4] , FPTYPE *r , FPTYPE *e , FPTYPE *f ); 00038 void potential_eval_r ( struct potential *p , FPTYPE r , FPTYPE *e , FPTYPE *f ); 00039 */ 00040 00041 00042 /* Get the inlining right. */ 00043 #ifndef INLINE 00044 # if __GNUC__ && !__GNUC_STDC_INLINE__ 00045 # define INLINE extern inline 00046 # else 00047 # define INLINE inline 00048 # endif 00049 #endif 00050 00051 00067 __attribute__ ((always_inline)) INLINE void potential_eval ( struct potential *p , FPTYPE r2 , FPTYPE *e , FPTYPE *f ) { 00068 00069 int ind, k; 00070 FPTYPE x, ee, eff, *c, r; 00071 00072 /* Get r for the right type. */ 00073 r = FPTYPE_SQRT(r2); 00074 00075 /* is r in the house? */ 00076 /* if ( r < p->a || r > p->b ) 00077 printf("potential_eval: requested potential at r=%e, not in [%e,%e].\n",r,p->a,p->b); */ 00078 00079 /* compute the index */ 00080 ind = FPTYPE_FMAX( FPTYPE_ZERO , p->alpha[0] + r * (p->alpha[1] + r * p->alpha[2]) ); 00081 00082 /* if ( ind > p->n ) { 00083 printf("potential_eval: r=%.18e.\n",r); 00084 fflush(stdout); 00085 } */ 00086 00087 /* get the table offset */ 00088 c = &(p->c[ind * potential_chunk]); 00089 00090 /* adjust x to the interval */ 00091 x = (r - c[0]) * c[1]; 00092 00093 /* compute the potential and its derivative */ 00094 ee = c[2] * x + c[3]; 00095 eff = c[2]; 00096 for ( k = 4 ; k < potential_chunk ; k++ ) { 00097 eff = eff * x + ee; 00098 ee = ee * x + c[k]; 00099 } 00100 00101 /* store the result */ 00102 *e = ee; *f = eff * c[1] / r; 00103 00104 } 00105 00106 00122 __attribute__ ((always_inline)) INLINE void potential_eval_r ( struct potential *p , FPTYPE r , FPTYPE *e , FPTYPE *f ) { 00123 00124 int ind, k; 00125 FPTYPE x, ee, eff, *c; 00126 00127 /* compute the index */ 00128 ind = FPTYPE_FMAX( FPTYPE_ZERO , p->alpha[0] + r * (p->alpha[1] + r * p->alpha[2] ) ); 00129 00130 /* is r in the house? */ 00131 /* if ( ind > p->n ) 00132 printf("potential_eval_r: requested potential at r=%e (ind=%.8e), not in [%e,%e].\n",r , p->alpha[0] + r * (p->alpha[1] + r * (p->alpha[2] + r * p->alpha[3])),p->a,p->b); */ 00133 00134 /* get the table offset */ 00135 c = &(p->c[ind * potential_chunk]); 00136 00137 /* adjust x to the interval */ 00138 x = (r - c[0]) * c[1]; 00139 00140 /* compute the potential and its derivative */ 00141 ee = c[2] * x + c[3]; 00142 eff = c[2]; 00143 for ( k = 4 ; k < potential_chunk ; k++ ) { 00144 eff = eff * x + ee; 00145 ee = ee * x + c[k]; 00146 } 00147 00148 /* store the result */ 00149 *e = ee; *f = eff * c[1]; 00150 00151 } 00152 00153 00175 __attribute__ ((always_inline)) INLINE void potential_eval_expl ( struct potential *p , FPTYPE r2 , FPTYPE *e , FPTYPE *f ) { 00176 00177 const FPTYPE isqrtpi = 0.56418958354775628695; 00178 const FPTYPE kappa = 3.0; 00179 FPTYPE r = sqrt(r2), ir = 1.0 / r, ir2 = ir * ir, ir4, ir6, ir12, t1, t2; 00180 FPTYPE ee = 0.0, eff = 0.0; 00181 00182 /* Do we have a Lennard-Jones interaction? */ 00183 if ( p->flags & potential_flag_LJ126 ) { 00184 00185 /* init some variables */ 00186 ir4 = ir2 * ir2; ir6 = ir4 * ir2; ir12 = ir6 * ir6; 00187 00188 /* compute the energy and the force */ 00189 ee = ( p->alpha[0] * ir12 - p->alpha[1] * ir6 ); 00190 eff = 6.0 * ir * ( -2.0 * p->alpha[0] * ir12 + p->alpha[1] * ir6 ); 00191 00192 } 00193 00194 /* Do we have an Ewald short-range part? */ 00195 if ( p->flags & potential_flag_Ewald ) { 00196 00197 /* get some values we will re-use */ 00198 t2 = r * kappa; 00199 t1 = erfc( t2 ); 00200 00201 /* compute the energy and the force */ 00202 ee += p->alpha[2] * t1 * ir; 00203 eff += p->alpha[2] * ( -2.0 * isqrtpi * exp( -t2 * t2 ) * kappa * ir - t1 * ir2 ); 00204 00205 } 00206 00207 /* Do we have a Coulomb interaction? */ 00208 if ( p->flags & potential_flag_Coulomb ) { 00209 00210 /* compute the energy and the force */ 00211 ee += potential_escale * p->alpha[2] * ir; 00212 eff += -potential_escale * p->alpha[2] * ir2; 00213 00214 } 00215 00216 /* store the potential and force. */ 00217 *e = ee; 00218 *f = eff; 00219 00220 } 00221 00222 00246 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4single ( struct potential *p[4] , float *r2 , float *e , float *f ) { 00247 00248 #if defined(__SSE__) && defined(FPTYPE_SINGLE) 00249 // int j, k; 00250 union { 00251 __v4sf v; 00252 __m128i m; 00253 float f[4]; 00254 int i[4]; 00255 } alpha[4], mi, hi, x, ee, eff, c[6], r, ind, t[8]; 00256 // float *data[4]; 00257 00258 /* Get r . */ 00259 r.v = _mm_sqrt_ps( _mm_load_ps( r2 ) ); 00260 00261 /* compute the index */ 00262 alpha[0].v = _mm_load_ps( p[0]->alpha ); 00263 alpha[1].v = _mm_load_ps( p[1]->alpha ); 00264 alpha[2].v = _mm_load_ps( p[2]->alpha ); 00265 alpha[3].v = _mm_load_ps( p[3]->alpha ); 00266 t[0].m = _mm_unpacklo_epi32( alpha[0].m , alpha[1].m ); 00267 t[1].m = _mm_unpacklo_epi32( alpha[2].m , alpha[3].m ); 00268 t[2].m = _mm_unpackhi_epi32( alpha[0].m , alpha[1].m ); 00269 t[3].m = _mm_unpackhi_epi32( alpha[2].m , alpha[3].m ); 00270 alpha[0].m = _mm_unpacklo_epi64( t[0].m , t[1].m ); 00271 alpha[1].m = _mm_unpackhi_epi64( t[0].m , t[1].m ); 00272 alpha[2].m = _mm_unpacklo_epi64( t[2].m , t[3].m ); 00273 ind.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha[0].v , _mm_mul_ps( r.v , _mm_add_ps( alpha[1].v , _mm_mul_ps( r.v , alpha[2].v ) ) ) ) ) ); 00274 00275 /* Check ranges. */ 00276 /* for ( j = 0 ; j < 4 ; j++ ) 00277 if ( ind.i[j] == 0 || ind.i[j] > p[j]->n ) { 00278 printf( "potential_eval_vec_4single: r=%.9e (n=%.9e/%i) is not in [%.9e,%.9e].\n" , 00279 r.f[j] , p[j]->alpha[0] + r.f[j]*(p[j]->alpha[1] + r.f[j]*p[j]->alpha[2]) , 00280 p[j]->n , p[j]->a , p[j]->b ); 00281 fflush(stdout); 00282 } */ 00283 00284 /* Unpack/transpose the coefficient data. */ 00285 mi.v = _mm_load_ps( &p[0]->c[ ind.i[0] * potential_chunk ] ); 00286 hi.v = _mm_load_ps( &p[1]->c[ ind.i[1] * potential_chunk ] ); 00287 c[0].v = _mm_load_ps( &p[2]->c[ ind.i[2] * potential_chunk ] ); 00288 c[1].v = _mm_load_ps( &p[3]->c[ ind.i[3] * potential_chunk ] ); 00289 _MM_TRANSPOSE4_PS( mi.v , hi.v , c[0].v , c[1].v ); 00290 c[2].v = _mm_load_ps( &p[0]->c[ ind.i[0] * potential_chunk + 4 ] ); 00291 c[3].v = _mm_load_ps( &p[1]->c[ ind.i[1] * potential_chunk + 4 ] ); 00292 c[4].v = _mm_load_ps( &p[2]->c[ ind.i[2] * potential_chunk + 4 ] ); 00293 c[5].v = _mm_load_ps( &p[3]->c[ ind.i[3] * potential_chunk + 4 ] ); 00294 _MM_TRANSPOSE4_PS( c[2].v , c[3].v , c[4].v , c[5].v ); 00295 00296 /* adjust x to the interval */ 00297 x.v = _mm_mul_ps( _mm_sub_ps( r.v , mi.v ) , hi.v ); 00298 00299 /* compute the potential and its derivative */ 00300 eff.v = c[0].v; 00301 ee.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , c[1].v ); 00302 eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v ); 00303 ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c[2].v ); 00304 eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v ); 00305 ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c[3].v ); 00306 eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v ); 00307 ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c[4].v ); 00308 eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v ); 00309 ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c[5].v ); 00310 00311 /* store the result */ 00312 _mm_store_ps( e , ee.v ); 00313 _mm_store_ps( f , _mm_mul_ps( eff.v , _mm_div_ps( hi.v , r.v ) ) ); 00314 00315 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE) 00316 int j, k; 00317 union { 00318 vector float v; 00319 float f[4]; 00320 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r; 00321 union { 00322 vector unsigned int v; 00323 unsigned int i[4]; 00324 } ind; 00325 float *data[4]; 00326 00327 /* Get r . */ 00328 r.v = vec_sqrt( *((vector float *)r2) ); 00329 00330 /* compute the index (vec_ctu maps negative floats to 0) */ 00331 alpha0.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 00332 alpha1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 00333 alpha2.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 00334 ind.v = vec_ctu( vec_madd( r.v , vec_madd( r.v , alpha2.v , alpha1.v ) , alpha0.v ) , 0 ); 00335 00336 /* get the table offset */ 00337 for ( k = 0 ; k < 4 ; k++ ) 00338 data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] ); 00339 00340 /* adjust x to the interval */ 00341 mi.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 00342 hi.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 00343 x.v = vec_mul( vec_sub( r.v , mi.v ) , hi.v ); 00344 00345 /* compute the potential and its derivative */ 00346 eff.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 00347 c.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 00348 ee.v = vec_madd( eff.v , x.v , c.v ); 00349 for ( j = 4 ; j < potential_chunk ; j++ ) { 00350 c.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 00351 eff.v = vec_madd( eff.v , x.v , ee.v ); 00352 ee.v = vec_madd( ee.v , x.v , c.v ); 00353 } 00354 00355 /* store the result */ 00356 *((vector float *)e) = ee.v; 00357 *((vector float *)f) = vec_mul( eff.v , vec_div( hi.v , r.v ) ); 00358 00359 #else 00360 int k; 00361 FPTYPE ee, eff; 00362 for ( k = 0 ; k < 4 ; k++ ) { 00363 potential_eval_r( p[k] , r2[k] , &ee , &eff ); 00364 e[k] = ee; f[k] = eff; 00365 } 00366 #endif 00367 00368 } 00369 00370 00394 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4single_old ( struct potential *p[4] , float *r2 , float *e , float *f ) { 00395 00396 #if defined(__SSE__) && defined(FPTYPE_SINGLE) 00397 int j, k; 00398 union { 00399 __v4sf v; 00400 __m128i m; 00401 float f[4]; 00402 int i[4]; 00403 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind; 00404 float *data[4]; 00405 00406 /* Get r . */ 00407 r.v = _mm_sqrt_ps( _mm_load_ps( r2 ) ); 00408 00409 /* compute the index */ 00410 alpha0.v = _mm_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 00411 alpha1.v = _mm_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 00412 alpha2.v = _mm_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 00413 ind.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0.v , _mm_mul_ps( r.v , _mm_add_ps( alpha1.v , _mm_mul_ps( r.v , alpha2.v ) ) ) ) ) ); 00414 00415 /* Check ranges. */ 00416 /* for ( j = 0 ; j < 4 ; j++ ) 00417 if ( ind.i[j] == 0 || ind.i[j] > p[j]->n ) { 00418 printf( "potential_eval_vec_4single: r=%.9e (n=%.9e/%i) is not in [%.9e,%.9e].\n" , 00419 r.f[j] , p[j]->alpha[0] + r.f[j]*(p[j]->alpha[1] + r.f[j]*p[j]->alpha[2]) , 00420 p[j]->n , p[j]->a , p[j]->b ); 00421 fflush(stdout); 00422 } */ 00423 00424 /* get the table offset */ 00425 for ( k = 0 ; k < 4 ; k++ ) 00426 data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] ); 00427 00428 /* adjust x to the interval */ 00429 mi.v = _mm_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 00430 hi.v = _mm_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 00431 x.v = _mm_mul_ps( _mm_sub_ps( r.v , mi.v ) , hi.v ); 00432 00433 /* compute the potential and its derivative */ 00434 eff.v = _mm_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 00435 c.v = _mm_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 00436 ee.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , c.v ); 00437 for ( j = 4 ; j < potential_chunk ; j++ ) { 00438 c.v = _mm_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 00439 eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v ); 00440 ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c.v ); 00441 } 00442 00443 /* store the result */ 00444 _mm_store_ps( e , ee.v ); 00445 _mm_store_ps( f , _mm_mul_ps( eff.v , _mm_div_ps( hi.v , r.v ) ) ); 00446 00447 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE) 00448 int j, k; 00449 union { 00450 vector float v; 00451 float f[4]; 00452 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r; 00453 union { 00454 vector unsigned int v; 00455 unsigned int i[4]; 00456 } ind; 00457 float *data[4]; 00458 00459 /* Get r . */ 00460 r.v = vec_sqrt( *((vector float *)r2) ); 00461 00462 /* compute the index (vec_ctu maps negative floats to 0) */ 00463 alpha0.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 00464 alpha1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 00465 alpha2.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 00466 ind.v = vec_ctu( vec_madd( r.v , vec_madd( r.v , alpha2.v , alpha1.v ) , alpha0.v ) , 0 ); 00467 00468 /* get the table offset */ 00469 for ( k = 0 ; k < 4 ; k++ ) 00470 data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] ); 00471 00472 /* adjust x to the interval */ 00473 mi.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 00474 hi.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 00475 x.v = vec_mul( vec_sub( r.v , mi.v ) , hi.v ); 00476 00477 /* compute the potential and its derivative */ 00478 eff.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 00479 c.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 00480 ee.v = vec_madd( eff.v , x.v , c.v ); 00481 for ( j = 4 ; j < potential_chunk ; j++ ) { 00482 c.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 00483 eff.v = vec_madd( eff.v , x.v , ee.v ); 00484 ee.v = vec_madd( ee.v , x.v , c.v ); 00485 } 00486 00487 /* store the result */ 00488 *((vector float *)e) = ee.v; 00489 *((vector float *)f) = vec_mul( eff.v , vec_div( hi.v , r.v ) ); 00490 00491 #else 00492 int k; 00493 FPTYPE ee, eff; 00494 for ( k = 0 ; k < 4 ; k++ ) { 00495 potential_eval_r( p[k] , r2[k] , &ee , &eff ); 00496 e[k] = ee; f[k] = eff; 00497 } 00498 #endif 00499 00500 } 00501 00502 00503 #ifdef STILL_NOT_READY_FOR_PRIME_TIME 00504 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4single_gccvec ( struct potential *p[4] , float *r2 , float *e , float *f ) { 00505 00506 int j, k; 00507 union { 00508 vector(4,float) v; 00509 float f[4]; 00510 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r; 00511 union { 00512 vector(4,int) v; 00513 int i[4]; 00514 } ind; 00515 FPTYPE *data[4]; 00516 00517 /* Get r . */ 00518 r.v = sqrtf( *( (vector(4,float) *)r2 ) ); 00519 00520 /* compute the index */ 00521 for ( k = 0 ; k < 4 ; k++ ) { 00522 alpha0.f[k] = p[k]->alpha[0]; 00523 alpha1.f[k] = p[k]->alpha[1]; 00524 alpha2.f[k] = p[k]->alpha[2]; 00525 } 00526 ind.v = max( (vector(4,int)){0,0,0,0} , (vector(4,int))( alpha0.v + r.v*( alpha1.v + r.v*alpha2.v ) ) ); 00527 00528 /* Check ranges. */ 00529 /* for ( j = 0 ; j < 4 ; j++ ) 00530 if ( ind.i[j] == 0 || ind.i[j] > p[j]->n ) { 00531 printf( "potential_eval_vec_4single: r=%.9e (n=%.9e/%i) is not in [%.9e,%.9e].\n" , 00532 r.f[j] , p[j]->alpha[0] + r.f[j]*(p[j]->alpha[1] + r.f[j]*p[j]->alpha[2]) , 00533 p[j]->n , p[j]->a , p[j]->b ); 00534 fflush(stdout); 00535 } */ 00536 00537 /* get the table offset */ 00538 for ( k = 0 ; k < 4 ; k++ ) 00539 data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] ); 00540 00541 /* adjust x to the interval */ 00542 for ( k = 0 ; k < 4 ; k++ ) { 00543 mi.f[k] = data[k][0]; 00544 hi.f[k] = data[k][1]; 00545 } 00546 x.v = ( r.v - mi.v ) * hi.v; 00547 00548 /* compute the potential and its derivative */ 00549 for ( k = 0 ; k < 4 ; k++ ) { 00550 eff.f[k] = data[k][2]; 00551 c.f[k] = data[k][3]; 00552 } 00553 ee.v = eff.v*x.v + c.v; 00554 for ( j = 4 ; j < potential_chunk ; j++ ) { 00555 for ( k = 0 ; k < 4 ; k++ ) 00556 c.f[k] = data[k][j]; 00557 eff.v = eff.v*x.v + ee.v; 00558 ee.v = ee.v*x.v + c.v; 00559 } 00560 00561 /* store the result */ 00562 *( (vector(4,float) *)e ) = ee.v; 00563 *( (vector(4,float) *)f ) = eff.v*( hi.v / r.v ); 00564 00565 } 00566 #endif 00567 00568 00592 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4single_r ( struct potential *p[4] , float *r_in , float *e , float *f ) { 00593 00594 #if defined(__SSE__) && defined(FPTYPE_SINGLE) 00595 int j, k; 00596 union { 00597 __v4sf v; 00598 __m128i m; 00599 float f[4]; 00600 int i[4]; 00601 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind; 00602 float *data[4]; 00603 00604 /* Get r . */ 00605 r.v = _mm_load_ps( r_in ); 00606 00607 /* compute the index */ 00608 alpha0.v = _mm_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 00609 alpha1.v = _mm_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 00610 alpha2.v = _mm_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 00611 ind.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0.v , _mm_mul_ps( r.v , _mm_add_ps( alpha1.v , _mm_mul_ps( r.v , alpha2.v ) ) ) ) ) ); 00612 00613 /* Check ranges. */ 00614 /* for ( j = 0 ; j < 4 ; j++ ) 00615 if ( ind.i[j] > p[j]->n ) { 00616 printf( "potential_eval_vec_4single_r: r=%.9e (n=%.9e/%i) is not in [%.9e,%.9e].\n" , 00617 r.f[j] , p[j]->alpha[0] + r.f[j]*(p[j]->alpha[1] + r.f[j]*p[j]->alpha[2]) , 00618 p[j]->n , p[j]->a , p[j]->b ); 00619 fflush(stdout); 00620 } */ 00621 00622 /* get the table offset */ 00623 for ( k = 0 ; k < 4 ; k++ ) 00624 data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] ); 00625 00626 /* adjust x to the interval */ 00627 mi.v = _mm_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 00628 hi.v = _mm_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 00629 x.v = _mm_mul_ps( _mm_sub_ps( r.v , mi.v ) , hi.v ); 00630 00631 /* compute the potential and its derivative */ 00632 eff.v = _mm_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 00633 c.v = _mm_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 00634 ee.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , c.v ); 00635 for ( j = 4 ; j < potential_chunk ; j++ ) { 00636 c.v = _mm_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 00637 eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v ); 00638 ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c.v ); 00639 } 00640 00641 /* store the result */ 00642 _mm_store_ps( e , ee.v ); 00643 _mm_store_ps( f , _mm_mul_ps( eff.v , hi.v ) ); 00644 00645 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE) 00646 int j, k; 00647 union { 00648 vector float v; 00649 float f[4]; 00650 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r; 00651 union { 00652 vector unsigned int v; 00653 unsigned int i[4]; 00654 } ind; 00655 float *data[4]; 00656 00657 /* Get r . */ 00658 r.v = *((vector float *)r_in); 00659 00660 /* compute the index (vec_ctu maps negative floats to 0) */ 00661 alpha0.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 00662 alpha1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 00663 alpha2.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 00664 ind.v = vec_ctu( vec_madd( r.v , vec_madd( r.v , alpha2.v , alpha1.v ) , alpha0.v ) , 0 ); 00665 00666 /* get the table offset */ 00667 for ( k = 0 ; k < 4 ; k++ ) 00668 data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] ); 00669 00670 /* adjust x to the interval */ 00671 mi.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 00672 hi.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 00673 x.v = vec_mul( vec_sub( r.v , mi.v ) , hi.v ); 00674 00675 /* compute the potential and its derivative */ 00676 eff.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 00677 c.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 00678 ee.v = vec_madd( eff.v , x.v , c.v ); 00679 for ( j = 4 ; j < potential_chunk ; j++ ) { 00680 c.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 00681 eff.v = vec_madd( eff.v , x.v , ee.v ); 00682 ee.v = vec_madd( ee.v , x.v , c.v ); 00683 } 00684 00685 /* store the result */ 00686 *((vector float *)e) = ee.v; 00687 *((vector float *)f) = vec_mul( eff.v , hi.v ); 00688 00689 #else 00690 int k; 00691 FPTYPE ee, eff; 00692 for ( k = 0 ; k < 4 ; k++ ) { 00693 potential_eval( p[k] , r_in[k] , &ee , &eff ); 00694 e[k] = ee; f[k] = eff; 00695 } 00696 #endif 00697 00698 } 00699 00700 00724 __attribute__ ((always_inline)) INLINE void potential_eval_vec_8single ( struct potential *p[8] , float *r2 , float *e , float *f ) { 00725 00726 #if defined(__AVX__) && defined(FPTYPE_SINGLE) 00727 int j; 00728 union { 00729 __v8sf v; 00730 __m256i m; 00731 float f[8]; 00732 int i[8]; 00733 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind; 00734 float *data[8]; 00735 00736 /* Get r . */ 00737 r.v = _mm256_sqrt_ps( _mm256_load_ps( r2 ) ); 00738 00739 /* compute the index */ 00740 alpha0.v = _mm256_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] , p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] ); 00741 alpha1.v = _mm256_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] , p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] ); 00742 alpha2.v = _mm256_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] , p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] ); 00743 ind.m = _mm256_cvttps_epi32( _mm256_max_ps( _mm256_setzero_ps() , _mm256_add_ps( alpha0.v , _mm256_mul_ps( r.v , _mm256_add_ps( alpha1.v , _mm256_mul_ps( r.v , alpha2.v ) ) ) ) ) ); 00744 00745 /* get the table offset */ 00746 data[0] = &( p[0]->c[ ind.i[0] * potential_chunk ] ); 00747 data[1] = &( p[1]->c[ ind.i[1] * potential_chunk ] ); 00748 data[2] = &( p[2]->c[ ind.i[2] * potential_chunk ] ); 00749 data[3] = &( p[3]->c[ ind.i[3] * potential_chunk ] ); 00750 data[4] = &( p[4]->c[ ind.i[4] * potential_chunk ] ); 00751 data[5] = &( p[5]->c[ ind.i[5] * potential_chunk ] ); 00752 data[6] = &( p[6]->c[ ind.i[6] * potential_chunk ] ); 00753 data[7] = &( p[7]->c[ ind.i[7] * potential_chunk ] ); 00754 00755 /* adjust x to the interval */ 00756 mi.v = _mm256_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] , data[4][0] , data[5][0] , data[6][0] , data[7][0] ); 00757 hi.v = _mm256_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] , data[4][1] , data[5][1] , data[6][1] , data[7][1] ); 00758 x.v = _mm256_mul_ps( _mm256_sub_ps( r.v , mi.v ) , hi.v ); 00759 00760 /* compute the potential and its derivative */ 00761 eff.v = _mm256_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] , data[4][2] , data[5][2] , data[6][2] , data[7][2] ); 00762 c.v = _mm256_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] , data[4][3] , data[5][3] , data[6][3] , data[7][3] ); 00763 ee.v = _mm256_add_ps( _mm256_mul_ps( eff.v , x.v ) , c.v ); 00764 for ( j = 4 ; j < potential_chunk ; j++ ) { 00765 c.v = _mm256_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] , data[4][j] , data[5][j] , data[6][j] , data[7][j] ); 00766 eff.v = _mm256_add_ps( _mm256_mul_ps( eff.v , x.v ) , ee.v ); 00767 ee.v = _mm256_add_ps( _mm256_mul_ps( ee.v , x.v ) , c.v ); 00768 } 00769 00770 /* store the result */ 00771 _mm256_store_ps( e , ee.v ); 00772 _mm256_store_ps( f , _mm256_mul_ps( eff.v , _mm256_div_ps( hi.v , r.v ) ) ); 00773 00774 #elifif defined(__SSE__) && defined(FPTYPE_SINGLE) 00775 int j; 00776 union { 00777 __v4sf v; 00778 __m128i m; 00779 float f[4]; 00780 int i[4]; 00781 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1, 00782 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2; 00783 float *data[8]; 00784 00785 /* Get r . */ 00786 r_1.v = _mm_sqrt_ps( _mm_load_ps( &r2[0] ) ); 00787 r_2.v = _mm_sqrt_ps( _mm_load_ps( &r2[4] ) ); 00788 00789 /* compute the index */ 00790 alpha0_1.v = _mm_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 00791 alpha1_1.v = _mm_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 00792 alpha2_1.v = _mm_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 00793 alpha0_2.v = _mm_setr_ps( p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] ); 00794 alpha1_2.v = _mm_setr_ps( p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] ); 00795 alpha2_2.v = _mm_setr_ps( p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] ); 00796 ind_1.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0_1.v , _mm_mul_ps( r_1.v , _mm_add_ps( alpha1_1.v , _mm_mul_ps( r_1.v , alpha2_1.v ) ) ) ) ) ); 00797 ind_2.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0_2.v , _mm_mul_ps( r_2.v , _mm_add_ps( alpha1_2.v , _mm_mul_ps( r_2.v , alpha2_2.v ) ) ) ) ) ); 00798 00799 /* get the table offset */ 00800 data[0] = &( p[0]->c[ ind_1.i[0] * potential_chunk ] ); 00801 data[1] = &( p[1]->c[ ind_1.i[1] * potential_chunk ] ); 00802 data[2] = &( p[2]->c[ ind_1.i[2] * potential_chunk ] ); 00803 data[3] = &( p[3]->c[ ind_1.i[3] * potential_chunk ] ); 00804 data[4] = &( p[4]->c[ ind_2.i[0] * potential_chunk ] ); 00805 data[5] = &( p[5]->c[ ind_2.i[1] * potential_chunk ] ); 00806 data[6] = &( p[6]->c[ ind_2.i[2] * potential_chunk ] ); 00807 data[7] = &( p[7]->c[ ind_2.i[3] * potential_chunk ] ); 00808 00809 /* adjust x to the interval */ 00810 mi_1.v = _mm_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 00811 hi_1.v = _mm_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 00812 mi_2.v = _mm_setr_ps( data[4][0] , data[5][0] , data[6][0] , data[7][0] ); 00813 hi_2.v = _mm_setr_ps( data[4][1] , data[5][1] , data[6][1] , data[7][1] ); 00814 x_1.v = _mm_mul_ps( _mm_sub_ps( r_1.v , mi_1.v ) , hi_1.v ); 00815 x_2.v = _mm_mul_ps( _mm_sub_ps( r_2.v , mi_2.v ) , hi_2.v ); 00816 00817 /* compute the potential and its derivative */ 00818 eff_1.v = _mm_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 00819 eff_2.v = _mm_setr_ps( data[4][2] , data[5][2] , data[6][2] , data[7][2] ); 00820 c_1.v = _mm_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 00821 c_2.v = _mm_setr_ps( data[4][3] , data[5][3] , data[6][3] , data[7][3] ); 00822 ee_1.v = _mm_add_ps( _mm_mul_ps( eff_1.v , x_1.v ) , c_1.v ); 00823 ee_2.v = _mm_add_ps( _mm_mul_ps( eff_2.v , x_2.v ) , c_2.v ); 00824 for ( j = 4 ; j < potential_chunk ; j++ ) { 00825 c_1.v = _mm_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 00826 c_2.v = _mm_setr_ps( data[4][j] , data[5][j] , data[6][j] , data[7][j] ); 00827 eff_1.v = _mm_add_ps( _mm_mul_ps( eff_1.v , x_1.v ) , ee_1.v ); 00828 eff_2.v = _mm_add_ps( _mm_mul_ps( eff_2.v , x_2.v ) , ee_2.v ); 00829 ee_1.v = _mm_add_ps( _mm_mul_ps( ee_1.v , x_1.v ) , c_1.v ); 00830 ee_2.v = _mm_add_ps( _mm_mul_ps( ee_2.v , x_2.v ) , c_2.v ); 00831 } 00832 00833 /* store the result */ 00834 _mm_store_ps( &e[0] , ee_1.v ); 00835 _mm_store_ps( &e[4] , ee_2.v ); 00836 _mm_store_ps( &f[0] , _mm_mul_ps( eff_1.v , _mm_div_ps( hi_1.v , r_1.v ) ) ); 00837 _mm_store_ps( &f[4] , _mm_mul_ps( eff_2.v , _mm_div_ps( hi_2.v , r_2.v ) ) ); 00838 00839 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE) 00840 int j; 00841 union { 00842 vector float v; 00843 vector unsigned int m; 00844 float f[4]; 00845 unsigned int i[4]; 00846 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1, 00847 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2; 00848 float *data[8]; 00849 00850 /* Get r . */ 00851 r_1.v = vec_sqrt( *((vector float *)&r2[0]) ); 00852 r_2.v = vec_sqrt( *((vector float *)&r2[4]) ); 00853 00854 /* compute the index */ 00855 alpha0_1.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 00856 alpha1_1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 00857 alpha2_1.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 00858 alpha0_2.v = vec_load4( p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] ); 00859 alpha1_2.v = vec_load4( p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] ); 00860 alpha2_2.v = vec_load4( p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] ); 00861 ind_1.m = vec_ctu( vec_madd( r_1.v , vec_madd( r_1.v , alpha2_1.v , alpha1_1.v ) , alpha0_1.v ) , 0 ); 00862 ind_2.m = vec_ctu( vec_madd( r_2.v , vec_madd( r_2.v , alpha2_2.v , alpha1_2.v ) , alpha0_2.v ) , 0 ); 00863 00864 /* get the table offset */ 00865 data[0] = &( p[0]->c[ ind_1.i[0] * potential_chunk ] ); 00866 data[1] = &( p[1]->c[ ind_1.i[1] * potential_chunk ] ); 00867 data[2] = &( p[2]->c[ ind_1.i[2] * potential_chunk ] ); 00868 data[3] = &( p[3]->c[ ind_1.i[3] * potential_chunk ] ); 00869 data[4] = &( p[4]->c[ ind_2.i[0] * potential_chunk ] ); 00870 data[5] = &( p[5]->c[ ind_2.i[1] * potential_chunk ] ); 00871 data[6] = &( p[6]->c[ ind_2.i[2] * potential_chunk ] ); 00872 data[7] = &( p[7]->c[ ind_2.i[3] * potential_chunk ] ); 00873 00874 /* adjust x to the interval */ 00875 mi_1.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 00876 hi_1.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 00877 mi_2.v = vec_load4( data[4][0] , data[5][0] , data[6][0] , data[7][0] ); 00878 hi_2.v = vec_load4( data[4][1] , data[5][1] , data[6][1] , data[7][1] ); 00879 x_1.v = vec_mul( vec_sub( r_1.v , mi_1.v ) , hi_1.v ); 00880 x_2.v = vec_mul( vec_sub( r_2.v , mi_2.v ) , hi_2.v ); 00881 00882 /* compute the potential and its derivative */ 00883 eff_1.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 00884 eff_2.v = vec_load4( data[4][2] , data[5][2] , data[6][2] , data[7][2] ); 00885 c_1.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 00886 c_2.v = vec_load4( data[4][3] , data[5][3] , data[6][3] , data[7][3] ); 00887 ee_1.v = vec_madd( eff_1.v , x_1.v , c_1.v ); 00888 ee_2.v = vec_madd( eff_2.v , x_2.v , c_2.v ); 00889 for ( j = 4 ; j < potential_chunk ; j++ ) { 00890 c_1.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 00891 c_2.v = vec_load4( data[4][j] , data[5][j] , data[6][j] , data[7][j] ); 00892 eff_1.v = vec_madd( eff_1.v , x_1.v , ee_1.v ); 00893 eff_2.v = vec_madd( eff_2.v , x_2.v , ee_2.v ); 00894 ee_1.v = vec_madd( ee_1.v , x_1.v , c_1.v ); 00895 ee_2.v = vec_madd( ee_2.v , x_2.v , c_2.v ); 00896 } 00897 00898 /* store the result */ 00899 eff_1.v = vec_mul( eff_1.v , vec_div( hi_1.v , r_1.v ) ); 00900 eff_2.v = vec_mul( eff_2.v , vec_div( hi_2.v , r_2.v ) ); 00901 memcpy( &e[0] , &ee_1 , sizeof(vector float) ); 00902 memcpy( &f[0] , &eff_1 , sizeof(vector float) ); 00903 memcpy( &e[4] , &ee_2 , sizeof(vector float) ); 00904 memcpy( &f[4] , &eff_2 , sizeof(vector float) ); 00905 00906 #else 00907 int k; 00908 FPTYPE ee, eff; 00909 for ( k = 0 ; k < 8 ; k++ ) { 00910 potential_eval( p[k] , r2[k] , &ee , &eff ); 00911 e[k] = ee; f[k] = eff; 00912 } 00913 #endif 00914 00915 } 00916 00917 00918 __attribute__ ((always_inline)) INLINE void potential_eval_vec_8single_r ( struct potential *p[8] , float *r2 , float *e , float *f ) { 00919 00920 #if defined(__AVX__) && defined(FPTYPE_SINGLE) 00921 int j; 00922 union { 00923 __v8sf v; 00924 __m256i m; 00925 float f[8]; 00926 int i[8]; 00927 } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind; 00928 float *data[8]; 00929 00930 /* Get r . */ 00931 r.v = _mm256_load_ps( r2 ); 00932 00933 /* compute the index */ 00934 alpha0.v = _mm256_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] , p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] ); 00935 alpha1.v = _mm256_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] , p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] ); 00936 alpha2.v = _mm256_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] , p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] ); 00937 ind.m = _mm256_cvttps_epi32( _mm256_max_ps( _mm256_setzero_ps() , _mm256_add_ps( alpha0.v , _mm256_mul_ps( r.v , _mm256_add_ps( alpha1.v , _mm256_mul_ps( r.v , alpha2.v ) ) ) ) ) ); 00938 00939 /* get the table offset */ 00940 data[0] = &( p[0]->c[ ind.i[0] * potential_chunk ] ); 00941 data[1] = &( p[1]->c[ ind.i[1] * potential_chunk ] ); 00942 data[2] = &( p[2]->c[ ind.i[2] * potential_chunk ] ); 00943 data[3] = &( p[3]->c[ ind.i[3] * potential_chunk ] ); 00944 data[4] = &( p[4]->c[ ind.i[4] * potential_chunk ] ); 00945 data[5] = &( p[5]->c[ ind.i[5] * potential_chunk ] ); 00946 data[6] = &( p[6]->c[ ind.i[6] * potential_chunk ] ); 00947 data[7] = &( p[7]->c[ ind.i[7] * potential_chunk ] ); 00948 00949 /* adjust x to the interval */ 00950 mi.v = _mm256_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] , data[4][0] , data[5][0] , data[6][0] , data[7][0] ); 00951 hi.v = _mm256_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] , data[4][1] , data[5][1] , data[6][1] , data[7][1] ); 00952 x.v = _mm256_mul_ps( _mm256_sub_ps( r.v , mi.v ) , hi.v ); 00953 00954 /* compute the potential and its derivative */ 00955 eff.v = _mm256_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] , data[4][2] , data[5][2] , data[6][2] , data[7][2] ); 00956 c.v = _mm256_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] , data[4][3] , data[5][3] , data[6][3] , data[7][3] ); 00957 ee.v = _mm256_add_ps( _mm256_mul_ps( eff.v , x.v ) , c.v ); 00958 for ( j = 4 ; j < potential_chunk ; j++ ) { 00959 c.v = _mm256_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] , data[4][j] , data[5][j] , data[6][j] , data[7][j] ); 00960 eff.v = _mm256_add_ps( _mm256_mul_ps( eff.v , x.v ) , ee.v ); 00961 ee.v = _mm256_add_ps( _mm256_mul_ps( ee.v , x.v ) , c.v ); 00962 } 00963 00964 /* store the result */ 00965 _mm256_store_ps( e , ee.v ); 00966 _mm256_store_ps( f , _mm256_mul_ps( eff.v , hi.v ) ); 00967 00968 #elifif defined(__SSE__) && defined(FPTYPE_SINGLE) 00969 int j; 00970 union { 00971 __v4sf v; 00972 __m128i m; 00973 float f[4]; 00974 int i[4]; 00975 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1, 00976 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2; 00977 float *data[8]; 00978 00979 /* Get r . */ 00980 r_1.v = _mm_load_ps( &r2[0] ); 00981 r_2.v = _mm_load_ps( &r2[4] ); 00982 00983 /* compute the index */ 00984 alpha0_1.v = _mm_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 00985 alpha1_1.v = _mm_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 00986 alpha2_1.v = _mm_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 00987 alpha0_2.v = _mm_setr_ps( p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] ); 00988 alpha1_2.v = _mm_setr_ps( p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] ); 00989 alpha2_2.v = _mm_setr_ps( p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] ); 00990 ind_1.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0_1.v , _mm_mul_ps( r_1.v , _mm_add_ps( alpha1_1.v , _mm_mul_ps( r_1.v , alpha2_1.v ) ) ) ) ) ); 00991 ind_2.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0_2.v , _mm_mul_ps( r_2.v , _mm_add_ps( alpha1_2.v , _mm_mul_ps( r_2.v , alpha2_2.v ) ) ) ) ) ); 00992 00993 /* get the table offset */ 00994 data[0] = &( p[0]->c[ ind_1.i[0] * potential_chunk ] ); 00995 data[1] = &( p[1]->c[ ind_1.i[1] * potential_chunk ] ); 00996 data[2] = &( p[2]->c[ ind_1.i[2] * potential_chunk ] ); 00997 data[3] = &( p[3]->c[ ind_1.i[3] * potential_chunk ] ); 00998 data[4] = &( p[4]->c[ ind_2.i[0] * potential_chunk ] ); 00999 data[5] = &( p[5]->c[ ind_2.i[1] * potential_chunk ] ); 01000 data[6] = &( p[6]->c[ ind_2.i[2] * potential_chunk ] ); 01001 data[7] = &( p[7]->c[ ind_2.i[3] * potential_chunk ] ); 01002 01003 /* adjust x to the interval */ 01004 mi_1.v = _mm_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 01005 hi_1.v = _mm_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 01006 mi_2.v = _mm_setr_ps( data[4][0] , data[5][0] , data[6][0] , data[7][0] ); 01007 hi_2.v = _mm_setr_ps( data[4][1] , data[5][1] , data[6][1] , data[7][1] ); 01008 x_1.v = _mm_mul_ps( _mm_sub_ps( r_1.v , mi_1.v ) , hi_1.v ); 01009 x_2.v = _mm_mul_ps( _mm_sub_ps( r_2.v , mi_2.v ) , hi_2.v ); 01010 01011 /* compute the potential and its derivative */ 01012 eff_1.v = _mm_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 01013 eff_2.v = _mm_setr_ps( data[4][2] , data[5][2] , data[6][2] , data[7][2] ); 01014 c_1.v = _mm_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 01015 c_2.v = _mm_setr_ps( data[4][3] , data[5][3] , data[6][3] , data[7][3] ); 01016 ee_1.v = _mm_add_ps( _mm_mul_ps( eff_1.v , x_1.v ) , c_1.v ); 01017 ee_2.v = _mm_add_ps( _mm_mul_ps( eff_2.v , x_2.v ) , c_2.v ); 01018 for ( j = 4 ; j < potential_chunk ; j++ ) { 01019 c_1.v = _mm_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 01020 c_2.v = _mm_setr_ps( data[4][j] , data[5][j] , data[6][j] , data[7][j] ); 01021 eff_1.v = _mm_add_ps( _mm_mul_ps( eff_1.v , x_1.v ) , ee_1.v ); 01022 eff_2.v = _mm_add_ps( _mm_mul_ps( eff_2.v , x_2.v ) , ee_2.v ); 01023 ee_1.v = _mm_add_ps( _mm_mul_ps( ee_1.v , x_1.v ) , c_1.v ); 01024 ee_2.v = _mm_add_ps( _mm_mul_ps( ee_2.v , x_2.v ) , c_2.v ); 01025 } 01026 01027 /* store the result */ 01028 _mm_store_ps( &e[0] , ee_1.v ); 01029 _mm_store_ps( &e[4] , ee_2.v ); 01030 _mm_store_ps( &f[0] , _mm_mul_ps( eff_1.v , hi_1.v ) ); 01031 _mm_store_ps( &f[4] , _mm_mul_ps( eff_2.v , hi_2.v ) ); 01032 01033 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE) 01034 int j; 01035 union { 01036 vector float v; 01037 vector unsigned int m; 01038 float f[4]; 01039 unsigned int i[4]; 01040 } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1, 01041 alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2; 01042 float *data[8]; 01043 01044 /* Get r . */ 01045 r_1.v = *((vector float *)&r2[0]); 01046 r_2.v = *((vector float *)&r2[4]); 01047 01048 /* compute the index */ 01049 alpha0_1.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 01050 alpha1_1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 01051 alpha2_1.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 01052 alpha0_2.v = vec_load4( p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] ); 01053 alpha1_2.v = vec_load4( p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] ); 01054 alpha2_2.v = vec_load4( p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] ); 01055 ind_1.m = vec_ctu( vec_madd( r_1.v , vec_madd( r_1.v , alpha2_1.v , alpha1_1.v ) , alpha0_1.v ) , 0 ); 01056 ind_2.m = vec_ctu( vec_madd( r_2.v , vec_madd( r_2.v , alpha2_2.v , alpha1_2.v ) , alpha0_2.v ) , 0 ); 01057 01058 /* get the table offset */ 01059 data[0] = &( p[0]->c[ ind_1.i[0] * potential_chunk ] ); 01060 data[1] = &( p[1]->c[ ind_1.i[1] * potential_chunk ] ); 01061 data[2] = &( p[2]->c[ ind_1.i[2] * potential_chunk ] ); 01062 data[3] = &( p[3]->c[ ind_1.i[3] * potential_chunk ] ); 01063 data[4] = &( p[4]->c[ ind_2.i[0] * potential_chunk ] ); 01064 data[5] = &( p[5]->c[ ind_2.i[1] * potential_chunk ] ); 01065 data[6] = &( p[6]->c[ ind_2.i[2] * potential_chunk ] ); 01066 data[7] = &( p[7]->c[ ind_2.i[3] * potential_chunk ] ); 01067 01068 /* adjust x to the interval */ 01069 mi_1.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 01070 hi_1.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 01071 mi_2.v = vec_load4( data[4][0] , data[5][0] , data[6][0] , data[7][0] ); 01072 hi_2.v = vec_load4( data[4][1] , data[5][1] , data[6][1] , data[7][1] ); 01073 x_1.v = vec_mul( vec_sub( r_1.v , mi_1.v ) , hi_1.v ); 01074 x_2.v = vec_mul( vec_sub( r_2.v , mi_2.v ) , hi_2.v ); 01075 01076 /* compute the potential and its derivative */ 01077 eff_1.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 01078 eff_2.v = vec_load4( data[4][2] , data[5][2] , data[6][2] , data[7][2] ); 01079 c_1.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 01080 c_2.v = vec_load4( data[4][3] , data[5][3] , data[6][3] , data[7][3] ); 01081 ee_1.v = vec_madd( eff_1.v , x_1.v , c_1.v ); 01082 ee_2.v = vec_madd( eff_2.v , x_2.v , c_2.v ); 01083 for ( j = 4 ; j < potential_chunk ; j++ ) { 01084 c_1.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 01085 c_2.v = vec_load4( data[4][j] , data[5][j] , data[6][j] , data[7][j] ); 01086 eff_1.v = vec_madd( eff_1.v , x_1.v , ee_1.v ); 01087 eff_2.v = vec_madd( eff_2.v , x_2.v , ee_2.v ); 01088 ee_1.v = vec_madd( ee_1.v , x_1.v , c_1.v ); 01089 ee_2.v = vec_madd( ee_2.v , x_2.v , c_2.v ); 01090 } 01091 01092 /* store the result */ 01093 eff_1.v = vec_mul( eff_1.v , hi_1.v ); 01094 eff_2.v = vec_mul( eff_2.v , hi_2.v ); 01095 memcpy( &e[0] , &ee_1 , sizeof(vector float) ); 01096 memcpy( &f[0] , &eff_1 , sizeof(vector float) ); 01097 memcpy( &e[4] , &ee_2 , sizeof(vector float) ); 01098 memcpy( &f[4] , &eff_2 , sizeof(vector float) ); 01099 01100 #else 01101 int k; 01102 FPTYPE ee, eff; 01103 for ( k = 0 ; k < 8 ; k++ ) { 01104 potential_eval( p[k] , r2[k] , &ee , &eff ); 01105 e[k] = ee; f[k] = eff; 01106 } 01107 #endif 01108 01109 } 01110 01111 01135 __attribute__ ((always_inline)) INLINE void potential_eval_vec_2double ( struct potential *p[2] , FPTYPE *r2 , FPTYPE *e , FPTYPE *f ) { 01136 01137 #if defined(__SSE2__) && defined(FPTYPE_DOUBLE) 01138 int ind[2], j; 01139 union { 01140 __v2df v; 01141 double f[2]; 01142 } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r; 01143 double *data[2]; 01144 01145 /* Get r . */ 01146 r.v = _mm_sqrt_pd( _mm_load_pd( r2 ) ); 01147 01148 /* compute the index */ 01149 alpha0.v = _mm_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] ); 01150 alpha1.v = _mm_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] ); 01151 alpha2.v = _mm_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] ); 01152 rind.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0.v , _mm_mul_pd( r.v , _mm_add_pd( alpha1.v , _mm_mul_pd( r.v , alpha2.v ) ) ) ) ); 01153 ind[0] = rind.f[0]; 01154 ind[1] = rind.f[1]; 01155 01156 /* get the table offset */ 01157 data[0] = &( p[0]->c[ ind[0] * potential_chunk ] ); 01158 data[1] = &( p[1]->c[ ind[1] * potential_chunk ] ); 01159 01160 /* adjust x to the interval */ 01161 mi.v = _mm_setr_pd( data[0][0] , data[1][0] ); 01162 hi.v = _mm_setr_pd( data[0][1] , data[1][1] ); 01163 x.v = _mm_mul_pd( _mm_sub_pd( r.v , mi.v ) , hi.v ); 01164 01165 /* compute the potential and its derivative */ 01166 eff.v = _mm_setr_pd( data[0][2] , data[1][2] ); 01167 c.v = _mm_setr_pd( data[0][3] , data[1][3] ); 01168 ee.v = _mm_add_pd( _mm_mul_pd( eff.v , x.v ) , c.v ); 01169 for ( j = 4 ; j < potential_chunk ; j++ ) { 01170 c.v = _mm_setr_pd( data[0][j] , data[1][j] ); 01171 eff.v = _mm_add_pd( _mm_mul_pd( eff.v , x.v ) , ee.v ); 01172 ee.v = _mm_add_pd( _mm_mul_pd( ee.v , x.v ) , c.v ); 01173 } 01174 01175 /* store the result */ 01176 _mm_store_pd( e , ee.v ); 01177 _mm_store_pd( f , _mm_mul_pd( eff.v , _mm_div_pd( hi.v , r.v ) ) ); 01178 01179 #else 01180 int k; 01181 for ( k = 0 ; k < 2 ; k++ ) 01182 potential_eval( p[k] , r2[k] , &e[k] , &f[k] ); 01183 #endif 01184 01185 } 01186 01187 01211 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4double ( struct potential *p[4] , FPTYPE *r2 , FPTYPE *e , FPTYPE *f ) { 01212 01213 #if defined(__AVX__) && defined(FPTYPE_DOUBLE) 01214 int ind[4], j; 01215 union { 01216 __v4df v; 01217 double f[4]; 01218 } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r; 01219 double *data[4]; 01220 01221 /* Get r . */ 01222 r.v = _mm256_sqrt_pd( _mm256_load_pd( r2 ) ); 01223 01224 /* compute the index */ 01225 alpha0.v = _mm256_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 01226 alpha1.v = _mm256_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 01227 alpha2.v = _mm256_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 01228 rind.v = _mm256_max_pd( _mm256_setzero_pd() , _mm256_add_pd( alpha0.v , _mm256_mul_pd( r.v , _mm256_add_pd( alpha1.v , _mm256_mul_pd( r.v , alpha2.v ) ) ) ) ); 01229 ind[0] = rind.f[0]; 01230 ind[1] = rind.f[1]; 01231 ind[3] = rind.f[3]; 01232 ind[4] = rind.f[4]; 01233 01234 /* get the table offset */ 01235 data[0] = &( p[0]->c[ ind[0] * potential_chunk ] ); 01236 data[1] = &( p[1]->c[ ind[1] * potential_chunk ] ); 01237 data[2] = &( p[2]->c[ ind[2] * potential_chunk ] ); 01238 data[3] = &( p[3]->c[ ind[3] * potential_chunk ] ); 01239 01240 /* adjust x to the interval */ 01241 mi.v = _mm256_setr_pd( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 01242 hi.v = _mm256_setr_pd( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 01243 x.v = _mm256_mul_pd( _mm256_sub_pd( r.v , mi.v ) , hi.v ); 01244 01245 /* compute the potential and its derivative */ 01246 eff.v = _mm256_setr_pd( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 01247 c.v = _mm256_setr_pd( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 01248 ee.v = _mm256_add_pd( _mm256_mul_pd( eff.v , x.v ) , c.v ); 01249 for ( j = 4 ; j < potential_chunk ; j++ ) { 01250 c.v = _mm256_setr_pd( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 01251 eff.v = _mm256_add_pd( _mm256_mul_pd( eff.v , x.v ) , ee.v ); 01252 ee.v = _mm256_add_pd( _mm256_mul_pd( ee.v , x.v ) , c.v ); 01253 } 01254 01255 /* store the result */ 01256 _mm256_store_pd( e , ee.v ); 01257 _mm256_store_pd( f , _mm256_mul_pd( eff.v , _mm256_div_pd( hi.v , r.v ) ) ); 01258 01259 #elif defined(__SSE2__) && defined(FPTYPE_DOUBLE) 01260 int ind[4], j; 01261 union { 01262 __v2df v; 01263 double f[2]; 01264 } alpha0_1, alpha1_1, alpha2_1, rind_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, 01265 alpha0_2, alpha1_2, alpha2_2, rind_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2; 01266 double *data[4]; 01267 01268 /* Get r . */ 01269 r_1.v = _mm_sqrt_pd( _mm_load_pd( &r2[0] ) ); 01270 r_2.v = _mm_sqrt_pd( _mm_load_pd( &r2[2] ) ); 01271 01272 /* compute the index */ 01273 alpha0_1.v = _mm_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] ); 01274 alpha1_1.v = _mm_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] ); 01275 alpha2_1.v = _mm_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] ); 01276 alpha0_2.v = _mm_setr_pd( p[2]->alpha[0] , p[3]->alpha[0] ); 01277 alpha1_2.v = _mm_setr_pd( p[2]->alpha[1] , p[3]->alpha[1] ); 01278 alpha2_2.v = _mm_setr_pd( p[2]->alpha[2] , p[3]->alpha[2] ); 01279 rind_1.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0_1.v , _mm_mul_pd( r_1.v , _mm_add_pd( alpha1_1.v , _mm_mul_pd( r_1.v , alpha2_1.v ) ) ) ) ); 01280 rind_2.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0_2.v , _mm_mul_pd( r_2.v , _mm_add_pd( alpha1_2.v , _mm_mul_pd( r_2.v , alpha2_2.v ) ) ) ) ); 01281 ind[0] = rind_1.f[0]; 01282 ind[1] = rind_1.f[1]; 01283 ind[2] = rind_2.f[0]; 01284 ind[3] = rind_2.f[1]; 01285 01286 /* for ( j = 0 ; j < 4 ; j++ ) 01287 if ( ind[j] > p[j]->n ) { 01288 printf("potential_eval_vec_4double: dookie.\n"); 01289 fflush(stdout); 01290 } */ 01291 01292 /* get the table offset */ 01293 data[0] = &( p[0]->c[ ind[0] * potential_chunk ] ); 01294 data[1] = &( p[1]->c[ ind[1] * potential_chunk ] ); 01295 data[2] = &( p[2]->c[ ind[2] * potential_chunk ] ); 01296 data[3] = &( p[3]->c[ ind[3] * potential_chunk ] ); 01297 01298 /* adjust x to the interval */ 01299 mi_1.v = _mm_setr_pd( data[0][0] , data[1][0] ); 01300 hi_1.v = _mm_setr_pd( data[0][1] , data[1][1] ); 01301 mi_2.v = _mm_setr_pd( data[2][0] , data[3][0] ); 01302 hi_2.v = _mm_setr_pd( data[2][1] , data[3][1] ); 01303 x_1.v = _mm_mul_pd( _mm_sub_pd( r_1.v , mi_1.v ) , hi_1.v ); 01304 x_2.v = _mm_mul_pd( _mm_sub_pd( r_2.v , mi_2.v ) , hi_2.v ); 01305 01306 /* compute the potential and its derivative */ 01307 eff_1.v = _mm_setr_pd( data[0][2] , data[1][2] ); 01308 eff_2.v = _mm_setr_pd( data[2][2] , data[3][2] ); 01309 c_1.v = _mm_setr_pd( data[0][3] , data[1][3] ); 01310 c_2.v = _mm_setr_pd( data[2][3] , data[3][3] ); 01311 ee_1.v = _mm_add_pd( _mm_mul_pd( eff_1.v , x_1.v ) , c_1.v ); 01312 ee_2.v = _mm_add_pd( _mm_mul_pd( eff_2.v , x_2.v ) , c_2.v ); 01313 for ( j = 4 ; j < potential_chunk ; j++ ) { 01314 c_1.v = _mm_setr_pd( data[0][j] , data[1][j] ); 01315 c_2.v = _mm_setr_pd( data[2][j] , data[3][j] ); 01316 eff_1.v = _mm_add_pd( _mm_mul_pd( eff_1.v , x_1.v ) , ee_1.v ); 01317 eff_2.v = _mm_add_pd( _mm_mul_pd( eff_2.v , x_2.v ) , ee_2.v ); 01318 ee_1.v = _mm_add_pd( _mm_mul_pd( ee_1.v , x_1.v ) , c_1.v ); 01319 ee_2.v = _mm_add_pd( _mm_mul_pd( ee_2.v , x_2.v ) , c_2.v ); 01320 } 01321 01322 /* store the result */ 01323 _mm_store_pd( &e[0] , ee_1.v ); 01324 _mm_store_pd( &f[0] , _mm_mul_pd( eff_1.v , _mm_div_pd( hi_1.v , r_1.v ) ) ); 01325 _mm_store_pd( &e[2] , ee_2.v ); 01326 _mm_store_pd( &f[2] , _mm_mul_pd( eff_2.v , _mm_div_pd( hi_2.v , r_2.v ) ) ); 01327 01328 #else 01329 int k; 01330 for ( k = 0 ; k < 4 ; k++ ) 01331 potential_eval( p[k] , r2[k] , &e[k] , &f[k] ); 01332 #endif 01333 01334 } 01335 01336 01360 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4double_r ( struct potential *p[4] , FPTYPE *r , FPTYPE *e , FPTYPE *f ) { 01361 01362 #if defined(__AVX__) && defined(FPTYPE_DOUBLE) 01363 int ind[4], j; 01364 union { 01365 __v4df v; 01366 double f[4]; 01367 } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r; 01368 double *data[4]; 01369 01370 /* Get r . */ 01371 r.v = _mm256_load_pd( r2 ); 01372 01373 /* compute the index */ 01374 alpha0.v = _mm256_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] ); 01375 alpha1.v = _mm256_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] ); 01376 alpha2.v = _mm256_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] ); 01377 rind.v = _mm256_max_pd( _mm256_setzero_pd() , _mm256_add_pd( alpha0.v , _mm256_mul_pd( r.v , _mm256_add_pd( alpha1.v , _mm256_mul_pd( r.v , alpha2.v ) ) ) ) ); 01378 ind[0] = rind.f[0]; 01379 ind[1] = rind.f[1]; 01380 ind[3] = rind.f[3]; 01381 ind[4] = rind.f[4]; 01382 01383 /* get the table offset */ 01384 data[0] = &( p[0]->c[ ind[0] * potential_chunk ] ); 01385 data[1] = &( p[1]->c[ ind[1] * potential_chunk ] ); 01386 data[2] = &( p[2]->c[ ind[2] * potential_chunk ] ); 01387 data[3] = &( p[3]->c[ ind[3] * potential_chunk ] ); 01388 01389 /* adjust x to the interval */ 01390 mi.v = _mm256_setr_pd( data[0][0] , data[1][0] , data[2][0] , data[3][0] ); 01391 hi.v = _mm256_setr_pd( data[0][1] , data[1][1] , data[2][1] , data[3][1] ); 01392 x.v = _mm256_mul_pd( _mm256_sub_pd( r.v , mi.v ) , hi.v ); 01393 01394 /* compute the potential and its derivative */ 01395 eff.v = _mm256_setr_pd( data[0][2] , data[1][2] , data[2][2] , data[3][2] ); 01396 c.v = _mm256_setr_pd( data[0][3] , data[1][3] , data[2][3] , data[3][3] ); 01397 ee.v = _mm256_add_pd( _mm256_mul_pd( eff.v , x.v ) , c.v ); 01398 for ( j = 4 ; j < potential_chunk ; j++ ) { 01399 c.v = _mm256_setr_pd( data[0][j] , data[1][j] , data[2][j] , data[3][j] ); 01400 eff.v = _mm256_add_pd( _mm256_mul_pd( eff.v , x.v ) , ee.v ); 01401 ee.v = _mm256_add_pd( _mm256_mul_pd( ee.v , x.v ) , c.v ); 01402 } 01403 01404 /* store the result */ 01405 _mm256_store_pd( e , ee.v ); 01406 _mm256_store_pd( f , _mm256_mul_pd( eff.v , hi.v ) ); 01407 01408 #elif defined(__SSE2__) && defined(FPTYPE_DOUBLE) 01409 int ind[4], j; 01410 union { 01411 __v2df v; 01412 double f[2]; 01413 } alpha0_1, alpha1_1, alpha2_1, rind_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, 01414 alpha0_2, alpha1_2, alpha2_2, rind_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2; 01415 double *data[4]; 01416 01417 /* Get r . */ 01418 r_1.v = _mm_load_pd( &r[0] ); 01419 r_2.v = _mm_load_pd( &r[2] ); 01420 01421 /* compute the index */ 01422 alpha0_1.v = _mm_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] ); 01423 alpha1_1.v = _mm_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] ); 01424 alpha2_1.v = _mm_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] ); 01425 alpha0_2.v = _mm_setr_pd( p[2]->alpha[0] , p[3]->alpha[0] ); 01426 alpha1_2.v = _mm_setr_pd( p[2]->alpha[1] , p[3]->alpha[1] ); 01427 alpha2_2.v = _mm_setr_pd( p[2]->alpha[2] , p[3]->alpha[2] ); 01428 rind_1.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0_1.v , _mm_mul_pd( r_1.v , _mm_add_pd( alpha1_1.v , _mm_mul_pd( r_1.v , alpha2_1.v ) ) ) ) ); 01429 rind_2.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0_2.v , _mm_mul_pd( r_2.v , _mm_add_pd( alpha1_2.v , _mm_mul_pd( r_2.v , alpha2_2.v ) ) ) ) ); 01430 ind[0] = rind_1.f[0]; 01431 ind[1] = rind_1.f[1]; 01432 ind[2] = rind_2.f[0]; 01433 ind[3] = rind_2.f[1]; 01434 01435 /* for ( j = 0 ; j < 4 ; j++ ) 01436 if ( ind[j] > p[j]->n ) { 01437 printf("potential_eval_vec_4double: dookie.\n"); 01438 fflush(stdout); 01439 } */ 01440 01441 /* get the table offset */ 01442 data[0] = &( p[0]->c[ ind[0] * potential_chunk ] ); 01443 data[1] = &( p[1]->c[ ind[1] * potential_chunk ] ); 01444 data[2] = &( p[2]->c[ ind[2] * potential_chunk ] ); 01445 data[3] = &( p[3]->c[ ind[3] * potential_chunk ] ); 01446 01447 /* adjust x to the interval */ 01448 mi_1.v = _mm_setr_pd( data[0][0] , data[1][0] ); 01449 hi_1.v = _mm_setr_pd( data[0][1] , data[1][1] ); 01450 mi_2.v = _mm_setr_pd( data[2][0] , data[3][0] ); 01451 hi_2.v = _mm_setr_pd( data[2][1] , data[3][1] ); 01452 x_1.v = _mm_mul_pd( _mm_sub_pd( r_1.v , mi_1.v ) , hi_1.v ); 01453 x_2.v = _mm_mul_pd( _mm_sub_pd( r_2.v , mi_2.v ) , hi_2.v ); 01454 01455 /* compute the potential and its derivative */ 01456 eff_1.v = _mm_setr_pd( data[0][2] , data[1][2] ); 01457 eff_2.v = _mm_setr_pd( data[2][2] , data[3][2] ); 01458 c_1.v = _mm_setr_pd( data[0][3] , data[1][3] ); 01459 c_2.v = _mm_setr_pd( data[2][3] , data[3][3] ); 01460 ee_1.v = _mm_add_pd( _mm_mul_pd( eff_1.v , x_1.v ) , c_1.v ); 01461 ee_2.v = _mm_add_pd( _mm_mul_pd( eff_2.v , x_2.v ) , c_2.v ); 01462 for ( j = 4 ; j < potential_chunk ; j++ ) { 01463 c_1.v = _mm_setr_pd( data[0][j] , data[1][j] ); 01464 c_2.v = _mm_setr_pd( data[2][j] , data[3][j] ); 01465 eff_1.v = _mm_add_pd( _mm_mul_pd( eff_1.v , x_1.v ) , ee_1.v ); 01466 eff_2.v = _mm_add_pd( _mm_mul_pd( eff_2.v , x_2.v ) , ee_2.v ); 01467 ee_1.v = _mm_add_pd( _mm_mul_pd( ee_1.v , x_1.v ) , c_1.v ); 01468 ee_2.v = _mm_add_pd( _mm_mul_pd( ee_2.v , x_2.v ) , c_2.v ); 01469 } 01470 01471 /* store the result */ 01472 _mm_store_pd( &e[0] , ee_1.v ); 01473 _mm_store_pd( &f[0] , _mm_mul_pd( eff_1.v , hi_1.v ) ); 01474 _mm_store_pd( &e[2] , ee_2.v ); 01475 _mm_store_pd( &f[2] , _mm_mul_pd( eff_2.v , hi_2.v ) ); 01476 01477 #else 01478 int k; 01479 for ( k = 0 ; k < 4 ; k++ ) 01480 potential_eval_r( p[k] , r[k] , &e[k] , &f[k] ); 01481 #endif 01482 01483 } 01484 01485