mdcore  0.1.5
/home/pedro/work/mdcore/src/potential_eval.h
Go to the documentation of this file.
00001 
00002 /*******************************************************************************
00003  * This file is part of mdcore.
00004  * Coypright (c) 2010 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
00005  * 
00006  * This program is free software: you can redistribute it and/or modify
00007  * it under the terms of the GNU Lesser General Public License as published
00008  * by the Free Software Foundation, either version 3 of the License, or
00009  * (at your option) any later version.
00010  * 
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  * 
00016  * You should have received a copy of the GNU Lesser General Public License
00017  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
00018  * 
00019  ******************************************************************************/
00020 
00021 
00022 /* This file contains the potential evaluation function als "extern inline",
00023    such that they can be inlined in the respective modules. 
00024    
00025    If your code wants to call any potential_eval functions, you must include
00026    this file.
00027 */
00028 
00029 /* Function prototypes. */
00030 /* void potential_eval ( struct potential *p , FPTYPE r2 , FPTYPE *e , FPTYPE *f );
00031 void potential_eval_expl ( struct potential *p , FPTYPE r2 , FPTYPE *e , FPTYPE *f );
00032 void potential_eval_vec_4single ( struct potential *p[4] , float *r2 , float *e , float *f );
00033 void potential_eval_vec_4single_r ( struct potential *p[4] , float *r_in , float *e , float *f );
00034 void potential_eval_vec_8single ( struct potential *p[4] , float *r2 , float *e , float *f );
00035 void potential_eval_vec_2double ( struct potential *p[4] , FPTYPE *r2 , FPTYPE *e , FPTYPE *f );
00036 void potential_eval_vec_4double ( struct potential *p[4] , FPTYPE *r2 , FPTYPE *e , FPTYPE *f );
00037 void potential_eval_vec_4double_r ( struct potential *p[4] , FPTYPE *r , FPTYPE *e , FPTYPE *f );
00038 void potential_eval_r ( struct potential *p , FPTYPE r , FPTYPE *e , FPTYPE *f );
00039 */
00040 
00041 
00042 /* Get the inlining right. */
00043 #ifndef INLINE
00044 # if __GNUC__ && !__GNUC_STDC_INLINE__
00045 #  define INLINE extern inline
00046 # else
00047 #  define INLINE inline
00048 # endif
00049 #endif
00050 
00051     
00067 __attribute__ ((always_inline)) INLINE void potential_eval ( struct potential *p , FPTYPE r2 , FPTYPE *e , FPTYPE *f ) {
00068 
00069     int ind, k;
00070     FPTYPE x, ee, eff, *c, r;
00071     
00072     /* Get r for the right type. */
00073     r = FPTYPE_SQRT(r2);
00074     
00075     /* is r in the house? */
00076     /* if ( r < p->a || r > p->b )
00077         printf("potential_eval: requested potential at r=%e, not in [%e,%e].\n",r,p->a,p->b); */
00078     
00079     /* compute the index */
00080     ind = FPTYPE_FMAX( FPTYPE_ZERO , p->alpha[0] + r * (p->alpha[1] + r * p->alpha[2]) );
00081     
00082     /* if ( ind > p->n ) {
00083         printf("potential_eval: r=%.18e.\n",r);
00084         fflush(stdout);
00085         } */
00086             
00087     /* get the table offset */
00088     c = &(p->c[ind * potential_chunk]);
00089     
00090     /* adjust x to the interval */
00091     x = (r - c[0]) * c[1];
00092     
00093     /* compute the potential and its derivative */
00094     ee = c[2] * x + c[3];
00095     eff = c[2];
00096     for ( k = 4 ; k < potential_chunk ; k++ ) {
00097         eff = eff * x + ee;
00098         ee = ee * x + c[k];
00099         }
00100 
00101     /* store the result */
00102     *e = ee; *f = eff * c[1] / r;
00103         
00104     }
00105 
00106 
00122 __attribute__ ((always_inline)) INLINE void potential_eval_r ( struct potential *p , FPTYPE r , FPTYPE *e , FPTYPE *f ) {
00123 
00124     int ind, k;
00125     FPTYPE x, ee, eff, *c;
00126     
00127     /* compute the index */
00128     ind = FPTYPE_FMAX( FPTYPE_ZERO , p->alpha[0] + r * (p->alpha[1] + r * p->alpha[2] ) );
00129     
00130     /* is r in the house? */
00131     /* if ( ind > p->n )
00132         printf("potential_eval_r: requested potential at r=%e (ind=%.8e), not in [%e,%e].\n",r , p->alpha[0] + r * (p->alpha[1] + r * (p->alpha[2] + r * p->alpha[3])),p->a,p->b); */
00133     
00134     /* get the table offset */
00135     c = &(p->c[ind * potential_chunk]);
00136     
00137     /* adjust x to the interval */
00138     x = (r - c[0]) * c[1];
00139     
00140     /* compute the potential and its derivative */
00141     ee = c[2] * x + c[3];
00142     eff = c[2];
00143     for ( k = 4 ; k < potential_chunk ; k++ ) {
00144         eff = eff * x + ee;
00145         ee = ee * x + c[k];
00146         }
00147 
00148     /* store the result */
00149     *e = ee; *f = eff * c[1];
00150         
00151     }
00152 
00153 
00175 __attribute__ ((always_inline)) INLINE void potential_eval_expl ( struct potential *p , FPTYPE r2 , FPTYPE *e , FPTYPE *f ) {
00176 
00177     const FPTYPE isqrtpi = 0.56418958354775628695;
00178     const FPTYPE kappa = 3.0;
00179     FPTYPE r = sqrt(r2), ir = 1.0 / r, ir2 = ir * ir, ir4, ir6, ir12, t1, t2;
00180     FPTYPE ee = 0.0, eff = 0.0;
00181 
00182     /* Do we have a Lennard-Jones interaction? */
00183     if ( p->flags & potential_flag_LJ126 ) {
00184     
00185         /* init some variables */
00186         ir4 = ir2 * ir2; ir6 = ir4 * ir2; ir12 = ir6 * ir6;
00187         
00188         /* compute the energy and the force */
00189         ee = ( p->alpha[0] * ir12 - p->alpha[1] * ir6 );
00190         eff = 6.0 * ir * ( -2.0 * p->alpha[0] * ir12 + p->alpha[1] * ir6 );
00191     
00192         }
00193         
00194     /* Do we have an Ewald short-range part? */
00195     if ( p->flags & potential_flag_Ewald ) {
00196     
00197         /* get some values we will re-use */
00198         t2 = r * kappa;
00199         t1 = erfc( t2 );
00200     
00201         /* compute the energy and the force */
00202         ee += p->alpha[2] * t1 * ir;
00203         eff += p->alpha[2] * ( -2.0 * isqrtpi * exp( -t2 * t2 ) * kappa * ir - t1 * ir2 );
00204     
00205         }
00206     
00207     /* Do we have a Coulomb interaction? */
00208     if ( p->flags & potential_flag_Coulomb ) {
00209     
00210         /* compute the energy and the force */
00211         ee += potential_escale * p->alpha[2] * ir;
00212         eff += -potential_escale * p->alpha[2] * ir2;
00213     
00214         }
00215         
00216     /* store the potential and force. */
00217     *e = ee;
00218     *f = eff;
00219     
00220     }
00221 
00222 
00246 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4single ( struct potential *p[4] , float *r2 , float *e , float *f ) {
00247 
00248 #if defined(__SSE__) && defined(FPTYPE_SINGLE)
00249     // int j, k;
00250     union {
00251         __v4sf v;
00252         __m128i m;
00253         float f[4];
00254         int i[4];
00255         } alpha[4], mi, hi, x, ee, eff, c[6], r, ind, t[8];
00256     // float *data[4];
00257     
00258     /* Get r . */
00259     r.v = _mm_sqrt_ps( _mm_load_ps( r2 ) );
00260     
00261     /* compute the index */
00262     alpha[0].v = _mm_load_ps( p[0]->alpha );
00263     alpha[1].v = _mm_load_ps( p[1]->alpha );
00264     alpha[2].v = _mm_load_ps( p[2]->alpha );
00265     alpha[3].v = _mm_load_ps( p[3]->alpha );
00266     t[0].m = _mm_unpacklo_epi32( alpha[0].m , alpha[1].m );
00267     t[1].m = _mm_unpacklo_epi32( alpha[2].m , alpha[3].m );
00268     t[2].m = _mm_unpackhi_epi32( alpha[0].m , alpha[1].m );
00269     t[3].m = _mm_unpackhi_epi32( alpha[2].m , alpha[3].m );
00270     alpha[0].m = _mm_unpacklo_epi64( t[0].m , t[1].m );
00271     alpha[1].m = _mm_unpackhi_epi64( t[0].m , t[1].m );
00272     alpha[2].m = _mm_unpacklo_epi64( t[2].m , t[3].m );
00273     ind.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha[0].v , _mm_mul_ps( r.v , _mm_add_ps( alpha[1].v , _mm_mul_ps( r.v , alpha[2].v ) ) ) ) ) );
00274     
00275     /* Check ranges. */
00276     /* for ( j = 0 ; j < 4 ; j++ )
00277         if ( ind.i[j] == 0 || ind.i[j] > p[j]->n ) {
00278             printf( "potential_eval_vec_4single: r=%.9e (n=%.9e/%i) is not in [%.9e,%.9e].\n" ,
00279                 r.f[j] , p[j]->alpha[0] + r.f[j]*(p[j]->alpha[1] + r.f[j]*p[j]->alpha[2]) ,
00280                 p[j]->n , p[j]->a , p[j]->b );
00281             fflush(stdout);
00282             } */
00283             
00284     /* Unpack/transpose the coefficient data. */
00285     mi.v = _mm_load_ps( &p[0]->c[ ind.i[0] * potential_chunk ] );
00286     hi.v = _mm_load_ps( &p[1]->c[ ind.i[1] * potential_chunk ] );
00287     c[0].v = _mm_load_ps( &p[2]->c[ ind.i[2] * potential_chunk ] );
00288     c[1].v = _mm_load_ps( &p[3]->c[ ind.i[3] * potential_chunk ] );
00289     _MM_TRANSPOSE4_PS( mi.v , hi.v , c[0].v , c[1].v );
00290     c[2].v = _mm_load_ps( &p[0]->c[ ind.i[0] * potential_chunk + 4 ] );
00291     c[3].v = _mm_load_ps( &p[1]->c[ ind.i[1] * potential_chunk + 4 ] );
00292     c[4].v = _mm_load_ps( &p[2]->c[ ind.i[2] * potential_chunk + 4 ] );
00293     c[5].v = _mm_load_ps( &p[3]->c[ ind.i[3] * potential_chunk + 4 ] );
00294     _MM_TRANSPOSE4_PS( c[2].v , c[3].v , c[4].v , c[5].v );
00295     
00296     /* adjust x to the interval */
00297     x.v = _mm_mul_ps( _mm_sub_ps( r.v , mi.v ) , hi.v );
00298     
00299     /* compute the potential and its derivative */
00300     eff.v = c[0].v;
00301     ee.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , c[1].v );
00302     eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v );
00303     ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c[2].v );
00304     eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v );
00305     ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c[3].v );
00306     eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v );
00307     ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c[4].v );
00308     eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v );
00309     ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c[5].v );
00310 
00311     /* store the result */
00312     _mm_store_ps( e , ee.v );
00313     _mm_store_ps( f , _mm_mul_ps( eff.v , _mm_div_ps( hi.v , r.v ) ) );
00314     
00315 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
00316     int j, k;
00317     union {
00318         vector float v;
00319         float f[4];
00320         } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
00321     union {
00322         vector unsigned int v;
00323         unsigned int i[4];
00324         } ind;
00325     float *data[4];
00326     
00327     /* Get r . */
00328     r.v = vec_sqrt( *((vector float *)r2) );
00329     
00330     /* compute the index (vec_ctu maps negative floats to 0) */
00331     alpha0.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
00332     alpha1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
00333     alpha2.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
00334     ind.v = vec_ctu( vec_madd( r.v , vec_madd( r.v , alpha2.v , alpha1.v ) , alpha0.v ) , 0 );
00335     
00336     /* get the table offset */
00337     for ( k = 0 ; k < 4 ; k++ )
00338         data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] );
00339     
00340     /* adjust x to the interval */
00341     mi.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
00342     hi.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
00343     x.v = vec_mul( vec_sub( r.v , mi.v ) , hi.v );
00344     
00345     /* compute the potential and its derivative */
00346     eff.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
00347     c.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
00348     ee.v = vec_madd( eff.v , x.v , c.v );
00349     for ( j = 4 ; j < potential_chunk ; j++ ) {
00350         c.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
00351         eff.v = vec_madd( eff.v , x.v , ee.v );
00352         ee.v = vec_madd( ee.v , x.v , c.v );
00353         }
00354 
00355     /* store the result */
00356     *((vector float *)e) = ee.v;
00357     *((vector float *)f) = vec_mul( eff.v , vec_div( hi.v , r.v ) );
00358         
00359 #else
00360     int k;
00361     FPTYPE ee, eff;
00362     for ( k = 0 ; k < 4 ; k++ ) {
00363         potential_eval_r( p[k] , r2[k] , &ee , &eff );
00364         e[k] = ee; f[k] = eff;
00365         }
00366 #endif
00367         
00368     }
00369 
00370 
00394 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4single_old ( struct potential *p[4] , float *r2 , float *e , float *f ) {
00395 
00396 #if defined(__SSE__) && defined(FPTYPE_SINGLE)
00397     int j, k;
00398     union {
00399         __v4sf v;
00400         __m128i m;
00401         float f[4];
00402         int i[4];
00403         } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
00404     float *data[4];
00405     
00406     /* Get r . */
00407     r.v = _mm_sqrt_ps( _mm_load_ps( r2 ) );
00408     
00409     /* compute the index */
00410     alpha0.v = _mm_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
00411     alpha1.v = _mm_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
00412     alpha2.v = _mm_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
00413     ind.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0.v , _mm_mul_ps( r.v , _mm_add_ps( alpha1.v , _mm_mul_ps( r.v , alpha2.v ) ) ) ) ) );
00414     
00415     /* Check ranges. */
00416     /* for ( j = 0 ; j < 4 ; j++ )
00417         if ( ind.i[j] == 0 || ind.i[j] > p[j]->n ) {
00418             printf( "potential_eval_vec_4single: r=%.9e (n=%.9e/%i) is not in [%.9e,%.9e].\n" ,
00419                 r.f[j] , p[j]->alpha[0] + r.f[j]*(p[j]->alpha[1] + r.f[j]*p[j]->alpha[2]) ,
00420                 p[j]->n , p[j]->a , p[j]->b );
00421             fflush(stdout);
00422             } */
00423     
00424     /* get the table offset */
00425     for ( k = 0 ; k < 4 ; k++ )
00426         data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] );
00427     
00428     /* adjust x to the interval */
00429     mi.v = _mm_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
00430     hi.v = _mm_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
00431     x.v = _mm_mul_ps( _mm_sub_ps( r.v , mi.v ) , hi.v );
00432     
00433     /* compute the potential and its derivative */
00434     eff.v = _mm_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
00435     c.v = _mm_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
00436     ee.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , c.v );
00437     for ( j = 4 ; j < potential_chunk ; j++ ) {
00438         c.v = _mm_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
00439         eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v );
00440         ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c.v );
00441         }
00442 
00443     /* store the result */
00444     _mm_store_ps( e , ee.v );
00445     _mm_store_ps( f , _mm_mul_ps( eff.v , _mm_div_ps( hi.v , r.v ) ) );
00446     
00447 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
00448     int j, k;
00449     union {
00450         vector float v;
00451         float f[4];
00452         } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
00453     union {
00454         vector unsigned int v;
00455         unsigned int i[4];
00456         } ind;
00457     float *data[4];
00458     
00459     /* Get r . */
00460     r.v = vec_sqrt( *((vector float *)r2) );
00461     
00462     /* compute the index (vec_ctu maps negative floats to 0) */
00463     alpha0.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
00464     alpha1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
00465     alpha2.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
00466     ind.v = vec_ctu( vec_madd( r.v , vec_madd( r.v , alpha2.v , alpha1.v ) , alpha0.v ) , 0 );
00467     
00468     /* get the table offset */
00469     for ( k = 0 ; k < 4 ; k++ )
00470         data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] );
00471     
00472     /* adjust x to the interval */
00473     mi.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
00474     hi.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
00475     x.v = vec_mul( vec_sub( r.v , mi.v ) , hi.v );
00476     
00477     /* compute the potential and its derivative */
00478     eff.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
00479     c.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
00480     ee.v = vec_madd( eff.v , x.v , c.v );
00481     for ( j = 4 ; j < potential_chunk ; j++ ) {
00482         c.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
00483         eff.v = vec_madd( eff.v , x.v , ee.v );
00484         ee.v = vec_madd( ee.v , x.v , c.v );
00485         }
00486 
00487     /* store the result */
00488     *((vector float *)e) = ee.v;
00489     *((vector float *)f) = vec_mul( eff.v , vec_div( hi.v , r.v ) );
00490         
00491 #else
00492     int k;
00493     FPTYPE ee, eff;
00494     for ( k = 0 ; k < 4 ; k++ ) {
00495         potential_eval_r( p[k] , r2[k] , &ee , &eff );
00496         e[k] = ee; f[k] = eff;
00497         }
00498 #endif
00499         
00500     }
00501 
00502 
00503 #ifdef STILL_NOT_READY_FOR_PRIME_TIME
00504 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4single_gccvec ( struct potential *p[4] , float *r2 , float *e , float *f ) {
00505 
00506     int j, k;
00507     union {
00508         vector(4,float) v;
00509         float f[4];
00510         } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
00511     union {
00512         vector(4,int) v;
00513         int i[4];
00514         } ind;
00515     FPTYPE *data[4];
00516     
00517     /* Get r . */
00518     r.v = sqrtf( *( (vector(4,float) *)r2 ) );
00519     
00520     /* compute the index */
00521     for ( k = 0 ; k < 4 ; k++ ) {
00522         alpha0.f[k] = p[k]->alpha[0];
00523         alpha1.f[k] = p[k]->alpha[1];
00524         alpha2.f[k] = p[k]->alpha[2];
00525         }
00526     ind.v = max( (vector(4,int)){0,0,0,0} , (vector(4,int))( alpha0.v + r.v*( alpha1.v + r.v*alpha2.v ) ) );
00527     
00528     /* Check ranges. */
00529     /* for ( j = 0 ; j < 4 ; j++ )
00530         if ( ind.i[j] == 0 || ind.i[j] > p[j]->n ) {
00531             printf( "potential_eval_vec_4single: r=%.9e (n=%.9e/%i) is not in [%.9e,%.9e].\n" ,
00532                 r.f[j] , p[j]->alpha[0] + r.f[j]*(p[j]->alpha[1] + r.f[j]*p[j]->alpha[2]) ,
00533                 p[j]->n , p[j]->a , p[j]->b );
00534             fflush(stdout);
00535             } */
00536     
00537     /* get the table offset */
00538     for ( k = 0 ; k < 4 ; k++ )
00539         data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] );
00540     
00541     /* adjust x to the interval */
00542     for ( k = 0 ; k < 4 ; k++ ) {
00543         mi.f[k] = data[k][0];
00544         hi.f[k] = data[k][1];
00545         }
00546     x.v = ( r.v - mi.v ) * hi.v;
00547     
00548     /* compute the potential and its derivative */
00549     for ( k = 0 ; k < 4 ; k++ ) {
00550         eff.f[k] = data[k][2];
00551         c.f[k] = data[k][3];
00552         }
00553     ee.v = eff.v*x.v + c.v;
00554     for ( j = 4 ; j < potential_chunk ; j++ ) {
00555             for ( k = 0 ; k < 4 ; k++ )
00556             c.f[k] = data[k][j];
00557         eff.v = eff.v*x.v + ee.v;
00558         ee.v = ee.v*x.v + c.v;
00559         }
00560 
00561     /* store the result */
00562     *( (vector(4,float) *)e ) = ee.v;
00563     *( (vector(4,float) *)f ) = eff.v*( hi.v / r.v );
00564     
00565     }
00566 #endif
00567 
00568 
00592 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4single_r ( struct potential *p[4] , float *r_in , float *e , float *f ) {
00593 
00594 #if defined(__SSE__) && defined(FPTYPE_SINGLE)
00595     int j, k;
00596     union {
00597         __v4sf v;
00598         __m128i m;
00599         float f[4];
00600         int i[4];
00601         } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
00602     float *data[4];
00603     
00604     /* Get r . */
00605     r.v = _mm_load_ps( r_in );
00606     
00607     /* compute the index */
00608     alpha0.v = _mm_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
00609     alpha1.v = _mm_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
00610     alpha2.v = _mm_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
00611     ind.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0.v , _mm_mul_ps( r.v , _mm_add_ps( alpha1.v , _mm_mul_ps( r.v , alpha2.v ) ) ) ) ) );
00612     
00613     /* Check ranges. */
00614     /* for ( j = 0 ; j < 4 ; j++ )
00615         if ( ind.i[j] > p[j]->n ) {
00616             printf( "potential_eval_vec_4single_r: r=%.9e (n=%.9e/%i) is not in [%.9e,%.9e].\n" ,
00617                 r.f[j] , p[j]->alpha[0] + r.f[j]*(p[j]->alpha[1] + r.f[j]*p[j]->alpha[2]) ,
00618                 p[j]->n , p[j]->a , p[j]->b );
00619             fflush(stdout);
00620             } */
00621             
00622     /* get the table offset */
00623     for ( k = 0 ; k < 4 ; k++ )
00624         data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] );
00625     
00626     /* adjust x to the interval */
00627     mi.v = _mm_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
00628     hi.v = _mm_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
00629     x.v = _mm_mul_ps( _mm_sub_ps( r.v , mi.v ) , hi.v );
00630     
00631     /* compute the potential and its derivative */
00632     eff.v = _mm_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
00633     c.v = _mm_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
00634     ee.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , c.v );
00635     for ( j = 4 ; j < potential_chunk ; j++ ) {
00636         c.v = _mm_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
00637         eff.v = _mm_add_ps( _mm_mul_ps( eff.v , x.v ) , ee.v );
00638         ee.v = _mm_add_ps( _mm_mul_ps( ee.v , x.v ) , c.v );
00639         }
00640 
00641     /* store the result */
00642     _mm_store_ps( e , ee.v );
00643     _mm_store_ps( f , _mm_mul_ps( eff.v , hi.v ) );
00644     
00645 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
00646     int j, k;
00647     union {
00648         vector float v;
00649         float f[4];
00650         } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r;
00651     union {
00652         vector unsigned int v;
00653         unsigned int i[4];
00654         } ind;
00655     float *data[4];
00656     
00657     /* Get r . */
00658     r.v = *((vector float *)r_in);
00659     
00660     /* compute the index (vec_ctu maps negative floats to 0) */
00661     alpha0.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
00662     alpha1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
00663     alpha2.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
00664     ind.v = vec_ctu( vec_madd( r.v , vec_madd( r.v , alpha2.v , alpha1.v ) , alpha0.v ) , 0 );
00665     
00666     /* get the table offset */
00667     for ( k = 0 ; k < 4 ; k++ )
00668         data[k] = &( p[k]->c[ ind.i[k] * potential_chunk ] );
00669     
00670     /* adjust x to the interval */
00671     mi.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
00672     hi.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
00673     x.v = vec_mul( vec_sub( r.v , mi.v ) , hi.v );
00674     
00675     /* compute the potential and its derivative */
00676     eff.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
00677     c.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
00678     ee.v = vec_madd( eff.v , x.v , c.v );
00679     for ( j = 4 ; j < potential_chunk ; j++ ) {
00680         c.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
00681         eff.v = vec_madd( eff.v , x.v , ee.v );
00682         ee.v = vec_madd( ee.v , x.v , c.v );
00683         }
00684 
00685     /* store the result */
00686     *((vector float *)e) = ee.v;
00687     *((vector float *)f) = vec_mul( eff.v , hi.v );
00688         
00689 #else
00690     int k;
00691     FPTYPE ee, eff;
00692     for ( k = 0 ; k < 4 ; k++ ) {
00693         potential_eval( p[k] , r_in[k] , &ee , &eff );
00694         e[k] = ee; f[k] = eff;
00695         }
00696 #endif
00697         
00698     }
00699 
00700 
00724 __attribute__ ((always_inline)) INLINE void potential_eval_vec_8single ( struct potential *p[8] , float *r2 , float *e , float *f ) {
00725 
00726 #if defined(__AVX__) && defined(FPTYPE_SINGLE)
00727     int j;
00728     union {
00729         __v8sf v;
00730         __m256i m;
00731         float f[8];
00732         int i[8];
00733         } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
00734     float *data[8];
00735     
00736     /* Get r . */
00737     r.v = _mm256_sqrt_ps( _mm256_load_ps( r2 ) );
00738     
00739     /* compute the index */
00740     alpha0.v = _mm256_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] , p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] );
00741     alpha1.v = _mm256_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] , p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] );
00742     alpha2.v = _mm256_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] , p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] );
00743     ind.m = _mm256_cvttps_epi32( _mm256_max_ps( _mm256_setzero_ps() , _mm256_add_ps( alpha0.v , _mm256_mul_ps( r.v , _mm256_add_ps( alpha1.v , _mm256_mul_ps( r.v , alpha2.v ) ) ) ) ) );
00744     
00745     /* get the table offset */
00746     data[0] = &( p[0]->c[ ind.i[0] * potential_chunk ] );
00747     data[1] = &( p[1]->c[ ind.i[1] * potential_chunk ] );
00748     data[2] = &( p[2]->c[ ind.i[2] * potential_chunk ] );
00749     data[3] = &( p[3]->c[ ind.i[3] * potential_chunk ] );
00750     data[4] = &( p[4]->c[ ind.i[4] * potential_chunk ] );
00751     data[5] = &( p[5]->c[ ind.i[5] * potential_chunk ] );
00752     data[6] = &( p[6]->c[ ind.i[6] * potential_chunk ] );
00753     data[7] = &( p[7]->c[ ind.i[7] * potential_chunk ] );
00754     
00755     /* adjust x to the interval */
00756     mi.v = _mm256_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] , data[4][0] , data[5][0] , data[6][0] , data[7][0] );
00757     hi.v = _mm256_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] , data[4][1] , data[5][1] , data[6][1] , data[7][1] );
00758     x.v = _mm256_mul_ps( _mm256_sub_ps( r.v , mi.v ) , hi.v );
00759     
00760     /* compute the potential and its derivative */
00761     eff.v = _mm256_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] , data[4][2] , data[5][2] , data[6][2] , data[7][2] );
00762     c.v = _mm256_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] , data[4][3] , data[5][3] , data[6][3] , data[7][3] );
00763     ee.v = _mm256_add_ps( _mm256_mul_ps( eff.v , x.v ) , c.v );
00764     for ( j = 4 ; j < potential_chunk ; j++ ) {
00765         c.v = _mm256_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] , data[4][j] , data[5][j] , data[6][j] , data[7][j] );
00766         eff.v = _mm256_add_ps( _mm256_mul_ps( eff.v , x.v ) , ee.v );
00767         ee.v = _mm256_add_ps( _mm256_mul_ps( ee.v , x.v ) , c.v );
00768         }
00769 
00770     /* store the result */
00771     _mm256_store_ps( e , ee.v );
00772     _mm256_store_ps( f , _mm256_mul_ps( eff.v , _mm256_div_ps( hi.v , r.v ) ) );
00773     
00774 #elifif defined(__SSE__) && defined(FPTYPE_SINGLE)
00775     int j;
00776     union {
00777         __v4sf v;
00778         __m128i m;
00779         float f[4];
00780         int i[4];
00781         } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
00782           alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
00783     float *data[8];
00784     
00785     /* Get r . */
00786     r_1.v = _mm_sqrt_ps( _mm_load_ps( &r2[0] ) );
00787     r_2.v = _mm_sqrt_ps( _mm_load_ps( &r2[4] ) );
00788     
00789     /* compute the index */
00790     alpha0_1.v = _mm_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
00791     alpha1_1.v = _mm_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
00792     alpha2_1.v = _mm_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
00793     alpha0_2.v = _mm_setr_ps( p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] );
00794     alpha1_2.v = _mm_setr_ps( p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] );
00795     alpha2_2.v = _mm_setr_ps( p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] );
00796     ind_1.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0_1.v , _mm_mul_ps( r_1.v , _mm_add_ps( alpha1_1.v , _mm_mul_ps( r_1.v , alpha2_1.v ) ) ) ) ) );
00797     ind_2.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0_2.v , _mm_mul_ps( r_2.v , _mm_add_ps( alpha1_2.v , _mm_mul_ps( r_2.v , alpha2_2.v ) ) ) ) ) );
00798     
00799     /* get the table offset */
00800     data[0] = &( p[0]->c[ ind_1.i[0] * potential_chunk ] );
00801     data[1] = &( p[1]->c[ ind_1.i[1] * potential_chunk ] );
00802     data[2] = &( p[2]->c[ ind_1.i[2] * potential_chunk ] );
00803     data[3] = &( p[3]->c[ ind_1.i[3] * potential_chunk ] );
00804     data[4] = &( p[4]->c[ ind_2.i[0] * potential_chunk ] );
00805     data[5] = &( p[5]->c[ ind_2.i[1] * potential_chunk ] );
00806     data[6] = &( p[6]->c[ ind_2.i[2] * potential_chunk ] );
00807     data[7] = &( p[7]->c[ ind_2.i[3] * potential_chunk ] );
00808     
00809     /* adjust x to the interval */
00810     mi_1.v = _mm_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
00811     hi_1.v = _mm_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
00812     mi_2.v = _mm_setr_ps( data[4][0] , data[5][0] , data[6][0] , data[7][0] );
00813     hi_2.v = _mm_setr_ps( data[4][1] , data[5][1] , data[6][1] , data[7][1] );
00814     x_1.v = _mm_mul_ps( _mm_sub_ps( r_1.v , mi_1.v ) , hi_1.v );
00815     x_2.v = _mm_mul_ps( _mm_sub_ps( r_2.v , mi_2.v ) , hi_2.v );
00816     
00817     /* compute the potential and its derivative */
00818     eff_1.v = _mm_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
00819     eff_2.v = _mm_setr_ps( data[4][2] , data[5][2] , data[6][2] , data[7][2] );
00820     c_1.v = _mm_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
00821     c_2.v = _mm_setr_ps( data[4][3] , data[5][3] , data[6][3] , data[7][3] );
00822     ee_1.v = _mm_add_ps( _mm_mul_ps( eff_1.v , x_1.v ) , c_1.v );
00823     ee_2.v = _mm_add_ps( _mm_mul_ps( eff_2.v , x_2.v ) , c_2.v );
00824     for ( j = 4 ; j < potential_chunk ; j++ ) {
00825         c_1.v = _mm_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
00826         c_2.v = _mm_setr_ps( data[4][j] , data[5][j] , data[6][j] , data[7][j] );
00827         eff_1.v = _mm_add_ps( _mm_mul_ps( eff_1.v , x_1.v ) , ee_1.v );
00828         eff_2.v = _mm_add_ps( _mm_mul_ps( eff_2.v , x_2.v ) , ee_2.v );
00829         ee_1.v = _mm_add_ps( _mm_mul_ps( ee_1.v , x_1.v ) , c_1.v );
00830         ee_2.v = _mm_add_ps( _mm_mul_ps( ee_2.v , x_2.v ) , c_2.v );
00831         }
00832 
00833     /* store the result */
00834     _mm_store_ps( &e[0] , ee_1.v );
00835     _mm_store_ps( &e[4] , ee_2.v );
00836     _mm_store_ps( &f[0] , _mm_mul_ps( eff_1.v , _mm_div_ps( hi_1.v , r_1.v ) ) );
00837     _mm_store_ps( &f[4] , _mm_mul_ps( eff_2.v , _mm_div_ps( hi_2.v , r_2.v ) ) );
00838     
00839 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
00840     int j;
00841     union {
00842         vector float v;
00843         vector unsigned int m;
00844         float f[4];
00845         unsigned int i[4];
00846         } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
00847           alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
00848     float *data[8];
00849     
00850     /* Get r . */
00851     r_1.v = vec_sqrt( *((vector float *)&r2[0]) );
00852     r_2.v = vec_sqrt( *((vector float *)&r2[4]) );
00853     
00854     /* compute the index */
00855     alpha0_1.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
00856     alpha1_1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
00857     alpha2_1.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
00858     alpha0_2.v = vec_load4( p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] );
00859     alpha1_2.v = vec_load4( p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] );
00860     alpha2_2.v = vec_load4( p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] );
00861     ind_1.m = vec_ctu( vec_madd( r_1.v , vec_madd( r_1.v , alpha2_1.v , alpha1_1.v ) , alpha0_1.v ) , 0 );
00862     ind_2.m = vec_ctu( vec_madd( r_2.v , vec_madd( r_2.v , alpha2_2.v , alpha1_2.v ) , alpha0_2.v ) , 0 );
00863     
00864     /* get the table offset */
00865     data[0] = &( p[0]->c[ ind_1.i[0] * potential_chunk ] );
00866     data[1] = &( p[1]->c[ ind_1.i[1] * potential_chunk ] );
00867     data[2] = &( p[2]->c[ ind_1.i[2] * potential_chunk ] );
00868     data[3] = &( p[3]->c[ ind_1.i[3] * potential_chunk ] );
00869     data[4] = &( p[4]->c[ ind_2.i[0] * potential_chunk ] );
00870     data[5] = &( p[5]->c[ ind_2.i[1] * potential_chunk ] );
00871     data[6] = &( p[6]->c[ ind_2.i[2] * potential_chunk ] );
00872     data[7] = &( p[7]->c[ ind_2.i[3] * potential_chunk ] );
00873     
00874     /* adjust x to the interval */
00875     mi_1.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
00876     hi_1.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
00877     mi_2.v = vec_load4( data[4][0] , data[5][0] , data[6][0] , data[7][0] );
00878     hi_2.v = vec_load4( data[4][1] , data[5][1] , data[6][1] , data[7][1] );
00879     x_1.v = vec_mul( vec_sub( r_1.v , mi_1.v ) , hi_1.v );
00880     x_2.v = vec_mul( vec_sub( r_2.v , mi_2.v ) , hi_2.v );
00881     
00882     /* compute the potential and its derivative */
00883     eff_1.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
00884     eff_2.v = vec_load4( data[4][2] , data[5][2] , data[6][2] , data[7][2] );
00885     c_1.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
00886     c_2.v = vec_load4( data[4][3] , data[5][3] , data[6][3] , data[7][3] );
00887     ee_1.v = vec_madd( eff_1.v , x_1.v , c_1.v );
00888     ee_2.v = vec_madd( eff_2.v , x_2.v , c_2.v );
00889     for ( j = 4 ; j < potential_chunk ; j++ ) {
00890         c_1.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
00891         c_2.v = vec_load4( data[4][j] , data[5][j] , data[6][j] , data[7][j] );
00892         eff_1.v = vec_madd( eff_1.v , x_1.v , ee_1.v );
00893         eff_2.v = vec_madd( eff_2.v , x_2.v , ee_2.v );
00894         ee_1.v = vec_madd( ee_1.v , x_1.v , c_1.v );
00895         ee_2.v = vec_madd( ee_2.v , x_2.v , c_2.v );
00896         }
00897 
00898     /* store the result */
00899     eff_1.v = vec_mul( eff_1.v , vec_div( hi_1.v , r_1.v ) );
00900     eff_2.v = vec_mul( eff_2.v , vec_div( hi_2.v , r_2.v ) );
00901     memcpy( &e[0] , &ee_1 , sizeof(vector float) );
00902     memcpy( &f[0] , &eff_1 , sizeof(vector float) );
00903     memcpy( &e[4] , &ee_2 , sizeof(vector float) );
00904     memcpy( &f[4] , &eff_2 , sizeof(vector float) );
00905     
00906 #else
00907     int k;
00908     FPTYPE ee, eff;
00909     for ( k = 0 ; k < 8 ; k++ ) {
00910         potential_eval( p[k] , r2[k] , &ee , &eff );
00911         e[k] = ee; f[k] = eff;
00912         }
00913 #endif
00914         
00915     }
00916 
00917 
00918 __attribute__ ((always_inline)) INLINE void potential_eval_vec_8single_r ( struct potential *p[8] , float *r2 , float *e , float *f ) {
00919 
00920 #if defined(__AVX__) && defined(FPTYPE_SINGLE)
00921     int j;
00922     union {
00923         __v8sf v;
00924         __m256i m;
00925         float f[8];
00926         int i[8];
00927         } alpha0, alpha1, alpha2, mi, hi, x, ee, eff, c, r, ind;
00928     float *data[8];
00929     
00930     /* Get r . */
00931     r.v = _mm256_load_ps( r2 );
00932     
00933     /* compute the index */
00934     alpha0.v = _mm256_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] , p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] );
00935     alpha1.v = _mm256_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] , p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] );
00936     alpha2.v = _mm256_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] , p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] );
00937     ind.m = _mm256_cvttps_epi32( _mm256_max_ps( _mm256_setzero_ps() , _mm256_add_ps( alpha0.v , _mm256_mul_ps( r.v , _mm256_add_ps( alpha1.v , _mm256_mul_ps( r.v , alpha2.v ) ) ) ) ) );
00938     
00939     /* get the table offset */
00940     data[0] = &( p[0]->c[ ind.i[0] * potential_chunk ] );
00941     data[1] = &( p[1]->c[ ind.i[1] * potential_chunk ] );
00942     data[2] = &( p[2]->c[ ind.i[2] * potential_chunk ] );
00943     data[3] = &( p[3]->c[ ind.i[3] * potential_chunk ] );
00944     data[4] = &( p[4]->c[ ind.i[4] * potential_chunk ] );
00945     data[5] = &( p[5]->c[ ind.i[5] * potential_chunk ] );
00946     data[6] = &( p[6]->c[ ind.i[6] * potential_chunk ] );
00947     data[7] = &( p[7]->c[ ind.i[7] * potential_chunk ] );
00948     
00949     /* adjust x to the interval */
00950     mi.v = _mm256_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] , data[4][0] , data[5][0] , data[6][0] , data[7][0] );
00951     hi.v = _mm256_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] , data[4][1] , data[5][1] , data[6][1] , data[7][1] );
00952     x.v = _mm256_mul_ps( _mm256_sub_ps( r.v , mi.v ) , hi.v );
00953     
00954     /* compute the potential and its derivative */
00955     eff.v = _mm256_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] , data[4][2] , data[5][2] , data[6][2] , data[7][2] );
00956     c.v = _mm256_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] , data[4][3] , data[5][3] , data[6][3] , data[7][3] );
00957     ee.v = _mm256_add_ps( _mm256_mul_ps( eff.v , x.v ) , c.v );
00958     for ( j = 4 ; j < potential_chunk ; j++ ) {
00959         c.v = _mm256_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] , data[4][j] , data[5][j] , data[6][j] , data[7][j] );
00960         eff.v = _mm256_add_ps( _mm256_mul_ps( eff.v , x.v ) , ee.v );
00961         ee.v = _mm256_add_ps( _mm256_mul_ps( ee.v , x.v ) , c.v );
00962         }
00963 
00964     /* store the result */
00965     _mm256_store_ps( e , ee.v );
00966     _mm256_store_ps( f , _mm256_mul_ps( eff.v , hi.v ) );
00967     
00968 #elifif defined(__SSE__) && defined(FPTYPE_SINGLE)
00969     int j;
00970     union {
00971         __v4sf v;
00972         __m128i m;
00973         float f[4];
00974         int i[4];
00975         } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
00976           alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
00977     float *data[8];
00978     
00979     /* Get r . */
00980     r_1.v = _mm_load_ps( &r2[0] );
00981     r_2.v = _mm_load_ps( &r2[4] );
00982     
00983     /* compute the index */
00984     alpha0_1.v = _mm_setr_ps( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
00985     alpha1_1.v = _mm_setr_ps( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
00986     alpha2_1.v = _mm_setr_ps( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
00987     alpha0_2.v = _mm_setr_ps( p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] );
00988     alpha1_2.v = _mm_setr_ps( p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] );
00989     alpha2_2.v = _mm_setr_ps( p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] );
00990     ind_1.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0_1.v , _mm_mul_ps( r_1.v , _mm_add_ps( alpha1_1.v , _mm_mul_ps( r_1.v , alpha2_1.v ) ) ) ) ) );
00991     ind_2.m = _mm_cvttps_epi32( _mm_max_ps( _mm_setzero_ps() , _mm_add_ps( alpha0_2.v , _mm_mul_ps( r_2.v , _mm_add_ps( alpha1_2.v , _mm_mul_ps( r_2.v , alpha2_2.v ) ) ) ) ) );
00992     
00993     /* get the table offset */
00994     data[0] = &( p[0]->c[ ind_1.i[0] * potential_chunk ] );
00995     data[1] = &( p[1]->c[ ind_1.i[1] * potential_chunk ] );
00996     data[2] = &( p[2]->c[ ind_1.i[2] * potential_chunk ] );
00997     data[3] = &( p[3]->c[ ind_1.i[3] * potential_chunk ] );
00998     data[4] = &( p[4]->c[ ind_2.i[0] * potential_chunk ] );
00999     data[5] = &( p[5]->c[ ind_2.i[1] * potential_chunk ] );
01000     data[6] = &( p[6]->c[ ind_2.i[2] * potential_chunk ] );
01001     data[7] = &( p[7]->c[ ind_2.i[3] * potential_chunk ] );
01002     
01003     /* adjust x to the interval */
01004     mi_1.v = _mm_setr_ps( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
01005     hi_1.v = _mm_setr_ps( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
01006     mi_2.v = _mm_setr_ps( data[4][0] , data[5][0] , data[6][0] , data[7][0] );
01007     hi_2.v = _mm_setr_ps( data[4][1] , data[5][1] , data[6][1] , data[7][1] );
01008     x_1.v = _mm_mul_ps( _mm_sub_ps( r_1.v , mi_1.v ) , hi_1.v );
01009     x_2.v = _mm_mul_ps( _mm_sub_ps( r_2.v , mi_2.v ) , hi_2.v );
01010     
01011     /* compute the potential and its derivative */
01012     eff_1.v = _mm_setr_ps( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
01013     eff_2.v = _mm_setr_ps( data[4][2] , data[5][2] , data[6][2] , data[7][2] );
01014     c_1.v = _mm_setr_ps( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
01015     c_2.v = _mm_setr_ps( data[4][3] , data[5][3] , data[6][3] , data[7][3] );
01016     ee_1.v = _mm_add_ps( _mm_mul_ps( eff_1.v , x_1.v ) , c_1.v );
01017     ee_2.v = _mm_add_ps( _mm_mul_ps( eff_2.v , x_2.v ) , c_2.v );
01018     for ( j = 4 ; j < potential_chunk ; j++ ) {
01019         c_1.v = _mm_setr_ps( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
01020         c_2.v = _mm_setr_ps( data[4][j] , data[5][j] , data[6][j] , data[7][j] );
01021         eff_1.v = _mm_add_ps( _mm_mul_ps( eff_1.v , x_1.v ) , ee_1.v );
01022         eff_2.v = _mm_add_ps( _mm_mul_ps( eff_2.v , x_2.v ) , ee_2.v );
01023         ee_1.v = _mm_add_ps( _mm_mul_ps( ee_1.v , x_1.v ) , c_1.v );
01024         ee_2.v = _mm_add_ps( _mm_mul_ps( ee_2.v , x_2.v ) , c_2.v );
01025         }
01026 
01027     /* store the result */
01028     _mm_store_ps( &e[0] , ee_1.v );
01029     _mm_store_ps( &e[4] , ee_2.v );
01030     _mm_store_ps( &f[0] , _mm_mul_ps( eff_1.v , hi_1.v ) );
01031     _mm_store_ps( &f[4] , _mm_mul_ps( eff_2.v , hi_2.v ) );
01032     
01033 #elif defined(__ALTIVEC__) && defined(FPTYPE_SINGLE)
01034     int j;
01035     union {
01036         vector float v;
01037         vector unsigned int m;
01038         float f[4];
01039         unsigned int i[4];
01040         } alpha0_1, alpha1_1, alpha2_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1, ind_1,
01041           alpha0_2, alpha1_2, alpha2_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2, ind_2;
01042     float *data[8];
01043     
01044     /* Get r . */
01045     r_1.v = *((vector float *)&r2[0]);
01046     r_2.v = *((vector float *)&r2[4]);
01047     
01048     /* compute the index */
01049     alpha0_1.v = vec_load4( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
01050     alpha1_1.v = vec_load4( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
01051     alpha2_1.v = vec_load4( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
01052     alpha0_2.v = vec_load4( p[4]->alpha[0] , p[5]->alpha[0] , p[6]->alpha[0] , p[7]->alpha[0] );
01053     alpha1_2.v = vec_load4( p[4]->alpha[1] , p[5]->alpha[1] , p[6]->alpha[1] , p[7]->alpha[1] );
01054     alpha2_2.v = vec_load4( p[4]->alpha[2] , p[5]->alpha[2] , p[6]->alpha[2] , p[7]->alpha[2] );
01055     ind_1.m = vec_ctu( vec_madd( r_1.v , vec_madd( r_1.v , alpha2_1.v , alpha1_1.v ) , alpha0_1.v ) , 0 );
01056     ind_2.m = vec_ctu( vec_madd( r_2.v , vec_madd( r_2.v , alpha2_2.v , alpha1_2.v ) , alpha0_2.v ) , 0 );
01057     
01058     /* get the table offset */
01059     data[0] = &( p[0]->c[ ind_1.i[0] * potential_chunk ] );
01060     data[1] = &( p[1]->c[ ind_1.i[1] * potential_chunk ] );
01061     data[2] = &( p[2]->c[ ind_1.i[2] * potential_chunk ] );
01062     data[3] = &( p[3]->c[ ind_1.i[3] * potential_chunk ] );
01063     data[4] = &( p[4]->c[ ind_2.i[0] * potential_chunk ] );
01064     data[5] = &( p[5]->c[ ind_2.i[1] * potential_chunk ] );
01065     data[6] = &( p[6]->c[ ind_2.i[2] * potential_chunk ] );
01066     data[7] = &( p[7]->c[ ind_2.i[3] * potential_chunk ] );
01067     
01068     /* adjust x to the interval */
01069     mi_1.v = vec_load4( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
01070     hi_1.v = vec_load4( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
01071     mi_2.v = vec_load4( data[4][0] , data[5][0] , data[6][0] , data[7][0] );
01072     hi_2.v = vec_load4( data[4][1] , data[5][1] , data[6][1] , data[7][1] );
01073     x_1.v = vec_mul( vec_sub( r_1.v , mi_1.v ) , hi_1.v );
01074     x_2.v = vec_mul( vec_sub( r_2.v , mi_2.v ) , hi_2.v );
01075     
01076     /* compute the potential and its derivative */
01077     eff_1.v = vec_load4( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
01078     eff_2.v = vec_load4( data[4][2] , data[5][2] , data[6][2] , data[7][2] );
01079     c_1.v = vec_load4( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
01080     c_2.v = vec_load4( data[4][3] , data[5][3] , data[6][3] , data[7][3] );
01081     ee_1.v = vec_madd( eff_1.v , x_1.v , c_1.v );
01082     ee_2.v = vec_madd( eff_2.v , x_2.v , c_2.v );
01083     for ( j = 4 ; j < potential_chunk ; j++ ) {
01084         c_1.v = vec_load4( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
01085         c_2.v = vec_load4( data[4][j] , data[5][j] , data[6][j] , data[7][j] );
01086         eff_1.v = vec_madd( eff_1.v , x_1.v , ee_1.v );
01087         eff_2.v = vec_madd( eff_2.v , x_2.v , ee_2.v );
01088         ee_1.v = vec_madd( ee_1.v , x_1.v , c_1.v );
01089         ee_2.v = vec_madd( ee_2.v , x_2.v , c_2.v );
01090         }
01091 
01092     /* store the result */
01093     eff_1.v = vec_mul( eff_1.v , hi_1.v );
01094     eff_2.v = vec_mul( eff_2.v , hi_2.v );
01095     memcpy( &e[0] , &ee_1 , sizeof(vector float) );
01096     memcpy( &f[0] , &eff_1 , sizeof(vector float) );
01097     memcpy( &e[4] , &ee_2 , sizeof(vector float) );
01098     memcpy( &f[4] , &eff_2 , sizeof(vector float) );
01099     
01100 #else
01101     int k;
01102     FPTYPE ee, eff;
01103     for ( k = 0 ; k < 8 ; k++ ) {
01104         potential_eval( p[k] , r2[k] , &ee , &eff );
01105         e[k] = ee; f[k] = eff;
01106         }
01107 #endif
01108         
01109     }
01110 
01111 
01135 __attribute__ ((always_inline)) INLINE void potential_eval_vec_2double ( struct potential *p[2] , FPTYPE *r2 , FPTYPE *e , FPTYPE *f ) {
01136 
01137 #if defined(__SSE2__) && defined(FPTYPE_DOUBLE)
01138     int ind[2], j;
01139     union {
01140         __v2df v;
01141         double f[2];
01142         } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r;
01143     double *data[2];
01144     
01145     /* Get r . */
01146     r.v = _mm_sqrt_pd( _mm_load_pd( r2 ) );
01147     
01148     /* compute the index */
01149     alpha0.v = _mm_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] );
01150     alpha1.v = _mm_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] );
01151     alpha2.v = _mm_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] );
01152     rind.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0.v , _mm_mul_pd( r.v , _mm_add_pd( alpha1.v , _mm_mul_pd( r.v , alpha2.v ) ) ) ) );
01153     ind[0] = rind.f[0];
01154     ind[1] = rind.f[1];
01155     
01156     /* get the table offset */
01157     data[0] = &( p[0]->c[ ind[0] * potential_chunk ] );
01158     data[1] = &( p[1]->c[ ind[1] * potential_chunk ] );
01159     
01160     /* adjust x to the interval */
01161     mi.v = _mm_setr_pd( data[0][0] , data[1][0] );
01162     hi.v = _mm_setr_pd( data[0][1] , data[1][1] );
01163     x.v = _mm_mul_pd( _mm_sub_pd( r.v , mi.v ) , hi.v );
01164     
01165     /* compute the potential and its derivative */
01166     eff.v = _mm_setr_pd( data[0][2] , data[1][2] );
01167     c.v = _mm_setr_pd( data[0][3] , data[1][3] );
01168     ee.v = _mm_add_pd( _mm_mul_pd( eff.v , x.v ) , c.v );
01169     for ( j = 4 ; j < potential_chunk ; j++ ) {
01170         c.v = _mm_setr_pd( data[0][j] , data[1][j] );
01171         eff.v = _mm_add_pd( _mm_mul_pd( eff.v , x.v ) , ee.v );
01172         ee.v = _mm_add_pd( _mm_mul_pd( ee.v , x.v ) , c.v );
01173         }
01174 
01175     /* store the result */
01176     _mm_store_pd( e , ee.v );
01177     _mm_store_pd( f , _mm_mul_pd( eff.v , _mm_div_pd( hi.v , r.v ) ) );
01178         
01179 #else
01180     int k;
01181     for ( k = 0 ; k < 2 ; k++ )
01182         potential_eval( p[k] , r2[k] , &e[k] , &f[k] );
01183 #endif
01184         
01185     }
01186 
01187 
01211 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4double ( struct potential *p[4] , FPTYPE *r2 , FPTYPE *e , FPTYPE *f ) {
01212 
01213 #if defined(__AVX__) && defined(FPTYPE_DOUBLE)
01214     int ind[4], j;
01215     union {
01216         __v4df v;
01217         double f[4];
01218         } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r;
01219     double *data[4];
01220     
01221     /* Get r . */
01222     r.v = _mm256_sqrt_pd( _mm256_load_pd( r2 ) );
01223     
01224     /* compute the index */
01225     alpha0.v = _mm256_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
01226     alpha1.v = _mm256_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
01227     alpha2.v = _mm256_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
01228     rind.v = _mm256_max_pd( _mm256_setzero_pd() , _mm256_add_pd( alpha0.v , _mm256_mul_pd( r.v , _mm256_add_pd( alpha1.v , _mm256_mul_pd( r.v , alpha2.v ) ) ) ) );
01229     ind[0] = rind.f[0];
01230     ind[1] = rind.f[1];
01231     ind[3] = rind.f[3];
01232     ind[4] = rind.f[4];
01233     
01234     /* get the table offset */
01235     data[0] = &( p[0]->c[ ind[0] * potential_chunk ] );
01236     data[1] = &( p[1]->c[ ind[1] * potential_chunk ] );
01237     data[2] = &( p[2]->c[ ind[2] * potential_chunk ] );
01238     data[3] = &( p[3]->c[ ind[3] * potential_chunk ] );
01239     
01240     /* adjust x to the interval */
01241     mi.v = _mm256_setr_pd( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
01242     hi.v = _mm256_setr_pd( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
01243     x.v = _mm256_mul_pd( _mm256_sub_pd( r.v , mi.v ) , hi.v );
01244     
01245     /* compute the potential and its derivative */
01246     eff.v = _mm256_setr_pd( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
01247     c.v = _mm256_setr_pd( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
01248     ee.v = _mm256_add_pd( _mm256_mul_pd( eff.v , x.v ) , c.v );
01249     for ( j = 4 ; j < potential_chunk ; j++ ) {
01250         c.v = _mm256_setr_pd( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
01251         eff.v = _mm256_add_pd( _mm256_mul_pd( eff.v , x.v ) , ee.v );
01252         ee.v = _mm256_add_pd( _mm256_mul_pd( ee.v , x.v ) , c.v );
01253         }
01254 
01255     /* store the result */
01256     _mm256_store_pd( e , ee.v );
01257     _mm256_store_pd( f , _mm256_mul_pd( eff.v , _mm256_div_pd( hi.v , r.v ) ) );
01258         
01259 #elif defined(__SSE2__) && defined(FPTYPE_DOUBLE)
01260     int ind[4], j;
01261     union {
01262         __v2df v;
01263         double f[2];
01264         } alpha0_1, alpha1_1, alpha2_1, rind_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1,
01265         alpha0_2, alpha1_2, alpha2_2, rind_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2;
01266     double *data[4];
01267     
01268     /* Get r . */
01269     r_1.v = _mm_sqrt_pd( _mm_load_pd( &r2[0] ) );
01270     r_2.v = _mm_sqrt_pd( _mm_load_pd( &r2[2] ) );
01271     
01272     /* compute the index */
01273     alpha0_1.v = _mm_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] );
01274     alpha1_1.v = _mm_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] );
01275     alpha2_1.v = _mm_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] );
01276     alpha0_2.v = _mm_setr_pd( p[2]->alpha[0] , p[3]->alpha[0] );
01277     alpha1_2.v = _mm_setr_pd( p[2]->alpha[1] , p[3]->alpha[1] );
01278     alpha2_2.v = _mm_setr_pd( p[2]->alpha[2] , p[3]->alpha[2] );
01279     rind_1.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0_1.v , _mm_mul_pd( r_1.v , _mm_add_pd( alpha1_1.v , _mm_mul_pd( r_1.v , alpha2_1.v ) ) ) ) );
01280     rind_2.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0_2.v , _mm_mul_pd( r_2.v , _mm_add_pd( alpha1_2.v , _mm_mul_pd( r_2.v , alpha2_2.v ) ) ) ) );
01281     ind[0] = rind_1.f[0];
01282     ind[1] = rind_1.f[1];
01283     ind[2] = rind_2.f[0];
01284     ind[3] = rind_2.f[1];
01285     
01286     /* for ( j = 0 ; j < 4 ; j++ )
01287         if ( ind[j] > p[j]->n ) {
01288             printf("potential_eval_vec_4double: dookie.\n");
01289             fflush(stdout);
01290             } */
01291     
01292     /* get the table offset */
01293     data[0] = &( p[0]->c[ ind[0] * potential_chunk ] );
01294     data[1] = &( p[1]->c[ ind[1] * potential_chunk ] );
01295     data[2] = &( p[2]->c[ ind[2] * potential_chunk ] );
01296     data[3] = &( p[3]->c[ ind[3] * potential_chunk ] );
01297     
01298     /* adjust x to the interval */
01299     mi_1.v = _mm_setr_pd( data[0][0] , data[1][0] );
01300     hi_1.v = _mm_setr_pd( data[0][1] , data[1][1] );
01301     mi_2.v = _mm_setr_pd( data[2][0] , data[3][0] );
01302     hi_2.v = _mm_setr_pd( data[2][1] , data[3][1] );
01303     x_1.v = _mm_mul_pd( _mm_sub_pd( r_1.v , mi_1.v ) , hi_1.v );
01304     x_2.v = _mm_mul_pd( _mm_sub_pd( r_2.v , mi_2.v ) , hi_2.v );
01305     
01306     /* compute the potential and its derivative */
01307     eff_1.v = _mm_setr_pd( data[0][2] , data[1][2] );
01308     eff_2.v = _mm_setr_pd( data[2][2] , data[3][2] );
01309     c_1.v = _mm_setr_pd( data[0][3] , data[1][3] );
01310     c_2.v = _mm_setr_pd( data[2][3] , data[3][3] );
01311     ee_1.v = _mm_add_pd( _mm_mul_pd( eff_1.v , x_1.v ) , c_1.v );
01312     ee_2.v = _mm_add_pd( _mm_mul_pd( eff_2.v , x_2.v ) , c_2.v );
01313     for ( j = 4 ; j < potential_chunk ; j++ ) {
01314         c_1.v = _mm_setr_pd( data[0][j] , data[1][j] );
01315         c_2.v = _mm_setr_pd( data[2][j] , data[3][j] );
01316         eff_1.v = _mm_add_pd( _mm_mul_pd( eff_1.v , x_1.v ) , ee_1.v );
01317         eff_2.v = _mm_add_pd( _mm_mul_pd( eff_2.v , x_2.v ) , ee_2.v );
01318         ee_1.v = _mm_add_pd( _mm_mul_pd( ee_1.v , x_1.v ) , c_1.v );
01319         ee_2.v = _mm_add_pd( _mm_mul_pd( ee_2.v , x_2.v ) , c_2.v );
01320         }
01321 
01322     /* store the result */
01323     _mm_store_pd( &e[0] , ee_1.v );
01324     _mm_store_pd( &f[0] , _mm_mul_pd( eff_1.v , _mm_div_pd( hi_1.v , r_1.v ) ) );
01325     _mm_store_pd( &e[2] , ee_2.v );
01326     _mm_store_pd( &f[2] , _mm_mul_pd( eff_2.v , _mm_div_pd( hi_2.v , r_2.v ) ) );
01327         
01328 #else
01329     int k;
01330     for ( k = 0 ; k < 4 ; k++ )
01331         potential_eval( p[k] , r2[k] , &e[k] , &f[k] );
01332 #endif
01333         
01334     }
01335 
01336 
01360 __attribute__ ((always_inline)) INLINE void potential_eval_vec_4double_r ( struct potential *p[4] , FPTYPE *r , FPTYPE *e , FPTYPE *f ) {
01361 
01362 #if defined(__AVX__) && defined(FPTYPE_DOUBLE)
01363     int ind[4], j;
01364     union {
01365         __v4df v;
01366         double f[4];
01367         } alpha0, alpha1, alpha2, rind, mi, hi, x, ee, eff, c, r;
01368     double *data[4];
01369     
01370     /* Get r . */
01371     r.v = _mm256_load_pd( r2 );
01372     
01373     /* compute the index */
01374     alpha0.v = _mm256_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] , p[2]->alpha[0] , p[3]->alpha[0] );
01375     alpha1.v = _mm256_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] , p[2]->alpha[1] , p[3]->alpha[1] );
01376     alpha2.v = _mm256_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] , p[2]->alpha[2] , p[3]->alpha[2] );
01377     rind.v = _mm256_max_pd( _mm256_setzero_pd() , _mm256_add_pd( alpha0.v , _mm256_mul_pd( r.v , _mm256_add_pd( alpha1.v , _mm256_mul_pd( r.v , alpha2.v ) ) ) ) );
01378     ind[0] = rind.f[0];
01379     ind[1] = rind.f[1];
01380     ind[3] = rind.f[3];
01381     ind[4] = rind.f[4];
01382     
01383     /* get the table offset */
01384     data[0] = &( p[0]->c[ ind[0] * potential_chunk ] );
01385     data[1] = &( p[1]->c[ ind[1] * potential_chunk ] );
01386     data[2] = &( p[2]->c[ ind[2] * potential_chunk ] );
01387     data[3] = &( p[3]->c[ ind[3] * potential_chunk ] );
01388     
01389     /* adjust x to the interval */
01390     mi.v = _mm256_setr_pd( data[0][0] , data[1][0] , data[2][0] , data[3][0] );
01391     hi.v = _mm256_setr_pd( data[0][1] , data[1][1] , data[2][1] , data[3][1] );
01392     x.v = _mm256_mul_pd( _mm256_sub_pd( r.v , mi.v ) , hi.v );
01393     
01394     /* compute the potential and its derivative */
01395     eff.v = _mm256_setr_pd( data[0][2] , data[1][2] , data[2][2] , data[3][2] );
01396     c.v = _mm256_setr_pd( data[0][3] , data[1][3] , data[2][3] , data[3][3] );
01397     ee.v = _mm256_add_pd( _mm256_mul_pd( eff.v , x.v ) , c.v );
01398     for ( j = 4 ; j < potential_chunk ; j++ ) {
01399         c.v = _mm256_setr_pd( data[0][j] , data[1][j] , data[2][j] , data[3][j] );
01400         eff.v = _mm256_add_pd( _mm256_mul_pd( eff.v , x.v ) , ee.v );
01401         ee.v = _mm256_add_pd( _mm256_mul_pd( ee.v , x.v ) , c.v );
01402         }
01403 
01404     /* store the result */
01405     _mm256_store_pd( e , ee.v );
01406     _mm256_store_pd( f , _mm256_mul_pd( eff.v , hi.v ) );
01407         
01408 #elif defined(__SSE2__) && defined(FPTYPE_DOUBLE)
01409     int ind[4], j;
01410     union {
01411         __v2df v;
01412         double f[2];
01413         } alpha0_1, alpha1_1, alpha2_1, rind_1, mi_1, hi_1, x_1, ee_1, eff_1, c_1, r_1,
01414         alpha0_2, alpha1_2, alpha2_2, rind_2, mi_2, hi_2, x_2, ee_2, eff_2, c_2, r_2;
01415     double *data[4];
01416     
01417     /* Get r . */
01418     r_1.v = _mm_load_pd( &r[0] );
01419     r_2.v = _mm_load_pd( &r[2] );
01420     
01421     /* compute the index */
01422     alpha0_1.v = _mm_setr_pd( p[0]->alpha[0] , p[1]->alpha[0] );
01423     alpha1_1.v = _mm_setr_pd( p[0]->alpha[1] , p[1]->alpha[1] );
01424     alpha2_1.v = _mm_setr_pd( p[0]->alpha[2] , p[1]->alpha[2] );
01425     alpha0_2.v = _mm_setr_pd( p[2]->alpha[0] , p[3]->alpha[0] );
01426     alpha1_2.v = _mm_setr_pd( p[2]->alpha[1] , p[3]->alpha[1] );
01427     alpha2_2.v = _mm_setr_pd( p[2]->alpha[2] , p[3]->alpha[2] );
01428     rind_1.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0_1.v , _mm_mul_pd( r_1.v , _mm_add_pd( alpha1_1.v , _mm_mul_pd( r_1.v , alpha2_1.v ) ) ) ) );
01429     rind_2.v = _mm_max_pd( _mm_setzero_pd() , _mm_add_pd( alpha0_2.v , _mm_mul_pd( r_2.v , _mm_add_pd( alpha1_2.v , _mm_mul_pd( r_2.v , alpha2_2.v ) ) ) ) );
01430     ind[0] = rind_1.f[0];
01431     ind[1] = rind_1.f[1];
01432     ind[2] = rind_2.f[0];
01433     ind[3] = rind_2.f[1];
01434     
01435     /* for ( j = 0 ; j < 4 ; j++ )
01436         if ( ind[j] > p[j]->n ) {
01437             printf("potential_eval_vec_4double: dookie.\n");
01438             fflush(stdout);
01439             } */
01440     
01441     /* get the table offset */
01442     data[0] = &( p[0]->c[ ind[0] * potential_chunk ] );
01443     data[1] = &( p[1]->c[ ind[1] * potential_chunk ] );
01444     data[2] = &( p[2]->c[ ind[2] * potential_chunk ] );
01445     data[3] = &( p[3]->c[ ind[3] * potential_chunk ] );
01446     
01447     /* adjust x to the interval */
01448     mi_1.v = _mm_setr_pd( data[0][0] , data[1][0] );
01449     hi_1.v = _mm_setr_pd( data[0][1] , data[1][1] );
01450     mi_2.v = _mm_setr_pd( data[2][0] , data[3][0] );
01451     hi_2.v = _mm_setr_pd( data[2][1] , data[3][1] );
01452     x_1.v = _mm_mul_pd( _mm_sub_pd( r_1.v , mi_1.v ) , hi_1.v );
01453     x_2.v = _mm_mul_pd( _mm_sub_pd( r_2.v , mi_2.v ) , hi_2.v );
01454     
01455     /* compute the potential and its derivative */
01456     eff_1.v = _mm_setr_pd( data[0][2] , data[1][2] );
01457     eff_2.v = _mm_setr_pd( data[2][2] , data[3][2] );
01458     c_1.v = _mm_setr_pd( data[0][3] , data[1][3] );
01459     c_2.v = _mm_setr_pd( data[2][3] , data[3][3] );
01460     ee_1.v = _mm_add_pd( _mm_mul_pd( eff_1.v , x_1.v ) , c_1.v );
01461     ee_2.v = _mm_add_pd( _mm_mul_pd( eff_2.v , x_2.v ) , c_2.v );
01462     for ( j = 4 ; j < potential_chunk ; j++ ) {
01463         c_1.v = _mm_setr_pd( data[0][j] , data[1][j] );
01464         c_2.v = _mm_setr_pd( data[2][j] , data[3][j] );
01465         eff_1.v = _mm_add_pd( _mm_mul_pd( eff_1.v , x_1.v ) , ee_1.v );
01466         eff_2.v = _mm_add_pd( _mm_mul_pd( eff_2.v , x_2.v ) , ee_2.v );
01467         ee_1.v = _mm_add_pd( _mm_mul_pd( ee_1.v , x_1.v ) , c_1.v );
01468         ee_2.v = _mm_add_pd( _mm_mul_pd( ee_2.v , x_2.v ) , c_2.v );
01469         }
01470 
01471     /* store the result */
01472     _mm_store_pd( &e[0] , ee_1.v );
01473     _mm_store_pd( &f[0] , _mm_mul_pd( eff_1.v , hi_1.v ) );
01474     _mm_store_pd( &e[2] , ee_2.v );
01475     _mm_store_pd( &f[2] , _mm_mul_pd( eff_2.v , hi_2.v ) );
01476         
01477 #else
01478     int k;
01479     for ( k = 0 ; k < 4 ; k++ )
01480         potential_eval_r( p[k] , r[k] , &e[k] , &f[k] );
01481 #endif
01482         
01483     }
01484 
01485 
 All Data Structures Files Functions Variables Typedefs Enumerator Defines