mdcore  0.1.5
Defines | Functions | Variables
/home/pedro/work/mdcore/src/runner_cuda.cu File Reference
#include "../config.h"
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include <limits.h>
#include "cutil_math.h"
#include <mpi.h>
#include "cycle.h"
#include "errs.h"
#include "fptype.h"
#include "lock.h"
#include "part.h"
#include "cell.h"
#include "fifo.h"
#include "space.h"
#include "potential.h"
#include "engine.h"
#include "runner.h"
#include "runner_cuda.h"
#include "runner_cuda_main.h"

Defines

#define FPTYPE_SINGLE   1
#define error(id)   ( engine_err = errs_register( id , engine_err_msg[-(id)] , __LINE__ , __FUNCTION__ , __FILE__ ) )
#define cuda_error(id)   ( engine_err = errs_register( id , cudaGetErrorString(cudaGetLastError()) , __LINE__ , __FUNCTION__ , __FILE__ ) )
#define cuda_nrparts   32
#define cuda_nrparts   64
#define cuda_nrparts   96
#define cuda_nrparts   128
#define cuda_nrparts   160
#define cuda_nrparts   192
#define cuda_nrparts   224
#define cuda_nrparts   256
#define cuda_nrparts   288
#define cuda_nrparts   320
#define cuda_nrparts   352
#define cuda_nrparts   384
#define cuda_nrparts   416
#define cuda_nrparts   448
#define cuda_nrparts   480

Functions

__device__ void cuda_mutex_lock (int *m)
 Lock a device mutex.
__device__ int cuda_mutex_trylock (int *m)
 Attempt to lock a device mutex.
__device__ int cuda_mutex_lock_cond (int *m, int *c)
 Lock a device mutex with an additional condition.
__device__ void cuda_mutex_unlock (int *m)
 Unlock a device mutex.
__device__ int cuda_queue_gettask (struct queue_cuda *q)
 Get a task ID from the given queue.
__device__ void cuda_queue_puttask (struct queue_cuda *q, int tid)
 Put a task onto the given queue.
__noinline__ __device__ uint get_smid (void)
 Get the ID of the block's SM.
__device__ int runner_cuda_gettask (struct queue_cuda *q, int steal)
 Get a task from the given task queue.
__device__ void cuda_memcpy (void *dest, void *source, int count)
 Copy bulk memory in a strided way.
__device__ void cuda_memcpy_old (void *dest, void *source, int count)
__device__ void cuda_sum (float *a, float *b, int count)
 Sum two vectors in a strided way.
__device__ void cuda_sort_descending (unsigned int *a, int count)
 Sort the given data w.r.t. the lowest 16 bits in descending order.
__device__ void cuda_sort_ascending (unsigned int *a, int count)
 Sort the given data w.r.t. the lowest 16 bits in ascending order.
__device__ void potential_eval_cuda_tex (int pid, float r2, float *e, float *f)
 Evaluates the given potential at the given point (interpolated) using texture memory on the device.
__device__ void potential_eval4_cuda_tex (int4 pid, float4 r2, float4 *e, float4 *f)
 Evaluates the given potential at the given point (interpolated) using texture memory on the device.
__device__ void potential_eval_cuda (struct potential *p, float r2, float *e, float *f)
 Evaluates the given potential at the given point (interpolated).
__device__ void runner_dopair_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, float *pshift, float *epot_global)
 Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void runner_dopair4_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, float *pshift, float *epot_global)
 Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void runner_dopair_verlet_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global)
 Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void runner_dopair4_verlet_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global)
 Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void runner_dopair4_verlet_left_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global)
__device__ void runner_dopair4_verlet_right_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global)
__device__ void runner_dopair_sorted_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global)
 Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void runner_dopair4_sorted_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global)
 Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void runner_dopair4_sorted_left_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global)
 Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the left side (cid).
__device__ void runner_dopair4_sorted_right_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global)
 Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the right side (cjd).
__device__ void runner_doself_cuda (float4 *parts, int count, float *forces, float *epot_global)
 Compute the self interactions for the given cell on a CUDA device.
__device__ void runner_doself4_cuda (float4 *parts, int count, float *forces, float *epot_global)
 Compute the self interactions for the given cell on a CUDA device.
__device__ void runner_doself_diag_cuda (float4 *parts, int count, float *forces, float *epot_global)
 Compute the self interactions for the given cell on a CUDA device.
__device__ void runner_doself4_diag_cuda (float4 *parts, int count, float *forces, float *epot_global)
 Compute the self interactions for the given cell on a CUDA device.
__device__ void runner_doself4_diag_cuda_old (float4 *parts, int count, float *forces, float *epot_global)
int runner_bind (cudaArray *cuArray_coeffs, cudaArray *cuArray_pind, cudaArray *cuArray_diags)
 Bind textures to the given cuda Arrays.
int runner_parts_bind (cudaArray *cuArray_parts)
 Bind textures to the given cuda Arrays.
int runner_parts_unbind ()
 Unbind textures from the given cuda Arrays.
int engine_nonbond_cuda (struct engine *e)
 Offload and compute the nonbonded interactions on a CUDA device.
int engine_cuda_load_parts (struct engine *e)
 Load the cell data onto the CUDA device.
int engine_cuda_unload_parts (struct engine *e)
 Unload the cell data from the CUDA device.
int engine_cuda_queues_load (struct engine *e)
 Load the queues onto the CUDA device.
int engine_cuda_load (struct engine *e)
 Load the potentials and cell pairs onto the CUDA device.

Variables

__constant__ struct potential * potential_null_cuda = NULL
__constant__ int cuda_nr_pairs = 0
__device__ int cuda_pairs_done = 0
__constant__ int cuda_nr_tuples = 0
__constant__ int cuda_nr_cells = 0
__constant__ float4 * cuda_parts
__constant__ unsigned int * cuda_diags
__constant__ unsigned int * cuda_pind
__device__ int cuda_cell_mutex = 0
__device__ int cuda_barrier = 0
__constant__ struct cellpair_cuda * cuda_pairs
__device__ int * cuda_taboo
__device__ int cuda_pair_next = 0
__device__ struct queue_cuda cuda_queues [cuda_maxqueues]
__constant__ int cuda_nrqueues
__constant__ int cuda_queue_size
__constant__ float cuda_cutoff2 = 0.0f
__constant__ float cuda_cutoff = 0.0f
__constant__ float cuda_dscale = 0.0f
__constant__ float cuda_maxdist = 0.0f
__constant__ struct potential ** cuda_p
__constant__ int cuda_maxtype = 0
__constant__ struct potential * cuda_pots
__device__ unsigned int * cuda_sortlists = NULL
__device__ int * cuda_sortlists_ind
texture< float4, cudaTextureType2D > tex_coeffs
texture< float4, cudaTextureType2D > tex_parts
texture< int, cudaTextureType1D > tex_pind
texture< unsigned int, cudaTextureType1D > tex_diags
cudaArray * cuda_coeffs
__constant__ float cuda_eps [100]
__constant__ float cuda_rmin [100]
__device__ float cuda_fio [32]
__device__ int cuda_io [32]
__device__ int cuda_rcount = 0
__device__ float cuda_epot = 0.0f
__device__ float cuda_epot_out
__device__ float cuda_timers [tid_count]

Define Documentation

#define cuda_error (   id)    ( engine_err = errs_register( id , cudaGetErrorString(cudaGetLastError()) , __LINE__ , __FUNCTION__ , __FILE__ ) )
#define cuda_nrparts   32

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   64

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   96

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   128

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   160

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   192

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   224

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   256

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   288

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   320

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   352

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   384

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   416

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   448

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define cuda_nrparts   480

This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.

#define error (   id)    ( engine_err = errs_register( id , engine_err_msg[-(id)] , __LINE__ , __FUNCTION__ , __FILE__ ) )
#define FPTYPE_SINGLE   1

Function Documentation

__device__ void cuda_memcpy ( void *  dest,
void *  source,
int  count 
) [inline]

Copy bulk memory in a strided way.

Parameters:
dest: Pointer to destination memory.
source: Pointer to source memory.
count: Number of bytes to copy, must be a multiple of sizeof(int).
__device__ void cuda_memcpy_old ( void *  dest,
void *  source,
int  count 
) [inline]
__device__ void cuda_mutex_lock ( int *  m)

Lock a device mutex.

Parameters:
m: The mutex.

Loops until the mutex can be set. Note that only one thread can do this at a time, so to synchronize blocks, only a single thread of each block should call it.

__device__ int cuda_mutex_lock_cond ( int *  m,
int *  c 
)

Lock a device mutex with an additional condition.

Parameters:
m: The mutex.
c: The condition.
Returns:
1 if the mutex could be locked or zero if the condition c was reached first.

Loops until the mutex can be set or until *c is non-zero. Note that only one thread can do this at a time, so to synchronize blocks, only a single thread of each block should call it.

__device__ int cuda_mutex_trylock ( int *  m)

Attempt to lock a device mutex.

Parameters:
m: The mutex.

Try to grab the mutex. Note that only one thread can do this at a time, so to synchronize blocks, only a single thread of each block should call it.

__device__ void cuda_mutex_unlock ( int *  m)

Unlock a device mutex.

Parameters:
m: The mutex.

Does not check if the mutex had been locked.

__device__ int cuda_queue_gettask ( struct queue_cuda *  q)

Get a task ID from the given queue.

__device__ void cuda_queue_puttask ( struct queue_cuda *  q,
int  tid 
)

Put a task onto the given queue.

Parameters:
tid: The task ID to add to the end of the queue.
__device__ void cuda_sort_ascending ( unsigned int *  a,
int  count 
)

Sort the given data w.r.t. the lowest 16 bits in ascending order.

Parameters:
a: The array to sort.
count: The number of elements.
__device__ void cuda_sort_descending ( unsigned int *  a,
int  count 
)

Sort the given data w.r.t. the lowest 16 bits in descending order.

Parameters:
a: The array to sort.
count: The number of elements.
__device__ void cuda_sum ( float *  a,
float *  b,
int  count 
) [inline]

Sum two vectors in a strided way.

Parameters:
a: Pointer to destination memory.
b: Pointer to source memory.
count: Number of floats to sum.

Computes a[k] += b[k] for k=1..count.

int engine_cuda_load ( struct engine *  e)

Load the potentials and cell pairs onto the CUDA device.

Parameters:
e: The engine.
Returns:
engine_err_ok or < 0 on error (see engine_err).
int engine_cuda_load_parts ( struct engine *  e)

Load the cell data onto the CUDA device.

Parameters:
e: The engine.
Returns:
The maximum number of parts per cell or < 0 on error (see engine_err).
int engine_cuda_queues_load ( struct engine *  e)

Load the queues onto the CUDA device.

Parameters:
e: The engine.
Returns:
engine_err_ok or < 0 on error (see engine_err).
int engine_cuda_unload_parts ( struct engine *  e)

Unload the cell data from the CUDA device.

Parameters:
e: The engine.
Returns:
engine_err_ok or < 0 on error (see engine_err).
int engine_nonbond_cuda ( struct engine *  e)

Offload and compute the nonbonded interactions on a CUDA device.

Parameters:
e: The engine.
Returns:
engine_err_ok or < 0 on error (see engine_err).
__noinline__ __device__ uint get_smid ( void  )

Get the ID of the block's SM.

__device__ void potential_eval4_cuda_tex ( int4  pid,
float4  r2,
float4 *  e,
float4 *  f 
) [inline]

Evaluates the given potential at the given point (interpolated) using texture memory on the device.

Parameters:
pid: The index of the potential to be evaluated.
r2: The radius at which it is to be evaluated, squared.
e: Pointer to a floating-point value in which to store the interaction energy.
f: Pointer to a floating-point value in which to store the magnitude of the interaction force divided by r.

Note that for efficiency reasons, this function does not check if any of the parameters are NULL or if sqrt(r2) is within the interval of the potential p.

__device__ void potential_eval_cuda ( struct potential *  p,
float  r2,
float *  e,
float *  f 
) [inline]

Evaluates the given potential at the given point (interpolated).

Parameters:
p: The potential to be evaluated.
r2: The radius at which it is to be evaluated, squared.
e: Pointer to a floating-point value in which to store the interaction energy.
f: Pointer to a floating-point value in which to store the magnitude of the interaction force divided by r.

Note that for efficiency reasons, this function does not check if any of the parameters are NULL or if sqrt(r2) is within the interval of the potential p.

__device__ void potential_eval_cuda_tex ( int  pid,
float  r2,
float *  e,
float *  f 
) [inline]

Evaluates the given potential at the given point (interpolated) using texture memory on the device.

Parameters:
pid: The index of the potential to be evaluated.
r2: The radius at which it is to be evaluated, squared.
e: Pointer to a floating-point value in which to store the interaction energy.
f: Pointer to a floating-point value in which to store the magnitude of the interaction force divided by r.

Note that for efficiency reasons, this function does not check if any of the parameters are NULL or if sqrt(r2) is within the interval of the potential p.

int runner_bind ( cudaArray *  cuArray_coeffs,
cudaArray *  cuArray_pind,
cudaArray *  cuArray_diags 
)

Bind textures to the given cuda Arrays.

Hack to get around the fact that textures are static and can thus not be externalized.

__device__ int runner_cuda_gettask ( struct queue_cuda *  q,
int  steal 
)

Get a task from the given task queue.

Picks tasks from the queue sequentially and checks if they can be computed. If not, they are returned to the queue.

This routine blocks until a valid task is picked up, or the specified queue is empty.

__device__ void runner_dopair4_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
float *  pshift,
float *  epot_global 
)

Compute the pairwise interactions for the given pair on a CUDA device.

Parameters:
icid: Array of parts in the first cell.
count_i: Number of parts in the first cell.
icjd: Array of parts in the second cell.
count_j: Number of parts in the second cell.
pshift: A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j.
cid: Part buffer in local memory.
cjd: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_dopair4_sorted_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
unsigned int *  sort_i,
unsigned int *  sort_j,
float *  pshift,
float *  epot_global 
)

Compute the pairwise interactions for the given pair on a CUDA device.

Parameters:
icid: Array of parts in the first cell.
count_i: Number of parts in the first cell.
icjd: Array of parts in the second cell.
count_j: Number of parts in the second cell.
pshift: A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j.
cid: Part buffer in local memory.
cjd: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_dopair4_sorted_left_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
unsigned int *  sort_i,
unsigned int *  sort_j,
float *  pshift,
float *  epot_global 
)

Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the left side (cid).

Parameters:
icid: Array of parts in the first cell.
count_i: Number of parts in the first cell.
icjd: Array of parts in the second cell.
count_j: Number of parts in the second cell.
pshift: A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j.
cid: Part buffer in local memory.
cjd: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_dopair4_sorted_right_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
unsigned int *  sort_i,
unsigned int *  sort_j,
float *  pshift,
float *  epot_global 
)

Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the right side (cjd).

Parameters:
icid: Array of parts in the first cell.
count_i: Number of parts in the first cell.
icjd: Array of parts in the second cell.
count_j: Number of parts in the second cell.
pshift: A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j.
cid: Part buffer in local memory.
cjd: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_dopair4_verlet_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
unsigned int *  sort_i,
unsigned int *  sort_j,
float *  pshift,
int  verlet_rebuild,
unsigned int *  sortlist,
float *  epot_global 
)

Compute the pairwise interactions for the given pair on a CUDA device.

Parameters:
icid: Array of parts in the first cell.
count_i: Number of parts in the first cell.
icjd: Array of parts in the second cell.
count_j: Number of parts in the second cell.
pshift: A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j.
cid: Part buffer in local memory.
cjd: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_dopair4_verlet_left_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
unsigned int *  sort_i,
unsigned int *  sort_j,
float *  pshift,
int  verlet_rebuild,
unsigned int *  sortlist,
float *  epot_global 
)
__device__ void runner_dopair4_verlet_right_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
unsigned int *  sort_i,
unsigned int *  sort_j,
float *  pshift,
int  verlet_rebuild,
unsigned int *  sortlist,
float *  epot_global 
)
__device__ void runner_dopair_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
float *  pshift,
float *  epot_global 
)

Compute the pairwise interactions for the given pair on a CUDA device.

Parameters:
icid: Array of parts in the first cell.
count_i: Number of parts in the first cell.
icjd: Array of parts in the second cell.
count_j: Number of parts in the second cell.
pshift: A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j.
cid: Part buffer in local memory.
cjd: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_dopair_sorted_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
unsigned int *  sort_i,
unsigned int *  sort_j,
float *  pshift,
float *  epot_global 
)

Compute the pairwise interactions for the given pair on a CUDA device.

Parameters:
icid: Array of parts in the first cell.
count_i: Number of parts in the first cell.
icjd: Array of parts in the second cell.
count_j: Number of parts in the second cell.
pshift: A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j.
cid: Part buffer in local memory.
cjd: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_dopair_verlet_cuda ( float4 *  parts_i,
int  count_i,
float4 *  parts_j,
int  count_j,
float *  forces_i,
float *  forces_j,
unsigned int *  sort_i,
unsigned int *  sort_j,
float *  pshift,
int  verlet_rebuild,
unsigned int *  sortlist,
float *  epot_global 
)

Compute the pairwise interactions for the given pair on a CUDA device.

Parameters:
icid: Array of parts in the first cell.
count_i: Number of parts in the first cell.
icjd: Array of parts in the second cell.
count_j: Number of parts in the second cell.
pshift: A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j.
cid: Part buffer in local memory.
cjd: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_doself4_cuda ( float4 *  parts,
int  count,
float *  forces,
float *  epot_global 
)

Compute the self interactions for the given cell on a CUDA device.

Parameters:
iparts: Array of parts in this cell.
count: Number of parts in the cell.
parts: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_doself4_diag_cuda ( float4 *  parts,
int  count,
float *  forces,
float *  epot_global 
)

Compute the self interactions for the given cell on a CUDA device.

Parameters:
iparts: Array of parts in this cell.
count: Number of parts in the cell.
parts: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_doself4_diag_cuda_old ( float4 *  parts,
int  count,
float *  forces,
float *  epot_global 
)
__device__ void runner_doself_cuda ( float4 *  parts,
int  count,
float *  forces,
float *  epot_global 
)

Compute the self interactions for the given cell on a CUDA device.

Parameters:
iparts: Array of parts in this cell.
count: Number of parts in the cell.
parts: Part buffer in local memory.
See also:
runner_dopair.
__device__ void runner_doself_diag_cuda ( float4 *  parts,
int  count,
float *  forces,
float *  epot_global 
)

Compute the self interactions for the given cell on a CUDA device.

Parameters:
iparts: Array of parts in this cell.
count: Number of parts in the cell.
parts: Part buffer in local memory.
See also:
runner_dopair.
int runner_parts_bind ( cudaArray *  cuArray_parts)

Bind textures to the given cuda Arrays.

Hack to get around the fact that textures are static and can thus not be externalized.

Bind textures to the given cuda Arrays.

Hack to get around the fact that textures are static and can thus not be externalized.


Variable Documentation

__device__ int cuda_barrier = 0
__device__ int cuda_cell_mutex = 0
cudaArray* cuda_coeffs
__constant__ float cuda_cutoff = 0.0f
__constant__ float cuda_cutoff2 = 0.0f
__constant__ unsigned int* cuda_diags
__constant__ float cuda_dscale = 0.0f
__device__ float cuda_epot = 0.0f
__device__ float cuda_epot_out
__constant__ float cuda_eps[100]
__device__ float cuda_fio[32]
__device__ int cuda_io[32]
__constant__ float cuda_maxdist = 0.0f
__constant__ int cuda_maxtype = 0
__constant__ int cuda_nr_cells = 0
__constant__ int cuda_nr_pairs = 0
__constant__ int cuda_nr_tuples = 0
__constant__ int cuda_nrqueues
__constant__ struct potential** cuda_p
__device__ int cuda_pair_next = 0
__constant__ struct cellpair_cuda* cuda_pairs
__device__ int cuda_pairs_done = 0
__constant__ float4* cuda_parts
__constant__ unsigned int* cuda_pind
__constant__ struct potential* cuda_pots
__constant__ int cuda_queue_size
__device__ int cuda_rcount = 0
__constant__ float cuda_rmin[100]
__device__ unsigned int* cuda_sortlists = NULL
__device__ int* cuda_sortlists_ind
__device__ int* cuda_taboo
__device__ float cuda_timers[tid_count]
__constant__ struct potential* potential_null_cuda = NULL
texture< float4 , cudaTextureType2D > tex_coeffs
texture< unsigned int , cudaTextureType1D > tex_diags
texture< float4 , cudaTextureType2D > tex_parts
texture< int , cudaTextureType1D > tex_pind
 All Data Structures Files Functions Variables Typedefs Enumerator Defines