#include "../config.h"
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include <limits.h>
#include "cutil_math.h"
#include <mpi.h>
#include "cycle.h"
#include "errs.h"
#include "fptype.h"
#include "lock.h"
#include "part.h"
#include "cell.h"
#include "fifo.h"
#include "space.h"
#include "potential.h"
#include "engine.h"
#include "runner.h"
#include "runner_cuda.h"
#include "runner_cuda_main.h"

Defines
#define	FPTYPE_SINGLE 1
#define	error(id) ( engine_err = errs_register( id , engine_err_msg[-(id)] , __LINE__ , __FUNCTION__ , __FILE__ ) )
#define	cuda_error(id) ( engine_err = errs_register( id , cudaGetErrorString(cudaGetLastError()) , __LINE__ , __FUNCTION__ , __FILE__ ) )
#define	cuda_nrparts 32
#define	cuda_nrparts 64
#define	cuda_nrparts 96
#define	cuda_nrparts 128
#define	cuda_nrparts 160
#define	cuda_nrparts 192
#define	cuda_nrparts 224
#define	cuda_nrparts 256
#define	cuda_nrparts 288
#define	cuda_nrparts 320
#define	cuda_nrparts 352
#define	cuda_nrparts 384
#define	cuda_nrparts 416
#define	cuda_nrparts 448
#define	cuda_nrparts 480
Functions
__device__ void	cuda_mutex_lock (int *m)
	Lock a device mutex.
__device__ int	cuda_mutex_trylock (int *m)
	Attempt to lock a device mutex.
__device__ int	cuda_mutex_lock_cond (int *m, int *c)
	Lock a device mutex with an additional condition.
__device__ void	cuda_mutex_unlock (int *m)
	Unlock a device mutex.
__device__ int	cuda_queue_gettask (struct queue_cuda *q)
	Get a task ID from the given queue.
__device__ void	cuda_queue_puttask (struct queue_cuda *q, int tid)
	Put a task onto the given queue.
__noinline__ __device__ uint	get_smid (void)
	Get the ID of the block's SM.
__device__ int	runner_cuda_gettask (struct queue_cuda *q, int steal)
	Get a task from the given task queue.
__device__ void	cuda_memcpy (void *dest, void *source, int count)
	Copy bulk memory in a strided way.
__device__ void	cuda_memcpy_old (void *dest, void *source, int count)
__device__ void	cuda_sum (float *a, float *b, int count)
	Sum two vectors in a strided way.
__device__ void	cuda_sort_descending (unsigned int *a, int count)
	Sort the given data w.r.t. the lowest 16 bits in descending order.
__device__ void	cuda_sort_ascending (unsigned int *a, int count)
	Sort the given data w.r.t. the lowest 16 bits in ascending order.
__device__ void	potential_eval_cuda_tex (int pid, float r2, float *e, float *f)
	Evaluates the given potential at the given point (interpolated) using texture memory on the device.
__device__ void	potential_eval4_cuda_tex (int4 pid, float4 r2, float4 *e, float4 *f)
	Evaluates the given potential at the given point (interpolated) using texture memory on the device.
__device__ void	potential_eval_cuda (struct potential *p, float r2, float *e, float *f)
	Evaluates the given potential at the given point (interpolated).
__device__ void	runner_dopair_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, float *pshift, float *epot_global)
	Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void	runner_dopair4_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, float *pshift, float *epot_global)
	Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void	runner_dopair_verlet_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global)
	Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void	runner_dopair4_verlet_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global)
	Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void	runner_dopair4_verlet_left_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global)
__device__ void	runner_dopair4_verlet_right_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global)
__device__ void	runner_dopair_sorted_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global)
	Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void	runner_dopair4_sorted_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global)
	Compute the pairwise interactions for the given pair on a CUDA device.
__device__ void	runner_dopair4_sorted_left_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global)
	Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the left side (cid).
__device__ void	runner_dopair4_sorted_right_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global)
	Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the right side (cjd).
__device__ void	runner_doself_cuda (float4 *parts, int count, float *forces, float *epot_global)
	Compute the self interactions for the given cell on a CUDA device.
__device__ void	runner_doself4_cuda (float4 *parts, int count, float *forces, float *epot_global)
	Compute the self interactions for the given cell on a CUDA device.
__device__ void	runner_doself_diag_cuda (float4 *parts, int count, float *forces, float *epot_global)
	Compute the self interactions for the given cell on a CUDA device.
__device__ void	runner_doself4_diag_cuda (float4 *parts, int count, float *forces, float *epot_global)
	Compute the self interactions for the given cell on a CUDA device.
__device__ void	runner_doself4_diag_cuda_old (float4 *parts, int count, float *forces, float *epot_global)
int	runner_bind (cudaArray *cuArray_coeffs, cudaArray *cuArray_pind, cudaArray *cuArray_diags)
	Bind textures to the given cuda Arrays.
int	runner_parts_bind (cudaArray *cuArray_parts)
	Bind textures to the given cuda Arrays.
int	runner_parts_unbind ()
	Unbind textures from the cuda Arrays.
int	engine_nonbond_cuda (struct engine *e)
	Offload and compute the nonbonded interactions on a CUDA device.
int	engine_cuda_load_parts (struct engine *e)
	Load the cell data onto the CUDA device.
int	engine_cuda_unload_parts (struct engine *e)
	Unload the cell data from the CUDA device.
int	engine_cuda_queues_load (struct engine *e)
	Load the queues onto the CUDA device.
int	engine_cuda_load (struct engine *e)
	Load the potentials and cell pairs onto the CUDA device.
Variables
__constant__ struct potential *	potential_null_cuda = NULL
__constant__ int	cuda_nr_pairs = 0
__device__ int	cuda_pairs_done = 0
__constant__ int	cuda_nr_tuples = 0
__constant__ int	cuda_nr_cells = 0
__constant__ float4 *	cuda_parts
__constant__ unsigned int *	cuda_diags
__constant__ unsigned int *	cuda_pind
__device__ int	cuda_cell_mutex = 0
__device__ int	cuda_barrier = 0
__constant__ struct cellpair_cuda *	cuda_pairs
__device__ int *	cuda_taboo
__device__ int	cuda_pair_next = 0
__device__ struct queue_cuda	cuda_queues [cuda_maxqueues]
__constant__ int	cuda_nrqueues
__constant__ int	cuda_queue_size
__constant__ float	cuda_cutoff2 = 0.0f
__constant__ float	cuda_cutoff = 0.0f
__constant__ float	cuda_dscale = 0.0f
__constant__ float	cuda_maxdist = 0.0f
__constant__ struct potential **	cuda_p
__constant__ int	cuda_maxtype = 0
__constant__ struct potential *	cuda_pots
__device__ unsigned int *	cuda_sortlists = NULL
__device__ int *	cuda_sortlists_ind
texture< float4, cudaTextureType2D >	tex_coeffs
texture< float4, cudaTextureType2D >	tex_parts
texture< int, cudaTextureType1D >	tex_pind
texture< unsigned int, cudaTextureType1D >	tex_diags
cudaArray *	cuda_coeffs
__constant__ float	cuda_eps [100]
__constant__ float	cuda_rmin [100]
__device__ float	cuda_fio [32]
__device__ int	cuda_io [32]
__device__ int	cuda_rcount = 0
__device__ float	cuda_epot = 0.0f
__device__ float	cuda_epot_out
__device__ float	cuda_timers [tid_count]

Define Documentation

#define cuda_error ( id ) ( engine_err = errs_register( id , cudaGetErrorString(cudaGetLastError()) , __LINE__ , __FUNCTION__ , __FILE__ ) )

#define cuda_nrparts 32