mdcore
0.1.5
|
#include "../config.h"
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include <limits.h>
#include "cutil_math.h"
#include <mpi.h>
#include "cycle.h"
#include "errs.h"
#include "fptype.h"
#include "lock.h"
#include "part.h"
#include "cell.h"
#include "fifo.h"
#include "space.h"
#include "potential.h"
#include "engine.h"
#include "runner.h"
#include "runner_cuda.h"
#include "runner_cuda_main.h"
Defines | |
#define | FPTYPE_SINGLE 1 |
#define | error(id) ( engine_err = errs_register( id , engine_err_msg[-(id)] , __LINE__ , __FUNCTION__ , __FILE__ ) ) |
#define | cuda_error(id) ( engine_err = errs_register( id , cudaGetErrorString(cudaGetLastError()) , __LINE__ , __FUNCTION__ , __FILE__ ) ) |
#define | cuda_nrparts 32 |
#define | cuda_nrparts 64 |
#define | cuda_nrparts 96 |
#define | cuda_nrparts 128 |
#define | cuda_nrparts 160 |
#define | cuda_nrparts 192 |
#define | cuda_nrparts 224 |
#define | cuda_nrparts 256 |
#define | cuda_nrparts 288 |
#define | cuda_nrparts 320 |
#define | cuda_nrparts 352 |
#define | cuda_nrparts 384 |
#define | cuda_nrparts 416 |
#define | cuda_nrparts 448 |
#define | cuda_nrparts 480 |
Functions | |
__device__ void | cuda_mutex_lock (int *m) |
Lock a device mutex. | |
__device__ int | cuda_mutex_trylock (int *m) |
Attempt to lock a device mutex. | |
__device__ int | cuda_mutex_lock_cond (int *m, int *c) |
Lock a device mutex with an additional condition. | |
__device__ void | cuda_mutex_unlock (int *m) |
Unlock a device mutex. | |
__device__ int | cuda_queue_gettask (struct queue_cuda *q) |
Get a task ID from the given queue. | |
__device__ void | cuda_queue_puttask (struct queue_cuda *q, int tid) |
Put a task onto the given queue. | |
__noinline__ __device__ uint | get_smid (void) |
Get the ID of the block's SM. | |
__device__ int | runner_cuda_gettask (struct queue_cuda *q, int steal) |
Get a task from the given task queue. | |
__device__ void | cuda_memcpy (void *dest, void *source, int count) |
Copy bulk memory in a strided way. | |
__device__ void | cuda_memcpy_old (void *dest, void *source, int count) |
__device__ void | cuda_sum (float *a, float *b, int count) |
Sum two vectors in a strided way. | |
__device__ void | cuda_sort_descending (unsigned int *a, int count) |
Sort the given data w.r.t. the lowest 16 bits in descending order. | |
__device__ void | cuda_sort_ascending (unsigned int *a, int count) |
Sort the given data w.r.t. the lowest 16 bits in ascending order. | |
__device__ void | potential_eval_cuda_tex (int pid, float r2, float *e, float *f) |
Evaluates the given potential at the given point (interpolated) using texture memory on the device. | |
__device__ void | potential_eval4_cuda_tex (int4 pid, float4 r2, float4 *e, float4 *f) |
Evaluates the given potential at the given point (interpolated) using texture memory on the device. | |
__device__ void | potential_eval_cuda (struct potential *p, float r2, float *e, float *f) |
Evaluates the given potential at the given point (interpolated). | |
__device__ void | runner_dopair_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, float *pshift, float *epot_global) |
Compute the pairwise interactions for the given pair on a CUDA device. | |
__device__ void | runner_dopair4_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, float *pshift, float *epot_global) |
Compute the pairwise interactions for the given pair on a CUDA device. | |
__device__ void | runner_dopair_verlet_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global) |
Compute the pairwise interactions for the given pair on a CUDA device. | |
__device__ void | runner_dopair4_verlet_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global) |
Compute the pairwise interactions for the given pair on a CUDA device. | |
__device__ void | runner_dopair4_verlet_left_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global) |
__device__ void | runner_dopair4_verlet_right_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, int verlet_rebuild, unsigned int *sortlist, float *epot_global) |
__device__ void | runner_dopair_sorted_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global) |
Compute the pairwise interactions for the given pair on a CUDA device. | |
__device__ void | runner_dopair4_sorted_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global) |
Compute the pairwise interactions for the given pair on a CUDA device. | |
__device__ void | runner_dopair4_sorted_left_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global) |
Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the left side (cid). | |
__device__ void | runner_dopair4_sorted_right_cuda (float4 *parts_i, int count_i, float4 *parts_j, int count_j, float *forces_i, float *forces_j, unsigned int *sort_i, unsigned int *sort_j, float *pshift, float *epot_global) |
Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the right side (cjd). | |
__device__ void | runner_doself_cuda (float4 *parts, int count, float *forces, float *epot_global) |
Compute the self interactions for the given cell on a CUDA device. | |
__device__ void | runner_doself4_cuda (float4 *parts, int count, float *forces, float *epot_global) |
Compute the self interactions for the given cell on a CUDA device. | |
__device__ void | runner_doself_diag_cuda (float4 *parts, int count, float *forces, float *epot_global) |
Compute the self interactions for the given cell on a CUDA device. | |
__device__ void | runner_doself4_diag_cuda (float4 *parts, int count, float *forces, float *epot_global) |
Compute the self interactions for the given cell on a CUDA device. | |
__device__ void | runner_doself4_diag_cuda_old (float4 *parts, int count, float *forces, float *epot_global) |
int | runner_bind (cudaArray *cuArray_coeffs, cudaArray *cuArray_pind, cudaArray *cuArray_diags) |
Bind textures to the given cuda Arrays. | |
int | runner_parts_bind (cudaArray *cuArray_parts) |
Bind textures to the given cuda Arrays. | |
int | runner_parts_unbind () |
Unbind textures (counterpart to runner_parts_bind). | |
int | engine_nonbond_cuda (struct engine *e) |
Offload and compute the nonbonded interactions on a CUDA device. | |
int | engine_cuda_load_parts (struct engine *e) |
Load the cell data onto the CUDA device. | |
int | engine_cuda_unload_parts (struct engine *e) |
Load the cell data from the CUDA device. | |
int | engine_cuda_queues_load (struct engine *e) |
Load the queues onto the CUDA device. | |
int | engine_cuda_load (struct engine *e) |
Load the potentials and cell pairs onto the CUDA device. | |
Variables | |
__constant__ struct potential * | potential_null_cuda = NULL |
__constant__ int | cuda_nr_pairs = 0 |
__device__ int | cuda_pairs_done = 0 |
__constant__ int | cuda_nr_tuples = 0 |
__constant__ int | cuda_nr_cells = 0 |
__constant__ float4 * | cuda_parts |
__constant__ unsigned int * | cuda_diags |
__constant__ unsigned int * | cuda_pind |
__device__ int | cuda_cell_mutex = 0 |
__device__ int | cuda_barrier = 0 |
__constant__ struct cellpair_cuda * | cuda_pairs |
__device__ int * | cuda_taboo |
__device__ int | cuda_pair_next = 0 |
__device__ struct queue_cuda | cuda_queues [cuda_maxqueues] |
__constant__ int | cuda_nrqueues |
__constant__ int | cuda_queue_size |
__constant__ float | cuda_cutoff2 = 0.0f |
__constant__ float | cuda_cutoff = 0.0f |
__constant__ float | cuda_dscale = 0.0f |
__constant__ float | cuda_maxdist = 0.0f |
__constant__ struct potential ** | cuda_p |
__constant__ int | cuda_maxtype = 0 |
__constant__ struct potential * | cuda_pots |
__device__ unsigned int * | cuda_sortlists = NULL |
__device__ int * | cuda_sortlists_ind |
texture< float4, cudaTextureType2D > | tex_coeffs |
texture< float4, cudaTextureType2D > | tex_parts |
texture< int, cudaTextureType1D > | tex_pind |
texture< unsigned int, cudaTextureType1D > | tex_diags |
cudaArray * | cuda_coeffs |
__constant__ float | cuda_eps [100] |
__constant__ float | cuda_rmin [100] |
__device__ float | cuda_fio [32] |
__device__ int | cuda_io [32] |
__device__ int | cuda_rcount = 0 |
__device__ float | cuda_epot = 0.0f |
__device__ float | cuda_epot_out |
__device__ float | cuda_timers [tid_count] |
#define cuda_error | ( | id | ) | ( engine_err = errs_register( id , cudaGetErrorString(cudaGetLastError()) , __LINE__ , __FUNCTION__ , __FILE__ ) ) |
#define cuda_nrparts 32 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 64 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 96 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 128 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 160 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 192 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 224 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 256 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 288 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 320 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 352 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 384 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 416 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 448 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define cuda_nrparts 480 |
This set of defines and includes produces kernels with buffers for multiples of 32 particles up to 512 cuda_maxparts.
#define error | ( | id | ) | ( engine_err = errs_register( id , engine_err_msg[-(id)] , __LINE__ , __FUNCTION__ , __FILE__ ) ) |
#define FPTYPE_SINGLE 1 |
__device__ void cuda_memcpy | ( | void * | dest, |
void * | source, | ||
int | count | ||
) | [inline] |
Copy bulk memory in a strided way.
dest | Pointer to destination memory. |
source | Pointer to source memory. |
count | Number of bytes to copy, must be a multiple of sizeof(int). |
__device__ void cuda_memcpy_old | ( | void * | dest, |
void * | source, | ||
int | count | ||
) | [inline] |
__device__ void cuda_mutex_lock | ( | int * | m | ) |
Lock a device mutex.
m | The mutex. |
Loops until the mutex can be set. Note that only one thread can do this at a time, so to synchronize blocks, only a single thread of each block should call it.
__device__ int cuda_mutex_lock_cond | ( | int * | m, |
int * | c | ||
) |
Lock a device mutex with an additional condition.
m | The mutex. |
c | the condition |
Returns 1 if the mutex could be locked or zero if the condition c was reached first.
Loops until the mutex can be set or until *c is non-zero. Note that only one thread can do this at a time, so to synchronize blocks, only a single thread of each block should call it.
__device__ int cuda_mutex_trylock | ( | int * | m | ) |
Attempt to lock a device mutex.
m | The mutex. |
Try to grab the mutex. Note that only one thread can do this at a time, so to synchronize blocks, only a single thread of each block should call it.
__device__ void cuda_mutex_unlock | ( | int * | m | ) |
Unlock a device mutex.
m | The mutex. |
Does not check if the mutex had been locked.
__device__ int cuda_queue_gettask | ( | struct queue_cuda * | q | ) |
Get a task ID from the given queue.
__device__ void cuda_queue_puttask | ( | struct queue_cuda * | q, |
int | tid | ||
) |
Put a task onto the given queue.
tid | The task ID to add to the end of the queue. |
__device__ void cuda_sort_ascending | ( | unsigned int * | a, |
int | count | ||
) |
Sort the given data w.r.t. the lowest 16 bits in ascending order.
a | The array to sort. |
count | The number of elements. |
__device__ void cuda_sort_descending | ( | unsigned int * | a, |
int | count | ||
) |
Sort the given data w.r.t. the lowest 16 bits in descending order.
a | The array to sort. |
count | The number of elements. |
__device__ void cuda_sum | ( | float * | a, |
float * | b, | ||
int | count | ||
) | [inline] |
Sum two vectors in a strided way.
a | Pointer to destination memory. |
b | Pointer to source memory. |
count | Number of floats to sum. |
Computes a[k] += b[k] for k=1..count.
int engine_cuda_load | ( | struct engine * | e | ) |
Load the potentials and cell pairs onto the CUDA device.
e | The engine. |
int engine_cuda_load_parts | ( | struct engine * | e | ) |
Load the cell data onto the CUDA device.
e | The engine. |
int engine_cuda_queues_load | ( | struct engine * | e | ) |
Load the queues onto the CUDA device.
e | The engine. |
int engine_cuda_unload_parts | ( | struct engine * | e | ) |
Load the cell data from the CUDA device.
e | The engine. |
int engine_nonbond_cuda | ( | struct engine * | e | ) |
Offload and compute the nonbonded interactions on a CUDA device.
e | The engine. |
__device__ void potential_eval4_cuda_tex | ( | int4 | pid, |
float4 | r2, | ||
float4 * | e, | ||
float4 * | f | ||
) | [inline] |
Evaluates the given potential at the given point (interpolated) using texture memory on the device.
pid | The index of the potential to be evaluated. |
r2 | The radius at which it is to be evaluated, squared. |
e | Pointer to a floating-point value in which to store the interaction energy. |
f | Pointer to a floating-point value in which to store the magnitude of the interaction force divided by r. |
Note that for efficiency reasons, this function does not check if any of the parameters are NULL or if sqrt(r2) is within the interval of the potential p.
__device__ void potential_eval_cuda | ( | struct potential * | p, |
float | r2, | ||
float * | e, | ||
float * | f | ||
) | [inline] |
Evaluates the given potential at the given point (interpolated).
p | The potential to be evaluated. |
r2 | The radius at which it is to be evaluated, squared. |
e | Pointer to a floating-point value in which to store the interaction energy. |
f | Pointer to a floating-point value in which to store the magnitude of the interaction force divided by r. |
Note that for efficiency reasons, this function does not check if any of the parameters are NULL or if sqrt(r2) is within the interval of the potential p.
__device__ void potential_eval_cuda_tex | ( | int | pid, |
float | r2, | ||
float * | e, | ||
float * | f | ||
) | [inline] |
Evaluates the given potential at the given point (interpolated) using texture memory on the device.
pid | The index of the potential to be evaluated. |
r2 | The radius at which it is to be evaluated, squared. |
e | Pointer to a floating-point value in which to store the interaction energy. |
f | Pointer to a floating-point value in which to store the magnitude of the interaction force divided by r. |
Note that for efficiency reasons, this function does not check if any of the parameters are NULL or if sqrt(r2) is within the interval of the potential p.
int runner_bind | ( | cudaArray * | cuArray_coeffs, |
cudaArray * | cuArray_pind, | ||
cudaArray * | cuArray_diags | ||
) |
Bind textures to the given cuda Arrays.
Hack to get around the fact that textures are static and can thus not be externalized.
__device__ int runner_cuda_gettask | ( | struct queue_cuda * | q, |
int | steal | ||
) |
Get a task from the given task queue.
Picks tasks from the queue sequentially and checks if they can be computed. If not, they are returned to the queue.
This routine blocks until a valid task is picked up, or the specified queue is empty.
__device__ void runner_dopair4_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
float * | pshift, | ||
float * | epot_global | ||
) |
Compute the pairwise interactions for the given pair on a CUDA device.
icid | Array of parts in the first cell. |
count_i | Number of parts in the first cell. |
icjd | Array of parts in the second cell. |
count_j | Number of parts in the second cell. |
pshift | A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j . |
cid | Part buffer in local memory. |
cjd | Part buffer in local memory. |
__device__ void runner_dopair4_sorted_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
unsigned int * | sort_i, | ||
unsigned int * | sort_j, | ||
float * | pshift, | ||
float * | epot_global | ||
) |
Compute the pairwise interactions for the given pair on a CUDA device.
icid | Array of parts in the first cell. |
count_i | Number of parts in the first cell. |
icjd | Array of parts in the second cell. |
count_j | Number of parts in the second cell. |
pshift | A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j . |
cid | Part buffer in local memory. |
cjd | Part buffer in local memory. |
__device__ void runner_dopair4_sorted_left_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
unsigned int * | sort_i, | ||
unsigned int * | sort_j, | ||
float * | pshift, | ||
float * | epot_global | ||
) |
Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the left side (cid).
icid | Array of parts in the first cell. |
count_i | Number of parts in the first cell. |
icjd | Array of parts in the second cell. |
count_j | Number of parts in the second cell. |
pshift | A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j . |
cid | Part buffer in local memory. |
cjd | Part buffer in local memory. |
__device__ void runner_dopair4_sorted_right_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
unsigned int * | sort_i, | ||
unsigned int * | sort_j, | ||
float * | pshift, | ||
float * | epot_global | ||
) |
Compute the pairwise interactions for the given pair on a CUDA device, and store only interactions on the right side (cjd).
icid | Array of parts in the first cell. |
count_i | Number of parts in the first cell. |
icjd | Array of parts in the second cell. |
count_j | Number of parts in the second cell. |
pshift | A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j . |
cid | Part buffer in local memory. |
cjd | Part buffer in local memory. |
__device__ void runner_dopair4_verlet_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
unsigned int * | sort_i, | ||
unsigned int * | sort_j, | ||
float * | pshift, | ||
int | verlet_rebuild, | ||
unsigned int * | sortlist, | ||
float * | epot_global | ||
) |
Compute the pairwise interactions for the given pair on a CUDA device.
icid | Array of parts in the first cell. |
count_i | Number of parts in the first cell. |
icjd | Array of parts in the second cell. |
count_j | Number of parts in the second cell. |
pshift | A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j . |
cid | Part buffer in local memory. |
cjd | Part buffer in local memory. |
__device__ void runner_dopair4_verlet_left_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
unsigned int * | sort_i, | ||
unsigned int * | sort_j, | ||
float * | pshift, | ||
int | verlet_rebuild, | ||
unsigned int * | sortlist, | ||
float * | epot_global | ||
) |
__device__ void runner_dopair4_verlet_right_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
unsigned int * | sort_i, | ||
unsigned int * | sort_j, | ||
float * | pshift, | ||
int | verlet_rebuild, | ||
unsigned int * | sortlist, | ||
float * | epot_global | ||
) |
__device__ void runner_dopair_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
float * | pshift, | ||
float * | epot_global | ||
) |
Compute the pairwise interactions for the given pair on a CUDA device.
icid | Array of parts in the first cell. |
count_i | Number of parts in the first cell. |
icjd | Array of parts in the second cell. |
count_j | Number of parts in the second cell. |
pshift | A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j . |
cid | Part buffer in local memory. |
cjd | Part buffer in local memory. |
__device__ void runner_dopair_sorted_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
unsigned int * | sort_i, | ||
unsigned int * | sort_j, | ||
float * | pshift, | ||
float * | epot_global | ||
) |
Compute the pairwise interactions for the given pair on a CUDA device.
icid | Array of parts in the first cell. |
count_i | Number of parts in the first cell. |
icjd | Array of parts in the second cell. |
count_j | Number of parts in the second cell. |
pshift | A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j . |
cid | Part buffer in local memory. |
cjd | Part buffer in local memory. |
__device__ void runner_dopair_verlet_cuda | ( | float4 * | parts_i, |
int | count_i, | ||
float4 * | parts_j, | ||
int | count_j, | ||
float * | forces_i, | ||
float * | forces_j, | ||
unsigned int * | sort_i, | ||
unsigned int * | sort_j, | ||
float * | pshift, | ||
int | verlet_rebuild, | ||
unsigned int * | sortlist, | ||
float * | epot_global | ||
) |
Compute the pairwise interactions for the given pair on a CUDA device.
icid | Array of parts in the first cell. |
count_i | Number of parts in the first cell. |
icjd | Array of parts in the second cell. |
count_j | Number of parts in the second cell. |
pshift | A pointer to an array of three floating point values containing the vector separating the centers of cell_i and cell_j . |
cid | Part buffer in local memory. |
cjd | Part buffer in local memory. |
__device__ void runner_doself4_cuda | ( | float4 * | parts, |
int | count, | ||
float * | forces, | ||
float * | epot_global | ||
) |
Compute the self interactions for the given cell on a CUDA device.
iparts | Array of parts in this cell. |
count | Number of parts in the cell. |
parts | Part buffer in local memory. |
__device__ void runner_doself4_diag_cuda | ( | float4 * | parts, |
int | count, | ||
float * | forces, | ||
float * | epot_global | ||
) |
Compute the self interactions for the given cell on a CUDA device.
iparts | Array of parts in this cell. |
count | Number of parts in the cell. |
parts | Part buffer in local memory. |
__device__ void runner_doself4_diag_cuda_old | ( | float4 * | parts, |
int | count, | ||
float * | forces, | ||
float * | epot_global | ||
) |
__device__ void runner_doself_cuda | ( | float4 * | parts, |
int | count, | ||
float * | forces, | ||
float * | epot_global | ||
) |
Compute the self interactions for the given cell on a CUDA device.
iparts | Array of parts in this cell. |
count | Number of parts in the cell. |
parts | Part buffer in local memory. |
__device__ void runner_doself_diag_cuda | ( | float4 * | parts, |
int | count, | ||
float * | forces, | ||
float * | epot_global | ||
) |
Compute the self interactions for the given cell on a CUDA device.
iparts | Array of parts in this cell. |
count | Number of parts in the cell. |
parts | Part buffer in local memory. |
int runner_parts_bind | ( | cudaArray * | cuArray_parts | ) |
Bind textures to the given cuda Arrays.
Hack to get around the fact that textures are static and can thus not be externalized.
int runner_parts_unbind | ( | ) |
Unbind textures (counterpart to runner_parts_bind).
Hack to get around the fact that textures are static and can thus not be externalized.
__device__ int cuda_barrier = 0 |
__device__ int cuda_cell_mutex = 0 |
cudaArray* cuda_coeffs |
__constant__ float cuda_cutoff = 0.0f |
__constant__ float cuda_cutoff2 = 0.0f |
__constant__ unsigned int* cuda_diags |
__constant__ float cuda_dscale = 0.0f |
__device__ float cuda_epot = 0.0f |
__device__ float cuda_epot_out |
__constant__ float cuda_eps[100] |
__device__ float cuda_fio[32] |
__device__ int cuda_io[32] |
__constant__ float cuda_maxdist = 0.0f |
__constant__ int cuda_maxtype = 0 |
__constant__ int cuda_nr_cells = 0 |
__constant__ int cuda_nr_pairs = 0 |
__constant__ int cuda_nr_tuples = 0 |
__constant__ int cuda_nrqueues |
__device__ int cuda_pair_next = 0 |
__constant__ struct cellpair_cuda* cuda_pairs |
__device__ int cuda_pairs_done = 0 |
__constant__ float4* cuda_parts |
__constant__ unsigned int* cuda_pind |
__constant__ int cuda_queue_size |
__device__ struct queue_cuda cuda_queues[cuda_maxqueues] |
__device__ int cuda_rcount = 0 |
__constant__ float cuda_rmin[100] |
__device__ unsigned int* cuda_sortlists = NULL |
__device__ int* cuda_sortlists_ind |
__device__ int* cuda_taboo |
__device__ float cuda_timers[tid_count] |
__constant__ struct potential* potential_null_cuda = NULL |
texture< float4 , cudaTextureType2D > tex_coeffs |
texture< unsigned int , cudaTextureType1D > tex_diags |
texture< float4 , cudaTextureType2D > tex_parts |
texture< int , cudaTextureType1D > tex_pind |