mdcore
0.1.5
|
/*******************************************************************************
 * This file is part of mdcore.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/


/* Set the kernel names depending on cuda_nrparts, so that several
   specializations (one per maximum cell size) can coexist. */
#define PASTE(x,y) x ## _ ## y
#define runner_run_verlet_cuda(N) PASTE(runner_run_verlet_cuda,N)
#define runner_run_cuda(N) PASTE(runner_run_cuda,N)


/**
 * @brief Loop over the cell pairs and process them, using Verlet lists.
 *
 * @param forces         Global force array, three floats per particle,
 *                       indexed through @c ind; accumulated into.
 * @param counts         Number of particles per cell.
 * @param ind            Offset of each cell's first particle in
 *                       @c cuda_parts and @c forces.
 * @param verlet_rebuild Non-zero if the Verlet lists have to be rebuilt
 *                       on this call (passed through to the pair kernel).
 *
 * NOTE(review): thread 0 broadcasts the pair id through the volatile
 * __shared__ variable @c pid with no explicit barrier, and the force/copy
 * loops stride by cuda_frame — this assumes blockDim.x == cuda_frame ==
 * warpSize (warp-synchronous execution).  Confirm, and add __syncwarp()
 * after the pid update on architectures with independent thread
 * scheduling (Volta+).
 */
__global__ void runner_run_verlet_cuda(cuda_nrparts) ( float *forces , int *counts , int *ind , int verlet_rebuild ) {

    int threadID;
    int cid, cjd;
    float epot = 0.0f;
    /* Pair index fetched by thread 0 and read by the whole block. */
    volatile __shared__ int pid;
#ifdef FORCES_LOCAL
    int k;
    /* Shared scratch: forces for both cells (3 floats/particle each) and
       the two sort arrays, packed into a single aligned buffer. */
    __shared__ __align__(16) int buff[ 8*cuda_nrparts ];
    float *forces_i = (float *)&buff[ 0 ];
    float *forces_j = (float *)&buff[ 3*cuda_nrparts ];
    unsigned int *sort_i = (unsigned int *)&buff[ 6*cuda_nrparts ];
    unsigned int *sort_j = (unsigned int *)&buff[ 7*cuda_nrparts ];
#else
    unsigned int seed = 6178 + blockIdx.x;
    float *forces_i, *forces_j;
    __shared__ unsigned int sort_i[ cuda_nrparts ];
    __shared__ unsigned int sort_j[ cuda_nrparts ];
    struct queue_cuda *myq , *queues[ cuda_maxqueues ];
    int naq = cuda_nrqueues, qid;
#endif
#if !defined(PARTS_TEX)
#ifdef PARTS_LOCAL
    float4 *parts_i;
    __shared__ float4 parts_j[ cuda_nrparts ];
#else
    float4 *parts_i, *parts_j;
#endif
#endif

    TIMER_TIC2

    /* Get the thread id (blocks are one-dimensional). */
    threadID = threadIdx.x;

    /* Init the list of queues: work from this SM's own queue first and
       keep the others around as steal victims. */
#ifndef FORCES_LOCAL
    if ( threadID == 0 ) {
        /* FIX: index modulo cuda_nrqueues (as runner_run_cuda already does
           for myq) so that an SM id >= cuda_nrqueues cannot read or write
           out of bounds. */
        myq = &cuda_queues[ get_smid() % cuda_nrqueues ];
        for ( qid = 0 ; qid < cuda_nrqueues ; qid++ )
            queues[qid] = &cuda_queues[qid];
        naq = cuda_nrqueues - 1;
        queues[ get_smid() % cuda_nrqueues ] = queues[ naq ];
    }
#endif


    /* Main loop: keep grabbing pairs until the queues are drained.
       FIX: the original condition ( pid >= 0 && pid < cuda_nr_pairs ) read
       the uninitialised shared variable pid before thread 0 had ever
       written it; loop unconditionally and rely on the pid < 0 check
       below, exactly as runner_run_cuda does. */
    while ( 1 ) {

        /* Let the first thread grab a pair. */
        if ( threadID == 0 ) {
#ifdef FORCES_LOCAL
            if ( ( pid = atomicAdd( &cuda_pair_next , 1 ) ) >= cuda_nr_pairs )
                pid = -1;
#else
            TIMER_TIC
            while ( 1 ) {
                /* Try the home queue first; fall through to stealing if it
                   is exhausted or empty. */
                if ( myq->rec_count >= myq->count || ( pid = runner_cuda_gettask( myq , 0 ) ) < 0 ) {
                    /* Compact away queues that have been fully recycled. */
                    for ( qid = 0 ; qid < naq ; qid++ )
                        if ( queues[qid]->rec_count >= queues[qid]->count )
                            queues[ qid-- ] = queues[ --naq ];
                    if ( naq == 0 ) {
                        pid = -1;
                        break;
                    }
                    /* Pick a victim at random (LCG step on the seed). */
                    seed = 1103515245 * seed + 12345;
                    qid = seed % naq;
                    if ( ( pid = runner_cuda_gettask( queues[qid] , 1 ) ) >= 0 ) {
                        /* Recycle the stolen task into our own queue if it
                           has room, otherwise back into the victim's. */
                        if ( atomicAdd( (int *)&myq->count , 1 ) < cuda_queue_size )
                            myq->rec_data[ atomicAdd( (int *)&myq->rec_count , 1 ) ] = pid;
                        else {
                            atomicSub( (int *)&myq->count , 1 );
                            atomicAdd( (int *)&queues[qid]->count , 1 );
                            queues[qid]->rec_data[ atomicAdd( (int *)&queues[qid]->rec_count , 1 ) ] = pid;
                        }
                        break;
                    }
                }
                else
                    break;
            }
            TIMER_TOC(tid_gettask)
#endif
        }

        /* Exit if we didn't get a valid pair. */
        if ( pid < 0 )
            break;

        /* Get a hold of the pair cells. */
        cid = cuda_pairs[pid].i;
        cjd = cuda_pairs[pid].j;

        /* Genuine pair, or cell self-interaction? */
        if ( cid != cjd ) {

#ifdef FORCES_LOCAL

            /* Clear the local forces buffers. */
            for ( k = threadID ; k < 3*counts[cid] ; k += cuda_frame )
                forces_i[k] = 0.0f;
            for ( k = threadID ; k < 3*counts[cjd] ; k += cuda_frame )
                forces_j[k] = 0.0f;

            /* Copy the particle data into the local buffers. */
#ifndef PARTS_TEX
#ifdef PARTS_LOCAL
            parts_i = &cuda_parts[ ind[cid] ];
            cuda_memcpy( parts_j , &cuda_parts[ ind[cjd] ] , sizeof(float4) * counts[cjd] );
#else
            parts_i = &cuda_parts[ ind[cid] ];
            parts_j = &cuda_parts[ ind[cjd] ];
#endif
#endif

            /* Compute the cell pair interactions. */
#ifdef PARTS_TEX
            runner_dopair4_verlet_cuda(
                cid , counts[cid] ,
                cjd , counts[cjd] ,
                forces_i , forces_j ,
                sort_i , sort_j ,
                cuda_pairs[pid].shift ,
                verlet_rebuild , &cuda_sortlists[ cuda_sortlists_ind[ pid ] ] ,
                &epot );
#else
            runner_dopair4_verlet_cuda(
                parts_i , counts[cid] ,
                parts_j , counts[cjd] ,
                forces_i , forces_j ,
                sort_i , sort_j ,
                cuda_pairs[pid].shift ,
                verlet_rebuild , &cuda_sortlists[ cuda_sortlists_ind[ pid ] ] ,
                &epot );
#endif

            /* Write the particle forces back to cell_i under its mutex. */
            if ( threadID == 0 )
                cuda_mutex_lock( &cuda_taboo[cid] );
            cuda_sum( &forces[ 3*ind[cid] ] , forces_i , 3*counts[cid] );
            __threadfence();
            if ( threadID == 0 ) {
                cuda_mutex_unlock( &cuda_taboo[cid] );
                __threadfence();
            }

            /* Write the particle forces back to cell_j under its mutex. */
            if ( threadID == 0 )
                cuda_mutex_lock( &cuda_taboo[cjd] );
            cuda_sum( &forces[ 3*ind[cjd] ] , forces_j , 3*counts[cjd] );
            __threadfence();
            if ( threadID == 0 ) {
                cuda_mutex_unlock( &cuda_taboo[cjd] );
                __threadfence();
            }

#else

            /* Accumulate directly into the global force array.  Only the
               unlock appears in this path — presumably the task scheduler
               acquires cuda_taboo[cid]/[cjd] before handing out the pair;
               TODO(review): confirm against runner_cuda_gettask. */
            forces_i = &forces[ 3*ind[cid] ];
            forces_j = &forces[ 3*ind[cjd] ];

            /* Copy the particle data into the local buffers. */
#ifndef PARTS_TEX
#ifdef PARTS_LOCAL
            parts_i = &cuda_parts[ ind[cid] ];
            cuda_memcpy( parts_j , &cuda_parts[ ind[cjd] ] , sizeof(float4) * counts[cjd] );
#else
            parts_i = &cuda_parts[ ind[cid] ];
            parts_j = &cuda_parts[ ind[cjd] ];
#endif
#endif

            /* Compute the cell pair interactions. */
#ifdef PARTS_TEX
            runner_dopair4_verlet_cuda(
                cid , counts[cid] ,
                cjd , counts[cjd] ,
                forces_i , forces_j ,
                sort_i , sort_j ,
                cuda_pairs[pid].shift ,
                verlet_rebuild , &cuda_sortlists[ cuda_sortlists_ind[ pid ] ] ,
                &epot );
#else
            runner_dopair4_verlet_cuda(
                parts_i , counts[cid] ,
                parts_j , counts[cjd] ,
                forces_i , forces_j ,
                sort_i , sort_j ,
                cuda_pairs[pid].shift ,
                verlet_rebuild , &cuda_sortlists[ cuda_sortlists_ind[ pid ] ] ,
                &epot );
#endif

            /* Unlock these cells' mutexes. */
            if ( threadID == 0 ) {
                cuda_mutex_unlock( &cuda_taboo[cid] );
                cuda_mutex_unlock( &cuda_taboo[cjd] );
            }

#endif

        }
        else {

#ifdef FORCES_LOCAL

            /* Clear the forces buffer. */
            TIMER_TIC
            for ( k = threadID ; k < 3*counts[cid] ; k += cuda_frame )
                forces_i[k] = 0.0f;
            TIMER_TOC(tid_update)

            /* Copy the particle data into the local buffer. */
#ifndef PARTS_TEX
#ifdef PARTS_LOCAL
            /* FIX: parts_j is a __shared__ array in this configuration and
               cannot be re-pointed; copy into it directly. */
            cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
#else
            /* Re-use the (unused for self-interactions) forces_j scratch. */
            parts_j = (float4 *)forces_j;
            cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
#endif
#endif

            /* Compute the cell self interactions. */
#ifdef PARTS_TEX
            runner_doself4_diag_cuda( cid , counts[cid] , forces_i , &epot );
#else
            runner_doself4_diag_cuda( parts_j , counts[cid] , forces_i , &epot );
#endif

            /* Write the particle forces back to cell_i under its mutex. */
            if ( threadID == 0 )
                cuda_mutex_lock( &cuda_taboo[cid] );
            cuda_sum( &forces[ 3*ind[cid] ] , forces_i , 3*counts[cid] );
            __threadfence();
            if ( threadID == 0 ) {
                cuda_mutex_unlock( &cuda_taboo[cid] );
                __threadfence();
            }

#else

            /* Accumulate directly into the global force array. */
            forces_i = &forces[ 3*ind[cid] ];

            /* FIX: the original set parts_j = (float4 *)forces_j here and
               memcpy'd the particle data over the global force array that
               forces_j pointed to (or through an unset pointer on the
               first self pair); read from cuda_parts instead, as
               runner_run_cuda does. */
#ifndef PARTS_TEX
#ifdef PARTS_LOCAL
            cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
#else
            parts_j = &cuda_parts[ ind[cid] ];
#endif
#endif

            /* Compute the cell self interactions. */
#ifdef PARTS_TEX
            runner_doself4_diag_cuda( cid , counts[cid] , forces_i , &epot );
#else
            runner_doself4_diag_cuda( parts_j , counts[cid] , forces_i , &epot );
#endif

            /* Unlock this cell's mutex. */
            if ( threadID == 0 )
                cuda_mutex_unlock( &cuda_taboo[cid] );

#endif

        }

    } /* main loop. */

    /* Accumulate this thread's potential energy into the global tally. */
    atomicAdd( &cuda_epot , epot );

    /* Make a notch on the barrier; the last block out publishes the total
       energy and resets the global state for the next call. */
    if ( threadID == 0 && atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 ) {
        cuda_barrier = 0;
        cuda_epot_out = cuda_epot;
        cuda_epot = 0.0f;
#ifdef FORCES_LOCAL
        cuda_pair_next = 0;
#else
        /* Swap each queue's task and recycling buffers and reset the
           cursors, so the recycled tasks become next call's input. */
        for ( qid = 0 ; qid < cuda_nrqueues ; qid++ ) {
            volatile int *temp = cuda_queues[qid].data;
            cuda_queues[qid].data = cuda_queues[qid].rec_data;
            cuda_queues[qid].rec_data = temp;
            cuda_queues[qid].first = 0;
            cuda_queues[qid].last = cuda_queues[qid].count;
            cuda_queues[qid].rec_count = 0;
        }
#endif
    }

    TIMER_TOC2(tid_total)

}


/**
 * @brief Loop over the cell pairs and process them (non-Verlet version).
 *
 * @param forces Global force array, three floats per particle, indexed
 *               through @c ind; accumulated into.
 * @param counts Number of particles per cell.
 * @param ind    Offset of each cell's first particle in @c cuda_parts
 *               and @c forces.
 *
 * NOTE(review): as in runner_run_verlet_cuda, the broadcast of @c pid via
 * the volatile __shared__ variable assumes blockDim.x == cuda_frame ==
 * warpSize; add __syncwarp() on Volta+ if that assumption is relaxed.
 */
__global__ void runner_run_cuda(cuda_nrparts) ( float *forces , int *counts , int *ind ) {

    int threadID;
    int cid, cjd;
    float epot = 0.0f;
    /* Pair index fetched by thread 0 and read by the whole block. */
    __shared__ volatile int pid;
#ifdef FORCES_LOCAL
    int k;
    /* Shared scratch: forces for both cells plus the two sort arrays. */
    __shared__ __align__(16) int buff[ 8*cuda_nrparts ];
    float *forces_i = (float *)&buff[ 0 ];
    float *forces_j = (float *)&buff[ 3*cuda_nrparts ];
    unsigned int *sort_i = (unsigned int *)&buff[ 6*cuda_nrparts ];
    unsigned int *sort_j = (unsigned int *)&buff[ 7*cuda_nrparts ];
#else
    unsigned int seed = 6178 + blockIdx.x;
    float *forces_i, *forces_j;
    __shared__ unsigned int sort_i[ cuda_nrparts ];
    __shared__ unsigned int sort_j[ cuda_nrparts ];
    struct queue_cuda *myq , *queues[ cuda_maxqueues ];
    int naq = cuda_nrqueues, qid;
#endif
#if !defined(PARTS_TEX)
#ifdef PARTS_LOCAL
    float4 *parts_i;
    __shared__ float4 parts_j[ cuda_nrparts ];
#else
    float4 *parts_i, *parts_j;
#endif
#endif

    TIMER_TIC2

    /* Get the thread id (blocks are one-dimensional). */
    threadID = threadIdx.x;

    /* Init the list of queues: work from this SM's own queue first and
       keep the others around as steal victims. */
#ifndef FORCES_LOCAL
    if ( threadID == 0 ) {
        myq = &cuda_queues[ get_smid() % cuda_nrqueues ];
        for ( qid = 0 ; qid < cuda_nrqueues ; qid++ )
            queues[qid] = &cuda_queues[qid];
        naq = cuda_nrqueues - 1;
        /* FIX: same modulo guard as the myq lookup above; the original
           indexed queues[ get_smid() ] unguarded. */
        queues[ get_smid() % cuda_nrqueues ] = queues[ naq ];
    }
#endif


    /* Main loop: keep grabbing pairs until the queues are drained. */
    while ( 1 ) {

        /* Let the first thread grab a pair. */
        if ( threadID == 0 ) {
#ifdef FORCES_LOCAL
            if ( ( pid = atomicAdd( &cuda_pair_next , 1 ) ) >= cuda_nr_pairs )
                pid = -1;
#else
            TIMER_TIC
            while ( 1 ) {
                /* Try the home queue first; fall through to stealing if it
                   is exhausted or empty. */
                if ( myq->rec_count >= myq->count || ( pid = runner_cuda_gettask( myq , 0 ) ) < 0 ) {
                    /* Compact away queues that have been fully recycled. */
                    for ( qid = 0 ; qid < naq ; qid++ )
                        if ( queues[qid]->rec_count >= queues[qid]->count )
                            queues[ qid-- ] = queues[ --naq ];
                    if ( naq == 0 ) {
                        pid = -1;
                        break;
                    }
                    /* Pick a victim at random (LCG step on the seed). */
                    seed = 1103515245 * seed + 12345;
                    qid = seed % naq;
                    if ( ( pid = runner_cuda_gettask( queues[qid] , 1 ) ) >= 0 ) {
                        /* Recycle the stolen task into our own queue if it
                           has room, otherwise back into the victim's. */
                        if ( atomicAdd( (int *)&myq->count , 1 ) < cuda_queue_size )
                            myq->rec_data[ atomicAdd( (int *)&myq->rec_count , 1 ) ] = pid;
                        else {
                            atomicSub( (int *)&myq->count , 1 );
                            atomicAdd( (int *)&queues[qid]->count , 1 );
                            queues[qid]->rec_data[ atomicAdd( (int *)&queues[qid]->rec_count , 1 ) ] = pid;
                        }
                        break;
                    }
                }
                else
                    break;
            }
            TIMER_TOC(tid_gettask)
#endif
        }

        /* Exit if we didn't get a valid pair. */
        if ( pid < 0 )
            break;

        /* Get a hold of the pair cells. */
        cid = cuda_pairs[pid].i;
        cjd = cuda_pairs[pid].j;

        /* Genuine pair, or cell self-interaction? */
        if ( cid != cjd ) {

#ifdef FORCES_LOCAL
            /* Clear the local forces buffers. */
            TIMER_TIC
            for ( k = threadID ; k < 3*counts[cid] ; k += cuda_frame )
                forces_i[k] = 0.0f;
            for ( k = threadID ; k < 3*counts[cjd] ; k += cuda_frame )
                forces_j[k] = 0.0f;
            TIMER_TOC(tid_update)

            /* Copy the particle data into the local buffers. */
#ifndef PARTS_TEX
#ifdef PARTS_LOCAL
            parts_i = &cuda_parts[ ind[cid] ];
            cuda_memcpy( parts_j , &cuda_parts[ ind[cjd] ] , sizeof(float4) * counts[cjd] );
#else
            parts_i = &cuda_parts[ ind[cid] ];
            parts_j = &cuda_parts[ ind[cjd] ];
#endif
#endif

            /* Compute the cell pair interactions. */
#ifdef PARTS_TEX
            runner_dopair4_sorted_cuda(
                cid , counts[cid] ,
                cjd , counts[cjd] ,
                forces_i , forces_j ,
                sort_i , sort_j ,
                cuda_pairs[pid].shift ,
                &epot );
#else
            runner_dopair4_sorted_cuda(
                parts_i , counts[cid] ,
                parts_j , counts[cjd] ,
                forces_i , forces_j ,
                sort_i , sort_j ,
                cuda_pairs[pid].shift ,
                &epot );
#endif

            /* Write the particle forces back to cell_i under its mutex. */
            if ( threadID == 0 )
                cuda_mutex_lock( &cuda_taboo[cid] );
            cuda_sum( &forces[ 3*ind[cid] ] , forces_i , 3*counts[cid] );
            __threadfence();
            if ( threadID == 0 ) {
                cuda_mutex_unlock( &cuda_taboo[cid] );
                __threadfence();
            }

            /* Write the particle forces back to cell_j under its mutex. */
            if ( threadID == 0 )
                cuda_mutex_lock( &cuda_taboo[cjd] );
            cuda_sum( &forces[ 3*ind[cjd] ] , forces_j , 3*counts[cjd] );
            __threadfence();
            if ( threadID == 0 ) {
                cuda_mutex_unlock( &cuda_taboo[cjd] );
                __threadfence();
            }

#else

            /* Accumulate directly into the global force array.  Only the
               unlock appears in this path — presumably the task scheduler
               acquires cuda_taboo[cid]/[cjd] before handing out the pair;
               TODO(review): confirm against runner_cuda_gettask. */
            forces_i = &forces[ 3*ind[cid] ];
            forces_j = &forces[ 3*ind[cjd] ];

            /* Copy the particle data into the local buffers. */
#ifndef PARTS_TEX
#ifdef PARTS_LOCAL
            parts_i = &cuda_parts[ ind[cid] ];
            cuda_memcpy( parts_j , &cuda_parts[ ind[cjd] ] , sizeof(float4) * counts[cjd] );
#else
            parts_i = &cuda_parts[ ind[cid] ];
            parts_j = &cuda_parts[ ind[cjd] ];
#endif
#endif

            /* Compute the cell pair interactions. */
#ifdef PARTS_TEX
            runner_dopair4_sorted_cuda(
                cid , counts[cid] ,
                cjd , counts[cjd] ,
                forces_i , forces_j ,
                sort_i , sort_j ,
                cuda_pairs[pid].shift ,
                &epot );
#else
            runner_dopair4_sorted_cuda(
                parts_i , counts[cid] ,
                parts_j , counts[cjd] ,
                forces_i , forces_j ,
                sort_i , sort_j ,
                cuda_pairs[pid].shift ,
                &epot );
#endif

            /* Unlock these cells' mutexes. */
            if ( threadID == 0 ) {
                cuda_mutex_unlock( &cuda_taboo[cid] );
                cuda_mutex_unlock( &cuda_taboo[cjd] );
            }

#endif

        }
        else {

#ifdef FORCES_LOCAL
            /* Clear the forces buffer. */
            TIMER_TIC
            for ( k = threadID ; k < 3*counts[cid] ; k += cuda_frame )
                forces_i[k] = 0.0f;
            TIMER_TOC(tid_update)

            /* Copy the particle data into the local buffer. */
#ifndef PARTS_TEX
#ifdef PARTS_LOCAL
            /* FIX: parts_j is a __shared__ array in this configuration and
               cannot be re-pointed; copy into it directly. */
            cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
#else
            /* Re-use the (unused for self-interactions) forces_j scratch. */
            parts_j = (float4 *)forces_j;
            cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
#endif
#endif

            /* Compute the cell self interactions. */
#ifdef PARTS_TEX
            runner_doself4_cuda( cid , counts[cid] , forces_i , &epot );
#else
            runner_doself4_cuda( parts_j , counts[cid] , forces_i , &epot );
#endif

            /* Write the particle forces back to cell_i under its mutex. */
            if ( threadID == 0 )
                cuda_mutex_lock( &cuda_taboo[cid] );
            cuda_sum( &forces[ 3*ind[cid] ] , forces_i , 3*counts[cid] );
            __threadfence();
            if ( threadID == 0 ) {
                cuda_mutex_unlock( &cuda_taboo[cid] );
                __threadfence();
            }

#else

            /* Accumulate directly into the global force array. */
            forces_i = &forces[ 3*ind[cid] ];

            /* Read the particle data (the inner #ifdef FORCES_LOCAL the
               original carried here was dead code inside this #else
               branch, and the surviving pointer assignment would not
               compile under PARTS_LOCAL, where parts_j is a __shared__
               array). */
#ifndef PARTS_TEX
#ifdef PARTS_LOCAL
            cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
#else
            parts_j = &cuda_parts[ ind[cid] ];
#endif
#endif

            /* Compute the cell self interactions. */
#ifdef PARTS_TEX
            runner_doself4_cuda( cid , counts[cid] , forces_i , &epot );
#else
            runner_doself4_cuda( parts_j , counts[cid] , forces_i , &epot );
#endif

            /* Unlock this cell's mutex. */
            if ( threadID == 0 )
                cuda_mutex_unlock( &cuda_taboo[cid] );

#endif

        }

    } /* main loop. */

    /* Accumulate this thread's potential energy into the global tally. */
    atomicAdd( &cuda_epot , epot );

    /* Make a notch on the barrier; the last block out publishes the total
       energy and resets the global state for the next call. */
    if ( threadID == 0 && atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 ) {
        cuda_barrier = 0;
        cuda_epot_out = cuda_epot;
        cuda_epot = 0.0f;
#ifdef FORCES_LOCAL
        cuda_pair_next = 0;
#else
        /* Swap each queue's task and recycling buffers and reset the
           cursors, so the recycled tasks become next call's input. */
        for ( qid = 0 ; qid < cuda_nrqueues ; qid++ ) {
            volatile int *temp = cuda_queues[qid].data;
            cuda_queues[qid].data = cuda_queues[qid].rec_data;
            cuda_queues[qid].rec_data = temp;
            cuda_queues[qid].first = 0;
            cuda_queues[qid].last = cuda_queues[qid].count;
            cuda_queues[qid].rec_count = 0;
        }
#endif
    }

    TIMER_TOC2(tid_total)

}