mdcore 0.1.5
src/runner_cuda_main.h
/*******************************************************************************
 * This file is part of mdcore.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 ******************************************************************************/


/* Set the kernel names depending on cuda_nrparts. */
#define PASTE(x,y) x ## _ ## y
#define runner_run_verlet_cuda(N) PASTE(runner_run_verlet_cuda,N)
#define runner_run_cuda(N) PASTE(runner_run_cuda,N)

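/* For illustration: with the token-pasting macros above and, e.g.,
   -Dcuda_nrparts=128 on the compiler command line, the declarations below
   expand to uniquely named kernels, roughly

       __global__ void runner_run_verlet_cuda_128 ( float *forces , int *counts , int *ind , int verlet_rebuild );
       __global__ void runner_run_cuda_128 ( float *forces , int *counts , int *ind );

   so that variants compiled for different particles-per-cell limits can
   coexist. The value 128 is just an example. */


/**
 * @brief Loop over the cell pairs assigned to this block and compute the
 *      particle interactions, using the pre-computed sorted particle lists
 *      (Verlet-list variant).
 *
 * @param forces Global force array, three floats per particle.
 * @param counts Number of particles in each cell.
 * @param ind Offset of each cell's first particle in the global particle array.
 * @param verlet_rebuild Non-zero if the particle sort lists need to be re-built.
 */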
__global__ void runner_run_verlet_cuda(cuda_nrparts) ( float *forces , int *counts , int *ind , int verlet_rebuild ) {

    int threadID;
    int cid, cjd;
    float epot = 0.0f;
    volatile __shared__ int pid;
    #ifdef FORCES_LOCAL
        int k;
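        /* Shared scratch layout: forces_i and forces_j get 3*cuda_nrparts
           floats each, followed by the two sort-key arrays of cuda_nrparts
           unsigned ints each, for 8*cuda_nrparts ints in total. */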
        __shared__ __align__(16) int buff[ 8*cuda_nrparts ];
        float *forces_i = (float *)&buff[ 0 ];
        float *forces_j = (float *)&buff[ 3*cuda_nrparts ];
        unsigned int *sort_i = (unsigned int *)&buff[ 6*cuda_nrparts ];
        unsigned int *sort_j = (unsigned int *)&buff[ 7*cuda_nrparts ];
    #else
        unsigned int seed = 6178 + blockIdx.x;
        float *forces_i, *forces_j;
        __shared__ unsigned int sort_i[ cuda_nrparts ];
        __shared__ unsigned int sort_j[ cuda_nrparts ];
        struct queue_cuda *myq , *queues[ cuda_maxqueues ];
        int naq = cuda_nrqueues, qid;
    #endif
    #if !defined(PARTS_TEX)
        #ifdef PARTS_LOCAL
            float4 *parts_i;
            __shared__ float4 parts_j[ cuda_nrparts ];
        #else
            float4 *parts_i, *parts_j;
        #endif
    #endif

    TIMER_TIC2

    /* Get the block and thread ids. */
    threadID = threadIdx.x;

    /* Check that we've got the correct warp size! */
    /* if ( warpSize != cuda_frame ) {
        if ( blockID == 0 && threadID == 0 )
            printf( "runner_run_cuda: error: the warp size of the device (%i) does not match the warp size mdcore was compiled for (%i).\n" ,
                warpSize , cuda_frame );
        return;
        } */


    /* Init the list of queues. */
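    /* Each block draws its tasks from the queue of the SM it is running on;
       that queue is then swapped out of the candidate list so that only the
       remaining queues are considered when stealing work. */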
    #ifndef FORCES_LOCAL
    if ( threadID == 0 ) {
        myq = &cuda_queues[ get_smid() ];
        for ( qid = 0 ; qid < cuda_nrqueues ; qid++ )
            queues[qid] = &cuda_queues[qid];
        naq = cuda_nrqueues - 1;
        queues[ get_smid() ] = queues[ naq ];
        }
    #endif


    /* Main loop... */
    while ( 1 ) {

        /* Let the first thread grab a pair. */
        if ( threadID == 0 ) {
            #ifdef FORCES_LOCAL
                if ( ( pid = atomicAdd( &cuda_pair_next , 1 ) ) >= cuda_nr_pairs )
                    pid = -1;
            #else
                TIMER_TIC
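                /* Work-stealing task acquisition: try this SM's own queue
                   first; if it is drained, pick one of the remaining
                   non-empty queues at random (simple LCG on seed) and steal
                   a task from it, recording the task so that the queues can
                   be re-filled at the end of the step. */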
                while ( 1 ) {
                    if ( myq->rec_count >= myq->count || ( pid = runner_cuda_gettask( myq , 0 ) ) < 0 ) {
                        for ( qid = 0 ; qid < naq ; qid++ )
                            if ( queues[qid]->rec_count >= queues[qid]->count )
                                queues[ qid-- ] = queues[ --naq ];
                        if ( naq == 0 ) {
                            pid = -1;
                            break;
                            }
                        seed = 1103515245 * seed + 12345;
                        qid = seed % naq;
                        if ( ( pid = runner_cuda_gettask( queues[qid] , 1 ) ) >= 0 ) {
                            if ( atomicAdd( (int *)&myq->count , 1 ) < cuda_queue_size )
                                myq->rec_data[ atomicAdd( (int *)&myq->rec_count , 1 ) ] = pid;
                            else {
                                atomicSub( (int *)&myq->count , 1 );
                                atomicAdd( (int *)&queues[qid]->count , 1 );
                                queues[qid]->rec_data[ atomicAdd( (int *)&queues[qid]->rec_count , 1 ) ] = pid;
                                }
                            break;
                            }
                        }
                    else
                        break;
                    }
                TIMER_TOC(tid_gettask)
            #endif
            }

        /* Exit if we didn't get a valid pair. */
        if ( pid < 0 )
            break;

        /* Get a hold of the pair cells. */
        cid = cuda_pairs[pid].i;
        cjd = cuda_pairs[pid].j;

        /* Do the pair. */
        if ( cid != cjd ) {

            #ifdef FORCES_LOCAL

                /* Clear the forces buffers. */
                for ( k = threadID ; k < 3*counts[cid] ; k += cuda_frame )
                    forces_i[k] = 0.0f;
                for ( k = threadID ; k < 3*counts[cjd] ; k += cuda_frame )
                    forces_j[k] = 0.0f;
                // __threadfence_block();

                /* Copy the particle data into the local buffers. */
                #ifndef PARTS_TEX
                    #ifdef PARTS_LOCAL
                        parts_i = &cuda_parts[ ind[cid] ];
                        cuda_memcpy( parts_j , &cuda_parts[ ind[cjd] ] , sizeof(float4) * counts[cjd] );
                        // __threadfence_block();
                    #else
                        parts_i = &cuda_parts[ ind[cid] ];
                        parts_j = &cuda_parts[ ind[cjd] ];
                    #endif
                #endif

                /* Compute the cell pair interactions. */
                #ifdef PARTS_TEX
                    runner_dopair4_verlet_cuda(
                        cid , counts[cid] ,
                        cjd , counts[cjd] ,
                        forces_i , forces_j , 
                        sort_i , sort_j ,
                        cuda_pairs[pid].shift ,
                        verlet_rebuild , &cuda_sortlists[ cuda_sortlists_ind[ pid ] ] ,
                        &epot );
                #else
                    runner_dopair4_verlet_cuda(
                        parts_i , counts[cid] ,
                        parts_j , counts[cjd] ,
                        forces_i , forces_j , 
                        sort_i , sort_j ,
                        cuda_pairs[pid].shift ,
                        verlet_rebuild , &cuda_sortlists[ cuda_sortlists_ind[ pid ] ] ,
                        &epot );
                #endif

                /* Write the particle forces back to cell_i. */
                if ( threadID == 0 )
                    cuda_mutex_lock( &cuda_taboo[cid] );
                cuda_sum( &forces[ 3*ind[cid] ] , forces_i , 3*counts[cid] );
                __threadfence();
                if ( threadID == 0 ) {
                    cuda_mutex_unlock( &cuda_taboo[cid] );
                    __threadfence();
                    }

                /* Write the particle forces back to cell_j. */
                if ( threadID == 0 )
                    cuda_mutex_lock( &cuda_taboo[cjd] );
                cuda_sum( &forces[ 3*ind[cjd] ] , forces_j , 3*counts[cjd] );
                __threadfence();
                if ( threadID == 0 ) {
                    cuda_mutex_unlock( &cuda_taboo[cjd] );
                    __threadfence();
                    }

            #else

                /* Put a finger on the forces. */
                forces_i = &forces[ 3*ind[cid] ];
                forces_j = &forces[ 3*ind[cjd] ];

                /* Copy the particle data into the local buffers. */
                #ifndef PARTS_TEX
                    #ifdef PARTS_LOCAL
                        parts_i = &cuda_parts[ ind[cid] ];
                        cuda_memcpy( parts_j , &cuda_parts[ ind[cjd] ] , sizeof(float4) * counts[cjd] );
                        // __threadfence_block();
                    #else
                        parts_i = &cuda_parts[ ind[cid] ];
                        parts_j = &cuda_parts[ ind[cjd] ];
                    #endif
                #endif

                /* Compute the cell pair interactions. */
                #ifdef PARTS_TEX
                    runner_dopair4_verlet_cuda(
                        cid , counts[cid] ,
                        cjd , counts[cjd] ,
                        forces_i , forces_j , 
                        sort_i , sort_j ,
                        cuda_pairs[pid].shift ,
                        verlet_rebuild , &cuda_sortlists[ cuda_sortlists_ind[ pid ] ] ,
                        &epot );
                #else
                    runner_dopair4_verlet_cuda(
                        parts_i , counts[cid] ,
                        parts_j , counts[cjd] ,
                        forces_i , forces_j , 
                        sort_i , sort_j ,
                        cuda_pairs[pid].shift ,
                        verlet_rebuild , &cuda_sortlists[ cuda_sortlists_ind[ pid ] ] ,
                        &epot );
                #endif

                /* Unlock these cells' mutexes. */
                if ( threadID == 0 ) {
                    cuda_mutex_unlock( &cuda_taboo[cid] );
                    cuda_mutex_unlock( &cuda_taboo[cjd] );
                    }

            #endif

            }
        else {

            #ifdef FORCES_LOCAL

                /* Clear the forces buffer. */
                TIMER_TIC
                for ( k = threadID ; k < 3*counts[cid] ; k += cuda_frame )
                    forces_i[k] = 0.0f;
                // __threadfence_block();
                TIMER_TOC(tid_update)

                /* Copy the particle data into the local buffers. */
                #ifndef PARTS_TEX
                    parts_j = (float4 *)forces_j;
                    cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
                #endif

                /* Compute the cell self interactions. */
                #ifdef PARTS_TEX
                    runner_doself4_diag_cuda( cid , counts[cid] , forces_i , &epot );
                #else
                    runner_doself4_diag_cuda( parts_j , counts[cid] , forces_i , &epot );
                #endif

                /* Write the particle forces back to cell_i. */
                if ( threadID == 0 )
                    cuda_mutex_lock( &cuda_taboo[cid] );
                cuda_sum( &forces[ 3*ind[cid] ] , forces_i , 3*counts[cid] );
                __threadfence();
                if ( threadID == 0 ) {
                    cuda_mutex_unlock( &cuda_taboo[cid] );
                    __threadfence();
                    }

            #else

                /* Put a finger on the forces. */
                forces_i = &forces[ 3*ind[cid] ];

                /* Copy the particle data into the local buffers. */
                #ifndef PARTS_TEX
                    parts_j = (float4 *)forces_j;
                    cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
                #endif

                /* Compute the cell self interactions. */
                #ifdef PARTS_TEX
                    runner_doself4_diag_cuda( cid , counts[cid] , forces_i , &epot );
                #else
                    runner_doself4_diag_cuda( parts_j , counts[cid] , forces_i , &epot );
                #endif

                /* Unlock this cell's mutex. */
                if ( threadID == 0 )
                    cuda_mutex_unlock( &cuda_taboo[cid] );

            #endif

            }

        } /* main loop. */

    /* Accumulate the potential energy. */
    atomicAdd( &cuda_epot , epot );

    /* Make a notch on the barrier, last one out cleans up the mess... */
    if ( threadID == 0 && atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 ) {
        cuda_barrier = 0;
        cuda_epot_out = cuda_epot;
        cuda_epot = 0.0f;
        #ifdef FORCES_LOCAL
            cuda_pair_next = 0;
        #else
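            /* Swap each queue's task list with its recycled-task list and
               reset the counters, so that every queue is full again for the
               next call of this kernel. */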
            for ( qid = 0 ; qid < cuda_nrqueues ; qid++ ) {
                volatile int *temp = cuda_queues[qid].data; cuda_queues[qid].data = cuda_queues[qid].rec_data; cuda_queues[qid].rec_data = temp;
                cuda_queues[qid].first = 0;
                cuda_queues[qid].last = cuda_queues[qid].count;
                cuda_queues[qid].rec_count = 0;
                }
        #endif
        }

    TIMER_TOC2(tid_total)

    }


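/**
 * @brief Loop over the cell pairs assigned to this block and compute the
 *      particle interactions, using sorted pairwise interactions without
 *      Verlet lists.
 *
 * @param forces Global force array, three floats per particle.
 * @param counts Number of particles in each cell.
 * @param ind Offset of each cell's first particle in the global particle array.
 */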
__global__ void runner_run_cuda(cuda_nrparts) ( float *forces , int *counts , int *ind ) {

    int threadID;
    int cid, cjd;
    float epot = 0.0f;
    __shared__ volatile int pid;
    #ifdef FORCES_LOCAL
        int k;
        __shared__ __align__(16) int buff[ 8*cuda_nrparts ];
        float *forces_i = (float *)&buff[ 0 ];
        float *forces_j = (float *)&buff[ 3*cuda_nrparts ];
        unsigned int *sort_i = (unsigned int *)&buff[ 6*cuda_nrparts ];
        unsigned int *sort_j = (unsigned int *)&buff[ 7*cuda_nrparts ];
    #else
        unsigned int seed = 6178 + blockIdx.x;
        float *forces_i, *forces_j;
        __shared__ unsigned int sort_i[ cuda_nrparts ];
        __shared__ unsigned int sort_j[ cuda_nrparts ];
        struct queue_cuda *myq , *queues[ cuda_maxqueues ];
        int naq = cuda_nrqueues, qid;
    #endif
    #if !defined(PARTS_TEX)
        #ifdef PARTS_LOCAL
            float4 *parts_i;
            __shared__ float4 parts_j[ cuda_nrparts ];
        #else
            float4 *parts_i, *parts_j;
        #endif
    #endif

    TIMER_TIC2

    /* Get the block and thread ids. */
    threadID = threadIdx.x;

    /* Check that we've got the correct warp size! */
    /* if ( warpSize != cuda_frame ) {
        if ( blockID == 0 && threadID == 0 )
            printf( "runner_run_cuda: error: the warp size of the device (%i) does not match the warp size mdcore was compiled for (%i).\n" ,
                warpSize , cuda_frame );
        return;
        } */

    /* Init the list of queues. */
    #ifndef FORCES_LOCAL
    if ( threadID == 0 ) {
        myq = &cuda_queues[ get_smid() % cuda_nrqueues ];
        for ( qid = 0 ; qid < cuda_nrqueues ; qid++ )
            queues[qid] = &cuda_queues[qid];
        naq = cuda_nrqueues - 1;
        queues[ get_smid() ] = queues[ naq ];
        }
    #endif


    /* Main loop... */
    while ( 1 ) {

        /* Let the first thread grab a pair. */
        if ( threadID == 0 ) {
            #ifdef FORCES_LOCAL
                if ( ( pid = atomicAdd( &cuda_pair_next , 1 ) ) >= cuda_nr_pairs )
                    pid = -1;
            #else
                TIMER_TIC
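                /* Same work-stealing task acquisition as in the Verlet kernel
                   above: use this SM's queue first, then steal from a randomly
                   chosen non-empty queue. */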
                while ( 1 ) {
                    if ( myq->rec_count >= myq->count || ( pid = runner_cuda_gettask( myq , 0 ) ) < 0 ) {
                        for ( qid = 0 ; qid < naq ; qid++ )
                            if ( queues[qid]->rec_count >= queues[qid]->count )
                                queues[ qid-- ] = queues[ --naq ];
                        if ( naq == 0 ) {
                            pid = -1;
                            break;
                            }
                        seed = 1103515245 * seed + 12345;
                        qid = seed % naq;
                        if ( ( pid = runner_cuda_gettask( queues[qid] , 1 ) ) >= 0 ) {
                            if ( atomicAdd( (int *)&myq->count , 1 ) < cuda_queue_size )
                                myq->rec_data[ atomicAdd( (int *)&myq->rec_count , 1 ) ] = pid;
                            else {
                                atomicSub( (int *)&myq->count , 1 );
                                atomicAdd( (int *)&queues[qid]->count , 1 );
                                queues[qid]->rec_data[ atomicAdd( (int *)&queues[qid]->rec_count , 1 ) ] = pid;
                                }
                            break;
                            }
                        }
                    else
                        break;
                    }
                TIMER_TOC(tid_gettask)
            #endif
            }

        /* Exit if we didn't get a valid pair. */
        if ( pid < 0 )
            break;

        /* Get a hold of the pair cells. */
        cid = cuda_pairs[pid].i;
        cjd = cuda_pairs[pid].j;

        /* if ( threadID == 0 )
            printf( "runner_run_cuda: block %03i got pid=%i (%i/%i).\n" , blockID , pid , cid , cjd ); */

        /* Do the pair. */
        if ( cid != cjd ) {

            #ifdef FORCES_LOCAL
                /* Clear the forces buffer. */
                TIMER_TIC
                for ( k = threadID ; k < 3*counts[cid] ; k += cuda_frame )
                    forces_i[k] = 0.0f;
                for ( k = threadID ; k < 3*counts[cjd] ; k += cuda_frame )
                    forces_j[k] = 0.0f;
                // __threadfence_block();
                TIMER_TOC(tid_update)

                /* Copy the particle data into the local buffers. */
                #ifndef PARTS_TEX
                    #ifdef PARTS_LOCAL
                        // cuda_memcpy( parts_i , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
                        parts_i = &cuda_parts[ ind[cid] ];
                        cuda_memcpy( parts_j , &cuda_parts[ ind[cjd] ] , sizeof(float4) * counts[cjd] );
                        // __threadfence_block();
                    #else
                        parts_i = &cuda_parts[ ind[cid] ];
                        parts_j = &cuda_parts[ ind[cjd] ];
                    #endif
                #endif

                /* Compute the cell pair interactions. */
                #ifdef PARTS_TEX
                    runner_dopair4_sorted_cuda(
                        cid , counts[cid] ,
                        cjd , counts[cjd] ,
                        forces_i , forces_j , 
                        sort_i , sort_j ,
                        cuda_pairs[pid].shift ,
                        &epot );
                #else
                    runner_dopair4_sorted_cuda(
                        parts_i , counts[cid] ,
                        parts_j , counts[cjd] ,
                        forces_i , forces_j , 
                        sort_i , sort_j ,
                        cuda_pairs[pid].shift ,
                        &epot );
                #endif

                /* Write the particle forces back to cell_i. */
                if ( threadID == 0 )
                    cuda_mutex_lock( &cuda_taboo[cid] );
                cuda_sum( &forces[ 3*ind[cid] ] , forces_i , 3*counts[cid] );
                __threadfence();
                if ( threadID == 0 ) {
                    cuda_mutex_unlock( &cuda_taboo[cid] );
                    __threadfence();
                    }

                /* Write the particle forces back to cell_j. */
                if ( threadID == 0 )
                    cuda_mutex_lock( &cuda_taboo[cjd] );
                cuda_sum( &forces[ 3*ind[cjd] ] , forces_j , 3*counts[cjd] );
                __threadfence();
                if ( threadID == 0 ) {
                    cuda_mutex_unlock( &cuda_taboo[cjd] );
                    __threadfence();
                    }

            #else

                /* Put a finger on the forces. */
                forces_i = &forces[ 3*ind[cid] ];
                forces_j = &forces[ 3*ind[cjd] ];

                /* Copy the particle data into the local buffers. */
                #ifndef PARTS_TEX
                    #ifdef PARTS_LOCAL
                        // cuda_memcpy( parts_i , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
                        parts_i = &cuda_parts[ ind[cid] ];
                        cuda_memcpy( parts_j , &cuda_parts[ ind[cjd] ] , sizeof(float4) * counts[cjd] );
                        // __threadfence_block();
                    #else
                        parts_i = &cuda_parts[ ind[cid] ];
                        parts_j = &cuda_parts[ ind[cjd] ];
                    #endif
                #endif

                /* Compute the cell pair interactions. */
                #ifdef PARTS_TEX
                    runner_dopair4_sorted_cuda(
                        cid , counts[cid] ,
                        cjd , counts[cjd] ,
                        forces_i , forces_j , 
                        sort_i , sort_j ,
                        cuda_pairs[pid].shift ,
                        &epot );
                #else
                    runner_dopair4_sorted_cuda(
                        parts_i , counts[cid] ,
                        parts_j , counts[cjd] ,
                        forces_i , forces_j , 
                        sort_i , sort_j ,
                        cuda_pairs[pid].shift ,
                        &epot );
                #endif

                /* Unlock these cells' mutexes. */
                if ( threadID == 0 ) {
                    cuda_mutex_unlock( &cuda_taboo[cid] );
                    cuda_mutex_unlock( &cuda_taboo[cjd] );
                    }

            #endif

            }
        else {

            #ifdef FORCES_LOCAL
                /* Clear the forces buffer. */
                TIMER_TIC
                for ( k = threadID ; k < 3*counts[cid] ; k += cuda_frame )
                    forces_i[k] = 0.0f;
                // __threadfence_block();
                TIMER_TOC(tid_update)

                /* Copy the particle data into the local buffers. */
                #ifndef PARTS_TEX
                    #ifdef FORCES_LOCAL
                        parts_j = (float4 *)forces_j;
                        cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
                    #else
                        parts_j = &cuda_parts[ ind[cid] ];
                    #endif
                #endif

                /* Compute the cell self interactions. */
                #ifdef PARTS_TEX
                    runner_doself4_cuda( cid , counts[cid] , forces_i , &epot );
                #else
                    runner_doself4_cuda( parts_j , counts[cid] , forces_i , &epot );
                #endif

                /* Write the particle forces back to cell_i. */
                if ( threadID == 0 )
                    cuda_mutex_lock( &cuda_taboo[cid] );
                cuda_sum( &forces[ 3*ind[cid] ] , forces_i , 3*counts[cid] );
                __threadfence();
                if ( threadID == 0 ) {
                    cuda_mutex_unlock( &cuda_taboo[cid] );
                    __threadfence();
                    }

            #else
                forces_i = &forces[ 3*ind[cid] ];

                /* Copy the particle data into the local buffers. */
                #ifndef PARTS_TEX
                    #ifdef FORCES_LOCAL
                        parts_j = (float4 *)forces_j;
                        cuda_memcpy( parts_j , &cuda_parts[ ind[cid] ] , sizeof(float4) * counts[cid] );
                    #else
                        parts_j = &cuda_parts[ ind[cid] ];
                    #endif
                #endif

                /* Compute the cell self interactions. */
                #ifdef PARTS_TEX
                    runner_doself4_cuda( cid , counts[cid] , forces_i , &epot );
                #else
                    runner_doself4_cuda( parts_j , counts[cid] , forces_i , &epot );
                #endif

                /* Unlock this cell's mutex. */
                if ( threadID == 0 )
                    cuda_mutex_unlock( &cuda_taboo[cid] );

            #endif

            }

        } /* main loop. */

    /* Accumulate the potential energy. */
    atomicAdd( &cuda_epot , epot );

    /* Make a notch on the barrier, last one out cleans up the mess... */
    if ( threadID == 0 && atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 ) {
        cuda_barrier = 0;
        cuda_epot_out = cuda_epot;
        cuda_epot = 0.0f;
        #ifdef FORCES_LOCAL
            cuda_pair_next = 0;
        #else
            // int nr_tasks = 0;
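            /* As above, swap each queue's task list with its recycled-task
               list so that the queues are refilled for the next kernel call. */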
            for ( qid = 0 ; qid < cuda_nrqueues ; qid++ ) {
                volatile int *temp = cuda_queues[qid].data; cuda_queues[qid].data = cuda_queues[qid].rec_data; cuda_queues[qid].rec_data = temp;
                cuda_queues[qid].first = 0;
                cuda_queues[qid].last = cuda_queues[qid].count;
                cuda_queues[qid].rec_count = 0;
                // printf( "runner_cuda: queue %i has %i elements.\n" , qid , cuda_queues[qid].count );
                // nr_tasks += cuda_queues[qid].count;
                }
            // printf( "runner_cuda: counted %i tasks in all queues.\n" , nr_tasks );
        #endif
        }

    /* if ( threadID == 0 )
        printf( "runner_run_cuda: block %03i is done.\n" , blockID ); */

    TIMER_TOC2(tid_total)

    }
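
/* Illustrative sketch only: how a host might launch the name-mangled kernels
   declared in this header. The names nr_blocks, forces_d, counts_d, ind_d and
   use_verlet are assumptions made for the example, as is the choice of
   cuda_frame (one warp) threads per block; the real launch logic lives in the
   engine's CUDA driver code, not in this header.

       extern float *forces_d;          // device force array
       extern int *counts_d, *ind_d;    // per-cell particle counts and offsets
       int nr_blocks = 64;              // e.g. a few blocks per multiprocessor
       int use_verlet = 1, verlet_rebuild = 1;

       // Compiled with -Dcuda_nrparts=128, the kernels above are named
       // runner_run_verlet_cuda_128 and runner_run_cuda_128.
       dim3 grid( nr_blocks ), block( cuda_frame );
       if ( use_verlet )
           runner_run_verlet_cuda_128<<< grid , block >>>( forces_d , counts_d , ind_d , verlet_rebuild );
       else
           runner_run_cuda_128<<< grid , block >>>( forces_d , counts_d , ind_d );
       cudaDeviceSynchronize();
*/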