CUDA - How to synchronize global memory between multiple kernel launches?
I want to launch the following kernel multiple times in a loop (pseudo-code):
    __global__ void kernel(t_dev /* input array in global mem */) {
        __shared__ prec tt[block_dim];
        if (thid < m) {
            tt[thid] = t_dev.data[ii];   // mem read!
        }
        ...                              // modify
        __syncthreads();
        if (thid < m) {
            t_dev.data[thid] = tt[thid]; // mem write!
        }
        __threadfence(); // or __syncthreads(); //// necessary!! why?
    }
Conceptually, I read in the values of t_dev, modify them, and write them out to global memory again. Then I launch the same kernel again!
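To make the pattern concrete, here is a minimal, self-contained sketch of what I mean (the names increment_kernel, d_data, N and LOOPS are placeholders, not from my real code):

    // Minimal sketch of the read-modify-write-relaunch pattern described above.
    // All names (increment_kernel, d_data, N, LOOPS) are hypothetical.
    #include <cstdio>
    #include <cuda_runtime.h>

    const int N = 256;

    __global__ void increment_kernel(float *data)
    {
        __shared__ float tt[N];
        int thid = threadIdx.x;

        tt[thid] = data[thid];   // read from global memory
        __syncthreads();
        tt[thid] += 1.0f;        // modify in shared memory
        data[thid] = tt[thid];   // write back to global memory
    }

    int main()
    {
        float *d_data;
        cudaMalloc(&d_data, N * sizeof(float));
        cudaMemset(d_data, 0, N * sizeof(float));

        const int LOOPS = 10;
        for (int i = 0; i < LOOPS; ++i) {
            // same (default) stream: each launch should be serialized
            // against the previous one
            increment_kernel<<<1, N>>>(d_data);
        }
        cudaDeviceSynchronize(); // host-side sync before reading the result

        float h_data[N];
        cudaMemcpy(h_data, d_data, N * sizeof(float), cudaMemcpyDeviceToHost);
        printf("h_data[0] = %f (expected %d)\n", h_data[0], LOOPS);
        cudaFree(d_data);
        return 0;
    }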
Why do I need the __threadfence() or __syncthreads() at the end? Without it the result is wrong, as if the memory writes were not finished when the same kernel starts again. That is what happens here; my GTX 580 has device overlap enabled.
But why would the global memory writes not be finished when the next kernel starts? Is it because of device overlap, or something else? I thought that when you launch kernel after kernel, all memory writes/reads of one kernel are finished before the next kernel starts... :-)
Thanks for your answers!

Some code:
    for(int kernelaidx = 0; kernelaidx < loops; kernelaidx++){
        proxgpu::sorproxcontactordered_1threads_stepa_kernelwrap<prec,sorproxsettings1>(
            mu_dev, x_new_dev, t_dev, x_old_dev, d_dev,
            t_dev, kernelaidx, pconvergedflag_dev,
            m_abstol, m_reltol);
        proxgpu::sorprox_stepb_kernelwrap<prec,sorproxsettings1>(
            t_dev, t_dev, x_new_dev, kernelaidx);
    }
These are the two kernels in the loop; t_dev and x_new_dev are passed from step A to step B.
The kernel looks as follows:
    template<typename prec, int threads_per_block, int block_dim, int prox_packages, typename tconvexset>
    __global__ void sorproxcontactordered_1threads_stepa_kernel(
            utilcuda::matrix<prec> mu_dev,
            utilcuda::matrix<prec> y_dev,
            utilcuda::matrix<prec> t_dev,
            utilcuda::matrix<prec> x_old_dev,
            utilcuda::matrix<prec> d_dev,
            utilcuda::matrix<prec> t_dev,  // name appears twice in the paste; presumably two distinct matrices
            int kernelaidx,
            int maxncontacts,
            bool * convergedflag_dev,
            prec _abstol,
            prec _reltol)
    {
        //__threadfence() here or at the end; works???? why?

        // assumes 1 block, threads_per_block threads and a column-major matrix t_dev
        int thid = threadIdx.x;
        int m  = min(maxncontacts*prox_package_size, block_dim); // actual size of the diagonal block!
        int i  = kernelaidx * block_dim;
        int ii = i + thid;

        // first copy x_old_dev into shared memory
        __shared__ prec xx[block_dim]; // each thread writes one element, if within the limit!!
        __shared__ prec tt[block_dim];
        if(thid < m){
            xx[thid] = x_old_dev.data[ii];
            tt[thid] = t_dev.data[ii];
        }
        __syncthreads();

        prec abstol = _abstol;
        prec reltol = _reltol;

        int jj;
        //prec t_iijj;

        // offset t_dev_ptr to the start of the block
        prec * t_dev_ptr  = ptrelem_colm(t_dev, i, i);
        prec * mu_dev_ptr = &mu_dev.data[prox_packages*kernelaidx];
        __syncthreads();

        for(int j_t = 0; j_t < m; j_t += prox_package_size){
            // select the number of threads we need!
            // here we process one [m x prox_package_size] block

            // first the normal direction =====================================
            jj = i + j_t;
            __syncthreads();
            if( ii == jj ){ // select the thread on the diagonal ...
                prec x_new_n = (d_dev.data[ii] + tt[thid]);
                // prox in the normal direction!
                if(x_new_n <= 0.0){ x_new_n = 0.0; }
                /* if( !checkconverged(x_new, xx[thid], abstol, reltol)){
                       *convergedflag_dev = 0;
                   } */
                xx[thid] = x_new_n;
                tt[thid] = 0.0;
            }
            // threads not on the diagonal fall through to the sync!
            __syncthreads();

            // select m threads!
            if(thid < m){
                tt[thid] += t_dev_ptr[thid] * xx[j_t];
            }
            // ================================================================

            // we need to synchronize here because one thread has finished
            // lambda_t2 in shared mem tt, which is updated by another thread!
            __syncthreads();

            // second, the tangential direction ===============================
            jj++;
            __syncthreads();
            if( ii == jj ){ // select the thread on the diagonal; one thread
                            // finishes both the t1 and t2 directions.
                // prox in the tangential direction
                prec lambda_t1 = (d_dev.data[ii]   + tt[thid]);
                prec lambda_t2 = (d_dev.data[ii+1] + tt[thid+1]);
                prec radius    = (*mu_dev_ptr) * xx[thid-1];
                prec absvalue  = sqrt(lambda_t1*lambda_t1 + lambda_t2*lambda_t2);
                if(absvalue > radius){
                    lambda_t1 = (lambda_t1 * radius) / absvalue;
                    lambda_t2 = (lambda_t2 * radius) / absvalue;
                }
                /* if( !checkconverged(lambda_t1, xx[thid], abstol, reltol)){
                       *convergedflag_dev = 0;
                   }
                   if( !checkconverged(lambda_t2, xx[thid+1], abstol, reltol)){
                       *convergedflag_dev = 0;
                   } */
                // write the two values back!
                xx[thid]   = lambda_t1;  tt[thid]   = 0.0;
                xx[thid+1] = lambda_t2;  tt[thid+1] = 0.0;
            }
            // threads not on the diagonal fall through to the sync!
            __syncthreads();

            t_dev_ptr = ptrcoloffset_colm(t_dev_ptr, 1, t_dev.outerstridebytes);
            __syncthreads();
            if(thid < m){
                tt[thid] += t_dev_ptr[thid] * xx[j_t+1];
            }
            __syncthreads();

            t_dev_ptr = ptrcoloffset_colm(t_dev_ptr, 1, t_dev.outerstridebytes);
            __syncthreads();
            if(thid < m){
                tt[thid] += t_dev_ptr[thid] * xx[j_t+2];
            }
            // ================================================================
            __syncthreads();

            // move t_dev_ptr one column
            t_dev_ptr = ptrcoloffset_colm(t_dev_ptr, 1, t_dev.outerstridebytes);
            // move mu_ptr to the next contact
            __syncthreads();
            mu_dev_ptr = &mu_dev_ptr[1];
            __syncthreads();
        }
        __syncthreads();

        // write back the results; we should not need to synchronize here,
        // but it is safe anyway, so test it first!
        if(thid < m){
            y_dev.data[ii] = xx[thid]; // updated in kernel B
            t_dev.data[ii] = tt[thid]; // updated in kernel B
        }
        //__threadfence(); /// this stupid threadfence makes it work!
    }
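The helpers ptrelem_colm and ptrcoloffset_colm are from my own utilities and are not shown above; roughly, they do column-major pointer arithmetic along these lines (a sketch only, the real signatures may differ):

    // Plausible sketch of the column-major pointer helpers, assuming a matrix
    // with an outer (column) stride in bytes, as the field name
    // outerstridebytes suggests. Not the original implementation.
    template<typename prec>
    struct matrix {
        prec *data;
        int rows, cols;
        size_t outerstridebytes; // byte distance between consecutive columns
    };

    // pointer to element (row, col) of a column-major matrix
    template<typename prec>
    __device__ prec* ptrelem_colm(const matrix<prec> &m, int row, int col)
    {
        char *base = reinterpret_cast<char*>(m.data);
        return reinterpret_cast<prec*>(base + col * m.outerstridebytes) + row;
    }

    // advance a column pointer by `cols` columns
    template<typename prec>
    __device__ prec* ptrcoloffset_colm(prec *p, int cols, size_t outerstridebytes)
    {
        char *base = reinterpret_cast<char*>(p);
        return reinterpret_cast<prec*>(base + cols * outerstridebytes);
    }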
I compare the solution at the end with the CPU, and here I put __syncthreads() everywhere I can, just to be safe, for a start! (This code does Gauss-Seidel-style iteration.) But it does not work at all without the __threadfence() at the end or at the beginning, which does not make sense to me...
Sorry for all the code, but can you guess where the problem comes from? I am at a bit of a loss to explain why this happens. I have checked the algorithm several times; there is no memory error (reported by Nsight) or anything else, everything works fine... The kernel is launched with one block only!
If you launch successive instances of the kernel into the same stream, each kernel launch is synchronous compared to the kernel instance before and after it. The programming model guarantees it. CUDA only permits simultaneous kernel execution for kernels launched into different streams of the same context, and even then overlapping kernel execution only happens if the scheduler determines that sufficient resources are available to do so.

Neither __threadfence nor __syncthreads will have the effect you seem to be thinking of: __threadfence works only at the scope of the currently active threads, and __syncthreads is an intra-block barrier operation. If you want kernel-to-kernel synchronization, you need to use one of the host-side synchronization calls, like cudaThreadSynchronize (pre CUDA 4.0) or cudaDeviceSynchronize (CUDA 4.0 and later), or the per-stream equivalent if you are using streams.
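For illustration, a minimal sketch of the host-side options (kernelA and kernelB are placeholders for your step A and step B kernels):

    // Sketch of host-side synchronization; kernelA/kernelB are placeholders.
    #include <cuda_runtime.h>

    __global__ void kernelA(float *d) { d[threadIdx.x] += 1.0f; }
    __global__ void kernelB(float *d) { d[threadIdx.x] *= 2.0f; }

    void run(float *d_data, int loops)
    {
        for (int i = 0; i < loops; ++i) {
            // same (default) stream: kernelB cannot start before kernelA has
            // finished all of its global memory writes, and the next loop
            // iteration's kernelA cannot start before kernelB has finished
            kernelA<<<1, 256>>>(d_data);
            kernelB<<<1, 256>>>(d_data);
        }
        // block the host until every launched kernel has completed
        cudaDeviceSynchronize();     // CUDA 4.0 and later
        // cudaThreadSynchronize();  // pre-4.0 equivalent

        // with explicit streams, use the per-stream equivalent instead:
        // cudaStream_t s;
        // cudaStreamCreate(&s);
        // kernelA<<<1, 256, 0, s>>>(d_data);
        // cudaStreamSynchronize(s);
        // cudaStreamDestroy(s);
    }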