#include #include #include #include #include #include #include #include #ifndef __CUDACC__ #define XCPU #elif __CUDA_ARCH__ >= 120 #define USMA #endif #ifdef XCPU #include #include #endif using namespace std; namespace xppc{ #include "ini.cxx" #include "pro.cu" void initialize(float enh = 1.f){ m.set(); d.eff*=enh; } unsigned int pmax, pmxo, pn; #ifdef XCPU dats *e; // pointer to a copy of "d" on device int nblk, nthr, ntot; void ini(int type){ rs_ini(); pn=0; ntot=nblk*nthr; pmax=ntot*NPHO; pmxo=pmax/OVER; pmax=pmxo*OVER; d.hnum=pmax/HQUO; { d.hits = q.hits = new hit[d.hnum]; if(type==0) d.pz = q.pz = new photon[pmxo]; #ifdef TALL d.bf = new pbuf[pmax]; #endif } { d.z=&z; e=&d; oms=q.oms; } { unsigned int size=d.rsize, need=seed+1; if(sizedevice=device; { ostringstream o; o<<"NPHO_"<= 3000 checkError(cudaFuncSetCacheConfig(propagate, cudaFuncCachePreferL1)); #endif cudaFuncAttributes attr; checkError(cudaFuncGetAttributes (&attr, propagate)); nblk=prop.multiProcessorCount, nthr=attr.maxThreadsPerBlock; cerr<<"Running on "<1){ pmax=ntot*npho; pmxo=pmax/OVER; pmax=pmxo*OVER; d.hnum=pmax/HQUO; unsigned long mtot=sizeof(datz)+sizeof(dats)+d.gsize*sizeof(DOM); mtot+=+d.hnum*sizeof(hit); if(d.type==0) mtot+=pmxo*sizeof(photon); #ifdef TALL mtot+=pmax*sizeof(pbuf); #endif if(mtot>xmem) npho/=2; else break; } } { checkError(cudaStreamCreate(&stream)); checkError(cudaEventCreateWithFlags(&evt1, cudaEventBlockingSync)); checkError(cudaEventCreateWithFlags(&evt2, cudaEventBlockingSync)); } { unsigned int size=d.rsize; if(size0){ cerr<<"Error: TOT was a nan or an inf "<=d.hnum){ d.hidx=d.hnum; cerr<<"Error: data buffer overflow occurred!"<0) checkError(cudaStreamSynchronize(stream)); unsigned int pn=num/OVER; unsigned int size=pn*sizeof(photon); checkError(cudaMemcpyAsync(d.pz, &q.pz[idx], size, cudaMemcpyHostToDevice, stream)); idx+=pn; } void kernel_f(){ checkError(cudaStreamSynchronize(stream)); if(num>0){ checkError(cudaEventRecord(evt1, stream)); propagate<<< 1, 1, 0, stream >>>(e, 0); checkError(cudaGetLastError()); propagate<<< nblk, nthr, 0, stream >>>(e, num); checkError(cudaGetLastError()); checkError(cudaEventRecord(evt2, stream)); } } void stop(){ fprintf(stderr, "Device time: %2.1f (in-kernel: %2.1f...%2.1f) [ms]\n", deviceTime, threadMin, threadMax); checkError(cudaThreadExit()); } }; vector gpus; void ini(int type){ d.hnum=0; pmax=0, pmxo=0, pn=0; for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++){ i->set(); i->ini(type); if(xgpu) sv++; d.hnum+=i->d.hnum; pmax+=i->pmax, pmxo+=i->pmxo; } { unsigned long size=d.hnum*sizeof(hit); checkError(cudaMallocHost((void**) &q.hits, size)); } if(d.type==0){ unsigned long size=pmxo*sizeof(photon); checkError(cudaMallocHost((void**) &q.pz, size)); } } void fin(){ for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++) i->set(), i->fin(); checkError(cudaFreeHost(q.hits)); if(d.type==0) checkError(cudaFreeHost(q.pz)); } void listDevices(){ int deviceCount, driver, runtime; cudaGetDeviceCount(&deviceCount); cudaDriverGetVersion(&driver); cudaRuntimeGetVersion(&runtime); fprintf(stderr, "Found %d devices, driver %d, runtime %d\n", deviceCount, driver, runtime); for(int device=0; device0){ d.hidx=0; #ifdef XCPU for(d.blockIdx=0, d.gridDim=nblk, blockDim.x=nthr; d.blockIdx=d.hnum){ d.hidx=d.hnum; cerr<<"Error: data buffer overflow occurred!"<::iterator i=gpus.begin(); i!=gpus.end(); i++) i->set(), i->kernel_i(); #endif cerr<<"photons: "<::iterator i=gpus.begin(); i!=gpus.end(); i++){ i->num=over*((num*(unsigned long long) i->pmax)/(over*(unsigned long long) pmax)); sum+=i->num; } while(num>sum){ static int res=0; gpu& g=gpus[res++%gpus.size()]; if(g.num::iterator i=gpus.begin(); i!=gpus.end(); i++) i->set(), i->kernel_c(idx); } for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++) i->set(), i->kernel_f(); #endif if(old>0) print(); #ifndef XCPU old=num; for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++) i->old=i->num; #endif } float square(float x){ return x*x; } int flset(int str, int dom){ int type=1; float r[3]={0, 0, 0}; if(str<0){ type=2; str=-str; } if(str==0) switch(dom){ case 1: type=3; r[0]=544.07; r[1]=55.89; r[2]=136.86; break; case 2: type=4; r[0]=11.87; r[1]=179.19; r[2]=-205.64; break; } else for(int n=0; n0){ float FLZ, FLR; sincosf(fcv*30.f, &FLZ, &FLR); FLZ*=OMR, FLR*=OMR; sft[0]+=FLR*n[0]; sft[1]+=FLR*n[1]; sft[2]+=FLZ; r[3]+=OMR*d.ocv; } float xi; sincosf(d.up, &n[2], &xi); n[0]*=xi; n[1]*=xi; if(m!=NULL){ float o[3]={0,0,1}; float r[3]; r[0]=m[1]*o[2]-m[2]*o[1]; // m[1] r[1]=m[2]*o[0]-m[0]*o[2]; //-m[0] r[2]=m[0]*o[1]-m[1]*o[0]; // 0 float norm=sqrt(r[0]*r[0]+r[1]*r[1]+r[2]*r[2]); if(norm>0){ float cs=0; for(int i=0; i<3; i++) r[i]/=norm, cs+=o[i]*m[i]; float sn=sin(acos(cs)); //norm float R[3][3]={0}; for(int i=0; i<3; i++) for(int j=0; j<3; j++) R[i][j]=(i==j?cs:sn*r[3-i-j]*((j-i+3)%3==1?1:-1))+(1-cs)*r[i]*r[j]; float tmp[3]; for(int i=0; i<3; i++){ tmp[i]=0; for(int j=0; j<3; j++) tmp[i]+=R[i][j]*n[j]; } for(int i=0; i<3; i++) n[i]=tmp[i]; for(int i=0; i<3; i++){ tmp[i]=0; for(int j=0; j<3; j++) tmp[i]+=R[i][j]*sft[j]; } for(int i=0; i<3; i++) sft[i]=tmp[i]; } } for(int i=0; i<3; i++) r[i]+=sft[i]; } #endif void flone(unsigned long long num){ for(long long i=llroundf(num*(long double)d.eff); i>0; i-=pmax) kernel(min(i, (long long) pmax)); #ifndef XCPU kernel(0); #endif } void flasher(int str, int dom, unsigned long long num, int itr){ flini(str, dom); for(int j=0; j0) printf("\n"); } fin(); } #ifdef XCPU void start(){} void stop(){} void choose(int device){ sv+=device; seed=device; nblk=NBLK, nthr=NTHR; } void listDevices(){} #else void start(){ cudaSetDeviceFlags(cudaDeviceBlockingSync); } void stop(){ fprintf(stderr, "\n"); for(vector::iterator i=gpus.begin(); i!=gpus.end(); i++) i->set(), i->stop(); } void choose(int device){ if(device<0){ int deviceCount; cudaGetDeviceCount(&deviceCount); for(int device=0; device1; } #endif #include "f2k.cxx" } #ifndef XLIB using namespace xppc; int main(int arg_c, char *arg_a[]){ start(); if(arg_c<=1){ listDevices(); fprintf(stderr, "Use: %s [device] (f2k muons)\n" " %s [str] [om] [num] [device] (flasher)\n", arg_a[0], arg_a[0]); } else if(0==strcmp(arg_a[1], "-")){ initialize(); ices & w = z.w[WNUM/2]; cerr<<"For wavelength="<1) device=atoi(arg_a[1]); initialize(); choose(device); fprintf(stderr, "Processing f2k muons from stdin on device %d\n", device); f2k(); } else{ int str=0, dom=0, device=0, itr=0; unsigned long long num=1000000ULL; if(arg_c>1) str=atoi(arg_a[1]); if(arg_c>2) dom=atoi(arg_a[2]); if(arg_c>3){ num=(unsigned long long) atof(arg_a[3]); char * sub = strchr(arg_a[3], '*'); if(sub!=NULL) itr=(int) atof(++sub); } if(arg_c>4) device=atoi(arg_a[4]); initialize(); choose(device); fprintf(stderr, "Running flasher simulation on device %d\n", device); flasher(str, dom, num, itr); } stop(); } #endif