/* mog.cu MIMD On GPU simulator by Hank Dietz This simulator uses an accumulator-based instruction set that is based on reprocessing MIPSEL assembly code. The instruction set is fundamentally similar, but has been simplified to make decoding instructions trivial. 20101122 first alpha test release */ #define VERSION 20101122 #define TARGET "cuda" #define TRACE //#undef TRACE #include #include #include #include #include #include "op.h" #include "do.h" #include "pr.h" #include "mog.h" #define GPU_MULTIPROCESSORS 4 #define GPU_WARPSIZE 32 #define GPU_THREADSPERBLOCK (8*GPU_WARPSIZE) #define SHARED_SKEW 16 #define WARPSIZE 32 /* Size of a WARP on this GPU */ #define WARPDIV(X) ((X)>>5) /* X / WARPSIZE */ #define WARPMOD(X) ((X)&31) /* X % WARPSIZE */ #define WARPMUL(X) ((X)<<5) /* X * WARPSIZE */ #define MAR(X) (((X)<<3)&~3) /* X * WARPSIZE / 4 */ /* TIMEOUT can be anything; 1 for TRACE */ #ifdef TRACE #define TIMEOUT 1 #else #define TIMEOUT (1024) #endif #define BNPROC GPU_THREADSPERBLOCK #define NPROC (GPU_MULTIPROCESSORS*BNPROC) typedef union word_union { float f; int i; unsigned int u; short h[2]; signed char b[4]; } word_t; #define FLAGS 1 /* Number of different flag states */ typedef struct flags_struct { int flag[FLAGS]; /* Flags showing which inst present */ } flags_t; typedef struct pe_struct { word_t regs[REGSUSED]; /* Copy of regs for this PE */ word_t a; /* Copy of accumulator */ int pc; /* PC offset */ int ir; /* IR */ } pe_t; typedef struct data_struct { flags_t flags; /* Emulator flags */ pe_t pe[NPROC]; /* PE state info (fixed size) */ /* Memory pool (potentially variable size) */ word_t mem[WARPDIV(NPROC)][DATASIZE][WARPSIZE]; } data_t; typedef struct { word_t regs[WARPDIV(BNPROC)][REGSUSED][WARPSIZE]; } my_shared_t; texture codetex; texture cpooltex; /* Emulator as a single fragment function */ template __global__ void emulate(register data_t *alldata) { extern __shared__ my_shared_t myshared[]; #define BIPROC (threadIdx.x) register int warpmodbiproc = WARPMOD(BIPROC); register int warpdivbiproc = WARPDIV(BIPROC); register const int IPROC = BIPROC + (blockIdx.x * blockSize); register unsigned int ir, op; register int pc = alldata->pe[IPROC].pc; register word_t a; register int serial = 0; register volatile word_t *mem = &(alldata->mem[WARPDIV(IPROC)][0][WARPMOD(BIPROC)]); register word_t *regs = ((word_t *) &((*myshared).regs[warpdivbiproc][0][warpmodbiproc])); register int moretodo; #define REGF(N) regs[WARPMUL(REGNAME(N))].f #define REGI(N) regs[WARPMUL(REGNAME(N))].i #define REGU(N) regs[WARPMUL(REGNAME(N))].u #define MEMI(M) mem[MAR(M)].i #define MEMH(M) mem[MAR(M)].h[((M)&2)>>1] #define MEMB(M) mem[MAR(M)].b[(M)&3] /* Reset flags */ if (IPROC < FLAGS) alldata->flags.flag[IPROC] = 0; /* Restore registers */ for (a.i=0; a.i=0; --a.i) { REGI(a.i) = alldata->pe[IPROC].regs[a.i].i; } a = alldata->pe[IPROC].a; #define CPOOL(p, i) tex1D(cpooltex, CHASH(p-1, i)) #define CODE(addr) tex1D(codetex, addr) //define NEXT { op = OPCODE(ir = CODE(pc++)); } #define NEXT { ir = CODE(pc); op = OPCODE(ir); ++pc; } #define OPIS(o) if (op == (OP##o)) { DO##o } /* Fetch current instruction */ NEXT; do { /* Decode and execute single-instruction interpreters */ moretodo = (TIMEOUT + 1); /* Expand-out the optimized interpreter sequence */ OPORDER ++serial; if (op != OPsys) moretodo = serial; } while (moretodo < TIMEOUT); __syncthreads(); /* Is everybody stuck at a syscall? */ if (op != OPsys) { /* Nope. */ alldata->flags.flag[0] = 1; } __syncthreads(); /* Save registers */ alldata->pe[IPROC].a = a; alldata->pe[IPROC].pc = pc - 1; for (a.i=0; a.ipe[IPROC].regs[a.i].i = REGI(a.i); } } /* Trace emulator as a single fragment function */ template __global__ void emutrace(register data_t *alldata) { extern __shared__ my_shared_t myshared[]; #define BIPROC (threadIdx.x) register int warpmodbiproc = WARPMOD(BIPROC); register int warpdivbiproc = WARPDIV(BIPROC); register const int IPROC = BIPROC + (blockIdx.x * blockSize); register unsigned int ir, op; register int pc = alldata->pe[IPROC].pc; register word_t a; register int serial = 0; register volatile word_t *mem = &(alldata->mem[WARPDIV(IPROC)][0][WARPMOD(BIPROC)]); register word_t *regs = ((word_t *) &((*myshared).regs[warpdivbiproc][0][warpmodbiproc])); register int moretodo; #define REGF(N) regs[WARPMUL(REGNAME(N))].f #define REGI(N) regs[WARPMUL(REGNAME(N))].i #define REGU(N) regs[WARPMUL(REGNAME(N))].u #define MEMI(M) mem[MAR(M)].i #define MEMH(M) mem[MAR(M)].h[((M)&2)>>1] #define MEMB(M) mem[MAR(M)].b[(M)&3] /* Reset flags */ if (IPROC < FLAGS) alldata->flags.flag[IPROC] = 0; /* Restore registers */ for (a.i=0; a.i=0; --a.i) { REGI(a.i) = alldata->pe[IPROC].regs[a.i].i; } a = alldata->pe[IPROC].a; #define CPOOL(p, i) tex1D(cpooltex, CHASH(p-1, i)) #define CODE(addr) tex1D(codetex, addr) //define NEXT { op = OPCODE(ir = CODE(pc++)); } #define NEXT { ir = CODE(pc); op = OPCODE(ir); ++pc; } #define OPIS(o) if (op == (OP##o)) { DO##o } /* Fetch current instruction */ NEXT; do { /* Decode and execute single-instruction interpreters */ moretodo = (TIMEOUT + 1); #ifdef TRACE #define TRACEELSE else #else #define TRACEELSE /* don't serialize */ #endif OPIS(add) TRACEELSE OPIS(addf) TRACEELSE OPIS(and) TRACEELSE OPIS(div) TRACEELSE OPIS(divu) TRACEELSE OPIS(divf) TRACEELSE OPIS(xor) TRACEELSE OPIS(mul) TRACEELSE OPIS(mulf) TRACEELSE OPIS(or) TRACEELSE OPIS(slt) TRACEELSE OPIS(sltu) TRACEELSE OPIS(sltf) TRACEELSE OPIS(neg) TRACEELSE OPIS(negf) TRACEELSE OPIS(rem) TRACEELSE OPIS(remu) TRACEELSE OPIS(sra) TRACEELSE OPIS(sll) TRACEELSE OPIS(srl) TRACEELSE OPIS(j) TRACEELSE OPIS(jf) TRACEELSE OPIS(jt) TRACEELSE OPIS(i2f) TRACEELSE OPIS(u2f) TRACEELSE OPIS(f2i) TRACEELSE OPIS(li) TRACEELSE OPIS(lr) TRACEELSE OPIS(sr) TRACEELSE OPIS(lw) TRACEELSE OPIS(sw) TRACEELSE OPIS(lh) TRACEELSE OPIS(sh) TRACEELSE OPIS(lb) TRACEELSE OPIS(sb) ++serial; if (op != OPsys) moretodo = serial; } while (moretodo < TIMEOUT); __syncthreads(); /* Is everybody stuck at a syscall? */ if (op != OPsys) { /* Nope. */ alldata->flags.flag[0] = 1; } __syncthreads(); /* Save registers */ alldata->pe[IPROC].a = a; alldata->pe[IPROC].pc = pc - 1; for (a.i=0; a.ipe[IPROC].regs[a.i].i = REGI(a.i); } } data_t alldata; /* Status output for PE IPROC */ void status(register const int IPROC) { register int pc = alldata.pe[IPROC].pc; register int a = alldata.pe[IPROC].a.i; register int ir = textseg[pc]; register int op = OPCODE(ir); register int i; #undef CPOOL #define CPOOL(PC,IR) cpool[CHASH((PC), (IR))] #define IRIS(o) { if (op == (OP##o)) PR##o; } /* Dump registers */ printf("PE%d: ", IPROC); printf("pc=0x%x", pc); printf(" ir=%c", '"'); IRIS(sys) IRIS(add) IRIS(addf) IRIS(and) IRIS(div) IRIS(divu) IRIS(divf) IRIS(xor) IRIS(mul) IRIS(mulf) IRIS(or) IRIS(slt) IRIS(sltu) IRIS(sltf) IRIS(neg) IRIS(negf) IRIS(rem) IRIS(remu) IRIS(sra) IRIS(sll) IRIS(srl) IRIS(j) IRIS(jf) IRIS(jt) IRIS(i2f) IRIS(u2f) IRIS(f2i) IRIS(li) IRIS(lr) IRIS(sr) IRIS(lw) IRIS(sw) IRIS(lh) IRIS(sh) IRIS(lb) IRIS(sb) printf("%c a=0x%x", '"', a); for (i=0; i(); cudaArray* gputext2; CUDA_SAFE_CALL(cudaMallocArray( &gputext2, &channelDesc2, sizeof(cpool)/sizeof(int), 1)); CUDA_SAFE_CALL(cudaMemcpyToArray( gputext2, 0, 0, cpool, sizeof(cpool), cudaMemcpyHostToDevice)); cpooltex.filterMode = cudaFilterModePoint; cpooltex.normalized = false; // access with unnormalized texture coordinates cpooltex.addressMode[0] = cudaAddressModeClamp; // wrap texture coordinates CUDA_SAFE_CALL( cudaBindTextureToArray( cpooltex, gputext2, channelDesc2)); /* Allocate and copy textseg[] to GPU texture */ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); cudaArray* gputext; CUDA_SAFE_CALL(cudaMallocArray( &gputext, &channelDesc, sizeof(textseg)/sizeof(unsigned short), 1)); CUDA_SAFE_CALL(cudaMemcpyToArray( gputext, 0, 0, textseg, sizeof(textseg), cudaMemcpyHostToDevice)); codetex.filterMode = cudaFilterModePoint; codetex.normalized = false; // access with unnormalized texture coordinates codetex.addressMode[0] = cudaAddressModeClamp; // wrap texture coordinates CUDA_SAFE_CALL( cudaBindTextureToArray( codetex, gputext, channelDesc)); /* Time the main program execution */ cutStartTimer(timer); #ifdef TRACE printf("flag = %d\n", alldata.flags.flag[0]); for (i=0; i<<< dimGrid, dimBlock, sizeof(my_shared_t) >>>(gpudata); /* Copy stuff back to host... If we are doing a debug trace, copy entire data, else could copy only the flags */ CUDA_SAFE_CALL( cudaMemcpy(&(alldata), gpudata, sizeof(data_t), cudaMemcpyDeviceToHost) ); printf("flag = %d\n", alldata.flags.flag[0]); for (i=0; i<<< dimGrid, dimBlock, sizeof(my_shared_t) >>>(gpudata); /* Copy stuff back to host... If we are doing a debug trace, copy entire data, else could copy only the flags */ CUDA_SAFE_CALL( cudaMemcpy(&(alldata), gpudata, sizeof(flags_t), cudaMemcpyDeviceToHost) ); } while (alldata.flags.flag[0] != 0); #endif /* This is where we'd decode system calls... all just become exit() for now */ /* Stop the timer */ CUT_SAFE_CALL( cutStopTimer(timer)); /* Print the execution time of the emulate function (and it's loop/readback) */ time = cutGetTimerValue(timer); printf("\nTotal emulate time (without setup): %.3f s\n",time/1000.0f); CUDA_SAFE_CALL(cudaFree(gpudata)); CUT_SAFE_CALL( cutDeleteTimer(timer)); }