c++ - CUDA : program which doesn't work with every size -
i'm working on 3d laplacian. code successful size n=32 n=64 or n=128 i've incorrect results:
#include <iostream> #include <sys/time.h> #include <cuda.h> #include <ctime> #include"res3dcb.cuh" #include <math.h> using namespace std; // let's start main program. int main(void) { // choice of n. int n; cout<<"choose matrix dimension (32, 64 or 128)"<<endl; cin>>n; int size=(n+2)*(n+2)*(n+2)*sizeof(float); // variable statement. struct timeval t1, t2; float *x_d, *y_d; float *x,*y; float gflops; float numops; //init x , y. x = new float[size]; y = new float[size]; (int i=1;i<n+1;i++) (int j=1;j<n+1;j++) (int k=1;k<n+1;k++) { x[i*(n+2)*(n+2)+j*(n+2)+k]=1; } // shadow cases. (int i=1;i<n+1;i++) { (int j=1;j<n+1;j++) { x[i*(n+2)*(n+2)+j*(n+2)]=x[i*(n+2)*(n+2)+j*(n+2)+1]; x[i*(n+2)*(n+2)+j*(n+2)+n+1]=x[i*(n+2)*(n+2)+j*(n+2)+n]; } (int k=0;k<n+2;k++) { x[i*(n+2)*(n+2)+k]=x[i*(n+2)*(n+2)+(n+2)+k]; x[i*(n+2)*(n+2)+(n+1)*(n+2)+k]=x[i*(n+2)*(n+2)+n*(n+2)+k];} } (int j=0;j<n+2;j++) (int k=0;k<n+2;k++) { x[(n+2)*j+k]=x[(n+2)*(n+2)+(n+2)*j+k]; x[(n+1)*(n+2)*(n+2)+(n+2)*j+k]=x[(n+2)*(n+2)*n+(n+2)*j+k]; } // display of initial matrix. int id_stage=-2; while (id_stage!=-1) { cout<<"which initial matrix's stage want display? (-1 if don't want diplay one)"<<endl; cin>>id_stage; cout<<endl; if (id_stage != -1) { cout<<"etage "<<id_stage<<" du cube :"<<endl; (int j=0;j<n+2;j++) { cout<<"| "; (int k=0;k<n+2;k++) {cout<<x[id_stage*(n+2)*(n+2)+j*(n+2)+k]<<" ";} cout<<"|"<<endl; } cout<<endl; } } // cpu gpu. cudamalloc( (void**) & x_d, size); cudamalloc( (void**) & y_d, size); cudamemcpy(x_d, x, size, cudamemcpyhosttodevice) ; cudamemcpy(y_d, y, size, cudamemcpyhosttodevice) ; // solver parameters. dim3 dimgrid(n/32, n/8, n/8); dim3 dimblock(16, 8, 8); // solver loop. gettimeofday(&t1, 0); res3d<<<dimgrid, dimblock>>>(x_d, y_d, n); cudadevicesynchronize(); gettimeofday(&t2, 0); double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0; // power calculation. numops=(1.0e-9)*n*n*n*7; gflops = ( numops / (time)); // gpu cpu. cudamemcpy(y, y_d, size, cudamemcpydevicetohost); cudafree(x_d); cudafree(y_d); // display of final matrix. id_stage=-2; while (id_stage!=-1) { cout<<"which output's stage want display? (-1 if don't want diplay one)"<<endl; cin>>id_stage; cout<<endl; if (id_stage != -1) { cout<<"etage "<<id_stage<<" du cube :"<<endl; (int j=0;j<n+2;j++) { cout<<"| "; (int k=0;k<n+2;k++) {cout<<y[id_stage*(n+2)*(n+2)+j*(n+2)+k]<<" ";} cout<<"|"<<endl; } cout<<endl; } } cout<<"time : "<<time<<endl; cout<<"gflops/s : "<<gflops<<endl; }
where :
#ifndef res2d_mat_gpu_hpp #define res2d_gpu_hpp #include <iostream> #include <sys/time.h> #include <cuda.h> __global__ void res3d(volatile float* x, float* y, int n) { // variable statement. __shared__ float sdata[18][10][10]; __shared__ float idata[18][10][10]; int tid = threadidx.x+1; int tjd = threadidx.y+1; int tkd = threadidx.z+1; int = threadidx.x + blockidx.x*(blockdim.x)+1; int j = threadidx.y + blockidx.y*(blockdim.y)+1; int k = threadidx.z + blockidx.z*(blockdim.z)+1; // overloading of shared variable's outlines. float data=0,data1=0; if (threadidx.x==0) { data += x[(n+2)*(n+2)*(i-1)+(n+2)*j+k]; data1 += x[(n+2)*(n+2)*(i-1)+(n+2)*j+k+n*(n+2)*(n+2)/2]; } if (threadidx.x==15) { data += x[(n+2)*(n+2)*(i+1)+(n+2)*j+k]; data1 += x[(n+2)*(n+2)*(i+1)+(n+2)*j+k+n*(n+2)*(n+2)/2]; } if (threadidx.y==0) { data += x[(n+2)*(n+2)*i+(n+2)*(j-1)+k]; data1 += x[(n+2)*(n+2)*i+(n+2)*(j-1)+k+n*(n+2)*(n+2)/2]; } if (threadidx.y==7) { data += x[(n+2)*(n+2)*i+(n+2)*(j+1)+k]; data1 += x[(n+2)*(n+2)*i+(n+2)*(j+1)+k+n*(n+2)*(n+2)/2]; } if (threadidx.z==0) { data += x[(n+2)*(n+2)*i+(n+2)*j+k-1]; data1 += x[(n+2)*(n+2)*i+(n+2)*j+k-1+n*(n+2)*(n+2)/2]; } if (threadidx.z==7) { data += x[(n+2)*(n+2)*i+(n+2)*j+k+1]; data1 += x[(n+2)*(n+2)*i+(n+2)*j+k+1+n*(n+2)*(n+2)/2]; } // init shared variable. sdata[tid][tjd][tkd] = x[(n+2)*(n+2)*i+(n+2)*j+k]; idata[tid][tjd][tkd]=x[(n+2)*(n+2)*i+(n+2)*j+k+n*(n+2)*(n+2)/2]; __syncthreads(); // (small) tiling. y[(n+2)*(n+2)*i+(n+2)*j+k] = sdata[tid][tjd+1][tkd] + sdata[tid][tjd-1][tkd] + sdata[tid][tjd][tkd+1] + sdata[tid][tjd][tkd-1] + sdata[tid+1][tjd][tkd] + sdata[tid-1][tjd][tkd] - 6*sdata[tid][tjd][tkd]+data; y[(n+2)*(n+2)*i+(n+2)*j+k+n*(n+2)*(n+2)/2] = idata[tid][tjd+1][tkd] + idata[tid][tjd-1][tkd] + idata[tid][tjd][tkd+1] + idata[tid][tjd][tkd-1] + idata[tid+1][tjd][tkd] + idata[tid-1][tjd][tkd] - 6*idata[tid][tjd][tkd]+data1; } #endif
questions :
is code erroneous? or problem gpu's architecure if results false n=64 , n=128?
does "if" way overloading shared variable's outlines?
thanks in advance help.
you have mistake here:
dim3 dimgrid(n/32, n/8, n/8); dim3 dimblock(16, 8, 8);
this should be:
dim3 dimgrid(n/16, n/8, n/8); dim3 dimblock(16, 8, 8);
also, noted in comments, over-allocating memory here:
x = new float[size]; y = new float[size];
since size
has been calculated in bytes, not elements.
Comments
Post a Comment