c++ - CUDA: program which doesn't work with every size


I'm working on a 3D Laplacian. The code works for size n=32, but for n=64 or n=128 I get incorrect results:

#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include "res3dcb.cuh"
#include <math.h>
using namespace std;

// Let's start the main program.
int main(void) {

// Choice of n.
int n;
cout<<"Choose the matrix dimension (32, 64 or 128)"<<endl;
cin>>n;
int size=(n+2)*(n+2)*(n+2)*sizeof(float);

// Variable declarations.
struct timeval t1, t2;
float *x_d, *y_d;
float *x,*y;
float gflops;
float numops;

// Init x and y.
x = new float[size];
y = new float[size];

for (int i=1;i<n+1;i++)
    for (int j=1;j<n+1;j++)
        for (int k=1;k<n+1;k++) {
            x[i*(n+2)*(n+2)+j*(n+2)+k]=1;
        }

// Shadow (ghost) cells.
for (int i=1;i<n+1;i++) {
    for (int j=1;j<n+1;j++) {
        x[i*(n+2)*(n+2)+j*(n+2)]=x[i*(n+2)*(n+2)+j*(n+2)+1];
        x[i*(n+2)*(n+2)+j*(n+2)+n+1]=x[i*(n+2)*(n+2)+j*(n+2)+n];
    }

    for (int k=0;k<n+2;k++) {
        x[i*(n+2)*(n+2)+k]=x[i*(n+2)*(n+2)+(n+2)+k];
        x[i*(n+2)*(n+2)+(n+1)*(n+2)+k]=x[i*(n+2)*(n+2)+n*(n+2)+k];
    }
}

for (int j=0;j<n+2;j++)
    for (int k=0;k<n+2;k++) {
        x[(n+2)*j+k]=x[(n+2)*(n+2)+(n+2)*j+k];
        x[(n+1)*(n+2)*(n+2)+(n+2)*j+k]=x[(n+2)*(n+2)*n+(n+2)*j+k];
    }

// Display of the initial matrix.
int id_stage=-2;
while (id_stage!=-1) {
    cout<<"Which stage of the initial matrix do you want to display? (-1 to display none)"<<endl;
    cin>>id_stage;
    cout<<endl;

    if (id_stage != -1) {
        cout<<"Stage "<<id_stage<<" of the cube:"<<endl;
        for (int j=0;j<n+2;j++) {
            cout<<"| ";
            for (int k=0;k<n+2;k++) {cout<<x[id_stage*(n+2)*(n+2)+j*(n+2)+k]<<" ";}
            cout<<"|"<<endl;
        }
        cout<<endl;
    }
}

// CPU to GPU.
cudaMalloc( (void**) & x_d, size);
cudaMalloc( (void**) & y_d, size);

cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice);
cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice);

// Solver parameters.
dim3 dimGrid(n/32, n/8, n/8);
dim3 dimBlock(16, 8, 8);

// Solver loop.
gettimeofday(&t1, 0);
res3d<<<dimGrid, dimBlock>>>(x_d, y_d, n);
cudaDeviceSynchronize();
gettimeofday(&t2, 0);
double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0;

// Performance (GFLOPS) calculation.
numops=(1.0e-9)*n*n*n*7;
gflops = ( numops / (time));

// GPU to CPU.
cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
cudaFree(x_d);
cudaFree(y_d);

// Display of the final matrix.
id_stage=-2;
while (id_stage!=-1) {
    cout<<"Which stage of the output do you want to display? (-1 to display none)"<<endl;
    cin>>id_stage;
    cout<<endl;

    if (id_stage != -1) {
        cout<<"Stage "<<id_stage<<" of the cube:"<<endl;
        for (int j=0;j<n+2;j++) {
            cout<<"| ";
            for (int k=0;k<n+2;k++) {cout<<y[id_stage*(n+2)*(n+2)+j*(n+2)+k]<<" ";}
            cout<<"|"<<endl;
        }
        cout<<endl;
    }
}

cout<<"Time: "<<time<<endl;
cout<<"GFLOPS/s: "<<gflops<<endl;
}

where res3dcb.cuh is:

#ifndef res2d_mat_gpu_hpp
#define res2d_mat_gpu_hpp
#include <iostream>
#include <sys/time.h>
#include <cuda.h>

__global__ void res3d(volatile float* x, float* y, int n)
{
// Variable declarations.
__shared__ float sdata[18][10][10];
__shared__ float idata[18][10][10];

int tid = threadIdx.x+1;
int tjd = threadIdx.y+1;
int tkd = threadIdx.z+1;
int i = threadIdx.x + blockIdx.x*(blockDim.x)+1;
int j = threadIdx.y + blockIdx.y*(blockDim.y)+1;
int k = threadIdx.z + blockIdx.z*(blockDim.z)+1;

// Loading of the shared variable's outline (halo) cells.
float data=0,data1=0;

if (threadIdx.x==0) {
    data += x[(n+2)*(n+2)*(i-1)+(n+2)*j+k];
    data1 += x[(n+2)*(n+2)*(i-1)+(n+2)*j+k+n*(n+2)*(n+2)/2];
}
if (threadIdx.x==15) {
    data += x[(n+2)*(n+2)*(i+1)+(n+2)*j+k];
    data1 += x[(n+2)*(n+2)*(i+1)+(n+2)*j+k+n*(n+2)*(n+2)/2];
}
if (threadIdx.y==0) {
    data += x[(n+2)*(n+2)*i+(n+2)*(j-1)+k];
    data1 += x[(n+2)*(n+2)*i+(n+2)*(j-1)+k+n*(n+2)*(n+2)/2];
}
if (threadIdx.y==7) {
    data += x[(n+2)*(n+2)*i+(n+2)*(j+1)+k];
    data1 += x[(n+2)*(n+2)*i+(n+2)*(j+1)+k+n*(n+2)*(n+2)/2];
}
if (threadIdx.z==0) {
    data += x[(n+2)*(n+2)*i+(n+2)*j+k-1];
    data1 += x[(n+2)*(n+2)*i+(n+2)*j+k-1+n*(n+2)*(n+2)/2];
}
if (threadIdx.z==7) {
    data += x[(n+2)*(n+2)*i+(n+2)*j+k+1];
    data1 += x[(n+2)*(n+2)*i+(n+2)*j+k+1+n*(n+2)*(n+2)/2];
}

// Init shared variables.
sdata[tid][tjd][tkd] = x[(n+2)*(n+2)*i+(n+2)*j+k];
idata[tid][tjd][tkd] = x[(n+2)*(n+2)*i+(n+2)*j+k+n*(n+2)*(n+2)/2];

__syncthreads();

// (Small) tiling: 7-point Laplacian stencil.
y[(n+2)*(n+2)*i+(n+2)*j+k] = sdata[tid][tjd+1][tkd]
                + sdata[tid][tjd-1][tkd]
                + sdata[tid][tjd][tkd+1]
                + sdata[tid][tjd][tkd-1]
                + sdata[tid+1][tjd][tkd]
                + sdata[tid-1][tjd][tkd]
                - 6*sdata[tid][tjd][tkd]+data;

y[(n+2)*(n+2)*i+(n+2)*j+k+n*(n+2)*(n+2)/2] = idata[tid][tjd+1][tkd]
                + idata[tid][tjd-1][tkd]
                + idata[tid][tjd][tkd+1]
                + idata[tid][tjd][tkd-1]
                + idata[tid+1][tjd][tkd]
                + idata[tid-1][tjd][tkd]
                - 6*idata[tid][tjd][tkd]+data1;
}
#endif

Questions:

  1. Is the code erroneous, or is it a problem with the GPU's architecture, given that the results are wrong for n=64 and n=128?

  2. does "if" way overloading shared variable's outlines?

Thanks in advance for your help.

You have a mistake here:

dim3 dimGrid(n/32, n/8, n/8);
dim3 dimBlock(16, 8, 8);

This should be:

dim3 dimGrid(n/16, n/8, n/8);
dim3 dimBlock(16, 8, 8);
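
More generally, here is a sketch (not part of the original code) of computing the grid from the block size with ceiling division, so that n points per direction are always covered; this assumes the kernel computes exactly one output point per thread, which is not quite the case for the posted kernel, since it also writes a second point at an offset of n*(n+2)*(n+2)/2:

dim3 dimBlock(16, 8, 8);
dim3 dimGrid((n + dimBlock.x - 1) / dimBlock.x,   // ceiling division in x
             (n + dimBlock.y - 1) / dimBlock.y,   // ceiling division in y
             (n + dimBlock.z - 1) / dimBlock.z);  // ceiling division in z
res3d<<<dimGrid, dimBlock>>>(x_d, y_d, n);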

Also, as noted in the comments, you are over-allocating memory here:

x = new float[size];
y = new float[size];

since size has been calculated in bytes, not elements.
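
A sketch of how the allocations could look instead, keeping size as a byte count for the CUDA calls while allocating the host arrays by element count:

int elems = (n + 2) * (n + 2) * (n + 2);    // number of float elements
int size  = elems * sizeof(float);          // number of bytes, for the CUDA calls

float *x = new float[elems];                // host arrays: allocate elements, not bytes
float *y = new float[elems];

float *x_d, *y_d;
cudaMalloc((void**)&x_d, size);             // cudaMalloc expects a byte count, so size is correct here
cudaMalloc((void**)&y_d, size);
cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice);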

