Hi, I want to implement an algorithm that sums two vectors and scales depending on their dimension. This is what I have done so far:
#include <stdio.h>
#define VSIZE 64
__global__ void sum(char *v1, char *v2, int length, int *res) {
//Total number of threas within a block
int b_threads=blockDim.x*blockDim.y*blockDim.z;
//Total number of threads in the grid
int g_threads=gridDim.x*gridDim.y*gridDim.z*b_threads;
//Size of the vector partition on which every thread will work on
int pt_size=length/g_threads;
//Size of the vector partition which will be loaded by a block
int pb_size=pt_size*b_threads;
//Block index
int blockId=blockIdx.x+blockIdx.y*blockDim.x;
//Thread index (this is addressing a thread within the grid)
int tId=threadIdx.x+threadIdx.y*blockDim.x+ blockId*b_threads;
int bound=tId*pt_size+pt_size;
//printf("b_threads:%d, g_threads:%d, pt_size:%d, pb_size:%d, blockId:%d, tId:%d, bound:%d\n",
// b_threads, g_threads, pt_size, pb_size, blockId, tId, bound);
int i;
for (i=tId*pt_size; i<bound; i++) {
res[i]=v1[i]+v2[i];
}
}
int main() {
char v1[VSIZE], v2[VSIZE], *vc1, *vc2;
int vres[VSIZE], *vcres;
int size1= sizeof(char)*VSIZE;
int size2=sizeof(int)*VSIZE;
int i;
for (i=0; i<VSIZE; i++) {
v1[i]=1;
v2[i]=2;
}
//initialize and copy mem to device
cudaMalloc((void**)&vc1, size1);
cudaMemcpy(vc1, v1, size1, cudaMemcpyHostToDevice);
cudaMalloc((void**)&vc2, size1);
cudaMemcpy(vc2, v2, size1, cudaMemcpyHostToDevice);
cudaMalloc((void**)&vcres, size2);
//set dimensions of grid and block
dim3 dimBlock(2, 2);
dim3 dimGrid(2, 2);
sum<<<dimGrid, dimBlock>>>(vc1, vc2, VSIZE, vcres);
cudaMemcpy(vres, vcres, size2, cudaMemcpyDeviceToHost);
cudaFree(vc1);
cudaFree(vc2);
cudaFree(vcres);
int k;
for (k=0; k<VSIZE; k++) printf("posizione %d: %d\n", k, vres[k]);
return 0;
}
The code is working on a “short” input (i.e vector size 32 or 64) but when it grows I get meaningless data!
The kernel assumes that the total number of threads is always smaller or equal than the size of the vectors.
How can I use shared memory in order to have faster memory access within a block?
Any help is appreciated!