when large number stack overflow

393774381 · October 19, 2017, 6:44am

here is the code，I have a program ,when i run somewhere i will report system error。
I see the data size is 245930 ，and I use sort_by_key as below. but when i test thest in a simple example,when the program start i will report

"; Find next lower page and probe
cs20:
sub eax, PAGESIZE ; decrease by PAGESIZE
test dword ptr [eax],eax ; probe page.
jmp short cs10
"

here is the code .

#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <thrust/execution_policy.h>
#include <stdio.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <thrust/for_each.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#define RAND_MAX 0x7fff
cudaError_t addWithCuda(int *c, int *d, int *a, int *b, unsigned int size);

global void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
global void dealKernel(int *d, const int *a, const int *b)
{
int i = threadIdx.x;
d[i] = b[i]-a[i];
}
//int randData(int low, int high)
//{
// int t = (int)rand() / (int)RAND_MAX;
// return ((int)1.0 - t) * low + t * high;
//}
int main()
{//245930
int s;
s =3;
const int arraySize = 12;
const long tmp = 245930;
//245930
//const int tmp = 245930;
int a[tmp] = { 0 };
int b[tmp] = { 0};
int c[tmp] = { 0 };
int d[tmp] = { 0 };
for(int i=0;i<tmp;i++)
{
a[i]= rand();;
}
for(int i=0;i<tmp;i++)
{
b[i]= i;
}
for(int i=0;i<tmp;i++)
{
c[i]= i;
}
for(int i=0;i<tmp;i++)
{
d[i]= i;
}
// Addectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, d,a, b, tmp);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, “addWithCuda failed!”);
return 1;
}

printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
    c[0],c[1], c[2],c[3], c[4]);
 printf("{a1,a2,a3,a4,a5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
    a[0],a[1], a[2],a[3], a[4]);
 printf("{d1,d2,d3,d4,d5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
    d[0],d[1], d[2],d[3], d[4]);



// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceReset failed!");
    return 1;
}
 system("pause");
return 0;

}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, int *d, int *a, int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
int *dev_d = 0;
cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}

// Allocate GPU buffers for three vectors (two input, one output)    .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
   cudaStatus = cudaMalloc((void**)&dev_d, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
  cudaStatus = cudaMemcpy(dev_d, d, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

// Launch a kernel on the GPU with one thread for each element.
//addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
//dealKernel<<<1, size>>>(dev_d, dev_a, dev_b);

//thrust::sort_by_key(dev_a,dev_a+size,dev_c);

thrust::device_ptr< int> dev_data_ptr(dev_c);
thrust::device_ptr<int> dev_keys_ptr(dev_a);


////2 sort
//thrust::sort_by_key(dev_data_ptr, dev_data_ptr + size, dev_keys_ptr);


thrust::sort_by_key(dev_keys_ptr, dev_keys_ptr +size, dev_data_ptr);
//unsigned  int * sc = thrust::raw_pointer_cast(dev_data_ptr);
//unsigned  int * sa = thrust::raw_pointer_cast(dev_keys_ptr);

/*thrust::device_ptr<int> thrust_a_pointer(dev_a);

thrust::device_ptr<int> thrust_b_pointer(dev_c);


thrust::sort_by_key(thrust_b_pointer,thrust_b_pointer+size,thrust_a_pointer);
 int * raw_ptr = thrust::raw_pointer_cast(thrust_b_pointer);*/


/*printf("ra3{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
    raw_ptr[0], raw_ptr[1], raw_ptr[2], raw_ptr[3], raw_ptr[4]);*/
//thrust::is_sorted(dev_c,size);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}

// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
}

// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
 cudaStatus = cudaMemcpy(a, dev_a, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
 cudaStatus = cudaMemcpy(d, dev_d, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_d);

return cudaStatus;

}

njuffa · October 19, 2017, 1:13pm

Stack overflows errors as described here occur in host code, i.e. the code running on the CPU. And it has nothing to do with CUDA, really (meaning this would happen as well if this were a pure C or C++ program). It can easily happen when large data objects are placed one the stack, as happens here:

const long tmp = 245930;
//245930
//const int tmp = 245930;
int a[tmp] = { 0 };
int b[tmp] = { 0};
int c[tmp] = { 0 };
int d[tmp] = { 0 };

Don’t do that. Use dynamic allocations instead.

Topic		Replies	Views
This code doesn't work maybe too much threads assigned? CUDA Programming and Performance	8	1089	February 2, 2014
Kernel launches blocking when 1024 kernels in a queue CUDA Programming and Performance	3	2658	July 8, 2014
a problem complex array add with cuda ????? CUDA Programming and Performance	2	986	August 17, 2017
Problem with CUDA code,overflow? CUDA Programming and Performance	9	1196	September 10, 2020
Inconsistent Cuda Errors CUDA Programming and Performance	10	1072	July 7, 2017
Cuda application crashes works fine for small data and crashes for big data CUDA Programming and Performance	3	414	October 12, 2021
Error 719 (failure to launch) for JCUDA and PyCUDA; How to run GPU consecutive times for 'large' data blocks CUDA Programming and Performance	0	2334	December 13, 2016
Can a Kernel be too big?? CUDA_ERROR_NO_BINARY_FOR_GPU error 209 CUDA Programming and Performance	11	3053	November 13, 2017
cudaSynchronizeDevice() returns error code 6 CUDA Programming and Performance	7	8607	June 16, 2011
A problem when a new hand in cuda programing CUDA Programming and Performance	6	697	July 30, 2015

when large number stack overflow

Related topics