Stack overflow when allocating large arrays

Here is the situation: I have a program that reports a system error at a certain point while running. The data size is 245930, and I use sort_by_key as shown below. When I tested this in a simple standalone example, the program reported the following right at startup:

"; Find next lower page and probe
cs20:
sub eax, PAGESIZE ; decrease by PAGESIZE
test dword ptr [eax],eax ; probe page.
jmp short cs10
"

Here is the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sort.h>
// RAND_MAX is already defined by <stdlib.h>; redefining it unconditionally
// triggers a macro-redefinition warning (or error).  Define it only as a
// fallback for toolchains that somehow lack it.
#ifndef RAND_MAX
#define RAND_MAX 0x7fff
#endif

// Copies a, b and d to the device, sorts on the device with thrust
// (keys = a, values = c), and copies the results back into a, c and d.
// Returns cudaSuccess or the first CUDA error encountered.
cudaError_t addWithCuda(int *c, int *d, int *a, int *b, unsigned int size);

// Element-wise vector add: c[i] = a[i] + b[i].
// Launched with a single block; one thread per element, so the launch must
// use <<<1, n>>> with n <= the device's max threads per block (typically 1024).
// NOTE: "global" was a markdown-corrupted "__global__" — restored here so the
// kernel actually compiles as device code.
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
// Element-wise vector difference: d[i] = b[i] - a[i].
// Same single-block launch assumptions as addKernel (one thread per element).
// NOTE: "global" was a markdown-corrupted "__global__" — restored here so the
// kernel actually compiles as device code.
__global__ void dealKernel(int *d, const int *a, const int *b)
{
int i = threadIdx.x;
d[i] = b[i]-a[i];
}
//int randData(int low, int high)
//{
// int t = (int)rand() / (int)RAND_MAX;
// return ((int)1.0 - t) * low + t * high;
//}
// Fills four 245930-element arrays, hands them to addWithCuda (which sorts
// keys a / values c on the GPU with thrust::sort_by_key) and prints the
// first few results.
//
// BUG FIX: the original declared the four arrays as locals —
//     int a[245930], b[245930], c[245930], d[245930];
// i.e. ~3.7 MB of automatic storage.  That overflows the default ~1 MB
// Windows thread stack, which is exactly what the quoted __chkstk
// stack-probe loop ("sub eax, PAGESIZE / test dword ptr [eax],eax")
// reports at program start.  Heap allocation fixes it.
int main()
{
const long tmp = 245930;  // number of elements per array

// Allocate on the heap, value-initialized to 0, instead of the stack.
int *a = new int[tmp]();
int *b = new int[tmp]();
int *c = new int[tmp]();
int *d = new int[tmp]();

// Keys are random; the three value arrays start as 0..tmp-1.
for (long i = 0; i < tmp; i++)
{
a[i] = rand();
}
for (long i = 0; i < tmp; i++)
{
b[i] = (int)i;
}
for (long i = 0; i < tmp; i++)
{
c[i] = (int)i;
}
for (long i = 0; i < tmp; i++)
{
d[i] = (int)i;
}

// Sort (keys a, values c) on the device.
cudaError_t cudaStatus = addWithCuda(c, d, a, b, tmp);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addWithCuda failed!");
delete[] a; delete[] b; delete[] c; delete[] d;
return 1;
}

printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
    c[0],c[1], c[2],c[3], c[4]);
 printf("{a1,a2,a3,a4,a5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
    a[0],a[1], a[2],a[3], a[4]);
 printf("{d1,d2,d3,d4,d5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
    d[0],d[1], d[2],d[3], d[4]);

delete[] a;
delete[] b;
delete[] c;
delete[] d;

// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceReset failed!");
    return 1;
}
 system("pause");
return 0;

}

// Helper function for using CUDA to add vectors in parallel.
// Copies a, b and d to the device, sorts (keys = a, values = c) with
// thrust::sort_by_key, then copies a, c and d back to the host.
// All four device buffers are freed through the shared Error: label
// regardless of success or failure, and the first CUDA error encountered
// is returned.  NOTE(review): dev_b and dev_d are allocated and copied but
// never consumed — the kernels that used them are commented out below.
cudaError_t addWithCuda(int *c, int *d, int *a, int *b, unsigned int size)
{
// Device-side mirrors of the four host arrays; initialized to 0 so that
// cudaFree at the Error: label is safe even if an early cudaMalloc fails.
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
int *dev_d = 0;
cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
    goto Error;
}

// Allocate GPU buffers for three vectors (two input, one output)    .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}
   cudaStatus = cudaMalloc((void**)&dev_d, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed!");
    goto Error;
}

// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
  cudaStatus = cudaMemcpy(dev_d, d, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

// Launch a kernel on the GPU with one thread for each element.
//addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
//dealKernel<<<1, size>>>(dev_d, dev_a, dev_b);

// NOTE(review): sorting raw device pointers directly would dispatch to the
// host path and crash; they must be wrapped in thrust::device_ptr first.
//thrust::sort_by_key(dev_a,dev_a+size,dev_c);

// Wrap the raw device pointers so thrust dispatches to the device backend.
thrust::device_ptr< int> dev_data_ptr(dev_c);
thrust::device_ptr<int> dev_keys_ptr(dev_a);


////2 sort
//thrust::sort_by_key(dev_data_ptr, dev_data_ptr + size, dev_keys_ptr);


// In-place sort on the device: keys = dev_a, values = dev_c reordered along
// with them.  Runs synchronously with respect to the default stream.
thrust::sort_by_key(dev_keys_ptr, dev_keys_ptr +size, dev_data_ptr);
//unsigned  int * sc = thrust::raw_pointer_cast(dev_data_ptr);
//unsigned  int * sa = thrust::raw_pointer_cast(dev_keys_ptr);

/*thrust::device_ptr<int> thrust_a_pointer(dev_a);

thrust::device_ptr<int> thrust_b_pointer(dev_c);


thrust::sort_by_key(thrust_b_pointer,thrust_b_pointer+size,thrust_a_pointer);
 int * raw_ptr = thrust::raw_pointer_cast(thrust_b_pointer);*/


/*printf("ra3{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
    raw_ptr[0], raw_ptr[1], raw_ptr[2], raw_ptr[3], raw_ptr[4]);*/
//thrust::is_sorted(dev_c,size);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    goto Error;
}

// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
    goto Error;
}

// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
 cudaStatus = cudaMemcpy(a, dev_a, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}
 cudaStatus = cudaMemcpy(d, dev_d, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "cudaMemcpy failed!");
    goto Error;
}

// Shared cleanup path for both success and failure.  cudaFree(0) is a no-op,
// so pointers that were never allocated are safe to pass here.
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_d);

return cudaStatus;

}

Stack overflow errors like the one described here occur in host code, i.e. the code running on the CPU. They have nothing to do with CUDA, really (this would happen just the same in a pure C or C++ program). A stack overflow easily happens when large data objects are placed on the stack, as happens here:

const long tmp = 245930;
//245930
//const int tmp = 245930;
int a[tmp] = { 0 };
int b[tmp] = { 0};
int c[tmp] = { 0 };
int d[tmp] = { 0 };

Don’t do that. Use dynamic allocations instead.