here is the code,I have a program ,when i run somewhere i will report system error。
I see the data size is 245930 ,and I use sort_by_key as below. but when i test thest in a simple example,when the program start i will report
"; Find next lower page and probe
cs20:
sub eax, PAGESIZE ; decrease by PAGESIZE
test dword ptr [eax],eax ; probe page.
jmp short cs10
"
here is the code .
#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <thrust/execution_policy.h>
#include <stdio.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <thrust/for_each.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#define RAND_MAX 0x7fff
cudaError_t addWithCuda(int *c, int *d, int *a, int *b, unsigned int size);
global void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x;
c[i] = a[i] + b[i];
}
global void dealKernel(int *d, const int *a, const int *b)
{
int i = threadIdx.x;
d[i] = b[i]-a[i];
}
//int randData(int low, int high)
//{
// int t = (int)rand() / (int)RAND_MAX;
// return ((int)1.0 - t) * low + t * high;
//}
int main()
{//245930
int s;
s =3;
const int arraySize = 12;
const long tmp = 245930;
//245930
//const int tmp = 245930;
int a[tmp] = { 0 };
int b[tmp] = { 0};
int c[tmp] = { 0 };
int d[tmp] = { 0 };
for(int i=0;i<tmp;i++)
{
a[i]= rand();;
}
for(int i=0;i<tmp;i++)
{
b[i]= i;
}
for(int i=0;i<tmp;i++)
{
c[i]= i;
}
for(int i=0;i<tmp;i++)
{
d[i]= i;
}
// Addectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, d,a, b, tmp);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, “addWithCuda failed!”);
return 1;
}
printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
c[0],c[1], c[2],c[3], c[4]);
printf("{a1,a2,a3,a4,a5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
a[0],a[1], a[2],a[3], a[4]);
printf("{d1,d2,d3,d4,d5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
d[0],d[1], d[2],d[3], d[4]);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
system("pause");
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, int *d, int *a, int *b, unsigned int size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
int *dev_d = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_d, size * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_d, d, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// Launch a kernel on the GPU with one thread for each element.
//addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
//dealKernel<<<1, size>>>(dev_d, dev_a, dev_b);
//thrust::sort_by_key(dev_a,dev_a+size,dev_c);
thrust::device_ptr< int> dev_data_ptr(dev_c);
thrust::device_ptr<int> dev_keys_ptr(dev_a);
////2 sort
//thrust::sort_by_key(dev_data_ptr, dev_data_ptr + size, dev_keys_ptr);
thrust::sort_by_key(dev_keys_ptr, dev_keys_ptr +size, dev_data_ptr);
//unsigned int * sc = thrust::raw_pointer_cast(dev_data_ptr);
//unsigned int * sa = thrust::raw_pointer_cast(dev_keys_ptr);
/*thrust::device_ptr<int> thrust_a_pointer(dev_a);
thrust::device_ptr<int> thrust_b_pointer(dev_c);
thrust::sort_by_key(thrust_b_pointer,thrust_b_pointer+size,thrust_a_pointer);
int * raw_ptr = thrust::raw_pointer_cast(thrust_b_pointer);*/
/*printf("ra3{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
raw_ptr[0], raw_ptr[1], raw_ptr[2], raw_ptr[3], raw_ptr[4]);*/
//thrust::is_sorted(dev_c,size);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(a, dev_a, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(d, dev_d, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_d);
return cudaStatus;
}