Hello,
I am quite new to CUDA and I am stuck with an error that appears when I run this code with the variable `quantities` being a larger array, or with higher grid or block dimensions. The code behaves as expected with smaller arrays.
I am running this test on Windows 10 with a GTX 980. The size of the array that triggers the error may depend on your GPU; it seems I am exceeding a limit somewhere. The error looks like:
CudaAPIError Traceback (most recent call last)
<ipython-input-9-29f1940fb1d0> in <module>()
33
34 #Get the output of the calculations from the GPU to the host
---> 35 combs = out_device.copy_to_host()
36
37 print(combs)
G:\Applications\Anaconda\lib\site-packages\numba\cuda\cudadrv\devices.py in _require_cuda_context(*args, **kws)
210 def _require_cuda_context(*args, **kws):
211 get_context()
--> 212 return fn(*args, **kws)
213
214 return _require_cuda_context
G:\Applications\Anaconda\lib\site-packages\numba\cuda\cudadrv\devicearray.py in copy_to_host(self, ary, stream)
250 assert self.alloc_size >= 0, "Negative memory size"
251 if self.alloc_size != 0:
--> 252 _driver.device_to_host(hostary, self, self.alloc_size, stream=stream)
253
254 if ary is None:
G:\Applications\Anaconda\lib\site-packages\numba\cuda\cudadrv\driver.py in device_to_host(dst, src, size, stream)
1774 fn = driver.cuMemcpyDtoH
1775
-> 1776 fn(host_pointer(dst), device_pointer(src), size, *varargs)
1777
1778
G:\Applications\Anaconda\lib\site-packages\numba\cuda\cudadrv\driver.py in safe_cuda_api_call(*args)
286 _logger.debug('call driver api: %s', libfn.__name__)
287 retcode = libfn(*args)
--> 288 self._check_error(fname, retcode)
289 return safe_cuda_api_call
290
G:\Applications\Anaconda\lib\site-packages\numba\cuda\cudadrv\driver.py in _check_error(self, fname, retcode)
321 _logger.critical(msg, _getpid(), self.pid)
322 raise CudaDriverError("CUDA initialized before forking")
--> 323 raise CudaAPIError(retcode, msg)
324
325 def get_device(self, devnum=0):
CudaAPIError: [700] Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR
The goal of the function is to parallelize the generation of combinations (not combinations in the mathematical n-choose-k sense, but every possible tuple of values) and to perform a computation on each of them. In the real use case the quantities array is significantly larger.
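To make the mapping concrete, here is a small CPU-side sketch of the index-to-combination conversion that the kernel parallelizes (the helper name index_to_combination is only for illustration, it does not appear in the code below):

def index_to_combination(index, quantities, pow_add):
    # Value L of the combination counts from 0 to quantities[L];
    # pow_add[L] is its place value in the mixed-radix encoding
    return [index // pow_add[L] % (quantities[L] + 1)
            for L in range(len(quantities))]

# With quantities = [3, 4, 5, 6] and pow_add = [1, 4, 20, 120]:
# index 0 -> [0, 0, 0, 0], index 5 -> [1, 1, 0, 0], index 839 -> [3, 4, 5, 6]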
import numpy as np
import math
from numba import njit, jit, cuda, vectorize, guvectorize
import numba
### FUNCTIONS TO BE MERGED

def power_added(quantities):
    # pow_add[L] is the place value of position L in the mixed-radix encoding,
    # i.e. the product of (quantities[i] + 1) for all i < L
    pow_add = [1]
    result = 1
    cpy = quantities.copy()
    while cpy:
        result *= cpy.pop(0) + 1
        pow_add.append(result)
    pow_add.pop(-1)
    return pow_add

def power(quantities):
    # Total number of combinations: product of (q + 1) over all quantities
    result = 1
    cpy = quantities.copy()
    while cpy:
        result *= cpy.pop(0) + 1
    return result
# Definition of the variables to be processed
# quantities = [3,4,5,6,7]  # triggers the error
quantities = [3, 4, 5, 6]
pow_add = power_added(quantities)
pow = power(quantities)
length_q = len(quantities)

# Transformation into tuples to pass to the kernel
q_tup = tuple(quantities)
pa_tup = tuple(pow_add)

# Number of combinations per thread (the total number of combinations must be a multiple of this):
cpt = 20

# Copy the data to the device once to avoid useless transfers
q_device = cuda.to_device(quantities)
pa_device = cuda.to_device(pow_add)
out_device = cuda.device_array(shape=(pow//cpt, cpt, length_q), dtype=np.int32)
# print(out_device.copy_to_host())
@cuda.jit
def kernel(an_array, q, pa):
    # Thread id in a 1D block
    tx = cuda.threadIdx.x
    # Block id in a 1D grid
    ty = cuda.blockIdx.x
    # Block width, i.e. number of threads per block
    bw = cuda.blockDim.x
    # Compute flattened index inside the array
    pos = tx + ty * bw
    # The above is equivalent to pos = cuda.grid(1)
    if pos < an_array.size:  # Check array boundaries
        # an_array[pos] = compute(x, y)
        compute(pos, q, pa, an_array[pos])

@cuda.jit(device=True)
def compute(index, q, pa, comb):
    # Get the size of 'quantities' to know the number of values per combination
    length_q = len(q)
    # Each thread computes cpt consecutive combinations
    for i in range(cpt):
        # Compute each value of the combination from the flattened index
        for L in range(length_q):
            comb[i][L] = (cpt*index + i) // pa[L] % (q[L] + 1)
    # To improve performance:
    # compute the yield of each combination in the thread and only return the best
    # one and its yield (to compare with other threads): local optimum
# Caution when choosing the dimensions: use the device info and the size of the output
n_blocks = 30              # must be <= MAX_GRID_DIM_X
n_threads_per_block = 128  # must be <= MAX_THREADS_PER_BLOCK (for multiple dims, the product must be <=)

# Call the kernel on the data already stored on the device
kernel[n_blocks, n_threads_per_block](out_device, q_device, pa_device)

# Copy the output of the calculations from the GPU back to the host
combs = out_device.copy_to_host()
print(combs)
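My understanding is that kernel launches are asynchronous, so the failure probably happens inside the kernel itself and is only reported once copy_to_host synchronizes with it. The sketch below is not part of my code above, just a debugging idea (assuming cuda.synchronize() reports pending errors) to surface the error right after the launch instead:

# Debugging sketch: force a synchronization right after the launch so that a
# failure inside the kernel is reported here rather than at copy_to_host
kernel[n_blocks, n_threads_per_block](out_device, q_device, pa_device)
cuda.synchronize()                  # a failed kernel should raise CudaAPIError here
combs = out_device.copy_to_host()   # only reached if the kernel itself succeeded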
and my GPU reports:
Global memory occupancy:80.765915% free
===Attributes for device 0
ASYNC_ENGINE_COUNT:2
CAN_MAP_HOST_MEMORY:1
CLOCK_RATE:1291000
COMPUTE_CAPABILITY_MAJOR:5
COMPUTE_CAPABILITY_MINOR:2
COMPUTE_MODE:DEFAULT
CONCURRENT_KERNELS:1
ECC_ENABLED:0
GLOBAL_L1_CACHE_SUPPORTED:1
GLOBAL_MEMORY_BUS_WIDTH:256
GPU_OVERLAP:1
INTEGRATED:0
KERNEL_EXEC_TIMEOUT:1
L2_CACHE_SIZE:2097152
LOCAL_L1_CACHE_SUPPORTED:1
MANAGED_MEMORY:1
MAXIMUM_SURFACE1D_LAYERED_LAYERS:2048
MAXIMUM_SURFACE1D_LAYERED_WIDTH:16384
MAXIMUM_SURFACE1D_WIDTH:16384
MAXIMUM_SURFACE2D_HEIGHT:65536
MAXIMUM_SURFACE2D_LAYERED_HEIGHT:16384
MAXIMUM_SURFACE2D_LAYERED_LAYERS:2048
MAXIMUM_SURFACE2D_LAYERED_WIDTH:16384
MAXIMUM_SURFACE2D_WIDTH:65536
MAXIMUM_SURFACE3D_DEPTH:4096
MAXIMUM_SURFACE3D_HEIGHT:4096
MAXIMUM_SURFACE3D_WIDTH:4096
MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:2046
MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:16384
MAXIMUM_SURFACECUBEMAP_WIDTH:16384
MAXIMUM_TEXTURE1D_LAYERED_LAYERS:2048
MAXIMUM_TEXTURE1D_LAYERED_WIDTH:16384
MAXIMUM_TEXTURE1D_LINEAR_WIDTH:134217728
MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH:16384
MAXIMUM_TEXTURE1D_WIDTH:65536
MAXIMUM_TEXTURE2D_ARRAY_HEIGHT:16384
MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES:2048
MAXIMUM_TEXTURE2D_ARRAY_WIDTH:16384
MAXIMUM_TEXTURE2D_GATHER_HEIGHT:16384
MAXIMUM_TEXTURE2D_GATHER_WIDTH:16384
MAXIMUM_TEXTURE2D_HEIGHT:65536
MAXIMUM_TEXTURE2D_LINEAR_HEIGHT:65536
MAXIMUM_TEXTURE2D_LINEAR_PITCH:1048544
MAXIMUM_TEXTURE2D_LINEAR_WIDTH:65536
MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT:16384
MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH:16384
MAXIMUM_TEXTURE2D_WIDTH:65536
MAXIMUM_TEXTURE3D_DEPTH:4096
MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:16384
MAXIMUM_TEXTURE3D_HEIGHT:4096
MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:2048
MAXIMUM_TEXTURE3D_WIDTH:4096
MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:2048
MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:2046
MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:16384
MAXIMUM_TEXTURECUBEMAP_WIDTH:16384
MAX_BLOCK_DIM_X:1024
MAX_BLOCK_DIM_Y:1024
MAX_BLOCK_DIM_Z:64
MAX_GRID_DIM_X:2147483647
MAX_GRID_DIM_Y:65535
MAX_GRID_DIM_Z:65535
MAX_PITCH:2147483647
MAX_REGISTERS_PER_BLOCK:65536
MAX_REGISTERS_PER_MULTIPROCESSOR:65536
MAX_SHARED_MEMORY_PER_BLOCK:49152
MAX_SHARED_MEMORY_PER_MULTIPROCESSOR:98304
MAX_THREADS_PER_BLOCK:1024
MAX_THREADS_PER_MULTIPROCESSOR:2048
MEMORY_CLOCK_RATE:3505000
MULTIPROCESSOR_COUNT:16
MULTI_GPU_BOARD:0
MULTI_GPU_BOARD_GROUP_ID:0
PCI_BUS_ID:1
PCI_DEVICE_ID:0
PCI_DOMAIN_ID:0
STREAM_PRIORITIES_SUPPORTED:1
SURFACE_ALIGNMENT:512
TCC_DRIVER:0
TEXTURE_ALIGNMENT:512
TEXTURE_PITCH_ALIGNMENT:32
TOTAL_CONSTANT_MEMORY:65536
UNIFIED_ADDRESSING:1
WARP_SIZE:32
Some posts indicated this error could come from the Windows WDDM TDR (timeout detection and recovery), so I disabled it, but nothing changed.
How do I determine the maximum grid/block dimensions? I assumed that MAX_THREADS_PER_BLOCK:1024 and MAX_GRID_DIM_X:2147483647 would be my limits, and that MULTIPROCESSOR_COUNT:16 indicates the number of blocks that can run at the same time.
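For reference, this is how I read those attributes from Numba (a small sketch; I believe the device object exposes them as uppercase attributes, matching the dump above):

from numba import cuda

dev = cuda.get_current_device()
print(dev.MAX_THREADS_PER_BLOCK)   # 1024 on this GTX 980: limit per block
print(dev.MAX_GRID_DIM_X)          # 2147483647: limit on the number of blocks
print(dev.MULTIPROCESSOR_COUNT)    # 16: what I assumed limits concurrently running blocks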
What causes this error and how can I prevent it? It could be a memory issue, but that would be surprising, as the shape of the output array required in the example is only (336, 20, 5).
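For reference, the back-of-the-envelope arithmetic behind that remark (just recomputing the sizes of the failing example above):

# Output array size for the failing case quantities = [3, 4, 5, 6, 7]
quantities = [3, 4, 5, 6, 7]
pow = 1
for q in quantities:
    pow *= q + 1                                 # 4*5*6*7*8 = 6720 combinations
cpt = 20
shape = (pow // cpt, cpt, len(quantities))       # (336, 20, 5)
n_bytes = shape[0] * shape[1] * shape[2] * 4     # 33600 int32 values, ~131 KB
# which seems tiny compared to the 4 GB of memory on a GTX 980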
Thank you for your help!