// Get block/grid sizes; the number of threads per block is limited
// to 128 because the kernel defined above uses too many
// registers to be invoked with more threads per block:
dev = misc.get_current_device()
max_threads_per_block = 128
block_dim, grid_dim = \
misc.select_block_grid_sizes(dev, z_gpu.shape, max_threads_per_block)
// Set this to False when debugging to make sure the compiled kernel is
// not cached:
cache_dir=None
expi_mod = \
SourceModule(expi_template.substitute(use_double=use_double),
cache_dir=cache_dir,
options=["-I", install_headers])
expi_func = expi_mod.get_function("expi_array")
e_gpu = gpuarray.empty_like(z_gpu)
expi_func(z_gpu, e_gpu,
np.uint32(z_gpu.size),
block=block_dim,
grid=grid_dim)
return e_gpu