Global variables in CUDA

Campy JITs a C#/.NET lambda expression into NVIDIA PTX, which is then linked with the GPU base class library runtime, loaded into a cuModule, and executed. Within a single module, C++ global variables are fine: they are accessible and stable between calls to different kernels via Cuda.cuLaunchKernel(). Unfortunately, global variables are not shared between cuModules. As noted in the CUDA C Programming Guide, “The names for all symbols, including functions, global variables, and texture or surface references, are maintained at module scope so that modules written by independent third parties may interoperate in the same CUDA context.” Further, “Any user defined __device__, __constant__ or __managed__ device global variables present in the module that owns the CUfunction being launched are independently instantiated on every device. The user is responsible for initializing such device global variables appropriately.”
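
A small driver-API sketch makes the per-module instantiation concrete. Assuming a PTX file kernel.ptx that defines a __device__ global named g_counter (both names are invented for illustration, and error checking is omitted), loading the same PTX into two modules yields two independent copies of the global at two different device addresses:

#include <cuda.h>
#include <stdio.h>

int main()
{
    CUdevice dev;
    CUcontext ctx;
    CUmodule modA, modB;
    CUdeviceptr gA, gB;
    size_t bytes;

    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);

    // The same PTX, loaded into two separate modules.
    cuModuleLoad(&modA, "kernel.ptx");
    cuModuleLoad(&modB, "kernel.ptx");

    // Each module owns its own instance of the __device__ global "g_counter".
    cuModuleGetGlobal(&gA, &bytes, modA, "g_counter");
    cuModuleGetGlobal(&gB, &bytes, modB, "g_counter");

    // The addresses differ; a write through modA is never seen by modB.
    printf("g_counter in modA: 0x%llx, in modB: 0x%llx\n",
        (unsigned long long)gA, (unsigned long long)gB);

    cuModuleUnload(modA);
    cuModuleUnload(modB);
    cuCtxDestroy(ctx);
    return 0;
}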

At the moment, each call to Parallel.For() results in a separate JIT compilation and a separate CUDA module, and the runtime is re-initialized with every Parallel.For(). Unfortunately, initialization of the base class layer is quite time-consuming, especially on the GPU, so it should be done only once, either on the CPU or the GPU. Therefore, the C++ global variables need to be eliminated. But the GPU base class layer library contains dozens of global variables and hundreds of references to them. Another mass edit of the DotNetAnywhere code, and another bump on the road to full C# support on the GPU.
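
One way to do that rewrite, sketched below under assumed names (RuntimeState, heap_base, and so on are not Campy’s actual identifiers), is to gather the module-scope globals into a state struct that the host allocates and initializes once, outside of any particular cuModule, and that every kernel then receives as an argument:

#include <stddef.h>

// All former __device__ globals move into one struct that lives in device
// memory allocated once by the host.
struct RuntimeState {
    void  *heap_base;    // base of the runtime-managed heap
    size_t heap_size;    // heap capacity in bytes
    size_t heap_offset;  // next free byte in the heap
};

// Before: per-module state, re-instantiated and re-initialized per cuModule.
//   __device__ void  *g_heap_base;
//   __device__ size_t g_heap_offset;

// After: every kernel takes the single shared state explicitly.
__global__ void some_bcl_kernel(RuntimeState *rt /*, user arguments */)
{
    // ... allocate from rt->heap_base + rt->heap_offset instead of globals ...
}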

CUDA and kernel mallocs

If you have ever tried to call malloc() in a CUDA kernel, you might be surprised to find that it seems to “work.” You get a buffer allocated, and you can assign values to it. However, if you thought that the buffer could then be copied to the host via cuMemcpyDtoH, or accessed directly on the CPU, guess again: cuMemcpyDtoH returns cudaErrorInvalidValue (0x0000000b), and you get a segfault if you dereference the pointer directly from the CPU. While some folks have come to the same conclusion [1, 2], you may find others saying that you can, without backing up the claim with evidence [3]. My own test, with the code below, demonstrates that one cannot access kernel malloc() buffers on the CPU.

Since malloc() buffers can’t be accessed on the host, Campy implements its own memory allocator: a heap is allocated from the CPU, and GPU code then allocates objects out of that heap. The allocator is quite primitive. At some point, I hope to write a better one, such as the allocator described by Huang [4].
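
The sketch below illustrates the general approach, not Campy’s actual allocator: the host reserves one large buffer with cudaMalloc(), and device code hands out pieces of it with an atomic bump pointer (alignment and freeing are ignored here). Because the backing buffer came from cudaMalloc(), the host can read the allocations back with an ordinary cudaMemcpy(), which is exactly what in-kernel malloc() does not allow.

#include "cuda_runtime.h"

struct Heap {
    char *base;                    // device buffer obtained with cudaMalloc()
    unsigned long long size;       // capacity in bytes
    unsigned long long offset;     // next free byte, bumped atomically
};

// Carve `bytes` out of the heap; returns NULL when the heap is exhausted.
__device__ void *heap_alloc(Heap *h, size_t bytes)
{
    unsigned long long old = atomicAdd(&h->offset, (unsigned long long)bytes);
    if (old + bytes > h->size)
        return NULL;
    return h->base + old;
}

// Each thread "allocates" an int and records where it put it. Unlike the
// in-kernel malloc() version below, these pointers refer to cudaMalloc()
// memory, so the host can copy the values back.
__global__ void fillKernel(Heap *h, int **out)
{
    int i = threadIdx.x;
    int *p = (int *)heap_alloc(h, sizeof(int));
    if (p != NULL)
        *p = i;
    out[i] = p;
}

The Heap struct itself also has to live in device-visible memory, for example another small cudaMalloc() buffer that the host fills in before the launch.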

[1] https://stackoverflow.com/questions/42000339/use-data-allocated-dynamically-in-cuda-kernel-on-host

[2] https://stackoverflow.com/questions/13480213/how-to-dynamically-allocate-arrays-inside-a-kernel

[3] http://heather.cs.ucdavis.edu/~matloff/158/PLN/CUDA.tex

[4] Huang, X. Xmalloc: A scalable lock-free dynamic memory allocator for many-core machines. M.S. thesis, University of Illinois at Urbana-Champaign, 2010. https://www.ideals.illinois.edu/bitstream/handle/2142/16137/Huang_Xiaohuang.pdf?sequence=1&isAllowed=y

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

__global__ void addKernel(int *c, const int *a, const int *b, void **d)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
    // Allocate an int per thread with in-kernel malloc() and record its
    // address so the host can later try to read it back.
    int *ddd = (int *)malloc(sizeof(int));
    *ddd = i;
    d[i] = ddd;
}

int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Add vectors in parallel.
    cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }

    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    void **dev_d = 0;
    void **d = 0;
    int jjj = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Allocate a GPU buffer to hold the per-thread pointers produced by
    // in-kernel malloc().
    cudaStatus = cudaMalloc((void**)&dev_d, size * sizeof(void*));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b, dev_d);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Copy the array of device pointers (filled in by the kernel's malloc()
    // calls) back to the host.
    d = (void**)malloc(sizeof(void*) * size);
    cudaStatus = cudaMemcpy(d, dev_d, size * sizeof(void*), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Try to copy data out of a buffer allocated with in-kernel malloc().
    // This is the step that fails with cudaErrorInvalidValue.
    cudaStatus = cudaMemcpy(&jjj, d[0], sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy from kernel malloc() buffer failed: %s\n",
            cudaGetErrorString(cudaStatus));
        goto Error;
    }

    printf("value read back from kernel malloc() buffer: %d\n", jjj);

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_d);
    free(d);

    return cudaStatus;
}