Function qualifiers: the host is the system CPU and the device is the GPU. `__global__`: called from the host, runs on the device (marks kernels). `__device__`: called from the device, runs on the device (kernel helper functions). `__host__` (or no qualifier): a normal host function.

  1. Sample program 1:

#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__ << ": " \
                      << cudaGetErrorString(err_) << std::endl;               \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

// Kernel: increments the single int that `a` points to in device memory.
// Expected launch: 1 block, 1 thread (only a[0] is touched).
__global__ void AddIntsCUDA(int* a) { a[0] += 1; }

int main() {
    int a = 5;
    int* d_a = nullptr;  // device pointer — must not be dereferenced on the host

    // Allocate one int on the device and copy the host value into it.
    CUDA_CHECK(cudaMalloc(&d_a, sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_a, &a, sizeof(int), cudaMemcpyHostToDevice));

    // Launch 1 block of 1 thread; check for launch-configuration errors,
    // which are NOT returned by the <<<>>> expression itself.
    AddIntsCUDA<<<1, 1>>>(d_a);
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy back to the host; this also synchronizes with the kernel.
    CUDA_CHECK(cudaMemcpy(&a, d_a, sizeof(int), cudaMemcpyDeviceToHost));

    std::cout << "The answer is " << a << std::endl;
    CUDA_CHECK(cudaFree(d_a));
    return 0;
}

-cudaMalloc allocates memory in GPU memory and puts its address into our pointer; this is a device pointer, not an ordinary host pointer — it must never be dereferenced on the host. -cudaMemcpy copies values between system memory and GPU memory (needed because GPU code cannot see system memory directly). -cudaFree releases the device memory. -The chevron syntax gives the number of blocks and the number of threads per block: <<<50,256>>> means 50 blocks with 256 threads each (what exactly a thread block is comes later), i.e. 50*256 = 12,800 threads — a CPU can't match that degree of parallelism. -This level of abstraction lets the same code scale automatically to stronger GPUs.

  1. Sample program 2:

dim3 is a type that holds 3D coordinates: dim3 name(x,y,z). Each kernel has access to the following built-in variables. Index terms: -blockIdx: block index within the grid -threadIdx: thread index within the block. Dimension terms: -blockDim: block dimensions in threads -gridDim: grid dimensions in blocks. These can be read inside the kernel function; the dimension terms are of type dim3, while the index terms are of type uint3. There are many threads (grouped into blocks, which form a grid), and using these values we can give each thread a unique id: id = blockIdx.x * blockDim.x + threadIdx.x

#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>

const int MAX = 1000;

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__ << ": " \
                      << cudaGetErrorString(err_) << std::endl;               \
            std::abort();                                                     \
        }                                                                     \
    } while (0)

// Kernel: element-wise cuda_arr1[id] += cuda_arr2[id] for id in [0, MAX).
// Both pointers must address at least MAX ints in device memory.
// The bounds check is required because the grid is rounded up to whole
// blocks, so some trailing threads have id >= MAX.
__global__ void AddInts(int* cuda_arr1, int* cuda_arr2) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < MAX) {
        cuda_arr1[id] += cuda_arr2[id];
    }
}

int main() {
    int arr1[MAX];
    int arr2[MAX];

    // Fill both host arrays with small pseudo-random values.
    for (int i = 0; i < MAX; ++i) {
        arr1[i] = std::rand() % 100;
        arr2[i] = std::rand() % 100;
    }

    std::cout << "Before: " << std::endl;

    for (int i = 0; i < MAX; ++i) {
        std::cout << arr1[i] << " ";
    }
    std::cout << std::endl;

    for (int i = 0; i < MAX; ++i) {
        std::cout << arr2[i] << " ";
    }
    std::cout << std::endl;

    int* cuda_arr1 = nullptr;
    int* cuda_arr2 = nullptr;

    // NOTE: the original wrapped these calls in assert(...). Side effects
    // inside assert vanish under NDEBUG, leaving the device pointers
    // unallocated in release builds — always check via a macro instead.
    CUDA_CHECK(cudaMalloc(&cuda_arr1, sizeof(arr1)));
    CUDA_CHECK(cudaMalloc(&cuda_arr2, sizeof(arr2)));

    CUDA_CHECK(cudaMemcpy(cuda_arr1, arr1, sizeof(arr1), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(cuda_arr2, arr2, sizeof(arr2), cudaMemcpyHostToDevice));

    // Ceil-divide so every element gets a thread without launching a whole
    // spare block when MAX is an exact multiple of the block size
    // (MAX/256 + 1 would do exactly that).
    const int threads = 256;
    const int blocks = (MAX + threads - 1) / threads;
    AddInts<<<blocks, threads>>>(cuda_arr1, cuda_arr2);
    CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors

    // Blocking copy; synchronizes with the kernel before we read results.
    CUDA_CHECK(cudaMemcpy(arr1, cuda_arr1, sizeof(arr1), cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(cuda_arr1));
    CUDA_CHECK(cudaFree(cuda_arr2));

    for (int i = 0; i < MAX; ++i) {
        std::cout << arr1[i] << " ";
    }
    std::cout << std::endl;

    return 0;
}

-What the kernel launch does is run every thread in the grid through the kernel, so inside the kernel we filter threads by their unique id (the `if (id < MAX)` guard) to discard the extras.