The basic steps of a CUDA program:
1. Write the kernel function
2. Allocate memory for the data used by the host and the device
3. Copy the input data from host memory to device memory
4. Launch the kernel to perform the computation
5. Copy the results from device memory back to host memory
6. Free the memory allocated on the host and the device
Here is the simplest possible example:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Kernel definition: each thread computes one element of C
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    // Global thread index across the whole grid
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements)
    {
        C[i] = A[i] + B[i];
    }
}
int main(void)
{
    int numElements = 50000;
    size_t size = numElements * sizeof(float);

    // Allocate host memory
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);

    // Initialize the host input vectors
    for (int i = 0; i < numElements; i++)
    {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((void**)&d_A, size);
    cudaMalloc((void**)&d_B, size);
    cudaMalloc((void**)&d_C, size);

    // Copy the host input data to the device
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    // Launch the kernel: enough blocks to cover all elements
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

    // Copy the result back to host memory
    // (this cudaMemcpy waits for the kernel to finish before copying)
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
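The example can be saved as a .cu file (the name vector_add.cu below is just an example) and compiled with nvcc, e.g. nvcc vector_add.cu -o vector_add. For brevity, the listing above does not check any return values. In practice every CUDA runtime call and every kernel launch should be checked. The sketch below shows one common way to do this; the CUDA_CHECK macro name is my own choice rather than part of the CUDA API, but it is built only from standard runtime calls (cudaError_t, cudaGetErrorString, cudaGetLastError, cudaDeviceSynchronize).

// Minimal error-checking sketch (CUDA_CHECK is a hypothetical helper macro)
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage, e.g. around the allocation and copy calls from the example:
//   CUDA_CHECK(cudaMalloc((void**)&d_A, size));
//   CUDA_CHECK(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
// A kernel launch does not return an error code directly; query it afterwards:
//   vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
//   CUDA_CHECK(cudaGetLastError());        // launch/configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize());   // errors during kernel execution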