OpenCL编程之二-CFANZ编程社区

白嫖来的C端代码:

matrix.c：

#include <stdio.h>
#include <stdlib.h>
#include <alloca.h>
#include <CL/cl.h>
#pragma warning( disable : 4996 )
int main() {
  cl_int error;
  cl_platform_id platforms;

  cl_device_id devices;

  cl_context context;

  FILE *program_handle;
  size_t program_size;
  char *program_buffer;
  cl_program program;

  size_t log_size;
  char *program_log;

  char kernel_name[] = "createBuffer";
  cl_kernel kernel;

  cl_command_queue queue;
  //获取平台
  error = clGetPlatformIDs(1, &platforms, NULL);
  if (error != 0) {
    printf("Get platform failed!");
    return -1;
  }
  //获取设备
  error = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &devices, NULL);
  if (error != 0) {
    printf("Get device failed!");
    return -1;
  }
  //创建上下文
  context = clCreateContext(NULL,1,&devices,NULL,NULL,&error);
  if (error != 0) {
    printf("Creat context failed!");
    return -1;
  }
  //创建程序；注意要用"rb"
  program_handle = fopen("kernel.cl","rb");
  if (program_handle == NULL) {
    printf("The kernle can not be opened!");
    return -1;
  }
  fseek(program_handle,0,SEEK_END);
  program_size = ftell(program_handle);
  rewind(program_handle);

  program_buffer = (char *)malloc(program_size+1);
  program_buffer[program_size] = '\0';
  error=fread(program_buffer,sizeof(char),program_size,program_handle);
  if (error == 0) {
    printf("Read kernel failed!");
    return -1;
  }
  fclose(program_handle);
  program = clCreateProgramWithSource(context,1,(const char **)&program_buffer, 
                                       &program_size,&error);
  if (error < 0) {
    printf("Couldn't create the program!");
    return -1;
  }
  //编译程序
  error = clBuildProgram(program,1,&devices,NULL,NULL,NULL);
  if (error < 0) {
    //确定日志文件的大小
    clGetProgramBuildInfo(program,devices,CL_PROGRAM_BUILD_LOG,0,NULL,&log_size);
    program_log = (char *)malloc(log_size+1);
    program_log[log_size] = '\0';
    //读取日志
    clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, 
                           log_size+1, program_log, NULL);
    printf("%s\n",program_log);
    free(program_log);
    return -1;
  }
  free(program_buffer);
  //创建命令队列
  queue = clCreateCommandQueue(context, devices, CL_QUEUE_PROFILING_ENABLE, &error);
  if (error < 0) {
    printf("Coudn't create the command queue");
    return -1;
  }
  //创建内核
  kernel = clCreateKernel(program,kernel_name,&error);
  if (kernel==NULL) {
    printf("Couldn't create kernel!\n");
    return -1;
  }
  //初始化参数
  float result[100];
  float a_in[100];
  float b_in[100];
  for (int i = 0; i < 100; i++) {
    a_in[i] = i;
    b_in[i] = i*2.0;
  }
  //创建缓存对象
  cl_mem memObject1 = clCreateBuffer(context,CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,sizeof(float)*100,a_in,&error);
  if (error < 0) {
    printf("Creat memObject1 failed!\n");
    return -1;
  }
  cl_mem memObject2 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, 
                                      sizeof(float) * 100, b_in, &error);
  if (error < 0) {
    printf("Creat memObject2 failed!\n");
    return -1;
  }
  cl_mem memObject3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY , 
                                         sizeof(float) * 100, NULL, &error);
  if (error < 0) {
    printf("Creat memObject3 failed!\n");
    return -1;
  }
  //设置内核参数
  error = clSetKernelArg(kernel,0,sizeof(cl_mem),&memObject1);
  error|= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObject2);
  error |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObject3);
  if (error != CL_SUCCESS) {
    printf("Error setting kernel arguments!\n");
    return -1;
  }
  //执行内核
  size_t globalWorkSize[1] = {100};
  size_t localWorkSize[1] = {1};
  error = clEnqueueNDRangeKernel(queue,kernel,1,NULL,globalWorkSize, 
                                  localWorkSize,0,NULL,NULL);
  if (error != CL_SUCCESS) {
    printf("Error queuing kernel for execution!\n");
    return -1;
  }
  //读取执行结果
  error = clEnqueueReadBuffer(queue,memObject3,CL_TRUE,0,100*sizeof(float), 
                               result,0,NULL,NULL);
  if (error != CL_SUCCESS) {
    printf("Error reading result buffer!\n");
    return -1;
  }
  //显示结果
  for (int i = 0; i < 100; i++) {
    printf("%f ",result[i]);
  }
  printf("\n");
  //释放资源
  clReleaseDevice(devices);
  clReleaseContext(context);
  clReleaseCommandQueue(queue);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseMemObject(memObject1);
  clReleaseMemObject(memObject2);
  clReleaseMemObject(memObject3);
  return 0;
}

OpenCL代码:

kernel.cl:

__kernel void createBuffer(__global const float *a_in,
  __global const float *b_in,
  __global float *result) {
int gid = get_global_id(0);
result[gid] = a_in[gid] + b_in[gid];
}

执行逻辑是，用opencl开发的kernel.cl程序，在HOST端的C程序中被动态加载，运行时编译，投递到GPU中运行，跑出结果后在从GPU MEM中读回来打印。

验证：

OpenCL编程之二_CUDA

结合a_in,b_in初始化值和kernel.cl的逻辑，可以知道正确的结果应该是首项为0，公差为3的等差数列，我们编译运行，看一下结果是否符合我们预期：

编译命令：

gcc -I/usr/local/cuda-11.5/targets/x86_64-linux/include matrix.c -o main -L/usr/local/cuda-11.5/targets/x86_64-linux/lib/ -lOpenCL

OpenCL编程之二_蓝桥杯_02

符合预期，说明程序是对的。

和CUDA的关系：

架构上，它们在同一层面，cuda和OpenCL都属于一种并行计算开发语言，这从CUDA使用的编译器和OpenCL虽然使用GCC编译HOST侧代码，但是端册代码却需编译一个文本CL程序文件，交给OpenCL API执行在线编译看出来，他们虽然都有吸取C的语法特点，但是异构加速核心这一块和CPU端的编译器是不共用的，关于CUDA开发的例子，可以参考如下博客。

OpenCL编程之二

和CUDA的关系：

结束！