#c #ubuntu #build #opencl
Вопрос:
Приведённый ниже тестовый код отлично работает под Windows на NVIDIA GeForce GTX 960M, но в Ubuntu с видеокартой Quadro P4000 он завершается ошибкой сборки программы (CL_BUILD_PROGRAM_FAILURE). Я попытался получить подробную информацию об ошибке, но не получаю вообще никакого журнала сборки. Ошибка одна и та же независимо от того, использую ли я внешний файл ядра saxpy.cl или встроенный код.
Могу подтвердить, что под Windows журнал сборки выводится без проблем. В чём может быть причина проблемы в Ubuntu? Может быть, некорректная установка CUDA?
Утилита clinfo в Ubuntu действительно находит карту NVIDIA Quadro, как видно в конце этого вопроса.
Ошибка, которую я получаю, выглядит так:
[kernel loaded. Read 208 bytes]
[Number of platforms = 1]
[Device ID selected = 0]
[Using GPU device = Quadro P4000]
************* CL_BUILD_PROGRAM_FAILURE ********************
[Error Description]
*************************************************************
ERROR: clCreateKernel ( -45 )
ERROR: clSetKernelArg 0 ( -48 )
ERROR: clSetKernelArg 1 ( -48 )
ERROR: clSetKernelArg 2 ( -48 )
ERROR: clSetKernelArg 3 ( -48 )
ERROR: clEnqueueNDRangeKernel ( -48 )
0.000000 16.000000 = 0.000000
1.000000 15.000000 = 0.000000
2.000000 14.000000 = 0.000000
3.000000 13.000000 = 0.000000
4.000000 12.000000 = 0.000000
5.000000 11.000000 = 0.000000
6.000000 10.000000 = 0.000000
7.000000 9.000000 = 0.000000
8.000000 8.000000 = 0.000000
9.000000 7.000000 = 0.000000
10.000000 6.000000 = 0.000000
11.000000 5.000000 = 0.000000
12.000000 4.000000 = 0.000000
13.000000 3.000000 = 0.000000
14.000000 2.000000 = 0.000000
15.000000 1.000000 = 0.000000
Полный исходный код программы тестирования приведен ниже
#define CL_TARGET_OPENCL_VERSION 120
//#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define MAX_SOURCE_SIZE (0x100000)
#define _CRT_SECURE_NO_WARNINGS
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define VECTOR_SIZE 16
//OpenCL kernel which is run for every work item created.
// Embedded OpenCL C source of the kernel that is run for every work-item.
// Every source line ends with "\n": the kernel contains a "//" comment, and
// without the line breaks that comment would swallow the rest of the kernel.
// NOTE(review): alpha is declared but not applied to A — confirm whether
// "alpha * A[index] + B[index]" was intended.
const char *saxpy_kernel =
"__kernel\n"
"void saxpy_kernel(float alpha,\n"
"                  __global float *A,\n"
"                  __global float *B,\n"
"                  __global float *C)\n"
"{\n"
"    //Get the index of the work-item\n"
"    int index = get_global_id(0);\n"
"    C[index] = A[index] + B[index];\n"
"}\n";
// Reports a failed OpenCL call on stderr.
//
// err:  status code, compared against CL_SUCCESS.
// name: label printed together with the numeric status when err is a failure.
//
// Returns true when err == CL_SUCCESS, false otherwise.
//
// Declared static inline: a plain C99/C11 `inline` definition does not emit an
// external symbol, so a non-inlined call could fail to link.
static inline bool checkError(cl_int err, const char *name)
{
    if (err == CL_SUCCESS) {
        return true;
    }
    fprintf(stderr, "ERROR: %s ( %d )\n", name, (int)err);
    return false;
}
int main(void) {
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("saxpy.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
fprintf(stderr, "[kernel loaded. Read %d bytes]n", (unsigned int)source_size);
int i;
// Allocate space for vectors A, B and C
float alpha = 1.0;
float *A = (float*)malloc(sizeof(float)*VECTOR_SIZE);
float *B = (float*)malloc(sizeof(float)*VECTOR_SIZE);
float *C = (float*)malloc(sizeof(float)*VECTOR_SIZE);
for (i = 0; i < VECTOR_SIZE; i )
{
A[i] = i;
B[i] = VECTOR_SIZE - i;
C[i] = 0;
}
cl_int clStatus;
// Get platform and device information
//cl_platform_id * platforms = NULL;
//cl_uint num_platforms;
//Set up the Platform
//cl_int clStatus = clGetPlatformIDs(0, NULL, amp;num_platforms);
//platforms = (cl_platform_id *)
// malloc(sizeof(cl_platform_id)*num_platforms);
//clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
//checkError(clStatus, "clGetPlatformIDs");
////Get the devices list and choose the device you want to run on
//cl_device_id *device_list = NULL;
//cl_uint num_devices;
//clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, amp;num_devices);
//device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
//clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
//checkError(clStatus, "clGetDeviceIDs");
bool deviceFound = true;
cl_uint num_platforms; ///< Number of platforms found in host
cl_platform_id* platforms; ///< Information of platforms found
cl_device_id* device_list; ///< Information of Devices found
cl_uint num_devices; ///< Number of devices found that support OpenCL
size_t paramValueSize; ///< Info Device
char *info = NULL;
int deviceId = 0;
// Select an OpenCL platform to run on.
clStatus = clGetPlatformIDs(0, NULL, amp;num_platforms);
fprintf(stderr, "[Number of platforms = %d]n", (unsigned int)num_platforms);
deviceFound = checkError((clStatus != CL_SUCCESS) ? clStatus : (num_platforms <= 0 ? -1 : CL_SUCCESS), "No Platforms Found");
platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
checkError((clStatus != CL_SUCCESS) ? clStatus : (num_platforms <= 0 ? -1 : CL_SUCCESS), "No Platforms Found");
device_list = NULL;
if (deviceFound)
{
// Iterate through the list of platforms until we find one that supports
// a GPU device, otherwise fail with an error.
cl_uint i;
for (i = 0; i < num_platforms; i )
{
//Find only GPU Devices
clStatus = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, amp;num_devices);
if (clStatus != CL_SUCCESS amp;amp; clStatus != CL_DEVICE_NOT_FOUND)
{
deviceFound = checkError(clStatus, "No devices Found");
}
//Construct a vector with devices found
else if (num_devices > 0)
{
device_list = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
clStatus = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, num_devices, amp;device_list[0], NULL);
deviceFound = checkError(clStatus, "No devices found");
deviceId = i;
fprintf(stderr, "[Device ID selected = %d]n", deviceId);
break;
}
}
//if found device, display information
if (deviceFound)
{
if (deviceId < 0 || deviceId >(num_devices - 1))
{
deviceId = 0;
printf("ID device not found, use default GPU device n");
}
//Obtain device vendor name to display info
clStatus = clGetDeviceInfo(device_list[deviceId], CL_DEVICE_NAME, 0, NULL, amp;paramValueSize);
checkError(clStatus, "Failed to find OpenCL device info");
info = (char *)malloc(sizeof(char) * paramValueSize); // String to display vendor name info
clStatus = clGetDeviceInfo(device_list[deviceId], CL_DEVICE_NAME, paramValueSize, info, NULL);
checkError(clStatus, "Failed to find OpenCL device info");
}
}
if (deviceFound)
fprintf(stderr, "[Using GPU device = %s]nn", info);
free(info);
// Create one OpenCL context for each device in the platform
cl_context context;
context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, amp;clStatus);
checkError(clStatus, "clCreateContext");
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_list[deviceId], 0, amp;clStatus);
checkError(clStatus, "clCreateCommandQueue");
//cl_queue_properties qprop[] = { CL_QUEUE_PROPERTIES, 0 , 0 };
//cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_list[0], qprop, amp;clStatus);
// Create memory buffers on the device for each vector
cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, amp;clStatus);
checkError(clStatus, "clCreateBuffer A");
cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, amp;clStatus);
checkError(clStatus, "clCreateBuffer B");
cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, amp;clStatus);
checkError(clStatus, "clCreateBuffer C");
// Copy the Buffer A and B to the device
clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);
// Create a program from the kernel source
//cl_program program = clCreateProgramWithSource(context, 1, (const char **)amp;saxpy_kernel, NULL, amp;clStatus);
//checkError(clStatus, "clCreateProgramWithSource");
// Build the program
//clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);
//checkError(clStatus, "clBuildProgram");
// Create the OpenCL kernel
//cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", amp;clStatus);
//checkError(clStatus, "clCreateKernel");
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1, (const char **)amp;source_str, (const size_t *)amp;source_size, amp;clStatus);
checkError(clStatus, "clCreateProgramWithSource");
// Build the program
clStatus = clBuildProgram(program, num_devices, device_list, NULL, NULL, NULL);
// Query if compilation was successful
if (clStatus == CL_BUILD_PROGRAM_FAILURE) {
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, device_list[deviceId], CL_PROGRAM_BUILD_LOG, 0, NULL, amp;log_size);
// Allocate memory for the log
char *log = (char *)malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, device_list[deviceId], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
fprintf(stderr, "n************* CL_BUILD_PROGRAM_FAILURE ********************n");
fprintf(stderr, "[Error Description]nn%sn", log);
fprintf(stderr, "*************************************************************n");
free(log);
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", amp;clStatus);
checkError(clStatus, "clCreateKernel");
// Set the arguments of the kernel
clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)amp;alpha);
checkError(clStatus, "clSetKernelArg 0");
clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)amp;A_clmem);
checkError(clStatus, "clSetKernelArg 1");
clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)amp;B_clmem);
checkError(clStatus, "clSetKernelArg 2");
clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)amp;C_clmem);
checkError(clStatus, "clSetKernelArg 3");
// Execute the OpenCL kernel on the list
size_t global_size = VECTOR_SIZE; // Process the entire lists
size_t local_size = 16; // Process one item at a time
clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, amp;global_size, amp;local_size, 0, NULL, NULL);
checkError(clStatus, "clEnqueueNDRangeKernel");
// Read the cl memory C_clmem on device to the host variable C
clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);
checkError(clStatus, "clEnqueueReadBuffer C");
// Clean up and wait for all the comands to complete.
clStatus = clFlush(command_queue);
clStatus = clFinish(command_queue);
// Display the result to the screen
for (i = 0; i < VECTOR_SIZE; i )
printf("%f %f = %fn", A[i], B[i], C[i]);
// Finally release all OpenCL allocated objects and host buffers.
clStatus = clReleaseKernel(kernel);
clStatus = clReleaseProgram(program);
clStatus = clReleaseMemObject(A_clmem);
clStatus = clReleaseMemObject(B_clmem);
clStatus = clReleaseMemObject(C_clmem);
clStatus = clReleaseCommandQueue(command_queue);
clStatus = clReleaseContext(context);
free(A);
free(B);
free(C);
free(platforms);
free(device_list);
return 0;
}
Файл saxpy.cl:
// Element-wise kernel: each work-item writes one element of C.
// NOTE(review): alpha is accepted but not applied to A — confirm whether
// "alpha * A[index] + B[index]" was intended.
__kernel void saxpy_kernel(float alpha, __global float *A, __global float *B, __global float *C)
{
    // The global id of the work-item selects the element it processes.
    int index = get_global_id(0);
    C[index] = A[index] + B[index];
}
Вы также можете ознакомиться с отчетом clinfo ниже
Number of platforms 1
Platform Name NVIDIA CUDA
Platform Vendor NVIDIA Corporation
Platform Version OpenCL 1.2 CUDA 10.1.120
Platform Profile FULL_PROFILE
Platform Extensions cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts cl_khr_gl_event cl_nv_create_buffer
Platform Extensions function suffix NV
Platform Name NVIDIA CUDA
Number of devices 1
Device Name Quadro P4000
Device Vendor NVIDIA Corporation
Device Vendor ID 0x10de
Device Version OpenCL 1.2 CUDA
Driver Version 430.64
Device OpenCL C Version OpenCL C 1.2
Device Type GPU
Device Topology (NV) <printDeviceInfo:22: get CL_DEVICE_PCI_DOMAIN_ID_NV : error -30>
Device Profile FULL_PROFILE
Device Available Yes
Compiler Available Yes
Linker Available Yes
Max compute units 14
Max clock frequency 1480MHz
Compute Capability (NV) 6.1
Device Partition (core)
Max number of sub-devices 1
Supported partition types None
Supported affinity domains (n/a)
Max work item dimensions 3
Max work item sizes 1024x1024x64
Max work group size
1024
Комментарии:
1. <printDeviceInfo:22: get CL_DEVICE_PCI_DOMAIN_ID_NV : error -30> намекает на то, что что-то не так с установкой драйвера Nvidia.
2. @ProjectPhysX Я связался с автором, и он ответил, что это ошибка clinfo
3. @ProjectPhysX clinfo также сообщает: === CL_PROGRAM_BUILD_LOG === Preferred work group size multiple (kernel) <getWGsizes:1504: create kernel : error -45>