clGetProgramBuildInfo не возвращает данные журнала в ubuntu (Quadro P4000)

#c #ubuntu #build #opencl

Вопрос:

Приведённый ниже тестовый код отлично работает под Windows с NVIDIA GeForce GTX 960M, но в Ubuntu с видеокартой Quadro P4000 он сообщает об ошибке сборки программы (CL_BUILD_PROGRAM_FAILURE). Я попытался получить подробную информацию об ошибке, но журнал сборки оказывается пустым. Ошибка одна и та же независимо от того, использую ли я внешний файл ядра saxpy.cl или код ядра, встроенный в программу.

Я могу подтвердить, что под Windows журналы работают просто отлично. Итак, в чем может быть корень проблемы для среды Ubuntu? Может быть, плохая установка Cuda?

clinfo в Ubuntu действительно находит карту NVIDIA Quadro, как видно из его вывода в конце этого вопроса.

Ошибка, которую я получаю, такова

 [kernel loaded. Read 208 bytes]
[Number of platforms = 1]
[Device ID selected = 0]
[Using GPU device = Quadro P4000]


************* CL_BUILD_PROGRAM_FAILURE ********************
[Error Description]


*************************************************************
ERROR: clCreateKernel ( -45 )
ERROR: clSetKernelArg 0 ( -48 )
ERROR: clSetKernelArg 1 ( -48 )
ERROR: clSetKernelArg 2 ( -48 )
ERROR: clSetKernelArg 3 ( -48 )
ERROR: clEnqueueNDRangeKernel ( -48 )
0.000000 + 16.000000 = 0.000000
1.000000 + 15.000000 = 0.000000
2.000000 + 14.000000 = 0.000000
3.000000 + 13.000000 = 0.000000
4.000000 + 12.000000 = 0.000000
5.000000 + 11.000000 = 0.000000
6.000000 + 10.000000 = 0.000000
7.000000 + 9.000000 = 0.000000
8.000000 + 8.000000 = 0.000000
9.000000 + 7.000000 = 0.000000
10.000000 + 6.000000 = 0.000000
11.000000 + 5.000000 = 0.000000
12.000000 + 4.000000 = 0.000000
13.000000 + 3.000000 = 0.000000
14.000000 + 2.000000 = 0.000000
15.000000 + 1.000000 = 0.000000
 

Полный исходный код программы тестирования приведен ниже

 #define CL_TARGET_OPENCL_VERSION 120
//#define CL_USE_DEPRECATED_OPENCL_1_2_APIS

#define MAX_SOURCE_SIZE (0x100000)
#define _CRT_SECURE_NO_WARNINGS

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#define VECTOR_SIZE 16

/*
 * Embedded copy of the kernel source, usable with clCreateProgramWithSource
 * instead of reading saxpy.cl from disk. The kernel runs once per work-item
 * and stores A[index] + B[index] into C[index]; `alpha` is accepted but not
 * used by the expression.
 *
 * Every line ends with an explicit "\n": adjacent string literals are
 * concatenated, so without the newlines the `//` comment inside the kernel
 * would swallow all of the code that follows it.
 */
const char *saxpy_kernel =
"__kernel                                   \n"
"void saxpy_kernel(float alpha,     \n"
"                  __global float *A,       \n"
"                  __global float *B,       \n"
"                  __global float *C)       \n"
"{                                          \n"
"    //Get the index of the work-item       \n"
"    int index = get_global_id(0);          \n"
"    C[index] = A[index] + B[index]; \n"
"}                                          \n";

/**
 * Report a failed OpenCL call on stderr.
 *
 * Declared `static inline` so the definition has internal linkage: a plain
 * `inline` definition in C provides no external definition, and a call that
 * the compiler chooses not to inline would then fail to link.
 *
 * @param err   Status code returned by an OpenCL API call.
 * @param name  Label printed in the message (typically the API name).
 * @return true when err equals CL_SUCCESS; otherwise prints
 *         "ERROR: <name> ( <err> )" to stderr and returns false.
 */
static inline bool checkError(cl_int err, const char *name)
{
    if (err == CL_SUCCESS) {
        return true;
    }
    fprintf(stderr, "ERROR: %s ( %d )\n", name, (int)err);
    return false;
}

int main(void) {


    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("saxpy.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    fprintf(stderr, "[kernel loaded. Read %d bytes]n", (unsigned int)source_size);

    int i;
    // Allocate space for vectors A, B and C
    float alpha = 1.0;
    float *A = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    float *B = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    float *C = (float*)malloc(sizeof(float)*VECTOR_SIZE);
    for (i = 0; i < VECTOR_SIZE; i  )
    {
        A[i] = i;
        B[i] = VECTOR_SIZE - i;
        C[i] = 0;
    }
    cl_int clStatus;
    // Get platform and device information
    //cl_platform_id * platforms = NULL;
    //cl_uint     num_platforms;
    //Set up the Platform
    //cl_int clStatus = clGetPlatformIDs(0, NULL, amp;num_platforms);
    //platforms = (cl_platform_id *)
    //  malloc(sizeof(cl_platform_id)*num_platforms);
    //clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
    //checkError(clStatus, "clGetPlatformIDs");

    ////Get the devices list and choose the device you want to run on
    //cl_device_id     *device_list = NULL;
    //cl_uint           num_devices;

    //clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, amp;num_devices);
    //device_list = (cl_device_id *) malloc(sizeof(cl_device_id)*num_devices);
    //clStatus = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, device_list, NULL);
    //checkError(clStatus, "clGetDeviceIDs");


    bool deviceFound = true;
    cl_uint                 num_platforms;           ///< Number of platforms found in host
    cl_platform_id*         platforms;            ///< Information of platforms found
    cl_device_id*           device_list;              ///< Information of Devices found
    cl_uint                 num_devices;             ///< Number of devices found that support OpenCL
    size_t                  paramValueSize;         ///< Info Device
    char *info = NULL;

    int deviceId = 0;
    // Select an OpenCL platform to run on.  
    clStatus = clGetPlatformIDs(0, NULL, amp;num_platforms);
    fprintf(stderr, "[Number of platforms = %d]n", (unsigned int)num_platforms);
    deviceFound = checkError((clStatus != CL_SUCCESS) ? clStatus : (num_platforms <= 0 ? -1 : CL_SUCCESS), "No Platforms Found");

    platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);

    clStatus = clGetPlatformIDs(num_platforms, platforms, NULL);
    checkError((clStatus != CL_SUCCESS) ? clStatus : (num_platforms <= 0 ? -1 : CL_SUCCESS), "No Platforms Found");
    device_list = NULL;
    if (deviceFound)
    {
        // Iterate through the list of platforms until we find one that supports
        // a GPU device, otherwise fail with an error.


        cl_uint i;
        for (i = 0; i < num_platforms; i  )
        {
            //Find only GPU Devices
            clStatus = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, amp;num_devices);
            if (clStatus != CL_SUCCESS amp;amp; clStatus != CL_DEVICE_NOT_FOUND)
            {
                deviceFound = checkError(clStatus, "No devices Found");

            }

            //Construct a vector with devices found
            else if (num_devices > 0)
            {
                device_list = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
                clStatus = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, num_devices, amp;device_list[0], NULL);
                deviceFound = checkError(clStatus, "No devices found");
                deviceId = i;
                fprintf(stderr, "[Device ID selected = %d]n", deviceId);

                break;
            }
        }

        //if found device, display information
        if (deviceFound)
        {
            if (deviceId < 0 || deviceId >(num_devices - 1))
            {
                deviceId = 0;
                printf("ID device not found, use default GPU device n");
            }

            //Obtain device vendor name to display info   
            clStatus = clGetDeviceInfo(device_list[deviceId], CL_DEVICE_NAME, 0, NULL, amp;paramValueSize);
            checkError(clStatus, "Failed to find OpenCL device info");


            info = (char *)malloc(sizeof(char) * paramValueSize);  // String to display vendor name info
            clStatus = clGetDeviceInfo(device_list[deviceId], CL_DEVICE_NAME, paramValueSize, info, NULL);
            checkError(clStatus, "Failed to find OpenCL device info");



        }
    }
    if (deviceFound)
        fprintf(stderr, "[Using GPU device = %s]nn", info);
    free(info);
    


    // Create one OpenCL context for each device in the platform
    cl_context context;
    context = clCreateContext(NULL, num_devices, device_list, NULL, NULL, amp;clStatus);
    checkError(clStatus, "clCreateContext");

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_list[deviceId], 0, amp;clStatus);
    checkError(clStatus, "clCreateCommandQueue");


    //cl_queue_properties qprop[] = { CL_QUEUE_PROPERTIES,  0 , 0 };
    //cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_list[0], qprop, amp;clStatus);

    // Create memory buffers on the device for each vector
    cl_mem A_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, amp;clStatus);
    checkError(clStatus, "clCreateBuffer A");
    cl_mem B_clmem = clCreateBuffer(context, CL_MEM_READ_ONLY, VECTOR_SIZE * sizeof(float), NULL, amp;clStatus);
    checkError(clStatus, "clCreateBuffer B");
    cl_mem C_clmem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, VECTOR_SIZE * sizeof(float), NULL, amp;clStatus);
    checkError(clStatus, "clCreateBuffer C");

    // Copy the Buffer A and B to the device
    clStatus = clEnqueueWriteBuffer(command_queue, A_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), A, 0, NULL, NULL);
    clStatus = clEnqueueWriteBuffer(command_queue, B_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), B, 0, NULL, NULL);

    // Create a program from the kernel source
    //cl_program program = clCreateProgramWithSource(context, 1, (const char **)amp;saxpy_kernel, NULL, amp;clStatus);
    //checkError(clStatus, "clCreateProgramWithSource");    
    // Build the program
    //clStatus = clBuildProgram(program, 1, device_list, NULL, NULL, NULL);
    //checkError(clStatus, "clBuildProgram");
    // Create the OpenCL kernel
    //cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", amp;clStatus);
    //checkError(clStatus, "clCreateKernel");

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,  (const char **)amp;source_str, (const size_t *)amp;source_size, amp;clStatus);
    checkError(clStatus, "clCreateProgramWithSource");   
    // Build the program
    clStatus = clBuildProgram(program, num_devices, device_list, NULL, NULL, NULL);
    
    // Query if compilation was successful  
    
    if (clStatus == CL_BUILD_PROGRAM_FAILURE) {

        // Determine the size of the log
        size_t log_size;
        clGetProgramBuildInfo(program, device_list[deviceId], CL_PROGRAM_BUILD_LOG, 0, NULL, amp;log_size);

        // Allocate memory for the log
        char *log = (char *)malloc(log_size);

        // Get the log
        clGetProgramBuildInfo(program, device_list[deviceId], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);

        // Print the log
        fprintf(stderr, "n************* CL_BUILD_PROGRAM_FAILURE ********************n");
        fprintf(stderr, "[Error Description]nn%sn", log);
        fprintf(stderr, "*************************************************************n");
        free(log);
        
    }

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "saxpy_kernel", amp;clStatus);
    checkError(clStatus, "clCreateKernel");

    // Set the arguments of the kernel
    clStatus = clSetKernelArg(kernel, 0, sizeof(float), (void *)amp;alpha);
    checkError(clStatus, "clSetKernelArg 0");
    clStatus = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)amp;A_clmem);
    checkError(clStatus, "clSetKernelArg 1");
    clStatus = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)amp;B_clmem);
    checkError(clStatus, "clSetKernelArg 2");
    clStatus = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)amp;C_clmem);
    checkError(clStatus, "clSetKernelArg 3");

    // Execute the OpenCL kernel on the list
    size_t global_size = VECTOR_SIZE; // Process the entire lists
    size_t local_size = 16;           // Process one item at a time
    clStatus = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, amp;global_size, amp;local_size, 0, NULL, NULL);
    checkError(clStatus, "clEnqueueNDRangeKernel");

    // Read the cl memory C_clmem on device to the host variable C
    clStatus = clEnqueueReadBuffer(command_queue, C_clmem, CL_TRUE, 0, VECTOR_SIZE * sizeof(float), C, 0, NULL, NULL);
    checkError(clStatus, "clEnqueueReadBuffer C");

    // Clean up and wait for all the comands to complete.
    clStatus = clFlush(command_queue);
    clStatus = clFinish(command_queue);

    // Display the result to the screen
    for (i = 0; i < VECTOR_SIZE; i  )
        printf("%f   %f = %fn", A[i], B[i], C[i]);
 

    // Finally release all OpenCL allocated objects and host buffers.
    clStatus = clReleaseKernel(kernel);
    clStatus = clReleaseProgram(program);
    clStatus = clReleaseMemObject(A_clmem);
    clStatus = clReleaseMemObject(B_clmem);
    clStatus = clReleaseMemObject(C_clmem);

    clStatus = clReleaseCommandQueue(command_queue);
    clStatus = clReleaseContext(context);
    free(A);
    free(B);
    free(C);
    free(platforms);
    free(device_list);
    return 0;
}
 

Файл saxpy.cl:

 /* Element-wise vector addition: each work-item stores A[index] + B[index]
    into C[index], where index is its global id in dimension 0.
    `alpha` is accepted as the first argument but is not used. */
 __kernel void saxpy_kernel(float alpha, __global float *A, __global float *B, __global float *C)
    {
        int index = get_global_id(0);
        C[index] = A[index] + B[index];
    }
                           
 

Вы также можете ознакомиться с отчетом clinfo ниже

 Number of platforms                               1
  Platform Name                                   NVIDIA CUDA
  Platform Vendor                                 NVIDIA Corporation
  Platform Version                                OpenCL 1.2 CUDA 10.1.120
  Platform Profile                                FULL_PROFILE
  Platform Extensions                             cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_byte_addressable_store cl_khr_icd cl_khr_gl_sharing cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts cl_khr_gl_event cl_nv_create_buffer
  Platform Extensions function suffix             NV

  Platform Name                                   NVIDIA CUDA
Number of devices                                 1
  Device Name                                     Quadro P4000
  Device Vendor                                   NVIDIA Corporation
  Device Vendor ID                                0x10de
  Device Version                                  OpenCL 1.2 CUDA
  Driver Version                                  430.64
  Device OpenCL C Version                         OpenCL C 1.2
  Device Type                                     GPU
  Device Topology (NV)                            <printDeviceInfo:22: get CL_DEVICE_PCI_DOMAIN_ID_NV : error -30>
  Device Profile                                  FULL_PROFILE
  Device Available                                Yes
  Compiler Available                              Yes
  Linker Available                                Yes
  Max compute units                               14
  Max clock frequency                             1480MHz
  Compute Capability (NV)                         6.1
  Device Partition                                (core)
    Max number of sub-devices                     1
    Supported partition types                     None
    Supported affinity domains                    (n/a)
  Max work item dimensions                        3
  Max work item sizes                             1024x1024x64
  Max work group size                             1024
 

Комментарии:

1. <printDeviceInfo:22: get CL_DEVICE_PCI_DOMAIN_ID_NV : error -30> намекает на то, что что-то не так с установкой драйвера Nvidia.

2. @ProjectPhysX Я связался с автором, и он ответил, что это ошибка clinfo

3. @ProjectPhysX clinfo также сообщает: === CL_PROGRAM_BUILD_LOG === Preferred work group size multiple (kernel) <getWGsizes:1504: create kernel : error -45>