Wednesday, March 25, 2015

LWJGL3 - use GPU [OpenCL] for multithreaded calculations

Lately I spent some time investigating the use of GPUs for computation, particularly NVIDIA CUDA. During that research I also looked at other options, such as OpenCL, and that brought me to the Lightweight Java Game Library (LWJGL).

Short intro:

LWJGL is a Java library that enables cross-platform access to popular native APIs useful in the development of graphics (OpenGL), audio (OpenAL) and parallel computing (OpenCL) applications. This access is direct and high-performance, yet also wrapped in a type-safe and user-friendly layer, appropriate for the Java ecosystem. LWJGL is an enabling technology and provides low-level access. It is not a framework and does not provide higher-level utilities than what the native libraries expose. As such, novice programmers are encouraged to try one of the frameworks or game engines that make use of LWJGL, before working directly with the library.   

After a quick look, it seems you get the same level of access to all parts of the GPU as you do with JCUDA (which uses CUDA, of course ;-) ).

Here is a simple OpenCL example, updated to use LWJGL3:




package com.yarenty.lwjgl3;

import org.lwjgl.opencl.CLUtil;
import org.lwjgl.BufferUtils;
import org.lwjgl.PointerBuffer;

import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.util.List;

import org.lwjgl.opencl.CL;
import org.lwjgl.opencl.CLCreateContextCallback;
import org.lwjgl.opencl.CLDevice;
import org.lwjgl.opencl.CLPlatform;

import static org.lwjgl.opencl.CL10.*;
import static org.lwjgl.opencl.CLUtil.checkCLError;
import static org.lwjgl.system.MemoryUtil.NULL;
import static org.lwjgl.system.MemoryUtil.memDecodeUTF8;
 
/**
 * Minimal OpenCL example on top of LWJGL3: adds two float vectors element-wise on the first
 * GPU device of the first OpenCL platform and prints both operands and the result.
 */
public class OpenCLSum {
    /** OpenCL C source of the kernel; each work-item adds one pair of elements. */
    static final String source =
        "kernel void sum(global const float *a, global const float *b, global float *answer) { "
        + "  unsigned int xid = get_global_id(0); "
        + "  answer[xid] = a[xid] + b[xid];"
        + "}";

    /** First operand (host side). */
    static final FloatBuffer a = toFloatBuffer(new float[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
    /** Second operand (host side). */
    static final FloatBuffer b = toFloatBuffer(new float[]{9, 8, 7, 6, 5, 4, 3, 2, 1, 0});
    /** Host buffer that receives the result; sized like {@link #a}. */
    static final FloatBuffer answer = BufferUtils.createFloatBuffer(a.capacity());

    /** Context error callback: prints the message supplied by the OpenCL runtime to stderr. */
    private static final CLCreateContextCallback CREATE_CONTEXT_CALLBACK = new CLCreateContextCallback() {
        @Override
        public void invoke(long errinfo, long private_info, long cb, long user_data) {
            System.err.println("[LWJGL] cl_create_context_callback");
            System.err.println("\tInfo: " + memDecodeUTF8(errinfo));
        }
    };

    /**
     * Runs the vector addition on the GPU and prints {@code a + b = answer}.
     *
     * @param args unused
     * @throws IllegalStateException if no OpenCL platform or no GPU device is available
     * @throws Exception if OpenCL initialisation or any OpenCL call fails
     */
    public static void main(String[] args) throws Exception {
        // Must be set before CL.create() so that the library is initialised explicitly below.
        System.setProperty("org.lwjgl.opencl.explicitInit", "true");

        // Initialize OpenCL
        CL.create();
        System.out.println("CL created");

        // Handles start as NULL so the finally block releases only what was actually created.
        long context = NULL;
        long queue = NULL;
        long aMem = NULL;
        long bMem = NULL;
        long answerMem = NULL;
        long program = NULL;
        long kernel = NULL;

        try {
            List<CLPlatform> platforms = CLPlatform.getPlatforms();
            if (platforms == null || platforms.isEmpty()) {
                throw new IllegalStateException("No OpenCL platform found");
            }
            CLPlatform platform = platforms.get(0);
            System.out.println("Platform created");

            // Zero-terminated property list: { CL_CONTEXT_PLATFORM, <platform>, 0 }
            PointerBuffer ctxProps = BufferUtils.createPointerBuffer(3);
            ctxProps.put(CL_CONTEXT_PLATFORM).put(platform).put(0).flip();
            System.out.println("CTX created");

            IntBuffer errcode_ret = BufferUtils.createIntBuffer(1);
            System.out.println("ERRCODE created");

            List<CLDevice> devices = platform.getDevices(CL_DEVICE_TYPE_GPU);
            if (devices == null || devices.isEmpty()) {
                throw new IllegalStateException("No OpenCL GPU device found on the first platform");
            }
            long device = devices.get(0).getPointer();

            context = clCreateContext(ctxProps, device, CREATE_CONTEXT_CALLBACK, NULL, errcode_ret);
            checkCLError(errcode_ret);
            System.out.println("CONTEXT created");

            queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, errcode_ret);
            checkCLError(errcode_ret);
            System.out.println("Command Q created");

            // Allocate the two input buffers and the result buffer. CL_MEM_COPY_HOST_PTR copies
            // the host data at creation time, so no separate clEnqueueWriteBuffer is needed.
            aMem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, a, errcode_ret);
            checkCLError(errcode_ret);
            System.out.println("A Buffer created");

            bMem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, b, errcode_ret);
            checkCLError(errcode_ret);
            System.out.println("B Buffer created");

            answerMem = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, answer, errcode_ret);
            checkCLError(errcode_ret);
            System.out.println("OUTPUT Buffer created");

            // Create our program and kernel
            program = clCreateProgramWithSource(context, source, errcode_ret);
            checkCLError(errcode_ret);
            System.out.println("PROGRAM created");

            checkCLError(clBuildProgram(program, device, "", null, 0L));

            // "sum" has to match a kernel function name in the OpenCL source
            kernel = clCreateKernel(program, "sum", errcode_ret);
            checkCLError(errcode_ret);
            System.out.println("KERNEL created");

            // One work-item per vector element
            PointerBuffer kernel1DGlobalWorkSize = BufferUtils.createPointerBuffer(1);
            System.out.println("KERNEL work size created");
            kernel1DGlobalWorkSize.put(0, a.capacity());
            System.out.println("KERNEL work size copied");

            checkCLError(clSetKernelArg1p(kernel, 0, aMem));
            checkCLError(clSetKernelArg1p(kernel, 1, bMem));
            checkCLError(clSetKernelArg1p(kernel, 2, answerMem));
            System.out.println("Args send to kernel");

            // Execute our kernel
            checkCLError(clEnqueueNDRangeKernel(queue, kernel, 1, null, kernel1DGlobalWorkSize, null, null, null));
            System.out.println("KERNEL queued created");

            // Read the results memory back into our result buffer (third argument 1 = blocking read)
            checkCLError(clEnqueueReadBuffer(queue, answerMem, 1, 0, answer, null, null));
            System.out.println("and output ... created");

            checkCLError(clFinish(queue));

            // Print the result memory
            print(a);
            System.out.println("+");
            print(b);
            System.out.println("=");
            print(answer);
        } finally {
            // Best-effort cleanup; release return codes are deliberately ignored here so that
            // a failing release cannot mask the original exception.
            if (kernel != NULL) {
                clReleaseKernel(kernel);
            }
            if (program != NULL) {
                clReleaseProgram(program);
            }
            if (aMem != NULL) {
                clReleaseMemObject(aMem);
            }
            if (bMem != NULL) {
                clReleaseMemObject(bMem);
            }
            if (answerMem != NULL) {
                clReleaseMemObject(answerMem);
            }
            if (queue != NULL) {
                clReleaseCommandQueue(queue);
            }
            if (context != NULL) {
                clReleaseContext(context);
            }
            CL.destroy();
        }
    }

    /**
     * Copies a float array into a newly created float buffer.
     *
     * @param floats the float array to convert
     * @return a float buffer containing the input values, positioned at the start
     */
    static FloatBuffer toFloatBuffer(float[] floats) {
        FloatBuffer buf = BufferUtils.createFloatBuffer(floats.length).put(floats);
        buf.rewind();
        return buf;
    }

    /**
     * Prints every element of a float buffer on one line of {@code System.out}.
     *
     * @param buffer the float buffer to print; elements are read by absolute index, so the
     *               buffer's position is not changed
     */
    static void print(FloatBuffer buffer) {
        for (int i = 0; i < buffer.capacity(); i++) {
            System.out.print(buffer.get(i) + " ");
        }
        System.out.println("");
    }

}

You can find the full Scala project here: https://github.com/yarenty/scala_lwjgl3

1 comment:

  1. HI, thanks for sharing this!
    I'm using 3.1.1 version of lwjgl and following classes are not found:

    import org.lwjgl.opencl.CLCreateContextCallback;
    import org.lwjgl.opencl.CLDevice;
    import org.lwjgl.opencl.CLPlatform;

    is there a working example for 3.1.1?

    ReplyDelete

Web 3 - blockchain layers

Layers from a blockchain perspective. My plan is to write 5 articles:  1 Intro: Web 1.. 2.. 3.. 2 Layers in crypto.  [this one] 3 Applicatio...