OpenCL Particles System with Processing

I ported the particles demo program in JavaCL to Processing 2.0 alpha. It has reasonable performance in my iMac up to 500,000 particles at twenty something frames per second. The video is captured using the QuickTime screen recording. The performance is much slower than the original screen version.
 

import processing.opengl.*;
import javax.media.opengl.*;
import javax.media.opengl.glu.GLU;
import java.util.Random;
 
import com.nativelibs4java.opencl.*;
import com.nativelibs4java.opencl.CLMem.Usage;
import org.bridj.Pointer;
 
import static org.bridj.Pointer.*;
 
final int particlesCount = 200000;
 
GL2 gl;
PGL pgl;
 
int [] vbo = new int[1];
 
CLContext context;
CLQueue queue;
 
Pointer<Float> velocities;
CLKernel updateParticleKernel;
 
CLBuffer<Float> massesMem, velocitiesMem;
CLBuffer<Byte> interleavedColorAndPositionsMem;
Pointer<Byte> interleavedColorAndPositionsTemp;
 
int elementSize = 4*4;
 
void setup() {
  size(800, 600, OPENGL);
  background(0);
  randomSeed(millis());
 
  PGraphicsOpenGL pg = (PGraphicsOpenGL) g;
  pgl = pg.beginPGL();
  gl = pgl.gl.getGL().getGL2();
  gl.glClearColor(0, 0, 0, 1);
  gl.glClear(GL.GL_COLOR_BUFFER_BIT);
  gl.glEnable(GL.GL_BLEND);
  gl.glEnable(GL2.GL_POINT_SMOOTH);
  gl.glPointSize(1f);
  initOpenCL();
  pg.endPGL();
}
 
void initOpenCL() {
  context = JavaCL.createContextFromCurrentGL();
  queue = context.createDefaultQueue();
 
  Pointer<Float> masses = allocateFloats(particlesCount).order(context.getByteOrder());
  velocities = allocateFloats(2 * particlesCount).order(context.getByteOrder());
  interleavedColorAndPositionsTemp = allocateBytes(elementSize * particlesCount).order(context.getByteOrder());
 
  Pointer<Float> positionsView = interleavedColorAndPositionsTemp.as(Float.class);
  for (int i = 0; i < particlesCount; i++) {
    masses.set(i, 0.5f + 0.5f * random(1));
    velocities.set(i * 2, random(-0.5, 0.5) * 0.2f);
    velocities.set(i * 2 + 1, random(-0.5, 0.5) * 0.2f);
    int colorOffset = i * elementSize;
    int posOffset = i * (elementSize / 4) + 1;
    byte r = (byte) 220, g = r, b = r, a = r;
    interleavedColorAndPositionsTemp.set(colorOffset++, r);
    interleavedColorAndPositionsTemp.set(colorOffset++, g);
    interleavedColorAndPositionsTemp.set(colorOffset++, b);
    interleavedColorAndPositionsTemp.set(colorOffset, a);
    float x = random(-0.5, 0.5) * width/2.0, 
    y = random(-0.5, 0.5) * height/2.0;
    positionsView.set(posOffset, (float) x);
    positionsView.set(posOffset + 1, (float) y);
  }
  velocitiesMem = context.createBuffer(Usage.InputOutput, velocities, false);
  massesMem = context.createBuffer(Usage.Input, masses, true);
 
  gl.glGenBuffers(1, vbo, 0);
  gl.glBindBuffer(GL.GL_ARRAY_BUFFER, vbo[0]);
  gl.glBufferData(GL.GL_ARRAY_BUFFER, (int) interleavedColorAndPositionsTemp.getValidBytes(), interleavedColorAndPositionsTemp.getByteBuffer(), GL2.GL_DYNAMIC_COPY);
  gl.glBindBuffer(GL.GL_ARRAY_BUFFER, 0);
 
  interleavedColorAndPositionsMem = context.createBufferFromGLBuffer(Usage.InputOutput, vbo[0]);
  String pgmSrc = join(loadStrings(dataPath("ParticlesDemoProgram.cl")), "\n");
  CLProgram program = context.createProgram(pgmSrc);
  updateParticleKernel = program.build().createKernel("updateParticle");
  callKernel();
}
 
void draw() {
  queue.finish();
  gl.glClear(GL.GL_COLOR_BUFFER_BIT);
  gl.glBlendFunc(GL.GL_SRC_ALPHA, GL.GL_SRC_COLOR);
  gl.glMatrixMode(GL2.GL_PROJECTION);
  gl.glLoadIdentity();
  pgl.glu.gluOrtho2D(-width/2 - 1, width/2 + 1, -height/2 - 1, height/2 + 1);
  gl.glMatrixMode(GL2.GL_MODELVIEW);
 
  gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, vbo[0]);
  gl.glInterleavedArrays(GL2.GL_C4UB_V2F, elementSize, 0);
  gl.glDrawArrays(GL.GL_POINTS, 0, particlesCount);
 
  gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
  callKernel();
}
 
void callKernel() {
  CLEvent kernelCompletion;
  synchronized(updateParticleKernel) {
    interleavedColorAndPositionsMem.acquireGLObject(queue);
    updateParticleKernel.setArgs(massesMem, 
    velocitiesMem, 
    interleavedColorAndPositionsMem.as(Float.class), 
    new float[] {
      mouseX-width/2, height/2-mouseY
    }
    , 
    new float[] {
      width, height
    }
    , 
    2.0, 
    2.0, 
    0.9, 
    0.8, 
    (byte) 0);
 
    int [] globalSizes = new int[] {
      particlesCount
    };
    kernelCompletion = updateParticleKernel.enqueueNDRange(queue, globalSizes);
    interleavedColorAndPositionsMem.releaseGLObject(queue);
  }
}

 
Here is the OpenCL source.

// Ported to JavaCL/OpenCL4Java (+ added colors) by Olivier Chafik
 
#define REPULSION_FORCE 4.0f
#define CENTER_FORCE2 0.0005f
 
#define PI 3.1416f
 
//#pragma OpenCL cl_khr_byte_addressable_store : enable
 
uchar4 HSVAtoRGBA(float4 hsva)
{
    float h = hsva.x, s = hsva.y, v = hsva.z, a = hsva.w;
    float r, g, b;
 
        int i;
        float f, p, q, t;
        if (s == 0) {
                // achromatic (grey)
                r = g = b = v;
                return (uchar4)(r * 255, g * 255, b * 255, a * 255);
        }
        h /= 60;                        // sector 0 to 5
        i = floor( h );
        f = h - i;                      // factorial part of h
        p = v * ( 1 - s );
        q = v * ( 1 - s * f );
        t = v * ( 1 - s * ( 1 - f ) );
        switch( i ) {
                case 0:
                        r = v;
                        g = t;
                        b = p;
                        break;
                case 1:
                        r = q;
                        g = v;
                        b = p;
                        break;
                case 2:
                        r = p;
                        g = v;
                        b = t;
                        break;
                case 3:
                        r = p;
                        g = q;
                        b = v;
                        break;
                case 4:
                        r = t;
                        g = p;
                        b = v;
                        break;
                default:                // case 5:
                        r = v;
                        g = p;
                        b = q;
                        break;
        }
    return (uchar4)(r * 255, g * 255, b * 255, a * 255);
}
 
__kernel void updateParticle(
        __global float* masses,
        __global float2* velocities,
        //__global Particle* particles,
        __global float4* particles,
        //__global char* pParticles,
        const float2 mousePos,
        const float2 dimensions,
        float massFactor,
        float speedFactor,
        float slowDownFactor,
        float mouseWeight,
        char limitToScreen
) {
	int id = get_global_id(0);
 
        float4 particle = particles[id];
 
        uchar4 color = as_uchar4(particle.x);
 
        float2 position = particle.yz;
    	float2 diff = mousePos - position;
 
        float invDistSQ = 1.0f / dot(diff, diff);
	float2 halfD = dimensions / 2.0f;
        diff *= (halfD).y * invDistSQ;
 
        float mass = massFactor * masses[id];
        float2 velocity = velocities[id];
        velocity -= mass * position * CENTER_FORCE2 - diff * mass * mouseWeight;
        position += speedFactor * velocities[id];
 
        if (limitToScreen) {
            float2 halfDims = dimensions / 2.0f;
            position = clamp(position, -halfDims, halfDims);
        }
 
        float dirDot = cross((float4)(diff, (float2)0), (float4)(velocity, (float2)0)).z;
        float speed = length(velocity);
 
        float f = speed / 4 / mass;
        float hue = (dirDot < 0 ? f : f + 1) / 2;
        hue = clamp(hue, 0.0f, 1.0f) * 360;
 
        float opacity = clamp(0.1f + f, 0.0f, 1.0f);
        float saturation = mass / 2;
        float brightness = 0.6f + opacity * 0.3f;
 
        uchar4 targetColor = HSVAtoRGBA((float4)(hue, saturation, brightness, opacity));
 
        float colorSpeedFactor = min(0.01f * speedFactor, 1.0f), otherColorSpeedFactor = 1 - colorSpeedFactor;
        color = (uchar4)(
            (uchar)(targetColor.x * colorSpeedFactor + color.x * otherColorSpeedFactor),
            (uchar)(targetColor.y * colorSpeedFactor + color.y * otherColorSpeedFactor),
            (uchar)(targetColor.z * colorSpeedFactor + color.z * otherColorSpeedFactor),
            (uchar)(targetColor.w * colorSpeedFactor + color.w * otherColorSpeedFactor)
        );
 
        particle.x = as_float(color);
        particle.yz = position;
 
    	particles[id] = particle;
 
        velocity *= slowDownFactor;
        velocities[id] = velocity;
}