OpenCL Particle System Example in Processing

The second example is adapted from the Million Particles example in the MSAOpenCL library. I developed it with the Processing 2.0 alpha release, without using the MSAOpenCL library or the Pointer class. All the memory buffers are standard java.nio.Buffer objects. The performance is quite acceptable with one million particles on an ATI Radeon HD 4670 graphics card.
 

 
The Processing source

import processing.opengl.*;
import javax.media.opengl.*;
import javax.media.opengl.glu.GLU;
import java.nio.FloatBuffer;
import java.nio.ByteBuffer;
 
import com.nativelibs4java.opencl.*;
import com.nativelibs4java.opencl.CLMem.Usage;
 
final int PARTICLES_COUNT = 1000000;
float halfWidth, halfHeight;
 
GL2 gl;
PGL pgl;
 
int [] vbo = new int[1];
 
CLContext context;
CLQueue queue;
CLKernel kernel;
 
CLBuffer<Float> partMem;
FloatBuffer partBuf;
 
CLBuffer<Byte> posMem;
ByteBuffer partPos;
 
PVector mousePos;
 
void setup() {
  size(screenWidth, screenHeight, OPENGL);
  background(0);
  randomSeed(millis());
  halfWidth = width/2;
  halfHeight = height/2;
  mousePos = new PVector(float(mouseX) - halfWidth, halfHeight - float(mouseY));
 
  PGraphicsOpenGL pg = (PGraphicsOpenGL) g;
  pgl = pg.beginPGL();
  gl = pgl.gl.getGL().getGL2();
  gl.glClearColor(0, 0, 0, 1);
  gl.glClear(GL.GL_COLOR_BUFFER_BIT);
  gl.glEnable(GL2.GL_POINT_SMOOTH);
  gl.glPointSize(2f);
 
  initOpenCL();
  pg.endPGL();
}
 
void initOpenCL() {
  context = JavaCL.createContextFromCurrentGL();
  queue = context.createDefaultQueue();
 
  partBuf = FloatBuffer.allocate(PARTICLES_COUNT * 4);
  partPos = ByteBuffer.allocateDirect(PARTICLES_COUNT * 2 * Float.SIZE/8).order(context.getByteOrder());
  FloatBuffer tmpPos = partPos.asFloatBuffer();
 
  for (int i = 0; i < PARTICLES_COUNT; i++) {
 
    partBuf.put(0.0f);
    partBuf.put(0.0f);
    partBuf.put(random(0.2, 2.0));
    partBuf.put(0.0f);
 
    tmpPos.put(random(width));
    tmpPos.put(random(height));
  }
 
  partBuf.rewind();
  partPos.rewind();
 
  gl.glGenBuffers(1, vbo, 0);
  gl.glBindBuffer(GL.GL_ARRAY_BUFFER, vbo[0]);
 
  gl.glBufferData(GL.GL_ARRAY_BUFFER, (int) partPos.capacity(), partPos, GL2.GL_DYNAMIC_COPY);
  gl.glBindBuffer(GL.GL_ARRAY_BUFFER, 0);
 
  posMem = context.createBufferFromGLBuffer(Usage.InputOutput, vbo[0]);
  partMem = context.createFloatBuffer(Usage.InputOutput, partBuf, true);
 
  String pgmSrc = join(loadStrings(dataPath("Particle.cl")), "\n");
  CLProgram program = context.createProgram(pgmSrc);
  kernel = program.build().createKernel("updateParticle");
  kernel.setArg(0, partMem);
  kernel.setArg(1, posMem);
  kernel.setArg(2, new float [] {
    mousePos.x, mousePos.y
  }
  );
}
 
void draw() {
  background(0);
  gl.glMatrixMode(GL2.GL_PROJECTION);
  gl.glLoadIdentity();
  pgl.glu.gluOrtho2D(-halfWidth - 1, halfWidth + 1, -halfHeight - 1, halfHeight + 1);
  gl.glMatrixMode(GL2.GL_MODELVIEW);
  gl.glColor3f(1.0f, 0.8f, 0.0f);
  gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, vbo[0]);
  queue.finish();
  gl.glEnableClientState(GL2.GL_VERTEX_ARRAY);
  gl.glVertexPointer(2, GL.GL_FLOAT, 0, 0);
  gl.glDrawArrays(GL2.GL_POINTS, 0, PARTICLES_COUNT);
 
  gl.glBindBuffer(GL2.GL_ARRAY_BUFFER, 0);
  gl.glDisableClientState(GL2.GL_VERTEX_ARRAY);
  callKernel();
}
 
/**
 * Enqueues one simulation step: refreshes the mouse position in
 * origin-centred coordinates, hands the shared position buffer over to
 * OpenCL, runs "updateParticle" over all particles and releases the buffer
 * back to GL. The call does not wait for the kernel; draw() calls
 * queue.finish() before the next render.
 */
void callKernel() {
  mousePos.set(float(mouseX) - halfWidth, halfHeight - float(mouseY), 0.0f);

  // OpenCL requires that pending GL commands touching a shared object have
  // completed before the object is acquired; draw() has just rendered from
  // this buffer, so flush and wait here.
  gl.glFinish();

  // Keep the argument update and the enqueue together on the kernel object.
  synchronized(kernel) {
    posMem.acquireGLObject(queue);
    kernel.setArg(2, new float[] { mousePos.x, mousePos.y });
    kernel.enqueueNDRange(queue, new int[] { PARTICLES_COUNT });
    posMem.releaseGLObject(queue);
  }
}

 
The kernel source

// Factor the particle velocity is multiplied by at the end of every update.
#define DAMP			0.95f
// Scale of the velocity term that pulls a particle towards the origin (-position).
#define CENTER_FORCE		0.005f
// Scale of the mouse term, which is further multiplied by 1 / squared distance.
#define MOUSE_FORCE		200.0f
// Threshold the kernel compares against the SQUARED speed (dot(vel, vel)).
#define MIN_SPEED		0.2f
 
// Per-particle state kept across kernel invocations. The host fills this
// buffer with four floats per particle, in this field order.
typedef struct{
	float2 vel;		// velocity, updated by the kernel every step
	float mass;		// scales the mouse term; host seeds it with random(0.2, 2.0)
	float dummy;		// not read by the kernel; presumably padding - confirm
} Particle;
 
/*
 * Advances one particle by one simulation step.
 *
 * particles  per-particle state (velocity, mass), indexed by global id
 * posBuffer  per-particle 2D position, indexed by global id; updated in place
 * mousePos   mouse position in the same coordinate space as posBuffer
 *
 * One work-item handles one particle; the host enqueues exactly as many
 * work-items as there are particles, so no bounds check is performed.
 */
__kernel void updateParticle(__global Particle* particles, 
	__global float2* posBuffer, 
	const float2 mousePos)
{
	// Floor for the squared distance to the mouse. Without it a particle
	// sitting exactly on the mouse yields 1/0 = inf and 0 * inf = NaN, which
	// then sticks in its position and velocity permanently.
	const float minDistSQ = 1e-6f;

	int id = get_global_id(0);
	__global Particle *p = &particles[id];

	// Vector from the particle to the mouse, scaled by MOUSE_FORCE / distance^2.
	float2 diff = mousePos - posBuffer[id];
	float invDistSQ = 1.0f / fmax(dot(diff, diff), minDistSQ);
	diff *= (MOUSE_FORCE * invDistSQ);

	// Pull towards the origin, push away from the mouse in proportion to mass.
	p->vel += -posBuffer[id] * CENTER_FORCE - diff * p->mass;

	// Slow particles are moved next to the mouse. Note that the squared
	// speed is compared against MIN_SPEED directly.
	float speed2 = dot(p->vel, p->vel);
	if (speed2 < MIN_SPEED) 
		posBuffer[id] = mousePos + diff * (1 + p->mass);

	posBuffer[id] += p->vel;
	p->vel *= DAMP;
}