8
/* NOTE(review): the bare numeric lines interleaved with the code in this
 * listing look like original line numbers left behind by an extraction step;
 * they are not C, and many original lines are missing between them. */
/* Number of kernel launches enqueued by the timed loop in bench(). */
#define ITERATIONS 500000
10
/* Error helper: prints `reason` to stderr. The rest of the expansion (and any
 * use of `code`) is not visible in this listing. */
#define fail(code, reason) {\
11
fprintf(stderr, "%s\n", reason); \
15
/* Barrier that bench() waits on before starting its timed loop; main()
 * initialises and destroys it around each benchmark phase. */
pthread_barrier_t bar;
19
/* File-scope device pointer. NOTE(review): main() declares a local
 * `devices[16]` array with the same name, which hides this one there. */
cl_device_id *devices;
22
/*
 * load_app - create and build an OpenCL program for the given devices.
 *
 * ctx          context the program is created in
 * num_devices  number of entries in `devices`
 * devices      devices to build for
 * name         path of the kernel source file, opened with fopen()
 *
 * The visible lines read the file into a malloc'ed buffer, print the device
 * names, create the program from source, build it with a fixed NVIDIA flag and
 * report the per-device build status and log.  Returns NULL when the source
 * buffer cannot be allocated; the other return paths are not visible here.
 *
 * NOTE(review): this listing is sparse (the bare numbers appear to be original
 * line numbers), so several declarations and closing braces are missing.
 */
static inline cl_program load_app(cl_context ctx, cl_uint num_devices, cl_device_id *devices, const char *name) {
25
// Built-in source text; `buf` is re-pointed at the file contents further down.
char *buf = "__kernel void test() {}";
26
device_info_t info = { num_devices, devices };
31
f = fopen(name, "rb");
34
// Seek to the end and back again; `len` is presumably set from ftell() on an
// omitted line - TODO confirm.
fseek(f, 0, SEEK_END);
36
fseek(f, 0, SEEK_SET);
38
buf = (char*)malloc(len*sizeof(char));
39
// NOTE(review): no fclose(f) is visible on this early-return path - confirm
// the file handle is not leaked.
if (!buf) return NULL;
41
// A short read is treated as a failure; the handling itself is not visible.
if (fread(buf, 1, len, f) != len) {
51
printf("Compiling for ");
52
for (i=0;i<num_devices;i++) {
54
// NOTE(review): `name` here must be a local char array declared on an omitted
// line; if it is still the `const char *` parameter, sizeof(name) is only the
// size of a pointer - confirm.
int err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(name), name, NULL);
55
// puts() appends a newline, whereas the printf() branch does not.
if (err != CL_SUCCESS) puts("Unknown ");
56
else printf("%s ", name);
60
// The error-code out-parameter is NULL, so failure can only be seen through the
// returned handle.
app = clCreateProgramWithSource(ctx, 1, (const char**)&buf, &len, NULL);
66
cl_build_status build_status;
68
//"-cl-nv-maxrregcount=48"
69
//"-cl-nv-opt-level=<N>" (0 - no optimizations, 3 - default)
70
//"-cl-nv-arch sm_XX" - selects the target CUDA Compute Level architecture to compile for (sm_10 for 1.0, sm_11 for 1.1, sm_12 for 1.2, sm_13 for 1.3 and sm_20 for 2.0 (Fermi))
71
//"--cl-nv-cstd=CLX.X" - selects the target OpenCL C language version (CL1.0 or CL1.1)
72
// NVIDIA-specific option, hard-coded to sm_20.
// NOTE(review): other OpenCL implementations may reject this flag - confirm.
char *build_flags = "-cl-nv-arch sm_20";
73
int err = clBuildProgram(app, num_devices, devices, build_flags, NULL, NULL);
74
// A failed build is only reported here; the per-device loop below still runs.
if (err != CL_SUCCESS) printf("Application build failed (%i)\n", err);
76
for (i = 0; i < num_devices; i++) {
78
// Poll the build status until it is no longer CL_BUILD_IN_PROGRESS (the
// opening `do` of this loop is on an omitted line).
err = clGetProgramBuildInfo(app, devices[i], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL);
79
} while (build_status == CL_BUILD_IN_PROGRESS);
81
// Fetch the build log. NOTE(review): the declaration of `build_log` is not
// visible; `sizeof(build_log) - 1` only makes sense if it is an array - confirm.
err = clGetProgramBuildInfo(app, devices[i], CL_PROGRAM_BUILD_LOG, sizeof(build_log) - 1, &build_log, &size);
85
if (build_status == CL_BUILD_SUCCESS) {
86
} else if (build_status == CL_BUILD_ERROR) {
87
printf("Build failed for device %i:\n======================\n%s\n--------------------\n\n", i, build_log);
89
printf("Build failed for device %i\n", i);
95
printf("Program creation failed\n");
102
/*
 * bench - time ITERATIONS launches of `kern` on one device.
 *
 * ctx     context used to create the command queue
 * device  device the queue is created for
 * kern    kernel enqueued for every launch
 *
 * Creates a command queue, enqueues one launch, waits on the global barrier
 * `bar`, then measures with gettimeofday() how long it takes to enqueue
 * `iterations` launches of a 1024x1024 two-dimensional range.  The elapsed
 * time in microseconds is printed to stdout.
 *
 * NOTE(review): this listing is sparse (the bare numbers appear to be original
 * line numbers), so some declarations and closing braces are missing.
 */
void bench(cl_context ctx, cl_device_id device, cl_kernel kern) {
107
struct timeval tv1,tv2;
108
size_t i, iterations = ITERATIONS;
109
// Global work size: 1024 x 1024 work-items; the local size is passed as NULL.
size_t dims[2] = {1024, 1024};
111
cl_command_queue queue = clCreateCommandQueue(ctx, device, 0, &err);
112
if (err != CL_SUCCESS)
113
fail(-1, "clCreateCommandQueue() failed");
115
// One launch ahead of the barrier - presumably a warm-up; its return value is
// ignored.
clEnqueueNDRangeKernel (queue, kern, 2, NULL, dims, NULL, 0, NULL, NULL);
119
// Blocks until as many callers as the barrier was initialised for have arrived.
pthread_barrier_wait(&bar);
121
gettimeofday(&tv1, NULL);
122
for (i = 0; i < iterations; i++) {
123
// The return value is ignored, so a failed enqueue goes unnoticed.
clEnqueueNDRangeKernel (queue, kern, 2, NULL, dims, NULL, 0, NULL, NULL);
126
// NOTE(review): no clFinish()/clFlush() is visible before this timestamp; if
// none exists on the omitted lines, only the enqueue cost is timed - confirm.
gettimeofday(&tv2, NULL);
128
// Elapsed wall-clock time in microseconds.
us = (tv2.tv_sec - tv1.tv_sec)*1000000 + (tv2.tv_usec - tv1.tv_usec);
132
clReleaseCommandQueue(queue);
134
// NOTE(review): %lu requires `us` to be unsigned long; its declaration is not
// visible - confirm.
printf(" Execution time: %lu us\n", us);
144
/*
 * bench_thread - pthread entry point that forwards to bench().
 *
 * vargs  pointer to a struct thread_info that supplies ctx, device and kern
 *
 * The return statement is not visible in this listing.
 */
void *bench_thread(void *vargs) {
145
struct thread_info *args = (struct thread_info*)vargs;
147
bench(args->ctx, args->device, args->kern);
151
/*
 * main - run the kernel-launch benchmark in three configurations.
 *
 * Command line (from the usage string): [source] [device].
 *
 * Visible phases:
 *   1. "individual context": one context, program and kernel per device;
 *   2. "parallel": one context shared by j devices, one thread per device;
 *   3. "sequential (multi-devices context)": one shared context, with bench()
 *      called for each device in turn.
 *
 * NOTE(review): this listing is sparse (the bare numbers appear to be original
 * line numbers), so many declarations, statements and closing braces are
 * missing.
 */
int main(int argc, char *argv[]) {
156
// This local array hides the file-scope `devices` pointer.
cl_device_id devices[16];
158
struct thread_info args[16];
171
// Not referenced in any visible line of this function.
unsigned char *binary[16];
172
size_t binary_size[16];
180
fprintf(stderr, "Usage: %s [source] [device]\n", argv[0]);
186
// NOTE(review): atoi() cannot report a malformed argument; whether argc is
// checked before this line is not visible - confirm.
devid = atoi(argv[2]);
191
cl_platform_id selected_platform;
192
cl_uint i, j, num_platforms, num_devices;
193
cl_platform_id platforms[4];
195
// The return value is ignored. NOTE(review): num_platforms is the count the
// implementation reports and may exceed the 4 entries requested - confirm the
// loop below cannot index past platforms[3].
clGetPlatformIDs(4, platforms, &num_platforms);
197
// Pick a platform by GPU count: a platform is selected when its count exceeds
// `max` (the initial value and update of `max` are not visible).
for (i = 0; i < num_platforms; i++) {
198
err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
199
if ((err == CL_SUCCESS)&&(num_devices > max)) {
201
selected_platform = platforms[i];
205
// Fetch at most 16 GPU devices; the return value is ignored.
// NOTE(review): num_devices may be reported larger than 16, while the loops
// below index 16-entry arrays by it - confirm.
clGetDeviceIDs(selected_platform, CL_DEVICE_TYPE_GPU, 16, devices, &num_devices);
207
// Only the upper bound is checked here. NOTE(review): a negative devid passes
// this test; how devid is used afterwards is not visible - confirm.
if (devid >= (int)num_devices) fail(-1, "Invalid device number is specified");
210
// Disabled alternative: obtain the devices from a context created by type.
//    cl_context ctx = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);
211
//    if (!ctx) fail(-1, "Failed to create OpenCL context");
214
//    err = clGetContextInfo(ctx, CL_CONTEXT_DEVICES, 16 * sizeof(cl_device_id), &devices, &num_devices);
215
//    if (err != CL_SUCCESS) fail(-1, "clGetContextInfo call is failed");
217
//    num_devices /= sizeof(cl_device_id);
220
// Phase 1 setup: a separate context, program and "test" kernel for every device.
for (i = 0; i < num_devices; i++) {
221
ctxs[i] = clCreateContext(0, 1, &devices[i], NULL, NULL, &err);
222
if (!ctxs[i]) fail(-1, "Failed to create OpenCL context");
224
apps[i] = load_app(ctxs[i], 1, &devices[i], source);
225
if (!apps[i]) fail(-1, "Compilation failed");
227
kerns[i] = clCreateKernel(apps[i], "test", &err);
228
if (err != CL_SUCCESS) fail(-1, "Error creating kernel");
231
// The barrier expects num_devices waiters in this phase.
pthread_barrier_init(&bar, NULL, num_devices);
233
printf("Running in individual context:\n");
234
for (i = 0; i < num_devices; i++) {
235
args[i].ctx = ctxs[i];
236
args[i].device = devices[i];
237
args[i].kern = kerns[i];
238
// Thread creation is disabled here. NOTE(review): what runs instead is on an
// omitted line; if bench() is called directly while the barrier count is
// greater than 1, its pthread_barrier_wait() would never return - confirm.
//	pthread_create(&thr[i], NULL, bench_thread, &args[i]);
241
for (i = 0; i < num_devices; i++) {
243
//	pthread_join(thr[i], &res);
244
clReleaseKernel(kerns[i]);
245
clReleaseProgram(apps[i]);
246
clReleaseContext(ctxs[i]);
250
pthread_barrier_destroy(&bar);
252
// Phase 2: j starts at num_devices, so the loop body runs exactly once; the
// commented-out 2 looks like a former starting value.
for (j = /*2*/num_devices; j <= num_devices; j++) {
253
pthread_barrier_init(&bar, NULL, j);
255
// One context shared by the first j devices.
ctx = clCreateContext(0, j, devices, NULL, NULL, &err);
256
if (!ctx) fail(-1, "Failed to create OpenCL context");
258
app = load_app(ctx, j, devices, source);
259
if (!app) fail(-1, "Compilation failed");
261
kern = clCreateKernel(app, "test", &err);
262
if (err != CL_SUCCESS) fail(-1, "Error creating kernel");
265
// NOTE(review): j is a cl_uint, but %i expects an int.
printf("Running parallel for %i devices:\n", j);
266
for (i = 0; i < j; i++) {
268
// Only the device assignment is visible; ctx and kern are presumably set on
// omitted lines - TODO confirm.
args[i].device = devices[i];
270
// The pthread_create() return value is not checked.
pthread_create(&thr[i], NULL, bench_thread, &args[i]);
273
for (i = 0; i < j; i++) {
275
pthread_join(thr[i], &res);
280
pthread_barrier_destroy(&bar);
282
clReleaseKernel(kern);
283
clReleaseProgram(app);
284
clReleaseContext(ctx);
287
// Phase 3: barrier count 1, so the pthread_barrier_wait() inside bench()
// returns immediately for the single caller.
pthread_barrier_init(&bar, NULL, 1);
289
ctx = clCreateContext(0, num_devices, devices, NULL, NULL, &err);
290
if (!ctx) fail(-1, "Failed to create OpenCL context");
292
app = load_app(ctx, num_devices, devices, source);
293
if (!app) fail(-1, "Compilation failed");
295
kern = clCreateKernel(app, "test", &err);
296
if (err != CL_SUCCESS) fail(-1, "Error creating kernel");
298
printf("Running sequential (multi-devices context):\n");
299
// The devices are benchmarked one after another on the calling thread.
for (i = 0; i < num_devices; i++) {
300
bench(ctx, devices[i], kern);
303
clReleaseKernel(kern);
304
clReleaseProgram(app);
305
clReleaseContext(ctx);
307
pthread_barrier_destroy(&bar);