matmul.c

/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2012,2017 Inria
 * Copyright (C) 2012,2013,2015-2018 CNRS
 * Copyright (C) 2010-2013,2015,2017 Université de Bordeaux
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */
#ifndef STARPU_NON_BLOCKING_DRIVERS
int main(void)
{
	/* testcase does not seem to support blocking drivers */
	return 77;
}
#else

#ifdef __APPLE_CC__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>

#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
#define check(exp) do { err = exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }} while(0)
#define check2(exp) do { exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); } } while(0)
#define check3(exp, err) do { if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); } } while(0)
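/*
 * Error-handling helpers: check() wraps OpenCL calls that return a cl_int
 * status, check2() wraps calls that return a value and report their status
 * through the surrounding `err` variable, and check3() tests an
 * already-collected status against a caller-supplied label.
 */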
// Kernel work-group (thread block) size
#define BLOCK_SIZE 16
// Kernel global size in lines of A (or C)
#define WORK_SIZE 64

#define TYPE float

// Basic matrix dimensions
#define WA (128L * BLOCK_SIZE) // Matrix A width
#ifdef STARPU_QUICK_CHECK
#define HA (128L * BLOCK_SIZE) // Matrix A height
#else
#define HA (512L * BLOCK_SIZE) // Matrix A height
#endif
#define WB (128L * BLOCK_SIZE) // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height

#define BLOCKS (HA / WORK_SIZE)
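/*
 * Work decomposition: C (HC x WC) is computed in BLOCKS horizontal bands of
 * roughly HC/BLOCKS rows each. Band i is dispatched round-robin to device
 * (i % device_count); every device keeps its own full copy of B, while A and
 * C are sliced per band (see the main dispatch loop below).
 */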
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void printDiff(TYPE*, TYPE*, int, int, int, TYPE);
void computeReference(TYPE*, const TYPE*, const TYPE*, unsigned int, unsigned int, unsigned int);

#define str(x) #x

#define CODE "\
#define TYPE float\n\
__kernel void sgemmNN(int wa, int ha, int wb, __global TYPE* A, __global TYPE* B, __global TYPE* C) {\n\
#define BS 16\n\
#define BLOCK_SIZE 16\n\
	int bx = get_group_id(0);\n\
	int by = get_group_id(1);\n\
\n\
	int tx = get_local_id(0);\n\
	int ty = get_local_id(1);\n\
\n\
	int gx = get_global_id(0);\n\
	int gy = get_global_id(1);\n\
\n\
	__local float As[BS][BS+1];\n\
	__local float Bs[BS][BS+1];\n\
\n\
	unsigned int block_w = min(wb - bx * BLOCK_SIZE, BLOCK_SIZE);\n\
	unsigned int block_h = min(ha - by * BLOCK_SIZE, BLOCK_SIZE);\n\
\n\
	int valid = (gx < wb && gy < ha);\n\
\n\
	TYPE Csub = (TYPE)0.0;\n\
\n\
	int pos = 0;\n\
	while (pos < wa) {\n\
		unsigned int size = min(wa-pos, BLOCK_SIZE);\n\
		if (tx < size && gy < ha)\n\
			As[tx][ty] = A[pos + tx + wa * gy];\n\
		if (ty < size && gx < wb)\n\
			Bs[tx][ty] = B[gx + wb * (pos+ty)];\n\
\n\
		barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
		if (valid) {\n\
			for (int k = 0; k < size; ++k)\n\
				Csub += As[k][ty] * Bs[tx][k];\n\
		}\n\
		pos += size;\n\
		barrier(CLK_LOCAL_MEM_FENCE);\n\
	}\n\
\n\
	if (valid)\n\
		C[wb * gy + gx] = Csub;\n\
}"
static char * code = CODE;

int check = 0;

static void __attribute__((unused)) parse_args(int argc, const char **argv)
{
	int i;
	for (i = 1; i < argc; i++)
	{
		if (strcmp(argv[i], "-check") == 0)
		{
			check = 1;
		}
		if (strcmp(argv[i], "-h") == 0)
		{
			printf("usage: %s [-check]\n", argv[0]);
		}
	}
}
// Round up global_size to the next multiple of group_size
size_t roundUp(int group_size, int global_size)
{
	int r = global_size % group_size;
	if (r == 0)
	{
		return global_size;
	}
	else
	{
		return global_size + group_size - r;
	}
}
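/*
 * Example: roundUp(16, 100) == 112. NDRange global sizes must be a multiple
 * of the local size, so the launch below pads them and relies on the
 * kernel's `valid` mask to ignore the extra work-items.
 */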
void fillArray(TYPE* data, int size)
{
	int i;
	const TYPE fScale = (TYPE)(1.0f / (float)RAND_MAX);
	for (i = 0; i < size; ++i)
	{
		data[i] = fScale * rand();
	}
}

void printArray(float* data, int size)
{
	int i;
	for (i = 0; i < size; ++i)
	{
		printf("%d: %.3f\n", i, data[i]);
	}
}
/**
 * Compare two float arrays using the relative L2 norm with an epsilon
 * tolerance for equality.
 * @return 1 if \a reference and \a data match, 0 otherwise
 * @param reference handle to the reference data / gold image
 * @param data handle to the computed data
 * @param len number of elements in reference and data
 * @param epsilon epsilon to use for the comparison
 */
int shrCompareL2fe(const float* reference, const float* data, const unsigned int len, const float epsilon)
{
	assert(epsilon >= 0);
	float error = 0;
	float ref = 0;
	unsigned int i;
	for (i = 0; i < len; ++i)
	{
		float diff = reference[i] - data[i];
		error += diff * diff;
		ref += reference[i] * reference[i];
	}
	float normRef = sqrtf(ref);
	if (fabsf(ref) < 1e-7f)
	{
#ifdef _DEBUG
		fprintf(stderr, "ERROR, reference l2-norm is 0\n");
#endif
		return 0;
	}
	float normError = sqrtf(error);
	error = normError / normRef;
	int result = error < epsilon;
#ifdef _DEBUG
	if (!result)
	{
		fprintf(stderr, "ERROR, l2-norm error %f is greater than epsilon %f\n", error, epsilon);
	}
#endif
	return result;
}
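/*
 * The test therefore passes when ||reference - data||_2 / ||reference||_2
 * < epsilon, i.e. the error is measured relative to the magnitude of the
 * reference result rather than element by element.
 */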
int main(int argc, const char** argv)
{
	cl_uint platform_count;
	cl_platform_id platforms[5];
	cl_int err = CL_SUCCESS;
	unsigned int i, p;
	cl_device_type dev_type = CL_DEVICE_TYPE_ALL;

	void * ptrs[BLOCKS];
	cl_command_queue cqs[BLOCKS];
	cl_mem d_A[BLOCKS];
	cl_mem d_C[BLOCKS];
	cl_mem d_B[BLOCKS];

	cl_event GPUDone[BLOCKS];
	cl_event GPUExecution[BLOCKS];
	struct timeval start, end;

	int workOffset[BLOCKS];
	int workSize[BLOCKS];

	unsigned int sizePerGPU = HC / BLOCKS;
	unsigned int sizeMod = HC % BLOCKS;

	size_t A_size = WA * HA;
	size_t A_mem_size = sizeof(TYPE) * A_size;
	TYPE* A_data;

	size_t B_size = WB * HB;
	size_t B_mem_size = sizeof(TYPE) * B_size;
	TYPE* B_data;

	size_t C_size = WC * HC;
	size_t C_mem_size = sizeof(TYPE) * C_size;
	TYPE* C_data;

	parse_args(argc, argv);

	check(clGetPlatformIDs(5, platforms, &platform_count));
	if (platform_count == 0)
	{
		printf("No platform found\n");
		exit(77);
	}
	cl_uint device_count;
	cl_uint devs[platform_count];
	cl_device_id * devices[platform_count];
	cl_context ctx[platform_count];
	cl_command_queue * commandQueue[platform_count];

	device_count = 0;
	for (p = 0; p < platform_count; p++)
	{
		cl_platform_id platform = platforms[p];

		err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
		if (err == CL_DEVICE_NOT_FOUND)
		{
			devs[p] = 0;
			continue;
		}
		if (devs[p] == 0)
		{
			printf("No OpenCL device found\n");
			exit(77);
		}
		if (err != CL_SUCCESS)
		{
			fprintf(stderr, "OpenCL Error (%d) in clGetDeviceIDs()\n", err);
			exit(EXIT_FAILURE);
		}

		devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
		commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);

		check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));

		cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
		check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));

		for (i = 0; i < devs[p]; ++i)
		{
			cl_device_id device = devices[p][i];
			char name[2048];
			name[0] = '\0';
			clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
			printf("Device %u: %s\n", i, name);

			commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
			if (err == CL_INVALID_VALUE)
			{
				fprintf(stderr, "Invalid property for clCreateCommandQueue\n");
				exit(77);
			}
			check3("clCreateCommandQueue", err);
		}

		device_count += devs[p];
	}
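	/*
	 * At this point each platform that exposed devices has one context
	 * shared by all of its devices and one profiling-enabled, out-of-order
	 * command queue per device; device_count is the total across platforms.
	 */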
	if (device_count == 0)
		error("No device found\n");

	cl_kernel multiplicationKernel[platform_count];

	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n",
	       (unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);

	// allocate host memory for matrices A, B and C
	A_data = (TYPE*)malloc(A_mem_size);
	if (A_data == NULL)
	{
		perror("malloc");
		exit(EXIT_FAILURE);
	}

	B_data = (TYPE*)malloc(B_mem_size);
	if (B_data == NULL)
	{
		perror("malloc");
		exit(EXIT_FAILURE);
	}

	C_data = (TYPE*)malloc(C_mem_size);
	if (C_data == NULL)
	{
		perror("malloc");
		exit(EXIT_FAILURE);
	}

	cl_program program[platform_count];

	for (p = 0; p < platform_count; p++)
	{
		if (devs[p] == 0)
			continue;

		check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));
		check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));
		check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
	}
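	/*
	 * The sgemmNN kernel is built once per platform from the embedded CODE
	 * string; clBuildProgram() with a NULL device list compiles it for every
	 * device associated with the platform's context.
	 */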
	printf("Initializing data...\n");
	srand(2008);
	fillArray(A_data, A_size);
	fillArray(B_data, B_size);
	memset(C_data, 0, C_mem_size); /* size in bytes, not elements */

	printf("Computing...\n");
	workOffset[0] = 0;
	gettimeofday(&start, NULL);

	size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
	int c = 0;
	for (p = 0; p < platform_count; p++)
	{
		for (i = 0; i < devs[p]; i++)
		{
			check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
			c++;
		}
	}
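	/*
	 * B is read in full by every band, so one read-only buffer per device is
	 * created up front. CL_MEM_USE_HOST_PTR lets the implementation work
	 * directly from (or cache) the existing host allocation instead of
	 * requiring an explicit clEnqueueWriteBuffer().
	 */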
	/* Kernel arguments wa/ha/wb: width of A, rows of this band, width of B */
	cl_int wa = WA;
	cl_int wb = WB;
	for (i = 0; i < BLOCKS; ++i)
	{
		int d = i % device_count;
		cl_uint platform = 0;

		// determine device platform
		int dev = d;
		for (platform = 0; platform < platform_count; platform++)
		{
			if ((cl_int)(dev - devs[platform]) < 0)
				break;
			dev -= devs[platform];
		}
		assert(platform < platform_count);

		workSize[i] = (i < sizeMod) ? sizePerGPU + 1 : sizePerGPU;

		check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
		check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));

		check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &wa));
		check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &wb));
		check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i]));
		check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d]));
		check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i]));

		size_t globalWorkSize[] = {roundUp(BLOCK_SIZE, WC), roundUp(BLOCK_SIZE, workSize[i])};

		check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));

		// Non-blocking map of the result from device to host
		cqs[i] = commandQueue[platform][dev];
		check2(ptrs[i] = clEnqueueMapBuffer(cqs[i], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));

		if (i + 1 < BLOCKS)
			workOffset[i + 1] = workOffset[i] + workSize[i];
	}
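	/*
	 * Each band aliases the host arrays at its row offset through
	 * CL_MEM_USE_HOST_PTR, so no explicit transfers are queued: the kernel
	 * launch and the non-blocking clEnqueueMapBuffer() that waits on it
	 * (via GPUExecution[i]) are simply chained, and the mapped pointer is
	 * valid once GPUDone[i] completes.
	 */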
	// CPU sync with GPU
	for (p = 0; p < platform_count; p++)
	{
		cl_uint dev;
		for (dev = 0; dev < devs[p]; dev++)
		{
			clFinish(commandQueue[p][dev]);
		}
	}

	gettimeofday(&end, NULL);
	double timing = (double)((end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec));
	double dSeconds = timing / 1000 / 1000;
	double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
	double gflops = 1.0e-9 * dNumOps / dSeconds;

	printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %u, Blocks = %ld, Workgroup = %zu\n",
	       gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);
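	/*
	 * Operation count: every element of the HA x WB result takes WA
	 * multiply-add pairs, hence dNumOps = 2 * WA * HA * WB flops and
	 * GFlop/s = 1e-9 * dNumOps / elapsed seconds.
	 */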
	// compute reference solution
	if (check)
	{
		printf("Comparing results with CPU computation... ");
		TYPE* reference = (TYPE*)malloc(C_mem_size);
		computeReference(reference, A_data, B_data, HA, WA, WB);

		// check result
		int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
		if (res == 0)
		{
			printf("\n\n");
			printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
		}
		else
			printf("PASSED\n\n");
		free(reference);
	}

	for (i = 0; i < BLOCKS; i++)
	{
		clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL);
	}

	for (i = 0; i < BLOCKS; i++)
	{
		clFinish(cqs[i]);
	}

	for (i = 0; i < device_count; i++)
	{
		clReleaseMemObject(d_B[i]);
	}

	for (i = 0; i < BLOCKS; i++)
	{
		clReleaseMemObject(d_A[i]);
		clReleaseMemObject(d_C[i]);
		clReleaseEvent(GPUExecution[i]);
		clReleaseEvent(GPUDone[i]);
	}

	for (p = 0; p < platform_count; p++)
	{
		if (devs[p] == 0)
			continue;

		check(clReleaseKernel(multiplicationKernel[p]));
		check(clReleaseProgram(program[p]));
		check(clReleaseContext(ctx[p]));

		cl_uint k;
		for (k = 0; k < devs[p]; ++k)
		{
			check(clReleaseCommandQueue(commandQueue[p][k]));
		}
	}

	free(A_data);
	free(B_data);
	free(C_data);

	return 0;
}
void printDiff(TYPE *data1, TYPE *data2, int width, int height, int listLength, TYPE listTol)
{
	printf("Listing first %d Differences > %.6f...\n", listLength, listTol);
	int i, j, k;
	int error_count = 0;
	for (j = 0; j < height; j++)
	{
		if (error_count < listLength)
		{
			printf("\n Row %d:\n", j);
		}
		for (i = 0; i < width; i++)
		{
			k = j * width + i;
			float diff = fabsf(data1[k] - data2[k]);
			if (diff > listTol)
			{
				if (error_count < listLength)
				{
					printf(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], diff);
				}
				error_count++;
			}
		}
	}
	printf(" \n Total Errors = %d\n\n", error_count);
}
/**
 * Compute reference data set
 * C = A * B
 * @param C reference data, computed but preallocated
 * @param A matrix A as provided to device
 * @param B matrix B as provided to device
 * @param hA height of matrix A
 * @param wA width of matrix A
 * @param wB width of matrix B
 */
void computeReference(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, unsigned int wA, unsigned int wB)
{
	unsigned int i, j, k;
	for (i = 0; i < hA; ++i)
		for (j = 0; j < wB; ++j)
		{
			// accumulate in double to get a trustworthy reference value
			double sum = 0;
			for (k = 0; k < wA; ++k)
			{
				double a = A[i * wA + k];
				double b = B[k * wB + j];
				sum += a * b;
			}
			C[i * wB + j] = (TYPE)sum;
		}
}

#endif /* STARPU_NON_BLOCKING_DRIVERS */