/* matmul.c */
/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2010,2011 University of Bordeaux
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <assert.h>
#include <math.h>
#include <sys/time.h>
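
/* check() evaluates an OpenCL call that returns a cl_int status directly;
 * check2() wraps a call that reports its status through an "err" variable
 * already declared in the enclosing scope. Both abort on failure. */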
#define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0)
#define check(exp) do { cl_int err = exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }} while(0)
#define check2(exp) exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }

#ifdef UNUSED
#elif defined(__GNUC__)
#define UNUSED(x) UNUSED_ ## x __attribute__((unused))
#else
#define UNUSED(x) x
#endif

// Thread block size
#define BLOCK_SIZE 16  // Kernel thread-block size
#define WORK_SIZE 64   // Kernel global size in lines of A (or C)
#define TYPE float

// Basic Matrix dimensions
#define WA (1024L * BLOCK_SIZE) // Matrix A width
#define HA (512L * BLOCK_SIZE)  // Matrix A height
#define WB (1024L * BLOCK_SIZE) // Matrix B width
#define HB WA  // Matrix B height
#define WC WB  // Matrix C width
#define HC HA  // Matrix C height

#define BLOCKS (HA / WORK_SIZE)
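/* The computation is split into BLOCKS horizontal bands of A and C; main()
 * dispatches the bands to the available devices in round-robin order. */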

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void printDiff(TYPE*, TYPE*, int, int, int, TYPE);
void computeGold(TYPE*, const TYPE*, const TYPE*, unsigned int, unsigned int, unsigned int);

#define str(x) #x
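
/* Two OpenCL kernel sources are embedded below as C string literals: CODE
 * (a tiled matrixMul plus an sgemmNN variant written around a saxpy helper)
 * and CODE2 (a bounds-checked tiled sgemmNN). Only the string assigned to
 * "code" after the literals is actually built and run. */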
#define CODE "\n\
#define BS 16\n\
\n\
__kernel void matrixMul(\n\
		const int N,\n\
		const int P,\n\
		const int M,\n\
		__global float* A,\n\
		__global float* B,\n\
		__global float* C) {\n\
	int row = get_global_id(1);\n\
	int col = get_global_id(0);\n\
	float sum = 0.0f;\n\
	float sum2 = 0.0f;\n\
	int x = get_local_id(0);\n\
	int y = get_local_id(1);\n\
	__local float atile[BS][BS+1];\n\
	__local float btile[BS][BS+1];\n\
	for (int t=0; t<N; t+=BS) {\n\
		atile[y][x] = A[t + row * BS + x];\n\
		btile[y][x] = B[(t + y) * N + col];\n\
		barrier(CLK_LOCAL_MEM_FENCE);\n\
		for (int i=0; i<BS; i++) {\n\
			sum += atile[y][i] * btile[i][x];\n\
		}\n\
		barrier(CLK_LOCAL_MEM_FENCE);\n\
	}\n\
	C[row*N+col] = sum + sum2;\n\
\n\
}\n\
\n\
__kernel void saxpy( float a, __local float *b, float *c )\n\
{\n\
	c[0] += a*b[0];\n\
	c[1] += a*b[1];\n\
	c[2] += a*b[2];\n\
	c[3] += a*b[3];\n\
	c[4] += a*b[4];\n\
	c[5] += a*b[5];\n\
	c[6] += a*b[6];\n\
	c[7] += a*b[7];\n\
	c[8] += a*b[8];\n\
	c[9] += a*b[9];\n\
	c[10] += a*b[10];\n\
	c[11] += a*b[11];\n\
	c[12] += a*b[12];\n\
	c[13] += a*b[13];\n\
	c[14] += a*b[14];\n\
	c[15] += a*b[15];\n\
}\n\
\n\
\n\
__kernel void sgemmNN( int lda, int ldb, int ldc, __global float *A, __global float *B, __global float* C)\n\
{\n\
	const int inx = get_local_id(0);\n\
	const int iny = get_local_id(1);\n\
	const int ibx = get_group_id(0) * 16;\n\
	const int iby = get_group_id(1) * 16;\n\
	const int id = inx + iny*16;\n\
	int i;\n\
\n\
	long Aidx = ibx + id;\n\
	long Bidx = inx + mul24( iby + iny, ldb );\n\
	long Cidx = ibx + id + mul24( iby, ldc );\n\
\n\
	long Blast = ldb;\n\
\n\
	float c[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n\
\n\
	do\n\
	{\n\
		float a[4] = { A[Aidx+0*lda], A[Aidx+1*lda], A[Aidx+2*lda], A[Aidx+3*lda] };\n\
\n\
		__local float bs[16][17];\n\
\n\
		bs[inx][iny]    = B[Bidx+0*ldb];\n\
		bs[inx][iny+4]  = B[Bidx+4*ldb];\n\
		bs[inx][iny+8]  = B[Bidx+8*ldb];\n\
		bs[inx][iny+12] = B[Bidx+12*ldb];\n\
		Bidx += 16;\n\
		barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
		Aidx += 4*lda;\n\
		saxpy( a[0], &bs[0][0], c ); a[0] = A[Aidx+0*lda];\n\
		saxpy( a[1], &bs[1][0], c ); a[1] = A[Aidx+1*lda];\n\
		saxpy( a[2], &bs[2][0], c ); a[2] = A[Aidx+2*lda];\n\
		saxpy( a[3], &bs[3][0], c ); a[3] = A[Aidx+3*lda];\n\
	} while( Bidx < Blast );\n\
\n\
	for(i = 0; i < 16; i++, Cidx += ldc )\n\
		C[Cidx] = c[i];\n\
\n\
}"
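
/* CODE2 is the kernel actually used below: each work-item computes one element
 * of C, staging 16x16 tiles of A and B in local memory and guarding its loads
 * and stores so partial tiles at the matrix edges are handled correctly. */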
#define CODE2 "\
#define TYPE float\n\
__kernel void sgemmNN(int wa, int ha, int wb, __global TYPE* A, __global TYPE* B, __global TYPE* C) {\n\
#define BS 16\n\
#define BLOCK_SIZE 16\n\
	int bx = get_group_id(0);\n\
	int by = get_group_id(1);\n\
\n\
	int tx = get_local_id(0);\n\
	int ty = get_local_id(1);\n\
\n\
	int gx = get_global_id(0);\n\
	int gy = get_global_id(1);\n\
	__local float As[BS][BS+1];\n\
	__local float Bs[BS][BS+1];\n\
\n\
	unsigned int block_w = min(wb - bx * BLOCK_SIZE, BLOCK_SIZE);\n\
	unsigned int block_h = min(ha - by * BLOCK_SIZE, BLOCK_SIZE);\n\
\n\
	int valid = (gx < wb && gy < ha);\n\
\n\
	TYPE Csub = (TYPE)0.0;\n\
\n\
	int pos = 0;\n\
	while (pos < wa) {\n\
		unsigned int size = min(wa-pos, BLOCK_SIZE);\n\
		if (tx < size && gy < ha)\n\
			As[tx][ty] = A[pos + tx + wa * gy];\n\
		if (ty < size && gx < wb)\n\
			Bs[tx][ty] = B[gx + wb * (pos+ty)];\n\
\n\
		barrier(CLK_LOCAL_MEM_FENCE);\n\
\n\
		if (valid) {\n\
			for (int k = 0; k < size; ++k)\n\
				Csub += As[k][ty] * Bs[tx][k];\n\
		}\n\
		pos += size;\n\
		barrier(CLK_LOCAL_MEM_FENCE);\n\
	}\n\
\n\
	if (valid)\n\
		C[wb * gy + gx] = Csub;\n\
}"

static char * code = CODE2;

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
double executionTime(cl_event event)
{
	cl_ulong start, end;
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
	return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
}

#define shrLog(...) fprintf(stderr, __VA_ARGS__);

// Round Up Division function
size_t shrRoundUp(int group_size, int global_size)
{
	int r = global_size % group_size;
	if(r == 0)
	{
		return global_size;
	} else
	{
		return global_size + group_size - r;
	}
}

// Helper function to init data arrays
// *********************************************************************
void fillArray(TYPE* pfData, int iSize)
{
	int i;
	const TYPE fScale = (TYPE)(1.0f / (float)RAND_MAX);
	for (i = 0; i < iSize; ++i)
	{
		pfData[i] = fScale * rand();
	}
}

// Helper function to print data arrays
// *********************************************************************
void shrPrintArray(float* pfData, int iSize)
{
	int i;
	for (i = 0; i < iSize; ++i)
	{
		shrLog("%d: %.3f\n", i, pfData[i]);
	}
}

////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
int shrCompareL2fe( const float* reference, const float* data,
		const unsigned int len, const float epsilon )
{
	assert(epsilon >= 0);
	float error = 0;
	float ref = 0;
	unsigned int i;

	for(i = 0; i < len; ++i) {
		float diff = reference[i] - data[i];
		error += diff * diff;
		ref += reference[i] * reference[i];
	}

	float normRef = sqrtf(ref);
	if (fabs(ref) < 1e-7) {
#ifdef _DEBUG
		fprintf(stderr, "ERROR, reference l2-norm is 0\n");
#endif
		return 0;
	}
	float normError = sqrtf(error);
	error = normError / normRef;
	int result = error < epsilon;
#ifdef _DEBUG
	if( ! result)
	{
		fprintf(stderr, "ERROR, l2-norm error %f is greater than epsilon %f\n", error, epsilon);
	}
#endif
	return result;
}
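
/* Host driver: discover every OpenCL platform and device, build the kernel on
 * each platform, split C into row blocks and launch one kernel per block
 * across the devices, then (with -DCHECK) compare against a CPU reference. */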
int main(int UNUSED(argc), const char** UNUSED(argv))
{
	cl_uint platform_count;
	cl_platform_id platforms[5];
	cl_int err = CL_SUCCESS;
	unsigned int i, p;

	cl_device_type dev_type = CL_DEVICE_TYPE_ALL;

	/* Get platforms */
	check(clGetPlatformIDs(5, platforms, &platform_count));
	if (platform_count == 0)
		error("No platform found\n");

	cl_uint device_count;
	cl_uint devs[platform_count];
	cl_device_id * devices[platform_count];
	cl_context ctx[platform_count];
	cl_command_queue * commandQueue[platform_count];

	device_count = 0;
	for (p=0; p<platform_count; p++) {
		cl_platform_id platform = platforms[p];

		cl_int err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
		if (err == CL_DEVICE_NOT_FOUND) {
			devs[p] = 0;
			continue;
		}
		check(err);
		if (devs[p] == 0)
			continue;

		devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
		commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);

		check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));

		cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
		check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));

		for(i = 0; i < devs[p]; ++i)
		{
			// get and print the device for this queue
			cl_device_id device = devices[p][i];
			char name[2048];
			name[0] = '\0';
			clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
			printf("Device %d: %s\n", i, name);

			// create command queue
			check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
		}

		device_count += devs[p];
	}

	if (device_count == 0)
		error("No device found\n");

	cl_kernel multiplicationKernel[platform_count];

	// Display the matrix sizes in use
	printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n",
			(unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);

	// allocate host memory for matrices A, B and C
	size_t A_size = WA * HA;
	size_t A_mem_size = sizeof(TYPE) * A_size;
	TYPE* A_data = (TYPE*)malloc(A_mem_size);
	if (A_data == NULL) {
		perror("malloc");
	}

	size_t B_size = WB * HB;
	size_t B_mem_size = sizeof(TYPE) * B_size;
	TYPE* B_data = (TYPE*)malloc(B_mem_size);
	if (B_data == NULL) {
		perror("malloc");
	}

	size_t C_size = WC * HC;
	size_t C_mem_size = sizeof(TYPE) * C_size;
	TYPE* C_data = (TYPE*) malloc(C_mem_size);
	if (C_data == NULL) {
		perror("malloc");
	}

	cl_program program[platform_count];

	for (p=0; p<platform_count; p++) {
		if (devs[p] == 0)
			continue;

		check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));
		check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));

		// Create Kernel
		check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
	}

	// initialize host memory
	printf("Initializing data...\n");
	srand(2008);
	fillArray(A_data, A_size);
	fillArray(B_data, B_size);
	memset(C_data, 0, C_mem_size); /* clear the whole C buffer (size in bytes) */

	printf("Computing...\n");

	// Run multiplication on 1..deviceCount GPUs to compare improvement
	printf("\nRunning Computations on 1 - %d GPU's...\n\n", device_count);

	{
		cl_mem d_A[BLOCKS];
		cl_mem d_C[BLOCKS];
		void * ptrs[BLOCKS];
		cl_mem d_B[BLOCKS];

		cl_event GPUDone[BLOCKS];
		cl_event GPUExecution[BLOCKS];

		unsigned int sizePerGPU = HC / BLOCKS;
		unsigned int sizeMod = HC % BLOCKS;
		if (HC < BLOCKS) {
			printf("Not enough data to split into %ld blocks. Test skipped\n", (long)BLOCKS);
			exit(0);
		}

		int workOffset[BLOCKS];
		int workSize[BLOCKS];
		workOffset[0] = 0;

		struct timeval start, end;
		gettimeofday(&start, NULL);

		size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
		int c = 0;

		// Each device gets its own buffer object for the full B matrix,
		// created over the same host array
		for (p=0; p<platform_count; p++) {
			for (i=0; i<devs[p]; i++) {
				check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
				c++;
			}
		}
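
		// Dispatch one kernel launch per row block of C, assigning blocks to
		// devices in round-robin order. Each block's slices of A and C are
		// buffers created with CL_MEM_USE_HOST_PTR over the host arrays.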
		unsigned int i;
		for(i=0; i < BLOCKS; ++i)
		{
			int d = i % device_count;
			cl_uint p = 0;

			// determine device platform
			int dev = d;
			for (p = 0; p < platform_count; p++) {
				if ((cl_int)(dev - devs[p]) < 0)
					break;
				dev -= devs[p];
			}

			// Input buffer
			workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;

			check2(d_A[i] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
			check2(d_C[i] = clCreateBuffer(ctx[p], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));

			// set the args values: the kernel expects (wa, ha, wb), where ha is
			// the number of rows of A/C handled by this block
			cl_int wa = WA;
			cl_int wb = WB;
			check(clSetKernelArg(multiplicationKernel[p], 0, sizeof(cl_int), &wa));
			check(clSetKernelArg(multiplicationKernel[p], 1, sizeof(cl_int), &workSize[i]));
			check(clSetKernelArg(multiplicationKernel[p], 2, sizeof(cl_int), &wb));
			check(clSetKernelArg(multiplicationKernel[p], 3, sizeof(cl_mem), (void *) &d_A[i]));
			check(clSetKernelArg(multiplicationKernel[p], 4, sizeof(cl_mem), (void *) &d_B[d]));
			check(clSetKernelArg(multiplicationKernel[p], 5, sizeof(cl_mem), (void *) &d_C[i]));

			// Multiplication - non-blocking execution: launch and push to device(s)
			size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE, WC), shrRoundUp(BLOCK_SIZE, workSize[i])};
			check(clEnqueueNDRangeKernel(commandQueue[p][dev], multiplicationKernel[p], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));

			// Non-blocking copy of result from device to host
			//check2(ptrs[i] = clEnqueueMapBuffer(commandQueue[p][dev], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));

			if (i+1 < BLOCKS)
				workOffset[i + 1] = workOffset[i] + workSize[i];
		}

		// CPU sync with GPU
		for (p=0; p<platform_count; p++) {
			cl_uint dev;
			for (dev=0; dev<devs[p]; dev++) {
				clFinish(commandQueue[p][dev]);
			}
		}

		gettimeofday(&end, NULL);
		double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));

		//double dSeconds = shrDeltaT(0)/(double)nIter;
		double dSeconds = timing/1000/1000;
		double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
		double gflops = 1.0e-9 * dNumOps/dSeconds;

#ifdef SHORT_LOG
		printf("%f\n", timing/1000.0);
#else
		printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n",
				gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);

		// Print kernel timing per GPU
		/*for(i = 0; i < k; i++)
			printf(" Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i]));
		printf("\n");*/
#endif

		for (i=0; i<device_count; i++) {
			clReleaseMemObject(d_B[i]);
		}

		// Release mem and event objects
		for(i = 0; i < BLOCKS; i++)
		{
			clReleaseMemObject(d_A[i]);
			clReleaseMemObject(d_C[i]);
			clReleaseEvent(GPUExecution[i]);
			//clReleaseEvent(GPUDone[i]);
		}
	}

	// compute reference solution
#ifdef CHECK
	printf("Comparing results with CPU computation... ");
	TYPE* reference = (TYPE*)malloc(C_mem_size);
	computeGold(reference, A_data, B_data, HA, WA, WB);

	// check result
	int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
	if (res == 0)
	{
		printf("\n\n");
		printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
	}
	else printf("PASSED\n\n");
	free(reference);
#endif

	// clean up OCL resources
	for (p=0; p<platform_count; p++) {
		if (devs[p] == 0)
			continue;

		check(clReleaseKernel(multiplicationKernel[p]));
		check(clReleaseProgram(program[p]));
		check(clReleaseContext(ctx[p]));
		cl_uint k;
		for(k = 0; k < devs[p]; ++k)
		{
			check(clReleaseCommandQueue(commandQueue[p][k]));
		}
	}

	// clean up memory
	free(A_data);
	free(B_data);
	free(C_data);

	return 0;
}
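
////////////////////////////////////////////////////////////////////////////////
//! List up to iListLength element-wise differences between data1 and data2
//! that exceed fListTol, and report the total number of differing elements
////////////////////////////////////////////////////////////////////////////////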
void printDiff(TYPE *data1, TYPE *data2, int width, int height, int iListLength, TYPE fListTol)
{
	shrLog("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
	int i, j, k;
	int error_count = 0;
	for (j = 0; j < height; j++)
	{
		if (error_count < iListLength)
		{
			shrLog("\n Row %d:\n", j);
		}
		for (i = 0; i < width; i++)
		{
			k = j * width + i;
			float fDiff = fabs(data1[k] - data2[k]);
			if (fDiff > fListTol)
			{
				if (error_count < iListLength)
				{
					shrLog(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
				}
				error_count++;
			}
		}
	}
	shrLog(" \n Total Errors = %d\n\n", error_count);
}

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
//! C = A * B
//! @param C reference data, computed but preallocated
//! @param A matrix A as provided to device
//! @param B matrix B as provided to device
//! @param hA height of matrix A
//! @param wA width of matrix A
//! @param wB width of matrix B
////////////////////////////////////////////////////////////////////////////////
void
computeGold(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, unsigned int wA, unsigned int wB)
{
	unsigned int i, j, k;
	for (i = 0; i < hA; ++i)
		for (j = 0; j < wB; ++j) {
			double sum = 0;
			for (k = 0; k < wA; ++k) {
				double a = A[i * wA + k];
				double b = B[k * wB + j];
				sum += a * b;
			}
			C[i * wB + j] = (TYPE)sum;
		}
}