|
@@ -28,13 +28,6 @@
|
|
|
#define check(exp) do { cl_int err = exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }} while(0)
|
|
|
#define check2(exp) exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }
|
|
|
|
|
|
-#ifdef UNUSED
|
|
|
-#elif defined(__GNUC__)
|
|
|
-#define UNUSED(x) UNUSED_ ## x __attribute__((unused))
|
|
|
-#else
|
|
|
-#define UNUSED(x) x
|
|
|
-#endif
|
|
|
-
|
|
|
|
|
|
#define BLOCK_SIZE 16
|
|
|
#define WORK_SIZE 64
|
|
@@ -52,105 +45,11 @@
|
|
|
|
|
|
|
|
|
void printDiff(TYPE*, TYPE*, int, int, int, TYPE);
|
|
|
-void computeGold(TYPE*, const TYPE*, const TYPE*, unsigned int, unsigned int, unsigned int);
|
|
|
+void computeReference(TYPE*, const TYPE*, const TYPE*, unsigned int, unsigned int, unsigned int);
|
|
|
|
|
|
#define str(x) #x
|
|
|
|
|
|
-#define CODE "\n\
|
|
|
-#define BS 16\n\
|
|
|
-\n\
|
|
|
-__kernel void matrixMul(\n\
|
|
|
- const int N,\n\
|
|
|
- const int P,\n\
|
|
|
- const int M,\n\
|
|
|
- __global float* A,\n\
|
|
|
- __global float* B, \n\
|
|
|
- __global float* C) {\n\
|
|
|
- int row = get_global_id(1); \n\
|
|
|
- int col = get_global_id(0); \n\
|
|
|
- float sum = 0.0f;\n\
|
|
|
- float sum2 = 0.0f;\n\
|
|
|
- int x = get_local_id(0);\n\
|
|
|
- int y = get_local_id(1);\n\
|
|
|
- __local float atile[BS][BS+1];\n\
|
|
|
- __local float btile[BS][BS+1];\n\
|
|
|
- for (int t=0; t<N; t+=BS) {\n\
|
|
|
- atile[y][x] = A[t + row * BS + x] ;\n\
|
|
|
- btile[y][x] = B[(t + y) *N + col];\n\
|
|
|
- barrier(CLK_LOCAL_MEM_FENCE);\n\
|
|
|
- for (int i=0; i<BS; i++) {\n\
|
|
|
- sum += atile[y][i] * btile[i][x];\n\
|
|
|
- } \n\
|
|
|
- barrier(CLK_LOCAL_MEM_FENCE);\n\
|
|
|
- }\n\
|
|
|
- C[row*N+col] = sum + sum2;\n\
|
|
|
-\n\
|
|
|
-}\n\
|
|
|
-\n\
|
|
|
-__kernel void saxpy( float a, __local float *b, float *c )\n\
|
|
|
-{\n\
|
|
|
- c[0] += a*b[0];\n\
|
|
|
- c[1] += a*b[1];\n\
|
|
|
- c[2] += a*b[2];\n\
|
|
|
- c[3] += a*b[3];\n\
|
|
|
- c[4] += a*b[4];\n\
|
|
|
- c[5] += a*b[5];\n\
|
|
|
- c[6] += a*b[6];\n\
|
|
|
- c[7] += a*b[7];\n\
|
|
|
- c[8] += a*b[8];\n\
|
|
|
- c[9] += a*b[9];\n\
|
|
|
- c[10] += a*b[10];\n\
|
|
|
- c[11] += a*b[11];\n\
|
|
|
- c[12] += a*b[12];\n\
|
|
|
- c[13] += a*b[13];\n\
|
|
|
- c[14] += a*b[14];\n\
|
|
|
- c[15] += a*b[15];\n\
|
|
|
-}\n\
|
|
|
-\n\
|
|
|
-\n\
|
|
|
-__kernel void sgemmNN( int lda,int ldb, int ldc, __global float *A, __global float *B, __global float* C)\n\
|
|
|
-{\n\
|
|
|
- const int inx = get_local_id(0);\n\
|
|
|
- const int iny = get_local_id(1);\n\
|
|
|
- const int ibx = get_group_id(0) * 16;\n\
|
|
|
- const int iby = get_group_id(1) * 16;\n\
|
|
|
- const int id = inx + iny*16;\n\
|
|
|
- int i;\n\
|
|
|
- \n\
|
|
|
- long Aidx = ibx + id;\n\
|
|
|
- long Bidx = inx + mul24( iby + iny, ldb );\n\
|
|
|
- long Cidx = ibx + id + mul24( iby, ldc );\n\
|
|
|
- \n\
|
|
|
- long Blast = ldb;\n\
|
|
|
- \n\
|
|
|
- float c[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};\n\
|
|
|
- \n\
|
|
|
- do\n\
|
|
|
- {\n\
|
|
|
- float a[4] = { A[Aidx+0*lda], A[Aidx+1*lda], A[Aidx+2*lda], A[Aidx+3*lda] };\n\
|
|
|
-\n\
|
|
|
- __local float bs[16][17];\n\
|
|
|
-\n\
|
|
|
- bs[inx][iny] = B[Bidx+0*ldb];\n\
|
|
|
- bs[inx][iny+4] = B[Bidx+4*ldb];\n\
|
|
|
- bs[inx][iny+8] = B[Bidx+8*ldb];\n\
|
|
|
- bs[inx][iny+12] = B[Bidx+12*ldb];\n\
|
|
|
- Bidx+= 16;\n\
|
|
|
- barrier(CLK_LOCAL_MEM_FENCE);\n\
|
|
|
-\n\
|
|
|
- Aidx += 4*lda;\n\
|
|
|
- saxpy( a[0], &bs[0][0], c ); a[0] = A[Aidx+0*lda];\n\
|
|
|
- saxpy( a[1], &bs[1][0], c ); a[1] = A[Aidx+1*lda];\n\
|
|
|
- saxpy( a[2], &bs[2][0], c ); a[2] = A[Aidx+2*lda];\n\
|
|
|
- saxpy( a[3], &bs[3][0], c ); a[3] = A[Aidx+3*lda];\n\
|
|
|
- } while( Bidx < Blast );\n\
|
|
|
-\n\
|
|
|
- for(i = 0; i < 16; i++, Cidx += ldc )\n\
|
|
|
- C[Cidx] = c[i];\n\
|
|
|
-\n\
|
|
|
-}"
|
|
|
-
|
|
|
-#define CODE2 "\
|
|
|
+#define CODE "\
|
|
|
#define TYPE float\n\
|
|
|
__kernel void sgemmNN(int wa, int ha, int wb, __global TYPE* A, __global TYPE* B, __global TYPE* C) {\n\
|
|
|
#define BS 16\n\
|
|
@@ -195,432 +94,384 @@ __kernel void sgemmNN(int wa, int ha, int wb, __global TYPE* A, __global TYPE*
|
|
|
C[wb * gy + gx] = Csub;\n\
|
|
|
}"
|
|
|
|
|
|
-static char * code = CODE2;
|
|
|
+static char * code = CODE;
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
+int check = 0;
|
|
|
|
|
|
-double executionTime(cl_event event)
|
|
|
+static void __attribute__((unused)) parse_args(int argc, char **argv)
|
|
|
{
|
|
|
- cl_ulong start, end;
|
|
|
-
|
|
|
- clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
|
|
|
- clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
|
|
|
-
|
|
|
- return (double)1.0e-9 * (end - start);
|
|
|
+ int i;
|
|
|
+ for (i = 1; i < argc; i++)
|
|
|
+ {
|
|
|
+ if (strcmp(argv[i], "-check") == 0)
|
|
|
+ {
|
|
|
+ check = 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (strcmp(argv[i], "-h") == 0)
|
|
|
+ {
|
|
|
+ printf("usage : %s [-check]\n", argv[0]);
|
|
|
+ }
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
#define shrLog(...) fprintf(stderr, __VA_ARGS__);
|
|
|
|
|
|
|
|
|
-size_t shrRoundUp(int group_size, int global_size)
|
|
|
-{
|
|
|
- int r = global_size % group_size;
|
|
|
- if(r == 0)
|
|
|
- {
|
|
|
- return global_size;
|
|
|
- } else
|
|
|
- {
|
|
|
- return global_size + group_size - r;
|
|
|
- }
|
|
|
+size_t shrRoundUp(int group_size, int global_size) {
|
|
|
+ int r = global_size % group_size;
|
|
|
+ if(r == 0) {
|
|
|
+ return global_size;
|
|
|
+ } else {
|
|
|
+ return global_size + group_size - r;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-void fillArray(TYPE* pfData, int iSize)
|
|
|
-{
|
|
|
- int i;
|
|
|
- const TYPE fScale = (TYPE)(1.0f / (float)RAND_MAX);
|
|
|
- for (i = 0; i < iSize; ++i)
|
|
|
- {
|
|
|
- pfData[i] = fScale * rand();
|
|
|
- }
|
|
|
+void fillArray(TYPE* pfData, int iSize) {
|
|
|
+ int i;
|
|
|
+ const TYPE fScale = (TYPE)(1.0f / (float)RAND_MAX);
|
|
|
+ for (i = 0; i < iSize; ++i) {
|
|
|
+ pfData[i] = fScale * rand();
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-void shrPrintArray(float* pfData, int iSize)
|
|
|
-{
|
|
|
- int i;
|
|
|
- for (i = 0; i < iSize; ++i)
|
|
|
- {
|
|
|
- shrLog("%d: %.3f\n", i, pfData[i]);
|
|
|
- }
|
|
|
+void shrPrintArray(float* pfData, int iSize) {
|
|
|
+ int i;
|
|
|
+ for (i = 0; i < iSize; ++i) {
|
|
|
+ shrLog("%d: %.3f\n", i, pfData[i]);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-int shrCompareL2fe( const float* reference, const float* data,
|
|
|
- const unsigned int len, const float epsilon )
|
|
|
-{
|
|
|
- assert(epsilon >= 0);
|
|
|
-
|
|
|
- float error = 0;
|
|
|
- float ref = 0;
|
|
|
-
|
|
|
- unsigned int i;
|
|
|
- for(i = 0; i < len; ++i) {
|
|
|
- float diff = reference[i] - data[i];
|
|
|
- error += diff * diff;
|
|
|
- ref += reference[i] * reference[i];
|
|
|
- }
|
|
|
-
|
|
|
- float normRef = sqrtf(ref);
|
|
|
- if (fabs(ref) < 1e-7) {
|
|
|
+
|
|
|
+ * Compare two float arrays using L2-norm with an epsilon tolerance for equality
|
|
|
+ * @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
|
|
+ * @param reference handle to the reference data / gold image
|
|
|
+ * @param data handle to the computed data
|
|
|
+ * @param len number of elements in reference and data
|
|
|
+ * @param epsilon epsilon to use for the comparison
|
|
|
+*/
|
|
|
+int shrCompareL2fe( const float* reference, const float* data, const unsigned int len, const float epsilon ) {
|
|
|
+ assert(epsilon >= 0);
|
|
|
+
|
|
|
+ float error = 0;
|
|
|
+ float ref = 0;
|
|
|
+
|
|
|
+ unsigned int i;
|
|
|
+ for(i = 0; i < len; ++i) {
|
|
|
+ float diff = reference[i] - data[i];
|
|
|
+ error += diff * diff;
|
|
|
+ ref += reference[i] * reference[i];
|
|
|
+ }
|
|
|
+
|
|
|
+ float normRef = sqrtf(ref);
|
|
|
+ if (fabs(ref) < 1e-7) {
|
|
|
#ifdef _DEBUG
|
|
|
- fprintf(stderr, "ERROR, reference l2-norm is 0\n");
|
|
|
+ fprintf(stderr, "ERROR, reference l2-norm is 0\n");
|
|
|
#endif
|
|
|
- return 0;
|
|
|
- }
|
|
|
- float normError = sqrtf(error);
|
|
|
- error = normError / normRef;
|
|
|
- int result = error < epsilon;
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+ float normError = sqrtf(error);
|
|
|
+ error = normError / normRef;
|
|
|
+ int result = error < epsilon;
|
|
|
#ifdef _DEBUG
|
|
|
- if( ! result)
|
|
|
- {
|
|
|
- fprintf(stderr, "ERROR, l2-norm error %d is greater than epsilon %lf \n", error, epsilon);
|
|
|
- }
|
|
|
+ if( !result) {
|
|
|
+ fprintf(stderr, "ERROR, l2-norm error %d is greater than epsilon %lf \n", error, epsilon);
|
|
|
+ }
|
|
|
#endif
|
|
|
|
|
|
- return result;
|
|
|
+ return result;
|
|
|
}
|
|
|
|
|
|
|
|
|
-int main(int UNUSED(argc), const char** UNUSED(argv))
|
|
|
-{
|
|
|
- cl_uint platform_count;
|
|
|
- cl_platform_id platforms[5];
|
|
|
+int main(int argc, const char** argv) {
|
|
|
+ cl_uint platform_count;
|
|
|
+ cl_platform_id platforms[5];
|
|
|
|
|
|
- cl_int err = CL_SUCCESS;
|
|
|
- unsigned int i, p;
|
|
|
+ cl_int err = CL_SUCCESS;
|
|
|
+ unsigned int i, p;
|
|
|
|
|
|
- cl_device_type dev_type = CL_DEVICE_TYPE_ALL;
|
|
|
+ cl_device_type dev_type = CL_DEVICE_TYPE_ALL;
|
|
|
|
|
|
-
|
|
|
- check(clGetPlatformIDs(5, platforms, &platform_count));
|
|
|
- if (platform_count == 0)
|
|
|
- error("No platform found\n");
|
|
|
+ void * ptrs[BLOCKS];
|
|
|
+ cl_mem d_A[BLOCKS];
|
|
|
+ cl_mem d_C[BLOCKS];
|
|
|
+ cl_mem d_B[BLOCKS];
|
|
|
|
|
|
- cl_uint device_count;
|
|
|
- cl_uint devs[platform_count];
|
|
|
- cl_device_id * devices[platform_count];
|
|
|
- cl_context ctx[platform_count];
|
|
|
- cl_command_queue * commandQueue[platform_count];
|
|
|
+ cl_event GPUDone[BLOCKS];
|
|
|
+ cl_event GPUExecution[BLOCKS];
|
|
|
+ struct timeval start, end;
|
|
|
|
|
|
- device_count = 0;
|
|
|
- for (p=0; p<platform_count; p++) {
|
|
|
- cl_platform_id platform = platforms[p];
|
|
|
+ int workOffset[BLOCKS];
|
|
|
+ int workSize[BLOCKS];
|
|
|
|
|
|
- cl_int err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
|
|
|
- if (err == CL_DEVICE_NOT_FOUND) {
|
|
|
- devs[p] = 0;
|
|
|
- continue;
|
|
|
- }
|
|
|
- check(err);
|
|
|
- if (devs[p] == 0)
|
|
|
- continue;
|
|
|
+ unsigned int sizePerGPU = HC / BLOCKS;
|
|
|
+ unsigned int sizeMod = HC % BLOCKS;
|
|
|
|
|
|
- devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
|
|
|
- commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);
|
|
|
+ size_t A_size = WA * HA;
|
|
|
+ size_t A_mem_size = sizeof(TYPE) * A_size;
|
|
|
+ TYPE* A_data;
|
|
|
|
|
|
- check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));
|
|
|
+ size_t B_size = WB * HB;
|
|
|
+ size_t B_mem_size = sizeof(TYPE) * B_size;
|
|
|
+ TYPE* B_data;
|
|
|
|
|
|
- cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
|
|
|
- check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));
|
|
|
+ size_t C_size = WC * HC;
|
|
|
+ size_t C_mem_size = sizeof(TYPE) * C_size;
|
|
|
+ TYPE* C_data;
|
|
|
|
|
|
- for(i = 0; i < devs[p]; ++i)
|
|
|
- {
|
|
|
-
|
|
|
- cl_device_id device = devices[p][i];
|
|
|
- char name[2048];
|
|
|
- name[0] = '\0';
|
|
|
- clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
|
|
|
- printf("Device %d: %s\n", i, name);
|
|
|
+ parse_args(argc, argv);
|
|
|
|
|
|
-
|
|
|
- check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
|
|
|
- }
|
|
|
-
|
|
|
- device_count += devs[p];
|
|
|
- }
|
|
|
-
|
|
|
- if (device_count == 0)
|
|
|
- error("No device found\n");
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- cl_kernel multiplicationKernel[platform_count];
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n",
|
|
|
- (unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);
|
|
|
-
|
|
|
-
|
|
|
- size_t A_size = WA * HA;
|
|
|
- size_t A_mem_size = sizeof(TYPE) * A_size;
|
|
|
- TYPE* A_data = (TYPE*)malloc(A_mem_size);
|
|
|
- if (A_data == NULL) {
|
|
|
- perror("malloc");
|
|
|
- }
|
|
|
-
|
|
|
- size_t B_size = WB * HB;
|
|
|
- size_t B_mem_size = sizeof(TYPE) * B_size;
|
|
|
- TYPE* B_data = (TYPE*)malloc(B_mem_size);
|
|
|
- if (B_data == NULL) {
|
|
|
- perror("malloc");
|
|
|
- }
|
|
|
-
|
|
|
- size_t C_size = WC * HC;
|
|
|
- size_t C_mem_size = sizeof(TYPE) * C_size;
|
|
|
- TYPE* C_data = (TYPE*) malloc(C_mem_size);
|
|
|
- if (C_data == NULL) {
|
|
|
- perror("malloc");
|
|
|
- }
|
|
|
+ check(clGetPlatformIDs(5, platforms, &platform_count));
|
|
|
+ if (platform_count == 0)
|
|
|
+ error("No platform found\n");
|
|
|
|
|
|
- cl_program program[platform_count];
|
|
|
-
|
|
|
- for (p=0; p<platform_count; p++) {
|
|
|
- if (devs[p] == 0)
|
|
|
- continue;
|
|
|
-
|
|
|
- check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));
|
|
|
-
|
|
|
- check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));
|
|
|
-
|
|
|
-
|
|
|
- check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- printf("Initializing data...\n");
|
|
|
- srand(2008);
|
|
|
- fillArray(A_data, A_size);
|
|
|
- fillArray(B_data, B_size);
|
|
|
- memset(C_data, 0, C_size);
|
|
|
-
|
|
|
-
|
|
|
- printf("Computing...\n");
|
|
|
-
|
|
|
-
|
|
|
- printf("\nRunning Computations on 1 - %d GPU's...\n\n", device_count);
|
|
|
- {
|
|
|
- cl_mem d_A[BLOCKS];
|
|
|
- cl_mem d_C[BLOCKS];
|
|
|
- void * ptrs[BLOCKS];
|
|
|
- cl_mem d_B[BLOCKS];
|
|
|
-
|
|
|
- cl_event GPUDone[BLOCKS];
|
|
|
- cl_event GPUExecution[BLOCKS];
|
|
|
-
|
|
|
- unsigned int sizePerGPU = HC / BLOCKS;
|
|
|
- unsigned int sizeMod = HC % BLOCKS;
|
|
|
- if (HC < BLOCKS) {
|
|
|
- printf("Not enough data to split in %d blocks. Test skipped\n", device_count);
|
|
|
- exit(0);
|
|
|
- }
|
|
|
-
|
|
|
- int workOffset[BLOCKS];
|
|
|
- int workSize[BLOCKS];
|
|
|
-
|
|
|
- workOffset[0] = 0;
|
|
|
-
|
|
|
- struct timeval start, end;
|
|
|
- gettimeofday(&start, NULL);
|
|
|
-
|
|
|
- size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
|
|
|
- int c = 0;
|
|
|
- for (p=0; p<platform_count;p++) {
|
|
|
- for (i=0; i<devs[p]; i++) {
|
|
|
- check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
|
|
|
- c++;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- unsigned int i;
|
|
|
- for(i=0; i < BLOCKS; ++i)
|
|
|
- {
|
|
|
- int d = i % device_count;
|
|
|
- cl_uint p = 0;
|
|
|
-
|
|
|
- int dev = d;
|
|
|
- for (p = 0; p < platform_count; p++) {
|
|
|
- if ((cl_int)(dev - devs[p]) < 0)
|
|
|
- break;
|
|
|
- dev -= devs[p];
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;
|
|
|
-
|
|
|
- check2(d_A[i] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
|
|
|
- check2(d_C[i] = clCreateBuffer(ctx[p], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));
|
|
|
-
|
|
|
-
|
|
|
- check(clSetKernelArg(multiplicationKernel[p], 0, sizeof(cl_int), &workSize[i]));
|
|
|
- check(clSetKernelArg(multiplicationKernel[p], 1, sizeof(cl_int), &workSize[i]));
|
|
|
- check(clSetKernelArg(multiplicationKernel[p], 2, sizeof(cl_int), &workSize[i]));
|
|
|
- check(clSetKernelArg(multiplicationKernel[p], 3, sizeof(cl_mem), (void *) &d_A[i]));
|
|
|
- check(clSetKernelArg(multiplicationKernel[p], 4, sizeof(cl_mem), (void *) &d_B[d]));
|
|
|
- check(clSetKernelArg(multiplicationKernel[p], 5, sizeof(cl_mem), (void *) &d_C[i]));
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE,WC), shrRoundUp(BLOCK_SIZE,workSize[i])};
|
|
|
-
|
|
|
- check(clEnqueueNDRangeKernel(commandQueue[p][dev], multiplicationKernel[p], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- if(i+1 < BLOCKS)
|
|
|
- workOffset[i + 1] = workOffset[i] + workSize[i];
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- for (p=0; p<platform_count;p++) {
|
|
|
- cl_uint dev;
|
|
|
- for (dev=0; dev<devs[p]; dev++) {
|
|
|
- clFinish(commandQueue[p][dev]);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- gettimeofday(&end, NULL);
|
|
|
- double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
-
|
|
|
-
|
|
|
- double dSeconds = timing/1000/1000;
|
|
|
- double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
|
|
|
- double gflops = 1.0e-9 * dNumOps/dSeconds;
|
|
|
-
|
|
|
-#ifdef SHORT_LOG
|
|
|
- printf("%f\n", timing/1000.0);
|
|
|
-#else
|
|
|
- printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n",
|
|
|
- gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- printf(" Kernel execution time on GPU %d \t: %.5f s\n", i, executionTime(GPUExecution[i]));
|
|
|
- printf("\n");*/
|
|
|
-#endif
|
|
|
+ cl_uint device_count;
|
|
|
+ cl_uint devs[platform_count];
|
|
|
+ cl_device_id * devices[platform_count];
|
|
|
+ cl_context ctx[platform_count];
|
|
|
+ cl_command_queue * commandQueue[platform_count];
|
|
|
|
|
|
-
|
|
|
- for (i=0; i<device_count; i++) {
|
|
|
- clReleaseMemObject(d_B[i]);
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
- for(i = 0; i < BLOCKS; i++)
|
|
|
- {
|
|
|
- clReleaseMemObject(d_A[i]);
|
|
|
- clReleaseMemObject(d_C[i]);
|
|
|
- clReleaseEvent(GPUExecution[i]);
|
|
|
-
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-#ifdef CHECK
|
|
|
- printf("Comparing results with CPU computation... ");
|
|
|
- TYPE* reference = (TYPE*)malloc(C_mem_size);
|
|
|
- computeGold(reference, A_data, B_data, HA, WA, WB);
|
|
|
-
|
|
|
-
|
|
|
- int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
|
|
|
- if (res == 0)
|
|
|
- {
|
|
|
- printf("\n\n");
|
|
|
- printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
|
|
|
- }
|
|
|
- else printf("PASSED\n\n");
|
|
|
- free(reference);
|
|
|
-#endif
|
|
|
+ device_count = 0;
|
|
|
+ for (p=0; p<platform_count; p++) {
|
|
|
+ cl_platform_id platform = platforms[p];
|
|
|
|
|
|
-
|
|
|
+ cl_int err = clGetDeviceIDs(platform, dev_type, 0, NULL, &devs[p]);
|
|
|
+ if (err == CL_DEVICE_NOT_FOUND) {
|
|
|
+ devs[p] = 0;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ check(err);
|
|
|
+ if (devs[p] == 0)
|
|
|
+ continue;
|
|
|
|
|
|
- for (p=0; p<platform_count;p++) {
|
|
|
- if (devs[p] == 0)
|
|
|
- continue;
|
|
|
+ devices[p] = (cl_device_id*)malloc(sizeof(cl_device_id) * devs[p]);
|
|
|
+ commandQueue[p] = (cl_command_queue*)malloc(sizeof(cl_command_queue) * devs[p]);
|
|
|
|
|
|
- check(clReleaseKernel(multiplicationKernel[p]));
|
|
|
- check(clReleaseProgram(program[p]));
|
|
|
- check(clReleaseContext(ctx[p]));
|
|
|
- cl_uint k;
|
|
|
- for(k = 0; k < devs[p]; ++k)
|
|
|
- {
|
|
|
- check(clReleaseCommandQueue(commandQueue[p][k]));
|
|
|
- }
|
|
|
- }
|
|
|
+ check(clGetDeviceIDs(platform, dev_type, devs[p], devices[p], NULL));
|
|
|
|
|
|
-
|
|
|
- free(A_data);
|
|
|
- free(B_data);
|
|
|
- free(C_data);
|
|
|
+ cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
|
|
|
+ check2(ctx[p] = clCreateContext(properties, devs[p], devices[p], NULL, NULL, &err));
|
|
|
|
|
|
- return 0;
|
|
|
+ for(i = 0; i < devs[p]; ++i)
|
|
|
+ {
|
|
|
+ cl_device_id device = devices[p][i];
|
|
|
+ char name[2048];
|
|
|
+ name[0] = '\0';
|
|
|
+ clGetDeviceInfo(device, CL_DEVICE_NAME, 2048, name, NULL);
|
|
|
+ printf("Device %d: %s\n", i, name);
|
|
|
+
|
|
|
+ check2(commandQueue[p][i] = clCreateCommandQueue(ctx[p], device, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
|
|
|
+ }
|
|
|
+
|
|
|
+ device_count += devs[p];
|
|
|
+ }
|
|
|
+
|
|
|
+ if (device_count == 0)
|
|
|
+ error("No device found\n");
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ cl_kernel multiplicationKernel[platform_count];
|
|
|
+
|
|
|
+ printf("\nUsing Matrix Sizes: A(%lu x %lu), B(%lu x %lu), C(%lu x %lu)\n",
|
|
|
+ (unsigned long)WA, (unsigned long)HA, (unsigned long)WB, (unsigned long)HB, (unsigned long)WC, (unsigned long)HC);
|
|
|
+
|
|
|
+
|
|
|
+ A_data = (TYPE*)malloc(A_mem_size);
|
|
|
+ if (A_data == NULL) {
|
|
|
+ perror("malloc");
|
|
|
+ }
|
|
|
+
|
|
|
+ B_data = (TYPE*)malloc(B_mem_size);
|
|
|
+ if (B_data == NULL) {
|
|
|
+ perror("malloc");
|
|
|
+ }
|
|
|
+
|
|
|
+ C_data = (TYPE*) malloc(C_mem_size);
|
|
|
+ if (C_data == NULL) {
|
|
|
+ perror("malloc");
|
|
|
+ }
|
|
|
+
|
|
|
+ cl_program program[platform_count];
|
|
|
+
|
|
|
+ for (p=0; p<platform_count; p++) {
|
|
|
+ if (devs[p] == 0)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ check2(program[p] = clCreateProgramWithSource(ctx[p], 1, (const char **)&code, NULL, &err));
|
|
|
+
|
|
|
+ check(clBuildProgram(program[p], 0, NULL, NULL, NULL, NULL));
|
|
|
+
|
|
|
+ check2(multiplicationKernel[p] = clCreateKernel(program[p], "sgemmNN", &err));
|
|
|
+ }
|
|
|
+
|
|
|
+ printf("Initializing data...\n");
|
|
|
+ srand(2008);
|
|
|
+ fillArray(A_data, A_size);
|
|
|
+ fillArray(B_data, B_size);
|
|
|
+ memset(C_data, 0, C_size);
|
|
|
+
|
|
|
+
|
|
|
+ printf("Computing...\n");
|
|
|
+ workOffset[0] = 0;
|
|
|
+ gettimeofday(&start, NULL);
|
|
|
+
|
|
|
+ size_t localWorkSize[] = {BLOCK_SIZE, BLOCK_SIZE};
|
|
|
+ int c = 0;
|
|
|
+ for (p=0; p<platform_count;p++) {
|
|
|
+ for (i=0; i<devs[p]; i++) {
|
|
|
+ check2(d_B[c] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, HB * WB * sizeof(TYPE), B_data, &err));
|
|
|
+ c++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ for(i=0; i < BLOCKS; ++i)
|
|
|
+ {
|
|
|
+ int d = i % device_count;
|
|
|
+ cl_uint p = 0;
|
|
|
+
|
|
|
+
|
|
|
+ int dev = d;
|
|
|
+ for (p = 0; p < platform_count; p++) {
|
|
|
+ if ((cl_int)(dev - devs[p]) < 0)
|
|
|
+ break;
|
|
|
+ dev -= devs[p];
|
|
|
+ }
|
|
|
+
|
|
|
+ workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;
|
|
|
+
|
|
|
+ check2(d_A[i] = clCreateBuffer(ctx[p], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
|
|
|
+ check2(d_C[i] = clCreateBuffer(ctx[p], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));
|
|
|
+
|
|
|
+ check(clSetKernelArg(multiplicationKernel[p], 0, sizeof(cl_int), &workSize[i]));
|
|
|
+ check(clSetKernelArg(multiplicationKernel[p], 1, sizeof(cl_int), &workSize[i]));
|
|
|
+ check(clSetKernelArg(multiplicationKernel[p], 2, sizeof(cl_int), &workSize[i]));
|
|
|
+ check(clSetKernelArg(multiplicationKernel[p], 3, sizeof(cl_mem), (void *) &d_A[i]));
|
|
|
+ check(clSetKernelArg(multiplicationKernel[p], 4, sizeof(cl_mem), (void *) &d_B[d]));
|
|
|
+ check(clSetKernelArg(multiplicationKernel[p], 5, sizeof(cl_mem), (void *) &d_C[i]));
|
|
|
+
|
|
|
+ size_t globalWorkSize[] = {shrRoundUp(BLOCK_SIZE,WC), shrRoundUp(BLOCK_SIZE,workSize[i])};
|
|
|
+
|
|
|
+ check(clEnqueueNDRangeKernel(commandQueue[p][dev], multiplicationKernel[p], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));
|
|
|
+
|
|
|
+
|
|
|
+ check2(ptrs[i] = clEnqueueMapBuffer(commandQueue[p][dev], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));
|
|
|
+
|
|
|
+ if(i+1 < BLOCKS)
|
|
|
+ workOffset[i + 1] = workOffset[i] + workSize[i];
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ for (p=0; p<platform_count;p++) {
|
|
|
+ cl_uint dev;
|
|
|
+ for (dev=0; dev<devs[p]; dev++) {
|
|
|
+ clFinish(commandQueue[p][dev]);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ gettimeofday(&end, NULL);
|
|
|
+ double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
|
|
|
+
|
|
|
+ double dSeconds = timing/1000/1000;
|
|
|
+ double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
|
|
|
+ double gflops = 1.0e-9 * dNumOps/dSeconds;
|
|
|
+
|
|
|
+ printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n",
|
|
|
+ gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);
|
|
|
+
|
|
|
+ for (i=0; i<device_count; i++) {
|
|
|
+ clReleaseMemObject(d_B[i]);
|
|
|
+ }
|
|
|
+
|
|
|
+ for(i = 0; i < BLOCKS; i++)
|
|
|
+ {
|
|
|
+ clReleaseMemObject(d_A[i]);
|
|
|
+ clReleaseMemObject(d_C[i]);
|
|
|
+ clReleaseEvent(GPUExecution[i]);
|
|
|
+ clReleaseEvent(GPUDone[i]);
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+ if (check) {
|
|
|
+ printf("Comparing results with CPU computation... ");
|
|
|
+ TYPE* reference = (TYPE*)malloc(C_mem_size);
|
|
|
+ computeReference(reference, A_data, B_data, HA, WA, WB);
|
|
|
+
|
|
|
+
|
|
|
+ int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
|
|
|
+ if (res == 0) {
|
|
|
+ printf("\n\n");
|
|
|
+ printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
|
|
|
+ }
|
|
|
+ else printf("PASSED\n\n");
|
|
|
+ free(reference);
|
|
|
+ }
|
|
|
+
|
|
|
+ for (p=0; p<platform_count;p++) {
|
|
|
+ if (devs[p] == 0)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ check(clReleaseKernel(multiplicationKernel[p]));
|
|
|
+ check(clReleaseProgram(program[p]));
|
|
|
+ check(clReleaseContext(ctx[p]));
|
|
|
+ cl_uint k;
|
|
|
+ for(k = 0; k < devs[p]; ++k)
|
|
|
+ {
|
|
|
+ check(clReleaseCommandQueue(commandQueue[p][k]));
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ free(A_data);
|
|
|
+ free(B_data);
|
|
|
+ free(C_data);
|
|
|
+
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
-void printDiff(TYPE *data1, TYPE *data2, int width, int height, int iListLength, TYPE fListTol)
|
|
|
-{
|
|
|
- shrLog("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
|
|
|
- int i,j,k;
|
|
|
- int error_count=0;
|
|
|
- for (j = 0; j < height; j++)
|
|
|
- {
|
|
|
- if (error_count < iListLength)
|
|
|
- {
|
|
|
- shrLog("\n Row %d:\n", j);
|
|
|
- }
|
|
|
- for (i = 0; i < width; i++)
|
|
|
- {
|
|
|
- k = j * width + i;
|
|
|
- float fDiff = fabs(data1[k] - data2[k]);
|
|
|
- if (fDiff > fListTol)
|
|
|
- {
|
|
|
- if (error_count < iListLength)
|
|
|
- {
|
|
|
- shrLog(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
|
|
|
- }
|
|
|
- error_count++;
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- shrLog(" \n Total Errors = %d\n\n", error_count);
|
|
|
+void printDiff(TYPE *data1, TYPE *data2, int width, int height, int iListLength, TYPE fListTol) {
|
|
|
+ shrLog("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
|
|
|
+ int i,j,k;
|
|
|
+ int error_count=0;
|
|
|
+ for (j = 0; j < height; j++) {
|
|
|
+ if (error_count < iListLength) {
|
|
|
+ shrLog("\n Row %d:\n", j);
|
|
|
+ }
|
|
|
+ for (i = 0; i < width; i++) {
|
|
|
+ k = j * width + i;
|
|
|
+ float fDiff = fabs(data1[k] - data2[k]);
|
|
|
+ if (fDiff > fListTol) {
|
|
|
+ if (error_count < iListLength) {
|
|
|
+ shrLog(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff);
|
|
|
+ }
|
|
|
+ error_count++;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ shrLog(" \n Total Errors = %d\n\n", error_count);
|
|
|
}
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- void
|
|
|
-computeGold(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, unsigned int wA, unsigned int wB)
|
|
|
-{
|
|
|
- unsigned int i,j,k;
|
|
|
- for (i = 0; i < hA; ++i)
|
|
|
- for (j = 0; j < wB; ++j) {
|
|
|
- double sum = 0;
|
|
|
- for (k = 0; k < wA; ++k) {
|
|
|
- double a = A[i * wA + k];
|
|
|
- double b = B[k * wB + j];
|
|
|
- sum += a * b;
|
|
|
- }
|
|
|
- C[i * wB + j] = (TYPE)sum;
|
|
|
- }
|
|
|
+
|
|
|
+ * Compute reference data set
|
|
|
+ * C = A * B
|
|
|
+ * @param C reference data, computed but preallocated
|
|
|
+ * @param A matrix A as provided to device
|
|
|
+ * @param B matrix B as provided to device
|
|
|
+ * @param hA height of matrix A
|
|
|
+ * @param wB width of matrix B
|
|
|
+*/
|
|
|
+void computeReference(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, unsigned int wA, unsigned int wB) {
|
|
|
+ unsigned int i,j,k;
|
|
|
+ for (i = 0; i < hA; ++i)
|
|
|
+ for (j = 0; j < wB; ++j) {
|
|
|
+ double sum = 0;
|
|
|
+ for (k = 0; k < wA; ++k) {
|
|
|
+ double a = A[i * wA + k];
|
|
|
+ double b = B[k * wB + j];
|
|
|
+ sum += a * b;
|
|
|
+ }
|
|
|
+ C[i * wB + j] = (TYPE)sum;
|
|
|
+ }
|
|
|
}
|
|
|
|