/* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2010,2011 University of Bordeaux * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ #include #include #include #include #include #include #include #include #include #define error(...) do { fprintf(stderr, "Error: " __VA_ARGS__); exit(EXIT_FAILURE); } while(0) #define check(exp) do { cl_int err = exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); }} while(0) #define check2(exp) exp; if(err != CL_SUCCESS) { fprintf(stderr, "OpenCL Error (%d): " #exp "\n", err); exit(EXIT_FAILURE); } #ifdef UNUSED #elif defined(__GNUC__) #define UNUSED(x) UNUSED_ ## x __attribute__((unused)) #else #define UNUSED(x) x #endif // Thread block size #define BLOCK_SIZE 16 // Kernel thread-block size #define WORK_SIZE 64 // Kernel global size in lines of A (or C) #define TYPE float // Basic Matrix dimensions #define WA (1024L * BLOCK_SIZE) // Matrix A width #define HA (512L * BLOCK_SIZE) // Matrix A height #define WB (1024L * BLOCK_SIZE) // Matrix B width #define HB WA // Matrix B height #define WC WB // Matrix C width #define HC HA // Matrix C height #define BLOCKS (HA / WORK_SIZE) //////////////////////////////////////////////////////////////////////////////// // declaration, forward void printDiff(TYPE*, TYPE*, int, int, int, TYPE); void computeGold(TYPE*, const TYPE*, const TYPE*, unsigned int, unsigned int, unsigned int); #define str(x) #x #define CODE "\n\ #define BS 16\n\ \n\ __kernel void matrixMul(\n\ const int N,\n\ const int P,\n\ const int M,\n\ __global float* A,\n\ __global float* B, \n\ __global float* C) {\n\ int row = get_global_id(1); \n\ int col = get_global_id(0); \n\ float sum = 0.0f;\n\ float sum2 = 0.0f;\n\ int x = get_local_id(0);\n\ int y = get_local_id(1);\n\ __local float atile[BS][BS+1];\n\ __local float btile[BS][BS+1];\n\ for (int t=0; t= 0); float error = 0; float ref = 0; unsigned int i; for(i = 0; i < len; ++i) { float diff = reference[i] - data[i]; error += diff * diff; ref += reference[i] * reference[i]; } float normRef = sqrtf(ref); if (fabs(ref) < 1e-7) { #ifdef _DEBUG fprintf(stderr, "ERROR, reference l2-norm is 0\n"); #endif return 0; } float normError = sqrtf(error); error = normError / normRef; int result = error < epsilon; #ifdef _DEBUG if( ! result) { fprintf(stderr, "ERROR, l2-norm error %d is greater than epsilon %lf \n", error, epsilon); } #endif return result; } int main(int UNUSED(argc), const char** UNUSED(argv)) { cl_uint platform_count; cl_platform_id platforms[5]; cl_int err = CL_SUCCESS; unsigned int i, p; cl_device_type dev_type = CL_DEVICE_TYPE_ALL; /* Get platforms */ check(clGetPlatformIDs(5, platforms, &platform_count)); if (platform_count == 0) error("No platform found\n"); cl_uint device_count; cl_uint devs[platform_count]; cl_device_id * devices[platform_count]; cl_context ctx[platform_count]; cl_command_queue * commandQueue[platform_count]; device_count = 0; for (p=0; p %.6f...\n", iListLength, fListTol); int i,j,k; int error_count=0; for (j = 0; j < height; j++) { if (error_count < iListLength) { shrLog("\n Row %d:\n", j); } for (i = 0; i < width; i++) { k = j * width + i; float fDiff = fabs(data1[k] - data2[k]); if (fDiff > fListTol) { if (error_count < iListLength) { shrLog(" Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j, data1[k], data2[k], fDiff); } error_count++; } } } shrLog(" \n Total Errors = %d\n\n", error_count); } //////////////////////////////////////////////////////////////////////////////// //! Compute reference data set //! C = A * B //! @param C reference data, computed but preallocated //! @param A matrix A as provided to device //! @param B matrix B as provided to device //! @param hA height of matrix A //! @param wB width of matrix B //////////////////////////////////////////////////////////////////////////////// void computeGold(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, unsigned int wA, unsigned int wB) { unsigned int i,j,k; for (i = 0; i < hA; ++i) for (j = 0; j < wB; ++j) { double sum = 0; for (k = 0; k < wA; ++k) { double a = A[i * wA + k]; double b = B[k * wB + j]; sum += a * b; } C[i * wB + j] = (TYPE)sum; } }