#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <starpu.h>
#include <starpu_cuda.h>

/* Abort with a readable message if a CUDA runtime call failed. */
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(1);                                                         \
        }                                                                    \
    } while (0)

/*
 * Per-block tree reduction: each block sums its blockDim.x-wide slice of
 * `window` in place and writes the partial sum to dev_data[blockIdx.x].
 *
 * Preconditions:
 *  - blockDim.x is a power of two (the host picks 32..1024);
 *  - gridDim.x == ceil(n / blockDim.x).
 *
 * NOTE: there is deliberately no early return for out-of-range threads —
 * every thread of the block must reach __syncthreads(), otherwise the
 * barrier is divergent (undefined behavior). Out-of-range lanes simply
 * skip the accumulation, and the tail bound `index + stride < n` keeps
 * the in-place reduction from reading past the end of `window`.
 */
static __global__ void output_thread_aggregation_kernel(uint32_t *window,
                                                        uint32_t *dev_data,
                                                        int n)
{
    unsigned int tid   = threadIdx.x;
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

    /* This block's slice of the global input. */
    uint32_t *idata = window + blockIdx.x * blockDim.x;

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride && index + stride < (unsigned int)n)
            idata[tid] += idata[tid + stride];
        __syncthreads();
    }

    /* Thread 0 of each launched block always has index < n (grid is
     * ceil-divided), so the partial sum is valid. */
    if (tid == 0)
        dev_data[blockIdx.x] = idata[0];
}

/*
 * StarPU CUDA codelet: sum the uint32 vector in buffers[0] on the GPU and
 * store the total into *(uint64_t *)_args.
 *
 * buffers[0] — STARPU_VECTOR of uint32_t (device-resident in a CUDA codelet);
 * _args      — host pointer to a uint64_t receiving the aggregated sum.
 *
 * All device work is issued on the StarPU per-worker stream and synchronized
 * before the partial sums are read back on the host.
 */
extern "C" void output_thread_aggregation(void *buffers[], void *_args)
{
    printf("cuda function\n");

    uint64_t *aggregated_value = (uint64_t *)_args;

    /* Length of the vector and its data pointer. */
    int n = STARPU_VECTOR_GET_NX(buffers[0]);
    uint32_t *window = (uint32_t *)STARPU_VECTOR_GET_PTR(buffers[0]);

    cudaStream_t stream = starpu_cuda_get_local_stream();

    /* Pick a power-of-two block size from the vector size (original
     * heuristic: smallest power of two >= sqrt(n), clamped to [32, 1024]).
     * Compute sqrt once instead of once per comparison. */
    double s = sqrt((double)n);
    int n_threads;
    if      (s <=  32) n_threads =   32;
    else if (s <=  64) n_threads =   64;
    else if (s <= 128) n_threads =  128;
    else if (s <= 256) n_threads =  256;
    else if (s <= 512) n_threads =  512;
    else               n_threads = 1024;
    printf("n_threads = %d\n", n_threads);   /* %d: n_threads is an int */

    /* Kernel launch geometry: ceil-divide so the whole vector is covered. */
    dim3 threads_per_block(n_threads);
    dim3 blocks((n + threads_per_block.x - 1) / threads_per_block.x);

    /* Host-side buffer for the per-block partial sums. */
    uint32_t *h_data = (uint32_t *)malloc(blocks.x * sizeof(uint32_t));
    if (!h_data) {
        printf("Allocation error for h_data - aborting.\n");
        exit(1);
    }

    /* Device scratch: a private copy of the input (the kernel reduces in
     * place and would otherwise clobber the StarPU-managed buffer) and the
     * per-block result array. */
    uint32_t *dev_window = NULL;
    uint32_t *dev_data = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev_window, n * sizeof(uint32_t)));
    CUDA_CHECK(cudaMalloc((void **)&dev_data, blocks.x * sizeof(uint32_t)));

    /* In a CUDA codelet STARPU_VECTOR_GET_PTR is a device pointer, so this
     * is a device-to-device copy; cudaMemcpyDefault lets the runtime infer
     * the direction from the pointers. Issue it on the StarPU stream. */
    CUDA_CHECK(cudaMemcpyAsync(dev_window, window, n * sizeof(uint32_t),
                               cudaMemcpyDefault, stream));

    /* Launch the per-block reduction on the StarPU per-worker stream. */
    output_thread_aggregation_kernel<<<blocks, threads_per_block, 0, stream>>>(
        dev_window, dev_data, n);
    CUDA_CHECK(cudaGetLastError());

    /* Copy the partial sums back, then wait for the copy to land BEFORE
     * reading h_data (syncing before the copy would race). */
    CUDA_CHECK(cudaMemcpyAsync(h_data, dev_data, blocks.x * sizeof(uint32_t),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    /* Final accumulation of the per-block sums on the CPU, in 64 bits to
     * avoid overflowing uint32_t on large vectors. */
    uint64_t gpu_sum = 0;
    for (unsigned int i = 0; i < blocks.x; i++)
        gpu_sum += h_data[i];

    *aggregated_value = gpu_sum;

    free(h_data);
    CUDA_CHECK(cudaFree(dev_window));
    CUDA_CHECK(cudaFree(dev_data));
}