123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 |
#include <math.h>
#include <stdio.h>
#include <stdlib.h>   /* malloc / free / exit — used below but previously not included */

#include <starpu.h>
/* Block-wise tree reduction over `window` (n uint32 values, reduced IN PLACE):
 * each block sums its blockDim.x-sized slice and writes the partial sum to
 * dev_data[blockIdx.x]. Expects a 1D grid/1D blocks with a power-of-two
 * blockDim.x; grid sized by ceil-div so the tail block may be partial. */
__global__ void output_thread_aggregation(uint32_t *window, uint32_t *dev_data, int n)
{
    unsigned int tid = threadIdx.x;
    unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;

    // This block's slice of the global array (reduction happens in place).
    uint32_t *idata = window + blockIdx.x * blockDim.x;

    /* In-place tree reduction.
     * NOTE: no early `if (index >= n) return;` here — every thread of the
     * block must reach __syncthreads(), otherwise the barrier is divergent
     * (undefined behavior) whenever n is not a multiple of blockDim.x. */
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        /* Guard the source element against the tail of the array: elements at
         * or past index n do not exist and must not be read (the original
         * summed out-of-range garbage in the last, partial block). The guard
         * also implies index < n, so the destination is valid too. */
        if (tid < stride && index + stride < (unsigned int)n) {
            idata[tid] += idata[tid + stride];
        }
        __syncthreads();
    }

    // Thread 0 publishes this block's partial sum to global memory.
    if (tid == 0 && index < (unsigned int)n) {
        dev_data[blockIdx.x] = idata[0];
    }
}
/* StarPU CUDA codelet: sums the uint32 vector in buffers[0] on the GPU
 * (per-block reduction kernel + final CPU accumulation of the partial sums)
 * and stores the total into the uint64_t pointed to by _args. */
extern "C" void output_thread_aggregation(void *buffers[], void *_args)
{
    printf("cuda function\n");

    /* Host-side output slot supplied by the caller through the codelet args. */
    uint64_t *aggregated_value = (uint64_t *)_args;

    /* Length and base pointer of the StarPU vector handle.
     * NOTE(review): in a CUDA codelet STARPU_VECTOR_GET_PTR normally yields a
     * pointer in CUDA memory, not host memory, so the original
     * cudaMemcpyHostToDevice direction was wrong; cudaMemcpyDefault lets the
     * runtime infer the real direction under UVA — confirm against the caller. */
    int n = STARPU_VECTOR_GET_NX(buffers[0]);
    uint32_t *window = (uint32_t *)STARPU_VECTOR_GET_PTR(buffers[0]);

    /* Pick the smallest power-of-two block size >= sqrt(n), clamped to [32, 1024]. */
    int n_threads;
    if (sqrt(n) <= 32)
        n_threads = 32;
    else if (sqrt(n) <= 64)
        n_threads = 64;
    else if (sqrt(n) <= 128)
        n_threads = 128;
    else if (sqrt(n) <= 256)
        n_threads = 256;
    else if (sqrt(n) <= 512)
        n_threads = 512;
    else
        n_threads = 1024;
    printf("n_threads = %d\n", n_threads); /* %d: n_threads is an int (was %lu — UB) */

    /* Launch configuration: ceil-div so the tail of the vector is covered. */
    dim3 threads_per_block(n_threads);
    dim3 blocks((n + threads_per_block.x - 1) / threads_per_block.x);

    /* Host buffer receiving one partial sum per block. */
    uint32_t *h_data = (uint32_t *)malloc(blocks.x * sizeof(uint32_t));
    if (!h_data) {
        printf("Allocation error for h_data - aborting.\n");
        exit(1);
    }

    /* All device work goes on StarPU's per-worker stream so the single
     * synchronization below actually covers the copies AND the kernel
     * (the original mixed the default stream with a sync on this one). */
    cudaStream_t stream = starpu_cuda_get_local_stream();

    // GPU memory: scratch input copy + per-block partial sums.
    uint32_t *dev_window;
    uint32_t *dev_data;
    cudaMalloc((void **)&dev_window, n * sizeof(uint32_t));
    cudaMalloc((void **)&dev_data, blocks.x * sizeof(uint32_t));

    /* Work on a scratch copy: the reduction kernel destroys its input. */
    cudaMemcpyAsync(dev_window, window, n * sizeof(uint32_t), cudaMemcpyDefault, stream);

    // Launch the per-block reduction kernel on the same stream.
    output_thread_aggregation<<<blocks, threads_per_block, 0, stream>>>(dev_window, dev_data, n);

    // Bring the per-block partial sums back to the host.
    cudaMemcpyAsync(h_data, dev_data, blocks.x * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream);

    /* Wait for copy -> kernel -> copy before touching h_data (the original
     * read h_data with no synchronization after the async D2H copy). */
    cudaStreamSynchronize(stream);

    // Final accumulation of the per-block sums on the CPU, in 64 bits.
    uint64_t gpu_sum = 0;
    for (unsigned int i = 0; i < blocks.x; i++) {
        gpu_sum += h_data[i];
    }

    // Hand the total back to the main program.
    *aggregated_value = gpu_sum;

    // Release host and device scratch memory (h_data was leaked before).
    free(h_data);
    cudaFree(dev_window);
    cudaFree(dev_data);
}
|