
forgot example files

Andra Hugo 8 years ago
parent
commit
1e8359bfaa

+ 54 - 0
examples/sched_ctx/axpy_partition_gpu.cu

@@ -0,0 +1,54 @@
+#include <starpu.h>
+#include "axpy_partition_gpu.h"
+#include <stdio.h>
+
+// This code demonstrates how to transform a kernel to execute on a given set of GPU SMs.
+
+// Original kernel
+__global__ void saxpy(int n, float a, float *x, float *y)
+{
+	int i = blockIdx.x*blockDim.x + threadIdx.x;
+	if (i<n)  y[i] = a*x[i] + y[i];
+}
+
+// Transformed kernel
+__global__ void saxpy_partitioned(__P_KARGS, int n, float a, float *x, float *y)
+{
+  __P_BEGIN;
+  __P_LOOPX;
+	int i = blockid.x*blockDim.x + threadIdx.x; // note that blockIdx is replaced by the macro-provided blockid.
+	if (i<n)  y[i] = a*x[i] + y[i];
+  __P_LOOPEND;
+}
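+
+// A sketch of what the wrapper machinery amounts to (the actual definitions
+// are in axpy_partition_gpu.h): the kernel is launched with one physical
+// block per resident slot of the whole GPU; __P_BEGIN makes blocks that land
+// on SMs outside the assigned interval return immediately, and __P_LOOPX
+// makes the surviving blocks sweep the logical grid with a stride:
+//
+//   for (int b = startBlock; b < blocks.x; b += active_blocks) {
+//       blockid.x = b;
+//       /* kernel body, written against blockid instead of blockIdx */
+//   }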
+
+extern "C" void cuda_axpy(void *descr[], void *_args)
+{
+	 float a = *((float *)_args);
+
+        unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+        float *x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+        float *y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int SM_mapping_start = -1;
+	int SM_mapping_end = -1; 
+  	int SM_allocation = -1;
+  
+	cudaStream_t stream = starpu_cuda_get_local_stream();
+	int workerid = starpu_worker_get_id();
+    	starpu_sched_ctx_get_sms_interval(workerid, &SM_mapping_start, &SM_mapping_end);
+	SM_allocation = SM_mapping_end - SM_mapping_start;
+	int dimensions = 512;	
+	//partitioning setup
+//	int SM_mapping_start = 0;
+//  	int SM_allocation = 13;
+  
+	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
+
+  	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
+}
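+
+// Note: width, active_blocks and block_assignment_d used above are declared
+// by __P_HOSTSETUP; the launch uses the physical grid size (width), while the
+// logical grid dim3(dimensions,1,1) is forwarded to the kernel via __P_HKARGS.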

+ 107 - 0
examples/sched_ctx/axpy_partition_gpu.h

@@ -0,0 +1,107 @@
+#pragma once
+
+
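+// Return the index of the SM the calling thread is running on, by reading
+// the PTX special register %smid.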
+__device__ static uint get_smid(void) {
+#if defined(__CUDACC__)
+  uint ret;
+  asm("mov.u32 %0, %smid;" : "=r"(ret) );
+  return ret;
+#else
+  return 0;
+#endif
+}
+
+
+#define __P_HKARGS    dimGrid,     active_blocks     ,occupancy,               block_assignment_d,   mapping_start
+#define __P_KARGS dim3 blocks, int active_blocks, int occupancy, unsigned int* block_assignment, int mapping_start
+
+#define __P_DARGS blocks,blockid
+
+#define __P_BEGIN							\
+__shared__ unsigned int block_start;					\
+int smid = get_smid();							\
+if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)		\
+  {									\
+    block_start = atomicDec(&block_assignment[smid],0xDEADBEEF);	\
+  }									\
+__syncthreads();							\
+									\
+if(block_start > active_blocks)						\
+  {									\
+    return;								\
+  }									
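+
+// __P_BEGIN hands each physical block a slot on its SM: block_assignment[smid]
+// counts down from occupancy-1 via atomicDec (0xDEADBEEF is just a bound that
+// mapped SMs never hit). SMs outside the mapping hold (unsigned)-1, so blocks
+// scheduled there read a value larger than active_blocks and return at once.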
+
+#define __P_LOOPXY							\
+  dim3 blockid;								\
+  blockid.z = 0;							\
+									\
+  int gridDim_sum = blocks.x*blocks.y;					\
+  int startBlock = block_start + (smid - mapping_start) * occupancy;	\
+									\
+  for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum +=active_blocks) \
+    {									\
+  blockid.x = blockid_sum % blocks.x;					\
+  blockid.y = blockid_sum / blocks.x;
+
+#define __P_LOOPEND }
+// Needed if shared memory is used
+#define __P_LOOPEND_SAFE __syncthreads(); }
+
+#define __P_LOOPX							\
+  dim3 blockid;								\
+  blockid.z = 0;							\
+  blockid.y = 0;							\
+  int gridDim_sum = blocks.x;						\
+  int startBlock = (smid-mapping_start) + block_start*(active_blocks/occupancy); \
+									\
+  for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum +=active_blocks) \
+    {									\
+  blockid.x = blockid_sum;
+
+
+
+
+//////////// HOST side functions
+
+
+template <typename F>
+static void buildPartitionedBlockMapping(F cudaFun, int threads, int shmem, int mapping_start, int allocation,
+				  int &width, int &active_blocks, unsigned int *block_assignment_d,cudaStream_t current_stream = cudaStreamPerThread)
+{
+  int occupancy;
+  int nb_SM = 13; // hard-coded SM count of the target device; TODO: replace with a device query
+  int mapping_end = mapping_start + allocation - 1; // inclusive
+  unsigned int block_assignment[15];
+  
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy,cudaFun,threads,shmem);
+  width = occupancy * nb_SM; // Physical wrapper grid size. Fits GPU exactly
+  active_blocks = occupancy*allocation; // The total number of blocks doing work
+
+  for(int i = 0; i < mapping_start; i++)
+    block_assignment[i] = (unsigned) -1;
+
+  for(int i = mapping_start; i <= mapping_end; i++)
+    {
+      block_assignment[i] = occupancy - 1;
+    }
+
+  for(int i = mapping_end+1; i < nb_SM; i++)
+    block_assignment[i] = (unsigned) -1;
+
+  cudaMemcpyAsync((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice, current_stream);
+  //cudaMemcpy((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice);
+}
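+
+/* A possible way to fill in the TODO above; query_nb_SM is an illustrative
+ * addition, not part of the original example. */
+static inline int query_nb_SM(void)
+{
+  int dev, nb_SM;
+  cudaGetDevice(&dev);
+  cudaDeviceGetAttribute(&nb_SM, cudaDevAttrMultiProcessorCount, dev);
+  return nb_SM;
+}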
+
+
+
+#define __P_HOSTSETUP(KERNEL,GRIDDIM,BLOCKSIZE,SHMEMSIZE,MAPPING_START,MAPPING_END,STREAM)	\
+  unsigned int* block_assignment_d; cudaMalloc((void**) &block_assignment_d,15*sizeof(unsigned int)); \
+  int width = 0;							\
+  int active_blocks = 0;						\
+  buildPartitionedBlockMapping(KERNEL,BLOCKSIZE,SHMEMSIZE,(MAPPING_START),(MAPPING_END)-(MAPPING_START), \
+			       width, active_blocks, block_assignment_d,STREAM); \
+  int occupancy = active_blocks/((MAPPING_END)-(MAPPING_START));		\
+  dim3 dimGrid = (GRIDDIM);\
+  int mapping_start = (MAPPING_START);

+ 240 - 0
examples/sched_ctx/gpu_partition.c

@@ -0,0 +1,240 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This creates NITER pairs of dumb vectors and submits one axpy task per
+ * pair; the tasks are distributed over scheduling contexts that each own
+ * a share of the GPU's SMs.
+ */
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+
+#include <common/blas.h>
+
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#endif
+
+
+#define N	(512*512)
+#define NITER   100
+
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+#define EPSILON 1e-6
+
+float *_vec_x[NITER], *_vec_y[NITER];
+float _alpha = 3.41;
+
+/* descriptors for StarPU */
+starpu_data_handle_t _handle_y[NITER], _handle_x[NITER];
+
+void axpy_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
+{
+	float alpha = *((float *)arg);
+
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	unsigned i;
+	for( i = 0; i < n; i++)
+		block_y[i] = alpha * block_x[i] + block_y[i];
+}
+
+#ifdef STARPU_USE_CUDA
+extern void cuda_axpy(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+static struct starpu_perfmodel axpy_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "axpy"
+};
+
+static struct starpu_codelet axpy_cl =
+{
+	/* .cpu_funcs = {axpy_cpu}, */
+	/* .cpu_funcs_name = {"axpy_cpu"}, */
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {cuda_axpy},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1},
+#endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.name = "axpy",
+	.model = &axpy_model
+};
+
+static int
+check(int niter)
+{
+	int i;
+	for (i = 0; i < N; i++)
+	{
+		float expected_value = _alpha * _vec_x[niter][i] + 4.0;
+		if (fabs(_vec_y[niter][i] - expected_value) > expected_value * EPSILON)
+		{
+			FPRINTF(stderr,"at %d, %f*%f+%f=%f, expected %f\n", i, _alpha, _vec_x[niter][i], 4.0, _vec_y[niter][i], expected_value);
+			return EXIT_FAILURE;
+		}
+	}
+
+	return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+	int ret, exit_value = 0;
+	int iter;
+	/* Initialize StarPU */
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+
+	/* Allocate the vectors with starpu_malloc; this is equivalent to
+		_vec_x[iter] = malloc(N*sizeof(float));
+		_vec_y[iter] = malloc(N*sizeof(float));
+	   but returns memory suitable for fast GPU transfers. */
+	for(iter = 0; iter < NITER; iter++)
+	{  
+		starpu_malloc((void **)&_vec_x[iter], N*sizeof(float));
+		assert(_vec_x[iter]);
+
+		starpu_malloc((void **)&_vec_y[iter], N*sizeof(float));
+		assert(_vec_y[iter]);
+
+		unsigned i;
+		for (i = 0; i < N; i++)
+		{
+			_vec_x[iter][i] = 1.0f; /*(float)starpu_drand48(); */
+			_vec_y[iter][i] = 4.0f; /*(float)starpu_drand48(); */
+		}
+		
+		/* Declare the data to StarPU */
+		starpu_vector_data_register(&_handle_x[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_x[iter], N, sizeof(float));
+		starpu_vector_data_register(&_handle_y[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_y[iter], N, sizeof(float));
+	}
+
+	double start;
+	double end;
+#ifdef STARPU_USE_CUDA
+	int gpu_devid = -1;
+	int nfound_gpus = starpu_worker_get_devids(STARPU_CUDA_WORKER, &gpu_devid, 1);
+
+	printf("gpu_devid found %d\n", gpu_devid);
+	if (nfound_gpus == 0)
+		return 77; /* no CUDA device available: skip the test */
+
+	unsigned nworkers = starpu_worker_get_count();
+	int stream_workerids[nworkers];
+
+	int nstreams = starpu_worker_get_stream_workerids(gpu_devid, stream_workerids, STARPU_CUDA_WORKER);
+
+	int s;
+	for(s = 0; s < nstreams; s++)
+		printf("stream w %d \n", stream_workerids[s]);
+
+	int ncpus = starpu_cpu_worker_get_count();
+	int workers[ncpus+nstreams];
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
+
+	int sched_ctxs[nstreams];
+	int nsms[nstreams];
+	/* split the SMs between the streams (6+7 = the 13 SMs of the target
+	 * device); this assumes nstreams == 2 */
+	nsms[0] = 6;
+	nsms[1] = 7;
+
+	for(s = 0; s < nstreams; s++)
+	{
+		sched_ctxs[s] = starpu_sched_ctx_create(&stream_workerids[s], 1, "subctx",  STARPU_SCHED_CTX_CUDA_NSMS, nsms[s], 0);
+		workers[ncpus+s] = stream_workerids[s];  
+	}
+	unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
+
+	printf("parent ctx %d\n", sched_ctx1);
+	starpu_sched_ctx_set_context(&sched_ctx1);
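+
+	/* From here on, tasks submitted by this thread go to ctx1; the dmdas
+	 * scheduler distributes them over the workers of ctx1 (in practice the
+	 * two stream workers, since the CPU implementation is commented out in
+	 * the codelet), and each stream worker runs its kernels on the SM share
+	 * of its sub-context. */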
+
+#endif
+	start = starpu_timing_now();
+
+	for (iter = 0; iter < NITER; iter++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &axpy_cl;
+
+		task->cl_arg = &_alpha;
+		task->cl_arg_size = sizeof(_alpha);
+
+		task->handles[0] = _handle_x[iter];
+		task->handles[1] = _handle_y[iter];
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+		{
+			exit_value = 77;
+			goto enodev;
+		}
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	starpu_task_wait_for_all();
+
+enodev:
+	for(iter = 0; iter < NITER; iter++)
+	{
+		starpu_data_unregister(_handle_x[iter]);
+		starpu_data_unregister(_handle_y[iter]);
+	}
+	end = starpu_timing_now();
+	double timing = end - start;
+
+	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(float)/timing);
+
+
+	if (exit_value != 77)
+	{
+		for(iter = 0; iter < NITER; iter++)
+		{			
+			exit_value = check(iter);
+			if(exit_value != EXIT_SUCCESS)
+				break;
+		}
+	}
+
+	for(iter = 0; iter < NITER; iter++)
+	{
+		starpu_free((void *)_vec_x[iter]);
+		starpu_free((void *)_vec_y[iter]);
+	}
+
+	/* Stop StarPU */
+	starpu_shutdown();
+
+	return exit_value;
+}