
forgot example files

Andra Hugo 8 years ago
parent
commit
1e8359bfaa

+ 54 - 0
examples/sched_ctx/axpy_partition_gpu.cu

@@ -0,0 +1,54 @@
+#include <starpu.h>
+#include "axpy_partition_gpu.h"
+#include <stdio.h>
+
+// This code demonstrates how to transform a kernel to execute on a given set of GPU SMs.
+
+// Original kernel
+__global__ void saxpy(int n, float a, float *x, float *y)
+{
+	int i = blockIdx.x*blockDim.x + threadIdx.x;
+	if (i<n)  y[i] = a*x[i] + y[i];
+}
+
+// Transformed kernel
+__global__ void saxpy_partitioned(__P_KARGS, int n, float a, float *x, float *y)
+{
+  __P_BEGIN;
+  __P_LOOPX;
+	int i = blockid.x*blockDim.x + threadIdx.x; // note that blockIdx is replaced by the macro-provided blockid.
+	if (i<n)  y[i] = a*x[i] + y[i];
+  __P_LOOPEND;
+}
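+
+// A sketch of what the wrapper machinery amounts to (the actual definitions
+// are in axpy_partition_gpu.h): the kernel is launched with one physical
+// block per resident slot of the whole GPU; __P_BEGIN makes blocks that land
+// on SMs outside the assigned interval return immediately, and __P_LOOPX
+// makes the surviving blocks sweep the logical grid with a stride:
+//
+//   for (int b = startBlock; b < blocks.x; b += active_blocks) {
+//       blockid.x = b;
+//       /* kernel body, written against blockid instead of blockIdx */
+//   }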
+
+extern "C" void cuda_axpy(void *descr[], void *_args)
+{
+	 float a = *((float *)_args);
+
+        unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+        float *x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+        float *y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int SM_mapping_start = -1;
+	int SM_mapping_end = -1; 
+  	int SM_allocation = -1;
+  
+	cudaStream_t stream = starpu_cuda_get_local_stream();
+	int workerid = starpu_worker_get_id();
+    	starpu_sched_ctx_get_sms_interval(workerid, &SM_mapping_start, &SM_mapping_end);
+	SM_allocation = SM_mapping_end - SM_mapping_start;
+	int dimensions = 512;	
+	//partitioning setup
+//	int SM_mapping_start = 0;
+//  	int SM_allocation = 13;
+  
+	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
+
+  	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
+}
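+
+// Note: width, active_blocks and block_assignment_d used above are declared
+// by __P_HOSTSETUP; the launch uses the physical grid size (width), while the
+// logical grid dim3(dimensions,1,1) is forwarded to the kernel via __P_HKARGS.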

+ 107 - 0
examples/sched_ctx/axpy_partition_gpu.h

@@ -0,0 +1,107 @@
+#pragma once
+
+
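+// Return the index of the SM the calling thread is running on, by reading
+// the PTX special register %smid.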
+__device__ static uint get_smid(void) {
+#if defined(__CUDACC__)
+  uint ret;
+  asm("mov.u32 %0, %smid;" : "=r"(ret) );
+  return ret;
+#else
+  return 0;
+#endif
+}
+
+
+#define __P_HKARGS    dimGrid,     active_blocks     ,occupancy,               block_assignment_d,   mapping_start
+#define __P_KARGS dim3 blocks, int active_blocks, int occupancy, unsigned int* block_assignment, int mapping_start
+
+#define __P_DARGS blocks,blockid
+
+#define __P_BEGIN							\
+__shared__ unsigned int block_start;					\
+int smid = get_smid();							\
+if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0)		\
+  {									\
+    block_start = atomicDec(&block_assignment[smid],0xDEADBEEF);	\
+  }									\
+__syncthreads();							\
+									\
+if(block_start > active_blocks)						\
+  {									\
+    return;								\
+  }									
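+
+// __P_BEGIN hands each physical block a slot on its SM: block_assignment[smid]
+// counts down from occupancy-1 via atomicDec (0xDEADBEEF is just a bound that
+// mapped SMs never hit). SMs outside the mapping hold (unsigned)-1, so blocks
+// scheduled there read a value larger than active_blocks and return at once.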
+
+#define __P_LOOPXY							\
+  dim3 blockid;								\
+  blockid.z = 0;							\
+									\
+  int gridDim_sum = blocks.x*blocks.y;					\
+  int startBlock = block_start + (smid - mapping_start) * occupancy;	\
+									\
+  for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum +=active_blocks) \
+    {									\
+  blockid.x = blockid_sum % blocks.x;					\
+  blockid.y = blockid_sum / blocks.x;
+
+#define __P_LOOPEND }
+// Needed if shared memory is used
+#define __P_LOOPEND_SAFE __syncthreads(); }
+
+#define __P_LOOPX							\
+  dim3 blockid;								\
+  blockid.z = 0;							\
+  blockid.y = 0;							\
+  int gridDim_sum = blocks.x;						\
+  int startBlock = (smid-mapping_start) + block_start*(active_blocks/occupancy); \
+									\
+  for(int blockid_sum = startBlock; blockid_sum < gridDim_sum; blockid_sum +=active_blocks) \
+    {									\
+  blockid.x = blockid_sum;
+
+
+
+
+//////////// HOST side functions
+
+
+template <typename F>
+static void buildPartitionedBlockMapping(F cudaFun, int threads, int shmem, int mapping_start, int allocation,
+				  int &width, int &active_blocks, unsigned int *block_assignment_d,cudaStream_t current_stream = cudaStreamPerThread)
+{
+  int occupancy;
+  int nb_SM = 13; // hard-coded SM count of the target device; TODO: replace with a device query
+  int mapping_end = mapping_start + allocation - 1; // inclusive
+  unsigned int block_assignment[15];
+  
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy,cudaFun,threads,shmem);
+  width = occupancy * nb_SM; // Physical wrapper grid size. Fits GPU exactly
+  active_blocks = occupancy*allocation; // The total number of blocks doing work
+
+  for(int i = 0; i < mapping_start; i++)
+    block_assignment[i] = (unsigned) -1;
+
+  for(int i = mapping_start; i <= mapping_end; i++)
+    {
+      block_assignment[i] = occupancy - 1;
+    }
+
+  for(int i = mapping_end+1; i < nb_SM; i++)
+    block_assignment[i] = (unsigned) -1;
+
+  cudaMemcpyAsync((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice, current_stream);
+  //cudaMemcpy((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice);
+}
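+
+/* A possible way to fill in the TODO above; query_nb_SM is an illustrative
+ * addition, not part of the original example. */
+static inline int query_nb_SM(void)
+{
+  int dev, nb_SM;
+  cudaGetDevice(&dev);
+  cudaDeviceGetAttribute(&nb_SM, cudaDevAttrMultiProcessorCount, dev);
+  return nb_SM;
+}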
+
+
+
+#define __P_HOSTSETUP(KERNEL,GRIDDIM,BLOCKSIZE,SHMEMSIZE,MAPPING_START,MAPPING_END,STREAM)	\
+  unsigned int* block_assignment_d; cudaMalloc((void**) &block_assignment_d,15*sizeof(unsigned int)); \
+  int width = 0;							\
+  int active_blocks = 0;						\
+  buildPartitionedBlockMapping(KERNEL,BLOCKSIZE,SHMEMSIZE,(MAPPING_START),(MAPPING_END)-(MAPPING_START), \
+			       width, active_blocks, block_assignment_d,STREAM); \
+  int occupancy = active_blocks/((MAPPING_END)-(MAPPING_START));		\
+  dim3 dimGrid = (GRIDDIM);\
+  int mapping_start = (MAPPING_START);

+ 240 - 0
examples/sched_ctx/gpu_partition.c

@@ -0,0 +1,240 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This creates NITER pairs of dumb vectors and submits one axpy task per
+ * pair; the tasks are distributed over scheduling contexts that each own
+ * a share of the GPU's SMs.
+ */
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+
+#include <common/blas.h>
+
+#ifdef STARPU_USE_CUDA
+#include <cublas.h>
+#endif
+
+
+#define N	(512*512)
+#define NITER   100
+
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+#define EPSILON 1e-6
+
+float *_vec_x[NITER], *_vec_y[NITER];
+float _alpha = 3.41;
+
+/* descriptors for StarPU */
+starpu_data_handle_t _handle_y[NITER], _handle_x[NITER];
+
+void axpy_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
+{
+	float alpha = *((float *)arg);
+
+	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+
+	float *block_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
+	float *block_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	unsigned i;
+	for( i = 0; i < n; i++)
+		block_y[i] = alpha * block_x[i] + block_y[i];
+}
+
+#ifdef STARPU_USE_CUDA
+extern void cuda_axpy(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args);
+#endif
+
+static struct starpu_perfmodel axpy_model =
+{
+	.type = STARPU_HISTORY_BASED,
+	.symbol = "axpy"
+};
+
+static struct starpu_codelet axpy_cl =
+{
+	/* .cpu_funcs = {axpy_cpu}, */
+	/* .cpu_funcs_name = {"axpy_cpu"}, */
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = {cuda_axpy},
+#elif defined(STARPU_SIMGRID)
+	.cuda_funcs = {(void*)1},
+#endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_RW},
+	.name = "axpy",
+	.model = &axpy_model
+};
+
+static int
+check(int niter)
+{
+	int i;
+	for (i = 0; i < N; i++)
+	{
+		float expected_value = _alpha * _vec_x[niter][i] + 4.0;
+		if (fabs(_vec_y[niter][i] - expected_value) > expected_value * EPSILON)
+		{
+			FPRINTF(stderr,"at %d, %f*%f+%f=%f, expected %f\n", i, _alpha, _vec_x[niter][i], 4.0, _vec_y[niter][i], expected_value);
+			return EXIT_FAILURE;
+		}
+	}
+
+	return EXIT_SUCCESS;
+}
+
+int main(int argc, char **argv)
+{
+	int ret, exit_value = 0;
+	int iter;
+	/* Initialize StarPU */
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV)
+		return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+
+	/* Allocate the vectors with starpu_malloc; this is equivalent to
+		_vec_x[iter] = malloc(N*sizeof(float));
+		_vec_y[iter] = malloc(N*sizeof(float));
+	   but returns memory suitable for fast GPU transfers. */
+	for(iter = 0; iter < NITER; iter++)
+	{  
+		starpu_malloc((void **)&_vec_x[iter], N*sizeof(float));
+		assert(_vec_x[iter]);
+
+		starpu_malloc((void **)&_vec_y[iter], N*sizeof(float));
+		assert(_vec_y[iter]);
+
+		unsigned i;
+		for (i = 0; i < N; i++)
+		{
+			_vec_x[iter][i] = 1.0f; /*(float)starpu_drand48(); */
+			_vec_y[iter][i] = 4.0f; /*(float)starpu_drand48(); */
+		}
+		
+		/* Declare the data to StarPU */
+		starpu_vector_data_register(&_handle_x[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_x[iter], N, sizeof(float));
+		starpu_vector_data_register(&_handle_y[iter], STARPU_MAIN_RAM, (uintptr_t)_vec_y[iter], N, sizeof(float));
+	}
+
+	double start;
+	double end;
+#ifdef STARPU_USE_CUDA
+	int gpu_devid = -1;
+	int nfound_gpus = starpu_worker_get_devids(STARPU_CUDA_WORKER, &gpu_devid, 1);
+
+	printf("gpu_devid found %d\n", gpu_devid);
+	if (nfound_gpus == 0)
+		return 77; /* no CUDA device available: skip the test */
+
+	unsigned nworkers = starpu_worker_get_count();
+	int stream_workerids[nworkers];
+
+	int nstreams = starpu_worker_get_stream_workerids(gpu_devid, stream_workerids, STARPU_CUDA_WORKER);
+
+	int s;
+	for(s = 0; s < nstreams; s++)
+		printf("stream w %d \n", stream_workerids[s]);
+
+	int ncpus = starpu_cpu_worker_get_count();
+	int workers[ncpus+nstreams];
+	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
+
+	int sched_ctxs[nstreams];
+	int nsms[nstreams];
+	/* split the SMs between the streams (6+7 = the 13 SMs of the target
+	 * device); this assumes nstreams == 2 */
+	nsms[0] = 6;
+	nsms[1] = 7;
+
+	for(s = 0; s < nstreams; s++)
+	{
+		sched_ctxs[s] = starpu_sched_ctx_create(&stream_workerids[s], 1, "subctx",  STARPU_SCHED_CTX_CUDA_NSMS, nsms[s], 0);
+		workers[ncpus+s] = stream_workerids[s];  
+	}
+	unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
+
+	printf("parent ctx %d\n", sched_ctx1);
+	starpu_sched_ctx_set_context(&sched_ctx1);
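+
+	/* From here on, tasks submitted by this thread go to ctx1; the dmdas
+	 * scheduler distributes them over the workers of ctx1 (in practice the
+	 * two stream workers, since the CPU implementation is commented out in
+	 * the codelet), and each stream worker runs its kernels on the SM share
+	 * of its sub-context. */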
+
+#endif
+	start = starpu_timing_now();
+
+	for (iter = 0; iter < NITER; iter++)
+	{
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &axpy_cl;
+
+		task->cl_arg = &_alpha;
+		task->cl_arg_size = sizeof(_alpha);
+
+		task->handles[0] = _handle_x[iter];
+		task->handles[1] = _handle_y[iter];
+
+		ret = starpu_task_submit(task);
+		if (ret == -ENODEV)
+		{
+			exit_value = 77;
+			goto enodev;
+		}
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	}
+
+	starpu_task_wait_for_all();
+
+enodev:
+	for(iter = 0; iter < NITER; iter++)
+	{
+		starpu_data_unregister(_handle_x[iter]);
+		starpu_data_unregister(_handle_y[iter]);
+	}
+	end = starpu_timing_now();
+	double timing = end - start;
+
+	FPRINTF(stderr, "timing -> %2.2f us %2.2f MB/s\n", timing, 3*N*sizeof(float)/timing);
+
+
+	if (exit_value != 77)
+	{
+		for(iter = 0; iter < NITER; iter++)
+		{			
+			exit_value = check(iter);
+			if(exit_value != EXIT_SUCCESS)
+				break;
+		}
+	}
+
+	for(iter = 0; iter < NITER; iter++)
+	{
+		starpu_free((void *)_vec_x[iter]);
+		starpu_free((void *)_vec_y[iter]);
+	}
+
+	/* Stop StarPU */
+	starpu_shutdown();
+
+	return exit_value;
+}