Browse Source

Add starpu_cusparse_init/shutdown/get_local_handle for proper CUDA overlapping with cusparse.

Samuel Thibault 8 years ago
parent
commit
e5b21e47fd

+ 3 - 0
ChangeLog

@@ -57,6 +57,9 @@ New features:
   * Add starpu_data_acquire_try and starpu_data_acquire_on_node_try.
   * Add starpu_data_acquire_try and starpu_data_acquire_on_node_try.
   * Add NVCC_CC environment variable.
   * Add NVCC_CC environment variable.
   * Add -no-foo options to starpu_fxt_tool to make traces lighter
   * Add -no-foo options to starpu_fxt_tool to make traces lighter
+  * Add starpu_cusparse_init/shutdown/get_local_handle for proper CUDA
+    overlapping with cusparse.
+
 
 
 Small changes:
 Small changes:
   * Output generated through STARPU_MPI_COMM has been modified to
   * Output generated through STARPU_MPI_COMM has been modified to

+ 2 - 1
Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 #
-# Copyright (C) 2009-2016  Université de Bordeaux
+# Copyright (C) 2009-2017  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
 # Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
 # Copyright (C) 2014  INRIA
 # Copyright (C) 2014  INRIA
 # Copyright (C) 2016  Inria
 # Copyright (C) 2016  Inria
@@ -99,6 +99,7 @@ versinclude_HEADERS = 				\
 	include/starpu_disk.h			\
 	include/starpu_disk.h			\
 	include/starpu_cublas.h			\
 	include/starpu_cublas.h			\
 	include/starpu_cublas_v2.h		\
 	include/starpu_cublas_v2.h		\
+	include/starpu_cusparse.h		\
 	include/starpu_driver.h			\
 	include/starpu_driver.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\
 	include/starpu_thread.h			\

+ 2 - 0
configure.ac

@@ -1116,6 +1116,8 @@ if test x$enable_cuda = xyes; then
 	fi
 	fi
 
 
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
+
+	AC_CHECK_LIB([cusparse], [cusparseCreate])
 fi
 fi
 
 
 dnl Hey dude, are you around?
 dnl Hey dude, are you around?

+ 5 - 1
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -65,9 +65,13 @@ Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
 CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
 CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
 the beginning of the codelet to make sure that CUBLAS is really using the proper
-stream. When using CUBLAS v2, starpu_cublas_local_handle() can be called to queue CUBLAS
+stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
 kernels with the proper configuration.
 kernels with the proper configuration.
 
 
+Similarly, calling starpu_cusparse_init() makes StarPU create CUSPARSE handles
+on each CUDA device, starpu_cusparse_get_local_handle() can then be used to
+queue CUSPARSE kernels with the proper configuration.
+
 If the kernel can be made to only use this local stream or other self-allocated
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
 streams, i.e. the whole kernel submission can be made asynchronous, then
 one should enable asynchronous execution of the kernel.  That means setting
 one should enable asynchronous execution of the kernel.  That means setting

+ 17 - 0
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -95,4 +95,21 @@ Report a cublas error.
 Calls starpu_cublas_report_error(), passing the current
 Calls starpu_cublas_report_error(), passing the current
 function, file and line position.
 function, file and line position.
 
 
+\fn void starpu_cusparse_init(void)
+\ingroup API_CUDA_Extensions
+Calling starpu_cusparse_init() will initialize CUSPARSE on every CUDA device
+controlled by StarPU. This call blocks until CUSPARSE has been properly
+initialized on every device.
+
+\fn cusparseHandle_t starpu_cusparse_get_local_handle(void)
+\ingroup API_CUDA_Extensions
+This function returns the CUSPARSE handle to be used to queue CUSPARSE
+kernels. It is properly initialized and configured for multistream by
+starpu_cusparse_init().
+
+\fn void starpu_cusparse_shutdown(void)
+\ingroup API_CUDA_Extensions
+This function synchronously deinitializes the CUSPARSE library on
+every CUDA device.
+
 */
 */

+ 1 - 0
include/starpu.h

@@ -62,6 +62,7 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_rand.h>
 #include <starpu_rand.h>
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_cublas.h>
 #include <starpu_cublas.h>
+#include <starpu_cusparse.h>
 #include <starpu_bound.h>
 #include <starpu_bound.h>
 #include <starpu_hash.h>
 #include <starpu_hash.h>
 #include <starpu_profiling.h>
 #include <starpu_profiling.h>

+ 5 - 1
include/starpu_cublas_v2.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,6 +18,8 @@
 #ifndef __STARPU_CUBLAS_V2_H__
 #ifndef __STARPU_CUBLAS_V2_H__
 #define __STARPU_CUBLAS_V2_H__
 #define __STARPU_CUBLAS_V2_H__
 
 
+#if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
+
 #include <cublas_v2.h>
 #include <cublas_v2.h>
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
@@ -31,4 +33,6 @@ cublasHandle_t starpu_cublas_get_local_handle(void);
 }
 }
 #endif
 #endif
 
 
+#endif
+
 #endif /* __STARPU_CUBLAS_V2_H__ */
 #endif /* __STARPU_CUBLAS_V2_H__ */

+ 40 - 0
include/starpu_cusparse.h

@@ -0,0 +1,40 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_CUSPARSE_H__
+#define __STARPU_CUSPARSE_H__
+
+#if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
+
+#include <cusparse.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+void starpu_cusparse_init(void);
+cusparseHandle_t starpu_cusparse_get_local_handle(void);
+void starpu_cusparse_shutdown(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif /* __STARPU_CUSPARSE_H__ */

+ 1 - 0
src/Makefile.am

@@ -318,6 +318,7 @@ endif
 endif
 endif
 
 
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cusparse.c
 
 
 if STARPU_USE_OPENCL
 if STARPU_USE_OPENCL
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c

+ 71 - 0
src/drivers/cuda/starpu_cusparse.c

@@ -0,0 +1,71 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009-2012, 2014, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2017  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <common/config.h>
+
+#include <starpu.h>
+#include <starpu_cuda.h>
+#include <core/workers.h>
+
+#ifdef HAVE_LIBCUSPARSE
+#include <cusparse.h>
+
+static cusparseHandle_t cusparse_handles[STARPU_NMAXWORKERS];
+static cusparseHandle_t main_handle;
+
+static void init_cusparse_func(void *args STARPU_ATTRIBUTE_UNUSED)
+{
+	cusparseCreate(&cusparse_handles[starpu_worker_get_id_check()]);
+	cusparseSetStream(cusparse_handles[starpu_worker_get_id_check()], starpu_cuda_get_local_stream());
+}
+
+static void shutdown_cusparse_func(void *args STARPU_ATTRIBUTE_UNUSED)
+{
+	cusparseDestroy(cusparse_handles[starpu_worker_get_id_check()]);
+}
+#endif
+
+void starpu_cusparse_init(void)
+{
+#ifdef HAVE_LIBCUSPARSE
+	starpu_execute_on_each_worker(init_cusparse_func, NULL, STARPU_CUDA);
+
+	if (cusparseCreate(&main_handle) != CUSPARSE_STATUS_SUCCESS)
+		main_handle = NULL;
+#endif
+}
+
+void starpu_cusparse_shutdown(void)
+{
+#ifdef HAVE_LIBCUSPARSE
+	starpu_execute_on_each_worker(shutdown_cusparse_func, NULL, STARPU_CUDA);
+
+	if (main_handle)
+		cusparseDestroy(main_handle);
+#endif
+}
+
+#ifdef HAVE_LIBCUSPARSE
+cusparseHandle_t starpu_cusparse_get_local_handle(void)
+{
+	int workerid = starpu_worker_get_id();
+	if (workerid >= 0)
+		return cusparse_handles[workerid];
+	else
+		return main_handle;
+}
+#endif

+ 1 - 0
tests/Makefile.am

@@ -154,6 +154,7 @@ myPROGRAMS +=					\
 	errorcheck/starpu_init_noworker		\
 	errorcheck/starpu_init_noworker		\
 	errorcheck/invalid_tasks		\
 	errorcheck/invalid_tasks		\
 	helper/cublas_init			\
 	helper/cublas_init			\
+	helper/cusparse_init			\
 	helper/pinned_memory			\
 	helper/pinned_memory			\
 	helper/execute_on_all			\
 	helper/execute_on_all			\
 	microbenchs/display_structures_size	\
 	microbenchs/display_structures_size	\

+ 72 - 0
tests/helper/cusparse_init.c

@@ -0,0 +1,72 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2009, 2010, 2014, 2016-2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <starpu.h>
+#include <stdlib.h>
+#include "../helper.h"
+
+/*
+ * Test initializing cusparse, and how much time that takes
+ */
+
+static double start;
+static double end;
+
+//static float *data = NULL;
+
+int main(int argc, char **argv)
+{
+	int ret;
+
+	ret = starpu_initialize(NULL, &argc, &argv);
+	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	unsigned ngpus = starpu_cuda_worker_get_count();
+
+	double init_timing;
+	double shutdown_timing;
+
+	start = starpu_timing_now();
+	starpu_cusparse_init();
+	end = starpu_timing_now();
+	init_timing = end - start;
+
+	start = starpu_timing_now();
+	starpu_cusparse_shutdown();
+	end = starpu_timing_now();
+	shutdown_timing = end - start;
+
+	FPRINTF(stderr, "Total:\n");
+	FPRINTF(stderr, "\tinit: %2.2f us\n", init_timing/(1000));
+	FPRINTF(stderr, "\tshutdown: %2.2f us\n", shutdown_timing/(1000));
+
+	if (ngpus != 0)
+	{
+		FPRINTF(stderr, "per-GPU (#gpu = %u):\n", ngpus);
+
+		FPRINTF(stderr, "\tinit: %2.2f us\n", init_timing/(1000*ngpus));
+		FPRINTF(stderr, "\tshutdown: %2.2f us\n", shutdown_timing/(1000*ngpus));
+	}
+
+	starpu_shutdown();
+
+	return EXIT_SUCCESS;
+}