Browse Source

Use a separate stream for gpu in/out, and peer, as some cards can actually do some transfers in parallel

Samuel Thibault 12 years ago
parent
commit
27960c28bc
3 changed files with 39 additions and 11 deletions
  1. 3 3
      src/datawizard/copy_driver.c
  2. 32 6
      src/drivers/cuda/driver_cuda.c
  3. 4 2
      src/drivers/cuda/driver_cuda.h

+ 3 - 3
src/datawizard/copy_driver.c

@@ -169,7 +169,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 
-			stream = starpu_cuda_get_local_transfer_stream();
+			stream = starpu_cuda_get_local_out_transfer_stream();
 			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
 
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
@@ -195,7 +195,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
 				STARPU_CUDA_REPORT_ERROR(cures);
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 
-			stream = starpu_cuda_get_local_transfer_stream();
+			stream = starpu_cuda_get_local_in_transfer_stream();
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
@@ -218,7 +218,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			cures = cudaEventCreate(&req->async_channel.event.cuda_event);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 
-			stream = starpu_cuda_get_local_transfer_stream();
+			stream = starpu_cuda_get_local_peer_transfer_stream();
 			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
 
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);

+ 32 - 6
src/drivers/cuda/driver_cuda.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2011-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
@@ -38,7 +38,9 @@
 static int ncudagpus;
 static int ncudagpus;
 
 
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t streams[STARPU_NMAXWORKERS];
-static cudaStream_t transfer_streams[STARPU_NMAXWORKERS];
+static cudaStream_t out_transfer_streams[STARPU_NMAXWORKERS];
+static cudaStream_t in_transfer_streams[STARPU_NMAXWORKERS];
+static cudaStream_t peer_transfer_streams[STARPU_NMAXWORKERS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
 static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
@@ -110,11 +112,25 @@ size_t starpu_cuda_get_global_mem_size(unsigned devid)
 	return (size_t)props[devid].totalGlobalMem;
 	return (size_t)props[devid].totalGlobalMem;
 }
 }
 
 
-cudaStream_t starpu_cuda_get_local_transfer_stream(void)
+cudaStream_t starpu_cuda_get_local_in_transfer_stream(void)
 {
 {
 	int worker = starpu_worker_get_id();
 	int worker = starpu_worker_get_id();
 
 
-	return transfer_streams[worker];
+	return in_transfer_streams[worker];
+}
+
+cudaStream_t starpu_cuda_get_local_out_transfer_stream(void)
+{
+	int worker = starpu_worker_get_id();
+
+	return out_transfer_streams[worker];
+}
+
+cudaStream_t starpu_cuda_get_local_peer_transfer_stream(void)
+{
+	int worker = starpu_worker_get_id();
+
+	return peer_transfer_streams[worker];
 }
 }
 
 
 cudaStream_t starpu_cuda_get_local_stream(void)
 cudaStream_t starpu_cuda_get_local_stream(void)
@@ -226,7 +242,15 @@ static void init_context(unsigned devid)
 	if (STARPU_UNLIKELY(cures))
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 
 
-	cures = cudaStreamCreate(&transfer_streams[workerid]);
+	cures = cudaStreamCreate(&in_transfer_streams[workerid]);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaStreamCreate(&out_transfer_streams[workerid]);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaStreamCreate(&peer_transfer_streams[workerid]);
 	if (STARPU_UNLIKELY(cures))
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 }
@@ -236,7 +260,9 @@ static void deinit_context(int workerid, unsigned devid)
 	cudaError_t cures;
 	cudaError_t cures;
 
 
 	cudaStreamDestroy(streams[workerid]);
 	cudaStreamDestroy(streams[workerid]);
-	cudaStreamDestroy(transfer_streams[workerid]);
+	cudaStreamDestroy(in_transfer_streams[workerid]);
+	cudaStreamDestroy(out_transfer_streams[workerid]);
+	cudaStreamDestroy(peer_transfer_streams[workerid]);
 
 
 	unlimit_gpu_mem_if_needed(devid);
 	unlimit_gpu_mem_if_needed(devid);
 
 

+ 4 - 2
src/drivers/cuda/driver_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010, 2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -44,7 +44,9 @@ unsigned _starpu_get_cuda_device_count(void);
 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);
 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);
 void _starpu_init_cuda(void);
 void _starpu_init_cuda(void);
 void *_starpu_cuda_worker(void *);
 void *_starpu_cuda_worker(void *);
-cudaStream_t starpu_cuda_get_local_transfer_stream(void);
+cudaStream_t starpu_cuda_get_local_in_transfer_stream(void);
+cudaStream_t starpu_cuda_get_local_out_transfer_stream(void);
+cudaStream_t starpu_cuda_get_local_peer_transfer_stream(void);
 
 
 int _starpu_run_cuda(struct starpu_driver *);
 int _starpu_run_cuda(struct starpu_driver *);
 int _starpu_cuda_driver_init(struct starpu_driver *);
 int _starpu_cuda_driver_init(struct starpu_driver *);