
provide good examples by always using cudaMemsetAsync, not cudaMemset

Samuel Thibault 13 years ago
parent
commit
d2cd1868e2
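The commit applies one pattern throughout: in a StarPU CUDA codelet, device memory is cleared with cudaMemsetAsync() on the worker's local stream returned by starpu_cuda_get_local_stream(), and the codelet waits on that stream before returning, instead of calling cudaMemset() (which targets the default CUDA stream) optionally followed by a device-wide cudaThreadSynchronize(). A minimal sketch of the pattern, not taken from the commit (the clear_vector_cuda codelet name is made up for illustration):

#include <starpu.h>
#include <starpu_cuda.h>
#include <cuda_runtime.h>

static void clear_vector_cuda(void *descr[], void *cl_arg)
{
	float *v = (float *) STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	(void) cl_arg;

	/* Queue the memset on the worker's local StarPU-managed stream
	 * instead of issuing a synchronous cudaMemset() on the default stream. */
	cudaMemsetAsync(v, 0, n * sizeof(float), starpu_cuda_get_local_stream());

	/* This codelet is synchronous, so wait for the local stream
	 * before returning control to StarPU. */
	cudaStreamSynchronize(starpu_cuda_get_local_stream());
}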

+ 10 - 3
examples/audio/starpu_audio_processing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
@@ -179,9 +179,11 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 	{
 		cures = cufftPlan1d(&plans[workerid].plan, nsamples, CUFFT_R2C, 1);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plans[workerid].plan, starpu_cuda_get_local_stream());

 		cures = cufftPlan1d(&plans[workerid].inv_plan, nsamples, CUFFT_C2R, 1);
 		STARPU_ASSERT(cures == CUFFT_SUCCESS);
+		cufftSetStream(plans[workerid].inv_plan, starpu_cuda_get_local_stream());

 		cudaMalloc((void **)&plans[workerid].localout,
 					nsamples*sizeof(cufftComplex));
@@ -198,11 +200,11 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *
 	
 	/* filter low freqs */
 	unsigned lowfreq_index = (LOWFREQ*nsamples)/SAMPLERATE;
-	cudaMemset(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex));
+	cudaMemsetAsync(&localout[0], 0, lowfreq_index*sizeof(fftwf_complex), starpu_cuda_get_local_stream());

 	/* filter high freqs */
 	unsigned hifreq_index = (HIFREQ*nsamples)/SAMPLERATE;
-	cudaMemset(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex));
+	cudaMemsetAsync(&localout[hifreq_index], nsamples/2, (nsamples/2 - hifreq_index)*sizeof(fftwf_complex), starpu_cuda_get_local_stream());

 	/* inverse FFT */
 	cures = cufftExecC2R(plans[workerid].inv_plan, localout, localA);
@@ -210,6 +212,7 @@ static void band_filter_kernel_gpu(void *descr[], __attribute__((unused)) void *

 	/* FFTW does not normalize its output ! */
 	cublasSscal (nsamples, 1.0f/nsamples, localA, 1);
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif

@@ -410,6 +413,8 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

+	starpu_helper_cublas_init();
+
 	starpu_vector_data_register(&A_handle, 0, (uintptr_t)A, niter*nsamples, sizeof(float));

 	struct starpu_data_filter f =
@@ -458,6 +463,8 @@ int main(int argc, char **argv)
 	starpu_data_unpartition(A_handle, 0);
 	starpu_data_unregister(A_handle);

+	starpu_helper_cublas_shutdown();
+
 	/* we are done ! */
 	starpu_shutdown();


+ 5 - 1
examples/gl_interop/gl_interop.c

@@ -27,13 +27,17 @@
 #include <starpu.h>
 #include <unistd.h>
 #include <GL/glut.h>
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif

 void dummy(void *buffers[], void *cl_arg)
 {
 	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);

 	printf("Codelet running\n");
-	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	cudaMemsetAsync(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	printf("Codelet done\n");
 }


+ 5 - 1
examples/gl_interop/gl_interop_idle.c

@@ -30,13 +30,17 @@
 #include <starpu.h>
 #include <unistd.h>
 #include <GL/glut.h>
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif

 void dummy(void *buffers[], void *cl_arg)
 {
 	float *v = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);

 	printf("Codelet running\n");
-	cudaMemset(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float));
+	cudaMemsetAsync(v, 0, STARPU_VECTOR_GET_NX(buffers[0]) * sizeof(float), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	printf("Codelet done\n");
 }


+ 2 - 2
examples/pi/pi_redux.c

@@ -235,8 +235,8 @@ static void init_cpu_func(void *descr[], void *cl_arg)
 static void init_cuda_func(void *descr[], void *cl_arg)
 {
         unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
-        cudaMemset(val, 0, sizeof(unsigned long));
-        cudaThreadSynchronize();
+        cudaMemsetAsync(val, 0, sizeof(unsigned long), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif


+ 2 - 2
examples/reductions/dot_product.c

@@ -79,8 +79,8 @@ void init_cpu_func(void *descr[], void *cl_arg)
 void init_cuda_func(void *descr[], void *cl_arg)
 {
 	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
-	cudaMemset(dot, 0, sizeof(DOT_TYPE));
-	cudaThreadSynchronize();
+	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif


+ 7 - 2
tests/datawizard/write_only_tmp_buffer.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2009, 2010-2012  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,6 +23,10 @@
 #include <stdlib.h>
 #include "../helper.h"

+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
+
 #define VECTORSIZE	1024

 starpu_data_handle_t v_handle;
@@ -53,7 +57,8 @@ static void cuda_codelet_null(void *descr[], __attribute__ ((unused)) void *_arg

 	char *buf = (char *)STARPU_VECTOR_GET_PTR(descr[0]);

-	cudaMemset(buf, 42, 1);
+	cudaMemsetAsync(buf, 42, 1, starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif


+ 6 - 3
tests/perfmodels/non_linear_regression_based.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011-2012  Université de Bordeaux 1
  * Copyright (C) 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2012 inria
  *
@@ -21,6 +21,9 @@
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
 #endif
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
 #include "../helper.h"

 #ifdef STARPU_USE_CUDA
@@ -31,8 +34,8 @@ static void memset_cuda(void *descr[], void *arg)
 	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

-	cudaMemset(ptr, 42, n * sizeof(*ptr));
-	cudaThreadSynchronize();
+	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif


+ 5 - 2
tests/perfmodels/regression_based.c

@@ -21,6 +21,9 @@
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
 #endif
+#ifdef STARPU_USE_CUDA
+#include <starpu_cuda.h>
+#endif
 #include "../helper.h"

 #ifdef STARPU_USE_CUDA
@@ -31,8 +34,8 @@ static void memset_cuda(void *descr[], void *arg)
 	int *ptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

-	cudaMemset(ptr, 42, n * sizeof(*ptr));
-	cudaThreadSynchronize();
+	cudaMemsetAsync(ptr, 42, n * sizeof(*ptr), starpu_cuda_get_local_stream());
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 #endif
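Beyond the cudaMemsetAsync() conversion, the audio example above also attaches its cuFFT plans to the same per-worker stream with cufftSetStream(), and its main() now brackets the run with starpu_helper_cublas_init()/starpu_helper_cublas_shutdown() around the cublasSscal() call in the codelet. A condensed sketch of that setup, under stated assumptions (the make_plan_on_local_stream() helper and the simplified main() are illustrative, not part of the commit):

#include <starpu.h>
#include <starpu_cuda.h>
#include <cufft.h>

/* Create a cuFFT plan and bind it to the worker's local stream, so that
 * cufftExec*() work runs on the same stream as the cudaMemsetAsync() calls.
 * Meant to be called from inside a CUDA codelet, where a local stream exists. */
void make_plan_on_local_stream(cufftHandle *plan, int nsamples)
{
	cufftResult cures = cufftPlan1d(plan, nsamples, CUFFT_R2C, 1);
	STARPU_ASSERT(cures == CUFFT_SUCCESS);
	cufftSetStream(*plan, starpu_cuda_get_local_stream());
}

int main(void)
{
	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* Initialize CUBLAS on the CUDA workers once, right after starpu_init(). */
	starpu_helper_cublas_init();

	/* ... register data, partition, submit the filtering tasks ... */

	/* Tear CUBLAS down before shutting StarPU down. */
	starpu_helper_cublas_shutdown();
	starpu_shutdown();
	return 0;
}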