|
@@ -1,6 +1,6 @@
|
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
*
|
|
|
- * Copyright (C) 2010 Université de Bordeaux 1
|
|
|
+ * Copyright (C) 2010, 2012 Université de Bordeaux 1
|
|
|
* Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
|
|
|
*
|
|
|
* StarPU is free software; you can redistribute it and/or modify
|
|
@@ -44,14 +44,14 @@ static void redux_cuda_kernel(void *descr[], void *arg)
|
|
|
unsigned host_dst, host_src;
|
|
|
|
|
|
/* This is a dummy technique of course */
|
|
|
- cudaMemcpy(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost);
|
|
|
- cudaMemcpy(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost);
|
|
|
- cudaThreadSynchronize();
|
|
|
+ cudaMemcpyAsync(&host_src, src, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
|
|
|
+ cudaMemcpyAsync(&host_dst, dst, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
|
|
|
+ cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
|
|
|
host_dst += host_src;
|
|
|
|
|
|
- cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
|
|
|
- cudaThreadSynchronize();
|
|
|
+ cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
|
|
|
+ cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
}
|
|
|
|
|
|
static void neutral_cuda_kernel(void *descr[], void *arg)
|
|
@@ -62,8 +62,8 @@ static void neutral_cuda_kernel(void *descr[], void *arg)
|
|
|
|
|
|
/* This is a dummy technique of course */
|
|
|
unsigned host_dst = 0;
|
|
|
- cudaMemcpy(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice);
|
|
|
- cudaThreadSynchronize();
|
|
|
+ cudaMemcpyAsync(dst, &host_dst, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
|
|
|
+ cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
}
|
|
|
#endif
|
|
|
|
|
@@ -183,13 +183,13 @@ static void increment_cuda_kernel(void *descr[], void *arg)
|
|
|
unsigned host_token;
|
|
|
|
|
|
/* This is a dummy technique of course */
|
|
|
- cudaMemcpy(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost);
|
|
|
- cudaThreadSynchronize();
|
|
|
+ cudaMemcpyAsync(&host_token, tokenptr, sizeof(unsigned), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
|
|
|
+ cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
|
|
|
host_token++;
|
|
|
|
|
|
- cudaMemcpy(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice);
|
|
|
- cudaThreadSynchronize();
|
|
|
+ cudaMemcpyAsync(tokenptr, &host_token, sizeof(unsigned), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
|
|
|
+ cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
}
|
|
|
#endif
|
|
|
|