|
|
@@ -29,16 +29,16 @@
|
|
|
|
|
|
static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
|
|
|
#ifdef USE_CUDA
|
|
|
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
|
|
|
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
|
|
|
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
|
|
|
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
|
|
|
#endif
|
|
|
|
|
|
static const struct copy_data_methods_s bcsr_copy_data_methods_s = {
|
|
|
.ram_to_ram = dummy_copy_ram_to_ram,
|
|
|
.ram_to_spu = NULL,
|
|
|
#ifdef USE_CUDA
|
|
|
- .ram_to_cuda = copy_ram_to_cublas,
|
|
|
- .cuda_to_ram = copy_cublas_to_ram,
|
|
|
+ .ram_to_cuda = copy_ram_to_cuda,
|
|
|
+ .cuda_to_ram = copy_cuda_to_ram,
|
|
|
#endif
|
|
|
.cuda_to_cuda = NULL,
|
|
|
.cuda_to_spu = NULL,
|
|
|
@@ -261,15 +261,15 @@ static size_t allocate_bcsr_buffer_on_node(starpu_data_handle handle, uint32_t d
|
|
|
break;
|
|
|
#ifdef USE_CUDA
|
|
|
case CUDA_RAM:
|
|
|
- cublasAlloc(nnz*r*c, elemsize, (void **)&addr_nzval);
|
|
|
+ cudaMalloc((void **)&addr_nzval, nnz*r*c*elemsize);
|
|
|
if (!addr_nzval)
|
|
|
goto fail_nzval;
|
|
|
|
|
|
- cublasAlloc(nnz, sizeof(uint32_t), (void **)&addr_colind);
|
|
|
+ cudaMalloc((void **)&addr_colind, nnz*sizeof(uint32_t));
|
|
|
if (!addr_colind)
|
|
|
goto fail_colind;
|
|
|
|
|
|
- cublasAlloc((nrow+1), sizeof(uint32_t), (void **)&addr_rowptr);
|
|
|
+ cudaMalloc((void **)&addr_rowptr, (nrow+1)*sizeof(uint32_t));
|
|
|
if (!addr_rowptr)
|
|
|
goto fail_rowptr;
|
|
|
|
|
|
@@ -296,7 +296,7 @@ fail_rowptr:
|
|
|
free((void *)addr_colind);
|
|
|
#ifdef USE_CUDA
|
|
|
case CUDA_RAM:
|
|
|
- cublasFree((void*)addr_colind);
|
|
|
+ cudaFree((void*)addr_colind);
|
|
|
break;
|
|
|
#endif
|
|
|
default:
|
|
|
@@ -309,7 +309,7 @@ fail_colind:
|
|
|
free((void *)addr_nzval);
|
|
|
#ifdef USE_CUDA
|
|
|
case CUDA_RAM:
|
|
|
- cublasFree((void*)addr_nzval);
|
|
|
+ cudaFree((void*)addr_nzval);
|
|
|
break;
|
|
|
#endif
|
|
|
default:
|
|
|
@@ -337,9 +337,9 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
|
|
|
break;
|
|
|
#ifdef USE_CUDA
|
|
|
case CUDA_RAM:
|
|
|
- cublasFree((void*)bcsr_interface->nzval);
|
|
|
- cublasFree((void*)bcsr_interface->colind);
|
|
|
- cublasFree((void*)bcsr_interface->rowptr);
|
|
|
+ cudaFree((void*)bcsr_interface->nzval);
|
|
|
+ cudaFree((void*)bcsr_interface->colind);
|
|
|
+ cudaFree((void*)bcsr_interface->rowptr);
|
|
|
break;
|
|
|
#endif
|
|
|
default:
|
|
|
@@ -348,7 +348,7 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
|
|
|
}
|
|
|
|
|
|
#ifdef USE_CUDA
|
|
|
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
|
|
|
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
|
|
|
{
|
|
|
starpu_bcsr_interface_t *src_bcsr;
|
|
|
starpu_bcsr_interface_t *dst_bcsr;
|
|
|
@@ -363,21 +363,28 @@ static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint
|
|
|
uint32_t r = src_bcsr->r;
|
|
|
uint32_t c = src_bcsr->c;
|
|
|
|
|
|
- cublasGetVector(nnz*r*c, elemsize, (uint8_t *)src_bcsr->nzval, 1,
|
|
|
- (uint8_t *)dst_bcsr->nzval, 1);
|
|
|
+ cudaError_t cures;
|
|
|
|
|
|
- cublasGetVector(nnz, sizeof(uint32_t), (uint8_t *)src_bcsr->colind, 1,
|
|
|
- (uint8_t *)dst_bcsr->colind, 1);
|
|
|
+ cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, cudaMemcpyDeviceToHost);
|
|
|
+ if (STARPU_UNLIKELY(cures))
|
|
|
+ CUDA_REPORT_ERROR(cures);
|
|
|
+
|
|
|
+ cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
|
|
+ if (STARPU_UNLIKELY(cures))
|
|
|
+ CUDA_REPORT_ERROR(cures);
|
|
|
+
|
|
|
+ cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
|
|
+ if (STARPU_UNLIKELY(cures))
|
|
|
+ CUDA_REPORT_ERROR(cures);
|
|
|
+
|
|
|
+ cudaThreadSynchronize();
|
|
|
|
|
|
- cublasGetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_bcsr->rowptr, 1,
|
|
|
- (uint8_t *)dst_bcsr->rowptr, 1);
|
|
|
-
|
|
|
TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
|
|
|
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
|
|
|
{
|
|
|
starpu_bcsr_interface_t *src_bcsr;
|
|
|
starpu_bcsr_interface_t *dst_bcsr;
|
|
|
@@ -392,15 +399,22 @@ static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint
|
|
|
uint32_t r = src_bcsr->r;
|
|
|
uint32_t c = src_bcsr->c;
|
|
|
|
|
|
- cublasSetVector(nnz*r*c, elemsize, (uint8_t *)src_bcsr->nzval, 1,
|
|
|
- (uint8_t *)dst_bcsr->nzval, 1);
|
|
|
+ cudaError_t cures;
|
|
|
|
|
|
- cublasSetVector(nnz, sizeof(uint32_t), (uint8_t *)src_bcsr->colind, 1,
|
|
|
- (uint8_t *)dst_bcsr->colind, 1);
|
|
|
+ cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, cudaMemcpyHostToDevice);
|
|
|
+ if (STARPU_UNLIKELY(cures))
|
|
|
+ CUDA_REPORT_ERROR(cures);
|
|
|
+
|
|
|
+ cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), cudaMemcpyHostToDevice);
|
|
|
+ if (STARPU_UNLIKELY(cures))
|
|
|
+ CUDA_REPORT_ERROR(cures);
|
|
|
+
|
|
|
+ cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyHostToDevice);
|
|
|
+ if (STARPU_UNLIKELY(cures))
|
|
|
+ CUDA_REPORT_ERROR(cures);
|
|
|
+
|
|
|
+ cudaThreadSynchronize();
|
|
|
|
|
|
- cublasSetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_bcsr->rowptr, 1,
|
|
|
- (uint8_t *)dst_bcsr->rowptr, 1);
|
|
|
-
|
|
|
TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
|
|
|
|
|
|
return 0;
|