瀏覽代碼

merge trunk

Corentin Salingue 12 年之前
父節點
當前提交
231ac3fdf2

+ 2 - 0
configure.ac

@@ -258,6 +258,8 @@ AC_CHECK_HEADERS([malloc.h], [AC_DEFINE([STARPU_HAVE_MALLOC_H], [1], [Define to
 AC_CHECK_HEADERS([valgrind/valgrind.h], [AC_DEFINE([STARPU_HAVE_VALGRIND_H], [1], [Define to 1 if you have the <valgrind/valgrind.h> header file.])])
 AC_CHECK_HEADERS([valgrind/helgrind.h], [AC_DEFINE([STARPU_HAVE_HELGRIND_H], [1], [Define to 1 if you have the <valgrind/helgrind.h> header file.])])
 
+AC_CHECK_HEADERS([aio.h])
+
 # This defines HAVE_SYNC_VAL_COMPARE_AND_SWAP
 STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP
 

+ 4 - 2
doc/doxygen/chapters/mpi_support.doxy

@@ -388,6 +388,8 @@ MPI (complex numbers),
 <c>starpu_mpi_insert_task</c>. The non-distributed version can check for
 <algorithm correctness in 1-node configuration, the distributed version uses
 exactly the same source code, to be used over MPI,
-<li><c>mpi_lu</c> is an LU decomposition example, provided in both explicit and
-implicit versions.
+<li><c>mpi_lu</c> is an LU decomposition example, provided in three versions:
+<c>plu_example</c> uses explicit MPI data transfers, <c>plu_implicit_example</c>
+uses implicit MPI data transfers, <c>plu_outofcore_example</c> uses implicit MPI
+data transfers and supports data matrices which do not fit in memory (out-of-core).
 </ul>

+ 29 - 2
mpi/examples/Makefile.am

@@ -49,6 +49,7 @@ EXTRA_DIST = 				\
 	mpi_lu/mpi_lu-double.h		\
 	mpi_lu/plu_example.c		\
 	mpi_lu/plu_implicit_example.c	\
+	mpi_lu/plu_outofcore_example.c	\
 	mpi_lu/plu_solve.c		\
 	mpi_lu/pxlu.h			\
 	mpi_lu/pxlu.c			\
@@ -105,7 +106,9 @@ examplebin_PROGRAMS += 			\
 	mpi_lu/plu_example_float	\
 	mpi_lu/plu_example_double	\
 	mpi_lu/plu_implicit_example_float	\
-	mpi_lu/plu_implicit_example_double
+	mpi_lu/plu_implicit_example_double	\
+	mpi_lu/plu_outofcore_example_float	\
+	mpi_lu/plu_outofcore_example_double
 
 mpi_lu_plu_example_float_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
@@ -149,7 +152,31 @@ mpi_lu_plu_implicit_example_double_LDADD =	\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
 mpi_lu_plu_implicit_example_double_SOURCES =	\
-	mpi_lu/plu_implicit_example_double.c	\
+	mpi_lu/plu_outofcore_example_double.c	\
+	mpi_lu/plu_solve_double.c		\
+	mpi_lu/pdlu_kernels.c			\
+	mpi_lu/pdlu_implicit.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_outofcore_example_float_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_outofcore_example_float_SOURCES =	\
+	mpi_lu/plu_outofcore_example_float.c	\
+	mpi_lu/plu_solve_float.c		\
+	mpi_lu/pslu_kernels.c			\
+	mpi_lu/pslu_implicit.c			\
+	$(top_srcdir)/examples/common/blas.c
+
+mpi_lu_plu_outofcore_example_double_LDADD =	\
+	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
+	$(STARPU_LIBNUMA_LDFLAGS)				\
+	$(STARPU_BLAS_LDFLAGS) -lm
+
+mpi_lu_plu_outofcore_example_double_SOURCES =	\
+	mpi_lu/plu_outofcore_example_double.c	\
 	mpi_lu/plu_solve_double.c		\
 	mpi_lu/pdlu_kernels.c			\
 	mpi_lu/pdlu_implicit.c			\

+ 381 - 0
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -0,0 +1,381 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+#include <starpu.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include "pxlu.h"
+//#include "pxlu_kernels.h"
+
+#ifdef STARPU_HAVE_LIBNUMA
+#include <numaif.h>
+#endif
+
+static unsigned long size = 4096;
+static unsigned nblocks = 16;
+static unsigned check = 0;
+static int p = 1;
+static int q = 1;
+static unsigned display = 0;
+static char *path = "/tmp/starpu-mpi_LU";
+
+#ifdef STARPU_HAVE_LIBNUMA
+static unsigned numa = 0;
+#endif
+
+static size_t allocated_memory = 0;
+
+static starpu_data_handle_t *dataA_handles;
+
+int get_block_rank(unsigned i, unsigned j);
+
+static void parse_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-size") == 0) {
+			char *argptr;
+			size = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0) {
+			char *argptr;
+			nblocks = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-check") == 0) {
+			check = 1;
+		}
+
+		if (strcmp(argv[i], "-display") == 0) {
+			display = 1;
+		}
+
+		if (strcmp(argv[i], "-numa") == 0) {
+#ifdef STARPU_HAVE_LIBNUMA
+			numa = 1;
+#else
+			if (rank == 0)
+				fprintf(stderr, "Warning: libnuma is not available\n");
+#endif
+		}
+
+		if (strcmp(argv[i], "-p") == 0) {
+			char *argptr;
+			p = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-q") == 0) {
+			char *argptr;
+			q = strtol(argv[++i], &argptr, 10);
+		}
+
+		if (strcmp(argv[i], "-path") == 0)
+			path = argv[++i];
+
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
+			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q] [-path PATH]\n", argv[0]);
+			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
+			exit(0);
+		}
+	}
+}
+
+unsigned STARPU_PLU(display_flag)(void)
+{
+	return display;
+}
+
+static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
+{
+	const unsigned block_size = (psize/pnblocks);
+
+	unsigned i, j;
+	for (i = 0; i < block_size; i++)
+	     for (j = 0; j < block_size; j++)
+	     {
+		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
+	     }
+}
+
+static void create_matrix()
+{
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+	TYPE *blockptr = malloc(blocksize);
+	int fd;
+	char *filename;
+	unsigned filename_length = strlen(path) + 1 + sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1;
+
+	filename = malloc(filename_length);
+
+	allocated_memory += nblocks*nblocks*blocksize*sizeof(TYPE *);
+
+	/* Create the whole matrix on the disk */
+	unsigned i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			fill_block_with_random(blockptr, size, nblocks);
+			if (i == j)
+			{
+				unsigned tmp;
+				for (tmp = 0; tmp < size/nblocks; tmp++)
+				{
+					blockptr[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+				}
+			}
+			snprintf(filename, filename_length, "%s/%u,%u", path, i, j);
+			fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0777);
+			if (fd < 0) {
+				perror("open");
+				exit(1);
+			}
+			if (write(fd, blockptr, blocksize) != (ssize_t) blocksize) {
+				fprintf(stderr,"short write");
+				exit(1);
+			}
+			if (close(fd) < 0) {
+				perror("close");
+				exit(1);
+			}
+		}
+	}
+
+	free(blockptr);
+	free(filename);
+}
+
+static void init_matrix(int rank)
+{
+	/* Allocate a grid of data handles, not all of them have to be allocated later on */
+	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
+
+	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
+
+	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
+
+	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
+
+	/* Allocate all the blocks that belong to this mpi node */
+	unsigned i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			int block_rank = get_block_rank(i, j);
+//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
+
+			if (block_rank == rank)
+			{
+				void *disk_obj;
+				snprintf(filename, sizeof(filename), "%u,%u", i, j);
+				/* Register it to StarPU */
+				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
+				if (!disk_obj) {
+					fprintf(stderr,"could not open %s\n", filename);
+					exit(1);
+				}
+				starpu_matrix_data_register(handleptr, disk_node,
+					(uintptr_t) disk_obj, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			else {
+				starpu_matrix_data_register(handleptr, -1,
+					0, size/nblocks,
+					size/nblocks, size/nblocks, sizeof(TYPE));
+			}
+			starpu_data_set_rank(*handleptr, block_rank);
+			starpu_data_set_tag(*handleptr, j+i*nblocks);
+		}
+	}
+
+	//display_all_blocks(nblocks, size/nblocks);
+}
+
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	/* This does not really make sense in out of core */
+	assert(0);
+}
+
+int get_block_rank(unsigned i, unsigned j)
+{
+	/* Take a 2D block cyclic distribution */
+	/* NB: p (resp. q) is for "direction" i (resp. j) */
+	return (j % q) * p + (i % p);
+}
+
+starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
+int main(int argc, char **argv)
+{
+	int rank;
+	int world_size;
+	int ret;
+	unsigned i, j;
+
+	starpu_srand48((long int)time(NULL));
+
+	parse_args(argc, argv);
+
+	ret = mkdir(path, 0777);
+	if (ret != 0 && errno != EEXIST) {
+		fprintf(stderr,"%s does not exist\n", path);
+		exit(1);
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	ret = starpu_mpi_init(&argc, &argv, 1);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+
+	STARPU_ASSERT(p*q == world_size);
+
+	starpu_cublas_init();
+
+	/*
+	 * 	Problem Init
+	 */
+
+	if (rank == 0)
+		create_matrix();
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	init_matrix(rank);
+
+	if (rank == 0)
+		fprintf(stderr, "%dMB on disk\n", (int)(allocated_memory/(1024*1024)));
+
+	TYPE *a_r = NULL;
+//	STARPU_PLU(display_data_content)(a_r, size);
+
+	TYPE *x, *y;
+
+	if (check)
+	{
+		x = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(x);
+
+		y = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			unsigned ind;
+			for (ind = 0; ind < size; ind++)
+				x[ind] = (TYPE)starpu_drand48();
+		}
+
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+	}
+
+	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
+
+	/*
+	 * 	Report performance
+	 */
+
+	if (rank == 0)
+	{
+		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
+
+		unsigned n = size;
+		double flop = (2.0f*n*n*n)/3.0f;
+		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
+	}
+
+	/*
+	 *	Test Result Correctness
+	 */
+
+	if (check)
+	{
+		/*
+		 *	Compute || A - LU ||
+		 */
+
+		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
+
+#if 0
+		/*
+		 *	Compute || Ax - LUx ||
+		 */
+
+		unsigned ind;
+
+		y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+
+		if (rank == 0)
+		{
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		/* Compute y2 = y2 - y */
+		CPU_AXPY(size, -1.0, y, 1, y2, 1);
+
+		TYPE err = CPU_ASUM(size, y2, 1);
+		int max = CPU_IAMAX(size, y2, 1);
+
+		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
+		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
+	}
+
+	/*
+	 * 	Termination
+	 */
+	for (j = 0; j < nblocks; j++)
+	{
+		for (i = 0; i < nblocks; i++)
+		{
+			starpu_data_unregister(dataA_handles[j+nblocks*i]);
+		}
+	}
+
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+	starpu_shutdown();
+
+	return 0;
+}

+ 19 - 0
mpi/examples/mpi_lu/plu_outofcore_example_double.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-double.h"
+#include "plu_outofcore_example.c"

+ 19 - 0
mpi/examples/mpi_lu/plu_outofcore_example_float.c

@@ -0,0 +1,19 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
+ * Copyright (C) 2010  Centre National de la Recherche Scientifique
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include "mpi_lu-float.h"
+#include "plu_outofcore_example.c"

+ 1 - 88
src/core/disk_ops/disk_stdio.c

@@ -19,7 +19,6 @@
 #include <stdlib.h>
 #include <sys/stat.h>
 #include <sys/time.h>
-#include <aio.h>
 #include <errno.h>
 #include <time.h>
 
@@ -67,7 +66,7 @@ starpu_stdio_alloc (void *base, size_t size)
 
 #ifdef STARPU_HAVE_WINDOWS
         _mktemp(baseCpy);
-        id = open(baseCpy, "rb+");
+        id = open(baseCpy, O_RDWR | O_BINARY);
 #else
 	id = mkstemp(baseCpy);
 
@@ -210,26 +209,6 @@ starpu_stdio_read (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, void *buf, off
 }
 
 static int
-starpu_stdio_async_read (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, void *buf, off_t offset, size_t size, void * async_channel)
-{
-	struct starpu_stdio_obj * tmp = (struct starpu_stdio_obj *) obj;
-      
-	struct _starpu_async_channel * channel = (struct _starpu_async_channel *) async_channel;
-        struct aiocb *aiocb = &channel->event.disk_event._starpu_aiocb_disk;
-        
-	memset(aiocb, 0, sizeof(struct aiocb));
-        
-	aiocb->aio_fildes = tmp->descriptor;
-        aiocb->aio_offset = offset;
-	aiocb->aio_nbytes = size;
-        aiocb->aio_buf = buf;
-        aiocb->aio_reqprio = 0;
-        aiocb->aio_lio_opcode = LIO_NOP; 
-
-	return aio_read(aiocb);
-}
-
-static int
 starpu_stdio_full_read(unsigned node, void *base STARPU_ATTRIBUTE_UNUSED, void * obj, void ** ptr, size_t * size)
 {
 	struct starpu_stdio_obj * tmp = (struct starpu_stdio_obj *) obj;
@@ -258,25 +237,6 @@ starpu_stdio_write (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, const void *b
 }
 
 static int
-starpu_stdio_async_write (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, void *buf, off_t offset, size_t size, void * async_channel)
-{
-        struct starpu_stdio_obj * tmp = (struct starpu_stdio_obj *) obj;
-
-        struct _starpu_async_channel * channel = (struct _starpu_async_channel *) async_channel;
-        struct aiocb *aiocb = &channel->event.disk_event._starpu_aiocb_disk ;
-        memset(aiocb, 0, sizeof(struct aiocb));
-
-        aiocb->aio_fildes = tmp->descriptor;
-        aiocb->aio_offset = offset;
-        aiocb->aio_nbytes = size;
-        aiocb->aio_buf = buf;
-        aiocb->aio_reqprio = 0;
-        aiocb->aio_lio_opcode = LIO_NOP; 
-
-        return aio_write(aiocb);
-}
-
-static int
 starpu_stdio_full_write (unsigned node, void * base STARPU_ATTRIBUTE_UNUSED, void * obj, void * ptr, size_t size)
 {
 	struct starpu_stdio_obj * tmp = (struct starpu_stdio_obj *) obj;
@@ -397,49 +357,6 @@ get_stdio_bandwidth_between_disk_and_main_ram(unsigned node)
 	return 1;
 }
 
-static void 
-starpu_stdio_wait_request(void * async_channel)
-{
-	struct _starpu_async_channel * channel = (struct _starpu_async_channel *) async_channel;
-	const struct aiocb * aiocb = &channel->event.disk_event._starpu_aiocb_disk;
-	const struct aiocb * list[1];
-	list[0] = aiocb;
-	int values = -1;
-	int error_disk = EAGAIN;
-	while(values < 0 || error_disk == EAGAIN)
-	{
-		/* Wait the answer of the request TIMESTAMP IS NULL */
-		values = aio_suspend(list, 1, NULL);
-		error_disk = errno;
-	}
-}
-
-static int
-starpu_stdio_test_request(void * async_channel)
-{
-	struct timespec time_wait_request;
-	time_wait_request.tv_sec = 0;
-	time_wait_request.tv_nsec = 0;
-
-        struct _starpu_async_channel * channel = (struct _starpu_async_channel *) async_channel;
-        const struct aiocb * aiocb = &channel->event.disk_event._starpu_aiocb_disk;
-        const struct aiocb * list[1];
-        list[0] = aiocb;
-        int values = -1;
-        int error_disk = EAGAIN;
-        
-	/* Wait the answer of the request */
-        values = aio_suspend(list, 1, &time_wait_request);
-        error_disk = errno;
-	/* request is finished */
-	if (values == 0)
-		return 1;
-	/* values == -1 */
-	if (error_disk == EAGAIN)
-		return 0;
-	/* an error occured */
-	STARPU_ABORT();	
-}
 
 struct starpu_disk_ops starpu_disk_stdio_ops = {
 	.alloc = starpu_stdio_alloc,
@@ -447,15 +364,11 @@ struct starpu_disk_ops starpu_disk_stdio_ops = {
 	.open = starpu_stdio_open,
 	.close = starpu_stdio_close,
 	.read = starpu_stdio_read,
-	.async_read = starpu_stdio_async_read,
 	.write = starpu_stdio_write,
-	.async_write = starpu_stdio_async_write,
 	.plug = starpu_stdio_plug,
 	.unplug = starpu_stdio_unplug,
 	.copy = NULL,
 	.bandwidth = get_stdio_bandwidth_between_disk_and_main_ram,
-	.wait_request = starpu_stdio_wait_request,
-	.test_request = starpu_stdio_test_request,
 	.full_read = starpu_stdio_full_read,
 	.full_write = starpu_stdio_full_write
 };

+ 4 - 2
src/core/disk_ops/disk_unistd.c

@@ -35,7 +35,7 @@ starpu_unistd_alloc (void *base, size_t size)
         struct starpu_unistd_global_obj * obj = malloc(sizeof(struct starpu_unistd_global_obj));
         STARPU_ASSERT(obj != NULL);
 	/* only flags change between unistd and unistd_o_direct */
-	obj->flags = O_RDWR;
+	obj->flags = O_RDWR | O_BINARY;
 	return starpu_unistd_global_alloc (obj, base, size);
 }
 
@@ -46,7 +46,7 @@ starpu_unistd_open (void *base, void *pos, size_t size)
 	struct starpu_unistd_global_obj * obj = malloc(sizeof(struct starpu_unistd_global_obj));
 	STARPU_ASSERT(obj != NULL);
 	/* only flags change between unistd and unistd_o_direct */
-	obj->flags = O_RDWR;
+	obj->flags = O_RDWR | O_BINARY;
 	return starpu_unistd_global_open (obj, base, pos, size);	
 
 }
@@ -62,10 +62,12 @@ struct starpu_disk_ops starpu_disk_unistd_ops = {
 	.unplug = starpu_unistd_global_unplug,
 	.copy = NULL,
 	.bandwidth = get_unistd_global_bandwidth_between_disk_and_main_ram,
+#ifdef HAVE_AIO_H
 	.async_read = starpu_unistd_global_async_read,
 	.async_write = starpu_unistd_global_async_write,
 	.wait_request = starpu_unistd_global_wait_request,
 	.test_request = starpu_unistd_global_test_request,
+#endif
         .full_read = starpu_unistd_global_full_read,
         .full_write = starpu_unistd_global_full_write
 };

+ 4 - 2
src/core/disk_ops/disk_unistd_o_direct.c

@@ -35,7 +35,7 @@ starpu_unistd_o_direct_alloc (void *base, size_t size)
         struct starpu_unistd_global_obj * obj = malloc(sizeof(struct starpu_unistd_global_obj));
         STARPU_ASSERT(obj != NULL);
         /* only flags change between unistd and unistd_o_direct */
-        obj->flags = O_RDWR | O_DIRECT;
+        obj->flags = O_RDWR | O_DIRECT | O_BINARY;
         return starpu_unistd_global_alloc (obj, base, size);
 }
 
@@ -46,7 +46,7 @@ starpu_unistd_o_direct_open (void *base, void *pos, size_t size)
         struct starpu_unistd_global_obj * obj = malloc(sizeof(struct starpu_unistd_global_obj));
         STARPU_ASSERT(obj != NULL);
         /* only flags change between unistd and unistd_o_direct */
-        obj->flags = O_RDWR | O_DIRECT;
+        obj->flags = O_RDWR | O_DIRECT | O_BINARY;
         return starpu_unistd_global_open (obj, base, pos, size);
 
 }
@@ -96,10 +96,12 @@ struct starpu_disk_ops starpu_disk_unistd_o_direct_ops = {
 	.unplug = starpu_unistd_global_unplug,
 	.copy = NULL,
 	.bandwidth = get_unistd_global_bandwidth_between_disk_and_main_ram,
+#ifdef HAVE_AIO_H
         .async_read = starpu_unistd_global_async_read,
         .async_write = starpu_unistd_global_async_write,
         .wait_request = starpu_unistd_global_wait_request,
         .test_request = starpu_unistd_global_test_request,
+#endif
 	.full_read = starpu_unistd_global_full_read,
 	.full_write = starpu_unistd_global_full_write
 };

+ 11 - 6
src/core/disk_ops/unistd/disk_unistd_global.c

@@ -20,7 +20,9 @@
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <stdint.h>
+#ifdef HAVE_AIO_H
 #include <aio.h>
+#endif
 #include <errno.h>
 
 #include <starpu.h>
@@ -47,12 +49,13 @@ starpu_unistd_global_alloc (struct starpu_unistd_global_obj * obj, void *base, s
 	const char *template = "STARPU_XXXXXX";
 
 	/* create template for mkstemp */
-	unsigned int sizeBase = strlen(base) + strlen(template)+1;
+	unsigned int sizeBase = strlen(base) + 1 + strlen(template)+1;
 
 	char * baseCpy = malloc(sizeBase*sizeof(char));
 	STARPU_ASSERT(baseCpy != NULL);
 
 	strcpy(baseCpy, (char *) base);
+	strcat(baseCpy,"/");
 	strcat(baseCpy,template);
 #ifdef STARPU_LINUX_SYS
 	id = mkostemp(baseCpy, obj->flags);
@@ -118,11 +121,7 @@ starpu_unistd_global_free (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, size_t
 starpu_unistd_global_open (struct starpu_unistd_global_obj * obj, void *base, void *pos, size_t size)
 {
 	/* create template */
-	unsigned int sizeBase = 16;
-	while(sizeBase < (strlen(base)+1+strlen(pos)+1))
-		sizeBase *= 2;
-	
-	char * baseCpy = malloc(sizeBase*sizeof(char));
+	char * baseCpy = malloc(strlen(base)+1+strlen(pos)+1);
 	STARPU_ASSERT(baseCpy != NULL);
 	strcpy(baseCpy,(char *) base);
 	strcat(baseCpy,(char *) "/");
@@ -181,6 +180,7 @@ starpu_unistd_global_read (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, void *
 }
 
 
+#ifdef HAVE_AIO_H
 int
 starpu_unistd_global_async_read (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, void *buf, off_t offset, size_t size, void * async_channel)
 {
@@ -200,6 +200,7 @@ starpu_unistd_global_async_read (void *base STARPU_ATTRIBUTE_UNUSED, void *obj,
 
         return aio_read(aiocb);
 }
+#endif
 
 int
 starpu_unistd_global_full_read(unsigned node, void *base STARPU_ATTRIBUTE_UNUSED, void * obj, void ** ptr, size_t * size)
@@ -232,6 +233,7 @@ starpu_unistd_global_write (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, const
 }
 
 
+#ifdef HAVE_AIO_H
 int
 starpu_unistd_global_async_write (void *base STARPU_ATTRIBUTE_UNUSED, void *obj, void *buf, off_t offset, size_t size, void * async_channel)
 {
@@ -250,6 +252,7 @@ starpu_unistd_global_async_write (void *base STARPU_ATTRIBUTE_UNUSED, void *obj,
 
         return aio_write(aiocb);
 }
+#endif
 
 int
 starpu_unistd_global_full_write (unsigned node, void * base STARPU_ATTRIBUTE_UNUSED, void * obj, void * ptr, size_t size)
@@ -373,6 +376,7 @@ get_unistd_global_bandwidth_between_disk_and_main_ram(unsigned node)
 	return 1;
 }
 
+#ifdef HAVE_AIO_H
 void
 starpu_unistd_global_wait_request(void * async_channel)
 {
@@ -416,3 +420,4 @@ starpu_unistd_global_test_request(void * async_channel)
         /* an error occured */
         STARPU_ABORT();
 }
+#endif

+ 6 - 0
src/core/disk_ops/unistd/disk_unistd_global.h

@@ -17,6 +17,12 @@
 #ifndef __DISK_UNISTD_GLOBAL_H__
 #define __DISK_UNISTD_GLOBAL_H__
 
+#include <fcntl.h>
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
 struct starpu_unistd_global_obj {
         int descriptor;
         char * path;

+ 2 - 2
src/datawizard/copy_driver.c

@@ -402,7 +402,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_DISK_RAM):
 		if(copy_methods->any_to_any)
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
 
 		else
 		{
@@ -419,7 +419,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_CPU_RAM):
 		if(copy_methods->any_to_any) 
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled()  ? &req->async_channel : NULL);
 		else
 		{
 			struct starpu_disk_interface * disk_interface = (struct starpu_disk_interface *) src_interface; 

+ 5 - 1
src/datawizard/copy_driver.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012  Université de Bordeaux 1
+ * Copyright (C) 2010, 2012-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2013  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,7 +18,9 @@
 #ifndef __COPY_DRIVER_H__
 #define __COPY_DRIVER_H__
 
+#ifdef HAVE_AIO_H
 #include <aio.h>
+#endif
 
 #include <common/config.h>
 #include <datawizard/memory_nodes.h>
@@ -52,7 +54,9 @@ struct _starpu_mic_async_event
 
 struct _starpu_disk_async_event
 {
+#ifdef HAVE_AIO_H
         struct aiocb _starpu_aiocb_disk;
+#endif
 	unsigned memory_node;
 };
 

+ 24 - 1
src/datawizard/interfaces/block_interface.c

@@ -662,7 +662,7 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 {
 	struct starpu_block_interface *src_block = (struct starpu_block_interface *) src_interface;
 	struct starpu_block_interface *dst_block = (struct starpu_block_interface *) dst_interface;
-	int ret;
+	int ret = 0;
 
 	uint32_t nx = dst_block->nx;
 	uint32_t ny = dst_block->ny;
@@ -675,10 +675,33 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 	uint32_t ldz_dst = dst_block->ldz;
 
 	unsigned y, z;
+
+	if (ldy_src == nx && ldy_dst == nx && ldz_src == ny && ldz_dst == ny)
+	{
+		/* Optimise non-partitioned and z-partitioned case */
+		if (starpu_interface_copy(src_block->dev_handle, src_block->offset, src_node,
+		                          dst_block->dev_handle, dst_block->offset, dst_node,
+		                          nx*ny*nz*elemsize, async_data))
+				ret = -EAGAIN;
+	}
+	else
 	for (z = 0; z < nz; z++)
 	{
+		if (ldy_src == nx && ldy_dst == nx)
+		{
+			/* Optimise y-partitioned case */
+			uint32_t src_offset = z*ldz_src*elemsize;
+			uint32_t dst_offset = z*ldz_dst*elemsize;
+
+			if (starpu_interface_copy(src_block->dev_handle, src_block->offset + src_offset, src_node,
+			                          dst_block->dev_handle, dst_block->offset + dst_offset, dst_node,
+			                          nx*ny*elemsize, async_data))
+				ret = -EAGAIN;
+		}
+		else
 		for (y = 0; y < ny; y++)
 		{
+			/* Eerf, x-partitioned case */
 			uint32_t src_offset = (y*ldy_src + z*ldz_src)*elemsize;
 			uint32_t dst_offset = (y*ldy_dst + z*ldz_dst)*elemsize;
 

+ 10 - 1
src/datawizard/interfaces/matrix_interface.c

@@ -625,7 +625,7 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 {
 	struct starpu_matrix_interface *src_matrix = (struct starpu_matrix_interface *) src_interface;
 	struct starpu_matrix_interface *dst_matrix = (struct starpu_matrix_interface *) dst_interface;
-	int ret;
+	int ret = 0;
 
 	unsigned y;
 	uint32_t nx = dst_matrix->nx;
@@ -635,6 +635,15 @@ static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_int
 	uint32_t ld_src = src_matrix->ld;
 	uint32_t ld_dst = dst_matrix->ld;
 
+	if (ld_src == nx && ld_dst == nx)
+	{
+		/* Optimize unpartitioned and y-partitioned cases */
+		if (starpu_interface_copy(src_matrix->dev_handle, src_matrix->offset, src_node,
+		                          dst_matrix->dev_handle, dst_matrix->offset, dst_node,
+		                          nx*ny*elemsize, async_data))
+			ret = -EAGAIN;
+	}
+	else
 	for (y = 0; y < ny; y++)
 	{
 		uint32_t src_offset = y*ld_src*elemsize;

+ 2 - 2
src/datawizard/memalloc.c

@@ -1059,7 +1059,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 static unsigned
 choose_target(starpu_data_handle_t handle, unsigned node)
 {
-	unsigned target = -1;
+	int target = -1;
 	size_t size_handle = _starpu_data_get_size(handle);
 	if (handle->home_node != -1)
 		/* try to push on RAM if we can before to push on disk */
@@ -1103,7 +1103,7 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 		}
 	}
 	/* we haven't the right to write on the disk */
-	if (starpu_node_get_kind(target) == STARPU_DISK_RAM && _starpu_get_disk_flag(target) == STARPU_DISK_NO_RECLAIM)
+	if (target != -1 && starpu_node_get_kind(target) == STARPU_DISK_RAM && _starpu_get_disk_flag(target) == STARPU_DISK_NO_RECLAIM)
 		target = -1;
 
 	return target;

+ 6 - 11
tests/disk/disk_compute.c

@@ -47,20 +47,17 @@ int main(int argc, char **argv)
 
 	char * base = "/tmp";
 
-	char * name_file_start = malloc(128*sizeof(char));
-	strcpy(name_file_start, "STARPU_DISK_COMPUTE_DATA_");
-	strcat(name_file_start, pid_str);
+	const char *name_file_start = "STARPU_DISK_COMPUTE_DATA_";
+	const char *name_file_end = "STARPU_DISK_COMPUTE_DATA_RESULT_";
 
-	char * name_file_end = malloc(128*sizeof(char));
-	strcpy(name_file_end, "STARPU_DISK_COMPUTE_DATA_RESULT_");
-	strcat(name_file_end, pid_str);
-
-	char * path_file_start = malloc(128*sizeof(char));
+	char * path_file_start = malloc(strlen(base) + 1 + strlen(name_file_start) + 1);
 	strcpy(path_file_start, base);
+	strcat(path_file_start, "/");
 	strcat(path_file_start, name_file_start);
 
-	char * path_file_end = malloc(128*sizeof(char));
+	char * path_file_end = malloc(strlen(base) + 1 + strlen(name_file_end) + 1);
 	strcpy(path_file_end, base);
+	strcat(path_file_end, "/");
 	strcat(path_file_end, name_file_end);
 
 
@@ -176,8 +173,6 @@ int main(int argc, char **argv)
 	unlink(path_file_start);
 	unlink(path_file_end);
 
-	free(name_file_start);
-	free(name_file_end);
 	free(path_file_start);
 	free(path_file_end);