Parcourir la source

Also remove the strassen2 example

Cédric Augonnet il y a 15 ans
Parent
commit
310422a8d1
2 fichiers modifiés avec 0 ajouts et 1122 suppressions
  1. 0 856
      examples/strassen2/strassen2.c
  2. 0 266
      examples/strassen2/strassen2_kernels.c

+ 0 - 856
examples/strassen2/strassen2.c

@@ -1,856 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <math.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <pthread.h>
-#include <signal.h>
-
-#include <starpu.h>
-
-#define MAXDEPS	4
-
-uint64_t current_tag = 1024;
-
-uint64_t used_mem = 0;
-uint64_t used_mem_predicted = 0;
-
-#define MAXREC	7
-
-/* the size consumed by the algorithm should be
- *	<= (size)^2 * ( predicted_mem[rec] + 1)
- * NB: we don't really need this, but this is useful to avoid allocating
- * thousands of pinned buffers and as many VMA that pressure Linux a lot */
-static unsigned predicted_mem[7] = {
-	12, 29, 58, 110, 201, 361, 640
-};
-
-static unsigned char *bigbuffer;
-
-/*
-
-Strassen:
-        M1 = (A11 + A22)(B11 + B22)
-        M2 = (A21 + A22)B11
-        M3 = A11(B12 - B22)
-        M4 = A22(B21 - B11)
-        M5 = (A11 + A12)B22
-        M6 = (A21 - A11)(B11 + B12)
-        M7 = (A12 - A22)(B21 + B22)
-
-        C11 = M1 + M4 - M5 + M7
-        C12 = M3 + M5
-        C21 = M2 + M4
-        C22 = M1 - M2 + M3 + M6
-
-	7 recursive calls to the Strassen algorithm (in each Mi computation)
-	10+7 temporary buffers (to compute the terms of Mi = Mia x Mib, and to store Mi)
-
-	complexity:
-		M(n) multiplication complexity
-		A(n) add/sub complexity
-
-		M(n) = (10 + 8) A(n/2) + 7 M(n/2)
-
-	NB: we consider fortran ordering (hence we compute M3t = (B12t - B22t)A11t for instance)
-
- */
-
-static unsigned size = 2048;
-static unsigned reclevel = 3;
-static unsigned norandom = 0;
-static unsigned pin = 0;
-
-extern void mult_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-extern void sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-extern void add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-extern void self_add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-extern void self_sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-
-#ifdef STARPU_USE_CUDA
-extern void mult_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-extern void sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-extern void add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-extern void self_add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-extern void self_sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-#endif
-
-extern void null_codelet(__attribute__((unused)) void *descr[],
-                  __attribute__((unused))  void *arg);
-
-
-extern void display_perf(double timing, unsigned size);
-
-struct starpu_perfmodel_t strassen_model_mult = {
-        .type = STARPU_HISTORY_BASED,
-        .symbol = "strassen_model_mult"
-};
-
-struct starpu_perfmodel_t strassen_model_add = {
-        .type = STARPU_HISTORY_BASED,
-        .symbol = "strassen_model_add"
-};
-
-struct starpu_perfmodel_t strassen_model_sub = {
-        .type = STARPU_HISTORY_BASED,
-        .symbol = "strassen_model_sub"
-};
-
-
-struct starpu_perfmodel_t strassen_model_self_add = {
-        .type = STARPU_HISTORY_BASED,
-        .symbol = "strassen_model_self_add"
-};
-
-struct starpu_perfmodel_t strassen_model_self_sub = {
-        .type = STARPU_HISTORY_BASED,
-        .symbol = "strassen_model_self_sub"
-};
-
-
-
-struct data_deps_t {
-	unsigned ndeps;
-	starpu_tag_t deps[MAXDEPS];
-};
-
-struct strassen_iter {
-	unsigned reclevel;
-	struct strassen_iter *children[7];
-
-	starpu_data_handle A, B, C;
-
-	/* temporary buffers */
-	/* Mi = Mia * Mib*/
-	starpu_data_handle Mia_data[7];
-	starpu_data_handle Mib_data[7];
-	starpu_data_handle Mi_data[7];
-
-	/* input deps */
-	struct data_deps_t A_deps;
-	struct data_deps_t B_deps;
-
-	/* output deps */
-	struct data_deps_t C_deps;
-};
-
-
-static starpu_filter f = 
-{
-	.filter_func = starpu_block_filter_func,
-	.filter_arg = 2
-};
-
-static starpu_filter f2 =
-{
-	.filter_func = starpu_vertical_block_filter_func,
-	.filter_arg = 2
-};
-
-static float *allocate_tmp_matrix_wrapper(size_t size)
-{
-	float *buffer;
-
-	buffer = (float *)&bigbuffer[used_mem];
-
-	/* XXX there could be some extra alignment constraints here */
-	used_mem += size;
-
-	if (used_mem > used_mem_predicted)
-		fprintf(stderr, "used %ld predict %ld\n", used_mem, used_mem_predicted);
-
-	assert(used_mem <= used_mem_predicted);
-
-	memset(buffer, 0, size);
-
-	return buffer;
-
-}
-
-static starpu_data_handle allocate_tmp_matrix(unsigned size, unsigned reclevel)
-{
-	starpu_data_handle *data = malloc(sizeof(starpu_data_handle));
-	float *buffer;
-
-	buffer = allocate_tmp_matrix_wrapper(size*size*sizeof(float));
-
-	starpu_matrix_data_register(data, 0, (uintptr_t)buffer, size, size, size, sizeof(float));
-
-	/* we construct a starpu_filter tree of depth reclevel */
-	unsigned rec;
-	for (rec = 0; rec < reclevel; rec++)
-		starpu_map_filters(*data, 2, &f, &f2);
-
-	return *data;
-}
-
-enum operation {
-	ADD,
-	SUB,
-	MULT
-};
-
-static starpu_codelet cl_add = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_add,
-	.cpu_func = add_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = add_cublas_codelet,
-#endif
-	.nbuffers = 3
-};
-
-static starpu_codelet cl_sub = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_sub,
-	.cpu_func = sub_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = sub_cublas_codelet,
-#endif
-	.nbuffers = 3
-};
-
-static starpu_codelet cl_mult = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_mult,
-	.cpu_func = mult_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = mult_cublas_codelet,
-#endif
-	.nbuffers = 3
-};
-
-/* C = A op B */
-struct starpu_task *compute_add_sub_op(starpu_data_handle C, enum operation op, starpu_data_handle A, starpu_data_handle B)
-{
-	struct starpu_task *task = starpu_task_create();
-
-	uint64_t j_tag = current_tag++;
-
-	task->buffers[0].handle = C;
-	task->buffers[0].mode = STARPU_W;
-	task->buffers[1].handle = A;
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = B;
-	task->buffers[2].mode = STARPU_R;
-
-	task->callback_func = NULL;
-
-	switch (op) {
-		case ADD:
-			task->cl = &cl_add;
-			break;
-		case SUB:
-			task->cl = &cl_sub;
-			break;
-		case MULT:
-			task->cl = &cl_mult;
-			break;
-		default:
-			assert(0);
-	};
-
-	task->use_tag = 1;
-	task->tag_id = (starpu_tag_t)j_tag;
-
-	return task;
-}
-
-static starpu_codelet cl_self_add = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_self_add,
-	.cpu_func = self_add_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = self_add_cublas_codelet,
-#endif
-	.nbuffers = 2
-};
-
-static starpu_codelet cl_self_sub = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_self_sub,
-	.cpu_func = self_sub_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = self_sub_cublas_codelet,
-#endif
-	.nbuffers = 2
-};
-
-/* C = C op A */
-struct starpu_task *compute_self_add_sub_op(starpu_data_handle C, enum operation op, starpu_data_handle A)
-{
-	struct starpu_task *task = starpu_task_create();
-	uint64_t j_tag = current_tag++;
-
-	task->buffers[0].handle = C;
-	task->buffers[0].mode = STARPU_RW;
-	task->buffers[1].handle = A;
-	task->buffers[1].mode = STARPU_R;
-
-	task->callback_func = NULL;
-
-	switch (op) {
-		case ADD:
-			task->cl = &cl_self_add;
-			break;
-		case SUB:
-			task->cl = &cl_self_sub;
-			break;
-		default:
-			assert(0);
-	};
-
-	task->use_tag = 1;
-	task->tag_id = (starpu_tag_t)j_tag;
-
-	return task;
-}
-
-struct cleanup_arg {
-	unsigned ndeps;
-	starpu_tag_t tags[8];
-	unsigned ndata;
-	starpu_data_handle data[32];
-};
-
-void cleanup_callback(void *_arg)
-{
-	//fprintf(stderr, "cleanup callback\n");
-
-	struct cleanup_arg *arg = _arg;
-
-	unsigned i;
-	for (i = 0; i < arg->ndata; i++)
-		starpu_data_advise_as_important(arg->data[i], 0);
-
-	free(arg);
-}
-
-static starpu_codelet cleanup_codelet = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = NULL,
-	.cpu_func = null_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = null_codelet,
-#endif
-	.nbuffers = 0
-};
-
-/* this creates a codelet that will tell StarPU that all specified data are not
-  essential once the tasks corresponding to the task will be performed */
-void create_cleanup_task(struct cleanup_arg *cleanup_arg)
-{
-	struct starpu_task *task = starpu_task_create();
-	uint64_t j_tag = current_tag++;
-
-	task->cl = &cleanup_codelet;
-
-	task->callback_func = cleanup_callback;
-	task->callback_arg = cleanup_arg;
-
-	task->use_tag = 1;
-	task->tag_id = j_tag;
-
-	starpu_tag_declare_deps_array(j_tag, cleanup_arg->ndeps, cleanup_arg->tags);
-
-	starpu_task_submit(task);
-}
-
-void strassen_mult(struct strassen_iter *iter)
-{
-	if (iter->reclevel == 0)
-	{
-		struct starpu_task *task_mult = 
-			compute_add_sub_op(iter->C, MULT, iter->A, iter->B);
-		starpu_tag_t tag_mult = task_mult->tag_id;
-
-		starpu_tag_t deps_array[10];
-		unsigned indexA, indexB;
-		for (indexA = 0; indexA < iter->A_deps.ndeps; indexA++)
-		{
-			deps_array[indexA] = iter->A_deps.deps[indexA];
-		}
-
-		for (indexB = 0; indexB < iter->B_deps.ndeps; indexB++)
-		{
-			deps_array[indexB+indexA] = iter->B_deps.deps[indexB];
-		}
-
-		starpu_tag_declare_deps_array(tag_mult, indexA+indexB, deps_array);
-
-		iter->C_deps.ndeps = 1;
-		iter->C_deps.deps[0] = tag_mult;
-
-		starpu_task_submit(task_mult);
-
-		return;
-	}
-
-        starpu_data_handle A11 = starpu_data_get_sub_data(iter->A, 2, 0, 0);
-        starpu_data_handle A12 = starpu_data_get_sub_data(iter->A, 2, 1, 0);
-        starpu_data_handle A21 = starpu_data_get_sub_data(iter->A, 2, 0, 1);
-        starpu_data_handle A22 = starpu_data_get_sub_data(iter->A, 2, 1, 1);
-
-        starpu_data_handle B11 = starpu_data_get_sub_data(iter->B, 2, 0, 0);
-        starpu_data_handle B12 = starpu_data_get_sub_data(iter->B, 2, 1, 0);
-        starpu_data_handle B21 = starpu_data_get_sub_data(iter->B, 2, 0, 1);
-        starpu_data_handle B22 = starpu_data_get_sub_data(iter->B, 2, 1, 1);
-
-        starpu_data_handle C11 = starpu_data_get_sub_data(iter->C, 2, 0, 0);
-        starpu_data_handle C12 = starpu_data_get_sub_data(iter->C, 2, 1, 0);
-        starpu_data_handle C21 = starpu_data_get_sub_data(iter->C, 2, 0, 1);
-        starpu_data_handle C22 = starpu_data_get_sub_data(iter->C, 2, 1, 1);
-
-	unsigned size = starpu_matrix_get_nx(A11);
-
-	/* M1a = (A11 + A22) */
-	iter->Mia_data[0] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_1a = compute_add_sub_op(iter->Mia_data[0], ADD, A11, A22);
-	starpu_tag_t tag_1a = task_1a->tag_id;
-	starpu_tag_declare_deps_array(tag_1a, iter->A_deps.ndeps, iter->A_deps.deps);
-	starpu_task_submit(task_1a);
-
-	/* M1b = (B11 + B22) */
-	iter->Mib_data[0] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_1b = compute_add_sub_op(iter->Mib_data[0], ADD, B11, B22);
-	starpu_tag_t tag_1b = task_1b->tag_id;
-	starpu_tag_declare_deps_array(tag_1b, iter->B_deps.ndeps, iter->B_deps.deps);
-	starpu_task_submit(task_1b);
-
-	/* M2a = (A21 + A22) */
-	iter->Mia_data[1] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_2a = compute_add_sub_op(iter->Mia_data[1], ADD, A21, A22);
-	starpu_tag_t tag_2a = task_2a->tag_id;
-	starpu_tag_declare_deps_array(tag_2a, iter->A_deps.ndeps, iter->A_deps.deps);
-	starpu_task_submit(task_2a);
-
-	/* M3b = (B12 - B22) */
-	iter->Mib_data[2] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_3b = compute_add_sub_op(iter->Mib_data[2], SUB, B12, B22);
-	starpu_tag_t tag_3b = task_3b->tag_id;
-	starpu_tag_declare_deps_array(tag_3b, iter->B_deps.ndeps, iter->B_deps.deps);
-	starpu_task_submit(task_3b);
-	
-	/* M4b = (B21 - B11) */
-	iter->Mib_data[3] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_4b = compute_add_sub_op(iter->Mib_data[3], SUB, B21, B11);
-	starpu_tag_t tag_4b = task_4b->tag_id;
-	starpu_tag_declare_deps_array(tag_4b, iter->B_deps.ndeps, iter->B_deps.deps);
-	starpu_task_submit(task_4b);
-	
-	/* M5a = (A11 + A12) */
-	iter->Mia_data[4] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_5a = compute_add_sub_op(iter->Mia_data[4], ADD, A11, A12);
-	starpu_tag_t tag_5a = task_5a->tag_id;
-	starpu_tag_declare_deps_array(tag_5a, iter->A_deps.ndeps, iter->A_deps.deps);
-	starpu_task_submit(task_5a);
-
-	/* M6a = (A21 - A11) */
-	iter->Mia_data[5] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_6a = compute_add_sub_op(iter->Mia_data[5], SUB, A21, A11);
-	starpu_tag_t tag_6a = task_6a->tag_id;
-	starpu_tag_declare_deps_array(tag_6a, iter->A_deps.ndeps, iter->A_deps.deps);
-	starpu_task_submit(task_6a);
-
-	/* M6b = (B11 + B12) */
-	iter->Mib_data[5] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_6b = compute_add_sub_op(iter->Mib_data[5], SUB, B11, B12);
-	starpu_tag_t tag_6b = task_6b->tag_id;
-	starpu_tag_declare_deps_array(tag_6b, iter->B_deps.ndeps, iter->B_deps.deps);
-	starpu_task_submit(task_6b);
-
-	/* M7a = (A12 - A22) */
-	iter->Mia_data[6] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_7a = compute_add_sub_op(iter->Mia_data[6], SUB, A12, A22);
-	starpu_tag_t tag_7a = task_7a->tag_id;
-	starpu_tag_declare_deps_array(tag_7a, iter->A_deps.ndeps, iter->A_deps.deps);
-	starpu_task_submit(task_7a);
-
-	/* M7b = (B21 + B22) */
-	iter->Mib_data[6] = allocate_tmp_matrix(size, iter->reclevel);
-	struct starpu_task *task_7b = compute_add_sub_op(iter->Mib_data[6], ADD, B21, B22);
-	starpu_tag_t tag_7b = task_7b->tag_id;
-	starpu_tag_declare_deps_array(tag_7b, iter->B_deps.ndeps, iter->B_deps.deps);
-	starpu_task_submit(task_7b);
-
-	iter->Mi_data[0] = allocate_tmp_matrix(size, iter->reclevel);
-	iter->Mi_data[1] = allocate_tmp_matrix(size, iter->reclevel);
-	iter->Mi_data[2] = allocate_tmp_matrix(size, iter->reclevel);
-	iter->Mi_data[3] = allocate_tmp_matrix(size, iter->reclevel);
-	iter->Mi_data[4] = allocate_tmp_matrix(size, iter->reclevel);
-	iter->Mi_data[5] = allocate_tmp_matrix(size, iter->reclevel);
-	iter->Mi_data[6] = allocate_tmp_matrix(size, iter->reclevel);
-
-	/* M1 = M1a * M1b */
-	iter->children[0] = malloc(sizeof(struct strassen_iter));
-	iter->children[0]->reclevel = iter->reclevel - 1;
-	iter->children[0]->A_deps.ndeps = 1;
-	iter->children[0]->A_deps.deps[0] = tag_1a;
-	iter->children[0]->B_deps.ndeps = 1;
-	iter->children[0]->B_deps.deps[0] = tag_1b;
-	iter->children[0]->A = iter->Mia_data[0]; 
-	iter->children[0]->B = iter->Mib_data[0]; 
-	iter->children[0]->C = iter->Mi_data[0];
-	strassen_mult(iter->children[0]);
-
-	/* M2 = M2a * B11 */
-	iter->children[1] = malloc(sizeof(struct strassen_iter));
-	iter->children[1]->reclevel = iter->reclevel - 1;
-	iter->children[1]->A_deps.ndeps = 1;
-	iter->children[1]->A_deps.deps[0] = tag_2a;
-	iter->children[1]->B_deps.ndeps = iter->B_deps.ndeps;
-	memcpy(iter->children[1]->B_deps.deps, iter->B_deps.deps, iter->B_deps.ndeps*sizeof(starpu_tag_t));
-	iter->children[1]->A = iter->Mia_data[1]; 
-	iter->children[1]->B = B11; 
-	iter->children[1]->C = iter->Mi_data[1];
-	strassen_mult(iter->children[1]);
-
-	/* M3 = A11 * M3b */
-	iter->children[2] = malloc(sizeof(struct strassen_iter));
-	iter->children[2]->reclevel = iter->reclevel - 1;
-	iter->children[2]->A_deps.ndeps = iter->B_deps.ndeps;
-	memcpy(iter->children[2]->A_deps.deps, iter->A_deps.deps, iter->A_deps.ndeps*sizeof(starpu_tag_t));
-	iter->children[2]->B_deps.ndeps = 1;
-	iter->children[2]->B_deps.deps[0] = tag_3b;
-	iter->children[2]->A = A11; 
-	iter->children[2]->B = iter->Mib_data[2]; 
-	iter->children[2]->C = iter->Mi_data[2];
-	strassen_mult(iter->children[2]);
-
-	/* M4 = A22 * M4b */
-	iter->children[3] = malloc(sizeof(struct strassen_iter));
-	iter->children[3]->reclevel = iter->reclevel - 1;
-	iter->children[3]->A_deps.ndeps = iter->B_deps.ndeps;
-	memcpy(iter->children[3]->A_deps.deps, iter->A_deps.deps, iter->A_deps.ndeps*sizeof(starpu_tag_t));
-	iter->children[3]->B_deps.ndeps = 1;
-	iter->children[3]->B_deps.deps[0] = tag_4b;
-	iter->children[3]->A = A22; 
-	iter->children[3]->B = iter->Mib_data[3]; 
-	iter->children[3]->C = iter->Mi_data[3];
-	strassen_mult(iter->children[3]);
-
-	/* M5 = M5a * B22 */
-	iter->children[4] = malloc(sizeof(struct strassen_iter));
-	iter->children[4]->reclevel = iter->reclevel - 1;
-	iter->children[4]->A_deps.ndeps = 1;
-	iter->children[4]->A_deps.deps[0] = tag_5a;
-	iter->children[4]->B_deps.ndeps = iter->B_deps.ndeps;
-	memcpy(iter->children[4]->B_deps.deps, iter->B_deps.deps, iter->B_deps.ndeps*sizeof(starpu_tag_t));
-	iter->children[4]->A = iter->Mia_data[4]; 
-	iter->children[4]->B = B22; 
-	iter->children[4]->C = iter->Mi_data[4];
-	strassen_mult(iter->children[4]);
-
-	/* M6 = M6a * M6b */
-	iter->children[5] = malloc(sizeof(struct strassen_iter));
-	iter->children[5]->reclevel = iter->reclevel - 1;
-	iter->children[5]->A_deps.ndeps = 1;
-	iter->children[5]->A_deps.deps[0] = tag_6a;
-	iter->children[5]->B_deps.ndeps = 1;
-	iter->children[5]->B_deps.deps[0] = tag_6b;
-	iter->children[5]->A = iter->Mia_data[5]; 
-	iter->children[5]->B = iter->Mib_data[5]; 
-	iter->children[5]->C = iter->Mi_data[5];
-	strassen_mult(iter->children[5]);
-
-	/* M7 = M7a * M7b */
-	iter->children[6] = malloc(sizeof(struct strassen_iter));
-	iter->children[6]->reclevel = iter->reclevel - 1;
-	iter->children[6]->A_deps.ndeps = 1;
-	iter->children[6]->A_deps.deps[0] = tag_7a;
-	iter->children[6]->B_deps.ndeps = 1;
-	iter->children[6]->B_deps.deps[0] = tag_7b;
-	iter->children[6]->A = iter->Mia_data[6]; 
-	iter->children[6]->B = iter->Mib_data[6]; 
-	iter->children[6]->C = iter->Mi_data[6];
-	strassen_mult(iter->children[6]);
-
-	starpu_tag_t *tag_m1 = iter->children[0]->C_deps.deps;
-	starpu_tag_t *tag_m2 = iter->children[1]->C_deps.deps;
-	starpu_tag_t *tag_m3 = iter->children[2]->C_deps.deps;
-	starpu_tag_t *tag_m4 = iter->children[3]->C_deps.deps;
-	starpu_tag_t *tag_m5 = iter->children[4]->C_deps.deps;
-	starpu_tag_t *tag_m6 = iter->children[5]->C_deps.deps;
-	starpu_tag_t *tag_m7 = iter->children[6]->C_deps.deps;
-
-	/* C11 = M1 + M4 - M5 + M7 */
-	struct starpu_task *task_c11_a = compute_self_add_sub_op(C11, ADD, iter->Mi_data[0]);
-	struct starpu_task *task_c11_b = compute_self_add_sub_op(C11, ADD, iter->Mi_data[3]);
-	struct starpu_task *task_c11_c = compute_self_add_sub_op(C11, SUB, iter->Mi_data[4]);
-	struct starpu_task *task_c11_d = compute_self_add_sub_op(C11, ADD, iter->Mi_data[6]);
-
-	starpu_tag_t tag_c11_a = task_c11_a->tag_id;
-	starpu_tag_t tag_c11_b = task_c11_b->tag_id;
-	starpu_tag_t tag_c11_c = task_c11_c->tag_id;
-	starpu_tag_t tag_c11_d = task_c11_d->tag_id;
-
-	/* C12 = M3 + M5 */
-	struct starpu_task *task_c12_a = compute_self_add_sub_op(C12, ADD, iter->Mi_data[2]);
-	struct starpu_task *task_c12_b = compute_self_add_sub_op(C12, ADD, iter->Mi_data[4]);
-
-	starpu_tag_t tag_c12_a = task_c12_a->tag_id;
-	starpu_tag_t tag_c12_b = task_c12_b->tag_id;
-
-	/* C21 = M2 + M4 */
-	struct starpu_task *task_c21_a = compute_self_add_sub_op(C21, ADD, iter->Mi_data[1]);
-	struct starpu_task *task_c21_b = compute_self_add_sub_op(C21, ADD, iter->Mi_data[3]);
-
-	starpu_tag_t tag_c21_a = task_c21_a->tag_id;
-	starpu_tag_t tag_c21_b = task_c21_b->tag_id;
-
-	/* C22 = M1 - M2 + M3 + M6 */
-	struct starpu_task *task_c22_a = compute_self_add_sub_op(C22, ADD, iter->Mi_data[0]);
-	struct starpu_task *task_c22_b = compute_self_add_sub_op(C22, SUB, iter->Mi_data[1]);
-	struct starpu_task *task_c22_c = compute_self_add_sub_op(C22, ADD, iter->Mi_data[3]);
-	struct starpu_task *task_c22_d = compute_self_add_sub_op(C22, ADD, iter->Mi_data[5]);
-
-	starpu_tag_t tag_c22_a = task_c22_a->tag_id;
-	starpu_tag_t tag_c22_b = task_c22_b->tag_id;
-	starpu_tag_t tag_c22_c = task_c22_c->tag_id;
-	starpu_tag_t tag_c22_d = task_c22_d->tag_id;
-
-	if (iter->reclevel == 1)
-	{
-		starpu_tag_declare_deps(tag_c11_a, 1, tag_m1[0]);
-		starpu_tag_declare_deps(tag_c11_b, 2, tag_m4[0], tag_c11_a);
-		starpu_tag_declare_deps(tag_c11_c, 2, tag_m5[0], tag_c11_b);
-		starpu_tag_declare_deps(tag_c11_d, 2, tag_m7[0], tag_c11_c);
-	
-		starpu_tag_declare_deps(tag_c12_a, 1, tag_m3[0]);
-		starpu_tag_declare_deps(tag_c12_b, 2, tag_m5[0], tag_c12_a);
-
-		starpu_tag_declare_deps(tag_c21_a, 1, tag_m2[0]);
-		starpu_tag_declare_deps(tag_c21_b, 2, tag_m4[0], tag_c21_a);
-	
-		starpu_tag_declare_deps(tag_c22_a, 1, tag_m1[0]);
-		starpu_tag_declare_deps(tag_c22_b, 2, tag_m2[0], tag_c22_a);
-		starpu_tag_declare_deps(tag_c22_c, 2, tag_m3[0], tag_c22_b);
-		starpu_tag_declare_deps(tag_c22_d, 2, tag_m6[0], tag_c22_c);
-	}
-	else
-	{
-		starpu_tag_declare_deps(tag_c11_a, 4, tag_m1[0], tag_m1[1], tag_m1[2], tag_m1[3]);
-		starpu_tag_declare_deps(tag_c11_b, 5, tag_m4[0], tag_m4[1], tag_m4[2], tag_m4[3], tag_c11_a);
-		starpu_tag_declare_deps(tag_c11_c, 5, tag_m5[0], tag_m5[1], tag_m5[2], tag_m5[3], tag_c11_b);
-		starpu_tag_declare_deps(tag_c11_d, 5, tag_m7[0], tag_m7[1], tag_m7[2], tag_m7[3], tag_c11_c);
-
-		starpu_tag_declare_deps(tag_c12_a, 4, tag_m3[0], tag_m3[1], tag_m3[2], tag_m3[3]);
-		starpu_tag_declare_deps(tag_c12_b, 5, tag_m5[0], tag_m5[1], tag_m5[2], tag_m5[3], tag_c12_a);
-
-		starpu_tag_declare_deps(tag_c21_a, 4, tag_m2[0], tag_m2[1], tag_m2[2], tag_m2[3]);
-		starpu_tag_declare_deps(tag_c21_b, 5, tag_m4[0], tag_m4[1], tag_m4[2], tag_m4[3], tag_c21_a);
-
-		starpu_tag_declare_deps(tag_c22_a, 4, tag_m1[0], tag_m1[1], tag_m1[2], tag_m1[3]);
-		starpu_tag_declare_deps(tag_c22_b, 5, tag_m2[0], tag_m2[1], tag_m2[2], tag_m2[3], tag_c22_a);
-		starpu_tag_declare_deps(tag_c22_c, 5, tag_m3[0], tag_m3[1], tag_m3[2], tag_m3[3], tag_c22_b);
-		starpu_tag_declare_deps(tag_c22_d, 5, tag_m6[0], tag_m6[1], tag_m6[2], tag_m6[3], tag_c22_c);
-	}
-
-	starpu_task_submit(task_c11_a);
-	starpu_task_submit(task_c11_b);
-	starpu_task_submit(task_c11_c);
-	starpu_task_submit(task_c11_d);
-
-	starpu_task_submit(task_c12_a);
-	starpu_task_submit(task_c12_b);
-
-	starpu_task_submit(task_c21_a);
-	starpu_task_submit(task_c21_b);
-
-	starpu_task_submit(task_c22_a);
-	starpu_task_submit(task_c22_b);
-	starpu_task_submit(task_c22_c);
-	starpu_task_submit(task_c22_d);
-
-	iter->C_deps.ndeps = 4;
-	iter->C_deps.deps[0] = tag_c11_d;
-	iter->C_deps.deps[1] = tag_c12_b;
-	iter->C_deps.deps[2] = tag_c21_b;
-	iter->C_deps.deps[3] = tag_c22_d;
-
-	struct cleanup_arg *clean_struct = malloc(sizeof(struct cleanup_arg));
-
-	clean_struct->ndeps = 4;
-		clean_struct->tags[0] = tag_c11_d;
-		clean_struct->tags[1] = tag_c12_b;
-		clean_struct->tags[2] = tag_c21_b;
-		clean_struct->tags[3] = tag_c22_d;
-	clean_struct->ndata = 17;
-		clean_struct->data[0] = iter->Mia_data[0];
-		clean_struct->data[1] = iter->Mib_data[0];
-		clean_struct->data[2] = iter->Mia_data[1];
-		clean_struct->data[3] = iter->Mib_data[2];
-		clean_struct->data[4] = iter->Mib_data[3];
-		clean_struct->data[5] = iter->Mia_data[4];
-		clean_struct->data[6] = iter->Mia_data[5];
-		clean_struct->data[7] = iter->Mib_data[5];
-		clean_struct->data[8] = iter->Mia_data[6];
-		clean_struct->data[9] = iter->Mib_data[6];
-		clean_struct->data[10] = iter->Mi_data[0];
-		clean_struct->data[11] = iter->Mi_data[1];
-		clean_struct->data[12] = iter->Mi_data[2];
-		clean_struct->data[13] = iter->Mi_data[3];
-		clean_struct->data[14] = iter->Mi_data[4];
-		clean_struct->data[15] = iter->Mi_data[5];
-		clean_struct->data[16] = iter->Mi_data[6];
-		
-	create_cleanup_task(clean_struct);
-}
-
-static void dummy_codelet_func(__attribute__((unused))void *descr[],
-				__attribute__((unused))  void *arg)
-{
-}
-
-static starpu_codelet dummy_codelet = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = NULL,
-	.cpu_func = dummy_codelet_func,
-	#ifdef STARPU_USE_CUDA
-	.cuda_func = dummy_codelet_func,
-	#endif
-	.nbuffers = 0
-};
-
-static struct starpu_task *dummy_task(starpu_tag_t tag)
-{
-	struct starpu_task *task =starpu_task_create();
-		task->callback_func = NULL;
-                task->cl = &dummy_codelet;
-                task->cl_arg = NULL;
-
-	task->use_tag = 1;
-	task->tag_id = tag;
-
-	return task;
-}
-
-void parse_args(int argc, char **argv)
-{
-        int i;
-        for (i = 1; i < argc; i++) {
-                if (strcmp(argv[i], "-size") == 0) {
-                        char *argptr;
-                        size = strtol(argv[++i], &argptr, 10);
-                }
-
-                if (strcmp(argv[i], "-rec") == 0) {
-                        char *argptr;
-                        reclevel = strtol(argv[++i], &argptr, 10);
-                }
-
-                if (strcmp(argv[i], "-no-random") == 0) {
-                        norandom = 1;
-                }
-
-                if (strcmp(argv[i], "-pin") == 0) {
-                        pin = 1;
-                }
-
-        }
-}
-
-int main(int argc, char **argv)
-{
-	starpu_data_handle data_A, data_B, data_C;
-	float *A, *B, *C;
-
-	struct timeval start;
-	struct timeval end;
-
-	parse_args(argc, argv);
-
-	assert(reclevel <= MAXREC);
-
-	/* this is an upper bound ! */
-	used_mem_predicted = size*size*(predicted_mem[reclevel] + 1);
-
-	fprintf(stderr, "(Predicted) Memory consumption: %ld MB\n", used_mem_predicted/(1024*1024));
-
-	starpu_init(NULL);
-
-	starpu_helper_cublas_init();
-
-#ifdef STARPU_USE_CUDA
-        if (pin) {
-                starpu_data_malloc_pinned_if_possible((void **)&bigbuffer, used_mem_predicted);
-        } else
-#endif
-        {
-#ifdef STARPU_HAVE_POSIX_MEMALIGN
-                posix_memalign((void **)&bigbuffer, 4096, used_mem_predicted);
-#else
-		bigbuffer = malloc(used_mem_predicted);
-#endif
-	}
-
-	A = allocate_tmp_matrix_wrapper(size*size*sizeof(float));
-	B = allocate_tmp_matrix_wrapper(size*size*sizeof(float));
-	C = allocate_tmp_matrix_wrapper(size*size*sizeof(float));
-
-	starpu_matrix_data_register(&data_A, 0, (uintptr_t)A, size, size, size, sizeof(float));
-	starpu_matrix_data_register(&data_B, 0, (uintptr_t)B, size, size, size, sizeof(float));
-	starpu_matrix_data_register(&data_C, 0, (uintptr_t)C, size, size, size, sizeof(float));
-
-	unsigned rec;
-	for (rec = 0; rec < reclevel; rec++)
-	{
-		starpu_map_filters(data_A, 2, &f, &f2);
-		starpu_map_filters(data_B, 2, &f, &f2);
-		starpu_map_filters(data_C, 2, &f, &f2);
-	}
-
-	struct strassen_iter iter;
-		iter.reclevel = reclevel;
-		iter.A = data_A;
-		iter.B = data_B;
-		iter.C = data_C;
-		iter.A_deps.ndeps = 1;
-		iter.A_deps.deps[0] = 42;
-		iter.B_deps.ndeps = 1;
-		iter.B_deps.deps[0] = 42;
-
-	strassen_mult(&iter);
-
-	starpu_tag_declare_deps_array(10, iter.C_deps.ndeps, iter.C_deps.deps);
-
-	fprintf(stderr, "Using %ld MB of memory\n", used_mem/(1024*1024));
-
-	struct starpu_task *task_start = dummy_task(42);
-
-	gettimeofday(&start, NULL);
-	starpu_task_submit(task_start);
-
-	struct starpu_task *task_end = dummy_task(10);
-	
-	task_end->synchronous = 1;
-	starpu_task_submit(task_end);
-
-	gettimeofday(&end, NULL);
-
-	starpu_helper_cublas_shutdown();
-
-	starpu_shutdown();
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-
-	display_perf(timing, size);
-
-	return 0;
-}

+ 0 - 266
examples/strassen2/strassen2_kernels.c

@@ -1,266 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-#include <math.h>
-#include <sys/types.h>
-#include <pthread.h>
-#include <signal.h>
-#include <semaphore.h>
-
-#include <starpu_config.h>
-#ifdef STARPU_USE_CUDA
-#include <cublas.h>
-#endif
-
-#include "../common/blas.h"
-
-#include <starpu.h>
-
-static double cublas_flop = 0.0;
-static double cpus_flop = 0.0;
-
-void display_perf(double timing, unsigned size)
-{
-	double total_flop_n3 = (2.0*size*size*size);
-	double total_flop = cublas_flop + cpus_flop;
-
-	fprintf(stderr, "Computation took (ms):\n");
-	printf("%2.2f\n", timing/1000);
-	fprintf(stderr, "       GFlop : O(n3) -> %2.2f\n",
-			(double)total_flop_n3/1000000000.0f);
-	fprintf(stderr, "       GFlop : real %2.2f\n",
-			(double)total_flop/1000000000.0f);
-	fprintf(stderr, "	CPU : %2.2f (%2.2f%%)\n", (double)cpus_flop/1000000000.0, (100.0*cpus_flop)/(cpus_flop + cublas_flop));
-	fprintf(stderr, "	GPU : %2.2f (%2.2f%%)\n", (double)cublas_flop/1000000000.0, (100.0*cublas_flop)/(cpus_flop + cublas_flop));
-	fprintf(stderr, "       GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
-}
-
-static void mult_common_codelet(void *descr[], int s, __attribute__((unused))  void *arg)
-{
-	float *center 	= (float *)STARPU_GET_MATRIX_PTR(descr[0]);
-	float *left 	= (float *)STARPU_GET_MATRIX_PTR(descr[1]);
-	float *right 	= (float *)STARPU_GET_MATRIX_PTR(descr[2]);
-
-	unsigned n = STARPU_GET_MATRIX_NX(descr[0]);
-
-	unsigned ld21 = STARPU_GET_MATRIX_LD(descr[1]);
-	unsigned ld12 = STARPU_GET_MATRIX_LD(descr[2]);
-	unsigned ld22 = STARPU_GET_MATRIX_LD(descr[0]);
-
-	double flop = 2.0*n*n*n;
-
-#ifdef STARPU_USE_CUDA
-	cublasStatus cublasres;
-#endif
-
-	switch (s) {
-		case 0:
-			cpus_flop += flop;
-			SGEMM("N", "N", n, n, n, 1.0f, right, ld21, left, ld12, 0.0f, center, ld22);
-			break;
-#ifdef STARPU_USE_CUDA
-		case 1:
-			cublas_flop += flop;
-
-			cublasSgemm('n', 'n', n, n, n, 1.0f, right, ld12, left, ld21, 0.0f, center, ld22);
-			cublasres = cublasGetError();
-			if (STARPU_UNLIKELY(cublasres))
-				STARPU_CUBLAS_REPORT_ERROR(cublasres);
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-			break;
-	}
-}
-
-void mult_cpu_codelet(void *descr[], void *_args)
-{
-	mult_common_codelet(descr, 0, _args);
-}
-
-#ifdef STARPU_USE_CUDA
-void mult_cublas_codelet(void *descr[], void *_args)
-{
-	mult_common_codelet(descr, 1, _args);
-}
-#endif
-
-static void add_sub_common_codelet(void *descr[], int s, __attribute__((unused))  void *arg, float alpha)
-{
-	/* C = A op B */
-
-	float *C 	= (float *)STARPU_GET_MATRIX_PTR(descr[0]);
-	float *A 	= (float *)STARPU_GET_MATRIX_PTR(descr[1]);
-	float *B 	= (float *)STARPU_GET_MATRIX_PTR(descr[2]);
-
-	unsigned n = STARPU_GET_MATRIX_NX(descr[0]);
-
-	unsigned ldA = STARPU_GET_MATRIX_LD(descr[1]);
-	unsigned ldB = STARPU_GET_MATRIX_LD(descr[2]);
-	unsigned ldC = STARPU_GET_MATRIX_LD(descr[0]);
-
-	double flop = 2.0*n*n;
-
-	// TODO check dim ...
-
-	unsigned line;
-#ifdef STARPU_USE_CUDA
-	cublasStatus cublasres;
-#endif
-
-	switch (s) {
-		case 0:
-			cpus_flop += flop;
-			for (line = 0; line < n; line++)
-			{
-				/* copy line A into C */
-				SAXPY(n, 1.0f, &A[line*ldA], 1, &C[line*ldC], 1);
-				/* add line B to C = A */
-				SAXPY(n, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
-			}
-			break;
-#ifdef STARPU_USE_CUDA
-		case 1:
-			cublas_flop += flop;
-			for (line = 0; line < n; line++)
-			{
-				/* copy line A into C */
-				cublasSaxpy(n, 1.0f, &A[line*ldA], 1, &C[line*ldC], 1);
-				cublasres = cublasGetError();
-				if (STARPU_UNLIKELY(cublasres))
-					STARPU_CUBLAS_REPORT_ERROR(cublasres);
-				/* add line B to C = A */
-				cublasSaxpy(n, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
-				cublasres = cublasGetError();
-				if (STARPU_UNLIKELY(cublasres))
-					STARPU_CUBLAS_REPORT_ERROR(cublasres);
-			}
-
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-			break;
-	}
-}
-
-void sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	add_sub_common_codelet(descr, 0, arg, -1.0f);
-}
-
-void add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	add_sub_common_codelet(descr, 0, arg, 1.0f);
-}
-
-#ifdef STARPU_USE_CUDA
-void sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	add_sub_common_codelet(descr, 1, arg, -1.0f);
-}
-
-void add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	add_sub_common_codelet(descr, 1, arg, 1.0f);
-}
-#endif
-
-
-static void self_add_sub_common_codelet(void *descr[], int s, __attribute__((unused))  void *arg, float alpha)
-{
-	/* C +=/-= A */
-
-	float *C 	= (float *)STARPU_GET_MATRIX_PTR(descr[0]);
-	float *A 	= (float *)STARPU_GET_MATRIX_PTR(descr[1]);
-
-	unsigned n = STARPU_GET_MATRIX_NX(descr[0]);
-
-	unsigned ldA = STARPU_GET_MATRIX_LD(descr[1]);
-	unsigned ldC = STARPU_GET_MATRIX_LD(descr[0]);
-
-	double flop = 1.0*n*n;
-
-	// TODO check dim ...
-	
-	unsigned line;
-
-#ifdef STARPU_USE_CUDA
-	cublasStatus cublasres;
-#endif
-
-	switch (s) {
-		case 0:
-			cpus_flop += flop;
-			for (line = 0; line < n; line++)
-			{
-				/* add line A to C */
-				SAXPY(n, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
-			}
-			break;
-#ifdef STARPU_USE_CUDA
-		case 1:
-			cublas_flop += flop;
-			for (line = 0; line < n; line++)
-			{
-				/* add line A to C */
-				cublasSaxpy(n, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
-				cublasres = cublasGetError();
-				if (STARPU_UNLIKELY(cublasres))
-					STARPU_CUBLAS_REPORT_ERROR(cublasres);
-			}
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-			break;
-	}
-}
-
-
-
-
-void self_add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	self_add_sub_common_codelet(descr, 0, arg, 1.0f);
-}
-
-void self_sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	self_add_sub_common_codelet(descr, 0, arg, -1.0f);
-}
-
-#ifdef STARPU_USE_CUDA
-void self_add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	self_add_sub_common_codelet(descr, 1, arg, 1.0f);
-}
-
-void self_sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	self_add_sub_common_codelet(descr, 1, arg, -1.0f);
-}
-#endif
-
-/* this codelet does nothing  */
-void null_codelet(__attribute__((unused)) void *descr[],
-		  __attribute__((unused))  void *arg)
-{
-}