| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857 | /* * StarPU * Copyright (C) INRIA 2008-2009 (see AUTHORS file) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */#include <stdio.h>#include <stdint.h>#include <math.h>#include <sys/types.h>#include <sys/time.h>#include <pthread.h>#include <signal.h>#include <starpu.h>#define MAXDEPS	4uint64_t current_tag = 1024;uint64_t used_mem = 0;uint64_t used_mem_predicted = 0;#define MAXREC	7/* the size consumed by the algorithm should be *	<= (size)^2 * ( predicted_mem[rec] + 1) * NB: we don't really need this, but this is useful to avoid allocating * thousands of pinned buffers and as many VMA that pressure Linux a lot */static unsigned predicted_mem[7] = {	12, 29, 58, 110, 201, 361, 640};static unsigned char *bigbuffer;/*Strassen:        M1 = (A11 + A22)(B11 + B22)        M2 = (A21 + A22)B11        M3 = A11(B12 - B22)        M4 = A22(B21 - B11)        M5 = (A11 + A12)B22        M6 = (A21 - A11)(B11 + B12)        M7 = (A12 - A22)(B21 + B22)        C11 = M1 + M4 - M5 + M7        C12 = M3 + M5        C21 = M2 + M4        C22 = M1 - M2 + M3 + M6	7 recursive calls to the Strassen algorithm (in each Mi computation)	10+7 temporary buffers (to compute the terms of Mi = Mia x Mib, and to store Mi)	complexity:		M(n) multiplication complexity		A(n) add/sub complexity		M(n) = (10 + 8) A(n/2) + 7 M(n/2)	NB: we consider fortran ordering (hence we compute M3t = (B12t - B22t)A11t for instance) */static unsigned size = 2048;static unsigned reclevel = 3;static unsigned norandom = 0;static unsigned pin = 0;extern void mult_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);extern void sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);extern void add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);extern void self_add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);extern void self_sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);#ifdef STARPU_USE_CUDAextern void mult_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);extern void sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);extern void add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);extern void self_add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);extern void self_sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);#endifextern void null_codelet(__attribute__((unused)) void *descr[],                  __attribute__((unused))  void *arg);extern void display_perf(double timing, unsigned size);struct starpu_perfmodel_t strassen_model_mult = {        .type = STARPU_HISTORY_BASED,        .symbol = "strassen_model_mult"};struct starpu_perfmodel_t strassen_model_add = {        .type = STARPU_HISTORY_BASED,        .symbol = "strassen_model_add"};struct starpu_perfmodel_t strassen_model_sub = {        .type = STARPU_HISTORY_BASED,        .symbol = "strassen_model_sub"};struct starpu_perfmodel_t strassen_model_self_add = {        .type = STARPU_HISTORY_BASED,        .symbol = "strassen_model_self_add"};struct starpu_perfmodel_t strassen_model_self_sub = {        .type = STARPU_HISTORY_BASED,        .symbol = "strassen_model_self_sub"};struct data_deps_t {	unsigned ndeps;	starpu_tag_t deps[MAXDEPS];};struct strassen_iter {	unsigned reclevel;	struct strassen_iter *children[7];	starpu_data_handle A, B, C;	/* temporary buffers */	/* Mi = Mia * Mib*/	starpu_data_handle Mia_data[7];	starpu_data_handle Mib_data[7];	starpu_data_handle Mi_data[7];	/* input deps */	struct data_deps_t A_deps;	struct data_deps_t B_deps;	/* output deps */	struct data_deps_t C_deps;};static starpu_filter f = {	.filter_func = starpu_block_filter_func,	.filter_arg = 2};static starpu_filter f2 ={	.filter_func = starpu_vertical_block_filter_func,	.filter_arg = 2};static float *allocate_tmp_matrix_wrapper(size_t size){	float *buffer;	buffer = (float *)&bigbuffer[used_mem];	/* XXX there could be some extra alignment constraints here */	used_mem += size;	if (used_mem > used_mem_predicted)		fprintf(stderr, "used %ld predict %ld\n", used_mem, used_mem_predicted);	assert(used_mem <= used_mem_predicted);	memset(buffer, 0, size);	return buffer;}static starpu_data_handle allocate_tmp_matrix(unsigned size, unsigned reclevel){	starpu_data_handle *data = malloc(sizeof(starpu_data_handle));	float *buffer;	buffer = allocate_tmp_matrix_wrapper(size*size*sizeof(float));	starpu_matrix_data_register(data, 0, (uintptr_t)buffer, size, size, size, sizeof(float));	/* we construct a starpu_filter tree of depth reclevel */	unsigned rec;	for (rec = 0; rec < reclevel; rec++)		starpu_map_filters(*data, 2, &f, &f2);	return *data;}enum operation {	ADD,	SUB,	MULT};static starpu_codelet cl_add = {	.where = STARPU_CPU|STARPU_CUDA,	.model = &strassen_model_add,	.cpu_func = add_cpu_codelet,#ifdef STARPU_USE_CUDA	.cuda_func = add_cublas_codelet,#endif	.nbuffers = 3};static starpu_codelet cl_sub = {	.where = STARPU_CPU|STARPU_CUDA,	.model = &strassen_model_sub,	.cpu_func = sub_cpu_codelet,#ifdef STARPU_USE_CUDA	.cuda_func = sub_cublas_codelet,#endif	.nbuffers = 3};static starpu_codelet cl_mult = {	.where = STARPU_CPU|STARPU_CUDA,	.model = &strassen_model_mult,	.cpu_func = mult_cpu_codelet,#ifdef STARPU_USE_CUDA	.cuda_func = mult_cublas_codelet,#endif	.nbuffers = 3};/* C = A op B */struct starpu_task *compute_add_sub_op(starpu_data_handle C, enum operation op, starpu_data_handle A, starpu_data_handle B){	struct starpu_task *task = starpu_task_create();	uint64_t j_tag = current_tag++;	task->buffers[0].handle = C;	task->buffers[0].mode = STARPU_W;	task->buffers[1].handle = A;	task->buffers[1].mode = STARPU_R;	task->buffers[2].handle = B;	task->buffers[2].mode = STARPU_R;	task->callback_func = NULL;	switch (op) {		case ADD:			task->cl = &cl_add;			break;		case SUB:			task->cl = &cl_sub;			break;		case MULT:			task->cl = &cl_mult;			break;		default:			assert(0);	};	task->use_tag = 1;	task->tag_id = (starpu_tag_t)j_tag;	return task;}static starpu_codelet cl_self_add = {	.where = STARPU_CPU|STARPU_CUDA,	.model = &strassen_model_self_add,	.cpu_func = self_add_cpu_codelet,#ifdef STARPU_USE_CUDA	.cuda_func = self_add_cublas_codelet,#endif	.nbuffers = 2};static starpu_codelet cl_self_sub = {	.where = STARPU_CPU|STARPU_CUDA,	.model = &strassen_model_self_sub,	.cpu_func = self_sub_cpu_codelet,#ifdef STARPU_USE_CUDA	.cuda_func = self_sub_cublas_codelet,#endif	.nbuffers = 2};/* C = C op A */struct starpu_task *compute_self_add_sub_op(starpu_data_handle C, enum operation op, starpu_data_handle A){	struct starpu_task *task = starpu_task_create();	uint64_t j_tag = current_tag++;	task->buffers[0].handle = C;	task->buffers[0].mode = STARPU_RW;	task->buffers[1].handle = A;	task->buffers[1].mode = STARPU_R;	task->callback_func = NULL;	switch (op) {		case ADD:			task->cl = &cl_self_add;			break;		case SUB:			task->cl = &cl_self_sub;			break;		default:			assert(0);	};	task->use_tag = 1;	task->tag_id = (starpu_tag_t)j_tag;	return task;}struct cleanup_arg {	unsigned ndeps;	starpu_tag_t tags[8];	unsigned ndata;	starpu_data_handle data[32];};void cleanup_callback(void *_arg){	//fprintf(stderr, "cleanup callback\n");	struct cleanup_arg *arg = _arg;	unsigned i;	for (i = 0; i < arg->ndata; i++)		starpu_data_advise_as_important(arg->data[i], 0);	free(arg);}static starpu_codelet cleanup_codelet = {	.where = STARPU_CPU|STARPU_CUDA,	.model = NULL,	.cpu_func = null_codelet,#ifdef STARPU_USE_CUDA	.cuda_func = null_codelet,#endif	.nbuffers = 0};/* this creates a codelet that will tell StarPU that all specified data are not  essential once the tasks corresponding to the task will be performed */void create_cleanup_task(struct cleanup_arg *cleanup_arg){	struct starpu_task *task = starpu_task_create();	uint64_t j_tag = current_tag++;	task->cl = &cleanup_codelet;	task->callback_func = cleanup_callback;	task->callback_arg = cleanup_arg;	task->use_tag = 1;	task->tag_id = j_tag;	starpu_tag_declare_deps_array(j_tag, cleanup_arg->ndeps, cleanup_arg->tags);	starpu_task_submit(task);}void strassen_mult(struct strassen_iter *iter){	if (iter->reclevel == 0)	{		struct starpu_task *task_mult = 			compute_add_sub_op(iter->C, MULT, iter->A, iter->B);		starpu_tag_t tag_mult = task_mult->tag_id;		starpu_tag_t deps_array[10];		unsigned indexA, indexB;		for (indexA = 0; indexA < iter->A_deps.ndeps; indexA++)		{			deps_array[indexA] = iter->A_deps.deps[indexA];		}		for (indexB = 0; indexB < iter->B_deps.ndeps; indexB++)		{			deps_array[indexB+indexA] = iter->B_deps.deps[indexB];		}		starpu_tag_declare_deps_array(tag_mult, indexA+indexB, deps_array);		iter->C_deps.ndeps = 1;		iter->C_deps.deps[0] = tag_mult;		starpu_task_submit(task_mult);		return;	}        starpu_data_handle A11 = starpu_data_get_sub_data(iter->A, 2, 0, 0);        starpu_data_handle A12 = starpu_data_get_sub_data(iter->A, 2, 1, 0);        starpu_data_handle A21 = starpu_data_get_sub_data(iter->A, 2, 0, 1);        starpu_data_handle A22 = starpu_data_get_sub_data(iter->A, 2, 1, 1);        starpu_data_handle B11 = starpu_data_get_sub_data(iter->B, 2, 0, 0);        starpu_data_handle B12 = starpu_data_get_sub_data(iter->B, 2, 1, 0);        starpu_data_handle B21 = starpu_data_get_sub_data(iter->B, 2, 0, 1);        starpu_data_handle B22 = starpu_data_get_sub_data(iter->B, 2, 1, 1);        starpu_data_handle C11 = starpu_data_get_sub_data(iter->C, 2, 0, 0);        starpu_data_handle C12 = starpu_data_get_sub_data(iter->C, 2, 1, 0);        starpu_data_handle C21 = starpu_data_get_sub_data(iter->C, 2, 0, 1);        starpu_data_handle C22 = starpu_data_get_sub_data(iter->C, 2, 1, 1);	unsigned size = starpu_matrix_get_nx(A11);	/* M1a = (A11 + A22) */	iter->Mia_data[0] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_1a = compute_add_sub_op(iter->Mia_data[0], ADD, A11, A22);	starpu_tag_t tag_1a = task_1a->tag_id;	starpu_tag_declare_deps_array(tag_1a, iter->A_deps.ndeps, iter->A_deps.deps);	starpu_task_submit(task_1a);	/* M1b = (B11 + B22) */	iter->Mib_data[0] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_1b = compute_add_sub_op(iter->Mib_data[0], ADD, B11, B22);	starpu_tag_t tag_1b = task_1b->tag_id;	starpu_tag_declare_deps_array(tag_1b, iter->B_deps.ndeps, iter->B_deps.deps);	starpu_task_submit(task_1b);	/* M2a = (A21 + A22) */	iter->Mia_data[1] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_2a = compute_add_sub_op(iter->Mia_data[1], ADD, A21, A22);	starpu_tag_t tag_2a = task_2a->tag_id;	starpu_tag_declare_deps_array(tag_2a, iter->A_deps.ndeps, iter->A_deps.deps);	starpu_task_submit(task_2a);	/* M3b = (B12 - B22) */	iter->Mib_data[2] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_3b = compute_add_sub_op(iter->Mib_data[2], SUB, B12, B22);	starpu_tag_t tag_3b = task_3b->tag_id;	starpu_tag_declare_deps_array(tag_3b, iter->B_deps.ndeps, iter->B_deps.deps);	starpu_task_submit(task_3b);		/* M4b = (B21 - B11) */	iter->Mib_data[3] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_4b = compute_add_sub_op(iter->Mib_data[3], SUB, B21, B11);	starpu_tag_t tag_4b = task_4b->tag_id;	starpu_tag_declare_deps_array(tag_4b, iter->B_deps.ndeps, iter->B_deps.deps);	starpu_task_submit(task_4b);		/* M5a = (A11 + A12) */	iter->Mia_data[4] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_5a = compute_add_sub_op(iter->Mia_data[4], ADD, A11, A12);	starpu_tag_t tag_5a = task_5a->tag_id;	starpu_tag_declare_deps_array(tag_5a, iter->A_deps.ndeps, iter->A_deps.deps);	starpu_task_submit(task_5a);	/* M6a = (A21 - A11) */	iter->Mia_data[5] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_6a = compute_add_sub_op(iter->Mia_data[5], SUB, A21, A11);	starpu_tag_t tag_6a = task_6a->tag_id;	starpu_tag_declare_deps_array(tag_6a, iter->A_deps.ndeps, iter->A_deps.deps);	starpu_task_submit(task_6a);	/* M6b = (B11 + B12) */	iter->Mib_data[5] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_6b = compute_add_sub_op(iter->Mib_data[5], SUB, B11, B12);	starpu_tag_t tag_6b = task_6b->tag_id;	starpu_tag_declare_deps_array(tag_6b, iter->B_deps.ndeps, iter->B_deps.deps);	starpu_task_submit(task_6b);	/* M7a = (A12 - A22) */	iter->Mia_data[6] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_7a = compute_add_sub_op(iter->Mia_data[6], SUB, A12, A22);	starpu_tag_t tag_7a = task_7a->tag_id;	starpu_tag_declare_deps_array(tag_7a, iter->A_deps.ndeps, iter->A_deps.deps);	starpu_task_submit(task_7a);	/* M7b = (B21 + B22) */	iter->Mib_data[6] = allocate_tmp_matrix(size, iter->reclevel);	struct starpu_task *task_7b = compute_add_sub_op(iter->Mib_data[6], ADD, B21, B22);	starpu_tag_t tag_7b = task_7b->tag_id;	starpu_tag_declare_deps_array(tag_7b, iter->B_deps.ndeps, iter->B_deps.deps);	starpu_task_submit(task_7b);	iter->Mi_data[0] = allocate_tmp_matrix(size, iter->reclevel);	iter->Mi_data[1] = allocate_tmp_matrix(size, iter->reclevel);	iter->Mi_data[2] = allocate_tmp_matrix(size, iter->reclevel);	iter->Mi_data[3] = allocate_tmp_matrix(size, iter->reclevel);	iter->Mi_data[4] = allocate_tmp_matrix(size, iter->reclevel);	iter->Mi_data[5] = allocate_tmp_matrix(size, iter->reclevel);	iter->Mi_data[6] = allocate_tmp_matrix(size, iter->reclevel);	/* M1 = M1a * M1b */	iter->children[0] = malloc(sizeof(struct strassen_iter));	iter->children[0]->reclevel = iter->reclevel - 1;	iter->children[0]->A_deps.ndeps = 1;	iter->children[0]->A_deps.deps[0] = tag_1a;	iter->children[0]->B_deps.ndeps = 1;	iter->children[0]->B_deps.deps[0] = tag_1b;	iter->children[0]->A = iter->Mia_data[0]; 	iter->children[0]->B = iter->Mib_data[0]; 	iter->children[0]->C = iter->Mi_data[0];	strassen_mult(iter->children[0]);	/* M2 = M2a * B11 */	iter->children[1] = malloc(sizeof(struct strassen_iter));	iter->children[1]->reclevel = iter->reclevel - 1;	iter->children[1]->A_deps.ndeps = 1;	iter->children[1]->A_deps.deps[0] = tag_2a;	iter->children[1]->B_deps.ndeps = iter->B_deps.ndeps;	memcpy(iter->children[1]->B_deps.deps, iter->B_deps.deps, iter->B_deps.ndeps*sizeof(starpu_tag_t));	iter->children[1]->A = iter->Mia_data[1]; 	iter->children[1]->B = B11; 	iter->children[1]->C = iter->Mi_data[1];	strassen_mult(iter->children[1]);	/* M3 = A11 * M3b */	iter->children[2] = malloc(sizeof(struct strassen_iter));	iter->children[2]->reclevel = iter->reclevel - 1;	iter->children[2]->A_deps.ndeps = iter->B_deps.ndeps;	memcpy(iter->children[2]->A_deps.deps, iter->A_deps.deps, iter->A_deps.ndeps*sizeof(starpu_tag_t));	iter->children[2]->B_deps.ndeps = 1;	iter->children[2]->B_deps.deps[0] = tag_3b;	iter->children[2]->A = A11; 	iter->children[2]->B = iter->Mib_data[2]; 	iter->children[2]->C = iter->Mi_data[2];	strassen_mult(iter->children[2]);	/* M4 = A22 * M4b */	iter->children[3] = malloc(sizeof(struct strassen_iter));	iter->children[3]->reclevel = iter->reclevel - 1;	iter->children[3]->A_deps.ndeps = iter->B_deps.ndeps;	memcpy(iter->children[3]->A_deps.deps, iter->A_deps.deps, iter->A_deps.ndeps*sizeof(starpu_tag_t));	iter->children[3]->B_deps.ndeps = 1;	iter->children[3]->B_deps.deps[0] = tag_4b;	iter->children[3]->A = A22; 	iter->children[3]->B = iter->Mib_data[3]; 	iter->children[3]->C = iter->Mi_data[3];	strassen_mult(iter->children[3]);	/* M5 = M5a * B22 */	iter->children[4] = malloc(sizeof(struct strassen_iter));	iter->children[4]->reclevel = iter->reclevel - 1;	iter->children[4]->A_deps.ndeps = 1;	iter->children[4]->A_deps.deps[0] = tag_5a;	iter->children[4]->B_deps.ndeps = iter->B_deps.ndeps;	memcpy(iter->children[4]->B_deps.deps, iter->B_deps.deps, iter->B_deps.ndeps*sizeof(starpu_tag_t));	iter->children[4]->A = iter->Mia_data[4]; 	iter->children[4]->B = B22; 	iter->children[4]->C = iter->Mi_data[4];	strassen_mult(iter->children[4]);	/* M6 = M6a * M6b */	iter->children[5] = malloc(sizeof(struct strassen_iter));	iter->children[5]->reclevel = iter->reclevel - 1;	iter->children[5]->A_deps.ndeps = 1;	iter->children[5]->A_deps.deps[0] = tag_6a;	iter->children[5]->B_deps.ndeps = 1;	iter->children[5]->B_deps.deps[0] = tag_6b;	iter->children[5]->A = iter->Mia_data[5]; 	iter->children[5]->B = iter->Mib_data[5]; 	iter->children[5]->C = iter->Mi_data[5];	strassen_mult(iter->children[5]);	/* M7 = M7a * M7b */	iter->children[6] = malloc(sizeof(struct strassen_iter));	iter->children[6]->reclevel = iter->reclevel - 1;	iter->children[6]->A_deps.ndeps = 1;	iter->children[6]->A_deps.deps[0] = tag_7a;	iter->children[6]->B_deps.ndeps = 1;	iter->children[6]->B_deps.deps[0] = tag_7b;	iter->children[6]->A = iter->Mia_data[6]; 	iter->children[6]->B = iter->Mib_data[6]; 	iter->children[6]->C = iter->Mi_data[6];	strassen_mult(iter->children[6]);	starpu_tag_t *tag_m1 = iter->children[0]->C_deps.deps;	starpu_tag_t *tag_m2 = iter->children[1]->C_deps.deps;	starpu_tag_t *tag_m3 = iter->children[2]->C_deps.deps;	starpu_tag_t *tag_m4 = iter->children[3]->C_deps.deps;	starpu_tag_t *tag_m5 = iter->children[4]->C_deps.deps;	starpu_tag_t *tag_m6 = iter->children[5]->C_deps.deps;	starpu_tag_t *tag_m7 = iter->children[6]->C_deps.deps;	/* C11 = M1 + M4 - M5 + M7 */	struct starpu_task *task_c11_a = compute_self_add_sub_op(C11, ADD, iter->Mi_data[0]);	struct starpu_task *task_c11_b = compute_self_add_sub_op(C11, ADD, iter->Mi_data[3]);	struct starpu_task *task_c11_c = compute_self_add_sub_op(C11, SUB, iter->Mi_data[4]);	struct starpu_task *task_c11_d = compute_self_add_sub_op(C11, ADD, iter->Mi_data[6]);	starpu_tag_t tag_c11_a = task_c11_a->tag_id;	starpu_tag_t tag_c11_b = task_c11_b->tag_id;	starpu_tag_t tag_c11_c = task_c11_c->tag_id;	starpu_tag_t tag_c11_d = task_c11_d->tag_id;	/* C12 = M3 + M5 */	struct starpu_task *task_c12_a = compute_self_add_sub_op(C12, ADD, iter->Mi_data[2]);	struct starpu_task *task_c12_b = compute_self_add_sub_op(C12, ADD, iter->Mi_data[4]);	starpu_tag_t tag_c12_a = task_c12_a->tag_id;	starpu_tag_t tag_c12_b = task_c12_b->tag_id;	/* C21 = M2 + M4 */	struct starpu_task *task_c21_a = compute_self_add_sub_op(C21, ADD, iter->Mi_data[1]);	struct starpu_task *task_c21_b = compute_self_add_sub_op(C21, ADD, iter->Mi_data[3]);	starpu_tag_t tag_c21_a = task_c21_a->tag_id;	starpu_tag_t tag_c21_b = task_c21_b->tag_id;	/* C22 = M1 - M2 + M3 + M6 */	struct starpu_task *task_c22_a = compute_self_add_sub_op(C22, ADD, iter->Mi_data[0]);	struct starpu_task *task_c22_b = compute_self_add_sub_op(C22, SUB, iter->Mi_data[1]);	struct starpu_task *task_c22_c = compute_self_add_sub_op(C22, ADD, iter->Mi_data[3]);	struct starpu_task *task_c22_d = compute_self_add_sub_op(C22, ADD, iter->Mi_data[5]);	starpu_tag_t tag_c22_a = task_c22_a->tag_id;	starpu_tag_t tag_c22_b = task_c22_b->tag_id;	starpu_tag_t tag_c22_c = task_c22_c->tag_id;	starpu_tag_t tag_c22_d = task_c22_d->tag_id;	if (iter->reclevel == 1)	{		starpu_tag_declare_deps(tag_c11_a, 1, tag_m1[0]);		starpu_tag_declare_deps(tag_c11_b, 2, tag_m4[0], tag_c11_a);		starpu_tag_declare_deps(tag_c11_c, 2, tag_m5[0], tag_c11_b);		starpu_tag_declare_deps(tag_c11_d, 2, tag_m7[0], tag_c11_c);			starpu_tag_declare_deps(tag_c12_a, 1, tag_m3[0]);		starpu_tag_declare_deps(tag_c12_b, 2, tag_m5[0], tag_c12_a);		starpu_tag_declare_deps(tag_c21_a, 1, tag_m2[0]);		starpu_tag_declare_deps(tag_c21_b, 2, tag_m4[0], tag_c21_a);			starpu_tag_declare_deps(tag_c22_a, 1, tag_m1[0]);		starpu_tag_declare_deps(tag_c22_b, 2, tag_m2[0], tag_c22_a);		starpu_tag_declare_deps(tag_c22_c, 2, tag_m3[0], tag_c22_b);		starpu_tag_declare_deps(tag_c22_d, 2, tag_m6[0], tag_c22_c);	}	else	{		starpu_tag_declare_deps(tag_c11_a, 4, tag_m1[0], tag_m1[1], tag_m1[2], tag_m1[3]);		starpu_tag_declare_deps(tag_c11_b, 5, tag_m4[0], tag_m4[1], tag_m4[2], tag_m4[3], tag_c11_a);		starpu_tag_declare_deps(tag_c11_c, 5, tag_m5[0], tag_m5[1], tag_m5[2], tag_m5[3], tag_c11_b);		starpu_tag_declare_deps(tag_c11_d, 5, tag_m7[0], tag_m7[1], tag_m7[2], tag_m7[3], tag_c11_c);		starpu_tag_declare_deps(tag_c12_a, 4, tag_m3[0], tag_m3[1], tag_m3[2], tag_m3[3]);		starpu_tag_declare_deps(tag_c12_b, 5, tag_m5[0], tag_m5[1], tag_m5[2], tag_m5[3], tag_c12_a);		starpu_tag_declare_deps(tag_c21_a, 4, tag_m2[0], tag_m2[1], tag_m2[2], tag_m2[3]);		starpu_tag_declare_deps(tag_c21_b, 5, tag_m4[0], tag_m4[1], tag_m4[2], tag_m4[3], tag_c21_a);		starpu_tag_declare_deps(tag_c22_a, 4, tag_m1[0], tag_m1[1], tag_m1[2], tag_m1[3]);		starpu_tag_declare_deps(tag_c22_b, 5, tag_m2[0], tag_m2[1], tag_m2[2], tag_m2[3], tag_c22_a);		starpu_tag_declare_deps(tag_c22_c, 5, tag_m3[0], tag_m3[1], tag_m3[2], tag_m3[3], tag_c22_b);		starpu_tag_declare_deps(tag_c22_d, 5, tag_m6[0], tag_m6[1], tag_m6[2], tag_m6[3], tag_c22_c);	}	starpu_task_submit(task_c11_a);	starpu_task_submit(task_c11_b);	starpu_task_submit(task_c11_c);	starpu_task_submit(task_c11_d);	starpu_task_submit(task_c12_a);	starpu_task_submit(task_c12_b);	starpu_task_submit(task_c21_a);	starpu_task_submit(task_c21_b);	starpu_task_submit(task_c22_a);	starpu_task_submit(task_c22_b);	starpu_task_submit(task_c22_c);	starpu_task_submit(task_c22_d);	iter->C_deps.ndeps = 4;	iter->C_deps.deps[0] = tag_c11_d;	iter->C_deps.deps[1] = tag_c12_b;	iter->C_deps.deps[2] = tag_c21_b;	iter->C_deps.deps[3] = tag_c22_d;	struct cleanup_arg *clean_struct = malloc(sizeof(struct cleanup_arg));	clean_struct->ndeps = 4;		clean_struct->tags[0] = tag_c11_d;		clean_struct->tags[1] = tag_c12_b;		clean_struct->tags[2] = tag_c21_b;		clean_struct->tags[3] = tag_c22_d;	clean_struct->ndata = 17;		clean_struct->data[0] = iter->Mia_data[0];		clean_struct->data[1] = iter->Mib_data[0];		clean_struct->data[2] = iter->Mia_data[1];		clean_struct->data[3] = iter->Mib_data[2];		clean_struct->data[4] = iter->Mib_data[3];		clean_struct->data[5] = iter->Mia_data[4];		clean_struct->data[6] = iter->Mia_data[5];		clean_struct->data[7] = iter->Mib_data[5];		clean_struct->data[8] = iter->Mia_data[6];		clean_struct->data[9] = iter->Mib_data[6];		clean_struct->data[10] = iter->Mi_data[0];		clean_struct->data[11] = iter->Mi_data[1];		clean_struct->data[12] = iter->Mi_data[2];		clean_struct->data[13] = iter->Mi_data[3];		clean_struct->data[14] = iter->Mi_data[4];		clean_struct->data[15] = iter->Mi_data[5];		clean_struct->data[16] = iter->Mi_data[6];			create_cleanup_task(clean_struct);}static void dummy_codelet_func(__attribute__((unused))void *descr[],				__attribute__((unused))  void *arg){}static starpu_codelet dummy_codelet = {	.where = STARPU_CPU|STARPU_CUDA,	.model = NULL,	.cpu_func = dummy_codelet_func,	#ifdef STARPU_USE_CUDA	.cuda_func = dummy_codelet_func,	#endif	.nbuffers = 0};static struct starpu_task *dummy_task(starpu_tag_t tag){	struct starpu_task *task =starpu_task_create();		task->callback_func = NULL;                task->cl = &dummy_codelet;                task->cl_arg = NULL;	task->use_tag = 1;	task->tag_id = tag;	return task;}void parse_args(int argc, char **argv){        int i;        for (i = 1; i < argc; i++) {                if (strcmp(argv[i], "-size") == 0) {                        char *argptr;                        size = strtol(argv[++i], &argptr, 10);                }                if (strcmp(argv[i], "-rec") == 0) {                        char *argptr;                        reclevel = strtol(argv[++i], &argptr, 10);                }                if (strcmp(argv[i], "-no-random") == 0) {                        norandom = 1;                }                if (strcmp(argv[i], "-pin") == 0) {                        pin = 1;                }        }}int main(int argc, char **argv){	starpu_data_handle data_A, data_B, data_C;	float *A, *B, *C;	struct timeval start;	struct timeval end;	parse_args(argc, argv);	assert(reclevel <= MAXREC);	/* this is an upper bound ! */	used_mem_predicted = size*size*(predicted_mem[reclevel] + 1);	fprintf(stderr, "(Predicted) Memory consumption: %ld MB\n", used_mem_predicted/(1024*1024));	starpu_init(NULL);	starpu_helper_cublas_init();#ifdef STARPU_USE_CUDA        if (pin) {                starpu_data_malloc_pinned_if_possible((void **)&bigbuffer, used_mem_predicted);        } else#endif        {#ifdef STARPU_HAVE_POSIX_MEMALIGN                posix_memalign((void **)&bigbuffer, 4096, used_mem_predicted);#else		bigbuffer = malloc(used_mem_predicted);#endif	}	A = allocate_tmp_matrix_wrapper(size*size*sizeof(float));	B = allocate_tmp_matrix_wrapper(size*size*sizeof(float));	C = allocate_tmp_matrix_wrapper(size*size*sizeof(float));	starpu_matrix_data_register(&data_A, 0, (uintptr_t)A, size, size, size, sizeof(float));	starpu_matrix_data_register(&data_B, 0, (uintptr_t)B, size, size, size, sizeof(float));	starpu_matrix_data_register(&data_C, 0, (uintptr_t)C, size, size, size, sizeof(float));	unsigned rec;	for (rec = 0; rec < reclevel; rec++)	{		starpu_map_filters(data_A, 2, &f, &f2);		starpu_map_filters(data_B, 2, &f, &f2);		starpu_map_filters(data_C, 2, &f, &f2);	}	struct strassen_iter iter;		iter.reclevel = reclevel;		iter.A = data_A;		iter.B = data_B;		iter.C = data_C;		iter.A_deps.ndeps = 1;		iter.A_deps.deps[0] = 42;		iter.B_deps.ndeps = 1;		iter.B_deps.deps[0] = 42;	strassen_mult(&iter);	starpu_tag_declare_deps_array(10, iter.C_deps.ndeps, iter.C_deps.deps);	fprintf(stderr, "Using %ld MB of memory\n", used_mem/(1024*1024));	struct starpu_task *task_start = dummy_task(42);	gettimeofday(&start, NULL);	starpu_task_submit(task_start);	struct starpu_task *task_end = dummy_task(10);		task_end->synchronous = 1;	starpu_task_submit(task_end);	gettimeofday(&end, NULL);	starpu_helper_cublas_shutdown();	starpu_shutdown();	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));	display_perf(timing, size);	return 0;}
 |