Browse Source

Remove the "strassen" example as it violates many basic assumptions of StarPU.

Cédric Augonnet 15 years ago
parent
commit
cf0348f0d6

+ 0 - 36
examples/Makefile.am

@@ -118,8 +118,6 @@ noinst_HEADERS = 				\
 	fortran/bindings/StarPU_fortran.h	\
 	ppm_downscaler/ppm_downscaler.h		\
 	ppm_downscaler/yuv_downscaler.h		\
-	strassen/strassen.h			\
-	strassen/strassen_models.h		\
 	spmv/matrix_market/mmio.h		\
 	spmv/matrix_market/mm_to_bcsr.h		\
 	spmv/dw_spmv.h				\
@@ -378,40 +376,6 @@ tag_example_tag_example3_SOURCES =		\
 tag_example_tag_restartable_SOURCES =		\
 	tag_example/tag_restartable.c
 
-####################
-# Strassen example #
-####################
-
-if ATLAS_BLAS_LIB
-
-examplebin_PROGRAMS += strassen/dw_strassen
-
-strassen_dw_strassen_SOURCES = 			\
-	strassen/strassen.c			\
-	strassen/strassen_kernels.c		\
-	strassen/test_strassen.c		\
-	strassen/strassen_models.c		\
-	common/blas.c
-
-endif
-
-#####################
-# Strassen2 example #
-#####################
-
-if !NO_BLAS_LIB
-
-check_PROGRAMS += strassen2/strassen
-
-examplebin_PROGRAMS += strassen2/strassen
-
-strassen2_strassen_SOURCES = 			\
-	strassen2/strassen2.c			\
-	strassen2/strassen2_kernels.c		\
-	common/blas.c
-
-endif
-
 ################
 # SpMV example #
 ################

+ 0 - 515
examples/strassen/strassen.c

@@ -1,515 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "strassen.h"
-#include "strassen_models.h"
-
-static starpu_data_handle create_tmp_matrix(starpu_data_handle M)
-{
-	float *data;
-	starpu_data_handle state = malloc(sizeof(starpu_data_handle));
-
-	/* create a matrix with the same dimensions as M */
-	uint32_t nx = starpu_matrix_get_nx(M);
-	uint32_t ny = starpu_matrix_get_nx(M);
-
-	STARPU_ASSERT(state);
-
-	data = malloc(nx*ny*sizeof(float));
-	STARPU_ASSERT(data);
-
-	starpu_matrix_data_register(&state, 0, (uintptr_t)data, nx, nx, ny, sizeof(float));
-	
-	return state;
-}
-
-static void free_tmp_matrix(starpu_data_handle matrix)
-{
-	starpu_data_unregister(matrix);
-	free(matrix);
-}
-
-
-static void partition_matrices(strassen_iter_state_t *iter)
-{
-
-	starpu_data_handle A = iter->A;
-	starpu_data_handle B = iter->B;
-	starpu_data_handle C = iter->C;
-
-	starpu_filter f;
-	f.filter_func = starpu_block_filter_func;
-	f.filter_arg = 2;
-
-	starpu_filter f2;
-	f2.filter_func = starpu_vertical_block_filter_func;
-	f2.filter_arg = 2;
-
-	starpu_map_filters(A, 2, &f, &f2);
-	starpu_map_filters(B, 2, &f, &f2);
-	starpu_map_filters(C, 2, &f, &f2);
-
-	iter->A11 = starpu_data_get_sub_data(A, 2, 0, 0);
-	iter->A12 = starpu_data_get_sub_data(A, 2, 1, 0);
-	iter->A21 = starpu_data_get_sub_data(A, 2, 0, 1);
-	iter->A22 = starpu_data_get_sub_data(A, 2, 1, 1);
-
-	iter->B11 = starpu_data_get_sub_data(B, 2, 0, 0);
-	iter->B12 = starpu_data_get_sub_data(B, 2, 1, 0);
-	iter->B21 = starpu_data_get_sub_data(B, 2, 0, 1);
-	iter->B22 = starpu_data_get_sub_data(B, 2, 1, 1);
-
-	iter->C11 = starpu_data_get_sub_data(C, 2, 0, 0);
-	iter->C12 = starpu_data_get_sub_data(C, 2, 1, 0);
-	iter->C21 = starpu_data_get_sub_data(C, 2, 0, 1);
-	iter->C22 = starpu_data_get_sub_data(C, 2, 1, 1);
-
-	/* TODO check that all sub-matrices have the same size */
-}
-
-static void unpartition_matrices(strassen_iter_state_t *iter)
-{
-	/* TODO there is no  need to actually gather those results ... */
-	starpu_data_unpartition(iter->A, 0);
-	starpu_data_unpartition(iter->B, 0);
-	starpu_data_unpartition(iter->C, 0);
-}
-
-static starpu_codelet cl_add = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_add_sub,
-	.cpu_func = add_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = add_cublas_codelet,
-#endif
-	.nbuffers = 3
-};
-
-static starpu_codelet cl_sub = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_add_sub,
-	.cpu_func = sub_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = sub_cublas_codelet,
-#endif
-	.nbuffers = 3
-};
-
-static starpu_codelet cl_mult = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_mult,
-	.cpu_func = mult_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = mult_cublas_codelet,
-#endif
-	.nbuffers = 3
-};
-
-static starpu_codelet cl_self_add = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_self_add_sub,
-	.cpu_func = self_add_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = self_add_cublas_codelet,
-#endif
-	.nbuffers = 2
-};
-
-static starpu_codelet cl_self_sub = {
-	.where = STARPU_CPU|STARPU_CUDA,
-	.model = &strassen_model_self_add_sub,
-	.cpu_func = self_sub_cpu_codelet,
-#ifdef STARPU_USE_CUDA
-	.cuda_func = self_sub_cublas_codelet,
-#endif
-	.nbuffers = 2
-};
-
-static void compute_add_sub_op(starpu_data_handle A1, operation op,
-				starpu_data_handle A2, starpu_data_handle C, 
-				void (*callback)(void *), void *argcallback)
-{
-	/* performs C = (A op B) */
-	struct starpu_task *task = starpu_task_create();
-		task->cl_arg = NULL;
-		task->use_tag = 0;
-
-	task->buffers[0].handle = C;
-	task->buffers[0].mode = STARPU_W;
-	task->buffers[1].handle = A1;
-	task->buffers[1].mode = STARPU_R;
-	task->buffers[2].handle = A2;
-	task->buffers[2].mode = STARPU_R;
-	
-	task->callback_func = callback;
-	task->callback_arg = argcallback;
-
-	switch (op) {
-		case ADD:
-			STARPU_ASSERT(A1);
-			STARPU_ASSERT(A2);
-			STARPU_ASSERT(C);
-			task->cl = &cl_add;
-			break;
-		case SUB:
-			STARPU_ASSERT(A1);
-			STARPU_ASSERT(A2);
-			STARPU_ASSERT(C);
-			task->cl = &cl_sub;
-			break;
-		case MULT:
-			STARPU_ASSERT(A1);
-			STARPU_ASSERT(A2);
-			STARPU_ASSERT(C);
-			task->cl = &cl_mult;
-			break;
-		case SELFADD:
-			task->buffers[0].mode = STARPU_RW;
-			task->cl = &cl_self_add;
-			break;
-		case SELFSUB:
-			task->buffers[0].mode = STARPU_RW;
-			task->cl = &cl_self_sub;
-			break;
-		default:
-			STARPU_ABORT();
-	}
-
-	starpu_task_submit(task);
-}
-
-/* Cij +=/-= Ek is done */
-void phase_3_callback_function(void *_arg)
-{
-	unsigned cnt, use_cnt;
-	phase3_t *arg = _arg;
-
-	unsigned i = arg->i;
-	strassen_iter_state_t *iter = arg->iter;
-
-	free(arg);
-
-	use_cnt = STARPU_ATOMIC_ADD(&iter->Ei_remaining_use[i], -1);
-	if (use_cnt == 0) 
-	{
-		/* no one needs Ei anymore : free it */
-		switch (i) {
-			case 0:
-				free_tmp_matrix(iter->E1);
-				break;
-			case 1:
-				free_tmp_matrix(iter->E2);
-				break;
-			case 2:
-				free_tmp_matrix(iter->E3);
-				break;
-			case 3:
-				free_tmp_matrix(iter->E4);
-				break;
-			case 4:
-				free_tmp_matrix(iter->E5);
-				break;
-			case 5:
-				free_tmp_matrix(iter->E6);
-				break;
-			case 6:
-				free_tmp_matrix(iter->E7);
-				break;
-			default:
-				STARPU_ABORT();
-		}
-	}
-
-	cnt = STARPU_ATOMIC_ADD(&iter->counter, -1);
-	if (cnt == 0)
-	{
-		/* the entire strassen iteration is done ! */
-		unpartition_matrices(iter);
-
-		// XXX free the Ei
-		STARPU_ASSERT(iter->strassen_iter_callback);
-		iter->strassen_iter_callback(iter->argcb);
-
-		free(iter);
-	}
-}
-
-
-
-/* Ei is computed */
-void phase_2_callback_function(void *_arg)
-{
-	phase2_t *arg = _arg;
-
-	strassen_iter_state_t *iter = arg->iter;
-	unsigned i = arg->i;
-
-	free(arg);
-
-	phase3_t *arg1, *arg2;
-	arg1 = malloc(sizeof(phase3_t));
-	arg2 = malloc(sizeof(phase3_t));
-
-	arg1->iter = iter;
-	arg2->iter = iter;
-
-	arg1->i = i;
-	arg2->i = i;
-
-	switch (i) {
-		case 0:
-			free(arg2); // will not be needed .. 
-			free_tmp_matrix(iter->E11);
-			free_tmp_matrix(iter->E12);
-			/* C11 += E1 */
-			compute_add_sub_op(iter->E1, SELFADD, NULL, iter->C11, phase_3_callback_function, arg1);
-			break;
-		case 1:
-			free_tmp_matrix(iter->E21);
-			free_tmp_matrix(iter->E22);
-			/* C11 += E2 */
-			compute_add_sub_op(iter->E2, SELFADD, NULL, iter->C11, phase_3_callback_function, arg1);
-			/* C22 += E2 */
-			compute_add_sub_op(iter->E2, SELFADD, NULL, iter->C22, phase_3_callback_function, arg2);
-			break;
-		case 2:
-			free(arg2); // will not be needed .. 
-			free_tmp_matrix(iter->E31);
-			free_tmp_matrix(iter->E32);
-			/* C22 -= E3 */
-			compute_add_sub_op(iter->E3, SELFSUB, NULL, iter->C22, phase_3_callback_function, arg1);
-			break;
-		case 3:
-			free_tmp_matrix(iter->E41);
-			/* C11 -= E4 */
-			compute_add_sub_op(iter->E4, SELFSUB, NULL, iter->C11, phase_3_callback_function, arg1);
-			/* C12 += E4 */
-			compute_add_sub_op(iter->E4, SELFADD, NULL, iter->C12, phase_3_callback_function, arg2);
-			break;
-		case 4:
-			free_tmp_matrix(iter->E52);
-			/* C12 += E5 */
-			compute_add_sub_op(iter->E5, SELFADD, NULL, iter->C12, phase_3_callback_function, arg1);
-			/* C22 += E5 */
-			compute_add_sub_op(iter->E5, SELFADD, NULL, iter->C22, phase_3_callback_function, arg2);
-			break;
-		case 5:
-			free_tmp_matrix(iter->E62);
-			/* C11 += E6 */
-			compute_add_sub_op(iter->E6, SELFADD, NULL, iter->C11, phase_3_callback_function, arg1);
-			/* C21 += E6 */
-			compute_add_sub_op(iter->E6, SELFADD, NULL, iter->C21, phase_3_callback_function, arg2);
-			break;
-		case 6:
-			free_tmp_matrix(iter->E71);
-			/* C21 += E7 */
-			compute_add_sub_op(iter->E7, SELFADD, NULL, iter->C21, phase_3_callback_function, arg1);
-			/* C22 -= E7 */
-			compute_add_sub_op(iter->E7, SELFSUB, NULL, iter->C22, phase_3_callback_function, arg2);
-			break;
-		default:
-			STARPU_ABORT();
-	}
-}
-
-
-/* computes Ei */
-static void _strassen_phase_2(strassen_iter_state_t *iter, unsigned i)
-{
-	phase2_t *phase_2_arg = malloc(sizeof(phase2_t));
-
-	phase_2_arg->iter = iter;
-	phase_2_arg->i = i;
-
-	/* XXX */
-	starpu_data_handle A;
-	starpu_data_handle B;
-	starpu_data_handle C;
-
-	switch (i) {
-		case 0:
-			A = iter->E11; B = iter->E12;
-			iter->E1 = create_tmp_matrix(A);
-			C = iter->E1;
-			break;
-		case 1:
-			A = iter->E21; B = iter->E22;
-			iter->E2 = create_tmp_matrix(A);
-			C = iter->E2;
-			break;
-		case 2:
-			A = iter->E31; B = iter->E32;
-			iter->E3 = create_tmp_matrix(A);
-			C = iter->E3;
-			break;
-		case 3:
-			A = iter->E41; B = iter->E42;
-			iter->E4 = create_tmp_matrix(A);
-			C = iter->E4;
-			break;
-		case 4:
-			A = iter->E51; B = iter->E52;
-			iter->E5 = create_tmp_matrix(A);
-			C = iter->E5;
-			break;
-		case 5:
-			A = iter->E61; B = iter->E62;
-			iter->E6 = create_tmp_matrix(A);
-			C = iter->E6;
-			break;
-		case 6:
-			A = iter->E71; B = iter->E72;
-			iter->E7 = create_tmp_matrix(A);
-			C = iter->E7;
-			break;
-		default:
-			STARPU_ABORT();
-	}
-
-	STARPU_ASSERT(A);
-	STARPU_ASSERT(B);
-	STARPU_ASSERT(C);
-
-	// DEBUG XXX
-	//compute_add_sub_op(A, MULT, B, C, phase_2_callback_function, phase_2_arg);
-	strassen(A, B, C, phase_2_callback_function, phase_2_arg, iter->reclevel-1);
-}
-
-
-#define THRESHHOLD	128
-
-static void phase_1_callback_function(void *_arg)
-{
-
-	phase1_t *arg = _arg;
-	strassen_iter_state_t *iter = arg->iter;
-	unsigned i = arg->i;
-
-	free(arg);
-
-	unsigned cnt = STARPU_ATOMIC_ADD(&iter->Ei12[i], +1);
-
-	if (cnt == 2) {
-		/* Ei1 and Ei2 are ready, compute Ei */
-		_strassen_phase_2(iter, i);
-	}
-}
-
-/* computes Ei1 or Ei2 with i in 0-6 */
-static void _strassen_phase_1(starpu_data_handle A1, operation opA, starpu_data_handle A2,
-			      starpu_data_handle C, strassen_iter_state_t *iter, unsigned i)
-{
-	phase1_t *phase_1_arg = malloc(sizeof(phase1_t));
-	phase_1_arg->iter = iter;
-	phase_1_arg->i = i;
-
-	compute_add_sub_op(A1, opA, A2, C, phase_1_callback_function, phase_1_arg);
-}
-
-strassen_iter_state_t *init_strassen_iter_state(starpu_data_handle A, starpu_data_handle B, starpu_data_handle C, void (*strassen_iter_callback)(void *), void *argcb)
-{
-	strassen_iter_state_t *iter_state = malloc(sizeof(strassen_iter_state_t));
-
-	iter_state->Ei12[0] = 0;
-	iter_state->Ei12[1] = 0;
-	iter_state->Ei12[2] = 0;
-	iter_state->Ei12[3] = 1; // E42 = B22
-	iter_state->Ei12[4] = 1; // E51 = A11
-	iter_state->Ei12[5] = 1; // E61 = A22
-	iter_state->Ei12[6] = 1; // E72 = B11
-
-	iter_state->Ei_remaining_use[0] = 1; 
-	iter_state->Ei_remaining_use[1] = 2;
-	iter_state->Ei_remaining_use[2] = 1;
-	iter_state->Ei_remaining_use[3] = 2;
-	iter_state->Ei_remaining_use[4] = 2;
-	iter_state->Ei_remaining_use[5] = 2;
-	iter_state->Ei_remaining_use[6] = 2;
-
-	unsigned i;
-	for (i = 0; i < 6; i++)
-	{
-		iter_state->Ei[i] = 0;
-	}
-
-	for (i = 0; i < 4; i++)
-	{
-		iter_state->Cij[i] = 0;
-	}
-
-	iter_state->strassen_iter_callback = strassen_iter_callback;
-	iter_state->argcb = argcb;
-
-	iter_state->A = A;
-	iter_state->B = B;
-	iter_state->C = C;
-
-	iter_state->counter = 12;
-
-	return iter_state;
-}
-
-static void _do_strassen(starpu_data_handle A, starpu_data_handle B, starpu_data_handle C, void (*strassen_iter_callback)(void *), void *argcb, unsigned reclevel)
-{
-	/* do one level of recursion in the strassen algorithm */
-	strassen_iter_state_t *iter = init_strassen_iter_state(A, B, C, strassen_iter_callback, argcb);
-
-	partition_matrices(iter);
-	iter->reclevel = reclevel;
-
-	/* some Eij are already known */
-	iter->E11 = create_tmp_matrix(iter->A11);
-	iter->E12 = create_tmp_matrix(iter->B21);
-	iter->E21 = create_tmp_matrix(iter->A11);
-	iter->E22 = create_tmp_matrix(iter->B11);
-	iter->E31 = create_tmp_matrix(iter->A11);
-	iter->E32 = create_tmp_matrix(iter->B11);
-	iter->E41 = create_tmp_matrix(iter->A11);
-	iter->E42 = iter->B22;
-	iter->E51 = iter->A11;
-	iter->E52 = create_tmp_matrix(iter->B12);
-	iter->E61 = iter->A22;
-	iter->E62 = create_tmp_matrix(iter->B21);
-	iter->E71 = create_tmp_matrix(iter->A21);
-	iter->E72 = iter->B11;
-
-	/* compute all Eij */
-	_strassen_phase_1(iter->A11, SUB, iter->A22, iter->E11, iter, 0);
-	_strassen_phase_1(iter->B21, ADD, iter->B22, iter->E12, iter, 0);
-	_strassen_phase_1(iter->A11, ADD, iter->A22, iter->E21, iter, 1);
-	_strassen_phase_1(iter->B11, ADD, iter->B22, iter->E22, iter, 1);
-	_strassen_phase_1(iter->A11, SUB, iter->A21, iter->E31, iter, 2);
-	_strassen_phase_1(iter->B11, ADD, iter->B12, iter->E32, iter, 2);
-	_strassen_phase_1(iter->A11, ADD, iter->A12, iter->E41, iter, 3);
-	_strassen_phase_1(iter->B12, SUB, iter->B22, iter->E52, iter, 4);
-	_strassen_phase_1(iter->B21, SUB, iter->B11, iter->E62, iter, 5);
-	_strassen_phase_1(iter->A21, ADD, iter->A22, iter->E71, iter, 6);
-}
-
-
-void strassen(starpu_data_handle A, starpu_data_handle B, starpu_data_handle C, void (*callback)(void *), void *argcb, unsigned reclevel)
-{
-	/* C = A * B */
-	if ( reclevel == 0 )
-	{
-		/* don't use Strassen but a simple sequential multiplication
-		 * provided this is small enough */
-		compute_add_sub_op(A, MULT, B, C, callback, argcb);
-	}
-	else {
-		_do_strassen(A, B, C, callback, argcb, reclevel);
-	}
-}

+ 0 - 114
examples/strassen/strassen.h

@@ -1,114 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __STRASSEN_H__
-#define __STRASSEN_H__
-
-#include <semaphore.h>
-#include <sys/time.h>
-#include <string.h>
-#include <math.h>
-#include <sys/types.h>
-#include <pthread.h>
-#include <signal.h>
-#include <cblas.h>
-
-#include <starpu_config.h>
-#ifdef STARPU_USE_CUDA
-#include <cuda.h>
-#include <cublas.h>
-#endif
-
-#include <starpu.h>
-
-typedef enum {
-	ADD,
-	SUB,
-	MULT,
-	SELFADD,
-	SELFSUB,
-	NONE
-} operation;
-
-typedef struct {
-	/* monitor the progress of the algorithm */
-	unsigned Ei12[7]; // Ei12[k] is 0, 1 or 2 (2 = finished Ei1 and Ei2)
-	unsigned Ei[7];
-	unsigned Ei_remaining_use[7];
-	unsigned Cij[4];
-
-	starpu_data_handle A, B, C;
-	starpu_data_handle A11, A12, A21, A22;
-	starpu_data_handle B11, B12, B21, B22;
-	starpu_data_handle C11, C12, C21, C22;
-
-	starpu_data_handle E1, E2, E3, E4, E5, E6, E7;
-	starpu_data_handle E11, E12, E21, E22, E31, E32, E41, E52, E62, E71;
-
-	starpu_data_handle E42, E51, E61, E72;
-
-	unsigned reclevel;
-	
-	/* */
-	unsigned counter;
-
-	/* called at the end of the iteration */
-	void (*strassen_iter_callback)(void *);
-	void *argcb;
-} strassen_iter_state_t;
-
-typedef struct {
-	strassen_iter_state_t *iter;
-
-	/* phase 1 computes Ei1 or Ei2 with i in 0-6 */
-	unsigned i;
-} phase1_t;
-
-typedef struct {
-	strassen_iter_state_t *iter;
-
-	/* phase 2 computes Ei with i in 0-6 */
-	unsigned i;
-} phase2_t;
-
-typedef struct {
-	strassen_iter_state_t *iter;
-
-	/* phase 2 computes Ei with i in 0-6 */
-	unsigned i;
-} phase3_t;
-
-void mult_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-void sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-void add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-void self_add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-void self_sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg);
-
-#ifdef STARPU_USE_CUDA
-void mult_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-void sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-void add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-void self_add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-void self_sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg);
-#endif
-
-void strassen(starpu_data_handle A, starpu_data_handle B, starpu_data_handle C, void (*callback)(void *), void *argcb, unsigned reclevel);
-
-extern struct starpu_perfmodel_t strassen_model_mult;
-extern struct starpu_perfmodel_t strassen_model_add_sub;
-extern struct starpu_perfmodel_t strassen_model_self_add_sub;
-
-#endif // __STRASSEN_H__

+ 0 - 204
examples/strassen/strassen_kernels.c

@@ -1,204 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "strassen.h"
-
-
-static void mult_common_codelet(void *descr[], int s, __attribute__((unused))  void *arg)
-{
-	float *center 	= (float *)STARPU_GET_MATRIX_PTR(descr[0]);
-	float *left 	= (float *)STARPU_GET_MATRIX_PTR(descr[1]);
-	float *right 	= (float *)STARPU_GET_MATRIX_PTR(descr[2]);
-
-	unsigned dx = STARPU_GET_MATRIX_NX(descr[0]);
-	unsigned dy = STARPU_GET_MATRIX_NY(descr[0]);
-	unsigned dz = STARPU_GET_MATRIX_NX(descr[1]);
-
-	unsigned ld21 = STARPU_GET_MATRIX_LD(descr[1]);
-	unsigned ld12 = STARPU_GET_MATRIX_LD(descr[2]);
-	unsigned ld22 = STARPU_GET_MATRIX_LD(descr[0]);
-
-	switch (s) {
-		case 0:
-			cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, 
-				dy, dx, dz, -1.0f, left, ld21, right, ld12,
-					     1.0f, center, ld22);
-			break;
-#ifdef STARPU_USE_CUDA
-		case 1:
-			cublasSgemm('t', 'n', dx, dy, dz, 
-					-1.0f, right, ld12, left, ld21, 
-					 1.0f, center, ld22);
-			cudaThreadSynchronize();
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-			break;
-	}
-}
-
-void mult_cpu_codelet(void *descr[], void *_args)
-{
-	mult_common_codelet(descr, 0, _args);
-}
-
-#ifdef STARPU_USE_CUDA
-void mult_cublas_codelet(void *descr[], void *_args)
-{
-	mult_common_codelet(descr, 1, _args);
-}
-#endif
-
-static void add_sub_common_codelet(void *descr[], int s, __attribute__((unused))  void *arg, float alpha)
-{
-	/* C = A op B */
-
-	float *C 	= (float *)STARPU_GET_MATRIX_PTR(descr[0]);
-	float *A 	= (float *)STARPU_GET_MATRIX_PTR(descr[1]);
-	float *B 	= (float *)STARPU_GET_MATRIX_PTR(descr[2]);
-
-	unsigned dx = STARPU_GET_MATRIX_NX(descr[0]);
-	unsigned dy = STARPU_GET_MATRIX_NY(descr[0]);
-
-	unsigned ldA = STARPU_GET_MATRIX_LD(descr[1]);
-	unsigned ldB = STARPU_GET_MATRIX_LD(descr[2]);
-	unsigned ldC = STARPU_GET_MATRIX_LD(descr[0]);
-
-	// TODO check dim ...
-
-	unsigned line;
-
-	switch (s) {
-		case 0:
-			for (line = 0; line < dy; line++)
-			{
-				/* copy line A into C */
-				cblas_saxpy(dx, 1.0f, &A[line*ldA], 1, &C[line*ldC], 1);
-				/* add line B to C = A */
-				cblas_saxpy(dx, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
-			}
-			break;
-#ifdef STARPU_USE_CUDA
-		case 1:
-			for (line = 0; line < dy; line++)
-			{
-				/* copy line A into C */
-				cublasSaxpy(dx, 1.0f, &A[line*ldA], 1, &C[line*ldC], 1);
-				/* add line B to C = A */
-				cublasSaxpy(dx, alpha, &B[line*ldB], 1, &C[line*ldC], 1);
-			}
-			
-			cudaThreadSynchronize();
-
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-			break;
-	}
-}
-
-void sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	add_sub_common_codelet(descr, 0, arg, -1.0f);
-}
-
-void add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	add_sub_common_codelet(descr, 0, arg, 1.0f);
-}
-
-#ifdef STARPU_USE_CUDA
-void sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	add_sub_common_codelet(descr, 1, arg, -1.0f);
-}
-
-void add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	add_sub_common_codelet(descr, 1, arg, 1.0f);
-}
-#endif
-
-
-static void self_add_sub_common_codelet(void *descr[], int s, __attribute__((unused))  void *arg, float alpha)
-{
-	/* C +=/-= A */
-
-	float *C 	= (float *)STARPU_GET_MATRIX_PTR(descr[0]);
-	float *A 	= (float *)STARPU_GET_MATRIX_PTR(descr[1]);
-
-	unsigned dx = STARPU_GET_MATRIX_NX(descr[0]);
-	unsigned dy = STARPU_GET_MATRIX_NY(descr[0]);
-
-	unsigned ldA = STARPU_GET_MATRIX_LD(descr[1]);
-	unsigned ldC = STARPU_GET_MATRIX_LD(descr[0]);
-
-	// TODO check dim ...
-	
-	unsigned line;
-
-	switch (s) {
-		case 0:
-			for (line = 0; line < dy; line++)
-			{
-				/* add line A to C */
-				cblas_saxpy(dx, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
-			}
-			break;
-#ifdef STARPU_USE_CUDA
-		case 1:
-			for (line = 0; line < dy; line++)
-			{
-				/* add line A to C */
-				cublasSaxpy(dx, alpha, &A[line*ldA], 1, &C[line*ldC], 1);
-			}
-			
-			cudaThreadSynchronize();
-
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-			break;
-	}
-}
-
-
-
-
-void self_add_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	self_add_sub_common_codelet(descr, 0, arg, 1.0f);
-}
-
-void self_sub_cpu_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	self_add_sub_common_codelet(descr, 0, arg, -1.0f);
-}
-
-#ifdef STARPU_USE_CUDA
-void self_add_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	self_add_sub_common_codelet(descr, 1, arg, 1.0f);
-}
-
-void self_sub_cublas_codelet(void *descr[], __attribute__((unused))  void *arg)
-{
-	self_add_sub_common_codelet(descr, 1, arg, -1.0f);
-}
-#endif

+ 0 - 156
examples/strassen/strassen_models.c

@@ -1,156 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "strassen_models.h"
-
-#include <starpu.h>
-
-/*
- * As a convention, in that file, descr[0] is represented by A,
- * 				  descr[1] is B ...
- */
-
-/*
- *	Number of flops of Gemm 
- */
-
-//#define USE_PERTURBATION	1
-
-
-#ifdef USE_PERTURBATION
-#define PERTURBATE(a)	((starpu_drand48()*2.0f*(AMPL) + 1.0f - (AMPL))*(a))
-#else
-#define PERTURBATE(a)	(a)
-#endif
-
-
-static double self_add_sub_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (n*n)/10.0f/4.0f/7.75f;
-
-#ifdef STARPU_MODEL_DEBUG
-	printf("self add sub cost %e n = %d\n", cost, n);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-static double cuda_self_add_sub_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (n*n)/10.0f/4.0f;
-
-#ifdef STARPU_MODEL_DEBUG
-	printf("self add sub cost %e n = %d\n", cost, n);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-static double add_sub_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (1.45f*n*n)/10.0f/2.0f;
-
-#ifdef STARPU_MODEL_DEBUG
-	printf("add sub cost %e n = %d\n", cost, n);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-static double cuda_add_sub_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (1.45f*n*n)/10.0f/2.0f;
-
-#ifdef STARPU_MODEL_DEBUG
-	printf("add sub cost %e n = %d\n", cost, n);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-
-static double mult_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (((double)(n)*n*n)/1000.0f/4.11f/0.2588);
-
-#ifdef STARPU_MODEL_DEBUG
-	printf("mult cost %e n = %d \n", cost, n);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-static double cuda_mult_cost(starpu_buffer_descr *descr)
-{
-	uint32_t n;
-
-	n = starpu_matrix_get_nx(descr[0].handle);
-
-	double cost = (((double)(n)*n*n)/1000.0f/4.11f);
-
-#ifdef STARPU_MODEL_DEBUG
-	printf("mult cost %e n = %d \n", cost, n);
-#endif
-
-	return PERTURBATE(cost);
-}
-
-struct starpu_perfmodel_t strassen_model_mult = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = mult_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_mult_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "strassen_model_mult"
-};
-
-struct starpu_perfmodel_t strassen_model_add_sub = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = add_sub_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_add_sub_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "strassen_model_add_sub"
-};
-
-struct starpu_perfmodel_t strassen_model_self_add_sub = {
-	.per_arch = { 
-		[STARPU_CPU_DEFAULT] = { .cost_model = self_add_sub_cost },
-		[STARPU_CUDA_DEFAULT] = { .cost_model = cuda_self_add_sub_cost }
-	},
-	.type = STARPU_HISTORY_BASED,
-	.symbol = "strassen_model_self_add_sub"
-};

+ 0 - 22
examples/strassen/strassen_models.h

@@ -1,22 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __STRASSEN_MODELS_H__
-#define __STRASSEN_MODELS_H__
-
-#include <starpu.h>
-
-#endif // __STRASSEN_MODELS_H__

+ 0 - 192
examples/strassen/test_strassen.c

@@ -1,192 +0,0 @@
-/*
- * StarPU
- * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "strassen.h"
-#include <sys/time.h>
-
-unsigned dim = 4096;
-unsigned reclevel = 4;
-unsigned norandom = 0;
-
-sem_t sem;
-
-float *A;
-float *B;
-float *C;
-
-starpu_data_handle A_state;
-starpu_data_handle B_state;
-starpu_data_handle C_state;
-
-struct timeval start;
-struct timeval end;
-
-
-/* to compute MFlop/s */
-uint64_t flop_cublas = 0;
-uint64_t flop_atlas = 0;
-
-/* to compute MB/s (load/store) */
-uint64_t ls_cublas = 0;
-uint64_t ls_atlas = 0;
-
-/* 
- * Strassen complexity : n = 2^k matrices, stops at 2^r : recursion = (k-r) levels
- * 	m = n / 2^rec
- * 	M(k) = 7^(k-r) 8^r = 7^rec (m^3)
- * 	A(k) = 4^r (2^r + 5) 7^(k-r) - 6 x 4^k = (m^2)(m+5)*7^rec - 6n^2 
- *
- * 	4n^2.807
- */
-double strassen_complexity(unsigned n, unsigned rec)
-{
-	double mult, add;
-
-	double m = (1.0*n)/(pow(2.0, (double)rec));
-
-	add = ((m*m)*(m+5)*(pow(7.0, (double)rec)) - 6.0*n*n);
-	mult = (m*m*m)*(pow(7.0, (double)rec));
-	
-	//printf("%e adds %e mult\n", add, mult);
-
-	return (add+mult);
-}
-
-/*
- * That program should compute C = A * B 
- * 
- *   A of size (z,y)
- *   B of size (x,z)
- *   C of size (x,y)
- */
-
-void terminate(void *arg __attribute__ ((unused)))
-{
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	//uint64_t total_flop = flop_cublas + flop_atlas;
-	double total_flop =  strassen_complexity(dim, reclevel);//4.0*pow((double)dim, 2.807);
-
-	fprintf(stderr, "Computation took (ms):\n");
-	printf("%2.2f\n", timing/1000);
-	fprintf(stderr, "	GFlop : total (%2.2f)\n", (double)total_flop/1000000000.0f);
-	fprintf(stderr, "	GFlop/s : %2.2f\n", (double)total_flop / (double)timing/1000);
-
-	sem_post(&sem);
-}
-
-void parse_args(int argc, char **argv)
-{
-	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
-			char *argptr;
-			dim = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-rec") == 0) {
-			char *argptr;
-			reclevel = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-no-random") == 0) {
-			norandom = 1;
-		}
-	}
-}
-
-void init_problem(void)
-{
-	unsigned i,j;
-
-#ifdef STARPU_USE_FXT
-	_starpu_fxt_register_thread(0);
-#endif
-
-	A = malloc(dim*dim*sizeof(float));
-	B = malloc(dim*dim*sizeof(float));
-	C = malloc(dim*dim*sizeof(float));
-
-	/* fill the A and B matrices */
-	if (norandom) {
-		for (i=0; i < dim; i++) {
-			for (j=0; j < dim; j++) {
-				A[i+j*dim] = (float)(i);
-			}
-		}
-	
-		for (i=0; i < dim; i++) {
-			for (j=0; j < dim; j++) {
-				B[i+j*dim] = (float)(j);
-			}
-		}
-	} 
-	else {
-		srand(2008);
-		for (j=0; j < dim; j++) {
-			for (i=0; i < dim; i++) {
-				A[i+j*dim] = (float)(starpu_drand48());
-			}
-		}
-	
-		for (j=0; j < dim; j++) {
-			for (i=0; i < dim; i++) {
-				B[i+j*dim] = (float)(starpu_drand48());
-			}
-		}
-	}
-
-	for (j=0; j < dim; j++) {
-		for (i=0; i < dim; i++) {
-			C[i+j*dim] = (float)(0);
-		}
-	}
-
-	starpu_matrix_data_register(&A_state, 0, (uintptr_t)A, 
-		dim, dim, dim, sizeof(float));
-	starpu_matrix_data_register(&B_state, 0, (uintptr_t)B, 
-		dim, dim, dim, sizeof(float));
-	starpu_matrix_data_register(&C_state, 0, (uintptr_t)C, 
-		dim, dim, dim, sizeof(float));
-
-	gettimeofday(&start, NULL);
-	strassen(A_state, B_state, C_state, terminate, NULL, reclevel);
-}
-
-int main(__attribute__ ((unused)) int argc, 
-	 __attribute__ ((unused)) char **argv)
-{
-
-	parse_args(argc, argv);
-
-	/* start the runtime */
-	starpu_init(NULL);
-
-	starpu_helper_cublas_init();
-
-	sem_init(&sem, 0, 0U);
-
-	init_problem();
-	sem_wait(&sem);
-	sem_destroy(&sem);
-
-	starpu_helper_cublas_shutdown();
-
-	starpu_shutdown();
-
-	return 0;
-}