| 
					
				 | 
			
			
				@@ -1,3 +1,21 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+/* StarPURM --- StarPU Resource Management Layer. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * Copyright (C) 2017, 2018  Inria 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * StarPU is free software; you can redistribute it and/or modify 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * it under the terms of the GNU Lesser General Public License as published by 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * the Free Software Foundation; either version 2.1 of the License, or (at 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * your option) any later version. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * StarPU is distributed in the hope that it will be useful, but 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details. 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+/* This example runs a randomized MORSE dgemm test on top of StarPURM, using CPU and NVIDIA CUDA units */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #define _GNU_SOURCE 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #include <sched.h> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #include <stdio.h> 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -6,18 +24,56 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #include <morse.h> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #include <starpurm.h> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 #include <hwloc.h> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#include <pthread.h> 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				/* When defined, test() also runs the reference cblas_dgemm and prints the relative error. */
				#define CHECK

				/* StarPURM device type ids and unit counts; assigned by init_rm_infos(), -1 / 0 until then. */
				static int rm_cpu_type_id = -1;
				static int rm_cuda_type_id = -1;
				static int rm_nb_cpu_units = 0;
				static int rm_nb_cuda_units = 0;
				/* NOTE(review): not referenced in the visible part of this file -- confirm it is still needed. */
				static const int nb_random_tests = 10;

				/* Number of spawned kernels whose completion callback has not run yet; accessed under spawn_pending_mutex. */
				static unsigned spawn_pending = 0;
				static pthread_mutex_t spawn_pending_mutex = PTHREAD_MUTEX_INITIALIZER;
				/* Broadcast when spawn_pending drops back to 0; initialized at run time by pthread_cond_init() in main(). */
				static pthread_cond_t spawn_pending_cond;
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-static const int nb_random_tests = 10; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+static void _inc_spawn_pending(void) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	pthread_mutex_lock(&spawn_pending_mutex); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	assert(spawn_pending < UINT_MAX); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	spawn_pending++; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	pthread_mutex_unlock(&spawn_pending_mutex); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-static void test1() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+static void _dec_spawn_pending(void) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	int i; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	pthread_mutex_lock(&spawn_pending_mutex); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	assert(spawn_pending > 0); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	spawn_pending--; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (spawn_pending == 0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		pthread_cond_broadcast(&spawn_pending_cond); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	pthread_mutex_unlock(&spawn_pending_mutex); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+static void _wait_pending_spawns(void) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	pthread_mutex_lock(&spawn_pending_mutex); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	while (spawn_pending > 0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		pthread_cond_wait(&spawn_pending_cond, &spawn_pending_mutex); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	pthread_mutex_unlock(&spawn_pending_mutex); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				/*
				 * Completion callback handed to starpurm_spawn_kernel_on_cpus_callback()
				 * by spawn_tests(). _arg is the cookie given at spawn time, expected to
				 * be the integer 42 carried in a pointer.
				 */
				static void spawn_callback(void *_arg)
				{
					/* sanity check: the cookie must be the one passed by spawn_tests() */
					assert((uintptr_t)_arg == 42);
					_dec_spawn_pending();
				}
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				/* Print the command-line synopsis on stderr and terminate with EXIT_FAILURE. */
				static void usage(void)
				{
					fputs("dgemm: M N K <trans_A=T|N> <trans_B=[T|N]>\n", stderr);
					exit(EXIT_FAILURE);
				}
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 static void init_rm_infos(void) 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -30,236 +86,223 @@ static void init_rm_infos(void) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		exit(77); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int cuda_type = starpurm_get_device_type_id("cuda"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int nb_cuda_units = starpurm_get_nb_devices_by_type(cuda_type); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	rm_cpu_type_id = cpu_type; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	rm_cuda_type_id = cuda_type; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	rm_nb_cpu_units = nb_cpu_units; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	rm_nb_cuda_units = nb_cuda_units; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-static void disp_selected_cpuset(void) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+static void disp_cpuset(hwloc_cpuset_t selected_cpuset) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	//hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	int strl = hwloc_bitmap_snprintf(NULL, 0, selected_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	char str[strl+1]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	hwloc_bitmap_snprintf(str, strl+1, selected_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	printf("selected cpuset = %s\n", str); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	printf("%llx: selected cpuset = %s\n", (unsigned long long)pthread_self(), str); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-int main( int argc, char const *argv[]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				/* Parameters of one dgemm run, passed to test() through its void* argument. */
				struct s_test_args
				{
					const int m;	/* M size given on the command line */
					const int n;	/* N size given on the command line */
					const int k;	/* K size given on the command line */
					int transA;	/* transposition flag forwarded for A (MorseTrans / MorseNoTrans) */
					int transB;	/* transposition flag forwarded for B (MorseTrans / MorseNoTrans) */
				};
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	if(argc < 6 || argc > 6) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 		 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		fprintf(stderr, "Usage: ./test_dgemm M N K TRANS_A TRANS_B\n" ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		return 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	// Local variables 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	int i, j; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	int m, n, k; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	const char *transA_input = NULL; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	const char *transB_input = NULL; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	enum DDSS_TRANS transA = Trans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	enum DDSS_TRANS transB = Trans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double alpha;  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double beta; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double error; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double max_error; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double count_error;	 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double *A; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double *B; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double *C; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double *C_test; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	struct timeval start, end; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double flops; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double flops_ddss;  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	double flops_ref;  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	int ret; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	m = atoi( argv[1] ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	n = atoi( argv[2] ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	k = atoi( argv[3] ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	if ( strlen( argv[4] ) != 1 )  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		fprintf(stderr,"Illegal value of TRANS_A, TRANS_A can be T or N\n"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		return 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	transA_input = argv[4];	 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	if ( strlen( argv[5] ) != 1 )  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+static void test(void *_args) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	struct s_test_args *args = _args; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	const int m = args->m; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	const int n = args->n; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	const int k = args->k; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int transA = args->transA; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int transB = args->transB; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	unsigned rand_seed = (unsigned)time(NULL); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	double *A = malloc(m * k * sizeof(double)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	double *B = malloc(k * n * sizeof(double)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	double *C = calloc(m * n, sizeof(double)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	double *C_test = calloc(m * n, sizeof(double)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	const double alpha = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	const double beta  = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int i; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	for (i = 0; i < m; i++) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		fprintf(stderr,"Illegal value of TRANS_B, TRANS_B can be T or N\n"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		return 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int j; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		for (j = 0; j < n; j++) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			A[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			B[i*n+j] = (double)rand_r(&rand_seed) / ((double)rand_r(&rand_seed) + DBL_MIN); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	transB_input = argv[5];	 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	// Set seed  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	srand(time(NULL)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int res = MORSE_dgemm(transA, transB, m, n, k, alpha, A, k, B, n, beta, C, n); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#ifdef CHECK 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	/* Check */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	cblas_dgemm( CblasColMajor,  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			( CBLAS_TRANSPOSE ) transA, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			( CBLAS_TRANSPOSE ) transB, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			m, n, k, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			alpha, A, k, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			B, n, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			beta, C_test, n ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	max_error = 1.0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	count_error = 0.0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	double C_test_inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	cblas_daxpy(m*n, -1, C, 1, C_test, 1); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	double inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	printf("%llx: ||C_test-C||_I / ||C_test||_I = %e\n", (unsigned long long)pthread_self(), inorm/C_test_inorm); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+#endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	free(A); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	free(B); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	free(C); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	free(C_test); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	// Checking inputs 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	if ( m < 0 ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		fprintf(stderr, "Illegal value of M, M must be >= 0\n"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		return 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	if ( n < 0 ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		fprintf(stderr, "Illegal value of N, N must be >= 0\n"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		return 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	if ( k < 0 ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+static void select_units(hwloc_cpuset_t selected_cpuset, hwloc_cpuset_t available_cpuset, int offset, int nb) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int first_idx = hwloc_bitmap_first(available_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int last_idx = hwloc_bitmap_last(available_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int count = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int idx = first_idx; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	while (idx != -1 && idx <= last_idx && count < offset+nb) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		fprintf(stderr, "Illegal value of K, K must be >= 0\n"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		return 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		if (hwloc_bitmap_isset(available_cpuset, idx)) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			if (count >= offset) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+				hwloc_bitmap_set(selected_cpuset, idx); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			count ++; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		idx = hwloc_bitmap_next(available_cpuset, idx); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	assert(count == offset+nb); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+void spawn_tests(int cpu_offset, int cpu_nb, int cuda_offset, int cuda_nb, void *args) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (cpu_offset + cpu_nb > rm_nb_cpu_units) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		exit(77); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (cuda_offset + cuda_nb > rm_nb_cuda_units) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		exit(77); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	hwloc_cpuset_t cpu_cpuset = starpurm_get_all_cpu_workers_cpuset(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	hwloc_cpuset_t cuda_cpuset = starpurm_get_all_device_workers_cpuset_by_type(rm_cuda_type_id); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	hwloc_cpuset_t sel_cpuset = hwloc_bitmap_alloc(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	assert(sel_cpuset != NULL); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	select_units(sel_cpuset, cpu_cpuset, cpu_offset, cpu_nb); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	select_units(sel_cpuset, cuda_cpuset, cuda_offset, cuda_nb); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	if ( transA_input[0] == 'T' ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		transA = Trans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	else if ( transA_input[0] == 'N' ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		transA = NoTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int strl1 = hwloc_bitmap_snprintf(NULL, 0, cpu_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		char str1[strl1+1]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		hwloc_bitmap_snprintf(str1, strl1+1, cpu_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int strl2 = hwloc_bitmap_snprintf(NULL, 0, cuda_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		char str2[strl2+1]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		hwloc_bitmap_snprintf(str2, strl2+1, cuda_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		printf("all cpus cpuset = %s\n", str1); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int strl3 = hwloc_bitmap_snprintf(NULL, 0, sel_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		char str3[strl3+1]; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		hwloc_bitmap_snprintf(str3, strl1+3, sel_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		printf("spawn on selected cpuset = %s (avail cpu %s, avail cuda %s)\n", str3, str1, str2); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	_inc_spawn_pending(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	starpurm_spawn_kernel_on_cpus_callback(NULL, test, args, sel_cpuset, spawn_callback, (void*)(uintptr_t)42); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	hwloc_bitmap_free(sel_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	hwloc_bitmap_free(cpu_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	hwloc_bitmap_free(cuda_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+int main( int argc, char const *argv[]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	pthread_cond_init(&spawn_pending_cond, NULL); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int transA = MorseTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int transB = MorseTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (argc < 6 || argc > 6) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		usage(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int m = atoi(argv[1]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (m < 1) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		usage(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int n = atoi(argv[2]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (n < 1) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		usage(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	int k = atoi(argv[3]); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (k < 1) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		usage(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (strcmp(argv[4], "T") == 0)  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		transA = MorseTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	else if (strcmp(argv[4], "N") == 0)  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		transA = MorseNoTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	else 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		fprintf(stderr, "Illegal value of TRANS_A, TRANS_A can be T or N\n"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		return 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		usage(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	if ( transB_input[0] == 'T' ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		transB = Trans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	else if ( transB_input[0] == 'N' ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		transB = NoTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	if (strcmp(argv[5], "T") == 0)  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		transB = MorseTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	else if (strcmp(argv[5], "N") == 0)  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		transB = MorseNoTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	else 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		fprintf(stderr, "Illegal value of TRANS_B, TRANS_B can be T or N\n"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		return 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		usage(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	// Matrices allocation 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	A = ( double * ) malloc( sizeof( double ) * m * k ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	B = ( double * ) malloc( sizeof( double ) * k * n ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	C = ( double * ) malloc( sizeof( double ) * m * n ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	C_test = ( double * ) malloc( sizeof( double ) * m * n ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	srand(time(NULL)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	// Alpha and beta initialization 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	alpha = ( double ) rand() / (double) rand() + DBL_MIN; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	beta  = ( double ) rand() / (double) rand() + DBL_MIN; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	// Matrix A, B, C and C_test initialization 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	for ( i = 0; i < m; i++ ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		for ( j = 0; j < n; j++ ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			A[ i * n + j ] = ( double ) rand() / (double) rand()  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-							  + DBL_MIN; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			B[ i * n + j ] = ( double ) rand() / (double) rand()  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-							  + DBL_MIN; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			C[ i * n + j ] = 0.0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			C_test[ i * n + j ] = 0.0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	struct s_test_args test_args = { .m = m, .n = n, .k = k, .transA = transA, .transB = transB }; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	/* Test case */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		/* pocl_starpu_init */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			hwloc_topology_init(&topology); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			hwloc_topology_load(topology); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			starpurm_initialize(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			starpurm_set_drs_enable(NULL); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	starpurm_initialize(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	starpurm_set_drs_enable(NULL); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	init_rm_infos(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	printf("cpu units: %d\n", rm_nb_cpu_units); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	printf("cuda units: %d\n", rm_nb_cuda_units); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	printf("using default units\n"); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	disp_cpuset(starpurm_get_selected_cpuset()); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		/* pocl_starpu_submit_task */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	MORSE_Init(rm_nb_cpu_units, rm_nb_cuda_units); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	test(&test_args); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int cpu_offset = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int cpu_nb = rm_nb_cpu_units/2; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		if (cpu_nb == 0 && rm_nb_cpu_units > 0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			/* GLIBC cpu_mask as supplied by POCL */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			cpu_set_t cpu_mask; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			CPU_ZERO(&cpu_mask); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			CPU_SET (0, &cpu_mask); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			CPU_SET (1, &cpu_mask); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			CPU_SET (2, &cpu_mask); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			CPU_SET (3, &cpu_mask); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			/* Convert GLIBC cpu_mask into HWLOC cpuset */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			hwloc_cpuset_t hwloc_cpuset = hwloc_bitmap_alloc(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			int status = hwloc_cpuset_from_glibc_sched_affinity(topology, hwloc_cpuset, &cpu_mask, sizeof(cpu_set_t)); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			assert(status == 0); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			/* Reset any unit previously allocated to StarPU */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			starpurm_withdraw_all_cpus_from_starpu(NULL); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			/* Enforce new cpu mask */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			starpurm_assign_cpu_mask_to_starpu(NULL, hwloc_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			/* task function */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				int TRANS_A = transA==NoTrans?MorseNoTrans:MorseTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				int TRANS_B = transB==NoTrans?MorseNoTrans:MorseTrans; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				int M = m; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				int N = n; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				int K = k; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				double ALPHA = alpha; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				int LDA = k; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				int LDB = n; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				double BETA = beta; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				int LDC = n; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				MORSE_Init(4, 0); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				int res = MORSE_dgemm(TRANS_A, TRANS_B, M, N, K, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-						ALPHA, A, LDA, B, LDB, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-						BETA, C, LDC); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				MORSE_Finalize(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			/* Withdraw all CPU units from StarPU */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			starpurm_withdraw_all_cpus_from_starpu(NULL); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			hwloc_bitmap_free(hwloc_cpuset); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			cpu_nb = 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		/* pocl_starpu_shutdown() */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int cuda_offset = 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int cuda_nb = rm_nb_cuda_units/2; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		if (cuda_nb == 0 && rm_nb_cuda_units > 0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			starpurm_shutdown(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			cuda_nb = 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				- 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-#if 0 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	/* Check */ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	cblas_dgemm( CblasColMajor,  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				 ( CBLAS_TRANSPOSE ) transA, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				 ( CBLAS_TRANSPOSE ) transB, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-									 m, n, k, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-							 		 alpha, A, k, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-							 			    B, n, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-							 		  beta, C_test, n ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	// Error computation 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	for ( i = 0; i < m; i++ ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-		for ( j = 0; j < n; j++ ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int cpu_offset = rm_nb_cpu_units/2; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int cpu_nb = rm_nb_cpu_units/2; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		if (cpu_nb == 0 && rm_nb_cpu_units > 0) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		{ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			error = abs( C[ i * n + j ] - C_test[ i * n + j ] ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			if ( max_error > error ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-				max_error = error; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-			count_error += error; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+			cpu_nb = 1; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 		} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int cuda_offset = rm_nb_cuda_units/2; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		int cuda_nb = rm_nb_cuda_units/2; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	_wait_pending_spawns(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	MORSE_Finalize(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	fprintf(stdout, "Max. error = %1.2f\n", max_error ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-	fprintf(stdout, "Av. error = %1.2f\n", count_error / ( m * n ) ); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-#endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-#endif 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	starpurm_shutdown(); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+	pthread_cond_destroy(&spawn_pending_cond); 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 | 
		
	
		
			
				 | 
				 | 
			
			
				 	return 0; 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				  
			 |