浏览代码

add concurrent heterogeneous resource sharing spawn tests

Olivier Aumage 6 年之前
父节点
当前提交
f4e05c2b48
共有 1 个文件被更改,包括 149 次插入10 次删除
  1. 149 10
      starpurm/dev/chameleon_test/dgemm.c

+ 149 - 10
starpurm/dev/chameleon_test/dgemm.c

@@ -24,6 +24,7 @@
 #include <morse.h>
 #include <starpurm.h>
 #include <hwloc.h>
+#include <pthread.h>
 
 #define CHECK
 
@@ -33,6 +34,42 @@ static int rm_nb_cpu_units = 0;
 static int rm_nb_cuda_units = 0;
 static const int nb_random_tests = 10;
 
+static unsigned spawn_pending = 0; /* spawns registered by _inc_spawn_pending() whose spawn_callback() has not run yet */
+static pthread_mutex_t spawn_pending_mutex = PTHREAD_MUTEX_INITIALIZER; /* held around every access to spawn_pending */
+static pthread_cond_t spawn_pending_cond; /* broadcast when spawn_pending reaches 0; initialized with pthread_cond_init() in main() */
+
+static void _inc_spawn_pending(void) /* Register one more pending spawn, under spawn_pending_mutex. */
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending < UINT_MAX); /* the increment below must not wrap the unsigned counter */
+	spawn_pending++;
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void _dec_spawn_pending(void) /* Mark one pending spawn as finished; wake all waiters once none remain. */
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	assert(spawn_pending > 0); /* a decrement without a matching earlier increment is a logic error */
+	spawn_pending--;
+	if (spawn_pending == 0)
+		pthread_cond_broadcast(&spawn_pending_cond); /* releases threads blocked in _wait_pending_spawns() */
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void _wait_pending_spawns(void) /* Block the calling thread until spawn_pending is 0. */
+{
+	pthread_mutex_lock(&spawn_pending_mutex);
+	while (spawn_pending > 0) /* the counter is re-tested after each wake-up before leaving the loop */
+		pthread_cond_wait(&spawn_pending_cond, &spawn_pending_mutex);
+	pthread_mutex_unlock(&spawn_pending_mutex);
+}
+
+static void spawn_callback(void *_arg) /* Callback that spawn_tests() hands to starpurm_spawn_kernel_on_cpus_callback(). */
+{
+	assert(42 == (uintptr_t)_arg); /* 42 is the cookie spawn_tests() passes as the callback argument */
+	_dec_spawn_pending();
+}
+
 static void usage(void)
 {
 	fprintf(stderr, "dgemm: M N K <trans_A=T|N> <trans_B=[T|N]>\n");
@@ -59,17 +96,32 @@ static void init_rm_infos(void)
 }
 
 
-static void disp_selected_cpuset(void)
+static void disp_cpuset(hwloc_cpuset_t selected_cpuset) /* Print the given cpuset on stdout, prefixed with the calling thread's id in hex. */
 {
-	hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
+	//hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
 	int strl = hwloc_bitmap_snprintf(NULL, 0, selected_cpuset); /* sizing call: no buffer given, the return value is used as the string length */
 	char str[strl+1]; /* one extra byte for the terminating NUL */
 	hwloc_bitmap_snprintf(str, strl+1, selected_cpuset);
-	printf("selected cpuset = %s\n", str);
+	printf("%llx: selected cpuset = %s\n", (unsigned long long)pthread_self(), str);
 }
 
-static void test(const int m, const int n, const int k, int transA, int transB)
+struct s_test_args /* Argument bundle for test(), which receives it through a void pointer. */
+{
+	const int m; /* test() allocates A with m*k doubles */
+	const int n; /* test() allocates B with k*n doubles */
+	const int k; /* dimension shared by the A and B allocations in test() */
+	int transA; /* copied into test()'s local transA */
+	int transB; /* copied into test()'s local transB */
+};
+
+static void test(void *_args)
 {
+	struct s_test_args *args = _args;
+	const int m = args->m;
+	const int n = args->n;
+	const int k = args->k;
+	int transA = args->transA;
+	int transB = args->transB;
 	unsigned rand_seed = (unsigned)time(NULL);
 	double *A = malloc(m * k * sizeof(double));
 	double *B = malloc(k * n * sizeof(double));
@@ -101,23 +153,80 @@ static void test(const int m, const int n, const int k, int transA, int transB)
 			B, n,
 			beta, C_test, n );
 
-	double C_test_fnorm = LAPACKE_dlange(CblasColMajor, 'F', m, n, C_test, n);
 	double C_test_inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
 	cblas_daxpy(m*n, -1, C, 1, C_test, 1);
-	double fnorm = LAPACKE_dlange(CblasColMajor, 'F', m, n, C_test, n);
 	double inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
-	fprintf(stdout, "||C_test-C||_F / ||C_test||_F = %e\n", fnorm/C_test_fnorm);
-	fprintf(stdout, "||C_test-C||_I / ||C_test||_I = %e\n", inorm/C_test_inorm);
+	printf("%llx: ||C_test-C||_I / ||C_test||_I = %e\n", (unsigned long long)pthread_self(), inorm/C_test_inorm);
 #endif
 	free(A);
 	free(B);
 	free(C);
 	free(C_test);
+}
 
+static void select_units(hwloc_cpuset_t selected_cpuset, hwloc_cpuset_t available_cpuset, int offset, int nb) /* Skip the first offset set bits of available_cpuset, then set the next nb of them in selected_cpuset. */
+{
+	int first_idx = hwloc_bitmap_first(available_cpuset);
+	int last_idx = hwloc_bitmap_last(available_cpuset);
+	int count = 0; /* number of set bits of available_cpuset visited so far */
+	int idx = first_idx;
+	while (idx != -1 && idx <= last_idx && count < offset+nb) /* stop at the end of the bitmap or once offset+nb bits were visited */
+	{
+		if (hwloc_bitmap_isset(available_cpuset, idx))
+		{
+			if (count >= offset) /* bits before the offset are counted but not selected */
+			{
+				hwloc_bitmap_set(selected_cpuset, idx);
+			}
+			count ++;
+		}
+		idx = hwloc_bitmap_next(available_cpuset, idx);
+	}
+	assert(count == offset+nb); /* fails when available_cpuset holds fewer than offset+nb set bits */
+}
+
+/*
+ * Spawn one run of test() on a subset of the CPU and CUDA worker units.
+ *
+ * cpu_offset, cpu_nb:   skip the first cpu_offset units of the CPU workers
+ *                       cpuset, then select the next cpu_nb of them.
+ * cuda_offset, cuda_nb: same selection applied to the CUDA workers cpuset.
+ * args:                 forwarded unchanged to test() (a struct s_test_args *).
+ *
+ * The process exits with status 77 when a requested range exceeds the number
+ * of units recorded in rm_nb_cpu_units / rm_nb_cuda_units.
+ * The spawn is registered with _inc_spawn_pending() before it is launched;
+ * spawn_callback() undoes the registration, so callers can synchronize with
+ * _wait_pending_spawns().
+ */
+void spawn_tests(int cpu_offset, int cpu_nb, int cuda_offset, int cuda_nb, void *args)
+{
+	if (cpu_offset + cpu_nb > rm_nb_cpu_units)
+		exit(77);
+	if (cuda_offset + cuda_nb > rm_nb_cuda_units)
+		exit(77);
+	hwloc_cpuset_t cpu_cpuset = starpurm_get_all_cpu_workers_cpuset();
+	hwloc_cpuset_t cuda_cpuset = starpurm_get_all_device_workers_cpuset_by_type(rm_cuda_type_id);
+	hwloc_cpuset_t sel_cpuset = hwloc_bitmap_alloc();
+	assert(sel_cpuset != NULL);
+
+	/* Both selections accumulate into the same bitmap. */
+	select_units(sel_cpuset, cpu_cpuset, cpu_offset, cpu_nb);
+	select_units(sel_cpuset, cuda_cpuset, cuda_offset, cuda_nb);
+
+	{
+		/* Each bitmap is formatted in two steps: a sizing call, then the real one. */
+		int strl1 = hwloc_bitmap_snprintf(NULL, 0, cpu_cpuset);
+		char str1[strl1+1];
+		hwloc_bitmap_snprintf(str1, strl1+1, cpu_cpuset);
+
+		int strl2 = hwloc_bitmap_snprintf(NULL, 0, cuda_cpuset);
+		char str2[strl2+1];
+		hwloc_bitmap_snprintf(str2, strl2+1, cuda_cpuset);
+		printf("all cpus cpuset = %s\n", str1);
+
+		int strl3 = hwloc_bitmap_snprintf(NULL, 0, sel_cpuset);
+		char str3[strl3+1];
+		/* The size passed here must be that of str3 itself, not of another buffer. */
+		hwloc_bitmap_snprintf(str3, strl3+1, sel_cpuset);
+		printf("spawn on selected cpuset = %s (avail cpu %s, avail cuda %s)\n", str3, str1, str2);
+	}
+
+	/* Register the spawn first so the callback can never decrement a zero counter. */
+	_inc_spawn_pending();
+	starpurm_spawn_kernel_on_cpus_callback(NULL, test, args, sel_cpuset, spawn_callback, (void*)(uintptr_t)42);
+
+	/* NOTE(review): freeing sel_cpuset here assumes the spawn call does not keep a reference to it - confirm. */
+	hwloc_bitmap_free(sel_cpuset);
+	hwloc_bitmap_free(cpu_cpuset);
+	hwloc_bitmap_free(cuda_cpuset);
 }
 
 int main( int argc, char const *argv[])
 {
+	pthread_cond_init(&spawn_pending_cond, NULL);
+
 	int transA = MorseTrans;
 	int transB = MorseTrans;
 
@@ -150,6 +259,8 @@ int main( int argc, char const *argv[])
 
 	srand(time(NULL));
 
+	struct s_test_args test_args = { .m = m, .n = n, .k = k, .transA = transA, .transB = transB };
+
 	/* Test case */
 	starpurm_initialize();
 	starpurm_set_drs_enable(NULL);
@@ -157,13 +268,41 @@ int main( int argc, char const *argv[])
 	printf("cpu units: %d\n", rm_nb_cpu_units);
 	printf("cuda units: %d\n", rm_nb_cuda_units);
 	printf("using default units\n");
-	disp_selected_cpuset();
+	disp_cpuset(starpurm_get_selected_cpuset());
 
 	MORSE_Init(rm_nb_cpu_units, rm_nb_cuda_units);
-	test(m, n, k, transA, transB);
+	test(&test_args);
+	{
+		int cpu_offset = 0;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
+		{
+			cpu_nb = 1;
+		}
+		int cuda_offset = 0;
+		int cuda_nb = rm_nb_cuda_units/2;
+		if (cuda_nb == 0 && rm_nb_cuda_units > 0)
+		{
+			cuda_nb = 1;
+		}
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
+	}
+	{
+		int cpu_offset = rm_nb_cpu_units/2;
+		int cpu_nb = rm_nb_cpu_units/2;
+		if (cpu_nb == 0 && rm_nb_cpu_units > 0)
+		{
+			cpu_nb = 1;
+		}
+		int cuda_offset = rm_nb_cuda_units/2;
+		int cuda_nb = rm_nb_cuda_units/2;
+		spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
+	}
+	_wait_pending_spawns();
 	MORSE_Finalize();
 
 	starpurm_shutdown();
+	pthread_cond_destroy(&spawn_pending_cond);
 
 	return 0;