|
@@ -24,6 +24,7 @@
|
|
|
#include <morse.h>
|
|
|
#include <starpurm.h>
|
|
|
#include <hwloc.h>
|
|
|
+#include <pthread.h>
|
|
|
|
|
|
#define CHECK
|
|
|
|
|
@@ -33,6 +34,42 @@ static int rm_nb_cpu_units = 0;
|
|
|
static int rm_nb_cuda_units = 0;
|
|
|
static const int nb_random_tests = 10;
|
|
|
|
|
|
/* Number of spawned kernels whose completion callback has not run yet. */
static unsigned int spawn_pending = 0;

/* Serialises every access to spawn_pending. */
static pthread_mutex_t spawn_pending_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Broadcast when spawn_pending drops back to zero.
 * Deliberately left without a static initialiser: main() sets it up with
 * pthread_cond_init() and tears it down with pthread_cond_destroy().
 */
static pthread_cond_t spawn_pending_cond;
|
|
|
+
|
|
|
+static void _inc_spawn_pending(void)
|
|
|
+{
|
|
|
+ pthread_mutex_lock(&spawn_pending_mutex);
|
|
|
+ assert(spawn_pending < UINT_MAX);
|
|
|
+ spawn_pending++;
|
|
|
+ pthread_mutex_unlock(&spawn_pending_mutex);
|
|
|
+}
|
|
|
+
|
|
|
+static void _dec_spawn_pending(void)
|
|
|
+{
|
|
|
+ pthread_mutex_lock(&spawn_pending_mutex);
|
|
|
+ assert(spawn_pending > 0);
|
|
|
+ spawn_pending--;
|
|
|
+ if (spawn_pending == 0)
|
|
|
+ pthread_cond_broadcast(&spawn_pending_cond);
|
|
|
+ pthread_mutex_unlock(&spawn_pending_mutex);
|
|
|
+}
|
|
|
+
|
|
|
+static void _wait_pending_spawns(void)
|
|
|
+{
|
|
|
+ pthread_mutex_lock(&spawn_pending_mutex);
|
|
|
+ while (spawn_pending > 0)
|
|
|
+ pthread_cond_wait(&spawn_pending_cond, &spawn_pending_mutex);
|
|
|
+ pthread_mutex_unlock(&spawn_pending_mutex);
|
|
|
+}
|
|
|
+
|
|
|
/*
 * Completion callback registered by spawn_tests().
 *
 * _arg: opaque cookie; spawn_tests() passes the integer 42 cast to a
 *       pointer, and the assertion checks that it arrives unchanged.
 *
 * Marks one pending spawn as finished.
 */
static void spawn_callback(void *_arg)
{
	const uintptr_t cookie = (uintptr_t)_arg;

	assert(cookie == 42);
	(void)cookie; /* keeps the local referenced when assertions are compiled out */
	_dec_spawn_pending();
}
|
|
|
+
|
|
|
static void usage(void)
|
|
|
{
|
|
|
fprintf(stderr, "dgemm: M N K <trans_A=T|N> <trans_B=[T|N]>\n");
|
|
@@ -59,17 +96,32 @@ static void init_rm_infos(void)
|
|
|
}
|
|
|
|
|
|
|
|
|
-static void disp_selected_cpuset(void)
|
|
|
+static void disp_cpuset(hwloc_cpuset_t selected_cpuset)
|
|
|
{
|
|
|
- hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
|
|
|
+ //hwloc_cpuset_t selected_cpuset = starpurm_get_selected_cpuset();
|
|
|
int strl = hwloc_bitmap_snprintf(NULL, 0, selected_cpuset);
|
|
|
char str[strl+1];
|
|
|
hwloc_bitmap_snprintf(str, strl+1, selected_cpuset);
|
|
|
- printf("selected cpuset = %s\n", str);
|
|
|
+ printf("%llx: selected cpuset = %s\n", (unsigned long long)pthread_self(), str);
|
|
|
}
|
|
|
|
|
|
-static void test(const int m, const int n, const int k, int transA, int transB)
|
|
|
/*
 * Argument bundle for test(), which receives it through a single void *
 * parameter so that it can also be used as a spawned kernel.
 */
struct s_test_args
{
	const int m;	/* problem size: test() allocates m*k doubles for A */
	const int n;	/* problem size: test() allocates k*n doubles for B */
	const int k;	/* problem size shared by A and B */
	int transA;	/* transposition flag for A (main() defaults it to MorseTrans) */
	int transB;	/* transposition flag for B (main() defaults it to MorseTrans) */
};
|
|
|
+
|
|
|
+static void test(void *_args)
|
|
|
{
|
|
|
+ struct s_test_args *args = _args;
|
|
|
+ const int m = args->m;
|
|
|
+ const int n = args->n;
|
|
|
+ const int k = args->k;
|
|
|
+ int transA = args->transA;
|
|
|
+ int transB = args->transB;
|
|
|
unsigned rand_seed = (unsigned)time(NULL);
|
|
|
double *A = malloc(m * k * sizeof(double));
|
|
|
double *B = malloc(k * n * sizeof(double));
|
|
@@ -101,23 +153,80 @@ static void test(const int m, const int n, const int k, int transA, int transB)
|
|
|
B, n,
|
|
|
beta, C_test, n );
|
|
|
|
|
|
- double C_test_fnorm = LAPACKE_dlange(CblasColMajor, 'F', m, n, C_test, n);
|
|
|
double C_test_inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
|
|
|
cblas_daxpy(m*n, -1, C, 1, C_test, 1);
|
|
|
- double fnorm = LAPACKE_dlange(CblasColMajor, 'F', m, n, C_test, n);
|
|
|
double inorm = LAPACKE_dlange(CblasColMajor, 'I', m, n, C_test, n);
|
|
|
- fprintf(stdout, "||C_test-C||_F / ||C_test||_F = %e\n", fnorm/C_test_fnorm);
|
|
|
- fprintf(stdout, "||C_test-C||_I / ||C_test||_I = %e\n", inorm/C_test_inorm);
|
|
|
+ printf("%llx: ||C_test-C||_I / ||C_test||_I = %e\n", (unsigned long long)pthread_self(), inorm/C_test_inorm);
|
|
|
#endif
|
|
|
free(A);
|
|
|
free(B);
|
|
|
free(C);
|
|
|
free(C_test);
|
|
|
+}
|
|
|
|
|
|
+static void select_units(hwloc_cpuset_t selected_cpuset, hwloc_cpuset_t available_cpuset, int offset, int nb)
|
|
|
+{
|
|
|
+ int first_idx = hwloc_bitmap_first(available_cpuset);
|
|
|
+ int last_idx = hwloc_bitmap_last(available_cpuset);
|
|
|
+ int count = 0;
|
|
|
+ int idx = first_idx;
|
|
|
+ while (idx != -1 && idx <= last_idx && count < offset+nb)
|
|
|
+ {
|
|
|
+ if (hwloc_bitmap_isset(available_cpuset, idx))
|
|
|
+ {
|
|
|
+ if (count >= offset)
|
|
|
+ {
|
|
|
+ hwloc_bitmap_set(selected_cpuset, idx);
|
|
|
+ }
|
|
|
+ count ++;
|
|
|
+ }
|
|
|
+ idx = hwloc_bitmap_next(available_cpuset, idx);
|
|
|
+ }
|
|
|
+ assert(count == offset+nb);
|
|
|
+}
|
|
|
+
|
|
|
+void spawn_tests(int cpu_offset, int cpu_nb, int cuda_offset, int cuda_nb, void *args)
|
|
|
+{
|
|
|
+ if (cpu_offset + cpu_nb > rm_nb_cpu_units)
|
|
|
+ exit(77);
|
|
|
+ if (cuda_offset + cuda_nb > rm_nb_cuda_units)
|
|
|
+ exit(77);
|
|
|
+ hwloc_cpuset_t cpu_cpuset = starpurm_get_all_cpu_workers_cpuset();
|
|
|
+ hwloc_cpuset_t cuda_cpuset = starpurm_get_all_device_workers_cpuset_by_type(rm_cuda_type_id);
|
|
|
+ hwloc_cpuset_t sel_cpuset = hwloc_bitmap_alloc();
|
|
|
+ assert(sel_cpuset != NULL);
|
|
|
+
|
|
|
+ select_units(sel_cpuset, cpu_cpuset, cpu_offset, cpu_nb);
|
|
|
+ select_units(sel_cpuset, cuda_cpuset, cuda_offset, cuda_nb);
|
|
|
+
|
|
|
+ {
|
|
|
+ int strl1 = hwloc_bitmap_snprintf(NULL, 0, cpu_cpuset);
|
|
|
+ char str1[strl1+1];
|
|
|
+ hwloc_bitmap_snprintf(str1, strl1+1, cpu_cpuset);
|
|
|
+
|
|
|
+ int strl2 = hwloc_bitmap_snprintf(NULL, 0, cuda_cpuset);
|
|
|
+ char str2[strl2+1];
|
|
|
+ hwloc_bitmap_snprintf(str2, strl2+1, cuda_cpuset);
|
|
|
+ printf("all cpus cpuset = %s\n", str1);
|
|
|
+
|
|
|
+ int strl3 = hwloc_bitmap_snprintf(NULL, 0, sel_cpuset);
|
|
|
+ char str3[strl3+1];
|
|
|
+ hwloc_bitmap_snprintf(str3, strl1+3, sel_cpuset);
|
|
|
+ printf("spawn on selected cpuset = %s (avail cpu %s, avail cuda %s)\n", str3, str1, str2);
|
|
|
+ }
|
|
|
+
|
|
|
+ _inc_spawn_pending();
|
|
|
+ starpurm_spawn_kernel_on_cpus_callback(NULL, test, args, sel_cpuset, spawn_callback, (void*)(uintptr_t)42);
|
|
|
+
|
|
|
+ hwloc_bitmap_free(sel_cpuset);
|
|
|
+ hwloc_bitmap_free(cpu_cpuset);
|
|
|
+ hwloc_bitmap_free(cuda_cpuset);
|
|
|
}
|
|
|
|
|
|
int main( int argc, char const *argv[])
|
|
|
{
|
|
|
+ pthread_cond_init(&spawn_pending_cond, NULL);
|
|
|
+
|
|
|
int transA = MorseTrans;
|
|
|
int transB = MorseTrans;
|
|
|
|
|
@@ -150,6 +259,8 @@ int main( int argc, char const *argv[])
|
|
|
|
|
|
srand(time(NULL));
|
|
|
|
|
|
+ struct s_test_args test_args = { .m = m, .n = n, .k = k, .transA = transA, .transB = transB };
|
|
|
+
|
|
|
/* Test case */
|
|
|
starpurm_initialize();
|
|
|
starpurm_set_drs_enable(NULL);
|
|
@@ -157,13 +268,41 @@ int main( int argc, char const *argv[])
|
|
|
printf("cpu units: %d\n", rm_nb_cpu_units);
|
|
|
printf("cuda units: %d\n", rm_nb_cuda_units);
|
|
|
printf("using default units\n");
|
|
|
- disp_selected_cpuset();
|
|
|
+ disp_cpuset(starpurm_get_selected_cpuset());
|
|
|
|
|
|
MORSE_Init(rm_nb_cpu_units, rm_nb_cuda_units);
|
|
|
- test(m, n, k, transA, transB);
|
|
|
+ test(&test_args);
|
|
|
+ {
|
|
|
+ int cpu_offset = 0;
|
|
|
+ int cpu_nb = rm_nb_cpu_units/2;
|
|
|
+ if (cpu_nb == 0 && rm_nb_cpu_units > 0)
|
|
|
+ {
|
|
|
+ cpu_nb = 1;
|
|
|
+ }
|
|
|
+ int cuda_offset = 0;
|
|
|
+ int cuda_nb = rm_nb_cuda_units/2;
|
|
|
+ if (cuda_nb == 0 && rm_nb_cuda_units > 0)
|
|
|
+ {
|
|
|
+ cuda_nb = 1;
|
|
|
+ }
|
|
|
+ spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
|
|
|
+ }
|
|
|
+ {
|
|
|
+ int cpu_offset = rm_nb_cpu_units/2;
|
|
|
+ int cpu_nb = rm_nb_cpu_units/2;
|
|
|
+ if (cpu_nb == 0 && rm_nb_cpu_units > 0)
|
|
|
+ {
|
|
|
+ cpu_nb = 1;
|
|
|
+ }
|
|
|
+ int cuda_offset = rm_nb_cuda_units/2;
|
|
|
+ int cuda_nb = rm_nb_cuda_units/2;
|
|
|
+ spawn_tests(cpu_offset, cpu_nb, cuda_offset, cuda_nb, &test_args);
|
|
|
+ }
|
|
|
+ _wait_pending_spawns();
|
|
|
MORSE_Finalize();
|
|
|
|
|
|
starpurm_shutdown();
|
|
|
+ pthread_cond_destroy(&spawn_pending_cond);
|
|
|
|
|
|
return 0;
|
|
|
|