|
@@ -30,6 +30,7 @@
|
|
|
#include <drivers/mpi/driver_mpi_common.h>
|
|
|
#include <drivers/mp_common/source_common.h>
|
|
|
#include <drivers/opencl/driver_opencl.h>
|
|
|
+#include <drivers/opencl/driver_opencl_utils.h>
|
|
|
#include <profiling/profiling.h>
|
|
|
#include <datawizard/datastats.h>
|
|
|
#include <datawizard/memory_nodes.h>
|
|
@@ -54,11 +55,23 @@
|
|
|
#include <hwloc/cuda.h>
|
|
|
#endif
|
|
|
|
|
|
+#if defined(STARPU_USE_OPENCL)
|
|
|
+#include <hwloc/opencl.h>
|
|
|
+#endif
|
|
|
+
|
|
|
static unsigned topology_is_initialized = 0;
|
|
|
static int nobind;
|
|
|
|
|
|
|
|
|
static int cpu_worker[STARPU_MAXCPUS];
|
|
|
+static unsigned nb_numa_nodes = 0;
|
|
|
+static int numa_memory_nodes_to_hwloclogid[STARPU_MAXNUMANODES];
|
|
|
+static int numa_memory_nodes_to_physicalid[STARPU_MAXNUMANODES];
|
|
|
+static unsigned numa_bus_id[STARPU_MAXNUMANODES*STARPU_MAXNUMANODES];
|
|
|
+static int _starpu_get_logical_numa_node_worker(unsigned workerid);
|
|
|
+
|
|
|
+#define STARPU_NUMA_UNINITIALIZED (-2)
|
|
|
+#define STARPU_NUMA_MAIN_RAM (-1)
|
|
|
|
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
|
|
|
|
|
@@ -87,6 +100,124 @@ static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
|
|
|
struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
|
|
|
#endif
|
|
|
|
|
|
+int starpu_memory_nodes_get_numa_count(void)
|
|
|
+{
|
|
|
+ return nb_numa_nodes;
|
|
|
+}
|
|
|
+
|
|
|
+#if defined(STARPU_HAVE_HWLOC)
|
|
|
+static int numa_get_logical_id(hwloc_obj_t obj)
|
|
|
+{
|
|
|
+ STARPU_ASSERT(obj);
|
|
|
+ while (obj->type != HWLOC_OBJ_NODE)
|
|
|
+ {
|
|
|
+ obj = obj->parent;
|
|
|
+
|
|
|
+ /* If we don't find a "node" obj before the root, this means
|
|
|
+ * hwloc does not know whether there are numa nodes or not, so
|
|
|
+ * we should not use a per-node sampling in that case. */
|
|
|
+ if (!obj)
|
|
|
+ return STARPU_NUMA_MAIN_RAM;
|
|
|
+ }
|
|
|
+ return obj->logical_index;
|
|
|
+}
|
|
|
+
|
|
|
+static int numa_get_physical_id(hwloc_obj_t obj)
|
|
|
+{
|
|
|
+ STARPU_ASSERT(obj);
|
|
|
+ while (obj->type != HWLOC_OBJ_NODE)
|
|
|
+ {
|
|
|
+ obj = obj->parent;
|
|
|
+
|
|
|
+ /* If we don't find a "node" obj before the root, this means
|
|
|
+ * hwloc does not know whether there are numa nodes or not, so
|
|
|
+ * we should not use a per-node sampling in that case. */
|
|
|
+ if (!obj)
|
|
|
+ return STARPU_NUMA_MAIN_RAM;
|
|
|
+ }
|
|
|
+ return obj->os_index;
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+static int _starpu_get_logical_numa_node_worker(unsigned workerid)
|
|
|
+{
|
|
|
+#if defined(STARPU_HAVE_HWLOC)
|
|
|
+ char * state;
|
|
|
+ if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
|
|
|
+ {
|
|
|
+ struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
|
|
|
+ struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
|
|
|
+ struct _starpu_machine_topology *topology = &config->topology ;
|
|
|
+
|
|
|
+ hwloc_obj_t obj;
|
|
|
+ switch(worker->arch)
|
|
|
+ {
|
|
|
+ case STARPU_CPU_WORKER:
|
|
|
+ obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ STARPU_ABORT();
|
|
|
+ }
|
|
|
+
|
|
|
+ return numa_get_logical_id(obj);
|
|
|
+ }
|
|
|
+ else
|
|
|
+#endif
|
|
|
+ {
|
|
|
+ (void) workerid;
|
|
|
+ return STARPU_NUMA_MAIN_RAM;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static int _starpu_get_physical_numa_node_worker(unsigned workerid)
|
|
|
+{
|
|
|
+#if defined(STARPU_HAVE_HWLOC)
|
|
|
+ char * state;
|
|
|
+ if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
|
|
|
+ {
|
|
|
+ struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
|
|
|
+ struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
|
|
|
+ struct _starpu_machine_topology *topology = &config->topology ;
|
|
|
+
|
|
|
+ hwloc_obj_t obj;
|
|
|
+ switch(worker->arch)
|
|
|
+ {
|
|
|
+ case STARPU_CPU_WORKER:
|
|
|
+ obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
|
|
|
+ break;
|
|
|
+ default:
|
|
|
+ STARPU_ABORT();
|
|
|
+ }
|
|
|
+
|
|
|
+ return numa_get_physical_id(obj);
|
|
|
+ }
|
|
|
+ else
|
|
|
+#endif
|
|
|
+ {
|
|
|
+ (void) workerid;
|
|
|
+ return STARPU_NUMA_MAIN_RAM;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+static int _starpu_numa_get_logical_id_from_pu(int pu)
|
|
|
+{
|
|
|
+#if defined(STARPU_HAVE_HWLOC)
|
|
|
+ if (nb_numa_nodes > 1)
|
|
|
+ {
|
|
|
+ struct _starpu_machine_config *config = _starpu_get_machine_config();
|
|
|
+ struct _starpu_machine_topology *topology = &config->topology;
|
|
|
+
|
|
|
+ hwloc_obj_t obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, pu);
|
|
|
+ return numa_get_logical_id(obj);
|
|
|
+ }
|
|
|
+ else
|
|
|
+#endif
|
|
|
+ {
|
|
|
+ return -1;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d)
|
|
|
{
|
|
|
unsigned nworkers = starpu_worker_get_count();
|
|
@@ -846,6 +977,67 @@ _starpu_topology_get_nhwpu (struct _starpu_machine_config *config)
|
|
|
return config->topology.nhwpus;
|
|
|
}
|
|
|
|
|
|
+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
|
|
|
+{
|
|
|
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
|
|
|
+ _starpu_opencl_init();
|
|
|
+#endif
|
|
|
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
|
|
|
+ _starpu_init_cuda();
|
|
|
+#endif
|
|
|
+ _starpu_init_topology(config);
|
|
|
+
|
|
|
+ int res;
|
|
|
+#if defined(STARPU_HAVE_HWLOC)
|
|
|
+ char * state;
|
|
|
+ if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
|
|
|
+ {
|
|
|
+ struct _starpu_machine_topology *topology = &config->topology ;
|
|
|
+ int nnumanodes = hwloc_get_nbobjs_by_type(topology->hwtopology, HWLOC_OBJ_NODE) ;
|
|
|
+ res = nnumanodes > 0 ? nnumanodes : 1 ;
|
|
|
+ }
|
|
|
+ else
|
|
|
+#endif
|
|
|
+ {
|
|
|
+ res = 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ STARPU_ASSERT_MSG(res <= STARPU_MAXNUMANODES, "Number of NUMA nodes discovered is higher than maximum accepted ! Use configure option --enable-maxnumanodes=xxx to increase the maximum value of supported NUMA nodes.\n");
|
|
|
+ return res;
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+int starpu_memory_nodes_numa_hwloclogid_to_id(int logid)
|
|
|
+{
|
|
|
+ unsigned n;
|
|
|
+ for (n = 0; n < nb_numa_nodes; n++)
|
|
|
+ if (numa_memory_nodes_to_hwloclogid[n] == logid)
|
|
|
+ return n;
|
|
|
+ return -1;
|
|
|
+}
|
|
|
+
|
|
|
+int starpu_memory_nodes_numa_id_to_hwloclogid(unsigned id)
|
|
|
+{
|
|
|
+ STARPU_ASSERT(id < STARPU_MAXNUMANODES);
|
|
|
+ return numa_memory_nodes_to_hwloclogid[id];
|
|
|
+}
|
|
|
+
|
|
|
+int starpu_memory_nodes_numa_devid_to_id(unsigned id)
|
|
|
+{
|
|
|
+ STARPU_ASSERT(id < STARPU_MAXNUMANODES);
|
|
|
+ return numa_memory_nodes_to_physicalid[id];
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+int starpu_memory_nodes_numa_id_to_devid(int osid)
|
|
|
+{
|
|
|
+ unsigned n;
|
|
|
+ for (n = 0; n < nb_numa_nodes; n++)
|
|
|
+ if (numa_memory_nodes_to_physicalid[n] == osid)
|
|
|
+ return n;
|
|
|
+ return -1;
|
|
|
+}
|
|
|
+
|
|
|
#ifdef STARPU_HAVE_HWLOC
|
|
|
void _starpu_topology_filter(hwloc_topology_t topology)
|
|
|
{
|
|
@@ -1751,35 +1943,294 @@ _starpu_bind_thread_on_cpus (
|
|
|
#endif
|
|
|
}
|
|
|
|
|
|
-static void
|
|
|
-_starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
|
|
|
+static void _starpu_init_binding_cpu(struct _starpu_machine_config *config)
|
|
|
+{
|
|
|
+ unsigned worker;
|
|
|
+ for (worker = 0; worker < config->topology.nworkers; worker++)
|
|
|
+ {
|
|
|
+ struct _starpu_worker *workerarg = &config->workers[worker];
|
|
|
+
|
|
|
+ switch (workerarg->arch)
|
|
|
+ {
|
|
|
+ case STARPU_CPU_WORKER:
|
|
|
+ {
|
|
|
+
|
|
|
+ workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ default:
|
|
|
+
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+
|
|
|
+static void _starpu_init_numa_node(struct _starpu_machine_config *config)
|
|
|
{
|
|
|
-
|
|
|
- unsigned ram_memory_node;
|
|
|
+ nb_numa_nodes = 0;
|
|
|
+
|
|
|
+ unsigned i;
|
|
|
+ for (i = 0; i < STARPU_MAXNUMANODES; i++)
|
|
|
+ {
|
|
|
+ numa_memory_nodes_to_hwloclogid[i] = STARPU_NUMA_UNINITIALIZED;
|
|
|
+ numa_memory_nodes_to_physicalid[i] = STARPU_NUMA_UNINITIALIZED;
|
|
|
+ }
|
|
|
|
|
|
-
|
|
|
- * node */
|
|
|
-
|
|
|
- ram_memory_node = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
|
|
|
- STARPU_ASSERT(ram_memory_node == STARPU_MAIN_RAM);
|
|
|
|
|
|
+ char * state;
|
|
|
+
|
|
|
+ if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
|
|
|
+ {
|
|
|
+
|
|
|
+ unsigned worker;
|
|
|
+ for (worker = 0; worker < config->topology.nworkers; worker++)
|
|
|
+ {
|
|
|
+ struct _starpu_worker *workerarg = &config->workers[worker];
|
|
|
+ if (workerarg->arch == STARPU_CPU_WORKER)
|
|
|
+ {
|
|
|
+ int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
|
|
|
+
|
|
|
+
|
|
|
+ int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
|
|
|
+
|
|
|
+
|
|
|
+ if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
|
|
|
+ {
|
|
|
+ _STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
|
|
|
+ STARPU_ABORT();
|
|
|
+ }
|
|
|
+
|
|
|
+ if (numa_starpu_id == -1)
|
|
|
+ {
|
|
|
+ int devid = numa_logical_id == STARPU_NUMA_MAIN_RAM ? 0 : numa_logical_id;
|
|
|
+ int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, devid);
|
|
|
+ STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
|
|
|
+ numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
|
|
|
+ int numa_physical_id = _starpu_get_physical_numa_node_worker(worker);
|
|
|
+ numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
|
|
|
+ nb_numa_nodes++;
|
|
|
+#ifdef STARPU_SIMGRID
|
|
|
+ snprintf(name, sizeof(name), "RAM%d", memnode);
|
|
|
+ host = _starpu_simgrid_get_host_by_name(name);
|
|
|
+ STARPU_ASSERT(host);
|
|
|
+ _starpu_simgrid_memory_node_set_host(memnode, host);
|
|
|
+#endif
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ if (nb_numa_nodes != 0)
|
|
|
+ return;
|
|
|
+
|
|
|
+ _STARPU_DISP("No NUMA nodes found when checking CPU workers...\n");
|
|
|
+
|
|
|
+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
|
|
|
+ _STARPU_DISP("Take NUMA nodes attached to CUDA and OpenCL devices...\n");
|
|
|
+#endif
|
|
|
+
|
|
|
+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_HWLOC)
|
|
|
+ for (i = 0; i < config->topology.ncudagpus; i++)
|
|
|
+ {
|
|
|
+ hwloc_obj_t obj = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, i);
|
|
|
+
|
|
|
+
|
|
|
+ if (!obj)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ while (obj->type != HWLOC_OBJ_NODE)
|
|
|
+ {
|
|
|
+ obj = obj->parent;
|
|
|
+
|
|
|
+ /* If we don't find a "node" obj before the root, this means
|
|
|
+ * hwloc does not know whether there are numa nodes or not, so
|
|
|
+ * we should not use a per-node sampling in that case. */
|
|
|
+ if (!obj)
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
|
|
|
+
|
|
|
+
|
|
|
+ if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
|
|
|
+ {
|
|
|
+ _STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
|
|
|
+ STARPU_ABORT();
|
|
|
+ }
|
|
|
+
|
|
|
+ if (numa_starpu_id == -1)
|
|
|
+ {
|
|
|
+ int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
|
|
|
+ STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
|
|
|
+ numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
|
|
|
+ numa_memory_nodes_to_physicalid[memnode] = obj->os_index;
|
|
|
+ nb_numa_nodes++;
|
|
|
+#ifdef STARPU_SIMGRID
|
|
|
+ snprintf(name, sizeof(name), "RAM%d", memnode);
|
|
|
+ host = _starpu_simgrid_get_host_by_name(name);
|
|
|
+ STARPU_ASSERT(host);
|
|
|
+ _starpu_simgrid_memory_node_set_host(memnode, host);
|
|
|
+#endif
|
|
|
+ }
|
|
|
+ }
|
|
|
+#endif
|
|
|
+#if defined(STARPU_USE_OPENCL) && defined(STARPU_HAVE_HWLOC)
|
|
|
+ if (config->topology.nopenclgpus > 0)
|
|
|
+ {
|
|
|
+ cl_int err;
|
|
|
+ cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
|
|
|
+ cl_uint nb_platforms;
|
|
|
+ unsigned platform;
|
|
|
+ unsigned nb_opencl_devices = 0, num = 0;
|
|
|
+
|
|
|
+ err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
|
|
|
+ if (STARPU_UNLIKELY(err != CL_SUCCESS))
|
|
|
+ nb_platforms=0;
|
|
|
+
|
|
|
+ cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
|
|
|
+ if (starpu_get_env_number("STARPU_OPENCL_ON_CPUS") > 0)
|
|
|
+ device_type |= CL_DEVICE_TYPE_CPU;
|
|
|
+ if (starpu_get_env_number("STARPU_OPENCL_ONLY_ON_CPUS") > 0)
|
|
|
+ device_type = CL_DEVICE_TYPE_CPU;
|
|
|
+
|
|
|
+ for (platform = 0; platform < nb_platforms ; platform++)
|
|
|
+ {
|
|
|
+ err = clGetDeviceIDs(platform_id[platform], device_type, 0, NULL, &num);
|
|
|
+ if (err != CL_SUCCESS)
|
|
|
+ num = 0;
|
|
|
+ nb_opencl_devices += num;
|
|
|
+
|
|
|
+ for (i = 0; i < num; i++)
|
|
|
+ {
|
|
|
+ hwloc_obj_t obj = hwloc_opencl_get_device_osdev_by_index(config->topology.hwtopology, platform, i);
|
|
|
+
|
|
|
+
|
|
|
+ if (!obj)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ while (obj->type != HWLOC_OBJ_NODE)
|
|
|
+ {
|
|
|
+ obj = obj->parent;
|
|
|
+
|
|
|
+ /* If we don't find a "node" obj before the root, this means
|
|
|
+ * hwloc does not know whether there are numa nodes or not, so
|
|
|
+ * we should not use a per-node sampling in that case. */
|
|
|
+ if (!obj)
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
|
|
|
+
|
|
|
+
|
|
|
+ if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
|
|
|
+ {
|
|
|
+ _STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
|
|
|
+ STARPU_ABORT();
|
|
|
+ }
|
|
|
+
|
|
|
+ if (numa_starpu_id == -1)
|
|
|
+ {
|
|
|
+ int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
|
|
|
+ STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
|
|
|
+ numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
|
|
|
+ numa_memory_nodes_to_physicalid[memnode] = obj->os_index;
|
|
|
+ nb_numa_nodes++;
|
|
|
#ifdef STARPU_SIMGRID
|
|
|
- char name[16];
|
|
|
- msg_host_t host = _starpu_simgrid_get_host_by_name("RAM");
|
|
|
- STARPU_ASSERT(host);
|
|
|
- _starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
|
|
|
+ snprintf(name, sizeof(name), "RAM%d", memnode);
|
|
|
+ host = _starpu_simgrid_get_host_by_name(name);
|
|
|
+ STARPU_ASSERT(host);
|
|
|
+ _starpu_simgrid_memory_node_set_host(memnode, host);
|
|
|
#endif
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+#endif
|
|
|
+ }
|
|
|
+
|
|
|
+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
|
|
|
+
|
|
|
+ if (nb_numa_nodes != 0)
|
|
|
+ return;
|
|
|
+
|
|
|
+
|
|
|
+ _STARPU_DISP("No NUMA nodes found when checking GPUs devices...\n");
|
|
|
+#endif
|
|
|
+
|
|
|
+ _STARPU_DISP("Finally, take all NUMA nodes available... \n");
|
|
|
+
|
|
|
+ unsigned nnuma = _starpu_topology_get_nnumanodes(config);
|
|
|
+ if (nnuma > STARPU_MAXNUMANODES)
|
|
|
+ {
|
|
|
+ _STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
|
|
|
+ nnuma = STARPU_MAXNUMANODES;
|
|
|
+ }
|
|
|
+
|
|
|
+ unsigned numa;
|
|
|
+ for (numa = 0; numa < nnuma; numa++)
|
|
|
+ {
|
|
|
+#if defined(STARPU_HAVE_HWLOC)
|
|
|
+ if (nnuma > 1)
|
|
|
+ {
|
|
|
+ hwloc_obj_t obj = hwloc_get_obj_by_type(config->topology.hwtopology, HWLOC_OBJ_NUMANODE, numa);
|
|
|
+ unsigned numa_logical_id = obj->logical_index;
|
|
|
+ unsigned numa_physical_id = obj->os_index;
|
|
|
|
|
|
+ int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
|
|
|
+ STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available) \n", memnode, STARPU_MAXNUMANODES);
|
|
|
+
|
|
|
+ numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
|
|
|
+ numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
|
|
|
+ nb_numa_nodes++;
|
|
|
+
|
|
|
+#ifdef STARPU_SIMGRID
|
|
|
+ snprintf(name, sizeof(name), "RAM%d", memnode);
|
|
|
+ host = _starpu_simgrid_get_host_by_name(name);
|
|
|
+ STARPU_ASSERT(host);
|
|
|
+ _starpu_simgrid_memory_node_set_host(memnode, host);
|
|
|
+#endif
|
|
|
+ }
|
|
|
+ else
|
|
|
+#endif
|
|
|
+ {
|
|
|
+
|
|
|
+
|
|
|
+ int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
|
|
|
+ STARPU_ASSERT_MSG(memnode == STARPU_MAIN_RAM, "Wrong Memory Node : %d (expected %d) \n", memnode, STARPU_MAIN_RAM);
|
|
|
+
|
|
|
+ numa_memory_nodes_to_hwloclogid[memnode] = STARPU_NUMA_MAIN_RAM;
|
|
|
+ numa_memory_nodes_to_physicalid[memnode] = STARPU_NUMA_MAIN_RAM;
|
|
|
+ nb_numa_nodes++;
|
|
|
+#ifdef STARPU_SIMGRID
|
|
|
+ char name[16];
|
|
|
+ msg_host_t host = _starpu_simgrid_get_host_by_name("RAM");
|
|
|
+ STARPU_ASSERT(host);
|
|
|
+ _starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
|
|
|
+#endif
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ STARPU_ASSERT_MSG(nb_numa_nodes > 0, "No NUMA node found... We need at least one memory node !\n");
|
|
|
+}
|
|
|
+
|
|
|
+static void _starpu_init_numa_bus()
|
|
|
+{
|
|
|
+ unsigned i, j;
|
|
|
+ for (i = 0; i < nb_numa_nodes; i++)
|
|
|
+ for (j = 0; j < nb_numa_nodes; j++)
|
|
|
+ if (i != j)
|
|
|
+ numa_bus_id[i*nb_numa_nodes+j] = _starpu_register_bus(i, j);
|
|
|
+}
|
|
|
+
|
|
|
+static void
|
|
|
+_starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
|
|
|
+{
|
|
|
|
|
|
* combinations in a matrix which we initialize here. */
|
|
|
_starpu_initialize_busid_matrix();
|
|
|
|
|
|
-
|
|
|
- * giving it a memory node and a core bind id.
|
|
|
- */
|
|
|
-
|
|
|
- unsigned numa_init[1] = { 1 };
|
|
|
- unsigned numa_memory_nodes[1] = { ram_memory_node };
|
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
|
|
|
unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
|
|
|
unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
|
|
@@ -1801,6 +2252,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
unsigned mpi_memory_nodes[STARPU_MAXMPIDEVS];
|
|
|
unsigned mpi_bindid[STARPU_MAXMPIDEVS];
|
|
|
#endif
|
|
|
+
|
|
|
unsigned bindid;
|
|
|
|
|
|
for (bindid = 0; bindid < config->nbindid; bindid++)
|
|
@@ -1810,6 +2262,13 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
config->bindid_workers[bindid].nworkers = 0;
|
|
|
}
|
|
|
|
|
|
+
|
|
|
+ _starpu_init_binding_cpu(config);
|
|
|
+
|
|
|
+
|
|
|
+ _starpu_init_numa_node(config);
|
|
|
+ _starpu_init_numa_bus();
|
|
|
+
|
|
|
unsigned worker;
|
|
|
for (worker = 0; worker < config->topology.nworkers; worker++)
|
|
|
{
|
|
@@ -1828,33 +2287,22 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
{
|
|
|
case STARPU_CPU_WORKER:
|
|
|
{
|
|
|
-
|
|
|
- int numaid = 0;
|
|
|
-
|
|
|
- if (numa_init[numaid])
|
|
|
- {
|
|
|
- memory_node = numa_memory_nodes[numaid];
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
- numa_init[numaid] = 1;
|
|
|
- memory_node = numa_memory_nodes[numaid] = _starpu_memory_node_register(STARPU_CPU_RAM, numaid);
|
|
|
-#ifdef STARPU_SIMGRID
|
|
|
- snprintf(name, sizeof(name), "RAM%d", numaid);
|
|
|
- host = _starpu_simgrid_get_host_by_name(name);
|
|
|
- STARPU_ASSERT(host);
|
|
|
- _starpu_simgrid_memory_node_set_host(memory_node, host);
|
|
|
-#endif
|
|
|
- }
|
|
|
- workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
|
|
|
+ int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
|
|
|
+ int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
|
|
|
+ if (numa_starpu_id >= STARPU_MAXNUMANODES)
|
|
|
+ numa_starpu_id = STARPU_MAIN_RAM;
|
|
|
+
|
|
|
+ workerarg->numa_memory_node = memory_node = numa_starpu_id;
|
|
|
+
|
|
|
_starpu_memory_node_add_nworkers(memory_node);
|
|
|
|
|
|
- _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
|
|
|
- _starpu_worker_drives_memory_node(workerarg, memory_node);
|
|
|
+ _starpu_worker_drives_memory_node(workerarg, numa_starpu_id);
|
|
|
break;
|
|
|
}
|
|
|
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
|
|
|
case STARPU_CUDA_WORKER:
|
|
|
+ {
|
|
|
+ unsigned numa;
|
|
|
#ifndef STARPU_SIMGRID
|
|
|
if (may_bind_automatically[STARPU_CUDA_WORKER])
|
|
|
{
|
|
@@ -1884,8 +2332,11 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
|
|
|
memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
|
|
|
|
|
|
- _starpu_cuda_bus_ids[0][devid+1] = _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
|
|
|
- _starpu_cuda_bus_ids[devid+1][0] = _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
|
|
|
+ for (numa = 0; numa < nb_numa_nodes; numa++)
|
|
|
+ {
|
|
|
+ _starpu_cuda_bus_ids[numa][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(numa, memory_node);
|
|
|
+ _starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][numa] = _starpu_register_bus(memory_node, numa);
|
|
|
+ }
|
|
|
#ifdef STARPU_SIMGRID
|
|
|
const char* cuda_memcpy_peer;
|
|
|
snprintf(name, sizeof(name), "CUDA%u", devid);
|
|
@@ -1912,8 +2363,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
if (workerarg2->arch == STARPU_CUDA_WORKER)
|
|
|
{
|
|
|
unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
|
|
|
- _starpu_cuda_bus_ids[devid2][devid] = _starpu_register_bus(memory_node2, memory_node);
|
|
|
- _starpu_cuda_bus_ids[devid][devid2] = _starpu_register_bus(memory_node, memory_node2);
|
|
|
+ _starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node2, memory_node);
|
|
|
+ _starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node, memory_node2);
|
|
|
#ifndef STARPU_SIMGRID
|
|
|
#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
|
|
|
{
|
|
@@ -1931,8 +2382,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
_STARPU_DEBUG("CUDA%u and CUDA%u are linked through %s, along %u GPUs\n", devid, devid2, name, data->ngpus);
|
|
|
}
|
|
|
#endif
|
|
|
- starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2][devid], data->ngpus);
|
|
|
- starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid][devid2], data->ngpus);
|
|
|
+ starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES], data->ngpus);
|
|
|
+ starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES], data->ngpus);
|
|
|
}
|
|
|
}
|
|
|
#endif
|
|
@@ -1943,13 +2394,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
}
|
|
|
_starpu_memory_node_add_nworkers(memory_node);
|
|
|
|
|
|
- _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
|
|
|
+
|
|
|
+ for (numa = 0; numa < nb_numa_nodes; numa++)
|
|
|
+ _starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
|
|
|
+
|
|
|
_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
|
|
|
break;
|
|
|
+ }
|
|
|
#endif
|
|
|
|
|
|
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
|
|
|
case STARPU_OPENCL_WORKER:
|
|
|
+ {
|
|
|
+ unsigned numa;
|
|
|
#ifndef STARPU_SIMGRID
|
|
|
if (may_bind_automatically[STARPU_OPENCL_WORKER])
|
|
|
{
|
|
@@ -1970,8 +2427,12 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
opencl_init[devid] = 1;
|
|
|
workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
|
|
|
memory_node = opencl_memory_nodes[devid] = _starpu_memory_node_register(STARPU_OPENCL_RAM, devid);
|
|
|
- _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
|
|
|
- _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
|
|
|
+
|
|
|
+ for (numa = 0; numa < nb_numa_nodes; numa++)
|
|
|
+ {
|
|
|
+ _starpu_register_bus(numa, memory_node);
|
|
|
+ _starpu_register_bus(memory_node, numa);
|
|
|
+ }
|
|
|
#ifdef STARPU_SIMGRID
|
|
|
snprintf(name, sizeof(name), "OpenCL%u", devid);
|
|
|
host = _starpu_simgrid_get_host_by_name(name);
|
|
@@ -1981,13 +2442,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
}
|
|
|
_starpu_memory_node_add_nworkers(memory_node);
|
|
|
|
|
|
- _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
|
|
|
+
|
|
|
+ for (numa = 0; numa < nb_numa_nodes; numa++)
|
|
|
+ _starpu_worker_drives_memory_node(workerarg, numa);
|
|
|
+
|
|
|
_starpu_worker_drives_memory_node(workerarg, memory_node);
|
|
|
break;
|
|
|
+ }
|
|
|
#endif
|
|
|
|
|
|
#ifdef STARPU_USE_MIC
|
|
|
case STARPU_MIC_WORKER:
|
|
|
+ {
|
|
|
+ unsigned numa;
|
|
|
if (mic_init[devid])
|
|
|
{
|
|
|
memory_node = mic_memory_nodes[devid];
|
|
@@ -2004,21 +2471,30 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
|
|
|
mic_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
|
|
|
memory_node = mic_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MIC_RAM, devid);
|
|
|
- _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
|
|
|
- _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
|
|
|
+
|
|
|
+ for (numa = 0; numa < nb_numa_nodes; numa++)
|
|
|
+ {
|
|
|
+ _starpu_register_bus(numa, memory_node);
|
|
|
+ _starpu_register_bus(memory_node, numa);
|
|
|
+ }
|
|
|
|
|
|
}
|
|
|
workerarg->bindid = mic_bindid[devid];
|
|
|
_starpu_memory_node_add_nworkers(memory_node);
|
|
|
|
|
|
- _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
|
|
|
+
|
|
|
+ for (numa = 0; numa < nb_numa_nodes; numa++)
|
|
|
+ _starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
|
|
|
+
|
|
|
_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
|
|
|
break;
|
|
|
+ }
|
|
|
#endif
|
|
|
|
|
|
#ifdef STARPU_USE_SCC
|
|
|
case STARPU_SCC_WORKER:
|
|
|
{
|
|
|
+ unsigned numa;
|
|
|
|
|
|
struct _starpu_memory_node_descr *descr = _starpu_memory_node_get_description();
|
|
|
descr->nodes[ram_memory_node] = STARPU_SCC_SHM;
|
|
@@ -2026,7 +2502,10 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
memory_node = ram_memory_node;
|
|
|
_starpu_memory_node_add_nworkers(memory_node);
|
|
|
|
|
|
- _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
|
|
|
+
|
|
|
+ for (numa = 0; numa < nb_numa_nodes; numa++)
|
|
|
+ _starpu_worker_drives_memory_node(workerarg, numa);
|
|
|
+
|
|
|
_starpu_worker_drives_memory_node(workerarg, memory_node);
|
|
|
}
|
|
|
break;
|
|
@@ -2035,6 +2514,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
#ifdef STARPU_USE_MPI_MASTER_SLAVE
|
|
|
case STARPU_MPI_MS_WORKER:
|
|
|
{
|
|
|
+ unsigned numa;
|
|
|
if (mpi_init[devid])
|
|
|
{
|
|
|
memory_node = mpi_memory_nodes[devid];
|
|
@@ -2044,11 +2524,18 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
|
|
|
mpi_init[devid] = 1;
|
|
|
mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
|
|
|
memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
|
|
|
- _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
|
|
|
- _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
|
|
|
+
|
|
|
+ for (numa = 0; numa < nb_numa_nodes; numa++)
|
|
|
+ {
|
|
|
+ _starpu_register_bus(numa, memory_node);
|
|
|
+ _starpu_register_bus(memory_node, numa);
|
|
|
+ }
|
|
|
|
|
|
}
|
|
|
- _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
|
|
|
+
|
|
|
+ for (numa = 0; numa < nb_numa_nodes; numa++)
|
|
|
+ _starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
|
|
|
+
|
|
|
_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
|
|
|
#ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
|
|
|
|
|
@@ -2154,7 +2641,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
|
|
|
_starpu_memory_nodes_init();
|
|
|
_starpu_datastats_init();
|
|
|
|
|
|
- _starpu_init_workers_binding(config, no_mp_config);
|
|
|
+ _starpu_init_workers_binding_and_memory(config, no_mp_config);
|
|
|
|
|
|
config->cpus_nodeid = -1;
|
|
|
config->cuda_nodeid = -1;
|
|
@@ -2293,3 +2780,4 @@ starpu_topology_print (FILE *output)
|
|
|
fprintf(output, "\n");
|
|
|
}
|
|
|
}
|
|
|
+
|