Prechádzať zdrojové kódy

Let StarPU allocate memory on the FPGA

Samuel Thibault 5 rokov pred
rodič
commit
23b602565f

+ 0 - 6
src/core/topology.c

@@ -2796,15 +2796,9 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 					fpga_init[devid] = 1;
 					workerarg->bindid = fpga_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
 
-#if 0
-// TODO: il faut activer ça pour que StarPU se mette à allouer de la mémoire FPGA
 					memory_node = fpga_memory_nodes[devid] = _starpu_memory_node_register(STARPU_FPGA_RAM, devid, &_starpu_driver_fpga_node_ops); 
 					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
 					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
-#else
-					// Ici, éventuellement pour l'instant
-					memory_node = STARPU_MAIN_RAM;
-#endif
 
 #ifdef STARPU_SIMGRID
 					snprintf(name, sizeof(name), "Fpga%d", devid);

+ 7 - 0
src/datawizard/malloc.c

@@ -635,6 +635,12 @@ starpu_memory_unpin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRI
  * quickly find free segments to allocate.
  */
 
+#ifdef STARPU_USE_FPGA
+// FIXME: Maxeler FPGAs want 192 byte alignment
+#define CHUNK_SIZE (128*1024*192)
+#define CHUNK_ALLOC_MAX (CHUNK_SIZE / 8)
+#define CHUNK_ALLOC_MIN (128*192)
+#else
 /* Size of each chunk, 32MiB granularity brings 128 chunks to be allocated in
  * order to fill a 4GiB GPU. */
 #define CHUNK_SIZE (32*1024*1024)
@@ -647,6 +653,7 @@ starpu_memory_unpin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRI
  * 16KiB (i.e. 64x64 float) granularity eats 2MiB RAM for managing a 4GiB GPU.
  */
 #define CHUNK_ALLOC_MIN (16*1024)
+#endif
 
 /* Don't really deallocate chunks unless we have more than this many chunks
  * which are completely free. */

+ 2 - 0
src/datawizard/node_ops.c

@@ -40,6 +40,8 @@ const char* _starpu_node_get_prefix(enum starpu_node_kind kind)
 			return "Disk";
 		case STARPU_MIC_RAM:
 			return "MIC";
+		case STARPU_FPGA_RAM:
+			return "FPGA";
 		case STARPU_MPI_MS_RAM:
 			return "MPI_MS";
 		case STARPU_UNUSED:

+ 6 - 2
src/drivers/max/driver_fpga.c

@@ -69,6 +69,7 @@ void _starpu_init_fpga()
         //// pour récupérer l'accès à la LMem	
 }
 
+#if 0
 int fpga_allocate_memory(fpga_mem *ptr, size_t size){
 //This allocates BYTES
 	char *msg1="You asked to allocate ";
@@ -82,6 +83,7 @@ int fpga_allocate_memory(fpga_mem *ptr, size_t size){
        		else
 		return 1;
        			  }
+#endif
 
 int fpgaGetDeviceProperties(fpgaDeviceProp *props,unsigned devid){
 //TODO
@@ -354,12 +356,14 @@ uintptr_t _starpu_fpga_allocate_memory(unsigned dst_node, size_t size, int flags
 	unsigned devid = starpu_memory_node_get_devid(dst_node);
 	STARPU_ASSERT(devid == 0); // For now
 
-	static fpga_mem current_address = 0;
+	/* 0 would be seen as NULL, i.e. allocation failed... */
+// FIXME: Maxeler FPGAs want 192-byte alignment
+	static fpga_mem current_address = 8192*192;
 	fpga_mem addr;
 // TODO: vérifier si current_address + size > taille de la LMEm
  	addr = current_address;
 	current_address += size;
-printf("fpga mem returned from allocation @: %p\n",addr);
+printf("fpga mem returned from allocation @: %p - %p\n",addr, addr + size);
 //success = 0
         return (uintptr_t) addr;
 }

+ 189 - 49
tests/perfmodels/max_fpga.c

@@ -9,22 +9,6 @@
 #define SIZE (192/sizeof(int32_t))
 
 
-void cpu_func(void *buffers[], void *cl_arg)
-{
-    int *a = (int*) STARPU_VECTOR_GET_PTR(buffers[0]);
-    int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
-    int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
-   
-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
-
-    (void)buffers;
-    (void)cl_arg;
-
-    int i;
-    for (i = 0; i < size; i++)
-	c[i] = a[i] + b[i];
-}
-
 void fpga_impl(void *buffers[], void *cl_arg)
 {   
     (void)cl_arg;
@@ -54,6 +38,7 @@ int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
 
     printf("Loading DFE memory.\n");
 
+/* C = A+B */
     StreamFMA(SIZE, ptrA, sizeBytes, ptrB, sizeBytes, poubelle_cpu, sizeBytes,
     poubelle, LMemsize,
     poubelle, LMemsize,
@@ -63,6 +48,7 @@ int32_t *poubelle_cpu = malloc(SIZE * sizeof(int32_t));
     poubelle, LMemsize);
 printf("T1 finished\n");
 
+/* C = A*B */
     StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes,
     ptrAT2, LMemsize,
     poubelle, LMemsize,
@@ -72,6 +58,7 @@ printf("T1 finished\n");
     ptrCT2, LMemsize);
 printf("T2 finished\n");
 
+/* C = A+B */
     StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, ptrC, sizeBytes,
     poubelle, LMemsize,
     ptrAT3, LMemsize,
@@ -88,15 +75,159 @@ printf("T3 finished\n");
 
 static struct starpu_codelet cl =
 {
-    .cpu_funcs = {cpu_func},
-    .cpu_funcs_name = {"cpu_func"},
-
     .fpga_funcs = {fpga_impl},
   
     .nbuffers = 3,
     .modes = {STARPU_R, STARPU_R, STARPU_W}
 };
- 
+
+
+
+/*
+ * Task 1 of the test pipeline (C = A + B).
+ *
+ * buffers[0], buffers[1]: input vectors, accessed through CPU pointers.
+ * buffers[2]: output vector; its pointer value is handed to the kernel as an
+ *             FPGA-side address.
+ * cl_arg: unused.
+ */
+void fpga_impl1(void *buffers[], void *cl_arg)
+{
+    (void)cl_arg;
+
+    int32_t *ptrA = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[0]);
+    int32_t *ptrB = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[1]);
+    size_t   ptrC = (size_t)   STARPU_VECTOR_GET_PTR(buffers[2]); /* FPGA */
+
+    int sizeBytes = SIZE * sizeof(int32_t);
+    size_t LMemsize = SIZE * sizeof(int32_t);
+
+    /* Address passed for every FPGA-side stream slot this task does not use
+     * ("poubelle" is French for trash bin). */
+    size_t poubelle = 0xc0000;
+
+    /* CPU-side scratch buffer passed for the host stream slot this task does
+     * not use. */
+    int32_t *poubelle_cpu = malloc(SIZE * sizeof(*poubelle_cpu));
+    if (poubelle_cpu == NULL)
+    {
+        fprintf(stderr, "fpga_impl1: cannot allocate scratch buffer\n");
+        abort();
+    }
+
+    /* %p requires a void * argument. */
+    printf("T1 with %p %p %zu\n", (void *) ptrA, (void *) ptrB, ptrC);
+
+    /* C = A+B */
+    StreamFMA(SIZE, ptrA, sizeBytes, ptrB, sizeBytes, poubelle_cpu, sizeBytes,
+              poubelle, LMemsize,
+              poubelle, LMemsize,
+              poubelle, LMemsize,
+              poubelle, LMemsize,
+              ptrC, LMemsize,
+              poubelle, LMemsize);
+    printf("T1 finished\n");
+
+    /* The scratch buffer used to leak on every task execution.
+     * NOTE(review): assumes StreamFMA has returned only once the run is
+     * complete (the code already reports completion here) - confirm. */
+    free(poubelle_cpu);
+}
+
+
+/* Codelet for task 1: only an FPGA implementation (fpga_impl1) is provided.
+ * It takes two read-only buffers and one write-only buffer, with an explicit
+ * per-buffer node request (specific_nodes is set): CPU for both inputs,
+ * LOCAL for the output. */
+static struct starpu_codelet cl1 =
+{
+    .fpga_funcs = {fpga_impl1},
+  
+    .nbuffers = 3,
+    .modes = {STARPU_R, STARPU_R, STARPU_W},
+    .specific_nodes = 1,
+    .nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL},
+};
+
+/*
+ * Task 2 of the test pipeline (C = A * B).
+ *
+ * buffers[0], buffers[1]: input vectors; their pointer values are handed to
+ *                         the kernel as FPGA-side addresses.
+ * buffers[2]: output vector, likewise used as an FPGA-side address.
+ * cl_arg: unused.
+ */
+void fpga_impl2(void *buffers[], void *cl_arg)
+{
+    (void)cl_arg;
+
+    size_t ptrA = (size_t) STARPU_VECTOR_GET_PTR(buffers[0]); /* FPGA */
+    size_t ptrB = (size_t) STARPU_VECTOR_GET_PTR(buffers[1]); /* FPGA */
+    size_t ptrC = (size_t) STARPU_VECTOR_GET_PTR(buffers[2]); /* FPGA */
+
+    int sizeBytes = SIZE * sizeof(int32_t);
+    size_t LMemsize = SIZE * sizeof(int32_t);
+
+    /* Address passed for every FPGA-side stream slot this task does not use
+     * ("poubelle" is French for trash bin). */
+    size_t poubelle = 0xc0000;
+
+    /* CPU-side scratch buffer passed for all three host stream slots, none of
+     * which this task uses. */
+    int32_t *poubelle_cpu = malloc(SIZE * sizeof(*poubelle_cpu));
+    if (poubelle_cpu == NULL)
+    {
+        fprintf(stderr, "fpga_impl2: cannot allocate scratch buffer\n");
+        abort();
+    }
+
+    printf("T2 with %zu %zu %zu\n", ptrA, ptrB, ptrC);
+
+    /* C = A*B */
+    StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes,
+              ptrA, LMemsize,
+              poubelle, LMemsize,
+              ptrB, LMemsize,
+              poubelle, LMemsize,
+              poubelle, LMemsize,
+              ptrC, LMemsize);
+    printf("T2 finished\n");
+
+    /* The scratch buffer used to leak on every task execution.
+     * NOTE(review): assumes StreamFMA has returned only once the run is
+     * complete (the code already reports completion here) - confirm. */
+    free(poubelle_cpu);
+}
+
+/* Codelet for task 2: only an FPGA implementation (fpga_impl2) is provided.
+ * It takes two read-only buffers and one write-only buffer; no per-buffer
+ * node is requested, so the default placement applies. */
+static struct starpu_codelet cl2 =
+{
+    .fpga_funcs = {fpga_impl2},
+  
+    .nbuffers = 3,
+    .modes = {STARPU_R, STARPU_R, STARPU_W}
+    /* local by default */
+};
+
+/*
+ * Task 3 of the test pipeline (C = A + B).
+ *
+ * buffers[0], buffers[1]: input vectors; their pointer values are handed to
+ *                         the kernel as FPGA-side addresses.
+ * buffers[2]: output vector, accessed through a CPU pointer.
+ * cl_arg: unused.
+ */
+void fpga_impl3(void *buffers[], void *cl_arg)
+{
+    (void)cl_arg;
+
+    size_t   ptrA = (size_t)   STARPU_VECTOR_GET_PTR(buffers[0]); /* FPGA */
+    size_t   ptrB = (size_t)   STARPU_VECTOR_GET_PTR(buffers[1]); /* FPGA */
+    int32_t *ptrC = (int32_t*) STARPU_VECTOR_GET_PTR(buffers[2]);
+
+    int sizeBytes = SIZE * sizeof(int32_t);
+    size_t LMemsize = SIZE * sizeof(int32_t);
+
+    /* Address passed for every FPGA-side stream slot this task does not use
+     * ("poubelle" is French for trash bin). */
+    size_t poubelle = 0xc0000;
+
+    /* CPU-side scratch buffer passed for the two host stream slots this task
+     * does not use. */
+    int32_t *poubelle_cpu = malloc(SIZE * sizeof(*poubelle_cpu));
+    if (poubelle_cpu == NULL)
+    {
+        fprintf(stderr, "fpga_impl3: cannot allocate scratch buffer\n");
+        abort();
+    }
+
+    /* %p requires a void * argument. */
+    printf("T3 with %zu %zu %p\n", ptrA, ptrB, (void *) ptrC);
+
+    /* C = A+B */
+    StreamFMA(SIZE, poubelle_cpu, sizeBytes, poubelle_cpu, sizeBytes, ptrC, sizeBytes,
+              poubelle, LMemsize,
+              ptrA, LMemsize,
+              poubelle, LMemsize,
+              ptrB, LMemsize,
+              poubelle, LMemsize,
+              poubelle, LMemsize);
+    printf("T3 finished\n");
+
+    /* The scratch buffer used to leak on every task execution.
+     * NOTE(review): assumes StreamFMA has returned only once the run is
+     * complete (the code already reports completion here) - confirm. */
+    free(poubelle_cpu);
+}
+
+/* Codelet for task 3: only an FPGA implementation (fpga_impl3) is provided.
+ * It takes two read-only buffers and one write-only buffer, with an explicit
+ * per-buffer node request (specific_nodes is set): LOCAL for both inputs,
+ * CPU for the output. */
+static struct starpu_codelet cl3 =
+{
+    .fpga_funcs = {fpga_impl3},
+  
+    .nbuffers = 3,
+    .modes = {STARPU_R, STARPU_R, STARPU_W},
+    .specific_nodes = 1,
+    .nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_CPU},
+};
+
+
+
 int main(int argc, char **argv)
 {
 
@@ -104,7 +235,7 @@ int main(int argc, char **argv)
     starpu_profiling_status_set(1);
 
     struct starpu_conf conf;
-    starpu_data_handle_t handle_a, handle_b, handle_c;
+    starpu_data_handle_t handle_a, handle_b, handle_ct1, handle_ct2, handle_c;
     int ret;
     int size=1234;
 
@@ -118,9 +249,9 @@ int main(int argc, char **argv)
     STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
     
-    int a[SIZE];
-    int b[SIZE];
-    int c[SIZE];
+    int32_t a[SIZE];
+    int32_t b[SIZE];
+    int32_t c[SIZE];
 
     int i;
     for(i = 0; i < SIZE; ++i)
@@ -129,44 +260,53 @@ int main(int argc, char **argv)
         b[i] = random() % 100;
     }
 
-    starpu_vector_data_register(&handle_a, STARPU_MAIN_RAM, (uintptr_t) &a, SIZE, sizeof(int));
-    starpu_vector_data_register(&handle_b, STARPU_MAIN_RAM, (uintptr_t) &b, SIZE, sizeof(int));
-    starpu_vector_data_register(&handle_c, STARPU_MAIN_RAM, (uintptr_t) &c, SIZE, sizeof(int));
+    starpu_vector_data_register(&handle_a, STARPU_MAIN_RAM, (uintptr_t) &a, SIZE, sizeof(a[0]));
+    starpu_vector_data_register(&handle_b, STARPU_MAIN_RAM, (uintptr_t) &b, SIZE, sizeof(b[0]));
 
-    struct starpu_task *task = starpu_task_create();
-    task->cl = &cl;
-    task->handles[0] = handle_a;
-    task->handles[1] = handle_b;
-    task->handles[2] = handle_c;
-    
-    task->synchronous = 1;
-    task->destroy = 0;
-    /* submit the task to StarPU */
+    starpu_vector_data_register(&handle_ct1, -1, 0, SIZE, sizeof(c[0]));
+    starpu_vector_data_register(&handle_ct2, -1, 0, SIZE, sizeof(c[0]));
 
-    //starpu_task_destroy(task);
-    ret = starpu_task_submit(task);
-    
+    starpu_vector_data_register(&handle_c, STARPU_MAIN_RAM, (uintptr_t) &c, SIZE, sizeof(c[0]));
+
+#if 0
+    ret = starpu_task_insert(&cl, STARPU_R, handle_a, STARPU_R, handle_b, STARPU_W, handle_c, STARPU_TASK_SYNCHRONOUS, 1, 0);
     fprintf(stderr,"task submitted %d\n", ret);
+#else
+    ret = starpu_task_insert(&cl1, STARPU_R, handle_a, STARPU_R, handle_b, STARPU_W, handle_ct1, STARPU_TASK_SYNCHRONOUS, 1, 0);
+    ret = starpu_task_insert(&cl2, STARPU_R, handle_ct1, STARPU_R, handle_ct1, STARPU_W, handle_ct2, STARPU_TASK_SYNCHRONOUS, 1, 0);
+    ret = starpu_task_insert(&cl3, STARPU_R, handle_ct2, STARPU_R, handle_ct2, STARPU_W, handle_c, STARPU_TASK_SYNCHRONOUS, 1, 0);
+    fprintf(stderr,"task submitted %d\n", ret);
+#endif
     
     starpu_data_unregister(handle_a);
     starpu_data_unregister(handle_b);
     starpu_data_unregister(handle_c);
     
-    int mysize = SIZE;
-    if (mysize > 10)
-	mysize = 10;
-	for (i = 0; i < mysize; ++i) 
-	{
-		int ct1 = a[i] + b[i];
-		int ct2 = ct1 * ct1;
-		int ct3 = ct2 + ct2;
-		printf("%d == %d\n", c[i], ct3);
+    ret = EXIT_SUCCESS;
+
+    for (i = 0; i < SIZE; ++i) 
+    {
+	int ct1 = a[i] + b[i];
+	int ct2 = ct1 * ct1;
+	int ct3 = ct2 + ct2;
+
+	if (c[i] != ct3)
+	    ret = EXIT_FAILURE;
+
+	if (i < 10) {
+	    printf("%d == %d\n", c[i], ct3);
+	    if (c[i] != ct3)
+		printf("OOOPS\n");
 	}
+    }
 
 
     starpu_shutdown();
 
-    return EXIT_SUCCESS;
+    if (ret == EXIT_SUCCESS)
+	printf("OK!\n");
+
+    return ret;
 
 }