|
@@ -0,0 +1,127 @@
|
|
|
+#include<starpu.h>
|
|
|
+#include <stdio.h>
|
|
|
+#include<stdlib.h>
|
|
|
+#include <sys/time.h>
|
|
|
+// platform independent data types:
|
|
|
+#include <stdint.h>
|
|
|
+
|
|
|
+// Number of iterations
+#define NUM_ITERATIONS 10
+// Window height; defined before NUM_INPUT_STREAMS, which is expressed in terms of it
+#define WINDOW_HEIGHT 4
+// The window height is equal to the number of streams
+#define NUM_INPUT_STREAMS WINDOW_HEIGHT
+// The window width is equal to the number of tuples required to fill in a window
+#define WINDOW_WIDTH 100
+
+// The total number of elements is equal to the window height times the window width.
+// The expansion is parenthesized so it stays a single operand when the macro is
+// used inside a larger expression (e.g. `x / elements` or `elements % k`).
+#define elements (WINDOW_HEIGHT * WINDOW_WIDTH)
|
|
|
+
|
|
|
+// Time measurement: one timestamp pair per iteration, indexed by iteration number.
+struct timeval start[NUM_ITERATIONS];
+struct timeval end[NUM_ITERATIONS];
|
|
|
+
|
|
|
+/* Codelet entry points used by the codelet below; they are only declared here,
+ * their definitions are not part of this section of the file. */
+void cpu_output(void *buffers[], void *_args);
+void output_thread_aggregation(void *buffers[], void *_args);
|
|
|
+
|
|
|
+/* History-based performance model, registered under the symbol "main" and
+ * attached to the codelet through its .model field. */
+static struct starpu_perfmodel perf_model =
+{
+    .symbol = "main",
+    .type = STARPU_HISTORY_BASED
+};
|
|
|
+
|
|
|
+/*
+ * Codelet submitted by main(): it works on a single buffer accessed in
+ * read-write mode and uses perf_model as its performance model.
+ */
+static struct starpu_codelet cl = {
+    /* one data handle, read and written by the implementation */
+    .nbuffers = 1,
+    .modes = { STARPU_RW },
+    .model = &perf_model,
+    /* CPU implementation and its name */
+    .cpu_funcs = { cpu_output },
+    .cpu_funcs_name = { "cpu_output" },
+#ifdef STARPU_USE_CUDA
+    /* CUDA implementation, only compiled in when STARPU_USE_CUDA is defined */
+    .cuda_funcs = { output_thread_aggregation },
+#endif
+};
|
|
|
+
|
|
|
+
|
|
|
+/*
+ * Return the time elapsed between *begin and *finish, in microseconds.
+ * The seconds are subtracted before being scaled so the multiplication
+ * cannot overflow when time_t is a narrow type.
+ */
+static int64_t elapsed_usec(const struct timeval *begin, const struct timeval *finish)
+{
+    return ((int64_t)finish->tv_sec - (int64_t)begin->tv_sec) * 1000000
+        + ((int64_t)finish->tv_usec - (int64_t)begin->tv_usec);
+}
+
+/*
+ * Benchmark driver.
+ *
+ * For each of NUM_ITERATIONS iterations: fill a freshly allocated window with
+ * pseudo-random values, print their reference sum, then time one complete
+ * StarPU cycle (init, vector registration, one synchronous task using the
+ * codelet `cl`, unregistration, shutdown) and print the value left in
+ * aggregated_value. Finally print the accumulated time over all iterations.
+ *
+ * Returns 0 on success and 1 if the allocation, StarPU initialization or the
+ * task submission fails.
+ */
+int main(int argc, char **argv)
+{
+    /* the command line is not used */
+    (void)argc;
+    (void)argv;
+
+    for (uint32_t iterations_id = 0; iterations_id < NUM_ITERATIONS; iterations_id++) {
+
+        // dynamic allocation of the memory needed for all the elements
+        uint32_t *window = calloc(elements, sizeof *window);
+
+        // check if there's enough space for the allocation
+        if (!window) {
+            fprintf(stderr, "Allocation error for window - aborting.\n");
+            return 1;
+        }
+
+        // reference sum computed on the host, to compare against the value produced by the task
+        uint64_t ag_val = 0;
+
+        // initialization - fill in the window with random numbers:
+        for (int i = 0; i < elements; i++) {
+            window[i] = (uint32_t)(rand() % 1000);
+            ag_val += window[i];
+        }
+        // uint64_t is not necessarily unsigned long: cast to match the conversion specifier
+        printf("TEST %llu\n", (unsigned long long)ag_val);
+
+        // start time of this iteration: the measurement covers StarPU
+        // initialization, the task itself and StarPU shutdown
+        gettimeofday(&start[iterations_id], NULL);
+
+        /* initialize StarPU; nothing below can work if this fails */
+        if (starpu_init(NULL) != 0) {
+            fprintf(stderr, "StarPU initialization failed - aborting.\n");
+            free(window);
+            return 1;
+        }
+
+        /* initialize performance model */
+        starpu_perfmodel_init(&perf_model);
+
+        /* Tell StarPU to associate the "window" vector with the "vector_handle" */
+        starpu_data_handle_t vector_handle;
+        starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)window, elements, sizeof(window[0]));
+
+        /* create a synchronous task: any call to starpu_task_submit will block
+         * until it is terminated */
+        struct starpu_task *task = starpu_task_create();
+        task->synchronous = 1;
+        task->cl = &cl; /* Pointer to the codelet defined above */
+
+        /* the codelet manipulates one buffer in RW mode */
+        task->handles[0] = vector_handle;
+
+        uint64_t aggregated_value = 0;
+
+        /* an argument is passed to the codelet, beware that this is a
+         * READ-ONLY buffer and that the codelet may be given a pointer to a
+         * COPY of the argument.
+         * NOTE(review): the result is nevertheless read back from
+         * aggregated_value below, which only works if the codelet writes
+         * through this very pointer - confirm against the codelet
+         * implementations, or return the result through a registered handle. */
+        task->cl_arg = &aggregated_value;
+        task->cl_arg_size = sizeof(aggregated_value);
+
+        /* submit the task to StarPU and remember whether it was accepted */
+        int submit_status = starpu_task_submit(task);
+
+        /* StarPU does not need to manipulate the array anymore so we can stop monitoring it */
+        starpu_data_unregister(vector_handle);
+
+        /* terminate StarPU */
+        starpu_shutdown();
+
+        // stop time of this iteration, taken after StarPU has been shut down
+        gettimeofday(&end[iterations_id], NULL);
+
+        /* a rejected task never ran, so aggregated_value would be meaningless */
+        if (submit_status != 0) {
+            fprintf(stderr, "iter: %u - task submission failed (error %d) - aborting.\n",
+                    (unsigned)iterations_id, submit_status);
+            free(window);
+            return 1;
+        }
+
+        printf("iter: %u - aggregated value: %llu\n",
+               (unsigned)iterations_id, (unsigned long long)aggregated_value);
+
+        // free the memory allocated on the CPU
+        free(window);
+    }
+
+    // total time spent in the measured section, summed over all iterations
+    int64_t total_usec = 0;
+
+    for (uint32_t k = 0; k < NUM_ITERATIONS; k++) {
+        total_usec += elapsed_usec(&start[k], &end[k]);
+    }
+
+    printf("usec: %lld\n", (long long)total_usec);
+
+    return 0;
+}
|