15 years ago · 4408a54de9
--- a/tests/experiments/latency/Makefile
+++ b/tests/experiments/latency/Makefile
@@ -0,0 +1,33 @@
 
				+#
			
 
				+# StarPU
			
 
				+# Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+#
			
 
				+# This program is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# This program is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				+
			
 
				+CUDASDKDIR?=/home/gonnet/NVIDIA_CUDA_SDK/
			
 
				+CUDAINSTALLDIR=/usr/local/cuda/
			
 
				+
			
 
				+CFLAGS += -I$(CUDAINSTALLDIR)/include -Wall
			
 
				+LDFLAGS += -lcuda -L/usr/local/cuda/lib64/ -lcudart
			
 
				+LDFLAGS += -lpthread
			
 
				+
			
 
				+CFLAGS+= -g #-pg
			
 
				+#LDFLAGS+= -pg
			
 
				+
			
 
				+all:  cuda-latency
			
 
				+
			
 
				+cuda-latency:  cuda-latency.c
			
 
				+
			
 
				+clean:
			
 
				+	rm -f cuda-latency
			
 
				+	rm -f *.o
			
--- a/tests/experiments/latency/cuda-latency.c
+++ b/tests/experiments/latency/cuda-latency.c
@@ -0,0 +1,190 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <pthread.h>
			
 
				+#include <stdio.h>
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime.h>
			
 
				+
			
 
				+#include <string.h>
			
 
				+#include <math.h>
			
 
				+#include <sys/types.h>
			
 
				+#include <sys/time.h>
			
 
				+#include <pthread.h>
			
 
				+#include <signal.h>
			
 
				+
			
 
				+static pthread_t thread[2];
			
 
				+static unsigned thread_is_initialized[2];
			
 
				+
			
 
				+static pthread_cond_t cond;
			
 
				+static pthread_mutex_t mutex;
			
 
				+
			
 
				+static size_t buffer_size = 1;
			
 
				+static void *cpu_buffer;
			
 
				+static void *gpu_buffer[2];
			
 
				+
			
 
				+static pthread_cond_t cond_go;
			
 
				+static unsigned ready = 0;
			
 
				+static unsigned nready_gpu = 0;
			
 
				+
			
 
				+static unsigned niter = 100000;
			
 
				+
			
 
				+static pthread_cond_t cond_gpu;
			
 
				+static pthread_mutex_t mutex_gpu;
			
 
				+static unsigned data_is_available[2];
			
 
				+
			
 
				+static cudaStream_t stream[2];
			
 
				+
			
 
				+#define ASYNC	1
			
 
				+
			
 
				+void send_data(unsigned src, unsigned dst)
			
 
				+{
			
 
				+	/* Copy data from GPU to RAM */
			
 
				+#ifdef ASYNC
			
 
				+	cudaMemcpyAsync(cpu_buffer, gpu_buffer[src], buffer_size, cudaMemcpyDeviceToHost, stream[src]);
			
 
				+	cudaStreamSynchronize(stream[src]);
			
 
				+#else
			
 
				+	cudaMemcpy(cpu_buffer, gpu_buffer[src], buffer_size, cudaMemcpyDeviceToHost);
			
 
				+	cudaThreadSynchronize();
			
 
				+#endif
			
 
				+
			
 
				+	/* Tell the other GPU that data is in RAM */
			
 
				+	pthread_mutex_lock(&mutex_gpu);
			
 
				+	data_is_available[src] = 0;
			
 
				+	data_is_available[dst] = 1;
			
 
				+	pthread_cond_signal(&cond_gpu);
			
 
				+	pthread_mutex_unlock(&mutex_gpu);
			
 
				+	//fprintf(stderr, "SEND on %d\n", src);
			
 
				+}
			
 
				+
			
 
				+void recv_data(unsigned src, unsigned dst)
			
 
				+{
			
 
				+	/* Wait for the data to be in RAM */
			
 
				+	pthread_mutex_lock(&mutex_gpu);
			
 
				+	while (!data_is_available[dst])
			
 
				+	{
			
 
				+		pthread_cond_wait(&cond_gpu, &mutex_gpu);
			
 
				+	}
			
 
				+	pthread_mutex_unlock(&mutex_gpu);
			
 
				+	//fprintf(stderr, "RECV on %d\n", dst);
			
 
				+
			
 
				+	/* Upload data */
			
 
				+#ifdef ASYNC
			
 
				+	cudaMemcpyAsync(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyDeviceToHost, stream[dst]);
			
 
				+	cudaThreadSynchronize();
			
 
				+#else
			
 
				+	cudaMemcpy(gpu_buffer[dst], cpu_buffer, buffer_size, cudaMemcpyDeviceToHost);
			
 
				+	cudaThreadSynchronize();
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+void *launch_gpu_thread(void *arg)
			
 
				+{
			
 
				+	unsigned *idptr = arg;
			
 
				+	unsigned id = *idptr;
			
 
				+
			
 
				+	fprintf(stderr, "Initialize device %d\n", id);
			
 
				+	cudaSetDevice(id);
			
 
				+	cudaFree(0);
			
 
				+
			
 
				+	cudaMalloc(&gpu_buffer[id], buffer_size);
			
 
				+	cudaStreamCreate(&stream[id]);
			
 
				+
			
 
				+	pthread_mutex_lock(&mutex);
			
 
				+	thread_is_initialized[id] = 1;
			
 
				+	pthread_cond_signal(&cond);
			
 
				+
			
 
				+	nready_gpu++;
			
 
				+
			
 
				+	while (!ready)
			
 
				+		pthread_cond_wait(&cond_go, &mutex);
			
 
				+
			
 
				+	pthread_mutex_unlock(&mutex);
			
 
				+
			
 
				+	fprintf(stderr, "Device %d GOGO\n", id);
			
 
				+
			
 
				+	unsigned iter;
			
 
				+	for (iter = 0; iter < niter; iter++)
			
 
				+	{
			
 
				+		if (id == 0) {
			
 
				+			send_data(0, 1);
			
 
				+			recv_data(1, 0);
			
 
				+		}
			
 
				+		else {
			
 
				+			recv_data(0, 1);
			
 
				+			send_data(1, 0);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	pthread_mutex_lock(&mutex);
			
 
				+	nready_gpu--;
			
 
				+	pthread_cond_signal(&cond_go);
			
 
				+	pthread_mutex_unlock(&mutex);
			
 
				+
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	pthread_mutex_init(&mutex, NULL);
			
 
				+	pthread_cond_init(&cond, NULL);
			
 
				+	pthread_cond_init(&cond_go, NULL);
			
 
				+
			
 
				+	cudaHostAlloc(&cpu_buffer, buffer_size, cudaHostAllocPortable);
			
 
				+
			
 
				+	unsigned id;
			
 
				+	for (id = 0; id < 2; id++)
			
 
				+	{
			
 
				+		thread_is_initialized[id] = 0;
			
 
				+		pthread_create(&thread[0], NULL, launch_gpu_thread, &id);
			
 
				+
			
 
				+		pthread_mutex_lock(&mutex);
			
 
				+		while (!thread_is_initialized[id])
			
 
				+		{
			
 
				+			 pthread_cond_wait(&cond, &mutex);
			
 
				+		}
			
 
				+		pthread_mutex_unlock(&mutex);
			
 
				+	}
			
 
				+
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				+
			
 
				+	/* Start the ping pong */
			
 
				+	gettimeofday(&start, NULL);
			
 
				+
			
 
				+	pthread_mutex_lock(&mutex);
			
 
				+	ready = 1;
			
 
				+	pthread_cond_broadcast(&cond_go);
			
 
				+	pthread_mutex_unlock(&mutex);
			
 
				+
			
 
				+	/* Wait for the end of the ping pong */
			
 
				+	pthread_mutex_lock(&mutex);
			
 
				+	while (nready_gpu > 0)
			
 
				+	{
			
 
				+		pthread_cond_wait(&cond_go, &mutex);
			
 
				+	}
			
 
				+	pthread_mutex_unlock(&mutex);
			
 
				+
			
 
				+	gettimeofday(&end, NULL);
			
 
				+	
			
 
				+	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 +
			
 
				+		(end.tv_usec - start.tv_usec));
			
 
				+
			
 
				+	fprintf(stderr, "Took %.0f ms for %d iterations\n", timing/1000, niter);
			
 
				+	fprintf(stderr, "Latency: %.2f us\n", timing/(2*niter));
			
 
				+
			
 
				+	return 0;
			
 
				+}