|
@@ -0,0 +1,317 @@
|
|
|
+import sys
|
|
|
+import os
|
|
|
+import subprocess
|
|
|
+import string
|
|
|
+import random
|
|
|
+
|
|
|
+# Generates random CPU programs for use with PinTools (.c files),
|
|
|
+# CUDA programs (READY) (.cu files), OPENCL (NOT READY) programs (.cl files),
|
|
|
+# CPU+CUDA (.cu files) with kernel time measurement and evaluation of proper operation
|
|
|
+
|
|
|
def malloc_host_data(cudaoutfile, coutfile, num_int_arrays, num_float_arrays, size_of_arrays):
    """Emit the C statements that declare and allocate the host arrays.

    The generated text defines ``size``, ``intBytes`` and ``floatBytes`` and
    then, for each array, a pointer declaration followed by a ``malloc``
    call (``int *A<k>`` for integer arrays, ``float *B<k>`` for float
    arrays).  The identical statements are written to ``cudaoutfile`` first
    and to ``coutfile`` afterwards.
    """
    statements = [
        "int size=" + str(size_of_arrays) + ";\n",
        "int intBytes = size*sizeof(int);\n",
        "int floatBytes = size*sizeof(float);\n",
    ]
    for k in range(num_int_arrays):
        name = "A" + str(k)
        statements.append("int *" + name + ";\n")
        statements.append(name + " = (int *)malloc(intBytes);\n")
    for k in range(num_float_arrays):
        name = "B" + str(k)
        statements.append("float *" + name + ";\n")
        statements.append(name + " = (float *)malloc(floatBytes);\n")

    # Both generated programs receive exactly the same allocation code.
    for target in (cudaoutfile, coutfile):
        for statement in statements:
            target.write(statement)
|
|
|
+
|
|
|
def init_arrays(cudaoutfile, coutfile, num_int_arrays, num_float_arrays, size_of_arrays):
    """Emit a C ``for`` loop that fills every host array.

    Each array element is set to ``<constant> + i + 1`` or
    ``<constant> * i + 1``; the operator and the constant are drawn at
    random per array (an integer below 100 for ``A`` arrays, a float in
    [0, 100] for ``B`` arrays).  Every generated line goes to
    ``cudaoutfile`` and then to ``coutfile`` so both programs start from the
    same data.
    """
    def emit(text):
        cudaoutfile.write(text)
        coutfile.write(text)

    emit("for(int i=0;i<" + str(size_of_arrays) + ";i++){\n")

    for k in range(num_int_arrays):
        # The operator is drawn before the constant; keep that order so a
        # seeded run produces the same program.
        symbol = "+" if random.randrange(2) == 0 else "*"
        constant = str(random.randrange(100))
        emit("A" + str(k) + "[i] = " + constant + symbol + "i+1;\n")

    for k in range(num_float_arrays):
        symbol = "+" if random.randrange(2) == 0 else "*"
        constant = str(random.uniform(0, 100))
        emit("B" + str(k) + "[i] = " + constant + symbol + "i+1;\n")

    emit("}\n")
|
|
|
+
|
|
|
def const_init_arrays_for_static_dataset(outfile, num_int_arrays, num_float_arrays, size_of_arrays):
    """Write fixed-size array declarations to ``outfile``.

    One ``int A<k>[10];`` line is written per integer array, followed by one
    ``float B<k>[10];`` line per float array.  ``size_of_arrays`` is accepted
    for signature parity with the other generators but is not used; the
    declared length is always 10.
    """
    for prefix, count in (("int A", num_int_arrays), ("float B", num_float_arrays)):
        for k in range(count):
            outfile.write(prefix + str(k) + "[10];\n")
|
|
|
+
|
|
|
def copy_data_to_device(outfile, num_int_arrays, num_float_arrays, size_of_arrays):
    """Emit the CUDA code that mirrors every host array on the device.

    For each host array ``A<k>`` / ``B<k>`` the generated text declares a
    device pointer ``d_A<k>`` / ``d_B<k>``, allocates it with ``cudaMalloc``
    and fills it from the host array with a host-to-device ``cudaMemcpy``.
    Integer arrays are handled first, then float arrays.
    """
    element_count = str(size_of_arrays)
    groups = (("int", "A", num_int_arrays), ("float", "B", num_float_arrays))
    for ctype, letter, total in groups:
        byte_expr = element_count + "*sizeof(" + ctype + ")"
        for k in range(total):
            host = letter + str(k)
            device = "d_" + host
            outfile.write(ctype + " *" + device + ";\n")
            outfile.write("cudaMalloc((void **)&" + device + "," + byte_expr + ");\n")
            outfile.write("cudaMemcpy(" + device + "," + host + "," + byte_expr + ",cudaMemcpyHostToDevice);\n")
|
|
|
+
|
|
|
def make_kernel(cudaoutfile, coutfile, cloutfile, num_int_arrays, num_float_arrays, size_of_arrays):
    """Generate a random element-wise kernel and write it in three flavours.

    The kernel takes every ``A<k>`` (int) and ``B<k>`` (float) array plus an
    element count ``N``.  The lowest-numbered arrays of each type are chosen
    as outputs; each output element is assigned an expression made of one to
    four randomly chosen arrays joined by random ``+ - * /`` operators.  When
    a type has more than one array, the operands are taken only from the
    arrays that are not outputs.

    Written text:
      * ``cudaoutfile`` - a ``__global__ kernel_gpu`` followed by a
        sequential ``kernel_cpu`` with the same body;
      * ``coutfile``    - the ``kernel_cpu`` function only;
      * ``cloutfile``   - the bare ``for`` loop around the body.

    ``size_of_arrays`` is not used here.
    """
    operators = "+-*/"

    def pick_output_count(total):
        # With several arrays at least one is left over as an input.
        if total > 1:
            return random.randrange(total - 1) + 1
        return 1 if total == 1 else 0

    def build_statements(letter, total, outputs):
        pieces = []
        for dest in range(outputs):
            pieces.append(letter + str(dest) + "[i] = ")
            for _ in range(random.randrange(4) + 1):
                # A lone array is both output and input (offset -1 -> index 0).
                offset = random.randrange(total - outputs) if total > 1 else -1
                pieces.append(letter + str(outputs + offset) + "[i]")
                pieces.append(operators[random.randrange(4)])
            # The trailing operator is replaced by the statement terminator.
            pieces[-1] = ";\n"
        return "".join(pieces)

    # Both output counts are drawn before any expression is generated.
    int_outputs = pick_output_count(num_int_arrays)
    float_outputs = pick_output_count(num_float_arrays)
    kernel_body = build_statements("A", num_int_arrays, int_outputs)
    kernel_body = kernel_body + build_statements("B", num_float_arrays, float_outputs)

    params = "("
    params += "".join("int *A" + str(k) + "," for k in range(num_int_arrays))
    params += "".join("float *B" + str(k) + "," for k in range(num_float_arrays))
    params += "int N){\n"

    loop_open = "for(int i=0;i<N;i++){\n"
    gpu_chunks = [
        "__global__ void kernel_gpu" + params,
        "int i = blockIdx.x * blockDim.x + threadIdx.x;\n",
        "if (i < N){\n",
        kernel_body,
        "}\n",
        "}\n",
        "\n",
    ]
    cpu_chunks = ["void kernel_cpu" + params, loop_open, kernel_body, "}\n", "}\n"]
    cl_chunks = [loop_open, kernel_body, "}\n"]

    plan = (
        (cudaoutfile, gpu_chunks + cpu_chunks),
        (coutfile, cpu_chunks),
        (cloutfile, cl_chunks),
    )
    for target, chunks in plan:
        for chunk in chunks:
            target.write(chunk)
|
|
|
+
|
|
|
+def call_kernels(cudaoutfile, coutfile, num_int_arrays, num_float_arrays, size_of_arrays):
|
|
|
+
|
|
|
+ # the cuda kernel will have block grid size that can run the whole array in parallel
|
|
|
+ cudaoutfile.write("int n = "+str(int(size_of_arrays/256)+1)+"*256;\n")
|
|
|
+ cudaoutfile.write("int block_size = 256;\n")
|
|
|
+ cudaoutfile.write("int block_no = n/block_size;\n")
|
|
|
+ cudaoutfile.write("struct timeval time0,time1;\n")
|
|
|
+ cudaoutfile.write("gettimeofday(&time0,NULL);\n")
|
|
|
+
|
|
|
+ line = "kernel_gpu<<<block_no,block_size>>>("
|
|
|
+ for i in range(num_int_arrays):
|
|
|
+ line = line + "d_A"+str(i)+","
|
|
|
+ for i in range(num_float_arrays):
|
|
|
+ line = line + "d_B"+str(i)+","
|
|
|
+ cudaoutfile.write(line+str(size_of_arrays)+");\n")
|
|
|
+ cudaoutfile.write("cudaThreadSynchronize();\n")
|
|
|
+
|
|
|
+ cudaoutfile.write("gettimeofday(&time1,NULL);\n")
|
|
|
+ cudaoutfile.write("double totaltime10 = (time1.tv_sec*1000000.0 + time1.tv_usec) - (time0.tv_sec*1000000.0 + time0.tv_usec);\n")
|
|
|
+ cudaoutfile.write("fprintf(stderr, \"GPU time: %lf msecs \", (totaltime10)/1000.0F);\n")
|
|
|
+
|
|
|
+ #cudaoutfile.write("struct timeval time0,time1;\n")
|
|
|
+ cudaoutfile.write("gettimeofday(&time0,NULL);\n")
|
|
|
+
|
|
|
+ coutfile.write("struct timeval time0,time1;\n")
|
|
|
+ coutfile.write("gettimeofday(&time0,NULL);\n")
|
|
|
+
|
|
|
+ # Pin tools annotations
|
|
|
+ coutfile.write("FILE *file_for_block_of_interest = fopen(\"./profile_in_block.txt\",\"w\");\n")
|
|
|
+ coutfile.write("if(file_for_block_of_interest) {\n")
|
|
|
+ coutfile.write("char Buf[2] = \"1\";\n")
|
|
|
+ coutfile.write("fwrite(Buf, 1, 1, file_for_block_of_interest);\n")
|
|
|
+ coutfile.write("fclose(file_for_block_of_interest);}\n")
|
|
|
+
|
|
|
+ line = "kernel_cpu("
|
|
|
+ for i in range(num_int_arrays):
|
|
|
+ line = line + "A"+str(i)+","
|
|
|
+ for i in range(num_float_arrays):
|
|
|
+ line = line + "B"+str(i)+","
|
|
|
+ cudaoutfile.write(line+str(size_of_arrays)+");\n")
|
|
|
+ coutfile.write(line+str(size_of_arrays)+");\n")
|
|
|
+
|
|
|
+ # Pin tools annotations
|
|
|
+ coutfile.write("file_for_block_of_interest = fopen(\"./profile_in_block.txt\",\"w\");\n")
|
|
|
+ coutfile.write("if(file_for_block_of_interest) {\n")
|
|
|
+ coutfile.write("char Buf[2] = \"0\";\n")
|
|
|
+ coutfile.write("fwrite(Buf, 1, 1, file_for_block_of_interest);\n")
|
|
|
+ coutfile.write("fclose(file_for_block_of_interest);}\n")
|
|
|
+
|
|
|
+ cudaoutfile.write("gettimeofday(&time1,NULL);\n")
|
|
|
+ cudaoutfile.write("totaltime10 = (time1.tv_sec*1000000.0 + time1.tv_usec) - (time0.tv_sec*1000000.0 + time0.tv_usec);\n")
|
|
|
+ cudaoutfile.write("fprintf(stderr, \"CPU time: %lf msecs \", (totaltime10)/1000.0F);\n")
|
|
|
+
|
|
|
+ coutfile.write("gettimeofday(&time1,NULL);\n")
|
|
|
+ coutfile.write("double totaltime10 = (time1.tv_sec*1000000.0 + time1.tv_usec) - (time0.tv_sec*1000000.0 + time0.tv_usec);\n")
|
|
|
+ coutfile.write("fprintf(stderr, \"CPU time: %lf msecs \", (totaltime10)/1000.0F); \n")
|
|
|
+
|
|
|
def make_tests(outfile, num_int_arrays, num_float_arrays, size_of_arrays):
    """Emit the code that checks the GPU results against the CPU results.

    For every array a host buffer ``testA<k>`` / ``testB<k>`` is allocated
    and filled from the matching device array with a device-to-host
    ``cudaMemcpy``.  A loop then compares each integer host array ``A<k>``
    with ``testA<k>`` element by element and prints "Invalid kernel " on the
    first mismatch.

    The generated code uses ``intBytes`` and ``floatBytes``, which must
    already be declared earlier in the generated program.  Float arrays are
    copied back but not compared.
    """
    element_count = str(size_of_arrays)

    for k in range(num_int_arrays):
        name = "testA" + str(k)
        outfile.write("int *" + name + ";\n")
        outfile.write(name + " = (int *)malloc(intBytes);\n")
        outfile.write("cudaMemcpy(" + name + ",d_A" + str(k) + "," + element_count
                      + "*sizeof(int),cudaMemcpyDeviceToHost);\n")

    for k in range(num_float_arrays):
        name = "testB" + str(k)
        outfile.write("float *" + name + ";\n")
        outfile.write(name + " = (float *)malloc(floatBytes);\n")
        # Float buffers are sized with sizeof(float), matching their
        # allocation and the host-to-device copy.
        outfile.write("cudaMemcpy(" + name + ",d_B" + str(k) + "," + element_count
                      + "*sizeof(float),cudaMemcpyDeviceToHost);\n")

    outfile.write("for(int i=0;i<" + element_count + ";i++){\n")
    for k in range(num_int_arrays):
        outfile.write("if (A" + str(k) + "[i] != testA" + str(k) + "[i]) {\n")
        outfile.write("printf(\"Invalid kernel \");\n")
        outfile.write("break;}\n")
    outfile.write("}\n")
|
|
|
+
|
|
|
def free_pointers(cudaoutfile, coutfile, num_int_arrays, num_float_arrays, size_of_arrays):
    """Emit the clean-up code for every array.

    ``coutfile`` gets a ``free`` call for each host array.  ``cudaoutfile``
    gets, per array, a ``free`` for the host array, a ``free`` for its
    ``test`` copy and a ``cudaFree`` for the ``d_`` device array.
    ``size_of_arrays`` is not used.
    """
    for letter, total in (("A", num_int_arrays), ("B", num_float_arrays)):
        for k in range(total):
            name = letter + str(k)
            release_host = "free(" + name + ");\n"
            coutfile.write(release_host)
            cudaoutfile.write(release_host)
            cudaoutfile.write("free(test" + name + ");\n")
            cudaoutfile.write("cudaFree(d_" + name + ");\n")
|
|
|
+
|
|
|
def main(argv=None):
    """Generate the requested number of random test programs.

    ``argv`` defaults to ``sys.argv``; element 1 is the number of programs
    and element 2 the output directory.  For every program index ``i`` three
    files are written into that directory: ``cudatest<i>.cu`` (CUDA program
    with GPU and CPU kernels plus a result check), ``ctest<i>.c`` (CPU-only
    program) and ``test<i>.cl`` (OpenCL kernel skeleton).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.exit("usage: <script> <num_of_programs> <output_directory>")

    num_of_programs = int(argv[1])
    output_directory = argv[2]

    for i in range(num_of_programs):
        cudafile = os.path.join(output_directory, "cudatest" + str(i) + ".cu")
        cfile = os.path.join(output_directory, "ctest" + str(i) + ".c")
        clfile = os.path.join(output_directory, "test" + str(i) + ".cl")

        # Open for writing, not appending: each run must produce one complete
        # program per file, otherwise a re-run would append a second program
        # to the existing file.
        with open(cudafile, "w") as cudaoutfile, \
                open(cfile, "w") as coutfile, \
                open(clfile, "w") as cloutfile:

            cudaoutfile.write("#include <iostream>\n")
            cudaoutfile.write("#include <cuda.h>\n")
            cudaoutfile.write("#include <stdio.h>\n")
            cudaoutfile.write("#include <stdlib.h>\n")
            cudaoutfile.write("#include <sys/time.h>\n")
            cudaoutfile.write("using namespace std;\n")
            cudaoutfile.write("\n")

            coutfile.write("#include <stdio.h>\n")
            coutfile.write("#include <stdlib.h>\n")
            coutfile.write("#include <sys/time.h>\n")
            coutfile.write("\n")

            # Random shape of this program: array counts and element count.
            num_int_arrays = random.randrange(10)
            num_float_arrays = random.randrange(10)
            size_of_arrays = random.randrange(1000000)

            cloutfile.write("__kernel void memset_kernel(){\n")
            const_init_arrays_for_static_dataset(
                cloutfile, num_int_arrays, num_float_arrays, size_of_arrays)

            # Kernel definitions come before main() in the generated files.
            make_kernel(cudaoutfile, coutfile, cloutfile,
                        num_int_arrays, num_float_arrays, size_of_arrays)

            cudaoutfile.write("\n")
            cudaoutfile.write("int main(int argc,char **argv) {\n")
            coutfile.write("\n")
            coutfile.write("int main(int argc,char **argv) {\n")

            # The CUDA program prints its own index first on stderr.
            cudaoutfile.write("fprintf(stderr, \"" + str(i) + " \");\n")

            malloc_host_data(cudaoutfile, coutfile,
                             num_int_arrays, num_float_arrays, size_of_arrays)
            init_arrays(cudaoutfile, coutfile,
                        num_int_arrays, num_float_arrays, size_of_arrays)
            copy_data_to_device(cudaoutfile,
                                num_int_arrays, num_float_arrays, size_of_arrays)

            call_kernels(cudaoutfile, coutfile,
                         num_int_arrays, num_float_arrays, size_of_arrays)
            make_tests(cudaoutfile,
                       num_int_arrays, num_float_arrays, size_of_arrays)
            free_pointers(cudaoutfile, coutfile,
                          num_int_arrays, num_float_arrays, size_of_arrays)

            cudaoutfile.write("printf(\"\\n\");")
            coutfile.write("printf(\"\\n\");")

            cudaoutfile.write("return 0; }")
            coutfile.write("return 0; }")
            cloutfile.write("}")


if __name__ == "__main__":
    main()
|