generate.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
  1. import sys
  2. import os
  3. import subprocess
  4. import string
  5. import random
# Generates random CPU programs for use with PinTools (.c files),
# CUDA programs (READY) (.cu files), OpenCL programs (NOT READY) (.cl files),
# and CPU+CUDA programs (.cu files) with kernel time measurement and evaluation of proper operation.
  9. def malloc_host_data(cudaoutfile, coutfile, num_int_arrays, num_float_arrays, size_of_arrays):
  10. cudaoutfile.write("int size="+str(size_of_arrays)+";\n")
  11. cudaoutfile.write("int intBytes = size*sizeof(int);\n")
  12. cudaoutfile.write("int floatBytes = size*sizeof(float);\n")
  13. for i in range(num_int_arrays):
  14. cudaoutfile.write("int *A"+str(i)+";\n")
  15. cudaoutfile.write("A"+str(i)+" = (int *)malloc(intBytes);\n")
  16. for i in range(num_float_arrays):
  17. cudaoutfile.write("float *B"+str(i)+";\n")
  18. cudaoutfile.write("B"+str(i)+" = (float *)malloc(floatBytes);\n")
  19. coutfile.write("int size="+str(size_of_arrays)+";\n")
  20. coutfile.write("int intBytes = size*sizeof(int);\n")
  21. coutfile.write("int floatBytes = size*sizeof(float);\n")
  22. for i in range(num_int_arrays):
  23. coutfile.write("int *A"+str(i)+";\n")
  24. coutfile.write("A"+str(i)+" = (int *)malloc(intBytes);\n")
  25. for i in range(num_float_arrays):
  26. coutfile.write("float *B"+str(i)+";\n")
  27. coutfile.write("B"+str(i)+" = (float *)malloc(floatBytes);\n")
  28. def init_arrays(cudaoutfile, coutfile, num_int_arrays, num_float_arrays, size_of_arrays):
  29. cudaoutfile.write("for(int i=0;i<"+str(size_of_arrays)+";i++){\n")
  30. coutfile.write("for(int i=0;i<"+str(size_of_arrays)+";i++){\n")
  31. for i in range(num_int_arrays):
  32. operation = random.randrange(2)
  33. if operation == 0:
  34. line = "A"+str(i)+"[i] = "+str(random.randrange(100))+"+i+1;\n"
  35. cudaoutfile.write(line)
  36. coutfile.write(line)
  37. if operation == 1:
  38. line = "A"+str(i)+"[i] = "+str(random.randrange(100))+"*i+1;\n"
  39. cudaoutfile.write(line)
  40. coutfile.write(line)
  41. for i in range(num_float_arrays):
  42. operation = random.randrange(2)
  43. if operation == 0:
  44. line = "B"+str(i)+"[i] = "+str(random.uniform(0,100))+"+i+1;\n"
  45. cudaoutfile.write(line)
  46. coutfile.write(line)
  47. if operation == 1:
  48. line = "B"+str(i)+"[i] = "+str(random.uniform(0,100))+"*i+1;\n"
  49. cudaoutfile.write(line)
  50. coutfile.write(line)
  51. cudaoutfile.write("}\n")
  52. coutfile.write("}\n")
  53. def const_init_arrays_for_static_dataset(outfile, num_int_arrays, num_float_arrays, size_of_arrays):
  54. for i in range(num_int_arrays):
  55. outfile.write("int A"+str(i)+"[10];\n")
  56. for i in range(num_float_arrays):
  57. outfile.write("float B"+str(i)+"[10];\n")
  58. def copy_data_to_device(outfile, num_int_arrays, num_float_arrays, size_of_arrays):
  59. #outfile.write("int size="+str(size_of_arrays)+";\n")
  60. #outfile.write("int intBytes = size*sizeof(int);\n")
  61. #outfile.write("int floatBytes = size*sizeof(float);\n")
  62. for i in range(num_int_arrays):
  63. outfile.write("int *d_A"+str(i)+";\n")
  64. outfile.write("cudaMalloc((void **)&d_A"+str(i)+","+str(size_of_arrays)+"*sizeof(int));\n")
  65. outfile.write("cudaMemcpy(d_A"+str(i)+",A"+str(i)+","+str(size_of_arrays)+"*sizeof(int),cudaMemcpyHostToDevice);\n")
  66. for i in range(num_float_arrays):
  67. outfile.write("float *d_B"+str(i)+";\n")
  68. outfile.write("cudaMalloc((void **)&d_B"+str(i)+","+str(size_of_arrays)+"*sizeof(float));\n")
  69. outfile.write("cudaMemcpy(d_B"+str(i)+",B"+str(i)+","+str(size_of_arrays)+"*sizeof(float),cudaMemcpyHostToDevice);\n")
  70. def make_kernel(cudaoutfile, coutfile, cloutfile, num_int_arrays, num_float_arrays, size_of_arrays):
  71. line_gpu = "__global__ void kernel_gpu"
  72. line_cpu = "void kernel_cpu"
  73. line = "("
  74. for i in range(num_int_arrays):
  75. line = line + "int *A"+str(i)+","
  76. for i in range(num_float_arrays):
  77. line = line + "float *B"+str(i)+","
  78. #generate the computation part
  79. kernel_body = ""
  80. #select output arrays
  81. if num_int_arrays > 1:
  82. output_int_arrays = random.randrange(num_int_arrays-1)+1
  83. elif num_int_arrays == 1:
  84. output_int_arrays = 1
  85. else:
  86. output_int_arrays = 0
  87. if num_float_arrays > 1:
  88. output_float_arrays = random.randrange(num_float_arrays-1)+1
  89. elif num_float_arrays == 1:
  90. output_float_arrays = 1
  91. else:
  92. output_float_arrays = 0
  93. for i in range(output_int_arrays):
  94. kernel_body = kernel_body + "A"+str(i)+"[i] = "
  95. input_int_arrays = random.randrange(4)+1
  96. for j in range(input_int_arrays):
  97. if num_int_arrays > 1:
  98. out_array = random.randrange(num_int_arrays-output_int_arrays)
  99. else:
  100. out_array = -1;
  101. kernel_body = kernel_body + "A"+str(output_int_arrays+out_array)+"[i]"
  102. operation = random.randrange(4)
  103. if operation == 0:
  104. kernel_body = kernel_body+"+"
  105. if operation == 1:
  106. kernel_body = kernel_body+"-"
  107. if operation == 2:
  108. kernel_body = kernel_body+"*"
  109. if operation == 3:
  110. kernel_body = kernel_body+"/"
  111. kernel_body = kernel_body[:-1]+";\n"
  112. for i in range(output_float_arrays):
  113. kernel_body = kernel_body + "B"+str(i)+"[i] = "
  114. input_float_arrays = random.randrange(4)+1
  115. for j in range(input_float_arrays):
  116. if num_float_arrays > 1:
  117. out_array = random.randrange(num_float_arrays-output_float_arrays)
  118. else:
  119. out_array = -1;
  120. kernel_body = kernel_body + "B"+str(output_float_arrays+out_array)+"[i]"
  121. operation = random.randrange(4)
  122. if operation == 0:
  123. kernel_body = kernel_body+"+"
  124. if operation == 1:
  125. kernel_body = kernel_body+"-"
  126. if operation == 2:
  127. kernel_body = kernel_body+"*"
  128. if operation == 3:
  129. kernel_body = kernel_body+"/"
  130. kernel_body = kernel_body[:-1]+";\n"
  131. # gpu kernel
  132. cudaoutfile.write(line_gpu+line+"int N){\n")
  133. cudaoutfile.write("int i = blockIdx.x * blockDim.x + threadIdx.x;\n")
  134. cudaoutfile.write("if (i < N){\n")
  135. cudaoutfile.write(kernel_body)
  136. cudaoutfile.write("}\n")
  137. cudaoutfile.write("}\n")
  138. cudaoutfile.write("\n")
  139. # cpu kernel
  140. cudaoutfile.write(line_cpu+line+"int N){\n")
  141. cudaoutfile.write("for(int i=0;i<N;i++){\n")
  142. cudaoutfile.write(kernel_body)
  143. cudaoutfile.write("}\n")
  144. cudaoutfile.write("}\n")
  145. coutfile.write(line_cpu+line+"int N){\n")
  146. coutfile.write("for(int i=0;i<N;i++){\n")
  147. coutfile.write(kernel_body)
  148. coutfile.write("}\n")
  149. coutfile.write("}\n")
  150. cloutfile.write("for(int i=0;i<N;i++){\n")
  151. cloutfile.write(kernel_body)
  152. cloutfile.write("}\n")
  153. def call_kernels(cudaoutfile, coutfile, num_int_arrays, num_float_arrays, size_of_arrays):
  154. # the cuda kernel will have block grid size that can run the whole array in parallel
  155. cudaoutfile.write("int n = "+str(int(size_of_arrays/256)+1)+"*256;\n")
  156. cudaoutfile.write("int block_size = 256;\n")
  157. cudaoutfile.write("int block_no = n/block_size;\n")
  158. cudaoutfile.write("struct timeval time0,time1;\n")
  159. cudaoutfile.write("gettimeofday(&time0,NULL);\n")
  160. line = "kernel_gpu<<<block_no,block_size>>>("
  161. for i in range(num_int_arrays):
  162. line = line + "d_A"+str(i)+","
  163. for i in range(num_float_arrays):
  164. line = line + "d_B"+str(i)+","
  165. cudaoutfile.write(line+str(size_of_arrays)+");\n")
  166. cudaoutfile.write("cudaThreadSynchronize();\n")
  167. cudaoutfile.write("gettimeofday(&time1,NULL);\n")
  168. cudaoutfile.write("double totaltime10 = (time1.tv_sec*1000000.0 + time1.tv_usec) - (time0.tv_sec*1000000.0 + time0.tv_usec);\n")
  169. cudaoutfile.write("fprintf(stderr, \"GPU time: %lf msecs \", (totaltime10)/1000.0F);\n")
  170. #cudaoutfile.write("struct timeval time0,time1;\n")
  171. cudaoutfile.write("gettimeofday(&time0,NULL);\n")
  172. coutfile.write("struct timeval time0,time1;\n")
  173. coutfile.write("gettimeofday(&time0,NULL);\n")
  174. # Pin tools annotations
  175. coutfile.write("FILE *file_for_block_of_interest = fopen(\"./profile_in_block.txt\",\"w\");\n")
  176. coutfile.write("if(file_for_block_of_interest) {\n")
  177. coutfile.write("char Buf[2] = \"1\";\n")
  178. coutfile.write("fwrite(Buf, 1, 1, file_for_block_of_interest);\n")
  179. coutfile.write("fclose(file_for_block_of_interest);}\n")
  180. line = "kernel_cpu("
  181. for i in range(num_int_arrays):
  182. line = line + "A"+str(i)+","
  183. for i in range(num_float_arrays):
  184. line = line + "B"+str(i)+","
  185. cudaoutfile.write(line+str(size_of_arrays)+");\n")
  186. coutfile.write(line+str(size_of_arrays)+");\n")
  187. # Pin tools annotations
  188. coutfile.write("file_for_block_of_interest = fopen(\"./profile_in_block.txt\",\"w\");\n")
  189. coutfile.write("if(file_for_block_of_interest) {\n")
  190. coutfile.write("char Buf[2] = \"0\";\n")
  191. coutfile.write("fwrite(Buf, 1, 1, file_for_block_of_interest);\n")
  192. coutfile.write("fclose(file_for_block_of_interest);}\n")
  193. cudaoutfile.write("gettimeofday(&time1,NULL);\n")
  194. cudaoutfile.write("totaltime10 = (time1.tv_sec*1000000.0 + time1.tv_usec) - (time0.tv_sec*1000000.0 + time0.tv_usec);\n")
  195. cudaoutfile.write("fprintf(stderr, \"CPU time: %lf msecs \", (totaltime10)/1000.0F);\n")
  196. coutfile.write("gettimeofday(&time1,NULL);\n")
  197. coutfile.write("double totaltime10 = (time1.tv_sec*1000000.0 + time1.tv_usec) - (time0.tv_sec*1000000.0 + time0.tv_usec);\n")
  198. coutfile.write("fprintf(stderr, \"CPU time: %lf msecs \", (totaltime10)/1000.0F); \n")
  199. def make_tests(outfile, num_int_arrays, num_float_arrays, size_of_arrays):
  200. #outfile.write("int size="+str(size_of_arrays)+";\n")
  201. #outfile.write("int intBytes = size*sizeof(int);\n")
  202. #outfile.write("int floatBytes = size*sizeof(float);\n")
  203. for i in range(num_int_arrays):
  204. outfile.write("int *testA"+str(i)+";\n")
  205. outfile.write("testA"+str(i)+" = (int *)malloc(intBytes);\n")
  206. outfile.write("cudaMemcpy(testA"+str(i)+",d_A"+str(i)+","+str(size_of_arrays)+"*sizeof(int),cudaMemcpyDeviceToHost);\n")
  207. for i in range(num_float_arrays):
  208. outfile.write("float *testB"+str(i)+";\n")
  209. outfile.write("testB"+str(i)+" = (float *)malloc(floatBytes);\n")
  210. outfile.write("cudaMemcpy(testB"+str(i)+",d_B"+str(i)+","+str(size_of_arrays)+"*sizeof(int),cudaMemcpyDeviceToHost);\n")
  211. outfile.write("for(int i=0;i<"+str(size_of_arrays)+";i++){\n")
  212. for i in range(num_int_arrays):
  213. outfile.write("if (A"+str(i)+"[i] != testA"+str(i)+"[i]) {\n")
  214. outfile.write("printf(\"Invalid kernel \");\n")
  215. outfile.write("break;}\n")
  216. #for i in range(num_float_arrays):
  217. # outfile.write("if (int(B"+str(i)+"[i]) != int(testB"+str(i)+"[i])) {\n")
  218. # outfile.write("printf(\"Invalid kernel\\n \");\n")
  219. # outfile.write("break;}\n")
  220. outfile.write("}\n")
  221. def free_pointers(cudaoutfile, coutfile, num_int_arrays, num_float_arrays, size_of_arrays):
  222. for i in range(num_int_arrays):
  223. coutfile.write("free(A"+str(i)+");\n")
  224. cudaoutfile.write("free(A"+str(i)+");\n")
  225. cudaoutfile.write("free(testA"+str(i)+");\n")
  226. cudaoutfile.write("cudaFree(d_A"+str(i)+");\n")
  227. for i in range(num_float_arrays):
  228. coutfile.write("free(B"+str(i)+");\n")
  229. cudaoutfile.write("free(B"+str(i)+");\n")
  230. cudaoutfile.write("free(testB"+str(i)+");\n")
  231. cudaoutfile.write("cudaFree(d_B"+str(i)+");\n")
  232. num_of_programs = sys.argv[1]
  233. output_directory = sys.argv[2]
  234. for i in range (int(num_of_programs)):
  235. cudafile = output_directory+"/cudatest"+str(i)+".cu"
  236. cfile = output_directory+"/ctest"+str(i)+".c"
  237. clfile = output_directory+"/test"+str(i)+".cl"
  238. with open(cudafile,"a") as cudaoutfile:
  239. with open(cfile,"a") as coutfile:
  240. with open(clfile,"a") as cloutfile:
  241. cudaoutfile.write("#include <iostream>\n")
  242. cudaoutfile.write("#include <cuda.h>\n")
  243. cudaoutfile.write("#include <stdio.h>\n")
  244. cudaoutfile.write("#include <stdlib.h>\n")
  245. cudaoutfile.write("#include <sys/time.h>\n")
  246. cudaoutfile.write("using namespace std;\n")
  247. cudaoutfile.write("\n")
  248. coutfile.write("#include <stdio.h>\n")
  249. coutfile.write("#include <stdlib.h>\n")
  250. coutfile.write("#include <sys/time.h>\n")
  251. #coutfile.write("using namespace std;\n")
  252. coutfile.write("\n")
  253. num_int_arrays = random.randrange(10)
  254. num_float_arrays = random.randrange(10)
  255. size_of_arrays = random.randrange(1000000)
  256. cloutfile.write("__kernel void memset_kernel(){\n")
  257. const_init_arrays_for_static_dataset(cloutfile, num_int_arrays,num_float_arrays,size_of_arrays)
  258. # 1 for only c, 2 for cuda
  259. make_kernel(cudaoutfile, coutfile, cloutfile, num_int_arrays,num_float_arrays,size_of_arrays)
  260. cudaoutfile.write("\n")
  261. cudaoutfile.write("int main(int argc,char **argv) {\n")
  262. coutfile.write("\n")
  263. coutfile.write("int main(int argc,char **argv) {\n")
  264. cudaoutfile.write("fprintf(stderr, \""+str(i)+" \");\n")
  265. malloc_host_data(cudaoutfile, coutfile, num_int_arrays,num_float_arrays,size_of_arrays)
  266. init_arrays(cudaoutfile, coutfile, num_int_arrays,num_float_arrays,size_of_arrays)
  267. copy_data_to_device(cudaoutfile, num_int_arrays,num_float_arrays,size_of_arrays)
  268. # 1 for only c, 2 for cuda
  269. call_kernels(cudaoutfile, coutfile, num_int_arrays,num_float_arrays,size_of_arrays)
  270. make_tests(cudaoutfile, num_int_arrays,num_float_arrays,size_of_arrays)
  271. free_pointers(cudaoutfile, coutfile, num_int_arrays,num_float_arrays,size_of_arrays)
  272. cudaoutfile.write("printf(\"\\n\");")
  273. coutfile.write("printf(\"\\n\");")
  274. cudaoutfile.write("return 0; }")
  275. coutfile.write("return 0; }")
  276. cloutfile.write("}")