// cudatest115.cu (4.9 KB)

#include <cuda.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <iostream>

using namespace std;
  7. __global__ void kernel_gpu(int *A0,int *A1,int *A2,int *A3,int *A4,int *A5,int *A6,int *A7,float *B0,int N){
  8. int i = blockIdx.x * blockDim.x + threadIdx.x;
  9. if (i < N){
  10. A0[i] = A7[i];
  11. A1[i] = A7[i];
  12. A2[i] = A4[i]-A7[i]-A6[i];
  13. B0[i] = B0[i]/B0[i]*B0[i]+B0[i];
  14. }
  15. }
  16. void kernel_cpu(int *A0,int *A1,int *A2,int *A3,int *A4,int *A5,int *A6,int *A7,float *B0,int N){
  17. for(int i=0;i<N;i++){
  18. A0[i] = A7[i];
  19. A1[i] = A7[i];
  20. A2[i] = A4[i]-A7[i]-A6[i];
  21. B0[i] = B0[i]/B0[i]*B0[i]+B0[i];
  22. }
  23. }
  24. int main(int argc,char **argv) {
  25. fprintf(stderr, "115 ");
  26. int size=284467;
  27. int intBytes = size*sizeof(int);
  28. int floatBytes = size*sizeof(float);
  29. int *A0;
  30. A0 = (int *)malloc(intBytes);
  31. int *A1;
  32. A1 = (int *)malloc(intBytes);
  33. int *A2;
  34. A2 = (int *)malloc(intBytes);
  35. int *A3;
  36. A3 = (int *)malloc(intBytes);
  37. int *A4;
  38. A4 = (int *)malloc(intBytes);
  39. int *A5;
  40. A5 = (int *)malloc(intBytes);
  41. int *A6;
  42. A6 = (int *)malloc(intBytes);
  43. int *A7;
  44. A7 = (int *)malloc(intBytes);
  45. float *B0;
  46. B0 = (float *)malloc(floatBytes);
  47. for(int i=0;i<284467;i++){
  48. A0[i] = 84+i+1;
  49. A1[i] = 14*i+1;
  50. A2[i] = 39+i+1;
  51. A3[i] = 12+i+1;
  52. A4[i] = 38*i+1;
  53. A5[i] = 80*i+1;
  54. A6[i] = 87+i+1;
  55. A7[i] = 99+i+1;
  56. B0[i] = 28.6259627982*i+1;
  57. }
  58. int *d_A0;
  59. cudaMalloc((void **)&d_A0,284467*sizeof(int));
  60. cudaMemcpy(d_A0,A0,284467*sizeof(int),cudaMemcpyHostToDevice);
  61. int *d_A1;
  62. cudaMalloc((void **)&d_A1,284467*sizeof(int));
  63. cudaMemcpy(d_A1,A1,284467*sizeof(int),cudaMemcpyHostToDevice);
  64. int *d_A2;
  65. cudaMalloc((void **)&d_A2,284467*sizeof(int));
  66. cudaMemcpy(d_A2,A2,284467*sizeof(int),cudaMemcpyHostToDevice);
  67. int *d_A3;
  68. cudaMalloc((void **)&d_A3,284467*sizeof(int));
  69. cudaMemcpy(d_A3,A3,284467*sizeof(int),cudaMemcpyHostToDevice);
  70. int *d_A4;
  71. cudaMalloc((void **)&d_A4,284467*sizeof(int));
  72. cudaMemcpy(d_A4,A4,284467*sizeof(int),cudaMemcpyHostToDevice);
  73. int *d_A5;
  74. cudaMalloc((void **)&d_A5,284467*sizeof(int));
  75. cudaMemcpy(d_A5,A5,284467*sizeof(int),cudaMemcpyHostToDevice);
  76. int *d_A6;
  77. cudaMalloc((void **)&d_A6,284467*sizeof(int));
  78. cudaMemcpy(d_A6,A6,284467*sizeof(int),cudaMemcpyHostToDevice);
  79. int *d_A7;
  80. cudaMalloc((void **)&d_A7,284467*sizeof(int));
  81. cudaMemcpy(d_A7,A7,284467*sizeof(int),cudaMemcpyHostToDevice);
  82. float *d_B0;
  83. cudaMalloc((void **)&d_B0,284467*sizeof(float));
  84. cudaMemcpy(d_B0,B0,284467*sizeof(float),cudaMemcpyHostToDevice);
  85. int n = 1112*256;
  86. int block_size = 256;
  87. int block_no = n/block_size;
  88. struct timeval time0,time1;
  89. gettimeofday(&time0,NULL);
  90. kernel_gpu<<<block_no,block_size>>>(d_A0,d_A1,d_A2,d_A3,d_A4,d_A5,d_A6,d_A7,d_B0,284467);
  91. cudaThreadSynchronize();
  92. gettimeofday(&time1,NULL);
  93. double totaltime10 = (time1.tv_sec*1000000.0 + time1.tv_usec) - (time0.tv_sec*1000000.0 + time0.tv_usec);
  94. fprintf(stderr, "GPU time: %lf msecs ", (totaltime10)/1000.0F);
  95. gettimeofday(&time0,NULL);
  96. kernel_cpu(A0,A1,A2,A3,A4,A5,A6,A7,B0,284467);
  97. gettimeofday(&time1,NULL);
  98. totaltime10 = (time1.tv_sec*1000000.0 + time1.tv_usec) - (time0.tv_sec*1000000.0 + time0.tv_usec);
  99. fprintf(stderr, "CPU time: %lf msecs ", (totaltime10)/1000.0F);
  100. int *testA0;
  101. testA0 = (int *)malloc(intBytes);
  102. cudaMemcpy(testA0,d_A0,284467*sizeof(int),cudaMemcpyDeviceToHost);
  103. int *testA1;
  104. testA1 = (int *)malloc(intBytes);
  105. cudaMemcpy(testA1,d_A1,284467*sizeof(int),cudaMemcpyDeviceToHost);
  106. int *testA2;
  107. testA2 = (int *)malloc(intBytes);
  108. cudaMemcpy(testA2,d_A2,284467*sizeof(int),cudaMemcpyDeviceToHost);
  109. int *testA3;
  110. testA3 = (int *)malloc(intBytes);
  111. cudaMemcpy(testA3,d_A3,284467*sizeof(int),cudaMemcpyDeviceToHost);
  112. int *testA4;
  113. testA4 = (int *)malloc(intBytes);
  114. cudaMemcpy(testA4,d_A4,284467*sizeof(int),cudaMemcpyDeviceToHost);
  115. int *testA5;
  116. testA5 = (int *)malloc(intBytes);
  117. cudaMemcpy(testA5,d_A5,284467*sizeof(int),cudaMemcpyDeviceToHost);
  118. int *testA6;
  119. testA6 = (int *)malloc(intBytes);
  120. cudaMemcpy(testA6,d_A6,284467*sizeof(int),cudaMemcpyDeviceToHost);
  121. int *testA7;
  122. testA7 = (int *)malloc(intBytes);
  123. cudaMemcpy(testA7,d_A7,284467*sizeof(int),cudaMemcpyDeviceToHost);
  124. float *testB0;
  125. testB0 = (float *)malloc(floatBytes);
  126. cudaMemcpy(testB0,d_B0,284467*sizeof(int),cudaMemcpyDeviceToHost);
  127. for(int i=0;i<284467;i++){
  128. if (A0[i] != testA0[i]) {
  129. printf("Invalid kernel ");
  130. break;}
  131. if (A1[i] != testA1[i]) {
  132. printf("Invalid kernel ");
  133. break;}
  134. if (A2[i] != testA2[i]) {
  135. printf("Invalid kernel ");
  136. break;}
  137. if (A3[i] != testA3[i]) {
  138. printf("Invalid kernel ");
  139. break;}
  140. if (A4[i] != testA4[i]) {
  141. printf("Invalid kernel ");
  142. break;}
  143. if (A5[i] != testA5[i]) {
  144. printf("Invalid kernel ");
  145. break;}
  146. if (A6[i] != testA6[i]) {
  147. printf("Invalid kernel ");
  148. break;}
  149. if (A7[i] != testA7[i]) {
  150. printf("Invalid kernel ");
  151. break;}
  152. }
  153. free(A0);
  154. free(testA0);
  155. cudaFree(d_A0);
  156. free(A1);
  157. free(testA1);
  158. cudaFree(d_A1);
  159. free(A2);
  160. free(testA2);
  161. cudaFree(d_A2);
  162. free(A3);
  163. free(testA3);
  164. cudaFree(d_A3);
  165. free(A4);
  166. free(testA4);
  167. cudaFree(d_A4);
  168. free(A5);
  169. free(testA5);
  170. cudaFree(d_A5);
  171. free(A6);
  172. free(testA6);
  173. cudaFree(d_A6);
  174. free(A7);
  175. free(testA7);
  176. cudaFree(d_A7);
  177. free(B0);
  178. free(testB0);
  179. cudaFree(d_B0);
  180. printf("\n");return 0; }