// cudatest694.cu (3.6 KB)
  #include <cuda.h>
  #include <math.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/time.h>

  #include <iostream>

  using namespace std;
  // Device version of the element-wise test computation.
  //
  // For every index i in [0, N):
  //   A0[i] = A1[i] * A3[i] / A1[i]   (skipped when A1[i] == 0)
  //   B0[i] = B1[i] * B1[i]
  //
  // A2 is accepted for signature parity with the caller but is neither read
  // nor written. Indexing is one-dimensional (blockIdx.x, blockDim.x,
  // threadIdx.x); each thread handles at most one element and threads whose
  // index is >= N do nothing, so the launch must supply at least N threads.
  __global__ void kernel_gpu(int *A0,int *A1,int *A2,int *A3,float *B0,float *B1,int N){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N){
      const int divisor = A1[i];
      // A zero divisor has no defined quotient; leave A0[i] as it was.
      if (divisor != 0) {
        // Form the product in 64 bits: in 32-bit int it overflows (undefined
        // behaviour) for large operands, which lets host and device disagree.
        // The quotient always equals A3[i], so narrowing back is lossless.
        A0[i] = (int)((long long)divisor * A3[i] / divisor);
      }
      B0[i] = B1[i]*B1[i];
    }
  }

  // Host reference for kernel_gpu: the same per-element computation run in a
  // sequential loop over i in [0, N). Must stay expression-for-expression
  // identical to kernel_gpu so the two results can be compared.
  // A2 is accepted but not used.
  void kernel_cpu(int *A0,int *A1,int *A2,int *A3,float *B0,float *B1,int N){
    for(int i=0;i<N;i++){
      const int divisor = A1[i];
      // Skip the integer update when the divisor is zero (would trap on host).
      if (divisor != 0) {
        // 64-bit intermediate avoids signed 32-bit overflow in the product.
        A0[i] = (int)((long long)divisor * A3[i] / divisor);
      }
      B0[i] = B1[i]*B1[i];
    }
  }
  // Evaluates a CUDA runtime call and terminates the program with a
  // diagnostic on stderr if it does not return cudaSuccess.
  #define CUDA_CHECK(call)                                                   \
    do {                                                                     \
      cudaError_t err_ = (call);                                             \
      if (err_ != cudaSuccess) {                                             \
        fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,        \
                cudaGetErrorString(err_));                                   \
        exit(EXIT_FAILURE);                                                  \
      }                                                                      \
    } while (0)

  // Allocates `count` elements of T with malloc; exits if allocation fails.
  template <typename T>
  static T *allocHost(size_t count) {
    T *p = (T *)malloc(count * sizeof(T));
    if (p == NULL) {
      fprintf(stderr, "host allocation of %lu bytes failed\n",
              (unsigned long)(count * sizeof(T)));
      exit(EXIT_FAILURE);
    }
    return p;
  }

  // Allocates a device buffer of `count` elements and fills it from `host`.
  template <typename T>
  static T *copyToDevice(const T *host, size_t count) {
    T *dev = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev, count * sizeof(T)));
    CUDA_CHECK(cudaMemcpy(dev, host, count * sizeof(T), cudaMemcpyHostToDevice));
    return dev;
  }

  // Allocates a host buffer of `count` elements and fills it from `dev`.
  template <typename T>
  static T *copyToHost(const T *dev, size_t count) {
    T *host = allocHost<T>(count);
    CUDA_CHECK(cudaMemcpy(host, dev, count * sizeof(T), cudaMemcpyDeviceToHost));
    return host;
  }

  // Microseconds elapsed between two gettimeofday() samples.
  static double elapsedUsec(const struct timeval &t0, const struct timeval &t1) {
    return (t1.tv_sec*1000000.0 + t1.tv_usec) - (t0.tv_sec*1000000.0 + t0.tv_usec);
  }

  // Relative/absolute tolerance comparison for float results computed
  // separately on host and device.
  static bool nearlyEqual(float a, float b) {
    const float rtol = 1e-5f;
    const float atol = 1e-6f;
    return fabsf(a - b) <= atol + rtol * fabsf(b);
  }

  // Test driver: fills six host arrays, runs kernel_gpu on device copies and
  // kernel_cpu on the host arrays, reports both wall-clock times on stderr,
  // then prints "Invalid kernel " on stdout if any array differs between the
  // two runs. Always returns 0 unless an allocation or CUDA call fails, in
  // which case the process exits with EXIT_FAILURE.
  int main(int argc,char **argv) {
    (void)argc;
    (void)argv;
    fprintf(stderr, "694 ");

    const int size = 297289;
    const size_t count = (size_t)size;

    int *A0 = allocHost<int>(count);
    int *A1 = allocHost<int>(count);
    int *A2 = allocHost<int>(count);
    int *A3 = allocHost<int>(count);
    float *B0 = allocHost<float>(count);
    float *B1 = allocHost<float>(count);

    for (int i = 0; i < size; i++) {
      A0[i] = 50+i+1;
      A1[i] = 92*i+1;
      A2[i] = 84*i+1;
      A3[i] = 1+i+1;
      // Evaluated in double and narrowed on store, as in the original data set.
      B0[i] = 4.17428203495*i+1;
      B1[i] = 71.1505205804+i+1;
    }

    int *d_A0 = copyToDevice(A0, count);
    int *d_A1 = copyToDevice(A1, count);
    int *d_A2 = copyToDevice(A2, count);
    int *d_A3 = copyToDevice(A3, count);
    float *d_B0 = copyToDevice(B0, count);
    float *d_B1 = copyToDevice(B1, count);

    // One thread per element; round the block count up so the tail is covered.
    const int block_size = 256;
    const int block_no = (size + block_size - 1) / block_size;

    struct timeval time0, time1;
    gettimeofday(&time0, NULL);
    kernel_gpu<<<block_no,block_size>>>(d_A0,d_A1,d_A2,d_A3,d_B0,d_B1,size);
    // Launch-configuration errors are only visible through cudaGetLastError();
    // execution errors surface at the synchronizing call.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    gettimeofday(&time1, NULL);
    double totaltime10 = elapsedUsec(time0, time1);
    fprintf(stderr, "GPU time: %lf msecs ", (totaltime10)/1000.0F);

    gettimeofday(&time0, NULL);
    kernel_cpu(A0,A1,A2,A3,B0,B1,size);
    gettimeofday(&time1, NULL);
    totaltime10 = elapsedUsec(time0, time1);
    fprintf(stderr, "CPU time: %lf msecs ", (totaltime10)/1000.0F);

    int *testA0 = copyToHost(d_A0, count);
    int *testA1 = copyToHost(d_A1, count);
    int *testA2 = copyToHost(d_A2, count);
    int *testA3 = copyToHost(d_A3, count);
    float *testB0 = copyToHost(d_B0, count);
    float *testB1 = copyToHost(d_B1, count);

    // Report at most once, on the first element where host and device differ.
    for (int i = 0; i < size; i++) {
      const bool intsMatch = A0[i] == testA0[i] && A1[i] == testA1[i] &&
                             A2[i] == testA2[i] && A3[i] == testA3[i];
      // B0 is computed independently on each side, so allow rounding slack;
      // B1 is only copied, so it must round-trip exactly.
      const bool floatsMatch = nearlyEqual(B0[i], testB0[i]) && B1[i] == testB1[i];
      if (!intsMatch || !floatsMatch) {
        printf("Invalid kernel ");
        break;
      }
    }

    free(A0);
    free(testA0);
    CUDA_CHECK(cudaFree(d_A0));
    free(A1);
    free(testA1);
    CUDA_CHECK(cudaFree(d_A1));
    free(A2);
    free(testA2);
    CUDA_CHECK(cudaFree(d_A2));
    free(A3);
    free(testA3);
    CUDA_CHECK(cudaFree(d_A3));
    free(B0);
    free(testB0);
    CUDA_CHECK(cudaFree(d_B0));
    free(B1);
    free(testB1);
    CUDA_CHECK(cudaFree(d_B1));

    printf("\n");
    return 0;
  }