// cudatest961.cu (2.8 KB)

  1. #include <iostream>
  2. #include <cuda.h>
  3. #include <stdio.h>
  4. #include <stdlib.h>
  5. #include <sys/time.h>
  6. using namespace std;
  // Element-wise kernel: for every i in [0, N) writes
  //   A0[i] = 2 * A3[i],  A1[i] = A3[i],  A2[i] = 3 * A3[i].
  //
  // Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
  // flat global index and advances by the total number of launched threads, so
  // all N elements are processed even when the grid has fewer than N threads.
  // Uses no shared memory. A0..A3 must each hold at least N ints; N <= 0 is a
  // no-op.
  __global__ void kernel_gpu(int *A0,int *A1,int *A2,int *A3,int N){
      // 64-bit index and stride: blockIdx.x * blockDim.x cannot wrap for large grids.
      const long long stride = (long long)gridDim.x * blockDim.x;
      long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
      for (; i < N; i += stride) {
          A0[i] = A3[i]+A3[i];
          A1[i] = A3[i];
          A2[i] = A3[i]+A3[i]+A3[i];
      }
  }
  // Host reference for the element-wise computation: for each position in
  // [0, N) stores A3 doubled into A0, A3 copied into A1, and A3 tripled into A2.
  // Does nothing when N <= 0.
  void kernel_cpu(int *A0,int *A1,int *A2,int *A3,int N){
      int pos = 0;
      while (pos < N) {
          A0[pos] = A3[pos] + A3[pos];
          A1[pos] = A3[pos];
          A2[pos] = A3[pos] + A3[pos] + A3[pos];
          ++pos;
      }
  }
  // Aborts with file/line and the CUDA error string when a runtime call fails.
  // Unchecked failures are sticky and make every later call fail mysteriously.
  #define CUDA_CHECK(call)                                                    \
      do {                                                                    \
          cudaError_t cuda_status_ = (call);                                  \
          if (cuda_status_ != cudaSuccess) {                                  \
              fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                      cudaGetErrorString(cuda_status_));                      \
              exit(EXIT_FAILURE);                                             \
          }                                                                   \
      } while (0)

  // Allocates `count` ints on the host; exits with a message if malloc fails.
  static int *allocHostInts(size_t count) {
      int *buf = (int *)malloc(count * sizeof(int));
      if (buf == NULL) {
          fprintf(stderr, "host allocation of %zu ints failed\n", count);
          exit(EXIT_FAILURE);
      }
      return buf;
  }

  // Allocates a device buffer of `count` ints and copies `host` into it.
  static int *uploadInts(const int *host, size_t count) {
      int *dev = NULL;
      CUDA_CHECK(cudaMalloc((void **)&dev, count * sizeof(int)));
      CUDA_CHECK(cudaMemcpy(dev, host, count * sizeof(int), cudaMemcpyHostToDevice));
      return dev;
  }

  // Copies `count` ints from device buffer `dev` into a freshly malloc'd host buffer.
  static int *downloadInts(const int *dev, size_t count) {
      int *host = allocHostInts(count);
      CUDA_CHECK(cudaMemcpy(host, dev, count * sizeof(int), cudaMemcpyDeviceToHost));
      return host;
  }

  // Wall-clock difference between two gettimeofday() samples, in microseconds.
  static double elapsedUsec(const struct timeval &start, const struct timeval &stop) {
      return (stop.tv_sec*1000000.0 + stop.tv_usec) - (start.tv_sec*1000000.0 + start.tv_usec);
  }

  // Runs the same element-wise computation on the GPU (kernel_gpu) and on the
  // CPU (kernel_cpu) over four int arrays, reports both wall-clock times on
  // stderr, and prints "Invalid kernel " on stdout if any GPU result differs
  // from the CPU reference. Returns 0 in both cases; exits with EXIT_FAILURE
  // only when an allocation or a CUDA runtime call fails.
  int main(int argc,char **argv) {
      (void)argc;
      (void)argv;
      fprintf(stderr, "961 ");

      const int size = 689233;

      // Host inputs; the CPU reference later overwrites A0..A2 in place.
      int *A0 = allocHostInts(size);
      int *A1 = allocHostInts(size);
      int *A2 = allocHostInts(size);
      int *A3 = allocHostInts(size);
      for (int i = 0; i < size; i++) {
          A0[i] = 29*i+1;
          A1[i] = 26*i+1;
          A2[i] = 4*i+1;
          A3[i] = 44*i+1;
      }

      // Device copies of the initial data.
      int *d_A0 = uploadInts(A0, size);
      int *d_A1 = uploadInts(A1, size);
      int *d_A2 = uploadInts(A2, size);
      int *d_A3 = uploadInts(A3, size);

      // Ceil-div so the grid covers every element (2693 blocks for this size).
      const int block_size = 256;
      const int block_no = (size + block_size - 1) / block_size;

      struct timeval time0,time1;
      gettimeofday(&time0,NULL);
      kernel_gpu<<<block_no,block_size>>>(d_A0,d_A1,d_A2,d_A3,size);
      CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
      CUDA_CHECK(cudaDeviceSynchronize());  // wait for completion; surfaces execution errors
      gettimeofday(&time1,NULL);
      double totaltime10 = elapsedUsec(time0, time1);
      fprintf(stderr, "GPU time: %lf msecs ", totaltime10/1000.0);

      gettimeofday(&time0,NULL);
      kernel_cpu(A0,A1,A2,A3,size);
      gettimeofday(&time1,NULL);
      totaltime10 = elapsedUsec(time0, time1);
      fprintf(stderr, "CPU time: %lf msecs ", totaltime10/1000.0);

      // Fetch the GPU results and compare against the CPU reference.
      int *testA0 = downloadInts(d_A0, size);
      int *testA1 = downloadInts(d_A1, size);
      int *testA2 = downloadInts(d_A2, size);
      int *testA3 = downloadInts(d_A3, size);
      for (int i = 0; i < size; i++) {
          const bool mismatch = A0[i] != testA0[i] || A1[i] != testA1[i] ||
                                A2[i] != testA2[i] || A3[i] != testA3[i];
          if (mismatch) {
              printf("Invalid kernel ");
              break;  // report only the first mismatch
          }
      }

      free(A0);
      free(testA0);
      CUDA_CHECK(cudaFree(d_A0));
      free(A1);
      free(testA1);
      CUDA_CHECK(cudaFree(d_A1));
      free(A2);
      free(testA2);
      CUDA_CHECK(cudaFree(d_A2));
      free(A3);
      free(testA3);
      CUDA_CHECK(cudaFree(d_A3));
      printf("\n");
      return 0;
  }