| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183 |
- #include <iostream>
- #include <cuda.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <sys/time.h>
- using namespace std;
// Element-wise GPU kernel; each thread handles at most one index.
//
// For every index idx < N:
//   A0[idx] = A7[idx]
//   A1[idx] = A7[idx]
//   A2[idx] = A4[idx] - A7[idx] - A6[idx]
//   B0[idx] = B0[idx] / B0[idx] * B0[idx] + B0[idx]
//
// A3 and A5 are part of the signature but are never accessed here.
// The index is derived from the x dimension of the launch only, so the
// launch must supply at least N threads along x for every element to be
// processed; surplus threads exit immediately.
__global__ void kernel_gpu(int *A0, int *A1, int *A2, int *A3, int *A4,
                           int *A5, int *A6, int *A7, float *B0, int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the data have nothing to do.
    if (idx >= N) {
        return;
    }

    const int a7 = A7[idx];
    A0[idx] = a7;
    A1[idx] = a7;
    A2[idx] = A4[idx] - a7 - A6[idx];

    // Same operation order as the CPU reference: divide, multiply, then add.
    const float b = B0[idx];
    B0[idx] = b / b * b + b;
}
// Host reference implementation of the same element-wise update that
// kernel_gpu performs, applied sequentially to indices 0 .. N-1:
//   A0[k] = A7[k]
//   A1[k] = A7[k]
//   A2[k] = A4[k] - A7[k] - A6[k]
//   B0[k] = B0[k] / B0[k] * B0[k] + B0[k]
//
// A3 and A5 are accepted for signature parity but are never accessed.
// Nothing is written when N <= 0.
void kernel_cpu(int *A0, int *A1, int *A2, int *A3, int *A4,
                int *A5, int *A6, int *A7, float *B0, int N) {
    int k = 0;
    while (k < N) {
        const int a7 = A7[k];
        A0[k] = a7;
        A1[k] = a7;
        A2[k] = A4[k] - a7 - A6[k];

        // Keep the divide / multiply / add order of the original expression.
        const float b = B0[k];
        B0[k] = b / b * b + b;

        ++k;
    }
}
// Abort with a diagnostic when a CUDA runtime call reports failure.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// malloc wrapper that terminates the program instead of returning NULL.
static void *hostAlloc(size_t bytes) {
    void *ptr = malloc(bytes);
    if (ptr == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return ptr;
}

// Compare a CPU float result with the GPU one using a mixed absolute/relative
// tolerance; host and device float arithmetic is not guaranteed bit-identical.
static bool nearlyEqual(float expected, float actual) {
    if (expected == actual) {
        return true;  // also covers matching infinities
    }
    const bool expectedNaN = (expected != expected);
    const bool actualNaN = (actual != actual);
    if (expectedNaN || actualNaN) {
        return expectedNaN && actualNaN;
    }
    float diff = expected - actual;
    if (diff < 0.0f) {
        diff = -diff;
    }
    const float magnitude = (expected < 0.0f) ? -expected : expected;
    return diff <= 1e-6f + 1e-5f * magnitude;
}

// Benchmark driver.
//
// Fills eight int arrays and one float array on the host, copies them to the
// device, runs kernel_gpu and kernel_cpu on the same initial data while timing
// each with gettimeofday, then copies the device results back and compares
// them against the CPU results. Prints "Invalid kernel " to stdout on the
// first mismatch. Timings go to stderr.
//
// Returns 0 after a completed run (even if validation failed, as before);
// exits with EXIT_FAILURE if a host allocation or any CUDA call fails.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    fprintf(stderr, "115 ");

    const int size = 284467;       // number of elements in every array
    const int kIntArrays = 8;      // A0 .. A7
    const size_t intBytes = size * sizeof(int);
    const size_t floatBytes = size * sizeof(float);

    // Host inputs (A, B0) and host buffers for results copied back (testA, testB0).
    int *A[kIntArrays];
    int *testA[kIntArrays];
    for (int k = 0; k < kIntArrays; k++) {
        A[k] = (int *)hostAlloc(intBytes);
        testA[k] = (int *)hostAlloc(intBytes);
    }
    float *B0 = (float *)hostAlloc(floatBytes);
    float *testB0 = (float *)hostAlloc(floatBytes);

    for (int i = 0; i < size; i++) {
        A[0][i] = 84 + i + 1;
        A[1][i] = 14 * i + 1;
        A[2][i] = 39 + i + 1;
        A[3][i] = 12 + i + 1;
        A[4][i] = 38 * i + 1;
        A[5][i] = 80 * i + 1;
        A[6][i] = 87 + i + 1;
        A[7][i] = 99 + i + 1;
        B0[i] = 28.6259627982 * i + 1;
    }

    // Device copies of the inputs; must be populated before the CPU kernel
    // overwrites the host arrays in place.
    int *d_A[kIntArrays];
    for (int k = 0; k < kIntArrays; k++) {
        checkCuda(cudaMalloc((void **)&d_A[k], intBytes), "cudaMalloc(int array)");
        checkCuda(cudaMemcpy(d_A[k], A[k], intBytes, cudaMemcpyHostToDevice),
                  "cudaMemcpy host->device (int array)");
    }
    float *d_B0;
    checkCuda(cudaMalloc((void **)&d_B0, floatBytes), "cudaMalloc(B0)");
    checkCuda(cudaMemcpy(d_B0, B0, floatBytes, cudaMemcpyHostToDevice),
              "cudaMemcpy host->device (B0)");

    // Ceil-div so the grid always covers `size` elements (1112 blocks here).
    const int block_size = 256;
    const int block_no = (size + block_size - 1) / block_size;

    struct timeval time0, time1;

    gettimeofday(&time0, NULL);
    kernel_gpu<<<block_no, block_size>>>(d_A[0], d_A[1], d_A[2], d_A[3], d_A[4],
                                         d_A[5], d_A[6], d_A[7], d_B0, size);
    checkCuda(cudaGetLastError(), "kernel_gpu launch");
    // The launch is asynchronous; wait so the timer covers kernel execution.
    checkCuda(cudaDeviceSynchronize(), "kernel_gpu execution");
    gettimeofday(&time1, NULL);
    double totaltime10 = (time1.tv_sec * 1000000.0 + time1.tv_usec) -
                         (time0.tv_sec * 1000000.0 + time0.tv_usec);
    fprintf(stderr, "GPU time: %lf msecs ", (totaltime10) / 1000.0F);

    gettimeofday(&time0, NULL);
    kernel_cpu(A[0], A[1], A[2], A[3], A[4], A[5], A[6], A[7], B0, size);
    gettimeofday(&time1, NULL);
    totaltime10 = (time1.tv_sec * 1000000.0 + time1.tv_usec) -
                  (time0.tv_sec * 1000000.0 + time0.tv_usec);
    fprintf(stderr, "CPU time: %lf msecs ", (totaltime10) / 1000.0F);

    // Fetch the GPU results.
    for (int k = 0; k < kIntArrays; k++) {
        checkCuda(cudaMemcpy(testA[k], d_A[k], intBytes, cudaMemcpyDeviceToHost),
                  "cudaMemcpy device->host (int array)");
    }
    checkCuda(cudaMemcpy(testB0, d_B0, floatBytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy device->host (B0)");

    // Validate element by element; stop at the first mismatch and report once.
    bool valid = true;
    for (int i = 0; i < size && valid; i++) {
        for (int k = 0; k < kIntArrays; k++) {
            if (A[k][i] != testA[k][i]) {
                valid = false;
                break;
            }
        }
        if (valid && !nearlyEqual(B0[i], testB0[i])) {
            valid = false;
        }
    }
    if (!valid) {
        printf("Invalid kernel ");
    }

    for (int k = 0; k < kIntArrays; k++) {
        free(A[k]);
        free(testA[k]);
        checkCuda(cudaFree(d_A[k]), "cudaFree(int array)");
    }
    free(B0);
    free(testB0);
    checkCuda(cudaFree(d_B0), "cudaFree(B0)");

    printf("\n");
    return 0;
}
|