| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
- #include <iostream>
- #include <cuda.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <sys/time.h>
- using namespace std;
// Element-wise kernel over N ints. For every index i in [0, N) it writes
//   A0[i] = A3[i] + A3[i]
//   A1[i] = A3[i]
//   A2[i] = A3[i] + A3[i] + A3[i]
//
// Launch layout: 1D grid of 1D blocks (only the .x dimensions are used).
// Any grid/block size is valid: each thread starts at its global index and
// then advances by the total number of launched threads, so a launch with
// fewer than N threads still covers every element. No shared memory is used.
//
// Parameters:
//   A0, A1, A2 - output arrays, each with room for at least N ints,
//                dereferenced on the device.
//   A3         - input array of at least N ints, dereferenced on the device.
//   N          - number of elements to process; N <= 0 does nothing.
__global__ void kernel_gpu(int *A0, int *A1, int *A2, int *A3, int N) {
    // 64-bit arithmetic so blockIdx.x * blockDim.x cannot wrap before the
    // bounds comparison on very large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < N;
         i += stride) {
        A0[i] = A3[i] + A3[i];
        A1[i] = A3[i];
        A2[i] = A3[i] + A3[i] + A3[i];
    }
}
// Host reference implementation of the element-wise computation.
// For each index below N: A0 receives twice the A3 value, A1 a copy of it,
// and A2 three times it. Nothing is written when N <= 0.
void kernel_cpu(int *A0, int *A1, int *A2, int *A3, int N) {
    int idx = 0;
    while (idx < N) {
        A0[idx] = 2 * A3[idx];
        A1[idx] = A3[idx];
        A2[idx] = 3 * A3[idx];
        ++idx;
    }
}
// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates `bytes` of host memory for ints; terminates on allocation failure.
static int *allocHostInts(size_t bytes) {
    int *buffer = (int *)malloc(bytes);
    if (buffer == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return buffer;
}

// Allocates a device buffer of `bytes` and fills it from the host array.
static int *copyToDevice(const int *host, size_t bytes) {
    int *device = NULL;
    checkCuda(cudaMalloc((void **)&device, bytes), "cudaMalloc");
    checkCuda(cudaMemcpy(device, host, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device)");
    return device;
}

// Allocates a host buffer of `bytes` and fills it from the device array.
static int *copyFromDevice(const int *device, size_t bytes) {
    int *host = allocHostInts(bytes);
    checkCuda(cudaMemcpy(host, device, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host)");
    return host;
}

// Microseconds elapsed between two gettimeofday() samples.
static double elapsedMicros(const struct timeval *start, const struct timeval *stop) {
    return (stop->tv_sec * 1000000.0 + stop->tv_usec) -
           (start->tv_sec * 1000000.0 + start->tv_usec);
}

// Benchmark driver: fills four int arrays, runs the same element-wise
// computation on the GPU (kernel_gpu) and on the CPU (kernel_cpu), reports
// both wall-clock times on stderr, and prints "Invalid kernel " on stdout if
// the GPU results differ from the CPU results.
// Returns 0 on completion; exits with EXIT_FAILURE if a host allocation or a
// CUDA runtime call fails.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;
    fprintf(stderr, "961 ");

    const int size = 689233;
    const size_t intBytes = (size_t)size * sizeof(int);

    int *A0 = allocHostInts(intBytes);
    int *A1 = allocHostInts(intBytes);
    int *A2 = allocHostInts(intBytes);
    int *A3 = allocHostInts(intBytes);
    for (int i = 0; i < size; i++) {
        A0[i] = 29 * i + 1;
        A1[i] = 26 * i + 1;
        A2[i] = 4 * i + 1;
        A3[i] = 44 * i + 1;
    }

    int *d_A0 = copyToDevice(A0, intBytes);
    int *d_A1 = copyToDevice(A1, intBytes);
    int *d_A2 = copyToDevice(A2, intBytes);
    int *d_A3 = copyToDevice(A3, intBytes);

    // Ceil-div so the grid covers every element even though size is not a
    // multiple of the block size.
    const int block_size = 256;
    const int block_no = (size + block_size - 1) / block_size;

    struct timeval time0, time1;
    gettimeofday(&time0, NULL);
    kernel_gpu<<<block_no, block_size>>>(d_A0, d_A1, d_A2, d_A3, size);
    // Launch-configuration errors are only reported through cudaGetLastError.
    checkCuda(cudaGetLastError(), "kernel_gpu launch");
    // Wait for the kernel so the timed interval includes its execution.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    gettimeofday(&time1, NULL);
    double totaltime10 = elapsedMicros(&time0, &time1);
    fprintf(stderr, "GPU time: %lf msecs ", (totaltime10) / 1000.0F);

    gettimeofday(&time0, NULL);
    kernel_cpu(A0, A1, A2, A3, size);
    gettimeofday(&time1, NULL);
    totaltime10 = elapsedMicros(&time0, &time1);
    fprintf(stderr, "CPU time: %lf msecs ", (totaltime10) / 1000.0F);

    int *testA0 = copyFromDevice(d_A0, intBytes);
    int *testA1 = copyFromDevice(d_A1, intBytes);
    int *testA2 = copyFromDevice(d_A2, intBytes);
    int *testA3 = copyFromDevice(d_A3, intBytes);

    // Report the first mismatch between CPU and GPU results, then stop.
    for (int i = 0; i < size; i++) {
        if (A0[i] != testA0[i] || A1[i] != testA1[i] ||
            A2[i] != testA2[i] || A3[i] != testA3[i]) {
            printf("Invalid kernel ");
            break;
        }
    }

    free(A0);
    free(testA0);
    checkCuda(cudaFree(d_A0), "cudaFree");
    free(A1);
    free(testA1);
    checkCuda(cudaFree(d_A1), "cudaFree");
    free(A2);
    free(testA2);
    checkCuda(cudaFree(d_A2), "cudaFree");
    free(A3);
    free(testA3);
    checkCuda(cudaFree(d_A3), "cudaFree");

    printf("\n");
    return 0;
}
|