opencliper.lpi.tel.uva.es Git - OpenCLIPER/blob - performanceTests/arrayMultCUDA.cu

   1 // Matrices are stored in row-major order:
   2 // M(row, col) = *(M.elements + row * M.width + col)
   3 #include <stdio.h>
   4 #include <sys/time.h>
   5 #include <string>
   6 #include "commonArrayMult.hpp"
   7 #include "vectorUtils.hpp"
   8 #include "PerformanceTestArrayOpParallel.hpp"
   9
  10 typedef struct {
  11     unsigned int width;
  12     unsigned int height;
  13     float* elements;
  14 } MatrixForCUDA;
  15
  16 void check_result(cudaError code, const char* message)
  17 {
  18   if (code != cudaSuccess)
  19     {
  20       fprintf(stderr, "%s (%d): %s\n", message, (int) code,
  21               cudaGetErrorString(code));
  22       exit(-1);
  23     }
  24 }
  25
  26 // Forward declaration of the matrix multiplication kernel
  27 __global__ void MatMulKernel(const MatrixForCUDA, const MatrixForCUDA, MatrixForCUDA);
  28
  29 // MatrixForCUDA multiplication - Host code
  30 // MatrixForCUDA dimensions are assumed to be multiples of BLOCK_SIZE
  31 void MatMul(const MatrixForCUDA A, const MatrixForCUDA B, MatrixForCUDA C, int block_size,
  32             dim3 dimGrid, dim3 dimBlock, std::shared_ptr<LPISupport::SampleCollection> pSamples, unsigned int numberOfIterations)
  33 {
  34     // Load A and B to device memory
  35     MatrixForCUDA d_A;
  36     d_A.width = A.width; d_A.height = A.height;
  37     size_t size = A.width * A.height * sizeof(float);
  38     check_result(cudaMalloc(&d_A.elements, size),
  39                  "Unable to allocate device memory");
  40     check_result(cudaMemcpy(d_A.elements, A.elements, size,
  41                cudaMemcpyHostToDevice),
  42                  "Unable to copy variable to device");
  43     MatrixForCUDA d_B;
  44     d_B.width = B.width; d_B.height = B.height;
  45     size = B.width * B.height * sizeof(float);
  46     check_result(cudaMalloc(&d_B.elements, size),
  47                  "Unable to allocate device memory");
  48     check_result(cudaMemcpy(d_B.elements, B.elements, size,
  49                cudaMemcpyHostToDevice),
  50                  "Unable to copy variable to device");
  51
  52     // Allocate C in device memory
  53     MatrixForCUDA d_C;
  54     d_C.width = C.width; d_C.height = C.height;
  55     size = C.width * C.height * sizeof(float);
  56     check_result(cudaMalloc(&d_C.elements, size),
  57                  "Unable to allocate device memory");
  58
  59     // Invoke kernel: Mal dimGrid si B.width y A.height no son múltiplos de
  60     // block_size
  61     // dim3 dimBlock(block_size, block_size);
  62     // dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
  63
  64     //gettimeofday(&t1, NULL);
  65     cudaEvent_t start, stop;
  66     cudaEventCreate(&start);
  67     cudaEventCreate(&stop);
  68     for (unsigned int iteration = 0; iteration < numberOfIterations; iteration++) {
  69         cout << "Iteration #" << iteration << std::endl;
  70         cudaEventRecord(start, 0);
  71         MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
  72         //gettimeofday(&t2, NULL);
  73         cudaEventRecord(stop, 0);
  74         cudaEventSynchronize(stop);
  75         float gpuElapsedTime;
  76         cudaEventElapsedTime(&gpuElapsedTime, start, stop); // in ms
  77         /*  ((t2.tv_sec - t1.tv_sec) * 1000.0) +
  78         ((t2.tv_usec - t1.tv_usec) / 1000.0); */
  79         pSamples->appendSample(gpuElapsedTime / 1000);
  80     }
  81     cudaEventDestroy(start);
  82     cudaEventDestroy(stop);
  83
  84     // Read C from device memory
  85     check_result(cudaMemcpy(C.elements, d_C.elements, size,
  86                cudaMemcpyDeviceToHost),
  87                  "Unable to copy output variable from device");
  88
  89     // Free device memory
  90     check_result(cudaFree(d_A.elements), "Error freeing device memory");
  91     check_result(cudaFree(d_B.elements), "Error freeing device memory");
  92     check_result(cudaFree(d_C.elements), "Error freeing device memory");
  93 }
  94
  95 // MatrixForCUDA multiplication kernel called by MatMul()
  96 __global__ void MatMulKernel(MatrixForCUDA A, MatrixForCUDA B, MatrixForCUDA C)
  97 {
  98     // Each thread computes one element of C
  99     // by accumulating results into Cvalue
 100     float Cvalue = 0;
 101     int row = blockIdx.y * blockDim.y + threadIdx.y;
 102     if (row > A.height-1) /* Nos salimos del rango de índices válidos */
 103       return;
 104     if(A.height <= 32) // Son demasiados índices a mostrar con N grande
 105       printf("threadIdx.y: %d, blockIdx.y: %d, row: %d\n",
 106              threadIdx.y, blockIdx.y, row);
 107     int col = blockIdx.x * blockDim.x + threadIdx.x;
 108     if (col > B.width-1) /* Nos salimos del rango de índices válidos */
 109       return;
 110     if(B.width <= 32) // Son demasiados índices a mostrar con N grande
 111       printf("threadIdx.x: %d, blockIdx.x: %d, col: %d\n",
 112              threadIdx.x, blockIdx.x, col);
 113
 114     for (int e = 0; e < A.width; e++)
 115         Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];
 116     C.elements[row * C.width + col] = Cvalue;
 117 }
 118
 119 int main(int argc, char** argv) {
 120     unsigned long numberOfIterations = 1;
 121     MatrixForCUDA A, B, C;
 122     std::shared_ptr<LPISupport::SampleCollection> pSamples = make_shared<LPISupport::SampleCollection>("execution time");
 123     unsigned int size = 512;
 124     unsigned int block_size = 32;
 125     unsigned int SHOW_SIZE = 10, PRECISION_DIGITS = 10;
 126
 127     //printf("CUDA Runtime API reference performance measurement\n");
 128     try {
 129         PerformanceTestArrayOpParallel* pPerfTest = new PerformanceTestArrayOpParallel(argc, argv);
 130         auto pConfigTraits = std::dynamic_pointer_cast<PerformanceTestArrayOpParallel::ConfigTraits>(pPerfTest->getConfigTraits());
 131         size = pConfigTraits->size;
 132         numberOfIterations = pConfigTraits->repetitions;
 133
 134         //count, N
 135
 136         /*
 137         if (argc >= 4) {
 138             if ((block_size = atoi(argv[3])) < 1) {
 139             fprintf(stderr, "Invalid block size\n");
 140             return -1;
 141             }
 142         }
 143         */
 144         block_size = pConfigTraits->dimBlockOrLocalSize;
 145         if (block_size > size)
 146             block_size = size;
 147
 148         dim3 dimBlock(block_size, block_size);
 149         dim3 dimGrid(ceil((float)(size) / (float)(block_size)),
 150                 ceil((float)(size) / (float)(block_size)));
 151
 152         const unsigned int mem_size = sizeof(float) * size * size;
 153         A.width=A.height=size;
 154         B.width=B.height=size;
 155         C.width=C.height=size;
 156         A.elements = (float*)malloc(mem_size);
 157         B.elements = (float*)malloc(mem_size);
 158         C.elements = (float*)malloc(mem_size);
 159         cerr << argv[0] << " performance measurement" << std::endl;
 160         cout << "Starting program... " << flush;
 161         cout << "Creating and filling arrays ... " << flush;
 162         initArray(A.elements, A.height, A.width, 2.0);
 163         initArray(B.elements, B.height, B.width, 2.0);
 164         initArray(C.elements, C.height, C.width, 0.0);
 165         cout << "Done." << endl;
 166         print_array("a", A.elements, A.height, A.width, SHOW_SIZE);
 167         print_array("b", B.elements, B.height, B.width, SHOW_SIZE);
 168         cout << endl;
 169
 170         cerr << "Executing " << numberOfIterations << " iteration(s)\n";
 171         cerr << "- Matrix [1 .. " << size << ", 1 .. " << size << "]" << endl;
 172
 173         printf("Executing %lu iteration(s)\n", numberOfIterations);
 174         printf("- Matrix [1 .. %u, 1 .. %u]\n", size, size);
 175         printf("- Grid size %u\n", dimGrid.x);
 176         printf("- Block size %u\n", dimBlock.x);
 177         MatMul(A, B, C, block_size, dimGrid, dimBlock, pSamples, numberOfIterations);
 178         cout << "Product finished." << endl;
 179
 180         // Number of simple calculations for array multiplication is
 181         // ColsA products of two float numbers plus ColsA-1 sums of two float numbers
 182         // for every element of output array (which has RowsC*ColsC elements)
 183         // IMPORTANT!!: at least one expression operand must be casted to unsigned long type for the result being unsigned long;
 184         // otherwise, in spite of operand values being less than integer type maximum value, the result may exceed unsigned int maximum value and be truncated
 185         pConfigTraits->numOpsPerCalc = ((unsigned long) C.height) * C.width * (A.width*2-1);
 186         cerr << "Number of operations: " << ((unsigned long) C.height * C.width * (A.width*2-1)) << std::endl;
 187         cerr << "Number of operations stored in pConfigTraits->numOpsPerCalc: " << pConfigTraits->numOpsPerCalc << std::endl;
 188         print_array<float>("c", C.elements, C.height, C.width, SHOW_SIZE);
 189
 190         pConfigTraits->deviceType = "GPU";
 191         pConfigTraits->numDigitsPrec = PRECISION_DIGITS;
 192         pPerfTest->buildTestInfo(pSamples);
 193         pPerfTest->saveOrPrint();
 194
 195         free(A.elements);
 196         free(B.elements);
 197         free(C.elements);
 198         pSamples = nullptr;
 199         return 0;
 200     } catch(string msg) {
 201         cerr << "Exception caught in main(): " << msg << endl;
 202         //cleanupHost();
 203     }
 204 }