1 // Matrices are stored in row-major order:
2 // M(row, col) = *(M.elements + row * M.width + col)
6 #include "commonArrayMult.hpp"
7 #include "vectorUtils.hpp"
8 #include "PerformanceTestArrayOpParallel.hpp"
16 void check_result(cudaError code, const char* message)
18 if (code != cudaSuccess)
20 fprintf(stderr, "%s (%d): %s\n", message, (int) code,
21 cudaGetErrorString(code));
26 // Forward declaration of the matrix multiplication kernel
27 __global__ void MatMulKernel(const MatrixForCUDA, const MatrixForCUDA, MatrixForCUDA);
29 // MatrixForCUDA multiplication - Host code
30 // MatrixForCUDA dimensions are assumed to be multiples of BLOCK_SIZE
31 void MatMul(const MatrixForCUDA A, const MatrixForCUDA B, MatrixForCUDA C, int block_size,
32 dim3 dimGrid, dim3 dimBlock, std::shared_ptr<LPISupport::SampleCollection> pSamples, unsigned int numberOfIterations)
34 // Load A and B to device memory
36 d_A.width = A.width; d_A.height = A.height;
37 size_t size = A.width * A.height * sizeof(float);
38 check_result(cudaMalloc(&d_A.elements, size),
39 "Unable to allocate device memory");
40 check_result(cudaMemcpy(d_A.elements, A.elements, size,
41 cudaMemcpyHostToDevice),
42 "Unable to copy variable to device");
44 d_B.width = B.width; d_B.height = B.height;
45 size = B.width * B.height * sizeof(float);
46 check_result(cudaMalloc(&d_B.elements, size),
47 "Unable to allocate device memory");
48 check_result(cudaMemcpy(d_B.elements, B.elements, size,
49 cudaMemcpyHostToDevice),
50 "Unable to copy variable to device");
52 // Allocate C in device memory
54 d_C.width = C.width; d_C.height = C.height;
55 size = C.width * C.height * sizeof(float);
56 check_result(cudaMalloc(&d_C.elements, size),
57 "Unable to allocate device memory");
59 // Invoke kernel: Mal dimGrid si B.width y A.height no son múltiplos de
61 // dim3 dimBlock(block_size, block_size);
62 // dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
64 //gettimeofday(&t1, NULL);
65 cudaEvent_t start, stop;
66 cudaEventCreate(&start);
67 cudaEventCreate(&stop);
68 for (unsigned int iteration = 0; iteration < numberOfIterations; iteration++) {
69 cout << "Iteration #" << iteration << std::endl;
70 cudaEventRecord(start, 0);
71 MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
72 //gettimeofday(&t2, NULL);
73 cudaEventRecord(stop, 0);
74 cudaEventSynchronize(stop);
76 cudaEventElapsedTime(&gpuElapsedTime, start, stop); // in ms
77 /* ((t2.tv_sec - t1.tv_sec) * 1000.0) +
78 ((t2.tv_usec - t1.tv_usec) / 1000.0); */
79 pSamples->appendSample(gpuElapsedTime / 1000);
81 cudaEventDestroy(start);
82 cudaEventDestroy(stop);
84 // Read C from device memory
85 check_result(cudaMemcpy(C.elements, d_C.elements, size,
86 cudaMemcpyDeviceToHost),
87 "Unable to copy output variable from device");
90 check_result(cudaFree(d_A.elements), "Error freeing device memory");
91 check_result(cudaFree(d_B.elements), "Error freeing device memory");
92 check_result(cudaFree(d_C.elements), "Error freeing device memory");
95 // MatrixForCUDA multiplication kernel called by MatMul()
96 __global__ void MatMulKernel(MatrixForCUDA A, MatrixForCUDA B, MatrixForCUDA C)
98 // Each thread computes one element of C
99 // by accumulating results into Cvalue
101 int row = blockIdx.y * blockDim.y + threadIdx.y;
102 if (row > A.height-1) /* Nos salimos del rango de índices válidos */
104 if(A.height <= 32) // Son demasiados índices a mostrar con N grande
105 printf("threadIdx.y: %d, blockIdx.y: %d, row: %d\n",
106 threadIdx.y, blockIdx.y, row);
107 int col = blockIdx.x * blockDim.x + threadIdx.x;
108 if (col > B.width-1) /* Nos salimos del rango de índices válidos */
110 if(B.width <= 32) // Son demasiados índices a mostrar con N grande
111 printf("threadIdx.x: %d, blockIdx.x: %d, col: %d\n",
112 threadIdx.x, blockIdx.x, col);
114 for (int e = 0; e < A.width; e++)
115 Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];
116 C.elements[row * C.width + col] = Cvalue;
119 int main(int argc, char** argv) {
120 unsigned long numberOfIterations = 1;
121 MatrixForCUDA A, B, C;
122 std::shared_ptr<LPISupport::SampleCollection> pSamples = make_shared<LPISupport::SampleCollection>("execution time");
123 unsigned int size = 512;
124 unsigned int block_size = 32;
125 unsigned int SHOW_SIZE = 10, PRECISION_DIGITS = 10;
127 //printf("CUDA Runtime API reference performance measurement\n");
129 PerformanceTestArrayOpParallel* pPerfTest = new PerformanceTestArrayOpParallel(argc, argv);
130 auto pConfigTraits = std::dynamic_pointer_cast<PerformanceTestArrayOpParallel::ConfigTraits>(pPerfTest->getConfigTraits());
131 size = pConfigTraits->size;
132 numberOfIterations = pConfigTraits->repetitions;
138 if ((block_size = atoi(argv[3])) < 1) {
139 fprintf(stderr, "Invalid block size\n");
144 block_size = pConfigTraits->dimBlockOrLocalSize;
145 if (block_size > size)
148 dim3 dimBlock(block_size, block_size);
149 dim3 dimGrid(ceil((float)(size) / (float)(block_size)),
150 ceil((float)(size) / (float)(block_size)));
152 const unsigned int mem_size = sizeof(float) * size * size;
153 A.width=A.height=size;
154 B.width=B.height=size;
155 C.width=C.height=size;
156 A.elements = (float*)malloc(mem_size);
157 B.elements = (float*)malloc(mem_size);
158 C.elements = (float*)malloc(mem_size);
159 cerr << argv[0] << " performance measurement" << std::endl;
160 cout << "Starting program... " << flush;
161 cout << "Creating and filling arrays ... " << flush;
162 initArray(A.elements, A.height, A.width, 2.0);
163 initArray(B.elements, B.height, B.width, 2.0);
164 initArray(C.elements, C.height, C.width, 0.0);
165 cout << "Done." << endl;
166 print_array("a", A.elements, A.height, A.width, SHOW_SIZE);
167 print_array("b", B.elements, B.height, B.width, SHOW_SIZE);
170 cerr << "Executing " << numberOfIterations << " iteration(s)\n";
171 cerr << "- Matrix [1 .. " << size << ", 1 .. " << size << "]" << endl;
173 printf("Executing %lu iteration(s)\n", numberOfIterations);
174 printf("- Matrix [1 .. %u, 1 .. %u]\n", size, size);
175 printf("- Grid size %u\n", dimGrid.x);
176 printf("- Block size %u\n", dimBlock.x);
177 MatMul(A, B, C, block_size, dimGrid, dimBlock, pSamples, numberOfIterations);
178 cout << "Product finished." << endl;
180 // Number of simple calculations for array multiplication is
181 // ColsA products of two float numbers plus ColsA-1 sums of two float numbers
182 // for every element of output array (which has RowsC*ColsC elements)
183 // IMPORTANT!!: at least one expression operand must be casted to unsigned long type for the result being unsigned long;
184 // otherwise, in spite of operand values being less than integer type maximum value, the result may exceed unsigned int maximum value and be truncated
185 pConfigTraits->numOpsPerCalc = ((unsigned long) C.height) * C.width * (A.width*2-1);
186 cerr << "Number of operations: " << ((unsigned long) C.height * C.width * (A.width*2-1)) << std::endl;
187 cerr << "Number of operations stored in pConfigTraits->numOpsPerCalc: " << pConfigTraits->numOpsPerCalc << std::endl;
188 print_array<float>("c", C.elements, C.height, C.width, SHOW_SIZE);
190 pConfigTraits->deviceType = "GPU";
191 pConfigTraits->numDigitsPrec = PRECISION_DIGITS;
192 pPerfTest->buildTestInfo(pSamples);
193 pPerfTest->saveOrPrint();
200 } catch(string msg) {
201 cerr << "Exception caught in main(): " << msg << endl;