Peano
Loading...
Searching...
No Matches
KernelBenchmarks-main.cpp
Go to the documentation of this file.
1// This file is part of the ExaHyPE2 project. For conditions of distribution and
2// use, please see the copyright notice at www.peano-framework.org
3#include "KernelBenchmarks-main.h"
4
5#include "Constants.h"
6
7#include "repositories/DataRepository.h"
8#include "repositories/SolverRepository.h"
9#include "repositories/StepRepository.h"
10
11#include "exahype2/CellData.h"
12
13#include "exahype2/dg/DGUtils.h"
14
15#include "kernels/AderSolver/FusedSpaceTimePredictorVolumeIntegral.h"
16#include "kernels/AderSolver/FaceIntegral.h"
17#include "kernels/AderSolver/MaxScaledEigenvalue.h"
18#include "kernels/AderSolver/Quadrature.h"
19#include "kernels/AderSolver/DGMatrices.h"
20#include "kernels/AderSolver/RiemannSolver.h"
21#include "kernels/AderSolver/BufferSizes.h"
22
23#include "peano4/peano4.h"
24
25#include "tarch/NonCriticalAssertions.h"
26#include "tarch/accelerator/accelerator.h"
27#include "tarch/accelerator/Device.h"
28#include "tarch/multicore/multicore.h"
29#include "tarch/multicore/Core.h"
30#include "tarch/multicore/BooleanSemaphore.h"
31#include "tarch/multicore/Lock.h"
32#include "tarch/logging/Log.h"
33#include "tarch/timing/Measurement.h"
34#include "tarch/timing/Watch.h"
35
36#include <cstring>
37
39
40tarch::logging::Log _log("::");
41
42constexpr double TimeStamp = 0.0;
43constexpr double TimeStepSize = 1e-6;
44const tarch::la::Vector<DIMENSIONS,double> CellCenter = benchmarks::exahype2::kernelbenchmarks::DomainOffset + 0.5*benchmarks::exahype2::kernelbenchmarks::DomainSize;
45const tarch::la::Vector<DIMENSIONS,double> CellSize = benchmarks::exahype2::kernelbenchmarks::DomainSize / 81.0;
46// constexpr int HaloSize = 1;
47
48static_assert(Accuracy >= std::numeric_limits<double>::epsilon() || Accuracy == 0.0);
49
51 = (AderSolver::Order + 1)
52 * (AderSolver::Order + 1)
53#if DIMENSIONS == 3
54 * (AderSolver::Order + 1)
55#endif
56 * (AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables);
57
59 = 0;
60// = AderSolver::Order
61// * AderSolver::Order
62// #if DIMENSIONS == 3
63// * AderSolver::Order
64// #endif
65// * (AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables);
66
68 = AderSolver::Order
69 * AderSolver::Order
70#if DIMENSIONS == 3
71 * AderSolver::Order
72#endif
73 ;
74
75// Check the outcomes of each kernel
76double** validQ = nullptr;
77double* validMaxEigenvalue = nullptr;
78bool outcomeIsInvalid = false;
79
80tarch::timing::Measurement timingComputeKernel;
81
90void initInputData(double* Q) {
91 // for (int i = 0; i < NumberOfInputEntriesPerCell; i++) {
92 // Q[i] = std::sin(1.0 * i / (NumberOfInputEntriesPerCell) * tarch::la::PI);
93 // }
94
95 int linearisedIndex = 0;
96 dfor(index, AderSolver::Order + 1) {
97 repositories::instanceOfAderSolver.initialCondition(
98 Q + linearisedIndex,
99 ::exahype2::dg::getQuadraturePoint(
101 CellSize,
102 index,
103 repositories::instanceOfAderSolver.Order + 1,
104 kernels::AderSolver::Quadrature<double>::nodes
105 ),
107 //index,
108 );
109 linearisedIndex += AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables;
110 }
111
112}
113
123void allocateAndStoreOutcome(const double* const* Q,
124 const double* const maxEigenvalue,
125 const int numberOfCells
126) {
127 if constexpr (Accuracy <= 0.0) return;
128 if (validQ == nullptr and validMaxEigenvalue == nullptr) {
129 validQ = new double*[numberOfCells];
130 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
131 validQ[cellIndex] = new double[NumberOfOutputEntriesPerCell];
132 std::memcpy(validQ[cellIndex], Q[cellIndex], sizeof(double) * NumberOfOutputEntriesPerCell);
133 }
134 validMaxEigenvalue = new double[numberOfCells];
135 std::memcpy(validMaxEigenvalue, maxEigenvalue, sizeof(double) * numberOfCells);
136 logInfo("storeOutcome(...)", "bookmarked reference solution");
137 }
138}
139
140void freeOutcome(const int numberOfCells) {
141 if constexpr (Accuracy <= 0.0) return;
142 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
143 delete[] validQ[cellIndex];
144 }
145 delete[] validQ;
146 delete[] validMaxEigenvalue;
147 validQ = nullptr;
148 validMaxEigenvalue = nullptr;
149}
150
160 const double* const* Q,
161 const double* const maxEigenvalue,
162 const int numberOfCells
163) {
164 if constexpr (Accuracy <= 0.0) return;
165 int errors = 0;
166 double maxDifference = 0.0;
167
168 std::cerr.precision(16);
169 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
170 for (int i = 0; i < NumberOfOutputEntriesPerCell; i++) {
171 if (not tarch::la::equals(Q[cellIndex][i], validQ[cellIndex][i], Accuracy)) {
172 if (!errors) { // Only print once
173 logError("validateOutcome(...)",
174 std::fixed
175 << "cell " << cellIndex << ": "
176 << "Q[" << i << "]!=validQ[" << i << "] ("
177 << Q[cellIndex][i]
178 << "!="
179 << validQ[cellIndex][i]
180 << ")"
181 );
182 }
183 errors++;
184 maxDifference = std::max(maxDifference, std::abs(Q[cellIndex][i] - validQ[cellIndex][i]));
185 }
186 }
187
188 if (not tarch::la::equals(maxEigenvalue[cellIndex], validMaxEigenvalue[cellIndex], Accuracy)) {
189 if (!errors) {
190 logError("validateOutcome(...)",
191 std::fixed
192 << "maxEigenvalue[" << cellIndex << "]!=validMaxEigenvalue[" << cellIndex << "] ("
193 << maxEigenvalue[cellIndex] << "!=" << validMaxEigenvalue[cellIndex]
194 << ")";
195 );
196 }
197 errors++;
198 maxDifference = std::max(maxDifference, std::abs(maxEigenvalue[cellIndex] - validMaxEigenvalue[cellIndex]));
199 }
200 }
201
202 if (errors > 0) {
203 outcomeIsInvalid = true;
204 logError("validateOutcome(...)",
205 "max difference of outcome from all cells is "
206 << maxDifference
207 << " (admissible accuracy="
208 << Accuracy << ")"
209 << " for " << errors << " entries"
210 );
211 }
212}
213
223 const std::string& kernelIdentificator,
224 const tarch::timing::Measurement& timingKernelLaunch,
225 int numberOfCells
226) {
227 std::stringstream ss;
228 ss << "\n";
229 ss << kernelIdentificator << ":\n\t";
230 ss << timingComputeKernel.getValue() << " |\n\t";
231 ss << (timingComputeKernel.getValue() / numberOfCells ) << " |\n\t";
232 ss << timingComputeKernel.toString() << " |\n\t";
233 ss << timingKernelLaunch.getValue() << " |\n\t";
234 ss << (timingKernelLaunch.getValue() / numberOfCells );
235 ss << " |\n\t" << timingKernelLaunch.toString();
236 logInfo("reportRuntime()", ss.str());
237}
238
239/*
240 * Executes one full run of all of the kernels
241*/
242double runKernels(int device, exahype2::CellData<double, double>& cellData, int cellIndex, tarch::timing::Measurement& measurement){
243 // We must ensure that the jump from one face to the next is a multiple of
244 // the ALIGNMENT.
245 // Example: If ALIGNMENT is 64 bytes (AVX512) and double is 8 bytes,
246 // we need the stride to be a multiple of 8 doubles.
247 constexpr int AlignmentDoubles = ALIGNMENT / sizeof(double);
248 constexpr int PaddedBndFaceSize = ((kernels::AderSolver::getBndFaceSize() + AlignmentDoubles - 1) / AlignmentDoubles) * AlignmentDoubles;
249 constexpr int PaddedBndFluxSize = ((kernels::AderSolver::getBndFluxSize() + AlignmentDoubles - 1) / AlignmentDoubles) * AlignmentDoubles;
250
251 double boundaryData[2 * DIMENSIONS * PaddedBndFaceSize] __attribute__((aligned(ALIGNMENT)));
252 double* lQhbnd[2 * DIMENSIONS] = {
253 boundaryData,
254 boundaryData + PaddedBndFaceSize,
255 boundaryData + 2 * PaddedBndFaceSize,
256 boundaryData + 3 * PaddedBndFaceSize
257#if DIMENSIONS==3
258 ,boundaryData + 4 * PaddedBndFaceSize,
259 boundaryData + 5 * PaddedBndFaceSize
260#endif
261 };
262
263 double boundaryFlux[2 * DIMENSIONS * PaddedBndFluxSize] __attribute__((aligned(ALIGNMENT)));
264 double* lFhbnd[2 * DIMENSIONS] = {
265 boundaryFlux,
266 boundaryFlux + PaddedBndFluxSize,
267 boundaryFlux + 2 * PaddedBndFluxSize,
268 boundaryFlux + 3 * PaddedBndFluxSize
269#if DIMENSIONS==3
270 ,boundaryFlux + 4 * PaddedBndFluxSize,
271 boundaryFlux + 5 * PaddedBndFluxSize
272#endif
273 };
274
275 tarch::timing::Watch watchKernelCompute("::runBenchmarks", "assessKernel(...)", false);
276
277 int numberOfIterations = kernels::AderSolver::fusedSpaceTimePredictorVolumeIntegral<double, double, double>(
278 repositories::instanceOfAderSolver,
279 lQhbnd, lFhbnd, cellData.QIn[cellIndex],
281 );
282
283 for (int d = 0; d < DIMENSIONS; d++) {
284 const int direction = d;
285
286 //For the Riemann solver, we wrap the domain as though it had periodic boundary
287 // conditions and solve the problem between the left and right faces of the cell.
288 tarch::la::Vector<DIMENSIONS, double> faceCentre = CellCenter;
289// faceCentre[d] -= 0.5 * CellSize[d];
290
291 kernels::AderSolver::riemannSolver<double>(
292 repositories::instanceOfAderSolver,
293 lFhbnd[d+DIMENSIONS],
294 lFhbnd[d],
295 lQhbnd[d+DIMENSIONS],
296 lQhbnd[d],
297 TimeStamp,
299 faceCentre,
300 CellSize,
301 direction,
302 false,
303 0
304 );
305
306 const double inverseDxDirection = 1.0 / CellSize[d];
307 // Negative face
308 kernels::AderSolver::faceIntegral(
309 cellData.QIn[cellIndex],
310 lFhbnd[d],
311 direction,
312 0,
313 inverseDxDirection,
315 );
316
317 // Positive face
318 faceCentre[d] += CellSize[d];
319 kernels::AderSolver::faceIntegral(
320 cellData.QIn[cellIndex],
321 lFhbnd[d+DIMENSIONS],
322 direction,
323 1,
324 inverseDxDirection,
326 );
327
328 } // for d
329
330 double maxEigenvalue = kernels::AderSolver::maxScaledEigenvalue(
331 repositories::instanceOfAderSolver,
332 cellData.QIn[cellIndex],
334 CellSize,
335 TimeStamp,
337 );
338
339 watchKernelCompute.stop();
340 measurement.setValue(watchKernelCompute.getCalendarTime());
341
342 return maxEigenvalue;
343
344}
345
351void runBenchmarks(int numberOfCells) {
352 exahype2::CellData<double, double> cellData(numberOfCells);
353 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
354 cellData.QIn[cellIndex] = tarch::allocateMemory<double>(NumberOfInputEntriesPerCell, tarch::MemoryLocation::Heap);
355 cellData.QOut[cellIndex] = nullptr; //tarch::allocateMemory<double>(NumberOfOutputEntriesPerCell, tarch::MemoryLocation::Heap);
356 cellData.cellCentre[cellIndex] = CellCenter;
357 cellData.cellSize[cellIndex] = CellSize;
358 cellData.maxEigenvalue[cellIndex] = 0.0;
359 initInputData(cellData.QIn[cellIndex]);
360 std::memset(cellData.QOut[cellIndex], 0.0, NumberOfOutputEntriesPerCell * sizeof(double));
361 }
362
363 cellData.t = TimeStamp;
364 cellData.dt = TimeStepSize;
365
366 auto assessKernel = [&](
367 std::function<double(int device, exahype2::CellData<double, double>& cellData, int cellIndex, tarch::timing::Measurement& measurement)> executeKernels,
368 const std::string& markerName,
369 const int device
370 ) -> void {
371 timingComputeKernel.erase();
372 tarch::timing::Measurement timingKernelLaunch;
373
374 int sample = 0;
375 while (sample <= NumberOfSamples) {
376 // Reset output data
377 #if defined(WITH_OPENMP)
378 #pragma omp parallel num_threads(NumberOfLaunchingThreads)
379 #endif
380 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
381 cellData.maxEigenvalue[cellIndex] = 0.0;
382 // std::memset(cellData.QOut[cellIndex], 0.0, NumberOfOutputEntriesPerCell * sizeof(double));
383
384 // parallelFor(launchingThread, NumberOfLaunchingThreads) {
385 // for (int launchingThread = 0; launchingThread < NumberOfLaunchingThreads; launchingThread++) {
386 tarch::timing::Watch watchKernelLaunch("::runBenchmarks", "assessKernel(...)", false);
387 executeKernels(device, cellData, cellIndex, timingComputeKernel);
388 watchKernelLaunch.stop();
389 timingKernelLaunch.setValue(watchKernelLaunch.getCalendarTime());
390 // } endParallelFor
391 // }
392 // #if defined(WITH_OPENMP)
393 // }
394 // #endif
395 }
396 sample++;
397 }
398
399 reportRuntime(markerName, timingKernelLaunch, numberOfCells);
400 allocateAndStoreOutcome(cellData.QOut, cellData.maxEigenvalue, numberOfCells);
401 validateOutcome(cellData.QOut, cellData.maxEigenvalue, numberOfCells);
402 };
403
404 if constexpr (AssessHostKernels) {
405 assessKernel(runKernels,
406 "host, stateless, batched, AoS, serial",
407 tarch::accelerator::Device::DefaultDevice
408 );
409 } // AssessHostKernels
410
411 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
412 tarch::freeMemory(cellData.QIn[cellIndex], tarch::MemoryLocation::Heap);
413 tarch::freeMemory(cellData.QOut[cellIndex], tarch::MemoryLocation::Heap);
414 }
415}
416
417int main(int argc, char** argv) {
418 peano4::init(&argc, &argv, benchmarks::exahype2::kernelbenchmarks::DomainOffset, benchmarks::exahype2::kernelbenchmarks::DomainSize);
419 repositories::initLogFilters();
420 repositories::startSimulation();
421
422 logInfo(
423 "main()",
424 "number of compute threads: "
425 << tarch::multicore::Core::getInstance().getNumberOfThreads()
426 );
427 logInfo(
428 "main()",
429 "number of threads launching compute kernels: "
430 << NumberOfLaunchingThreads
431 );
432 logInfo(
433 "main()",
434 "number of unknowns: "
435 << AderSolver::NumberOfUnknowns
436 );
437 logInfo(
438 "main()",
439 "number of auxiliary variables: "
440 << AderSolver::NumberOfAuxiliaryVariables
441 );
442 logInfo(
443 "main()",
444 "number of finite volumes per axis per cell: "
445 << AderSolver::Order
446 );
447 logInfo(
448 "main()",
449 "number of samples per measurement: "
450 << NumberOfSamples
451 );
452 logInfo(
453 "main()",
454 "performing accuracy checks with precision: "
455 << Accuracy
456 );
457#if defined(WITH_GPU_SYCL)
458 logInfo(
459 "main()",
460 "set SYCL_DEVICE_FILTER=gpu or ONEAPI_DEVICE_SELECTOR=cuda:0 when using SYCL on the device"
461 );
462 logInfo(
463 "main()",
464 "set SYCL_PI_TRACE=2 in case of runtime errors"
465 );
466#endif
467
468// #if defined(WITH_OPENMP)
469// #pragma omp parallel
470// {
471// #pragma omp master
472// {
473// #endif
474 for (int i = 0; i < NumberOfCellsToStudy.size(); i++) {
475 logInfo("main()", "number of cells: " << NumberOfCellsToStudy[i]);
476 runBenchmarks(NumberOfCellsToStudy[i]);
477 freeOutcome(NumberOfCellsToStudy[i]);
478 }
479// #if defined(WITH_OPENMP)
480// }
481// }
482// #endif
483
484 repositories::finishSimulation();
485 peano4::shutdown();
486
487 if (outcomeIsInvalid) {
488 return EXIT_FAILURE; // Make sure the CI pipeline reports an error
489 }
490
491 return EXIT_SUCCESS;
492}
void freeOutcome(const int numberOfCells)
const tarch::la::Vector< DIMENSIONS, double > CellCenter
void runBenchmarks(int numberOfCells)
Run the benchmark for one particular number of cells.
int main(int argc, char **argv)
double runKernels(int device, exahype2::CellData< double, double > &cellData, int cellIndex, tarch::timing::Measurement &measurement)
void allocateAndStoreOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Allocates and stores outcome of one compute kernel.
constexpr double TimeStamp
void validateOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Validate data against pre-stored simulation outcome.
double ** validQ
bool outcomeIsInvalid
tarch::timing::Measurement timingComputeKernel
double * validMaxEigenvalue
tarch::logging::Log _log("::")
constexpr int NumberOfFiniteVolumesPerCell
constexpr double TimeStepSize
const tarch::la::Vector< DIMENSIONS, double > CellSize
constexpr int NumberOfInputEntriesPerCell
Definition Utils.h:14
void reportRuntime(const std::string &kernelIdentificator, const tarch::timing::Measurement &timingComputeKernel, const tarch::timing::Measurement &timingKernelLaunch, int numberOfCells, int numberOfThreads, tarch::logging::Log _log)
Reports the runtime and throughput of the benchmarks.
Definition Utils.h:67
constexpr int NumberOfOutputEntriesPerCell
Definition Utils.h:20
void initInputData(SolverPrecision *Q, const tarch::la::Vector< DIMENSIONS, double > CellCenter, const tarch::la::Vector< DIMENSIONS, double > CellSize)
Set input data.
Definition Utils.h:30