Peano
Loading...
Searching...
No Matches
KernelBenchmarks-main.cpp
Go to the documentation of this file.
1// This file is part of the ExaHyPE2 project. For conditions of distribution and
2// use, please see the copyright notice at www.peano-framework.org
3#include "KernelBenchmarks-main.h"
4
5#include "Constants.h"
6
7#include "repositories/DataRepository.h"
8#include "repositories/SolverRepository.h"
9#include "repositories/StepRepository.h"
10
11#include "exahype2/CellData.h"
12
13#include "exahype2/dg/DGUtils.h"
14
15#include "kernels/AderSolver/FusedSpaceTimePredictorVolumeIntegral.h"
16#include "kernels/AderSolver/FaceIntegral.h"
17#include "kernels/AderSolver/MaxScaledEigenvalue.h"
18#include "kernels/AderSolver/Quadrature.h"
19#include "kernels/AderSolver/DGMatrices.h"
20#include "kernels/AderSolver/RiemannSolver.h"
21#include "kernels/AderSolver/BufferSizes.h"
22
23#include "peano4/peano4.h"
24
25#include "tarch/NonCriticalAssertions.h"
26#include "tarch/accelerator/accelerator.h"
27#include "tarch/accelerator/Device.h"
28#include "tarch/multicore/multicore.h"
29#include "tarch/multicore/Core.h"
30#include "tarch/multicore/BooleanSemaphore.h"
31#include "tarch/multicore/Lock.h"
32#include "tarch/logging/Log.h"
33#include "tarch/timing/Measurement.h"
34#include "tarch/timing/Watch.h"
35
36#include <cstring>
37
38#include <fenv.h>
39#pragma float_control(precise, on)
40#pragma STDC FENV_ACCESS ON
41
43
44tarch::logging::Log _log("::");
45
46constexpr double TimeStamp = 0.0;
47constexpr double TimeStepSize = 1e-6;
48const tarch::la::Vector<DIMENSIONS,double> CellCenter = benchmarks::exahype2::kernelbenchmarks::DomainOffset + 0.5*benchmarks::exahype2::kernelbenchmarks::DomainSize;
49const tarch::la::Vector<DIMENSIONS,double> CellSize = benchmarks::exahype2::kernelbenchmarks::DomainSize / 81.0;
50// constexpr int HaloSize = 1;
51
52static_assert(Accuracy >= std::numeric_limits<double>::epsilon() || Accuracy == 0.0);
53
55 = (AderSolver::Order + 1)
56 * (AderSolver::Order + 1)
57#if DIMENSIONS == 3
58 * (AderSolver::Order + 1)
59#endif
60 * (AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables);
61
63 = 0;
64// = AderSolver::Order
65// * AderSolver::Order
66// #if DIMENSIONS == 3
67// * AderSolver::Order
68// #endif
69// * (AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables);
70
72 = AderSolver::Order
73 * AderSolver::Order
74#if DIMENSIONS == 3
75 * AderSolver::Order
76#endif
77 ;
78
79// Check the outcomes of each kernel
80double** validQ = nullptr;
81double* validMaxEigenvalue = nullptr;
82bool outcomeIsInvalid = false;
83
84tarch::timing::Measurement timingComputeKernel;
85
94void initInputData(double* Q) {
95 // for (int i = 0; i < NumberOfInputEntriesPerCell; i++) {
96 // Q[i] = std::sin(1.0 * i / (NumberOfInputEntriesPerCell) * tarch::la::PI);
97 // }
98
99 int linearisedIndex = 0;
100 dfor(index, AderSolver::Order + 1) {
101 repositories::instanceOfAderSolver.initialCondition(
102 Q + linearisedIndex,
103 ::exahype2::dg::getQuadraturePoint(
105 CellSize,
106 index,
107 repositories::instanceOfAderSolver.Order + 1,
108 kernels::AderSolver::Quadrature<double>::nodes
109 ),
110 CellSize,
111 //index,
112 true
113 );
114 linearisedIndex += AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables;
115 }
116
117}
118
128void allocateAndStoreOutcome(const double* const* Q,
129 const double* const maxEigenvalue,
130 const int numberOfCells
131) {
132 if constexpr (Accuracy <= 0.0) return;
133 if (validQ == nullptr and validMaxEigenvalue == nullptr) {
134 validQ = new double*[numberOfCells];
135 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
136 validQ[cellIndex] = new double[NumberOfOutputEntriesPerCell];
137 std::memcpy(validQ[cellIndex], Q[cellIndex], sizeof(double) * NumberOfOutputEntriesPerCell);
138 }
139 validMaxEigenvalue = new double[numberOfCells];
140 std::memcpy(validMaxEigenvalue, maxEigenvalue, sizeof(double) * numberOfCells);
141 logInfo("storeOutcome(...)", "bookmarked reference solution");
142 }
143}
144
145void freeOutcome(const int numberOfCells) {
146 if constexpr (Accuracy <= 0.0) return;
147 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
148 delete[] validQ[cellIndex];
149 }
150 delete[] validQ;
151 delete[] validMaxEigenvalue;
152 validQ = nullptr;
153 validMaxEigenvalue = nullptr;
154}
155
165 const double* const* Q,
166 const double* const maxEigenvalue,
167 const int numberOfCells
168) {
169 if constexpr (Accuracy <= 0.0) return;
170 int errors = 0;
171 double maxDifference = 0.0;
172
173 std::cerr.precision(16);
174 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
175 for (int i = 0; i < NumberOfOutputEntriesPerCell; i++) {
176 if (not tarch::la::equals(Q[cellIndex][i], validQ[cellIndex][i], Accuracy)) {
177 if (!errors) { // Only print once
178 logError("validateOutcome(...)",
179 std::fixed
180 << "cell " << cellIndex << ": "
181 << "Q[" << i << "]!=validQ[" << i << "] ("
182 << Q[cellIndex][i]
183 << "!="
184 << validQ[cellIndex][i]
185 << ")"
186 );
187 }
188 errors++;
189 maxDifference = std::max(maxDifference, std::abs(Q[cellIndex][i] - validQ[cellIndex][i]));
190 }
191 }
192
193 if (not tarch::la::equals(maxEigenvalue[cellIndex], validMaxEigenvalue[cellIndex], Accuracy)) {
194 if (!errors) {
195 logError("validateOutcome(...)",
196 std::fixed
197 << "maxEigenvalue[" << cellIndex << "]!=validMaxEigenvalue[" << cellIndex << "] ("
198 << maxEigenvalue[cellIndex] << "!=" << validMaxEigenvalue[cellIndex]
199 << ")";
200 );
201 }
202 errors++;
203 maxDifference = std::max(maxDifference, std::abs(maxEigenvalue[cellIndex] - validMaxEigenvalue[cellIndex]));
204 }
205 }
206
207 if (errors > 0) {
208 outcomeIsInvalid = true;
209 logError("validateOutcome(...)",
210 "max difference of outcome from all cells is "
211 << maxDifference
212 << " (admissible accuracy="
213 << Accuracy << ")"
214 << " for " << errors << " entries"
215 );
216 }
217}
218
228 const std::string& kernelIdentificator,
229 const tarch::timing::Measurement& timingKernelLaunch,
230 int numberOfCells
231) {
232 std::stringstream ss;
233 ss << "\n";
234 ss << kernelIdentificator << ":\n\t";
235 ss << timingComputeKernel.getValue() << " |\n\t";
236 ss << (timingComputeKernel.getValue() / numberOfCells ) << " |\n\t";
237 ss << timingComputeKernel.toString() << " |\n\t";
238 ss << timingKernelLaunch.getValue() << " |\n\t";
239 ss << (timingKernelLaunch.getValue() / numberOfCells );
240 ss << " |\n\t" << timingKernelLaunch.toString();
241 logInfo("reportRuntime()", ss.str());
242}
243
244/*
245 * Executes one full run of all of the kernels
246*/
247double runKernels(int device, exahype2::CellData<double, double>& cellData, int cellIndex, tarch::timing::Measurement& measurement){
248 double boundaryData[kernels::AderSolver::getBndFaceTotalSize()];
249 double* lQhbnd[2*DIMENSIONS] = {
250 boundaryData,
251 boundaryData + kernels::AderSolver::getBndFaceSize(),
252 boundaryData + 2*kernels::AderSolver::getBndFaceSize(),
253 boundaryData + 3*kernels::AderSolver::getBndFaceSize()
254#if DIMENSIONS==3
255 ,boundaryData + 4*kernels::AderSolver::getBndFaceSize(),
256 boundaryData + 5*kernels::AderSolver::getBndFaceSize()
257#endif
258 };
259
260 double boundaryFlux[kernels::AderSolver::getBndFluxTotalSize()];
261 double* lFhbnd[2*DIMENSIONS] = {
262 boundaryFlux,
263 boundaryFlux + kernels::AderSolver::getBndFluxSize(),
264 boundaryFlux + 2*kernels::AderSolver::getBndFluxSize(),
265 boundaryFlux + 3*kernels::AderSolver::getBndFluxSize()
266#if DIMENSIONS==3
267 ,boundaryFlux + 4*kernels::AderSolver::getBndFluxSize(),
268 boundaryFlux + 5*kernels::AderSolver::getBndFluxSize()
269#endif
270 };
271
272 tarch::timing::Watch watchKernelCompute("::runBenchmarks", "assessKernel(...)", false);
273
274 int numberOfIterations = kernels::AderSolver::fusedSpaceTimePredictorVolumeIntegral<double, double, double>(
275 repositories::instanceOfAderSolver,
276 lQhbnd, lFhbnd, cellData.QIn[cellIndex],
278 );
279
280 for (int d = 0; d < DIMENSIONS; d++) {
281 const int direction = d;
282
283 //For the Riemann solver, we wrap the domain as though it had periodic boundary
284 // conditions and solve the problem between the left and right faces of the cell.
285 tarch::la::Vector<DIMENSIONS, double> faceCentre = CellCenter;
286// faceCentre[d] -= 0.5 * CellSize[d];
287
288 kernels::AderSolver::riemannSolver<double>(
289 repositories::instanceOfAderSolver,
290 lFhbnd[d+DIMENSIONS],
291 lFhbnd[d],
292 lQhbnd[d+DIMENSIONS],
293 lQhbnd[d],
294 TimeStamp,
296 faceCentre,
297 CellSize,
298 direction,
299 false,
300 0
301 );
302
303 const double inverseDxDirection = 1.0 / CellSize[d];
304 // Negative face
305 kernels::AderSolver::faceIntegral(
306 cellData.QIn[cellIndex],
307 lFhbnd[d],
308 direction,
309 0,
310 inverseDxDirection,
312 );
313
314 // Positive face
315 faceCentre[d] += CellSize[d];
316 kernels::AderSolver::faceIntegral(
317 cellData.QIn[cellIndex],
318 lFhbnd[d+DIMENSIONS],
319 direction,
320 1,
321 inverseDxDirection,
323 );
324
325 } // for d
326
327 double maxEigenvalue = kernels::AderSolver::maxScaledEigenvalue(
328 repositories::instanceOfAderSolver,
329 cellData.QIn[cellIndex],
331 CellSize,
332 TimeStamp,
334 );
335
336 watchKernelCompute.stop();
337 measurement.setValue(watchKernelCompute.getCalendarTime());
338
339 return maxEigenvalue;
340
341}
342
348void runBenchmarks(int numberOfCells) {
349 exahype2::CellData<double, double> cellData(numberOfCells);
350 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
351 cellData.QIn[cellIndex] = tarch::allocateMemory<double>(NumberOfInputEntriesPerCell, tarch::MemoryLocation::Heap);
352 cellData.QOut[cellIndex] = nullptr; //tarch::allocateMemory<double>(NumberOfOutputEntriesPerCell, tarch::MemoryLocation::Heap);
353 cellData.cellCentre[cellIndex] = CellCenter;
354 cellData.cellSize[cellIndex] = CellSize;
355 cellData.maxEigenvalue[cellIndex] = 0.0;
356 initInputData(cellData.QIn[cellIndex]);
357 std::memset(cellData.QOut[cellIndex], 0.0, NumberOfOutputEntriesPerCell * sizeof(double));
358 }
359
360 cellData.t = TimeStamp;
361 cellData.dt = TimeStepSize;
362
363 auto assessKernel = [&](
364 std::function<double(int device, exahype2::CellData<double, double>& cellData, int cellIndex, tarch::timing::Measurement& measurement)> executeKernels,
365 const std::string& markerName,
366 const int device
367 ) -> void {
368 timingComputeKernel.erase();
369 tarch::timing::Measurement timingKernelLaunch;
370
371 int sample = 0;
372 while (sample <= NumberOfSamples) {
373 // Reset output data
374 #if defined(WITH_OPENMP)
375 #pragma omp parallel num_threads(NumberOfLaunchingThreads)
376 #endif
377 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
378 cellData.maxEigenvalue[cellIndex] = 0.0;
379 // std::memset(cellData.QOut[cellIndex], 0.0, NumberOfOutputEntriesPerCell * sizeof(double));
380
381 // parallelFor(launchingThread, NumberOfLaunchingThreads) {
382 // for (int launchingThread = 0; launchingThread < NumberOfLaunchingThreads; launchingThread++) {
383 tarch::timing::Watch watchKernelLaunch("::runBenchmarks", "assessKernel(...)", false);
384 executeKernels(device, cellData, cellIndex, timingComputeKernel);
385 watchKernelLaunch.stop();
386 timingKernelLaunch.setValue(watchKernelLaunch.getCalendarTime());
387 // } endParallelFor
388 // }
389 // #if defined(WITH_OPENMP)
390 // }
391 // #endif
392 }
393 sample++;
394 }
395
396 reportRuntime(markerName, timingKernelLaunch, numberOfCells);
397 allocateAndStoreOutcome(cellData.QOut, cellData.maxEigenvalue, numberOfCells);
398 validateOutcome(cellData.QOut, cellData.maxEigenvalue, numberOfCells);
399 };
400
401 if constexpr (AssessHostKernels) {
402 assessKernel(runKernels,
403 "host, stateless, batched, AoS, serial",
404 tarch::accelerator::Device::DefaultDevice
405 );
406 } // AssessHostKernels
407
408 for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
409 tarch::freeMemory(cellData.QIn[cellIndex], tarch::MemoryLocation::Heap);
410 tarch::freeMemory(cellData.QOut[cellIndex], tarch::MemoryLocation::Heap);
411 }
412}
413
414int main(int argc, char** argv) {
415 peano4::init(&argc, &argv, benchmarks::exahype2::kernelbenchmarks::DomainOffset, benchmarks::exahype2::kernelbenchmarks::DomainSize);
416 repositories::initLogFilters();
417
418 if constexpr (EnableFPE) {
419 feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
420 }
421
422 logInfo(
423 "main()",
424 "number of compute threads: "
425 << tarch::multicore::Core::getInstance().getNumberOfThreads()
426 );
427 logInfo(
428 "main()",
429 "number of threads launching compute kernels: "
430 << NumberOfLaunchingThreads
431 );
432 logInfo(
433 "main()",
434 "number of unknowns: "
435 << AderSolver::NumberOfUnknowns
436 );
437 logInfo(
438 "main()",
439 "number of auxiliary variables: "
440 << AderSolver::NumberOfAuxiliaryVariables
441 );
442 logInfo(
443 "main()",
444 "number of finite volumes per axis per cell: "
445 << AderSolver::Order
446 );
447 logInfo(
448 "main()",
449 "number of samples per measurement: "
450 << NumberOfSamples
451 );
452 logInfo(
453 "main()",
454 "floating-point exception handler enabled: "
455 << std::boolalpha << EnableFPE
456 );
457 logInfo(
458 "main()",
459 "performing accuracy checks with precision: "
460 << Accuracy
461 );
462#if defined(WITH_GPU_SYCL)
463 logInfo(
464 "main()",
465 "set SYCL_DEVICE_FILTER=gpu or ONEAPI_DEVICE_SELECTOR=cuda:0 when using SYCL on the device"
466 );
467 logInfo(
468 "main()",
469 "set SYCL_PI_TRACE=2 in case of runtime errors"
470 );
471#endif
472
473// #if defined(WITH_OPENMP)
474// #pragma omp parallel
475// {
476// #pragma omp master
477// {
478// #endif
479 for (int i = 0; i < NumberOfCellsToStudy.size(); i++) {
480 logInfo("main()", "number of cells: " << NumberOfCellsToStudy[i]);
481 runBenchmarks(NumberOfCellsToStudy[i]);
482 freeOutcome(NumberOfCellsToStudy[i]);
483 }
484// #if defined(WITH_OPENMP)
485// }
486// }
487// #endif
488
489 peano4::shutdown();
490
491 if (outcomeIsInvalid) {
492 return EXIT_FAILURE; // Make sure the CI pipeline reports an error
493 }
494
495 return EXIT_SUCCESS;
496}
void runKernels(exahype2::CellData< SolverPrecision, SolverPrecision > *myCellData, exahype2::CellFaceData< SolverPrecision, SolverPrecision > *myFaceData, const int cellId, const int faceId, tarch::timing::Measurement &measurement)
Definition Variant2.cpp:102
void freeOutcome(const int numberOfCells)
const tarch::la::Vector< DIMENSIONS, double > CellCenter
void runBenchmarks(int numberOfCells)
Run the benchmark for one particular number of cells.
int main(int argc, char **argv)
double runKernels(int device, exahype2::CellData< double, double > &cellData, int cellIndex, tarch::timing::Measurement &measurement)
void allocateAndStoreOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Allocates and stores outcome of one compute kernel.
constexpr double TimeStamp
void validateOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Validate data against pre-stored simulation outcome.
double ** validQ
bool outcomeIsInvalid
tarch::timing::Measurement timingComputeKernel
double * validMaxEigenvalue
tarch::logging::Log _log("::")
constexpr int NumberOfFiniteVolumesPerCell
constexpr double TimeStepSize
const tarch::la::Vector< DIMENSIONS, double > CellSize
constexpr int NumberOfInputEntriesPerCell
Definition Utils.h:14
void reportRuntime(const std::string &kernelIdentificator, const tarch::timing::Measurement &timingComputeKernel, const tarch::timing::Measurement &timingKernelLaunch, int numberOfCells, int numberOfThreads, tarch::logging::Log _log)
Reports the runtime and throughput of the benchmarks.
Definition Utils.h:68
constexpr int NumberOfOutputEntriesPerCell
Definition Utils.h:20
void initInputData(SolverPrecision *Q, const tarch::la::Vector< DIMENSIONS, double > CellCenter, const tarch::la::Vector< DIMENSIONS, double > CellSize)
Set input data.
Definition Utils.h:30