3 #include "KernelBenchmarks-main.h"
7 #include "repositories/DataRepository.h"
8 #include "repositories/SolverRepository.h"
9 #include "repositories/StepRepository.h"
11 #include "exahype2/CellData.h"
13 #include "exahype2/dg/DGUtils.h"
15 #include "kernels/AderSolver/FusedSpaceTimePredictorVolumeIntegral.h"
16 #include "kernels/AderSolver/FaceIntegral.h"
17 #include "kernels/AderSolver/MaxScaledEigenvalue.h"
18 #include "kernels/AderSolver/Quadrature.h"
19 #include "kernels/AderSolver/DGMatrices.h"
20 #include "kernels/AderSolver/RiemannSolver.h"
21 #include "kernels/AderSolver/BufferSizes.h"
23 #include "peano4/peano4.h"
25 #include "tarch/NonCriticalAssertions.h"
26 #include "tarch/accelerator/accelerator.h"
27 #include "tarch/accelerator/Device.h"
28 #include "tarch/multicore/multicore.h"
29 #include "tarch/multicore/Core.h"
30 #include "tarch/multicore/BooleanSemaphore.h"
31 #include "tarch/multicore/Lock.h"
32 #include "tarch/logging/Log.h"
33 #include "tarch/timing/Measurement.h"
34 #include "tarch/timing/Watch.h"
39 #pragma float_control(precise, on)
40 #pragma STDC FENV_ACCESS ON
44 tarch::logging::Log
_log(
"::");
48 const tarch::la::Vector<DIMENSIONS,double>
CellCenter = benchmarks::exahype2::kernelbenchmarks::DomainOffset + 0.5*benchmarks::exahype2::kernelbenchmarks::DomainSize;
49 const tarch::la::Vector<DIMENSIONS,double>
CellSize = benchmarks::exahype2::kernelbenchmarks::DomainSize / 81.0;
52 static_assert(Accuracy >= std::numeric_limits<double>::epsilon() || Accuracy == 0.0);
55 = (AderSolver::Order + 1)
56 * (AderSolver::Order + 1)
58 * (AderSolver::Order + 1)
60 * (AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables);
99 int linearisedIndex = 0;
100 dfor(index, AderSolver::Order + 1) {
101 repositories::instanceOfAderSolver.initialCondition(
103 ::exahype2::dg::getQuadraturePoint(
107 repositories::instanceOfAderSolver.Order + 1,
108 kernels::AderSolver::Quadrature<double>::nodes
114 linearisedIndex += AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables;
130 const int numberOfCells
132 if constexpr (Accuracy <= 0.0)
return;
134 validQ =
new double*[numberOfCells];
135 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
141 logInfo(
"storeOutcome(...)",
"bookmarked reference solution");
146 if constexpr (Accuracy <= 0.0)
return;
147 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
148 delete[]
validQ[cellIndex];
165 const double*
const* Q,
167 const int numberOfCells
169 if constexpr (Accuracy <= 0.0)
return;
171 double maxDifference = 0.0;
173 std::cerr.precision(16);
174 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
176 if (not tarch::la::equals(Q[cellIndex][i],
validQ[cellIndex][i], Accuracy)) {
178 logError(
"validateOutcome(...)",
180 <<
"cell " << cellIndex <<
": "
181 <<
"Q[" << i <<
"]!=validQ[" << i <<
"] ("
189 maxDifference = std::max(maxDifference, std::abs(Q[cellIndex][i] -
validQ[cellIndex][i]));
195 logError(
"validateOutcome(...)",
197 <<
"maxEigenvalue[" << cellIndex <<
"]!=validMaxEigenvalue[" << cellIndex <<
"] ("
209 logError(
"validateOutcome(...)",
210 "max difference of outcome from all cells is "
212 <<
" (admissible accuracy="
214 <<
" for " << errors <<
" entries"
228 const std::string& kernelIdentificator,
229 const tarch::timing::Measurement& timingKernelLaunch,
232 std::stringstream ss;
234 ss << kernelIdentificator <<
":\n\t";
238 ss << timingKernelLaunch.getValue() <<
" |\n\t";
239 ss << (timingKernelLaunch.getValue() / numberOfCells );
240 ss <<
" |\n\t" << timingKernelLaunch.toString();
241 logInfo(
"reportRuntime()", ss.str());
247 double runKernels(
int device, exahype2::CellData<double, double>& cellData,
int cellIndex, tarch::timing::Measurement& measurement){
248 double boundaryData[kernels::AderSolver::getBndFaceTotalSize()];
249 double* lQhbnd[2*DIMENSIONS] = {
251 boundaryData + kernels::AderSolver::getBndFaceSize(),
252 boundaryData + 2*kernels::AderSolver::getBndFaceSize(),
253 boundaryData + 3*kernels::AderSolver::getBndFaceSize()
255 ,boundaryData + 4*kernels::AderSolver::getBndFaceSize(),
256 boundaryData + 5*kernels::AderSolver::getBndFaceSize()
260 double boundaryFlux[kernels::AderSolver::getBndFluxTotalSize()];
261 double* lFhbnd[2*DIMENSIONS] = {
263 boundaryFlux + kernels::AderSolver::getBndFluxSize(),
264 boundaryFlux + 2*kernels::AderSolver::getBndFluxSize(),
265 boundaryFlux + 3*kernels::AderSolver::getBndFluxSize()
267 ,boundaryFlux + 4*kernels::AderSolver::getBndFluxSize(),
268 boundaryFlux + 5*kernels::AderSolver::getBndFluxSize()
272 tarch::timing::Watch watchKernelCompute(
"::runBenchmarks",
"assessKernel(...)",
false);
274 int numberOfIterations = kernels::AderSolver::fusedSpaceTimePredictorVolumeIntegral<double, double, double>(
275 repositories::instanceOfAderSolver,
276 lQhbnd, lFhbnd, cellData.QIn[cellIndex],
280 for (
int d = 0; d < DIMENSIONS; d++) {
281 const int direction = d;
285 tarch::la::Vector<DIMENSIONS, double> faceCentre =
CellCenter;
288 kernels::AderSolver::riemannSolver<double>(
289 repositories::instanceOfAderSolver,
290 lFhbnd[d+DIMENSIONS],
292 lQhbnd[d+DIMENSIONS],
303 const double inverseDxDirection = 1.0 /
CellSize[d];
305 kernels::AderSolver::faceIntegral(
306 cellData.QIn[cellIndex],
316 kernels::AderSolver::faceIntegral(
317 cellData.QIn[cellIndex],
318 lFhbnd[d+DIMENSIONS],
327 double maxEigenvalue = kernels::AderSolver::maxScaledEigenvalue(
328 repositories::instanceOfAderSolver,
329 cellData.QIn[cellIndex],
336 watchKernelCompute.stop();
337 measurement.setValue(watchKernelCompute.getCalendarTime());
349 exahype2::CellData<double, double> cellData(numberOfCells);
350 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
352 cellData.QOut[cellIndex] =
nullptr;
354 cellData.cellSize[cellIndex] =
CellSize;
355 cellData.maxEigenvalue[cellIndex] = 0.0;
363 auto assessKernel = [&](
364 std::function<double(
int device, exahype2::CellData<double, double>& cellData,
int cellIndex, tarch::timing::Measurement& measurement)> executeKernels,
365 const std::string& markerName,
369 tarch::timing::Measurement timingKernelLaunch;
372 while (sample <= NumberOfSamples) {
374 #if defined(WITH_OPENMP)
375 #pragma omp parallel num_threads(NumberOfLaunchingThreads)
377 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
378 cellData.maxEigenvalue[cellIndex] = 0.0;
383 tarch::timing::Watch watchKernelLaunch(
"::runBenchmarks",
"assessKernel(...)",
false);
385 watchKernelLaunch.stop();
386 timingKernelLaunch.setValue(watchKernelLaunch.getCalendarTime());
396 reportRuntime(markerName, timingKernelLaunch, numberOfCells);
401 if constexpr (AssessHostKernels) {
403 "host, stateless, batched, AoS, serial",
404 tarch::accelerator::Device::DefaultDevice
419 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
420 tarch::freeMemory(cellData.QIn[cellIndex], tarch::MemoryLocation::Heap);
421 tarch::freeMemory(cellData.QOut[cellIndex], tarch::MemoryLocation::Heap);
425 int main(
int argc,
char** argv) {
426 peano4::init(&argc, &argv, benchmarks::exahype2::kernelbenchmarks::DomainOffset, benchmarks::exahype2::kernelbenchmarks::DomainSize);
427 repositories::initLogFilters();
429 if constexpr (EnableFPE) {
430 feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
435 "number of compute threads: "
436 << tarch::multicore::Core::getInstance().getNumberOfThreads()
440 "number of threads launching compute kernels: "
441 << NumberOfLaunchingThreads
445 "number of unknowns: "
446 << AderSolver::NumberOfUnknowns
450 "number of auxiliary variables: "
451 << AderSolver::NumberOfAuxiliaryVariables
455 "number of finite volumes per axis per cell: "
460 "number of samples per measurement: "
465 "floating-point exception handler enabled: "
466 << std::boolalpha << EnableFPE
470 "performing accuracy checks with precision: "
473 #if defined(WITH_GPU_SYCL)
476 "set SYCL_DEVICE_FILTER=gpu or ONEAPI_DEVICE_SELECTOR=cuda:0 when using SYCL on the device"
480 "set SYCL_PI_TRACE=2 in case of runtime errors"
490 for (
int i = 0; i < NumberOfCellsToStudy.size(); i++) {
491 logInfo(
"main()",
"number of cells: " << NumberOfCellsToStudy[i]);
void freeOutcome(const int numberOfCells)
const tarch::la::Vector< DIMENSIONS, double > CellCenter
void runBenchmarks(int numberOfCells)
Run the benchmark for one particular number of cells.
int main(int argc, char **argv)
double runKernels(int device, exahype2::CellData< double, double > &cellData, int cellIndex, tarch::timing::Measurement &measurement)
void allocateAndStoreOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Allocates and stores outcome of one compute kernel.
constexpr double TimeStamp
void validateOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Validate data against pre-stored simulation outcome.
tarch::timing::Measurement timingComputeKernel
double * validMaxEigenvalue
tarch::logging::Log _log("::")
constexpr int NumberOfFiniteVolumesPerCell
constexpr double TimeStepSize
const tarch::la::Vector< DIMENSIONS, double > CellSize
static double maxEigenvalue(const double *const Q, int normal, const double CCZ4e, const double CCZ4ds, const double CCZ4GLMc, const double CCZ4GLMd)
constexpr int NumberOfInputEntriesPerCell
void reportRuntime(const std::string &kernelIdentificator, const tarch::timing::Measurement &timingComputeKernel, const tarch::timing::Measurement &timingKernelLaunch, int numberOfCells, int numberOfThreads, tarch::logging::Log _log)
Reports the runtime and throughput of the benchmarks.
constexpr int NumberOfOutputEntriesPerCell
void initInputData(SolverPrecision *Q, const tarch::la::Vector< DIMENSIONS, double > CellCenter, const tarch::la::Vector< DIMENSIONS, double > CellSize)
Set input data.