3#include "KernelBenchmarks-main.h"
7#include "repositories/DataRepository.h"
8#include "repositories/SolverRepository.h"
9#include "repositories/StepRepository.h"
11#include "exahype2/CellData.h"
13#include "exahype2/dg/DGUtils.h"
15#include "kernels/AderSolver/FusedSpaceTimePredictorVolumeIntegral.h"
16#include "kernels/AderSolver/FaceIntegral.h"
17#include "kernels/AderSolver/MaxScaledEigenvalue.h"
18#include "kernels/AderSolver/Quadrature.h"
19#include "kernels/AderSolver/DGMatrices.h"
20#include "kernels/AderSolver/RiemannSolver.h"
21#include "kernels/AderSolver/BufferSizes.h"
23#include "peano4/peano4.h"
25#include "tarch/NonCriticalAssertions.h"
26#include "tarch/accelerator/accelerator.h"
27#include "tarch/accelerator/Device.h"
28#include "tarch/multicore/multicore.h"
29#include "tarch/multicore/Core.h"
30#include "tarch/multicore/BooleanSemaphore.h"
31#include "tarch/multicore/Lock.h"
32#include "tarch/logging/Log.h"
33#include "tarch/timing/Measurement.h"
34#include "tarch/timing/Watch.h"
39#pragma float_control(precise, on)
40#pragma STDC FENV_ACCESS ON
44tarch::logging::Log
_log(
"::");
48const tarch::la::Vector<DIMENSIONS,double>
CellCenter = benchmarks::exahype2::kernelbenchmarks::DomainOffset + 0.5*benchmarks::exahype2::kernelbenchmarks::DomainSize;
49const tarch::la::Vector<DIMENSIONS,double>
CellSize = benchmarks::exahype2::kernelbenchmarks::DomainSize / 81.0;
52static_assert(Accuracy >= std::numeric_limits<double>::epsilon() || Accuracy == 0.0);
55 = (AderSolver::Order + 1)
56 * (AderSolver::Order + 1)
58 * (AderSolver::Order + 1)
60 * (AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables);
99 int linearisedIndex = 0;
100 dfor(index, AderSolver::Order + 1) {
101 repositories::instanceOfAderSolver.initialCondition(
103 ::exahype2::dg::getQuadraturePoint(
107 repositories::instanceOfAderSolver.Order + 1,
108 kernels::AderSolver::Quadrature<double>::nodes
114 linearisedIndex += AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables;
129 const double*
const maxEigenvalue,
130 const int numberOfCells
132 if constexpr (Accuracy <= 0.0)
return;
134 validQ =
new double*[numberOfCells];
135 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
141 logInfo(
"storeOutcome(...)",
"bookmarked reference solution");
146 if constexpr (Accuracy <= 0.0)
return;
147 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
148 delete[]
validQ[cellIndex];
165 const double*
const* Q,
166 const double*
const maxEigenvalue,
167 const int numberOfCells
169 if constexpr (Accuracy <= 0.0)
return;
171 double maxDifference = 0.0;
173 std::cerr.precision(16);
174 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
176 if (not tarch::la::equals(Q[cellIndex][i],
validQ[cellIndex][i], Accuracy)) {
178 logError(
"validateOutcome(...)",
180 <<
"cell " << cellIndex <<
": "
181 <<
"Q[" << i <<
"]!=validQ[" << i <<
"] ("
189 maxDifference = std::max(maxDifference, std::abs(Q[cellIndex][i] -
validQ[cellIndex][i]));
193 if (not tarch::la::equals(maxEigenvalue[cellIndex],
validMaxEigenvalue[cellIndex], Accuracy)) {
195 logError(
"validateOutcome(...)",
197 <<
"maxEigenvalue[" << cellIndex <<
"]!=validMaxEigenvalue[" << cellIndex <<
"] ("
203 maxDifference = std::max(maxDifference, std::abs(maxEigenvalue[cellIndex] -
validMaxEigenvalue[cellIndex]));
209 logError(
"validateOutcome(...)",
210 "max difference of outcome from all cells is "
212 <<
" (admissible accuracy="
214 <<
" for " << errors <<
" entries"
228 const std::string& kernelIdentificator,
229 const tarch::timing::Measurement& timingKernelLaunch,
232 std::stringstream ss;
234 ss << kernelIdentificator <<
":\n\t";
238 ss << timingKernelLaunch.getValue() <<
" |\n\t";
239 ss << (timingKernelLaunch.getValue() / numberOfCells );
240 ss <<
" |\n\t" << timingKernelLaunch.toString();
241 logInfo(
"reportRuntime()", ss.str());
247double runKernels(
int device, exahype2::CellData<double, double>& cellData,
int cellIndex, tarch::timing::Measurement& measurement){
248 double boundaryData[kernels::AderSolver::getBndFaceTotalSize()];
249 double* lQhbnd[2*DIMENSIONS] = {
251 boundaryData + kernels::AderSolver::getBndFaceSize(),
252 boundaryData + 2*kernels::AderSolver::getBndFaceSize(),
253 boundaryData + 3*kernels::AderSolver::getBndFaceSize()
255 ,boundaryData + 4*kernels::AderSolver::getBndFaceSize(),
256 boundaryData + 5*kernels::AderSolver::getBndFaceSize()
260 double boundaryFlux[kernels::AderSolver::getBndFluxTotalSize()];
261 double* lFhbnd[2*DIMENSIONS] = {
263 boundaryFlux + kernels::AderSolver::getBndFluxSize(),
264 boundaryFlux + 2*kernels::AderSolver::getBndFluxSize(),
265 boundaryFlux + 3*kernels::AderSolver::getBndFluxSize()
267 ,boundaryFlux + 4*kernels::AderSolver::getBndFluxSize(),
268 boundaryFlux + 5*kernels::AderSolver::getBndFluxSize()
272 tarch::timing::Watch watchKernelCompute(
"::runBenchmarks",
"assessKernel(...)",
false);
274 int numberOfIterations = kernels::AderSolver::fusedSpaceTimePredictorVolumeIntegral<double, double, double>(
275 repositories::instanceOfAderSolver,
276 lQhbnd, lFhbnd, cellData.QIn[cellIndex],
280 for (
int d = 0; d < DIMENSIONS; d++) {
281 const int direction = d;
285 tarch::la::Vector<DIMENSIONS, double> faceCentre =
CellCenter;
288 kernels::AderSolver::riemannSolver<double>(
289 repositories::instanceOfAderSolver,
290 lFhbnd[d+DIMENSIONS],
292 lQhbnd[d+DIMENSIONS],
303 const double inverseDxDirection = 1.0 /
CellSize[d];
305 kernels::AderSolver::faceIntegral(
306 cellData.QIn[cellIndex],
316 kernels::AderSolver::faceIntegral(
317 cellData.QIn[cellIndex],
318 lFhbnd[d+DIMENSIONS],
327 double maxEigenvalue = kernels::AderSolver::maxScaledEigenvalue(
328 repositories::instanceOfAderSolver,
329 cellData.QIn[cellIndex],
336 watchKernelCompute.stop();
337 measurement.setValue(watchKernelCompute.getCalendarTime());
339 return maxEigenvalue;
349 exahype2::CellData<double, double> cellData(numberOfCells);
350 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
352 cellData.QOut[cellIndex] =
nullptr;
354 cellData.cellSize[cellIndex] =
CellSize;
355 cellData.maxEigenvalue[cellIndex] = 0.0;
363 auto assessKernel = [&](
364 std::function<double(
int device, exahype2::CellData<double, double>& cellData,
int cellIndex, tarch::timing::Measurement& measurement)> executeKernels,
365 const std::string& markerName,
369 tarch::timing::Measurement timingKernelLaunch;
372 while (sample <= NumberOfSamples) {
374 #if defined(WITH_OPENMP)
375 #pragma omp parallel num_threads(NumberOfLaunchingThreads)
377 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
378 cellData.maxEigenvalue[cellIndex] = 0.0;
383 tarch::timing::Watch watchKernelLaunch(
"::runBenchmarks",
"assessKernel(...)",
false);
385 watchKernelLaunch.stop();
386 timingKernelLaunch.setValue(watchKernelLaunch.getCalendarTime());
396 reportRuntime(markerName, timingKernelLaunch, numberOfCells);
401 if constexpr (AssessHostKernels) {
403 "host, stateless, batched, AoS, serial",
404 tarch::accelerator::Device::DefaultDevice
408 for (
int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
409 tarch::freeMemory(cellData.QIn[cellIndex], tarch::MemoryLocation::Heap);
410 tarch::freeMemory(cellData.QOut[cellIndex], tarch::MemoryLocation::Heap);
414int main(
int argc,
char** argv) {
415 peano4::init(&argc, &argv, benchmarks::exahype2::kernelbenchmarks::DomainOffset, benchmarks::exahype2::kernelbenchmarks::DomainSize);
416 repositories::initLogFilters();
418 if constexpr (EnableFPE) {
419 feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
424 "number of compute threads: "
425 << tarch::multicore::Core::getInstance().getNumberOfThreads()
429 "number of threads launching compute kernels: "
430 << NumberOfLaunchingThreads
434 "number of unknowns: "
435 << AderSolver::NumberOfUnknowns
439 "number of auxiliary variables: "
440 << AderSolver::NumberOfAuxiliaryVariables
444 "number of finite volumes per axis per cell: "
449 "number of samples per measurement: "
454 "floating-point exception handler enabled: "
455 << std::boolalpha << EnableFPE
459 "performing accuracy checks with precision: "
462#if defined(WITH_GPU_SYCL)
465 "set SYCL_DEVICE_FILTER=gpu or ONEAPI_DEVICE_SELECTOR=cuda:0 when using SYCL on the device"
469 "set SYCL_PI_TRACE=2 in case of runtime errors"
479 for (
int i = 0; i < NumberOfCellsToStudy.size(); i++) {
480 logInfo(
"main()",
"number of cells: " << NumberOfCellsToStudy[i]);
void runKernels(exahype2::CellData< SolverPrecision, SolverPrecision > *myCellData, exahype2::CellFaceData< SolverPrecision, SolverPrecision > *myFaceData, const int cellId, const int faceId, tarch::timing::Measurement &measurement)
void freeOutcome(const int numberOfCells)
const tarch::la::Vector< DIMENSIONS, double > CellCenter
void runBenchmarks(int numberOfCells)
Run the benchmark for one particular number of cells.
int main(int argc, char **argv)
double runKernels(int device, exahype2::CellData< double, double > &cellData, int cellIndex, tarch::timing::Measurement &measurement)
void allocateAndStoreOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Allocates and stores outcome of one compute kernel.
constexpr double TimeStamp
void validateOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Validate data against pre-stored simulation outcome.
tarch::timing::Measurement timingComputeKernel
double * validMaxEigenvalue
tarch::logging::Log _log("::")
constexpr int NumberOfFiniteVolumesPerCell
constexpr double TimeStepSize
const tarch::la::Vector< DIMENSIONS, double > CellSize
constexpr int NumberOfInputEntriesPerCell
void reportRuntime(const std::string &kernelIdentificator, const tarch::timing::Measurement &timingComputeKernel, const tarch::timing::Measurement &timingKernelLaunch, int numberOfCells, int numberOfThreads, tarch::logging::Log _log)
Reports the runtime and throughput of the benchmarks.
constexpr int NumberOfOutputEntriesPerCell
void initInputData(SolverPrecision *Q, const tarch::la::Vector< DIMENSIONS, double > CellCenter, const tarch::la::Vector< DIMENSIONS, double > CellSize)
Set input data.