Peano
KernelBenchmarks-main.cpp
Go to the documentation of this file.
1 // This file is part of the ExaHyPE2 project. For conditions of distribution and
2 // use, please see the copyright notice at www.peano-framework.org
3 #include "KernelBenchmarks-main.h"
4 
5 #include "Constants.h"
6 
7 #include "repositories/DataRepository.h"
8 #include "repositories/SolverRepository.h"
9 #include "repositories/StepRepository.h"
10 
11 #include "exahype2/CellData.h"
12 
13 #include "exahype2/dg/DGUtils.h"
14 
15 #include "kernels/AderSolver/FusedSpaceTimePredictorVolumeIntegral.h"
16 #include "kernels/AderSolver/FaceIntegral.h"
17 #include "kernels/AderSolver/MaxScaledEigenvalue.h"
18 #include "kernels/AderSolver/Quadrature.h"
19 #include "kernels/AderSolver/DGMatrices.h"
20 #include "kernels/AderSolver/RiemannSolver.h"
21 #include "kernels/AderSolver/BufferSizes.h"
22 
23 #include "peano4/peano4.h"
24 
25 #include "tarch/NonCriticalAssertions.h"
26 #include "tarch/accelerator/accelerator.h"
27 #include "tarch/accelerator/Device.h"
28 #include "tarch/multicore/multicore.h"
29 #include "tarch/multicore/Core.h"
30 #include "tarch/multicore/BooleanSemaphore.h"
31 #include "tarch/multicore/Lock.h"
32 #include "tarch/logging/Log.h"
33 #include "tarch/timing/Measurement.h"
34 #include "tarch/timing/Watch.h"
35 
36 #include <cstring>
37 
38 #include <fenv.h>
39 #pragma float_control(precise, on)
40 #pragma STDC FENV_ACCESS ON
41 
43 
44 tarch::logging::Log _log("::");
45 
46 constexpr double TimeStamp = 0.0;
47 constexpr double TimeStepSize = 1e-6;
48 const tarch::la::Vector<DIMENSIONS,double> CellCenter = benchmarks::exahype2::kernelbenchmarks::DomainOffset + 0.5*benchmarks::exahype2::kernelbenchmarks::DomainSize;
49 const tarch::la::Vector<DIMENSIONS,double> CellSize = benchmarks::exahype2::kernelbenchmarks::DomainSize / 81.0;
50 // constexpr int HaloSize = 1;
51 
52 static_assert(Accuracy >= std::numeric_limits<double>::epsilon() || Accuracy == 0.0);
53 
55  = (AderSolver::Order + 1)
56  * (AderSolver::Order + 1)
57 #if DIMENSIONS == 3
58  * (AderSolver::Order + 1)
59 #endif
60  * (AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables);
61 
63  = 0;
64 // = AderSolver::Order
65 // * AderSolver::Order
66 // #if DIMENSIONS == 3
67 // * AderSolver::Order
68 // #endif
69 // * (AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables);
70 
72  = AderSolver::Order
73  * AderSolver::Order
74 #if DIMENSIONS == 3
75  * AderSolver::Order
76 #endif
77  ;
78 
79 // Check the outcomes of each kernel
80 double** validQ = nullptr;
81 double* validMaxEigenvalue = nullptr;
82 bool outcomeIsInvalid = false;
83 
84 tarch::timing::Measurement timingComputeKernel;
85 
94 void initInputData(double* Q) {
95  // for (int i = 0; i < NumberOfInputEntriesPerCell; i++) {
96  // Q[i] = std::sin(1.0 * i / (NumberOfInputEntriesPerCell) * tarch::la::PI);
97  // }
98 
99  int linearisedIndex = 0;
100  dfor(index, AderSolver::Order + 1) {
101  repositories::instanceOfAderSolver.initialCondition(
102  Q + linearisedIndex,
103  ::exahype2::dg::getQuadraturePoint(
104  CellCenter,
105  CellSize,
106  index,
107  repositories::instanceOfAderSolver.Order + 1,
108  kernels::AderSolver::Quadrature<double>::nodes
109  ),
110  CellSize,
111  //index,
112  true
113  );
114  linearisedIndex += AderSolver::NumberOfUnknowns + AderSolver::NumberOfAuxiliaryVariables;
115  }
116 
117 }
118 
128 void allocateAndStoreOutcome(const double* const* Q,
129  const double* const maxEigenvalue,
130  const int numberOfCells
131 ) {
132  if constexpr (Accuracy <= 0.0) return;
133  if (validQ == nullptr and validMaxEigenvalue == nullptr) {
134  validQ = new double*[numberOfCells];
135  for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
136  validQ[cellIndex] = new double[NumberOfOutputEntriesPerCell];
137  std::memcpy(validQ[cellIndex], Q[cellIndex], sizeof(double) * NumberOfOutputEntriesPerCell);
138  }
139  validMaxEigenvalue = new double[numberOfCells];
140  std::memcpy(validMaxEigenvalue, maxEigenvalue, sizeof(double) * numberOfCells);
141  logInfo("storeOutcome(...)", "bookmarked reference solution");
142  }
143 }
144 
145 void freeOutcome(const int numberOfCells) {
146  if constexpr (Accuracy <= 0.0) return;
147  for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
148  delete[] validQ[cellIndex];
149  }
150  delete[] validQ;
151  delete[] validMaxEigenvalue;
152  validQ = nullptr;
153  validMaxEigenvalue = nullptr;
154 }
155 
165  const double* const* Q,
166  const double* const maxEigenvalue,
167  const int numberOfCells
168 ) {
169  if constexpr (Accuracy <= 0.0) return;
170  int errors = 0;
171  double maxDifference = 0.0;
172 
173  std::cerr.precision(16);
174  for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
175  for (int i = 0; i < NumberOfOutputEntriesPerCell; i++) {
176  if (not tarch::la::equals(Q[cellIndex][i], validQ[cellIndex][i], Accuracy)) {
177  if (!errors) { // Only print once
178  logError("validateOutcome(...)",
179  std::fixed
180  << "cell " << cellIndex << ": "
181  << "Q[" << i << "]!=validQ[" << i << "] ("
182  << Q[cellIndex][i]
183  << "!="
184  << validQ[cellIndex][i]
185  << ")"
186  );
187  }
188  errors++;
189  maxDifference = std::max(maxDifference, std::abs(Q[cellIndex][i] - validQ[cellIndex][i]));
190  }
191  }
192 
193  if (not tarch::la::equals(maxEigenvalue[cellIndex], validMaxEigenvalue[cellIndex], Accuracy)) {
194  if (!errors) {
195  logError("validateOutcome(...)",
196  std::fixed
197  << "maxEigenvalue[" << cellIndex << "]!=validMaxEigenvalue[" << cellIndex << "] ("
198  << maxEigenvalue[cellIndex] << "!=" << validMaxEigenvalue[cellIndex]
199  << ")";
200  );
201  }
202  errors++;
203  maxDifference = std::max(maxDifference, std::abs(maxEigenvalue[cellIndex] - validMaxEigenvalue[cellIndex]));
204  }
205  }
206 
207  if (errors > 0) {
208  outcomeIsInvalid = true;
209  logError("validateOutcome(...)",
210  "max difference of outcome from all cells is "
211  << maxDifference
212  << " (admissible accuracy="
213  << Accuracy << ")"
214  << " for " << errors << " entries"
215  );
216  }
217 }
218 
228  const std::string& kernelIdentificator,
229  const tarch::timing::Measurement& timingKernelLaunch,
230  int numberOfCells
231 ) {
232  std::stringstream ss;
233  ss << "\n";
234  ss << kernelIdentificator << ":\n\t";
235  ss << timingComputeKernel.getValue() << " |\n\t";
236  ss << (timingComputeKernel.getValue() / numberOfCells ) << " |\n\t";
237  ss << timingComputeKernel.toString() << " |\n\t";
238  ss << timingKernelLaunch.getValue() << " |\n\t";
239  ss << (timingKernelLaunch.getValue() / numberOfCells );
240  ss << " |\n\t" << timingKernelLaunch.toString();
241  logInfo("reportRuntime()", ss.str());
242 }
243 
244 /*
245  * Executes one full run of all of the kernels
246 */
247 double runKernels(int device, exahype2::CellData<double, double>& cellData, int cellIndex, tarch::timing::Measurement& measurement){
248  double boundaryData[kernels::AderSolver::getBndFaceTotalSize()];
249  double* lQhbnd[2*DIMENSIONS] = {
250  boundaryData,
251  boundaryData + kernels::AderSolver::getBndFaceSize(),
252  boundaryData + 2*kernels::AderSolver::getBndFaceSize(),
253  boundaryData + 3*kernels::AderSolver::getBndFaceSize()
254 #if DIMENSIONS==3
255  ,boundaryData + 4*kernels::AderSolver::getBndFaceSize(),
256  boundaryData + 5*kernels::AderSolver::getBndFaceSize()
257 #endif
258  };
259 
260  double boundaryFlux[kernels::AderSolver::getBndFluxTotalSize()];
261  double* lFhbnd[2*DIMENSIONS] = {
262  boundaryFlux,
263  boundaryFlux + kernels::AderSolver::getBndFluxSize(),
264  boundaryFlux + 2*kernels::AderSolver::getBndFluxSize(),
265  boundaryFlux + 3*kernels::AderSolver::getBndFluxSize()
266 #if DIMENSIONS==3
267  ,boundaryFlux + 4*kernels::AderSolver::getBndFluxSize(),
268  boundaryFlux + 5*kernels::AderSolver::getBndFluxSize()
269 #endif
270  };
271 
272  tarch::timing::Watch watchKernelCompute("::runBenchmarks", "assessKernel(...)", false);
273 
274  int numberOfIterations = kernels::AderSolver::fusedSpaceTimePredictorVolumeIntegral<double, double, double>(
275  repositories::instanceOfAderSolver,
276  lQhbnd, lFhbnd, cellData.QIn[cellIndex],
278  );
279 
280  for (int d = 0; d < DIMENSIONS; d++) {
281  const int direction = d;
282 
283  //For the Riemann solver, we wrap the domain as though it had periodic boundary
284  // conditions and solve the problem between the left and right faces of the cell.
285  tarch::la::Vector<DIMENSIONS, double> faceCentre = CellCenter;
286 // faceCentre[d] -= 0.5 * CellSize[d];
287 
288  kernels::AderSolver::riemannSolver<double>(
289  repositories::instanceOfAderSolver,
290  lFhbnd[d+DIMENSIONS],
291  lFhbnd[d],
292  lQhbnd[d+DIMENSIONS],
293  lQhbnd[d],
294  TimeStamp,
295  TimeStepSize,
296  faceCentre,
297  CellSize,
298  direction,
299  false,
300  0
301  );
302 
303  const double inverseDxDirection = 1.0 / CellSize[d];
304  // Negative face
305  kernels::AderSolver::faceIntegral(
306  cellData.QIn[cellIndex],
307  lFhbnd[d],
308  direction,
309  0,
310  inverseDxDirection,
312  );
313 
314  // Positive face
315  faceCentre[d] += CellSize[d];
316  kernels::AderSolver::faceIntegral(
317  cellData.QIn[cellIndex],
318  lFhbnd[d+DIMENSIONS],
319  direction,
320  1,
321  inverseDxDirection,
323  );
324 
325  } // for d
326 
327  double maxEigenvalue = kernels::AderSolver::maxScaledEigenvalue(
328  repositories::instanceOfAderSolver,
329  cellData.QIn[cellIndex],
330  CellCenter,
331  CellSize,
332  TimeStamp,
334  );
335 
336  watchKernelCompute.stop();
337  measurement.setValue(watchKernelCompute.getCalendarTime());
338 
339  return maxEigenvalue;
340 
341 }
342 
348 void runBenchmarks(int numberOfCells) {
349  exahype2::CellData<double, double> cellData(numberOfCells);
350  for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
351  cellData.QIn[cellIndex] = tarch::allocateMemory<double>(NumberOfInputEntriesPerCell, tarch::MemoryLocation::Heap);
352  cellData.QOut[cellIndex] = nullptr; //tarch::allocateMemory<double>(NumberOfOutputEntriesPerCell, tarch::MemoryLocation::Heap);
353  cellData.cellCentre[cellIndex] = CellCenter;
354  cellData.cellSize[cellIndex] = CellSize;
355  cellData.maxEigenvalue[cellIndex] = 0.0;
356  initInputData(cellData.QIn[cellIndex]);
357  std::memset(cellData.QOut[cellIndex], 0.0, NumberOfOutputEntriesPerCell * sizeof(double));
358  }
359 
360  cellData.t = TimeStamp;
361  cellData.dt = TimeStepSize;
362 
363  auto assessKernel = [&](
364  std::function<double(int device, exahype2::CellData<double, double>& cellData, int cellIndex, tarch::timing::Measurement& measurement)> executeKernels,
365  const std::string& markerName,
366  const int device
367  ) -> void {
368  timingComputeKernel.erase();
369  tarch::timing::Measurement timingKernelLaunch;
370 
371  int sample = 0;
372  while (sample <= NumberOfSamples) {
373  // Reset output data
374  #if defined(WITH_OPENMP)
375  #pragma omp parallel num_threads(NumberOfLaunchingThreads)
376  #endif
377  for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
378  cellData.maxEigenvalue[cellIndex] = 0.0;
379  // std::memset(cellData.QOut[cellIndex], 0.0, NumberOfOutputEntriesPerCell * sizeof(double));
380 
381  // parallelFor(launchingThread, NumberOfLaunchingThreads) {
382  // for (int launchingThread = 0; launchingThread < NumberOfLaunchingThreads; launchingThread++) {
383  tarch::timing::Watch watchKernelLaunch("::runBenchmarks", "assessKernel(...)", false);
384  executeKernels(device, cellData, cellIndex, timingComputeKernel);
385  watchKernelLaunch.stop();
386  timingKernelLaunch.setValue(watchKernelLaunch.getCalendarTime());
387  // } endParallelFor
388  // }
389  // #if defined(WITH_OPENMP)
390  // }
391  // #endif
392  }
393  sample++;
394  }
395 
396  reportRuntime(markerName, timingKernelLaunch, numberOfCells);
397  allocateAndStoreOutcome(cellData.QOut, cellData.maxEigenvalue, numberOfCells);
398  validateOutcome(cellData.QOut, cellData.maxEigenvalue, numberOfCells);
399  };
400 
401  if constexpr (AssessHostKernels) {
402  assessKernel(runKernels,
403  "host, stateless, batched, AoS, serial",
404  tarch::accelerator::Device::DefaultDevice
405  );
406  } // AssessHostKernels
407 
408 // if constexpr (AssessDeviceKernels) {
409 // #if defined(WITH_GPU_OMP_TARGET)
410 
411 // #endif // WITH_GPU_OMP_TARGET
412 
413 // #if defined(WITH_GPU_SYCL)
414 
415 // #endif // WITH_GPU_SYCL
416 
417 // }
418 
419  for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
420  tarch::freeMemory(cellData.QIn[cellIndex], tarch::MemoryLocation::Heap);
421  tarch::freeMemory(cellData.QOut[cellIndex], tarch::MemoryLocation::Heap);
422  }
423 }
424 
425 int main(int argc, char** argv) {
426  peano4::init(&argc, &argv, benchmarks::exahype2::kernelbenchmarks::DomainOffset, benchmarks::exahype2::kernelbenchmarks::DomainSize);
427  repositories::initLogFilters();
428 
429  if constexpr (EnableFPE) {
430  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
431  }
432 
433  logInfo(
434  "main()",
435  "number of compute threads: "
436  << tarch::multicore::Core::getInstance().getNumberOfThreads()
437  );
438  logInfo(
439  "main()",
440  "number of threads launching compute kernels: "
441  << NumberOfLaunchingThreads
442  );
443  logInfo(
444  "main()",
445  "number of unknowns: "
446  << AderSolver::NumberOfUnknowns
447  );
448  logInfo(
449  "main()",
450  "number of auxiliary variables: "
451  << AderSolver::NumberOfAuxiliaryVariables
452  );
453  logInfo(
454  "main()",
455  "number of finite volumes per axis per cell: "
456  << AderSolver::Order
457  );
458  logInfo(
459  "main()",
460  "number of samples per measurement: "
461  << NumberOfSamples
462  );
463  logInfo(
464  "main()",
465  "floating-point exception handler enabled: "
466  << std::boolalpha << EnableFPE
467  );
468  logInfo(
469  "main()",
470  "performing accuracy checks with precision: "
471  << Accuracy
472  );
473 #if defined(WITH_GPU_SYCL)
474  logInfo(
475  "main()",
476  "set SYCL_DEVICE_FILTER=gpu or ONEAPI_DEVICE_SELECTOR=cuda:0 when using SYCL on the device"
477  );
478  logInfo(
479  "main()",
480  "set SYCL_PI_TRACE=2 in case of runtime errors"
481  );
482 #endif
483 
484 // #if defined(WITH_OPENMP)
485 // #pragma omp parallel
486 // {
487 // #pragma omp master
488 // {
489 // #endif
490  for (int i = 0; i < NumberOfCellsToStudy.size(); i++) {
491  logInfo("main()", "number of cells: " << NumberOfCellsToStudy[i]);
492  runBenchmarks(NumberOfCellsToStudy[i]);
493  freeOutcome(NumberOfCellsToStudy[i]);
494  }
495 // #if defined(WITH_OPENMP)
496 // }
497 // }
498 // #endif
499 
500  peano4::shutdown();
501 
502  if (outcomeIsInvalid) {
503  return EXIT_FAILURE; // Make sure the CI pipeline reports an error
504  }
505 
506  return EXIT_SUCCESS;
507 }
void freeOutcome(const int numberOfCells)
const tarch::la::Vector< DIMENSIONS, double > CellCenter
void runBenchmarks(int numberOfCells)
Run the benchmark for one particular number of cells.
int main(int argc, char **argv)
double runKernels(int device, exahype2::CellData< double, double > &cellData, int cellIndex, tarch::timing::Measurement &measurement)
void allocateAndStoreOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Allocates and stores outcome of one compute kernel.
constexpr double TimeStamp
void validateOutcome(const double *const *Q, const double *const maxEigenvalue, const int numberOfCells)
Validate data against pre-stored simulation outcome.
double ** validQ
bool outcomeIsInvalid
tarch::timing::Measurement timingComputeKernel
double * validMaxEigenvalue
tarch::logging::Log _log("::")
constexpr int NumberOfFiniteVolumesPerCell
constexpr double TimeStepSize
const tarch::la::Vector< DIMENSIONS, double > CellSize
static double maxEigenvalue(const double *const Q, int normal, const double CCZ4e, const double CCZ4ds, const double CCZ4GLMc, const double CCZ4GLMd)
constexpr int NumberOfInputEntriesPerCell
Definition: Utils.h:14
void reportRuntime(const std::string &kernelIdentificator, const tarch::timing::Measurement &timingComputeKernel, const tarch::timing::Measurement &timingKernelLaunch, int numberOfCells, int numberOfThreads, tarch::logging::Log _log)
Reports the runtime and throughput of the benchmarks.
Definition: Utils.h:68
constexpr int NumberOfOutputEntriesPerCell
Definition: Utils.h:20
void initInputData(SolverPrecision *Q, const tarch::la::Vector< DIMENSIONS, double > CellCenter, const tarch::la::Vector< DIMENSIONS, double > CellSize)
Set input data.
Definition: Utils.h:30