dc/d0e/Variant2_8cpp_source.html

 #include "CellFaceData.h"

 #include "exahype2/CellData.h"

 #include "kernels/AderSolver/BufferSizes.h"

 #include "kernels/AderSolver/FaceIntegral.h"

 #include "kernels/AderSolver/FusedSpaceTimePredictorVolumeIntegral.h"

 #include "kernels/AderSolver/MaxScaledEigenvalue.h"

 #include "kernels/AderSolver/RiemannSolver.h"

 #include "repositories/SolverRepository.h"

 #include "Utils.h"

 #include "Variants.h"


 #include <algorithm>

 #include <cstring>


 // Definition of the log device declared for variant2.
 // NOTE(review): the label passed here reads "variants3::" although this file
 // implements variant2 — confirm whether the label is intentional.
 tarch::logging::Log variant2::_log("variants3::");


 // Makes the benchmark helpers and constants usable unqualified below
 // (e.g. initInputData, reportRuntime, NumberOfInputEntriesPerCell).
 using namespace benchmarks::exahype2::kernelbenchmarks;


 /**
  * Runs the fused space-time predictor / volume integral once for a single
  * cell and stores the elapsed calendar time in the given measurement.
  *
  * The faces of a cell are assumed to be stored one after another in typical
  * Peano order, e.g. left, lower, right, upper.
  *
  * @param myCellData   Cell container; QIn, cellCentre and cellSize are read at
  *                     index cellId, t and dt are read as single values.
  * @param myFaceData   Face container; the per-face pointers handed to the
  *                     kernel are taken from QIn and QOut at index faceId.
  * @param cellId       Index into myCellData.
  * @param faceId       Index into myFaceData.
  * @param measurement  Receives the calendar time taken around the kernel call.
  */
 inline void initialTask(
   exahype2::CellData<SolverPrecision, SolverPrecision>*     myCellData,
   exahype2::CellFaceData<SolverPrecision, SolverPrecision>* myFaceData,
   const int                               cellId,
   const int                               faceId,
   tarch::timing::Measurement&             measurement
 ) {
   const auto faceEntries = kernels::AderSolver::getBndFaceSize();
   const auto fluxEntries = kernels::AderSolver::getBndFluxSize();

   // Slots [0, DIMENSIONS) point at the second half of the buffers of faces
   // 0..DIMENSIONS-1, slots [DIMENSIONS, 2*DIMENSIONS) at the first half of
   // the buffers of faces DIMENSIONS..2*DIMENSIONS-1.
   SolverPrecision* predictorFaces[2 * DIMENSIONS];
   SolverPrecision* fluxFaces[2 * DIMENSIONS];
   for (int axis = 0; axis < DIMENSIONS; ++axis) {
     const int opposite = axis + DIMENSIONS;

     predictorFaces[axis]     = myFaceData->QIn[faceId][axis] + faceEntries;
     predictorFaces[opposite] = myFaceData->QIn[faceId][opposite];

     fluxFaces[axis]     = myFaceData->QOut[faceId][axis] + fluxEntries;
     fluxFaces[opposite] = myFaceData->QOut[faceId][opposite];
   }

   tarch::timing::Watch stopwatch("::runBenchmarks", "assessKernel(...)", false);

   // The value returned by the kernel is not used by this task.
   static_cast<void>(
     kernels::AderSolver::fusedSpaceTimePredictorVolumeIntegral<SolverPrecision, SolverPrecision, SolverPrecision>(
       repositories::instanceOfAderSolver,
       predictorFaces,
       fluxFaces,
       myCellData->QIn[cellId],
       myCellData->cellCentre[cellId],
       myCellData->cellSize[cellId],
       myCellData->t,
       myCellData->dt
     )
   );

   stopwatch.stop();
   measurement.setValue(stopwatch.getCalendarTime());
 }


 /**
  * Runs the benchmarked kernel sequence for one cell and stores the elapsed
  * calendar time in the given measurement.
  *
  * For every coordinate direction d the sequence is:
  *   1. Riemann solve on face d,
  *   2. face integral with orientation 0 ("negative face"),
  *   3. Riemann solve on face d + DIMENSIONS,
  *   4. face integral with orientation 1 ("positive face").
  * Afterwards the maximum scaled eigenvalue is written to
  * myCellData->maxEigenvalue[cellId] and the fused space-time predictor /
  * volume integral is evaluated.
  *
  * The faces of a cell are assumed to be stored one after another in typical
  * Peano order, e.g. left, lower, right, upper.
  *
  * NOTE(review): the caller in this file invokes this routine from an OpenMP
  * parallel loop with one shared measurement object; confirm that
  * tarch::timing::Measurement::setValue() may be called concurrently.
  *
  * @param myCellData   Cell container; QIn, cellCentre and cellSize are used at
  *                     index cellId, t and dt as single values, and
  *                     maxEigenvalue[cellId] is overwritten.
  * @param myFaceData   Face container; QIn, QOut, t, dt, cellCentre and
  *                     cellSize are used at index faceId.
  * @param cellId       Index into myCellData.
  * @param faceId       Index into myFaceData.
  * @param measurement  Receives the calendar time of the whole kernel sequence.
  */
 void runKernels(
   exahype2::CellData<SolverPrecision, SolverPrecision>*     myCellData,
   exahype2::CellFaceData<SolverPrecision, SolverPrecision>* myFaceData,
   const int                               cellId,
   const int                               faceId,
   tarch::timing::Measurement&             measurement
 ) {
   const auto faceSize = kernels::AderSolver::getBndFaceSize();
   const auto fluxSize = kernels::AderSolver::getBndFluxSize();

   // Slots [0, DIMENSIONS) point at the second half of the buffers of faces
   // 0..DIMENSIONS-1, slots [DIMENSIONS, 2*DIMENSIONS) at the first half of
   // the buffers of faces DIMENSIONS..2*DIMENSIONS-1.
   SolverPrecision* lQhbnd[2 * DIMENSIONS];
   SolverPrecision* lFhbnd[2 * DIMENSIONS];
   for (int d = 0; d < DIMENSIONS; d++) {
     lQhbnd[d]              = &myFaceData->QIn[faceId][d][faceSize];
     lQhbnd[d + DIMENSIONS] = &myFaceData->QIn[faceId][d + DIMENSIONS][0];
     lFhbnd[d]              = &myFaceData->QOut[faceId][d][fluxSize];
     lFhbnd[d + DIMENSIONS] = &myFaceData->QOut[faceId][d + DIMENSIONS][0];
   }

   tarch::timing::Watch watchKernelCompute("::runBenchmarks", "assessKernel(...)", false);

   for (int d = 0; d < DIMENSIONS; d++) {
     const int direction = d;

     // Both operands of each average below are the same face entry, so every
     // average evaluates to that entry itself.
     // NOTE(review): this solve receives the unshifted cell centre, whereas
     // the solve for face d + DIMENSIONS receives a centre shifted by half a
     // cell size — confirm which position is intended here.
     kernels::AderSolver::riemannSolver<SolverPrecision>(
       repositories::instanceOfAderSolver,
       &myFaceData->QOut[faceId][d][0],
       &myFaceData->QOut[faceId][d][fluxSize],
       &myFaceData->QIn[faceId][d][0],
       &myFaceData->QIn[faceId][d][faceSize],
       0.5 * (myFaceData->t[faceId] + myFaceData->t[faceId]),
       0.5 * (myFaceData->dt[faceId] + myFaceData->dt[faceId]),
       0.5 * (myFaceData->cellCentre[faceId] + myFaceData->cellCentre[faceId]),
       0.5 * (myFaceData->cellSize[faceId] + myFaceData->cellSize[faceId]),
       direction,
       false,
       0
     );

     const double inverseDxDirection = 1.0 / myCellData->cellSize[cellId][d];

     // Negative face: uses the second half of the flux buffer of face d.
     kernels::AderSolver::faceIntegral(
       myCellData->QIn[cellId],
       &myFaceData->QOut[faceId][d][fluxSize],
       direction,
       0,
       inverseDxDirection,
       myCellData->dt
     );

     // Centre handed to the second Riemann solve: cell centre moved by half a
     // cell size along direction d.
     tarch::la::Vector<DIMENSIONS, double> faceCentre = myFaceData->cellCentre[faceId];
     faceCentre[d] += 0.5 * myFaceData->cellSize[faceId][d];

     kernels::AderSolver::riemannSolver<SolverPrecision>(
       repositories::instanceOfAderSolver,
       &myFaceData->QOut[faceId][d + DIMENSIONS][0],
       &myFaceData->QOut[faceId][d + DIMENSIONS][fluxSize],
       &myFaceData->QIn[faceId][d + DIMENSIONS][0],
       &myFaceData->QIn[faceId][d + DIMENSIONS][faceSize],
       0.5 * (myFaceData->t[faceId] + myFaceData->t[faceId]),
       0.5 * (myFaceData->dt[faceId] + myFaceData->dt[faceId]),
       faceCentre,
       myFaceData->cellSize[faceId],
       direction,
       false,
       0
     );

     // Positive face: uses the first half of the flux buffer of face
     // d + DIMENSIONS.
     kernels::AderSolver::faceIntegral(
       myCellData->QIn[cellId],
       &myFaceData->QOut[faceId][d + DIMENSIONS][0],
       direction,
       1,
       inverseDxDirection,
       myCellData->dt
     );
   } // for d

   myCellData->maxEigenvalue[cellId] = kernels::AderSolver::maxScaledEigenvalue(
     repositories::instanceOfAderSolver,
     myCellData->QIn[cellId],
     myCellData->cellCentre[cellId],
     myCellData->cellSize[cellId],
     myCellData->t,
     myCellData->dt
   );

   // The value returned by the predictor is not used by the benchmark.
   static_cast<void>(
     kernels::AderSolver::fusedSpaceTimePredictorVolumeIntegral<SolverPrecision, SolverPrecision, SolverPrecision>(
       repositories::instanceOfAderSolver,
       lQhbnd,
       lFhbnd,
       myCellData->QIn[cellId],
       myCellData->cellCentre[cellId],
       myCellData->cellSize[cellId],
       myCellData->t,
       myCellData->dt
     )
   );

   watchKernelCompute.stop();
   measurement.setValue(watchKernelCompute.getCalendarTime());
 }


 /**
  * Benchmark driver of variant 2.
  *
  * Allocates and initialises numberOfCells cells together with their
  * 2 * DIMENSIONS face buffers, runs initialTask() once per cell, and then
  * repeatedly (a) copies face data between opposite faces of each cell and
  * (b) times runKernels() over all cells. The timings are passed to
  * reportRuntime(). With WITH_OPENMP defined, the measurement is repeated for
  * every entry of NumberOfLaunchingThreads and the cell loops run in parallel.
  *
  * @param numberOfCells  Number of cells (and face sets) to allocate and process.
  * @param timeStamp      Value stored as t in the cell and face data.
  * @param timeStepSize   Value stored as dt in the cell and face data.
  * @param cellCenter     Centre assigned to every cell and face set.
  * @param cellSize       Size assigned to every cell and face set.
  */
 void variant2::runBenchmarks(
   int                                         numberOfCells,
   double                                      timeStamp,
   double                                      timeStepSize,
   const tarch::la::Vector<DIMENSIONS, double> cellCenter,
   const tarch::la::Vector<DIMENSIONS, double> cellSize
 ) {
   tarch::timing::Measurement timingComputeKernel;

   exahype2::CellData<SolverPrecision, SolverPrecision>     cellData(numberOfCells);
   exahype2::CellFaceData<SolverPrecision, SolverPrecision> cellFaceData(numberOfCells);

   const auto faceSize = kernels::AderSolver::getBndFaceSize();
   const auto fluxSize = kernels::AderSolver::getBndFluxSize();

   // t and dt are single values of the cell container, not per-cell entries.
   cellData.t  = timeStamp;
   cellData.dt = timeStepSize;

   for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
     cellData.QIn[cellIndex] = tarch::allocateMemory<SolverPrecision>(
       NumberOfInputEntriesPerCell,
       tarch::MemoryLocation::Heap
     );

     // No output buffer is allocated: QOut of the cell data is never handed to
     // any kernel in this file. It must therefore not be passed to
     // std::memset either, as clearing through a null pointer is undefined
     // behaviour.
     cellData.QOut[cellIndex]          = nullptr;
     cellData.cellCentre[cellIndex]    = cellCenter;
     cellData.cellSize[cellIndex]      = cellSize;
     cellData.maxEigenvalue[cellIndex] = 0.0;

     initInputData(cellData.QIn[cellIndex], cellCenter, cellSize);

     // Every face buffer holds two blocks of faceSize (resp. fluxSize) entries.
     for (int i = 0; i < 2 * DIMENSIONS; i++) {
       cellFaceData.QIn[cellIndex][i] = tarch::allocateMemory<SolverPrecision>(
         2 * faceSize,
         tarch::MemoryLocation::Heap
       );
       cellFaceData.QOut[cellIndex][i] = tarch::allocateMemory<SolverPrecision>(
         2 * fluxSize,
         tarch::MemoryLocation::Heap
       );
     }

     cellFaceData.t[cellIndex]          = timeStamp;
     cellFaceData.dt[cellIndex]         = timeStepSize;
     cellFaceData.cellCentre[cellIndex] = cellCenter;
     cellFaceData.cellSize[cellIndex]   = cellSize;
   }

   int numberOfThreads = 1;

   #if defined(WITH_OPENMP)
   for (int threadIndex = 0; threadIndex < static_cast<int>(NumberOfLaunchingThreads.size()); threadIndex++) {
     numberOfThreads = NumberOfLaunchingThreads[threadIndex];
   #endif

     // Initial task
     for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
       initialTask(&cellData, &cellFaceData, cellIndex, cellIndex, timingComputeKernel);
     }

     // Discard the timings recorded by the initial task.
     timingComputeKernel.erase();
     tarch::timing::Measurement timingKernelLaunch;

     // Inclusive upper bound: the body is executed NumberOfSamples + 1 times.
     for (int sample = 0; sample <= NumberOfSamples; sample++) {

       #if defined(WITH_OPENMP)
       #pragma omp parallel for num_threads(NumberOfLaunchingThreads[threadIndex])
       #endif
       for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
         // Copy data from one side of each face to the other.
         // Here we just copy data from one side of a given cell's face to the
         // opposite face of the same cell.
         for (int d = 0; d < DIMENSIONS; d++) {
           // Copy inner negative face into outer positive face
           std::copy_n(
             &cellFaceData.QIn[cellIndex][d][faceSize],
             faceSize,
             &cellFaceData.QIn[cellIndex][d + DIMENSIONS][faceSize]
           );
           std::copy_n(
             &cellFaceData.QOut[cellIndex][d][fluxSize],
             fluxSize,
             &cellFaceData.QOut[cellIndex][d + DIMENSIONS][fluxSize]
           );

           // Copy inner positive face into outer negative face
           std::copy_n(
             &cellFaceData.QIn[cellIndex][d + DIMENSIONS][0],
             faceSize,
             &cellFaceData.QIn[cellIndex][d][0]
           );
           std::copy_n(
             &cellFaceData.QOut[cellIndex][d + DIMENSIONS][0],
             fluxSize,
             &cellFaceData.QOut[cellIndex][d][0]
           );
         }
       }

       tarch::timing::Watch watchKernelLaunch("::runBenchmarks", "assessKernel(...)", false);

       // NOTE(review): with OpenMP enabled all threads pass the same
       // timingComputeKernel object to runKernels(); confirm that
       // tarch::timing::Measurement::setValue() may be called concurrently.
       #if defined(WITH_OPENMP)
       #pragma omp parallel for num_threads(NumberOfLaunchingThreads[threadIndex])
       #endif
       for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
         cellData.maxEigenvalue[cellIndex] = 0.0;
         runKernels(&cellData, &cellFaceData, cellIndex, cellIndex, timingComputeKernel);
       }

       watchKernelLaunch.stop();
       timingKernelLaunch.setValue(watchKernelLaunch.getCalendarTime());
     }

     reportRuntime("variant 2", timingComputeKernel, timingKernelLaunch, numberOfCells, numberOfThreads, _log);
     // allocateAndStoreOutcome(cellData.QOut, cellData.maxEigenvalue, numberOfCells);
     // validateOutcome(cellData.QOut, cellData.maxEigenvalue, numberOfCells);

   #if defined(WITH_OPENMP)
   }//threadIndex
   #endif

   for (int cellIndex = 0; cellIndex < numberOfCells; cellIndex++) {
     tarch::freeMemory(cellData.QIn[cellIndex], tarch::MemoryLocation::Heap);
     // QOut is still the nullptr assigned during setup.
     tarch::freeMemory(cellData.QOut[cellIndex], tarch::MemoryLocation::Heap);

     for (int i = 0; i < 2 * DIMENSIONS; i++) {
       tarch::freeMemory(cellFaceData.QIn[cellIndex][i], tarch::MemoryLocation::Heap);
       tarch::freeMemory(cellFaceData.QOut[cellIndex][i], tarch::MemoryLocation::Heap);
     }
   }
 }

CellFaceData.h

Utils.h

runKernels
void runKernels(exahype2::CellData< SolverPrecision, SolverPrecision > *myCellData, exahype2::CellFaceData< SolverPrecision, SolverPrecision > *myFaceData, const int cellId, const int faceId, tarch::timing::Measurement &measurement)
Definition: Variant2.cpp:102

initialTask
void initialTask(exahype2::CellData< SolverPrecision, SolverPrecision > *myCellData, exahype2::CellFaceData< SolverPrecision, SolverPrecision > *myFaceData, const int cellId, const int faceId, tarch::timing::Measurement &measurement)
Definition: Variant2.cpp:46

Variants.h

timeStamp
constexpr double timeStamp
Definition: KernelBenchmarks-main.cpp:46

cellCenter
const tarch::la::Vector< DIMENSIONS, double > cellCenter
Definition: KernelBenchmarks-main.cpp:49

cellSize
const tarch::la::Vector< DIMENSIONS, double > cellSize
Definition: KernelBenchmarks-main.cpp:51

timeStepSize
constexpr double timeStepSize
Definition: KernelBenchmarks-main.cpp:47

timingComputeKernel
tarch::timing::Measurement timingComputeKernel
Definition: KernelBenchmarks-main.cpp:84

benchmarks::exahype2::kernelbenchmarks
Definition: Utils.h:12

benchmarks::exahype2::kernelbenchmarks::NumberOfInputEntriesPerCell
constexpr int NumberOfInputEntriesPerCell
Definition: Utils.h:14

benchmarks::exahype2::kernelbenchmarks::reportRuntime
void reportRuntime(const std::string &kernelIdentificator, const tarch::timing::Measurement &timingComputeKernel, const tarch::timing::Measurement &timingKernelLaunch, int numberOfCells, int numberOfThreads, tarch::logging::Log _log)
Reports the runtime and throughput of the benchmarks.
Definition: Utils.h:68

benchmarks::exahype2::kernelbenchmarks::NumberOfOutputEntriesPerCell
constexpr int NumberOfOutputEntriesPerCell
Definition: Utils.h:20

benchmarks::exahype2::kernelbenchmarks::initInputData
void initInputData(SolverPrecision *Q, const tarch::la::Vector< DIMENSIONS, double > CellCenter, const tarch::la::Vector< DIMENSIONS, double > CellSize)
Set input data.
Definition: Utils.h:30

variant1::_log
tarch::logging::Log _log
This is variant 1 of the fused kernels.

variant2::runBenchmarks
void runBenchmarks(int numberOfCells, double timeStamp, double timeStepSize, const tarch::la::Vector< DIMENSIONS, double > cellCenter, const tarch::la::Vector< DIMENSIONS, double > cellSize)
Definition: Variant2.cpp:229

variant2::_log
tarch::logging::Log _log
This is variant 2 of the fused kernels.

exahype2::CellFaceData
Represents the faces of one cell, with a total of 2*Dim faces per cell For ADER QIn will contain the ...
Definition: CellFaceData.h:20

exahype2::CellFaceData::cellSize
tarch::la::Vector< DIMENSIONS, double > * cellSize
Definition: CellFaceData.h:27

exahype2::CellFaceData::cellCentre
tarch::la::Vector< DIMENSIONS, double > * cellCentre
Definition: CellFaceData.h:26

exahype2::CellFaceData::dt
double * dt
Definition: CellFaceData.h:30

exahype2::CellFaceData::QOut
outType *(* QOut)[2 *DIMENSIONS]
Out values.
Definition: CellFaceData.h:60

exahype2::CellFaceData::QIn
inType *(* QIn)[2 *DIMENSIONS]
QIn may not be const, as some kernels delete it straightaway once the input data has been handled.
Definition: CellFaceData.h:25

exahype2::CellFaceData::t
double * t
Definition: CellFaceData.h:29