The cost of rewriting code to organize data as a structure of arrays (SoA) instead of an array of structures (AoS) may outweigh the benefit. To fix: use Intel SIMD Data Layout Templates (Intel SDLT), introduced in version 16.1 of the Intel compiler, to mitigate the cost. Intel SDLT is a C++11 template library that may reduce code rewrites to just a few lines.
...
/// One record of the example dataset: four single-precision values.
/// In the calculation step, Kx/Ky/Kz are multiplied by the x/y/z inputs and
/// PhiMag scales the resulting cosine and sine terms.
struct kValues {
    float Kx;      ///< Multiplied by x[indexX] in the calculation step.
    float Ky;      ///< Multiplied by y[indexX] in the calculation step.
    float Kz;      ///< Multiplied by z[indexX] in the calculation step.
    float PhiMag;  ///< Weight applied to the cosine and sine terms.
};
// Array-of-structures (AoS) storage: `count` kValues records, each element holding
// all four fields of one record side by side.
std::vector<kValues> dataset(count);
...
Using SDLT instead of STL containers may improve the memory access pattern for more efficient vector processing.
/// One record of the example dataset: four single-precision values.
/// In the calculation step, Kx/Ky/Kz are multiplied by the x/y/z inputs and
/// PhiMag scales the resulting cosine and sine terms.
struct kValues {
    float Kx;      ///< Multiplied by x[indexX] in the calculation step.
    float Ky;      ///< Multiplied by y[indexX] in the calculation step.
    float Kz;      ///< Multiplied by z[indexX] in the calculation step.
    float PhiMag;  ///< Weight applied to the cosine and sine terms.
};
// Array-of-structures (AoS) storage: `count` kValues records.
std::vector<kValues> dataset(count);
// Initialization step: copy the four separate input arrays into the AoS container,
// one record per index.
for (int i = 0; i < count; ++i) {
    // Subscript the container object `dataset`; `kValues` is the element type
    // and cannot be indexed.
    dataset[i].Kx = kx[i];
    dataset[i].Ky = ky[i];
    dataset[i].Kz = kz[i];
    dataset[i].PhiMag = phiMag[i];
}
// Calculation step
for (indexK = 0; indexK < numK; indexK++) {
expArg = PIx2 * (kValues[indexK].Kx * x[indexX] +
kValues[indexK].Ky * y[indexX] +
kValues[indexK].Kz * z[indexX]);
cosArg = cosf(expArg);
sinArg = sinf(expArg);
float phi = kValues[indexK].PhiMag;
QrSum += phi * cosArg;
QiSum += phi * sinArg;
}#include <sdlt/sdlt.h>
...
// Describe kValues and its listed data members to SDLT so that SDLT containers
// can store kValues elements.
SDLT_PRIMITIVE(kValues, Kx, Ky, Kz, PhiMag)
// SDLT container used in place of std::vector<kValues>; holds `count` elements.
sdlt::soa1d_container<kValues> dataset(count);
...
#include <sdlt/sdlt.h>
/// One record of the example dataset: four single-precision values.
/// In the calculation step, Kx/Ky/Kz are multiplied by the x/y/z inputs and
/// PhiMag scales the resulting cosine and sine terms.
struct kValues {
    float Kx;      ///< Multiplied by x[indexX] in the calculation step.
    float Ky;      ///< Multiplied by y[indexX] in the calculation step.
    float Kz;      ///< Multiplied by z[indexX] in the calculation step.
    float PhiMag;  ///< Weight applied to the cosine and sine terms.
};
// Describe kValues and its listed data members to SDLT so that SDLT containers
// can store kValues elements.
SDLT_PRIMITIVE(kValues, Kx, Ky, Kz, PhiMag)
// SDLT container used in place of std::vector<kValues>; holds `count` elements.
sdlt::soa1d_container<kValues> dataset(count);
// Initialization step
auto kValues = dataset.access();
for (k = 0; k < numK; k++) {
kValues [k].Kx() = kx[k];
kValues [k].Ky() = ky[k];
kValues [k].Kz() = kz[k];
kValues [k].PhiMag() = phiMag[k];
}
// Calculation step
auto kVals = dataset.const_access();
#pragma omp simd private(expArg, cosArg, sinArg) reduction(+:QrSum, QiSum)
for (indexK = 0; indexK < numK; indexK++) {
expArg = PIx2 * (kVals[indexK].Kx() * x[indexX] +
kVals[indexK].Ky() * y[indexX] +
kVals[indexK].Kz() * z[indexX]);
cosArg = cosf(expArg);
sinArg = sinf(expArg);
float phi = kVals[indexK].PhiMag();
QrSum += phi * cosArg;
QiSum += phi * sinArg;
}