The std::for_each algorithm runs sequentially. To run it in parallel, use the Parallel STL alternative with the following execution policy (or one of the following execution policies): %polices%
...
std::for_each(std::execution::%policy%, a, a+n, [](float elem)
...
#include "pstl/execution"
#include "pstl/algorithm"
void foo(float* a, int n)
{
std::for_each(std::execution::%policy%, a, a+n, [](float elem)
{
...
});
}