Locking objects slows loop execution. To fix: Rewrite the code without OpenMP lock functions.
int A[n];
list<int> L;
omp_lock_t lock_obj;
...
omp_set_lock(&(lock_obj));
L.insert(L.begin(), A[i]);
omp_unset_lock(&(lock_obj));
...
Allocating separate arrays for each thread and then merging them after a parallel section may improve speed (but consume more memory).
int A[n];
list<int> L;
...
omp_lock_t lock_obj;
omp_init_lock(&lock_obj);
// "Before" example: a work-shared loop in which every qualifying iteration
// updates the single shared list L under one OpenMP lock (lock_obj).
// Because all threads contend for the same lock, the insertions are
// serialized; this is the pattern the recommendation advises removing.
// NOTE(review): n is referenced inside the construct but does not appear in
// a data-sharing clause; with default(none) this only compiles if n is
// predetermined (e.g. a constant) — confirm against the elided declarations.
#pragma omp parallel for shared(L, A, lock_obj) default(none)
for (int i = 0; i < n; ++i)
{
// A[i] calculation
...
if (A[i]<1.0)
{
// Critical section: acquire the lock, prepend A[i] to the shared list,
// then release the lock so another thread can insert.
omp_set_lock(&(lock_obj));
L.insert(L.begin(), A[i]);
omp_unset_lock(&(lock_obj));
}
}
omp_destroy_lock(&lock_obj);

int A[n];
list<int> L;
omp_set_num_threads(nthreads_all);
...
vector<list<int>> L_by_thread(nthreads_all); // separate list for each thread
...
L_by_thread[k].insert(L_by_thread[k].begin(), A[i]);
...

int A[n];
list<int> L;
omp_set_num_threads(nthreads_all);
...
vector<list<int>> L_by_thread(nthreads_all); // separate list for each thread
// "After" example: the same loop without any lock. Each thread prepends
// qualifying values to its own list, L_by_thread[k], so no two threads
// write to the same container inside the region.
// NOTE(review): n is referenced inside the region but is not listed in a
// data-sharing clause under default(none) — confirm it is predetermined
// (e.g. a constant) in the elided declarations.
// NOTE(review): indexing L_by_thread by thread number assumes the team has
// at most nthreads_all threads (the vector's size) — presumably guaranteed
// by the preceding omp_set_num_threads(nthreads_all) call; verify.
#pragma omp parallel shared(L, L_by_thread, A) default(none)
{
// k is the calling thread's number within the team; it selects this
// thread's private slot in L_by_thread.
int k = omp_get_thread_num();
// nowait drops the implicit barrier at the end of the work-shared loop.
#pragma omp for nowait
for (int i = 0; i < n; ++i)
{
// A[i] calculation
...
if (A[i]<1.0)
{
// Lock-free insert: only thread k touches L_by_thread[k].
L_by_thread[k].insert(L_by_thread[k].begin(), A[i]);
}
}
}
// merge data into single list
for (int k = 0; k < L_by_thread.size(); k++)
{
L.splice(L.end(), L_by_thread[k]);
}
- Calling Functions on the CPU to Modify the Coprocessor's Execution Environment; Lock Routines section in OpenMP Run-time Library Routines; omp for, omp parallel sections in OpenMP* Pragmas Summary
- Getting Started with Intel Compiler Pragmas and Directives and Vectorization Resources for Intel® Advisor Users