Remove OpenMP lock functions

Setting and unsetting OpenMP locks inside a loop slows loop execution, because threads have to wait for one another to release the lock. To fix: Rewrite the code without OpenMP lock functions.

Example (original code)

// Excerpt of the original pattern: the insertion into the list L is
// bracketed by omp_set_lock / omp_unset_lock calls on lock_obj.
int A[n];
list<int> L;
omp_lock_t lock_obj;
...
// Acquire the lock, insert A[i] at the front of L, then release the lock.
omp_set_lock(&(lock_obj));
L.insert(L.begin(), A[i]);
omp_unset_lock(&(lock_obj));
...
Allocating a separate container for each thread and then merging the containers after the parallel region may improve speed (but consumes more memory).
// Original version: all threads insert into the single list L, so every
// insertion is guarded by the OpenMP lock lock_obj.
int A[n];
list<int> L;
...
omp_lock_t lock_obj;
// Initialize the lock before the parallel loop; it is destroyed after the loop.
omp_init_lock(&lock_obj);
// L, A and lock_obj are shared by the threads of the parallel loop.
#pragma omp parallel for shared(L, A, lock_obj) default(none)
for (int i = 0; i < n; ++i)
{
    // A[i] calculation (elided in this example)
    ...
    if (A[i]<1.0)
    {
        // Hold the lock while modifying the shared list: omp_set_lock waits
        // until the lock is available, so these insertions run one at a time.
        omp_set_lock(&(lock_obj));
        L.insert(L.begin(), A[i]);
        omp_unset_lock(&(lock_obj));
    }
}
omp_destroy_lock(&lock_obj);

Example (revised code)

// Excerpt of the revised pattern: the insertion goes into one list of
// L_by_thread, selected by index k, and no lock functions are called.
int A[n];
list<int> L;
omp_set_num_threads(nthreads_all);
...
vector<list<int>> L_by_thread(nthreads_all); // separate list for each thread (nthreads_all lists in total)
...
// Insert A[i] at the front of list number k.
L_by_thread[k].insert(L_by_thread[k].begin(), A[i]);
...
// Revised version: each thread inserts into its own list in L_by_thread,
// so no lock is needed; the per-thread lists are merged into L afterwards.
int A[n];
list<int> L;
// Set the number of threads to nthreads_all; L_by_thread below is sized
// with the same value.
omp_set_num_threads(nthreads_all);
...
vector<list<int>> L_by_thread(nthreads_all); // separate list for each thread
#pragma omp parallel shared(L, L_by_thread, A) default(none)
{
    // k is declared inside the parallel region, so every thread has its own
    // copy; it holds the calling thread's number and indexes L_by_thread.
    int k = omp_get_thread_num();
    // nowait: threads do not wait for each other at the end of the loop.
    #pragma omp for nowait
    for (int i = 0; i < n; ++i)
    {
        // A[i] calculation (elided in this example)
        ...
        if (A[i]<1.0)
        {
            // Insert at the front of this thread's own list; no lock is taken.
            L_by_thread[k].insert(L_by_thread[k].begin(), A[i]);
        }
   }
}

// merge data into single list: after the parallel region, the contents of
// each per-thread list are spliced onto the end of L, in index order
for (int k = 0; k < L_by_thread.size(); k++)
{
  L.splice(L.end(), L_by_thread[k]);
}

Read More