Updating maximum value from multiple threads

Is there a way to update the maximum of multiple threads using atomic operations?

Illustrative example:

std::vector<float> coord_max(128);
#pragma omp parallel for
for (int i = 0; i < limit; ++i) {
    int j = get_coord(i); // can return any value in range [0,128)
    float x = compute_value(j, i);
    #pragma omp critical (coord_max_update)
    coord_max[j] = std::max(coord_max[j], x);
}

In the above case, the critical section synchronizes access to the entire vector, while we only need to synchronize access to each of the values ​​independently.

+5
source share
4 answers

Following the suggestion in the comment, I found a solution that does not require locking, and instead uses the comparison and exchange function found in std :: atomic / boost :: atomic. I am limited to C ++ 03, so in this case I would use boost :: atomic.

BOOST_STATIC_ASSERT(sizeof(int) == sizeof(float));
union FloatPun { float f; int i; };

std::vector< boost::atomic<int> > coord_max(128);
#pragma omp parallel for
for (int i = 0; i < limit; ++i) {
    int j = get_coord(i);
    FloatPun x, maxval;
    x.f = compute_value(j, i);

    maxval.i = coord_max[j].load(boost::memory_order_relaxed);
    do {
        if (maxval.f >= x.f) break;
    } while (!coord_max[j].compare_exchange_weak(maxval.i, x.i,
        boost::memory_order_relaxed));
}

, float ints, , . 100% , "", , , .

+5

, , std::vector<std::mutex> ( boost::mutex) 128, j - ?

, - :

std::vector<float> coord_max(128);
std::vector<std::mutex> coord_mutex(128); 
#pragma omp parallel for
for (int i = 0; i < limit; ++i) {
    int j = get_coord(i); // can return any value in range [0,128)
    float x = compute_value(j, i);
    std::scoped_lock lock(coord_mutex[j]);
    coord_max[j] = std::max(coord_max[j], x);     
}

, # 3:

std::vector<float> coord_max(128);
const int parallelism = 16;
std::vector<std::mutex> coord_mutex(parallelism); 
#pragma omp parallel for
for (int i = 0; i < limit; ++i) {
    int j = get_coord(i); // can return any value in range [0,128)
    float x = compute_value(j, i);
    std::scoped_lock lock(coord_mutex[j % parallelism]);
    coord_max[j] = std::max(coord_max[j], x);     
}
+1

, :

  • , ( ).

  • , . : parallelism; : !

  • - ! , 16 ( 32/64/...) "" : bank0 0, 16, 32, 48, 64,... bank1 1, 17, 33, 49, 65,... bank2 2, 18, 34, 50, 66,... ... 16 , , 16-way parallelism. n, (n% 16), , .

+1

, , , omp critical:

std::vector<float> coord_max(128);  
float              fbuffer(0);
#pragma omp parallel 
{
  std::vector<float> thread_local_buffer(128);  

  // Assume limit is a really big number
  #pragma omp for       
  for (int ii = 0; ii < limit; ++ii) {
   int  jj = get_coord(ii); // can return any value in range [0,128)
   float x = compute_value(jj,ii);
   thread_local_buffer[jj] = std::max(thread_local_buffer[jj], x);
  } 
  // At this point each thread has a partial local vector
  // containing the maximum of the part of the problem 
  // it has explored

  // Reduce the results
  for( int ii = 0; ii < 128; ii++){
    // Find the max for position ii
#pragma omp for schedule(static,1) reduction(max:fbuffer)
    for( int jj = 0; jj < omp_get_thread_num(); jj++) {
      fbuffer = thread_local_buffer[ii];
    } // Barrier implied here
    // Write it in the vector at correct position
#pragma omp single
    {
      coord_max[ii] = fbuffer;
      fbuffer = 0;
    } // Barrier implied here

  }
}

Please note that I did not compile the fragment, so I could leave a syntax error inside. Anyway, I hope that I convey this idea.

+1
source

All Articles