Chapel: Improve the speed of our random number generator

Created on 1 May 2019 · 7 comments · Source: chapel-lang/chapel

Our RandomStream is parallel safe by default, but to get parallel safety we're currently just using a sync as a lock which has a lot of overhead. When parallel safety is not needed this lock adds something like a 300x overhead.

use Random, Time;

// Benchmark knobs: getNext() calls per task, and the task count
// (defaults to this locale's hardware parallelism).
config const trials = 1_000_000;
config const tasks = here.maxTaskPar;
// Compile-time toggle for the RandomStream's internal locking.
config param parSafe = true;

// Sink for the accumulated sum so the compiler cannot
// dead-code-eliminate the generation loop.
proc preventDCE(x) {}
var t: Timer; t.start();
coforall 1..tasks {
  var a: real;
  // Each task owns a private stream, so parSafe=false would be safe
  // here; the benchmark isolates the pure overhead of the lock.
  var rndStream = new RandomStream(real, parSafe=parSafe);
  for 1..trials do a += rndStream.getNext();
  preventDCE(a);
}
t.stop();
writeln(t.elapsed());

| config | time |
| ------------- | ------ |
| parSafe=true | 4.260s |
| parSafe=false | 0.016s |

Parallel safety by default is nice, but it'd be great to reduce the overhead. I was wondering if we could use an atomic CAS to implement the random number generation advancing. Below is a simple LCG RNG to compare the performance of serial vs. sync-lock vs. CAS-loop implementations at different contention levels. Based on those results it looks like it may be worthwhile to use a CAS-loop


use Time;

config const trials = 1_000_000;
config const tasks = here.maxTaskPar;

// Sink that defeats dead-code elimination of the benchmark loop.
proc preventDCE(x) {}
// Times `tasks` tasks each drawing `trials` numbers, either from
// per-task streams (concurrentAccess=false) or one shared stream.
proc run(param parSafe, param concurrentAccess) {
  var t: Timer; t.start();
  if !concurrentAccess {
    // Uncontended case: every task constructs its own stream.
    coforall 1..tasks {
      var a: int;
      var rndStream = new RandomStream(parSafe);
      for 1..trials do a += rndStream.getNext();
      preventDCE(a);
    }
  } else {
    // Contended case: all tasks hammer a single shared stream.
    var rndStream = new RandomStream(parSafe);
    coforall 1..tasks {
      var a: int;
      for 1..trials do a += rndStream.getNext();
      preventDCE(a);
    }
  }
  t.stop();
  writeln(parSafe, " with concurrentAccess=", concurrentAccess, " took ",  t.elapsed(), " seconds ");
}

// Uncontended timings: each task gets its own private stream.
run(ParSafety.notsafe,    concurrentAccess=false);
run(ParSafety.atomicCAS,  concurrentAccess=false);
run(ParSafety.syncLock,   concurrentAccess=false);
run(ParSafety.atomicLock, concurrentAccess=false);

writeln();
// Contended timings: all tasks share one stream (notsafe would race,
// so it is omitted here).
run(ParSafety.atomicCAS,  concurrentAccess=true);
run(ParSafety.syncLock,   concurrentAccess=true);
run(ParSafety.atomicLock, concurrentAccess=true);

// Locking strategies under comparison.
enum ParSafety {notsafe, atomicCAS, syncLock, atomicLock};

// Maps each safety mode to the type of its lock field: a sync var for
// syncLock, an atomic flag for atomicLock, and no storage otherwise.
proc getLockType(param parSafe: ParSafety) type {
  if parSafe == ParSafety.syncLock then
    return sync bool;
  else if parSafe == ParSafety.atomicLock then
    return atomic bool;
  else
    return void;
}

// Minimal LCG random stream (classic C rand() constants) used to compare
// the cost of the different parallel-safety strategies in ParSafety.
class RandomStream {
  // 31-bit mask for the LCG state.
  param RAND_MAX = (1 << 31) - 1;
  param parSafe: ParSafety;
  // Only the CAS variant needs atomic state; the others guard a plain int.
  var rseed: if parSafe == ParSafety.atomicCAS then atomic int else int;
  // sync bool / atomic bool / void, chosen per locking strategy.
  var lock: getLockType(parSafe);

  proc init(param parSafe) {
    this.parSafe = parSafe;
  }

  // Advances the LCG by one step and returns the freshly generated value,
  // using the locking discipline selected by `parSafe`.
  inline proc getNext() {
    select parSafe {
      when ParSafety.notsafe {
        rseed = (rseed * 1103515245 + 12345) & RAND_MAX;
        return rseed;
      }
      when ParSafety.atomicCAS {
        var desired: int;
        do {
          // Chapel do-while scoping keeps prev_rseed visible in the
          // loop condition below.
          var prev_rseed = rseed.read();
          desired = (prev_rseed * 1103515245 + 12345) & RAND_MAX;
        } while !rseed.compareExchange(prev_rseed, desired);

        return desired;
      }
      when ParSafety.syncLock {
        // Writing the sync var blocks until it is empty, acquiring the "lock".
        lock = true;
        rseed = (rseed * 1103515245 + 12345) & RAND_MAX;
        // Capture the new value BEFORE releasing the lock: reading rseed
        // after the release races with other tasks advancing the state,
        // so a caller could receive a value it never generated.
        const next = rseed;
        // Reading the sync var empties it, releasing the lock.
        lock;
        return next;
      }
      when ParSafety.atomicLock {
        // Test-and-set spinlock; yield so sibling tasks can make progress.
        while lock.testAndSet(memory_order_acquire) do chpl_task_yield();
        rseed = (rseed * 1103515245 + 12345) & RAND_MAX;
        // Capture under the lock for the same reason as in syncLock.
        const next = rseed;
        lock.clear(memory_order_release);
        return next;
      }
    }
  }
}

For 1M trials:

| config | serial access | concurrent access |
| ------------ | ------------- | ----------------- |
| no safety | 0.002 | N/A |
| atomic CAS | 0.011 | 1.762 |
| sync lock | 4.477 | 18.51 |
| atomic lock | 0.011 | 47.26 |

For 10M trials:

| config | serial access | concurrent access |
| ---------- | ------------- | ----------------- |
| no safety | 0.017 | N/A |
| atomic CAS | 0.116 | 17.0 |

It looks like atomic CAS is a win for serial and concurrent accesses. There is obviously still overhead compared to something that doesn't need to be parallel safe at all, but we should be able to improve the performance of the default mode pretty significantly.

An atomic lock is better for serial access, but much worse under contention (which isn't surprising). A hybrid lock like we've wanted in other cases could help too and may be good enough (since using CAS for the actual PCG implementation is much harder).

Performance user issue

Most helpful comment

With https://github.com/chapel-lang/chapel/pull/13013 the overhead for uncontested (serial) generation from a random stream is pretty minimal. It used to add 300x overhead for generating random reals, and now it's only 1.2x.

For the code in the issue description we used to see:

| config (1M iters) | real(64) |
| ------------------- | -------- |
| parSafe=true | 4.260s |
| parSafe=false | 0.016s |

And now we see (with 100x more iters):

| config (100M iters) | real(64) |
| ------------------- | -------- |
| parSafe=true | 1.80s |
| parSafe=false | 1.50s |

So generating reals is only 1.2x slower when parallel safety isn't needed.

Note that generating uints is faster than generating reals and the microbenchmark added in #13009 generates uints. For generating uints the overhead of parallel safety when you don't need it is something like 5-10x, so it can still be worthwhile to use a parSafe=false stream when you don't need parallel safety but the overhead in the default case with parallel safety enabled is much better than it used to be.

All 7 comments

And this is probably obvious, but I don't know that much about random number generators so there could be bugs in my simple LCG program, but I think the performance results are sound.

I also experienced a similar slowdown for RNG with parSafe = true. For simple tests on my Mac, generating 10^8 random numbers with getNext() took only 0.7s with parSafe = false (with 1 thread) and 20s with parSafe = true (again with 1 thread), so the latter is significantly slower. On the other hand, if I use fillRandom() and take several random numbers at once (say 100), the speed became comparable (e.g. 1.3 vs 1.6s for 1 thread, 2.5 vs 2.8 for 2 threads, and 3.2 vs 3.5 for 4 threads). But using more threads make the overall time longer, so I guess the parallel overhead might be also not small for fillRandom().

FWIW, my use case of RNG is molecular dynamics calculations where a large number of random numbers are used to thermalize the velocities of atoms (at each step of time evolution). In this case, the cost of RNG is not so important as compared to other parts of the calculation (like force evaluation), so the above slowdown is not problematic for me at all. However, I guess for some applications where RNG is the main computational part (including some kind of Monte Carlo), the performance of RNG might be much more important.

So anyway, I believe the potential speedup of RNG would be really nice (for general use cases :)

The simple LCG test in the issue uses a testAndSet lock, which was faster for serial access but slower for parallel/contested access. Using a test-and-testAndSet lock is much faster under contention and around 2x faster than sync under contention too. I'll switch to using that after we gather a few days of perf testing for the benchmark added in #13009

With https://github.com/chapel-lang/chapel/pull/13013 the overhead for uncontested (serial) generation from a random stream is pretty minimal. It used to add 300x overhead for generating random reals, and now it's only 1.2x.

For the code in the issue description we used to see:

| config (1M iters) | real(64) |
| ------------------- | -------- |
| parSafe=true | 4.260s |
| parSafe=false | 0.016s |

And now we see (with 100x more iters):

| config (100M iters) | real(64) |
| ------------------- | -------- |
| parSafe=true | 1.80s |
| parSafe=false | 1.50s |

So generating reals is only 1.2x slower when parallel safety isn't needed.

Note that generating uints is faster than generating reals and the microbenchmark added in #13009 generates uints. For generating uints the overhead of parallel safety when you don't need it is something like 5-10x, so it can still be worthwhile to use a parSafe=false stream when you don't need parallel safety but the overhead in the default case with parallel safety enabled is much better than it used to be.

And if the last comment was confusing, here's a similar summary with code -- for a slightly modified version of the original test:

use Random, Time;

config const trials = 1_000_000;
config const tasks = here.maxTaskPar;

// When false (the default), preventDCE keeps `a` live without printing.
config const printIt = false;
proc preventDCE(a) { if printIt then writeln(a); }

// Times `tasks` tasks each drawing `trials` values of type rndType from
// per-task streams constructed with the given parSafe setting.
proc test(param parSafe, type rndType) {
  var t: Timer; t.start();
  coforall 1..tasks {
    var a: rndType;
    var rndStream = new RandomStream(rndType, parSafe=parSafe);
    for 1..trials do a += rndStream.getNext();
    preventDCE(a);
  }
  t.stop();
  writef("%-10s parSafe=%-5t time(s): %dr\n", rndType:string, parSafe, t.elapsed());
}

// Measure uint and real generation with and without parallel safety.
test(parSafe=false, uint);
test(parSafe=true,  uint);

test(parSafe=false, real);
test(parSafe=true,  real);

Before: ./before-rng --trials=1_000_000

| RNG type | parSafe=false | parSafe=true | parSafe overhead |
| -------- | ------------- | ------------ | ---------------- |
| uint(64) | 0.003s | 4.200s | ~1400x |
| real(64) | 0.015s | 4.200s | ~ 300x |

Master (100x more trials): ./master-rng --trials=100_000_000

| RNG type | parSafe=false | parSafe=true | parSafe overhead |
| -------- | ------------- | ------------ | ---------------- |
| uint(64) | 0.20s | 0.95s | 4.5x |
| real(64) | 1.50s | 1.80s | 1.2x |

I'm not sure where things ended up on the CAS vs test-and-test-and-set lock, but it's possible we might be able to migrate our PCG usage to work with a 128-bit CAS storing the number generated and the state. (Related RNGs to get more bits at a time, like how we get 64 bits or a complex(128), can use a strategy like how we do the bounded rng).

Looks like CAS could be up to 2x faster under contention, but no advantage when not contended, so I don't think it's worth trying to optimize further yet.

Timings for the below program (similar to original one but swapping testandset lock out with chpl_LocalSpinlock):

| config | serial access | concurrent access |
| ------------ | ------------- | ----------------- |
| atomic CAS | 0.058s | 8.8s |
| atomic lock | 0.056s | 15.2s |

use Time;

config const trials = 5_000_000;
config const tasks = here.maxTaskPar;

// Sink that defeats dead-code elimination of the benchmark loop.
proc preventDCE(x) {}
// Times `tasks` tasks each drawing `trials` numbers, either from
// per-task streams (concurrentAccess=false) or one shared stream.
proc run(param parSafe, param concurrentAccess) {
  var t: Timer; t.start();
  if !concurrentAccess {
    // Uncontended case: every task constructs its own stream.
    coforall 1..tasks {
      var a: int;
      var rndStream = new RandomStream(parSafe);
      for 1..trials do a += rndStream.getNext();
      preventDCE(a);
    }
  } else {
    // Contended case: all tasks hammer a single shared stream.
    var rndStream = new RandomStream(parSafe);
    coforall 1..tasks {
      var a: int;
      for 1..trials do a += rndStream.getNext();
      preventDCE(a);
    }
  }
  t.stop();
  writeln(parSafe, " with concurrentAccess=", concurrentAccess, " took ",  t.elapsed(), " seconds ");
}

// Serial (per-task stream) timings, then contended (shared-stream) ones.
run(ParSafety.atomicCAS,  concurrentAccess=false);
run(ParSafety.atomicLock, concurrentAccess=false);
writeln();
run(ParSafety.atomicCAS,  concurrentAccess=true);
run(ParSafety.atomicLock, concurrentAccess=true);

// The two strategies under comparison in this revision.
enum ParSafety {atomicCAS, atomicLock};

// LCG stream comparing a CAS loop against the runtime's
// test-and-test-and-set chpl_LocalSpinlock.
class RandomStream {
  // 31-bit mask for the LCG state (classic C rand() constants).
  param RAND_MAX = (1 << 31) - 1;
  param parSafe: ParSafety;
  // CAS mode keeps the state in an atomic int; the lock mode guards a
  // plain int with the spinlock below.
  var rseed: if parSafe == ParSafety.atomicCAS then atomic int else int;
  // nothing-typed field takes no storage when no lock is needed.
  var lock: if parSafe == ParSafety.atomicLock then chpl_LocalSpinlock else nothing;

  proc init(param parSafe) {
    this.parSafe = parSafe;
  }

  // Advances the LCG by one step and returns the freshly generated value.
  inline proc getNext() {
    select parSafe {
      when ParSafety.atomicCAS {
        var desired: int;
        do {
          // Chapel do-while scoping keeps prev_rseed visible in the
          // loop condition below.
          var prev_rseed = rseed.read();
          desired = (prev_rseed * 1103515245 + 12345) & RAND_MAX;
        } while !rseed.compareExchange(prev_rseed, desired);
        return desired;
      }
      when ParSafety.atomicLock {
        lock.lock();
        rseed = (rseed * 1103515245 + 12345) & RAND_MAX;
        // Capture the new value BEFORE unlocking: reading rseed after the
        // release races with other tasks advancing the state, so a caller
        // could receive a value it never generated.
        const next = rseed;
        lock.unlock();
        return next;
      }
    }
  }
}
Was this page helpful?
0 / 5 - 0 ratings