Spaces:
Running
Running
| // Copyright 2021 Google LLC | |
| // | |
| // Licensed under the Apache License, Version 2.0 (the "License"); | |
| // you may not use this file except in compliance with the License. | |
| // You may obtain a copy of the License at | |
| // | |
| // http://www.apache.org/licenses/LICENSE-2.0 | |
| // | |
| // Unless required by applicable law or agreed to in writing, software | |
| // distributed under the License is distributed on an "AS IS" BASIS, | |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| // See the License for the specific language governing permissions and | |
| // limitations under the License. | |
| namespace csrblocksparse { | |
| // All threads must execute a std::memory_order_seq_cst operation on | |
| // |barrier_step_| this is what ensures the global memory consistency across | |
| // the barrier. | |
| // | |
| // It is possible for the |barrier_step_| to roll over, but this is safe here. | |
| // | |
| // |yield| instructs the processor that it is in a spin loop and can stop doing | |
| // things like out of order, speculative execution, prefetching, etc. On hyper | |
| // threaded machines it can also choose to swap in the other thread. Note that | |
| // this is a hardware level decision and the OS is never involved. | |
| void SpinBarrier::barrier() { | |
| if (num_threads_ < 2) return; | |
| int old_step = barrier_step_.load(std::memory_order_relaxed); | |
| int val_threads = threads_at_barrier_.fetch_add(1, std::memory_order_acq_rel); | |
| if (val_threads == num_threads_ - 1) { | |
| // This is where the logic can go all wrong if the barrier is called by | |
| // more threads than |num_threads_| -- the assumption that we're the last | |
| // thread is inherently invalid. | |
| // Assuming num_threads_ are calling this barrier, then we're the last | |
| // thread to reach the barrier, reset and advance step count. | |
| threads_at_barrier_.store(0, std::memory_order_relaxed); | |
| barrier_step_.store(old_step + 1, std::memory_order_release); | |
| } else { | |
| // Wait for step count to advance, then continue. | |
| while (barrier_step_.load(std::memory_order_acquire) == old_step) { | |
| // Intel recommends the equivalent instruction PAUSE, not be called more | |
| // than once in a row, I can't find any recommendations for ARM, so | |
| // following that advice here. | |
| asm volatile("yield\n" ::: "memory"); | |
| // No pause for x86! The pause instruction on Skylake takes 141 clock | |
| // cycles, which in an AVX2-down-clocked CPU is getting on for 70ns. | |
| } | |
| } | |
| } | |
| } // namespace csrblocksparse | |