Spaces:

ntt123
/

WaveGRU-Text-To-Speech

Running

WaveGRU-Text-To-Speech / sparse_matmul /os /coop_threads.cc

NTT123

add fast cpp wavegru

d1a84ee over 3 years ago

2.57 kB

	// Copyright 2021 Google LLC
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include "sparse_matmul/os/coop_threads.h"

	#include <atomic>

	namespace csrblocksparse {

	// Blocks the calling thread until |num_threads_| threads have called
	// barrier(), then lets all of them continue. Returns immediately when
	// |num_threads_| is less than 2.
	//
	// Memory ordering: every thread performs an acquire-release read-modify-write
	// on |threads_at_barrier_|, the last thread to arrive then publishes the new
	// step with a release store to |barrier_step_|, and every waiting thread
	// observes it with an acquire load. This chain is what ensures the global
	// memory consistency across the barrier.
	//
	// It is possible for |barrier_step_| to roll over, but this is safe here:
	// waiters only test the step for inequality against the value they sampled,
	// and the increment is performed in unsigned arithmetic so that it wraps
	// instead of overflowing a signed integer (which would be undefined
	// behaviour).
	//
	// |yield| instructs the processor that it is in a spin loop and can stop doing
	// things like out of order, speculative execution, prefetching, etc. On hyper
	// threaded machines it can also choose to swap in the other thread. Note that
	// this is a hardware level decision and the OS is never involved.
	void SpinBarrier::barrier() {
	  if (num_threads_ < 2) return;

	  // Sample the step in the atomic's own value type so that no narrowing or
	  // sign-changing conversion happens between the load, the comparison in the
	  // wait loop, and the store below.
	  const auto old_step = barrier_step_.load(std::memory_order_relaxed);

	  // fetch_add returns the count *before* this thread's increment.
	  const int val_threads =
	      threads_at_barrier_.fetch_add(1, std::memory_order_acq_rel);

	  if (val_threads == num_threads_ - 1) {
	    // This is where the logic can go all wrong if the barrier is called by
	    // more threads than |num_threads_| -- the assumption that we're the last
	    // thread is inherently invalid.

	    // Assuming num_threads_ are calling this barrier, then we're the last
	    // thread to reach the barrier, reset and advance step count.
	    //
	    // The relaxed reset is ordered before the release store that follows, so
	    // a waiter that sees the new step also sees the counter back at zero.
	    threads_at_barrier_.store(0, std::memory_order_relaxed);

	    // Increment as unsigned: wrap-around is well defined, whereas
	    // |old_step + 1| on a signed int is undefined behaviour at INT_MAX.
	    // NOTE(review): if |barrier_step_| holds a signed type, the conversion
	    // back is modular on all mainstream compilers (and guaranteed from C++20).
	    barrier_step_.store(static_cast<unsigned int>(old_step) + 1u,
	                        std::memory_order_release);
	  } else {
	    // Wait for step count to advance, then continue.
	    while (barrier_step_.load(std::memory_order_acquire) == old_step) {
	      // Intel recommends the equivalent instruction PAUSE, not be called more
	      // than once in a row, I can't find any recommendations for ARM, so
	      // following that advice here.
	#if defined(__aarch64__) || defined(__arm__)
	      asm volatile("yield\n" ::: "memory");
	#else
	      // No pause for x86! The pause instruction on Skylake takes 141 clock
	      // cycles, which in an AVX2-down-clocked CPU is getting on for 70ns.
	#endif
	    }
	  }
	}

	} // namespace csrblocksparse