/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <atomic>
#include <cstdint>
#include <memory>
#include <thread>
#include <vector>

#include "absl/memory/memory.h"

namespace csrblocksparse {
// A re-usable barrier. Keeps threads in extremely tight sync without
// relinquishing control. All memory writes _before_ this barrier are visible
// to all threads _after_ this barrier. Similar in spirit to pthread_barrier.
// If you expect arrival times at this barrier to vary by more than
// microseconds, this is probably not the right synchronization primitive for
// you. If |num_threads| exceeds the number of physical threads that can run
// simultaneously, then using this is certainly a bad idea (although it should
// still be correct).
//
// Callers MUST NOT call barrier() from more threads than |num_threads|; the
// result is undefined behavior.
class SpinBarrier {
 public:
  explicit SpinBarrier(int num_threads)
      : num_threads_(num_threads), threads_at_barrier_(0), barrier_step_(0) {}

  void barrier();

 private:
  const int num_threads_;
  std::atomic<int32_t> threads_at_barrier_;
  std::atomic<uint32_t> barrier_step_;  // Unsigned to make overflow defined.
};
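
// Illustrative usage sketch (not part of the original header): each thread
// writes its own slot of a shared vector, then calls barrier() so that every
// thread sees all of the writes before reading the other slots. The helper
// name ExampleSpinBarrierWorker and the |partials| parameter are hypothetical.
inline void ExampleSpinBarrierWorker(SpinBarrier* barrier, int tid,
                                     std::vector<float>* partials) {
  (*partials)[tid] = static_cast<float>(tid);  // Phase 1: write own slot.
  barrier->barrier();  // All phase-1 writes are now visible to all threads.
  float total = 0.0f;  // Phase 2: safe to read every slot.
  for (float value : *partials) total += value;
  (void)total;  // Use |total| here; it is discarded in this sketch.
}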
// Producer-consumer API using the same underlying mechanism as SpinBarrier.
// This class is intended to allow >=1 producers to produce data for >=1
// consumers, without blocking the producers.
// A consumer will block if it is ready before all of the producers have
// produced.
// WARNING: By design this lock does not work without some other barrier that
// prevents any producer from producing again, or any consumer from consuming
// again, until all consumers have consumed. Basically, any loop that uses
// ProducerConsumer must have at least two consume() calls in each thread (on
// different instances) in order for the lock to work correctly.
class ProducerConsumer {
 public:
  ProducerConsumer(int num_producers, int num_consumers)
      : num_producers_(num_producers),
        num_consumers_(num_consumers),
        producers_ready_(0),
        consumers_passed_(0) {}

  // Indicates that the data produced by this thread is ready. Does NOT block.
  // NOTE that some other lock, one that depends on all consumers having
  // called consume(), must exist between the call to produce() here and
  // looping back to call produce() again on the same ProducerConsumer. One
  // such candidate would be a call to SpinBarrier above by all producers and
  // consumers. Another candidate would be a separate ProducerConsumer object
  // in which these producers consume some data produced by the threads that
  // consume the data produced here. E.g.:
  //   tid       0        1        2        3
  //   action 1  produce  produce  consume  consume  (on ProducerConsumer 1)
  //   action 2  consume  consume  produce  produce  (on ProducerConsumer 2)
  //   action 3  produce  produce  consume  consume  (on ProducerConsumer 3)
  //   action 4  consume  consume  produce  produce  (on ProducerConsumer 4)
  //   loop back to action 1.
  // NOTE: It is inadequate to loop back after action 2, as thread 0 could
  // loop back and consume again on ProducerConsumer 2 while thread 1 is still
  // completing its call to consume(). It is likewise inadequate to loop back
  // after action 3, for the same reason (though tsan doesn't seem to pick
  // this up).
  inline void produce() {
    producers_ready_.fetch_add(1, std::memory_order_acq_rel);
  }
  // Waits if necessary for all producers to have produced before proceeding.
  // The ProducerConsumer cannot be reused until all consumers have consumed.
  // See the detailed comment and example on produce().
  inline void consume() {
    // We can't do anything until all the producers have produced.
    while (producers_ready_.load(std::memory_order_acquire) < num_producers_) {
#if defined(__aarch64__) || defined(__arm__)
      asm volatile("yield\n" ::: "memory");
#else
      // No pause for x86! The pause instruction on Skylake takes 141 clock
      // cycles, which on an AVX2-down-clocked CPU is getting on for 70ns.
#endif
    }
    // NOTE: It is tempting to move this fetch_add to before the wait loop to
    // reduce contention for the memory location, but that would break the
    // lock, as then the last to arrive could zero out producers_ready_ before
    // the other consumers have noticed that all producers have produced.
    // With the fetch_add after the wait loop, we are guaranteed that all
    // producers have produced AND all consumers have noticed that they have
    // produced before we zero out the counters.
    int consumers = consumers_passed_.fetch_add(1, std::memory_order_acq_rel);
    if (consumers == num_consumers_ - 1) {
      // The last consumer to pass has to reset everything for the next time.
      producers_ready_.store(0, std::memory_order_relaxed);
      consumers_passed_.store(0, std::memory_order_relaxed);
    }
  }
  int num_producers() const { return num_producers_; }
  int num_consumers() const { return num_consumers_; }

 private:
  const int num_producers_;
  const int num_consumers_;
  std::atomic<int32_t> producers_ready_;
  std::atomic<int32_t> consumers_passed_;
};
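
// Illustrative sketch (not part of the original header) of one safe way to
// reuse a ProducerConsumer in a loop: follow each iteration's produce() /
// consume() with a SpinBarrier::barrier() on every thread, as suggested in
// the comment on produce(). The helper name ExampleProduceConsumeLoop and its
// parameters are hypothetical.
inline void ExampleProduceConsumeLoop(SpinBarrier* barrier,
                                      ProducerConsumer* pc, bool is_producer,
                                      int num_iterations) {
  for (int i = 0; i < num_iterations; ++i) {
    if (is_producer) {
      // ... write the data that the consumers will read ...
      pc->produce();  // Never blocks.
    } else {
      pc->consume();  // Blocks until every producer has called produce().
      // ... read the produced data ...
    }
    // The barrier prevents any producer from producing again (and any
    // consumer from consuming again) before every consumer has finished
    // consume(), which is the extra synchronization ProducerConsumer needs.
    barrier->barrier();
  }
}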
// We define Thread here, so we can easily change its type later.
using Thread = std::thread;
using ThreadId = std::thread::id;

// Creates (|num_threads| - 1) threads and executes a total of |num_threads|
// copies of |func| (one of them on the calling thread).
//
// Useful for long-running func bodies that are intended to run in lock step.
// A possible use case for this style of parallelism over a thread pool is
// when we want tight control over which memory is resident in the L2 cache
// of a processor. With a pool we have no control over which thread gets
// assigned which portion of the computation, resulting in L2 thrashing. With
// this breakdown we can make sure each thread only accesses a specific
// L2-sized portion of memory.
//
// func's signature must be (SpinBarrier*, int thread_id, ...);
template <class Function, class... Args>
void LaunchOnThreadsWithBarrier(int num_threads, Function&& func,
                                Args&&... args) {
  SpinBarrier spin_barrier(num_threads);

  std::vector<std::unique_ptr<Thread>> threads;
  threads.reserve(num_threads);
  for (int tid = 1; tid < num_threads; ++tid) {
    // std::thread starts executing |f| on construction; no separate Start()
    // call is needed.
    auto f = [&, tid]() { func(&spin_barrier, tid, args...); };
    threads.emplace_back(absl::make_unique<Thread>(f));
  }

  // The calling thread runs the copy of |func| with thread_id 0.
  const int kLocalTid = 0;
  func(&spin_barrier, kLocalTid, args...);

  for (auto& thread : threads) {
    thread->join();
  }
}
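
// Illustrative usage sketch (not part of the original header): launches four
// copies of a worker whose signature matches (SpinBarrier*, int thread_id,
// ...). The names ExampleLaunch, kNumThreads, and |sums| are hypothetical.
inline void ExampleLaunch() {
  constexpr int kNumThreads = 4;
  std::vector<float> sums(kNumThreads, 0.0f);
  LaunchOnThreadsWithBarrier(
      kNumThreads,
      [](SpinBarrier* barrier, int tid, std::vector<float>* sums) {
        (*sums)[tid] = static_cast<float>(tid * tid);  // Per-thread work.
        barrier->barrier();  // All per-thread writes are now visible.
      },
      &sums);
}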
}  // namespace csrblocksparse