// ConcurrentCopyExecute.hpp
#pragma once

#include <array>
#include <cassert>
#include <cstddef>
#include <functional>

#include <opencv2/core/cuda.hpp>

/**
 * Splits a 2-D extent into N horizontal stripes (row ranges) and hands each
 * stripe to a user callback together with a dedicated cv::cuda::Stream.
 *
 * The object owns N default-constructed streams; stripe j is always paired
 * with stream j. run() only invokes the callback and does not wait on the
 * streams itself; call synchronize() to block until all of them are done.
 *
 * @tparam N number of stripes / streams; must be greater than zero.
 */
template <std::size_t N> class ConcurrentCopyExecute {
	static_assert(N > 0, "ConcurrentCopyExecute requires at least one stream");

  public:
	/**
	 * Invokes callback once per stripe, in order from the top stripe down.
	 *
	 * Precondition: size.height should be divisible by N without any
	 * remainder. This is checked with assert(), so it is only enforced when
	 * assertions are enabled. When they are compiled out and the height is
	 * not a multiple of N, exactly N stripes are still produced and the last
	 * one is extended to the final row, so no stream index and no row range
	 * ever leaves its valid bounds.
	 *
	 * @param callback called as callback(rowRange, colRange, stream), where
	 *                 rowRange is the half-open row interval of the stripe,
	 *                 colRange is always [0, size.width) and stream is the
	 *                 stream assigned to that stripe.
	 * @param size     extent to split; nothing is invoked when size.height
	 *                 is zero or negative.
	 */
	void run(const std::function<void(const cv::Range &, const cv::Range &,
	                                  cv::cuda::Stream &)> &callback,
	         const cv::Size &size) {
		// An empty extent has no rows to hand out.
		if (size.height <= 0) {
			return;
		}

		// Work in int throughout to avoid mixing signed rows with size_t N.
		const int stripeCount = static_cast<int>(N);
		assert(size.height % stripeCount == 0);

		const cv::Range colRange(0, size.width);
		const int stride = size.height / stripeCount;
		// Bound the loop by the number of streams, not by the row count, so
		// that streams[] can never be indexed past its end.
		for (int j = 0; j < stripeCount; ++j) {
			const int rowBegin = j * stride;
			// The last stripe absorbs any leftover rows; for a height that is
			// a multiple of N this equals rowBegin + stride.
			const int rowEnd =
			    (j + 1 == stripeCount) ? size.height : rowBegin + stride;
			callback(cv::Range{rowBegin, rowEnd}, colRange,
			         streams[static_cast<std::size_t>(j)]);
		}
	}

	/** Blocks until every owned stream reports completion. */
	void synchronize() {
		for (auto &stream : streams) {
			stream.waitForCompletion();
		}
	}

  private:
	// One stream per stripe; index j serves the j-th stripe passed to run().
	std::array<cv::cuda::Stream, N> streams;
};