// ConcurrentCopyExecute.hpp
#pragma once
#include <array>
#include <functional>
#include <opencv2/core/cuda.hpp>
/**
 * Splits an image into N horizontal stripes and hands every stripe to a
 * user-supplied callback together with a dedicated cv::cuda::Stream.
 *
 * The class owns N streams for its whole lifetime. Whether the work of the
 * individual stripes actually overlaps depends on what the callback enqueues
 * on the stream it receives.
 *
 * @tparam N number of stripes / streams; run() requires N > 0.
 */
template <std::size_t N> class ConcurrentCopyExecute {
public:
  /**
   * Invokes @p callback once per non-empty stripe, in top-to-bottom order.
   *
   * Stripe j covers the rows [height * j / N, height * (j + 1) / N) and all
   * columns [0, size.width). When the height is divisible by N every stripe
   * has exactly height / N rows. Otherwise the rows are distributed as evenly
   * as possible, and stripes that would be empty (height < N) are skipped.
   * Nothing is called when the height is not positive.
   *
   * @param callback receives (row range, column range, stream of the stripe).
   * @param size     size of the image to partition.
   */
  void run(const std::function<void(const cv::Range &, const cv::Range &,
                                    cv::cuda::Stream &)> &callback,
           const cv::Size &size) {
    static_assert(N > 0, "ConcurrentCopyExecute::run needs at least one stream");
    if (size.height <= 0) {
      return;
    }
    const cv::Range colRange(0, size.width);
    // Signed 64-bit arithmetic: avoids the int/size_t mix and cannot overflow
    // when the height is multiplied by the stripe index.
    const long long height = size.height;
    const long long stripeCount = static_cast<long long>(N);
    // Iterating over the streams (not over the rows) guarantees that the
    // stream index can never run past the end of the array.
    for (std::size_t j = 0; j < N; ++j) {
      const long long index = static_cast<long long>(j);
      const int rowBegin = static_cast<int>(height * index / stripeCount);
      const int rowEnd = static_cast<int>(height * (index + 1) / stripeCount);
      if (rowBegin == rowEnd) {
        continue; // fewer rows than stripes: nothing to do for this stream
      }
      callback(cv::Range{rowBegin, rowEnd}, colRange, streams[j]);
    }
  }

  /** Calls waitForCompletion() on every owned stream, in index order. */
  void synchronize() {
    for (auto &stream : streams) {
      stream.waitForCompletion();
    }
  }

private:
  /// One stream per stripe; stripe j is always paired with streams[j].
  std::array<cv::cuda::Stream, N> streams;
};