Custom Memory Allocators in OpenCV - Zero copy for GPUs
Introduction
OpenCV provides hooks to control memory allocation. While small POD types like cv::Point and cv::Rect live on the stack, large arrays such as cv::Mat and cv::cuda::GpuMat allocate heap or device memory—and that path is customizable. On the CPU, a custom cv::MatAllocator can be installed; on CUDA, cv::cuda::GpuMat::Allocator serves the same role. (For OpenCL, cv::UMat follows a separate path.) With these mechanisms, developers can introduce pooling, alignment, NUMA-aware placement, pinned memory for faster DMA, and even zero-copy flows where the GPU accesses host RAM directly.
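For example, the device-side hook can be installed like this. The sketch below is illustrative only (the class name PitchedAllocator and the error handling are assumptions, and it roughly mirrors what OpenCV's built-in allocator does); the rest of this article concentrates on the host-side MatAllocator path.
#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>

// Illustrative device-side allocator: allocate() must fill data, step and
// refcount; free() releases what allocate() created.
// (OpenCV's built-in allocator special-cases single rows/columns to keep
// them continuous; this sketch always uses pitched memory.)
class PitchedAllocator : public cv::cuda::GpuMat::Allocator {
public:
  bool allocate(cv::cuda::GpuMat *mat, int rows, int cols,
                size_t elemSize) override {
    void *ptr = nullptr;
    size_t step = 0;
    if (cudaMallocPitch(&ptr, &step, elemSize * cols, rows) != cudaSuccess)
      return false; // OpenCV reports the failed allocation
    mat->data = static_cast<unsigned char *>(ptr);
    mat->step = step;
    mat->refcount = new int; // GpuMat sets and maintains the count
    return true;
  }
  void free(cv::cuda::GpuMat *mat) override {
    cudaFree(mat->datastart);
    delete mat->refcount;
  }
};

// Install it process-wide before any GpuMat is created:
//   static PitchedAllocator pitched;
//   cv::cuda::GpuMat::setDefaultAllocator(&pitched);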
CUDA zero-copy primitives: pinned/mapped host memory
Zero-copy means the GPU can directly access host RAM without cudaMemcpy. CUDA gives you two main ways to get there:
1. cudaHostAlloc: allocate new pinned (optionally mapped) host memory
cudaHostAlloc returns pinned (page-locked) host memory. Its flags argument controls behavior; if you pass cudaHostAllocMapped, the allocation is also mapped into the GPU’s virtual address space so kernels can dereference it directly (zero-copy).
Supported flags (bitwise-ORable):
- cudaHostAllocDefault: pinned memory only; no device mapping.
- cudaHostAllocPortable: the pinned allocation is visible to all CUDA contexts in the process.
- cudaHostAllocMapped: pinned and mapped; obtain a device alias with cudaHostGetDevicePointer.
- cudaHostAllocWriteCombined: write-combined pinned memory; fast CPU→GPU writes, slow CPU reads (best for upload buffers).
Prerequisite (for mapping): enable host mapping before creating the CUDA context.
cudaSetDevice(0);
cudaSetDeviceFlags(cudaDeviceMapHost); // enable host mapping
cudaFree(0); // create context with these flags
Freeing: release with cudaFreeHost(ptr) (do not use free/delete, and do not call cudaHostUnregister for cudaHostAlloc allocations).
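Putting it together, a minimal sketch looks like this (the buffer size is arbitrary and error checking is omitted for brevity):
void* host = nullptr;
void* devAlias = nullptr;
const size_t bytes = 4 * 1024 * 1024;                 // arbitrary size
cudaHostAlloc(&host, bytes, cudaHostAllocMapped);     // pinned + mapped
cudaHostGetDevicePointer(&devAlias, host, 0);         // device alias (zero-copy)
// ... CPU writes to 'host'; kernels read/write through 'devAlias' ...
cudaDeviceSynchronize();                              // make GPU writes visible to the host
cudaFreeHost(host);                                   // not free()/delete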
2. cudaHostRegister: pin/map an existing host buffer (retrofit zero-copy)
Use cudaHostRegister(ptr, size, flags) when a buffer already exists and you want pinned/zero-copy without reallocating.
It pins (page-locks) the range for fast DMA; with cudaHostRegisterMapped it also maps those pages into the GPU’s virtual address space so kernels can dereference them directly.
Flags:
- cudaHostRegisterDefault: pin only.
- cudaHostRegisterMapped: pin + map (enables zero-copy).
- cudaHostRegisterPortable: usable from any CUDA context in the process.
- cudaHostRegisterReadOnly: hint that the GPU will only read.
Prerequisite for mapping: as above, enable host mapping before the CUDA context is created. A simple usage looks like this:
void* buf = existingCpuPtr; // e.g., img.data from cv::Mat
size_t bytes = totalSizeInBytes;
cudaHostRegister(buf, bytes, cudaHostRegisterMapped); // pin + map
void* d = nullptr;
cudaHostGetDevicePointer(&d, buf, 0); // device alias (zero-copy)
// ... launch kernels using 'd' ...
cudaHostUnregister(buf); // then free/delete 'buf'
An example program using cudaHostAlloc
This program demonstrates a zero-copy OpenCV + CUDA pipeline:
- Enable host mapping: Initializes CUDA with cudaSetDeviceFlags(cudaDeviceMapHost) and checks canMapHostMemory.
- Custom cv::Mat allocator: Installs a MatAllocator that uses cudaHostAlloc(..., cudaHostAllocMapped) so cv::Mat buffers live in pinned + mapped host RAM (DMA-friendly and visible to the GPU).
- Header-only GPU view: Wraps those mapped cv::Mat buffers as cv::cuda::GpuMat (via a helper like wrappedGpuMat using cudaHostGetDevicePointer), with no cudaMemcpy.
- CUDA image ops: Applies a Gaussian blur (cv::cuda::createGaussianFilter()->apply) followed by Canny edge detection (cv::cuda::createCannyEdgeDetector()->detect), then writes the input and output images to disk.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudafilters.hpp> // cv::cuda::createGaussianFilter
#include <opencv2/cudaimgproc.hpp> // cv::cuda::createCannyEdgeDetector
#include <opencv2/imgcodecs.hpp>
using namespace cv;
#define CUDA_CHECK(x) \
do { \
cudaError_t e = (x); \
if (e != cudaSuccess) { \
fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
cudaGetErrorString(e)); \
std::exit(1); \
} \
} while (0)
// A custom allocator for cv::Mat. This will allocate a pinned and
// mapped host memory that can be used directly by GPU.
class GpuMappedAllocator : public MatAllocator {
public:
UMatData *allocate(int dims, const int *sizes, int type, void *data,
size_t *step, AccessFlag flags,
UMatUsageFlags usageFlags) const override {
// Working with 2 dimensions for now.
const int rows = sizes[0], cols = sizes[1];
const size_t elementSize = CV_ELEM_SIZE(type);
const size_t rowStep = cols * elementSize;
if (step) {
// OpenCV expects step[0] = row stride in bytes
// step[1] = elementSize
step[0] = rowStep;
if (dims > 1)
step[1] = CV_ELEM_SIZE(type);
}
UMatData *umat = new UMatData(this);
umat->size = rows * rowStep;
if (data) {
// Wrap user memory (not owning)
umat->data = umat->origdata = static_cast<uchar *>(data);
umat->flags = UMatData::USER_ALLOCATED;
umat->handle = nullptr;
return umat;
}
void *host = nullptr;
CUDA_CHECK(cudaHostAlloc(&host, rows * rowStep, cudaHostAllocMapped));
umat->data = umat->origdata = static_cast<uchar *>(host);
umat->handle = host;
return umat;
}
virtual bool allocate(UMatData *data, AccessFlag accessflags,
UMatUsageFlags usageFlags) const override {
// Not supporting UMat/OpenCL path; nothing to do.
return false;
}
void deallocate(UMatData *umat) const override {
if (!umat)
return;
// Only free memory this allocator owns; user-provided buffers were
// wrapped (handle == nullptr), not allocated here.
if (umat->handle)
CUDA_CHECK(cudaFreeHost(umat->handle));
delete umat;
}
};
// Wrap a pinned + mapped cv::Mat in a GpuMat header: the device pointer is an
// alias of the host buffer obtained via cudaHostGetDevicePointer, so no copy
// is made.
static cuda::GpuMat wrappedGpuMat(Mat &m) {
void *devPtr = nullptr;
CUDA_CHECK(cudaHostGetDevicePointer(&devPtr, m.data, 0));
return cuda::GpuMat(m.rows, m.cols, m.type(), devPtr, m.step);
}
int main() {
CUDA_CHECK(cudaSetDevice(0));
CUDA_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));
CUDA_CHECK(cudaFree(0)); // force context
cudaDeviceProp prop{};
CUDA_CHECK(cudaGetDeviceProperties(&prop, 0));
if (!prop.canMapHostMemory) {
fprintf(stderr, "Device cannot map host memory.\n");
return 1;
}
// Install the custom Mat allocator (affects *Mat* only)
static GpuMappedAllocator gpuAllocator;
Mat::setDefaultAllocator(&gpuAllocator);
// Create a custom input matrix
const int W = 640, H = 480;
Mat in(H, W, CV_8UC1);
Mat out(H, W, CV_8UC1);
for (int y = 0; y < H; y++) {
auto *row = in.ptr<uchar>(y);
for (int x = 0; x < W; x++) {
row[x] = static_cast<uchar>((x / 8) % 2 ? 220 : 35);
}
}
cuda::GpuMat gin = wrappedGpuMat(in);
cuda::GpuMat gout = wrappedGpuMat(out);
cuda::GpuMat tmp;
cv::Ptr<cv::cuda::Filter> gauss = cv::cuda::createGaussianFilter(
gin.type(), gin.type(), cv::Size(5, 5), 1.2, 1.2, cv::BORDER_DEFAULT);
cv::Ptr<cv::cuda::CannyEdgeDetector> canny =
cv::cuda::createCannyEdgeDetector(50, 150);
gauss->apply(gin, tmp);
canny->detect(tmp, gout);
imwrite("input.png", in);
imwrite("output.png", out);
return 0;
}
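To build this example against the CUDA-enabled OpenCV from the appendix below, a command along these lines should work (the file name zero_copy.cpp and the CUDA install path /usr/local/cuda are assumptions; adjust them for your system):
g++ -std=c++17 zero_copy.cpp -o zero_copy \
    $(pkg-config --cflags --libs opencv4) \
    -I/usr/local/cuda/include -L/usr/local/cuda/lib64 -lcudart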
Appendix: Downloading and building OpenCV for NVIDIA GPUs
Building OpenCV from source
These steps download, build, and install OpenCV with CUDA support into a local folder. Note that the CUDA toolkit and cuDNN must be installed first.
$ mkdir -p ~/src && cd ~/src
$ git clone --depth 1 https://github.com/opencv/opencv.git
$ git clone --depth 1 https://github.com/opencv/opencv_contrib.git
$ mkdir -p opencv-build && cd opencv-build
$ cmake ../opencv \
-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$HOME/.local/opencv-cuda \
-DWITH_CUDA=ON -DOPENCV_DNN_CUDA=ON -DWITH_CUDNN=ON -DWITH_CUBLAS=ON \
-DENABLE_FAST_MATH=ON -DCUDA_FAST_MATH=ON -DCUDA_ARCH_BIN=8.9 \
-DBUILD_opencv_cudacodec=ON -DOPENCV_EXTRA_MODULES_PATH=../opencv_contrib/modules \
-DBUILD_EXAMPLES=OFF
$ make -j 16
$ make install # It will install in ~/.local/opencv-cuda
$ export OpenCV_DIR=$HOME/.local/opencv-cuda/lib/cmake/opencv4
$ export LD_LIBRARY_PATH=$HOME/.local/opencv-cuda/lib:$LD_LIBRARY_PATH
Once installed, use this program to check that OpenCV can see the GPU:
#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/core/cuda.hpp>
int main() {
int n = cv::cuda::getCudaEnabledDeviceCount();
std::cout << "CUDA devices: " << n << "\n";
if (n > 0)
cv::cuda::printShortCudaDeviceInfo(0);
}
Build the above program using
# so pkg-config picks up the CUDA build (the cmake above needs -DOPENCV_GENERATE_PKGCONFIG=ON)
export PKG_CONFIG_PATH=$HOME/.local/opencv-cuda/lib/pkgconfig:$PKG_CONFIG_PATH
g++ -std=c++17 check_cuda.cpp -o check_cuda $(pkg-config --cflags --libs opencv4)
# The output looks like the below in my case
CUDA devices: 1
Device 0: "NVIDIA GeForce RTX 4050 Laptop GPU" 6140Mb, sm_89, Driver/Runtime ver.12.70/12.60