CANN计算机视觉加速利器:ops-cv图像处理与目标检测算子库详解
**ops-cv** 是CANN提供的图像处理、目标检测相关的算子库,专门为计算机视觉任务设计的高性能算子集合。随着计算机视觉技术在自动驾驶、智能安防、医疗影像等领域的广泛应用,对图像处理和目标检测算子的性能需求日益增长。ops-cv算子库涵盖了图像预处理、特征提取、目标检测、图像变换等各类CV核心算子,为计算机视觉应用在NPU上的高效运行提供关键支持。
CANN计算机视觉加速利器:ops-cv图像处理与目标检测算子库详解
一、项目简介
ops-cv 是CANN提供的图像处理、目标检测相关的算子库,专门为计算机视觉任务设计的高性能算子集合。随着计算机视觉技术在自动驾驶、智能安防、医疗影像等领域的广泛应用,对图像处理和目标检测算子的性能需求日益增长。ops-cv算子库涵盖了图像预处理、特征提取、目标检测、图像变换等各类CV核心算子,为计算机视觉应用在NPU上的高效运行提供关键支持。
该算子库针对图像数据的特点进行了深度优化,包括数据格式转换、色彩空间转换、几何变换、特征图处理等,能够显著提升CV模型的推理性能。
相关链接:
- CANN组织链接:https://atomgit.com/cann
- ops-cv仓库链接:https://atomgit.com/cann/ops-cv
二、核心功能与特性
2.1 算子分类
ops-cv算子库按照功能可分为以下几大类:
| 算子类别 | 主要功能 | 应用场景 |
|---|---|---|
| 图像预处理 | Resize、Crop、Pad、Normalize等 | 数据增强、模型输入准备 |
| 色彩空间转换 | RGB2BGR、RGB2YUV、RGB2GRAY等 | 图像格式转换 |
| 几何变换 | Affine、Perspective、Rotate、Flip等 | 数据增强、姿态校正 |
| 特征提取 | Conv2D、Dilation、Erosion、Sobel等 | 边缘检测、特征提取 |
| 目标检测 | NMS、ROIAlign、ROIPooling等 | 检测后处理 |
| 图像滤波 | GaussianBlur、MedianBlur、Bilateral等 | 噪声去除、图像平滑 |
| 形态学操作 | Open、Close、Gradient等 | 形状分析、特征增强 |
2.2 技术特性
- 多格式支持:支持NCHW、NHWC、HWC等多种数据布局
- 多精度计算:支持FP32、FP16、UINT8等多种数据类型
- 硬件加速:针对NPU的向量计算单元进行优化
- 零拷贝优化:支持原地计算,减少内存拷贝开销
- 批处理优化:支持多图像批量处理
三、环境准备
3.1 系统要求
- 操作系统:Ubuntu 18.04/20.04/22.04
- 硬件平台:Atlas 200I/300T/800T系列(搭载昇腾AI处理器)
- CANN版本:CANN 8.0.RC3及以上
- OpenCV:3.4+ 或 4.x(用于数据对比验证)
3.2 依赖安装
# Install OpenCV (used only for comparing results against a reference implementation)
sudo apt-get install -y libopencv-dev python3-opencv
# Clone the ops-cv repository
git clone https://atomgit.com/cann/ops-cv.git
cd ops-cv
# Configure and build out of tree; -p keeps this step re-runnable when build/ already exists
mkdir -p build && cd build
cmake .. \
    -DCMAKE_BUILD_TYPE=Release \
    -DCANN_INSTALL_PATH=/usr/local/Ascend \
    -DWITH_OPENCV=ON
make -j"$(nproc)"
make install
四、图像预处理算子示例
4.1 图像缩放(Resize)
图像缩放是计算机视觉中最常用的预处理操作之一:
#include "cv_ops.h"
/**
 * Resizes an 8-bit image by delegating to the ops-cv Resize operator.
 *
 * The source is read through input_gm, resized to dst_h x dst_w with the
 * requested interpolation mode, and the result is copied back to output_gm.
 */
extern "C" __global__ __aicore__ void image_resize(
    GM_ADDR input_gm,        // source image: [H, W, C]
    GM_ADDR output_gm,       // destination image: [new_h, new_w, C]
    uint32_t src_h,
    uint32_t src_w,
    uint32_t channels,
    uint32_t dst_h,
    uint32_t dst_w,
    InterpolationMode mode   // interpolation mode: NEAREST, BILINEAR, BICUBIC
)
{
    // Element count of the resized image, used for the final write-back.
    const uint32_t out_elems = dst_h * dst_w * channels;

    // Wrap the global addresses and obtain the local working tensors.
    Tensor src_tensor = Tensor(input_gm);
    Tensor dst_tensor = Tensor(output_gm);
    LocalTensor<uint8_t> src_buf = src_tensor.In<uint8_t>();
    LocalTensor<uint8_t> dst_buf = dst_tensor.Out<uint8_t>();

    // Configure the operator with both geometries, then run it.
    Resize<uint8_t> resizer;
    resizer.Init(src_h, src_w, channels, dst_h, dst_w, mode);
    resizer.Compute(src_buf, dst_buf);

    // Publish the resized image to global memory.
    DataCopy(output_gm, dst_buf, out_elems);

    // Release the local tensors.
    src_buf.Free();
    dst_buf.Free();
}
4.2 图像裁剪(Crop)
/**
 * Copies a crop_h x crop_w window out of an 8-bit [H, W, C] image.
 *
 * The window's top-left corner is (crop_top, crop_left). Each window row is
 * staged through a local tensor and then written to its packed position in
 * output_gm. If the window does not lie entirely inside the source image the
 * kernel returns without writing anything, instead of reading out of bounds.
 */
extern "C" __global__ __aicore__ void image_crop(
    GM_ADDR input_gm,    // source image: [H, W, C]
    GM_ADDR output_gm,   // cropped image: [crop_h, crop_w, C]
    uint32_t src_h,
    uint32_t src_w,
    uint32_t channels,
    uint32_t crop_top,
    uint32_t crop_left,
    uint32_t crop_h,
    uint32_t crop_w
)
{
    // Written as subtractions so the checks themselves cannot overflow.
    if (crop_top > src_h || crop_h > src_h - crop_top ||
        crop_left > src_w || crop_w > src_w - crop_left) {
        return;
    }

    Tensor output = Tensor(output_gm);
    LocalTensor<uint8_t> output_local = output.Out<uint8_t>();

    // NOTE(review): the operator is initialised but the copy below is done by
    // hand; kept in case Init is relied on elsewhere — confirm and remove if not.
    Crop<uint8_t> crop_op;
    crop_op.Init(src_h, src_w, channels, crop_top, crop_left, crop_h, crop_w);

    // Offsets are computed in 64 bits so large images cannot wrap a uint32_t.
    const uint64_t src_row_stride = static_cast<uint64_t>(src_w) * channels;
    const uint64_t dst_row_stride = static_cast<uint64_t>(crop_w) * channels;
    const uint64_t first_offset =
        (static_cast<uint64_t>(crop_top) * src_w + crop_left) * channels;
    const uint32_t row_elems = crop_w * channels;

    for (uint32_t row = 0; row < crop_h; row++) {
        const uint64_t src_offset = first_offset + row * src_row_stride;
        const uint64_t dst_offset = row * dst_row_stride;
        // Global -> local staging buffer -> global.
        DataCopy(output_local, input_gm + src_offset, row_elems);
        DataCopy(output_gm + dst_offset, output_local, row_elems);
    }

    output_local.Free();
}
4.3 图像填充(Pad)
/**
 * Pads an 8-bit [H, W, C] image with a constant value via the ops-cv Pad operator.
 *
 * The padded image is (src_h + pad_top + pad_bottom) rows by
 * (src_w + pad_left + pad_right) columns; the result is copied to output_gm.
 */
extern "C" __global__ __aicore__ void image_pad(
    GM_ADDR input_gm,    // source image: [H, W, C]
    GM_ADDR output_gm,   // padded image: [H+pad_top+pad_bottom, W+pad_left+pad_right, C]
    uint32_t src_h,
    uint32_t src_w,
    uint32_t channels,
    uint32_t pad_top,
    uint32_t pad_bottom,
    uint32_t pad_left,
    uint32_t pad_right,
    uint8_t pad_value    // value written into the border
)
{
    Tensor input = Tensor(input_gm);
    Tensor output = Tensor(output_gm);
    // Bind the input tensor the same way the other kernels in this file do;
    // previously a default-constructed tensor was handed to Compute, so the
    // source image never reached the operator.
    LocalTensor<uint8_t> input_local = input.In<uint8_t>();
    LocalTensor<uint8_t> output_local = output.Out<uint8_t>();

    const uint32_t dst_h = src_h + pad_top + pad_bottom;
    const uint32_t dst_w = src_w + pad_left + pad_right;

    // Configure and run the padding operator.
    Pad<uint8_t> pad_op;
    pad_op.Init(src_h, src_w, channels, pad_top, pad_bottom, pad_left, pad_right, pad_value);
    pad_op.Compute(input_local, output_local);

    // Write the padded image back to global memory.
    DataCopy(output_gm, output_local, dst_h * dst_w * channels);

    input_local.Free();
    output_local.Free();
}
4.4 图像归一化(Normalize)
/**
 * Converts an 8-bit [H, W, C] image to float and normalises it per channel.
 *
 * Every sample is scaled by 1/255 and then mapped to (x - mean[c]) / std[c].
 * The image is processed in tiles of up to 512 pixels; each finished tile is
 * copied to output_gm at the matching float offset.
 */
extern "C" __global__ __aicore__ void image_normalize(
    GM_ADDR input_gm,    // source image: [H, W, C], uint8
    GM_ADDR output_gm,   // normalised image: [H, W, C], float
    GM_ADDR mean_gm,     // per-channel mean: [C]
    GM_ADDR std_gm,      // per-channel standard deviation: [C]
    uint32_t h,
    uint32_t w,
    uint32_t channels
)
{
    Tensor src_tensor = Tensor(input_gm);
    Tensor dst_tensor = Tensor(output_gm);
    Tensor mean_tensor = Tensor(mean_gm);
    Tensor stddev_tensor = Tensor(std_gm);
    LocalTensor<uint8_t> pixels = src_tensor.In<uint8_t>();
    LocalTensor<float> normalized = dst_tensor.Out<float>();
    LocalTensor<float> mean_buf = mean_tensor.In<float>();
    LocalTensor<float> stddev_buf = stddev_tensor.In<float>();

    const uint32_t tile_pixels_max = 512;
    const uint32_t total_pixels = h * w;

    for (uint32_t first = 0; first < total_pixels; first += tile_pixels_max) {
        // The last tile may be shorter than the nominal tile size.
        uint32_t tile_pixels = tile_pixels_max;
        if ((first + tile_pixels_max) > total_pixels) {
            tile_pixels = total_pixels - first;
        }
        const uint32_t tile_elems = tile_pixels * channels;

        // Load this tile of interleaved samples.
        DataCopy(pixels, input_gm + first * channels, tile_elems);

        // Walk the tile one channel at a time so mean/std are fetched once per channel.
        for (uint32_t ch = 0; ch < channels; ch++) {
            const float ch_mean = mean_buf.GetValue(ch);
            const float ch_std = stddev_buf.GetValue(ch);
            for (uint32_t px = 0; px < tile_pixels; px++) {
                const uint32_t idx = px * channels + ch;
                // Scale the 8-bit sample by 1/255 before normalising.
                const float scaled = static_cast<float>(pixels.GetValue(idx)) / 255.0f;
                normalized.SetValue(idx, (scaled - ch_mean) / ch_std);
            }
        }

        // The destination offset is scaled by sizeof(float) because the output holds floats.
        DataCopy(output_gm + first * channels * sizeof(float),
                 normalized,
                 tile_elems);
    }

    pixels.Free();
    normalized.Free();
    mean_buf.Free();
    stddev_buf.Free();
}
五、色彩空间转换示例
5.1 RGB转GRAY(灰度化)
/**
 * Converts an RGB image to single-channel grayscale with the ops-cv RGB2Gray operator.
 *
 * Per the original note, the intended weighting is
 * gray = 0.299*R + 0.587*G + 0.114*B; the arithmetic itself lives in the operator.
 */
extern "C" __global__ __aicore__ void rgb2gray(
    GM_ADDR input_gm,    // source: [H, W, 3], RGB
    GM_ADDR output_gm,   // destination: [H, W], grayscale
    uint32_t h,
    uint32_t w
)
{
    // One output sample per pixel.
    const uint32_t gray_elems = h * w;

    Tensor rgb_tensor = Tensor(input_gm);
    Tensor gray_tensor = Tensor(output_gm);
    LocalTensor<uint8_t> rgb_buf = rgb_tensor.In<uint8_t>();
    LocalTensor<uint8_t> gray_buf = gray_tensor.Out<uint8_t>();

    RGB2Gray<uint8_t> converter;
    converter.Init(h, w);
    converter.Compute(rgb_buf, gray_buf);

    // Write the grayscale plane back to global memory.
    DataCopy(output_gm, gray_buf, gray_elems);

    rgb_buf.Free();
    gray_buf.Free();
}
5.2 RGB转YUV
/**
 * Converts an RGB image to YUV with the ops-cv RGB2YUV operator.
 *
 * Per the original note, the intended transform is:
 *   Y =  0.299*R   + 0.587*G   + 0.114*B
 *   U = -0.14713*R - 0.28886*G + 0.436*B   + 128
 *   V =  0.615*R   - 0.51499*G - 0.10001*B + 128
 * The arithmetic itself lives in the operator.
 */
extern "C" __global__ __aicore__ void rgb2yuv(
    GM_ADDR input_gm,    // source: [H, W, 3], RGB
    GM_ADDR output_gm,   // destination: [H, W, 3], YUV
    uint32_t h,
    uint32_t w
)
{
    // Three output samples per pixel.
    const uint32_t yuv_elems = h * w * 3;

    Tensor rgb_tensor = Tensor(input_gm);
    Tensor yuv_tensor = Tensor(output_gm);
    LocalTensor<uint8_t> rgb_buf = rgb_tensor.In<uint8_t>();
    LocalTensor<uint8_t> yuv_buf = yuv_tensor.Out<uint8_t>();

    RGB2YUV<uint8_t> converter;
    converter.Init(h, w);
    converter.Compute(rgb_buf, yuv_buf);

    // Write the converted image back to global memory.
    DataCopy(output_gm, yuv_buf, yuv_elems);

    rgb_buf.Free();
    yuv_buf.Free();
}
5.3 BGR转RGB
/**
 * Swaps the first and third channel of a 3-channel 8-bit image (BGR -> RGB).
 *
 * The image is processed in tiles of up to 512 pixels: each tile is loaded
 * into a local tensor, the channels are swapped sample by sample, and the
 * tile is written to the same position in output_gm.
 */
extern "C" __global__ __aicore__ void bgr2rgb(
    GM_ADDR input_gm,    // source: [H, W, 3], BGR
    GM_ADDR output_gm,   // destination: [H, W, 3], RGB
    uint32_t h,
    uint32_t w
)
{
    Tensor input = Tensor(input_gm);
    Tensor output = Tensor(output_gm);
    // Obtain the input staging tensor through In<>(), matching image_normalize;
    // previously it was default-constructed before being used as a copy target.
    LocalTensor<uint8_t> input_local = input.In<uint8_t>();
    LocalTensor<uint8_t> output_local = output.Out<uint8_t>();

    const uint32_t block_size = 512;
    const uint32_t total_pixels = h * w;

    for (uint32_t i = 0; i < total_pixels; i += block_size) {
        // The last tile may be shorter than block_size.
        const uint32_t current_size =
            (i + block_size) > total_pixels ? (total_pixels - i) : block_size;

        // Load one tile of BGR samples.
        DataCopy(input_local, input_gm + i * 3, current_size * 3);

        // Swap B and R; G stays in the middle.
        for (uint32_t j = 0; j < current_size; j++) {
            const uint8_t b = input_local.GetValue(j * 3);
            const uint8_t g = input_local.GetValue(j * 3 + 1);
            const uint8_t r = input_local.GetValue(j * 3 + 2);
            output_local.SetValue(j * 3, r);
            output_local.SetValue(j * 3 + 1, g);
            output_local.SetValue(j * 3 + 2, b);
        }

        // Write the RGB tile back to global memory.
        DataCopy(output_gm + i * 3, output_local, current_size * 3);
    }

    input_local.Free();
    output_local.Free();
}
六、几何变换示例
6.1 图像翻转(Flip)
/**
 * Flips an 8-bit [H, W, C] image with the ops-cv Flip operator.
 *
 * The flip direction is selected by mode; the result is copied to output_gm.
 */
extern "C" __global__ __aicore__ void image_flip(
    GM_ADDR input_gm,    // source image: [H, W, C]
    GM_ADDR output_gm,   // flipped image: [H, W, C]
    uint32_t h,
    uint32_t w,
    uint32_t channels,
    FlipMode mode        // HORIZONTAL, VERTICAL, BOTH
)
{
    Tensor input = Tensor(input_gm);
    Tensor output = Tensor(output_gm);
    // Bind the input through In<>() and hand the local tensor to Compute, as
    // every other operator call in this file does. Previously the raw global
    // address was passed while an unused, default-constructed tensor was freed.
    // NOTE(review): assumes Flip::Compute accepts (LocalTensor, LocalTensor) like
    // the sibling operators — confirm against cv_ops.h.
    LocalTensor<uint8_t> input_local = input.In<uint8_t>();
    LocalTensor<uint8_t> output_local = output.Out<uint8_t>();

    Flip<uint8_t> flip_op;
    flip_op.Init(h, w, channels, mode);
    flip_op.Compute(input_local, output_local);

    // Write the flipped image back to global memory.
    DataCopy(output_gm, output_local, h * w * channels);

    input_local.Free();
    output_local.Free();
}
6.2 图像旋转(Rotate)
/**
 * Rotates an 8-bit [H, W, C] image with the ops-cv Rotate operator.
 *
 * The rotation angle, centre and fill value are forwarded to the operator
 * unchanged; the output keeps the source dimensions and is copied to output_gm.
 */
extern "C" __global__ __aicore__ void image_rotate(
    GM_ADDR input_gm,    // source image: [H, W, C]
    GM_ADDR output_gm,   // rotated image: [H, W, C]
    uint32_t h,
    uint32_t w,
    uint32_t channels,
    float angle,         // rotation angle in degrees
    float center_x,      // rotation centre, X
    float center_y,      // rotation centre, Y
    uint8_t fill_value   // fill value
)
{
    // Output has the same element count as the input.
    const uint32_t image_elems = h * w * channels;

    Tensor src_tensor = Tensor(input_gm);
    Tensor dst_tensor = Tensor(output_gm);
    LocalTensor<uint8_t> src_buf = src_tensor.In<uint8_t>();
    LocalTensor<uint8_t> dst_buf = dst_tensor.Out<uint8_t>();

    Rotate<uint8_t> rotator;
    rotator.Init(h, w, channels, angle, center_x, center_y, fill_value);
    rotator.Compute(src_buf, dst_buf);

    // Write the rotated image back to global memory.
    DataCopy(output_gm, dst_buf, image_elems);

    src_buf.Free();
    dst_buf.Free();
}
6.3 仿射变换(Affine)
/**
 * Applies a 2x3 affine transform to an 8-bit image with the ops-cv Affine operator.
 *
 * The matrix is read from matrix_gm as floats and installed on the operator;
 * the dst_h x dst_w result is copied to output_gm.
 */
extern "C" __global__ __aicore__ void affine_transform(
    GM_ADDR input_gm,    // source image: [H, W, C]
    GM_ADDR output_gm,   // transformed image: [new_h, new_w, C]
    GM_ADDR matrix_gm,   // affine matrix: [2, 3]
    uint32_t src_h,
    uint32_t src_w,
    uint32_t dst_h,
    uint32_t dst_w,
    uint32_t channels,
    uint8_t fill_value
)
{
    // Element count of the transformed image, used for the final write-back.
    const uint32_t out_elems = dst_h * dst_w * channels;

    Tensor src_tensor = Tensor(input_gm);
    Tensor dst_tensor = Tensor(output_gm);
    Tensor matrix_tensor = Tensor(matrix_gm);
    LocalTensor<uint8_t> src_buf = src_tensor.In<uint8_t>();
    LocalTensor<uint8_t> dst_buf = dst_tensor.Out<uint8_t>();
    LocalTensor<float> matrix_buf = matrix_tensor.In<float>();

    // Configure geometry first, then install the matrix, then run.
    Affine<uint8_t> warper;
    warper.Init(src_h, src_w, dst_h, dst_w, channels, fill_value);
    warper.SetMatrix(matrix_buf);
    warper.Compute(src_buf, dst_buf);

    // Write the transformed image back to global memory.
    DataCopy(output_gm, dst_buf, out_elems);

    src_buf.Free();
    dst_buf.Free();
    matrix_buf.Free();
}
七、目标检测相关算子示例
7.1 非极大值抑制(NMS)
NMS是目标检测中用于去除重复框的关键算法:
/// One detection candidate: two box corners, a confidence score and a class label.
struct Box {
    float x1;      ///< first corner, X
    float y1;      ///< first corner, Y
    float x2;      ///< second corner, X
    float y2;      ///< second corner, Y
    float score;   ///< detection confidence
    int class_id;  ///< class label
};
extern "C" __global__ __aicore__ void nms(
GM_ADDR boxes_gm, // 输入框: [N, 6] (x1, y1, x2, y2, score, class_id)
GM_ADDR output_gm, // 输出框索引: [M]
GM_ADDR num_output_gm, // 输出框数量
uint32_t num_boxes,
float iou_threshold,
float score_threshold
)
{
Tensor boxes = Tensor(boxes_gm);
Tensor output = Tensor(output_gm);
Tensor num_output = Tensor(num_output_gm);
LocalTensor<Box> boxes_local = boxes.In<Box>();
LocalTensor<int32_t> output_local = output.Out<int32_t>();
LocalTensor<uint8_t> suppressed_local; // 抑制标记
// 初始化抑制标记
Duplicate<uint8_t, uint8_t> dup;
dup.Compute(0, suppressed_local, num_boxes);
// 按score降序排序
Sort<Box, float> sort_op;
sort_op.SortByScore(boxes_local, num_boxes, true);
// 执行NMS
int32_t keep_count = 0;
for (uint32_t i = 0; i < num_boxes; i++) {
if (suppressed_local.GetValue(i) == 1) continue;
Box box_i = boxes_local.GetValue(i);
if (box_i.score < score_threshold) continue;
// 保留当前框
output_local.SetValue(keep_count++, i);
// 计算与后续框的IoU,抑制高IoU的框
for (uint32_t j = i + 1; j < num_boxes; j++) {
if (suppressed_local.GetValue(j) == 1) continue;
Box box_j = boxes_local.GetValue(j);
// 只处理同一类的框
if (box_i.class_id != box_j.class_id) continue;
// 计算IoU
float inter_x1 = fmax(box_i.x1, box_j.x1);
float inter_y1 = fmax(box_i.y1, box_j.y1);
float inter_x2 = fmin(box_i.x2, box_j.x2);
float inter_y2 = fmin(box_i.y2, box_j.y2);
float inter_area = fmax(0.0f, inter_x2 - inter_x1) * fmax(0.0f, inter_y2 - inter_y1);
float box_i_area = (box_i.x2 - box_i.x1) * (box_i.y2 - box_i.y1);
float box_j_area = (box_j.x2 - box_j.x1) * (box_j.y2 - box_j.y1);
float union_area = box_i_area + box_j_area - inter_area;
float iou = (union_area > 0) ? (inter_area / union_area) : 0.0f;
if (iou > iou_threshold) {
suppressed_local.SetValue(j, 1);
}
}
}
// 写回结果
DataCopy(output_gm, output_local, keep_count);
memcpy(num_output_gm, &keep_count, sizeof(int32_t));
boxes_local.Free();
output_local.Free();
suppressed_local.Free();
}
7.2 ROI Align
ROI Align是实例分割和目标检测中的关键算子:
/**
 * Runs the ops-cv ROIAlign operator over a list of regions of interest.
 *
 * Each ROI is five consecutive floats (batch_idx, x1, y1, x2, y2). The
 * operator is invoked once per ROI, and the full output is then copied to
 * output_gm.
 */
extern "C" __global__ __aicore__ void roi_align(
    GM_ADDR feature_gm,      // input feature map: [N, C, H, W]
    GM_ADDR rois_gm,         // ROIs: [M, 5] (batch_idx, x1, y1, x2, y2)
    GM_ADDR output_gm,       // output: [M, C, pooled_h, pooled_w]
    uint32_t batch,
    uint32_t channels,
    uint32_t feature_h,
    uint32_t feature_w,
    uint32_t num_rois,
    uint32_t pooled_h,
    uint32_t pooled_w,
    float spatial_scale,
    uint32_t sampling_ratio
)
{
    Tensor feature_tensor = Tensor(feature_gm);
    Tensor roi_tensor = Tensor(rois_gm);
    Tensor pooled_tensor = Tensor(output_gm);
    LocalTensor<float> feature_buf = feature_tensor.In<float>();
    LocalTensor<float> roi_buf = roi_tensor.In<float>();
    LocalTensor<float> pooled_buf = pooled_tensor.Out<float>();

    ROIAlign<float> aligner;
    aligner.Init(batch, channels, feature_h, feature_w,
                 pooled_h, pooled_w, spatial_scale, sampling_ratio);

    for (uint32_t r = 0; r < num_rois; r++) {
        // Five floats per ROI; the first is the batch index stored as a float.
        const uint32_t base = r * 5;
        uint32_t batch_idx = static_cast<uint32_t>(roi_buf.GetValue(base));
        float x1 = roi_buf.GetValue(base + 1);
        float y1 = roi_buf.GetValue(base + 2);
        float x2 = roi_buf.GetValue(base + 3);
        float y2 = roi_buf.GetValue(base + 4);

        // The ROI index is passed so the operator can address this ROI's output.
        aligner.Compute(feature_buf, batch_idx, x1, y1, x2, y2,
                        pooled_buf, r);
    }

    // Write all pooled features back to global memory.
    DataCopy(output_gm, pooled_buf, num_rois * channels * pooled_h * pooled_w);

    feature_buf.Free();
    roi_buf.Free();
    pooled_buf.Free();
}
八、图像滤波示例
8.1 高斯模糊(Gaussian Blur)
/**
 * Blurs an 8-bit [H, W, C] image with the ops-cv GaussianBlur operator.
 *
 * The operator is initialised with kernel_size and sigma and additionally
 * given the float kernel read from kernel_gm; the result is copied to output_gm.
 */
extern "C" __global__ __aicore__ void gaussian_blur(
    GM_ADDR input_gm,    // source image: [H, W, C]
    GM_ADDR output_gm,   // blurred image: [H, W, C]
    GM_ADDR kernel_gm,   // Gaussian kernel
    uint32_t h,
    uint32_t w,
    uint32_t channels,
    uint32_t kernel_size,
    float sigma
)
{
    // Output has the same element count as the input.
    const uint32_t image_elems = h * w * channels;

    Tensor src_tensor = Tensor(input_gm);
    Tensor dst_tensor = Tensor(output_gm);
    Tensor kernel_tensor = Tensor(kernel_gm);
    LocalTensor<uint8_t> src_buf = src_tensor.In<uint8_t>();
    LocalTensor<uint8_t> dst_buf = dst_tensor.Out<uint8_t>();
    LocalTensor<float> kernel_buf = kernel_tensor.In<float>();

    // Configure geometry first, then install the kernel, then run.
    GaussianBlur<uint8_t> blur;
    blur.Init(h, w, channels, kernel_size, sigma);
    blur.SetKernel(kernel_buf);
    blur.Compute(src_buf, dst_buf);

    // Write the blurred image back to global memory.
    DataCopy(output_gm, dst_buf, image_elems);

    src_buf.Free();
    dst_buf.Free();
    kernel_buf.Free();
}
8.2 Sobel边缘检测
/**
 * Runs the ops-cv Sobel operator on a single-channel 8-bit image.
 *
 * The output has the same H x W extent as the input and is copied to output_gm.
 */
extern "C" __global__ __aicore__ void sobel_edge_detection(
    GM_ADDR input_gm,    // source grayscale image: [H, W]
    GM_ADDR output_gm,   // gradient output: [H, W]
    uint32_t h,
    uint32_t w
)
{
    // One output sample per pixel.
    const uint32_t plane_elems = h * w;

    Tensor gray_tensor = Tensor(input_gm);
    Tensor edge_tensor = Tensor(output_gm);
    LocalTensor<uint8_t> gray_buf = gray_tensor.In<uint8_t>();
    LocalTensor<uint8_t> edge_buf = edge_tensor.Out<uint8_t>();

    Sobel<uint8_t> sobel;
    sobel.Init(h, w);
    sobel.Compute(gray_buf, edge_buf);

    // Write the gradient image back to global memory.
    DataCopy(output_gm, edge_buf, plane_elems);

    gray_buf.Free();
    edge_buf.Free();
}
九、性能优化建议
- 合理选择插值算法:在Resize时,Bilinear通常能在速度与画质之间取得较好的平衡;追求速度可选Nearest,追求画质可选Bicubic
- 批量处理:对于多张图像,尽量使用批量处理以提高效率
- 数据对齐:确保数据起始地址满足对齐要求(如512字节对齐)
- 原地计算:对于支持原地计算的算子,优先使用以减少内存拷贝
- 使用零拷贝:在数据传递时尽量使用指针而非数据拷贝
十、应用场景
ops-cv算子库广泛应用于以下场景:
| 场景 | 描述 | 推荐算子 |
|---|---|---|
| 图像分类 | 预处理和后处理 | Resize, Normalize, Crop |
| 目标检测 | 检测框处理 | NMS, ROIAlign, Resize |
| 语义分割 | 图像预处理 | Resize, Normalize, Pad |
| 实例分割 | ROI特征提取 | ROIAlign, Mask |
| 图像增强 | 数据增强 | Flip, Rotate, Affine, ColorJitter |
| OCR | 文本图像处理 | Gray, Threshold, Morphology |
十一、总结
ops-cv作为CANN生态系统中专门针对计算机视觉任务的算子库,为图像处理和目标检测应用提供了丰富的算子支持。通过本文的介绍和示例代码,开发者可以全面了解ops-cv的功能特性,并掌握其在实际项目中的应用方法。无论是基础的图像预处理还是复杂的目标检测后处理,ops-cv都提供了经过深度优化的高性能实现。
相关链接:
- CANN组织链接:https://atomgit.com/cann
- ops-cv仓库链接:https://atomgit.com/cann/ops-cv
昇腾计算产业是基于昇腾系列(HUAWEI Ascend)处理器和基础软件构建的全栈AI计算基础设施、行业应用及服务,包括昇腾系列处理器、系列硬件、CANN、AI计算框架、应用使能、开发工具链、管理运维工具、行业应用及服务等全产业链。(https://devpress.csdn.net/organization/setting/general/146749)
更多推荐


所有评论(0)