2021-10-17
本文聊聊 OpenCV-Python 的“工程深水区”:特征点匹配、单应性矩阵、相机标定、镜头去畸变、光流跟踪、DNN 推理和性能排查。
这些能力更适合解决下面这类问题:
如果你要在大图里找一张小图,或者判断两张图片是不是拍到了同一个平面物体,特征点匹配很有用。
基本流程:
提取关键点和描述子
匹配描述子
过滤差匹配
估计单应性矩阵
把目标区域映射到大图
用 ORB 做一个完整例子:
from pathlib import Path
import cv2
import numpy as np
def locate_planar_object(template_path: str, scene_path: str, output_path: str) -> None:
    """Locate a planar template inside a scene image and save an annotated copy.

    ORB descriptors from both images are matched with a brute-force Hamming
    matcher, filtered with a ratio test, and passed to a RANSAC homography
    estimate. The template outline is then projected through the homography
    and drawn on the scene as a green polygon.

    Args:
        template_path: Image of the planar object to look for.
        scene_path: Image that is searched for the object.
        output_path: Where the annotated scene image is written.

    Raises:
        FileNotFoundError: If either input image cannot be read.
        RuntimeError: If descriptors are missing, fewer than 8 matches pass
            the ratio test, or no homography can be estimated.
        OSError: If the annotated image cannot be written.
    """
    template = cv2.imread(template_path, cv2.IMREAD_GRAYSCALE)
    if template is None:
        raise FileNotFoundError(template_path)
    scene = cv2.imread(scene_path, cv2.IMREAD_GRAYSCALE)
    if scene is None:
        raise FileNotFoundError(scene_path)

    orb = cv2.ORB_create(nfeatures=2000)
    kp1, des1 = orb.detectAndCompute(template, None)
    kp2, des2 = orb.detectAndCompute(scene, None)
    if des1 is None or des2 is None:
        raise RuntimeError("not enough descriptors")

    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)
    pairs = matcher.knnMatch(des1, des2, k=2)

    # Ratio test: keep a match only when it is clearly better than the
    # runner-up. knnMatch can return fewer than two neighbours; skip those.
    good = [
        pair[0]
        for pair in pairs
        if len(pair) == 2 and pair[0].distance < 0.75 * pair[1].distance
    ]
    if len(good) < 8:
        raise RuntimeError("not enough good matches")

    src_points = np.float32([kp1[m.queryIdx].pt for m in good]).reshape(-1, 1, 2)
    dst_points = np.float32([kp2[m.trainIdx].pt for m in good]).reshape(-1, 1, 2)

    # The inlier mask returned alongside the matrix is not needed here.
    matrix, _ = cv2.findHomography(src_points, dst_points, cv2.RANSAC, 5.0)
    if matrix is None:
        raise RuntimeError("homography failed")

    # Project the four template corners into scene coordinates.
    h, w = template.shape[:2]
    corners = np.float32([
        [0, 0],
        [w, 0],
        [w, h],
        [0, h],
    ]).reshape(-1, 1, 2)
    projected = cv2.perspectiveTransform(corners, matrix)

    # Matching ran on grayscale; reload in colour so the outline is visible.
    scene_color = cv2.imread(scene_path)
    output = cv2.polylines(
        scene_color,
        [np.int32(projected)],
        isClosed=True,
        color=(0, 255, 0),
        thickness=3,
    )

    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(output_path, output):
        raise OSError(f"failed to write image: {output_path}")


locate_planar_object("template.jpg", "scene.jpg", "matched.jpg")
这段代码里最关键的是 findHomography(..., cv2.RANSAC, ...)。真实图片里总会有错配点,RANSAC 能把明显不靠谱的点过滤掉,留下更可信的几何关系。
OpenCV 里常见特征有 ORB、SIFT、AKAZE。
ORB:
# ORB detector capped at 2000 features; its binary descriptors are compared
# with the Hamming norm.
detector = cv2.ORB_create(nfeatures=2000)
norm = cv2.NORM_HAMMING
特点是快,描述子是二进制,适合对速度敏感的场景。
SIFT:
# SIFT detector with default settings; its descriptors are compared with the
# L2 norm.
detector = cv2.SIFT_create()
norm = cv2.NORM_L2
对尺度、旋转更稳,代价通常更高。
AKAZE:
# AKAZE detector with default settings; matched here with the Hamming norm.
detector = cv2.AKAZE_create()
norm = cv2.NORM_HAMMING
在一些纹理不算丰富的场景里表现不错。
选择建议:
先用 ORB 做 baseline
匹配不稳再试 SIFT 或 AKAZE
纹理太少时别硬上特征匹配
平面目标更适合 homography
非刚体目标要换思路
findHomography 不只是能定位平面目标,也能做透视矫正。
假设你已经有四个角点,可以把票据、屏幕、卡片拉成正视图:
import cv2
import numpy as np
def rectify_quad(
    image_path: str,
    points: list[tuple[float, float]],
    output_path: str,
    output_size: tuple[int, int] = (800, 500),
) -> None:
    """Warp a quadrilateral region of an image into an upright rectangle.

    Args:
        image_path: Source image to read.
        points: Four ``(x, y)`` corners in the source image. They are mapped,
            in order, to the top-left, top-right, bottom-right and
            bottom-left corners of the output.
        output_path: Where the rectified image is written.
        output_size: ``(width, height)`` of the rectified image in pixels.
            Defaults to ``(800, 500)``.

    Raises:
        FileNotFoundError: If the source image cannot be read.
        ValueError: If ``points`` is not exactly four ``(x, y)`` pairs.
        OSError: If the rectified image cannot be written.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(image_path)

    src = np.float32(points)
    # getPerspectiveTransform needs exactly four point correspondences.
    if src.shape != (4, 2):
        raise ValueError(f"expected 4 (x, y) points, got shape {src.shape}")

    width, height = output_size
    dst = np.float32([
        [0, 0],
        [width, 0],
        [width, height],
        [0, height],
    ])

    matrix = cv2.getPerspectiveTransform(src, dst)
    warped = cv2.warpPerspective(image, matrix, (width, height))

    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(output_path, warped):
        raise OSError(f"failed to write image: {output_path}")


rectify_quad(
    "receipt.jpg",
    [(120, 80), (720, 110), (760, 560), (90, 530)],
    "receipt_rectified.jpg",
)
这类操作常见于文档扫描、票据识别、屏幕内容提取。难点通常不是变换本身,而是如何稳定拿到四个角点。
很多摄像头拍出来的图像边缘会弯,直线看起来像弧线。相机标定要估计两类东西:
camera matrix:焦距和主点
distortion coefficients:镜头畸变参数
标定常用棋盘格。OpenCV 提供 findChessboardCorners、cornerSubPix、calibrateCamera、undistort 这一整套工具。
from pathlib import Path
import cv2
import numpy as np
def calibrate_camera(image_dir: str, pattern_size: tuple[int, int]):
    """Estimate camera intrinsics from chessboard photos in a directory.

    Args:
        image_dir: Directory scanned (non-recursively) for ``*.jpg`` files.
        pattern_size: Chessboard corner grid as ``(columns, rows)``, passed
            unchanged to ``cv2.findChessboardCorners``.

    Returns:
        A dict with keys ``"rms"``, ``"camera_matrix"`` and ``"distortion"``
        taken from ``cv2.calibrateCamera``.

    Raises:
        RuntimeError: If no image yields a detected chessboard.
        ValueError: If the usable images do not all share one resolution.
    """
    cols, rows = pattern_size

    # Corner coordinates on the board plane (z = 0), expressed as grid indices.
    object_points_template = np.zeros((rows * cols, 3), np.float32)
    object_points_template[:, :2] = np.mgrid[0:cols, 0:rows].T.reshape(-1, 2)

    # Loop-invariant: stop sub-pixel refinement after 30 iterations or once
    # the update falls below 0.001.
    criteria = (
        cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER,
        30,
        0.001,
    )

    object_points = []
    image_points = []
    image_size = None

    # Sorted so the input order does not depend on directory listing order.
    for path in sorted(Path(image_dir).glob("*.jpg")):
        image = cv2.imread(str(path))
        if image is None:
            continue

        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        ok, corners = cv2.findChessboardCorners(gray, pattern_size)
        if not ok:
            continue

        # calibrateCamera takes a single image size, so every image that
        # contributes corners must have the same (width, height).
        size = gray.shape[::-1]
        if image_size is None:
            image_size = size
        elif size != image_size:
            raise ValueError(
                f"{path}: image size {size} differs from {image_size}"
            )

        refined = cv2.cornerSubPix(
            gray,
            corners,
            winSize=(11, 11),
            zeroZone=(-1, -1),
            criteria=criteria,
        )
        object_points.append(object_points_template)
        image_points.append(refined)

    if not object_points or image_size is None:
        raise RuntimeError("no valid chessboard images")

    rms, camera_matrix, distortion, rvecs, tvecs = cv2.calibrateCamera(
        object_points,
        image_points,
        image_size,
        None,
        None,
    )
    return {
        "rms": rms,
        "camera_matrix": camera_matrix,
        "distortion": distortion,
    }
使用标定结果去畸变:
def undistort_image(image_path: str, output_path: str, camera_matrix, distortion) -> None:
    """Remove lens distortion from an image using calibration results.

    Args:
        image_path: Distorted input image.
        output_path: Where the undistorted image is written.
        camera_matrix: Camera matrix, e.g. from ``calibrate_camera``.
        distortion: Distortion coefficients, e.g. from ``calibrate_camera``.

    Raises:
        FileNotFoundError: If the input image cannot be read.
        OSError: If the undistorted image cannot be written.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(image_path)

    h, w = image.shape[:2]
    # alpha = 1 asks for a new camera matrix that retains all source pixels;
    # `roi` outlines the region of valid pixels in the result.
    new_matrix, roi = cv2.getOptimalNewCameraMatrix(
        camera_matrix,
        distortion,
        (w, h),
        1,
        (w, h),
    )
    undistorted = cv2.undistort(image, camera_matrix, distortion, None, new_matrix)

    x, y, rw, rh = roi
    # The ROI may come back empty (zero width or height); cropping to it
    # would leave nothing to write, so keep the full frame in that case.
    if rw > 0 and rh > 0:
        undistorted = undistorted[y:y + rh, x:x + rw]

    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(output_path, undistorted):
        raise OSError(f"failed to write image: {output_path}")
标定质量很依赖输入图片。棋盘格要覆盖不同位置和角度,角点检测失败的图片不要硬塞进去。
光流描述相邻帧中像素或特征点的运动。常见两类:
稀疏光流:跟踪少量关键点
稠密光流:估计整张图的运动场
稀疏 Lucas-Kanade 光流:
import cv2
import numpy as np
# Sparse Lucas-Kanade optical flow demo: track corners through "input.mp4"
# and draw their trails until the video ends or "q" is pressed.
cap = cv2.VideoCapture("input.mp4")

# Shared by the initial detection and by re-detection after all tracks are lost.
feature_params = dict(maxCorners=200, qualityLevel=0.01, minDistance=8)

try:
    ok, first = cap.read()
    if not ok:
        raise RuntimeError("cannot read first frame")

    prev_gray = cv2.cvtColor(first, cv2.COLOR_BGR2GRAY)
    points = cv2.goodFeaturesToTrack(prev_gray, **feature_params)
    # goodFeaturesToTrack yields None when no corner passes the quality
    # threshold; calcOpticalFlowPyrLK cannot take that as input.
    if points is None:
        raise RuntimeError("no trackable features in first frame")

    # Persistent overlay that accumulates the motion trails across frames.
    track_layer = np.zeros_like(first)

    while True:
        ok, frame = cap.read()
        if not ok:
            break

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        next_points, status, _error = cv2.calcOpticalFlowPyrLK(
            prev_gray,
            gray,
            points,
            None,
        )
        if next_points is None:
            break

        # status == 1 marks the points whose flow was found.
        tracked = status == 1
        good_new = next_points[tracked]
        good_old = points[tracked]

        for new, old in zip(good_new, good_old):
            x1, y1 = new.ravel()
            x0, y0 = old.ravel()
            cv2.line(track_layer, (int(x0), int(y0)), (int(x1), int(y1)), (0, 255, 0), 2)
            cv2.circle(frame, (int(x1), int(y1)), 3, (0, 255, 0), -1)

        output = cv2.add(frame, track_layer)
        cv2.imshow("sparse flow", output)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

        prev_gray = gray
        points = good_new.reshape(-1, 1, 2)
        if len(points) == 0:
            # Every track was lost: start again from fresh corners on the
            # current frame rather than calling LK with no points.
            points = cv2.goodFeaturesToTrack(gray, **feature_params)
            if points is None:
                break
finally:
    # Release the capture and close windows even when an error aborts the loop.
    cap.release()
    cv2.destroyAllWindows()
稀疏光流适合跟踪角点、纹理点、运动轨迹。点丢失后可以周期性重新检测关键点。
Farneback 光流会给每个像素估计一个运动向量。
import cv2
import numpy as np
def flow_to_bgr(flow):
    """Render a dense flow field as a BGR image.

    The flow direction selects the hue, the flow magnitude (min-max
    normalised to 0-255) selects the brightness, and saturation is fixed at
    its maximum.

    Args:
        flow: Array whose last axis holds the x and y flow components at
            indices 0 and 1.

    Returns:
        The colour-coded flow as produced by the HSV-to-BGR conversion.
    """
    dx = flow[..., 0]
    dy = flow[..., 1]
    magnitude, angle = cv2.cartToPolar(dx, dy)

    height, width = flow.shape[0], flow.shape[1]
    hsv = np.zeros((height, width, 3), dtype=np.uint8)

    # Hue: the angle in radians converted to degrees and halved, which
    # keeps it below 180 (a full turn maps to 0-180).
    hsv[..., 0] = angle * 180 / np.pi / 2
    # Saturation: always full.
    hsv[..., 1] = 255
    # Value: magnitude stretched to the 0-255 range.
    hsv[..., 2] = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX)

    bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
    return bgr
# Dense Farneback optical flow demo: colour-code the per-pixel motion between
# consecutive frames of "input.mp4" until the video ends or "q" is pressed.

# Farneback settings kept in one place so the call in the loop stays short.
farneback_params = dict(
    pyr_scale=0.5,
    levels=3,
    winsize=15,
    iterations=3,
    poly_n=5,
    poly_sigma=1.2,
    flags=0,
)

cap = cv2.VideoCapture("input.mp4")

ok, first = cap.read()
if not ok:
    raise RuntimeError("cannot read first frame")

prev_gray = cv2.cvtColor(first, cv2.COLOR_BGR2GRAY)

while True:
    ok, frame = cap.read()
    if not ok:
        break

    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, None, **farneback_params)

    flow_view = flow_to_bgr(flow)
    cv2.imshow("dense flow", flow_view)

    pressed_key = cv2.waitKey(1) & 0xFF
    if pressed_key == ord("q"):
        break

    # The current frame becomes the reference for the next iteration.
    prev_gray = gray

cap.release()
cv2.destroyAllWindows()
稠密光流更重,但能看到整体运动趋势,适合运动分析、背景变化观察、画面稳定前的运动估计。
如果每一帧都跑重模型太贵,可以先检测一次目标,再用 tracker 跟踪。
import cv2
# Single-object tracking demo: pick a target on the first frame of
# "input.mp4", then follow it with a CSRT tracker until the video ends or "q"
# is pressed.
cap = cv2.VideoCapture("input.mp4")

try:
    ok, frame = cap.read()
    if not ok:
        raise RuntimeError("cannot read first frame")

    roi = cv2.selectROI("select target", frame, fromCenter=False, showCrosshair=True)
    cv2.destroyWindow("select target")

    # A cancelled selection comes back as a zero-size rectangle, which the
    # tracker cannot be initialised with.
    if roi[2] <= 0 or roi[3] <= 0:
        raise RuntimeError("no target selected")

    tracker = cv2.TrackerCSRT_create()
    tracker.init(frame, roi)

    while True:
        ok, frame = cap.read()
        if not ok:
            break

        tracked, box = tracker.update(frame)
        if tracked:
            x, y, w, h = [int(v) for v in box]
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)
        else:
            # Tell the viewer the target is currently lost.
            cv2.putText(
                frame,
                "lost",
                (40, 60),
                cv2.FONT_HERSHEY_SIMPLEX,
                1.0,
                (0, 0, 255),
                2,
            )

        cv2.imshow("tracker", frame)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release the capture and close windows even when an error aborts the run.
    cap.release()
    cv2.destroyAllWindows()
TrackerCSRT_create 通常比较稳,但计算开销不算小,速度未必最快。不同 OpenCV 安装包和版本里 tracker API 的位置可能略有差异,如果报属性不存在,可以检查是否安装了 contrib 包(opencv-contrib-python)。
OpenCV 的 dnn 模块可以加载 ONNX、Caffe、TensorFlow 等格式。它适合做轻量推理或模型部署前验证。
一个通用 ONNX 分类模型推理骨架:
import cv2
import numpy as np
def classify_onnx(image_path: str, model_path: str) -> np.ndarray:
    """Run one image through an ONNX classification model with OpenCV DNN.

    The image is resized to 224x224, converted from BGR to RGB,
    mean-subtracted and scaled by 1/255 before inference.

    Args:
        image_path: Image to classify.
        model_path: Path to the ONNX model file.

    Returns:
        The raw network output as returned by ``net.forward()``.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(image_path)

    net = cv2.dnn.readNetFromONNX(model_path)

    # blobFromImage computes (pixel - mean) * scalefactor, so the mean has to
    # be given on the 0-255 pixel scale. With swapRB=True it is in R, G, B
    # order.
    mean_rgb = (0.485 * 255.0, 0.456 * 255.0, 0.406 * 255.0)

    # NOTE(review): no per-channel std division is applied; add it after this
    # call if the model was trained with one.
    blob = cv2.dnn.blobFromImage(
        image,
        scalefactor=1.0 / 255.0,
        size=(224, 224),
        mean=mean_rgb,
        swapRB=True,
        crop=False,
    )

    net.setInput(blob)
    output = net.forward()
    return output
注意 swapRB=True。OpenCV 读图是 BGR,而很多模型训练时使用 RGB。这个参数一旦设错,推理结果很可能就不正常。
目标检测模型通常会输出很多候选框,需要非极大值抑制。
import cv2
import numpy as np

# Candidate boxes as [x, y, width, height]; the first two overlap heavily.
boxes = [
    [100, 80, 120, 160],
    [108, 86, 118, 152],
    [360, 200, 90, 120],
]
scores = [0.92, 0.85, 0.78]

indices = cv2.dnn.NMSBoxes(
    bboxes=boxes,
    scores=scores,
    score_threshold=0.5,
    nms_threshold=0.4,
)

# Depending on the OpenCV version the result is a flat array, an (N, 1) array,
# or an empty tuple. Flattening handles all three without converting a
# 1-element array to a scalar.
for index in np.asarray(indices, dtype=int).reshape(-1):
    i = int(index)
    x, y, w, h = boxes[i]
    print(x, y, w, h, scores[i])
不同模型输出格式差异很大。后处理一定要先看清楚模型输出维度、坐标格式、置信度位置和类别位置。
OpenCV 性能优化的顺序别搞反。
先问这些问题:瓶颈到底在哪一步,有没有实际测过耗时?代码里有没有逐像素的 Python 循环?能不能交给 NumPy 或 OpenCV 的内置函数来做?
一个常见坏例子:逐像素 Python 循环。
# Deliberately slow reference version: every pixel is visited from Python.
height, width = image.shape[0], image.shape[1]
for row in range(height):
    for col in range(width):
        # Mark the pixel when channel 1 exceeds the threshold.
        if image[row, col, 1] > 120:
            mask[row, col] = 255
改成 NumPy:
# Vectorised threshold: compare the whole channel at once, then turn the
# boolean result into a 0/255 uint8 mask.
channel_1 = image[:, :, 1]
above_threshold = channel_1 > 120
mask = above_threshold.astype("uint8") * 255
能交给 OpenCV 的就交给 OpenCV:
# cv2.inRange bounds are inclusive, so a lower bound of 121 on channel 1
# reproduces the strict `> 120` test for 8-bit images. The other two channels
# accept the full 0-255 range.
mask = cv2.inRange(image, (0, 121, 0), (255, 255, 255))
这类改法比微调几个参数更有效。
OpenCV 有 UMat、OpenCL、DNN backend/target 等能力,但不是打开就一定更快。
DNN 可以设置后端:
# Load the ONNX model into an OpenCV DNN network.
net = cv2.dnn.readNetFromONNX("model.onnx")
# Request OpenCV's own inference backend.
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
# Request execution on the CPU.
net.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)
如果环境支持其他后端,也可以按部署环境尝试。但要用真实输入测端到端耗时,不要只看单算子。
有些小图、小操作上,数据搬运成本可能比计算更贵。
把处理步骤封装成类,后面更容易插拔。
from dataclasses import dataclass
import cv2
import numpy as np
@dataclass
class MatchResult:
ok: bool
polygon: np.ndarray | None
matches: int
class PlanarMatcher:
    """Locate one planar template in many frames with ORB and a homography.

    The template's keypoints and descriptors are computed once in the
    constructor and reused for every call to ``match``.
    """

    def __init__(self, template_path: str):
        """Load the template in grayscale and pre-compute its ORB features.

        Raises:
            FileNotFoundError: If the template image cannot be read.
        """
        self.template = cv2.imread(template_path, cv2.IMREAD_GRAYSCALE)
        if self.template is None:
            raise FileNotFoundError(template_path)

        self.detector = cv2.ORB_create(nfeatures=2000)
        template_features = self.detector.detectAndCompute(self.template, None)
        self.kp_template, self.des_template = template_features
        self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING)

    def match(self, frame_bgr) -> MatchResult:
        """Search a BGR frame for the template.

        Returns:
            A ``MatchResult``. ``ok`` is False when descriptors are missing,
            fewer than 8 matches pass the ratio test, or no homography is
            found; otherwise ``polygon`` holds the projected template outline.
        """
        gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
        kp_scene, des_scene = self.detector.detectAndCompute(gray, None)
        if self.des_template is None or des_scene is None:
            return MatchResult(False, None, 0)

        candidates = self.matcher.knnMatch(self.des_template, des_scene, k=2)

        # Ratio test: keep the best neighbour only when it clearly beats the
        # second best. Entries with fewer than two neighbours are skipped.
        good = [
            pair[0]
            for pair in candidates
            if len(pair) == 2 and pair[0].distance < 0.75 * pair[1].distance
        ]
        match_count = len(good)
        if match_count < 8:
            return MatchResult(False, None, match_count)

        template_pts = [self.kp_template[m.queryIdx].pt for m in good]
        scene_pts = [kp_scene[m.trainIdx].pt for m in good]
        src = np.float32(template_pts).reshape(-1, 1, 2)
        dst = np.float32(scene_pts).reshape(-1, 1, 2)

        homography, _ = cv2.findHomography(src, dst, cv2.RANSAC, 5.0)
        if homography is None:
            return MatchResult(False, None, match_count)

        # Project the template's four corners into frame coordinates.
        height, width = self.template.shape[:2]
        outline = np.float32(
            [[0, 0], [width, 0], [width, height], [0, height]]
        ).reshape(-1, 1, 2)
        projected = cv2.perspectiveTransform(outline, homography)
        return MatchResult(True, projected, match_count)
使用:
# Example: locate the template in one scene image and save the result.
matcher = PlanarMatcher("template.jpg")

frame = cv2.imread("scene.jpg")
# cv2.imread signals an unreadable file by returning None; fail here with a
# clear error instead of inside match().
if frame is None:
    raise FileNotFoundError("scene.jpg")

result = matcher.match(frame)
if result.ok and result.polygon is not None:
    cv2.polylines(frame, [np.int32(result.polygon)], True, (0, 255, 0), 3)

# Written whether or not an outline was drawn.
cv2.imwrite("pipeline_result.jpg", frame)
这种写法比一大段脚本更适合放到服务或批处理里。
别只看最终输出。把关键中间结果保存出来:
# Dump intermediate results for inspection.
# NOTE(review): keypoint_view, match_view, mask and flow_view are assumed to
# have been produced by earlier pipeline stages; they are not defined here.
cv2.imwrite("debug_keypoints.jpg", keypoint_view)
cv2.imwrite("debug_matches.jpg", match_view)
cv2.imwrite("debug_mask.jpg", mask)
cv2.imwrite("debug_flow.jpg", flow_view)
特征匹配要看匹配线。
相机标定要看角点是否找准。
光流要看点是否漂移。
DNN 要看预处理后的输入尺寸、通道顺序和归一化方式。
视觉问题很多时候不是算法错,而是输入没处理成算法期待的样子。