# 计算机视觉指南

## 数据增强

```python
import torchvision.transforms as T
from torchvision.transforms import v2
import albumentations as A
# `import albumentations` does not load the `pytorch` submodule, so
# ToTensorV2 has to be imported explicitly.
from albumentations.pytorch import ToTensorV2

# torchvision augmentation
train_transform = T.Compose([
    T.Resize((256, 256)),
    T.RandomCrop(224),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(15),
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Albumentations (more powerful)
# NOTE(review): newer albumentations releases expect
# RandomResizedCrop(size=(224, 224)) instead of separate height/width
# arguments - confirm against the installed version.
train_transform = A.Compose([
    A.RandomResizedCrop(224, 224),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=15),
    A.OneOf([
        A.GaussNoise(),
        A.GaussianBlur(),
        A.MotionBlur(),
    ], p=0.3),
    A.Normalize(),
    ToTensorV2()
])
```

## 图像分类

### 预训练模型

```python
import torch.nn as nn
import torchvision.models as models
from torchvision.models import ResNet50_Weights

# Load a pretrained model
model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)

# Replace the classification head
num_classes = 10
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Freeze the feature extractor, then re-enable gradients for the new head only
for param in model.parameters():
    param.requires_grad = False
for param in model.fc.parameters():
    param.requires_grad = True

# EfficientNet
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights
model = efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)

# Vision Transformer
from torchvision.models import vit_b_16, ViT_B_16_Weights
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
model.heads.head = nn.Linear(model.heads.head.in_features, num_classes)
```

### timm 库

```python
import timm

# List available models
timm.list_models('resnet*')

# Load a model
model = timm.create_model('resnet50', pretrained=True, num_classes=10)

# Inspect the model configuration
model.default_cfg
```

## 目标检测

### YOLOv8

```python
from ultralytics import YOLO

# Load a pretrained model
model = YOLO('yolov8n.pt')  # n, s, m, l, x

# Inference
results = model('image.jpg')

for result in results:
    boxes = result.boxes
    for box in boxes:
        x1, y1, x2, y2 = box.xyxy[0]
        conf = box.conf[0]
        cls = box.cls[0]
        print(f"类别: {cls}, 置信度: {conf:.2f}, 边界框: ({x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f})")

# Train on custom data
model.train(
    data='dataset.yaml',
    epochs=100,
    imgsz=640,
    batch=16,
    device=0
)

# Export
model.export(format='onnx')
```

### dataset.yaml 格式

```yaml
path: /path/to/dataset
train: images/train
val: images/val

names:
  0: cat
  1: dog
  2: bird
```

## 图像分割

### 语义分割

```python
import torch
import torch.nn as nn
from torchvision.models.segmentation import (
    DeepLabV3_ResNet50_Weights,
    deeplabv3_resnet50,
)

# `pretrained=True` is deprecated in torchvision; use the weights enum,
# as the classification examples in this guide do.
# `num_classes` and `image` are expected to be defined by the caller.
model = deeplabv3_resnet50(weights=DeepLabV3_ResNet50_Weights.DEFAULT)
model.classifier[4] = nn.Conv2d(256, num_classes, kernel_size=1)
# NOTE(review): only the main head is replaced here; if the auxiliary
# head is used for a training loss it needs the same treatment - confirm.

# Inference
model.eval()
with torch.no_grad():
    output = model(image)['out']
    pred = output.argmax(1)
```

### U-Net

```python
import torch
import torch.nn as nn


class UNet(nn.Module):
    """U-Net style encoder-decoder with four down/up-sampling stages.

    The encoder halves the spatial size four times with 2x2 max pooling and
    the decoder doubles it four times with stride-2 transposed convolutions,
    concatenating the matching encoder feature map at every stage. Input
    height and width should therefore be divisible by 16; otherwise the
    upsampled maps and the skip connections will not line up in `torch.cat`.

    Args:
        in_channels: Number of channels of the input image.
        num_classes: Number of output channels of the final 1x1 convolution.
    """

    def __init__(self, in_channels=3, num_classes=1):
        super().__init__()

        # Encoder
        self.enc1 = self._block(in_channels, 64)
        self.enc2 = self._block(64, 128)
        self.enc3 = self._block(128, 256)
        self.enc4 = self._block(256, 512)

        self.pool = nn.MaxPool2d(2)

        # Bottleneck
        self.bottleneck = self._block(512, 1024)

        # Decoder: each dec block takes the upsampled map concatenated with
        # the skip connection, hence twice the upconv output channels.
        self.upconv4 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
        self.dec4 = self._block(1024, 512)
        self.upconv3 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec3 = self._block(512, 256)
        self.upconv2 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2 = self._block(256, 128)
        self.upconv1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec1 = self._block(128, 64)

        self.conv = nn.Conv2d(64, num_classes, 1)

    def _block(self, in_ch, out_ch):
        """Return two 3x3 conv + BatchNorm + ReLU layers (size-preserving)."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        """Run the network and return the raw (un-activated) output map."""
        # Encoder
        e1 = self.enc1(x)
        e2 = self.enc2(self.pool(e1))
        e3 = self.enc3(self.pool(e2))
        e4 = self.enc4(self.pool(e3))

        # Bottleneck
        b = self.bottleneck(self.pool(e4))

        # Decoder: upsample, concatenate the skip connection on the channel
        # axis, then refine.
        d4 = self.dec4(torch.cat([self.upconv4(b), e4], dim=1))
        d3 = self.dec3(torch.cat([self.upconv3(d4), e3], dim=1))
        d2 = self.dec2(torch.cat([self.upconv2(d3), e2], dim=1))
        d1 = self.dec1(torch.cat([self.upconv1(d2), e1], dim=1))

        return self.conv(d1)
```

## OCR

```python
from paddleocr import PaddleOCR

# Initialise
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Recognise
result = ocr.ocr('image.jpg', cls=True)
# PaddleOCR may return None for an image in which no text was detected,
# so fall back to an empty list instead of iterating over None.
for line in result[0] or []:
    bbox = line[0]
    text = line[1][0]
    confidence = line[1][1]
    print(f"文本: {text}, 置信度: {confidence:.2f}")
```

## 评估指标

```python
import numpy as np

# Classification
from sklearn.metrics import accuracy_score, classification_report

# Detection mAP
# Uses the COCO API
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO('annotations.json')
coco_dt = coco_gt.loadRes('predictions.json')
coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()

# Segmentation IoU
def iou_score(pred, target, num_classes):
    """Return the mean IoU over class ids 0..num_classes-1.

    Args:
        pred: Array of predicted class ids.
        target: Array of ground-truth class ids, same shape as `pred`.
        num_classes: Number of classes to average over.

    Returns:
        The mean of the per-class IoU values. A class that appears in
        neither `pred` nor `target` contributes an IoU of 1.0.
    """
    ious = []
    for cls in range(num_classes):
        pred_mask = (pred == cls)
        target_mask = (target == cls)
        intersection = (pred_mask & target_mask).sum()
        union = (pred_mask | target_mask).sum()
        if union == 0:
            # Class absent from both maps: counted as a perfect match.
            ious.append(1.0)
        else:
            ious.append(intersection / union)
    return np.mean(ious)
```

## 可视化

```python
import cv2
import matplotlib.pyplot as plt
import numpy as np

# Draw bounding boxes
def draw_boxes(image, boxes, labels, scores, class_names):
    """Draw labelled boxes on `image` and return it.

    Args:
        image: Image array to draw on; it is modified in place.
        boxes: Iterable of (x1, y1, x2, y2) boxes.
        labels: Iterable of indices into `class_names`, one per box.
        scores: Iterable of confidence scores, one per box.
        class_names: Lookup from label index to display name.

    Returns:
        The same `image` object with rectangles and captions drawn.
    """
    for box, label, score in zip(boxes, labels, scores):
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        text = f'{class_names[label]}: {score:.2f}'
        # Keep the caption inside the image when the box touches the top edge.
        text_y = max(y1 - 10, 15)
        cv2.putText(image, text, (x1, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    return image

# Visualise a segmentation result
def visualize_segmentation(image, mask, num_classes):
    """Blend a colour-coded class mask over `image` (70% image, 30% mask).

    Args:
        image: Image array to blend with.
            NOTE(review): presumably uint8 HxWx3 with the same height and
            width as `mask` - confirm against callers.
        mask: Integer array of class ids used to index the colour table.
        num_classes: Number of distinct colours sampled from `tab20`.

    Returns:
        The blended overlay produced by `cv2.addWeighted`.
    """
    # One colour per class, alpha channel dropped, scaled to 0-255.
    # NOTE(review): matplotlib colours are RGB; if `image` is a BGR array
    # from cv2.imread the mask colours will appear channel-swapped.
    colors = plt.cm.tab20(np.linspace(0, 1, num_classes))[:, :3] * 255
    colored_mask = colors[mask]

    overlay = cv2.addWeighted(image, 0.7, colored_mask.astype(np.uint8), 0.3, 0)
    return overlay
```