bookworm-smart-assistant/skills/ai-ml-expert/references/cv-guide.md

284 lines
6.8 KiB
Markdown
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# 计算机视觉指南
## 数据增强
```python
import torchvision.transforms as T
from torchvision.transforms import v2
import albumentations as A
# ToTensorV2 lives in the albumentations.pytorch submodule; import it
# explicitly instead of relying on `A.pytorch` being loaded as a side effect.
from albumentations.pytorch import ToTensorV2

# Per-channel normalization statistics shared by the torchvision pipelines below.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# torchvision augmentation: resize, random 224 crop, flip, rotate, colour jitter.
train_transform = T.Compose([
    T.Resize((256, 256)),
    T.RandomCrop(224),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(15),
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Validation uses a deterministic resize only (no random augmentation).
val_transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Albumentations alternative with a richer set of augmentations.
# This rebinds `train_transform`, replacing the torchvision pipeline above.
# NOTE(review): the positional (224, 224) form of RandomResizedCrop depends on
# the installed albumentations version — confirm against the pinned release.
train_transform = A.Compose([
    A.RandomResizedCrop(224, 224),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=15),
    # Apply one of the noise/blur transforms with probability 0.3.
    A.OneOf([
        A.GaussNoise(),
        A.GaussianBlur(),
        A.MotionBlur(),
    ], p=0.3),
    A.Normalize(),
    ToTensorV2(),
])
```
## 图像分类
### 预训练模型
```python
import torch.nn as nn
import torchvision.models as models
from torchvision.models import ResNet50_Weights

# Load a ResNet-50 with pretrained ImageNet weights.
model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)

# Freeze every pretrained parameter so only the new head is trained.
for param in model.parameters():
    param.requires_grad = False

# Replace the classification head. The new layer is created after the freeze,
# so its parameters keep the default requires_grad=True.
num_classes = 10
model.fc = nn.Linear(model.fc.in_features, num_classes)

# EfficientNet: the final Linear layer sits at classifier[1].
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights
model = efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)

# Vision Transformer: the final Linear layer is heads.head.
from torchvision.models import vit_b_16, ViT_B_16_Weights
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
model.heads.head = nn.Linear(model.heads.head.in_features, num_classes)
```
### timm 库
```python
import timm
# List the available model names matching a wildcard pattern
timm.list_models('resnet*')
# Create a resnet50 with pretrained weights and a 10-class classifier head
model = timm.create_model('resnet50', pretrained=True, num_classes=10)
# Inspect the model's default configuration
model.default_cfg
```
## 目标检测
### YOLOv8
```python
from ultralytics import YOLO

# Load a pretrained model; the letter selects the size variant (n, s, m, l, x).
model = YOLO('yolov8n.pt')

# Inference: iterate over the returned results and their detected boxes.
results = model('image.jpg')
for result in results:
    for box in result.boxes:
        # Convert to plain Python numbers so the values format cleanly.
        x1, y1, x2, y2 = box.xyxy[0].tolist()
        conf = float(box.conf[0])
        cls = int(box.cls[0])
        # Look up the human-readable class name instead of printing a raw tensor.
        print(f"类别: {result.names[cls]}, 置信度: {conf:.2f}, 边界框: ({x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f})")

# Train on a custom dataset described by dataset.yaml.
# NOTE(review): device=0 presumably selects the first GPU — adjust for CPU-only hosts.
model.train(
    data='dataset.yaml',
    epochs=100,
    imgsz=640,
    batch=16,
    device=0,
)

# Export the model to ONNX.
model.export(format='onnx')
```
### dataset.yaml 格式
```yaml
# Dataset root; train/val are given relative to it.
path: /path/to/dataset
train: images/train
val: images/val
# Class index -> class name. The entries must be nested under `names`.
names:
  0: cat
  1: dog
  2: bird
```
## 图像分割
### 语义分割
```python
import torch
import torch.nn as nn
from torchvision.models.segmentation import (
    DeepLabV3_ResNet50_Weights,
    deeplabv3_resnet50,
)

# Load pretrained weights through the `weights=` enum API (the same style the
# classification examples use) instead of the deprecated `pretrained=True`.
model = deeplabv3_resnet50(weights=DeepLabV3_ResNet50_Weights.DEFAULT)

# Swap the final 1x1 convolution so the head predicts `num_classes` channels.
# `num_classes` is expected to be defined already (see the classification example).
final_conv = model.classifier[4]
model.classifier[4] = nn.Conv2d(final_conv.in_channels, num_classes, kernel_size=1)

# The auxiliary head (when present) must predict the same number of classes,
# otherwise its output no longer matches the targets during fine-tuning.
if model.aux_classifier is not None:
    aux_conv = model.aux_classifier[4]
    model.aux_classifier[4] = nn.Conv2d(aux_conv.in_channels, num_classes, kernel_size=1)

# Inference
# NOTE(review): `image` is assumed to be a preprocessed (N, 3, H, W) float
# batch prepared by the caller — confirm against the surrounding pipeline.
model.eval()
with torch.no_grad():
    output = model(image)['out']
    # Per-pixel class index: argmax over dimension 1 of the output.
    pred = output.argmax(1)
```
### U-Net
```python
import torch
import torch.nn as nn


class UNet(nn.Module):
    """U-Net style encoder/decoder for image segmentation.

    Four encoder stages (64 -> 512 channels) each followed by 2x2 max pooling,
    a 1024-channel bottleneck, and four decoder stages that upsample with a
    stride-2 transposed convolution and concatenate the matching encoder
    feature map (skip connection). A final 1x1 convolution maps the 64 decoder
    channels to ``num_classes`` output channels; no activation is applied.

    Args:
        in_channels: Number of channels of the input image.
        num_classes: Number of output channels.
    """

    def __init__(self, in_channels=3, num_classes=1):
        super().__init__()
        # Encoder
        self.enc1 = self._block(in_channels, 64)
        self.enc2 = self._block(64, 128)
        self.enc3 = self._block(128, 256)
        self.enc4 = self._block(256, 512)
        self.pool = nn.MaxPool2d(2)
        # Bottleneck
        self.bottleneck = self._block(512, 1024)
        # Decoder: each dec block takes twice the upconv output channels
        # because the skip connection is concatenated along the channel axis.
        self.upconv4 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
        self.dec4 = self._block(1024, 512)
        self.upconv3 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec3 = self._block(512, 256)
        self.upconv2 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2 = self._block(256, 128)
        self.upconv1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec1 = self._block(128, 64)
        self.conv = nn.Conv2d(64, num_classes, 1)

    def _block(self, in_ch, out_ch):
        """Return two 3x3 conv + BatchNorm + ReLU layers (spatial size preserved)."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    @staticmethod
    def _merge(upsampled, skip):
        """Concatenate an upsampled map with its skip connection along channels.

        Max pooling floors odd sizes, so after upsampling the decoder map can
        be one pixel smaller than the encoder map. Zero-pad it to the skip's
        height/width so inputs whose sides are not multiples of 16 still work.
        """
        diff_h = skip.size(2) - upsampled.size(2)
        diff_w = skip.size(3) - upsampled.size(3)
        if diff_h or diff_w:
            # Pad order is (left, right, top, bottom).
            upsampled = nn.functional.pad(
                upsampled,
                [diff_w // 2, diff_w - diff_w // 2, diff_h // 2, diff_h - diff_h // 2],
            )
        return torch.cat([upsampled, skip], dim=1)

    def forward(self, x):
        """Run the network on a 4-D batch and return the 1x1-conv output.

        The output has ``num_classes`` channels and the same height and width
        as ``x``.
        """
        # Encoder
        e1 = self.enc1(x)
        e2 = self.enc2(self.pool(e1))
        e3 = self.enc3(self.pool(e2))
        e4 = self.enc4(self.pool(e3))
        # Bottleneck
        b = self.bottleneck(self.pool(e4))
        # Decoder
        d4 = self.dec4(self._merge(self.upconv4(b), e4))
        d3 = self.dec3(self._merge(self.upconv3(d4), e3))
        d2 = self.dec2(self._merge(self.upconv2(d3), e2))
        d1 = self.dec1(self._merge(self.upconv1(d2), e1))
        return self.conv(d1)
```
## OCR
```python
from paddleocr import PaddleOCR

# Initialise the OCR engine with the angle classifier enabled, language 'ch'.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Recognise text in a single image.
result = ocr.ocr('image.jpg', cls=True)

# The first element holds the detections for the image. Some PaddleOCR
# releases return None there when no text is found, so fall back to an empty
# list instead of failing with "'NoneType' object is not iterable".
detections = (result[0] if result else None) or []
for bbox, (text, confidence) in detections:
    # Each detection is [bbox, (text, confidence)].
    print(f"文本: {text}, 置信度: {confidence:.2f}")
```
## 评估指标
```python
# Classification metrics
from sklearn.metrics import accuracy_score, classification_report
# Detection mAP
# Uses the COCO API (pycocotools)
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
# Load the ground-truth annotations, then load the predictions against them
coco_gt = COCO('annotations.json')
coco_dt = coco_gt.loadRes('predictions.json')
# Evaluate in 'bbox' mode; the three calls run in this order
coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
# Segmentation IoU
import numpy as np


def iou_score(pred, target, num_classes):
    """Return the mean intersection-over-union across ``num_classes`` classes.

    Args:
        pred: Array of predicted class indices (e.g. a NumPy array or a torch
            tensor); must support elementwise ``==``, ``&``, ``|`` and ``.sum()``.
        target: Array of ground-truth class indices, same shape as ``pred``.
        num_classes: Classes ``0 .. num_classes - 1`` are scored.

    Returns:
        The unweighted mean of the per-class IoU values. A class that appears
        in neither ``pred`` nor ``target`` (empty union) counts as 1.0.
    """
    ious = []
    for cls in range(num_classes):
        pred_mask = pred == cls
        target_mask = target == cls
        union = (pred_mask | target_mask).sum()
        if union == 0:
            # Class absent from both inputs: treated as a perfect match.
            ious.append(1.0)
        else:
            intersection = (pred_mask & target_mask).sum()
            # Plain floats keep np.mean working regardless of the array type.
            ious.append(float(intersection) / float(union))
    return np.mean(ious)
```
## 可视化
```python
import cv2
import matplotlib.pyplot as plt
# Draw bounding boxes
def draw_boxes(image, boxes, labels, scores, class_names):
    """Draw labelled green bounding boxes onto ``image`` and return it.

    The image is modified in place; the same object is returned.

    Args:
        image: Image array accepted by ``cv2.rectangle`` / ``cv2.putText``.
        boxes: Iterable of ``(x1, y1, x2, y2)`` boxes; values are cast to int.
        labels: Iterable of class indices, one per box.
        scores: Iterable of confidence scores, one per box.
        class_names: Mapping or sequence from class index to display name.
    """
    color = (0, 255, 0)
    min_text_y = 12  # keep the label baseline inside the image for boxes near the top
    for box, label, score in zip(boxes, labels, scores):
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
        text = f'{class_names[int(label)]}: {score:.2f}'
        # Place the label just above the box, clamped so it is not drawn off-canvas.
        text_origin = (x1, max(y1 - 10, min_text_y))
        cv2.putText(image, text, text_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
    return image
# Visualise a segmentation result
import numpy as np


def visualize_segmentation(image, mask, num_classes):
    """Blend a colour-coded class mask over ``image`` and return the overlay.

    Args:
        image: Image to overlay on.
            NOTE(review): assumed to be a uint8 H x W x 3 array matching the
            mask's height/width, as cv2.addWeighted needs equal shapes — confirm.
        mask: Integer array of class indices used to index the colour table.
        num_classes: Number of distinct colours sampled from the tab20 colormap.

    Returns:
        The blend ``0.7 * image + 0.3 * coloured mask``.
    """
    # One colour per class, sampled evenly from tab20; drop the alpha channel
    # and scale to 0-255. Casting the small palette once gives the same values
    # as casting the full per-pixel colour image.
    # NOTE(review): the palette is in matplotlib's RGB order; if `image` is BGR
    # (OpenCV default when loaded with cv2.imread) the colours appear swapped.
    palette = (plt.cm.tab20(np.linspace(0, 1, num_classes))[:, :3] * 255).astype(np.uint8)
    colored_mask = palette[mask]
    overlay = cv2.addWeighted(image, 0.7, colored_mask, 0.3, 0)
    return overlay
```