bookworm-smart-assistant/skills/ai-ml-expert/references/cv-guide.md

284 lines
6.8 KiB
Markdown
Raw Permalink Normal View History

# 计算机视觉指南
## 数据增强
```python
import albumentations as A
import torchvision.transforms as T
# ToTensorV2 lives in the `albumentations.pytorch` submodule, which a plain
# `import albumentations as A` is not guaranteed to load, so import it explicitly.
from albumentations.pytorch import ToTensorV2
from torchvision.transforms import v2  # newer transforms API; not used below

# ImageNet channel statistics, shared by the training and validation pipelines.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# torchvision augmentation: resize slightly larger than the target, then take
# a random 224x224 crop so the network sees shifted views of each image.
train_transform = T.Compose([
    T.Resize((256, 256)),
    T.RandomCrop(224),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(15),
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Validation uses a deterministic resize only (no random augmentation).
val_transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Albumentations alternative (more powerful). NOTE: this rebinds
# `train_transform`; keep only one of the two training pipelines.
# Albumentations pipelines are called with keyword arguments on NumPy images:
#     tensor = train_transform(image=img)["image"]
train_transform = A.Compose([
    # Recent albumentations releases (1.4+) take `size=(height, width)`;
    # older releases expect `height=224, width=224` instead.
    A.RandomResizedCrop(size=(224, 224)),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=15),
    # Apply exactly one of the noise/blur transforms, 30% of the time.
    A.OneOf([
        A.GaussNoise(),
        A.GaussianBlur(),
        A.MotionBlur(),
    ], p=0.3),
    A.Normalize(),  # defaults to the ImageNet mean/std
    ToTensorV2(),
])
```
## 图像分类
### 预训练模型
```python
import torch.nn as nn  # required for nn.Linear below; the snippet never imported it
import torchvision.models as models
from torchvision.models import (
    EfficientNet_B0_Weights,
    ResNet50_Weights,
    ViT_B_16_Weights,
    efficientnet_b0,
    vit_b_16,
)

# Number of classes of the downstream task (size of the new head).
num_classes = 10

# --- ResNet-50 ---
# Load the pretrained model.
model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)

# Freeze the feature extractor: every pretrained parameter stops receiving
# gradients ...
for param in model.parameters():
    param.requires_grad = False

# ... then replace the classification head. A freshly created nn.Linear has
# requires_grad=True, so the head is the only trainable part of the model.
model.fc = nn.Linear(model.fc.in_features, num_classes)

# --- EfficientNet ---
# The final Linear layer is the second element of `classifier`.
model = efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)

# --- Vision Transformer ---
# The classification layer is exposed as `heads.head`.
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
model.heads.head = nn.Linear(model.heads.head.in_features, num_classes)
```
### timm 库
```python
import timm

# List the registered architectures whose name matches the wildcard pattern.
resnet_variants = timm.list_models('resnet*')

# Build a model with pretrained weights and a 10-class classification head.
model = timm.create_model('resnet50', num_classes=10, pretrained=True)

# Inspect the configuration attached to the model.
model_cfg = model.default_cfg
```
## 目标检测
### YOLOv8
```python
from ultralytics import YOLO

# Load a pretrained checkpoint (size variants: n, s, m, l, x).
model = YOLO('yolov8n.pt')

# Inference: one result object per input image, each holding its detections.
results = model('image.jpg')
for result in results:
    for box in result.boxes:
        # Each attribute is indexed at 0 to get this detection's own values.
        x1, y1, x2, y2 = box.xyxy[0]
        conf, cls = box.conf[0], box.cls[0]
        print(f"类别: {cls}, 置信度: {conf:.2f}, 边界框: ({x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f})")

# Train on a custom dataset described by dataset.yaml.
train_args = {
    'data': 'dataset.yaml',
    'epochs': 100,
    'imgsz': 640,
    'batch': 16,
    'device': 0,
}
model.train(**train_args)

# Export the model to ONNX.
model.export(format='onnx')
```
### dataset.yaml 格式
```yaml
# Dataset root directory
path: /path/to/dataset
# Image folders, given relative to `path`
train: images/train
val: images/val
# Class index -> class name (entries must be nested under `names`)
names:
  0: cat
  1: dog
  2: bird
```
## 图像分割
### 语义分割
```python
import torch
import torch.nn as nn
from torchvision.models.segmentation import (
    DeepLabV3_ResNet50_Weights,
    deeplabv3_resnet50,
)

# NOTE: `num_classes` (number of segmentation classes) and `image` (the input
# batch) are not defined in this snippet and must be provided by the caller.

# Load pretrained weights through the weights-enum API; the boolean
# `pretrained=True` flag is deprecated in torchvision.
model = deeplabv3_resnet50(weights=DeepLabV3_ResNet50_Weights.DEFAULT)

# Replace the last 1x1 convolution of the main head so that it predicts
# `num_classes` channels.
model.classifier[4] = nn.Conv2d(256, num_classes, kernel_size=1)

# Pretrained checkpoints also carry an auxiliary head; give it the same number
# of output channels so that its output stays usable during fine-tuning.
if model.aux_classifier is not None:
    model.aux_classifier[4] = nn.Conv2d(256, num_classes, kernel_size=1)

# Inference
model.eval()
with torch.no_grad():
    output = model(image)['out']  # main-head logits
    pred = output.argmax(1)       # per-pixel class index (argmax over dim 1)
```
### U-Net
```python
import torch
import torch.nn as nn


class UNet(nn.Module):
    """U-Net with four encoder stages, a bottleneck and four decoder stages.

    Args:
        in_channels: Number of channels of the input image.
        num_classes: Number of output channels (one logit map per class).
        base_channels: Width of the first encoder stage; every deeper stage
            doubles it. The default of 64 gives 64/128/256/512/1024 channels.

    The input is a 4-D tensor (batch, channels, height, width). Height and
    width do not have to be divisible by 16: upsampled feature maps are
    zero-padded to the size of their skip connection, so the output always has
    the spatial size of the input. Each spatial side must still be at least 16
    so that four 2x poolings leave a non-empty bottleneck.
    """

    def __init__(self, in_channels=3, num_classes=1, base_channels=64):
        super().__init__()
        c1, c2, c3, c4, c5 = (base_channels * 2 ** i for i in range(5))

        # Encoder
        self.enc1 = self._block(in_channels, c1)
        self.enc2 = self._block(c1, c2)
        self.enc3 = self._block(c2, c3)
        self.enc4 = self._block(c3, c4)
        self.pool = nn.MaxPool2d(2)

        # Bottleneck
        self.bottleneck = self._block(c4, c5)

        # Decoder: every stage upsamples by 2, then convolves the concatenation
        # of the upsampled map and the matching encoder output (hence the
        # doubled input width of each dec* block).
        self.upconv4 = nn.ConvTranspose2d(c5, c4, 2, stride=2)
        self.dec4 = self._block(c5, c4)
        self.upconv3 = nn.ConvTranspose2d(c4, c3, 2, stride=2)
        self.dec3 = self._block(c4, c3)
        self.upconv2 = nn.ConvTranspose2d(c3, c2, 2, stride=2)
        self.dec2 = self._block(c3, c2)
        self.upconv1 = nn.ConvTranspose2d(c2, c1, 2, stride=2)
        self.dec1 = self._block(c2, c1)

        # 1x1 convolution producing the per-class logits
        self.conv = nn.Conv2d(c1, num_classes, 1)

    def _block(self, in_ch, out_ch):
        """Return two (3x3 conv -> batch norm -> ReLU) layers in sequence."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    @staticmethod
    def _up_cat(upconv, x, skip):
        """Upsample `x` and concatenate it with `skip` along the channel dim.

        MaxPool2d(2) floors odd sizes, so the upsampled map can be one pixel
        smaller than `skip`; it is zero-padded to match before concatenation.
        """
        up = upconv(x)
        pad_h = skip.size(2) - up.size(2)
        pad_w = skip.size(3) - up.size(3)
        if pad_h or pad_w:
            # Padding order for the last two dims: (left, right, top, bottom).
            up = nn.functional.pad(
                up,
                [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2],
            )
        return torch.cat([up, skip], dim=1)

    def forward(self, x):
        """Return logits with `num_classes` channels at the input resolution."""
        # Encoder
        e1 = self.enc1(x)
        e2 = self.enc2(self.pool(e1))
        e3 = self.enc3(self.pool(e2))
        e4 = self.enc4(self.pool(e3))

        # Bottleneck
        b = self.bottleneck(self.pool(e4))

        # Decoder
        d4 = self.dec4(self._up_cat(self.upconv4, b, e4))
        d3 = self.dec3(self._up_cat(self.upconv3, d4, e3))
        d2 = self.dec2(self._up_cat(self.upconv2, d3, e2))
        d1 = self.dec1(self._up_cat(self.upconv1, d2, e1))
        return self.conv(d1)
```
## OCR
```python
from paddleocr import PaddleOCR

# Initialise the engine with the text-angle classifier enabled, using the
# 'ch' language models.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Run recognition
result = ocr.ocr('image.jpg', cls=True)

# result[0] holds the text lines of the (single) input image. It can be None
# when no text is detected, so fall back to an empty list instead of crashing
# on iteration.
lines = result[0] if result else None
for line in lines or []:
    # Each line is [bounding box, (text, confidence)].
    bbox, (text, confidence) = line
    print(f"文本: {text}, 置信度: {confidence:.2f}")
```
## 评估指标
```python
# 分类
from sklearn.metrics import accuracy_score, classification_report
# Detection mAP, computed with the COCO API
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# Ground-truth annotations, and the detections loaded against them.
coco_gt = COCO('annotations.json')
coco_dt = coco_gt.loadRes('predictions.json')

# Bounding-box evaluation; the three stages must run in this order.
coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
for stage in (coco_eval.evaluate, coco_eval.accumulate, coco_eval.summarize):
    stage()
# 分割 IoU
def iou_score(pred, target, num_classes):
    """Return the mean intersection-over-union over `num_classes` classes.

    Args:
        pred: Array of predicted class indices (NumPy array or torch tensor).
        target: Array of ground-truth class indices, comparable element-wise
            with `pred`.
        num_classes: Classes 0 .. num_classes - 1 are evaluated.

    Returns:
        The average of the per-class IoU values. A class that appears in
        neither `pred` nor `target` counts as a perfect match (IoU 1.0).
    """
    import numpy as np  # local import: the snippet has no module-level numpy import

    ious = []
    for cls in range(num_classes):
        pred_mask = pred == cls
        target_mask = target == cls
        # int() turns NumPy scalars and 0-dim torch tensors (on any device)
        # into plain Python ints, so the list below holds only floats.
        union = int((pred_mask | target_mask).sum())
        if union == 0:
            ious.append(1.0)
            continue
        intersection = int((pred_mask & target_mask).sum())
        ious.append(intersection / union)
    return np.mean(ious)
```
## 可视化
```python
import cv2
import matplotlib.pyplot as plt
# 绘制边界框
def draw_boxes(image, boxes, labels, scores, class_names):
    """Draw every box and its "<class name>: <score>" caption onto `image`.

    `boxes`, `labels` and `scores` are consumed in lockstep. Each box provides
    four coordinates (x1, y1, x2, y2) that are truncated to int, and each label
    is used to index `class_names`. The drawing calls receive `image` itself,
    which is also returned.
    """
    green = (0, 255, 0)
    for coords, label, score in zip(boxes, labels, scores):
        left, top, right, bottom = (int(value) for value in coords)
        cv2.rectangle(image, (left, top), (right, bottom), green, 2)
        # The caption is anchored 10 pixels above the top-left corner.
        caption = f'{class_names[label]}: {score:.2f}'
        cv2.putText(image, caption, (left, top - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)
    return image
# 可视化分割结果
def visualize_segmentation(image, mask, num_classes):
    """Blend a per-class colour rendering of `mask` over `image`.

    `num_classes` colours are sampled evenly from matplotlib's tab20 colormap,
    and `mask` is used as an index into that palette. The result is a 70%
    image / 30% colour blend.
    NOTE(review): assumes `image` is a uint8 H x W x 3 array and `mask` an
    integer array of class indices with the same height and width - confirm
    against the caller.
    """
    palette = plt.cm.tab20(np.linspace(0, 1, num_classes))
    palette_rgb = palette[:, :3] * 255  # drop alpha, scale to the 0-255 range
    class_colors = palette_rgb[mask].astype(np.uint8)
    return cv2.addWeighted(image, 0.7, class_colors, 0.3, 0)
```