284 lines
6.8 KiB
Markdown
284 lines
6.8 KiB
Markdown
|
|
# 计算机视觉指南
|
|||
|
|
|
|||
|
|
## 数据增强
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
import torchvision.transforms as T
|
|||
|
|
from torchvision.transforms import v2
|
|||
|
|
import albumentations as A
|
|||
|
|
|
|||
|
|
# torchvision augmentation
# Per-channel statistics shared by the training and validation pipelines
# (presumably the standard ImageNet mean/std -- confirm against the checkpoint used).
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training: resize, then random crop / flip / rotation / colour jitter
# before converting to a tensor and normalizing.
train_steps = [
    T.Resize((256, 256)),
    T.RandomCrop(224),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(15),
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    T.ToTensor(),
    T.Normalize(mean=NORM_MEAN, std=NORM_STD),
]
train_transform = T.Compose(train_steps)

# Validation: no random operations, only resize, tensor conversion and normalization.
val_steps = [
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=NORM_MEAN, std=NORM_STD),
]
val_transform = T.Compose(val_steps)
|
|||
|
|
|
|||
|
|
# Albumentations (more powerful)
# NOTE(review): this rebinds `train_transform`; if the torchvision snippet above
# runs in the same module, its pipeline is replaced by this one.

# With probability 0.3, apply one of the noise / blur transforms.
noise_or_blur = A.OneOf(
    [
        A.GaussNoise(),
        A.GaussianBlur(),
        A.MotionBlur(),
    ],
    p=0.3,
)

train_transform = A.Compose(
    [
        A.RandomResizedCrop(224, 224),
        A.HorizontalFlip(p=0.5),
        A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=15),
        noise_or_blur,
        A.Normalize(),
        A.pytorch.ToTensorV2(),
    ]
)
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
## 图像分类
|
|||
|
|
|
|||
|
|
### 预训练模型
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
import torch.nn as nn  # required by every nn.Linear head replacement below
import torchvision.models as models
from torchvision.models import (
    EfficientNet_B0_Weights,
    ResNet50_Weights,
    ViT_B_16_Weights,
    efficientnet_b0,
    vit_b_16,
)

# Load a pretrained model
model = models.resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)

# Replace the classification head
num_classes = 10
model.fc = nn.Linear(model.fc.in_features, num_classes)

# Freeze the feature-extraction layers: disable gradients everywhere first,
# then re-enable them only for the new head so that it alone is trained.
for param in model.parameters():
    param.requires_grad = False
for param in model.fc.parameters():
    param.requires_grad = True

# EfficientNet
model = efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)

# Vision Transformer
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)
model.heads.head = nn.Linear(model.heads.head.in_features, num_classes)
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### timm 库
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
import timm

# List the available models whose names match the pattern
timm.list_models('resnet*')

# Load a model
model = timm.create_model(
    'resnet50',
    pretrained=True,
    num_classes=10,
)

# Inspect the model configuration
model.default_cfg
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
## 目标检测
|
|||
|
|
|
|||
|
|
### YOLOv8
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
from ultralytics import YOLO

# Load a pretrained model
model = YOLO('yolov8n.pt')  # available sizes: n, s, m, l, x

# Inference
results = model('image.jpg')
for result in results:
    boxes = result.boxes
    for box in boxes:
        # Each attribute is indexed with [0] to read the values for this single box.
        x1, y1, x2, y2 = box.xyxy[0]
        conf, cls = box.conf[0], box.cls[0]
        print(f"类别: {cls}, 置信度: {conf:.2f}, 边界框: ({x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f})")

# Train on a custom dataset
train_options = dict(
    data='dataset.yaml',
    epochs=100,
    imgsz=640,
    batch=16,
    device=0,
)
model.train(**train_options)

# Export
model.export(format='onnx')
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### dataset.yaml 格式
|
|||
|
|
|
|||
|
|
```yaml
|
|||
|
|
path: /path/to/dataset
train: images/train
val: images/val

names:
  0: cat
  1: dog
  2: bird
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
## 图像分割
|
|||
|
|
|
|||
|
|
### 语义分割
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
import torch
import torch.nn as nn
from torchvision.models.segmentation import (
    DeepLabV3_ResNet50_Weights,
    deeplabv3_resnet50,
)

# NOTE(review): `num_classes` and `image` are not defined in this snippet; they
# must be provided by the surrounding code (`image` presumably a batched,
# normalized float tensor -- confirm against the caller).

# Load pretrained weights through the `weights=` API, matching the
# classification examples, instead of the deprecated `pretrained=True` flag.
model = deeplabv3_resnet50(weights=DeepLabV3_ResNet50_Weights.COCO_WITH_VOC_LABELS_V1)

# Swap the last classifier layer so the 'out' head predicts `num_classes` channels.
# NOTE(review): the auxiliary head, if present, is left unchanged -- confirm
# whether it also needs replacing before training with an auxiliary loss.
model.classifier[4] = nn.Conv2d(256, num_classes, kernel_size=1)

# Inference
model.eval()
with torch.no_grad():
    output = model(image)['out']
    # Index of the highest-scoring channel along dim 1 for every position.
    pred = output.argmax(1)
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
### U-Net
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
import torch
import torch.nn as nn
import torch.nn.functional as F


class UNet(nn.Module):
    """U-Net style encoder-decoder built from double-convolution blocks.

    Four encoder stages (64, 128, 256 and 512 channels) are separated by 2x2
    max pooling and followed by a 1024-channel bottleneck. Four decoder stages
    then upsample with stride-2 transposed convolutions, concatenate the
    matching encoder output along the channel axis and convolve again. A final
    1x1 convolution maps the 64 decoder channels to ``num_classes`` channels.

    Args:
        in_channels: Number of channels of the input tensor.
        num_classes: Number of channels produced by the final 1x1 convolution.
    """

    def __init__(self, in_channels=3, num_classes=1):
        super().__init__()

        # Encoder
        self.enc1 = self._block(in_channels, 64)
        self.enc2 = self._block(64, 128)
        self.enc3 = self._block(128, 256)
        self.enc4 = self._block(256, 512)

        # One pooling layer is reused between all stages (it has no parameters).
        self.pool = nn.MaxPool2d(2)

        # Bottleneck
        self.bottleneck = self._block(512, 1024)

        # Decoder: every dec* block receives twice the channels of its upconv
        # output because the encoder skip tensor is concatenated onto it.
        self.upconv4 = nn.ConvTranspose2d(1024, 512, 2, stride=2)
        self.dec4 = self._block(1024, 512)
        self.upconv3 = nn.ConvTranspose2d(512, 256, 2, stride=2)
        self.dec3 = self._block(512, 256)
        self.upconv2 = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2 = self._block(256, 128)
        self.upconv1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec1 = self._block(128, 64)

        self.conv = nn.Conv2d(64, num_classes, 1)

    def _block(self, in_ch, out_ch):
        """Return two 3x3 conv + batch-norm + ReLU layers (spatial size preserved)."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    @staticmethod
    def _merge(upsampled, skip):
        """Concatenate ``upsampled`` with ``skip`` along the channel axis.

        Max pooling floors odd sizes, so after the stride-2 upsampling a
        feature map can be smaller than the encoder output it is paired with.
        In that case ``upsampled`` is zero-padded to the size of ``skip``;
        when the sizes already agree the tensors are concatenated unchanged.
        """
        dh = skip.size(-2) - upsampled.size(-2)
        dw = skip.size(-1) - upsampled.size(-1)
        if dh or dw:
            # F.pad order for the last two dims: (left, right, top, bottom).
            upsampled = F.pad(
                upsampled, [dw // 2, dw - dw // 2, dh // 2, dh - dh // 2]
            )
        return torch.cat([upsampled, skip], dim=1)

    def forward(self, x):
        """Run the network and return the output of the final 1x1 convolution.

        Args:
            x: Batched input with channels on dim 1 (``(N, C, H, W)``). H and W
                must be at least 16 so that four poolings leave a non-empty
                map; they do not need to be multiples of 16.

        Returns:
            Tensor with ``num_classes`` channels and the same H and W as ``x``.
            No activation is applied.
        """
        # Encoder
        e1 = self.enc1(x)
        e2 = self.enc2(self.pool(e1))
        e3 = self.enc3(self.pool(e2))
        e4 = self.enc4(self.pool(e3))

        # Bottleneck
        b = self.bottleneck(self.pool(e4))

        # Decoder
        d4 = self.dec4(self._merge(self.upconv4(b), e4))
        d3 = self.dec3(self._merge(self.upconv3(d4), e3))
        d2 = self.dec2(self._merge(self.upconv2(d3), e2))
        d1 = self.dec1(self._merge(self.upconv1(d2), e1))

        return self.conv(d1)
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
## OCR
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
from paddleocr import PaddleOCR

# Initialization
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Recognition
result = ocr.ocr('image.jpg', cls=True)

# Only the first entry of `result` is read (a single image path was passed).
# NOTE(review): some PaddleOCR releases return None for an image in which no
# text was detected -- treat a missing or None entry as "no lines" instead of
# failing with "'NoneType' object is not iterable".
lines = (result[0] if result else None) or []
for line in lines:
    bbox = line[0]
    text, confidence = line[1]
    print(f"文本: {text}, 置信度: {confidence:.2f}")
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
## 评估指标
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
# Classification
from sklearn.metrics import accuracy_score, classification_report

# Detection mAP
# Uses the COCO API
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

coco_gt = COCO('annotations.json')
coco_dt = coco_gt.loadRes('predictions.json')
coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')

# The three stages must run in this order: evaluate, accumulate, summarize.
for stage in (coco_eval.evaluate, coco_eval.accumulate, coco_eval.summarize):
    stage()
|
|||
|
|
|
|||
|
|
# Segmentation IoU
import numpy as np


def _as_numpy(labels):
    """Return ``labels`` as a NumPy array, copying tensor-like inputs to the host."""
    if hasattr(labels, "detach"):
        # Tensor-like object (e.g. torch.Tensor): drop autograd and leave the GPU.
        labels = labels.detach().cpu().numpy()
    return np.asarray(labels)


def iou_score(pred, target, num_classes):
    """Return the mean intersection-over-union across ``num_classes`` classes.

    Args:
        pred: Predicted class ids (NumPy array, nested list or tensor).
        target: Ground-truth class ids, comparable element-wise with ``pred``.
        num_classes: Classes ``0 .. num_classes - 1`` are scored.

    Returns:
        The arithmetic mean of the per-class IoU values. A class that appears
        in neither ``pred`` nor ``target`` contributes an IoU of 1.0.
    """
    pred = _as_numpy(pred)
    target = _as_numpy(target)

    ious = []
    for cls in range(num_classes):
        pred_mask = pred == cls
        target_mask = target == cls
        union = np.logical_or(pred_mask, target_mask).sum()
        if union == 0:
            # Class absent from both inputs: counted as a perfect match.
            ious.append(1.0)
        else:
            intersection = np.logical_and(pred_mask, target_mask).sum()
            ious.append(intersection / union)
    return np.mean(ious)
|
|||
|
|
```
|
|||
|
|
|
|||
|
|
## 可视化
|
|||
|
|
|
|||
|
|
```python
|
|||
|
|
import cv2
import matplotlib.pyplot as plt
import numpy as np
|
|||
|
|
|
|||
|
|
# Draw bounding boxes
def draw_boxes(image, boxes, labels, scores, class_names):
    """Draw labelled bounding boxes on ``image`` in place and return it.

    Args:
        image: Image passed to ``cv2.rectangle`` / ``cv2.putText``; it is modified.
        boxes: Iterable of ``(x1, y1, x2, y2)`` boxes; coordinates are cast to int.
        labels: Class id for each box.
        scores: Confidence for each box, formatted with two decimals.
        class_names: Sequence or mapping from class id to display name.

    Returns:
        The same ``image`` object with boxes and captions drawn in green.
    """
    green = (0, 255, 0)
    for box, label, score in zip(boxes, labels, scores):
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(image, (x1, y1), (x2, y2), green, 2)

        # Class ids may arrive as floats or 0-d tensors, which cannot index a
        # list or match an int dict key; fall back to the integer value.
        try:
            name = class_names[label]
        except (KeyError, TypeError):
            name = class_names[int(label)]
        text = f'{name}: {score:.2f}'

        # Keep the caption visible: draw it above the box when there is room,
        # otherwise just inside the top edge of the box.
        text_y = y1 - 10 if y1 >= 20 else y1 + 15
        cv2.putText(image, text, (x1, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)
    return image
|
|||
|
|
|
|||
|
|
# Visualize a segmentation result
def visualize_segmentation(image, mask, num_classes):
    """Blend a per-class colour layer over ``image`` and return the result.

    Args:
        image: Image handed to ``cv2.addWeighted`` (weight 0.7).
        mask: Integer class ids used to index the colour table
            (assumes values in ``0 .. num_classes - 1`` -- TODO confirm).
        num_classes: Number of colours sampled from the ``tab20`` colormap.

    Returns:
        The blended image produced by ``cv2.addWeighted``.
    """
    # Sample `num_classes` evenly spaced colours, drop alpha, scale to 0-255.
    # NOTE(review): the colormap yields RGB order; confirm it matches `image`.
    sample_points = np.linspace(0, 1, num_classes)
    palette = plt.cm.tab20(sample_points)[:, :3] * 255

    # Look up one colour per mask element, then blend 70% image / 30% colours.
    class_layer = palette[mask].astype(np.uint8)
    return cv2.addWeighted(image, 0.7, class_layer, 0.3, 0)
|
|||
|
|
```
|