🔥 新作首發 🎮 YOLOv8 物件偵測實戰 — 從資料蒐集、模型訓練到即時偵測 立即閱讀 →
熱門系列
Like Share Discussion Bookmark Smile

J.J. Huang   2026-04-16   Python OpenCV 08.專案實作篇   瀏覽次數:次   DMCA.com Protection Status

Python | OpenCV 專案:小型圖片分類專案

📚 前言

在上一篇 車牌辨識應用 中,我們完成了結合 OCR 的辨識應用。

這一篇是 小型圖片分類專案,目標是把 模型訓練與微調 系列學到的遷移學習技術,整合成一個完整的端對端系統:從蒐集資料到即時攝影機推論,一次走完全流程。

範例場景:辨識「剪刀、石頭、布」三種手勢(可替換成任何你想分類的物件)。

🎯 專案目標

  • 用攝影機蒐集自訂類別的訓練資料
  • 以 MobileNetV2 + 遷移學習訓練分類模型
  • 評估模型表現
  • 整合攝影機做即時分類推論

🗃️ 專案結構

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
image_classifier/
├── 1_collect.py ← 蒐集資料
├── 2_train.py ← 訓練模型
├── 3_evaluate.py ← 評估模型
├── 4_inference.py ← 即時推論
├── dataset/ ← 訓練資料(自動建立)
│ ├── train/
│ │ ├── scissors/
│ │ ├── rock/
│ │ └── paper/
│ └── val/
│ ├── scissors/
│ ├── rock/
│ └── paper/
└── models/
└── classifier.pth

💻 步驟一:蒐集訓練資料

開啟攝影機讓使用者逐類別按空白鍵蒐集訓練與驗證圖片,截取中央 ROI 後儲存。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# 1_collect.py
#
# Interactive dataset collector: for each class and each split (train/val),
# waits for the user to press SPACE, then saves the central square ROI of
# consecutive webcam frames as 224x224 JPEGs under dataset/<split>/<class>/.
import cv2
import os

CLASSES = ["scissors", "rock", "paper"]  # change to your own classes
TRAIN_COUNT = 200  # training samples per class
VAL_COUNT = 50     # validation samples per class

# Create all output directories up front (idempotent).
for cls in CLASSES:
    os.makedirs(f"dataset/train/{cls}", exist_ok=True)
    os.makedirs(f"dataset/val/{cls}", exist_ok=True)

cap = cv2.VideoCapture(0)

for cls in CLASSES:
    for split, target in [("train", TRAIN_COUNT), ("val", VAL_COUNT)]:
        count = 0
        print(f"\n準備蒐集 [{cls}] {split} 資料,按空白鍵開始")

        # Wait until the user presses SPACE before capturing this class/split.
        while True:
            ret, frame = cap.read()
            if not ret:
                # Camera hiccup: skip this iteration instead of crashing
                # on a None frame in cv2.putText below.
                continue
            # NOTE(review): cv2.putText cannot render non-ASCII glyphs, so
            # the Chinese prompt shows as '?' on screen — kept from original.
            cv2.putText(frame, f"準備好後按空白鍵:{cls} ({split})",
                        (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)
            cv2.imshow("Collect", frame)
            if cv2.waitKey(1) & 0xFF == ord(" "):
                break

        # Capture `target` frames, saving the central ROI of each.
        while count < target:
            ret, frame = cap.read()
            if not ret:
                break

            # Central square ROI, half the shorter frame dimension.
            h, w = frame.shape[:2]
            size = min(h, w) // 2
            cx, cy = w // 2, h // 2
            roi = frame[cy-size//2:cy+size//2, cx-size//2:cx+size//2]
            roi_rsz = cv2.resize(roi, (224, 224))

            fname = os.path.join(f"dataset/{split}/{cls}", f"{count:04d}.jpg")
            cv2.imwrite(fname, roi_rsz)
            count += 1

            # Live feedback: ROI box and progress counter.
            cv2.rectangle(frame, (cx-size//2, cy-size//2),
                          (cx+size//2, cy+size//2), (0, 255, 0), 2)
            cv2.putText(frame, f"{cls} {split}: {count}/{target}",
                        (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
            cv2.imshow("Collect", frame)
            cv2.waitKey(30)  # throttle capture rate (~33 fps max)

cap.release()
cv2.destroyAllWindows()
print("資料蒐集完成!")

💻 步驟二:訓練模型

凍結 MobileNetV2 特徵層後只訓練分類頭,帶有增強的訓練迴圈並儲存最佳驗證準確率的模型。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# 2_train.py
#
# Transfer learning: freeze MobileNetV2's feature extractor, train only a new
# classification head on dataset/train, validate on dataset/val, and keep the
# checkpoint with the best validation accuracy in models/classifier.pth.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
import os

CLASSES = ["scissors", "rock", "paper"]
EPOCHS = 20
BATCH = 32
LR = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"使用裝置:{DEVICE}")

# Training augmentation. The explicit Resize makes the pipeline independent
# of the size the collector happened to save (previously it relied on every
# image already being 224x224).
train_tf = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3),
    transforms.ToTensor(),
    # ImageNet mean/std — required to match the pretrained backbone.
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])
val_tf = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])

train_ds = datasets.ImageFolder("dataset/train", transform=train_tf)
val_ds = datasets.ImageFolder("dataset/val", transform=val_tf)
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=BATCH)

print(f"訓練集:{len(train_ds)} 張,驗證集:{len(val_ds)} 張")
print(f"類別對應:{train_ds.class_to_idx}")

# MobileNetV2 transfer learning: freeze the convolutional features,
# replace the classifier head with one sized for our classes.
model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.DEFAULT)
for param in model.features.parameters():
    param.requires_grad = False

model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.Linear(model.last_channel, len(CLASSES)),
)
model = model.to(DEVICE)

criterion = nn.CrossEntropyLoss()
# Only the new head's parameters are optimized; the backbone stays frozen.
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)

best_acc = 0.0
os.makedirs("models", exist_ok=True)

for epoch in range(1, EPOCHS + 1):
    # --- training pass ---
    model.train()
    total, correct = 0, 0
    for imgs, labels in train_dl:
        imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        correct += (out.argmax(1) == labels).sum().item()
        total += labels.size(0)
    train_acc = correct / total

    # --- validation pass (no gradients) ---
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for imgs, labels in val_dl:
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
            out = model(imgs)
            correct += (out.argmax(1) == labels).sum().item()
            total += labels.size(0)
    val_acc = correct / total

    print(f"Epoch {epoch:02d}/{EPOCHS} "
          f"train_acc={train_acc:.4f} val_acc={val_acc:.4f}")

    # Keep only the checkpoint with the best validation accuracy.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "models/classifier.pth")
        print(f" ✅ 最佳模型已儲存(val_acc={val_acc:.4f})")

print(f"\n訓練完成!最佳驗證準確率:{best_acc:.4f}")

💻 步驟三:評估模型

載入已訓練的 MobileNetV2 模型對驗證集推論,輸出各類別 Precision、Recall、F1-Score 報告

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# 3_evaluate.py
#
# Load the trained MobileNetV2 checkpoint, run inference over the validation
# set, and print per-class precision / recall / F1 via scikit-learn.
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from sklearn.metrics import classification_report
import torch.nn as nn

CLASSES = ["scissors", "rock", "paper"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Same preprocessing as validation during training; explicit Resize keeps the
# script correct even if images are not already 224x224.
val_tf = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])
val_ds = datasets.ImageFolder("dataset/val", transform=val_tf)
val_dl = DataLoader(val_ds, batch_size=32)

# Rebuild the architecture exactly as in training (no pretrained weights
# needed — the checkpoint supplies everything), then load the state dict.
model = models.mobilenet_v2(weights=None)
model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.Linear(model.last_channel, len(CLASSES)),
)
model.load_state_dict(torch.load("models/classifier.pth", map_location=DEVICE))
model = model.to(DEVICE)
model.eval()

# Collect predictions and ground-truth labels over the whole validation set.
all_preds, all_labels = [], []
with torch.no_grad():
    for imgs, labels in val_dl:
        imgs = imgs.to(DEVICE)
        preds = model(imgs).argmax(1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# NOTE(review): target_names assumes ImageFolder's alphabetical class order
# matches CLASSES — verify against train_ds.class_to_idx printed in training.
print(classification_report(all_labels, all_preds, target_names=CLASSES))

💻 步驟四:整合攝影機即時推論

整合 MobileNetV2 與攝影機,對畫面中央 ROI 即時分類手勢並顯示類別名稱與信心度

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# 4_inference.py
#
# Real-time inference: grab webcam frames, classify the central square ROI
# with the trained MobileNetV2, and overlay the predicted label + confidence.
# Press 'q' to quit.
import cv2
import torch
import torch.nn as nn
from PIL import Image  # hoisted: was previously imported inside the frame loop
from torchvision import transforms, models

CLASSES = ["scissors", "rock", "paper"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the training-time architecture and load the checkpoint.
model = models.mobilenet_v2(weights=None)
model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.Linear(model.last_channel, len(CLASSES)),
)
model.load_state_dict(torch.load("models/classifier.pth", map_location=DEVICE))
model = model.to(DEVICE)
model.eval()

# Same normalization as training (ImageNet statistics).
tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])

cap = cv2.VideoCapture(0)
print("即時分類推論,按 q 離開")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Central square ROI — must match the region used during data collection
    # to avoid a train/inference distribution shift.
    h, w = frame.shape[:2]
    size = min(h, w) // 2
    cx, cy = w // 2, h // 2
    roi = frame[cy-size//2:cy+size//2, cx-size//2:cx+size//2]

    # Preprocess: resize, BGR->RGB (OpenCV is BGR, the model expects RGB),
    # then the same tensor transform as training.
    roi_rgb = cv2.cvtColor(cv2.resize(roi, (224, 224)), cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(roi_rgb)
    tensor = tf(pil_img).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        logits = model(tensor)
        probs = torch.softmax(logits, dim=1)[0].cpu().numpy()
        pred = probs.argmax()

    # Overlay prediction and confidence on the live frame.
    label = f"{CLASSES[pred]}: {probs[pred]*100:.1f}%"
    cv2.rectangle(frame, (cx-size//2, cy-size//2),
                  (cx+size//2, cy+size//2), (0, 255, 0), 2)
    cv2.putText(frame, label, (10, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3)

    cv2.imshow("Classifier", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

⚠️ 注意事項

  • 蒐集資料時保持背景一致:背景差異太大會讓模型學到背景而非物件本身,建議在相同背景下蒐集。
  • 類別樣本數要均衡:每個類別樣本數差異不要超過 2 倍,否則模型會偏向樣本多的類別。
  • 推論 ROI 與訓練 ROI 要一致:訓練時取畫面中央 ROI,推論時也要取相同位置,避免分佈偏移。

🎯 結語

這個專案把資料蒐集、模型訓練、評估、即時推論串成一條完整的 pipeline,是深度學習與 OpenCV 整合最典型的範例。
下一個專案是 MediaPipe 手勢控制應用,用 MediaPipe 的手部關鍵點做更精細的手勢辨識。

📖 如在學習過程中遇到疑問,或是想了解更多相關主題,建議回顧一下 Python | OpenCV 系列導讀,掌握完整的章節目錄,方便快速找到你需要的內容。

註:以上參考了
PyTorch Transfer Learning Tutorial
torchvision MobileNetV2
scikit-learn classification_report