🔥 新作首發 🎮 YOLOv8 物件偵測實戰 — 從資料蒐集、模型訓練到即時偵測 立即閱讀 →
熱門系列
Like Share Discussion Bookmark Smile

J.J. Huang   2026-04-16   Python OpenCV 08.專案實作篇   瀏覽次數:次   DMCA.com Protection Status

Python | OpenCV 專案:小型圖片分類專案

📚 前言

在上一篇 車牌辨識應用 中,我們完成了結合 OCR 的辨識應用。

這一篇是 小型圖片分類專案,目標是把 模型訓練與微調 系列學到的遷移學習技術,整合成一個完整的端對端系統:從蒐集資料到即時攝影機推論,一次走完全流程。

範例場景:辨識「剪刀、石頭、布」三種手勢(可替換成任何你想分類的物件)。

🎯 專案目標

  • 用攝影機蒐集自訂類別的訓練資料
  • 以 MobileNetV2 + 遷移學習訓練分類模型
  • 評估模型表現
  • 整合攝影機做即時分類推論

🗃️ 專案結構

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
image_classifier/
├── 1_collect.py ← 蒐集資料
├── 2_train.py ← 訓練模型
├── 3_evaluate.py ← 評估模型
├── 4_inference.py ← 即時推論
├── dataset/ ← 訓練資料(自動建立)
│ ├── train/
│ │ ├── scissors/
│ │ ├── rock/
│ │ └── paper/
│ └── val/
│ ├── scissors/
│ ├── rock/
│ └── paper/
└── models/
└── classifier.pth

💻 步驟一:蒐集訓練資料

開啟攝影機讓使用者逐類別按空白鍵蒐集訓練與驗證圖片,截取中央 ROI 後儲存。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# 1_collect.py
#
# Interactive dataset collector: for each class and each split (train/val),
# waits for the user to press SPACE, then saves the central square ROI of
# consecutive webcam frames as 224x224 JPEGs under dataset/<split>/<class>/.
import cv2
import os

CLASSES = ["scissors", "rock", "paper"]  # change to your own classes
TRAIN_COUNT = 200  # training samples per class
VAL_COUNT = 50     # validation samples per class

# Create all output directories up front (idempotent).
for cls in CLASSES:
    os.makedirs(f"dataset/train/{cls}", exist_ok=True)
    os.makedirs(f"dataset/val/{cls}", exist_ok=True)

cap = cv2.VideoCapture(0)

for cls in CLASSES:
    for split, target in [("train", TRAIN_COUNT), ("val", VAL_COUNT)]:
        count = 0
        print(f"\n準備蒐集 [{cls}] {split} 資料,按空白鍵開始")

        # Wait until the user presses SPACE before capturing this class/split.
        while True:
            ret, frame = cap.read()
            if not ret:
                # Camera hiccup: skip this iteration instead of crashing
                # on a None frame in cv2.putText below.
                continue
            # NOTE(review): cv2.putText cannot render non-ASCII glyphs, so
            # the Chinese prompt shows as '?' on screen — kept from original.
            cv2.putText(frame, f"準備好後按空白鍵:{cls} ({split})",
                        (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)
            cv2.imshow("Collect", frame)
            if cv2.waitKey(1) & 0xFF == ord(" "):
                break

        # Capture `target` frames, saving the central ROI of each.
        while count < target:
            ret, frame = cap.read()
            if not ret:
                break

            # Central square ROI, half the shorter frame dimension.
            h, w = frame.shape[:2]
            size = min(h, w) // 2
            cx, cy = w // 2, h // 2
            roi = frame[cy-size//2:cy+size//2, cx-size//2:cx+size//2]
            roi_rsz = cv2.resize(roi, (224, 224))

            fname = os.path.join(f"dataset/{split}/{cls}", f"{count:04d}.jpg")
            cv2.imwrite(fname, roi_rsz)
            count += 1

            # Live feedback: ROI box and progress counter.
            cv2.rectangle(frame, (cx-size//2, cy-size//2),
                          (cx+size//2, cy+size//2), (0, 255, 0), 2)
            cv2.putText(frame, f"{cls} {split}: {count}/{target}",
                        (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)
            cv2.imshow("Collect", frame)
            cv2.waitKey(30)  # throttle capture rate (~33 fps max)

cap.release()
cv2.destroyAllWindows()
print("資料蒐集完成!")

💻 步驟二:訓練模型

凍結 MobileNetV2 特徵層後只訓練分類頭,帶有增強的訓練迴圈並儲存最佳驗證準確率的模型。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# 2_train.py
#
# Transfer learning: freeze MobileNetV2's feature extractor, train only a new
# classification head on dataset/train, validate on dataset/val, and keep the
# checkpoint with the best validation accuracy in models/classifier.pth.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
import os

CLASSES = ["scissors", "rock", "paper"]
EPOCHS = 20
BATCH = 32
LR = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"使用裝置:{DEVICE}")

# Training augmentation. The explicit Resize makes the pipeline independent
# of the size the collector happened to save (previously it relied on every
# image already being 224x224).
train_tf = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3),
    transforms.ToTensor(),
    # ImageNet mean/std — required to match the pretrained backbone.
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])
val_tf = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])

train_ds = datasets.ImageFolder("dataset/train", transform=train_tf)
val_ds = datasets.ImageFolder("dataset/val", transform=val_tf)
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=BATCH)

print(f"訓練集:{len(train_ds)} 張,驗證集:{len(val_ds)} 張")
print(f"類別對應:{train_ds.class_to_idx}")

# MobileNetV2 transfer learning: freeze the convolutional features,
# replace the classifier head with one sized for our classes.
model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.DEFAULT)
for param in model.features.parameters():
    param.requires_grad = False

model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.Linear(model.last_channel, len(CLASSES)),
)
model = model.to(DEVICE)

criterion = nn.CrossEntropyLoss()
# Only the new head's parameters are optimized; the backbone stays frozen.
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=LR)

best_acc = 0.0
os.makedirs("models", exist_ok=True)

for epoch in range(1, EPOCHS + 1):
    # --- training pass ---
    model.train()
    total, correct = 0, 0
    for imgs, labels in train_dl:
        imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        out = model(imgs)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        correct += (out.argmax(1) == labels).sum().item()
        total += labels.size(0)
    train_acc = correct / total

    # --- validation pass (no gradients) ---
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for imgs, labels in val_dl:
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
            out = model(imgs)
            correct += (out.argmax(1) == labels).sum().item()
            total += labels.size(0)
    val_acc = correct / total

    print(f"Epoch {epoch:02d}/{EPOCHS} "
          f"train_acc={train_acc:.4f} val_acc={val_acc:.4f}")

    # Keep only the checkpoint with the best validation accuracy.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "models/classifier.pth")
        print(f" ✅ 最佳模型已儲存(val_acc={val_acc:.4f})")

print(f"\n訓練完成!最佳驗證準確率:{best_acc:.4f}")

💻 步驟三:評估模型

載入已訓練的 MobileNetV2 模型對驗證集推論,輸出各類別 Precision、Recall、F1-Score 報告

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# 3_evaluate.py
#
# Load the trained MobileNetV2 checkpoint, run inference over the validation
# set, and print per-class precision / recall / F1 via scikit-learn.
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms, models
from sklearn.metrics import classification_report
import torch.nn as nn

CLASSES = ["scissors", "rock", "paper"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Same preprocessing as validation during training; explicit Resize keeps the
# script correct even if images are not already 224x224.
val_tf = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])
val_ds = datasets.ImageFolder("dataset/val", transform=val_tf)
val_dl = DataLoader(val_ds, batch_size=32)

# Rebuild the architecture exactly as in training (no pretrained weights
# needed — the checkpoint supplies everything), then load the state dict.
model = models.mobilenet_v2(weights=None)
model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.Linear(model.last_channel, len(CLASSES)),
)
model.load_state_dict(torch.load("models/classifier.pth", map_location=DEVICE))
model = model.to(DEVICE)
model.eval()

# Collect predictions and ground-truth labels over the whole validation set.
all_preds, all_labels = [], []
with torch.no_grad():
    for imgs, labels in val_dl:
        imgs = imgs.to(DEVICE)
        preds = model(imgs).argmax(1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

# NOTE(review): target_names assumes ImageFolder's alphabetical class order
# matches CLASSES — verify against train_ds.class_to_idx printed in training.
print(classification_report(all_labels, all_preds, target_names=CLASSES))

💻 步驟四:整合攝影機即時推論

整合 MobileNetV2 與攝影機,對畫面中央 ROI 即時分類手勢並顯示類別名稱與信心度

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# 4_inference.py
#
# Real-time inference: grab webcam frames, classify the central square ROI
# with the trained MobileNetV2, and overlay the predicted label + confidence.
# Press 'q' to quit.
import cv2
import torch
import torch.nn as nn
from PIL import Image  # hoisted: was previously imported inside the frame loop
from torchvision import transforms, models

CLASSES = ["scissors", "rock", "paper"]
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the training-time architecture and load the checkpoint.
model = models.mobilenet_v2(weights=None)
model.classifier = nn.Sequential(
    nn.Dropout(0.2),
    nn.Linear(model.last_channel, len(CLASSES)),
)
model.load_state_dict(torch.load("models/classifier.pth", map_location=DEVICE))
model = model.to(DEVICE)
model.eval()

# Same normalization as training (ImageNet statistics).
tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225]),
])

cap = cv2.VideoCapture(0)
print("即時分類推論,按 q 離開")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Central square ROI — must match the region used during data collection
    # to avoid a train/inference distribution shift.
    h, w = frame.shape[:2]
    size = min(h, w) // 2
    cx, cy = w // 2, h // 2
    roi = frame[cy-size//2:cy+size//2, cx-size//2:cx+size//2]

    # Preprocess: resize, BGR->RGB (OpenCV is BGR, the model expects RGB),
    # then the same tensor transform as training.
    roi_rgb = cv2.cvtColor(cv2.resize(roi, (224, 224)), cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(roi_rgb)
    tensor = tf(pil_img).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        logits = model(tensor)
        probs = torch.softmax(logits, dim=1)[0].cpu().numpy()
        pred = probs.argmax()

    # Overlay prediction and confidence on the live frame.
    label = f"{CLASSES[pred]}: {probs[pred]*100:.1f}%"
    cv2.rectangle(frame, (cx-size//2, cy-size//2),
                  (cx+size//2, cy+size//2), (0, 255, 0), 2)
    cv2.putText(frame, label, (10, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3)

    cv2.imshow("Classifier", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

⚠️ 注意事項

  • 蒐集資料時保持背景一致:背景差異太大會讓模型學到背景而非物件本身,建議在相同背景下蒐集。
  • 類別樣本數要均衡:每個類別樣本數差異不要超過 2 倍,否則模型會偏向樣本多的類別。
  • 推論 ROI 與訓練 ROI 要一致:訓練時取畫面中央 ROI,推論時也要取相同位置,避免分佈偏移。

🎯 結語

這個專案把資料蒐集、模型訓練、評估、即時推論串成一條完整的 pipeline,是深度學習與 OpenCV 整合最典型的範例。
下一個專案是 MediaPipe 手勢控制應用,用 MediaPipe 的手部關鍵點做更精細的手勢辨識。

📖 如在學習過程中遇到疑問,或是想了解更多相關主題,建議回顧一下 Python | OpenCV 系列導讀,掌握完整的章節目錄,方便快速找到你需要的內容。

註:以上參考了
PyTorch Transfer Learning Tutorial
torchvision MobileNetV2
scikit-learn classification_report