feat: implement adaptive video frame extraction pipeline with YOLO-based auto-labeling and checkpointing

main
bahawal.baloch 2026-04-09 18:47:04 +05:00
parent a1da64e017
commit a7184a5773
6 changed files with 1149 additions and 1 deletions

4
.gitignore vendored
View File

@ -1,4 +1,6 @@
.env
*.pt
alerts/
__pycache__
__pycache__
dataset/
video_data/

504
extract_dataset.py Normal file
View File

@ -0,0 +1,504 @@
"""
extract_dataset.py
Extracts person-detection frames from surveillance videos using adaptive
frame sampling and yolo26x.pt for auto-labeling.
Features:
- Adaptive FPS: baseline 1 FPS, high 3 FPS (person), low 0.5 FPS (idle)
- GPU-accelerated YOLO inference in batches
- Per-video checkpointing for crash recovery
- 50 GB dataset size cap
- Organizes output in YOLO detection format
"""
import os
import sys
import cv2
import json
import time
import glob
import logging
import numpy as np
from pathlib import Path
from datetime import datetime
from ultralytics import YOLO
import pipeline_config as cfg
# ──────────────────────────────────────────────
# Logging
# ──────────────────────────────────────────────
# Create output directories up-front so handlers and checkpoints can write.
os.makedirs(cfg.LOG_DIR, exist_ok=True)
os.makedirs(cfg.CHECKPOINT_DIR, exist_ok=True)
# One timestamped log file per extraction run, mirrored to stdout.
log_file = os.path.join(cfg.LOG_DIR, f"extract_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
# ──────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────
def get_dataset_size_gb(dataset_dir: str) -> float:
    """Return the total on-disk size of *dataset_dir* (recursive) in GB."""
    byte_count = 0
    for root, _, names in os.walk(dataset_dir):
        byte_count += sum(os.path.getsize(os.path.join(root, n)) for n in names)
    return byte_count / (1024 ** 3)
def get_checkpoint_path(video_path: str) -> str:
    """Return the JSON checkpoint file path for *video_path* (keyed by stem)."""
    stem = Path(video_path).stem
    return os.path.join(cfg.CHECKPOINT_DIR, stem + ".json")
def load_checkpoint(video_path: str) -> dict | None:
    """Return the saved checkpoint dict for *video_path*, or None if absent."""
    cp_path = get_checkpoint_path(video_path)
    if not os.path.exists(cp_path):
        return None
    with open(cp_path, "r") as fh:
        return json.load(fh)
def save_checkpoint(video_path: str, data: dict):
    """Persist *data* as the pretty-printed JSON checkpoint for *video_path*."""
    with open(get_checkpoint_path(video_path), "w") as fh:
        json.dump(data, fh, indent=2)
def mark_video_done(video_path: str, stats: dict):
    """Flag *stats* as complete (mutates the dict in place) and checkpoint it."""
    stats.update(done=True)
    save_checkpoint(video_path, stats)
def discover_videos() -> dict[str, list[str]]:
    """
    Discover all camera directories and their video files.

    Each immediate subdirectory of cfg.VIDEO_DIR is treated as one camera;
    files matching cfg.VIDEO_EXTENSIONS (either case) are collected.
    Exits the process if the video root or all cameras are empty.

    Returns {camera_name: [sorted video_paths]}.
    """
    video_root = Path(cfg.VIDEO_DIR)
    if not video_root.exists():
        logger.error(f"Video directory not found: {cfg.VIDEO_DIR}")
        sys.exit(1)
    cameras: dict[str, list[str]] = {}
    for cam_dir in sorted(video_root.iterdir()):
        if not cam_dir.is_dir():
            continue
        found: set[str] = set()
        for ext in cfg.VIDEO_EXTENSIONS:
            for pattern in (f"*{ext}", f"*{ext.upper()}"):
                found.update(glob.glob(str(cam_dir / pattern)))
        if found:
            cameras[cam_dir.name] = sorted(found)
            logger.info(f"Camera '{cam_dir.name}': {len(found)} videos")
    if not cameras:
        logger.error("No video files found in any camera directory!")
        sys.exit(1)
    return cameras
def sanitize_camera_name(cam_name: str) -> str:
    """Create a filesystem-safe, lowercase camera identifier.

    Spaces and hyphens become underscores, runs of underscores collapse to a
    single one, and leading/trailing underscores are stripped.

    Fix: the previous single ``replace("__", "_")`` pass only collapsed one
    level, so inputs like ``"A - B"`` (-> ``"A___B"``) came out as ``"a__b"``;
    collapsing until no double underscore remains yields ``"a_b"``.
    """
    safe = cam_name.replace(" ", "_").replace("-", "_")
    while "__" in safe:
        safe = safe.replace("__", "_")
    return safe.strip("_").lower()
# ──────────────────────────────────────────────
# Adaptive Sampler State Machine
# ──────────────────────────────────────────────
class AdaptiveSampler:
    """
    State machine for adaptive frame sampling.

    States:
    - NORMAL: sample at BASE_FPS (1 fps)
    - HIGH: sample at HIGH_FPS (3 fps) person recently detected
    - LOW: sample at LOW_FPS (0.5 fps) long idle period

    Any person detection jumps straight to HIGH; HIGH decays to NORMAL after
    cfg.HIGH_FPS_DURATION idle seconds; NORMAL decays to LOW after
    cfg.LOW_FPS_THRESHOLD idle seconds; LOW persists until a new detection.
    """

    def __init__(self, video_fps: float):
        self.video_fps = video_fps
        self.state = "NORMAL"
        self.current_sample_fps = cfg.BASE_FPS
        self.last_person_time = 0.0   # video timestamp of last person detection
        self.no_person_streak = 0.0   # seconds since last person

    def _enter(self, state: str, sample_fps: float):
        """Switch to *state*, sampling at *sample_fps*."""
        self.state = state
        self.current_sample_fps = sample_fps

    def update(self, timestamp: float, person_detected: bool):
        """Update state based on the detection result at *timestamp* (seconds)."""
        if person_detected:
            self.last_person_time = timestamp
            self.no_person_streak = 0.0
            self._enter("HIGH", cfg.HIGH_FPS)
            return
        self.no_person_streak = timestamp - self.last_person_time
        if self.state == "HIGH" and self.no_person_streak > cfg.HIGH_FPS_DURATION:
            # Grace period after the last detection has expired.
            self._enter("NORMAL", cfg.BASE_FPS)
        elif self.state == "NORMAL" and self.no_person_streak > cfg.LOW_FPS_THRESHOLD:
            # Long idle stretch: throttle down until the next detection.
            self._enter("LOW", cfg.LOW_FPS)

    def get_frame_interval(self) -> int:
        """Return how many source-video frames to skip between samples (>= 1)."""
        return max(1, int(self.video_fps / self.current_sample_fps))
# ──────────────────────────────────────────────
# Core Extraction
# ──────────────────────────────────────────────
def process_video(
    model: YOLO,
    video_path: str,
    camera_name: str,
    output_images_dir: str,
    output_labels_dir: str,
    global_stats: dict,
) -> dict:
    """
    Process a single video file: extract frames with adaptive sampling,
    detect persons with YOLO, save frames and labels.

    Resumable: progress is checkpointed on a ~30 s timer and on completion,
    so a crashed run restarts from the last saved frame index. Frames are
    batched (cfg.BATCH_SIZE) before inference, so the adaptive sampler sees
    detection results with up to one batch of latency.

    Returns per-video stats dict (or an {"error": ...} dict if unreadable).
    """
    cam_safe = sanitize_camera_name(camera_name)
    video_name = Path(video_path).stem
    # Check if already done
    checkpoint = load_checkpoint(video_path)
    if checkpoint and checkpoint.get("done"):
        logger.info(f" ⏭ Skipping (already done): {video_name}")
        return checkpoint
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        logger.error(f" ✗ Cannot open video: {video_path}")
        return {"error": "cannot_open", "video": video_name}
    video_fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Guard against codecs that report FPS as 0.
    duration_sec = total_frames / video_fps if video_fps > 0 else 0
    logger.info(f" ▶ Processing: {video_name}")
    logger.info(f" FPS: {video_fps:.1f}, Frames: {total_frames}, Duration: {duration_sec:.0f}s")
    # Resume from checkpoint
    start_frame = 0
    frame_counter = 0  # monotonic id embedded in output filenames
    if checkpoint:
        start_frame = checkpoint.get("last_frame", 0)
        frame_counter = checkpoint.get("frame_counter", 0)
        logger.info(f" Resuming from frame {start_frame}")
    sampler = AdaptiveSampler(video_fps)
    stats = {
        "video": video_name,
        "camera": camera_name,
        "total_frames": total_frames,
        "frames_extracted": 0,
        "frames_with_person": 0,
        "frames_without_person": 0,
        "last_frame": start_frame,
        "frame_counter": frame_counter,
        "done": False,
    }
    # Seek to start position if resuming
    if start_frame > 0:
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
    current_frame_idx = start_frame
    batch_frames = []     # [(frame_idx, frame_array, timestamp), ...]
    batch_save_info = []  # [(output_img_path, output_lbl_path), ...]
    last_checkpoint_time = time.time()
    while True:
        # Check dataset size cap periodically (full directory walk, so only
        # every 500 extracted frames).
        if stats["frames_extracted"] % 500 == 0 and stats["frames_extracted"] > 0:
            current_size = get_dataset_size_gb(cfg.DATASET_DIR)
            if current_size >= cfg.MAX_DATASET_SIZE_GB:
                logger.warning(f" ⚠ Dataset size cap reached ({current_size:.1f} GB). Stopping extraction.")
                global_stats["size_cap_reached"] = True
                break
        # Calculate next frame to sample from the sampler's current rate.
        interval = sampler.get_frame_interval()
        target_frame = current_frame_idx + interval
        if target_frame >= total_frames:
            break
        # Seek to target frame.
        # NOTE(review): a CAP_PROP_POS_FRAMES seek per sample can be slow or
        # imprecise on some codecs — confirm acceptable for the source videos.
        cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame)
        ret, frame = cap.read()
        if not ret:
            break
        current_frame_idx = target_frame
        timestamp = current_frame_idx / video_fps
        # Build unique output filename
        frame_counter += 1
        frame_id = f"{cam_safe}_{frame_counter:06d}"
        img_path = os.path.join(output_images_dir, f"{frame_id}.jpg")
        lbl_path = os.path.join(output_labels_dir, f"{frame_id}.txt")
        batch_frames.append((current_frame_idx, frame, timestamp))
        batch_save_info.append((img_path, lbl_path))
        # Process batch when full
        if len(batch_frames) >= cfg.BATCH_SIZE:
            # NOTE(review): persons_in_batch is never read afterwards.
            persons_in_batch = _process_batch(
                model, batch_frames, batch_save_info, sampler, stats
            )
            batch_frames.clear()
            batch_save_info.clear()
        # Checkpoint every 30 seconds (wall clock) for crash recovery.
        if time.time() - last_checkpoint_time > 30:
            stats["last_frame"] = current_frame_idx
            stats["frame_counter"] = frame_counter
            save_checkpoint(video_path, stats)
            last_checkpoint_time = time.time()
        # Progress logging every 1000 extractions
        if stats["frames_extracted"] % 1000 == 0 and stats["frames_extracted"] > 0:
            pct = (current_frame_idx / total_frames) * 100
            logger.info(
                f" Progress: {pct:.1f}% | Extracted: {stats['frames_extracted']} | "
                f"Persons: {stats['frames_with_person']} | Mode: {sampler.state} | "
                f"FPS: {sampler.current_sample_fps}"
            )
    # Process remaining (partial) batch
    if batch_frames:
        _process_batch(model, batch_frames, batch_save_info, sampler, stats)
    cap.release()
    stats["last_frame"] = current_frame_idx
    stats["frame_counter"] = frame_counter
    mark_video_done(video_path, stats)
    logger.info(
        f" ✓ Done: {video_name} | Extracted: {stats['frames_extracted']} | "
        f"With person: {stats['frames_with_person']} | "
        f"Without: {stats['frames_without_person']}"
    )
    return stats
def _process_batch(
    model: YOLO,
    batch_frames: list,
    batch_save_info: list,
    sampler: AdaptiveSampler,
    stats: dict,
) -> int:
    """
    Run YOLO inference on a batch of frames, save images + labels.

    Args:
        model: loaded YOLO detector (on GPU).
        batch_frames: [(frame_idx, frame_array, timestamp), ...].
        batch_save_info: [(img_path, lbl_path), ...] parallel to batch_frames.
        sampler: adaptive sampler; fed each per-frame detection result.
        stats: per-video stats dict, mutated in place.

    Returns the number of frames containing at least one person.

    Fix: removed the unused ``conf = float(box.conf[0])`` computation in the
    label-writing loop — confidence is not part of the YOLO label format and
    the value was never written or read.
    """
    frames = [item[1] for item in batch_frames]
    timestamps = [item[2] for item in batch_frames]
    # Batched GPU inference; classes=[...] restricts detections to persons.
    results = model.predict(
        source=frames,
        conf=cfg.DETECTION_CONF,
        iou=cfg.DETECTION_IOU,
        classes=[cfg.PERSON_CLASS_ID],
        device=0,
        verbose=False,
        half=True,  # FP16 for speed
    )
    persons_count = 0
    for i, result in enumerate(results):
        img_path, lbl_path = batch_save_info[i]
        timestamp = timestamps[i]
        frame = frames[i]
        boxes = result.boxes
        # Defensive re-filter in case the backend ignores the classes argument.
        person_boxes = boxes[boxes.cls == cfg.PERSON_CLASS_ID]
        has_person = len(person_boxes) > 0
        # Feed the result back so the sampler can adapt its rate.
        sampler.update(timestamp, has_person)
        # Save frame as JPEG
        cv2.imwrite(img_path, frame, [cv2.IMWRITE_JPEG_QUALITY, cfg.JPEG_QUALITY])
        # Save YOLO-format labels: "class x_center y_center width height",
        # all normalized to [0, 1]. An empty file marks a negative frame.
        h, w = frame.shape[:2]
        with open(lbl_path, "w") as f:
            for box in person_boxes:
                xyxy = box.xyxy[0].cpu().numpy()
                x_center = ((xyxy[0] + xyxy[2]) / 2) / w
                y_center = ((xyxy[1] + xyxy[3]) / 2) / h
                box_w = (xyxy[2] - xyxy[0]) / w
                box_h = (xyxy[3] - xyxy[1]) / h
                f.write(f"0 {x_center:.6f} {y_center:.6f} {box_w:.6f} {box_h:.6f}\n")
        stats["frames_extracted"] += 1
        if has_person:
            stats["frames_with_person"] += 1
            persons_count += 1
        else:
            stats["frames_without_person"] += 1
    return persons_count
# ──────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────
def extract_all() -> dict:
    """
    Main entry point: discover videos, extract the dataset.

    Walks every camera directory, processes each video with adaptive
    sampling + YOLO auto-labeling, and keeps camera_mapping.json current so
    the split step can assign whole cameras to train/test. Honors the
    global dataset size cap set inside process_video.

    Returns {camera_name: [video_stats]}.

    Fix: the per-camera banner was f"{'' * 40}" — an empty string repeated
    40 times, which logged nothing (the divider character was evidently lost);
    replaced with a visible ASCII divider.
    """
    logger.info("=" * 60)
    logger.info("DATASET EXTRACTION STARTED")
    logger.info("=" * 60)
    # Load the large auto-labeling model on GPU.
    logger.info(f"Loading detector model: {cfg.DETECTOR_MODEL}")
    model = YOLO(cfg.DETECTOR_MODEL)
    model.to("cuda")
    logger.info("Model loaded on GPU ✓")
    # Discover cameras and videos (exits if none found).
    cameras = discover_videos()
    total_videos = sum(len(v) for v in cameras.values())
    logger.info(f"Found {len(cameras)} cameras, {total_videos} videos total")
    # Create output directories (flat — the train/test split happens later).
    images_dir = os.path.join(cfg.DATASET_DIR, "images", "all")
    labels_dir = os.path.join(cfg.DATASET_DIR, "labels", "all")
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)
    # Camera→frames mapping consumed by the split script.
    camera_mapping_path = os.path.join(cfg.DATASET_DIR, "camera_mapping.json")
    # Load the existing mapping if resuming a previous run.
    if os.path.exists(camera_mapping_path):
        with open(camera_mapping_path, "r") as f:
            camera_mapping = json.load(f)
    else:
        camera_mapping = {}
    global_stats = {"size_cap_reached": False}
    all_stats = {}
    video_num = 0
    for cam_name, video_list in cameras.items():
        cam_safe = sanitize_camera_name(cam_name)
        logger.info(f"\n{'-' * 40}")
        logger.info(f"Camera: {cam_name} ({len(video_list)} videos)")
        logger.info(f"{'-' * 40}")
        cam_stats = []
        for video_path in video_list:
            video_num += 1
            logger.info(f"\n[{video_num}/{total_videos}]")
            if global_stats["size_cap_reached"]:
                logger.warning("Size cap reached — skipping remaining videos.")
                break
            vstats = process_video(
                model, video_path, cam_name,
                images_dir, labels_dir, global_stats
            )
            cam_stats.append(vstats)
            # Track which frames belong to which camera.
            # NOTE(review): frames are matched by sanitized-name prefix, so two
            # cameras that sanitize to the same safe name would mix frames.
            if cam_name not in camera_mapping:
                camera_mapping[cam_name] = {"safe_name": cam_safe, "frames": []}
            frame_prefix = cam_safe + "_"
            existing_frames = camera_mapping[cam_name].get("frames", [])
            new_frames = [
                f for f in os.listdir(images_dir)
                if f.startswith(frame_prefix) and f.endswith(".jpg")
            ]
            camera_mapping[cam_name]["frames"] = sorted(set(existing_frames + new_frames))
            # Persist the mapping after every video so a crash loses nothing.
            with open(camera_mapping_path, "w") as f:
                json.dump(camera_mapping, f, indent=2)
        all_stats[cam_name] = cam_stats
        if global_stats["size_cap_reached"]:
            break
    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("EXTRACTION COMPLETE")
    logger.info("=" * 60)
    total_extracted = 0
    total_persons = 0
    for cam, stats_list in all_stats.items():
        for s in stats_list:
            total_extracted += s.get("frames_extracted", 0)
            total_persons += s.get("frames_with_person", 0)
    dataset_size = get_dataset_size_gb(cfg.DATASET_DIR)
    logger.info(f"Total frames extracted: {total_extracted}")
    logger.info(f"Frames with persons: {total_persons}")
    logger.info(f"Dataset size: {dataset_size:.2f} GB")
    logger.info(f"Cameras processed: {len(cameras)}")
    # Save final stats (default=str guards non-JSON-serializable values).
    stats_path = os.path.join(cfg.DATASET_DIR, "extraction_stats.json")
    with open(stats_path, "w") as f:
        json.dump({
            "total_extracted": total_extracted,
            "total_with_persons": total_persons,
            "dataset_size_gb": round(dataset_size, 2),
            "cameras": {k: len(v) for k, v in cameras.items()},
            "all_stats": all_stats,
        }, f, indent=2, default=str)
    return all_stats
# Allow running extraction directly as a standalone script.
if __name__ == "__main__":
    extract_all()

59
pipeline_config.py Normal file
View File

@ -0,0 +1,59 @@
"""
Centralized configuration for the person detection dataset pipeline.
All tunable parameters are defined here.
"""
import os
# ──────────────────────────────────────────────
# Paths
# ──────────────────────────────────────────────
BASE_DIR = os.path.dirname(os.path.abspath(__file__))   # project root = this file's directory
VIDEO_DIR = os.path.join(BASE_DIR, "video_data")        # input: one sub-directory per camera
DATASET_DIR = os.path.join(BASE_DIR, "dataset")         # output: extracted frames + labels
CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")  # per-video resume state (JSON)
LOG_DIR = os.path.join(BASE_DIR, "logs")                # timestamped run logs
# Model paths
DETECTOR_MODEL = os.path.join(BASE_DIR, "yolo26x.pt")  # Large model for auto-labeling
TRAIN_MODEL = "yolo26n.pt"  # Nano model to train (auto-downloads)
# ──────────────────────────────────────────────
# Dataset Extraction
# ──────────────────────────────────────────────
MAX_DATASET_SIZE_GB = 50   # Stop extraction if dataset exceeds this
JPEG_QUALITY = 85          # JPEG save quality (1-100)
DETECTION_CONF = 0.35      # Min confidence for person detection
DETECTION_IOU = 0.45       # NMS IoU threshold
BATCH_SIZE = 16            # Frames per YOLO inference batch
PERSON_CLASS_ID = 0        # YOLO/COCO class ID for "person"
# ──────────────────────────────────────────────
# Adaptive Sampling (see AdaptiveSampler in extract_dataset.py)
# ──────────────────────────────────────────────
BASE_FPS = 1.0        # Default: 1 frame per second
HIGH_FPS = 3.0        # When person detected: 3 frames per second
LOW_FPS = 0.5         # When idle (no person): 0.5 frames per second
HIGH_FPS_DURATION = 5   # Seconds to stay at high FPS after person detected
LOW_FPS_THRESHOLD = 10  # Seconds without person before dropping to low FPS
# ──────────────────────────────────────────────
# Train/Test Split (Camera-Level)
# ──────────────────────────────────────────────
TEST_CAMERAS = 4   # Number of cameras to hold out for testing (when >= 5 exist)
RANDOM_SEED = 42   # For reproducible camera selection
# ──────────────────────────────────────────────
# Training
# ──────────────────────────────────────────────
TRAIN_EPOCHS = 100
TRAIN_BATCH = 16           # Batch size for training (adjust for VRAM)
TRAIN_IMGSZ = 640          # Training image size
EARLY_STOP_PATIENCE = 15   # Stop if no improvement for N epochs
TRAIN_WORKERS = 8          # DataLoader workers
TRAIN_PROJECT = os.path.join(BASE_DIR, "runs", "detect")
TRAIN_NAME = "person_detection"
# ──────────────────────────────────────────────
# Video file extensions to process (lowercase; uppercase matched at discovery)
# ──────────────────────────────────────────────
VIDEO_EXTENSIONS = {".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv"}

173
run_pipeline.py Normal file
View File

@ -0,0 +1,173 @@
"""
run_pipeline.py
Master pipeline script that orchestrates the full workflow:
Phase 1: Extract frames from videos (adaptive sampling + YOLO detection)
Phase 2: Split dataset by camera (train/test)
Phase 3: Train yolo26n.pt on the dataset
Designed to run overnight — resumable from any phase.
Usage:
python run_pipeline.py # Run full pipeline
python run_pipeline.py --phase 2 # Start from phase 2 (skip extraction)
python run_pipeline.py --phase 3 # Start from phase 3 (skip extract + split)
python run_pipeline.py --extract-only # Only extract (no split or train)
"""
import os
import sys
import argparse
import logging
import traceback
from datetime import datetime
import pipeline_config as cfg
# ──────────────────────────────────────────────
# Logging
# ──────────────────────────────────────────────
# One timestamped log file per pipeline run, mirrored to stdout.
os.makedirs(cfg.LOG_DIR, exist_ok=True)
log_file = os.path.join(cfg.LOG_DIR, f"pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
def phase1_extract():
    """Phase 1: Extract dataset from videos (delegates to extract_dataset)."""
    # Fix: the banner used '"" * 60' (empty string repeated — logs nothing;
    # the divider character was evidently lost). Restored a visible divider.
    logger.info("\n" + "█" * 60)
    logger.info("█ PHASE 1: DATASET EXTRACTION")
    logger.info("█" * 60 + "\n")
    # Imported lazily so later phases can run without extraction dependencies.
    from extract_dataset import extract_all
    stats = extract_all()
    return stats
def phase2_split():
    """Phase 2: Split dataset by camera (delegates to split_dataset)."""
    # Fix: restored the divider that was '"" * 60' (logged nothing).
    logger.info("\n" + "█" * 60)
    logger.info("█ PHASE 2: CAMERA-LEVEL TRAIN/TEST SPLIT")
    logger.info("█" * 60 + "\n")
    # Lazy import keeps phase modules decoupled until needed.
    from split_dataset import split_dataset
    split_info = split_dataset()
    return split_info
def phase3_train():
    """Phase 3: Train the model (delegates to train_model)."""
    # Fix: restored the divider that was '"" * 60' (logged nothing).
    logger.info("\n" + "█" * 60)
    logger.info("█ PHASE 3: MODEL TRAINING")
    logger.info("█" * 60 + "\n")
    # Lazy import avoids loading training deps unless this phase runs.
    from train_model import train_model
    best_weights = train_model()
    return best_weights
def run_pipeline(start_phase: int = 1, extract_only: bool = False):
    """Run the full pipeline from the specified starting phase.

    Args:
        start_phase: 1 = extract, 2 = split, 3 = train; earlier phases are
            skipped (useful when resuming a partially completed run).
        extract_only: stop after Phase 1.

    Fixes: the banner rows concatenated empty strings ('"" + "" * 58 + ""'),
    which logged blank lines — restored box-drawing characters consistent
    with the surviving '║ ... ║' rows; dropped the unused `best_weights`
    local from the Phase 3 call.
    """
    pipeline_start = datetime.now()
    logger.info("╔" + "═" * 58 + "╗")
    logger.info("║ PERSON DETECTION PIPELINE ║")
    logger.info("║ " + f"Started: {pipeline_start.strftime('%Y-%m-%d %H:%M:%S')}".ljust(57) + "║")
    logger.info("╚" + "═" * 58 + "╝")
    logger.info("")
    logger.info(f"Configuration:")
    logger.info(f" Video directory: {cfg.VIDEO_DIR}")
    logger.info(f" Dataset output: {cfg.DATASET_DIR}")
    logger.info(f" Detector model: {cfg.DETECTOR_MODEL}")
    logger.info(f" Training model: {cfg.TRAIN_MODEL}")
    logger.info(f" Max dataset size: {cfg.MAX_DATASET_SIZE_GB} GB")
    logger.info(f" Starting phase: {start_phase}")
    logger.info(f" Extract only: {extract_only}")
    logger.info("")
    try:
        # Phase 1: Extract
        if start_phase <= 1:
            p1_start = datetime.now()
            phase1_extract()
            p1_duration = datetime.now() - p1_start
            logger.info(f"\nPhase 1 completed in {p1_duration}")
            if extract_only:
                logger.info("Extract-only mode — stopping after Phase 1.")
                return
        # Phase 2: Split
        if start_phase <= 2:
            p2_start = datetime.now()
            phase2_split()
            p2_duration = datetime.now() - p2_start
            logger.info(f"\nPhase 2 completed in {p2_duration}")
        # Phase 3: Train
        if start_phase <= 3:
            p3_start = datetime.now()
            phase3_train()
            p3_duration = datetime.now() - p3_start
            logger.info(f"\nPhase 3 completed in {p3_duration}")
        # Final summary
        pipeline_end = datetime.now()
        total_duration = pipeline_end - pipeline_start
        logger.info("\n" + "╔" + "═" * 58 + "╗")
        logger.info("║ PIPELINE COMPLETED SUCCESSFULLY ║")
        logger.info("║ " + f"Duration: {total_duration}".ljust(57) + "║")
        logger.info("╚" + "═" * 58 + "╝")
    except KeyboardInterrupt:
        logger.warning("\n\nPipeline interrupted by user. Progress has been checkpointed.")
        logger.warning("Re-run to resume from where you left off.")
        sys.exit(1)
    except Exception as e:
        logger.error(f"\n\nPipeline failed with error: {e}")
        logger.error(traceback.format_exc())
        logger.error("Progress has been checkpointed. Fix the error and re-run.")
        sys.exit(1)
def main():
    """Parse CLI arguments and launch the pipeline."""
    arg_parser = argparse.ArgumentParser(
        description="Person detection dataset pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python run_pipeline.py Run full pipeline
python run_pipeline.py --phase 2 Start from split phase
python run_pipeline.py --phase 3 Start from training phase
python run_pipeline.py --extract-only Only extract dataset
""",
    )
    arg_parser.add_argument(
        "--phase",
        type=int,
        default=1,
        choices=[1, 2, 3],
        help="Starting phase (1=extract, 2=split, 3=train)",
    )
    arg_parser.add_argument(
        "--extract-only",
        action="store_true",
        help="Only run extraction phase",
    )
    opts = arg_parser.parse_args()
    run_pipeline(start_phase=opts.phase, extract_only=opts.extract_only)
# Standard script entry point.
if __name__ == "__main__":
    main()

232
split_dataset.py Normal file
View File

@ -0,0 +1,232 @@
"""
split_dataset.py
Camera-level train/test split for the extracted dataset.
Splits ENTIRE cameras into train or test sets to prevent data leakage
from similar/consecutive surveillance frames.
- If >= 5 cameras: hold out 4 cameras for test
- If < 5 cameras: hold out 1 camera for test
"""
import os
import sys
import json
import shutil
import random
import logging
from pathlib import Path
from datetime import datetime
import pipeline_config as cfg
# ──────────────────────────────────────────────
# Logging
# ──────────────────────────────────────────────
# One timestamped log file per split run, mirrored to stdout.
os.makedirs(cfg.LOG_DIR, exist_ok=True)
log_file = os.path.join(cfg.LOG_DIR, f"split_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
def load_camera_mapping() -> dict:
    """Load the camera→frames mapping created during extraction.

    Exits the process with an error if extraction has not been run yet.
    """
    mapping_path = os.path.join(cfg.DATASET_DIR, "camera_mapping.json")
    if os.path.exists(mapping_path):
        with open(mapping_path, "r") as fh:
            return json.load(fh)
    logger.error(f"Camera mapping not found: {mapping_path}")
    logger.error("Run extract_dataset.py first!")
    sys.exit(1)
def split_cameras(camera_names: list[str]) -> tuple[list[str], list[str]]:
    """
    Split camera names into train and test sets.

    Deterministic for a fixed cfg.RANDOM_SEED. With >= 5 cameras, up to
    cfg.TEST_CAMERAS are held out (always leaving at least one for training);
    with fewer, exactly one camera is held out.

    Returns (train_cameras, test_cameras).
    """
    random.seed(cfg.RANDOM_SEED)
    n_cameras = len(camera_names)
    if n_cameras < 2:
        logger.error(f"Need at least 2 cameras for splitting, found {n_cameras}")
        sys.exit(1)
    # Determine how many cameras to hold out for testing.
    n_test = min(cfg.TEST_CAMERAS, n_cameras - 1) if n_cameras >= 5 else 1
    shuffled = list(camera_names)
    random.shuffle(shuffled)
    return shuffled[n_test:], shuffled[:n_test]
def move_frames(
    camera_mapping: dict,
    camera_names: list[str],
    split_name: str,
    src_images_dir: str,
    src_labels_dir: str,
):
    """Move frames (image + matching label) for the given cameras into the
    split directory named *split_name*; returns the number of frames handled."""
    dst_images = os.path.join(cfg.DATASET_DIR, "images", split_name)
    dst_labels = os.path.join(cfg.DATASET_DIR, "labels", split_name)
    for directory in (dst_images, dst_labels):
        os.makedirs(directory, exist_ok=True)
    total_moved = 0
    for cam_name in camera_names:
        for img_filename in camera_mapping[cam_name].get("frames", []):
            # Move the image if it is still in the source directory.
            src_img = os.path.join(src_images_dir, img_filename)
            if os.path.exists(src_img):
                shutil.move(src_img, os.path.join(dst_images, img_filename))
            # Move the matching YOLO label file alongside it.
            lbl_filename = img_filename.replace(".jpg", ".txt")
            src_lbl = os.path.join(src_labels_dir, lbl_filename)
            if os.path.exists(src_lbl):
                shutil.move(src_lbl, os.path.join(dst_labels, lbl_filename))
            total_moved += 1
    logger.info(f" {split_name}: {total_moved} frames from {len(camera_names)} cameras")
    return total_moved
def create_dataset_yaml(train_cameras: list[str], test_cameras: list[str]):
    """Write dataset.yaml for YOLO training and return its path.

    The held-out camera split doubles as the validation set.
    """
    yaml_path = os.path.join(cfg.DATASET_DIR, "dataset.yaml")
    # YOLO expects forward slashes, even on Windows.
    dataset_path = cfg.DATASET_DIR.replace("\\", "/")
    content = f"""# Auto-generated dataset config for person detection
# Generated: {datetime.now().isoformat()}
#
# Train cameras: {', '.join(train_cameras)}
# Test cameras: {', '.join(test_cameras)}
path: {dataset_path}
train: images/train
val: images/test
nc: 1
names: ['person']
"""
    Path(yaml_path).write_text(content)
    logger.info(f"Created dataset.yaml at {yaml_path}")
    return yaml_path
def split_dataset() -> dict:
    """
    Main entry point: split extracted dataset by camera.

    Moves frames from images/all + labels/all into train/test directories
    according to a camera-level split (whole cameras per split, to avoid
    near-duplicate surveillance frames leaking across splits), then writes
    dataset.yaml, classes.txt and split_info.json.

    Returns split info dict.
    """
    logger.info("=" * 60)
    logger.info("DATASET SPLIT (CAMERA-LEVEL)")
    logger.info("=" * 60)
    # Load camera mapping produced by extract_dataset.py (exits if missing).
    camera_mapping = load_camera_mapping()
    camera_names = sorted(camera_mapping.keys())
    logger.info(f"Found {len(camera_names)} cameras: {camera_names}")
    # Count frames per camera
    for cam in camera_names:
        n_frames = len(camera_mapping[cam].get("frames", []))
        logger.info(f" {cam}: {n_frames} frames")
    # Split cameras (deterministic under cfg.RANDOM_SEED)
    train_cameras, test_cameras = split_cameras(camera_names)
    logger.info(f"\nTrain cameras ({len(train_cameras)}): {train_cameras}")
    logger.info(f"Test cameras ({len(test_cameras)}): {test_cameras}")
    # Source directories
    src_images = os.path.join(cfg.DATASET_DIR, "images", "all")
    src_labels = os.path.join(cfg.DATASET_DIR, "labels", "all")
    if not os.path.exists(src_images):
        logger.error(f"Source images directory not found: {src_images}")
        sys.exit(1)
    # Move frames to train/test directories
    logger.info("\nMoving frames to split directories...")
    n_train = move_frames(camera_mapping, train_cameras, "train", src_images, src_labels)
    n_test = move_frames(camera_mapping, test_cameras, "test", src_images, src_labels)
    # Clean up the 'all' staging directories once emptied; best-effort only.
    try:
        remaining_imgs = os.listdir(src_images)
        if not remaining_imgs:
            os.rmdir(src_images)
            src_labels_check = os.path.join(cfg.DATASET_DIR, "labels", "all")
            remaining_lbls = os.listdir(src_labels_check)
            if not remaining_lbls:
                os.rmdir(src_labels_check)
            logger.info("Cleaned up empty 'all' directories")
        else:
            logger.warning(f"{len(remaining_imgs)} orphan images left in 'all' directory")
    except Exception as e:
        # Deliberate best-effort: cleanup failure must not fail the split.
        logger.warning(f"Cleanup note: {e}")
    # Create dataset.yaml for YOLO training
    yaml_path = create_dataset_yaml(train_cameras, test_cameras)
    # Create classes.txt (single class: person)
    classes_path = os.path.join(cfg.DATASET_DIR, "classes.txt")
    with open(classes_path, "w") as f:
        f.write("person\n")
    # Save split info
    split_info = {
        "train_cameras": train_cameras,
        "test_cameras": test_cameras,
        "train_frames": n_train,
        "test_frames": n_test,
        "random_seed": cfg.RANDOM_SEED,
        "yaml_path": yaml_path,
        "timestamp": datetime.now().isoformat(),
    }
    split_info_path = os.path.join(cfg.DATASET_DIR, "split_info.json")
    with open(split_info_path, "w") as f:
        json.dump(split_info, f, indent=2)
    logger.info("\n" + "=" * 60)
    logger.info("SPLIT COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Train: {n_train} frames from {len(train_cameras)} cameras")
    logger.info(f"Test: {n_test} frames from {len(test_cameras)} cameras")
    # NOTE(review): this ratio raises ZeroDivisionError if both splits are
    # empty (possible when camera_mapping lists no frames) — confirm intended.
    logger.info(f"Ratio: {n_train/(n_train+n_test)*100:.1f}% / {n_test/(n_train+n_test)*100:.1f}%")
    return split_info
# Allow running the split directly as a standalone script.
if __name__ == "__main__":
    split_dataset()

178
train_model.py Normal file
View File

@ -0,0 +1,178 @@
"""
train_model.py
Fine-tune yolo26n.pt on the extracted person detection dataset.
Optimized for NVIDIA RTX A5000 (16GB VRAM):
- Mixed precision (AMP) enabled
- Batch size 16, image size 640
- Early stopping with patience 15
- Full YOLO augmentation pipeline
"""
import os
import sys
import json
import logging
from pathlib import Path
from datetime import datetime
from ultralytics import YOLO
import pipeline_config as cfg
# ──────────────────────────────────────────────
# Logging
# ──────────────────────────────────────────────
# One timestamped log file per training run, mirrored to stdout.
os.makedirs(cfg.LOG_DIR, exist_ok=True)
log_file = os.path.join(cfg.LOG_DIR, f"train_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
def verify_dataset() -> str:
    """Verify the split dataset layout and return the path to dataset.yaml.

    Exits the process if dataset.yaml, the train directory, or any training
    images are missing; an empty test set only produces a warning.
    """
    yaml_path = os.path.join(cfg.DATASET_DIR, "dataset.yaml")
    if not os.path.exists(yaml_path):
        logger.error(f"dataset.yaml not found at {yaml_path}")
        logger.error("Run extract_dataset.py and split_dataset.py first!")
        sys.exit(1)
    train_images = os.path.join(cfg.DATASET_DIR, "images", "train")
    test_images = os.path.join(cfg.DATASET_DIR, "images", "test")
    for split_dir, label in ((train_images, "Train"), (test_images, "Test")):
        if not os.path.exists(split_dir):
            logger.error(f"{label} images directory not found: {split_dir}")
            sys.exit(1)

    def _count_jpgs(directory: str) -> int:
        # Only .jpg frames count — that is what extraction produces.
        return sum(1 for name in os.listdir(directory) if name.endswith(".jpg"))

    n_train = _count_jpgs(train_images)
    n_test = _count_jpgs(test_images)
    logger.info(f"Dataset verified:")
    logger.info(f" Train images: {n_train}")
    logger.info(f" Test images: {n_test}")
    if n_train == 0:
        logger.error("No training images found!")
        sys.exit(1)
    if n_test == 0:
        logger.warning("No test images found — training will proceed without validation.")
    return yaml_path
def train_model() -> str:
    """
    Main entry point: train yolo26n.pt on the person detection dataset.

    Verifies the dataset, runs Ultralytics training with the configuration
    from pipeline_config, copies the best weights to the project root, and
    writes a JSON training summary.

    Returns path to the best weights (falls back to last.pt if best.pt is
    missing).

    Fixes: the "TRAINING IN PROGRESS" banner used '"" * 40' (empty string
    repeated — logged nothing); also dropped the unused `results` binding
    from the train() call.
    """
    logger.info("=" * 60)
    logger.info("MODEL TRAINING STARTED")
    logger.info("=" * 60)
    # Verify dataset layout (exits on failure).
    yaml_path = verify_dataset()
    logger.info(f"Dataset config: {yaml_path}")
    # Load base model (auto-downloads if not cached).
    logger.info(f"Loading base model: {cfg.TRAIN_MODEL}")
    model = YOLO(cfg.TRAIN_MODEL)
    logger.info("Base model loaded ✓")
    # Training configuration
    train_args = {
        "data": yaml_path,
        "epochs": cfg.TRAIN_EPOCHS,
        "batch": cfg.TRAIN_BATCH,
        "imgsz": cfg.TRAIN_IMGSZ,
        "device": 0,  # GPU
        "workers": cfg.TRAIN_WORKERS,
        "patience": cfg.EARLY_STOP_PATIENCE,
        "project": cfg.TRAIN_PROJECT,
        "name": cfg.TRAIN_NAME,
        "exist_ok": True,  # Overwrite previous run
        "pretrained": True,
        "save": True,
        "save_period": 10,  # Save checkpoint every 10 epochs
        "val": True,
        "plots": True,  # Generate training plots
        "verbose": True,
        # Augmentation (YOLO defaults are good, but explicit for clarity)
        "hsv_h": 0.015,
        "hsv_s": 0.7,
        "hsv_v": 0.4,
        "degrees": 0.0,
        "translate": 0.1,
        "scale": 0.5,
        "shear": 0.0,
        "flipud": 0.0,  # No vertical flip (people don't appear upside down)
        "fliplr": 0.5,  # Horizontal flip is fine
        "mosaic": 1.0,
        "mixup": 0.1,
    }
    logger.info("Training configuration:")
    for k, v in train_args.items():
        logger.info(f" {k}: {v}")
    # Start training
    logger.info("\n" + "=" * 40)
    logger.info("TRAINING IN PROGRESS...")
    logger.info("=" * 40)
    start_time = datetime.now()
    model.train(**train_args)
    end_time = datetime.now()
    duration = end_time - start_time
    # Results
    logger.info("\n" + "=" * 60)
    logger.info("TRAINING COMPLETE")
    logger.info("=" * 60)
    logger.info(f"Duration: {duration}")
    # Locate the weights Ultralytics wrote for this run.
    best_weights = os.path.join(cfg.TRAIN_PROJECT, cfg.TRAIN_NAME, "weights", "best.pt")
    last_weights = os.path.join(cfg.TRAIN_PROJECT, cfg.TRAIN_NAME, "weights", "last.pt")
    if os.path.exists(best_weights):
        logger.info(f"Best weights: {best_weights}")
        # Copy best weights to project root for easy access.
        import shutil
        output_model = os.path.join(cfg.BASE_DIR, "person_detector_best.pt")
        shutil.copy2(best_weights, output_model)
        logger.info(f"Copied best model to: {output_model}")
    else:
        logger.warning(f"Best weights not found at expected path: {best_weights}")
        best_weights = last_weights
    # Save training summary (args stringified for JSON safety).
    summary = {
        "start_time": start_time.isoformat(),
        "end_time": end_time.isoformat(),
        "duration_seconds": duration.total_seconds(),
        "best_weights": best_weights,
        "train_args": {k: str(v) for k, v in train_args.items()},
    }
    summary_path = os.path.join(cfg.TRAIN_PROJECT, cfg.TRAIN_NAME, "training_summary.json")
    os.makedirs(os.path.dirname(summary_path), exist_ok=True)
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2)
    return best_weights
# Allow running training directly as a standalone script.
if __name__ == "__main__":
    train_model()