# utopia-surveillance-tool/run_pipeline.py

"""
run_pipeline.py
───────────────
Master pipeline script that orchestrates the full workflow:

  Phase 1: Extract frames from videos (adaptive sampling + YOLO detection)
  Phase 2: Split dataset by camera (train/test)
  Phase 3: Train yolo26n.pt on the dataset

Designed to run overnight — resumable from any phase.

Usage:
    python run_pipeline.py                  # Run full pipeline
    python run_pipeline.py --phase 2        # Start from phase 2 (skip extraction)
    python run_pipeline.py --phase 3        # Start from phase 3 (skip extract + split)
    python run_pipeline.py --extract-only   # Only extract (no split or train)
"""

import os
import sys
import argparse
import logging
import traceback
from datetime import datetime

import pipeline_config as cfg

# ──────────────────────────────────────────────
# Logging
# ──────────────────────────────────────────────
# Create the log directory up front so the file handler can open its target.
os.makedirs(cfg.LOG_DIR, exist_ok=True)
# One log file per run, named after the moment this module was loaded.
log_file = os.path.join(cfg.LOG_DIR, f"pipeline_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        # Explicit UTF-8: this file logs non-ASCII text (e.g. an em dash), and
        # the locale-default encoding is not guaranteed to be able to encode it.
        logging.FileHandler(log_file, encoding="utf-8"),
        # Mirror every record to the console as well.
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)


def phase1_extract():
    """Run phase 1: log the phase banner, then extract the dataset.

    ``extract_dataset`` is imported inside the function, after the banner has
    been logged, so the module is only loaded when this phase actually runs.

    Returns:
        Whatever ``extract_dataset.extract_all()`` returns.
    """
    rule = "=" * 60
    for banner_line in ("\n" + rule, "=  PHASE 1: DATASET EXTRACTION", rule + "\n"):
        logger.info(banner_line)

    from extract_dataset import extract_all
    return extract_all()


def phase2_split():
    """Run phase 2: log the phase banner, then split the dataset.

    ``split_dataset`` is imported inside the function, after the banner has
    been logged, so the module is only loaded when this phase actually runs.

    Returns:
        Whatever ``split_dataset.split_dataset()`` returns.
    """
    rule = "=" * 60
    for banner_line in ("\n" + rule, "=  PHASE 2: CAMERA-LEVEL TRAIN/TEST SPLIT", rule + "\n"):
        logger.info(banner_line)

    from split_dataset import split_dataset
    return split_dataset()


def phase3_train():
    """Run phase 3: log the phase banner, then train the model.

    ``train_model`` is imported inside the function, after the banner has
    been logged, so the module is only loaded when this phase actually runs.

    Returns:
        Whatever ``train_model.train_model()`` returns.
    """
    rule = "=" * 60
    for banner_line in ("\n" + rule, "=  PHASE 3: MODEL TRAINING", rule + "\n"):
        logger.info(banner_line)

    from train_model import train_model
    return train_model()


def _log_banner(*rows: str) -> None:
    """Log ``rows`` inside an ASCII box whose borders and rows share one width."""
    inner_width = 58
    border = "+" + "-" * inner_width + "+"
    logger.info(border)
    for row in rows:
        # Two-space left margin, padded so the closing "|" lines up with the border.
        logger.info("|" + f"  {row}".ljust(inner_width) + "|")
    logger.info(border)


def _run_timed_phase(number: int, phase_fn) -> None:
    """Call ``phase_fn`` and log how long phase ``number`` took."""
    started = datetime.now()
    phase_fn()
    logger.info(f"\nPhase {number} completed in {datetime.now() - started}")


def run_pipeline(start_phase: int = 1, extract_only: bool = False):
    """Run the pipeline from ``start_phase`` onward.

    Phases run in order (1 = extract, 2 = split, 3 = train). Every phase whose
    number is at least ``start_phase`` is executed, unless ``extract_only``
    stops the run first.

    Args:
        start_phase: First phase to execute; earlier phases are skipped.
        extract_only: Never run the split or training phases. With
            ``start_phase`` of 1 the run stops right after extraction; with a
            later ``start_phase`` nothing is executed and a warning is logged.

    Returns:
        None.

    Raises:
        SystemExit: With status 1 when the run is interrupted with Ctrl-C or
            a phase raises any other exception (the error is logged first).
    """
    pipeline_start = datetime.now()

    _log_banner(
        "PERSON DETECTION PIPELINE",
        f"Started: {pipeline_start.strftime('%Y-%m-%d %H:%M:%S')}",
    )
    logger.info("")
    logger.info("Configuration:")
    logger.info(f"  Video directory:  {cfg.VIDEO_DIR}")
    logger.info(f"  Dataset output:   {cfg.DATASET_DIR}")
    logger.info(f"  Detector model:   {cfg.DETECTOR_MODEL}")
    logger.info(f"  Training model:   {cfg.TRAIN_MODEL}")
    logger.info(f"  Max dataset size: {cfg.MAX_DATASET_SIZE_GB} GB")
    logger.info(f"  Starting phase:   {start_phase}")
    logger.info(f"  Extract only:     {extract_only}")
    logger.info("")

    try:
        # Phase 1: Extract
        if start_phase <= 1:
            _run_timed_phase(1, phase1_extract)

        # Checked outside the phase-1 branch so that extract-only also blocks
        # split/train when the caller asked to start from a later phase.
        if extract_only:
            if start_phase <= 1:
                logger.info("Extract-only mode — stopping after Phase 1.")
            else:
                logger.warning(
                    f"Extract-only mode with starting phase {start_phase}: "
                    "extraction was skipped and later phases are disabled; nothing to run."
                )
            return

        # Phase 2: Split
        if start_phase <= 2:
            _run_timed_phase(2, phase2_split)

        # Phase 3: Train
        if start_phase <= 3:
            _run_timed_phase(3, phase3_train)

        # Final summary
        total_duration = datetime.now() - pipeline_start
        logger.info("")
        _log_banner(
            "PIPELINE COMPLETED SUCCESSFULLY",
            f"Duration: {total_duration}",
        )

    except KeyboardInterrupt:
        logger.warning("\n\nPipeline interrupted by user. Progress has been checkpointed.")
        logger.warning("Re-run to resume from where you left off.")
        sys.exit(1)

    except Exception as e:
        # logger.exception logs at ERROR level and appends the active traceback.
        logger.exception(f"\n\nPipeline failed with error: {e}")
        logger.error("Progress has been checkpointed. Fix the error and re-run.")
        sys.exit(1)


def main(argv=None):
    """Parse command-line arguments and start the pipeline.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes argparse
            read them from ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 (argparse usage error) for invalid
            arguments, including ``--extract-only`` combined with a
            ``--phase`` other than 1.
    """
    parser = argparse.ArgumentParser(
        description="Person detection dataset pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run_pipeline.py                  Run full pipeline
  python run_pipeline.py --phase 2        Start from split phase
  python run_pipeline.py --phase 3        Start from training phase
  python run_pipeline.py --extract-only   Only extract dataset
        """,
    )
    parser.add_argument(
        "--phase",
        type=int,
        default=1,
        choices=[1, 2, 3],
        help="Starting phase (1=extract, 2=split, 3=train)",
    )
    parser.add_argument(
        "--extract-only",
        action="store_true",
        help="Only run extraction phase",
    )

    args = parser.parse_args(argv)

    # Extraction is phase 1, so skipping it while asking for "extract only"
    # is contradictory; reject it instead of guessing what the user meant.
    if args.extract_only and args.phase != 1:
        parser.error("--extract-only cannot be combined with --phase 2 or 3 (extraction is phase 1)")

    run_pipeline(start_phase=args.phase, extract_only=args.extract_only)


if __name__ == "__main__":
    main()