# utopia-surveillance-tool/extract_dataset.py
"""
extract_dataset.py
──────────────────
Extracts person-detection frames from surveillance videos using adaptive
frame sampling and yolo26x.pt for auto-labeling.
Features:
- Adaptive FPS: baseline 1 FPS, high 3 FPS (person), low 0.5 FPS (idle)
- GPU-accelerated YOLO inference in batches
- Per-video checkpointing for crash recovery
- 50 GB dataset size cap
- Organizes output in YOLO detection format
"""
import os
import sys
import cv2
import json
import time
import glob
import logging
import re
from pathlib import Path
from datetime import datetime
from ultralytics import YOLO
import pipeline_config as cfg
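
# pipeline_config is the project's own settings module. For orientation, a
# minimal sketch of the attributes this script reads (attribute names are
# taken from this file; the example values are illustrative assumptions,
# not the real config):
#
#   LOG_DIR = "logs"
#   CHECKPOINT_DIR = "checkpoints"
#   VIDEO_DIR = "videos"                   # one subdirectory per camera
#   VIDEO_EXTENSIONS = [".mp4", ".avi", ".mkv"]
#   DATASET_DIR = "dataset"
#   MAX_DATASET_SIZE_GB = 50               # cap from the module docstring
#   BASE_FPS, HIGH_FPS, LOW_FPS = 1.0, 3.0, 0.5
#   HIGH_FPS_DURATION = 10.0               # seconds to stay in HIGH after a person
#   LOW_FPS_THRESHOLD = 300.0              # idle seconds before dropping to LOW
#   BATCH_SIZE = 16
#   DETECTOR_MODEL = "yolo26x.pt"
#   DETECTION_CONF, DETECTION_IOU = 0.25, 0.45
#   PERSON_CLASS_ID = 0                    # "person" in COCO class numbering
#   JPEG_QUALITY = 90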
# ──────────────────────────────────────────────
# Logging
# ──────────────────────────────────────────────
os.makedirs(cfg.LOG_DIR, exist_ok=True)
os.makedirs(cfg.CHECKPOINT_DIR, exist_ok=True)
log_file = os.path.join(cfg.LOG_DIR, f"extract_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout),
    ],
)
logger = logging.getLogger(__name__)
# ──────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────
def get_dataset_size_gb(dataset_dir: str) -> float:
    """Return total size of the dataset directory in GB."""
    total = 0
    for dirpath, _, filenames in os.walk(dataset_dir):
        for f in filenames:
            total += os.path.getsize(os.path.join(dirpath, f))
    return total / (1024 ** 3)


def get_checkpoint_path(video_path: str) -> str:
    """Return checkpoint file path for a given video."""
    video_name = Path(video_path).stem
    return os.path.join(cfg.CHECKPOINT_DIR, f"{video_name}.json")


def load_checkpoint(video_path: str) -> dict | None:
    """Load checkpoint for a video if it exists."""
    cp_path = get_checkpoint_path(video_path)
    if os.path.exists(cp_path):
        with open(cp_path, "r") as f:
            return json.load(f)
    return None


def save_checkpoint(video_path: str, data: dict):
    """Save checkpoint for a video."""
    cp_path = get_checkpoint_path(video_path)
    with open(cp_path, "w") as f:
        json.dump(data, f, indent=2)


def mark_video_done(video_path: str, stats: dict):
    """Mark a video as fully processed."""
    stats["done"] = True
    save_checkpoint(video_path, stats)


def discover_videos() -> dict[str, list[str]]:
    """
    Discover all camera directories and their video files.
    Returns {camera_name: [video_paths]}.
    """
    cameras = {}
    video_dir = Path(cfg.VIDEO_DIR)
    if not video_dir.exists():
        logger.error(f"Video directory not found: {cfg.VIDEO_DIR}")
        sys.exit(1)
    for cam_dir in sorted(video_dir.iterdir()):
        if cam_dir.is_dir():
            videos = []
            for ext in cfg.VIDEO_EXTENSIONS:
                videos.extend(glob.glob(str(cam_dir / f"*{ext}")))
                videos.extend(glob.glob(str(cam_dir / f"*{ext.upper()}")))
            videos = sorted(set(videos))
            if videos:
                cameras[cam_dir.name] = videos
                logger.info(f"Camera '{cam_dir.name}': {len(videos)} videos")
    if not cameras:
        logger.error("No video files found in any camera directory!")
        sys.exit(1)
    return cameras


def sanitize_camera_name(cam_name: str) -> str:
    """Create a filesystem-safe camera identifier."""
    # Collapse any run of whitespace, hyphens, or underscores into a single
    # underscore; the previous chained str.replace calls could leave doubled
    # underscores behind (e.g. "a - b" -> "a___b" -> "a__b").
    return re.sub(r"[\s_-]+", "_", cam_name).strip("_").lower()
# ──────────────────────────────────────────────
# Adaptive Sampler State Machine
# ──────────────────────────────────────────────
class AdaptiveSampler:
    """
    State machine for adaptive frame sampling.
    States:
    - NORMAL: sample at BASE_FPS (1 fps)
    - HIGH: sample at HIGH_FPS (3 fps) — person recently detected
    - LOW: sample at LOW_FPS (0.5 fps) — long idle period
    """

    def __init__(self, video_fps: float):
        self.video_fps = video_fps
        self.state = "NORMAL"
        self.current_sample_fps = cfg.BASE_FPS
        self.last_person_time = 0.0  # video timestamp of last person detection
        self.no_person_streak = 0.0  # seconds since last person

    def update(self, timestamp: float, person_detected: bool):
        """Update state based on detection result at given video timestamp."""
        if person_detected:
            self.last_person_time = timestamp
            self.no_person_streak = 0.0
            self.state = "HIGH"
            self.current_sample_fps = cfg.HIGH_FPS
        else:
            self.no_person_streak = timestamp - self.last_person_time
            if self.state == "HIGH":
                # Stay high for HIGH_FPS_DURATION after last detection
                if self.no_person_streak > cfg.HIGH_FPS_DURATION:
                    self.state = "NORMAL"
                    self.current_sample_fps = cfg.BASE_FPS
            elif self.state == "NORMAL":
                if self.no_person_streak > cfg.LOW_FPS_THRESHOLD:
                    self.state = "LOW"
                    self.current_sample_fps = cfg.LOW_FPS
            # LOW stays LOW until a person is detected again

    def get_frame_interval(self) -> int:
        """Return the frame interval (number of video frames to skip between samples)."""
        return max(1, int(self.video_fps / self.current_sample_fps))
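
# Illustrative walk-through of the state machine (using the example config
# values sketched near the imports, i.e. BASE_FPS=1, HIGH_FPS=3,
# HIGH_FPS_DURATION=10):
#
#   sampler = AdaptiveSampler(video_fps=30.0)
#   sampler.get_frame_interval()   # -> 30: sample every 30th frame (1 fps)
#   sampler.update(timestamp=12.0, person_detected=True)
#   sampler.get_frame_interval()   # -> 10: every 10th frame (3 fps)
#   sampler.update(timestamp=30.0, person_detected=False)
#   sampler.state                  # -> "NORMAL": 18 s idle > HIGH_FPS_DURATION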
# ──────────────────────────────────────────────
# Core Extraction
# ──────────────────────────────────────────────
def process_video(
    model: YOLO,
    video_path: str,
    camera_name: str,
    output_images_dir: str,
    output_labels_dir: str,
    global_stats: dict,
) -> dict:
    """
    Process a single video file: extract frames with adaptive sampling,
    detect persons with YOLO, save frames and labels.
    Returns per-video stats dict.
    """
    cam_safe = sanitize_camera_name(camera_name)
    video_name = Path(video_path).stem

    # Check if already done
    checkpoint = load_checkpoint(video_path)
    if checkpoint and checkpoint.get("done"):
        logger.info(f"  ⏭ Skipping (already done): {video_name}")
        return checkpoint

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        logger.error(f"  ✗ Cannot open video: {video_path}")
        return {"error": "cannot_open", "video": video_name}

    video_fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration_sec = total_frames / video_fps if video_fps > 0 else 0
    logger.info(f"  ▶ Processing: {video_name}")
    logger.info(f"    FPS: {video_fps:.1f}, Frames: {total_frames}, Duration: {duration_sec:.0f}s")

    # Resume from checkpoint
    start_frame = 0
    frame_counter = 0
    if checkpoint:
        start_frame = checkpoint.get("last_frame", 0)
        frame_counter = checkpoint.get("frame_counter", 0)
        logger.info(f"    Resuming from frame {start_frame}")

    sampler = AdaptiveSampler(video_fps)
    stats = {
        "video": video_name,
        "camera": camera_name,
        "total_frames": total_frames,
        # Carry counts forward from an interrupted run so totals stay accurate.
        "frames_extracted": checkpoint.get("frames_extracted", 0) if checkpoint else 0,
        "frames_with_person": checkpoint.get("frames_with_person", 0) if checkpoint else 0,
        "frames_without_person": checkpoint.get("frames_without_person", 0) if checkpoint else 0,
        "last_frame": start_frame,
        "frame_counter": frame_counter,
        "done": False,
    }

    # Seek to start position if resuming
    if start_frame > 0:
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

    current_frame_idx = start_frame
    batch_frames = []     # [(frame_idx, frame_array, timestamp), ...]
    batch_save_info = []  # [(output_img_path, output_lbl_path), ...]
    last_checkpoint_time = time.time()
    last_size_check = stats["frames_extracted"]
    last_progress_log = stats["frames_extracted"]

    while True:
        # Check dataset size cap every ~500 extracted frames. (A plain
        # "% 500 == 0" test could never fire when BATCH_SIZE does not divide
        # 500, since the count only changes a whole batch at a time.)
        if stats["frames_extracted"] - last_size_check >= 500:
            last_size_check = stats["frames_extracted"]
            current_size = get_dataset_size_gb(cfg.DATASET_DIR)
            if current_size >= cfg.MAX_DATASET_SIZE_GB:
                logger.warning(f"  ⚠ Dataset size cap reached ({current_size:.1f} GB). Stopping extraction.")
                global_stats["size_cap_reached"] = True
                break

        # Calculate next frame to sample
        interval = sampler.get_frame_interval()
        target_frame = current_frame_idx + interval
        if target_frame >= total_frames:
            break

        # Seek to target frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, target_frame)
        ret, frame = cap.read()
        if not ret:
            break
        current_frame_idx = target_frame
        timestamp = current_frame_idx / video_fps

        # Build unique output filename
        frame_counter += 1
        frame_id = f"{cam_safe}_{frame_counter:06d}"
        img_path = os.path.join(output_images_dir, f"{frame_id}.jpg")
        lbl_path = os.path.join(output_labels_dir, f"{frame_id}.txt")
        batch_frames.append((current_frame_idx, frame, timestamp))
        batch_save_info.append((img_path, lbl_path))

        # Process batch when full
        if len(batch_frames) >= cfg.BATCH_SIZE:
            _process_batch(model, batch_frames, batch_save_info, sampler, stats)
            batch_frames.clear()
            batch_save_info.clear()

        # Checkpoint every 30 seconds
        if time.time() - last_checkpoint_time > 30:
            stats["last_frame"] = current_frame_idx
            stats["frame_counter"] = frame_counter
            save_checkpoint(video_path, stats)
            last_checkpoint_time = time.time()

        # Progress logging every ~1000 extractions (same batch-stepping
        # caveat as the size check above)
        if stats["frames_extracted"] - last_progress_log >= 1000:
            last_progress_log = stats["frames_extracted"]
            pct = (current_frame_idx / total_frames) * 100
            logger.info(
                f"    Progress: {pct:.1f}% | Extracted: {stats['frames_extracted']} | "
                f"Persons: {stats['frames_with_person']} | Mode: {sampler.state} | "
                f"FPS: {sampler.current_sample_fps}"
            )

    # Process remaining batch
    if batch_frames:
        _process_batch(model, batch_frames, batch_save_info, sampler, stats)

    cap.release()
    stats["last_frame"] = current_frame_idx
    stats["frame_counter"] = frame_counter
    mark_video_done(video_path, stats)
    logger.info(
        f"  ✓ Done: {video_name} | Extracted: {stats['frames_extracted']} | "
        f"With person: {stats['frames_with_person']} | "
        f"Without: {stats['frames_without_person']}"
    )
    return stats
def _process_batch(
    model: YOLO,
    batch_frames: list,
    batch_save_info: list,
    sampler: AdaptiveSampler,
    stats: dict,
) -> int:
    """
    Run YOLO inference on a batch of frames, save images + labels.
    Returns the number of frames with person detections.
    """
    frames = [f[1] for f in batch_frames]
    timestamps = [f[2] for f in batch_frames]

    # Run YOLO batch inference on GPU
    results = model.predict(
        source=frames,
        conf=cfg.DETECTION_CONF,
        iou=cfg.DETECTION_IOU,
        classes=[cfg.PERSON_CLASS_ID],
        device=0,
        verbose=False,
        half=True,  # FP16 for speed
    )

    persons_count = 0
    for i, result in enumerate(results):
        img_path, lbl_path = batch_save_info[i]
        timestamp = timestamps[i]
        frame = frames[i]

        # Get person detections (predict() already filters by class,
        # so this is a belt-and-braces check)
        boxes = result.boxes
        person_boxes = boxes[boxes.cls == cfg.PERSON_CLASS_ID]
        has_person = len(person_boxes) > 0

        # Update adaptive sampler
        sampler.update(timestamp, has_person)

        # Save frame as JPEG
        cv2.imwrite(img_path, frame, [cv2.IMWRITE_JPEG_QUALITY, cfg.JPEG_QUALITY])

        # Save YOLO-format labels
        h, w = frame.shape[:2]
        with open(lbl_path, "w") as f:
            for box in person_boxes:
                # Convert to YOLO format: class x_center y_center width height (normalized)
                xyxy = box.xyxy[0].cpu().numpy()
                x_center = ((xyxy[0] + xyxy[2]) / 2) / w
                y_center = ((xyxy[1] + xyxy[3]) / 2) / h
                box_w = (xyxy[2] - xyxy[0]) / w
                box_h = (xyxy[3] - xyxy[1]) / h
                f.write(f"0 {x_center:.6f} {y_center:.6f} {box_w:.6f} {box_h:.6f}\n")

        stats["frames_extracted"] += 1
        if has_person:
            stats["frames_with_person"] += 1
            persons_count += 1
        else:
            stats["frames_without_person"] += 1
    return persons_count
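
# For reference, a label file written above holds one line per detected
# person, e.g. (illustrative values):
#
#   0 0.512345 0.433210 0.120000 0.250000
#
# i.e. class id 0, then x_center, y_center, width, height, each normalized
# to [0, 1] by the image dimensions (the standard YOLO detection format).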
# ──────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────
def extract_all() -> dict:
    """
    Main entry point: discover videos, extract dataset.
    Returns {camera_name: [video_stats]}.
    """
    logger.info("=" * 60)
    logger.info("DATASET EXTRACTION STARTED")
    logger.info("=" * 60)

    # Load YOLO model on GPU
    logger.info(f"Loading detector model: {cfg.DETECTOR_MODEL}")
    model = YOLO(cfg.DETECTOR_MODEL)
    model.to("cuda")
    logger.info("Model loaded on GPU ✓")

    # Discover cameras and videos
    cameras = discover_videos()
    total_videos = sum(len(v) for v in cameras.values())
    logger.info(f"Found {len(cameras)} cameras, {total_videos} videos total")

    # Create output directories (flat — split happens later)
    images_dir = os.path.join(cfg.DATASET_DIR, "images", "all")
    labels_dir = os.path.join(cfg.DATASET_DIR, "labels", "all")
    os.makedirs(images_dir, exist_ok=True)
    os.makedirs(labels_dir, exist_ok=True)

    # Save camera mapping for split script
    camera_mapping_path = os.path.join(cfg.DATASET_DIR, "camera_mapping.json")
    # Load existing mapping if resuming
    if os.path.exists(camera_mapping_path):
        with open(camera_mapping_path, "r") as f:
            camera_mapping = json.load(f)
    else:
        camera_mapping = {}

    global_stats = {"size_cap_reached": False}
    all_stats = {}
    video_num = 0
    for cam_name, video_list in cameras.items():
        cam_safe = sanitize_camera_name(cam_name)
        logger.info(f"\n{'─' * 40}")
        logger.info(f"Camera: {cam_name} ({len(video_list)} videos)")
        logger.info(f"{'─' * 40}")
        cam_stats = []
        for video_path in video_list:
            video_num += 1
            logger.info(f"\n[{video_num}/{total_videos}]")
            if global_stats["size_cap_reached"]:
                logger.warning("Size cap reached — skipping remaining videos.")
                break
            vstats = process_video(
                model, video_path, cam_name,
                images_dir, labels_dir, global_stats
            )
            cam_stats.append(vstats)

            # Track which frames belong to which camera
            if cam_name not in camera_mapping:
                camera_mapping[cam_name] = {"safe_name": cam_safe, "frames": []}
            # Collect frame IDs for this camera
            frame_prefix = cam_safe + "_"
            existing_frames = camera_mapping[cam_name].get("frames", [])
            new_frames = [
                f for f in os.listdir(images_dir)
                if f.startswith(frame_prefix) and f.endswith(".jpg")
            ]
            camera_mapping[cam_name]["frames"] = sorted(set(existing_frames + new_frames))
            # Save mapping after each video
            with open(camera_mapping_path, "w") as f:
                json.dump(camera_mapping, f, indent=2)
        all_stats[cam_name] = cam_stats
        if global_stats["size_cap_reached"]:
            break

    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("EXTRACTION COMPLETE")
    logger.info("=" * 60)
    total_extracted = 0
    total_persons = 0
    for cam, stats_list in all_stats.items():
        for s in stats_list:
            total_extracted += s.get("frames_extracted", 0)
            total_persons += s.get("frames_with_person", 0)
    dataset_size = get_dataset_size_gb(cfg.DATASET_DIR)
    logger.info(f"Total frames extracted: {total_extracted}")
    logger.info(f"Frames with persons: {total_persons}")
    logger.info(f"Dataset size: {dataset_size:.2f} GB")
    logger.info(f"Cameras processed: {len(cameras)}")

    # Save final stats
    stats_path = os.path.join(cfg.DATASET_DIR, "extraction_stats.json")
    with open(stats_path, "w") as f:
        json.dump({
            "total_extracted": total_extracted,
            "total_with_persons": total_persons,
            "dataset_size_gb": round(dataset_size, 2),
            "cameras": {k: len(v) for k, v in cameras.items()},
            "all_stats": all_stats,
        }, f, indent=2, default=str)
    return all_stats
if __name__ == "__main__":
    extract_all()
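
# Typical invocation (assuming pipeline_config.py is importable from the
# working directory):
#
#   python extract_dataset.py
#
# The run is resumable: a re-run skips videos whose checkpoint is marked
# "done" and continues partially processed ones from their last saved frame.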