import argparse import re import urllib.error import urllib.parse import urllib.request from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import pandas as pd def sanitize_name(value: str, fallback: str) -> str: cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", str(value)).strip() cleaned = re.sub(r"\s+", " ", cleaned) return cleaned[:120] if cleaned else fallback def split_urls(raw_value: object) -> list[str]: if pd.isna(raw_value): return [] text = str(raw_value).strip() if not text: return [] return [u.strip() for u in text.split(";") if u.strip()] def file_extension_from_url(url: str) -> str: parsed = urllib.parse.urlparse(url) path = parsed.path or "" suffix = Path(path).suffix.lower() if suffix in {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp"}: return suffix return ".jpg" def download_file(url: str, destination: Path, timeout: int, retries: int) -> tuple[bool, str]: destination.parent.mkdir(parents=True, exist_ok=True) for attempt in range(retries + 1): try: with urllib.request.urlopen(url, timeout=timeout) as response: if response.status != 200: raise urllib.error.HTTPError( url=url, code=response.status, msg=f"HTTP status {response.status}", hdrs=response.headers, fp=None, ) data = response.read() destination.write_bytes(data) return True, f"saved -> {destination}" except Exception as exc: if attempt == retries: return False, f"failed -> {url} ({exc})" return False, f"failed -> {url}" def resolve_id(row: pd.Series, row_index: int, id_column: str | None) -> str: if id_column and id_column in row and pd.notna(row[id_column]): return sanitize_name(str(row[id_column]), f"row_{row_index}") return f"row_{row_index}" def main() -> None: parser = argparse.ArgumentParser( description="Download images from semicolon-separated URL column in an Excel sheet." ) parser.add_argument( "--input", default="listing_data.xlsx", help="Path to the input Excel file. Default: listing_data.xlsx", ) parser.add_argument( "--output-dir", default="downloaded_images", help="Base output directory for downloaded images.", ) parser.add_argument( "--image-column", default="Image", help="Column containing semicolon-separated image URLs.", ) parser.add_argument( "--id-column", default="ASIN", help="Column used to name per-row folders. Use empty string to disable.", ) parser.add_argument( "--timeout", type=int, default=20, help="HTTP timeout in seconds per request.", ) parser.add_argument( "--retries", type=int, default=2, help="Retry count for failed downloads.", ) parser.add_argument( "--workers", type=int, default=10, help="Parallel download worker count.", ) args = parser.parse_args() input_path = Path(args.input) output_dir = Path(args.output_dir) id_column = args.id_column.strip() or None if not input_path.exists(): raise FileNotFoundError(f"Input file not found: {input_path}") df = pd.read_excel(input_path) if args.image_column not in df.columns: raise ValueError( f"Image column '{args.image_column}' not found. Available columns: {list(df.columns)}" ) tasks = [] for idx, row in df.iterrows(): row_id = resolve_id(row, idx, id_column) row_urls = split_urls(row[args.image_column]) for image_index, url in enumerate(row_urls, start=1): ext = file_extension_from_url(url) filename = f"{image_index:02d}{ext}" destination = output_dir / row_id / filename tasks.append((url, destination)) if not tasks: print("No image URLs found. Nothing to download.") return print(f"Queued {len(tasks)} image(s) from {len(df)} row(s).") success_count = 0 fail_count = 0 with ThreadPoolExecutor(max_workers=max(args.workers, 1)) as executor: futures = [ executor.submit(download_file, url, destination, args.timeout, args.retries) for url, destination in tasks ] for future in as_completed(futures): ok, message = future.result() if ok: success_count += 1 else: fail_count += 1 print(message) print(f"\nDone. Success: {success_count}, Failed: {fail_count}") print(f"Images saved under: {output_dir.resolve()}") if __name__ == "__main__": main()