# listing-radar/download_images.py
# (viewer metadata: 158 lines, 4.9 KiB, Python)
import argparse
import re
import urllib.error
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import pandas as pd
def sanitize_name(value: str, fallback: str) -> str:
    """Return *value* made safe for use as a directory name.

    Characters illegal in Windows paths (plus ASCII control characters)
    become underscores, runs of whitespace collapse to a single space,
    and the result is capped at 120 characters. *fallback* is returned
    when nothing printable survives the cleanup.
    """
    safe = re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", str(value)).strip()
    safe = re.sub(r"\s+", " ", safe)
    if not safe:
        return fallback
    return safe[:120]
def split_urls(raw_value: object) -> list[str]:
    """Split a semicolon-delimited cell value into a list of URLs.

    NaN/None cells and blank strings yield an empty list. Each segment is
    stripped of surrounding whitespace and empty segments are dropped.
    """
    if pd.isna(raw_value):
        return []
    text = str(raw_value).strip()
    if not text:
        return []
    urls: list[str] = []
    for piece in text.split(";"):
        piece = piece.strip()
        if piece:
            urls.append(piece)
    return urls
def file_extension_from_url(url: str) -> str:
    """Guess an image file extension from *url*'s path component.

    Recognised raster-image suffixes are returned lower-cased; anything
    else (including a missing extension or an unknown one) falls back to
    ".jpg". Query strings are ignored because only the URL path is parsed.
    """
    known = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp"}
    suffix = Path(urllib.parse.urlparse(url).path or "").suffix.lower()
    return suffix if suffix in known else ".jpg"
def download_file(url: str, destination: Path, timeout: int, retries: int) -> tuple[bool, str]:
    """Fetch *url* and write its bytes to *destination*.

    The parent directory is created as needed. The request is attempted up
    to ``retries + 1`` times; any exception (network error, non-200 status,
    write failure) triggers a retry until attempts are exhausted.

    Returns a ``(success, message)`` pair describing the outcome.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    attempt = 0
    while attempt <= retries:
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                # urlopen already raises HTTPError for most non-2xx codes;
                # this is a defensive check for anything that slips through.
                if response.status != 200:
                    raise urllib.error.HTTPError(
                        url=url,
                        code=response.status,
                        msg=f"HTTP status {response.status}",
                        hdrs=response.headers,
                        fp=None,
                    )
                payload = response.read()
            destination.write_bytes(payload)
            return True, f"saved -> {destination}"
        except Exception as exc:
            if attempt == retries:
                return False, f"failed -> {url} ({exc})"
        attempt += 1
    # Unreachable safety net: the loop always returns before falling out.
    return False, f"failed -> {url}"
def resolve_id(row: pd.Series, row_index: int, id_column: str | None) -> str:
if id_column and id_column in row and pd.notna(row[id_column]):
return sanitize_name(str(row[id_column]), f"row_{row_index}")
return f"row_{row_index}"
def _build_arg_parser() -> argparse.ArgumentParser:
    """Define the command-line interface for the downloader."""
    parser = argparse.ArgumentParser(
        description="Download images from semicolon-separated URL column in an Excel sheet."
    )
    parser.add_argument(
        "--input",
        default="listing_data.xlsx",
        help="Path to the input Excel file. Default: listing_data.xlsx",
    )
    parser.add_argument(
        "--output-dir",
        default="downloaded_images",
        help="Base output directory for downloaded images.",
    )
    parser.add_argument(
        "--image-column",
        default="Image",
        help="Column containing semicolon-separated image URLs.",
    )
    parser.add_argument(
        "--id-column",
        default="ASIN",
        help="Column used to name per-row folders. Use empty string to disable.",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=20,
        help="HTTP timeout in seconds per request.",
    )
    parser.add_argument(
        "--retries",
        type=int,
        default=2,
        help="Retry count for failed downloads.",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=10,
        help="Parallel download worker count.",
    )
    return parser


def _collect_tasks(
    df: pd.DataFrame,
    image_column: str,
    id_column: str | None,
    output_dir: Path,
) -> list[tuple[str, Path]]:
    """Build ``(url, destination)`` pairs for every image URL in *df*.

    Images within a row are numbered from 01 so the original listing order
    is preserved on disk under the row's folder.
    """
    tasks: list[tuple[str, Path]] = []
    for idx, row in df.iterrows():
        row_id = resolve_id(row, idx, id_column)
        for image_index, url in enumerate(split_urls(row[image_column]), start=1):
            filename = f"{image_index:02d}{file_extension_from_url(url)}"
            tasks.append((url, output_dir / row_id / filename))
    return tasks


def main() -> None:
    """Parse arguments, queue every image URL in the sheet, and download them in parallel.

    Raises:
        FileNotFoundError: if the input Excel file does not exist.
        ValueError: if the configured image column is missing from the sheet.
    """
    args = _build_arg_parser().parse_args()
    input_path = Path(args.input)
    output_dir = Path(args.output_dir)
    # An empty --id-column disables per-row ID folders (rows fall back to row_N).
    id_column = args.id_column.strip() or None
    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")
    df = pd.read_excel(input_path)
    if args.image_column not in df.columns:
        raise ValueError(
            f"Image column '{args.image_column}' not found. Available columns: {list(df.columns)}"
        )
    tasks = _collect_tasks(df, args.image_column, id_column, output_dir)
    if not tasks:
        print("No image URLs found. Nothing to download.")
        return
    print(f"Queued {len(tasks)} image(s) from {len(df)} row(s).")
    success_count = 0
    fail_count = 0
    with ThreadPoolExecutor(max_workers=max(args.workers, 1)) as executor:
        futures = [
            executor.submit(download_file, url, destination, args.timeout, args.retries)
            for url, destination in tasks
        ]
        for future in as_completed(futures):
            ok, message = future.result()
            if ok:
                success_count += 1
            else:
                fail_count += 1
            print(message)
    print(f"\nDone. Success: {success_count}, Failed: {fail_count}")
    print(f"Images saved under: {output_dir.resolve()}")


if __name__ == "__main__":
    main()