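"""Download images listed in a semicolon-separated URL column of an Excel sheet.

Each row's images are saved into a per-row folder (named from an ID column
such as ASIN when available), and downloads run in parallel with per-request
timeouts and retries.

Example invocation (assuming the script is saved as download_images.py):

    python download_images.py --input listing_data.xlsx --image-column Image

"""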
import argparse
import re
import urllib.error
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import pandas as pd


def sanitize_name(value: str, fallback: str) -> str:
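    """Replace filesystem-unsafe characters, collapse whitespace, and cap length."""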
    cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1F]', "_", str(value)).strip()
    cleaned = re.sub(r"\s+", " ", cleaned)
    # Cap names at 120 characters to stay well under common filesystem limits.
    return cleaned[:120] if cleaned else fallback


def split_urls(raw_value: object) -> list[str]:
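    """Split a cell value on ";" into a list of URLs, skipping blanks and NaN."""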
    if pd.isna(raw_value):
        return []
    text = str(raw_value).strip()
    if not text:
        return []
    return [u.strip() for u in text.split(";") if u.strip()]


def file_extension_from_url(url: str) -> str:
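    """Return the URL path's extension if it looks like an image, else ".jpg"."""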
    parsed = urllib.parse.urlparse(url)
    path = parsed.path or ""
    suffix = Path(path).suffix.lower()
    if suffix in {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp"}:
        return suffix
    # Unknown or missing extension: default to .jpg rather than guessing.
    return ".jpg"


def download_file(url: str, destination: Path, timeout: int, retries: int) -> tuple[bool, str]:
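    """Fetch one URL to `destination`, retrying up to `retries` additional times.

    Returns (ok, message) so the caller can tally successes and failures.
    """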
    destination.parent.mkdir(parents=True, exist_ok=True)
    for attempt in range(retries + 1):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                # urlopen already raises HTTPError for 4xx/5xx responses; this
                # guard also turns any other non-200 status into a retryable error.
                if response.status != 200:
                    raise urllib.error.HTTPError(
                        url=url,
                        code=response.status,
                        msg=f"HTTP status {response.status}",
                        hdrs=response.headers,
                        fp=None,
                    )
                data = response.read()
            destination.write_bytes(data)
            return True, f"saved -> {destination}"
        except Exception as exc:
            if attempt == retries:
                return False, f"failed -> {url} ({exc})"
    return False, f"failed -> {url}"  # only reachable if retries is negative


def resolve_id(row: pd.Series, row_index: int, id_column: str | None) -> str:
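    """Name a row's folder from the sanitized ID column value, or "row_<index>"."""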
    if id_column and id_column in row and pd.notna(row[id_column]):
        return sanitize_name(str(row[id_column]), f"row_{row_index}")
    return f"row_{row_index}"


def main() -> None:
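    """Parse CLI arguments, queue one task per image URL, and download in parallel."""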
    parser = argparse.ArgumentParser(
        description="Download images from a semicolon-separated URL column in an Excel sheet."
    )
    parser.add_argument(
        "--input",
        default="listing_data.xlsx",
        help="Path to the input Excel file. Default: listing_data.xlsx",
    )
    parser.add_argument(
        "--output-dir",
        default="downloaded_images",
        help="Base output directory for downloaded images.",
    )
    parser.add_argument(
        "--image-column",
        default="Image",
        help="Column containing semicolon-separated image URLs.",
    )
    parser.add_argument(
        "--id-column",
        default="ASIN",
        help="Column used to name per-row folders. Use an empty string to disable.",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=20,
        help="HTTP timeout in seconds per request.",
    )
    parser.add_argument(
        "--retries",
        type=int,
        default=2,
        help="Retry count for failed downloads.",
    )
    parser.add_argument(
        "--workers",
        type=int,
        default=10,
        help="Parallel download worker count.",
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    output_dir = Path(args.output_dir)
    # An empty --id-column disables ID-based folder names.
    id_column = args.id_column.strip() or None

    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    df = pd.read_excel(input_path)
    if args.image_column not in df.columns:
        raise ValueError(
            f"Image column '{args.image_column}' not found. Available columns: {list(df.columns)}"
        )

    # Build one (url, destination) task per image, numbering files within each row.
    tasks = []
    for idx, row in df.iterrows():
        row_id = resolve_id(row, idx, id_column)
        row_urls = split_urls(row[args.image_column])
        for image_index, url in enumerate(row_urls, start=1):
            ext = file_extension_from_url(url)
            filename = f"{image_index:02d}{ext}"
            destination = output_dir / row_id / filename
            tasks.append((url, destination))

    if not tasks:
        print("No image URLs found. Nothing to download.")
        return

    print(f"Queued {len(tasks)} image(s) from {len(df)} row(s).")
    success_count = 0
    fail_count = 0

    with ThreadPoolExecutor(max_workers=max(args.workers, 1)) as executor:
        futures = [
            executor.submit(download_file, url, destination, args.timeout, args.retries)
            for url, destination in tasks
        ]
        for future in as_completed(futures):
            ok, message = future.result()
            if ok:
                success_count += 1
            else:
                fail_count += 1
            print(message)

print(f"\nDone. Success: {success_count}, Failed: {fail_count}")
|
|
print(f"Images saved under: {output_dir.resolve()}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|