# listing-radar/test_temu_scraper.py (98 lines, 3.4 KiB, Python)

"""Smoke test for Temu scrapers (POST=Apify+save, GET=search cache).
Usage:
python test_temu_scraper.py scrape "wireless earbuds" 10
python test_temu_scraper.py search "wireless" 20
python test_temu_scraper.py scrape "women dress" 40 amit123
python test_temu_scraper.py search "dress" 10 amit123
Requires APIFY_API_TOKEN in .env for scrape mode only.
"""
import asyncio
import os
import sys
from dotenv import load_dotenv
load_dotenv()
from temu_scraper_api.serializers import AmitTemuScrapeRequest, TemuScrapeRequest
from temu_scraper_api.service import TemuScraperService
from temu_scraper_api.storage import get_storage
async def scrape(query: str, max_results: int, scraper: str) -> None:
    """Run one scrape through the Temu scraper service, save it, and print a preview.

    Args:
        query: Search phrase, sent to the service as a single-element list.
        max_results: Requested number of products. For ``"amit123"`` the
            value is raised to at least 20 before the request is built.
        scraper: Backend to use: ``"amit123"`` or ``"sovereigntaylor"``.

    Exits the process with status 1 when ``scraper`` is not one of the two
    supported names, or when ``APIFY_API_TOKEN`` is not set.
    """
    # Reject unknown names before doing any work. Previously every value
    # other than "amit123" fell through to the sovereigntaylor actor while the
    # run was still saved under the unrecognised label.
    if scraper not in ("amit123", "sovereigntaylor"):
        print(f"[FAIL] Unknown scraper '{scraper}' (expected 'amit123' or 'sovereigntaylor')")
        sys.exit(1)
    if not os.getenv("APIFY_API_TOKEN"):
        print("[FAIL] APIFY_API_TOKEN not set in .env")
        sys.exit(1)

    storage = get_storage()
    storage.init_db()
    service = TemuScraperService()

    if scraper == "amit123":
        # NOTE(review): the floor of 20 is presumably an actor-side minimum —
        # confirm against the amit123 actor's input requirements.
        max_results = max(max_results, 20)
        request = AmitTemuScrapeRequest(search_queries=[query], max_results=max_results)
        print(f"[amit123] Scraping '{query}' (max_results={max_results})...\n")
        items, meta = await service.scrape_amit123(request)
    else:
        request = TemuScrapeRequest(search_terms=[query], max_results=max_results)
        print(f"[sovereigntaylor] Scraping '{query}' (max_results={max_results})...\n")
        items, meta = await service.scrape_sovereigntaylor(request)

    # Both request models are serialised the same way, so do it once here.
    request_payload = request.model_dump()

    run_id, saved_at, count = storage.save_scrape(
        scraper=scraper,
        actor_id=meta.get("actor_id"),
        request_payload=request_payload,
        items=items,
    )
    print(f"Actor: {meta.get('actor_id')}")
    print(f"Saved run_id={run_id} at {saved_at} ({count} products)\n")
    _print_items(items)
def search(query: str, limit: int, scraper: str) -> None:
    """Search saved products and print up to ten matches.

    Args:
        query: Text passed to ``storage.search_products``.
        limit: Maximum number of rows requested from storage. Only the first
            ten returned rows are printed; the rest are summarised as a count.
        scraper: Scraper label passed to storage to select saved products.
    """
    storage = get_storage()
    storage.init_db()
    results, total = storage.search_products(scraper=scraper, query=query, limit=limit)
    print(f"[{scraper}] Search '{query}' -> {len(results)} matches ({total} total saved)\n")

    for i, row in enumerate(results[:10], 1):
        product = row["product"]
        title = row.get("title") or product.get("title", "N/A")

        # Price lookup order: flat "price", then the flat dotted key, then a
        # nested {"price_info": {"price_str": ...}} dict. The nested form used
        # to be missed because the dotted name was only tried as a literal key.
        # NOTE(review): the product schema is not visible here — confirm which
        # of these shapes each scraper actually emits.
        price = product.get("price") or product.get("price_info.price_str")
        if not price:
            price_info = product.get("price_info")
            if isinstance(price_info, dict):
                price = price_info.get("price_str")
        price = price or "N/A"

        print(f"{i}. [{row['saved_at']}] run={row['run_id']} {title}")
        print(f" price={price} rank={row.get('rank')}")
        print(f" {row.get('url')}\n")

    if len(results) > 10:
        print(f"... and {len(results) - 10} more")
def _print_items(items: list) -> None:
    """Print title, price and URL for the first ten scraped items.

    Args:
        items: Product dicts. Only the first ten are printed; when there are
            more, a trailing line reports how many were omitted.
    """
    for i, item in enumerate(items[:10], 1):
        title = item.get("title", "N/A")

        # Price lookup order: flat "price", then the flat dotted key, then a
        # nested {"price_info": {"price_str": ...}} dict. The nested form used
        # to be missed because the dotted name was only tried as a literal key.
        # NOTE(review): the item schema is not visible here — confirm which of
        # these shapes each scraper actually emits.
        price = item.get("price") or item.get("price_info.price_str")
        if not price:
            price_info = item.get("price_info")
            if isinstance(price_info, dict):
                price = price_info.get("price_str")
        price = price or "N/A"

        url = item.get("url") or item.get("link_url", "")
        print(f"{i}. {title}")
        print(f" price={price}")
        print(f" {url}\n")

    if len(items) > 10:
        print(f"... and {len(items) - 10} more")
async def main():
    """Read positional CLI arguments and dispatch to ``search`` or ``scrape``.

    Arguments are taken from ``sys.argv`` in order, all optional:
    mode (default ``"scrape"``), query (default ``"wireless earbuds"``),
    limit (default ``5``) and scraper (default ``"sovereigntaylor"``).

    Exits the process with status 1 when the mode is neither ``"scrape"`` nor
    ``"search"``, or when the limit is not an integer.
    """
    mode = sys.argv[1] if len(sys.argv) > 1 else "scrape"
    query = sys.argv[2] if len(sys.argv) > 2 else "wireless earbuds"
    scraper = sys.argv[4] if len(sys.argv) > 4 else "sovereigntaylor"

    # A mistyped mode used to fall into the scrape branch and start a live
    # scrape; fail fast instead.
    if mode not in ("scrape", "search"):
        print(f"[FAIL] Unknown mode '{mode}' (expected 'scrape' or 'search')")
        sys.exit(1)

    # Report a bad limit as a usage error rather than a raw traceback.
    try:
        limit = int(sys.argv[3]) if len(sys.argv) > 3 else 5
    except ValueError:
        print(f"[FAIL] limit must be an integer, got '{sys.argv[3]}'")
        sys.exit(1)

    if mode == "search":
        search(query, limit, scraper)
    else:
        await scrape(query, limit, scraper)
# Script entry point: only when executed directly, run the async main() to completion.
if __name__ == "__main__":
    asyncio.run(main())