98 lines
3.4 KiB
Python
98 lines
3.4 KiB
Python
"""Smoke test for Temu scrapers (POST=Apify+save, GET=search cache).

Usage:
    python test_temu_scraper.py scrape "wireless earbuds" 10
    python test_temu_scraper.py search "wireless" 20
    python test_temu_scraper.py scrape "women dress" 40 amit123
    python test_temu_scraper.py search "dress" 10 amit123

Requires APIFY_API_TOKEN in .env for scrape mode only.
"""
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
load_dotenv()
|
|
|
|
from temu_scraper_api.serializers import AmitTemuScrapeRequest, TemuScrapeRequest
|
|
from temu_scraper_api.service import TemuScraperService
|
|
from temu_scraper_api.storage import get_storage
|
|
|
|
|
|
async def scrape(query: str, max_results: int, scraper: str) -> None:
    """Run one live scrape, save the returned items, and print a summary.

    Args:
        query: Search phrase sent to the selected scraper as its only query.
        max_results: Requested number of results; raised to at least 20 when
            ``scraper`` is ``"amit123"``.
        scraper: ``"amit123"`` selects ``service.scrape_amit123``; every other
            value selects ``service.scrape_sovereigntaylor``. The value is
            also stored with the saved run.

    Exits the process with status 1 when ``APIFY_API_TOKEN`` is not set.
    """
    token = os.getenv("APIFY_API_TOKEN")
    if not token:
        print("[FAIL] APIFY_API_TOKEN not set in .env")
        sys.exit(1)

    storage = get_storage()
    storage.init_db()
    service = TemuScraperService()

    if scraper == "amit123":
        # NOTE(review): the floor of 20 is presumably a minimum imposed by
        # this scraper - confirm against the actor's input schema.
        max_results = max(max_results, 20)
        request = AmitTemuScrapeRequest(search_queries=[query], max_results=max_results)
        print(f"[amit123] Scraping '{query}' (max_results={max_results})...\n")
        items, meta = await service.scrape_amit123(request)
    else:
        request = TemuScrapeRequest(search_terms=[query], max_results=max_results)
        print(f"[sovereigntaylor] Scraping '{query}' (max_results={max_results})...\n")
        items, meta = await service.scrape_sovereigntaylor(request)

    # Both request models are serialised the same way once the scrape returns.
    request_payload = request.model_dump()
    actor_id = meta.get("actor_id")

    run_id, saved_at, count = storage.save_scrape(
        scraper=scraper,
        actor_id=actor_id,
        request_payload=request_payload,
        items=items,
    )
    print(f"Actor: {actor_id}")
    print(f"Saved run_id={run_id} at {saved_at} ({count} products)\n")
    _print_items(items)
|
|
|
|
|
|
def search(query: str, limit: int, scraper: str) -> None:
    """Search previously saved products and print up to the first ten rows.

    Args:
        query: Search text passed to ``storage.search_products``.
        limit: Maximum number of rows requested from storage.
        scraper: Scraper name passed to ``storage.search_products``.

    Title and price fall back to ``N/A`` both when the field is missing and
    when it is present but null or empty, so the listing never shows a bare
    ``None``.
    """
    storage = get_storage()
    storage.init_db()
    results, total = storage.search_products(scraper=scraper, query=query, limit=limit)
    print(f"[{scraper}] Search '{query}' -> {len(results)} matches ({total} total saved)\n")

    for i, row in enumerate(results[:10], 1):
        # A row stored with a null product payload must not abort the listing.
        product = row["product"] or {}
        # ``dict.get(key, default)`` only uses the default for a missing key,
        # so chain with ``or`` to cover explicit nulls from the scraper too.
        title = row.get("title") or product.get("title") or "N/A"
        price = product.get("price") or product.get("price_info.price_str") or "N/A"
        print(f"{i}. [{row['saved_at']}] run={row['run_id']} {title}")
        print(f" price={price} rank={row.get('rank')}")
        print(f" {row.get('url')}\n")

    if len(results) > 10:
        print(f"... and {len(results) - 10} more")
|
|
|
|
|
|
def _print_items(items: list) -> None:
    """Print title, price and URL for up to the first ten scraped items.

    Args:
        items: Scraped product dicts. Price is read from ``price`` or
            ``price_info.price_str``; URL from ``url`` or ``link_url``.

    Missing, null or empty fields are shown as ``N/A`` (title, price) or an
    empty string (URL). When more than ten items are given, a trailing line
    reports how many were not shown.
    """
    for i, item in enumerate(items[:10], 1):
        # ``dict.get(key, default)`` only uses the default for a missing key,
        # so chain with ``or`` to cover explicit nulls from the scraper too.
        title = item.get("title") or "N/A"
        price = item.get("price") or item.get("price_info.price_str") or "N/A"
        url = item.get("url") or item.get("link_url") or ""
        print(f"{i}. {title}")
        print(f" price={price}")
        print(f" {url}\n")

    if len(items) > 10:
        print(f"... and {len(items) - 10} more")
|
|
|
|
|
|
async def main():
    """Parse the command line and dispatch to ``search`` or ``scrape``.

    Positional arguments (all optional):
        1. mode: ``"scrape"`` (default) or ``"search"``.
        2. query: defaults to ``"wireless earbuds"``.
        3. limit: integer, defaults to 5.
        4. scraper: defaults to ``"sovereigntaylor"``.

    Exits with status 1 on an unrecognised mode or a non-integer limit.
    """
    args = sys.argv[1:]
    mode = args[0] if len(args) > 0 else "scrape"
    query = args[1] if len(args) > 1 else "wireless earbuds"
    raw_limit = args[2] if len(args) > 2 else "5"
    scraper = args[3] if len(args) > 3 else "sovereigntaylor"

    # Reject unknown modes explicitly: falling through to "scrape" on a typo
    # would silently start a live scrape.
    if mode not in ("scrape", "search"):
        print(f"[FAIL] Unknown mode '{mode}'; expected 'scrape' or 'search'")
        sys.exit(1)

    try:
        limit = int(raw_limit)
    except ValueError:
        print(f"[FAIL] Limit must be an integer, got '{raw_limit}'")
        sys.exit(1)

    if mode == "search":
        search(query, limit, scraper)
    else:
        await scrape(query, limit, scraper)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the async CLI to completion on a fresh event loop.
    asyncio.run(main())
|