import json import os import re import sqlite3 from contextlib import contextmanager from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional, Tuple ScraperKey = str DEFAULT_DB_PATH = os.getenv("TEMU_DB_PATH", "data/temu_scrapes.db") def _utc_now_iso() -> str: return datetime.now(timezone.utc).isoformat() def _extract_product_fields(scraper: ScraperKey, item: Dict[str, Any]) -> Dict[str, Optional[str]]: title = item.get("title") url = item.get("url") or item.get("link_url") product_id = item.get("productId") or item.get("product_id") or item.get("goods_id") if product_id is not None: product_id = str(product_id) search_term = item.get("searchTerm") or item.get("search_term") category = item.get("category") if isinstance(category, dict): category = json.dumps(category) store = item.get("store") brand = item.get("brand") return { "title": title, "url": url, "product_id": product_id, "search_term": search_term, "category": str(category) if category else None, "store": store, "brand": str(brand) if brand else None, } def _fts_query(user_query: str) -> str: terms = re.findall(r"[\w]+", user_query, flags=re.UNICODE) if not terms: return '""' return " ".join(f'"{term}"*' for term in terms) class ScrapeStorage: def __init__(self, db_path: str = DEFAULT_DB_PATH): self.db_path = db_path Path(db_path).parent.mkdir(parents=True, exist_ok=True) @contextmanager def _connect(self): conn = sqlite3.connect(self.db_path) conn.row_factory = sqlite3.Row try: yield conn conn.commit() finally: conn.close() def init_db(self) -> None: with self._connect() as conn: conn.executescript( """ CREATE TABLE IF NOT EXISTS scrape_runs ( id INTEGER PRIMARY KEY AUTOINCREMENT, scraper TEXT NOT NULL, saved_at TEXT NOT NULL, actor_id TEXT, request_json TEXT NOT NULL, product_count INTEGER NOT NULL ); CREATE TABLE IF NOT EXISTS scrape_products ( id INTEGER PRIMARY KEY AUTOINCREMENT, run_id INTEGER NOT NULL, scraper TEXT NOT NULL, saved_at TEXT NOT NULL, title TEXT, url TEXT, product_id TEXT, search_term TEXT, category TEXT, store TEXT, brand TEXT, data_json TEXT NOT NULL, FOREIGN KEY (run_id) REFERENCES scrape_runs(id) ); CREATE INDEX IF NOT EXISTS idx_products_scraper_saved ON scrape_products (scraper, saved_at DESC); CREATE INDEX IF NOT EXISTS idx_products_run_id ON scrape_products (run_id); CREATE VIRTUAL TABLE IF NOT EXISTS scrape_products_fts USING fts5( product_rowid UNINDEXED, scraper UNINDEXED, saved_at UNINDEXED, title, search_term, category, store, brand, url, tokenize='unicode61' ); """ ) def save_scrape( self, scraper: ScraperKey, actor_id: Optional[str], request_payload: Dict[str, Any], items: List[Dict[str, Any]], ) -> Tuple[int, str, int]: saved_at = _utc_now_iso() request_json = json.dumps(request_payload, ensure_ascii=False) with self._connect() as conn: cursor = conn.execute( """ INSERT INTO scrape_runs (scraper, saved_at, actor_id, request_json, product_count) VALUES (?, ?, ?, ?, ?) """, (scraper, saved_at, actor_id, request_json, len(items)), ) run_id = cursor.lastrowid for item in items: fields = _extract_product_fields(scraper, item) data_json = json.dumps(item, ensure_ascii=False) product_cursor = conn.execute( """ INSERT INTO scrape_products ( run_id, scraper, saved_at, title, url, product_id, search_term, category, store, brand, data_json ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( run_id, scraper, saved_at, fields["title"], fields["url"], fields["product_id"], fields["search_term"], fields["category"], fields["store"], fields["brand"], data_json, ), ) product_id = product_cursor.lastrowid conn.execute( """ INSERT INTO scrape_products_fts ( product_rowid, scraper, saved_at, title, search_term, category, store, brand, url ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) """, ( product_id, scraper, saved_at, fields["title"] or "", fields["search_term"] or "", fields["category"] or "", fields["store"] or "", fields["brand"] or "", fields["url"] or "", ), ) return run_id, saved_at, len(items) def search_products( self, scraper: ScraperKey, query: str, limit: int = 50, run_id: Optional[int] = None, since: Optional[str] = None, ) -> Tuple[List[Dict[str, Any]], int]: fts_q = _fts_query(query) sql = """ SELECT p.id, p.run_id, p.scraper, p.saved_at, p.title, p.url, p.product_id, p.search_term, p.data_json, bm25(scrape_products_fts) AS rank FROM scrape_products_fts JOIN scrape_products p ON p.id = scrape_products_fts.product_rowid WHERE scrape_products_fts MATCH ? AND scrape_products_fts.scraper = ? """ params: List[Any] = [fts_q, scraper] if run_id is not None: sql += " AND p.run_id = ?" params.append(run_id) if since: sql += " AND p.saved_at >= ?" params.append(since) sql += " ORDER BY rank LIMIT ?" params.append(limit) with self._connect() as conn: rows = conn.execute(sql, params).fetchall() total = conn.execute( "SELECT COUNT(*) FROM scrape_products WHERE scraper = ?", (scraper,) ).fetchone()[0] results = [] for row in rows: product = json.loads(row["data_json"]) results.append( { "id": row["id"], "run_id": row["run_id"], "scraper": row["scraper"], "saved_at": row["saved_at"], "title": row["title"], "url": row["url"], "product_id": row["product_id"], "search_term": row["search_term"], "product": product, "rank": row["rank"], } ) return results, total _storage: Optional[ScrapeStorage] = None def get_storage() -> ScrapeStorage: global _storage if _storage is None: _storage = ScrapeStorage() return _storage