From 2170cdfc689d013c870999c748fe5d917d50dbdc Mon Sep 17 00:00:00 2001
From: saif
Date: Mon, 4 Aug 2025 18:07:57 +0500
Subject: [PATCH] rename file

---
 scrapper.py | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 scrapper.py

diff --git a/scrapper.py b/scrapper.py
new file mode 100644
index 0000000..180b818
--- /dev/null
+++ b/scrapper.py
@@ -0,0 +1,111 @@
+import os
+import pickle
+import json
+import time
+from time import sleep
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
+
+
+with open("marketplaces.json", "r", encoding="utf-8") as f:
+    data = json.load(f)
+
+with open("cookies.json", "r", encoding="utf-8") as f:
+    cookies_ref = json.load(f)
+
+# The JSON document nests the marketplace map under a "marketplaces" key.
+marketplaces = data["marketplaces"]
+
+
+def get_driver():
+    options = Options()
+    options.add_argument("--headless")
+    options.add_argument("--disable-blink-features=AutomationControlled")
+    return webdriver.Chrome(options=options)
+
+
+def save_cookies(driver, path):
+    with open(path, "wb") as f:
+        pickle.dump(driver.get_cookies(), f)
+
+
+def save_ranking(rankings, file_path):
+    with open(file_path, "w", encoding="utf-8") as f:
+        json.dump(rankings, f, ensure_ascii=False, indent=4)
+
+
+def load_cookies(driver, path):
+    with open(path, "rb") as f:
+        cookies = pickle.load(f)
+    for cookie in cookies:
+        cookie.pop('sameSite', None)  # Chrome rejects cookies with an invalid sameSite value
+        driver.add_cookie(cookie)
+
+
+def check_sponsored(item):
+    # Return 1 if the result card carries a "Sponsored" label, else 0.
+    try:
+        labels = item.find_elements(By.XPATH, './/*[contains(text(), "Sponsored")]')
+        return int(any(label.text.strip().lower() == "sponsored" for label in labels))
+    except StaleElementReferenceException:
+        return 0
+
+
+def check_consist_utopia(title):
+    return 1 if "Utopia" in title else 0
+
+
+def get_amazon_ranks(driver, url, marketplace, keyword):
+    print('[INFO] Getting Amazon ranks for:', marketplace, keyword)
+    url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}"
+    count = 1
+    ranks = []
+
+    COOKIE_FILE = cookies_ref[marketplace]['cookies_name']
+    print(COOKIE_FILE)
+
+    # Selenium only accepts cookies for the domain it is currently on,
+    # so visit the site once before trying to restore them.
+    driver.get(url)
+    if os.path.exists(COOKIE_FILE):
+        load_cookies(driver, COOKIE_FILE)
+        driver.get(url)  # Reload so the request carries the restored cookies
+    else:
+        print("No cookie file found, visiting fresh")
+        sleep(5)  # Give time to solve a CAPTCHA manually (if needed)
+        save_cookies(driver, COOKIE_FILE)
+
+    sleep(3)  # Wait for JS to load
+    items = driver.find_elements(By.XPATH, '//div[contains(@class,"s-result-item") and @data-asin]')
+    for item in items:
+        asin = item.get_attribute("data-asin")
+        try:
+            title = item.find_element(By.XPATH, './/h2//span').text
+            if title == 'Results':  # Skip the "Results" header row in the listing
+                continue
+            if check_sponsored(item):  # Rank organic results only
+                continue
+            ranks.append({
+                'rank': count,
+                'title': title,
+                'marketplace': marketplace,
+                'keyword': keyword,
+                'sponsored': 0,
+                'asin': asin,
+                'is_utopia': check_consist_utopia(title),
+            })
+            count += 1
+        except (NoSuchElementException, StaleElementReferenceException):
+            continue
+
+    file_path = f"{int(time.time() * 1000)}-{marketplace}-{keyword}.json"
+    save_ranking(ranks, file_path)
+
+
+driver = get_driver()
+for marketplace, details in marketplaces.items():
+    get_amazon_ranks(driver, details['url'], marketplace, 'pillows')
+driver.quit()