rename file

main
saif 2025-08-04 18:07:57 +05:00
parent 19dddd7551
commit 2170cdfc68
1 changed file with 109 additions and 0 deletions

109
scrapper.py Normal file
View File

@ -0,0 +1,109 @@
import os
import pickle
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from time import sleep
import json
import time
# Module-level configuration, loaded once at import time:
#   marketplaces.json -> marketplace name -> {'url': ...}
#   cookies.json      -> marketplace name -> {'cookies_name': ...}
with open("marketplaces.json", "r", encoding="utf-8") as src:
    data = json.load(src)
with open("cookies.json", "r", encoding="utf-8") as src:
    cookies_ref = json.load(src)
# Or if it's a Python dict already:
marketplaces = data["marketplaces"]
def get_driver():
    """Build a headless Chrome driver with automation-detection masking."""
    opts = Options()
    for flag in ("--headless", "--disable-blink-features=AutomationControlled"):
        opts.add_argument(flag)
    return webdriver.Chrome(options=opts)
def save_cookies(driver, path):
    """Persist the browser's current cookie jar to *path* as a pickle."""
    jar = driver.get_cookies()
    with open(path, "wb") as sink:
        pickle.dump(jar, sink)
def save_ranking(rankings, file_path):
    """Write *rankings* to *file_path* as pretty-printed UTF-8 JSON."""
    payload = json.dumps(rankings, ensure_ascii=False, indent=4)
    with open(file_path, "w", encoding="utf-8") as sink:
        sink.write(payload)
def load_cookies(driver, path):
    """Read pickled cookies from *path* and install them into *driver*.

    NOTE(review): pickle.load on an untrusted file can run arbitrary code;
    acceptable here only because we read files this script wrote itself.
    """
    with open(path, "rb") as source:
        stored = pickle.load(source)
    for cookie in stored:
        # Chrome may reject the 'sameSite' key; drop it defensively.
        cookie.pop('sameSite', None)
        driver.add_cookie(cookie)
def check_sponsored(item):
    """Detect whether a search-result tile carries Amazon's "Sponsored" label.

    Returns a deliberate tri-state:
        1    -- a descendant element's text is exactly "sponsored"
                (case-insensitive).
        0    -- the DOM lookup raised (stale element, etc.).
        None -- no sponsored label found; the caller (get_amazon_ranks)
                keys on this None to mean "organic result", so this value
                must not be collapsed into 0.
    """
    try:
        # Any descendant whose text contains "Sponsored" is a candidate.
        sponsored_labels = item.find_elements(By.XPATH, './/*[contains(text(), "Sponsored")]')
        for label in sponsored_labels:
            if label.text.strip().lower() == "sponsored":
                return 1
    except Exception:  # was a bare except: don't swallow KeyboardInterrupt/SystemExit
        return 0
    # Fall through: no label matched -> None (falsy, but distinct from 0).
    return None
def check_consist_utopia(title):
    """Return 1 if *title* mentions the Utopia brand (case-sensitive), else 0."""
    return int("Utopia" in title)
def get_amazon_ranks(url, marketplace, keyword):
    """Scrape the organic (non-sponsored) ranking for *keyword* on one marketplace.

    Args:
        url: marketplace domain, e.g. "amazon.com" (scheme/path added here).
        marketplace: key into the module-level `cookies_ref` dict.
        keyword: search term; spaces become '+' in the query string.

    Side effects: drives the module-level `driver`, reads/writes the
    marketplace's cookie file, and writes a timestamped JSON ranking file.
    """
    print('[INFO] Getting Amazon Ranks for: ', marketplace, keyword)
    url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}"
    # First visit: selenium can only add cookies for the domain currently
    # loaded, so we must land on the marketplace before restoring cookies.
    driver.get(url)
    count = 1
    ranks = []
    cookie_file = cookies_ref[marketplace]['cookies_name']
    print(cookie_file)
    # Load cookies if available
    if os.path.exists(cookie_file):
        load_cookies(driver, cookie_file)
        driver.get(url)  # reload with the session cookies applied
    else:
        print("No cookie file found, visiting fresh")
        driver.get(url)
        sleep(5)  # Give time to solve CAPTCHA manually (if needed)
        save_cookies(driver, cookie_file)
    sleep(3)  # Wait for JS to load
    items = driver.find_elements(By.XPATH, '//div[contains(@class,"s-result-item") and @data-asin]')
    for item in items:
        asin = item.get_attribute("data-asin")
        try:
            sponsored = check_sponsored(item)
            title = item.find_element(By.XPATH, './/h2//span').text
            if title == 'Results':  # section-header row, not a product tile
                continue
            # check_sponsored returns None for organic hits (1 = sponsored,
            # 0 = lookup error), so only None rows are ranked.
            if sponsored is None:
                ranks.append({'rank': count, 'title': title,
                              'marketplace': marketplace, 'keyword': keyword,
                              'sponsored': 0, 'asin': asin,
                              'is_utopia': check_consist_utopia(title)})
                count += 1
        except Exception:
            # Tile without a title node etc. -- skip it, don't abort the page.
            continue
    file_path = f"{int(time.time() * 1000)}-{marketplace}-{keyword}.json"
    save_ranking(ranks, file_path)
# Entry point: one shared browser instance for all marketplaces.
driver = get_driver()
try:
    for marketplace, details in marketplaces.items():
        url = details['url']
        get_amazon_ranks(url, marketplace, 'pillows')
finally:
    # Always release the browser process, even if a scrape raises.
    driver.quit()