From 9d40fae97ce704a25acdfd0ea51ee58b51cd6d24 Mon Sep 17 00:00:00 2001
From: saif
Date: Fri, 15 Aug 2025 16:49:43 +0500
Subject: [PATCH] config added

---
 scrapper.py | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/scrapper.py b/scrapper.py
index fb79af8..265aa50 100644
--- a/scrapper.py
+++ b/scrapper.py
@@ -13,27 +13,49 @@ import time
 import re
 
 
-with open("/home/ec2-user/keyword_ranking_crawler/marketplaces.json", "r", encoding="utf-8") as f:
+ACTIVE_ENV = 'prod'
+
+
+config = {
+    "prod": {
+        "base_path" : "/home/ec2-user/keyword_ranking_crawler",
+        "marketplace_path": "/home/ec2-user/keyword_ranking_crawler/marketplaces.json",
+        "keyword_path": "/home/ec2-user/keyword_ranking_crawler/keywords.json",
+        "cookies_path": "/home/ec2-user/keyword_ranking_crawler/cookies.json",
+        "data_path": "/mnt/AmazonReports/Amazon/keyword_ranking"
+    },
+    "dev": {
+        "base_path" : "C:/Users/saif.haq/Desktop/Scrapper",
+        "marketplace_path": "marketplaces.json",
+        "keyword_path": "keywords.json",
+        "cookies_path": "cookies.json",
+        "data_path": "data"
+    }
+}
+
+
+
+with open(config[ACTIVE_ENV]["marketplace_path"], "r", encoding="utf-8") as f:
     data = json.load(f)
 
-with open("/home/ec2-user/keyword_ranking_crawler/keywords.json", "r", encoding="utf-8") as f:
+with open(config[ACTIVE_ENV]["keyword_path"], "r", encoding="utf-8") as f:
     keywords = json.load(f)
 
-with open("/home/ec2-user/keyword_ranking_crawler/cookies.json", "r", encoding="utf-8") as f:
+with open(config[ACTIVE_ENV]["cookies_path"], "r", encoding="utf-8") as f:
     cookies_ref = json.load(f)
 
 # Or if it's a Python dict already:
 marketplaces = data["marketplaces"]
 
 
-BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking'
-#BASE_PATH= 'data'
-MAX_PAGE = 10
+BASE_PATH= config[ACTIVE_ENV]["data_path"]
+MAX_PAGE = 2
 
 
 def get_driver():
     options = Options()
-    options.add_argument("--headless")
+    if ACTIVE_ENV == "prod":
+        options.add_argument("--headless")
     options.add_argument("--disable-blink-features=AutomationControlled")
     driver = webdriver.Chrome(options=options)
     return driver
@@ -108,7 +130,7 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
     url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}&page={page}"
     driver.get(url)
     ranks = []
-    COOKIE_FILE = f"/home/ec2-user/keyword_ranking_crawler/{cookies_ref[marketplace]['cookies_name']}";
+    COOKIE_FILE = f"{config[ACTIVE_ENV]['base_path']}/{cookies_ref[marketplace]['cookies_name']}";
     # Load cookies if available
     if os.path.exists(COOKIE_FILE):
         load_cookies(driver, COOKIE_FILE)
@@ -116,7 +138,7 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
     else:
         print("No cookie file found, visiting fresh")
         driver.get(url)
-        sleep(5) # Give time to solve CAPTCHA manually (if needed)
+        sleep(2) # Give time to solve CAPTCHA manually (if needed)
         save_cookies(driver, COOKIE_FILE)
 
     sleep(3) # Wait for JS to load
@@ -147,7 +169,6 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
                 })
                 count += 1
             except:
-                print( f"[ERROR] Error getting Amazon Ranks for: {marketplace} {keyword} {page} {idx}" )
                 continue
 
     file_path = f"{BASE_PATH}/{int(time.time() * 1000)}-{marketplace}-{keyword}.json"
@@ -169,7 +190,7 @@ try :
             count = get_amazon_ranks(url, marketplace, ratingPrefix, keyword, i, count)
             if count == -1:
                 break
-            sleep(3)
+            sleep(2)
 finally:
     print("[INFO] Closing WebDriver...")
     driver.quit()
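Note (not part of the patch): the prod/dev switch above is a hard-coded module constant, and every path lookup goes through config[ACTIVE_ENV]. Below is a minimal sketch of the same selection driven by an environment variable instead, so the source does not have to be edited per machine; SCRAPER_ENV is a hypothetical variable name and the dict is trimmed to the single key the sketch uses, with only the key names and default paths mirroring the patch.

    import os

    # Hypothetical override: fall back to "prod", matching the patch's default.
    ACTIVE_ENV = os.environ.get("SCRAPER_ENV", "prod")

    # Trimmed-down copy of the patch's config; only "data_path" is shown here.
    config = {
        "prod": {"data_path": "/mnt/AmazonReports/Amazon/keyword_ranking"},
        "dev": {"data_path": "data"},
    }

    # Fail fast on a typo rather than raising a KeyError deep in the crawl.
    if ACTIVE_ENV not in config:
        raise ValueError(f"Unknown SCRAPER_ENV: {ACTIVE_ENV!r}")

    BASE_PATH = config[ACTIVE_ENV]["data_path"]
    print(f"[{ACTIVE_ENV}] writing crawl output under {BASE_PATH}")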