config added

main
saif 2025-08-15 16:49:43 +05:00
parent 44cf2b4395
commit 9d40fae97c
1 changed file with 32 additions and 11 deletions

View File

@ -13,27 +13,49 @@ import time
import re
# Name of the environment whose settings are used below; must be a key of
# `config` ("prod" or "dev").
ACTIVE_ENV = 'prod'

# Per-environment filesystem locations.
#   base_path        - directory that holds the per-marketplace cookie files
#   marketplace_path - JSON file containing the "marketplaces" entry
#   keyword_path     - JSON file with the keywords to look up
#   cookies_path     - JSON file mapping a marketplace to its cookie file name
#   data_path        - directory where the ranking result files are written
config = {
    "prod": {
        "base_path": "/home/ec2-user/keyword_ranking_crawler",
        "marketplace_path": "/home/ec2-user/keyword_ranking_crawler/marketplaces.json",
        "keyword_path": "/home/ec2-user/keyword_ranking_crawler/keywords.json",
        "cookies_path": "/home/ec2-user/keyword_ranking_crawler/cookies.json",
        "data_path": "/mnt/AmazonReports/Amazon/keyword_ranking",
    },
    "dev": {
        # NOTE(review): the three JSON paths and data_path are relative, so in
        # dev they resolve against the current working directory, not base_path.
        "base_path": "C:/Users/saif.haq/Desktop/Scrapper",
        "marketplace_path": "marketplaces.json",
        "keyword_path": "keywords.json",
        "cookies_path": "cookies.json",
        "data_path": "data",
    },
}

# Load the three input documents for the active environment.
with open(config[ACTIVE_ENV]["marketplace_path"], "r", encoding="utf-8") as f:
    data = json.load(f)

with open(config[ACTIVE_ENV]["keyword_path"], "r", encoding="utf-8") as f:
    keywords = json.load(f)

with open(config[ACTIVE_ENV]["cookies_path"], "r", encoding="utf-8") as f:
    cookies_ref = json.load(f)

# The marketplace definitions live under the top-level "marketplaces" key.
marketplaces = data["marketplaces"]

# Output directory for the generated ranking JSON files.
BASE_PATH = config[ACTIVE_ENV]["data_path"]

# Presumably the number of search-result pages crawled per keyword -- confirm
# against the main loop, which is not part of this block.
MAX_PAGE = 2
def get_driver():
    """Create the Chrome WebDriver used by the crawler.

    The browser runs headless only when ``ACTIVE_ENV`` is ``"prod"``; in any
    other environment a visible window is opened.

    Returns:
        A new ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()

    # Add --headless only for prod. An additional unconditional --headless
    # would make this check meaningless and hide the browser in dev as well.
    if ACTIVE_ENV == "prod":
        options.add_argument("--headless")

    # Turns off Blink's "AutomationControlled" feature; presumably used to make
    # the automated browser harder to detect -- confirm the intent.
    options.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(options=options)
    return driver
@ -108,7 +130,7 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}&page={page}"
driver.get(url)
ranks = []
COOKIE_FILE = f"/home/ec2-user/keyword_ranking_crawler/{cookies_ref[marketplace]['cookies_name']}";
COOKIE_FILE = f"{config[ACTIVE_ENV]['base_path']}/{cookies_ref[marketplace]['cookies_name']}";
# Load cookies if available
if os.path.exists(COOKIE_FILE):
load_cookies(driver, COOKIE_FILE)
@ -116,7 +138,7 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
else:
print("No cookie file found, visiting fresh")
driver.get(url)
sleep(5) # Give time to solve CAPTCHA manually (if needed)
sleep(2) # Give time to solve CAPTCHA manually (if needed)
save_cookies(driver, COOKIE_FILE)
sleep(3) # Wait for JS to load
@ -147,7 +169,6 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
})
count += 1
except:
print( f"[ERROR] Error getting Amazon Ranks for: {marketplace} {keyword} {page} {idx}" )
continue
file_path = f"{BASE_PATH}/{int(time.time() * 1000)}-{marketplace}-{keyword}.json"
@ -169,7 +190,7 @@ try :
count = get_amazon_ranks(url, marketplace, ratingPrefix, keyword, i, count)
if count == -1:
break
sleep(3)
sleep(2)
finally:
print("[INFO] Closing WebDriver...")
driver.quit()