config added
parent
44cf2b4395
commit
9d40fae97c
43
scrapper.py
43
scrapper.py
|
@ -13,27 +13,49 @@ import time
|
|||
import re
|
||||
|
||||
|
||||
# Name of the entry in ``config`` that this run uses ("prod" or "dev").
ACTIVE_ENV = 'prod'

# Per-environment locations of the crawler's input files and output folder.
#   base_path        - folder joined with a marketplace's cookie file name
#   marketplace_path - JSON file holding the "marketplaces" entry
#   keyword_path     - JSON file with the keywords
#   cookies_path     - JSON file with per-marketplace cookie settings
#   data_path        - folder the result JSON files are written to
config = {
    "prod": {
        "base_path": "/home/ec2-user/keyword_ranking_crawler",
        "marketplace_path": "/home/ec2-user/keyword_ranking_crawler/marketplaces.json",
        "keyword_path": "/home/ec2-user/keyword_ranking_crawler/keywords.json",
        "cookies_path": "/home/ec2-user/keyword_ranking_crawler/cookies.json",
        "data_path": "/mnt/AmazonReports/Amazon/keyword_ranking",
    },
    "dev": {
        "base_path": "C:/Users/saif.haq/Desktop/Scrapper",
        # NOTE(review): the dev file paths are relative, so they resolve
        # against the current working directory rather than base_path.
        "marketplace_path": "marketplaces.json",
        "keyword_path": "keywords.json",
        "cookies_path": "cookies.json",
        "data_path": "data",
    },
}

# Load the three JSON inputs for the active environment.
with open(config[ACTIVE_ENV]["marketplace_path"], "r", encoding="utf-8") as f:
    data = json.load(f)

with open(config[ACTIVE_ENV]["keyword_path"], "r", encoding="utf-8") as f:
    keywords = json.load(f)

with open(config[ACTIVE_ENV]["cookies_path"], "r", encoding="utf-8") as f:
    cookies_ref = json.load(f)

# The marketplace definitions live under the top-level "marketplaces" key.
marketplaces = data["marketplaces"]

# Folder that receives the per-keyword result files.
BASE_PATH = config[ACTIVE_ENV]["data_path"]
# NOTE(review): this was 10 before the change that introduced ``config``;
# confirm that 2 is the intended value for production runs.
MAX_PAGE = 2
|
||||
|
||||
|
||||
|
||||
def get_driver():
    """Create the Chrome WebDriver used by the crawler.

    Chrome is started headless only when ``ACTIVE_ENV`` is ``"prod"``; in any
    other environment the browser window stays visible.

    Returns:
        The ``webdriver.Chrome`` instance built from these options.
    """
    options = Options()
    # Headless is limited to production; adding it unconditionally as well
    # would make the environment check pointless.
    if ACTIVE_ENV == "prod":
        options.add_argument("--headless")
    # Presumably meant to make the automated browser harder for the site to
    # detect -- TODO confirm.
    options.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=options)
    return driver
|
||||
|
@ -108,7 +130,7 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
|
|||
url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}&page={page}"
|
||||
driver.get(url)
|
||||
ranks = []
|
||||
COOKIE_FILE = f"/home/ec2-user/keyword_ranking_crawler/{cookies_ref[marketplace]['cookies_name']}";
|
||||
COOKIE_FILE = f"{config[ACTIVE_ENV]['base_path']}/{cookies_ref[marketplace]['cookies_name']}";
|
||||
# Load cookies if available
|
||||
if os.path.exists(COOKIE_FILE):
|
||||
load_cookies(driver, COOKIE_FILE)
|
||||
|
@ -116,7 +138,7 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
|
|||
else:
|
||||
print("No cookie file found, visiting fresh")
|
||||
driver.get(url)
|
||||
sleep(5) # Give time to solve CAPTCHA manually (if needed)
|
||||
sleep(2) # Give time to solve CAPTCHA manually (if needed)
|
||||
save_cookies(driver, COOKIE_FILE)
|
||||
|
||||
sleep(3) # Wait for JS to load
|
||||
|
@ -147,7 +169,6 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
|
|||
})
|
||||
count += 1
|
||||
except:
|
||||
print( f"[ERROR] Error getting Amazon Ranks for: {marketplace} {keyword} {page} {idx}" )
|
||||
continue
|
||||
|
||||
file_path = f"{BASE_PATH}/{int(time.time() * 1000)}-{marketplace}-{keyword}.json"
|
||||
|
@ -169,7 +190,7 @@ try :
|
|||
count = get_amazon_ranks(url, marketplace, ratingPrefix, keyword, i, count)
|
||||
if count == -1:
|
||||
break
|
||||
sleep(3)
|
||||
sleep(2)
|
||||
finally:
|
||||
print("[INFO] Closing WebDriver...")
|
||||
driver.quit()
|
||||
|
|
Loading…
Reference in New Issue