config added
parent
44cf2b4395
commit
9d40fae97c
43
scrapper.py
43
scrapper.py
|
@ -13,27 +13,49 @@ import time
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
with open("/home/ec2-user/keyword_ranking_crawler/marketplaces.json", "r", encoding="utf-8") as f:
|
ACTIVE_ENV = 'prod'
|
||||||
|
|
||||||
|
|
||||||
|
config = {
|
||||||
|
"prod": {
|
||||||
|
"base_path" : "/home/ec2-user/keyword_ranking_crawler",
|
||||||
|
"marketplace_path": "/home/ec2-user/keyword_ranking_crawler/marketplaces.json",
|
||||||
|
"keyword_path": "/home/ec2-user/keyword_ranking_crawler/keywords.json",
|
||||||
|
"cookies_path": "/home/ec2-user/keyword_ranking_crawler/cookies.json",
|
||||||
|
"data_path": "/mnt/AmazonReports/Amazon/keyword_ranking"
|
||||||
|
},
|
||||||
|
"dev": {
|
||||||
|
"base_path" : "C:/Users/saif.haq/Desktop/Scrapper",
|
||||||
|
"marketplace_path": "marketplaces.json",
|
||||||
|
"keyword_path": "keywords.json",
|
||||||
|
"cookies_path": "cookies.json",
|
||||||
|
"data_path": "data"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
with open(config[ACTIVE_ENV]["marketplace_path"], "r", encoding="utf-8") as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
|
|
||||||
with open("/home/ec2-user/keyword_ranking_crawler/keywords.json", "r", encoding="utf-8") as f:
|
with open(config[ACTIVE_ENV]["keyword_path"], "r", encoding="utf-8") as f:
|
||||||
keywords = json.load(f)
|
keywords = json.load(f)
|
||||||
|
|
||||||
with open("/home/ec2-user/keyword_ranking_crawler/cookies.json", "r", encoding="utf-8") as f:
|
with open(config[ACTIVE_ENV]["cookies_path"], "r", encoding="utf-8") as f:
|
||||||
cookies_ref = json.load(f)
|
cookies_ref = json.load(f)
|
||||||
|
|
||||||
# Or if it's a Python dict already:
|
# Or if it's a Python dict already:
|
||||||
marketplaces = data["marketplaces"]
|
marketplaces = data["marketplaces"]
|
||||||
|
|
||||||
BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking'
|
BASE_PATH= config[ACTIVE_ENV]["data_path"]
|
||||||
#BASE_PATH= 'data'
|
MAX_PAGE = 2
|
||||||
MAX_PAGE = 10
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_driver():
|
def get_driver():
|
||||||
options = Options()
|
options = Options()
|
||||||
options.add_argument("--headless")
|
if ACTIVE_ENV == "prod":
|
||||||
|
options.add_argument("--headless")
|
||||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||||
driver = webdriver.Chrome(options=options)
|
driver = webdriver.Chrome(options=options)
|
||||||
return driver
|
return driver
|
||||||
|
@ -108,7 +130,7 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
|
||||||
url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}&page={page}"
|
url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}&page={page}"
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
ranks = []
|
ranks = []
|
||||||
COOKIE_FILE = f"/home/ec2-user/keyword_ranking_crawler/{cookies_ref[marketplace]['cookies_name']}";
|
COOKIE_FILE = f"{config[ACTIVE_ENV]['base_path']}/{cookies_ref[marketplace]['cookies_name']}";
|
||||||
# Load cookies if available
|
# Load cookies if available
|
||||||
if os.path.exists(COOKIE_FILE):
|
if os.path.exists(COOKIE_FILE):
|
||||||
load_cookies(driver, COOKIE_FILE)
|
load_cookies(driver, COOKIE_FILE)
|
||||||
|
@ -116,7 +138,7 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
|
||||||
else:
|
else:
|
||||||
print("No cookie file found, visiting fresh")
|
print("No cookie file found, visiting fresh")
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
sleep(5) # Give time to solve CAPTCHA manually (if needed)
|
sleep(2) # Give time to solve CAPTCHA manually (if needed)
|
||||||
save_cookies(driver, COOKIE_FILE)
|
save_cookies(driver, COOKIE_FILE)
|
||||||
|
|
||||||
sleep(3) # Wait for JS to load
|
sleep(3) # Wait for JS to load
|
||||||
|
@ -147,7 +169,6 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
|
||||||
})
|
})
|
||||||
count += 1
|
count += 1
|
||||||
except:
|
except:
|
||||||
print( f"[ERROR] Error getting Amazon Ranks for: {marketplace} {keyword} {page} {idx}" )
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
file_path = f"{BASE_PATH}/{int(time.time() * 1000)}-{marketplace}-{keyword}.json"
|
file_path = f"{BASE_PATH}/{int(time.time() * 1000)}-{marketplace}-{keyword}.json"
|
||||||
|
@ -169,7 +190,7 @@ try :
|
||||||
count = get_amazon_ranks(url, marketplace, ratingPrefix, keyword, i, count)
|
count = get_amazon_ranks(url, marketplace, ratingPrefix, keyword, i, count)
|
||||||
if count == -1:
|
if count == -1:
|
||||||
break
|
break
|
||||||
sleep(3)
|
sleep(2)
|
||||||
finally:
|
finally:
|
||||||
print("[INFO] Closing WebDriver...")
|
print("[INFO] Closing WebDriver...")
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
Loading…
Reference in New Issue