added selenium config
parent
beddc579c4
commit
65eae04ac0
28
scrapper.py
28
scrapper.py
|
@ -53,13 +53,37 @@ BASE_PATH= config[ACTIVE_ENV]["data_path"]
|
||||||
MAX_PAGE = 2
|
MAX_PAGE = 2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_driver():
    """Build a Chrome WebDriver configured to look like a regular browser.

    Chrome runs headless only when ``ACTIVE_ENV`` is ``"prod"``; every other
    option below is applied in all environments.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it when done.

    Raises:
        Exception: Whatever ``webdriver.Chrome`` or ``execute_cdp_cmd``
            raises. If the CDP call fails, the already-started browser is
            shut down before the exception propagates.
    """
    options = Options()
    if ACTIVE_ENV == "prod":
        options.add_argument("--headless")

    # Command-line switches, kept in their original order.
    for flag in (
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "--disable-extensions",
        "--start-maximized",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--window-size=1920,1080",
    ):
        options.add_argument(flag)

    # Set a realistic user agent
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )

    # Experimental options to hide automation
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    # NOTE(review): a value of 2 appears to be Chrome's "block" content
    # setting. This file also calls save_cookies(driver, ...), so confirm
    # that blocking cookies here is intended.
    options.add_experimental_option("prefs", {
        "profile.default_content_setting_values.cookies": 2
    })

    driver = webdriver.Chrome(options=options)
    try:
        # Registered to run at the start of every new document so that
        # navigator.webdriver reads as undefined to page scripts.
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                    get: () => undefined
                });
            """
        })
    except Exception:
        # The browser process is already running at this point; without an
        # explicit quit() it would be orphaned when the exception escapes.
        driver.quit()
        raise
    return driver
|
||||||
|
|
||||||
def save_cookies(driver, path):
|
def save_cookies(driver, path):
|
||||||
|
@ -143,7 +167,7 @@ def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
|
||||||
sleep(1) # Give time to solve CAPTCHA manually (if needed)
|
sleep(1) # Give time to solve CAPTCHA manually (if needed)
|
||||||
save_cookies(driver, COOKIE_FILE)
|
save_cookies(driver, COOKIE_FILE)
|
||||||
|
|
||||||
sleep(2) # Wait for JS to load
|
sleep(1) # Wait for JS to load
|
||||||
items = driver.find_elements(By.XPATH, '//div[contains(@class,"s-result-item") and @data-asin]')
|
items = driver.find_elements(By.XPATH, '//div[contains(@class,"s-result-item") and @data-asin]')
|
||||||
for idx, item in enumerate(items, start=1):
|
for idx, item in enumerate(items, start=1):
|
||||||
try:
|
try:
|
||||||
|
|
Loading…
Reference in New Issue