add profile
parent
c6a2442023
commit
c56d666619
1004
keywords.json
1004
keywords.json
File diff suppressed because it is too large
Load Diff
44
scrapper.py
44
scrapper.py
|
@ -3,6 +3,9 @@ import pickle
|
|||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
|
||||
from time import sleep
|
||||
import json
|
||||
import time
|
||||
|
@ -21,17 +24,31 @@ with open("cookies.json", "r", encoding="utf-8") as f:
|
|||
# Or if it's a Python dict already:
|
||||
marketplaces = data["marketplaces"]
|
||||
|
||||
BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking'
|
||||
#BASE_PATH= 'data'
|
||||
#BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking'
|
||||
BASE_PATH= 'data'
|
||||
MAX_PAGE = 10
|
||||
|
||||
|
||||
|
||||
def get_driver():
|
||||
options = Options()
|
||||
options.add_argument("--headless")
|
||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
driver = webdriver.Chrome(options=options)
|
||||
#options.add_argument("--headless")
|
||||
options.add_argument("--disable-blink-features=AutomationControlled") # Removes automation flag
|
||||
options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
||||
options.add_experimental_option('useAutomationExtension', False)
|
||||
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
|
||||
options.add_argument("--start-maximized")
|
||||
options.add_argument("user-data-dir=/home/ec2-user/keyword_ranking_crawler/chrome_path")
|
||||
options.add_argument("profile-directory=Default")
|
||||
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
||||
# Remove navigator.webdriver
|
||||
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
||||
"source": """
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
})
|
||||
"""
|
||||
})
|
||||
return driver
|
||||
|
||||
def save_cookies(driver, path):
|
||||
|
@ -156,14 +173,15 @@ driver = get_driver()
|
|||
|
||||
for keyword in keywords:
|
||||
for marketplace, details in marketplaces.items():
|
||||
url = details['url']
|
||||
ratingPrefix = details['ratingPrefix']
|
||||
count =1
|
||||
for i in range(1, MAX_PAGE):
|
||||
count = get_amazon_ranks(url, marketplace, ratingPrefix, keyword, i, count)
|
||||
if count == -1:
|
||||
break
|
||||
sleep(3)
|
||||
if marketplace == 'AMAZON_USA':
|
||||
url = details['url']
|
||||
ratingPrefix = details['ratingPrefix']
|
||||
count =1
|
||||
for i in range(1, MAX_PAGE):
|
||||
count = get_amazon_ranks(url, marketplace, ratingPrefix, keyword, i, count)
|
||||
if count == -1:
|
||||
break
|
||||
sleep(3)
|
||||
driver.quit()
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue