add profile
parent
c6a2442023
commit
c56d666619
1004
keywords.json
1004
keywords.json
File diff suppressed because it is too large
Load Diff
44
scrapper.py
44
scrapper.py
|
@ -3,6 +3,9 @@ import pickle
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
|
||||||
from time import sleep
|
from time import sleep
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
@ -21,17 +24,31 @@ with open("cookies.json", "r", encoding="utf-8") as f:
|
||||||
# Or if it's a Python dict already:
|
# Or if it's a Python dict already:
|
||||||
marketplaces = data["marketplaces"]
|
marketplaces = data["marketplaces"]
|
||||||
|
|
||||||
BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking'
|
#BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking'
|
||||||
#BASE_PATH= 'data'
|
BASE_PATH= 'data'
|
||||||
MAX_PAGE = 10
|
MAX_PAGE = 10
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_driver():
|
def get_driver():
|
||||||
options = Options()
|
options = Options()
|
||||||
options.add_argument("--headless")
|
#options.add_argument("--headless")
|
||||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
options.add_argument("--disable-blink-features=AutomationControlled") # Removes automation flag
|
||||||
driver = webdriver.Chrome(options=options)
|
options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
||||||
|
options.add_experimental_option('useAutomationExtension', False)
|
||||||
|
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
|
||||||
|
options.add_argument("--start-maximized")
|
||||||
|
options.add_argument("user-data-dir=/home/ec2-user/keyword_ranking_crawler/chrome_path")
|
||||||
|
options.add_argument("profile-directory=Default")
|
||||||
|
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
|
||||||
|
# Remove navigator.webdriver
|
||||||
|
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
||||||
|
"source": """
|
||||||
|
Object.defineProperty(navigator, 'webdriver', {
|
||||||
|
get: () => undefined
|
||||||
|
})
|
||||||
|
"""
|
||||||
|
})
|
||||||
return driver
|
return driver
|
||||||
|
|
||||||
def save_cookies(driver, path):
|
def save_cookies(driver, path):
|
||||||
|
@ -156,14 +173,15 @@ driver = get_driver()
|
||||||
|
|
||||||
for keyword in keywords:
|
for keyword in keywords:
|
||||||
for marketplace, details in marketplaces.items():
|
for marketplace, details in marketplaces.items():
|
||||||
url = details['url']
|
if marketplace == 'AMAZON_USA':
|
||||||
ratingPrefix = details['ratingPrefix']
|
url = details['url']
|
||||||
count =1
|
ratingPrefix = details['ratingPrefix']
|
||||||
for i in range(1, MAX_PAGE):
|
count =1
|
||||||
count = get_amazon_ranks(url, marketplace, ratingPrefix, keyword, i, count)
|
for i in range(1, MAX_PAGE):
|
||||||
if count == -1:
|
count = get_amazon_ranks(url, marketplace, ratingPrefix, keyword, i, count)
|
||||||
break
|
if count == -1:
|
||||||
sleep(3)
|
break
|
||||||
|
sleep(3)
|
||||||
driver.quit()
|
driver.quit()
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue