fixes
parent
bf29a7dd87
commit
4dd819d876
20
scrapper.py
20
scrapper.py
|
@ -1,13 +1,10 @@
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
import glob
|
|
||||||
import shutil
|
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.webdriver.chrome.service import Service
|
from selenium.webdriver.chrome.service import Service
|
||||||
from webdriver_manager.chrome import ChromeDriverManager
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
import tempfile
|
|
||||||
|
|
||||||
|
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
@ -15,7 +12,6 @@ import json
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
|
|
||||||
chrome_profile_path = "/home/ec2-user/keyword_ranking_crawler/chrome_path_copy"
|
|
||||||
|
|
||||||
with open("marketplaces.json", "r", encoding="utf-8") as f:
|
with open("marketplaces.json", "r", encoding="utf-8") as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
|
@ -29,6 +25,7 @@ with open("cookies.json", "r", encoding="utf-8") as f:
|
||||||
# Or if it's a Python dict already:
|
# Or if it's a Python dict already:
|
||||||
marketplaces = data["marketplaces"]
|
marketplaces = data["marketplaces"]
|
||||||
|
|
||||||
|
chrome_profile_path = '/home/ec2-user/keyword_ranking_crawler/chrome_path'
|
||||||
BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking'
|
BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking'
|
||||||
#BASE_PATH= 'data'
|
#BASE_PATH= 'data'
|
||||||
MAX_PAGE = 10
|
MAX_PAGE = 10
|
||||||
|
@ -36,19 +33,6 @@ MAX_PAGE = 10
|
||||||
|
|
||||||
|
|
||||||
def get_driver():
|
def get_driver():
|
||||||
# Create a unique temp folder for this run
|
|
||||||
temp_profile = tempfile.mkdtemp()
|
|
||||||
# Copy profile but skip lock files
|
|
||||||
for item in os.listdir(chrome_profile_path):
|
|
||||||
s = os.path.join(chrome_profile_path, item)
|
|
||||||
d = os.path.join(temp_profile, item)
|
|
||||||
if item.startswith("Singleton"): # Skip lock files
|
|
||||||
continue
|
|
||||||
if os.path.isdir(s):
|
|
||||||
shutil.copytree(s, d, dirs_exist_ok=True)
|
|
||||||
else:
|
|
||||||
shutil.copy2(s, d)
|
|
||||||
|
|
||||||
options = Options()
|
options = Options()
|
||||||
#options.add_argument("--headless")
|
#options.add_argument("--headless")
|
||||||
options.add_argument("--disable-blink-features=AutomationControlled") # Removes automation flag
|
options.add_argument("--disable-blink-features=AutomationControlled") # Removes automation flag
|
||||||
|
@ -56,8 +40,8 @@ def get_driver():
|
||||||
options.add_experimental_option('useAutomationExtension', False)
|
options.add_experimental_option('useAutomationExtension', False)
|
||||||
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
|
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")
|
||||||
options.add_argument("--start-maximized")
|
options.add_argument("--start-maximized")
|
||||||
options.add_argument(f"user-data-dir={temp_profile}")
|
|
||||||
options.add_argument("profile-directory=Default")
|
options.add_argument("profile-directory=Default")
|
||||||
|
options.add_argument(f"user-data-dir={chrome_profile_path}")
|
||||||
driver = webdriver.Chrome( service=Service(ChromeDriverManager().install()),options=options)
|
driver = webdriver.Chrome( service=Service(ChromeDriverManager().install()),options=options)
|
||||||
# Remove navigator.webdriver
|
# Remove navigator.webdriver
|
||||||
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
||||||
|
|
Loading…
Reference in New Issue