commit 19dddd7551c1f931d201930c147cfb7b6924c366 Author: saif Date: Mon Aug 4 17:57:25 2025 +0500 initial commit diff --git a/cookies.json b/cookies.json new file mode 100644 index 0000000..4596523 --- /dev/null +++ b/cookies.json @@ -0,0 +1,58 @@ +{ + "AMAZON_USA": { + "marketplace": "AMAZON_USA", + "cookies_name": "cookies/amazon_us_cookies.pkl" + }, + "AMAZON_CA": { + "marketplace": "AMAZON_CA", + "cookies_name":"cookies/amazon_ca_cookies.pkl" + }, + "AMAZON_SE": { + "marketplace": "AMAZON_SE", + "cookies_name": "cookies/amazon_se_cookies.pkl" + }, + "AMAZON_ES": { + "marketplace": "AMAZON_ES", + "cookies_name": "cookies/amazon_es_cookies.pkl" + }, + "AMAZON_FR": { + "marketplace": "AMAZON_FR", + "cookies_name": "cookies/amazon_fr_cookies.pkl" + }, + "AMAZON_IT": { + "marketplace": "AMAZON_IT", + "cookies_name": "cookies/amazon_it_cookies.pkl" + }, + "AMAZON_JP": { + "marketplace": "AMAZON_JP", + "cookies_name": "cookies/amazon_jb_cookies.pkl" + }, + "AMAZON_UK": { + "marketplace": "AMAZON_UK", + "cookies_name": "cookies/amazon_uk_cookies.pkl" + }, + "AMAZON_DE": { + "marketplace": "AMAZON_DE", + "cookies_name": "cookies/amazon_de_cookies.pkl" + }, + "AMAZON_MX": { + "marketplace": "AMAZON_MX", + "cookies_name": "cookies/amazon_mx_cookies.pkl" + }, + "AMAZON_AU": { + "marketplace": "AMAZON_AU", + "cookies_name": "cookies/amazon_au_cookies.pkl" + }, + "AMAZON_TR": { + "marketplace": "AMAZON_TR", + "cookies_name": "cookies/amazon_tr_cookies.pkl" + }, + "AMAZON_PL": { + "marketplace": "AMAZON_PL", + "cookies_name": "cookies/amazon_pl_cookies.pkl" + }, + "AMAZON_NL": { + "marketplace": "AMAZON_NL", + "cookies_name": "cookies/amazon_nl_cookies.pkl" + } +} diff --git a/cookies/amazon_au_cookies.pkl b/cookies/amazon_au_cookies.pkl new file mode 100644 index 0000000..bbe9fbc Binary files /dev/null and b/cookies/amazon_au_cookies.pkl differ diff --git a/cookies/amazon_ca_cookies.pkl b/cookies/amazon_ca_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_ca_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/cookies/amazon_de_cookies.pkl b/cookies/amazon_de_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_de_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/cookies/amazon_es_cookies.pkl b/cookies/amazon_es_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_es_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/cookies/amazon_fr_cookies.pkl b/cookies/amazon_fr_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_fr_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/cookies/amazon_it_cookies.pkl b/cookies/amazon_it_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_it_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/cookies/amazon_jb_cookies.pkl b/cookies/amazon_jb_cookies.pkl new file mode 100644 index 0000000..417976a Binary files /dev/null and b/cookies/amazon_jb_cookies.pkl differ diff --git a/cookies/amazon_mx_cookies.pkl b/cookies/amazon_mx_cookies.pkl new file mode 100644 index 0000000..00c14a8 Binary files /dev/null and b/cookies/amazon_mx_cookies.pkl differ diff --git a/cookies/amazon_nl_cookies.pkl b/cookies/amazon_nl_cookies.pkl new file mode 100644 index 0000000..54775cd Binary files /dev/null and b/cookies/amazon_nl_cookies.pkl differ diff --git a/cookies/amazon_pl_cookies.pkl b/cookies/amazon_pl_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_pl_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/cookies/amazon_se_cookies.pkl b/cookies/amazon_se_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_se_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/cookies/amazon_tr_cookies.pkl b/cookies/amazon_tr_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_tr_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/cookies/amazon_uk_cookies.pkl b/cookies/amazon_uk_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_uk_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/cookies/amazon_us_cookies.pkl b/cookies/amazon_us_cookies.pkl new file mode 100644 index 0000000..92c3c88 --- /dev/null +++ b/cookies/amazon_us_cookies.pkl @@ -0,0 +1 @@ +€]”. \ No newline at end of file diff --git a/marketplaces.json b/marketplaces.json new file mode 100644 index 0000000..e9d83f7 --- /dev/null +++ b/marketplaces.json @@ -0,0 +1,145 @@ +{ + "marketplaces": { + "AMAZON_USA": { + "url": "amazon.com", + "datePrefix": " on ", + "ratingPrefix": " out ", + "marketplaceId": "ATVPDKIKX0DER", + "sellerId": "A3AQP8TDYVYCGL", + "monsSelDirMcid": "amzn1.merchant.d.ACNTLSC3KHN32CA2ZGAMI6CIQOYA", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "spApiRegion": "na" + }, + "AMAZON_CA": { + "url": "amazon.ca", + "datePrefix": " on ", + "ratingPrefix": " out ", + "marketplaceId": "A2EUQ1WTGCTBG2", + "sellerId": "A3AQP8TDYVYCGL", + "monsSelDirMcid": "amzn1.merchant.d.ACNTLSC3KHN32CA2ZGAMI6CIQOYA", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : false + }, + "AMAZON_ES": { + "url": "amazon.es", + "datePrefix": " el ", + "ratingPrefix": " out ", + "marketplaceId": "A1RKKUPIHCS9HS", + "sellerId": "A2PQ31EAG2KKDB", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + }, + "AMAZON_MX": { + "url": "amazon.com.mx", + "datePrefix": " on ", + "ratingPrefix": " out ", + "marketplaceId": "A1AM78C64UM0Y8", + "sellerId": "A3AQP8TDYVYCGL", + "monsSelDirMcid": "amzn1.merchant.d.ACNTLSC3KHN32CA2ZGAMI6CIQOYA", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + }, + "AMAZON_DE": { + "url": "amazon.de", + "datePrefix": " on ", + "ratingPrefix": " out ", + "marketplaceId": "A1PA6795UKMFR9", + "sellerId": "A2PQ31EAG2KKDB", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + }, + "AMAZON_UK": { + "url": "amazon.co.uk", + "datePrefix": " on ", + "ratingPrefix": " out ", + "marketplaceId": "A1F83G8C2ARO7P", + "sellerId": "A2PQ31EAG2KKDB", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "spApiRegion": "eu" + }, + "AMAZON_FR": { + "url": "amazon.fr", + "datePrefix": " le ", + "ratingPrefix": " sur ", + "marketplaceId": "A13V1IB3VIYZZH", + "sellerId": "A2PQ31EAG2KKDB", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + }, + "AMAZON_IT": { + "url": "amazon.it", + "datePrefix": " il ", + "ratingPrefix": " out ", + "marketplaceId": "APJ6JRA9NG5V4", + "sellerId": "A2PQ31EAG2KKDB", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + }, + "AMAZON_JP": { + "url": "amazon.co.jp", + "datePrefix": " on ", + "ratingPrefix": " out ", + "marketplaceId": "A1VC38T7YXB528", + "sellerId": "ASXGWNT2IP97D", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + }, + "AMAZON_AU": { + "url": "amazon.com.au", + "datePrefix": " on ", + "ratingPrefix": " out ", + "marketplaceId": "A39IBJ37TRP1C6", + "sellerId": "A10ATVJQO4YLOJ", + "monsSelDirMcid": "amzn1.merchant.d.AC2O7FOGAVY6N2ZGFWJDEKXDAUAA", + "monsSelDirPaid": "amzn1.pa.d.ACBRNFLIYBKMXXHCO4MLCNV2OV3A", + "existsInEurope" : false + }, + "AMAZON_NL": { + "url": "amazon.nl", + "datePrefix": " op ", + "ratingPrefix": " van ", + "marketplaceId": "A1805IZSGTT6HS", + "sellerId": "A2PQ31EAG2KKDB", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + }, + "AMAZON_SE": { + "url": "amazon.se", + "datePrefix": " den ", + "ratingPrefix": " out ", + "marketplaceId": "A2NODRKZP88ZB9", + "sellerId": "A2PQ31EAG2KKDB", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + }, + "AMAZON_PL": { + "url": "amazon.pl", + "datePrefix": " dnia ", + "ratingPrefix": " out ", + "marketplaceId": "A1C3SOZRARQ6R3", + "sellerId": "A2PQ31EAG2KKDB", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + }, + "AMAZON_TR": { + "url": "amazon.com.tr", + "datePrefix": " ", + "dateSuffix": " tarihinde değerlendirildi", + "ratingPrefix": " out ", + "marketplaceId": "A33AVAJ2PDY3EV", + "sellerId": "A2PQ31EAG2KKDB", + "monsSelDirMcid": "amzn1.merchant.d.ABJPA72DQNTVVLRKE5THKMTM2S7Q", + "monsSelDirPaid": "amzn1.pa.d.ADY3WQHX65LDGRB7ZIY7L3H67AMA", + "existsInEurope" : true + } + } +} \ No newline at end of file diff --git a/scapper.py b/scapper.py new file mode 100644 index 0000000..180b818 --- /dev/null +++ b/scapper.py @@ -0,0 +1,109 @@ +import os +import pickle +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.common.by import By +from time import sleep +import json +import time + + +with open("marketplaces.json", "r", encoding="utf-8") as f: + data = json.load(f) + +with open("cookies.json", "r", encoding="utf-8") as f: + cookies_ref = json.load(f) + +# Or if it's a Python dict already: +marketplaces = data["marketplaces"] + + +def get_driver(): + options = Options() + options.add_argument("--headless") + options.add_argument("--disable-blink-features=AutomationControlled") + driver = webdriver.Chrome(options=options) + return driver + +def save_cookies(driver, path): + with open(path, "wb") as f: + pickle.dump(driver.get_cookies(), f) + + +def save_ranking(rankings, file_path): + with open(file_path, "w", encoding="utf-8") as f: + json.dump(rankings, f, ensure_ascii=False, indent=4) + +def load_cookies(driver, path): + with open(path, "rb") as f: + cookies = pickle.load(f) + for cookie in cookies: + if 'sameSite' in cookie: + cookie.pop('sameSite') # Optional fix if Chrome complains + driver.add_cookie(cookie) + + +def check_sponsored(item): + try: + # Check if any element inside contains the exact text "Sponsored" + sponsored_labels = item.find_elements(By.XPATH, './/*[contains(text(), "Sponsored")]') + for label in sponsored_labels: + if label.text.strip().lower() == "sponsored": + return 1 + except: + return 0 + +def check_consist_utopia( title ): + return 1 if "Utopia" in title else 0 + + + +def get_amazon_ranks(url, marketplace, keyword): + print( '[INFO] Getting Amazon Ranks for: ', marketplace, keyword) + url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}" + driver.get(url) + count =1 + ranks = [] + + COOKIE_FILE = f"{cookies_ref[marketplace]['cookies_name']}"; + print(COOKIE_FILE) + + # Load cookies if available + if os.path.exists(COOKIE_FILE): + load_cookies(driver, COOKIE_FILE) + driver.get(url) + else: + print("No cookie file found, visiting fresh") + driver.get(url) + sleep(5) # Give time to solve CAPTCHA manually (if needed) + save_cookies(driver, COOKIE_FILE) + + sleep(3) # Wait for JS to load + items = driver.find_elements(By.XPATH, '//div[contains(@class,"s-result-item") and @data-asin]') + for idx, item in enumerate(items, start=1): + asin = item.get_attribute("data-asin") + try: + sponsored = check_sponsored(item) + title = item.find_element(By.XPATH, './/h2//span').text + if title == 'Results': + continue + if sponsored == None : + ranks.append({'rank' : count , 'title' : title , 'marketplace' : marketplace , 'keyword': keyword, 'sponsored' : 0, 'asin' : asin , 'is_utopia' : check_consist_utopia(title) }) + count += 1 + except: + continue + + file_path = f"{int(time.time() * 1000)}-{marketplace}-{keyword}.json" + + save_ranking(ranks, file_path ) + + +driver = get_driver() +for marketplace, details in marketplaces.items(): + url = details['url'] + get_amazon_ranks(url, marketplace, 'pillows') +driver.quit() + + + + diff --git a/send-data.js b/send-data.js new file mode 100644 index 0000000..06fdf7a --- /dev/null +++ b/send-data.js @@ -0,0 +1,56 @@ +const axios = require( 'axios' ); +const fs = require( 'fs' ); +const path = require( 'path' ); + +(async function() { + + /** + * directory path + */ + let rootPath = '/mnt/AmazonReports/Amazon/keyword_ranking'; + let processedPath = rootPath + '/processed'; + + if ( ! fs.existsSync( rootPath ) ) { + fs.mkdirSync( rootPath ); + } + + if ( ! fs.existsSync( processedPath ) ) { + fs.mkdirSync( processedPath ); + } + + /** + * read all files in directory, send data to cosmos then move to processed + */ + const jsonFiles = fs.readdirSync(rootPath) + .filter(file => path.extname(file).toLowerCase() === '.json') + .map(file => { + const filePath = path.join(rootPath, file); + const stats = fs.statSync(filePath); + return { file, birthtime: stats.birthtime }; + }) + .sort((a, b) => a.birthtime - b.birthtime) // ASCENDING order + .map(entry => entry.file); // extract filenames + + + for ( const file of jsonFiles ) { + try { + // read contents of the file + const filePath = path.join( rootPath, file ); + const orders = JSON.parse( fs.readFileSync( filePath, 'utf-8' ) ); + let payload = { progressList: orders }; + console.log( `Processing: ${filePath}` ); + // send post request to cosmos + let res = await axios.post( config[environment].cosmos_path_orders_progress, payload, { + headers: { + 'Content-Type': 'application/json' + } + } ); + if ( res['status'] == 200 ) { + fs.renameSync( filePath, path.join( processedPath, file ) ) + } + } catch ( e ) { + console.log( e ); + } + } + +})(); \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..9f3583c --- /dev/null +++ b/test.py @@ -0,0 +1,14 @@ +import json +import requests + +# File to read +filename = "1754304649015-toys.json" + + + +# Read JSON file +with open(filename, "r", encoding="utf-8") as f: + data = json.load(f) + + +print(data) \ No newline at end of file