main
saif 2025-08-08 17:21:44 +05:00
parent e4f3652d6d
commit 06f6ee9646
5 changed files with 343 additions and 16 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
node_modules/

268
package-lock.json generated Normal file
View File

@ -0,0 +1,268 @@
{
"name": "Scrapper",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"dependencies": {
"axios": "^1.11.0"
}
},
"node_modules/asynckit": {
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
"integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="
},
"node_modules/axios": {
"version": "1.11.0",
"resolved": "https://registry.npmjs.org/axios/-/axios-1.11.0.tgz",
"integrity": "sha512-1Lx3WLFQWm3ooKDYZD1eXmoGO9fxYQjrycfHFC8P0sCfQVXyROp0p9PFWBehewBOdCwHc+f/b8I0fMto5eSfwA==",
"dependencies": {
"follow-redirects": "^1.15.6",
"form-data": "^4.0.4",
"proxy-from-env": "^1.1.0"
}
},
"node_modules/call-bind-apply-helpers": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
"integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
"dependencies": {
"es-errors": "^1.3.0",
"function-bind": "^1.1.2"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/combined-stream": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
"integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
"dependencies": {
"delayed-stream": "~1.0.0"
},
"engines": {
"node": ">= 0.8"
}
},
"node_modules/delayed-stream": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
"integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
"engines": {
"node": ">=0.4.0"
}
},
"node_modules/dunder-proto": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
"integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
"dependencies": {
"call-bind-apply-helpers": "^1.0.1",
"es-errors": "^1.3.0",
"gopd": "^1.2.0"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/es-define-property": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
"integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
"engines": {
"node": ">= 0.4"
}
},
"node_modules/es-errors": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
"integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
"engines": {
"node": ">= 0.4"
}
},
"node_modules/es-object-atoms": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
"integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
"dependencies": {
"es-errors": "^1.3.0"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/es-set-tostringtag": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz",
"integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==",
"dependencies": {
"es-errors": "^1.3.0",
"get-intrinsic": "^1.2.6",
"has-tostringtag": "^1.0.2",
"hasown": "^2.0.2"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/follow-redirects": {
"version": "1.15.11",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.11.tgz",
"integrity": "sha512-deG2P0JfjrTxl50XGCDyfI97ZGVCxIpfKYmfyrQ54n5FO/0gfIES8C/Psl6kWVDolizcaaxZJnTS0QSMxvnsBQ==",
"funding": [
{
"type": "individual",
"url": "https://github.com/sponsors/RubenVerborgh"
}
],
"engines": {
"node": ">=4.0"
},
"peerDependenciesMeta": {
"debug": {
"optional": true
}
}
},
"node_modules/form-data": {
"version": "4.0.4",
"resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.4.tgz",
"integrity": "sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==",
"dependencies": {
"asynckit": "^0.4.0",
"combined-stream": "^1.0.8",
"es-set-tostringtag": "^2.1.0",
"hasown": "^2.0.2",
"mime-types": "^2.1.12"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/function-bind": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
"integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/get-intrinsic": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
"integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
"dependencies": {
"call-bind-apply-helpers": "^1.0.2",
"es-define-property": "^1.0.1",
"es-errors": "^1.3.0",
"es-object-atoms": "^1.1.1",
"function-bind": "^1.1.2",
"get-proto": "^1.0.1",
"gopd": "^1.2.0",
"has-symbols": "^1.1.0",
"hasown": "^2.0.2",
"math-intrinsics": "^1.1.0"
},
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/get-proto": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
"integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
"dependencies": {
"dunder-proto": "^1.0.1",
"es-object-atoms": "^1.0.0"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/gopd": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
"integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/has-symbols": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
"integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/has-tostringtag": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz",
"integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==",
"dependencies": {
"has-symbols": "^1.0.3"
},
"engines": {
"node": ">= 0.4"
},
"funding": {
"url": "https://github.com/sponsors/ljharb"
}
},
"node_modules/hasown": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
"integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
"dependencies": {
"function-bind": "^1.1.2"
},
"engines": {
"node": ">= 0.4"
}
},
"node_modules/math-intrinsics": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
"integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
"engines": {
"node": ">= 0.4"
}
},
"node_modules/mime-db": {
"version": "1.52.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
"integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
"engines": {
"node": ">= 0.6"
}
},
"node_modules/mime-types": {
"version": "2.1.35",
"resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
"integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
"dependencies": {
"mime-db": "1.52.0"
},
"engines": {
"node": ">= 0.6"
}
},
"node_modules/proxy-from-env": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz",
"integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg=="
}
}
}

5
package.json Normal file
View File

@ -0,0 +1,5 @@
{
"dependencies": {
"axios": "^1.11.0"
}
}

View File

@ -6,6 +6,7 @@ from selenium.webdriver.common.by import By
from time import sleep from time import sleep
import json import json
import time import time
import re
with open("marketplaces.json", "r", encoding="utf-8") as f: with open("marketplaces.json", "r", encoding="utf-8") as f:
@ -18,6 +19,8 @@ with open("cookies.json", "r", encoding="utf-8") as f:
marketplaces = data["marketplaces"] marketplaces = data["marketplaces"]
BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking' BASE_PATH= '/mnt/AmazonReports/Amazon/keyword_ranking'
MAX_PAGE = 10
def get_driver(): def get_driver():
@ -31,8 +34,40 @@ def save_cookies(driver, path):
with open(path, "wb") as f: with open(path, "wb") as f:
pickle.dump(driver.get_cookies(), f) pickle.dump(driver.get_cookies(), f)
def parse_number_with_commas(str):
    """Convert a comma-grouped count such as "1,234" into an int.

    Args:
        str: Text to convert. Commas are removed before calling int().
            The name shadows the built-in ``str``; it is kept unchanged so
            existing callers (including keyword callers) keep working.

    Returns:
        The parsed integer, or 0 when the value cannot be converted
        (non-numeric text, empty text, or a non-string such as None).
    """
    try:
        return int(str.replace(",", ""))
    except Exception:
        # Best effort: an unparseable count is reported as 0. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit still stop the scraper.
        return 0
def parse_rating(ratingHtml, prefix):
    """Extract the numeric star rating from a rating element.

    Args:
        ratingHtml: Object exposing ``get_attribute("innerHTML")`` that
            returns the rating text (e.g. "4.5 out of 5 stars").
        prefix: Separator text; everything before its first occurrence is
            treated as the number (e.g. " out of").

    Returns:
        The rating as a float, or "" when it cannot be read or parsed.
        The empty-string fallback is kept so existing consumers of the
        'rating' field see the same value as before on failure.
    """
    try:
        rating_str = ratingHtml.get_attribute("innerHTML")
        number_text = rating_str.split(prefix)[0].strip()
        # A rating is a single small number, so a comma here can only be a
        # decimal mark (e.g. "4,5"); normalise it so float() accepts it.
        return float(number_text.replace(",", "."))
    except Exception:
        # Best effort: a missing or malformed rating yields "". Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return ""
def parse_bought_count(text):
    """Parse a "bought in past month" label into an integer count.

    Handles plain and comma-grouped numbers plus a K / M multiplier, in any
    letter case, with an optional trailing "+": "500+ bought" -> 500,
    "1,000+ bought" -> 1000, "2K+ bought" -> 2000, "1.5k+" -> 1500.

    Args:
        text: Label text; may be None or empty.

    Returns:
        The count as an int, or 0 when text is empty or holds no parseable
        number.
    """
    if not text:
        return 0

    # The number must start with a digit. Otherwise stray punctuation before
    # it (e.g. the "." in "Approx. 500+") is matched first and float() raises.
    match = re.search(r'(\d[\d,.]*)([KM]?)\+?', text.upper())
    if not match:
        return 0

    number, unit = match.groups()
    try:
        number = float(number.replace(',', ''))
    except ValueError:
        # Malformed numerals such as "1.2.3" are treated as "no count".
        return 0

    if unit == 'K':
        return int(number * 1_000)
    elif unit == 'M':
        return int(number * 1_000_000)
    else:
        return int(number)
def save_ranking(rankings, file_path): def save_ranking(rankings, file_path):
if len(rankings) > 0 :
with open(file_path, "w", encoding="utf-8") as f: with open(file_path, "w", encoding="utf-8") as f:
json.dump(rankings, f, ensure_ascii=False, indent=4) json.dump(rankings, f, ensure_ascii=False, indent=4)
@ -60,16 +95,12 @@ def check_consist_utopia( title ):
def get_amazon_ranks(url, marketplace, keyword): def get_amazon_ranks(url, marketplace, ratingPrefix, keyword, page, count):
print( '[INFO] Getting Amazon Ranks for: ', marketplace, keyword) print( '[INFO] Getting Amazon Ranks for: ', marketplace, keyword, page)
url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}" url = f"https://www.{url}/s?k={keyword.replace(' ', '+')}&page={page}"
driver.get(url) driver.get(url)
count =1
ranks = [] ranks = []
COOKIE_FILE = f"{cookies_ref[marketplace]['cookies_name']}"; COOKIE_FILE = f"{cookies_ref[marketplace]['cookies_name']}";
print(COOKIE_FILE)
# Load cookies if available # Load cookies if available
if os.path.exists(COOKIE_FILE): if os.path.exists(COOKIE_FILE):
load_cookies(driver, COOKIE_FILE) load_cookies(driver, COOKIE_FILE)
@ -87,23 +118,46 @@ def get_amazon_ranks(url, marketplace, keyword):
try: try:
sponsored = check_sponsored(item) sponsored = check_sponsored(item)
title = item.find_element(By.XPATH, './/h2//span').text title = item.find_element(By.XPATH, './/h2//span').text
rating = item.find_element(By.XPATH, './/span[@class="a-icon-alt"]')
reviews_count = item.find_element(By.XPATH, './/span[@class="a-size-base s-underline-text"]').text
last_month_bought = item.find_element(By.XPATH, './/span[contains(@class, "a-size-base a-color-secondary") and contains(text(), "bought")]').text
if title == 'Results': if title == 'Results':
continue continue
if sponsored == None : if sponsored == None :
ranks.append({'rank' : count , 'title' : title , 'marketplace' : marketplace , 'keyword': keyword, 'sponsored' : 0, 'asin' : asin , 'is_utopia' : check_consist_utopia(title) }) ranks.append({
'rank' : count,
'title' : title,
'marketplace' : marketplace,
'keyword': keyword,
'sponsored' : 0,
'asin' : asin,
'is_utopia' : check_consist_utopia(title),
'url' : url,
'rating' : parse_rating(rating,ratingPrefix),
'reviews_count' : parse_number_with_commas (reviews_count ),
'last_month_bought': parse_bought_count (last_month_bought)
})
count += 1 count += 1
except: except:
continue continue
file_path = f"{BASE_PATH}/{int(time.time() * 1000)}-{marketplace}-{keyword}.json" file_path = f"{BASE_PATH}/{int(time.time() * 1000)}-{marketplace}-{keyword}.json"
save_ranking(ranks, file_path ) save_ranking(ranks, file_path )
if( len(ranks) == 0 ):
return -1
return count
driver = get_driver() driver = get_driver()
for marketplace, details in marketplaces.items(): for marketplace, details in marketplaces.items():
url = details['url'] url = details['url']
get_amazon_ranks(url, marketplace, 'pillows') ratingPrefix = details['ratingPrefix']
count =1
for i in range(1, MAX_PAGE):
count = get_amazon_ranks(url, marketplace, ratingPrefix, 'comforter navy queen', i, count)
if count == -1:
break
sleep(3)
driver.quit() driver.quit()

View File

@ -36,11 +36,10 @@ const path = require( 'path' );
try { try {
// read contents of the file // read contents of the file
const filePath = path.join( rootPath, file ); const filePath = path.join( rootPath, file );
const orders = JSON.parse( fs.readFileSync( filePath, 'utf-8' ) ); const rankings = JSON.parse( fs.readFileSync( filePath, 'utf-8' ) );
let payload = { progressList: orders };
console.log( `Processing: ${filePath}` ); console.log( `Processing: ${filePath}` );
// send post request to cosmos // send post request to cosmos
let res = await axios.post( config[environment].cosmos_path_orders_progress, payload, { let res = await axios.post( 'https://cosmos.utopiadeals.com/cosmos/rest/bulk-insert/amazon-keyword-rankings', rankings, {
headers: { headers: {
'Content-Type': 'application/json' 'Content-Type': 'application/json'
} }