parse more than 4 result pages
This commit is contained in:
parent
f3a8182366
commit
1e6cb13e79
|
@ -0,0 +1,22 @@
|
||||||
|
|
||||||
|
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
""" Author: Hendrik Schutter, mail@hendrikschutter.com
    Date of creation: 2022/05/31
    Date of last modification: 2022/05/31
"""
|
||||||
|
import random
|
||||||
|
|
||||||
|
def get_random_user_agent():
    """Return a randomly chosen browser user-agent string.

    Rotating the user-agent between requests makes the scraper's
    traffic look less uniform to the queried server.

    Returns:
        str: one of a fixed pool of real-world user-agent strings.
    """
    # NOTE: list order is part of the observable behavior for seeded
    # random.choice; keep it stable.
    uastrings = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10",
        "Mozilla/5.0 (Linux; Android 10.1; TV BOX) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 OPR/58.2.2878.53403",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
        "Dalvik/2.1.0 (Linux; U; Android 10; M2006C3MI MIUI/V12.0.15.0.QCRINXM)",
    ]
    return random.choice(uastrings)
|
|
@ -11,11 +11,14 @@ import datetime
|
||||||
from tinydb import TinyDB, Query
|
from tinydb import TinyDB, Query
|
||||||
import urllib3
|
import urllib3
|
||||||
import sys
|
import sys
|
||||||
|
import helper
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
def make_soup(url):
    """Fetch *url* over HTTP GET and return the response parsed with lxml.

    A random user-agent header (helper.get_random_user_agent) is attached
    so repeated requests do not all present an identical client signature.

    Args:
        url (str): fully qualified URL to fetch.

    Returns:
        BeautifulSoup: parsed document of the response body.
    """
    user_agent = {'user-agent': helper.get_random_user_agent()}
    #print(user_agent)
    # 10 = number of connection pools to cache in the PoolManager.
    http = urllib3.PoolManager(10, headers=user_agent)
    r = http.request("GET", url)
    return BeautifulSoup(r.data, 'lxml')
|
||||||
|
|
||||||
|
@ -24,13 +27,15 @@ def search_listing(search_term, db, max_pages):
|
||||||
#display page 1 (first)
|
#display page 1 (first)
|
||||||
#EU only
|
#EU only
|
||||||
#60 listings in one result page
|
#60 listings in one result page
|
||||||
|
#No auction
|
||||||
|
|
||||||
page_counter = 1
|
page_counter = 1
|
||||||
last_result_page = False
|
last_result_page = False
|
||||||
|
|
||||||
while not last_result_page:
|
while not last_result_page:
|
||||||
result_page_added = 0
|
result_page_added = 0
|
||||||
url = 'https://www.ebay.de/sch/i.html?_from=R40&_nkw=' + search_term + '&_sop=10&_pgn='+ str(page_counter) + '&LH_PrefLoc=3&_ipg=60'
|
url = 'https://www.ebay.de/sch/i.html?_from=R40&_nkw=' + search_term.replace(" ", "+") + '&_sop=10&LH_PrefLoc=3&LH_BIN=1&_pgn=' + str(page_counter)
|
||||||
|
|
||||||
#print ("Web Page: ", url)
|
#print ("Web Page: ", url)
|
||||||
|
|
||||||
soup = make_soup(url)
|
soup = make_soup(url)
|
||||||
|
@ -52,9 +57,8 @@ def search_listing(search_term, db, max_pages):
|
||||||
last_result_page = True
|
last_result_page = True
|
||||||
page_counter += 1
|
page_counter += 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Persist scraped listings in a local TinyDB JSON store; re-running
    # the script only adds listings not already recorded.
    search_db = TinyDB("search_db.json")
    search_listing("mainboard power pc", search_db, max_pages = 4)
    # Report how many listings are stored in total after the run.
    print(len(search_db.all()))
|
||||||
|
|
||||||
|
|
|
@ -11,33 +11,39 @@ import datetime
|
||||||
from tinydb import TinyDB, Query
|
from tinydb import TinyDB, Query
|
||||||
import urllib3
|
import urllib3
|
||||||
import sys
|
import sys
|
||||||
|
import random
|
||||||
|
|
||||||
|
import helper
|
||||||
|
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
|
|
||||||
|
|
||||||
def make_soup(url):
    """Fetch *url* over HTTP GET and return the response parsed with lxml.

    A random user-agent header (helper.get_random_user_agent) is attached
    so repeated requests do not all present an identical client signature.

    Args:
        url (str): fully qualified URL to fetch.

    Returns:
        BeautifulSoup: parsed document of the response body.
    """
    user_agent = {'user-agent': helper.get_random_user_agent()}
    #print(user_agent)
    # 10 = number of connection pools to cache in the PoolManager.
    http = urllib3.PoolManager(10, headers=user_agent)
    r = http.request("GET", url)
    return BeautifulSoup(r.data, 'lxml')
|
||||||
|
|
||||||
def seller_listings(seller_name, db):
|
def seller_listings(seller_name, db):
|
||||||
#sort by newest listing
|
#sort by newest listing
|
||||||
#display page 1 (first)
|
|
||||||
#EU only
|
|
||||||
#60 listings in one result page
|
|
||||||
|
|
||||||
page_counter = 1
|
page_counter = 1
|
||||||
last_result_page = False
|
last_result_page = False
|
||||||
|
|
||||||
while not last_result_page:
|
while not last_result_page:
|
||||||
result_page_added = 0
|
result_page_added = 0
|
||||||
url = 'https://www.ebay.de/sch/m.html?_ssn='+ seller_name + '&_sop=10&_pgn='+ str(page_counter)
|
|
||||||
print ("Web Page: ", url)
|
if page_counter == 1:
|
||||||
|
url = 'https://www.ebay.de/sch/m.html?_ssn='+ seller_name + '&_sop=10&_pgn='+ str(page_counter)
|
||||||
|
else:
|
||||||
|
url = 'https://www.ebay.de/sch/m.html?_ssn='+ seller_name + '&_sop=10&_pgn='+ str(page_counter) + '&_skc=' + str(60*(page_counter-1)) +'&rt=nc'
|
||||||
|
|
||||||
|
#print ("Web Page: ", url)
|
||||||
|
|
||||||
soup = make_soup(url)
|
soup = make_soup(url)
|
||||||
results = soup.find_all("li", class_="sresult lvresult clearfix li")
|
results = soup.find_all("li", class_="sresult lvresult clearfix li")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
for result in results:
|
for result in results:
|
||||||
try:
|
try:
|
||||||
rec = {
|
rec = {
|
||||||
|
@ -47,12 +53,10 @@ def seller_listings(seller_name, db):
|
||||||
if not db.search(Query().epid == rec["epid"]):
|
if not db.search(Query().epid == rec["epid"]):
|
||||||
result_page_added += 1
|
result_page_added += 1
|
||||||
db.insert(rec)
|
db.insert(rec)
|
||||||
|
|
||||||
except (AttributeError, KeyError) as ex:
|
except (AttributeError, KeyError) as ex:
|
||||||
pass
|
pass
|
||||||
if (result_page_added == 0):
|
if (result_page_added == 0):
|
||||||
last_result_page = True
|
last_result_page = True
|
||||||
|
|
||||||
page_counter += 1
|
page_counter += 1
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue