From 1e6cb13e79fe10ec80913dfca9fd9e458fc8e8ec Mon Sep 17 00:00:00 2001
From: localhorst
Date: Tue, 31 May 2022 14:16:15 +0200
Subject: [PATCH] parse more than 4 result pages

---
 helper.py         | 22 ++++++++++++++++++++++
 search_listing.py | 14 +++++++++-----
 search_seller.py  | 26 +++++++++++++++-----------
 3 files changed, 46 insertions(+), 16 deletions(-)
 create mode 100644 helper.py

diff --git a/helper.py b/helper.py
new file mode 100644
index 0000000..30fb6aa
--- /dev/null
+++ b/helper.py
@@ -0,0 +1,22 @@
+
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+""" Author: Hendrik Schutter, mail@hendrikschutter.com
+    Date of creation: 2022/05/31
+    Date of last modification: 2022/05/31
+"""
+import random
+
+def get_random_user_agent():
+    uastrings = [
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36",\
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25",\
+        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",\
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",\
+        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/600.1.17 (KHTML, like Gecko) Version/7.1 Safari/537.85.10",\
+        "Mozilla/5.0 (Linux; Android 10.1; TV BOX) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 OPR/58.2.2878.53403",\
+        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",\
+        "Dalvik/2.1.0 (Linux; U; Android 10; M2006C3MI MIUI/V12.0.15.0.QCRINXM)"\
+    ]
+
+    return random.choice(uastrings)
\ No newline at end of file
diff --git a/search_listing.py b/search_listing.py
index b26c335..3f0d042 100644
--- a/search_listing.py
+++ b/search_listing.py
@@ -11,11 +11,14 @@ import datetime
 from tinydb import TinyDB, Query
 import urllib3
 import sys
+import helper
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 def make_soup(url):
-    http = urllib3.PoolManager()
+    user_agent = {'user-agent': helper.get_random_user_agent()}
+    #print(user_agent)
+    http = urllib3.PoolManager(10, headers=user_agent)
     r = http.request("GET", url)
     return BeautifulSoup(r.data,'lxml')
 
@@ -24,13 +27,15 @@ def search_listing(search_term, db, max_pages):
     #display page 1 (first)
     #EU only
     #60 listings in one result page
+    #No auction
 
     page_counter = 1
     last_result_page = False
 
     while not last_result_page:
         result_page_added = 0
-        url = 'https://www.ebay.de/sch/i.html?_from=R40&_nkw=' + search_term + '&_sop=10&_pgn='+ str(page_counter) + '&LH_PrefLoc=3&_ipg=60'
+        url = 'https://www.ebay.de/sch/i.html?_from=R40&_nkw=' + search_term.replace(" ", "+") + '&_sop=10&LH_PrefLoc=3&LH_BIN=1&_pgn=' + str(page_counter)
+        #print ("Web Page: ", url)
 
         soup = make_soup(url)
 
@@ -52,9 +57,8 @@ def search_listing(search_term, db, max_pages):
             last_result_page = True
         page_counter += 1
 
-
 if __name__ == "__main__":
     search_db = TinyDB("search_db.json")
-    search_listing("mainboard", search_db, max_pages = 4)
-    print(search_db.all())
+    search_listing("mainboard power pc", search_db, max_pages = 4)
+    print(len(search_db.all()))
 
diff --git a/search_seller.py b/search_seller.py
index 15c45a1..368cea4 100644
--- a/search_seller.py
+++ b/search_seller.py
@@ -11,33 +11,39 @@ import datetime
 from tinydb import TinyDB, Query
 import urllib3
 import sys
+import random
+
+import helper
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
+
 def make_soup(url):
-    http = urllib3.PoolManager()
+    user_agent = {'user-agent': helper.get_random_user_agent()}
+    #print(user_agent)
+    http = urllib3.PoolManager(10, headers=user_agent)
     r = http.request("GET", url)
     return BeautifulSoup(r.data,'lxml')
 
 def seller_listings(seller_name, db):
     #sort by newest listing
-    #display page 1 (first)
-    #EU only
-    #60 listings in one result page
-
+
     page_counter = 1
     last_result_page = False
 
     while not last_result_page:
         result_page_added = 0
-        url = 'https://www.ebay.de/sch/m.html?_ssn='+ seller_name + '&_sop=10&_pgn='+ str(page_counter)
-        print ("Web Page: ", url)
+
+        if page_counter == 1:
+            url = 'https://www.ebay.de/sch/m.html?_ssn='+ seller_name + '&_sop=10&_pgn='+ str(page_counter)
+        else:
+            url = 'https://www.ebay.de/sch/m.html?_ssn='+ seller_name + '&_sop=10&_pgn='+ str(page_counter) + '&_skc=' + str(60*(page_counter-1)) +'&rt=nc'
+
+        #print ("Web Page: ", url)
 
         soup = make_soup(url)
 
         results = soup.find_all("li", class_="sresult lvresult clearfix li")
-
-
         for result in results:
             try:
                 rec = {
@@ -47,12 +53,10 @@ def seller_listings(seller_name, db):
                 if not db.search(Query().epid == rec["epid"]):
                     result_page_added += 1
                     db.insert(rec)
-
             except (AttributeError, KeyError) as ex:
                 pass
         if (result_page_added == 0):
             last_result_page = True
-
         page_counter += 1
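
Note (not part of the patch): a minimal standalone sketch of the pagination scheme that the search_seller.py change introduces. The first result page is requested with _pgn only, while every later page additionally carries an _skc skip offset of 60 listings per already-fetched page plus rt=nc, which is what lets the scraper walk past the fourth result page. The helper name build_seller_url and the seller name used below are illustrative, not identifiers from the repository.

#!/usr/bin/env python3
# Illustrative sketch only -- mirrors the URL construction from the patch above,
# it is not shipped by it. build_seller_url is a hypothetical helper name.

LISTINGS_PER_PAGE = 60  # the result layout used here shows 60 listings per page

def build_seller_url(seller_name, page_counter):
    # Page 1 is requested with the page number only.
    url = 'https://www.ebay.de/sch/m.html?_ssn=' + seller_name + '&_sop=10&_pgn=' + str(page_counter)
    if page_counter > 1:
        # Later pages add _skc (number of listings already skipped) and rt=nc.
        url += '&_skc=' + str(LISTINGS_PER_PAGE * (page_counter - 1)) + '&rt=nc'
    return url

if __name__ == "__main__":
    # Print the URLs for the first five result pages of a placeholder seller.
    for page in range(1, 6):
        print(build_seller_url("example_seller", page))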