scrape listing metadata

This commit is contained in:
2025-11-24 22:13:58 +01:00
parent 61d30f0987
commit da702fb4e6

View File

@ -1,19 +1,21 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" Author: Hendrik Schutter, mail@hendrikschutter.com """Author: Hendrik Schutter, mail@hendrikschutter.com
Date of creation: 2025/11/24 Date of creation: 2025/11/24
Date of last modification: 2025/11/24 Date of last modification: 2025/11/24
""" """
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import datetime from datetime import datetime
from tinydb import TinyDB, Query from tinydb import TinyDB, Query
import urllib3 import urllib3
import sys import sys
import helper import helper
from tqdm import tqdm
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def make_soup(url): def make_soup(url):
user_agent = {"user-agent": helper.get_random_user_agent()} user_agent = {"user-agent": helper.get_random_user_agent()}
# print(user_agent) # print(user_agent)
@ -32,26 +34,33 @@ def search(search_term, max_pages, min_price, max_price):
found_listings = set() found_listings = set()
for page_counter in range(1, max_pages+1): for page_counter in range(1, max_pages + 1):
listing_url = ( listing_url = (
base_url + "/s-anbieter:privat/anzeige:angebote/preis:" + str(min_price) + ":" + str(max_price) + "/seite:"+ str(page_counter)+"/" base_url
+ "/s-anbieter:privat/anzeige:angebote/preis:"
+ str(min_price)
+ ":"
+ str(max_price)
+ "/seite:"
+ str(page_counter)
+ "/"
+ search_term.replace(" ", "-") + search_term.replace(" ", "-")
+ "/k0" + "/k0"
) )
print ("Web Page: ", listing_url) print("Web Page: ", listing_url)
soup = make_soup(listing_url) soup = make_soup(listing_url)
results = soup.find_all("li", class_="ad-listitem fully-clickable-card") results = soup.find_all("li", class_="ad-listitem fully-clickable-card")
#print(len(results)) # print(len(results))
if(len(results) <= 0): if len(results) <= 0:
break break
for result in results: for result in results:
try: try:
#print(result) # print(result)
listing_url = result.a["href"] listing_url = result.a["href"]
#print("url: " + base_url + listing_url) # print("url: " + base_url + listing_url)
found_listings.add(base_url + listing_url) found_listings.add(base_url + listing_url)
except (AttributeError, KeyError) as ex: except (AttributeError, KeyError) as ex:
pass pass
@ -59,14 +68,56 @@ def search(search_term, max_pages, min_price, max_price):
return found_listings return found_listings
def scrape_listing(url):
    """Scrape a single kleinanzeigen.de listing page into a metadata dict.

    Fetches *url* via make_soup and extracts title, price, listing id,
    zip code, address, posting date, first gallery image and the url
    itself. Raises AttributeError / IndexError / ValueError when the
    expected page structure is missing or a numeric field fails to parse
    (callers are expected to catch these per listing).
    """
    # print("Web Page: ", url)
    soup = make_soup(url)

    # Both detail boxes: [0] holds the locality, [1] holds the post date.
    detail_boxes = soup.find_all("div", class_="boxedarticle--details--full")

    # Locality text looks like "<zip_code> <rest of address>".
    # NOTE(review): assumes a single space separates zip from address — TODO confirm.
    locality = detail_boxes[0].find("span", id="viewad-locality").text.strip()
    zip_part, _, address_part = locality.partition(" ")

    # Listing id is the second top-level <li> of the flex list.
    id_text = (
        soup.find("ul", class_="flexlist text-light-800")
        .find_all("li", recursive=False)[1]
        .text
    )

    # Price text looks like "<amount> €" (or similar); keep the first token.
    price_text = soup.find("h2", class_="boxedarticle--price").text.strip()

    # Date is rendered as DD.MM.YYYY in the second detail box.
    posted_on = datetime.strptime(detail_boxes[1].span.text, "%d.%m.%Y")

    first_image_src = soup.find("div", class_="galleryimage-element current").img["src"]

    return {
        "title": soup.find("h1", class_="boxedarticle--title").text.strip(),
        "price": int(price_text.split(" ")[0]),
        "id": int(id_text),
        "zip_code": int(zip_part),
        "address": address_part,
        "dateadded": posted_on,
        "first_image": first_image_src,
        "url": url,
    }
if __name__ == "__main__": if __name__ == "__main__":
found_listings = search("Fahrrad", max_pages=1, min_price = 24, max_price=42) found_listings = search("Gravelbike", max_pages=5, min_price=300, max_price=900)
print(found_listings) print(found_listings)
print(len(found_listings)) print(len(found_listings))
# url = "https://www.kleinanzeigen.de/s-anzeige/abus-bordo-6000k-hochwertiges-faltschloss-110-cm/3255935217-217-3407"
# listing = scrape_listing(url)
# print(listing)
# exit()
for url in tqdm(found_listings, desc="Scraping listings"):
try:
listing = scrape_listing(url)
# print(listing)
except Exception as e:
print("An error occurred:", e)
print("URL: " + url)