From da702fb4e6e147825a09884b599fe1fde1121193 Mon Sep 17 00:00:00 2001
From: localhorst
Date: Mon, 24 Nov 2025 22:13:58 +0100
Subject: [PATCH] scrape listing metadata

---
 search_kleinanzeigen.py | 85 ++++++++++++++++++++++++++++++++---------
 1 file changed, 68 insertions(+), 17 deletions(-)

diff --git a/search_kleinanzeigen.py b/search_kleinanzeigen.py
index 961ac45..cbb378c 100644
--- a/search_kleinanzeigen.py
+++ b/search_kleinanzeigen.py
@@ -1,19 +1,21 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-""" Author: Hendrik Schutter, mail@hendrikschutter.com
-    Date of creation: 2025/11/24
-    Date of last modification: 2025/11/24
+"""Author: Hendrik Schutter, mail@hendrikschutter.com
+Date of creation: 2025/11/24
+Date of last modification: 2025/11/24
 """
 
 from bs4 import BeautifulSoup
-import datetime
+from datetime import datetime
 from tinydb import TinyDB, Query
 import urllib3
 import sys
 import helper
+from tqdm import tqdm
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
+
 def make_soup(url):
     user_agent = {"user-agent": helper.get_random_user_agent()}
     # print(user_agent)
@@ -32,26 +34,33 @@ def search(search_term, max_pages, min_price, max_price):
 
     found_listings = set()
 
-    for page_counter in range(1, max_pages+1):
+    for page_counter in range(1, max_pages + 1):
         listing_url = (
-            base_url + "/s-anbieter:privat/anzeige:angebote/preis:" + str(min_price) + ":" + str(max_price) + "/seite:"+ str(page_counter)+"/"
+            base_url
+            + "/s-anbieter:privat/anzeige:angebote/preis:"
+            + str(min_price)
+            + ":"
+            + str(max_price)
+            + "/seite:"
+            + str(page_counter)
+            + "/"
             + search_term.replace(" ", "-")
             + "/k0"
         )
-        print ("Web Page: ", listing_url)
+        print("Web Page: ", listing_url)
         soup = make_soup(listing_url)
 
         results = soup.find_all("li", class_="ad-listitem fully-clickable-card")
-        #print(len(results))
+        # print(len(results))
 
-        if(len(results) <= 0):
+        if len(results) <= 0:
             break
 
         for result in results:
             try:
-                #print(result)
+                # print(result)
                 listing_url = result.a["href"]
-                #print("url: " + base_url + listing_url)
+                # print("url: " + base_url + listing_url)
                 found_listings.add(base_url + listing_url)
             except (AttributeError, KeyError) as ex:
                 pass
@@ -59,14 +68,56 @@
     return found_listings
 
 
+def scrape_listing(url):
+    # print("Web Page: ", url)
+    soup = make_soup(url)
+
+    listing = {
+        "title": soup.find("h1", class_="boxedarticle--title").text.strip(),
+        "price": int(
+            soup.find("h2", class_="boxedarticle--price").text.strip().split(" ")[0]
+        ),
+        "id": int(
+            soup.find("ul", class_="flexlist text-light-800")
+            .find_all("li", recursive=False)[1]
+            .text
+        ),
+        "zip_code": int(
+            soup.find_all("div", class_="boxedarticle--details--full")[0]
+            .find("span", id="viewad-locality")
+            .text.strip()
+            .split(" ")[0]
+        ),
+        "address": soup.find_all("div", class_="boxedarticle--details--full")[0]
+        .find("span", id="viewad-locality")
+        .text.strip()
+        .partition(" ")[2],
+        "dateadded": datetime.strptime(
+            soup.find_all("div", class_="boxedarticle--details--full")[1].span.text,
+            "%d.%m.%Y",
+        ),
+        "first_image": soup.find("div", class_="galleryimage-element current").img[
+            "src"
+        ],
+        "url": url,
+    }
+    return listing
+
+
 if __name__ == "__main__":
-    found_listings = search("Fahrrad", max_pages=1, min_price = 24, max_price=42)
+    found_listings = search("Gravelbike", max_pages=5, min_price=300, max_price=900)
     print(found_listings)
     print(len(found_listings))
+    # url = "https://www.kleinanzeigen.de/s-anzeige/abus-bordo-6000k-hochwertiges-faltschloss-110-cm/3255935217-217-3407"
+    # listing = scrape_listing(url)
+    # print(listing)
+    # exit()
 
 
-
-
-
-
-
\ No newline at end of file
+    for url in tqdm(found_listings, desc="Scraping listings"):
+        try:
+            listing = scrape_listing(url)
+            # print(listing)
+        except Exception as e:
+            print("An error occurred:", e)
+            print("URL: " + url)