scrape listing metadata

This commit is contained in:
2025-11-24 22:13:58 +01:00
parent 61d30f0987
commit da702fb4e6

View File

@ -6,14 +6,16 @@
"""
from bs4 import BeautifulSoup
import datetime
from datetime import datetime
from tinydb import TinyDB, Query
import urllib3
import sys
import helper
from tqdm import tqdm
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def make_soup(url):
user_agent = {"user-agent": helper.get_random_user_agent()}
# print(user_agent)
@ -34,7 +36,14 @@ def search(search_term, max_pages, min_price, max_price):
for page_counter in range(1, max_pages + 1):
listing_url = (
base_url + "/s-anbieter:privat/anzeige:angebote/preis:" + str(min_price) + ":" + str(max_price) + "/seite:"+ str(page_counter)+"/"
base_url
+ "/s-anbieter:privat/anzeige:angebote/preis:"
+ str(min_price)
+ ":"
+ str(max_price)
+ "/seite:"
+ str(page_counter)
+ "/"
+ search_term.replace(" ", "-")
+ "/k0"
)
@ -44,7 +53,7 @@ def search(search_term, max_pages, min_price, max_price):
results = soup.find_all("li", class_="ad-listitem fully-clickable-card")
# print(len(results))
if(len(results) <= 0):
if len(results) <= 0:
break
for result in results:
@ -59,14 +68,56 @@ def search(search_term, max_pages, min_price, max_price):
return found_listings
def scrape_listing(url):
    """Fetch a single listing page and extract its metadata.

    The page is loaded through ``make_soup`` and queried by tag name, CSS
    class and element id (assumes ``make_soup`` returns a BeautifulSoup
    document -- confirm against its definition).

    Args:
        url: Address of the listing detail page.

    Returns:
        A dict with the keys ``title`` (str), ``price`` (int), ``id`` (int),
        ``zip_code`` (int), ``address`` (str), ``dateadded`` (datetime),
        ``first_image`` (value of the image's ``src`` attribute) and ``url``
        (the argument, unchanged).

    Raises:
        ValueError: If the price, id or zip code text is not an integer, or
            the date text does not match ``%d.%m.%Y``.
        IndexError: If fewer list items / detail boxes are present than the
            positions read below.
        AttributeError, TypeError: Presumably when an expected element is
            missing from the page (lookup yields ``None``).
    """
    page = make_soup(url)

    title = page.find("h1", class_="boxedarticle--title").text.strip()

    # The price is the first space-separated token of the price heading.
    price_text = page.find("h2", class_="boxedarticle--price").text.strip()
    price = int(price_text.split(" ")[0])

    # The listing id is read from the second direct <li> of the flex list.
    meta_items = page.find("ul", class_="flexlist text-light-800").find_all(
        "li", recursive=False
    )
    listing_id = int(meta_items[1].text)

    # First details box holds the locality, the second one the date.
    detail_boxes = page.find_all("div", class_="boxedarticle--details--full")

    # Locality text is "<zip> <rest>": split once at the first space.
    locality = detail_boxes[0].find("span", id="viewad-locality").text.strip()
    zip_text, _, address = locality.partition(" ")
    zip_code = int(zip_text)

    date_added = datetime.strptime(detail_boxes[1].span.text, "%d.%m.%Y")

    first_image = page.find("div", class_="galleryimage-element current").img["src"]

    return {
        "title": title,
        "price": price,
        "id": listing_id,
        "zip_code": zip_code,
        "address": address,
        "dateadded": date_added,
        "first_image": first_image,
        "url": url,
    }
# Script entry point: run one search, then scrape every result it returned.
if __name__ == "__main__":
# NOTE(review): the next two statements both assign found_listings, so only the
# second ("Gravelbike") result is used below. This looks like the removed and
# added side of a diff shown together -- confirm and keep only one of them.
found_listings = search("Fahrrad", max_pages=1, min_price = 24, max_price=42)
found_listings = search("Gravelbike", max_pages=5, min_price=300, max_price=900)
# Show the raw search result and how many entries it contains.
print(found_listings)
print(len(found_listings))
# url = "https://www.kleinanzeigen.de/s-anzeige/abus-bordo-6000k-hochwertiges-faltschloss-110-cm/3255935217-217-3407"
# listing = scrape_listing(url)
# print(listing)
# exit()
# Each entry of found_listings is passed to scrape_listing as a URL; tqdm
# wraps the iteration with a progress bar labelled "Scraping listings".
for url in tqdm(found_listings, desc="Scraping listings"):
try:
# NOTE(review): `listing` is not used further in the visible code.
listing = scrape_listing(url)
# print(listing)
except Exception as e:
# Any failure while scraping is caught here and reported together with
# the URL that caused it, rather than propagating.
print("An error occurred:", e)
print("URL: " + url)