scrape listing metadata

This commit is contained in:
2025-11-24 22:13:58 +01:00
parent 61d30f0987
commit da702fb4e6

View File

@ -6,14 +6,16 @@
""" """
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import datetime from datetime import datetime
from tinydb import TinyDB, Query from tinydb import TinyDB, Query
import urllib3 import urllib3
import sys import sys
import helper import helper
from tqdm import tqdm
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def make_soup(url): def make_soup(url):
user_agent = {"user-agent": helper.get_random_user_agent()} user_agent = {"user-agent": helper.get_random_user_agent()}
# print(user_agent) # print(user_agent)
@ -34,7 +36,14 @@ def search(search_term, max_pages, min_price, max_price):
for page_counter in range(1, max_pages + 1): for page_counter in range(1, max_pages + 1):
listing_url = ( listing_url = (
base_url + "/s-anbieter:privat/anzeige:angebote/preis:" + str(min_price) + ":" + str(max_price) + "/seite:"+ str(page_counter)+"/" base_url
+ "/s-anbieter:privat/anzeige:angebote/preis:"
+ str(min_price)
+ ":"
+ str(max_price)
+ "/seite:"
+ str(page_counter)
+ "/"
+ search_term.replace(" ", "-") + search_term.replace(" ", "-")
+ "/k0" + "/k0"
) )
@ -44,7 +53,7 @@ def search(search_term, max_pages, min_price, max_price):
results = soup.find_all("li", class_="ad-listitem fully-clickable-card") results = soup.find_all("li", class_="ad-listitem fully-clickable-card")
# print(len(results)) # print(len(results))
if(len(results) <= 0): if len(results) <= 0:
break break
for result in results: for result in results:
@ -59,14 +68,56 @@ def search(search_term, max_pages, min_price, max_price):
return found_listings return found_listings
def scrape_listing(url):
    """Fetch a single Kleinanzeigen ad page and extract its metadata.

    Returns a dict with keys: title, price (int, EUR), id (int),
    zip_code (int), address, dateadded (datetime), first_image (URL of
    the currently shown gallery image), and the original url.

    Raises AttributeError/IndexError/ValueError when the page does not
    match the expected markup — the caller is expected to catch these.
    """
    # print("Web Page: ", url)
    soup = make_soup(url)

    title = soup.find("h1", class_="boxedarticle--title").text.strip()

    # Price text looks like "123 € VB"; the leading token is the amount.
    price_text = soup.find("h2", class_="boxedarticle--price").text.strip()
    price = int(price_text.split(" ")[0])

    # The ad id sits in the second direct <li> of the details list.
    detail_items = soup.find(
        "ul", class_="flexlist text-light-800"
    ).find_all("li", recursive=False)
    ad_id = int(detail_items[1].text)

    detail_boxes = soup.find_all("div", class_="boxedarticle--details--full")

    # Locality reads "<zip> <rest of address>".
    locality = detail_boxes[0].find("span", id="viewad-locality").text.strip()
    zip_code = int(locality.split(" ")[0])
    address = locality.partition(" ")[2]

    # Posting date in German format, e.g. "24.11.2025".
    date_added = datetime.strptime(detail_boxes[1].span.text, "%d.%m.%Y")

    first_image = soup.find(
        "div", class_="galleryimage-element current"
    ).img["src"]

    return {
        "title": title,
        "price": price,
        "id": ad_id,
        "zip_code": zip_code,
        "address": address,
        "dateadded": date_added,
        "first_image": first_image,
        "url": url,
    }
if __name__ == "__main__":
    # Collect matching ad URLs first, then scrape each ad's detail page.
    found_listings = search("Gravelbike", max_pages=5, min_price=300, max_price=900)
    print(found_listings)
    print(len(found_listings))

    # url = "https://www.kleinanzeigen.de/s-anzeige/abus-bordo-6000k-hochwertiges-faltschloss-110-cm/3255935217-217-3407"
    # listing = scrape_listing(url)
    # print(listing)
    # exit()

    # Best-effort scraping: a single malformed/expired ad must not abort the run.
    for url in tqdm(found_listings, desc="Scraping listings"):
        try:
            listing = scrape_listing(url)
            # print(listing)
        except Exception as e:
            print("An error occurred:", e)
            print("URL: " + url)