124 lines
3.5 KiB
Python
124 lines
3.5 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""Author: Hendrik Schutter, mail@hendrikschutter.com
|
|
Date of creation: 2025/11/24
|
|
Date of last modification: 2025/11/24
|
|
"""
|
|
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
from tinydb import TinyDB, Query
|
|
import urllib3
|
|
import sys
|
|
import helper
|
|
from tqdm import tqdm
|
|
|
|
# Module-level side effect: suppress urllib3's InsecureRequestWarning for the
# whole process as soon as this module is imported.
# NOTE(review): make_soup() builds its PoolManager with default TLS settings;
# confirm whether this suppression is still required.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
|
|
|
|
|
def make_soup(url):
    """Download ``url`` and parse the response body into a soup tree.

    A random user agent obtained from ``helper.get_random_user_agent()`` is
    sent with the request.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the response body using the
        ``lxml`` parser. The HTTP status code is not inspected, so error
        pages are parsed like any other response.

    Any exception raised by urllib3 while performing the request propagates
    to the caller unchanged.
    """
    user_agent = {"user-agent": helper.get_random_user_agent()}

    # A fresh pool is created per call; using it as a context manager makes
    # sure its connections are released instead of lingering until garbage
    # collection. The body is preloaded by request(), so reading
    # ``response.data`` inside the block is sufficient.
    with urllib3.PoolManager(10, headers=user_agent) as http:
        response = http.request("GET", url)
        return BeautifulSoup(response.data, "lxml")
|
|
|
|
|
|
def search(search_term, max_pages, min_price, max_price):
    """Collect listing URLs from kleinanzeigen.de search result pages.

    Result pages are requested one after another, starting at page 1, until
    either ``max_pages`` pages were fetched or a page contains no result
    items.

    Args:
        search_term: Search phrase; spaces are replaced by ``-`` to build the
            URL path segment.
        max_pages: Maximum number of result pages to request.
        min_price: Lower bound inserted into the ``preis:`` URL filter.
        max_price: Upper bound inserted into the ``preis:`` URL filter.

    Returns:
        A ``set`` of absolute listing URLs (``base_url`` + the ``href`` of the
        first link inside each result item).
    """
    # Sorted by newest listing
    # No Geo distance
    # Only private sellers
    # Only offerings

    base_url = "https://www.kleinanzeigen.de"
    # Loop-invariant: the search phrase as it appears in the URL path.
    search_slug = search_term.replace(" ", "-")

    found_listings = set()

    for page_counter in range(1, max_pages + 1):
        page_url = (
            base_url
            + "/s-anbieter:privat/anzeige:angebote/preis:"
            + str(min_price)
            + ":"
            + str(max_price)
            + "/seite:"
            + str(page_counter)
            + "/"
            + search_slug
            + "/k0"
        )

        print("Web Page: ", page_url)
        soup = make_soup(page_url)
        results = soup.find_all("li", class_="ad-listitem fully-clickable-card")

        # An empty page means there is nothing further to paginate through.
        if not results:
            break

        for result in results:
            # ``result.a`` is None when the item holds no link; subscripting
            # it would raise TypeError, so check explicitly and skip items
            # without a usable href instead of aborting the whole search.
            anchor = result.a
            href = anchor.get("href") if anchor is not None else None
            if not href:
                continue
            found_listings.add(base_url + href)

    return found_listings
|
|
|
|
|
|
def scrape_listing(url):
    """Fetch a single listing page and extract its key fields.

    Args:
        url: Absolute URL of the listing page.

    Returns:
        A dict with the keys ``title`` (str), ``price`` (int), ``id`` (int),
        ``zip_code`` (int), ``address`` (str), ``dateadded`` (datetime),
        ``first_image`` (value of the image ``src`` attribute) and ``url``
        (the ``url`` argument).

    Raises:
        AttributeError, IndexError, TypeError, KeyError, ValueError: If an
            expected element is missing from the page or a numeric/date field
            cannot be parsed (for example a price without a leading number).
            Nothing is caught here; the caller decides how to handle
            incomplete listings.
    """
    soup = make_soup(url)

    # Both detail boxes are looked up once and reused below.
    details = soup.find_all("div", class_="boxedarticle--details--full")

    # Locality text is "<zip> <rest of address>"; split at the first space.
    locality = details[0].find("span", id="viewad-locality").text.strip()
    zip_text, _, address = locality.partition(" ")

    # First token of the price text is the amount. Remove "." so that values
    # written with a thousands separator (e.g. "1.200") parse as integers.
    price_text = soup.find("h2", class_="boxedarticle--price").text.strip()
    price = int(price_text.split(" ")[0].replace(".", ""))

    listing_id = int(
        soup.find("ul", class_="flexlist text-light-800")
        .find_all("li", recursive=False)[1]
        .text
    )

    # strptime rejects surrounding whitespace, so strip the text first.
    date_added = datetime.strptime(details[1].span.text.strip(), "%d.%m.%Y")

    first_image = soup.find("div", class_="galleryimage-element current").img["src"]

    return {
        "title": soup.find("h1", class_="boxedarticle--title").text.strip(),
        "price": price,
        "id": listing_id,
        # NOTE(review): converting to int drops leading zeros of the zip
        # code; kept as int for compatibility with existing consumers.
        "zip_code": int(zip_text),
        "address": address,
        "dateadded": date_added,
        "first_image": first_image,
        "url": url,
    }
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: search for a fixed term, print the URLs that were
    # found together with their count, then try to scrape every one of them.
    found_listings = search("Gravelbike", max_pages=5, min_price=300, max_price=900)
    print(found_listings)
    print(len(found_listings))

    # To debug a single listing, call scrape_listing() with its URL directly
    # and print the returned dict.

    progress = tqdm(found_listings, desc="Scraping listings")
    for listing_url in progress:
        try:
            listing = scrape_listing(listing_url)
        except Exception as err:
            # Keep going on failures; report the error and the offending URL.
            print("An error occurred:", err)
            print(f"URL: {listing_url}")
|