split backend

commit e9ef75030f
parent 8a1f078435
2025-11-26 11:30:02 +01:00
3 changed files with 319 additions and 225 deletions

.gitignore

@@ -1,2 +1,3 @@
__pycache__/helper.cpython-313.pyc
backend/zip_cache.json
backend/__pycache__/kleinanzeigen_scrape.cpython-313.pyc

backend/kleinanzeigen_scrape.py

@@ -0,0 +1,305 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from datetime import datetime
import urllib3
import random
import requests
import time
import json
import os

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class KleinanzeigenScraper:
    def __init__(self, config, metrics=None):
        """
        Initialize the scraper with configuration

        Args:
            config: Configuration dictionary
            metrics: Optional metrics dictionary to track statistics
        """
        self.config = config
        self.metrics = metrics
        self.zip_cache = {}
        self.cache_file = config["cache"]["zip_cache_file"]

        # Load cache
        self._load_cache()

    def _load_cache(self):
        """Load ZIP code cache from file"""
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "r", encoding="utf-8") as f:
                self.zip_cache = json.load(f)
            print(f"Loaded {len(self.zip_cache)} ZIP codes from cache")

    def _save_cache(self):
        """Save ZIP code cache to file"""
        with open(self.cache_file, "w", encoding="utf-8") as f:
            json.dump(self.zip_cache, f, ensure_ascii=False, indent=2)

    def get_random_user_agent(self):
        """Generate random user agent string"""
        return random.choice(self.config["user_agents"])

    def make_soup(self, url):
        """Fetch URL and return BeautifulSoup object"""
        user_agent = {"user-agent": self.get_random_user_agent()}
        http = urllib3.PoolManager(10, headers=user_agent)
        try:
            r = http.request("GET", url)

            # Track response code
            if self.metrics is not None:
                status_code = str(r.status)
                if self.config["apis"]["kleinanzeigen"]["base_url"] in url:
                    self.metrics["kleinanzeigen_response_codes"][status_code] = (
                        self.metrics["kleinanzeigen_response_codes"].get(status_code, 0)
                        + 1
                    )

            return BeautifulSoup(r.data, "lxml")
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            if self.metrics is not None:
                if self.config["apis"]["kleinanzeigen"]["base_url"] in url:
                    self.metrics["kleinanzeigen_response_codes"]["error"] = (
                        self.metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
                    )
            raise

    def geocode_zip(self, zip_code):
        """Geocode ZIP code using Nominatim API with caching"""
        zip_code = str(zip_code)

        # Check cache first
        if zip_code in self.zip_cache:
            return self.zip_cache[zip_code]

        # Call Nominatim API
        url = self.config["apis"]["nominatim"]["url"]
        params = {
            "postalcode": zip_code,
            "country": "Germany",
            "format": "json",
            "limit": 1,
        }
        try:
            response = requests.get(
                url,
                params=params,
                headers={"user-agent": self.config["apis"]["nominatim"]["user_agent"]},
            )

            # Track response code
            if self.metrics is not None:
                status_code = str(response.status_code)
                self.metrics["nominatim_response_codes"][status_code] = (
                    self.metrics["nominatim_response_codes"].get(status_code, 0) + 1
                )

            data = response.json()
            if data:
                coords = {"lat": float(data[0]["lat"]), "lon": float(data[0]["lon"])}
                self.zip_cache[zip_code] = coords

                # Save cache
                self._save_cache()

                time.sleep(self.config["scraping"]["geocoding_delay"])
                return coords
        except Exception as e:
            print(f"Geocoding error for {zip_code}: {e}")
            if self.metrics is not None:
                self.metrics["nominatim_response_codes"]["error"] = (
                    self.metrics["nominatim_response_codes"].get("error", 0) + 1
                )

        return None

    def search_listings(self, search_term, max_pages, min_price, max_price):
        """Search for listings on kleinanzeigen.de - returns only URLs"""
        base_url = self.config["apis"]["kleinanzeigen"]["base_url"]
        found_listings = set()

        for page_counter in range(1, max_pages + 1):
            listing_url = (
                base_url
                + "/s-anbieter:privat/anzeige:angebote/preis:"
                + str(min_price)
                + ":"
                + str(max_price)
                + "/seite:"
                + str(page_counter)
                + "/"
                + search_term.replace(" ", "-")
                + "/k0"
            )
            print(f"Scraping page {page_counter}: {listing_url}")

            try:
                soup = self.make_soup(listing_url)
                results = soup.find_all("li", class_="ad-listitem fully-clickable-card")
                if len(results) <= 0:
                    break

                for result in results:
                    try:
                        listing_href = result.a["href"]
                        found_listings.add(base_url + listing_href)
                    except (AttributeError, KeyError):
                        pass
            except Exception as e:
                print(f"Error scraping page {page_counter}: {e}")
                break

        return list(found_listings)

    def scrape_listing(self, url):
        """Scrape individual listing details"""
        try:
            soup = self.make_soup(url)
            if self.metrics is not None:
                self.metrics["scrape_requests"] += 1

            title = soup.find("h1", class_="boxedarticle--title")
            if not title:
                return None
            title = title.text.strip()

            price_elem = soup.find("h2", class_="boxedarticle--price")
            price = 0
            if price_elem:
                price_text = price_elem.text.strip().split(" ")[0]
                try:
                    price = int(price_text.replace(".", "").replace(",", ""))
                except:
                    price = 0

            flexlist = soup.find("ul", class_="flexlist text-light-800")
            listing_id = 0
            if flexlist:
                flex_items = flexlist.find_all("li", recursive=False)
                if len(flex_items) > 1:
                    try:
                        listing_id = int(flex_items[1].text.strip())
                    except:
                        pass

            locality = soup.find("span", id="viewad-locality")
            zip_code = None
            address = ""
            if locality:
                locality_text = locality.text.strip()
                parts = locality_text.split(" ", 1)
                if parts:
                    zip_code = parts[0]
                if len(parts) > 1:
                    address = parts[1]

            date_added = None
            details_divs = soup.find_all("div", class_="boxedarticle--details--full")
            if len(details_divs) > 1:
                date_span = details_divs[1].find("span")
                if date_span:
                    try:
                        date_added = datetime.strptime(date_span.text, "%d.%m.%Y")
                    except:
                        pass

            first_image = None
            img_elem = soup.find("div", class_="galleryimage-element current")
            if img_elem:
                img = img_elem.find("img")
                if img and img.get("src"):
                    first_image = img["src"]

            if not zip_code:
                return None

            listing = {
                "title": title,
                "price": price,
                "id": listing_id,
                "zip_code": zip_code,
                "address": address,
                "date_added": date_added.isoformat() if date_added else None,
                "image": first_image,
                "url": url,
            }

            # Add coordinates
            coords = self.geocode_zip(zip_code)
            if coords and isinstance(coords, dict):
                listing["lat"] = coords.get("lat")
                listing["lon"] = coords.get("lon")

            return listing
        except Exception as e:
            print(f"Error scraping listing {url}: {e}")
            return None


def main():
    """Test the scraper functionality"""
    print("=== Kleinanzeigen Scraper Test ===\n")

    # Load config
    config_file = "config.json"
    if not os.path.exists(config_file):
        print(f"ERROR: {config_file} not found!")
        return

    with open(config_file, "r", encoding="utf-8") as f:
        config = json.load(f)

    # Initialize scraper
    scraper = KleinanzeigenScraper(config)

    # Test parameters
    search_term = "Fahrrad"
    max_pages = 1
    min_price = 100
    max_price = 500

    print(f"Searching for: {search_term}")
    print(f"Price range: {min_price}€ - {max_price}€")
    print(f"Max pages: {max_pages}\n")

    # Search for listings
    print("Step 1: Searching for listing URLs...")
    listing_urls = scraper.search_listings(search_term, max_pages, min_price, max_price)
    print(f"Found {len(listing_urls)} listings\n")

    if len(listing_urls) > 0:
        # Scrape first listing as test
        print("Step 2: Scraping first listing details...")
        test_url = listing_urls[0]
        print(f"URL: {test_url}")

        listing = scraper.scrape_listing(test_url)
        if listing:
            print("\nListing details:")
            print(f" Title: {listing['title']}")
            print(f" Price: €{listing['price']}")
            print(f" ID: {listing['id']}")
            print(f" Location: {listing['address']} ({listing['zip_code']})")
            print(f" Date: {listing['date_added']}")
            print(f" Coordinates: {listing.get('lat')}, {listing.get('lon')}")
            print(f" Image: {listing['image']}")
        else:
            print("Failed to scrape listing")

    print(f"\n=== Test completed ===")
    print(f"ZIP cache size: {len(scraper.zip_cache)}")


if __name__ == "__main__":
    main()
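The class above takes every setting from the config dict loaded out of config.json. As a reading aid, a minimal sketch of the structure it expects is shown below, written as the Python dict that json.load would produce; the key paths mirror the lookups in the code, while the concrete values and the names example_config and test_scraper are illustrative assumptions, not the repository's actual configuration. The server file further down additionally reads scraping.session_timeout, scraping.listings_per_page, scraping.max_workers, scraping.min_workers, scraping.rate_limit_delay and the server block, which are omitted here.

# Sketch only: the nested keys KleinanzeigenScraper reads; all values are made up.
from kleinanzeigen_scrape import KleinanzeigenScraper

example_config = {
    "cache": {"zip_cache_file": "zip_cache.json"},
    "scraping": {"geocoding_delay": 1.0},  # pause (seconds) after each Nominatim lookup
    "user_agents": ["Mozilla/5.0 (X11; Linux x86_64)"],
    "apis": {
        "nominatim": {
            "url": "https://nominatim.openstreetmap.org/search",
            "user_agent": "zip-geocoder-example/0.1",
        },
        "kleinanzeigen": {"base_url": "https://www.kleinanzeigen.de"},
    },
}

test_scraper = KleinanzeigenScraper(example_config)  # metrics dict is optional
print(test_scraper.geocode_zip("10115"))  # {'lat': ..., 'lon': ...} on success, None otherwise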

@@ -3,22 +3,18 @@
from flask import Flask, request, jsonify
from flask_cors import CORS
from bs4 import BeautifulSoup
from datetime import datetime
import urllib3
import random
import requests
import time
import json
import os
import uuid
import threading
import random
from kleinanzeigen_scrape import KleinanzeigenScraper
app = Flask(__name__)
CORS(app)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Load configuration
CONFIG_FILE = "config.json"
config = {}
@@ -31,23 +27,15 @@ else:
    exit(1)
# Configuration values
CACHE_FILE = config["cache"]["zip_cache_file"]
SESSION_TIMEOUT = config["scraping"]["session_timeout"]
LISTINGS_PER_PAGE = config["scraping"]["listings_per_page"]
MAX_WORKERS = config["scraping"]["max_workers"]
MIN_WORKERS = config["scraping"]["min_workers"]
RATE_LIMIT_DELAY = config["scraping"]["rate_limit_delay"]
GEOCODING_DELAY = config["scraping"]["geocoding_delay"]
USER_AGENTS = config["user_agents"]
NOMINATIM_URL = config["apis"]["nominatim"]["url"]
NOMINATIM_USER_AGENT = config["apis"]["nominatim"]["user_agent"]
KLEINANZEIGEN_BASE_URL = config["apis"]["kleinanzeigen"]["base_url"]
SERVER_HOST = config["server"]["host"]
SERVER_PORT = config["server"]["port"]
SERVER_DEBUG = config["server"]["debug"]
# Global state
zip_cache = {}
scrape_sessions = {}
app_start_time = time.time()
@@ -59,14 +47,18 @@ metrics = {
    "nominatim_response_codes": {},
}
# Initialize scraper
scraper = KleinanzeigenScraper(config, metrics)
def cleanup_old_sessions():
"""Remove sessions older than SESSION_TIMEOUT"""
session_timeout = config["scraping"]["session_timeout"]
current_time = time.time()
sessions_to_remove = []
for session_id, session in scrape_sessions.items():
if current_time - session.get("created_at", current_time) > SESSION_TIMEOUT:
if current_time - session.get("created_at", current_time) > session_timeout:
sessions_to_remove.append(session_id)
for session_id in sessions_to_remove:
@ -76,213 +68,13 @@ def cleanup_old_sessions():
return len(sessions_to_remove)
def get_random_user_agent():
    """Generate random user agent string"""
    return random.choice(USER_AGENTS)


def make_soup(url):
    """Fetch URL and return BeautifulSoup object"""
    user_agent = {"user-agent": get_random_user_agent()}
    http = urllib3.PoolManager(10, headers=user_agent)
    try:
        r = http.request("GET", url)

        # Track response code
        status_code = str(r.status)
        if KLEINANZEIGEN_BASE_URL in url:
            metrics["kleinanzeigen_response_codes"][status_code] = (
                metrics["kleinanzeigen_response_codes"].get(status_code, 0) + 1
            )

        return BeautifulSoup(r.data, "lxml")
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        if KLEINANZEIGEN_BASE_URL in url:
            metrics["kleinanzeigen_response_codes"]["error"] = (
                metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
            )
        raise


def geocode_zip(zip_code):
    """Geocode ZIP code using Nominatim API with caching"""
    zip_code = str(zip_code)

    # Check cache first
    if zip_code in zip_cache:
        return zip_cache[zip_code]

    # Call Nominatim API
    params = {
        "postalcode": zip_code,
        "country": "Germany",
        "format": "json",
        "limit": 1,
    }
    try:
        response = requests.get(
            NOMINATIM_URL, params=params, headers={"user-agent": NOMINATIM_USER_AGENT}
        )

        # Track response code
        status_code = str(response.status_code)
        metrics["nominatim_response_codes"][status_code] = (
            metrics["nominatim_response_codes"].get(status_code, 0) + 1
        )

        data = response.json()
        if data:
            coords = {"lat": float(data[0]["lat"]), "lon": float(data[0]["lon"])}
            zip_cache[zip_code] = coords

            # Save cache
            with open(CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(zip_cache, f, ensure_ascii=False, indent=2)

            time.sleep(GEOCODING_DELAY)
            return coords
    except Exception as e:
        print(f"Geocoding error for {zip_code}: {e}")
        metrics["nominatim_response_codes"]["error"] = (
            metrics["nominatim_response_codes"].get("error", 0) + 1
        )

    return None


def search_listings(search_term, max_pages, min_price, max_price):
    """Search for listings on kleinanzeigen.de - returns only URLs"""
    found_listings = set()

    for page_counter in range(1, max_pages + 1):
        listing_url = (
            KLEINANZEIGEN_BASE_URL
            + "/s-anbieter:privat/anzeige:angebote/preis:"
            + str(min_price)
            + ":"
            + str(max_price)
            + "/seite:"
            + str(page_counter)
            + "/"
            + search_term.replace(" ", "-")
            + "/k0"
        )
        print(f"Scraping page {page_counter}: {listing_url}")

        try:
            soup = make_soup(listing_url)
            results = soup.find_all("li", class_="ad-listitem fully-clickable-card")
            if len(results) <= 0:
                break

            for result in results:
                try:
                    listing_href = result.a["href"]
                    found_listings.add(KLEINANZEIGEN_BASE_URL + listing_href)
                except (AttributeError, KeyError):
                    pass
        except Exception as e:
            print(f"Error scraping page {page_counter}: {e}")
            break

    return list(found_listings)


def scrape_listing(url):
    """Scrape individual listing details"""
    try:
        soup = make_soup(url)
        metrics["scrape_requests"] += 1

        title = soup.find("h1", class_="boxedarticle--title")
        if not title:
            return None
        title = title.text.strip()

        price_elem = soup.find("h2", class_="boxedarticle--price")
        price = 0
        if price_elem:
            price_text = price_elem.text.strip().split(" ")[0]
            try:
                price = int(price_text.replace(".", "").replace(",", ""))
            except:
                price = 0

        flexlist = soup.find("ul", class_="flexlist text-light-800")
        listing_id = 0
        if flexlist:
            flex_items = flexlist.find_all("li", recursive=False)
            if len(flex_items) > 1:
                try:
                    listing_id = int(flex_items[1].text.strip())
                except:
                    pass

        locality = soup.find("span", id="viewad-locality")
        zip_code = None
        address = ""
        if locality:
            locality_text = locality.text.strip()
            parts = locality_text.split(" ", 1)
            if parts:
                zip_code = parts[0]
            if len(parts) > 1:
                address = parts[1]

        date_added = None
        details_divs = soup.find_all("div", class_="boxedarticle--details--full")
        if len(details_divs) > 1:
            date_span = details_divs[1].find("span")
            if date_span:
                try:
                    date_added = datetime.strptime(date_span.text, "%d.%m.%Y")
                except:
                    pass

        first_image = None
        img_elem = soup.find("div", class_="galleryimage-element current")
        if img_elem:
            img = img_elem.find("img")
            if img and img.get("src"):
                first_image = img["src"]

        if not zip_code:
            return None

        listing = {
            "title": title,
            "price": price,
            "id": listing_id,
            "zip_code": zip_code,
            "address": address,
            "date_added": date_added.isoformat() if date_added else None,
            "image": first_image,
            "url": url,
        }

        # Add coordinates
        coords = geocode_zip(zip_code)
        if coords and isinstance(coords, dict):
            listing["lat"] = coords.get("lat")
            listing["lon"] = coords.get("lon")

        return listing
    except Exception as e:
        print(f"Error scraping listing {url}: {e}")
        return None


def scrape_listing_wrapper(session_id, url, results, index):
    """Wrapper for scraping listing in thread"""
    session = scrape_sessions.get(session_id)
    if not session:
        return

    listing = scrape_listing(url)
    listing = scraper.scrape_listing(url)
    results[index] = listing
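After the split, scrape_listing_wrapper is the only scraping glue the server keeps: it checks that the session still exists and writes the scraper's result into a fixed slot, which lets an endpoint start one thread per URL and collect the results in input order. A minimal sketch of that fan-out pattern is shown below; the helper name dispatch_scrape and the batching by max_workers are assumptions made for the illustration, not the code used further down in this file.

def dispatch_scrape(session_id, urls, max_workers):
    # Sketch of the fan-out pattern implied by scrape_listing_wrapper:
    # run at most max_workers threads at a time and keep results positional.
    results = [None] * len(urls)
    for start in range(0, len(urls), max_workers):
        batch = []
        for index in range(start, min(start + max_workers, len(urls))):
            worker = threading.Thread(
                target=scrape_listing_wrapper,
                args=(session_id, urls[index], results, index),
            )
            worker.start()
            batch.append(worker)
        for worker in batch:
            worker.join()
    return [listing for listing in results if listing]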
@@ -358,7 +150,9 @@ def api_search():
    try:
        # Search for listing URLs only
        listing_urls = search_listings(search_term, max_pages, min_price, max_price)
        listing_urls = scraper.search_listings(
            search_term, max_pages, min_price, max_price
        )
        # Limit to requested number
        listing_urls = listing_urls[:num_listings]
@@ -514,11 +308,5 @@ def api_metrics():
if __name__ == "__main__":
    print("Starting Kleinanzeigen Scraper API Server...")
    # Load cache on startup
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            zip_cache = json.load(f)
        print(f"Loaded {len(zip_cache)} ZIP codes from cache")
    print(f"ZIP code cache loaded with {len(scraper.zip_cache)} entries")
    app.run(debug=SERVER_DEBUG, host=SERVER_HOST, port=SERVER_PORT, threaded=True)