refactoring

2025-11-25 22:12:43 +01:00
parent de7bcf881a
commit e2a3d54198
12 changed files with 1458 additions and 2842 deletions

backend/config.json Normal file

@@ -0,0 +1,34 @@
{
  "server": {
    "host": "0.0.0.0",
    "port": 5000,
    "debug": false
  },
  "scraping": {
    "session_timeout": 300,
    "listings_per_page": 25,
    "max_workers": 4,
    "min_workers": 2,
    "rate_limit_delay": 0.5,
    "geocoding_delay": 1.0
  },
  "cache": {
    "zip_cache_file": "zip_cache.json"
  },
  "apis": {
    "nominatim": {
      "url": "https://nominatim.openstreetmap.org/search",
      "user_agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    },
    "kleinanzeigen": {
      "base_url": "https://www.kleinanzeigen.de"
    }
  },
  "user_agents": [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
  ]
}
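For orientation, here is a minimal sketch of how a consumer of this file might read and sanity-check it at startup. This is a hypothetical helper, not the project's code: the server in this commit only checks that the file exists before calling json.load (see the diff below); the section check here is an extra safeguard.

import json
import sys

CONFIG_FILE = "config.json"
REQUIRED_SECTIONS = ("server", "scraping", "cache", "apis", "user_agents")

def load_config(path=CONFIG_FILE):
    """Read config.json and fail fast if it is missing or incomplete."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            cfg = json.load(f)
    except FileNotFoundError:
        sys.exit(f"ERROR: {path} not found!")

    # Assumed validation step beyond what the server does.
    missing = [s for s in REQUIRED_SECTIONS if s not in cfg]
    if missing:
        sys.exit(f"ERROR: {path} is missing sections: {', '.join(missing)}")
    return cfg

if __name__ == "__main__":
    config = load_config()
    print(config["server"]["host"], config["server"]["port"])  # 0.0.0.0 5000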


@@ -1,10 +1,5 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Flask API Server for Kleinanzeigen Scraper
Author: Hendrik Schutter
Date: 2025/11/24
"""
from flask import Flask, request, jsonify
from flask_cors import CORS
@@ -24,10 +19,32 @@ CORS(app)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Configuration
CACHE_FILE = "zip_cache.json"
SESSION_TIMEOUT = 300 # seconds
LISTINGS_PER_PAGE = 25
# Load configuration
CONFIG_FILE = "config.json"
config = {}
if os.path.exists(CONFIG_FILE):
    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        config = json.load(f)
else:
    print(f"ERROR: {CONFIG_FILE} not found!")
    exit(1)
# Configuration values
CACHE_FILE = config["cache"]["zip_cache_file"]
SESSION_TIMEOUT = config["scraping"]["session_timeout"]
LISTINGS_PER_PAGE = config["scraping"]["listings_per_page"]
MAX_WORKERS = config["scraping"]["max_workers"]
MIN_WORKERS = config["scraping"]["min_workers"]
RATE_LIMIT_DELAY = config["scraping"]["rate_limit_delay"]
GEOCODING_DELAY = config["scraping"]["geocoding_delay"]
USER_AGENTS = config["user_agents"]
NOMINATIM_URL = config["apis"]["nominatim"]["url"]
NOMINATIM_USER_AGENT = config["apis"]["nominatim"]["user_agent"]
KLEINANZEIGEN_BASE_URL = config["apis"]["kleinanzeigen"]["base_url"]
SERVER_HOST = config["server"]["host"]
SERVER_PORT = config["server"]["port"]
SERVER_DEBUG = config["server"]["debug"]
# Global state
zip_cache = {}
@@ -61,14 +78,7 @@ def cleanup_old_sessions():
def get_random_user_agent():
    """Generate random user agent string"""
    uastrings = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ]
    return random.choice(uastrings)
    return random.choice(USER_AGENTS)
def make_soup(url):
@@ -79,14 +89,14 @@ def make_soup(url):
        r = http.request("GET", url)
        # Track response code
        status_code = str(r.status)
        if "kleinanzeigen.de" in url:
        if KLEINANZEIGEN_BASE_URL in url:
            metrics["kleinanzeigen_response_codes"][status_code] = (
                metrics["kleinanzeigen_response_codes"].get(status_code, 0) + 1
            )
        return BeautifulSoup(r.data, "lxml")
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        if "kleinanzeigen.de" in url:
        if KLEINANZEIGEN_BASE_URL in url:
            metrics["kleinanzeigen_response_codes"]["error"] = (
                metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
            )
@@ -102,7 +112,6 @@ def geocode_zip(zip_code):
        return zip_cache[zip_code]
    # Call Nominatim API
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "postalcode": zip_code,
        "country": "Germany",
@@ -112,7 +121,7 @@ def geocode_zip(zip_code):
    try:
        response = requests.get(
            url, params=params, headers={"user-agent": get_random_user_agent()}
            NOMINATIM_URL, params=params, headers={"user-agent": NOMINATIM_USER_AGENT}
        )
        # Track response code
@@ -131,7 +140,7 @@ def geocode_zip(zip_code):
            with open(CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(zip_cache, f, ensure_ascii=False, indent=2)
            time.sleep(1)  # Respect API rate limits
            time.sleep(GEOCODING_DELAY)
            return coords
    except Exception as e:
        print(f"Geocoding error for {zip_code}: {e}")
@@ -144,12 +153,11 @@ def geocode_zip(zip_code):
def search_listings(search_term, max_pages, min_price, max_price):
    """Search for listings on kleinanzeigen.de - returns only URLs"""
    base_url = "https://www.kleinanzeigen.de"
    found_listings = set()
    for page_counter in range(1, max_pages + 1):
        listing_url = (
            base_url
            KLEINANZEIGEN_BASE_URL
            + "/s-anbieter:privat/anzeige:angebote/preis:"
            + str(min_price)
            + ":"
@@ -173,7 +181,7 @@ def search_listings(search_term, max_pages, min_price, max_price):
            for result in results:
                try:
                    listing_href = result.a["href"]
                    found_listings.add(base_url + listing_href)
                    found_listings.add(KLEINANZEIGEN_BASE_URL + listing_href)
                except (AttributeError, KeyError):
                    pass
        except Exception as e:
@@ -284,13 +292,11 @@ def prefetch_listings_thread(session_id):
    if not session:
        return
    urls = session["urls"]
    max_workers = random.randrange(2, 8)
    workers = random.randrange(MIN_WORKERS, MAX_WORKERS)
    print(
        f"Starting prefetch for session {session_id} with {max_workers} parallel workers"
    )
    print(f"Starting prefetch for session {session_id} with {workers} parallel workers")
    for i in range(0, len(urls), max_workers):
    for i in range(0, len(urls), workers):
        # Check if session was cancelled or deleted
        if (
            session_id not in scrape_sessions
@@ -300,7 +306,7 @@ def prefetch_listings_thread(session_id):
            return
        # Process batch of URLs in parallel
        batch = urls[i : i + max_workers]
        batch = urls[i : i + workers]
        threads = []
        results = [None] * len(batch)
@@ -325,7 +331,7 @@ def prefetch_listings_thread(session_id):
        session["scraped"] += len(batch)
        # Rate limiting between batches
        time.sleep(0.5)
        time.sleep(RATE_LIMIT_DELAY)
    print(f"Prefetch complete for session {session_id}")
@@ -446,8 +452,6 @@ def health():
@app.route("/api/metrics", methods=["GET"])
def api_metrics():
    """Prometheus-style metrics endpoint"""
    cleanup_old_sessions()
    uptime = time.time() - app_start_time
    # Build Prometheus text format
@@ -517,4 +521,4 @@ if __name__ == "__main__":
            zip_cache = json.load(f)
        print(f"Loaded {len(zip_cache)} ZIP codes from cache")
    app.run(debug=True, host="0.0.0.0", port=5000, threaded=True)
    app.run(debug=SERVER_DEBUG, host=SERVER_HOST, port=SERVER_PORT, threaded=True)
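For context, here is a simplified, self-contained sketch of the batched prefetch pattern visible in the diff above: URLs are split into batches of `workers` threads, the threads are joined, and the loop pauses for RATE_LIMIT_DELAY between batches. fetch_listing is a hypothetical stand-in for the project's per-listing scraper, not its real function. Note that random.randrange(MIN_WORKERS, MAX_WORKERS) excludes the upper bound, so with the config defaults (2 and 4) each batch runs 2 or 3 threads.

import random
import threading
import time

MIN_WORKERS = 2         # config.json: scraping.min_workers
MAX_WORKERS = 4         # config.json: scraping.max_workers
RATE_LIMIT_DELAY = 0.5  # config.json: scraping.rate_limit_delay (seconds)

def fetch_listing(url):
    """Hypothetical stand-in for the real per-listing scraper."""
    return {"url": url}

def prefetch(urls):
    """Scrape URLs in batches of `workers` parallel threads."""
    workers = random.randrange(MIN_WORKERS, MAX_WORKERS)  # upper bound excluded
    results = []
    for i in range(0, len(urls), workers):
        batch = urls[i : i + workers]
        batch_results = [None] * len(batch)

        def worker(idx, url):
            batch_results[idx] = fetch_listing(url)

        threads = [
            threading.Thread(target=worker, args=(idx, url))
            for idx, url in enumerate(batch)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        results.extend(batch_results)
        time.sleep(RATE_LIMIT_DELAY)  # rate limiting between batches
    return results

if __name__ == "__main__":
    demo_urls = [f"https://www.kleinanzeigen.de/s-anzeige/demo-{n}" for n in range(6)]
    print(len(prefetch(demo_urls)))  # -> 6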

File diff suppressed because it is too large