refactoring: move hard-coded backend settings into backend/config.json
backend/config.json (new file, 34 lines added)
@@ -0,0 +1,34 @@
{
  "server": {
    "host": "0.0.0.0",
    "port": 5000,
    "debug": false
  },
  "scraping": {
    "session_timeout": 300,
    "listings_per_page": 25,
    "max_workers": 4,
    "min_workers": 2,
    "rate_limit_delay": 0.5,
    "geocoding_delay": 1.0
  },
  "cache": {
    "zip_cache_file": "zip_cache.json"
  },
  "apis": {
    "nominatim": {
      "url": "https://nominatim.openstreetmap.org/search",
      "user_agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    },
    "kleinanzeigen": {
      "base_url": "https://www.kleinanzeigen.de"
    }
  },
  "user_agents": [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
  ]
}
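Note on the config above: the server diff below refuses to start when config.json is missing (it prints an error and exits). For local development a fallback to built-in defaults can be convenient; a minimal sketch, where load_config and DEFAULT_CONFIG are hypothetical helpers mirroring the shipped values, not part of this commit:

    import json
    import os

    # A subset of backend/config.json above; the "apis" and "user_agents"
    # sections are omitted here for brevity. Illustration only, not commit code.
    DEFAULT_CONFIG = {
        "server": {"host": "0.0.0.0", "port": 5000, "debug": False},
        "scraping": {
            "session_timeout": 300,
            "listings_per_page": 25,
            "max_workers": 4,
            "min_workers": 2,
            "rate_limit_delay": 0.5,
            "geocoding_delay": 1.0,
        },
        "cache": {"zip_cache_file": "zip_cache.json"},
    }

    def load_config(path="config.json"):
        """Return the parsed config file, or the defaults if it is absent."""
        if os.path.exists(path):
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        return DEFAULT_CONFIG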
@@ -1,10 +1,5 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Flask API Server for Kleinanzeigen Scraper
Author: Hendrik Schutter
Date: 2025/11/24
"""

from flask import Flask, request, jsonify
from flask_cors import CORS
@@ -24,10 +19,32 @@ CORS(app)

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Configuration
CACHE_FILE = "zip_cache.json"
SESSION_TIMEOUT = 300  # seconds
LISTINGS_PER_PAGE = 25
# Load configuration
CONFIG_FILE = "config.json"
config = {}

if os.path.exists(CONFIG_FILE):
    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        config = json.load(f)
else:
    print(f"ERROR: {CONFIG_FILE} not found!")
    exit(1)

# Configuration values
CACHE_FILE = config["cache"]["zip_cache_file"]
SESSION_TIMEOUT = config["scraping"]["session_timeout"]
LISTINGS_PER_PAGE = config["scraping"]["listings_per_page"]
MAX_WORKERS = config["scraping"]["max_workers"]
MIN_WORKERS = config["scraping"]["min_workers"]
RATE_LIMIT_DELAY = config["scraping"]["rate_limit_delay"]
GEOCODING_DELAY = config["scraping"]["geocoding_delay"]
USER_AGENTS = config["user_agents"]
NOMINATIM_URL = config["apis"]["nominatim"]["url"]
NOMINATIM_USER_AGENT = config["apis"]["nominatim"]["user_agent"]
KLEINANZEIGEN_BASE_URL = config["apis"]["kleinanzeigen"]["base_url"]
SERVER_HOST = config["server"]["host"]
SERVER_PORT = config["server"]["port"]
SERVER_DEBUG = config["server"]["debug"]

# Global state
zip_cache = {}
@@ -61,14 +78,7 @@ def cleanup_old_sessions():

def get_random_user_agent():
    """Generate random user agent string"""
    uastrings = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    ]
    return random.choice(uastrings)
    return random.choice(USER_AGENTS)


def make_soup(url):
@@ -79,14 +89,14 @@ def make_soup(url):
        r = http.request("GET", url)
        # Track response code
        status_code = str(r.status)
        if "kleinanzeigen.de" in url:
        if KLEINANZEIGEN_BASE_URL in url:
            metrics["kleinanzeigen_response_codes"][status_code] = (
                metrics["kleinanzeigen_response_codes"].get(status_code, 0) + 1
            )
        return BeautifulSoup(r.data, "lxml")
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        if "kleinanzeigen.de" in url:
        if KLEINANZEIGEN_BASE_URL in url:
            metrics["kleinanzeigen_response_codes"]["error"] = (
                metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
            )
@@ -102,7 +112,6 @@ def geocode_zip(zip_code):
        return zip_cache[zip_code]

    # Call Nominatim API
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "postalcode": zip_code,
        "country": "Germany",
@@ -112,7 +121,7 @@ def geocode_zip(zip_code):

    try:
        response = requests.get(
            url, params=params, headers={"user-agent": get_random_user_agent()}
            NOMINATIM_URL, params=params, headers={"user-agent": NOMINATIM_USER_AGENT}
        )

        # Track response code
@@ -131,7 +140,7 @@ def geocode_zip(zip_code):
        with open(CACHE_FILE, "w", encoding="utf-8") as f:
            json.dump(zip_cache, f, ensure_ascii=False, indent=2)

        time.sleep(1)  # Respect API rate limits
        time.sleep(GEOCODING_DELAY)
        return coords
    except Exception as e:
        print(f"Geocoding error for {zip_code}: {e}")
@@ -144,12 +153,11 @@ def geocode_zip(zip_code):

def search_listings(search_term, max_pages, min_price, max_price):
    """Search for listings on kleinanzeigen.de - returns only URLs"""
    base_url = "https://www.kleinanzeigen.de"
    found_listings = set()

    for page_counter in range(1, max_pages + 1):
        listing_url = (
            base_url
            KLEINANZEIGEN_BASE_URL
            + "/s-anbieter:privat/anzeige:angebote/preis:"
            + str(min_price)
            + ":"
@@ -173,7 +181,7 @@ def search_listings(search_term, max_pages, min_price, max_price):
            for result in results:
                try:
                    listing_href = result.a["href"]
                    found_listings.add(base_url + listing_href)
                    found_listings.add(KLEINANZEIGEN_BASE_URL + listing_href)
                except (AttributeError, KeyError):
                    pass
        except Exception as e:
@@ -284,13 +292,11 @@ def prefetch_listings_thread(session_id):
    if not session:
        return
    urls = session["urls"]
    max_workers = random.randrange(2, 8)
    workers = random.randrange(MIN_WORKERS, MAX_WORKERS)

    print(
        f"Starting prefetch for session {session_id} with {max_workers} parallel workers"
    )
    print(f"Starting prefetch for session {session_id} with {workers} parallel workers")

    for i in range(0, len(urls), max_workers):
    for i in range(0, len(urls), workers):
        # Check if session was cancelled or deleted
        if (
            session_id not in scrape_sessions
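One behavioural detail in the hunk above: random.randrange excludes its upper bound, so with the shipped config (min_workers 2, max_workers 4) the prefetch thread always picks 2 or 3 workers, never 4, whereas the old hard-coded randrange(2, 8) could pick up to 7. If the configured maximum is meant to be reachable, randint (or max_workers + 1) would be needed; a small sketch of the difference:

    import random

    MIN_WORKERS = 2  # config["scraping"]["min_workers"]
    MAX_WORKERS = 4  # config["scraping"]["max_workers"]

    # randrange stops one short of the upper bound; randint includes it.
    print(random.randrange(MIN_WORKERS, MAX_WORKERS))  # always 2 or 3
    print(random.randint(MIN_WORKERS, MAX_WORKERS))    # 2, 3 or 4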
@@ -300,7 +306,7 @@ def prefetch_listings_thread(session_id):
            return

        # Process batch of URLs in parallel
        batch = urls[i : i + max_workers]
        batch = urls[i : i + workers]
        threads = []
        results = [None] * len(batch)

@@ -325,7 +331,7 @@ def prefetch_listings_thread(session_id):
        session["scraped"] += len(batch)

        # Rate limiting between batches
        time.sleep(0.5)
        time.sleep(RATE_LIMIT_DELAY)

    print(f"Prefetch complete for session {session_id}")

@@ -446,8 +452,6 @@ def health():
@app.route("/api/metrics", methods=["GET"])
def api_metrics():
    """Prometheus-style metrics endpoint"""
    cleanup_old_sessions()

    uptime = time.time() - app_start_time

    # Build Prometheus text format
@@ -517,4 +521,4 @@ if __name__ == "__main__":
        zip_cache = json.load(f)

    print(f"Loaded {len(zip_cache)} ZIP codes from cache")
    app.run(debug=True, host="0.0.0.0", port=5000, threaded=True)
    app.run(debug=SERVER_DEBUG, host=SERVER_HOST, port=SERVER_PORT, threaded=True)
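With the last hunk the host, port and debug flag come from config.json instead of being hard-coded, so with the shipped defaults the server still listens on 0.0.0.0:5000. A quick smoke test of the Prometheus-style /api/metrics endpoint shown above, assuming the server is running locally (requests is already used by the backend):

    import requests

    # /api/metrics is the Prometheus-style endpoint registered above.
    resp = requests.get("http://localhost:5000/api/metrics", timeout=5)
    print(resp.status_code)
    print(resp.text[:300])  # first few metric lines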
File diff suppressed because it is too large