UI improvements and backend prefetch

2025-11-25 18:16:29 +01:00
parent dd36618802
commit 0858be033b
3 changed files with 578 additions and 178 deletions

View File

@@ -17,22 +17,34 @@ import time
import json
import os
import uuid
import threading
app = Flask(__name__)
CORS(app)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# ZIP code cache file
# Configuration
CACHE_FILE = "zip_cache.json"
zip_cache = {}
# Active scrape sessions
scrape_sessions = {}
SESSION_TIMEOUT = 300 # seconds
LISTINGS_PER_PAGE = 25
# Global state
zip_cache = {}
scrape_sessions = {}
app_start_time = time.time()
# Metrics
metrics = {
"search_requests": 0,
"scrape_requests": 0,
"kleinanzeigen_response_codes": {},
"nominatim_response_codes": {},
}
def cleanup_old_sessions():
"""Remove sessions older than SESSION_TIMEOUT"""
current_time = time.time()
sessions_to_remove = []
@@ -63,8 +75,22 @@ def make_soup(url):
"""Fetch URL and return BeautifulSoup object"""
user_agent = {"user-agent": get_random_user_agent()}
http = urllib3.PoolManager(10, headers=user_agent)
r = http.request("GET", url)
return BeautifulSoup(r.data, "lxml")
try:
r = http.request("GET", url)
# Track response code
status_code = str(r.status)
if "kleinanzeigen.de" in url:
metrics["kleinanzeigen_response_codes"][status_code] = (
metrics["kleinanzeigen_response_codes"].get(status_code, 0) + 1
)
return BeautifulSoup(r.data, "lxml")
except Exception as e:
print(f"Error fetching {url}: {e}")
if "kleinanzeigen.de" in url:
metrics["kleinanzeigen_response_codes"]["error"] = (
metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
)
raise
def geocode_zip(zip_code):
@@ -88,6 +114,13 @@ def geocode_zip(zip_code):
response = requests.get(
url, params=params, headers={"user-agent": get_random_user_agent()}
)
# Track response code
status_code = str(response.status_code)
metrics["nominatim_response_codes"][status_code] = (
metrics["nominatim_response_codes"].get(status_code, 0) + 1
)
data = response.json()
if data:
@@ -102,6 +135,9 @@ def geocode_zip(zip_code):
return coords
except Exception as e:
print(f"Geocoding error for {zip_code}: {e}")
metrics["nominatim_response_codes"]["error"] = (
metrics["nominatim_response_codes"].get("error", 0) + 1
)
return None
@@ -136,8 +172,8 @@ def search_listings(search_term, max_pages, min_price, max_price):
for result in results:
try:
listing_url = result.a["href"]
found_listings.add(base_url + listing_url)
listing_href = result.a["href"]
found_listings.add(base_url + listing_href)
except (AttributeError, KeyError):
pass
except Exception as e:
@@ -151,6 +187,7 @@ def scrape_listing(url):
"""Scrape individual listing details"""
try:
soup = make_soup(url)
metrics["scrape_requests"] += 1
title = soup.find("h1", class_="boxedarticle--title")
if not title:
@@ -231,26 +268,61 @@ def scrape_listing(url):
return None
def prefetch_listings_thread(session_id):
"""Background thread to prefetch all listings"""
session = scrape_sessions.get(session_id)
if not session:
return
print(f"Starting prefetch for session {session_id}")
for i, url in enumerate(session["urls"]):
# Check if session was cancelled or deleted
if (
session_id not in scrape_sessions
or scrape_sessions[session_id]["cancelled"]
):
print(f"Prefetch stopped for session {session_id}")
return
listing = scrape_listing(url)
if listing:
session["listings"].append(listing)
session["scraped"] += 1
time.sleep(0.3) # Rate limiting
print(
f"Prefetch complete for session {session_id}: {len(session['listings'])} listings"
)
@app.route("/api/search", methods=["POST"])
def api_search():
"""API endpoint for searching listings - returns only count and URLs"""
"""API endpoint for searching listings - returns count and starts prefetch"""
data = request.json
metrics["search_requests"] += 1
# Cleanup old sessions before creating new one
cleanup_old_sessions()
search_term = data.get("search_term", "")
max_pages = data.get("max_pages", 1)
num_listings = data.get("num_listings", 25)
min_price = data.get("min_price", 0)
max_price = data.get("max_price", 10000)
max_price = data.get("max_price", 1000000000)
if not search_term:
return jsonify({"error": "Search term is required"}), 400
# Calculate pages needed
max_pages = max(1, (num_listings + LISTINGS_PER_PAGE - 1) // LISTINGS_PER_PAGE)
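# e.g. num_listings=60 with LISTINGS_PER_PAGE=25 gives (60 + 24) // 25 = 3 pages (ceiling division)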
try:
# Search for listing URLs only
listing_urls = search_listings(search_term, max_pages, min_price, max_price)
# Limit to requested number
listing_urls = listing_urls[:num_listings]
# Create session ID
session_id = str(uuid.uuid4())
@@ -264,6 +336,12 @@ def api_search():
"created_at": time.time(),
}
# Start prefetch in background thread
prefetch_thread = threading.Thread(
target=prefetch_listings_thread, args=(session_id,), daemon=True
)
prefetch_thread.start()
return jsonify({"session_id": session_id, "total": len(listing_urls)})
except Exception as e:
@@ -272,8 +350,7 @@ def api_search():
@app.route("/api/scrape/<session_id>", methods=["GET"])
def api_scrape(session_id):
"""API endpoint for scraping next listing in session"""
# Cleanup old sessions on each request
"""API endpoint to get next scraped listing from session"""
cleanup_old_sessions()
if session_id not in scrape_sessions:
@@ -284,21 +361,28 @@ def api_scrape(session_id):
if session["cancelled"]:
return jsonify({"cancelled": True}), 200
if session["scraped"] >= session["total"]:
return jsonify({"complete": True, "listing": None})
# Wait briefly if no listings are ready yet
wait_count = 0
while (
len(session["listings"]) == 0
and session["scraped"] < session["total"]
and wait_count < 10
):
time.sleep(0.1)
wait_count += 1
# Scrape next listing
url = session["urls"][session["scraped"]]
listing = scrape_listing(url)
if len(session["listings"]) > 0:
listing = session["listings"].pop(0)
else:
listing = None
if listing:
session["listings"].append(listing)
session["scraped"] += 1
is_complete = (
session["scraped"] >= session["total"] and len(session["listings"]) == 0
)
return jsonify(
{
"complete": session["scraped"] >= session["total"],
"complete": is_complete,
"listing": listing,
"progress": {"current": session["scraped"], "total": session["total"]},
}
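
Taken together, /api/search and /api/scrape form a simple polling flow: the client posts a search, receives a session_id, then repeatedly asks for the next prefetched listing until "complete" is true. A minimal client sketch, assuming the backend runs locally on port 5000 (as in the app.run call further down); the helper name and pacing are illustrative, not part of this commit:

import time
import requests

BASE = "http://localhost:5000"  # assumption: backend started via app.run(..., port=5000)

def fetch_all(search_term, num_listings=25):
    # Start a search; the backend creates a session and begins prefetching in a daemon thread.
    resp = requests.post(
        f"{BASE}/api/search",
        json={"search_term": search_term, "num_listings": num_listings},
    )
    resp.raise_for_status()
    session_id = resp.json()["session_id"]

    listings = []
    while True:
        # Fetch the next prefetched listing; the endpoint waits up to ~1 s if none is ready yet.
        data = requests.get(f"{BASE}/api/scrape/{session_id}").json()
        if data.get("listing"):
            listings.append(data["listing"])
        if data.get("complete") or data.get("cancelled") or data.get("error"):
            break
        time.sleep(0.2)  # illustrative pacing between polls
    return listings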
@@ -307,51 +391,40 @@ def api_scrape(session_id):
@app.route("/api/scrape/<session_id>/cancel", methods=["POST"])
def api_cancel_scrape(session_id):
"""API endpoint to cancel scraping session"""
"""API endpoint to cancel scraping session and delete cached listings"""
cleanup_old_sessions()
if session_id not in scrape_sessions:
return jsonify({"error": "Invalid session ID"}), 404
scrape_sessions[session_id]["cancelled"] = True
# Delete session completely (including cached listings)
del scrape_sessions[session_id]
return jsonify(
{
"cancelled": True,
"listings": scrape_sessions[session_id]["listings"],
"total_scraped": len(scrape_sessions[session_id]["listings"]),
}
)
@app.route("/api/scrape/<session_id>/results", methods=["GET"])
def api_get_results(session_id):
"""API endpoint to get all scraped results"""
cleanup_old_sessions()
if session_id not in scrape_sessions:
return jsonify({"error": "Invalid session ID"}), 404
session = scrape_sessions[session_id]
return jsonify(
{
"listings": session["listings"],
"total": len(session["listings"]),
"progress": {"current": session["scraped"], "total": session["total"]},
}
)
return jsonify({"cancelled": True, "message": "Session deleted"})
@app.route("/api/health", methods=["GET"])
def health():
"""Health check endpoint"""
return jsonify({"status": "ok"})
@app.route("/api/metrics", methods=["GET"])
def api_metrics():
"""Prometheus-style metrics endpoint"""
cleanup_old_sessions()
uptime = time.time() - app_start_time
return jsonify(
{
"status": "ok",
"cache_size": len(zip_cache),
"search_requests_total": metrics["search_requests"],
"scrape_requests_total": metrics["scrape_requests"],
"uptime_seconds": uptime,
"kleinanzeigen_response_codes": metrics["kleinanzeigen_response_codes"],
"nominatim_response_codes": metrics["nominatim_response_codes"],
"active_sessions": len(scrape_sessions),
"cache_size": len(zip_cache),
}
)
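
The /api/metrics endpoint exposes the counters as plain JSON rather than the Prometheus text format, so any HTTP client can read it. A minimal read-out sketch, assuming the same local base URL:

import requests

m = requests.get("http://localhost:5000/api/metrics").json()
print(f"uptime: {m['uptime_seconds']:.0f}s, "
      f"searches: {m['search_requests_total']}, listings scraped: {m['scrape_requests_total']}")
print("Kleinanzeigen responses by status:", m["kleinanzeigen_response_codes"])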
@@ -365,5 +438,4 @@ if __name__ == "__main__":
zip_cache = json.load(f)
print(f"Loaded {len(zip_cache)} ZIP codes from cache")
print("ZIP code cache loaded with", len(zip_cache), "entries")
app.run(debug=True, host="0.0.0.0", port=5000)
app.run(debug=True, host="0.0.0.0", port=5000, threaded=True)

View File

@@ -1370,5 +1370,233 @@
"20457": {
"lat": 53.5335376,
"lon": 9.9806284
},
"39120": {
"lat": 52.0855315,
"lon": 11.6329414
},
"47137": {
"lat": 51.4714791,
"lon": 6.7669058
},
"69207": {
"lat": 49.3421242,
"lon": 8.6370166
},
"27616": {
"lat": 53.4469731,
"lon": 8.8063121
},
"10119": {
"lat": 52.5301255,
"lon": 13.4055082
},
"47443": {
"lat": 51.4656893,
"lon": 6.6524975
},
"10557": {
"lat": 52.5256483,
"lon": 13.3640508
},
"37671": {
"lat": 51.7685153,
"lon": 9.3310829
},
"16225": {
"lat": 52.8292295,
"lon": 13.8384643
},
"52146": {
"lat": 50.8292627,
"lon": 6.1519607
},
"48149": {
"lat": 51.9638718,
"lon": 7.6026944
},
"48653": {
"lat": 51.9171483,
"lon": 7.1606437
},
"71134": {
"lat": 48.6820157,
"lon": 8.8818568
},
"53111": {
"lat": 50.7402492,
"lon": 7.0985907
},
"01993": {
"lat": 51.5082352,
"lon": 13.8868418
},
"32105": {
"lat": 52.0890558,
"lon": 8.7396016
},
"82445": {
"lat": 47.6253557,
"lon": 11.1138077
},
"40217": {
"lat": 51.2131204,
"lon": 6.774469
},
"49401": {
"lat": 52.5279348,
"lon": 8.232843
},
"55268": {
"lat": 49.90063,
"lon": 8.203086
},
"24306": {
"lat": 54.1620728,
"lon": 10.4375556
},
"22763": {
"lat": 53.550853,
"lon": 9.9138756
},
"21339": {
"lat": 53.2548482,
"lon": 10.3911518
},
"56218": {
"lat": 50.388183,
"lon": 7.5046437
},
"86899": {
"lat": 48.0336819,
"lon": 10.8638784
},
"84034": {
"lat": 48.5286909,
"lon": 12.0999127
},
"82110": {
"lat": 48.1321961,
"lon": 11.3600169
},
"56626": {
"lat": 50.431234,
"lon": 7.3730436
},
"10315": {
"lat": 52.5180707,
"lon": 13.5144045
},
"52080": {
"lat": 50.784851,
"lon": 6.160716
},
"51688": {
"lat": 51.1169225,
"lon": 7.419399
},
"45127": {
"lat": 51.4574619,
"lon": 7.0103435
},
"48324": {
"lat": 51.8545927,
"lon": 7.7859503
},
"26386": {
"lat": 53.553156,
"lon": 8.1039435
},
"86356": {
"lat": 48.392226,
"lon": 10.8016665
},
"50939": {
"lat": 50.9095331,
"lon": 6.9259241
},
"14195": {
"lat": 52.4585754,
"lon": 13.2846329
},
"21680": {
"lat": 53.5904569,
"lon": 9.4760161
},
"01257": {
"lat": 50.9983029,
"lon": 13.8123958
},
"29410": {
"lat": 52.8367097,
"lon": 11.1224073
},
"38300": {
"lat": 52.1513013,
"lon": 10.56812
},
"01819": {
"lat": 50.8809928,
"lon": 13.9044785
},
"85238": {
"lat": 48.4082502,
"lon": 11.4634544
},
"33378": {
"lat": 51.8441657,
"lon": 8.317883
},
"99192": {
"lat": 50.934868,
"lon": 10.9138336
},
"60438": {
"lat": 50.1786706,
"lon": 8.6271811
},
"35075": {
"lat": 50.7787074,
"lon": 8.5791746
},
"10827": {
"lat": 52.4836896,
"lon": 13.3528221
},
"24392": {
"lat": 54.6331185,
"lon": 9.7771951
},
"78647": {
"lat": 48.0717008,
"lon": 8.6373591
},
"10627": {
"lat": 52.5075196,
"lon": 13.3031999
},
"22419": {
"lat": 53.6662872,
"lon": 10.0055952
},
"06388": {
"lat": 51.6898413,
"lon": 11.9119914
},
"67117": {
"lat": 49.4122026,
"lon": 8.3936093
},
"68219": {
"lat": 49.435092,
"lon": 8.5365013
},
"77866": {
"lat": 48.6610036,
"lon": 7.9359671
},
"53175": {
"lat": 50.6989638,
"lon": 7.1445107
}
}