From 0858be033bb7e2a74fcc73bb301b6910d1f37ce9 Mon Sep 17 00:00:00 2001
From: localhorst
Date: Tue, 25 Nov 2025 18:16:29 +0100
Subject: [PATCH] UI improvements and backend prefetch

---
 backend/scrape_proxy.py | 186 +++++++++++++++-------
 backend/zip_cache.json  | 228 +++++++++++++++++++++++++++
 web/index.html          | 342 ++++++++++++++++++++++++++--------------
 3 files changed, 578 insertions(+), 178 deletions(-)

diff --git a/backend/scrape_proxy.py b/backend/scrape_proxy.py
index a538316..fcf3a1c 100644
--- a/backend/scrape_proxy.py
+++ b/backend/scrape_proxy.py
@@ -17,22 +17,34 @@
 import time
 import json
 import os
 import uuid
+import threading
 
 app = Flask(__name__)
 CORS(app)
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
-# ZIP code cache file
+# Configuration
 CACHE_FILE = "zip_cache.json"
-zip_cache = {}
-
-# Active scrape sessions
-scrape_sessions = {}
-
 SESSION_TIMEOUT = 300  # seconds
+LISTINGS_PER_PAGE = 25
+
+# Global state
+zip_cache = {}
+scrape_sessions = {}
+app_start_time = time.time()
+
+# Metrics
+metrics = {
+    "search_requests": 0,
+    "scrape_requests": 0,
+    "kleinanzeigen_response_codes": {},
+    "nominatim_response_codes": {},
+}
+
 
 def cleanup_old_sessions():
+    """Remove sessions older than SESSION_TIMEOUT"""
     current_time = time.time()
     sessions_to_remove = []
 
@@ -63,8 +75,22 @@ def make_soup(url):
     """Fetch URL and return BeautifulSoup object"""
     user_agent = {"user-agent": get_random_user_agent()}
     http = urllib3.PoolManager(10, headers=user_agent)
-    r = http.request("GET", url)
-    return BeautifulSoup(r.data, "lxml")
+    try:
+        r = http.request("GET", url)
+        # Track response code
+        status_code = str(r.status)
+        if "kleinanzeigen.de" in url:
+            metrics["kleinanzeigen_response_codes"][status_code] = (
+                metrics["kleinanzeigen_response_codes"].get(status_code, 0) + 1
+            )
+        return BeautifulSoup(r.data, "lxml")
+    except Exception as e:
+        print(f"Error fetching {url}: {e}")
+        if "kleinanzeigen.de" in url:
+            metrics["kleinanzeigen_response_codes"]["error"] = (
+                metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
+            )
+        raise
 
 
 def geocode_zip(zip_code):
@@ -88,6 +114,13 @@
         response = requests.get(
             url, params=params, headers={"user-agent": get_random_user_agent()}
         )
+
+        # Track response code
+        status_code = str(response.status_code)
+        metrics["nominatim_response_codes"][status_code] = (
+            metrics["nominatim_response_codes"].get(status_code, 0) + 1
+        )
+
         data = response.json()
 
         if data:
@@ -102,6 +135,9 @@
             return coords
     except Exception as e:
         print(f"Geocoding error for {zip_code}: {e}")
+        metrics["nominatim_response_codes"]["error"] = (
+            metrics["nominatim_response_codes"].get("error", 0) + 1
+        )
     return None
 
 
@@ -136,8 +172,8 @@
 
         for result in results:
             try:
-                listing_url = result.a["href"]
-                found_listings.add(base_url + listing_url)
+                listing_href = result.a["href"]
+                found_listings.add(base_url + listing_href)
             except (AttributeError, KeyError):
                 pass
     except Exception as e:
@@ -151,6 +187,7 @@ def scrape_listing(url):
     """Scrape individual listing details"""
     try:
         soup = make_soup(url)
+        metrics["scrape_requests"] += 1
 
         title = soup.find("h1", class_="boxedarticle--title")
         if not title:
@@ -231,26 +268,61 @@
         return None
 
 
+def prefetch_listings_thread(session_id):
+    """Background thread to prefetch all listings"""
+    session = scrape_sessions.get(session_id)
+    if not session:
+        return
+
+    print(f"Starting prefetch for session {session_id}")
+
+    for i, url in enumerate(session["urls"]):
+        # Check if session was cancelled or deleted
+        if (
+            session_id not in scrape_sessions
+            or scrape_sessions[session_id]["cancelled"]
+        ):
+            print(f"Prefetch stopped for session {session_id}")
+            return
+
+        listing = scrape_listing(url)
+        if listing:
+            session["listings"].append(listing)
+        session["scraped"] += 1
+        time.sleep(0.3)  # Rate limiting
+
+    print(
+        f"Prefetch complete for session {session_id}: {len(session['listings'])} listings"
+    )
+
+
 @app.route("/api/search", methods=["POST"])
 def api_search():
-    """API endpoint for searching listings - returns only count and URLs"""
+    """API endpoint for searching listings - returns count and starts prefetch"""
     data = request.json
+    metrics["search_requests"] += 1
 
     # Cleanup old sessions before creating new one
     cleanup_old_sessions()
 
     search_term = data.get("search_term", "")
-    max_pages = data.get("max_pages", 1)
+    num_listings = data.get("num_listings", 25)
     min_price = data.get("min_price", 0)
-    max_price = data.get("max_price", 10000)
+    max_price = data.get("max_price", 1000000000)
 
     if not search_term:
        return jsonify({"error": "Search term is required"}), 400
 
+    # Calculate pages needed
+    max_pages = max(1, (num_listings + LISTINGS_PER_PAGE - 1) // LISTINGS_PER_PAGE)
+
     try:
         # Search for listing URLs only
         listing_urls = search_listings(search_term, max_pages, min_price, max_price)
 
+        # Limit to requested number
+        listing_urls = listing_urls[:num_listings]
+
         # Create session ID
         session_id = str(uuid.uuid4())
 
@@ -264,6 +336,12 @@
             "created_at": time.time(),
         }
 
+        # Start prefetch in background thread
+        prefetch_thread = threading.Thread(
+            target=prefetch_listings_thread, args=(session_id,), daemon=True
+        )
+        prefetch_thread.start()
+
         return jsonify({"session_id": session_id, "total": len(listing_urls)})
 
     except Exception as e:
@@ -272,8 +350,7 @@
 @app.route("/api/scrape/<session_id>", methods=["GET"])
 def api_scrape(session_id):
-    """API endpoint for scraping next listing in session"""
-    # Cleanup old sessions on each request
+    """API endpoint to get next scraped listing from session"""
     cleanup_old_sessions()
 
     if session_id not in scrape_sessions:
         return jsonify({"error": "Invalid session ID"}), 404
 
     session = scrape_sessions[session_id]
 
     if session["cancelled"]:
         return jsonify({"cancelled": True}), 200
 
-    if session["scraped"] >= session["total"]:
-        return jsonify({"complete": True, "listing": None})
+    # Wait briefly if no listings are ready yet
+    wait_count = 0
+    while (
+        len(session["listings"]) == 0
+        and session["scraped"] < session["total"]
+        and wait_count < 10
+    ):
+        time.sleep(0.1)
+        wait_count += 1
 
-    # Scrape next listing
-    url = session["urls"][session["scraped"]]
-    listing = scrape_listing(url)
+    if len(session["listings"]) > 0:
+        listing = session["listings"].pop(0)
+    else:
+        listing = None
 
-    if listing:
-        session["listings"].append(listing)
-
-    session["scraped"] += 1
+    is_complete = (
+        session["scraped"] >= session["total"] and len(session["listings"]) == 0
+    )
 
     return jsonify(
         {
-            "complete": session["scraped"] >= session["total"],
+            "complete": is_complete,
             "listing": listing,
             "progress": {"current": session["scraped"], "total": session["total"]},
         }
     )
 
 
@@ -307,51 +391,40 @@
 @app.route("/api/scrape/<session_id>/cancel", methods=["POST"])
 def api_cancel_scrape(session_id):
-    """API endpoint to cancel scraping session"""
+    """API endpoint to cancel scraping session and delete cached listings"""
     cleanup_old_sessions()
 
     if session_id not in scrape_sessions:
         return jsonify({"error": 
"Invalid session ID"}), 404 - scrape_sessions[session_id]["cancelled"] = True + # Delete session completely (including cached listings) + del scrape_sessions[session_id] - return jsonify( - { - "cancelled": True, - "listings": scrape_sessions[session_id]["listings"], - "total_scraped": len(scrape_sessions[session_id]["listings"]), - } - ) - - -@app.route("/api/scrape//results", methods=["GET"]) -def api_get_results(session_id): - """API endpoint to get all scraped results""" - cleanup_old_sessions() - - if session_id not in scrape_sessions: - return jsonify({"error": "Invalid session ID"}), 404 - - session = scrape_sessions[session_id] - - return jsonify( - { - "listings": session["listings"], - "total": len(session["listings"]), - "progress": {"current": session["scraped"], "total": session["total"]}, - } - ) + return jsonify({"cancelled": True, "message": "Session deleted"}) @app.route("/api/health", methods=["GET"]) def health(): """Health check endpoint""" + return jsonify({"status": "ok"}) + + +@app.route("/api/metrics", methods=["GET"]) +def api_metrics(): + """Prometheus-style metrics endpoint""" cleanup_old_sessions() + + uptime = time.time() - app_start_time + return jsonify( { - "status": "ok", - "cache_size": len(zip_cache), + "search_requests_total": metrics["search_requests"], + "scrape_requests_total": metrics["scrape_requests"], + "uptime_seconds": uptime, + "kleinanzeigen_response_codes": metrics["kleinanzeigen_response_codes"], + "nominatim_response_codes": metrics["nominatim_response_codes"], "active_sessions": len(scrape_sessions), + "cache_size": len(zip_cache), } ) @@ -365,5 +438,4 @@ if __name__ == "__main__": zip_cache = json.load(f) print(f"Loaded {len(zip_cache)} ZIP codes from cache") - print("ZIP code cache loaded with", len(zip_cache), "entries") - app.run(debug=True, host="0.0.0.0", port=5000) + app.run(debug=True, host="0.0.0.0", port=5000, threaded=True) diff --git a/backend/zip_cache.json b/backend/zip_cache.json index 8329b26..2a03844 100644 --- a/backend/zip_cache.json +++ b/backend/zip_cache.json @@ -1370,5 +1370,233 @@ "20457": { "lat": 53.5335376, "lon": 9.9806284 + }, + "39120": { + "lat": 52.0855315, + "lon": 11.6329414 + }, + "47137": { + "lat": 51.4714791, + "lon": 6.7669058 + }, + "69207": { + "lat": 49.3421242, + "lon": 8.6370166 + }, + "27616": { + "lat": 53.4469731, + "lon": 8.8063121 + }, + "10119": { + "lat": 52.5301255, + "lon": 13.4055082 + }, + "47443": { + "lat": 51.4656893, + "lon": 6.6524975 + }, + "10557": { + "lat": 52.5256483, + "lon": 13.3640508 + }, + "37671": { + "lat": 51.7685153, + "lon": 9.3310829 + }, + "16225": { + "lat": 52.8292295, + "lon": 13.8384643 + }, + "52146": { + "lat": 50.8292627, + "lon": 6.1519607 + }, + "48149": { + "lat": 51.9638718, + "lon": 7.6026944 + }, + "48653": { + "lat": 51.9171483, + "lon": 7.1606437 + }, + "71134": { + "lat": 48.6820157, + "lon": 8.8818568 + }, + "53111": { + "lat": 50.7402492, + "lon": 7.0985907 + }, + "01993": { + "lat": 51.5082352, + "lon": 13.8868418 + }, + "32105": { + "lat": 52.0890558, + "lon": 8.7396016 + }, + "82445": { + "lat": 47.6253557, + "lon": 11.1138077 + }, + "40217": { + "lat": 51.2131204, + "lon": 6.774469 + }, + "49401": { + "lat": 52.5279348, + "lon": 8.232843 + }, + "55268": { + "lat": 49.90063, + "lon": 8.203086 + }, + "24306": { + "lat": 54.1620728, + "lon": 10.4375556 + }, + "22763": { + "lat": 53.550853, + "lon": 9.9138756 + }, + "21339": { + "lat": 53.2548482, + "lon": 10.3911518 + }, + "56218": { + "lat": 50.388183, + "lon": 7.5046437 + }, + "86899": { + 
"lat": 48.0336819, + "lon": 10.8638784 + }, + "84034": { + "lat": 48.5286909, + "lon": 12.0999127 + }, + "82110": { + "lat": 48.1321961, + "lon": 11.3600169 + }, + "56626": { + "lat": 50.431234, + "lon": 7.3730436 + }, + "10315": { + "lat": 52.5180707, + "lon": 13.5144045 + }, + "52080": { + "lat": 50.784851, + "lon": 6.160716 + }, + "51688": { + "lat": 51.1169225, + "lon": 7.419399 + }, + "45127": { + "lat": 51.4574619, + "lon": 7.0103435 + }, + "48324": { + "lat": 51.8545927, + "lon": 7.7859503 + }, + "26386": { + "lat": 53.553156, + "lon": 8.1039435 + }, + "86356": { + "lat": 48.392226, + "lon": 10.8016665 + }, + "50939": { + "lat": 50.9095331, + "lon": 6.9259241 + }, + "14195": { + "lat": 52.4585754, + "lon": 13.2846329 + }, + "21680": { + "lat": 53.5904569, + "lon": 9.4760161 + }, + "01257": { + "lat": 50.9983029, + "lon": 13.8123958 + }, + "29410": { + "lat": 52.8367097, + "lon": 11.1224073 + }, + "38300": { + "lat": 52.1513013, + "lon": 10.56812 + }, + "01819": { + "lat": 50.8809928, + "lon": 13.9044785 + }, + "85238": { + "lat": 48.4082502, + "lon": 11.4634544 + }, + "33378": { + "lat": 51.8441657, + "lon": 8.317883 + }, + "99192": { + "lat": 50.934868, + "lon": 10.9138336 + }, + "60438": { + "lat": 50.1786706, + "lon": 8.6271811 + }, + "35075": { + "lat": 50.7787074, + "lon": 8.5791746 + }, + "10827": { + "lat": 52.4836896, + "lon": 13.3528221 + }, + "24392": { + "lat": 54.6331185, + "lon": 9.7771951 + }, + "78647": { + "lat": 48.0717008, + "lon": 8.6373591 + }, + "10627": { + "lat": 52.5075196, + "lon": 13.3031999 + }, + "22419": { + "lat": 53.6662872, + "lon": 10.0055952 + }, + "06388": { + "lat": 51.6898413, + "lon": 11.9119914 + }, + "67117": { + "lat": 49.4122026, + "lon": 8.3936093 + }, + "68219": { + "lat": 49.435092, + "lon": 8.5365013 + }, + "77866": { + "lat": 48.6610036, + "lon": 7.9359671 + }, + "53175": { + "lat": 50.6989638, + "lon": 7.1445107 } } \ No newline at end of file diff --git a/web/index.html b/web/index.html index 5332a23..1ab19a4 100644 --- a/web/index.html +++ b/web/index.html @@ -1,9 +1,9 @@ - + - Kleinanzeigen Map Search + Kleinanzeigen Karten-Suche