From de7bcf881a894d79d8d6abca1f4c1230c66ec2b7 Mon Sep 17 00:00:00 2001 From: localhorst Date: Tue, 25 Nov 2025 18:43:42 +0100 Subject: [PATCH] parallel scrape and fix metrics --- backend/scrape_proxy.py | 123 +++++++++--- backend/zip_cache.json | 428 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 529 insertions(+), 22 deletions(-) diff --git a/backend/scrape_proxy.py b/backend/scrape_proxy.py index fcf3a1c..5ff02d1 100644 --- a/backend/scrape_proxy.py +++ b/backend/scrape_proxy.py @@ -268,15 +268,29 @@ def scrape_listing(url): return None -def prefetch_listings_thread(session_id): - """Background thread to prefetch all listings""" +def scrape_listing_wrapper(session_id, url, results, index): + """Wrapper for scraping listing in thread""" session = scrape_sessions.get(session_id) if not session: return - print(f"Starting prefetch for session {session_id}") + listing = scrape_listing(url) + results[index] = listing - for i, url in enumerate(session["urls"]): + +def prefetch_listings_thread(session_id): + """Background thread to prefetch all listings with parallel workers""" + session = scrape_sessions.get(session_id) + if not session: + return + urls = session["urls"] + max_workers = random.randrange(2, 8) + + print( + f"Starting prefetch for session {session_id} with {max_workers} parallel workers" + ) + + for i in range(0, len(urls), max_workers): # Check if session was cancelled or deleted if ( session_id not in scrape_sessions @@ -285,15 +299,35 @@ def prefetch_listings_thread(session_id): print(f"Prefetch stopped for session {session_id}") return - listing = scrape_listing(url) - if listing: - session["listings"].append(listing) - session["scraped"] += 1 - time.sleep(0.3) # Rate limiting + # Process batch of URLs in parallel + batch = urls[i : i + max_workers] + threads = [] + results = [None] * len(batch) - print( - f"Prefetch complete for session {session_id}: {len(session['listings'])} listings" - ) + for j, url in enumerate(batch): + thread = threading.Thread( + target=scrape_listing_wrapper, + args=(session_id, url, results, j), + daemon=True, + ) + thread.start() + threads.append(thread) + + # Wait for all threads in this batch to complete + for thread in threads: + thread.join() + + # Add results to session + for listing in results: + if listing: + session["listings"].append(listing) + + session["scraped"] += len(batch) + + # Rate limiting between batches + time.sleep(0.5) + + print(f"Prefetch complete for session {session_id}") @app.route("/api/search", methods=["POST"]) @@ -416,16 +450,61 @@ def api_metrics(): uptime = time.time() - app_start_time - return jsonify( - { - "search_requests_total": metrics["search_requests"], - "scrape_requests_total": metrics["scrape_requests"], - "uptime_seconds": uptime, - "kleinanzeigen_response_codes": metrics["kleinanzeigen_response_codes"], - "nominatim_response_codes": metrics["nominatim_response_codes"], - "active_sessions": len(scrape_sessions), - "cache_size": len(zip_cache), - } + # Build Prometheus text format + lines = [] + + # Search requests + lines.append("# HELP search_requests_total Total number of search requests") + lines.append("# TYPE search_requests_total counter") + lines.append(f"search_requests_total {metrics['search_requests']}") + lines.append("") + + # Scrape requests + lines.append("# HELP scrape_requests_total Total number of scrape requests") + lines.append("# TYPE scrape_requests_total counter") + lines.append(f"scrape_requests_total {metrics['scrape_requests']}") + lines.append("") + + # Uptime + lines.append("# HELP uptime_seconds Application uptime in seconds") + lines.append("# TYPE uptime_seconds gauge") + lines.append(f"uptime_seconds {uptime}") + lines.append("") + + # Active sessions + lines.append("# HELP active_sessions Number of active scraping sessions") + lines.append("# TYPE active_sessions gauge") + lines.append(f"active_sessions {len(scrape_sessions)}") + lines.append("") + + # Cache size + lines.append("# HELP cache_size Number of cached ZIP codes") + lines.append("# TYPE cache_size gauge") + lines.append(f"zip_code_cache_size {len(zip_cache)}") + lines.append("") + + # Kleinanzeigen response codes + lines.append( + "# HELP kleinanzeigen_http_responses_total HTTP responses from kleinanzeigen.de" + ) + lines.append("# TYPE kleinanzeigen_http_responses_total counter") + for code, count in metrics["kleinanzeigen_response_codes"].items(): + lines.append(f'kleinanzeigen_http_responses_total{{code="{code}"}} {count}') + lines.append("") + + # Nominatim response codes + lines.append( + "# HELP nominatim_http_responses_total HTTP responses from Nominatim API" + ) + lines.append("# TYPE nominatim_http_responses_total counter") + for code, count in metrics["nominatim_response_codes"].items(): + lines.append(f'nominatim_http_responses_total{{code="{code}"}} {count}') + lines.append("") + + return ( + "\n".join(lines), + 200, + {"Content-Type": "text/plain; version=0.0.4; charset=utf-8"}, ) diff --git a/backend/zip_cache.json b/backend/zip_cache.json index 2a03844..73ea3f3 100644 --- a/backend/zip_cache.json +++ b/backend/zip_cache.json @@ -1598,5 +1598,433 @@ "53175": { "lat": 50.6989638, "lon": 7.1445107 + }, + "78467": { + "lat": 47.6929555, + "lon": 9.1513759 + }, + "48703": { + "lat": 52.0035321, + "lon": 6.9517971 + }, + "46049": { + "lat": 51.4725211, + "lon": 6.8311577 + }, + "48143": { + "lat": 51.9604439, + "lon": 7.6262442 + }, + "06231": { + "lat": 51.2849151, + "lon": 12.1146298 + }, + "33332": { + "lat": 51.8972222, + "lon": 8.4006525 + }, + "27283": { + "lat": 52.9410676, + "lon": 9.2354716 + }, + "10317": { + "lat": 52.4986204, + "lon": 13.4838382 + }, + "01640": { + "lat": 51.1331059, + "lon": 13.5656911 + }, + "46244": { + "lat": 51.5984773, + "lon": 6.9123203 + }, + "01796": { + "lat": 50.9470409, + "lon": 13.9505572 + }, + "32339": { + "lat": 52.3741653, + "lon": 8.6212978 + }, + "50181": { + "lat": 51.0144705, + "lon": 6.5569525 + }, + "93055": { + "lat": 49.007933, + "lon": 12.1608121 + }, + "18147": { + "lat": 54.1309902, + "lon": 12.1196962 + }, + "49504": { + "lat": 52.2991515, + "lon": 7.9218375 + }, + "60318": { + "lat": 50.1246887, + "lon": 8.6865254 + }, + "96052": { + "lat": 49.9117586, + "lon": 10.8880355 + }, + "29559": { + "lat": 52.8774192, + "lon": 10.6061272 + }, + "10115": { + "lat": 52.5319487, + "lon": 13.3837943 + }, + "27251": { + "lat": 52.7492339, + "lon": 8.7757762 + }, + "22303": { + "lat": 53.5897407, + "lon": 10.0234361 + }, + "38122": { + "lat": 52.2297328, + "lon": 10.4745918 + }, + "51371": { + "lat": 51.0590744, + "lon": 6.9417484 + }, + "60314": { + "lat": 50.1166698, + "lon": 8.7334387 + }, + "70376": { + "lat": 48.818393, + "lon": 9.2066864 + }, + "93499": { + "lat": 49.1429872, + "lon": 12.7164157 + }, + "18435": { + "lat": 54.32997, + "lon": 13.0649961 + }, + "12105": { + "lat": 52.4484553, + "lon": 13.3722304 + }, + "81929": { + "lat": 48.1606494, + "lon": 11.6631075 + }, + "45768": { + "lat": 51.6575564, + "lon": 7.0659333 + }, + "91074": { + "lat": 49.5738171, + "lon": 10.8926968 + }, + "49593": { + "lat": 52.5674997, + "lon": 7.9325832 + }, + "80935": { + "lat": 48.1997053, + "lon": 11.5552742 + }, + "52134": { + "lat": 50.8605761, + "lon": 6.1001816 + }, + "94535": { + "lat": 48.7110796, + "lon": 13.2553681 + }, + "99947": { + "lat": 51.1242532, + "lon": 10.6769762 + }, + "09112": { + "lat": 50.830933, + "lon": 12.9053458 + }, + "01968": { + "lat": 51.5238377, + "lon": 14.0284911 + }, + "31515": { + "lat": 52.4314053, + "lon": 9.428236 + }, + "40547": { + "lat": 51.2441486, + "lon": 6.7400785 + }, + "72800": { + "lat": 48.4830973, + "lon": 9.2728039 + }, + "81476": { + "lat": 48.0873869, + "lon": 11.4957046 + }, + "94034": { + "lat": 48.593963, + "lon": 13.449846 + }, + "84478": { + "lat": 48.1977065, + "lon": 12.4064772 + }, + "69120": { + "lat": 49.4197028, + "lon": 8.7013385 + }, + "16303": { + "lat": 53.0795487, + "lon": 14.2322027 + }, + "48165": { + "lat": 51.8982648, + "lon": 7.650382 + }, + "23554": { + "lat": 53.889632, + "lon": 10.6772133 + }, + "57648": { + "lat": 50.6553129, + "lon": 7.9089968 + }, + "50677": { + "lat": 50.9222793, + "lon": 6.9491251 + }, + "26826": { + "lat": 53.165921, + "lon": 7.3277997 + }, + "24340": { + "lat": 54.4684418, + "lon": 9.7984274 + }, + "25335": { + "lat": 53.7556754, + "lon": 9.6072404 + }, + "89160": { + "lat": 48.4795088, + "lon": 9.9097371 + }, + "51580": { + "lat": 50.955823, + "lon": 7.6952729 + }, + "59075": { + "lat": 51.706537, + "lon": 7.7471066 + }, + "28355": { + "lat": 53.1001424, + "lon": 8.9369005 + }, + "10961": { + "lat": 52.492375, + "lon": 13.3969612 + }, + "33649": { + "lat": 51.9812735, + "lon": 8.4631941 + }, + "01945": { + "lat": 51.426725, + "lon": 13.8800707 + }, + "40225": { + "lat": 51.1952407, + "lon": 6.7930966 + }, + "83043": { + "lat": 47.8653219, + "lon": 12.0086382 + }, + "01279": { + "lat": 51.0279271, + "lon": 13.8224355 + }, + "88348": { + "lat": 48.0132718, + "lon": 9.5038216 + }, + "57078": { + "lat": 50.9241982, + "lon": 7.9979802 + }, + "72160": { + "lat": 48.4524183, + "lon": 8.6624266 + }, + "49716": { + "lat": 52.6985078, + "lon": 7.2503852 + }, + "24111": { + "lat": 54.3043198, + "lon": 10.0647871 + }, + "09116": { + "lat": 50.8205765, + "lon": 12.8734753 + }, + "63450": { + "lat": 50.1285671, + "lon": 8.9252343 + }, + "64285": { + "lat": 49.8517954, + "lon": 8.6583914 + }, + "46399": { + "lat": 51.8767165, + "lon": 6.592176 + }, + "50823": { + "lat": 50.9508203, + "lon": 6.9259111 + }, + "51702": { + "lat": 51.0304049, + "lon": 7.6756018 + }, + "26129": { + "lat": 53.1529595, + "lon": 8.1751768 + }, + "22391": { + "lat": 53.6423048, + "lon": 10.081893 + }, + "41472": { + "lat": 51.1601804, + "lon": 6.654715 + }, + "76199": { + "lat": 48.9755465, + "lon": 8.4040415 + }, + "35043": { + "lat": 50.7979432, + "lon": 8.8227218 + }, + "65929": { + "lat": 50.0944874, + "lon": 8.5308675 + }, + "27308": { + "lat": 52.9255206, + "lon": 9.3782295 + }, + "99510": { + "lat": 51.0351265, + "lon": 11.4866204 + }, + "94315": { + "lat": 48.8839157, + "lon": 12.5955773 + }, + "69126": { + "lat": 49.3773204, + "lon": 8.7015986 + }, + "14193": { + "lat": 52.4813456, + "lon": 13.2384701 + }, + "04318": { + "lat": 51.3431283, + "lon": 12.4282967 + }, + "48161": { + "lat": 51.9892494, + "lon": 7.5383949 + }, + "35683": { + "lat": 50.7423221, + "lon": 8.2847449 + }, + "42477": { + "lat": 51.2100192, + "lon": 7.3649391 + }, + "48317": { + "lat": 51.8011531, + "lon": 7.7434268 + }, + "10999": { + "lat": 52.4976589, + "lon": 13.4231017 + }, + "88260": { + "lat": 47.7032598, + "lon": 9.9431795 + }, + "72760": { + "lat": 48.5120972, + "lon": 9.2052416 + }, + "82467": { + "lat": 47.4902875, + "lon": 11.0332252 + }, + "44319": { + "lat": 51.5383021, + "lon": 7.6017367 + }, + "12524": { + "lat": 52.4118113, + "lon": 13.5481684 + }, + "99428": { + "lat": 50.9720626, + "lon": 11.2029566 + }, + "86695": { + "lat": 48.6048016, + "lon": 10.8196538 + }, + "04177": { + "lat": 51.3425514, + "lon": 12.330756 + }, + "50735": { + "lat": 50.9893938, + "lon": 6.9609471 + }, + "53909": { + "lat": 50.692835, + "lon": 6.6581295 + }, + "50169": { + "lat": 50.8807881, + "lon": 6.7426581 + }, + "89584": { + "lat": 48.2846841, + "lon": 9.6586816 + }, + "47179": { + "lat": 51.5247327, + "lon": 6.7297793 + }, + "76287": { + "lat": 48.9632556, + "lon": 8.310033 + }, + "74072": { + "lat": 49.1394593, + "lon": 9.2148992 } } \ No newline at end of file