parallel scrape and fix metrics

This commit is contained in:
2025-11-25 18:43:42 +01:00
parent b3f293ca48
commit de7bcf881a
2 changed files with 529 additions and 22 deletions

View File

@ -268,15 +268,29 @@ def scrape_listing(url):
return None return None
def prefetch_listings_thread(session_id): def scrape_listing_wrapper(session_id, url, results, index):
"""Background thread to prefetch all listings""" """Wrapper for scraping listing in thread"""
session = scrape_sessions.get(session_id) session = scrape_sessions.get(session_id)
if not session: if not session:
return return
print(f"Starting prefetch for session {session_id}") listing = scrape_listing(url)
results[index] = listing
for i, url in enumerate(session["urls"]):
def prefetch_listings_thread(session_id):
"""Background thread to prefetch all listings with parallel workers"""
session = scrape_sessions.get(session_id)
if not session:
return
urls = session["urls"]
max_workers = random.randrange(2, 8)
print(
f"Starting prefetch for session {session_id} with {max_workers} parallel workers"
)
for i in range(0, len(urls), max_workers):
# Check if session was cancelled or deleted # Check if session was cancelled or deleted
if ( if (
session_id not in scrape_sessions session_id not in scrape_sessions
@ -285,15 +299,35 @@ def prefetch_listings_thread(session_id):
print(f"Prefetch stopped for session {session_id}") print(f"Prefetch stopped for session {session_id}")
return return
listing = scrape_listing(url) # Process batch of URLs in parallel
batch = urls[i : i + max_workers]
threads = []
results = [None] * len(batch)
for j, url in enumerate(batch):
thread = threading.Thread(
target=scrape_listing_wrapper,
args=(session_id, url, results, j),
daemon=True,
)
thread.start()
threads.append(thread)
# Wait for all threads in this batch to complete
for thread in threads:
thread.join()
# Add results to session
for listing in results:
if listing: if listing:
session["listings"].append(listing) session["listings"].append(listing)
session["scraped"] += 1
time.sleep(0.3) # Rate limiting
print( session["scraped"] += len(batch)
f"Prefetch complete for session {session_id}: {len(session['listings'])} listings"
) # Rate limiting between batches
time.sleep(0.5)
print(f"Prefetch complete for session {session_id}")
@app.route("/api/search", methods=["POST"]) @app.route("/api/search", methods=["POST"])
@ -416,16 +450,61 @@ def api_metrics():
uptime = time.time() - app_start_time uptime = time.time() - app_start_time
return jsonify( # Build Prometheus text format
{ lines = []
"search_requests_total": metrics["search_requests"],
"scrape_requests_total": metrics["scrape_requests"], # Search requests
"uptime_seconds": uptime, lines.append("# HELP search_requests_total Total number of search requests")
"kleinanzeigen_response_codes": metrics["kleinanzeigen_response_codes"], lines.append("# TYPE search_requests_total counter")
"nominatim_response_codes": metrics["nominatim_response_codes"], lines.append(f"search_requests_total {metrics['search_requests']}")
"active_sessions": len(scrape_sessions), lines.append("")
"cache_size": len(zip_cache),
} # Scrape requests
lines.append("# HELP scrape_requests_total Total number of scrape requests")
lines.append("# TYPE scrape_requests_total counter")
lines.append(f"scrape_requests_total {metrics['scrape_requests']}")
lines.append("")
# Uptime
lines.append("# HELP uptime_seconds Application uptime in seconds")
lines.append("# TYPE uptime_seconds gauge")
lines.append(f"uptime_seconds {uptime}")
lines.append("")
# Active sessions
lines.append("# HELP active_sessions Number of active scraping sessions")
lines.append("# TYPE active_sessions gauge")
lines.append(f"active_sessions {len(scrape_sessions)}")
lines.append("")
# Cache size
# The HELP/TYPE metadata must use the same metric name as the sample line;
# in the Prometheus text format a sample with no matching TYPE line is
# treated as untyped. The emitted series name is kept as-is.
lines.append("# HELP zip_code_cache_size Number of cached ZIP codes")
lines.append("# TYPE zip_code_cache_size gauge")
lines.append(f"zip_code_cache_size {len(zip_cache)}")
lines.append("")
# Kleinanzeigen response codes
lines.append(
"# HELP kleinanzeigen_http_responses_total HTTP responses from kleinanzeigen.de"
)
lines.append("# TYPE kleinanzeigen_http_responses_total counter")
for code, count in metrics["kleinanzeigen_response_codes"].items():
lines.append(f'kleinanzeigen_http_responses_total{{code="{code}"}} {count}')
lines.append("")
# Nominatim response codes
lines.append(
"# HELP nominatim_http_responses_total HTTP responses from Nominatim API"
)
lines.append("# TYPE nominatim_http_responses_total counter")
for code, count in metrics["nominatim_response_codes"].items():
lines.append(f'nominatim_http_responses_total{{code="{code}"}} {count}')
lines.append("")
return (
"\n".join(lines),
200,
{"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
) )

View File

@ -1598,5 +1598,433 @@
"53175": { "53175": {
"lat": 50.6989638, "lat": 50.6989638,
"lon": 7.1445107 "lon": 7.1445107
},
"78467": {
"lat": 47.6929555,
"lon": 9.1513759
},
"48703": {
"lat": 52.0035321,
"lon": 6.9517971
},
"46049": {
"lat": 51.4725211,
"lon": 6.8311577
},
"48143": {
"lat": 51.9604439,
"lon": 7.6262442
},
"06231": {
"lat": 51.2849151,
"lon": 12.1146298
},
"33332": {
"lat": 51.8972222,
"lon": 8.4006525
},
"27283": {
"lat": 52.9410676,
"lon": 9.2354716
},
"10317": {
"lat": 52.4986204,
"lon": 13.4838382
},
"01640": {
"lat": 51.1331059,
"lon": 13.5656911
},
"46244": {
"lat": 51.5984773,
"lon": 6.9123203
},
"01796": {
"lat": 50.9470409,
"lon": 13.9505572
},
"32339": {
"lat": 52.3741653,
"lon": 8.6212978
},
"50181": {
"lat": 51.0144705,
"lon": 6.5569525
},
"93055": {
"lat": 49.007933,
"lon": 12.1608121
},
"18147": {
"lat": 54.1309902,
"lon": 12.1196962
},
"49504": {
"lat": 52.2991515,
"lon": 7.9218375
},
"60318": {
"lat": 50.1246887,
"lon": 8.6865254
},
"96052": {
"lat": 49.9117586,
"lon": 10.8880355
},
"29559": {
"lat": 52.8774192,
"lon": 10.6061272
},
"10115": {
"lat": 52.5319487,
"lon": 13.3837943
},
"27251": {
"lat": 52.7492339,
"lon": 8.7757762
},
"22303": {
"lat": 53.5897407,
"lon": 10.0234361
},
"38122": {
"lat": 52.2297328,
"lon": 10.4745918
},
"51371": {
"lat": 51.0590744,
"lon": 6.9417484
},
"60314": {
"lat": 50.1166698,
"lon": 8.7334387
},
"70376": {
"lat": 48.818393,
"lon": 9.2066864
},
"93499": {
"lat": 49.1429872,
"lon": 12.7164157
},
"18435": {
"lat": 54.32997,
"lon": 13.0649961
},
"12105": {
"lat": 52.4484553,
"lon": 13.3722304
},
"81929": {
"lat": 48.1606494,
"lon": 11.6631075
},
"45768": {
"lat": 51.6575564,
"lon": 7.0659333
},
"91074": {
"lat": 49.5738171,
"lon": 10.8926968
},
"49593": {
"lat": 52.5674997,
"lon": 7.9325832
},
"80935": {
"lat": 48.1997053,
"lon": 11.5552742
},
"52134": {
"lat": 50.8605761,
"lon": 6.1001816
},
"94535": {
"lat": 48.7110796,
"lon": 13.2553681
},
"99947": {
"lat": 51.1242532,
"lon": 10.6769762
},
"09112": {
"lat": 50.830933,
"lon": 12.9053458
},
"01968": {
"lat": 51.5238377,
"lon": 14.0284911
},
"31515": {
"lat": 52.4314053,
"lon": 9.428236
},
"40547": {
"lat": 51.2441486,
"lon": 6.7400785
},
"72800": {
"lat": 48.4830973,
"lon": 9.2728039
},
"81476": {
"lat": 48.0873869,
"lon": 11.4957046
},
"94034": {
"lat": 48.593963,
"lon": 13.449846
},
"84478": {
"lat": 48.1977065,
"lon": 12.4064772
},
"69120": {
"lat": 49.4197028,
"lon": 8.7013385
},
"16303": {
"lat": 53.0795487,
"lon": 14.2322027
},
"48165": {
"lat": 51.8982648,
"lon": 7.650382
},
"23554": {
"lat": 53.889632,
"lon": 10.6772133
},
"57648": {
"lat": 50.6553129,
"lon": 7.9089968
},
"50677": {
"lat": 50.9222793,
"lon": 6.9491251
},
"26826": {
"lat": 53.165921,
"lon": 7.3277997
},
"24340": {
"lat": 54.4684418,
"lon": 9.7984274
},
"25335": {
"lat": 53.7556754,
"lon": 9.6072404
},
"89160": {
"lat": 48.4795088,
"lon": 9.9097371
},
"51580": {
"lat": 50.955823,
"lon": 7.6952729
},
"59075": {
"lat": 51.706537,
"lon": 7.7471066
},
"28355": {
"lat": 53.1001424,
"lon": 8.9369005
},
"10961": {
"lat": 52.492375,
"lon": 13.3969612
},
"33649": {
"lat": 51.9812735,
"lon": 8.4631941
},
"01945": {
"lat": 51.426725,
"lon": 13.8800707
},
"40225": {
"lat": 51.1952407,
"lon": 6.7930966
},
"83043": {
"lat": 47.8653219,
"lon": 12.0086382
},
"01279": {
"lat": 51.0279271,
"lon": 13.8224355
},
"88348": {
"lat": 48.0132718,
"lon": 9.5038216
},
"57078": {
"lat": 50.9241982,
"lon": 7.9979802
},
"72160": {
"lat": 48.4524183,
"lon": 8.6624266
},
"49716": {
"lat": 52.6985078,
"lon": 7.2503852
},
"24111": {
"lat": 54.3043198,
"lon": 10.0647871
},
"09116": {
"lat": 50.8205765,
"lon": 12.8734753
},
"63450": {
"lat": 50.1285671,
"lon": 8.9252343
},
"64285": {
"lat": 49.8517954,
"lon": 8.6583914
},
"46399": {
"lat": 51.8767165,
"lon": 6.592176
},
"50823": {
"lat": 50.9508203,
"lon": 6.9259111
},
"51702": {
"lat": 51.0304049,
"lon": 7.6756018
},
"26129": {
"lat": 53.1529595,
"lon": 8.1751768
},
"22391": {
"lat": 53.6423048,
"lon": 10.081893
},
"41472": {
"lat": 51.1601804,
"lon": 6.654715
},
"76199": {
"lat": 48.9755465,
"lon": 8.4040415
},
"35043": {
"lat": 50.7979432,
"lon": 8.8227218
},
"65929": {
"lat": 50.0944874,
"lon": 8.5308675
},
"27308": {
"lat": 52.9255206,
"lon": 9.3782295
},
"99510": {
"lat": 51.0351265,
"lon": 11.4866204
},
"94315": {
"lat": 48.8839157,
"lon": 12.5955773
},
"69126": {
"lat": 49.3773204,
"lon": 8.7015986
},
"14193": {
"lat": 52.4813456,
"lon": 13.2384701
},
"04318": {
"lat": 51.3431283,
"lon": 12.4282967
},
"48161": {
"lat": 51.9892494,
"lon": 7.5383949
},
"35683": {
"lat": 50.7423221,
"lon": 8.2847449
},
"42477": {
"lat": 51.2100192,
"lon": 7.3649391
},
"48317": {
"lat": 51.8011531,
"lon": 7.7434268
},
"10999": {
"lat": 52.4976589,
"lon": 13.4231017
},
"88260": {
"lat": 47.7032598,
"lon": 9.9431795
},
"72760": {
"lat": 48.5120972,
"lon": 9.2052416
},
"82467": {
"lat": 47.4902875,
"lon": 11.0332252
},
"44319": {
"lat": 51.5383021,
"lon": 7.6017367
},
"12524": {
"lat": 52.4118113,
"lon": 13.5481684
},
"99428": {
"lat": 50.9720626,
"lon": 11.2029566
},
"86695": {
"lat": 48.6048016,
"lon": 10.8196538
},
"04177": {
"lat": 51.3425514,
"lon": 12.330756
},
"50735": {
"lat": 50.9893938,
"lon": 6.9609471
},
"53909": {
"lat": 50.692835,
"lon": 6.6581295
},
"50169": {
"lat": 50.8807881,
"lon": 6.7426581
},
"89584": {
"lat": 48.2846841,
"lon": 9.6586816
},
"47179": {
"lat": 51.5247327,
"lon": 6.7297793
},
"76287": {
"lat": 48.9632556,
"lon": 8.310033
},
"74072": {
"lat": 49.1394593,
"lon": 9.2148992
} }
} }