Parallelize listing scraping with batched worker threads and fix the metrics endpoint to emit Prometheus text exposition format

This commit is contained in:
2025-11-25 18:43:42 +01:00
parent b3f293ca48
commit de7bcf881a
2 changed files with 529 additions and 22 deletions

View File

@ -268,15 +268,29 @@ def scrape_listing(url):
return None
def prefetch_listings_thread(session_id):
"""Background thread to prefetch all listings"""
def scrape_listing_wrapper(session_id, url, results, index):
    """Scrape a single listing URL inside a worker thread.

    Args:
        session_id: Key into the module-level ``scrape_sessions`` dict;
            used only to bail out early if the session no longer exists.
        url: Listing URL passed through to ``scrape_listing``.
        results: Pre-sized shared list; each worker writes only its own
            slot, so no lock is needed.
        index: The slot in ``results`` that this worker owns.
    """
    session = scrape_sessions.get(session_id)
    if not session:
        # Session was cancelled/deleted before this worker started.
        return
    listing = scrape_listing(url)
    # May be None when scraping failed; the caller filters None out.
    results[index] = listing
def prefetch_listings_thread(session_id):
    """Background thread that prefetches all listings for a session.

    URLs are processed in batches of ``max_workers`` parallel threads;
    results are appended to ``session["listings"]`` after each batch and
    ``session["scraped"]`` tracks progress (counting attempts, including
    failed scrapes, matching the pre-parallel behavior).
    """
    session = scrape_sessions.get(session_id)
    if not session:
        return
    urls = session["urls"]
    # Randomized worker count (2-7) — presumably to vary the request
    # footprint against the scraped site; TODO confirm intent.
    max_workers = random.randrange(2, 8)
    print(
        f"Starting prefetch for session {session_id} with {max_workers} parallel workers"
    )
    for i in range(0, len(urls), max_workers):
        # Check if session was cancelled or deleted
        if (
            session_id not in scrape_sessions
            # NOTE(review): the second clause of this condition falls in the
            # hidden gap between diff hunks — reconstructed as a cancel-flag
            # check; confirm against the full file before applying.
            or session.get("cancelled")
        ):
            print(f"Prefetch stopped for session {session_id}")
            return
        # Process batch of URLs in parallel
        batch = urls[i : i + max_workers]
        threads = []
        # One slot per URL; each worker writes only its own index.
        results = [None] * len(batch)
        for j, url in enumerate(batch):
            thread = threading.Thread(
                target=scrape_listing_wrapper,
                args=(session_id, url, results, j),
                daemon=True,
            )
            thread.start()
            threads.append(thread)
        # Wait for all threads in this batch to complete
        for thread in threads:
            thread.join()
        # Add successful results to the session; failed scrapes yield None.
        for listing in results:
            if listing:
                session["listings"].append(listing)
        session["scraped"] += len(batch)
        # Rate limiting between batches
        time.sleep(0.5)
    print(f"Prefetch complete for session {session_id}")
@app.route("/api/search", methods=["POST"])
def api_metrics():
    """Expose application metrics in Prometheus text exposition format.

    Returns:
        A Flask ``(body, status, headers)`` tuple whose body is the
        Prometheus v0.0.4 text format and whose Content-Type carries the
        version parameter scrapers expect.
    """
    # NOTE(review): the route decorator and any statements preceding the
    # uptime calculation are hidden outside the visible diff hunk — confirm
    # against the full file before applying.
    uptime = time.time() - app_start_time
    # Build Prometheus text format
    lines = []
    # Search requests
    lines.append("# HELP search_requests_total Total number of search requests")
    lines.append("# TYPE search_requests_total counter")
    lines.append(f"search_requests_total {metrics['search_requests']}")
    lines.append("")
    # Scrape requests
    lines.append("# HELP scrape_requests_total Total number of scrape requests")
    lines.append("# TYPE scrape_requests_total counter")
    lines.append(f"scrape_requests_total {metrics['scrape_requests']}")
    lines.append("")
    # Uptime
    lines.append("# HELP uptime_seconds Application uptime in seconds")
    lines.append("# TYPE uptime_seconds gauge")
    lines.append(f"uptime_seconds {uptime}")
    lines.append("")
    # Active sessions
    lines.append("# HELP active_sessions Number of active scraping sessions")
    lines.append("# TYPE active_sessions gauge")
    lines.append(f"active_sessions {len(scrape_sessions)}")
    lines.append("")
    # ZIP-code cache size.
    # FIX: HELP/TYPE previously declared "cache_size" while the sample line
    # emitted "zip_code_cache_size"; Prometheus requires metadata and sample
    # names to match, so the metadata is aligned to the emitted name.
    lines.append("# HELP zip_code_cache_size Number of cached ZIP codes")
    lines.append("# TYPE zip_code_cache_size gauge")
    lines.append(f"zip_code_cache_size {len(zip_cache)}")
    lines.append("")
    # Kleinanzeigen response codes, one labelled sample per status code
    lines.append(
        "# HELP kleinanzeigen_http_responses_total HTTP responses from kleinanzeigen.de"
    )
    lines.append("# TYPE kleinanzeigen_http_responses_total counter")
    for code, count in metrics["kleinanzeigen_response_codes"].items():
        lines.append(f'kleinanzeigen_http_responses_total{{code="{code}"}} {count}')
    lines.append("")
    # Nominatim response codes, one labelled sample per status code
    lines.append(
        "# HELP nominatim_http_responses_total HTTP responses from Nominatim API"
    )
    lines.append("# TYPE nominatim_http_responses_total counter")
    for code, count in metrics["nominatim_response_codes"].items():
        lines.append(f'nominatim_http_responses_total{{code="{code}"}} {count}')
    lines.append("")
    return (
        "\n".join(lines),
        200,
        {"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
    )