parallel scrape and fix metrics
@@ -268,15 +268,29 @@ def scrape_listing(url):
     return None


-def prefetch_listings_thread(session_id):
-    """Background thread to prefetch all listings"""
+def scrape_listing_wrapper(session_id, url, results, index):
+    """Wrapper for scraping listing in thread"""
     session = scrape_sessions.get(session_id)
     if not session:
         return

-    print(f"Starting prefetch for session {session_id}")
+    listing = scrape_listing(url)
+    results[index] = listing

-    for i, url in enumerate(session["urls"]):
+
+def prefetch_listings_thread(session_id):
+    """Background thread to prefetch all listings with parallel workers"""
+    session = scrape_sessions.get(session_id)
+    if not session:
+        return
+    urls = session["urls"]
+    max_workers = random.randrange(2, 8)
+
+    print(
+        f"Starting prefetch for session {session_id} with {max_workers} parallel workers"
+    )
+
+    for i in range(0, len(urls), max_workers):
         # Check if session was cancelled or deleted
         if (
             session_id not in scrape_sessions
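A note on the new scrape_listing_wrapper: the results list is pre-sized by the caller and each worker thread writes only to its own index, so the threads never touch the same slot and no lock is needed for handing results back. A minimal, self-contained sketch of that pattern (the worker function and values here are illustrative, not from the app):

import threading

def worker(results, index, value):
    # Each thread owns exactly one slot; writes to distinct indices
    # of a pre-sized list do not race with each other.
    results[index] = value * value

items = [1, 2, 3, 4]
results = [None] * len(items)
threads = [
    threading.Thread(target=worker, args=(results, i, v), daemon=True)
    for i, v in enumerate(items)
]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(results)  # [1, 4, 9, 16]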
@@ -285,15 +299,35 @@ def prefetch_listings_thread(session_id):
             print(f"Prefetch stopped for session {session_id}")
             return

-        listing = scrape_listing(url)
-        if listing:
-            session["listings"].append(listing)
-            session["scraped"] += 1
-        time.sleep(0.3)  # Rate limiting
+        # Process batch of URLs in parallel
+        batch = urls[i : i + max_workers]
+        threads = []
+        results = [None] * len(batch)

-    print(
-        f"Prefetch complete for session {session_id}: {len(session['listings'])} listings"
-    )
+        for j, url in enumerate(batch):
+            thread = threading.Thread(
+                target=scrape_listing_wrapper,
+                args=(session_id, url, results, j),
+                daemon=True,
+            )
+            thread.start()
+            threads.append(thread)
+
+        # Wait for all threads in this batch to complete
+        for thread in threads:
+            thread.join()
+
+        # Add results to session
+        for listing in results:
+            if listing:
+                session["listings"].append(listing)
+
+        session["scraped"] += len(batch)
+
+        # Rate limiting between batches
+        time.sleep(0.5)
+
+    print(f"Prefetch complete for session {session_id}")


 @app.route("/api/search", methods=["POST"])
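The new loop spawns max_workers threads per batch and joins them all before moving on, so one slow URL stalls its entire batch. The same bounded parallelism can be expressed with concurrent.futures, where a slow URL only occupies one worker. This is a sketch under the assumptions of this commit (the same session dict keys and scrape_listing function as above; prefetch_with_pool is a hypothetical name), not the code the commit adds:

from concurrent.futures import ThreadPoolExecutor, as_completed

def prefetch_with_pool(session, max_workers=4):
    # Bounded parallelism equivalent to the batch/join loop above, but
    # a slow URL ties up one worker instead of stalling a whole batch.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = {pool.submit(scrape_listing, url): url for url in session["urls"]}
        for future in as_completed(futures):
            listing = future.result()
            if listing:
                session["listings"].append(listing)
            session["scraped"] += 1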
@@ -416,16 +450,61 @@ def api_metrics():

     uptime = time.time() - app_start_time

-    return jsonify(
-        {
-            "search_requests_total": metrics["search_requests"],
-            "scrape_requests_total": metrics["scrape_requests"],
-            "uptime_seconds": uptime,
-            "kleinanzeigen_response_codes": metrics["kleinanzeigen_response_codes"],
-            "nominatim_response_codes": metrics["nominatim_response_codes"],
-            "active_sessions": len(scrape_sessions),
-            "cache_size": len(zip_cache),
-        }
-    )
+    # Build Prometheus text format
+    lines = []
+
+    # Search requests
+    lines.append("# HELP search_requests_total Total number of search requests")
+    lines.append("# TYPE search_requests_total counter")
+    lines.append(f"search_requests_total {metrics['search_requests']}")
+    lines.append("")
+
+    # Scrape requests
+    lines.append("# HELP scrape_requests_total Total number of scrape requests")
+    lines.append("# TYPE scrape_requests_total counter")
+    lines.append(f"scrape_requests_total {metrics['scrape_requests']}")
+    lines.append("")
+
+    # Uptime
+    lines.append("# HELP uptime_seconds Application uptime in seconds")
+    lines.append("# TYPE uptime_seconds gauge")
+    lines.append(f"uptime_seconds {uptime}")
+    lines.append("")
+
+    # Active sessions
+    lines.append("# HELP active_sessions Number of active scraping sessions")
+    lines.append("# TYPE active_sessions gauge")
+    lines.append(f"active_sessions {len(scrape_sessions)}")
+    lines.append("")
+
+    # Cache size
+    lines.append("# HELP cache_size Number of cached ZIP codes")
+    lines.append("# TYPE cache_size gauge")
+    lines.append(f"zip_code_cache_size {len(zip_cache)}")
+    lines.append("")
+
+    # Kleinanzeigen response codes
+    lines.append(
+        "# HELP kleinanzeigen_http_responses_total HTTP responses from kleinanzeigen.de"
+    )
+    lines.append("# TYPE kleinanzeigen_http_responses_total counter")
+    for code, count in metrics["kleinanzeigen_response_codes"].items():
+        lines.append(f'kleinanzeigen_http_responses_total{{code="{code}"}} {count}')
+    lines.append("")
+
+    # Nominatim response codes
+    lines.append(
+        "# HELP nominatim_http_responses_total HTTP responses from Nominatim API"
+    )
+    lines.append("# TYPE nominatim_http_responses_total counter")
+    for code, count in metrics["nominatim_response_codes"].items():
+        lines.append(f'nominatim_http_responses_total{{code="{code}"}} {count}')
+    lines.append("")
+
+    return (
+        "\n".join(lines),
+        200,
+        {"Content-Type": "text/plain; version=0.0.4; charset=utf-8"},
+    )

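The metrics handler now emits the Prometheus text exposition format by hand. One detail worth noting: the HELP/TYPE lines declare cache_size, while the sample itself is written as zip_code_cache_size, so that sample appears without its metadata. The same output can also be produced with the official prometheus_client package. This is an alternative sketch, not part of the commit; it assumes prometheus_client is installed, and the metric names simply mirror the hand-built ones above (the counters would be incremented wherever the metrics dict is updated today):

from prometheus_client import (
    CollectorRegistry,
    Counter,
    Gauge,
    generate_latest,
    CONTENT_TYPE_LATEST,
)

registry = CollectorRegistry()
search_requests = Counter(
    "search_requests", "Total number of search requests", registry=registry
)
active_sessions = Gauge(
    "active_sessions", "Number of active scraping sessions", registry=registry
)

search_requests.inc()   # would be called wherever metrics["search_requests"] is bumped
active_sessions.set(3)  # would be set from len(scrape_sessions)

body = generate_latest(registry)  # bytes in the Prometheus text exposition format
print(CONTENT_TYPE_LATEST)        # text/plain; version=0.0.4; charset=utf-8
print(body.decode("utf-8"))       # e.g. search_requests_total 1.0, active_sessions 3.0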