UI improvements and backend prefetch

2025-11-25 18:16:29 +01:00
parent dd36618802
commit 0858be033b
3 changed files with 578 additions and 178 deletions


@@ -17,22 +17,34 @@ import time
import json
import os
import uuid
import threading
app = Flask(__name__)
CORS(app)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# ZIP code cache file
# Configuration
CACHE_FILE = "zip_cache.json"
zip_cache = {}
# Active scrape sessions
scrape_sessions = {}
SESSION_TIMEOUT = 300 # seconds
LISTINGS_PER_PAGE = 25
# Global state
zip_cache = {}
scrape_sessions = {}
app_start_time = time.time()
# Metrics
metrics = {
"search_requests": 0,
"scrape_requests": 0,
"kleinanzeigen_response_codes": {},
"nominatim_response_codes": {},
}
def cleanup_old_sessions():
"""Remove sessions older than SESSION_TIMEOUT"""
current_time = time.time()
sessions_to_remove = []
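The hunk cuts off before the sweep itself; a minimal sketch of how the rest of cleanup_old_sessions could continue, assuming each session carries the created_at timestamp set in api_search (not necessarily the committed body):

def cleanup_old_sessions():
    """Remove sessions older than SESSION_TIMEOUT."""
    current_time = time.time()
    sessions_to_remove = []
    # Collect expired IDs first so the dict is not mutated while iterating
    for session_id, session in list(scrape_sessions.items()):
        if current_time - session["created_at"] > SESSION_TIMEOUT:
            sessions_to_remove.append(session_id)
    for session_id in sessions_to_remove:
        del scrape_sessions[session_id]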
@@ -63,8 +75,22 @@ def make_soup(url):
"""Fetch URL and return BeautifulSoup object"""
user_agent = {"user-agent": get_random_user_agent()}
http = urllib3.PoolManager(10, headers=user_agent)
r = http.request("GET", url)
return BeautifulSoup(r.data, "lxml")
try:
r = http.request("GET", url)
# Track response code
status_code = str(r.status)
if "kleinanzeigen.de" in url:
metrics["kleinanzeigen_response_codes"][status_code] = (
metrics["kleinanzeigen_response_codes"].get(status_code, 0) + 1
)
return BeautifulSoup(r.data, "lxml")
except Exception as e:
print(f"Error fetching {url}: {e}")
if "kleinanzeigen.de" in url:
metrics["kleinanzeigen_response_codes"]["error"] = (
metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
)
raise
def geocode_zip(zip_code):
@@ -88,6 +114,13 @@ def geocode_zip(zip_code):
response = requests.get(
url, params=params, headers={"user-agent": get_random_user_agent()}
)
# Track response code
status_code = str(response.status_code)
metrics["nominatim_response_codes"][status_code] = (
metrics["nominatim_response_codes"].get(status_code, 0) + 1
)
data = response.json()
if data:
@@ -102,6 +135,9 @@ def geocode_zip(zip_code):
return coords
except Exception as e:
print(f"Geocoding error for {zip_code}: {e}")
metrics["nominatim_response_codes"]["error"] = (
metrics["nominatim_response_codes"].get("error", 0) + 1
)
return None
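Both trackers repeat the same dict.get(key, 0) + 1 bookkeeping; with a collections.Counter the increment collapses to one line. A sketch of that alternative, not what the commit does (Counter is a dict subclass, so jsonify still serializes it):

import collections

metrics = {
    "search_requests": 0,
    "scrape_requests": 0,
    "kleinanzeigen_response_codes": collections.Counter(),
    "nominatim_response_codes": collections.Counter(),
}

# e.g. inside geocode_zip, after the requests.get call:
metrics["nominatim_response_codes"][str(response.status_code)] += 1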
@@ -136,8 +172,8 @@ def search_listings(search_term, max_pages, min_price, max_price):
for result in results:
try:
listing_url = result.a["href"]
found_listings.add(base_url + listing_url)
listing_href = result.a["href"]
found_listings.add(base_url + listing_href)
except (AttributeError, KeyError):
pass
except Exception as e:
@@ -151,6 +187,7 @@ def scrape_listing(url):
"""Scrape individual listing details"""
try:
soup = make_soup(url)
metrics["scrape_requests"] += 1
title = soup.find("h1", class_="boxedarticle--title")
if not title:
@@ -231,26 +268,61 @@ def scrape_listing(url):
return None
def prefetch_listings_thread(session_id):
"""Background thread to prefetch all listings"""
session = scrape_sessions.get(session_id)
if not session:
return
print(f"Starting prefetch for session {session_id}")
for i, url in enumerate(session["urls"]):
# Check if session was cancelled or deleted
if (
session_id not in scrape_sessions
or scrape_sessions[session_id]["cancelled"]
):
print(f"Prefetch stopped for session {session_id}")
return
listing = scrape_listing(url)
if listing:
session["listings"].append(listing)
session["scraped"] += 1
time.sleep(0.3) # Rate limiting
print(
f"Prefetch complete for session {session_id}: {len(session['listings'])} listings"
)
@app.route("/api/search", methods=["POST"])
def api_search():
"""API endpoint for searching listings - returns only count and URLs"""
"""API endpoint for searching listings - returns count and starts prefetch"""
data = request.json
metrics["search_requests"] += 1
# Cleanup old sessions before creating new one
cleanup_old_sessions()
search_term = data.get("search_term", "")
max_pages = data.get("max_pages", 1)
num_listings = data.get("num_listings", 25)
min_price = data.get("min_price", 0)
max_price = data.get("max_price", 10000)
max_price = data.get("max_price", 1000000000)
if not search_term:
return jsonify({"error": "Search term is required"}), 400
# Calculate pages needed
max_pages = max(1, (num_listings + LISTINGS_PER_PAGE - 1) // LISTINGS_PER_PAGE)
try:
# Search for listing URLs only
listing_urls = search_listings(search_term, max_pages, min_price, max_price)
# Limit to requested number
listing_urls = listing_urls[:num_listings]
# Create session ID
session_id = str(uuid.uuid4())
@@ -264,6 +336,12 @@ def api_search():
"created_at": time.time(),
}
# Start prefetch in background thread
prefetch_thread = threading.Thread(
target=prefetch_listings_thread, args=(session_id,), daemon=True
)
prefetch_thread.start()
return jsonify({"session_id": session_id, "total": len(listing_urls)})
except Exception as e:
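Only the tail of the session initializer is visible in the hunk above; judging from the fields read elsewhere in the diff, the full dict presumably looks roughly like this (field names come from the code, the exact shape is an assumption):

scrape_sessions[session_id] = {
    "urls": listing_urls,        # queue consumed by the prefetch thread
    "listings": [],              # scraped results, drained by /api/scrape
    "scraped": 0,                # number of URLs processed so far
    "total": len(listing_urls),  # used for progress reporting
    "cancelled": False,          # checked by the prefetch thread each iteration
    "created_at": time.time(),   # used by cleanup_old_sessions
}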
@@ -272,8 +350,7 @@ def api_search():
@app.route("/api/scrape/<session_id>", methods=["GET"])
def api_scrape(session_id):
"""API endpoint for scraping next listing in session"""
# Cleanup old sessions on each request
"""API endpoint to get next scraped listing from session"""
cleanup_old_sessions()
if session_id not in scrape_sessions:
@@ -284,21 +361,28 @@ def api_scrape(session_id):
if session["cancelled"]:
return jsonify({"cancelled": True}), 200
if session["scraped"] >= session["total"]:
return jsonify({"complete": True, "listing": None})
# Wait briefly if no listings are ready yet
wait_count = 0
while (
len(session["listings"]) == 0
and session["scraped"] < session["total"]
and wait_count < 10
):
time.sleep(0.1)
wait_count += 1
# Scrape next listing
url = session["urls"][session["scraped"]]
listing = scrape_listing(url)
if len(session["listings"]) > 0:
listing = session["listings"].pop(0)
else:
listing = None
if listing:
session["listings"].append(listing)
session["scraped"] += 1
is_complete = (
session["scraped"] >= session["total"] and len(session["listings"]) == 0
)
return jsonify(
{
"complete": session["scraped"] >= session["total"],
"complete": is_complete,
"listing": listing,
"progress": {"current": session["scraped"], "total": session["total"]},
}
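Together the endpoints form a search-then-poll protocol: api_search queues the URLs and starts the prefetch thread, and each GET to /api/scrape/<session_id> drains one cached listing. A minimal client sketch against a local instance (base URL, query values, and polling interval are assumptions; routes and field names come from the diff):

import time
import requests

BASE = "http://localhost:5000/api"  # assumed dev address

resp = requests.post(f"{BASE}/search",
                     json={"search_term": "fahrrad", "num_listings": 10})
session_id = resp.json()["session_id"]

listings = []
while True:
    data = requests.get(f"{BASE}/scrape/{session_id}").json()
    if "error" in data:
        break
    if data.get("listing"):
        listings.append(data["listing"])
    if data.get("complete") or data.get("cancelled"):
        break
    time.sleep(0.5)  # the backend itself waits up to ~1 s when nothing is ready

# Aborting early also deletes the server-side cache:
# requests.post(f"{BASE}/scrape/{session_id}/cancel")

print(f"Fetched {len(listings)} listings")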
@@ -307,51 +391,40 @@ def api_scrape(session_id):
@app.route("/api/scrape/<session_id>/cancel", methods=["POST"])
def api_cancel_scrape(session_id):
"""API endpoint to cancel scraping session"""
"""API endpoint to cancel scraping session and delete cached listings"""
cleanup_old_sessions()
if session_id not in scrape_sessions:
return jsonify({"error": "Invalid session ID"}), 404
scrape_sessions[session_id]["cancelled"] = True
# Delete session completely (including cached listings)
del scrape_sessions[session_id]
return jsonify(
{
"cancelled": True,
"listings": scrape_sessions[session_id]["listings"],
"total_scraped": len(scrape_sessions[session_id]["listings"]),
}
)
@app.route("/api/scrape/<session_id>/results", methods=["GET"])
def api_get_results(session_id):
"""API endpoint to get all scraped results"""
cleanup_old_sessions()
if session_id not in scrape_sessions:
return jsonify({"error": "Invalid session ID"}), 404
session = scrape_sessions[session_id]
return jsonify(
{
"listings": session["listings"],
"total": len(session["listings"]),
"progress": {"current": session["scraped"], "total": session["total"]},
}
)
return jsonify({"cancelled": True, "message": "Session deleted"})
@app.route("/api/health", methods=["GET"])
def health():
"""Health check endpoint"""
return jsonify({"status": "ok"})
@app.route("/api/metrics", methods=["GET"])
def api_metrics():
"""Prometheus-style metrics endpoint"""
cleanup_old_sessions()
uptime = time.time() - app_start_time
return jsonify(
{
"status": "ok",
"cache_size": len(zip_cache),
"search_requests_total": metrics["search_requests"],
"scrape_requests_total": metrics["scrape_requests"],
"uptime_seconds": uptime,
"kleinanzeigen_response_codes": metrics["kleinanzeigen_response_codes"],
"nominatim_response_codes": metrics["nominatim_response_codes"],
"active_sessions": len(scrape_sessions),
"cache_size": len(zip_cache),
}
)
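Despite the docstring, the endpoint returns plain JSON rather than the Prometheus text format, so a monitoring job would read it as ordinary JSON. A quick sketch (URL assumed):

import requests

m = requests.get("http://localhost:5000/api/metrics").json()
print(m["search_requests_total"], m["scrape_requests_total"], round(m["uptime_seconds"]))
print(m["kleinanzeigen_response_codes"])  # e.g. {"200": 42, "error": 1}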
@@ -365,5 +438,4 @@ if __name__ == "__main__":
zip_cache = json.load(f)
print(f"Loaded {len(zip_cache)} ZIP codes from cache")
print("ZIP code cache loaded with", len(zip_cache), "entries")
app.run(debug=True, host="0.0.0.0", port=5000)
app.run(debug=True, host="0.0.0.0", port=5000, threaded=True)