UI improvements and backend prefetch
@@ -17,22 +17,34 @@ import time
import json
import os
import uuid
import threading

app = Flask(__name__)
CORS(app)

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# ZIP code cache file
# Configuration
CACHE_FILE = "zip_cache.json"
zip_cache = {}

# Active scrape sessions
scrape_sessions = {}

SESSION_TIMEOUT = 300  # seconds
LISTINGS_PER_PAGE = 25

# Global state
zip_cache = {}
scrape_sessions = {}
app_start_time = time.time()

# Metrics
metrics = {
    "search_requests": 0,
    "scrape_requests": 0,
    "kleinanzeigen_response_codes": {},
    "nominatim_response_codes": {},
}


def cleanup_old_sessions():
    """Remove sessions older than SESSION_TIMEOUT"""
    current_time = time.time()
    sessions_to_remove = []

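The rest of cleanup_old_sessions falls outside this diff's context. A minimal sketch of how the sweep could finish, assuming each session carries the created_at timestamp set in api_search below (illustration only, not part of the commit):

    # Hypothetical completion of the sweep (not shown in this diff):
    for session_id, session in scrape_sessions.items():
        if current_time - session.get("created_at", current_time) > SESSION_TIMEOUT:
            sessions_to_remove.append(session_id)
    for session_id in sessions_to_remove:
        del scrape_sessions[session_id]
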
@@ -63,8 +75,22 @@ def make_soup(url):
    """Fetch URL and return BeautifulSoup object"""
    user_agent = {"user-agent": get_random_user_agent()}
    http = urllib3.PoolManager(10, headers=user_agent)
    r = http.request("GET", url)
    return BeautifulSoup(r.data, "lxml")
    try:
        r = http.request("GET", url)
        # Track response code
        status_code = str(r.status)
        if "kleinanzeigen.de" in url:
            metrics["kleinanzeigen_response_codes"][status_code] = (
                metrics["kleinanzeigen_response_codes"].get(status_code, 0) + 1
            )
        return BeautifulSoup(r.data, "lxml")
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        if "kleinanzeigen.de" in url:
            metrics["kleinanzeigen_response_codes"]["error"] = (
                metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
            )
        raise


def geocode_zip(zip_code):
@@ -88,6 +114,13 @@ def geocode_zip(zip_code):
        response = requests.get(
            url, params=params, headers={"user-agent": get_random_user_agent()}
        )

        # Track response code
        status_code = str(response.status_code)
        metrics["nominatim_response_codes"][status_code] = (
            metrics["nominatim_response_codes"].get(status_code, 0) + 1
        )

        data = response.json()

        if data:
@@ -102,6 +135,9 @@ def geocode_zip(zip_code):
            return coords
    except Exception as e:
        print(f"Geocoding error for {zip_code}: {e}")
        metrics["nominatim_response_codes"]["error"] = (
            metrics["nominatim_response_codes"].get("error", 0) + 1
        )

    return None

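make_soup and geocode_zip both count responses with the same get-or-zero-then-increment pattern. A small helper of this shape would express it once; record_status is hypothetical and not part of the commit:

def record_status(bucket, status):
    # Increment a per-status-code counter such as metrics["nominatim_response_codes"].
    key = str(status)
    bucket[key] = bucket.get(key, 0) + 1

With such a helper, the tracking lines above would reduce to record_status(metrics["nominatim_response_codes"], response.status_code).
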
@@ -136,8 +172,8 @@ def search_listings(search_term, max_pages, min_price, max_price):

        for result in results:
            try:
                listing_url = result.a["href"]
                found_listings.add(base_url + listing_url)
                listing_href = result.a["href"]
                found_listings.add(base_url + listing_href)
            except (AttributeError, KeyError):
                pass
    except Exception as e:
@@ -151,6 +187,7 @@ def scrape_listing(url):
    """Scrape individual listing details"""
    try:
        soup = make_soup(url)
        metrics["scrape_requests"] += 1

        title = soup.find("h1", class_="boxedarticle--title")
        if not title:
@@ -231,26 +268,61 @@ def scrape_listing(url):
        return None


def prefetch_listings_thread(session_id):
    """Background thread to prefetch all listings"""
    session = scrape_sessions.get(session_id)
    if not session:
        return

    print(f"Starting prefetch for session {session_id}")

    for i, url in enumerate(session["urls"]):
        # Check if session was cancelled or deleted
        if (
            session_id not in scrape_sessions
            or scrape_sessions[session_id]["cancelled"]
        ):
            print(f"Prefetch stopped for session {session_id}")
            return

        listing = scrape_listing(url)
        if listing:
            session["listings"].append(listing)
        session["scraped"] += 1
        time.sleep(0.3)  # Rate limiting

    print(
        f"Prefetch complete for session {session_id}: {len(session['listings'])} listings"
    )


@app.route("/api/search", methods=["POST"])
def api_search():
    """API endpoint for searching listings - returns only count and URLs"""
    """API endpoint for searching listings - returns count and starts prefetch"""
    data = request.json
    metrics["search_requests"] += 1

    # Cleanup old sessions before creating new one
    cleanup_old_sessions()

    search_term = data.get("search_term", "")
    max_pages = data.get("max_pages", 1)
    num_listings = data.get("num_listings", 25)
    min_price = data.get("min_price", 0)
    max_price = data.get("max_price", 10000)
    max_price = data.get("max_price", 1000000000)

    if not search_term:
        return jsonify({"error": "Search term is required"}), 400

    # Calculate pages needed
    max_pages = max(1, (num_listings + LISTINGS_PER_PAGE - 1) // LISTINGS_PER_PAGE)

    try:
        # Search for listing URLs only
        listing_urls = search_listings(search_term, max_pages, min_price, max_price)

        # Limit to requested number
        listing_urls = listing_urls[:num_listings]

        # Create session ID
        session_id = str(uuid.uuid4())

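prefetch_listings_thread appends to session["listings"] while api_scrape (further below) pops from the same list. Under CPython each individual append/pop is atomic, which the code above appears to rely on; if stricter guarantees were wanted, one option is a per-session lock. A sketch under that assumption (the "lock" field is hypothetical and not part of the commit):

import threading

# Hypothetical: api_search would store a lock in each session dict,
# e.g. "lock": threading.Lock(), alongside "listings" and "scraped".
def pop_next_listing(session):
    # Take the next prefetched listing, or None if nothing is ready yet.
    with session["lock"]:
        if session["listings"]:
            return session["listings"].pop(0)
        return None
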
@@ -264,6 +336,12 @@ def api_search():
            "created_at": time.time(),
        }

        # Start prefetch in background thread
        prefetch_thread = threading.Thread(
            target=prefetch_listings_thread, args=(session_id,), daemon=True
        )
        prefetch_thread.start()

        return jsonify({"session_id": session_id, "total": len(listing_urls)})

    except Exception as e:
@@ -272,8 +350,7 @@ def api_search():

@app.route("/api/scrape/<session_id>", methods=["GET"])
def api_scrape(session_id):
    """API endpoint for scraping next listing in session"""
    # Cleanup old sessions on each request
    """API endpoint to get next scraped listing from session"""
    cleanup_old_sessions()

    if session_id not in scrape_sessions:
@@ -284,21 +361,28 @@ def api_scrape(session_id):
    if session["cancelled"]:
        return jsonify({"cancelled": True}), 200

    if session["scraped"] >= session["total"]:
        return jsonify({"complete": True, "listing": None})
    # Wait briefly if no listings are ready yet
    wait_count = 0
    while (
        len(session["listings"]) == 0
        and session["scraped"] < session["total"]
        and wait_count < 10
    ):
        time.sleep(0.1)
        wait_count += 1

    # Scrape next listing
    url = session["urls"][session["scraped"]]
    listing = scrape_listing(url)
    if len(session["listings"]) > 0:
        listing = session["listings"].pop(0)
    else:
        listing = None

    if listing:
        session["listings"].append(listing)

    session["scraped"] += 1
    is_complete = (
        session["scraped"] >= session["total"] and len(session["listings"]) == 0
    )

    return jsonify(
        {
            "complete": session["scraped"] >= session["total"],
            "complete": is_complete,
            "listing": listing,
            "progress": {"current": session["scraped"], "total": session["total"]},
        }
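Together, POST /api/search and GET /api/scrape/<session_id> form a start-then-poll flow. A minimal client sketch, assuming the dev server from the __main__ block is running on localhost:5000; the search term and counts are placeholders (illustration only, not part of the commit):

import time
import requests

BASE = "http://localhost:5000"

# Kick off a search; the backend starts prefetching listings in a background thread.
resp = requests.post(f"{BASE}/api/search", json={"search_term": "fahrrad", "num_listings": 10})
resp.raise_for_status()
session_id = resp.json()["session_id"]
total = resp.json()["total"]

# Poll until the session reports completion (or disappears).
listings = []
while True:
    data = requests.get(f"{BASE}/api/scrape/{session_id}").json()
    if "error" in data:
        break  # session expired or was cancelled and deleted
    if data.get("listing"):
        listings.append(data["listing"])
    if data.get("complete") or data.get("cancelled"):
        break
    time.sleep(0.2)

print(f"Received {len(listings)} of {total} listings")
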
@@ -307,51 +391,40 @@ def api_scrape(session_id):

@app.route("/api/scrape/<session_id>/cancel", methods=["POST"])
def api_cancel_scrape(session_id):
    """API endpoint to cancel scraping session"""
    """API endpoint to cancel scraping session and delete cached listings"""
    cleanup_old_sessions()

    if session_id not in scrape_sessions:
        return jsonify({"error": "Invalid session ID"}), 404

    scrape_sessions[session_id]["cancelled"] = True
    # Delete session completely (including cached listings)
    del scrape_sessions[session_id]

    return jsonify(
        {
            "cancelled": True,
            "listings": scrape_sessions[session_id]["listings"],
            "total_scraped": len(scrape_sessions[session_id]["listings"]),
        }
    )


@app.route("/api/scrape/<session_id>/results", methods=["GET"])
def api_get_results(session_id):
    """API endpoint to get all scraped results"""
    cleanup_old_sessions()

    if session_id not in scrape_sessions:
        return jsonify({"error": "Invalid session ID"}), 404

    session = scrape_sessions[session_id]

    return jsonify(
        {
            "listings": session["listings"],
            "total": len(session["listings"]),
            "progress": {"current": session["scraped"], "total": session["total"]},
        }
    )
    return jsonify({"cancelled": True, "message": "Session deleted"})


@app.route("/api/health", methods=["GET"])
def health():
    """Health check endpoint"""
    return jsonify({"status": "ok"})


@app.route("/api/metrics", methods=["GET"])
def api_metrics():
    """Prometheus-style metrics endpoint"""
    cleanup_old_sessions()

    uptime = time.time() - app_start_time

    return jsonify(
        {
            "status": "ok",
            "cache_size": len(zip_cache),
            "search_requests_total": metrics["search_requests"],
            "scrape_requests_total": metrics["scrape_requests"],
            "uptime_seconds": uptime,
            "kleinanzeigen_response_codes": metrics["kleinanzeigen_response_codes"],
            "nominatim_response_codes": metrics["nominatim_response_codes"],
            "active_sessions": len(scrape_sessions),
            "cache_size": len(zip_cache),
        }
    )

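The /api/metrics response is plain JSON with the keys listed above, so it can be polled in a few lines; a sketch assuming the same localhost:5000 address (illustration only, not part of the commit):

import requests

# Fetch the metrics snapshot exposed by /api/metrics.
snapshot = requests.get("http://localhost:5000/api/metrics").json()
print(snapshot["search_requests_total"], snapshot["scrape_requests_total"])
print(snapshot["kleinanzeigen_response_codes"])  # per-status-code counters
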
@@ -365,5 +438,4 @@ if __name__ == "__main__":
            zip_cache = json.load(f)

    print(f"Loaded {len(zip_cache)} ZIP codes from cache")
    print("ZIP code cache loaded with", len(zip_cache), "entries")
    app.run(debug=True, host="0.0.0.0", port=5000)
    app.run(debug=True, host="0.0.0.0", port=5000, threaded=True)