filter description
@@ -222,6 +222,11 @@ class KleinanzeigenScraper:
         if not zip_code:
             return None
 
+        desc = None
+        desc_elem = soup.find("p", id="viewad-description-text")
+        if desc_elem:
+            desc = desc_elem.text.strip()
+
         listing = {
             "title": title,
             "price": price,
@@ -231,6 +236,7 @@ class KleinanzeigenScraper:
             "date_added": date_added.isoformat() if date_added else None,
             "image": first_image,
             "url": url,
+            "desc": desc,
         }
 
         # Add coordinates
@@ -275,6 +281,7 @@ def main():
     # Search for listings
     print("Step 1: Searching for listing URLs...")
     listing_urls = scraper.search_listings(search_term, max_pages, min_price, max_price)
+    #listing_urls = ["https://www.kleinanzeigen.de/s-anzeige/cube-nuroad-pro-fe-2023-58-cm-l-/3226095826-217-23527"]
     print(f"Found {len(listing_urls)} listings\n")
 
     if len(listing_urls) > 0:
@@ -294,6 +301,7 @@ def main():
             print(f" Date: {listing['date_added']}")
             print(f" Coordinates: {listing.get('lat')}, {listing.get('lon')}")
             print(f" Image: {listing['image']}")
+            print(f" Description: {listing['desc']}")
         else:
             print("Failed to scrape listing")
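As a quick standalone illustration (not part of the commit), the lookup added in the first hunk can be exercised directly with BeautifulSoup; the HTML string below is an invented stand-in for a real listing page:

# Standalone sketch of the description lookup added above.
# The HTML is a made-up stand-in for a real Kleinanzeigen listing page.
from bs4 import BeautifulSoup

html = '<p id="viewad-description-text">  Cube Nuroad Pro FE 2023, 58 cm, barely ridden.  </p>'
soup = BeautifulSoup(html, "html.parser")

desc = None
desc_elem = soup.find("p", id="viewad-description-text")
if desc_elem:
    desc = desc_elem.text.strip()

print(desc)  # Cube Nuroad Pro FE 2023, 58 cm, barely ridden.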
@@ -9,6 +9,7 @@ import os
 import uuid
 import threading
 import random
+import re
 
 from kleinanzeigen_scrape import KleinanzeigenScraper
 
@@ -128,6 +129,26 @@ def prefetch_listings_thread(session_id):
     print(f"Prefetch complete for session {session_id}")
 
 
+def filter_listing_search_description(description_term, description):
+    # Normalize the description (lowercase + remove non-alphanumeric chars)
+    text_clean = re.sub(r"[^a-z0-9]+", "", description.lower())
+
+    # Split the search terms by semicolon
+    terms = description_term.split(";")
+
+    # Check each term
+    for term in terms:
+        # Normalize each term
+        term_clean = re.sub(r"[^a-z0-9]+", "", term.lower())
+
+        # If any normalized term is not found in the normalized description → return False
+        if term_clean not in text_clean:
+            return False
+
+    # All terms were found
+    return True
+
+
 @app.route("/api/search", methods=["POST"])
 def api_search():
     """API endpoint for searching listings - returns count and starts prefetch"""
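filter_listing_search_description is an AND-match over semicolon-separated terms: both sides are lowercased and stripped of everything outside [a-z0-9], so matching ignores case, spacing, and punctuation (and can therefore also match across word boundaries). A small illustration, reusing the function from the hunk above with invented inputs:

# Illustration only, calling filter_listing_search_description from above.
desc = "CUBE Nuroad Pro FE - Shimano GRX, 58 cm"

filter_listing_search_description("nuroad pro;grx", desc)  # True: "nuroadpro" and "grx" both occur
filter_listing_search_description("58 cm", desc)           # True: "58 cm" collapses to "58cm"
filter_listing_search_description("nuroad;di2", desc)      # False: "di2" never occurs, listing is dropped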
@@ -141,6 +162,7 @@ def api_search():
     num_listings = data.get("num_listings", 25)
     min_price = data.get("min_price", 0)
     max_price = data.get("max_price", 1000000000)
+    power_search_description = data.get("search_term_desc", "")
 
     if not search_term:
         return jsonify({"error": "Search term is required"}), 400
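On the API side, a client opts into the description filter by adding search_term_desc to the search request. A hypothetical call against a local dev server; the URL, port, and the exact search_term field name are assumptions inferred from the surrounding code, not shown in the diff:

# Hypothetical request; only "search_term_desc" is new in this commit.
import requests  # assumes the requests package is installed

payload = {
    "search_term": "gravel bike",        # field name assumed from `if not search_term` above
    "num_listings": 25,
    "min_price": 300,
    "max_price": 1500,
    "search_term_desc": "nuroad;58 cm",  # every semicolon-separated term must match the description
}
resp = requests.post("http://localhost:5000/api/search", json=payload)
print(resp.json())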
@@ -163,6 +185,7 @@ def api_search():
     # Store session with creation timestamp
     scrape_sessions[session_id] = {
         "urls": listing_urls,
+        "power_search_desc": power_search_description,
         "total": len(listing_urls),
         "scraped": 0,
         "listings": [],
@@ -207,6 +230,11 @@ def api_scrape(session_id):
 
     if len(session["listings"]) > 0:
         listing = session["listings"].pop(0)
+        if session["power_search_desc"]:
+            if not filter_listing_search_description(
+                session["power_search_desc"], listing["desc"]
+            ):
+                listing = None
     else:
         listing = None
 
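One caveat: the first hunk stores desc as None when a listing has no description paragraph, and description.lower() raises AttributeError on None, so the new branch in api_scrape can crash on such listings. A None-safe variant of the call (a sketch, not what the commit does) would treat a missing description as a non-match:

# Sketch only; the commit passes listing["desc"] directly.
if not filter_listing_search_description(
    session["power_search_desc"], listing["desc"] or ""
):
    listing = None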