filter description
@@ -222,6 +222,11 @@ class KleinanzeigenScraper:
         if not zip_code:
             return None
 
+        desc = None
+        desc_elem = soup.find("p", id="viewad-description-text")
+        if desc_elem:
+            desc = desc_elem.text.strip()
+
         listing = {
             "title": title,
             "price": price,
@@ -231,6 +236,7 @@ class KleinanzeigenScraper:
             "date_added": date_added.isoformat() if date_added else None,
             "image": first_image,
             "url": url,
+            "desc": desc,
         }
 
         # Add coordinates
@@ -275,6 +281,7 @@ def main():
     # Search for listings
     print("Step 1: Searching for listing URLs...")
     listing_urls = scraper.search_listings(search_term, max_pages, min_price, max_price)
+    #listing_urls = ["https://www.kleinanzeigen.de/s-anzeige/cube-nuroad-pro-fe-2023-58-cm-l-/3226095826-217-23527"]
     print(f"Found {len(listing_urls)} listings\n")
 
     if len(listing_urls) > 0:
@@ -294,6 +301,7 @@ def main():
             print(f" Date: {listing['date_added']}")
             print(f" Coordinates: {listing.get('lat')}, {listing.get('lon')}")
             print(f" Image: {listing['image']}")
+            print(f" Description: {listing['desc']}")
         else:
             print("Failed to scrape listing")
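As a quick standalone illustration (not part of the commit), the lookup added in the first hunk can be exercised directly with BeautifulSoup; the HTML string below is an invented stand-in for a real listing page:

# Standalone sketch of the description lookup added above.
# The HTML is a made-up stand-in for a real Kleinanzeigen listing page.
from bs4 import BeautifulSoup

html = '<p id="viewad-description-text">  Cube Nuroad Pro FE 2023, 58 cm, barely ridden.  </p>'
soup = BeautifulSoup(html, "html.parser")

desc = None
desc_elem = soup.find("p", id="viewad-description-text")
if desc_elem:
    desc = desc_elem.text.strip()

print(desc)  # Cube Nuroad Pro FE 2023, 58 cm, barely ridden.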
@@ -9,6 +9,7 @@ import os
 import uuid
 import threading
 import random
+import re
 
 from kleinanzeigen_scrape import KleinanzeigenScraper
 
@@ -128,6 +129,26 @@ def prefetch_listings_thread(session_id):
     print(f"Prefetch complete for session {session_id}")
 
 
+def filter_listing_search_description(description_term, description):
+    # Normalize the description (lowercase + remove non-alphanumeric chars)
+    text_clean = re.sub(r"[^a-z0-9]+", "", description.lower())
+
+    # Split the search terms by semicolon
+    terms = description_term.split(";")
+
+    # Check each term
+    for term in terms:
+        # Normalize each term
+        term_clean = re.sub(r"[^a-z0-9]+", "", term.lower())
+
+        # If any normalized term is not found in the normalized description → return False
+        if term_clean not in text_clean:
+            return False
+
+    # All terms were found
+    return True
+
+
 @app.route("/api/search", methods=["POST"])
 def api_search():
     """API endpoint for searching listings - returns count and starts prefetch"""
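filter_listing_search_description is an AND-match over semicolon-separated terms: both sides are lowercased and stripped of everything outside [a-z0-9], so matching ignores case, spacing, and punctuation (and can therefore also match across word boundaries). A small illustration, reusing the function from the hunk above with invented inputs:

# Illustration only, calling filter_listing_search_description from above.
desc = "CUBE Nuroad Pro FE - Shimano GRX, 58 cm"

filter_listing_search_description("nuroad pro;grx", desc)  # True: "nuroadpro" and "grx" both occur
filter_listing_search_description("58 cm", desc)           # True: "58 cm" collapses to "58cm"
filter_listing_search_description("nuroad;di2", desc)      # False: "di2" never occurs, listing is dropped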
@@ -141,6 +162,7 @@ def api_search():
     num_listings = data.get("num_listings", 25)
     min_price = data.get("min_price", 0)
     max_price = data.get("max_price", 1000000000)
+    power_search_description = data.get("search_term_desc", "")
 
     if not search_term:
         return jsonify({"error": "Search term is required"}), 400
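On the API side, a client opts into the description filter by adding search_term_desc to the search request. A hypothetical call against a local dev server; the URL, port, and the exact search_term field name are assumptions inferred from the surrounding code, not shown in the diff:

# Hypothetical request; only "search_term_desc" is new in this commit.
import requests  # assumes the requests package is installed

payload = {
    "search_term": "gravel bike",        # field name assumed from `if not search_term` above
    "num_listings": 25,
    "min_price": 300,
    "max_price": 1500,
    "search_term_desc": "nuroad;58 cm",  # every semicolon-separated term must match the description
}
resp = requests.post("http://localhost:5000/api/search", json=payload)
print(resp.json())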
@@ -163,6 +185,7 @@ def api_search():
     # Store session with creation timestamp
     scrape_sessions[session_id] = {
         "urls": listing_urls,
+        "power_search_desc": power_search_description,
         "total": len(listing_urls),
         "scraped": 0,
         "listings": [],
@@ -207,6 +230,11 @@ def api_scrape(session_id):
 
     if len(session["listings"]) > 0:
         listing = session["listings"].pop(0)
+        if session["power_search_desc"]:
+            if not filter_listing_search_description(
+                session["power_search_desc"], listing["desc"]
+            ):
+                listing = None
     else:
         listing = None
 
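One caveat: the first hunk stores desc as None when a listing has no description paragraph, and description.lower() raises AttributeError on None, so the new branch in api_scrape can crash on such listings. A None-safe variant of the call (a sketch, not what the commit does) would treat a missing description as a non-match:

# Sketch only; the commit passes listing["desc"] directly.
if not filter_listing_search_description(
    session["power_search_desc"], listing["desc"] or ""
):
    listing = None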