split backend

commit e9ef75030f
parent 8a1f078435
2025-11-26 11:30:02 +01:00
3 changed files with 319 additions and 225 deletions

.gitignore

@@ -1,2 +1,3 @@
__pycache__/helper.cpython-313.pyc
backend/zip_cache.json
backend/__pycache__/kleinanzeigen_scrape.cpython-313.pyc

backend/kleinanzeigen_scrape.py

@@ -0,0 +1,305 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
from datetime import datetime
import urllib3
import random
import requests
import time
import json
import os

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class KleinanzeigenScraper:
    def __init__(self, config, metrics=None):
        """
        Initialize the scraper with configuration

        Args:
            config: Configuration dictionary
            metrics: Optional metrics dictionary to track statistics
        """
        self.config = config
        self.metrics = metrics
        self.zip_cache = {}
        self.cache_file = config["cache"]["zip_cache_file"]

        # Load cache
        self._load_cache()

    def _load_cache(self):
        """Load ZIP code cache from file"""
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "r", encoding="utf-8") as f:
                self.zip_cache = json.load(f)
            print(f"Loaded {len(self.zip_cache)} ZIP codes from cache")

    def _save_cache(self):
        """Save ZIP code cache to file"""
        with open(self.cache_file, "w", encoding="utf-8") as f:
            json.dump(self.zip_cache, f, ensure_ascii=False, indent=2)

    def get_random_user_agent(self):
        """Generate random user agent string"""
        return random.choice(self.config["user_agents"])

    def make_soup(self, url):
        """Fetch URL and return BeautifulSoup object"""
        user_agent = {"user-agent": self.get_random_user_agent()}
        http = urllib3.PoolManager(10, headers=user_agent)
        try:
            r = http.request("GET", url)

            # Track response code
            if self.metrics is not None:
                status_code = str(r.status)
                if self.config["apis"]["kleinanzeigen"]["base_url"] in url:
                    self.metrics["kleinanzeigen_response_codes"][status_code] = (
                        self.metrics["kleinanzeigen_response_codes"].get(status_code, 0)
                        + 1
                    )

            return BeautifulSoup(r.data, "lxml")
        except Exception as e:
            print(f"Error fetching {url}: {e}")
            if self.metrics is not None:
                if self.config["apis"]["kleinanzeigen"]["base_url"] in url:
                    self.metrics["kleinanzeigen_response_codes"]["error"] = (
                        self.metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
                    )
            raise

    def geocode_zip(self, zip_code):
        """Geocode ZIP code using Nominatim API with caching"""
        zip_code = str(zip_code)

        # Check cache first
        if zip_code in self.zip_cache:
            return self.zip_cache[zip_code]

        # Call Nominatim API
        url = self.config["apis"]["nominatim"]["url"]
        params = {
            "postalcode": zip_code,
            "country": "Germany",
            "format": "json",
            "limit": 1,
        }
        try:
            response = requests.get(
                url,
                params=params,
                headers={"user-agent": self.config["apis"]["nominatim"]["user_agent"]},
            )

            # Track response code
            if self.metrics is not None:
                status_code = str(response.status_code)
                self.metrics["nominatim_response_codes"][status_code] = (
                    self.metrics["nominatim_response_codes"].get(status_code, 0) + 1
                )

            data = response.json()
            if data:
                coords = {"lat": float(data[0]["lat"]), "lon": float(data[0]["lon"])}
                self.zip_cache[zip_code] = coords

                # Save cache
                self._save_cache()

                time.sleep(self.config["scraping"]["geocoding_delay"])
                return coords
        except Exception as e:
            print(f"Geocoding error for {zip_code}: {e}")
            if self.metrics is not None:
                self.metrics["nominatim_response_codes"]["error"] = (
                    self.metrics["nominatim_response_codes"].get("error", 0) + 1
                )

        return None

    def search_listings(self, search_term, max_pages, min_price, max_price):
        """Search for listings on kleinanzeigen.de - returns only URLs"""
        base_url = self.config["apis"]["kleinanzeigen"]["base_url"]
        found_listings = set()

        for page_counter in range(1, max_pages + 1):
            listing_url = (
                base_url
                + "/s-anbieter:privat/anzeige:angebote/preis:"
                + str(min_price)
                + ":"
                + str(max_price)
                + "/seite:"
                + str(page_counter)
                + "/"
                + search_term.replace(" ", "-")
                + "/k0"
            )
            print(f"Scraping page {page_counter}: {listing_url}")

            try:
                soup = self.make_soup(listing_url)
                results = soup.find_all("li", class_="ad-listitem fully-clickable-card")
                if len(results) <= 0:
                    break

                for result in results:
                    try:
                        listing_href = result.a["href"]
                        found_listings.add(base_url + listing_href)
                    except (AttributeError, KeyError):
                        pass
            except Exception as e:
                print(f"Error scraping page {page_counter}: {e}")
                break

        return list(found_listings)

    def scrape_listing(self, url):
        """Scrape individual listing details"""
        try:
            soup = self.make_soup(url)
            if self.metrics is not None:
                self.metrics["scrape_requests"] += 1

            title = soup.find("h1", class_="boxedarticle--title")
            if not title:
                return None
            title = title.text.strip()

            price_elem = soup.find("h2", class_="boxedarticle--price")
            price = 0
            if price_elem:
                price_text = price_elem.text.strip().split(" ")[0]
                try:
                    price = int(price_text.replace(".", "").replace(",", ""))
                except:
                    price = 0

            flexlist = soup.find("ul", class_="flexlist text-light-800")
            listing_id = 0
            if flexlist:
                flex_items = flexlist.find_all("li", recursive=False)
                if len(flex_items) > 1:
                    try:
                        listing_id = int(flex_items[1].text.strip())
                    except:
                        pass

            locality = soup.find("span", id="viewad-locality")
            zip_code = None
            address = ""
            if locality:
                locality_text = locality.text.strip()
                parts = locality_text.split(" ", 1)
                if parts:
                    zip_code = parts[0]
                if len(parts) > 1:
                    address = parts[1]

            date_added = None
            details_divs = soup.find_all("div", class_="boxedarticle--details--full")
            if len(details_divs) > 1:
                date_span = details_divs[1].find("span")
                if date_span:
                    try:
                        date_added = datetime.strptime(date_span.text, "%d.%m.%Y")
                    except:
                        pass

            first_image = None
            img_elem = soup.find("div", class_="galleryimage-element current")
            if img_elem:
                img = img_elem.find("img")
                if img and img.get("src"):
                    first_image = img["src"]

            if not zip_code:
                return None

            listing = {
                "title": title,
                "price": price,
                "id": listing_id,
                "zip_code": zip_code,
                "address": address,
                "date_added": date_added.isoformat() if date_added else None,
                "image": first_image,
                "url": url,
            }

            # Add coordinates
            coords = self.geocode_zip(zip_code)
            if coords and isinstance(coords, dict):
                listing["lat"] = coords.get("lat")
                listing["lon"] = coords.get("lon")

            return listing
        except Exception as e:
            print(f"Error scraping listing {url}: {e}")
            return None


def main():
    """Test the scraper functionality"""
    print("=== Kleinanzeigen Scraper Test ===\n")

    # Load config
    config_file = "config.json"
    if not os.path.exists(config_file):
        print(f"ERROR: {config_file} not found!")
        return

    with open(config_file, "r", encoding="utf-8") as f:
        config = json.load(f)

    # Initialize scraper
    scraper = KleinanzeigenScraper(config)

    # Test parameters
    search_term = "Fahrrad"
    max_pages = 1
    min_price = 100
    max_price = 500

    print(f"Searching for: {search_term}")
    print(f"Price range: {min_price}€ - {max_price}€")
    print(f"Max pages: {max_pages}\n")

    # Search for listings
    print("Step 1: Searching for listing URLs...")
    listing_urls = scraper.search_listings(search_term, max_pages, min_price, max_price)
    print(f"Found {len(listing_urls)} listings\n")

    if len(listing_urls) > 0:
        # Scrape first listing as test
        print("Step 2: Scraping first listing details...")
        test_url = listing_urls[0]
        print(f"URL: {test_url}")

        listing = scraper.scrape_listing(test_url)
        if listing:
            print("\nListing details:")
            print(f" Title: {listing['title']}")
            print(f" Price: €{listing['price']}")
            print(f" ID: {listing['id']}")
            print(f" Location: {listing['address']} ({listing['zip_code']})")
            print(f" Date: {listing['date_added']}")
            print(f" Coordinates: {listing.get('lat')}, {listing.get('lon')}")
            print(f" Image: {listing['image']}")
        else:
            print("Failed to scrape listing")

    print(f"\n=== Test completed ===")
    print(f"ZIP cache size: {len(scraper.zip_cache)}")


if __name__ == "__main__":
    main()
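The class above takes every setting from the config dict loaded out of config.json. As a reading aid, a minimal sketch of the structure it expects is shown below, written as the Python dict that json.load would produce; the key paths mirror the lookups in the code, while the concrete values and the names example_config and test_scraper are illustrative assumptions, not the repository's actual configuration. The server file further down additionally reads scraping.session_timeout, scraping.listings_per_page, scraping.max_workers, scraping.min_workers, scraping.rate_limit_delay and the server block, which are omitted here.

# Sketch only: the nested keys KleinanzeigenScraper reads; all values are made up.
from kleinanzeigen_scrape import KleinanzeigenScraper

example_config = {
    "cache": {"zip_cache_file": "zip_cache.json"},
    "scraping": {"geocoding_delay": 1.0},  # pause (seconds) after each Nominatim lookup
    "user_agents": ["Mozilla/5.0 (X11; Linux x86_64)"],
    "apis": {
        "nominatim": {
            "url": "https://nominatim.openstreetmap.org/search",
            "user_agent": "zip-geocoder-example/0.1",
        },
        "kleinanzeigen": {"base_url": "https://www.kleinanzeigen.de"},
    },
}

test_scraper = KleinanzeigenScraper(example_config)  # metrics dict is optional
print(test_scraper.geocode_zip("10115"))  # {'lat': ..., 'lon': ...} on success, None otherwise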

@@ -3,22 +3,18 @@
from flask import Flask, request, jsonify
from flask_cors import CORS
from bs4 import BeautifulSoup
from datetime import datetime
import urllib3
import random
import requests
import time
import json
import os
import uuid
import threading
import random
from kleinanzeigen_scrape import KleinanzeigenScraper
app = Flask(__name__)
CORS(app)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Load configuration
CONFIG_FILE = "config.json"
config = {}
@@ -31,23 +27,15 @@ else:
    exit(1)
# Configuration values
CACHE_FILE = config["cache"]["zip_cache_file"]
SESSION_TIMEOUT = config["scraping"]["session_timeout"]
LISTINGS_PER_PAGE = config["scraping"]["listings_per_page"]
MAX_WORKERS = config["scraping"]["max_workers"]
MIN_WORKERS = config["scraping"]["min_workers"]
RATE_LIMIT_DELAY = config["scraping"]["rate_limit_delay"]
GEOCODING_DELAY = config["scraping"]["geocoding_delay"]
USER_AGENTS = config["user_agents"]
NOMINATIM_URL = config["apis"]["nominatim"]["url"]
NOMINATIM_USER_AGENT = config["apis"]["nominatim"]["user_agent"]
KLEINANZEIGEN_BASE_URL = config["apis"]["kleinanzeigen"]["base_url"]
SERVER_HOST = config["server"]["host"]
SERVER_PORT = config["server"]["port"]
SERVER_DEBUG = config["server"]["debug"]
# Global state
zip_cache = {}
scrape_sessions = {}
app_start_time = time.time()
@@ -59,14 +47,18 @@ metrics = {
    "nominatim_response_codes": {},
}
# Initialize scraper
scraper = KleinanzeigenScraper(config, metrics)
def cleanup_old_sessions():
"""Remove sessions older than SESSION_TIMEOUT"""
session_timeout = config["scraping"]["session_timeout"]
current_time = time.time()
sessions_to_remove = []
for session_id, session in scrape_sessions.items():
if current_time - session.get("created_at", current_time) > SESSION_TIMEOUT:
if current_time - session.get("created_at", current_time) > session_timeout:
sessions_to_remove.append(session_id)
for session_id in sessions_to_remove:
@ -76,213 +68,13 @@ def cleanup_old_sessions():
return len(sessions_to_remove)
def get_random_user_agent():
    """Generate random user agent string"""
    return random.choice(USER_AGENTS)


def make_soup(url):
    """Fetch URL and return BeautifulSoup object"""
    user_agent = {"user-agent": get_random_user_agent()}
    http = urllib3.PoolManager(10, headers=user_agent)
    try:
        r = http.request("GET", url)

        # Track response code
        status_code = str(r.status)
        if KLEINANZEIGEN_BASE_URL in url:
            metrics["kleinanzeigen_response_codes"][status_code] = (
                metrics["kleinanzeigen_response_codes"].get(status_code, 0) + 1
            )

        return BeautifulSoup(r.data, "lxml")
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        if KLEINANZEIGEN_BASE_URL in url:
            metrics["kleinanzeigen_response_codes"]["error"] = (
                metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
            )
        raise


def geocode_zip(zip_code):
    """Geocode ZIP code using Nominatim API with caching"""
    zip_code = str(zip_code)

    # Check cache first
    if zip_code in zip_cache:
        return zip_cache[zip_code]

    # Call Nominatim API
    params = {
        "postalcode": zip_code,
        "country": "Germany",
        "format": "json",
        "limit": 1,
    }
    try:
        response = requests.get(
            NOMINATIM_URL, params=params, headers={"user-agent": NOMINATIM_USER_AGENT}
        )

        # Track response code
        status_code = str(response.status_code)
        metrics["nominatim_response_codes"][status_code] = (
            metrics["nominatim_response_codes"].get(status_code, 0) + 1
        )

        data = response.json()
        if data:
            coords = {"lat": float(data[0]["lat"]), "lon": float(data[0]["lon"])}
            zip_cache[zip_code] = coords

            # Save cache
            with open(CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(zip_cache, f, ensure_ascii=False, indent=2)

            time.sleep(GEOCODING_DELAY)
            return coords
    except Exception as e:
        print(f"Geocoding error for {zip_code}: {e}")
        metrics["nominatim_response_codes"]["error"] = (
            metrics["nominatim_response_codes"].get("error", 0) + 1
        )

    return None


def search_listings(search_term, max_pages, min_price, max_price):
    """Search for listings on kleinanzeigen.de - returns only URLs"""
    found_listings = set()

    for page_counter in range(1, max_pages + 1):
        listing_url = (
            KLEINANZEIGEN_BASE_URL
            + "/s-anbieter:privat/anzeige:angebote/preis:"
            + str(min_price)
            + ":"
            + str(max_price)
            + "/seite:"
            + str(page_counter)
            + "/"
            + search_term.replace(" ", "-")
            + "/k0"
        )
        print(f"Scraping page {page_counter}: {listing_url}")

        try:
            soup = make_soup(listing_url)
            results = soup.find_all("li", class_="ad-listitem fully-clickable-card")
            if len(results) <= 0:
                break

            for result in results:
                try:
                    listing_href = result.a["href"]
                    found_listings.add(KLEINANZEIGEN_BASE_URL + listing_href)
                except (AttributeError, KeyError):
                    pass
        except Exception as e:
            print(f"Error scraping page {page_counter}: {e}")
            break

    return list(found_listings)


def scrape_listing(url):
    """Scrape individual listing details"""
    try:
        soup = make_soup(url)
        metrics["scrape_requests"] += 1

        title = soup.find("h1", class_="boxedarticle--title")
        if not title:
            return None
        title = title.text.strip()

        price_elem = soup.find("h2", class_="boxedarticle--price")
        price = 0
        if price_elem:
            price_text = price_elem.text.strip().split(" ")[0]
            try:
                price = int(price_text.replace(".", "").replace(",", ""))
            except:
                price = 0

        flexlist = soup.find("ul", class_="flexlist text-light-800")
        listing_id = 0
        if flexlist:
            flex_items = flexlist.find_all("li", recursive=False)
            if len(flex_items) > 1:
                try:
                    listing_id = int(flex_items[1].text.strip())
                except:
                    pass

        locality = soup.find("span", id="viewad-locality")
        zip_code = None
        address = ""
        if locality:
            locality_text = locality.text.strip()
            parts = locality_text.split(" ", 1)
            if parts:
                zip_code = parts[0]
            if len(parts) > 1:
                address = parts[1]

        date_added = None
        details_divs = soup.find_all("div", class_="boxedarticle--details--full")
        if len(details_divs) > 1:
            date_span = details_divs[1].find("span")
            if date_span:
                try:
                    date_added = datetime.strptime(date_span.text, "%d.%m.%Y")
                except:
                    pass

        first_image = None
        img_elem = soup.find("div", class_="galleryimage-element current")
        if img_elem:
            img = img_elem.find("img")
            if img and img.get("src"):
                first_image = img["src"]

        if not zip_code:
            return None

        listing = {
            "title": title,
            "price": price,
            "id": listing_id,
            "zip_code": zip_code,
            "address": address,
            "date_added": date_added.isoformat() if date_added else None,
            "image": first_image,
            "url": url,
        }

        # Add coordinates
        coords = geocode_zip(zip_code)
        if coords and isinstance(coords, dict):
            listing["lat"] = coords.get("lat")
            listing["lon"] = coords.get("lon")

        return listing
    except Exception as e:
        print(f"Error scraping listing {url}: {e}")
        return None


def scrape_listing_wrapper(session_id, url, results, index):
    """Wrapper for scraping listing in thread"""
    session = scrape_sessions.get(session_id)
    if not session:
        return

    listing = scrape_listing(url)
    listing = scraper.scrape_listing(url)
    results[index] = listing
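After the split, scrape_listing_wrapper is the only scraping glue the server keeps: it checks that the session still exists and writes the scraper's result into a fixed slot, which lets an endpoint start one thread per URL and collect the results in input order. A minimal sketch of that fan-out pattern is shown below; the helper name dispatch_scrape and the batching by max_workers are assumptions made for the illustration, not the code used further down in this file.

def dispatch_scrape(session_id, urls, max_workers):
    # Sketch of the fan-out pattern implied by scrape_listing_wrapper:
    # run at most max_workers threads at a time and keep results positional.
    results = [None] * len(urls)
    for start in range(0, len(urls), max_workers):
        batch = []
        for index in range(start, min(start + max_workers, len(urls))):
            worker = threading.Thread(
                target=scrape_listing_wrapper,
                args=(session_id, urls[index], results, index),
            )
            worker.start()
            batch.append(worker)
        for worker in batch:
            worker.join()
    return [listing for listing in results if listing]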
@@ -358,7 +150,9 @@ def api_search():
    try:
        # Search for listing URLs only
        listing_urls = search_listings(search_term, max_pages, min_price, max_price)
        listing_urls = scraper.search_listings(
            search_term, max_pages, min_price, max_price
        )
        # Limit to requested number
        listing_urls = listing_urls[:num_listings]
@@ -514,11 +308,5 @@ def api_metrics():
if __name__ == "__main__":
    print("Starting Kleinanzeigen Scraper API Server...")
    # Load cache on startup
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            zip_cache = json.load(f)
        print(f"Loaded {len(zip_cache)} ZIP codes from cache")
    print(f"ZIP code cache loaded with {len(scraper.zip_cache)} entries")
    app.run(debug=SERVER_DEBUG, host=SERVER_HOST, port=SERVER_PORT, threaded=True)