diff --git a/.gitignore b/.gitignore
index 31d198d..c1285fc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 __pycache__/helper.cpython-313.pyc
 backend/zip_cache.json
+backend/__pycache__/kleinanzeigen_scrape.cpython-313.pyc
diff --git a/backend/kleinanzeigen_scrape.py b/backend/kleinanzeigen_scrape.py
new file mode 100644
index 0000000..4b16c96
--- /dev/null
+++ b/backend/kleinanzeigen_scrape.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from bs4 import BeautifulSoup
+from datetime import datetime
+import urllib3
+import random
+import requests
+import time
+import json
+import os
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+class KleinanzeigenScraper:
+    def __init__(self, config, metrics=None):
+        """
+        Initialize the scraper with configuration
+
+        Args:
+            config: Configuration dictionary
+            metrics: Optional metrics dictionary to track statistics
+        """
+        self.config = config
+        self.metrics = metrics
+        self.zip_cache = {}
+        self.cache_file = config["cache"]["zip_cache_file"]
+
+        # Load cache
+        self._load_cache()
+
+    def _load_cache(self):
+        """Load ZIP code cache from file"""
+        if os.path.exists(self.cache_file):
+            with open(self.cache_file, "r", encoding="utf-8") as f:
+                self.zip_cache = json.load(f)
+            print(f"Loaded {len(self.zip_cache)} ZIP codes from cache")
+
+    def _save_cache(self):
+        """Save ZIP code cache to file"""
+        with open(self.cache_file, "w", encoding="utf-8") as f:
+            json.dump(self.zip_cache, f, ensure_ascii=False, indent=2)
+
+    def get_random_user_agent(self):
+        """Generate random user agent string"""
+        return random.choice(self.config["user_agents"])
+
+    def make_soup(self, url):
+        """Fetch URL and return BeautifulSoup object"""
+        user_agent = {"user-agent": self.get_random_user_agent()}
+        http = urllib3.PoolManager(10, headers=user_agent)
+        try:
+            r = http.request("GET", url)
+            # Track response code
+            if self.metrics is not None:
+                status_code = str(r.status)
+                if self.config["apis"]["kleinanzeigen"]["base_url"] in url:
+                    self.metrics["kleinanzeigen_response_codes"][status_code] = (
+                        self.metrics["kleinanzeigen_response_codes"].get(status_code, 0)
+                        + 1
+                    )
+            return BeautifulSoup(r.data, "lxml")
+        except Exception as e:
+            print(f"Error fetching {url}: {e}")
+            if self.metrics is not None:
+                if self.config["apis"]["kleinanzeigen"]["base_url"] in url:
+                    self.metrics["kleinanzeigen_response_codes"]["error"] = (
+                        self.metrics["kleinanzeigen_response_codes"].get("error", 0) + 1
+                    )
+            raise
+
+    def geocode_zip(self, zip_code):
+        """Geocode ZIP code using Nominatim API with caching"""
+        zip_code = str(zip_code)
+
+        # Check cache first
+        if zip_code in self.zip_cache:
+            return self.zip_cache[zip_code]
+
+        # Call Nominatim API
+        url = self.config["apis"]["nominatim"]["url"]
+        params = {
+            "postalcode": zip_code,
+            "country": "Germany",
+            "format": "json",
+            "limit": 1,
+        }
+
+        try:
+            response = requests.get(
+                url,
+                params=params,
+                headers={"user-agent": self.config["apis"]["nominatim"]["user_agent"]},
+            )
+
+            # Track response code
+            if self.metrics is not None:
+                status_code = str(response.status_code)
+                self.metrics["nominatim_response_codes"][status_code] = (
+                    self.metrics["nominatim_response_codes"].get(status_code, 0) + 1
+                )
+
+            data = response.json()
+
+            if data:
+                coords = {"lat": float(data[0]["lat"]), "lon": float(data[0]["lon"])}
+                self.zip_cache[zip_code] = coords
+
+                # Save cache
+                self._save_cache()
+
+                time.sleep(self.config["scraping"]["geocoding_delay"])
+                return coords
+        except Exception as e:
print(f"Geocoding error for {zip_code}: {e}") + if self.metrics is not None: + self.metrics["nominatim_response_codes"]["error"] = ( + self.metrics["nominatim_response_codes"].get("error", 0) + 1 + ) + + return None + + def search_listings(self, search_term, max_pages, min_price, max_price): + """Search for listings on kleinanzeigen.de - returns only URLs""" + base_url = self.config["apis"]["kleinanzeigen"]["base_url"] + found_listings = set() + + for page_counter in range(1, max_pages + 1): + listing_url = ( + base_url + + "/s-anbieter:privat/anzeige:angebote/preis:" + + str(min_price) + + ":" + + str(max_price) + + "/seite:" + + str(page_counter) + + "/" + + search_term.replace(" ", "-") + + "/k0" + ) + + print(f"Scraping page {page_counter}: {listing_url}") + + try: + soup = self.make_soup(listing_url) + results = soup.find_all("li", class_="ad-listitem fully-clickable-card") + + if len(results) <= 0: + break + + for result in results: + try: + listing_href = result.a["href"] + found_listings.add(base_url + listing_href) + except (AttributeError, KeyError): + pass + except Exception as e: + print(f"Error scraping page {page_counter}: {e}") + break + + return list(found_listings) + + def scrape_listing(self, url): + """Scrape individual listing details""" + try: + soup = self.make_soup(url) + if self.metrics is not None: + self.metrics["scrape_requests"] += 1 + + title = soup.find("h1", class_="boxedarticle--title") + if not title: + return None + title = title.text.strip() + + price_elem = soup.find("h2", class_="boxedarticle--price") + price = 0 + if price_elem: + price_text = price_elem.text.strip().split(" ")[0] + try: + price = int(price_text.replace(".", "").replace(",", "")) + except: + price = 0 + + flexlist = soup.find("ul", class_="flexlist text-light-800") + listing_id = 0 + if flexlist: + flex_items = flexlist.find_all("li", recursive=False) + if len(flex_items) > 1: + try: + listing_id = int(flex_items[1].text.strip()) + except: + pass + + locality = soup.find("span", id="viewad-locality") + zip_code = None + address = "" + if locality: + locality_text = locality.text.strip() + parts = locality_text.split(" ", 1) + if parts: + zip_code = parts[0] + if len(parts) > 1: + address = parts[1] + + date_added = None + details_divs = soup.find_all("div", class_="boxedarticle--details--full") + if len(details_divs) > 1: + date_span = details_divs[1].find("span") + if date_span: + try: + date_added = datetime.strptime(date_span.text, "%d.%m.%Y") + except: + pass + + first_image = None + img_elem = soup.find("div", class_="galleryimage-element current") + if img_elem: + img = img_elem.find("img") + if img and img.get("src"): + first_image = img["src"] + + if not zip_code: + return None + + listing = { + "title": title, + "price": price, + "id": listing_id, + "zip_code": zip_code, + "address": address, + "date_added": date_added.isoformat() if date_added else None, + "image": first_image, + "url": url, + } + + # Add coordinates + coords = self.geocode_zip(zip_code) + if coords and isinstance(coords, dict): + listing["lat"] = coords.get("lat") + listing["lon"] = coords.get("lon") + + return listing + + except Exception as e: + print(f"Error scraping listing {url}: {e}") + return None + + +def main(): + """Test the scraper functionality""" + print("=== Kleinanzeigen Scraper Test ===\n") + + # Load config + config_file = "config.json" + if not os.path.exists(config_file): + print(f"ERROR: {config_file} not found!") + return + + with open(config_file, "r", encoding="utf-8") as f: + config = 
+        config = json.load(f)
+
+    # Initialize scraper
+    scraper = KleinanzeigenScraper(config)
+
+    # Test parameters
+    search_term = "Fahrrad"
+    max_pages = 1
+    min_price = 100
+    max_price = 500
+
+    print(f"Searching for: {search_term}")
+    print(f"Price range: {min_price}€ - {max_price}€")
+    print(f"Max pages: {max_pages}\n")
+
+    # Search for listings
+    print("Step 1: Searching for listing URLs...")
+    listing_urls = scraper.search_listings(search_term, max_pages, min_price, max_price)
+    print(f"Found {len(listing_urls)} listings\n")
+
+    if len(listing_urls) > 0:
+        # Scrape first listing as test
+        print("Step 2: Scraping first listing details...")
+        test_url = listing_urls[0]
+        print(f"URL: {test_url}")
+
+        listing = scraper.scrape_listing(test_url)
+
+        if listing:
+            print("\nListing details:")
+            print(f"  Title: {listing['title']}")
+            print(f"  Price: €{listing['price']}")
+            print(f"  ID: {listing['id']}")
+            print(f"  Location: {listing['address']} ({listing['zip_code']})")
+            print(f"  Date: {listing['date_added']}")
+            print(f"  Coordinates: {listing.get('lat')}, {listing.get('lon')}")
+            print(f"  Image: {listing['image']}")
+        else:
+            print("Failed to scrape listing")
+
+    print(f"\n=== Test completed ===")
+    print(f"ZIP cache size: {len(scraper.zip_cache)}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/backend/scrape_proxy.py b/backend/scrape_proxy.py
index 5f14451..e1228ef 100644
--- a/backend/scrape_proxy.py
+++ b/backend/scrape_proxy.py
@@ -3,22 +3,18 @@
 from flask import Flask, request, jsonify
 from flask_cors import CORS
-from bs4 import BeautifulSoup
-from datetime import datetime
-import urllib3
-import random
-import requests
 import time
 import json
 import os
 import uuid
 import threading
+import random
+
+from kleinanzeigen_scrape import KleinanzeigenScraper
 
 app = Flask(__name__)
 CORS(app)
 
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-
 # Load configuration
 CONFIG_FILE = "config.json"
 config = {}
@@ -31,23 +27,15 @@ else:
     exit(1)
 
 # Configuration values
-CACHE_FILE = config["cache"]["zip_cache_file"]
-SESSION_TIMEOUT = config["scraping"]["session_timeout"]
 LISTINGS_PER_PAGE = config["scraping"]["listings_per_page"]
 MAX_WORKERS = config["scraping"]["max_workers"]
 MIN_WORKERS = config["scraping"]["min_workers"]
 RATE_LIMIT_DELAY = config["scraping"]["rate_limit_delay"]
-GEOCODING_DELAY = config["scraping"]["geocoding_delay"]
-USER_AGENTS = config["user_agents"]
-NOMINATIM_URL = config["apis"]["nominatim"]["url"]
-NOMINATIM_USER_AGENT = config["apis"]["nominatim"]["user_agent"]
-KLEINANZEIGEN_BASE_URL = config["apis"]["kleinanzeigen"]["base_url"]
 SERVER_HOST = config["server"]["host"]
 SERVER_PORT = config["server"]["port"]
 SERVER_DEBUG = config["server"]["debug"]
 
 # Global state
-zip_cache = {}
 scrape_sessions = {}
 app_start_time = time.time()
@@ -59,14 +47,18 @@ metrics = {
     "nominatim_response_codes": {},
 }
 
+# Initialize scraper
+scraper = KleinanzeigenScraper(config, metrics)
+
 
 def cleanup_old_sessions():
     """Remove sessions older than SESSION_TIMEOUT"""
+    session_timeout = config["scraping"]["session_timeout"]
     current_time = time.time()
     sessions_to_remove = []
 
     for session_id, session in scrape_sessions.items():
-        if current_time - session.get("created_at", current_time) > SESSION_TIMEOUT:
+        if current_time - session.get("created_at", current_time) > session_timeout:
             sessions_to_remove.append(session_id)
 
     for session_id in sessions_to_remove:
@@ -76,213 +68,13 @@ def cleanup_old_sessions():
     return len(sessions_to_remove)
 
 
-def get_random_user_agent():
agent string""" - return random.choice(USER_AGENTS) - - -def make_soup(url): - """Fetch URL and return BeautifulSoup object""" - user_agent = {"user-agent": get_random_user_agent()} - http = urllib3.PoolManager(10, headers=user_agent) - try: - r = http.request("GET", url) - # Track response code - status_code = str(r.status) - if KLEINANZEIGEN_BASE_URL in url: - metrics["kleinanzeigen_response_codes"][status_code] = ( - metrics["kleinanzeigen_response_codes"].get(status_code, 0) + 1 - ) - return BeautifulSoup(r.data, "lxml") - except Exception as e: - print(f"Error fetching {url}: {e}") - if KLEINANZEIGEN_BASE_URL in url: - metrics["kleinanzeigen_response_codes"]["error"] = ( - metrics["kleinanzeigen_response_codes"].get("error", 0) + 1 - ) - raise - - -def geocode_zip(zip_code): - """Geocode ZIP code using Nominatim API with caching""" - zip_code = str(zip_code) - - # Check cache first - if zip_code in zip_cache: - return zip_cache[zip_code] - - # Call Nominatim API - params = { - "postalcode": zip_code, - "country": "Germany", - "format": "json", - "limit": 1, - } - - try: - response = requests.get( - NOMINATIM_URL, params=params, headers={"user-agent": NOMINATIM_USER_AGENT} - ) - - # Track response code - status_code = str(response.status_code) - metrics["nominatim_response_codes"][status_code] = ( - metrics["nominatim_response_codes"].get(status_code, 0) + 1 - ) - - data = response.json() - - if data: - coords = {"lat": float(data[0]["lat"]), "lon": float(data[0]["lon"])} - zip_cache[zip_code] = coords - - # Save cache - with open(CACHE_FILE, "w", encoding="utf-8") as f: - json.dump(zip_cache, f, ensure_ascii=False, indent=2) - - time.sleep(GEOCODING_DELAY) - return coords - except Exception as e: - print(f"Geocoding error for {zip_code}: {e}") - metrics["nominatim_response_codes"]["error"] = ( - metrics["nominatim_response_codes"].get("error", 0) + 1 - ) - - return None - - -def search_listings(search_term, max_pages, min_price, max_price): - """Search for listings on kleinanzeigen.de - returns only URLs""" - found_listings = set() - - for page_counter in range(1, max_pages + 1): - listing_url = ( - KLEINANZEIGEN_BASE_URL - + "/s-anbieter:privat/anzeige:angebote/preis:" - + str(min_price) - + ":" - + str(max_price) - + "/seite:" - + str(page_counter) - + "/" - + search_term.replace(" ", "-") - + "/k0" - ) - - print(f"Scraping page {page_counter}: {listing_url}") - - try: - soup = make_soup(listing_url) - results = soup.find_all("li", class_="ad-listitem fully-clickable-card") - - if len(results) <= 0: - break - - for result in results: - try: - listing_href = result.a["href"] - found_listings.add(KLEINANZEIGEN_BASE_URL + listing_href) - except (AttributeError, KeyError): - pass - except Exception as e: - print(f"Error scraping page {page_counter}: {e}") - break - - return list(found_listings) - - -def scrape_listing(url): - """Scrape individual listing details""" - try: - soup = make_soup(url) - metrics["scrape_requests"] += 1 - - title = soup.find("h1", class_="boxedarticle--title") - if not title: - return None - title = title.text.strip() - - price_elem = soup.find("h2", class_="boxedarticle--price") - price = 0 - if price_elem: - price_text = price_elem.text.strip().split(" ")[0] - try: - price = int(price_text.replace(".", "").replace(",", "")) - except: - price = 0 - - flexlist = soup.find("ul", class_="flexlist text-light-800") - listing_id = 0 - if flexlist: - flex_items = flexlist.find_all("li", recursive=False) - if len(flex_items) > 1: - try: - listing_id = 
-                    listing_id = int(flex_items[1].text.strip())
-                except:
-                    pass
-
-        locality = soup.find("span", id="viewad-locality")
-        zip_code = None
-        address = ""
-        if locality:
-            locality_text = locality.text.strip()
-            parts = locality_text.split(" ", 1)
-            if parts:
-                zip_code = parts[0]
-            if len(parts) > 1:
-                address = parts[1]
-
-        date_added = None
-        details_divs = soup.find_all("div", class_="boxedarticle--details--full")
-        if len(details_divs) > 1:
-            date_span = details_divs[1].find("span")
-            if date_span:
-                try:
-                    date_added = datetime.strptime(date_span.text, "%d.%m.%Y")
-                except:
-                    pass
-
-        first_image = None
-        img_elem = soup.find("div", class_="galleryimage-element current")
-        if img_elem:
-            img = img_elem.find("img")
-            if img and img.get("src"):
-                first_image = img["src"]
-
-        if not zip_code:
-            return None
-
-        listing = {
-            "title": title,
-            "price": price,
-            "id": listing_id,
-            "zip_code": zip_code,
-            "address": address,
-            "date_added": date_added.isoformat() if date_added else None,
-            "image": first_image,
-            "url": url,
-        }
-
-        # Add coordinates
-        coords = geocode_zip(zip_code)
-        if coords and isinstance(coords, dict):
-            listing["lat"] = coords.get("lat")
-            listing["lon"] = coords.get("lon")
-
-        return listing
-
-    except Exception as e:
-        print(f"Error scraping listing {url}: {e}")
-        return None
-
-
 def scrape_listing_wrapper(session_id, url, results, index):
     """Wrapper for scraping listing in thread"""
     session = scrape_sessions.get(session_id)
     if not session:
         return
 
-    listing = scrape_listing(url)
+    listing = scraper.scrape_listing(url)
     results[index] = listing
 
@@ -358,7 +150,9 @@ def api_search():
     try:
         # Search for listing URLs only
-        listing_urls = search_listings(search_term, max_pages, min_price, max_price)
+        listing_urls = scraper.search_listings(
+            search_term, max_pages, min_price, max_price
+        )
 
         # Limit to requested number
         listing_urls = listing_urls[:num_listings]
 
@@ -514,11 +308,5 @@ def api_metrics():
 
 if __name__ == "__main__":
     print("Starting Kleinanzeigen Scraper API Server...")
-
-    # Load cache on startup
-    if os.path.exists(CACHE_FILE):
-        with open(CACHE_FILE, "r", encoding="utf-8") as f:
-            zip_cache = json.load(f)
-
-    print(f"Loaded {len(zip_cache)} ZIP codes from cache")
+    print(f"ZIP code cache loaded with {len(scraper.zip_cache)} entries")
     app.run(debug=SERVER_DEBUG, host=SERVER_HOST, port=SERVER_PORT, threaded=True)
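
Usage note: a minimal sketch of driving the extracted KleinanzeigenScraper standalone, mirroring the module's own main() and the wiring in scrape_proxy.py. It assumes config.json has the layout the module reads (cache.zip_cache_file, scraping.geocoding_delay, apis.nominatim, apis.kleinanzeigen.base_url, user_agents) and that the shared metrics dict carries the three counters the class actually updates.

    import json
    from kleinanzeigen_scrape import KleinanzeigenScraper

    with open("config.json", "r", encoding="utf-8") as f:
        config = json.load(f)

    # Same shape as the metrics dict in scrape_proxy.py; the scraper only
    # updates it when a metrics dict is passed in (metrics is not None).
    metrics = {
        "scrape_requests": 0,
        "kleinanzeigen_response_codes": {},
        "nominatim_response_codes": {},
    }

    scraper = KleinanzeigenScraper(config, metrics)
    urls = scraper.search_listings("Fahrrad", max_pages=1, min_price=100, max_price=500)

    listings = []
    for url in urls[:3]:
        listing = scraper.scrape_listing(url)  # returns None if the page cannot be parsed
        if listing:
            listings.append(listing)

    print(f"Scraped {len(listings)} listings")
    print(f"Nominatim responses: {metrics['nominatim_response_codes']}")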