2022-05-31 14:41:36 +02:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Author: Hendrik Schutter, mail@hendrikschutter.com
Date of creation: 2022/05/31
Date of last modification: 2024/08/18
"""
from bs4 import BeautifulSoup
import datetime
from tinydb import TinyDB , Query
import urllib3
import sys
import helper
# Suppress urllib3's InsecureRequestWarning for the whole process.
# NOTE(review): nothing visible in this section disables certificate
# verification, so this may no longer be needed -- confirm before removing.
urllib3 . disable_warnings ( urllib3 . exceptions . InsecureRequestWarning )
2024-08-18 22:49:48 +02:00
2022-05-31 14:41:36 +02:00
def make_soup(url, timeout=30.0):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    The request is sent with a ``user-agent`` header whose value is whatever
    ``helper.get_random_user_agent()`` returns.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the connection and for each read before
            urllib3 gives up. Accepts a float or a ``urllib3.Timeout``.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``lxml`` parser.

    Raises:
        urllib3.exceptions.HTTPError: If the request fails (connection
            error, timeout, retries exhausted).
    """
    headers = {"user-agent": helper.get_random_user_agent()}
    # The pool is only needed for this single request; the context manager
    # releases its connections instead of leaking one pool per call.
    with urllib3.PoolManager(10, headers=headers) as http:
        # Read the body while the pool is still open.
        body = http.request("GET", url, timeout=timeout).data
    return BeautifulSoup(body, "lxml")
2022-05-31 14:41:36 +02:00
def _parse_price(price_text):
    """Convert a displayed price such as ``"EUR 1.234,56"`` into a float."""
    number = price_text.replace("EUR", "").strip()
    if "," in number:
        # German notation: "." groups thousands and "," marks the decimals.
        # Drop the grouping dots first, otherwise "1.234,56" would become
        # "1.234.56" and float() would reject it.
        number = number.replace(".", "")
    return float(number.replace(",", "."))


def scrape_listing(url):
    """Fetch an item page and extract its core listing data.

    Args:
        url: Address of the item page; it is fetched via ``make_soup``.

    Returns:
        A dict with the keys:
            ``title``     -- text of the item title element,
            ``directbuy`` -- True when the call-to-action text is exactly
                             "Sofort-Kaufen",
            ``price``     -- primary price as a float,
            ``category``  -- list of the breadcrumb span texts,
            ``image``     -- ``src`` of the first eagerly loaded image,
            ``url``       -- the *url* argument, unchanged.

    Raises:
        AttributeError: If the breadcrumb, title, call-to-action or price
            element is not present in the page (``find`` returned None).
        TypeError: If no eagerly loaded ``img`` element is present.
        KeyError: If that ``img`` element has no ``src`` attribute.
        ValueError: If the price text cannot be converted to a number.
    """
    soup = make_soup(url)

    breadcrumbs = soup.find("nav", class_="breadcrumbs breadcrumb--overflow")
    category = [span.text for span in breadcrumbs.find_all("span")]

    title = soup.find("div", class_="vim x-item-title").span.text
    call_to_action = soup.find("span", class_="ux-call-to-action__text").text
    price_text = (
        soup.find("div", class_="x-price-primary")
        .find("span", class_="ux-textspans")
        .text
    )

    return {
        "title": title,
        "directbuy": call_to_action == "Sofort-Kaufen",
        "price": _parse_price(price_text),
        "category": category,
        "image": soup.find("img", loading="eager")["src"],
        "url": url,
    }
2024-08-18 22:49:48 +02:00
if __name__ == "__main__":
    # Manual smoke test: scrape one listing and print the resulting dict.
    # An optional first command-line argument overrides the sample listing so
    # other items can be tried without editing this file.
    listing_url = (
        sys.argv[1] if len(sys.argv) > 1 else "https://www.ebay.de/itm/226288543773"
    )
    listing = scrape_listing(listing_url)
    print(listing)