import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from gradio_client import Client
import json
import csv
import pandas
#import groq
import os

#api_key = os.getenv('groq')
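# App overview: scrape the club ("Verein") listings for a Bavarian town from
# vereine-in-deutschland.net, fetch contact details for each club via the
# mgokg/PerplexicaApi Space, and show the results in a Gradio UI.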
def parse_links_and_content(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    # Construct the full URL for the town's club listing
    initial_url = f"{base_url}/vereine/Bayern/{ort}/"

    try:
        # Request the first listing page
        response = requests.get(initial_url)
        response.raise_for_status()  # Raise if the request failed

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page number from the pagination bar
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            # The page number is the last path segment of the URL; this is
            # more robust than taking the last two characters, which breaks
            # on single-digit page counts
            last_page = int(href.rstrip('/').split('/')[-1])
        else:
            last_page = 1  # If no pagination is found, assume a single page

        # Loop over all pages and collect the club names
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')

            if target_div:
                #links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")

    except Exception as e:
        print(f"Error while scraping {initial_url}: {e}")
        return []  # Always return a list so callers can iterate safely

    # Each club appears twice in the listing; keep every other entry
    all_links = all_links[0::2]
    return all_links
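# Example call (hypothetical output, for illustration only):
#   parse_links_and_content("Augsburg")
#   -> ["TSV Beispielstadt e.V.", "Musikverein Beispiel e.V.", ...]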
def scrape_links(links):
    contact_details = []
    client = Client("mgokg/PerplexicaApi")
    for verein in links:
        # Ask the Perplexica Space for the club's contact details
        result = client.predict(
            prompt=f"{verein}",
            api_name="/parse_links"
        )
        contact_details.append(result)
    return contact_details
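# Each entry returned by the Space is expected to be a JSON string;
# process_ort below parses the entries with json.loads.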
# Write the JSON records to a CSV file
def save_to_csv(data, filename):
    keys = data[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)
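# Note: save_to_csv is currently unused; the call in process_ort below is
# commented out.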
# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# ")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
        links_output = gr.JSON(label="Vereinsliste")
        #links_output = gr.DataFrame(label="Ergebnisse")
        #json_output = gr.JSON(label="Ergebnisse")

    def process_ort(ort):
        # Collect the club names for the town, then fetch their contact details
        links = parse_links_and_content(ort)
        contact = scrape_links(links)
        # Each result is a JSON string; parse it for the JSON output component
        json_data = [json.loads(item) for item in contact]
        #save_to_csv(json_data, './contact_details.csv')
        #return f"[Download CSV](contact_details.csv)", json_data
        return json_data

    # Button that starts the scraping
    button = gr.Button("senden")

    # Wire the button to the handler
    button.click(fn=process_ort, inputs=ort_input, outputs=links_output)

# Start the Gradio app
demo.launch()