Update app.py
app.py
CHANGED
@@ -12,14 +12,28 @@ import os
 api_key = os.environ.get('groq')
 read_key = os.environ.get('HF_TOKEN', None)
 
+# Use Llama 3 70B powered by Groq for answering
+def ask_llm(prompt):
+    try:
+        completion = client.chat.completions.create(
+            model="llama3-70b-8192",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": f"{prompt}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
+            ],
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        return f"Error in response generation: {str(e)}"
+
 def parse_links_and_content(ort):
+
     base_url = "https://vereine-in-deutschland.net"
-    all_links = []
-
-    initial_url = f"{base_url}/vereine/Bayern/{ort}"
+    all_links = []
+    all_links_text = []
+    initial_url = f"{base_url}/vereine/Bayern/{ort}"
 
     try:
-        # Send the request to the initial URL
         response = requests.get(initial_url)
         response.raise_for_status()  # Check whether the request was successful
 
@@ -32,15 +46,14 @@ def parse_links_and_content(ort):
         if link_element and 'href' in link_element.attrs:
             href = link_element['href']
             # Extract the last two characters of the URL
-            last_two_chars = href[-2:].strip()
-
+            last_two_chars = href[-2:].strip()
             # Convert the last two characters to an integer
             last_two_chars_int = int(last_two_chars)
         else:
             last_two_chars_int = 1  # If the last page is not found, assume there is only one page
 
         # Loop through all pages and collect links
-        for page_number in range(1,
+        for page_number in range(1, last_two_chars_int +1):
             page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
             response = requests.get(page_url)
             response.raise_for_status()
@@ -48,10 +61,11 @@ def parse_links_and_content(ort):
             target_div = soup.select_one('div.row-cols-1:nth-child(4)')
 
             if target_div:
-
+                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                 texts = [a.text for a in target_div.find_all('a', href=True)]
                 #print(texts)
-                all_links.extend(
+                all_links.extend(links)
+                all_links_text.extend(texts)
             else:
                 print(f"Target div not found on page {page_number}")
 
@@ -59,7 +73,8 @@ def parse_links_and_content(ort):
         return str(e), []
 
     all_links = all_links[0::2]
-
+    all_links_text = all_links_text[0::2]
+    return all_links_text
 
 def scrape_links(links):
     links=links
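The new ask_llm helper calls client.chat.completions.create, but none of the hunks above create `client`; presumably it is instantiated elsewhere in app.py from the `groq` environment variable that is read into `api_key`. A minimal sketch of that setup, assuming the official `groq` Python SDK (the import and client creation are not visible in this diff):

import os
from groq import Groq  # assumed SDK; its import is not shown in the hunks

api_key = os.environ.get('groq')
client = Groq(api_key=api_key)  # ask_llm() relies on a client created roughly like this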
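The pagination loop also depends on `requests`, a BeautifulSoup `soup` object, and (after this change) `urljoin`, none of which are defined in the visible hunks. The sketch below shows the parsing step assumed to sit between each `requests.get` call and the `soup.select_one` lookup; the town and page number are illustrative only:

# Assumed imports and parsing step; not part of the visible diff.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin  # needed by the newly added list comprehension

base_url = "https://vereine-in-deutschland.net"
page_url = f"{base_url}/vereine/Bayern/Regensburg/p/1"  # illustrative page URL
response = requests.get(page_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")  # the `soup` object used by select_one()
target_div = soup.select_one('div.row-cols-1:nth-child(4)')
if target_div:
    links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
    texts = [a.text for a in target_div.find_all('a', href=True)]
    print(links, texts)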