Update app.py
app.py CHANGED
@@ -12,14 +12,28 @@ import os
 api_key = os.environ.get('groq')
 read_key = os.environ.get('HF_TOKEN', None)
 
+# Use Llama 3 70B powered by Groq for answering
+def ask_llm(prompt):
+    try:
+        completion = client.chat.completions.create(
+            model="llama3-70b-8192",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": f"{prompt}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
+            ],
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        return f"Error in response generation: {str(e)}"
+
 def parse_links_and_content(ort):
+
     base_url = "https://vereine-in-deutschland.net"
-    all_links = []
-
-    initial_url = f"{base_url}/vereine/Bayern/{ort}"
+    all_links = []
+    all_links_text = []
+    initial_url = f"{base_url}/vereine/Bayern/{ort}"
 
     try:
-        # Send the request to the initial URL
         response = requests.get(initial_url)
         response.raise_for_status()  # Check whether the request was successful
 
@@ -32,15 +46,14 @@ def parse_links_and_content(ort):
         if link_element and 'href' in link_element.attrs:
             href = link_element['href']
             # Extract the last two characters of the URL
-            last_two_chars = href[-2:].strip()
-
+            last_two_chars = href[-2:].strip()
             # Convert the last two characters to an integer
             last_two_chars_int = int(last_two_chars)
         else:
             last_two_chars_int = 1  # If the last page is not found, assume there is only one page
 
         # Loop through all pages and collect links
-        for page_number in range(1,
+        for page_number in range(1, last_two_chars_int +1):
             page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
             response = requests.get(page_url)
             response.raise_for_status()
@@ -48,10 +61,11 @@ def parse_links_and_content(ort):
             target_div = soup.select_one('div.row-cols-1:nth-child(4)')
 
             if target_div:
-
+                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                 texts = [a.text for a in target_div.find_all('a', href=True)]
                 #print(texts)
-                all_links.extend(
+                all_links.extend(links)
+                all_links_text.extend(texts)
             else:
                 print(f"Target div not found on page {page_number}")
 
@@ -59,7 +73,8 @@ def parse_links_and_content(ort):
         return str(e), []
 
     all_links = all_links[0::2]
-
+    all_links_text = all_links_text[0::2]
+    return all_links_text
 
 def scrape_links(links):
     links=links
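
Reviewer note: the new ask_llm helper calls a module-level client that is not defined in any of the hunks shown above. Below is a minimal sketch of how such a client is typically constructed with the Groq Python SDK, reusing the 'groq' environment variable the file already reads; the client construction and the example prompt are assumptions for illustration, not part of this change.

# Sketch only: assumed module-level Groq client that ask_llm() would rely on.
# This construction does not appear in the hunks of this commit.
import os
from groq import Groq

api_key = os.environ.get('groq')   # same env var the diff already reads
client = Groq(api_key=api_key)     # object used as client.chat.completions.create(...)

# Mirrors the call inside the new ask_llm() helper:
completion = client.chat.completions.create(
    model="llama3-70b-8192",
    messages=[{"role": "user", "content": "Sag kurz hallo."}],
)
print(completion.choices[0].message.content)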
|