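# Gradio app: looks up clubs ("Vereine") in a Bavarian town on
# vereine-in-deutschland.net and scrapes their contact details
# via the mgokg/PerplexicaApi Space.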
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from gradio_client import Client
import json
import csv
import groq
import os
# API keys are read from environment variables (Hugging Face Spaces secrets)
api_key = os.environ.get('groq')
read_key = os.environ.get('HF_TOKEN', None)

# Groq client used by ask_llm for chat completions
client = groq.Groq(api_key=api_key)

# Use Llama 3 70B powered by Groq for answering
def ask_llm(prompt):
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{prompt}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"
def parse_links_and_content(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"
    try:
        response = requests.get(initial_url)
        response.raise_for_status()  # check that the request succeeded

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page of the paginated listing
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            # The page count is encoded in the last two characters of the URL
            last_page = int(href[-2:].strip())
        else:
            last_page = 1  # if the last page cannot be found, assume a single page

        # Loop over all pages and collect links
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div:
                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links.extend(links)
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")
    except Exception as e:
        print(f"Error while collecting club links: {e}")
        return []

    # Keep every other entry; the listing appears to contain each link twice
    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]
    return all_links_text
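
# Usage sketch ("Augsburg" is only an example input):
#   club_names = parse_links_and_content("Augsburg")
# returns the club names listed for that town, e.g. ["FC Beispiel e.V.", ...].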
def scrape_links(links):
    contact_details = []
    # Query the PerplexicaApi Space once per club name
    perplexica_client = Client("mgokg/PerplexicaApi")
    for verein in links:
        result = perplexica_client.predict(
            prompt=f"{verein}",
            api_name="/parse_links"
        )
        contact_details.append(result)
    return contact_details
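
# Each result from the PerplexicaApi Space is expected to be a JSON string;
# process_ort below parses these with json.loads into dictionaries.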
# Save the JSON records to a CSV file
def save_to_csv(data, filename):
    keys = data[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)
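
# Usage sketch (this export is currently not enabled in the UI):
#   save_to_csv(json_data, './contact_details.csv')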
# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# ")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
        links_output = gr.JSON(label="Vereinsliste")

    def process_ort(ort):
        # Collect the club names for the given town, then scrape their contact details
        links = parse_links_and_content(ort)
        contact = scrape_links(links)
        json_data = [json.loads(item) for item in contact]
        return json_data

    # Button that starts the scraping pipeline
    button = gr.Button("senden")

    # Wire the button to the processing function
    button.click(fn=process_ort, inputs=ort_input, outputs=links_output)

# Launch the Gradio app
demo.launch()