import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import json
import csv
import pandas as pd
# Load environment variables
api_key = os.environ.get('GROQ_API_KEY')
read_key = os.environ.get('HF_TOKEN', None)
# Initialize Groq client
if api_key:
    from groq import Client as GroqClient
    client = GroqClient(api_key=api_key)
else:
    client = None

# Use Llama 3 70B powered by Groq for answering
def ask_llm(ort):
    if not client:
        return "Groq API key not set."
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"

def parse_links_and_content(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"
    try:
        response = requests.get(initial_url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page from the pagination widget; fall back to 10
        # if the expected element is missing.
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        last_page = 10
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            last_page = int(href.split('/')[-1])

        # Loop through all pages and collect links
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div:
                anchors = target_div.find_all('a', href=True)
                all_links.extend(urljoin(base_url, a['href']) for a in anchors)
                all_links_text.extend(a.text for a in anchors)
            else:
                print(f"Target div not found on page {page_number}")
    except Exception as e:
        return str(e), []

    # Each entry is collected twice (duplicate anchors per Verein), so keep
    # every other item to deduplicate.
    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]
    return all_links_text, all_links
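
# A minimal usage sketch for the scraper, assuming "Würzburg" is a valid
# Ort on vereine-in-deutschland.net (illustrative value):
#
#     namen, links = parse_links_and_content("Würzburg")
#     for name, link in zip(namen, links):
#         print(name, "->", link)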

def extract_vereinsname(url):
    # Helper: the Vereinsname is the last path segment of the detail URL.
    parts = url.split('/')
    vereinsname = parts[-1]
    return vereinsname

def scrape_links(links):
    details = []
    for link in links:
        try:
            response = requests.get(link, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_nav = soup.select_one('.nav')
            parts = link.split('/')

            # Log the URL and its parts for debugging
            print(f"Processing URL: {link}")
            print(f"URL parts: {parts}")

            # Extract the name of the Verein from the URL; fall back to the
            # second-to-last part if the last part is empty (trailing slash).
            vereinsname = parts[-1] if parts[-1] else parts[-2]

            if target_nav:
                # Clean up the scraped contact block before appending it.
                texte = target_nav.text.strip()
                texte = texte.replace("Amtsgericht: Schweinfurt", "")
                texte = texte.replace("Adresse folgt", "")
                texte = texte.replace("Adresse", "Adresse:")
                texte = texte.replace("Kontakt", "Email:")
                texte = texte.replace("Noch keine Daten vorhanden", "")
                details.append(f"Verein: {vereinsname} {texte}")
            else:
                details.append(f"Verein: {vereinsname} - No contact information found")
        except Exception as e:
            details.append(f"Error: {str(e)}")
    return details
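
# scrape_links returns a list of plain strings, one per Verein, roughly of
# the form "Verein: <name> Adresse: ... Email: ..." (the exact shape depends
# on the scraped page). A quick sanity check might look like:
#
#     details = scrape_links(links[:3])  # only the first three detail pages
#     print("\n".join(details))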

def save_to_csv(data, filename):
    # Expects a list of dicts; the dict keys become the CSV header row.
    keys = data[0].keys() if data else []
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)
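
# An illustrative call to save_to_csv (the field names here are hypothetical,
# chosen only to show the expected list-of-dicts shape):
#
#     save_to_csv([{"name": "TSV Beispiel", "email": "info@example.org"}],
#                 './contact_details.csv')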

# Clear output
def clear():
    return "", ""

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("[![Download](https://specialist-it.de/downloadbut.png)](https://specialist-it.de/verein.csv)")

    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
    with gr.Row():
        # details_output = gr.JSON(label="Kontaktinformation")
        details_output = gr.Dataframe(label="Vereinsliste")

    def process_ort(ort):
        links_text, links = parse_links_and_content(ort)
        contact_details = scrape_links(links)
        # Note: scrape_links returns plain "Verein: ..." strings, so this
        # filter matches nothing and the CSV stays empty unless the details
        # are emitted as JSON objects.
        json_data = [json.loads(detail) for detail in contact_details if detail.startswith("{")]
        save_to_csv(json_data, './contact_details.csv')
        return pd.DataFrame(contact_details)

    with gr.Row():
        clearbutton = gr.Button("Clear")
        button = gr.Button("Senden")

    # Connect the buttons to their functions; clear() returns two values,
    # one for each output component.
    button.click(fn=process_ort, inputs=ort_input, outputs=details_output)
    clearbutton.click(fn=clear, inputs=[], outputs=[ort_input, details_output])

# Launch the Gradio application
demo.launch()