import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import json
import csv
import pandas as pd
from io import StringIO
import google.generativeai as genai
# Read API keys from environment variables
genai.configure(api_key=os.environ["geminiapikey"])  # raises KeyError if the secret is missing
api_key = os.environ.get('GROQ_API_KEY')
read_key = os.environ.get('HF_TOKEN', None)

# Initialize the Groq client only if an API key is set
if api_key:
    from groq import Client as GroqClient
    client = GroqClient(api_key=api_key)
else:
    client = None
# Use Llama 3 70B powered by Groq for answering
def ask_llm(ort):
    if not client:
        return "Groq API key not set."
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"
def parse_links_and_content(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"
    try:
        response = requests.get(initial_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page from the pagination element;
        # fall back to 10 pages if it cannot be found
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        last_page = 10
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            last_page = int(href.split('/')[-1])

        # Loop through all pages and collect links
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div:
                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links.extend(links)
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")
    except Exception as e:
        return str(e), []

    # The listing apparently contains each entry twice,
    # so keep every second item to deduplicate
    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]
    return all_links_text, all_links
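
# Illustrative return shape (hypothetical values):
# parse_links_and_content("Bad Kissingen") ->
#   (["SV Eltingshausen", ...],
#    ["https://vereine-in-deutschland.net/vereine/Bayern/.../sv-eltingshausen", ...])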
def extract_vereinsname(url):
    # The Verein name is the last URL segment, with dashes turned into spaces
    parts = url.split('/')
    vereinsname = parts[-1]
    vereinsname = vereinsname.replace("-", " ")
    return vereinsname
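
# Example: extract_vereinsname("https://vereine-in-deutschland.net/.../sv-eltingshausen")
# returns "sv eltingshausen".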
def scrape_links(links):
    details = []
    for link in links:
        try:
            response = requests.get(link)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_nav = soup.select_one('.nav')
            parts = link.split('/')

            # Log the URL and its parts for debugging
            print(f"Processing URL: {link}")
            print(f"URL parts: {parts}")

            # Extract the name of the Verein from the URL; fall back to the
            # second-to-last part if the URL ends with a trailing slash
            vereinsname = parts[-1] if parts[-1] else parts[-2]

            if target_nav:
                # Normalize the scraped contact block before storing it
                # (the .text access must happen inside this check, otherwise a
                # missing nav element raises an AttributeError)
                texte = target_nav.text.strip()
                texte = texte.replace("Amtsgericht: Schweinfurt", "")
                texte = texte.replace("Adresse folgt", "")
                texte = texte.replace("Adresse", "Adresse:")
                texte = texte.replace("Kontakt", "Email:")
                texte = texte.replace("Noch keine Daten vorhanden", "")
                details.append(f"Verein: {vereinsname} {texte}")
            else:
                details.append(f"Verein: {vereinsname} - No contact information found")
        except Exception as e:
            details.append(f"Error: {str(e)}")
    return details
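
# Illustrative entry (hypothetical data):
# "Verein: sv-eltingshausen Adresse: Musterweg 1 ... Email: vorstand@sv-eltingshausen.de"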
def save_to_csv(data, filename):
    # Expects a list of dicts; the keys of the first row become the CSV header
    keys = data[0].keys() if data else []
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)
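
# Illustrative usage (hypothetical keys and values):
# save_to_csv([{"verein": "sv eltingshausen", "email": "vorstand@sv-eltingshausen.de"}], "verein.csv")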
# Clear the output textbox (one value for the single connected output)
def clear():
    return ""

def load_data():
    # Note: `df` is only created inside process_ort; return an empty
    # DataFrame if it does not exist yet instead of raising a NameError
    return globals().get('df', pd.DataFrame())
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("[Download](https://specialist-it.de/verein.csv)")

    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
    with gr.Row():
        #details_output = gr.DataFrame(label="Ausgabe")
        details_output = gr.Textbox(label="Ausgabe", value="\n\n\n\n")
    def process_ort(ort):
        links_text, links = parse_links_and_content(ort)
        #return links_text
        kontakt = []

        # Configure the Gemini model
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }
        model = genai.GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            generation_config=generation_config,
        )
        chat_session = model.start_chat(history=[])

        # Ask Gemini for the contact details of each Verein
        for verein in links_text:
            response = chat_session.send_message(f"Kontaktdaten für {verein}")
            kontakt.append(response.text)
        return kontakt
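
        # Illustrative result (hypothetical values): kontakt is a list of
        # free-text answers, e.g. ["SV Eltingshausen: Musterweg 1, ...", ...]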
contact_details = scrape_links(links)
from gradio_client import Client
qwen_client = Client("Qwen/Qwen2.5-72B-Instruct")
result = qwen_client.predict(
query=f"return a valid json objects with contact details foreach verein. return the generated json only \n {contact_details}",
history=[],
system="you are a expert for json data and your job is to extract information from text and return a valid json object only. no text no explanations. reverse all email adresses. example: reverse ed.nesuahsgnitle-vs@dnatsrov to [email protected]. replace all dashes from vereinsname with a whitespace",
api_name="/model_chat"
)
json_data = result[1][0][1]
json_data = json_data.replace("```json", "")
json_data = json_data.replace("```", "")
# Convert JSON string to Python dictionary
data_dict = json.loads(json_data)
# Convert dictionary to DataFrame
df = pd.DataFrame(data_dict)
# DataFrame in eine CSV-Datei konvertieren
#df.to_csv('daten.csv', index=False)
# DataFrame in eine CSV-Variable konvertieren
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)
csv_data = csv_buffer.getvalue()
print(csv_data)
#return csv_data
return df
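
    # Illustrative (hypothetical) DataFrame from the Qwen path:
    #   vereinsname         adresse          email
    #   sv eltingshausen    Musterweg 1 ...  vorstand@sv-eltingshausen.de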
    with gr.Row():
        clearbutton = gr.Button("Clear")
        button = gr.Button("Senden")

    # Connect the buttons to their handlers
    button.click(fn=process_ort, inputs=ort_input, outputs=details_output)
    clearbutton.click(fn=clear, inputs=[], outputs=details_output)

# Launch the Gradio application
demo.launch()