import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import json
import csv
#import pandas as pd
# Load environment variables
api_key = os.environ.get('GROQ_API_KEY')
read_key = os.environ.get('HF_TOKEN', None)

# Initialize Groq client
if api_key:
    from groq import Client as GroqClient
    client = GroqClient(api_key=api_key)
else:
    client = None
# Use Llama 3 70B powered by Groq for answering
def ask_llm(ort):
    if not client:
        return "Groq API key not set."
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"
def parse_links_and_content(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"
    try:
        response = requests.get(initial_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page (fall back to 10 if the pagination link is missing)
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        last_page = 10
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            last_page = int(href.split('/')[-1])

        # Loop through all pages and collect links
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div:
                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links.extend(links)
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")
    except Exception as e:
        return str(e), []

    # Keep only every second link/text pair to drop duplicate entries
    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]
    return all_links_text, all_links
def extract_vereinsname(url):
    parts = url.split('/')
    vereinsname = parts[-1]
    vereinsname = vereinsname.replace("-", " ")
    return vereinsname
def scrape_links(links):
    details = []
    for link in links:
        try:
            response = requests.get(link)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_nav = soup.select_one('.nav')
            parts = link.split('/')

            # Log the URL and its parts for debugging
            print(f"Processing URL: {link}")
            print(f"URL parts: {parts}")

            # Extract the name of the Verein from the URL
            vereinsname = parts[-1] if parts[-1] else parts[-2]  # Fallback to the second-to-last part if the last part is empty

            if target_nav:
                # Clean up the scraped contact block before appending it
                texte = target_nav.text.strip()
                texte = texte.replace("Amtsgericht: Schweinfurt", "")
                texte = texte.replace("Adresse folgt", "")
                texte = texte.replace("Adresse", "Adresse:")
                texte = texte.replace("Kontakt", "Email:")
                texte = texte.replace("Noch keine Daten vorhanden", "")
                details.append(f"Verein: {vereinsname} {texte}")
            else:
                details.append(f"Verein: {vereinsname} - No contact information found")
        except Exception as e:
            details.append(f"Error: {str(e)}")
    return details
def save_to_csv(data, filename):
    keys = data[0].keys() if data else []
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

# Clear output
def clear():
    return "", ""
# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("[](https://specialist-it.de/verein.csv)")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
    with gr.Row():
        details_output = gr.Markdown(label="Vereinsliste")
    def process_ort(ort):
        links_text, links = parse_links_and_content(ort)
        contact_details = scrape_links(links)

        # Ask the Qwen Space to turn the scraped text into JSON
        from gradio_client import Client
        qwen_client = Client("Qwen/Qwen2.5-72B-Instruct")
        result = qwen_client.predict(
            query=f"return a valid json objects with contact details foreach verein. return the generated json only \n {contact_details}",
            history=[],
            system="you are a expert for json data and your job is to extract information from text and return a valid json object only. no text no explanations",
            api_name="/model_chat"
        )
        # The endpoint returns the chat history at index 1; pull the generated JSON out of it
        json_data = result[1][0][0]
        return json_data
    with gr.Row():
        clearbutton = gr.Button("Clear")
        button = gr.Button("Senden")

    # Connect the buttons to their functions
    button.click(fn=process_ort, inputs=ort_input, outputs=details_output)
    # clear() returns two empty strings, so clear both the input and the output
    clearbutton.click(fn=clear, inputs=[], outputs=[ort_input, details_output])

# Launch the Gradio application
demo.launch()
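
Note: the save_to_csv helper above is defined but never called, even though the interface links to a verein.csv download. Below is a minimal sketch of how the Qwen output could be fed into it; the export_csv function, the default filename, and the assumption that the model returns a JSON array of flat objects with identical keys are illustrative additions, not part of the original app.

def export_csv(json_text, filename="verein.csv"):
    # Hypothetical glue code: parse the model output and write it with save_to_csv.
    # Assumes json_text is a JSON array (or single object) of flat dicts sharing the same keys.
    try:
        rows = json.loads(json_text)
    except json.JSONDecodeError:
        return None  # model did not return valid JSON; skip the export
    if isinstance(rows, dict):
        rows = [rows]
    save_to_csv(rows, filename)
    return filename

It could be called from process_ort right after json_data is extracted, for example export_csv(json_data), so the generated contact list is also written to disk.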