Spaces:
Running
Running
File size: 6,111 Bytes
a936419 04f7cb6 085ef0b ae59393 1c3a250 a936419 085ef0b e5d9b98 0963c3d 085ef0b 5399f24 740258d c51298d 085ef0b 56a9e2e 085ef0b 740258d c51298d 740258d 085ef0b 740258d 5535edf 0be39ee 04f7cb6 740258d 085ef0b e410dd0 2f3bf94 085ef0b e410dd0 085ef0b 2f3bf94 1e4afd6 2f3bf94 085ef0b 2f3bf94 085ef0b 2f3bf94 085ef0b 2f3bf94 740258d 085ef0b 030bd9a 740258d 2f3bf94 085ef0b e410dd0 085ef0b 920c8fd 740258d 085ef0b 96b654f 5535edf 6b3b0cd eee6618 6b3b0cd 085ef0b 3b28bd9 085ef0b 5f6d292 7e0c763 bb571f1 cc1e631 bedbbfa 085ef0b bb571f1 085ef0b 7e0c763 085ef0b 3b28bd9 e5d9b98 3b28bd9 085ef0b c1c8a1e e5d9b98 1e4afd6 e5d9b98 1e4afd6 085ef0b a936419 ba5cdce 85deaff 5399f24 0be39ee 5535edf 085ef0b cc1e631 949d21c cc1e631 9a23bef 9abe4cd 709f660 f842420 c577d47 0265349 5503bf5 2e1e0e1 f842420 a4ab0fc 63c2204 085ef0b 1e4afd6 e5d9b98 085ef0b fe746c1 e5d9b98 085ef0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import json
import csv
#import pandas as pd
# Load environment variables
api_key = os.environ.get('GROQ_API_KEY')  # enables the Groq-backed ask_llm() helper
read_key = os.environ.get('HF_TOKEN', None)  # read but never used below — presumably for HF API access; verify
# Initialize Groq client
# NOTE: groq is imported lazily so the app still starts when no key is set.
if api_key:
    from groq import Client as GroqClient
    client = GroqClient(api_key=api_key)
else:
    client = None  # ask_llm() checks for this and returns an error string
# Use Llama 3 70B powered by Groq for answering
def ask_llm(ort):
    """Send *ort* to Groq's Llama 3 70B chat model and return the reply text.

    Returns a plain error string when no Groq client is configured or when
    the API call fails.
    """
    if not client:
        return "Groq API key not set."
    system_msg = {"role": "system", "content": "You are a helpful assistant."}
    user_msg = {
        "role": "user",
        "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch",
    }
    try:
        reply = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[system_msg, user_msg],
        )
        return reply.choices[0].message.content
    except Exception as err:
        return f"Error in response generation: {str(err)}"
def parse_links_and_content(ort):
    """Collect club (Verein) names and detail-page URLs for a Bavarian town.

    Scrapes the paginated listing on vereine-in-deutschland.net for *ort*.

    Args:
        ort: Town name exactly as it appears in the site's URL path.

    Returns:
        Tuple ``(all_links_text, all_links)`` of equal-length lists.  On any
        request/parsing error the error message is returned in place of the
        name list together with an empty link list (original behavior kept).
    """
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"
    try:
        # Timeout added so a stalled server cannot hang the app indefinitely.
        response = requests.get(initial_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Determine the last page from the pagination widget; keep the
        # fallback of 10 pages when the element is missing or non-numeric
        # (the original crashed on a non-numeric href).
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        last_page = 10
        if link_element and 'href' in link_element.attrs:
            try:
                last_page = int(link_element['href'].split('/')[-1])
            except ValueError:
                pass  # non-numeric trailing segment — keep the default
        # Loop through all pages and collect links
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div:
                anchors = target_div.find_all('a', href=True)  # parse once, reuse
                all_links.extend(urljoin(base_url, a['href']) for a in anchors)
                all_links_text.extend(a.text for a in anchors)
            else:
                print(f"Target div not found on page {page_number}")
    except Exception as e:
        return str(e), []
    # Keep every other entry — presumably each club contributes two anchors
    # per listing row; verify against the page markup.
    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]
    return all_links_text, all_links
def extract_vereinsname(url):
    """Derive a readable club name from the last path segment of *url*.

    Hyphens in the URL slug are turned into spaces.
    """
    slug = url.rsplit('/', 1)[-1]
    return slug.replace('-', ' ')
def scrape_links(links):
    """Fetch each club detail page and extract its contact information.

    Args:
        links: Iterable of absolute detail-page URLs.

    Returns:
        List of strings, one per link: "Verein: <name> <details>", a
        no-contact-info notice, or an "Error: ..." message for failures.
    """
    details = []
    for link in links:
        try:
            # Timeout so one slow page cannot stall the whole scrape.
            response = requests.get(link, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_nav = soup.select_one('.nav')
            parts = link.split('/')
            # Log the URL and its parts for debugging
            print(f"Processing URL: {link}")
            print(f"URL parts: {parts}")
            # Extract the name of the Verein from the URL; fall back to the
            # second-to-last part when the URL ends with a slash.
            vereinsname = parts[-1] if parts[-1] else parts[-2]
            # BUG FIX: the original read target_nav.text BEFORE checking for
            # None, so a missing nav raised AttributeError into the generic
            # error branch and the notice below was unreachable.
            if target_nav:
                texte = target_nav.text.strip()
                # Strip site boilerplate and normalize the field labels.
                texte = texte.replace("Amtsgericht: Schweinfurt", "")
                texte = texte.replace("Adresse folgt", "")
                texte = texte.replace("Adresse", "Adresse:")
                texte = texte.replace("Kontakt", "Email:")
                texte = texte.replace("Noch keine Daten vorhanden", "")
                details.append(f"Verein: {vereinsname} {texte}")
            else:
                details.append(f"Verein: {vereinsname} - No contact information found")
        except Exception as e:
            details.append(f"Error: {str(e)}")
    return details
def save_to_csv(data, filename):
    """Write a list of dicts to *filename* as UTF-8 CSV.

    The header row is taken from the keys of the first dict; an empty input
    list produces a file with an empty header.
    """
    fieldnames = list(data[0].keys()) if data else []
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
# Clear output
def clear():
    """Callback for the Clear button: blank out both text fields."""
    empty = ""
    return empty, empty
# Create the Gradio interface
# Create the Gradio interface
with gr.Blocks() as demo:
    # Badge linking to the externally hosted CSV export.
    gr.Markdown("[](https://specialist-it.de/verein.csv)")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
    with gr.Row():
        details_output = gr.Textbox(label="Vereinsliste")

    def process_ort(ort):
        """Scrape club contact details for *ort* and return them as JSON text.

        The scraped free text is handed to the public Qwen2.5-72B-Instruct
        Space, which is prompted to emit a JSON object only.
        """
        links_text, links = parse_links_and_content(ort)
        contact_details = scrape_links(links)
        # Imported lazily so the app can start even without gradio_client.
        from gradio_client import Client
        qwen_client = Client("Qwen/Qwen2.5-72B-Instruct")
        result = qwen_client.predict(
            query=f"return a valid json objects with contact details foreach verein. return the generated json only \n {contact_details}",
            history=[],
            system="you are a expert for json data and your job is to extract information from text and return a valid json object only. no text no explanations",
            api_name="/model_chat"
        )
        # The model's reply text lives at result[1][0][1] — presumably
        # (value, chat_history, ...); TODO confirm against the Space's
        # /model_chat API signature.
        json_data = result[1][0][1]
        # Return the plain string: the original wrapped it in gr.Markdown(),
        # handing a component object to a gr.Textbox output.
        return json_data

    with gr.Row():
        clearbutton = gr.Button("Clear")
        button = gr.Button("Senden")

    # Connect the buttons to their callbacks.
    button.click(fn=process_ort, inputs=ort_input, outputs=details_output)
    clearbutton.click(fn=clear, inputs=[], outputs=details_output)

# Launch the Gradio application
demo.launch()