import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import csv
import json
import pandas as pd
from io import StringIO
import google.generativeai as genai

# Read API keys from the environment and configure the Gemini client
genai.configure(api_key=os.environ["geminiapikey"])
api_key = os.environ.get('GROQ_API_KEY')
read_key = os.environ.get('HF_TOKEN', None)

# Initialize Groq client
if api_key:
    from groq import Client as GroqClient
    client = GroqClient(api_key=api_key)
else:
    client = None

# Use Llama 3 70B powered by Groq for answering
def ask_llm(ort):
    if not client:
        return "Groq API key not set."
    
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"

def parse_links_and_content(ort):
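    # Collect the Verein names and detail-page links for the given Ort from vereine-in-deutschland.net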
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"

    try:
        response = requests.get(initial_url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
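        # Fall back to 10 pages if the pagination element cannot be found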
        last_page = 10
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            last_page = int(href.split('/')[-1])

        # Loop through all pages and collect links
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')

            if target_div:
                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links.extend(links)
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")

    except Exception as e:
        return str(e), []

    # Each Verein appears twice in the collected list, so keep every other entry to deduplicate
    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]

    return all_links_text, all_links

def extract_vereinsname(url):
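    # Derive a readable Vereinsname from the last URL segment (currently unused helper)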
    parts = url.split('/')
    vereinsname = parts[-1]
    vereinsname = vereinsname.replace("-", " ")
    return vereinsname

def scrape_links(links):
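    # Visit each Verein page and extract the contact block from its .nav element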
    details = []
    for link in links:
        try:
            response = requests.get(link)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_nav = soup.select_one('.nav')
            parts = link.split('/')
            
            # Log the URL and its parts for debugging
            print(f"Processing URL: {link}")
            print(f"URL parts: {parts}")
            
            # Extract the name of the Verein from the URL
            vereinsname = parts[-1] if parts[-1] else parts[-2]  # Fallback to the second-to-last part if the last part is empty
            if target_nav:
                # Clean up the scraped contact block before storing it
                texte = target_nav.text.strip()
                texte = texte.replace("Amtsgericht: Schweinfurt", "")
                texte = texte.replace("Adresse folgt", "")
                texte = texte.replace("Adresse", "Adresse:")
                texte = texte.replace("Kontakt", "Email:")
                texte = texte.replace("Noch keine Daten vorhanden", "")
                details.append(f"Verein: {vereinsname} {texte}")
            else:
                details.append(f"Verein: {vereinsname} - No contact information found")
        except Exception as e:
            details.append(f"Error: {str(e)}")

    return details

def save_to_csv(data, filename):
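    # Write a list of dicts to a CSV file, using the keys of the first entry as the header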
    keys = data[0].keys() if data else []
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(data)

# Clear output
def clear():
    return "", ""

def load_data():
    # Currently unused; expects a module-level DataFrame named df to exist
    return df

# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("[Download](https://specialist-it.de/verein.csv)")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
    with gr.Row():
        #details_output = gr.DataFrame(label="Ausgabe")        
        details_output = gr.Textbox(label="Ausgabe", value="\n\n\n\n")

    def process_ort(ort):
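        # Gather the Verein links for the Ort, then ask Gemini for contact details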
        links_text, links = parse_links_and_content(ort)
        #return links_text
        kontakt = []

        # Create the Gemini model
        generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }

        model = genai.GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            generation_config=generation_config,
        )

        # Start a fresh chat session with no prior history
        chat_session = model.start_chat(history=[])

        # Ask Gemini for the contact details of each Verein
        for verein in links_text:
            response = chat_session.send_message(f"Kontaktdaten für {verein}")
            kontakt.append(response.text)

        # NOTE: this early return short-circuits the function; the scraping,
        # Qwen and CSV pipeline below is kept for reference but is never reached.
        return "\n\n".join(kontakt)

        contact_details = scrape_links(links)
        
        from gradio_client import Client

        # Hand the scraped contact text to Qwen so it returns normalized JSON
        qwen_client = Client("Qwen/Qwen2.5-72B-Instruct")
        result = qwen_client.predict(
            query=f"return a valid json object with contact details for each verein. return the generated json only \n {contact_details}",
            history=[],
            system="you are an expert for json data and your job is to extract information from text and return a valid json object only. no text no explanations. reverse all email addresses. example: reverse ed.nesuahsgnitle-vs@dnatsrov to [email protected]. replace all dashes in the vereinsname with a whitespace",
            api_name="/model_chat"
        )

        json_data = result[1][0][1]
        json_data = json_data.replace("```json", "")
        json_data = json_data.replace("```", "")
        
        # Convert JSON string to Python dictionary
        data_dict = json.loads(json_data)

        # Convert dictionary to DataFrame
        df = pd.DataFrame(data_dict)
        # Convert the DataFrame to a CSV file (disabled)
        #df.to_csv('daten.csv', index=False)
        # Convert the DataFrame to an in-memory CSV string
        csv_buffer = StringIO()
        df.to_csv(csv_buffer, index=False)
        csv_data = csv_buffer.getvalue()

        print(csv_data)
        #return csv_data
        return df

    with gr.Row():
        clearbutton = gr.Button("Clear")  
        button = gr.Button("Senden")    

    # Connect the button to the function
    button.click(fn=process_ort, inputs=ort_input, outputs=details_output)
    clearbutton.click(fn=clear, inputs=[], outputs=[ort_input, details_output])

# Launch the Gradio application
demo.launch()