mgokg committed
Commit 085ef0b · verified · 1 Parent(s): 96963d3

Update app.py

Files changed (1)
  1. app.py +64 -86
app.py CHANGED
@@ -2,148 +2,126 @@ import gradio as gr

Old version (lines removed by this commit are prefixed with -):

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
- from gradio_client import Client
import json
import csv
- import pandas
- import groq
- import os

api_key = os.environ.get('groq')
read_key = os.environ.get('HF_TOKEN', None)
- client = groq.Client(api_key=api_key)

# Use Llama 3 70B powered by Groq for answering
def ask_llm(ort):
-     try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
            ],
-         )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"

def parse_links_and_content(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"

    try:
        response = requests.get(initial_url)
-         response.raise_for_status() # Überprüfen, ob die Anfrage erfolgreich war
-         # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
-         # Ermittle die letzte Seite
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
-             # Extrahiere die letzten beiden Zeichen der URL
-             last_two_chars = href[-2:].strip()
-             # Konvertiere die letzten beiden Zeichen in einen Integer
-             last_two_chars_int = int(last_two_chars)
-             print(last_two_chars_int)
-         else:
-             last_two_chars_int = 10 # Falls die letzte Seite nicht gefunden wird, nimm an, dass es nur eine Seite gibt
-         # Schleife durch alle Seiten und sammle Links
-         for page_number in range(1, last_two_chars_int +1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
-             response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div:
                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
-                 texts = [a.text for a in target_div.find_all('a', href=True)]
-                 #print(texts)
                all_links.extend(links)
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")
    except Exception as e:
        return str(e), []

    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]

    return all_links_text, all_links

- def scrape_links(links):
-     links=links
    details = []
-     for contacts in links:
-         contact_element = soup.select_one('.nav')
-         response = requests.get(contacts)
-         response.raise_for_status()
-         soup = BeautifulSoup(response.content, 'html.parser')
-         target_nav = soup.select_one('.nav')
-         details.append(target_nav.text)

    return details

-     client = Client("mgokg/PerplexicaApi")
-     for verein in links:
-         result = client.predict(
-             prompt=f"{verein}",
-             api_name="/parse_links"
-         )
-         #print(result)
-         contact_details.append(result)
-     return contact_details

- # Speichere die JSON-Daten in eine CSV-Datei
- def save_to_csv(data, filename):
-     keys = data[0].keys()
-     with open(filename, 'w', newline='', encoding='utf-8') as output_file:
-         dict_writer = csv.DictWriter(output_file, fieldnames=keys)
-         dict_writer.writeheader()
        dict_writer.writerows(data)

- # Erstelle die Gradio-Schnittstelle
with gr.Blocks() as demo:
-     gr.Markdown("# ")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
    with gr.Row():
-         links_output = gr.JSON(label="Antwort")
-         rechts_output = gr.JSON(label="Antwort")
-         #links_output = gr.DataFrame(label="Ergebnisse")
-         #json_output = gr.JSON(label="Ergebnisse")

    def process_ort(ort):
-         #antwort = ask_llm(ort)
-         #antwort=gr.Markdown()
-         #return antwort
-         links = parse_links_and_content(ort)
-         #return links
-         contact= scrape_links(links)
-         json_data = [json.loads(item) for item in contact]
-         #save_to_csv(json_data, './contact_details.csv')
-         #return f"[Download CSV](contact_details.csv)", json_data
-         #return json_data
-         #return contact
-         return json_data, links
-         #return json_data

-     # Button zum Starten der Parsung
-     button = gr.Button("senden")
-     # Verbinde den Button mit der Funktion
-     button.click(fn=process_ort, inputs=ort_input, outputs=[links_output, rechts_output])

- # Starte die Gradio-Anwendung
- demo.launch()

New version (lines added by this commit are prefixed with +):

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
+ import os
import json
import csv

+ # Load environment variables
api_key = os.environ.get('groq')
read_key = os.environ.get('HF_TOKEN', None)

+ # Initialize Groq client
+ if api_key:
+     from groq import Client as GroqClient
+     client = GroqClient(api_key=api_key)
+ else:
+     client = None

# Use Llama 3 70B powered by Groq for answering
def ask_llm(ort):
+     if not client:
+         return "Groq API key not set."

+     try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f"{ort}. \n instruction: antworte kurz und knapp. antworte immer auf deutsch"}
            ],
+         )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"

def parse_links_and_content(ort):
    base_url = "https://vereine-in-deutschland.net"
    all_links = []
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"

    try:
        response = requests.get(initial_url)
+         response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

+         # Determine the last page
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
+         last_page = 1
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
+             last_page = int(href.split('/')[-1])

+         # Loop through all pages and collect links
+         for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
+             response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')

            if target_div:
                links = [urljoin(base_url, a['href']) for a in target_div.find_all('a', href=True)]
+                 texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links.extend(links)
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")

    except Exception as e:
        return str(e), []

    all_links = all_links[0::2]
    all_links_text = all_links_text[0::2]

    return all_links_text, all_links

+ def scrape_links(links):
    details = []
+     for link in links:
+         try:
+             response = requests.get(link)
+             response.raise_for_status()
+             soup = BeautifulSoup(response.content, 'html.parser')
+             target_nav = soup.select_one('.nav')
+             if target_nav:
+                 details.append(target_nav.text.strip())
+             else:
+                 details.append("No contact information found")
+         except Exception as e:
+             details.append(f"Error: {str(e)}")

    return details

+ def save_to_csv(data, filename):
+     keys = data[0].keys() if data else []
+     with open(filename, 'w', newline='', encoding='utf-8') as output_file:
+         dict_writer = csv.DictWriter(output_file, fieldnames=keys)
+         dict_writer.writeheader()
        dict_writer.writerows(data)

+ # Create the Gradio interface
with gr.Blocks() as demo:
+     gr.Markdown("# Vereine in Deutschland")
    with gr.Row():
        ort_input = gr.Textbox(label="Ort", placeholder="Gib den Namen des Ortes ein")
    with gr.Row():
+         links_output = gr.JSON(label="Links")
+         details_output = gr.JSON(label="Details")

    def process_ort(ort):
+         links_text, links = parse_links_and_content(ort)
+         contact_details = scrape_links(links)
+         json_data = [json.loads(detail) for detail in contact_details if detail.startswith("{")]
+         save_to_csv(json_data, './contact_details.csv')
+         return links_text, contact_details

+     # Button to start the parsing
+     button = gr.Button("Senden")

+     # Connect the button to the function
+     button.click(fn=process_ort, inputs=ort_input, outputs=[links_output, details_output])

+ # Launch the Gradio application
+ demo.launch()
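
For a quick sanity check of the new last-page extraction in isolation, a minimal sketch (the example href mirrors the /p/<page> pattern used for page_url above; "Regensburg" is an arbitrary place name, and the real value depends on the live pagination markup):

# Standalone check of the last-page parsing used in parse_links_and_content.
# The href format (".../p/<page>") is assumed from the page_url pattern above.
href = "/vereine/Bayern/Regensburg/p/12"   # hypothetical pagination link
last_page = int(href.split('/')[-1])       # -> 12
print(last_page)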