Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,8 +15,7 @@ from bs4 import BeautifulSoup
|
|
| 15 |
import PyPDF2
|
| 16 |
import pytesseract
|
| 17 |
from PIL import Image
|
| 18 |
-
|
| 19 |
-
|
| 20 |
|
| 21 |
load_dotenv()
|
| 22 |
URL_APP_SCRIPT = os.getenv('URL_APP_SCRIPT')
|
|
@@ -159,13 +158,13 @@ def sidebar():
|
|
| 159 |
st.markdown("---")
|
| 160 |
st.markdown("# Ricerca Online")
|
| 161 |
st.session_state.cerca_online = st.toggle("Attivata", value=False)
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
st.session_state.selected_tbs = st.selectbox("Periodo:", list(st.session_state.tbs_options.keys()), disabled=not st.session_state.cerca_online)
|
| 167 |
st.session_state.tbs_value = st.session_state.tbs_options[st.session_state.selected_tbs]
|
| 168 |
-
st.session_state.numero_siti = st.slider(label="Risultati", min_value = 1, max_value=20, value=3, disabled=not st.session_state.cerca_online)
|
| 169 |
#st.session_state.suddividi_ricerca = st.toggle("Attivata", value=False)
|
| 170 |
st.markdown("---")
|
| 171 |
|
|
@@ -264,21 +263,41 @@ def gen_augmented_prompt(prompt, top_k) :
|
|
| 264 |
links.append((reference, testo))
|
| 265 |
return context, links
|
| 266 |
|
| 267 |
-
def
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
response = requests.get(url)
|
| 272 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 273 |
title = soup.title.string if soup.title else "N/A"
|
| 274 |
description = soup.find('meta', attrs={'name': 'description'})['content'] if soup.find('meta', attrs={'name': 'description'}) else "N/A"
|
| 275 |
body_content = soup.find('body').get_text() if soup.find('body') else "N/A"
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
return results
|
| 281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
def gen_online_prompt(prompt, top_k) :
|
| 283 |
links = []
|
| 284 |
context = ''
|
|
|
|
| 15 |
import PyPDF2
|
| 16 |
import pytesseract
|
| 17 |
from PIL import Image
|
| 18 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
|
|
|
| 19 |
|
| 20 |
load_dotenv()
|
| 21 |
URL_APP_SCRIPT = os.getenv('URL_APP_SCRIPT')
|
|
|
|
| 158 |
st.markdown("---")
|
| 159 |
st.markdown("# Ricerca Online")
|
| 160 |
st.session_state.cerca_online = st.toggle("Attivata", value=False)
|
| 161 |
+
with st.popover("Siti Specifici", disabled=not st.session_state.cerca_online,use_container_width=True):
|
| 162 |
+
st.markdown("#### Inserisci Siti Web ")
|
| 163 |
+
for i in range(5):
|
| 164 |
+
st.session_state.urls[i] = st.text_input(f"URL Sito {i+1}", placeholder='Sito Web...', help='è possibile specificare anche il link di un video Youtube, in tal caso verrà restituita la trascrizione del video')
|
| 165 |
+
st.session_state.selected_tbs = st.selectbox("Periodo:", list(st.session_state.tbs_options.keys()), disabled=(not st.session_state.cerca_online) or (st.session_state.urls[0]!=""))
|
| 166 |
st.session_state.tbs_value = st.session_state.tbs_options[st.session_state.selected_tbs]
|
| 167 |
+
st.session_state.numero_siti = st.slider(label="Risultati", min_value = 1, max_value=20, value=3, disabled=(not st.session_state.cerca_online) or (st.session_state.urls[0]!=""))
|
| 168 |
#st.session_state.suddividi_ricerca = st.toggle("Attivata", value=False)
|
| 169 |
st.markdown("---")
|
| 170 |
|
|
|
|
| 263 |
links.append((reference, testo))
|
| 264 |
return context, links
|
| 265 |
|
| 266 |
+
def get_search_results_int(url):
    """Fetch title/description/body content for a single URL.

    For YouTube watch URLs the video transcript is used as the body;
    any other URL is fetched and scraped with BeautifulSoup.

    Args:
        url: web page (or YouTube video) address to fetch.

    Returns:
        dict with keys 'title', 'description', 'url', 'body'. On any
        fetch/parse failure the error is logged and a dict with empty
        fields is returned (best-effort, must not crash the UI).
    """
    from urllib.parse import urlparse, parse_qs

    result = {'title': '', 'description': '', 'url': '', 'body': ''}
    try:
        if "www.youtube.com" in url:
            # Parse the video id from the query string rather than a blind
            # split("=")[1], which breaks on URLs carrying extra parameters
            # (e.g. ...watch?v=ID&t=42s).
            params = parse_qs(urlparse(url).query)
            video_id = params['v'][0] if 'v' in params else url.split("=")[1]
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            body_content = " ".join(segment["text"] for segment in transcript)
            # NOTE: the transcript is deliberately stored in both
            # 'description' and 'body', matching the original behavior.
            result = {'title': 'Video Youtube', 'description': body_content,
                      'url': url, 'body': body_content}
        else:
            # Timeout prevents the Streamlit app from hanging forever on a
            # dead or very slow host.
            response = requests.get(url, timeout=15)
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.title.string if soup.title else "N/A"
            # Look the meta tag up once instead of twice.
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            description = meta_desc['content'] if meta_desc else "N/A"
            body = soup.find('body')
            body_content = body.get_text() if body else "N/A"
            result = {'title': title, 'description': description,
                      'url': url, 'body': body_content}
    except Exception as e:
        # Best-effort scrape: log and fall through to the empty result.
        print(f"Error fetching data from {url}: {e}")
    return result
|
|
|
|
| 288 |
|
| 289 |
+
def get_search_results(query, top_k):
    """Collect scraped results from user-specified URLs or a Google search.

    If the first URL slot in the sidebar is filled, every non-empty
    user-provided URL is scraped directly; otherwise a Google search for
    `query` is run, restricted by the sidebar's selected time period (tbs).

    Args:
        query: search query string (ignored when explicit URLs are given).
        top_k: number of Google results to fetch in search mode.

    Returns:
        list of result dicts as produced by get_search_results_int().
    """
    results = []
    if st.session_state.urls[0] != "":
        # Iterate the whole urls list instead of a hard-coded range(5) so
        # this stays correct if the sidebar ever offers a different number
        # of URL slots.
        for url in st.session_state.urls:
            if url != "":
                results.append(get_search_results_int(url))
    else:
        for url in search(query, num=top_k, stop=top_k,
                          tbs=st.session_state.tbs_value):
            results.append(get_search_results_int(url))
    return results
|
| 300 |
+
|
| 301 |
def gen_online_prompt(prompt, top_k) :
|
| 302 |
links = []
|
| 303 |
context = ''
|