RohitCSharp committed on
Commit
e9d5607
·
verified ·
1 Parent(s): a791ea3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -9
app.py CHANGED
@@ -8,6 +8,8 @@ from transformers import pipeline
8
  from gtts import gTTS
9
  import tempfile
10
  import os
 
 
11
 
12
  # CPU-friendly summarization LLM
13
  summary_pipe = pipeline("text2text-generation", model="google/flan-t5-base", device=-1)
@@ -15,7 +17,7 @@ llm = HuggingFacePipeline(pipeline=summary_pipe)
15
 
16
  # Summarization prompt
17
  summary_prompt = PromptTemplate.from_template("""
18
- Summarize the following webpage content in a clear, concise way:
19
 
20
  {text}
21
 
@@ -24,17 +26,31 @@ Summary:
24
 
25
  summary_chain = LLMChain(llm=llm, prompt=summary_prompt)
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def url_to_audio_summary(url):
28
  try:
29
- loader = WebBaseLoader(url)
30
- docs = loader.load()
31
- splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
32
- splits = splitter.split_documents(docs)
33
 
34
- full_text = "\n".join([s.page_content for s in splits])
35
- summary = summary_chain.run(text=full_text)
36
 
37
- # Use gTTS for TTS since Hugging Face TTS model failed
38
  tts = gTTS(text=summary)
39
  temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
40
  tts.save(temp_path.name)
@@ -52,7 +68,7 @@ iface = gr.Interface(
52
  gr.Audio(label="Audio Summary")
53
  ],
54
  title="URL to Audio Summary Agent",
55
- description="Summarizes article from a URL and gives an audio summary. CPU-only using gTTS."
56
  )
57
 
58
  if __name__ == "__main__":
 
8
  from gtts import gTTS
9
  import tempfile
10
  import os
11
+ from bs4 import BeautifulSoup
12
+ import requests
13
 
14
  # CPU-friendly summarization LLM
15
  summary_pipe = pipeline("text2text-generation", model="google/flan-t5-base", device=-1)
 
17
 
18
  # Summarization prompt
19
  summary_prompt = PromptTemplate.from_template("""
20
+ Summarize the following article content in a clear, concise way:
21
 
22
  {text}
23
 
 
26
 
27
  summary_chain = LLMChain(llm=llm, prompt=summary_prompt)
28
 
29
def extract_main_content(url):
    """Fetch *url* and return the article text found in its ``<p>`` tags.

    Boilerplate containers (navigation, header, footer, sidebars, scripts,
    styles) are removed before the paragraphs are collected, and paragraphs
    of 60 characters or fewer are dropped.

    Args:
        url: Address of the web page to read.

    Returns:
        The kept paragraphs joined by newlines. On any failure (network
        error, HTTP error status, or no usable paragraph text) the return
        value is instead a message starting with
        ``"Error extracting article content: "``; no exception propagates
        out of this function.
    """
    try:
        response = requests.get(url, timeout=10)
        # Fail on 4xx/5xx instead of parsing the server's error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Remove navigation, header, footer, sidebars, and scripts
        for tag in soup(["nav", "header", "footer", "aside", "script", "style", "noscript"]):
            tag.decompose()

        # Keep only the longer paragraphs; each paragraph's text is
        # extracted once rather than once for the filter and once for the join.
        paragraph_texts = (p.get_text() for p in soup.find_all("p"))
        content = "\n".join(text for text in paragraph_texts if len(text) > 60).strip()
        if not content:
            # Report an empty extraction through the same error-prefix
            # channel instead of returning an empty string.
            return "Error extracting article content: no article text found"
        return content
    except Exception as e:
        return f"Error extracting article content: {str(e)}"
44
+
45
  def url_to_audio_summary(url):
46
  try:
47
+ article_text = extract_main_content(url)
48
+ if article_text.startswith("Error"):
49
+ return article_text, None
 
50
 
51
+ summary = summary_chain.run(text=article_text)
 
52
 
53
+ # Use gTTS for TTS
54
  tts = gTTS(text=summary)
55
  temp_path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
56
  tts.save(temp_path.name)
 
68
  gr.Audio(label="Audio Summary")
69
  ],
70
  title="URL to Audio Summary Agent",
71
+ description="Summarizes only the article content from a URL and gives an audio summary. CPU-only using gTTS."
72
  )
73
 
74
  if __name__ == "__main__":