Shreyas094 committed
Commit 6a5abbf · verified · Parent(s): 1f15b33

Update app.py

Files changed (1)
  1. app.py +35 -9
app.py CHANGED
@@ -124,26 +124,52 @@ def scrape_with_newspaper(url):
     logger.info(f"Starting to scrape with Newspaper3k: {url}")
     try:
         # Check if the URL is a PDF
-        response = requests.get(url)
+        response = requests.get(url, timeout=30)
         content_type = response.headers.get('Content-Type', '').lower()
 
         if 'application/pdf' in content_type:
-            # Handle PDF
             logger.info(f"Detected PDF file: {url}")
-            pdf_file = BytesIO(response.content)
-            pdf_reader = PdfReader(pdf_file)
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text() + "\n"
-            return text.strip()
+            return extract_pdf_content(response.content)
         else:
             # Handle regular web page
             article = Article(url)
             article.download()
             article.parse()
             return article.text
+    except requests.RequestException as e:
+        logger.error(f"Error fetching content from {url}: {e}")
+    except Exception as e:
+        logger.error(f"Unexpected error scraping {url}: {e}")
+
+    # If we've reached this point, both methods have failed
+    logger.warning(f"All scraping methods failed for {url}")
+    return ""
+
+def extract_pdf_content(pdf_content):
+    try:
+        # First, try using PyPDF2 directly
+        pdf_file = BytesIO(pdf_content)
+        pdf_reader = PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text() + "\n"
+        if text.strip():
+            return text.strip()
+
+        # If PyPDF2 fails to extract text, try saving the PDF and using newspaper
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
+            temp_pdf.write(pdf_content)
+            temp_pdf_path = temp_pdf.name
+
+        try:
+            article = Article('file://' + temp_pdf_path)
+            article.download()
+            article.parse()
+            return article.text
+        finally:
+            os.unlink(temp_pdf_path)  # Ensure we always delete the temporary file
     except Exception as e:
-        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
+        logger.error(f"Error extracting content from PDF: {e}")
         return ""
 
 def scrape_with_bs4(url, session, max_chars=None):
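For anyone who wants to exercise the new code paths, a minimal smoke-test sketch is below. It is not part of this commit: it assumes app.py (with the updated scrape_with_newspaper and the new extract_pdf_content) is importable from the working directory, and the two URLs are placeholders standing in for a regular article page and a PDF.

```python
# Hypothetical smoke test for the updated scraper; the URLs below are placeholders.
# Assumes app.py defines scrape_with_newspaper as in the diff above.
from app import scrape_with_newspaper

if __name__ == "__main__":
    for url in (
        "https://example.com/some-article",     # regular page -> newspaper3k branch
        "https://example.com/some-report.pdf",  # PDF -> extract_pdf_content branch
    ):
        text = scrape_with_newspaper(url)
        print(f"{url}: extracted {len(text)} characters")
```

With the error handling added in this commit, scrape_with_newspaper is expected to log and return an empty string rather than raise when both the PyPDF2 pass and the newspaper fallback fail, so the loop above should complete even for unreadable documents.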