Shreyas094 committed
Commit 6a5abbf · verified · Parent(s): 1f15b33

Update app.py

Files changed (1)
  1. app.py +35 -9
app.py CHANGED
@@ -124,26 +124,52 @@ def scrape_with_newspaper(url):
     logger.info(f"Starting to scrape with Newspaper3k: {url}")
     try:
         # Check if the URL is a PDF
-        response = requests.get(url)
+        response = requests.get(url, timeout=30)
         content_type = response.headers.get('Content-Type', '').lower()
 
         if 'application/pdf' in content_type:
-            # Handle PDF
             logger.info(f"Detected PDF file: {url}")
-            pdf_file = BytesIO(response.content)
-            pdf_reader = PdfReader(pdf_file)
-            text = ""
-            for page in pdf_reader.pages:
-                text += page.extract_text() + "\n"
-            return text.strip()
+            return extract_pdf_content(response.content)
         else:
             # Handle regular web page
             article = Article(url)
             article.download()
             article.parse()
             return article.text
+    except requests.RequestException as e:
+        logger.error(f"Error fetching content from {url}: {e}")
+    except Exception as e:
+        logger.error(f"Unexpected error scraping {url}: {e}")
+
+    # If we've reached this point, both methods have failed
+    logger.warning(f"All scraping methods failed for {url}")
+    return ""
+
+def extract_pdf_content(pdf_content):
+    try:
+        # First, try using PyPDF2 directly
+        pdf_file = BytesIO(pdf_content)
+        pdf_reader = PdfReader(pdf_file)
+        text = ""
+        for page in pdf_reader.pages:
+            text += page.extract_text() + "\n"
+        if text.strip():
+            return text.strip()
+
+        # If PyPDF2 fails to extract text, try saving the PDF and using newspaper
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
+            temp_pdf.write(pdf_content)
+            temp_pdf_path = temp_pdf.name
+
+        try:
+            article = Article('file://' + temp_pdf_path)
+            article.download()
+            article.parse()
+            return article.text
+        finally:
+            os.unlink(temp_pdf_path)  # Ensure we always delete the temporary file
     except Exception as e:
-        logger.error(f"Error scraping {url} with Newspaper3k: {e}")
+        logger.error(f"Error extracting content from PDF: {e}")
         return ""
 
 def scrape_with_bs4(url, session, max_chars=None):
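For anyone who wants to exercise the new code paths, a minimal smoke-test sketch is below. It is not part of this commit: it assumes app.py (with the updated scrape_with_newspaper and the new extract_pdf_content) is importable from the working directory, and the two URLs are placeholders standing in for a regular article page and a PDF.

```python
# Hypothetical smoke test for the updated scraper; the URLs below are placeholders.
# Assumes app.py defines scrape_with_newspaper as in the diff above.
from app import scrape_with_newspaper

if __name__ == "__main__":
    for url in (
        "https://example.com/some-article",     # regular page -> newspaper3k branch
        "https://example.com/some-report.pdf",  # PDF -> extract_pdf_content branch
    ):
        text = scrape_with_newspaper(url)
        print(f"{url}: extracted {len(text)} characters")
```

With the error handling added in this commit, scrape_with_newspaper is expected to log and return an empty string rather than raise when both the PyPDF2 pass and the newspaper fallback fail, so the loop above should complete even for unreadable documents.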