mgbam commited on
Commit
9686c37
·
verified ·
1 Parent(s): 4739b8c

Update extractor.py

Browse files
Files changed (1) hide show
  1. extractor.py +22 -76
extractor.py CHANGED
@@ -1,109 +1,55 @@
1
  # /extractor.py
2
-
3
- """
4
- Handles content extraction from various sources like files, images, and websites.
5
-
6
- This module encapsulates the logic for parsing different file formats (PDF, DOCX),
7
- performing Optical Character Recognition (OCR) on images, and scraping web content.
8
- """
9
- import mimetypes
10
- import os
11
- import re
12
- from urllib.parse import urlparse, urljoin
13
- import logging
14
-
15
- import PyPDF2
16
- import docx
17
- import requests
18
  from bs4 import BeautifulSoup
19
 
20
- # --- Setup Logging ---
21
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
22
 
23
- # --- Optional OCR Imports ---
24
  try:
25
- import cv2
26
- import numpy as np
27
- import pytesseract
28
  OCR_AVAILABLE = True
29
  except ImportError:
30
  OCR_AVAILABLE = False
31
- logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")
32
 
33
  def extract_text_from_image(image_path: str) -> str:
34
- """Extracts text from an image file using Tesseract OCR."""
35
- if not OCR_AVAILABLE:
36
- return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."
37
- try:
38
- pytesseract.get_tesseract_version()
39
- except Exception:
40
- return "Error: Tesseract OCR is not installed or not in your PATH."
41
-
42
  try:
43
  image = cv2.imread(image_path)
44
- if image is None:
45
- return "Error: Could not read image file."
46
  gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
47
- text = pytesseract.image_to_string(gray)
48
- return text.strip() or "No text found in image."
49
- except Exception as e:
50
- logging.error(f"OCR extraction failed: {e}")
51
- return f"Error during OCR: {e}"
52
 
53
  def extract_text_from_file(file_path: str) -> str:
54
- """Extracts text from a variety of file types."""
55
- if not file_path:
56
- return ""
57
  ext = os.path.splitext(file_path)[1].lower()
58
  try:
59
  if ext == ".pdf":
60
- with open(file_path, "rb") as f:
61
- reader = PyPDF2.PdfReader(f)
62
- return "\n".join(page.extract_text() or "" for page in reader.pages)
63
  elif ext == ".docx":
64
- doc = docx.Document(file_path)
65
- return "\n".join(p.text for p in doc.paragraphs)
66
  elif ext in [".txt", ".md", ".csv", ".html", ".css", ".js", ".py"]:
67
- with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
68
- return f.read()
69
  elif ext in [".jpg", ".jpeg", ".png", ".bmp", ".tiff"]:
70
  return extract_text_from_image(file_path)
71
- else:
72
- return f"Unsupported file type: {ext}"
73
- except Exception as e:
74
- logging.error(f"Error extracting text from {file_path}: {e}")
75
- return f"Error extracting text: {e}"
76
 
77
  def extract_website_content(url: str) -> str:
78
- """Scrapes and returns the primary HTML content of a given URL."""
79
  try:
80
- headers = {
81
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
82
- }
83
  response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
84
  response.raise_for_status()
85
  response.encoding = response.apparent_encoding
86
  soup = BeautifulSoup(response.text, 'html.parser')
87
 
88
- # Make all resource links absolute
89
  for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
90
  for item in soup.find_all(tag):
91
- if item.has_attr(attr):
92
- item[attr] = urljoin(url, item[attr])
93
-
94
- title = soup.title.string if soup.title else "N/A"
95
- # Return a prettified version of the body content for context
96
- body_content = soup.body.prettify() if soup.body else str(soup)
97
-
98
- # Truncate for prompt
99
- if len(body_content) > 15000:
100
- body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"
101
-
102
- return f"<!-- Original URL: {url} -->\n<!-- Title: {title} -->\n{body_content}"
103
-
104
- except requests.RequestException as e:
105
- logging.error(f"Website extraction failed for {url}: {e}")
106
- return f"Error: Could not fetch content from the URL. Details: {e}"
107
- except Exception as e:
108
- logging.error(f"An unexpected error occurred during website extraction: {e}")
109
- return f"Error: An unexpected error occurred. Details: {e}"
 
1
  # /extractor.py
2
+ """ Handles content extraction from various sources like files, images, and websites. """
3
+ import mimetypes, os, re, logging
4
+ from urllib.parse import urljoin
5
+ import PyPDF2, docx, requests
 
 
 
 
 
 
 
 
 
 
 
 
6
  from bs4 import BeautifulSoup
7
 
 
8
# --- Logging configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Optional OCR dependencies ---
# cv2/pytesseract may be absent; image extraction then degrades gracefully
# via the OCR_AVAILABLE flag instead of crashing at import time.
try:
    import cv2
    import pytesseract
except ImportError:
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found. Text extraction from images will be disabled.")
else:
    OCR_AVAILABLE = True
16
 
17
def extract_text_from_image(image_path: str) -> str:
    """Extract text from an image file using Tesseract OCR.

    Args:
        image_path: Path to an image readable by OpenCV.

    Returns:
        The stripped OCR text, or a human-readable error string when OCR
        dependencies are missing or extraction fails.
    """
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies not installed."
    try:
        # Fail fast with a clear message when the tesseract *binary* is
        # missing (the Python wrapper can be installed without it).
        pytesseract.get_tesseract_version()
    except Exception:
        return "Error: Tesseract OCR is not installed or not in your PATH."
    try:
        image = cv2.imread(image_path)
        # cv2.imread returns None instead of raising on an unreadable path.
        if image is None:
            return "Error: Could not read image file."
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # .strip() so whitespace-only OCR output falls through to the
        # "no text" message instead of being returned as-is.
        return pytesseract.image_to_string(gray).strip() or "No text found in image."
    except Exception as e:
        logging.error(f"OCR extraction failed: {e}")
        return f"Error during OCR: {e}"
 
 
 
24
 
25
def extract_text_from_file(file_path: str) -> str:
    """Extract text from a file, dispatching on its extension.

    Supports PDF, DOCX, common plain-text/code formats, and (via OCR)
    common image formats.

    Args:
        file_path: Path to the file; falsy values yield "".

    Returns:
        The extracted text, or a human-readable error / unsupported-type
        message. Never raises.
    """
    if not file_path:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext == ".pdf":
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() can return None for image-only pages.
                return "\n".join(page.extract_text() or "" for page in reader.pages)
        elif ext == ".docx":
            return "\n".join(p.text for p in docx.Document(file_path).paragraphs)
        elif ext in (".txt", ".md", ".csv", ".html", ".css", ".js", ".py"):
            # errors="ignore" tolerates files that are not valid UTF-8.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        elif ext in (".jpg", ".jpeg", ".png", ".bmp", ".tiff"):
            return extract_text_from_image(file_path)
        else:
            return f"Unsupported file type: {ext}"
    except Exception as e:
        # Broad catch is deliberate: extraction is best-effort and callers
        # expect a string, never an exception — but log so failures are
        # not silently invisible.
        logging.error(f"Error extracting text from {file_path}: {e}")
        return f"Error extracting text: {e}"
 
 
 
39
 
40
def extract_website_content(url: str) -> str:
    """Fetch a URL and return its HTML, prefixed with a source comment.

    Resource links (img/link/script) are rewritten to absolute URLs and the
    HTML is truncated to ~15 kB so it fits in a prompt context.

    Args:
        url: The page to fetch.

    Returns:
        The (possibly truncated) HTML, or an error string on failure.
        Never raises.
    """
    try:
        # Browser-like UA: many sites block the default python-requests agent.
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        # apparent_encoding guesses from the payload; more reliable than the
        # (often missing) charset header.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        # Rewrite relative resource URLs so the snapshot renders standalone.
        for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for item in soup.find_all(tag):
                if item.has_attr(attr):
                    item[attr] = urljoin(url, item[attr])

        body_content = str(soup)
        # Truncate so the result stays within prompt-sized limits.
        if len(body_content) > 15000:
            body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"
        return f"<!-- Original URL: {url} -->\n{body_content}"
    except requests.RequestException as e:
        # Network/HTTP failures: log, then degrade to an error string.
        logging.error(f"Website extraction failed for {url}: {e}")
        return f"Error: Could not fetch content from {url}. Details: {e}"
    except Exception as e:
        logging.error(f"Unexpected error during website extraction for {url}: {e}")
        return f"Error: Could not fetch content from {url}. Details: {e}"