mgbam committed on
Commit
1ae58ff
·
verified ·
1 Parent(s): 1687ea3

Rename ux_components.py to extractor.py

Browse files
Files changed (2) hide show
  1. extractor.py +109 -0
  2. ux_components.py +0 -24
extractor.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# /extractor.py

"""
Handles content extraction from various sources like files, images, and websites.

This module encapsulates the logic for parsing different file formats (PDF, DOCX),
performing Optical Character Recognition (OCR) on images, and scraping web content.
"""

# Standard library
import logging
import mimetypes
import os
import re
from urllib.parse import urljoin, urlparse

# Third-party
import docx
import PyPDF2
import requests
from bs4 import BeautifulSoup

# --- Setup Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# --- Optional OCR Imports ---
# The OCR stack is heavy and optional; image extraction degrades gracefully
# to an explanatory error message when these packages are missing.
try:
    import cv2
    import numpy as np
    import pytesseract

    OCR_AVAILABLE = True
except ImportError:
    OCR_AVAILABLE = False
    logging.warning("OCR libraries not found (cv2, numpy, pytesseract). Text extraction from images will be disabled.")
def extract_text_from_image(image_path: str) -> str:
    """Extract text from an image file using Tesseract OCR.

    Failures are reported as human-readable error strings rather than
    raised, so callers can surface them directly.
    """
    # The optional OCR stack may not have been importable at module load.
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies are not installed. Please run 'pip install opencv-python-headless pytesseract'."

    # The Python bindings can be installed while the tesseract binary is not.
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        return "Error: Tesseract OCR is not installed or not in your PATH."

    try:
        loaded = cv2.imread(image_path)
        if loaded is None:
            # cv2.imread signals unreadable/missing files by returning None.
            return "Error: Could not read image file."
        # Tesseract generally performs better on single-channel input.
        grayscale = cv2.cvtColor(loaded, cv2.COLOR_BGR2GRAY)
        recognized = pytesseract.image_to_string(grayscale)
        return recognized.strip() or "No text found in image."
    except Exception as e:
        logging.error(f"OCR extraction failed: {e}")
        return f"Error during OCR: {e}"
def extract_text_from_file(file_path: str) -> str:
    """Extract text from a file, dispatching on its extension.

    Supports PDF, DOCX, a set of plain-text formats, and (via OCR)
    common image formats. An empty path yields an empty string; other
    problems are reported as error strings rather than raised.
    """
    if not file_path:
        return ""

    ext = os.path.splitext(file_path)[1].lower()
    plain_text_exts = {".txt", ".md", ".csv", ".html", ".css", ".js", ".py"}
    image_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tiff"}

    try:
        if ext == ".pdf":
            with open(file_path, "rb") as handle:
                pages = PyPDF2.PdfReader(handle).pages
                # extract_text() may return None for image-only pages.
                return "\n".join(page.extract_text() or "" for page in pages)
        if ext == ".docx":
            paragraphs = docx.Document(file_path).paragraphs
            return "\n".join(paragraph.text for paragraph in paragraphs)
        if ext in plain_text_exts:
            # errors="ignore": tolerate stray non-UTF-8 bytes in text files.
            with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
                return handle.read()
        if ext in image_exts:
            return extract_text_from_image(file_path)
        return f"Unsupported file type: {ext}"
    except Exception as e:
        logging.error(f"Error extracting text from {file_path}: {e}")
        return f"Error extracting text: {e}"
def extract_website_content(url: str) -> str:
    """Scrapes and returns the primary HTML content of a given URL.

    Fetches the page with a browser-like User-Agent, rewrites relative
    resource links (images, stylesheets, scripts) to absolute URLs so the
    snapshot renders standalone, and returns the prettified <body> markup
    (truncated to ~15k chars) prefixed with the original URL and page
    title as HTML comments.

    Errors are reported as strings rather than raised so callers can
    surface them directly.
    """
    try:
        # Some sites block the default python-requests UA; present a browser one.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        # Prefer content-sniffed encoding over the (often missing/wrong) header.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        # Make all resource links absolute
        for tag, attr in [('img', 'src'), ('link', 'href'), ('script', 'src')]:
            for item in soup.find_all(tag):
                if item.has_attr(attr):
                    item[attr] = urljoin(url, item[attr])

        # BUG FIX: Tag.string is None for an empty <title></title> (or a title
        # with multiple children), which previously leaked the literal "None"
        # into the output. Fall back to "N/A" in that case too.
        title = soup.title.string if soup.title and soup.title.string else "N/A"
        # Return a prettified version of the body content for context
        body_content = soup.body.prettify() if soup.body else str(soup)

        # Truncate for prompt
        if len(body_content) > 15000:
            body_content = body_content[:15000] + "\n<!-- ... HTML truncated ... -->"

        return f"<!-- Original URL: {url} -->\n<!-- Title: {title} -->\n{body_content}"

    except requests.RequestException as e:
        logging.error(f"Website extraction failed for {url}: {e}")
        return f"Error: Could not fetch content from the URL. Details: {e}"
    except Exception as e:
        logging.error(f"An unexpected error occurred during website extraction: {e}")
        return f"Error: An unexpected error occurred. Details: {e}"
ux_components.py DELETED
@@ -1,24 +0,0 @@
1
- import gradio as gr
2
- from config import DEMO_LIST
3
-
def create_top_demo_cards(input_textbox):
    """Build a Gradio column with quick-start buttons for the first 3 demos.

    Clicking a button fills *input_textbox* with that demo's description.
    Returns the gr.Column so the caller can toggle its visibility.
    """
    with gr.Column(visible=True) as quick_examples_col:
        for position, entry in enumerate(DEMO_LIST[:3]):
            card = gr.Button(
                value=entry['title'],
                variant="secondary",
                size="sm",
                elem_id=f"demo_card_{position}",  # ID for potential styling
            )
            # Bind the loop index as a default argument so each handler
            # captures its own index (avoids the late-binding closure pitfall).
            card.click(
                fn=lambda idx=position: gr.update(value=DEMO_LIST[idx]['description']),
                outputs=input_textbox,
            )
    return quick_examples_col
if __name__ == "__main__":
    # Minimal manual harness: render the demo cards against a lone textbox.
    with gr.Blocks() as demo:
        target_box = gr.Textbox(label="Input")
        create_top_demo_cards(target_box)
    demo.launch()