Spaces:
Running
Running
File size: 7,529 Bytes
413592b d6e49e1 413592b d6e49e1 e6218e4 3264dd4 d6e49e1 413592b e6218e4 191dc44 e6218e4 d6e49e1 191dc44 3264dd4 e6218e4 191dc44 e6218e4 413592b e6218e4 413592b e6218e4 413592b e6218e4 413592b d6e49e1 413592b 5e5ef17 3264dd4 d6e49e1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
import io
import mimetypes
import os
import re
import tempfile
import xml.etree.ElementTree as ET
import zipfile

import filetype
import gradio as gr
import PyPDF2
import requests
from openpyxl import load_workbook
from pptx import Presentation
# Constants
# Default chunk width for split_content(). It is applied as a slice width,
# so for str content it is a number of characters per chunk.
CHUNK_SIZE = 32000
# --- Utility Functions ---
def xml2text(xml):
    """Return the text of every element in an XML document, space-separated.

    Args:
        xml: The XML document as ``str`` or ``bytes``.

    Returns:
        The ``.text`` of each element in document order, each followed by a
        single space. Elements without text contribute nothing, and tail
        text (text that follows a child's closing tag) is not included.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed.
    """
    # NOTE(review): the standard-library ElementTree parser is not hardened
    # against maliciously constructed XML; the documents parsed here come
    # from user-supplied files, so consider a hardened parser if that matters.
    root = ET.fromstring(xml)
    # Build the result in one pass instead of repeated string concatenation.
    return ''.join(
        node.text + ' ' for node in root.iter() if node.text is not None
    )
def clean_text(content):
    """Collapse every run of whitespace in ``content`` into a single space.

    Newlines, carriage returns, tabs and any other whitespace matched by the
    regex class ``\\s`` are all treated alike. Leading and trailing
    whitespace is collapsed to one space, not stripped.

    Args:
        content: The text to normalise.

    Returns:
        The normalised text.
    """
    # ``\s+`` already matches "\n", "\r" and "\t", so separate replace()
    # passes for those characters are unnecessary.
    return re.sub(r'\s+', ' ', content)
def split_content(content, chunk_size=CHUNK_SIZE):
    """Split ``content`` into consecutive chunks of at most ``chunk_size`` items.

    Args:
        content: A sliceable sequence (typically a ``str``).
        chunk_size: Maximum length of each chunk; must be positive.

    Returns:
        A list of slices of ``content`` in order. The last chunk may be
        shorter than ``chunk_size``; empty content yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A zero step makes range() raise and a negative step yields no chunks
    # at all (silently dropping the content), so reject both up front.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    return [
        content[start:start + chunk_size]
        for start in range(0, len(content), chunk_size)
    ]
# --- Document Reading Functions ---
def extract_text_from_docx(docx_data, clean=True):
    """Extract header, body and footer text from a DOCX file.

    The text is assembled in this order: every ``word/header*.xml`` part,
    then ``word/document.xml``, then every ``word/footer*.xml`` part. Header
    and footer parts are taken in the order the archive lists them.

    Args:
        docx_data: Raw bytes of the .docx file (a ZIP archive).
        clean: When true, normalise whitespace with ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``docx_data`` is not a ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` part.
    """
    # Escape the dot and match the whole name so that only real
    # header/footer parts are selected.
    header_part = re.compile(r'word/header[0-9]*\.xml')
    footer_part = re.compile(r'word/footer[0-9]*\.xml')

    # The context manager closes the archive even if a part fails to parse.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as archive:
        part_names = archive.namelist()
        headers = [name for name in part_names if header_part.fullmatch(name)]
        footers = [name for name in part_names if footer_part.fullmatch(name)]
        ordered_parts = headers + ['word/document.xml'] + footers
        text = ''.join(xml2text(archive.read(name)) for name in ordered_parts)

    if clean:
        text = clean_text(text)
    return text, len(text)
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract speaker-note and slide text from a PPTX file.

    All ``ppt/notesSlides/notesSlide*.xml`` parts are read first, followed by
    all ``ppt/slides/slide*.xml`` parts. Within each group the parts are
    taken in the order the archive lists them.

    Args:
        pptx_data: Raw bytes of the .pptx file (a ZIP archive).
        clean: When true, normalise whitespace with ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``pptx_data`` is not a ZIP archive.
    """
    # Escape the dot and match the whole name so that only real
    # notes/slide parts are selected.
    notes_part = re.compile(r'ppt/notesSlides/notesSlide[0-9]*\.xml')
    slide_part = re.compile(r'ppt/slides/slide[0-9]*\.xml')

    # The context manager closes the archive even if a part fails to parse.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as archive:
        part_names = archive.namelist()
        notes = [name for name in part_names if notes_part.fullmatch(name)]
        slides = [name for name in part_names if slide_part.fullmatch(name)]
        text = ''.join(xml2text(archive.read(name)) for name in notes + slides)

    if clean:
        text = clean_text(text)
    return text, len(text)
def _finalize_text(content, clean):
    """Optionally clean ``content`` and pair it with its character count."""
    if clean:
        content = clean_text(content)
    return content, len(content)


def _read_pdf(file_content, clean):
    """Concatenate the extracted text of every page of a PDF."""
    reader = PyPDF2.PdfReader(io.BytesIO(file_content))
    # Guard against a page yielding None/empty text so that one textless
    # page does not abort the whole document.
    pages = [page.extract_text() or '' for page in reader.pages]
    return _finalize_text(''.join(pages), clean)


def _read_xlsx(file_content, clean):
    """Join every non-empty cell of every worksheet, each followed by a space."""
    workbook = load_workbook(io.BytesIO(file_content))
    cells = []
    for sheet in workbook.worksheets:
        for row in sheet.rows:
            for cell in row:
                if cell.value is not None:
                    cells.append(str(cell.value) + ' ')
    return _finalize_text(''.join(cells), clean)


def _read_utf8_text(file_content, clean):
    """Decode the raw bytes as UTF-8 text."""
    return _finalize_text(file_content.decode('utf-8'), clean)


def read_document(file_path, clean=True):
    """Read a file and return its text content and character count.

    The format is detected from the file's bytes with ``filetype.guess``.
    PDF, XLSX, DOCX and PPTX have dedicated readers; anything else,
    including content that ``filetype`` cannot identify, is decoded as
    UTF-8 text.

    Args:
        file_path: Path of the file to read.
        clean: When true, normalise whitespace with ``clean_text``.

    Returns:
        A ``(content, length)`` tuple. ``length`` is ``len(content)``, the
        same measure the DOCX and PPTX readers report. If reading fails,
        ``content`` is an error message and ``length`` is 0.

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        file_content = f.read()

    detected = filetype.guess(file_content)
    mime = detected.mime if detected is not None else "text"

    # MIME type -> (reader, label used in the error message).
    # NOTE(review): filetype appears to detect formats by binary signature,
    # so "text/plain" and "text/csv" are probably never reported; the
    # entries are kept so their error messages stay available -- confirm.
    readers = {
        "application/pdf": (_read_pdf, "PDF"),
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
            (_read_xlsx, "XLSX"),
        "text/plain": (_read_utf8_text, "TXT file"),
        "text/csv": (_read_utf8_text, "CSV file"),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            (extract_text_from_docx, "DOCX"),
        "application/vnd.openxmlformats-officedocument.presentationml.presentation":
            (extract_text_from_pptx, "PPTX"),
    }
    reader, label = readers.get(mime, (_read_utf8_text, "file"))

    # This is the boundary where parser failures become user-facing
    # messages, so a broad catch is intentional here.
    try:
        return reader(file_content, clean)
    except Exception as e:
        return f"Error reading {label}: {e}", 0
# Seconds to wait for the server to connect / send data before giving up.
_DOWNLOAD_TIMEOUT_SECONDS = 30


def download_and_process_file(url, clean=True):
    """Download the file at ``url`` and return its extracted text.

    The response body is streamed into a uniquely named temporary file,
    which is removed again before returning. Images are not parsed; for
    them a Markdown image reference to ``url`` is returned instead.

    Args:
        url: Address of the file. ``http://`` is prepended when no
            ``http://`` or ``https://`` scheme is present.
        clean: When true, normalise whitespace in the extracted text.

    Returns:
        A ``(content, length)`` tuple. For documents this is the result of
        ``read_document``; for images it is ``("![](<url>)", 0)``; on
        failure it is ``(error message, 0)``.
    """
    url = url.strip()
    if not url.startswith(("http://", "https://")):
        url = "http://" + url  # Prepend "http://" if not present

    # NOTE(review): this fetches arbitrary user-supplied URLs from the
    # server; restrict target hosts if internal services are reachable.
    temp_path = None
    try:
        # The context manager releases the connection; the timeout keeps a
        # stalled server from blocking the request forever.
        with requests.get(
            url, stream=True, timeout=_DOWNLOAD_TIMEOUT_SECONDS
        ) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            # A generated temp name avoids empty or colliding URL-derived
            # names when several requests run at once.
            with tempfile.NamedTemporaryFile(delete=False) as tmp:
                temp_path = tmp.name
                for chunk in response.iter_content(chunk_size=8192000):
                    tmp.write(chunk)

        # Check if it's an image type
        kind = filetype.guess(temp_path)
        if kind and kind.mime.startswith('image/'):
            return f"![]({url})", 0  # Markdown image syntax
        return read_document(temp_path, clean)  # Otherwise, process as a document
    except requests.exceptions.MissingSchema:
        return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
    except requests.exceptions.ConnectionError:
        return "Error: Could not connect to the server. Please check your internet connection.", 0
    except requests.exceptions.RequestException as e:
        return f"Error downloading file: {e}", 0
    except OSError as e:
        return f"Error saving or reading downloaded file: {e}", 0
    finally:
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not mask
                # the result or the original error.
                pass
# --- Gradio Interface ---
# Web UI: takes a URL and a "clean" flag, shows the extracted text (or the
# image Markdown) together with its length.
iface = gr.Interface(
    fn=download_and_process_file,
    inputs=[
        gr.Textbox(lines=1, placeholder="Enter URL of the file"),
        gr.Checkbox(label="Clean Text", value=True),
    ],
    outputs=[
        gr.Textbox(label="Document Content/Image Markdown"),
        gr.Number(label="Document Length (characters)"),
    ],
    title="Enhanced File Processor for Hugging Face Chat Tools",
    # Adjacent literals are concatenated verbatim, so the separator between
    # the two sentences has to be written out explicitly.
    description=(
        "Enter the URL of site and extract its content. "
        "This tool is designed for use with Hugging Face Chat Tools: "
        "[https://hf.co/chat/tools/66f1a8159d41ad4398ebb711](https://hf.co/chat/tools/66f1a8159d41ad4398ebb711)"
    ),
    # NOTE(review): None presumably lifts the per-event concurrency cap --
    # confirm against the Gradio version in use.
    concurrency_limit=None,
)
iface.launch()