Spaces:
Running
Running
File size: 5,810 Bytes
217892e a1654f3 dbc91d5 a1654f3 3e87e84 a74d94b a1654f3 3e87e84 a1654f3 a74d94b a007d1e 459ea62 a74d94b a1654f3 a74d94b a1654f3 a74d94b a1654f3 a74d94b a1654f3 217892e a74d94b 217892e a74d94b a007d1e a74d94b 3e87e84 9686871 bf4e8a9 9686871 3e87e84 a74d94b 3e87e84 217892e 3e87e84 217892e dbc91d5 f1bdb57 dbc91d5 217892e a74d94b 558887c 217892e a74d94b cc4a32a 217892e a74d94b 8a16657 a007d1e a74d94b 558887c 217892e cc4a32a dbc91d5 217892e dbc91d5 558887c 217892e dbc91d5 558887c dbc91d5 cc4a32a 217892e a74d94b 217892e cc4a32a dbc91d5 cc4a32a 217892e a74d94b 558887c 217892e a74d94b 217892e 3e87e84 a74d94b c87c622 e27d06b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
import PyPDF2
from openpyxl import load_workbook
from pptx import Presentation
import gradio as gr
import io
import re
import zipfile
import xml.etree.ElementTree as ET
import filetype
# Constants
# Default chunk length used by split_content() when the caller does not pass
# an explicit chunk_size.
CHUNK_SIZE = 32000
# --- Utility Functions ---
def xml2text(xml):
    """Return the text of every element in an XML document, space-separated.

    Args:
        xml: The XML document as ``str`` or ``bytes``; it is parsed with
            ``ET.fromstring``.

    Returns:
        The ``text`` of each element that has one, in iteration order of
        ``root.iter()``, each followed by a single space. Tail text (text
        that follows a closing tag) is not included.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed.
    """
    root = ET.fromstring(xml)
    # Assemble the result with one join rather than repeated ``+=`` so large
    # documents do not pay for quadratic string concatenation.
    return ''.join(
        node.text + ' ' for node in root.iter() if node.text is not None
    )
def clean_text(content):
    """Collapse every run of whitespace in ``content`` into a single space.

    Newlines, carriage returns, tabs and repeated spaces all become one
    space. Leading and trailing whitespace is collapsed to a single space
    but is not stripped.

    Args:
        content: The text to normalise.

    Returns:
        The normalised text.
    """
    # ``\s+`` already matches \n, \r and \t, so no separate replace() passes
    # are needed before the substitution.
    return re.sub(r'\s+', ' ', content)
def split_content(content, chunk_size=CHUNK_SIZE):
    """Break ``content`` into consecutive slices of at most ``chunk_size`` items.

    The last slice holds whatever remains and may be shorter than the
    others; empty content produces an empty list.
    """
    starts = range(0, len(content), chunk_size)
    return [content[start:start + chunk_size] for start in starts]
# --- Document Reading Functions ---
def extract_text_from_docx(docx_data, clean=True):
    """Extract header, body and footer text from a DOCX file.

    The parts are read in this order: every ``word/header*.xml`` part, then
    ``word/document.xml``, then every ``word/footer*.xml`` part. Headers and
    footers are taken in the order the archive lists them.

    Args:
        docx_data: Raw bytes of the .docx file (a ZIP archive).
        clean: When true, pass the extracted text through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``docx_data`` is not a ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` part.
        Other errors raised while parsing a part's XML propagate unchanged.
    """
    # fullmatch with an escaped dot accepts exactly "header<digits>.xml" /
    # "footer<digits>.xml" and nothing that merely starts that way.
    header_pattern = re.compile(r'word/header[0-9]*\.xml')
    footer_pattern = re.compile(r'word/footer[0-9]*\.xml')

    # The context manager closes the archive even when a part fails to
    # read or parse.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as archive:
        names = archive.namelist()
        parts = [name for name in names if header_pattern.fullmatch(name)]
        parts.append('word/document.xml')
        parts.extend(name for name in names if footer_pattern.fullmatch(name))
        text = ''.join(xml2text(archive.read(name)) for name in parts)

    if clean:
        text = clean_text(text)
    return text, len(text)
def extract_text_from_pptx(pptx_data, clean=True):
    """Extract slide-note and slide text from a PPTX file.

    All ``ppt/notesSlides/notesSlide*.xml`` parts are read first, followed
    by all ``ppt/slides/slide*.xml`` parts; within each group the parts are
    taken in the order the archive lists them.

    Args:
        pptx_data: Raw bytes of the .pptx file (a ZIP archive).
        clean: When true, pass the extracted text through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``pptx_data`` is not a ZIP archive.
        Other errors raised while parsing a part's XML propagate unchanged.
    """
    # fullmatch with an escaped dot accepts exactly "<name><digits>.xml"
    # and nothing that merely starts that way.
    notes_pattern = re.compile(r'ppt/notesSlides/notesSlide[0-9]*\.xml')
    slide_pattern = re.compile(r'ppt/slides/slide[0-9]*\.xml')

    # The context manager closes the archive even when a part fails to
    # read or parse.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as archive:
        names = archive.namelist()
        # Slide notes first, then slide content (shapes and text boxes).
        parts = [name for name in names if notes_pattern.fullmatch(name)]
        parts.extend(name for name in names if slide_pattern.fullmatch(name))
        text = ''.join(xml2text(archive.read(name)) for name in parts)

    if clean:
        text = clean_text(text)
    return text, len(text)
def _finalize_text(content, clean):
    """Optionally clean ``content`` and pair it with its character count."""
    if clean:
        content = clean_text(content)
    return content, len(content)


def read_document(file, clean=True):
    """Read an uploaded document and return its text and its length.

    The file type is sniffed from the file's bytes with ``filetype.guess``.
    PDF, XLSX, DOCX and PPTX get dedicated extraction; every other type
    (including files whose type cannot be guessed) is decoded as UTF-8 text.

    Args:
        file: The uploaded file: either an object whose ``name`` attribute
            holds the path on disk, or the path itself.
        clean: When true, collapse whitespace in the result via
            ``clean_text``.

    Returns:
        A ``(content, length)`` tuple where ``length`` is the number of
        characters in ``content``. If extraction fails, or no file was
        given, the tuple is ``(error message, 0)`` instead.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if file is None:
        # Nothing was uploaded; report it in the same (message, 0) shape
        # used for every other failure instead of raising AttributeError.
        return "Error reading file: no file was provided", 0

    # Accept both upload objects exposing ``.name`` and plain path strings.
    file_path = getattr(file, "name", file)
    with open(file_path, "rb") as f:
        file_content = f.read()

    kind = filetype.guess(file_content)
    # An unrecognised signature is treated as text and handled by the
    # UTF-8 fallback at the bottom.
    mime = "text" if kind is None else kind.mime

    if mime == "application/pdf":
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # ``or ''`` guards against a page that yields no text object.
            content = ''.join(
                page.extract_text() or '' for page in pdf_reader.pages
            )
            return _finalize_text(content, clean)
        except Exception as e:
            return f"Error reading PDF: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        try:
            wb = load_workbook(io.BytesIO(file_content))
            cells = []
            for sheet in wb.worksheets:
                for row in sheet.rows:
                    for cell in row:
                        if cell.value is not None:
                            cells.append(str(cell.value) + ' ')
            return _finalize_text(''.join(cells), clean)
        except Exception as e:
            return f"Error reading XLSX: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        try:
            return extract_text_from_docx(file_content, clean)
        except Exception as e:
            return f"Error reading DOCX: {e}", 0

    if mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        try:
            return extract_text_from_pptx(file_content, clean)
        except Exception as e:
            return f"Error reading PPTX: {e}", 0

    # Plain text, CSV and every other type share one UTF-8 decode path;
    # only the wording of the error message differs between them.
    label = {"text/plain": "TXT file", "text/csv": "CSV file"}.get(mime, "file")
    try:
        return _finalize_text(file_content.decode('utf-8'), clean)
    except Exception as e:
        return f"Error reading {label}: {e}", 0
# --- Gradio Interface ---
# Web UI: one file upload plus a "clean" checkbox in, the extracted text and
# its length out. The two inputs map positionally onto
# read_document(file, clean).
iface = gr.Interface(
    fn=read_document,
    inputs=[
        gr.File(label="Upload a Document"),
        gr.Checkbox(label="Clean Text", value=True),
    ],
    outputs=[
        gr.Textbox(label="Document Content"),
        gr.Number(label="Document Length (characters)"),
    ],
    title="Better Document Reader for Hugging Face Chat Tools",
    # Adjacent string literals are concatenated as-is, so the first one ends
    # with a space to keep the two sentences apart.
    description=(
        "Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content. "
        "This tool is designed for use with Hugging Face Chat Tools: "
        "[https://hf.co/chat/tools/66ed8236a35891a61e2bfcf2](https://hf.co/chat/tools/66ed8236a35891a61e2bfcf2)"
    ),
    concurrency_limit=None,
)
iface.launch()