Spaces:

KingNish
/

Doc-Reader-and-Chat

Running

File size: 5,810 Bytes

217892e
 
 
 
 
a1654f3
 
 
dbc91d5
a1654f3
3e87e84
 
 
a74d94b
 
a1654f3
3e87e84
a1654f3
 
 
 
 
 
a74d94b
 
a007d1e
 
 
459ea62
a74d94b
 
 
 
 
 
 
 
 
 
 
 
 
 
a1654f3
 
a74d94b
a1654f3
 
a74d94b
a1654f3
a74d94b
a1654f3
217892e
a74d94b
 
 
 
 
 
 
217892e
a74d94b
a007d1e
a74d94b
 
3e87e84
9686871
 
 
bf4e8a9
9686871
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e87e84
a74d94b
3e87e84
217892e
3e87e84
217892e
 
dbc91d5
 
f1bdb57
 
 
 
dbc91d5
 
217892e
 
 
 
 
a74d94b
 
558887c
217892e
a74d94b
cc4a32a
217892e
 
 
 
 
 
a74d94b
8a16657
a007d1e
a74d94b
558887c
217892e
cc4a32a
dbc91d5
217892e
dbc91d5
 
 
558887c
217892e
dbc91d5
 
 
 
 
 
558887c
dbc91d5
 
cc4a32a
217892e
a74d94b
217892e
cc4a32a
 
dbc91d5
 
 
cc4a32a
 
217892e
 
a74d94b
 
 
558887c
217892e
a74d94b
217892e
3e87e84
a74d94b
c87c622
e27d06b

import PyPDF2
from openpyxl import load_workbook
from pptx import Presentation
import gradio as gr
import io
import re
import zipfile
import xml.etree.ElementTree as ET
import filetype

# Constants
# Default value of split_content()'s chunk_size parameter.
CHUNK_SIZE = 32000

# --- Utility Functions ---

def xml2text(xml):
    """Return the text of every element in an XML document, space-terminated.

    The document is parsed and walked in document order (root included).
    Each element whose ``text`` is not ``None`` contributes that text
    followed by a single space; element tails are not included.
    """
    tree_root = ET.fromstring(xml)
    pieces = [
        node.text + " "
        for node in tree_root.iter()
        if node.text is not None
    ]
    return u''.join(pieces)

def clean_text(content):
    """Collapse every run of whitespace in ``content`` into one space.

    Newlines, carriage returns and tabs are whitespace, so they are folded
    into the same single space as ordinary blanks. Leading and trailing
    whitespace is reduced to one space, not stripped.
    """
    return re.sub(r'\s+', ' ', content)


def split_content(content, chunk_size=CHUNK_SIZE):
    """Return consecutive slices of ``content``, each at most ``chunk_size`` long.

    The final slice holds whatever remains and may be shorter; empty
    content produces an empty list.
    """
    return [
        content[start:start + chunk_size]
        for start in range(0, len(content), chunk_size)
    ]

# --- Document Reading Functions ---

def extract_text_from_docx(docx_data, clean=True):
    """Extract the text of a DOCX file supplied as raw bytes.

    Header parts are read first, then the main document body, then footer
    parts; each XML part is converted with ``xml2text`` and the results are
    concatenated in that order.

    Args:
        docx_data: Bytes of the DOCX (zip) archive.
        clean: When true, the combined text is passed through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``docx_data`` is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # Dots are escaped and the whole name must match, so only members such
    # as ``word/header1.xml`` qualify (not names that merely start that way).
    header_pattern = r'word/header[0-9]*\.xml'
    footer_pattern = r'word/footer[0-9]*\.xml'
    doc_xml = 'word/document.xml'

    parts = []
    # The context manager closes the archive even when a read or parse fails.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as zipf:
        filelist = zipf.namelist()

        for fname in filelist:
            if re.fullmatch(header_pattern, fname):
                parts.append(xml2text(zipf.read(fname)))

        parts.append(xml2text(zipf.read(doc_xml)))

        for fname in filelist:
            if re.fullmatch(footer_pattern, fname):
                parts.append(xml2text(zipf.read(fname)))

    text = u''.join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)

def extract_text_from_pptx(pptx_data, clean=True):
    """Extract the text of a PPTX file supplied as raw bytes.

    Speaker-notes parts are read first, then slide parts; each XML part is
    converted with ``xml2text`` and the results are concatenated. Parts are
    visited in the order the archive lists them.

    Args:
        pptx_data: Bytes of the PPTX (zip) archive.
        clean: When true, the combined text is passed through ``clean_text``.

    Returns:
        A ``(text, length)`` tuple where ``length`` is ``len(text)``.

    Raises:
        zipfile.BadZipFile: If ``pptx_data`` is not a valid zip archive.
    """
    # Dots are escaped and the whole name must match, so only members such
    # as ``ppt/slides/slide3.xml`` qualify.
    notes_pattern = r'ppt/notesSlides/notesSlide[0-9]*\.xml'
    slide_pattern = r'ppt/slides/slide[0-9]*\.xml'

    parts = []
    # The context manager closes the archive even when a read or parse fails.
    with zipfile.ZipFile(io.BytesIO(pptx_data)) as zipf:
        filelist = zipf.namelist()

        # Extract text from slide notes
        for fname in filelist:
            if re.fullmatch(notes_pattern, fname):
                parts.append(xml2text(zipf.read(fname)))

        # Extract text from slide content (shapes and text boxes)
        for fname in filelist:
            if re.fullmatch(slide_pattern, fname):
                parts.append(xml2text(zipf.read(fname)))

    text = u''.join(parts)
    if clean:
        text = clean_text(text)
    return text, len(text)

def read_document(file, clean=True):
    """Read an uploaded document and return its text and character count.

    The file's bytes are loaded and their type is guessed with
    ``filetype.guess``. PDF, XLSX, DOCX and PPTX content is handled by a
    dedicated reader; anything else (including an unrecognised type) is
    decoded as UTF-8 text.

    Args:
        file: Object whose ``name`` attribute is the path of the file to
            read; a plain path is also accepted. ``None`` yields an error
            result instead of raising.
        clean: When true, whitespace in the extracted text is normalised
            with ``clean_text``.

    Returns:
        A ``(content, length)`` tuple. On success ``length`` is
        ``len(content)``; on failure ``content`` is an error message and
        ``length`` is ``0``.
    """
    if file is None:
        return "Error reading file: no file was provided", 0

    # Fall back to the argument itself when it carries no ``name`` attribute.
    file_path = getattr(file, "name", file)

    with open(file_path, "rb") as f:
        file_content = f.read()

    kind = filetype.guess(file_content)
    # An unrecognised type is routed to the generic UTF-8 branch below.
    mime = "text" if kind is None else kind.mime

    if mime == "application/pdf":
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # ``or ''`` keeps a page with no extractable text from aborting
            # the whole document.
            content = ''.join(
                page.extract_text() or '' for page in pdf_reader.pages
            )
        except Exception as e:
            return f"Error reading PDF: {e}", 0
    elif mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
        try:
            wb = load_workbook(io.BytesIO(file_content))
            cells = []
            for sheet in wb.worksheets:
                for row in sheet.rows:
                    for cell in row:
                        if cell.value is not None:
                            cells.append(str(cell.value) + ' ')
            content = ''.join(cells)
        except Exception as e:
            return f"Error reading XLSX: {e}", 0
    elif mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        try:
            return extract_text_from_docx(file_content, clean)
        except Exception as e:
            return f"Error reading DOCX: {e}", 0
    elif mime == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
        try:
            return extract_text_from_pptx(file_content, clean)
        except Exception as e:
            return f"Error reading PPTX: {e}", 0
    else:
        # Plain text, CSV and every other type share the same UTF-8 decode;
        # only the label in the error message differs.
        if mime == "text/plain":
            label = "TXT file"
        elif mime == "text/csv":
            label = "CSV file"
        else:
            label = "file"
        try:
            content = file_content.decode('utf-8')
        except Exception as e:
            return f"Error reading {label}: {e}", 0

    if clean:
        content = clean_text(content)
    # Count the characters of the text itself, matching the DOCX/PPTX
    # readers, rather than of its repr (which adds quotes and escapes).
    return content, len(content)


# --- Gradio Interface ---

# The two inputs are passed positionally to read_document(file, clean);
# the two outputs display the (content, length) tuple it returns.
iface = gr.Interface(
    fn=read_document,
    inputs=[
        gr.File(label="Upload a Document"),
        gr.Checkbox(label="Clean Text", value=True),
    ],
    outputs=[
        gr.Textbox(label="Document Content"),
        gr.Number(label="Document Length (characters)"),
    ],
    title="Better Document Reader for Hugging Face Chat Tools",
    # Adjacent literals are concatenated, so each piece ends with the
    # separator it needs before the next one.
    description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content. "
                "This tool is designed for use with Hugging Face Chat Tools: "
                "[https://hf.co/chat/tools/66ed8236a35891a61e2bfcf2](https://hf.co/chat/tools/66ed8236a35891a61e2bfcf2)",
    concurrency_limit=None,
)

iface.launch()