Spaces:

KingNish
/

Doc-Reader-and-Chat

Running

App Files Files Community

KingNish committed on Sep 19, 2024

Commit

3e87e84

verified ·

1 Parent(s): a1bf1bb

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -62

app.py CHANGED Viewed

@@ -8,44 +8,61 @@ import re
 import zipfile
 import xml.etree.ElementTree as ET
 def xml2text(xml):
     text = u''
     root = ET.fromstring(xml)
     for child in root.iter():
         text += child.text + " " if child.text is not None else ''
     return text
-def extract_text_from_docx(docx_data):
     text = u''
     zipf = zipfile.ZipFile(io.BytesIO(docx_data))
     filelist = zipf.namelist()
-    header_xmls = 'word/header[0-9]*.xml'
     for fname in filelist:
-        if re.match(header_xmls, fname):
             text += xml2text(zipf.read(fname))
-    doc_xml = 'word/document.xml'
-    text += xml2text(zipf.read(doc_xml))
-    footer_xmls = 'word/footer[0-9]*.xml'
-    for fname in filelist:
-        if re.match(footer_xmls, fname):
             text += xml2text(zipf.read(fname))
     zipf.close()
-    return text.strip()
-# Initialize the Mistral chat model
-client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
-def read_document(file):
-    file_path = file.name  # Get the file path from NamedString
     file_extension = file_path.split('.')[-1].lower()
-    with open(file_path, "rb") as f:  # Open the file in binary read mode
         file_content = f.read()
     if file_extension == 'pdf':
@@ -54,6 +71,8 @@ def read_document(file):
             content = ''
             for page in range(len(pdf_reader.pages)):
                 content += pdf_reader.pages[page].extract_text()
             return content
         except Exception as e:
             return f"Error reading PDF: {e}"
@@ -67,6 +86,8 @@ def read_document(file):
                     for cell in row:
                         if cell.value is not None:
                             content += str(cell.value) + ' '
             return content
         except Exception as e:
             return f"Error reading XLSX: {e}"
@@ -79,48 +100,44 @@ def read_document(file):
                 for shape in slide.shapes:
                     if hasattr(shape, "text"):
                         content += shape.text + ' '
             return content
         except Exception as e:
             return f"Error reading PPTX: {e}"
     elif file_extension == 'doc' or file_extension == 'docx':
         try:
-            return extract_text_from_docx(file_content)
         except Exception as e:
             return f"Error reading DOC/DOCX: {e}"
     else:
         try:
             content = file_content.decode('utf-8')
             return content
         except Exception as e:
             return f"Error reading file: {e}"
-def split_content(content, chunk_size=32000):
     chunks = []
-    for i in range(0, len(content), chunk_size):
-        chunks.append(content[i:i + chunk_size])
     return chunks
-def chat_document(file, question):
-    content = str(read_document(file))
-    if len(content) > 32000:
-        content = content.replace('\n', ' ')
-        content = content.replace('\r', ' ')
-        content = content.replace('\t', ' ')
-        content = content.replace('  ', '')
-        content = content.strip()
-        content = content[:32000]
-    # Define system prompt for the chat API
-    system_prompt = """
-    You are a helpful and informative assistant that can answer questions based on the content of documents.
-    You will receive the content of a document and a question about it.
-    Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
-    If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
-    """
-    message = f"""[INST] [SYSTEM] {system_prompt}
     Document Content: {content}
     Question: {question}
     Answer:"""
@@ -133,27 +150,15 @@ def chat_document(file, question):
         yield output
-def chat_document_v2(file, question):
-    content = str(read_document(file))
-    content = content.replace('\n', ' ')
-    content = content.replace('\r', ' ')
-    content = content.replace('\t', ' ')
-    content = content.replace('  ', '')
-    content = content.strip()
     chunks = split_content(content)
-    # Define system prompt for the chat API
-    system_prompt = """
-    You are a helpful and informative assistant that can answer questions based on the content of documents.
-    You will receive the content of a document and a question about it.
-    Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
-    If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
-    """
     all_answers = []
     for chunk in chunks:
-        message = f"""[INST] [SYSTEM] {system_prompt}
-        Document Content: {chunk[:32000]}
         Question: {question}
         Answer:"""
@@ -191,7 +196,7 @@ with gr.Blocks() as demo:
         with gr.TabItem("Document Reader"):
             iface1 = gr.Interface(
                 fn=read_document,
-                inputs=gr.File(label="Upload a Document"),
                 outputs=gr.Textbox(label="Document Content"),
                 title="Document Reader",
                 description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content."
@@ -199,7 +204,7 @@ with gr.Blocks() as demo:
         with gr.TabItem("Document Chat"):
             iface2 = gr.Interface(
                 fn=chat_document,
-                inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question")],
                 outputs=gr.Markdown(label="Answer"),
                 title="Document Chat",
                 description="Upload a document and ask questions about its content."
@@ -207,10 +212,10 @@ with gr.Blocks() as demo:
         with gr.TabItem("Document Chat V2"):
             iface3 = gr.Interface(
                 fn=chat_document_v2,
-                inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question")],
                 outputs=gr.Markdown(label="Answer"),
                 title="Document Chat V2",
                 description="Upload a document and ask questions about its content (using chunk-based approach)."
             )
-demo.launch()

 import zipfile
 import xml.etree.ElementTree as ET
+# Constants
+CHUNK_SIZE = 32000
+SYSTEM_PROMPT = """
+You are a helpful and informative assistant that can answer questions based on the content of documents.
+You will receive the content of a document and a question about it.
+Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
+If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
+"""
+# Initialize the Mistral chat model
+client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
 def xml2text(xml):
+    """Extracts text from XML data."""
     text = u''
     root = ET.fromstring(xml)
     for child in root.iter():
         text += child.text + " " if child.text is not None else ''
     return text
def extract_text_from_docx(docx_data, strip_content):
    """Extract header, footer and body text from the bytes of a DOCX file.

    Args:
        docx_data: Raw bytes of the ``.docx`` (ZIP) archive.
        strip_content: When truthy, the extracted text is passed through
            ``strip_text`` before the length footer is appended.

    Returns:
        The extracted text followed by a blank line and a
        ``**Document Length:** N characters`` line, where ``N`` is the length
        of the (possibly stripped) text.

    Raises:
        zipfile.BadZipFile: If ``docx_data`` is not a valid ZIP archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    parts = []
    # The context manager closes the archive even when a member is missing
    # or its XML fails to parse.
    with zipfile.ZipFile(io.BytesIO(docx_data)) as archive:
        for member in archive.namelist():
            # Header and footer parts, in archive order. ``fullmatch`` with an
            # escaped dot keeps look-alike names such as
            # ``word/header1.xml.rels`` from matching.
            if re.fullmatch(r"word/(?:header|footer)[0-9]*\.xml", member):
                parts.append(xml2text(archive.read(member)))
        # The main document body goes after the headers and footers.
        parts.append(xml2text(archive.read("word/document.xml")))
    text = "".join(parts)
    if strip_content:
        text = strip_text(text)
    return f"{text}\n\n**Document Length:** {len(text)} characters"
def strip_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim it.

    Newlines, carriage returns, tabs and repeated spaces all become a single
    space, so words on either side of a blank line or a double space stay
    separated instead of being glued together.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` with whitespace runs replaced by single spaces and with no
        leading or trailing whitespace.
    """
    # ``str.split()`` with no argument splits on any whitespace run and drops
    # empty pieces, so joining with one space normalizes in a single pass.
    return " ".join(text.split())
+def read_document(file, strip_content):
+    """Reads the content of a document based on its file type."""
+    file_path = file.name
     file_extension = file_path.split('.')[-1].lower()
+    with open(file_path, "rb") as f:
         file_content = f.read()
     if file_extension == 'pdf':
             content = ''
             for page in range(len(pdf_reader.pages)):
                 content += pdf_reader.pages[page].extract_text()
+            if strip_content:
+                content = strip_text(content)
             return content
         except Exception as e:
             return f"Error reading PDF: {e}"
                     for cell in row:
                         if cell.value is not None:
                             content += str(cell.value) + ' '
+            if strip_content:
+                content = strip_text(content)
             return content
         except Exception as e:
             return f"Error reading XLSX: {e}"
                 for shape in slide.shapes:
                     if hasattr(shape, "text"):
                         content += shape.text + ' '
+            if strip_content:
+                content = strip_text(content)
             return content
         except Exception as e:
             return f"Error reading PPTX: {e}"
     elif file_extension == 'doc' or file_extension == 'docx':
         try:
+            return extract_text_from_docx(file_content, strip_content)
         except Exception as e:
             return f"Error reading DOC/DOCX: {e}"
     else:
         try:
             content = file_content.decode('utf-8')
+            if strip_content:
+                content = strip_text(content)
             return content
         except Exception as e:
             return f"Error reading file: {e}"
def split_content(content, chunk_size=None):
    """Split ``content`` into consecutive chunks for processing.

    Args:
        content: The sequence (normally a string) to split.
        chunk_size: Maximum length of each chunk. Defaults to the module-level
            ``CHUNK_SIZE`` when omitted or ``None``.

    Returns:
        A list of slices of ``content`` in order; every chunk has
        ``chunk_size`` items except possibly the last. Empty ``content``
        gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size is None:
        # Resolved at call time so the module constant stays the single
        # source of truth for the default.
        chunk_size = CHUNK_SIZE
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [
        content[start:start + chunk_size]
        for start in range(0, len(content), chunk_size)
    ]
+def chat_document(file, question, strip_content):
+    """Handles chat with a document using Mistral."""
+    content = str(read_document(file, strip_content))
+    if len(content) > CHUNK_SIZE:
+        content = content[:CHUNK_SIZE]
+    message = f"""[INST] [SYSTEM] {SYSTEM_PROMPT}
     Document Content: {content}
     Question: {question}
     Answer:"""
         yield output
+def chat_document_v2(file, question, strip_content):
+    """Handles chat with a document using Mistral and chunk-based approach."""
+    content = str(read_document(file, strip_content))
     chunks = split_content(content)
     all_answers = []
     for chunk in chunks:
+        message = f"""[INST] [SYSTEM] {SYSTEM_PROMPT}
+        Document Content: {chunk[:CHUNK_SIZE]}
         Question: {question}
         Answer:"""
         with gr.TabItem("Document Reader"):
             iface1 = gr.Interface(
                 fn=read_document,
+                inputs=[gr.File(label="Upload a Document"), gr.Checkbox(label="Strip Content", value=True)],
                 outputs=gr.Textbox(label="Document Content"),
                 title="Document Reader",
                 description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content."
         with gr.TabItem("Document Chat"):
             iface2 = gr.Interface(
                 fn=chat_document,
+                inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question"), gr.Checkbox(label="Strip Content", value=True)],
                 outputs=gr.Markdown(label="Answer"),
                 title="Document Chat",
                 description="Upload a document and ask questions about its content."
         with gr.TabItem("Document Chat V2"):
             iface3 = gr.Interface(
                 fn=chat_document_v2,
+                inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question"), gr.Checkbox(label="Strip Content", value=True)],
                 outputs=gr.Markdown(label="Answer"),
                 title="Document Chat V2",
                 description="Upload a document and ask questions about its content (using chunk-based approach)."
             )
+demo.launch()