KingNish committed
Commit 3e87e84 · verified · 1 Parent(s): a1bf1bb

Update app.py

Files changed (1)
  1. app.py +67 -62
app.py CHANGED
@@ -8,44 +8,61 @@ import re
 import zipfile
 import xml.etree.ElementTree as ET
 
+# Constants
+CHUNK_SIZE = 32000
+SYSTEM_PROMPT = """
+You are a helpful and informative assistant that can answer questions based on the content of documents.
+You will receive the content of a document and a question about it.
+Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
+If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
+"""
+
+# Initialize the Mistral chat model
+client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
+
 def xml2text(xml):
+    """Extracts text from XML data."""
     text = u''
     root = ET.fromstring(xml)
     for child in root.iter():
         text += child.text + " " if child.text is not None else ''
     return text
 
-
-def extract_text_from_docx(docx_data):
+def extract_text_from_docx(docx_data, strip_content):
+    """Extracts text from a DOCX file."""
     text = u''
     zipf = zipfile.ZipFile(io.BytesIO(docx_data))
-
     filelist = zipf.namelist()
 
-    header_xmls = 'word/header[0-9]*.xml'
     for fname in filelist:
-        if re.match(header_xmls, fname):
+        if re.match('word/header[0-9]*.xml', fname):
             text += xml2text(zipf.read(fname))
-
-    doc_xml = 'word/document.xml'
-    text += xml2text(zipf.read(doc_xml))
-
-    footer_xmls = 'word/footer[0-9]*.xml'
-    for fname in filelist:
-        if re.match(footer_xmls, fname):
+        elif re.match('word/footer[0-9]*.xml', fname):
             text += xml2text(zipf.read(fname))
-
+
+    text += xml2text(zipf.read('word/document.xml'))
     zipf.close()
-    return text.strip()
 
-# Initialize the Mistral chat model
-client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
+    if strip_content:
+        text = strip_text(text)
+
+    return f"{text}\n\n**Document Length:** {len(text)} characters"
 
-def read_document(file):
-    file_path = file.name # Get the file path from NamedString
+
+def strip_text(text):
+    """Strips unnecessary characters from text."""
+    content = text.replace('\n', ' ')
+    content = content.replace('\r', ' ')
+    content = content.replace('\t', ' ')
+    content = content.replace(' ', '')
+    return content.strip()
+
+def read_document(file, strip_content):
+    """Reads the content of a document based on its file type."""
+    file_path = file.name
     file_extension = file_path.split('.')[-1].lower()
 
-    with open(file_path, "rb") as f: # Open the file in binary read mode
+    with open(file_path, "rb") as f:
         file_content = f.read()
 
     if file_extension == 'pdf':
@@ -54,6 +71,8 @@ def read_document(file):
             content = ''
             for page in range(len(pdf_reader.pages)):
                 content += pdf_reader.pages[page].extract_text()
+            if strip_content:
+                content = strip_text(content)
             return content
         except Exception as e:
             return f"Error reading PDF: {e}"
@@ -67,6 +86,8 @@ def read_document(file):
                     for cell in row:
                         if cell.value is not None:
                             content += str(cell.value) + ' '
+            if strip_content:
+                content = strip_text(content)
             return content
         except Exception as e:
             return f"Error reading XLSX: {e}"
@@ -79,48 +100,44 @@ def read_document(file):
                 for shape in slide.shapes:
                     if hasattr(shape, "text"):
                         content += shape.text + ' '
+            if strip_content:
+                content = strip_text(content)
             return content
         except Exception as e:
             return f"Error reading PPTX: {e}"
 
     elif file_extension == 'doc' or file_extension == 'docx':
         try:
-            return extract_text_from_docx(file_content)
+            return extract_text_from_docx(file_content, strip_content)
         except Exception as e:
             return f"Error reading DOC/DOCX: {e}"
 
     else:
         try:
             content = file_content.decode('utf-8')
+            if strip_content:
+                content = strip_text(content)
             return content
         except Exception as e:
             return f"Error reading file: {e}"
 
-def split_content(content, chunk_size=32000):
+
+def split_content(content):
+    """Splits content into chunks for processing."""
     chunks = []
-    for i in range(0, len(content), chunk_size):
-        chunks.append(content[i:i + chunk_size])
+    for i in range(0, len(content), CHUNK_SIZE):
+        chunks.append(content[i:i + CHUNK_SIZE])
     return chunks
 
-def chat_document(file, question):
-    content = str(read_document(file))
-    if len(content) > 32000:
-        content = content.replace('\n', ' ')
-        content = content.replace('\r', ' ')
-        content = content.replace('\t', ' ')
-        content = content.replace(' ', '')
-        content = content.strip()
-        content = content[:32000]
-
-    # Define system prompt for the chat API
-    system_prompt = """
-    You are a helpful and informative assistant that can answer questions based on the content of documents.
-    You will receive the content of a document and a question about it.
-    Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
-    If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
-    """
 
-    message = f"""[INST] [SYSTEM] {system_prompt}
+def chat_document(file, question, strip_content):
+    """Handles chat with a document using Mistral."""
+    content = str(read_document(file, strip_content))
+
+    if len(content) > CHUNK_SIZE:
+        content = content[:CHUNK_SIZE]
+
+    message = f"""[INST] [SYSTEM] {SYSTEM_PROMPT}
     Document Content: {content}
     Question: {question}
     Answer:"""
@@ -133,27 +150,15 @@ def chat_document(file, question):
         yield output
 
 
-def chat_document_v2(file, question):
-    content = str(read_document(file))
-    content = content.replace('\n', ' ')
-    content = content.replace('\r', ' ')
-    content = content.replace('\t', ' ')
-    content = content.replace(' ', '')
-    content = content.strip()
+def chat_document_v2(file, question, strip_content):
+    """Handles chat with a document using Mistral and chunk-based approach."""
+    content = str(read_document(file, strip_content))
     chunks = split_content(content)
 
-    # Define system prompt for the chat API
-    system_prompt = """
-    You are a helpful and informative assistant that can answer questions based on the content of documents.
-    You will receive the content of a document and a question about it.
-    Your task is to provide a concise and accurate answer to the question based solely on the provided document content.
-    If the document does not contain enough information to answer the question, simply state that you cannot answer the question based on the provided information.
-    """
-
     all_answers = []
     for chunk in chunks:
-        message = f"""[INST] [SYSTEM] {system_prompt}
-        Document Content: {chunk[:32000]}
+        message = f"""[INST] [SYSTEM] {SYSTEM_PROMPT}
+        Document Content: {chunk[:CHUNK_SIZE]}
         Question: {question}
         Answer:"""
 
@@ -191,7 +196,7 @@ with gr.Blocks() as demo:
     with gr.TabItem("Document Reader"):
        iface1 = gr.Interface(
            fn=read_document,
-            inputs=gr.File(label="Upload a Document"),
+            inputs=[gr.File(label="Upload a Document"), gr.Checkbox(label="Strip Content", value=True)],
            outputs=gr.Textbox(label="Document Content"),
            title="Document Reader",
            description="Upload a document (PDF, XLSX, PPTX, TXT, CSV, DOC, DOCX and Code or text file) to read its content."
@@ -199,7 +204,7 @@ with gr.Blocks() as demo:
    with gr.TabItem("Document Chat"):
        iface2 = gr.Interface(
            fn=chat_document,
-            inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question")],
+            inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question"), gr.Checkbox(label="Strip Content", value=True)],
            outputs=gr.Markdown(label="Answer"),
            title="Document Chat",
            description="Upload a document and ask questions about its content."
@@ -207,10 +212,10 @@ with gr.Blocks() as demo:
    with gr.TabItem("Document Chat V2"):
        iface3 = gr.Interface(
            fn=chat_document_v2,
-            inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question")],
+            inputs=[gr.File(label="Upload a Document"), gr.Textbox(label="Question"), gr.Checkbox(label="Strip Content", value=True)],
            outputs=gr.Markdown(label="Answer"),
            title="Document Chat V2",
            description="Upload a document and ask questions about its content (using chunk-based approach)."
        )
 
-demo.launch()
+demo.launch()
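For reference, the chunking path added in this commit (CHUNK_SIZE, split_content, chat_document_v2) can be exercised on its own. A minimal sketch, with the function body mirroring the diff; the sample text and the printout are illustrative only, not part of the commit:

# Standalone illustration of the chunking introduced in this commit.
# CHUNK_SIZE and split_content mirror the diff; the sample text is made up.
CHUNK_SIZE = 32000

def split_content(content):
    """Splits content into CHUNK_SIZE-character chunks for processing."""
    chunks = []
    for i in range(0, len(content), CHUNK_SIZE):
        chunks.append(content[i:i + CHUNK_SIZE])
    return chunks

sample = "x" * 70_000
print([len(c) for c in split_content(sample)])  # [32000, 32000, 6000]

In the updated app, chat_document truncates the document to a single CHUNK_SIZE window, while chat_document_v2 builds one [INST] prompt per chunk and collects the per-chunk results in all_answers.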
 
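The other user-visible change is the Strip Content checkbox threaded through all three gr.Interface definitions. A minimal sketch of that wiring, assuming Gradio is installed and using a stub handler in place of read_document (the stub and its output string are illustrative, not from the commit):

import gradio as gr

def read_document_stub(file, strip_content):
    # Depending on the Gradio version, gr.File hands the handler either a path
    # string or a tempfile-like object with a .name attribute; the committed
    # code reads file.name. The Checkbox value arrives as a plain bool.
    path = getattr(file, "name", file)
    return f"path={path}, strip_content={strip_content}"

demo = gr.Interface(
    fn=read_document_stub,
    inputs=[gr.File(label="Upload a Document"), gr.Checkbox(label="Strip Content", value=True)],
    outputs=gr.Textbox(label="Document Content"),
)

if __name__ == "__main__":
    demo.launch()

Because the checkbox is simply appended to the inputs list, Gradio maps it positionally onto the new strip_content parameter of read_document, chat_document, and chat_document_v2.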