Daemontatox committed on
Commit
0644b4c
·
verified ·
1 Parent(s): 3bc1ee9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -15
app.py CHANGED
@@ -11,11 +11,12 @@ import fitz # PyMuPDF
11
  import io
12
  import numpy as np
13
 
 
14
  ckpt = "Daemontatox/DocumentCogito"
15
- model = MllamaForConditionalGeneration.from_pretrained(ckpt,
16
- torch_dtype=torch.bfloat16).to("cuda")
17
  processor = AutoProcessor.from_pretrained(ckpt)
18
 
 
19
  class DocumentState:
20
  def __init__(self):
21
  self.current_doc_images = []
@@ -29,27 +30,26 @@ class DocumentState:
29
 
30
  doc_state = DocumentState()
31
 
 
32
  def process_pdf_file(file_path):
33
  """Convert PDF to images and extract text using PyMuPDF."""
34
  doc = fitz.open(file_path)
35
  images = []
36
  text = ""
37
 
38
- # Take first page only for initial processing
39
- if doc.page_count > 0:
40
- page = doc[0]
41
- text = f"First page content:\n{page.get_text()}\n"
42
  pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
43
  img_data = pix.tobytes("png")
44
  img = Image.open(io.BytesIO(img_data))
45
  images.append(img.convert("RGB"))
46
-
47
- if doc.page_count > 1:
48
- text += f"\nTotal pages in document: {doc.page_count}\n"
49
 
50
  doc.close()
51
  return images, text
52
 
 
53
  def process_file(file):
54
  """Process either PDF or image file and update document state."""
55
  doc_state.clear()
@@ -62,14 +62,15 @@ def process_file(file):
62
  if file_path.lower().endswith('.pdf'):
63
  doc_state.doc_type = 'pdf'
64
  doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
65
- return f"PDF first page processed. You can now ask questions about the content."
66
  else:
67
  doc_state.doc_type = 'image'
68
  doc_state.current_doc_images = [Image.open(file_path).convert("RGB")]
69
  return "Image loaded successfully. You can now ask questions about the content."
70
 
 
71
  @spaces.GPU()
72
- def bot_streaming(message, history, max_new_tokens=2048):
73
  txt = message["text"]
74
  messages = []
75
 
@@ -79,10 +80,13 @@ def bot_streaming(message, history, max_new_tokens=2048):
79
 
80
  # Process history
81
  for i, msg in enumerate(history):
82
- if isinstance(msg[0], tuple):
83
- messages.append({"role": "user", "content": [{"type": "text", "text": msg[0][1]}, {"type": "image"}]})
 
 
 
84
  messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
85
- elif isinstance(msg[0], str):
86
  messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
87
  messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
88
 
@@ -119,6 +123,7 @@ def bot_streaming(message, history, max_new_tokens=2048):
119
  time.sleep(0.01)
120
  yield buffer
121
 
 
122
  def clear_context():
123
  """Clear the current document context."""
124
  doc_state.clear()
@@ -127,7 +132,7 @@ def clear_context():
127
  # Create the Gradio interface
128
  with gr.Blocks() as demo:
129
  gr.Markdown("# Document Analyzer with Chat Support")
130
- gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs, the first page will be processed for visual analysis.")
131
 
132
  chatbot = gr.ChatInterface(
133
  fn=bot_streaming,
 
11
  import io
12
  import numpy as np
13
 
14
# Load model and processor
# Checkpoint id on the Hugging Face Hub; used for both model and processor.
ckpt = "Daemontatox/DocumentCogito"
# NOTE(review): weights loaded in bfloat16 and moved to CUDA at import time —
# assumes a GPU is available when the module loads; confirm this is compatible
# with the @spaces.GPU() lazy-GPU environment used by bot_streaming below.
model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
processor = AutoProcessor.from_pretrained(ckpt)
18
 
19
+ # Document state to track uploaded files
20
  class DocumentState:
21
  def __init__(self):
22
  self.current_doc_images = []
 
30
 
31
  doc_state = DocumentState()
32
 
33
# Function to convert PDF to images and extract text
def process_pdf_file(file_path):
    """Convert PDF to images and extract text using PyMuPDF.

    Args:
        file_path: Filesystem path to the PDF document.

    Returns:
        Tuple ``(images, text)``: a list of PIL RGB images, one per page,
        rendered at 300 DPI, and a single string concatenating every
        page's extracted text (each prefixed with its 1-based page number).
    """
    doc = fitz.open(file_path)
    images = []
    text = ""
    try:
        # Process each page
        for page_num in range(doc.page_count):
            page = doc[page_num]
            text += f"Page {page_num + 1} content:\n{page.get_text()}\n"
            # Matrix(300/72, 300/72) scales from PDF's 72-DPI base to 300 DPI.
            pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
            img_data = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_data))
            images.append(img.convert("RGB"))
    finally:
        # Close the document even if rendering/extraction raises,
        # so the underlying file handle is never leaked.
        doc.close()
    return images, text
51
 
52
+ # Function to process uploaded files (PDF or image)
53
  def process_file(file):
54
  """Process either PDF or image file and update document state."""
55
  doc_state.clear()
 
62
  if file_path.lower().endswith('.pdf'):
63
  doc_state.doc_type = 'pdf'
64
  doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
65
+ return f"PDF processed. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
66
  else:
67
  doc_state.doc_type = 'image'
68
  doc_state.current_doc_images = [Image.open(file_path).convert("RGB")]
69
  return "Image loaded successfully. You can now ask questions about the content."
70
 
71
+ # Function to handle streaming responses from the model
72
  @spaces.GPU()
73
+ def bot_streaming(message, history, max_new_tokens=8192):
74
  txt = message["text"]
75
  messages = []
76
 
 
80
 
81
  # Process history
82
  for i, msg in enumerate(history):
83
+ if isinstance(msg[0], dict): # Multimodal message (text + files)
84
+ user_content = [{"type": "text", "text": msg[0]["text"]}]
85
+ if "files" in msg[0] and len(msg[0]["files"]) > 0:
86
+ user_content.append({"type": "image"})
87
+ messages.append({"role": "user", "content": user_content})
88
  messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
89
+ elif isinstance(msg[0], str): # Text-only message
90
  messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
91
  messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
92
 
 
123
  time.sleep(0.01)
124
  yield buffer
125
 
126
+ # Function to clear document context
127
  def clear_context():
128
  """Clear the current document context."""
129
  doc_state.clear()
 
132
  # Create the Gradio interface
133
  with gr.Blocks() as demo:
134
  gr.Markdown("# Document Analyzer with Chat Support")
135
+ gr.Markdown("Upload a PDF or image and chat about its contents. For PDFs, all pages will be processed for visual analysis.")
136
 
137
  chatbot = gr.ChatInterface(
138
  fn=bot_streaming,