ysharma HF staff committed on
Commit
ec5e8d4
·
verified ·
1 Parent(s): e529b79

added support for multiple pdfs and text

Browse files
Files changed (1) hide show
  1. app.py +80 -51
app.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import gradio as gr
2
  import anthropic
3
  import base64
@@ -12,20 +15,20 @@ def read_pdf_as_base64(file_path: str) -> str:
12
  return base64.b64encode(file.read()).decode('utf-8')
13
 
14
  def user_message(
15
- user_input: str,
16
- history: list,
17
  enable_citations: bool,
18
  doc_type: str,
19
  text_content: str,
20
  pdf_file: str,
21
  api_key: str
22
  ) -> tuple:
23
- # logging
24
  print("\n----------- User Message -------------")
25
  print(f"User Input: {user_input}")
26
  print(f"Citations Enabled: {enable_citations}")
27
  print(f"Document Type: {doc_type}")
28
-
29
  history.append({
30
  "role": "user",
31
  "content": user_input,
@@ -33,15 +36,15 @@ def user_message(
33
  return "", history
34
 
35
  def format_message_history(
36
- history: list,
37
  enable_citations: bool,
38
  doc_type: str,
39
  text_content: str,
40
- pdf_file: str
41
  ) -> List[Dict]:
42
  """Convert Gradio chat history to Anthropic message format."""
43
  formatted_messages = []
44
-
45
  # Add previous messages
46
  for msg in history[:-1]:
47
  if msg["role"] == "user":
@@ -55,45 +58,71 @@ def format_message_history(
55
  "role": "assistant",
56
  "content": msg["content"]
57
  })
58
-
59
  # Prepare the latest message
60
  latest_message = {
61
  "role": "user",
62
  "content": []
63
  }
64
-
65
- # Add document if citations are enabled
66
  if enable_citations:
67
- if doc_type == "plain_text":
 
68
  latest_message["content"].append({
69
  "type": "document",
70
  "source": {
71
  "type": "text",
72
  "media_type": "text/plain",
73
- "data": text_content.strip() if text_content.strip() else DEFAULT_DOC
74
  },
75
- "title": "User Document" if text_content.strip() else "Sample Document",
76
  "citations": {"enabled": True}
77
  })
78
- elif doc_type == "pdf" and pdf_file:
79
- pdf_base64 = read_pdf_as_base64(pdf_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  latest_message["content"].append({
81
  "type": "document",
82
  "source": {
83
- "type": "base64",
84
- "media_type": "application/pdf",
85
- "data": pdf_base64
86
  },
87
- "title": "User PDF Document",
88
  "citations": {"enabled": True}
89
  })
90
-
91
  # Add the user's question
92
  latest_message["content"].append({
93
  "type": "text",
94
  "text": history[-1]["content"]
95
  })
96
-
97
  formatted_messages.append(latest_message)
98
  return formatted_messages
99
 
@@ -112,22 +141,22 @@ def bot_response(
112
  "content": "Please provide your Anthropic API key to continue."
113
  })
114
  return history
115
-
116
  # Initialize client with provided API key
117
  client = anthropic.Anthropic(api_key=api_key)
118
-
119
  messages = format_message_history(history, enable_citations, doc_type, text_content, pdf_file)
120
-
121
  response = client.messages.create(
122
  model="claude-3-5-sonnet-20241022",
123
  max_tokens=1024,
124
  messages=messages
125
  )
126
-
127
  # Initialize main response and citations
128
  main_response = ""
129
  citations = []
130
-
131
  # Process each content block
132
  for block in response.content:
133
  if block.type == "text":
@@ -136,13 +165,13 @@ def bot_response(
136
  for citation in block.citations:
137
  if citation.cited_text not in citations:
138
  citations.append(citation.cited_text)
139
-
140
  # Add main response
141
  history.append({
142
  "role": "assistant",
143
  "content": main_response
144
  })
145
-
146
  # Add citations if any were found and citations are enabled
147
  if enable_citations and citations:
148
  history.append({
@@ -150,9 +179,9 @@ def bot_response(
150
  "content": "\n".join([f"• {cite}" for cite in citations]),
151
  "metadata": {"title": "📚 Citations"}
152
  })
153
-
154
  return history
155
-
156
  except Exception as e:
157
  print(f"Error in bot_response: {str(e)}")
158
  error_message = str(e)
@@ -166,19 +195,19 @@ def bot_response(
166
 
167
  def update_document_inputs(enable_citations: bool, doc_type: str = "plain_text"):
168
  """Update visibility of document input components based on settings."""
169
- text_visible = enable_citations and doc_type == "plain_text"
170
- pdf_visible = enable_citations and doc_type == "pdf"
171
  radio_visible = enable_citations
172
-
173
  return {
174
  doc_type_radio: gr.Radio(visible=radio_visible),
175
  text_input: gr.Textbox(visible=text_visible),
176
  pdf_input: gr.File(visible=pdf_visible)
177
  }
178
 
179
- with gr.Blocks(fill_height=True, theme="ocean") as demo:
180
- gr.Markdown("# Chat with Citations")
181
-
182
  with gr.Row(scale=1):
183
  with gr.Column(scale=4):
184
  chatbot = gr.Chatbot(
@@ -187,13 +216,13 @@ with gr.Blocks(fill_height=True, theme="ocean") as demo:
187
  show_label=False,
188
  scale=1
189
  )
190
-
191
  msg = gr.Textbox(
192
  placeholder="Enter your message here...",
193
  show_label=False,
194
  container=False
195
  )
196
-
197
  with gr.Column(scale=1):
198
  api_key = gr.Textbox(
199
  type="password",
@@ -202,50 +231,50 @@ with gr.Blocks(fill_height=True, theme="ocean") as demo:
202
  info="Your API key will not be stored",
203
  interactive=True,
204
  )
205
-
206
  enable_citations = gr.Checkbox(
207
  label="Enable Citations",
208
  value=True,
209
  info="Toggle citation functionality"
210
  )
211
-
212
  doc_type_radio = gr.Radio(
213
- choices=["plain_text", "pdf"],
214
  value="plain_text",
215
  label="Document Type",
216
- info="Choose the type of document"
217
  )
218
-
219
  text_input = gr.Textbox(
220
  label="Document Content",
221
- placeholder=f"Enter your document text here. Default doc is -- {DEFAULT_DOC}",
222
  lines=10,
223
- info="Enter the text you want to reference. If empty, default document will be used."
224
  )
225
-
226
  pdf_input = gr.File(
227
  label="Upload PDF",
228
- file_count="single",
229
  file_types=[".pdf"],
230
  type="filepath",
231
  visible=False
232
  )
233
-
234
  clear = gr.ClearButton([msg, chatbot, text_input, pdf_input])
235
-
236
  # Update input visibility based on settings
237
  enable_citations.change(
238
  update_document_inputs,
239
  inputs=[enable_citations, doc_type_radio],
240
  outputs=[doc_type_radio, text_input, pdf_input]
241
  )
242
-
243
  doc_type_radio.change(
244
  update_document_inputs,
245
  inputs=[enable_citations, doc_type_radio],
246
  outputs=[doc_type_radio, text_input, pdf_input]
247
  )
248
-
249
  # Handle message submission
250
  msg.submit(
251
  user_message,
 
1
+ # adding support for multiple pdf files
2
+ # working
3
+ # Final one for PR
4
  import gradio as gr
5
  import anthropic
6
  import base64
 
15
  return base64.b64encode(file.read()).decode('utf-8')
16
 
17
  def user_message(
18
+ user_input: str,
19
+ history: list,
20
  enable_citations: bool,
21
  doc_type: str,
22
  text_content: str,
23
  pdf_file: str,
24
  api_key: str
25
  ) -> tuple:
26
+ # Logging
27
  print("\n----------- User Message -------------")
28
  print(f"User Input: {user_input}")
29
  print(f"Citations Enabled: {enable_citations}")
30
  print(f"Document Type: {doc_type}")
31
+
32
  history.append({
33
  "role": "user",
34
  "content": user_input,
 
36
  return "", history
37
 
38
  def format_message_history(
39
+ history: list,
40
  enable_citations: bool,
41
  doc_type: str,
42
  text_content: str,
43
+ pdf_files: str
44
  ) -> List[Dict]:
45
  """Convert Gradio chat history to Anthropic message format."""
46
  formatted_messages = []
47
+
48
  # Add previous messages
49
  for msg in history[:-1]:
50
  if msg["role"] == "user":
 
58
  "role": "assistant",
59
  "content": msg["content"]
60
  })
61
+
62
  # Prepare the latest message
63
  latest_message = {
64
  "role": "user",
65
  "content": []
66
  }
67
+
68
+ # Add documents if citations are enabled
69
  if enable_citations:
70
+ # Handle plain text input
71
+ if doc_type in ["plain_text", "combined"] and text_content.strip():
72
  latest_message["content"].append({
73
  "type": "document",
74
  "source": {
75
  "type": "text",
76
  "media_type": "text/plain",
77
+ "data": text_content.strip()
78
  },
79
+ "title": "User Text Document",
80
  "citations": {"enabled": True}
81
  })
82
+
83
+ # Handle PDF input
84
+ if doc_type in ["pdf", "combined"] and pdf_files:
85
+ # Handle pdf_files as a list
86
+ if isinstance(pdf_files, str):
87
+ pdf_files = [pdf_files] # Convert single path to list
88
+
89
+ # Add each PDF as a separate document
90
+ for i, pdf_file in enumerate(pdf_files):
91
+ try:
92
+ pdf_base64 = read_pdf_as_base64(pdf_file)
93
+ latest_message["content"].append({
94
+ "type": "document",
95
+ "source": {
96
+ "type": "base64",
97
+ "media_type": "application/pdf",
98
+ "data": pdf_base64
99
+ },
100
+ "title": f"User PDF Document {i+1}",
101
+ "citations": {"enabled": True}
102
+ })
103
+ except Exception as e:
104
+ print(f"Error processing PDF {i+1}: {str(e)}")
105
+ continue
106
+
107
+ # If no documents were added and citations are enabled, use default document
108
+ if not latest_message["content"]:
109
  latest_message["content"].append({
110
  "type": "document",
111
  "source": {
112
+ "type": "text",
113
+ "media_type": "text/plain",
114
+ "data": DEFAULT_DOC
115
  },
116
+ "title": "Sample Document",
117
  "citations": {"enabled": True}
118
  })
119
+
120
  # Add the user's question
121
  latest_message["content"].append({
122
  "type": "text",
123
  "text": history[-1]["content"]
124
  })
125
+
126
  formatted_messages.append(latest_message)
127
  return formatted_messages
128
 
 
141
  "content": "Please provide your Anthropic API key to continue."
142
  })
143
  return history
144
+
145
  # Initialize client with provided API key
146
  client = anthropic.Anthropic(api_key=api_key)
147
+
148
  messages = format_message_history(history, enable_citations, doc_type, text_content, pdf_file)
149
+
150
  response = client.messages.create(
151
  model="claude-3-5-sonnet-20241022",
152
  max_tokens=1024,
153
  messages=messages
154
  )
155
+
156
  # Initialize main response and citations
157
  main_response = ""
158
  citations = []
159
+
160
  # Process each content block
161
  for block in response.content:
162
  if block.type == "text":
 
165
  for citation in block.citations:
166
  if citation.cited_text not in citations:
167
  citations.append(citation.cited_text)
168
+
169
  # Add main response
170
  history.append({
171
  "role": "assistant",
172
  "content": main_response
173
  })
174
+
175
  # Add citations if any were found and citations are enabled
176
  if enable_citations and citations:
177
  history.append({
 
179
  "content": "\n".join([f"• {cite}" for cite in citations]),
180
  "metadata": {"title": "📚 Citations"}
181
  })
182
+
183
  return history
184
+
185
  except Exception as e:
186
  print(f"Error in bot_response: {str(e)}")
187
  error_message = str(e)
 
195
 
196
  def update_document_inputs(enable_citations: bool, doc_type: str = "plain_text"):
197
  """Update visibility of document input components based on settings."""
198
+ text_visible = enable_citations and (doc_type in ["plain_text", "combined"])
199
+ pdf_visible = enable_citations and (doc_type in ["pdf", "combined"])
200
  radio_visible = enable_citations
201
+
202
  return {
203
  doc_type_radio: gr.Radio(visible=radio_visible),
204
  text_input: gr.Textbox(visible=text_visible),
205
  pdf_input: gr.File(visible=pdf_visible)
206
  }
207
 
208
+ with gr.Blocks(theme="ocean", fill_height=True) as demo:
209
+ gr.Markdown("# Chat with Anthropic Claude's Citations")
210
+
211
  with gr.Row(scale=1):
212
  with gr.Column(scale=4):
213
  chatbot = gr.Chatbot(
 
216
  show_label=False,
217
  scale=1
218
  )
219
+
220
  msg = gr.Textbox(
221
  placeholder="Enter your message here...",
222
  show_label=False,
223
  container=False
224
  )
225
+
226
  with gr.Column(scale=1):
227
  api_key = gr.Textbox(
228
  type="password",
 
231
  info="Your API key will not be stored",
232
  interactive=True,
233
  )
234
+
235
  enable_citations = gr.Checkbox(
236
  label="Enable Citations",
237
  value=True,
238
  info="Toggle citation functionality"
239
  )
240
+
241
  doc_type_radio = gr.Radio(
242
+ choices=["plain_text", "pdf", "combined"],
243
  value="plain_text",
244
  label="Document Type",
245
+ info="Choose the type of document(s) to reference"
246
  )
247
+
248
  text_input = gr.Textbox(
249
  label="Document Content",
250
+ placeholder=f"Enter your document text here.\nDefault text will be picked if citations are enabled and you don't provide the documents. Default document is -- {DEFAULT_DOC}",
251
  lines=10,
252
+ info="Enter the text you want to reference"
253
  )
254
+
255
  pdf_input = gr.File(
256
  label="Upload PDF",
257
+ file_count="multiple",
258
  file_types=[".pdf"],
259
  type="filepath",
260
  visible=False
261
  )
262
+
263
  clear = gr.ClearButton([msg, chatbot, text_input, pdf_input])
264
+
265
  # Update input visibility based on settings
266
  enable_citations.change(
267
  update_document_inputs,
268
  inputs=[enable_citations, doc_type_radio],
269
  outputs=[doc_type_radio, text_input, pdf_input]
270
  )
271
+
272
  doc_type_radio.change(
273
  update_document_inputs,
274
  inputs=[enable_citations, doc_type_radio],
275
  outputs=[doc_type_radio, text_input, pdf_input]
276
  )
277
+
278
  # Handle message submission
279
  msg.submit(
280
  user_message,