import os
import base64
from dataclasses import dataclass

import requests
import gradio as gr
import pytesseract
from huggingface_hub import InferenceClient
from PIL import Image, ImageGrab


@dataclass
class ChatMessage:
    """Custom ChatMessage class since huggingface_hub doesn't provide one."""
    role: str
    content: str

    def to_dict(self):
        """Convert the ChatMessage to a dictionary for JSON serialization."""
        return {"role": self.role, "content": self.content}


class XylariaChat:
    def __init__(self):
        # Securely load the HuggingFace token from the environment
        self.hf_token = os.getenv("HF_TOKEN")
        if not self.hf_token:
            raise ValueError("HuggingFace token not found in environment variables")

        # Initialize the inference client with the Qwen model
        self.client = InferenceClient(
            model="Qwen/QwQ-32B-Preview",
            api_key=self.hf_token
        )

        # Image-captioning API setup
        self.image_api_url = "https://api-inference.huggingface.co/models/microsoft/git-large-coco"
        self.image_api_headers = {"Authorization": f"Bearer {self.hf_token}"}

        # Initialize conversation history and persistent memory
        self.conversation_history = []
        self.persistent_memory = {}

        # System prompt with more detailed instructions
        self.system_prompt = """You are a helpful and harmless assistant. You are Xylaria, developed by Sk Md Saad Amin. You should think step by step, and you should be able to answer questions about images."""

    def store_information(self, key, value):
        """Store important information in persistent memory."""
        self.persistent_memory[key] = value
        return f"Stored: {key} = {value}"

    def retrieve_information(self, key):
        """Retrieve information from persistent memory."""
        return self.persistent_memory.get(key, "No information found for this key.")

    def reset_conversation(self):
        """
        Completely reset the conversation history and persistent memory,
        and reinitialize the API client.
        """
        # Clear local memory
        self.conversation_history = []
        self.persistent_memory.clear()

        # Reinitialize the client (not strictly necessary for the API,
        # but it helps reset any local state)
        try:
            self.client = InferenceClient(
                model="Qwen/QwQ-32B-Preview",
                api_key=self.hf_token
            )
        except Exception as e:
            print(f"Error resetting API client: {e}")

        # Return one value per output component wired to the Clear button:
        # an empty chat history, an empty textbox, and None for each image input
        return [], "", None, None, None, None, None, None

    def caption_image(self, image):
        """
        Caption one or more uploaded images using the Hugging Face API.

        Args:
            image (str or list): Base64-encoded image(s), file path(s),
                or file-like object(s).

        Returns:
            str: Concatenated image captions, or an error message.
        """
        try:
            # Ensure image is a list
            if not isinstance(image, list):
                image = [image]

            captions = []
            for img in image:
                # If the image is a file path, read the raw bytes
                if isinstance(img, str) and os.path.isfile(img):
                    with open(img, "rb") as f:
                        data = f.read()
                # If the image is already base64 encoded, decode it
                elif isinstance(img, str):
                    # Remove a data URI prefix if present
                    if img.startswith('data:image'):
                        img = img.split(',')[1]
                    data = base64.b64decode(img)
                # Otherwise assume a file-like object
                else:
                    data = img.read()

                # Send the raw image bytes to the Hugging Face API
                response = requests.post(
                    self.image_api_url,
                    headers=self.image_api_headers,
                    data=data
                )

                # Check the response
                if response.status_code == 200:
                    caption = response.json()[0].get('generated_text', 'No caption generated')
                    captions.append(caption)
                else:
                    captions.append(f"Error captioning image: {response.status_code} - {response.text}")

            # Return the concatenated captions
            return "\n".join(captions)
        except Exception as e:
            return f"Error processing image: {str(e)}"
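    # A minimal stand-alone sketch of the captioning call above, assuming the
    # hosted inference API's usual response shape of [{"generated_text": ...}];
    # the URL and token handling mirror __init__, and "photo.jpg" is a
    # hypothetical file name:
    #
    #     import os, requests
    #     url = "https://api-inference.huggingface.co/models/microsoft/git-large-coco"
    #     headers = {"Authorization": f"Bearer {os.environ['HF_TOKEN']}"}
    #     with open("photo.jpg", "rb") as f:
    #         resp = requests.post(url, headers=headers, data=f.read())
    #     print(resp.json()[0].get("generated_text", "No caption generated"))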
    def perform_math_ocr(self, image_path):
        """
        Perform OCR on an image and return the extracted text.

        Args:
            image_path (str): Path to the image file.

        Returns:
            str: Extracted text from the image, or an error message.
        """
        try:
            # Open the image using the Pillow library
            img = Image.open(image_path)

            # Use Tesseract to run OCR on the image
            text = pytesseract.image_to_string(img)

            # Remove leading/trailing whitespace and return
            return text.strip()
        except Exception as e:
            return f"Error during Math OCR: {e}"

    def get_response(self, user_input, images=None, math_ocr_image=None):
        """
        Generate a response using chat completions, with improved error handling.

        Args:
            user_input (str): User's message.
            images (list, optional): List of uploaded images.
            math_ocr_image (str, optional): Path to the math OCR image.

        Returns:
            A stream of chat completions, or an error message string.
        """
        try:
            # Prepare messages with conversation context and persistent memory
            messages = []

            # Add the system prompt as the first message
            messages.append(ChatMessage(
                role="system",
                content=self.system_prompt
            ).to_dict())

            # Add persistent-memory context if available
            if self.persistent_memory:
                memory_context = "Remembered Information:\n" + "\n".join(
                    [f"{k}: {v}" for k, v in self.persistent_memory.items()]
                )
                messages.append(ChatMessage(
                    role="system",
                    content=memory_context
                ).to_dict())

            # Convert the existing conversation history to ChatMessage objects
            # and then to dictionaries
            for msg in self.conversation_history:
                messages.append(ChatMessage(
                    role=msg['role'],
                    content=msg['content']
                ).to_dict())

            # Process images if uploaded
            image_context = ""
            if images and any(images):
                image_caption = self.caption_image(images)
                image_context += f"Uploaded images: {image_caption}\n\n"

            # Process the math OCR image if uploaded
            if math_ocr_image:
                ocr_text = self.perform_math_ocr(math_ocr_image)
                if not ocr_text.startswith("Error"):
                    image_context += f"Math OCR Result: {ocr_text}\n\n"

            # Combine the image context with the user input
            full_input = image_context + user_input

            # Add the user input
            messages.append(ChatMessage(
                role="user",
                content=full_input
            ).to_dict())

            # Roughly estimate the input length (word count stands in for a
            # real token count) and reserve some tokens for safety
            input_tokens = sum(len(msg['content'].split()) for msg in messages)
            max_new_tokens = 16384 - input_tokens - 50

            # Clamp max_new_tokens so it never exceeds the per-request limit
            # and never goes negative for very long inputs
            max_new_tokens = max(1, min(max_new_tokens, 10020))

            # Generate the response with streaming
            stream = self.client.chat_completion(
                messages=messages,
                model="Qwen/QwQ-32B-Preview",
                temperature=0.7,
                max_tokens=max_new_tokens,
                top_p=0.9,
                stream=True
            )

            return stream
        except Exception as e:
            print(f"Detailed error in get_response: {e}")
            return f"Error generating response: {str(e)}"
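    # A minimal sketch of consuming the stream returned above, assuming the
    # huggingface_hub streaming chunk shape used in streaming_response below
    # (each chunk carries choices[0].delta.content):
    #
    #     chat = XylariaChat()
    #     stream = chat.get_response("Hello!")
    #     if isinstance(stream, str):   # get_response returns a string on error
    #         print(stream)
    #     else:
    #         for chunk in stream:
    #             if chunk.choices and chunk.choices[0].delta.content:
    #                 print(chunk.choices[0].delta.content, end="")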
""" prompt = "" for msg in messages: if msg["role"] == "system": prompt += f"<|system|>\n{msg['content']}<|end|>\n" elif msg["role"] == "user": prompt += f"<|user|>\n{msg['content']}<|end|>\n" elif msg["role"] == "assistant": prompt += f"<|assistant|>\n{msg['content']}<|end|>\n" prompt += "<|assistant|>\n" # Start of assistant's turn return prompt def create_interface(self): def get_clipboard_image(): """Capture image from clipboard""" try: img = ImageGrab.grabclipboard() if img is not None: # Save clipboard image to a temporary file temp_path = "clipboard_image.png" img.save(temp_path) return temp_path return None except Exception as e: print(f"Error getting clipboard image: {e}") return None def streaming_response(message, chat_history, image1, image2, image3, image4, image5, math_ocr_image_path): # Collect non-None images images = [img for img in [image1, image2, image3, image4, image5] if img is not None] # Generate response response_stream = self.get_response(message, images, math_ocr_image_path) # Handle errors in get_response if isinstance(response_stream, str): # Return immediately with the error message updated_history = chat_history + [[message, response_stream]] yield ("", updated_history) + ((None,) * 6) return # Prepare for streaming response full_response = "" updated_history = chat_history + [[message, ""]] # Streaming output try: for chunk in response_stream: if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content: chunk_content = chunk.choices[0].delta.content full_response += chunk_content # Update the last message in chat history with partial response updated_history[-1][1] = full_response yield ("", updated_history) + ((None,) * 6) except Exception as e: print(f"Streaming error: {e}") # Display error in the chat interface updated_history[-1][1] = f"Error during response: {e}" yield ("", updated_history) + ((None,) * 6) return # Update conversation history self.conversation_history.append( {"role": "user", "content": message} ) self.conversation_history.append( {"role": "assistant", "content": full_response} ) # Limit conversation history if len(self.conversation_history) > 10: self.conversation_history = self.conversation_history[-10:] # Reset image inputs after processing yield ("", updated_history, None, None, None, None, None, None) # Custom CSS for Inter font and improved styling custom_css = """ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap'); body, .gradio-container { font-family: 'Inter', sans-serif !important; } .chatbot-container .message { font-family: 'Inter', sans-serif !important; } .gradio-container input, .gradio-container textarea, .gradio-container button { font-family: 'Inter', sans-serif !important; } """ with gr.Blocks(theme='soft', css=custom_css) as demo: # Chat interface with improved styling with gr.Column(): chatbot = gr.Chatbot( label="Xylaria 1.5 Senoa (EXPERIMENTAL)", height=500, show_copy_button=True, ) # Input row with improved layout with gr.Row(): with gr.Column(scale=4): txt = gr.Textbox( show_label=False, placeholder="Type your message...", container=False ) # Image and Math upload buttons with gr.Column(scale=1): # Buttons for image and math uploads with symbolic icons with gr.Row(): img_upload_btn = gr.Button("🖼️") # Image upload button math_upload_btn = gr.Button("➗") # Math upload button clipboard_btn = gr.Button("📋") # Clipboard paste button # Multiple image inputs with gr.Accordion("Images", open=False): with gr.Column(): with gr.Row(): img1 = gr.Image( 
sources=["upload", "webcam"], type="filepath", label="Image 1", height=200 ) img2 = gr.Image( sources=["upload", "webcam"], type="filepath", label="Image 2", height=200 ) with gr.Row(): img3 = gr.Image( sources=["upload", "webcam"], type="filepath", label="Image 3", height=200 ) img4 = gr.Image( sources=["upload", "webcam"], type="filepath", label="Image 4", height=200 ) img5 = gr.Image( sources=["upload", "webcam"], type="filepath", label="Image 5", height=200 ) # Math OCR Image Upload with gr.Accordion("Math Input", open=False): math_ocr_img = gr.Image( sources=["upload", "webcam"], type="filepath", label="Upload Image for math", height=200 ) # Clear history and memory buttons with gr.Row(): clear = gr.Button("Clear Conversation") clear_memory = gr.Button("Clear Memory") # Submit functionality with streaming and image support btn = gr.Button("Send") btn.click( fn=streaming_response, inputs=[txt, chatbot, img1, img2, img3, img4, img5, math_ocr_img], outputs=[txt, chatbot, img1, img2, img3, img4, img5, math_ocr_img] ) txt.submit( fn=streaming_response, inputs=[txt, chatbot, img1, img2, img3, img4, img5, math_ocr_img], outputs=[txt, chatbot, img1, img2, img3, img4, img5, math_ocr_img] ) # Clipboard button functionality clipboard_btn.click( fn=get_clipboard_image, outputs=[img1] ) # Clear conversation button clear.click( fn=self.reset_conversation, inputs=None, outputs=[chatbot, txt, img1, img2, img3, img4, img5, math_ocr_img] ) # Clear memory button clear_memory.click( fn=lambda: self.persistent_memory.clear(), inputs=None, outputs=[] ) return demo # Optional: If you want to run the interface if __name__ == "__main__": chat = XylariaChat() interface = chat.create_interface() interface.launch()