import os
import time
from typing import Iterator

import google.generativeai as genai
import gradio as gr
from gradio import ChatMessage

# Get the Gemini API key from the environment variable
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

# We will be using the Gemini 2.0 Flash model with thinking capabilities
model = genai.GenerativeModel("gemini-2.0-flash-thinking-exp-1219")


def stream_gemini_response(user_message: str, messages: list) -> Iterator[list]:
    """Streams the model's thoughts and response."""
    try:
        # Log each request so users can follow how thinking works alongside streaming
        print("\n=== New Request ===")
        print(f"User message: {user_message}")

        # Initialize a streaming response from Gemini
        response = model.generate_content(user_message, stream=True)

        # Initialize buffers and flags
        thought_buffer = ""
        response_buffer = ""
        thinking_complete = False

        # Add the initial thinking message
        messages.append(
            ChatMessage(
                role="assistant",
                content="",
                metadata={"title": "⏳Thinking: *The thoughts produced by the model are experimental"}
            )
        )

        for chunk in response:
            parts = chunk.candidates[0].content.parts
            current_chunk = parts[0].text

            if len(parts) == 2 and not thinking_complete:
                # Two parts in one chunk: the thought is complete and the response begins
                thought_buffer += current_chunk
                print(f"\n=== Complete Thought ===\n{thought_buffer}")

                # Update the thinking message one last time
                messages[-1] = ChatMessage(
                    role="assistant",
                    content=thought_buffer,
                    metadata={"title": "⏳Thinking: *The thoughts produced by the model are experimental"}
                )
                yield messages

                # Start the response
                response_buffer = parts[1].text
                print(f"\n=== Starting Response ===\n{response_buffer}")

                messages.append(
                    ChatMessage(
                        role="assistant",
                        content=response_buffer
                    )
                )
                thinking_complete = True

                yield messages
                time.sleep(0.05)  # Small delay for visible streaming

            elif thinking_complete:
                # Stream the response
                response_buffer += current_chunk
                print(f"\n=== Response Chunk ===\n{current_chunk}")

                messages[-1] = ChatMessage(
                    role="assistant",
                    content=response_buffer
                )
                yield messages

            else:
                # Stream the thinking
                thought_buffer += current_chunk
                print(f"\n=== Thinking Chunk ===\n{current_chunk}")

                messages[-1] = ChatMessage(
                    role="assistant",
                    content=thought_buffer,
                    metadata={"title": "⏳Thinking: *The thoughts produced by the model are experimental"}
                )
                yield messages

        # Log the final complete response
        print(f"\n=== Final Response ===\n{response_buffer}")

    except Exception as e:
        print(f"\n=== Error ===\n{str(e)}")
        messages.append(
            ChatMessage(
                role="assistant",
                content=f"I apologize, but I encountered an error: {str(e)}"
            )
        )
        yield messages


def user_message(msg: str, history: list) -> tuple[str, list]:
    """Adds the user message to the chat history and clears the input box."""
    history.append(ChatMessage(role="user", content=msg))
    return "", history


# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Citrus(), fill_height=True) as demo:
    gr.Markdown("# Chat with Gemini 2.0 Flash and See its Thoughts 💭")

    chatbot = gr.Chatbot(
        type="messages",
        label="Gemini 2.0 'Thinking' Chatbot",
        render_markdown=True,
        scale=1,
        avatar_images=(None, "https://lh3.googleusercontent.com/oxz0sUBF0iYoN4VvhqWTmux-cxfD1rxuYkuFEfm1SFaseXEsjjE4Je_C_V3UQPuJ87sImQK3HfQ3RXiaRnQetjaZbjJJUkiPL5jFJ1WRl5FKJZYibUA=w214-h214-n-nu")
    )

    with gr.Row(equal_height=True):
        input_box = gr.Textbox(
            lines=1,
            label="Chat Message",
            placeholder="Type your message here...",
            scale=4
        )
        clear_button = gr.Button("Clear Chat", scale=1)

    # Set up event handlers
    msg_store = gr.State("")  # Store for preserving the user message across steps

    input_box.submit(
        lambda msg: (msg, ""),  # Store the message and clear the input box
        inputs=[input_box],
        outputs=[msg_store, input_box],
        queue=False
    ).then(
        user_message,  # Add the user message to the chat
        inputs=[msg_store, chatbot],
        outputs=[input_box, chatbot],
        queue=False
    ).then(
        stream_gemini_response,  # Generate and stream the response
        inputs=[msg_store, chatbot],
        outputs=chatbot
    )

    clear_button.click(
        lambda: ([], "", ""),
        outputs=[chatbot, input_box, msg_store],
        queue=False
    )

# Launch the interface
if __name__ == "__main__":
    demo.launch(debug=True)