Spaces:

prithivMLmods
/

core-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Feb 8

Commit

83a0174

verified ·

1 Parent(s): f74b154

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -23

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import os
-from collections.abc import Iterator
 from threading import Thread
 import gradio as gr
 import spaces
@@ -35,7 +34,7 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-# Load the text-only model and tokenizer
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
@@ -54,7 +53,7 @@ TTS_VOICES = [
     "en-US-JasonNeural",  # @tts6
 ]
-# Load the multimodal (OCR) model and processor
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -69,6 +68,18 @@ async def text_to_speech(text: str, voice: str, output_file="output.mp3"):
     await communicate.save(output_file)
     return output_file
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -80,14 +91,14 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot response and handles TTS requests with multimodal input support.
-    If the query starts with a TTS command (e.g. '@tts1'), the chat history is cleared
-    to avoid non-text responses (like Audio) interfering with template rendering.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
-    # Check if input includes image(s)
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
@@ -95,33 +106,35 @@ def generate(
     else:
         images = []
-    # Check if the message is for TTS
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 7))
     voice_index = next((i for i in range(1, 7) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
-        # Clear conversation history to avoid issues with non-text outputs.
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
         text = text.replace(tts_prefix, "").strip()
-        conversation = [*chat_history, {"role": "user", "content": text}]
-    # If there are images, process multimodal input
     if images:
-        messages = [
-            {"role": "user", "content": [
                 *[{"type": "image", "image": image} for image in images],
                 {"type": "text", "text": text},
-            ]}
-        ]
         prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
-        # Handle generation for multimodal input using model_m
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
@@ -134,9 +147,8 @@ def generate(
             buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer
     else:
-        # Process text-only input using model
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
@@ -155,7 +167,7 @@ def generate(
             num_beams=1,
             repetition_penalty=repetition_penalty,
         )
-        t = Thread(target=model.generate, kwargs=generate_kwargs)
         t.start()
         outputs = []
@@ -164,11 +176,9 @@ def generate(
             yield "".join(outputs)
         final_response = "".join(outputs)
-        # Yield text response first.
         yield final_response
-        # If TTS was requested, yield audio output separately.
         if is_tts and voice:
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)

 import os
 from threading import Thread
 import gradio as gr
 import spaces
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Text-only model and tokenizer
 model_id = "prithivMLmods/FastThink-0.5B-Tiny"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     "en-US-JasonNeural",  # @tts6
 ]
+# Multimodal (OCR) model and processor
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
     await communicate.save(output_file)
     return output_file
def clean_chat_history(chat_history) -> list:
    """
    Return only the chat messages whose content is plain text.

    Earlier turns may contain non-text entries (for example tuples or Audio
    objects). Those entries are dropped so that only text messages remain.

    Args:
        chat_history: Iterable of chat messages, or None. Only entries that
            are dicts with a string "content" value are kept; everything
            else is ignored.

    Returns:
        A new list holding, in their original order, the dict messages whose
        "content" is a str. The kept messages are the same dict objects that
        were passed in (they are not copied). Returns an empty list when
        chat_history is None or empty.
    """
    # A missing history is treated as an empty conversation rather than
    # failing on iteration.
    if chat_history is None:
        return []
    return [
        msg
        for msg in chat_history
        if isinstance(msg, dict) and isinstance(msg.get("content"), str)
    ]
 @spaces.GPU
 def generate(
     input_dict: dict,
     repetition_penalty: float = 1.2,
 ):
     """
+    Generates a chatbot response and handles TTS requests with multimodal input support.
+    If the user’s query begins with an @tts command, previous chat history is ignored
+    (clearing any non-text outputs). Otherwise, the chat history is cleaned to include only text.
     """
     text = input_dict["text"]
     files = input_dict.get("files", [])
+    # Determine if images are provided
     if len(files) > 1:
         images = [load_image(image) for image in files]
     elif len(files) == 1:
     else:
         images = []
+    # Check for TTS prefix
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 7))
     voice_index = next((i for i in range(1, 7) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
+        # Clear any previous chat history when using TTS to avoid type errors
         conversation = [{"role": "user", "content": text}]
     else:
         voice = None
         text = text.replace(tts_prefix, "").strip()
+        # Clean the chat history to include only messages with string content
+        conversation = clean_chat_history(chat_history)
+        conversation.append({"role": "user", "content": text})
+    # Multimodal branch if images are provided
     if images:
+        messages = [{
+            "role": "user",
+            "content": [
                 *[{"type": "image", "image": image} for image in images],
                 {"type": "text", "text": text},
+            ]
+        }]
         prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
         streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
         thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
             buffer = buffer.replace("<|im_end|>", "")
             time.sleep(0.01)
             yield buffer
     else:
+        # Text-only branch using the text model
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
             num_beams=1,
             repetition_penalty=repetition_penalty,
         )
+        t = Thread(target=model.generate, kwargs=generation_kwargs)
         t.start()
         outputs = []
             yield "".join(outputs)
         final_response = "".join(outputs)
+        # Yield text response first
         yield final_response
         if is_tts and voice:
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)