Spaces:

prithivMLmods
/

core-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Feb 8

Commit

c106ebe

verified ·

1 Parent(s): 43f0687

Update app.py

Browse files

Files changed (1) hide show

app.py +1 -29

app.py CHANGED Viewed

@@ -23,9 +23,6 @@ from transformers import (
 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
-# ============================================
-#            CHAT & TTS SETUP
-# ============================================
 DESCRIPTION = """
 # QwQ Edge 💬
@@ -61,13 +58,11 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.eval()
-# TTS voices
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
 ]
-# Load multimodal (OCR) model and processor
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -93,10 +88,6 @@ def clean_chat_history(chat_history):
             cleaned.append(msg)
     return cleaned
-# ============================================
-#            IMAGE GENERATION SETUP
-# ============================================
 # Environment variables and parameters for Stable Diffusion XL
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
@@ -187,10 +178,6 @@ def generate_image_fn(
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
-# ============================================
-#       MAIN GENERATION FUNCTION (CHAT)
-# ============================================
 @spaces.GPU
 def generate(
     input_dict: dict,
@@ -210,9 +197,6 @@ def generate(
     text = input_dict["text"]
     files = input_dict.get("files", [])
-    # ----------------------------
-    #  IMAGE GENERATION BRANCH
-    # ----------------------------
     if text.strip().lower().startswith("@image"):
         # Remove the "@image" tag and use the rest as prompt
         prompt = text[len("@image"):].strip()
@@ -234,9 +218,6 @@ def generate(
         yield gr.Image(image_paths[0])
         return  # Exit early
-    # ----------------------------
-    #  TTS Branch (if query starts with @tts)
-    # ----------------------------
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
@@ -253,9 +234,6 @@ def generate(
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
-    # ----------------------------
-    #  Multimodal (image + text) branch
-    # ----------------------------
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
@@ -285,9 +263,7 @@ def generate(
             time.sleep(0.01)
             yield buffer
     else:
-        # ----------------------------
-        #  Text-only branch
-        # ----------------------------
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
@@ -321,10 +297,6 @@ def generate(
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
-# ============================================
-#             GRADIO DEMO SETUP
-# ============================================
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[

 from transformers.image_utils import load_image
 from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
 DESCRIPTION = """
 # QwQ Edge 💬
 )
 model.eval()
 TTS_VOICES = [
     "en-US-JennyNeural",  # @tts1
     "en-US-GuyNeural",    # @tts2
 ]
 MODEL_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
 model_m = Qwen2VLForConditionalGeneration.from_pretrained(
             cleaned.append(msg)
     return cleaned
 # Environment variables and parameters for Stable Diffusion XL
 MODEL_ID_SD = os.getenv("MODEL_VAL_PATH")  # SDXL Model repository path via env variable
 MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "4096"))
     image_paths = [save_image(img) for img in images]
     return image_paths, seed
 @spaces.GPU
 def generate(
     input_dict: dict,
     text = input_dict["text"]
     files = input_dict.get("files", [])
     if text.strip().lower().startswith("@image"):
         # Remove the "@image" tag and use the rest as prompt
         prompt = text[len("@image"):].strip()
         yield gr.Image(image_paths[0])
         return  # Exit early
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 3))
     voice_index = next((i for i in range(1, 3) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
         conversation = clean_chat_history(chat_history)
         conversation.append({"role": "user", "content": text})
     if files:
         if len(files) > 1:
             images = [load_image(image) for image in files]
             time.sleep(0.01)
             yield buffer
     else:
         input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
         if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
             input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
             output_file = asyncio.run(text_to_speech(final_response, voice))
             yield gr.Audio(output_file, autoplay=True)
 demo = gr.ChatInterface(
     fn=generate,
     additional_inputs=[