prithivMLmods committed
Commit 2404d81 · verified · 1 parent: 3108bef

Rename app(2).py to app.py

Files changed (1)
  1. app(2).py → app.py +57 -5
app(2).py → app.py RENAMED
@@ -112,10 +112,20 @@ moondream = AutoModelForCausalLM.from_pretrained(
     revision=REVISION_MD,
     trust_remote_code=True,
     torch_dtype=torch.float16,
-    device_map={"": "cuda"},
+    device_map="auto",
 )
 tokenizer_md = AutoTokenizer.from_pretrained(MODEL_ID_MD, revision=REVISION_MD)
 
+# --- SmolVLM2 Model Loading ---
+MODEL_ID_S2 = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
+processor_s2 = AutoProcessor.from_pretrained(MODEL_ID_S2, trust_remote_code=True)
+model_s2 = AutoModelForImageTextToText.from_pretrained(
+    MODEL_ID_S2,
+    torch_dtype=torch.float16,
+    trust_remote_code=True,
+    device_map="auto"
+).eval()
+
 
 # --- PDF Generation and Preview Utility Function ---
 def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
@@ -212,11 +222,52 @@ def process_document_stream(
         answer = moondream.answer_question(
             image_embeds=image_embeds,
             question=prompt_input,
-            tokenizer=tokenizer_md
+            tokenizer=tokenizer_md,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
         )
         yield answer, answer
         return
+
+    elif model_name == "SmolVLM2-2.2B-Instruct(smol)":
+        processor, model = processor_s2, model_s2
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt_input},
+                {"type": "image"},
+            ]
+        }]
+        inputs = processor.apply_chat_template(
+            messages,
+            images=[image],
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device)
+
+        # Convert float32 tensors to float16 if necessary
+        for k, v in inputs.items():
+            if v.dtype == torch.float32:
+                inputs[k] = v.to(torch.float16)
+
+        generated_ids = model.generate(
+            **inputs,
+            do_sample=True,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+        )
+        generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+        final_output = generated_texts[0].split("Assistant:")[-1].strip()
+        yield final_output, final_output
+        return
 
+    # Assign processor and model for other models
     if model_name == "LFM2-VL-450M(fast)": processor, model = processor_m, model_m
     elif model_name == "LFM2-VL-1.6B(fast)": processor, model = processor_t, model_t
     elif model_name == "ShotVL-3B(cinematic)": processor, model = processor_z, model_z
@@ -229,7 +280,8 @@ def process_document_stream(
         yield "Invalid model selected.", ""
         return
 
-    messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
+    # Common streaming logic for the rest of the models
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_input}]}]
     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
@@ -281,9 +333,9 @@ def create_gradio_interface():
         # Left Column (Inputs)
         with gr.Column(scale=1):
             model_choice = gr.Dropdown(
-                choices=["LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
+                choices=["SmolVLM2-2.2B-Instruct(smol)", "LFM2-VL-450M(fast)", "LFM2-VL-1.6B(fast)", "SmolVLM-Instruct-250M(smol)", "Moondream2(vision)", "ShotVL-3B(cinematic)", "Megalodon-OCR-Sync-0713(ocr)",
                          "VLAA-Thinker-Qwen2VL-2B(reason)", "MonkeyOCR-pro-1.2B(ocr)", "Nanonets-OCR-s(ocr)"],
-                label="Select Model", value= "LFM2-VL-450M(fast)"
+                label="Select Model", value= "SmolVLM2-2.2B-Instruct(smol)"
             )
            prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query", value="Describe the image!")
            image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])
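
A minimal sketch (not part of the commit) for smoke-testing the new SmolVLM2 branch outside the Gradio app, using the same model ID, dtype, and transformers calls the diff introduces; the image path and prompt are placeholders:

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

MODEL_ID_S2 = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
processor = AutoProcessor.from_pretrained(MODEL_ID_S2, trust_remote_code=True)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID_S2, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto"
).eval()

image = Image.open("sample.jpg")  # placeholder input image
messages = [{
    "role": "user",
    "content": [{"type": "text", "text": "Describe the image!"}, {"type": "image"}],
}]
inputs = processor.apply_chat_template(
    messages, images=[image], add_generation_prompt=True,
    tokenize=True, return_dict=True, return_tensors="pt",
).to(model.device)

with torch.inference_mode():
    generated_ids = model.generate(**inputs, max_new_tokens=256)
# Strip the echoed prompt, as the app does with its "Assistant:" split
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
      .split("Assistant:")[-1].strip())

Note the commit also swaps Moondream2's device_map={"": "cuda"} for device_map="auto", which lets accelerate place the weights on whatever device is available instead of hard-requiring CUDA.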