vikhyatk committed on
Commit
a294418
·
verified ·
1 Parent(s): dfa0919

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -65
app.py CHANGED
@@ -1,81 +1,24 @@
1
- import spaces
2
- import torch
3
- import re
4
  import gradio as gr
5
- from threading import Thread
6
- from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
7
- from PIL import ImageDraw
8
- from torchvision.transforms.v2 import Resize
9
 
10
- import subprocess
11
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
12
 
13
- model_id = "vikhyatk/moondream2"
14
- revision = "2024-08-26"
15
- tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
16
- moondream = AutoModelForCausalLM.from_pretrained(
17
- model_id, trust_remote_code=True, revision=revision,
18
- torch_dtype=torch.bfloat16, device_map={"": "cuda"},
19
- attn_implementation="flash_attention_2"
20
- )
21
- moondream.eval()
22
-
23
-
24
- @spaces.GPU(duration=10)
25
  def answer_question(img, prompt):
26
- image_embeds = moondream.encode_image(img)
27
- streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
28
- thread = Thread(
29
- target=moondream.answer_question,
30
- kwargs={
31
- "image_embeds": image_embeds,
32
- "question": prompt,
33
- "tokenizer": tokenizer,
34
- "streamer": streamer,
35
- },
36
- )
37
- thread.start()
38
-
39
  buffer = ""
40
- for new_text in streamer:
41
- buffer += new_text
42
- yield buffer.strip()
43
-
44
- def extract_floats(text):
45
- # Regular expression to match an array of four floating point numbers
46
- pattern = r"\[\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\s*\]"
47
- match = re.search(pattern, text)
48
- if match:
49
- # Extract the numbers and convert them to floats
50
- return [float(num) for num in match.groups()]
51
- return None # Return None if no match is found
52
-
53
-
54
- def extract_bbox(text):
55
- bbox = None
56
- if extract_floats(text) is not None:
57
- x1, y1, x2, y2 = extract_floats(text)
58
- bbox = (x1, y1, x2, y2)
59
- return bbox
60
 
61
  def process_answer(img, answer):
62
- if extract_bbox(answer) is not None:
63
- x1, y1, x2, y2 = extract_bbox(answer)
64
- draw_image = Resize(768)(img)
65
- width, height = draw_image.size
66
- x1, x2 = int(x1 * width), int(x2 * width)
67
- y1, y2 = int(y1 * height), int(y2 * height)
68
- bbox = (x1, y1, x2, y2)
69
- ImageDraw.Draw(draw_image).rectangle(bbox, outline="red", width=3)
70
- return gr.update(visible=True, value=draw_image)
71
-
72
  return gr.update(visible=False, value=None)
73
 
74
  with gr.Blocks() as demo:
75
  gr.Markdown(
76
  """
77
  # 🌔 moondream2
78
- A tiny vision language model. [GitHub](https://github.com/vikhyat/moondream)
79
  """
80
  )
81
  with gr.Row():
 
 
 
 
1
  import gradio as gr
2
+ import moondream as md
3
+ import os
 
 
4
 
5
+ moondream_api_key = os.getenv("MOONDREAM_API_KEY")
6
+ model = md.vl(api_key=moondream_api_key)
7
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  def answer_question(img, prompt):
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  buffer = ""
10
+ for chunk in model.query(img, prompt, stream=True)["answer"]:
11
+ buffer += chunk
12
+ yield buffer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def process_answer(img, answer):
 
 
 
 
 
 
 
 
 
 
15
  return gr.update(visible=False, value=None)
16
 
17
  with gr.Blocks() as demo:
18
  gr.Markdown(
19
  """
20
  # 🌔 moondream2
21
+ A tiny vision language model. Check out other capabilities (object detection, pointing etc.) in the [Moondream Playground](https://moondream.ai/playground).
22
  """
23
  )
24
  with gr.Row():