breadlicker45 committed on
Commit
45113e4
·
verified ·
1 Parent(s): 1c21246

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -9
app.py CHANGED
@@ -23,7 +23,7 @@ def load_model():
23
  )
24
 
25
  # Load the processor and model using the correct identifier
26
- model_id = "google/paligemma2-10b-pt-448"
27
  processor = PaliGemmaProcessor.from_pretrained(model_id, use_auth_token=token)
28
  device = "cuda" if torch.cuda.is_available() else "cpu"
29
  model = PaliGemmaForConditionalGeneration.from_pretrained(
@@ -34,7 +34,7 @@ def load_model():
34
 
35
 
36
  @spaces.GPU(duration=120) # Increased timeout to 120 seconds
37
- def process_image_and_text(image_pil, text_input, num_beams, temperature, seed):
38
  """Extract text from image using PaliGemma2."""
39
  try:
40
  processor, model = load_model()
@@ -43,18 +43,21 @@ def process_image_and_text(image_pil, text_input, num_beams, temperature, seed):
43
  # Load the image using load_image
44
  image = load_image(image_pil)
45
 
46
- prompt = ""
 
47
 
48
  # Use the provided text input
49
- model_inputs = processor(text=prompt, images=image, return_tensors="pt").to(
50
  device, dtype=torch.bfloat16
51
  )
52
  input_len = model_inputs["input_ids"].shape[-1]
53
-
54
- torch.manual_seed(seed) # Set random seed for reproducibility
 
 
55
 
56
  with torch.inference_mode():
57
- generation = model.generate(**model_inputs, max_new_tokens=500, do_sample=True, num_beams=num_beams, temperature=temperature)
58
  generation = generation[0][input_len:]
59
  decoded = processor.decode(generation, skip_special_tokens=True)
60
 
@@ -71,10 +74,10 @@ if __name__ == "__main__":
71
  gr.Image(type="pil", label="Upload an image"),
72
  gr.Slider(minimum=1, maximum=10, step=1, value=1, label="Number of Beams"),
73
  gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"),
74
- gr.Number(label="Random Seed", value=0, precision=0),
75
  ],
76
  outputs=gr.Textbox(label="Generated Text"),
77
- title="PaliGemma2 Image and Text to Text",
78
  description="Upload an image and enter a text prompt. The model will generate text based on both.",
79
  )
80
  iface.launch()
 
23
  )
24
 
25
  # Load the processor and model using the correct identifier
26
+ model_id = "google/paligemma2-28b-pt-896"
27
  processor = PaliGemmaProcessor.from_pretrained(model_id, use_auth_token=token)
28
  device = "cuda" if torch.cuda.is_available() else "cpu"
29
  model = PaliGemmaForConditionalGeneration.from_pretrained(
 
34
 
35
 
36
  @spaces.GPU(duration=120) # Increased timeout to 120 seconds
37
+ def process_image_and_text(image_pil, num_beams, temperature, seed):
38
  """Extract text from image using PaliGemma2."""
39
  try:
40
  processor, model = load_model()
 
43
  # Load the image using load_image
44
  image = load_image(image_pil)
45
 
46
+ # Add <image> token to the beginning of the text prompt
47
+ text_input = " "
48
 
49
  # Use the provided text input
50
+ model_inputs = processor(text=text_input, images=image, return_tensors="pt").to(
51
  device, dtype=torch.bfloat16
52
  )
53
  input_len = model_inputs["input_ids"].shape[-1]
54
+
55
+ # Set random seed for reproducibility, only if a seed is provided
56
+ if seed is not None:
57
+ torch.manual_seed(int(seed))
58
 
59
  with torch.inference_mode():
60
+ generation = model.generate(**model_inputs, max_new_tokens=200, do_sample=True, num_beams=num_beams, temperature=temperature)
61
  generation = generation[0][input_len:]
62
  decoded = processor.decode(generation, skip_special_tokens=True)
63
 
 
74
  gr.Image(type="pil", label="Upload an image"),
75
  gr.Slider(minimum=1, maximum=10, step=1, value=1, label="Number of Beams"),
76
  gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"),
77
+ gr.Number(label="Random Seed", value=42, precision=0, allow_none=True),
78
  ],
79
  outputs=gr.Textbox(label="Generated Text"),
80
+ title="PaliGemma2 Image to Text",
81
  description="Upload an image and enter a text prompt. The model will generate text based on both.",
82
  )
83
  iface.launch()