breadlicker45 committed
Commit eb23a74 · verified · 1 Parent(s): e7dd0dd

Update app.py

Files changed (1)
app.py: +7 −3
app.py CHANGED
@@ -34,7 +34,7 @@ def load_model():
 
 
 @spaces.GPU(duration=120)  # Increased timeout to 120 seconds
-def process_image_and_text(image_pil, text_input, num_beams):
+def process_image_and_text(image_pil, text_input, num_beams, temperature, seed):
     """Extract text from image using PaliGemma2."""
     try:
         processor, model = load_model()
@@ -44,16 +44,18 @@ def process_image_and_text(image_pil, text_input, num_beams):
         image = load_image(image_pil)
 
         # Add <image> token to the beginning of the text prompt
-        text_input = " "+text_input
+        text_input = "<image> " + text_input
 
         # Use the provided text input
         model_inputs = processor(text=text_input, images=image, return_tensors="pt").to(
             device, dtype=torch.bfloat16
         )
         input_len = model_inputs["input_ids"].shape[-1]
+
+        torch.manual_seed(seed)  # Set random seed for reproducibility
 
         with torch.inference_mode():
-            generation = model.generate(**model_inputs, max_new_tokens=200, do_sample=False, num_beams=num_beams)
+            generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=True, num_beams=num_beams, temperature=temperature)
             generation = generation[0][input_len:]
             decoded = processor.decode(generation, skip_special_tokens=True)
 
@@ -70,6 +72,8 @@ if __name__ == "__main__":
             gr.Image(type="pil", label="Upload an image"),
             gr.Textbox(label="Enter Text Prompt"),
             gr.Slider(minimum=1, maximum=10, step=1, value=1, label="Number of Beams"),
+            gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature"),
+            gr.Number(label="Random Seed", value=42, precision=0),
         ],
         outputs=gr.Textbox(label="Generated Text"),
         title="PaliGemma2 Image and Text to Text",