virendravaishnav committed
Commit 1082f74 · 1 Parent(s): 657edd9

Updated with OCR model and Gradio integration

Files changed (1)
  app.py  +10 -21
app.py CHANGED
@@ -1,11 +1,10 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoProcessor, AutoModel
+from transformers import AutoProcessor, AutoModel
 import torch

 repo_id = "OpenGVLab/InternVL2-1B"

-# Load the tokenizer, processor, and model directly from the Hub
-tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+# Load the processor and model directly from the Hub
 processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
 model = AutoModel.from_pretrained(
     repo_id,
@@ -20,26 +19,16 @@ model.to(device)
 def analyze_image(image):
     try:
         img = image.convert("RGB")
-
-        # Process the image
-        image_inputs = processor(images=img, return_tensors="pt")
-        # Process the text
-        text_inputs = tokenizer("describe this image", return_tensors="pt")
-
-        # Move inputs to the appropriate device
-        image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
-        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
-
-        # Combine the inputs
-        inputs = {
-            "input_ids": text_inputs["input_ids"],
-            "attention_mask": text_inputs["attention_mask"],
-            "pixel_values": image_inputs["pixel_values"],
-        }
-
+        text = "describe this image"
+
+        # Process both image and text together
+        inputs = processor(images=img, text=text, return_tensors="pt").to(device)
+
         # Generate outputs
         outputs = model.generate(**inputs)
-        return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        # Decode the output using the processor
+        return processor.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
         return f"An error occurred: {str(e)}"
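Note: neither hunk shows the rest of the from_pretrained(...) call or where device is defined; the second hunk's context line (model.to(device)) only implies that setup. A minimal sketch of what that elided part of app.py presumably looks like, with the dtype and device selection being assumptions rather than values taken from this commit:

# Sketch of the elided model/device setup (assumed, not shown in the diff)
import torch
from transformers import AutoModel

repo_id = "OpenGVLab/InternVL2-1B"
model = AutoModel.from_pretrained(
    repo_id,
    torch_dtype=torch.float16,  # assumed dtype; the real app.py may differ
    trust_remote_code=True,
)
device = "cuda" if torch.cuda.is_available() else "cpu"  # pick GPU when available
model.to(device)
model.eval()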
 
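The commit message mentions Gradio integration, but the hunks stop before the part of app.py that exposes analyze_image as an app. A minimal sketch of that wiring, assuming a PIL image input and a plain-text output (the actual component choices may differ):

# Assumed Gradio wiring for analyze_image; not part of the diff above
import gradio as gr

demo = gr.Interface(
    fn=analyze_image,             # the function refactored in this commit
    inputs=gr.Image(type="pil"),  # PIL input matches image.convert("RGB") above
    outputs="text",
    title="InternVL2-1B demo",
)

if __name__ == "__main__":
    demo.launch()

With type="pil", Gradio hands analyze_image a PIL image, which is why the function can call image.convert("RGB") directly.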