Spaces:

virendravaishnav
/

po-fetch-detail

Running

virendravaishnav committed on Sep 12, 2024

Commit

50cb395

1 Parent(s): 4cf03a8

Updated with OCR model and Gradio integration

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,20 +1,27 @@
 import gradio as gr
-from PIL import Image
-from transformers import AutoTokenizer, AutoModelForVision2Seq
-# Load the tokenizer and model
 model_name = "OpenGVLab/InternVL2-1B"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-model = AutoModelForVision2Seq.from_pretrained(model_name, trust_remote_code=True)
-# Custom load image function using PIL
-def load_image(image_path):
-    return Image.open(image_path)
 # Function to process and describe the image
 def analyze_image(image):
     # Use PIL to load the image
-    img = load_image(image)
     # Tokenize the input
     inputs = tokenizer("describe this image", return_tensors="pt")
     # Perform inference
@@ -27,8 +34,4 @@ demo = gr.Interface(
     inputs=gr.Image(type="pil"),  # Upload an image
     outputs="text",  # Output the extracted text
     title="Image Description using OpenGVLab/InternVL2-1B",
-    description="Upload an image and get a description generated by the InternVL2-1B model."
-)
-if __name__ == "__main__":
-    demo.launch(share=True)

 import gradio as gr
+from transformers import AutoTokenizer
+from huggingface_hub import hf_hub_download
+# Import the custom model code dynamically
+import sys
+sys.path.append(hf_hub_download(repo_id="OpenGVLab/InternVL2-1B", filename=""))  # Adjust path
+# Load the custom model and tokenizer
 model_name = "OpenGVLab/InternVL2-1B"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+# Import the custom model class from the downloaded files
+from transformers_modules.OpenGVLab.InternVL2-1B.configuration_internvl_chat import InternVLChatConfig
+from transformers_modules.OpenGVLab.InternVL2-1B.modeling_internvl import InternVLForVision2Seq
+# Load the model
+config = InternVLChatConfig.from_pretrained(model_name, trust_remote_code=True)
+model = InternVLForVision2Seq.from_pretrained(model_name, config=config, trust_remote_code=True)
 # Function to process and describe the image
 def analyze_image(image):
     # Use PIL to load the image
+    img = image.convert("RGB")
     # Tokenize the input
     inputs = tokenizer("describe this image", return_tensors="pt")
     # Perform inference
     inputs=gr.Image(type="pil"),  # Upload an image
     outputs="text",  # Output the extracted text
     title="Image Description using OpenGVLab/InternVL2-1B",
+    description="Upload⬤

requirements.txt CHANGED Viewed

@@ -5,4 +5,5 @@ gradio
 datasets
 pytesseract
 Pillow
-lmdeploy

 datasets
 pytesseract
 Pillow
+lmdeploy
+huggingface_hub