Spaces:

virendravaishnav
/

po-fetch-detail

Running

virendravaishnav committed on Sep 13, 2024

Commit

6dadcd1

1 Parent(s): e8cad36

Updated with OCR model and Gradio integration

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,27 +1,38 @@
 import gradio as gr
-from transformers import AutoModelForVision2Seq, AutoTokenizer
-# Load the Hugging Face model and tokenizer, trusting remote code
-model_name = "OpenGVLab/InternVL2-1B"
-tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-model = AutoModelForVision2Seq.from_pretrained(model_name, trust_remote_code=True)
 # Function to process and describe the image
 def analyze_image(image):
-    # Convert PIL image to RGB if needed
     img = image.convert("RGB")
-    # Tokenize the input
     inputs = tokenizer("describe this image", return_tensors="pt")
-    # Perform inference
     outputs = model.generate(**inputs)
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
 # Gradio interface for image input
 demo = gr.Interface(
     fn=analyze_image,
-    inputs=gr.Image(type="pil"),  # Upload an image
-    outputs="text",  # Output the extracted text
-    title="Image Description using OpenGVLab/InternVL2-1B",
     description="Upload an image and get a description generated by the InternVL2-1B model."
 )

 import gradio as gr
+from transformers import AutoTokenizer
+from huggingface_hub import snapshot_download
+import sys
+import os
+# Download the model snapshot
+repo_id = "OpenGVLab/InternVL2-1B"
+model_dir = snapshot_download(repo_id)
+# Add the model directory to the Python path for dynamic imports
+sys.path.append(model_dir)
+# Import the custom configuration and model classes
+from configuration_internvl_chat import InternVLChatConfig
+from modeling_internvl_chat import InternVLForVision2Seq
+# Load the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+config = InternVLChatConfig.from_pretrained(repo_id, trust_remote_code=True)
+model = InternVLForVision2Seq.from_pretrained(repo_id, config=config, trust_remote_code=True)
 # Function to process and describe the image
 def analyze_image(image):
     """Describe an uploaded image with the globally loaded model.

     Args:
         image: PIL image supplied by the Gradio image input, or None when
             nothing was uploaded.

     Returns:
         The text produced by the model for the image, or a short message
         asking for an upload when ``image`` is None.
     """
     if image is None:
         return "Please upload an image."
     # Local imports keep this block self-contained; torch is already listed
     # in requirements.txt.
     import numpy as np
     import torch
     # The previous version converted the image but passed only the text
     # prompt to the model, so the picture never influenced the output.
     # NOTE(review): the 448x448 size and ImageNet mean/std are assumed to
     # match the model card's preprocessing -- confirm.
     img = image.convert("RGB").resize((448, 448))
     pixels = torch.from_numpy(np.asarray(img, dtype=np.float32) / 255.0)
     pixels = pixels.permute(2, 0, 1)  # HWC -> CHW
     mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
     std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
     pixel_values = ((pixels - mean) / std).unsqueeze(0)
     pixel_values = pixel_values.to(device=model.device, dtype=model.dtype)
     # NOTE(review): assumes `model` exposes InternVL2's
     # chat(tokenizer, pixel_values, question, generation_config) and returns
     # the answer as a string -- confirm against the model card.
     generation_config = {"max_new_tokens": 256, "do_sample": False}
     return model.chat(
         tokenizer,
         pixel_values,
         "<image>\ndescribe this image",
         generation_config,
     )
 # Gradio interface: one image upload in, the string returned by
 # analyze_image out.
 demo = gr.Interface(
     fn=analyze_image,
+    inputs=gr.Image(type="pil"),  # hand the upload to fn as a PIL image
+    outputs="text",  # show fn's return value as plain text
+    title="Image Description using InternVL2-1B",
     description="Upload an image and get a description generated by the InternVL2-1B model."
 )

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 transformers==4.31.0
 gradio==3.28.3
 torch>=1.9
 Pillow==9.4.0

 transformers==4.31.0
+huggingface_hub==0.16.4
 gradio==3.28.3
 torch>=1.9
 Pillow==9.4.0