virendravaishnav committed
Commit 1082f74 · 1 Parent(s): 657edd9

Updated with OCR model and Gradio integration

Files changed (1)
  app.py  +10 -21
app.py CHANGED
@@ -1,11 +1,10 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoProcessor, AutoModel
+from transformers import AutoProcessor, AutoModel
 import torch

 repo_id = "OpenGVLab/InternVL2-1B"

-# Load the tokenizer, processor, and model directly from the Hub
-tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+# Load the processor and model directly from the Hub
 processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
 model = AutoModel.from_pretrained(
     repo_id,
@@ -20,26 +19,16 @@ model.to(device)
 def analyze_image(image):
     try:
         img = image.convert("RGB")
-
-        # Process the image
-        image_inputs = processor(images=img, return_tensors="pt")
-        # Process the text
-        text_inputs = tokenizer("describe this image", return_tensors="pt")
-
-        # Move inputs to the appropriate device
-        image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
-        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
-
-        # Combine the inputs
-        inputs = {
-            "input_ids": text_inputs["input_ids"],
-            "attention_mask": text_inputs["attention_mask"],
-            "pixel_values": image_inputs["pixel_values"],
-        }
-
+        text = "describe this image"
+
+        # Process both image and text together
+        inputs = processor(images=img, text=text, return_tensors="pt").to(device)
+
         # Generate outputs
         outputs = model.generate(**inputs)
-        return tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        # Decode the output using the processor
+        return processor.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
         return f"An error occurred: {str(e)}"
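Note: neither hunk shows the rest of the from_pretrained(...) call or where device is defined; the second hunk's context line (model.to(device)) only implies that setup. A minimal sketch of what that elided part of app.py presumably looks like, with the dtype and device selection being assumptions rather than values taken from this commit:

# Sketch of the elided model/device setup (assumed, not shown in the diff)
import torch
from transformers import AutoModel

repo_id = "OpenGVLab/InternVL2-1B"
model = AutoModel.from_pretrained(
    repo_id,
    torch_dtype=torch.float16,  # assumed dtype; the real app.py may differ
    trust_remote_code=True,
)
device = "cuda" if torch.cuda.is_available() else "cpu"  # pick GPU when available
model.to(device)
model.eval()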
 
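The commit message mentions Gradio integration, but the hunks stop before the part of app.py that exposes analyze_image as an app. A minimal sketch of that wiring, assuming a PIL image input and a plain-text output (the actual component choices may differ):

# Assumed Gradio wiring for analyze_image; not part of the diff above
import gradio as gr

demo = gr.Interface(
    fn=analyze_image,             # the function refactored in this commit
    inputs=gr.Image(type="pil"),  # PIL input matches image.convert("RGB") above
    outputs="text",
    title="InternVL2-1B demo",
)

if __name__ == "__main__":
    demo.launch()

With type="pil", Gradio hands analyze_image a PIL image, which is why the function can call image.convert("RGB") directly.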