Spaces:

ARCQUB
/

BPMN-entity-extractor

Sleeping

App Files Files Community

ARCQUB committed 24 days ago

Commit

8093104

verified ·

1 Parent(s): c395a91

Update models/aya_vision.py

Browse files

Files changed (1) hide show

models/aya_vision.py +128 -128

models/aya_vision.py CHANGED Viewed

@@ -1,128 +1,128 @@
-import os
-import json
-import re
-from PIL import Image
-import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText
-from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
-# Set Hugging Face Token
-hf_token = os.getenv("HF_TOKEN")
-# Initialize Aya Vision Model
-model_id = "CohereForAI/aya-vision-8b"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForImageTextToText.from_pretrained(
-    model_id, device_map="auto", torch_dtype=torch.float16
-)
-# Initialize Pix2Struct OCR Model
-ocr_processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
-ocr_model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")
-# Load prompt
-def load_prompt():
-    with open("/content/vision_model_space/vision_model_space_new/prompts/prompt.txt", "r", encoding="utf-8") as f:
-        return f.read()
-# Try extracting JSON from model output
-def try_extract_json(text):
-    if not text or not text.strip():
-        return None
-    try:
-        return json.loads(text)
-    except json.JSONDecodeError:
-        # Try extracting JSON substring by brace balancing
-        start = text.find('{')
-        if start == -1:
-            return None
-        brace_count = 0
-        json_candidate = ''
-        for i in range(start, len(text)):
-            char = text[i]
-            if char == '{':
-                brace_count += 1
-            elif char == '}':
-                brace_count -= 1
-            json_candidate += char
-            if brace_count == 0:
-                break
-        try:
-            return json.loads(json_candidate)
-        except json.JSONDecodeError:
-            return None
-# Extract OCR text using Pix2Struct
-def extract_all_text_pix2struct(image: Image.Image):
-    inputs = ocr_processor(images=image, return_tensors="pt")
-    predictions = ocr_model.generate(**inputs, max_new_tokens=512)
-    output_text = ocr_processor.decode(predictions[0], skip_special_tokens=True)
-    return output_text.strip()
-# Assign event/gateway names from OCR text
-def assign_event_gateway_names_from_ocr(json_data: dict, ocr_text: str):
-    if not ocr_text or not json_data:
-        return json_data
-    lines = [line.strip() for line in ocr_text.split('\n') if line.strip()]
-    def assign_best_guess(obj):
-        if not obj.get("name") or obj["name"].strip() == "":
-            obj["name"] = "(label unknown)"
-    for evt in json_data.get("events", []):
-        assign_best_guess(evt)
-    for gw in json_data.get("gateways", []):
-        assign_best_guess(gw)
-    return json_data
-# Run Aya model on image
-def run_model(image: Image.Image):
-    prompt = load_prompt()
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": prompt}
-            ]
-        }
-    ]
-    inputs = processor.apply_chat_template(
-        messages,
-        padding=True,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors="pt"
-    ).to(model.device)
-    gen_tokens = model.generate(
-        **inputs,
-        max_new_tokens=5000,
-        do_sample=True,
-        temperature=0.3,
-    )
-    output_text = processor.tokenizer.decode(
-        gen_tokens[0][inputs.input_ids.shape[1]:],
-        skip_special_tokens=True
-    )
-    parsed_json = try_extract_json(output_text)
-    # Apply OCR post-processing
-    ocr_text = extract_all_text_pix2struct(image)
-    parsed_json = assign_event_gateway_names_from_ocr(parsed_json, ocr_text)
-    # Return both parsed and raw
-    return {
-        "json": parsed_json,
-        "raw": output_text
-    }

+import os
+import json
+import re
+from PIL import Image
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText
+from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+# Set Hugging Face Token
+hf_token = os.getenv("HF_TOKEN")
+# Initialize Aya Vision Model
+model_id = "CohereForAI/aya-vision-8b"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_id, device_map="auto", torch_dtype=torch.float16
+)
+# Initialize Pix2Struct OCR Model
+ocr_processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
+ocr_model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")
+# Load prompt
+def load_prompt():
+    with open("prompts/prompt.txt", "r", encoding="utf-8") as f:
+        return f.read()
+# Try extracting JSON from model output
+def try_extract_json(text):
+    if not text or not text.strip():
+        return None
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        # Try extracting JSON substring by brace balancing
+        start = text.find('{')
+        if start == -1:
+            return None
+        brace_count = 0
+        json_candidate = ''
+        for i in range(start, len(text)):
+            char = text[i]
+            if char == '{':
+                brace_count += 1
+            elif char == '}':
+                brace_count -= 1
+            json_candidate += char
+            if brace_count == 0:
+                break
+        try:
+            return json.loads(json_candidate)
+        except json.JSONDecodeError:
+            return None
+# Extract OCR text using Pix2Struct
+def extract_all_text_pix2struct(image: Image.Image):
+    inputs = ocr_processor(images=image, return_tensors="pt")
+    predictions = ocr_model.generate(**inputs, max_new_tokens=512)
+    output_text = ocr_processor.decode(predictions[0], skip_special_tokens=True)
+    return output_text.strip()
+# Assign event/gateway names from OCR text
+def assign_event_gateway_names_from_ocr(json_data: dict, ocr_text: str):
+    if not ocr_text or not json_data:
+        return json_data
+    lines = [line.strip() for line in ocr_text.split('\n') if line.strip()]
+    def assign_best_guess(obj):
+        if not obj.get("name") or obj["name"].strip() == "":
+            obj["name"] = "(label unknown)"
+    for evt in json_data.get("events", []):
+        assign_best_guess(evt)
+    for gw in json_data.get("gateways", []):
+        assign_best_guess(gw)
+    return json_data
+# Run Aya model on image
+def run_model(image: Image.Image):
+    prompt = load_prompt()
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": prompt}
+            ]
+        }
+    ]
+    inputs = processor.apply_chat_template(
+        messages,
+        padding=True,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt"
+    ).to(model.device)
+    gen_tokens = model.generate(
+        **inputs,
+        max_new_tokens=5000,
+        do_sample=True,
+        temperature=0.3,
+    )
+    output_text = processor.tokenizer.decode(
+        gen_tokens[0][inputs.input_ids.shape[1]:],
+        skip_special_tokens=True
+    )
+    parsed_json = try_extract_json(output_text)
+    # Apply OCR post-processing
+    ocr_text = extract_all_text_pix2struct(image)
+    parsed_json = assign_event_gateway_names_from_ocr(parsed_json, ocr_text)
+    # Return both parsed and raw
+    return {
+        "json": parsed_json,
+        "raw": output_text
+    }