Commit · 30857ba
Parent(s): 23ca6b0

Added source code
Files changed:
- Dockerfile            +5   -20
- src/app.py            +370 -0
- src/config.py         +2   -0
- src/predict_output.py +559 -0
- src/requirements.txt  +175 -0
Dockerfile
CHANGED
@@ -1,21 +1,6 @@
-FROM python:3.
-
-
-
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    software-properties-common \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-
-COPY requirements.txt ./
-COPY src/ ./src/
-
-RUN pip3 install -r requirements.txt
-
+FROM python:3.10
+WORKDIR /src
+COPY . /src
+RUN pip install -r requirements.txt
 EXPOSE 8501
-
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
+CMD ["streamlit", "run", "main/demo/app.py", "--server.port=8501", "--server.address=0.0.0.0"]
src/app.py
ADDED
@@ -0,0 +1,370 @@
import streamlit as st
import time
import os
import shutil
import pymupdf
import json

st.set_page_config(
    page_title="MGVG Grounding Demo",
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="logo.png"
)

# --- Simple Authentication ---
import streamlit as st
import time

# Define your valid credentials
VALID_USERS = {
    "iitb": "iitb123",
    "badri": "badri123"
}

def login():
    # Set a professional background for the whole app
    st.markdown(
        '''
        <style>
        body, .stApp {
            background: linear-gradient(120deg, #e0eafc 0%, #cfdef3 100%) !important;
        }
        .login-box {
            background: #fff;
            padding: 2.5em 2em 2em 2em;
            border-radius: 16px;
            box-shadow: 0 4px 24px rgba(80, 120, 200, 0.12);
            min-width: 320px;
            max-width: 90vw;
            margin: auto;
        }
        </style>
        ''', unsafe_allow_html=True
    )
    # Center the login box using columns
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        # st.markdown('<div class="login-box">', unsafe_allow_html=True)
        # image at center
        st.image("logo.png", width=800, use_container_width=False)
        st.markdown('<h2 style="text-align:center; color:#2b6cb0; margin-bottom:1.5em;">🔒 Please log in to access the app</h2>', unsafe_allow_html=True)
        username = st.text_input("Username", key="login_username")
        password = st.text_input("Password", type="password", key="login_password")
        login_btn = st.button("Login")
        if login_btn:
            if username in VALID_USERS and VALID_USERS[username] == password:
                st.session_state["authenticated"] = True
                st.success("Login successful!")
                st.session_state["show_continue"] = True
            else:
                st.error("Invalid username or password")
        if st.session_state.get("show_continue", False):
            if st.button("Continue to App"):
                st.session_state["show_continue"] = False
                st.experimental_rerun() if hasattr(st, "experimental_rerun") else None
        st.markdown('</div>', unsafe_allow_html=True)

if "authenticated" not in st.session_state:
    st.session_state["authenticated"] = False

if not st.session_state["authenticated"]:
    login()
    st.stop()
# --- End Authentication ---

# st.image("logo.png", width=250)

from PIL import Image, ImageDraw
import io
# from st_audiorec import st_audiorec

from surya.layout import LayoutPredictor
from doctr.models import ocr_predictor
from transformers import pipeline

@st.cache_resource
def get_layout_predictor():
    return LayoutPredictor()

@st.cache_resource
def get_ocr_model():
    return ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

@st.cache_resource
def get_llm_model(device):
    return pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B-Instruct", device=device)

from predict_output import predict_output


layout_predictor = get_layout_predictor()
model = get_ocr_model()
pipe = get_llm_model("cuda")

print("Models loaded")

# --- Placeholder function for demo ---
def get_corresponding_bboxes(image, question):
    # Returns dummy bounding boxes and answer for demo
    # Each bbox: (x1, y1, x2, y2)
    w, h = image.size
    block_bboxes = [(w//8, h//8, w//2, h//2)]
    line_bboxes = [(w//4, h//4, w//2, h//3)]
    word_bboxes = [(w//3, h//3, w//2, h//2)]
    point_bboxes = [(w//2, h//2, w//2+5, h//2+5)]
    answer = "This is a demo answer."
    return block_bboxes, line_bboxes, word_bboxes, point_bboxes, answer

# --- Helper to draw bboxes ---
def draw_bboxes(image, bboxes, color):
    img = image.copy()
    # width proportional to the image size
    width = int(img.width/100)
    draw = ImageDraw.Draw(img)
    for bbox in bboxes:
        draw.rectangle(bbox, outline=color, width=width)
    return img

def draw_points(image, bboxes, color):
    img = image.copy()
    width = int(img.width)
    draw = ImageDraw.Draw(img)
    for bbox in bboxes:
        # x1, y1, x2, y2 = bbox
        cx, cy = bbox[0], bbox[1]
        # r being relative to the image size
        r = int(img.width/100)
        draw.ellipse((cx-r, cy-r, cx+r, cy+r), outline=color, width=width, fill=color)
    return img

# model_type = st.sidebar.checkbox("Use LLM Model", value=False)
# model_type = "llm" if model_type else "inhouse"

st.markdown("""
<style>
.main {
    background: linear-gradient(135deg, #f8fafc 0%, #e0e7ef 100%);
}
.block-container {
    padding-top: 2rem;
    padding-bottom: 2rem;
}
.stButton>button {
    background-color: #4F8BF9;
    color: white;
    border-radius: 8px;
    font-size: 1.1rem;
    padding: 0.5em 2em;
}
.stTextInput>div>input {
    border-radius: 8px;
    border: 1px solid #4F8BF9;
}
.stFileUploader>div>div {
    border-radius: 8px;
    border: 2px dashed #4F8BF9;
}
.stAudio>audio {
    width: 100% !important;
}
</style>
""", unsafe_allow_html=True)

col_logo, col_title = st.columns([1, 8])
with col_logo:
    st.image("logo.png", width=180)
with col_title:
    st.markdown("<h1 style='margin-bottom: 0;'>MGVG - Multi-Granular Visual Grounding</h1>", unsafe_allow_html=True)

# List of quotes (HTML formatted)
QUOTES = [
    '''<div style="color: #2b6cb0; font-size: 1.3em; font-weight: 500; margin-bottom: 1em;">
    "प्रत्यक्षं किं प्रमाणं?" <span style="font-size:0.9em; color:#444;">(<i>What better proof is there than direct perception?)</i></span>
    </div>''',
    '''<div style="color: #2b6cb0; font-size: 1.3em; font-weight: 500; margin-bottom: 1em;">
    <i>"Truth is not told—it is seen."</i>
    </div>'''
]

# Initialize session state for quote index and last update time
if "quote_index" not in st.session_state:
    st.session_state.quote_index = 0
    st.session_state.last_quote_time = time.time()

# Check if 5 seconds have passed
if time.time() - st.session_state.last_quote_time > 5:
    st.session_state.quote_index = (st.session_state.quote_index + 1) % len(QUOTES)
    st.session_state.last_quote_time = time.time()
    # Rerun the app to update the quote
    if hasattr(st, "experimental_rerun"):
        st.experimental_rerun()

# Display the current quote
st.markdown(QUOTES[st.session_state.quote_index], unsafe_allow_html=True)

col1, col2 = st.columns([1, 2])

with col1:
    st.subheader("1. Upload Image or pdf document")
    image = "Not Uploaded"
    uploaded_file = st.file_uploader("Choose an image", type=["png", "jpg", "jpeg", "pdf"])
    if uploaded_file:
        current_dir = os.getcwd()
        temp_output_folder = os.path.join(current_dir, "temp_output_folder/")
        # delete the temp_output_folder
        if os.path.exists(temp_output_folder):
            shutil.rmtree(temp_output_folder)

        document_type = "image"
        if uploaded_file.type == "application/pdf":

            # save the uploaded file to a temp file
            temp_file_path = os.path.join(current_dir, "temp_file.pdf")

            # delete the temp_file_path
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            if not os.path.exists(temp_output_folder):
                os.makedirs(temp_output_folder)
            # output_file = simple_counter_generator("page", ".jpg")
            # convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt=jpg_options, output_file=output_file)

            pages = 0
            doc = pymupdf.open(temp_file_path)  # open document
            for page in doc:  # iterate through the pages
                pages += 1
                pix = page.get_pixmap()  # render page to an image
                pix.save(f"{temp_output_folder}/{page.number}.png")

            if(pages == 1):
                document_type = "image"
                document_path = os.path.join(temp_output_folder, "0.png")
                uploaded_file = os.path.join(temp_output_folder, "0.png")
                image = Image.open(uploaded_file).convert("RGB")
            else:
                document_type = "pdf"
                # image = Image.open(uploaded_file).convert("RGB")

        if document_type == "image":
            image = Image.open(uploaded_file).convert("RGB")
            st.image(image, caption="Uploaded Image", use_container_width=True)
            # Save uploaded image to a temp file for predict_output
            temp_file_path = "sample.png"
            image.save(temp_file_path)
        else:
            document_type = "pdf"
            document_path = uploaded_file.name
            image = "Uploaded PDF"
            # st.image(uploaded_file, caption="Uploaded PDF", use_container_width=True)
    else:
        image = "Not Uploaded"
        temp_output_folder = None
        st.image("https://placehold.co/400x300?text=Upload+Image", caption="Uploaded Image", use_container_width=True)

    st.subheader("2. Ask a question")
    question = st.text_input("Type your question here")

    # Add radio button for model selection
    model_type = st.radio(
        "Select Model Type:",
        options=["MGVG", "IndoDocs"],
        index=1,
        horizontal=True
    )

    run_demo = st.button("Run Grounding Demo", use_container_width=True)

# --- Output placeholders ---
with col2:
    st.subheader("3. Visual Grounding Outputs")
    if image != "Not Uploaded" and (question):
        print(image)
        print(question)
    if run_demo and image != "Not Uploaded" and (question):
        # Use text input only
        q = question
        answer, block_bboxes, line_bboxes, word_bboxes, point_bboxes, current_page = predict_output(
            temp_file_path, q, pipe, layout_predictor, model, model_type, document_type
        )

        # print(block_bboxes)
        # print(line_bboxes)
        # print(word_bboxes)
        # print(point_bboxes)
        print(answer)

        if(current_page != -1):
            image = Image.open(os.path.join(temp_output_folder, f"{current_page}.png")).convert("RGB")
            print("--------------------------------")
            print(image)

        block_img = draw_bboxes(image, block_bboxes, color="#4F8BF9")
        line_img = draw_bboxes(image, line_bboxes, color="#F97B4F")
        word_img = draw_bboxes(image, word_bboxes, color="#4FF9B2")
        point_img = draw_points(image, point_bboxes, color="#FFFF00")
        imgs = [block_img, line_img, word_img, point_img]
        labels = ["Block Level", "Line Level", "Word Level", "Point Level"]
        cols = st.columns(4)
        for i, (img, label) in enumerate(zip(imgs, labels)):
            with cols[i]:
                st.image(img, caption=label, use_container_width=True)
        answer_lines = answer.splitlines()
        st.markdown("""
        <div style='background: #f1f5fa; border-radius: 10px; padding: 1em 2em; border: 1.5px solid #4F8BF9;'>
        <h4 style='color: #4F8BF9;'>Predicted Answer:</h4>
        <p style='font-size: 1.2em; color: #222;'>""" + "<br>".join(answer_lines) + """</p>
        </div>
        """, unsafe_allow_html=True)

        # --- Centered Save Results Button ---
        result_data = {
            "question": q,
            "answer": answer,
            "block_bboxes": block_bboxes,
            "line_bboxes": line_bboxes,
            "word_bboxes": word_bboxes,
            "point_bboxes": point_bboxes,
            "current_page": current_page
        }
        json_str = json.dumps(result_data, indent=2)
        col_left, col_center, col_right = st.columns([2, 3, 2])
        with col_center:
            st.download_button(
                label="Save Results as JSON",
                data=json_str,
                file_name="grounding_results.json",
                mime="application/json"
            )
    else:
        st.markdown("""
        <div style='display: flex; gap: 2em; flex-wrap: wrap;'>
            <div style='flex: 1; min-width: 220px;'>
                <img src='https://placehold.co/220x180?text=Block+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
                <p style='text-align:center; font-weight:600;'>Block Level</p>
            </div>
            <div style='flex: 1; min-width: 220px;'>
                <img src='https://placehold.co/220x180?text=Line+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
                <p style='text-align:center; font-weight:600;'>Line Level</p>
            </div>
            <div style='flex: 1; min-width: 220px;'>
                <img src='https://placehold.co/220x180?text=Word+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
                <p style='text-align:center; font-weight:600;'>Word Level</p>
            </div>
            <div style='flex: 1; min-width: 220px;'>
                <img src='https://placehold.co/220x180?text=Point+Level' style='width:100%; border-radius: 10px; border: 2px solid #4F8BF9;'>
                <p style='text-align:center; font-weight:600;'>Point Level</p>
            </div>
        </div>
        <br>
        <div style='background: #f1f5fa; border-radius: 10px; padding: 1em 2em; border: 1.5px solid #4F8BF9;'>
        <h4 style='color: #4F8BF9;'>Predicted Answer:</h4>
        <p style='font-size: 1.2em; color: #222;'>[Answer will appear here]</p>
        </div>
        """, unsafe_allow_html=True)
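
For reference, a minimal standalone sketch of the drawing pattern that draw_bboxes and draw_points implement above, using only Pillow. The page size and coordinates below are invented illustration values, not model outputs:

# Sketch: the same PIL calls used by draw_bboxes / draw_points in src/app.py,
# applied to a synthetic page. Assumes only Pillow is installed.
from PIL import Image, ImageDraw

page = Image.new("RGB", (800, 1000), "white")
draw = ImageDraw.Draw(page)

# Block-level box, as draw_bboxes renders it: outline width scales
# with the image width (img.width / 100).
bbox = (100, 125, 400, 500)  # (x1, y1, x2, y2), made up for the example
draw.rectangle(bbox, outline="#4F8BF9", width=page.width // 100)

# Point-level marker, as draw_points renders it: a filled circle of
# radius ~1% of the image width centred on (x, y).
cx, cy, r = 400, 500, page.width // 100
draw.ellipse((cx - r, cy - r, cx + r, cy + r), fill="#FFFF00")

page.save("annotated_sample.png")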
src/config.py
ADDED
@@ -0,0 +1,2 @@
MODEL1 = "MGVG"
MODEL2 = "IndoDocs"
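
Nothing in this commit imports config.py yet; a hypothetical wiring of these constants into the model selector in src/app.py could look like the sketch below (illustrative only, not part of the commit):

# Hypothetical sketch: replacing the hard-coded strings in app.py's
# st.radio call with the config.py constants.
import streamlit as st

from config import MODEL1, MODEL2  # "MGVG", "IndoDocs"

model_type = st.radio(
    "Select Model Type:",
    options=[MODEL1, MODEL2],
    index=1,
    horizontal=True,
)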
src/predict_output.py
ADDED
@@ -0,0 +1,559 @@
import os
import shutil
from fuzzywuzzy import fuzz
from tqdm import tqdm
from PIL import Image
import requests
# from surya.layout import LayoutPredictor

from doctr.io import DocumentFile
from pdf2image import convert_from_path
import pymupdf
# from doctr.models import ocr_predictor
import numpy as np
from time import time

pipe = None
layout_predictor = None

MAX_BLOCK_MATCHES = 2
MAX_LINE_MATCHES = 5
CUT_OFF_THRESHOLD = 60
QUESTION_WEIGHT = 0.2
ANSWER_WEIGHT = 0.8
LEVEL = "line"

jpg_options = {
    "quality": 100,
    "progressive": True,
    "optimize": False
}

stop_words = {'what', 'is', 'the', 'this', 'that', 'these', 'those', 'which', 'how', 'why', 'where', 'when', 'who', 'will', 'be', 'and', 'or', 'in', 'at', 'to', 'for', 'of', 'with', 'by'}

def longest_consecutive_range(indices):
    if not indices:
        return []

    indices = sorted(set(indices))
    longest = []
    current = [indices[0]]

    for i in range(1, len(indices)):
        if indices[i] == indices[i - 1] + 1:
            current.append(indices[i])
        else:
            if len(current) > len(longest):
                longest = current
            current = [indices[i]]

    if len(current) > len(longest):
        longest = current

    return longest


def get_word_level_matches(answer_text, top_k_matches):
    bboxes = []
    for match in top_k_matches:
        indices = []
        for index, word in enumerate(match['words']):
            if word['text'].lower() in answer_text.lower():
                # bboxes.append(word['bbox'])
                indices.append(index)
        longest_indices = longest_consecutive_range(indices)
        for index in longest_indices:
            bboxes.append(match['words'][index]['bbox'])
    return bboxes


def get_matched_regions(question_text, target_text, predictions, level):

    question_terms = [word.lower() for word in question_text.split() if word.lower() not in stop_words]
    matched_regions = []
    for region in predictions:
        region_text = region['text']
        region_copy = region.copy()

        if target_text.lower() in region_text.lower():
            region_copy['match_score'] = 100
            region_copy['match_details'] = {
                'exact_match': True,
                'answer_score': 100,
                'question_score': 100
            }
            matched_regions.append(region_copy)
            continue

        partial_score = fuzz.partial_ratio(target_text.lower(), region_text.lower())
        token_score = fuzz.token_set_ratio(target_text.lower(), region_text.lower())

        # Calculate length factor (preference for longer matches that contain meaningful content)
        target_len = len(target_text)
        region_len = len(region_text)
        length_factor = min(1.0, region_len / min(50, target_len))  # Cap at 1.0, adapt based on target length

        # Combine scores for answer with weights
        # Higher weight to token matching for longer texts, higher weight to partial matching for shorter texts
        if region_len > 10:
            answer_score = (partial_score * 0.3) + (token_score * 0.5) + (length_factor * 100 * 0.2)
        else:
            # For very short texts, reduce their overall score unless they're exact matches
            answer_score = (partial_score * 0.3) + (token_score * 0.4) + (length_factor * 100 * 0.3)
            if region_len < 5 and partial_score < 100:
                answer_score *= 0.5  # Penalize very short inexact matches

        # penalize shorter region_texts
        if region_len < 5:
            answer_score *= 0.5

        # Calculate fuzzy match scores for question terms using both methods
        partial_question_scores = [fuzz.partial_ratio(term, region_text.lower()) for term in question_terms]
        token_question_scores = [fuzz.token_set_ratio(term, region_text.lower()) for term in question_terms]

        # Get best scores for question terms
        best_partial_question = max(partial_question_scores) if partial_question_scores else 0
        best_token_question = max(token_question_scores) if token_question_scores else 0

        # Combine question scores
        question_score = (best_partial_question * 0.4) + (best_token_question * 0.6)

        # Combine scores (giving more weight to answer matches)
        combined_score = (answer_score * ANSWER_WEIGHT) + (question_score * QUESTION_WEIGHT)

        # print(combined_score)

        if combined_score >= CUT_OFF_THRESHOLD:
            region_copy['match_score'] = combined_score
            region_copy['match_details'] = {
                'exact_match': False,
                'answer_score': answer_score,
                'question_score': question_score,
                'answer_weight': ANSWER_WEIGHT,
                'question_weight': QUESTION_WEIGHT
            }
            matched_regions.append(region_copy)


    matched_regions.sort(key=lambda x: x['match_score'], reverse=True)

    # If no matches, reduce threshold by 20 and get the topmost single output
    if not matched_regions:
        new_threshold = max(CUT_OFF_THRESHOLD - 20, 0)  # Prevent negative threshold
        matched_regions = [region for region in matched_regions if region['match_score'] >= new_threshold]
        matched_regions.sort(key=lambda x: x['match_score'], reverse=True)
        if matched_regions:
            matched_regions = [matched_regions[0]]  # Only keep the topmost single output

    if level == "block":
        top_matches = matched_regions[:MAX_BLOCK_MATCHES]
    elif level == "line":
        top_matches = matched_regions[:MAX_LINE_MATCHES]
    return top_matches


def get_processed_text_for_llm(block_predictions, gap):
    final_text = ""
    for block_data in block_predictions:
        final_text += block_data['text'] + gap
    return final_text


def get_page_number(block_bboxes):
    pages = {}
    for block in block_bboxes:
        if block['page'] not in pages:
            pages[block['page']] = 1
        else:
            pages[block['page']] += 1

    print(pages)
    max_page = max(pages, key=pages.get)
    return max_page


def predict_output(document_path, question, pipe, layout_predictor, model, model_type, document_type="image"):

    predicted_answer = None
    block_box_predictions = None
    line_box_predictions = None
    word_box_predictions = None
    point_box_predictions = None


    curr_time = time()
    line_predictions, pages_count = get_line_predictions(document_path, model, document_type)
    line_time = time()
    print(f"Done with line predictions in {line_time - curr_time} seconds")
    curr_time = time()
    if(document_type == "pdf" and pages_count < 3):
        block_predictions = get_block_predictions(document_path, layout_predictor, model, document_type)
        gap = '\n\n\n'
    else:
        block_predictions = line_predictions
        gap = '\n'
    block_time = time()
    print(f"Done with block predictions in {block_time - line_time} seconds")
    # exit()

    # print(line_predictions)
    # print(block_predictions)


    curr_time = time()
    if model_type == "MGVG" or document_type == "pdf":
        processed_text_for_llm = get_processed_text_for_llm(block_predictions, gap=gap)
        # print("Processed Text for LLM: ", processed_text_for_llm)
        predicted_answer = generate_llm_answer(question, processed_text_for_llm, pipe)

    elif model_type == "IndoDocs":
        predicted_answer = generate_via_inhouse_model_answer(question, document_path)
    llm_time = time()
    print(f"Done with LLM in {llm_time - curr_time} seconds")

    print("LLM Answer: ", predicted_answer)


    total_algo_time = time()

    # print(predicted_answer)
    curr_time = time()

    line_matches = get_matched_regions(question, predicted_answer, line_predictions, "line")


    block_bboxes = get_matched_regions(question, predicted_answer, block_predictions, "block")
    match_time = time()
    print(f"Done with match in {match_time - curr_time} seconds")


    if document_type == "pdf":
        current_page = get_page_number(block_bboxes)
    else:
        current_page = -1

    if(current_page != -1):
        predicted_answer = "Answer predicted from page: " + str(current_page+1) + "\n" + predicted_answer

    block_box_predictions = []
    for match in block_bboxes:
        block_box_predictions.append(match['bbox'])

    line_box_predictions = []
    for match in line_matches:
        # print(match['page'], match['bbox'])
        if current_page == -1 or match['page'] == current_page:
            line_box_predictions.append(match['bbox'])

    # for line in line_box_predictions:
    #     print(line)

    curr_time = time()
    word_box_predictions = get_word_level_matches(predicted_answer, top_k_matches=line_matches)
    word_time = time()
    print(f"Done with word in {word_time - curr_time} seconds")

    curr_time = time()
    point_box_predictions = get_point_level_matches(block_box_predictions, line_box_predictions, word_box_predictions)
    point_time = time()
    print(f"Done with point in {point_time - curr_time} seconds")

    print(f"Total algo time: {time() - total_algo_time} seconds")


    # print(block_box_predictions)
    # print(line_box_predictions)
    # print(word_box_predictions)
    # print(point_box_predictions)
    # print(predicted_answer)


    return predicted_answer, block_box_predictions, line_box_predictions, word_box_predictions, point_box_predictions, current_page


def calculate_midpoint_of_bboxes(bboxes):

    if not bboxes:
        return None

    # Convert to numpy array for easier manipulation
    bboxes = np.array(bboxes)

    # Find the extreme points of all bboxes combined
    min_x = np.min(bboxes[:, 0])
    min_y = np.min(bboxes[:, 1])
    max_x = np.max(bboxes[:, 2])
    max_y = np.max(bboxes[:, 3])

    # Calculate midpoint
    midpoint_x = (min_x + max_x) / 2
    midpoint_y = (min_y + max_y) / 2

    return round(midpoint_x, 2), round(midpoint_y, 2)


def get_point_level_matches(block_box_predictions, line_box_predictions, word_box_predictions):

    point_box_predictions = []

    if len(block_box_predictions) == 1:
        try:
            x, y = calculate_midpoint_of_bboxes(block_box_predictions)
            point_box_predictions = [[x, y]]
            # print(x, y)
        except:
            try:
                x, y = calculate_midpoint_of_bboxes(line_box_predictions)
                point_box_predictions = [[x, y]]
            except:
                point_box_predictions = []
    else:
        points = []
        for block_bbox in block_box_predictions:
            try:
                x, y = calculate_midpoint_of_bboxes(block_bbox)
                points.append([x, y])
            except:
                continue
        point_box_predictions = points

    return point_box_predictions


def generate_via_inhouse_model_answer(question, image_path, api_key="VISION-TEAM", max_tokens=512, temperature=0.7, endpoint="http://103.207.148.38:9000/api/v1/chat/upload"):
    headers = {
        "x-api-key": api_key  # or whatever the Swagger UI says
    }

    files = {
        "image": open(image_path, "rb")
    }

    data = {
        "text": question,
        "max_tokens": str(max_tokens),
        "temperature": str(temperature)
    }

    try:
        response = requests.post(endpoint, headers=headers, files=files, data=data)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        return {"error": str(e)}

    return result['response']['choices'][0]['message']['content']

def generate_llm_answer(question, context, pipe):

    prompt = f"""You are given a question and context. Your task is to find and return the best possible answer to the question using only the context as it is.
Do not generate summaries, paraphrased content, or any additional explanations including any preamble and postamble.
Return only the exact phrase or sentence fragment from the context that answers the question.
If the answer is not found in the context, return: Answer not found in context.

Question: {question}
Context: {context}
Answer:
"""

    messages = [{"role": "user", "content": prompt}]
    result = pipe(messages, max_new_tokens=512, do_sample=True, temperature=0.7)
    ans = result[0]["generated_text"][1]['content']
    return ans


def get_line_predictions(document_path, model, document_type):

    current_dir = os.getcwd()
    if document_type == "pdf":
        output_file = simple_counter_generator("page", ".jpg")
        current_dir = os.getcwd()
        temp_output_folder = os.path.join(current_dir, "temp_output_folder/")

        # delete the temp_output_folder
        if os.path.exists(temp_output_folder):
            shutil.rmtree(temp_output_folder)

        if not os.path.exists(temp_output_folder):
            os.makedirs(temp_output_folder)
        # output_file = simple_counter_generator("page", ".jpg")
        # convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt=jpg_options, output_file=output_file)

        doc = pymupdf.open(document_path)  # open document
        for page in doc:  # iterate through the pages
            pix = page.get_pixmap()  # render page to an image
            pix.save(f"{temp_output_folder}/{page.number}.png")

        images_path = sorted(os.listdir(temp_output_folder))
    else:
        images_path = [os.path.join(current_dir, document_path)]
        print(images_path)

    block_predictions = []
    # print(document_path)
    # if document_type == "pdf":
    #     doc = DocumentFile.from_pdf(document_path)
    # else:
    #     doc = DocumentFile.from_images(document_path)
    # result = model(doc)

    line_predictions = []

    pages_count = -1
    for image_path in images_path:
        pages_count += 1

        if(len(images_path) > 1):
            doc = DocumentFile.from_images(os.path.join(temp_output_folder, image_path))
        else:
            doc = DocumentFile.from_images(image_path)


        result = model(doc)
        for page in result.pages:
            dim = tuple(reversed(page.dimensions))
            for block in page.blocks:
                for line in block.lines:
                    output = {}
                    geo = line.geometry
                    a = list(a*b for a, b in zip(geo[0], dim))
                    b = list(a*b for a, b in zip(geo[1], dim))
                    x1 = round(a[0], 2).astype(float)
                    y1 = round(a[1], 2).astype(float)
                    x2 = round(b[0], 2).astype(float)
                    y2 = round(b[1], 2).astype(float)
                    line_bbox = [x1, y1, x2, y2]

                    sent = []
                    words_data = []
                    for word in line.words:
                        word_data = {}
                        sent.append(word.value)
                        geo = word.geometry
                        a = list(a*b for a, b in zip(geo[0], dim))
                        b = list(a*b for a, b in zip(geo[1], dim))
                        x1 = round(a[0], 2).astype(float)
                        y1 = round(a[1], 2).astype(float)
                        x2 = round(b[0], 2).astype(float)
                        y2 = round(b[1], 2).astype(float)
                        bbox = [x1, y1, x2, y2]

                        word_data['bbox'] = bbox
                        word_data['text'] = word.value
                        words_data.append(word_data)
                    output['bbox'] = line_bbox
                    output['text'] = " ".join(sent)
                    output['words'] = words_data
                    output['page'] = pages_count
                    line_predictions.append(output)

    return line_predictions, pages_count


def get_block_predictions(document_path, layout_predictor, model, document_type):
    current_dir = os.getcwd()
    if document_type == "pdf":
        output_file = simple_counter_generator("page", ".jpg")
        current_dir = os.getcwd()
        temp_output_folder = os.path.join(current_dir, "temp_output_folder/")

        # delete the temp_output_folder
        if os.path.exists(temp_output_folder):
            shutil.rmtree(temp_output_folder)

        if not os.path.exists(temp_output_folder):
            os.makedirs(temp_output_folder)
        # output_file = simple_counter_generator("page", ".jpg")
        # convert_from_path(document_path, output_folder=temp_output_folder, dpi=300, fmt='jpeg', jpegopt=jpg_options, output_file=output_file)

        doc = pymupdf.open(document_path)  # open document
        for page in doc:  # iterate through the pages
            pix = page.get_pixmap()  # render page to an image
            pix.save(f"{temp_output_folder}/{page.number}.png")

        images_path = sorted(os.listdir(temp_output_folder))
    else:
        images_path = [os.path.join(current_dir, document_path)]
    # print(images_path)

    block_predictions = []


    page_count = -1
    for image_path in images_path:
        page_count += 1

        if(len(images_path) > 1):
            image = Image.open(os.path.join(temp_output_folder, image_path))
        else:
            image = Image.open(os.path.join(current_dir, document_path))

        # print(image_path)
        # print(image)

        layout_predictions = layout_predictor([image])

        for block in layout_predictions[0].bboxes:
            output = {}
            bbox = [int(x) for x in block.bbox]


            cropped_image = image.crop(bbox)

            cropped_image.save(f'temp.png')
            doc = DocumentFile.from_images('temp.png')
            result = model(doc)

            text = []
            for page in result.pages:
                for block in page.blocks:
                    for line in block.lines:
                        for word in line.words:
                            text.append(word.value)


            output['bbox'] = bbox
            output['text'] = " ".join(text)
            output['page'] = page_count
            block_predictions.append(output)

    return block_predictions

def simple_counter_generator(prefix="", suffix=""):
    while True:
        yield 'p'



# from doctr.models import ocr_predictor
# model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)


# # from transformers import pipeline
# # def load_llm_model(device):
# #     pipe = pipeline("text-generation", model="meta-llama/Meta-Llama-3.1-8B-Instruct", device=device)
# #     return pipe

# # pipe = load_llm_model("cuda")
# pipe = None

# # from surya.layout import LayoutPredictor
# # layout_predictor = LayoutPredictor()
# layout_predictor = None

# document_path = "sample.pdf"
# question = "What is the subject of the circular?"

# answer, block_box_predictions, line_box_predictions, word_box_predictions, point_box_predictions = predict_output(document_path, question, pipe, layout_predictor, model, "Inhouse", document_type="pdf")

# print(answer)
# print(block_box_predictions)
# print(line_box_predictions)
# print(word_box_predictions)
# print(point_box_predictions)
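
As a quick illustration of the fuzzy scoring inside get_matched_regions, here is a minimal sketch using the same fuzzywuzzy calls and the same 0.8/0.2 answer/question weighting; the question terms, answer, and region strings are invented for the example:

# Sketch of the non-exact-match scoring path in get_matched_regions
# (an exact substring match short-circuits to a score of 100 instead).
# Strings are invented; weights and threshold mirror predict_output.py.
from fuzzywuzzy import fuzz

ANSWER_WEIGHT, QUESTION_WEIGHT, CUT_OFF_THRESHOLD = 0.8, 0.2, 60

question_terms = ["subject", "circular"]  # stop words already removed
answer = "rates of interest revised"
region = "Sub: Revision of Interest Rates on Small Savings Schemes"

# Answer-side score: partial + token-set ratios plus a length factor
# (region is longer than 10 chars, so the 0.3/0.5/0.2 branch applies).
partial_score = fuzz.partial_ratio(answer.lower(), region.lower())
token_score = fuzz.token_set_ratio(answer.lower(), region.lower())
length_factor = min(1.0, len(region) / min(50, len(answer)))
answer_score = partial_score * 0.3 + token_score * 0.5 + length_factor * 100 * 0.2

# Question-side score: best term-level ratios, blended 0.4/0.6.
best_partial = max(fuzz.partial_ratio(t, region.lower()) for t in question_terms)
best_token = max(fuzz.token_set_ratio(t, region.lower()) for t in question_terms)
question_score = best_partial * 0.4 + best_token * 0.6

combined = answer_score * ANSWER_WEIGHT + question_score * QUESTION_WEIGHT
print(combined, combined >= CUT_OFF_THRESHOLD)  # region kept if above threshold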
src/requirements.txt
ADDED
@@ -0,0 +1,175 @@
accelerate==1.6.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
albucore==0.0.23
albumentations==2.0.5
altair==5.5.0
annotated-types==0.7.0
anthropic==0.46.0
anyascii==0.3.2
anyio==4.9.0
asttokens==3.0.0
async-timeout==5.0.1
attrs==25.3.0
av==14.3.0
beautifulsoup4==4.13.4
blinker==1.9.0
cachetools==5.5.2
certifi==2025.1.31
cfgv==3.4.0
charset-normalizer==3.4.1
click==8.1.8
comm==0.2.2
contourpy==1.3.1
cycler==0.12.1
datasets==3.5.0
debugpy==1.8.14
decorator==5.2.1
defusedxml==0.7.1
dill==0.3.8
distlib==0.3.9
distro==1.9.0
doclayout_yolo==0.0.3
exceptiongroup==1.2.2
executing==2.2.0
filelock==3.18.0
filetype==1.2.0
fonttools==4.57.0
frozenlist==1.6.0
fsspec==2024.12.0
ftfy==6.3.1
fuzzywuzzy==0.18.0
gitdb==4.0.12
GitPython==3.1.44
google-auth==2.39.0
google-genai==1.11.0
h11==0.14.0
h5py==3.13.0
httpcore==1.0.8
httpx==0.28.1
huggingface-hub==0.30.2
identify==2.6.10
idna==3.10
ipykernel==6.29.5
ipython==8.35.0
jedi==0.19.2
Jinja2==3.1.6
jiter==0.9.0
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2025.4.1
jupyter_client==8.6.3
jupyter_core==5.7.2
kiwisolver==1.4.8
langdetect==1.0.9
markdown2==2.5.3
markdownify==0.13.1
marker-pdf==1.6.2
MarkupSafe==3.0.2
matplotlib==3.10.1
matplotlib-inline==0.1.7
mpmath==1.3.0
multidict==6.4.3
multiprocess==0.70.16
narwhals==1.39.1
nest-asyncio==1.6.0
networkx==3.4.2
nodeenv==1.9.1
numpy==2.2.4
nvidia-cublas-cu12==12.4.5.8
nvidia-cuda-cupti-cu12==12.4.127
nvidia-cuda-nvrtc-cu12==12.4.127
nvidia-cuda-runtime-cu12==12.4.127
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.1.3
nvidia-curand-cu12==10.3.5.147
nvidia-cusolver-cu12==11.6.1.9
nvidia-cusparse-cu12==12.3.1.170
nvidia-cusparselt-cu12==0.6.2
nvidia-nccl-cu12==2.21.5
nvidia-nvjitlink-cu12==12.4.127
nvidia-nvtx-cu12==12.4.127
openai==1.75.0
opencv-python==4.11.0.86
opencv-python-headless==4.11.0.86
packaging==24.2
pandas==2.2.3
parso==0.8.4
pdf2image==1.17.0
pdftext==0.6.2
pexpect==4.9.0
pillow==10.4.0
platformdirs==4.3.7
pre_commit==4.2.0
prompt_toolkit==3.0.50
propcache==0.3.1
protobuf==6.31.0
psutil==7.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
py-cpuinfo==9.0.0
pyarrow==19.0.1
pyasn1==0.6.1
pyasn1_modules==0.4.2
pyclipper==1.3.0.post6
pydantic==2.11.3
pydantic-settings==2.8.1
pydantic_core==2.33.1
pydeck==0.9.1
Pygments==2.19.1
PyMuPDF==1.25.5
pyparsing==3.2.3
pypdfium2==4.30.0
pytesseract==0.3.13
python-dateutil==2.9.0.post0
python-doctr==0.11.0
python-dotenv==1.1.0
pytz==2025.2
PyYAML==6.0.2
pyzmq==26.4.0
qwen-vl-utils==0.0.10
RapidFuzz==3.13.0
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rpds-py==0.25.0
rsa==4.9.1
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.15.2
seaborn==0.13.2
sentence-transformers==4.1.0
shapely==2.1.0
simsimd==6.2.1
six==1.17.0
smmap==5.0.2
sniffio==1.3.1
soupsieve==2.7
stack-data==0.6.3
streamlit==1.45.1
stringzilla==3.12.3
surya-ocr==0.13.1
sympy==1.13.1
tenacity==9.1.2
thop==0.1.1.post2209072238
threadpoolctl==3.6.0
tokenizers==0.21.1
toml==0.10.2
torch==2.6.0
torchvision==0.21.0
tornado==6.4.2
tqdm==4.67.1
traitlets==5.14.3
transformers==4.51.2
triton==3.2.0
typing-inspection==0.4.0
typing_extensions==4.13.2
tzdata==2025.2
urllib3==2.4.0
virtualenv==20.30.0
watchdog==6.0.0
wcwidth==0.2.13
websockets==15.0.1
xxhash==3.5.0
yarl==1.20.0