insight-finder

Runtime error

File size: 15,951 Bytes

from fastapi import FastAPI
from pydantic import BaseModel
from typing import Dict, List
import gradio as gr
import pandas as pd
import json
import re
from src.core import *
from src.ressources.main_css import *


# FastAPI application object: the REST endpoints below are registered on it
# and the Gradio UI is mounted onto it at the bottom of this module.
app = FastAPI(
    title="Insight Finder",
    description="Find relevant technologies from a problem",
)

class InputProblem(BaseModel):
    """Request body of ``POST /process``: a single problem statement."""
    # Free-text problem description supplied by the caller.
    problem: str

class InputConstraints(BaseModel):
    """Request body of ``POST /process-constraints``: a string-to-string mapping."""
    # presumably constraint title -> constraint description; verify against process_input
    constraints: Dict[str, str]
    
# This schema defines the structure for a single technology object
class Technology(BaseModel):
    """Represents a single technology entry with its details."""
    # All descriptive fields are plain strings; the same keys are read by the
    # HTML formatters in this module when technologies are handled as dicts.
    title: str
    purpose: str
    key_components: str
    advantages: str
    limitations: str
    # Integer identifier of the entry (the name shadows the ``id`` builtin
    # only inside the class namespace).
    id: int

class OutputPriorArt(BaseModel):
    """Represents the search of prior art using the technology combinations"""
    # Text of the prior-art report.
    content: str
    # Referenced document URIs; the element type is not constrained by the schema.
    uris: List

class InputPriorArtConstraints(BaseModel):
    """Request body of ``POST /prior-art-constraints``."""
    # Technologies to search prior art for.
    technologies: List[Technology]
    # String-to-string constraint mapping passed alongside the technologies.
    constraints: Dict[str, str]

class InputPriorArtProblem(BaseModel):
    """Request body of ``POST /prior-art-problems``."""
    # Technologies to search prior art for.
    technologies: List[Technology]
    # Free-text problem statement (note: singular field name ``problem``).
    problem: str


# This schema defines the root structure of the JSON
class TechnologyData(BaseModel):
    """Represents the top-level object containing a list of technologies."""
    # Used as the response model of ``/process`` and ``/process-constraints``.
    technologies: List[Technology]

@app.post("/process", response_model=TechnologyData)
async def process(data: InputProblem):
    """Find technologies for a problem statement.

    The whole request model (not just its ``problem`` string) is forwarded
    to ``process_input`` in ``"problem"`` mode, together with the
    module-level technology table and embeddings.

    Returns:
        A mapping with a single ``"technologies"`` key, validated against
        ``TechnologyData`` by FastAPI.
    """
    technologies = process_input(data, global_tech, global_tech_embeddings, "problem")
    return {"technologies": technologies}

@app.post("/process-constraints", response_model=TechnologyData)
async def process_constraints(constraints: InputConstraints):
    """Find technologies for an explicit set of constraints.

    Only the inner ``constraints`` mapping of the request body is handed to
    ``process_input``, which is run in ``"constraints"`` mode.

    Returns:
        A mapping with a single ``"technologies"`` key, validated against
        ``TechnologyData`` by FastAPI.
    """
    constraint_map = constraints.constraints
    technologies = process_input(
        constraint_map, global_tech, global_tech_embeddings, "constraints"
    )
    return {"technologies": technologies}

@app.post("/prior-art-constraints", response_model=OutputPriorArt)
async def prior_art_constraints(data: InputPriorArtConstraints):
    """Search prior art for a set of technologies under given constraints.

    Delegates to ``process_prior_art`` in ``"constraints"`` / ``"pydantic"``
    mode, echoes the result on stdout, and returns it unchanged for FastAPI
    to validate against ``OutputPriorArt``.
    """
    search_result = process_prior_art(
        data.technologies, data.constraints, "constraints", "pydantic"
    )
    # Echo of the raw result on stdout, kept from the original implementation.
    print(search_result)
    return search_result

@app.post("/prior-art-problems", response_model=OutputPriorArt)
async def prior_art_problems(data: InputPriorArtProblem):
    """Search prior art for a set of technologies addressing a problem.

    Delegates to ``process_prior_art`` in ``"problem"`` / ``"pydantic"``
    mode and returns its result for FastAPI to validate against
    ``OutputPriorArt``.
    """
    # InputPriorArtProblem declares the field as ``problem`` (singular);
    # reading ``data.problems`` raised AttributeError on every request.
    prior_art = process_prior_art(data.technologies, data.problem, "problem", "pydantic")
    return prior_art

def make_json_serializable(data):
    """Return a copy of *data* with scalar wrappers replaced by floats.

    Dicts, lists and tuples are rebuilt recursively (keeping their container
    kind). Any other object exposing an ``item`` attribute is replaced by
    ``float(obj.item())``; everything else is returned untouched.
    """
    if isinstance(data, dict):
        converted = {}
        for key, value in data.items():
            converted[key] = make_json_serializable(value)
        return converted

    if isinstance(data, (list, tuple)):
        elements = [make_json_serializable(element) for element in data]
        # Lists stay lists, tuples stay tuples.
        return elements if isinstance(data, list) else tuple(elements)

    if not hasattr(data, 'item'):
        return data

    # Objects with .item() are always coerced to float, whatever .item() returns.
    return float(data.item())

def format_constraints_html(constraints: dict) -> str:
    """Render constraints as an HTML block for the Gradio ``HTML`` output.

    Args:
        constraints: Mapping of constraint title to description. ``None`` or
            an empty mapping yields an empty container.

    Returns:
        An ``<h1>`` heading followed by one ``constraint-item`` div per entry.
        Titles and descriptions are HTML-escaped so that text containing
        ``<``, ``>`` or ``&`` is displayed literally instead of being parsed
        as markup.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    pieces = ["<div class='constraints-container'>"]
    for title, description in (constraints or {}).items():
        safe_title = escape(str(title))
        safe_description = escape(str(description))
        pieces.append(f"""
        <div class='constraint-item'>
            <p><span class='constraint-title'>{safe_title}:</span> <span class='constraint-description'>{safe_description}</span></p>
        </div>
        """)
    pieces.append("</div>")
    return "<h1>Retrieved Constraints</h1>" + "".join(pieces)

def format_best_combinations_html(combinations_data: list) -> str:
    """Render the per-problem technology combinations as HTML cards.

    Args:
        combinations_data: List of dicts. Each may carry a ``"problem"`` dict
            (its ``"title"`` is used as the card heading, falling back to
            ``"Problem <n>"``) and a ``"technologies"`` sequence whose
            entries are indexable with the technology dict at position 0.

    Returns:
        An ``<h1>`` heading followed by one ``problem-card`` per combination.
        Entries that are empty, not indexable, or whose first element is not
        a dict are skipped. All interpolated text is HTML-escaped.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def field(info: dict, key: str) -> str:
        # Missing keys are shown as "N/A"; values are escaped for HTML text.
        return escape(str(info.get(key, 'N/A')))

    pieces = ["<div class='combinations-outer-container'>"]
    for i, combination in enumerate(combinations_data or []):
        # ``or`` also covers an explicit None stored under the key.
        problem = combination.get("problem") or {}
        problem_title = escape(str(problem.get("title", f"Problem {i+1}")))
        technologies = combination.get("technologies") or []

        pieces.append(f"""
        <div class='problem-card'>
            <h3 class='problem-card-title'>{problem_title}</h3>
            <div class='technologies-inner-container'>
        """)
        for tech_info_score in technologies:
            try:
                tech_info = tech_info_score[0]
            except (IndexError, KeyError, TypeError):
                # Malformed entry (empty or not indexable): nothing to show.
                continue
            if not isinstance(tech_info, dict):
                continue
            pieces.append(f"""
                <div class='technology-card'>
                    <h4 class='tech-card-title'>{field(tech_info, 'title')}</h4>
                    <p><strong>Purpose:</strong> {field(tech_info, 'purpose')}</p>
                    <p><strong>Components:</strong> {field(tech_info, 'key_components')}</p>
                    <p><strong>Advantages:</strong> {field(tech_info, 'advantages')}</p>
                    <p><strong>Limitations:</strong> {field(tech_info, 'limitations')}</p>
                </div>
                """)
        pieces.append("""
            </div>
        </div>
        """)
    pieces.append("</div>")
    return "<h1>The 5 Best Technology Combinations per constraint</h1>" + "".join(pieces)

def format_final_technologies_html(technologies_list: list) -> str:
    """Render the final selected technologies as HTML cards.

    Args:
        technologies_list: List of technology dicts; non-dict entries are
            skipped. ``None`` or an empty list yields an empty container.

    Returns:
        An ``<h1>`` heading followed by one ``final-tech-card`` per dict.
        Missing fields are shown as ``N/A`` and all interpolated text is
        HTML-escaped so it cannot be interpreted as markup.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def field(info: dict, key: str) -> str:
        return escape(str(info.get(key, 'N/A')))

    pieces = ["<div class='final-tech-container'>"]
    for tech_info in technologies_list or []:
        if not isinstance(tech_info, dict):
            continue
        pieces.append(f"""
            <div class='final-tech-card'>
                <h4 class='final-tech-title'>{field(tech_info, 'title')}</h4>
                <p><strong>Purpose:</strong> {field(tech_info, 'purpose')}</p>
                <p><strong>Components:</strong> {field(tech_info, 'key_components')}</p>
                <p><strong>Advantages:</strong> {field(tech_info, 'advantages')}</p>
                <p><strong>Limitations:</strong> {field(tech_info, 'limitations')}</p>
            </div>
            """)
    pieces.append("</div>")
    return "<h1>The best technologies combinations </h1>" + "".join(pieces)

def _prior_art_summary_html(summary_text: str) -> str:
    """Render the summary part: '*' bullet lines and plain paragraphs."""
    pieces = ["    <div class='prior-art-summary'>\n"]
    for raw_line in summary_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('*'):
            # "* Label: text" bullets get the label in bold; the bullet
            # asterisks themselves are dropped.
            label, colon, rest = line.partition(':')
            if colon:
                pieces.append(
                    f"        <p class='summary-bullet'><strong>{label.replace('*', '').strip()}:</strong> {rest.strip()}</p>\n"
                )
            else:
                pieces.append(f"        <p class='summary-bullet'>{line.replace('*', '').strip()}</p>\n")
        else:
            pieces.append(f"        <p>{line}</p>\n")
    pieces.append("    </div>\n")
    return "".join(pieces)


def _prior_art_documents_html(documents_text: str) -> str:
    """Render the numbered document entries as one card per document."""
    # Titles look like "1. **Title**" in the raw text, but by the time this
    # runs the bold markers have already been rewritten to <strong> tags, so
    # both spellings are accepted. With a capture group, re.split returns
    # [leading text, title, body, title, body, ...].
    entries = re.split(
        r'(\d+\.\s*(?:<strong>.*?</strong>|\*\*.*?\*\*))', documents_text.strip()
    )

    pieces = ["    <div class='prior-art-documents'>\n"]
    for title, body in zip(entries[1::2], entries[2::2]):
        lines = [l.strip() for l in body.split('\n') if l.strip()]

        # The last occurrence of each marker wins.
        desc_idx = -1
        tech_idx = -1
        for idx, line in enumerate(lines):
            if line.startswith("Description:"):
                desc_idx = idx
            elif line.startswith("Technologies Used:"):
                tech_idx = idx

        description = ""
        if desc_idx != -1:
            # The description runs up to the technologies marker, if any.
            desc_end = tech_idx if tech_idx != -1 else len(lines)
            description = " ".join(lines[desc_idx:desc_end]).replace("Description:", "").strip()

        tech_items = []
        if tech_idx != -1:
            tech_items = [l.replace('*', '').strip() for l in lines[tech_idx:] if l.startswith('*')]

        pieces.append(
            "        <div class='prior-art-document-card'>\n"
            f"            <h4 class='document-title'>{title.strip()}</h4>\n"
            f"            <p class='document-description'><strong>Description:</strong> {description}</p>\n"
        )
        if tech_items:
            pieces.append("            <div class='document-technologies'>\n")
            pieces.append("                <h5>Technologies Used:</h5>\n                <ul>\n")
            for tech_item in tech_items:
                if not tech_item:
                    continue
                name, colon, detail = tech_item.partition(':')
                if colon:
                    pieces.append(f"                    <li><strong>{name.strip()}:</strong> {detail.strip()}</li>\n")
                else:
                    pieces.append(f"                    <li>{tech_item}</li>\n")
            pieces.append("                </ul>\n            </div>\n")
        pieces.append("        </div>\n")
    pieces.append("    </div>\n")
    return "".join(pieces)


def _prior_art_uris_html(uris) -> str:
    """Render the list of referenced URIs, or an empty string if there are none."""
    from html import escape

    if not uris:
        return ""
    pieces = [
        "    <div class='grouped-uris-section'>\n",
        "        <hr class='disruptive-line'>\n",
        "        <h3>Referenced Documents (URIs):</h3>\n",
        "        <ul>\n",
    ]
    for position, uri in enumerate(uris, start=1):
        # Escaping quotes keeps a URI from breaking out of the href attribute.
        safe_uri = escape(str(uri), quote=True)
        pieces.append(
            f"            <li>{position}. <a href='{safe_uri}' target='_blank' class='prior-art-grouped-link'>Document {position} Link</a></li>\n"
        )
    pieces.append("        </ul>\n    </div>\n")
    return "".join(pieces)


def format_prior_art_html(prior_art_data: dict) -> str:
    """Render a prior-art search result as HTML for the Gradio output.

    Args:
        prior_art_data: Mapping with a ``"content"`` string (markdown-like
            report text) and an optional ``"uris"`` list.

    Returns:
        A ``prior-art-container`` div holding, in order: the summary
        paragraphs, one card per numbered document found after the
        "Here are the documents found..." marker, and the list of URIs.
        A fallback message is returned when there is no content.

    The report text is HTML-escaped first; ``**bold**`` and ``[n](url)``
    markers are then converted to ``<strong>`` and ``<a>`` elements.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not prior_art_data or prior_art_data.get('content') is None:
        return "<div class='prior-art-container'><p>No prior art data available.</p></div>"

    uris = prior_art_data.get('uris') or []

    # Escape before generating markup so only the tags created below are live.
    processed_content = escape(str(prior_art_data['content']), quote=True)

    # 1. Convert **text** to <strong>text</strong>
    processed_content = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', processed_content)

    # 2. Convert [n](uri) to a clickable link labelled n
    processed_content = re.sub(
        r'\[(\d+)\]\((https?:\/\/[^\s\)]+)\)',
        r'<a href="\2" target="_blank" class="prior-art-inline-link">\1</a>',
        processed_content,
    )

    # Everything before the marker is the summary; the part right after it
    # holds the numbered documents.
    sections = processed_content.split(
        "Here are the documents found and the technologies used within them:\n\n"
    )

    summary_html = _prior_art_summary_html(sections[0])
    documents_html = _prior_art_documents_html(sections[1]) if len(sections) > 1 else ""
    grouped_uris_html = _prior_art_uris_html(uris)

    return f"<div class='prior-art-container'>\n{summary_html}{documents_html}{grouped_uris_html}</div>"


def gradio_prior_art(best_technologies, constraints):
    """Gradio callback for the "Find Prior Art" button.

    Runs ``process_prior_art`` in ``"constraints"`` / ``"dict"`` mode on the
    stored technologies and constraints, converts the result to HTML with
    ``format_prior_art_html``, echoes that HTML on stdout and returns it.
    """
    rendered = format_prior_art_html(
        process_prior_art(best_technologies, constraints, "constraints", "dict")
    )
    # Echo of the generated markup on stdout, kept from the original.
    print(rendered)
    return rendered

def process_input_gradio(problem_description: str):
    """Run the full selection pipeline for the Gradio "Process Problem" button.

    The steps call, in order: ``set_prompt``, ``retrieve_constraints``,
    ``stem``, ``get_contrastive_similarities``,
    ``find_best_list_combinations``, ``select_technologies`` and
    ``get_technologies_by_id``. Along the way the stemmed constraints are
    written to ``constraints_stemmed.xlsx`` and printed, and the similarity
    results are pickled.

    Returns:
        A 7-tuple matching the outputs wired to the button: the prompt,
        constraints HTML, best-combinations HTML, the selected technology
        ids as a comma-separated string, final-technologies HTML,
        ``{"technologies": [...]}`` and the raw constraints.
    """
    # 1-2: build the prompt and retrieve constraints from it.
    generated_prompt = set_prompt(problem_description)
    raw_constraints = retrieve_constraints(generated_prompt)

    # 3: stem the constraints, persist them, and echo them on stdout.
    stemmed = stem(raw_constraints, "constraints")
    stemmed_frame = pd.DataFrame({"stemmed_constraints": stemmed})
    save_dataframe(stemmed_frame, "constraints_stemmed.xlsx")
    print(stemmed)

    # 4: the technology table and embeddings are module-level globals.

    # 5: similarities between the stemmed constraints and the technologies.
    similarities, similarity_matrix = get_contrastive_similarities(
        stemmed, global_tech, global_tech_embeddings
    )
    save_to_pickle(similarities)

    # 6-8: pick combinations, select ids, resolve ids to technology records.
    combinations = find_best_list_combinations(stemmed, global_tech, similarity_matrix)
    selected_ids = select_technologies(combinations)
    selected_technologies = get_technologies_by_id(selected_ids, global_tech)

    # HTML renderings for the three gr.HTML outputs.
    constraints_markup = format_constraints_html(raw_constraints)
    combinations_markup = format_best_combinations_html(combinations)
    technologies_markup = format_final_technologies_html(selected_technologies)

    ids_text = ", ".join(str(tech_id) for tech_id in selected_ids)
    technologies_payload = {"technologies": selected_technologies}

    return (
        generated_prompt,
        constraints_markup,
        combinations_markup,
        ids_text,
        technologies_markup,
        technologies_payload,
        raw_constraints,
    )

   
    # Return a gr.update object to change the value and visibility in one step
#    return gr.update(value=html_prior_art, visible=True)
   

# --- Gradio Interface Setup ---
# Components are created here at module level and placed into the layout
# later via their .render() calls inside the gr.Blocks context.
input_problem = gr.Textbox(
    label="Enter Problem Description",
    placeholder="e.g., Develop a secure and scalable e-commerce platform with real-time analytics."
)

# Outputs filled by process_input_gradio, plus the prior-art output filled by
# gradio_prior_art.
output_prompt = gr.Textbox(label="1. Generated Prompt", interactive=False)
output_constraints = gr.HTML(label="2. Retrieved Constraints") # Changed to HTML
output_best_combinations = gr.HTML(label="7. Best Technology Combinations Found") # Changed to HTML
output_selected_ids = gr.Textbox(label="8. Selected Technology IDs", interactive=False)
output_final_technologies = gr.HTML(label="9. Final Best Technologies") # Changed to HTML
output_prior_art = gr.HTML(label="10. Prior Art Analysis") # No visibility flag is set here

# Invisible JSON holders that carry the selected technologies and constraints
# from the "process" click to the "prior art" click.
stock_technologies = gr.JSON(visible=False)
stock_constraints = gr.JSON(visible=False)

# Layout and event wiring of the Gradio UI. Components declared above are
# placed with .render(); the order of statements defines the page layout.
with gr.Blocks(
    theme=gr.themes.Soft(),
    css=custom_css
) as gradio_app_blocks:
    gr.Markdown("# Insight Finder: Step-by-Step Technology Selection")
    gr.Markdown("## Enter a problem description to see how relevant technologies are identified through various processing steps.")

    # Top row: problem input (wider column) next to the start button.
    with gr.Row():
        with gr.Column(scale=2):
            input_problem.render()
        with gr.Column(scale=1):
            # NOTE(review): the trailing comma makes this statement a
            # one-element tuple expression; the component is still created.
            gr.Markdown("Click to start the analysis:"),
            process_button = gr.Button("Process Problem", elem_id="process_button")
        

    gr.Markdown("---")
    gr.Markdown("### Processing Steps & Results:")

    # Results row: prompt and constraints on the left, selections on the right.
    with gr.Row():
        with gr.Column():
            output_prompt.render()
            output_constraints.render()
        with gr.Column():
            output_selected_ids.render()
            output_best_combinations.render()
            output_final_technologies.render()
    
    gr.Markdown("---")
    gr.Markdown("### Prior Art Analysis")
    prior_art_button = gr.Button("Find Prior Art", elem_id="prior_art_button")
    output_prior_art.render()
    stock_technologies.render()
    stock_constraints.render()
    
    # The outputs list must stay in the same order as the tuple returned by
    # process_input_gradio (seven values).
    process_button.click(
        fn=process_input_gradio,
        inputs=input_problem,
        outputs=[
            output_prompt,
            output_constraints,
            output_best_combinations,
            output_selected_ids,
            output_final_technologies,
            stock_technologies,
            stock_constraints
        ]
    )

    # Feeds the values stored by the previous click into gradio_prior_art.
    prior_art_button.click(
        fn=gradio_prior_art,
        inputs=[stock_technologies, stock_constraints],
        outputs=output_prior_art
    )


# Attach the Gradio UI to the FastAPI application under the /gradio path.
gr.mount_gradio_app(app, gradio_app_blocks, path="/gradio")
#if __name__ == "__main__":
#    gradio_app_blocks.launch()