Update app.py

app.py CHANGED
@@ -1,6 +1,5 @@
 import os
 os.environ["TRANSFORMERS_NO_FAST"] = "1"  # Force use of slow tokenizers
-os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 
 import io
 import torch
@@ -14,7 +13,7 @@ import numpy as np
 import json
 import tempfile
 from fastapi import FastAPI, UploadFile, File, HTTPException, Form, BackgroundTasks
-from fastapi.responses import FileResponse, JSONResponse, HTMLResponse
+from fastapi.responses import FileResponse, JSONResponse, HTMLResponse  # Added HTMLResponse
 from fastapi.middleware.cors import CORSMiddleware
 from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
 from sentence_transformers import SentenceTransformer
@@ -28,17 +27,14 @@ import hashlib  # For caching file results
 # For asynchronous blocking calls
 from starlette.concurrency import run_in_threadpool
 
-#
+# Import gensim for topic modeling
 import gensim
 from gensim import corpora, models
 
-# Spacy stop words
-from spacy.lang.en.stop_words import STOP_WORDS
-
 # Global cache for analysis results based on file hash
 analysis_cache = {}
 
-# Ensure compatibility with Google Colab
+# Ensure compatibility with Google Colab
 try:
     from google.colab import drive
     drive.mount('/content/drive')
@@ -49,7 +45,7 @@ except Exception:
 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
 
-#
+# Ensure GPU usage
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # Initialize FastAPI
@@ -68,13 +64,16 @@ app.add_middleware(
 document_storage = {}
 chat_history = []
 
+# Function to store document context by task ID
 def store_document_context(task_id, text):
     document_storage[task_id] = text
     return True
 
+# Function to load document context by task ID
 def load_document_context(task_id):
     return document_storage.get(task_id, "")
 
+# Utility to compute MD5 hash from file content
 def compute_md5(content: bytes) -> str:
     return hashlib.md5(content).hexdigest()
 
@@ -84,6 +83,7 @@ def compute_md5(content: bytes) -> str:
 
 def fine_tune_cuad_model():
     from datasets import load_dataset
+    import numpy as np
     from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering, AutoTokenizer
 
     print("✅ Loading CUAD dataset for fine tuning...")
@@ -121,10 +121,7 @@ def fine_tune_cuad_model():
         tokenized_examples["end_positions"] = []
         for i, offsets in enumerate(offset_mapping):
             input_ids = tokenized_examples["input_ids"][i]
-            try:
-                cls_index = input_ids.index(tokenizer.cls_token_id)
-            except ValueError:
-                cls_index = 0
+            cls_index = input_ids.index(tokenizer.cls_token_id)
             sequence_ids = tokenized_examples.sequence_ids(i)
             sample_index = sample_mapping[i]
             answers = examples["answers"][sample_index]
@@ -135,26 +132,21 @@ def fine_tune_cuad_model():
                 start_char = answers["answer_start"][0]
                 end_char = start_char + len(answers["text"][0])
                 tokenized_start_index = 0
-                while
+                while sequence_ids[tokenized_start_index] != 1:
                     tokenized_start_index += 1
                 tokenized_end_index = len(input_ids) - 1
-                while
+                while sequence_ids[tokenized_end_index] != 1:
                     tokenized_end_index -= 1
-                if
-                    tokenized_examples["start_positions"].append(cls_index)
-                    tokenized_examples["end_positions"].append(cls_index)
-                elif not (offsets[tokenized_start_index][0] <= start_char and offsets[tokenized_end_index][1] >= end_char):
+                if not (offsets[tokenized_start_index][0] <= start_char and offsets[tokenized_end_index][1] >= end_char):
                     tokenized_examples["start_positions"].append(cls_index)
                     tokenized_examples["end_positions"].append(cls_index)
                 else:
                     while tokenized_start_index < len(offsets) and offsets[tokenized_start_index][0] <= start_char:
                         tokenized_start_index += 1
-
-
-                    while tokenized_end_index >= 0 and offsets[tokenized_end_index][1] >= end_char:
+                    tokenized_examples["start_positions"].append(tokenized_start_index - 1)
+                    while offsets[tokenized_end_index][1] >= end_char:
                         tokenized_end_index -= 1
-
-                    tokenized_examples["end_positions"].append(safe_end)
+                    tokenized_examples["end_positions"].append(tokenized_end_index + 1)
         return tokenized_examples
 
     print("✅ Tokenizing dataset...")
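Note: the rewritten span-alignment logic above converts character-level answer spans into token indices by walking sequence_ids and offset_mapping. A standalone sketch of the same idea using a fast tokenizer; the model name here is illustrative, not the checkpoint fine-tuned above:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("roberta-base")  # any fast tokenizer exposes offsets
    enc = tok("The tenant shall pay rent monthly.", return_offsets_mapping=True)
    start_char, end_char = 4, 10  # character span of "tenant"
    offsets = enc["offset_mapping"]
    # First token whose span ends past start_char, last token whose span starts before end_char.
    # Special tokens carry (0, 0) offsets and are skipped by requiring e > s.
    token_start = next(i for i, (s, e) in enumerate(offsets) if e > start_char and e > s)
    token_end = max(i for i, (s, e) in enumerate(offsets) if s < end_char and e > s)
    print(token_start, token_end)  # token indices covering the answer span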
@@ -198,70 +190,53 @@ def fine_tune_cuad_model():
 #############################
 
 try:
-    # Load spaCy model
     try:
         nlp = spacy.load("en_core_web_sm")
     except Exception:
         spacy.cli.download("en_core_web_sm")
         nlp = spacy.load("en_core_web_sm")
-    print("✅
-
-    # Create summarizer and QA pipelines on GPU
+    print("✅ Loading NLP models...")
+    from transformers import PegasusTokenizer
     summarizer = pipeline(
         "summarization",
-        model="
-        tokenizer="
-        device=0 if
-    )
-    qa_model = pipeline(
-        "question-answering",
-        model="deepset/roberta-base-squad2",
-        device=0 if device == "cuda" else -1
+        model="nsi319/legal-pegasus",
+        tokenizer=PegasusTokenizer.from_pretrained("nsi319/legal-pegasus", use_fast=False),
+        device=0 if torch.cuda.is_available() else -1
     )
+    # Optionally convert summarizer model to FP16 for faster inference on GPU
+    if device == "cuda":
+        summarizer.model.half()
 
-    # Use GPU for sentence embeddings if available
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
-
-    ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if device == "cuda" else -1)
-
-    # Speech-to-text pipeline on GPU (if available)
+    ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
     speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
-                              device_map="auto" if
-
-    # Load or fine-tune the CUAD QA model and move to GPU
+                              device_map="auto" if torch.cuda.is_available() else "cpu")
     if os.path.exists("fine_tuned_legal_qa"):
         print("✅ Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
         cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
         from transformers import AutoModelForQuestionAnswering
         cuad_model = AutoModelForQuestionAnswering.from_pretrained("fine_tuned_legal_qa")
         cuad_model.to(device)
+        if device == "cuda":
+            cuad_model.half()
     else:
-        print("
+        print("⚠ Fine-tuned QA model not found. Starting fine tuning on CUAD QA dataset. This may take a while...")
         cuad_tokenizer, cuad_model = fine_tune_cuad_model()
         cuad_model.to(device)
-
-    sentiment_pipeline = pipeline(
-        "sentiment-analysis",
-        model="distilbert-base-uncased-finetuned-sst-2-english",
-        device=0 if device == "cuda" else -1
-    )
-
-    print("✅ All models loaded successfully.")
+    print("✅ All models loaded successfully")
 except Exception as e:
-    print(f"
+    print(f"⚠ Error loading models: {str(e)}")
     raise RuntimeError(f"Error loading models: {str(e)}")
 
-
-
-
+from transformers import pipeline
+qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
+
+sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=0 if torch.cuda.is_available() else -1)
 
 def legal_chatbot(user_input, context):
     global chat_history
     chat_history.append({"role": "user", "content": user_input})
-    try:
-        response = qa_model(question=user_input, context=context)["answer"]
-    except Exception as e:
-        response = f"Error processing query: {e}"
+    response = qa_model(question=user_input, context=context)["answer"]
     chat_history.append({"role": "assistant", "content": response})
     return response
 
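Note: the new half() calls cast model parameters to FP16 in place, roughly halving GPU memory and speeding up inference on most CUDA devices. A minimal illustration on a toy module (not the models above); only floating-point tensors are converted, so integer token-ID inputs are unaffected:

    import torch

    layer = torch.nn.Linear(4, 4)  # parameters start as float32
    print(layer.weight.dtype)      # torch.float32
    layer.half()                   # in-place cast of parameters and buffers to float16
    print(layer.weight.dtype)      # torch.float16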
@@ -314,9 +289,9 @@ def extract_named_entities(text):
     entities.extend([{"entity": ent.text, "label": ent.label_} for ent in doc.ents])
     return entities
 
-
-#
-
+# -----------------------------
+# Enhanced Risk Analysis Functions
+# -----------------------------
 
 def analyze_sentiment(text):
     sentences = [sent.text for sent in nlp(text).sents]
@@ -343,82 +318,20 @@ def get_enhanced_context_info(text):
     enhanced["topics"] = analyze_topics(text, num_topics=5)
     return enhanced
 
-def explain_topics(topics):
-    explanation = {}
-    for topic_idx, topic_str in topics:
-        parts = topic_str.split('+')
-        terms = []
-        for part in parts:
-            part = part.strip()
-            if '*' in part:
-                weight_str, word = part.split('*', 1)
-                word = word.strip().strip('"').strip("'")
-                try:
-                    weight = float(weight_str)
-                except:
-                    weight = 0.0
-                if word.lower() not in STOP_WORDS and len(word) > 1:
-                    terms.append((weight, word))
-        terms.sort(key=lambda x: -x[0])
-        if terms:
-            if any("liability" in w.lower() for _, w in terms):
-                label = "Liability & Penalty Risk"
-            elif any("termination" in w.lower() for _, w in terms):
-                label = "Termination & Refund Risk"
-            elif any("compliance" in w.lower() for _, w in terms):
-                label = "Compliance & Regulatory Risk"
-            else:
-                label = "General Risk Language"
-        else:
-            label = "General Risk Language"
-
-        explanation_text = (
-            f"Topic {topic_idx} ({label}) is characterized by dominant terms: " +
-            ", ".join([f"'{word}' ({weight:.3f})" for weight, word in terms[:5]])
-        )
-        explanation[topic_idx] = {
-            "label": label,
-            "explanation": explanation_text,
-            "terms": terms
-        }
-    return explanation
-
 def analyze_risk_enhanced(text):
     enhanced = get_enhanced_context_info(text)
     avg_sentiment = enhanced["average_sentiment"]
     risk_score = abs(avg_sentiment) if avg_sentiment < 0 else 0
-
-    topics_explanation = explain_topics(topics_raw)
-    return {
-        "risk_score": risk_score,
-        "average_sentiment": avg_sentiment,
-        "topics": topics_raw,
-        "topics_explanation": topics_explanation
-    }
+    return {"risk_score": risk_score, "average_sentiment": avg_sentiment, "topics": enhanced["topics"]}
 
-
-#
-
-
-def chunk_text_by_tokens(text, tokenizer, max_chunk_len=384, stride=128):
-    encoded = tokenizer(text, add_special_tokens=False)
-    input_ids = encoded["input_ids"]
-    chunks = []
-    idx = 0
-    while idx < len(input_ids):
-        end = idx + max_chunk_len
-        sub_ids = input_ids[idx:end]
-        chunk_text = tokenizer.decode(sub_ids, skip_special_tokens=True)
-        chunks.append(chunk_text)
-        if end >= len(input_ids):
-            break
-        idx = end - stride
-        if idx < 0:
-            idx = 0
-    return chunks
+# -----------------------------
+# Clause Detection (Chunk-Based)
+# -----------------------------
 
 def analyze_contract_clauses(text):
-
+    max_length = 512
+    step = 256
+    clauses_detected = []
     try:
         clause_types = list(cuad_model.config.id2label.values())
     except Exception:
@@ -428,50 +341,26 @@ def analyze_contract_clauses(text):
         "Assignment", "Warranty", "Limitation of Liability", "Arbitration",
         "IP Rights", "Force Majeure", "Revenue/Profit Sharing", "Audit Rights"
     ]
-
-
-    for chunk in
-        chunk =
-
-
-
-
-
-
-            inputs["input_ids"] = torch.clamp(inputs["input_ids"], max=cuad_model.config.vocab_size - 1)
-            if torch.any(inputs["input_ids"] >= cuad_model.config.vocab_size):
-                print("Invalid token id found; skipping chunk")
-                continue
-            with torch.no_grad():
-                outputs = cuad_model(**inputs)
-            if device == "cuda":
-                torch.cuda.synchronize()
-            if outputs.start_logits.shape[1] != inputs["input_ids"].shape[1]:
-                print("Mismatch in logits shape; skipping chunk")
-                continue
-            predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
-            for idx, confidence in enumerate(predictions):
-                if confidence > 0.5 and idx < len(clause_types):
-                    clauses_detected.append({
-                        "type": clause_types[idx],
-                        "confidence": float(confidence)
-                    })
-        except Exception as e:
-            print(f"Error processing chunk: {e}")
-            if device == "cuda":
-                torch.cuda.empty_cache()
-            continue
-
+    # Create chunks of the text
+    chunks = [text[i:i+max_length] for i in range(0, len(text), step) if i+step < len(text)]
+    for chunk in chunks:
+        inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512).to(device)
+        with torch.no_grad():
+            outputs = cuad_model(**inputs)
+        predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
+        for idx, confidence in enumerate(predictions):
+            if confidence > 0.5 and idx < len(clause_types):
+                clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
     aggregated_clauses = {}
     for clause in clauses_detected:
-
-        if
-        aggregated_clauses[
+        clause_type = clause["type"]
+        if clause_type not in aggregated_clauses or clause["confidence"] > aggregated_clauses[clause_type]["confidence"]:
+            aggregated_clauses[clause_type] = clause
     return list(aggregated_clauses.values())
 
-
-#
-
+# -----------------------------
+# Endpoints
+# -----------------------------
 
 @app.post("/analyze_legal_document")
 async def analyze_legal_document(file: UploadFile = File(...)):
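Note: one consequence of the new character-window chunking worth flagging: because of the i+step < len(text) guard, any text shorter than step (256 characters) produces an empty chunk list, so no clauses are detected for very short documents. A quick standalone check of the comprehension:

    max_length, step = 512, 256

    def make_chunks(text):
        return [text[i:i + max_length] for i in range(0, len(text), step) if i + step < len(text)]

    print(len(make_chunks("x" * 200)))   # 0 -> documents under 256 chars yield no chunks
    print(len(make_chunks("x" * 300)))   # 1
    print(len(make_chunks("x" * 1200)))  # 4 overlapping 512-character windows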
@@ -484,14 +373,7 @@ async def analyze_legal_document(file: UploadFile = File(...)):
     if not text:
         return {"status": "error", "message": "No valid text found in the document."}
     summary_text = text[:4096] if len(text) > 4096 else text
-    try:
-        if len(text) > 100:
-            summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
-        else:
-            summary = "Document too short for a meaningful summary."
-    except Exception as e:
-        summary = "Summarization failed due to an error."
-        print(f"Summarization error: {e}")
+    summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] if len(text) > 100 else "Document too short for meaningful summarization."
     entities = extract_named_entities(text)
     risk_analysis = analyze_risk_enhanced(text)
     clauses = analyze_contract_clauses(text)
@@ -529,14 +411,7 @@ async def analyze_legal_video(file: UploadFile = File(...), background_tasks: Ba
     with open(transcript_path, "w") as f:
         f.write(text)
     summary_text = text[:4096] if len(text) > 4096 else text
-    try:
-        if len(text) > 100:
-            summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
-        else:
-            summary = "Transcript too short for meaningful summarization."
-    except Exception as e:
-        summary = "Summarization failed due to an error."
-        print(f"Summarization error: {e}")
+    summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] if len(text) > 100 else "Transcript too short for meaningful summarization."
     entities = extract_named_entities(text)
     risk_analysis = analyze_risk_enhanced(text)
     clauses = analyze_contract_clauses(text)
@@ -576,14 +451,7 @@ async def analyze_legal_audio(file: UploadFile = File(...), background_tasks: Ba
     with open(transcript_path, "w") as f:
         f.write(text)
     summary_text = text[:4096] if len(text) > 4096 else text
-    try:
-        if len(text) > 100:
-            summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
-        else:
-            summary = "Transcript too short for meaningful summarization."
-    except Exception as e:
-        summary = "Summarization failed due to an error."
-        print(f"Summarization error: {e}")
+    summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] if len(text) > 100 else "Transcript too short for meaningful summarization."
     entities = extract_named_entities(text)
     risk_analysis = analyze_risk_enhanced(text)
     clauses = analyze_contract_clauses(text)
@@ -616,7 +484,7 @@ async def get_transcript(transcript_id: str):
 async def legal_chatbot_api(query: str = Form(...), task_id: str = Form(...)):
     document_context = load_document_context(task_id)
     if not document_context:
-        return {"response": "
+        return {"response": "⚠ No relevant document found for this task ID."}
     response = legal_chatbot(query, document_context)
     return {"response": response, "chat_history": chat_history[-5:]}
 
@@ -646,95 +514,129 @@ def setup_ngrok():
             try:
                 tunnels = ngrok.get_tunnels()
                 if not tunnels:
-                    print("
+                    print("⚠ Ngrok tunnel closed. Reconnecting...")
                     ngrok_tunnel = ngrok.connect(8500, "http")
                     print(f"✅ Reconnected. New URL: {ngrok_tunnel.public_url}")
             except Exception as e:
-                print(f"
+                print(f"⚠ Ngrok error: {e}")
         Thread(target=keep_alive, daemon=True).start()
         return public_url
     except Exception as e:
-        print(f"
+        print(f"⚠ Ngrok setup error: {e}")
         return None
 
-
-
+# ------------------------------
+# Dynamic Visualization Endpoints
+# ------------------------------
+
+@app.get("/download_risk_chart")
+async def download_risk_chart(task_id: str):
     try:
         text = load_document_context(task_id)
         if not text:
             raise HTTPException(status_code=404, detail="Document context not found")
-
-
-
-
-
-
-        plt.
-        plt.xlabel("Clause Type")
-        plt.ylabel("Confidence Score")
-        plt.title("Extracted Legal Clause Confidence Scores")
-        plt.xticks(rotation=45, ha="right")
-        plt.tight_layout()
-        bar_chart_path = os.path.join("static", f"clause_bar_chart_{task_id}.png")
-        plt.savefig(bar_chart_path)
+        risk_analysis = analyze_risk_enhanced(text)
+        plt.figure(figsize=(8, 5))
+        plt.bar(["Risk Score"], [risk_analysis["risk_score"]], color='red')
+        plt.ylabel("Risk Score")
+        plt.title("Legal Risk Assessment (Enhanced)")
+        risk_chart_path = os.path.join("static", f"risk_chart_{task_id}.png")
+        plt.savefig(risk_chart_path)
         plt.close()
-        return FileResponse(
+        return FileResponse(risk_chart_path, media_type="image/png", filename=f"risk_chart_{task_id}.png")
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error generating
+        raise HTTPException(status_code=500, detail=f"Error generating risk chart: {str(e)}")
 
-@app.get("/
-async def
+@app.get("/download_risk_pie_chart")
+async def download_risk_pie_chart(task_id: str):
     try:
         text = load_document_context(task_id)
         if not text:
             raise HTTPException(status_code=404, detail="Document context not found")
-
-
-
-
-
-        labels = list(clause_counter.keys())
-        sizes = list(clause_counter.values())
+        risk_analysis = analyze_risk_enhanced(text)
+        labels = ["Risk", "No Risk"]
+        risk_value = risk_analysis["risk_score"]
+        risk_value = min(max(risk_value, 0), 1)
+        values = [risk_value, 1 - risk_value]
         plt.figure(figsize=(6, 6))
-
-
-
-
-        plt.title("Clause Type Distribution")
-        plt.tight_layout()
-        donut_chart_path = os.path.join("static", f"clause_donut_chart_{task_id}.png")
-        plt.savefig(donut_chart_path)
+        plt.pie(values, labels=labels, autopct='%1.1f%%', startangle=90)
+        plt.title("Legal Risk Distribution (Enhanced)")
+        pie_chart_path = os.path.join("static", f"risk_pie_chart_{task_id}.png")
+        plt.savefig(pie_chart_path)
         plt.close()
-        return FileResponse(
+        return FileResponse(pie_chart_path, media_type="image/png", filename=f"risk_pie_chart_{task_id}.png")
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error generating
+        raise HTTPException(status_code=500, detail=f"Error generating pie chart: {str(e)}")
 
-@app.get("/
-async def
+@app.get("/download_risk_radar_chart")
+async def download_risk_radar_chart(task_id: str):
     try:
         text = load_document_context(task_id)
         if not text:
             raise HTTPException(status_code=404, detail="Document context not found")
-
-
-
-
-        values = [c["confidence"] for c in clauses]
-        labels += labels[:1]
+        risk_analysis = analyze_risk_enhanced(text)
+        categories = ["Average Sentiment", "Risk Score"]
+        values = [risk_analysis["average_sentiment"], risk_analysis["risk_score"]]
+        categories += categories[:1]
         values += values[:1]
-        angles = np.linspace(0, 2 * np.pi, len(
+        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
         angles += angles[:1]
         fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
         ax.plot(angles, values, 'o-', linewidth=2)
         ax.fill(angles, values, alpha=0.25)
-        ax.set_thetagrids(np.degrees(angles[:-1]),
-        ax.set_title("Legal
-        radar_chart_path = os.path.join("static", f"
+        ax.set_thetagrids(np.degrees(angles[:-1]), ["Sentiment", "Risk"])
+        ax.set_title("Legal Risk Radar Chart (Enhanced)", y=1.1)
+        radar_chart_path = os.path.join("static", f"risk_radar_chart_{task_id}.png")
         plt.savefig(radar_chart_path)
         plt.close()
-        return FileResponse(radar_chart_path, media_type="image/png", filename=f"
+        return FileResponse(radar_chart_path, media_type="image/png", filename=f"risk_radar_chart_{task_id}.png")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error generating radar chart: {str(e)}")
+
+@app.get("/download_risk_trend_chart")
+async def download_risk_trend_chart(task_id: str):
+    try:
+        text = load_document_context(task_id)
+        if not text:
+            raise HTTPException(status_code=404, detail="Document context not found")
+        words = text.split()
+        segments = np.array_split(words, 4)
+        segment_texts = [" ".join(segment) for segment in segments]
+        trend_scores = []
+        for segment in segment_texts:
+            risk = analyze_risk_enhanced(segment)
+            trend_scores.append(risk["risk_score"])
+        segments_labels = [f"Segment {i+1}" for i in range(len(segment_texts))]
+        plt.figure(figsize=(10, 6))
+        plt.plot(segments_labels, trend_scores, marker='o')
+        plt.xlabel("Document Segments")
+        plt.ylabel("Risk Score")
+        plt.title("Dynamic Legal Risk Trends (Enhanced)")
+        plt.xticks(rotation=45)
+        trend_chart_path = os.path.join("static", f"risk_trend_chart_{task_id}.png")
+        plt.savefig(trend_chart_path, bbox_inches="tight")
+        plt.close()
+        return FileResponse(trend_chart_path, media_type="image/png", filename=f"risk_trend_chart_{task_id}.png")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error generating trend chart: {str(e)}")
+
+@app.get("/interactive_risk_chart", response_class=HTMLResponse)
+async def interactive_risk_chart(task_id: str):
+    try:
+        import pandas as pd
+        import plotly.express as px
+        text = load_document_context(task_id)
+        if not text:
+            raise HTTPException(status_code=404, detail="Document context not found")
+        risk_analysis = analyze_risk_enhanced(text)
+        df = pd.DataFrame({
+            "Metric": ["Average Sentiment", "Risk Score"],
+            "Value": [risk_analysis["average_sentiment"], risk_analysis["risk_score"]]
+        })
+        fig = px.bar(df, x="Metric", y="Value", title="Interactive Enhanced Legal Risk Assessment")
+        return fig.to_html()
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error generating
+        raise HTTPException(status_code=500, detail=f"Error generating interactive chart: {str(e)}")
 
 def run():
     print("Starting FastAPI server...")
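Note: a minimal client-side sketch for exercising the new visualization endpoints, assuming the server listens on port 8500 (as in the ngrok setup) and that the analyze endpoint's JSON response exposes the stored document's ID under a task_id key; that key name is hypothetical, since the response shape is not part of this diff:

    import requests

    BASE = "http://localhost:8500"

    with open("contract.pdf", "rb") as f:
        resp = requests.post(f"{BASE}/analyze_legal_document", files={"file": f})
    task_id = resp.json()["task_id"]  # hypothetical key; check the actual response shape

    for name in ("download_risk_chart", "download_risk_pie_chart",
                 "download_risk_radar_chart", "download_risk_trend_chart"):
        png = requests.get(f"{BASE}/{name}", params={"task_id": task_id})
        with open(f"{name}.png", "wb") as out:
            out.write(png.content)

    html = requests.get(f"{BASE}/interactive_risk_chart", params={"task_id": task_id}).text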
@@ -745,5 +647,5 @@ if __name__ == "__main__":
     if public_url:
         print(f"\n✅ Your API is publicly available at: {public_url}/docs\n")
     else:
-        print("\n
+        print("\n⚠ Ngrok setup failed. API will only be available locally.\n")
     run()
|