Spaces:

tejash300
/

docanalyzer

Runtime error

App Files Community

tejash300 committed on Apr 5

Commit

dd43ec8

verified ·

1 Parent(s): 4e897df

Update app.py

Browse files

Files changed (1) hide show

app.py +231 -76

app.py CHANGED Viewed

@@ -28,11 +28,11 @@ import hashlib  # For caching file results
 # For asynchronous blocking calls
 from starlette.concurrency import run_in_threadpool
-# Import gensim for topic modeling
 import gensim
 from gensim import corpora, models
-# Import spacy stop words
 from spacy.lang.en.stop_words import STOP_WORDS
 # Global cache for analysis results based on file hash
@@ -43,19 +43,19 @@ try:
     from google.colab import drive
     drive.mount('/content/drive')
 except Exception:
-    pass  # Skip drive mount if not in Google Colab
-# Ensure required directories exist
 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
-# Ensure GPU usage
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Initialize FastAPI
 app = FastAPI(title="Legal Document and Video Analyzer")
-# Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -64,31 +64,31 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# In-memory storage for document text and chat history
 document_storage = {}
 chat_history = []
-# Function to store document context by task ID
 def store_document_context(task_id, text):
     """Save ``text`` in the in-memory document store under ``task_id``.
     Any text already stored for that ``task_id`` is overwritten.
     Always returns ``True``.
     """
     document_storage.update({task_id: text})
     return True
-# Function to load document context by task ID
 def load_document_context(task_id):
     """Return the text stored for ``task_id``, or ``""`` if nothing is stored."""
     try:
         return document_storage[task_id]
     except KeyError:
         # Unknown task ID: callers receive an empty string rather than an error.
         return ""
-# Utility to compute MD5 hash from file content
 def compute_md5(content: bytes) -> str:
     """Return the hexadecimal MD5 digest of ``content``."""
     hasher = hashlib.md5()
     hasher.update(content)
     return hasher.hexdigest()
 #############################
-#   Fine-tuning on CUAD QA   #
 #############################
 def fine_tune_cuad_model():
     from datasets import load_dataset
-    import numpy as np
-    from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering
     print("✅ Loading CUAD dataset for fine tuning...")
     dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
@@ -144,7 +144,6 @@ def fine_tune_cuad_model():
                 tokenized_end_index = len(input_ids) - 1
                 while tokenized_end_index >= 0 and sequence_ids[tokenized_end_index] != 1:
                     tokenized_end_index -= 1
-                # Safety check: if indices are not found, default to cls_index
                 if tokenized_start_index >= len(offsets) or tokenized_end_index < 0:
                     tokenized_examples["start_positions"].append(cls_index)
                     tokenized_examples["end_positions"].append(cls_index)
@@ -152,19 +151,16 @@ def fine_tune_cuad_model():
                     tokenized_examples["start_positions"].append(cls_index)
                     tokenized_examples["end_positions"].append(cls_index)
                 else:
-                    # Move tokenized_start_index to the first token after start_char
                     while tokenized_start_index < len(offsets) and offsets[tokenized_start_index][0] <= start_char:
                         tokenized_start_index += 1
                     safe_start = tokenized_start_index - 1 if tokenized_start_index > 0 else cls_index
                     tokenized_examples["start_positions"].append(safe_start)
-                    # Move tokenized_end_index backwards to the last token before end_char
                     while tokenized_end_index >= 0 and offsets[tokenized_end_index][1] >= end_char:
                         tokenized_end_index -= 1
                     safe_end = tokenized_end_index + 1 if tokenized_end_index < len(offsets) - 1 else cls_index
                     tokenized_examples["end_positions"].append(safe_end)
         return tokenized_examples
-    print("✅ Tokenizing dataset...")
     train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
     val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
@@ -205,57 +201,74 @@ def fine_tune_cuad_model():
 #############################
 try:
     try:
         nlp = spacy.load("en_core_web_sm")
     except Exception:
         spacy.cli.download("en_core_web_sm")
         nlp = spacy.load("en_core_web_sm")
-    print("✅ Loading NLP models...")
-    # Update summarizer to use facebook/bart-large-cnn for summarization
     summarizer = pipeline(
         "summarization",
         model="facebook/bart-large-cnn",
         tokenizer="facebook/bart-large-cnn",
-        device=0 if torch.cuda.is_available() else -1
     )
-    # Commenting out FP16 conversion to avoid potential issues
-    # if device == "cuda":
-    #     try:
-    #         summarizer.model.half()
-    #     except Exception as e:
-    #         print("FP16 conversion failed:", e)
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
-    ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
     speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
-                              device_map="auto" if torch.cuda.is_available() else "cpu")
     if os.path.exists("fine_tuned_legal_qa"):
         print("✅ Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
         cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
         from transformers import AutoModelForQuestionAnswering
         cuad_model = AutoModelForQuestionAnswering.from_pretrained("fine_tuned_legal_qa")
         cuad_model.to(device)
-        # Commenting out FP16 conversion for cuad_model as well
-        # if device == "cuda":
-        #     cuad_model.half()
     else:
-        print("⚠️ Fine-tuned QA model not found. Starting fine tuning on CUAD QA dataset. This may take a while...")
         cuad_tokenizer, cuad_model = fine_tune_cuad_model()
         cuad_model.to(device)
-    print("✅ All models loaded successfully")
 except Exception as e:
     print(f"⚠️ Error loading models: {str(e)}")
     raise RuntimeError(f"Error loading models: {str(e)}")
-from transformers import pipeline
-qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
-sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=0 if torch.cuda.is_available() else -1)
 def legal_chatbot(user_input, context):
     global chat_history
     chat_history.append({"role": "user", "content": user_input})
-    response = qa_model(question=user_input, context=context)["answer"]
     chat_history.append({"role": "assistant", "content": response})
     return response
@@ -268,6 +281,9 @@ def extract_text_from_pdf(pdf_file):
         raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
 async def process_video_to_text(video_file_path):
     try:
         print(f"Processing video file at {video_file_path}")
         temp_audio_path = os.path.join("temp", "extracted_audio.wav")
@@ -289,6 +305,9 @@ async def process_video_to_text(video_file_path):
         raise HTTPException(status_code=400, detail=f"Video processing failed: {str(e)}")
 async def process_audio_to_text(audio_file_path):
     try:
         print(f"Processing audio file at {audio_file_path}")
         result = await run_in_threadpool(speech_to_text, audio_file_path)
@@ -300,6 +319,9 @@ async def process_audio_to_text(audio_file_path):
         raise HTTPException(status_code=400, detail=f"Audio processing failed: {str(e)}")
 def extract_named_entities(text):
     max_length = 10000
     entities = []
     for i in range(0, len(text), max_length):
@@ -308,9 +330,9 @@ def extract_named_entities(text):
         entities.extend([{"entity": ent.text, "label": ent.label_} for ent in doc.ents])
     return entities
-# -----------------------------
-# Enhanced Risk Analysis Functions
-# -----------------------------
 def analyze_sentiment(text):
     sentences = [sent.text for sent in nlp(text).sents]
@@ -337,11 +359,9 @@ def get_enhanced_context_info(text):
     enhanced["topics"] = analyze_topics(text, num_topics=5)
     return enhanced
-# New function to create a detailed, dynamic explanation for each topic
 def explain_topics(topics):
     explanation = {}
     for topic_idx, topic_str in topics:
-        # Split topic string into individual weighted terms
         parts = topic_str.split('+')
         terms = []
         for part in parts:
@@ -353,22 +373,23 @@ def explain_topics(topics):
                     weight = float(weight_str)
                 except:
                     weight = 0.0
-                # Filter out common stop words
                 if word.lower() not in STOP_WORDS and len(word) > 1:
                     terms.append((weight, word))
         terms.sort(key=lambda x: -x[0])
-        # Create a plain language label based on dominant words
         if terms:
-            if any("liability" in word.lower() for weight, word in terms):
                 label = "Liability & Penalty Risk"
-            elif any("termination" in word.lower() for weight, word in terms):
                 label = "Termination & Refund Risk"
-            elif any("compliance" in word.lower() for weight, word in terms):
                 label = "Compliance & Regulatory Risk"
             else:
                 label = "General Risk Language"
         else:
             label = "General Risk Language"
         explanation_text = (
             f"Topic {topic_idx} ({label}) is characterized by dominant terms: " +
             ", ".join([f"'{word}' ({weight:.3f})" for weight, word in terms[:5]])
@@ -393,10 +414,42 @@ def analyze_risk_enhanced(text):
         "topics_explanation": topics_explanation
     }
 def analyze_contract_clauses(text):
-    max_length = 512
-    step = 256
-    clauses_detected = []
     try:
         clause_types = list(cuad_model.config.id2label.values())
     except Exception:
@@ -406,60 +459,105 @@ def analyze_contract_clauses(text):
             "Assignment", "Warranty", "Limitation of Liability", "Arbitration",
             "IP Rights", "Force Majeure", "Revenue/Profit Sharing", "Audit Rights"
         ]
-    # Process text in chunks of 'max_length' with a step size 'step'
-    for i in range(0, len(text), step):
-        chunk = text[i:i+max_length]
-        if not chunk.strip():
-            continue  # Skip empty chunks
         try:
             tokenized_inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
             inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}
-            # Check that token IDs are within vocabulary bounds
-            max_token = inputs["input_ids"].max().item()
-            if max_token >= cuad_model.config.vocab_size:
-                print(f"Skipping chunk due to invalid token id: {max_token}")
                 continue
             with torch.no_grad():
                 outputs = cuad_model(**inputs)
-            # Optional: verify shape consistency
             if outputs.start_logits.shape[1] != inputs["input_ids"].shape[1]:
-                print("Mismatch in logits shape, skipping chunk")
                 continue
             predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
             for idx, confidence in enumerate(predictions):
                 if confidence > 0.5 and idx < len(clause_types):
-                    clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
         except Exception as e:
             print(f"Error processing chunk: {e}")
             continue
     aggregated_clauses = {}
     for clause in clauses_detected:
-        clause_type = clause["type"]
-        if clause_type not in aggregated_clauses or clause["confidence"] > aggregated_clauses[clause_type]["confidence"]:
-            aggregated_clauses[clause_type] = clause
     return list(aggregated_clauses.values())
-# -----------------------------
-# Endpoints
-# -----------------------------
 @app.post("/analyze_legal_document")
 async def analyze_legal_document(file: UploadFile = File(...)):
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
         text = await run_in_threadpool(extract_text_from_pdf, io.BytesIO(content))
         if not text:
             return {"status": "error", "message": "No valid text found in the document."}
         summary_text = text[:4096] if len(text) > 4096 else text
-        summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] if len(text) > 100 else "Document too short for meaningful summarization."
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
@@ -468,36 +566,65 @@ async def analyze_legal_document(file: UploadFile = File(...)):
             "risk_analysis": risk_analysis,
             "clauses_detected": clauses
         }
         analysis_cache[file_hash] = result
         return result
     except Exception as e:
         return {"status": "error", "message": str(e)}
 @app.post("/analyze_legal_video")
 async def analyze_legal_video(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
             temp_file.write(content)
             temp_file_path = temp_file.name
         text = await process_video_to_text(temp_file_path)
         if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
         if not text:
             return {"status": "error", "message": "No speech could be transcribed from the video."}
         transcript_path = os.path.join("static", f"transcript_{int(time.time())}.txt")
         with open(transcript_path, "w") as f:
             f.write(text)
         summary_text = text[:4096] if len(text) > 4096 else text
-        summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] if len(text) > 100 else "Transcript too short for meaningful summarization."
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
@@ -515,29 +642,55 @@ async def analyze_legal_video(file: UploadFile = File(...), background_tasks: Ba
 @app.post("/analyze_legal_audio")
 async def analyze_legal_audio(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
             temp_file.write(content)
             temp_file_path = temp_file.name
         text = await process_audio_to_text(temp_file_path)
         if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
         if not text:
             return {"status": "error", "message": "No speech could be transcribed from the audio."}
         transcript_path = os.path.join("static", f"transcript_{int(time.time())}.txt")
         with open(transcript_path, "w") as f:
             f.write(text)
         summary_text = text[:4096] if len(text) > 4096 else text
-        summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text'] if len(text) > 100 else "Transcript too short for meaningful summarization."
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
@@ -563,6 +716,9 @@ async def get_transcript(transcript_id: str):
 @app.post("/legal_chatbot")
 async def legal_chatbot_api(query: str = Form(...), task_id: str = Form(...)):
     document_context = load_document_context(task_id)
     if not document_context:
         return {"response": "⚠️ No relevant document found for this task ID."}
@@ -606,10 +762,7 @@ def setup_ngrok():
         print(f"⚠️ Ngrok setup error: {e}")
         return None
-# ------------------------------
-# Clause Visualization Endpoints
-# ------------------------------
 @app.get("/download_clause_bar_chart")
 async def download_clause_bar_chart(task_id: str):
     try:
@@ -673,6 +826,7 @@ async def download_clause_radar_chart(task_id: str):
             raise HTTPException(status_code=404, detail="No clauses detected.")
         labels = [c["type"] for c in clauses]
         values = [c["confidence"] for c in clauses]
         labels += labels[:1]
         values += values[:1]
         angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
@@ -700,3 +854,4 @@ if __name__ == "__main__":
     else:
         print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
     run()

 # For asynchronous blocking calls
 from starlette.concurrency import run_in_threadpool
+# Gensim for topic modeling
 import gensim
 from gensim import corpora, models
+# Spacy stop words
 from spacy.lang.en.stop_words import STOP_WORDS
 # Global cache for analysis results based on file hash
     from google.colab import drive
     drive.mount('/content/drive')
 except Exception:
+    pass  # Not in Colab
+# Make sure directories exist
 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
+# Use GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# FastAPI setup
 app = FastAPI(title="Legal Document and Video Analyzer")
+# CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# In-memory storage
 document_storage = {}
 chat_history = []
 def store_document_context(task_id, text):
     """Save ``text`` in the in-memory document store under ``task_id``.
     Any text already stored for that ``task_id`` is overwritten.
     Always returns ``True``.
     """
     document_storage.update({task_id: text})
     return True
 def load_document_context(task_id):
     """Return the text stored for ``task_id``, or ``""`` if nothing is stored."""
     try:
         return document_storage[task_id]
     except KeyError:
         # Unknown task ID: callers receive an empty string rather than an error.
         return ""
 def compute_md5(content: bytes) -> str:
     """Return the hexadecimal MD5 digest of ``content``."""
     hasher = hashlib.md5()
     hasher.update(content)
     return hasher.hexdigest()
 #############################
+#   Fine-tuning on CUAD QA  #
 #############################
 def fine_tune_cuad_model():
+    """
+    Minimal stub for fine-tuning the CUAD QA model.
+    If you have a full fine-tuning script, place it here.
+    """
     from datasets import load_dataset
+    from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering, AutoTokenizer
     print("✅ Loading CUAD dataset for fine tuning...")
     dataset = load_dataset("theatticusproject/cuad-qa", trust_remote_code=True)
                 tokenized_end_index = len(input_ids) - 1
                 while tokenized_end_index >= 0 and sequence_ids[tokenized_end_index] != 1:
                     tokenized_end_index -= 1
                 if tokenized_start_index >= len(offsets) or tokenized_end_index < 0:
                     tokenized_examples["start_positions"].append(cls_index)
                     tokenized_examples["end_positions"].append(cls_index)
                     tokenized_examples["start_positions"].append(cls_index)
                     tokenized_examples["end_positions"].append(cls_index)
                 else:
                     while tokenized_start_index < len(offsets) and offsets[tokenized_start_index][0] <= start_char:
                         tokenized_start_index += 1
                     safe_start = tokenized_start_index - 1 if tokenized_start_index > 0 else cls_index
                     tokenized_examples["start_positions"].append(safe_start)
                     while tokenized_end_index >= 0 and offsets[tokenized_end_index][1] >= end_char:
                         tokenized_end_index -= 1
                     safe_end = tokenized_end_index + 1 if tokenized_end_index < len(offsets) - 1 else cls_index
                     tokenized_examples["end_positions"].append(safe_end)
         return tokenized_examples
     train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
     val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
 #############################
 try:
+    # Load spacy
     try:
         nlp = spacy.load("en_core_web_sm")
     except Exception:
         spacy.cli.download("en_core_web_sm")
         nlp = spacy.load("en_core_web_sm")
+    print("✅ Loaded spaCy model.")
+    # Summarizer (GPU)
     summarizer = pipeline(
         "summarization",
         model="facebook/bart-large-cnn",
         tokenizer="facebook/bart-large-cnn",
+        device=0 if device == "cuda" else -1
     )
+    # QA pipeline (GPU)
+    qa_model = pipeline(
+        "question-answering",
+        model="deepset/roberta-base-squad2",
+        device=0 if device == "cuda" else -1
+    )
+    # Embeddings (GPU if available)
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
+    # Named Entity Recognition (GPU)
+    ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if device == "cuda" else -1)
+    # Speech-to-text (GPU if available via device_map="auto")
     speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
+                              device_map="auto" if device == "cuda" else None)
+    # Fine-tuned CUAD QA
     if os.path.exists("fine_tuned_legal_qa"):
         print("✅ Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
         cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
         from transformers import AutoModelForQuestionAnswering
         cuad_model = AutoModelForQuestionAnswering.from_pretrained("fine_tuned_legal_qa")
         cuad_model.to(device)
     else:
+        print("⚠️ Fine-tuned QA model not found. Fine-tuning now (this may be slow).")
         cuad_tokenizer, cuad_model = fine_tune_cuad_model()
         cuad_model.to(device)
+    # Sentiment (GPU)
+    sentiment_pipeline = pipeline(
+        "sentiment-analysis",
+        model="distilbert-base-uncased-finetuned-sst-2-english",
+        device=0 if device == "cuda" else -1
+    )
+    print("✅ All models loaded successfully.")
 except Exception as e:
     print(f"⚠️ Error loading models: {str(e)}")
     raise RuntimeError(f"Error loading models: {str(e)}")
+#############################
+#       Helper Functions    #
+#############################
 def legal_chatbot(user_input, context):
     global chat_history
     chat_history.append({"role": "user", "content": user_input})
+    try:
+        response = qa_model(question=user_input, context=context)["answer"]
+    except Exception as e:
+        response = f"Error processing query: {e}"
     chat_history.append({"role": "assistant", "content": response})
     return response
         raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
 async def process_video_to_text(video_file_path):
+    """
+    Extracts audio from video and runs speech-to-text.
+    """
     try:
         print(f"Processing video file at {video_file_path}")
         temp_audio_path = os.path.join("temp", "extracted_audio.wav")
         raise HTTPException(status_code=400, detail=f"Video processing failed: {str(e)}")
 async def process_audio_to_text(audio_file_path):
+    """
+    Runs speech-to-text on an audio file.
+    """
     try:
         print(f"Processing audio file at {audio_file_path}")
         result = await run_in_threadpool(speech_to_text, audio_file_path)
         raise HTTPException(status_code=400, detail=f"Audio processing failed: {str(e)}")
 def extract_named_entities(text):
+    """
+    Splits text into manageable chunks, runs spaCy for entity extraction.
+    """
     max_length = 10000
     entities = []
     for i in range(0, len(text), max_length):
         entities.extend([{"entity": ent.text, "label": ent.label_} for ent in doc.ents])
     return entities
+#############################
+#   Risk & Topic Analysis   #
+#############################
 def analyze_sentiment(text):
     sentences = [sent.text for sent in nlp(text).sents]
     enhanced["topics"] = analyze_topics(text, num_topics=5)
     return enhanced
 def explain_topics(topics):
     explanation = {}
     for topic_idx, topic_str in topics:
         parts = topic_str.split('+')
         terms = []
         for part in parts:
                     weight = float(weight_str)
                 except:
                     weight = 0.0
+                # Filter out short words & stop words
                 if word.lower() not in STOP_WORDS and len(word) > 1:
                     terms.append((weight, word))
         terms.sort(key=lambda x: -x[0])
+        # Heuristic labeling
         if terms:
+            if any("liability" in w.lower() for _, w in terms):
                 label = "Liability & Penalty Risk"
+            elif any("termination" in w.lower() for _, w in terms):
                 label = "Termination & Refund Risk"
+            elif any("compliance" in w.lower() for _, w in terms):
                 label = "Compliance & Regulatory Risk"
             else:
                 label = "General Risk Language"
         else:
             label = "General Risk Language"
         explanation_text = (
             f"Topic {topic_idx} ({label}) is characterized by dominant terms: " +
             ", ".join([f"'{word}' ({weight:.3f})" for weight, word in terms[:5]])
         "topics_explanation": topics_explanation
     }
+#############################
+#   Clause Detection (GPU)  #
+#############################
+def chunk_text_by_tokens(text, tokenizer, max_chunk_len=384, stride=128):
+    """
+    Convert the entire text into tokens once, then create overlapping chunks
+    of up to `max_chunk_len` tokens with overlap `stride`.
+    """
+    # Encode text once
+    encoded = tokenizer(text, add_special_tokens=False)
+    input_ids = encoded["input_ids"]
+    # We'll create overlapping windows of tokens
+    chunks = []
+    idx = 0
+    while idx < len(input_ids):
+        end = idx + max_chunk_len
+        sub_ids = input_ids[idx:end]
+        # Convert back to text
+        chunk_text = tokenizer.decode(sub_ids, skip_special_tokens=True)
+        chunks.append(chunk_text)
+        if end >= len(input_ids):
+            break
+        idx = end - stride
+        if idx < 0:
+            idx = 0
+    return chunks
 def analyze_contract_clauses(text):
+    """
+    Token-based chunking to avoid partial tokens.
+    Each chunk is fed into the fine-tuned CUAD model on GPU.
+    """
+    # We'll break the text into chunks of up to 384 tokens, with a stride of 128
+    text_chunks = chunk_text_by_tokens(text, cuad_tokenizer, max_chunk_len=384, stride=128)
     try:
         clause_types = list(cuad_model.config.id2label.values())
     except Exception:
             "Assignment", "Warranty", "Limitation of Liability", "Arbitration",
             "IP Rights", "Force Majeure", "Revenue/Profit Sharing", "Audit Rights"
         ]
+    clauses_detected = []
+    for chunk in text_chunks:
+        chunk = chunk.strip()
+        if not chunk:
+            continue
         try:
+            # Tokenize the chunk again for the model
             tokenized_inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
             inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}
+            # Check for invalid token IDs
+            if torch.any(inputs["input_ids"] >= cuad_model.config.vocab_size):
+                print("Invalid token id found; skipping chunk")
                 continue
             with torch.no_grad():
                 outputs = cuad_model(**inputs)
+                # Force synchronization so that if there's a device error, we catch it here
+                if device == "cuda":
+                    torch.cuda.synchronize()
+            # Shape check
             if outputs.start_logits.shape[1] != inputs["input_ids"].shape[1]:
+                print("Mismatch in logits shape; skipping chunk")
                 continue
+            # For demonstration, we just apply a threshold to the start_logits
             predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
             for idx, confidence in enumerate(predictions):
                 if confidence > 0.5 and idx < len(clause_types):
+                    clauses_detected.append({
+                        "type": clause_types[idx],
+                        "confidence": float(confidence)
+                    })
         except Exception as e:
             print(f"Error processing chunk: {e}")
+            # Clear GPU cache if there's an error
+            if device == "cuda":
+                torch.cuda.empty_cache()
             continue
+    # Aggregate clauses by their highest confidence
     aggregated_clauses = {}
     for clause in clauses_detected:
+        ctype = clause["type"]
+        if ctype not in aggregated_clauses or clause["confidence"] > aggregated_clauses[ctype]["confidence"]:
+            aggregated_clauses[ctype] = clause
     return list(aggregated_clauses.values())
+#############################
+#         Endpoints         #
+#############################
 @app.post("/analyze_legal_document")
 async def analyze_legal_document(file: UploadFile = File(...)):
+    """
+    Analyze a legal document (PDF). Extract text, summarize, detect entities,
+    do risk analysis, detect clauses, and store context for chat.
+    """
     try:
         content = await file.read()
         file_hash = compute_md5(content)
+        # Return cached result if we've already processed this file
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
+        # Extract text
         text = await run_in_threadpool(extract_text_from_pdf, io.BytesIO(content))
         if not text:
             return {"status": "error", "message": "No valid text found in the document."}
+        # Summarize (handle short documents gracefully)
         summary_text = text[:4096] if len(text) > 4096 else text
+        try:
+            if len(text) > 100:
+                summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
+            else:
+                summary = "Document too short for a meaningful summary."
+        except Exception as e:
+            summary = "Summarization failed due to an error."
+            print(f"Summarization error: {e}")
+        # Extract named entities
         entities = extract_named_entities(text)
+        # Analyze risk
         risk_analysis = analyze_risk_enhanced(text)
+        # Detect clauses
         clauses = analyze_contract_clauses(text)
+        # Store the document context for chatbot
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
             "risk_analysis": risk_analysis,
             "clauses_detected": clauses
         }
+        # Cache the result under the file's MD5 hash so the same upload is not reprocessed
         analysis_cache[file_hash] = result
         return result
     except Exception as e:
         return {"status": "error", "message": str(e)}
 @app.post("/analyze_legal_video")
 async def analyze_legal_video(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
+    """
+    Analyze a legal video: transcribe, summarize, detect entities, risk analysis, etc.
+    """
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
+        # Save video temporarily
         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
             temp_file.write(content)
             temp_file_path = temp_file.name
+        # Transcribe
         text = await process_video_to_text(temp_file_path)
+        # Remove the temporary video file once transcription has finished
         if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
         if not text:
             return {"status": "error", "message": "No speech could be transcribed from the video."}
+        # Save transcript
         transcript_path = os.path.join("static", f"transcript_{int(time.time())}.txt")
         with open(transcript_path, "w") as f:
             f.write(text)
+        # Summarize
         summary_text = text[:4096] if len(text) > 4096 else text
+        try:
+            if len(text) > 100:
+                summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
+            else:
+                summary = "Transcript too short for meaningful summarization."
+        except Exception as e:
+            summary = "Summarization failed due to an error."
+            print(f"Summarization error: {e}")
+        # Entities, risk, clauses
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
+        # Store the transcript under a fresh task ID
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
 @app.post("/analyze_legal_audio")
 async def analyze_legal_audio(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
+    """
+    Analyze an audio file: transcribe, summarize, detect entities, risk analysis, etc.
+    """
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
+        # Save audio temporarily
         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
             temp_file.write(content)
             temp_file_path = temp_file.name
+        # Transcribe
         text = await process_audio_to_text(temp_file_path)
+        # Remove the temporary audio file once transcription has finished
         if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
         if not text:
             return {"status": "error", "message": "No speech could be transcribed from the audio."}
+        # Save transcript
         transcript_path = os.path.join("static", f"transcript_{int(time.time())}.txt")
         with open(transcript_path, "w") as f:
             f.write(text)
+        # Summarize
         summary_text = text[:4096] if len(text) > 4096 else text
+        try:
+            if len(text) > 100:
+                summary = summarizer(summary_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
+            else:
+                summary = "Transcript too short for meaningful summarization."
+        except Exception as e:
+            summary = "Summarization failed due to an error."
+            print(f"Summarization error: {e}")
+        # Entities, risk, clauses
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
+        # Store the transcript under a fresh task ID
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
 @app.post("/legal_chatbot")
 async def legal_chatbot_api(query: str = Form(...), task_id: str = Form(...)):
+    """
+    Simple QA pipeline on the stored document context.
+    """
     document_context = load_document_context(task_id)
     if not document_context:
         return {"response": "⚠️ No relevant document found for this task ID."}
         print(f"⚠️ Ngrok setup error: {e}")
         return None
+# Visualization endpoints
 @app.get("/download_clause_bar_chart")
 async def download_clause_bar_chart(task_id: str):
     try:
             raise HTTPException(status_code=404, detail="No clauses detected.")
         labels = [c["type"] for c in clauses]
         values = [c["confidence"] for c in clauses]
+        # Repeat the first label and value at the end, intended to close the radar polygon
         labels += labels[:1]
         values += values[:1]
         angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
     else:
         print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
     run()