Spaces:

tejash300
/

docanalyzer

Runtime error

App Files Files Community

tejash300 committed on Apr 5

Commit

e3b69f0

verified ·

1 Parent(s): dd43ec8

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -124

app.py CHANGED Viewed

@@ -38,24 +38,24 @@ from spacy.lang.en.stop_words import STOP_WORDS
 # Global cache for analysis results based on file hash
 analysis_cache = {}
-# Ensure compatibility with Google Colab
 try:
     from google.colab import drive
     drive.mount('/content/drive')
 except Exception:
-    pass  # Not in Colab
-# Make sure directories exist
 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
 # Use GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# FastAPI setup
 app = FastAPI(title="Legal Document and Video Analyzer")
-# CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -64,7 +64,7 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# In-memory storage
 document_storage = {}
 chat_history = []
@@ -79,14 +79,10 @@ def compute_md5(content: bytes) -> str:
     return hashlib.md5(content).hexdigest()
 #############################
-#   Fine-tuning on CUAD QA  #
 #############################
 def fine_tune_cuad_model():
-    """
-    Minimal stub for fine-tuning the CUAD QA model.
-    If you have a full fine-tuning script, place it here.
-    """
     from datasets import load_dataset
     from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering, AutoTokenizer
@@ -161,6 +157,7 @@ def fine_tune_cuad_model():
                     tokenized_examples["end_positions"].append(safe_end)
         return tokenized_examples
     train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
     val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
@@ -201,7 +198,7 @@ def fine_tune_cuad_model():
 #############################
 try:
-    # Load spacy
     try:
         nlp = spacy.load("en_core_web_sm")
     except Exception:
@@ -209,32 +206,29 @@ try:
         nlp = spacy.load("en_core_web_sm")
     print("✅ Loaded spaCy model.")
-    # Summarizer (GPU)
     summarizer = pipeline(
         "summarization",
         model="facebook/bart-large-cnn",
         tokenizer="facebook/bart-large-cnn",
         device=0 if device == "cuda" else -1
     )
-    # QA pipeline (GPU)
     qa_model = pipeline(
         "question-answering",
         model="deepset/roberta-base-squad2",
         device=0 if device == "cuda" else -1
     )
-    # Embeddings (GPU if available)
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
-    # Named Entity Recognition (GPU)
     ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if device == "cuda" else -1)
-    # Speech-to-text (GPU if available via device_map="auto")
     speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
                               device_map="auto" if device == "cuda" else None)
-    # Fine-tuned CUAD QA
     if os.path.exists("fine_tuned_legal_qa"):
         print("✅ Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
         cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
@@ -242,11 +236,10 @@ try:
         cuad_model = AutoModelForQuestionAnswering.from_pretrained("fine_tuned_legal_qa")
         cuad_model.to(device)
     else:
-        print("⚠️ Fine-tuned QA model not found. Fine-tuning now (this may be slow).")
         cuad_tokenizer, cuad_model = fine_tune_cuad_model()
         cuad_model.to(device)
-    # Sentiment (GPU)
     sentiment_pipeline = pipeline(
         "sentiment-analysis",
         model="distilbert-base-uncased-finetuned-sst-2-english",
@@ -281,9 +274,6 @@ def extract_text_from_pdf(pdf_file):
         raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
 async def process_video_to_text(video_file_path):
-    """
-    Extracts audio from video and runs speech-to-text.
-    """
     try:
         print(f"Processing video file at {video_file_path}")
         temp_audio_path = os.path.join("temp", "extracted_audio.wav")
@@ -305,9 +295,6 @@ async def process_video_to_text(video_file_path):
         raise HTTPException(status_code=400, detail=f"Video processing failed: {str(e)}")
 async def process_audio_to_text(audio_file_path):
-    """
-    Runs speech-to-text on an audio file.
-    """
     try:
         print(f"Processing audio file at {audio_file_path}")
         result = await run_in_threadpool(speech_to_text, audio_file_path)
@@ -319,9 +306,6 @@ async def process_audio_to_text(audio_file_path):
         raise HTTPException(status_code=400, detail=f"Audio processing failed: {str(e)}")
 def extract_named_entities(text):
-    """
-    Splits text into manageable chunks, runs spaCy for entity extraction.
-    """
     max_length = 10000
     entities = []
     for i in range(0, len(text), max_length):
@@ -373,11 +357,9 @@ def explain_topics(topics):
                     weight = float(weight_str)
                 except:
                     weight = 0.0
-                # Filter out short words & stop words
                 if word.lower() not in STOP_WORDS and len(word) > 1:
                     terms.append((weight, word))
         terms.sort(key=lambda x: -x[0])
-        # Heuristic labeling
         if terms:
             if any("liability" in w.lower() for _, w in terms):
                 label = "Liability & Penalty Risk"
@@ -419,20 +401,13 @@ def analyze_risk_enhanced(text):
 #############################
 def chunk_text_by_tokens(text, tokenizer, max_chunk_len=384, stride=128):
-    """
-    Convert the entire text into tokens once, then create overlapping chunks
-    of up to `max_chunk_len` tokens with overlap `stride`.
-    """
-    # Encode text once
     encoded = tokenizer(text, add_special_tokens=False)
     input_ids = encoded["input_ids"]
-    # We'll create overlapping windows of tokens
     chunks = []
     idx = 0
     while idx < len(input_ids):
         end = idx + max_chunk_len
         sub_ids = input_ids[idx:end]
-        # Convert back to text
         chunk_text = tokenizer.decode(sub_ids, skip_special_tokens=True)
         chunks.append(chunk_text)
         if end >= len(input_ids):
@@ -443,13 +418,7 @@ def chunk_text_by_tokens(text, tokenizer, max_chunk_len=384, stride=128):
     return chunks
 def analyze_contract_clauses(text):
-    """
-    Token-based chunking to avoid partial tokens.
-    Each chunk is fed into the fine-tuned CUAD model on GPU.
-    """
-    # We'll break the text into chunks of up to 384 tokens, with a stride of 128
     text_chunks = chunk_text_by_tokens(text, cuad_tokenizer, max_chunk_len=384, stride=128)
     try:
         clause_types = list(cuad_model.config.id2label.values())
     except Exception:
@@ -459,7 +428,6 @@ def analyze_contract_clauses(text):
             "Assignment", "Warranty", "Limitation of Liability", "Arbitration",
             "IP Rights", "Force Majeure", "Revenue/Profit Sharing", "Audit Rights"
         ]
     clauses_detected = []
     for chunk in text_chunks:
@@ -467,26 +435,20 @@ def analyze_contract_clauses(text):
         if not chunk:
             continue
         try:
-            # Tokenize the chunk again for the model
             tokenized_inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
             inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}
-            # Check for invalid token IDs
             if torch.any(inputs["input_ids"] >= cuad_model.config.vocab_size):
                 print("Invalid token id found; skipping chunk")
                 continue
             with torch.no_grad():
                 outputs = cuad_model(**inputs)
-                # Force synchronization so that if there's a device error, we catch it here
                 if device == "cuda":
                     torch.cuda.synchronize()
-            # Shape check
             if outputs.start_logits.shape[1] != inputs["input_ids"].shape[1]:
                 print("Mismatch in logits shape; skipping chunk")
                 continue
-            # For demonstration, we just apply a threshold to the start_logits
             predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
             for idx, confidence in enumerate(predictions):
                 if confidence > 0.5 and idx < len(clause_types):
@@ -494,21 +456,17 @@ def analyze_contract_clauses(text):
                         "type": clause_types[idx],
                         "confidence": float(confidence)
                     })
         except Exception as e:
             print(f"Error processing chunk: {e}")
-            # Clear GPU cache if there's an error
             if device == "cuda":
                 torch.cuda.empty_cache()
             continue
-    # Aggregate clauses by their highest confidence
     aggregated_clauses = {}
     for clause in clauses_detected:
         ctype = clause["type"]
         if ctype not in aggregated_clauses or clause["confidence"] > aggregated_clauses[ctype]["confidence"]:
             aggregated_clauses[ctype] = clause
     return list(aggregated_clauses.values())
 #############################
@@ -517,24 +475,14 @@ def analyze_contract_clauses(text):
 @app.post("/analyze_legal_document")
 async def analyze_legal_document(file: UploadFile = File(...)):
-    """
-    Analyze a legal document (PDF). Extract text, summarize, detect entities,
-    do risk analysis, detect clauses, and store context for chat.
-    """
     try:
         content = await file.read()
         file_hash = compute_md5(content)
-        # Return cached result if we've already processed this file
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
-        # Extract text
         text = await run_in_threadpool(extract_text_from_pdf, io.BytesIO(content))
         if not text:
             return {"status": "error", "message": "No valid text found in the document."}
-        # Summarize (handle short documents gracefully)
         summary_text = text[:4096] if len(text) > 4096 else text
         try:
             if len(text) > 100:
@@ -544,20 +492,11 @@ async def analyze_legal_document(file: UploadFile = File(...)):
         except Exception as e:
             summary = "Summarization failed due to an error."
             print(f"Summarization error: {e}")
-        # Extract named entities
         entities = extract_named_entities(text)
-        # Analyze risk
         risk_analysis = analyze_risk_enhanced(text)
-        # Detect clauses
         clauses = analyze_contract_clauses(text)
-        # Store the document context for chatbot
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
@@ -566,46 +505,29 @@ async def analyze_legal_document(file: UploadFile = File(...)):
             "risk_analysis": risk_analysis,
             "clauses_detected": clauses
         }
-        # Cache it
         analysis_cache[file_hash] = result
         return result
     except Exception as e:
         return {"status": "error", "message": str(e)}
 @app.post("/analyze_legal_video")
 async def analyze_legal_video(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
-    """
-    Analyze a legal video: transcribe, summarize, detect entities, risk analysis, etc.
-    """
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
-        # Save video temporarily
         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
             temp_file.write(content)
             temp_file_path = temp_file.name
-        # Transcribe
         text = await process_video_to_text(temp_file_path)
-        # Cleanup
         if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
         if not text:
             return {"status": "error", "message": "No speech could be transcribed from the video."}
-        # Save transcript
         transcript_path = os.path.join("static", f"transcript_{int(time.time())}.txt")
         with open(transcript_path, "w") as f:
             f.write(text)
-        # Summarize
         summary_text = text[:4096] if len(text) > 4096 else text
         try:
             if len(text) > 100:
@@ -615,16 +537,11 @@ async def analyze_legal_video(file: UploadFile = File(...), background_tasks: Ba
         except Exception as e:
             summary = "Summarization failed due to an error."
             print(f"Summarization error: {e}")
-        # Entities, risk, clauses
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
-        # Store context
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
@@ -642,36 +559,22 @@ async def analyze_legal_video(file: UploadFile = File(...), background_tasks: Ba
 @app.post("/analyze_legal_audio")
 async def analyze_legal_audio(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
-    """
-    Analyze an audio file: transcribe, summarize, detect entities, risk analysis, etc.
-    """
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
-        # Save audio temporarily
         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
             temp_file.write(content)
             temp_file_path = temp_file.name
-        # Transcribe
         text = await process_audio_to_text(temp_file_path)
-        # Cleanup
         if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
         if not text:
             return {"status": "error", "message": "No speech could be transcribed from the audio."}
-        # Save transcript
         transcript_path = os.path.join("static", f"transcript_{int(time.time())}.txt")
         with open(transcript_path, "w") as f:
             f.write(text)
-        # Summarize
         summary_text = text[:4096] if len(text) > 4096 else text
         try:
             if len(text) > 100:
@@ -681,16 +584,11 @@ async def analyze_legal_audio(file: UploadFile = File(...), background_tasks: Ba
         except Exception as e:
             summary = "Summarization failed due to an error."
             print(f"Summarization error: {e}")
-        # Entities, risk, clauses
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
-        # Store context
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
@@ -716,9 +614,6 @@ async def get_transcript(transcript_id: str):
 @app.post("/legal_chatbot")
 async def legal_chatbot_api(query: str = Form(...), task_id: str = Form(...)):
-    """
-    Simple QA pipeline on the stored document context.
-    """
     document_context = load_document_context(task_id)
     if not document_context:
         return {"response": "⚠️ No relevant document found for this task ID."}
@@ -762,7 +657,6 @@ def setup_ngrok():
         print(f"⚠️ Ngrok setup error: {e}")
         return None
-# Visualization endpoints
 @app.get("/download_clause_bar_chart")
 async def download_clause_bar_chart(task_id: str):
     try:
@@ -826,7 +720,6 @@ async def download_clause_radar_chart(task_id: str):
             raise HTTPException(status_code=404, detail="No clauses detected.")
         labels = [c["type"] for c in clauses]
         values = [c["confidence"] for c in clauses]
-        # close the loop for radar
         labels += labels[:1]
         values += values[:1]
         angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
@@ -854,4 +747,3 @@ if __name__ == "__main__":
     else:
         print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
     run()

 # Global cache for analysis results based on file hash
 analysis_cache = {}
+# Ensure compatibility with Google Colab (if applicable)
 try:
     from google.colab import drive
     drive.mount('/content/drive')
 except Exception:
+    pass  # Not running in Colab
+# Ensure required directories exist
 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
 # Use GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Initialize FastAPI
 app = FastAPI(title="Legal Document and Video Analyzer")
+# Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# In-memory storage for document text and chat history
 document_storage = {}
 chat_history = []
     return hashlib.md5(content).hexdigest()
 #############################
+#   Fine-tuning on CUAD QA   #
 #############################
 def fine_tune_cuad_model():
     from datasets import load_dataset
     from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering, AutoTokenizer
                     tokenized_examples["end_positions"].append(safe_end)
         return tokenized_examples
+    print("✅ Tokenizing dataset...")
     train_dataset = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
     val_dataset = val_dataset.map(prepare_train_features, batched=True, remove_columns=val_dataset.column_names)
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
 #############################
 try:
+    # Load spaCy model
     try:
         nlp = spacy.load("en_core_web_sm")
     except Exception:
         nlp = spacy.load("en_core_web_sm")
     print("✅ Loaded spaCy model.")
+    # Create summarizer and QA pipelines (on GPU when available, otherwise CPU)
     summarizer = pipeline(
         "summarization",
         model="facebook/bart-large-cnn",
         tokenizer="facebook/bart-large-cnn",
         device=0 if device == "cuda" else -1
     )
     qa_model = pipeline(
         "question-answering",
         model="deepset/roberta-base-squad2",
         device=0 if device == "cuda" else -1
     )
+    # Use GPU for sentence embeddings if available
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
     ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if device == "cuda" else -1)
+    # Speech-to-text pipeline on GPU (if available)
     speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-medium", chunk_length_s=30,
                               device_map="auto" if device == "cuda" else None)
+    # Load or fine-tune the CUAD QA model and move it to the selected device
     if os.path.exists("fine_tuned_legal_qa"):
         print("✅ Loading fine-tuned CUAD QA model from fine_tuned_legal_qa...")
         cuad_tokenizer = AutoTokenizer.from_pretrained("fine_tuned_legal_qa")
         cuad_model = AutoModelForQuestionAnswering.from_pretrained("fine_tuned_legal_qa")
         cuad_model.to(device)
     else:
+        print("⚠️ Fine-tuned QA model not found. Fine-tuning now (this may take a while)...")
         cuad_tokenizer, cuad_model = fine_tune_cuad_model()
         cuad_model.to(device)
     sentiment_pipeline = pipeline(
         "sentiment-analysis",
         model="distilbert-base-uncased-finetuned-sst-2-english",
         raise HTTPException(status_code=400, detail=f"PDF extraction failed: {str(e)}")
 async def process_video_to_text(video_file_path):
     try:
         print(f"Processing video file at {video_file_path}")
         temp_audio_path = os.path.join("temp", "extracted_audio.wav")
         raise HTTPException(status_code=400, detail=f"Video processing failed: {str(e)}")
 async def process_audio_to_text(audio_file_path):
     try:
         print(f"Processing audio file at {audio_file_path}")
         result = await run_in_threadpool(speech_to_text, audio_file_path)
         raise HTTPException(status_code=400, detail=f"Audio processing failed: {str(e)}")
 def extract_named_entities(text):
     max_length = 10000
     entities = []
     for i in range(0, len(text), max_length):
                     weight = float(weight_str)
                 except:
                     weight = 0.0
                 if word.lower() not in STOP_WORDS and len(word) > 1:
                     terms.append((weight, word))
         terms.sort(key=lambda x: -x[0])
         if terms:
             if any("liability" in w.lower() for _, w in terms):
                 label = "Liability & Penalty Risk"
 #############################
 def chunk_text_by_tokens(text, tokenizer, max_chunk_len=384, stride=128):
     encoded = tokenizer(text, add_special_tokens=False)
     input_ids = encoded["input_ids"]
     chunks = []
     idx = 0
     while idx < len(input_ids):
         end = idx + max_chunk_len
         sub_ids = input_ids[idx:end]
         chunk_text = tokenizer.decode(sub_ids, skip_special_tokens=True)
         chunks.append(chunk_text)
         if end >= len(input_ids):
     return chunks
 def analyze_contract_clauses(text):
     text_chunks = chunk_text_by_tokens(text, cuad_tokenizer, max_chunk_len=384, stride=128)
     try:
         clause_types = list(cuad_model.config.id2label.values())
     except Exception:
             "Assignment", "Warranty", "Limitation of Liability", "Arbitration",
             "IP Rights", "Force Majeure", "Revenue/Profit Sharing", "Audit Rights"
         ]
     clauses_detected = []
     for chunk in text_chunks:
         if not chunk:
             continue
         try:
             tokenized_inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
+            # Move tensors to the selected device and clamp token IDs so they stay within the model's vocabulary range
             inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}
+            inputs["input_ids"] = torch.clamp(inputs["input_ids"], max=cuad_model.config.vocab_size - 1)
             if torch.any(inputs["input_ids"] >= cuad_model.config.vocab_size):
                 print("Invalid token id found; skipping chunk")
                 continue
             with torch.no_grad():
                 outputs = cuad_model(**inputs)
                 if device == "cuda":
                     torch.cuda.synchronize()
             if outputs.start_logits.shape[1] != inputs["input_ids"].shape[1]:
                 print("Mismatch in logits shape; skipping chunk")
                 continue
             predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
             for idx, confidence in enumerate(predictions):
                 if confidence > 0.5 and idx < len(clause_types):
                         "type": clause_types[idx],
                         "confidence": float(confidence)
                     })
         except Exception as e:
             print(f"Error processing chunk: {e}")
             if device == "cuda":
                 torch.cuda.empty_cache()
             continue
     aggregated_clauses = {}
     for clause in clauses_detected:
         ctype = clause["type"]
         if ctype not in aggregated_clauses or clause["confidence"] > aggregated_clauses[ctype]["confidence"]:
             aggregated_clauses[ctype] = clause
     return list(aggregated_clauses.values())
 #############################
 @app.post("/analyze_legal_document")
 async def analyze_legal_document(file: UploadFile = File(...)):
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
         text = await run_in_threadpool(extract_text_from_pdf, io.BytesIO(content))
         if not text:
             return {"status": "error", "message": "No valid text found in the document."}
         summary_text = text[:4096] if len(text) > 4096 else text
         try:
             if len(text) > 100:
         except Exception as e:
             summary = "Summarization failed due to an error."
             print(f"Summarization error: {e}")
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
             "risk_analysis": risk_analysis,
             "clauses_detected": clauses
         }
         analysis_cache[file_hash] = result
         return result
     except Exception as e:
         return {"status": "error", "message": str(e)}
 @app.post("/analyze_legal_video")
 async def analyze_legal_video(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
             temp_file.write(content)
             temp_file_path = temp_file.name
         text = await process_video_to_text(temp_file_path)
         if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
         if not text:
             return {"status": "error", "message": "No speech could be transcribed from the video."}
         transcript_path = os.path.join("static", f"transcript_{int(time.time())}.txt")
         with open(transcript_path, "w") as f:
             f.write(text)
         summary_text = text[:4096] if len(text) > 4096 else text
         try:
             if len(text) > 100:
         except Exception as e:
             summary = "Summarization failed due to an error."
             print(f"Summarization error: {e}")
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
 @app.post("/analyze_legal_audio")
 async def analyze_legal_audio(file: UploadFile = File(...), background_tasks: BackgroundTasks = None):
     try:
         content = await file.read()
         file_hash = compute_md5(content)
         if file_hash in analysis_cache:
             return analysis_cache[file_hash]
         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
             temp_file.write(content)
             temp_file_path = temp_file.name
         text = await process_audio_to_text(temp_file_path)
         if os.path.exists(temp_file_path):
             os.remove(temp_file_path)
         if not text:
             return {"status": "error", "message": "No speech could be transcribed from the audio."}
         transcript_path = os.path.join("static", f"transcript_{int(time.time())}.txt")
         with open(transcript_path, "w") as f:
             f.write(text)
         summary_text = text[:4096] if len(text) > 4096 else text
         try:
             if len(text) > 100:
         except Exception as e:
             summary = "Summarization failed due to an error."
             print(f"Summarization error: {e}")
         entities = extract_named_entities(text)
         risk_analysis = analyze_risk_enhanced(text)
         clauses = analyze_contract_clauses(text)
         generated_task_id = str(uuid.uuid4())
         store_document_context(generated_task_id, text)
         result = {
             "status": "success",
             "task_id": generated_task_id,
 @app.post("/legal_chatbot")
 async def legal_chatbot_api(query: str = Form(...), task_id: str = Form(...)):
     document_context = load_document_context(task_id)
     if not document_context:
         return {"response": "⚠️ No relevant document found for this task ID."}
         print(f"⚠️ Ngrok setup error: {e}")
         return None
 @app.get("/download_clause_bar_chart")
 async def download_clause_bar_chart(task_id: str):
     try:
             raise HTTPException(status_code=404, detail="No clauses detected.")
         labels = [c["type"] for c in clauses]
         values = [c["confidence"] for c in clauses]
         labels += labels[:1]
         values += values[:1]
         angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
     else:
         print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
     run()