Spaces:

tejash300
/

docanalyzer

Runtime error

App Files Files Community

tejash300 committed on Mar 31

Commit

64af888

verified ·

1 Parent(s): c575db1

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -18

app.py CHANGED Viewed

@@ -23,21 +23,21 @@ import time
 import uuid
 import subprocess  # For running ffmpeg commands
-# ✅ Ensure compatibility with Google Colab
 try:
     from google.colab import drive
     drive.mount('/content/drive')
-except:
     pass  # Skip drive mount if not in Google Colab
-# ✅ Ensure required directories exist
 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
-# ✅ Ensure GPU usage
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# ✅ Initialize FastAPI
 app = FastAPI(title="Legal Document and Video Analyzer")
 # Add CORS middleware
@@ -49,19 +49,17 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# ✅ Initialize document storage
 document_storage = {}
-chat_history = []  # Global chat history
-# ✅ Function to store document context by task ID
 def store_document_context(task_id, text):
-    """Store document text for retrieval by chatbot."""
     document_storage[task_id] = text
     return True
-# ✅ Function to load document context by task ID
 def load_document_context(task_id):
-    """Retrieve document text for chatbot context."""
     return document_storage.get(task_id, "")
 #############################
@@ -72,7 +70,7 @@ def fine_tune_cuad_model():
     """
     Fine tunes a QA model on the CUAD dataset for clause extraction.
     For testing, we use only 50 training examples (and 10 for validation)
-    and set training arguments for very fast, minimal training.
     """
     from datasets import load_dataset
     import numpy as np
@@ -151,18 +149,19 @@ def fine_tune_cuad_model():
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
     val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
-    # Adjust training arguments for fast testing
     training_args = TrainingArguments(
         output_dir="./fine_tuned_legal_qa",
         evaluation_strategy="steps",
-        eval_steps=10,
         learning_rate=2e-5,
         per_device_train_batch_size=4,
         per_device_eval_batch_size=4,
-        num_train_epochs=0.1,  # Very short training for testing purposes
         weight_decay=0.01,
-        logging_steps=5,
-        save_steps=10,
         load_best_model_at_end=True,
         report_to=[]  # Disable wandb logging
     )
@@ -741,4 +740,3 @@ if __name__ == "__main__":
     else:
         print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
     run()

 import uuid
 import subprocess  # For running ffmpeg commands
+# Ensure compatibility with Google Colab
 try:
     from google.colab import drive
     drive.mount('/content/drive')
+except Exception:
     pass  # Skip drive mount if not in Google Colab
+# Ensure required directories exist
 os.makedirs("static", exist_ok=True)
 os.makedirs("temp", exist_ok=True)
+# Ensure GPU usage
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# Initialize FastAPI
 app = FastAPI(title="Legal Document and Video Analyzer")
 # Add CORS middleware
     allow_headers=["*"],
 )
+# Initialize document storage and chat history
 document_storage = {}
+chat_history = []
+# Function to store document context by task ID
 def store_document_context(task_id, text):
     """Store ``text`` in the module-level ``document_storage`` dict under ``task_id``.

     The storage is a plain in-memory dict, so an existing entry for the
     same ``task_id`` is overwritten.

     Returns:
         Always ``True``.
     """
     document_storage[task_id] = text
     return True
+# Function to load document context by task ID
 def load_document_context(task_id):
     """Return the value stored under ``task_id`` in ``document_storage``.

     Returns:
         The stored entry, or an empty string when nothing has been stored
         for ``task_id``.
     """
     return document_storage.get(task_id, "")
 #############################
     """
     Fine tunes a QA model on the CUAD dataset for clause extraction.
     For testing, we use only 50 training examples (and 10 for validation)
+    and restrict training to 10 steps.
     """
     from datasets import load_dataset
     import numpy as np
     train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
     val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "start_positions", "end_positions"])
+    # Set max_steps to 10 for fast testing.
     training_args = TrainingArguments(
         output_dir="./fine_tuned_legal_qa",
+        max_steps=10,
         evaluation_strategy="steps",
+        eval_steps=5,
         learning_rate=2e-5,
         per_device_train_batch_size=4,
         per_device_eval_batch_size=4,
+        num_train_epochs=1,
         weight_decay=0.01,
+        logging_steps=1,
+        save_steps=5,
         load_best_model_at_end=True,
         report_to=[]  # Disable wandb logging
     )
     else:
         print("\n⚠️ Ngrok setup failed. API will only be available locally.\n")
     run()