tejash300 committed
Commit f0cbc9e · verified · 1 Parent(s): d951253

Update app.py

Files changed (1): app.py +28 -16
app.py CHANGED
@@ -1,6 +1,5 @@
 import os
 os.environ["TRANSFORMERS_NO_FAST"] = "1"  # Force use of slow tokenizers
-
 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 
 import io
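
A side note on this hunk: TRANSFORMERS_NO_FAST is set before importing transformers, but whether the installed transformers version actually reads this variable is an assumption; the documented per-call way to force a slow (pure-Python) tokenizer is use_fast=False. Note also that slow tokenizers do not support return_offsets_mapping, which the CUAD preprocessing below depends on, so that tokenizer presumably needs to stay fast. A minimal sketch:

from transformers import AutoTokenizer

# Explicitly request a slow (pure-Python) tokenizer for one model,
# rather than relying on an environment variable being honored.
slow_tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
print(type(slow_tok).__name__)  # e.g. BertTokenizer rather than BertTokenizerFast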
@@ -126,7 +125,10 @@ def fine_tune_cuad_model():
         tokenized_examples["end_positions"] = []
         for i, offsets in enumerate(offset_mapping):
             input_ids = tokenized_examples["input_ids"][i]
-            cls_index = input_ids.index(tokenizer.cls_token_id)
+            try:
+                cls_index = input_ids.index(tokenizer.cls_token_id)
+            except ValueError:
+                cls_index = 0
             sequence_ids = tokenized_examples.sequence_ids(i)
             sample_index = sample_mapping[i]
             answers = examples["answers"][sample_index]
@@ -137,21 +139,29 @@ def fine_tune_cuad_model():
                 start_char = answers["answer_start"][0]
                 end_char = start_char + len(answers["text"][0])
                 tokenized_start_index = 0
-                while sequence_ids[tokenized_start_index] != 1:
+                while tokenized_start_index < len(sequence_ids) and sequence_ids[tokenized_start_index] != 1:
                     tokenized_start_index += 1
                 tokenized_end_index = len(input_ids) - 1
-                while sequence_ids[tokenized_end_index] != 1:
+                while tokenized_end_index >= 0 and sequence_ids[tokenized_end_index] != 1:
                     tokenized_end_index -= 1
-                if not (offsets[tokenized_start_index][0] <= start_char and offsets[tokenized_end_index][1] >= end_char):
+                # Safety check: if indices are not found, default to cls_index
+                if tokenized_start_index >= len(offsets) or tokenized_end_index < 0:
+                    tokenized_examples["start_positions"].append(cls_index)
+                    tokenized_examples["end_positions"].append(cls_index)
+                elif not (offsets[tokenized_start_index][0] <= start_char and offsets[tokenized_end_index][1] >= end_char):
                     tokenized_examples["start_positions"].append(cls_index)
                     tokenized_examples["end_positions"].append(cls_index)
                 else:
+                    # Move tokenized_start_index to the first token after start_char
                     while tokenized_start_index < len(offsets) and offsets[tokenized_start_index][0] <= start_char:
                         tokenized_start_index += 1
-                    tokenized_examples["start_positions"].append(tokenized_start_index - 1)
-                    while offsets[tokenized_end_index][1] >= end_char:
+                    safe_start = tokenized_start_index - 1 if tokenized_start_index > 0 else cls_index
+                    tokenized_examples["start_positions"].append(safe_start)
+                    # Move tokenized_end_index backwards to the last token before end_char
+                    while tokenized_end_index >= 0 and offsets[tokenized_end_index][1] >= end_char:
                         tokenized_end_index -= 1
-                    tokenized_examples["end_positions"].append(tokenized_end_index + 1)
+                    safe_end = tokenized_end_index + 1 if tokenized_end_index < len(offsets) - 1 else cls_index
+                    tokenized_examples["end_positions"].append(safe_end)
         return tokenized_examples
 
     print("✅ Tokenizing dataset...")
@@ -209,11 +219,12 @@ try:
         tokenizer="facebook/bart-large-cnn",
         device=0 if torch.cuda.is_available() else -1
     )
-    if device == "cuda":
-        try:
-            summarizer.model.half()
-        except Exception as e:
-            print("FP16 conversion failed:", e)
+    # Commenting out FP16 conversion to avoid potential issues
+    # if device == "cuda":
+    #     try:
+    #         summarizer.model.half()
+    #     except Exception as e:
+    #         print("FP16 conversion failed:", e)
 
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
     ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
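
Dropping the blanket model.half() call is defensible: converting every BART weight to FP16 can overflow or produce NaNs, and it fails outright on CPU. If GPU memory was the motivation, a common alternative is autocast, which keeps weights in FP32 and runs selected ops in half precision. A sketch, reusing the summarizer defined above (long_text is a placeholder variable):

import torch

if torch.cuda.is_available():
    # Mixed-precision inference: safer than converting the whole model with .half().
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        summary = summarizer(long_text, max_length=150, min_length=40, do_sample=False)
else:
    summary = summarizer(long_text, max_length=150, min_length=40, do_sample=False)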
@@ -225,8 +236,9 @@ try:
         from transformers import AutoModelForQuestionAnswering
         cuad_model = AutoModelForQuestionAnswering.from_pretrained("fine_tuned_legal_qa")
         cuad_model.to(device)
-        if device == "cuda":
-            cuad_model.half()
+        # Commenting out FP16 conversion for cuad_model as well
+        # if device == "cuda":
+        #     cuad_model.half()
     else:
         print("⚠️ Fine-tuned QA model not found. Starting fine tuning on CUAD QA dataset. This may take a while...")
         cuad_tokenizer, cuad_model = fine_tune_cuad_model()
@@ -494,7 +506,7 @@ async def analyze_legal_audio(file: UploadFile = File(...), background_tasks: Ba
     with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
         temp_file.write(content)
         temp_file_path = temp_file.name
-    text = await process_audio_to_text(temp_audio_path=temp_file_path)
+    text = await process_audio_to_text(temp_file_path)
     if os.path.exists(temp_file_path):
        os.remove(temp_file_path)
    if not text:
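
One gap this hunk leaves open: os.remove only runs if process_audio_to_text returns normally, so a failed transcription leaks the temp file. A try/finally sketch using the same helper as the diff (transcribe_upload is a hypothetical wrapper):

import os
import tempfile

async def transcribe_upload(content: bytes, filename: str) -> str:
    # Persist the upload so the audio backend can read it from a real path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
        temp_file.write(content)
        temp_file_path = temp_file.name
    try:
        return await process_audio_to_text(temp_file_path)
    finally:
        # Cleanup happens even if transcription raises.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)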
 