tejash300 committed
Commit f0cbc9e · verified · 1 Parent(s): d951253

Update app.py

Files changed (1): app.py +28 -16
app.py CHANGED
@@ -1,6 +1,5 @@
 import os
 os.environ["TRANSFORMERS_NO_FAST"] = "1"  # Force use of slow tokenizers
-
 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
 
 import io
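
A side note on this hunk: TRANSFORMERS_NO_FAST is set before importing transformers, but whether the installed transformers version actually reads this variable is an assumption; the documented per-call way to force a slow (pure-Python) tokenizer is use_fast=False. Note also that slow tokenizers do not support return_offsets_mapping, which the CUAD preprocessing below depends on, so that tokenizer presumably needs to stay fast. A minimal sketch:

from transformers import AutoTokenizer

# Explicitly request a slow (pure-Python) tokenizer for one model,
# rather than relying on an environment variable being honored.
slow_tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
print(type(slow_tok).__name__)  # e.g. BertTokenizer rather than BertTokenizerFast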
@@ -126,7 +125,10 @@ def fine_tune_cuad_model():
         tokenized_examples["end_positions"] = []
         for i, offsets in enumerate(offset_mapping):
             input_ids = tokenized_examples["input_ids"][i]
-            cls_index = input_ids.index(tokenizer.cls_token_id)
+            try:
+                cls_index = input_ids.index(tokenizer.cls_token_id)
+            except ValueError:
+                cls_index = 0
             sequence_ids = tokenized_examples.sequence_ids(i)
             sample_index = sample_mapping[i]
             answers = examples["answers"][sample_index]
@@ -137,21 +139,29 @@ def fine_tune_cuad_model():
                 start_char = answers["answer_start"][0]
                 end_char = start_char + len(answers["text"][0])
                 tokenized_start_index = 0
-                while sequence_ids[tokenized_start_index] != 1:
+                while tokenized_start_index < len(sequence_ids) and sequence_ids[tokenized_start_index] != 1:
                     tokenized_start_index += 1
                 tokenized_end_index = len(input_ids) - 1
-                while sequence_ids[tokenized_end_index] != 1:
+                while tokenized_end_index >= 0 and sequence_ids[tokenized_end_index] != 1:
                     tokenized_end_index -= 1
-                if not (offsets[tokenized_start_index][0] <= start_char and offsets[tokenized_end_index][1] >= end_char):
+                # Safety check: if indices are not found, default to cls_index
+                if tokenized_start_index >= len(offsets) or tokenized_end_index < 0:
+                    tokenized_examples["start_positions"].append(cls_index)
+                    tokenized_examples["end_positions"].append(cls_index)
+                elif not (offsets[tokenized_start_index][0] <= start_char and offsets[tokenized_end_index][1] >= end_char):
                     tokenized_examples["start_positions"].append(cls_index)
                     tokenized_examples["end_positions"].append(cls_index)
                 else:
+                    # Move tokenized_start_index to the first token after start_char
                     while tokenized_start_index < len(offsets) and offsets[tokenized_start_index][0] <= start_char:
                         tokenized_start_index += 1
-                    tokenized_examples["start_positions"].append(tokenized_start_index - 1)
-                    while offsets[tokenized_end_index][1] >= end_char:
+                    safe_start = tokenized_start_index - 1 if tokenized_start_index > 0 else cls_index
+                    tokenized_examples["start_positions"].append(safe_start)
+                    # Move tokenized_end_index backwards to the last token before end_char
+                    while tokenized_end_index >= 0 and offsets[tokenized_end_index][1] >= end_char:
                         tokenized_end_index -= 1
-                    tokenized_examples["end_positions"].append(tokenized_end_index + 1)
+                    safe_end = tokenized_end_index + 1 if tokenized_end_index < len(offsets) - 1 else cls_index
+                    tokenized_examples["end_positions"].append(safe_end)
         return tokenized_examples
 
     print("✅ Tokenizing dataset...")
@@ -209,11 +219,12 @@ try:
         tokenizer="facebook/bart-large-cnn",
         device=0 if torch.cuda.is_available() else -1
     )
-    if device == "cuda":
-        try:
-            summarizer.model.half()
-        except Exception as e:
-            print("FP16 conversion failed:", e)
+    # Commenting out FP16 conversion to avoid potential issues
+    # if device == "cuda":
+    #     try:
+    #         summarizer.model.half()
+    #     except Exception as e:
+    #         print("FP16 conversion failed:", e)
 
     embedding_model = SentenceTransformer("all-mpnet-base-v2", device=device)
     ner_model = pipeline("ner", model="dslim/bert-base-NER", device=0 if torch.cuda.is_available() else -1)
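
Dropping the blanket model.half() call is defensible: converting every BART weight to FP16 can overflow or produce NaNs, and it fails outright on CPU. If GPU memory was the motivation, a common alternative is autocast, which keeps weights in FP32 and runs selected ops in half precision. A sketch, reusing the summarizer defined above (long_text is a placeholder variable):

import torch

if torch.cuda.is_available():
    # Mixed-precision inference: safer than converting the whole model with .half().
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        summary = summarizer(long_text, max_length=150, min_length=40, do_sample=False)
else:
    summary = summarizer(long_text, max_length=150, min_length=40, do_sample=False)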
@@ -225,8 +236,9 @@ try:
         from transformers import AutoModelForQuestionAnswering
         cuad_model = AutoModelForQuestionAnswering.from_pretrained("fine_tuned_legal_qa")
         cuad_model.to(device)
-        if device == "cuda":
-            cuad_model.half()
+        # Commenting out FP16 conversion for cuad_model as well
+        # if device == "cuda":
+        #     cuad_model.half()
     else:
         print("⚠️ Fine-tuned QA model not found. Starting fine tuning on CUAD QA dataset. This may take a while...")
         cuad_tokenizer, cuad_model = fine_tune_cuad_model()
@@ -494,7 +506,7 @@ async def analyze_legal_audio(file: UploadFile = File(...), background_tasks: Ba
     with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
         temp_file.write(content)
         temp_file_path = temp_file.name
-    text = await process_audio_to_text(temp_audio_path=temp_file_path)
+    text = await process_audio_to_text(temp_file_path)
     if os.path.exists(temp_file_path):
        os.remove(temp_file_path)
    if not text:
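
One gap this hunk leaves open: os.remove only runs if process_audio_to_text returns normally, so a failed transcription leaks the temp file. A try/finally sketch using the same helper as the diff (transcribe_upload is a hypothetical wrapper):

import os
import tempfile

async def transcribe_upload(content: bytes, filename: str) -> str:
    # Persist the upload so the audio backend can read it from a real path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(filename)[1]) as temp_file:
        temp_file.write(content)
        temp_file_path = temp_file.name
    try:
        return await process_audio_to_text(temp_file_path)
    finally:
        # Cleanup happens even if transcription raises.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)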
 