Spaces:

tejash300
/

docanalyzer

Runtime error

App Files Files Community

tejash300 committed on Apr 5

Commit

4e897df

verified ·

1 Parent(s): d5c52ad

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -11

app.py CHANGED Viewed

@@ -406,17 +406,32 @@ def analyze_contract_clauses(text):
             "Assignment", "Warranty", "Limitation of Liability", "Arbitration",
             "IP Rights", "Force Majeure", "Revenue/Profit Sharing", "Audit Rights"
         ]
-    chunks = [text[i:i+max_length] for i in range(0, len(text), step) if i+step < len(text)]
-    for chunk in chunks:
-        # Move each tensor individually to the device
-        tokenized_inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
-        inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}
-        with torch.no_grad():
-            outputs = cuad_model(**inputs)
-        predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
-        for idx, confidence in enumerate(predictions):
-            if confidence > 0.5 and idx < len(clause_types):
-                clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
     aggregated_clauses = {}
     for clause in clauses_detected:
         clause_type = clause["type"]

             "Assignment", "Warranty", "Limitation of Liability", "Arbitration",
             "IP Rights", "Force Majeure", "Revenue/Profit Sharing", "Audit Rights"
         ]
+    # Process text in chunks of 'max_length' with a step size 'step'
+    for i in range(0, len(text), step):
+        chunk = text[i:i+max_length]
+        if not chunk.strip():
+            continue  # Skip empty chunks
+        try:
+            tokenized_inputs = cuad_tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
+            inputs = {k: v.to(device) for k, v in tokenized_inputs.items()}
+            # Check that token IDs are within vocabulary bounds
+            max_token = inputs["input_ids"].max().item()
+            if max_token >= cuad_model.config.vocab_size:
+                print(f"Skipping chunk due to invalid token id: {max_token}")
+                continue
+            with torch.no_grad():
+                outputs = cuad_model(**inputs)
+            # Optional: verify shape consistency
+            if outputs.start_logits.shape[1] != inputs["input_ids"].shape[1]:
+                print("Mismatch in logits shape, skipping chunk")
+                continue
+            predictions = torch.sigmoid(outputs.start_logits).cpu().numpy()[0]
+            for idx, confidence in enumerate(predictions):
+                if confidence > 0.5 and idx < len(clause_types):
+                    clauses_detected.append({"type": clause_types[idx], "confidence": float(confidence)})
+        except Exception as e:
+            print(f"Error processing chunk: {e}")
+            continue
     aggregated_clauses = {}
     for clause in clauses_detected:
         clause_type = clause["type"]