# legal_document_summarization / legal_document_analysis.py
# sohampawar1030's picture
# Upload 13 files
# 6a020f1 verified
import streamlit as st
from groq import Groq
from PyPDF2 import PdfReader
from docx import Document
from tiktoken import get_encoding, Encoding
import concurrent.futures
import matplotlib.pyplot as plt
import io
import base64
import os
# Groq API client initialization.
# The API key is read from the GROQ_API_KEY environment variable so that no
# secret lives in source control.
# NOTE(review): the key that used to be hard-coded on this line has been
# published with the file and should be revoked.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: Whatever ``PdfReader`` accepts; the caller in this file passes
            the object returned by the Streamlit file uploader.

    Returns:
        str: The page texts joined with no separator. Pages for which the
        reader yields no text contribute an empty string.
    """
    reader = PdfReader(file)
    # ``or ""`` keeps a page without extractable text from breaking the join,
    # and join avoids rebuilding the string once per page.
    return "".join(page.extract_text() or "" for page in reader.pages)
def extract_text_from_docx(file):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        file: Whatever ``Document`` accepts; the caller in this file passes
            the object returned by the Streamlit file uploader.

    Returns:
        str: The ``text`` of each entry in ``doc.paragraphs`` joined with
        newline characters.
    """
    paragraphs = Document(file).paragraphs
    lines = [para.text for para in paragraphs]
    return "\n".join(lines)
def preprocess_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Newlines and carriage returns are treated like any other whitespace, and
    leading/trailing whitespace is removed.
    """
    # str.split() with no arguments already splits on "\n" and "\r" as well
    # as spaces and tabs, and drops empty fields.
    tokens = text.split()
    return " ".join(tokens)
def get_default_encoding():
    """Return the tiktoken encoding named ``cl100k_base``.

    This is the encoding the chunking code in this file uses to count tokens.
    """
    encoding_name = "cl100k_base"
    return get_encoding(encoding_name)
def split_into_chunks(text, token_limit=5500):
    """Split *text* on whitespace into chunks of at most *token_limit* tokens.

    Tokens are counted per word (each word encoded together with one trailing
    space) using the encoding from ``get_default_encoding()``.

    Args:
        text: The text to split.
        token_limit: Maximum token count per chunk.

    Returns:
        list[str]: Chunks of space-joined words, in original order. The list
        is empty when *text* contains no words. A single word whose own token
        count exceeds *token_limit* is kept as a chunk by itself, since words
        are never split.
    """
    encode = get_default_encoding().encode  # hoisted out of the loop
    chunks = []
    current_chunk = []
    current_tokens = 0
    for word in text.split():
        word_tokens = len(encode(word + " "))
        # Only flush a chunk that actually holds words; otherwise an oversized
        # first word would emit an empty chunk.
        if current_chunk and current_tokens + word_tokens > token_limit:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_tokens = 0
        current_chunk.append(word)
        current_tokens += word_tokens
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def summarize_text(text):
    """Ask the Groq chat-completions API for a concise summary of *text*.

    Args:
        text: The document text (or chunk) to summarize.

    Returns:
        The content of the first returned choice. On failure no exception is
        propagated; instead a string beginning with ``"Error"`` is returned,
        either for an empty/invalid response or for any exception raised
        while making or reading the request.
    """
    try:
        prompt = f"Summarize the following legal document in a concise manner: {text}"
        user_message = {"role": "user", "content": prompt}
        response = client.chat.completions.create(
            messages=[user_message],
            model="llama-3.1-8b-instant",
            stream=False,
        )
        # Guard clause: bail out when there is nothing to read.
        if not (response and response.choices):
            return "Error: Received an empty or invalid response from Groq API."
        first_choice = response.choices[0]
        return first_choice.message.content
    except Exception as exc:
        return f"Error generating summary: {exc}"
def summarize_large_text(text, chunk_limit=5000):
    """Summarize *text* chunk by chunk and join the partial summaries.

    Args:
        text: The full document text.
        chunk_limit: Token limit passed to ``split_into_chunks`` for each
            chunk.

    Returns:
        str: The per-chunk results of ``summarize_text``, in chunk order,
        joined with single spaces.
    """
    pieces = split_into_chunks(text, token_limit=chunk_limit)
    partial_summaries = [summarize_text(piece) for piece in pieces]
    return " ".join(partial_summaries)
def detect_key_clauses(text):
    """Find well-known contract clause keywords in *text*.

    Matching is a case-insensitive substring search for each keyword. For each
    keyword found, the explanation quotes the text around its first
    occurrence: up to 50 characters before the match and 200 characters from
    the match start.

    Args:
        text: The document text to scan.

    Returns:
        list[dict]: One entry per detected keyword, in the fixed order of the
        keyword table, with keys ``"clause"`` (capitalized keyword),
        ``"summary"`` (generic description) and ``"explanation"`` (sentence
        embedding the surrounding context). Empty when nothing matches.
    """
    key_clauses = [
        {"clause": "confidentiality", "summary": "Confidentiality clauses ensure that sensitive information remains protected."},
        {"clause": "liability", "summary": "Liability clauses outline the responsibility for damages or losses incurred."},
        {"clause": "termination", "summary": "Termination clauses specify the conditions under which a contract may be ended."},
        {"clause": "force majeure", "summary": "Force majeure clauses excuse parties from performance obligations due to unforeseen events."},
        {"clause": "governing law", "summary": "Governing law clauses specify which jurisdiction's laws will govern the contract."},
        {"clause": "dispute resolution", "summary": "Dispute resolution clauses specify how conflicts between parties will be resolved."},
        {"clause": "amendment", "summary": "Amendment clauses outline the process for changing the terms of the contract."},
        {"clause": "warranty", "summary": "Warranty clauses provide assurances regarding the quality or condition of goods or services."},
    ]
    lowered_text = text.lower()  # lower-case once instead of once per keyword
    detected_clauses = []
    for clause in key_clauses:
        clause_start = lowered_text.find(clause["clause"].lower())
        if clause_start == -1:
            continue
        # Clamp at 0: a negative start index would count from the end of the
        # string and yield an empty or unrelated context for early matches.
        context = text[max(0, clause_start - 50): clause_start + 200]
        explanation = f"The document mentions '{clause['clause']}' clause. Context: {context.strip()}..."
        detected_clauses.append({
            "clause": clause["clause"].capitalize(),
            "summary": clause["summary"],
            "explanation": explanation,
        })
    return detected_clauses
def detect_hidden_obligations_or_dependencies(text, summary):
    """Find phrases that hint at conditional or implicit obligations.

    Each phrase is matched case-insensitively as a whole word/phrase, so that
    short phrases such as "if" do not fire inside longer words. The document
    text is searched first; the summary is only used when the phrase does not
    occur in the text.

    Args:
        text: The document text to scan.
        summary: The generated summary, scanned as a fallback.

    Returns:
        list[dict]: One entry per detected phrase, in the fixed order of the
        phrase table, with keys ``"phrase"``, ``"summary"`` (generic
        description) and ``"context"`` (up to 50 characters before and 200
        characters from the start of the first match, stripped, taken from
        whichever source contained the match). Empty when nothing matches.
    """
    import re  # local import: this module does not import re at file level

    hidden_obligations = [
        {"phrase": "dependent upon", "summary": "This suggests that some action is conditional upon another."},
        {"phrase": "if", "summary": "This indicates that certain conditions must be met to fulfill the obligation."},
        {"phrase": "may be required", "summary": "Implies that the party could be obligated to perform an action under specific conditions."},
        {"phrase": "should", "summary": "Implies a recommendation or requirement, though not explicitly mandatory."},
        {"phrase": "obligated to", "summary": "Indicates a clear, binding duty to perform an action."},
    ]
    hidden_dependencies = []
    for item in hidden_obligations:
        pattern = re.compile(r"\b" + re.escape(item["phrase"]) + r"\b", re.IGNORECASE)
        source = text
        match = pattern.search(text)
        if match is None:
            # Phrase is absent from the document; quote the summary instead of
            # slicing the document at a bogus "not found" position.
            source = summary
            match = pattern.search(summary)
        if match is None:
            continue
        phrase_start = match.start()
        # Clamp at 0 so matches near the beginning keep their context.
        context = source[max(0, phrase_start - 50): phrase_start + 200]
        hidden_dependencies.append({
            "phrase": item["phrase"],
            "summary": item["summary"],
            "context": context.strip(),
        })
    return hidden_dependencies
def detect_risks(text, summary):
    """Find risk-related keywords in the document text or its summary.

    Matching is a case-insensitive substring search. The document text is
    searched first; the summary is only used when the keyword does not occur
    in the text.

    Args:
        text: The document text to scan.
        summary: The generated summary, scanned as a fallback.

    Returns:
        list[dict]: One entry per detected keyword, in the fixed order of the
        keyword table, with keys ``"phrase"``, ``"summary"`` (generic
        description) and ``"context"`` (up to 50 characters before and 200
        characters from the start of the first match, stripped, taken from
        whichever source contained the match). Empty when nothing matches.
    """
    risk_phrases = [
        {"phrase": "penalty", "summary": "Penalty clauses may impose financial or legal consequences on the parties involved."},
        {"phrase": "liability", "summary": "Liability clauses may indicate potential financial responsibility or legal risks."},
        {"phrase": "default", "summary": "Default clauses can expose parties to consequences for failure to perform obligations."},
        {"phrase": "breach", "summary": "Breach of contract can lead to serious legal consequences including financial penalties."},
        {"phrase": "suspension", "summary": "Suspension clauses may indicate risks of halting services or operations in case of non-compliance."},
    ]
    # Lower-case both sources once instead of once per keyword.
    lowered_text = text.lower()
    lowered_summary = summary.lower()
    detected_risks = []
    for item in risk_phrases:
        needle = item["phrase"].lower()
        source = text
        phrase_start = lowered_text.find(needle)
        if phrase_start == -1:
            # Keyword is absent from the document; quote the summary instead
            # of slicing the document at the -1 "not found" position.
            source = summary
            phrase_start = lowered_summary.find(needle)
        if phrase_start == -1:
            continue
        # Clamp at 0 so matches near the beginning keep their context.
        context = source[max(0, phrase_start - 50): phrase_start + 200]
        detected_risks.append({
            "phrase": item["phrase"],
            "summary": item["summary"],
            "context": context.strip(),
        })
    return detected_risks
def plot_risk_pie_chart(detected_clauses, hidden_obligations, detected_risks):
    """Render a pie chart of the three finding counts as a base64 PNG.

    Args:
        detected_clauses: Sequence of detected key clauses.
        hidden_obligations: Sequence of detected hidden obligations.
        detected_risks: Sequence of detected risks.
        Only the length of each sequence is used.

    Returns:
        str: The PNG image bytes, base64-encoded and decoded as UTF-8 text.
        When all three sequences are empty, the image shows a short
        placeholder message instead of a pie.
    """
    # Calculate counts for each category
    labels = ['Detected Key Clauses', 'Hidden Obligations or Dependencies', 'Detected Risks']
    sizes = [len(detected_clauses), len(hidden_obligations), len(detected_risks)]
    colors = ['#ff9999', '#66b3ff', '#99ff99']

    fig, ax = plt.subplots()
    try:
        if any(sizes):
            ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, wedgeprops={'edgecolor': 'black'})
            ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
        else:
            # All-zero sizes cannot be normalized into wedge fractions, so
            # draw a placeholder rather than an invalid pie.
            ax.text(0.5, 0.5, "No findings to display", ha="center", va="center", transform=ax.transAxes)
            ax.axis("off")
        # Save this specific figure (not pyplot's "current" one) into memory.
        buf = io.BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # Figures stay registered with pyplot until closed; release this one
        # so repeated calls do not accumulate open figures.
        plt.close(fig)
    # Encode the image to base64
    return base64.b64encode(buf.getvalue()).decode('utf-8')
def generate_analysis_document(document_text, summary, detected_clauses, hidden_obligations, detected_risks):
    """Build a Word document containing the full analysis.

    The document has a level-1 title followed by level-2 sections for the
    extracted text, the summary, key clauses, hidden obligations and risks.
    Each finding is written as three paragraphs; a section with no findings
    gets a single "none detected" paragraph.

    Args:
        document_text: The extracted document text.
        summary: The generated summary.
        detected_clauses: Dicts with keys ``clause``, ``summary``,
            ``explanation``.
        hidden_obligations: Dicts with keys ``phrase``, ``summary``,
            ``context``.
        detected_risks: Dicts with keys ``phrase``, ``summary``, ``context``.

    Returns:
        The populated ``Document`` object (not yet saved).
    """
    report = Document()

    def add_findings(heading, findings, fields, empty_message):
        # One level-2 section: three labelled paragraphs per finding, or the
        # fallback message when there are none.
        report.add_heading(heading, level=2)
        if not findings:
            report.add_paragraph(empty_message)
            return
        for finding in findings:
            for label, key in fields:
                report.add_paragraph(f"{label}: {finding[key]}")

    report.add_heading('Legal Document Analysis', level=1)

    report.add_heading('Extracted Document Text', level=2)
    report.add_paragraph(document_text)

    report.add_heading('Summary', level=2)
    report.add_paragraph(summary)

    add_findings(
        'Key Clauses',
        detected_clauses,
        (("Clause", "clause"), ("Summary", "summary"), ("Explanation", "explanation")),
        "No key clauses detected.",
    )
    add_findings(
        'Hidden Obligations or Dependencies',
        hidden_obligations,
        (("Phrase", "phrase"), ("Summary", "summary"), ("Context", "context")),
        "No hidden obligations detected.",
    )
    add_findings(
        'Risks',
        detected_risks,
        (("Risk Phrase", "phrase"), ("Summary", "summary"), ("Context", "context")),
        "No risks detected.",
    )
    return report
def display_legal_analysis_page():
    """Render the Streamlit page for uploading and analysing a legal document.

    After a PDF or DOCX file is uploaded, the page shows five tabs: the
    extracted text, a generated summary, detected key clauses (with a bar
    chart and per-clause explanation buttons), hidden obligations, and a risk
    overview with a pie chart and a download button for a DOCX report.
    Nothing beyond the title and the uploader is rendered until a file has
    been uploaded.
    """
    st.title("Legal Document Analysis with Groq API")
    uploaded_file = st.file_uploader("Upload your legal document (PDF or DOCX)", type=["pdf", "docx"])
    if not uploaded_file:
        return

    # Compare the extension case-insensitively so "CONTRACT.PDF" is accepted.
    file_name = uploaded_file.name.lower()
    if file_name.endswith(".pdf"):
        document_text = preprocess_text(extract_text_from_pdf(uploaded_file))
    elif file_name.endswith(".docx"):
        document_text = preprocess_text(extract_text_from_docx(uploaded_file))
    else:
        st.error("Unsupported file type!")
        return

    tabs = st.tabs(["Document Text", "Summary", "Key Clauses", "Hidden Obligations or Dependencies", "Risk Analysis"])

    with tabs[0]:
        st.subheader("Extracted Legal Document Text")
        st.text_area("Document Text", document_text, height=300)

    with tabs[1]:
        st.subheader("Quick Summary")
        summary = summarize_large_text(document_text)
        # Look for the exact failure messages produced by summarize_text; a
        # bare "Error" test would also reject genuine summaries that merely
        # contain that word.
        error_markers = (
            "Error generating summary:",
            "Error: Received an empty or invalid response from Groq API.",
        )
        if any(marker in summary for marker in error_markers):
            st.warning("Summary generation failed.")
            summary = "Summary not available."
        st.write(summary)

    with tabs[2]:
        st.subheader("Detected Key Clauses")
        detected_clauses = detect_key_clauses(document_text)
        if not detected_clauses:
            st.write("No key clauses detected.")
        else:
            # Count occurrences of each detected clause
            clause_counts = {}
            for clause in detected_clauses:
                clause_counts[clause['clause']] = clause_counts.get(clause['clause'], 0) + 1
            # Create a bar chart for detected clauses
            labels = list(clause_counts.keys())
            values = list(clause_counts.values())
            fig, ax = plt.subplots()
            ax.bar(labels, values, color='skyblue')
            # Rotate x-axis labels for better visibility
            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
            # Add titles and labels
            ax.set_title("Detected Key Clauses Visualization")
            ax.set_xlabel("Clause")
            ax.set_ylabel("Count")
            # Display the plot, then release the figure so reruns of this
            # page do not accumulate open figures.
            st.pyplot(fig)
            plt.close(fig)
            # Display details of each clause
            for clause in detected_clauses:
                if st.button(f"Show Explanation for {clause['clause']} Clause"):
                    st.write(f"**Clause: {clause['clause']}**")
                    st.write(f"Summary: {clause['summary']}\nExplanation: {clause['explanation']}")

    with tabs[3]:
        st.subheader("Detected Hidden Obligations or Dependencies")
        hidden_obligations = detect_hidden_obligations_or_dependencies(document_text, summary)
        if not hidden_obligations:
            st.write("No hidden obligations or dependencies detected.")
        else:
            for item in hidden_obligations:
                st.write(f"**Phrase: {item['phrase']}**")
                st.write(f"Summary: {item['summary']}\nContext: {item['context']}")

    with tabs[4]:
        st.subheader("Risk Analysis & Visualization")
        # detected_clauses and hidden_obligations were computed above from the
        # same inputs, so they are reused here instead of being recomputed.
        detected_risks = detect_risks(document_text, summary)
        # Generate and display the pie chart
        img_str = plot_risk_pie_chart(detected_clauses, hidden_obligations, detected_risks)
        st.image(f"data:image/png;base64,{img_str}", use_column_width=True)
        # Display the detected risks after the visualization
        st.write("### Detected Risks:")
        for risk in detected_risks:
            st.write(f"**{risk['phrase']}**: {risk['summary']}")
        # Optionally, show other categories (Key Clauses, Hidden Obligations) after risks
        st.write("### Detected Key Clauses:")
        for clause in detected_clauses:
            st.write(f"**{clause['clause']}**: {clause['explanation']}")
        st.write("### Hidden Obligations or Dependencies:")
        for obligation in hidden_obligations:
            st.write(f"**{obligation['phrase']}**: {obligation['summary']}")
        # Generate the full analysis document for download
        analysis_doc = generate_analysis_document(document_text, summary, detected_clauses, hidden_obligations, detected_risks)
        with st.expander("Download Analysis"):
            # Serialize to memory rather than to a fixed file name in the
            # working directory, which every session would overwrite.
            report_buffer = io.BytesIO()
            analysis_doc.save(report_buffer)
            st.download_button("Download Analysis", data=report_buffer.getvalue(), file_name="analysis_report.docx", mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document")
# Script entry point: render the analysis page when this file is executed
# directly rather than imported.
if __name__ == "__main__":
    display_legal_analysis_page()