# legal_document_summarization_final / legal_document_analysis.py
# Uploaded by sohampawar1030
# Commit message: Update legal_document_analysis.py
# Commit 4cae9bd (verified)
import base64
import getpass
import io
import logging
import os
import re
import smtplib
import tempfile
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import matplotlib.pyplot as plt
import pandas as pd
import PyPDF2
import requests
import seaborn as sns
import streamlit as st
from bs4 import BeautifulSoup
from docx import Document
from dotenv import load_dotenv
from fpdf import FPDF
from langchain_groq import ChatGroq
# Load environment variables from .env file
load_dotenv()

# Resolve the Groq API key once: take it from the environment, and prompt on
# the terminal (then export it) when it is missing or empty.
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    _groq_api_key = getpass.getpass("Enter API key for Groq: ")
    os.environ["GROQ_API_KEY"] = _groq_api_key

# Chat model shared by every LLM-backed helper in this module.
model = ChatGroq(model="llama-3.1-8b-instant", api_key=_groq_api_key)
# Custom CSS for improved aesthetics
# Stylesheet injected into the page; kept in a named constant so the
# st.markdown call below stays readable.
_CUSTOM_CSS = """
<style>
.main {
background-color: #f0f2f5;
}
.sidebar .sidebar-content {
background-color: #ffffff;
}
h1 {
color: #2C3E50;
}
h2 {
color: #2980B9;
}
.stButton button {
background-color: #2980B9;
color: white;
border: None;
border-radius: 5px;
padding: 10px;
}
</style>
"""

# unsafe_allow_html is required for the raw <style> element to be emitted.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Function to read PDF content
def read_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts (the page passes the
            object returned by the Streamlit file uploader).

    Returns:
        str: The page texts concatenated in page order, with nothing
        inserted between pages.
    """
    reader = PyPDF2.PdfReader(file)
    # Defensive ``or ""``: a page that yields no text (None or empty) simply
    # contributes nothing instead of breaking the concatenation. Joining once
    # also avoids rebuilding the string for every page.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Function to extract text from DOCX files
def extract_text_from_docx(file):
    """Return the text of a DOCX document, one paragraph per line.

    Only ``Document.paragraphs`` is read; each paragraph's ``text`` is joined
    with a newline.

    Args:
        file: Whatever ``docx.Document`` accepts.

    Returns:
        str: Paragraph texts separated by ``"\\n"``.
    """
    paragraphs = Document(file).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
# Function to preprocess text
def preprocess_text(text):
    """Collapse every run of whitespace, line breaks included, to one space.

    Args:
        text: Raw extracted text.

    Returns:
        str: The words of ``text`` separated by single spaces, with no
        leading or trailing whitespace.
    """
    # str.split() with no separator already treats "\n" and "\r" as
    # whitespace and discards empty fields.
    words = text.split()
    return " ".join(words)
# Function to chunk large text into smaller parts
def chunk_text(text, max_tokens=2000):
    """Split text into chunks of whole sentences that fit a word budget.

    Sentences are delimited by ``". "`` and sizes are counted in
    whitespace-separated words, so ``max_tokens`` is a word count used as a
    rough stand-in for model tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk. A single sentence
            longer than this still becomes one (oversized) chunk; sentences
            are never cut.

    Returns:
        list[str]: Non-empty chunks in document order. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in text.split(". "):
        sentence_length = len(sentence.split())
        # Only flush when something has been collected; otherwise an
        # oversized first sentence would emit an empty leading chunk.
        if current_chunk and current_length + sentence_length > max_tokens:
            chunks.append(". ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length
    if current_chunk:
        chunks.append(". ".join(current_chunk))
    # Drop blank chunks so callers never send empty content downstream.
    return [chunk for chunk in chunks if chunk.strip()]
# Function to generate summary for each chunk
def generate_summary(text):
    """Ask the shared chat model to summarize ``text``.

    Args:
        text: Content to summarize.

    Returns:
        The stripped model output; ``"No summary available."`` when the model
        returns nothing; or ``None`` after the failure has been reported with
        ``st.error``.
    """
    prompt = f"Please summarize the following content:\n\n{text}"
    try:
        response = model.invoke(prompt)
        # Prefer the message's ``content`` attribute; otherwise fall back to
        # the response's string form.
        summary = response.content if hasattr(response, 'content') else str(response)
        if not summary:
            return "No summary available."
        return summary.strip()
    except Exception as e:
        st.error(f"Error generating summary: {str(e)}")
        return None
# Function to summarize large texts
def summarize_large_text(text, chunk_limit=5000):
    """Summarize a long text chunk by chunk and join the partial summaries.

    Args:
        text: The full text to summarize.
        chunk_limit: Forwarded to ``chunk_text`` as ``max_tokens``.

    Returns:
        str: The non-empty chunk summaries joined by single spaces; chunks
        whose summary came back falsy (e.g. after an error) are skipped.
    """
    pieces = chunk_text(text, max_tokens=chunk_limit)
    partial_summaries = (generate_summary(piece) for piece in pieces)
    return " ".join(part for part in partial_summaries if part)
# Function to detect key clauses
def detect_key_clauses(text):
    """Find well-known contract clause keywords in ``text``.

    Matching is a case-insensitive substring search; only the first
    occurrence of each keyword is reported.

    Args:
        text: The document text.

    Returns:
        list[dict]: One entry per keyword found, in the fixed order of the
        keyword table, with keys ``"clause"`` (capitalized keyword),
        ``"summary"`` (canned description) and ``"explanation"`` (a sentence
        quoting up to 50 characters before and 200 characters from the start
        of the match).
    """
    key_clauses = [
        {"clause": "confidentiality", "summary": "Confidentiality clauses ensure that sensitive information remains protected."},
        {"clause": "liability", "summary": "Liability clauses outline the responsibility for damages or losses incurred."},
        {"clause": "termination", "summary": "Termination clauses specify the conditions under which a contract may be ended."},
        {"clause": "force majeure", "summary": "Force majeure clauses excuse parties from performance obligations due to unforeseen events."},
        {"clause": "governing law", "summary": "Governing law clauses specify which jurisdiction's laws will govern the contract."},
        {"clause": "dispute resolution", "summary": "Dispute resolution clauses specify how conflicts between parties will be resolved."},
        {"clause": "amendment", "summary": "Amendment clauses outline the process for changing the terms of the contract."},
        {"clause": "warranty", "summary": "Warranty clauses provide assurances regarding the quality or condition of goods or services."},
    ]
    # Lower-case the document once instead of once (or twice) per keyword.
    lowered_text = text.lower()
    detected_clauses = []
    for clause in key_clauses:
        clause_start = lowered_text.find(clause["clause"].lower())
        if clause_start == -1:
            continue
        # Clamp at 0: a negative start index would count from the end of the
        # string and return an empty or unrelated slice for early matches.
        context = text[max(0, clause_start - 50): clause_start + 200]
        explanation = f"The document mentions '{clause['clause']}' clause. Context: {context.strip()}..."
        detected_clauses.append({
            "clause": clause["clause"].capitalize(),
            "summary": clause["summary"],
            "explanation": explanation
        })
    return detected_clauses
# Function to detect hidden obligations or dependencies
def detect_hidden_obligations_or_dependencies(text, summary):
    """Find phrases that hint at conditional or implied obligations.

    Each phrase is matched case-insensitively as a whole word or phrase, so
    "if" no longer fires on words such as "notify" or "specific". The
    document text is searched first and the summary second; the context is
    quoted from whichever one contains the match.

    Args:
        text: The document text.
        summary: The generated summary of the document (may be empty).

    Returns:
        list[dict]: One entry per phrase found, in the fixed order of the
        phrase table, with keys ``"phrase"``, ``"summary"`` and ``"context"``
        (up to 50 characters before and 200 characters from the start of the
        first match, stripped).
    """
    hidden_obligations = [
        {"phrase": "dependent upon", "summary": "This suggests that some action is conditional upon another."},
        {"phrase": "if", "summary": "This indicates that certain conditions must be met to fulfill the obligation."},
        {"phrase": "may be required", "summary": "Implies that the party could be obligated to perform an action under specific conditions."},
        {"phrase": "should", "summary": "Implies a recommendation or requirement, though not explicitly mandatory."},
        {"phrase": "obligated to", "summary": "Indicates a clear, binding duty to perform an action."},
    ]
    sources = (text or "", summary or "")
    hidden_dependencies = []
    for item in hidden_obligations:
        pattern = re.compile(r"\b" + re.escape(item["phrase"]) + r"\b", re.IGNORECASE)
        for source in sources:
            match = pattern.search(source)
            if match is None:
                continue
            # Slice the source that actually matched, and clamp at 0: a
            # summary-only hit or an early hit used to produce a negative
            # index and therefore an empty or unrelated context.
            context = source[max(0, match.start() - 50): match.start() + 200]
            hidden_dependencies.append({
                "phrase": item["phrase"],
                "summary": item["summary"],
                "context": context.strip()
            })
            break
    return hidden_dependencies
# Function to answer questions about the document
def answer_question(question, document_text):
    """Ask the shared chat model a question about the document.

    Args:
        question: The user's question.
        document_text: Full document text, embedded verbatim in the prompt.

    Returns:
        The stripped model output; ``"No answer available."`` when the model
        returns nothing; or ``None`` after the failure has been reported with
        ``st.error``.
    """
    prompt = f"The following is a legal document:\n\n{document_text}\n\nBased on this document, answer the following question: {question}"
    try:
        response = model.invoke(prompt)
        # Prefer the message's ``content`` attribute; otherwise fall back to
        # the response's string form.
        answer = response.content if hasattr(response, 'content') else str(response)
        if not answer:
            return "No answer available."
        return answer.strip()
    except Exception as e:
        st.error(f"Error answering question: {str(e)}")
        return None
# Function to detect risks in the text
def detect_risks(text, summary):
    """Find risk-related phrases in the document or its summary.

    Matching is a case-insensitive substring search (so "breach" also
    matches "breaches"). The document text is searched first and the summary
    second; the context is quoted from whichever one contains the phrase.

    Args:
        text: The document text.
        summary: The generated summary of the document (may be empty).

    Returns:
        list[dict]: One entry per phrase found, in the fixed order of the
        phrase table, with keys ``"phrase"``, ``"summary"``, ``"context"``
        (up to 50 characters before and 200 characters from the start of the
        first match, stripped) and ``"risk_level"`` ("High", "Medium" or
        "Low").
    """
    risk_phrases = [
        {"phrase": "penalty", "summary": "This indicates financial or legal consequences.", "risk_level": "High"},
        {"phrase": "liability", "summary": "This suggests potential financial responsibility.", "risk_level": "Medium"},
        {"phrase": "default", "summary": "This can lead to serious legal consequences.", "risk_level": "High"},
        {"phrase": "breach", "summary": "This may expose the party to significant penalties.", "risk_level": "High"},
        {"phrase": "suspension", "summary": "This indicates risks of halting services.", "risk_level": "Medium"},
        {"phrase": "should", "summary": "This implies a recommendation, which may not be mandatory.", "risk_level": "Low"},
        {"phrase": "may be required", "summary": "This suggests that obligations could exist under certain conditions.", "risk_level": "Low"},
        {"phrase": "indemnify", "summary": "This entails a duty to compensate for harm or loss, indicating potential financial risk.", "risk_level": "High"},
        {"phrase": "termination for cause", "summary": "This indicates a risk of ending the contract due to specific failures.", "risk_level": "High"},
        {"phrase": "compliance", "summary": "Non-compliance with regulations can lead to legal penalties.", "risk_level": "High"},
    ]
    text = text or ""
    summary = summary or ""
    # Lower-case each source once; pair it with the original so the quoted
    # context keeps its casing.
    sources = ((text, text.lower()), (summary, summary.lower()))
    detected_risks = []
    for item in risk_phrases:
        needle = item["phrase"].lower()
        for original, lowered in sources:
            phrase_start = lowered.find(needle)
            if phrase_start == -1:
                continue
            # Slice the source that actually matched, and clamp at 0: a
            # summary-only hit (index -1 in the text) or an early hit used to
            # yield an empty or unrelated context.
            context = original[max(0, phrase_start - 50): phrase_start + 200]
            detected_risks.append({
                "phrase": item["phrase"],
                "summary": item["summary"],
                "context": context.strip(),
                "risk_level": item["risk_level"]
            })
            break
    return detected_risks
# Function to calculate overall risk score
def calculate_overall_risk_score(detected_risks):
    """Add up a numeric weight for every detected risk.

    Args:
        detected_risks: Iterable of dicts carrying a ``"risk_level"`` key.

    Returns:
        int: Sum of 3 per "High", 2 per "Medium" and 1 per "Low" entry; any
        other level contributes 0.
    """
    level_weights = {"High": 3, "Medium": 2, "Low": 1}
    total_score = 0
    for risk in detected_risks:
        total_score += level_weights.get(risk['risk_level'], 0)
    return total_score
# Function to plot risk assessment matrix
def plot_risk_assessment_matrix(detected_risks):
    """Render a likelihood/impact scatter plot of the detected risks.

    Both axes use the same 1-3 position derived from the risk level, so every
    point lies on the diagonal; each point is labelled with its phrase.

    Args:
        detected_risks: List of dicts with ``"risk_level"`` and ``"phrase"``.
            Entries whose level is not High/Medium/Low are left off the plot.

    Returns:
        str: The chart as a base64-encoded PNG.
    """
    level_positions = {'High': 3, 'Medium': 2, 'Low': 1}
    # Keep each position paired with its own phrase so labels cannot drift
    # out of step with the points when an entry is skipped.
    points = [
        (level_positions[risk['risk_level']], risk['phrase'])
        for risk in detected_risks
        if risk['risk_level'] in level_positions
    ]
    positions = [position for position, _ in points]

    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        ax.scatter(positions, positions, alpha=0.6)
        ax.set_xticks([1, 2, 3])
        ax.set_yticks([1, 2, 3])
        ax.set_xticklabels(['Low', 'Medium', 'High'])
        ax.set_yticklabels(['Low', 'Medium', 'High'])
        ax.set_xlabel('Likelihood')
        ax.set_ylabel('Impact')
        ax.set_title('Risk Assessment Matrix')
        for position, phrase in points:
            ax.annotate(phrase, (position, position))
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode('utf-8')
# Function to plot risk level distribution pie chart
def plot_risk_level_distribution(detected_risks):
    """Render a pie chart of how many risks fall into each risk level.

    Args:
        detected_risks: List of dicts with a ``"risk_level"`` key. An empty
            list produces a chart with a title and no wedges.

    Returns:
        str: The chart as a base64-encoded PNG.
    """
    # Count in first-seen order so wedge order (and therefore colours) is
    # stable between runs rather than following set iteration order.
    level_counts = {}
    for risk in detected_risks:
        level = risk['risk_level']
        level_counts[level] = level_counts.get(level, 0) + 1

    fig, ax = plt.subplots(figsize=(4, 3))
    try:
        if level_counts:
            ax.pie(list(level_counts.values()), labels=list(level_counts.keys()), autopct='%1.1f%%', startangle=90)
        ax.axis('equal')
        ax.set_title("Risk Level Distribution", fontsize=10)
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode('utf-8')
# Function to plot risks by type bar chart
def plot_risks_by_type(detected_risks):
    """Render a bar chart counting the detected risks per phrase.

    Args:
        detected_risks: List of dicts with a ``"phrase"`` key.

    Returns:
        str: The chart as a base64-encoded PNG.
    """
    # Count in first-seen order so the bars appear in detection order instead
    # of following set iteration order.
    phrase_counts = {}
    for risk in detected_risks:
        phrase = risk['phrase']
        phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1

    fig, ax = plt.subplots(figsize=(4, 3))
    try:
        ax.bar(list(phrase_counts.keys()), list(phrase_counts.values()), color='lightcoral')
        # ``ax`` is the current axes here, so this rotates its tick labels.
        plt.xticks(rotation=45, ha='right')
        ax.set_title("Risks by Type", fontsize=10)
        ax.set_ylabel("Count")
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode('utf-8')
# Function to plot stacked bar chart of risks by level
def plot_stacked_bar_chart(detected_risks):
    """Render a bar chart with one bar per risk level (High, Medium, Low).

    Args:
        detected_risks: List of dicts with a ``"risk_level"`` key. Entries
            with any other level are not counted.

    Returns:
        str: The chart as a base64-encoded PNG.
    """
    risk_levels = ['High', 'Medium', 'Low']
    level_counts = {level: 0 for level in risk_levels}
    for risk in detected_risks:
        level = risk['risk_level']
        # Ignore unexpected levels instead of failing the whole chart.
        if level in level_counts:
            level_counts[level] += 1

    fig, ax = plt.subplots(figsize=(4, 3))
    try:
        ax.bar(list(level_counts.keys()), list(level_counts.values()), color=['#ff9999', '#66b3ff', '#99ff99'])
        ax.set_title("Stacked Bar Chart of Risks by Level", fontsize=10)
        ax.set_ylabel("Count")
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode('utf-8')
# Function to plot risk heatmap
def plot_risk_heatmap(detected_risks):
    """Render a one-column heatmap of the number of risks per risk level.

    Args:
        detected_risks: List of dicts with a ``"risk_level"`` key. When the
            list is empty a placeholder message is drawn instead of a
            heatmap.

    Returns:
        str: The chart as a base64-encoded PNG.
    """
    fig, ax = plt.subplots(figsize=(4, 3))
    try:
        if detected_risks:
            df = pd.DataFrame({
                'Risk Level': [risk['risk_level'] for risk in detected_risks],
                'Count': [1] * len(detected_risks),
            })
            # One row per level (index) with its number of occurrences.
            heatmap_data = df.groupby('Risk Level').count()
            sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu', ax=ax)
        else:
            # There is nothing to colour-scale without data, so show a
            # message rather than handing an empty table to the heatmap.
            ax.text(0.5, 0.5, "No risks detected", ha='center', va='center', transform=ax.transAxes)
            ax.set_axis_off()
        ax.set_title("Risk Heatmap")
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
    return base64.b64encode(buf.getvalue()).decode('utf-8')
# Function to convert base64 to image
def base64_to_image(data):
    """Decode base64 data into an in-memory binary stream.

    Args:
        data: Base64-encoded content (anything ``base64.b64decode`` accepts).

    Returns:
        io.BytesIO: Stream positioned at the start of the decoded bytes.
    """
    raw_bytes = base64.b64decode(data)
    return io.BytesIO(raw_bytes)
# Function to generate PDF document with improved aesthetics
def generate_pdf_analysis(document_text, summary, detected_clauses, hidden_obligations, detected_risks, risk_assessment_matrix, risk_level_distribution, risks_by_type, stacked_bar_chart, risk_heatmap):
    """Build the PDF report (summary, risk list and charts) in memory.

    Args:
        document_text: Accepted for interface compatibility; not written to
            the report.
        summary: Text for the "Executive Summary" section.
        detected_clauses: Accepted for interface compatibility; not written
            to the report.
        hidden_obligations: Accepted for interface compatibility; not written
            to the report.
        detected_risks: List of dicts with ``"phrase"``, ``"summary"`` and
            ``"risk_level"``; one wrapped line is written per risk.
        risk_assessment_matrix: Base64-encoded PNG.
        risk_level_distribution: Base64-encoded PNG.
        risks_by_type: Base64-encoded PNG.
        stacked_bar_chart: Base64-encoded PNG.
        risk_heatmap: Base64-encoded PNG.

    Returns:
        io.BytesIO: The finished PDF, positioned at the start.
    """
    pdf = FPDF()
    pdf.add_page()
    # Set page borders
    pdf.set_draw_color(0, 0, 0)
    pdf.rect(5, 5, 200, 287)
    # NOTE(review): this needs an "arial.ttf" file resolvable from the
    # working directory, and only the regular style is registered although
    # 'B' is requested below - confirm how the installed fpdf resolves that.
    pdf.add_font("Arial", "", "arial.ttf", uni=True)
    pdf.set_font("Arial", size=12)
    # Title
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(0, 10, 'Legal Document Analysis Report', ln=True, align='C')
    pdf.ln(10)
    # Executive Summary
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(0, 10, 'Executive Summary', ln=True)
    pdf.set_font("Arial", '', 12)
    pdf.multi_cell(0, 10, summary)
    pdf.ln(10)
    # Risks Section
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(0, 10, 'Risk Analysis', ln=True)
    pdf.set_font("Arial", '', 12)
    for risk in detected_risks:
        # multi_cell wraps long entries that a single-line cell would push
        # past the right margin; reset x first so each entry starts at the
        # left margin regardless of where the previous one left the cursor.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, f"{risk['phrase']}: {risk['summary']} (Risk Level: {risk['risk_level']})")
    pdf.ln(10)

    image_filenames = [
        "risk_assessment_matrix.png",
        "risk_level_distribution.png",
        "risks_by_type.png",
        "stacked_bar_chart.png",
        "risk_heatmap.png"
    ]
    images = [risk_assessment_matrix, risk_level_distribution, risks_by_type, stacked_bar_chart, risk_heatmap]
    # A private temporary directory keeps concurrent sessions from
    # overwriting each other's files and removes every intermediate file
    # (PNGs and the PDF) when the block exits.
    with tempfile.TemporaryDirectory() as workdir:
        for img_str, filename in zip(images, image_filenames):
            image_path = os.path.join(workdir, filename)
            with open(image_path, "wb") as img_file:
                img_file.write(base64.b64decode(img_str))
            # No explicit ``y``: the image is placed at the current position
            # and the cursor moves below it, so successive charts do not
            # overlap.
            pdf.image(image_path, x=10, w=90)
            pdf.ln(10)
        temp_pdf_path = os.path.join(workdir, "legal_document_analysis.pdf")
        pdf.output(temp_pdf_path, 'F')
        # Load the PDF into memory before the directory is removed.
        with open(temp_pdf_path, "rb") as f:
            pdf_buffer = io.BytesIO(f.read())
    return pdf_buffer
# Function to handle chatbot interaction
def chatbot_query(user_input):
    """Send a free-form message to the shared chat model.

    Uses ``model.invoke`` and the response's ``content`` attribute, the same
    way the other model-backed helpers in this file do.

    Args:
        user_input: The user's message.

    Returns:
        str: The model's reply, or a string starting with ``"Error:"`` when
        the call fails or the response has no usable text.
    """
    try:
        response = model.invoke(user_input)
    except Exception as e:
        return f"Error: {str(e)}"
    # Still honour a dict-shaped response carrying a 'text' entry.
    if isinstance(response, dict) and 'text' in response:
        return response['text']
    content = getattr(response, 'content', None)
    if isinstance(content, str):
        return content
    return "Error: Unexpected response format."
# Function to generate suggestions for improvement
def generate_suggestions(text):
    """Suggest wording improvements based on trigger words in ``text``.

    Trigger words are matched case-insensitively as whole words, so "not"
    does not fire on "notify" and "if" does not fire on "specific".

    Args:
        text: The document text.

    Returns:
        list[str]: Zero to four suggestions, always in the order
        shall / may / if-without-then / not.
    """
    # One pass over the text: the set of distinct lower-cased words.
    words = set(re.findall(r"[a-z]+", text.lower()))
    suggestions = []
    if "shall" in words:
        suggestions.append("Consider replacing 'shall' with 'must' for clarity.")
    if "may" in words:
        suggestions.append("Clarify the conditions under which actions 'may' be taken.")
    if "if" in words and "then" not in words:
        suggestions.append("Ensure conditional statements are clear and complete.")
    if "not" in words:
        suggestions.append("Review negative clauses to ensure they are not overly restrictive.")
    return suggestions
# Function to send feedback via email
def send_feedback(feedback_content):
    """Email user feedback to the configured feedback address.

    Reads ``SENDER_EMAIL``, ``FEEDBACK_EMAIL`` and ``EMAIL_PASS`` from the
    environment and sends through ``smtp.gmail.com:587`` with STARTTLS.

    Args:
        feedback_content: Plain-text body of the message.

    Returns:
        bool: True when the message was handed to the SMTP server; False
        when configuration is missing or sending failed (the reason is
        logged).
    """
    logger = logging.getLogger(__name__)
    sender_email = os.getenv("SENDER_EMAIL")
    receiver_email = os.getenv("FEEDBACK_EMAIL")
    password = os.getenv("EMAIL_PASS")
    # Fail fast with a clear reason instead of attempting a login that
    # cannot succeed.
    if not (sender_email and receiver_email and password):
        logger.error("Feedback email not sent: SENDER_EMAIL, FEEDBACK_EMAIL and EMAIL_PASS must all be set.")
        return False

    msg = MIMEMultipart()
    msg['From'] = sender_email
    msg['To'] = receiver_email
    msg['Subject'] = "User Feedback on Legal Document Analysis"
    msg.attach(MIMEText(feedback_content, 'plain'))
    try:
        # The timeout keeps an unreachable server from blocking the caller
        # indefinitely.
        with smtplib.SMTP('smtp.gmail.com', 587, timeout=30) as server:
            server.starttls()
            server.login(sender_email, password)
            server.send_message(msg)
    except Exception:
        # Best effort by design: report failure to the caller, but leave a
        # trace of the cause instead of discarding it.
        logger.exception("Failed to send feedback email")
        return False
    return True
# Function to send PDF via email
def send_pdf_via_email(pdf_buffer, recipient_email):
    """Email the analysis PDF as an attachment.

    Reads ``SENDER_EMAIL`` and ``EMAIL_PASS`` from the environment and sends
    through ``smtp.gmail.com:587`` with STARTTLS.

    Args:
        pdf_buffer: Seekable binary stream holding the PDF; it is rewound
            and read in full.
        recipient_email: Destination address.

    Returns:
        bool: True when the message was handed to the SMTP server; False
        when configuration or the recipient is missing, or sending failed
        (the reason is logged).
    """
    logger = logging.getLogger(__name__)
    sender_email = os.getenv("SENDER_EMAIL")
    password = os.getenv("EMAIL_PASS")
    # Fail fast with a clear reason instead of attempting a login that
    # cannot succeed.
    if not (sender_email and password and recipient_email):
        logger.error("PDF email not sent: SENDER_EMAIL and EMAIL_PASS must be set and a recipient given.")
        return False

    msg = MIMEMultipart()
    msg['From'] = sender_email
    msg['To'] = recipient_email
    msg['Subject'] = "Legal Document Analysis PDF"
    msg.attach(MIMEText("Please find the attached analysis of your legal document.", 'plain'))
    # Attach the PDF straight from the caller's buffer; no intermediate copy
    # is needed.
    pdf_buffer.seek(0)
    part = MIMEApplication(pdf_buffer.read(), Name='legal_document_analysis.pdf')
    part['Content-Disposition'] = 'attachment; filename="legal_document_analysis.pdf"'
    msg.attach(part)
    try:
        # The timeout keeps an unreachable server from blocking the caller
        # indefinitely.
        with smtplib.SMTP('smtp.gmail.com', 587, timeout=30) as server:
            server.starttls()
            server.login(sender_email, password)
            server.send_message(msg)
    except Exception:
        # Best effort by design: report failure to the caller, but leave a
        # trace of the cause instead of discarding it.
        logger.exception("Failed to send analysis PDF email")
        return False
    return True
# Function to simulate tracking updates in the document
def track_updates(document_text):
    """Return a fixed, simulated list of document updates.

    Args:
        document_text: Accepted but not inspected; the result is the same
            for every input.

    Returns:
        list[dict]: Three entries, each with an ``"update"`` description and
        a ``"suggestion"``.
    """
    simulated_updates = (
        ("Updated confidentiality clause.", "Consider specifying the duration of confidentiality."),
        ("Revised liability limits.", "Ensure the limits are realistic and compliant with regulations."),
        ("Clarified termination conditions.", "Check if all potential termination scenarios are covered."),
    )
    return [
        {"update": description, "suggestion": advice}
        for description, advice in simulated_updates
    ]
# Function to get suggestion from Groq API based on the update
def get_update_suggestion(update):
    """Ask the shared chat model how a tracked clause update could be improved.

    The prompt is sent to the model directly. Routing it through
    ``generate_summary`` would wrap it in a "please summarize" instruction,
    so the model would be asked to summarize the request instead of
    answering it.

    Args:
        update: Description of the clause update.

    Returns:
        str: The stripped model output, or ``"No suggestion available."``
        when the call fails (the error is shown with ``st.error``) or the
        model returns nothing.
    """
    prompt = f"Suggest improvements or updates for this legal clause: {update}"
    try:
        response = model.invoke(prompt)
    except Exception as e:
        st.error(f"Error generating suggestion: {str(e)}")
        return "No suggestion available."
    # Prefer the message's ``content`` attribute; otherwise fall back to the
    # response's string form.
    suggestion = response.content if hasattr(response, 'content') else str(response)
    suggestion = suggestion.strip() if suggestion else ""
    return suggestion if suggestion else "No suggestion available."
# Function to display feedback form
def display_feedback_form():
    """Render the feedback form and email the answers when it is submitted.

    Shows a free-text field and two radio questions; pressing
    "Submit Feedback" sends them with ``send_feedback`` and reports success
    or failure on the page.
    """
    st.subheader("Feedback Form")
    feedback = st.text_area("Please provide your feedback or suggestions:")
    rating = st.radio("How would you rate the analysis?", ("Excellent", "Good", "Fair", "Poor"))
    recommendation = st.radio("Would you recommend this tool to others?", ("Yes", "No"))

    # Nothing more to do until the user presses the button.
    if not st.button("Submit Feedback"):
        return

    feedback_content = f"Feedback: {feedback}\nRating: {rating}\nRecommendation: {recommendation}"
    was_sent = send_feedback(feedback_content)
    if was_sent:
        st.success("Thank you for your feedback! It has been sent.")
    else:
        st.error("Failed to send feedback. Please try again later.")
# Main function to display the legal analysis page
def display_legal_analysis_page():
    """Render the whole page: upload, analysis tabs, PDF export and feedback.

    After a PDF or DOCX file is uploaded, its text is extracted and shown in
    seven tabs: raw text, summary, key clauses, hidden obligations, risk
    analysis with charts, suggestions/chatbot (which also hosts the PDF
    download, the email option and the feedback form) and simulated document
    updates. Nothing is rendered below the uploader until a file is
    provided.
    """
    st.title("📜 Advanced AI-Driven Legal Document Summarization and Risk Assessment")
    uploaded_file = st.file_uploader("Upload your legal document (PDF or DOCX)", type=["pdf", "docx"])
    if not uploaded_file:
        return

    # Compare the extension case-insensitively so a name such as
    # "CONTRACT.PDF" is not reported as unsupported.
    file_name = uploaded_file.name.lower()
    if file_name.endswith(".pdf"):
        document_text = preprocess_text(read_pdf(uploaded_file))
    elif file_name.endswith(".docx"):
        document_text = preprocess_text(extract_text_from_docx(uploaded_file))
    else:
        st.error("Unsupported file type!")
        return

    tabs = st.tabs(["📄 Document Text", "🔍 Summary", "🔑 Key Clauses", "🔒 Hidden Obligations", "⚠ Risk Analysis", "💡 Suggestions & Chatbot", "🔄 document update"])

    with tabs[0]:
        st.subheader("Document Text")
        st.write(document_text)

    with tabs[1]:
        st.subheader("Summary")
        summary = summarize_large_text(document_text)
        st.write(summary)

    with tabs[2]:
        st.subheader("Key Clauses Identified")
        detected_clauses = detect_key_clauses(document_text)
        if detected_clauses:
            for clause in detected_clauses:
                with st.expander(clause['clause'], expanded=False):
                    st.write(f"*Summary:* {clause['summary']}")
                    st.write(f"*Context:* {clause['explanation']}")
        else:
            st.write("No key clauses detected.")

    with tabs[3]:
        st.subheader("Hidden Obligations and Dependencies")
        hidden_obligations = detect_hidden_obligations_or_dependencies(document_text, summary)
        if hidden_obligations:
            for obligation in hidden_obligations:
                st.write(f"{obligation['phrase']}: {obligation['summary']}")
                st.write(obligation['context'])
        else:
            st.write("No hidden obligations detected.")

    with tabs[4]:
        st.subheader("Risk Analysis")
        detected_risks = detect_risks(document_text, summary)
        overall_risk_score = calculate_overall_risk_score(detected_risks)
        st.write(f"*Overall Risk Score:* {overall_risk_score}")
        if detected_risks:
            for risk in detected_risks:
                with st.expander(risk['phrase'], expanded=False):
                    st.write(f"*Summary:* {risk['summary']} (Risk Level: {risk['risk_level']})")
                    # Show only the first sentence of the captured context.
                    short_context = risk['context'].strip().split('. ')[0] + '.'
                    st.write(f"*Context:* {short_context}")
        else:
            st.write("No risks detected.")

        # Generate all visualizations (also embedded in the PDF report below).
        risk_assessment_matrix = plot_risk_assessment_matrix(detected_risks)
        risk_level_distribution = plot_risk_level_distribution(detected_risks)
        risks_by_type = plot_risks_by_type(detected_risks)
        stacked_bar_chart = plot_stacked_bar_chart(detected_risks)
        risk_heatmap = plot_risk_heatmap(detected_risks)

        # Display the charts
        st.image(f"data:image/png;base64,{risk_assessment_matrix}", caption="Risk Assessment Matrix")
        st.image(f"data:image/png;base64,{risk_level_distribution}", caption="Risk Level Distribution")
        st.image(f"data:image/png;base64,{risks_by_type}", caption="Risks by Type")
        st.image(f"data:image/png;base64,{stacked_bar_chart}", caption="Stacked Bar Chart of Risks by Level")
        st.image(f"data:image/png;base64,{risk_heatmap}", caption="Risk Heatmap")

    with tabs[5]:
        st.subheader("Suggestions for Improvement")
        suggestions = generate_suggestions(document_text)
        for suggestion in suggestions:
            st.write(f"- {suggestion}")

        # Chatbot
        st.subheader("🤖 Chatbot")
        question = st.text_input("Ask a question about the document:")
        if question:
            with st.spinner("Getting answer..."):
                answer = answer_question(question, document_text)
            if answer:
                st.write(f"Answer: {answer}")
            else:
                st.write("Sorry, I couldn't find an answer to that question.")

        # NOTE(review): the original nesting of the sections below was lost;
        # they are assumed to belong to this tab - confirm.
        st.subheader("Download Analysis as PDF")
        pdf_buffer = generate_pdf_analysis(document_text, summary, detected_clauses, hidden_obligations, detected_risks, risk_assessment_matrix, risk_level_distribution, risks_by_type, stacked_bar_chart, risk_heatmap)
        pdf_buffer.seek(0)

        # Add download button for PDF
        st.download_button(
            label="Download PDF Analysis",
            data=pdf_buffer,
            file_name="legal_document_analysis.pdf",
            mime="application/pdf"
        )

        # Input for recipient email
        recipient_email = st.text_input("Enter your email address to receive the PDF:")

        # Button to send PDF via email
        if st.button("Send PDF Analysis"):
            if recipient_email:
                if send_pdf_via_email(pdf_buffer, recipient_email):
                    st.success("PDF has been sent successfully!")
                else:
                    st.error("Failed to send PDF. Please try again.")
            else:
                st.warning("Please enter a valid email address.")

        # Feedback Form Section
        display_feedback_form()

    with tabs[6]:  # Update Tracker Tab
        st.subheader("Document Updates")
        updates = track_updates(document_text)
        # A button is only True on the run triggered by its own click, so
        # the choice is remembered in session state. Otherwise clicking a
        # nested "Mark ... as addressed" button would rerun the script with
        # "Show Updates" False and its success message would never appear.
        if st.button("Show Updates"):
            st.session_state["show_updates"] = True
        if st.session_state.get("show_updates"):
            if updates:
                # Keep suggestions across reruns so each update is sent to
                # the model only once per session.
                suggestion_cache = st.session_state.setdefault("update_suggestions", {})
                for update in updates:
                    with st.expander(update['update'], expanded=False):
                        if update['update'] not in suggestion_cache:
                            suggestion_cache[update['update']] = get_update_suggestion(update['update'])
                        suggestion = suggestion_cache[update['update']]
                        st.write(f"*Suggestion:* {suggestion}")
                        # Additional functionality
                        if st.button(f"Mark '{update['update']}' as addressed"):
                            st.success(f"'{update['update']}' has been marked as addressed.")
            else:
                st.write("No updates detected.")
# Run the application
if __name__ == "__main__":
    # Build the page only when this file is executed as the main script.
    display_legal_analysis_page()