Update legal_document_analysis.py
Browse files- legal_document_analysis.py +121 -56
legal_document_analysis.py
CHANGED
@@ -6,7 +6,7 @@ from langchain_groq import ChatGroq
|
|
6 |
from docx import Document
|
7 |
import matplotlib.pyplot as plt
|
8 |
import io
|
9 |
-
import
|
10 |
from email.mime.multipart import MIMEMultipart
|
11 |
from email.mime.text import MIMEText
|
12 |
from email.mime.application import MIMEApplication
|
@@ -15,6 +15,8 @@ from fpdf import FPDF
|
|
15 |
import getpass
|
16 |
import pandas as pd
|
17 |
import seaborn as sns
|
|
|
|
|
18 |
|
19 |
# Load environment variables from .env file
|
20 |
load_dotenv()
|
@@ -238,11 +240,14 @@ def plot_risk_assessment_matrix(detected_risks):
|
|
238 |
for i in range(len(detected_risks)):
|
239 |
ax.annotate(detected_risks[i]['phrase'], (likelihood[i], impact[i]))
|
240 |
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
|
|
|
|
|
|
246 |
|
247 |
# Function to plot risk level distribution pie chart
|
248 |
def plot_risk_level_distribution(detected_risks):
|
@@ -255,11 +260,14 @@ def plot_risk_level_distribution(detected_risks):
|
|
255 |
|
256 |
plt.title("Risk Level Distribution", fontsize=10)
|
257 |
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
|
|
|
|
|
|
263 |
|
264 |
# Function to plot risks by type bar chart
|
265 |
def plot_risks_by_type(detected_risks):
|
@@ -272,11 +280,14 @@ def plot_risks_by_type(detected_risks):
|
|
272 |
ax.set_title("Risks by Type", fontsize=10)
|
273 |
ax.set_ylabel("Count")
|
274 |
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
|
|
|
|
|
|
280 |
|
281 |
# Function to plot stacked bar chart of risks by level
|
282 |
def plot_stacked_bar_chart(detected_risks):
|
@@ -291,11 +302,14 @@ def plot_stacked_bar_chart(detected_risks):
|
|
291 |
ax.set_title("Stacked Bar Chart of Risks by Level", fontsize=10)
|
292 |
ax.set_ylabel("Count")
|
293 |
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
|
|
|
|
|
|
299 |
|
300 |
# Function to plot risk heatmap
|
301 |
def plot_risk_heatmap(detected_risks):
|
@@ -312,14 +326,21 @@ def plot_risk_heatmap(detected_risks):
|
|
312 |
sns.heatmap(heatmap_data.pivot_table(index='Risk Level', values='Count'), annot=True, cmap='YlGnBu', ax=ax)
|
313 |
ax.set_title("Risk Heatmap")
|
314 |
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
320 |
|
321 |
# Function to generate PDF document with improved aesthetics
|
322 |
-
def generate_pdf_analysis(document_text, summary, detected_clauses, hidden_obligations, detected_risks,
|
323 |
pdf = FPDF()
|
324 |
pdf.add_page()
|
325 |
|
@@ -352,15 +373,15 @@ def generate_pdf_analysis(document_text, summary, detected_clauses, hidden_oblig
|
|
352 |
pdf.ln(10)
|
353 |
|
354 |
# Add visualizations for risks
|
355 |
-
pdf.image(
|
356 |
-
pdf.image(
|
357 |
pdf.ln(60)
|
358 |
|
359 |
-
pdf.image(
|
360 |
-
pdf.image(
|
361 |
pdf.ln(60)
|
362 |
|
363 |
-
pdf.image(
|
364 |
pdf.ln(10)
|
365 |
|
366 |
# Footer
|
@@ -368,11 +389,7 @@ def generate_pdf_analysis(document_text, summary, detected_clauses, hidden_oblig
|
|
368 |
pdf.set_font("Arial", 'I', 8)
|
369 |
pdf.cell(0, 10, f'Page {pdf.page_no()}', 0, 0, 'C')
|
370 |
|
371 |
-
|
372 |
-
pdf_file_path = tempfile.mktemp(suffix=".pdf")
|
373 |
-
pdf.output(pdf_file_path, 'F')
|
374 |
-
|
375 |
-
return pdf_file_path # Return the path to the saved PDF
|
376 |
|
377 |
# Function to handle chatbot interaction
|
378 |
def chatbot_query(user_input):
|
@@ -435,8 +452,11 @@ def send_pdf_via_email(pdf_buffer, recipient_email):
|
|
435 |
msg.attach(MIMEText("Please find the attached analysis of your legal document.", 'plain'))
|
436 |
|
437 |
# Attach the PDF
|
438 |
-
pdf_attachment = io.BytesIO(
|
|
|
|
|
439 |
pdf_attachment.seek(0)
|
|
|
440 |
part = MIMEApplication(pdf_attachment.read(), Name='legal_document_analysis.pdf')
|
441 |
part['Content-Disposition'] = 'attachment; filename="legal_document_analysis.pdf"'
|
442 |
msg.attach(part)
|
@@ -494,7 +514,7 @@ def display_legal_analysis_page():
|
|
494 |
st.error("Unsupported file type!")
|
495 |
return
|
496 |
|
497 |
-
tabs = st.tabs(["π Document Text", "π Summary", "π Key Clauses", "π Hidden Obligations", "β Risk Analysis", "π‘ Suggestions & Chatbot", "π Update Tracker"])
|
498 |
|
499 |
with tabs[0]:
|
500 |
st.subheader("Document Text")
|
@@ -513,7 +533,6 @@ def display_legal_analysis_page():
|
|
513 |
with st.expander(clause['clause'], expanded=False):
|
514 |
st.write(f"*Summary:* {clause['summary']}")
|
515 |
st.write(f"*Context:* {clause['explanation']}")
|
516 |
-
|
517 |
else:
|
518 |
st.write("No key clauses detected.")
|
519 |
|
@@ -544,18 +563,18 @@ def display_legal_analysis_page():
|
|
544 |
st.write("No risks detected.")
|
545 |
|
546 |
# Generate all visualizations
|
547 |
-
|
548 |
-
|
549 |
-
|
550 |
-
|
551 |
-
|
552 |
|
553 |
# Display the charts
|
554 |
-
st.image(
|
555 |
-
st.image(
|
556 |
-
st.image(
|
557 |
-
st.image(
|
558 |
-
st.image(
|
559 |
|
560 |
with tabs[5]:
|
561 |
st.subheader("Suggestions for Improvement")
|
@@ -575,13 +594,9 @@ def display_legal_analysis_page():
|
|
575 |
|
576 |
# Download PDF Analysis Button
|
577 |
st.subheader("Download Analysis as PDF")
|
578 |
-
pdf_file_path = generate_pdf_analysis(document_text, summary, detected_clauses, hidden_obligations, detected_risks, risk_assessment_matrix_path, risk_level_distribution_path, risks_by_type_path, stacked_bar_chart_path, risk_heatmap_path)
|
579 |
-
|
580 |
-
# Read PDF into BytesIO for download
|
581 |
pdf_buffer = io.BytesIO()
|
582 |
-
|
583 |
-
|
584 |
-
|
585 |
pdf_buffer.seek(0)
|
586 |
|
587 |
# Add download button for PDF
|
@@ -617,9 +632,59 @@ def display_legal_analysis_page():
|
|
617 |
with st.expander(update['update'], expanded=False):
|
618 |
suggestion = get_update_suggestion(update['update'])
|
619 |
st.write(f"*Suggestion:* {suggestion}")
|
|
|
|
|
|
|
620 |
else:
|
621 |
st.write("No updates detected.")
|
622 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
623 |
# Run the application
|
624 |
if __name__ == "__main__":
|
625 |
display_legal_analysis_page()
|
|
|
6 |
from docx import Document
|
7 |
import matplotlib.pyplot as plt
|
8 |
import io
|
9 |
+
import base64
|
10 |
from email.mime.multipart import MIMEMultipart
|
11 |
from email.mime.text import MIMEText
|
12 |
from email.mime.application import MIMEApplication
|
|
|
15 |
import getpass
|
16 |
import pandas as pd
|
17 |
import seaborn as sns
|
18 |
+
import requests
|
19 |
+
from bs4 import BeautifulSoup
|
20 |
|
21 |
# Load environment variables from .env file
|
22 |
load_dotenv()
|
|
|
240 |
for i in range(len(detected_risks)):
|
241 |
ax.annotate(detected_risks[i]['phrase'], (likelihood[i], impact[i]))
|
242 |
|
243 |
+
buf = io.BytesIO()
|
244 |
+
plt.savefig(buf, format="png", bbox_inches='tight')
|
245 |
+
buf.seek(0)
|
246 |
+
|
247 |
+
img_str = base64.b64encode(buf.read()).decode('utf-8')
|
248 |
+
buf.close()
|
249 |
+
|
250 |
+
return img_str
|
251 |
|
252 |
# Function to plot risk level distribution pie chart
|
253 |
def plot_risk_level_distribution(detected_risks):
|
|
|
260 |
|
261 |
plt.title("Risk Level Distribution", fontsize=10)
|
262 |
|
263 |
+
buf = io.BytesIO()
|
264 |
+
plt.savefig(buf, format="png", bbox_inches='tight')
|
265 |
+
buf.seek(0)
|
266 |
+
|
267 |
+
img_str = base64.b64encode(buf.read()).decode('utf-8')
|
268 |
+
buf.close()
|
269 |
+
|
270 |
+
return img_str
|
271 |
|
272 |
# Function to plot risks by type bar chart
|
273 |
def plot_risks_by_type(detected_risks):
|
|
|
280 |
ax.set_title("Risks by Type", fontsize=10)
|
281 |
ax.set_ylabel("Count")
|
282 |
|
283 |
+
buf = io.BytesIO()
|
284 |
+
plt.savefig(buf, format="png", bbox_inches='tight')
|
285 |
+
buf.seek(0)
|
286 |
+
|
287 |
+
img_str = base64.b64encode(buf.read()).decode('utf-8')
|
288 |
+
buf.close()
|
289 |
+
|
290 |
+
return img_str
|
291 |
|
292 |
# Function to plot stacked bar chart of risks by level
|
293 |
def plot_stacked_bar_chart(detected_risks):
|
|
|
302 |
ax.set_title("Stacked Bar Chart of Risks by Level", fontsize=10)
|
303 |
ax.set_ylabel("Count")
|
304 |
|
305 |
+
buf = io.BytesIO()
|
306 |
+
plt.savefig(buf, format="png", bbox_inches='tight')
|
307 |
+
buf.seek(0)
|
308 |
+
|
309 |
+
img_str = base64.b64encode(buf.read()).decode('utf-8')
|
310 |
+
buf.close()
|
311 |
+
|
312 |
+
return img_str
|
313 |
|
314 |
# Function to plot risk heatmap
|
315 |
def plot_risk_heatmap(detected_risks):
|
|
|
326 |
sns.heatmap(heatmap_data.pivot_table(index='Risk Level', values='Count'), annot=True, cmap='YlGnBu', ax=ax)
|
327 |
ax.set_title("Risk Heatmap")
|
328 |
|
329 |
+
buf = io.BytesIO()
|
330 |
+
plt.savefig(buf, format="png", bbox_inches='tight')
|
331 |
+
buf.seek(0)
|
332 |
+
|
333 |
+
img_str = base64.b64encode(buf.read()).decode('utf-8')
|
334 |
+
buf.close()
|
335 |
+
|
336 |
+
return img_str
|
337 |
+
|
338 |
+
# Function to convert base64 to image
|
339 |
+
def base64_to_image(data):
    """Decode a base64-encoded image into an in-memory binary stream.

    Args:
        data: Base64 payload as ``str`` or bytes-like. A ``str`` may
            optionally carry a data-URI header such as
            ``"data:image/png;base64,"``; the header is dropped before
            decoding so both the bare payload and the data-URI form work.

    Returns:
        io.BytesIO: Stream positioned at offset 0 holding the decoded bytes.
    """
    # The comma never appears in the base64 alphabet, so cutting at the first
    # comma of a "data:" string cannot damage a bare base64 payload.
    if isinstance(data, str) and data.startswith("data:"):
        data = data.partition(",")[2]
    return io.BytesIO(base64.b64decode(data))
|
341 |
|
342 |
# Function to generate PDF document with improved aesthetics
|
343 |
+
def generate_pdf_analysis(document_text, summary, detected_clauses, hidden_obligations, detected_risks, risk_assessment_matrix, risk_level_distribution, risks_by_type, stacked_bar_chart, risk_heatmap):
|
344 |
pdf = FPDF()
|
345 |
pdf.add_page()
|
346 |
|
|
|
373 |
pdf.ln(10)
|
374 |
|
375 |
# Add visualizations for risks
|
376 |
+
pdf.image(base64_to_image(risk_assessment_matrix), x=10, y=pdf.get_y(), w=90)
|
377 |
+
pdf.image(base64_to_image(risk_level_distribution), x=110, y=pdf.get_y()-50, w=90) # Position next to the first image
|
378 |
pdf.ln(60)
|
379 |
|
380 |
+
pdf.image(base64_to_image(risks_by_type), x=10, y=pdf.get_y(), w=90)
|
381 |
+
pdf.image(base64_to_image(stacked_bar_chart), x=110, y=pdf.get_y()-50, w=90) # Position next to the previous image
|
382 |
pdf.ln(60)
|
383 |
|
384 |
+
pdf.image(base64_to_image(risk_heatmap), x=10, y=pdf.get_y(), w=190) # Fit image to width
|
385 |
pdf.ln(10)
|
386 |
|
387 |
# Footer
|
|
|
389 |
pdf.set_font("Arial", 'I', 8)
|
390 |
pdf.cell(0, 10, f'Page {pdf.page_no()}', 0, 0, 'C')
|
391 |
|
392 |
+
return pdf
|
|
|
|
|
|
|
|
|
393 |
|
394 |
# Function to handle chatbot interaction
|
395 |
def chatbot_query(user_input):
|
|
|
452 |
msg.attach(MIMEText("Please find the attached analysis of your legal document.", 'plain'))
|
453 |
|
454 |
# Attach the PDF
|
455 |
+
pdf_attachment = io.BytesIO()
|
456 |
+
pdf_buffer.seek(0)
|
457 |
+
pdf_attachment.write(pdf_buffer.read())
|
458 |
pdf_attachment.seek(0)
|
459 |
+
|
460 |
part = MIMEApplication(pdf_attachment.read(), Name='legal_document_analysis.pdf')
|
461 |
part['Content-Disposition'] = 'attachment; filename="legal_document_analysis.pdf"'
|
462 |
msg.attach(part)
|
|
|
514 |
st.error("Unsupported file type!")
|
515 |
return
|
516 |
|
517 |
+
tabs = st.tabs(["π Document Text", "π Summary", "π Key Clauses", "π Hidden Obligations", "β Risk Analysis", "π‘ Suggestions & Chatbot", "π Update Tracker", "π GDPR Updates"])
|
518 |
|
519 |
with tabs[0]:
|
520 |
st.subheader("Document Text")
|
|
|
533 |
with st.expander(clause['clause'], expanded=False):
|
534 |
st.write(f"*Summary:* {clause['summary']}")
|
535 |
st.write(f"*Context:* {clause['explanation']}")
|
|
|
536 |
else:
|
537 |
st.write("No key clauses detected.")
|
538 |
|
|
|
563 |
st.write("No risks detected.")
|
564 |
|
565 |
# Generate all visualizations
|
566 |
+
risk_assessment_matrix = plot_risk_assessment_matrix(detected_risks)
|
567 |
+
risk_level_distribution = plot_risk_level_distribution(detected_risks)
|
568 |
+
risks_by_type = plot_risks_by_type(detected_risks)
|
569 |
+
stacked_bar_chart = plot_stacked_bar_chart(detected_risks)
|
570 |
+
risk_heatmap = plot_risk_heatmap(detected_risks)
|
571 |
|
572 |
# Display the charts
|
573 |
+
st.image(f"data:image/png;base64,{risk_assessment_matrix}", caption="Risk Assessment Matrix")
|
574 |
+
st.image(f"data:image/png;base64,{risk_level_distribution}", caption="Risk Level Distribution")
|
575 |
+
st.image(f"data:image/png;base64,{risks_by_type}", caption="Risks by Type")
|
576 |
+
st.image(f"data:image/png;base64,{stacked_bar_chart}", caption="Stacked Bar Chart of Risks by Level")
|
577 |
+
st.image(f"data:image/png;base64,{risk_heatmap}", caption="Risk Heatmap")
|
578 |
|
579 |
with tabs[5]:
|
580 |
st.subheader("Suggestions for Improvement")
|
|
|
594 |
|
595 |
# Download PDF Analysis Button
|
596 |
st.subheader("Download Analysis as PDF")
|
|
|
|
|
|
|
597 |
pdf_buffer = io.BytesIO()
|
598 |
+
pdf = generate_pdf_analysis(document_text, summary, detected_clauses, hidden_obligations, detected_risks, risk_assessment_matrix, risk_level_distribution, risks_by_type, stacked_bar_chart, risk_heatmap)
|
599 |
+
pdf.output(pdf_buffer, 'F')
|
|
|
600 |
pdf_buffer.seek(0)
|
601 |
|
602 |
# Add download button for PDF
|
|
|
632 |
with st.expander(update['update'], expanded=False):
|
633 |
suggestion = get_update_suggestion(update['update'])
|
634 |
st.write(f"*Suggestion:* {suggestion}")
|
635 |
+
# Additional functionality
|
636 |
+
if st.button(f"Mark '{update['update']}' as addressed"):
|
637 |
+
st.success(f"'{update['update']}' has been marked as addressed.")
|
638 |
else:
|
639 |
st.write("No updates detected.")
|
640 |
|
641 |
+
with tabs[7]: # GDPR Updates Tab
|
642 |
+
st.subheader("GDPR Website Updates")
|
643 |
+
if st.button("Fetch Live Recitals"):
|
644 |
+
with st.spinner("Fetching updates..."):
|
645 |
+
recitals = fetch_gdpr_recitals()
|
646 |
+
if recitals:
|
647 |
+
for number, details in recitals.items():
|
648 |
+
st.markdown(f"*Recital {number}: {details['title']}*")
|
649 |
+
st.write(details['content'])
|
650 |
+
else:
|
651 |
+
st.write("No recitals found.")
|
652 |
+
|
653 |
+
# Function to fetch live recitals from the GDPR website
|
654 |
+
def fetch_gdpr_recitals(limit=3, timeout=10):
    """Fetch GDPR recitals from the gdpr-info.eu recitals index.

    The index page is scanned for ``div.artikel`` entries; for each of the
    first ``limit`` entries the linked recital page is downloaded and the text
    of its ``div.entry-content`` element is extracted.

    Args:
        limit: Maximum number of index entries to process. Defaults to 3;
            ``None`` processes every entry found.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled server cannot hang the app.

    Returns:
        dict: Maps the recital number (text of ``span.nummer`` with
        surrounding parentheses stripped) to a dict with keys ``'title'`` and
        ``'content'``. Empty when the index page cannot be fetched.
    """
    # Local import keeps this block self-contained; urljoin leaves absolute
    # links untouched and resolves relative ones against the index URL.
    from urllib.parse import urljoin

    url = "https://gdpr-info.eu/recitals/"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        st.error("Failed to fetch data from the GDPR website.")
        return {}

    # Check if the request was successful
    if response.status_code != 200:
        st.error("Failed to fetch data from the GDPR website.")
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')

    recitals = {}
    # Locate all recital links
    articles = soup.find_all('div', class_='artikel')

    # Extract each recital's link and title
    for article in articles[:limit]:
        anchor = article.find('a')
        number_tag = article.find('span', class_='nummer')
        title_tag = article.find('span', class_='titel')
        # Skip entries that lack the expected markup instead of crashing on
        # a None lookup.
        if (anchor is None or not anchor.get('href')
                or number_tag is None or title_tag is None):
            continue

        link = urljoin(url, anchor['href'])
        number = number_tag.text.strip('()')
        title = title_tag.text.strip()

        # Fetch the content of each recital
        try:
            rec_response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            print(f"Failed to fetch recital {number} from {link}")
            continue

        if rec_response.status_code != 200:
            print(f"Failed to fetch recital {number} from {link}")
            continue

        rec_soup = BeautifulSoup(rec_response.content, 'html.parser')
        entry = rec_soup.find('div', class_='entry-content')
        if entry is None:
            # Page loaded but has no recognizable body; treat as a failed fetch.
            print(f"Failed to fetch recital {number} from {link}")
            continue

        recitals[number] = {
            'title': title,
            'content': entry.get_text(strip=True),
        }

    return recitals
|
687 |
+
|
688 |
# Run the application
|
689 |
if __name__ == "__main__":
|
690 |
display_legal_analysis_page()
|