Upload 13 files
Browse files

- .env +2 -0
- Update_tracking.py +169 -0
- __pycache__/Update_tracking.cpython-311.pyc +0 -0
- __pycache__/advanced_legal_document_analysis.cpython-311.pyc +0 -0
- __pycache__/legal_document_analysis.cpython-311.pyc +0 -0
- __pycache__/rag_pipeline.cpython-311.pyc +0 -0
- __pycache__/tab.cpython-311.pyc +0 -0
- __pycache__/website_tracking.cpython-311.pyc +0 -0
- app.py +17 -0
- credentials.json +13 -0
- legal_document_analysis.py +328 -0
- rag_pipeline.py +76 -0
- requirements.txt +17 -0
.env
ADDED
@@ -0,0 +1,2 @@
EMAIL_PASSWORD=mwvm tluh heuy fowf
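
Update_tracking.py below reads EMAIL_ADDRESS and EMAIL_PASSWORD via python-dotenv, and rag_pipeline.py additionally expects HUGGINGFACEHUB_API_TOKEN. A minimal sketch of a complete .env, with hypothetical placeholder values (only the EMAIL_PASSWORD line above is from the commit):

EMAIL_ADDRESS=you@example.com
EMAIL_PASSWORD=your-gmail-app-password
HUGGINGFACEHUB_API_TOKEN=hf_your_token_here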

Update_tracking.py
ADDED
@@ -0,0 +1,169 @@
import requests
from bs4 import BeautifulSoup
import streamlit as st
import threading
from datetime import datetime, timedelta
import smtplib
from email.message import EmailMessage
import os
from dotenv import load_dotenv
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build

# Load environment variables from .env file
load_dotenv()

# Google Sheets Configuration
SHEET_ID = '1bZjlA-UJrBhWS2jHlEQ-7nbmDvxpEoKylgxHW51Hhzc'  # Google Sheets ID
RANGE = 'Sheet1!A:D'  # The range where rows are appended

# Predefined list of URLs to track
TRACKING_URLS = [
    "https://gdpr-info.eu/recitals/no-1/"
]

# Event to signal thread termination
stop_event = threading.Event()

# Authenticate Google Sheets API
def authenticate_google_sheets():
    creds = Credentials.from_service_account_file(
        'credentials.json',  # must match the committed filename exactly (case-sensitive on Linux)
        scopes=['https://www.googleapis.com/auth/spreadsheets']
    )
    service = build('sheets', 'v4', credentials=creds)
    return service

# Append data to Google Sheets
def append_to_google_sheets(service, url, title, content, timestamp):
    values = [
        [url, title, content[:200], timestamp]  # Row: URL, title, 200-char content preview, timestamp
    ]
    body = {'values': values}
    try:
        service.spreadsheets().values().append(
            spreadsheetId=SHEET_ID,
            range=RANGE,
            valueInputOption="RAW",
            body=body
        ).execute()
        st.write(f"Data appended to Google Sheets at {timestamp}.")
    except Exception as e:
        st.error(f"Error appending to Google Sheets: {e}")

# Send email notification
def send_email_notification(to_email, url, title, content, timestamp):
    sender_email = os.getenv("EMAIL_ADDRESS")
    sender_password = os.getenv("EMAIL_PASSWORD")
    smtp_server = "smtp.gmail.com"
    smtp_port = 587

    if not sender_email or not sender_password:
        st.error("Environment variables not loaded. Check your .env file.")
        return

    msg = EmailMessage()
    msg["Subject"] = f"Website Update Notification for {url}"
    msg["From"] = sender_email
    msg["To"] = to_email
    msg.set_content(f"""
    Website: {url}
    Title: {title}
    Content (preview): {content[:200]}...
    Tracked at: {timestamp}
    """)

    try:
        with smtplib.SMTP(smtp_server, smtp_port) as server:
            server.starttls()
            server.login(sender_email, sender_password)
            server.send_message(msg)
        st.success(f"Notification email sent to {to_email}")
    except smtplib.SMTPException as e:
        st.error(f"SMTP Error: {e}")

# Fetch website data
def fetch_website_data(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Guard both a missing <title> tag and an empty one
        title = soup.title.string.strip() if soup.title and soup.title.string else 'No title available'
        paragraphs = soup.find_all('p')
        content = ' '.join([p.text.strip() for p in paragraphs]) if paragraphs else 'New Notification available'
        return title, content
    except requests.exceptions.RequestException as e:
        st.error(f"Error fetching website data: {e}")
        return "Error occurred", "New notification detected. No content available due to an error."

# Track websites and store updates in Google Sheets.
# Note: this runs in a background thread; Streamlit calls made here
# (st.write/st.error) may not render without the script run context.
def track_websites(urls, recipient_email, interval=60, max_duration=20*60):
    st.write(f"Started tracking for {recipient_email}")
    service = authenticate_google_sheets()
    last_updates = {}  # Last (title, content) seen per URL, used to detect changes

    start_time = datetime.now()  # Record the start time
    end_time = start_time + timedelta(seconds=max_duration)  # Stop after max_duration (20 minutes by default)

    while not stop_event.is_set() and datetime.now() < end_time:
        for url in urls:
            title, content = fetch_website_data(url)
            if title and content:
                timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

                # Check for updates
                if url not in last_updates or last_updates[url] != (title, content):
                    last_updates[url] = (title, content)

                    # Append to Google Sheets
                    append_to_google_sheets(service, url, title, content, timestamp)

                    # Send notification email
                    try:
                        send_email_notification(recipient_email, url, title, content, timestamp)
                    except Exception as e:
                        st.error(f"Error sending email notification: {e}")

        # Wait for the next interval or until stop_event is set
        stop_event.wait(interval)

    st.write("Tracking stopped (time limit reached or stop requested).")

# Display tracking status
def display_tracking_status():
    st.title("Update Tracking System with Notifications")

    email_input = st.text_input("Enter your email for notifications:")

    # Maintain thread state
    if "tracking_thread" not in st.session_state:
        st.session_state["tracking_thread"] = None

    if email_input:
        # Start tracking
        if st.button("Start Tracking"):
            if st.session_state["tracking_thread"] is None or not st.session_state["tracking_thread"].is_alive():
                stop_event.clear()  # Clear the stop flag to allow tracking
                thread = threading.Thread(target=track_websites, args=(TRACKING_URLS, email_input), daemon=True)
                thread.start()
                st.session_state["tracking_thread"] = thread
                st.success(f"Notifications will be sent to {email_input}.")
            else:
                st.warning("Tracking is already running.")

        # Stop tracking
        if st.button("Stop Tracking"):
            if st.session_state["tracking_thread"] is not None and st.session_state["tracking_thread"].is_alive():
                stop_event.set()  # Signal the thread to stop
                st.session_state["tracking_thread"].join()  # Wait for the thread to finish
                st.session_state["tracking_thread"] = None
                st.success("Tracking stopped.")
            else:
                st.warning("No active tracking to stop.")

# Main function
def main():
    display_tracking_status()

if __name__ == "__main__":
    main()
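
The helpers above can be exercised without the Streamlit UI, which is useful for checking the Sheets setup. A minimal sketch, assuming credentials.json is in place and the target sheet has been shared with the service account (outside `streamlit run`, the st.write/st.error calls inside these helpers degrade to console warnings):

from datetime import datetime
from Update_tracking import authenticate_google_sheets, fetch_website_data, append_to_google_sheets

url = "https://gdpr-info.eu/recitals/no-1/"
service = authenticate_google_sheets()
title, content = fetch_website_data(url)
append_to_google_sheets(service, url, title, content,
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'))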

__pycache__/Update_tracking.cpython-311.pyc
ADDED
Binary file (9.65 kB)

__pycache__/advanced_legal_document_analysis.cpython-311.pyc
ADDED
Binary file (8.59 kB)

__pycache__/legal_document_analysis.cpython-311.pyc
ADDED
Binary file (20.1 kB)

__pycache__/rag_pipeline.cpython-311.pyc
ADDED
Binary file (3.64 kB)

__pycache__/tab.cpython-311.pyc
ADDED
Binary file (8.38 kB)

__pycache__/website_tracking.cpython-311.pyc
ADDED
Binary file (10.3 kB)

app.py
ADDED
@@ -0,0 +1,17 @@
import streamlit as st
import Update_tracking
import legal_document_analysis
from rag_pipeline import extract_text_from_pdf, create_vector_store, create_qa_pipeline

# Streamlit App Navigation
def main():
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Choose a page", ["Update Tracking", "Legal Document Analysis"])

    if page == "Update Tracking":
        Update_tracking.display_tracking_status()
    elif page == "Legal Document Analysis":
        legal_document_analysis.display_legal_analysis_page()

if __name__ == "__main__":
    main()
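
With the modules above in place, the app is launched from the repository root with:

streamlit run app.py

Streamlit then serves the sidebar navigation defined in main() and renders whichever page is selected.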

credentials.json
ADDED
@@ -0,0 +1,13 @@
{
  "type": "service_account",
  "project_id": "graphite-setup-448304-u1",
  "private_key_id": "c745314b1cd8b8a135a10c4819dabdb0d3d9a552",
  "private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDKKXcJFGxBxdan\nBPI+M1js/YcBI3646efu1U88oWG7CrZDYEs6ZMTx46mdsgaiJ3LCmt6NRI691rdt\nOl2rh3KZ/rMLQXyyQvMfePwk4t1f/MFbHK75pD+nUWsUlbXawnGfZXRJxV9gC0F8\nindrqRFXqfrJaCjsexW5WdKEt5FAb4OSRSEAEzO6AC7VOAnME+ctOvzI8pg8Cuaw\nkJf8/0fTN/1Fgmygmj3om16J3gYVmxvaTYjZnrM6hQhvHFvTCkoLYk4DSXhIBxce\nzD/qLgoqwFGjlhrveb8FXErzTXlulAed/R49HJnbSKsL+Nq2guthoYV/j3Bg+TZ4\ngwPdlWQxAgMBAAECggEAC/jb6eud1CDUWxCDzItzqhhDCx4YInkGsVoMTtvDSwWB\nwPh2SAtg7u7DtBkvg6GYrh3oDV9ooSUyuGsfEjoLn0NKROHAUdm9ZmkerIwLXmR8\namoG0jWoB99+WwoZKo7+ldkXI4nG4cwU2RxloVhyNJn2RkBvAP8yjIcdXJr25QUC\nqA7v+grCR2CcxBkbRXoh/BQ+4wJQjT12eW7ybYXgxJb8HK8fWcGWXv8ir+KPNOrL\nONMhUS8rHyJ3i/9i/uwMP61pqmKf5x1gBTM1h7Wr+8tTIbCsrv1D8DSwOqvHrgTb\nDzUDKmpZFBUD0xyKETR5r7xTpje1M/xcTQlOskllyQKBgQDmhxz3kcnEHN88J1Jq\nvCXD4+0pBF+tzNc9ZmSnu2W+EBq5mAMu7sDTGRZtj/GisyYgm8YMybGnh38IPh5f\nOM+dqcXOM68ARi1srkGXPbLOMksAALhx9NVkbAZvm8Y6wIw6S5V/SsRiW8dq0VTM\nW2ncwUfn9gV3jstdAokjsZTM2QKBgQDgf/l8AZO6MHl/lujz480HP1ye/bkxhPnZ\ndsyHJG7zKRjQsqa1Kj006mGVLtcaC+zhNxGpeLrBbK/+OzyiBuM5OX0UXeS8zfIp\nPkXliSNarWIncgTCEwrcZOe1HFhIYYfd8JKebpaMtPOGYs2ZaKALMNXLDAJv5kSe\nrB0y0nabGQKBgEBKLAzrDo7v0l3vRW/Y2ttfpFNNEfXha6GG/ck9XOl9ikZ6G2Eq\nWoJEu9mATE4okhiD+N62HSJwij/dOxEpQdVhdHXGOibka2Sk0EwCe+w/YGU2zjmR\nozVnXwAfPFBERJc5Xw7p5gzcNagXiwOW9tYG3SvWk729B///ZgmbS7k5AoGBAIWX\nwgeoDJe0duiVDgTsHzrb2qsGAylTnzZoOmeFJ7ztTMYMOaX+/7M4cl9LI1dtl0Wh\n9pGptUCbA9O+vmMQxDBuZF81TIOJ7jvawc9In1A7cvhUIj63BDSIm1WsC2DvIOHS\nYf2Dg3UxzOTAcOaEWxCtu9t7Rwv9tAHUG//9O/UpAoGBALNfN69s+vhpPX4Ioq3B\nyv0bAk3QU4NZHnX7d48eFbWSS5Hhbboo/T/0KPcOf7ynOptsLeV+HqLS/WLXsJ5J\npKw7mW7pb0VoLv+JokWeAtIc4c0Ft2EZtvb+6t3GcrrDJsFBshUdwblrP9sl6i2X\nYUd3Ck2TaXpXirfFdUgByhLl\n-----END PRIVATE KEY-----\n",
  "client_email": "sheet-865@graphite-setup-448304-u1.iam.gserviceaccount.com",
  "client_id": "105183693282963617063",
  "auth_uri": "https://accounts.google.com/o/oauth2/auth",
  "token_uri": "https://oauth2.googleapis.com/token",
  "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
  "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/sheet-865%40graphite-setup-448304-u1.iam.gserviceaccount.com",
  "universe_domain": "googleapis.com"
}
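
To confirm the service account file loads, and to print the identity the Google Sheet must be shared with, a quick check using the same google-auth call as Update_tracking.py:

from google.oauth2.service_account import Credentials

creds = Credentials.from_service_account_file(
    'credentials.json',
    scopes=['https://www.googleapis.com/auth/spreadsheets']
)
print(creds.service_account_email)  # share the target sheet with this address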

legal_document_analysis.py
ADDED
@@ -0,0 +1,328 @@
import streamlit as st
from groq import Groq
from PyPDF2 import PdfReader
from docx import Document
from tiktoken import get_encoding
import matplotlib.pyplot as plt
import io
import base64

# Groq API client initialization
client = Groq(api_key="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")  # Replace with your actual API key.

def extract_text_from_pdf(file):
    reader = PdfReader(file)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""  # extract_text() can return None for image-only pages
    return text

def extract_text_from_docx(file):
    doc = Document(file)
    text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
    return text

def preprocess_text(text):
    # Collapse newlines and repeated whitespace into single spaces
    return " ".join(text.replace("\n", " ").replace("\r", " ").split())

def get_default_encoding():
    return get_encoding("cl100k_base")

def split_into_chunks(text, token_limit=5500):
    encoding = get_default_encoding()
    words = text.split()
    chunks = []
    current_chunk = []
    current_tokens = 0

    for word in words:
        word_tokens = len(encoding.encode(word + " "))
        if current_tokens + word_tokens > token_limit:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_tokens = word_tokens
        else:
            current_chunk.append(word)
            current_tokens += word_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

def summarize_text(text):
    try:
        response = client.chat.completions.create(
            messages=[{
                "role": "user",
                "content": f"Summarize the following legal document in a concise manner: {text}"
            }],
            model="llama-3.1-8b-instant",
            stream=False
        )
        if response and response.choices:
            return response.choices[0].message.content
        else:
            return "Error: Received an empty or invalid response from Groq API."
    except Exception as e:
        return f"Error generating summary: {e}"

def summarize_large_text(text, chunk_limit=5000):
    chunks = split_into_chunks(text, token_limit=chunk_limit)
    summaries = []
    for chunk in chunks:
        summaries.append(summarize_text(chunk))
    return " ".join(summaries)

def detect_key_clauses(text):
    key_clauses = [
        {"clause": "confidentiality", "summary": "Confidentiality clauses ensure that sensitive information remains protected."},
        {"clause": "liability", "summary": "Liability clauses outline the responsibility for damages or losses incurred."},
        {"clause": "termination", "summary": "Termination clauses specify the conditions under which a contract may be ended."},
        {"clause": "force majeure", "summary": "Force majeure clauses excuse parties from performance obligations due to unforeseen events."},
        {"clause": "governing law", "summary": "Governing law clauses specify which jurisdiction's laws will govern the contract."},
        {"clause": "dispute resolution", "summary": "Dispute resolution clauses specify how conflicts between parties will be resolved."},
        {"clause": "amendment", "summary": "Amendment clauses outline the process for changing the terms of the contract."},
        {"clause": "warranty", "summary": "Warranty clauses provide assurances regarding the quality or condition of goods or services."},
    ]

    detected_clauses = []
    for clause in key_clauses:
        if clause["clause"].lower() in text.lower():
            clause_start = text.lower().find(clause["clause"].lower())
            # Clamp the window start so a match near the beginning doesn't produce a negative slice index
            context = text[max(0, clause_start - 50): clause_start + 200]
            explanation = f"The document mentions '{clause['clause']}' clause. Context: {context.strip()}..."
            detected_clauses.append({
                "clause": clause["clause"].capitalize(),
                "summary": clause["summary"],
                "explanation": explanation
            })

    return detected_clauses

def detect_hidden_obligations_or_dependencies(text, summary):
    hidden_obligations = [
        {"phrase": "dependent upon", "summary": "This suggests that some action is conditional upon another."},
        {"phrase": "if", "summary": "This indicates that certain conditions must be met to fulfill the obligation."},
        {"phrase": "may be required", "summary": "Implies that the party could be obligated to perform an action under specific conditions."},
        {"phrase": "should", "summary": "Implies a recommendation or requirement, though not explicitly mandatory."},
        {"phrase": "obligated to", "summary": "Indicates a clear, binding duty to perform an action."},
    ]

    hidden_dependencies = []

    for item in hidden_obligations:
        if item["phrase"].lower() in text.lower() or item["phrase"].lower() in summary.lower():
            phrase_start = text.lower().find(item["phrase"].lower())
            if phrase_start == -1:
                phrase_start = 0  # phrase appears only in the summary; fall back to the document start
            context = text[max(0, phrase_start - 50): phrase_start + 200]
            hidden_dependencies.append({
                "phrase": item["phrase"],
                "summary": item["summary"],
                "context": context.strip()
            })

    return hidden_dependencies

def detect_risks(text, summary):
    risk_phrases = [
        {"phrase": "penalty", "summary": "Penalty clauses may impose financial or legal consequences on the parties involved."},
        {"phrase": "liability", "summary": "Liability clauses may indicate potential financial responsibility or legal risks."},
        {"phrase": "default", "summary": "Default clauses can expose parties to consequences for failure to perform obligations."},
        {"phrase": "breach", "summary": "Breach of contract can lead to serious legal consequences including financial penalties."},
        {"phrase": "suspension", "summary": "Suspension clauses may indicate risks of halting services or operations in case of non-compliance."},
    ]

    detected_risks = []

    for item in risk_phrases:
        if item["phrase"].lower() in text.lower() or item["phrase"].lower() in summary.lower():
            phrase_start = text.lower().find(item["phrase"].lower())
            if phrase_start == -1:
                phrase_start = 0  # phrase appears only in the summary; fall back to the document start
            context = text[max(0, phrase_start - 50): phrase_start + 200]
            detected_risks.append({
                "phrase": item["phrase"],
                "summary": item["summary"],
                "context": context.strip()
            })

    return detected_risks

def plot_risk_pie_chart(detected_clauses, hidden_obligations, detected_risks):
    # Calculate counts for each category
    num_clauses = len(detected_clauses)
    num_obligations = len(hidden_obligations)
    num_risks = len(detected_risks)

    # Create a pie chart (an all-zero count list would render an empty chart)
    labels = ['Detected Key Clauses', 'Hidden Obligations or Dependencies', 'Detected Risks']
    sizes = [num_clauses, num_obligations, num_risks]
    colors = ['#ff9999', '#66b3ff', '#99ff99']

    fig, ax = plt.subplots()
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, wedgeprops={'edgecolor': 'black'})
    ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

    # Create a buffer to save the plot as an image in memory
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    buf.seek(0)

    # Encode the image to base64
    img_str = base64.b64encode(buf.read()).decode('utf-8')
    buf.close()
    plt.close(fig)  # release the figure so repeated reruns don't leak memory

    return img_str

def generate_analysis_document(document_text, summary, detected_clauses, hidden_obligations, detected_risks):
    doc = Document()
    doc.add_heading('Legal Document Analysis', level=1)

    doc.add_heading('Extracted Document Text', level=2)
    doc.add_paragraph(document_text)

    doc.add_heading('Summary', level=2)
    doc.add_paragraph(summary)

    doc.add_heading('Key Clauses', level=2)
    if detected_clauses:
        for clause in detected_clauses:
            doc.add_paragraph(f"Clause: {clause['clause']}")
            doc.add_paragraph(f"Summary: {clause['summary']}")
            doc.add_paragraph(f"Explanation: {clause['explanation']}")
    else:
        doc.add_paragraph("No key clauses detected.")

    doc.add_heading('Hidden Obligations or Dependencies', level=2)
    if hidden_obligations:
        for obligation in hidden_obligations:
            doc.add_paragraph(f"Phrase: {obligation['phrase']}")
            doc.add_paragraph(f"Summary: {obligation['summary']}")
            doc.add_paragraph(f"Context: {obligation['context']}")
    else:
        doc.add_paragraph("No hidden obligations detected.")

    doc.add_heading('Risks', level=2)
    if detected_risks:
        for risk in detected_risks:
            doc.add_paragraph(f"Risk Phrase: {risk['phrase']}")
            doc.add_paragraph(f"Summary: {risk['summary']}")
            doc.add_paragraph(f"Context: {risk['context']}")
    else:
        doc.add_paragraph("No risks detected.")

    return doc

def display_legal_analysis_page():
    st.title("Legal Document Analysis with Groq API")

    uploaded_file = st.file_uploader("Upload your legal document (PDF or DOCX)", type=["pdf", "docx"])
    if uploaded_file:
        if uploaded_file.name.endswith(".pdf"):
            document_text = preprocess_text(extract_text_from_pdf(uploaded_file))
        elif uploaded_file.name.endswith(".docx"):
            document_text = preprocess_text(extract_text_from_docx(uploaded_file))
        else:
            st.error("Unsupported file type!")
            return

        tabs = st.tabs(["Document Text", "Summary", "Key Clauses", "Hidden Obligations or Dependencies", "Risk Analysis"])

        with tabs[0]:
            st.subheader("Extracted Legal Document Text")
            st.text_area("Document Text", document_text, height=300)

        with tabs[1]:
            st.subheader("Quick Summary")
            summary = summarize_large_text(document_text)
            if "Error" in summary:
                st.warning("Summary generation failed.")
                summary = "Summary not available."
            st.write(summary)

        with tabs[2]:
            st.subheader("Detected Key Clauses")

            detected_clauses = detect_key_clauses(document_text)
            if not detected_clauses:
                st.write("No key clauses detected.")
            else:
                # Count occurrences of each detected clause
                clause_counts = {}
                for clause in detected_clauses:
                    clause_counts[clause['clause']] = clause_counts.get(clause['clause'], 0) + 1

                # Create a bar chart for detected clauses
                if clause_counts:
                    labels = list(clause_counts.keys())
                    values = list(clause_counts.values())

                    fig, ax = plt.subplots()
                    ax.bar(labels, values, color='skyblue')

                    # Rotate x-axis labels for better visibility
                    plt.xticks(rotation=45, ha='right')

                    # Add titles and labels
                    ax.set_title("Detected Key Clauses Visualization")
                    ax.set_xlabel("Clause")
                    ax.set_ylabel("Count")

                    # Display the plot
                    st.pyplot(fig)

                # Display details of each clause
                for clause in detected_clauses:
                    if st.button(f"Show Explanation for {clause['clause']} Clause"):
                        st.write(f"**Clause: {clause['clause']}**")
                        st.write(f"Summary: {clause['summary']}\nExplanation: {clause['explanation']}")

        with tabs[3]:
            st.subheader("Detected Hidden Obligations or Dependencies")
            hidden_obligations = detect_hidden_obligations_or_dependencies(document_text, summary)
            if not hidden_obligations:
                st.write("No hidden obligations or dependencies detected.")
            else:
                for item in hidden_obligations:
                    st.write(f"**Phrase: {item['phrase']}**")
                    st.write(f"Summary: {item['summary']}\nContext: {item['context']}")

        with tabs[4]:
            st.subheader("Risk Analysis & Visualization")

            # Recomputed here so this tab is self-contained
            detected_clauses = detect_key_clauses(document_text)
            hidden_obligations = detect_hidden_obligations_or_dependencies(document_text, summary)
            detected_risks = detect_risks(document_text, summary)

            # Generate and display the pie chart
            img_str = plot_risk_pie_chart(detected_clauses, hidden_obligations, detected_risks)
            st.image(f"data:image/png;base64,{img_str}", use_column_width=True)

            # Display the detected risks after the visualization
            st.write("### Detected Risks:")
            if detected_risks:
                for risk in detected_risks:
                    st.write(f"**{risk['phrase']}**: {risk['summary']}")

            # Optionally, show other categories (Key Clauses, Hidden Obligations) after risks
            st.write("### Detected Key Clauses:")
            for clause in detected_clauses:
                st.write(f"**{clause['clause']}**: {clause['explanation']}")

            st.write("### Hidden Obligations or Dependencies:")
            for obligation in hidden_obligations:
                st.write(f"**{obligation['phrase']}**: {obligation['summary']}")

            # Generate the full analysis document for download
            analysis_doc = generate_analysis_document(document_text, summary, detected_clauses, hidden_obligations, detected_risks)

            with st.expander("Download Analysis"):
                output_path = "analysis_report.docx"
                analysis_doc.save(output_path)

                with open(output_path, "rb") as f:
                    st.download_button("Download Analysis", data=f, file_name="analysis_report.docx", mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document")

if __name__ == "__main__":
    display_legal_analysis_page()
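
The detection helpers are plain functions over strings, so they can be sanity-checked without the UI or a valid Groq key (the client is created at import time, but no request is made until summarize_text runs). A small sketch with an illustrative sample text:

from legal_document_analysis import preprocess_text, split_into_chunks, detect_key_clauses

sample = preprocess_text(
    "This Agreement contains a confidentiality clause. "
    "Termination requires thirty days written notice."
)
print(split_into_chunks(sample, token_limit=10))  # chunking by token count
for clause in detect_key_clauses(sample):
    print(clause["clause"], "-", clause["summary"])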

rag_pipeline.py
ADDED
@@ -0,0 +1,76 @@
import os
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub

# Load environment variables from .env file
load_dotenv()

def extract_text_from_pdf(pdf_file):
    """Extracts text from a PDF file."""
    reader = PdfReader(pdf_file)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text

def create_vector_store(text, embeddings_model="sentence-transformers/all-MiniLM-L6-v2"):
    """Creates a FAISS vector store from the input text."""
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_text(text)
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)
    return FAISS.from_texts(texts, embeddings)

def create_qa_pipeline(vector_store, llm_model="EleutherAI/gpt-neo-2.7B"):
    """Creates a retrieval-based question-answering pipeline."""

    # Get the Hugging Face API token from the environment variable
    huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

    if huggingfacehub_api_token is None:
        raise ValueError("HuggingFace Hub API token is missing! Please set 'HUGGINGFACEHUB_API_TOKEN' in your .env file.")

    retriever = vector_store.as_retriever()

    # Initialize the Hugging Face LLM with the API token
    llm = HuggingFaceHub(
        repo_id=llm_model,  # specify the repo_id (e.g., EleutherAI/gpt-neo-2.7B)
        huggingfacehub_api_token=huggingfacehub_api_token,
        task="text-generation"  # specify the task (text-generation for language models)
    )

    return RetrievalQA.from_chain_type(llm, retriever=retriever)

def process_pdf_and_answer(pdf_path):
    """Processes the PDF and returns key information extracted from it."""

    # Extract text from the PDF
    text = extract_text_from_pdf(pdf_path)

    # Create a FAISS vector store
    vector_store = create_vector_store(text)

    # Create a QA pipeline
    qa_pipeline = create_qa_pipeline(vector_store)

    # No interactive question is asked; run a fixed extraction prompt instead
    answer = qa_pipeline.run("Extract key information from the PDF.")
    return answer

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="RAG pipeline for PDF analysis")
    parser.add_argument("--pdf", type=str, required=True, help="Path to the PDF file")
    args = parser.parse_args()

    pdf_path = args.pdf

    # Process the PDF and print the result
    answer = process_pdf_and_answer(pdf_path)
    print(f"Answer: {answer}")
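
The module runs standalone against a local PDF (the filename below is only an example):

python rag_pipeline.py --pdf contract.pdf

It can also be driven programmatically, reusing the same functions app.py imports; a sketch:

from rag_pipeline import extract_text_from_pdf, create_vector_store, create_qa_pipeline

text = extract_text_from_pdf("contract.pdf")
qa = create_qa_pipeline(create_vector_store(text))
print(qa.run("What are the termination conditions?"))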

requirements.txt
ADDED
@@ -0,0 +1,17 @@
streamlit==1.20.0
groq==0.15.0
PyPDF2==2.10.0
python-docx==0.8.11
tiktoken==0.3.0
matplotlib==3.6.3
# concurrent.futures is part of the Python 3 standard library; no separate install is needed
requests==2.28.1
beautifulsoup4==4.11.1
python-dotenv==0.21.0
google-auth==2.16.0
google-auth-oauthlib==0.5.3
google-auth-httplib2==0.1.0
google-api-python-client==2.80.0
transformers==4.25.0
langchain==0.0.154
huggingface-hub==0.13.1
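
Installed the usual way, ideally inside a virtual environment:

pip install -r requirements.txt

Note that the FAISS vector store and HuggingFaceEmbeddings used in rag_pipeline.py additionally appear to require faiss-cpu and sentence-transformers at runtime, which are not pinned here.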