Spaces:

TabasumDev
/

GraniteByte

Sleeping

File size: 21,712 Bytes

# # import streamlit as st
# # import os
# # import re
# # import torch
# # from transformers import AutoModelForCausalLM, AutoTokenizer
# # from PyPDF2 import PdfReader
# # from peft import get_peft_model, LoraConfig, TaskType

# # # ✅ Force CPU execution for Streamlit Cloud
# # device = torch.device("cpu")

# # # 🔹 Load IBM Granite Model (CPU-Compatible)
# # MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"

# # model = AutoModelForCausalLM.from_pretrained(
# #     MODEL_NAME,
# #     device_map="cpu",  # Force CPU execution
# #     torch_dtype=torch.float32  # Use float32 since Streamlit runs on CPU
# # )

# # tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# # # 🔹 Apply LoRA Fine-Tuning Configuration
# # lora_config = LoraConfig(
# #     r=8,
# #     lora_alpha=32,
# #     target_modules=["q_proj", "v_proj"],
# #     lora_dropout=0.1,
# #     bias="none",
# #     task_type=TaskType.CAUSAL_LM
# # )
# # model = get_peft_model(model, lora_config)
# # model.eval()

# # # 🛠 Function to Read & Extract Text from PDFs
# # def read_files(file):
# #     file_context = ""
# #     reader = PdfReader(file)
    
# #     for page in reader.pages:
# #         text = page.extract_text()
# #         if text:
# #             file_context += text + "\n"
    
# #     return file_context.strip()

# # # 🛠 Function to Format AI Prompts
# # def format_prompt(system_msg, user_msg, file_context=""):
# #     if file_context:
# #         system_msg += f" The user has provided a contract document. Use its context to generate insights, but do not repeat or summarize the document itself."
# #     return [
# #         {"role": "system", "content": system_msg},
# #         {"role": "user", "content": user_msg}
# #     ]

# # # 🛠 Function to Generate AI Responses
# # def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
# #     model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
    
# #     with torch.no_grad():
# #         output = model.generate(
# #             **model_inputs,
# #             max_new_tokens=max_tokens,
# #             do_sample=True,
# #             top_p=top_p,
# #             temperature=temperature,
# #             num_return_sequences=1,
# #             pad_token_id=tokenizer.eos_token_id
# #         )
    
# #     return tokenizer.decode(output[0], skip_special_tokens=True)

# # # 🛠 Function to Clean AI Output
# # def post_process(text):
# #     cleaned = re.sub(r'戥+', '', text)  # Remove unwanted symbols
# #     lines = cleaned.splitlines()
# #     unique_lines = list(dict.fromkeys([line.strip() for line in lines if line.strip()]))
# #     return "\n".join(unique_lines)

# # # 🛠 Function to Handle RAG with IBM Granite & Streamlit
# # def granite_simple(prompt, file):
# #     file_context = read_files(file) if file else ""
    
# #     system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
    
# #     messages = format_prompt(system_message, prompt, file_context)
# #     input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
# #     response = generate_response(input_text)
# #     return post_process(response)

# # # 🔹 Streamlit UI
# # def main():
# #     st.set_page_config(page_title="Contract Analysis AI", page_icon="📜", layout="wide")

# #     st.title("📜 AI-Powered Contract Analysis Tool")
# #     st.write("Upload a contract document (PDF) for a detailed AI-driven legal and technical analysis.")

# #     # 🔹 Sidebar Settings
# #     with st.sidebar:
# #         st.header("⚙️ Settings")
# #         max_tokens = st.slider("Max Tokens", 50, 1000, 250, 50)
# #         top_p = st.slider("Top P (sampling)", 0.1, 1.0, 0.9, 0.1)
# #         temperature = st.slider("Temperature (creativity)", 0.1, 1.0, 0.7, 0.1)

# #     # 🔹 File Upload Section
# #     uploaded_file = st.file_uploader("📂 Upload a contract document (PDF)", type="pdf")

# #     if uploaded_file is not None:
# #         temp_file_path = "temp_uploaded_contract.pdf"
# #         with open(temp_file_path, "wb") as f:
# #             f.write(uploaded_file.getbuffer())

# #         st.success("✅ File uploaded successfully!")

# #         # 🔹 User Input for Analysis
# #         user_prompt = "Perform a detailed technical analysis of the attached contract document, highlighting potential risks, legal pitfalls, compliance issues, and areas where contractual terms may lead to future disputes or operational challenges."

# #         if st.button("🔍 Analyze Document"):
# #             with st.spinner("Analyzing contract document... ⏳"):
# #                 final_answer = granite_simple(user_prompt, temp_file_path)

# #             # 🔹 Display Analysis Result
# #             st.subheader("📑 Analysis Result")
# #             st.write(final_answer)

# #             # 🔹 Remove Temporary File
# #             os.remove(temp_file_path)

# # # 🔥 Run Streamlit App
# # if __name__ == '__main__':
# #     main()





# import streamlit as st
# import os
# import re
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from PyPDF2 import PdfReader
# from peft import get_peft_model, LoraConfig, TaskType

# # ✅ Auto-detect GPU for Hugging Face Spaces
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # 🔹 Load IBM Granite Model (CPU/GPU Compatible)
# MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"

# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     device_map="auto",  # Auto-detect GPU if available
#     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
# )

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# # 🔹 Apply LoRA Fine-Tuning Configuration
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["q_proj", "v_proj"],
#     lora_dropout=0.1,
#     bias="none",
#     task_type=TaskType.CAUSAL_LM
# )
# model = get_peft_model(model, lora_config)
# model.eval()

# # 🛠 Function to Read & Extract Text from PDFs (No Temp File Needed)
# def read_files(file):
#     file_context = ""
#     reader = PdfReader(file)
    
#     for page in reader.pages:
#         text = page.extract_text()
#         if text:
#             file_context += text + "\n"
    
#     return file_context.strip()

# # 🛠 Function to Format AI Prompts
# def format_prompt(system_msg, user_msg, file_context=""):
#     if file_context:
#         system_msg += f" The user has provided a contract document. Use its context to generate insights, but do not repeat or summarize the document itself."
#     return [
#         {"role": "system", "content": system_msg},
#         {"role": "user", "content": user_msg}
#     ]

# # 🛠 Function to Generate AI Responses
# def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
#     model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
    
#     with torch.no_grad():
#         output = model.generate(
#             **model_inputs,
#             max_new_tokens=max_tokens,
#             do_sample=True,
#             top_p=top_p,
#             temperature=temperature,
#             num_return_sequences=1,
#             pad_token_id=tokenizer.eos_token_id
#         )
    
#     return tokenizer.decode(output[0], skip_special_tokens=True)

# # 🛠 Function to Clean AI Output
# def post_process(text):
#     cleaned = re.sub(r'戥+', '', text)  # Remove unwanted symbols
#     lines = cleaned.splitlines()
#     unique_lines = list(dict.fromkeys([line.strip() for line in lines if line.strip()]))
#     return "\n".join(unique_lines)

# # 🛠 Function to Handle AI Analysis (No Temp File)
# def granite_simple(prompt, file_content):
#     system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
    
#     messages = format_prompt(system_message, prompt, file_content)
#     input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
#     response = generate_response(input_text)
#     return post_process(response)

# # 🔹 Streamlit UI
# def main():
#     st.set_page_config(page_title="Contract Analysis AI", page_icon="📜", layout="wide")

#     st.title("📜 AI-Powered Contract Analysis Tool")
#     st.write("Upload a contract document (PDF) for a detailed AI-driven legal and technical analysis.")

#     # 🔹 Sidebar Settings
#     with st.sidebar:
#         st.header("⚙️ Settings")
#         max_tokens = st.slider("Max Tokens", 50, 1000, 250, 50)
#         top_p = st.slider("Top P (sampling)", 0.1, 1.0, 0.9, 0.1)
#         temperature = st.slider("Temperature (creativity)", 0.1, 1.0, 0.7, 0.1)

#     # 🔹 File Upload Section (No Temp File)
#     uploaded_file = st.file_uploader("📂 Upload a contract document (PDF)", type="pdf")

#     if uploaded_file is not None:
#         st.success("✅ File uploaded successfully!")

#         # 🔹 Read PDF Content (No Temp File)
#         file_content = read_files(uploaded_file)

#         # 🔹 User Input for Analysis
#         user_prompt = "Perform a detailed technical analysis of the attached contract document, highlighting potential risks, legal pitfalls, compliance issues, and areas where contractual terms may lead to future disputes or operational challenges."

#         if st.button("🔍 Analyze Document"):
#             with st.spinner("Analyzing contract document... ⏳"):
#                 final_answer = granite_simple(user_prompt, file_content)

#             # 🔹 Display Analysis Result
#             st.subheader("📑 Analysis Result")
#             st.write(final_answer)

# # 🔥 Run Streamlit App
# if __name__ == '__main__':
#     main()




# import streamlit as st

# st.title("File Upload Debugging")

# uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# if uploaded_file:
#     st.success(f"File uploaded: {uploaded_file.name}")
#     st.write(f"File Size: {uploaded_file.size / 1024:.2f} KB")

# ###################################################################################


# import streamlit as st
# import os
# import re
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from PyPDF2 import PdfReader
# from peft import get_peft_model, LoraConfig, TaskType

# # ✅ Force CPU execution for Hugging Face Spaces
# device = torch.device("cpu")

# # 🔹 Load IBM Granite Model (CPU-Compatible)
# MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"

# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     device_map="cpu",  # Force CPU execution
#     torch_dtype=torch.float32  # Use float32 since Hugging Face runs on CPU
# )

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# # 🔹 Apply LoRA Fine-Tuning Configuration
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["q_proj", "v_proj"],
#     lora_dropout=0.1,
#     bias="none",
#     task_type=TaskType.CAUSAL_LM
# )
# model = get_peft_model(model, lora_config)
# model.eval()

# # 🛠 Function to Read & Extract Text from PDFs
# def read_files(file):
#     file_context = ""
#     try:
#         reader = PdfReader(file)
#         for page in reader.pages:
#             text = page.extract_text()
#             if text:
#                 file_context += text + "\n"
#     except Exception as e:
#         st.error(f"⚠️ Error reading PDF file: {e}")
#         return ""

#     return file_context.strip()

# # 🛠 Function to Format AI Prompts
# def format_prompt(system_msg, user_msg, file_context=""):
#     if file_context:
#         system_msg += f" The user has provided a contract document. Use its context to generate insights, but do not repeat or summarize the document itself."
#     return [
#         {"role": "system", "content": system_msg},
#         {"role": "user", "content": user_msg}
#     ]

# # 🛠 Function to Generate AI Responses
# def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
#     st.write("🔍 Generating response...")  # Debugging message
#     model_inputs = tokenizer([input_text], return_tensors="pt").to(device)

#     with torch.no_grad():
#         output = model.generate(
#             **model_inputs,
#             max_new_tokens=max_tokens,
#             do_sample=True,
#             top_p=top_p,
#             temperature=temperature,
#             num_return_sequences=1,
#             pad_token_id=tokenizer.eos_token_id
#         )

#     response = tokenizer.decode(output[0], skip_special_tokens=True)
#     st.write("✅ Response Generated!")  # Debugging message
#     return response

# # 🛠 Function to Clean AI Output
# def post_process(text):
#     cleaned = re.sub(r'戥+', '', text)  # Remove unwanted symbols
#     lines = cleaned.splitlines()
#     unique_lines = list(dict.fromkeys([line.strip() for line in lines if line.strip()]))
#     return "\n".join(unique_lines)

# # 🛠 Function to Handle RAG with IBM Granite & Streamlit
# def granite_simple(prompt, file):
#     file_context = read_files(file) if file else ""

#     # Guard: abort early when no text could be extracted from the PDF
#     if not file_context:
#         st.error("⚠️ No content extracted from the PDF. It might be a scanned image or encrypted.")
#         return "Error: No content found in the document."

#     system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."

#     messages = format_prompt(system_message, prompt, file_context)
#     input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

#     response = generate_response(input_text)
#     return post_process(response)

# # 🔹 Streamlit UI
# def main():
#     st.set_page_config(page_title="Contract Analysis AI", page_icon="📜")

#     st.title("📜 AI-Powered Contract Analysis Tool")
#     st.write("Upload a contract document (PDF) for a detailed AI-driven legal and technical analysis.")

#     # 🔹 Sidebar Settings
#     with st.sidebar:
#         st.header("⚙️ Settings")
#         max_tokens = st.slider("Max Tokens", 50, 1000, 250, 50)
#         top_p = st.slider("Top P (sampling)", 0.1, 1.0, 0.9, 0.1)
#         temperature = st.slider("Temperature (creativity)", 0.1, 1.0, 0.7, 0.1)

#     # 🔹 File Upload Section
#     uploaded_file = st.file_uploader("📂 Upload a contract document (PDF)", type="pdf")

#     if uploaded_file:
#         st.success(f"✅ File uploaded successfully! File Name: {uploaded_file.name}")
#         st.write(f"**File Size:** {uploaded_file.size / 1024:.2f} KB")

#         # Debugging: Show extracted text preview
#         pdf_text = read_files(uploaded_file)
#         if pdf_text:
#             st.write("**Extracted Sample Text:**")
#             st.code(pdf_text[:500])  # Show first 500 characters
#         else:
#             st.error("⚠️ No readable text found in the document.")

#         st.write("Click the button below to analyze the contract.")

#         # Style the button as a full-width block element (CSS only)
#         st.markdown('<style>div.stButton > button {display: block; width: 100%;}</style>', unsafe_allow_html=True)

#         if st.button("🔍 Analyze Document"):
#             with st.spinner("Analyzing contract document... ⏳"):
#                 final_answer = granite_simple(
#                     "Perform a detailed technical analysis of the attached contract document, highlighting potential risks, legal pitfalls, compliance issues, and areas where contractual terms may lead to future disputes or operational challenges.",
#                     uploaded_file
#                 )

#             # 🔹 Display Analysis Result
#             st.subheader("📑 Analysis Result")
#             st.write(final_answer)

# # 🔥 Run Streamlit App
# if __name__ == '__main__':
#     main()

import streamlit as st
from PyPDF2 import PdfReader

# Minimal Streamlit page for checking that a PDF upload arrives and that
# PyPDF2 can pull text out of it. Runs top to bottom on every Streamlit rerun.
st.title("📂 PDF Upload Debugger")

uploaded_file = st.file_uploader("Upload a PDF", type="pdf")

if uploaded_file:
    st.success(f"✅ File uploaded: {uploaded_file.name}")
    # `size` is divided by 1024 so the value shown is in KB.
    st.write(f"📏 File Size: {uploaded_file.size / 1024:.2f} KB")

    try:
        reader = PdfReader(uploaded_file)
        # Extract each page exactly once: text extraction is the expensive
        # step here. Pages that yield no text (None or "") are skipped.
        page_texts = (page.extract_text() for page in reader.pages)
        text = "\n".join(chunk for chunk in page_texts if chunk)
    except Exception as e:
        # UI boundary: surface any parsing failure to the user instead of
        # crashing the page. Only the PDF parsing above is guarded.
        st.error(f"⚠️ Error reading PDF: {e}")
    else:
        if text.strip():
            st.subheader("Extracted Text (First 500 characters)")
            st.code(text[:500])  # Show a preview of the text
        else:
            st.error("⚠️ No text found. The document might be scanned or encrypted.")


# ###################################################################################

# import streamlit as st
# import os
# import re
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from PyPDF2 import PdfReader
# from peft import get_peft_model, LoraConfig, TaskType

# # ✅ Force CPU execution
# device = torch.device("cpu")

# # 🔹 Load IBM Granite Model (CPU-Compatible)
# MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"

# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     device_map="cpu",  # Force CPU execution
#     torch_dtype=torch.float32  # Use float32 since Hugging Face runs on CPU
# )

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# # 🔹 Apply LoRA Fine-Tuning Configuration
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["q_proj", "v_proj"],
#     lora_dropout=0.1,
#     bias="none",
#     task_type=TaskType.CAUSAL_LM
# )
# model = get_peft_model(model, lora_config)
# model.eval()

# # 🛠 Function to Read & Extract Text from PDFs
# def read_files(file):
#     file_context = ""
#     reader = PdfReader(file)
    
#     for page in reader.pages:
#         text = page.extract_text()
#         if text:
#             file_context += text + "\n"
    
#     return file_context.strip()

# # 🛠 Function to Format AI Prompts
# def format_prompt(system_msg, user_msg, file_context=""):
#     if file_context:
#         system_msg += f" The user has provided a contract document. Use its context to generate insights, but do not repeat or summarize the document itself."
#     return [
#         {"role": "system", "content": system_msg},
#         {"role": "user", "content": user_msg}
#     ]

# # 🛠 Function to Generate AI Responses
# def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
#     model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
    
#     with torch.no_grad():
#         output = model.generate(
#             **model_inputs,
#             max_new_tokens=max_tokens,
#             do_sample=True,
#             top_p=top_p,
#             temperature=temperature,
#             num_return_sequences=1,
#             pad_token_id=tokenizer.eos_token_id
#         )
    
#     return tokenizer.decode(output[0], skip_special_tokens=True)

# # 🛠 Function to Clean AI Output
# def post_process(text):
#     cleaned = re.sub(r'戥+', '', text)  # Remove unwanted symbols
#     lines = cleaned.splitlines()
#     unique_lines = list(dict.fromkeys([line.strip() for line in lines if line.strip()]))
#     return "\n".join(unique_lines)

# # 🛠 Function to Handle RAG with IBM Granite & Streamlit
# def granite_simple(prompt, file):
#     file_context = read_files(file) if file else ""
    
#     system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
    
#     messages = format_prompt(system_message, prompt, file_context)
#     input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
#     response = generate_response(input_text)
#     return post_process(response)

# # 🔹 Streamlit UI
# def main():
#     st.set_page_config(page_title="Contract Analysis AI", page_icon="📜")

#     st.title("📜 AI-Powered Contract Analysis Tool")
#     st.write("Upload a contract document (PDF) for a detailed AI-driven legal and technical analysis.")

#     # 🔹 Sidebar Settings
#     with st.sidebar:
#         st.header("⚙️ Settings")
#         max_tokens = st.slider("Max Tokens", 50, 1000, 250, 50)
#         top_p = st.slider("Top P (sampling)", 0.1, 1.0, 0.9, 0.1)
#         temperature = st.slider("Temperature (creativity)", 0.1, 1.0, 0.7, 0.1)

#     # 🔹 File Upload Section
#     uploaded_file = st.file_uploader("📂 Upload a contract document (PDF)", type="pdf")

#     # ✅ Ensure file upload message is displayed
#     if uploaded_file is not None:
#         st.session_state["uploaded_file"] = uploaded_file  # Persist file in session state
#         st.success("✅ File uploaded successfully!")
#         st.write("Click the button below to analyze the contract.")

#         # Style the button as a full-width block element (CSS only)
#         st.markdown('<style>div.stButton > button {display: block; width: 100%;}</style>', unsafe_allow_html=True)

#         if st.button("🔍 Analyze Document"):
#             with st.spinner("Analyzing contract document... ⏳"):
#                 final_answer = granite_simple(
#                     "Perform a detailed technical analysis of the attached contract document, highlighting potential risks, legal pitfalls, compliance issues, and areas where contractual terms may lead to future disputes or operational challenges.",
#                     uploaded_file
#                 )

#             # 🔹 Display Analysis Result
#             st.subheader("📑 Analysis Result")
#             st.write(final_answer)

# # 🔥 Run Streamlit App
# if __name__ == '__main__':
#     main()