import streamlit as st
from transformers import pipeline
import spacy
from io import StringIO
import PyPDF2
import docx

# Load Hugging Face's pre-trained NER model.
# Streamlit re-executes this script from the top on every user interaction, so
# the loader is wrapped in st.cache_resource: the (BERT-large) pipeline is
# built once and the same object is reused on later reruns.
@st.cache_resource
def _load_ner_pipeline():
    """Build and return the Hugging Face "ner" pipeline used by the app."""
    return pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")


nlp = _load_ner_pipeline()

# Sample regulations database: maps a short regulation key to its
# human-readable rule text (can be expanded with more detailed regulations).
regulations = dict(
    pollution_limit="Air pollution should not exceed 100 µg/m³ of particulate matter.",
    waste_management="Waste should be sorted into recyclable and non-recyclable categories.",
)

# Function to extract text from PDF
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: The PDF source; it is passed unchanged to ``PyPDF2.PdfReader``.

    Returns:
        A single string holding the extracted text of all pages, with no
        separator inserted between pages. Pages that yield no text contribute
        nothing, so the result is "" when nothing could be extracted.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    # A page without extractable text (e.g. an image-only scan) may give back
    # a falsy value; `or ""` keeps the join from failing on such pages.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to extract text from DOCX
def extract_text_from_docx(file):
    """Return the paragraph text of a Word document, one paragraph per line.

    ``file`` is handed unchanged to ``docx.Document``. Every entry of the
    document's ``paragraphs`` contributes its text followed by a newline, so
    the result is an empty string when there are no paragraphs.
    """
    document = docx.Document(file)
    lines = [paragraph.text + "\n" for paragraph in document.paragraphs]
    return "".join(lines)

# Function to check compliance with regulations
def check_compliance(document_text):
    """Return reminder messages for regulated topics mentioned in the text.

    The text is lowercased and searched for each topic keyword as a plain
    substring; every keyword that occurs adds its reminder to the result.

    Args:
        document_text: The document contents as a string.

    Returns:
        A list of reminder strings (pollution first, then waste); empty when
        neither keyword occurs.
    """
    lowered = document_text.lower()

    # (keyword, reminder) pairs, listed in the order they are reported.
    keyword_reminders = (
        ("pollution", "Check pollution limits: Ensure PM2.5 does not exceed 100 µg/m³."),
        ("waste", "Check waste management: Ensure waste is properly sorted."),
    )

    return [reminder for keyword, reminder in keyword_reminders if keyword in lowered]

# Streamlit App
# Flow: upload a report -> extract its text -> show keyword-based compliance
# reminders -> list the entities returned by the NER pipeline.
st.title("🌱 Environmental Compliance Checker")

# Upload document
uploaded_file = st.file_uploader("Upload Environmental Report", type=["txt", "pdf", "docx"])

if uploaded_file is not None:
    # The extractor is chosen from the extension of the uploaded file's name.
    file_extension = uploaded_file.name.split('.')[-1].lower()
    file_content = ""
    # Set once an error has been shown, so the "no text" warning further down
    # is not stacked on top of it.
    error_reported = False

    try:
        if file_extension == "pdf":
            file_content = extract_text_from_pdf(uploaded_file)
        elif file_extension == "docx":
            file_content = extract_text_from_docx(uploaded_file)
        elif file_extension == "txt":
            # Not every .txt upload is valid UTF-8; substitute undecodable
            # bytes instead of aborting with UnicodeDecodeError.
            file_content = uploaded_file.read().decode("utf-8", errors="replace")
        else:
            st.error("Unsupported file type!")
            error_reported = True
    except Exception as exc:
        # UI boundary: a corrupt or mislabeled upload should produce a
        # readable message rather than an unhandled parser traceback.
        st.error(f"Could not read the uploaded file: {exc}")
        error_reported = True

    if file_content and file_content.strip():
        st.text_area("Uploaded Document", file_content, height=300)

        # Check compliance with regulations
        st.subheader("Compliance Feedback")
        feedback = check_compliance(file_content)

        if feedback:
            for item in feedback:
                st.write(f"- {item}")
        else:
            st.write("No compliance issues found.")

        # Optional: Provide NLP-based analysis or highlight regulations mentioned in the document
        # NOTE(review): the whole document is passed to the model in one call;
        # very long reports may exceed the model's maximum input length —
        # confirm and chunk the text if needed.
        st.subheader("Regulation Mentions in Document")
        entities = nlp(file_content)
        for entity in entities:
            st.write(f"Entity: {entity['word']} - Label: {entity['entity']}")
    elif not error_reported:
        # A supported file that yields no text (e.g. a scanned PDF or an empty
        # file) previously produced no output at all.
        st.warning("No readable text was found in the uploaded document.")

else:
    st.write("Please upload a document to check compliance.")