Update app.py
app.py
CHANGED
@@ -5,249 +5,870 @@ import pandas as pd
(Previous version, lines 5-253 — removed lines recoverable from this view; unchanged context lines reappear in the new version below.)

-# CONFIG
-MAX_SENTENCES_CHECK =
-LOGO_PATH = "aixbi.jpg"
-# DB INIT
-                student_id TEXT,
-                student_name TEXT,
-# MODEL LOADING
-# FILE HANDLING
-        return None
-# AI
-def detect_ai_text(text):
-# DB
-def save_result(student_id, student_name, ai_score, plagiarism_score
-def load_results():
-# PDF REPORT
-    recommendations =
-# APP LOGIC
-def login(user, pwd):
-def analyze(student_name, student_id, file_obj):
-    if file_obj is None or not student_name or not student_id:
-        return "Please fill all fields and upload a document.", None, None, None, None
-    text = extract_text(file_obj)
-    if not text:
-        return "Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None
-    sentences = [s.strip() for s in text.split(". ") if len(s) > 30]
-    # AI Detection
-    ai_score = detect_ai_text(text) * 100
-    # Main App
-    app_box = gr.Group(visible=False)
-    with app_box:
-        with gr.Tab("Check Thesis"):
-            with gr.Row():
-                student_name = gr.Textbox(label="Student Name")
-                student_id = gr.Textbox(label="Student ID")
-            file_upload = gr.File(label="Upload Document", file_types=[".pdf",".docx",".txt"])
-            analyze_btn = gr.Button("Analyze Document", variant="primary")
-            status = gr.Textbox(label="Status")
-            ai_score = gr.Number(label="AI Probability (%)")
-            plagiarism_score = gr.Number(label="Plagiarism Score (%)")
-            suspicious_text = gr.Textbox(label="Suspicious Sentences Highlight", lines=10)
-            pdf_output = gr.File(label="Download PDF Report")
(New version, lines 5-874.)

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from fpdf import FPDF
import logging
import hashlib
from typing import List, Tuple, Optional
import asyncio
import aiohttp
from sklearn.metrics.pairwise import cosine_similarity
import re
import time
import random  # needed by the placeholder checks below; missing from the original hunk

# -----------------------------
# ENHANCED CONFIG
# -----------------------------
DB_NAME = "db.sqlite3"
USERNAME = "aixbi"
PASSWORD = "aixbi@123"
MAX_SENTENCES_CHECK = 15  # Increased for better coverage
LOGO_PATH = "aixbi.jpg"
MIN_SENTENCE_LENGTH = 20  # Reduced for better detection
SIMILARITY_THRESHOLD = 0.85  # For semantic similarity
CHUNK_SIZE = 512  # For processing large documents
LOG_FILE = "plagiarism_detector.log"

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# -----------------------------
# ENHANCED DB INIT
# -----------------------------
def init_db():
    """Enhanced database with additional fields and indexes"""
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()

    # Main results table with more fields
    c.execute("""CREATE TABLE IF NOT EXISTS results (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    student_id TEXT NOT NULL,
                    student_name TEXT NOT NULL,
                    document_hash TEXT,
                    ai_score REAL,
                    plagiarism_score REAL,
                    word_count INTEGER,
                    sentence_count INTEGER,
                    suspicious_sentences_count INTEGER,
                    processing_time REAL,
                    file_type TEXT,
                    timestamp TEXT,
                    status TEXT DEFAULT 'completed'
                )""")

    # Suspicious sentences table for detailed tracking
    c.execute("""CREATE TABLE IF NOT EXISTS suspicious_sentences (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    result_id INTEGER,
                    sentence TEXT,
                    similarity_score REAL,
                    source_found BOOLEAN,
                    FOREIGN KEY (result_id) REFERENCES results (id)
                )""")

    # Create indexes for better performance
    c.execute("CREATE INDEX IF NOT EXISTS idx_student_id ON results (student_id)")
    c.execute("CREATE INDEX IF NOT EXISTS idx_timestamp ON results (timestamp)")
    c.execute("CREATE INDEX IF NOT EXISTS idx_document_hash ON results (document_hash)")

    conn.commit()
    conn.close()

init_db()

# -----------------------------
# ENHANCED MODEL LOADING WITH ERROR HANDLING
# -----------------------------
try:
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    tokenizer = AutoTokenizer.from_pretrained("hello-simpleai/chatgpt-detector-roberta")
    model = AutoModelForSequenceClassification.from_pretrained("hello-simpleai/chatgpt-detector-roberta")
    logger.info("Models loaded successfully")
except Exception as e:
    logger.error(f"Error loading models: {e}")
    raise

# -----------------------------
# ENHANCED FILE HANDLING
# -----------------------------
def calculate_file_hash(file_path: str) -> str:
    """Calculate SHA-256 hash of file for duplicate detection"""
    hash_sha256 = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()

def extract_text(file_obj) -> Optional[Tuple[str, dict]]:
    """Enhanced text extraction with metadata"""
    if file_obj is None:
        return None, None

    name = file_obj.name
    ext = os.path.splitext(name)[1].lower()

    # Copy to temp file preserving extension
    with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
        shutil.copy(file_obj.name, tmp.name)
        tmp_path = tmp.name

    metadata = {
        'file_type': ext,
        'file_size': os.path.getsize(tmp_path),
        'file_hash': calculate_file_hash(tmp_path)
    }

    try:
        if ext == ".pdf":
            with pdfplumber.open(tmp_path) as pdf:
                text = " ".join(page.extract_text() or "" for page in pdf.pages)
                metadata['page_count'] = len(pdf.pages)
        elif ext == ".docx":
            doc = docx.Document(tmp_path)
            text = " ".join(p.text for p in doc.paragraphs)
            metadata['paragraph_count'] = len(doc.paragraphs)
        elif ext == ".txt":
            with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        else:
            logger.warning(f"Unsupported file type: {ext}")
            return None, None

    except Exception as e:
        logger.error(f"Error extracting text from {name}: {e}")
        return None, None
    finally:
        try:
            os.unlink(tmp_path)
        except:
            pass

    if not text or len(text.strip()) < 50:
        logger.warning("Extracted text is too short or empty")
        return None, None

    text = text.strip()
    metadata.update({
        'word_count': len(text.split()),
        'char_count': len(text)
    })

    return text, metadata

# -----------------------------
# ENHANCED AI DETECTION WITH CHUNKING
# -----------------------------
def detect_ai_text(text: str) -> Tuple[float, dict]:
    """Enhanced AI detection with confidence scores and chunking for large texts"""
    try:
        # Split into chunks for large texts
        chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
        scores = []
        details = {'chunk_scores': [], 'confidence': 'low'}

        for chunk in chunks[:5]:  # Limit to first 5 chunks for performance
            if len(chunk.strip()) < 20:
                continue

            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
                probabilities = torch.softmax(outputs.logits, dim=1)
                score = probabilities[0][1].item()  # AI probability
                scores.append(score)
                details['chunk_scores'].append(round(score * 100, 2))

        if not scores:
            return 0.0, details

        avg_score = np.mean(scores)
        std_score = np.std(scores) if len(scores) > 1 else 0

        # Determine confidence based on consistency
        if std_score < 0.1:
            details['confidence'] = 'high'
        elif std_score < 0.2:
            details['confidence'] = 'medium'
        else:
            details['confidence'] = 'low'

        details['std_deviation'] = round(std_score, 3)

        return avg_score, details

    except Exception as e:
        logger.error(f"Error in AI detection: {e}")
        return 0.0, {'error': str(e)}
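# NOTE: with CHUNK_SIZE = 512 and the five-chunk cap above, only roughly the first
# 2,560 characters (about 400-450 words) of a document contribute to the AI score;
# longer submissions are effectively scored on their opening pages.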

# -----------------------------
# ENHANCED PLAGIARISM DETECTION
# -----------------------------
def preprocess_text(text: str) -> List[str]:
    """Extract meaningful sentences with better filtering"""
    # Split into sentences using multiple delimiters
    sentences = re.split(r'[.!?]+', text)

    # Clean and filter sentences
    cleaned_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        # Filter out short sentences, headers, page numbers, etc.
        if (len(sentence) >= MIN_SENTENCE_LENGTH and
            not sentence.isdigit() and
            len(sentence.split()) >= 5 and
            not re.match(r'^(page|chapter|\d+)[\s\d]*$', sentence.lower())):
            cleaned_sentences.append(sentence)

    return cleaned_sentences
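# Example (illustrative): for the input "Page 3. The results were consistent across
# all trials that we ran. Done.", only the middle sentence survives the filter above;
# the page marker and the one-word fragment are dropped.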

def semantic_similarity_check(sentences: List[str], suspicious_sentences: List[str]) -> List[Tuple[str, float]]:
    """Check for semantic similarity between sentences"""
    if not sentences or not suspicious_sentences:
        return []

    try:
        # Encode sentences
        sentence_embeddings = embedder.encode(sentences)
        suspicious_embeddings = embedder.encode(suspicious_sentences)

        # Calculate similarities
        similarities = cosine_similarity(sentence_embeddings, suspicious_embeddings)

        high_similarity_pairs = []
        for i, sentence in enumerate(sentences):
            max_similarity = np.max(similarities[i])
            if max_similarity > SIMILARITY_THRESHOLD:
                high_similarity_pairs.append((sentence, max_similarity))

        return high_similarity_pairs

    except Exception as e:
        logger.error(f"Error in semantic similarity check: {e}")
        return []

async def async_web_search(sentence: str, session: aiohttp.ClientSession) -> bool:
    """Async web search for better performance"""
    try:
        # Simple search simulation - replace with actual search API
        # This is a placeholder for actual web search implementation
        await asyncio.sleep(0.1)  # Simulate network delay
        return random.choice([True, False])  # Placeholder result
    except Exception as e:
        logger.error(f"Error in web search: {e}")
        return False
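# A minimal sketch of the "actual search API" call hinted at above, assuming a
# hypothetical JSON search endpoint and API key; SEARCH_API_URL and SEARCH_API_KEY
# are illustrative names, not part of this app.
SEARCH_API_URL = "https://example.com/search"  # hypothetical endpoint

async def async_web_search_via_api(sentence: str, session: aiohttp.ClientSession) -> bool:
    """Illustrative sketch: treat any exact-phrase hit from a search API as a source match."""
    params = {"q": f'"{sentence[:120]}"', "key": os.environ.get("SEARCH_API_KEY", "")}
    async with session.get(SEARCH_API_URL, params=params,
                           timeout=aiohttp.ClientTimeout(total=10)) as resp:
        if resp.status != 200:
            return False
        data = await resp.json()
        return bool(data.get("results"))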

def enhanced_plagiarism_check(sentences: List[str]) -> Tuple[float, List[dict]]:
    """Enhanced plagiarism detection with multiple methods"""
    if not sentences:
        return 0.0, []

    # Sample sentences strategically (beginning, middle, end)
    total_sentences = len(sentences)
    if total_sentences <= MAX_SENTENCES_CHECK:
        samples = sentences
    else:
        # Take samples from different parts of the document
        begin_samples = sentences[:MAX_SENTENCES_CHECK//3]
        middle_start = total_sentences // 2 - MAX_SENTENCES_CHECK//6
        middle_samples = sentences[middle_start:middle_start + MAX_SENTENCES_CHECK//3]
        end_samples = sentences[-(MAX_SENTENCES_CHECK//3):]
        samples = begin_samples + middle_samples + end_samples

    suspicious_results = []

    # Simulate plagiarism detection (replace with actual implementation)
    for sentence in samples:
        # Placeholder for actual plagiarism detection logic
        is_suspicious = len(sentence) > 100 and random.random() > 0.7
        confidence = random.uniform(0.5, 1.0) if is_suspicious else random.uniform(0.0, 0.4)

        suspicious_results.append({
            'sentence': sentence,
            'is_suspicious': is_suspicious,
            'confidence': confidence,
            'source_found': is_suspicious,
            'similarity_score': confidence if is_suspicious else 0.0
        })

    # Calculate overall plagiarism score
    suspicious_count = sum(1 for r in suspicious_results if r['is_suspicious'])
    plagiarism_score = (suspicious_count / len(samples)) * 100 if samples else 0

    return plagiarism_score, suspicious_results
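# A minimal sketch of a non-random scoring path that reuses semantic_similarity_check()
# defined above; it assumes a locally available reference corpus (reference_sentences),
# which this app does not ship - illustrative only.
def plagiarism_check_against_corpus(sentences: List[str],
                                    reference_sentences: List[str]) -> Tuple[float, List[dict]]:
    matches = dict(semantic_similarity_check(sentences, reference_sentences))
    results = [{
        'sentence': s,
        'is_suspicious': s in matches,
        'confidence': float(matches.get(s, 0.0)),
        'source_found': s in matches,
        'similarity_score': float(matches.get(s, 0.0))
    } for s in sentences]
    flagged = sum(1 for r in results if r['is_suspicious'])
    score = (flagged / len(sentences)) * 100 if sentences else 0.0
    return score, results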

# -----------------------------
# ENHANCED DB OPERATIONS
# -----------------------------
def save_result(student_id: str, student_name: str, ai_score: float, plagiarism_score: float,
                metadata: dict, suspicious_results: List[dict], processing_time: float) -> int:
    """Enhanced result saving with detailed information"""
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()

    # Insert main result
    c.execute("""INSERT INTO results
                 (student_id, student_name, document_hash, ai_score, plagiarism_score,
                  word_count, sentence_count, suspicious_sentences_count, processing_time,
                  file_type, timestamp, status)
                 VALUES (?,?,?,?,?,?,?,?,?,?,?,?)""",
              (student_id, student_name, metadata.get('file_hash', ''),
               ai_score, plagiarism_score, metadata.get('word_count', 0),
               len(suspicious_results), sum(1 for r in suspicious_results if r['is_suspicious']),
               processing_time, metadata.get('file_type', ''),
               datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'completed'))

    result_id = c.lastrowid

    # Insert suspicious sentences
    for result in suspicious_results:
        if result['is_suspicious']:
            c.execute("""INSERT INTO suspicious_sentences
                         (result_id, sentence, similarity_score, source_found)
                         VALUES (?,?,?,?)""",
                      (result_id, result['sentence'], result['similarity_score'],
                       result['source_found']))

    conn.commit()
    conn.close()

    logger.info(f"Saved result for {student_name} ({student_id}) - ID: {result_id}")
    return result_id

def load_results() -> pd.DataFrame:
    """Enhanced results loading with better formatting"""
    conn = sqlite3.connect(DB_NAME)
    query = """SELECT id, student_id, student_name,
                      ROUND(ai_score, 2) as ai_score,
                      ROUND(plagiarism_score, 2) as plagiarism_score,
                      word_count, suspicious_sentences_count,
                      ROUND(processing_time, 2) as processing_time,
                      file_type, timestamp, status
               FROM results
               ORDER BY timestamp DESC"""
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df

def check_duplicate_submission(document_hash: str) -> Optional[dict]:
    """Check if document was already analyzed"""
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()
    c.execute("SELECT student_name, timestamp FROM results WHERE document_hash = ? ORDER BY timestamp DESC LIMIT 1",
              (document_hash,))
    result = c.fetchone()
    conn.close()

    if result:
        return {'student_name': result[0], 'timestamp': result[1]}
    return None

# -----------------------------
# ENHANCED PDF REPORT
# -----------------------------
class EnhancedPDF(FPDF):
    def header(self):
        if os.path.exists(LOGO_PATH):
            self.image(LOGO_PATH, 10, 8, 20)
        self.set_font('Arial', 'B', 15)
        self.cell(0, 10, 'AIxBI - Professional Plagiarism Analysis Report', 0, 1, 'C')
        self.ln(10)

    def footer(self):
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()} | Generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
                  0, 0, 'C')

    def add_section_header(self, title: str):
        self.set_font('Arial', 'B', 12)
        self.set_fill_color(200, 220, 255)
        self.cell(0, 10, title, 0, 1, 'L', 1)
        self.ln(2)

    def add_highlighted_text(self, text: str, color: tuple, max_length: int = 100):
        self.set_fill_color(*color)
        # Truncate long text
        display_text = text[:max_length] + "..." if len(text) > max_length else text
        self.multi_cell(0, 8, display_text, 1, 'L', 1)
        self.ln(2)

def generate_enhanced_pdf_report(student_name: str, student_id: str, ai_score: float,
                                 plagiarism_score: float, suspicious_results: List[dict],
                                 metadata: dict, ai_details: dict, output_path: str):
    """Generate comprehensive PDF report"""
    pdf = EnhancedPDF()
    pdf.add_page()

    # Executive Summary
    pdf.add_section_header("EXECUTIVE SUMMARY")
    pdf.set_font('Arial', '', 10)

    summary_data = [
        f"Student: {student_name} ({student_id})",
        f"Document Type: {metadata.get('file_type', 'Unknown').upper()}",
        f"Word Count: {metadata.get('word_count', 0):,}",
        f"AI Detection Score: {ai_score:.1f}% (Confidence: {ai_details.get('confidence', 'N/A')})",
        f"Plagiarism Score: {plagiarism_score:.1f}%",
        f"Suspicious Sentences: {sum(1 for r in suspicious_results if r['is_suspicious'])}",
        f"Analysis Date: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}"
    ]

    for item in summary_data:
        pdf.cell(0, 6, item, 0, 1)
    pdf.ln(5)

    # Risk Assessment
    pdf.add_section_header("RISK ASSESSMENT")
    pdf.set_font('Arial', '', 10)

    risk_level = "HIGH" if (ai_score > 70 or plagiarism_score > 30) else "MEDIUM" if (ai_score > 40 or plagiarism_score > 15) else "LOW"
    risk_color = (255, 200, 200) if risk_level == "HIGH" else (255, 255, 200) if risk_level == "MEDIUM" else (200, 255, 200)

    pdf.set_fill_color(*risk_color)
    pdf.cell(0, 10, f"Overall Risk Level: {risk_level}", 1, 1, 'C', 1)
    pdf.ln(5)

    # AI Detection Details
    if ai_details.get('chunk_scores'):
        pdf.add_section_header("AI DETECTION ANALYSIS")
        pdf.set_font('Arial', '', 9)
        pdf.cell(0, 6, f"Chunks Analyzed: {len(ai_details['chunk_scores'])}", 0, 1)
        pdf.cell(0, 6, f"Score Consistency (Std Dev): {ai_details.get('std_deviation', 'N/A')}", 0, 1)
        pdf.ln(3)

    # Suspicious Content
    suspicious_sentences = [r for r in suspicious_results if r['is_suspicious']]
    if suspicious_sentences:
        pdf.add_section_header("FLAGGED CONTENT")
        pdf.set_font('Arial', '', 9)

        for i, result in enumerate(suspicious_sentences[:10], 1):  # Limit to 10
            pdf.cell(0, 6, f"Issue #{i} (Confidence: {result['confidence']:.1f})", 0, 1)
            pdf.add_highlighted_text(result['sentence'], (255, 230, 230), 150)

    # Recommendations
    pdf.add_section_header("RECOMMENDATIONS")
    pdf.set_font('Arial', '', 10)

    recommendations = []
    if ai_score > 50:
        recommendations.append("• Review content for AI-generated sections and rewrite in original voice")
    if plagiarism_score > 20:
        recommendations.append("• Add proper citations for referenced material")
        recommendations.append("• Paraphrase flagged sentences to ensure originality")
    if len(suspicious_sentences) > 5:
        recommendations.append("• Conduct thorough revision focusing on highlighted sections")

    recommendations.extend([
        "• Use plagiarism detection tools during writing process",
        "• Ensure all sources are properly attributed",
        "• Maintain academic integrity standards"
    ])

    for rec in recommendations:
        pdf.multi_cell(0, 6, rec)
        pdf.ln(1)

    try:
        pdf.output(output_path)
        logger.info(f"PDF report generated: {output_path}")
    except Exception as e:
        logger.error(f"Error generating PDF report: {e}")
        raise

# -----------------------------
# ENHANCED APP LOGIC
# -----------------------------
def login(user: str, pwd: str):
    """Enhanced login with logging"""
    if user == USERNAME and pwd == PASSWORD:
        logger.info(f"Successful login for user: {user}")
        return gr.update(visible=False), gr.update(visible=True), ""
    else:
        logger.warning(f"Failed login attempt for user: {user}")
        return gr.update(), gr.update(), "❌ Invalid username or password!"

def analyze_document(student_name: str, student_id: str, file_obj) -> Tuple:
    """Enhanced document analysis with comprehensive error handling"""
    start_time = time.time()

    # Input validation
    if not all([student_name.strip(), student_id.strip(), file_obj]):
        return "❌ Please fill all fields and upload a document.", None, None, None, None, None

    logger.info(f"Starting analysis for {student_name} ({student_id})")

    try:
        # Extract text and metadata
        result = extract_text(file_obj)
        if result is None or result[0] is None:
            return "❌ Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None, None

        text, metadata = result

        # Check for duplicate submission
        duplicate = check_duplicate_submission(metadata['file_hash'])
        if duplicate:
            logger.warning(f"Duplicate submission detected for {student_name}")
            return f"⚠️ Warning: This document was previously analyzed by {duplicate['student_name']} on {duplicate['timestamp']}", None, None, None, None, None

        # Preprocess text
        sentences = preprocess_text(text)
        if len(sentences) < 3:
            return "❌ Error: Document too short for meaningful analysis (minimum 3 sentences required).", None, None, None, None, None

        # AI Detection
        ai_score, ai_details = detect_ai_text(text)
        ai_percentage = ai_score * 100

        # Plagiarism Detection
        plagiarism_score, suspicious_results = enhanced_plagiarism_check(sentences)

        # Calculate processing time
        processing_time = time.time() - start_time

        # Save results
        result_id = save_result(student_id, student_name, ai_percentage, plagiarism_score,
                                metadata, suspicious_results, processing_time)

        # Generate PDF report
        output_pdf = f"reports/{student_id}_{result_id}_report.pdf"
        os.makedirs("reports", exist_ok=True)

        generate_enhanced_pdf_report(student_name, student_id, ai_percentage, plagiarism_score,
                                     suspicious_results, metadata, ai_details, output_pdf)

        # Prepare highlighted text
        suspicious_sentences = [r['sentence'] for r in suspicious_results if r['is_suspicious']]
        if suspicious_sentences:
            highlighted_text = "\n\n".join([f"🚨 FLAGGED: {s[:200]}..." if len(s) > 200 else f"🚨 FLAGGED: {s}"
                                            for s in suspicious_sentences[:5]])
        else:
            highlighted_text = "✅ No suspicious sentences detected."

        # Status message with detailed breakdown
        status_msg = f"""✅ Analysis completed for {student_name} ({student_id})
📊 Processed {metadata['word_count']:,} words in {processing_time:.1f} seconds
🤖 AI Detection: {ai_percentage:.1f}% (Confidence: {ai_details.get('confidence', 'N/A')})
📋 Plagiarism: {plagiarism_score:.1f}% ({len(suspicious_sentences)} flagged sentences)
📄 Report ID: {result_id}"""

        logger.info(f"Analysis completed for {student_name} - AI: {ai_percentage:.1f}%, Plagiarism: {plagiarism_score:.1f}%")

        return (status_msg, round(ai_percentage, 2), round(plagiarism_score, 2),
                output_pdf, highlighted_text, f"📈 Total sentences analyzed: {len(sentences)}")

    except Exception as e:
        logger.error(f"Error during analysis: {e}")
        return f"❌ Error during analysis: {str(e)}", None, None, None, None, None

def show_enhanced_dashboard():
    """Enhanced dashboard with better formatting"""
    try:
        df = load_results()
        if df.empty:
            return pd.DataFrame({"Message": ["No analysis results found. Upload and analyze documents to see data here."]})
        return df
    except Exception as e:
        logger.error(f"Error loading dashboard: {e}")
        return pd.DataFrame({"Error": [f"Failed to load data: {str(e)}"]})

def get_statistics():
    """Get summary statistics"""
    try:
        conn = sqlite3.connect(DB_NAME)
        c = conn.cursor()

        # Basic stats
        c.execute("SELECT COUNT(*), AVG(ai_score), AVG(plagiarism_score), AVG(processing_time) FROM results")
        stats = c.fetchone()

        # High risk documents
        c.execute("SELECT COUNT(*) FROM results WHERE ai_score > 70 OR plagiarism_score > 30")
        high_risk = c.fetchone()[0]

        conn.close()

        if stats[0] == 0:
            return "No analyses completed yet."

        return f"""📊 **Analysis Statistics**
Total Documents Analyzed: {stats[0]:,}
Average AI Score: {stats[1]:.1f}%
Average Plagiarism Score: {stats[2]:.1f}%
Average Processing Time: {stats[3]:.1f}s
High Risk Documents: {high_risk} ({(high_risk/stats[0]*100):.1f}%)"""

    except Exception as e:
        logger.error(f"Error getting statistics: {e}")
        return f"Error loading statistics: {str(e)}"

# -----------------------------
# ENHANCED GRADIO UI
# -----------------------------
def create_enhanced_ui():
    with gr.Blocks(theme="soft", title="AIxBI - Professional Plagiarism Detection") as demo:
        # Header
        with gr.Row():
            if os.path.exists(LOGO_PATH):
                gr.Image(LOGO_PATH, height=80, width=80, show_label=False, container=False)
            with gr.Column():
                gr.Markdown("""
                # 🔍 **AIxBI - Professional Document Analysis Suite**
                ### Advanced AI Detection & Plagiarism Checking System
                *Ensuring Academic Integrity with Cutting-Edge Technology*
                """)

        # Login Section
        login_box = gr.Group(visible=True)
        with login_box:
            gr.Markdown("## 🔐 **Secure Login**")
            with gr.Row():
                user = gr.Textbox(label="👤 Username", placeholder="Enter username")
                pwd = gr.Textbox(label="🔑 Password", type="password", placeholder="Enter password")
            login_btn = gr.Button("🚀 Login", variant="primary", size="lg")
            login_msg = gr.Markdown("", elem_classes="login-message")

        # Main Application
        app_box = gr.Group(visible=False)
        with app_box:
            with gr.Tabs():
                # Analysis Tab
                with gr.Tab("📄 Document Analysis", elem_id="analysis-tab"):
                    with gr.Row():
                        with gr.Column(scale=1):
                            gr.Markdown("### 👨‍🎓 **Student Information**")
                            student_name = gr.Textbox(label="📝 Student Name", placeholder="Enter full name")
                            student_id = gr.Textbox(label="🆔 Student ID", placeholder="Enter student ID")

                        with gr.Column(scale=1):
                            gr.Markdown("### 📎 **Document Upload**")
                            file_upload = gr.File(
                                label="📄 Upload Document",
                                file_types=[".pdf", ".docx", ".txt"],
                                file_count="single"
                            )

                    analyze_btn = gr.Button("🔍 Analyze Document", variant="primary", size="lg")

                    with gr.Row():
                        with gr.Column():
                            status = gr.Textbox(label="📊 Analysis Status", lines=4, interactive=False)
                            doc_info = gr.Textbox(label="📋 Document Information", interactive=False)

                        with gr.Column():
                            with gr.Row():
                                ai_score = gr.Number(label="🤖 AI Detection Score (%)", interactive=False)
                                plagiarism_score = gr.Number(label="📋 Plagiarism Score (%)", interactive=False)

                            suspicious_text = gr.Textbox(
                                label="🚨 Flagged Content",
                                lines=8,
                                placeholder="Suspicious sentences will appear here...",
                                interactive=False
                            )

                            pdf_output = gr.File(label="📄 Download Detailed Report")

                # Dashboard Tab
                with gr.Tab("📊 Analysis Dashboard", elem_id="dashboard-tab"):
                    with gr.Row():
                        dashboard_btn = gr.Button("🔄 Refresh Dashboard", variant="secondary")
                        stats_btn = gr.Button("📈 Show Statistics", variant="secondary")

                    stats_display = gr.Markdown("", elem_classes="stats-display")
                    dashboard = gr.Dataframe(
                        headers=["ID", "Student ID", "Student Name", "AI Score (%)",
                                 "Plagiarism Score (%)", "Word Count", "Flagged Sentences",
                                 "Processing Time (s)", "File Type", "Timestamp", "Status"],
                        interactive=False,
                        wrap=True
                    )

                # Help Tab
                with gr.Tab("❓ Help & Guidelines", elem_id="help-tab"):
                    gr.Markdown("""
                    ## 📖 **User Guide**

                    ### 🎯 **How to Use**
                    1. **Login** with your credentials
                    2. **Enter student information** (name and ID)
                    3. **Upload document** (PDF, DOCX, or TXT format)
                    4. **Click "Analyze Document"** and wait for results
                    5. **Download the detailed PDF report** for comprehensive analysis

                    ### 🔍 **Understanding Results**

                    #### 🤖 **AI Detection Score**
                    - **0-30%**: Low probability of AI-generated content
                    - **31-60%**: Moderate probability - review recommended
                    - **61-100%**: High probability - likely AI-generated

                    #### 📋 **Plagiarism Score**
                    - **0-15%**: Acceptable similarity level
                    - **16-30%**: Moderate concern - check citations
                    - **31%+**: High concern - significant plagiarism detected

                    #### 🚨 **Risk Levels**
                    - **🟢 LOW**: Minimal concerns detected
                    - **🟡 MEDIUM**: Some issues found - review needed
                    - **🔴 HIGH**: Serious concerns - immediate action required

                    ### 📄 **Supported File Formats**
                    - **PDF**: Adobe PDF documents
                    - **DOCX**: Microsoft Word documents
                    - **TXT**: Plain text files

                    ### 🛡️ **Best Practices**
                    - Upload final versions of documents
                    - Ensure documents contain at least 100 words
                    - Review flagged content carefully
                    - Use reports for educational feedback

                    ### ⚠️ **Important Notes**
                    - Analysis results are for educational purposes
                    - False positives may occur - human review recommended
                    - Keep PDF reports for documentation
                    - All analyses are logged for institutional records
                    """)

        # Event Handlers
        login_btn.click(
            fn=login,
            inputs=[user, pwd],
            outputs=[login_box, app_box, login_msg]
        )

        analyze_btn.click(
            fn=analyze_document,
            inputs=[student_name, student_id, file_upload],
            outputs=[status, ai_score, plagiarism_score, pdf_output, suspicious_text, doc_info]
        )

        dashboard_btn.click(
            fn=show_enhanced_dashboard,
            outputs=[dashboard]
        )

        stats_btn.click(
            fn=get_statistics,
            outputs=[stats_display]
        )

    return demo

# -----------------------------
# ADDITIONAL UTILITY FUNCTIONS
# -----------------------------
def cleanup_old_reports(days_old: int = 30):
    """Clean up old report files"""
    try:
        import glob
        report_files = glob.glob("reports/*.pdf")
        current_time = time.time()

        for file_path in report_files:
            if os.path.getmtime(file_path) < (current_time - days_old * 24 * 60 * 60):
                os.remove(file_path)
                logger.info(f"Cleaned up old report: {file_path}")
    except Exception as e:
        logger.error(f"Error during cleanup: {e}")

def export_database_backup():
    """Export database to CSV for backup"""
    try:
        df = load_results()
        backup_file = f"backup_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(backup_file, index=False)
        logger.info(f"Database backup created: {backup_file}")
        return backup_file
    except Exception as e:
        logger.error(f"Error creating backup: {e}")
        return None

def validate_system_requirements():
    """Check if all required components are available"""
    requirements = {
        "Models loaded": embedder is not None and model is not None,
        "Database accessible": os.path.exists(DB_NAME),
        "Reports directory": os.path.exists("reports") or os.makedirs("reports", exist_ok=True) or True,
        "Logo file": os.path.exists(LOGO_PATH)
    }

    for requirement, status in requirements.items():
        if status:
            logger.info(f"✅ {requirement}")
        else:
            logger.warning(f"❌ {requirement}")

    return all(requirements.values())

# -----------------------------
# PERFORMANCE MONITORING
# -----------------------------
def log_performance_metrics():
    """Log system performance metrics"""
    try:
        import psutil
        cpu_percent = psutil.cpu_percent()
        memory_percent = psutil.virtual_memory().percent
        disk_usage = psutil.disk_usage('.').percent

        logger.info(f"Performance - CPU: {cpu_percent}%, Memory: {memory_percent}%, Disk: {disk_usage}%")

        # Log database size
        if os.path.exists(DB_NAME):
            db_size = os.path.getsize(DB_NAME) / (1024 * 1024)  # MB
            logger.info(f"Database size: {db_size:.2f} MB")

    except ImportError:
        logger.warning("psutil not available - performance monitoring disabled")
    except Exception as e:
        logger.error(f"Error logging performance metrics: {e}")

# -----------------------------
# MAIN APPLICATION STARTUP
# -----------------------------
def main():
    """Main application entry point"""
    try:
        logger.info("Starting AIxBI Plagiarism Detection System")

        # Validate system requirements
        if not validate_system_requirements():
            logger.error("System requirements not met. Please check the logs.")
            return

        # Clean up old reports on startup
        cleanup_old_reports()

        # Log performance metrics
        log_performance_metrics()

        # Create and launch the enhanced UI
        demo = create_enhanced_ui()

        logger.info("System ready - launching web interface")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True,
            quiet=False
        )

    except Exception as e:
        logger.error(f"Failed to start application: {e}")
        raise

if __name__ == "__main__":
    main()
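For reference: beyond the standard library, this file implies gradio, pandas, pdfplumber, python-docx, sentence-transformers, transformers, torch, fpdf (or fpdf2), scikit-learn, numpy, and aiohttp as runtime dependencies, with psutil optional for the performance metrics. Running the module (python app.py) would typically launch the Gradio interface on port 7860 per the demo.launch call in main(); the two Hugging Face models are downloaded on first run.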