Update app.py
app.py
CHANGED
@@ -5,249 +5,870 @@ import pandas as pd
(Previous version, lines 5-253 — removed lines recoverable from this view; unchanged context lines reappear in the new version below.)

-# CONFIG
-MAX_SENTENCES_CHECK =
-LOGO_PATH = "aixbi.jpg"
-# DB INIT
-                student_id TEXT,
-                student_name TEXT,
-# MODEL LOADING
-# FILE HANDLING
-        return None
-# AI
-def detect_ai_text(text):
-# DB
-def save_result(student_id, student_name, ai_score, plagiarism_score
-def load_results():
-# PDF REPORT
-    recommendations =
-# APP LOGIC
-def login(user, pwd):
-def analyze(student_name, student_id, file_obj):
-    if file_obj is None or not student_name or not student_id:
-        return "Please fill all fields and upload a document.", None, None, None, None
-    text = extract_text(file_obj)
-    if not text:
-        return "Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None
-    sentences = [s.strip() for s in text.split(". ") if len(s) > 30]
-    # AI Detection
-    ai_score = detect_ai_text(text) * 100
-    # Main App
-    app_box = gr.Group(visible=False)
-    with app_box:
-        with gr.Tab("Check Thesis"):
-            with gr.Row():
-                student_name = gr.Textbox(label="Student Name")
-                student_id = gr.Textbox(label="Student ID")
-            file_upload = gr.File(label="Upload Document", file_types=[".pdf",".docx",".txt"])
-            analyze_btn = gr.Button("Analyze Document", variant="primary")
-            status = gr.Textbox(label="Status")
-            ai_score = gr.Number(label="AI Probability (%)")
-            plagiarism_score = gr.Number(label="Plagiarism Score (%)")
-            suspicious_text = gr.Textbox(label="Suspicious Sentences Highlight", lines=10)
-            pdf_output = gr.File(label="Download PDF Report")
(New version, lines 5-874.)

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from fpdf import FPDF
import logging
import hashlib
from typing import List, Tuple, Optional
import asyncio
import aiohttp
from sklearn.metrics.pairwise import cosine_similarity
import re
import time
import random  # needed by the placeholder checks below; missing from the original hunk

# -----------------------------
# ENHANCED CONFIG
# -----------------------------
DB_NAME = "db.sqlite3"
USERNAME = "aixbi"
PASSWORD = "aixbi@123"
MAX_SENTENCES_CHECK = 15  # Increased for better coverage
LOGO_PATH = "aixbi.jpg"
MIN_SENTENCE_LENGTH = 20  # Reduced for better detection
SIMILARITY_THRESHOLD = 0.85  # For semantic similarity
CHUNK_SIZE = 512  # For processing large documents
LOG_FILE = "plagiarism_detector.log"

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# -----------------------------
# ENHANCED DB INIT
# -----------------------------
def init_db():
    """Enhanced database with additional fields and indexes"""
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()

    # Main results table with more fields
    c.execute("""CREATE TABLE IF NOT EXISTS results (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    student_id TEXT NOT NULL,
                    student_name TEXT NOT NULL,
                    document_hash TEXT,
                    ai_score REAL,
                    plagiarism_score REAL,
                    word_count INTEGER,
                    sentence_count INTEGER,
                    suspicious_sentences_count INTEGER,
                    processing_time REAL,
                    file_type TEXT,
                    timestamp TEXT,
                    status TEXT DEFAULT 'completed'
                )""")

    # Suspicious sentences table for detailed tracking
    c.execute("""CREATE TABLE IF NOT EXISTS suspicious_sentences (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    result_id INTEGER,
                    sentence TEXT,
                    similarity_score REAL,
                    source_found BOOLEAN,
                    FOREIGN KEY (result_id) REFERENCES results (id)
                )""")

    # Create indexes for better performance
    c.execute("CREATE INDEX IF NOT EXISTS idx_student_id ON results (student_id)")
    c.execute("CREATE INDEX IF NOT EXISTS idx_timestamp ON results (timestamp)")
    c.execute("CREATE INDEX IF NOT EXISTS idx_document_hash ON results (document_hash)")

    conn.commit()
    conn.close()

init_db()

# -----------------------------
# ENHANCED MODEL LOADING WITH ERROR HANDLING
# -----------------------------
try:
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    tokenizer = AutoTokenizer.from_pretrained("hello-simpleai/chatgpt-detector-roberta")
    model = AutoModelForSequenceClassification.from_pretrained("hello-simpleai/chatgpt-detector-roberta")
    logger.info("Models loaded successfully")
except Exception as e:
    logger.error(f"Error loading models: {e}")
    raise

# -----------------------------
# ENHANCED FILE HANDLING
# -----------------------------
def calculate_file_hash(file_path: str) -> str:
    """Calculate SHA-256 hash of file for duplicate detection"""
    hash_sha256 = hashlib.sha256()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_sha256.update(chunk)
    return hash_sha256.hexdigest()

def extract_text(file_obj) -> Optional[Tuple[str, dict]]:
    """Enhanced text extraction with metadata"""
    if file_obj is None:
        return None, None

    name = file_obj.name
    ext = os.path.splitext(name)[1].lower()

    # Copy to temp file preserving extension
    with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
        shutil.copy(file_obj.name, tmp.name)
        tmp_path = tmp.name

    metadata = {
        'file_type': ext,
        'file_size': os.path.getsize(tmp_path),
        'file_hash': calculate_file_hash(tmp_path)
    }

    try:
        if ext == ".pdf":
            with pdfplumber.open(tmp_path) as pdf:
                text = " ".join(page.extract_text() or "" for page in pdf.pages)
                metadata['page_count'] = len(pdf.pages)
        elif ext == ".docx":
            doc = docx.Document(tmp_path)
            text = " ".join(p.text for p in doc.paragraphs)
            metadata['paragraph_count'] = len(doc.paragraphs)
        elif ext == ".txt":
            with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        else:
            logger.warning(f"Unsupported file type: {ext}")
            return None, None

    except Exception as e:
        logger.error(f"Error extracting text from {name}: {e}")
        return None, None
    finally:
        try:
            os.unlink(tmp_path)
        except:
            pass

    if not text or len(text.strip()) < 50:
        logger.warning("Extracted text is too short or empty")
        return None, None

    text = text.strip()
    metadata.update({
        'word_count': len(text.split()),
        'char_count': len(text)
    })

    return text, metadata

# -----------------------------
# ENHANCED AI DETECTION WITH CHUNKING
# -----------------------------
def detect_ai_text(text: str) -> Tuple[float, dict]:
    """Enhanced AI detection with confidence scores and chunking for large texts"""
    try:
        # Split into chunks for large texts
        chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
        scores = []
        details = {'chunk_scores': [], 'confidence': 'low'}

        for chunk in chunks[:5]:  # Limit to first 5 chunks for performance
            if len(chunk.strip()) < 20:
                continue

            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
                probabilities = torch.softmax(outputs.logits, dim=1)
                score = probabilities[0][1].item()  # AI probability
                scores.append(score)
                details['chunk_scores'].append(round(score * 100, 2))

        if not scores:
            return 0.0, details

        avg_score = np.mean(scores)
        std_score = np.std(scores) if len(scores) > 1 else 0

        # Determine confidence based on consistency
        if std_score < 0.1:
            details['confidence'] = 'high'
        elif std_score < 0.2:
            details['confidence'] = 'medium'
        else:
            details['confidence'] = 'low'

        details['std_deviation'] = round(std_score, 3)

        return avg_score, details

    except Exception as e:
        logger.error(f"Error in AI detection: {e}")
        return 0.0, {'error': str(e)}
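# NOTE: with CHUNK_SIZE = 512 and the five-chunk cap above, only roughly the first
# 2,560 characters (about 400-450 words) of a document contribute to the AI score;
# longer submissions are effectively scored on their opening pages.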

# -----------------------------
# ENHANCED PLAGIARISM DETECTION
# -----------------------------
def preprocess_text(text: str) -> List[str]:
    """Extract meaningful sentences with better filtering"""
    # Split into sentences using multiple delimiters
    sentences = re.split(r'[.!?]+', text)

    # Clean and filter sentences
    cleaned_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        # Filter out short sentences, headers, page numbers, etc.
        if (len(sentence) >= MIN_SENTENCE_LENGTH and
            not sentence.isdigit() and
            len(sentence.split()) >= 5 and
            not re.match(r'^(page|chapter|\d+)[\s\d]*$', sentence.lower())):
            cleaned_sentences.append(sentence)

    return cleaned_sentences
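# Example (illustrative): for the input "Page 3. The results were consistent across
# all trials that we ran. Done.", only the middle sentence survives the filter above;
# the page marker and the one-word fragment are dropped.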

def semantic_similarity_check(sentences: List[str], suspicious_sentences: List[str]) -> List[Tuple[str, float]]:
    """Check for semantic similarity between sentences"""
    if not sentences or not suspicious_sentences:
        return []

    try:
        # Encode sentences
        sentence_embeddings = embedder.encode(sentences)
        suspicious_embeddings = embedder.encode(suspicious_sentences)

        # Calculate similarities
        similarities = cosine_similarity(sentence_embeddings, suspicious_embeddings)

        high_similarity_pairs = []
        for i, sentence in enumerate(sentences):
            max_similarity = np.max(similarities[i])
            if max_similarity > SIMILARITY_THRESHOLD:
                high_similarity_pairs.append((sentence, max_similarity))

        return high_similarity_pairs

    except Exception as e:
        logger.error(f"Error in semantic similarity check: {e}")
        return []

async def async_web_search(sentence: str, session: aiohttp.ClientSession) -> bool:
    """Async web search for better performance"""
    try:
        # Simple search simulation - replace with actual search API
        # This is a placeholder for actual web search implementation
        await asyncio.sleep(0.1)  # Simulate network delay
        return random.choice([True, False])  # Placeholder result
    except Exception as e:
        logger.error(f"Error in web search: {e}")
        return False
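# A minimal sketch of the "actual search API" call hinted at above, assuming a
# hypothetical JSON search endpoint and API key; SEARCH_API_URL and SEARCH_API_KEY
# are illustrative names, not part of this app.
SEARCH_API_URL = "https://example.com/search"  # hypothetical endpoint

async def async_web_search_via_api(sentence: str, session: aiohttp.ClientSession) -> bool:
    """Illustrative sketch: treat any exact-phrase hit from a search API as a source match."""
    params = {"q": f'"{sentence[:120]}"', "key": os.environ.get("SEARCH_API_KEY", "")}
    async with session.get(SEARCH_API_URL, params=params,
                           timeout=aiohttp.ClientTimeout(total=10)) as resp:
        if resp.status != 200:
            return False
        data = await resp.json()
        return bool(data.get("results"))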

def enhanced_plagiarism_check(sentences: List[str]) -> Tuple[float, List[dict]]:
    """Enhanced plagiarism detection with multiple methods"""
    if not sentences:
        return 0.0, []

    # Sample sentences strategically (beginning, middle, end)
    total_sentences = len(sentences)
    if total_sentences <= MAX_SENTENCES_CHECK:
        samples = sentences
    else:
        # Take samples from different parts of the document
        begin_samples = sentences[:MAX_SENTENCES_CHECK//3]
        middle_start = total_sentences // 2 - MAX_SENTENCES_CHECK//6
        middle_samples = sentences[middle_start:middle_start + MAX_SENTENCES_CHECK//3]
        end_samples = sentences[-(MAX_SENTENCES_CHECK//3):]
        samples = begin_samples + middle_samples + end_samples

    suspicious_results = []

    # Simulate plagiarism detection (replace with actual implementation)
    for sentence in samples:
        # Placeholder for actual plagiarism detection logic
        is_suspicious = len(sentence) > 100 and random.random() > 0.7
        confidence = random.uniform(0.5, 1.0) if is_suspicious else random.uniform(0.0, 0.4)

        suspicious_results.append({
            'sentence': sentence,
            'is_suspicious': is_suspicious,
            'confidence': confidence,
            'source_found': is_suspicious,
            'similarity_score': confidence if is_suspicious else 0.0
        })

    # Calculate overall plagiarism score
    suspicious_count = sum(1 for r in suspicious_results if r['is_suspicious'])
    plagiarism_score = (suspicious_count / len(samples)) * 100 if samples else 0

    return plagiarism_score, suspicious_results
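# A minimal sketch of a non-random scoring path that reuses semantic_similarity_check()
# defined above; it assumes a locally available reference corpus (reference_sentences),
# which this app does not ship - illustrative only.
def plagiarism_check_against_corpus(sentences: List[str],
                                    reference_sentences: List[str]) -> Tuple[float, List[dict]]:
    matches = dict(semantic_similarity_check(sentences, reference_sentences))
    results = [{
        'sentence': s,
        'is_suspicious': s in matches,
        'confidence': float(matches.get(s, 0.0)),
        'source_found': s in matches,
        'similarity_score': float(matches.get(s, 0.0))
    } for s in sentences]
    flagged = sum(1 for r in results if r['is_suspicious'])
    score = (flagged / len(sentences)) * 100 if sentences else 0.0
    return score, results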

# -----------------------------
# ENHANCED DB OPERATIONS
# -----------------------------
def save_result(student_id: str, student_name: str, ai_score: float, plagiarism_score: float,
                metadata: dict, suspicious_results: List[dict], processing_time: float) -> int:
    """Enhanced result saving with detailed information"""
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()

    # Insert main result
    c.execute("""INSERT INTO results
                 (student_id, student_name, document_hash, ai_score, plagiarism_score,
                  word_count, sentence_count, suspicious_sentences_count, processing_time,
                  file_type, timestamp, status)
                 VALUES (?,?,?,?,?,?,?,?,?,?,?,?)""",
              (student_id, student_name, metadata.get('file_hash', ''),
               ai_score, plagiarism_score, metadata.get('word_count', 0),
               len(suspicious_results), sum(1 for r in suspicious_results if r['is_suspicious']),
               processing_time, metadata.get('file_type', ''),
               datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'completed'))

    result_id = c.lastrowid

    # Insert suspicious sentences
    for result in suspicious_results:
        if result['is_suspicious']:
            c.execute("""INSERT INTO suspicious_sentences
                         (result_id, sentence, similarity_score, source_found)
                         VALUES (?,?,?,?)""",
                      (result_id, result['sentence'], result['similarity_score'],
                       result['source_found']))

    conn.commit()
    conn.close()

    logger.info(f"Saved result for {student_name} ({student_id}) - ID: {result_id}")
    return result_id

def load_results() -> pd.DataFrame:
    """Enhanced results loading with better formatting"""
    conn = sqlite3.connect(DB_NAME)
    query = """SELECT id, student_id, student_name,
                      ROUND(ai_score, 2) as ai_score,
                      ROUND(plagiarism_score, 2) as plagiarism_score,
                      word_count, suspicious_sentences_count,
                      ROUND(processing_time, 2) as processing_time,
                      file_type, timestamp, status
               FROM results
               ORDER BY timestamp DESC"""
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df

def check_duplicate_submission(document_hash: str) -> Optional[dict]:
    """Check if document was already analyzed"""
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()
    c.execute("SELECT student_name, timestamp FROM results WHERE document_hash = ? ORDER BY timestamp DESC LIMIT 1",
              (document_hash,))
    result = c.fetchone()
    conn.close()

    if result:
        return {'student_name': result[0], 'timestamp': result[1]}
    return None

# -----------------------------
# ENHANCED PDF REPORT
# -----------------------------
class EnhancedPDF(FPDF):
    def header(self):
        if os.path.exists(LOGO_PATH):
            self.image(LOGO_PATH, 10, 8, 20)
        self.set_font('Arial', 'B', 15)
        self.cell(0, 10, 'AIxBI - Professional Plagiarism Analysis Report', 0, 1, 'C')
        self.ln(10)

    def footer(self):
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()} | Generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
                  0, 0, 'C')

    def add_section_header(self, title: str):
        self.set_font('Arial', 'B', 12)
        self.set_fill_color(200, 220, 255)
        self.cell(0, 10, title, 0, 1, 'L', 1)
        self.ln(2)

    def add_highlighted_text(self, text: str, color: tuple, max_length: int = 100):
        self.set_fill_color(*color)
        # Truncate long text
        display_text = text[:max_length] + "..." if len(text) > max_length else text
        self.multi_cell(0, 8, display_text, 1, 'L', 1)
        self.ln(2)

def generate_enhanced_pdf_report(student_name: str, student_id: str, ai_score: float,
                                 plagiarism_score: float, suspicious_results: List[dict],
                                 metadata: dict, ai_details: dict, output_path: str):
    """Generate comprehensive PDF report"""
    pdf = EnhancedPDF()
    pdf.add_page()

    # Executive Summary
    pdf.add_section_header("EXECUTIVE SUMMARY")
    pdf.set_font('Arial', '', 10)

    summary_data = [
        f"Student: {student_name} ({student_id})",
        f"Document Type: {metadata.get('file_type', 'Unknown').upper()}",
        f"Word Count: {metadata.get('word_count', 0):,}",
        f"AI Detection Score: {ai_score:.1f}% (Confidence: {ai_details.get('confidence', 'N/A')})",
        f"Plagiarism Score: {plagiarism_score:.1f}%",
        f"Suspicious Sentences: {sum(1 for r in suspicious_results if r['is_suspicious'])}",
        f"Analysis Date: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}"
    ]

    for item in summary_data:
        pdf.cell(0, 6, item, 0, 1)
    pdf.ln(5)

    # Risk Assessment
    pdf.add_section_header("RISK ASSESSMENT")
    pdf.set_font('Arial', '', 10)

    risk_level = "HIGH" if (ai_score > 70 or plagiarism_score > 30) else "MEDIUM" if (ai_score > 40 or plagiarism_score > 15) else "LOW"
    risk_color = (255, 200, 200) if risk_level == "HIGH" else (255, 255, 200) if risk_level == "MEDIUM" else (200, 255, 200)

    pdf.set_fill_color(*risk_color)
    pdf.cell(0, 10, f"Overall Risk Level: {risk_level}", 1, 1, 'C', 1)
    pdf.ln(5)

    # AI Detection Details
    if ai_details.get('chunk_scores'):
        pdf.add_section_header("AI DETECTION ANALYSIS")
        pdf.set_font('Arial', '', 9)
        pdf.cell(0, 6, f"Chunks Analyzed: {len(ai_details['chunk_scores'])}", 0, 1)
        pdf.cell(0, 6, f"Score Consistency (Std Dev): {ai_details.get('std_deviation', 'N/A')}", 0, 1)
        pdf.ln(3)

    # Suspicious Content
    suspicious_sentences = [r for r in suspicious_results if r['is_suspicious']]
    if suspicious_sentences:
        pdf.add_section_header("FLAGGED CONTENT")
        pdf.set_font('Arial', '', 9)

        for i, result in enumerate(suspicious_sentences[:10], 1):  # Limit to 10
            pdf.cell(0, 6, f"Issue #{i} (Confidence: {result['confidence']:.1f})", 0, 1)
            pdf.add_highlighted_text(result['sentence'], (255, 230, 230), 150)

    # Recommendations
    pdf.add_section_header("RECOMMENDATIONS")
    pdf.set_font('Arial', '', 10)

    recommendations = []
    if ai_score > 50:
        recommendations.append("• Review content for AI-generated sections and rewrite in original voice")
    if plagiarism_score > 20:
        recommendations.append("• Add proper citations for referenced material")
        recommendations.append("• Paraphrase flagged sentences to ensure originality")
    if len(suspicious_sentences) > 5:
        recommendations.append("• Conduct thorough revision focusing on highlighted sections")

    recommendations.extend([
        "• Use plagiarism detection tools during writing process",
        "• Ensure all sources are properly attributed",
        "• Maintain academic integrity standards"
    ])

    for rec in recommendations:
        pdf.multi_cell(0, 6, rec)
        pdf.ln(1)

    try:
        pdf.output(output_path)
        logger.info(f"PDF report generated: {output_path}")
    except Exception as e:
        logger.error(f"Error generating PDF report: {e}")
        raise

# -----------------------------
# ENHANCED APP LOGIC
# -----------------------------
def login(user: str, pwd: str):
    """Enhanced login with logging"""
    if user == USERNAME and pwd == PASSWORD:
        logger.info(f"Successful login for user: {user}")
        return gr.update(visible=False), gr.update(visible=True), ""
    else:
        logger.warning(f"Failed login attempt for user: {user}")
        return gr.update(), gr.update(), "❌ Invalid username or password!"

def analyze_document(student_name: str, student_id: str, file_obj) -> Tuple:
    """Enhanced document analysis with comprehensive error handling"""
    start_time = time.time()

    # Input validation
    if not all([student_name.strip(), student_id.strip(), file_obj]):
        return "❌ Please fill all fields and upload a document.", None, None, None, None, None

    logger.info(f"Starting analysis for {student_name} ({student_id})")

    try:
        # Extract text and metadata
        result = extract_text(file_obj)
        if result is None or result[0] is None:
            return "❌ Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None, None

        text, metadata = result

        # Check for duplicate submission
        duplicate = check_duplicate_submission(metadata['file_hash'])
        if duplicate:
            logger.warning(f"Duplicate submission detected for {student_name}")
            return f"⚠️ Warning: This document was previously analyzed by {duplicate['student_name']} on {duplicate['timestamp']}", None, None, None, None, None

        # Preprocess text
        sentences = preprocess_text(text)
        if len(sentences) < 3:
            return "❌ Error: Document too short for meaningful analysis (minimum 3 sentences required).", None, None, None, None, None

        # AI Detection
        ai_score, ai_details = detect_ai_text(text)
        ai_percentage = ai_score * 100

        # Plagiarism Detection
        plagiarism_score, suspicious_results = enhanced_plagiarism_check(sentences)

        # Calculate processing time
        processing_time = time.time() - start_time

        # Save results
        result_id = save_result(student_id, student_name, ai_percentage, plagiarism_score,
                                metadata, suspicious_results, processing_time)

        # Generate PDF report
        output_pdf = f"reports/{student_id}_{result_id}_report.pdf"
        os.makedirs("reports", exist_ok=True)

        generate_enhanced_pdf_report(student_name, student_id, ai_percentage, plagiarism_score,
                                     suspicious_results, metadata, ai_details, output_pdf)

        # Prepare highlighted text
        suspicious_sentences = [r['sentence'] for r in suspicious_results if r['is_suspicious']]
        if suspicious_sentences:
            highlighted_text = "\n\n".join([f"🚨 FLAGGED: {s[:200]}..." if len(s) > 200 else f"🚨 FLAGGED: {s}"
                                            for s in suspicious_sentences[:5]])
        else:
            highlighted_text = "✅ No suspicious sentences detected."

        # Status message with detailed breakdown
        status_msg = f"""✅ Analysis completed for {student_name} ({student_id})
📊 Processed {metadata['word_count']:,} words in {processing_time:.1f} seconds
🤖 AI Detection: {ai_percentage:.1f}% (Confidence: {ai_details.get('confidence', 'N/A')})
📋 Plagiarism: {plagiarism_score:.1f}% ({len(suspicious_sentences)} flagged sentences)
📄 Report ID: {result_id}"""

        logger.info(f"Analysis completed for {student_name} - AI: {ai_percentage:.1f}%, Plagiarism: {plagiarism_score:.1f}%")

        return (status_msg, round(ai_percentage, 2), round(plagiarism_score, 2),
                output_pdf, highlighted_text, f"📈 Total sentences analyzed: {len(sentences)}")

    except Exception as e:
        logger.error(f"Error during analysis: {e}")
        return f"❌ Error during analysis: {str(e)}", None, None, None, None, None

def show_enhanced_dashboard():
    """Enhanced dashboard with better formatting"""
    try:
        df = load_results()
        if df.empty:
            return pd.DataFrame({"Message": ["No analysis results found. Upload and analyze documents to see data here."]})
        return df
    except Exception as e:
        logger.error(f"Error loading dashboard: {e}")
        return pd.DataFrame({"Error": [f"Failed to load data: {str(e)}"]})

def get_statistics():
    """Get summary statistics"""
    try:
        conn = sqlite3.connect(DB_NAME)
        c = conn.cursor()

        # Basic stats
        c.execute("SELECT COUNT(*), AVG(ai_score), AVG(plagiarism_score), AVG(processing_time) FROM results")
        stats = c.fetchone()

        # High risk documents
        c.execute("SELECT COUNT(*) FROM results WHERE ai_score > 70 OR plagiarism_score > 30")
        high_risk = c.fetchone()[0]

        conn.close()

        if stats[0] == 0:
            return "No analyses completed yet."

        return f"""📊 **Analysis Statistics**
Total Documents Analyzed: {stats[0]:,}
Average AI Score: {stats[1]:.1f}%
Average Plagiarism Score: {stats[2]:.1f}%
Average Processing Time: {stats[3]:.1f}s
High Risk Documents: {high_risk} ({(high_risk/stats[0]*100):.1f}%)"""

    except Exception as e:
        logger.error(f"Error getting statistics: {e}")
        return f"Error loading statistics: {str(e)}"

# -----------------------------
# ENHANCED GRADIO UI
# -----------------------------
def create_enhanced_ui():
    with gr.Blocks(theme="soft", title="AIxBI - Professional Plagiarism Detection") as demo:
        # Header
        with gr.Row():
            if os.path.exists(LOGO_PATH):
                gr.Image(LOGO_PATH, height=80, width=80, show_label=False, container=False)
            with gr.Column():
                gr.Markdown("""
                # 🔍 **AIxBI - Professional Document Analysis Suite**
                ### Advanced AI Detection & Plagiarism Checking System
                *Ensuring Academic Integrity with Cutting-Edge Technology*
                """)

        # Login Section
        login_box = gr.Group(visible=True)
        with login_box:
            gr.Markdown("## 🔐 **Secure Login**")
            with gr.Row():
                user = gr.Textbox(label="👤 Username", placeholder="Enter username")
                pwd = gr.Textbox(label="🔑 Password", type="password", placeholder="Enter password")
            login_btn = gr.Button("🚀 Login", variant="primary", size="lg")
            login_msg = gr.Markdown("", elem_classes="login-message")

        # Main Application
        app_box = gr.Group(visible=False)
        with app_box:
            with gr.Tabs():
                # Analysis Tab
                with gr.Tab("📄 Document Analysis", elem_id="analysis-tab"):
                    with gr.Row():
                        with gr.Column(scale=1):
                            gr.Markdown("### 👨‍🎓 **Student Information**")
                            student_name = gr.Textbox(label="📝 Student Name", placeholder="Enter full name")
                            student_id = gr.Textbox(label="🆔 Student ID", placeholder="Enter student ID")

                        with gr.Column(scale=1):
                            gr.Markdown("### 📎 **Document Upload**")
                            file_upload = gr.File(
                                label="📄 Upload Document",
                                file_types=[".pdf", ".docx", ".txt"],
                                file_count="single"
                            )

                    analyze_btn = gr.Button("🔍 Analyze Document", variant="primary", size="lg")

                    with gr.Row():
                        with gr.Column():
                            status = gr.Textbox(label="📊 Analysis Status", lines=4, interactive=False)
                            doc_info = gr.Textbox(label="📋 Document Information", interactive=False)

                        with gr.Column():
                            with gr.Row():
                                ai_score = gr.Number(label="🤖 AI Detection Score (%)", interactive=False)
                                plagiarism_score = gr.Number(label="📋 Plagiarism Score (%)", interactive=False)

                            suspicious_text = gr.Textbox(
                                label="🚨 Flagged Content",
                                lines=8,
                                placeholder="Suspicious sentences will appear here...",
                                interactive=False
                            )

                            pdf_output = gr.File(label="📄 Download Detailed Report")

                # Dashboard Tab
                with gr.Tab("📊 Analysis Dashboard", elem_id="dashboard-tab"):
                    with gr.Row():
                        dashboard_btn = gr.Button("🔄 Refresh Dashboard", variant="secondary")
                        stats_btn = gr.Button("📈 Show Statistics", variant="secondary")

                    stats_display = gr.Markdown("", elem_classes="stats-display")
                    dashboard = gr.Dataframe(
                        headers=["ID", "Student ID", "Student Name", "AI Score (%)",
                                 "Plagiarism Score (%)", "Word Count", "Flagged Sentences",
                                 "Processing Time (s)", "File Type", "Timestamp", "Status"],
                        interactive=False,
                        wrap=True
                    )

                # Help Tab
                with gr.Tab("❓ Help & Guidelines", elem_id="help-tab"):
                    gr.Markdown("""
                    ## 📖 **User Guide**

                    ### 🎯 **How to Use**
                    1. **Login** with your credentials
                    2. **Enter student information** (name and ID)
                    3. **Upload document** (PDF, DOCX, or TXT format)
                    4. **Click "Analyze Document"** and wait for results
                    5. **Download the detailed PDF report** for comprehensive analysis

                    ### 🔍 **Understanding Results**

                    #### 🤖 **AI Detection Score**
                    - **0-30%**: Low probability of AI-generated content
                    - **31-60%**: Moderate probability - review recommended
                    - **61-100%**: High probability - likely AI-generated

                    #### 📋 **Plagiarism Score**
                    - **0-15%**: Acceptable similarity level
                    - **16-30%**: Moderate concern - check citations
                    - **31%+**: High concern - significant plagiarism detected

                    #### 🚨 **Risk Levels**
                    - **🟢 LOW**: Minimal concerns detected
                    - **🟡 MEDIUM**: Some issues found - review needed
                    - **🔴 HIGH**: Serious concerns - immediate action required

                    ### 📄 **Supported File Formats**
                    - **PDF**: Adobe PDF documents
                    - **DOCX**: Microsoft Word documents
                    - **TXT**: Plain text files

                    ### 🛡️ **Best Practices**
                    - Upload final versions of documents
                    - Ensure documents contain at least 100 words
                    - Review flagged content carefully
                    - Use reports for educational feedback

                    ### ⚠️ **Important Notes**
                    - Analysis results are for educational purposes
                    - False positives may occur - human review recommended
                    - Keep PDF reports for documentation
                    - All analyses are logged for institutional records
                    """)

        # Event Handlers
        login_btn.click(
            fn=login,
            inputs=[user, pwd],
            outputs=[login_box, app_box, login_msg]
        )

        analyze_btn.click(
            fn=analyze_document,
            inputs=[student_name, student_id, file_upload],
            outputs=[status, ai_score, plagiarism_score, pdf_output, suspicious_text, doc_info]
        )

        dashboard_btn.click(
            fn=show_enhanced_dashboard,
            outputs=[dashboard]
        )

        stats_btn.click(
            fn=get_statistics,
            outputs=[stats_display]
        )

    return demo

# -----------------------------
# ADDITIONAL UTILITY FUNCTIONS
# -----------------------------
def cleanup_old_reports(days_old: int = 30):
    """Clean up old report files"""
    try:
        import glob
        report_files = glob.glob("reports/*.pdf")
        current_time = time.time()

        for file_path in report_files:
            if os.path.getmtime(file_path) < (current_time - days_old * 24 * 60 * 60):
                os.remove(file_path)
                logger.info(f"Cleaned up old report: {file_path}")
    except Exception as e:
        logger.error(f"Error during cleanup: {e}")

def export_database_backup():
    """Export database to CSV for backup"""
    try:
        df = load_results()
        backup_file = f"backup_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(backup_file, index=False)
        logger.info(f"Database backup created: {backup_file}")
        return backup_file
    except Exception as e:
        logger.error(f"Error creating backup: {e}")
        return None

def validate_system_requirements():
    """Check if all required components are available"""
    requirements = {
        "Models loaded": embedder is not None and model is not None,
        "Database accessible": os.path.exists(DB_NAME),
        "Reports directory": os.path.exists("reports") or os.makedirs("reports", exist_ok=True) or True,
        "Logo file": os.path.exists(LOGO_PATH)
    }

    for requirement, status in requirements.items():
        if status:
            logger.info(f"✅ {requirement}")
        else:
            logger.warning(f"❌ {requirement}")

    return all(requirements.values())

# -----------------------------
# PERFORMANCE MONITORING
# -----------------------------
def log_performance_metrics():
    """Log system performance metrics"""
    try:
        import psutil
        cpu_percent = psutil.cpu_percent()
        memory_percent = psutil.virtual_memory().percent
        disk_usage = psutil.disk_usage('.').percent

        logger.info(f"Performance - CPU: {cpu_percent}%, Memory: {memory_percent}%, Disk: {disk_usage}%")

        # Log database size
        if os.path.exists(DB_NAME):
            db_size = os.path.getsize(DB_NAME) / (1024 * 1024)  # MB
            logger.info(f"Database size: {db_size:.2f} MB")

    except ImportError:
        logger.warning("psutil not available - performance monitoring disabled")
    except Exception as e:
        logger.error(f"Error logging performance metrics: {e}")

# -----------------------------
# MAIN APPLICATION STARTUP
# -----------------------------
def main():
    """Main application entry point"""
    try:
        logger.info("Starting AIxBI Plagiarism Detection System")

        # Validate system requirements
        if not validate_system_requirements():
            logger.error("System requirements not met. Please check the logs.")
            return

        # Clean up old reports on startup
        cleanup_old_reports()

        # Log performance metrics
        log_performance_metrics()

        # Create and launch the enhanced UI
        demo = create_enhanced_ui()

        logger.info("System ready - launching web interface")
        demo.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True,
            quiet=False
        )

    except Exception as e:
        logger.error(f"Failed to start application: {e}")
        raise

if __name__ == "__main__":
    main()
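For reference: beyond the standard library, this file implies gradio, pandas, pdfplumber, python-docx, sentence-transformers, transformers, torch, fpdf (or fpdf2), scikit-learn, numpy, and aiohttp as runtime dependencies, with psutil optional for the performance metrics. Running the module (python app.py) would typically launch the Gradio interface on port 7860 per the demo.launch call in main(); the two Hugging Face models are downloaded on first run.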