File size: 15,491 Bytes
4d5c005
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ee5b4a7
4d5c005
 
d6ff276
4d5c005
d6ff276
 
4d5c005
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8fa2bc
 
4d5c005
 
 
 
e8fa2bc
 
4d5c005
e8fa2bc
4d5c005
e8fa2bc
 
4d5c005
 
 
 
 
 
 
 
 
 
e8fa2bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d5c005
 
 
 
 
 
 
 
 
e8fa2bc
 
4d5c005
e8fa2bc
 
 
4d5c005
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
from flask import Flask, render_template, request, jsonify, redirect, url_for, flash, session
from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, current_user
from flask_wtf.csrf import CSRFProtect
from flask_wtf import FlaskForm
from wtforms import StringField, PasswordField, SubmitField
from wtforms.validators import DataRequired
from werkzeug.security import generate_password_hash, check_password_hash
import arxiv
import requests
import PyPDF2
from io import BytesIO
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from functools import lru_cache
import time
import os
from dotenv import load_dotenv
import json
from datetime import datetime
import firebase_admin
from firebase_admin import credentials, auth

# Load environment variables (python-dotenv) before any os.getenv() call below
load_dotenv()

app = Flask(__name__)
# Secret used by Flask for the signed session cookie. os.getenv() returns None
# when FLASK_SECRET_KEY is unset, in which case session use will fail at runtime.
app.secret_key = os.getenv('FLASK_SECRET_KEY')

# Initialize CSRF protection for the whole app
# NOTE(review): the JSON POST routes in this file presumably rely on the
# frontend sending the CSRF token - confirm against the templates/JS.
csrf = CSRFProtect()
csrf.init_app(app)

# Initialize Flask-Login; 'login' is the endpoint name of the login() view,
# used as the redirect target for @login_required routes.
login_manager = LoginManager()
login_manager.init_app(app)
login_manager.login_view = 'login'

# Initialize Groq chat model shared by generate_analysis() and chat_with_paper()
groq_api_key = os.getenv('GROQ_API_KEY')
llm = ChatGroq(
    temperature=0.3,
    groq_api_key=groq_api_key,
    model_name="qwen-2.5-32b"
)

# Initialize embeddings with an explicit cache directory (used by chat_with_paper()).
# NOTE(review): the absolute path looks container-specific - confirm it exists
# and is writable in every deployment target.
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="/code/.cache/huggingface"
)

# Constants
# Upper bound on the number of text chunks kept per PDF (process_pdf, chat_with_paper).
MAX_CHUNKS = 50
# Character limit applied to each LLM reply in generate_analysis().
MAX_RESPONSE_LENGTH = 6000
# Not referenced anywhere in the visible part of this file.
CACHE_DURATION = 3600  # 1 hour in seconds

# Form Classes
class LoginForm(FlaskForm):
    """Login form: required username and password fields plus a submit button.

    No route in the visible part of this file instantiates this form.
    """
    # Both text inputs must be non-empty (DataRequired).
    username = StringField('Username', validators=[DataRequired()])
    password = PasswordField('Password', validators=[DataRequired()])
    submit = SubmitField('Login')

class RegisterForm(FlaskForm):
    """Registration form: required username and password fields plus a submit button.

    No route in the visible part of this file instantiates this form.
    """
    # Both text inputs must be non-empty (DataRequired).
    username = StringField('Username', validators=[DataRequired()])
    password = PasswordField('Password', validators=[DataRequired()])
    submit = SubmitField('Register')

# User class for Flask-Login
# User class for Flask-Login
class User(UserMixin):
    """In-memory user object handed to Flask-Login.

    Instances are built from the uid/email pair kept in the session
    (see load_user and verify_token); nothing is persisted by this class.
    """

    def __init__(self, user_id, email) -> None:
        # `id` is the attribute UserMixin.get_id() reads to identify the user.
        self.id = user_id
        self.email = email

def generate_analysis(chunks):
    """Run every analysis prompt against the opening chunks of a paper.

    Args:
        chunks: sequence of text chunks extracted from the paper. Only the
            first three are sent to the model, with non-ASCII characters
            dropped.

    Returns:
        dict mapping each prompt name (currently only 'executive_summary')
        to the model's reply, truncated to MAX_RESPONSE_LENGTH characters.
        If anything fails for a prompt, its value is the string
        "Analysis failed: <error>" instead; the function itself does not raise.
    """
    prompt_templates = {
        'executive_summary': """
## 🧠 Role  
You are an AI assistant that explains research papers in a way that makes reading the original paper unnecessary. Your explanations should be **clear, engaging, and easy to understand**, even for someone who is not deeply familiar with the subject.  

## 🎯 Goal  
Given any research paper, provide a **simple breakdown** covering:  

### 1️⃣ What problem does this paper solve?  
- Explain the **issue the paper addresses**.  
- Why is this problem **important**?  
- What **challenges** existed before this research?  

### 2️⃣ How does it solve the problem?  
- Summarize the **key idea, method, or approach** used in the paper.  
- If applicable, break it down into **steps or components**.  
- Compare it to **previous solutions** and highlight what makes it better.  

### 3️⃣ Why does this matter? (Real-world impact & applications)  
- How can this research be **used in practice**?  
- What **industries or fields** benefit from it?  
- Does it improve **efficiency, accuracy, cost, or scalability**?  

### 4️⃣ Explain with a simple analogy (if applicable)  
- Use a **real-life example** to explain complex ideas.  
- Keep it **relatable** (e.g., compare it to something like cooking, traveling, or streaming music).  

### 5️⃣ Key findings & results  
- Summarize the **main results** in simple terms.  
- If possible, include **numbers, graphs, or comparisons** for clarity.  

### 6️⃣ Limitations & Future Work  
- Mention any **weaknesses** or areas for improvement.  
- What are the **next steps** for research in this area?  

### 7️⃣ Final Takeaway (One-liner summary)  
- Provide a **quick summary** of the research in a **single sentence**.  

---

## 🎨 Tone & Style  
✔ **Simple & clear language** – Avoid jargon unless necessary.  
✔ **Step-by-step explanations** – Organize information logically.  
✔ **Engaging & structured** – Use bullet points, lists, or tables when needed.  
✔ **Make it feel like a story** – Guide the reader smoothly from problem to solution.  

---

## ⚡ How to Use This Prompt  
1️⃣ Enter the **title, abstract, or full text** of any research paper.  
2️⃣ AI will generate a **detailed explanation** that makes the paper easy to understand.  
3️⃣ Use it for **blog posts, study guides, or an AI-powered research assistant**.  


Remember: The output should be properly formatted in markdown while providing comprehensive coverage of the paper's content."""
    }

    outcomes = {}

    for section_name, instructions in prompt_templates.items():
        try:
            # Context is limited to the first three chunks; characters outside
            # ASCII are discarded before the text reaches the model.
            leading_chunks = chunks[:3]
            ascii_chunks = [
                piece.encode('ascii', 'ignore').decode('ascii')
                for piece in leading_chunks
            ]
            context = "\n\n".join(ascii_chunks)

            full_prompt = f"""Based on the following context from a research paper, {instructions}
                
                Context:
                {context}

                Additional Instructions:
                - Provide specific examples and evidence from the text
                - Use clear, academic language
                - Maintain objectivity
                - Include relevant quotes or data points
                - Structure your response logically
                - Use markdown formatting for clarity

                Please provide a clear and specific response."""

            reply = llm.invoke(full_prompt, temperature=0.3)
            outcomes[section_name] = reply.content[:MAX_RESPONSE_LENGTH]
        except Exception as e:
            # A failure is reported in place of the section text so the caller
            # still receives an entry for every prompt.
            outcomes[section_name] = f"Analysis failed: {str(e)}"

    return outcomes

def process_pdf(pdf_url):
    """Download a PDF, extract its text and run the LLM analysis on it.

    Args:
        pdf_url: URL of the PDF to fetch (30-second request timeout).

    Returns:
        {'success': True, 'analysis': <dict from generate_analysis>} on
        success, or {'error': <message>} when no text could be extracted or
        any step raised. Exceptions are converted to the error form rather
        than propagated.
    """
    try:
        print(f"Starting PDF processing for: {pdf_url}")

        download = requests.get(pdf_url, timeout=30)
        download.raise_for_status()

        reader = PyPDF2.PdfReader(BytesIO(download.content))

        # Per-page text with every non-ASCII character removed, joined by spaces.
        page_texts = [
            page.extract_text().encode('ascii', 'ignore').decode('ascii')
            for page in reader.pages
        ]
        text = " ".join(page_texts)

        if not text.strip():
            return {'error': 'No text could be extracted from the PDF'}

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )

        # Keep at most MAX_CHUNKS chunks of the document.
        chunks = splitter.split_text(text)[:MAX_CHUNKS]

        return {
            'success': True,
            'analysis': generate_analysis(chunks),
        }

    except Exception as e:
        return {'error': f"PDF processing failed: {str(e)}"}


@login_manager.user_loader
def load_user(user_id):
    """Flask-Login user loader: rebuild the User from the session.

    The `user_id` argument is not consulted; the user is reconstructed purely
    from the 'user_data' entry of the current session. Returns None when the
    session has no such entry.
    """
    if 'user_data' not in session:
        return None

    profile = session['user_data']
    return User(profile['uid'], profile['email'])

# User management functions
def load_users():
    """Return the parsed contents of users.json from the working directory.

    Returns an empty dict when the file does not exist. Any other error
    (for example invalid JSON) propagates to the caller.
    """
    try:
        with open('users.json', 'r') as users_file:
            raw_json = users_file.read()
            return json.loads(raw_json)
    except FileNotFoundError:
        return {}

def save_users(users):
    """Write `users` as JSON to users.json in the working directory.

    An existing file is overwritten.
    """
    with open('users.json', mode='w') as users_file:
        json.dump(users, fp=users_file)

# Routes
@app.route('/')
@login_required
def index():
    """Render the main page (index.html); only reachable when logged in."""
    return render_template('index.html')

@app.route('/login', methods=['GET'])
def login():
    """Show the login page, or send an already logged-in user to the index."""
    if not current_user.is_authenticated:
        return render_template('login.html')

    return redirect(url_for('index'))

@app.route('/register', methods=['GET'])
def register():
    """Show the registration page, or send a logged-in user to the index."""
    if not current_user.is_authenticated:
        return render_template('register.html')

    # Logged-in visitors have nothing to register; note it and bounce them home.
    print("User is already authenticated")
    return redirect(url_for('index'))

@app.route('/verify-token', methods=['POST'])
def verify_token():
    """Start a logged-in session from the uid/email posted by the client.

    Expects a JSON object with non-empty 'uid' and 'email' values.

    Returns:
        200 with {'success': True, 'redirect': <index URL>} once the session
        is established; 400 with {'error': ...} when the body is missing, is
        not a JSON object, or lacks uid/email; 500 with {'error': ...} on any
        unexpected failure.

    SECURITY NOTE(review): despite the route name, nothing here verifies a
    token. The uid and email are accepted exactly as the client sent them, so
    any caller can start a session as any user. firebase_admin.auth is
    imported by this file but not called in this route. Verifying an ID token
    server-side requires a frontend/credentials contract that is not visible
    here, so it is flagged rather than changed.
    """
    try:
        # silent=True: a missing or malformed JSON body yields None instead of
        # raising an HTTP error that the broad handler below would report as 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'error': 'Missing required data'}), 400

        uid = data.get('uid')
        email = data.get('email')
        if not uid or not email:
            return jsonify({'error': 'Missing required data'}), 400

        # Store user data in session; load_user() rebuilds the User from it
        # on later requests.
        session['user_data'] = {
            'uid': uid,
            'email': email
        }

        # Create and login user
        login_user(User(uid, email))

        return jsonify({'success': True, 'redirect': url_for('index')})
    except Exception as e:
        print(f"Verification error: {str(e)}")  # Add logging
        return jsonify({'error': str(e)}), 500

@app.route('/logout')
@login_required
def logout():
    """Log the current user out, wipe the session and go to the login page."""
    logout_user()
    # Also drops the 'user_data' entry that load_user() reads.
    session.clear()
    return redirect(url_for('login'))

@app.route('/search', methods=['POST'])
@login_required
def search():
    """Search arXiv and return the hits ordered by a popularity heuristic.

    Expects a JSON object with:
        paper_name: the query string (required).
        sort_by: 'relevance' (default) or 'recent'; anything else falls back
            to relevance.
        max_results: positive integer, default 20, capped at 100.

    Returns:
        200 with a JSON list of paper dicts (title, authors, abstract,
        pdf_link, arxiv_link, published, category, comment, doi,
        popularity_score); 400 with {'error': ...} for a missing query or an
        invalid max_results; 500 with {'error': ...} when the search fails.
    """
    # Upper bound on a client-supplied value so one request cannot make the
    # server page through an arbitrarily large result set.
    max_results_cap = 100

    try:
        # silent=True: a missing or malformed body becomes None (-> 400 below)
        # instead of an exception reported as 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'error': 'No search query provided'}), 400

        paper_name = data.get('paper_name')
        if not paper_name:
            return jsonify({'error': 'No search query provided'}), 400

        sort_by = data.get('sort_by', 'relevance')  # Default to relevance

        raw_max_results = data.get('max_results')
        if raw_max_results is None:
            raw_max_results = 20  # Extra candidates for the popularity re-ranking
        try:
            max_results = int(raw_max_results)
        except (TypeError, ValueError):
            return jsonify({'error': 'max_results must be a positive integer'}), 400
        if max_results < 1:
            return jsonify({'error': 'max_results must be a positive integer'}), 400
        max_results = min(max_results, max_results_cap)

        # Configure sorting based on user preference
        sort_options = {
            'relevance': arxiv.SortCriterion.Relevance,
            'recent': arxiv.SortCriterion.SubmittedDate
        }
        # Non-string values (e.g. a list) would be unhashable as a dict key.
        sort_criterion = (
            sort_options.get(sort_by, arxiv.SortCriterion.Relevance)
            if isinstance(sort_by, str)
            else arxiv.SortCriterion.Relevance
        )

        # Perform the search (named so it does not shadow this view function)
        arxiv_search = arxiv.Search(
            query=paper_name,
            max_results=max_results,
            sort_by=sort_criterion
        )

        results = []
        for paper in arxiv_search.results():
            # arXiv does not provide citation counts, so popularity is
            # approximated: a DOI (journal publication) is worth 10 points and
            # each author up to five adds one.
            doi = getattr(paper, 'doi', None)
            author_count = len(paper.authors)
            popularity_score = (10 if doi is not None else 0) + min(author_count, 5)

            results.append({
                'title': paper.title,
                'authors': ', '.join(author.name for author in paper.authors),
                'abstract': paper.summary,
                'pdf_link': paper.pdf_url,
                'arxiv_link': paper.entry_id,
                'published': paper.published.strftime('%Y-%m-%d'),
                'category': paper.primary_category,
                'comment': getattr(paper, 'comment', None),
                'doi': doi,
                'popularity_score': popularity_score
            })

        # Sort results by our popularity score (higher is better). The sort is
        # stable, so papers with equal scores keep the order arXiv returned.
        results.sort(key=lambda x: x['popularity_score'], reverse=True)

        return jsonify(results)

    except Exception as e:
        print(f"Search error: {str(e)}")
        return jsonify({'error': f'Failed to search papers: {str(e)}'}), 500

@app.route('/perform-rag', methods=['POST'])
@login_required
def perform_rag():
    """Analyse the PDF at the posted URL and return the result as JSON.

    Expects a JSON object with a non-empty 'pdf_url'.

    Returns:
        200 with the dict produced by process_pdf() on success; 400 with
        {'error': ...} when the body is missing, is not a JSON object, or has
        no pdf_url; 500 with {'error': ...} when processing fails.
    """
    try:
        # silent=True: a missing or malformed body becomes None, so it is
        # answered with 400 below rather than raising into the 500 handler.
        data = request.get_json(silent=True)
        pdf_url = data.get('pdf_url') if isinstance(data, dict) else None
        if not pdf_url:
            return jsonify({'error': 'PDF URL is required'}), 400

        result = process_pdf(pdf_url)

        # process_pdf() reports failures as {'error': ...} instead of raising.
        if 'error' in result:
            return jsonify({'error': result['error']}), 500

        return jsonify(result)

    except Exception as e:
        return jsonify({'error': str(e)}), 500

@app.route('/chat-with-paper', methods=['POST'])
@login_required
def chat_with_paper():
    """Answer a question about a PDF using its most relevant text chunks.

    Expects a JSON object with non-empty 'pdf_url' and 'question'. The PDF is
    downloaded and split into chunks (at most MAX_CHUNKS); the chunks and the
    question are embedded, and the three chunks with the highest cosine
    similarity to the question are given to the LLM as context.

    Returns:
        200 with {'response': <answer text>, 'relevance_scores': [floats]};
        400 with {'error': ...} when the body is missing, is not a JSON
        object, or lacks a field; 422 with {'error': ...} when the PDF yields
        no text; 500 with {'error': ...} on any other failure.
    """
    try:
        # silent=True: a missing or malformed body becomes None, so it is
        # answered with 400 below rather than raising into the 500 handler.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        pdf_url = data.get('pdf_url')
        question = data.get('question')

        if not pdf_url or not question:
            return jsonify({'error': 'PDF URL and question are required'}), 400

        # Get PDF text and create chunks
        pdf_response = requests.get(pdf_url, timeout=30)
        pdf_response.raise_for_status()

        pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_response.content))
        text = " ".join(page.extract_text() for page in pdf_reader.pages)

        if not text.strip():
            # Reported with an error status so clients checking the HTTP code
            # do not treat it as a successful answer.
            return jsonify({'error': 'No text could be extracted from the PDF'}), 422

        # Create text chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)[:MAX_CHUNKS]
        if not chunks:
            return jsonify({'error': 'No text could be extracted from the PDF'}), 422

        # Embed all chunks (one row per chunk) and the question
        chunk_matrix = np.asarray(embeddings_model.embed_documents(chunks), dtype=float)
        question_vector = np.asarray(embeddings_model.embed_query(question), dtype=float)

        # Cosine similarity of every chunk against the question in one pass.
        # A zero-norm vector would divide by zero; such chunks score 0.
        dot_products = chunk_matrix @ question_vector
        norm_products = np.linalg.norm(chunk_matrix, axis=1) * np.linalg.norm(question_vector)
        similarities = np.divide(
            dot_products,
            norm_products,
            out=np.zeros_like(dot_products),
            where=norm_products > 0,
        )

        # Get top 3 most relevant chunks, best first
        top_chunk_indices = np.argsort(similarities)[-3:][::-1]
        relevant_chunks = [chunks[i] for i in top_chunk_indices]

        # Construct prompt with relevant context
        context = "\n\n".join(relevant_chunks)
        prompt = f"""Based on the following relevant excerpts from the research paper, please answer this question: {question}

        Context from paper:
        {context}

        Please provide a clear, specific, and accurate response based solely on the information provided in these excerpts. If the answer cannot be fully determined from the given context, please indicate this in your response."""

        # Generate response using Groq
        llm_reply = llm.invoke(prompt)

        # Format and return response
        formatted_response = llm_reply.content.strip()

        # Add source citations
        source_info = "\n\nThis response is based on specific sections from the paper."

        return jsonify({
            'response': formatted_response + source_info,
            'relevance_scores': [float(similarities[i]) for i in top_chunk_indices]
        })

    except Exception as e:
        print(f"Chat error: {str(e)}")
        return jsonify({'error': f'Failed to process request: {str(e)}'}), 500

@app.route('/api/data', methods=['GET'])
@login_required
def get_data():
    """Return every entry of the users.json store as a JSON list.

    Each list element is a single-key object {<id>: <data>}.

    Login is required, matching the other data routes in this file: without
    it the whole user store was readable by anonymous callers.

    NOTE(review): any logged-in user can still read every entry; confirm
    whether this endpoint is needed at all or should be narrowed.

    Returns:
        200 with the list, or 500 with {"error": ...} if loading fails.
    """
    try:
        docs = load_users()
        data = [{doc_id: doc_data} for doc_id, doc_data in docs.items()]
        return jsonify(data), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Start the Flask server only when this module is executed directly as a
# script; it then listens on all network interfaces (0.0.0.0) at port 7860.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)