thechaiexperiment committed
Commit f377404 · 1 Parent(s): 8ce8fc9

Update app.py

Files changed (1): app.py +207 -78
app.py CHANGED
@@ -38,6 +38,10 @@ def load_models():
     try:
         print("Loading models...")
 
+        # Set device
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Device set to use {device}")
+
         # Embedding models
         models['embedding'] = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
         models['cross_encoder'] = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
@@ -64,24 +68,78 @@ def load_models():
         print(f"Error loading models: {e}")
         return False
 
-def load_data():
-    """Load embeddings and document data"""
+def load_embeddings():
+    """Load embeddings with robust error handling"""
     try:
-        print("Loading data...")
+        print("Loading embeddings...")
+        embeddings_path = 'embeddings.pkl'
 
-        # Load embeddings
-        with open('embeddings.pkl', 'rb') as f:
-            data['embeddings'] = pickle.load(f)
+        if not os.path.exists(embeddings_path):
+            print(f"Error: {embeddings_path} not found")
+            return False
+
+        # Custom unpickler to handle potential compatibility issues
+        class CustomUnpickler(pickle.Unpickler):
+            def find_class(self, module, name):
+                if module == "__main__":
+                    module = "numpy"
+                return super().find_class(module, name)
+
+        with open(embeddings_path, 'rb') as f:
+            try:
+                data['embeddings'] = pickle.load(f)
+            except Exception as e:
+                print(f"Standard unpickling failed, trying custom unpickler: {e}")
+                f.seek(0)
+                try:
+                    data['embeddings'] = CustomUnpickler(f).load()
+                except Exception as e:
+                    print(f"Custom unpickler failed: {e}")
+                    data['embeddings'] = {}
+                    return False
 
-        # Load document links
-        data['df'] = pd.read_excel('finalcleaned_excel_file.xlsx')
+        if not isinstance(data['embeddings'], dict):
+            print("Error: Embeddings data is not in expected format")
+            data['embeddings'] = {}
+            return False
+
+        print(f"Successfully loaded {len(data['embeddings'])} embeddings")
+        return True
+    except Exception as e:
+        print(f"Error loading embeddings: {e}")
+        data['embeddings'] = {}
+        return False
+
+def load_documents_data():
+    """Load document data with error handling"""
+    try:
+        print("Loading documents data...")
+        docs_path = 'finalcleaned_excel_file.xlsx'
 
-        print("Data loaded successfully")
+        if not os.path.exists(docs_path):
+            print(f"Error: {docs_path} not found")
+            return False
+
+        data['df'] = pd.read_excel(docs_path)
+        print(f"Successfully loaded {len(data['df'])} document records")
         return True
     except Exception as e:
-        print(f"Error loading data: {e}")
+        print(f"Error loading documents data: {e}")
+        data['df'] = pd.DataFrame()
        return False
 
+def load_data():
+    """Load all required data"""
+    embeddings_success = load_embeddings()
+    documents_success = load_documents_data()
+
+    if not embeddings_success:
+        print("Warning: Failed to load embeddings, falling back to basic functionality")
+    if not documents_success:
+        print("Warning: Failed to load documents data, falling back to basic functionality")
+
+    return True
+
 def translate_text(text, source_to_target='ar_to_en'):
     """Translate text between Arabic and English"""
     try:
@@ -99,26 +157,8 @@ def translate_text(text, source_to_target='ar_to_en'):
         print(f"Translation error: {e}")
         return text
 
-def query_embeddings(query_embedding, n_results=5):
-    """Find relevant documents using embedding similarity"""
-    doc_ids = list(data['embeddings'].keys())
-    doc_embeddings = np.array(list(data['embeddings'].values()))
-    similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
-    top_indices = similarities.argsort()[-n_results:][::-1]
-    return [(doc_ids[i], similarities[i]) for i in top_indices]
-
-def retrieve_document_text(doc_id):
-    """Retrieve document text from HTML file"""
-    try:
-        with open(f"downloaded_articles/{doc_id}", 'r', encoding='utf-8') as file:
-            soup = BeautifulSoup(file, 'html.parser')
-            return soup.get_text(separator=' ', strip=True)
-    except Exception as e:
-        print(f"Error retrieving document {doc_id}: {e}")
-        return ""
-
 def extract_entities(text):
-    """Extract medical entities from text"""
+    """Extract medical entities from text using NER"""
     try:
         results = models['ner_pipeline'](text)
         return list({result['word'] for result in results if result['entity'].startswith("B-")})
@@ -130,37 +170,101 @@ def generate_answer(query, context, max_length=860, temperature=0.2):
     """Generate answer using LLM"""
     try:
         prompt = f"""
-As a medical expert, answer the following question based only on the provided context:
+As a medical expert, please provide a clear and accurate answer to the following question based solely on the provided context.
 
 Context: {context}
+
 Question: {query}
 
-Answer:"""
+Answer: Let me help you with accurate information from reliable medical sources."""
+
         inputs = models['llm_tokenizer'](prompt, return_tensors="pt", truncation=True)
-        outputs = models['llm_model'].generate(
-            inputs.input_ids,
-            max_length=max_length,
-            num_return_sequences=1,
-            temperature=temperature,
-            pad_token_id=models['llm_tokenizer'].eos_token_id
-        )
-
-        answer = models['llm_tokenizer'].decode(outputs[0], skip_special_tokens=True)
-        return answer.split("Answer:")[-1].strip()
+
+        with torch.no_grad():
+            outputs = models['llm_model'].generate(
+                inputs.input_ids,
+                max_length=max_length,
+                num_return_sequences=1,
+                temperature=temperature,
+                do_sample=True,
+                top_p=0.9,
+                pad_token_id=models['llm_tokenizer'].eos_token_id
+            )
+
+        response = models['llm_tokenizer'].decode(outputs[0], skip_special_tokens=True)
+
+        # Clean up the response
+        if "Answer:" in response:
+            response = response.split("Answer:")[-1].strip()
+
+        # Remove incomplete sentences at the end
+        sentences = nltk.sent_tokenize(response)
+        if sentences:
+            return " ".join(sentences)
+        return response
+
     except Exception as e:
         print(f"Error generating answer: {e}")
-        return "Sorry, I couldn't generate an answer at this time."
+        return "I apologize, but I'm unable to generate an answer at this time. Please try again later."
+
+def query_embeddings(query_embedding, n_results=5):
+    """Find relevant documents using embedding similarity"""
+    if not data['embeddings']:
+        return []
+
+    try:
+        doc_ids = list(data['embeddings'].keys())
+        doc_embeddings = np.array(list(data['embeddings'].values()))
+        similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()
+        top_indices = similarities.argsort()[-n_results:][::-1]
+        return [(doc_ids[i], similarities[i]) for i in top_indices]
+    except Exception as e:
+        print(f"Error in query_embeddings: {e}")
+        return []
+
+def retrieve_document_text(doc_id):
+    """Retrieve document text from HTML file"""
+    try:
+        file_path = os.path.join('downloaded_articles', doc_id)
+        if not os.path.exists(file_path):
+            print(f"Warning: Document file not found: {file_path}")
+            return ""
+
+        with open(file_path, 'r', encoding='utf-8') as file:
+            soup = BeautifulSoup(file, 'html.parser')
+            return soup.get_text(separator=' ', strip=True)
+    except Exception as e:
+        print(f"Error retrieving document {doc_id}: {e}")
+        return ""
+
+def rerank_documents(query, doc_texts):
+    """Rerank documents using cross-encoder"""
+    try:
+        pairs = [(query, doc) for doc in doc_texts]
+        scores = models['cross_encoder'].predict(pairs)
+        return scores
+    except Exception as e:
+        print(f"Error reranking documents: {e}")
+        return np.zeros(len(doc_texts))
 
 @app.route('/health', methods=['GET'])
 def health_check():
     """Health check endpoint"""
-    return jsonify({'status': 'healthy'})
+    status = {
+        'status': 'healthy',
+        'models_loaded': bool(models),
+        'embeddings_loaded': bool(data.get('embeddings')),
+        'documents_loaded': not data.get('df', pd.DataFrame()).empty
+    }
+    return jsonify(status)
 
 @app.route('/api/query', methods=['POST'])
 def process_query():
     """Main query processing endpoint"""
     try:
+        if not request.is_json:
+            return jsonify({'error': 'Request must be JSON', 'success': False}), 400
+
         data = request.json
         if not data or 'query' not in data:
             return jsonify({'error': 'No query provided', 'success': False}), 400
@@ -168,40 +272,67 @@ def process_query():
         query_text = data['query']
         language_code = data.get('language_code', 0)
 
-        # Translate if Arabic
-        if language_code == 0:
-            query_text = translate_text(query_text, 'ar_to_en')
-
-        # Get query embedding and find relevant documents
-        query_embedding = models['embedding'].encode([query_text])
-        relevant_docs = query_embeddings(query_embedding)
-
-        # Retrieve and process documents
-        doc_texts = [retrieve_document_text(doc_id) for doc_id, _ in relevant_docs]
-
-        # Extract entities and generate context
-        query_entities = extract_entities(query_text)
-        contexts = []
-        for text in doc_texts:
-            doc_entities = extract_entities(text)
-            if set(query_entities) & set(doc_entities):
-                contexts.append(text)
-
-        context = " ".join(contexts[:3])  # Use top 3 most relevant contexts
-
-        # Generate answer
-        answer = generate_answer(query_text, context)
-
-        # Translate back if needed
-        if language_code == 0:
-            answer = translate_text(answer, 'en_to_ar')
-
-        return jsonify({
-            'answer': answer,
-            'success': True
-        })
+        # Basic response if no models or data are loaded
+        if not models or not data.get('embeddings'):
+            return jsonify({
+                'answer': 'The system is currently initializing. Please try again in a few minutes.',
+                'success': False
+            }), 503
+
+        # Process query with available models and data
+        try:
+            # Handle Arabic queries
+            if language_code == 0:
+                query_text = translate_text(query_text, 'ar_to_en')
+
+            # Get query embedding and find relevant documents
+            query_embedding = models['embedding'].encode([query_text])
+            relevant_docs = query_embeddings(query_embedding)
+
+            if not relevant_docs:
+                return jsonify({
+                    'answer': 'No relevant information found. Please try a different query.',
+                    'success': True
+                })
+
+            # Retrieve and process documents
+            doc_texts = [retrieve_document_text(doc_id) for doc_id, _ in relevant_docs]
+            doc_texts = [text for text in doc_texts if text.strip()]
+
+            if not doc_texts:
+                return jsonify({
+                    'answer': 'Unable to retrieve relevant documents. Please try again.',
+                    'success': True
+                })
+
+            # Rerank documents
+            rerank_scores = rerank_documents(query_text, doc_texts)
+            ranked_texts = [text for _, text in sorted(zip(rerank_scores, doc_texts), reverse=True)]
+
+            # Combine top documents
+            context = " ".join(ranked_texts[:3])
+
+            # Generate answer
+            answer = generate_answer(query_text, context)
+
+            # Translate answer back to Arabic if needed
+            if language_code == 0:
+                answer = translate_text(answer, 'en_to_ar')
+
+            return jsonify({
+                'answer': answer,
+                'success': True
+            })
+
+        except Exception as e:
+            print(f"Error processing query: {e}")
+            return jsonify({
+                'error': 'An error occurred while processing your query',
+                'success': False
+            }), 500
 
     except Exception as e:
+        print(f"Error in process_query: {e}")
         return jsonify({
             'error': str(e),
             'success': False
@@ -212,9 +343,7 @@ print("Initializing application...")
 init_success = init_nltk() and load_models() and load_data()
 
 if not init_success:
-    print("Failed to initialize application")
-    exit(1)
+    print("Warning: Application initialized with partial functionality")
 
 if __name__ == "__main__":
     app.run(host='0.0.0.0', port=7860)
-
 
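For reference, a minimal client-side sketch of the two endpoints this commit touches; it is not part of the commit itself. It assumes the Space is reachable at http://localhost:7860 (the host and port passed to app.run above), that the requests package is installed on the client, and the Arabic sample query is purely illustrative.

    import requests  # assumed available on the client; not part of this commit

    BASE_URL = "http://localhost:7860"  # default host/port from app.run above

    # /health now reports which resources were actually loaded
    print(requests.get(f"{BASE_URL}/health").json())

    # /api/query expects JSON with 'query' and an optional 'language_code'
    # (0 = Arabic, translated internally; any other value is passed through as English)
    payload = {"query": "ما هي أعراض مرض السكري؟", "language_code": 0}
    resp = requests.post(f"{BASE_URL}/api/query", json=payload)
    print(resp.json())  # {'answer': ..., 'success': True} once loaded; HTTP 503 while still initializing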