thechaiexperiment committed (verified)
Commit ea7cf68 · 1 Parent(s): b482ae1

Update app.py

Files changed (1)
  1. app.py +54 -290
app.py CHANGED
@@ -33,10 +33,7 @@ from safetensors.numpy import load_file
33
  from safetensors.torch import safe_open
34
  nltk.download('punkt_tab')
35
 
36
- # Initialize FastAPI app
37
  app = FastAPI()
38
-
39
- # Add CORS middleware
40
  app.add_middleware(
41
  CORSMiddleware,
42
  allow_origins=["*"],
@@ -44,8 +41,6 @@ app.add_middleware(
44
  allow_methods=["*"],
45
  allow_headers=["*"],
46
  )
47
-
48
- # Global variables for models and data
49
  models = {}
50
  data = {}
51
 
@@ -68,7 +63,6 @@ class ChatMessage(BaseModel):
68
  timestamp: str
69
 
70
  def init_nltk():
71
- """Initialize NLTK resources"""
72
  try:
73
  nltk.download('punkt', quiet=True)
74
  return True
@@ -77,39 +71,25 @@ def init_nltk():
77
  return False
78
 
79
  def load_models():
80
- """Initialize all required models"""
81
  try:
82
  print("Loading models...")
83
-
84
- # Set device
85
  device = "cuda" if torch.cuda.is_available() else "cpu"
86
  print(f"Device set to use {device}")
87
-
88
- # Embedding models
89
  models['embedding_model'] = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
90
  models['cross_encoder'] = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
91
  models['semantic_model'] = SentenceTransformer('all-MiniLM-L6-v2')
92
-
93
- # Translation models
94
  models['ar_to_en_tokenizer'] = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
95
  models['ar_to_en_model'] = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
96
  models['en_to_ar_tokenizer'] = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
97
  models['en_to_ar_model'] = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
98
-
99
- #Attention model
100
  models['att_tokenizer'] = AutoTokenizer.from_pretrained("facebook/bart-base")
101
  models['att_model'] = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
102
-
103
- # NER model
104
  models['bio_tokenizer'] = AutoTokenizer.from_pretrained("blaze999/Medical-NER")
105
  models['bio_model'] = AutoModelForTokenClassification.from_pretrained("blaze999/Medical-NER")
106
  models['ner_pipeline'] = pipeline("ner", model=models['bio_model'], tokenizer=models['bio_tokenizer'])
107
-
108
- # LLM model
109
  model_name = "M4-ai/Orca-2.0-Tau-1.8B"
110
  models['llm_tokenizer'] = AutoTokenizer.from_pretrained(model_name)
111
  models['llm_model'] = AutoModelForCausalLM.from_pretrained(model_name)
112
-
113
  print("Models loaded successfully")
114
  return True
115
  except Exception as e:
@@ -118,7 +98,6 @@ def load_models():
118
 
119
  def load_embeddings() -> Optional[Dict[str, np.ndarray]]:
120
  try:
121
- # Locate or download embeddings file
122
  embeddings_path = 'embeddings.safetensors'
123
  if not os.path.exists(embeddings_path):
124
  print("File not found locally. Attempting to download from Hugging Face Hub...")
@@ -128,62 +107,35 @@ def load_embeddings() -> Optional[Dict[str, np.ndarray]]:
128
  repo_type="space"
129
  )
130
 
131
- # Initialize a dictionary to store embeddings
132
  embeddings = {}
133
-
134
- # Open the safetensors file
135
  with safe_open(embeddings_path, framework="pt") as f:
136
  keys = f.keys()
137
- #0print(f"Available keys in the .safetensors file: {list(keys)}") # Debugging info
138
-
139
- # Iterate over the keys and load tensors
140
  for key in keys:
141
  try:
142
  tensor = f.get_tensor(key)
143
  if not isinstance(tensor, torch.Tensor):
144
- raise TypeError(f"Value for key {key} is not a valid PyTorch tensor.")
145
-
146
- # Convert tensor to NumPy array
147
  embeddings[key] = tensor.numpy()
148
  except Exception as key_error:
149
  print(f"Failed to process key {key}: {key_error}")
150
-
151
  if embeddings:
152
  print("Embeddings successfully loaded.")
153
  else:
154
- print("No embeddings could be loaded. Please check the file format and content.")
155
-
156
  return embeddings
157
-
158
  except Exception as e:
159
  print(f"Error loading embeddings: {e}")
160
  return None
161
 
162
  def normalize_key(key: str) -> str:
163
- """Normalize embedding keys to match metadata IDs."""
164
  match = re.search(r'file_(\d+)', key)
165
  if match:
166
- return match.group(1) # Extract the numeric part
167
  return key
168
 
169
-
170
- import os
171
- import numpy as np
172
- from typing import Optional
173
- from safetensors.numpy import load_file
174
- from huggingface_hub import hf_hub_download
175
-
176
  def load_recipes_embeddings() -> Optional[np.ndarray]:
177
- """
178
- Loads recipe embeddings from a .safetensors file, handling local and remote downloads.
179
-
180
- Returns:
181
- Optional[np.ndarray]: A numpy array containing all embeddings (shape: (num_recipes, embedding_dim)).
182
- """
183
  try:
184
- embeddings_path = 'recipes_embeddings.safetensors'
185
-
186
- # Check if file exists locally, otherwise download from Hugging Face Hub
187
  if not os.path.exists(embeddings_path):
188
  print("File not found locally. Attempting to download from Hugging Face Hub...")
189
  embeddings_path = hf_hub_download(
@@ -191,60 +143,40 @@ def load_recipes_embeddings() -> Optional[np.ndarray]:
191
  filename="embeddings.safetensors",
192
  repo_type="space"
193
  )
194
-
195
- # Load the embeddings tensor from the .safetensors file
196
  embeddings = load_file(embeddings_path)
197
-
198
- # Ensure the key 'embeddings' exists in the file
199
  if "embeddings" not in embeddings:
200
  raise ValueError("Key 'embeddings' not found in the safetensors file.")
201
-
202
- # Retrieve the tensor as a numpy array
203
- tensor = embeddings["embeddings"]
204
-
205
- # Print information about the embeddings
206
  print(f"Successfully loaded embeddings.")
207
  print(f"Shape of embeddings: {tensor.shape}")
208
  print(f"Dtype of embeddings: {tensor.dtype}")
209
  print(f"First few values of the first embedding: {tensor[0][:5]}")
210
-
211
  return tensor
212
-
213
  except Exception as e:
214
  print(f"Error loading embeddings: {e}")
215
  return None
216
 
217
-
218
-
219
  def load_documents_data(folder_path='downloaded_articles/downloaded_articles'):
220
- """Load document data from HTML articles in a specified folder."""
221
  try:
222
  print("Loading documents data...")
223
- # Check if the folder exists
224
  if not os.path.exists(folder_path) or not os.path.isdir(folder_path):
225
  print(f"Error: Folder '{folder_path}' not found")
226
  return False
227
- # List all HTML files in the folder
228
  html_files = [f for f in os.listdir(folder_path) if f.endswith('.html')]
229
  if not html_files:
230
  print(f"No HTML files found in folder '{folder_path}'")
231
  return False
232
  documents = []
233
- # Iterate through each HTML file and parse the content
234
  for file_name in html_files:
235
  file_path = os.path.join(folder_path, file_name)
236
  try:
237
  with open(file_path, 'r', encoding='utf-8') as file:
238
- # Parse the HTML file
239
  soup = BeautifulSoup(file, 'html.parser')
240
- # Extract text content (or customize this as per your needs)
241
  text = soup.get_text(separator='\n').strip()
242
  documents.append({"file_name": file_name, "content": text})
243
  except Exception as e:
244
  print(f"Error reading file {file_name}: {e}")
245
- # Convert the list of documents to a DataFrame
246
- data['df'] = pd.DataFrame(documents)
247
-
248
  if data['df'].empty:
249
  print("No valid documents loaded.")
250
  return False
@@ -254,34 +186,27 @@ def load_documents_data(folder_path='downloaded_articles/downloaded_articles'):
254
  print(f"Error loading docs: {e}")
255
  return None
256
 
257
-
258
  def load_data():
259
- """Load all required data"""
260
  embeddings_success = load_embeddings()
261
- documents_success = load_documents_data()
262
-
263
  if not embeddings_success:
264
  print("Warning: Failed to load embeddings, falling back to basic functionality")
265
  if not documents_success:
266
- print("Warning: Failed to load documents data, falling back to basic functionality")
267
-
268
  return True
269
 
270
- # Initialize application
271
  print("Initializing application...")
272
  init_success = load_models() and load_data()
273
 
274
 
275
  def translate_text(text, source_to_target='ar_to_en'):
276
- """Translate text between Arabic and English"""
277
  try:
278
  if source_to_target == 'ar_to_en':
279
  tokenizer = models['ar_to_en_tokenizer']
280
  model = models['ar_to_en_model']
281
  else:
282
  tokenizer = models['en_to_ar_tokenizer']
283
- model = models['en_to_ar_model']
284
-
285
  inputs = tokenizer(text, return_tensors="pt", truncation=True)
286
  outputs = model.generate(**inputs)
287
  return tokenizer.decode(outputs[0], skip_special_tokens=True)
@@ -309,40 +234,17 @@ def query_embeddings(query_embedding, embeddings_data=None, n_results=5):
309
  print(f"Error in query_embeddings: {e}")
310
  return []
311
 
312
- from sklearn.metrics.pairwise import cosine_similarity
313
- import numpy as np
314
-
315
  def query_recipes_embeddings(query_embedding, embeddings_data, n_results = 5):
316
- """
317
- Query the recipes embeddings to find the most similar recipes based on cosine similarity.
318
-
319
- Args:
320
- query_embedding (np.ndarray): A 1D numpy array representing the query embedding.
321
- n_results (int): Number of top results to return.
322
-
323
- Returns:
324
- List[Tuple[int, float]]: A list of tuples containing the indices of the top results and their similarity scores.
325
- """
326
- # Load embeddings
327
  embeddings_data = load_recipes_embeddings()
328
  if embeddings_data is None:
329
  print("No embeddings data available.")
330
  return []
331
-
332
  try:
333
- # Ensure query_embedding is 2D for cosine similarity computation
334
  if query_embedding.ndim == 1:
335
  query_embedding = query_embedding.reshape(1, -1)
336
-
337
- # Compute cosine similarity
338
  similarities = cosine_similarity(query_embedding, embeddings_data).flatten()
339
-
340
- # Get the indices of the top N most similar embeddings
341
  top_indices = similarities.argsort()[-n_results:][::-1]
342
-
343
- # Return the indices and similarity scores of the top results
344
  return [(index, similarities[index]) for index in top_indices]
345
-
346
  except Exception as e:
347
  print(f"Error in query_recipes_embeddings: {e}")
348
  return []
@@ -364,12 +266,10 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded
364
  for doc_id in doc_ids:
365
  file_path = os.path.join(folder_path, doc_id)
366
  try:
367
- # Check if the file exists
368
  if not os.path.exists(file_path):
369
  print(f"Warning: Document file not found: {file_path}")
370
  texts.append("")
371
  continue
372
- # Read and parse the HTML file
373
  with open(file_path, 'r', encoding='utf-8') as file:
374
  soup = BeautifulSoup(file, 'html.parser')
375
  text = soup.get_text(separator=' ', strip=True)
@@ -379,82 +279,71 @@ def retrieve_document_texts(doc_ids, folder_path='downloaded_articles/downloaded
379
  texts.append("")
380
  return texts
381
 
382
- import os
383
- import pandas as pd
384
-
385
  def retrieve_rec_texts(
386
  document_indices,
387
  folder_path='downloaded_articles/downloaded_articles',
388
  metadata_path='recipes_metadata.xlsx'
389
  ):
390
- """
391
- Retrieve the texts of documents corresponding to the given indices.
392
-
393
- Args:
394
- document_indices (List[int]): A list of document indices to retrieve.
395
- folder_path (str): Path to the folder containing the article files.
396
- metadata_path (str): Path to the metadata file mapping indices to file names.
397
-
398
- Returns:
399
- List[str]: A list of document texts corresponding to the given indices.
400
- """
401
  try:
402
- # Load metadata file to map indices to original file names
403
  metadata_df = pd.read_excel(metadata_path)
404
-
405
- # Ensure the metadata file has the required columns
406
  if "id" not in metadata_df.columns or "original_file_name" not in metadata_df.columns:
407
  raise ValueError("Metadata file must contain 'id' and 'original_file_name' columns.")
408
-
409
- # Ensure the 'id' column aligns with the embeddings row indices
410
  metadata_df = metadata_df.sort_values(by="id").reset_index(drop=True)
411
-
412
- # Verify the alignment of metadata with embeddings indices
413
  if metadata_df.index.max() < max(document_indices):
414
  raise ValueError("Some document indices exceed the range of metadata.")
415
-
416
- # Retrieve file names for the given indices
417
  document_texts = []
418
  for idx in document_indices:
419
  if idx >= len(metadata_df):
420
  print(f"Warning: Index {idx} is out of range for metadata.")
421
  continue
422
-
423
  original_file_name = metadata_df.iloc[idx]["original_file_name"]
424
  if not original_file_name:
425
  print(f"Warning: No file name found for index {idx}")
426
  continue
427
-
428
- # Construct the file path using the original file name
429
  file_path = os.path.join(folder_path, original_file_name)
430
-
431
- # Check if the file exists and read its content
432
  if os.path.exists(file_path):
433
  with open(file_path, "r", encoding="utf-8") as f:
434
  document_texts.append(f.read())
435
  else:
436
  print(f"Warning: File not found at {file_path}")
437
-
438
  return document_texts
439
-
440
  except Exception as e:
441
  print(f"Error in retrieve_rec_texts: {e}")
442
  return []
443

444

445
 
446
 
447
  def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
448
  try:
449
- # Prepare pairs for the cross-encoder
450
  pairs = [(query, doc) for doc in document_texts]
451
- # Get scores using the cross-encoder model
452
  scores = cross_encoder_model.predict(pairs)
453
- # Combine scores with document IDs and texts
454
  scored_documents = list(zip(scores, document_ids, document_texts))
455
- # Sort by scores in descending order
456
  scored_documents.sort(key=lambda x: x[0], reverse=True)
457
- # Print reranked results
458
  print("Reranked results:")
459
  for idx, (score, doc_id, doc) in enumerate(scored_documents):
460
  print(f"Rank {idx + 1} (Score: {score:.4f}, Document ID: {doc_id})")
@@ -465,12 +354,9 @@ def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
465
 
466
  def extract_entities(text, ner_pipeline=None):
467
  try:
468
- # Use the provided pipeline or default to the model dictionary
469
  if ner_pipeline is None:
470
  ner_pipeline = models['ner_pipeline']
471
- # Perform NER using the pipeline
472
  ner_results = ner_pipeline(text)
473
- # Extract unique entities that start with "B-"
474
  entities = {result['word'] for result in ner_results if result['entity'].startswith("B-")}
475
  return list(entities)
476
  except Exception as e:
@@ -488,22 +374,16 @@ def match_entities(query_entities, sentence_entities):
488
 
489
  def extract_relevant_portions(document_texts, query, max_portions=3, portion_size=1, min_query_words=1):
490
  relevant_portions = {}
491
- # Extract entities from the query
492
  query_entities = extract_entities(query)
493
  print(f"Extracted Query Entities: {query_entities}")
494
  for doc_id, doc_text in enumerate(document_texts):
495
- sentences = nltk.sent_tokenize(doc_text) # Split document into sentences
496
  doc_relevant_portions = []
497
- # Extract entities from the entire document
498
- #ner_biobert = models['ner_pipeline']
499
  doc_entities = extract_entities(doc_text)
500
  print(f"Document {doc_id} Entities: {doc_entities}")
501
  for i, sentence in enumerate(sentences):
502
- # Extract entities from the sentence
503
  sentence_entities = extract_entities(sentence)
504
- # Compute relevance score
505
  relevance_score = match_entities(query_entities, sentence_entities)
506
- # Select sentences with at least `min_query_words` matching entities
507
  if relevance_score >= min_query_words:
508
  start_idx = max(0, i - portion_size // 2)
509
  end_idx = min(len(sentences), i + portion_size // 2 + 1)
@@ -511,13 +391,11 @@ def extract_relevant_portions(document_texts, query, max_portions=3, portion_siz
511
  doc_relevant_portions.append(portion)
512
  if len(doc_relevant_portions) >= max_portions:
513
  break
514
- # Fallback: Include most entity-dense sentences if no relevant portions were found
515
  if not doc_relevant_portions and len(doc_entities) > 0:
516
  print(f"Fallback: Selecting sentences with most entities for Document {doc_id}")
517
  sorted_sentences = sorted(sentences, key=lambda s: len(extract_entities(s, ner_biobert)), reverse=True)
518
  for fallback_sentence in sorted_sentences[:max_portions]:
519
  doc_relevant_portions.append(fallback_sentence)
520
- # Add the extracted portions to the result dictionary
521
  relevant_portions[f"Document_{doc_id}"] = doc_relevant_portions
522
  return relevant_portions
523
 
@@ -537,7 +415,6 @@ def extract_entities(text):
537
  inputs = biobert_tokenizer(text, return_tensors="pt")
538
  outputs = biobert_model(**inputs)
539
  predictions = torch.argmax(outputs.logits, dim=2)
540
-
541
  tokens = biobert_tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
542
  entities = [
543
  tokens[i]
@@ -568,9 +445,6 @@ def generate_answer(prompt, max_length=860, temperature=0.2):
568
  tokenizer_f = models['llm_tokenizer']
569
  model_f = models['llm_model']
570
  inputs = tokenizer_f(prompt, return_tensors="pt", truncation=True)
571
- # Start timing
572
- #start_time = time.time()
573
- # Generate the output
574
  output_ids = model_f.generate(
575
  inputs.input_ids,
576
  max_length=max_length,
@@ -578,38 +452,27 @@ def generate_answer(prompt, max_length=860, temperature=0.2):
578
  temperature=temperature,
579
  pad_token_id=tokenizer_f.eos_token_id
580
  )
581
- # End timing
582
- #end_time = time.time()
583
- # Calculate the duration
584
- #duration = end_time - start_time
585
- # Decode the answer
586
  answer = tokenizer_f.decode(output_ids[0], skip_special_tokens=True)
587
- # Extract keywords from the passage and answer
588
- passage_keywords = set(prompt.lower().split()) # Adjusted to check keywords in the full prompt
589
  answer_keywords = set(answer.lower().split())
590
- # Verify if the answer aligns with the passage
591
  if passage_keywords.intersection(answer_keywords):
592
- return answer #, duration
593
  else:
594
- return "Sorry, I can't help with that." #, duration
595
 
596
  def remove_answer_prefix(text):
597
  prefix = "Answer:"
598
  if prefix in text:
599
- return text.split(prefix, 1)[-1].strip() # Split only once to avoid splitting at other occurrences of "Answer:"
600
  return text
601
 
602
  def remove_incomplete_sentence(text):
603
- # Check if the text ends with a period
604
  if not text.endswith('.'):
605
- # Find the last period or the end of the string
606
  last_period_index = text.rfind('.')
607
  if last_period_index != -1:
608
- # Remove everything after the last period
609
  return text[:last_period_index + 1].strip()
610
  return text
611
 
612
-
613
  @app.get("/")
614
  async def root():
615
  return {"message": "Welcome to the FastAPI application! Use the /health endpoint to check health, and /api/query for processing queries."}
@@ -630,7 +493,7 @@ async def chat_endpoint(chat_query: ChatQuery):
630
  try:
631
  query_text = chat_query.query
632
  language_code = chat_query.language_code
633
- query_embedding = embed_query_text(query_text) # Embed the query text
634
  embeddings_data = load_embeddings ()
635
  folder_path = 'downloaded_articles/downloaded_articles'
636
  initial_results = query_embeddings(query_embedding, embeddings_data, n_results=5)
@@ -671,34 +534,21 @@ async def chat_endpoint(chat_query: ChatQuery):
671
 
672
  @app.post("/api/resources")
673
  async def resources_endpoint(profile: MedicalProfile):
674
- try:
675
-
676
- # Build the query text
677
  query_text = profile.conditions + " " + profile.daily_symptoms
678
-
679
- print(f"Generated query text: {query_text}")
680
-
681
- # Generate the query embedding
682
  query_embedding = embed_query_text(query_text)
683
  if query_embedding is None:
684
  raise ValueError("Failed to generate query embedding.")
685
-
686
- # Load embeddings and retrieve initial results
687
  embeddings_data = load_embeddings()
688
  folder_path = 'downloaded_articles/downloaded_articles'
689
  initial_results = query_embeddings(query_embedding, embeddings_data, n_results=6)
690
  if not initial_results:
691
  raise ValueError("No relevant documents found.")
692
-
693
- # Extract document IDs
694
  document_ids = [doc_id for doc_id, _ in initial_results]
695
-
696
- # Load document metadata (URL mappings)
697
  file_path = 'finalcleaned_excel_file.xlsx'
698
  df = pd.read_excel(file_path)
699
  file_name_to_url = {f"article_{index}.html": url for index, url in enumerate(df['Unnamed: 0'])}
700
-
701
- # Map file names to original URLs
702
  resources = []
703
  for file_name in document_ids:
704
  original_url = file_name_to_url.get(file_name, None)
@@ -707,54 +557,34 @@ async def resources_endpoint(profile: MedicalProfile):
707
  resources.append({"file_name": file_name, "title": title, "url": original_url})
708
  else:
709
  resources.append({"file_name": file_name, "title": "Unknown", "url": None})
710
-
711
- # Retrieve document texts
712
  document_texts = retrieve_document_texts(document_ids, folder_path)
713
  if not document_texts:
714
  raise ValueError("Failed to retrieve document texts.")
715
-
716
- # Perform re-ranking
717
  cross_encoder = models['cross_encoder']
718
  scores = cross_encoder.predict([(query_text, doc) for doc in document_texts])
719
- scores = [float(score) for score in scores] # Convert to native Python float
720
-
721
- # Combine scores with resources
722
  for i, resource in enumerate(resources):
723
  resource["score"] = scores[i] if i < len(scores) else 0.0
724
-
725
- # Sort resources by score
726
  resources.sort(key=lambda x: x["score"], reverse=True)
727
-
728
- # Limit response to top 5 resources
729
  return {"resources": resources[:5], "success": True}
730
-
731
  except ValueError as ve:
732
- # Handle expected errors
733
  raise HTTPException(status_code=400, detail=str(ve))
734
  except Exception as e:
735
- # Handle unexpected errors
736
  print(f"Unexpected error: {e}")
737
  raise HTTPException(status_code=500, detail="An unexpected error occurred.")
738
 
739
-
740
-
741
  @app.post("/api/recipes")
742
  async def recipes_endpoint(profile: MedicalProfile):
743
  try:
744
- # Build the query text for recipes
745
  recipe_query = (
746
  f"Recipes foods and meals suitable for someone with: "
747
  f"{profile.conditions} and experiencing {profile.daily_symptoms}"
748
  )
749
  query_text = recipe_query
750
  print(f"Generated query text: {query_text}")
751
-
752
- # Generate the query embedding
753
  query_embedding = embed_query_text(query_text)
754
  if query_embedding is None:
755
  raise ValueError("Failed to generate query embedding.")
756
-
757
- # Load embeddings and retrieve initial results
758
  embeddings_data = load_recipes_embeddings()
759
  folder_path = 'downloaded_articles/downloaded_articles'
760
  initial_results = query_recipes_embeddings(query_embedding, embeddings_data, n_results=5)
@@ -762,92 +592,26 @@ async def recipes_endpoint(profile: MedicalProfile):
762
  raise ValueError("No relevant recipes found.")
763
  print("Initial results (document indices and similarities):")
764
  print(initial_results)
765
-
766
- # Extract document indices from the results
767
  document_indices = [doc_id for doc_id, _ in initial_results]
768
- print("Document indices:", document_indices)
769
-
770
- # Retrieve document texts using the indices
771
- document_texts = retrieve_rec_texts(document_indices, folder_path)
772
- if not document_texts:
773
- raise ValueError("Failed to retrieve document texts.")
774
- print("Document texts retrieved:")
775
- print(document_texts)
776
-
777
- # Extract relevant portions from documents using the query text
778
- relevant_portions = extract_relevant_portions(document_texts, query_text, max_portions=3, portion_size=1, min_query_words=1)
779
- print("Relevant portions extracted:")
780
- print(relevant_portions)
781
-
782
- flattened_relevant_portions = []
783
- for doc_id, portions in relevant_portions.items():
784
- flattened_relevant_portions.extend(portions)
785
- unique_selected_parts = remove_duplicates(flattened_relevant_portions)
786
- print("Unique selected parts:")
787
- print(unique_selected_parts)
788
-
789
- combined_parts = " ".join(unique_selected_parts)
790
- print("Combined text for context:")
791
- print(combined_parts)
792
-
793
- context = [query_text] + unique_selected_parts
794
- print("Final context for answering:")
795
- print(context)
796
-
797
- # Extract entities from the query
798
- entities = extract_entities(query_text)
799
- print("Extracted entities:")
800
- print(entities)
801
-
802
- # Enhance the passage with the extracted entities
803
- passage = enhance_passage_with_entities(combined_parts, entities)
804
- print("Enhanced passage with entities:")
805
- print(passage)
806
-
807
- # Create the prompt for the model
808
- prompt = create_prompt(query_text, passage)
809
- print("Generated prompt:")
810
- print(prompt)
811
-
812
- # Generate the answer from the model
813
- answer = generate_answer(prompt)
814
- print("Generated answer:")
815
- print(answer)
816
-
817
- # Clean up the answer to extract the relevant part
818
- answer_part = answer.split("Answer:")[-1].strip()
819
- cleaned_answer = remove_answer_prefix(answer_part)
820
- print("Cleaned answer:")
821
- print(cleaned_answer)
822
-
823
- final_answer = remove_incomplete_sentence(cleaned_answer)
824
- print("Final answer:")
825
- print(final_answer)
826
-
827
- if language_code == 0:
828
- final_answer = translate_en_to_ar(final_answer)
829
-
830
- if final_answer:
831
- print("Answer:")
832
- print(final_answer)
833
- else:
834
- print("Sorry, I can't help with that.")
835
-
836
- return {"response": final_answer}
837
-
838
  except ValueError as ve:
839
- # Handle expected errors
840
  raise HTTPException(status_code=400, detail=str(ve))
841
  except Exception as e:
842
- # Handle unexpected errors
843
  print(f"Unexpected error: {e}")
844
  raise HTTPException(status_code=500, detail="An unexpected error occurred.")
845
-
846
-
847
  if not init_success:
848
  print("Warning: Application initialized with partial functionality")
849
-
850
- # For running locally
851
  if __name__ == "__main__":
852
  import uvicorn
853
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
33
  from safetensors.torch import safe_open
34
  nltk.download('punkt_tab')
35
 
 
36
  app = FastAPI()
 
 
37
  app.add_middleware(
38
  CORSMiddleware,
39
  allow_origins=["*"],
 
41
  allow_methods=["*"],
42
  allow_headers=["*"],
43
  )
 
 
44
  models = {}
45
  data = {}
46
 
 
63
  timestamp: str
64
 
65
  def init_nltk():
 
66
  try:
67
  nltk.download('punkt', quiet=True)
68
  return True
 
71
  return False
72
 
73
  def load_models():
 
74
  try:
75
  print("Loading models...")
 
 
76
  device = "cuda" if torch.cuda.is_available() else "cpu"
77
  print(f"Device set to use {device}")
 
 
78
  models['embedding_model'] = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
79
  models['cross_encoder'] = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
80
  models['semantic_model'] = SentenceTransformer('all-MiniLM-L6-v2')
 
 
81
  models['ar_to_en_tokenizer'] = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
82
  models['ar_to_en_model'] = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
83
  models['en_to_ar_tokenizer'] = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
84
  models['en_to_ar_model'] = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
 
 
85
  models['att_tokenizer'] = AutoTokenizer.from_pretrained("facebook/bart-base")
86
  models['att_model'] = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
 
 
87
  models['bio_tokenizer'] = AutoTokenizer.from_pretrained("blaze999/Medical-NER")
88
  models['bio_model'] = AutoModelForTokenClassification.from_pretrained("blaze999/Medical-NER")
89
  models['ner_pipeline'] = pipeline("ner", model=models['bio_model'], tokenizer=models['bio_tokenizer'])
 
 
90
  model_name = "M4-ai/Orca-2.0-Tau-1.8B"
91
  models['llm_tokenizer'] = AutoTokenizer.from_pretrained(model_name)
92
  models['llm_model'] = AutoModelForCausalLM.from_pretrained(model_name)
 
93
  print("Models loaded successfully")
94
  return True
95
  except Exception as e:
 
98
 
99
  def load_embeddings() -> Optional[Dict[str, np.ndarray]]:
100
  try:
 
101
  embeddings_path = 'embeddings.safetensors'
102
  if not os.path.exists(embeddings_path):
103
  print("File not found locally. Attempting to download from Hugging Face Hub...")
 
107
  repo_type="space"
108
  )
109
 
 
110
  embeddings = {}
 
 
111
  with safe_open(embeddings_path, framework="pt") as f:
112
  keys = f.keys()
 
 
 
113
  for key in keys:
114
  try:
115
  tensor = f.get_tensor(key)
116
  if not isinstance(tensor, torch.Tensor):
117
+ raise TypeError(f"Value for key {key} is not a valid PyTorch tensor.")
 
 
118
  embeddings[key] = tensor.numpy()
119
  except Exception as key_error:
120
  print(f"Failed to process key {key}: {key_error}")
 
121
  if embeddings:
122
  print("Embeddings successfully loaded.")
123
  else:
124
+ print("No embeddings could be loaded. Please check the file format and content.")
 
125
  return embeddings
 
126
  except Exception as e:
127
  print(f"Error loading embeddings: {e}")
128
  return None
129
 
130
  def normalize_key(key: str) -> str:
 
131
  match = re.search(r'file_(\d+)', key)
132
  if match:
133
+ return match.group(1)
134
  return key
135

136
  def load_recipes_embeddings() -> Optional[np.ndarray]:
137
  try:
138
+ embeddings_path = 'recipes_embeddings.safetensors'
 
 
139
  if not os.path.exists(embeddings_path):
140
  print("File not found locally. Attempting to download from Hugging Face Hub...")
141
  embeddings_path = hf_hub_download(
 
143
  filename="embeddings.safetensors",
144
  repo_type="space"
145
  )
 
 
146
  embeddings = load_file(embeddings_path)
 
 
147
  if "embeddings" not in embeddings:
148
  raise ValueError("Key 'embeddings' not found in the safetensors file.")
149
+ tensor = embeddings["embeddings"]
 
 
 
 
150
  print(f"Successfully loaded embeddings.")
151
  print(f"Shape of embeddings: {tensor.shape}")
152
  print(f"Dtype of embeddings: {tensor.dtype}")
153
  print(f"First few values of the first embedding: {tensor[0][:5]}")
 
154
  return tensor
 
155
  except Exception as e:
156
  print(f"Error loading embeddings: {e}")
157
  return None
158
 
 
 
159
  def load_documents_data(folder_path='downloaded_articles/downloaded_articles'):
 
160
  try:
161
  print("Loading documents data...")
 
162
  if not os.path.exists(folder_path) or not os.path.isdir(folder_path):
163
  print(f"Error: Folder '{folder_path}' not found")
164
  return False
 
165
  html_files = [f for f in os.listdir(folder_path) if f.endswith('.html')]
166
  if not html_files:
167
  print(f"No HTML files found in folder '{folder_path}'")
168
  return False
169
  documents = []
 
170
  for file_name in html_files:
171
  file_path = os.path.join(folder_path, file_name)
172
  try:
173
  with open(file_path, 'r', encoding='utf-8') as file:
 
174
  soup = BeautifulSoup(file, 'html.parser')
 
175
  text = soup.get_text(separator='\n').strip()
176
  documents.append({"file_name": file_name, "content": text})
177
  except Exception as e:
178
  print(f"Error reading file {file_name}: {e}")
179
+ data['df'] = pd.DataFrame(documents)
 
 
180
  if data['df'].empty:
181
  print("No valid documents loaded.")
182
  return False
 
186
  print(f"Error loading docs: {e}")
187
  return None
188
 
 
189
  def load_data():
 
190
  embeddings_success = load_embeddings()
191
+ documents_success = load_documents_data()
 
192
  if not embeddings_success:
193
  print("Warning: Failed to load embeddings, falling back to basic functionality")
194
  if not documents_success:
195
+ print("Warning: Failed to load documents data, falling back to basic functionality")
 
196
  return True
197
 
 
198
  print("Initializing application...")
199
  init_success = load_models() and load_data()
200
 
201
 
202
  def translate_text(text, source_to_target='ar_to_en'):
 
203
  try:
204
  if source_to_target == 'ar_to_en':
205
  tokenizer = models['ar_to_en_tokenizer']
206
  model = models['ar_to_en_model']
207
  else:
208
  tokenizer = models['en_to_ar_tokenizer']
209
+ model = models['en_to_ar_model']
 
210
  inputs = tokenizer(text, return_tensors="pt", truncation=True)
211
  outputs = model.generate(**inputs)
212
  return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
234
  print(f"Error in query_embeddings: {e}")
235
  return []
236
 
 
 
 
237
  def query_recipes_embeddings(query_embedding, embeddings_data, n_results = 5):
238
  embeddings_data = load_recipes_embeddings()
239
  if embeddings_data is None:
240
  print("No embeddings data available.")
241
  return []
 
242
  try:
 
243
  if query_embedding.ndim == 1:
244
  query_embedding = query_embedding.reshape(1, -1)
 
 
245
  similarities = cosine_similarity(query_embedding, embeddings_data).flatten()
 
 
246
  top_indices = similarities.argsort()[-n_results:][::-1]
 
 
247
  return [(index, similarities[index]) for index in top_indices]
 
248
  except Exception as e:
249
  print(f"Error in query_recipes_embeddings: {e}")
250
  return []
 
266
  for doc_id in doc_ids:
267
  file_path = os.path.join(folder_path, doc_id)
268
  try:
 
269
  if not os.path.exists(file_path):
270
  print(f"Warning: Document file not found: {file_path}")
271
  texts.append("")
272
  continue
 
273
  with open(file_path, 'r', encoding='utf-8') as file:
274
  soup = BeautifulSoup(file, 'html.parser')
275
  text = soup.get_text(separator=' ', strip=True)
 
279
  texts.append("")
280
  return texts
281
 
 
 
 
282
  def retrieve_rec_texts(
283
  document_indices,
284
  folder_path='downloaded_articles/downloaded_articles',
285
  metadata_path='recipes_metadata.xlsx'
286
  ):
287
  try:
 
288
  metadata_df = pd.read_excel(metadata_path)
 
 
289
  if "id" not in metadata_df.columns or "original_file_name" not in metadata_df.columns:
290
  raise ValueError("Metadata file must contain 'id' and 'original_file_name' columns.")
 
 
291
  metadata_df = metadata_df.sort_values(by="id").reset_index(drop=True)
 
 
292
  if metadata_df.index.max() < max(document_indices):
293
  raise ValueError("Some document indices exceed the range of metadata.")
 
 
294
  document_texts = []
295
  for idx in document_indices:
296
  if idx >= len(metadata_df):
297
  print(f"Warning: Index {idx} is out of range for metadata.")
298
  continue
 
299
  original_file_name = metadata_df.iloc[idx]["original_file_name"]
300
  if not original_file_name:
301
  print(f"Warning: No file name found for index {idx}")
302
  continue
 
 
303
  file_path = os.path.join(folder_path, original_file_name)
 
 
304
  if os.path.exists(file_path):
305
  with open(file_path, "r", encoding="utf-8") as f:
306
  document_texts.append(f.read())
307
  else:
308
  print(f"Warning: File not found at {file_path}")
 
309
  return document_texts
 
310
  except Exception as e:
311
  print(f"Error in retrieve_rec_texts: {e}")
312
  return []
313
 
314
+ def retrieve_metadata(document_indices: List[str], metadata_path: str = 'recipes_metadata.xlsx') -> Dict[str, Dict[str, str]]:
315
+ try:
316
+ metadata_df = pd.read_excel(metadata_path)
317
+ required_columns = {'id', 'original_file_name', 'url'}
318
+ if not required_columns.issubset(metadata_df.columns):
319
+ raise ValueError(f"Metadata file must contain the following columns: {required_columns}")
320
+ metadata_mapping = metadata_df.set_index('id')[['original_file_name', 'url']].to_dict('index')
321
+ result = {doc_id: metadata_mapping.get(doc_id, {}) for doc_id in document_indices}
322
+ return result
323
+ except Exception as e:
324
+ print(f"Error retrieving metadata: {e}")
325
+ return {}
326
 
327
+ def retrieve_metadata(document_indices: List[str], metadata_path: str = 'recipes_metadata.xlsx') -> Dict[str, Dict[str, str]]:
328
+ try:
329
+ metadata_df = pd.read_excel(metadata_path)
330
+ required_columns = {'id', 'original_file_name', 'url'}
331
+ if not required_columns.issubset(metadata_df.columns):
332
+ raise ValueError(f"Metadata file must contain the following columns: {required_columns}")
333
+ metadata_mapping = metadata_df.set_index('id')[['original_file_name', 'url']].to_dict('index')
334
+ result = {doc_id: metadata_mapping.get(doc_id, {}) for doc_id in document_indices}
335
+ return result
336
+ except Exception as e:
337
+ print(f"Error retrieving metadata: {e}")
338
+ return {}
339
 
340
 
341
  def rerank_documents(query, document_ids, document_texts, cross_encoder_model):
342
  try:
 
343
  pairs = [(query, doc) for doc in document_texts]
 
344
  scores = cross_encoder_model.predict(pairs)
 
345
  scored_documents = list(zip(scores, document_ids, document_texts))
 
346
  scored_documents.sort(key=lambda x: x[0], reverse=True)
 
347
  print("Reranked results:")
348
  for idx, (score, doc_id, doc) in enumerate(scored_documents):
349
  print(f"Rank {idx + 1} (Score: {score:.4f}, Document ID: {doc_id})")
 
354
 
355
  def extract_entities(text, ner_pipeline=None):
356
  try:
 
357
  if ner_pipeline is None:
358
  ner_pipeline = models['ner_pipeline']
 
359
  ner_results = ner_pipeline(text)
 
360
  entities = {result['word'] for result in ner_results if result['entity'].startswith("B-")}
361
  return list(entities)
362
  except Exception as e:
 
374
 
375
  def extract_relevant_portions(document_texts, query, max_portions=3, portion_size=1, min_query_words=1):
376
  relevant_portions = {}
 
377
  query_entities = extract_entities(query)
378
  print(f"Extracted Query Entities: {query_entities}")
379
  for doc_id, doc_text in enumerate(document_texts):
380
+ sentences = nltk.sent_tokenize(doc_text)
381
  doc_relevant_portions = []
 
 
382
  doc_entities = extract_entities(doc_text)
383
  print(f"Document {doc_id} Entities: {doc_entities}")
384
  for i, sentence in enumerate(sentences):
 
385
  sentence_entities = extract_entities(sentence)
 
386
  relevance_score = match_entities(query_entities, sentence_entities)
 
387
  if relevance_score >= min_query_words:
388
  start_idx = max(0, i - portion_size // 2)
389
  end_idx = min(len(sentences), i + portion_size // 2 + 1)
 
391
  doc_relevant_portions.append(portion)
392
  if len(doc_relevant_portions) >= max_portions:
393
  break
 
394
  if not doc_relevant_portions and len(doc_entities) > 0:
395
  print(f"Fallback: Selecting sentences with most entities for Document {doc_id}")
396
  sorted_sentences = sorted(sentences, key=lambda s: len(extract_entities(s, ner_biobert)), reverse=True)
397
  for fallback_sentence in sorted_sentences[:max_portions]:
398
  doc_relevant_portions.append(fallback_sentence)
 
399
  relevant_portions[f"Document_{doc_id}"] = doc_relevant_portions
400
  return relevant_portions
401
 
 
415
  inputs = biobert_tokenizer(text, return_tensors="pt")
416
  outputs = biobert_model(**inputs)
417
  predictions = torch.argmax(outputs.logits, dim=2)
 
418
  tokens = biobert_tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
419
  entities = [
420
  tokens[i]
 
445
  tokenizer_f = models['llm_tokenizer']
446
  model_f = models['llm_model']
447
  inputs = tokenizer_f(prompt, return_tensors="pt", truncation=True)
 
 
 
448
  output_ids = model_f.generate(
449
  inputs.input_ids,
450
  max_length=max_length,
 
452
  temperature=temperature,
453
  pad_token_id=tokenizer_f.eos_token_id
454
  )
 
 
 
 
 
455
  answer = tokenizer_f.decode(output_ids[0], skip_special_tokens=True)
456
+ passage_keywords = set(prompt.lower().split())
 
457
  answer_keywords = set(answer.lower().split())
 
458
  if passage_keywords.intersection(answer_keywords):
459
+ return answer
460
  else:
461
+ return "Sorry, I can't help with that."
462
 
463
  def remove_answer_prefix(text):
464
  prefix = "Answer:"
465
  if prefix in text:
466
+ return text.split(prefix, 1)[-1].strip()
467
  return text
468
 
469
  def remove_incomplete_sentence(text):
 
470
  if not text.endswith('.'):
 
471
  last_period_index = text.rfind('.')
472
  if last_period_index != -1:
 
473
  return text[:last_period_index + 1].strip()
474
  return text
475
 
 
476
  @app.get("/")
477
  async def root():
478
  return {"message": "Welcome to the FastAPI application! Use the /health endpoint to check health, and /api/query for processing queries."}
 
493
  try:
494
  query_text = chat_query.query
495
  language_code = chat_query.language_code
496
+ query_embedding = embed_query_text(query_text)
497
  embeddings_data = load_embeddings ()
498
  folder_path = 'downloaded_articles/downloaded_articles'
499
  initial_results = query_embeddings(query_embedding, embeddings_data, n_results=5)
 
534
 
535
  @app.post("/api/resources")
536
  async def resources_endpoint(profile: MedicalProfile):
537
+ try:
 
 
538
  query_text = profile.conditions + " " + profile.daily_symptoms
539
+ print(f"Generated query text: {query_text}")
 
 
 
540
  query_embedding = embed_query_text(query_text)
541
  if query_embedding is None:
542
  raise ValueError("Failed to generate query embedding.")
 
 
543
  embeddings_data = load_embeddings()
544
  folder_path = 'downloaded_articles/downloaded_articles'
545
  initial_results = query_embeddings(query_embedding, embeddings_data, n_results=6)
546
  if not initial_results:
547
  raise ValueError("No relevant documents found.")
 
 
548
  document_ids = [doc_id for doc_id, _ in initial_results]
 
 
549
  file_path = 'finalcleaned_excel_file.xlsx'
550
  df = pd.read_excel(file_path)
551
  file_name_to_url = {f"article_{index}.html": url for index, url in enumerate(df['Unnamed: 0'])}
 
 
552
  resources = []
553
  for file_name in document_ids:
554
  original_url = file_name_to_url.get(file_name, None)
 
557
  resources.append({"file_name": file_name, "title": title, "url": original_url})
558
  else:
559
  resources.append({"file_name": file_name, "title": "Unknown", "url": None})
 
 
560
  document_texts = retrieve_document_texts(document_ids, folder_path)
561
  if not document_texts:
562
  raise ValueError("Failed to retrieve document texts.")
 
 
563
  cross_encoder = models['cross_encoder']
564
  scores = cross_encoder.predict([(query_text, doc) for doc in document_texts])
565
+ scores = [float(score) for score in scores]
 
 
566
  for i, resource in enumerate(resources):
567
  resource["score"] = scores[i] if i < len(scores) else 0.0
 
 
568
  resources.sort(key=lambda x: x["score"], reverse=True)
 
 
569
  return {"resources": resources[:5], "success": True}
 
570
  except ValueError as ve:
 
571
  raise HTTPException(status_code=400, detail=str(ve))
572
  except Exception as e:
 
573
  print(f"Unexpected error: {e}")
574
  raise HTTPException(status_code=500, detail="An unexpected error occurred.")
575
 
 
 
576
  @app.post("/api/recipes")
577
  async def recipes_endpoint(profile: MedicalProfile):
578
  try:
 
579
  recipe_query = (
580
  f"Recipes foods and meals suitable for someone with: "
581
  f"{profile.conditions} and experiencing {profile.daily_symptoms}"
582
  )
583
  query_text = recipe_query
584
  print(f"Generated query text: {query_text}")
 
 
585
  query_embedding = embed_query_text(query_text)
586
  if query_embedding is None:
587
  raise ValueError("Failed to generate query embedding.")
 
 
588
  embeddings_data = load_recipes_embeddings()
589
  folder_path = 'downloaded_articles/downloaded_articles'
590
  initial_results = query_recipes_embeddings(query_embedding, embeddings_data, n_results=5)
 
592
  raise ValueError("No relevant recipes found.")
593
  print("Initial results (document indices and similarities):")
594
  print(initial_results)
 
 
595
  document_indices = [doc_id for doc_id, _ in initial_results]
596
+ print("Document indices:", document_indices)
597
+ metadata_path = 'recipes_metadata.xlsx'
598
+ metadata = retrieve_metadata(document_indices, metadata_path=metadata_path)
599
+ print(f"Retrieved Metadata: {metadata}")
600
+ response = {
601
+ "metadata": [
602
+ {"id": doc_id, "original_file_name": metadata.get(doc_id, {}).get("original_file_name"), "url": metadata.get(doc_id, {}).get("url")}
603
+ for doc_id in document_indices
604
+ ],
605
+ }
606
+ return response

607
  except ValueError as ve:
 
608
  raise HTTPException(status_code=400, detail=str(ve))
609
  except Exception as e:
 
610
  print(f"Unexpected error: {e}")
611
  raise HTTPException(status_code=500, detail="An unexpected error occurred.")
612
+
 
613
  if not init_success:
614
  print("Warning: Application initialized with partial functionality")
 
 
615
  if __name__ == "__main__":
616
  import uvicorn
617
  uvicorn.run(app, host="0.0.0.0", port=7860)
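
For quick manual verification of the endpoints shown in this revision, a minimal client sketch follows. It is only a rough sketch, not part of app.py: it assumes the app is running locally on port 7860 (as in the uvicorn.run call above), that MedicalProfile requires only the conditions and daily_symptoms fields referenced in the handlers, and the sample field values are placeholders.

# Hedged client sketch -- not part of app.py. Assumes a local instance started
# with `python app.py` (uvicorn on port 7860) and that MedicalProfile only
# needs the `conditions` and `daily_symptoms` fields used by the handlers.
import requests

BASE_URL = "http://localhost:7860"  # assumption: local run, port taken from uvicorn.run

profile = {
    "conditions": "type 2 diabetes",      # placeholder value
    "daily_symptoms": "fatigue, thirst",  # placeholder value
}

# /api/resources returns re-ranked article links with scores.
resources = requests.post(f"{BASE_URL}/api/resources", json=profile, timeout=120)
print(resources.status_code, resources.json())

# /api/recipes in this revision returns recipe metadata (id, original_file_name, url).
recipes = requests.post(f"{BASE_URL}/api/recipes", json=profile, timeout=120)
print(recipes.status_code, recipes.json())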