Spaces:

thechaiexperiment
/

TeaRAG

Sleeping

App Files Files Community

thechaiexperiment committed on Dec 11, 2024

Commit

2a5cca5

1 Parent(s): 9b4d106

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -11

app.py CHANGED Viewed

@@ -19,10 +19,12 @@ from transformers import (
 import pandas as pd
 import time
 class CustomUnpickler(pickle.Unpickler):
     def persistent_load(self, pid):
         try:
-            # Handle string encoding issues by decoding and re-encoding as ASCII
             if isinstance(pid, bytes):
                 pid = pid.decode('utf-8', errors='ignore')
             pid = str(pid).encode('ascii', errors='ignore').decode('ascii')
@@ -39,11 +41,9 @@ def safe_load_embeddings():
             unpickler = CustomUnpickler(file)
             embeddings_data = unpickler.load()
-            # Verify the data structure
             if not isinstance(embeddings_data, dict):
                 raise ValueError("Loaded data is not a dictionary")
-            # Verify the embeddings format
             first_key = next(iter(embeddings_data))
             if not isinstance(embeddings_data[first_key], (np.ndarray, list)):
                 raise ValueError("Embeddings are not in the expected format")
@@ -54,6 +54,7 @@ def safe_load_embeddings():
         print(f"Error loading embeddings: {str(e)}")
         return None
 class GlobalModels:
     embedding_model = None
     cross_encoder = None
@@ -71,8 +72,25 @@ class GlobalModels:
     bio_tokenizer = None
     bio_model = None
 global_models = GlobalModels()
 @app.on_event("startup")
 async def load_models():
     """Initialize all models and data on startup"""
@@ -86,12 +104,36 @@ async def load_models():
             raise HTTPException(status_code=500, detail="Failed to load embeddings data")
         global_models.embeddings_data = embeddings_data
-        # Continue loading other models only if embeddings loaded successfully
         global_models.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
         global_models.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
-        # Load remaining models...
-        # (rest of your model loading code remains the same)
         print("All models loaded successfully")
@@ -99,11 +141,6 @@ async def load_models():
         print(f"Error during startup: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Failed to initialize application: {str(e)}")
-# Rest of your FastAPI application code remains the same...
-@app.get("/")
-async def root():
-    return {"message": "Server is running"}
 # Models and data structures to store loaded models
 class GlobalModels:
@@ -356,6 +393,10 @@ async def get_answer(input_data: QueryInput):
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)

 import pandas as pd
 import time
+# Initialize FastAPI app first
+app = FastAPI()
 class CustomUnpickler(pickle.Unpickler):
     def persistent_load(self, pid):
         try:
             if isinstance(pid, bytes):
                 pid = pid.decode('utf-8', errors='ignore')
             pid = str(pid).encode('ascii', errors='ignore').decode('ascii')
             unpickler = CustomUnpickler(file)
             embeddings_data = unpickler.load()
             if not isinstance(embeddings_data, dict):
                 raise ValueError("Loaded data is not a dictionary")
             first_key = next(iter(embeddings_data))
             if not isinstance(embeddings_data[first_key], (np.ndarray, list)):
                 raise ValueError("Embeddings are not in the expected format")
         print(f"Error loading embeddings: {str(e)}")
         return None
+# Models and data structures
 class GlobalModels:
     embedding_model = None
     cross_encoder = None
     bio_tokenizer = None
     bio_model = None
+# Initialize global models
 global_models = GlobalModels()
+# Download NLTK data
+nltk.download('punkt')
+# Pydantic models for request validation
+class QueryInput(BaseModel):
+    query_text: str
+    language_code: int  # 0 for Arabic, 1 for English
+    query_type: str    # "profile" or "question"
+    previous_qa: Optional[List[Dict[str, str]]] = None
+class DocumentResponse(BaseModel):
+    title: str
+    url: str
+    text: str
+    score: float
 @app.on_event("startup")
 async def load_models():
     """Initialize all models and data on startup"""
             raise HTTPException(status_code=500, detail="Failed to load embeddings data")
         global_models.embeddings_data = embeddings_data
+        # Load remaining models
         global_models.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=512)
         global_models.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
+        # Load BART models
+        global_models.tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")
+        global_models.model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
+        # Load Orca model
+        model_name = "M4-ai/Orca-2.0-Tau-1.8B"
+        global_models.tokenizer_f = AutoTokenizer.from_pretrained(model_name)
+        global_models.model_f = AutoModelForCausalLM.from_pretrained(model_name)
+        # Load translation models
+        global_models.ar_to_en_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
+        global_models.ar_to_en_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ar-en")
+        global_models.en_to_ar_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
+        global_models.en_to_ar_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ar")
+        # Load Medical NER models
+        global_models.bio_tokenizer = AutoTokenizer.from_pretrained("blaze999/Medical-NER")
+        global_models.bio_model = AutoModelForTokenClassification.from_pretrained("blaze999/Medical-NER")
+        # Load URL mapping data
+        try:
+            df = pd.read_excel('finalcleaned_excel_file.xlsx')
+            global_models.file_name_to_url = {f"article_{index}.html": url for index, url in enumerate(df['Unnamed: 0'])}
+        except Exception as e:
+            print(f"Error loading URL mapping data: {e}")
+            raise HTTPException(status_code=500, detail="Failed to load URL mapping data.")
         print("All models loaded successfully")
         print(f"Error during startup: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Failed to initialize application: {str(e)}")
 # Models and data structures to store loaded models
 class GlobalModels:
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+@app.get("/")
+async def root():
+    return {"message": "Server is running"}
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)