Update app.py
app.py
CHANGED
@@ -43,9 +43,11 @@ def load_model(model_name):
 
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
     tokenizer.pad_token = tokenizer.eos_token
-
+
+    print(f"Model Loading Time : {time.time() - start_time}.")
     logger.info(f"Model Loading Time : {time.time() - start_time} .")
 
+
     return model, tokenizer
 
 
@@ -76,6 +78,7 @@ def load_db(device, local_embed=False, CHROMA_PATH = './ChromaDB'):
 
     db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)
     logger.info(f"Vector Embeddings and Chroma Database Loading Time : {time.time() - start_time} .")
+    print(f"Vector Embeddings and Chroma Database Loading Time : {time.time() - start_time} .")
     return db
 
 
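A note on the `embeddings` object used in this hunk: its construction sits outside the diff. A minimal sketch of how a persisted Chroma store is typically opened with a local embedding function, assuming LangChain's `HuggingFaceEmbeddings`; the embedding model name and device value are illustrative, not taken from app.py.

# Hypothetical setup for the `embeddings` passed to Chroma above;
# the embedding model name and device handling are assumptions.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

CHROMA_PATH = './ChromaDB'
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # assumed embedding model
    model_kwargs={"device": "cpu"},                        # load_db() receives a device argument
)
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)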
@@ -105,6 +108,7 @@ def fetch_context(db, model, model_name, query, template, use_compressor=True):
     make sure that returned compressed context is relevant to the query.
     """
     if use_compressor:
+        start_time = time.time()
         if model_name=='llama':
             compressor = LLMChainExtractor.from_llm(model)
             compressor.llm_chain.prompt.template = template['llama_rag_template']
@@ -121,7 +125,7 @@ def fetch_context(db, model, model_name, query, template, use_compressor=True):
         #logger.info(f"User Query : {query}")
         compressed_docs = compression_retriever.get_relevant_documents(query)
         #logger.info(f"Retrieved Compressed Docs : {compressed_docs}")
-
+        print(f"Compressed context Generation Time: {time.time() - start_time}")
         return compressed_docs
 
     docs = db.max_marginal_relevance_search(query)
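The two fetch_context hunks time `compression_retriever.get_relevant_documents(query)`, but the retriever's construction falls between them and is not shown. A minimal sketch of the usual LangChain contextual-compression wiring, reusing the `db` and `model` objects from the surrounding code; only `LLMChainExtractor.from_llm(model)` is confirmed by the diff, the search type is an assumption chosen to mirror the MMR fallback branch.

# Hypothetical wiring of compression_retriever; settings are assumptions,
# not copied from app.py.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor.from_llm(model)           # LLM trims each doc down to query-relevant text
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=db.as_retriever(search_type="mmr"),   # assumed search type
)
compressed_docs = compression_retriever.get_relevant_documents(query)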
@@ -145,6 +149,8 @@ def llm_chain_with_context(model, model_name, query, context, template):
     """
     formated_context = format_context(context)
     # Give a precise answer to the question based on the context. Don't be verbose.
+    start_chain_time = time.time()
+
     if model_name=='llama':
         prompt_template = PromptTemplate(input_variables=['context', 'user_query'], template = template['llama_prompt_template'])
         llm_chain = LLMChain(llm=model, prompt=prompt_template)
@@ -152,8 +158,15 @@ def llm_chain_with_context(model, model_name, query, context, template):
     elif model_name=='mistral':
         prompt_template = PromptTemplate(input_variables=['context', 'user_query'], template = template['prompt_template'])
         llm_chain = LLMChain(llm=HF_pipeline_model, prompt=prompt_template)
+
+    print(f"LLMChain Setup Time: {time.time() - start_chain_time}")
+
+    start_inference_time = time.time()
 
     output = llm_chain.predict(user_query=query, context=formated_context)
+
+    print(f"LLM Inference Time: {time.time() - start_inference_time}")
+
     return output
 
 
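The mistral branch builds its chain around `HF_pipeline_model`, which is defined elsewhere in the file. A minimal sketch of how such an object is commonly created, assuming a `transformers` text-generation pipeline wrapped in LangChain's `HuggingFacePipeline`; the model, tokenizer, and generation settings are assumptions.

# Hypothetical construction of HF_pipeline_model; none of these settings
# are taken from app.py.
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

hf_pipe = pipeline(
    "text-generation",
    model=model,              # the causal LM returned by load_model()
    tokenizer=tokenizer,
    max_new_tokens=512,       # assumed generation budget
    return_full_text=False,   # return only the completion, not the prompt
)
HF_pipeline_model = HuggingFacePipeline(pipeline=hf_pipe)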
@@ -170,6 +183,7 @@ def generate_response(query, model, template):
     my_bar.progress(0.6, "Generating Answer. Please wait.")
     response = llm_chain_with_context(model, model_name, query, context, template)
 
+    print(f"Total Execution Time: {time.time() - start_time}")
     logger.info(f"Total Execution Time: {time.time() - start_time}")
 
     my_bar.progress(0.9, "Post Processing. Please wait.")
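Overall, the commit adds a `print` of the same `time.time()` delta next to each existing `logger.info`, plus three new timers (`start_time` in fetch_context, `start_chain_time` and `start_inference_time` in llm_chain_with_context). A small helper could keep the console and log output in sync instead of duplicating each f-string; a minimal sketch using only the standard library and the module's existing `logger` (the helper name `timed` is illustrative, not part of this commit):

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Print and log the elapsed time of the wrapped block, mirroring the
    # print/logger.info pairs added in this commit.
    start = time.time()
    try:
        yield
    finally:
        elapsed = time.time() - start
        print(f"{label}: {elapsed}")
        logger.info(f"{label}: {elapsed}")

# Usage, e.g. inside llm_chain_with_context():
# with timed("LLM Inference Time"):
#     output = llm_chain.predict(user_query=query, context=formated_context)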
|