Update app.py
app.py CHANGED
@@ -1,44 +1,29 @@
+# version for gradio
 import gradio as gr
 from llama_index.llms.llama_cpp import LlamaCPP
-from llama_index.core import VectorStoreIndex, StorageContext
 from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
 import pymongo
 from pymongo.mongo_client import MongoClient
 from pymongo.operations import SearchIndexModel
 from llama_index.core import VectorStoreIndex, StorageContext
+
+from llama_index.core.vector_stores.types import VectorStoreQuery
+
+from langchain_nomic.embeddings import NomicEmbeddings
 import os
 ###### load LLM
-
-
-
-# You can pass in the URL to a GGML model to download it automatically
-model_url=model_url,
-# optionally, you can set the path to a pre-downloaded model instead of model_url
-model_path=None,
-temperature=0.01,
-max_new_tokens=256, # could be larger but requires more time
-# llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
-context_window=3900,
-# kwargs to pass to __call__()
-generate_kwargs={},
-# kwargs to pass to __init__()
-# set to at least 1 to use GPU
-model_kwargs={"n_gpu_layers": 1},
-verbose=True,
-)
-# load embedding model
-# sentence transformers
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+os.system("ollama pull llama3.2:3b-instruct-fp16")
+# LLM
+from langchain_ollama import ChatOllama
 
-
-
+local_llm = "llama3.2:3b-instruct-fp16"
+llm = ChatOllama(model=local_llm, temperature=0)
+llm_json_mode = ChatOllama(model=local_llm, temperature=0, format="json")
 
-
-
-
-
-Settings.num_output = 256
-Settings.context_window = 3900
+# load embedding model
+# sentence transformers
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+embed_model = NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local")
 
 # Load vector database
 
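This hunk drops the in-process LlamaCPP configuration and the HuggingFace embedder in favor of a local Ollama model (pulled at startup via `os.system`) and Nomic embeddings in local inference mode. Below is a minimal sketch of how the two new pieces behave on their own, assuming a running `ollama` daemon with the model already pulled; the sample strings are illustrative only:

```python
# Hedged sketch: exercising the Ollama chat model and Nomic embedder
# introduced by this change (not part of app.py itself).
from langchain_ollama import ChatOllama
from langchain_nomic.embeddings import NomicEmbeddings

llm = ChatOllama(model="llama3.2:3b-instruct-fp16", temperature=0)
embed_model = NomicEmbeddings(model="nomic-embed-text-v1.5", inference_mode="local")

# invoke() returns an AIMessage; the generated text lives in .content
reply = llm.invoke("Answer in one sentence: what is retrieval-augmented generation?")
print(reply.content)

# embed_query() returns a list of floats for a single string
vector = embed_model.embed_query("what is retrieval-augmented generation?")
print(len(vector))
```

The `llm_json_mode` variant (`format="json"`) constrains Ollama to emit JSON output, which suits the router-style component hinted at by the `### Router` comment in the next hunk.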
@@ -52,10 +37,39 @@ collection = mongo_client[DB_NAME][COLLECTION_NAME]
 #
 
 vector_store = MongoDBAtlasVectorSearch(mongo_client, db_name=DB_NAME, collection_name=COLLECTION_NAME, vector_index_name="default")
+#
+# print(query_results.nodes[0].text)
+
+# COMPONENT
+### Router
+import json
+from langchain_core.messages import HumanMessage, SystemMessage
+
+# Prompt
+### Generate
 
-#
-
+# Prompt
+rag_prompt = """Você é um assistente multilíngue para tarefas de resposta a perguntas.
 
+Aquí está o contexto a ser usado para responder à pergunta:
+
+{context}
+
+Pense cuidadosamente acerca do contexto de acima.
+
+Agora, revise a pergunta do usuario:
+
+{question}
+
+Forneça uma resposta a essas perguntas usando apenas o contexto acima.
+
+Mantenha sua resposta formal e concisa.
+
+Resposta:"""
+
+# Post-processing
+def format_docs(nodes):
+    return "\n\n".join(doc.text for doc in nodes)
 
 ########### FOR CHAT
 def respond(
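The store reuses the collection's existing "default" Atlas Vector Search index, and `SearchIndexModel` is imported but never used in the hunks shown. If that index still has to be created, it is typically done along these lines; the field path, dimension count, similarity metric, and the `MONGODB_URI` environment variable are assumptions, not taken from app.py, and must match the embeddings actually written to the collection (nomic-embed-text-v1.5 emits 768-dimensional vectors by default):

```python
# Hedged sketch: creating the "default" Atlas Vector Search index with pymongo.
import os
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel

client = MongoClient(os.environ["MONGODB_URI"])     # connection string assumed to be in the env
collection = client["your_db"]["your_collection"]   # mirrors app.py's DB_NAME / COLLECTION_NAME

index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",    # assumed field written by MongoDBAtlasVectorSearch
                "numDimensions": 768,   # assumed nomic-embed-text-v1.5 default size
                "similarity": "cosine",
            }
        ]
    },
    name="default",
    type="vectorSearch",
)
collection.create_search_index(model=index_model)
```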
@@ -73,12 +87,21 @@ def respond(
 messages.append({"role": "assistant", "content": val[1]})
 #
 # build the query engine
-
-#
+####
 query_str = message
-
+question = query_str
+#
+query_embedding = embed_model.embed_query(query_str)
+vector_store_query = VectorStoreQuery(query_embedding, similarity_top_k=top_k)
+# Recover index
+query_results = vector_store.query(vector_store_query)
+docs = query_results.nodes
+docs_txt = format_docs(docs)
+rag_prompt_formatted = rag_prompt.format(context=docs_txt, question=question)
+#print(rag_prompt_formatted)
+generation = llm.invoke([HumanMessage(content=rag_prompt_formatted)])
 #
-return
+return generation.content
 #
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
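Inside respond(), the change embeds the incoming message, runs a top-k similarity query against the Atlas store, joins the retrieved node text via format_docs, fills rag_prompt, and returns the model's answer instead of the previous bare `return`. Below is a sketch of the same retrieve-then-generate flow as a standalone helper; it reuses the module-level embed_model, vector_store, rag_prompt, format_docs and llm from above, and turns top_k (used but not defined in the lines shown) into an explicit parameter:

```python
# Hedged sketch: the retrieve-then-generate pipeline respond() now runs,
# factored out for clarity (not part of the commit itself).
from llama_index.core.vector_stores.types import VectorStoreQuery
from langchain_core.messages import HumanMessage

def answer(question: str, top_k: int = 4) -> str:
    # 1. embed the question with the same model used at indexing time
    query_embedding = embed_model.embed_query(question)
    # 2. top-k similarity search against the Atlas vector index
    result = vector_store.query(VectorStoreQuery(query_embedding=query_embedding, similarity_top_k=top_k))
    # 3. concatenate retrieved chunks and fill the RAG prompt
    prompt = rag_prompt.format(context=format_docs(result.nodes), question=question)
    # 4. single-turn generation; the answer text is in .content
    return llm.invoke([HumanMessage(content=prompt)]).content
```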
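The trailing context lines reference the stock Gradio ChatInterface docstring, so the unchanged tail of app.py presumably wires respond() into the UI roughly as in the default Space template. A minimal sketch; the additional inputs and their default values are assumptions, not shown in the diff:

```python
# Hedged sketch of the Gradio wiring implied by the unchanged tail of app.py.
import gradio as gr

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=256, step=1, label="Max new tokens"),
    ],
)

if __name__ == "__main__":
    demo.launch()
```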