Spaces:

vermen
/

neuroRAG

Runtime error

App Files Files Community

vermen committed on Oct 12, 2024

Commit

15ed53f

verified ·

1 Parent(s): 061ed1d

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -38

app.py CHANGED Viewed

@@ -1,19 +1,67 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 def respond(
     message,
     history: list[tuple[str, str]],
     system_message,
-    max_tokens,
-    temperature,
-    top_p,
 ):
     messages = [{"role": "system", "content": system_message}]
@@ -22,43 +70,26 @@ def respond(
             messages.append({"role": "user", "content": val[0]})
         if val[1]:
             messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
     ],
 )
 if __name__ == "__main__":
-    demo.launch()

+from llama_index.llms.llama_cpp import LlamaCPP
+from llama_index.core import VectorStoreIndex, StorageContext
+from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
+import pymongo
+from pymongo.mongo_client import MongoClient
+from pymongo.operations import SearchIndexModel
+from llama_index.core import VectorStoreIndex, StorageContext
+import os
+###### load LLM
+model_url = "https://huggingface.co/georgesung/llama3_8b_chat_uncensored/resolve/main/llama3_8b_chat_uncensored_q4_0.gguf"
+llm = LlamaCPP(
+    # You can pass in the URL to a GGUF model file to download it automatically
+    model_url=model_url,
+    # optionally, you can set the path to a pre-downloaded model instead of model_url
+    model_path=None,
+    temperature=0.01,
+    max_new_tokens=1024,
+    # llama2 has a context window of 4096 tokens; the model loaded here is a llama3 variant, but the same conservative limit is kept to allow for some wiggle room
+    context_window=3900,
+    # kwargs to pass to __call__()
+    generate_kwargs={},
+    # kwargs to pass to __init__()
+    # set to at least 1 to use GPU
+    model_kwargs={"n_gpu_layers": 1},
+    verbose=True,
+)
+# load embedding model
+# sentence transformers
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core import Settings
+embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
+Settings.llm = llm
+Settings.embed_model = embed_model
+Settings.node_parser = SentenceSplitter(chunk_size=1024)
+Settings.num_output = 256
+Settings.context_window = 3900
+# Load vector database
+MONGO_URI = "mongodb+srv://groverorgrf:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
+os.environ["MONGODB_URI"] = MONGO_URI
+DB_NAME = "neuroRAG"
+COLLECTION_NAME = "neuro_books"
+# Connect to your Atlas deployment
+mongo_client = MongoClient(MONGO_URI)
+collection = mongo_client[DB_NAME][COLLECTION_NAME]
+#
+vector_store = MongoDBAtlasVectorSearch(mongo_client, db_name=DB_NAME, collection_name=COLLECTION_NAME, vector_index_name="default")
+# Recover index
+index = VectorStoreIndex.from_vector_store(vector_store)
+########### FOR CHAT
 def respond(
     message,
     history: list[tuple[str, str]],
     system_message,
+    top_k,
 ):
     messages = [{"role": "system", "content": system_message}]
             messages.append({"role": "user", "content": val[0]})
         if val[1]:
             messages.append({"role": "assistant", "content": val[1]})
+    #
+    # build the query engine
+    query_engine = index.as_query_engine(similarity_top_k=top_k)
+    #
+    query_str = message
+    response = query_engine.query(query_str)
+    #
+    return response
+#
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
+        gr.Textbox(value="Qual é sua pergunta?", label="System message"),
+        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="top-k"),
     ],
 )
 if __name__ == "__main__":
+    demo.launch()