{
 "cells": [
  { "cell_type": "code", "execution_count": null, "id": "aaa7e27e-7632-4024-9764-a4404293fd05", "metadata": {}, "outputs": [], "source": [ "import openai\n", "from llama_index import SimpleDirectoryReader\n", "from llama_index import Document\n", "from llama_index import VectorStoreIndex\n", "from llama_index import ServiceContext\n", "from llama_index.llms import OpenAI\n", "\n", "from llama_index.embeddings import HuggingFaceEmbedding\n", "from llama_index import StorageContext, load_index_from_storage\n", "\n", "import time" ] },
  { "cell_type": "markdown", "id": "ca5606cf-16ec-4ae1-a057-25fab105d545", "metadata": {}, "source": [ "### Run Indexing in memory without using vectorstore" ] },
  { "cell_type": "code", "execution_count": null, "id": "a2d79bc4-1c0f-4354-ae6c-fe3697a28c9d", "metadata": {}, "outputs": [], "source": [ "# Start the clock; the elapsed time is printed once the index has been built.\n", "start_time = time.time()" ] },
  { "cell_type": "code", "execution_count": null, "id": "219bf47e-2df4-46e2-b6c4-b0406820fe8c", "metadata": {}, "outputs": [], "source": [ "# Read the PDF, then merge the text of every loaded document into a single Document.\n", "documents = SimpleDirectoryReader(input_files=[\"../raw_documents/HI_Knowledge_Base.pdf\"]).load_data()\n", "document = Document(text=\"\\n\\n\".join([doc.text for doc in documents]))" ] },
  { "cell_type": "code", "execution_count": null, "id": "ad4edb79-f29b-4556-9e69-afa8917568c7", "metadata": {}, "outputs": [], "source": [ "# Not used by default: the ServiceContext below is built with llm=None.\n", "# It only matters if the commented-out llm=llm variant is enabled instead.\n", "llm = OpenAI(model=\"gpt-3.5-turbo-1106\", temperature=0.1)" ] },
  { "cell_type": "code", "execution_count": null, "id": "f4911316-bc1b-4e65-a66e-c83e70c6fa00", "metadata": {}, "outputs": [], "source": [ "embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "942f4406-4b24-4c4e-a803-e0a9a5f31f6c", "metadata": {}, "outputs": [], "source": [ "# service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)\n", "service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)\n", "# Build the index from the single merged document.\n", "index = VectorStoreIndex.from_documents([document], service_context=service_context)" ] },
  { "cell_type": "code", "execution_count": null, "id": "630afd70-7aad-4049-950e-096976e4f7fa", "metadata": {}, "outputs": [], "source": [ "indexing_cost = (time.time() - start_time) / 60  # seconds -> minutes\n", "print(f\"indexing time: {indexing_cost:.1f} mins\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "36f81a58-4249-43bd-97e2-781b1d08b6d9", "metadata": {}, "outputs": [], "source": [ "# Save the index to disk; the next section reloads it from this directory.\n", "index.storage_context.persist(persist_dir=\"../models/llama_index_json/\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "dca06682-8fa1-4cf2-81a9-4c7d223d2032", "metadata": {}, "outputs": [], "source": [ "query_engine = index.as_query_engine()" ] },
  { "cell_type": "code", "execution_count": null, "id": "f2553fd3-dc88-4a75-a85c-ab095ae3fba8", "metadata": {}, "outputs": [], "source": [ "response = query_engine.query(\"What is medishield\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "c8c6f966-6196-4af2-aaac-042e327bb046", "metadata": {}, "outputs": [], "source": [ "print(str(response))" ] },
  { "cell_type": "markdown", "id": "94977d29-431f-4704-a93f-3b96a1e70cd0", "metadata": {}, "source": [ "### Load index from persisted file without using vectorstore" ] },
  { "cell_type": "code", "execution_count": null, "id": "c5348da7-59e8-45d1-afee-d50ff58549cb", "metadata": {}, "outputs": [], "source": [ "# Restart the clock: this measurement covers the embedding-model setup and the index load below.\n", "start_time = time.time()" ] },
  { "cell_type": "code", "execution_count": null, "id": "0d51d2d5-9523-46dd-9bc4-b013bba42735", "metadata": {}, "outputs": [], "source": [ "embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "c2a72bfe-8e3e-4b26-8cbe-4e55b1fbab34", "metadata": {}, "outputs": [], "source": [ "service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)" ] },
  { "cell_type": "code", "execution_count": null, "id": "7a460ce0-a3a2-47ec-95bb-472c61d9f23a", "metadata": {}, "outputs": [], "source": [ "# Point the storage context at the directory written by persist() above.\n", "storage_context = StorageContext.from_defaults(persist_dir=\"../models/llama_index_json/\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "1331b3e5-7b3a-4f0f-80f0-f8a6810431af", "metadata": {}, "outputs": [], "source": [ "index = load_index_from_storage(storage_context=storage_context, service_context=service_context)" ] },
  { "cell_type": "code", "execution_count": null, "id": "1e9b0825-eff8-42cd-b014-0f612b268d42", "metadata": {}, "outputs": [], "source": [ "load_indexing_cost = (time.time() - start_time) / 60  # seconds -> minutes\n", "print(f\"Load indexing time: {load_indexing_cost:.1f} mins\")" ] },
  { "cell_type": "markdown", "id": "f77c3494-65a6-44fc-ab6e-de64153300f0", "metadata": {}, "source": [ "### Instantiate query engine" ] },
  { "cell_type": "code", "execution_count": null, "id": "891f6fe1-ef71-44f8-b6a8-03d8abdcadae", "metadata": {}, "outputs": [], "source": [ "query_engine = index.as_query_engine()" ] },
  { "cell_type": "code", "execution_count": null, "id": "a6ad3fbb-8bb0-4115-88e5-4b272b1ba422", "metadata": { "scrolled": true }, "outputs": [], "source": [ "response = query_engine.query(\"What is medishield\")\n", "print(str(response))" ] },
  { "cell_type": "markdown", "id": "8acae3ed-2953-45a3-aba9-0327b6ae3679", "metadata": {}, "source": [ "### ChromaDB method - create vectorstore" ] },
  { "cell_type": "code", "execution_count": null, "id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97", "metadata": {}, "outputs": [], "source": [ "import chromadb\n", "from llama_index import VectorStoreIndex, SimpleDirectoryReader\n", "from llama_index.vector_stores import ChromaVectorStore\n", "from llama_index.storage.storage_context import StorageContext\n", "from llama_index import ServiceContext\n", "from llama_index import Document\n", "\n", "from llama_index.embeddings import HuggingFaceEmbedding\n", "\n", "import time" ] },
  { "cell_type": "code", "execution_count": null, "id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb", "metadata": {}, "outputs": [], "source": [ "# load some documents\n", "documents = SimpleDirectoryReader(input_files=[\"../raw_documents/HI_Knowledge_Base.pdf\"]).load_data()\n", "# NOTE(review): `document` is not used in this section; the index below is built from `documents`.\n", "document = Document(text=\"\\n\\n\".join([doc.text for doc in documents]))" ] },
  { "cell_type": "code", "execution_count": null, "id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d", "metadata": {}, "outputs": [], "source": [ "# initialize client, setting path to save data\n", "db = chromadb.PersistentClient(path=\"../models/chroma_db\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed", "metadata": {}, "outputs": [], "source": [ "# create collection\n", "chroma_collection = db.get_or_create_collection(\"quickstart\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672", "metadata": {}, "outputs": [], "source": [ "# assign chroma as the vector_store to the context\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)" ] },
  { "cell_type": "code", "execution_count": null, "id": "b4adc76c-b18d-4a3f-8563-127074491ba9", "metadata": {}, "outputs": [], "source": [ "embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "5289f0f9-bce2-4a2f-9428-b99906b78622", "metadata": {}, "outputs": [], "source": [ "service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)" ] },
  { "cell_type": "code", "execution_count": null, "id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4", "metadata": {}, "outputs": [], "source": [ "storage_context = StorageContext.from_defaults(vector_store=vector_store)" ] },
  { "cell_type": "code", "execution_count": null, "id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05", "metadata": {}, "outputs": [], "source": [ "start_time = time.time()" ] },
  { "cell_type": "code", "execution_count": null, "id": "82fc724e-4b03-433d-ada4-d451e13e25e9", "metadata": {}, "outputs": [], "source": [ "# create your index\n", "# The collection lives on disk, so calling from_documents on every run would\n", "# store the same nodes again. Embed only while the collection is still empty;\n", "# otherwise attach to the vectors that are already there.\n", "# (Delete the collection or ../models/chroma_db to force a rebuild.)\n", "if chroma_collection.count() == 0:\n", "    index = VectorStoreIndex.from_documents(\n", "        documents, service_context=service_context, storage_context=storage_context\n", "    )\n", "else:\n", "    index = VectorStoreIndex.from_vector_store(\n", "        vector_store, service_context=service_context\n", "    )" ] },
  { "cell_type": "code", "execution_count": null, "id": "1ff50855-e043-4736-87c7-0ef8c11bbb26", "metadata": {}, "outputs": [], "source": [ "indexing_cost = (time.time() - start_time) / 60  # seconds -> minutes\n", "print(f\"Indexing time: {indexing_cost:.1f} mins\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "ccbfa64b-6e09-40c4-b6bf-18055eaa6735", "metadata": {}, "outputs": [], "source": [ "# create a query engine and query\n", "query_engine = index.as_query_engine()" ] },
  { "cell_type": "code", "execution_count": null, "id": "3f16351f-7c28-4b8f-9050-3c90a40998c5", "metadata": {}, "outputs": [], "source": [ "retriever = index.as_retriever()" ] },
  { "cell_type": "code", "execution_count": null, "id": "3d07f753-1643-4c18-b368-6c55f4a7968a", "metadata": {}, "outputs": [], "source": [ "# NOTE(review): this query looks unrelated to the indexed PDF (HI_Knowledge_Base.pdf) - confirm it is intentional.\n", "r_list = retriever.retrieve(\"What is the meaning of life?\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "74b4367a-185f-42cf-9951-48325378adf0", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Inspect the second retrieved node; assumes the retriever returned at least two.\n", "r_list[1].to_dict()" ] },
  { "cell_type": "code", "execution_count": null, "id": "0d058fa9-2608-4508-9a6c-dd8ff9387987", "metadata": {}, "outputs": [], "source": [ "response = query_engine.query(\"What is the meaning of life?\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "ec91bf3e-ce05-4183-9270-e53c1a21ccb4", "metadata": {}, "outputs": [], "source": [ "print(response)" ] },
  { "cell_type": "markdown", "id": "a7fc01f6-4738-415b-a96b-afd6cf8d789a", "metadata": {}, "source": [ "### ChromaDB method - load vectorstore" ] },
  { "cell_type": "code", "execution_count": null, "id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5", "metadata": {}, "outputs": [], "source": [ "import chromadb\n", "from llama_index import VectorStoreIndex, SimpleDirectoryReader\n", "from llama_index.vector_stores import ChromaVectorStore\n", "from llama_index.storage.storage_context import StorageContext\n", "from llama_index import ServiceContext\n", "from llama_index import Document\n", "\n", "from llama_index.embeddings import HuggingFaceEmbedding\n", "\n", "import time" ] },
  { "cell_type": "code", "execution_count": null, "id": "11ff0889-ef46-4447-ae2e-6fcaaf0733ec", "metadata": {}, "outputs": [], "source": [ "embed_model = HuggingFaceEmbedding(model_name=\"BAAI/bge-small-en-v1.5\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "ffdbf912-7eb2-429a-a98e-5e3a9e8fe8bd", "metadata": {}, "outputs": [], "source": [ "service_context = ServiceContext.from_defaults(llm=None, embed_model=embed_model)" ] },
  { "cell_type": "code", "execution_count": null, "id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba", "metadata": {}, "outputs": [], "source": [ "db = chromadb.PersistentClient(path=\"../models/chroma_db\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "1b385644-b46e-4d13-88fa-9f4af39db405", "metadata": {}, "outputs": [], "source": [ "chroma_collection = db.get_or_create_collection(\"quickstart\")" ] },
  { "cell_type": "code", "execution_count": null, "id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2", "metadata": {}, "outputs": [], "source": [ "# assign chroma as the vector_store to the context\n", "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n", "storage_context = StorageContext.from_defaults(vector_store=vector_store)" ] },
  { "cell_type": "code", "execution_count": null, "id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae", "metadata": {}, "outputs": [], "source": [ "# load the index from the existing vector store\n", "# from_vector_store builds its own StorageContext from `vector_store`, so\n", "# storage_context must not be passed again here (it would be supplied twice).\n", "index = VectorStoreIndex.from_vector_store(\n", "    vector_store=vector_store, service_context=service_context\n", ")" ] },
  { "cell_type": "code", "execution_count": null, "id": "78a5c9f6-a63b-40d6-a43f-0cf4bb7b15a6", "metadata": {}, "outputs": [], "source": [ "# create a query engine\n", "query_engine = index.as_query_engine()" ] },
  { "cell_type": "code", "execution_count": null, "id": "7bb7c21a-7461-40c1-87a7-4a1f92f70153", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# NOTE(review): this query looks unrelated to the HI knowledge base indexed in the previous section - confirm it is intentional.\n", "response = query_engine.query(\"What is llama2?\")\n", "print(response)" ] }
 ],
 "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" } },
 "nbformat": 4,
 "nbformat_minor": 5
}