{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2c535ad7-7846-4bef-8ba8-33e182490c3d",
   "metadata": {},
   "source": [
    "# Fine-tune an embedding model on generated QA pairs\n",
    "\n",
    "This notebook splits the PDFs listed in `TRAIN_FILES` and `VAL_FILES` into nodes, builds QA embedding datasets from them with `generate_qa_embedding_pairs` (backed by an OpenAI model), and fine-tunes a sentence-transformers embedding model on the training dataset, passing the validation dataset to the engine as `val_dataset`.\n",
    "\n",
    "The generated datasets are cached as JSON. The LLM is only called for a dataset whose cache file is missing, which requires OpenAI credentials to be available to `llama_index`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca2c990f-5215-4ab9-8143-1d79db28edc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import llama_index\n",
    "from llama_index import SimpleDirectoryReader\n",
    "from llama_index.finetuning import (\n",
    "    EmbeddingQAFinetuneDataset,\n",
    "    SentenceTransformersFinetuneEngine,\n",
    "    generate_qa_embedding_pairs,\n",
    ")\n",
    "from llama_index.llms import OpenAI\n",
    "from llama_index.node_parser import SentenceSplitter\n",
    "from llama_index.schema import MetadataMode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abde5e6c-3474-460c-9fac-4f3352c38b53",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Record the library version this notebook was run with.\n",
    "print(llama_index.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7dc65d7b-3cdb-4513-b09f-f7406ad59b35",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "978cf71f-1ce7-4598-92fe-18fe22ca37c6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Source documents.\n",
    "TRAIN_FILES = [\"../raw_documents/HI_Knowledge_Base.pdf\"]\n",
    "VAL_FILES = [\"../raw_documents/HI Chapter Summary Version 1.3.pdf\"]\n",
    "\n",
    "# Cache files for the generated QA datasets.\n",
    "TRAIN_CORPUS_FPATH = \"../data/train_corpus.json\"\n",
    "VAL_CORPUS_FPATH = \"../data/val_corpus.json\"\n",
    "\n",
    "# Models.\n",
    "QA_LLM_MODEL = \"gpt-3.5-turbo-1106\"  # OpenAI model used to generate the QA pairs\n",
    "EMBED_MODEL_ID = \"BAAI/bge-small-en-v1.5\"  # base embedding model to fine-tune\n",
    "MODEL_OUTPUT_PATH = \"test_model\"  # passed to the engine as model_output_path"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "663cd20e-c16e-4dda-924e-5f60eb25a772",
   "metadata": {},
   "source": [
    "## Load and chunk documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26f614c8-eb45-4cc1-b067-2c7299587982",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_corpus(files, verbose=False):\n",
    "    \"\"\"Read `files` and split them into nodes.\n",
    "\n",
    "    Args:\n",
    "        files: List of file paths passed to `SimpleDirectoryReader`.\n",
    "        verbose: If True, print progress messages and show the parser's\n",
    "            progress bar.\n",
    "\n",
    "    Returns:\n",
    "        The nodes produced by a default-configured `SentenceSplitter` from\n",
    "        the loaded documents.\n",
    "    \"\"\"\n",
    "    if verbose:\n",
    "        print(f\"Loading files {files}\")\n",
    "\n",
    "    reader = SimpleDirectoryReader(input_files=files)\n",
    "    docs = reader.load_data()\n",
    "    if verbose:\n",
    "        print(f\"Loaded {len(docs)} docs\")\n",
    "\n",
    "    parser = SentenceSplitter()\n",
    "    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)\n",
    "\n",
    "    if verbose:\n",
    "        print(f\"Parsed {len(nodes)} nodes\")\n",
    "\n",
    "    return nodes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a6ba52e5-4d7f-4c30-8979-8d84a1bc3ca4",
   "metadata": {},
   "source": [
    "## Build or load the QA datasets\n",
    "\n",
    "Each dataset is cached independently: a dataset is only regenerated when its own JSON file is missing, and it is written to disk as soon as it has been generated."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12527049-a5cb-423c-8de5-099aee970c85",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_or_build_dataset(files, corpus_fpath, llm_model, verbose=True):\n",
    "    \"\"\"Return the QA dataset cached at `corpus_fpath`, generating it if absent.\n",
    "\n",
    "    Args:\n",
    "        files: List of document paths to build the dataset from.\n",
    "        corpus_fpath: JSON file used as the cache for the dataset.\n",
    "        llm_model: Name of the OpenAI model used to generate the QA pairs.\n",
    "        verbose: Forwarded to `load_corpus`.\n",
    "\n",
    "    Returns:\n",
    "        An `EmbeddingQAFinetuneDataset`, either read from `corpus_fpath` or\n",
    "        freshly generated and saved there.\n",
    "    \"\"\"\n",
    "    if os.path.exists(corpus_fpath):\n",
    "        return EmbeddingQAFinetuneDataset.from_json(corpus_fpath)\n",
    "\n",
    "    # Create the output folder before generating, so a missing directory\n",
    "    # cannot make the save fail after the LLM calls have already been made.\n",
    "    out_dir = os.path.dirname(corpus_fpath)\n",
    "    if out_dir:\n",
    "        os.makedirs(out_dir, exist_ok=True)\n",
    "\n",
    "    nodes = load_corpus(files, verbose=verbose)\n",
    "    # The LLM client is only constructed when generation is actually needed.\n",
    "    dataset = generate_qa_embedding_pairs(\n",
    "        llm=OpenAI(model=llm_model), nodes=nodes\n",
    "    )\n",
    "    dataset.save_json(corpus_fpath)\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84cc4308-8ac4-4eba-9478-b81d5b645c48",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_dataset = load_or_build_dataset(TRAIN_FILES, TRAIN_CORPUS_FPATH, QA_LLM_MODEL)\n",
    "val_dataset = load_or_build_dataset(VAL_FILES, VAL_CORPUS_FPATH, QA_LLM_MODEL)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c3399443-5936-4dfe-b0ec-821d222e734d",
   "metadata": {},
   "source": [
    "## Fine-tune the embedding model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f17c832-e9ae-477b-8bf7-a9c8410f1ed8",
   "metadata": {},
   "outputs": [],
   "source": [
    "finetune_engine = SentenceTransformersFinetuneEngine(\n",
    "    train_dataset,\n",
    "    model_id=EMBED_MODEL_ID,\n",
    "    model_output_path=MODEL_OUTPUT_PATH,\n",
    "    val_dataset=val_dataset,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6498d0b-da9a-4f7f-8c85-c9bf4d772c72",
   "metadata": {},
   "outputs": [],
   "source": [
    "finetune_engine.finetune()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e057b405-aa0e-4e78-91e0-9bf40f01c1a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "embed_model = finetune_engine.get_finetuned_model()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72d9f97a-0902-4e65-8459-b34613e419f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "embed_model"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}