In [None]:
import json, os

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter
from llama_index.schema import MetadataMode

In [None]:
from llama_index.finetuning import (
 generate_qa_embedding_pairs,
 EmbeddingQAFinetuneDataset,
)
from llama_index.finetuning import SentenceTransformersFinetuneEngine

In [None]:
from llama_index.llms import OpenAI

In [None]:
# Print the installed llama_index version so the environment used for this
# run is recorded alongside the notebook output.
import llama_index
print(llama_index.__version__)

In [None]:
# Source PDFs that are read and split into nodes (one list per split).
TRAIN_FILES = [
    "../raw_documents/HI_Knowledge_Base.pdf",
]
VAL_FILES = [
    "../raw_documents/HI Chapter Summary Version 1.3.pdf",
]

# JSON files used to persist / reload the generated QA datasets.
TRAIN_CORPUS_FPATH = "../data/train_corpus.json"
VAL_CORPUS_FPATH = "../data/val_corpus.json"

In [None]:
def load_corpus(files, verbose=False):
    """Read `files` and split the loaded documents into nodes.

    Args:
        files: Paths handed to ``SimpleDirectoryReader(input_files=...)``.
        verbose: When true, print a progress line after each stage and
            forward ``show_progress=True`` to the splitter.

    Returns:
        Whatever ``SentenceSplitter().get_nodes_from_documents`` returns for
        the loaded documents.
    """
    if verbose:
        print(f"Loading files {files}")

    # The reader is used exactly once, so build it and load in one expression.
    docs = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(docs)} docs")

    # Splitter is created with its default settings.
    splitter = SentenceSplitter()
    nodes = splitter.get_nodes_from_documents(docs, show_progress=verbose)
    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes

In [None]:
def _load_or_generate_dataset(files, dataset_fpath):
    """Return the QA dataset cached at `dataset_fpath`, generating it if absent.

    Each dataset is cached independently: an existing JSON file is simply
    reloaded, so a missing validation file no longer forces the training
    dataset to be regenerated (and overwritten) as well.

    Args:
        files: Source document paths passed to ``load_corpus``.
        dataset_fpath: JSON path used both as the cache to read and as the
            destination for a freshly generated dataset.

    Returns:
        The dataset loaded via ``EmbeddingQAFinetuneDataset.from_json`` or
        produced by ``generate_qa_embedding_pairs``.
    """
    if os.path.exists(dataset_fpath):
        return EmbeddingQAFinetuneDataset.from_json(dataset_fpath)

    # Create the output directory *before* the LLM-backed generation step, so
    # a missing directory fails fast (or is fixed) instead of discarding the
    # generated pairs when the save is attempted afterwards.
    out_dir = os.path.dirname(dataset_fpath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    nodes = load_corpus(files, verbose=True)
    dataset = generate_qa_embedding_pairs(
        llm=OpenAI(model="gpt-3.5-turbo-1106"), nodes=nodes
    )
    # Persist immediately so this dataset survives a failure while building
    # the next one.
    dataset.save_json(dataset_fpath)
    return dataset


train_dataset = _load_or_generate_dataset(TRAIN_FILES, TRAIN_CORPUS_FPATH)
val_dataset = _load_or_generate_dataset(VAL_FILES, VAL_CORPUS_FPATH)

In [None]:
# Identifier of the base embedding model to finetune.
base_model_id = "BAAI/bge-small-en-v1.5"
# Passed as `model_output_path`; presumably where the finetuned model is
# written -- confirm against the engine's documentation.
finetuned_model_path = "test_model"

# The training dataset is the positional argument; the validation dataset is
# supplied separately through `val_dataset`.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id=base_model_id,
    model_output_path=finetuned_model_path,
    val_dataset=val_dataset,
)

In [None]:
# Run finetuning on the engine built from the train/validation datasets.
# NOTE(review): likely the long-running step of this notebook -- confirm.
finetune_engine.finetune()

In [None]:
# Fetch the finetuned embedding model from the engine for later use.
embed_model = finetune_engine.get_finetuned_model()

In [None]:
# Bare expression: the notebook renders the model object as the cell output.
embed_model