Spaces:
Sleeping
Sleeping
File size: 1,792 Bytes
f66560f f5d22a4 6f2843a f66560f 5439651 f66560f f5d22a4 f66560f f5d22a4 f66560f f5d22a4 f66560f f5d22a4 f66560f f5d22a4 f66560f 5439651 f5d22a4 f66560f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
# embeddings functions
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
import time
from langchain_core.documents import Document
from config import EMBEDDING_MODEL
def create_embeddings(
    docs: list[Document],
    chunk_size: int = 500,
    chunk_overlap: int = 50,
):
    """Split documents into chunks and build a sentence-transformer embedder.

    ## Arguments
    :params docs (list[Document]) -> documents to split into chunks
    :params chunk_size (int) -> maximum size of each chunk, defaults to 500
    :params chunk_overlap (int) -> number of characters overlapped between
        consecutive chunks, defaults to 50

    ## Return
    Tuple of (embedding function, list of chunk ``Document`` objects).
    The embedding model is taken from ``config.EMBEDDING_MODEL``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        # r-string: "\." is a regex escape, not a Python string escape.
        separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # The sentence-boundary lookbehind "(?<=\. )" only works when
        # separators are interpreted as regular expressions; the default
        # (False) treats it as a literal string that never matches.
        is_separator_regex=True,
    )

    # Stage one: split every document into chunks, carrying metadata through
    # so each chunk can be traced back to its source document.
    start = time.time()
    print('Loading documents and creating chunks ...')
    chunks = text_splitter.create_documents(
        [doc.page_content for doc in docs],
        metadatas=[doc.metadata for doc in docs],
    )
    elapsed = time.time() - start
    print(f'Time taken to chunk {len(docs)} documents: {elapsed} seconds.')

    # Stage two: build the embedding function for the configured model.
    embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL)
    print(f"created a total of {len(chunks)} chunks")
    return embeddings, chunks