################
# Load a PDF file, split the document, embed the chunks, then query them
################
import tiktoken

# Length function that counts cl100k_base tokens instead of characters,
# so the chunk sizes below are token budgets
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)
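
# Quick sanity check of the length function (the sample string is an arbitrary
# illustration, not part of the original pipeline):
sample = 'Chunk sizes are measured in tokens, not characters.'
print(len(sample), tiktoken_len(sample))  # character count vs. token count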

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
## Load the PDF and split it into pages
loader = PyPDFLoader('gsat_170823.pdf')
pages = loader.load_and_split()

## Split the pages into overlapping chunks, sized by token count
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                               chunk_overlap=80,
                                               length_function=tiktoken_len)
sourceDocs = text_splitter.split_documents(pages)
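
# Optional: confirm the splitter respected the token budget (a minimal check,
# not required for the pipeline itself):
print(len(sourceDocs), 'chunks')
print(max(tiktoken_len(doc.page_content) for doc in sourceDocs), 'max tokens per chunk')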

################
# Embed the documents with a HuggingFace model, then search by similarity
################
model_huggingface = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask',
                                          model_kwargs={'device': 'cpu'},
                                          encode_kwargs={'normalize_embeddings': True})
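
# Optional sanity check: with normalize_embeddings=True the vectors should come
# back unit-normalized (the query string is an arbitrary example):
vec = model_huggingface.embed_query('sample query')
print(len(vec), sum(x * x for x in vec) ** 0.5)  # embedding dimension, L2 norm (~1.0)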

## Build a Chroma vector store from the chunked documents
db = Chroma.from_documents(sourceDocs, model_huggingface)
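
# Note: this builds an in-memory store, so the index is rebuilt on every run.
# To reuse it across runs, Chroma accepts a persist_directory (the path below
# is an arbitrary example, not from the original script):
#   db = Chroma.from_documents(sourceDocs, model_huggingface,
#                              persist_directory='./chroma_db')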

## Query: retrieve the top-k most relevant chunks and merge their text
def SearchDocs(question, k=1):
    # Each result is a (Document, relevance_score) tuple
    results = db.similarity_search_with_relevance_scores(question, k=k)
    merged = ' '.join([doc.page_content for doc, _score in results])
    return merged
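
# Example usage (the question is an arbitrary placeholder; k controls how many
# chunks are merged into the returned context):
print(SearchDocs('What is the main topic of this document?', k=2))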