# ################
# # Load a PDF file, split it into chunks, vectorize the documents, then query
# ################
# import tiktoken
# tokenizer = tiktoken.get_encoding('cl100k_base')
# def tiktoken_len(text):
# tokens = tokenizer.encode(text)
# return len(tokens)
# from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.vectorstores import Chroma
# from langchain.document_loaders import PyPDFLoader
# from langchain.embeddings import HuggingFaceEmbeddings
# ## Load the PDF file and split it into pages
# loader = PyPDFLoader('gsat_170823.pdf')
# pages = loader.load_and_split()
# ## Split into chunks
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80,length_function=tiktoken_len)
# sourceDocs = text_splitter.split_documents(pages)
# ################
# # Vectorize documents with a HuggingFace model, then search by similarity
# ################
# from langchain.embeddings import HuggingFaceEmbeddings
# model_huggingface = HuggingFaceEmbeddings(model_name = 'jhgan/ko-sroberta-multitask',
# model_kwargs = {'device':'cpu'},
# encode_kwargs = {'normalize_embeddings' : True})
# ## Build a Chroma vector store from the split PDF docs
# db = Chroma.from_documents(sourceDocs, model_huggingface)
# ## Query the store
# def SearchDocs(question, k=1):
# results = db.similarity_search_with_relevance_scores(question, k = k)
# # NOTE(review): each `result` from similarity_search_with_relevance_scores is a
# # (Document, score) pair — `sourceDocs[result[0]][0]` looks wrong; the likely
# # intent is `result[0].page_content`. Confirm before re-enabling this code.
# merged = ' '.join([sourceDocs[result[0]][0] for result in results])
# return merged