File size: 1,644 Bytes
8abf10e
 
 
 
 
 
 
 
b16918e
8abf10e
 
 
 
 
b16918e
8abf10e
 
 
b16918e
8abf10e
 
 
b16918e
5711a5a
8abf10e
5711a5a
8abf10e
5711a5a
8abf10e
 
 
5711a5a
8abf10e
 
5711a5a
8abf10e
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# ################
# # PDF ํŒŒ์ผ์„ ๋กœ๋“œํ•˜๊ณ  ๋ฌธ์„œ๋ฅผ ์ชผ๊ฐœ์„œ ๋ฌธ์„œ๋ฒกํ„ฐํ™” ํ•œ ํ›„ ์งˆ์˜ํ•˜๊ธฐ
# ################
# import tiktoken
# tokenizer = tiktoken.get_encoding('cl100k_base')
# def tiktoken_len(text):
#     tokens = tokenizer.encode(text)
#     return len(tokens)
    
# from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.vectorstores import Chroma
# from langchain.document_loaders import PyPDFLoader
# from langchain.embeddings import HuggingFaceEmbeddings

# ## pdf ํŒŒ์ผ๋กœ๋“œ ํ•˜๊ณ  ์ชผ๊ฐœ๊ธฐ
# loader = PyPDFLoader('gsat_170823.pdf')
# pages = loader.load_and_split()

# ## chunk๋กœ ์ชผ๊ฐœ๊ธฐ
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80,length_function=tiktoken_len)
# sourceDocs = text_splitter.split_documents(pages)

# ################
# # HuggingFace ๋ชจ๋ธ๋กœ ๋ฌธ์„œ๋ฒกํ„ฐํ™” ํ›„ ์œ ์‚ฌ๋„ ํƒ์ƒ‰
# ################
# from langchain.embeddings import HuggingFaceEmbeddings

# model_huggingface = HuggingFaceEmbeddings(model_name = 'jhgan/ko-sroberta-multitask', 
#                                           model_kwargs = {'device':'cpu'}, 
#                                           encode_kwargs = {'normalize_embeddings' : True})

# ## Chroma ๊ธฐ๋ฐ˜ pdf(docs ๋ฒกํ„ฐํ™”)
# db = Chroma.from_documents(sourceDocs, model_huggingface)

# ## ์งˆ์˜ํ•˜๊ธฐ
# def SearchDocs(question, k=1):
#     results = db.similarity_search_with_relevance_scores(question, k = k)
#     merged = ' '.join([result[0].page_content for result in results])
#     return merged