{user_name}
committed on
Commit
·
8abf10e
1
Parent(s):
5711a5a
rag 코드 비활성화
Browse files
rag.py
CHANGED
@@ -1,64 +1,40 @@
|
|
1 |
-
################
|
2 |
-
# PDF 파일을 로드하고 문서를 쪼개서 문서벡터화 한 후 질의하기
|
3 |
-
################
|
4 |
-
import tiktoken
|
5 |
-
tokenizer = tiktoken.get_encoding('cl100k_base')
|
6 |
-
def tiktoken_len(text):
|
7 |
-
|
8 |
-
|
9 |
|
10 |
-
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
|
11 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
-
from langchain.vectorstores import Chroma
|
13 |
-
from langchain.document_loaders import PyPDFLoader
|
14 |
-
from langchain.embeddings import HuggingFaceEmbeddings
|
15 |
-
|
16 |
-
## pdf 파일로드 하고 쪼개기
|
17 |
-
loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')
|
18 |
-
pages = loader.load_and_split()
|
19 |
-
|
20 |
-
## chunk로 쪼개기
|
21 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80,length_function=tiktoken_len)
|
22 |
-
sourceDocs = text_splitter.split_documents(pages)
|
23 |
-
|
24 |
-
################
|
25 |
-
# HuggingFace 모델로 문서벡터화 후 유사도 탐색
|
26 |
-
################
|
27 |
-
from langchain.embeddings import HuggingFaceEmbeddings
|
28 |
-
|
29 |
-
model_huggingface = HuggingFaceEmbeddings(model_name = 'jhgan/ko-sroberta-multitask',
|
30 |
-
model_kwargs = {'device':'cpu'},
|
31 |
-
encode_kwargs = {'normalize_embeddings' : True})
|
32 |
|
33 |
-
##
|
34 |
-
|
|
|
35 |
|
36 |
-
##
|
37 |
-
|
38 |
-
|
39 |
-
merged = ' '.join([sourceDocs[result[0]][0] for result in results])
|
40 |
-
return merged
|
41 |
|
42 |
# ################
|
43 |
-
# #
|
44 |
# ################
|
45 |
-
# from
|
46 |
-
# llm = ChatOllama(
|
47 |
-
# base_url='http://localhost:11434',
|
48 |
-
# # model="phi3:medium", # 너무 느려서 mini로 변경
|
49 |
-
# model="phi3:mini",
|
50 |
-
# )
|
51 |
|
52 |
-
#
|
|
|
|
|
53 |
|
54 |
-
#
|
55 |
-
#
|
56 |
-
# ("user", "{question}"),
|
57 |
-
# ])
|
58 |
|
59 |
-
#
|
60 |
-
#
|
61 |
-
#
|
62 |
-
#
|
63 |
-
#
|
64 |
-
# return chain.invoke({"question": question, "document": mergedDoc})
|
|
|
1 |
+
# ################
|
2 |
+
# # PDF 파일을 로드하고 문서를 쪼개서 문서벡터화 한 후 질의하기
|
3 |
+
# ################
|
4 |
+
# import tiktoken
|
5 |
+
# tokenizer = tiktoken.get_encoding('cl100k_base')
|
6 |
+
# def tiktoken_len(text):
|
7 |
+
# tokens = tokenizer.encode(text)
|
8 |
+
# return len(tokens)
|
9 |
|
10 |
+
# from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
|
11 |
+
# from langchain.text_splitter import RecursiveCharacterTextSplitter
|
12 |
+
# from langchain.vectorstores import Chroma
|
13 |
+
# from langchain.document_loaders import PyPDFLoader
|
14 |
+
# from langchain.embeddings import HuggingFaceEmbeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
+
# ## pdf 파일로드 하고 쪼개기
|
17 |
+
# loader = PyPDFLoader('gsat_170823.pdf')
|
18 |
+
# pages = loader.load_and_split()
|
19 |
|
20 |
+
# ## chunk로 쪼개기
|
21 |
+
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80,length_function=tiktoken_len)
|
22 |
+
# sourceDocs = text_splitter.split_documents(pages)
|
|
|
|
|
23 |
|
24 |
# ################
|
25 |
+
# # HuggingFace 모델로 문서벡터화 후 유사도 탐색
|
26 |
# ################
|
27 |
+
# from langchain.embeddings import HuggingFaceEmbeddings
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
+
# model_huggingface = HuggingFaceEmbeddings(model_name = 'jhgan/ko-sroberta-multitask',
|
30 |
+
# model_kwargs = {'device':'cpu'},
|
31 |
+
# encode_kwargs = {'normalize_embeddings' : True})
|
32 |
|
33 |
+
# ## Chroma 기반 pdf(docs 벡터화)
|
34 |
+
# db = Chroma.from_documents(sourceDocs, model_huggingface)
|
|
|
|
|
35 |
|
36 |
+
# ## 질의하기
|
37 |
+
# def SearchDocs(question, k=1):
|
38 |
+
# results = db.similarity_search_with_relevance_scores(question, k = k)
|
39 |
+
# merged = ' '.join([sourceDocs[result[0]][0] for result in results])
|
40 |
+
# return merged
|
|