{user_name} committed on
Commit
8abf10e
·
1 Parent(s): 5711a5a

Disable RAG code

Files changed (1)
  1. rag.py +31 -55
rag.py CHANGED
@@ -1,64 +1,40 @@
- ################
- # Load a PDF file, split it into documents, vectorize them, then query
- ################
- import tiktoken
- tokenizer = tiktoken.get_encoding('cl100k_base')
- def tiktoken_len(text):
-     tokens = tokenizer.encode(text)
-     return len(tokens)
 
- from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- from langchain.vectorstores import Chroma
- from langchain.document_loaders import PyPDFLoader
- from langchain.embeddings import HuggingFaceEmbeddings
-
- ## Load the PDF file and split it
- loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')
- pages = loader.load_and_split()
-
- ## Split into chunks
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80, length_function=tiktoken_len)
- sourceDocs = text_splitter.split_documents(pages)
-
- ################
- # Vectorize the documents with a HuggingFace model, then run similarity search
- ################
- from langchain.embeddings import HuggingFaceEmbeddings
-
- model_huggingface = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask',
-                                           model_kwargs={'device': 'cpu'},
-                                           encode_kwargs={'normalize_embeddings': True})
 
- ## Vectorize the PDF docs into a Chroma store
- db = Chroma.from_documents(sourceDocs, model_huggingface)
 
- ## Querying
- def SearchDocs(question, k=1):
-     results = db.similarity_search_with_relevance_scores(question, k=k)
-     merged = ' '.join([result[0].page_content for result in results])
-     return merged
 
  # ################
- # # Pass the retrieved documents to the prompt and generate an answer with the LLM
  # ################
- # from langchain_community.chat_models import ChatOllama
- # llm = ChatOllama(
- #     base_url='http://localhost:11434',
- #     # model="phi3:medium",  # too slow, switched to mini
- #     model="phi3:mini",
- # )
 
- # from langchain_core.prompts import ChatPromptTemplate
 
- # prompt = ChatPromptTemplate.from_messages([
- #     ("system", "Please answer the following question from the document: {document}"),
- #     ("user", "{question}"),
- # ])
 
- # # print('-'*50)
- # chain = prompt | llm
- # def Response(question):
- #     searchedDocs = SearchDocs(question)
- #     mergedDoc = searchedDocs
- #     return chain.invoke({"question": question, "document": mergedDoc})
+ # ################
+ # # Load a PDF file, split it into documents, vectorize them, then query
+ # ################
+ # import tiktoken
+ # tokenizer = tiktoken.get_encoding('cl100k_base')
+ # def tiktoken_len(text):
+ #     tokens = tokenizer.encode(text)
+ #     return len(tokens)
 
+ # from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
+ # from langchain.vectorstores import Chroma
+ # from langchain.document_loaders import PyPDFLoader
+ # from langchain.embeddings import HuggingFaceEmbeddings
 
+ # ## Load the PDF file and split it
+ # loader = PyPDFLoader('gsat_170823.pdf')
+ # pages = loader.load_and_split()
 
+ # ## Split into chunks
+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=80, length_function=tiktoken_len)
+ # sourceDocs = text_splitter.split_documents(pages)
 
  # ################
+ # # Vectorize the documents with a HuggingFace model and run similarity search
  # ################
+ # from langchain.embeddings import HuggingFaceEmbeddings
 
+ # model_huggingface = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask',
+ #                                           model_kwargs={'device': 'cpu'},
+ #                                           encode_kwargs={'normalize_embeddings': True})
 
+ # ## Vectorize the PDF docs into a Chroma store
+ # db = Chroma.from_documents(sourceDocs, model_huggingface)
 
+ # ## Querying
+ # def SearchDocs(question, k=1):
+ #     results = db.similarity_search_with_relevance_scores(question, k=k)
+ #     merged = ' '.join([result[0].page_content for result in results])
+ #     return merged