################
# Load a PDF file, split it into chunks, embed the chunks, then query them
################
import tiktoken
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text: str) -> int:
    """Return the number of ``cl100k_base`` tokens that *text* encodes to."""
    return len(tokenizer.encode(text))


## Load the PDF file and split it into pages
loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')
pages = loader.load_and_split()

## Split the pages into chunks; chunk size and overlap are measured in tokens
## because tiktoken_len is passed as the length function.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
    length_function=tiktoken_len,
)
sourceDocs = text_splitter.split_documents(pages)

################
# Embed the documents with a HuggingFace model, then run similarity search
################
model_huggingface = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-multitask',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

## Build a Chroma vector store from the PDF chunks
db = Chroma.from_documents(sourceDocs, model_huggingface)


## Query
def SearchDocs(question: str, k: int = 1) -> str:
    """Return the text of the *k* chunks most similar to *question*.

    Args:
        question: Query string to search the vector store with.
        k: Number of chunks to retrieve.

    Returns:
        The ``page_content`` of the retrieved chunks joined by a single
        space, in the order the vector store returned them. An empty string
        if nothing was returned.
    """
    results = db.similarity_search_with_relevance_scores(question, k=k)
    # Each result is a (Document, score) pair. The Document itself carries the
    # chunk text, so read it directly instead of using it to index sourceDocs
    # (a list cannot be indexed by a Document and would raise TypeError).
    return ' '.join(doc.page_content for doc, _score in results)


# ################
# # Pass the retrieved documents to a prompt and generate an answer with an LLM
# ################
# NOTE(review): this section is disabled. If it is re-enabled, SearchDocs()
# returns one merged string, so `searchedDocs[0][0]` below would be a single
# character; the string should probably be passed as "document" directly.
# from langchain_community.chat_models import ChatOllama
# llm = ChatOllama(
#     base_url='http://localhost:11434',
#     # model="phi3:medium",  # too slow, so switched to mini
#     model="phi3:mini",
# )
# from langchain_core.prompts import ChatPromptTemplate
# prompt = ChatPromptTemplate.from_messages([
#     ("system", "Please answer the following question from the document: {document}"),
#     ("user", "{question}"),
# ])
# # print('-'*50)
# chain = prompt | llm
# def Response(question):
#     searchedDocs = SearchDocs(question)
#     mergedDoc = ' \n'.join(searchedDocs[0][0])
#     return chain.invoke({"question": question, "document": mergedDoc})