################
# Load a PDF, split it into chunks, embed the chunks into a vector store,
# then answer questions over the document with an LLM (simple RAG pipeline).
################
import tiktoken

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

# Tokenizer used only to measure chunk length in tokens (not characters).
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of cl100k_base tokens in ``text``."""
    return len(tokenizer.encode(text))


## Load the PDF and split it into pages
loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')
pages = loader.load_and_split()

## Split the pages into overlapping chunks, sized in tokens via tiktoken_len
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
    length_function=tiktoken_len,
)
sourceDocs = text_splitter.split_documents(pages)

################
# Vectorize the documents with a HuggingFace model and search by similarity
################
model_huggingface = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-multitask',  # Korean sentence-embedding model
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

## Chroma vector store built from the chunked PDF documents
db = Chroma.from_documents(sourceDocs, model_huggingface)


def searchDocs(question, k=1):
    """Return the ``k`` most relevant ``(Document, relevance_score)`` pairs.

    Args:
        question: Natural-language query to search for.
        k: Number of results to return (default 1).
    """
    return db.similarity_search_with_relevance_scores(question, k=k)


################
# Pass the retrieved documents to a prompt and generate an answer with an LLM
################
llm = ChatOllama(
    base_url='http://localhost:11434',
    # model="phi3:medium",  # too slow, switched to mini
    model="phi3:mini",
)

prompt = ChatPromptTemplate.from_messages([
    ("system", "Please answer the following question from the document: {document}"),
    ("user", "{question}"),
])

chain = prompt | llm


def Response(question):
    """Answer ``question`` using the most relevant chunks retrieved from the PDF.

    Bug fix: the original code did ``' '.join(searchedDocs[0][0])``, which
    attempts to join a ``Document`` object itself (``searchedDocs`` holds
    ``(Document, score)`` pairs) and raises ``TypeError`` because a Document
    is not an iterable of strings. We join the ``page_content`` text of every
    retrieved document instead, which also uses all ``k`` results rather than
    only the first.
    """
    searchedDocs = searchDocs(question)
    mergedDoc = ' '.join(doc.page_content for doc, _score in searchedDocs)
    return chain.invoke({"question": question, "document": mergedDoc})