#################
# Load a PDF, split it into chunks, vectorize the chunks, then query them.
#################
import tiktoken

# Tokenizer used so chunk sizes are measured in tokens (not characters),
# matching what an LLM context window actually consumes.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*.

    Passed to the splitter as ``length_function`` so chunk_size/chunk_overlap
    are interpreted in tokens.
    """
    return len(tokenizer.encode(text))


from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings  # NOTE(review): unused here — kept in case another part of the project relies on it
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings

## Load the PDF and split it into page-level Documents.
loader = PyPDFLoader('gsat_170823.pdf')
pages = loader.load_and_split()

## Split pages into token-bounded chunks: 500 tokens each, 80-token overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
    length_function=tiktoken_len,
)
sourceDocs = text_splitter.split_documents(pages)

#################
# Vectorize the documents with a HuggingFace model, then similarity-search.
#################
from langchain.embeddings import HuggingFaceEmbeddings  # (duplicate of the import above; kept as in the original file)

# Korean sentence-embedding model, run on CPU; embeddings are L2-normalized
# so cosine similarity reduces to a dot product.
model_huggingface = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-multitask',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

## Build a Chroma vector store from the chunked documents (in-memory).
db = Chroma.from_documents(sourceDocs, model_huggingface)


## Query the store.
def SearchDocs(question, k=1):
    """Return the text of the top-*k* chunks most relevant to *question*.

    Parameters:
        question: natural-language query string.
        k: number of top-scoring chunks to retrieve (default 1).

    Returns:
        The ``page_content`` of the retrieved chunks joined with single spaces.
    """
    results = db.similarity_search_with_relevance_scores(question, k=k)
    # BUG FIX: each element of `results` is a (Document, score) tuple. The
    # original code did `sourceDocs[result[0]][0]`, which tries to index the
    # `sourceDocs` list with a Document object — a guaranteed TypeError.
    # The intended behavior is to merge the retrieved chunks' text.
    return ' '.join(doc.page_content for doc, _score in results)