# RAG pipeline: load a PDF, chunk it, embed into Chroma, and answer questions with a local LLM.
################
# PDF ํ์ผ์ ๋ก๋ํ๊ณ ๋ฌธ์๋ฅผ ์ชผ๊ฐ์ ๋ฌธ์๋ฒกํฐํ ํ ํ ์ง์ํ๊ธฐ
################
import tiktoken

# Tokenizer matching OpenAI-style models; used so chunk sizes are measured
# in LLM tokens rather than characters.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*."""
    return len(tokenizer.encode(text))
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
## Load the PDF from a URL and split it into per-page documents
loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')
pages = loader.load_and_split()

## Re-split pages into token-bounded chunks (sized via tiktoken_len)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
    length_function=tiktoken_len,
)
sourceDocs = text_splitter.split_documents(pages)
################
# HuggingFace ๋ชจ๋ธ๋ก ๋ฌธ์๋ฒกํฐํ ํ ์ ์ฌ๋ ํ์
################
from langchain.embeddings import HuggingFaceEmbeddings

## Embedding model: Korean sentence-transformer, CPU inference,
## L2-normalized vectors so relevance scores are comparable.
model_huggingface = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-multitask',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

## Build an in-memory Chroma vector store from the chunked documents
db = Chroma.from_documents(sourceDocs, model_huggingface)
## ์ง์ํ๊ธฐ
def searchDocs(question, k=1):
    """Return the top-*k* chunks most relevant to *question*.

    Each result is a (Document, relevance_score) pair from the Chroma store.
    """
    return db.similarity_search_with_relevance_scores(question, k=k)
################
# ์ฐพ์ ๋ฌธ์๋ฅผ ํ๋กฌํํธ์ ์ ๋ฌํ์ฌ LLM์ผ๋ก ๋ต๋ณ ์์ฑ
################
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

## Chat model served by a locally running Ollama instance
llm = ChatOllama(
    base_url='http://localhost:11434',
    # model="phi3:medium", # switched to mini because medium was too slow
    model="phi3:mini",
)

## Prompt: stuff the retrieved document text into the system message,
## then pipe the rendered prompt into the LLM.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Please answer the following question from the document: {document}"),
    ("user", "{question}"),
])

chain = prompt | llm
def Response(question):
    """Answer *question* using the most relevant PDF chunk(s) as context.

    Retrieves the best-matching chunk from the vector store, merges the
    retrieved text, and invokes the prompt | llm chain with it. Returns
    the chain's message response.
    """
    searchedDocs = searchDocs(question)
    # Each result is a (Document, relevance_score) pair. Join the documents'
    # page_content. The original joined the Document object itself
    # (' '.join(searchedDocs[0][0])), which raises TypeError because
    # str.join requires an iterable of strings.
    mergedDoc = ' '.join(doc.page_content for doc, _score in searchedDocs)
    return chain.invoke({"question": question, "document": mergedDoc})