File size: 2,301 Bytes
b16918e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442895c
 
 
b16918e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442895c
b16918e
442895c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
################
# Load a PDF, split it into chunks, vectorize the chunks, then query them
################
import tiktoken

# cl100k_base is the tokenizer used by recent OpenAI models; we use it only
# to measure chunk length in tokens rather than characters.
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    """Return the number of cl100k_base tokens that *text* encodes to."""
    return len(tokenizer.encode(text))
    
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings

## Load the PDF (fetched over HTTP) and split it into per-page documents
pdf_loader = PyPDFLoader('https://wdr.ubion.co.kr/wowpass/img/event/gsat_170823/gsat_170823.pdf')
pages = pdf_loader.load_and_split()

## Re-split the pages into ~500-token chunks with 80 tokens of overlap,
## measuring length in tokens via tiktoken_len.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80,
    length_function=tiktoken_len,
)
sourceDocs = splitter.split_documents(pages)

################
# Vectorize the documents with a HuggingFace model for similarity search
################
from langchain.embeddings import HuggingFaceEmbeddings

## Korean sentence-embedding model, run on CPU; embeddings are L2-normalized
## so cosine similarity behaves well in the vector store.
model_huggingface = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-multitask',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

## Build an in-memory Chroma vector store from the chunked PDF documents
db = Chroma.from_documents(sourceDocs, model_huggingface)

## ์งˆ์˜ํ•˜๊ธฐ
def searchDocs(question, k=1):
    results = db.similarity_search_with_relevance_scores(question, k = k)
    return results

################
# Pass the retrieved document to a prompt and generate an answer with an LLM
################
from langchain_community.chat_models import ChatOllama

# Local Ollama server; phi3:mini is used instead of phi3:medium for speed.
llm = ChatOllama(
    base_url='http://localhost:11434',
    # model="phi3:medium",  # too slow, switched to mini
    model="phi3:mini",
)

from langchain_core.prompts import ChatPromptTemplate

# The retrieved chunk is injected as {document}; the user's query as {question}.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Please answer the following question from the document: {document}"),
    ("user", "{question}"),
])

# Compose prompt and model into a single runnable chain.
chain = prompt | llm
def Response(question):
    """Answer *question* with the LLM, grounded on the retrieved chunk(s).

    Retrieves the most relevant document chunk(s) for the question and
    passes their text to the prompt/LLM chain as the {document} context.
    Returns the chain's LLM response object.
    """
    searchedDocs = searchDocs(question)
    # Bug fix: join the page text of each retrieved Document. The original
    # ' '.join(searchedDocs[0][0]) passed the Document object itself to
    # str.join, which either raises TypeError or space-separates the wrong
    # pieces instead of using the chunk's actual text.
    mergedDoc = ' '.join(doc.page_content for doc, _score in searchedDocs)
    return chain.invoke({"question": question, "document": mergedDoc})