File size: 2,349 Bytes
fd4565a
 
 
 
7310f1a
 
fd4565a
 
 
6871744
fd4565a
 
 
 
 
 
 
 
3d61420
22c0759
 
fd4565a
 
3d61420
22c0759
 
fd4565a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7310f1a
fd4565a
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from langchain_community.llms import CTransformers
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence
from langchain.chains import RetrievalQA
#from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from huggingface_hub import hf_hub_download

# !pip install llama-cpp-python

# from llama_cpp import Llama

# model_file = Llama.from_pretrained(
# 	repo_id="Pudding48/TinyLLamaTest",
# 	filename="tinyllama-1.1b-chat-v1.0.Q8_0.gguf",
# )
import os

# Writable cache directory for Hugging Face downloads (Spaces containers
# only guarantee write access under /home/user/app).
cache_path = "/home/user/app/hf_cache"
os.makedirs(cache_path, exist_ok=True)

# Download the quantized TinyLlama GGUF weights into the local cache.
# Returns the local filesystem path to the .gguf file.
model_file = hf_hub_download(
    repo_id="Pudding48/TinyLlamaTest",
    filename="tinyllama-1.1b-chat-v1.0.Q8_0.gguf",
    cache_dir=cache_path
)

# Vector store location
vector_dp_path = "vectorstores/db_faiss"

# Rebuild the FAISS index from source text at import time.
# NOTE(review): this runs on every import of this module — presumably
# intentional for a fresh deploy; confirm it is not redundantly re-indexing.
from prepare_vector_dp import create_db_from_text
create_db_from_text()

# Load LLM with CTransformers
def load_llm(model_file):
    """Build a CTransformers LLM wrapper around a local GGUF model file.

    Args:
        model_file: filesystem path to the downloaded GGUF weights.

    Returns:
        A configured CTransformers instance (CPU-only, 512-token context,
        at most 128 newly generated tokens, near-greedy sampling).
    """
    cpu_only = {'gpu_layers': 0}  # keep all layers on CPU
    llm = CTransformers(
        model=model_file,
        model_type="llama",
        temperature=0.01,
        config=cpu_only,
        max_new_tokens=128,
        context_length=512,
    )
    return llm

# Create the prompt
def creat_prompt(template):
    """Wrap a raw template string in a PromptTemplate.

    The template is expected to contain `{context}` and `{question}`
    placeholders, which RetrievalQA fills in at query time.
    """
    variables = ["context", "question"]
    prompt = PromptTemplate(template=template, input_variables=variables)
    return prompt

# Create QA pipeline
def create_qa_chain(prompt, llm, db):
    """Assemble a RetrievalQA chain over the FAISS vector store.

    Args:
        prompt: PromptTemplate with `context`/`question` placeholders.
        llm: the language model used to answer.
        db: FAISS vector store to retrieve context from.

    Returns:
        A RetrievalQA chain using the "stuff" strategy (all retrieved
        chunks concatenated into a single prompt).
    """
    # k=1: only the single nearest chunk fits the 512-token context window
    retriever = db.as_retriever(search_kwargs={"k": 1})
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        chain_type_kwargs={'prompt': prompt},
    )
    return chain

# Load vector DB
def read_vector_db():
    """Load the persisted FAISS vector store with its embedding model.

    Returns:
        FAISS: the deserialized vector store, ready for retrieval.
    """
    # BUG FIX: the keyword is `model_name`, not `model_file` —
    # HuggingFaceEmbeddings has no `model_file` parameter, so the old call
    # either raised a validation error or silently fell back to the default
    # embedding model, breaking consistency with the index built at ingest.
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # allow_dangerous_deserialization: the index is produced locally by
    # prepare_vector_dp, so unpickling it is trusted here.
    return FAISS.load_local(vector_dp_path, embedding_model, allow_dangerous_deserialization=True)

# Build everything
# Load the FAISS store and the local LLM, then wire them into one QA chain.
db = read_vector_db()
llm = load_llm(model_file)

# ChatML-style prompt. The system instruction (in Vietnamese) tells the
# model to answer only from the supplied context and to say it does not
# know otherwise. `{context}` and `{question}` are filled by RetrievalQA.
template = """<|im_start|>system\nSử dụng thông tin sau đây để trả lời câu hỏi. Nếu bạn không biết câu trả lời, hãy nói không biết, đừng cố tạo ra câu trả lời\n
{context}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant"""

prompt = creat_prompt(template)
llm_chain = create_qa_chain(prompt, llm, db)