import torch
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFaceHub
import os

# Check for GPU availability and set the appropriate device for computation.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Global variables
conversation_retrieval_chain = None
chat_history = []
llm_hub = None
embeddings = None

# Function to initialize the language model and its embeddings
def init_llm():
    global llm_hub, embeddings
    # Set up the environment variable for HuggingFace and initialize the desired model.
    with open("api_token.txt") as tokenfile:
        api_token = tokenfile.readline().strip()
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token

    # repo name for the model
    # model_id = "tiiuae/falcon-7b-instruct"
    model_id = "microsoft/Phi-3.5-mini-instruct"
    # model_id = "meta-llama/Llama-3.2-1B-Instruct"
    # model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    
    # load the model from the HuggingFace Hub
    #llm_hub = HuggingFaceHub(repo_id=model_id, temperature=0.1, max_new_tokens=600, model_kwargs={"max_length":600})
    llm_hub = HuggingFaceHub(repo_id=model_id, model_kwargs={"temperature": 0.1, "max_new_tokens": 600, "max_length": 600})
    # point the underlying client at the Inference API endpoint for this model
    llm_hub.client.api_url = 'https://api-inference.huggingface.co/models/' + model_id

    # Initialize embeddings using a pre-trained model to represent the text data.
    embeddings_model = "sentence-transformers/multi-qa-distilbert-cos-v1"
    # embeddings_model = "sentence-transformers/all-MiniLM-L6-v2"

    # emb_model = SentenceTransformer(embeddings_model)

    embeddings = HuggingFaceInstructEmbeddings(
        model_name=embeddings_model,
        model_kwargs={"device": DEVICE}
    )


# Function to process a PDF document
def process_document(document_path):
    global conversation_retrieval_chain

    # Load the document
    loader = PyPDFLoader(document_path)
    documents = loader.load()
    
    # Split the document into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
    texts = text_splitter.split_documents(documents)
    
    # Create an embeddings database using Chroma from the split text chunks.
    db = Chroma.from_documents(texts, embedding=embeddings)


    # Build the QA chain, which uses the LLM and the retriever to answer questions.
    # By default, the vector store retriever uses similarity search.
    # If the underlying vector store supports maximum marginal relevance search,
    # you can request it with search_type="mmr". You can also pass search kwargs
    # such as k, which controls how many retrieved chunks are sent to the LLM.
    retriever = db.as_retriever(search_type="mmr", search_kwargs={'k': 6, 'lambda_mult': 0.25})
    conversation_retrieval_chain = RetrievalQA.from_chain_type(
        llm=llm_hub,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=False,
        input_key="question"
        # chain_type_kwargs={"prompt": prompt}  # uncomment this line (and add a trailing comma above) if you are using a prompt template
    )


# Function to process a user prompt
def process_prompt(prompt, chat_history):
    global conversation_retrieval_chain
    # chat_history is passed in by the caller rather than taken from the module-level global
    
    # Query the model
    output = conversation_retrieval_chain.invoke({"question": prompt, "chat_history": chat_history})
    answer = output["result"]
    
    # Update the chat history
    chat_history.append((prompt, answer))
    
    # Return the model's response
    return answer

# Initialize the language model
init_llm()
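
# --- Illustrative usage sketch (not part of the original script) ---
# Demonstrates how process_document() and process_prompt() are expected to be
# called, e.g. from a web backend. "example.pdf" is a hypothetical file name;
# replace it with a real PDF and make sure api_token.txt holds a valid token.
if __name__ == "__main__":
    sample_pdf = "example.pdf"  # hypothetical path used only for this sketch
    if os.path.exists(sample_pdf):
        process_document(sample_pdf)  # index the PDF into Chroma
        history = []
        answer = process_prompt("What is this document about?", history)
        print(answer)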