# Load in packages

import os
from typing import Type
from langchain_community.embeddings import HuggingFaceEmbeddings #, HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
import gradio as gr
import pandas as pd
from transformers import AutoTokenizer
import torch
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

PandasDataFrame = Type[pd.DataFrame]

# Disable cuda devices if necessary
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

#from chatfuncs.chatfuncs import *
import chatfuncs.ingest as ing

## Load preset embeddings, vectorstore, and model

embeddings_name = "BAAI/bge-base-en-v1.5"

def load_embeddings(embeddings_name=embeddings_name):
    # Build the Hugging Face embedding function and keep a module-level reference
    embeddings_func = HuggingFaceEmbeddings(model_name=embeddings_name)

    global embeddings
    embeddings = embeddings_func

    return embeddings

def get_faiss_store(faiss_vstore_folder, embeddings):
    # Unzip the pre-built FAISS index, load it, then remove the extracted files
    import zipfile
    with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
        zip_ref.extractall(faiss_vstore_folder)

    faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings, allow_dangerous_deserialization=True)
    os.remove(faiss_vstore_folder + "/index.faiss")
    os.remove(faiss_vstore_folder + "/index.pkl")

    global vectorstore
    vectorstore = faiss_vstore

    return vectorstore

import chatfuncs.chatfuncs as chatf

chatf.embeddings = load_embeddings(embeddings_name)
chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding", embeddings=globals()["embeddings"])
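# Note: load_model() below relies on chatf.gpu_config / chatf.cpu_config objects defined in
# chatfuncs.chatfuncs (not shown in this file). They expose an update_gpu() method and an
# n_gpu_layers attribute, and their vars() are unpacked straight into llama_cpp.Llama().
# As a rough illustration only - a hypothetical sketch, not the actual chatfuncs
# implementation - such a config could look like:
#
#   from dataclasses import dataclass
#
#   @dataclass
#   class LlamaRunConfig:
#       n_gpu_layers: int = 0    # number of layers offloaded to the GPU
#       n_ctx: int = 4096        # context window passed through to llama.cpp
#       n_batch: int = 512       # prompt batch size
#
#       def update_gpu(self, new_value: int):
#           self.n_gpu_layers = new_value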
def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
    print("Loading model")

    # Default values inside the function
    if gpu_config is None:
        gpu_config = chatf.gpu_config
    if cpu_config is None:
        cpu_config = chatf.cpu_config
    if torch_device is None:
        torch_device = chatf.torch_device

    if model_type == "Mistral Open Orca (larger, slow)":
        if torch_device == "cuda":
            gpu_config.update_gpu(gpu_layers)
            print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU.")
        else:
            gpu_config.update_gpu(gpu_layers)
            cpu_config.update_gpu(gpu_layers)
            print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU.")

        print(vars(gpu_config))
        print(vars(cpu_config))

        try:
            # Try a GPU-accelerated load first
            model = Llama(
                model_path=hf_hub_download(
                    repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
                    filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
                ),
                **vars(gpu_config)  # change n_gpu_layers if you have more or less VRAM
            )
        except Exception as e:
            # Fall back to a CPU-only configuration
            print("GPU load failed")
            print(e)
            model = Llama(
                model_path=hf_hub_download(
                    repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
                    filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
                ),
                **vars(cpu_config)
            )

        tokenizer = []

    if model_type == "Flan Alpaca (small, fast)":
        # Hugging Face chat model
        hf_checkpoint = 'declare-lab/flan-alpaca-large'  # 'declare-lab/flan-alpaca-base'

        def create_hf_model(model_name):
            from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM

            if torch_device == "cuda":
                if "flan" in model_name:
                    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
                else:
                    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
            else:
                if "flan" in model_name:
                    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
                else:
                    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)

            tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=chatf.context_length)

            return model, tokenizer, model_type
        model, tokenizer, model_type = create_hf_model(model_name=hf_checkpoint)

    chatf.model = model
    chatf.tokenizer = tokenizer
    chatf.model_type = model_type

    load_confirmation = "Finished loading model: " + model_type

    print(load_confirmation)
    return model_type, load_confirmation, model_type

# Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
model_type = "Mistral Open Orca (larger, slow)"
load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)

model_type = "Flan Alpaca (small, fast)"
load_model(model_type, 0, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)

def docs_to_faiss_save(docs_out: PandasDataFrame, embeddings=embeddings):

    print(f"> Total split documents: {len(docs_out)}")
    print(docs_out)

    vectorstore_func = FAISS.from_documents(documents=docs_out, embedding=embeddings)

    chatf.vectorstore = vectorstore_func

    out_message = "Document processing complete"

    return out_message, vectorstore_func

# Gradio chat

block = gr.Blocks(theme=gr.themes.Base())  # css=".gradio-container {background-color: black}")

with block:
    ingest_text = gr.State()
    ingest_metadata = gr.State()
    ingest_docs = gr.State()

    model_type_state = gr.State(model_type)
    embeddings_state = gr.State(chatf.embeddings)  # globals()["embeddings"])
    vectorstore_state = gr.State(chatf.vectorstore)  # globals()["vectorstore"])

    model_state = gr.State()  # chatf.model (gives error)
    tokenizer_state = gr.State()  # chatf.tokenizer (gives error)

    chat_history_state = gr.State()
    instruction_prompt_out = gr.State()

    gr.Markdown("