"""Gradio chat UI that answers questions from a persisted Chroma index.

On start-up the script loads a Falcon text-generation pipeline and a MiniLM
encoder, opens the Chroma store in ``CHROMA_PATH`` and launches a Gradio
``ChatInterface``. Each user message is answered by retrieving the three most
similar chunks from the store and prompting the pipeline with them.
"""

# Start-up marker emitted by the original script; kept so console output is
# unchanged.
print(9)

import argparse
import os
import re
import shutil
import subprocess
import sys
import warnings
from typing import List

import gradio as gr
import openai
import torch
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.llms import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate
from langchain.schema import BaseOutputParser, Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
)


def install(package):
    """Install *package* with pip into the interpreter running this script.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


# NOTE(review): presumably required by `device_map="auto"` below -- confirm,
# and prefer declaring it in the project's requirements instead.
install("accelerate")

MODEL_NAME = "tiiuae/falcon-7b-instruct"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

llama_pipeline = pipeline(
    "text-generation",
    model=MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The code below refers to `model` and `llm`, neither of which was ever
# defined (NameError at start-up). Both are derived from the pipeline that is
# already loaded, so no second copy of the weights is created.
model = llama_pipeline.model
llm = HuggingFacePipeline(pipeline=llama_pipeline)

model2 = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)
tokenizer2 = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)

# Custom embedding, used because sentence_transformers cannot be used here:
# it requires transformers==4.39, and that version uses a large amount of RAM
# when loading the Falcon model.
warnings.filterwarnings("ignore", category=UserWarning)


class MyEmbeddings:
    """Embeddings backed by the module-level MiniLM model and tokenizer.

    Provides the ``embed_documents`` / ``embed_query`` pair that
    ``SemanticChunker`` and ``Chroma`` are given below.
    """

    def __init__(self):
        self.model = model2

    def _embed(self, texts):
        """Return one mean-pooled hidden-state vector per input text.

        Args:
            texts: a string or a list of strings, as accepted by the
                tokenizer.

        Returns:
            A tensor whose first dimension indexes the input texts.
        """
        inputs = tokenizer2(
            texts, padding=True, truncation=True, return_tensors="pt"
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        # NOTE(review): the mean runs over every token position, padding
        # included. Left unchanged so new vectors stay consistent with the
        # ones already stored in the persisted index.
        return outputs.last_hidden_state.mean(dim=1)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in *texts*; an empty list yields an empty list."""
        if not texts:
            return []
        return self._embed(texts).tolist()

    def embed_query(self, query: str) -> List[float]:
        """Embed a single query string."""
        return self._embed(query)[0].tolist()


embeddings = MyEmbeddings()
splitter = SemanticChunker(embeddings)

CHROMA_PATH = "chroma8"
# Open the Chroma store persisted in CHROMA_PATH.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)

# NOTE(review): the collapsed source lost the line breaks inside the
# triple-quoted strings below; the layout here is a reconstruction -- confirm.
prompt = """
The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context.

Current conversation:

H: Who is Dwight K Schrute?
AI:
""".strip()

input_ids = tokenizer(prompt, return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)


class StopGenerationCriteria(StoppingCriteria):
    """Stop generation once the output ends with one of the given sequences.

    Args:
        tokens: stop sequences, each a list of token strings.
        tokenizer: tokenizer used to convert the token strings to ids.
        device: device on which the stop-id tensors are created.
    """

    def __init__(
        self, tokens: List[List[str]], tokenizer: AutoTokenizer, device: torch.device
    ):
        stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
        self.stop_token_ids = [
            torch.tensor(x, dtype=torch.long, device=device) for x in stop_token_ids
        ]

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        # Only the first sequence in the batch is inspected.
        for stop_ids in self.stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False


stop_tokens = [["Human", ":"], ["AI", ":"]]
stopping_criteria = StoppingCriteriaList(
    [StopGenerationCriteria(stop_tokens, tokenizer, model.device)]
)


class CleanupOutputParser(BaseOutputParser):
    """Strip speaker markers ("\\nUser", "\\nHuman:", "\\nAI:") from output."""

    def parse(self, text: str) -> str:
        user_pattern = r"\nUser"
        text = re.sub(user_pattern, "", text)
        human_pattern = r"\nHuman:"
        text = re.sub(human_pattern, "", text)
        ai_pattern = r"\nAI:"
        return re.sub(ai_pattern, "", text).strip()

    @property
    def _type(self) -> str:
        return "output_parser"


template = """
The following
Current conversation:

{history}

H: {input}
AI:""".strip()

# `prompt` is rebound here from the sample string above to the chain's
# prompt template, as in the original script.
prompt = PromptTemplate(input_variables=["history", "input"], template=template)

memory = ConversationBufferWindowMemory(
    memory_key="history", k=6, return_only_outputs=True
)

chain = ConversationChain(
    llm=llm,
    memory=memory,
    prompt=prompt,
    output_parser=CleanupOutputParser(),
    verbose=True,
)

QUERY_TEMPLATE = """
Answer the question based only on the following context. Dont provide any information out of the context:

{context}

---

Answer the question based on the above context: {question}
"""


def get_llama_response(message: str, history: list) -> str:
    """Answer *message* using the three most similar chunks from the store.

    Args:
        message: the user's question.
        history: chat history supplied by the caller; not used.

    Returns:
        The generated text that follows the prompt, stripped of surrounding
        whitespace.
    """
    results = db.similarity_search_with_relevance_scores(message, k=3)
    if not results or results[0][1] < 0.5:
        # NOTE(review): this only reports the weak match; generation still
        # proceeds with whatever context was retrieved. Confirm that is the
        # intended behaviour before turning it into an early return.
        print("Unable to find matching results.")

    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    query = QUERY_TEMPLATE.format(context=context_text, question=message)

    sequences = llama_pipeline(
        query,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=1024,
    )
    generated_text = sequences[0]["generated_text"]
    # The returned text begins with the prompt; keep only what follows it.
    return generated_text[len(query):].strip()


gr.ChatInterface(get_llama_response).launch()