Spaces:
Sleeping
Sleeping
File size: 5,149 Bytes
ec9e166 3e1c3a5 ec9e166 ad07962 165f5e4 ad07962 165f5e4 ad07962 8c29218 ec9e166 8c29218 ec9e166 8c29218 c24dee1 ec9e166 8c29218 ec9e166 6b53f9c a5ca45a ec9e166 e6f76c5 ec9e166 e6f76c5 ec9e166 e370bd9 ec9e166 e6f76c5 ec9e166 730194c ec9e166 8c29218 ec9e166 8c29218 730194c ec9e166 8c29218 ec9e166 13387eb 730194c ec9e166 730194c ec9e166 730194c ec9e166 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import os
import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from htmlTemplates import css, bot_template, user_template
from langchain.llms import HuggingFaceHub
# Copy the Hugging Face token from Streamlit's secrets store into the process
# environment. This runs at import time, before any function below is called.
# NOTE(review): presumably read later by the HuggingFaceHub client -- confirm.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets['huggingface_token']
def add_logo():
    """Inject a CSS rule that draws the organization logo inside the sidebar.

    The rule targets Streamlit's ``stSidebar`` test id, sets the logo as a
    non-repeating background image and pads the sidebar content downwards so
    it does not overlap the image.
    """
    # Plain (non-f) string: there is nothing to interpolate, so the CSS braces
    # can be written literally instead of being doubled.
    sidebar_css = """
<style>
[data-testid="stSidebar"] {
background-image: url(https://smbk.s3.amazonaws.com/media/organization_logos/111579646d1241f4be17bd7394dcb238.jpg);
background-repeat: no-repeat;
padding-top: 80px;
background-position: 20px 20px;
}
</style>
"""
    st.markdown(sidebar_css, unsafe_allow_html=True)
def get_pdf_text(pdf_docs: list) -> str:
    """Concatenate the extracted text of every page of every given PDF.

    Parameters
    ----------
    pdf_docs : list
        Objects accepted by ``PdfReader`` (here: the files returned by the
        Streamlit uploader). An empty list yields an empty string.

    Returns
    -------
    str
        The text of all pages, in document order and page order, joined
        without any separator.
    """
    # ``"".join`` builds the result in one pass instead of growing a string
    # with ``+=``. ``or ""`` keeps a page whose extraction yields nothing
    # (e.g. ``None``) from raising ``TypeError`` during concatenation.
    return "".join(
        page.extract_text() or ""
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    )
def get_text_chunks(text: str) -> list:
    """Split ``text`` into overlapping chunks for embedding.

    Parameters
    ----------
    text : str
        The full text to split.

    Returns
    -------
    list
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``,
        using newline as separator, a chunk size of 1500 and an overlap of
        300, both measured with ``len``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1500,
        "chunk_overlap": 300,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
def get_vectorstore(text_chunks: list) -> FAISS:
    """Embed the text chunks on CPU and index them in a FAISS vector store.

    Parameters
    ----------
    text_chunks : list
        The text chunks to embed and index.

    Returns
    -------
    FAISS
        A vector store built from ``text_chunks``.
    """
    model = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    # Normalized embeddings make inner-product search equivalent to cosine
    # similarity.
    encode_kwargs = {"normalize_embeddings": True}
    embeddings = HuggingFaceBgeEmbeddings(
        model_name=model,
        encode_kwargs=encode_kwargs,
        model_kwargs={"device": "cpu"},
        # The BGE wrapper prepends an English, BGE-specific instruction to
        # every query by default. The model used here is not a BGE model, so
        # that prefix only skews query embeddings away from the passage
        # embeddings; disable it so both are embedded the same way.
        query_instruction="",
    )
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
def get_conversation_chain(vectorstore: FAISS) -> ConversationalRetrievalChain:
    """Build a conversational retrieval chain on top of ``vectorstore``.

    Parameters
    ----------
    vectorstore : FAISS
        The vector store whose retriever supplies context to the chain.

    Returns
    -------
    ConversationalRetrievalChain
        A chain combining a Hugging Face Hub hosted Mixtral model, the
        vector store's retriever, and a buffer memory stored under the
        ``chat_history`` key.
    """
    # Alternatives tried previously:
    #   ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
    #   repo_id="TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
    # NOTE(review): text-generation endpoints usually expect
    # ``max_new_tokens``; confirm that ``max_length`` is honored.
    generation_settings = {"temperature": 0.5, "max_length": 1048}
    hub_llm = HuggingFaceHub(
        repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
        model_kwargs=generation_settings,
    )
    history = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
    )
    retriever = vectorstore.as_retriever()
    return ConversationalRetrievalChain.from_llm(
        llm=hub_llm,
        retriever=retriever,
        memory=history,
    )
#def handle_userinput(user_question:str):
# response = st.session_state.conversation({"pregunta": user_question})
# st.session_state.chat_history = response["chat_history"]
#
# for i, message in enumerate(st.session_state.chat_history):
# if i % 2 == 0:
# st.write(" Usuario: " + message.content)
# else:
# st.write("🤖 ChatBot: " + message.content)
def handle_userinput(user_question):
    """
    Handle user input and generate a response using the conversational retrieval chain.

    The chain stored in ``st.session_state.conversation`` is called with the
    question, the returned history is saved in ``st.session_state.chat_history``
    and every message of that history is written to the page.

    Parameters
    ----------
    user_question : str
        The user's question.
    """
    conversation = st.session_state.conversation
    if conversation is None:
        # No documents have been processed yet, so there is no chain to call.
        # Without this guard the call below fails with
        # "TypeError: 'NoneType' object is not callable".
        st.warning("Primero sube tus documentos y haz click en 'Procesar'.")
        return

    response = conversation({"question": user_question})
    st.session_state.chat_history = response["chat_history"]

    # Messages at even positions are rendered as user turns and odd positions
    # as bot turns (assumes the history strictly alternates user/bot).
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.write("//_^ User: " + message.content)
        else:
            st.write("🤖 ChatBot: " + message.content)
def main():
    """Render the Streamlit page: question box, chat output and PDF sidebar.

    Session state keys ``conversation`` and ``chat_history`` are initialised
    to ``None`` on first run. A submitted question is passed to
    ``handle_userinput``. In the sidebar the user uploads PDFs and presses
    "Procesar", which extracts the text, chunks it, builds the vector store
    and stores a new conversation chain in ``st.session_state.conversation``.
    """
    st.set_page_config(
        page_title="Chat with a Bot that tries to answer questions about multiple PDFs",
        page_icon=":books:",
    )
    st.markdown("# Charla con TedCasBot")
    st.markdown("Este Bot será tu aliado a la hora de buscar información en múltiples documentos pdf. Déjanos ayudarte! 🙏🏾")
    st.write(css, unsafe_allow_html=True)

    if "conversation" not in st.session_state:
        st.session_state.conversation = None
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = None

    st.header("Charla con un Bot 🤖🦾 que te ayudará a responder preguntas sobre tus pdfs:")
    user_question = st.text_input("Haz tu pregunta!:")
    if user_question:
        handle_userinput(user_question)

    with st.sidebar:
        add_logo()
        st.subheader("Tus documentos")
        pdf_docs = st.file_uploader(
            "Sube tus documentos y haz click en 'Procesar'",
            accept_multiple_files=True,
            # Only PDFs can be parsed by the extraction step, so reject other
            # file types at upload time instead of failing during processing.
            type="pdf",
        )
        if st.button("Procesar"):
            if not pdf_docs:
                # Nothing uploaded: building an index from no text would fail.
                st.warning("Sube al menos un documento PDF antes de procesar.")
            else:
                with st.spinner("Procesando"):
                    # get pdf text
                    raw_text = get_pdf_text(pdf_docs)
                    # get the text chunks
                    text_chunks = get_text_chunks(raw_text)
                    if not text_chunks:
                        # E.g. scanned PDFs with no text layer: there is
                        # nothing to embed, so keep any previous chain.
                        st.error("No se pudo extraer texto de los documentos subidos.")
                    else:
                        # create vector store
                        vectorstore = get_vectorstore(text_chunks)
                        # create conversation chain
                        st.session_state.conversation = get_conversation_chain(vectorstore)
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|