Spaces:
Sleeping
Sleeping
File size: 5,727 Bytes
2df9243 9917a1a 9c5d83d 2b04423 7c95914 2df9243 7c95914 2df9243 7c95914 161353e 2df9243 7c95914 47f6195 7c95914 abd1f1b 976634b d3de2d8 976634b d3de2d8 976634b a98948f 976634b a98948f 2df9243 7c95914 f8dc8d1 a98948f 403a475 7c95914 f7d3ba1 7c95914 f7d3ba1 cab37f8 2df9243 7c95914 2df9243 7c95914 f9dbffb 7c95914 e38f084 5751fd4 ebf2d4c c239258 9ca7d21 2df9243 2b04423 7c95914 9ca7d21 eb1ac12 48a4a46 7c95914 7ccf47a a58a97b 7ccf47a a58a97b 7ccf47a 2df9243 bcd8992 586a969 bcd8992 36c54d4 2df9243 36c54d4 161353e abd1f1b 36c54d4 0da8351 5616919 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import os
import streamlit.components.v1 as components
from datasets import load_dataset
import random
import pickle
from nltk.tokenize import sent_tokenize
import nltk
from PyPDF2 import PdfReader
import streamlit as st
from streamlit_extras.add_vertical_space import add_vertical_space
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks import get_openai_callback
from my_component import my_component
# One-time download of the NLTK "punkt" sentence-tokenizer models used by the
# response post-processing helpers further down in this file.
nltk.download('punkt')

# ---- Sidebar contents ----
with st.sidebar:
    st.title(':orange_book: BinDoc GmbH')

    # Retrieve the API key from the environment (populated via st.secrets on
    # Streamlit Cloud) and halt the whole app early when it is missing.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.warning('API key is required to proceed.')
        st.stop()  # Stop the app if the API key is not provided

    st.markdown("Experience the future of document interaction with the revolutionary")
    st.markdown("**BinDocs Chat App**.")
    st.markdown("Harnessing the power of a Large Language Model and AI technology,")
    st.markdown("this innovative platform redefines PDF engagement,")
    st.markdown("enabling dynamic conversations that bridge the gap between")
    st.markdown("human and machine intelligence.")

    add_vertical_space(3)  # Add more vertical space between text blocks
    st.write('Made with ❤️ by BinDoc GmbH')
def load_pdf(file_path):
    """Extract per-page text from an uploaded PDF and return a FAISS store.

    The vector store is cached on disk as ``<pdf name>.pkl`` so the (paid)
    OpenAI embedding call only happens once per document.

    Args:
        file_path: The uploaded PDF — assumed to be a Streamlit
            ``UploadedFile`` (it must expose ``.name`` and be readable by
            ``PyPDF2.PdfReader``).

    Returns:
        A ``FAISS`` vector store built over one text chunk per page.

    Raises:
        ValueError: If no text could be extracted from any page (e.g. a
            scanned, image-only PDF) — ``FAISS.from_texts`` would otherwise
            fail with an obscure error on an empty list.
    """
    pdf_reader = PdfReader(file_path)

    # One chunk per page; pages without extractable text are skipped.
    chunks = [text for page in pdf_reader.pages if (text := page.extract_text())]
    if not chunks:
        raise ValueError("No extractable text found in the uploaded PDF.")

    # Cache key: the file name without its extension (previously a blind
    # [:-4] slice, which assumed a 3-letter extension).
    store_name, _ = os.path.splitext(file_path.name)
    cache_path = f"{store_name}.pkl"

    if os.path.exists(cache_path):
        # NOTE(security): unpickling is only acceptable here because this
        # cache file is written locally by this app itself, never received
        # from an untrusted source.
        with open(cache_path, "rb") as f:
            VectorStore = pickle.load(f)
    else:
        embeddings = OpenAIEmbeddings()
        VectorStore = FAISS.from_texts(chunks, embedding=embeddings)
        with open(cache_path, "wb") as f:
            pickle.dump(VectorStore, f)
    return VectorStore
def load_chatbot(max_tokens=300):
    """Build a 'stuff'-type question-answering chain backed by OpenAI.

    Args:
        max_tokens: Cap on the number of tokens the LLM may generate per
            answer (default 300).

    Returns:
        A LangChain QA chain ready for ``chain.run(...)``.
    """
    llm = OpenAI(temperature=0.1, max_tokens=max_tokens)
    return load_qa_chain(llm=llm, chain_type="stuff")
def display_chat_history(chat_history):
    """Render each (sender, message, status) triple as a colored chat bubble.

    Messages flagged "new" are highlighted; otherwise the color encodes who
    sent the message.
    """
    for sender, message, status in chat_history:
        if status == "new":
            color = "#FFA07A"
        elif sender == "User":
            color = "#acf"
        else:
            color = "#caf"
        st.markdown(
            f"<div style='background-color: {color}; padding: 10px; border-radius: 10px; margin: 10px;'>{sender}: {message}</div>",
            unsafe_allow_html=True,
        )
def remove_incomplete_sentences(text):
    """Drop sentences that do not end in '.', '!' or '?'.

    Guards against the LLM answer being truncated mid-sentence by the
    max_tokens limit: any cut-off trailing fragment is removed.
    """
    kept = []
    for sentence in sent_tokenize(text):
        if sentence.endswith(('.', '!', '?')):
            kept.append(sentence)
    return ' '.join(kept)
def remove_redundant_information(text):
    """Remove duplicate sentences while preserving their original order.

    Bug fix: the previous ``list(set(sentences))`` deduplicated but scrambled
    the sentence order nondeterministically (set iteration order depends on
    string hashing, which is randomized per process). ``dict.fromkeys`` keeps
    the first occurrence of each sentence in its original position.
    """
    sentences = sent_tokenize(text)
    unique_sentences = list(dict.fromkeys(sentences))
    return ' '.join(unique_sentences)
# Define a maximum token limit to avoid infinite loops
# NOTE(review): MAX_TOKEN_LIMIT is never read anywhere visible in this file —
# main() hard-codes max_tokens = 120 instead. Confirm whether this constant
# is still needed or should be wired into load_chatbot().
MAX_TOKEN_LIMIT = 400
# NOTE(review): duplicate of the `import random` at the top of the file, and
# `random` is not used anywhere visible here — candidate for removal.
import random
def main():
    """Streamlit entry point: chat UI for asking questions about a PDF."""
    st.title("BinDocs Chat App")

    # Per-session history of (sender, message, "new"/"old") triples.
    if "chat_history" not in st.session_state:
        st.session_state['chat_history'] = []

    display_chat_history(st.session_state['chat_history'])

    # Placeholder the newest user/bot exchange is rendered into.
    new_messages_placeholder = st.empty()

    pdf = st.file_uploader("Upload your PDF", type="pdf")
    query = st.text_input("Ask questions about your PDF file (in any preferred language):")

    if not pdf:
        st.warning("Please upload a PDF file to proceed.")
    else:
        # Canned example questions (German); clicking one overrides the
        # free-text query for this rerun.
        if st.button("Was genau ist ein Belegarzt?"):
            query = "Was genau ist ein Belegarzt?"
        if st.button("Wofür wird die Alpha-ID verwendet?"):
            query = "Wofür wird die Alpha-ID verwendet?"
        if st.button("Was sind die Vorteile des ambulanten operierens?"):
            query = "Was sind die Vorteile des ambulanten operierens?"

    if query:
        st.session_state['last_input'] = query
        st.session_state['chat_history'].append(("User", query, "new"))

        loading_message = st.empty()
        loading_message.text('Bot is thinking...')

        VectorStore = load_pdf(pdf)
        max_tokens = 120
        chain = load_chatbot(max_tokens=max_tokens)
        docs = VectorStore.similarity_search(query=query, k=2)

        with get_openai_callback() as cb:
            response = chain.run(input_documents=docs, question=query)
            # Post-processing to remove incomplete sentences and redundant information
            filtered_response = remove_incomplete_sentences(response)
            filtered_response = remove_redundant_information(filtered_response)

        st.session_state['chat_history'].append(("Bot", filtered_response, "new"))

        # Render only the latest user/bot pair into the placeholder.
        latest = st.session_state['chat_history'][-2:]
        for chat in latest:
            if chat[2] == "new":
                bubble_color = "#FFA07A"
            elif chat[0] == "User":
                bubble_color = "#acf"
            else:
                bubble_color = "#caf"
            new_messages_placeholder.markdown(
                f"<div style='background-color: {bubble_color}; padding: 10px; border-radius: 10px; margin: 10px;'>{chat[0]}: {chat[1]}</div>",
                unsafe_allow_html=True,
            )

        st.write("<script>document.getElementById('response').scrollIntoView();</script>", unsafe_allow_html=True)

        loading_message.empty()
        query = ""
    else:
        st.warning("Please upload a PDF file before asking questions.")

    # Demote every message to "old" so nothing stays highlighted on the
    # next Streamlit rerun.
    st.session_state['chat_history'] = [
        (sender, msg, "old") for sender, msg, _ in st.session_state['chat_history']
    ]

    my_component()


if __name__ == "__main__":
    main()
|