|
import streamlit as st |
|
from langchain.embeddings import HuggingFaceInstructEmbeddings |
|
from langchain.vectorstores import FAISS |
|
from langchain.text_splitter import CharacterTextSplitter |
|
from langchain.document_loaders import DirectoryLoader, PyPDFLoader |
|
import os |
|
from PyPDF2 import PdfReader |
|
from transformers import pipeline |
|
from transformers import AutoModel |
|
|
|
|
|
from langchain.prompts import ChatPromptTemplate |
|
from langchain.schema import StrOutputParser |
|
from langchain.schema.runnable import RunnablePassthrough |
|
from langchain.chains import ConversationalRetrievalChain |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_pdf_text(folder_path):
    """Concatenate the extracted text of every PDF located directly in a folder.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    read; sub-directories are not descended into.

    Args:
        folder_path: Directory to scan for PDF files.

    Returns:
        The text of all pages of all PDFs joined into a single string, or an
        empty string when the folder contains no PDF file.
    """
    page_texts = []

    # Sorted so the result does not depend on the arbitrary order in which
    # os.listdir() happens to return the directory entries.
    for filename in sorted(os.listdir(folder_path)):
        filepath = os.path.join(folder_path, filename)
        if not (os.path.isfile(filepath) and filename.lower().endswith(".pdf")):
            continue

        pdf_reader = PdfReader(filepath)
        # `or ""` keeps the join below safe should a page yield no text
        # (None) instead of a string.
        page_texts.extend(page.extract_text() or "" for page in pdf_reader.pages)

    # One join instead of repeated `+=`, which copies the growing string on
    # every page.
    return "".join(page_texts)
|
|
|
|
|
def get_text_chunks(text):
    """Split a long text into overlapping chunks.

    The text is cut at newline characters into pieces of at most 1000
    characters (measured with ``len``), with an overlap of 200 characters
    configured between neighbouring pieces.

    Args:
        text: The text to split.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)
|
|
|
|
|
def create_vectorstore_and_store(folder_path='./files', save_directory="Store"):
    """Index the PDFs of a folder in a FAISS vector store and save it to disk.

    The PDF text is extracted with ``get_pdf_text``, chunked with
    ``get_text_chunks``, embedded and written to ``save_directory``.

    Args:
        folder_path: Directory containing the PDF files to index. Defaults
            to ``'./files'``, the location that used to be hard-coded.
        save_directory: Directory the FAISS index is written to. Defaults to
            ``"Store"``, the location that used to be hard-coded.

    Returns:
        None. The result is the index saved on disk.

    Raises:
        ValueError: If no text chunk could be produced from the folder.
    """
    pdf_text = get_pdf_text(folder_path)
    text_chunks = get_text_chunks(pdf_text)

    # Fail early with an actionable message: building an index from zero
    # texts otherwise fails deep inside the vector store code.
    if not text_chunks:
        raise ValueError(
            f"No text could be extracted from the PDFs in {folder_path!r}; "
            "nothing to index."
        )

    embeddings = HuggingFaceInstructEmbeddings(
        model_name="deutsche-telekom/bert-multi-english-german-squad2"
    )
    vectorstoreDB = FAISS.from_texts(texts=text_chunks, embedding=embeddings)

    vectorstoreDB.save_local(save_directory)
    # Kept from the original: prints the store object for the operator.
    print(vectorstoreDB)
    return None
|
|
|
|
|
|
|
def get_vectorstore():
    """Load the FAISS vector store saved in the ``Store`` directory.

    The store is opened with the same embedding model name that
    ``create_vectorstore_and_store`` uses when writing it.

    Returns:
        The vector store object returned by ``FAISS.load_local``.
    """
    # NOTE(review): LangChain's FAISS.load_local is understood to read a
    # pickled docstore -- only point it at directories this application
    # wrote itself; confirm against the installed LangChain version.
    store_dir = "Store"
    embedder = HuggingFaceInstructEmbeddings(
        model_name="deutsche-telekom/bert-multi-english-german-squad2"
    )
    return FAISS.load_local(store_dir, embedder)
|
|
|
|
|
|
|
def get_conversation_chain(vectorstore):
    """Build a conversational retrieval chain on top of a vector store.

    Args:
        vectorstore: Vector store whose ``as_retriever()`` supplies the
            documents for the chain.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm`` using
        the hosted ``google/flan-t5-xxl`` model.
    """
    # Imported here because the module-level imports never bring in
    # HuggingFaceHub; without this the call below raised NameError.
    # NOTE(review): HuggingFaceHub presumably needs a Hugging Face API token
    # in the environment -- confirm before deploying.
    from langchain.llms import HuggingFaceHub

    llm = HuggingFaceHub(
        repo_id="google/flan-t5-xxl",
        model_kwargs={"temperature": 0.5, "max_length": 512},
    )
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
    )
|
|
|
|
|
|
|
|
|
def main():
    """Streamlit page: answer a user question from the stored PDF index.

    Steps:
        1. Read the question from a text area.
        2. Retrieve documents for it from the saved vector store and take
           the first one as context.
        3. Run an extractive question-answering pipeline on that context and
           display the answer.
        4. Build the conversational retrieval chain and display the chain
           object.

    Returns:
        None. All output is written to the Streamlit page.
    """
    user_question = st.text_area("Stell mir eine Frage: ")

    # Nothing to do until a question was entered; this also avoids loading
    # the index and the models for an empty query.
    if not user_question.strip():
        return

    # Load the store once and reuse it for both the retriever and the
    # conversation chain (it was previously loaded twice).
    vectorstore = get_vectorstore()
    retrieved_docs = vectorstore.as_retriever().invoke(user_question)

    # Guard the [0] access below against an empty retrieval result.
    if not retrieved_docs:
        st.warning("Keine passenden Textstellen gefunden.")
        return

    context = retrieved_docs[0].page_content
    st.text(context)
    st.text(user_question)

    model_name = "deutsche-telekom/bert-multi-english-german-squad2"
    qa_pipeline = pipeline(
        "question-answering",
        model=model_name,
        tokenizer=model_name,
    )
    answer = qa_pipeline(question=user_question, context=context)

    st.text("Basisantwort:")
    st.text(answer["answer"])
    # Full pipeline result, shown in addition to the bare answer text.
    st.text(answer)

    # Displays the chain object itself; the chain is not invoked here.
    conversation_chain = get_conversation_chain(vectorstore)
    st.text(conversation_chain)

    # TODO: experiment (previously disabled code) -- expand the extractive
    # answer with a text-generation pipeline (model 'tiiuae/falcon-40b',
    # max_length=30, num_return_sequences=3) and show it under
    # "Generierte Erweiterung:".
    #
    # TODO: idea (previously disabled code) -- extend the retriever into a
    # runnable chain: {"context": retriever | format_docs,
    # "question": RunnablePassthrough()} | ChatPromptTemplate built from an
    # "Answer the question based only on the following context" template
    # | model | StrOutputParser(), then display chain.invoke(user_question).
|
|
|
|
|
|
|
|
|
# Render the page only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()