sumanthkv committed on
Commit
3a6de21
·
verified ·
1 Parent(s): 149797b

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -0
app.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.document_loaders import PyPDFLoader
3
+ from langchain.document_loaders import TextLoader
4
+ from langchain.document_loaders import Docx2txtLoader
5
+ from langchain.text_splitter import CharacterTextSplitter
6
+ from langchain.embeddings import HuggingFaceEmbeddings
7
+ from langchain.vectorstores import Chroma
8
+ from huggingface_hub import notebook_login
9
+ import torch
10
+ import transformers
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM
12
+ from transformers import pipeline
13
+ from langchain import HuggingFacePipeline
14
+ from langchain.chains import ConversationalRetrievalChain
15
+ from langchain.memory import ConversationBufferMemory
16
+ from langchain.embeddings.openai import OpenAIEmbeddings
17
+ from langchain.chat_models import ChatOpenAI
18
+ import os
19
+ import sys
20
+
21
+
22
# Make sure the directory that holds the documents exists.
# ``exist_ok=True`` replaces the former "check, then create" sequence: it is
# a single call, and it cannot fail when the directory appears between the
# check and the creation (e.g. two app processes starting at once).
os.makedirs("docs", exist_ok=True)
25
+
26
+ # Define a function to load documents from the "docs" directory
27
def load_documents(directory="./docs"):
    """Load every supported document found directly inside *directory*.

    The loader is chosen from the file name's extension, matched
    case-insensitively so that ``REPORT.PDF`` is treated like ``report.pdf``:

    * ``.pdf``            -> ``PyPDFLoader``
    * ``.docx`` / ``.doc`` -> ``Docx2txtLoader``
    * ``.txt``            -> ``TextLoader``

    Entries with any other extension, and entries that are not regular files,
    are skipped. The scan is not recursive.

    Args:
        directory: Folder to scan. Defaults to ``"./docs"``, the folder the
            app writes uploads into.

    Returns:
        A flat list containing everything returned by each loader's
        ``load()`` call, in the order ``os.listdir`` yields the files.

    Raises:
        FileNotFoundError: If *directory* does not exist (raised by
            ``os.listdir``).
    """
    documents = []
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        # A sub-directory named e.g. "archive.pdf" must not reach a loader.
        if not os.path.isfile(path):
            continue

        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(path)
        elif lowered.endswith((".docx", ".doc")):
            # NOTE(review): legacy binary .doc files are routed to the same
            # loader as .docx, as before - confirm the loader can read them.
            loader = Docx2txtLoader(path)
        elif lowered.endswith(".txt"):
            loader = TextLoader(path)
        else:
            continue

        documents.extend(loader.load())
    return documents
43
+
44
# ---------------------------------------------------------------------------
# Streamlit app
#
# Streamlit re-executes this script from top to bottom on every user
# interaction, so everything below is one straight-line pass: render the
# widgets, save any uploads, index what is in docs/, answer the current
# question. There must be no blocking loop around widgets.
# ---------------------------------------------------------------------------
st.title('DocBot - Your Document Query Assistant')

# Which LLM answers the questions. The hosted OpenAI chat model is the
# default; setting DOCBOT_USE_LOCAL_LLM=1 switches to a local Gemma pipeline.
USE_LOCAL_LLM = os.environ.get("DOCBOT_USE_LOCAL_LLM", "").lower() in ("1", "true", "yes")

# Secrets are read from the environment and never written into the source.
# Any key that was previously committed to this file must be treated as
# leaked and revoked.
if not USE_LOCAL_LLM and not os.environ.get("OPENAI_API_KEY"):
    st.error("OPENAI_API_KEY is not set. Export it in the environment before launching the app.")
    st.stop()

st.write('Upload your documents to get started.')

uploaded_files = st.file_uploader("Upload Files", type=['pdf', 'docx', 'doc', 'txt'], accept_multiple_files=True)

# Save uploads BEFORE indexing so files uploaded in this run are searchable
# in this run.
if uploaded_files:
    st.write("Uploaded Files:")
    for file in uploaded_files:
        # The file name comes from the browser; keep only its final component
        # so it cannot point outside docs/.
        with open(os.path.join("docs", os.path.basename(file.name)), "wb") as f:
            f.write(file.getbuffer())
    st.write("Files uploaded successfully. You can start asking questions now.")

# Load documents
document = load_documents()

# Split documents into chunks
document_splitter = CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100)
document_chunks = document_splitter.split_documents(document)

# Nothing to index yet (e.g. first launch with an empty docs/ folder): show
# a hint and end this run instead of building a vector store from no chunks.
if not document_chunks:
    st.info("No documents have been indexed yet. Upload at least one file to begin.")
    st.stop()

# Initialize embeddings
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Initialize Chroma as the vector database
# NOTE(review): this runs on every Streamlit rerun and writes into the same
# persist directory each time - verify whether chunks accumulate in ./data
# and consider caching the store between reruns.
vectordb = Chroma.from_documents(document_chunks, embedding=embeddings, persist_directory='./data')
vectordb.persist()

# Select the LLM. Previously the local model was always loaded (with a BERT
# tokenizer that does not belong to the Gemma checkpoint) and the resulting
# ``llm`` was then immediately replaced by ChatOpenAI. Now exactly one
# backend is constructed.
if USE_LOCAL_LLM:
    # Hub authentication: use HF_TOKEN when provided, otherwise fall back to
    # the token cached by a prior CLI login (``True``). The interactive
    # notebook login widget is not usable from a Streamlit script.
    hf_token = os.environ.get("HF_TOKEN", True)
    tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b", use_auth_token=hf_token)
    model = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype=torch.float16,
                                                 device_map="auto", use_auth_token=hf_token)
    # dtype and device placement are already fixed on ``model`` above, so
    # they are not repeated (with a conflicting dtype) on the pipeline.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
                    max_new_tokens=512, min_new_tokens=-1, top_k=30)
    llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': 0})
else:
    llm = ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo')

# Keep the conversation memory in the session state: a plain module-level
# object would be recreated (and the chat history lost) on every rerun.
if "memory" not in st.session_state:
    st.session_state["memory"] = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
memory = st.session_state["memory"]

# Initialize the conversational retrieval chain
pdf_qa = ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectordb.as_retriever(search_kwargs={'k': 6}),
                                               verbose=False, memory=memory)

# Single question box. Submitting it triggers a rerun of the script, which
# is what delivers the next question - no ``while True`` loop is needed (a
# loop here would re-create the same widget and never finish the run).
query = st.text_input("Ask a question:")
if query:
    result = pdf_qa({"question": query})
    st.write("Answer: " + result["answer"])