Mattral committed
Commit 3ef7ded · verified · 1 Parent(s): 8ef7048

Update app.py

Files changed (1)
  1. app.py +37 -97
app.py CHANGED
@@ -1,21 +1,19 @@
 import streamlit as st
 from bs4 import BeautifulSoup
 import io
-import fitz
+import fitz  # PyMuPDF
 import requests
+from docarray import Document
+from pydantic import BaseModel, Field
+from typing import List
 from langchain.llms import LlamaCpp
 from langchain.callbacks.base import BaseCallbackHandler
 from langchain.vectorstores import DocArrayInMemorySearch
-from langchain.docstore.document import Document
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from langchain.text_splitter import RecursiveCharacterTextSplitter

-
-# StreamHandler to intercept streaming output from the LLM.
-# This makes it appear that the Language Model is "typing"
-# in realtime.
 class StreamHandler(BaseCallbackHandler):
     def __init__(self, container, initial_text=""):
         self.container = container
@@ -25,24 +23,25 @@ class StreamHandler(BaseCallbackHandler):
         self.text += token
         self.container.markdown(self.text)

+class DocArrayDoc(BaseModel):
+    text: str = Field(default="")
+    embedding: List[float]
+    metadata: dict = Field(default_factory=dict)

 @st.cache_data
-
 def get_page_urls(url):
     page = requests.get(url)
     soup = BeautifulSoup(page.content, 'html.parser')
-    links = [link['href'] for link in soup.find_all('a') if 'href' in link.attrs and link['href'].startswith(url) and link['href'] not in [url]]
+    links = [link['href'] for link in soup.find_all('a', href=True) if link['href'].startswith(url)]
     links.append(url)
     return set(links)

 @st.cache(allow_output_mutation=True)
 def process_pdf(file):
-    # file is expected to be a BytesIO object directly from the file uploader
-    doc = fitz.open("pdf", file.read()) # "pdf" indicates file format is PDF, reading the BytesIO stream
+    doc = fitz.open("pdf", file.read())
     texts = [page.get_text() for page in doc]
     return '\n'.join(texts)

-
 def get_url_content(url):
     response = requests.get(url)
     if url.endswith('.pdf'):
@@ -52,93 +51,49 @@ def get_url_content(url):
     else:
         soup = BeautifulSoup(response.content, 'html.parser')
         content = soup.find_all('div', class_='wpb_content_element')
-        text = [c.get_text().strip() for c in content if c.get_text().strip() != '']
-        text = [line for item in text for line in item.split('\n') if line.strip() != '']
-        # Exclude footer content
-        try:
-            arts_on_index = text.index('ARTS ON:')
-            return (url, '\n'.join(text[:arts_on_index]))
-        except ValueError:
-            return (url, '\n'.join(text)) # Return full text if specific marker not found
+        text = ' '.join([c.get_text().strip() for c in content])
+        return (url, text)

 @st.cache_resource
 def get_retriever(urls):
     all_content = [get_url_content(url) for url in urls]
-    print(all_content) # See what is actually fetched
-    documents = [Document(page_content=doc, metadata={'url': url}) for (url, doc) in all_content]
-    print(documents) # Verify that documents are created correctly
-
+    documents = [Document(text=content, metadata={'url': url}) for (url, content) in all_content]
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
     docs = text_splitter.split_documents(documents)
-    print(docs) # Check the final structure of split documents
-
     embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
     db = DocArrayInMemorySearch.from_documents(docs, embeddings)
     retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10})
     return retriever

-
 @st.cache_resource
 def create_chain(_retriever):
-    # A stream handler to direct streaming output on the chat screen.
-    # This will need to be handled somewhat differently.
-    # But it demonstrates what potential it carries.
-    # stream_handler = StreamHandler(st.empty())
-
-    # Callback manager is a way to intercept streaming output from the
-    # LLM and take some action on it. Here we are giving it our custom
-    # stream handler to make it appear as if the LLM is typing the
-    # responses in real time.
-    # callback_manager = CallbackManager([stream_handler])
-
-    n_gpu_layers = 40 # Change this value based on your model and your GPU VRAM pool.
-    n_batch = 2048 # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
-
+    n_gpu_layers = 10
+    n_batch = 2048
     llm = LlamaCpp(
-        model_path="models /mistral-7b-instruct-v0.1.Q5_0.gguf",
+        model_path="models/mistral-7b-instruct-v0.1.Q5_0.gguf",
         n_gpu_layers=n_gpu_layers,
         n_batch=n_batch,
         n_ctx=2048,
-        # max_tokens=2048,
         temperature=0,
-        # callback_manager=callback_manager,
         verbose=False,
         streaming=True,
     )
-
-    # Template for the prompt.
-    # template = "{question}"
-
-    # We create a prompt from the template so we can use it with langchain
-    # prompt = PromptTemplate(template=template, input_variables=["question"])
-
-    # Setup memory for contextual conversation
     memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
-
-    # We create a qa chain with our llm, retriever, and memory
     qa_chain = ConversationalRetrievalChain.from_llm(
         llm, retriever=_retriever, memory=memory, verbose=False
     )
-
     return qa_chain

-
-
-# Set the webpage title
+# Webpage title and header
 st.set_page_config(page_title="Your own AI-Chat!")
 st.header("Your own AI-Chat!")

-# This sets the LLM's personality.
-# The initial personality privided is basic.
-# Try something interesting and notice how the LLM responses are affected.
-# system_prompt = st.text_area(
-# label="System Prompt",
-# value="You are a helpful AI assistant who answers questions in short sentences.",
-# key="system_prompt")
-# Choose input method
+system_prompt = st.text_area(
+    label="System Prompt",
+    value="You are a helpful AI assistant who answers questions accurately.",
+    key="system_prompt")

 input_type = st.radio("Choose an input method:", ['URL', 'Upload PDF'])
-
 if input_type == 'URL':
     base_url = st.text_input("Enter the site URL here:", key="base_url")
     if base_url:
@@ -149,36 +104,21 @@ elif input_type == 'Upload PDF':
     uploaded_file = st.file_uploader("Upload your PDF here:", type="pdf")
     if uploaded_file:
         pdf_text = process_pdf(uploaded_file)
-        # Process the PDF text into a format that can be used by your LLM
-        urls = [pdf_text] # Adapt as needed for your system
-        retriever = get_retriever(urls) # Ensure your retriever can handle raw text; if not, adapt it.
+        urls = [pdf_text] # Assuming this needs to be wrapped into proper structure
+        retriever = get_retriever(urls) # Ensure retriever accepts this
         llm_chain = create_chain(retriever)

-# We store the conversation in the session state.
-# This will be used to render the chat conversation.
-# We initialize it with the first message we want to be greeted with
-
-# Initialize chat session state for storing messages and responses
-if "messages" not in st.session_state:
-    st.session_state.messages = [{"role": "assistant", "content": "How may I help you today?"}]
-
-if "current_response" not in st.session_state:
-    st.session_state.current_response = ""
-
-# Render the chat messages
-for message in st.session_state.messages:
-    with st.chat_message(message["role"]):
-        st.markdown(message["content"])
-
-# Input and response handling
-if llm_chain and (user_prompt := st.chat_input("Your message here", key="user_input")):
-    # Add user input to the session state and chat window
-    st.session_state.messages.append({"role": "user", "content": user_prompt})
-    with st.chat_message("user"):
-        st.markdown(user_prompt)
-
-    # Generate and display the response using the LLM chain
-    response = llm_chain.run(user_prompt)
-    st.session_state.messages.append({"role": "assistant", "content": response})
-    with st.chat_message("assistant"):
-        st.markdown(response)
+# Interaction and message handling
+if 'retriever' in locals() and retriever:
+    if "messages" not in st.session_state:
+        st.session_state.messages = [{"role": "assistant", "content": "How may I help you today?"}]
+    if "current_response" not in st.session_state:
+        st.session_state.current_response = ""
+    for message in st.session_state.messages:
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
+    user_prompt = st.chat_input("Your message here", key="user_input")
+    if user_prompt:
+        st.session_state.messages.append({"role": "user", "content": user_prompt})
+        response = llm_chain.run(user_prompt)
+        st.session_state.messages.append({"role": "assistant", "content": response})
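
Note (not part of the commit): in the new 'Upload PDF' branch, get_retriever() still treats each list entry as a URL and fetches it with requests.get(), while the branch passes raw PDF text; the added comment itself says the text still "needs to be wrapped into proper structure". A minimal sketch of one way that wrapping could look, reusing the LangChain Document class from before this commit and a hypothetical get_text_retriever helper:

from langchain.docstore.document import Document  # LangChain Document, as used before this commit

@st.cache_resource
def get_text_retriever(raw_text):
    # Hypothetical helper: wrap raw PDF text in a Document so the existing
    # splitter / embedding / vector-store pipeline can index it directly.
    documents = [Document(page_content=raw_text, metadata={'source': 'uploaded_pdf'})]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
    docs = text_splitter.split_documents(documents)
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
    return db.as_retriever(search_type="mmr", search_kwargs={"k": 5, "fetch_k": 10})

The upload branch could then call retriever = get_text_retriever(pdf_text) instead of routing the raw text through get_retriever(urls).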