Update app.py
app.py
CHANGED
@@ -1,3 +1,4 @@
+import streamlit as st
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
@@ -10,7 +11,6 @@ from sentence_transformers import SentenceTransformer
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 import torch

-
 def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[Tuple[str, str]], List[str]]:
     visited = set()
     results = []
@@ -31,13 +31,11 @@ def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[
             response = requests.get(url)
             soup = BeautifulSoup(response.text, 'html.parser')

-
             text = soup.get_text()
             text = re.sub(r'\s+', ' ', text).strip()

             results.append((url, text))

-
             if depth < max_depth:
                 for link in soup.find_all('a', href=True):
                     next_url = urljoin(url, link['href'])
@@ -47,8 +45,7 @@ def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[
         except Exception as e:
             print(f"Error crawling {url}: {e}")

-    return results
-
+    return results, crawled_urls

 def chunk_text(text: str, max_chunk_size: int = 1000) -> List[str]:
     chunks = []
@@ -66,24 +63,27 @@ def chunk_text(text: str, max_chunk_size: int = 1000) -> List[str]:

     return chunks

-
 class InMemoryStorage:
     def __init__(self):
-        self.embeddings = []
+        self.embeddings = np.array([])
         self.texts = []
         self.urls = []

     def insert(self, embeddings, texts, urls):
-        self.embeddings.extend(embeddings)
+        if self.embeddings.size == 0:
+            self.embeddings = embeddings
+        else:
+            self.embeddings = np.vstack((self.embeddings, embeddings))
         self.texts.extend(texts)
         self.urls.extend(urls)

     def search(self, query_embedding, top_k=5):
+        if self.embeddings.size == 0:
+            return []
         similarities = np.dot(self.embeddings, query_embedding)
         top_indices = np.argsort(similarities)[-top_k:][::-1]
         return [(self.texts[i], self.urls[i]) for i in top_indices]

-
 def get_sentence_transformer():
     return SentenceTransformer('distilbert-base-nli-mean-tokens')

@@ -92,13 +92,11 @@ def insert_chunks(storage, chunks: List[str], urls: List[str]):
     embeddings = model.encode(chunks)
     storage.insert(embeddings, chunks, urls)

-
 def vector_search(storage, query: str, top_k: int = 5):
     model = get_sentence_transformer()
     query_embedding = model.encode([query])[0]
     return storage.search(query_embedding, top_k)

-
 class QuestionAnsweringSystem:
     def __init__(self):
         self.tokenizer = T5Tokenizer.from_pretrained("t5-small")
@@ -118,45 +116,38 @@ class QuestionAnsweringSystem:

         return answer

-
 def get_answer(storage, qa_system: QuestionAnsweringSystem, query: str) -> Tuple[str, str]:
     results = vector_search(storage, query)
+    if not results:
+        return "No relevant documents found.", ""
     context = " ".join([result[0] for result in results])
     answer = qa_system.answer_question(query, context)
     source_url = results[0][1] if results else ""
     return answer, source_url

-
-
+# Streamlit UI
+st.title("CUDA Documentation QA System")

-
-
-    qa_system = QuestionAnsweringSystem()
+storage = InMemoryStorage()
+qa_system = QuestionAnsweringSystem()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    st.write("**Answer:**")
-    st.write(answer)
-    st.write("**Source:**")
-    st.write(source_url)
-
-if __name__ == "__main__":
-    main()
+# Crawling and processing the data
+if st.button('Crawl CUDA Documentation'):
+    with st.spinner('Crawling CUDA documentation...'):
+        crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
+        st.write(f"Processed {len(crawled_data)} pages.")
+
+        for url, text in crawled_data:
+            chunks = chunk_text(text, max_chunk_size=1024)
+            insert_chunks(storage, chunks, [url] * len(chunks))
+        st.success("Crawling and processing completed.")
+
+# Asking questions
+query = st.text_input("Enter your question about CUDA:")
+if query:
+    with st.spinner('Searching for an answer...'):
+        answer, source_url = get_answer(storage, qa_system, query)
+    st.write("**Answer:**")
+    st.write(answer)
+    st.write("**Source:**")
+    st.write(source_url)
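For reference, a minimal, self-contained sketch of how the updated InMemoryStorage is expected to behave, with the insert/search logic copied from this commit and small dummy NumPy vectors standing in for real SentenceTransformer embeddings (no Streamlit, crawling, or model downloads involved):

import numpy as np

# Storage logic as updated in this commit: embeddings accumulate in a single
# 2-D NumPy array instead of a Python list.
class InMemoryStorage:
    def __init__(self):
        self.embeddings = np.array([])
        self.texts = []
        self.urls = []

    def insert(self, embeddings, texts, urls):
        # First batch replaces the empty array; later batches are stacked row-wise.
        if self.embeddings.size == 0:
            self.embeddings = embeddings
        else:
            self.embeddings = np.vstack((self.embeddings, embeddings))
        self.texts.extend(texts)
        self.urls.extend(urls)

    def search(self, query_embedding, top_k=5):
        # Guard added in this commit: an empty store returns no hits instead of failing.
        if self.embeddings.size == 0:
            return []
        similarities = np.dot(self.embeddings, query_embedding)
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [(self.texts[i], self.urls[i]) for i in top_indices]

# Dummy 3-dimensional "embeddings" in place of real model output (illustrative values only).
storage = InMemoryStorage()
print(storage.search(np.ones(3)))  # [] -- searching before any insert is now safe

storage.insert(np.eye(3), ["chunk a", "chunk b", "chunk c"], ["url1", "url2", "url3"])
storage.insert(np.array([[0.0, 0.9, 0.1]]), ["chunk d"], ["url4"])

# Highest dot-product similarity to the query wins.
print(storage.search(np.array([0.0, 1.0, 0.0]), top_k=2))
# -> [('chunk b', 'url2'), ('chunk d', 'url4')]

The empty-store guard pairs with the new "if not results" check in get_answer, so asking a question before crawling shows "No relevant documents found." rather than crashing the app.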