pankajsingh3012 committed
Commit 5cd5547 · verified · 1 Parent(s): 7fdd598

Update app.py

Files changed (1): app.py (+35, -44)
app.py CHANGED
@@ -1,3 +1,4 @@
+ import streamlit as st
  import requests
  from bs4 import BeautifulSoup
  from urllib.parse import urljoin
@@ -10,7 +11,6 @@ from sentence_transformers import SentenceTransformer
  from transformers import T5Tokenizer, T5ForConditionalGeneration
  import torch

-
  def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[Tuple[str, str]], List[str]]:
      visited = set()
      results = []
@@ -31,13 +31,11 @@ def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[
          response = requests.get(url)
          soup = BeautifulSoup(response.text, 'html.parser')

-
          text = soup.get_text()
          text = re.sub(r'\s+', ' ', text).strip()

          results.append((url, text))

-
          if depth < max_depth:
              for link in soup.find_all('a', href=True):
                  next_url = urljoin(url, link['href'])
@@ -47,8 +45,7 @@ def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[
      except Exception as e:
          print(f"Error crawling {url}: {e}")

-     return results[:10], crawled_urls[:10]
-
+     return results, crawled_urls

  def chunk_text(text: str, max_chunk_size: int = 1000) -> List[str]:
      chunks = []
@@ -66,24 +63,27 @@ def chunk_text(text: str, max_chunk_size: int = 1000) -> List[str]:

      return chunks

-
  class InMemoryStorage:
      def __init__(self):
-         self.embeddings = []
+         self.embeddings = np.array([])
          self.texts = []
          self.urls = []

      def insert(self, embeddings, texts, urls):
-         self.embeddings.extend(embeddings)
+         if self.embeddings.size == 0:
+             self.embeddings = embeddings
+         else:
+             self.embeddings = np.vstack((self.embeddings, embeddings))
          self.texts.extend(texts)
          self.urls.extend(urls)

      def search(self, query_embedding, top_k=5):
+         if self.embeddings.size == 0:
+             return []
          similarities = np.dot(self.embeddings, query_embedding)
          top_indices = np.argsort(similarities)[-top_k:][::-1]
          return [(self.texts[i], self.urls[i]) for i in top_indices]

-
  def get_sentence_transformer():
      return SentenceTransformer('distilbert-base-nli-mean-tokens')

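Review note on the InMemoryStorage hunk above: search ranks with a raw np.dot, so chunks are ordered by inner product rather than cosine similarity unless the embeddings happen to be normalized. A minimal, self-contained sketch of the new insert/search behavior, using toy 3-d vectors in place of real SentenceTransformer embeddings (the class body mirrors this commit; the toy data is illustrative only):

import numpy as np

class InMemoryStorage:
    def __init__(self):
        self.embeddings = np.array([])
        self.texts = []
        self.urls = []

    def insert(self, embeddings, texts, urls):
        # First insert seeds the matrix; later inserts stack row-wise.
        if self.embeddings.size == 0:
            self.embeddings = embeddings
        else:
            self.embeddings = np.vstack((self.embeddings, embeddings))
        self.texts.extend(texts)
        self.urls.extend(urls)

    def search(self, query_embedding, top_k=5):
        if self.embeddings.size == 0:
            return []
        # Raw dot product; for cosine ranking, L2-normalize the rows and
        # the query first (v / np.linalg.norm(v)).
        similarities = np.dot(self.embeddings, query_embedding)
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [(self.texts[i], self.urls[i]) for i in top_indices]

storage = InMemoryStorage()
storage.insert(np.array([[1.0, 0.0, 0.0]]), ["chunk a"], ["https://example.com/a"])
storage.insert(np.array([[0.0, 1.0, 0.0], [0.9, 0.1, 0.0]]),
               ["chunk b", "chunk c"],
               ["https://example.com/b", "https://example.com/c"])
print(storage.search(np.array([1.0, 0.0, 0.0]), top_k=2))
# -> [('chunk a', 'https://example.com/a'), ('chunk c', 'https://example.com/c')]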
@@ -92,13 +92,11 @@ def insert_chunks(storage, chunks: List[str], urls: List[str]):
      embeddings = model.encode(chunks)
      storage.insert(embeddings, chunks, urls)

-
  def vector_search(storage, query: str, top_k: int = 5):
      model = get_sentence_transformer()
      query_embedding = model.encode([query])[0]
      return storage.search(query_embedding, top_k)

-
  class QuestionAnsweringSystem:
      def __init__(self):
          self.tokenizer = T5Tokenizer.from_pretrained("t5-small")
@@ -118,45 +116,38 @@ class QuestionAnsweringSystem:

          return answer

-
  def get_answer(storage, qa_system: QuestionAnsweringSystem, query: str) -> Tuple[str, str]:
      results = vector_search(storage, query)
+     if not results:
+         return "No relevant documents found.", ""
      context = " ".join([result[0] for result in results])
      answer = qa_system.answer_question(query, context)
      source_url = results[0][1] if results else ""
      return answer, source_url

- def main():
-     print("CUDA Documentation QA System")
+ # Streamlit UI
+ st.title("CUDA Documentation QA System")

-
-     storage = InMemoryStorage()
-     qa_system = QuestionAnsweringSystem()
+ storage = InMemoryStorage()
+ qa_system = QuestionAnsweringSystem()

-
-     print("Crawling CUDA documentation...")
-     crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=0, delay=0.1)
-
-     print("Processing and inserting data...")
-     for url, text in crawled_data:
-         chunks = chunk_text(text, max_chunk_size=1024)
-         insert_chunks(storage, chunks, [url] * len(chunks))
-
-     print(f"Data crawled and inserted successfully! Processed {len(crawled_data)} pages.")
-
-
-     print("\nCrawled URLs:")
-     for url in crawled_urls:
-         print(url)
-
-     query = st.text_input("Enter your question about CUDA:")
-     if query:
-         with st.spinner('Searching for an answer...'):
-             answer, source_url = get_answer(storage, qa_system, query)
-             st.write("**Answer:**")
-             st.write(answer)
-             st.write("**Source:**")
-             st.write(source_url)
-
- if __name__ == "__main__":
-     main()
+ # Crawling and processing the data
+ if st.button('Crawl CUDA Documentation'):
+     with st.spinner('Crawling CUDA documentation...'):
+         crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
+         st.write(f"Processed {len(crawled_data)} pages.")
+
+         for url, text in crawled_data:
+             chunks = chunk_text(text, max_chunk_size=1024)
+             insert_chunks(storage, chunks, [url] * len(chunks))
+         st.success("Crawling and processing completed.")
+
+ # Asking questions
+ query = st.text_input("Enter your question about CUDA:")
+ if query:
+     with st.spinner('Searching for an answer...'):
+         answer, source_url = get_answer(storage, qa_system, query)
+         st.write("**Answer:**")
+         st.write(answer)
+         st.write("**Source:**")
+         st.write(source_url)
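Review note on the new module-level UI: Streamlit re-runs the whole script on every widget interaction, so storage = InMemoryStorage() is rebuilt on each rerun and the index built by the 'Crawl CUDA Documentation' button is gone by the time a question is submitted. A hedged sketch of one common fix using st.session_state (a suggestion, not part of this commit; it assumes the class definitions above are in scope):

import streamlit as st

# Keep heavy objects in st.session_state so they survive Streamlit's
# top-to-bottom rerun of app.py on every interaction.
if "storage" not in st.session_state:
    st.session_state.storage = InMemoryStorage()
    st.session_state.qa_system = QuestionAnsweringSystem()

storage = st.session_state.storage
qa_system = st.session_state.qa_system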
 
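One more note on the crawl hunks: with the [:10] cap removed, crawl now returns every page it visits, and requests.get(url) is called without a timeout, so a single unresponsive page can stall the whole crawl. A minimal sketch of a bounded fetch (the fetch helper is hypothetical, not in app.py; the delay parameter matches the crawl signature):

import time
import requests

def fetch(url: str, delay: float = 0.1, timeout: float = 10.0) -> str:
    # Pause between requests to stay polite, and bound each request so
    # one hung connection cannot block the rest of the crawl.
    time.sleep(delay)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # raise on HTTP 4xx/5xx
    return response.text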