Update app.py
app.py
CHANGED
@@ -1,3 +1,4 @@
+import streamlit as st
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
@@ -10,7 +11,6 @@ from sentence_transformers import SentenceTransformer
 from transformers import T5Tokenizer, T5ForConditionalGeneration
 import torch

-
 def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[Tuple[str, str]], List[str]]:
     visited = set()
     results = []
@@ -31,13 +31,11 @@ def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[
             response = requests.get(url)
             soup = BeautifulSoup(response.text, 'html.parser')

-
             text = soup.get_text()
             text = re.sub(r'\s+', ' ', text).strip()

             results.append((url, text))

-
             if depth < max_depth:
                 for link in soup.find_all('a', href=True):
                     next_url = urljoin(url, link['href'])
@@ -47,8 +45,7 @@ def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[
         except Exception as e:
             print(f"Error crawling {url}: {e}")

-    return results
-
+    return results, crawled_urls

 def chunk_text(text: str, max_chunk_size: int = 1000) -> List[str]:
     chunks = []
@@ -66,24 +63,27 @@ def chunk_text(text: str, max_chunk_size: int = 1000) -> List[str]:

     return chunks

-
 class InMemoryStorage:
     def __init__(self):
-        self.embeddings = []
+        self.embeddings = np.array([])
         self.texts = []
         self.urls = []

     def insert(self, embeddings, texts, urls):
-        self.embeddings.extend(embeddings)
+        if self.embeddings.size == 0:
+            self.embeddings = embeddings
+        else:
+            self.embeddings = np.vstack((self.embeddings, embeddings))
         self.texts.extend(texts)
         self.urls.extend(urls)

     def search(self, query_embedding, top_k=5):
+        if self.embeddings.size == 0:
+            return []
         similarities = np.dot(self.embeddings, query_embedding)
         top_indices = np.argsort(similarities)[-top_k:][::-1]
         return [(self.texts[i], self.urls[i]) for i in top_indices]

-
 def get_sentence_transformer():
     return SentenceTransformer('distilbert-base-nli-mean-tokens')

@@ -92,13 +92,11 @@ def insert_chunks(storage, chunks: List[str], urls: List[str]):
     embeddings = model.encode(chunks)
     storage.insert(embeddings, chunks, urls)

-
 def vector_search(storage, query: str, top_k: int = 5):
     model = get_sentence_transformer()
     query_embedding = model.encode([query])[0]
     return storage.search(query_embedding, top_k)

-
 class QuestionAnsweringSystem:
     def __init__(self):
         self.tokenizer = T5Tokenizer.from_pretrained("t5-small")
@@ -118,45 +116,38 @@ class QuestionAnsweringSystem:

         return answer

-
 def get_answer(storage, qa_system: QuestionAnsweringSystem, query: str) -> Tuple[str, str]:
     results = vector_search(storage, query)
+    if not results:
+        return "No relevant documents found.", ""
     context = " ".join([result[0] for result in results])
     answer = qa_system.answer_question(query, context)
     source_url = results[0][1] if results else ""
     return answer, source_url

-
-
+# Streamlit UI
+st.title("CUDA Documentation QA System")

-
-
-    qa_system = QuestionAnsweringSystem()
+storage = InMemoryStorage()
+qa_system = QuestionAnsweringSystem()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    st.write("**Answer:**")
-    st.write(answer)
-    st.write("**Source:**")
-    st.write(source_url)
-
-if __name__ == "__main__":
-    main()
+# Crawling and processing the data
+if st.button('Crawl CUDA Documentation'):
+    with st.spinner('Crawling CUDA documentation...'):
+        crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)
+        st.write(f"Processed {len(crawled_data)} pages.")
+
+        for url, text in crawled_data:
+            chunks = chunk_text(text, max_chunk_size=1024)
+            insert_chunks(storage, chunks, [url] * len(chunks))
+        st.success("Crawling and processing completed.")
+
+# Asking questions
+query = st.text_input("Enter your question about CUDA:")
+if query:
+    with st.spinner('Searching for an answer...'):
+        answer, source_url = get_answer(storage, qa_system, query)
+    st.write("**Answer:**")
+    st.write(answer)
+    st.write("**Source:**")
+    st.write(source_url)
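For reference, a minimal, self-contained sketch of how the updated InMemoryStorage is expected to behave, with the insert/search logic copied from this commit and small dummy NumPy vectors standing in for real SentenceTransformer embeddings (no Streamlit, crawling, or model downloads involved):

import numpy as np

# Storage logic as updated in this commit: embeddings accumulate in a single
# 2-D NumPy array instead of a Python list.
class InMemoryStorage:
    def __init__(self):
        self.embeddings = np.array([])
        self.texts = []
        self.urls = []

    def insert(self, embeddings, texts, urls):
        # First batch replaces the empty array; later batches are stacked row-wise.
        if self.embeddings.size == 0:
            self.embeddings = embeddings
        else:
            self.embeddings = np.vstack((self.embeddings, embeddings))
        self.texts.extend(texts)
        self.urls.extend(urls)

    def search(self, query_embedding, top_k=5):
        # Guard added in this commit: an empty store returns no hits instead of failing.
        if self.embeddings.size == 0:
            return []
        similarities = np.dot(self.embeddings, query_embedding)
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [(self.texts[i], self.urls[i]) for i in top_indices]

# Dummy 3-dimensional "embeddings" in place of real model output (illustrative values only).
storage = InMemoryStorage()
print(storage.search(np.ones(3)))  # [] -- searching before any insert is now safe

storage.insert(np.eye(3), ["chunk a", "chunk b", "chunk c"], ["url1", "url2", "url3"])
storage.insert(np.array([[0.0, 0.9, 0.1]]), ["chunk d"], ["url4"])

# Highest dot-product similarity to the query wins.
print(storage.search(np.array([0.0, 1.0, 0.0]), top_k=2))
# -> [('chunk b', 'url2'), ('chunk d', 'url4')]

The empty-store guard pairs with the new "if not results" check in get_answer, so asking a question before crawling shows "No relevant documents found." rather than crashing the app.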