Rename ctest.py to app.py
ctest.py → app.py  RENAMED  (+168 -168; the file content is unchanged by the rename)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from typing import List, Tuple
from collections import deque
import time
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch


def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[Tuple[str, str]], List[str]]:
    # Breadth-first crawl from start_url; only links under https://docs.nvidia.com/cuda/ are followed.
    # Returns (url, page_text) pairs plus the list of URLs that were visited.
    visited = set()
    results = []
    queue = deque([(start_url, 0)])
    crawled_urls = []

    while queue:
        url, depth = queue.popleft()

        if depth > max_depth or url in visited:
            continue

        visited.add(url)
        crawled_urls.append(url)

        try:
            time.sleep(delay)
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Collapse all whitespace in the page text to single spaces.
            text = soup.get_text()
            text = re.sub(r'\s+', ' ', text).strip()

            results.append((url, text))

            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    next_url = urljoin(url, link['href'])
                    if next_url.startswith('https://docs.nvidia.com/cuda/') and next_url not in visited:
                        queue.append((next_url, depth + 1))

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    return results, crawled_urls


def chunk_text(text: str, max_chunk_size: int = 1000) -> List[str]:
    # Greedy sentence packing: split on sentence boundaries and fill chunks
    # of roughly max_chunk_size characters.
    chunks = []
    current_chunk = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


class InMemoryStorage:
    # Parallel lists holding chunk embeddings, chunk texts and their source URLs.
    def __init__(self):
        self.embeddings = []
        self.texts = []
        self.urls = []

    def insert(self, embeddings, texts, urls):
        self.embeddings.extend(embeddings)
        self.texts.extend(texts)
        self.urls.extend(urls)

    def search(self, query_embedding, top_k=5):
        # Rank stored chunks by dot-product similarity with the query embedding.
        similarities = np.dot(self.embeddings, query_embedding)
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [(self.texts[i], self.urls[i]) for i in top_indices]


def get_sentence_transformer():
    return SentenceTransformer('distilbert-base-nli-mean-tokens')


def insert_chunks(storage, chunks: List[str], urls: List[str]):
    model = get_sentence_transformer()
    embeddings = model.encode(chunks)
    storage.insert(embeddings, chunks, urls)


def vector_search(storage, query: str, top_k: int = 5):
    model = get_sentence_transformer()
    query_embedding = model.encode([query])[0]
    return storage.search(query_embedding, top_k)


class QuestionAnsweringSystem:
    # t5-small generates an answer from a "question: ... context: ..." prompt.
    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")
        self.model = T5ForConditionalGeneration.from_pretrained("t5-small")
        self.tokenizer.model_max_length = 1024
        self.model.config.max_length = 1024

    def answer_question(self, question: str, context: str) -> str:
        input_text = f"question: {question} context: {context}"
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

        outputs = self.model.generate(inputs.input_ids,
                                      max_length=1024,
                                      num_beams=4,
                                      early_stopping=True)
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return answer


def get_answer(storage, qa_system: QuestionAnsweringSystem, query: str) -> Tuple[str, str]:
    # Retrieve the top chunks, join them into one context string, generate an answer,
    # and report the URL of the best-matching chunk as the source.
    results = vector_search(storage, query)
    context = " ".join([result[0] for result in results])
    answer = qa_system.answer_question(query, context)
    source_url = results[0][1] if results else ""
    return answer, source_url


def main():
    # Interactive console loop: crawl, index, then answer questions until the user types 'quit'.
    print("CUDA Documentation QA System")

    storage = InMemoryStorage()
    qa_system = QuestionAnsweringSystem()

    print("Crawling CUDA documentation...")
    crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)

    print("Processing and inserting data...")
    for url, text in crawled_data:
        chunks = chunk_text(text, max_chunk_size=1024)
        insert_chunks(storage, chunks, [url] * len(chunks))

    print(f"Data crawled and inserted successfully! Processed {len(crawled_data)} pages.")

    print("\nCrawled URLs:")
    for url in crawled_urls:
        print(url)

    while True:
        query = input("\nEnter your question about CUDA (or 'quit' to exit): ")

        if query.lower() == 'quit':
            break

        print("Searching for an answer...")
        answer, source_url = get_answer(storage, qa_system, query)

        print("\nAnswer:")
        print(answer)

        print("\nSource:")
        print(source_url)


if __name__ == "__main__":
    main()
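
For a quick check of the retrieval path without running the crawler, a minimal sketch along these lines should work. It is not part of the commit: it assumes the renamed file is importable as app (run from the same directory) and uses a placeholder source URL purely for illustration.

# Minimal sketch (assumption: app.py is on the import path).
from app import InMemoryStorage, insert_chunks, vector_search

storage = InMemoryStorage()
snippets = [
    "CUDA kernels are launched with an execution configuration of blocks and threads.",
    "cudaMemcpy copies data between host and device memory.",
]
# Placeholder URL; in app.py the crawler supplies the real page URLs.
insert_chunks(storage, snippets, ["https://docs.nvidia.com/cuda/"] * len(snippets))

for text, url in vector_search(storage, "How do I launch a kernel?", top_k=1):
    print(url, "->", text)

Note that insert_chunks and vector_search each construct the SentenceTransformer anew, and search ranks by raw dot product rather than cosine similarity; both are workable for a small demo but worth keeping in mind when reading the results.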