pankajsingh3012 committed
Commit 21bdb2c (verified) · Parent(s): bb0be27

Rename ctest.py to app.py

Files changed (1): ctest.py → app.py (renamed, +168 −168; contents identical)
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from typing import List, Tuple
from collections import deque
import time
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch  # backend required by transformers / sentence-transformers


def crawl(start_url: str, max_depth: int = 1, delay: float = 0.1) -> Tuple[List[Tuple[str, str]], List[str]]:
    """Breadth-first crawl from start_url, returning (url, page_text) pairs and the visit order."""
    visited = set()
    results = []
    queue = deque([(start_url, 0)])
    crawled_urls = []

    while queue:
        url, depth = queue.popleft()

        if depth > max_depth or url in visited:
            continue

        visited.add(url)
        crawled_urls.append(url)

        try:
            time.sleep(delay)  # rate-limit requests to be polite to the server
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract visible text and collapse runs of whitespace
            text = soup.get_text()
            text = re.sub(r'\s+', ' ', text).strip()

            results.append((url, text))

            # Enqueue links that stay within the CUDA documentation tree
            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    next_url = urljoin(url, link['href'])
                    if next_url.startswith('https://docs.nvidia.com/cuda/') and next_url not in visited:
                        queue.append((next_url, depth + 1))

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    return results, crawled_urls


def chunk_text(text: str, max_chunk_size: int = 1000) -> List[str]:
    """Split text on sentence boundaries into chunks of at most max_chunk_size characters."""
    chunks = []
    current_chunk = ""

    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(current_chunk) + len(sentence) <= max_chunk_size:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks


class InMemoryStorage:
    """Minimal in-memory vector store: parallel lists of embeddings, chunk texts, and source URLs."""

    def __init__(self):
        self.embeddings = []
        self.texts = []
        self.urls = []

    def insert(self, embeddings, texts, urls):
        self.embeddings.extend(embeddings)
        self.texts.extend(texts)
        self.urls.extend(urls)

    def search(self, query_embedding, top_k=5):
        # Rank stored chunks by dot-product similarity to the query embedding
        similarities = np.dot(self.embeddings, query_embedding)
        top_indices = np.argsort(similarities)[-top_k:][::-1]
        return [(self.texts[i], self.urls[i]) for i in top_indices]


def get_sentence_transformer():
    return SentenceTransformer('distilbert-base-nli-mean-tokens')


def insert_chunks(storage, chunks: List[str], urls: List[str]):
    """Embed the chunks and store them alongside their source URLs."""
    model = get_sentence_transformer()
    embeddings = model.encode(chunks)
    storage.insert(embeddings, chunks, urls)


def vector_search(storage, query: str, top_k: int = 5):
    """Embed the query and return the top_k most similar (text, url) pairs."""
    model = get_sentence_transformer()
    query_embedding = model.encode([query])[0]
    return storage.search(query_embedding, top_k)


class QuestionAnsweringSystem:
    """Wraps t5-small for question answering over a retrieved context."""

    def __init__(self):
        self.tokenizer = T5Tokenizer.from_pretrained("t5-small")
        self.model = T5ForConditionalGeneration.from_pretrained("t5-small")
        self.tokenizer.model_max_length = 1024
        self.model.config.max_length = 1024

    def answer_question(self, question: str, context: str) -> str:
        # T5's text-to-text QA prompt format: "question: ... context: ..."
        input_text = f"question: {question} context: {context}"
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True)

        outputs = self.model.generate(inputs.input_ids,
                                      max_length=1024,
                                      num_beams=4,
                                      early_stopping=True)
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return answer


def get_answer(storage, qa_system: QuestionAnsweringSystem, query: str) -> Tuple[str, str]:
    """Retrieve relevant chunks, join them as context, and generate an answer with its top source URL."""
    results = vector_search(storage, query)
    context = " ".join([result[0] for result in results])
    answer = qa_system.answer_question(query, context)
    source_url = results[0][1] if results else ""
    return answer, source_url


def main():
    print("CUDA Documentation QA System")

    storage = InMemoryStorage()
    qa_system = QuestionAnsweringSystem()

    print("Crawling CUDA documentation...")
    crawled_data, crawled_urls = crawl("https://docs.nvidia.com/cuda/", max_depth=1, delay=0.1)

    print("Processing and inserting data...")
    for url, text in crawled_data:
        chunks = chunk_text(text, max_chunk_size=1024)
        insert_chunks(storage, chunks, [url] * len(chunks))

    print(f"Data crawled and inserted successfully! Processed {len(crawled_data)} pages.")

    print("\nCrawled URLs:")
    for url in crawled_urls:
        print(url)

    while True:
        query = input("\nEnter your question about CUDA (or 'quit' to exit): ")

        if query.lower() == 'quit':
            break

        print("Searching for an answer...")
        answer, source_url = get_answer(storage, qa_system, query)

        print("\nAnswer:")
        print(answer)

        print("\nSource:")
        print(source_url)


if __name__ == "__main__":
    main()
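
A side note for readers of this script, not part of the commit itself: insert_chunks and vector_search each construct a fresh SentenceTransformer, so the model is reloaded for every crawled page and every query, and InMemoryStorage.search ranks by raw dot product, which only matches cosine ranking when embeddings are unit-length (encode does not normalize by default). A minimal sketch of both refinements; the _MODEL cache and normalize helper are hypothetical names, not code from this file:

    import numpy as np
    from sentence_transformers import SentenceTransformer

    _MODEL = None  # hypothetical module-level cache; not in the original script

    def get_sentence_transformer():
        # Load the embedding model once and reuse it across insert/search calls
        global _MODEL
        if _MODEL is None:
            _MODEL = SentenceTransformer('distilbert-base-nli-mean-tokens')
        return _MODEL

    def normalize(vectors):
        # Scale each embedding to unit length so dot product equals cosine similarity
        vectors = np.asarray(vectors, dtype=np.float32)
        norms = np.linalg.norm(vectors, axis=-1, keepdims=True)
        return vectors / np.clip(norms, 1e-12, None)

With embeddings normalized at insert time and the query embedding normalized at search time, the existing np.dot ranking in InMemoryStorage.search becomes a cosine-similarity search without further changes.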