jaywadekar committed
Commit 0b7fd0d
1 Parent(s): ba88389

Added PDFs in urls.txt

Files changed (2)
  1. rag.py +74 -29
  2. urls.txt +34 -17
rag.py CHANGED
@@ -9,7 +9,7 @@ from langchain_chroma import Chroma
  from langchain_core.output_parsers import StrOutputParser
  from langchain_core.runnables import RunnablePassthrough
  from langchain_text_splitters import RecursiveCharacterTextSplitter
- from langchain_community.document_loaders import WebBaseLoader
+ from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
  from langchain.schema import Document
  import requests
  import json
@@ -145,69 +145,77 @@ class GitHubLoader(WebBaseLoader):
          return text.strip()

      def _scrape(self, url: str, *args, **kwargs) -> str:
-         """Scrape data from URL and clean it.
-
-         Args:
-             url: The URL to scrape
-             *args: Additional positional arguments
-             **kwargs: Additional keyword arguments including bs_kwargs
-
-         Returns:
-             str: The cleaned content
-         """
          response = requests.get(url)
          response.raise_for_status()

          # For directory listings (tree URLs), use the API
          if '/tree/' in url:
-             # Parse URL components
              parts = url.replace("https://github.com/", "").split("/")
              owner = parts[0]
              repo = parts[1]
              branch = parts[3]  # usually 'main' or 'master'
              path = "/".join(parts[4:]) if len(parts) > 4 else ""
-
-             # Construct API URL
              api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={branch}"
              api_response = requests.get(api_url)
              api_response.raise_for_status()
-
-             # Parse directory listing
              contents = api_response.json()
              if isinstance(contents, list):
-                 # Format directory contents
                  files = [f"{item['name']} ({item['type']})" for item in contents]
                  return "Directory contents:\n" + "\n".join(files)
              else:
                  return f"Error: Unexpected API response for {url}"

-         # For regular files, parse HTML
          soup = BeautifulSoup(response.text, 'html.parser')

          # For README and markdown files
          readme_content = soup.find('article', class_='markdown-body')
-         if readme_content:
+         if readme_content and hasattr(readme_content, 'get_text'):
              return self.clean_text(readme_content.get_text())

          # For code files
          code_content = soup.find('table', class_='highlight')
-         if code_content:
+         if code_content and hasattr(code_content, 'get_text'):
              return self.clean_text(code_content.get_text())
-
+
          # For other content, get main content
          main_content = soup.find('main')
-         if main_content:
+         if main_content and hasattr(main_content, 'get_text'):
              return self.clean_text(main_content.get_text())

-         # Final fallback
-         return self.clean_text(soup.get_text())
+         # Final fallback: get all text from soup
+         if hasattr(soup, 'get_text'):
+             return self.clean_text(soup.get_text())
+         else:
+             return self.clean_text(str(soup))
+
+     def load(self):
+         docs = []
+         for url in self.web_paths:
+             text = self._scrape(url)
+             docs.append(Document(page_content=text, metadata={"source": url}))
+         return docs
+
+ class RawContentLoader(WebBaseLoader):
+     """Loader for raw content from GitHub (Python files, etc.)"""
+
+     def _scrape(self, url: str, *args, **kwargs) -> str:
+         response = requests.get(url)
+         response.raise_for_status()
+         return response.text
+
+     def load(self):
+         docs = []
+         for url in self.web_paths:
+             text = self._scrape(url)
+             docs.append(Document(page_content=text, metadata={"source": url}))
+         return docs

  # Load documentation from urls
  def load_docs():
      # Get urls
      urlsfile = open("urls.txt")
      urls = urlsfile.readlines()
-     urls = [url.replace("\n","") for url in urls]
+     urls = [url.replace("\n","") for url in urls if not url.strip().startswith("#") and url.strip()]
      urlsfile.close()

      # Load documents from URLs
@@ -218,17 +226,40 @@ def load_docs():
          if not url:
              continue

+         # Handle PDF files
+         if url.endswith('.pdf'):
+             print(f"Loading PDF: {url}")
+             try:
+                 loader = PyPDFLoader(url)
+                 pdf_docs = loader.load()
+                 for doc in pdf_docs:
+                     doc.metadata['source'] = url
+                 docs.extend(pdf_docs)
+             except Exception as e:
+                 print(f"Error loading PDF {url}: {str(e)}")
          # Check if URL is a Jupyter notebook
-         if url.endswith('.ipynb') and 'github.com' in url and '/blob/' in url:
+         elif url.endswith('.ipynb') and 'github.com' in url and '/blob/' in url:
              print(f"Loading notebook: {url}")
              notebook_docs = load_github_notebook(url)
              docs.extend(notebook_docs)
-         # Handle Python and Markdown files using raw content
+         # Handle raw content URLs (already in raw.githubusercontent.com format)
+         elif 'raw.githubusercontent.com' in url:
+             print(f"Loading raw content: {url}")
+             try:
+                 loader = RawContentLoader([url])
+                 web_docs = loader.load()
+                 # Preserve original URL in metadata
+                 for doc in web_docs:
+                     doc.metadata['source'] = url
+                 docs.extend(web_docs)
+             except Exception as e:
+                 print(f"Error loading {url}: {str(e)}")
+         # Handle Python and Markdown files using raw content (convert from blob to raw)
          elif url.endswith(('.py', '.md')) and 'github.com' in url and '/blob/' in url:
              print(f"Loading raw content: {url}")
              try:
                  raw_url = github_to_raw(url)
-                 loader = WebBaseLoader([raw_url])
+                 loader = RawContentLoader([raw_url])
                  web_docs = loader.load()
                  # Preserve original URL in metadata
                  for doc in web_docs:
@@ -285,11 +316,25 @@ def load_docs():
      return docs

  def extract_reference(url):
-     """Extract a reference keyword from the GitHub URL"""
+     """Extract a reference keyword from the URL for display in citations."""
+     # Handle GitHub blob URLs
      if "blob/main" in url:
          return url.split("blob/main/")[-1]
+     # Handle GitHub tree URLs
      elif "tree/main" in url:
          return url.split("tree/main/")[-1] or "root"
+     # Handle raw.githubusercontent.com URLs
+     elif "raw.githubusercontent.com" in url:
+         # Example: https://raw.githubusercontent.com/user/repo/branch/path/to/file.py
+         parts = url.split("raw.githubusercontent.com/")[-1].split("/")
+         if len(parts) > 3:
+             # Remove user, repo, branch
+             return "/".join(parts[3:])
+         else:
+             return url
+     # For arXiv PDFs and other URLs, just use the filename
+     elif url.endswith('.pdf') or url.endswith('.ipynb') or url.endswith('.py') or url.endswith('.md'):
+         return url.split("/")[-1]
      return url

  # Join content pages for processing
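
As a quick check of the new PDF branch in load_docs(), the sketch below loads one of the arXiv PDFs now listed in urls.txt directly with PyPDFLoader. This snippet is illustrative rather than part of the commit; it assumes the pypdf package is installed and that network access is available.

# Illustrative sketch: exercise the new PDF branch on an arXiv URL from urls.txt.
# Assumes the pypdf package is installed and the URL is reachable.
from langchain_community.document_loaders import PyPDFLoader

pdf_url = "https://arxiv.org/pdf/1902.10341.pdf"
loader = PyPDFLoader(pdf_url)      # downloads the PDF and parses it page by page
pdf_docs = loader.load()           # returns one Document per page
for doc in pdf_docs:
    doc.metadata['source'] = pdf_url   # mirror the metadata handling in load_docs()
print(f"Loaded {len(pdf_docs)} pages from {pdf_url}")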
urls.txt CHANGED
@@ -1,20 +1,37 @@
+ # Repository Structure
  https://github.com/JayWadekar/gwIAS-HM/tree/main
  https://github.com/JayWadekar/gwIAS-HM/tree/main/Pipeline
  https://github.com/JayWadekar/gwIAS-HM/blob/main/README.md
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/ML_modules.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/coherent_score_hm_search.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/coherent_score_mz_fast.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/coincidence_HM.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/data_operations.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/download_data.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/gw_detect_file.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/params.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/python_utils.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/ranking_HM.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/readligo.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_generator_HM.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/template_bank_params_O3a_HM.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggering_on_cluster.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/triggers_single_detector_HM.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Pipeline/utils.py
- https://github.com/JayWadekar/gwIAS-HM/blob/main/Tutorial_notebooks/4.Trig_Coin_on_cluster.ipynb
+
+ # Core Pipeline Components
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/coherent_score_hm_search.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/coherent_score_mz_fast.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/coincidence_HM.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/triggers_single_detector_HM.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/ranking_HM.py
+
+ # Data Handling
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/data_operations.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/download_data.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/readligo.py
+
+ # Template Bank Generation
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/template_bank_generator_HM.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/template_bank_params_O3a_HM.py
+
+ # Machine Learning Components
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/ML_modules.py
+
+ # Utilities and Configuration
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/params.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/python_utils.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/utils.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/gw_detect_file.py
+ https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/triggering_on_cluster.py
+
+ # Tutorials and Documentation
+ https://github.com/JayWadekar/gwIAS-HM/blob/main/Tutorial_notebooks/4.Trig_Coin_on_cluster.ipynb
+
+ # Research Papers
+ https://arxiv.org/pdf/1902.10341.pdf
+ https://arxiv.org/pdf/2405.17400v2.pdf
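
For reference, a minimal sketch of how the reorganized urls.txt interacts with the updated load_docs() filter and extract_reference(): the new "# Section" headings and blank lines are skipped rather than fetched, and a raw.githubusercontent.com entry is reduced to its in-repo path. This is a hypothetical standalone snippet, assuming urls.txt sits in the working directory.

# Hypothetical sketch: drop comment headings and blank lines, matching the
# filtering behavior of the updated load_docs() list comprehension.
with open("urls.txt") as urlsfile:
    urls = [line.strip() for line in urlsfile
            if line.strip() and not line.strip().startswith("#")]
print(f"{len(urls)} URLs to load")

# Same path reduction as the new raw.githubusercontent.com branch in extract_reference():
raw_url = "https://raw.githubusercontent.com/JayWadekar/gwIAS-HM/main/Pipeline/ranking_HM.py"
parts = raw_url.split("raw.githubusercontent.com/")[-1].split("/")
print("/".join(parts[3:]))   # -> Pipeline/ranking_HM.py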