silvanocerza committed on
Commit
d1577c1
·
1 Parent(s): 03e5585

Revert "Rework repositories download to avoid using subprocess"

Browse files

This reverts commit 8d47ff87a17c03ab89c37d106a2e1fc86ca063b0.

Files changed (2) hide show
  1. main.py +19 -50
  2. requirements.txt +0 -1
main.py CHANGED
@@ -1,8 +1,7 @@
1
  from typing import List, Tuple
2
  from pathlib import Path
 
3
  import os
4
- import zipfile
5
- import io
6
 
7
  from dotenv import load_dotenv
8
  from haystack.preview import Pipeline
@@ -19,75 +18,45 @@ from haystack.preview.components.writers import DocumentWriter
19
  from haystack.preview.components.file_converters import TextFileToDocument
20
  from haystack.preview.document_stores.memory import MemoryDocumentStore
21
  import streamlit as st
22
- import requests
23
 
24
  # Load the environment variables, we're going to need it for OpenAI
25
  load_dotenv()
26
 
27
  # This is the list of documentation that we're going to fetch
28
  DOCUMENTATIONS = [
29
- (
30
- "DocArray",
31
- "https://github.com/docarray/docarray",
32
- "/archive/refs/heads/main.zip",
33
- "./docs/**/*.md",
34
- ),
35
- (
36
- "Streamlit",
37
- "https://github.com/streamlit/docs",
38
- "/archive/refs/heads/main.zip",
39
- "./content/**/*.md",
40
- ),
41
- (
42
- "Jinja",
43
- "https://github.com/pallets/jinja",
44
- "/archive/refs/heads/main.zip",
45
- "./docs/**/*.rst",
46
- ),
47
- (
48
- "Pandas",
49
- "https://github.com/pandas-dev/pandas",
50
- "/archive/refs/heads/main.zip",
51
- "./docs/source/**/*.rst",
52
- ),
53
  (
54
  "Elasticsearch",
55
  "https://github.com/elastic/elasticsearch",
56
- "/archive/refs/heads/main.zip",
57
  "./docs/**/*.asciidoc",
58
  ),
59
- (
60
- "NumPy",
61
- "https://github.com/numpy/numpy",
62
- "/archive/refs/heads/main.zip",
63
- "./doc/**/*.rst",
64
- ),
65
  ]
66
 
67
 
68
  @st.cache_data(show_spinner=False)
69
  def fetch(documentations: List[Tuple[str, str, str]]):
70
  files = []
71
- docs_path = Path(__file__).parent / "downloaded_docs"
72
- for name, url, zip_path, pattern in documentations:
73
  st.write(f"Fetching {name} repository")
74
- # All projects use `main` as the default branch
75
- branch = "main"
76
- # The name of the folder depends on the name of the repository
77
- # on GitHub plus the branch zip we're downloading
78
- repo_folder = docs_path / (url.split("/")[-1] + f"-{branch}")
79
- if not repo_folder.exists():
80
- res = requests.get(f"{url}{zip_path}", stream=True)
81
- zip = zipfile.ZipFile(io.BytesIO(res.content))
82
- # The zip file contains a folder with the name of the repository
83
- # so we extract directly into the docs folder
84
- zip.extractall(docs_path)
85
-
86
- for p in repo_folder.glob(pattern):
87
  data = {
88
  "path": p,
89
  "metadata": {
90
- "url_source": f"{url}/tree/{branch}/{p.relative_to(repo_folder)}",
91
  "suffix": p.suffix,
92
  },
93
  }
 
1
  from typing import List, Tuple
2
  from pathlib import Path
3
+ import subprocess
4
  import os
 
 
5
 
6
  from dotenv import load_dotenv
7
  from haystack.preview import Pipeline
 
18
  from haystack.preview.components.file_converters import TextFileToDocument
19
  from haystack.preview.document_stores.memory import MemoryDocumentStore
20
  import streamlit as st
 
21
 
22
  # Load the environment variables, we're going to need it for OpenAI
23
  load_dotenv()
24
 
25
  # This is the list of documentation that we're going to fetch
26
  DOCUMENTATIONS = [
27
+ ("DocArray", "https://github.com/docarray/docarray", "./docs/**/*.md"),
28
+ ("Streamlit", "https://github.com/streamlit/docs", "./content/**/*.md"),
29
+ ("Jinja", "https://github.com/pallets/jinja", "./docs/**/*.rst"),
30
+ ("Pandas", "https://github.com/pandas-dev/pandas", "./docs/source/**/*.rst"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  (
32
  "Elasticsearch",
33
  "https://github.com/elastic/elasticsearch",
 
34
  "./docs/**/*.asciidoc",
35
  ),
36
+ ("NumPy", "https://github.com/numpy/numpy", "./doc/**/*.rst"),
 
 
 
 
 
37
  ]
38
 
39
 
40
  @st.cache_data(show_spinner=False)
41
  def fetch(documentations: List[Tuple[str, str, str]]):
42
  files = []
43
+ for name, url, pattern in documentations:
 
44
  st.write(f"Fetching {name} repository")
45
+ repo = Path(__file__).parent / "downloaded_docs" / name
46
+ if not repo.exists():
47
+ subprocess.run(["git", "clone", "--depth", "1", url, str(repo)], check=True)
48
+ res = subprocess.run(
49
+ ["git", "rev-parse", "--abbrev-ref", "HEAD"],
50
+ check=True,
51
+ capture_output=True,
52
+ encoding="utf-8",
53
+ )
54
+ branch = res.stdout.strip()
55
+ for p in repo.glob(pattern):
 
 
56
  data = {
57
  "path": p,
58
  "metadata": {
59
+ "url_source": f"{url}/tree/{branch}/{p.relative_to(repo)}",
60
  "suffix": p.suffix,
61
  },
62
  }
requirements.txt CHANGED
@@ -4,4 +4,3 @@ langdetect
4
  streamlit==1.27.2
5
  python-dotenv
6
  watchdog
7
- requests
 
4
  streamlit==1.27.2
5
  python-dotenv
6
  watchdog