admin committed
Commit 4ab8727 · Parent: 6ab8ddf
Files changed (2):
  1. app.py +68 -51
  2. requirements.txt +1 -3
app.py CHANGED
@@ -7,18 +7,17 @@ import pandas as pd
 from tqdm import tqdm
 from bs4 import BeautifulSoup
 
-cache_json = "cv_backbones.json"
+V_TO_SPLIT = {"IMAGENET1K_V1": "train", "IMAGENET1K_V2": "test"}
 
 
-def parse_url(url):
+def parse_url(url: str):
     response = requests.get(url)
     html = response.text
     return BeautifulSoup(html, "html.parser")
 
 
-def special_type(m_ver):
+def special_type(m_ver: str):
     m_type = re.search("[a-zA-Z]+", m_ver).group(0)
-
     if m_type == "wide" or m_type == "resnext":
         return "resnet"
 
@@ -31,7 +30,7 @@ def special_type(m_ver):
     return m_type
 
 
-def info_on_dataset(m_ver, m_type, in1k_span):
+def info_on_dataset(m_ver: str, m_type: str, in1k_span):
     url_span = in1k_span.find_next_sibling("span", {"class": "s2"})
     size_span = url_span.find_next_sibling("span", {"class": "mi"})
     m_url = str(url_span.text[1:-1])
@@ -45,94 +44,112 @@ def gen_dataframe(url="https://pytorch.org/vision/main/_modules/"):
     article = torch_page.find("article", {"id": "pytorch-article"})
     ul = article.find("ul").find("ul")
     in1k_v1, in1k_v2 = [], []
-
     for li in tqdm(ul.find_all("li"), desc="Crawling cv backbone info..."):
         name = str(li.text)
         if name.__contains__("torchvision.models.") and len(name.split(".")) == 3:
-
-            if (
-                name.__contains__("_api")
-                or name.__contains__("feature_extraction")
-                or name.__contains__("maxvit")
-            ):
+            if name.__contains__("_api") or name.__contains__("feature_extraction"):
                 continue
 
             href = li.find("a").get("href")
             model_page = parse_url(url + href)
             divs = model_page.select("div.viewcode-block")
-
             for div in divs:
                 div_id = str(div["id"])
                 if div_id.__contains__("_Weights"):
                     m_ver = div_id.split("_Weight")[0].lower()
-
-                    if m_ver.__contains__("swin_v2_"):
-                        continue
-
                     m_type = special_type(m_ver)
-
                     in1k_v1_span = div.find(
-                        name="span", attrs={"class": "n"}, string="IMAGENET1K_V1"
+                        name="span",
+                        attrs={"class": "n"},
+                        string="IMAGENET1K_V1",
                     )
-
                     if not in1k_v1_span:
                         continue
 
                     m_dict, size_span = info_on_dataset(m_ver, m_type, in1k_v1_span)
                     in1k_v1.append(m_dict)
-
                     in1k_v2_span = size_span.find_next_sibling(
-                        name="span", attrs={"class": "n"}, string="IMAGENET1K_V2"
+                        name="span",
+                        attrs={"class": "n"},
+                        string="IMAGENET1K_V2",
                     )
-
                     if in1k_v2_span:
                         m_dict, _ = info_on_dataset(m_ver, m_type, in1k_v2_span)
                         in1k_v2.append(m_dict)
 
     dataset = {"IMAGENET1K_V1": in1k_v1, "IMAGENET1K_V2": in1k_v2}
-
-    with open("IMAGENET1K_V1.jsonl", "w", encoding="utf-8") as jsonl_file:
+    with open("train.jsonl", "w", encoding="utf-8") as jsonl_file:
         for item in in1k_v1:
             jsonl_file.write(json.dumps(item) + "\n")
 
-    with open("IMAGENET1K_V2.jsonl", "w", encoding="utf-8") as jsonl_file:
+    with open("test.jsonl", "w", encoding="utf-8") as jsonl_file:
         for item in in1k_v2:
             jsonl_file.write(json.dumps(item) + "\n")
 
     return dataset
 
 
-def inference(subset):
-    cache_json = f"{subset}.jsonl"
-    if os.path.exists(cache_json):
-        with open(cache_json, "r", encoding="utf-8") as jsonl_file:
-            dataset = [json.loads(line) for line in jsonl_file]
-    else:
-        dataset = gen_dataframe()[subset]
-
-    return pd.DataFrame(dataset), cache_json
+# outer func
+def infer(subset: str):
+    status = "Success"
+    preview = out_json = None
+    try:
+        cache_json = f"{V_TO_SPLIT[subset]}.jsonl"
+        if os.path.exists(cache_json):
+            with open(cache_json, "r", encoding="utf-8") as jsonl_file:
+                dataset = [json.loads(line) for line in jsonl_file]
+
+        else:
+            dataset = gen_dataframe()[subset]
+
+        preview = pd.DataFrame(dataset)
+        out_json = cache_json
+
+    except Exception as e:
+        status = f"{e}"
+
+    return status, preview, out_json
 
 
-def sync(subset):
-    cache_json = f"{subset}.jsonl"
-    if os.path.exists(cache_json):
-        os.remove(cache_json)
-
-    return None
+# outer func
+def sync(subset: str):
+    status = "Success"
+    try:
+        cache_json = f"{V_TO_SPLIT[subset]}.jsonl"
+        if os.path.exists(cache_json):
+            os.remove(cache_json)
+
+        if os.path.exists(cache_json):
+            raise Exception(f"Failed to clean {cache_json}")
+
+    except Exception as e:
+        status = f"{e}"
+
+    return status, None
 
 
-with gr.Blocks() as demo:
-    with gr.Row():
-        subset_opt = gr.Dropdown(
-            choices=["IMAGENET1K_V1", "IMAGENET1K_V2"], value="IMAGENET1K_V1"
-        )
-        sync_btn = gr.Button("Clean cache")
-        dld_file = gr.components.File(label="Download JSON lines")
-
-    with gr.Row():
-        data_frame = gr.Dataframe(headers=["ver", "type", "input_size", "url"])
-
-    subset_opt.change(inference, inputs=subset_opt, outputs=[data_frame, dld_file])
-    sync_btn.click(sync, inputs=subset_opt, outputs=dld_file)
-
-demo.launch()
+if __name__ == "__main__":
+    with gr.Blocks() as demo:
+        with gr.Row():
+            with gr.Column():
+                subset_opt = gr.Dropdown(
+                    choices=["IMAGENET1K_V1", "IMAGENET1K_V2"],
+                    value="IMAGENET1K_V1",
+                )
+                sync_btn = gr.Button("Clean cache")
+
+            with gr.Column():
+                status_bar = gr.Textbox(label="Status", show_copy_button=True)
+                dld_file = gr.File(label="Download JSON lines")
+
+        with gr.Row():
+            data_frame = gr.Dataframe(headers=["ver", "type", "input_size", "url"])
+
+        subset_opt.change(
+            infer,
+            inputs=subset_opt,
+            outputs=[status_bar, data_frame, dld_file],
+        )
+        sync_btn.click(sync, inputs=subset_opt, outputs=[status_bar, dld_file])
+
+        demo.launch()
requirements.txt CHANGED
@@ -1,4 +1,2 @@
-pandas
-tqdm
 bs4
-requests
+pandas
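
For reference, a minimal sketch of how the reworked `infer`/`sync` entry points could be exercised outside the Gradio UI. The module name `app` and local access to pytorch.org for the first, uncached crawl are assumptions, not part of this commit.

```python
# Smoke test for the new cache behaviour (assumes app.py above is importable as
# `app`; demo.launch() is now guarded by __main__, so importing does not start the UI).
from app import infer, sync

# First call with no cache: gen_dataframe() crawls torchvision and writes train.jsonl.
status, preview, out_json = infer("IMAGENET1K_V1")
print(status, out_json, None if preview is None else preview.shape)

# Second call reads the cached train.jsonl instead of re-crawling.
status, preview, out_json = infer("IMAGENET1K_V1")

# Clean the cache; sync returns (status, None) so the download widget is also cleared.
status, _ = sync("IMAGENET1K_V1")
print(status)
```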