# cv_backbones/app.py
import os
import re
import json
import requests
import gradio as gr
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
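# Map each torchvision weight tag to the split name used for its cached JSONL file.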
V_TO_SPLIT = {"IMAGENET1K_V1": "train", "IMAGENET1K_V2": "test"}
def parse_url(url: str):
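    """Fetch ``url`` and return the page parsed as a BeautifulSoup tree."""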
response = requests.get(url)
html = response.text
return BeautifulSoup(html, "html.parser")
def special_type(m_ver: str):
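    """Map a lowercased weight-enum name to its torchvision model family,
    e.g. ``wide_resnet50_2`` -> ``resnet``, ``swin_t`` -> ``swin_transformer``."""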
m_type = re.search("[a-zA-Z]+", m_ver).group(0)
    if m_type in ("wide", "resnext"):
return "resnet"
elif m_type == "swin":
return "swin_transformer"
elif m_type == "inception":
return "googlenet"
return m_type
def info_on_dataset(m_ver: str, m_type: str, in1k_span):
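    """Read the checkpoint URL and input size that follow an IMAGENET1K
    weight span; return the record plus the size span, which serves as
    the cursor when searching for the next weight version."""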
url_span = in1k_span.find_next_sibling("span", {"class": "s2"})
size_span = url_span.find_next_sibling("span", {"class": "mi"})
m_url = str(url_span.text[1:-1])
input_size = int(size_span.text)
m_dict = {"ver": m_ver, "type": m_type, "input_size": input_size, "url": m_url}
return m_dict, size_span
def gen_dataframe(url="https://pytorch.org/vision/main/_modules/"):
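    """Crawl the torchvision model sources, collecting the checkpoint URL
    and input size of every ``*_Weights`` enum for IMAGENET1K_V1/V2;
    cache both splits as JSONL files and return them keyed by version tag."""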
torch_page = parse_url(url)
article = torch_page.find("article", {"id": "pytorch-article"})
ul = article.find("ul").find("ul")
in1k_v1, in1k_v2 = [], []
for li in tqdm(ul.find_all("li"), desc="Crawling cv backbone info..."):
name = str(li.text)
        if "torchvision.models." in name and len(name.split(".")) == 3:
            if "_api" in name or "feature_extraction" in name:
continue
href = li.find("a").get("href")
model_page = parse_url(url + href)
divs = model_page.select("div.viewcode-block")
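            # Each ``viewcode-block`` div wraps one object in the rendered
            # source page; the ``*_Weights`` enums carry the checkpoint info.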
for div in divs:
div_id = str(div["id"])
                if "_Weights" in div_id:
                    m_ver = div_id.split("_Weights")[0].lower()
m_type = special_type(m_ver)
in1k_v1_span = div.find(
name="span",
attrs={"class": "n"},
string="IMAGENET1K_V1",
)
if not in1k_v1_span:
continue
m_dict, size_span = info_on_dataset(m_ver, m_type, in1k_v1_span)
in1k_v1.append(m_dict)
in1k_v2_span = size_span.find_next_sibling(
name="span",
attrs={"class": "n"},
string="IMAGENET1K_V2",
)
if in1k_v2_span:
m_dict, _ = info_on_dataset(m_ver, m_type, in1k_v2_span)
in1k_v2.append(m_dict)
dataset = {"IMAGENET1K_V1": in1k_v1, "IMAGENET1K_V2": in1k_v2}
with open("train.jsonl", "w", encoding="utf-8") as jsonl_file:
for item in in1k_v1:
jsonl_file.write(json.dumps(item) + "\n")
with open("test.jsonl", "w", encoding="utf-8") as jsonl_file:
for item in in1k_v2:
jsonl_file.write(json.dumps(item) + "\n")
return dataset
# Gradio callback wired to the dropdown's change event.
def infer(subset: str):
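    """Build (status, preview dataframe, JSONL path) for the chosen
    weights version, crawling only when no cached file exists."""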
status = "Success"
    preview = out_json = None
try:
cache_json = f"{V_TO_SPLIT[subset]}.jsonl"
if os.path.exists(cache_json):
with open(cache_json, "r", encoding="utf-8") as jsonl_file:
dataset = [json.loads(line) for line in jsonl_file]
else:
dataset = gen_dataframe()[subset]
        preview = pd.DataFrame(dataset)
out_json = cache_json
except Exception as e:
status = f"{e}"
    return status, preview, out_json
# Gradio callback wired to the "Clean cache" button.
def sync(subset: str):
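    """Delete the cached JSONL for the chosen weights version so the
    next request triggers a fresh crawl."""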
status = "Success"
try:
cache_json = f"{V_TO_SPLIT[subset]}.jsonl"
if os.path.exists(cache_json):
os.remove(cache_json)
if os.path.exists(cache_json):
            raise RuntimeError(f"Failed to clean {cache_json}")
except Exception as e:
status = f"{e}"
return status, None
if __name__ == "__main__":
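    # Gradio UI: choose a weights version to preview/download its JSONL,
    # or clean the cache to force a fresh crawl on the next selection.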
with gr.Blocks() as demo:
with gr.Row():
with gr.Column():
subset_opt = gr.Dropdown(
label="ImageNet version",
choices=["IMAGENET1K_V1", "IMAGENET1K_V2"],
value="IMAGENET1K_V1",
)
sync_btn = gr.Button("Clean cache")
with gr.Column():
status_bar = gr.Textbox(label="Status", show_copy_button=True)
dld_file = gr.File(label="Download JSON lines")
with gr.Row():
data_frame = gr.Dataframe(label="Preview")
subset_opt.change(
infer,
inputs=subset_opt,
outputs=[status_bar, data_frame, dld_file],
)
sync_btn.click(sync, inputs=subset_opt, outputs=[status_bar, dld_file])
demo.launch()