import os
import re
import json

import gradio as gr
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def parse_url(url):
    """Fetch a page and return it as a parsed BeautifulSoup tree."""
    response = requests.get(url)
    return BeautifulSoup(response.text, 'html.parser')


def special_type(m_ver):
    """Map a model version string to its backbone family name."""
    m_type = re.search('[a-zA-Z]+', m_ver).group(0)
    if m_type in ('wide', 'resnext'):
        return 'resnet'
    elif m_type == 'swin':
        return 'swin_transformer'
    elif m_type == 'inception':
        return 'googlenet'
    return m_type


def info_on_dataset(m_ver, m_type, in1k_span):
    """Extract the weight URL and input size that follow an IMAGENET1K_* span."""
    url_span = in1k_span.find_next_sibling('span', {'class': 's2'})
    size_span = url_span.find_next_sibling('span', {'class': 'mi'})
    m_url = str(url_span.text[1:-1])  # strip the surrounding quotes
    input_size = int(size_span.text)
    m_dict = {
        'ver': m_ver,
        'type': m_type,
        'input_size': input_size,
        'url': m_url
    }
    return m_dict, size_span


def gen_dataframe(url='https://pytorch.org/vision/main/_modules/'):
    """Crawl the torchvision source index, collect backbone metadata for the
    IMAGENET1K_V1 / IMAGENET1K_V2 weights, and cache each subset as JSONL."""
    torch_page = parse_url(url)
    article = torch_page.find('article', {'id': 'pytorch-article'})
    ul = article.find('ul').find('ul')
    in1k_v1, in1k_v2 = [], []

    for li in tqdm(ul.find_all('li'), desc='Crawling cv backbone info...'):
        name = str(li.text)
        if 'torchvision.models.' in name and len(name.split('.')) == 3:
            # Skip non-model modules and models without usable weight pages.
            if ('_api' in name
                    or 'feature_extraction' in name
                    or 'maxvit' in name):
                continue

            href = li.find('a').get('href')
            model_page = parse_url(url + href)
            divs = model_page.select('div.viewcode-block')
            for div in divs:
                div_id = str(div['id'])
                if '_Weights' not in div_id:
                    continue

                m_ver = div_id.split('_Weight')[0].lower()
                if 'swin_v2_' in m_ver:
                    continue

                m_type = special_type(m_ver)
                in1k_v1_span = div.find(
                    name='span',
                    attrs={'class': 'n'},
                    string='IMAGENET1K_V1'
                )
                if not in1k_v1_span:
                    continue

                m_dict, size_span = info_on_dataset(m_ver, m_type, in1k_v1_span)
                in1k_v1.append(m_dict)

                in1k_v2_span = size_span.find_next_sibling(
                    name='span',
                    attrs={'class': 'n'},
                    string='IMAGENET1K_V2'
                )
                if in1k_v2_span:
                    m_dict, _ = info_on_dataset(m_ver, m_type, in1k_v2_span)
                    in1k_v2.append(m_dict)

    dataset = {
        'IMAGENET1K_V1': in1k_v1,
        'IMAGENET1K_V2': in1k_v2
    }

    # Cache each subset as a JSON-lines file next to the script.
    for subset, items in dataset.items():
        with open(f'{subset}.jsonl', 'w', encoding='utf-8') as jsonl_file:
            for item in items:
                jsonl_file.write(json.dumps(item) + '\n')

    return dataset


def inference(subset):
    """Load the cached JSONL for a subset, re-crawling if the cache is missing.
    Returns the data as a DataFrame plus the cache path for download."""
    cache_json = f'{subset}.jsonl'
    if os.path.exists(cache_json):
        with open(cache_json, 'r', encoding='utf-8') as jsonl_file:
            dataset = [json.loads(line) for line in jsonl_file]
    else:
        dataset = gen_dataframe()[subset]

    return pd.DataFrame(dataset), cache_json


def sync(subset):
    """Delete the cached JSONL so the next inference call re-crawls."""
    cache_json = f'{subset}.jsonl'
    if os.path.exists(cache_json):
        os.remove(cache_json)

    return None


with gr.Blocks() as demo:
    with gr.Row():
        subset_opt = gr.Dropdown(
            choices=['IMAGENET1K_V1', 'IMAGENET1K_V2'],
            value='IMAGENET1K_V1'
        )
        sync_btn = gr.Button("Clean cache")
        dld_file = gr.components.File(label="Download JSON lines")

    with gr.Row():
        data_frame = gr.Dataframe(
            headers=["ver", "type", "input_size", "url"]
        )

    # Selecting a subset refreshes the table and the downloadable JSONL file.
    subset_opt.change(
        inference,
        inputs=subset_opt,
        outputs=[data_frame, dld_file]
    )
    # "Clean cache" removes the cached JSONL and clears the download widget.
    sync_btn.click(
        sync,
        inputs=subset_opt,
        outputs=dld_file
    )

demo.launch(share=True)