Spaces:

monetjoe
/

cv_backbones

Running

File size: 4,612 Bytes

import os
import re
import json
import requests
import gradio as gr
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

# NOTE(review): nothing in the visible code reads this module-level value;
# the per-subset cache paths are built separately as '<subset>.jsonl'.
# Confirm it is unused elsewhere before removing it.
cache_json = 'cv_backbones.json'


def parse_url(url, timeout=30):
    """Download ``url`` and return its body parsed as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled host,
            which would hang the whole crawl.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')


def special_type(m_ver):
    """Map a model version name to the backbone family it is filed under.

    The family is the first run of ASCII letters found in ``m_ver``; a few
    prefixes are folded into another family name (for example ``wide`` and
    ``resnext`` both become ``resnet``).

    Args:
        m_ver: Model version name, e.g. ``'wide_resnet50_2'``.

    Returns:
        The family name. If ``m_ver`` contains no ASCII letters at all it is
        returned unchanged.
    """
    aliases = {
        'wide': 'resnet',
        'resnext': 'resnet',
        'swin': 'swin_transformer',
        'inception': 'googlenet',
    }

    match = re.search(r'[a-zA-Z]+', m_ver)
    if match is None:
        # Nothing alphabetic to classify: hand the name back instead of
        # failing on ``None.group``.
        return m_ver

    m_type = match.group(0)
    return aliases.get(m_type, m_type)


def info_on_dataset(m_ver, m_type, in1k_span):
    """Build the record for one weights entry from the spans that follow it.

    Starting at ``in1k_span``, the next sibling ``span`` with class ``s2``
    supplies the URL text and the next sibling ``span`` with class ``mi``
    after that supplies the input size.

    Args:
        m_ver: Model version name stored under ``'ver'``.
        m_type: Model family name stored under ``'type'``.
        in1k_span: Element exposing ``find_next_sibling``; the search for
            the URL span starts from here.

    Returns:
        A ``(record, size_span)`` tuple: the record dict with keys ``ver``,
        ``type``, ``input_size`` and ``url``, and the span the size was read
        from (so a caller can continue searching after it).
    """
    link_node = in1k_span.find_next_sibling('span', {'class': 's2'})
    resolution_node = link_node.find_next_sibling('span', {'class': 'mi'})

    # Drop the first and last character of the span text -- presumably the
    # quote marks around the URL string literal (TODO confirm).
    weights_url = str(link_node.text[1:-1])
    resolution = int(resolution_node.text)

    record = dict(
        ver=m_ver,
        type=m_type,
        input_size=resolution,
        url=weights_url,
    )
    return record, resolution_node


def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as JSON Lines, one object per line."""
    with open(path, 'w', encoding='utf-8') as jsonl_file:
        jsonl_file.writelines(json.dumps(item) + '\n' for item in records)


def gen_dataframe(url='https://pytorch.org/vision/main/_modules/'):
    """Crawl the module index at ``url`` and collect ImageNet-1K weight info.

    The index page is scanned for ``torchvision.models.<module>`` entries.
    Each linked module page is searched for code blocks whose id contains
    ``_Weights``; for every such block that has an ``IMAGENET1K_V1`` entry
    the weight URL and input size are recorded, plus the ``IMAGENET1K_V2``
    entry when one follows it.

    Side effects:
        Overwrites ``IMAGENET1K_V1.jsonl`` and ``IMAGENET1K_V2.jsonl`` in the
        current working directory with the collected records.

    Args:
        url: Base address of the module index; link targets found on the
            index page are appended to it.

    Returns:
        A dict (not a DataFrame, despite the function name) with keys
        ``'IMAGENET1K_V1'`` and ``'IMAGENET1K_V2'``, each mapping to a list
        of record dicts with keys ``ver``, ``type``, ``input_size``, ``url``.
    """
    torch_page = parse_url(url)
    article = torch_page.find('article', {'id': 'pytorch-article'})
    ul = article.find('ul').find('ul')
    in1k_v1, in1k_v2 = [], []

    # Module names containing any of these fragments are not crawled.
    skipped_fragments = ('_api', 'feature_extraction', 'maxvit')

    for li in tqdm(ul.find_all('li'), desc='Crawling cv backbone info...'):
        name = str(li.text)

        # Keep only direct children of torchvision.models (three dotted parts).
        if 'torchvision.models.' not in name or len(name.split('.')) != 3:
            continue

        if any(fragment in name for fragment in skipped_fragments):
            continue

        # An entry without a usable link cannot be crawled; skip it rather
        # than failing on ``None``.
        anchor = li.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        model_page = parse_url(url + href)

        for div in model_page.select('div.viewcode-block'):
            div_id = str(div.get('id', ''))
            if '_Weights' not in div_id:
                continue

            m_ver = div_id.split('_Weight')[0].lower()
            if 'swin_v2_' in m_ver:
                continue

            m_type = special_type(m_ver)

            in1k_v1_span = div.find(
                name='span',
                attrs={'class': 'n'},
                string='IMAGENET1K_V1'
            )
            if not in1k_v1_span:
                continue

            m_dict, size_span = info_on_dataset(m_ver, m_type, in1k_v1_span)
            in1k_v1.append(m_dict)

            # A V2 entry, when present, is looked up among the siblings that
            # come after the V1 entry's size span.
            in1k_v2_span = size_span.find_next_sibling(
                name='span',
                attrs={'class': 'n'},
                string='IMAGENET1K_V2'
            )
            if in1k_v2_span:
                m_dict, _ = info_on_dataset(m_ver, m_type, in1k_v2_span)
                in1k_v2.append(m_dict)

    dataset = {
        'IMAGENET1K_V1': in1k_v1,
        'IMAGENET1K_V2': in1k_v2
    }

    _write_jsonl('IMAGENET1K_V1.jsonl', in1k_v1)
    _write_jsonl('IMAGENET1K_V2.jsonl', in1k_v2)

    return dataset


def inference(subset):
    """Return the backbone table for ``subset`` and the path of its cache file.

    The records are read from ``'<subset>.jsonl'`` when that file exists;
    otherwise ``gen_dataframe()`` is called and its entry for ``subset`` is
    used.

    Args:
        subset: Subset name, used both as the cache file stem and as the key
            into the dict returned by ``gen_dataframe``.

    Returns:
        A ``(DataFrame, cache_path)`` tuple.
    """
    cache_path = f'{subset}.jsonl'

    try:
        # Open directly instead of checking existence first, so a cache file
        # removed in between cannot cause a failure here.
        with open(cache_path, 'r', encoding='utf-8') as jsonl_file:
            # Blank lines (e.g. a trailing newline left by an editor) are not
            # records and would make json.loads raise.
            dataset = [
                json.loads(line) for line in jsonl_file if line.strip()
            ]
    except FileNotFoundError:
        dataset = gen_dataframe()[subset]

    return pd.DataFrame(dataset), cache_path


def sync(subset):
    """Delete the cached ``'<subset>.jsonl'`` file if it exists.

    Args:
        subset: Subset name whose cache file should be removed.

    Returns:
        Always ``None``.
    """
    try:
        # Remove directly rather than checking existence first: the file may
        # disappear between the check and the removal.
        os.remove(f'{subset}.jsonl')
    except FileNotFoundError:
        pass  # Already absent -- nothing to clean.

    return None


# Gradio front end: a subset picker, a cache-reset button, a download slot
# and a table showing the crawled records.
with gr.Blocks() as demo:
    # Controls row.
    with gr.Row():
        subset_opt = gr.Dropdown(
            value='IMAGENET1K_V1',
            choices=['IMAGENET1K_V1', 'IMAGENET1K_V2'],
        )
        sync_btn = gr.Button("Clean cache")
        dld_file = gr.components.File(label="Download JSON lines")

    # Results row.
    with gr.Row():
        data_frame = gr.Dataframe(headers=["ver", "type", "input_size", "url"])

    # Changing the dropdown value runs `inference`, which fills both the
    # table and the download slot.
    subset_opt.change(
        fn=inference,
        inputs=subset_opt,
        outputs=[data_frame, dld_file],
    )

    # The button runs `sync` for the selected subset; its None result is
    # sent to the download slot.
    sync_btn.click(fn=sync, inputs=subset_opt, outputs=dld_file)

# Runs at import time: the app is launched as soon as this module executes.
demo.launch(share=True)