# Hugging Face Space: monetjoe/cv_backbones (status: Running)
# File size: 4,905 Bytes

import os
import re
import json
import requests
import gradio as gr
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

# Maps each ImageNet weight version to the split name that is used as the
# base name of its cached JSON-lines file ("<split>.jsonl").
V_TO_SPLIT = {
    "IMAGENET1K_V1": "train",
    "IMAGENET1K_V2": "test",
}


def parse_url(url: str, timeout: float = 30.0):
    """Download a web page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled server.

    Returns:
        A ``BeautifulSoup`` object built from the response body using the
        ``html.parser`` backend.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error rather than parsing an error page and
    # crashing later on a missing element.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def special_type(m_ver: str):
    """Return the backbone family name for a model version string.

    The family is the first run of ASCII letters found in ``m_ver``. A few
    prefixes are remapped to another family name; every other prefix is
    returned unchanged.

    Args:
        m_ver: Model version name, e.g. ``"wide_resnet50_2"``.

    Returns:
        The family name as a string.

    Raises:
        AttributeError: If ``m_ver`` contains no ASCII letter (the regex
            search then yields ``None``).
    """
    # Prefixes whose family name differs from the prefix itself.
    aliases = {
        "wide": "resnet",
        "resnext": "resnet",
        "swin": "swin_transformer",
        "inception": "googlenet",
    }
    prefix = re.search("[a-zA-Z]+", m_ver).group(0)
    return aliases.get(prefix, prefix)


def info_on_dataset(m_ver: str, m_type: str, in1k_span):
    """Build the record for one weight entry located after ``in1k_span``.

    Starting from ``in1k_span``, the next sibling ``<span class="s2">`` is
    read as the weight URL and the ``<span class="mi">`` sibling after that
    as the input size.

    Args:
        m_ver: Model version name stored under ``"ver"``.
        m_type: Backbone family name stored under ``"type"``.
        in1k_span: Element from which the sibling search starts.

    Returns:
        A ``(record, size_span)`` tuple: ``record`` is a dict with the keys
        ``"ver"``, ``"type"``, ``"input_size"`` and ``"url"``; ``size_span``
        is the element the input size was read from, so the caller can keep
        searching after it.
    """
    link_node = in1k_span.find_next_sibling(name="span", attrs={"class": "s2"})
    size_node = link_node.find_next_sibling(name="span", attrs={"class": "mi"})

    # Drop the first and last character of the token text (presumably the
    # quotes of a highlighted string literal -- confirm against the page).
    weight_url = str(link_node.text[1:-1])
    resolution = int(size_node.text)

    record = {
        "ver": m_ver,
        "type": m_type,
        "input_size": resolution,
        "url": weight_url,
    }
    return record, size_node


def _write_jsonl(path: str, items):
    """Write ``items`` to ``path`` as one JSON document per line."""
    with open(path, "w", encoding="utf-8") as jsonl_file:
        jsonl_file.writelines(json.dumps(item) + "\n" for item in items)


def gen_dataframe(url="https://pytorch.org/vision/main/_modules/"):
    """Crawl the module index at ``url`` and collect pretrained-weight info.

    Every list entry whose text is a three-part dotted name containing
    ``torchvision.models.`` (except ``_api`` and ``feature_extraction``
    entries) is followed to its source page. On that page each
    ``div.viewcode-block`` whose id contains ``_Weights`` is scanned for an
    ``IMAGENET1K_V1`` entry and, after it, an optional ``IMAGENET1K_V2``
    entry.

    Side effects:
        Writes the V1 records to ``train.jsonl`` and the V2 records to
        ``test.jsonl`` in the current working directory.

    Args:
        url: Base address of the module index; hrefs found on the index page
            are appended to it.

    Returns:
        A dict (not a DataFrame, despite the function name) with the keys
        ``"IMAGENET1K_V1"`` and ``"IMAGENET1K_V2"``, each mapping to a list
        of record dicts.

    Raises:
        RuntimeError: If the index page does not contain the expected
            article / nested list structure.
    """
    torch_page = parse_url(url)
    article = torch_page.find("article", {"id": "pytorch-article"})
    outer_ul = article.find("ul") if article is not None else None
    ul = outer_ul.find("ul") if outer_ul is not None else None
    if ul is None:
        # Report a layout change explicitly instead of failing with
        # "'NoneType' object has no attribute 'find'".
        raise RuntimeError(f"Unexpected page layout at {url}: module list not found")

    in1k_v1, in1k_v2 = [], []
    for li in tqdm(ul.find_all("li"), desc="Crawling cv backbone info..."):
        name = str(li.text)
        if "torchvision.models." not in name or len(name.split(".")) != 3:
            continue

        if "_api" in name or "feature_extraction" in name:
            continue

        anchor = li.find("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # An entry without a link has no source page to crawl.
            continue

        model_page = parse_url(url + href)
        for div in model_page.select("div.viewcode-block"):
            div_id = str(div.get("id", ""))
            if "_Weights" not in div_id:
                continue

            m_ver = div_id.split("_Weight")[0].lower()
            m_type = special_type(m_ver)
            in1k_v1_span = div.find(
                name="span",
                attrs={"class": "n"},
                string="IMAGENET1K_V1",
            )
            if in1k_v1_span is None:
                continue

            m_dict, size_span = info_on_dataset(m_ver, m_type, in1k_v1_span)
            in1k_v1.append(m_dict)
            # The V2 entry, when present, is searched for after the V1 size
            # token among the same siblings.
            in1k_v2_span = size_span.find_next_sibling(
                name="span",
                attrs={"class": "n"},
                string="IMAGENET1K_V2",
            )
            if in1k_v2_span is not None:
                m_dict, _ = info_on_dataset(m_ver, m_type, in1k_v2_span)
                in1k_v2.append(m_dict)

    dataset = {"IMAGENET1K_V1": in1k_v1, "IMAGENET1K_V2": in1k_v2}
    # Files are written only after the whole crawl succeeded, so a failed
    # crawl never leaves a partial cache behind.
    _write_jsonl("train.jsonl", in1k_v1)
    _write_jsonl("test.jsonl", in1k_v2)
    return dataset


# outer func
def infer(subset: str):
    """Load the records of ``subset`` from cache, crawling them if needed.

    The cache file is ``"<split>.jsonl"`` where the split name comes from
    ``V_TO_SPLIT``. If the file exists it is read line by line; otherwise
    ``gen_dataframe()`` is called and its entry for ``subset`` is used.

    Args:
        subset: Key of ``V_TO_SPLIT`` selecting the ImageNet weight version.

    Returns:
        A ``(status, preview, out_json)`` tuple. On success ``status`` is
        ``"Success"``, ``preview`` is a ``pandas.DataFrame`` of the records
        and ``out_json`` is the cache file path. On any exception ``status``
        is the exception text and the other two values are ``None``.
    """
    try:
        jsonl_path = f"{V_TO_SPLIT[subset]}.jsonl"
        if not os.path.exists(jsonl_path):
            records = gen_dataframe()[subset]

        else:
            with open(jsonl_path, "r", encoding="utf-8") as handle:
                records = [json.loads(row) for row in handle]

        table = pd.DataFrame(records)

    except Exception as err:
        # Errors are reported through the status value instead of raised.
        return str(err), None, None

    return "Success", table, jsonl_path


# outer func
def sync(subset: str):
    """Delete the cached JSON-lines file that belongs to ``subset``.

    Args:
        subset: Key of ``V_TO_SPLIT`` selecting which cache file to remove.

    Returns:
        A ``(status, None)`` tuple. ``status`` is ``"Success"`` when the file
        is absent after the call, otherwise the text of the exception that
        occurred. The second element is always ``None``.
    """
    try:
        target = f"{V_TO_SPLIT[subset]}.jsonl"
        if os.path.exists(target):
            os.remove(target)

        # Verify the removal actually took effect.
        still_present = os.path.exists(target)
        if still_present:
            raise Exception(f"Failed to clean {target}")

    except Exception as err:
        return str(err), None

    return "Success", None


# Build and launch the Gradio interface only when run as a script.
if __name__ == "__main__":
    with gr.Blocks() as demo:
        with gr.Row():
            # First column: weight-version selector and cache-clear button.
            with gr.Column():
                subset_opt = gr.Dropdown(
                    label="ImageNet version",
                    choices=["IMAGENET1K_V1", "IMAGENET1K_V2"],
                    value="IMAGENET1K_V1",
                )
                sync_btn = gr.Button("Clean cache")

            # Second column: status text and the downloadable JSON-lines file.
            with gr.Column():
                status_bar = gr.Textbox(label="Status", show_copy_button=True)
                dld_file = gr.File(label="Download JSON lines")

        # Separate row holding the table preview of the selected records.
        with gr.Row():
            data_frame = gr.Dataframe(label="Preview")

        # Changing the dropdown runs infer(); its three return values fill
        # the status box, the preview table and the file component.
        subset_opt.change(
            infer,
            inputs=subset_opt,
            outputs=[status_bar, data_frame, dld_file],
        )
        # Clicking the button runs sync() for the selected version; it
        # returns a status and None for the file component. The preview
        # table is not among the outputs of this handler.
        sync_btn.click(sync, inputs=subset_opt, outputs=[status_bar, dld_file])

    demo.launch()