Spaces:

mikeee
/

radiobee-aligner

Build error

File size: 17,754 Bytes

"""Run interactively."""
# pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, wrong-import-position, too-many-locals, too-many-statements
from typing import Tuple  # , Optional

import sys
from pathlib import Path
import signal
from random import randint
from textwrap import dedent
from itertools import zip_longest
from socket import socket, AF_INET, SOCK_STREAM

from sklearn.cluster import DBSCAN  # noqa
import joblib
from varname import nameof
from logzero import logger

# import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt  # noqa

# from tabulate import tabulate
from fastlid import fastlid

if "." not in sys.path:
    sys.path.insert(0, ".")
import gradio as gr
from radiobee.process_upload import process_upload
from radiobee.files2df import files2df
from radiobee.file2text import file2text
from radiobee.lists2cmat import lists2cmat
from radiobee.gen_pset import gen_pset
from radiobee.gen_aset import gen_aset
from radiobee.align_texts import align_texts

from radiobee.cmat2tset import cmat2tset

# from radiobee.plot_df import plot_df
# from radiobee.plot_cmat import plot_cmat
from radiobee.trim_df import trim_df

sns.set()
sns.set_style("darkgrid")
fastlid.set_languages = ["en", "zh"]

signal.signal(signal.SIGINT, signal.SIG_DFL)
print("Press Ctrl+C to quit\n")


def savelzma(obj, fileloc: str = None):
    """Aux funciton."""
    if fileloc is None:
        fileloc = nameof(obj)  # this wont work
    joblib.dump(obj, f"data/{fileloc}.lzma")


def greet(input):
    """Greet yo."""
    return f"'Sup yo! (your input: {input})"


def upfile1(file1, file2=None) -> Tuple[str, str]:
    """Upload file1, file2."""
    del file2
    return file1.name, f"'Sup yo! (your input: {input})"


def process_2upoads(file1, file2):
    """Process stuff."""
    # return f"{process_upload(file1)}\n===***\n{process_upload(file2)}"

    text1 = [_.strip() for _ in process_upload(file1).splitlines() if _.strip()]
    text2 = [_.strip() for _ in process_upload(file2).splitlines() if _.strip()]

    text1, text2 = zip(*zip_longest(text1, text2, fillvalue=""))

    df = pd.DataFrame({"text1": text1, "text2": text2})

    # return tabulate(df)
    # return tabulate(df, tablefmt="grid")
    # return tabulate(df, tablefmt='html')

    return df


if __name__ == "__main__":
    _ = """
    fn = process_2upoads
    inputs = ["file", "file"]
    examples = [
        ["data/test_zh.txt", "data/test_en.txt"],
        ["data/test_en.txt", "data/test_zh.txt"],
    ]
    outputs = ["dataframe"]
    # """
    import logzero

    # debug = True
    debug = False
    if debug:
        logzero.loglevel(10)
    logger.debug(" debug ")
    logger.info(" info ")

    # _ = """
    inputs = [
        gr.inputs.Textbox(
            # placeholder="Input something here",
            default="test text"
        )
    ]
    inputs = ["file", "file"]
    inputs = [
        gr.inputs.File(label="file 1"),
        # gr.inputs.File(file_count="multiple", label="file 2", optional=True),
        gr.inputs.File(label="file 2", optional=True),
    ]

    # modi 1
    _ = """
        tf_type: Literal[linear, sqrt, log, binary] = 'linear'
        idf_type: Optional[Literal[standard, smooth, bm25]] = None
        dl_type: Optional[Literal[linear, sqrt, log]] = None
        norm: norm: Optional[Literal[l1, l2]] = None
        x min_df: int | float = 1
        x max_df: int | float = 1.0
    # """
    input_tf_type = gr.inputs.Dropdown(
        ["linear", "sqrt", "log", "binary"], default="linear"
    )
    input_idf_type = gr.inputs.Radio(
        ["None", "standard", "smooth", "bm25"], default="None"
    )  # need to convert "None" this to None in fn
    input_dl_type = gr.inputs.Radio(
        ["None", "linear", "sqrt", "log"], default="None"
    )  # ditto
    input_norm_type = gr.inputs.Radio(["None", "l1", "l2"], default="None")  # ditto

    inputs = [
        gr.inputs.File(label="file 1"),
        gr.inputs.File(label="file 2", optional=True),
        input_tf_type,  # modi inputs
        input_idf_type,
        input_dl_type,
        input_norm_type,
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
            step=0.1,
            default=10,
        ),
        gr.inputs.Slider(
            minimum=1,
            maximum=20,
            step=1,
            default=6,
        ),
    ]

    # modi
    examples = [
        [
            "data/test_zh.txt",
            "data/test_en.txt",
            "linear",
            "None",
            "None",
            "None",
            10,
            6,
        ],
        [
            "data/test_en.txt",
            "data/test_zh.txt",
            "linear",
            "None",
            "None",
            "None",
            10,
            6,
        ],
        [
            "data/shakespeare_zh500.txt",
            "data/shakespeare_en500.txt",
            "linear",
            "None",
            "None",
            "None",
            10,
            6,
        ],
        [
            "data/shakespeare_en500.txt",
            "data/shakespeare_zh500.txt",
            "linear",
            "None",
            "None",
            "None",
            10,
            6,
        ],
        [
            "data/hlm-ch1-zh.txt",
            "data/hlm-ch1-en.txt",
            "linear",
            "None",
            "None",
            "None",
            10,
            6,
        ],
        [
            "data/hlm-ch1-en.txt",
            "data/hlm-ch1-zh.txt",
            "linear",
            "None",
            "None",
            "None",
            10,
            6,
        ],
        [
            "data/ps-cn.txt",
            "data/ps-en.txt",
            "linear",
            "None",
            "None",
            "None",
            10,
            4,
        ],
    ]
    outputs = ["dataframe", "plot"]
    outputs = ["plot"]
    outputs = ["dataframe", "plot"]
    out_df = gr.outputs.Dataframe(
        headers=None,
        max_rows=12,  # 20
        max_cols=None,
        overflow_row_behaviour="paginate",
        type="auto",
        label="To be aligned",
    )
    out_df_aligned = gr.outputs.Dataframe(
        headers=None,
        # max_rows=12,  # 20
        max_cols=3,
        overflow_row_behaviour="paginate",
        type="auto",
        label="aligned pairs",
    )
    out_file_dl = gr.outputs.File(
        label="Click to download csv",
    )
    out_file_dl_excel = gr.outputs.File(
        label="Click to download xlsx",
    )

    # modi outputs
    outputs = [
        out_df,
        "plot",
        out_file_dl,
        out_file_dl_excel,
        out_df_aligned,
    ]
    # outputs = ["dataframe", "plot", "plot"]  # wont work
    # outputs = ["dataframe"]
    # outputs = ["dataframe", "dataframe", ]

    # def fn(file1, file2):
    # def fn(file1, file2, min_samples, eps):
    def fn(
        file1,
        file2,
        tf_type,
        idf_type,
        dl_type,
        norm,
        eps,
        min_samples,
    ):
        # modi fn
        """Process inputs and return outputs."""
        logger.debug(" *debug* ")

        # conver "None" to None for those Radio types
        for _ in [idf_type, dl_type, norm]:
            if _ in "None":
                _ = None

        # logger.info("file1: *%s*, file2: *%s*", file1, file2)
        logger.info("file1.name: *%s*, file2.name: *%s*", file1.name, file2.name)

        # bypass if file1 or file2 is str input
        # if not (isinstance(file1, str) or isinstance(file2, str)):
        text1 = file2text(file1)
        text2 = file2text(file2)
        lang1, _ = fastlid(text1)
        lang2, _ = fastlid(text2)

        df1 = files2df(file1, file2)

        lst1 = [elm for elm in df1.text1 if elm]
        lst2 = [elm for elm in df1.text2 if elm]
        # len1 = len(lst1)  # noqa
        # len2 = len(lst2)  # noqa

        cmat = lists2cmat(
            lst1,
            lst2,
            tf_type=tf_type,
            idf_type=idf_type,
            dl_type=dl_type,
            norm=norm,
        )

        tset = pd.DataFrame(cmat2tset(cmat))
        tset.columns = ["x", "y", "cos"]

        df_trimmed = trim_df(df1)
        _ = """
        df_trimmed = pd.concat(
            [
                df1.iloc[:4, :],
                pd.DataFrame(
                    [
                        [
                            "...",
                            "...",
                        ]
                    ],
                    columns=df1.columns,
                ),
                df1.iloc[-4:, :],
            ],
            ignore_index=1,
        )
        # """

        # process lst1, lst2 to obtained df_aligned
        # quick fix ValueError: not enough values to unpack (expected at least 1, got 0)
        # fixed in gen_pet, but we leave the loop here
        for min_s in range(min_samples):
            logger.info(" min_samples, using %s", min_samples - min_s)
            try:
                pset = gen_pset(
                    cmat,
                    eps=eps,
                    min_samples=min_samples - min_s,
                    delta=7,
                )
                break
            except ValueError:
                logger.info(" decrease min_samples by %s", min_s + 1)
                continue
            except Exception as e:
                logger.error(e)
                continue
        else:
            # break should happen above when min_samples = 2
            raise Exception("bummer, this shouldn't happen, probably another bug")

        min_samples = gen_pset.min_samples

        # will result in error message:
        # UserWarning: Starting a Matplotlib GUI outside of
        # the main thread will likely fail."
        _ = """
        plot_cmat(
            cmat,
            eps=eps,
            min_samples=min_samples,
            xlabel=lang1,
            ylabel=lang2,
        )
        # """

        # move plot_cmat's code to the main thread here
        # to make it work
        xlabel = lang1
        ylabel = lang2

        len1, len2 = cmat.shape
        ylim, xlim = len1, len2

        # does not seem to show up
        logger.debug(" len1 (ylim): %s, len2 (xlim): %s", len1, len2)
        if debug:
            print(f" len1 (ylim): {len1}, len2 (xlim): {len2}")

        df_ = pd.DataFrame(cmat2tset(cmat))
        df_.columns = ["x", "y", "cos"]

        sns.set()
        sns.set_style("darkgrid")

        # close all existing figures, necesssary for hf spaces
        plt.close("all")
        # if sys.platform not in ["win32", "linux"]:
        plt.switch_backend('Agg')  # to cater for Mac, thanks to WhiteFox

        # figsize=(13, 8), (339, 212) mm on '1280x800+0+0'
        fig = plt.figure(figsize=(13, 8))

        # gs = fig.add_gridspec(2, 2, wspace=0.4, hspace=0.58)
        gs = fig.add_gridspec(1, 2, wspace=0.4, hspace=0.58)
        ax_heatmap = fig.add_subplot(gs[0, 0])  # ax2
        ax0 = fig.add_subplot(gs[0, 1])
        # ax1 = fig.add_subplot(gs[1, 0])

        cmap = "viridis_r"
        sns.heatmap(cmat, cmap=cmap, ax=ax_heatmap).invert_yaxis()
        ax_heatmap.set_xlabel(xlabel)
        ax_heatmap.set_ylabel(ylabel)
        ax_heatmap.set_title("cos similarity heatmap")

        fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")

        _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
        # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
        _x = ~_

        # max cos along columns
        df_.plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax0)

        # outliers
        df_[_x].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)
        ax0.set_xlabel(xlabel)
        ax0.set_ylabel(ylabel)
        ax0.set_xlim(xmin=0, xmax=xlim)
        ax0.set_ylim(ymin=0, ymax=ylim)
        ax0.set_title(
            "max along columns ('x': outliers)\n"
            "potential aligned pairs (green line)\n"
            f"({round(sum(_) / xlim, 2):.0%})"
        )

        # clustered
        # df_[_].plot.scatter("x", "y", c="cos", cmap=cmap, ax=ax1)
        # ax1.set_xlabel(xlabel)
        # ax1.set_ylabel(ylabel)
        # ax1.set_xlim(0, len1)
        # ax1.set_title(f"potential aligned pairs ({round(sum(_) / len1, 2):.0%})")
        # end of plot_cmat

        src_len, tgt_len = cmat.shape
        aset = gen_aset(pset, src_len, tgt_len)
        final_list = align_texts(aset, lst2, lst1)  # note the order

        # df_aligned = df_trimmed
        df_aligned = pd.DataFrame(final_list, columns=["text1", "text2", "likelihood"])

        # swap text1 text2
        df_aligned = df_aligned[["text2", "text1", "likelihood"]]
        df_aligned.columns = ["text1", "text2", "likelihood"]

        _ = df_aligned.to_csv(index=False)
        file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
        file_dl.write_text(_, encoding="utf8")

        # file_dl.write_text(_, encoding="gb2312")  # no go

        file_dl_xlsx = Path(
            f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.xlsx"
        )
        df_aligned.to_excel(file_dl_xlsx)

        # return df_trimmed, plt
        return df_trimmed, plt, file_dl, file_dl_xlsx, df_aligned
        # modi outputs

    server_port = 7860
    with socket(AF_INET, SOCK_STREAM) as sock:
        sock.settimeout(0.01)  # 10ms

        # try numb times before giving up
        numb = 5
        for _ in range(numb):
            if sock.connect_ex(("127.0.0.1", server_port)) != 0:  # port idle
                break
            server_port = server_port + randint(0, 50)
        else:
            raise SystemExit(f"Tried {numb} times to no avail, giving up...")

    article = dedent(
        """
        ## NB
        *   `radiobee aligner` is a sibling of `bumblebee aligner`. To know more about these aligners, please join qq group `316287378`.
        *   Uploaded files should be in pure text format (txt, md, csv etc). `docx`, `pdf`, `srt`, `html` etc may be supported later on.
        *   Click "Clear" first for subsequent submits when uploading files.
        *   `tf_type` `idf_type` `dl_type` `norm`: Normally there is no need to touch these unless you know what you are doing.
        *   Suggested `esp` and `min_samples` values -- `esp` (minimum epsilon): 8-12, `min_samples`: 4-8.
           -   Smaller larger `esp` or `min_samples` will result in more aligned pairs but also more **false positives** (pairs
           falsely identified as candidates). On the other hand,
           larger smaller `esp` or `min_samples` values tend to miss
           'good' pairs.
        *   If you need to have a better look at the image, you can right-click on the image and select copy-image-address and open a new tab in the browser with the copied image address.
        *   `Flag`: Should something go wrong, you can click Flag to save the output and inform the developer.
    """
    )
    css_image = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
    # css = ".output_image, .input_image {height: 20rem !important; width: 100% !important;}"
    css_input_file = (
        ".input_file, {height: 9rem !important; width: 100% !important;}"
    )
    css_output_file = (
        ".output_file , {height: 4rem !important; width: 100% !important;}"
    )

    logger.info("running at port %s", server_port)

    iface = gr.Interface(
        # fn=greet,
        # inputs="text",
        # fn=process_upload,
        # fn=process_2upoads,
        # inputs=["file", "file"],
        # outputs="text",
        # outputs="html",
        fn=fn,
        inputs=inputs,
        outputs=outputs,
        title="radiobee-aligner🔠",
        description="WIP showcasing a blazing fast dualtext aligner, currrently supported language pairs: en-zh/zh-en",
        article=article,
        examples=examples,
        # theme="darkgrass",
        theme="grass",
        layout="vertical",  # horizontal unaligned
        # height=150,  # 500
        width=900,  # 900
        allow_flagging=True,
        flagging_options=[
            "fatal",
            "bug",
            "brainstorm",
            "excelsior",
        ],  # "paragon"],
        css=f"{css_image} {css_input_file} {css_output_file}",
    )

    iface.launch(
        share=False,
        # share=True,
        debug=debug,
        # server_name="0.0.0.0",
        server_name="127.0.0.1",
        server_port=server_port,
        # show_tips=True,
        enable_queue=True,
    )

_ = """

        ax = sns.heatmap(cmat, cmap="viridis_r")

        ax.invert_yaxis()
        ax.set_xlabel(fastlid(df.text1)[0])
        ax.set_xlabel(fastlid(df.text2)[0])

        # return df, plt
        return plt.gca()

https://colab.research.google.com/drive/1Gz9624VeAQLT7wlETgjOjPVURzQckXI0#scrollTo=qibtTvwecgsL colab gradio-file-inputs-upload.ipynb
    iface = gr.Interface(plot_text, "file", "image")

def is_port_in_use(port):
    import socket
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(('localhost', port)) == 0

socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex(('127.0.0.1', 7911))

---
css https://huggingface.co/spaces/nielsr/LayoutLMv2-FUNSD/blob/main/app.py#L83

css = ".output_image, .input_image {height: 40rem !important; width: 100% !important;}"
#css = "@media screen and (max-width: 600px) { .output_image, .input_image {height:20rem !important; width: 100% !important;} }"
# css = ".output_image, .input_image {height: 600px !important}"

mod = 'en2zh'
packname = packx.__name__

globals()[mod] = getattr(importlib.import_module(f"{packname}.{mod}"), mod)

"""