davanstrien (HF staff) committed
Commit 059d73d · verified · 1 Parent(s): 06cab33

Upload 2 files

Files changed (2)
  1. app.py +247 -0
  2. requirements.txt +277 -0
app.py ADDED
@@ -0,0 +1,247 @@
+ import os
+ import random
+ from statistics import mean
+ from typing import Iterator, Union
+
+ import fasttext
+ import gradio as gr
+ from dotenv import load_dotenv
+ from httpx import AsyncClient, Client, Timeout
+ from huggingface_hub import hf_hub_download
+ from huggingface_hub.utils import logging
+ from litestar import Litestar, get
+ from toolz import concat, groupby, valmap
+
+ logger = logging.get_logger(__name__)
+ load_dotenv()
+ HF_TOKEN = os.getenv("HF_TOKEN")
+
+
+ BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
+ DEFAULT_FAST_TEXT_MODEL = "laurievb/OpenLID"
+ headers = {
+     "authorization": f"Bearer {HF_TOKEN}",
+ }
+ timeout = Timeout(60, read=120)
+ client = Client(headers=headers, timeout=timeout)
+ async_client = AsyncClient(headers=headers, timeout=timeout)
+ # non-exhaustive list of columns that might contain text usable for language detection;
+ # ordered by preference, i.e. if a dataset has a "text" column we use it first
+ TARGET_COLUMN_NAMES = (
+     "text",
+     "input",
+     "tokens",
+     "prompt",
+     "instruction",
+     "sentence_1",
+     "question",
+     "sentence2",
+     "answer",
+     "sentence",
+     "response",
+     "context",
+     "query",
+     "chosen",
+     "rejected",
+ )
+
+
+ def datasets_server_valid_rows(hub_id: str):
+     resp = client.get(f"{BASE_DATASETS_SERVER_URL}/is-valid?dataset={hub_id}")
+     resp.raise_for_status()
+     return resp.json()["viewer"]
+
+
+ def get_first_config_and_split_name(hub_id: str):
+     resp = client.get(f"{BASE_DATASETS_SERVER_URL}/splits?dataset={hub_id}")
+     resp.raise_for_status()
+     data = resp.json()
+     return data["splits"][0]["config"], data["splits"][0]["split"]
+
+
+ def get_dataset_info(hub_id: str, config: str | None = None):
+     if config is None:
+         config = get_first_config_and_split_name(hub_id)
+         if config is None:
+             return None
+         else:
+             config = config[0]
+     resp = client.get(
+         f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}"
+     )
+     resp.raise_for_status()
+     return resp.json()
+
+
+ async def get_random_rows(
+     hub_id: str,
+     total_length: int,
+     number_of_rows: int,
+     max_request_calls: int,
+     config="default",
+     split="train",
+ ):
+     rows = []
+     rows_per_call = min(
+         number_of_rows // max_request_calls, total_length // max_request_calls
+     )
+     rows_per_call = min(rows_per_call, 100)  # ensure rows_per_call is not more than 100
+     rows_per_call = max(rows_per_call, 1)  # guard against a zero division for very small datasets
+     for _ in range(min(max_request_calls, number_of_rows // rows_per_call)):
+         offset = random.randint(0, total_length - rows_per_call)
+         url = f"{BASE_DATASETS_SERVER_URL}/rows?dataset={hub_id}&config={config}&split={split}&offset={offset}&length={rows_per_call}"
+         response = await async_client.get(url)
+         if response.status_code == 200:
+             data = response.json()
+             batch_rows = data.get("rows")
+             rows.extend(batch_rows)
+         else:
+             print(f"Failed to fetch data: {response.status_code}")
+             print(url)
+         if len(rows) >= number_of_rows:
+             break
+     return [row.get("row") for row in rows]
+
+
+ def load_model(repo_id: str) -> fasttext.FastText._FastText:
+     model_path = hf_hub_download(repo_id, filename="model.bin")
+     return fasttext.load_model(model_path)
+
+
+ def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterator[str]:
+     for row in rows:
+         if isinstance(row, str):
+             # split on newlines and drop empty lines
+             lines = row.split("\n")
+             for line in lines:
+                 if line:
+                     yield line
+         elif isinstance(row, list):
+             try:
+                 line = " ".join(row)
+                 if len(line) < min_length:
+                     continue
+                 else:
+                     yield line
+             except TypeError:
+                 continue
+
+
+ FASTTEXT_PREFIX_LENGTH = 9  # fasttext labels are formatted like "__label__eng_Latn"
+
+ # model = load_model(DEFAULT_FAST_TEXT_MODEL)
+
+ model = fasttext.load_model(
+     hf_hub_download("facebook/fasttext-language-identification", "model.bin")
+ )
+
+
+ def model_predict(inputs: str, k=1) -> list[dict[str, str | float]]:
+     predictions = model.predict(inputs, k=k)
+     return [
+         {"label": label[FASTTEXT_PREFIX_LENGTH:], "score": prob}
+         for label, prob in zip(predictions[0], predictions[1])
+     ]
+
+
+ def get_label(x):
+     return x.get("label")
+
+
+ def get_mean_score(preds):
+     return mean([pred.get("score") for pred in preds])
+
+
+ def filter_by_frequency(counts_dict: dict, threshold_percent: float = 0.2):
+     """Return the keys whose count is at least `threshold_percent` of the total."""
+     total = sum(counts_dict.values())
+     threshold = total * threshold_percent
+     return {k for k, v in counts_dict.items() if v >= threshold}
+
+
+ def predict_rows(rows, target_column, language_threshold_percent=0.2):
+     rows = (row.get(target_column) for row in rows)
+     rows = (row for row in rows if row is not None)
+     rows = list(yield_clean_rows(rows))
+     predictions = [model_predict(row) for row in rows]
+     predictions = [pred for pred in predictions if pred is not None]
+     predictions = list(concat(predictions))
+     predictions_by_lang = groupby(get_label, predictions)
+     language_counts = valmap(len, predictions_by_lang)
+     keys_to_keep = filter_by_frequency(
+         language_counts, threshold_percent=language_threshold_percent
+     )
+     filtered_dict = {k: v for k, v in predictions_by_lang.items() if k in keys_to_keep}
+     return {
+         "predictions": dict(valmap(get_mean_score, filtered_dict)),
+         "pred": predictions,
+     }
+
+
+ @get("/predict_language/")
+ async def predict_language(
+     hub_id: str,
+     config: str | None = None,
+     split: str | None = None,
+     max_request_calls: int = 10,
+     number_of_rows: int = 1000,
+ ) -> dict[str, float | str]:
+     is_valid = datasets_server_valid_rows(hub_id)
+     if not is_valid:
+         raise gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
+     if not config:
+         config, split = get_first_config_and_split_name(hub_id)
+     info = get_dataset_info(hub_id, config)
+     if info is None:
+         raise gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
+     if dataset_info := info.get("dataset_info"):
+         total_rows_for_split = dataset_info.get("splits").get(split).get("num_examples")
+         logger.info(f"Total rows for split {split}: {total_rows_for_split}")
+         features = dataset_info.get("features")
+         column_names = set(features.keys())
+         logger.info(f"Column names: {column_names}")
+         if not set(column_names).intersection(TARGET_COLUMN_NAMES):
+             raise gr.Error(
+                 f"Dataset {hub_id} does not contain any of the target columns {TARGET_COLUMN_NAMES}"
+             )
+         for column in TARGET_COLUMN_NAMES:
+             if column in column_names:
+                 target_column = column
+                 logger.info(f"Using column {target_column} for language detection")
+                 break
+         random_rows = await get_random_rows(
+             hub_id,
+             total_rows_for_split,
+             number_of_rows,
+             max_request_calls,
+             config,
+             split,
+         )
+         logger.info(f"Predicting language for {len(random_rows)} rows")
+         predictions = predict_rows(random_rows, target_column)
+         predictions["hub_id"] = hub_id
+         predictions["config"] = config
+         predictions["split"] = split
+         return predictions
+
+
+ app = Litestar([predict_language])
+ # inputs = [
+ #     gr.Text(label="dataset id"),
+ #     gr.Textbox(
+ #         None,
+ #         label="config",
+ #     ),
+ #     gr.Textbox(None, label="split"),
+ # ]
+ # interface = gr.Interface(predict_language, inputs=inputs, outputs="json")
+ # interface.queue()
+ # interface.launch()
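
Once the Litestar app is running, the /predict_language/ endpoint can be exercised with a plain HTTP request. A minimal sketch (the base URL and the example dataset id below are placeholders, not part of the committed code):

    import httpx

    # hypothetical local URL; point this at the deployed Space instead
    resp = httpx.get(
        "http://localhost:8000/predict_language/",
        params={"hub_id": "imdb", "number_of_rows": 200},
        timeout=120,
    )
    # response contains "predictions" (mean score per detected language),
    # the raw per-row "pred" list, plus hub_id/config/split echoes
    print(resp.json())
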
requirements.txt ADDED
@@ -0,0 +1,277 @@
+ #
+ # This file is autogenerated by pip-compile with Python 3.11
+ # by the following command:
+ #
+ #    pip-compile
+ #
+ aiofiles==23.2.1
+     # via gradio
+ aiohttp==3.9.1
+     # via
+     #   datasets
+     #   fsspec
+ aiosignal==1.3.1
+     # via aiohttp
+ altair==5.2.0
+     # via gradio
+ annotated-types==0.6.0
+     # via pydantic
+ anyio==4.2.0
+     # via
+     #   httpx
+     #   litestar
+     #   starlette
+ attrs==23.2.0
+     # via
+     #   aiohttp
+     #   jsonschema
+     #   referencing
+ certifi==2023.11.17
+     # via
+     #   httpcore
+     #   httpx
+     #   requests
+ charset-normalizer==3.3.2
+     # via requests
+ click==8.1.7
+     # via
+     #   litestar
+     #   rich-click
+     #   typer
+     #   uvicorn
+ colorama==0.4.6
+     # via typer
+ contourpy==1.2.0
+     # via matplotlib
+ cycler==0.12.1
+     # via matplotlib
+ datasets==2.14.4
+     # via -r requirements.in
+ dill==0.3.7
+     # via
+     #   datasets
+     #   multiprocess
+ faker==22.5.0
+     # via polyfactory
+ fastapi==0.109.0
+     # via gradio
+ fasttext==0.9.2
+     # via -r requirements.in
+ ffmpy==0.3.1
+     # via gradio
+ filelock==3.13.1
+     # via huggingface-hub
+ fonttools==4.47.2
+     # via matplotlib
+ frozenlist==1.4.1
+     # via
+     #   aiohttp
+     #   aiosignal
+ fsspec[http]==2023.12.2
+     # via
+     #   datasets
+     #   gradio-client
+     #   huggingface-hub
+ gradio==4.15.0
+     # via -r requirements.in
+ gradio-client==0.8.1
+     # via gradio
+ h11==0.14.0
+     # via
+     #   httpcore
+     #   uvicorn
+ httpcore==1.0.2
+     # via httpx
+ httpx==0.26.0
+     # via
+     #   -r requirements.in
+     #   gradio
+     #   gradio-client
+     #   litestar
+ huggingface-hub==0.20.3
+     # via
+     #   -r requirements.in
+     #   datasets
+     #   gradio
+     #   gradio-client
+ idna==3.6
+     # via
+     #   anyio
+     #   httpx
+     #   requests
+     #   yarl
+ importlib-resources==6.1.1
+     # via gradio
+ iso639-lang==2.2.2
+     # via -r requirements.in
+ jinja2==3.1.3
+     # via
+     #   altair
+     #   gradio
+ jsonschema==4.21.1
+     # via altair
+ jsonschema-specifications==2023.12.1
+     # via jsonschema
+ kiwisolver==1.4.5
+     # via matplotlib
+ litestar==2.5.1
+     # via -r requirements.in
+ markdown-it-py==3.0.0
+     # via rich
+ markupsafe==2.1.4
+     # via
+     #   gradio
+     #   jinja2
+ matplotlib==3.8.2
+     # via gradio
+ mdurl==0.1.2
+     # via markdown-it-py
+ msgspec==0.18.6
+     # via litestar
+ multidict==6.0.4
+     # via
+     #   aiohttp
+     #   litestar
+     #   yarl
+ multiprocess==0.70.15
+     # via datasets
+ numpy==1.26.3
+     # via
+     #   altair
+     #   contourpy
+     #   datasets
+     #   fasttext
+     #   gradio
+     #   matplotlib
+     #   pandas
+     #   pyarrow
+ orjson==3.9.12
+     # via gradio
+ packaging==23.2
+     # via
+     #   altair
+     #   datasets
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   matplotlib
+ pandas==2.2.0
+     # via
+     #   altair
+     #   datasets
+     #   gradio
+ pillow==10.2.0
+     # via
+     #   gradio
+     #   matplotlib
+ polyfactory==2.14.1
+     # via litestar
+ pyarrow==15.0.0
+     # via datasets
+ pybind11==2.11.1
+     # via fasttext
+ pydantic==2.5.3
+     # via
+     #   fastapi
+     #   gradio
+ pydantic-core==2.14.6
+     # via pydantic
+ pydub==0.25.1
+     # via gradio
+ pygments==2.17.2
+     # via rich
+ pyparsing==3.1.1
+     # via matplotlib
+ python-dateutil==2.8.2
+     # via
+     #   faker
+     #   matplotlib
+     #   pandas
+ python-dotenv==1.0.1
+     # via -r requirements.in
+ python-multipart==0.0.6
+     # via gradio
+ pytz==2023.3.post1
+     # via pandas
+ pyyaml==6.0.1
+     # via
+     #   datasets
+     #   gradio
+     #   huggingface-hub
+     #   litestar
+ referencing==0.32.1
+     # via
+     #   jsonschema
+     #   jsonschema-specifications
+ requests==2.31.0
+     # via
+     #   datasets
+     #   fsspec
+     #   huggingface-hub
+ rich==13.7.0
+     # via
+     #   -r requirements.in
+     #   litestar
+     #   rich-click
+     #   typer
+ rich-click==1.7.3
+     # via litestar
+ rpds-py==0.17.1
+     # via
+     #   jsonschema
+     #   referencing
+ ruff==0.1.14
+     # via gradio
+ semantic-version==2.10.0
+     # via gradio
+ shellingham==1.5.4
+     # via typer
+ six==1.16.0
+     # via python-dateutil
+ sniffio==1.3.0
+     # via
+     #   anyio
+     #   httpx
+ starlette==0.35.1
+     # via fastapi
+ tomlkit==0.12.0
+     # via gradio
+ toolz==0.12.0
+     # via
+     #   -r requirements.in
+     #   altair
+ tqdm==4.66.1
+     # via
+     #   datasets
+     #   huggingface-hub
+ typer[all]==0.9.0
+     # via
+     #   gradio
+     #   typer
+ typing-extensions==4.9.0
+     # via
+     #   fastapi
+     #   gradio
+     #   gradio-client
+     #   huggingface-hub
+     #   litestar
+     #   polyfactory
+     #   pydantic
+     #   pydantic-core
+     #   rich-click
+     #   typer
+ tzdata==2023.4
+     # via pandas
+ urllib3==2.1.0
+     # via requests
+ uvicorn==0.27.0
+     # via gradio
+ websockets==11.0.3
+     # via gradio-client
+ xxhash==3.4.1
+     # via datasets
+ yarl==1.9.4
+     # via aiohttp
+
+ # The following packages are considered to be unsafe in a requirements file:
+ # setuptools