Spaces:
Runtime error
Runtime error
nurasaki
committed on
Commit
·
7360456
1
Parent(s):
704dc9c
gradio_nlp_berta_masked_example: first commit
Browse files- .gitignore +1 -0
- README.md +20 -11
- app.py +95 -37
- flagged/log.csv +6 -0
.gitignore
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
__pycache__/
|
| 2 |
.DS_Store
|
|
|
|
|
|
| 1 |
__pycache__/
|
| 2 |
.DS_Store
|
| 3 |
+
private.md
|
README.md
CHANGED
|
@@ -10,20 +10,29 @@ pinned: false
|
|
| 10 |
---
|
| 11 |
|
| 12 |
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
git status
|
| 20 |
-
git add .
|
| 21 |
-
git commit -am "gradio_nlp_berta_masked_example: first commit"
|
| 22 |
-
git push
|
| 23 |
|
| 24 |
-
|
| 25 |
-
git push gh_repo main
|
| 26 |
-
```
|
| 27 |
|
|
|
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
|
| 13 |
+
# Masked Language Modeling Example
|
| 14 |
|
| 15 |
+
by [nurasaki](https://huggingface.co/spaces/nurasaki)
|
| 16 |
|
| 17 |
+
* Space : [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
|
| 18 |
+
* Model used: Catalan BERTa-v2 (roberta-base-ca-v2) base model
|
| 19 |
+
* Hugging Face link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
+
<br>
|
|
|
|
|
|
|
| 22 |
|
| 23 |
+
## Model description
|
| 24 |
|
| 25 |
+
The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language.
|
| 26 |
+
|
| 27 |
+
It is based on the [RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-size corpus collected from publicly available corpora and crawlers.
|
| 28 |
+
|
| 29 |
+
<br>
|
| 30 |
+
|
| 31 |
+
## Usage
|
| 32 |
+
|
| 33 |
+
The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.
|
| 34 |
+
|
| 35 |
+
Choose one of the provided examples or enter your own masked text.
|
| 36 |
+
|
| 37 |
+
<br>
|
| 38 |
|
app.py
CHANGED
|
@@ -1,12 +1,23 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
# save your HF API token from https://hf.co/settings/tokens as an env variable to avoid rate limiting
|
| 5 |
auth_token = os.getenv("auth_token")
|
| 6 |
|
| 7 |
|
| 8 |
|
| 9 |
|
|
|
|
| 10 |
print("========================================================================")
|
| 11 |
print("Starting ... gradio_demo_nlp_autocomplete/app.py")
|
| 12 |
print("AUTH TOKEN:", auth_token)
|
|
@@ -14,58 +25,105 @@ print("AUTH TOKEN:", auth_token)
|
|
| 14 |
|
| 15 |
# load a model from https://hf.co/models as an interface, then use it as an api
|
| 16 |
# you can remove the api_key parameter if you don't care about rate limiting.
|
| 17 |
-
api = gr.Interface.load(
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
| 20 |
|
|
|
|
| 21 |
|
|
|
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
-
|
| 27 |
-
print("type(api):", type(api) )
|
| 28 |
-
print("Api:", api, "\n" )
|
| 29 |
|
| 30 |
|
|
|
|
| 31 |
|
| 32 |
-
|
| 33 |
-
print("text:")
|
| 34 |
-
print(text)
|
| 35 |
-
print("------------------------------------------------------------------------")
|
| 36 |
-
print("text[:-50]:")
|
| 37 |
-
print(text[:-50])
|
| 38 |
-
print("------------------------------------------------------------------------")
|
| 39 |
-
print("api(text):")
|
| 40 |
-
print(api(text))
|
| 41 |
-
print("------------------------------------------------------------------------")
|
| 42 |
-
print("text[-50:]:")
|
| 43 |
-
print(text[-50:])
|
| 44 |
-
print("------------------------------------------------------------------------")
|
| 45 |
-
print("api(text[-50:]")
|
| 46 |
-
print(api(text[-50:]))
|
| 47 |
-
print("------------------------------------------------------------------------")
|
| 48 |
-
|
| 49 |
|
| 50 |
-
|
| 51 |
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
print("------------------------------------------------------------------------")
|
| 56 |
-
print("with gr.Blocks")
|
| 57 |
|
| 58 |
-
textbox = gr.Textbox(placeholder="Type here...", lines=4)
|
| 59 |
-
btn = gr.Button("Autocomplete")
|
| 60 |
-
|
| 61 |
-
print("textbox:", textbox)
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
|
|
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
| 8 |
+
import logging
|
| 9 |
+
from torch.nn.functional import softmax
|
| 10 |
+
import pandas as pd
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
# save your HF API token from https://hf.co/settings/tokens as an env variable to avoid rate limiting
|
| 15 |
auth_token = os.getenv("auth_token")

print("========================================================================")
print("Starting ... gradio_demo_nlp_autocomplete/app.py")
# Report only whether a token is configured. Printing the token itself would
# write the secret to the application logs.
print("AUTH TOKEN:", "set" if auth_token else "not set")
|
|
|
|
| 25 |
|
| 26 |
# load a model from https://hf.co/models as an interface, then use it as an api
|
| 27 |
# you can remove the api_key parameter if you don't care about rate limiting.
|
| 28 |
+
# api = gr.Interface.load(, api_key=auth_token,)
|
| 29 |
|
| 30 |
|
| 31 |
+
model_ref = "projecte-aina/roberta-base-ca-v2"
|
| 32 |
+
tokenizer = AutoTokenizer.from_pretrained(model_ref)
|
| 33 |
+
model = AutoModelForMaskedLM.from_pretrained(model_ref)
|
| 34 |
|
| 35 |
+
def get_topk(text, tokenizer, model, k):
    """Return the k most probable tokens for each mask token found in ``text``.

    Args:
        text: Input text passed to ``tokenizer``; expected to contain the
            tokenizer's mask token.
        tokenizer: Callable tokenizer that exposes ``mask_token_id`` and
            returns an encoding with ``input_ids`` when called with
            ``return_tensors="pt"``.
        model: Masked-language model; its call result must expose ``.logits``.
        k: Number of candidates to keep per mask position. Values larger than
            the vocabulary size are clamped to the vocabulary size.

    Returns:
        A tuple ``(topk, mask_idx)``. ``topk`` is the ``Tensor.topk`` result
        (``values`` and ``indices``) with one row per mask token found, and
        ``mask_idx`` holds the token position of each mask. When no mask token
        is present, the rows are empty.
    """
    print("Get top K,", text)

    # Tokenize
    tokenizer_kwargs = dict(padding='longest', return_token_type_ids=False, return_tensors="pt")
    inputs = tokenizer(text, **tokenizer_kwargs).to("cpu")
    input_ids = inputs.input_ids

    # Inference only: disable autograd so no graph is built or kept alive
    # for each request.
    with torch.no_grad():
        logits = model.to("cpu")(**inputs).logits
        # Normalise over the last axis (one score per vocabulary entry).
        probs = softmax(logits, dim=2)

    # Positions of the mask token as (row, column) index pairs.
    # NOTE: callers in this file only consume a single mask.
    row_idx, mask_idx = torch.where(input_ids.to("cpu") == tokenizer.mask_token_id)

    # topk raises if k exceeds the size of the reduced dimension.
    top_k = min(k, probs.shape[-1])

    return probs[row_idx, mask_idx].topk(top_k), mask_idx
|
|
|
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
+
def generate_output(text, k):
    """Build the label/confidence mapping for the masked position in ``text``.

    Args:
        text: Sentence containing the tokenizer's mask token.
        k: Number of candidates to return; converted with ``int``.

    Returns:
        A dict mapping each decoded candidate token to its probability, for
        the first mask token found in ``text``.

    Raises:
        gr.Error: If ``k`` is not a positive whole number, or if ``text``
            contains no mask token.
    """
    # Reject an empty, non-numeric or non-finite value before calling the model.
    try:
        top_k = int(k)
    except (TypeError, ValueError, OverflowError):
        raise gr.Error("Num. results must be a whole number.") from None
    if top_k < 1:
        raise gr.Error("Num. results must be at least 1.")

    (values, indices), _ = get_topk(text, tokenizer, model, top_k)

    # With no mask token there is nothing to predict; report it to the user
    # instead of failing on an unbound result.
    if len(values) == 0:
        raise gr.Error(f"No {tokenizer.mask_token} token found in the input text.")

    # Only one mask is supported; use the first one found.
    mask_vals, mask_indices = values[0], indices[0]
    return {
        tokenizer.decode(token_id): prob.item()
        for prob, token_id in zip(mask_vals, mask_indices)
    }
|
|
|
|
|
|
|
|
|
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
+
md_text ="""
|
| 74 |
+
# Masked Language Modeling Example
|
| 75 |
|
| 76 |
+
by [nurasaki](https://huggingface.co/spaces/nurasaki)
|
| 77 |
+
|
| 78 |
+
* Space : [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
|
| 79 |
+
* Model used: Catalan BERTa-v2 (roberta-base-ca-v2) base model
|
| 80 |
+
* Hugginface link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)
|
| 81 |
+
|
| 82 |
+
<br>
|
| 83 |
+
|
| 84 |
+
## Model description
|
| 85 |
+
|
| 86 |
+
The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language.
|
| 87 |
|
| 88 |
+
It is based on the [RoBERTA](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-size corpus collected from publicly available corpora and crawlers.
|
| 89 |
|
| 90 |
+
<br>
|
| 91 |
+
|
| 92 |
+
## Usage
|
| 93 |
+
|
| 94 |
+
The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.
|
| 95 |
+
|
| 96 |
+
Choose one of the provided examples or enter your own masked text.
|
| 97 |
+
|
| 98 |
+
<br>
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
"""
|
| 103 |
+
|
| 104 |
+
examples = [
|
| 105 |
+
"La meva mare es diu <mask>.",
|
| 106 |
+
"La meva mare treballa de <mask>.",
|
| 107 |
+
"El meu fill es diu <mask>.",
|
| 108 |
+
"El teu pare treballa de <mask>.",
|
| 109 |
+
]
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
with gr.Blocks() as demo:
|
| 114 |
+
gr.Markdown(md_text)
|
| 115 |
+
with gr.Row():
|
| 116 |
+
with gr.Column():
|
| 117 |
+
text = gr.Textbox("La meva mare es diu <mask>.", label="Masked text")
|
| 118 |
+
k = gr.Number(value=10, label="Num. results")
|
| 119 |
+
btn = gr.Button("Generate")
|
| 120 |
+
|
| 121 |
+
with gr.Column():
|
| 122 |
+
out_label = gr.Label(label="Results")
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
btn.click(generate_output, inputs=[text, k], outputs=[out_label])
|
| 126 |
+
gr.Examples(examples, inputs=[text])
|
| 127 |
+
|
| 128 |
+
# if __name__ == "__main__":
|
| 129 |
+
demo.launch(favicon_path="favicon.png")
|
flagged/log.csv
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Input Text,output,flag,username,timestamp
|
| 2 |
+
"The tower is 324 metres (1,063 ft) tall,",,,,2023-04-03 16:23:19.212953
|
| 3 |
+
,"<p>Start typing below and then click <strong>Run</strong> to see the output.</p>
|
| 4 |
+
",,,2023-04-03 16:28:32.735416
|
| 5 |
+
El teu pare treballa de <maks>.,"<p>Masked Text: xxx</p>
|
| 6 |
+
",,,,2023-04-03 17:34:10.400919
|