Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,26 +1,72 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
from transformers import pipeline
|
3 |
-
from
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
else:
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
|
|
|
|
|
|
|
17 |
|
18 |
-
|
19 |
|
20 |
-
def ner(input: str) -> Dict[str, Any]:
    """Return ``input`` together with the merged tokens computed for it.

    The text is handed to ``get_completion`` and that result is passed
    through ``merge_tokens``; both helpers are defined outside this view.

    Returns:
        A dict with the original text under ``"text"`` and the merged
        tokens under ``"entities"``.
    """
    return {"text": input, "entities": merge_tokens(get_completion(input))}
|
24 |
|
25 |
css = '''
|
26 |
h1#title {
|
@@ -29,13 +75,33 @@ h1#title {
|
|
29 |
'''
|
30 |
|
31 |
theme = gr.themes.Soft()
|
|
|
32 |
demo = gr.Blocks(css=css, theme=theme)
|
33 |
|
34 |
with demo:
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from transformers import BertTokenizer, BertForTokenClassification
|
3 |
from transformers import pipeline
|
4 |
+
from collections import defaultdict
|
5 |
+
|
6 |
+
|
7 |
+
# Checkpoint identifier shared by the model and tokenizer loads below.
model_name = "b3x0m/bert-xomlac-ner"

# Token-classification weights and the matching tokenizer, both loaded from
# the same checkpoint.
model = BertForTokenClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Shared "ner" pipeline; ner() below calls it once per chunk of text.
nlp_ner = pipeline(task="ner", tokenizer=tokenizer, model=model)
|
12 |
+
|
13 |
+
# Number of input lines joined into one batch of text.
_LINES_PER_BATCH = 32
# Largest number of token ids decoded back into text for one pipeline call.
_MAX_TOKENS_PER_CALL = 128


def _tally_entities(ner_results, entity_count):
    """Merge per-token tags into entities and count each (text, label) pair.

    ``ner_results`` is an iterable of dicts with at least the keys
    ``'entity'`` (the tag, e.g. ``"B-PER"``) and ``'word'``. An entity is
    counted only when an ``E-`` tag closes it; its label is taken from that
    closing tag. ``entity_count`` is updated in place.
    """
    current_entity = None
    for entity in ner_results:
        tag = entity['entity']
        if tag.startswith("B-"):
            # A begin tag always opens a fresh entity. An unterminated
            # predecessor is dropped instead of being glued onto this one.
            current_entity = {'text': entity['word'], 'label': tag[2:]}
        elif tag.startswith(("M-", "I-")):
            if current_entity is None:
                # Tolerate a continuation tag with no preceding begin tag.
                current_entity = {'text': entity['word'], 'label': tag[2:]}
            else:
                current_entity['text'] += entity['word']
        elif tag.startswith("E-"):
            if current_entity:
                current_entity['text'] += entity['word']
                current_entity['label'] = tag[2:]
                entity_count[(current_entity['text'], current_entity['label'])] += 1
                current_entity = None


def ner(file, selected_entities, min_count):
    """Count the named entities found in an uploaded text file.

    The file is read as UTF-8, split into lines, and processed in batches of
    ``_LINES_PER_BATCH`` lines joined by spaces. A batch whose token ids
    exceed ``_MAX_TOKENS_PER_CALL`` is cut into id slices of that size, each
    decoded back to text before being sent to ``nlp_ner``. Entity state is
    not carried across pipeline calls, so an entity split by a chunk
    boundary is not reassembled.

    Args:
        file: Uploaded file, either a path string or an object exposing the
            path as ``.name`` (which one gradio passes depends on its
            version / the File component's ``type``). ``None`` means nothing
            was uploaded.
        selected_entities: Labels to keep; an empty selection keeps all.
        min_count: Minimum number of occurrences for an entity to be
            listed. ``None`` (empty number field) applies no minimum.

    Returns:
        One ``name=label=count`` line per retained entity, joined with
        newlines; an empty string when there is nothing to report.
    """
    if file is None:
        return ""
    if min_count is None:
        min_count = 0

    path = getattr(file, "name", file)
    # Explicit encoding so decoding does not depend on the host locale.
    with open(path, encoding="utf-8") as f:
        text = f.read()

    lines = text.splitlines()
    entity_count = defaultdict(int)

    for start in range(0, len(lines), _LINES_PER_BATCH):
        batch_text = " ".join(lines[start:start + _LINES_PER_BATCH])
        tokens = tokenizer(batch_text)['input_ids']

        if len(tokens) > _MAX_TOKENS_PER_CALL:
            for i in range(0, len(tokens), _MAX_TOKENS_PER_CALL):
                sub_tokens = tokens[i:i + _MAX_TOKENS_PER_CALL]
                sub_batch_text = tokenizer.decode(sub_tokens, skip_special_tokens=True)
                _tally_entities(nlp_ner(sub_batch_text), entity_count)
        else:
            _tally_entities(nlp_ner(batch_text), entity_count)

    output = [
        f"{name}={label}={count}"
        for (name, label), count in entity_count.items()
        if count >= min_count and (not selected_entities or label in selected_entities)
    ]
    return "\n".join(output)
|
69 |
|
|
|
|
|
|
|
|
|
70 |
|
71 |
css = '''
|
72 |
h1#title {
|
|
|
75 |
'''
|
76 |
|
77 |
# Built-in Soft theme, combined with the custom CSS string defined above.
theme = gr.themes.Soft()

demo = gr.Blocks(theme=theme, css=css)

# NOTE(review): indentation is not visible in this view; the components and
# the Interface are assumed to be nested under `with demo:` — confirm.
with demo:
    # Upload widget restricted to .txt files; its value is ner()'s `file`.
    input_file = gr.File(file_types=[".txt"], label="Upload File (.txt)")

    # Label filter; its value is passed to ner() as `selected_entities`.
    entity_filter = gr.CheckboxGroup(
        choices=["PER", "ORG", "LOC", "GPE"],
        label="Entities",
        type="value",
    )

    # Occurrence threshold; its value is passed to ner() as `min_count`.
    count_entities = gr.Number(value=3, label="Frequency", minimum=1, maximum=10, step=1)

    # Read-only box that receives the string returned by ner().
    output_text = gr.Textbox(
        label="Output",
        lines=10,
        max_lines=20,
        interactive=False,
        show_copy_button=True,
    )

    # Wire the three inputs to ner() and route its result to the textbox.
    interface = gr.Interface(
        fn=ner,
        inputs=[input_file, entity_filter, count_entities],
        outputs=[output_text],
        allow_flagging="never",
    )

demo.launch()
|