b3x0m commited on
Commit
e10858d
·
verified ·
1 Parent(s): d5d78ba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -23
app.py CHANGED
@@ -1,26 +1,72 @@
1
  import gradio as gr
 
2
  from transformers import pipeline
3
- from typing import List, Dict, Any
4
-
5
- def merge_tokens(tokens: List[Dict[str, any]]) -> List[Dict[str, any]]:
6
- merged_tokens = []
7
- for token in tokens:
8
- if merged_tokens and token['entity'].startswith('I-') and merged_tokens[-1]['entity'].endswith(token['entity'][2:]):
9
- last_token = merged_tokens[-1]
10
- last_token['word'] += token['word'].replace('##', '')
11
- last_token['end'] = token['end']
12
- last_token['score'] = (last_token['score'] + token['score']) / 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  else:
14
- merged_tokens.append(token)
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
- return merged_tokens
 
 
 
17
 
18
- get_completion = pipeline("ner", model="b3x0m/bert-xomlac-ner")
19
 
20
- def ner(input: str) -> Dict[str, Any]:
21
- output = get_completion(input)
22
- merged_tokens = merge_tokens(output)
23
- return {"text": input, "entities": merged_tokens}
24
 
25
  css = '''
26
  h1#title {
@@ -29,13 +75,33 @@ h1#title {
29
  '''
30
 
31
  theme = gr.themes.Soft()
 
32
  demo = gr.Blocks(css=css, theme=theme)
33
 
34
  with demo:
35
- interface = gr.Interface(fn=ner,
36
- inputs=[gr.Textbox(label="Input text", lines=10)],
37
- outputs=[gr.HighlightedText(label="Output")],
38
- allow_flagging="never",
39
- examples=["灵符山道场之外,玄玉子、赵成等诸多灵符山高层落座。", "李雷和韩梅梅今天一起去北京旅游。"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- demo.launch()
 
1
import gradio as gr
from transformers import BertTokenizer, BertForTokenClassification
from transformers import pipeline
from collections import defaultdict


# NER checkpoint (b3x0m/bert-xomlac-ner). Loaded once at import time so the
# Gradio handler below can reuse the same model/tokenizer for every request.
# NOTE(review): example inputs in the previous revision are Chinese text, so
# this is presumably a Chinese NER model — confirm against the model card.
model_name = "b3x0m/bert-xomlac-ner"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name)

# Hugging Face token-classification ("ner") pipeline wrapping the pair above;
# emits one dict per token with an 'entity' tag and a 'word' piece.
nlp_ner = pipeline("ner", model=model, tokenizer=tokenizer)
12
+
13
+ def ner(file, selected_entities, min_count):
14
+ with open(file.name) as f:
15
+ text = f.read()
16
+
17
+ lines = text.splitlines()
18
+
19
+ batch_size = 32
20
+ batches = [lines[i:i + batch_size] for i in range(0, len(lines), batch_size)]
21
+
22
+ entity_count = defaultdict(int)
23
+
24
+ for batch in batches:
25
+ batch_text = " ".join(batch)
26
+ tokens = tokenizer(batch_text)['input_ids']
27
+
28
+ if len(tokens) > 128:
29
+ for i in range(0, len(tokens), 128):
30
+ sub_tokens = tokens[i:i + 128]
31
+ sub_batch_text = tokenizer.decode(sub_tokens, skip_special_tokens=True)
32
+ ner_results = nlp_ner(sub_batch_text)
33
+
34
+ current_entity = None
35
+ for entity in ner_results:
36
+ if entity['entity'].startswith("B-") or entity['entity'].startswith("M-") or entity['entity'].startswith("I-"):
37
+ if current_entity is None:
38
+ current_entity = {'text': entity['word'], 'label': entity['entity'][2:]}
39
+ else:
40
+ current_entity['text'] += entity['word']
41
+ elif entity['entity'].startswith("E-"):
42
+ if current_entity:
43
+ current_entity['text'] += entity['word']
44
+ current_entity['label'] = entity['entity'][2:]
45
+ entity_count[(current_entity['text'], current_entity['label'])] += 1
46
+ current_entity = None
47
  else:
48
+ ner_results = nlp_ner(batch_text)
49
+ current_entity = None
50
+ for entity in ner_results:
51
+ if entity['entity'].startswith("B-") or entity['entity'].startswith("M-") or entity['entity'].startswith("I-"):
52
+ if current_entity is None:
53
+ current_entity = {'text': entity['word'], 'label': entity['entity'][2:]}
54
+ else:
55
+ current_entity['text'] += entity['word']
56
+ elif entity['entity'].startswith("E-"):
57
+ if current_entity:
58
+ current_entity['text'] += entity['word']
59
+ current_entity['label'] = entity['entity'][2:]
60
+ entity_count[(current_entity['text'], current_entity['label'])] += 1
61
+ current_entity = None
62
 
63
+ output = []
64
+ for (name, label), count in entity_count.items():
65
+ if count >= min_count and (not selected_entities or label in selected_entities):
66
+ output.append(f"{name}={label}={count}")
67
 
68
+ return "\n".join(output)
69
 
 
 
 
 
70
 
71
  css = '''
72
  h1#title {
 
75
  '''
76
 
77
theme = gr.themes.Soft()

demo = gr.Blocks(css=css, theme=theme)

with demo:
    # Upload widget restricted to plain-text files; passed to ner() as `file`.
    input_file = gr.File(label="Upload File (.txt)", file_types=[".txt"])

    # Optional label filter; an empty selection means "keep all labels"
    # (ner() treats a falsy selection as no filter).
    entity_filter = gr.CheckboxGroup(
        label="Entities",
        choices=["PER", "ORG", "LOC", "GPE"],
        type="value"
    )

    # Minimum occurrence count for an entity to appear in the output.
    count_entities = gr.Number(
        label="Frequency",
        minimum=1,
        maximum=10,
        step=1,
        value=3
    )

    # Read-only results box with a copy button.
    output_text = gr.Textbox(label="Output", show_copy_button=True, interactive=False, lines=10, max_lines=20)

    # NOTE(review): nesting a gr.Interface inside gr.Blocks renders the
    # Interface's own layout within this Blocks page — confirm this is the
    # intended composition rather than wiring a button + click handler.
    interface = gr.Interface(
        fn=ner,
        inputs=[input_file, entity_filter, count_entities],
        outputs=[output_text],
        allow_flagging="never",
    )

demo.launch()