Spaces:
Sleeping
Sleeping
File size: 1,508 Bytes
d7c8091 26ac453 6afda5c d7c8091 6085bca d7c8091 6085bca 4a8eac5 6085bca 4a8eac5 2c439c5 4a8eac5 6085bca d7c8091 26ac453 1faf45c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
from io import StringIO, BytesIO
import gradio as gr
from pdfminer.high_level import extract_text
from transformers import pipeline
import pandas as pd
import numpy as np
# Build the NER pipeline once at import time so every call to
# highlight_text reuses the same model and tokenizer.
nlp = pipeline(
    "ner",
    model="carblacac/ner-investing",
    tokenizer="carblacac/ner-investing",
)
class Group:
    """Hand out a running id for consecutive runs of equal values.

    The instance remembers the last value it was given. Feeding the same
    value again returns the current id; feeding a different value bumps
    the id by one and returns the new id.
    """

    def __init__(self):
        # Last value seen and the id currently associated with it.
        self.text = ''
        self.id = 0

    def getgroup(self, text):
        """Return the run id for *text*, starting a new run if it changed."""
        if self.text != text:
            # A different value starts a new run.
            self.text = text
            self.id += 1
        return self.id
# Shared module-level Group instance: its last-seen value and run counter
# persist for as long as the module stays loaded.
grp_gen = Group()
def entities_to_df(entities):
    """Merge consecutive token predictions of the same type into spans.

    Args:
        entities: Token-level predictions as a list of dicts. Each dict must
            provide ``entity``, ``word``, ``start`` and ``end``. The first
            two characters of ``entity`` are dropped before grouping
            (presumably a ``B-``/``I-`` style prefix -- confirm against the
            model's label set).

    Returns:
        A DataFrame indexed by ``group`` (1-based number of the run within
        this call) with the columns ``start`` (minimum of the run), ``end``
        (maximum of the run), ``entity`` (array of the unique tags in the
        run) and ``word`` (the run's words joined by single spaces). An
        empty input yields an empty frame with the same columns.
    """
    columns = ['start', 'end', 'entity', 'word']
    if not entities:
        # pd.DataFrame([]) has no columns, so the lookups below would raise
        # KeyError; hand back an empty frame of the expected shape instead.
        return pd.DataFrame(columns=columns).rename_axis('group')

    df = pd.DataFrame(entities)
    # Keep the tags as plain Python objects: the per-group np.unique result
    # is an ndarray, which groupby.agg only accepts for object columns.
    tags = df['entity'].str[2:].astype(object)
    df['entity'] = tags
    # Number the runs locally: a new run starts wherever the tag differs
    # from the previous row. This keeps the result independent of any state
    # left behind by earlier or concurrent calls.
    run_ids = tags.ne(tags.shift()).cumsum().rename('group')
    return df.groupby(run_ids).agg({
        'start': 'min',
        'end': 'max',
        'entity': np.unique,
        'word': lambda words: " ".join(words),
    })
def transform_entity_type(entities):
    """Replace each record's ``entity`` value with its first element.

    The records are modified in place and the same list is returned.

    Args:
        entities: List of dicts whose ``entity`` value is indexable.

    Returns:
        The list that was passed in, with every ``entity`` collapsed.
    """
    for record in entities:
        first_label = record['entity'][0]
        record.update(entity=first_label)
    return entities
def highlight_text(fileObj):
    """Extract the text of an uploaded PDF and tag its named entities.

    Args:
        fileObj: Raw bytes of the uploaded file; they are wrapped in an
            in-memory stream before being handed to pdfminer.

    Returns:
        A dict with ``"text"`` (the extracted text) and ``"entities"`` (the
        grouped entity records, each with a single ``entity`` label).
    """
    text = extract_text(BytesIO(fileObj))
    # Token-level predictions -> merged spans -> plain list of dicts.
    grouped = entities_to_df(nlp(text))
    spans = transform_entity_type(grouped.to_dict('records'))
    return {"text": text, "entities": spans}
# Wire highlight_text into a web UI (one uploaded file in, highlighted text
# out) and start serving as soon as the module is executed.
# NOTE(review): gr.inputs is Gradio's legacy input namespace -- confirm it
# still exists in the pinned Gradio version before upgrading.
gr.Interface(
    fn=highlight_text,
    inputs=gr.inputs.File(file_count="single", type="bytes"),
    outputs=gr.HighlightedText(),
).launch()
|