File size: 1,508 Bytes
d7c8091
 
26ac453
6afda5c
d7c8091
6085bca
 
d7c8091
 
 
6085bca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a8eac5
 
 
 
6085bca
 
4a8eac5
 
 
 
 
 
 
 
 
2c439c5
4a8eac5
 
6085bca
d7c8091
26ac453
1faf45c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from io import StringIO, BytesIO

import gradio as gr
from pdfminer.high_level import extract_text
from transformers import pipeline
import pandas as pd
import numpy as np

# Checkpoint identifier, used for both the model and its tokenizer.
_NER_CHECKPOINT = "carblacac/ner-investing"

# "ner" pipeline built once when the module is loaded; called as nlp(text).
nlp = pipeline("ner", model=_NER_CHECKPOINT, tokenizer=_NER_CHECKPOINT)

class Group:
    """Number consecutive runs of equal values.

    The instance remembers the last value it was shown and the id it handed
    out for it. Showing the same value again returns the same id; showing a
    different value moves on to the next id.
    """

    def __init__(self):
        # text: value of the current run; id: id handed out for that run.
        self.text = ''
        self.id = 0

    def getgroup(self, text):
        """Return the run id for ``text``.

        The id is unchanged when ``text`` equals the previously seen value
        and is incremented by one otherwise. A fresh instance starts with
        the empty string as its "previous" value and 0 as its id.
        """
        if self.text != text:
            # A new run starts here: remember its value, advance the counter.
            self.text = text
            self.id += 1
        return self.id
            
# Module-level Group instance: its run counter and last-seen value persist
# across calls for as long as the module stays loaded.
grp_gen = Group()

def entities_to_df(entities):
    """Collapse per-token entity records into one row per run of equal labels.

    Args:
        entities: sequence of dicts, each providing at least the keys
            ``'entity'``, ``'start'``, ``'end'`` and ``'word'``. May be empty.

    Returns:
        A DataFrame indexed by ``'group'`` with the columns, in this order:

        * ``start``  - smallest ``start`` in the run
        * ``end``    - largest ``end`` in the run
        * ``entity`` - 1-element array holding the run's (prefix-stripped) label
        * ``word``   - the run's ``word`` values joined with single spaces

        ``group`` numbers the runs 1, 2, 3, ... and restarts on every call, so
        the result depends only on ``entities``. An empty input yields an
        empty frame with the same columns.
    """
    columns = ['start', 'end', 'entity', 'word']

    if not entities:
        # pd.DataFrame([]) has no 'entity' column; return early instead of
        # failing with KeyError when nothing was recognised.
        empty = pd.DataFrame(columns=columns)
        empty.index.name = 'group'
        return empty

    df = pd.DataFrame(entities)

    # Drop the first two characters of every label (presumably a "B-"/"I-"
    # style prefix - confirm against the model's label set).
    df['entity'] = df['entity'].str[2:]

    # A row opens a new run whenever its label differs from the row above it.
    # Computed locally rather than through shared module state so that calls
    # cannot influence each other.
    # NOTE(review): runs are defined by label equality of neighbouring rows
    # only, so two separate mentions with the same label and no differently
    # labelled row between them are merged into a single span.
    labels = df['entity']
    df['group'] = labels.ne(labels.shift()).cumsum()
    group_tag = df.groupby(by='group')

    # String aliases select the pandas reductions directly (passing the
    # builtins min/max emits a FutureWarning on recent pandas).
    img_tagging = group_tag.agg({
        'start': 'min',
        'end': 'max',
        'word': lambda words: " ".join(words),
    })

    # Build the 'entity' cells outside agg() so each one is always a
    # 1-element array, whatever agg() does with array-valued reducers.
    entity_cells = np.empty(len(img_tagging), dtype=object)
    for pos, (_, run_labels) in enumerate(group_tag['entity']):
        entity_cells[pos] = np.unique(run_labels.to_numpy(dtype=object))
    img_tagging.insert(2, 'entity', entity_cells)

    return img_tagging
    
    
def transform_entity_type(entities):
    """Replace each record's ``'entity'`` value with its element at index 0.

    The dicts are modified in place and the same list object is returned.
    """
    for record in entities:
        record.update(entity=record['entity'][0])
    return entities


def highlight_text(fileObj):
    """Extract the text of an uploaded file and tag the entities found in it.

    Args:
        fileObj: content of the uploaded file as a bytes-like object; it is
            wrapped in a ``BytesIO`` and passed to ``extract_text``.

    Returns:
        A dict with the extracted text under ``"text"`` and the list of
        grouped entity records under ``"entities"``.
    """
    text = extract_text(BytesIO(fileObj))

    # Token-level predictions -> one record per grouped span.
    spans = entities_to_df(nlp(text))
    records = transform_entity_type(spans.to_dict('records'))

    return {"text": text, "entities": records}

# Web UI: a single uploaded file (delivered as bytes) goes into
# highlight_text, and the result is rendered by a HighlightedText component.
# NOTE(review): the input uses the `gr.inputs` namespace while the output uses
# the top-level `gr` namespace - confirm both against the installed Gradio.
demo = gr.Interface(
    fn=highlight_text,
    inputs=gr.inputs.File(file_count="single", type="bytes"),
    outputs=gr.HighlightedText(),
)
demo.launch()