File size: 2,809 Bytes
b8d16b2
 
65e9efa
b8d16b2
65e9efa
 
b8d16b2
65e9efa
b8d16b2
65e9efa
b8d16b2
65e9efa
b8d16b2
 
65e9efa
 
 
 
b8d16b2
 
169138c
b8d16b2
65e9efa
b8d16b2
 
65e9efa
b8d16b2
65e9efa
 
59135d9
 
 
 
 
 
 
053f2f3
59135d9
169138c
 
3f1f888
169138c
 
65e9efa
169138c
 
59135d9
 
169138c
 
b8d16b2
169138c
b8d16b2
169138c
 
 
 
 
 
 
053f2f3
169138c
 
 
 
 
 
 
 
053f2f3
169138c
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import random

import spacy
import srsly
import streamlit as st

# Transformer-based English pipeline used to tag the text shown in the app.
nlp = spacy.load("en_core_web_trf")

# Pre-processed grants, read fully into memory so one can be sampled later.
grants = list(srsly.read_jsonl("data/processed/entities.jsonl"))

# The two location labels are highlighted with the same colour; displaCy is
# restricted to those labels through the "ents" option.
_LOCATION_LABELS = ("GPE", "LOC")
colors = dict.fromkeys(_LOCATION_LABELS, "#5cff84")
options = {"ents": list(_LOCATION_LABELS), "colors": colors}

# Bordered, horizontally scrollable container for the rendered markup.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""


def render_entities(doc, colors: dict, options: dict) -> str:
    """
    Render the entities of a spaCy doc as a single line of HTML.

    Args:
        doc: Parsed spaCy doc whose entities should be highlighted.
        colors: Mapping of entity label to colour. When non-empty it is used
            as the ``"colors"`` entry of the displaCy options and takes
            precedence over a ``"colors"`` entry already in ``options``.
        options: Options for displaCy's "ent" visualiser (for example
            ``"ents"``). The mapping is copied, never modified.

    Returns:
        The displaCy markup with every newline replaced by a space.
    """

    # Work on a copy so the caller's dict is left untouched, and so that the
    # explicitly passed colours are actually applied (they used to be ignored).
    render_options = dict(options) if options else {}
    if colors:
        render_options["colors"] = colors

    html = spacy.displacy.render(doc, style="ent", options=render_options)

    return html.replace("\n", " ")


def show_example(text):
    """
    Write the highlighted entities to the page inside the HTML wrapper.

    The entities are taken from the module-level ``doc``, not from ``text``:
    the caller must have assigned ``doc`` before calling this function.
    ``text`` is returned unchanged.
    """
    # `doc`, `colors` and `options` are looked up in module scope at call time.
    markup = HTML_WRAPPER.format(render_entities(doc, colors, options))
    st.write(markup, unsafe_allow_html=True)

    return text


# Main page heading.
# NOTE(review): the characters after "Demo" look like mis-decoded UTF-8 emoji;
# confirm the file's encoding before changing this literal.
st.header("Location Recognition Demo πŸ”ŽπŸŒ†πŸŒ")
# Sidebar: a heading followed by a Markdown description of the demo and of the
# two entity labels (GPE, LOC) it highlights.
# NOTE(review): "innevitably" in the text below is presumably a typo for
# "inevitably"; left as is because it is user-facing text.
st.sidebar.header("Information ℹ️ ")
st.sidebar.markdown(
    """
This example application accompanies the blog post: [Extracting useful information from documents with Named Entity Recognition](https://medium.com/@reproducible/extracting-useful-information-from-documents-with-named-entity-recognition-4e009b60a8c).
It uses a pre-trained Named Entity Recognition (NER) model from the [spaCy](https://spacy.io/) library to extract locations from your own examples, or a sample of grant applications from The Wellcome Trust.
The application will extract the following types of location entity:

* __GPE__: Geopolitical entities (countries, cities, states)
* __LOC__: Locations (mountains, rivers, lakes)

This model will innevitably make some mistakes; it was trained on a large generic corpus of text, and the Wellcome Trust grant applications come from a very specific domain. We could improve this model by fine-tuning it on data from this domain.
"""
)

# Choose what the text box starts with: a randomly drawn grant when the button
# reports a press on this run, otherwise the placeholder prompt.
if st.button("Show Wellcome example", key="text"):
    sample = random.choice(grants)
    initial_text = sample["text"]
else:
    initial_text = "Enter your text here"

text = st.text_area(
    "Add your own text or click the button to see a Wellcome example",
    value=initial_text,
    height=200,
    help="Enter your own text and press CTRL + ENTER to search for entities",
)

# show_example() reads the module-level `doc`, so it must be assigned first.
doc = nlp(text)
show_example(text)

# Attribution for the sample data.
# NOTE(review): "publishes" below is presumably a typo for "published"; left
# as is because it is user-facing text.
st.markdown(
    "Examples from The Wellcome Trust are taken from data that are publishes openly at [360 Giving](https://data.threesixtygiving.org/). They are published under a [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/) license."
)