File size: 3,719 Bytes
95ba32b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import streamlit as st
import difflib
import spacy


# Choose the caching decorator once. `st.cache(allow_output_mutation=True)`
# is the legacy spelling; Streamlit versions that ship `st.cache_resource`
# recommend it for shared, non-serialisable objects such as ML models.
# The legacy call is kept only as a fallback for older installs.
if hasattr(st, "cache_resource"):
    _cache_model = st.cache_resource
else:
    _cache_model = st.cache(allow_output_mutation=True)


@_cache_model
def load_model():
    """Load the spaCy ``en_core_web_md`` pipeline, cached by Streamlit.

    Returns:
        The object returned by ``spacy.load('en_core_web_md')``.
    """
    return spacy.load('en_core_web_md')


## Layout stuff
# Emoji literals below are real Unicode characters; the previous text was
# mis-decoded UTF-8, which is not a valid emoji for `page_icon`.
st.set_page_config(
    page_title="Compare Demo",
    page_icon="🔗",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        # NOTE(review): this address looks like a redaction placeholder left
        # by the hosting page - confirm and restore the real contact address.
        'Get Help': 'mailto:[email protected]',
        'Report a bug': None,
        'About': "## This a demo showcasing different Legal AI Actions"
    }
)

st.title('🔗 Compare Demo')
st.write("""
This demo shows how AI can be used to compare passages of text.
""")
st.write("**👈 Enter two passages of text on the left** and hit the button **Compare** to see the demo in action")

# Show a spinner while the (cached) spaCy pipeline is being obtained.
with st.spinner('⚙️ Loading model...'):
    nlp = load_model()

# Default passages that pre-fill the two sidebar text areas.
EXAMPLE_TEXT_1 = (
    "This Agreement shall be governed by and interpreted under the laws of the\n"
    "State of Delaware without regard to its conflicts of law provisions."
)

EXAMPLE_TEXT_2 = (
    "This agreement will be governed by and must be construed in accordance "
    "with the laws of the State of Israel."
)

# Sidebar inputs: two passages to compare plus the button that triggers
# the comparison further down the script.
text_1 = st.sidebar.text_area(
    'Enter a passage of text',
    value=EXAMPLE_TEXT_1,
    height=150,
    key='input1',
)
text_2 = st.sidebar.text_area(
    'Enter a second passage of text',
    value=EXAMPLE_TEXT_2,
    height=150,
    key='input2',
)

button = st.sidebar.button(
    'Compare',
    type='primary',
    use_container_width=True,
)


def get_tokens(doc):
    """Return the ``lower`` attribute of every token in *doc*, in order.

    The resulting list is what the script feeds to
    ``difflib.SequenceMatcher`` for case-insensitive token comparison.

    NOTE(review): on spaCy tokens ``lower`` (no trailing underscore) is
    presumably the integer ID of the lowercased text rather than the string
    itself; equality-based matching works either way - confirm against the
    spaCy Token API.
    """
    lowered = []
    for tok in doc:
        lowered.append(tok.lower)
    return lowered


def add_md_color(text, match):
    """Wrap *text* in a ``:color[...]`` markdown colour directive.

    Args:
        text: The content to place inside the brackets.
        match: Truthy selects ``green``; falsy selects ``red``.

    Returns:
        A string of the form ``":green[text]"`` or ``":red[text]"``.
    """
    if match:
        shade = 'green'
    else:
        shade = 'red'
    return ":{}[{}]".format(shade, text)


def create_str_output(doc, matching_idxs):
    """Render *doc* as space-joined, colour-tagged markdown tokens.

    Args:
        doc: Iterable of tokens exposing ``.i`` (index) and ``.text``.
        matching_idxs: Iterable of ``(start, end)`` pairs; a token counts as
            matched when its index lies in ``range(start, end)`` for any pair.

    Returns:
        One string in which every token's text has been passed through
        ``add_md_color`` (matched or not) and the pieces joined by a single
        space.
    """
    coloured = (
        add_md_color(
            tok.text,
            any(tok.i in range(lo, hi) for lo, hi in matching_idxs),
        )
        for tok in doc
    )
    return ' '.join(coloured)


# Everything below runs only on the script pass triggered by the Compare button.
if button:

    with st.spinner('⚙️ Comparing Texts...'):
        doc_1 = nlp(text_1)
        doc_2 = nlp(text_2)

    st.header('🧪 Comparison')
    st.markdown('We can highlight the :green[similarities] and :red[differences] across the two texts')
    col1, col2 = st.columns(2)

    # A single matcher over the lowercase token keys serves both the
    # highlighted view and the textual-similarity score below.
    sm = difflib.SequenceMatcher(None, get_tokens(doc_1), get_tokens(doc_2))
    matching_blocks = sm.get_matching_blocks()

    # Convert each (a, b, n) block into half-open index spans per document.
    # difflib's final zero-length sentinel block yields an empty span.
    doc_1_matching_idxs = []
    doc_2_matching_idxs = []
    for a, b, n in matching_blocks:
        doc_1_matching_idxs.append((a, a + n))
        doc_2_matching_idxs.append((b, b + n))

    with col1:
        st.markdown(create_str_output(doc_1, doc_1_matching_idxs))
    with col2:
        st.markdown(create_str_output(doc_2, doc_2_matching_idxs))

    col1, col2, col3 = st.columns(3)

    with col1:
        # Simple sequence matching on tokens, reusing the matcher built above.
        st.subheader('📑 Textual Similarity')
        st.markdown('We can measure the similarity based on the *wording* of the two texts.')
        st.metric(label='Textual Similarity', value=f"{sm.ratio() * 100:.1f}%")

    with col2:
        # NOTE(review): the original emoji here was truncated by an encoding
        # error; the memo emoji is a best guess - confirm the intended icon.
        st.subheader('📝 Linguistic Similarity')
        st.markdown(
            'We can measure the similarity based on the *linguistic features* of the two texts.')
        # Compare the two texts as sequences of part-of-speech tags.
        postags_1 = [token.pos_ for token in doc_1]
        postags_2 = [token.pos_ for token in doc_2]
        pos_sm = difflib.SequenceMatcher(None, postags_1, postags_2)
        st.metric(label='Linguistic Similarity', value=f"{pos_sm.ratio() * 100:.1f}%")

    with col3:
        st.subheader('💭 Semantic Similarity')
        st.markdown('We can measure the similarity based on the *meaning* of the two texts.')
        # Score comes from the pipeline's own document-to-document similarity.
        st.metric(label='Semantic Similarity', value=f"{doc_1.similarity(doc_2) * 100:.1f}%")