|
import difflib
import html
import time

import spacy
import streamlit as st
from paraphrase_metrics import metrics as pm
|
|
|
# Title passed to Streamlit's page configuration; this is the first Streamlit
# call the script makes.
PAGE_TITLE = "TextDiff Visualizer"
st.set_page_config(page_title=PAGE_TITLE)
|
|
|
# Markup for one highlighted run of text. Hovering calls the matching
# ``chbg_<id>`` JavaScript function to recolour the span and its counterpart.
_SPAN_TEMPLATE = (
    '<span style="background-color:{colour};color:black;border-radius:2px;"'
    ' onmouseover="chbg_{span_id}(\'cyan\')"'
    ' onmouseout="chbg_{span_id}(\'{colour}\')"'
    " id='{span_id}'>{text}</span>"
)

# JavaScript function that recolours a span and its peer in the other paragraph.
_HOVER_FUNCTION_TEMPLATE = """function chbg_{own}(colour){{
document.getElementById('{own}').style.backgroundColor=colour;
document.getElementById('{peer}').style.backgroundColor=colour;
}}"""


def _highlight_span(span_id, colour, text):
    """Return one hover-aware ``<span>`` whose content is HTML-escaped *text*."""
    return _SPAN_TEMPLATE.format(
        colour=colour,
        span_id=span_id,
        # Escape only &, < and > so ordinary text is emitted unchanged while
        # markup characters in the input can no longer alter the page.
        text=html.escape(text, quote=False),
    )


def _hover_script(id_pairs):
    """Return a ``<script>`` element defining one hover function per (own, peer) id pair."""
    functions = [
        _HOVER_FUNCTION_TEMPLATE.format(own=own, peer=peer)
        for own, peer in id_pairs
    ]
    return "<script>\n" + "\n".join(functions) + "\n</script>"


def render_single_para(paragraph, segment_info, prefix="a", other="b", gap=" "):
    """Render *paragraph* as an HTML ``<p>`` of alternating diff/match spans.

    Args:
        paragraph: The raw text to render. Slices of it are HTML-escaped
            before being placed in the markup.
        segment_info: Sequence of ``[start, size]`` entries describing the
            matching runs inside *paragraph*, in order. The text between two
            consecutive runs (or before the first one) is treated as
            differing text.
        prefix: Id prefix for the spans of this paragraph.
        other: Id prefix of the counterpart paragraph; hovering a span also
            recolours the span with the same index under this prefix.
        gap: Separator placed between every generated HTML fragment.

    Returns:
        A string starting with ``<p>`` and ending with ``</p>``. For each
        entry of *segment_info* it contains a ``<script>`` with the hover
        handlers, a LightCoral span (differing text, id ``<prefix>_<i>_1``)
        and a LightGreen span (matching text, id ``<prefix>_<i>_2``).
    """
    fragments = ["<p>"]
    previous_end = 0  # end offset of the previous matching run
    for index, segment in enumerate(segment_info):
        start, size = segment[0], segment[1]

        diff_id = f"{prefix}_{index}_1"
        same_id = f"{prefix}_{index}_2"
        diff_id_other = f"{other}_{index}_1"
        same_id_other = f"{other}_{index}_2"

        # Offsets refer to the raw paragraph, so slice first and let the
        # span helper escape each slice afterwards.
        diff_span = _highlight_span(
            diff_id, "LightCoral", paragraph[previous_end:start]
        )
        same_span = _highlight_span(
            same_id, "LightGreen", paragraph[start:start + size]
        )
        script = _hover_script(
            [(diff_id, diff_id_other), (same_id, same_id_other)]
        )

        fragments += [script, diff_span, same_span]
        previous_end = start + size

    fragments.append("</p>")
    return gap.join(fragments)
|
|
|
def _lower_keep_length(text):
    """Lowercase *text* while guaranteeing the result has the same length."""
    lowered = text.lower()
    if len(lowered) == len(text):
        return lowered
    # A few characters (e.g. U+0130) lowercase to more than one code point.
    # Leave those untouched so that offsets computed on the result still
    # index correctly into the original string.
    return "".join(
        low if len(low) == 1 else char
        for char, low in ((char, char.lower()) for char in text)
    )


def render_diff(a_parapgraph, b_parapgraph, gap=" ", prefix=None):
    """Build a two-column HTML table highlighting how two texts differ.

    The texts are compared character by character, ignoring case, with
    ``difflib.SequenceMatcher`` (``autojunk`` disabled). Each text is then
    rendered by ``render_single_para`` using the matching-block offsets, and
    the two paragraphs are placed side by side in one table row.

    Args:
        a_parapgraph: Text shown in the left cell. (The misspelled parameter
            name is kept so existing keyword callers keep working.)
        b_parapgraph: Text shown in the right cell.
        gap: Separator forwarded to ``render_single_para``.
        prefix: Common id prefix for the generated spans. When ``None`` the
            current Unix time in whole seconds is used.

    Returns:
        The HTML ``<table>`` markup as a string.
    """
    if prefix is None:
        prefix = str(int(time.time()))

    # Match on length-preserving lowercase copies: the block offsets are used
    # to slice the ORIGINAL strings, so both must line up index for index.
    matcher = difflib.SequenceMatcher(
        None,
        _lower_keep_length(a_parapgraph),
        _lower_keep_length(b_parapgraph),
        autojunk=False,
    )
    blocks = matcher.get_matching_blocks()

    a_html_paragraph = render_single_para(
        a_parapgraph,
        [[block.a, block.size] for block in blocks],
        gap=gap,
        prefix=prefix + "_a",
        other=prefix + "_b",
    )
    b_html_paragraph = render_single_para(
        b_parapgraph,
        [[block.b, block.size] for block in blocks],
        gap=gap,
        prefix=prefix + "_b",
        other=prefix + "_a",
    )

    cell_open = '<td style="border: 1px solid silver;padding:0.4em;border-radius:4px;">'
    return (
        '<table style="width:100%;font-family:sans-serif;font-size:large;">'
        '<tr style="background-color:white;padding=1px;">\n'
        + cell_open + a_html_paragraph + "</td>\n"
        + cell_open + b_html_paragraph + "</td>\n"
        + "</tr></table>"
    )
|
|
|
# NOTE(review): newer Streamlit releases deprecate ``st.cache`` in favour of
# ``st.cache_resource`` -- confirm against the Streamlit version this app pins.
@st.cache(allow_output_mutation=True)
def load_model():
    """Load and return the spaCy ``en_core_web_sm`` pipeline.

    The function is wrapped in ``st.cache`` so Streamlit can reuse the loaded
    pipeline instead of loading it again on every script run.
    """
    return spacy.load("en_core_web_sm")
|
|
|
# Obtain the spaCy pipeline before any page content is emitted.
nlp = load_model()

st.markdown("### TextDiff Visualizer")

mode = st.selectbox("Input", ["Custom", "Examples"])

if mode != "Custom":
    # Every example stores both sentences in one string, separated by " ; ".
    example_pairs = [
        "The top rate will go to 4.45 percent for all residents with taxable incomes above $500,000. ; For residents with incomes above $500,000, the income-tax rate will increase to 4.45 percent.",
        "However, prosecutors have declined to take criminal action against guards, though Fine said his inquiry is not finished. ; Prosecutors have declined to take criminal action against corrections officers, although Fine said his inquiry was not finished.",
        "In trading on the New York Stock Exchange, Kraft shares fell 25 cents to close at $32.30. ; Kraft's shares fell 25 cents to close at $32.30 yesterday on the New York Stock Exchange.",
        "An attempt last month in the Senate to keep the fund open for another year fell flat. ; An attempt to keep the fund open for another year fell flat in the Senate last month.",
        "Prisoners were tortured and executed -- their ears and scalps severed for souvenirs. ; They frequently tortured and shot prisoners, severing ears and scalps for souvenirs.",
        "American has laid off 6,500 of its flight attendants since Dec. 31. ; Since October 2001, American has laid off 6,149 flight attendants.",
    ]
    examples = st.radio("Examples", example_pairs)
    text_A, text_B = examples.split(" ; ")
else:
    # Two free-text boxes side by side, pre-filled with a sample pair.
    col1, col2 = st.columns(2)
    text_A = col1.text_area(
        "Text 1",
        value="The findings are being published July 1st in the Annals of Internal Medicine.",
    )
    text_B = col2.text_area(
        "Text 2",
        value="The findings are published in the July 1st issue of the Annals of Internal Medicine.",
    )

st.markdown("Visualization")

# Render the side-by-side diff table and embed it as an HTML component.
st.components.v1.html(render_diff(text_A, text_B))
|
|
|
# These two scores are computed directly on the raw input strings.
dist = round(pm.edit_distance(text_A, text_B), 2)
bleu = round(pm.self_bleu(text_A, text_B), 2)

# WPD and LD are computed on the output of the spaCy pipeline instead, so the
# texts are run through ``nlp`` first (the names are rebound to that output).
text_A = nlp(text_A)
text_B = nlp(text_B)
wpd = round(pm.wpd(text_A, text_B), 2)
ld = round(pm.ld(text_A, text_B), 2)

# One column per metric, filled left to right.
labelled_scores = [
    ("WPD", wpd),
    ("LD", ld),
    ("Edit Dist.", dist),
    ("BLEU", bleu),
]
for column, (label, score) in zip(st.columns(4), labelled_scores):
    column.metric(label, score)
|
|
|
# Collapsible help section explaining the four metric labels.
# The heading previously read "Explantion"; the spelling is corrected here.
with st.expander("More info"):
    st.markdown("""**Explanation of Metrics**

* **WPD**: Word Position Deviation measures structural changes between two paraphrases
* **LD**: Lexical Deviation measures degree of vocabulary changes between two paraphrases
* **Edit Dist.**: Levenshtein edit distance
* **BLEU**: SELF-BLEU score

For more information, see https://github.com/tlkh/paraphrase-metrics
""")