Spaces:
Running
Running
File size: 4,683 Bytes
6f595b5 915e04a 6f595b5 3086575 6f595b5 3086575 6f595b5 0302fc1 e7ee4c6 6f595b5 e7ee4c6 6f595b5 e7ee4c6 6f595b5 3086575 6f595b5 3086575 6f595b5 3086575 6f595b5 3086575 6f595b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import streamlit as st
import pandas as pd
from transformers import pipeline
from stqdm import stqdm
from simplet5 import SimpleT5
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
@st.cache(allow_output_mutation=True)
def load_t5():
    """Load the pretrained ``t5-base`` seq2seq model and its tokenizer.

    The result is cached by Streamlit so the weights are loaded once rather
    than on every script rerun. ``allow_output_mutation=True`` is required
    for objects like models: without it Streamlit hashes the returned
    objects on every cache hit to detect mutation, which is very slow (or
    fails outright) for large model/tokenizer objects.

    Returns:
        tuple: ``(model, tokenizer)`` as returned by
        ``AutoModelForSeq2SeqLM.from_pretrained`` and
        ``AutoTokenizer.from_pretrained`` for the ``"t5-base"`` checkpoint.
    """
    checkpoint = "t5-base"
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer
@st.cache(allow_output_mutation=True)
def custom_model():
    """Build the summarization pipeline backed by the local ``my_awesome_sum/`` model.

    Cached by Streamlit so the pipeline is constructed once instead of on
    every script rerun. ``allow_output_mutation=True`` stops Streamlit from
    hashing the returned pipeline object on each cache hit, which is the
    documented setting for caching model-like objects.

    Returns:
        The object returned by ``transformers.pipeline("summarization", ...)``.
    """
    return pipeline("summarization", model="my_awesome_sum/")
@st.cache
def convert_df(df):
    """Serialize ``df`` as CSV (index column omitted) and return UTF-8 bytes.

    The conversion is cached so that it is not recomputed on every rerun
    of the script.
    """
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
def load_one_line_summarizer(model):
    """Load the ``snrspeaks/t5-one-line-summary`` weights into ``model``.

    This function is deliberately NOT wrapped in ``st.cache``. It does its
    work as a side effect on ``model``; a cache hit would skip the body, so
    a freshly constructed model passed in on a later run would be left
    without weights and every prediction on it would fail.

    Args:
        model: Object exposing ``load_model(model_type, model_dir)``; the
            caller in this file passes a new ``SimpleT5`` instance.

    Returns:
        Whatever ``model.load_model`` returns.
    """
    return model.load_model("t5", "snrspeaks/t5-one-line-summary")
# --- Page setup and input widgets -------------------------------------------
# Configure the page (wide layout, browser-tab title) and render the heading.
st.set_page_config(layout="wide", page_title="Amazon Review Summarizer")
st.title("Amazon Review Summarizer")
# File upload restricted to Excel/CSV extensions; the value is None until a
# file has been chosen.
uploaded_file = st.file_uploader("Choose a file", type=["xlsx", "xls", "csv"])
# Which summarizer to run; the selected label is compared verbatim by the
# "Process" handler and is also used in the download file name.
summarizer_option = st.selectbox(
"Select Summarizer",
("Custom trained on the dataset", "t5-base", "t5-one-line-summary"),
)
# CSS that sets `visibility: hidden` on the `#MainMenu` and `footer` elements.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
# unsafe_allow_html=True is needed so the <style> tag is emitted as raw HTML.
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
# Empty placeholder element; it is not referenced again in the visible code.
ps = st.empty()
def _read_reviews(uploaded):
    """Parse the uploaded CSV/Excel file into a DataFrame with lower-cased column names.

    Raises:
        ValueError: If the file extension is not ``csv``, ``xlsx`` or ``xls``.
    """
    # split(".")[-1] yields the extension WITHOUT the dot ("csv", not ".csv").
    extension = uploaded.name.split(".")[-1].lower()
    if extension == "csv":
        frame = pd.read_csv(uploaded)
    elif extension == "xlsx":
        frame = pd.read_excel(uploaded, engine="openpyxl")
    elif extension == "xls":
        # openpyxl cannot open legacy .xls workbooks, so let pandas pick the
        # engine for that format instead of forcing openpyxl.
        frame = pd.read_excel(uploaded)
    else:
        raise ValueError(f"Unsupported file type: {uploaded.name}")
    # str() guards against non-string headers (e.g. numeric column labels).
    frame.columns = [str(column).lower() for column in frame.columns]
    return frame


def _summarize_custom(reviews):
    """Summarize each review with the custom pipeline; return ``(summaries, failed_count)``."""
    summarizer = custom_model()
    summaries = []
    failed = 0
    for review in stqdm(reviews):
        try:
            result = summarizer(
                f"summarize: {review}", max_length=50, early_stopping=True
            )
            summaries.append(result[0]["summary_text"])
        except Exception:
            # Best effort: keep one entry per input row so the output columns
            # stay aligned, and count the failure so it can be reported.
            summaries.append("")
            failed += 1
    return summaries, failed


def _summarize_t5_base(reviews):
    """Summarize each review with plain ``t5-base``; return ``(summaries, failed_count)``."""
    model, tokenizer = load_t5()
    summaries = []
    for review in stqdm(reviews):
        # The f-string also tolerates non-string cells (e.g. NaN), which
        # plain string concatenation would reject with a TypeError.
        input_ids = tokenizer.encode(
            f"summarize: {review}",
            return_tensors="pt",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        generated = model.generate(
            input_ids,
            min_length=80,
            max_length=150,
            length_penalty=20,
            num_beams=2,
        )
        summaries.append(tokenizer.decode(generated[0], skip_special_tokens=True))
    return summaries, 0


def _summarize_one_line(reviews):
    """Summarize each review with the one-line-summary model; return ``(summaries, failed_count)``."""
    model = SimpleT5()
    load_one_line_summarizer(model=model)
    summaries = []
    failed = 0
    for review in stqdm(reviews):
        try:
            summaries.append(model.predict(review)[0])
        except Exception:
            # Same best-effort policy as the custom pipeline: placeholder
            # entry keeps rows aligned with the input.
            summaries.append("")
            failed += 1
    return summaries, failed


# Maps each selectbox label to the function that produces its summaries.
_SUMMARIZERS = {
    "Custom trained on the dataset": _summarize_custom,
    "t5-base": _summarize_t5_base,
    "t5-one-line-summary": _summarize_one_line,
}


def _offer_download(reviews, summaries, option):
    """Render a download button for a two-column (text, summary) CSV."""
    output = pd.DataFrame({"text": reviews, "summary": summaries})
    st.download_button(
        label="Download data as CSV",
        data=convert_df(output),
        file_name=f"{option}_df.csv",
        mime="text/csv",
    )


def _process_upload(uploaded, option):
    """Read the upload, run the selected summarizer and offer the result as CSV.

    Problems the user can fix (no file, unreadable file, missing ``text``
    column) are reported in the UI instead of surfacing as tracebacks.
    """
    if uploaded is None:
        st.warning("Please upload a file before pressing Process.")
        return
    try:
        frame = _read_reviews(uploaded)
    except Exception as exc:
        # UI boundary: parser/engine errors vary by file type, so report
        # whatever went wrong rather than crashing the script run.
        st.error(f"Could not read {uploaded.name}: {exc}")
        return
    if "text" not in frame.columns:
        st.error('The uploaded file must contain a "text" column.')
        return
    summarize = _SUMMARIZERS.get(option)
    if summarize is None:
        return
    reviews = frame["text"].values.tolist()
    summaries, failed = summarize(reviews)
    if failed:
        st.warning(
            f"{failed} of {len(reviews)} rows could not be summarized; "
            "their summary is left empty."
        )
    _offer_download(reviews, summaries, option)


# Entry point: run the summarization only when the user clicks "Process".
if st.button("Process"):
    _process_upload(uploaded_file, summarizer_option)
|