Spaces:

ashhadahsan
/

summarizer-space

Running

App Files Files Community

ashhadahsan commited on Mar 22, 2023

Commit

3d51c8f

1 Parent(s): 5521456

added sub theme

Browse files

Files changed (1) hide show

app.py +162 -80

app.py CHANGED Viewed

@@ -13,9 +13,10 @@ import logging
 import pip
 date = datetime.now().strftime(r"%Y-%m-%d")
-model_classes ={
     0: "Ads",
     1: "Apps",
     2: "Battery",
@@ -32,7 +33,8 @@ model_classes ={
     13: "WiFi",
 }
-@st.cache(allow_output_mutation=True,suppress_st_warning=True)
 def load_t5():
     model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
@@ -40,29 +42,36 @@ def load_t5():
     return model, tokenizer
-@st.cache(allow_output_mutation=True,suppress_st_warning=True)
 def custom_model():
     return pipeline("summarization", model="my_awesome_sum/")
-@st.cache(allow_output_mutation=True,suppress_st_warning=True)
 def convert_df(df):
     # IMPORTANT: Cache the conversion to prevent computation on every rerun
     return df.to_csv(index=False).encode("utf-8")
-@st.cache(allow_output_mutation=True,suppress_st_warning=True)
 def load_one_line_summarizer(model):
     return model.load_model("t5", "snrspeaks/t5-one-line-summary")
-@st.cache(allow_output_mutation=True,suppress_st_warning=True)
 def classify_category():
     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
     new_model = load_model("model")
     return tokenizer, new_model
 st.set_page_config(layout="wide", page_title="Amazon Review Summarizer")
 st.title("Amazon Review Summarizer")
@@ -71,14 +80,23 @@ summarizer_option = st.selectbox(
     "Select Summarizer",
     ("Custom trained on the dataset", "t5-base", "t5-one-line-summary"),
 )
-classification = st.checkbox("Classify Category", value=True)
 ps = st.empty()
-if st.button("Process",type="primary"):
-    cancel_button=st.empty()
-    cancel_button2=st.empty()
-    cancel_button3=st.empty()
     if uploaded_file is not None:
         if uploaded_file.name.split(".")[-1] in ["xls", "xlsx"]:
@@ -89,33 +107,34 @@ if st.button("Process",type="primary"):
         columns = [x.lower() for x in columns]
         df.columns = columns
         print(summarizer_option)
         try:
             text = df["text"].values.tolist()
             if summarizer_option == "Custom trained on the dataset":
-                model = custom_model()
-                progress_text = "Summarization in progress. Please wait."
-                summary = []
-                for x in stqdm(range(len(text))):
-                    if cancel_button.button("Cancel",key=x):
-                        del model
-                        break
-                    try:
-                        summary.append(
-                            model(
-                                f"summarize: {text[x]}",
-                                max_length=50,
-                                early_stopping=True,
-                            )[0]["summary_text"]
-                        )
-                    except:
-                        pass
-                output = pd.DataFrame(
-                    {"text": df["text"].values.tolist(), "summary": summary}
-                )
                 if classification:
                     classification_token, classification_model = classify_category()
                     tf_batch = classification_token(
@@ -135,6 +154,27 @@ if st.button("Process",type="primary"):
                             keys = model_classes
                             classes.append(keys.get(label))
                         output["category"] = classes
                 csv = convert_df(output)
                 st.download_button(
@@ -144,34 +184,34 @@ if st.button("Process",type="primary"):
                     mime="text/csv",
                 )
             if summarizer_option == "t5-base":
-                model, tokenizer = load_t5()
-                summary = []
-                for x in stqdm(range(len(text))):
-                    if cancel_button2.button("Cancel",key=x):
-                        del model,tokenizer
-                        break
-                    tokens_input = tokenizer.encode(
-                        "summarize: " + text[x],
-                        return_tensors="pt",
-                        max_length=tokenizer.model_max_length,
-                        truncation=True,
-                    )
-                    summary_ids = model.generate(
-                        tokens_input,
-                        min_length=80,
-                        max_length=150,
-                        length_penalty=20,
-                        num_beams=2,
-                    )
-                    summary_gen = tokenizer.decode(
-                        summary_ids[0], skip_special_tokens=True
-                    )
-                    summary.append(summary_gen)
-                output = pd.DataFrame(
-                    {"text": df["text"].values.tolist(), "summary": summary}
-                )
                 if classification:
                     classification_token, classification_model = classify_category()
                     tf_batch = classification_token(
@@ -191,7 +231,27 @@ if st.button("Process",type="primary"):
                             keys = model_classes
                             classes.append(keys.get(label))
                         output["category"] = classes
                 csv = convert_df(output)
                 st.download_button(
                     label="Download data as CSV",
@@ -201,21 +261,22 @@ if st.button("Process",type="primary"):
                 )
             if summarizer_option == "t5-one-line-summary":
-                model = SimpleT5()
-                load_one_line_summarizer(model=model)
-                summary = []
-                for x in stqdm(range(len(text))):
-                    if cancel_button3.button("Cancel",key=x):
-                        del model
-                        break
-                    try:
-                        summary.append(model.predict(text[x])[0])
-                    except:
-                        pass
-                output = pd.DataFrame(
-                    {"text": df["text"].values.tolist(), "summary": summary}
-                )
                 if classification:
                     classification_token, classification_model = classify_category()
                     tf_batch = classification_token(
@@ -235,6 +296,27 @@ if st.button("Process",type="primary"):
                             keys = model_classes
                             classes.append(keys.get(label))
                         output["category"] = classes
                 csv = convert_df(output)
                 st.download_button(
@@ -251,4 +333,4 @@ if st.button("Process",type="primary"):
             )
             st.info("Text column must have amazon reviews", icon="ℹ️")
         except BaseException as e:
-            logging.exception("An exception was occurred")

 import pip
+import gc
 date = datetime.now().strftime(r"%Y-%m-%d")
+model_classes = {
     0: "Ads",
     1: "Apps",
     2: "Battery",
     13: "WiFi",
 }
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
 def load_t5():
     model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
     return model, tokenizer
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
 def custom_model():
     return pipeline("summarization", model="my_awesome_sum/")
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
 def convert_df(df):
     # IMPORTANT: Cache the conversion to prevent computation on every rerun
     return df.to_csv(index=False).encode("utf-8")
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
 def load_one_line_summarizer(model):
     return model.load_model("t5", "snrspeaks/t5-one-line-summary")
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
 def classify_category():
     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
     new_model = load_model("model")
     return tokenizer, new_model
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
+def classify_sub_theme():
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    new_model = load_model("sub_theme_model")
+    return tokenizer, new_model
 st.set_page_config(layout="wide", page_title="Amazon Review Summarizer")
 st.title("Amazon Review Summarizer")
     "Select Summarizer",
     ("Custom trained on the dataset", "t5-base", "t5-one-line-summary"),
 )
+col1, col2, col3 = st.columns([1, 1, 1])
+with col1:
+    summary_yes = st.checkbox("Summrization", value=False)
+with col2:
+    classification = st.checkbox("Classify Category", value=True)
+with col3:
+    sub_theme = st.checkbox("Sub theme classification", value=True)
 ps = st.empty()
+if st.button("Process", type="primary"):
+    cancel_button = st.empty()
+    cancel_button2 = st.empty()
+    cancel_button3 = st.empty()
     if uploaded_file is not None:
         if uploaded_file.name.split(".")[-1] in ["xls", "xlsx"]:
         columns = [x.lower() for x in columns]
         df.columns = columns
         print(summarizer_option)
+        output = pd.DataFrame()
         try:
             text = df["text"].values.tolist()
+            output["text"] = text
             if summarizer_option == "Custom trained on the dataset":
+                if summary_yes:
+                    model = custom_model()
+                    progress_text = "Summarization in progress. Please wait."
+                    summary = []
+                    for x in stqdm(range(len(text))):
+                        if cancel_button.button("Cancel", key=x):
+                            del model
+                            break
+                        try:
+                            summary.append(
+                                model(
+                                    f"summarize: {text[x]}",
+                                    max_length=50,
+                                    early_stopping=True,
+                                )[0]["summary_text"]
+                            )
+                        except:
+                            pass
+                    output["summary"] = summary
+                    del model
                 if classification:
                     classification_token, classification_model = classify_category()
                     tf_batch = classification_token(
                             keys = model_classes
                             classes.append(keys.get(label))
                         output["category"] = classes
+                    del classification_token, classification_model
+                if sub_theme:
+                    classification_token, classification_model = classify_sub_theme()
+                    tf_batch = classification_token(
+                        text,
+                        max_length=128,
+                        padding=True,
+                        truncation=True,
+                        return_tensors="tf",
+                    )
+                    with st.spinner(text="identifying sub theme"):
+                        tf_outputs = classification_model(tf_batch)
+                    classes = []
+                    with st.spinner(text="creating output file"):
+                        for x in stqdm(range(len(text))):
+                            tf_o = softmax(tf_outputs["logits"][x], axis=-1)
+                            label = np.argmax(tf_o, axis=0)
+                            keys = model_classes
+                            classes.append(keys.get(label))
+                        output["sub theme"] = classes
+                    del classification_token, classification_model
                 csv = convert_df(output)
                 st.download_button(
                     mime="text/csv",
                 )
             if summarizer_option == "t5-base":
+                if summary_yes:
+                    model, tokenizer = load_t5()
+                    summary = []
+                    for x in stqdm(range(len(text))):
+                        if cancel_button2.button("Cancel", key=x):
+                            del model, tokenizer
+                            break
+                        tokens_input = tokenizer.encode(
+                            "summarize: " + text[x],
+                            return_tensors="pt",
+                            max_length=tokenizer.model_max_length,
+                            truncation=True,
+                        )
+                        summary_ids = model.generate(
+                            tokens_input,
+                            min_length=80,
+                            max_length=150,
+                            length_penalty=20,
+                            num_beams=2,
+                        )
+                        summary_gen = tokenizer.decode(
+                            summary_ids[0], skip_special_tokens=True
+                        )
+                        summary.append(summary_gen)
+                    del model, tokenizer
+                    output["summary"] = summary
                 if classification:
                     classification_token, classification_model = classify_category()
                     tf_batch = classification_token(
                             keys = model_classes
                             classes.append(keys.get(label))
                         output["category"] = classes
+                    del classification_token, classification_model
+                if sub_theme:
+                    classification_token, classification_model = classify_sub_theme()
+                    tf_batch = classification_token(
+                        text,
+                        max_length=128,
+                        padding=True,
+                        truncation=True,
+                        return_tensors="tf",
+                    )
+                    with st.spinner(text="identifying sub theme"):
+                        tf_outputs = classification_model(tf_batch)
+                    classes = []
+                    with st.spinner(text="creating output file"):
+                        for x in stqdm(range(len(text))):
+                            tf_o = softmax(tf_outputs["logits"][x], axis=-1)
+                            label = np.argmax(tf_o, axis=0)
+                            keys = model_classes
+                            classes.append(keys.get(label))
+                        output["sub theme"] = classes
+                    del classification_token, classification_model
                 csv = convert_df(output)
                 st.download_button(
                     label="Download data as CSV",
                 )
             if summarizer_option == "t5-one-line-summary":
+                if summary_yes:
+                    model = SimpleT5()
+                    load_one_line_summarizer(model=model)
+                    summary = []
+                    for x in stqdm(range(len(text))):
+                        if cancel_button3.button("Cancel", key=x):
+                            del model
+                            break
+                        try:
+                            summary.append(model.predict(text[x])[0])
+                        except:
+                            pass
+                    output["summary"] = summary
+                    del model
                 if classification:
                     classification_token, classification_model = classify_category()
                     tf_batch = classification_token(
                             keys = model_classes
                             classes.append(keys.get(label))
                         output["category"] = classes
+                    del classification_token,classification_model
+                if sub_theme:
+                    classification_token, classification_model = classify_sub_theme()
+                    tf_batch = classification_token(
+                        text,
+                        max_length=128,
+                        padding=True,
+                        truncation=True,
+                        return_tensors="tf",
+                    )
+                    with st.spinner(text="identifying sub theme"):
+                        tf_outputs = classification_model(tf_batch)
+                    classes = []
+                    with st.spinner(text="creating output file"):
+                        for x in stqdm(range(len(text))):
+                            tf_o = softmax(tf_outputs["logits"][x], axis=-1)
+                            label = np.argmax(tf_o, axis=0)
+                            keys = model_classes
+                            classes.append(keys.get(label))
+                        output["sub theme"] = classes
+                    del classification_token, classification_model
                 csv = convert_df(output)
                 st.download_button(
             )
             st.info("Text column must have amazon reviews", icon="ℹ️")
         except BaseException as e:
+            logging.exception("An exception was occurred")