Spaces:

cheesexuebao
/

murphy

Sleeping

App Files Files Community

cheesexuebao committed on Apr 22, 2024

Commit

f845b05

1 Parent(s): b6be546

standard repo for pre.

Browse files

Files changed (4) hide show

app.py +62 -190
assets/Kickstarter_sentence_level_5000.csv +0 -0
assets/Prediction.py.bak +0 -132
convert.py +0 -30

app.py CHANGED Viewed

@@ -3,19 +3,7 @@ import pandas as pd
 from Prediction import *
 import os
 from datetime import datetime
-import re
-import json
-import hashlib
-persistent_path = "/output"
-# os.environ['HF_HOME'] = os.path.join(persistent_path, ".huggingface")
-user_input_path = os.path.join(persistent_path, 'user.jsonl')
-secret = "2fc9ff032e027e8f23bb9fb693234899"
-def get_md5(s):
-    md = hashlib.md5()
-    md.update(s.encode('utf-8'))
-    return md.hexdigest()
 examples = []
 if os.path.exists("assets/examples.txt"):
@@ -53,72 +41,6 @@ def csv_process(csv_file, attr="content"):
     outputs.append(output_path)
     return outputs
-def logfile_query(auth):
-    if get_md5(auth) == secret and os.path.exists(user_input_path):
-        return [user_input_path]
-    else:
-        return None
-def check_save(fname, lname, cnum, email, oname, position):
-    errors = []
-    valid_vars = {}
-    if not fname.strip() or not lname.strip():
-        errors.append("Name cannot be empty")
-    elif fname.isdigit() or lname.isdigit():
-        errors.append("Name cannot be purely numerical")
-    else:
-        valid_vars["fname"] = fname
-        valid_vars["lname"] = lname
-    valid_vars["cnum"] = ''
-    if cnum:
-        if not cnum.isdigit():
-            errors.append("The phone number must be a pure number")
-        else:
-            valid_vars["cnum"] = cnum
-    if not email.strip():
-        errors.append("Email cannot be empty")
-    elif not re.match(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', email):
-        errors.append("Incorrect email format")
-    else:
-        valid_vars["email"] = email
-    if not oname.strip():
-        errors.append("Organization name cannot be empty")
-    elif oname.isdigit():
-        errors.append("Organization cannot be purely numerical")
-    else:
-        valid_vars["oname"] = oname
-    valid_vars["position"] = ''
-    if position:
-        if position.isdigit():
-            errors.append("Position in your company cannot be purely numerical")
-        else:
-            valid_vars["position"] = position
-    if errors:
-        return errors
-    current_time = datetime.now()
-    formatted_time = current_time.strftime("%Y_%m_%d_%H_%M_%S")
-    valid_vars['time'] = formatted_time
-    with open(user_input_path, 'a+', encoding="utf8") as file:
-        file.write(json.dumps(valid_vars)+"\n")
-    records = {}
-    with open(user_input_path, 'r', encoding="utf8") as file:
-        for line in file:
-            line = line.strip()
-            dct = json.loads(line)
-            records[dct['time']] = dct
-    return records
 my_theme = gr.Theme.from_hub("JohnSmith9982/small_and_pretty")
 with gr.Blocks(theme=my_theme, title='Brand_Tone_of_Voice_demo') as demo:
@@ -138,116 +60,66 @@ with gr.Blocks(theme=my_theme, title='Brand_Tone_of_Voice_demo') as demo:
         </div>
         </div>
         """)
-    with gr.Column(visible=True) as regis:
-        gr.Markdown("# Welcome to BTV!  Please fill out the form below to continue.\nI’m assuming that you mention somewhere that this project/research is conducted by the University of Manchester/AMBS. By ticking this box, I consent to be approached by the research team of the University of Manchester.")
-        with gr.Column(variant='panel'):
-            fname_tb = gr.Textbox(label="First Name: ", type='text')
-            lname_tb = gr.Textbox(label="Last Name: ", type='text')
-            email_tb = gr.Textbox(label="Email: ", type='email')
-            cnum_tb = gr.Textbox(label="Contact: (Optional)", type='text')
-            oname_tb = gr.Textbox(label="Organization name: ", type='text')
-            position_tb = gr.Textbox(label="Positions in your company: (Optional)", type='text')
-        error_box = gr.HTML(value="", visible=False)
-        submit_btn = gr.Button("Click here to start if you have fullfill all the item!")
-    with gr.Row(visible=False) as mainrow:
-        with gr.Tab("Single Sentence"):
-            with gr.Row():
-                tbox_input = gr.Textbox(label="Input",
-                                        info="Please input a sentence here:")
-                gr.Markdown("""
-                    # Detailed information about our model:
-                    ...
-                    """)
-            tab_output = gr.DataFrame(label='Predictions:',
-                                    headers=["Label", "Probability"],
-                                    datatype=["str", "number"],
-                                    interactive=False)
-            with gr.Row():
-                button_ss = gr.Button("Submit", variant="primary")
-                button_ss.click(fn=single_sentence, inputs=[tbox_input], outputs=[tab_output])
-                gr.ClearButton([tbox_input, tab_output])
-            gr.Examples(
-                examples=examples,
-                inputs=tbox_input,
-                examples_per_page=len(examples)
-            )
-        with gr.Tab("Csv File"):
-            with gr.Row():
-                csv_input = gr.File(label="CSV File:",
-                                    file_types=['.csv'],
-                                    file_count="single"
-                                    )
-                csv_output = gr.File(label="Predictions:")
-            with gr.Row():
-                button_cf = gr.Button("Submit", variant="primary")
-                button_cf.click(fn=csv_process, inputs=[csv_input], outputs=[csv_output])
-                gr.ClearButton([csv_input, csv_output])
-            gr.Markdown("## Examples \n The incoming CSV must include the ``content`` field, which represents the text that needs to be predicted!")
-            gr.DataFrame(label='Csv input format:',
-                        value=[[i, examples[i]] for i in range(len(examples))],
-                        headers=["index", "content"],
-                        datatype=["number","str"],
-                        interactive=False
-                        )
-        with gr.Tab("Readme"):
-            gr.Markdown(
-                """
-                # Paper Name
-                # Authors
-                + First author
-                + Corresponding author
-                # Detailed Information
-                ...
-                """
-            )
-        with gr.Tab("Log File"):
-            with gr.Row():
-                auth_token = gr.Textbox(label="Authentication Tokens: ", info="Enter the key to download persistent stored log information.")
-                log_output = gr.File(label="Log file: ")
-            with gr.Row():
-                button_lf = gr.Button("Validate", variant="primary")
-                button_lf.click(fn=logfile_query, inputs=[auth_token], outputs=[log_output])
-                gr.ClearButton([auth_token, log_output])
-    def submit(*user_input):
-        res = check_save(*user_input)
-        if isinstance(res, list):
-            return {
-                error_box: gr.HTML(
-                    value=f"""
-                    <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
-                    <div>
-                        <p style="color:red;">{"; ".join(res)}</p>
-                    </div>
-                    </div>
-                    """,
-                    visible=True)
-            }
-        else:
-            return {
-                mainrow: gr.Row(visible=True),
-                regis: gr.Row(visible=False),
-                error_box: gr.HTML(visible=False)
-            }
-    submit_btn.click(
-        submit,
-        [fname_tb, lname_tb, cnum_tb, email_tb, oname_tb, position_tb],
-        [mainrow, regis, error_box],
-    )
 demo.launch()

 from Prediction import *
 import os
 from datetime import datetime
 examples = []
 if os.path.exists("assets/examples.txt"):
     outputs.append(output_path)
     return outputs
 my_theme = gr.Theme.from_hub("JohnSmith9982/small_and_pretty")
 with gr.Blocks(theme=my_theme, title='Brand_Tone_of_Voice_demo') as demo:
         </div>
         </div>
         """)
+    with gr.Tab("Readme"):
+        gr.Markdown("""
+            # Detailed information about our model:
+            The example model here is a tone classification model suitable for financial field texts.
+            # Paper Name
+            # Authors
+            + First author
+            + Corresponding author
+            # How to use?
+            Please refer to the other two tab card for predictions.
+            + The `Single Sentence` for the tone classification of individual sentence.
+            + The `CSV File` for inputting CSV file for batch prediction and return.
+            ...
+            """)
+    with gr.Tab("Single Sentence"):
+        tbox_input = gr.Textbox(label="Input",
+                                    info="Please input a sentence here:")
+        tab_output = gr.DataFrame(label='Predictions:',
+                                  headers=["Category", "Probability"],
+                                  datatype=["str", "number"],
+                                  interactive=False)
+        with gr.Row():
+            button_ss = gr.Button("Submit", variant="primary")
+            button_ss.click(fn=single_sentence, inputs=[tbox_input], outputs=[tab_output])
+            gr.ClearButton([tbox_input, tab_output])
+        gr.Examples(
+            examples=examples,
+            inputs=tbox_input,
+            examples_per_page=len(examples)
+        )
+    with gr.Tab("Csv File"):
+        with gr.Row():
+            csv_input = gr.File(label="CSV File:",
+                                file_types=['.csv'],
+                                file_count="single"
+                                )
+            csv_output = gr.File(label="Predictions:")
+        with gr.Row():
+            button = gr.Button("Submit", variant="primary")
+            button.click(fn=csv_process, inputs=[csv_input], outputs=[csv_output])
+            gr.ClearButton([csv_input, csv_output])
+        gr.Markdown("## Examples \n The incoming CSV must include the ``content`` field, which represents the text that needs to be predicted!")
+        gr.DataFrame(label='Csv input format:',
+                    value=[[i, examples[i]] for i in range(len(examples))],
+                    headers=["index", "content"],
+                    datatype=["number","str"],
+                    interactive=False
+                    )
 demo.launch()

assets/Kickstarter_sentence_level_5000.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

assets/Prediction.py.bak DELETED Viewed

@@ -1,132 +0,0 @@
-### install the needed package
-# !pip install transformers
-# !pip install torchmetrics
-# !pip3 install ogb pytorch_lightning -q
-import pandas as pd
-from tqdm.auto import tqdm
-import torch
-import torch.nn as nn
-from torch.utils.data import DataLoader, Dataset
-from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
-# import pytorch_lightning as pl
-pd.set_option('display.max_columns', 500)
-RANDOM_SEED = 42
-class ModelTagger(nn.Module):
-  def __init__(self, model_path="bert-base-uncased"):
-    super().__init__()
-    self.bert = BertModel.from_pretrained(model_path, return_dict=True)
-    self.classifier = nn.Linear(self.bert.config.hidden_size, 4)
-    self.criterion = nn.BCELoss()
-  def forward(self, input_ids, attention_mask, labels=None):
-    output = self.bert(input_ids, attention_mask=attention_mask)
-    output = self.classifier(output.pooler_output)
-    output = torch.sigmoid(output)
-    loss = 0
-    if labels is not None:
-        loss = self.criterion(output, labels)
-    return loss, output
-class Predict_Dataset(Dataset):
-  def __init__(
-    self,
-    data: pd.DataFrame,
-    text_col: str,
-    tokenizer: BertTokenizer,
-    max_token_len: int = 128
-  ):
-    self.text_col = text_col
-    self.tokenizer = tokenizer
-    self.data = data
-    self.max_token_len = max_token_len
-  def __len__(self):
-    return len(self.data)
-  def __getitem__(self, index: int):
-    data_row = self.data.iloc[index]
-    post = data_row[self.text_col]
-    encoding = self.tokenizer.encode_plus(
-      post,
-      add_special_tokens=True,
-      max_length=self.max_token_len,
-      return_token_type_ids=False,
-      padding="max_length",
-      truncation=True,
-      return_attention_mask=True,
-      return_tensors='pt',
-    )
-    return dict(
-      post=post,
-      input_ids=encoding["input_ids"].flatten(),
-      attention_mask=encoding["attention_mask"].flatten(),
-    )
-def predict(data, text_col, tokenizer, model, device, LABEL_COLUMNS, max_token_len=128):
-    predictions = []
-    df_token = Predict_Dataset(data, text_col, tokenizer, max_token_len=max_token_len)
-    loader = DataLoader(df_token, batch_size=1000, num_workers=0)
-    for item in tqdm(loader):
-        _, prediction = model(
-            item["input_ids"].to(device),
-            item["attention_mask"].to(device)
-        )
-        predictions.append(prediction.detach().cpu())
-    final_pred = torch.cat(predictions, dim=0)
-    y_inten = final_pred.numpy().T
-    return {
-        LABEL_COLUMNS[0]: y_inten[0].tolist(),
-        LABEL_COLUMNS[1]: y_inten[1].tolist(),
-        LABEL_COLUMNS[2]: y_inten[2].tolist(),
-    LABEL_COLUMNS[3]: y_inten[3].tolist()
-    }
-def get_result(df, result, LABEL_COLUMNS):
-  df[LABEL_COLUMNS[0]] = result[LABEL_COLUMNS[0]]
-  df[LABEL_COLUMNS[1]] = result[LABEL_COLUMNS[1]]
-  df[LABEL_COLUMNS[2]] = result[LABEL_COLUMNS[2]]
-  df[LABEL_COLUMNS[3]] = result[LABEL_COLUMNS[3]]
-  return df
-Data = pd.read_csv("Kickstarter_sentence_level_5000.csv")
-Data = Data[:20]
-device = torch.device('cpu')
-BERT_MODEL_NAME = 'bert-base-uncased'
-tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
-LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone"]
-params = torch.load("checkpoints/Kickstarter.ckpt", map_location='cpu')['state_dict']
-kick_model = ModelTagger()
-kick_model.load_state_dict(params, strict=True)
-kick_model.eval()
-kick_model = kick_model.to(device)
-kick_fk_doc_result = predict(Data,"content", tokenizer,kick_model, device, LABEL_COLUMNS)
-fk_result = get_result(Data, kick_fk_doc_result, LABEL_COLUMNS)
-fk_result.to_csv("output/prediction_origin_Kickstarter.csv")
-# tab_output = gr.Label(label='Probability Predictions:', value=dict(zip(LABEL_COLUMNS, [0]*len(LABEL_COLUMNS))))

convert.py DELETED Viewed

@@ -1,30 +0,0 @@
-import torch
-import glob
-import os
-from transformers import BertTokenizerFast as BertTokenizer, BertForSequenceClassification
-os.environ['https_proxy'] = "127.0.0.1:1081"
-LABEL_COLUMNS = ["Assertive Tone", "Conversational Tone", "Emotional Tone", "Informative Tone", "None"]
-tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)
-id2label = {i:label for i,label in enumerate(LABEL_COLUMNS)}
-label2id = {label:i for i,label in enumerate(LABEL_COLUMNS)}
-for ckpt in glob.glob('checkpoints/*.ckpt'):
-    base_name = os.path.basename(ckpt)
-    # 去除文件后缀
-    model_name = os.path.splitext(base_name)[0]
-    params = torch.load(ckpt, map_location="cpu")['state_dict']
-    msg = model.load_state_dict(params, strict=True)
-    path = f'models/{model_name}'
-    os.makedirs(path, exist_ok=True)
-    torch.save(model.state_dict(), f'{path}/pytorch_model.bin')
-    config = model.config
-    config.architectures = ['BertForSequenceClassification']
-    config.label2id = label2id
-    config.id2label = id2label
-    model.config.to_json_file(f'{path}/config.json')
-    tokenizer.save_vocabulary(path)