Spaces:

Seetha
/

IMA-pipeline-streamlit

Sleeping

App Files Files Community

Seetha committed on Jun 1, 2023

Commit

51cb4ac

1 Parent(s): 5106269

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -66

app.py CHANGED Viewed

@@ -10,6 +10,7 @@ from transformers import AutoTokenizer, DistilBertTokenizerFast
 from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
 import numpy as np
 import pandas as pd
 import json
 import sys
 import os
@@ -31,6 +32,7 @@ import json
 import re
 import numpy as np
 import pandas as pd
 import nltk
 nltk.download("punkt")
 #stemmer = nltk.SnowballStemmer("english")
@@ -56,9 +58,9 @@ from sklearn.feature_extraction.text import CountVectorizer
 #from urllib.request import urlopen
 #from tabulate import tabulate
 import csv
-# import gdown
-# import zipfile
-# import wget
 import pdfplumber
 import pathlib
 import shutil
@@ -66,6 +68,9 @@ import webbrowser
 from streamlit.components.v1 import html
 import streamlit.components.v1 as components
 from PyPDF2 import PdfReader
 #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -81,17 +86,20 @@ def main():
   k=2
   seed = 1
   k1= 5
-  uploaded_file = st.sidebar.file_uploader("Choose a file", type = "pdf")
   text_list = []
   causal_sents = []
-  reader = PdfReader(uploaded_file)
-  for page in reader.pages:
-    text = page.extract_text()
-    text_list.append(text)
   text_list_final = [x.replace('\n', '') for x in text_list]
   text_list_final = re.sub('"', '', str(text_list_final))
@@ -103,8 +111,9 @@ def main():
     result2 = re.sub(r'[^\w\s]','',result1)
     result.append(result2)
-  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
-  model_path = "checkpoint-2850"
   model = AutoModelForSequenceClassification.from_pretrained(model_path,id2label={0:'non-causal',1:'causal'})
@@ -117,7 +126,10 @@ def main():
   model_name = "distilbert-base-cased"
   tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
-  model_path1 = "DistilBertforTokenclassification"
   model = DistilBertForTokenClassification.from_pretrained(model_path1) #len(unique_tags),, num_labels= 7, , id2label={0:'CT',1:'E',2:'C',3:'O'}
   pipe = pipeline('ner', model=model, tokenizer=tokenizer,aggregation_strategy='simple') #grouped_entities=True
@@ -162,9 +174,9 @@ def main():
   final_list = pd.DataFrame(
       {'Id': sent_id,
-       'Full sentence': sentence_pred,
        'Component': class_list,
-       'cause/effect': entity_list,
        'Label_level1': level0,
        'Label_level2': pred_val
       })
@@ -174,7 +186,7 @@ def main():
   final_list1 = final_list[~final_list['Component'].astype(str).str.startswith('##')]
   li = []
   uni = final_list1['Id'].unique()
   for i in uni:
@@ -186,17 +198,23 @@ def main():
   li_pan = pd.DataFrame(out,columns=['Id'])
   df3 = pd.merge(final_list1, li_pan[['Id']], on='Id', how='left', indicator=True) \
               .query("_merge == 'left_only'") \
-              .drop('_merge',1)
-  df = df3.groupby(['Id','Full sentence','cause/effect', 'Label_level1', 'Label_level2'])['Component'].apply(', '.join).reset_index()
-  df["cause/effect"].replace({"C": "cause", "E": "effect"}, inplace=True)
-  df_final = df[df['cause/effect'] != 'CT']
   df['New string'] = df_final['Component'].replace(r'[##]+', ' ', regex=True)
-  df_final = df_final.drop('Component',1)
   df_final.insert(2, "Component", df['New string'], True)
-  df_final.to_csv('predictions.csv')
   count_NP_NP = 0
   count_NP_investor = 0
@@ -229,8 +247,8 @@ def main():
   count_soc_society = 0
   for i in range(0,df_final['Id'].max()):
     j = df_final.loc[df_final['Id'] == i]
-    cause_tab = j.loc[j['cause/effect'] == 'cause']
-    effect_tab = j.loc[j['cause/effect'] == 'effect']
     cause_coun_NP = (cause_tab.Label_level2 == 'Non-performance').sum()
     effect_coun_NP = (effect_tab.Label_level2 == 'Non-performance').sum()
@@ -428,9 +446,13 @@ def main():
 #      'Society': [count_soc_np, count_soc_investor, count_soc_customer, count_soc_employee, count_soc_society]},
 #       index=['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'])
-  df_tab.to_csv('final_data.csv')
-  df = pd.read_csv('final_data.csv', index_col=0)
   # Convert to JSON format
   json_data = []
@@ -443,11 +465,11 @@ def main():
         })
   # Write JSON to file
-  with open('smalljson.json', 'w') as f:
     json.dump(json_data, f)
-  csv_file = "predictions.csv"
-  json_file = "ch.json"
   # Open the CSV file and read the data
   with open(csv_file, "r") as f:
@@ -477,45 +499,73 @@ def main():
   csv2 = convert_df(df_tab.astype(str))
   with st.container():
     st.download_button(label="Download the detailed result table",data=csv1,file_name='results.csv',mime='text/csv')
-    st.download_button(label="Download the result table",data=csv2,file_name='final_data.csv',mime='text/csv')
 #   # LINK TO THE CSS FILE
-#  def tree_css(file_name):
-#   with open('/Users/seetha/Downloads/tree.css')as f:
-#    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
-#
-#  def div_css(file_name):
-#   with open('/Users/seetha/Downloads/div.css')as f:
-#    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
-#
-#  def side_css(file_name):
-#   with open('/Users/seetha/Downloads/side.css')as f:
-#    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
-#
-#  tree_css('tree.css')
-#  div_css('div.css')
-#  side_css('side.css')
-    STREAMLIT_STATIC_PATH = pathlib.Path(st.__path__[0]) / 'static'
-    CSS_PATH = (STREAMLIT_STATIC_PATH / "css1")
-    if not CSS_PATH.is_dir():
-      CSS_PATH.mkdir()
-    css_file = CSS_PATH / "tree.css"
-    css_file1 = CSS_PATH / "div.css"
-    css_file2 = CSS_PATH / "side.css"
-    jso_file = CSS_PATH / "smalljson.json"
-    if not css_file.exists():
-      shutil.copy("tree.css", css_file)
-      shutil.copy("div.css", css_file1)
-      shutil.copy("side.css", css_file2)
-      shutil.copy("smalljson.json", jso_file)
   HtmlFile = open("index.html", 'r', encoding='utf-8')
-  source_code = HtmlFile.read()
   #print(source_code)
-  components.html(source_code)
 #   # Define your javascript
 #   my_js = """
 #     alert("Hello World");

 from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
 import numpy as np
 import pandas as pd
+import torch
 import json
 import sys
 import os
 import re
 import numpy as np
 import pandas as pd
+import re
 import nltk
 nltk.download("punkt")
 #stemmer = nltk.SnowballStemmer("english")
 #from urllib.request import urlopen
 #from tabulate import tabulate
 import csv
+#import gdown
+import zipfile
+import wget
 import pdfplumber
 import pathlib
 import shutil
 from streamlit.components.v1 import html
 import streamlit.components.v1 as components
 from PyPDF2 import PdfReader
+from git import Repo
+import io
 #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
   k=2
   seed = 1
   k1= 5
   text_list = []
   causal_sents = []
+  try:
+    uploaded_file = st.sidebar.file_uploader("Choose a file", type = "pdf")
+    st.stop()
+  except:
+    st.write("Upload a pdf file...")
+  if uploaded_file is not None:
+    reader = PdfReader(uploaded_file)
+    for page in reader.pages:
+      text = page.extract_text()
+      text_list.append(text)
   text_list_final = [x.replace('\n', '') for x in text_list]
   text_list_final = re.sub('"', '', str(text_list_final))
     result2 = re.sub(r'[^\w\s]','',result1)
     result.append(result2)
+  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") #bert-base-uncased
+  model_path = "checkpoint2850"
   model = AutoModelForSequenceClassification.from_pretrained(model_path,id2label={0:'non-causal',1:'causal'})
   model_name = "distilbert-base-cased"
   tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
+  model_path1 = "DistilBertForTokeClassification"
   model = DistilBertForTokenClassification.from_pretrained(model_path1) #len(unique_tags),, num_labels= 7, , id2label={0:'CT',1:'E',2:'C',3:'O'}
   pipe = pipeline('ner', model=model, tokenizer=tokenizer,aggregation_strategy='simple') #grouped_entities=True
   final_list = pd.DataFrame(
       {'Id': sent_id,
+       'Full_sentence': sentence_pred,
        'Component': class_list,
+       'CauseOrEffect': entity_list,
        'Label_level1': level0,
        'Label_level2': pred_val
       })
   final_list1 = final_list[~final_list['Component'].astype(str).str.startswith('##')]
   li = []
   uni = final_list1['Id'].unique()
   for i in uni:
   li_pan = pd.DataFrame(out,columns=['Id'])
   df3 = pd.merge(final_list1, li_pan[['Id']], on='Id', how='left', indicator=True) \
               .query("_merge == 'left_only'") \
+              .drop("_merge",axis=1)
+  df = df3.groupby(['Id','Full_sentence','CauseOrEffect', 'Label_level1', 'Label_level2'])['Component'].apply(', '.join).reset_index()
+  #st.write(df)
+  df["CauseOrEffect"].replace({"C": "cause", "E": "effect"}, inplace=True)
+  df_final = df[df['CauseOrEffect'] != 'CT']
   df['New string'] = df_final['Component'].replace(r'[##]+', ' ', regex=True)
+  df_final = df_final.drop("Component",axis=1)
   df_final.insert(2, "Component", df['New string'], True)
+  df_final.to_csv('/app/ima-pipeline-streamlit/predictions.csv')
+#   buffer = io.BytesIO()
+#   with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
+#     df_final.to_excel(writer, sheet_name="Sheet1", index=False)
+#     writer.close()
   count_NP_NP = 0
   count_NP_investor = 0
   count_soc_society = 0
   for i in range(0,df_final['Id'].max()):
     j = df_final.loc[df_final['Id'] == i]
+    cause_tab = j.loc[j['CauseOrEffect'] == 'cause']
+    effect_tab = j.loc[j['CauseOrEffect'] == 'effect']
     cause_coun_NP = (cause_tab.Label_level2 == 'Non-performance').sum()
     effect_coun_NP = (effect_tab.Label_level2 == 'Non-performance').sum()
 #      'Society': [count_soc_np, count_soc_investor, count_soc_customer, count_soc_employee, count_soc_society]},
 #       index=['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'])
+  df_tab.to_csv('/app/ima-pipeline-streamlit/final_data.csv')
+  buffer = io.BytesIO()
+  with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
+    df_tab.to_excel(writer,sheet_name="Sheet1",index=False)
+    writer.close()
+  df = pd.read_csv('/app/ima-pipeline-streamlit/final_data.csv', index_col=0)
   # Convert to JSON format
   json_data = []
         })
   # Write JSON to file
+  with open('/app/ima-pipeline-streamlit/ch.json', 'w') as f:
     json.dump(json_data, f)
+  csv_file = "/app/ima-pipeline-streamlit/predictions.csv"
+  json_file = "/app/ima-pipeline-streamlit/smalljson.json"
   # Open the CSV file and read the data
   with open(csv_file, "r") as f:
   csv2 = convert_df(df_tab.astype(str))
   with st.container():
     st.download_button(label="Download the detailed result table",data=csv1,file_name='results.csv',mime='text/csv')
+#     st.download_button(label="Download the result table",data=csv2,file_name='final_data.csv',mime='text/csv')
+    st.download_button(label="Download the detailed result table",data=buffer,file_name="df_final.xlsx",mime="application/vnd.ms-excel")
+    st.download_button(label="Download the result table",data=buffer,file_name="df_tab.xlsx",mime="application/vnd.ms-excel")
+#     repo_dir = 'IMA-pipeline-streamlit'
+#     repo = Repo(repo_dir)
+#     file_list = [
+#         '/app/ima-pipeline-streamlit/results.csv',
+#         '/app/ima-pipeline-streamlit/final_data.csv'
+#     ]
+#     commit_message = 'Add the generated files to Github'
+#     repo.index.add(file_list)
+#     repo.index.commit(commit_message)
+#     origin = repo.remote('origin')
+#     origin.push()
 #   # LINK TO THE CSS FILE
+  def tree_css(file_name):
+   with open('tree.css')as f:
+    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
+  def div_css(file_name):
+   with open('div.css')as f:
+    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
+  def side_css(file_name):
+   with open('side.css')as f:
+    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html = True)
+  tree_css('tree.css')
+  div_css('div.css')
+  side_css('side.css')
+#     STREAMLIT_STATIC_PATH = pathlib.Path(st.__path__[0]) / 'static'
+#     CSS_PATH = (STREAMLIT_STATIC_PATH / "css1")
+#     if not CSS_PATH.is_dir():
+#       CSS_PATH.mkdir()
+#     css_file = CSS_PATH / "tree.css"
+#     css_file1 = CSS_PATH / "div.css"
+#     css_file2 = CSS_PATH / "side.css"
+#     #jso_file = CSS_PATH / "smalljson.json"
+#     if not css_file.exists():
+#       shutil.copy("tree.css", css_file)
+#       shutil.copy("div.css", css_file1)
+#       shutil.copy("side.css", css_file2)
+#       shutil.copy("smalljson.json", jso_file)
+  STREAMLIT_STATIC_PATH = pathlib.Path(st.__path__[0]) / 'static'
+  CSS_PATH = (STREAMLIT_STATIC_PATH / "assets/css")
+  if not CSS_PATH.is_dir():
+    CSS_PATH.mkdir()
+  css_file = CSS_PATH / "tree.css"
+  css_file1 = CSS_PATH / "div.css"
+  css_file2 = CSS_PATH / "side.css"
+  if not css_file.exists():
+    shutil.copy("assets/css/tree.css", css_file)
+    shutil.copy("assets/css/div.css", css_file1)
+    shutil.copy("assets/css/side.css", css_file2)
   HtmlFile = open("index.html", 'r', encoding='utf-8')
+  source_code = HtmlFile.read()
   #print(source_code)
+  components.html(source_code)
 #   # Define your javascript
 #   my_js = """
 #     alert("Hello World");