KGBrain committed
Commit c521774 · 1 Parent(s): 14b6745

Upload 12 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Checkpoint-classification.sav filter=lfs diff=lfs merge=lfs -text
Checkpoint-classification.sav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69238b2d6b5b3bd3b927c56b204cbf033ac304f34d3fe24c92ad6cda33c3017d
+ size 1693045
app.py ADDED
@@ -0,0 +1,581 @@
+ # import all packages
+ import requests
+ import streamlit as st
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.model_selection import train_test_split
+ from sklearn.model_selection import KFold
+ # tokenizer
+ from transformers import AutoTokenizer, DistilBertTokenizerFast
+ # sequence tagging model + training-related
+ from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
+ import torch
+ import sys
+ import os
+ from sklearn.metrics import classification_report
+ from pandas import read_csv
+ from sklearn.linear_model import LogisticRegression
+ import sklearn.model_selection
+ from sklearn.feature_extraction.text import TfidfTransformer
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.pipeline import Pipeline, FeatureUnion
+ import math
+ # from sklearn.metrics import accuracy_score
+ # from sklearn.metrics import precision_recall_fscore_support
+ import json
+ import re
+ import numpy as np
+ import pandas as pd
+ import nltk
+ nltk.download("punkt")
+ import string
+ from sklearn.model_selection import train_test_split
+ from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoConfig
+ from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+ import itertools
+ from transformers import TextClassificationPipeline, TFAutoModelForSequenceClassification, AutoTokenizer
+ from transformers import pipeline
+ import pickle
+ import csv
+ import pdfplumber
+ import pathlib
+ import shutil
+ import webbrowser
+ from streamlit.components.v1 import html
+ import streamlit.components.v1 as components
+ from PyPDF2 import PdfReader
+ from huggingface_hub import HfApi
+ import io
+ from datasets import load_dataset
+ import time
+
+ import huggingface_hub
+ from huggingface_hub import Repository
+ from datetime import datetime
+ from pathlib import Path
+ from requests import get
+ import urllib.request
+ # import gradio as gr
+ # from gradio import inputs, outputs
+ from datasets import load_dataset
+ from huggingface_hub import HfApi, list_models
+ import os
+ from huggingface_hub import HfFileSystem
+ from tensorflow.keras.models import Sequential, model_from_json
+ #import tensorflow_datasets as tfds
+ import tensorflow as tf
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ import spacy
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ #from spacy import en_core_web_lg
+ #import en_core_web_lg
+ #nlp = en_core_web_lg.load()
+ nlp = spacy.load('en_core_web_sm')
+
+ #tfds.disable_progress_bar()
+ MAX_SEQUENCE_LENGTH = 500
+
+ # dataset = load_dataset('Seetha/Visualization', streaming=True)
+ # df = pd.DataFrame.from_dict(dataset['train'])
+ # DATASET_REPO_URL = "https://huggingface.co/datasets/Seetha/Visualization"
+ # DATA_FILENAME = "level2.json"
+ #DATA_FILE = os.path.join("data", DATA_FILENAME)
+ DATASET_REPO_URL = "https://huggingface.co/datasets/Seetha/visual_files"
+ DATA_FILENAME = "detailedResults.json"
+ DATA_FILENAME1 = "level2.json"
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ #st.write("is none?", HF_TOKEN is None)
+
+ def main():
+
+     st.title("Text to Causal Knowledge Graph")
+     st.sidebar.title("Please upload your text documents in one file here:")
+     k=2
+     seed = 1
+     k1= 5
+     text_list = []
+     causal_sents = []
+
+     uploaded_file = None
+     try:
+         uploaded_file = st.sidebar.file_uploader("Choose a file", type = "pdf")
+     except:
+         uploaded_file = PdfReader('sample_anno.pdf')
+         st.error("Please upload your own PDF to be analyzed")
+
+     if uploaded_file is not None:
+         reader = PdfReader(uploaded_file)
+         for page in reader.pages:
+             text = page.extract_text()
+             text_list.append(text)
+     else:
+         st.error("Please upload your own PDF to be analyzed")
+         st.stop()
+
+     text_list_final = [x.replace('\n', '') for x in text_list]
+     text_list_final = re.sub('"', '', str(text_list_final))
+
+     sentences = nltk.sent_tokenize(text_list_final)
+
+     result =[]
+     for i in sentences:
+         result1 = i.lower()
+         result2 = re.sub(r'[^\w\s]','',result1)
+         result.append(result2)
+
+     #st.write("--- %s seconds ---" % (time.time() - start_time))
+     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") #bert-base-uncased
+
+     model_path = "checkpoint-2850"
+
+     model = AutoModelForSequenceClassification.from_pretrained(model_path,id2label={0:'non-causal',1:'causal'})
+
+     #st.write('sequence classification loaded')
+     pipe1 = pipeline("text-classification", model=model,tokenizer=tokenizer)
+     for sent in result:
+         pred = pipe1(sent)
+         for lab in pred:
+             if lab['label'] == 'causal': #causal
+                 causal_sents.append(sent)
+
+     # st.write('causal sentence classification finished')
+     # st.write("--- %s seconds ---" % (time.time() - start_time))
+
+     model_name = "distilbert-base-cased"
+     tokenizer = DistilBertTokenizerFast.from_pretrained(model_name,low_cpu_mem_usage=True)
+
+     model_path1 = "DistilBertforTokenclassification"
+
+     model = DistilBertForTokenClassification.from_pretrained(model_path1,low_cpu_mem_usage=True) #len(unique_tags),, num_labels= 7, , id2label={0:'CT',1:'E',2:'C',3:'O'}
+     pipe = pipeline('ner', model=model, tokenizer=tokenizer,aggregation_strategy='simple') #grouped_entities=True
+     st.write('DistilBERT loaded')
+     sentence_pred = []
+     class_list = []
+     entity_list = []
+     for k in causal_sents:
+         pred= pipe(k)
+         #st.write(pred)
+         #st.write('preds')
+         for i in pred:
+             sentence_pred.append(k)
+             class_list.append(i['word'])
+             entity_list.append(i['entity_group'])
+
+     # st.write('causality extraction finished')
+     # st.write("--- %s seconds ---" % (time.time() - start_time))
+
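+     # Load the pickled vectorizer and classifier that assign each extracted cause/effect phrase a level-2 (stakeholder) label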
+     filename = 'Checkpoint-classification.sav'
+     loaded_model = pickle.load(open(filename, 'rb'))
+     loaded_vectorizer = pickle.load(open('vectorizefile_classification.pickle', 'rb'))
+
+     pipeline_test_output = loaded_vectorizer.transform(class_list)
+     predicted = loaded_model.predict(pipeline_test_output)
+
+     # tokenizer = Tokenizer(num_words=100000)
+     # tokenizer.fit_on_texts(class_list)
+     # word_index = tokenizer.word_index
+     # text_embedding = np.zeros((len(word_index) + 1, 300))
+     # for word, i in word_index.items():
+     # text_embedding[i] = nlp(word).vector
+     # json_file = open('model.json', 'r')
+     # loaded_model_json = json_file.read()
+     # json_file.close()
+     # loaded_model = model_from_json(loaded_model_json)
+     # # load weights into new model
+     # loaded_model.load_weights("model.h5")
+
+     # loss = tf.keras.losses.CategoricalCrossentropy() #from_logits=True
+     # loaded_model.compile(loss=loss,optimizer=tf.keras.optimizers.Adam(1e-4))
+
+     # predictions = loaded_model.predict(pad_sequences(tokenizer.texts_to_sequences(class_list),maxlen=MAX_SEQUENCE_LENGTH))
+     # predicted = np.argmax(predictions,axis=1)
+
+     # st.write(predictions)
+     # st.write(predicted)
+     # st.write('stakeholder taxonomy finished')
+     # st.write("--- %s seconds ---" % (time.time() - start_time))
+     pred1 = predicted
+     level0 = []
+     count =0
+     for i in predicted:
+         if i == 3:
+             level0.append('Non-Performance')
+             count +=1
+         else:
+             level0.append('Performance')
+             count +=1
+
+     list_pred = {0: 'Customers',1:'Employees',2:'Investors',3:'Non-performance',4:'Society',5:'Unclassified'}
+     pred_val = [list_pred[i] for i in pred1]
+
+     #print('count',count)
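+     # Override the predicted level-2 label whenever the phrase itself explicitly names a stakeholder group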
+     for ind,(sent,preds) in enumerate(zip(class_list,pred_val)):
+         if 'customers' in sent or 'client' in sent or 'consumer' in sent or 'user' in sent:
+             pred_val[ind] = 'Customers'
+         elif 'investor' in sent or 'finance' in sent or 'shareholder' in sent or 'stockholder' in sent or 'owners' in sent:
+             pred_val[ind] = 'Investors'
+         elif 'employee' in sent or 'worker' in sent or 'staff' in sent:
+             pred_val[ind] = 'Employees'
+         elif 'society' in sent or 'societal' in sent or 'social responsib*' in sent or 'social performance' in sent or 'community' in sent:
+             pred_val[ind] = 'Society'
+
+     sent_id, unique = pd.factorize(sentence_pred)
+
+     final_list = pd.DataFrame(
+         {'Id': sent_id,
+          'Fullsentence': sentence_pred,
+          'Component': class_list,
+          'causeOrEffect': entity_list,
+          'Labellevel1': level0,
+          'Labellevel2': pred_val
+         })
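+     # Re-attach WordPiece continuation tokens (prefixed with '##') to the preceding component before building the final table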
+     s = final_list['Component'].shift(-1)
+     m = s.str.startswith('##', na=False)
+     final_list.loc[m, 'Component'] += (' ' + s[m])
+
+
+     final_list1 = final_list[~final_list['Component'].astype(str).str.startswith('##')]
+     li = []
+     uni = final_list1['Id'].unique()
+     for i in uni:
+         df_new = final_list1[final_list1['Id'] == i]
+         uni1 = df_new['Id'].unique()
+
+         # if 'E' not in df_new.values:
+         # li.append(uni1)
+     # out = np.concatenate(li).ravel()
+     # li_pan = pd.DataFrame(out,columns=['Id'])
+     # df3 = pd.merge(final_list1, li_pan[['Id']], on='Id', how='left', indicator=True) \
+     # .query("_merge == 'left_only'") \
+     # .drop("_merge",axis=1)
+
+     df3 = final_list1
+     #df = df3.groupby(['Id','Fullsentence','causeOrEffect', 'Labellevel1', 'Labellevel2'])['Component'].apply(', '.join).reset_index()
+     #st.write(df)
+
+     #df = df3
+     df3["causeOrEffect"].replace({"C": "cause", "E": "effect"}, inplace=True)
+     df_final = df3[df3['causeOrEffect'] != 'CT']
+     df3['New string'] = df_final['Component'].replace(r'[##]+', ' ', regex=True)
+
+     df_final = df_final.drop("Component",axis=1)
+     df_final.insert(2, "Component", df3['New string'], True)
+
+     df_final1 = df_final[df_final['Component'].str.split().str.len().gt(1)]
+     #st.write(df_final[df_final['Component'].str.len() != 1])
+     #df_final1.to_csv('predictions.csv')
+
+     # buffer = io.BytesIO()
+     # with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
+     # df_final.to_excel(writer, sheet_name="Sheet1", index=False)
+     # writer.close()
+
+     count_NP_NP = 0
+     count_NP_investor = 0
+     count_NP_customer = 0
+     count_NP_employees = 0
+     count_NP_society = 0
+
+     count_inv_np = 0
+     count_inv_investor = 0
+     count_inv_customer = 0
+     count_inv_employee = 0
+     count_inv_society = 0
+
+     count_cus_np = 0
+     count_cus_investor = 0
+     count_cus_customer = 0
+     count_cus_employee = 0
+     count_cus_society = 0
+
+     count_emp_np = 0
+     count_emp_investor = 0
+     count_emp_customer = 0
+     count_emp_employee = 0
+     count_emp_society = 0
+
+     count_soc_np = 0
+     count_soc_investor = 0
+     count_soc_customer = 0
+     count_soc_employee = 0
+     count_soc_society = 0
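+     # For each sentence, count cause->effect stakeholder pairs; the accumulated totals fill the df_tab adjacency matrix below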
+     for i in range(0, df_final['Id'].max() + 1):
+         j = df_final.loc[df_final['Id'] == i]
+         cause_tab = j.loc[j['causeOrEffect'] == 'cause']
+         effect_tab = j.loc[j['causeOrEffect'] == 'effect']
+         cause_coun_NP = (cause_tab.Labellevel2 == 'Non-performance').sum()
+         effect_coun_NP = (effect_tab.Labellevel2 == 'Non-performance').sum()
+
+         if (cause_coun_NP > 0) and (effect_coun_NP > 0):
+             count_NP = cause_coun_NP if cause_coun_NP >= effect_coun_NP else effect_coun_NP
+         else:
+             count_NP = 0
+         effect_NP_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_NP > 0) and (effect_NP_inv > 0):
+             count_NP_inv = cause_coun_NP if cause_coun_NP >= effect_NP_inv else effect_NP_inv
+         else:
+             count_NP_inv = 0
+         effect_NP_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_NP > 0) and (effect_NP_cus > 0):
+             count_NP_cus = cause_coun_NP if cause_coun_NP >= effect_NP_cus else effect_NP_cus
+         else:
+             count_NP_cus = 0
+         effect_NP_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_NP > 0) and (effect_NP_emp > 0):
+             count_NP_emp = cause_coun_NP if cause_coun_NP >= effect_NP_emp else effect_NP_emp
+         else:
+             count_NP_emp = 0
+         effect_NP_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_NP > 0) and (effect_NP_soc > 0):
+             count_NP_soc = cause_coun_NP if cause_coun_NP >= effect_NP_soc else effect_NP_soc
+         else:
+             count_NP_soc = 0
+
+         cause_coun_inv = (cause_tab.Labellevel2 == 'Investors').sum()
+         effect_coun_inv = (effect_tab.Labellevel2 == 'Non-performance').sum()
+         if (cause_coun_inv > 0) and (effect_coun_inv > 0):
+             count_NP_inv = cause_coun_inv if cause_coun_inv >= effect_coun_inv else effect_coun_inv
+         else:
+             count_NP_inv = 0
+
+         effect_inv_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_inv > 0) and (effect_inv_inv > 0):
+             count_inv_inv = cause_coun_inv if cause_coun_inv >= effect_inv_inv else effect_inv_inv
+         else:
+             count_inv_inv = 0
+         effect_inv_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_inv > 0) and (effect_inv_cus > 0):
+             count_inv_cus = cause_coun_inv if cause_coun_inv >= effect_inv_cus else effect_inv_cus
+         else:
+             count_inv_cus = 0
+         effect_inv_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_inv > 0) and (effect_inv_emp > 0):
+             count_inv_emp = cause_coun_inv if cause_coun_inv >= effect_inv_emp else effect_inv_emp
+         else:
+             count_inv_emp = 0
+
+         effect_inv_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_inv > 0) and (effect_inv_soc > 0):
+             count_inv_soc = cause_coun_inv if cause_coun_inv >= effect_inv_soc else effect_inv_soc
+         else:
+             count_inv_soc = 0
+
+         cause_coun_cus = (cause_tab.Labellevel2 == 'Customers').sum()
+         effect_coun_cus = (effect_tab.Labellevel2 == 'Non-performance').sum()
+         if (cause_coun_cus > 0) and (effect_coun_cus > 0):
+             count_NP_cus = cause_coun_cus if cause_coun_cus >= effect_coun_cus else effect_coun_cus
+         else:
+             count_NP_cus = 0
+
+         effect_cus_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_cus > 0) and (effect_cus_inv > 0):
+             count_cus_inv = cause_coun_cus if cause_coun_cus >= effect_cus_inv else effect_cus_inv
+         else:
+             count_cus_inv = 0
+
+         effect_cus_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_cus > 0) and (effect_cus_cus > 0):
+             count_cus_cus = cause_coun_cus if cause_coun_cus >= effect_cus_cus else effect_cus_cus
+         else:
+             count_cus_cus = 0
+
+         effect_cus_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_cus > 0) and (effect_cus_emp > 0):
+             count_cus_emp = cause_coun_cus if cause_coun_cus >= effect_cus_emp else effect_cus_emp
+         else:
+             count_cus_emp = 0
+
+         effect_cus_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_cus > 0) and (effect_cus_soc > 0):
+             count_cus_soc = cause_coun_cus if cause_coun_cus >= effect_cus_soc else effect_cus_soc
+         else:
+             count_cus_soc = 0
+
+         cause_coun_emp = (cause_tab.Labellevel2 == 'Employees').sum()
+         effect_coun_emp = (effect_tab.Labellevel2 == 'Non-performance').sum()
+         if (cause_coun_emp > 0) and (effect_coun_emp > 0):
+             count_NP_emp = cause_coun_emp if cause_coun_emp >= effect_coun_emp else effect_coun_emp
+         else:
+             count_NP_emp = 0
+
+         effect_emp_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_emp > 0) and (effect_emp_inv > 0):
+             count_emp_inv = cause_coun_emp if cause_coun_emp >= effect_emp_inv else effect_emp_inv
+         else:
+             count_emp_inv = 0
+
+         effect_emp_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_emp > 0) and (effect_emp_cus > 0):
+             count_emp_cus = cause_coun_emp if cause_coun_emp >= effect_emp_cus else effect_emp_cus
+         else:
+             count_emp_cus = 0
+
+         effect_emp_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_emp > 0) and (effect_emp_emp > 0):
+             count_emp_emp = cause_coun_emp if cause_coun_emp >= effect_emp_emp else effect_emp_emp
+         else:
+             count_emp_emp = 0
+
+         effect_emp_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_emp > 0) and (effect_emp_soc > 0):
+             count_emp_soc = cause_coun_emp if cause_coun_emp >= effect_emp_soc else effect_emp_soc
+         else:
+             count_emp_soc = 0
+
+         cause_coun_soc = (cause_tab.Labellevel2 == 'Society').sum()
+         effect_coun_soc = (effect_tab.Labellevel2 == 'Non-performance').sum()
+         if (cause_coun_soc > 0) and (effect_coun_soc > 0):
+             count_NP_soc = cause_coun_soc if cause_coun_soc >= effect_coun_soc else effect_coun_soc
+         else:
+             count_NP_soc = 0
+
+         effect_soc_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_soc > 0) and (effect_soc_inv > 0):
+             count_soc_inv = cause_coun_soc if cause_coun_soc >= effect_soc_inv else effect_soc_inv
+         else:
+             count_soc_inv = 0
+
+         effect_soc_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_soc > 0) and (effect_soc_cus > 0):
+             count_soc_cus = cause_coun_soc if cause_coun_soc >= effect_soc_cus else effect_soc_cus
+         else:
+             count_soc_cus = 0
+
+         effect_soc_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_soc > 0) and (effect_soc_emp > 0):
+             count_soc_emp = cause_coun_soc if cause_coun_soc >= effect_soc_emp else effect_soc_emp
+         else:
+             count_soc_emp = 0
+
+         effect_soc_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_soc > 0) and (effect_soc_soc > 0):
+             count_soc_soc = cause_coun_soc if cause_coun_soc >= effect_soc_soc else effect_soc_soc
+         else:
+             count_soc_soc = 0
+
+         count_NP_NP = count_NP_NP + count_NP
+         count_NP_investor = count_NP_investor + count_NP_inv
+         count_NP_customer = count_NP_customer + count_NP_cus
+         count_NP_employees = count_NP_employees + count_NP_emp
+         count_NP_society = count_NP_society + count_NP_soc
+
+         count_inv_np = count_inv_np + count_NP_inv
+         count_inv_investor = count_inv_investor + count_inv_inv
+         count_inv_customer = count_inv_customer + count_inv_cus
+         count_inv_employee = count_inv_employee + count_inv_emp
+         count_inv_society = count_inv_society + count_inv_soc
+
+         count_cus_np = count_cus_np + count_NP_cus
+         count_cus_investor = count_cus_investor + count_cus_inv
+         count_cus_customer = count_cus_customer + count_cus_cus
+         count_cus_employee = count_cus_employee + count_cus_emp
+         count_cus_society = count_cus_society + count_cus_soc
+
+         count_emp_np = count_emp_np + count_NP_emp
+         count_emp_investor = count_emp_investor + count_emp_inv
+         count_emp_customer = count_emp_customer + count_emp_cus
+         count_emp_employee = count_emp_employee + count_emp_emp
+         count_emp_society = count_emp_society + count_emp_soc
+
+         count_soc_np = count_soc_np + count_NP_soc
+         count_soc_investor = count_soc_investor + count_soc_inv
+         count_soc_customer = count_soc_customer + count_soc_cus
+         count_soc_employee = count_soc_employee + count_soc_emp
+         count_soc_society = count_soc_society + count_soc_soc
+
+     df_tab = pd.DataFrame(columns = ['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'],index=['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'], dtype=object)
+
+     df_tab.loc['Non-performance'] = [count_NP_NP, count_NP_investor, count_NP_customer, count_NP_employees, count_NP_society]
+     df_tab.loc['Investors'] = [count_inv_np, count_inv_investor, count_inv_customer, count_inv_employee, count_inv_society]
+     df_tab.loc['Customers'] = [count_cus_np, count_cus_investor, count_cus_customer, count_cus_employee, count_cus_society]
+     df_tab.loc['Employees'] = [count_emp_np, count_emp_investor, count_emp_customer, count_emp_employee, count_emp_society]
+     df_tab.loc['Society'] = [count_soc_np, count_soc_investor, count_soc_customer, count_soc_employee, count_soc_society]
+
+
+     # df_tab = pd.DataFrame({
+     # 'Non-performance': [count_NP_NP, count_NP_investor, count_NP_customer, count_NP_employees, count_NP_society],
+     # 'Investors': [count_inv_np, count_inv_investor, count_inv_customer, count_inv_employee, count_inv_society],
+     # 'Customers': [count_cus_np, count_cus_investor, count_cus_customer, count_cus_employee, count_cus_society],
+     # 'Employees': [count_emp_np, count_emp_investor, count_emp_customer, count_emp_employee, count_emp_society],
+     # 'Society': [count_soc_np, count_soc_investor, count_soc_customer, count_soc_employee, count_soc_society]},
+     # index=['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'])
+
+     #df_tab.to_csv('final_data.csv')
+
+     buffer = io.BytesIO()
+     with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
+         df_tab.to_excel(writer,sheet_name="count_result",index=False)
+         df_final1.to_excel(writer,sheet_name="Detailed_results",index=False)
+         writer.close()
+     #df = pd.read_csv('final_data.csv', index_col=0)
+     #474-515
+     # Convert to JSON format
+     json_data = []
+     for row in df_tab.index:
+         for col in df_tab.columns:
+             json_data.append({
+                 'source': row,
+                 'target': col,
+                 'value': int(df_tab.loc[row, col])
+             })
+
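+     # Refresh the visualization data in the Seetha/visual_files dataset repo: delete the old JSON files and write the new ones via HfFileSystem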
+     HfApi().delete_file(path_in_repo = DATA_FILENAME1 ,repo_id = 'Seetha/visual_files',token= HF_TOKEN,repo_type='dataset')
+     #st.write('file-deleted')
+     fs = HfFileSystem(token=HF_TOKEN)
+     with fs.open('datasets/Seetha/visual_files/level2.json', 'w') as f:
+         json.dump(json_data, f)
+
+     df_final1.to_csv('predictions.csv')
+     csv_file = "predictions.csv"
+     json_file = "detailedResults.json"
+
+     # Open the CSV file and read the data
+     with open(csv_file, "r") as f:
+         csv_data = csv.DictReader(f)
+
+         # # Convert the CSV data to a list of dictionaries
+         data_list = []
+         for row in csv_data:
+             data_list.append(dict(row))
+
+     # # Convert the list of dictionaries to JSON
+     json_data = json.dumps(data_list)
+
+     HfApi().delete_file(path_in_repo = DATA_FILENAME ,repo_id = 'Seetha/visual_files',token= HF_TOKEN,repo_type='dataset')
+     #st.write('file2-deleted')
+     with fs.open('datasets/Seetha/visual_files/detailedResults.json','w') as fi:
+         #data = json.load(fi)
+         fi.write(json_data)
+
+     def convert_df(df):
+
+         #IMPORTANT: Cache the conversion to prevent computation on every rerun
+
+         return df.to_csv().encode('utf-8')
+
+
+
+     csv1 = convert_df(df_final1.astype(str))
+     csv2 = convert_df(df_tab.astype(str))
+
+     with st.container():
+
+         st.download_button(label="Download the result table",data=buffer,file_name="t2cg_outputs.xlsx",mime="application/vnd.ms-excel")
+         st.markdown('<a href="https://huggingface.co/spaces/Seetha/visual-knowledgegraph" target="_blank">Click this link in a separate tab to view knowledge graph</a>', unsafe_allow_html=True)
+         # st.download_button(label="Download the detailed result table_csv",data=csv1,file_name='results.csv',mime='text/csv')
+         # st.download_button(label="Download the result table_csv",data=csv2,file_name='final_data.csv',mime='text/csv')
+
+     #with st.container():
+     # Execute your app
+     #st.title("Visualization example")
+     # components.html(source_code)
+     #html(my_html)
+     #webbrowser.open('https://huggingface.co/spaces/Seetha/visual-knowledgegraph')
+     # # embed streamlit docs in a streamlit app
+     # #components.iframe("https://webpages.charlotte.edu/ltotapal/")
+
+
+
+ if __name__ == '__main__':
+     start_time = time.time()
+     main()
gitignore ADDED
@@ -0,0 +1,36 @@
+ import requests
+ import streamlit as st
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.model_selection import train_test_split
+ from sklearn.model_selection import KFold
+ # tokenizer
+ from transformers import AutoTokenizer, DistilBertTokenizerFast
+ # sequence tagging model + training-related
+ from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
+ import numpy as np
+ import pandas as pd
+ import torch
+ import json
+ import sys
+ import os
+ #from datasets import load_metric
+ from sklearn.metrics import classification_report
+ from pandas import read_csv
+ from sklearn.linear_model import LogisticRegression
+ import sklearn.model_selection
+ from sklearn.feature_extraction.text import TfidfTransformer
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.naive_bayes import MultinomialNB
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.pipeline import Pipeline, FeatureUnion
+ import math
+ from sklearn.metrics import accuracy_score
+ from sklearn.metrics import precision_recall_fscore_support
+ from sklearn.model_selection import train_test_split
+ #from sklearn.metrics import make_scorer
+ import json
+ import re
+ import numpy as np
+ import pandas as pd
+ import re
+ import nltk
level2.json ADDED
@@ -0,0 +1 @@
+ [{"source": "Non-performance", "target": "Non-performance", "value": 9}, {"source": "Non-performance", "target": "Investors", "value": 31}, {"source": "Non-performance", "target": "Customers", "value": 2}, {"source": "Non-performance", "target": "Employees", "value": 20}, {"source": "Non-performance", "target": "Society", "value": 41}, {"source": "Investors", "target": "Non-performance", "value": 31}, {"source": "Investors", "target": "Investors", "value": 18}, {"source": "Investors", "target": "Customers", "value": 6}, {"source": "Investors", "target": "Employees", "value": 6}, {"source": "Investors", "target": "Society", "value": 5}, {"source": "Customers", "target": "Non-performance", "value": 2}, {"source": "Customers", "target": "Investors", "value": 0}, {"source": "Customers", "target": "Customers", "value": 0}, {"source": "Customers", "target": "Employees", "value": 2}, {"source": "Customers", "target": "Society", "value": 0}, {"source": "Employees", "target": "Non-performance", "value": 20}, {"source": "Employees", "target": "Investors", "value": 6}, {"source": "Employees", "target": "Customers", "value": 4}, {"source": "Employees", "target": "Employees", "value": 4}, {"source": "Employees", "target": "Society", "value": 4}, {"source": "Society", "target": "Non-performance", "value": 41}, {"source": "Society", "target": "Investors", "value": 27}, {"source": "Society", "target": "Customers", "value": 8}, {"source": "Society", "target": "Employees", "value": 3}, {"source": "Society", "target": "Society", "value": 10}]
model (1).json ADDED
@@ -0,0 +1 @@
+ {"class_name": "Sequential", "config": {"name": "sequential", "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null], "dtype": "float32", "sparse": false, "ragged": false, "name": "embedding_input"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Embedding", "config": {"name": "embedding", "trainable": true, "dtype": "float32", "batch_input_shape": [null, null], "input_dim": 6794, "output_dim": 300, "embeddings_initializer": {"module": "keras.initializers", "class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}, "registered_name": null}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": true, "input_length": null}, "registered_name": null, "build_config": {"input_shape": [null, null]}}, {"module": "keras.layers", "class_name": "Bidirectional", "config": {"name": "bidirectional", "trainable": true, "dtype": "float32", "layer": {"module": "keras.layers", "class_name": "LSTM", "config": {"name": "lstm", "trainable": true, "dtype": "float32", "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 64, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "recurrent_initializer": {"module": "keras.initializers", "class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "registered_name": null}, "merge_mode": "concat"}, "registered_name": null, "build_config": {"input_shape": [null, null, 300]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", "units": 64, "activation": "relu", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 128]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "dtype": "float32", "units": 6, "activation": "softmax", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 64]}}]}, "keras_version": "2.14.0", "backend": "tensorflow"}
model (2).h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33cbad4eb52bdc1b4d57df65fd064f048238b6a27178881aef6f86767fde22f8
+ size 8957728
model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:482d0c2527effa137fea1fa7e0b5510a13d8c787c6630aad9074a334328eda15
+ size 16623687
packages.txt ADDED
@@ -0,0 +1 @@
+ wget
requirements (2).txt ADDED
@@ -0,0 +1,20 @@
+ numpy
+ cython
+ pandas
+ scikit-learn==1.2.2
+ streamlit
+ torch
+ transformers
+ huggingface-hub
+ pdfplumber
+ nltk
+ PyPdf2
+ xlsxwriter
+ gitpython
+ pathlib
+ gradio
+ Werkzeug==2.0.3
+ spacy==3.7
+ tensorflow
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz
+ accelerate
sample_anno.pdf ADDED
Binary file (21.5 kB).
 
tree.css ADDED
@@ -0,0 +1,15 @@
+ .node circle {
+   fill: #fff;
+   stroke: black;
+   stroke-width: 2px;
+ }
+
+ .node text {
+   font: 12px sans-serif;
+ }
+
+ .link {
+   fill: none;
+   stroke: #ccc;
+   stroke-width: 2px;
+ }
vectorizefile_classification.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:520709ee0c1cfe462d493cb64eb001a60133c12e4d0706e5483017c808dce51d
+ size 3532954