KGBrain committed
Commit c521774 · 1 Parent(s): 14b6745

Upload 12 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Checkpoint-classification.sav filter=lfs diff=lfs merge=lfs -text
Checkpoint-classification.sav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:69238b2d6b5b3bd3b927c56b204cbf033ac304f34d3fe24c92ad6cda33c3017d
+ size 1693045
app.py ADDED
@@ -0,0 +1,581 @@
+ # import all packages
+ import requests
+ import streamlit as st
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.model_selection import train_test_split
+ from sklearn.model_selection import KFold
+ # tokenizer
+ from transformers import AutoTokenizer, DistilBertTokenizerFast
+ # sequence tagging model + training-related
+ from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
+ import torch
+ import sys
+ import os
+ from sklearn.metrics import classification_report
+ from pandas import read_csv
+ from sklearn.linear_model import LogisticRegression
+ import sklearn.model_selection
+ from sklearn.feature_extraction.text import TfidfTransformer
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.pipeline import Pipeline, FeatureUnion
+ import math
+ # from sklearn.metrics import accuracy_score
+ # from sklearn.metrics import precision_recall_fscore_support
+ import json
+ import re
+ import numpy as np
+ import pandas as pd
+ import nltk
+ nltk.download("punkt")
+ import string
+ from sklearn.model_selection import train_test_split
+ from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoConfig
+ from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+ import itertools
+ from transformers import TextClassificationPipeline, TFAutoModelForSequenceClassification, AutoTokenizer
+ from transformers import pipeline
+ import pickle
+ import csv
+ import pdfplumber
+ import pathlib
+ import shutil
+ import webbrowser
+ from streamlit.components.v1 import html
+ import streamlit.components.v1 as components
+ from PyPDF2 import PdfReader
+ from huggingface_hub import HfApi
+ import io
+ from datasets import load_dataset
+ import time
+
+ import huggingface_hub
+ from huggingface_hub import Repository
+ from datetime import datetime
+ from pathlib import Path
+ from requests import get
+ import urllib.request
+ # import gradio as gr
+ # from gradio import inputs, outputs
+ from datasets import load_dataset
+ from huggingface_hub import HfApi, list_models
+ import os
+ from huggingface_hub import HfFileSystem
+ from tensorflow.keras.models import Sequential, model_from_json
+ #import tensorflow_datasets as tfds
+ import tensorflow as tf
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ import spacy
+ from tensorflow.keras.preprocessing.text import Tokenizer
+ #from spacy import en_core_web_lg
+ #import en_core_web_lg
+ #nlp = en_core_web_lg.load()
+ nlp = spacy.load('en_core_web_sm')
+
+ #tfds.disable_progress_bar()
+ MAX_SEQUENCE_LENGTH = 500
+
+ # dataset = load_dataset('Seetha/Visualization', streaming=True)
+ # df = pd.DataFrame.from_dict(dataset['train'])
+ # DATASET_REPO_URL = "https://huggingface.co/datasets/Seetha/Visualization"
+ # DATA_FILENAME = "level2.json"
+ #DATA_FILE = os.path.join("data", DATA_FILENAME)
+ DATASET_REPO_URL = "https://huggingface.co/datasets/Seetha/visual_files"
+ DATA_FILENAME = "detailedResults.json"
+ DATA_FILENAME1 = "level2.json"
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ #st.write("is none?", HF_TOKEN is None)
+
+ def main():
+
+     st.title("Text to Causal Knowledge Graph")
+     st.sidebar.title("Please upload your text documents in one file here:")
+     k=2
+     seed = 1
+     k1= 5
+     text_list = []
+     causal_sents = []
+
+     uploaded_file = None
+     try:
+         uploaded_file = st.sidebar.file_uploader("Choose a file", type = "pdf")
+     except:
+         uploaded_file = PdfReader('sample_anno.pdf')
+         st.error("Please upload your own PDF to be analyzed")
+
+     if uploaded_file is not None:
+         reader = PdfReader(uploaded_file)
+         for page in reader.pages:
+             text = page.extract_text()
+             text_list.append(text)
+     else:
+         st.error("Please upload your own PDF to be analyzed")
+         st.stop()
+
+     text_list_final = [x.replace('\n', '') for x in text_list]
+     text_list_final = re.sub('"', '', str(text_list_final))
+
+     sentences = nltk.sent_tokenize(text_list_final)
+
+     result =[]
+     for i in sentences:
+         result1 = i.lower()
+         result2 = re.sub(r'[^\w\s]','',result1)
+         result.append(result2)
+
+     #st.write("--- %s seconds ---" % (time.time() - start_time))
+     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") #bert-base-uncased
+
+     model_path = "checkpoint-2850"
+
+     model = AutoModelForSequenceClassification.from_pretrained(model_path,id2label={0:'non-causal',1:'causal'})
+
+     #st.write('sequence classification loaded')
+     pipe1 = pipeline("text-classification", model=model,tokenizer=tokenizer)
+     for sent in result:
+         pred = pipe1(sent)
+         for lab in pred:
+             if lab['label'] == 'causal': #causal
+                 causal_sents.append(sent)
+
+     # st.write('causal sentence classification finished')
+     # st.write("--- %s seconds ---" % (time.time() - start_time))
+
+     model_name = "distilbert-base-cased"
+     tokenizer = DistilBertTokenizerFast.from_pretrained(model_name,low_cpu_mem_usage=True)
+
+     model_path1 = "DistilBertforTokenclassification"
+
+     model = DistilBertForTokenClassification.from_pretrained(model_path1,low_cpu_mem_usage=True) #len(unique_tags),, num_labels= 7, , id2label={0:'CT',1:'E',2:'C',3:'O'}
+     pipe = pipeline('ner', model=model, tokenizer=tokenizer,aggregation_strategy='simple') #grouped_entities=True
+     st.write('DistilBERT loaded')
+     sentence_pred = []
+     class_list = []
+     entity_list = []
+     for k in causal_sents:
+         pred= pipe(k)
+         #st.write(pred)
+         #st.write('preds')
+         for i in pred:
+             sentence_pred.append(k)
+             class_list.append(i['word'])
+             entity_list.append(i['entity_group'])
+
+     # st.write('causality extraction finished')
+     # st.write("--- %s seconds ---" % (time.time() - start_time))
+
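+     # Load the pickled vectorizer and classifier that assign each extracted cause/effect phrase a level-2 (stakeholder) label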
+     filename = 'Checkpoint-classification.sav'
+     loaded_model = pickle.load(open(filename, 'rb'))
+     loaded_vectorizer = pickle.load(open('vectorizefile_classification.pickle', 'rb'))
+
+     pipeline_test_output = loaded_vectorizer.transform(class_list)
+     predicted = loaded_model.predict(pipeline_test_output)
+
+     # tokenizer = Tokenizer(num_words=100000)
+     # tokenizer.fit_on_texts(class_list)
+     # word_index = tokenizer.word_index
+     # text_embedding = np.zeros((len(word_index) + 1, 300))
+     # for word, i in word_index.items():
+     # text_embedding[i] = nlp(word).vector
+     # json_file = open('model.json', 'r')
+     # loaded_model_json = json_file.read()
+     # json_file.close()
+     # loaded_model = model_from_json(loaded_model_json)
+     # # load weights into new model
+     # loaded_model.load_weights("model.h5")
+
+     # loss = tf.keras.losses.CategoricalCrossentropy() #from_logits=True
+     # loaded_model.compile(loss=loss,optimizer=tf.keras.optimizers.Adam(1e-4))
+
+     # predictions = loaded_model.predict(pad_sequences(tokenizer.texts_to_sequences(class_list),maxlen=MAX_SEQUENCE_LENGTH))
+     # predicted = np.argmax(predictions,axis=1)
+
+     # st.write(predictions)
+     # st.write(predicted)
+     # st.write('stakeholder taxonomy finished')
+     # st.write("--- %s seconds ---" % (time.time() - start_time))
+     pred1 = predicted
+     level0 = []
+     count =0
+     for i in predicted:
+         if i == 3:
+             level0.append('Non-Performance')
+             count +=1
+         else:
+             level0.append('Performance')
+             count +=1
+
+     list_pred = {0: 'Customers',1:'Employees',2:'Investors',3:'Non-performance',4:'Society',5:'Unclassified'}
+     pred_val = [list_pred[i] for i in pred1]
+
+     #print('count',count)
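+     # Override the predicted level-2 label whenever the phrase itself explicitly names a stakeholder group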
+     for ind,(sent,preds) in enumerate(zip(class_list,pred_val)):
+         if 'customers' in sent or 'client' in sent or 'consumer' in sent or 'user' in sent:
+             pred_val[ind] = 'Customers'
+         elif 'investor' in sent or 'finance' in sent or 'shareholder' in sent or 'stockholder' in sent or 'owners' in sent:
+             pred_val[ind] = 'Investors'
+         elif 'employee' in sent or 'worker' in sent or 'staff' in sent:
+             pred_val[ind] = 'Employees'
+         elif 'society' in sent or 'societal' in sent or 'social responsib*' in sent or 'social performance' in sent or 'community' in sent:
+             pred_val[ind] = 'Society'
+
+     sent_id, unique = pd.factorize(sentence_pred)
+
+     final_list = pd.DataFrame(
+         {'Id': sent_id,
+          'Fullsentence': sentence_pred,
+          'Component': class_list,
+          'causeOrEffect': entity_list,
+          'Labellevel1': level0,
+          'Labellevel2': pred_val
+         })
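+     # Re-attach WordPiece continuation tokens (prefixed with '##') to the preceding component before building the final table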
+     s = final_list['Component'].shift(-1)
+     m = s.str.startswith('##', na=False)
+     final_list.loc[m, 'Component'] += (' ' + s[m])
+
+
+     final_list1 = final_list[~final_list['Component'].astype(str).str.startswith('##')]
+     li = []
+     uni = final_list1['Id'].unique()
+     for i in uni:
+         df_new = final_list1[final_list1['Id'] == i]
+         uni1 = df_new['Id'].unique()
+
+         # if 'E' not in df_new.values:
+         # li.append(uni1)
+     # out = np.concatenate(li).ravel()
+     # li_pan = pd.DataFrame(out,columns=['Id'])
+     # df3 = pd.merge(final_list1, li_pan[['Id']], on='Id', how='left', indicator=True) \
+     # .query("_merge == 'left_only'") \
+     # .drop("_merge",axis=1)
+
+     df3 = final_list1
+     #df = df3.groupby(['Id','Fullsentence','causeOrEffect', 'Labellevel1', 'Labellevel2'])['Component'].apply(', '.join).reset_index()
+     #st.write(df)
+
+     #df = df3
+     df3["causeOrEffect"].replace({"C": "cause", "E": "effect"}, inplace=True)
+     df_final = df3[df3['causeOrEffect'] != 'CT']
+     df3['New string'] = df_final['Component'].replace(r'[##]+', ' ', regex=True)
+
+     df_final = df_final.drop("Component",axis=1)
+     df_final.insert(2, "Component", df3['New string'], True)
+
+     df_final1 = df_final[df_final['Component'].str.split().str.len().gt(1)]
+     #st.write(df_final[df_final['Component'].str.len() != 1])
+     #df_final1.to_csv('predictions.csv')
+
+     # buffer = io.BytesIO()
+     # with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
+     # df_final.to_excel(writer, sheet_name="Sheet1", index=False)
+     # writer.close()
+
+     count_NP_NP = 0
+     count_NP_investor = 0
+     count_NP_customer = 0
+     count_NP_employees = 0
+     count_NP_society = 0
+
+     count_inv_np = 0
+     count_inv_investor = 0
+     count_inv_customer = 0
+     count_inv_employee = 0
+     count_inv_society = 0
+
+     count_cus_np = 0
+     count_cus_investor = 0
+     count_cus_customer = 0
+     count_cus_employee = 0
+     count_cus_society = 0
+
+     count_emp_np = 0
+     count_emp_investor = 0
+     count_emp_customer = 0
+     count_emp_employee = 0
+     count_emp_society = 0
+
+     count_soc_np = 0
+     count_soc_investor = 0
+     count_soc_customer = 0
+     count_soc_employee = 0
+     count_soc_society = 0
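+     # For each sentence, count cause->effect stakeholder pairs; the accumulated totals fill the df_tab adjacency matrix below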
+     for i in range(0, df_final['Id'].max() + 1):
+         j = df_final.loc[df_final['Id'] == i]
+         cause_tab = j.loc[j['causeOrEffect'] == 'cause']
+         effect_tab = j.loc[j['causeOrEffect'] == 'effect']
+         cause_coun_NP = (cause_tab.Labellevel2 == 'Non-performance').sum()
+         effect_coun_NP = (effect_tab.Labellevel2 == 'Non-performance').sum()
+
+         if (cause_coun_NP > 0) and (effect_coun_NP > 0):
+             count_NP = cause_coun_NP if cause_coun_NP >= effect_coun_NP else effect_coun_NP
+         else:
+             count_NP = 0
+         effect_NP_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_NP > 0) and (effect_NP_inv > 0):
+             count_NP_inv = cause_coun_NP if cause_coun_NP >= effect_NP_inv else effect_NP_inv
+         else:
+             count_NP_inv = 0
+         effect_NP_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_NP > 0) and (effect_NP_cus > 0):
+             count_NP_cus = cause_coun_NP if cause_coun_NP >= effect_NP_cus else effect_NP_cus
+         else:
+             count_NP_cus = 0
+         effect_NP_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_NP > 0) and (effect_NP_emp > 0):
+             count_NP_emp = cause_coun_NP if cause_coun_NP >= effect_NP_emp else effect_NP_emp
+         else:
+             count_NP_emp = 0
+         effect_NP_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_NP > 0) and (effect_NP_soc > 0):
+             count_NP_soc = cause_coun_NP if cause_coun_NP >= effect_NP_soc else effect_NP_soc
+         else:
+             count_NP_soc = 0
+
+         cause_coun_inv = (cause_tab.Labellevel2 == 'Investors').sum()
+         effect_coun_inv = (effect_tab.Labellevel2 == 'Non-performance').sum()
+         if (cause_coun_inv > 0) and (effect_coun_inv > 0):
+             count_NP_inv = cause_coun_inv if cause_coun_inv >= effect_coun_inv else effect_coun_inv
+         else:
+             count_NP_inv = 0
+
+         effect_inv_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_inv > 0) and (effect_inv_inv > 0):
+             count_inv_inv = cause_coun_inv if cause_coun_inv >= effect_inv_inv else effect_inv_inv
+         else:
+             count_inv_inv = 0
+         effect_inv_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_inv > 0) and (effect_inv_cus > 0):
+             count_inv_cus = cause_coun_inv if cause_coun_inv >= effect_inv_cus else effect_inv_cus
+         else:
+             count_inv_cus = 0
+         effect_inv_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_inv > 0) and (effect_inv_emp > 0):
+             count_inv_emp = cause_coun_inv if cause_coun_inv >= effect_inv_emp else effect_inv_emp
+         else:
+             count_inv_emp = 0
+
+         effect_inv_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_inv > 0) and (effect_inv_soc > 0):
+             count_inv_soc = cause_coun_inv if cause_coun_inv >= effect_inv_soc else effect_inv_soc
+         else:
+             count_inv_soc = 0
+
+         cause_coun_cus = (cause_tab.Labellevel2 == 'Customers').sum()
+         effect_coun_cus = (effect_tab.Labellevel2 == 'Non-performance').sum()
+         if (cause_coun_cus > 0) and (effect_coun_cus > 0):
+             count_NP_cus = cause_coun_cus if cause_coun_cus >= effect_coun_cus else effect_coun_cus
+         else:
+             count_NP_cus = 0
+
+         effect_cus_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_cus > 0) and (effect_cus_inv > 0):
+             count_cus_inv = cause_coun_cus if cause_coun_cus >= effect_cus_inv else effect_cus_inv
+         else:
+             count_cus_inv = 0
+
+         effect_cus_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_cus > 0) and (effect_cus_cus > 0):
+             count_cus_cus = cause_coun_cus if cause_coun_cus >= effect_cus_cus else effect_cus_cus
+         else:
+             count_cus_cus = 0
+
+         effect_cus_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_cus > 0) and (effect_cus_emp > 0):
+             count_cus_emp = cause_coun_cus if cause_coun_cus >= effect_cus_emp else effect_cus_emp
+         else:
+             count_cus_emp = 0
+
+         effect_cus_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_cus > 0) and (effect_cus_soc > 0):
+             count_cus_soc = cause_coun_cus if cause_coun_cus >= effect_cus_soc else effect_cus_soc
+         else:
+             count_cus_soc = 0
+
+         cause_coun_emp = (cause_tab.Labellevel2 == 'Employees').sum()
+         effect_coun_emp = (effect_tab.Labellevel2 == 'Non-performance').sum()
+         if (cause_coun_emp > 0) and (effect_coun_emp > 0):
+             count_NP_emp = cause_coun_emp if cause_coun_emp >= effect_coun_emp else effect_coun_emp
+         else:
+             count_NP_emp = 0
+
+         effect_emp_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_emp > 0) and (effect_emp_inv > 0):
+             count_emp_inv = cause_coun_emp if cause_coun_emp >= effect_emp_inv else effect_emp_inv
+         else:
+             count_emp_inv = 0
+
+         effect_emp_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_emp > 0) and (effect_emp_cus > 0):
+             count_emp_cus = cause_coun_emp if cause_coun_emp >= effect_emp_cus else effect_emp_cus
+         else:
+             count_emp_cus = 0
+
+         effect_emp_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_emp > 0) and (effect_emp_emp > 0):
+             count_emp_emp = cause_coun_emp if cause_coun_emp >= effect_emp_emp else effect_emp_emp
+         else:
+             count_emp_emp = 0
+
+         effect_emp_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_emp > 0) and (effect_emp_soc > 0):
+             count_emp_soc = cause_coun_emp if cause_coun_emp >= effect_emp_soc else effect_emp_soc
+         else:
+             count_emp_soc = 0
+
+         cause_coun_soc = (cause_tab.Labellevel2 == 'Society').sum()
+         effect_coun_soc = (effect_tab.Labellevel2 == 'Non-performance').sum()
+         if (cause_coun_soc > 0) and (effect_coun_soc > 0):
+             count_NP_soc = cause_coun_soc if cause_coun_soc >= effect_coun_soc else effect_coun_soc
+         else:
+             count_NP_soc = 0
+
+         effect_soc_inv = (effect_tab.Labellevel2 == 'Investors').sum()
+         if (cause_coun_soc > 0) and (effect_soc_inv > 0):
+             count_soc_inv = cause_coun_soc if cause_coun_soc >= effect_soc_inv else effect_soc_inv
+         else:
+             count_soc_inv = 0
+
+         effect_soc_cus = (effect_tab.Labellevel2 == 'Customers').sum()
+         if (cause_coun_soc > 0) and (effect_soc_cus > 0):
+             count_soc_cus = cause_coun_soc if cause_coun_soc >= effect_soc_cus else effect_soc_cus
+         else:
+             count_soc_cus = 0
+
+         effect_soc_emp = (effect_tab.Labellevel2 == 'Employees').sum()
+         if (cause_coun_soc > 0) and (effect_soc_emp > 0):
+             count_soc_emp = cause_coun_soc if cause_coun_soc >= effect_soc_emp else effect_soc_emp
+         else:
+             count_soc_emp = 0
+
+         effect_soc_soc = (effect_tab.Labellevel2 == 'Society').sum()
+         if (cause_coun_soc > 0) and (effect_soc_soc > 0):
+             count_soc_soc = cause_coun_soc if cause_coun_soc >= effect_soc_soc else effect_soc_soc
+         else:
+             count_soc_soc = 0
+
+         count_NP_NP = count_NP_NP + count_NP
+         count_NP_investor = count_NP_investor + count_NP_inv
+         count_NP_customer = count_NP_customer + count_NP_cus
+         count_NP_employees = count_NP_employees + count_NP_emp
+         count_NP_society = count_NP_society + count_NP_soc
+
+         count_inv_np = count_inv_np + count_NP_inv
+         count_inv_investor = count_inv_investor + count_inv_inv
+         count_inv_customer = count_inv_customer + count_inv_cus
+         count_inv_employee = count_inv_employee + count_inv_emp
+         count_inv_society = count_inv_society + count_inv_soc
+
+         count_cus_np = count_cus_np + count_NP_cus
+         count_cus_investor = count_cus_investor + count_cus_inv
+         count_cus_customer = count_cus_customer + count_cus_cus
+         count_cus_employee = count_cus_employee + count_cus_emp
+         count_cus_society = count_cus_society + count_cus_soc
+
+         count_emp_np = count_emp_np + count_NP_emp
+         count_emp_investor = count_emp_investor + count_emp_inv
+         count_emp_customer = count_emp_customer + count_emp_cus
+         count_emp_employee = count_emp_employee + count_emp_emp
+         count_emp_society = count_emp_society + count_emp_soc
+
+         count_soc_np = count_soc_np + count_NP_soc
+         count_soc_investor = count_soc_investor + count_soc_inv
+         count_soc_customer = count_soc_customer + count_soc_cus
+         count_soc_employee = count_soc_employee + count_soc_emp
+         count_soc_society = count_soc_society + count_soc_soc
+
+     df_tab = pd.DataFrame(columns = ['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'],index=['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'], dtype=object)
+
+     df_tab.loc['Non-performance'] = [count_NP_NP, count_NP_investor, count_NP_customer, count_NP_employees, count_NP_society]
+     df_tab.loc['Investors'] = [count_inv_np, count_inv_investor, count_inv_customer, count_inv_employee, count_inv_society]
+     df_tab.loc['Customers'] = [count_cus_np, count_cus_investor, count_cus_customer, count_cus_employee, count_cus_society]
+     df_tab.loc['Employees'] = [count_emp_np, count_emp_investor, count_emp_customer, count_emp_employee, count_emp_society]
+     df_tab.loc['Society'] = [count_soc_np, count_soc_investor, count_soc_customer, count_soc_employee, count_soc_society]
+
+
+     # df_tab = pd.DataFrame({
+     # 'Non-performance': [count_NP_NP, count_NP_investor, count_NP_customer, count_NP_employees, count_NP_society],
+     # 'Investors': [count_inv_np, count_inv_investor, count_inv_customer, count_inv_employee, count_inv_society],
+     # 'Customers': [count_cus_np, count_cus_investor, count_cus_customer, count_cus_employee, count_cus_society],
+     # 'Employees': [count_emp_np, count_emp_investor, count_emp_customer, count_emp_employee, count_emp_society],
+     # 'Society': [count_soc_np, count_soc_investor, count_soc_customer, count_soc_employee, count_soc_society]},
+     # index=['Non-performance', 'Investors', 'Customers', 'Employees', 'Society'])
+
+     #df_tab.to_csv('final_data.csv')
+
+     buffer = io.BytesIO()
+     with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
+         df_tab.to_excel(writer,sheet_name="count_result",index=False)
+         df_final1.to_excel(writer,sheet_name="Detailed_results",index=False)
+         writer.close()
+     #df = pd.read_csv('final_data.csv', index_col=0)
+     #474-515
+     # Convert to JSON format
+     json_data = []
+     for row in df_tab.index:
+         for col in df_tab.columns:
+             json_data.append({
+                 'source': row,
+                 'target': col,
+                 'value': int(df_tab.loc[row, col])
+             })
+
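+     # Refresh the visualization data in the Seetha/visual_files dataset repo: delete the old JSON files and write the new ones via HfFileSystem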
+     HfApi().delete_file(path_in_repo = DATA_FILENAME1 ,repo_id = 'Seetha/visual_files',token= HF_TOKEN,repo_type='dataset')
+     #st.write('file-deleted')
+     fs = HfFileSystem(token=HF_TOKEN)
+     with fs.open('datasets/Seetha/visual_files/level2.json', 'w') as f:
+         json.dump(json_data, f)
+
+     df_final1.to_csv('predictions.csv')
+     csv_file = "predictions.csv"
+     json_file = "detailedResults.json"
+
+     # Open the CSV file and read the data
+     with open(csv_file, "r") as f:
+         csv_data = csv.DictReader(f)
+
+         # # Convert the CSV data to a list of dictionaries
+         data_list = []
+         for row in csv_data:
+             data_list.append(dict(row))
+
+     # # Convert the list of dictionaries to JSON
+     json_data = json.dumps(data_list)
+
+     HfApi().delete_file(path_in_repo = DATA_FILENAME ,repo_id = 'Seetha/visual_files',token= HF_TOKEN,repo_type='dataset')
+     #st.write('file2-deleted')
+     with fs.open('datasets/Seetha/visual_files/detailedResults.json','w') as fi:
+         #data = json.load(fi)
+         fi.write(json_data)
+
+     def convert_df(df):
+
+         #IMPORTANT: Cache the conversion to prevent computation on every rerun
+
+         return df.to_csv().encode('utf-8')
+
+
+
+     csv1 = convert_df(df_final1.astype(str))
+     csv2 = convert_df(df_tab.astype(str))
+
+     with st.container():
+
+         st.download_button(label="Download the result table",data=buffer,file_name="t2cg_outputs.xlsx",mime="application/vnd.ms-excel")
+         st.markdown('<a href="https://huggingface.co/spaces/Seetha/visual-knowledgegraph" target="_blank">Click this link in a separate tab to view knowledge graph</a>', unsafe_allow_html=True)
+         # st.download_button(label="Download the detailed result table_csv",data=csv1,file_name='results.csv',mime='text/csv')
+         # st.download_button(label="Download the result table_csv",data=csv2,file_name='final_data.csv',mime='text/csv')
+
+     #with st.container():
+     # Execute your app
+     #st.title("Visualization example")
+     # components.html(source_code)
+     #html(my_html)
+     #webbrowser.open('https://huggingface.co/spaces/Seetha/visual-knowledgegraph')
+     # # embed streamlit docs in a streamlit app
+     # #components.iframe("https://webpages.charlotte.edu/ltotapal/")
+
+
+
+ if __name__ == '__main__':
+     start_time = time.time()
+     main()
gitignore ADDED
@@ -0,0 +1,36 @@
+ import requests
+ import streamlit as st
+ from sklearn.model_selection import StratifiedKFold
+ from sklearn.model_selection import train_test_split
+ from sklearn.model_selection import KFold
+ # tokenizer
+ from transformers import AutoTokenizer, DistilBertTokenizerFast
+ # sequence tagging model + training-related
+ from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
+ import numpy as np
+ import pandas as pd
+ import torch
+ import json
+ import sys
+ import os
+ #from datasets import load_metric
+ from sklearn.metrics import classification_report
+ from pandas import read_csv
+ from sklearn.linear_model import LogisticRegression
+ import sklearn.model_selection
+ from sklearn.feature_extraction.text import TfidfTransformer
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.naive_bayes import MultinomialNB
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.pipeline import Pipeline, FeatureUnion
+ import math
+ from sklearn.metrics import accuracy_score
+ from sklearn.metrics import precision_recall_fscore_support
+ from sklearn.model_selection import train_test_split
+ #from sklearn.metrics import make_scorer
+ import json
+ import re
+ import numpy as np
+ import pandas as pd
+ import re
+ import nltk
level2.json ADDED
@@ -0,0 +1 @@
+ [{"source": "Non-performance", "target": "Non-performance", "value": 9}, {"source": "Non-performance", "target": "Investors", "value": 31}, {"source": "Non-performance", "target": "Customers", "value": 2}, {"source": "Non-performance", "target": "Employees", "value": 20}, {"source": "Non-performance", "target": "Society", "value": 41}, {"source": "Investors", "target": "Non-performance", "value": 31}, {"source": "Investors", "target": "Investors", "value": 18}, {"source": "Investors", "target": "Customers", "value": 6}, {"source": "Investors", "target": "Employees", "value": 6}, {"source": "Investors", "target": "Society", "value": 5}, {"source": "Customers", "target": "Non-performance", "value": 2}, {"source": "Customers", "target": "Investors", "value": 0}, {"source": "Customers", "target": "Customers", "value": 0}, {"source": "Customers", "target": "Employees", "value": 2}, {"source": "Customers", "target": "Society", "value": 0}, {"source": "Employees", "target": "Non-performance", "value": 20}, {"source": "Employees", "target": "Investors", "value": 6}, {"source": "Employees", "target": "Customers", "value": 4}, {"source": "Employees", "target": "Employees", "value": 4}, {"source": "Employees", "target": "Society", "value": 4}, {"source": "Society", "target": "Non-performance", "value": 41}, {"source": "Society", "target": "Investors", "value": 27}, {"source": "Society", "target": "Customers", "value": 8}, {"source": "Society", "target": "Employees", "value": 3}, {"source": "Society", "target": "Society", "value": 10}]
model (1).json ADDED
@@ -0,0 +1 @@
+ {"class_name": "Sequential", "config": {"name": "sequential", "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_input_shape": [null, null], "dtype": "float32", "sparse": false, "ragged": false, "name": "embedding_input"}, "registered_name": null}, {"module": "keras.layers", "class_name": "Embedding", "config": {"name": "embedding", "trainable": true, "dtype": "float32", "batch_input_shape": [null, null], "input_dim": 6794, "output_dim": 300, "embeddings_initializer": {"module": "keras.initializers", "class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}, "registered_name": null}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": true, "input_length": null}, "registered_name": null, "build_config": {"input_shape": [null, null]}}, {"module": "keras.layers", "class_name": "Bidirectional", "config": {"name": "bidirectional", "trainable": true, "dtype": "float32", "layer": {"module": "keras.layers", "class_name": "LSTM", "config": {"name": "lstm", "trainable": true, "dtype": "float32", "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 64, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "recurrent_initializer": {"module": "keras.initializers", "class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "registered_name": null}, "merge_mode": "concat"}, "registered_name": null, "build_config": {"input_shape": [null, null, 300]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", "units": 64, "activation": "relu", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 128]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "dtype": "float32", "units": 6, "activation": "softmax", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 64]}}]}, "keras_version": "2.14.0", "backend": "tensorflow"}
model (2).h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:33cbad4eb52bdc1b4d57df65fd064f048238b6a27178881aef6f86767fde22f8
+ size 8957728
model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:482d0c2527effa137fea1fa7e0b5510a13d8c787c6630aad9074a334328eda15
+ size 16623687
packages.txt ADDED
@@ -0,0 +1 @@
+ wget
requirements (2).txt ADDED
@@ -0,0 +1,20 @@
+ numpy
+ cython
+ pandas
+ scikit-learn==1.2.2
+ streamlit
+ torch
+ transformers
+ huggingface-hub
+ pdfplumber
+ nltk
+ PyPdf2
+ xlsxwriter
+ gitpython
+ pathlib
+ gradio
+ Werkzeug==2.0.3
+ spacy==3.7
+ tensorflow
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0.tar.gz
+ accelerate
sample_anno.pdf ADDED
Binary file (21.5 kB).
 
tree.css ADDED
@@ -0,0 +1,15 @@
+ .node circle {
+   fill: #fff;
+   stroke: black;
+   stroke-width: 2px;
+ }
+
+ .node text {
+   font: 12px sans-serif;
+ }
+
+ .link {
+   fill: none;
+   stroke: #ccc;
+   stroke-width: 2px;
+ }
vectorizefile_classification.pickle ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:520709ee0c1cfe462d493cb64eb001a60133c12e4d0706e5483017c808dce51d
+ size 3532954