aimlnerd committed
Commit f7abe49 · 1 Parent(s): 67d83f0
.gitattributes DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,163 @@
+ # .gitignore specific to this project
+ # ===================================
+ source/services/predicting_effective_arguments/model/*
+ dist
+ build
+ comlib.egg-info
+
+ # tex files
+ .log
+ .synctex.gz
+ .toc
+ .aux
+ .out
+ .idx
+ .bbl
+ .blg
+
+ # env file
+ .env
+
+ # data
+ sensitive/*
+ .cache/*
+ router/cache/*
+ database.json
+ data/*
+ log/*
+ !log/.empty
+ uploads/*
+
+ # models
+ model/*
+
+ # ide's config
+ .idea/*
+ .ropeproject
+
+ # python generated files
+ __pycache__
+ .mypy_cache
+ *.egg-info
+ *.pyc
+ dependencies/python-pdfbox/build
+ dependencies/python-pdfbox/dist
+
+ # log files
+ corpus.log
+
+ # local config files
+
+
+ # Default .gitignore file
+ # =======================
+
+ # Mac OS X
+ .DS_Store
+
+ # Windows image file caches
+ Thumbs.db
+ ehthumbs.db
+
+ # Folder config file
+ Desktop.ini
+
+ # Recycle Bin used on file shares
+ $RECYCLE.BIN/
+
+ # Windows Installer files
+ *.cab
+ *.msi
+ *.msm
+ *.msp
+
+ # Windows shortcuts
+ *.lnk
+
+ # Vagrant
+ .vagrant/
+
+ # IntelliJ
+ .idea/
+ *.iml
+ *.iws
+
+ # Eclipse
+ .classpath
+ .project
+ .settings/
+
+ # Maven
+ log/
+ target/
+
+ # Gradle
+ .gradle/
+ build/
+
+ # SASS
+ **/.sass-cache
+ **/.sass-cache/*
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ bin/
+ build/
+ develop-eggs/
+ dist/
+ eggs/
+ lib64/
+ parts/
+ sdist/
+ var/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ .tox/
+ .coverage
+ .cache
+ nosetests.xml
+ coverage.xml
+
+ # Translations
+ *.mo
+
+ # Mr Developer
+ .mr.developer.cfg
+ .project
+ .pydevproject
+
+ # Rope
+ .ropeproject
+
+ # Django stuff:
+ *.log
+ *.pot
+
+ # Sphinx documentation
+ docs/_build/
+
+ # VSCode
+ .vscode
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ data_exploration/data/anonymization.xlsx
+ data_exploration/data/~$anonymization.xlsx
+ data_exploration/data/query4_results.pdf
+ data_exploration/data/query5_results.pdf
+ data_exploration/src/data_versioning_problem_illustrate.pptx
+ data_exploration/src/
data/raw_data/.gitkeep DELETED
File without changes
data/raw_data/sample_submission.csv CHANGED
File without changes
data/raw_data/test.csv CHANGED
File without changes
data/raw_data/train.csv CHANGED
The diff for this file is too large to render. See raw diff
 
source/services/predicting_effective_arguments/train/seq_classification.py CHANGED
@@ -56,10 +56,10 @@ if __name__ == '__main__':
  test_size = 0.1

  # First split: Separate out the training set
- train_df, temp_df = train_test_split(data, test_size=1 - train_size)
+ train_df, temp_df = train_test_split(data, test_size=1 - train_size, random_state=5600)

  # Second split: Separate out the validation and test sets
- valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size))
+ valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size), random_state=5600)


  train_df = prepare_input_text(train_df, sep_token=tokenizer.sep_token)
@@ -69,14 +69,15 @@ if __name__ == '__main__':
  train_dataset = Dataset.from_pandas(train_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
  val_dataset = Dataset.from_pandas(valid_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
  test_dataset = Dataset.from_pandas(test_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
-
+ labels = train_dataset.features["label"].names
  train_tok_dataset = seqClassifer.tokenize_dataset(dataset=train_dataset)
  val_tok_dataset = seqClassifer.tokenize_dataset(dataset=val_dataset)
  test_tok_dataset = seqClassifer.tokenize_dataset(dataset=test_dataset)

  seqClassifer.train(train_dataset=train_tok_dataset, eval_dataset=val_tok_dataset, epochs=1, batch_size=16)
- y_pred = seqClassifer.predict_valid_data(val_tok_dataset)
- seqClassifer.predict_test_data(model_checkpoint=config.MODEL_OUTPUT_DIR, test_data=test_df['inputs'].tolist())
+ y_valid_pred = seqClassifer.predict_valid_data(val_tok_dataset)
+ seqClassifer.plot_confusion_matrix(y_preds=y_valid_pred, y_true=val_dataset['label'], labels=labels)
+ y_test_pred = seqClassifer.predict_test_data(model_checkpoint=config.MODEL_OUTPUT_DIR, test_list=test_df['inputs'].tolist())
  pass

  """
@@ -94,7 +95,4 @@ if __name__ == '__main__':
  plt.suptitle("")
  plt.xlabel("")
  plt.show()
- """
-
-
- pass
+ """
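In short, this commit pins both train_test_split calls with random_state=5600 so the train/validation/test split is reproducible, recovers the class names via train_dataset.features["label"].names, and plots a confusion matrix over the validation predictions before running test inference. The snippet below is a minimal standalone sketch of that flow; the project's seqClassifer wrapper is not shown in this diff, so the plot_confusion_matrix stand-in here uses scikit-learn's ConfusionMatrixDisplay as an assumption, and the 0.8/0.1/0.1 ratios are assumed (only the 0.1 values are visible in the surrounding code).

# Standalone sketch of the commit's intent (not the project's seqClassifer API):
# a seeded two-stage split plus a validation confusion-matrix plot.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay

SEED = 5600                                         # same fixed seed as in the diff
train_size, valid_size, test_size = 0.8, 0.1, 0.1   # assumed ratios; only test/valid = 0.1 are visible above

def split_data(data: pd.DataFrame):
    # First split: carve out the training set
    train_df, temp_df = train_test_split(data, test_size=1 - train_size, random_state=SEED)
    # Second split: divide the remainder into validation and test sets
    valid_df, test_df = train_test_split(
        temp_df, test_size=test_size / (test_size + valid_size), random_state=SEED
    )
    return train_df, valid_df, test_df

def plot_confusion_matrix(y_preds, y_true, labels):
    # Assumed stand-in for seqClassifer.plot_confusion_matrix: a row-normalized
    # confusion matrix rendered with scikit-learn's ConfusionMatrixDisplay.
    ConfusionMatrixDisplay.from_predictions(
        y_true, y_preds, display_labels=labels, normalize="true", cmap="Blues"
    )
    plt.title("Normalized confusion matrix")
    plt.show()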