add
- .gitattributes +0 -35
- .gitignore +163 -0
- data/raw_data/.gitkeep +0 -0
- data/raw_data/sample_submission.csv +0 -0
- data/raw_data/test.csv +0 -0
- data/raw_data/train.csv +0 -0
- source/services/predicting_effective_arguments/train/seq_classification.py +7 -9
.gitattributes
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,163 @@
+# .gitignore specific to this project
+# ===================================
+source/services/predicting_effective_arguments/model/*
+dist
+build
+comlib.egg-info
+
+# tex files
+.log
+.synctex.gz
+.toc
+.aux
+.out
+.idx
+.bbl
+.blg
+
+# env file
+.env
+
+# data
+sensitive/*
+.cache/*
+router/cache/*
+database.json
+data/*
+log/*
+!log/.empty
+uploads/*
+
+# models
+model/*
+
+# ide's config
+.idea/*
+.ropeproject
+
+# python generated files
+__pycache__
+.mypy_cache
+*.egg-info
+*.pyc
+dependencies/python-pdfbox/build
+dependencies/python-pdfbox/dist
+
+# log files
+corpus.log
+
+# local config files
+
+
+# Default .gitignore file
+# =======================
+
+# Mac OS X
+.DS_Store
+
+# Windows image file caches
+Thumbs.db
+ehthumbs.db
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+# Vagrant
+.vagrant/
+
+# IntelliJ
+.idea/
+*.iml
+*.iws
+
+# Eclipse
+.classpath
+.project
+.settings/
+
+# Maven
+log/
+target/
+
+# Gradle
+.gradle/
+build/
+
+# SASS
+**/.sass-cache
+**/.sass-cache/*
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Translations
+*.mo
+
+# Mr Developer
+.mr.developer.cfg
+.project
+.pydevproject
+
+# Rope
+.ropeproject
+
+# Django stuff:
+*.log
+*.pot
+
+# Sphinx documentation
+docs/_build/
+
+# VSCode
+.vscode
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+data_exploration/data/anonymization.xlsx
+data_exploration/data/~$anonymization.xlsx
+data_exploration/data/query4_results.pdf
+data_exploration/data/query5_results.pdf
+data_exploration/src/data_versioning_problem_illustrate.pptx
+data_exploration/src/
data/raw_data/.gitkeep
DELETED
File without changes

data/raw_data/sample_submission.csv
CHANGED
File without changes

data/raw_data/test.csv
CHANGED
File without changes

data/raw_data/train.csv
CHANGED
The diff for this file is too large to render.
source/services/predicting_effective_arguments/train/seq_classification.py
CHANGED
@@ -56,10 +56,10 @@ if __name__ == '__main__':
     test_size = 0.1
 
     # First split: Separate out the training set
-    train_df, temp_df = train_test_split(data, test_size=1 - train_size)
+    train_df, temp_df = train_test_split(data, test_size=1 - train_size, random_state=5600)
 
     # Second split: Separate out the validation and test sets
-    valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size))
+    valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size), random_state=5600)
 
 
     train_df = prepare_input_text(train_df, sep_token=tokenizer.sep_token)
@@ -69,14 +69,15 @@ if __name__ == '__main__':
     train_dataset = Dataset.from_pandas(train_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
     val_dataset = Dataset.from_pandas(valid_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
     test_dataset = Dataset.from_pandas(test_df[['inputs', config.TARGET]]).rename_column(config.TARGET, 'label').class_encode_column("label")
-
+    labels = train_dataset.features["label"].names
     train_tok_dataset = seqClassifer.tokenize_dataset(dataset=train_dataset)
     val_tok_dataset = seqClassifer.tokenize_dataset(dataset=val_dataset)
     test_tok_dataset = seqClassifer.tokenize_dataset(dataset=test_dataset)
 
     seqClassifer.train(train_dataset=train_tok_dataset, eval_dataset=val_tok_dataset, epochs=1, batch_size=16)
-
-    seqClassifer.
+    y_valid_pred = seqClassifer.predict_valid_data(val_tok_dataset)
+    seqClassifer.plot_confusion_matrix(y_preds=y_valid_pred, y_true=val_dataset['label'], labels=labels)
+    y_test_pred = seqClassifer.predict_test_data(model_checkpoint=config.MODEL_OUTPUT_DIR, test_list=test_df['inputs'].tolist())
     pass
 
     """
@@ -94,7 +95,4 @@ if __name__ == '__main__':
     plt.suptitle("")
     plt.xlabel("")
     plt.show()
-    """
-
-
-    pass
+    """
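
Note on the revised split (lines 56-62 above): the two train_test_split calls implement a train/validation/test split, and pinning random_state=5600 makes both splits reproducible. The first call holds out 1 - train_size of the rows; the second divides that held-out portion so that test_size / (test_size + valid_size) of it becomes the test set. A minimal sketch of the arithmetic, assuming train_size=0.8 and valid_size=0.1 (those two values are defined outside this hunk, so 0.8 and 0.1 are assumptions here) and a toy DataFrame with placeholder column names:

import pandas as pd
from sklearn.model_selection import train_test_split

train_size, valid_size, test_size = 0.8, 0.1, 0.1  # assumed fractions

# Toy stand-in for the real training data.
data = pd.DataFrame({"text": [f"argument {i}" for i in range(1000)],
                     "target": ["Effective"] * 1000})

# First split: keep 80% for training, hold out the remaining 20%.
train_df, temp_df = train_test_split(data, test_size=1 - train_size, random_state=5600)

# Second split: 0.1 / (0.1 + 0.1) = 0.5, so the held-out 20% is halved
# into validation and test sets of 10% each.
valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size), random_state=5600)

print(len(train_df), len(valid_df), len(test_df))  # 800 100 100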
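
The other additions (lines 72 and 78-80) pull the class names from the encoded label column and evaluate the validation set with a confusion matrix before predicting on the test split. predict_valid_data and plot_confusion_matrix are project methods whose implementations are not part of this diff; as a hedged sketch only, a row-normalized confusion matrix over predicted vs. true label ids could be drawn with scikit-learn like this (the function name and arguments mirror the call site, everything else is an assumption):

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
    # Row-normalized confusion matrix: each row sums to 1, so a cell reads as
    # "fraction of true class X predicted as class Y".
    cm = confusion_matrix(y_true, y_preds, normalize="true")
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap="Blues", values_format=".2f", colorbar=False)
    plt.title("Normalized confusion matrix (validation set)")
    plt.tight_layout()
    plt.show()

# Hypothetical usage, matching the call in the diff:
# plot_confusion_matrix(y_preds=y_valid_pred, y_true=val_dataset["label"], labels=labels)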