Spaces:
Sleeping
Sleeping
João Pedro
committed on
Commit
·
edcda91
1
Parent(s):
e6ad839
add wandb to pre_processing
Browse files- pre_processing.py +21 -1
pre_processing.py
CHANGED
@@ -10,6 +10,7 @@ from transformers import BertTokenizer
|
|
10 |
from constants import (RAW_DATA_DIR,
|
11 |
PROCESSED_DATA_DIR,
|
12 |
METADATA_FILEPATH,
|
|
|
13 |
BERT_BASE,
|
14 |
MAX_SEQUENCE_LENGHT,
|
15 |
FilePath,
|
@@ -18,6 +19,8 @@ from constants import (RAW_DATA_DIR,
|
|
18 |
# Allow for unlimited image size, some documents are pretty big...
|
19 |
Image.MAX_IMAGE_PIXELS = None
|
20 |
|
|
|
|
|
21 |
|
22 |
def make_page_filepaths(basename, label, page_index) -> Tuple[str, str]:
|
23 |
out_dirname = path.join(PROCESSED_DATA_DIR, label)
|
@@ -108,4 +111,21 @@ def process_training_data() -> pd.DataFrame:
|
|
108 |
return pages_metadata_df
|
109 |
|
110 |
|
111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
from constants import (RAW_DATA_DIR,
|
11 |
PROCESSED_DATA_DIR,
|
12 |
METADATA_FILEPATH,
|
13 |
+
PROJECT_NAME,
|
14 |
BERT_BASE,
|
15 |
MAX_SEQUENCE_LENGHT,
|
16 |
FilePath,
|
|
|
19 |
# Allow for unlimited image size, some documents are pretty big...
|
20 |
Image.MAX_IMAGE_PIXELS = None
|
21 |
|
22 |
+
run = wandb.init(project=PROJECT_NAME, name='pre-processing')
|
23 |
+
|
24 |
|
25 |
def make_page_filepaths(basename, label, page_index) -> Tuple[str, str]:
|
26 |
out_dirname = path.join(PROCESSED_DATA_DIR, label)
|
|
|
111 |
return pages_metadata_df
|
112 |
|
113 |
|
114 |
+
def main():
|
115 |
+
metadata_df = process_training_data()
|
116 |
+
|
117 |
+
raw_dataset_artifact = wandb.Artifact("raw-dataset", type="dataset")
|
118 |
+
raw_dataset_artifact.add_dir(RAW_DATA_DIR)
|
119 |
+
run.log_artifact(raw_dataset_artifact)
|
120 |
+
|
121 |
+
processed_dataset_artifact = wandb.Artifact("processed-dataset", type="dataset")
|
122 |
+
processed_dataset_artifact.add_dir(PROCESSED_DATA_DIR)
|
123 |
+
run.log_artifact(processed_dataset_artifact)
|
124 |
+
|
125 |
+
dataset_metadata_artifact = wandb.Artifact("dataset-metadata", type="dataset")
|
126 |
+
dataset_metadata_table = wandb.Table(dataframe=metadata_df)
|
127 |
+
dataset_metadata_artifact.add(dataset_metadata_table, name='metadata-table')
|
128 |
+
run.log_artifact(dataset_metadata_artifact)
|
129 |
+
|
130 |
+
if __name__ == '__main__':
|
131 |
+
main()
|