João Pedro committed on
Commit
edcda91
·
1 Parent(s): e6ad839

add wandb to pre_processing

Browse files
Files changed (1) hide show
  1. pre_processing.py +21 -1
pre_processing.py CHANGED
@@ -10,6 +10,7 @@ from transformers import BertTokenizer
10
  from constants import (RAW_DATA_DIR,
11
  PROCESSED_DATA_DIR,
12
  METADATA_FILEPATH,
 
13
  BERT_BASE,
14
  MAX_SEQUENCE_LENGHT,
15
  FilePath,
@@ -18,6 +19,8 @@ from constants import (RAW_DATA_DIR,
18
  # Allow for unlimited image size, some documents are pretty big...
19
  Image.MAX_IMAGE_PIXELS = None
20
 
 
 
21
 
22
  def make_page_filepaths(basename, label, page_index) -> Tuple[str, str]:
23
  out_dirname = path.join(PROCESSED_DATA_DIR, label)
@@ -108,4 +111,21 @@ def process_training_data() -> pd.DataFrame:
108
  return pages_metadata_df
109
 
110
 
111
- process_training_data()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  from constants import (RAW_DATA_DIR,
11
  PROCESSED_DATA_DIR,
12
  METADATA_FILEPATH,
13
+ PROJECT_NAME,
14
  BERT_BASE,
15
  MAX_SEQUENCE_LENGHT,
16
  FilePath,
 
19
  # Allow for unlimited image size, some documents are pretty big...
20
  Image.MAX_IMAGE_PIXELS = None
21
 
22
+ run = wandb.init(project=PROJECT_NAME, name='pre-processing')
23
+
24
 
25
  def make_page_filepaths(basename, label, page_index) -> Tuple[str, str]:
26
  out_dirname = path.join(PROCESSED_DATA_DIR, label)
 
111
  return pages_metadata_df
112
 
113
 
114
+ def main():
115
+ metadata_df = process_training_data()
116
+
117
+ raw_dataset_artifact = wandb.Artifact("raw-dataset", type="dataset")
118
+ raw_dataset_artifact.add_dir(RAW_DATA_DIR)
119
+ run.log_artifact(raw_dataset_artifact)
120
+
121
+ processed_dataset_artifact = wandb.Artifact("processed-dataset", type="dataset")
122
+ processed_dataset_artifact.add_dir(PROCESSED_DATA_DIR)
123
+ run.log_artifact(processed_dataset_artifact)
124
+
125
+ dataset_metadata_artifact = wandb.Artifact("dataset-metadata", type="dataset")
126
+ dataset_metadata_table = wandb.Table(dataframe=metadata_df)
127
+ dataset_metadata_artifact.add(dataset_metadata_table, name='metadata-table')
128
+ run.log_artifact(dataset_metadata_artifact)
129
+
130
+ if __name__ == '__main__':
131
+ main()