mserras committed on
Commit
f06fcfd
·
1 Parent(s): 77f8535

Update load_data for own dataset sync

Browse files
Files changed (1) hide show
  1. load_data.py +12 -9
load_data.py CHANGED
@@ -10,7 +10,7 @@ from datasets import load_dataset, concatenate_datasets
10
  from argilla.listeners import listener
11
 
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
- HUB_DATASET_NAME = os.environ.get('HUB_DATASET_NAME')
14
 
15
  @listener(
16
  dataset="somos-alpaca-es",
@@ -44,20 +44,23 @@ class LoadDatasets:
44
  print(e)
45
  old_ds = None
46
 
47
- dataset = load_dataset("somosnlp/somos-clean-alpaca-es", split="train")
48
 
49
 
50
- if old_ds:
51
- print("Concatenating datasets")
52
- dataset = concatenate_datasets([dataset, old_ds])
53
- print("Concatenated dataset is:")
54
- print(dataset)
55
 
56
- dataset = dataset.remove_columns("metrics")
 
 
 
57
  records = rg.DatasetForTextClassification.from_datasets(dataset)
58
 
59
  settings = rg.TextClassificationSettings(
60
- label_schema=["BAD INSTRUCTION", "BAD INPUT", "BAD OUTPUT", "INAPPROPRIATE", "BIASED", "ALL GOOD"]
61
  )
62
  rg.configure_dataset(name="somos-alpaca-es", settings=settings, workspace="team")
63
 
 
10
  from argilla.listeners import listener
11
 
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
+ HUB_DATASET_NAME = "mserras/alpaca-es-hackaton"
14
 
15
  @listener(
16
  dataset="somos-alpaca-es",
 
44
  print(e)
45
  old_ds = None
46
 
47
+ # dataset = load_dataset("somosnlp/somos-clean-alpaca-es", split="train")
48
 
49
 
50
+ # if old_ds:
51
+ # print("Concatenating datasets")
52
+ # dataset = concatenate_datasets([dataset, old_ds])
53
+ # print("Concatenated dataset is:")
54
+ # print(dataset)
55
 
56
+ # dataset = dataset.remove_columns("metrics")
57
+ if not old_ds:
58
+ return
59
+ dataset = old_ds
60
  records = rg.DatasetForTextClassification.from_datasets(dataset)
61
 
62
  settings = rg.TextClassificationSettings(
63
+ label_schema=["BAD INSTRUCTION", "BAD INPUT", "BAD OUTPUT", "INAPPROPRIATE", "BIASED", "ALL GOOD", "HALLUCINATION", "UNPROCESSABLE"]
64
  )
65
  rg.configure_dataset(name="somos-alpaca-es", settings=settings, workspace="team")
66