#!/usr/bin/env python
# coding=utf-8
"""
Commonly used constants.
"""
TEXT_ONLY_DATASET_DESCRIPTION = (
"""
"text_only": a dataset with only raw text instances, in the following format:
{
    "type": "text_only",
    "instances": [
        { "text": "TEXT_1" },
        { "text": "TEXT_2" },
        ...
    ]
}
"""
).lstrip("\n")
TEXT_ONLY_DATASET_DETAILS = (
"""
For example,
```python
from lmflow.datasets import Dataset

data_dict = {
    "type": "text_only",
    "instances": [
        { "text": "Human: Hello. Bot: Hi!" },
        { "text": "Human: How are you today? Bot: Fine, thank you!" },
    ]
}
dataset = Dataset.create_from_dict(data_dict)
```
You may also save the corresponding format to a json file,
```python
import json
from lmflow.args import DatasetArguments
from lmflow.datasets import Dataset

data_dict = {
    "type": "text_only",
    "instances": [
        { "text": "Human: Hello. Bot: Hi!" },
        { "text": "Human: How are you today? Bot: Fine, thank you!" },
    ]
}
with open("data.json", "w") as fout:
    json.dump(data_dict, fout)

data_args = DatasetArguments(dataset_path="data.json")
dataset = Dataset(data_args)
new_data_dict = dataset.to_dict()
# `new_data_dict` should have the same content as `data_dict`
```
"""
).lstrip("\n")
TEXT2TEXT_DATASET_DESCRIPTION = (
"""
"text2text": a dataset with input & output instances, in the following format:
{
    "type": "text2text",
    "instances": [
        { "input": "INPUT_1", "output": "OUTPUT_1" },
        { "input": "INPUT_2", "output": "OUTPUT_2" },
        ...
    ]
}
"""
).lstrip("\n")
TEXT2TEXT_DATASET_DETAILS = (
"""
For example,
```python
from lmflow.datasets import Dataset

data_dict = {
    "type": "text2text",
    "instances": [
        {
            "input": "Human: Hello.",
            "output": "Bot: Hi!",
        },
        {
            "input": "Human: How are you today?",
            "output": "Bot: Fine, thank you! And you?",
        }
    ]
}
dataset = Dataset.create_from_dict(data_dict)
```
You may also save the corresponding format to a json file,
```python
import json
from lmflow.args import DatasetArguments
from lmflow.datasets import Dataset

data_dict = {
    "type": "text2text",
    "instances": [
        {
            "input": "Human: Hello.",
            "output": "Bot: Hi!",
        },
        {
            "input": "Human: How are you today?",
            "output": "Bot: Fine, thank you! And you?",
        }
    ]
}
with open("data.json", "w") as fout:
    json.dump(data_dict, fout)

data_args = DatasetArguments(dataset_path="data.json")
dataset = Dataset(data_args)
new_data_dict = dataset.to_dict()
# `new_data_dict` should have the same content as `data_dict`
```
"""
).lstrip("\n")
TEXT_ONLY_DATASET_LONG_DESCRITION = (
TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS
)
TEXT2TEXT_DATASET_LONG_DESCRITION = (
TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS
)
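

# Usage sketch (an assumption about intended use, not part of the original
# module): these long descriptions read naturally as `--help` text for a
# dataclass-based argument field. `ExampleDatasetArguments` and its
# `dataset_path` field below are hypothetical names, used only to illustrate
# attaching one of the constants above as field help metadata.
if __name__ == "__main__":
    from dataclasses import dataclass, field
    from typing import Optional

    @dataclass
    class ExampleDatasetArguments:
        # Path to a dataset file in one of the formats described above.
        dataset_path: Optional[str] = field(
            default=None,
            metadata={"help": TEXT_ONLY_DATASET_LONG_DESCRITION},
        )

    # Print the help text attached to the hypothetical field.
    fields = ExampleDatasetArguments.__dataclass_fields__
    print(fields["dataset_path"].metadata["help"])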