JeffYang52415 committed on
Commit
8cf2761
·
unverified ·
0 Parent(s):

feat: first commit

Browse files
Files changed (4) hide show
  1. .gitignore +28 -0
  2. LICENSE +21 -0
  3. README.md +64 -0
  4. pyproject.toml +56 -0
.gitignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Distribution / packaging
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+
11
+ # Poetry
12
+ poetry.lock
13
+
14
+ # Virtual environment
15
+ .env/
16
+ .venv/
17
+
18
+ # Pytest cache
19
+ .pytest_cache/
20
+
21
+ # MyPy cache
22
+ .mypy_cache/
23
+
24
+ # VSCode settings
25
+ .vscode/
26
+
27
+ # CSV files
28
+ *.csv
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Zjh-819
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # LLMDataParser
3
+
4
+ **LLMDataParser** is a Python library that provides parsers for benchmark datasets used in evaluating Large Language Models (LLMs). It offers a unified interface for loading and parsing datasets like **MMLU** and **GSM8k**, simplifying dataset preparation for LLM evaluation.
5
+
6
+ ## Features
7
+
8
+ - **Unified Interface**: Consistent `DatasetParser` for all datasets.
9
+ - **LLM-Agnostic**: Independent of any specific language model.
10
+ - **Easy to Use**: Simple methods and built-in Python types.
11
+ - **Extensible**: Easily add support for new datasets.
12
+
13
+ ## Installation
14
+
15
+ ### Option 1: Using pip
16
+
17
+ You can install the package directly with `pip`. No `setup.py` is required: the `pyproject.toml` file declares the build backend, so `pip` can build and install the project from it.
18
+
19
+ 1. **Clone the Repository**:
20
+
21
+ ```bash
22
+ git clone https://github.com/jeff52415/LLMDataParser.git
23
+ cd LLMDataParser
24
+ ```
25
+
26
+ 2. **Install Dependencies with pip**:
27
+
28
+ ```bash
29
+ pip install .
30
+ ```
31
+
32
+ ### Option 2: Using Poetry
33
+
34
+ Poetry manages the virtual environment and dependencies automatically, so you don't need to create a conda environment first.
35
+
36
+ 1. **Install Dependencies with Poetry**:
37
+
38
+ ```bash
39
+ poetry install
40
+ ```
41
+
42
+ 2. **Activate the Virtual Environment**:
43
+
44
+ ```bash
45
+ poetry shell
46
+ ```
47
+
48
+
49
+ ## Available Parsers
50
+
51
+ - **MMLUParser**: Parses the MMLU dataset.
52
+ - **GSM8kParser**: Parses the GSM8k dataset.
53
+
54
+ ## Contributing
55
+
56
+ Contributions are welcome! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
57
+
58
+ ## License
59
+
60
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
61
+
62
+ ## Contact
63
+
64
+ For questions or support, please open an issue on GitHub or contact [[email protected]](mailto:[email protected]).
pyproject.toml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "llmdataparser"
3
+ version = "0.1.0"
4
+ description = "A collection of parsers for LLM benchmark datasets like MMLU, MMLU-Pro, GSM8k, and more."
5
+ authors = ["Jeff <[email protected]>"]
6
+ license = "MIT"
7
+ readme = "README.md"
8
+ homepage = "https://github.com/jeff52415/LLMDataParser"
9
+ repository = "https://github.com/jeff52415/LLMDataParser"
10
+ keywords = ["LLM", "benchmark", "dataset", "parser", "NLP", "machine learning"]
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: OS Independent",
15
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
16
+ "Intended Audience :: Developers",
17
+ ]
18
+
19
+ [tool.poetry.dependencies]
20
+ python = ">=3.11"
21
+ pandas = "^2.0.3"
22
+ datasets = "^2.14.4"
23
+ typing-extensions = "^4.8.0"
24
+
25
+
26
+
27
+ [tool.poetry.group.dev.dependencies]
28
+ pytest = "^7.4.0"
29
+ black = {version = "^23.9.1", allow-prereleases = true}
30
+ flake8 = "^6.1.0"
31
+ isort = "^5.12.0"
32
+ mypy = "^1.5.1"
33
+ pre-commit = "^3.4.0"
34
+ types-python-dateutil = "^2.8.19.14"
35
+ ipykernel = "^6.7.0"
36
+
37
+ [tool.black]
38
+ line-length = 88
39
+ target-version = ["py311"]
40
+ exclude = '''
41
+ /(
42
+ \.git
43
+ | \.venv
44
+ | build
45
+ | dist
46
+ )/
47
+ '''
48
+
49
+ [tool.isort]
50
+ profile = "black"
51
+ line_length = 88
52
+ known_first_party = ["llmdataparser"]
53
+
54
+ [build-system]
55
+ requires = ["poetry-core>=1.5.0"]
56
+ build-backend = "poetry.core.masonry.api"