File size: 7,170 Bytes
d93aca0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# Copyright (c) 2025 SparkAudio
#               2025 Xinsheng Wang ([email protected])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Description:
    This script contains a collection of functions designed to handle various
    file reading and writing operations. It provides utilities to read from files,
    write data to files, and perform file manipulation tasks.
"""


import os
import json
import json
import csv

from tqdm import tqdm
from typing import List, Dict, Any, Set, Union
from pathlib import Path
from omegaconf import OmegaConf, DictConfig


def resolve_symbolic_link(symbolic_link_path: Path) -> Path:
    """
    Resolves a symbolic link to the path it points to.

    The link target read via ``os.readlink`` may be relative, so it is
    joined onto the link's parent directory. Note the result is not
    normalized: it is exactly ``dirname(link)`` joined with the raw target.

    Args:
        symbolic_link_path (Path): The path to the symbolic link.

    Returns:
        Path: The path the symbolic link points to.
    """

    target = os.readlink(symbolic_link_path)
    parent_directory = os.path.dirname(symbolic_link_path)
    return os.path.join(parent_directory, target)


def write_jsonl(metadata: List[dict], file_path: Path) -> None:
    """Serialize a list of dictionaries to a JSON-Lines file.

    Args:
    metadata : List[dict]
        Records to write, one JSON object per output line.
    file_path : Path
        Destination path for the JSONL file.

    Each record is dumped with ``ensure_ascii=False`` so non-ASCII text is
    written as-is, followed by a newline.
    """
    with open(file_path, "w", encoding="utf-8") as out:
        for record in tqdm(metadata, desc="writing jsonl"):
            # One JSON object per line, terminated by a newline.
            out.write(json.dumps(record, ensure_ascii=False))
            out.write("\n")
    print(f"jsonl saved to {file_path}")


def read_jsonl(file_path: Path) -> List[dict]:
    """
    Reads a JSONL file and returns a list of dictionaries.

    Blank lines (including a trailing newline at end of file) are skipped
    instead of raising ``json.JSONDecodeError``.

    Args:
    file_path : Path
        The path to the JSONL file to be read.

    Returns:
    List[dict]
        A list of dictionaries parsed from each non-empty line of the file.

    Raises:
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    metadata = []
    # Stream line by line instead of loading the whole file into memory.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Tolerate blank/trailing lines, which the old code crashed on.
            if not line:
                continue
            metadata.append(json.loads(line))
    return metadata

def read_json_as_jsonl(file_path: Path) -> List[dict]:
    """Load a JSON object of records and flatten it into a list.

    The file must contain a single JSON object mapping keys to dicts.
    Keys are visited in sorted order; each output dict carries the key
    under ``'index'`` merged with that entry's own fields.

    Args:
    file_path : Path
        Path to the JSON file to read.

    Returns:
    List[dict]
        One dict per key, sorted by key.
    """
    with open(file_path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)
    records = []
    for key in sorted(data):
        # Entry fields are merged after 'index', so an explicit 'index'
        # field in the entry takes precedence (same as dict.update).
        records.append({'index': key, **data[key]})
    return records



def decode_unicode_strings(meta: Dict[str, Any]) -> Dict[str, Any]:
    r"""Decode literal escape sequences in the string values of a dict.

    String values containing textual escapes such as ``\u4f60`` or ``\n``
    are decoded into the characters they denote; non-string values pass
    through unchanged.

    Args:
        meta (Dict[str, Any]): Mapping whose string values may hold escapes.

    Returns:
        Dict[str, Any]: A new dict with decoded string values.
    """
    processed_meta = {}
    for k, v in meta.items():
        if isinstance(v, str):
            # Encode with latin-1 + backslashreplace (NOT utf-8): this keeps
            # already-decoded non-ASCII characters intact through the
            # unicode_escape round trip. The previous utf-8 encoding turned
            # them into mojibake, because unicode_escape decodes each byte
            # as a separate latin-1 character.
            processed_meta[k] = v.encode("latin-1", "backslashreplace").decode("unicode_escape")
        else:
            processed_meta[k] = v
    return processed_meta


def load_config(config_path: Path) -> DictConfig:
    """Load an OmegaConf configuration, merging in an optional base config.

    If the loaded config defines a ``base_config`` key, that file is loaded
    too and the current config is merged on top of it, so values from
    ``config_path`` take precedence over the base.

    Args:
    config_path (Path): Path to the configuration file.

    Returns:
        DictConfig: The (possibly merged) configuration.
    """
    config = OmegaConf.load(config_path)

    base_path = config.get("base_config", None)
    if base_path is not None:
        # Merge order: base first, then this config overrides it.
        config = OmegaConf.merge(OmegaConf.load(base_path), config)

    return config



def jsonl_to_csv(jsonl_file_path: str, csv_file_path: str) -> None:
    """
    Converts a JSONL file to a CSV file.

    Reads the JSONL file once, collecting every key that appears in any
    record, then writes a CSV whose columns are the sorted union of those
    keys. Records missing a key get an empty cell (DictWriter's default
    restval). Blank input lines are skipped.

    Args:
        jsonl_file_path (str): Path of the input JSONL file.
        csv_file_path (str): Path of the CSV file to create.
    """

    all_keys = set()
    data_rows = []

    # Single pass: collect rows and the union of their keys.
    # encoding is pinned so non-ASCII survives on any platform.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # Tolerate blank/trailing lines instead of crashing in json.loads.
            if not line:
                continue
            data = json.loads(line)
            data_rows.append(data)
            all_keys.update(data.keys())

    # Sorted list gives a deterministic column order.
    sorted_keys = sorted(all_keys)

    # newline='' is required by the csv module to avoid blank rows on Windows.
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sorted_keys)

        # Header row, then all data rows in one call.
        writer.writeheader()
        writer.writerows(data_rows)

    print(f"CSV file has been created at {csv_file_path}")


def save_metadata(data, filename, headers=None):
    """
    Save metadata to a pipe-delimited text file.

    Args:
        data (list of dict): Metadata records to be saved.
        filename (str): Name of the file to save the metadata.
        headers (list of str): The order of column names to be saved;
            defaults to the keys from the first dictionary in data if not
            provided.

    Raises:
        ValueError: If data is empty and no headers are given.
    """
    # Derive headers from the first record when not explicitly provided.
    if headers is None:
        if not data:
            # data[0] would raise an opaque IndexError; fail with a clear message.
            raise ValueError("headers must be provided when data is empty")
        headers = list(data[0].keys())

    with open(filename, "w", encoding="utf-8") as file:
        # Write the header row first.
        file.write("|".join(headers) + "\n")
        for entry in data:
            # '|' is the field delimiter and '\n'/'\r' are the record
            # delimiters: replace all of them with spaces so a single value
            # cannot corrupt the line-based format read_metadata expects.
            formatted_values = [
                str(entry.get(key, "")).replace("|", " ").replace("\n", " ").replace("\r", " ")
                for key in headers
            ]
            file.write("|".join(formatted_values) + "\n")


def read_metadata(filename, headers=None):
    """
    Read pipe-delimited metadata from a file.

    Args:
        filename (str): The file from which to read the metadata.
        headers (list of str): Column names to use; when omitted they are
            taken from the file's first line.

    Returns:
        list of dict: The metadata read from the file.
        list of str: The headers used in the file.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        lines = handle.readlines()

    # When headers are not supplied, the first line carries the column
    # names and is consumed; otherwise every line is treated as data.
    if headers is None:
        headers = lines[0].strip().split("|")
        lines = lines[1:]

    records = []
    for raw in lines:
        stripped = raw.strip()
        # Ignore blank lines.
        if not stripped:
            continue
        # Pair each field with its header name; zip stops at the shorter
        # side, so short rows simply omit the trailing keys.
        records.append(dict(zip(headers, stripped.split("|"))))

    return records, headers