|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Description:

Utilities for reading and writing structured data files: JSONL read/write,
JSON-to-JSONL flattening, JSONL-to-CSV conversion, pipe-delimited metadata
save/load, unicode-escape decoding of string fields, one-level symbolic-link
resolution, and OmegaConf configuration loading.
|
""" |
|
|
|
|
|
import os |
|
import json |
|
import json |
|
import csv |
|
|
|
from tqdm import tqdm |
|
from typing import List, Dict, Any, Set, Union |
|
from pathlib import Path |
|
from omegaconf import OmegaConf, DictConfig |
|
|
|
|
|
def resolve_symbolic_link(symbolic_link_path: Path) -> Path:
    """Resolve a symbolic link by one level (does not follow chained links).

    Args:
        symbolic_link_path (Path): The path to the symbolic link.

    Returns:
        Path: The link's target joined onto the link's own directory.
        If the link target is absolute, it is returned as-is; otherwise the
        result is relative/absolute exactly as the link's directory is.
        The result is not normalized further (no ``..`` collapsing).

    Raises:
        OSError: If ``symbolic_link_path`` does not exist or is not a
            symbolic link (propagated from ``os.readlink``).
    """
    link_directory = os.path.dirname(symbolic_link_path)
    target = os.readlink(symbolic_link_path)
    # os.path.join discards link_directory when the target is absolute,
    # which matches symlink-target semantics exactly.
    # Wrap in Path so the return value matches the annotated type
    # (os.path.join alone returns a plain str).
    return Path(os.path.join(link_directory, target))
|
|
|
|
|
def write_jsonl(metadata: List[dict], file_path: Path) -> None:
    """Serialize a list of dictionaries to a JSONL file, one record per line.

    Args:
        metadata : List[dict]
            The records to serialize, written in list order.
        file_path : Path
            Destination path for the JSONL file (overwritten if present).
    """
    with open(file_path, "w", encoding="utf-8") as output_file:
        # tqdm only adds a progress bar; iteration order is unchanged.
        for record in tqdm(metadata, desc="writing jsonl"):
            output_file.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"jsonl saved to {file_path}")
|
|
|
|
|
def read_jsonl(file_path: Path) -> List[dict]:
    """Parse a JSONL file into a list of dictionaries.

    Args:
        file_path : Path
            Path of the JSONL file to read.

    Returns:
        List[dict]
            One parsed object per line of the file, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as input_file:
        contents = input_file.read()
    return [json.loads(record) for record in contents.splitlines()]
|
|
|
def read_json_as_jsonl(file_path: Path) -> List[dict]:
    """Flatten a JSON object of records into a JSONL-style list of dicts.

    The top-level JSON value is expected to be a dict mapping keys to
    record dicts. Each key is stored in its record under ``'index'``
    (a record's own ``'index'`` key, if any, takes precedence), and the
    records are returned sorted by key.

    Args:
        file_path : Path
            Path of the JSON file to read.

    Returns:
        List[dict]
            The flattened records, sorted by top-level key.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        records = json.load(source)
    return [{'index': key, **records[key]} for key in sorted(records)]
|
|
|
|
|
|
|
def decode_unicode_strings(meta: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``meta`` with escape sequences in string values decoded.

    String values are re-interpreted so literal escape sequences such as
    ``"\\n"`` become the characters they denote; non-string values pass
    through unchanged. NOTE(review): the utf-8 -> unicode_escape round trip
    mangles non-ASCII text (mojibake) — presumably inputs are ASCII with
    embedded escapes; confirm against callers.
    """
    return {
        key: value.encode("utf-8").decode("unicode_escape")
        if isinstance(value, str)
        else value
        for key, value in meta.items()
    }
|
|
|
|
|
def load_config(config_path: Path) -> DictConfig:
    """Load an OmegaConf configuration, merging a base config when declared.

    Args:
        config_path (Path): Path to the configuration file.

    Returns:
        DictConfig: The loaded configuration. If it contains a
        ``base_config`` entry, that file is loaded first and this
        configuration is merged on top of it (this config wins conflicts).
    """
    config = OmegaConf.load(config_path)

    base_path = config.get("base_config", None)
    if base_path is not None:
        base_config = OmegaConf.load(base_path)
        config = OmegaConf.merge(base_config, config)

    return config
|
|
|
|
|
|
|
def jsonl_to_csv(jsonl_file_path: str, csv_file_path: str) -> None:
    """Convert a JSONL file to a CSV file.

    Reads every record in the JSONL file, collects the union of all keys,
    and writes a CSV whose header is the sorted key set. Records missing a
    key get an empty cell in that column (DictWriter's default restval).

    Args:
        jsonl_file_path (str): Path of the JSONL file to read.
        csv_file_path (str): Path of the CSV file to create (overwritten).
    """
    all_keys = set()
    data_rows = []

    # encoding="utf-8" for consistency with the other readers/writers in
    # this module, which all pass it explicitly.
    with open(jsonl_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of crashing json.loads.
                continue
            data = json.loads(line)
            data_rows.append(data)
            all_keys.update(data.keys())

    sorted_keys = sorted(all_keys)

    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=sorted_keys)
        writer.writeheader()
        writer.writerows(data_rows)

    print(f"CSV file has been created at {csv_file_path}")
|
|
|
|
|
def save_metadata(data, filename, headers=None):
    """Write metadata records to a pipe-delimited text file.

    Args:
        data (list of dict): Records to write, one per line after the header.
        filename (str): Output file path (overwritten if present).
        headers (list of str): Column order; defaults to the keys of the
            first record when omitted. Any ``|`` inside a value is replaced
            with a space so the delimiter stays unambiguous.
    """
    columns = headers if headers is not None else list(data[0].keys())

    lines = ["|".join(columns)]
    for record in data:
        cells = [str(record.get(column, "")).replace("|", " ") for column in columns]
        lines.append("|".join(cells))

    with open(filename, "w", encoding="utf-8") as out:
        out.write("\n".join(lines) + "\n")
|
|
|
|
|
def read_metadata(filename, headers=None):
    """Read pipe-delimited metadata from a file.

    Args:
        filename (str): The file from which to read the metadata.
        headers (list of str): Column names; when omitted, the first line
            of the file is parsed as the header row.

    Returns:
        list of dict: The metadata records read from the file.
        list of str: The headers used for the file.
    """
    with open(filename, "r", encoding="utf-8") as source:
        raw_lines = source.readlines()

    if headers is None:
        headers = raw_lines[0].strip().split("|")
        raw_lines = raw_lines[1:]

    # Blank lines are skipped; zip truncates rows longer than the header.
    records = [
        dict(zip(headers, stripped.split("|")))
        for stripped in (line.strip() for line in raw_lines)
        if stripped
    ]
    return records, headers
|
|