Spaces:
Running
Running
import json | |
import traceback | |
import mlcroissant as mlc | |
import func_timeout | |
# Time limit handed to func_timeout when fetching the first record of a record set.
WAIT_TIME = 5 * 60  # seconds
def validate_json(file_path):
    """Check that ``file_path`` can be read and parsed as JSON.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the locale of the machine running the check.

    Args:
        file_path: Path of the file to load.

    Returns:
        A ``(passed, message, json_data)`` tuple. ``json_data`` is the parsed
        document on success and ``None`` on any failure.
    """
    # NOTE(review): the leading status glyph in these messages looks
    # mis-encoded in this copy of the file; confirm against the canonical source.
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)
    except json.JSONDecodeError as e:
        return False, f"β Invalid JSON format: {str(e)}", None
    except Exception as e:
        # Deliberately broad: any read or decode problem (missing file,
        # permissions, invalid UTF-8, ...) is reported to the caller as a
        # failed check instead of being raised.
        return False, f"β Error reading file: {str(e)}", None
    return True, "β The file is valid JSON.", json_data
def validate_croissant(json_data):
    """Check that ``json_data`` can be loaded as a Croissant dataset.

    Args:
        json_data: Parsed JSON-LD content handed to ``mlc.Dataset``.

    Returns:
        A ``(passed, message)`` tuple. On failure the message holds the
        exception text followed by the formatted traceback.
    """
    try:
        # Building the dataset is the whole check; the object itself is unused.
        mlc.Dataset(jsonld=json_data)
    except mlc.ValidationError as exc:
        trace = traceback.format_exc()
        return False, f"β Validation failed: {exc!s}\n\n{trace}"
    except Exception as exc:
        trace = traceback.format_exc()
        return False, f"β Unexpected error during validation: {exc!s}\n\n{trace}"
    return True, "β The dataset passes Croissant validation."
def validate_records(json_data):
    """Check that a first record can be produced for every record set in time.

    For each record set the first record is fetched under a ``WAIT_TIME``
    second limit. The check stops at the first record set that times out or
    raises.

    Args:
        json_data: Parsed JSON-LD content handed to ``mlc.Dataset``.

    Returns:
        A ``(passed, message)`` tuple. On success the message has one line per
        record set (or a single line when there are none); on failure it
        describes the first record set that failed.
    """
    try:
        dataset = mlc.Dataset(jsonld=json_data)
        record_sets = dataset.metadata.record_sets
        if not record_sets:
            return True, "β No record sets found to validate."

        results = []
        for record_set in record_sets:
            try:
                records = dataset.records(record_set=record_set.uuid)
                # NOTE(review): looks like leftover debug output on stdout;
                # kept because removing it would change observable behavior.
                print(records)
                # Only the first record is pulled; the lambda runs within this
                # iteration, so it sees the current `records`.
                func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
            except func_timeout.exceptions.FunctionTimedOut:
                # Report the limit from WAIT_TIME so the text cannot drift
                # from the timeout that was actually applied.
                error_message = (
                    f"β Record set '{record_set.uuid}' generation took too long "
                    f"(>{WAIT_TIME}s)"
                )
                return False, error_message
            except Exception as e:
                error_details = traceback.format_exc()
                error_message = (
                    f"β Record set '{record_set.uuid}' failed: {str(e)}"
                    f"\n\n{error_details}"
                )
                return False, error_message
            else:
                results.append(f"β Record set '{record_set.uuid}' passed validation.")

        return True, "\n".join(results)
    except Exception as e:
        # Covers failures outside the per-record-set handling, e.g. while
        # building the dataset or reading its record sets.
        error_details = traceback.format_exc()
        error_message = (
            f"β Unexpected error during records validation: {str(e)}"
            f"\n\n{error_details}"
        )
        return False, error_message
def generate_validation_report(file_path, json_data, validation_results):
    """Render the validation outcome as a markdown document.

    Args:
        file_path: Path shown in the "Starting validation" line.
        json_data: Parsed JSON-LD content; serialized with ``json.dumps`` in
            the reference section and, when truthy, used to list record-set
            details under the "Records Generation Test" step.
        validation_results: Iterable of ``(step_name, passed, message)`` tuples.

    Returns:
        The report as one newline-joined string.
    """
    # Title and the opening of the results section.
    lines = [
        "# CROISSANT VALIDATION REPORT",
        "=" * 80,
        "",
        "## VALIDATION RESULTS",
        "-" * 80,
        f"Starting validation for file: {file_path}",
    ]
    emit = lines.append

    for step_name, passed, message in validation_results:
        emit("")
        emit(f"### {step_name}")
        # NOTE(review): the status glyphs below look mis-encoded in this copy
        # of the file; confirm against the canonical source.
        emit("β" if passed else "β")
        emit(message.replace("β ", "β").replace("β", "β"))

        # Only the records step gets the extra per-record-set breakdown.
        if step_name != "Records Generation Test" or not json_data:
            continue
        try:
            parsed = mlc.Dataset(jsonld=json_data)
            for rs in parsed.metadata.record_sets:
                # Lines are emitted one at a time so that whatever was written
                # before an error stays in the report.
                emit("")
                emit(f"#### Record Set: {rs.uuid}")
                emit(f"Description: {rs.description}")
                if rs.data_types:
                    emit(f"Data Types: {rs.data_types}")
                emit("")
                emit("Fields:")
                for fld in rs.fields:
                    emit(f"- {fld.name} ({fld.data_type})")
                    if fld.description:
                        emit(f" Description: {fld.description}")
        except Exception as exc:
            emit(f"Error getting record set details: {exc!s}")

    # Raw JSON-LD appended for reference.
    emit("")
    emit("## JSON-LD REFERENCE")
    emit("=" * 80)
    emit("")
    emit("```json")
    emit(json.dumps(json_data, indent=2))
    emit("```")
    return "\n".join(lines)