luisoala committed on
Commit
a9775df
·
1 Parent(s): 99cfa16
Files changed (1) hide show
  1. validation.py +106 -49
validation.py CHANGED
@@ -1,5 +1,6 @@
1
  import json
2
  import traceback
 
3
  import mlcroissant as mlc
4
  import func_timeout
5
 
@@ -7,58 +8,114 @@ WAIT_TIME = 5 * 60 # seconds
7
 
8
  def validate_json(file_path):
9
  """Validate that the file is proper JSON."""
10
- try:
11
- with open(file_path, 'r') as f:
12
- json_data = json.load(f)
13
- return True, "✅ The file is valid JSON.", json_data
14
- except json.JSONDecodeError as e:
15
- error_message = f"❌ Invalid JSON format: {str(e)}"
16
- return False, error_message, None
17
- except Exception as e:
18
- error_message = f"❌ Error reading file: {str(e)}"
19
- return False, error_message, None
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def validate_croissant(json_data):
22
  """Validate that the JSON follows Croissant schema."""
23
- try:
24
- dataset = mlc.Dataset(jsonld=json_data)
25
- return True, "✅ The dataset passes Croissant validation."
26
- except mlc.ValidationError as e:
27
- error_details = traceback.format_exc()
28
- error_message = f"❌ Validation failed: {str(e)}\n\n{error_details}"
29
- return False, error_message
30
- except Exception as e:
31
- error_details = traceback.format_exc()
32
- error_message = f"❌ Unexpected error during validation: {str(e)}\n\n{error_details}"
33
- return False, error_message
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  def validate_records(json_data):
36
  """Validate that records can be generated within the time limit."""
37
- try:
38
- dataset = mlc.Dataset(jsonld=json_data)
39
- record_sets = dataset.metadata.record_sets
40
-
41
- if not record_sets:
42
- return True, "✅ No record sets found to validate."
43
-
44
- results = []
45
-
46
- for record_set in record_sets:
47
- try:
48
- records = dataset.records(record_set=record_set.uuid)
49
- print(records)
50
- _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
51
- results.append(f"✅ Record set '{record_set.uuid}' passed validation.")
52
- except func_timeout.exceptions.FunctionTimedOut:
53
- error_message = f"❌ Record set '{record_set.uuid}' generation took too long (>300s)"
54
- return False, error_message
55
- except Exception as e:
56
- error_details = traceback.format_exc()
57
- error_message = f"❌ Record set '{record_set.uuid}' failed: {str(e)}\n\n{error_details}"
58
- return False, error_message
59
-
60
- return True, "\n".join(results)
61
- except Exception as e:
62
- error_details = traceback.format_exc()
63
- error_message = f"❌ Unexpected error during records validation: {str(e)}\n\n{error_details}"
64
- return False, error_message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import json
2
  import traceback
3
+ import warnings
4
  import mlcroissant as mlc
5
  import func_timeout
6
 
 
8
 
9
  def validate_json(file_path):
10
  """Validate that the file is proper JSON."""
11
+ with warnings.catch_warnings(record=True) as caught_warnings:
12
+ warnings.simplefilter("always") # Ensure all warnings are captured
13
+ try:
14
+ with open(file_path, 'r') as f:
15
+ json_data = json.load(f)
16
+ warning_msgs = [str(w.message) for w in caught_warnings]
17
+ if warning_msgs:
18
+ return True, f"✅ The file is valid JSON.\n\nWarnings:\n" + "\n".join(warning_msgs), json_data
19
+ return True, "✅ The file is valid JSON.", json_data
20
+ except json.JSONDecodeError as e:
21
+ warning_msgs = [str(w.message) for w in caught_warnings]
22
+ error_message = f"❌ Invalid JSON format: {str(e)}"
23
+ if warning_msgs:
24
+ error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
25
+ return False, error_message, None
26
+ except Exception as e:
27
+ warning_msgs = [str(w.message) for w in caught_warnings]
28
+ error_message = f"❌ Error reading file: {str(e)}"
29
+ if warning_msgs:
30
+ error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
31
+ return False, error_message, None
32
 
33
  def validate_croissant(json_data):
34
  """Validate that the JSON follows Croissant schema."""
35
+ with warnings.catch_warnings(record=True) as caught_warnings:
36
+ warnings.simplefilter("always") # Ensure all warnings are captured
37
+ try:
38
+ dataset = mlc.Dataset(jsonld=json_data)
39
+ warning_msgs = [str(w.message) for w in caught_warnings]
40
+ if warning_msgs:
41
+ return True, f"✅ The dataset passes Croissant validation.\n\nWarnings:\n" + "\n".join(warning_msgs)
42
+ return True, "✅ The dataset passes Croissant validation."
43
+ except mlc.ValidationError as e:
44
+ warning_msgs = [str(w.message) for w in caught_warnings]
45
+ error_details = traceback.format_exc()
46
+ error_message = f"❌ Validation failed: {str(e)}\n\n{error_details}"
47
+ if warning_msgs:
48
+ error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
49
+ return False, error_message
50
+ except Exception as e:
51
+ warning_msgs = [str(w.message) for w in caught_warnings]
52
+ error_details = traceback.format_exc()
53
+ error_message = f"❌ Unexpected error during validation: {str(e)}\n\n{error_details}"
54
+ if warning_msgs:
55
+ error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
56
+ return False, error_message
57
 
58
  def validate_records(json_data):
59
  """Validate that records can be generated within the time limit."""
60
+ with warnings.catch_warnings(record=True) as caught_warnings:
61
+ warnings.simplefilter("always") # Ensure all warnings are captured
62
+ try:
63
+ dataset = mlc.Dataset(jsonld=json_data)
64
+ record_sets = dataset.metadata.record_sets
65
+
66
+ if not record_sets:
67
+ warning_msgs = [str(w.message) for w in caught_warnings]
68
+ msg = "✅ No record sets found to validate."
69
+ if warning_msgs:
70
+ msg += "\n\nWarnings:\n" + "\n".join(warning_msgs)
71
+ return True, msg
72
+
73
+ results = []
74
+ all_warnings = []
75
+
76
+ for record_set in record_sets:
77
+ # Capture warnings for each record set separately
78
+ with warnings.catch_warnings(record=True) as record_warnings:
79
+ warnings.simplefilter("always")
80
+ try:
81
+ records = dataset.records(record_set=record_set.uuid)
82
+ print(f"Attempting to validate record set: {record_set.uuid}")
83
+ _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
84
+
85
+ # Add any warnings from this record set
86
+ warning_msgs = [str(w.message) for w in record_warnings]
87
+ if warning_msgs:
88
+ all_warnings.extend(warning_msgs)
89
+
90
+ results.append(f"✅ Record set '{record_set.uuid}' passed validation.")
91
+ except func_timeout.exceptions.FunctionTimedOut:
92
+ warning_msgs = [str(w.message) for w in record_warnings]
93
+ error_message = f"❌ Record set '{record_set.uuid}' generation took too long (>300s)"
94
+ if warning_msgs:
95
+ error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
96
+ return False, error_message
97
+ except Exception as e:
98
+ warning_msgs = [str(w.message) for w in record_warnings]
99
+ error_details = traceback.format_exc()
100
+ error_message = f"❌ Record set '{record_set.uuid}' failed: {str(e)}\n\n{error_details}"
101
+ if warning_msgs:
102
+ error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
103
+ return False, error_message
104
+
105
+ # Add any warnings from the initial setup
106
+ warning_msgs = [str(w.message) for w in caught_warnings]
107
+ if warning_msgs:
108
+ all_warnings.extend(warning_msgs)
109
+
110
+ final_message = "\n".join(results)
111
+ if all_warnings:
112
+ final_message += "\n\nWarnings:\n" + "\n".join(all_warnings)
113
+
114
+ return True, final_message
115
+ except Exception as e:
116
+ warning_msgs = [str(w.message) for w in caught_warnings]
117
+ error_details = traceback.format_exc()
118
+ error_message = f"❌ Unexpected error during records validation: {str(e)}\n\n{error_details}"
119
+ if warning_msgs:
120
+ error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
121
+ return False, error_message