luisoala committed on
Commit
ddf094a
·
1 Parent(s): 895624a

Merge working-april-04 validation changes

Browse files
Files changed (1) hide show
  1. validation.py +49 -106
validation.py CHANGED
@@ -1,6 +1,5 @@
1
  import json
2
  import traceback
3
- import warnings
4
  import mlcroissant as mlc
5
  import func_timeout
6
 
@@ -8,114 +7,58 @@ WAIT_TIME = 5 * 60 # seconds
8
 
9
  def validate_json(file_path):
10
  """Validate that the file is proper JSON."""
11
- with warnings.catch_warnings(record=True) as caught_warnings:
12
- warnings.simplefilter("always") # Ensure all warnings are captured
13
- try:
14
- with open(file_path, 'r') as f:
15
- json_data = json.load(f)
16
- warning_msgs = [str(w.message) for w in caught_warnings]
17
- if warning_msgs:
18
- return True, f"βœ… The file is valid JSON.\n\nWarnings:\n" + "\n".join(warning_msgs), json_data
19
- return True, "βœ… The file is valid JSON.", json_data
20
- except json.JSONDecodeError as e:
21
- warning_msgs = [str(w.message) for w in caught_warnings]
22
- error_message = f"❌ Invalid JSON format: {str(e)}"
23
- if warning_msgs:
24
- error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
25
- return False, error_message, None
26
- except Exception as e:
27
- warning_msgs = [str(w.message) for w in caught_warnings]
28
- error_message = f"❌ Error reading file: {str(e)}"
29
- if warning_msgs:
30
- error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
31
- return False, error_message, None
32
 
33
  def validate_croissant(json_data):
34
  """Validate that the JSON follows Croissant schema."""
35
- with warnings.catch_warnings(record=True) as caught_warnings:
36
- warnings.simplefilter("always") # Ensure all warnings are captured
37
- try:
38
- dataset = mlc.Dataset(jsonld=json_data)
39
- warning_msgs = [str(w.message) for w in caught_warnings]
40
- if warning_msgs:
41
- return True, f"βœ… The dataset passes Croissant validation.\n\nWarnings:\n" + "\n".join(warning_msgs)
42
- return True, "βœ… The dataset passes Croissant validation."
43
- except mlc.ValidationError as e:
44
- warning_msgs = [str(w.message) for w in caught_warnings]
45
- error_details = traceback.format_exc()
46
- error_message = f"❌ Validation failed: {str(e)}\n\n{error_details}"
47
- if warning_msgs:
48
- error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
49
- return False, error_message
50
- except Exception as e:
51
- warning_msgs = [str(w.message) for w in caught_warnings]
52
- error_details = traceback.format_exc()
53
- error_message = f"❌ Unexpected error during validation: {str(e)}\n\n{error_details}"
54
- if warning_msgs:
55
- error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
56
- return False, error_message
57
 
58
  def validate_records(json_data):
59
  """Validate that records can be generated within the time limit."""
60
- with warnings.catch_warnings(record=True) as caught_warnings:
61
- warnings.simplefilter("always") # Ensure all warnings are captured
62
- try:
63
- dataset = mlc.Dataset(jsonld=json_data)
64
- record_sets = dataset.metadata.record_sets
65
-
66
- if not record_sets:
67
- warning_msgs = [str(w.message) for w in caught_warnings]
68
- msg = "βœ… No record sets found to validate."
69
- if warning_msgs:
70
- msg += "\n\nWarnings:\n" + "\n".join(warning_msgs)
71
- return True, msg
72
-
73
- results = []
74
- all_warnings = []
75
-
76
- for record_set in record_sets:
77
- # Capture warnings for each record set separately
78
- with warnings.catch_warnings(record=True) as record_warnings:
79
- warnings.simplefilter("always")
80
- try:
81
- records = dataset.records(record_set=record_set.uuid)
82
- print(f"Attempting to validate record set: {record_set.uuid}")
83
- _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
84
-
85
- # Add any warnings from this record set
86
- warning_msgs = [str(w.message) for w in record_warnings]
87
- if warning_msgs:
88
- all_warnings.extend(warning_msgs)
89
-
90
- results.append(f"βœ… Record set '{record_set.uuid}' passed validation.")
91
- except func_timeout.exceptions.FunctionTimedOut:
92
- warning_msgs = [str(w.message) for w in record_warnings]
93
- msg = f"⚠️ Record set '{record_set.uuid}' generation took too long (>300s)"
94
- if warning_msgs:
95
- msg += "\n\nWarnings:\n" + "\n".join(warning_msgs)
96
- results.append(msg)
97
- except Exception as e:
98
- warning_msgs = [str(w.message) for w in record_warnings]
99
- error_details = traceback.format_exc()
100
- msg = f"⚠️ Record set '{record_set.uuid}' encountered an issue: {str(e)}\n\n{error_details}"
101
- if warning_msgs:
102
- msg += "\n\nWarnings:\n" + "\n".join(warning_msgs)
103
- results.append(msg)
104
-
105
- # Add any warnings from the initial setup
106
- warning_msgs = [str(w.message) for w in caught_warnings]
107
- if warning_msgs:
108
- all_warnings.extend(warning_msgs)
109
-
110
- final_message = "\n".join(results)
111
- if all_warnings:
112
- final_message += "\n\nWarnings:\n" + "\n".join(all_warnings)
113
-
114
- return True, final_message
115
- except Exception as e:
116
- warning_msgs = [str(w.message) for w in caught_warnings]
117
- error_details = traceback.format_exc()
118
- error_message = f"⚠️ Unexpected error during records validation: {str(e)}\n\n{error_details}"
119
- if warning_msgs:
120
- error_message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
121
- return True, error_message
 
1
  import json
2
  import traceback
 
3
  import mlcroissant as mlc
4
  import func_timeout
5
 
 
7
 
def validate_json(file_path):
    """Validate that the file is proper JSON.

    Args:
        file_path: Path of the file to read and parse.

    Returns:
        A ``(ok, message, json_data)`` tuple. On success ``ok`` is ``True``
        and ``json_data`` holds the parsed document. On failure ``ok`` is
        ``False``, ``message`` describes the problem and ``json_data`` is
        ``None``. No exception is propagated to the caller.
    """
    try:
        # JSON text is UTF-8 by specification; do not rely on the platform's
        # locale encoding to decode it.
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)
    except json.JSONDecodeError as e:
        # The file was readable but its content is not well-formed JSON.
        return False, f"❌ Invalid JSON format: {e}", None
    except Exception as e:
        # Anything else (missing file, permissions, undecodable bytes, ...).
        return False, f"❌ Error reading file: {e}", None
    return True, "✅ The file is valid JSON.", json_data
 
 
 
 
 
 
 
 
 
 
 
20
 
def validate_croissant(json_data):
    """Validate that the JSON follows Croissant schema.

    Args:
        json_data: Parsed JSON-LD document, passed to
            ``mlc.Dataset(jsonld=...)``.

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is ``True`` when the dataset object
        can be constructed without raising. Otherwise ``ok`` is ``False`` and
        ``message`` contains the error text followed by the traceback.
    """
    try:
        # Only the act of construction matters here: if it raises, the
        # document is reported as failing. The resulting object is not used.
        mlc.Dataset(jsonld=json_data)
    except mlc.ValidationError as e:
        error_details = traceback.format_exc()
        return False, f"❌ Validation failed: {e}\n\n{error_details}"
    except Exception as e:
        error_details = traceback.format_exc()
        return False, f"❌ Unexpected error during validation: {e}\n\n{error_details}"
    return True, "✅ The dataset passes Croissant validation."
 
 
 
 
 
 
 
 
 
 
 
34
 
def validate_records(json_data):
    """Validate that records can be generated within the time limit.

    Builds an ``mlc.Dataset`` from ``json_data`` and, for every record set in
    its metadata, tries to fetch the first record, allowing at most
    ``WAIT_TIME`` seconds for each record set.

    Args:
        json_data: Parsed JSON-LD document, passed to
            ``mlc.Dataset(jsonld=...)``.

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is ``True`` when there are no record
        sets or when every record set yielded a first record in time; the
        message then has one line per record set. The first record set that
        times out or raises stops the check and yields ``(False, message)``.
        Errors raised while building the dataset are also reported as
        ``(False, message)`` rather than propagated.
    """
    try:
        dataset = mlc.Dataset(jsonld=json_data)
        record_sets = dataset.metadata.record_sets

        if not record_sets:
            return True, "✅ No record sets found to validate."

        results = []

        for record_set in record_sets:
            try:
                records = dataset.records(record_set=record_set.uuid)
                # Diagnostic output: echoes the records object to stdout.
                print(records)
                # Fetching a single record is enough to show that generation
                # works; the record itself is discarded.
                func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
            except func_timeout.exceptions.FunctionTimedOut:
                # Report the limit that was actually enforced instead of a
                # hard-coded number that could drift from WAIT_TIME.
                return False, (
                    f"❌ Record set '{record_set.uuid}' generation took too long "
                    f"(>{WAIT_TIME}s)"
                )
            except Exception as e:
                error_details = traceback.format_exc()
                return False, (
                    f"❌ Record set '{record_set.uuid}' failed: {e}\n\n{error_details}"
                )
            else:
                results.append(f"✅ Record set '{record_set.uuid}' passed validation.")

        return True, "\n".join(results)
    except Exception as e:
        # Failures outside the per-record-set loop (e.g. building the dataset).
        error_details = traceback.format_exc()
        return False, (
            f"❌ Unexpected error during records validation: {e}\n\n{error_details}"
        )