Spaces:

luisoala
/

croissant-checker

Running

App Files Files Community

luisoala committed on Apr 4

Commit

a9775df

1 Parent(s): 99cfa16

warnings

Browse files

Files changed (1) hide show

validation.py +106 -49

validation.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import json
 import traceback
 import mlcroissant as mlc
 import func_timeout
@@ -7,58 +8,114 @@ WAIT_TIME = 5 * 60  # seconds
 def validate_json(file_path):
     """Validate that the file is proper JSON."""
-    try:
-        with open(file_path, 'r') as f:
-            json_data = json.load(f)
-        return True, "✅ The file is valid JSON.", json_data
-    except json.JSONDecodeError as e:
-        error_message = f"❌ Invalid JSON format: {str(e)}"
-        return False, error_message, None
-    except Exception as e:
-        error_message = f"❌ Error reading file: {str(e)}"
-        return False, error_message, None
 def validate_croissant(json_data):
     """Validate that the JSON follows Croissant schema."""
-    try:
-        dataset = mlc.Dataset(jsonld=json_data)
-        return True, "✅ The dataset passes Croissant validation."
-    except mlc.ValidationError as e:
-        error_details = traceback.format_exc()
-        error_message = f"❌ Validation failed: {str(e)}\n\n{error_details}"
-        return False, error_message
-    except Exception as e:
-        error_details = traceback.format_exc()
-        error_message = f"❌ Unexpected error during validation: {str(e)}\n\n{error_details}"
-        return False, error_message
 def validate_records(json_data):
     """Validate that records can be generated within the time limit."""
-    try:
-        dataset = mlc.Dataset(jsonld=json_data)
-        record_sets = dataset.metadata.record_sets
-        if not record_sets:
-            return True, "✅ No record sets found to validate."
-        results = []
-        for record_set in record_sets:
-            try:
-                records = dataset.records(record_set=record_set.uuid)
-                print(records)
-                _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
-                results.append(f"✅ Record set '{record_set.uuid}' passed validation.")
-            except func_timeout.exceptions.FunctionTimedOut:
-                error_message = f"❌ Record set '{record_set.uuid}' generation took too long (>300s)"
-                return False, error_message
-            except Exception as e:
-                error_details = traceback.format_exc()
-                error_message = f"❌ Record set '{record_set.uuid}' failed: {str(e)}\n\n{error_details}"
-                return False, error_message
-        return True, "\n".join(results)
-    except Exception as e:
-        error_details = traceback.format_exc()
-        error_message = f"❌ Unexpected error during records validation: {str(e)}\n\n{error_details}"
-        return False, error_message

 import json
 import traceback
+import warnings
 import mlcroissant as mlc
 import func_timeout
 def validate_json(file_path):
     """Validate that the file is proper JSON.

     Args:
         file_path: Path of the file to read and parse.

     Returns:
         A ``(passed, message, json_data)`` tuple. ``json_data`` is the parsed
         content when ``passed`` is True and ``None`` otherwise. Any warnings
         recorded while reading or parsing are appended to ``message`` under
         a "Warnings:" heading.
     """
     with warnings.catch_warnings(record=True) as caught_warnings:
         warnings.simplefilter("always")  # Ensure all warnings are captured
         try:
             # JSON text is UTF-8 by specification, so decode it explicitly
             # instead of depending on the platform's locale default.
             with open(file_path, 'r', encoding='utf-8') as f:
                 json_data = json.load(f)
         except json.JSONDecodeError as e:
             passed, message, json_data = False, f"❌ Invalid JSON format: {str(e)}", None
         except Exception as e:
             # Covers I/O and decoding problems (missing file, bad bytes, ...).
             passed, message, json_data = False, f"❌ Error reading file: {str(e)}", None
         else:
             passed, message = True, "✅ The file is valid JSON."
         # One shared suffix for every outcome keeps the report format uniform.
         warning_msgs = [str(w.message) for w in caught_warnings]
         if warning_msgs:
             message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
         return passed, message, json_data
 def validate_croissant(json_data):
     """Check whether ``json_data`` can be loaded as a Croissant dataset.

     The check consists of constructing ``mlc.Dataset(jsonld=json_data)``.
     Warnings emitted during construction are recorded and appended to the
     message under a "Warnings:" heading.

     Args:
         json_data: Parsed JSON-LD content to validate.

     Returns:
         A ``(passed, message)`` tuple. On failure the message carries the
         exception text followed by the formatted traceback.
     """
     with warnings.catch_warnings(record=True) as recorded:
         # Record every warning, regardless of previously installed filters.
         warnings.simplefilter("always")
         try:
             mlc.Dataset(jsonld=json_data)
         except mlc.ValidationError as err:
             passed = False
             # format_exc() has to run while the exception is being handled.
             trace = traceback.format_exc()
             report = f"❌ Validation failed: {str(err)}\n\n{trace}"
         except Exception as err:
             passed = False
             trace = traceback.format_exc()
             report = f"❌ Unexpected error during validation: {str(err)}\n\n{trace}"
         else:
             passed = True
             report = "✅ The dataset passes Croissant validation."
         notes = [str(entry.message) for entry in recorded]
         if notes:
             report = report + "\n\nWarnings:\n" + "\n".join(notes)
         return passed, report
 def validate_records(json_data):
     """Validate that records can be generated within the time limit.

     Builds an ``mlc.Dataset`` from ``json_data`` and, for each record set in
     its metadata, tries to fetch the first record, allowing ``WAIT_TIME``
     seconds per record set. Validation stops at the first record set that
     times out or raises.

     Args:
         json_data: Parsed JSON-LD content, passed to ``mlc.Dataset``.

     Returns:
         A ``(passed, message)`` tuple. On success the message has one line
         per record set (or a note that none were found). On failure it
         describes the failing record set. In both cases all warnings
         recorded so far are appended under a "Warnings:" heading: first the
         per-record-set warnings, then the ones from dataset construction.
     """
     def with_warnings(message, warning_msgs):
         # Append the "Warnings:" section only when something was recorded.
         if warning_msgs:
             message += "\n\nWarnings:\n" + "\n".join(warning_msgs)
         return message

     with warnings.catch_warnings(record=True) as caught_warnings:
         warnings.simplefilter("always")  # Ensure all warnings are captured
         try:
             dataset = mlc.Dataset(jsonld=json_data)
             record_sets = dataset.metadata.record_sets
             if not record_sets:
                 setup_warnings = [str(w.message) for w in caught_warnings]
                 return True, with_warnings("✅ No record sets found to validate.", setup_warnings)
             results = []
             all_warnings = []
             for record_set in record_sets:
                 error_message = None
                 # Capture warnings for each record set separately
                 with warnings.catch_warnings(record=True) as record_warnings:
                     warnings.simplefilter("always")
                     try:
                         records = dataset.records(record_set=record_set.uuid)
                         print(f"Attempting to validate record set: {record_set.uuid}")
                         # Only the first record is pulled; that is enough to
                         # show the record set can be generated at all.
                         func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records)))
                     except func_timeout.exceptions.FunctionTimedOut:
                         # Derive the limit from WAIT_TIME so the text cannot
                         # drift from the timeout that was actually applied.
                         error_message = f"❌ Record set '{record_set.uuid}' generation took too long (>{WAIT_TIME}s)"
                     except Exception as e:
                         error_details = traceback.format_exc()
                         error_message = f"❌ Record set '{record_set.uuid}' failed: {str(e)}\n\n{error_details}"
                     all_warnings.extend(str(w.message) for w in record_warnings)
                 if error_message is not None:
                     # Report the same set of warnings a successful run would
                     # have reported up to this point, not just the last ones.
                     setup_warnings = [str(w.message) for w in caught_warnings]
                     return False, with_warnings(error_message, all_warnings + setup_warnings)
                 results.append(f"✅ Record set '{record_set.uuid}' passed validation.")
             # Add any warnings from the initial setup
             all_warnings.extend(str(w.message) for w in caught_warnings)
             return True, with_warnings("\n".join(results), all_warnings)
         except Exception as e:
             error_details = traceback.format_exc()
             error_message = f"❌ Unexpected error during records validation: {str(e)}\n\n{error_details}"
             return False, with_warnings(error_message, [str(w.message) for w in caught_warnings])