Spaces:

Forrest99
/

codesearchBase

Running

Forrest99 committed on Mar 5

Commit

144fc9e

verified ·

1 Parent(s): ff162ee

Create validate_data.py

Files changed (1) hide show

validate_data.py ADDED Viewed

def validate_inputs(snippets, *, min_length: int = 10, max_reported: int = 3):
    """Validate and clean a collection of code snippets.

    Each item is converted with ``str()`` when it is not already a string
    (the conversion is recorded in the error report), every literal
    ``"..."`` is removed, and surrounding whitespace is stripped. Items whose
    cleaned text is shorter than *min_length* characters are rejected.
    A short summary is printed to stdout.

    Args:
        snippets: Iterable of snippets to validate.
        min_length: Minimum number of characters a cleaned snippet must have
            to be kept. Defaults to 10.
        max_reported: Maximum number of error messages echoed in the printed
            report. Negative values are treated as 0. Defaults to 3.

    Returns:
        list: The cleaned snippets that passed validation, in input order.
    """
    # Clamp so a negative limit cannot turn the slice below into "all but
    # the last N" errors.
    shown = max(max_reported, 0)
    errors = []
    valid_data = []
    for idx, s in enumerate(snippets):
        # Type check: non-strings are converted rather than dropped, but the
        # conversion is still logged as an entry in ``errors``.
        if not isinstance(s, str):
            errors.append(f"索引 {idx}: 类型错误 ({type(s)}) → 已转换")
            s = str(s)
        # Cleaning: drop "..." placeholders and surrounding whitespace.
        cleaned = s.replace("...", "").strip()
        # Validity check: the length is measured on the cleaned text.
        if len(cleaned) < min_length:
            errors.append(f"索引 {idx}: 代码过短 ({len(cleaned)} 字符)")
            continue
        valid_data.append(cleaned)
    # Report: totals first, then at most ``shown`` individual messages.
    print(f"验证结果：\n成功 {len(valid_data)} 条\n错误 {len(errors)} 条")
    for err in errors[:shown]:
        print(f"  - {err}")
    if len(errors) > shown:
        print(f"  ...（共 {len(errors)} 个错误）")
    return valid_data
if __name__ == "__main__":
    # Demo run: a small batch mixing acceptable and problematic entries.
    sample_inputs = [
        "def example(): pass",
        12345,  # not a string; validate_inputs converts it with str()
        "print(...)",  # contains "..."
        "   ",  # whitespace only
        "def valid_func():\n            return '正确代码'",
    ]
    validated = validate_inputs(sample_inputs)
    print("\n有效数据示例：")
    # Preview at most the first two accepted snippets, 50 characters each.
    preview = validated[:2]
    for position, snippet in enumerate(preview):
        head = snippet[:50]
        print(f"[{position}] {head}...")