Synced repo using 'sync_with_huggingface' Github Action
Browse files
- iscc_sct/dev.py +22 -0
- pyproject.toml +8 -2
- space.yml +0 -34
iscc_sct/dev.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pathlib
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
HERE = pathlib.Path(__file__).parent.absolute()
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def convert_lf():  # pragma: no cover
    """Convert CRLF line endings to LF in the project's text files.

    Recursively walks the parent directory of ``HERE`` and rewrites, in place,
    every regular file whose suffix is in the extension set below and whose
    content contains at least one CRLF sequence. Files are read and written
    as raw bytes, so no text decoding or encoding takes place. Finally, the
    number of rewritten files is printed.
    """
    crlf = b"\r\n"
    lf = b"\n"
    extensions = {".py", ".toml", ".lock", ".txt", ".yml", ".sh", ".md"}
    n = 0
    for fp in HERE.parent.glob("**/*"):
        # The glob also yields directories; one whose name ends in a listed
        # suffix (e.g. "docs.md/") would make opening it raise, so skip
        # everything that is not a regular file.
        if fp.suffix not in extensions or not fp.is_file():
            continue
        content = fp.read_bytes()
        if crlf not in content:
            # Already LF-only: leave the file untouched.
            continue
        fp.write_bytes(content.replace(crlf, lf))
        n += 1
    print(f"{n} files converted to LF")
|
pyproject.toml
CHANGED
|
@@ -84,11 +84,17 @@ line-length = 119
|
|
| 84 |
[tool.ruff.format]
|
| 85 |
line-ending = "lf"
|
| 86 |
|
|
|
|
|
|
|
|
|
|
| 87 |
[tool.poe.tasks]
|
| 88 |
format-code = { cmd = "ruff format", help = "Code style formatting with ruff" }
|
| 89 |
format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formatting with mdformat" }
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
[build-system]
|
| 94 |
requires = ["poetry-core>=1.0.0"]
|
|
|
|
| 84 |
[tool.ruff.format]
|
| 85 |
line-ending = "lf"
|
| 86 |
|
| 87 |
+
[tool.coverage.run]
|
| 88 |
+
omit = ["iscc_sct/dev.py", "tests/"]
|
| 89 |
+
|
| 90 |
[tool.poe.tasks]
|
| 91 |
format-code = { cmd = "ruff format", help = "Code style formatting with ruff" }
|
| 92 |
format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formatting with mdformat" }
|
| 93 |
+
convert-lf = { script = "iscc_sct.dev:convert_lf", help = "Convert line endings to LF"}
|
| 94 |
+
test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100", help = "Run tests with coverage" }
|
| 95 |
+
update-dependencies = { cmd = "poetry update", help = "Update dependencies" }
|
| 96 |
+
all = ["format-code", "format-markdown", "convert-lf", "test"]
|
| 97 |
+
update = ["update-dependencies", "all"]
|
| 98 |
|
| 99 |
[build-system]
|
| 100 |
requires = ["poetry-core>=1.0.0"]
|
space.yml
CHANGED
|
@@ -4,40 +4,6 @@ colorFrom: red
|
|
| 4 |
colorTo: blue
|
| 5 |
sdk: gradio
|
| 6 |
sdk_version: 4.41.0
|
| 7 |
-
app_file: ./iscc_sct/demo.py
|
| 8 |
pinned: true
|
| 9 |
license: CC-BY-NC-SA-4.0
|
| 10 |
short_description: Cross Lingual Similarity Preserving Text Simprints
|
| 11 |
-
description: >
|
| 12 |
-
# ISCC-LAB - Semantic-Code Text
|
| 13 |
-
|
| 14 |
-
`iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
|
| 15 |
-
[ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
|
| 16 |
-
short identifiers created from text documents that preserve similarity (in hamming distance)
|
| 17 |
-
for semantically similar cross-lingual text inputs.
|
| 18 |
-
|
| 19 |
-
## What is the ISCC
|
| 20 |
-
|
| 21 |
-
The ISCC is a combination of various similarity preserving fingerprints and an identifier for
|
| 22 |
-
digital media content.
|
| 23 |
-
|
| 24 |
-
ISCCs are generated algorithmically from digital content, just like cryptographic hashes. However,
|
| 25 |
-
instead of using a single cryptographic hash function to identify data only, the ISCC uses various
|
| 26 |
-
algorithms to create a composite identifier that exhibits similarity-preserving properties (soft
|
| 27 |
-
hash or Simprint).
|
| 28 |
-
|
| 29 |
-
The component-based structure of the ISCC identifies content at multiple levels of abstraction. Each
|
| 30 |
-
component is self-describing, modular, and can be used separately or with others to aid in various
|
| 31 |
-
content identification tasks. The algorithmic design supports content deduplication, database
|
| 32 |
-
synchronization, indexing, integrity verification, timestamping, versioning, data provenance,
|
| 33 |
-
similarity clustering, anomaly detection, usage tracking, allocation of royalties, fact-checking and
|
| 34 |
-
general digital asset management use-cases.
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
## ISCC Status
|
| 38 |
-
|
| 39 |
-
The [ISCC](https://iscc.codes) is an ISO Standard published under
|
| 40 |
-
[ISO 24138:2024](https://www.iso.org/standard/77899.html) - International Standard Content Code
|
| 41 |
-
within [ISO/TC 46/SC 9/WG 18](https://www.iso.org/committee/48836.html).
|
| 42 |
-
|
| 43 |
-
The algorithms of this `iscc-sct` are experimental and not (yet) part of the official standard.
|
|
|
|
| 4 |
colorTo: blue
|
| 5 |
sdk: gradio
|
| 6 |
sdk_version: 4.41.0
|
|
|
|
| 7 |
pinned: true
|
| 8 |
license: CC-BY-NC-SA-4.0
|
| 9 |
short_description: Cross Lingual Similarity Preserving Text Simprints
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|