File size: 2,461 Bytes
b31f748 73ab668 8c51bed 73ab668 b31f748 8c51bed b31f748 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import argparse
import glob
from pathlib import Path
from loguru import logger
from iscc_sct.main import create
from charset_normalizer import from_bytes
def _decode_text(data, name):
    """Decode raw file bytes to text, preferring UTF-8.

    When the bytes are not valid UTF-8, the encoding is guessed with
    ``charset_normalizer.from_bytes`` and the best match is used.

    :param data: Raw bytes read from the file.
    :param name: File name, used only in log messages.
    :return: The decoded text, or ``None`` if no encoding could be detected.
    """
    try:
        return data.decode("utf-8")
    except UnicodeDecodeError:
        logger.debug(f"Could not decode {name} as UTF-8.")

    charset_match = from_bytes(data).best()
    if not charset_match:  # pragma: no cover
        logger.error(f"SKIPPING {name} - failed to detect text encoding")
        return None
    logger.debug(f"Decode {name} with {charset_match.encoding}.")
    return str(charset_match)


def main():
    """Command-line entry point for generating Semantic Text-Codes.

    Parses ``sys.argv`` and then does one of the following:

    * no ``path`` given: prints the help text and returns;
    * ``path`` equals ``"gui"``: launches the demo from ``iscc_sct.demo``
      (prints an install hint if importing it raises ``ImportError``);
    * otherwise: treats ``path`` as a glob pattern and, for every matching
      regular file, decodes its content and prints the result of ``create``
      (``repr`` of the result with ``--granular``, else its ``iscc`` attribute).

    Files whose decoded text is empty or whitespace-only are skipped, whether
    they were decoded as UTF-8 or through the charset-detection fallback.
    Files whose encoding cannot be detected are skipped as well.
    """
    parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
    parser.add_argument(
        "path",
        type=str,
        help="Path to text files (supports glob patterns) or 'gui' to launch Gradio demo.",
        nargs="?",
    )
    parser.add_argument(
        "-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
    )
    parser.add_argument(
        "-g", "--granular", action="store_true", help="Activate granular processing."
    )
    parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
    args = parser.parse_args()

    if args.path is None:
        parser.print_help()
        return

    # Without --debug the logger's handlers are removed, so the log calls
    # below produce no output.
    if not args.debug:
        logger.remove()

    if args.path == "gui":  # pragma: no cover
        try:
            # Imported lazily so the GUI dependency is only needed for this mode.
            from iscc_sct.demo import demo

            demo.launch(inbrowser=True)
        except ImportError:
            print(
                "Error: Gradio is not installed. Please install it with 'pip install gradio' to use the GUI."
            )
        return

    for match in glob.glob(args.path):
        path = Path(match)
        if not path.is_file():
            continue
        logger.debug(f"Processing {path.name}")

        # read_bytes() opens and closes the file, so no handle is kept open
        # while the code is being generated.
        text = _decode_text(path.read_bytes(), path.name)
        if text is None:
            continue

        # Checked after decoding so that fallback-decoded files are covered too.
        if not text.strip():
            logger.warning(f"SKIPPED empty: {path}")
            continue

        sct_meta = create(text, granular=args.granular, bits=args.bits)
        if args.granular:
            print(repr(sct_meta))
        else:
            print(sct_meta.iscc)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":  # pragma: no cover
    main()
|