Spaces:
Paused
Paused
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the license found in the | |
| # LICENSE file in the root directory of this source tree. | |
# Remove non-printable characters, as per:
# https://stackoverflow.com/questions/92438/stripping-non-printable-characters-from-a-string-in-python
#
# This is intended to be a drop-in replacement for Moses' strip-non-printing-char.perl
import functools
import sys
import unicodedata
from typing import Callable
@functools.lru_cache(maxsize=1)
def _non_printable_code_points() -> tuple:
    """Return all code points whose Unicode general category is "Other".

    The full code point range is scanned once and the result is cached, so
    building several replacers does not repeat the scan.
    """
    # same as \p{C} in perl
    # see https://www.unicode.org/reports/tr44/#General_Category_Values
    other_categories = frozenset({"C", "Cc", "Cf", "Cs", "Co", "Cn"})
    return tuple(
        code_point
        for code_point in range(sys.maxunicode + 1)
        if unicodedata.category(chr(code_point)) in other_categories
    )


def get_replacer(replace_by: str = " ") -> Callable[[str], str]:
    """Build a function that replaces non-printable characters in a string.

    Args:
        replace_by: Text substituted for each non-printable character.
            Pass "" to delete such characters instead.

    Returns:
        A function taking a string and returning a copy in which every
        character of category Cc, Cf, Cs, Co or Cn has been replaced by
        ``replace_by``. Control characters such as tab and newline belong
        to Cc and are therefore replaced as well.
    """
    # str.translate expects a mapping keyed by code point (int).
    non_printable_map = dict.fromkeys(_non_printable_code_points(), replace_by)

    def replace_non_printing_char(line: str) -> str:
        """Return ``line`` with non-printable characters substituted."""
        return line.translate(non_printable_map)

    return replace_non_printing_char
def test_remove():
    """Check that non-printable characters are replaced one-for-one.

    The invisible characters in the inputs are written as escape sequences
    so they cannot be silently dropped or altered by editors and viewers.
    """
    replaceby_ = get_replacer("_")
    # Tab (Cc), zero-width space (Cf) and byte-order mark (Cf) each become "_".
    assert (
        replaceby_("See what's hidden in your string…\tor be\u200bhind\ufeff")
        == "See what's hidden in your string…_or be_hind_"
    )

    replacebyspace = get_replacer(" ")
    # "\x00" and "\x11" are replaced separately, hence two leading spaces.
    assert replacebyspace("\x00\x11Hello\u200bWorld") == "  Hello World"