File size: 2,138 Bytes
57b1c4f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
"""Split text to sentences.
Use sentence_splitter if the language is supported,
else use polyglot.text.Text.

Setup:
!apt install libicu-dev
!pip install pyicu pycld2 Morfessor
!pip install polyglot sentence_splitter
"""
from typing import List, Optional
from tqdm.auto import tqdm
from polyglot.detect.base import logger as polyglot_logger
from polyglot.text import Detector, Text
from sentence_splitter import split_text_into_sentences
from logzero import logger
# Turn off polyglot.text.Detector warnings by raising the level of
# polyglot's detector logger to ERROR.
polyglot_logger.setLevel("ERROR")
# fmt: off
# Language codes handled by sentence_splitter; seg_text uses
# sentence_splitter for these and polyglot for everything else.
LANG_S = [
    "ca", "cs", "da", "nl", "en", "fi",
    "fr", "de", "el", "hu", "is", "it",
    "lv", "lt", "no", "pl", "pt", "ro",
    "ru", "sk", "sl", "es", "sv", "tr",
]
def seg_text(
    text: str,
    lang: Optional[str] = None,
    qmode: bool = False,
    maxlines: int = 1000
) -> List[str]:
    # fmt: on
    """Split text into a list of sentences.

    If lang is None, the language is detected with polyglot's Detector;
    when detection raises, a warning is logged and "en" is used.

    When qmode is False and lang is in LANG_S, every non-blank line of
    text is split with sentence_splitter.split_text_into_sentences and
    the results are concatenated in order. Otherwise the whole text is
    split with polyglot.text.Text(text, lang).sentences.

    Args:
        text: the text to split.
        lang: language code; detected from text when None.
        qmode: if True, skip sentence_splitter and always use polyglot.
            Vectors for all books are based on qmode=False;
            qmode=True is for quick test purposes only.
        maxlines: threshold for the tqdm progress bar, which is shown
            when the text has more than maxlines lines. A value of 1
            or less (or a very large number) turns it off.

    Returns:
        The sentences as a list of strings.
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as err:
            # Best-effort detection: fall back to English rather than fail.
            logger.warning("polyglot.text.Detector exc: %s, setting to 'en'", err)
            lang = "en"

    # Quick mode or a language sentence_splitter does not cover: use polyglot.
    if qmode or lang not in LANG_S:
        return [sentence.string for sentence in Text(text, lang).sentences]

    paragraphs = text.splitlines()

    # Wrap in tqdm only for long inputs; the loop body is the same either way.
    show_progress = maxlines > 1 and len(paragraphs) > maxlines
    source = tqdm(paragraphs) if show_progress else paragraphs

    sentences = []
    for paragraph in source:
        if not paragraph.strip():
            continue  # skip blank lines
        sentences.extend(split_text_into_sentences(paragraph, lang))
    return sentences
|