ttw / gradiobee /seg_text.py
freemt
Before branch dev
57b1c4f
raw
history blame
2.15 kB
"""Split text to sentences.
Use sentence_splitter if supported,
else use polyglot.text.Text
from hlm_texts
!apt install libicu-dev
!install pyicu pycld2 Morfessor
!pip install polyglot sentence_splitter
"""
from typing import List, Optional
from tqdm.auto import tqdm
from polyglot.detect.base import logger as polyglot_logger
from polyglot.text import Detector, Text
from sentence_splitter import split_text_into_sentences
from logzero import logger
# turn of polyglot.text.Detector warning
polyglot_logger.setLevel("ERROR")
# fmt: off
# use sentence_splitter if supported
LANG_S = ["ca", "cs", "da", "nl", "en", "fi", "fr", "de",
"el", "hu", "is", "it", "lv", "lt", "no", "pl",
"pt", "ro", "ru", "sk", "sl", "es", "sv", "tr"]
def seg_text(
text: str,
lang: Optional[str] = None,
qmode: bool = False,
maxlines: int = 1000
) -> List[str]:
# fmt: on
"""
Split text to sentences.
Use sentence_splitter if supported,
else use polyglot.text.Text.sentences
qmode: skip split_text_into_sentences if True, default False
vectors for all books are based on qmode=False.
qmode=True is for quick test purpose only
maxlines (default 1000), threhold for turn on tqdm progressbar
set to <1 or a large number to turn it off
"""
if lang is None:
try:
lang = Detector(text).language.code
except Exception as exc:
logger.warning("polyglot.text.Detector exc: %s, setting to 'en'", exc)
lang = "en"
if not qmode and lang in LANG_S:
_ = []
lines = text.splitlines()
# if maxlines > 1 and len(lines) > maxlines:
if len(lines) > maxlines > 1:
for para in tqdm(lines):
if para.strip():
_.extend(split_text_into_sentences(para, lang))
else:
for para in lines:
if para.strip():
_.extend(split_text_into_sentences(para, lang))
return _
# return split_text_into_sentences(text, lang)
return [elm.string for elm in Text(text, lang).sentences]