Spaces:
Running
Running
File size: 1,253 Bytes
acd7cf4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import os
from typing import Dict, List, Optional
import nltk
import jieba
resource_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources")
class NLTKHelper:
    """Helper around NLTK and jieba for stopword lookup and tokenization.

    Stopword lists are cached in a class-level dict so each language is
    fetched from NLTK at most once per process. NLTK resources (stopwords,
    punkt_tab) are downloaded on demand into the bundled ``resources/nltk_data``
    directory.
    """

    # Class-level cache: language name -> stopword list (None = not loaded yet).
    # Shared across instances by design so the corpus is read only once.
    _stopwords: Dict[str, Optional[List[str]]] = {
        "english": None,
        "chinese": None,
    }

    def __init__(self):
        # Eagerly build jieba's dictionary so the first tokenization call
        # does not pay the initialization cost.
        jieba.initialize()

    @staticmethod
    def _ensure_nltk_data_path() -> None:
        """Register the bundled nltk_data directory with NLTK exactly once.

        The previous implementation appended on every call, growing
        ``nltk.data.path`` without bound and slowing every later
        ``nltk.data.find`` lookup.
        """
        data_dir = os.path.join(resource_path, "nltk_data")
        if data_dir not in nltk.data.path:
            nltk.data.path.append(data_dir)

    def get_stopwords(self, lang: str) -> List[str]:
        """Return the cached stopword list for *lang*.

        Args:
            lang: NLTK stopword corpus language name, e.g. ``"english"``
                or ``"chinese"``.

        Returns:
            The list of stopwords for the language.

        Raises:
            KeyError: if *lang* is not a key of the class-level cache
                (only ``"english"`` and ``"chinese"`` are pre-declared).
        """
        self._ensure_nltk_data_path()
        if self._stopwords[lang] is None:
            # Download the corpus into the bundled directory on first use.
            try:
                nltk.data.find("corpora/stopwords")
            except LookupError:
                nltk.download("stopwords", download_dir=os.path.join(resource_path, "nltk_data"))
            self._stopwords[lang] = nltk.corpus.stopwords.words(lang)
        return self._stopwords[lang]

    @staticmethod
    def word_tokenize(text: str, lang: str) -> List[str]:
        """Tokenize *text*: jieba for Chinese (``lang == "zh"``), NLTK otherwise.

        Args:
            text: the text to tokenize.
            lang: language code; ``"zh"`` selects jieba, anything else
                falls through to NLTK's punkt tokenizer.

        Returns:
            The list of tokens.
        """
        if lang == "zh":
            return jieba.lcut(text)
        NLTKHelper._ensure_nltk_data_path()
        # Ensure the punkt_tab tokenizer models are available locally.
        try:
            nltk.data.find("tokenizers/punkt_tab")
        except LookupError:
            nltk.download("punkt_tab", download_dir=os.path.join(resource_path, "nltk_data"))
        return nltk.word_tokenize(text)
|