Spaces:
Sleeping
Sleeping
import string | |
import underthesea | |
from .dictionary import (emotion2wordform_dict, mispelling_dict, number_dict, | |
translate_dict, wordform2vnese_dict) | |
#10 Remove stopwords | |
def remove_stopwords(input_text, stopwords_file='Datasets/Query/stopword.txt'):
    """Drop every whitespace-separated word of ``input_text`` that is a stop word.

    The stop-word list is loaded from ``stopwords_file`` (UTF-8, one entry per
    line, surrounding whitespace stripped) on every call. A word is dropped
    when its lower-cased form appears in that list; the surviving words are
    returned joined by single spaces, with their original casing.
    """
    with open(stopwords_file, 'r', encoding='utf-8') as handle:
        blocked = {entry.strip() for entry in handle}

    kept = []
    for token in input_text.split():
        # Only the lookup is case-folded; the token itself is kept untouched.
        if token.lower() in blocked:
            continue
        kept.append(token)
    return ' '.join(kept)
#9 word segmentation | |
def word_segment(text):
    """Segment ``text`` into words using ``underthesea.word_tokenize``.

    The tokenizer is called with ``format="text"``, so the result is a single
    string rather than a list of tokens.
    NOTE(review): presumably multi-syllable words come back joined with
    underscores - confirm against the underthesea documentation.
    """
    segmented = underthesea.word_tokenize(text, format="text")
    return segmented
#8 Remove numbers | |
def remove_numbers(input_string):
    """Return ``input_string`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it; all other
    characters (including whitespace) are kept in their original order.
    """
    kept = []
    for ch in input_string:
        if ch.isdigit():
            continue
        kept.append(ch)
    return ''.join(kept)
#7 Remove extra whitespace
def remove_extra_whitespace(input_string):
    """Collapse every run of whitespace in ``input_string`` to one space.

    Leading and trailing whitespace is removed as a side effect of splitting
    on whitespace and re-joining the pieces.
    """
    return ' '.join(input_string.split())
#6 Transform numbers to text (8 - tám)
def number2text(sentence):
    """Replace each word of ``sentence`` found in ``number_dict`` by its value.

    The sentence is split on whitespace; words without an entry in
    ``number_dict`` are kept unchanged. The words are re-joined with single
    spaces.
    """
    replaced = []
    for token in sentence.split():
        replaced.append(number_dict.get(token, token))
    return ' '.join(replaced)
#5 Transform misspelled words, acronyms, etc. (including translating English words)
def translate2word(sentence, dictionary = translate_dict):
    """Replace every listed variant in ``sentence`` with its dictionary key.

    ``dictionary`` maps a replacement string to an iterable of substrings that
    should be rewritten to it. The sentence is stripped and padded with one
    space on each side before the substitutions run, and that padded string is
    what gets returned. Substitutions are plain substring replacements applied
    in dictionary order, so later entries see the output of earlier ones.
    """
    padded = f" {sentence.strip()} "
    for replacement, variants in dictionary.items():
        for variant in variants:
            padded = padded.replace(variant, replacement)
    return padded
def mispell2word(sentence, dictionary = mispelling_dict):
    """Replace every listed variant in ``sentence`` with its dictionary key.

    ``dictionary`` maps a replacement string to an iterable of substrings that
    should be rewritten to it. The sentence is stripped and padded with one
    space on each side before the substitutions run, and that padded string is
    what gets returned. Substitutions are plain substring replacements applied
    in dictionary order, so later entries see the output of earlier ones.
    """
    padded = f" {sentence.strip()} "
    for replacement, variants in dictionary.items():
        for variant in variants:
            padded = padded.replace(variant, replacement)
    return padded
#4 Transform word form into Vietnamese (colonsmile - cười)
def word_form2Vnese(sentence):
    """Replace each word of ``sentence`` found in ``wordform2vnese_dict``.

    The sentence is split on whitespace; words without an entry in
    ``wordform2vnese_dict`` are kept unchanged. The words are re-joined with
    single spaces.
    """
    replaced = []
    for token in sentence.split():
        replaced.append(wordform2vnese_dict.get(token, token))
    return ' '.join(replaced)
#3 Remove punctuation
def remove_punctuation(input_string):
    """Return ``input_string`` without any character from ``string.punctuation``.

    Only the characters listed in ``string.punctuation`` are deleted; every
    other character, including whitespace, is left in place.
    """
    # A translation table with an empty mapping and a delete-set of punctuation.
    deletion_table = str.maketrans('', '', string.punctuation)
    return input_string.translate(deletion_table)
#2 emoticon to word form ( :) - colonsmile ) | |
def emoticon2word(sentence):
    """Replace each word of ``sentence`` found in ``emotion2wordform_dict``.

    The sentence is split on whitespace; words without an entry in
    ``emotion2wordform_dict`` are kept unchanged. The words are re-joined with
    single spaces.
    """
    replaced = []
    for token in sentence.split():
        replaced.append(emotion2wordform_dict.get(token, token))
    return ' '.join(replaced)
#1 lower case | |
def lower_case(text):
    """Return ``text`` converted to lower case via ``text.lower()``."""
    lowered = text.lower()
    return lowered
def data_preprocessing(text):
    """Run the cleaning pipeline on ``text`` and return the processed string.

    The steps are applied in this order, each receiving the previous step's
    output as its only argument (so every step uses its own defaults):
    lower-casing, punctuation removal, misspelling replacement,
    number-to-text conversion, whitespace normalisation, word segmentation
    and stop-word removal.
    """
    pipeline = (
        lower_case,
        remove_punctuation,
        mispell2word,
        number2text,
        remove_extra_whitespace,
        word_segment,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text
def read_input(input):
    """Final entry point for reading and processing an input sentence.

    Passes ``input`` through ``data_preprocessing`` and returns its result.
    """
    processed = data_preprocessing(input)
    return processed