Spaces:
Sleeping
Sleeping
File size: 3,086 Bytes
280d87f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import string
import underthesea
from .dictionary import (emotion2wordform_dict, mispelling_dict, number_dict,
translate_dict, wordform2vnese_dict)
#10 Remove stopwords
def remove_stopwords(input_text, stopwords_file='Datasets/Query/stopword.txt'):
    """Remove stop words from *input_text*.

    The stop-word list is read from *stopwords_file* on every call, one
    entry per line, with surrounding whitespace stripped from each entry.
    Tokens are lowercased before the lookup, but the entries are used as
    written, so the file should list its stop words in lowercase.

    Args:
        input_text: Text to filter; it is split on whitespace.
        stopwords_file: Path to a UTF-8 text file with one stop word per
            line. A leading byte-order mark is tolerated.

    Returns:
        The surviving tokens, in their original order and original casing,
        joined by single spaces.

    Raises:
        OSError: Propagated from ``open`` when the file cannot be read.
    """
    # 'utf-8-sig' decodes plain UTF-8 too, but also discards a leading BOM.
    # With plain 'utf-8' the BOM stays attached to the first entry, so the
    # first stop word of a BOM-prefixed file would silently never match.
    with open(stopwords_file, 'r', encoding='utf-8-sig') as file:
        stopwords = {line.strip() for line in file}
    # Blank lines yield an empty entry that can never equal a token.
    stopwords.discard('')
    kept_words = [word for word in input_text.split() if word.lower() not in stopwords]
    return ' '.join(kept_words)
#9 word segmentation
def word_segment(text):
    """Segment *text* into words using underthesea's tokenizer.

    The work is delegated to ``underthesea.word_tokenize`` called with
    ``format="text"``; its result is returned unchanged.

    NOTE(review): with ``format="text"`` the library presumably returns a
    single string in which the syllables of a multi-syllable word are
    joined by underscores -- confirm against the underthesea documentation.
    """
    segmented = underthesea.word_tokenize(text, format="text")
    return segmented
#8 Remove numbers
def remove_numbers(input_string):
    """Return *input_string* with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it; all other
    characters are kept in their original order.
    """
    kept_chars = []
    for symbol in input_string:
        if symbol.isdigit():
            continue
        kept_chars.append(symbol)
    return ''.join(kept_chars)
#7 Remove extra whitespace
def remove_extra_whitespace(input_string):
    """Collapse each run of whitespace into one space and trim both ends."""
    # split() with no argument discards leading, trailing and repeated
    # whitespace, so re-joining with a single space normalizes the string.
    return ' '.join(input_string.split())
#6 Transform number to text (8 - tám)
def number2text(sentence):
    """Replace tokens that are keys of ``number_dict`` with their mapped text.

    The sentence is split on whitespace and each token is looked up in
    ``number_dict`` as a whole; tokens without an entry are kept as they
    are. The result is re-joined with single spaces.
    """
    spelled_out = []
    for token in sentence.split():
        spelled_out.append(number_dict.get(token, token))
    return ' '.join(spelled_out)
#5 Transform misspelled words, acronyms, etc. (including translating English words)
def translate2word(sentence, dictionary=translate_dict):
    """Replace every variant listed in *dictionary* with its canonical key.

    Args:
        sentence: Text to normalize.
        dictionary: Mapping of replacement text to an iterable of variant
            substrings; every occurrence of a variant is replaced by the
            key. Defaults to ``translate_dict``.

    Returns:
        The rewritten sentence. It keeps the single leading and trailing
        space added before the substitutions are applied.
    """
    # Pad with one space on each side before substituting.
    # NOTE(review): presumably the variants are written with surrounding
    # spaces so they only match whole words -- confirm in the dictionary module.
    padded = f" {sentence.strip()} "
    replacement_pairs = (
        (canonical, variant)
        for canonical, variants in dictionary.items()
        for variant in variants
    )
    for canonical, variant in replacement_pairs:
        padded = padded.replace(variant, canonical)
    return padded
def mispell2word(sentence, dictionary=mispelling_dict):
    """Replace every variant listed in *dictionary* with its canonical key.

    Args:
        sentence: Text to normalize.
        dictionary: Mapping of replacement text to an iterable of variant
            substrings; every occurrence of a variant is replaced by the
            key. Defaults to ``mispelling_dict``.

    Returns:
        The rewritten sentence. It keeps the single leading and trailing
        space added before the substitutions are applied.
    """
    # Surround the trimmed sentence with one space on each side first.
    # NOTE(review): presumably the variants are written with surrounding
    # spaces so they only match whole words -- confirm in the dictionary module.
    result = " ".join(("", sentence.strip(), ""))
    for correct_form, wrong_forms in dictionary.items():
        for wrong_form in wrong_forms:
            result = result.replace(wrong_form, correct_form)
    return result
#4 Transform word form into Vietnamese (colonsmile - cười)
def word_form2Vnese(sentence):
    """Replace tokens that are keys of ``wordform2vnese_dict`` with their values.

    Tokens are obtained by splitting on whitespace and are looked up as a
    whole; tokens without an entry pass through unchanged. The result is
    re-joined with single spaces.
    """
    lookup = wordform2vnese_dict.get
    return ' '.join(lookup(token, token) for token in sentence.split())
#3 Remove punctuation
def remove_punctuation(input_string):
    """Delete every character of ``string.punctuation`` from *input_string*.

    Only the ASCII punctuation characters in ``string.punctuation`` are
    removed. Nothing is inserted in their place, so text on either side of
    a removed character becomes adjacent.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return input_string.translate(deletion_table)
#2 emoticon to word form ( :) - colonsmile )
def emoticon2word(sentence):
    """Replace tokens that are keys of ``emotion2wordform_dict`` with their values.

    The sentence is split on whitespace, so a token is only replaced when
    it matches a key as a whole; other tokens are kept as they are. The
    result is re-joined with single spaces.
    """
    replaced_tokens = []
    for token in sentence.split():
        replaced_tokens.append(emotion2wordform_dict.get(token, token))
    return ' '.join(replaced_tokens)
#1 lower case
def lower_case(text):
    """Return *text* converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered
def data_preprocessing(text):
    """Run the text-cleaning pipeline on *text* and return the result.

    Steps, in order: lowercase, strip punctuation, replace misspellings,
    replace number tokens, normalize whitespace, segment words, and drop
    stop words (using the default stop-word file).

    ``emoticon2word``, ``word_form2Vnese``, ``translate2word`` and
    ``remove_numbers`` are defined in this module but are not part of
    this pipeline.
    """
    lowered = lower_case(text)
    unpunctuated = remove_punctuation(lowered)
    respelled = mispell2word(unpunctuated)
    numbers_replaced = number2text(respelled)
    compacted = remove_extra_whitespace(numbers_replaced)
    segmented = word_segment(compacted)
    return remove_stopwords(segmented)
def read_input(input):  # final function used when reading and processing the input sentence
    """Preprocess one input sentence by passing it to ``data_preprocessing``."""
    processed = data_preprocessing(input)
    return processed
|