File size: 3,086 Bytes
280d87f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import string

import underthesea

from .dictionary import (emotion2wordform_dict, mispelling_dict, number_dict,
                         translate_dict, wordform2vnese_dict)


#10 Remove stopwords 
def remove_stopwords(input_text, stopwords_file='Datasets/Query/stopword.txt'):
    """Drop every whitespace-separated token of *input_text* that is a stop word.

    Matching is case-insensitive: both the tokens and the entries read from
    *stopwords_file* are lower-cased before comparison. Tokens that survive
    keep their original casing and are re-joined with single spaces.

    Args:
        input_text: Text to filter; it is split on runs of whitespace.
        stopwords_file: Path to a UTF-8 text file holding one stop word per
            line. The default is a relative path, so it is resolved against
            the current working directory.

    Returns:
        The filtered text, with tokens separated by single spaces.

    Raises:
        OSError: If *stopwords_file* cannot be opened.
    """
    # The file is re-read on every call, so edits to it take effect at once.
    with open(stopwords_file, 'r', encoding='utf-8') as file:
        # Entries are lower-cased because tokens are lower-cased before the
        # membership test below; without this, an entry containing an
        # uppercase letter could never match. Blank lines are skipped.
        stopwords = {entry.strip().lower() for entry in file if entry.strip()}

    kept_words = [word for word in input_text.split() if word.lower() not in stopwords]
    return ' '.join(kept_words)

#9 word segmentation
def word_segment(text):
    """Segment *text* into words with ``underthesea.word_tokenize``.

    The call passes ``format="text"`` and the tokenizer's result is returned
    unchanged.
    NOTE(review): per the underthesea documentation this format yields one
    string in which the syllables of a multi-syllable word are joined by
    underscores -- confirm against the installed version.
    """
    segmented = underthesea.word_tokenize(text, format="text")
    return segmented

#8 Remove numbers
def remove_numbers(input_string):
    """Return *input_string* with every digit character stripped out.

    A character is dropped when ``str.isdigit()`` is true for it; every other
    character, whitespace included, is kept in its original order.
    """
    non_digits = filter(lambda ch: not ch.isdigit(), input_string)
    return ''.join(non_digits)

#7 Remove extra whitespace
def remove_extra_whitespace(input_string):
    """Collapse each run of whitespace to one space and trim both ends."""
    # str.split() with no argument splits on any whitespace run and discards
    # leading/trailing whitespace, so re-joining normalises the spacing.
    return ' '.join(input_string.split())

#6 Transform number to text (8 - tám)
def number2text(sentence):
    """Replace each token of *sentence* that is a key of ``number_dict``.

    The sentence is split on whitespace; a token found in ``number_dict`` is
    swapped for its mapped value, any other token is kept as-is. Tokens are
    re-joined with single spaces.
    """
    tokens = []
    for token in sentence.split():
        # Fall back to the token itself when there is no mapping for it.
        tokens.append(number_dict.get(token, token))
    return ' '.join(tokens)

#5 Transform misspelled words, acronyms, etc. (including translating English words)
def translate2word(sentence, dictionary=translate_dict):
    """Substitute every variant listed in *dictionary* with its key.

    *dictionary* maps a replacement string to an iterable of variant strings.
    The sentence is stripped and padded with one space on each side, then
    each variant is replaced via plain substring replacement
    (``str.replace``), in dictionary iteration order. The padding is not
    removed before returning, so the result normally starts and ends with a
    space.
    """
    padded = " " + sentence.strip() + " "
    substitutions = (
        (variant, replacement)
        for replacement, variants in dictionary.items()
        for variant in variants
    )
    for variant, replacement in substitutions:
        padded = padded.replace(variant, replacement)
    return padded

def mispell2word(sentence, dictionary=mispelling_dict):
    """Substitute every variant listed in *dictionary* with its key.

    *dictionary* maps a replacement string to an iterable of variant strings.
    The sentence is stripped and padded with one space on each side, then
    each variant is replaced via plain substring replacement
    (``str.replace``), in dictionary iteration order. The padding is not
    removed before returning, so the result normally starts and ends with a
    space.
    """
    result = f" {sentence.strip()} "
    for replacement in dictionary:
        for variant in dictionary[replacement]:
            result = result.replace(variant, replacement)
    return result

#4 Transform word form into Vietnamese (colonsmile - cười)
def word_form2Vnese(sentence):
    """Replace each token of *sentence* that is a key of ``wordform2vnese_dict``.

    The sentence is split on whitespace; a token found in the dictionary is
    swapped for its mapped value, any other token is kept as-is. Tokens are
    re-joined with single spaces.
    """
    return ' '.join(
        wordform2vnese_dict.get(token, token) for token in sentence.split()
    )

#3 Remove punctuation
def remove_punctuation(input_string):
    """Delete every character of ``string.punctuation`` from *input_string*.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are removed; everything else is left in place.
    """
    # Mapping an ordinal to None tells str.translate to delete that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return input_string.translate(deletions)

#2 emoticon to word form  ( :) - colonsmile )
def emoticon2word(sentence):
    """Replace each token of *sentence* that is a key of ``emotion2wordform_dict``.

    The sentence is split on whitespace; a token found in the dictionary is
    swapped for its mapped value, any other token is kept as-is. Tokens are
    re-joined with single spaces.
    """
    tokens = sentence.split()
    mapped = map(lambda token: emotion2wordform_dict.get(token, token), tokens)
    return ' '.join(mapped)

#1 lower case
def lower_case(text):
    """Return a lower-cased copy of *text*."""
    lowered = text.lower()
    return lowered

def data_preprocessing(text):
    """Run the cleaning pipeline over *text* and return the final string.

    Each step receives the previous step's output as its only argument, so
    ``remove_stopwords`` reads its default stop-word file.
    """
    # Listed in execution order (innermost call first).
    pipeline = (
        lower_case,
        remove_punctuation,
        mispell2word,
        number2text,
        remove_extra_whitespace,
        word_segment,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text

def read_input(input):  # final function used when reading and processing the input sentence
    """Preprocess the *input* sentence by delegating to ``data_preprocessing``."""
    # NOTE: the parameter name shadows the builtin ``input``; it is kept so
    # existing keyword callers keep working.
    processed = data_preprocessing(input)
    return processed