# Spaces:
# Runtime error
# Runtime error
import json
import math
import os
import random
import re
import time

import numpy as np
import pandas as pd
from nltk.stem.isri import ISRIStemmer
from pyarabic.araby import strip_tashkeel, strip_tatweel, tokenize
isristemmer = ISRIStemmer()


def stemming(txt):
    """Stem every whitespace-separated word of ``txt`` with the ISRI stemmer.

    ``ISRIStemmer.stem`` works on a single word token. Passing a whole
    sentence to it treats the sentence as one long word, so each run of
    non-whitespace characters is stemmed on its own here and the original
    whitespace between words is left untouched. A single-word input gives
    the same result as calling ``isristemmer.stem`` directly.

    Args:
        txt: Text to stem (one word or many).

    Returns:
        The text with every word replaced by its stem.
    """
    # NOTE(review): if a downstream model was fitted on text produced by
    # whole-string stemming, re-check it against this per-word output.
    return re.sub(r'\S+', lambda match: isristemmer.stem(match.group()), txt)
def remove_singleCharacter(text):
    """Drop every token of ``text`` that is exactly one character long.

    The text is split with ``pyarabic.araby.tokenize`` and the remaining
    tokens are joined back with single spaces.

    Args:
        text: Text to filter.

    Returns:
        The kept tokens joined by single spaces ('' when nothing is kept).
    """
    return ' '.join(word for word in tokenize(text) if len(word) != 1)
# Characters that remove_punctuations blanks out: Latin punctuation followed
# by Arabic/typographic marks. The first literal is raw so that the backslash
# stays a plain character instead of forming an invalid escape sequence.
_PUNCTUATION_CHARS = (
    r'''()-[]{};:'"\,<>./@#$%^&*،؛_~'''
    '''`÷×؛_ـ،/:".,'~¦+|”…“–ـ=﴾﴿ ﹱ ﹹ ⸀˓• ב'''
)
# Built once: maps each listed character to a single space.
_PUNCTUATION_TABLE = str.maketrans({char: ' ' for char in _PUNCTUATION_CHARS})


def remove_punctuations(text):
    """Replace every listed punctuation character in ``text`` with a space.

    Each punctuation character becomes exactly one space, so the length of
    the text is unchanged and runs of punctuation become runs of spaces.

    Args:
        text: Text to clean.

    Returns:
        The text with punctuation characters replaced by spaces.
    """
    # One translate pass instead of one str.replace pass per character.
    return text.translate(_PUNCTUATION_TABLE)
def normalize_text(txt):
    """Strip tashkeel and tatweel, then squeeze runs of a repeated character.

    After ``strip_tashkeel`` and ``strip_tatweel`` have been applied, any
    character that is identical to the one directly before it is dropped,
    so every run of the same character shrinks to a single occurrence.

    Args:
        txt: Text to normalize.

    Returns:
        The normalized text.
    """
    stripped = strip_tatweel(strip_tashkeel(txt))
    # The first character is always kept; every later character is kept
    # only when it differs from its immediate predecessor.
    kept = [cur for prev, cur in zip(stripped, stripped[1:]) if prev != cur]
    return stripped[:1] + ''.join(kept)
def remove_stopwords(txt, path="stopword.txt"):
    """Remove the words listed in a stop-word file from ``txt``.

    ``txt`` is split on single spaces and every piece that exactly matches
    a line of the stop-word file is dropped. The file is read on every call.

    Args:
        txt: Text to filter.
        path: UTF-8 text file with one stop word per line.

    Returns:
        The remaining words joined by single spaces.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as stop_words_file:
        # A set gives constant-time membership checks per word.
        stop_words = set(stop_words_file.read().split('\n'))
    return ' '.join(word for word in txt.split(' ') if word not in stop_words)
def Remove_unwanted(text):
    """Strip links, Latin letters and digits, then unify Arabic letter variants.

    Steps, in order:
      1. Remove lines that start with an http(s) link and any link left
         inside a line.
      2. Collapse whitespace runs, remove Latin letters, trim both ends.
      3. Remove digits (plus non-word characters directly before a trailing
         number).
      4. Map the alef variants to bare alef, alef maqsura to yeh, and hamza
         on waw/yeh to a standalone hamza.
      5. Collapse runs of spaces.

    Trimming happens before digits are removed, so the result can still
    begin or end with a single space.

    Args:
        text: Text to clean.

    Returns:
        The cleaned text.
    """
    # removing the extra spacing and links
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[a-zA-Z]+', ' ', text)
    text = re.sub(r"^\s+|\s+$", "", text)
    # removing numbers
    text = re.sub(r"(\s\d+)", " ", text)
    text = re.sub(r"\b\d+\b|\W+\d+$", " ", text)
    text = re.sub(r"\d+", " ", text)
    # unifying letter variants; replacement strings are literal text, so
    # they must be the plain target letter and never a character class
    text = re.sub(r'[إأٱآا]', 'ا', text)
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'[ؤئ]', 'ء', text)
    text = re.sub(r' +', ' ', text)
    return text
def txt_preprocess(text):
    """Run ``text`` through the full cleaning pipeline and return the result.

    The steps are applied in this fixed order, each one receiving the
    output of the previous one: ``normalize_text``, ``stemming``,
    ``remove_stopwords`` (with its default stop-word file),
    ``remove_punctuations`` and ``Remove_unwanted``.

    Args:
        text: Raw input text.

    Returns:
        The preprocessed text.
    """
    pipeline = (
        normalize_text,
        stemming,
        remove_stopwords,
        remove_punctuations,
        Remove_unwanted,
    )
    for step in pipeline:
        text = step(text)
    return text