# NOTE(review): extraction artifact — the original notebook header ("Spaces",
# two "Runtime error" status lines) was captured here. The reported runtime
# error is consistent with the undefined name `ar` used in
# remove_singleCharacter below.
| from nltk.stem.isri import ISRIStemmer | |
| from pyarabic.araby import strip_tashkeel, strip_tatweel | |
| import numpy as np | |
| import pandas as pd | |
| import json | |
| import re | |
| import time | |
| import os | |
| import math | |
| import random | |
# One shared stemmer instance is enough: ISRI stemming keeps no per-call state.
isristemmer = ISRIStemmer()


def stemming(txt):
    """Return the ISRI stem (root form) of *txt*."""
    stemmed = isristemmer.stem(txt)
    return stemmed
def remove_singleCharacter(text):
    """Drop every single-character token from *text*.

    Returns the remaining tokens joined by single spaces.

    BUG FIX: the original called ``ar.tokenize(text)`` but ``ar`` was never
    imported, so every call raised NameError.  Whitespace tokenization is
    equivalent here because the result is re-joined with spaces anyway.
    """
    kept = [word for word in text.split() if len(word) != 1]
    return ' '.join(kept)
# remove_punctuations
def remove_punctuations(text):
    """Replace every Latin and Arabic punctuation mark in *text* with a space.

    Behavior is identical to the original chained-``replace`` version, but
    ``str.translate`` performs all substitutions in a single C-level pass
    instead of one full string scan per punctuation character.
    """
    punc = '''()-[]{};:'"\,<>./@#$%^&*،؛_~'''
    arabic_punctuations = '''`÷×؛_ـ،/:".,'~¦+|”…“–ـ=﴾﴿ ﹱ ﹹ ⸀˓• ב'''
    punctuations_list = punc + arabic_punctuations
    # Duplicate characters in the combined list simply overwrite the same
    # table entry, so the mapping stays correct.
    table = {ord(ch): ' ' for ch in punctuations_list}
    return text.translate(table)
def normalize_text(txt):
    """Normalize Arabic text: strip diacritics (tashkeel) and tatweel
    elongation, then collapse runs of identical consecutive characters
    into a single character.
    """
    txt = strip_tatweel(strip_tashkeel(txt))
    deduped = []
    for ch in txt:
        # Keep a character only when it differs from the previous one.
        if not deduped or deduped[-1] != ch:
            deduped.append(ch)
    return ''.join(deduped)
def remove_stopwords(txt, path="stopword.txt"):
    """Remove Arabic stop words from *txt*.

    txt  : space-separated text.
    path : UTF-8 file containing one stop word per line.

    Returns the surviving words joined by single spaces.

    Fixes over the original: the file handle was never closed (resource
    leak) — ``with`` guarantees closure; and the stop-word list is now a
    set, making each membership test O(1) instead of O(n).
    """
    with open(path, 'r', encoding='utf-8') as f:
        arabic_stop_words = set(f.read().split('\n'))
    return ' '.join(w for w in txt.split(' ') if w not in arabic_stop_words)
def Remove_unwanted(text):
    """Clean raw text: remove links, Latin letters and digits, squeeze
    whitespace, and normalize common Arabic letter variants.
    """
    # Drop whole lines that start with a URL, then any remaining inline URLs.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    text = re.sub(r'^http?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"https\S+", " ", text)
    # Collapse whitespace, remove Latin letters, trim the ends.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[a-zA-Z]+', ' ', text)
    text = re.sub(r"^\s+|\s+$", "", text)
    # Remove digits (standalone and attached runs).
    text = re.sub(r"(\s\d+)", " ", text)
    text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", " ", text)
    text = re.sub(r"\d+", " ", text)
    # Normalize alef variants to bare alef.
    text = re.sub(r'[إأٱآا]', 'ا', text)
    # BUG FIX: in the original the pattern and replacement of the next two
    # substitutions were swapped, so the literal strings "[ي]" and "[ؤئ]"
    # were inserted into the text.  Intended normalization: ى -> ي and
    # ؤ/ئ -> ء.
    text = re.sub(r'ى', 'ي', text)
    text = re.sub(r'[ؤئ]', 'ء', text)
    text = re.sub(r' +', ' ', text)
    return text
def txt_preprocess(text):
    """Run the full Arabic preprocessing pipeline on *text*.

    Order matters: normalization, stemming, stop-word removal (uses the
    default "stopword.txt" path), punctuation removal, then the final
    link/digit/letter-variant cleanup.
    """
    pipeline = (
        normalize_text,
        stemming,
        remove_stopwords,
        remove_punctuations,
        Remove_unwanted,
    )
    for step in pipeline:
        text = step(text)
    return text