!pip install transformers import pandas as pd from wordcloud import WordCloud import seaborn as sns import re import string from collections import Counter, defaultdict from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer import plotly.express as px from plotly.subplots import make_subplots import plotly.graph_objects as go from plotly.offline import plot import matplotlib.gridspec as gridspec from matplotlib.ticker import MaxNLocator import matplotlib.patches as mpatches import matplotlib.pyplot as plt import warnings warnings.filterwarnings('ignore') import nltk nltk.download('stopwords') from nltk.corpus import stopwords stopWords_nltk = set(stopwords.words('english')) import re from typing import Union, List class CleanText(): """ clearing text except digits () . , word character """ def __init__(self, clean_pattern = r"[^A-ZĞÜŞİÖÇIa-zğüı'şöç0-9.\"',()]"): self.clean_pattern =clean_pattern def __call__(self, text: Union[str, list]) -> List[List[str]]: if isinstance(text, str): docs = [[text]] if isinstance(text, list): docs = text text = [[re.sub(self.clean_pattern, " ", sent) for sent in sents] for sents in docs] return text def remove_emoji(data): emoj = re.compile("[" u"\U0001F600-\U0001F64F" # emoticons u"\U0001F300-\U0001F5FF" # symbols & pictographs u"\U0001F680-\U0001F6FF" # transport & map symbols u"\U0001F1E0-\U0001F1FF" # flags (iOS) u"\U00002500-\U00002BEF" u"\U00002702-\U000027B0" u"\U00002702-\U000027B0" u"\U000024C2-\U0001F251" u"\U0001f926-\U0001f937" u"\U00010000-\U0010ffff" u"\u2640-\u2642" u"\u2600-\u2B55" u"\u200d" u"\u23cf" u"\u23e9" u"\u231a" u"\ufe0f" # dingbats u"\u3030" "]+", re.UNICODE) return re.sub(emoj, '', data)