Zayed13 committed on
Commit
5e28a48
·
1 Parent(s): a425f9a

Create Untitled0.ipynb

Browse files
Files changed (1) hide show
  1. Untitled0.ipynb +68 -0
Untitled0.ipynb ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ !pip install transformers
2
+ import pandas as pd
3
+ from wordcloud import WordCloud
4
+ import seaborn as sns
5
+ import re
6
+ import string
7
+ from collections import Counter, defaultdict
8
+
9
+ from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
10
+
11
+ import plotly.express as px
12
+ from plotly.subplots import make_subplots
13
+ import plotly.graph_objects as go
14
+ from plotly.offline import plot
15
+
16
+ import matplotlib.gridspec as gridspec
17
+ from matplotlib.ticker import MaxNLocator
18
+ import matplotlib.patches as mpatches
19
+ import matplotlib.pyplot as plt
20
+ import warnings
21
+ warnings.filterwarnings('ignore')
22
+ import nltk
23
+ nltk.download('stopwords')
24
+ from nltk.corpus import stopwords
25
+ stopWords_nltk = set(stopwords.words('english'))
26
+
27
+
28
+ import re
29
+ from typing import Union, List
30
class CleanText:
    """Replace every character outside an allowed set with a single space.

    With the default pattern the characters that survive are ASCII letters,
    the Turkish letters listed in the pattern, digits, and the punctuation
    marks period, comma, parentheses, apostrophe and double quote. Every
    other character (whitespace included) is replaced, one for one, by a
    space; runs of spaces are not collapsed.
    """

    def __init__(self, clean_pattern = r"[^A-ZĞÜŞİÖÇIa-zğüı'şöç0-9.\"',()]"):
        """Store the regular expression describing the characters to blank out.

        Args:
            clean_pattern: Regex whose matches are replaced by a space.
        """
        self.clean_pattern = clean_pattern

    def __call__(self, text: Union[str, list]) -> List[List[str]]:
        """Clean a single string or a nested list of sentences.

        Args:
            text: Either one string, which is treated as a single document
                holding a single sentence, or a list of documents where each
                document is itself an iterable of sentence strings. Each list
                element is iterated directly, so a bare string element is
                processed character by character.

        Returns:
            A list of documents, each a list of cleaned sentence strings.

        Raises:
            TypeError: If ``text`` is neither a ``str`` nor a ``list``.
        """
        if isinstance(text, str):
            docs = [[text]]
        elif isinstance(text, list):
            docs = text
        else:
            # Without this branch ``docs`` would be unbound and the caller
            # would see an unrelated UnboundLocalError.
            raise TypeError(
                f"text must be a str or a list, got {type(text).__name__}"
            )

        return [
            [re.sub(self.clean_pattern, " ", sent) for sent in sents]
            for sents in docs
        ]
47
# Compiled once at import time so repeated calls do not rebuild the pattern.
# A single range running from U+24C2 up to U+1F251 would span most of the
# Basic Multilingual Plane (CJK ideographs, kana, Hangul, ...), so the emoji
# code points that fall inside that span are listed individually instead.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010ffff"  # every supplementary-plane code point (all emoji blocks live here)
    "\u2500-\u2bef"  # box drawing through misc. symbols and arrows (includes dingbats)
    "\u24c2"  # circled latin capital letter M
    "\u200d"  # zero width joiner used inside emoji sequences
    "\u231a"  # watch
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward
    "\ufe0f"  # variation selector-16 (emoji presentation)
    "\u3030"  # wavy dash
    "\u303d"  # part alternation mark
    "\u3297"  # circled ideograph congratulation
    "\u3299"  # circled ideograph secret
    "]+"
)


def remove_emoji(data):
    """Return ``data`` with emoji and pictographic symbols removed.

    Matching characters are deleted outright (not replaced by a space), so
    text on either side of an emoji is joined together. Characters in the
    supplementary planes are removed wholesale, which also drops non-emoji
    astral characters; ordinary BMP scripts are left intact.

    Args:
        data: The string to strip.

    Returns:
        A new string without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', data)