entity-based-sentiment-analysis / twitter_scraper.py
wira.indra
add twitter feature
d49f00b
raw
history blame
1.58 kB
import snscrape.modules.twitter as sntwitter
import datetime as dt
import pandas as pd
import sys
import os
import re
import tqdm
def scrape_tweets(query, max_tweets=10, output_path="./scraper/output/" ):
    """Scrape tweets matching ``query`` and return them as a filtered DataFrame.

    Args:
        query: Search string passed to ``sntwitter.TwitterSearchScraper``.
        max_tweets: Maximum number of tweets to pull from the scraper; ``-1``
            disables the cap. The value is converted with ``int()``.
        output_path: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Datetime``, ``Tweet Id``,
        ``Text``, ``Username``, ``Likes``, ``Retweets``, ``Replies``,
        ``Quotes``, ``URL`` and ``Language``, restricted to rows whose
        ``Language`` equals ``"in"``. The language filter runs after the cap
        is applied, so fewer than ``max_tweets`` rows may be returned.
    """
    # Resolve the cap once instead of re-converting it on every iteration.
    limit = None if max_tweets == -1 else int(max_tweets)

    tweets_list = []
    scraped_items = sntwitter.TwitterSearchScraper(query).get_items()
    # The file does `import tqdm`, so the progress-bar class is `tqdm.tqdm`;
    # calling the module object itself raises TypeError.
    for i, tweet in tqdm.tqdm(enumerate(scraped_items)):
        if limit is not None and i >= limit:
            break
        tweets_list.append([
            tweet.date,
            tweet.id,
            tweet.content,
            tweet.user.username,
            tweet.likeCount,
            tweet.retweetCount,
            tweet.replyCount,
            tweet.quoteCount,
            tweet.url,
            tweet.lang,
        ])

    df = pd.DataFrame(
        tweets_list,
        columns=['Datetime', 'Tweet Id', 'Text', 'Username', 'Likes',
                 'Retweets', 'Replies', 'Quotes', 'URL', 'Language'],
    )
    # NOTE(review): "in" is presumably the language code reported for
    # Indonesian tweets - confirm against the scraper's output.
    df = df[df["Language"] == "in"]
    return df
def remove_unnecessary_char(text):
    """Replace placeholder tokens in ``text`` with spaces and collapse spaces.

    The literal tokens ``[USERNAME]``, ``[URL]`` and ``[SENSITIVE-NO]`` are
    each replaced by a single space, then every run of spaces is squeezed to
    one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading/trailing single spaces are not stripped.
    """
    # Raw strings keep the regex escapes (`\[`, `\]`) from being parsed as
    # invalid Python string escapes, which newer interpreters warn about.
    text = re.sub(r"\[USERNAME\]", " ", text)
    text = re.sub(r"\[URL\]", " ", text)
    text = re.sub(r"\[SENSITIVE-NO\]", " ", text)
    text = re.sub(r" +", " ", text)
    return text
def preprocess_tweet(text):
    """Strip tweet-specific noise (newlines, mentions, links, slashes).

    Steps, in order:
      1. newlines become spaces;
      2. a leading run of ``@mention`` tokens is replaced by one space;
      3. every remaining ``@mention`` is replaced by a space;
      4. ``www.`` / ``http(s)://`` links are replaced by a space;
      5. ``/`` characters become spaces;
      6. runs of spaces are collapsed to a single space.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    # Raw strings are used for all patterns so that regex escapes such as
    # `\w`, `\s` and `\.` are not parsed as (invalid) Python string escapes.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'^(\@\w+ ?)+', ' ', text)
    text = re.sub(r'\@\w+', ' ', text)
    # The third alternative (`http?://`) overlaps with `https?://` and also
    # matches "htt://"; it is kept so the set of matched links is unchanged.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
    text = re.sub(r'/', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text
def remove_nonaplhanumeric(text):
    """Replace each run of characters outside 0-9, a-z and A-Z with one space."""
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', text)
def preprocess_text(text):
    """Run the full cleaning pipeline on ``text`` and lowercase the result.

    Applies, in order, ``preprocess_tweet``, ``remove_unnecessary_char`` and
    ``remove_nonaplhanumeric``, then returns the lowercased string.
    """
    cleaning_steps = (
        preprocess_tweet,
        remove_unnecessary_char,
        remove_nonaplhanumeric,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned.lower()