from finnlp.data_sources.social_media._base import Social_Media_Downloader
from tqdm import tqdm
from lxml import etree
import requests
import pandas as pd
import json
import base64
class Reddit_Streaming(Social_Media_Downloader):
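    """Stream posts from r/wallstreetbets into self.dataframe.

    The first page is scraped from the subreddit's HTML; later pages are
    fetched from Reddit's GraphQL endpoint, cursored by the base64-encoded
    ID of the last post seen.
    """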
    def __init__(self, args={}):
super().__init__(args)
self.dataframe = pd.DataFrame()

    def download_streaming_all(self, rounds=3):
        # Download the first page by URL
        base_url = "https://www.reddit.com/r/wallstreetbets/new/"
        pbar = tqdm(total=rounds, desc="Downloading by pages...")
res = self._request_get(base_url)
        if res is None:
            raise ConnectionError("Failed to download the initial page from Reddit")
        # Get the embedded data from the initial page
        html = etree.HTML(res.text)
        init = html.xpath("//*[@id='data']/text()")[0]
        # Strip the leading JS assignment (14 characters) and the trailing ";"
        # so that only the JSON object remains
        init = json.loads(init[14:][:-1])
init = init["posts"]["models"]
        tmp_df = pd.DataFrame(init).T.reset_index(drop=True)
self.dataframe = tmp_df
        # Keep only the short keys, which correspond to regular post IDs;
        # longer keys belong to other embedded models
        init = [i for i in init if len(i) < 12]
last_id = init[-1]
last_id = self._encode_base64(last_id)
pbar.update(1)
        # Fetch the remaining pages, passing the pagination cursor forward
        if rounds > 1:
            for _ in range(1, rounds):
                last_id = self._fetch_other_pages(last_id, pbar)

    def _fetch_other_pages(self, last_page, pbar):
        url = "https://gql.reddit.com/"
headers = {
"referer":"https://www.reddit.com/",
"authorization": "Bearer -twjFZkBAlpR8gZnZqsGHvz-G5c49PA",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
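        # Variables for the GraphQL query identified by "id"; "after" carries
        # the pagination cursor produced by the previous round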
data = {
"id": "02e3b6d0d0d7",
"variables": {
"name": "wallstreetbets",
"includeIdentity": False,
"adContext": {
"layout": "CARD",
"clientSignalSessionData": {
"adsSeenCount": 4,
"totalPostsSeenCount": 79,
"sessionStartTime": "2023-04-07T15:32:13.933Z",
}
},
"isFake": False,
"includeAppliedFlair": False,
"includeDevPlatformMetadata": True,
"includeRecents": False,
"includeTrending": False,
"includeSubredditRankings": True,
"includeSubredditChannels": False,
"isAdHocMulti": False,
"isAll": False,
"isLoggedOutGatedOptedin": False,
"isLoggedOutQuarantineOptedin": False,
"isPopular": False,
"recentPostIds": [],
"subredditNames": [],
"sort": "NEW",
"pageSize": 25,
"after": last_page
}
}
        response = self._request_post(url=url, headers=headers, json=data)
        if response is None:
            raise ConnectionError("Failed to download posts from Reddit")
        data = json.loads(response.text)
        data = data["data"]["subredditInfoByName"]["elements"]["edges"]
for d in data:
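            # Keep only real posts; skip other edge types such as promoted content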
if d["node"]["__typename"] == "SubredditPost":
tmp = pd.DataFrame(d).T
self.dataframe = pd.concat([self.dataframe, tmp])
last_id = tmp.id.values[0]
last_id = self._encode_base64(last_id)
pbar.update(1)
return last_id

    def _encode_base64(self, post_id):
        # Reddit's GraphQL "after" cursor is the base64-encoded post ID
        return base64.b64encode(post_id.encode("utf-8")).decode()
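

if __name__ == "__main__":
    # Usage sketch (illustrative, not part of the original module): stream
    # three pages of new r/wallstreetbets posts into a single DataFrame.
    downloader = Reddit_Streaming()
    downloader.download_streaming_all(rounds=3)
    print(downloader.dataframe.shape)
    print(downloader.dataframe.head())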