from finnlp.data_sources.social_media._base import Social_Media_Downloader
from tqdm import tqdm
from lxml import etree
import requests
import pandas as pd
import json
import base64
class Reddit_Streaming(Social_Media_Downloader):
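    """Stream posts from r/wallstreetbets into self.dataframe.

    The first page is scraped from the subreddit's HTML; later pages are
    fetched from Reddit's GraphQL endpoint, cursored by the base64-encoded
    ID of the last post seen.
    """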
    def __init__(self, args={}):
super().__init__(args)
self.dataframe = pd.DataFrame()

    def download_streaming_all(self, rounds=3):
        # Download the first page by URL
        base_url = "https://www.reddit.com/r/wallstreetbets/new/"
        pbar = tqdm(total=rounds, desc="Downloading by pages...")
res = self._request_get(base_url)
        if res is None:
            raise ConnectionError("Failed to download the initial page from Reddit")
        # Get the embedded data from the initial page
        html = etree.HTML(res.text)
        init = html.xpath("//*[@id='data']/text()")[0]
        # Strip the leading JS assignment (14 characters) and the trailing ";"
        # so that only the JSON object remains
        init = json.loads(init[14:][:-1])
init = init["posts"]["models"]
        tmp_df = pd.DataFrame(init).T.reset_index(drop=True)
self.dataframe = tmp_df
        # Keep only the short keys, which correspond to regular post IDs;
        # longer keys belong to other embedded models
        init = [i for i in init if len(i) < 12]
last_id = init[-1]
last_id = self._encode_base64(last_id)
pbar.update(1)
        # Fetch the remaining pages, passing the pagination cursor forward
        if rounds > 1:
            for _ in range(1, rounds):
                last_id = self._fetch_other_pages(last_id, pbar)

    def _fetch_other_pages(self, last_page, pbar):
        url = "https://gql.reddit.com/"
headers = {
"referer":"https://www.reddit.com/",
"authorization": "Bearer -twjFZkBAlpR8gZnZqsGHvz-G5c49PA",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36"
}
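        # Variables for the GraphQL query identified by "id"; "after" carries
        # the pagination cursor produced by the previous round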
data = {
"id": "02e3b6d0d0d7",
"variables": {
"name": "wallstreetbets",
"includeIdentity": False,
"adContext": {
"layout": "CARD",
"clientSignalSessionData": {
"adsSeenCount": 4,
"totalPostsSeenCount": 79,
"sessionStartTime": "2023-04-07T15:32:13.933Z",
}
},
"isFake": False,
"includeAppliedFlair": False,
"includeDevPlatformMetadata": True,
"includeRecents": False,
"includeTrending": False,
"includeSubredditRankings": True,
"includeSubredditChannels": False,
"isAdHocMulti": False,
"isAll": False,
"isLoggedOutGatedOptedin": False,
"isLoggedOutQuarantineOptedin": False,
"isPopular": False,
"recentPostIds": [],
"subredditNames": [],
"sort": "NEW",
"pageSize": 25,
"after": last_page
}
}
        response = self._request_post(url=url, headers=headers, json=data)
        if response is None:
            raise ConnectionError("Failed to download posts from Reddit")
        data = json.loads(response.text)
        data = data["data"]["subredditInfoByName"]["elements"]["edges"]
for d in data:
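            # Keep only real posts; skip other edge types such as promoted content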
if d["node"]["__typename"] == "SubredditPost":
tmp = pd.DataFrame(d).T
self.dataframe = pd.concat([self.dataframe, tmp])
last_id = tmp.id.values[0]
last_id = self._encode_base64(last_id)
pbar.update(1)
return last_id

    def _encode_base64(self, post_id):
        # Reddit's GraphQL "after" cursor is the base64-encoded post ID
        return base64.b64encode(post_id.encode("utf-8")).decode()
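

if __name__ == "__main__":
    # Usage sketch (illustrative, not part of the original module): stream
    # three pages of new r/wallstreetbets posts into a single DataFrame.
    downloader = Reddit_Streaming()
    downloader.download_streaming_all(rounds=3)
    print(downloader.dataframe.shape)
    print(downloader.dataframe.head())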