Sigrid De los Santos
Remove remaining binary file for Hugging Face
9df4cc0
"""
Author: Peter DUlworth
Date: 02/22/2019
This file contains helper methods to generate request headers.
"""
from enum import Enum
import random
import requests
from lxml.html import fromstring
from itertools import cycle
import traceback
userAgents = [
# Chrome
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# Firefox
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]
class Site(Enum):
SA = 1
NASDAQ = 2
def getFreeProxies():
url = 'https://free-proxy-list.net/'
response = requests.get(url)
parser = fromstring(response.text)
proxies = set()
# look at 400 rows of the proxy table
for i in parser.xpath('//tbody/tr')[:500]:
# if the proxy support HTTPS
if i.xpath('.//td[7][contains(text(),"yes")]'):
# if the proxy is in the US, CA, MX
if i.xpath('.//td[3][contains(text(),"US")]') or i.xpath('.//td[3][contains(text(),"CA")]') or i.xpath('.//td[3][contains(text(),"MX")]'):
# save the proxy to our list
proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
proxies.add(proxy)
print ("Possible Proxies: ", proxies)
return list(proxies)
def getValidProxies():
proxies = getFreeProxies()
# if there we couldn't find any free proxies, well bummer just return an empty set
if not proxies:
return []
random.shuffle(proxies)
proxy_pool = cycle(proxies)
validProxies = set()
atLeastOneValid = False
# find my IP
url = 'https://httpbin.org/ip'
myIP = requests.get(url).json()
i = 0
# test at most three proxies (but keep testing if we haven't found a valid one yet)
while (i < min(len(proxies), 3) or not atLeastOneValid):
if i >= len(proxies):
return list(validProxies)
#Get a proxy from the pool
proxy = next(proxy_pool)
print("\nRequest #%d using %s" % (i, proxy))
try:
response = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=1.0)
# not good if it doesn't mask
if myIP == response.json():
raise AssertionError('Proxy doesn\'t properly mask IP.')
validProxies.add(proxy)
atLeastOneValid = True
print(response.json())
except AssertionError:
print('Proxy doesn\'t properly mask IP.')
except:
# Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work.
# We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url
print("Skipping. Connnection error")
i += 1
print ("Valid Proxies: ", list(validProxies))
return list(validProxies)
def getProxy():
proxies = {'http':'96.47.238.50:443'}
validProxies = getValidProxies()
if validProxies:
validProxy = random.choice(validProxies)
print ("Chosen Proxy: ", validProxy)
return { "http": validProxy }
# return { "http": validProxy, "https": validProxy }
else:
print ("NO PROXY FOUND")
return {}
def getHeaders(siteEnum):
# use the correct referrer and host
if siteEnum == Site.SA:
host = 'www.seekingalpha.com'
ref = 'https://seekingalpha.com'
elif siteEnum == Site.NASDAQ:
host = 'www.nasdaq.com'
ref = 'https://www.nasdaq.com'
else:
host = ''
ref = ''
# randomize the user agent
userAgent = random.choice(userAgents)
return {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
"Connection": "keep-alive",
# "Host": host,
"Referer": ref,
"Upgrade-Insecure-Requests": "1",
"User-Agent": userAgent
}
if __name__ == "__main__":
print(getProxy())