""" | |
Author: Peter DUlworth | |
Date: 02/22/2019 | |
This file contains helper methods to generate request headers. | |
""" | |
from enum import Enum | |
import random | |
import requests | |
from lxml.html import fromstring | |
from itertools import cycle | |
import traceback | |
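
# Pool of real desktop user-agent strings (Chrome and Internet Explorer).
# getHeaders() picks one at random so repeated requests don't all share the same fingerprint.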
userAgents = [
    # Chrome
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    # Internet Explorer
    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
]


class Site(Enum):
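    """Sites we build request headers for: SA = Seeking Alpha, NASDAQ = Nasdaq."""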
    SA = 1
    NASDAQ = 2


def getFreeProxies():
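    """
    Scrape https://free-proxy-list.net/ and return a list of "ip:port" strings
    for proxies that support HTTPS and are located in the US, CA, or MX.
    """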
    url = 'https://free-proxy-list.net/'
    response = requests.get(url)
    parser = fromstring(response.text)
    proxies = set()

    # look at up to 500 rows of the proxy table
    for i in parser.xpath('//tbody/tr')[:500]:
        # only keep proxies that support HTTPS (column 7)
        if i.xpath('.//td[7][contains(text(),"yes")]'):
            # only keep proxies located in the US, CA, or MX (column 3)
            if i.xpath('.//td[3][contains(text(),"US")]') or i.xpath('.//td[3][contains(text(),"CA")]') or i.xpath('.//td[3][contains(text(),"MX")]'):
                # save the proxy as "ip:port"
                proxy = ":".join([i.xpath('.//td[1]/text()')[0], i.xpath('.//td[2]/text()')[0]])
                proxies.add(proxy)

    print("Possible Proxies: ", proxies)
    return list(proxies)


def getValidProxies():
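    """
    Test a few of the free proxies and return a list of those that respond and
    actually mask our IP. Returns an empty list if no usable proxy is found.
    """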
    proxies = getFreeProxies()

    # if we couldn't find any free proxies, just return an empty list
    if not proxies:
        return []

    random.shuffle(proxies)
    proxy_pool = cycle(proxies)
    validProxies = set()
    atLeastOneValid = False

    # find our own IP so we can verify that a proxy actually masks it
    url = 'https://httpbin.org/ip'
    myIP = requests.get(url).json()

    i = 0
    # test at most three proxies (but keep testing if we haven't found a valid one yet)
    while i < min(len(proxies), 3) or not atLeastOneValid:
        if i >= len(proxies):
            return list(validProxies)

        # get a proxy from the pool
        proxy = next(proxy_pool)
        print("\nRequest #%d using %s" % (i, proxy))

        try:
            response = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=1.0)
            # the proxy is no good if httpbin still sees our real IP
            if myIP == response.json():
                raise AssertionError('Proxy doesn\'t properly mask IP.')
            validProxies.add(proxy)
            atLeastOneValid = True
            print(response.json())
        except AssertionError:
            print('Proxy doesn\'t properly mask IP.')
        except Exception:
            # Most free proxies fail with connection errors. A robust client would
            # retry the request with another proxy; here we simply skip the bad one.
            print("Skipping. Connection error")

        i += 1

    print("Valid Proxies: ", list(validProxies))
    return list(validProxies)


def getProxy():
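    """
    Pick one valid proxy at random and return it as a requests-style proxies
    dict mapping "http" to an "ip:port" string, or an empty dict if none was found.
    """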
    validProxies = getValidProxies()

    if validProxies:
        validProxy = random.choice(validProxies)
        print("Chosen Proxy: ", validProxy)
        return {"http": validProxy}
        # return {"http": validProxy, "https": validProxy}
    else:
        print("NO PROXY FOUND")
        return {}


def getHeaders(siteEnum):
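    """
    Build a browser-like header dict for the given Site, using the matching
    Referer and a user agent picked at random from userAgents.
    """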
    # use the correct referrer and host for the site
    if siteEnum == Site.SA:
        host = 'www.seekingalpha.com'
        ref = 'https://seekingalpha.com'
    elif siteEnum == Site.NASDAQ:
        host = 'www.nasdaq.com'
        ref = 'https://www.nasdaq.com'
    else:
        host = ''
        ref = ''

    # randomize the user agent
    userAgent = random.choice(userAgents)

    return {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "Connection": "keep-alive",
        # "Host": host,
        "Referer": ref,
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": userAgent,
    }
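

# Usage sketch: fetch a page with a randomized header set and, when one is found,
# a free proxy. exampleFetch and its url argument are only illustrative; callers
# would pass in whichever Seeking Alpha or Nasdaq page they want to download.
def exampleFetch(url, siteEnum):
    headers = getHeaders(siteEnum)
    proxies = getProxy()  # may be {} if no working proxy was found
    return requests.get(url, headers=headers, proxies=proxies, timeout=10)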
if __name__ == "__main__": | |
print(getProxy()) | |