"""
Author: Peter DUlworth
Date: 02/22/2019
This file contains helper methods to generate request headers.
"""
from enum import Enum
import random
import requests
from lxml.html import fromstring
from itertools import cycle
import traceback
# Pool of real-world browser User-Agent strings; getHeaders() picks one at
# random per request to make scraping traffic look less uniform.
userAgents = [
# Chrome on Windows / Linux
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# Internet Explorer (MSIE / Trident) — NOTE: these are IE strings, not Firefox
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]
class Site(Enum):
    """Target sites that getHeaders() knows how to build headers for."""

    # Values kept as explicit integers so they stay stable for callers.
    SA = 1       # Seeking Alpha
    NASDAQ = 2   # Nasdaq
def getFreeProxies():
    """Scrape free-proxy-list.net for candidate proxies.

    Keeps only proxies that advertise HTTPS support and are located in the
    US, CA, or MX.

    Returns:
        list[str]: proxies as "ip:port" strings (possibly empty).

    Raises:
        requests.RequestException: if the proxy-list site cannot be fetched.
    """
    url = 'https://free-proxy-list.net/'
    # timeout so an unreachable/slow proxy-list site can't hang the program
    response = requests.get(url, timeout=10)
    parser = fromstring(response.text)
    proxies = set()
    # look at the first 500 rows of the proxy table
    for row in parser.xpath('//tbody/tr')[:500]:
        # skip proxies that do not support HTTPS (7th column must say "yes")
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        # keep only proxies located in the US, CA, or MX (3rd column)
        if any(row.xpath('.//td[3][contains(text(),"%s")]' % cc) for cc in ('US', 'CA', 'MX')):
            # columns 1 and 2 are the IP and port
            proxy = ":".join([row.xpath('.//td[1]/text()')[0], row.xpath('.//td[2]/text()')[0]])
            proxies.add(proxy)
    print("Possible Proxies: ", proxies)
    return list(proxies)
def getValidProxies():
    """Probe scraped proxies and return those that actually mask our IP.

    Tests at most three proxies, but keeps trying further ones until at
    least one valid proxy is found or the pool is exhausted.

    Returns:
        list[str]: working "ip:port" proxies (possibly empty).
    """
    proxies = getFreeProxies()
    # no free proxies found — nothing to validate
    if not proxies:
        return []
    random.shuffle(proxies)
    proxy_pool = cycle(proxies)
    validProxies = set()
    atLeastOneValid = False
    # httpbin echoes back the IP it sees; fetch ours once for comparison
    url = 'https://httpbin.org/ip'
    myIP = requests.get(url, timeout=10).json()
    i = 0
    # test at most three proxies (but keep testing if we haven't found a valid one yet)
    while i < min(len(proxies), 3) or not atLeastOneValid:
        if i >= len(proxies):
            # exhausted the whole pool without a single valid proxy
            return list(validProxies)
        # get a proxy from the pool
        proxy = next(proxy_pool)
        print("\nRequest #%d using %s" % (i, proxy))
        try:
            response = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=1.0)
            if myIP == response.json():
                # transparent proxy: our real IP leaked through — reject it
                print("Proxy doesn't properly mask IP.")
            else:
                validProxies.add(proxy)
                atLeastOneValid = True
                print(response.json())
        except requests.RequestException:
            # free proxies are flaky; connection errors are expected, so
            # just move on to the next candidate
            print("Skipping. Connection error")
        i += 1
    print("Valid Proxies: ", list(validProxies))
    return list(validProxies)
def getProxy():
    """Choose one validated proxy at random.

    Returns:
        dict: a requests-style proxies mapping {"http": "ip:port"}, or an
        empty dict when no working proxy could be found.
    """
    validProxies = getValidProxies()
    if not validProxies:
        print("NO PROXY FOUND")
        return {}
    validProxy = random.choice(validProxies)
    print("Chosen Proxy: ", validProxy)
    # "https" is deliberately omitted: the pool was only verified over http
    return {"http": validProxy}
def getHeaders(siteEnum):
    """Build a browser-like header dict for a request to the given site.

    The Referer is chosen per site and the User-Agent is drawn at random
    from the module-level pool; unrecognized sites get empty host/referer.
    """
    # pick the site-specific host and referer
    if siteEnum == Site.SA:
        host, ref = 'www.seekingalpha.com', 'https://seekingalpha.com'
    elif siteEnum == Site.NASDAQ:
        host, ref = 'www.nasdaq.com', 'https://www.nasdaq.com'
    else:
        host, ref = '', ''
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "Connection": "keep-alive",
        # "Host": host,
        "Referer": ref,
        "Upgrade-Insecure-Requests": "1",
        # randomize the user agent
        "User-Agent": random.choice(userAgents),
    }
    return headers
if __name__ == "__main__":
print(getProxy())