"""
Author: Peter DUlworth
Date: 02/22/2019
This file contains helper methods to generate request headers.
"""
from enum import Enum
import random
import requests
from lxml.html import fromstring
from itertools import cycle
import traceback
# Pool of real-world browser User-Agent strings; getHeaders() picks one at
# random per request to make scraping traffic look less uniform.
userAgents = [
# Chrome on Windows / Linux
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
# Internet Explorer (MSIE / Trident) — NOTE: these are IE strings, not Firefox
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]
class Site(Enum):
    """Target sites that getHeaders() knows how to build headers for."""

    # Values kept as explicit integers so they stay stable for callers.
    SA = 1       # Seeking Alpha
    NASDAQ = 2   # Nasdaq
def getFreeProxies():
    """Scrape free-proxy-list.net for candidate proxies.

    Keeps only proxies that advertise HTTPS support and are located in the
    US, CA, or MX.

    Returns:
        list[str]: proxies as "ip:port" strings (possibly empty).

    Raises:
        requests.RequestException: if the proxy-list site cannot be fetched.
    """
    url = 'https://free-proxy-list.net/'
    # timeout so an unreachable/slow proxy-list site can't hang the program
    response = requests.get(url, timeout=10)
    parser = fromstring(response.text)
    proxies = set()
    # look at the first 500 rows of the proxy table
    for row in parser.xpath('//tbody/tr')[:500]:
        # skip proxies that do not support HTTPS (7th column must say "yes")
        if not row.xpath('.//td[7][contains(text(),"yes")]'):
            continue
        # keep only proxies located in the US, CA, or MX (3rd column)
        if any(row.xpath('.//td[3][contains(text(),"%s")]' % cc) for cc in ('US', 'CA', 'MX')):
            # columns 1 and 2 are the IP and port
            proxy = ":".join([row.xpath('.//td[1]/text()')[0], row.xpath('.//td[2]/text()')[0]])
            proxies.add(proxy)
    print("Possible Proxies: ", proxies)
    return list(proxies)
def getValidProxies():
    """Probe scraped proxies and return those that actually mask our IP.

    Tests at most three proxies, but keeps trying further ones until at
    least one valid proxy is found or the pool is exhausted.

    Returns:
        list[str]: working "ip:port" proxies (possibly empty).
    """
    proxies = getFreeProxies()
    # no free proxies found — nothing to validate
    if not proxies:
        return []
    random.shuffle(proxies)
    proxy_pool = cycle(proxies)
    validProxies = set()
    atLeastOneValid = False
    # httpbin echoes back the IP it sees; fetch ours once for comparison
    url = 'https://httpbin.org/ip'
    myIP = requests.get(url, timeout=10).json()
    i = 0
    # test at most three proxies (but keep testing if we haven't found a valid one yet)
    while i < min(len(proxies), 3) or not atLeastOneValid:
        if i >= len(proxies):
            # exhausted the whole pool without a single valid proxy
            return list(validProxies)
        # get a proxy from the pool
        proxy = next(proxy_pool)
        print("\nRequest #%d using %s" % (i, proxy))
        try:
            response = requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=1.0)
            if myIP == response.json():
                # transparent proxy: our real IP leaked through — reject it
                print("Proxy doesn't properly mask IP.")
            else:
                validProxies.add(proxy)
                atLeastOneValid = True
                print(response.json())
        except requests.RequestException:
            # free proxies are flaky; connection errors are expected, so
            # just move on to the next candidate
            print("Skipping. Connection error")
        i += 1
    print("Valid Proxies: ", list(validProxies))
    return list(validProxies)
def getProxy():
    """Choose one validated proxy at random.

    Returns:
        dict: a requests-style proxies mapping {"http": "ip:port"}, or an
        empty dict when no working proxy could be found.
    """
    validProxies = getValidProxies()
    if not validProxies:
        print("NO PROXY FOUND")
        return {}
    validProxy = random.choice(validProxies)
    print("Chosen Proxy: ", validProxy)
    # "https" is deliberately omitted: the pool was only verified over http
    return {"http": validProxy}
def getHeaders(siteEnum):
    """Build a browser-like header dict for a request to the given site.

    The Referer is chosen per site and the User-Agent is drawn at random
    from the module-level pool; unrecognized sites get empty host/referer.
    """
    # pick the site-specific host and referer
    if siteEnum == Site.SA:
        host, ref = 'www.seekingalpha.com', 'https://seekingalpha.com'
    elif siteEnum == Site.NASDAQ:
        host, ref = 'www.nasdaq.com', 'https://www.nasdaq.com'
    else:
        host, ref = '', ''
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-GB,en;q=0.9,en-US;q=0.8,ml;q=0.7",
        "Connection": "keep-alive",
        # "Host": host,
        "Referer": ref,
        "Upgrade-Insecure-Requests": "1",
        # randomize the user agent
        "User-Agent": random.choice(userAgents),
    }
    return headers
if __name__ == "__main__":
print(getProxy())