In [82]:
#@title Setup

%%capture
!pip install networkx pulp numpy pandas

!rm -rf ./data/
!mkdir -p ./data/
!wget -c -O ./data/lastfm_asia.zip "https://snap.stanford.edu/data/lastfm_asia.zip"
!unzip -q ./data/lastfm_asia.zip -d ./data/

In [83]:
#@title Problem 3: Linear Programming


from pulp import *
from IPython.display import HTML, display

def display_table(table):
    display(HTML(
       '<table><tr>{}</tr></table>'.format(
           '</tr><tr>'.join(
               '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in table)
           )
    ))

problem = LpProblem("MSML_602_Midterm_Q3", LpMaximize)

X = LpVariable("X", cat="Integer")
Y = LpVariable("Y", cat="Integer")

problem += (5 * X) + (3 * Y), "Objective"
problem += X + (2 * Y) <= 14, "Constraint 1"
problem += (3* X) - Y >= 0, "Constraint 2"
problem += X - Y <= 2, "Constraint 3"

problem.solve()
print("Solution:\n")

data = [["Variable", "Value"]] + [[v.name, v.varValue] for v in problem.variables()]
data += [["Max value for objective function: ", problem.objective.value()]]
display_table(data)

Solution:



0,1
Variable,Value
X,6.0
Y,4.0
Max value for objective function:,42.0


In [84]:
#@title Problem 5: Graph Metrics

import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt

df = pd.read_csv("/content/data/lasftm_asia/lastfm_asia_edges.csv")
G = nx.from_pandas_edgelist(df, source="node_1", target="node_2")
shortest_path = nx.shortest_path_length(G, 0)
del shortest_path[0]
num = len(shortest_path)
total_length = sum([shortest_path[k] for k in shortest_path])
avg_shortest_path = total_length / num
print(f"The average shortest path length from node 0 to all other nodes is: {avg_shortest_path}")

The average shortest path length from node 0 to all other nodes is: 5.651974288337925


## Problem 6: Extracting Webpage Data

In [85]:
#@title Scraping result

import requests
from bs4 import BeautifulSoup
import pandas as pd 
import numpy as np

page = requests.get("https://www.worldometers.info/coronavirus/#countries")
html = page.content

soup = BeautifulSoup(html, 'html.parser')
table = soup.find("table", {"id": "main_table_countries_today"})

cols = [
    '#', 'Country', 'TotalCases', 'NewCases', 'TotalDeaths', 'NewDeaths', 'TotalRecovered',
    'NewRecovered','ActiveCases','Serious,Critical','TotalCases/1M pop','Deaths/1M pop', 
    'TotalTests', 'Tests/1M pop', 'Population', 'Continent', '1 Case every X ppl', '1 Death every X ppl',
    '1 Test every X ppl', 'New Cases/1M pop', 'New Deaths/1M pop', 'Active Cases/1M pop'
]

tbody = table.find("tbody")
rows = tbody.find_all("tr")

data = []
for row in rows:
  cells = row.find_all("td")
  values = [c.text for c in cells]
  data.append(values)

def sanitize_country_number(row):
  val = row["#"]
  if not val.strip():
    return np.NaN
  else:
    return val

def fill_active_cases(row):
  val = row["ActiveCases"]
  if not np.isnan(val):
    return val
  active_per_1_mil = row["Active Cases/1M pop"]
  if np.isnan(active_per_1_mil):
    return np.nan
  population = row["Population"]
  return (active_per_1_mil/1000000) * population

def to_float(col):
  def mapper(row):
    if row[col] == "N/A":
      return np.NaN
    val = row[col]
    val = val.replace(",", "").strip()
    if not val:
      return np.NaN
    return float(val)
  return mapper 

df = pd.DataFrame(data, columns=cols)
df.replace(r"\n", "", regex=True, inplace=True)

df.head()


Unnamed: 0,#,Country,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",...,TotalTests,Tests/1M pop,Population,Continent,1 Case every X ppl,1 Death every X ppl,1 Test every X ppl,New Cases/1M pop,New Deaths/1M pop,Active Cases/1M pop
0,,North America,118308960,16354,1557219,76,113762872,16362.0,2988869,7881,...,,,,North America,,,,,,
1,,Asia,195343819,168618,1491630,230,188186652,78736.0,5665537,9159,...,,,,Asia,,,,,,
2,,Europe,235496414,41389,1948669,165,229427346,175758.0,4120399,7685,...,,,,Europe,,,,,,
3,,South America,64557158,10126,1333737,79,62884992,7699.0,338429,10119,...,,,,South America,,,,,,
4,,Oceania,12691699,3057,21779,9,12512305,,157615,97,...,,,,Australia/Oceania,,,,,,


In [86]:
#@title Data sanitization / generation

#@markdown Some of the countries (actually ships, in this case) did not have any population data, so I excluded those records from the dataset.

#@markdown Some countries didn't have data for exact active cases, but had data for **active cases per 1 million population**. 
#@markdown For these countries, I calculated their active cases by using the active cases per 1 million population data as follows: 

#@markdown ```Active Cases = (Active cases per 1 million population / 1,000,000) * Population```

df["country_number"] = df.apply(sanitize_country_number, axis=1)

data_by_country = df[df["country_number"].notna()].copy()
data_by_country["ActiveCases"] = data_by_country.apply(to_float("ActiveCases"), axis=1)
data_by_country["Active Cases/1M pop"] = data_by_country.apply(to_float("Active Cases/1M pop"), axis=1)
data_by_country["Population"] = data_by_country.apply(to_float("Population"), axis=1)
data_by_country["ActiveCases"] = data_by_country.apply(fill_active_cases, axis=1)
data_by_country[data_by_country["ActiveCases"] == "N/A"].head(20)
aggregated = data_by_country.groupby("Country").agg({'ActiveCases':'mean', 'Population':'sum'}, as_index=False)
aggregated.reset_index(inplace=True)
dropped_countries = aggregated[aggregated["Population"] == 0 ]
aggregated = aggregated[aggregated["Population"] != 0 ]
aggregated["PercentageInfected"] = aggregated.apply(lambda x: x["ActiveCases"]/x["Population"], axis=1)
aggregated.sort_values(["PercentageInfected"], ascending=False, inplace=True)

print("These were the countries(ships) that didn't have population data:\n")
print(dropped_countries)


These were the countries(ships) that didn't have population data:

              Country  ActiveCases  Population
56   Diamond Princess          0.0         0.0
120        MS Zaandam          0.0         0.0


In [87]:
#@title Average active cases & the proportion of the total population affected

from IPython.display import HTML, display

def display_table(table):
    display(HTML(
       '<table><tr>{}</tr></table>'.format(
           '</tr><tr>'.join(
               '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in table)
           )
    ))

avg_active_cases = aggregated["ActiveCases"].mean()

aggr = aggregated.agg({"ActiveCases": "sum",  "Population": "sum"}, as_index=False)
final_df = aggr.to_frame().T
final_df["PercentageInfected"] = final_df.apply(lambda x: (x["ActiveCases"]/x["Population"]) * 100, axis=1)
percentage_infected = final_df["PercentageInfected"].to_numpy()[0]

display(HTML(
    """
      <h3>Result:</h3>
      <br>
      <table border="1">
        <tr>
          <th align="left">Average active cases:</th>
          <td>{0:.2f}</td>
        </tr>
        <tr>
          <th align="left">Proportion of total <br>population currently infected:</th>
          <td>{1:.2f}%</td>
        </tr>
      </table>
      <br>
    """.format(avg_active_cases, percentage_infected))
)


print("""
I was unsure whether the problem wanted the percentage of the population
affected for each country, so I have included the percentage for each country 
as well, just in case:
""")
aggregated.head()

0,1
Average active cases:,60038.20
Proportion of total population currently infected:,0.17%



I was unsure whether the problem wanted the percentage of the population
affected for each country, so I have included the percentage for each country 
as well, just in case:



Unnamed: 0,Country,ActiveCases,Population,PercentageInfected
129,Martinique,222576.901869,374087.0,0.594987
68,Faeroe Islands,26936.998989,49233.0,0.547133
195,St. Barth,4854.999825,9945.0,0.488185
84,Guadeloupe,193026.939904,399794.0,0.482816
93,Iceland,130899.111498,345393.0,0.378986
