# Import necessary modules
from sklearn import linear_model

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the legacy location only when the modern one is unavailable.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# Load data
# NOTE(review): `pd` is not imported in the visible part of this file --
# presumably `import pandas as pd` runs earlier; confirm.
# The file is semicolon-separated, hence sep=';'.
df = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    sep=';',
)
# Feature matrix: every column except the target. `axis` is passed by keyword
# because newer pandas releases no longer accept it positionally.
X = df.drop('quality', axis=1).values  # drop target variable
y1 = df['quality'].values  # original 'quality' column as an array
y = y1 <= 5  # aggregated boolean target: is the rating <= 5?
# Plot histograms of the original target variable (left panel)
# and of the aggregated boolean target variable (right panel).
# NOTE(review): `plt` is not imported in the visible part of this file --
# presumably `import matplotlib.pyplot as plt` runs earlier; confirm.
plt.figure(figsize=(20, 5))

# Left panel: distribution of the raw quality values.
plt.subplot(1, 2, 1)
plt.hist(y1)
plt.xlabel('original target value')
plt.ylabel('count')

# Right panel: distribution of the thresholded (rating <= 5) target.
plt.subplot(1, 2, 2)
plt.hist(y)
plt.xlabel('aggregated target value')

plt.show()