|
from __future__ import annotations |
|
|
|
import random |
|
from typing import TYPE_CHECKING |
|
|
|
from matplotlib import patches |
|
import matplotlib.lines as mlines |
|
import numpy as np |
|
|
|
from pandas.core.dtypes.missing import notna |
|
|
|
from pandas.io.formats.printing import pprint_thing |
|
from pandas.plotting._matplotlib.style import get_standard_colors |
|
from pandas.plotting._matplotlib.tools import ( |
|
create_subplots, |
|
do_adjust_figure, |
|
maybe_adjust_figure, |
|
set_ticks_props, |
|
) |
|
|
|
if TYPE_CHECKING: |
|
from collections.abc import Hashable |
|
|
|
from matplotlib.axes import Axes |
|
from matplotlib.figure import Figure |
|
|
|
from pandas import ( |
|
DataFrame, |
|
Index, |
|
Series, |
|
) |
|
|
|
|
|
def scatter_matrix( |
|
frame: DataFrame, |
|
alpha: float = 0.5, |
|
figsize: tuple[float, float] | None = None, |
|
ax=None, |
|
grid: bool = False, |
|
diagonal: str = "hist", |
|
marker: str = ".", |
|
density_kwds=None, |
|
hist_kwds=None, |
|
range_padding: float = 0.05, |
|
**kwds, |
|
): |
|
df = frame._get_numeric_data() |
|
n = df.columns.size |
|
naxes = n * n |
|
fig, axes = create_subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False) |
|
|
|
|
|
maybe_adjust_figure(fig, wspace=0, hspace=0) |
|
|
|
mask = notna(df) |
|
|
|
marker = _get_marker_compat(marker) |
|
|
|
hist_kwds = hist_kwds or {} |
|
density_kwds = density_kwds or {} |
|
|
|
|
|
kwds.setdefault("edgecolors", "none") |
|
|
|
boundaries_list = [] |
|
for a in df.columns: |
|
values = df[a].values[mask[a].values] |
|
rmin_, rmax_ = np.min(values), np.max(values) |
|
rdelta_ext = (rmax_ - rmin_) * range_padding / 2 |
|
boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext)) |
|
|
|
for i, a in enumerate(df.columns): |
|
for j, b in enumerate(df.columns): |
|
ax = axes[i, j] |
|
|
|
if i == j: |
|
values = df[a].values[mask[a].values] |
|
|
|
|
|
if diagonal == "hist": |
|
ax.hist(values, **hist_kwds) |
|
|
|
elif diagonal in ("kde", "density"): |
|
from scipy.stats import gaussian_kde |
|
|
|
y = values |
|
gkde = gaussian_kde(y) |
|
ind = np.linspace(y.min(), y.max(), 1000) |
|
ax.plot(ind, gkde.evaluate(ind), **density_kwds) |
|
|
|
ax.set_xlim(boundaries_list[i]) |
|
|
|
else: |
|
common = (mask[a] & mask[b]).values |
|
|
|
ax.scatter( |
|
df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds |
|
) |
|
|
|
ax.set_xlim(boundaries_list[j]) |
|
ax.set_ylim(boundaries_list[i]) |
|
|
|
ax.set_xlabel(b) |
|
ax.set_ylabel(a) |
|
|
|
if j != 0: |
|
ax.yaxis.set_visible(False) |
|
if i != n - 1: |
|
ax.xaxis.set_visible(False) |
|
|
|
if len(df.columns) > 1: |
|
lim1 = boundaries_list[0] |
|
locs = axes[0][1].yaxis.get_majorticklocs() |
|
locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])] |
|
adj = (locs - lim1[0]) / (lim1[1] - lim1[0]) |
|
|
|
lim0 = axes[0][0].get_ylim() |
|
adj = adj * (lim0[1] - lim0[0]) + lim0[0] |
|
axes[0][0].yaxis.set_ticks(adj) |
|
|
|
if np.all(locs == locs.astype(int)): |
|
|
|
locs = locs.astype(int) |
|
axes[0][0].yaxis.set_ticklabels(locs) |
|
|
|
set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) |
|
|
|
return axes |
|
|
|
|
|
def _get_marker_compat(marker): |
|
if marker not in mlines.lineMarkers: |
|
return "o" |
|
return marker |
|
|
|
|
|
def radviz( |
|
frame: DataFrame, |
|
class_column, |
|
ax: Axes | None = None, |
|
color=None, |
|
colormap=None, |
|
**kwds, |
|
) -> Axes: |
|
import matplotlib.pyplot as plt |
|
|
|
def normalize(series): |
|
a = min(series) |
|
b = max(series) |
|
return (series - a) / (b - a) |
|
|
|
n = len(frame) |
|
classes = frame[class_column].drop_duplicates() |
|
class_col = frame[class_column] |
|
df = frame.drop(class_column, axis=1).apply(normalize) |
|
|
|
if ax is None: |
|
ax = plt.gca() |
|
ax.set_xlim(-1, 1) |
|
ax.set_ylim(-1, 1) |
|
|
|
to_plot: dict[Hashable, list[list]] = {} |
|
colors = get_standard_colors( |
|
num_colors=len(classes), colormap=colormap, color_type="random", color=color |
|
) |
|
|
|
for kls in classes: |
|
to_plot[kls] = [[], []] |
|
|
|
m = len(frame.columns) - 1 |
|
s = np.array( |
|
[(np.cos(t), np.sin(t)) for t in [2 * np.pi * (i / m) for i in range(m)]] |
|
) |
|
|
|
for i in range(n): |
|
row = df.iloc[i].values |
|
row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1) |
|
y = (s * row_).sum(axis=0) / row.sum() |
|
kls = class_col.iat[i] |
|
to_plot[kls][0].append(y[0]) |
|
to_plot[kls][1].append(y[1]) |
|
|
|
for i, kls in enumerate(classes): |
|
ax.scatter( |
|
to_plot[kls][0], |
|
to_plot[kls][1], |
|
color=colors[i], |
|
label=pprint_thing(kls), |
|
**kwds, |
|
) |
|
ax.legend() |
|
|
|
ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor="none")) |
|
|
|
for xy, name in zip(s, df.columns): |
|
ax.add_patch(patches.Circle(xy, radius=0.025, facecolor="gray")) |
|
|
|
if xy[0] < 0.0 and xy[1] < 0.0: |
|
ax.text( |
|
xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small" |
|
) |
|
elif xy[0] < 0.0 <= xy[1]: |
|
ax.text( |
|
xy[0] - 0.025, |
|
xy[1] + 0.025, |
|
name, |
|
ha="right", |
|
va="bottom", |
|
size="small", |
|
) |
|
elif xy[1] < 0.0 <= xy[0]: |
|
ax.text( |
|
xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small" |
|
) |
|
elif xy[0] >= 0.0 and xy[1] >= 0.0: |
|
ax.text( |
|
xy[0] + 0.025, xy[1] + 0.025, name, ha="left", va="bottom", size="small" |
|
) |
|
|
|
ax.axis("equal") |
|
return ax |
|
|
|
|
|
def andrews_curves( |
|
frame: DataFrame, |
|
class_column, |
|
ax: Axes | None = None, |
|
samples: int = 200, |
|
color=None, |
|
colormap=None, |
|
**kwds, |
|
) -> Axes: |
|
import matplotlib.pyplot as plt |
|
|
|
def function(amplitudes): |
|
def f(t): |
|
x1 = amplitudes[0] |
|
result = x1 / np.sqrt(2.0) |
|
|
|
|
|
|
|
|
|
coeffs = np.delete(np.copy(amplitudes), 0) |
|
coeffs = np.resize(coeffs, (int((coeffs.size + 1) / 2), 2)) |
|
|
|
|
|
|
|
harmonics = np.arange(0, coeffs.shape[0]) + 1 |
|
trig_args = np.outer(harmonics, t) |
|
|
|
result += np.sum( |
|
coeffs[:, 0, np.newaxis] * np.sin(trig_args) |
|
+ coeffs[:, 1, np.newaxis] * np.cos(trig_args), |
|
axis=0, |
|
) |
|
return result |
|
|
|
return f |
|
|
|
n = len(frame) |
|
class_col = frame[class_column] |
|
classes = frame[class_column].drop_duplicates() |
|
df = frame.drop(class_column, axis=1) |
|
t = np.linspace(-np.pi, np.pi, samples) |
|
used_legends: set[str] = set() |
|
|
|
color_values = get_standard_colors( |
|
num_colors=len(classes), colormap=colormap, color_type="random", color=color |
|
) |
|
colors = dict(zip(classes, color_values)) |
|
if ax is None: |
|
ax = plt.gca() |
|
ax.set_xlim(-np.pi, np.pi) |
|
for i in range(n): |
|
row = df.iloc[i].values |
|
f = function(row) |
|
y = f(t) |
|
kls = class_col.iat[i] |
|
label = pprint_thing(kls) |
|
if label not in used_legends: |
|
used_legends.add(label) |
|
ax.plot(t, y, color=colors[kls], label=label, **kwds) |
|
else: |
|
ax.plot(t, y, color=colors[kls], **kwds) |
|
|
|
ax.legend(loc="upper right") |
|
ax.grid() |
|
return ax |
|
|
|
|
|
def bootstrap_plot( |
|
series: Series, |
|
fig: Figure | None = None, |
|
size: int = 50, |
|
samples: int = 500, |
|
**kwds, |
|
) -> Figure: |
|
import matplotlib.pyplot as plt |
|
|
|
|
|
|
|
data = list(series.values) |
|
samplings = [random.sample(data, size) for _ in range(samples)] |
|
|
|
means = np.array([np.mean(sampling) for sampling in samplings]) |
|
medians = np.array([np.median(sampling) for sampling in samplings]) |
|
midranges = np.array( |
|
[(min(sampling) + max(sampling)) * 0.5 for sampling in samplings] |
|
) |
|
if fig is None: |
|
fig = plt.figure() |
|
x = list(range(samples)) |
|
axes = [] |
|
ax1 = fig.add_subplot(2, 3, 1) |
|
ax1.set_xlabel("Sample") |
|
axes.append(ax1) |
|
ax1.plot(x, means, **kwds) |
|
ax2 = fig.add_subplot(2, 3, 2) |
|
ax2.set_xlabel("Sample") |
|
axes.append(ax2) |
|
ax2.plot(x, medians, **kwds) |
|
ax3 = fig.add_subplot(2, 3, 3) |
|
ax3.set_xlabel("Sample") |
|
axes.append(ax3) |
|
ax3.plot(x, midranges, **kwds) |
|
ax4 = fig.add_subplot(2, 3, 4) |
|
ax4.set_xlabel("Mean") |
|
axes.append(ax4) |
|
ax4.hist(means, **kwds) |
|
ax5 = fig.add_subplot(2, 3, 5) |
|
ax5.set_xlabel("Median") |
|
axes.append(ax5) |
|
ax5.hist(medians, **kwds) |
|
ax6 = fig.add_subplot(2, 3, 6) |
|
ax6.set_xlabel("Midrange") |
|
axes.append(ax6) |
|
ax6.hist(midranges, **kwds) |
|
for axis in axes: |
|
plt.setp(axis.get_xticklabels(), fontsize=8) |
|
plt.setp(axis.get_yticklabels(), fontsize=8) |
|
if do_adjust_figure(fig): |
|
plt.tight_layout() |
|
return fig |
|
|
|
|
|
def parallel_coordinates( |
|
frame: DataFrame, |
|
class_column, |
|
cols=None, |
|
ax: Axes | None = None, |
|
color=None, |
|
use_columns: bool = False, |
|
xticks=None, |
|
colormap=None, |
|
axvlines: bool = True, |
|
axvlines_kwds=None, |
|
sort_labels: bool = False, |
|
**kwds, |
|
) -> Axes: |
|
import matplotlib.pyplot as plt |
|
|
|
if axvlines_kwds is None: |
|
axvlines_kwds = {"linewidth": 1, "color": "black"} |
|
|
|
n = len(frame) |
|
classes = frame[class_column].drop_duplicates() |
|
class_col = frame[class_column] |
|
|
|
if cols is None: |
|
df = frame.drop(class_column, axis=1) |
|
else: |
|
df = frame[cols] |
|
|
|
used_legends: set[str] = set() |
|
|
|
ncols = len(df.columns) |
|
|
|
|
|
x: list[int] | Index |
|
if use_columns is True: |
|
if not np.all(np.isreal(list(df.columns))): |
|
raise ValueError("Columns must be numeric to be used as xticks") |
|
x = df.columns |
|
elif xticks is not None: |
|
if not np.all(np.isreal(xticks)): |
|
raise ValueError("xticks specified must be numeric") |
|
if len(xticks) != ncols: |
|
raise ValueError("Length of xticks must match number of columns") |
|
x = xticks |
|
else: |
|
x = list(range(ncols)) |
|
|
|
if ax is None: |
|
ax = plt.gca() |
|
|
|
color_values = get_standard_colors( |
|
num_colors=len(classes), colormap=colormap, color_type="random", color=color |
|
) |
|
|
|
if sort_labels: |
|
classes = sorted(classes) |
|
color_values = sorted(color_values) |
|
colors = dict(zip(classes, color_values)) |
|
|
|
for i in range(n): |
|
y = df.iloc[i].values |
|
kls = class_col.iat[i] |
|
label = pprint_thing(kls) |
|
if label not in used_legends: |
|
used_legends.add(label) |
|
ax.plot(x, y, color=colors[kls], label=label, **kwds) |
|
else: |
|
ax.plot(x, y, color=colors[kls], **kwds) |
|
|
|
if axvlines: |
|
for i in x: |
|
ax.axvline(i, **axvlines_kwds) |
|
|
|
ax.set_xticks(x) |
|
ax.set_xticklabels(df.columns) |
|
ax.set_xlim(x[0], x[-1]) |
|
ax.legend(loc="upper right") |
|
ax.grid() |
|
return ax |
|
|
|
|
|
def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Axes: |
|
|
|
import matplotlib.pyplot as plt |
|
|
|
kwds.setdefault("c", plt.rcParams["patch.facecolor"]) |
|
|
|
data = series.values |
|
y1 = data[:-lag] |
|
y2 = data[lag:] |
|
if ax is None: |
|
ax = plt.gca() |
|
ax.set_xlabel("y(t)") |
|
ax.set_ylabel(f"y(t + {lag})") |
|
ax.scatter(y1, y2, **kwds) |
|
return ax |
|
|
|
|
|
def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwds) -> Axes: |
|
import matplotlib.pyplot as plt |
|
|
|
n = len(series) |
|
data = np.asarray(series) |
|
if ax is None: |
|
ax = plt.gca() |
|
ax.set_xlim(1, n) |
|
ax.set_ylim(-1.0, 1.0) |
|
mean = np.mean(data) |
|
c0 = np.sum((data - mean) ** 2) / n |
|
|
|
def r(h): |
|
return ((data[: n - h] - mean) * (data[h:] - mean)).sum() / n / c0 |
|
|
|
x = np.arange(n) + 1 |
|
y = [r(loc) for loc in x] |
|
z95 = 1.959963984540054 |
|
z99 = 2.5758293035489004 |
|
ax.axhline(y=z99 / np.sqrt(n), linestyle="--", color="grey") |
|
ax.axhline(y=z95 / np.sqrt(n), color="grey") |
|
ax.axhline(y=0.0, color="black") |
|
ax.axhline(y=-z95 / np.sqrt(n), color="grey") |
|
ax.axhline(y=-z99 / np.sqrt(n), linestyle="--", color="grey") |
|
ax.set_xlabel("Lag") |
|
ax.set_ylabel("Autocorrelation") |
|
ax.plot(x, y, **kwds) |
|
if "label" in kwds: |
|
ax.legend() |
|
ax.grid() |
|
return ax |
|
|
|
|
|
def unpack_single_str_list(keys): |
|
|
|
if isinstance(keys, list) and len(keys) == 1: |
|
keys = keys[0] |
|
return keys |
|
|