import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from sklearn.preprocessing import LabelEncoder
def correlation_ellipse(ax, x, y, data, **kwargs):
    """Draw the covariance ellipse of two columns on ``ax``.

    The ellipse is centred on the column means. Its axes follow the
    eigenvectors of the 2x2 covariance matrix of ``data[x]`` and ``data[y]``,
    and each full axis length is twice the square root of the matching
    eigenvalue.

    Args:
        ax: Axes object that receives the ellipse through ``add_patch``.
        x: Column of ``data`` used as the horizontal variable.
        y: Column of ``data`` used as the vertical variable.
        data: Table indexable by column name whose ``x`` and ``y`` columns
            are numeric.
        **kwargs: Accepted for call compatibility and ignored.
    """
    cov = np.cov(data[x], data[y])

    # A covariance matrix is symmetric, so use the symmetric solver: it
    # returns real eigenvalues and orthonormal eigenvectors.
    eigvals, eigvecs = np.linalg.eigh(cov)

    # Round-off can leave a zero eigenvalue slightly negative (perfectly
    # correlated columns); clip so the square roots below stay finite.
    eigvals = np.clip(eigvals, 0.0, None)

    # Orientation of the first eigenvector, measured from the x axis. The
    # ellipse "width" runs along this direction and "height" across it.
    vec_x, vec_y = eigvecs[:, 0]
    angle = np.degrees(np.arctan2(vec_y, vec_x))

    ellipse = Ellipse(
        xy=(data[x].mean(), data[y].mean()),
        width=2 * np.sqrt(eigvals[0]),
        height=2 * np.sqrt(eigvals[1]),
        angle=angle,
        edgecolor='black',
        facecolor='none',
        linewidth=1.5,
    )
    ax.add_patch(ellipse)
def custom_pair_plot(df):
    """Draw a type-aware scatterplot matrix for ``df`` and show it.

    The plot in each cell depends on the dtypes of the row and column
    variables:

    - diagonal: histogram with a KDE overlay;
    - continuous vs. continuous: scatter plot plus a covariance ellipse;
    - continuous vs. discrete: violin plot;
    - anything else: annotated heatmap of the cross-tabulated counts.

    ``float64``/``int64`` columns are treated as continuous and
    ``object``/``category`` columns as discrete. Ticks are removed from every
    cell.

    Args:
        df: DataFrame whose columns are plotted against each other.
    """
    continuous_vars = df.select_dtypes(include=['float64', 'int64']).columns
    discrete_vars = df.select_dtypes(include=['object', 'category']).columns

    # Integer-encode the categorical columns; the ellipse helper reads from
    # this encoded copy.
    df_encoded = df.copy()
    for col in discrete_vars:
        df_encoded[col] = LabelEncoder().fit_transform(df[col])

    num_vars = df_encoded.shape[1]
    # squeeze=False keeps `axes` two-dimensional even for a single column,
    # so the axes[i, j] lookup below never fails.
    fig, axes = plt.subplots(
        num_vars, num_vars,
        figsize=(2 * num_vars, 2 * num_vars),
        squeeze=False,
    )

    for i, var1 in enumerate(df_encoded.columns):
        for j, var2 in enumerate(df_encoded.columns):
            ax = axes[i, j]
            if i == j:
                sns.histplot(df[var1], ax=ax, kde=True, bins=20, color="lightblue")
            elif var1 in continuous_vars and var2 in continuous_vars:
                sns.scatterplot(x=df[var2], y=df[var1], ax=ax, alpha=0.6)
                correlation_ellipse(ax, var2, var1, df_encoded)
            elif var1 in discrete_vars and var2 in continuous_vars:
                sns.violinplot(x=df[var1], y=df[var2], ax=ax)
            elif var1 in continuous_vars and var2 in discrete_vars:
                sns.violinplot(x=df[var2], y=df[var1], ax=ax)
            else:
                sns.heatmap(
                    pd.crosstab(df[var2], df[var1]),
                    annot=True, fmt="d", cmap="Blues", ax=ax, cbar=False,
                )
            ax.set_xticks([])
            ax.set_yticks([])

    plt.tight_layout()
    plt.show()
# Example usage: two continuous and two categorical columns.
df = pd.DataFrame({
    'A': np.random.randn(100),
    'B': np.random.randn(100) * 2,
    'C': np.random.choice(['Low', 'Medium', 'High'], 100),
    'D': np.random.choice(['Yes', 'No'], 100),
})
custom_pair_plot(df)
In my notes I had some ideas about making better SPLOMs. A SPLOM is a scatterplot matrix.
The first batch of ideas involves making the scatterplot matrix more informative by picking a more suitable plot type based on the data types of the variables being plotted.
- For continuous vs. continuous variable pairs, plot a scatter plot.
- For continuous vs. discrete variable pairs, plot a violin plot.
- For discrete vs. discrete variable pairs, plot a heatmap.
- On the diagonal, plot histograms with a density plot overlay.
- In the lower triangle of the plot, plot correlation ellipses. The correlation ellipses can be overlaid with a number for the correlation coefficient.
The second batch of ideas involves making the plot more useful for understanding effects in the data and potential regression issues like outliers and high-leverage points, heteroscedasticity, and non-linear relationships. One more issue that I came across is how to handle zero inflation arising from one or the other variable….
Another thought about SPLOMs is to make the cells of the plot three-dimensional so we could see the two predictors vs. the response variable when we move off the first column.
- Highlight effects in these plots using a regression line or a loess line for the main effects.
- For the second-order effects, we might show the fit of a 3D quadratic spline or loess for the two predictors as a surface in a 3D plot. If the plot were interactive we might also show:
- The residuals.
- Error surfaces above and below the fit surface, a sort of confidence interval for the fit.
- The outliers and high-leverage points for the interaction terms, perhaps as bubbles for the residuals and as points for the outliers and high-leverage points.
- The zero-inflation points, perhaps as a different color for the residuals.
- Highlight heteroscedasticity by a residual from a normal distribution….
- make the
- make it interactive.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
def correlation_ellipse(ax, x, y, data, max_size=1.2):
    """Draw a size-normalised covariance ellipse centred in its own cell.

    The ellipse is built from the eigen-decomposition of the covariance
    matrix of ``data[x]`` and ``data[y]``. Axis lengths are rescaled so the
    longest one equals ``max_size``, and the ellipse is placed at the origin
    of a fixed [-1, 1] x [-1, 1] viewport with ticks and frame removed.

    Args:
        ax: Axes object that receives the ellipse.
        x: Column of ``data`` used as the horizontal variable.
        y: Column of ``data`` used as the vertical variable.
        data: Table indexable by column name whose ``x`` and ``y`` columns
            are numeric.
        max_size: Length given to the longest ellipse axis.
    """
    cov = np.cov(data[x], data[y])

    # A covariance matrix is symmetric, so use the symmetric solver; clip
    # away round-off negatives so no axis length comes out below zero.
    eigvals, eigvecs = np.linalg.eigh(cov)
    eigvals = np.clip(eigvals, 0.0, None)
    largest = np.max(eigvals)

    # With no positive variance (e.g. constant columns) the normalisation
    # below would divide by zero and hand NaN sizes to Ellipse; leave the
    # cell empty instead.
    if np.isfinite(largest) and largest > 0:
        # NOTE(review): lengths scale with the eigenvalues (variances), not
        # their square roots -- confirm this exaggeration is intended.
        width, height = max_size * (eigvals / largest)
        vec_x, vec_y = eigvecs[:, 0]
        angle = np.degrees(np.arctan2(vec_y, vec_x))
        ellipse = Ellipse(
            xy=(0, 0),  # centred at the origin because sizes are normalised
            width=width,
            height=height,
            angle=angle,
            edgecolor='black',
            facecolor='gray',
            alpha=0.3,
        )
        ax.add_patch(ellipse)

    # Same square viewport for every cell so the ellipses are comparable.
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_frame_on(False)  # remove the frame for clarity
def custom_pair_plot(df):
    """Draw a scatterplot matrix with ellipses below the diagonal.

    Layout:

    - diagonal: histogram with a KDE overlay;
    - upper triangle: scatter plot (continuous pairs), violin plot (mixed
      pairs) or annotated count heatmap (everything else);
    - lower triangle: normalised correlation ellipses;
    - axis labels are set on the bottom row and the left-most column.

    ``float64``/``int64`` columns are treated as continuous and
    ``object``/``category`` columns as discrete.

    Args:
        df: DataFrame whose columns are plotted against each other.
    """
    continuous_vars = df.select_dtypes(include=['float64', 'int64']).columns
    discrete_vars = df.select_dtypes(include=['object', 'category']).columns

    df_encoded = df.copy()

    # Normalise all continuous variables to [0, 1] for consistent ellipses.
    # MinMaxScaler rejects an empty column selection, so skip the step when
    # the frame holds no continuous columns.
    if len(continuous_vars) > 0:
        df_encoded[continuous_vars] = MinMaxScaler().fit_transform(df[continuous_vars])

    # Integer-encode the categorical variables.
    for col in discrete_vars:
        df_encoded[col] = LabelEncoder().fit_transform(df[col])

    num_vars = len(df_encoded.columns)
    # squeeze=False keeps `axes` two-dimensional even for a single column,
    # so the axes[i, j] lookup below never fails.
    fig, axes = plt.subplots(
        num_vars, num_vars,
        figsize=(2 * num_vars, 2 * num_vars),
        squeeze=False,
    )

    for i, var1 in enumerate(df_encoded.columns):
        for j, var2 in enumerate(df_encoded.columns):
            ax = axes[i, j]

            if i == j:
                # Diagonal: histograms
                sns.histplot(df[var1], ax=ax, kde=True, bins=20, color="lightblue")
            elif i < j:
                # Above diagonal: scatter, violin or heatmap by dtype
                if var1 in continuous_vars and var2 in continuous_vars:
                    sns.scatterplot(x=df[var2], y=df[var1], ax=ax, alpha=0.6)
                elif var1 in discrete_vars and var2 in continuous_vars:
                    sns.violinplot(x=df[var1], y=df[var2], ax=ax)
                elif var1 in continuous_vars and var2 in discrete_vars:
                    sns.violinplot(x=df[var2], y=df[var1], ax=ax)
                else:
                    sns.heatmap(
                        pd.crosstab(df[var2], df[var1]),
                        annot=True, fmt="d", cmap="Blues", ax=ax, cbar=False,
                    )
            else:
                # Below diagonal: correlation ellipses on the scaled data
                correlation_ellipse(ax, var2, var1, df_encoded)

            # Axis labels for the bottom row and the left-most column
            if i == num_vars - 1:
                ax.set_xlabel(var2, fontsize=12)
            if j == 0:
                ax.set_ylabel(var1, fontsize=12, rotation=90)

            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_frame_on(False)  # remove the frame for clarity

    plt.tight_layout()
    plt.show()
# Test dataset with very different scales; C and D are built from A and B
# so the columns are correlated.
a = np.random.uniform(size=100)
b = np.random.uniform(size=100)
df = pd.DataFrame({
    'A': (a) * 100.0,
    'B': (b) * 100.0,
    'C': (a - b) - 100.0,
    'D': (a + b) * 50.0 - 50.0,
})
custom_pair_plot(df)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def smart_sns_plot(x, y, data=None, ax=None, **kwargs):
    """Draw the seaborn plot that suits the dtypes of ``x`` and ``y``.

    - numeric vs. numeric: ``sns.scatterplot``
    - numeric vs. non-numeric: ``sns.violinplot``, with the non-numeric
      variable always placed on the x axis
    - non-numeric vs. non-numeric: ``sns.heatmap`` of the cross-tabulated
      counts

    Args:
        x: Column name in ``data``, or the values themselves when ``data``
            is ``None``.
        y: Column name in ``data``, or the values themselves when ``data``
            is ``None``.
        data: DataFrame holding the columns, or ``None`` when ``x`` and
            ``y`` are passed as vectors (the ``func(x, y, **kws)`` calling
            convention used by seaborn's ``PairGrid``).
        ax: Axes to draw on; defaults to the current axes.
        **kwargs: Forwarded to ``sns.scatterplot`` / ``sns.violinplot``.
            The heatmap branch does not receive them.

    Returns:
        The axes that was drawn on.
    """
    if ax is None:
        ax = plt.gca()

    # Resolve both variables to Series so dtype checks and plotting work the
    # same way for the column-name and the vector calling styles.
    if data is None:
        x_vals, y_vals = pd.Series(x), pd.Series(y)
    else:
        x_vals, y_vals = data[x], data[y]

    # pandas' dtype helpers accept extension dtypes such as `category`,
    # which np.issubdtype cannot interpret. Booleans are excluded so they
    # stay on the discrete side.
    dtype_api = pd.api.types
    is_x_cont = dtype_api.is_numeric_dtype(x_vals) and not dtype_api.is_bool_dtype(x_vals)
    is_y_cont = dtype_api.is_numeric_dtype(y_vals) and not dtype_api.is_bool_dtype(y_vals)

    if is_x_cont and is_y_cont:
        # Continuous vs. continuous -> scatter plot
        sns.scatterplot(x=x_vals, y=y_vals, ax=ax, **kwargs)
    elif is_x_cont:
        # Continuous vs. discrete -> violin plot with the axes swapped so
        # the discrete variable is on x
        sns.violinplot(x=y_vals, y=x_vals, ax=ax, **kwargs)
    elif is_y_cont:
        # Discrete vs. continuous -> violin plot
        sns.violinplot(x=x_vals, y=y_vals, ax=ax, **kwargs)
    else:
        # Discrete vs. discrete -> heatmap of counts
        cross_tab = pd.crosstab(x_vals, y_vals)
        sns.heatmap(cross_tab, annot=True, fmt="d", cmap="Blues", ax=ax, cbar=False)

    return ax
# Example usage: one panel per dtype combination.
df = pd.DataFrame({
    'A': np.random.randn(100),
    'B': np.random.randn(100) * 2,
    'C': np.random.choice(['Low', 'Medium', 'High'], 100),
    'D': np.random.choice(['Yes', 'No'], 100),
})

fig, axes = plt.subplots(2, 2, figsize=(10, 10))

# Scatterplot (continuous vs. continuous)
smart_sns_plot("A", "B", df, ax=axes[0, 0])

# Violinplot (continuous vs. discrete)
smart_sns_plot("A", "C", df, ax=axes[0, 1])

# Violinplot (flipped: discrete vs. continuous)
smart_sns_plot("C", "B", df, ax=axes[1, 0])

# Heatmap (discrete vs. discrete)
smart_sns_plot("C", "D", df, ax=axes[1, 1])

plt.tight_layout()
plt.show()
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from sklearn.preprocessing import MinMaxScaler
def sns_correlation_ellipse(x, y, data=None, ax=None, max_size=1.2, **kwargs):
    """Draw a normalised covariance ellipse for ``x`` and ``y``.

    Both variables are min-max scaled, the covariance matrix of the scaled
    values is eigen-decomposed, and the ellipse axes are rescaled so the
    longest one equals ``max_size``. The ellipse is drawn at the origin of a
    fixed [-1, 1] x [-1, 1] viewport with ticks and frame removed.

    Args:
        x: Column name in ``data``, or the values themselves when ``data``
            is ``None``.
        y: Column name in ``data``, or the values themselves when ``data``
            is ``None``.
        data: DataFrame holding the columns, or ``None`` when ``x`` and
            ``y`` are passed as vectors (the ``func(x, y, **kws)`` calling
            convention of seaborn grid objects).
        ax: Axes to draw on; defaults to the current axes.
        max_size: Length given to the longest ellipse axis.
        **kwargs: Accepted so grid callers can pass styling such as
            ``color``; ignored.

    Returns:
        The axes that was drawn on.
    """
    if ax is None:
        ax = plt.gca()

    # Collect the two variables as an (n, 2) array for either calling style.
    if data is None:
        points = np.column_stack([x, y])
    else:
        points = data[[x, y]].to_numpy()

    # Min-max scale so ellipses are comparable across variables on
    # different scales.
    scaled = MinMaxScaler().fit_transform(points)
    cov = np.cov(scaled[:, 0], scaled[:, 1])

    # A covariance matrix is symmetric, so use the symmetric solver; clip
    # away round-off negatives so no axis length comes out below zero.
    eigvals, eigvecs = np.linalg.eigh(cov)
    eigvals = np.clip(eigvals, 0.0, None)
    largest = np.max(eigvals)

    # With no positive variance (e.g. constant inputs) the normalisation
    # would divide by zero and hand NaN sizes to Ellipse; leave the cell
    # empty instead.
    if np.isfinite(largest) and largest > 0:
        width, height = max_size * (eigvals / largest)
        vec_x, vec_y = eigvecs[:, 0]
        angle = np.degrees(np.arctan2(vec_y, vec_x))
        ellipse = Ellipse(
            xy=(0, 0), width=width, height=height,
            angle=angle, edgecolor="black", facecolor="gray", alpha=0.3,
        )
        ax.add_patch(ellipse)

    # Same square viewport for every cell so the ellipses are comparable.
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_frame_on(False)

    return ax
# Example data: C and D are built from A and B so the columns are correlated.
a = np.random.uniform(size=100)
b = np.random.uniform(size=100)
df = pd.DataFrame({
    'A': (a) * 100.0,
    'B': (b) * 100.0,
    'C': (a - b) - 100.0,
    'D': (a + b) * 50.0 - 50.0,
})

# Test with an individual plot
fig, ax = plt.subplots(figsize=(4, 4))
sns_correlation_ellipse("A", "B", df, ax=ax)
plt.show()

# Overlay ellipses on the lower triangle of a corner pairplot
g = sns.pairplot(df, kind="scatter", corner=True)
for i, row_var in enumerate(df.columns):
    for j, col_var in enumerate(df.columns):
        if i > j:  # only the lower triangle
            # The axes at [i, j] has the column variable on x and the row
            # variable on y, so pass them in that order.
            sns_correlation_ellipse(col_var, row_var, df, ax=g.axes[i, j])
plt.show()
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from sklearn.preprocessing import MinMaxScaler
def sns_correlation_ellipse(x, y, data=None, ax=None, max_size=1.2, **kwargs):
    """Draw a normalised covariance ellipse for ``x`` and ``y``.

    Both variables are min-max scaled, the covariance matrix of the scaled
    values is eigen-decomposed, and the ellipse axes are rescaled so the
    longest one equals ``max_size``. The ellipse is drawn at the origin of a
    fixed [-1, 1] x [-1, 1] viewport with ticks and frame removed.

    Args:
        x: Column name in ``data``, or the values themselves when ``data``
            is ``None``.
        y: Column name in ``data``, or the values themselves when ``data``
            is ``None``.
        data: DataFrame holding the columns, or ``None`` when ``x`` and
            ``y`` are passed as vectors. ``PairGrid.map_lower`` calls the
            function that way (``func(x, y, **kws)``), so ``data`` must be
            optional for grid use.
        ax: Axes to draw on; defaults to the current axes.
        max_size: Length given to the longest ellipse axis.
        **kwargs: Accepted so grid callers can pass styling such as
            ``color``; ignored.

    Returns:
        The axes that was drawn on.
    """
    if ax is None:
        ax = plt.gca()

    # Collect the two variables as an (n, 2) array for either calling style.
    if data is None:
        points = np.column_stack([x, y])
    else:
        points = data[[x, y]].to_numpy()

    # Min-max scale so ellipses are comparable across variables on
    # different scales.
    scaled = MinMaxScaler().fit_transform(points)
    cov = np.cov(scaled[:, 0], scaled[:, 1])

    # A covariance matrix is symmetric, so use the symmetric solver; clip
    # away round-off negatives so no axis length comes out below zero.
    eigvals, eigvecs = np.linalg.eigh(cov)
    eigvals = np.clip(eigvals, 0.0, None)
    largest = np.max(eigvals)

    # With no positive variance (e.g. constant inputs) the normalisation
    # would divide by zero and hand NaN sizes to Ellipse; leave the cell
    # empty instead.
    if np.isfinite(largest) and largest > 0:
        width, height = max_size * (eigvals / largest)
        vec_x, vec_y = eigvecs[:, 0]
        angle = np.degrees(np.arctan2(vec_y, vec_x))
        ellipse = Ellipse(
            xy=(0, 0), width=width, height=height,
            angle=angle, edgecolor="black", facecolor="gray", alpha=0.3,
        )
        ax.add_patch(ellipse)

    # Same square viewport for every cell so the ellipses are comparable.
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_frame_on(False)

    return ax
def smart_sns_plot(x, y, data=None, ax=None, **kwargs):
    """Draw the seaborn plot that suits the dtypes of ``x`` and ``y``.

    - numeric vs. numeric: ``sns.scatterplot``
    - numeric vs. non-numeric: ``sns.violinplot``, with the non-numeric
      variable always placed on the x axis
    - non-numeric vs. non-numeric: ``sns.heatmap`` of the cross-tabulated
      counts

    Args:
        x: Column name in ``data``, or the values themselves when ``data``
            is ``None``.
        y: Column name in ``data``, or the values themselves when ``data``
            is ``None``.
        data: DataFrame holding the columns, or ``None`` when ``x`` and
            ``y`` are passed as vectors. ``PairGrid.map_upper`` calls the
            function as ``func(x, y, **kws)``; a required ``data``
            parameter makes that call raise ``TypeError``.
        ax: Axes to draw on; defaults to the current axes.
        **kwargs: Forwarded to ``sns.scatterplot`` / ``sns.violinplot``.
            The heatmap branch does not receive them.

    Returns:
        The axes that was drawn on.
    """
    if ax is None:
        ax = plt.gca()

    # Resolve both variables to Series so dtype checks and plotting work the
    # same way for the column-name and the vector calling styles.
    if data is None:
        x_vals, y_vals = pd.Series(x), pd.Series(y)
    else:
        x_vals, y_vals = data[x], data[y]

    # pandas' dtype helpers accept extension dtypes such as `category`,
    # which np.issubdtype cannot interpret. Booleans are excluded so they
    # stay on the discrete side.
    dtype_api = pd.api.types
    is_x_cont = dtype_api.is_numeric_dtype(x_vals) and not dtype_api.is_bool_dtype(x_vals)
    is_y_cont = dtype_api.is_numeric_dtype(y_vals) and not dtype_api.is_bool_dtype(y_vals)

    if is_x_cont and is_y_cont:
        # Continuous vs. continuous -> scatter plot
        sns.scatterplot(x=x_vals, y=y_vals, ax=ax, **kwargs)
    elif is_x_cont:
        # Continuous vs. discrete -> violin plot with the axes swapped so
        # the discrete variable is on x
        sns.violinplot(x=y_vals, y=x_vals, ax=ax, **kwargs)
    elif is_y_cont:
        # Discrete vs. continuous -> violin plot
        sns.violinplot(x=x_vals, y=y_vals, ax=ax, **kwargs)
    else:
        # Discrete vs. discrete -> heatmap of counts
        cross_tab = pd.crosstab(x_vals, y_vals)
        sns.heatmap(cross_tab, annot=True, fmt="d", cmap="Blues", ax=ax, cbar=False)

    return ax
# Example data: four correlated numeric columns plus two categorical ones.
a = np.random.uniform(size=100)
b = np.random.uniform(size=100)
df = pd.DataFrame({
    'A': (a) * 100.0,
    'B': (b) * 100.0,
    'C': (a - b) - 100.0,
    'D': (a + b) * 50.0 - 50.0,
    'E': np.random.choice(['Low', 'Medium', 'High'], 100),
    'F': np.random.choice(['Yes', 'No'], 100),
})

# Create PairGrid
g = sns.PairGrid(df, diag_sharey=False)

# PairGrid invokes the mapped functions as func(x, y, **kws), passing the
# value vectors and no `data` argument.

# Upper triangle: use smart_sns_plot
g.map_upper(smart_sns_plot)

# Lower triangle: use sns_correlation_ellipse
g.map_lower(sns_correlation_ellipse)

# Diagonal: histograms
g.map_diag(sns.histplot, kde=True, bins=20, color="lightblue")

plt.show()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[6], line 103 100 g = sns.PairGrid(df, diag_sharey=False) 102 # Upper Triangle: Use smart_sns_plot --> 103 g.map_upper(smart_sns_plot) 105 # Lower Triangle: Use sns_correlation_ellipse 106 g.map_lower(sns_correlation_ellipse) File ~/work/notes/notes-islr-py/.venv/lib/python3.10/site-packages/seaborn/axisgrid.py:1410, in PairGrid.map_upper(self, func, **kwargs) 1399 """Plot with a bivariate function on the upper diagonal subplots. 1400 1401 Parameters (...) 1407 1408 """ 1409 indices = zip(*np.triu_indices_from(self.axes, 1)) -> 1410 self._map_bivariate(func, indices, **kwargs) 1411 return self File ~/work/notes/notes-islr-py/.venv/lib/python3.10/site-packages/seaborn/axisgrid.py:1574, in PairGrid._map_bivariate(self, func, indices, **kwargs) 1572 if ax is None: # i.e. we are in corner mode 1573 continue -> 1574 self._plot_bivariate(x_var, y_var, ax, func, **kws) 1575 self._add_axis_labels() 1577 if "hue" in signature(func).parameters: File ~/work/notes/notes-islr-py/.venv/lib/python3.10/site-packages/seaborn/axisgrid.py:1583, in PairGrid._plot_bivariate(self, x_var, y_var, ax, func, **kwargs) 1581 """Draw a bivariate plot on the specified axes.""" 1582 if "hue" not in signature(func).parameters: -> 1583 self._plot_bivariate_iter_hue(x_var, y_var, ax, func, **kwargs) 1584 return 1586 kwargs = kwargs.copy() File ~/work/notes/notes-islr-py/.venv/lib/python3.10/site-packages/seaborn/axisgrid.py:1659, in PairGrid._plot_bivariate_iter_hue(self, x_var, y_var, ax, func, **kwargs) 1657 func(x=x, y=y, **kws) 1658 else: -> 1659 func(x, y, **kws) 1661 self._update_legend_data(ax) TypeError: smart_sns_plot() missing 1 required positional argument: 'data'
Reuse
Citation
@online{bochman2024,
author = {Bochman, Oren},
title = {Better {SPLOM}},
date = {2024-06-01},
url = {https://orenbochman.github.io/notes-islr/posts/better-splom/},
langid = {en}
}