Numeric Instability During Linear Regression With Large x-values
JackFielding opened this issue · comments
I was using the altair package (version 5.2.0), and the linear regression transform seems numerically unstable for large x values. The regression line fits the data poorly.
I was using a datetime x-axis. The number of seconds since the 1970 epoch is currently about 1,709,023,537 (and the value in milliseconds is 1000x higher), so the x-values can be very large.
Related Issues
- #2378 Similar instability noted for exponential regression, fixed with mean-centering.
- #2198 Another older issue.
Minimal Example
In altair I used the following code (based on the example in https://altair-viz.github.io/user_guide/transform/regression.html):
import altair as alt
import pandas as pd
import numpy as np
# Seed the global NumPy RNG so the noise below is the same on every run.
np.random.seed(42)

# Evenly spaced x values on [0, 10], plus a copy shifted by one billion.
x = np.linspace(0, 10)
x_big = x + 1_000_000_000

# y follows the line x - 5 with standard-normal noise added.
noise = np.random.randn(len(x))
y = x - 5 + noise

df = pd.DataFrame(
    {
        'x': x,
        'y': y,
        "x_big": x_big,
    }
)
def scatter_with_regression(x):
    """Return a scatter plot of ``y`` against column *x* layered with a regression line.

    *x* is the name of the column of the module-level ``df`` to use for the
    x encoding; the x scale is configured with ``zero=False``.
    """
    x_encoding = alt.X(x, scale=alt.Scale(zero=False))
    points = alt.Chart(df).mark_point().encode(x=x_encoding, y='y')
    # Regress y on the chosen x column and draw the result as a line mark.
    fitted_line = points.transform_regression(x, 'y').mark_line()
    return points + fitted_line
# One panel per x column, concatenated horizontally under a shared title.
small_x_panel = scatter_with_regression(x="x")
big_x_panel = scatter_with_regression(x="x_big")
(small_x_panel | big_x_panel).properties(title="Linear Regression Transformation Could Benefit from Mean Centering")
This code generated the following chart:
The chart has a vega-lite JSON representation of:
{
"config": {"view": {"continuousWidth": 300, "continuousHeight": 300}},
"hconcat": [
{
"layer": [
{
"mark": {"type": "point"},
"encoding": {
"x": {
"field": "x",
"scale": {"zero": false},
"type": "quantitative"
},
"y": {"field": "y", "type": "quantitative"}
}
},
{
"mark": {"type": "line"},
"encoding": {
"x": {
"field": "x",
"scale": {"zero": false},
"type": "quantitative"
},
"y": {"field": "y", "type": "quantitative"}
},
"transform": [{"on": "x", "regression": "y"}]
}
]
},
{
"layer": [
{
"mark": {"type": "point"},
"encoding": {
"x": {
"field": "x_big",
"scale": {"zero": false},
"type": "quantitative"
},
"y": {"field": "y", "type": "quantitative"}
}
},
{
"mark": {"type": "line"},
"encoding": {
"x": {
"field": "x_big",
"scale": {"zero": false},
"type": "quantitative"
},
"y": {"field": "y", "type": "quantitative"}
},
"transform": [{"on": "x_big", "regression": "y"}]
}
]
}
],
"data": {"name": "data-44ec9525e1d4ab566db24882f3a645ef"},
"title": "Linear Regression Transformation Could Benefit from Mean Centering",
"$schema": "https://vega.github.io/schema/vega-lite/v5.16.3.json",
"datasets": {
"data-44ec9525e1d4ab566db24882f3a645ef": [
{"x": 0, "y": -4.503285846988767, "x_big": 1000000000},
{
"x": 0.20408163265306123,
"y": -4.934182668518123,
"x_big": 1000000000.2040817
},
{
"x": 0.40816326530612246,
"y": -3.9441481965931855,
"x_big": 1000000000.4081633
},
{
"x": 0.6122448979591837,
"y": -2.8647252456327905,
"x_big": 1000000000.6122448
},
{
"x": 0.8163265306122449,
"y": -4.417826844111091,
"x_big": 1000000000.8163265
},
{
"x": 1.0204081632653061,
"y": -4.213728793683875,
"x_big": 1000000001.0204082
},
{
"x": 1.2244897959183674,
"y": -2.196297388574241,
"x_big": 1000000001.2244898
},
{
"x": 1.4285714285714286,
"y": -2.8039938422756623,
"x_big": 1000000001.4285715
},
{
"x": 1.6326530612244898,
"y": -3.836821324710462,
"x_big": 1000000001.6326531
},
{
"x": 1.836734693877551,
"y": -2.6207052625364846,
"x_big": 1000000001.8367347
},
{
"x": 2.0408163265306123,
"y": -3.42260136628185,
"x_big": 1000000002.0408163
},
{
"x": 2.2448979591836737,
"y": -3.220831794386583,
"x_big": 1000000002.244898
},
{
"x": 2.4489795918367347,
"y": -2.309058136597231,
"x_big": 1000000002.4489796
},
{
"x": 2.6530612244897958,
"y": -4.260219020168002,
"x_big": 1000000002.6530613
},
{
"x": 2.857142857142857,
"y": -3.8677749753701756,
"x_big": 1000000002.8571428
},
{
"x": 3.0612244897959187,
"y": -2.501063039445054,
"x_big": 1000000003.0612245
},
{
"x": 3.2653061224489797,
"y": -2.747524997885444,
"x_big": 1000000003.2653061
},
{
"x": 3.4693877551020407,
"y": -1.2163649123026854,
"x_big": 1000000003.4693878
},
{
"x": 3.673469387755102,
"y": -2.2345546877661087,
"x_big": 1000000003.6734694
},
{
"x": 3.8775510204081636,
"y": -2.534752680927128,
"x_big": 1000000003.8775511
},
{
"x": 4.081632653061225,
"y": 0.5472814219827786,
"x_big": 1000000004.0816326
},
{
"x": 4.285714285714286,
"y": -0.94006201477225,
"x_big": 1000000004.2857143
},
{
"x": 4.4897959183673475,
"y": -0.44267587694472865,
"x_big": 1000000004.4897959
},
{
"x": 4.6938775510204085,
"y": -1.7308706351930483,
"x_big": 1000000004.6938776
},
{
"x": 4.8979591836734695,
"y": -0.6464235408517132,
"x_big": 1000000004.8979592
},
{
"x": 5.1020408163265305,
"y": 0.21296340603639657,
"x_big": 1000000005.1020408
},
{
"x": 5.3061224489795915,
"y": -0.8448711284427113,
"x_big": 1000000005.3061224
},
{
"x": 5.510204081632653,
"y": 0.8859020999783254,
"x_big": 1000000005.5102041
},
{
"x": 5.714285714285714,
"y": 0.11364702436690943,
"x_big": 1000000005.7142857
},
{
"x": 5.918367346938775,
"y": 0.6266735971454986,
"x_big": 1000000005.9183674
},
{
"x": 6.122448979591837,
"y": 0.5207423673624404,
"x_big": 1000000006.1224489
},
{
"x": 6.326530612244898,
"y": 3.1788087967538363,
"x_big": 1000000006.3265306
},
{
"x": 6.530612244897959,
"y": 1.5171150201600254,
"x_big": 1000000006.5306122
},
{
"x": 6.73469387755102,
"y": 0.67698294859512,
"x_big": 1000000006.7346939
},
{
"x": 6.938775510204081,
"y": 2.7613204223072705,
"x_big": 1000000006.9387755
},
{
"x": 7.142857142857143,
"y": 0.922013492886121,
"x_big": 1000000007.1428572
},
{
"x": 7.346938775510204,
"y": 2.5558023705149595,
"x_big": 1000000007.3469387
},
{
"x": 7.551020408163265,
"y": 0.5913502842834897,
"x_big": 1000000007.5510204
},
{
"x": 7.755102040816327,
"y": 1.4269159919178966,
"x_big": 1000000007.755102
},
{
"x": 7.959183673469388,
"y": 3.1560449093385117,
"x_big": 1000000007.9591837
},
{
"x": 8.16326530612245,
"y": 3.90173188611786,
"x_big": 1000000008.1632653
},
{
"x": 8.36734693877551,
"y": 3.5387152199654808,
"x_big": 1000000008.3673469
},
{
"x": 8.571428571428571,
"y": 3.4557802890403306,
"x_big": 1000000008.5714285
},
{
"x": 8.775510204081632,
"y": 3.4744065084923434,
"x_big": 1000000008.7755102
},
{
"x": 8.979591836734695,
"y": 2.5010698463672676,
"x_big": 1000000008.9795918
},
{
"x": 9.183673469387756,
"y": 3.463829260993047,
"x_big": 1000000009.1836735
},
{
"x": 9.387755102040817,
"y": 3.9271163310810295,
"x_big": 1000000009.3877552
},
{
"x": 9.591836734693878,
"y": 5.648958960912793,
"x_big": 1000000009.5918367
},
{
"x": 9.795918367346939,
"y": 5.1395366569154,
"x_big": 1000000009.7959183
},
{"x": 10, "y": 3.236959844637266, "x_big": 1000000010}
]
}
}