多元回归分析 ¶
多元线性回归模型表示为: $$ f(x_i) = \mathbf{w}^T \mathbf{x}_i + b $$ 其中,$\mathbf{w}$ 是权重向量,$b$ 是偏置项,$\mathbf{x}_i$ 是输入向量,$f(x_i)$ 是预测值。
参数估计 ¶
为了估计 $\mathbf{w}$ 和 $b$,可以使用最小二乘法。将 $\mathbf{w}$ 和 $b$ 合并为向量形式 $\hat{\mathbf{w}} = (\mathbf{w}; b)$,数据集 $D$ 表示为矩阵 $\mathbf{X}$,其中每行对应一个样本,前 $d$ 个元素对应样本的 $d$ 个属性值,最后一个元素恒定为 1。
$$ \mathbf{X} = \begin{pmatrix} x_{11} & x_{12} & \ldots & x_{1d} & 1 \\ x_{21} & x_{22} & \ldots & x_{2d} & 1 \\ \vdots & \vdots & \ddots & \vdots & \vdots \\ x_{m1} & x_{m2} & \ldots & x_{md} & 1 \\ \end{pmatrix} = \begin{pmatrix} \mathbf{x}_1^T & 1 \\ \mathbf{x}_2^T & 1 \\ \vdots & \vdots \\ \mathbf{x}_m^T & 1 \\ \end{pmatrix} $$
标记向量 $\mathbf{y} = (y_1; y_2; \ldots; y_m)$,则目标函数为:
$$ \hat{\mathbf{w}}^* = \arg \min_{\hat{\mathbf{w}}} (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})^T (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}}) $$
令 $E_{\hat{\mathbf{w}}} = (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})^T (\mathbf{y} - \mathbf{X}\hat{\mathbf{w}})$,对 $\hat{\mathbf{w}}$ 求导并令其为零,得到最优解的形式:
$$ \frac{\partial E_{\hat{\mathbf{w}}}}{\partial \hat{\mathbf{w}}} = 2 \mathbf{X}^T (\mathbf{X}\hat{\mathbf{w}} - \mathbf{y}) = 0 $$
当 $\mathbf{X}^T \mathbf{X}$ 为满秩矩阵(full-rank matrix)或正定矩阵(positive definite matrix)时,可得:
$$ \hat{\mathbf{w}}^* = (\mathbf{X}^T \mathbf{X})^{-1} \mathbf{X}^T \mathbf{y} $$
参数的区间 ¶
$$ (\hat\beta_j -\beta_j)/(\hat{\sigma}\sqrt{L^{-1}_{jj}}) \sim t_{n-p-1} $$
import scipy.stats as st
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import linregress
n=2
fa=n/2/(n-1)
a=st.f(2,n-2).ppf(0.95)/fa
achi=st.chi2(2).ppf(0.95)
print(a,achi)
nan 5.991464547107979
线性回归的误差 ¶
$L=(X^TX)$,$\sigma^2 L^{-1}$ 是 $\hat{\mathbf{w}}$ 的协方差矩阵
有
- $\text{Var}(\beta_0) = \sigma^2/n$
- $\text{Var}(\beta_i) = \sigma^2 L^{-1}_{ii}$
- $\text{Var}(\beta_0,\beta_j)=0$
- $\text{Var}(\beta_i,\beta_j) = \sigma^2 L^{-1}_{ij}$
$\sigma^2$ 的估计仍然可以用残差估计
$$ \begin{aligned} \delta^2 = (Y - X\beta)^\intercal (Y-X\beta)\\ \hat \sigma^2 =\delta^2/(n-p-1)\\ \delta^2/\sigma^2 \sim \chi^2_{n-p-1} \end{aligned} $$
%matplotlib notebook
import scipy.stats as st
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(304)
one=np.ones(20)
x=st.uniform(0,10).rvs(20)
y=st.uniform(0,8).rvs(20)
print(x.shape,y.shape)
Xmm=np.vstack([one,x-np.mean(x),y-np.mean(y)]).T # 先叠成3行,再转置成为3列,变成每一个样本为一行
# print(Xmm.shape)
beta=np.linalg.solve(Xmm.T@Xmm,Xmm.T@Y)
# print(beta,"\n")
L_inv=np.linalg.inv(Xmm.T@Xmm)
sigma2=(Y-Xmm@beta).T @(Y-Xmm@beta)/(20-2-1)
# print(L_inv,"\n")
err=np.sqrt(sigma2*np.diag(L_inv))
# print(err)
(20,) (20,)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[2], line 17 15 Xmm=np.vstack([one,x-np.mean(x),y-np.mean(y)]).T # 先叠成3行,再转置成为3列,变成每一个样本为一行 16 # print(Xmm.shape) ---> 17 beta=np.linalg.solve(Xmm.T@Xmm,Xmm.T@Y) 18 # print(beta,"\n") 19 L_inv=np.linalg.inv(Xmm.T@Xmm) NameError: name 'Y' is not defined
两个变量的时候,通常检验 $\beta_1$ 和 $\beta_2$ 的计算表达式是否小于 5.99
$f(x)$ 的区间比 $f(x_i)$ 更宽泛
%matplotlib notebook
import scipy.stats as st
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import linregress
from matplotlib.animation import FuncAnimation
np.random.seed(304)
a=4
b=4
c=10
x=st.uniform(0,10).rvs(20)
y=st.uniform(0,8).rvs(20)
z=[np.random.normal(a*i+b*j+c,5) for i,j in zip(x,y)]
def animate(dfn):
    ax.view_init(elev=32, azim=dfn)
    
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, projection='3d')
ax.plot(x, y, z, color='k', zorder=15, linestyle='none', marker='o', alpha=0.5)
anim = FuncAnimation(fig, animate, frames=range(0,360,2))
# anim.save('data.gif',fps=3,dpi=200)
from IPython.display import HTML
HTML(anim.to_jshtml())
fig.show()
# 使用sklearn进行回归
from sklearn import linear_model
    
X=np.vstack((x, y)).T # vstack 叠叠乐函数
ols = linear_model.LinearRegression() # 创建模型
model = ols.fit(X,z) # 预测模型
x_pred = np.linspace(0, 10, 50)  
y_pred = np.linspace(0, 8, 50)  
xx_pred, yy_pred = np.meshgrid(x_pred, y_pred) # 拼接成一个50x50的网格
#flatten() change the 50x50 to 1x2500
model_viz = np.array([xx_pred.flatten(), yy_pred.flatten()]).T
# print(model_viz.shape)----- (2500,2)
predicted = model.predict(model_viz)
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z,color='k',alpha=0.5 )
ax.plot(xx_pred.flatten(), yy_pred.flatten(), predicted, alpha=0.9)
anim = FuncAnimation(fig, animate, frames=range(0,360,3))
fig.show()
# anim.save('fit.gif',fps=3,dpi=200)
from IPython.display import HTML
HTML(anim.to_jshtml())
print(model.intercept_,model.coef_) # 打印截距和每个X的系数
7.546774866213575 [3.8898734 4.2202737]
当然,也可以使用矩阵进行计算 需要注意的是,这里第一列是全1列
- 非 0 偏置(截距)缩到 $\beta$ 矩阵当中了,所以对应的 $\beta X$ 中 $X$ 要多一列全 1 值
# 使用矩阵方法进行计算
one=np.ones(20)
x=st.uniform(0,10).rvs(20)
y=st.uniform(0,8).rvs(20)
z=[np.random.normal(a*i+b*j+c,5) for i,j in zip(x,y)]
Xm=np.vstack([one,x,y]).T
Y = np.array(z).reshape(20,1)
b=np.linalg.solve(Xm.T@Xm,Xm.T@Y)
print(b)
[[6.96602352] [4.34440642] [4.11604012]]
stats 中的 OLS model ¶
import statsmodels.api as sm
    
#Xp=np.vstack((np.ones(20),x, y)).T
Xp = sm.add_constant(X) # 等同于上面一行
# Ordinary least square
model = sm.OLS(z, Xp)
results = model.fit()
print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.294
Model:                            OLS   Adj. R-squared:                  0.211
Method:                 Least Squares   F-statistic:                     3.540
Date:                Mon, 20 Oct 2025   Prob (F-statistic):             0.0518
Time:                        04:29:09   Log-Likelihood:                -78.492
No. Observations:                  20   AIC:                             163.0
Df Residuals:                      17   BIC:                             166.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         28.1183      8.023      3.505      0.003      11.192      45.045
x1             0.7903      1.102      0.717      0.483      -1.535       3.115
x2             3.5215      1.388      2.537      0.021       0.593       6.450
==============================================================================
Omnibus:                        0.855   Durbin-Watson:                   2.811
Prob(Omnibus):                  0.652   Jarque-Bera (JB):                0.685
Skew:                           0.415   Prob(JB):                        0.710
Kurtosis:                       2.635   Cond. No.                         18.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
可以使用 python 集成的功能进行计算
F 值
t 值: 相当于对 x1,x2 进行了原假设为 $\mu_{x_1} = \mu_{x_2} = 0$ 的假设
最后两列还会给出参数 95% 置信区间
可以通过 result 的函数获取计算结果
pingouin 包 ¶
也可以实现相同的效果,更加精简一点
import pingouin as pg
lm = pg.linear_regression(X, z)
print(lm)
       names       coef        se         T      pval        r2    adj_r2  \
0  Intercept  28.118337  8.022802  3.504803  0.002716  0.294045  0.210992   
1         x1   0.790336  1.102023  0.717168  0.483009  0.294045  0.210992   
2         x2   3.521503  1.387927  2.537239  0.021260  0.294045  0.210992   
    CI[2.5%]  CI[97.5%]  
0  11.191704  45.044970  
1  -1.534730   3.115402  
2   0.593233   6.449772  
假设检验 ¶
检验每个 $\beta_i$ ¶
$$ (\hat\beta_j -\beta_j)/(\hat{\sigma}\sqrt{L^{-1}_{jj}}) \sim t_{n-p-1} $$
检验整体 ( 看是否有相关性 ) ¶
一系列 $\beta_1,\beta_2,\beta_3 \dots$ 系数是否为 0
- $H_0: \beta_1, \cdots, \beta_r = 0$
- $R_2 = \delta'^2 | \beta_1, \cdots, \beta_r = 0$
- $R_1 = \delta^2$
- $\delta$ 重新在只有 $p-r$ 个变量时通过最小二乘法计算
- $(R_2 - R_1)/\sigma^2 \sim \chi^2_r$
- $(n-p-1)\hat{\sigma}^2/\sigma^2 \sim \chi^2_{n-p-1}$
- $(R_2 - R_1)/(r\hat{\sigma}^2) \sim F_{r,n-p-1}$
- 当 $(R_2 - R_1)/(r\hat{\sigma}^2) > F_{r,n-p-1}(0.05)$ 时否定假设
## Single beta_i=0
print(results.tvalues)
## multiple beta_r=0
## 验证方法1
print(results.f_pvalue)
hypotheses = '(x1 = x2 =0)'
print(results.f_test(hypotheses))
# p值小于0.05 就要拒绝原来的假设了
## 验证方法2:使得A矩阵线性无关的两个向量是要验证的向量
A = np.identity(len(results.params))
A = A[1:,:]
print(A)
print(results.f_test(A))
[3.50480251 0.71716783 2.53723929] 0.051832539952542854 <F test: F=3.540434571475479, p=0.05183253995254299, df_denom=17, df_num=2> [[0. 1. 0.] [0. 0. 1.]] <F test: F=3.5404345714754792, p=0.05183253995254299, df_denom=17, df_num=2>
%matplotlib inline
from sklearn.preprocessing import PolynomialFeatures
a=0.3
b=-4
c=10
d=8
x2=st.uniform(0,10).rvs(40)
x2=np.sort(x2)
print("x2=",x2.shape)
y2=np.random.normal(a*x2**3+b*x2**2+c*x2+d,2)
x2= (40,)
需要准备数据,把数据转换成每列代表不同的属性,每一行代表不同的样本
poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(np.array(x2).reshape(40,1))
fig = plt.figure()
plt.scatter(x2,y2)
poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(np.array(x2).reshape(40,1))
print(X_poly.shape)
model2 = sm.OLS(y2, X_poly)
results2 = model2.fit()
print(x2.shape,y2.shape)
print(results2.params.shape)
ypred = model2.predict(results2.params,X_poly)
plt.plot(x2,ypred,color = 'C0')
# plt.show()
print(results2.summary())
(40, 6)
(40,) (40,)
(6,)
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.980
Model:                            OLS   Adj. R-squared:                  0.977
Method:                 Least Squares   F-statistic:                     325.7
Date:                Mon, 20 Oct 2025   Prob (F-statistic):           1.09e-27
Time:                        04:29:09   Log-Likelihood:                -72.219
No. Observations:                  40   AIC:                             156.4
Df Residuals:                      34   BIC:                             166.6
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.8112      1.211      5.624      0.000       4.350       9.272
x1            13.8635      3.163      4.383      0.000       7.436      20.291
x2            -6.7615      2.080     -3.251      0.003     -10.989      -2.534
x3             1.0314      0.555      1.857      0.072      -0.097       2.160
x4            -0.0797      0.065     -1.235      0.225      -0.211       0.051
x5             0.0031      0.003      1.134      0.265      -0.002       0.009
==============================================================================
Omnibus:                        1.219   Durbin-Watson:                   2.149
Prob(Omnibus):                  0.544   Jarque-Bera (JB):                0.414
Skew:                           0.047   Prob(JB):                        0.813
Kurtosis:                       3.489   Cond. No.                     2.63e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.63e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib_inline/backend_inline.py:99, in show(close, block) 96 # only call close('all') if any to close 97 # close triggers gc.collect, which can be slow 98 if close and Gcf.get_all_fig_managers(): ---> 99 matplotlib.pyplot.close('all') File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/pyplot.py:1201, in close(fig) 1199 _pylab_helpers.Gcf.destroy(manager) 1200 elif fig == 'all': -> 1201 _pylab_helpers.Gcf.destroy_all() 1202 elif isinstance(fig, int): 1203 _pylab_helpers.Gcf.destroy(fig) File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/_pylab_helpers.py:81, in Gcf.destroy_all(cls) 79 for manager in list(cls.figs.values()): 80 manager.canvas.mpl_disconnect(manager._cidgcf) ---> 81 manager.destroy() 82 cls.figs.clear() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/backends/backend_nbagg.py:144, in FigureManagerNbAgg.destroy(self) 142 for comm in list(self.web_sockets): 143 comm.on_close() --> 144 self.clearup_closed() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/backends/backend_nbagg.py:152, in FigureManagerNbAgg.clearup_closed(self) 148 self.web_sockets = {socket for socket in self.web_sockets 149 if socket.is_open()} 151 if len(self.web_sockets) == 0: --> 152 CloseEvent("close_event", self.canvas)._process() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/backend_bases.py:1189, in Event._process(self) 1187 def _process(self): 1188 """Process this event on ``self.canvas``, then unset ``guiEvent``.""" -> 1189 self.canvas.callbacks.process(self.name, self) 1190 self.guiEvent = None File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/cbook.py:366, in CallbackRegistry.process(self, s, *args, **kwargs) 364 except Exception as exc: 365 if self.exception_handler is not None: --> 366 self.exception_handler(exc) 367 else: 368 raise File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/cbook.py:110, in _exception_printer(exc) 108 def _exception_printer(exc): 109 if _get_running_interactive_framework() in ["headless", None]: --> 110 raise exc 111 else: 112 traceback.print_exc() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/cbook.py:361, in CallbackRegistry.process(self, s, *args, **kwargs) 359 if func is not None: 360 try: --> 361 func(*args, **kwargs) 362 # this does not capture KeyboardInterrupt, SystemExit, 363 # and GeneratorExit 364 except Exception as exc: File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/animation.py:940, in Animation._stop(self, *args) 938 self._fig.canvas.mpl_disconnect(self._resize_id) 939 self._fig.canvas.mpl_disconnect(self._close_id) --> 940 self.event_source.remove_callback(self._step) 941 self.event_source = None AttributeError: 'NoneType' object has no attribute 'remove_callback'
# 使用f-test对poly阶数进行检验
print(results2.f_test('x4=x5=0'))
print(results2.f_test('x3=x4=x5=0'))
<F test: F=1.2003554155707552, p=0.3135262379513147, df_denom=34, df_num=2> <F test: F=115.33656688355687, p=6.863336752113644e-18, df_denom=34, df_num=3>
for i in (2,3):
    poly = PolynomialFeatures(degree = i)
    X_poly = poly.fit_transform(np.array(x2).reshape(40,1))
    model2 = sm.OLS(y2, X_poly)
    results2 = model2.fit()
    print(results2.summary())
    ypred = model2.predict(results2.params,X_poly)
    fig.gca().plot(x2,ypred,color = 'C{:d}'.format(i))
fig
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.771
Model:                            OLS   Adj. R-squared:                  0.759
Method:                 Least Squares   F-statistic:                     62.42
Date:                Mon, 20 Oct 2025   Prob (F-statistic):           1.39e-12
Time:                        04:29:10   Log-Likelihood:                -120.50
No. Observations:                  40   AIC:                             247.0
Df Residuals:                      37   BIC:                             252.1
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         15.6061      1.855      8.415      0.000      11.848      19.364
x1            -3.8036      1.029     -3.697      0.001      -5.888      -1.719
x2             0.0469      0.119      0.394      0.696      -0.195       0.289
==============================================================================
Omnibus:                       11.658   Durbin-Watson:                   0.499
Prob(Omnibus):                  0.003   Jarque-Bera (JB):               13.411
Skew:                           0.915   Prob(JB):                      0.00122
Kurtosis:                       5.168   Cond. No.                         80.7
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.978
Model:                            OLS   Adj. R-squared:                  0.976
Method:                 Least Squares   F-statistic:                     536.0
Date:                Mon, 20 Oct 2025   Prob (F-statistic):           6.48e-30
Time:                        04:29:10   Log-Likelihood:                -73.584
No. Observations:                  40   AIC:                             155.2
Df Residuals:                      36   BIC:                             161.9
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.3412      0.703     11.869      0.000       6.916       9.766
x1             9.1856      0.775     11.852      0.000       7.614      10.757
x2            -3.7585      0.210    -17.915      0.000      -4.184      -3.333
x3             0.2844      0.015     18.434      0.000       0.253       0.316
==============================================================================
Omnibus:                        1.220   Durbin-Watson:                   2.023
Prob(Omnibus):                  0.543   Jarque-Bera (JB):                0.424
Skew:                           0.102   Prob(JB):                        0.809
Kurtosis:                       3.461   Cond. No.                         960.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
指数拟合 ¶
$$ \begin{align*} Y = b_0 e^{b_1X}\\ \log Y = \log b_0 + b_1 X\\ \end{align*} $$
两个变量的相关性 ¶
- 相关系数 (Correlation)
$$ \rho = \frac{\text{Cov}(x, y)}{\sqrt{\text{Var}(x) \text{Var}(y)}} $$
- 用样本估计
$$ r = \frac{\sum (x - \bar{x})(y - \bar{y})}{\sqrt{\sum (x - \bar{x})^2 \sum (y - \bar{y})^2}} $$
- 假设检验:皮尔逊相关系数检验
$$ H_0: \rho = 0,H_1: \rho \ne 0 $$
检验统计量为
$$ T = \frac{r\sqrt{n - 2}}{\sqrt{1 - r^2}} \sim t_{n-2} $$
$$ C = t_{n-2}(\alpha/2) \sqrt{\frac{n - 2 + t_{n-2}^2(\alpha/2)}{n - 2}} $$
需要注意,在判断相关性的时候,要注意系统的自由度
系统自由度很低的时候(样本很少
由下图可以看出,在样本数量 20 的时候,小于 0.63 就认为不相关了
## C的取值
cl=[]
nl=[]
for n in range(10,100,5):
    # print(n,cl)
    tn=st.t.ppf(0.975,n-2)
    c=tn/np.sqrt(n-2+tn**2)
    cl.append(c)
    nl.append(n)
plt.plot(nl,cl)
plt.title('Critical Correlation Coefficient vs Sample Size')
plt.xlabel('Sample Size (n)')
plt.ylabel('Critical r value')
plt.grid(True, alpha=0.3)
偏相关 ¶
分析相关性,给 x1 和给 x2 差不多的话,就可以先去掉一个
取结果中方差最小的
根据全相关系数来求偏相关系数
p 值很大,支持原假设 p值很小,不支持原假设
- 一组观测量 $X_i$ 中,互相之间有关联 - $X_3$ 为一个人的收入,$X_1, X_2$ 为一个人在吃和穿上的花费
- 观察到 $X_1, X_2$ 的正相关,$r > 0$
- 实际 $X_1, X_2$ 均由 $X_3$ 带动,使得他们呈现正相关
- 若能去掉 $X_3$ 的影响,观测它们的实际相关,可能转为负相关
 
- 重新定义 $X_1', X_2'$ 
$$ X_1' = X_1 - L(X_3 \cdots) $$
$$ X_2' = X_2 - L'(X_3 \cdots) $$
使得 $E(X_1')^2$ 和 $E(X_2')^2$ 最小
可以证明
$$ \rho(X_1', X_2') = (\rho_{12} - \rho_{13} \rho_{23})/[(1 - \rho_{13}^2)(1 - \rho_{23}^2)]^{1/2} $$
将原本 $X$ 的相关矩阵写为
$$ P = \begin{pmatrix} 1 & \cdots & \rho_{1p} \\ \vdots & 1 & \vdots \\ \rho_{p1} & \cdots & 1 \end{pmatrix} $$
记 $P_{ij}$ 为 $\rho(X_i', X_j')$ 为去掉第 $i$ 行第 $j$ 列后的行列式
- $\rho(X_1', X_2') = P_{12}/\sqrt{P_{11} P_{22}}$
%matplotlib inline
import pandas as pd
x3=st.expon(0,1000).rvs(50) # income
x1=np.random.normal(x3*0.4+30,100) # spent on eating
x2= np.random.normal(x3-x1-20,10) # spent on clothes
plt.scatter(x1,x2)
plt.title('Spending on Food vs Clothes')
plt.xlabel('Spending on Food')
plt.ylabel('Spending on Clothes')
plt.show()
xyz=np.vstack((x1,x2,x3)).T
df = pd.DataFrame(xyz, columns = ['x', 'y', 'z']) 
print("Correlation\n",df.corr())
print("Partial Correlation\n",df.pcorr())
print(pg.partial_corr(data=df, x='x', y='y', covar='z',method='pearson'))
cor=np.corrcoef(xyz.T)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[15], line 11 9 plt.xlabel('Spending on Food') 10 plt.ylabel('Spending on Clothes') ---> 11 plt.show() 12 xyz=np.vstack((x1,x2,x3)).T 14 df = pd.DataFrame(xyz, columns = ['x', 'y', 'z']) File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/pyplot.py:613, in show(*args, **kwargs) 569 """ 570 Display all open figures. 571 (...) 610 explicitly there. 611 """ 612 _warn_if_gui_out_of_main_thread() --> 613 return _get_backend_mod().show(*args, **kwargs) File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib_inline/backend_inline.py:99, in show(close, block) 96 # only call close('all') if any to close 97 # close triggers gc.collect, which can be slow 98 if close and Gcf.get_all_fig_managers(): ---> 99 matplotlib.pyplot.close('all') File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/pyplot.py:1201, in close(fig) 1199 _pylab_helpers.Gcf.destroy(manager) 1200 elif fig == 'all': -> 1201 _pylab_helpers.Gcf.destroy_all() 1202 elif isinstance(fig, int): 1203 _pylab_helpers.Gcf.destroy(fig) File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/_pylab_helpers.py:81, in Gcf.destroy_all(cls) 79 for manager in list(cls.figs.values()): 80 manager.canvas.mpl_disconnect(manager._cidgcf) ---> 81 manager.destroy() 82 cls.figs.clear() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/backends/backend_nbagg.py:144, in FigureManagerNbAgg.destroy(self) 142 for comm in list(self.web_sockets): 143 comm.on_close() --> 144 self.clearup_closed() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/backends/backend_nbagg.py:152, in FigureManagerNbAgg.clearup_closed(self) 148 self.web_sockets = {socket for socket in self.web_sockets 149 if socket.is_open()} 151 if len(self.web_sockets) == 0: --> 152 CloseEvent("close_event", self.canvas)._process() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/backend_bases.py:1189, in Event._process(self) 1187 def _process(self): 1188 """Process this event on ``self.canvas``, then unset ``guiEvent``.""" -> 1189 self.canvas.callbacks.process(self.name, self) 1190 self.guiEvent = None File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/cbook.py:366, in CallbackRegistry.process(self, s, *args, **kwargs) 364 except Exception as exc: 365 if self.exception_handler is not None: --> 366 self.exception_handler(exc) 367 else: 368 raise File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/cbook.py:110, in _exception_printer(exc) 108 def _exception_printer(exc): 109 if _get_running_interactive_framework() in ["headless", None]: --> 110 raise exc 111 else: 112 traceback.print_exc() File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/cbook.py:361, in CallbackRegistry.process(self, s, *args, **kwargs) 359 if func is not None: 360 try: --> 361 func(*args, **kwargs) 362 # this does not capture KeyboardInterrupt, SystemExit, 363 # and GeneratorExit 364 except Exception as exc: File /opt/hostedtoolcache/Python/3.12.4/x64/lib/python3.12/site-packages/matplotlib/animation.py:940, in Animation._stop(self, *args) 938 self._fig.canvas.mpl_disconnect(self._resize_id) 939 self._fig.canvas.mpl_disconnect(self._close_id) --> 940 self.event_source.remove_callback(self._step) 941 self.event_source = None AttributeError: 'NoneType' object has no attribute 'remove_callback'
# 自己计算也是一样的效果
print((cor[0,1]-cor[0,2]*cor[1,2])/np.sqrt((1-cor[0,2]**2)*(1-cor[1,2]**2)))
print((cor[1,2]-cor[0,1]*cor[0,2])/np.sqrt((1-cor[0,1]**2)*(1-cor[0,2]**2)))
print((cor[0,2]-cor[0,1]*cor[1,2])/np.sqrt((1-cor[0,1]**2)*(1-cor[1,2]**2)))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[16], line 2 1 # 自己计算也是一样的效果 ----> 2 print((cor[0,1]-cor[0,2]*cor[1,2])/np.sqrt((1-cor[0,2]**2)*(1-cor[1,2]**2))) 3 print((cor[1,2]-cor[0,1]*cor[0,2])/np.sqrt((1-cor[0,1]**2)*(1-cor[0,2]**2))) 4 print((cor[0,2]-cor[0,1]*cor[1,2])/np.sqrt((1-cor[0,1]**2)*(1-cor[1,2]**2))) NameError: name 'cor' is not defined
meana=[23,34,51]
covD=np.array((4,5,6))
corD=[[1,0.5,0.9],[0.5,1,0.2],[0.9,0.2,1]]
cov=covD*covD.reshape(3,-1)*corD
print(np.array(corD))
mn=st.multivariate_normal(meana,cov)
xyz=mn.rvs(100)
df = pd.DataFrame(xyz, columns = ['x', 'y', 'z']) 
print(df.corr())
df.pcorr().round(3)
[[1.  0.5 0.9]
 [0.5 1.  0.2]
 [0.9 0.2 1. ]]
          x         y         z
x  1.000000  0.503486  0.890283
y  0.503486  1.000000  0.194100
z  0.890283  0.194100  1.000000
| x | y | z | |
|---|---|---|---|
| x | 1.000 | 0.740 | 0.935 | 
| y | 0.740 | 1.000 | -0.646 | 
| z | 0.935 | -0.646 | 1.000 |