Lesson 7: Linear Models
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.graphics.api as smg
import patsy
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
np.random.seed(123456789)
y = np.array([1, 2, 3, 4, 5])
x1 = np.array([6, 7, 8, 9, 10])
x2 = np.array([11, 12, 13, 14, 15])
X = np.vstack([np.ones(5), x1, x2, x1*x2]).T
X
array([[ 1., 6., 11., 66.],
[ 1., 7., 12., 84.],
[ 1., 8., 13., 104.],
[ 1., 9., 14., 126.],
[ 1., 10., 15., 150.]])
beta, res, rank, sval = np.linalg.lstsq(X, y)
beta
array([ -5.55555556e-01, 1.88888889e+00, -8.88888889e-01,
-5.25467605e-16])
model = sm.OLS(y, X)
result = model.fit()
result.params
array([ -5.55555556e-01, 1.88888889e+00, -8.88888889e-01,
-4.99600361e-16])
data = {"y": y, "x1": x1, "x2": x2}
df_data = pd.DataFrame(data)
model = smf.ols("y ~ 1 + x1 + x2 + x1:x2", df_data)
result = model.fit()
result.params
Intercept -5.555556e-01
x1 1.888889e+00
x2 -8.888889e-01
x1:x2 -4.996004e-16
dtype: float64
print(result.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 1.000
Model: OLS Adj. R-squared: 1.000
Method: Least Squares F-statistic: 1.089e+28
Date: Sun, 19 Jun 2016 Prob (F-statistic): 9.18e-29
Time: 15:39:00 Log-Likelihood: 152.57
No. Observations: 5 AIC: -299.1
Df Residuals: 2 BIC: -300.3
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept -0.5556 4.89e-14 -1.14e+13 0.000 -0.556 -0.556
x1 1.8889 1.82e-13 1.04e+13 0.000 1.889 1.889
x2 -0.8889 6.21e-14 -1.43e+13 0.000 -0.889 -0.889
x1:x2 -4.996e-16 5.73e-15 -0.087 0.938 -2.51e-14 2.41e-14
==============================================================================
Omnibus: nan Durbin-Watson: 0.024
Prob(Omnibus): nan Jarque-Bera (JB): 0.484
Skew: 0.158 Prob(JB): 0.785
Kurtosis: 1.509 Cond. No. 1.26e+18
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.91e-32. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
beta
array([ -5.55555556e-01, 1.88888889e+00, -8.88888889e-01,
-5.25467605e-16])
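As a cross-check (a minimal sketch reusing the X and y defined above), the same estimate can be computed explicitly with the Moore-Penrose pseudoinverse, which is effectively what both np.linalg.lstsq and sm.OLS fall back on here because the design matrix is rank deficient (x2 = x1 + 5):
beta_pinv = np.linalg.pinv(X).dot(y)  # minimum-norm least-squares solution
beta_pinv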
from collections import defaultdict
data = defaultdict(lambda: np.array([1,2,3]))
patsy.dmatrices("y ~ a", data=data)[1].design_info.term_names
['Intercept', 'a']
patsy.dmatrices("y ~ 1 + a + b", data=data)[1].design_info.term_names
['Intercept', 'a', 'b']
patsy.dmatrices("y ~ -1 + a + b", data=data)[1].design_info.term_names
['a', 'b']
patsy.dmatrices("y ~ a * b", data=data)[1].design_info.term_names
['Intercept', 'a', 'b', 'a:b']
patsy.dmatrices("y ~ a * b * c", data=data)[1].design_info.term_names
['Intercept', 'a', 'b', 'a:b', 'c', 'a:c', 'b:c', 'a:b:c']
patsy.dmatrices("y ~ a * b * c - a:b:c", data=data)[1].design_info.term_names
['Intercept', 'a', 'b', 'a:b', 'c', 'a:c', 'b:c']
data = {k: np.array([]) for k in ["y", "a", "b", "c"]}
patsy.dmatrices("y ~ a + b", data=data)[1].design_info.term_names
['Intercept', 'a', 'b']
patsy.dmatrices("y ~ I(a + b)", data=data)[1].design_info.term_names
['Intercept', 'I(a + b)']
patsy.dmatrices("y ~ I(a**2)", data=data)[1].design_info.term_names
['Intercept', 'I(a ** 2)']
patsy.dmatrices("y ~ np.log(a) + b", data=data)[1].design_info.term_names
['Intercept', 'np.log(a)', 'b']
z = lambda x1, x2: x1+x2
patsy.dmatrices("y ~ z(a, b)", data=data)[1].design_info.term_names
['Intercept', 'z(a, b)']
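The design matrices that patsy constructs can also be passed directly to the model classes, which is what smf.ols does behind the scenes. A minimal sketch, reusing df_data from above:
y_dm, X_dm = patsy.dmatrices("y ~ 1 + x1 + x2 + x1:x2", data=df_data)
res_dm = sm.OLS(y_dm, X_dm).fit()
res_dm.params  # same estimates as the smf.ols fit earlier in this lesson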
Categorical variables
data = {"y": [1, 2, 3], "a": [1, 2, 3]}
patsy.dmatrices("y ~ - 1 + a", data=data, return_type="dataframe")[1]
patsy.dmatrices("y ~ - 1 + C(a)", data=data, return_type="dataframe")[1]
   C(a)[1]  C(a)[2]  C(a)[3]
0      1.0      0.0      0.0
1      0.0      1.0      0.0
2      0.0      0.0      1.0
data = {"y": [1, 2, 3], "a": ["type A", "type B", "type C"]}
patsy.dmatrices("y ~ - 1 + a", data=data, return_type="dataframe")[1]
   a[type A]  a[type B]  a[type C]
0        1.0        0.0        0.0
1        0.0        1.0        0.0
2        0.0        0.0        1.0
patsy.dmatrices("y ~ - 1 + C(a, Poly)", data=data, return_type="dataframe")[1]
   C(a, Poly).Constant  C(a, Poly).Linear  C(a, Poly).Quadratic
0                  1.0      -7.071068e-01              0.408248
1                  1.0      -4.433780e-17             -0.816497
2                  1.0       7.071068e-01              0.408248
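When the intercept is kept, patsy's default treatment (dummy) coding drops the first level so that the design matrix stays full rank. A minimal sketch with the same data:
patsy.dmatrices("y ~ C(a)", data=data, return_type="dataframe")[1]
# columns: Intercept, C(a)[T.type B], C(a)[T.type C]; "type A" is the reference level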
2. Linear regression
np.random.seed(123456789)
N = 100
x1 = np.random.randn(N)
x2 = np.random.randn(N)
data = pd.DataFrame({"x1": x1, "x2": x2})
def y_true(x1, x2):
    return 1 + 2 * x1 + 3 * x2 + 4 * x1 * x2
data["y_true"] = y_true(x1, x2)
e = np.random.randn(N)
data["y"] = data["y_true"] + e
data.head()
         x1        x2     y_true          y
0  2.212902 -0.474588  -0.198823  -1.452775
1  2.128398 -1.524772 -12.298805 -12.560965
2  1.841711 -1.939271 -15.420705 -14.715090
3  0.082382  0.345148   2.313945   1.190283
4  0.858964 -0.621523  -1.282107   0.307772
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
axes[0].scatter(data["x1"], data["y"])
axes[1].scatter(data["x2"], data["y"])
fig.tight_layout()

data.shape
(100, 4)
model = smf.ols("y ~ x1 + x2", data)
result = model.fit()
print(result.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.380
Model: OLS Adj. R-squared: 0.367
Method: Least Squares F-statistic: 29.76
Date: Sun, 19 Jun 2016 Prob (F-statistic): 8.36e-11
Time: 15:39:09 Log-Likelihood: -271.52
No. Observations: 100 AIC: 549.0
Df Residuals: 97 BIC: 556.9
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept 0.9868 0.382 2.581 0.011 0.228 1.746
x1 1.0810 0.391 2.766 0.007 0.305 1.857
x2 3.0793 0.432 7.134 0.000 2.223 3.936
==============================================================================
Omnibus: 19.951 Durbin-Watson: 1.682
Prob(Omnibus): 0.000 Jarque-Bera (JB): 49.964
Skew: -0.660 Prob(JB): 1.41e-11
Kurtosis: 6.201 Cond. No. 1.32
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
result.rsquared
0.38025383255132528
result.resid.head()
0 -3.370455
1 -11.153477
2 -11.721319
3 -0.948410
4 0.306215
dtype: float64
z, p = stats.normaltest(result.resid.values)
p
4.652499025300895e-05
result.params
Intercept 0.986826
x1 1.081044
x2 3.079284
dtype: float64
fig, ax = plt.subplots(figsize=(8, 4))
smg.qqplot(result.resid, ax=ax)
fig.tight_layout()

model = smf.ols("y ~ x1 + x2 + x1:x2", data)
result = model.fit()
print(result.summary())
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.955
Model: OLS Adj. R-squared: 0.954
Method: Least Squares F-statistic: 684.5
Date: Sun, 19 Jun 2016 Prob (F-statistic): 1.21e-64
Time: 15:39:09 Log-Likelihood: -140.01
No. Observations: 100 AIC: 288.0
Df Residuals: 96 BIC: 298.4
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept 0.8706 0.103 8.433 0.000 0.666 1.076
x1 1.9693 0.108 18.160 0.000 1.754 2.185
x2 2.9670 0.117 25.466 0.000 2.736 3.198
x1:x2 3.9440 0.112 35.159 0.000 3.721 4.167
==============================================================================
Omnibus: 2.950 Durbin-Watson: 2.072
Prob(Omnibus): 0.229 Jarque-Bera (JB): 2.734
Skew: 0.327 Prob(JB): 0.255
Kurtosis: 2.521 Cond. No. 1.38
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
result.params
Intercept 0.870620
x1 1.969345
x2 2.967004
x1:x2 3.943993
dtype: float64
result.rsquared
0.95533937458843676
z, p = stats.normaltest(result.resid.values)
p
0.2287471048250517
fig, ax = plt.subplots(figsize=(8, 4))
smg.qqplot(result.resid, ax=ax)
fig.tight_layout()
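The restricted model (no interaction) and the full model can also be compared with a formal F-test. A minimal sketch using statsmodels' ANOVA helper, refitting both models under separate names:
result_noint = smf.ols("y ~ x1 + x2", data).fit()
result_int = smf.ols("y ~ x1 + x2 + x1:x2", data).fit()
sm.stats.anova_lm(result_noint, result_int)  # F-test for adding the x1:x2 term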

x = np.linspace(-1, 1, 50)
X1, X2 = np.meshgrid(x, x)
new_data = pd.DataFrame({"x1": X1.ravel(), "x2": X2.ravel()})
y_pred = result.predict(new_data)
y_pred.shape
(2500,)
y_pred = np.asarray(y_pred).reshape(50, 50)  # predict may return a pandas Series; convert before reshaping
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)
def plot_y_contour(ax, Y, title):
    c = ax.contourf(X1, X2, Y, 15, cmap=plt.cm.RdBu)
    ax.set_xlabel(r"$x_1$", fontsize=20)
    ax.set_ylabel(r"$x_2$", fontsize=20)
    ax.set_title(title)
    cb = fig.colorbar(c, ax=ax)
    cb.set_label(r"$y$", fontsize=20)
plot_y_contour(axes[0], y_true(X1, X2), "true relation")
plot_y_contour(axes[1], y_pred, "fitted model")
fig.tight_layout()
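A quick numerical summary of the same comparison (a sketch reusing the grid above):
rmse = np.sqrt(np.mean((y_true(X1, X2) - y_pred)**2))
rmse  # root-mean-square difference between the true surface and the fitted one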

Datasets from R
dataset = sm.datasets.get_rdataset("Icecream", "Ecdat")
dataset.title
'Ice Cream Consumption'
print(dataset.__doc__)
+------------+-------------------+
| Icecream | R Documentation |
+------------+-------------------+
Ice Cream Consumption
---------------------
Description
~~~~~~~~~~~
four–weekly observations from 1951–03–18 to 1953–07–11
*number of observations* : 30
*observation* : country
*country* : United States
Usage
~~~~~
::
data(Icecream)
Format
~~~~~~
A time serie containing :
cons
consumption of ice cream per head (in pints);
income
average family income per week (in US Dollars);
price
price of ice cream (per pint);
temp
average temperature (in Fahrenheit);
Source
~~~~~~
Hildreth, C. and J. Lu (1960) *Demand relations with autocorrelated
disturbances*, Technical Bulletin No 2765, Michigan State University.
References
~~~~~~~~~~
Verbeek, Marno (2004) *A guide to modern econometrics*, John Wiley and
Sons,
`http://www.econ.kuleuven.ac.be/GME <http://www.econ.kuleuven.ac.be/GME>`__,
chapter 4.
See Also
~~~~~~~~
``Index.Source``, ``Index.Economics``, ``Index.Econometrics``,
``Index.Observations``,
``Index.Time.Series``
dataset.data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 4 columns):
cons 30 non-null float64
income 30 non-null int64
price 30 non-null float64
temp 30 non-null int64
dtypes: float64(2), int64(2)
memory usage: 1.0 KB
model = smf.ols("cons ~ -1 + price + temp", data=dataset.data)
result = model.fit()
print(result.summary())
OLS Regression Results
==============================================================================
Dep. Variable: cons R-squared: 0.986
Model: OLS Adj. R-squared: 0.985
Method: Least Squares F-statistic: 1001.
Date: Sun, 19 Jun 2016 Prob (F-statistic): 9.03e-27
Time: 15:39:20 Log-Likelihood: 51.903
No. Observations: 30 AIC: -99.81
Df Residuals: 28 BIC: -97.00
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [95.0% Conf. Int.]
------------------------------------------------------------------------------
price 0.7254 0.093 7.805 0.000 0.535 0.916
temp 0.0032 0.000 6.549 0.000 0.002 0.004
==============================================================================
Omnibus: 5.350 Durbin-Watson: 0.637
Prob(Omnibus): 0.069 Jarque-Bera (JB): 3.675
Skew: 0.776 Prob(JB): 0.159
Kurtosis: 3.729 Cond. No. 593.
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
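When the intercept is dropped, statsmodels reports the uncentred R-squared, which is why the value above is so close to one and should not be compared with an ordinary R-squared. For comparison, a minimal sketch of the same regressors with the intercept kept:
result_const = smf.ols("cons ~ price + temp", data=dataset.data).fit()
result_const.rsquared  # centred R-squared of the model with an intercept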
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
smg.plot_fit(result, 0, ax=ax1)
smg.plot_fit(result, 1, ax=ax2)
fig.tight_layout()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
sns.regplot("price", "cons", dataset.data, ax=ax1);
sns.regplot("temp", "cons", dataset.data, ax=ax2);
fig.tight_layout()

3. Logistic regression
df = sm.datasets.get_rdataset("iris").data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
Sepal.Length 150 non-null float64
Sepal.Width 150 non-null float64
Petal.Length 150 non-null float64
Petal.Width 150 non-null float64
Species 150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB
df.Species.unique()
array(['setosa', 'versicolor', 'virginica'], dtype=object)
df_subset = df[(df.Species == "versicolor") | (df.Species == "virginica" )].copy()
df_subset.Species = df_subset.Species.map({"versicolor": 1, "virginica": 0})
df_subset.rename(columns={"Sepal.Length": "Sepal_Length", "Sepal.Width": "Sepal_Width",
"Petal.Length": "Petal_Length", "Petal.Width": "Petal_Width"}, inplace=True)
df_subset.head(3)
    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Species
50           7.0          3.2           4.7          1.4        1
51           6.4          3.2           4.5          1.5        1
52           6.9          3.1           4.9          1.5        1
model = smf.logit("Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width", data=df_subset)
result = model.fit()
Optimization terminated successfully.
Current function value: 0.059493
Iterations 12
print(result.summary())
Logit Regression Results
==============================================================================
Dep. Variable: Species No. Observations: 100
Model: Logit Df Residuals: 95
Method: MLE Df Model: 4
Date: Sun, 19 Jun 2016 Pseudo R-squ.: 0.9142
Time: 15:39:37 Log-Likelihood: -5.9493
converged: True LL-Null: -69.315
LLR p-value: 1.947e-26
================================================================================
coef std err z P>|z| [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept 42.6378 25.708 1.659 0.097 -7.748 93.024
Sepal_Length 2.4652 2.394 1.030 0.303 -2.228 7.158
Sepal_Width 6.6809 4.480 1.491 0.136 -2.099 15.461
Petal_Length -9.4294 4.737 -1.990 0.047 -18.714 -0.145
Petal_Width -18.2861 9.743 -1.877 0.061 -37.381 0.809
================================================================================
print(result.get_margeff().summary())
Logit Marginal Effects
=====================================
Dep. Variable: Species
Method: dydx
At: overall
================================================================================
dy/dx std err z P>|z| [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Sepal_Length 0.0445 0.038 1.163 0.245 -0.031 0.120
Sepal_Width 0.1207 0.064 1.891 0.059 -0.004 0.246
Petal_Length -0.1703 0.057 -2.965 0.003 -0.283 -0.058
Petal_Width -0.3303 0.110 -2.998 0.003 -0.546 -0.114
================================================================================
Note: judging from the marginal effects and their p-values, Sepal_Length and Sepal_Width do not seem to contribute much to the predictive power of the model, so we refit using only the petal measurements.
model = smf.logit("Species ~ Petal_Length + Petal_Width", data=df_subset)
result = model.fit()
Optimization terminated successfully.
Current function value: 0.102818
Iterations 10
print(result.summary())
Logit Regression Results
==============================================================================
Dep. Variable: Species No. Observations: 100
Model: Logit Df Residuals: 97
Method: MLE Df Model: 2
Date: Sun, 19 Jun 2016 Pseudo R-squ.: 0.8517
Time: 15:39:37 Log-Likelihood: -10.282
converged: True LL-Null: -69.315
LLR p-value: 2.303e-26
================================================================================
coef std err z P>|z| [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept 45.2723 13.612 3.326 0.001 18.594 71.951
Petal_Length -5.7545 2.306 -2.496 0.013 -10.274 -1.235
Petal_Width -10.4467 3.756 -2.782 0.005 -17.808 -3.086
================================================================================
print(result.get_margeff().summary())
Logit Marginal Effects
=====================================
Dep. Variable: Species
Method: dydx
At: overall
================================================================================
dy/dx std err z P>|z| [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Petal_Length -0.1736 0.052 -3.347 0.001 -0.275 -0.072
Petal_Width -0.3151 0.068 -4.608 0.000 -0.449 -0.181
================================================================================
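As a rough in-sample check (a minimal sketch, not part of the original analysis), the fitted probabilities can be thresholded at 0.5 and compared with the observed classes:
pred_class = (result.predict(df_subset) > 0.5).astype(int)
(pred_class == df_subset.Species).mean()  # fraction of correctly classified training samples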
params = result.params
# The decision boundary P(Species=1) = 0.5 corresponds to zero log-odds, i.e.
# Intercept + b_len * Petal_Length + b_wid * Petal_Width = 0; solving for Petal_Width
# gives the line Petal_Width = beta0 + beta1 * Petal_Length used in the plot below.
beta0 = -params['Intercept'] / params['Petal_Width']
beta1 = -params['Petal_Length'] / params['Petal_Width']
df_new = pd.DataFrame({"Petal_Length": np.random.randn(20)*0.5 + 5,
"Petal_Width": np.random.randn(20)*0.5 + 1.7})
df_new["P-Species"] = result.predict(df_new)
df_new["P-Species"].head(3)
0 0.995472
1 0.799899
2 0.000033
Name: P-Species, dtype: float64
df_new["Species"] = (df_new["P-Species"] > 0.5).astype(int)
df_new.head()
   Petal_Length  Petal_Width  P-Species  Species
0      4.717684     1.218695   0.995472        1
1      5.280952     1.292013   0.799899        1
2      5.610778     2.230056   0.000033        0
3      4.458715     1.907844   0.421614        0
4      4.822227     1.938929   0.061070        0
fig, ax = plt.subplots(1, 1, figsize=(8, 4))
ax.plot(df_subset[df_subset.Species == 0].Petal_Length.values,
df_subset[df_subset.Species == 0].Petal_Width.values, 's', label='virginica')
ax.plot(df_new[df_new.Species == 0].Petal_Length.values,
df_new[df_new.Species == 0].Petal_Width.values,
'o', markersize=10, color="steelblue", label='virginica (pred.)')
ax.plot(df_subset[df_subset.Species == 1].Petal_Length.values,
df_subset[df_subset.Species == 1].Petal_Width.values, 's', label='versicolor')
ax.plot(df_new[df_new.Species == 1].Petal_Length.values,
df_new[df_new.Species == 1].Petal_Width.values,
'o', markersize=10, color="green", label='versicolor (pred.)')
_x = np.array([4.0, 6.1])
ax.plot(_x, beta0 + beta1 * _x, 'k')
ax.set_xlabel('Petal length')
ax.set_ylabel('Petal width')
ax.legend(loc=2)
fig.tight_layout()

4. Time series
df = pd.read_csv("temperature_outdoor_2014.tsv", header=None, delimiter="\t", names=["time", "temp"])
df.time = pd.to_datetime(df.time, unit="s")
df = df.set_index("time").resample("H").mean()
df_march = df[df.index.month == 3]
df_april = df[df.index.month == 4]
df_march.plot(figsize=(12, 4));

plt.scatter(df_april[1:], df_april[:-1]);

plt.scatter(df_april[2:], df_april[:-2]);

plt.scatter(df_april[24:], df_april[:-24]);

fig, axes = plt.subplots(1, 4, figsize=(12, 3))
smg.tsa.plot_acf(df_march.temp, lags=72, ax=axes[0])
smg.tsa.plot_acf(df_march.temp.diff().dropna(), lags=72, ax=axes[1])
smg.tsa.plot_acf(df_march.temp.diff().diff().dropna(), lags=72, ax=axes[2])
smg.tsa.plot_acf(df_march.temp.diff().diff().diff().dropna(), lags=72, ax=axes[3])
fig.tight_layout()

model = sm.tsa.AR(df_march.temp)
result = model.fit(72)
sm.stats.durbin_watson(result.resid)
1.9985623006352833
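Note that sm.tsa.AR has since been deprecated and removed in newer statsmodels releases; there the equivalent fit uses AutoReg. A sketch, assuming statsmodels 0.11 or later:
from statsmodels.tsa.ar_model import AutoReg
result_autoreg = AutoReg(df_march.temp, lags=72).fit()
sm.stats.durbin_watson(result_autoreg.resid)  # should again be close to 2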
fig, ax = plt.subplots(1, 1, figsize=(8, 3))
smg.tsa.plot_acf(result.resid, lags=72, ax=ax)
fig.tight_layout()

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
ax.plot(df_march.index.values[-72:], df_march.temp.values[-72:], label="train data")
ax.plot(df_april.index.values[:72], df_april.temp.values[:72], label="actual outcome")
ax.plot(pd.date_range("2014-04-01", "2014-04-4", freq="H").values,
result.predict("2014-04-01", "2014-04-4"), label="predicted outcome")
ax.legend()
fig.tight_layout()
