-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstats.py
66 lines (51 loc) · 2.72 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/python
import statsmodels.formula.api as sm
import pandas as pd
import numpy as np
def df_impute_values_ols(adf,outvar,model, verbose=True):
"""Specify a Pandas DataFrame with some null (eg. np.nan) values in column <outvar>.
Specify a string model (in statsmodels format, which is like R) to use to predict them when they are missing. Nonlinear transformations can be specified in this string.
e.g.: model=' x1 + np.sin(x1) + I((x1-5)**2) '
At the moment, this uses OLS, so outvar should be continuous.
Two dfs are returned: one containing just the updated rows and a
subset of columns, and version of the incoming DataFrame with some
null values filled in (those that have the model variables) will
be returned, using single imputation.
AAh!! This bug https://github.com/statsmodels/statsmodels/issues/2171 wasted me hours. :( The latest release of Statsmodels is over a year old...
So this is written in order to avoid ANY NaN's in the modeldf. That should be less necessary in future versions.
To do:
- Add plots to verbose mode
- (It should have a discrete or at least binary outvar model option)
Issues:
- the "horrid kluge" line below will give trouble if there are column names that are part of other column names.
"""
formula=outvar+' ~ '+model
rhsv=[vv for vv in adf.columns if vv in model] # This is a horrid kluge! Ne
updateIndex= adf[pd.isnull(adf[outvar]) ] [rhsv].dropna().index
modeldf=adf[[outvar]+rhsv].dropna()
results=sm.ols(formula, data=modeldf).fit()
if verbose:
print results.summary()
newvals=adf[pd.isnull(adf[outvar])][rhsv].dropna()
newvals[outvar] = results.predict(newvals)
adf.loc[updateIndex,outvar]=newvals[outvar]
if verbose:
print(' %d rows updated for %s'%(len(newvals),outvar))
return(newvals, adf)
def test_df_impute_values_ols():
# Find missing values and fill them in:
df = pd.DataFrame({"A": [10, 20, 30, 324, 2353, np.nan],
"B": [20, 30, 10, 100, 2332, 2332],
"C": [0, np.nan, 120, 11, 2, 2 ]})
newv,df2=df_impute_values_ols(df,'A',' B + C ', verbose=True)
print df2
assert df2.iloc[-1]['A']==2357.5427562610648
assert df2.size==18
# Can we handle some missing values which also have missing predictors?
df = pd.DataFrame({"A": [10, 20, 30, 324, 2353, np.nan, np.nan],
"B": [20, 30, 10, 100, 2332, 2332, 2332],
"C": [0, np.nan, 120, 11, 2, 2, np.nan ]})
newv,df2=df_impute_values_ols(df,'A',' B + C + I(C**2) ', verbose=True)
print df2
assert pd.isnull( df2.iloc[-1]['A'] )
assert df2.iloc[-2]['A'] == 2352.999999999995