本文整理汇总了Python中statsmodels.formula.api.glm函数的典型用法代码示例。如果您正苦于以下问题：Python glm函数的具体用法？Python glm怎么用？Python glm使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了glm函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: regression
def regression(self):
    """Fit gender-only models of temporal interest and log them as LaTeX.

    Four models are fitted on ``self.people``, each with treatment-coded
    gender (reference level ``male``) as the only predictor: a
    negative-binomial GLM and an OLS fit for ``timeInterest``, then the same
    pair for ``timePosInterest``. For every model a heading, the AIC (plus
    the BIC for the ``timePosInterest`` models) and each summary table in
    LaTeX tabular form are written to ``self.logfile``.
    """
    print(self.people.head(n=1))
    # Per the original author's note, the fit does not work while a column
    # is literally named "class", so it is renamed (in place) first.
    self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True)

    log = self.logfile.write
    predictors = " ~ C(gender,Treatment(reference='male')) "
    # (heading, response column, fit as negative-binomial GLM?, also log BIC?)
    runs = (
        ("\n\n Sum Temp Interest NegBinom", "timeInterest", True, False),
        ("\n\n Sum Temp Interest OLS", "timeInterest", False, False),
        ("\n\n Pos Temp Interest NegBinom", "timePosInterest", True, True),
        ("\n\n Pos Temp Interest OLS", "timePosInterest", False, True),
    )
    for heading, response, negative_binomial, with_bic in runs:
        log(heading)
        formula = response + predictors
        if negative_binomial:
            fitted = glm(formula, data=self.people,
                         family=families.NegativeBinomial()).fit()
        else:
            fitted = ols(formula, data=self.people).fit()

        if with_bic:
            log("\n AIC " + str(fitted.aic))
            log("\n BIC " + str(fitted.bic))
        else:
            # The timeInterest reports carry no space after "AIC" and no BIC.
            log("\n AIC" + str(fitted.aic))

        for table in fitted.summary().tables:
            log(table.as_latex_tabular())
示例2: regression
def regression(self):
    """Fit gender-only models of region counts and temporal interest.

    Five models are fitted on ``self.people`` with treatment-coded gender
    (reference level ``male``) as the only predictor, in this order:
    negative-binomial GLM and OLS for ``numRegions``, OLS for
    ``timeInterest``, then negative-binomial GLM and OLS for
    ``timePosInterest``. Each model's heading, information criteria and
    LaTeX summary tables are written to ``self.logfile``.
    """
    print(self.people.head(n=1))
    # Per the original author's note, the fit does not work while a column
    # is literally named "class", so it is renamed (in place) first.
    self.people.rename(columns={'class': 'dbpedia_class'}, inplace=True)

    # Only gender is used as predictor; the original author left the
    # dbpedia_class and birth_century terms disabled, and noted that a beta
    # regression could be used for a normalized region entropy.
    gender_rhs = " ~ C(gender,Treatment(reference='male')) "
    # (heading, response column, model kind, also log BIC?)
    plan = (
        ("\n\n Num Regions NegativeBinomial", "numRegions", "negbin", True),
        ("\n\n Num Regions OLS", "numRegions", "ols", True),
        ("\n\n Sum Temp Interest", "timeInterest", "ols", False),
        ("\n\n Pos Temp Interest", "timePosInterest", "negbin", True),
        ("\n\n Pos Temp Interest OLS", "timePosInterest", "ols", True),
    )
    for heading, response, kind, with_bic in plan:
        self.logfile.write(heading)
        formula = response + gender_rhs
        if kind == "negbin":
            unfitted = glm(formula, data=self.people,
                           family=families.NegativeBinomial())
        else:
            unfitted = ols(formula, data=self.people)
        fitted = unfitted.fit()

        if with_bic:
            self.logfile.write("\n AIC " + str(fitted.aic))
            self.logfile.write("\n BIC " + str(fitted.bic))
        else:
            # The timeInterest report has no space after "AIC" and no BIC.
            self.logfile.write("\n AIC" + str(fitted.aic))

        for table in fitted.summary().tables:
            self.logfile.write(table.as_latex_tabular())
示例3: generate_regression_models
def generate_regression_models(df):
    """Fit one formula-based GLM per cause of death and return the fits.

    Every model regresses a death-count column of ``df`` on the same ten
    predictor columns. No ``family`` is passed to ``sm.glm``, so the
    library's default family applies.

    Returns
    -------
    list
        Fitted results in the order: heart disease, cancer, respiratory
        disease, diabetes.
    """
    shared_tail = (
        " + Primary_Care + No_Insurance + Median_Household_Income"
        " + College_Degrees + Long_Term_Care_Hospital_Admissions"
        " + Unemployed_Persons + Liquor_Stores"
    )
    # The first three terms are listed in a different order for two of the
    # models; that order is kept because it fixes the coefficient order.
    drinking_first = "Obesity + Binge_Drinking + Smoking" + shared_tail
    smoking_first = "Obesity + Smoking + Binge_Drinking" + shared_tail

    specs = (
        ("Heart_Disease_Deaths", drinking_first),
        ("Cancer_Deaths", drinking_first),
        ("Respiratory_Disease_Deaths", smoking_first),
        ("Diabetes_Deaths", smoking_first),
    )
    return [
        sm.glm(formula=response + " ~ " + predictors, data=df).fit()
        for response, predictors in specs
    ]
示例4: multiple_linear_regression
def multiple_linear_regression():
    '''Multiple linear regression

    chapter 6.3, p. 98

    Fits ``carbohydrate ~ age + weight + protein`` with OLS and again as a
    Gaussian GLM, then fits the reduced model without ``age``. The model
    summaries (and the ANOVA tables of the OLS fits) are printed; nothing
    is returned.
    '''
    # Load the data set through the project helper get_data().
    inFile = r'GLM_data/Table 6.3 Carbohydrate diet.xls'
    df = get_data(inFile)

    # do the fit, for the original model ...
    model = ols('carbohydrate ~ age + weight + protein', data=df).fit()
    print(model.summary())
    print(anova_lm(model))

    # as GLM
    # The result must not be bound to the name ``glm``: that would make
    # ``glm`` a local variable of this function and the call itself would
    # fail with UnboundLocalError.
    glm_fit = glm('carbohydrate ~ age + weight + protein',
                  family=Gaussian(), data=df).fit()
    print('Same model, calculated with GLM')
    # The confidence intervals are different than those from OLS.
    # The reason (from Nathaniel Smith):
    # OLS uses a method that gives exact results, but only works in the
    # special case where all the usual OLS criteria apply - iid Gaussian
    # noise etc. GLM instead uses an approximate method which is correct
    # asymptotically but may be off for small samples; the tradeoff you get
    # in return is that this method works the same way for all GLM models,
    # including those with non-Gaussian error terms and non-trivial link
    # functions. So that's why they're different.
    print(glm_fit.summary())

    # ... and for model 1
    model1 = ols('carbohydrate ~ weight + protein', data=df).fit()
    print(model1.summary())
    print(anova_lm(model1))
def logistic_regression(self, use_glm=True):
    """Fit Direction on Lag1..Lag5 and Volume and tabulate the predictions.

    With ``use_glm`` true a binomial GLM is fitted on ``self.df``; otherwise
    ``smf.logit`` is fitted on ``self.transformedDF``. The 0/1 predictions
    (cut at 0.5) are passed to ``tp.output_table`` together with the
    observed ``self.transformedDF[self.y_col]`` values.

    Original author's notes:
    (b) only Lag2 appears to be a statistically significant predictor.
    (c) Share of correct predictions: (54+557)/(54+557+48+430) = 56.1%.
        In weeks the market goes up the model is right most of the time,
        557/(557+48) = 92.1%; in weeks it goes down it is right only
        54/(430+54) = 11.2% of the time.
    """
    formula = "Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume"

    if use_glm:
        result = smf.glm(formula, data=self.df,
                         family=sm.families.Binomial()).fit()
        probs = result.fittedvalues
        # Per the original author's note, these fitted values are the
        # probability of the category at index 0, hence the inverted map.
        pred_values = probs.map(lambda prob: 0 if prob > 0.5 else 1)
    else:
        result = smf.logit(formula, data=self.transformedDF).fit()
        lag_and_volume = ["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]
        design = sm.add_constant(self.df[lag_and_volume])
        # Here the prediction is the probability of being 1.
        probs = Series(result.predict(design))
        pred_values = probs.map(lambda prob: 1 if prob > 0.5 else 0)

    observed = self.transformedDF[self.y_col].values
    tp.output_table(pred_values.values, observed)
示例6: test_all_methods
def test_all_methods(self):
    """Compare four classifiers that predict Direction from Lag2.

    Rows of ``self.df`` with ``Year`` in 1990-2008 are used for fitting and
    rows with ``Year`` after 2008 for evaluation. For each method the
    predictions and the observed ``self.y_col`` values of the evaluation
    rows are passed to ``tp.output_table``.
    """
    x_cols = ["Lag2"]
    formula = "Direction~Lag2"

    # Boolean-mask selection uses .loc; the .ix indexer is no longer
    # available in current pandas.
    in_training_years = (self.df["Year"] >= 1990) & (self.df["Year"] <= 2008)
    train_data = self.df.loc[in_training_years, :]
    test_data = self.df.loc[self.df["Year"] > 2008, :]

    # (d) logistic
    result = smf.glm(formula, data=train_data,
                     family=sm.families.Binomial()).fit()
    probs = Series(result.predict(sm.add_constant(test_data[["Lag2"]])))
    pred_values = probs.map(lambda x: "Down" if x > 0.5 else "Up")
    tp.output_table(pred_values.values, test_data[self.y_col].values)

    train_X = train_data[x_cols].values
    train_y = train_data[self.y_col].values
    test_X = test_data[x_cols].values
    test_y = test_data[self.y_col].values

    # (e) LDA
    lda_res = LDA().fit(train_X, train_y)
    tp.output_table(lda_res.predict(test_X), test_y)

    # (f) QDA
    qda_res = QDA().fit(train_X, train_y)
    tp.output_table(qda_res.predict(test_X), test_y)

    # (g) KNN with a single neighbour
    clf = neighbors.KNeighborsClassifier(1, weights="uniform")
    clf.fit(train_X, train_y)
    tp.output_table(clf.predict(test_X), test_y)

    # (h) logistic and LDA
    # (i) Original author's open question: is the purpose of the last
    #     exercise to go through all methods with no particular direction?
示例7: fit_model
def fit_model(self, df, filters, model_expression):
    """
    Fit a Poisson GLM to filtered data with statsmodels.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to use for fit. Should contain all the columns
        referenced in the `model_expression`.
    filters : list of str
        Any filters to apply before doing the model fit; they are handed
        to ``util.apply_filter_query``.
    model_expression : str
        A patsy model expression that can be used with statsmodels.
        Should contain both the left- and right-hand sides.

    Returns
    -------
    fit
        The results object returned by the statsmodels GLM ``fit()`` call.

    Raises
    ------
    ModelEvaluationError
        If the model's design matrix has a different number of rows than
        the filtered data.

    """
    filtered = util.apply_filter_query(df, filters)
    poisson_glm = smf.glm(
        formula=model_expression,
        data=filtered,
        family=sm.families.Poisson(),
    )

    design_rows = len(poisson_glm.exog)
    if design_rows != len(filtered):
        raise ModelEvaluationError(
            'Estimated data does not have the same length as input.  '
            'This suggests there are null values in one or more of '
            'the input columns.')

    with log_start_finish('statsmodels GLM fit', logger):
        fitted = poisson_glm.fit()
    return fitted
示例8: fit_with_logistic
def fit_with_logistic(self, threshold=0.5):
    """Fit a binomial GLM on the training set and tabulate test predictions.

    The formula regresses ``self.y_col`` on all of ``self.x_cols``. The
    predicted probabilities for ``self.test_set`` and the observed labels
    (encoded as 1 for ``'No'`` and 0 otherwise) are passed to
    ``tp.output_table_with_prob`` together with ``threshold``.
    """
    predictors = "+".join(self.x_cols)
    formula = "%s~%s" % (self.y_col, predictors)

    fitted = smf.glm(formula, data=self.train_set,
                     family=sm.families.Binomial()).fit()
    predict_probs = fitted.predict(exog=self.test_set)

    def encode(label):
        # 'No' is coded as 1, every other label as 0.
        if label == 'No':
            return 1
        return 0

    real_values = self.test_set[self.y_col].map(encode)
    tp.output_table_with_prob(
        predict_probs,
        real_values,
        threshold=threshold,
        zero_one_col_texts=["Yes", "No"],
    )
示例9: logistic_regression
def logistic_regression():
    '''Logistic regression example
    chapter 7.3, p 130

    Fits binomial GLMs with a two-column response to the beetle mortality
    table, using the default link and then the probit and cloglog links,
    and prints each summary followed by ``n * (1 - fitted value)``.

    [tbd]: the cloglog values are inconsistent with those mentioned in the
    book. This is probably due to the specific definitions of "loglog" and
    "cloglog" in the respective languages.
    '''
    df = get_data(r'GLM_data/Table 7.2 Beetle mortality.xls')

    # The Excel headers are unusual: keep only the part after the first
    # comma of each header, without its leading whitespace.
    df.columns = [header.split(',')[1].lstrip() for header in df.columns.values]

    # Fit once with explicitly named count columns ...
    df['tested'] = df['n']
    df['killed'] = df['y']
    df['survived'] = df['tested'] - df['killed']
    named_fit = glm('survived + killed ~ x', data=df, family=Binomial()).fit()
    print(named_fit.summary())

    print('-' * 65)
    print('Equivalent solution:')

    # ... then with the same response written inline, once per link.
    grouped_formula = 'I(n - y) + y ~ x'
    # (label printed before the fits, positional arguments for Binomial)
    link_variants = (
        ('Fits Logit:', ()),
        ('Fits Probit:', (links.probit,)),
        ('Fits Extreme Value:', (links.cloglog,)),
    )
    for label, family_args in link_variants:
        fitted = glm(grouped_formula, data=df,
                     family=Binomial(*family_args)).fit()
        print(fitted.summary())
        # Group size times the complement of the fitted value.
        fits = df['n'] * (1 - fitted.fittedvalues)
        print(label)
        print(fits)
示例10: pred
def pred(working, rating):
    """Fit and evaluate a logistic model of status on borrower rate.

    Rows of ``working`` whose ``prosper_rating`` equals ``rating`` are
    shuffled and split 60% / 20% / 20% into train, validation and test
    parts. A binomial GLM ``status ~ borrower_rate`` is fitted on the
    training part and the result of ``test_model`` on the test part is
    printed. Nothing is returned.
    """
    data = working[working['prosper_rating'] == rating]

    # Split recipe from
    # https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
    # expressed as positional slices of the shuffled frame.
    shuffled = data.sample(frac=1)
    train_end = int(.6 * len(data))
    validation_end = int(.8 * len(data))
    train = shuffled.iloc[:train_end]
    validation = shuffled.iloc[train_end:validation_end]
    test = shuffled.iloc[validation_end:]

    # Named fields keep every label next to the size it reports.
    print("total:{total} train:{train} validation:{validation} test:{test}".format(
        total=len(data),
        train=len(train),
        validation=len(validation),
        test=len(test),
    ))

    mod = smf.glm('status ~ borrower_rate', data=train,
                  family=sm.families.Binomial()).fit()
    print(test_model(mod, test))
示例11: report_glm
def report_glm(formula, data, verbose=True, **kwargs):
    """Fit a formula-based GLM, optionally print its summary, return the fit.

    Parameters
    ----------
    formula : str
        Model formula passed to ``smf.glm``.
    data
        Data set passed to ``smf.glm``.
    verbose : bool
        When true, the summary is printed surrounded by blank lines.
    **kwargs
        Forwarded to BOTH ``smf.glm(...)`` and ``.fit(...)``.
        NOTE(review): every extra keyword therefore has to be accepted by
        both calls - confirm this double forwarding is intended.

    Returns
    -------
    The fitted results object.
    """
    fitted = smf.glm(formula, data=data, **kwargs).fit(disp=False, **kwargs)
    summary = fitted.summary()
    if not verbose:
        return fitted
    print("""\n{summary}\n""".format(summary=summary))
    return fitted
示例12: test_logit
def test_logit(self):
    """Check the intercept of the binomial fit on the C13_2_logit data."""
    from statsmodels.formula.api import glm
    from statsmodels.genmod.families import Binomial

    raw = C13_2_logit.getData()
    fit_frame = C13_2_logit.prepareForFit(raw)

    fitted = glm('ok + failed ~ temp', data=fit_frame, family=Binomial()).fit()
    C13_2_logit.showResults(raw, fitted)

    intercept = fitted.params.Intercept
    self.assertAlmostEqual(intercept, -15.042902, places=5)
示例13: regression
def regression():
    '''Poisson regression example
    chapter 4.4, p.69

    Fits ``y ~ x`` as a Poisson GLM with the identity link and prints the
    summary of the fit.
    '''
    # Load the table through the project helper get_data().
    df = get_data(r'GLM_data/Table 4.3 Poisson regression.xls')

    # Build the model first, fit it only when the summary is requested.
    poisson_identity = Poisson(links.identity)
    unfitted = glm('y~x', family=poisson_identity, data=df)
    print(unfitted.fit().summary())
示例14: calculate_odds_ratio
def calculate_odds_ratio(genotypes, phen_vector1, phen_vector2, reg_type, covariates, response='', phen_vector3=''):
    """
    Runs the regression for a specific phenotype vector relative to the genotype data and covariates.

    :param genotypes: a DataFrame containing the genotype information; the
        columns ``y``, ``MaxAgeAtICD`` and (when used) ``phe`` are written
        into it in place
    :param phen_vector1: the phenotype vector, stored as column ``y``
    :param phen_vector2: values stored as column ``MaxAgeAtICD``
    :param reg_type: ``0`` fits ``smf.logit``; any other value fits ``smf.glm``
    :param covariates: a string containing all desired covariates
    :param response: optional name of the response column; when given the
        model is ``response ~ y + covariates`` instead of ``y ~ covariates``
    :param phen_vector3: optional extra vector; when it contains any truthy
        value it is stored as column ``phe`` and added as a predictor
    :type genotypes: pandas DataFrame
    :type phen_vector1: numpy array
    :type covariates: string
    :returns: ``(odds, p, od)`` where ``p`` is the p-value of the
        ``genotype`` term and ``od`` is ``[-log10(p), genotype coefficient,
        confidence interval as '[low,high]']``. If fitting fails for any
        reason the result is ``(0, nan, [nan, nan, nan])``.

    .. note::
        The covariates must be a string that is delimited by '+', not a list.
        If you are using a list of covariates and would like to convert it to the pyPhewas format, use the following::

            l = ['genotype', 'age'] # a list of your covariates
            covariates = '+'.join(l) # pyPhewas format

        The covariates that are listed here *must* be headers to your genotype CSV file.
    """
    data = genotypes
    data['y'] = phen_vector1
    data['MaxAgeAtICD'] = phen_vector2

    # The default for phen_vector3 is an empty string, which has no .any();
    # strings and None simply mean "no extra phenotype vector".
    if phen_vector3 is None or isinstance(phen_vector3, str):
        use_phe = False
    else:
        use_phe = bool(np.any(phen_vector3))
    if use_phe:
        data['phe'] = phen_vector3

    if response:
        if use_phe:
            f = response + '~ y + phe +' + covariates
        else:
            f = response + '~ y + ' + covariates
    else:
        if use_phe:
            f = 'y ~ phe +' + covariates
        else:
            f = 'y ~' + covariates

    try:
        if reg_type == 0:
            fitted = smf.logit(f, data).fit(method='bfgs', disp=False)
        else:
            fitted = smf.glm(f, data).fit(method='bfgs', disp=False)
        p = fitted.pvalues.genotype
        # Only the logit branch reports the model deviance as "odds".
        odds = fitted.deviance if reg_type == 0 else 0
        conf = fitted.conf_int()
        od = [-math.log10(p), fitted.params.genotype,
              '[%s,%s]' % (conf[0]['genotype'], conf[1]['genotype'])]
    except Exception:
        # Best effort: any failure of the fit (or of reading its results)
        # yields an empty result, but KeyboardInterrupt and SystemExit are
        # no longer swallowed.
        odds = 0
        p = np.nan
        od = [np.nan, np.nan, np.nan]
    return (odds, p, od)
示例15: senility_and_WAIS
def senility_and_WAIS():
    '''Another example of logistic regression.
    chapter 7.8, p 143

    Fits the ungrouped binomial model ``s ~ x`` and prints its summary.

    [tbd]: I don't understand how the "Binomial model" (grouped response)
    is supposed to work, in either language'''
    df = get_data(r'GLM_data/Table 7.8 Senility and WAIS.xls')

    # ungrouped
    ungrouped = glm('s ~ x', data=df, family=Binomial())
    print(ungrouped.fit().summary())