Commit e511a80c authored by 王家华

Modified parts of the model refit and plotting code

parent 38d4951f
from pyplotz.pyplotz import PyplotZ
from pyplotz.pyplotz import plt
from data.analyis import datacal
from tools import datacal
import seaborn as sns
import pandas as pd
......
"""
Created on Thu Apr 18 11:32:06 2019
@author: wangjiahua
@author: Jason Wang
"""
......@@ -10,54 +10,83 @@ import numpy as np
import pandas as pd
import seaborn as sns
############# plot config ###############
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226 # saved-figure resolution (DPI)
plt.rcParams['figure.dpi'] = 200 # display resolution (DPI)
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None, plot_tab=True, legend_list=None,
saved_path=None):
def topN_feature_importance(model, clf, title="untitled", save_path = './plots/', topN=20):
'''
instructions : visualization of pivot
plot feature importance sequence
Params :
model - plotting backend (e.g. the lightgbm module)
clf - trained booster / classifier
title, save_path, topN - plot title, output directory, number of features shown
'''
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226  # saved-figure resolution (DPI)
plt.rcParams['figure.dpi'] = 200  # display resolution (DPI)
fig, axs = plt.subplots(1, 1, figsize=(16, 9), linewidth=0.1)
plt.figure(figsize=(10, 6))
model.plot_importance(clf, max_num_features = topN)
plt.title("Feature Importances")
path = save_path + title + "featureImportance.png"
plt.savefig(path)
plt.show()
return path
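# Usage sketch (illustrative only): pass the plotting backend and a trained booster;
# `lgb` and `booster` below are assumptions, not objects defined in this module.
#
#     import lightgbm as lgb
#     png_path = topN_feature_importance(lgb, booster, title='dhb_refit_',
#                                        save_path='./plots/', topN=20)
#     # -> './plots/dhb_refit_featureImportance.png'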
def plot_table(dataset, auc, title='untitled', X_label=None, y_label=None,
tab_df=None, plot_tab=True, saved_path=None):
'''
instructions : visualization of pivot
Params :
dataset - pivot dataframe to plot (each column is a curve, the index is the x axis)
auc - auc list / dict, indexed the same way as the dataset columns
title - title of plot ('untitled' by default)
X_label - X axis label of plot
y_label - y axis label of plot
tab_df - optional dataframe for the attached table
plot_tab - whether to draw the table below the plot, True by default
saved_path - path to save the figure; None when the plot does not need to be saved
'''
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226  # saved-figure resolution (DPI)
plt.rcParams['figure.dpi'] = 100  # display resolution (DPI)
fig, axs = plt.subplots(1, 1, figsize=(6, 6), linewidth=0.1)
table_rows = dataset.columns
table_cols = dataset.index
# traverse each column of the dataframe
for i in table_rows:
x = table_cols
y = dataset[i]
axs.plot(x, y, marker='o', label=str(i) + ' AUC: ' + auc[i])
if plot_tab != False:
the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset.head()))],
axs.plot(x, y, label=str(i) + ' AUC: ' + str(auc[i]))
# if the table should be plotted
if plot_tab:
the_table = plt.table(cellText=[list(dataset.iloc[i, :].values) for i in range(len(dataset))],
rowLabels=table_rows,
colLabels=table_cols,
colWidths=[0.91 / (len(table_cols) - 1)] * len(table_cols),
loc='bottom')
plt.xticks([])
# otherwise, nothing to do here
the_table.auto_set_font_size(False)
the_table.set_fontsize(8)
the_table.set_fontsize(6)
fig.subplots_adjust(bottom=0.2)
plt.grid()
plt.ylabel(title)
if y_label is not None:
plt.ylabel(y_label)
if X_label is not None:
plt.xlabel(X_label)
plt.legend()
# plt.vlines(xrange(len(cols))0],y,color='lightgrey',linestyle='--')
plt.title(title)
plt.show()
return 1
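# Usage sketch (illustrative only; `pivot` is a hypothetical liftchart pivot whose
# columns are model names and whose index holds the score bins):
#
#     pivot = pd.DataFrame({'benchmark': [0.10, 0.06, 0.02],
#                           'refit':     [0.12, 0.05, 0.01]},
#                          index=['bin1', 'bin2', 'bin3'])
#     auc = {'benchmark': 0.71, 'refit': 0.74}
#     plot_table(pivot, auc, title='liftchart comparison', X_label='score decile',
#                y_label='overdue rate', plot_tab=False)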
def plot_curve_singleCurve(dataset, x_label = None, y_label = None,table_tab = None,
save_path = None, figure_arrangement = 11, fig_size = (4,3),
fig_title='General Plot', fig_name = 'untitled',
......@@ -144,8 +173,6 @@ def density_chart(dataset,title):
plt.show()
#
# alpha = 0.98 / 4 * fig_ith + 0.01
# ax.set_title('%.3f' % alpha)
......
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
'''
plot feature importance sequence
'''
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226  # saved-figure resolution (DPI)
plt.rcParams['figure.dpi'] = 200  # display resolution (DPI)
plt.figure(figsize=(10, 6))
classifier.plot_importance(clf, max_num_features=topN)
plt.title("Feature Importances")
plt.show()
......@@ -6,7 +6,8 @@ import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os,psutil
from tools import datacal
from graph import matplot
params_lgb = {
'task': 'train', # purpose
'application': 'binary', # binary classification
......@@ -43,7 +44,8 @@ def returnAUC(clf, training_set, validation_set, features, target='target'):
return train_auc, val_auc
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'):
def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target',
featureImportance_path = '../mvp/plots/', topN_featureImportance=20, featureImportance_title='lightgbm'):
'''
instructions : training lightgbm model with specified params
......@@ -68,6 +70,8 @@ def train_lgbm(params, df_train, df_val, features, adds_on=None, target='target'
lgbm = lgb.train(params, lgb_train, valid_sets=lgb_val, verbose_eval=False)
train_auc, val_auc = returnAUC(lgbm, df_train, df_val, features)
matplot.topN_feature_importance(lgb, lgbm, title=featureImportance_title,
save_path = featureImportance_path, topN=topN_featureImportance)
# auc = roc_auc_score(dev['target'],gbm.predict(dev[features]))
return train_auc, val_auc, lgbm
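# Usage sketch (illustrative only; df_train / df_val, the `features` list and the binary
# 'target' column are assumptions about the caller's data):
#
#     train_auc, val_auc, booster = train_lgbm(params_lgb, df_train, df_val, features,
#                                              adds_on={'max_depth': 5, 'num_leaves': 31},
#                                              target='target')
#     print('train AUC %.4f, val AUC %.4f' % (train_auc, val_auc))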
......@@ -117,9 +121,8 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
# replace the worst parameter set with a better combination
para['max_depth'] = deepth
para['num_leaves'] = leaves
optimal_para[topn.argmin()] = para
return optimal_para, lgb_train, lgb_val, topn
return optimal_para, topn
# training_curve.append(train_auc)
......@@ -163,18 +166,11 @@ def lgb_params_tuning(params, features, train, val, target='target', topN=3, cv_
# return 1
def topN_feature_importance(classifier, clf, topN=20, model=lgb):
'''
plot feature importance sequence
'''
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['savefig.dpi'] = 226  # saved-figure resolution (DPI)
plt.rcParams['figure.dpi'] = 200  # display resolution (DPI)
plt.figure(figsize=(10, 6))
classifier.plot_importance(clf, max_num_features=topN)
plt.title("Featurer Importances")
plt.show()
def predict(lgbm,df_test,features,target='target'):
predictions = lgbm.predict(df_test[features])
auc = roc_auc_score(df_test[target], predictions)
return predictions, auc
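# Usage sketch (illustrative only; `booster` is a trained Booster and df_test a
# hypothetical hold-out set containing `features` and a binary 'target' column):
#
#     predictions, test_auc = predict(booster, df_test, features, target='target')
#     df_test['predict'] = predictions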
def buildClf(params=params_lgb):
......@@ -183,7 +179,7 @@ def buildClf(params=params_lgb):
Params :
'''
return lgbm.LGBMClassifier(params)
return lgb.LGBMClassifier(**params)
def automodelfit(clf, param_grid, dftrain, features, resp, kfold=10, scoring='roc_auc'):
......
......@@ -210,84 +210,73 @@ class dhb:
and datediff(now(),deadline) > ''' + str(passdue_day) + '''
'''
def dhb_features_extract(self):
value_map = {
"近3天": 1,
"近4-5天": 2,
"近6-7天": 3,
"近8-15天": 4,
"近16-30天": 5,
"近31-60天": 6,
"近61-90天": 7,
"近91-120天": 8,
"近121-150天": 9,
"近151-180天": 10,
"180天前": 11,
"无": 0
}
# print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period))
# use risk_analysis to extract data
print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',
self.end_time_period))
dhb_loan = pd.read_sql(
self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period),
mysqldb.engine_risk_analysis)
dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
"dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[
["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
"dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]].applymap(
lambda x: value_map[x])
dhb_loan.loc[
dhb_loan.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42, "dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
dhb_loan.loc[
dhb_loan.dhb_overview_ntdun_call_duration_above60 >= 25, "dhb_overview_ntdun_call_duration_above60"] = 25
dhb_loan.loc[
dhb_loan.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800, "dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
dhb_loan.loc[
dhb_loan.dhb_last_30_and_60_days_dun_call_in_duration >= 1600, "dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
dhb_loan.loc[
dhb_loan.dhb_last_30_days_ntdun_call_total_duration >= 2500, "dhb_last_30_days_ntdun_call_total_duration"] = 2500
dhb_loan.loc[
dhb_loan.dhb_last_30_days_ntdun_call_tel_total_nums >= 25, "dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
dhb_loan.loc[
dhb_loan.dhb_last_30_days_dun_call_in_duration >= 1000, "dhb_last_30_days_dun_call_in_duration"] = 1000
dhb_loan.loc[
dhb_loan.dhb_overview_ntdun_call_total_duration >= 3000, "dhb_overview_ntdun_call_total_duration"] = 3000
dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_in_times >= 25, "dhb_overview_ntdun_call_in_times"] = 25
dhb_loan.loc[
dhb_loan.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000, "dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
dhb_loan.loc[dhb_loan.dhb_overview_dun_call_tel_total_nums >= 22, "dhb_overview_dun_call_tel_total_nums"] = 22
dhb_loan.loc[
dhb_loan.dhb_last_30_days_dun_call_total_duration >= 1100, "dhb_last_30_days_dun_call_total_duration"] = 1100
dhb_loan.loc[
dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(
time.time())) + " extracted dhb samples for the period " + self.start_time_period + " to " + self.end_time_period)
def dhb_features_extract(self,df):
try:
value_map = {
"近3天": 1,
"近4-5天": 2,
"近6-7天": 3,
"近8-15天": 4,
"近16-30天": 5,
"近31-60天": 6,
"近61-90天": 7,
"近91-120天": 8,
"近121-150天": 9,
"近151-180天": 10,
"180天前": 11,
"无": 0
}
# print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period))
# use risk_analysis to extract data
print('sql: ', self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',
self.end_time_period))
dhb_loan = pd.read_sql(
self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',self.end_time_period),
mysqldb.engine_risk_analysis)
dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
"dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[
["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time",
"dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]].applymap(
lambda x: value_map[x])
dhb_loan.loc[
dhb_loan.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42, "dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
dhb_loan.loc[
dhb_loan.dhb_overview_ntdun_call_duration_above60 >= 25, "dhb_overview_ntdun_call_duration_above60"] = 25
dhb_loan.loc[
dhb_loan.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800, "dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
dhb_loan.loc[
dhb_loan.dhb_last_30_and_60_days_dun_call_in_duration >= 1600, "dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
dhb_loan.loc[
dhb_loan.dhb_last_30_days_ntdun_call_total_duration >= 2500, "dhb_last_30_days_ntdun_call_total_duration"] = 2500
dhb_loan.loc[
dhb_loan.dhb_last_30_days_ntdun_call_tel_total_nums >= 25, "dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
dhb_loan.loc[
dhb_loan.dhb_last_30_days_dun_call_in_duration >= 1000, "dhb_last_30_days_dun_call_in_duration"] = 1000
dhb_loan.loc[
dhb_loan.dhb_overview_ntdun_call_total_duration >= 3000, "dhb_overview_ntdun_call_total_duration"] = 3000
dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_in_times >= 25, "dhb_overview_ntdun_call_in_times"] = 25
dhb_loan.loc[
dhb_loan.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000, "dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
dhb_loan.loc[dhb_loan.dhb_overview_dun_call_tel_total_nums >= 22, "dhb_overview_dun_call_tel_total_nums"] = 22
dhb_loan.loc[
dhb_loan.dhb_last_30_days_dun_call_total_duration >= 1100, "dhb_last_30_days_dun_call_total_duration"] = 1100
dhb_loan.loc[
dhb_loan.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
dhb_loan.to_csv("./dhb_loan_sample——" + str(datetime.date.today()) + ".csv")
print(time.strftime('%Y.%m.%d %H:%M:%S', time.localtime(
time.time())) + " extracted dhb samples for the period " + self.start_time_period + " to " + self.end_time_period)
# ignore exceptions such as "column doesn't exist"
except Exception as e:
print("data preprocessing ERR ",e)
pass
return dhb_loan
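# The long run of per-column caps above could be written once with a cap dictionary and
# Series.clip; a sketch of an equivalent form (cap values copied from the assignments above):
#
#     caps = {
#         "dhb_last_60_and_90_days_ntdun_call_avg_duration": 42,
#         "dhb_overview_ntdun_call_duration_above60": 25,
#         "dhb_last_30_and_60_days_ntdun_call_total_duration": 800,
#         # ... remaining columns and caps as listed above ...
#     }
#     for col, cap in caps.items():
#         dhb_loan[col] = dhb_loan[col].clip(upper=cap)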
'''
instructions : build a comparison
Params :
df - given test dataset
score - score column
target - label column
start_time_period -
end_time_period -
applied_type -
applied_from -
Returns :
auc comparison
liftchart plot
'''
def dhb_predict_with_pkl(self,test,pkl='./dhb_cuishou_jianzhi_v3.pkl',features=features):
open_file = open(pkl, "rb")
......@@ -327,7 +316,7 @@ class dhb:
def dhb_comparasion(df, score_BM='model_exec_data_source#dhb', score_predict='predict', target='target',applied_type=None, applied_from=None):
'''
instructions : comparison of previous dhb liftchart & auc
instructions : obtain online dhb score from mongodb
'''
# splitting data by applied_type & applied channel
df = df[df.applied_type == applied_type]
......@@ -337,15 +326,6 @@ class dhb:
df['bins_BM'] = pd.qcut(df[score_BM], q=10, precision=6, duplicates='drop')
## bins of predictions
df['bins_predict'] = pd.qcut(df[score_predict], q=10, precision=6, duplicates='drop')
pivot_BM = df[['bins_BM', target]].groupby('bins_BM')
pivot_predict = df[['bins_predict', target]].groupby('bins_predict')
# output liftchart & AUC
pivot_BM = pivot_BM.sum() / pivot_BM.count()
pivot_predict = pivot_predict.sum() / pivot_predict.count()
# concatenate the two pivots
pivot = pd.concat([pivot_BM, pivot_predict],axis = 1)
# pivottable plot
pivot.plot()
return 1
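# Usage sketch (illustrative only; df_test is a hypothetical scored test set that carries
# both the online benchmark score column and the new 'predict' column, and the filter
# values below are placeholders):
#
#     dhb_comparasion(df_test, score_BM='model_exec_data_source#dhb',
#                     score_predict='predict', target='target',
#                     applied_type=1, applied_from='333')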
......
import pandas as pd
import numpy as np
import datetime
from mvp import xgbreport
from mvp import lgbreport
from data.analyis import datacal
from tools import datacal
from models import xgboost
from models import lightgbm
from mvp import refit
from mvp import rebuild
from models_obj import dhb_obj
###### global variable ######
# label
target = 'target'
#############################
from mvp import dhb
# from mvp import dhb
from data.samples import dhb,sample
dhb = dhb.dhb()
dhb = dhb_obj.dhb()
df_sample = dhb.dhb_features_extract()
target = 'target'
features = dhb.features
df_sample[features] = df_sample[features].astype(float)
df_sample['target'] = df_sample['target'].astype(int)
......@@ -27,8 +35,6 @@ print('----no.',len(features),'of samples of dhb----')
# to save model performance
if __name__ == '__main__':
# data extraction
''' ## Old Edition here
......@@ -40,10 +46,10 @@ if __name__ == '__main__':
# else:
# df_train,df_test = datacal.train_test_split_general(df_sample, val_size=None, test_size=0.25, stratify='target', random_state=7)
'''
df_train, df_val, df_test = train_test_split_general()
# default sampling method
df_train, df_val, df_test = datacal.train_test_split_general()
# data manipulation
## TODO
......@@ -69,7 +75,7 @@ if __name__ == '__main__':
#lgbreport.report(df_train, df_test, df_val, features, target,'','dhb模型迭代报告.doc', kfold = 2)
# merge as single dataframe full of models
pd.DataFrame(xgb_model)
#pd.DataFrame(xgb_model)
......@@ -109,4 +115,63 @@ if __name__ == '__main__':
# test_min_date=dftest.applied_at.min(),test_max_date=dftest.applied_at.max(),test_cnt=dftest.shape[0])
#== xgboost gbtree
xgbreport.report(dftrain,dftest,dhb.get_feature(),'label','','xgboost_%s.doc' % datetime.datetime.now().date().strftime('%y%m%d'),kfold=2)
#################################################### report settings #############################################################################
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
# refit / rebuild sequence
# create the dhb (电话帮) object with default parameters
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15)
# extract samples
df_sample = dhb.dhb_features_extract()
# back up df_sample
df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# dhb data preprocessing
# report sequence
import pandas as pd
import numpy as np
import datetime
from data.analyis import filetool
from data.analyis import datacal
from models import lightgbm
from tools import filetool
from tools import datacal
from models_kit import lightgbm
from matplotlib import pyplot as plt
from data.graph import matplot
from graph import matplot
from models_obj import dhb_obj
from tools import datacal
import datetime
from models_kit import lightgbm
from models_kit import xgboost
import lightgbm as lgb
from graph import matplot
from tools import filetool
dhb = dhb_obj.dhb(features=None, sql=None, start_time_period=None, end_time_period=None,passdue_day=15)
# extract samples
#df_sample = dhb.dhb_features_extract()
######### temp #############
import pandas as pd
df_sample = pd.read_csv('E:\\model\\model_mvp\\mvp\\dhb_loan_sample——2019-04-23.csv',engine='python')
############################
# back up df_sample
df_sample.to_csv(str(datetime.date.today())+"dhb_samples.xlsx")
# default train / validation / test split
df_train, df_val, df_test = datacal.train_test_split_general(df_sample, val_size=0.2, test_size=0.2, stratify='target',
random_state=7,split_methods='random',
time_label='applied_at')
del df_sample
# use cross-validation to obtain the optimal parameters (optimal_para) and topn, the list of the best AUCs those parameters reach on the CV validation folds
optimal_para,topn = lightgbm.lgb_params_tuning(lightgbm.params_lgb, dhb.features, df_train, df_val, target='target',
topN=3, cv_fold=5)
print('topn - AUCs obtained by cross-validation on the training set: ', topn)
# train the model with the new parameters (optimal_para); adds_on is the dict of parameter overrides; the feature importance plot is also produced
train_auc, val_auc, lgbm = lightgbm.train_lgbm(lightgbm.params_lgb, df_train, df_val, dhb.features,
adds_on=optimal_para, target='target')
predictions ,test_auc = lightgbm.predict(lgbm,df_test,features=dhb.features)
df_test['predict'] = predictions
####### allocator cache ############
applied_from = {'1,214,217,198': '内部', '333': '融360', '159537': '360金融'}
applied_type = {'1,2':'首贷','1,2,3':'首付贷','1':'首申','2':'复申','3':'复贷'}
####################################
### report
# plot feature importance
path = matplot.topN_feature_importance(lgb, lgbm, title="untitled", save_path='./plots/', topN=20)
# report file
report_path = "E:\\bla\\"
report_name = "lgb_report.docx"
document = filetool.buildDocument(report_path, report_name)
document.add_heading('lightGBM algorithm refit report')
document.add_paragraph('Feature importance chart')
document.add_picture(path)
document.add_paragraph('univar_chart')
for i in dhb.features:
univar = datacal.cal_univar(df_train,score='raw_score')
univarChart = matplot.plot_table(univar,title= i +' univar Chart',saved_path='./plots/cache')
document.add_picture("./plots/cache" + i + ' univar Chart')
for i in dhb.features:
pdp = datacal.cal_pdp(df_test,score='predict')
pdpChart = matplot.plot_table(pdp,title= i +' PDP Chart',saved_path='./plots/cache')
document.add_picture("./plots/cache" + i + ' PDP Chart')
for i in dhb.features:
lift = datacal.cal_liftchart(df_test,score='predict')
liftChart = matplot.plot_table(lift, title=i +' lift Chart',saved_path='./plots/cache')
document.add_picture("./plots/cache" + i + ' lift Chart')
filetool.saveDocument(document, report_path, report_name)
......@@ -4,6 +4,31 @@ import datetime
from sklearn.model_selection import train_test_split
def liftchart(df,target='target',qcut=10,retbins=True):
'''
instructions : return the liftchart (overdue-rate) dataframe built with qcut & a pivot
Params :
df - dataframe (note: this must be the funded-loan sample set!)
target - label column
qcut - number of quantile bins
retbins - return bins interval when 'retbins' is True, else False
:return:
liftchart dataframe
'''
df = df.copy()
# create a bins column
df['bins'] = pd.qcut(df, q=qcut, precision=6, retbins=False, duplicates='drop')
pivot = df[['bins', target]].groupby('bins').agg(['mean', 'count'])
return pivot
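# A minimal sketch of how a liftchart is usually driven in this repo (illustrative; the
# scored dataframe and its hypothetical 'predict' / 'target' columns are assumptions,
# and the quantile bins are taken over the score column rather than over the whole frame):
#
#     scored = df_test.copy()
#     scored['bins'] = pd.qcut(scored['predict'], q=10, precision=6, duplicates='drop')
#     lift = scored[['bins', 'target']].groupby('bins').agg(['mean', 'count'])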
def train_test_split_general(dataset, val_size=0.2, test_size=0.2, stratify='target', random_state=7,
split_methods='random', time_label='applied_at'):
'''
......@@ -92,6 +117,20 @@ def cal_month(df,date_name,date_name_new):
return df
def cal_feature_grid(df,feature,bin=10,method=2):
'''
Define the N-quantile cut grid: negative values form their own bin, non-negative values are cut into N quantiles
......@@ -156,7 +195,7 @@ def cal_univar(df,feature,target,bin=10,classes=[]):
:return:
'''
if df.shape[0]==0:
raise('no date')
raise ValueError('no data')
columns=df.columns.tolist()
if target not in columns:
raise ValueError('not found %s' % target)
......@@ -167,9 +206,9 @@ def cal_univar(df,feature,target,bin=10,classes=[]):
tmp[feature].fillna(-1, inplace=True)
# == bin partitioning; the feature may be non-numeric
try:
tmp[feature]=tmp[feature].astype(float)
feature_grid = cal_feature_grid(tmp,feature,bin)
tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
tmp[feature] = tmp[feature].astype(float)
feature_grid = cal_feature_grid(tmp, feature, bin)
tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest = True)
tmp['grid'] = tmp['lbl'].cat.codes
except ValueError:
tmp['lbl']=tmp[feature]
......@@ -181,7 +220,7 @@ def cal_univar(df,feature,target,bin=10,classes=[]):
df_out=df_gp
else:
df_all = tmp.groupby(['grid','lbl']).agg({target: ['count', 'mean','sum']}).reset_index()
df_all.columns = ['grid','lbl', 'count', 'mean','sum']
df_all.columns = ['grid', 'lbl', 'count', 'mean', 'sum']
df_out = df_all
return df_out
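# Usage sketch (illustrative only; the dataframe and feature name are assumptions):
#
#     univar = cal_univar(df_train, feature='dhb_overview_ntdun_call_in_times',
#                         target='target', bin=10)
#     # univar is a per-bin summary of the target (count / mean / sum for each feature bin)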
......
......@@ -3,6 +3,12 @@ from docx import Document
from docx.shared import Inches
def buildDocument(path,filename):
'''
instructions : build a document writer
:param path: directory in which the document will be saved
:param filename: file name, must end with .doc or .docx
:return: a python-docx Document object
'''
if filename[-3:]!='doc':
if filename[-4:] !='docx':
raise ValueError('{} is not a word file'.format(filename))
......
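# Usage sketch for buildDocument (illustrative only; paths and names are assumptions),
# mirroring the refit script above:
#
#     document = buildDocument('./reports/', 'lgb_refit_report.docx')
#     document.add_heading('lightGBM algorithm refit report')
#     document.add_paragraph('Feature importance chart')
#     document.add_picture('./plots/dhb_refit_featureImportance.png')
#     # add_heading / add_paragraph / add_picture are python-docx Document methods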