Commit b980367e authored by linfang.wang's avatar linfang.wang

优化sample

parent 912b21b8
'''
目标:查询百融样本数据,百融样本特征
'''
import pandas as pd import pandas as pd
from data.samples import sample
from data.datasource import dbquery
from data.datasource.mysqldb import engine_risk_analysis
from data.samples.yewudata import *
''' '''
目的:获取电话邦特征,样本数据,数据源为风控分析库 目的:获取电话邦特征,样本数据,数据源为风控分析库
''' '''
feature_file_name='features/dhb.csv' feature_file_name='features/dhb.csv'
def get_features_from_file(): def get_feature():
''' return sample.get_feature_by_version(feature_file_name)
从feature 文件中读取feature
:return: df,columns=['feature','version']
'''
df_feature=pd.read_csv(feature_file_name,sep='\t')
return df_feature
def get_feature_by_version(version=None): def query_sample(start_date,end_date,is_loan=True):
'''
根据feature 的版本号,获取该版本下的feature,如果不指定,则获取最新的版本号
:param version:int 负数为不指定或者
:return:list
''' '''
df_feature = get_features_from_file() 默认提取放款集
if (version ==None) or (version<1): :param start_date:
version=df_feature.version.max() :param end_date:
return df_feature[df_feature.version == version].feature.tolist() :return:样本数据
def save_features(features):
''' '''
针对新的feature,同维护的feature文档比较,如果同最新版的特征一样,那么无需保存,如果不一致,则作为新的一版特征进行保存 features=get_feature()
:param features:list if is_loan:
:return: sql='''
''' select loan_id,%s
f2=get_feature_by_version() from risk_analysis
if (set(f2) & set(features)) == (set(f2) | set(features)): where dhb_flag =1 and transacted=1 and applied=1
print('features are already newest,not need to save') and applied_at >='%s' and applied_at<'%s'
return True ''' % (','.join(features),start_date,end_date)
else: else:
#== 更新特征 sql='''
tmp=pd.DataFrame(features,columns=['feature']) select loan_id,%s
df_feature=get_features_from_file() from risk_analysis
version=df_feature.version.max()+1 where dhb_flag =1 and applied=1
tmp['version']=version and applied_at >='%s' and applied_at<'%s'
columns=['feature','version'] ''' % (','.join(features),start_date,end_date)
df_feature=pd.concat([df_feature[columns],tmp[columns]]) df=dbquery.mysql_query(sql,engine_risk_analysis)
df_feature[columns].to_csv(feature_file_name,index=None,encoding='utf8') yewu=query_byloanid(df.loan_id.tolist())
df=pd.merge(df,yewu,on='loan_id',how='inner')
value_map = {
"近3天": 1,
"近4-5天": 2,
"近6-7天": 3,
"近8-15天": 4,
"近16-30天": 5,
"近31-60天": 6,
"近61-90天": 7,
"近91-120天": 8,
"近121-150天": 9,
"近151-180天": 10,
"180天前": 11,
"无": 0
}
cols = ["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time", "dhb_overview_ntdun_first_call_time",
"dhb_overview_ntdun_last_call_time"]
df[cols] = df[cols].applymap(lambda x: value_map[x])
df.loc[
df.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42, "dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
df.loc[df.dhb_overview_ntdun_call_duration_above60 >= 25, "dhb_overview_ntdun_call_duration_above60"] = 25
df.loc[
df.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800, "dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
df.loc[
df.dhb_last_30_and_60_days_dun_call_in_duration >= 1600, "dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
df.loc[df.dhb_last_30_days_ntdun_call_total_duration >= 2500, "dhb_last_30_days_ntdun_call_total_duration"] = 2500
df.loc[df.dhb_last_30_days_ntdun_call_tel_total_nums >= 25, "dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
df.loc[df.dhb_last_30_days_dun_call_in_duration >= 1000, "dhb_last_30_days_dun_call_in_duration"] = 1000
df.loc[df.dhb_overview_ntdun_call_total_duration >= 3000, "dhb_overview_ntdun_call_total_duration"] = 3000
df.loc[df.dhb_overview_ntdun_call_in_times >= 25, "dhb_overview_ntdun_call_in_times"] = 25
df.loc[
df.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000, "dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
df.loc[df.dhb_overview_dun_call_tel_total_nums >= 22, "dhb_overview_dun_call_tel_total_nums"] = 22
df.loc[df.dhb_last_30_days_dun_call_total_duration >= 1100, "dhb_last_30_days_dun_call_total_duration"] = 1100
df.loc[df.dhb_last_two_weeks_ntdun_call_in_duration >= 300, "dhb_last_two_weeks_ntdun_call_in_duration"] = 300
return df
if __name__ == '__main__': # if __name__ == '__main__':
features=get_feature_by_version() # features=sample.get_feature_by_version(feature_file_name)
# features=features[1:10] # features=features[1:10]
save_features(features) # sample.save_features(features,feature_file_name)
This diff is collapsed.
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta
import os
'''
读取特征文件
'''
def get_features_from_file(feature_file_name):
    """Load the feature table stored in a CSV file.

    :param feature_file_name: path handed to ``pd.read_csv`` with default options.
    :return: DataFrame with the file's contents; the sibling helpers in this
        module read its ``feature`` and ``version`` columns.
    """
    return pd.read_csv(feature_file_name)
def get_feature_by_version(feature_file_name, version=None):
    """Return the feature names recorded under one version of a feature file.

    :param feature_file_name: path of the feature CSV, read through
        ``get_features_from_file``.
    :param version: int version to select. ``None`` or any value below 1
        selects the highest version present in the file.
    :return: list of the ``feature`` values whose ``version`` equals the
        selected version (empty when no row matches).
    """
    df_feature = get_features_from_file(feature_file_name)
    # Identity test first: None cannot take part in the numeric comparison.
    if version is None or version < 1:
        version = df_feature.version.max()
    return df_feature[df_feature.version == version].feature.tolist()
def save_features(features, feature_file_name):
    """Append ``features`` to the feature file as a new version when they changed.

    The given features are compared, as a set (order and duplicates ignored),
    with the features of the highest version already in the file. When they
    match nothing is written; otherwise the features are appended under
    ``max version + 1`` and the file is rewritten.

    :param features: list of feature names.
    :param feature_file_name: path of the feature CSV (columns ``feature``
        and ``version``).
    :return: True when the file already holds these features as its latest
        version; None after a new version has been written.
    """
    columns = ['feature', 'version']
    # Read the file once and derive both the latest feature set and the next
    # version number from the same snapshot.
    df_feature = get_features_from_file(feature_file_name)
    latest_version = df_feature.version.max()
    latest_features = df_feature[df_feature.version == latest_version].feature.tolist()

    if set(latest_features) == set(features):
        print('features are already newest,not need to save')
        return True

    # A file with a header but no rows yields NaN as max version; start at 1
    # instead of writing NaN as the version number.
    next_version = 1 if pd.isna(latest_version) else latest_version + 1
    new_rows = pd.DataFrame(features, columns=['feature'])
    new_rows['version'] = next_version
    merged = pd.concat([df_feature[columns], new_rows[columns]])
    merged.to_csv(feature_file_name, index=False, encoding='utf8')
def cal_sample_date(last_sample_max_date=None, passdue_day=15):
    """Compute the date window from which new samples can be extracted.

    The latest usable date (``base_date``) is today minus one month and
    ``passdue_day + 5`` days. Without ``last_sample_max_date`` the window is
    the three months ending at ``base_date``; otherwise it runs from
    ``last_sample_max_date`` (capped at ``base_date``) to ``base_date``.

    :param last_sample_max_date: newest date of the previous sample, either a
        ``datetime.date`` or a ``'%Y-%m-%d'`` string; None for no previous sample.
    :param passdue_day: number of overdue days that must already be
        observable for a sample, e.g. 15 for 15-day overdue performance.
    :return: ``(start_date, end_date)`` as ``datetime.date`` objects.
    :raises ValueError: if a string ``last_sample_max_date`` does not match
        ``'%Y-%m-%d'``.
    """
    # The file does `import datetime`, so the class must be reached through
    # the module: datetime.date.today(), not datetime.date().
    today = datetime.date.today()
    # +5 days because not every loan cycle is exactly 30 days (some are 31, etc.).
    # Extracted samples must never be newer than base_date.
    base_date = today + relativedelta(days=-(passdue_day + 5), months=-1)

    if last_sample_max_date is None:
        return base_date + relativedelta(months=-3), base_date

    if isinstance(last_sample_max_date, str):
        last_sample_max_date = datetime.datetime.strptime(
            last_sample_max_date, '%Y-%m-%d').date()
    elif isinstance(last_sample_max_date, datetime.datetime):
        # datetime and date objects cannot be ordered against each other.
        last_sample_max_date = last_sample_max_date.date()

    # Never start later than the newest date that already has performance data.
    start_date = min(last_sample_max_date, base_date)
    return start_date, base_date
def read_record():
    """Load the model iteration records from ``record.txt`` in the working directory.

    :return: the CSV contents as a DataFrame when the file exists; otherwise
        an empty DataFrame carrying the record columns.
    """
    record_path = 'record.txt'
    if not os.path.exists(record_path):
        # No record file yet: hand back an empty table with the expected columns.
        return pd.DataFrame(columns=[
            'model_name', 'min_date', 'max_date', 'sample_cnt',
            'train_min_date', 'train_max_date', 'train_cnt', 'train_auc',
            'test_min_date', 'test_max_date', 'test_cnt', 'test_auc', 'update_date',
        ])
    return pd.read_csv(record_path)
def get_record(model_name):
    """Return every iteration record of one model, newest ``update_date`` first.

    :param model_name: value matched against the ``model_name`` column.
    :return: DataFrame of the matching rows from ``read_record()``, sorted by
        ``update_date`` in descending order.
    """
    df = read_record()
    # Sort into a new frame: an in-place sort on the filtered slice triggers
    # pandas' SettingWithCopyWarning.
    return df[df.model_name == model_name].sort_values(['update_date'], ascending=False)
def get_last_record(model_name):
    """Return the most recent iteration record of the given model.

    :param model_name: model whose records are looked up via ``get_record``.
    :return: DataFrame with at most one row, the first row of ``get_record``
        (which sorts newest ``update_date`` first); empty when the model has
        no records.
    """
    # head(1) of an empty frame is already empty, so no separate empty check
    # is needed (the former `shape == 0` test compared a tuple to an int and
    # could never be true).
    return get_record(model_name).head(1)
def save_model_record(model_name, min_date=None, max_date=None, sample_cnt=None,
                      train_min_date=None, train_max_date=None, train_cnt=None, train_auc=None,
                      test_min_date=None, test_max_date=None, test_cnt=None, test_auc=None):
    """Create or update today's record for ``model_name`` in ``record.txt``.

    ``(model_name, update_date)`` acts as the unique key. If a row for this
    model and today's date exists, it is updated; otherwise a new row is
    added. Only arguments that are not None overwrite stored values.

    :param model_name: name of the model the record belongs to.
    :param min_date: stored in the ``min_date`` column.
    :param max_date: stored in the ``max_date`` column.
    :param sample_cnt: stored in the ``sample_cnt`` column.
    :param train_min_date: stored in the ``train_min_date`` column.
    :param train_max_date: stored in the ``train_max_date`` column.
    :param train_cnt: stored in the ``train_cnt`` column.
    :param train_auc: stored in the ``train_auc`` column.
    :param test_min_date: stored in the ``test_min_date`` column.
    :param test_max_date: stored in the ``test_max_date`` column.
    :param test_cnt: stored in the ``test_cnt`` column.
    :param test_auc: stored in the ``test_auc`` column.
    :return: None; the full record table is rewritten to ``record.txt``.
    """
    cols = ['model_name', 'min_date', 'max_date', 'sample_cnt',
            'train_min_date', 'train_max_date', 'train_cnt', 'train_auc',
            'test_min_date', 'test_max_date', 'test_cnt', 'test_auc', 'update_date']
    # Dates come back from the CSV as text, so the key is compared (and stored)
    # as an ISO 'YYYY-MM-DD' string rather than as a date object.
    today = datetime.date.today().isoformat()

    df_all = read_record().reset_index(drop=True)
    is_today = (df_all.model_name == model_name) & (df_all.update_date.astype(str) == today)

    if is_today.any():
        # Pull today's row(s) out of the table; they are re-appended after the update.
        df_record = df_all[is_today].copy()
        df_all = df_all[~is_today]
    else:
        # Build an actual one-row frame: assigning scalars to a zero-row frame
        # would leave it empty and nothing would be saved.
        df_record = pd.DataFrame([{'model_name': model_name, 'update_date': today}],
                                 columns=cols)

    new_values = {
        'min_date': min_date,
        'max_date': max_date,
        'sample_cnt': sample_cnt,
        'train_min_date': train_min_date,
        'train_max_date': train_max_date,
        'train_cnt': train_cnt,
        'train_auc': train_auc,
        'test_min_date': test_min_date,
        'test_max_date': test_max_date,
        'test_cnt': test_cnt,
        'test_auc': test_auc,
    }
    for name, value in new_values.items():
        df_record = __update__(df_record, name, value)

    pd.concat([df_all[cols], df_record[cols]]).to_csv('record.txt', index=False, encoding='utf8')
def __update__(df, name, value):
    """Set column ``name`` of ``df`` to ``value`` unless ``value`` is None.

    :param df: DataFrame modified in place.
    :param name: column to assign.
    :param value: new column value; None leaves ``df`` untouched.
    :return: the same ``df`` object that was passed in.
    """
    if value is None:
        return df
    df[name] = value
    return df
import pandas as pd import pandas as pd
from data.datasource.mysqldb import * from data.datasource.mysqldb import *
from data.datasource import dbquery from data.datasource import dbquery
''' '''
目的:提供业务数据,包括 order_no,loan_id,用户类型,支持策略用risk_info,渠道,申请时间,拒绝原因,首逾天数,最大逾期天数,放款期数 目的:提供业务数据,包括 order_no,loan_id,用户类型,支持策略用risk_info,渠道,申请时间,拒绝原因,首逾天数,最大逾期天数,放款期数
''' '''
...@@ -126,3 +127,6 @@ def query_bydate(start_date,end_date,is_loan=True): ...@@ -126,3 +127,6 @@ def query_bydate(start_date,end_date,is_loan=True):
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment