decision-science / model_mvp / Commits

Commit fe8f7148
authored Apr 25, 2019 by linfang.wang
Add MySQL connection (加入MySQL 连接)
parent b980367e

Showing 6 changed files with 56 additions and 44 deletions (+56 -44):

data/datasource/dbquery.py      +2  -1
data/samples/__init__.py        +3  -1
data/samples/dhb.py             +37 -20
data/samples/features/dhb.csv   +0  -9
data/samples/sample.py          +6  -5
data/samples/yewudata.py        +8  -8

data/datasource/dbquery.py

@@ -8,7 +8,8 @@ def mysql_query(sql,engine_sql):
     :return:dataframe
     '''
     res = []
-    tmp = pd.read_sql(sql,engine_sql,chunksize=10000)
+    #== palo: each query may not return more than 10000 rows
+    tmp = pd.read_sql(sql,engine_sql,chunksize=5000)
     for tt in tmp:
         res.append(tt)
     return pd.concat(res)
\ No newline at end of file

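Note on this hunk: when chunksize is set, pandas returns an iterator of DataFrame chunks, and the commit lowers the chunk size to 5000 because the palo backend caps each fetch at 10000 rows. For reference, the whole helper after this change presumably reads roughly as the sketch below (the import and the docstring summary line are assumptions; the rest is taken from the hunk):

    import pandas as pd

    def mysql_query(sql, engine_sql):
        '''
        Run a SQL query and return the full result set as one DataFrame (assumed summary).
        :return: dataframe
        '''
        res = []
        # palo allows at most 10000 rows per query, so fetch in chunks of 5000
        tmp = pd.read_sql(sql, engine_sql, chunksize=5000)
        for tt in tmp:              # tmp is an iterator of DataFrame chunks
            res.append(tt)
        return pd.concat(res)       # note: raises ValueError if the query returned no rows
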
data/samples/__init__.py

@@ -4,3 +4,5 @@
 2. Deduplication criteria; map text to 0/1 or something else
 3. Parameter tuning: which parameters, and what standards for them
 '''
+import sample
+import yewudata
\ No newline at end of file

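A small aside on the two added imports (not part of the commit): `import sample` and `import yewudata` are Python 2 style implicit relative imports. If this package were run under Python 3, the equivalent lines would need to be explicit, roughly:

    # Python 3 equivalent of the two lines added above (explicit relative imports);
    # the commit itself uses the implicit Python 2 form.
    from . import sample
    from . import yewudata
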
data/samples/dhb.py

@@ -3,11 +3,12 @@ from data.samples import sample
 from data.datasource import dbquery
 from data.datasource.mysqldb import engine_risk_analysis
 from data.samples.yewudata import *
+import os
 '''
 Purpose: fetch 电话邦 (Dianhuabang) features and sample data; the data source is the risk-control analysis database
 '''
-feature_file_name = 'features/dhb.csv'
+feature_file_name = '/Users/wlf/PycharmProjects/model_mvp/data/samples/features/dhb.csv'
 def get_feature():
     return sample.get_feature_by_version(feature_file_name)

@@ -53,24 +54,40 @@ def query_sample(start_date,end_date,is_loan=True):
     }
     cols = ["dhb_overview_dun_first_call_time","dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time",
             "dhb_overview_ntdun_last_call_time"]
-    df[cols] = df[cols].applymap(lambda x:value_map[x])
-    df.loc[df.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42,"dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
-    df.loc[df.dhb_overview_ntdun_call_duration_above60 >= 25,"dhb_overview_ntdun_call_duration_above60"] = 25
-    df.loc[df.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800,"dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
-    df.loc[df.dhb_last_30_and_60_days_dun_call_in_duration >= 1600,"dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
-    df.loc[df.dhb_last_30_days_ntdun_call_total_duration >= 2500,"dhb_last_30_days_ntdun_call_total_duration"] = 2500
-    df.loc[df.dhb_last_30_days_ntdun_call_tel_total_nums >= 25,"dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
-    df.loc[df.dhb_last_30_days_dun_call_in_duration >= 1000,"dhb_last_30_days_dun_call_in_duration"] = 1000
-    df.loc[df.dhb_overview_ntdun_call_total_duration >= 3000,"dhb_overview_ntdun_call_total_duration"] = 3000
-    df.loc[df.dhb_overview_ntdun_call_in_times >= 25,"dhb_overview_ntdun_call_in_times"] = 25
-    df.loc[df.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000,"dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
-    df.loc[df.dhb_overview_dun_call_tel_total_nums >= 22,"dhb_overview_dun_call_tel_total_nums"] = 22
-    df.loc[df.dhb_last_30_days_dun_call_total_duration >= 1100,"dhb_last_30_days_dun_call_total_duration"] = 1100
-    df.loc[df.dhb_last_two_weeks_ntdun_call_in_duration >= 300,"dhb_last_two_weeks_ntdun_call_in_duration"] = 300
+    #== keep only features that actually exist in df.columns
+    cols = list(set(cols) & set(df.columns.tolist()))
+    if len(cols) > 0:
+        df[cols] = df[cols].applymap(lambda x:value_map[x])
+    cols = df.columns.tolist()
+    if 'dhb_last_60_and_90_days_ntdun_call_avg_duration' in cols:
+        df.loc[df.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42,"dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
+    if 'dhb_overview_ntdun_call_duration_above60' in cols:
+        df.loc[df.dhb_overview_ntdun_call_duration_above60 >= 25,"dhb_overview_ntdun_call_duration_above60"] = 25
+    if 'dhb_last_30_and_60_days_ntdun_call_total_duration' in cols:
+        df.loc[df.dhb_last_30_and_60_days_ntdun_call_total_duration >= 800,"dhb_last_30_and_60_days_ntdun_call_total_duration"] = 800
+    if 'dhb_last_30_and_60_days_dun_call_in_duration' in cols:
+        df.loc[df.dhb_last_30_and_60_days_dun_call_in_duration >= 1600,"dhb_last_30_and_60_days_dun_call_in_duration"] = 1600
+    if 'dhb_last_30_days_ntdun_call_total_duration' in cols:
+        df.loc[df.dhb_last_30_days_ntdun_call_total_duration >= 2500,"dhb_last_30_days_ntdun_call_total_duration"] = 2500
+    if 'dhb_last_30_days_ntdun_call_tel_total_nums' in cols:
+        df.loc[df.dhb_last_30_days_ntdun_call_tel_total_nums >= 25,"dhb_last_30_days_ntdun_call_tel_total_nums"] = 25
+    if 'dhb_last_30_days_dun_call_in_duration' in cols:
+        df.loc[df.dhb_last_30_days_dun_call_in_duration >= 1000,"dhb_last_30_days_dun_call_in_duration"] = 1000
+    if 'dhb_overview_ntdun_call_total_duration' in cols:
+        df.loc[df.dhb_overview_ntdun_call_total_duration >= 3000,"dhb_overview_ntdun_call_total_duration"] = 3000
+    if 'dhb_overview_ntdun_call_in_times' in cols:
+        df.loc[df.dhb_overview_ntdun_call_in_times >= 25,"dhb_overview_ntdun_call_in_times"] = 25
+    if 'dhb_last_60_and_90_days_ntdun_call_in_duration' in cols:
+        df.loc[df.dhb_last_60_and_90_days_ntdun_call_in_duration >= 1000,"dhb_last_60_and_90_days_ntdun_call_in_duration"] = 1000
+    if 'dhb_overview_dun_call_tel_total_nums' in cols:
+        df.loc[df.dhb_overview_dun_call_tel_total_nums >= 22,"dhb_overview_dun_call_tel_total_nums"] = 22
+    if 'dhb_last_30_days_dun_call_total_duration' in cols:
+        df.loc[df.dhb_last_30_days_dun_call_total_duration >= 1100,"dhb_last_30_days_dun_call_total_duration"] = 1100
+    if 'dhb_last_two_weeks_ntdun_call_in_duration' in cols:
+        df.loc[df.dhb_last_two_weeks_ntdun_call_in_duration >= 300,"dhb_last_two_weeks_ntdun_call_in_duration"] = 300
     return df

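All of the new guards in the second hunk follow one pattern: clip a feature at a fixed cap, but only when that column is present. An equivalent, more compact way to express that block (a sketch only; the column names and caps are copied from the diff, while the `feature_caps` mapping and the `cap_features` helper are invented here) would be:

    # Hypothetical refactor of the capping block above; not part of the commit.
    feature_caps = {
        "dhb_last_60_and_90_days_ntdun_call_avg_duration": 42,
        "dhb_overview_ntdun_call_duration_above60": 25,
        "dhb_last_30_and_60_days_ntdun_call_total_duration": 800,
        "dhb_last_30_and_60_days_dun_call_in_duration": 1600,
        "dhb_last_30_days_ntdun_call_total_duration": 2500,
        "dhb_last_30_days_ntdun_call_tel_total_nums": 25,
        "dhb_last_30_days_dun_call_in_duration": 1000,
        "dhb_overview_ntdun_call_total_duration": 3000,
        "dhb_overview_ntdun_call_in_times": 25,
        "dhb_last_60_and_90_days_ntdun_call_in_duration": 1000,
        "dhb_overview_dun_call_tel_total_nums": 22,
        "dhb_last_30_days_dun_call_total_duration": 1100,
        "dhb_last_two_weeks_ntdun_call_in_duration": 300,
    }

    def cap_features(df, caps=feature_caps):
        for col, cap in caps.items():
            if col in df.columns:                  # same guard as in the diff
                df.loc[df[col] >= cap, col] = cap  # clip values at the cap
        return df
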
data/samples/features/dhb.csv

@@ -157,12 +157,3 @@ dhb_overview_ntdun_call_total_duration,1
 dhb_overview_ntdun_call_total_times,1
 dhb_overview_ntdun_first_call_time,1
 dhb_overview_ntdun_last_call_time,1
-dhb_last_30_and_60_days_dun_call_duration_above60,2
-dhb_last_30_and_60_days_dun_call_duration_below15,2
-dhb_last_30_and_60_days_dun_call_duration_between15_and_30,2
-dhb_last_30_and_60_days_dun_call_in_duration,2
-dhb_last_30_and_60_days_dun_call_in_times,2
-dhb_last_30_and_60_days_dun_call_out_duration,2
-dhb_last_30_and_60_days_dun_call_out_times,2
-dhb_last_30_and_60_days_dun_call_tel_total_nums,2
-dhb_last_30_and_60_days_dun_call_total_duration,2

data/samples/sample.py

@@ -11,6 +11,7 @@ def get_features_from_file(feature_file_name):
     Read the features from the feature file
     :return: df,columns=['feature','version']
     '''
+    print('当前目录:',os.path.abspath('.'))
     df_feature = pd.read_csv(feature_file_name)
     return df_feature

@@ -53,7 +54,7 @@ def cal_sample_date(last_sample_max_date=None,passdue_day=15):
     :param passdue_day: the performance window being checked, e.g. samples observed at 15 days past due
     :return: start_date, end_date (the earliest and latest dates from which samples can be drawn)
     '''
-    base_date = datetime.date()
+    base_date = datetime.datetime.now().date()
     #== +5 because not every user's loan runs on a 30-day cycle; some may be 31 days, etc.
     #== the extracted sample data must not go beyond base_date
     base_date = base_date + relativedelta(days=-(passdue_day + 5),months=-1)

@@ -63,7 +64,7 @@ def cal_sample_date(last_sample_max_date=None,passdue_day=15):
     else:
         # compute with last_sample_max_date as the baseline
         if type(last_sample_max_date) == str:
-            last_sample_max_date = datetime.strptime(last_sample_max_date,'%Y-%m-%d').date()
+            last_sample_max_date = datetime.strptime(last_sample_max_date,'%Y-%m-%d %H:%M:%S').date()
         if last_sample_max_date >= base_date:
             last_sample_max_date = base_date
         start_date = last_sample_max_date

@@ -81,7 +82,7 @@ def read_record():
         df = pd.DataFrame(columns=cols)
     return df

-def get_record(model_name):
+def get_records(model_name):
     '''
     Fetch all iteration records for a given model
     :param model_name:

@@ -98,7 +99,7 @@ def get_last_record(model_name):
     :param model_name:
     :return:
     '''
-    df_select = get_record(model_name)
+    df_select = get_records(model_name)
     if df_select.shape == 0:
         return df_select
     return df_select.head(1)

@@ -125,7 +126,7 @@ def save_model_record(model_name,min_date=None,max_date=None,sample_cnt=None,
     df_all = read_record()
     df_all.reset_index(inplace=True)
     #== fetch the current records
-    df_record = get_record(model_name)
+    df_record = get_records(model_name)
     df_record = df_record[df_record.update_date == datetime.date()]
     cols = ['model_name','min_date','max_date','sample_cnt',
             'train_min_date','train_max_date','train_cnt','train_auc',

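To make the cal_sample_date change concrete: the old `datetime.date()` call raises a TypeError (it requires year/month/day arguments), while the new form takes today's date and then steps back one month plus (passdue_day + 5) days via relativedelta. A standalone sketch (the import style is assumed; the dates below are only examples):

    import datetime
    from dateutil.relativedelta import relativedelta

    passdue_day = 15
    base_date = datetime.datetime.now().date()
    base_date = base_date + relativedelta(days=-(passdue_day + 5), months=-1)
    # For a "today" of 2019-04-25 this yields 2019-03-05
    # (one month back to 2019-03-25, then 20 more days back).

    # The new strptime format likewise expects a full timestamp string:
    last_max = datetime.datetime.strptime('2019-03-01 00:00:00', '%Y-%m-%d %H:%M:%S').date()
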
data/samples/yewudata.py

@@ -43,8 +43,8 @@ def query_byloanid(loan_ids):
     :return: order_no,user_id,loan_id, user type, channel, application time, rejection reason, first-installment overdue days, max overdue days, number of loan installments
     '''
     sql = '''
-    select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day,
-    t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse,
+    select t1.loan_id,t1.user_id,t1.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
+    t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
     max(t2.passdue_day) as max_passdue_day
     from loan_application t1
     left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4

@@ -69,8 +69,8 @@ def query_by_orderno(order_nos):
     :return: order_no,user_id,loan_id, user type, channel, application time, rejection reason, first-installment overdue days, max overdue days, number of loan installments; reason in ['已放款','审核通过','审核未通过','黑名单']
     '''
     sql = '''
-    select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day,
-    t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse,
+    select t1.loan_id,t1.user_id,t1.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
+    t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
     max(t2.passdue_day) as max_passdue_day
     from loan_application t1
     left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4

@@ -98,8 +98,8 @@ def query_bydate(start_date,end_date,is_loan=True):
     '''
     if is_loan:
         sql = '''
-        select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day,
-        t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse,
+        select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
+        t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
         max(t2.passdue_day) as max_passdue_day
         from loan_application t1
         join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4

@@ -109,8 +109,8 @@ def query_bydate(start_date,end_date,is_loan=True):
        ''' % (start_date,end_date)
     else:
         sql = '''
-        select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no==1,t2.passdue_day,null) as passdue_day,
-        t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval==1,'审核通过','审核未通过')) as refuse,
+        select t1.loan_id,t1.user_id,t2.order_no,t2.contract_term,if(t2.term_no=1,t2.passdue_day,null) as passdue_day,
+        t1.applied_at,t1.applied_type,t1.applied_channel,if(t2.loan_id is not null,'已放款',if(t1.approval=1,'审核通过','审核未通过')) as refuse,
        max(t2.passdue_day) as max_passdue_day
        from loan_application t1
        left join loan_repay t2 on t1.loan_id=t2.loan_id and t2.repayment_status!=4

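The four hunks above all make the same two fixes: MySQL equality is written with a single `=` (the previous `==` is not valid MySQL syntax), and the first two queries now take order_no from loan_application (t1) rather than loan_repay (t2). A minimal usage sketch of how such a query might run through the chunked helper from dbquery.py (module, engine and table names as in the diff; the WHERE filter is purely illustrative):

    # Hypothetical usage; only mysql_query, engine_risk_analysis and the table
    # names come from the commit, the date filter below is made up.
    from data.datasource import dbquery
    from data.datasource.mysqldb import engine_risk_analysis

    sql = '''
    select t1.loan_id, t1.user_id, t1.order_no,
           if(t2.term_no=1, t2.passdue_day, null) as passdue_day
    from loan_application t1
    left join loan_repay t2 on t1.loan_id = t2.loan_id and t2.repayment_status != 4
    where t1.applied_at >= '2019-03-01'
    '''
    df = dbquery.mysql_query(sql, engine_risk_analysis)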