Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
model_mvp
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
decision-science
model_mvp
Commits
9763c9e6
Commit
9763c9e6
authored
Apr 19, 2019
by
linfang.wang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
等宽等频离散
parent
4791a5e8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
56 additions
and
38 deletions
+56
-38
datacal.py
data/analyis/datacal.py
+44
-37
drawplot.py
data/graph/drawplot.py
+12
-1
No files found.
data/analyis/datacal.py
View file @
9763c9e6
...
...
@@ -2,7 +2,6 @@ import pandas as pd
import
numpy
as
np
import
datetime
def
split_train_val
(
df
,
trainsplit
=
'random'
,
trainsplitRatio
=
0.8
,
sort_col
=
None
):
'''
切换df 为训练集 和 验证集
...
...
@@ -59,58 +58,62 @@ def cal_month(df,date_name,date_name_new):
return
df
# def cal_isometric(df,feature,bin=10,method=2):
# '''
# 等分计算,默认等频;等宽 1 ,等频 2 ,聚类 3
# :param df:
# :param feature:
# :param bin:
# :param method: 1:等宽;2:等频;3:聚类;默认2
# :return:
# '''
# if method==1:
#
def
cal_feature_grid
(
df
,
feature
,
bin
=
10
):
def
cal_feature_grid
(
df
,
feature
,
bin
=
10
,
method
=
2
):
'''
定义 N分位切割区间,负数单独一个区间,非负数N 切割
:param df:
数据离散计算,默认等频;等宽 1 ,等频 2
:param df:dataframe
:param feature:
:param bin:
:param method: 1:等宽;2:等频;3:聚类;默认2
:return:
'''
#== 等宽为数据max-min / bin 即每个区间的宽度是一样的
#== 存在数据每个区间数量不一致
tmp
=
df
.
copy
()
tmp
[
feature
]
.
fillna
(
-
1
,
inplace
=
True
)
num
=
tmp
[
feature
]
.
nunique
()
if
num
<
bin
:
feature_grid
=
sorted
(
set
(
tmp
[
feature
]
.
unique
()
.
tolist
())
|
set
([
-
0.00001
]))
tmp
[
feature
]
=
tmp
[
feature
]
.
astype
(
float
)
tmp
[
feature
]
.
fillna
(
-
1
,
inplace
=
True
)
# 默认负数为单独一个区间
num
=
df
[
feature
]
.
nunique
()
if
method
==
1
:
max
=
df
[
feature
]
.
max
()
if
max
<
0
:
max
=
0
if
num
<
bin
:
feature_grid
=
sorted
(
set
(
tmp
[
feature
]
.
unique
()
.
tolist
())
|
set
([
-
0.00001
]))
else
:
bin_index
=
[
max
*
i
/
bin
for
i
in
range
(
0
,
bin
+
1
)]
feature_grid
=
sorted
(
set
(
bin_index
)
|
set
([
-
99999
,
-
0.00001
]))
else
:
# == 负数单独一个区间,非负数n等份
bin_index
=
[
i
/
bin
for
i
in
range
(
0
,
bin
+
1
)]
feature_grid
=
sorted
(
set
(
tmp
[
tmp
[
feature
]
>=
0
][
feature
]
.
quantile
(
bin_index
))
|
set
([
-
99999
,
-
0.00001
]))
# 等频离散,保证每个区间的数量是尽量一致
if
num
<
bin
:
feature_grid
=
sorted
(
set
(
tmp
[
feature
]
.
unique
()
.
tolist
())
|
set
([
-
0.00001
]))
else
:
# == 负数单独一个区间,非负数n等份
bin_index
=
[
i
/
bin
for
i
in
range
(
0
,
bin
+
1
)]
feature_grid
=
sorted
(
set
(
tmp
[
tmp
[
feature
]
>=
0
][
feature
]
.
quantile
(
bin_index
))
|
set
([
-
99999
,
-
0.00001
]))
return
feature_grid
def
cal_accume
(
df
,
feature
,
target
,
bin
=
10
,
classes
=
[]):
'''
groupby(classes),feature bin 分位; 对各个分位的target进行 count,mean ,sum计算 和累计 count,mean ,sum
:param df:
:param feature:
:param target:
:param bin:
:param classes:
:return: 对feature 进行分段;计算每个区间的mean,count,sum 累计 count,
坏样本数量,坏样本比例
:return: 对feature 进行分段;计算每个区间的mean,count,sum 累计 count,
mean ,sum
'''
df_out
=
cal_univar
(
df
,
feature
,
target
,
bin
,
classes
=
classes
)
df_out
[
'acmCnt'
]
=
df_out
.
groupby
(
classes
)[
'count'
]
.
cumsum
()
df_out
[
'acm
Event
'
]
=
df_out
.
groupby
(
classes
)[
'sum'
]
.
cumsum
()
df_out
[
'acm
EventRate'
]
=
df_out
[
'acmEvent
'
]
/
df_out
[
'acmCnt'
]
df_out
[
'acm
Sum
'
]
=
df_out
.
groupby
(
classes
)[
'sum'
]
.
cumsum
()
df_out
[
'acm
Mean'
]
=
df_out
[
'acmSum
'
]
/
df_out
[
'acmCnt'
]
return
df_out
def
cal_univar
(
df
,
feature
,
target
,
bin
=
10
,
classes
=
[]):
'''
groupby(classes) 分组,对feature 进行bin 分位,对各个分位进行 count,mean
计算,累计count,mean
groupby(classes) 分组,对feature 进行bin 分位,对各个分位进行 count,mean
,sum计算
:param df: dataframe
:param feature: feature in df.columns
:param target: in df.columns eg: count(target) mean(target)
...
...
@@ -193,10 +196,16 @@ def cal_miss(df,feature,classes=[]):
if
feature
not
in
columns
:
raise
(
'no feature'
)
tmp
=
df
.
copy
()
tmp
[
feature
]
.
fillna
(
-
1
,
inplace
=
True
)
tmp
[
'flag'
]
=
'缺失值'
tmp
.
loc
[
tmp
[
feature
]
==
0
,
'flag'
]
=
'0值'
tmp
.
loc
[
tmp
[
feature
]
>
0
,
'flag'
]
=
'非0值'
try
:
tmp
[
feature
]
=
tmp
[
feature
]
.
astype
(
float
)
tmp
[
feature
]
.
fillna
(
-
1
,
inplace
=
True
)
tmp
[
'flag'
]
=
'缺失值'
tmp
.
loc
[
tmp
[
feature
]
==
0
,
'flag'
]
=
'0值'
tmp
.
loc
[
tmp
[
feature
]
>
0
,
'flag'
]
=
'非0值'
except
:
tmp
[
'flag'
]
=
'缺失值'
tmp
.
loc
[
tmp
[
feature
]
.
notna
(),
'flag'
]
=
'未缺失'
tmp
[
feature
]
.
fillna
(
'缺失'
,
inplace
=
True
)
headers
=
classes
+
[
'flag'
,
'cnt'
,
'match_rate'
]
if
len
(
classes
)
>
0
:
...
...
@@ -209,9 +218,7 @@ def cal_miss(df,feature,classes=[]):
df_gp
[
'match_rate'
]
=
np
.
round
(
df_gp
.
cnt1
/
df_gp
.
cnt
,
3
)
df_out
=
df_gp
else
:
all
=
[[
'非0值'
,
tmp
.
shape
[
0
],
round
(
tmp
[
tmp
[
feature
]
>
0
]
.
shape
[
0
]
/
tmp
.
shape
[
0
],
3
)],
[
'0值'
,
tmp
.
shape
[
0
],
round
(
tmp
[
tmp
[
feature
]
==
0
]
.
shape
[
0
]
/
tmp
.
shape
[
0
],
3
)],
[
'缺失值'
,
tmp
.
shape
[
0
],
round
(
tmp
[(
tmp
[
feature
]
<
0
)]
.
shape
[
0
]
/
tmp
.
shape
[
0
],
3
)]]
df_all
=
pd
.
DataFrame
(
all
,
columns
=
headers
)
df_out
=
df_all
df_out
=
tmp
.
groupby
(
'flag'
)[
feature
]
.
count
()
.
reset_index
()
.
rename
(
columns
=
{
feature
:
'cnt1'
})
df_out
[
'cnt'
]
=
tmp
.
shape
[
0
]
df_out
[
'match_rate'
]
=
np
.
round
(
df_out
[
'cnt1'
]
/
df_out
[
'cnt'
],
3
)
return
df_out
[
headers
]
\ No newline at end of file
data/graph/drawplot.py
View file @
9763c9e6
...
...
@@ -13,7 +13,18 @@ plt.rc('font',**font_options)
def
liftchart
(
df
,
x
,
y
,
classes
=
''
,
bin
=
10
,
title
=
''
,
xlabel
=
''
,
ylabel
=
''
):
'''
x:x轴;y:y轴
:param df:dataframe
:param x:
:param y:
:param classes:分组,str
:param bin:
:param title:
:param xlabel:
:param ylabel:
:return:
'''
# #== 单个TODO 待输出
# df_fig1=pd.pivot_table(df_out, index=classes, columns=['lbl', 'grid'],
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment