Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
model_mvp
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
decision-science
model_mvp
Commits
0c209f31
Commit
0c209f31
authored
Apr 04, 2019
by
linfang.wang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
计算类,分组统计
parent
ad260520
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
167 additions
and
0 deletions
+167
-0
datacal.py
data/analyis/datacal.py
+167
-0
No files found.
data/analyis/datacal.py
0 → 100644
View file @
0c209f31
import
pandas
as
pd
import
numpy
as
np
import
datetime
def cal_week(df, date_name, date_name_new):
    """Add a column holding the first day of the week for each row's date.

    Weeks start on Sunday (the same convention as ``strftime('%w')``, where
    Sunday is day 0). The frame is modified in place: ``date_name`` is
    converted to datetime and ``date_name_new`` is created or overwritten.

    :param df: DataFrame containing the column ``date_name``
    :param date_name: name of the date column, e.g. ``applied_at``
    :param date_name_new: name of the column that receives the week start
    :return: the same DataFrame; ``date_name_new`` holds ``datetime.date``
        values, or ``NaT`` where the source date is missing
    """
    df[date_name] = pd.to_datetime(df[date_name])
    # ``dayofweek`` counts from Monday=0, so shift by one to get the number of
    # days elapsed since the preceding Sunday. Missing dates stay NaN here and
    # become NaT below instead of crashing an integer cast.
    days_since_sunday = (df[date_name].dt.dayofweek + 1) % 7
    week_start = df[date_name] - pd.to_timedelta(days_since_sunday, unit='D')
    # ``.dt.date`` drops the time-of-day component.
    df[date_name_new] = week_start.dt.date
    return df
def cal_month(df, date_name, date_name_new):
    """Add a ``%y-%m`` (two-digit year, month) string column for a date column.

    The frame is modified in place: ``date_name`` is converted to datetime and
    ``date_name_new`` is created or overwritten.

    :param df: DataFrame containing the column ``date_name``
    :param date_name: name of the date column, e.g. ``applied_at``
    :param date_name_new: name of the column that receives the month label
    :return: the same DataFrame, with ``date_name_new`` formatted as ``%y-%m``
    :raises ValueError: if ``date_name`` is not a column of ``df``
    """
    if date_name not in df.columns:
        # A real exception with a complete format string; the previous code
        # tried to raise a plain str built from an incomplete '%' format.
        raise ValueError('not found %s' % date_name)
    df[date_name] = pd.to_datetime(df[date_name])
    df[date_name_new] = df[date_name].dt.strftime('%y-%m')
    return df
def cal_feature_grid(df, feature, bin=10):
    """Build the sorted list of bin edges used to cut ``feature``.

    Missing values are treated as ``-1``. Negative values get an interval of
    their own (closed on the right at ``-0.00001``); non-negative values are
    split at ``bin`` quantiles. When the feature has fewer than ``bin``
    distinct values, every distinct value becomes an edge instead.

    :param df: DataFrame containing the column ``feature``; it is not modified
    :param feature: name of a numeric column
    :param bin: number of quantile intervals for the non-negative values
    :return: sorted list of edges
    """
    # Work on a filled Series rather than a chained in-place fillna on a frame
    # copy, which is a no-op under pandas Copy-on-Write.
    values = df[feature].fillna(-1)

    if values.nunique() < bin:
        edges = set(values.unique().tolist()) | {-0.00001}
    else:
        # Negatives share one interval; non-negatives are split into quantiles.
        probs = [i / bin for i in range(bin + 1)]
        non_negative = values[values >= 0]
        if non_negative.empty:
            # No non-negative data: avoid feeding NaN quantiles into the grid.
            quantiles = []
        else:
            quantiles = non_negative.quantile(probs).dropna().tolist()
        edges = set(quantiles) | {-99999, -0.00001}
    return sorted(edges)
def cal_univar(df, feature, target, bin=10, classes=None):
    """Bin ``feature`` and compute count/mean of ``target`` per bin.

    Rows are grouped by ``classes`` (if any) plus the bin; within each group
    the count and mean of ``target`` are computed. Missing feature values are
    treated as ``-1``. If the feature cannot be converted to float, or the
    binning itself raises ``ValueError``, the raw feature values are used as
    the groups instead.

    :param df: DataFrame; it is not modified
    :param feature: column of ``df`` to bin
    :param target: column of ``df`` to aggregate (count and mean)
    :param bin: number of quantile bins, default 10
    :param classes: optional list of extra grouping columns
    :return: DataFrame with columns ``classes + ['grid', 'lbl', 'count', 'mean']``
        where ``lbl`` is the bin interval and ``grid`` its ordinal code
    :raises ValueError: if ``df`` is empty or a required column is missing
    """
    if df.shape[0] == 0:
        raise ValueError('no data')
    if target not in df.columns:
        raise ValueError('not found %s' % target)
    if feature not in df.columns:
        raise ValueError('not found %s' % feature)
    group_keys = list(classes) if classes else []

    tmp = df.copy()
    # Assign the result back: a chained in-place fillna does not reach the
    # frame under pandas Copy-on-Write.
    tmp[feature] = tmp[feature].fillna(-1)

    # The feature may be non-numeric; in that case fall back to raw values.
    try:
        tmp[feature] = tmp[feature].astype(float)
        feature_grid = cal_feature_grid(tmp, feature, bin)
        tmp['lbl'] = pd.cut(tmp[feature], feature_grid, include_lowest=True)
        tmp['grid'] = tmp['lbl'].cat.codes
    except ValueError:
        tmp['lbl'] = tmp[feature]
        tmp['grid'] = tmp[feature]

    group_keys = group_keys + ['grid', 'lbl']
    # observed=True keeps only the (grid, lbl) pairs that occur; otherwise the
    # categorical ``lbl`` expands to a cartesian product of zero-count rows.
    df_out = (
        tmp.groupby(group_keys, observed=True)
        .agg({target: ['count', 'mean']})
        .reset_index()
    )
    df_out.columns = group_keys + ['count', 'mean']
    return df_out
def cal_distribution(df, target, classes=None):
    """Compute count and mean of ``target``, optionally grouped by ``classes``.

    :param df: DataFrame; it is not modified
    :param target: column whose count and mean are computed
    :param classes: optional list of grouping columns; when empty or omitted a
        single summary row over the whole frame is returned
    :return: DataFrame with columns ``classes + ['count', 'mean']``
    :raises ValueError: if ``df`` is empty or ``target`` is not a column
    """
    if df.shape[0] == 0:
        raise ValueError('no data')
    if target not in df.columns:
        raise ValueError('not found %s' % target)

    group_keys = list(classes) if classes else []
    headers = group_keys + ['count', 'mean']

    if group_keys:
        df_out = df.groupby(group_keys).agg({target: ['count', 'mean']}).reset_index()
        # Flatten the two-level (target, stat) column index.
        df_out.columns = headers
    else:
        summary = [[df[target].count(), df[target].mean()]]
        df_out = pd.DataFrame(summary, columns=headers)
    return df_out[headers]
def cal_miss(df, feature, classes=None):
    """Summarise how a feature splits into zero, positive and missing values.

    Missing values are filled with ``-1``. Each row is then flagged as
    ``'0值'`` (== 0), ``'非0值'`` (> 0) or ``'缺失值'`` (everything else, i.e.
    negatives and filled missing values).

    ``miss_rate`` is computed as ``1 - (rows with this flag / total rows)``,
    rounded to 3 decimals, i.e. the share of rows NOT carrying the flag.
    NOTE(review): the name suggests the opposite; confirm the intended
    meaning with callers before changing it.

    :param df: DataFrame; it is not modified
    :param feature: numeric column of ``df`` to inspect
    :param classes: optional list of grouping columns; when empty or omitted
        the whole frame is summarised in three rows (one per flag)
    :return: DataFrame with columns ``classes + ['flag', 'cnt', 'miss_rate']``;
        ``cnt`` is the total row count of the group (or of the frame)
    :raises ValueError: if ``df`` is empty or ``feature`` is not a column
    """
    if df.shape[0] <= 0:
        raise ValueError('no data')
    if feature not in df.columns:
        raise ValueError('no feature')

    group_keys = list(classes) if classes else []
    headers = group_keys + ['flag', 'cnt', 'miss_rate']

    tmp = df.copy()
    # Assign the result back: a chained in-place fillna does not reach the
    # frame under pandas Copy-on-Write.
    tmp[feature] = tmp[feature].fillna(-1)
    tmp['flag'] = '缺失值'
    tmp.loc[tmp[feature] == 0, 'flag'] = '0值'
    tmp.loc[tmp[feature] > 0, 'flag'] = '非0值'

    if group_keys:
        # Per-group totals joined with per-(group, flag) counts.
        totals = (
            tmp.groupby(group_keys)[feature].count()
            .reset_index().rename(columns={feature: 'cnt'})
        )
        per_flag = (
            tmp.groupby(group_keys + ['flag'])[feature].count()
            .reset_index().rename(columns={feature: 'cnt1'})
        )
        df_out = pd.merge(totals, per_flag, on=group_keys, how='left')
        df_out['miss_rate'] = np.round(1 - df_out['cnt1'] / df_out['cnt'], 3)
    else:
        total = tmp.shape[0]
        values = tmp[feature]
        flag_masks = [
            ('非0值', values > 0),
            ('0值', values == 0),
            ('缺失值', values < 0),
        ]
        rows = [
            [flag, total, round(1 - int(mask.sum()) / total, 3)]
            for flag, mask in flag_masks
        ]
        df_out = pd.DataFrame(rows, columns=headers)
    return df_out[headers]
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment