大数据查询

a33add70 · linfang.wang · 45721de0 · a33add70 · a33add70
Commit a33add70 authored Apr 22, 2019 by linfang.wang
Show whitespace changes
Inline Side-by-side

Showing with 24 additions and 3 deletions

allocator.py mvp/allocator.py +1 -1

dhb.py mvp/dhb.py +23 -2

No files found.
--- a/mvp/allocator.py
+++ b/mvp/allocator.py
@@ -47,7 +47,7 @@ if __name__ == '__main__':
    #     'third_data_source#xy_pan_newqueryAorgAcount',
    #     'third_data_source#xy_pan_newqueryAsumAcount'
    # ]
-    dhb = dhb.dhb()
+    dhb = dhb.dhb(start_time_period='2019-01-19 11:00:00',end_time_period='2019-01-20 12:00:00')
    df=dhb.dhb_features_extract()
    label='target'
    # df=pd.read_csv('test.csv')

--- a/mvp/dhb.py
+++ b/mvp/dhb.py
@@ -236,8 +236,19 @@ class dhb:
          
        #print(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period))
        # use risk_analysis to extract data
-        dhb_loan = pd.read_sql(self.sql.replace('@start_time_period',self.start_time_period).replace('@end_time_period',self.end_time_period),mysqldb.engine_risk_analysis)
-        dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]] = dhb_loan[["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]].applymap(lambda x : value_map[x])      
+        print('-----get dhb features from risk_analysis---',datetime.datetime.now())
+        cnt=self.cnt_samples()
+        print('-----samples number is %d ' % cnt['cnt'][0])
+        res = []
+        tmp=pd.read_sql(
+            self.sql.replace('@start_time_period', self.start_time_period).replace('@end_time_period',
+                                                                                   self.end_time_period),
+            mysqldb.engine_risk_analysis, chunksize=10000)
+        for tt in tmp:
+            res.append(tt)
+        dhb_loan=pd.concat(res)
+        cols=["dhb_overview_dun_first_call_time", "dhb_overview_dun_last_call_time","dhb_overview_ntdun_first_call_time", "dhb_overview_ntdun_last_call_time"]
+        dhb_loan[cols] = dhb_loan[cols].applymap(lambda x : value_map[x])
        
        dhb_loan.loc[dhb_loan.dhb_last_60_and_90_days_ntdun_call_avg_duration >= 42,"dhb_last_60_and_90_days_ntdun_call_avg_duration"] = 42
        dhb_loan.loc[dhb_loan.dhb_overview_ntdun_call_duration_above60 >= 25,"dhb_overview_ntdun_call_duration_above60"] = 25
@@ -280,6 +291,16 @@ class dhb:
    #     df['bins'] = df.qcut(df['target'], q = 10, percision = 6, dupulicates='drop')
    #     df.groupby
    #     return 1
+    def cnt_samples(self):  # Count risk_analysis rows with applied_at in [start_time_period, end_time_period) that pass the filters below.
+        sql = '''
+            select count(1) as cnt
+            from risk_analysis
+            where applied_at >= '%s' and applied_at < '%s'
+            and transacted = 1
+            and dhb_flag =1
+            and datediff(now(),deadline) > 15
+        ''' % (self.start_time_period,self.end_time_period)  # bounds are pasted into the SQL text as-is; NOTE(review): assumes they are trusted datetime strings - confirm
+        return pd.read_sql(sql,mysqldb.engine_risk_analysis)  # result is a DataFrame; the count is read by the caller as cnt['cnt'][0]