Commit 88472943 authored by 张鹏程

1. 提交征信解析第一版

parents
import os
# Absolute, symlink-resolved form of the process's current directory.
FILE_PATH = os.path.realpath(os.curdir)

# Service base URLs (presumably shared deployment vs. local instance — confirm).
FEATURE_HOST = "http://172.20.1.131:23010"
FEATURE_HOST_MY = "http://localhost:23010"

# URL path of the report-analysis endpoint.
url_reportanalysis = "/report"
# -*- coding:utf-8 -*-
import tornado.web
class BaseHandler(tornado.web.RequestHandler):
    """Common base class for the service's request handlers."""

    def prepare(self):
        """Declare the response body as UTF-8 encoded JSON."""
        self.set_header('Content-Type', "application/json; charset=UTF-8")
# -*- coding:utf-8 -*-
import threading
from utils import JsonUtil
from handler.Base_Handler import BaseHandler
from service.PersonalInformation_Service import PersonalInformation # 个人基本信息
from service.InformationSummary_Service import InformationSummary # 信息概要
from service.TransactionDetails_Service import TransactionDetails # 信贷交易明细
from service.QueryInfomation_Service import QueryInfomation # 查询记录
from service.Base_Service import Result # 返回结果存储
from service.InitHtml_Service import InitHtml # 初始化HTML
class ReportAnalysis(BaseHandler):
    """POST endpoint that turns a credit-report HTML file into JSON."""

    def post(self):
        """Parse the file named by the ``filepath`` argument and write the result.

        Responses are built with ``JsonUtil.build_json``:

        * ``Code_Params_Error`` when ``filepath`` is missing or empty,
        * ``Code_File_Error`` when the file cannot be read,
        * ``Code_Analysis_Error`` when parsing raises in this thread,
        * ``Code_Success`` with the merged ``report`` otherwise.

        ``mssage`` is the keyword this handler has always passed to
        ``build_json``; it is kept unchanged so the response stays compatible.
        """
        self._filepath = self.get_argument('filepath', default=None)
        if not self._filepath:
            self.write(JsonUtil.build_json(code=JsonUtil.Constants.Code_Params_Error,
                                           mssage=JsonUtil.Constants.Msg_Params_Error.format('filepath', self._filepath)))
            self.flush()
            return

        # NOTE(review): the path comes straight from the request and is opened
        # as-is; confirm that callers are trusted or restrict it to a base dir.
        try:
            with open(self._filepath, 'rb') as f:
                htmlhandle = f.read()
        except IOError:
            self.write(JsonUtil.build_json(code=JsonUtil.Constants.Code_File_Error,
                                           mssage=JsonUtil.Constants.Msg_File_Error))
            self.flush()
            return

        try:
            html = InitHtml(htmlhandle)
            # Each parser is constructed with the section mapping and publishes
            # its output through Result.set_result.
            analyzers = (PersonalInformation,
                         InformationSummary,
                         TransactionDetails,
                         QueryInfomation)
            workers = [threading.Thread(target=analyzer, args=(html.menu_dict,))
                       for analyzer in analyzers]
            for worker in workers:
                worker.start()
            # Wait for every parser before reading the shared result; without
            # the join the response could be built from a partial result and
            # late writers could leak into the next request.
            for worker in workers:
                worker.join()
            result = Result.get_result()
            Result.clear_result()
            self.write(JsonUtil.build_json(report=result,
                                           code=JsonUtil.Constants.Code_Success,
                                           mssage=JsonUtil.Constants.Msg_Success))
            self.flush()
            self.finish()
        except Exception:
            self.write(JsonUtil.build_json(code=JsonUtil.Constants.Code_Analysis_Error,
                                           mssage=JsonUtil.Constants.Msg_Analysis_Error))
            self.flush()
# import json
# outpath = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/provinces.json'
# with open(outpath,"w+",encoding="utf-8") as f:
# _json = json.dumps(Result.get_result())
# f.write(_json)
# f.close()
# filepath = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/432325197803211379.html'
# filepath = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/13082119950823527X.htm'
# filepath = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/32052219780226051X.htm'
# filepath = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/140427198607030038.htm'
# filepath = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/330682198805122815.htm'
# filepath = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/440421198210078001.html'
# def run(self):
# if self._target is not None:
# self._return = self._target(*self._args, **self._kwargs)
#
# def get_return(self):
# try:
# return self._return
# except Exception:
# return None
#
# def join(self):
# threading.Thread.join(self)
# return self._return
\ No newline at end of file
# -*- coding:utf-8 -*-
import bs4
import re
from collections import defaultdict
class Result:
    """Class-level accumulator for the parsed report (HTML to JSON)."""

    # Shared by all callers; parsers merge their output into this dict.
    result = {}
    html = None

    @classmethod
    def set_result(cls, result):
        """Merge ``result`` into the shared dict; existing keys are overwritten."""
        cls.result.update(result)

    @classmethod
    def get_result(cls):
        """Return the shared dict itself (not a copy)."""
        return cls.result

    @classmethod
    def clear_result(cls):
        """Replace the shared dict with a new empty one and return it.

        The attribute is rebound instead of cleared in place, so a dict that
        was handed out earlier by ``get_result`` keeps its contents.
        """
        cls.result = {}
        return cls.result

    @classmethod
    def get_htmlMsg(cls, html):
        """Set ``cls.html`` to 1 if it is still ``None`` and return it.

        :param html: accepted but not used by the current implementation.
        """
        if cls.html is None:
            cls.html = 1
        return cls.html
class Base(object):
    """Helpers shared by the report-section parsers."""

    def __init__(self, html):
        """Store the section mapping.

        :param html: kept as ``self.menu_dict``; subclasses look section
            titles up in it.
        """
        self.menu_dict = html

    def get_menuDetail(self, tag):
        """Group elements under the "(一)title"-style sub-headings among them.

        An element whose text matches the heading pattern starts a new group
        and is not stored itself; elements before the first heading are
        dropped.

        :param tag: sequence of elements exposing ``stripped_strings``.
        :return: dict mapping heading title to the list of following elements.
        """
        heading = re.compile(r'(\([\u4E00-\u9FA5]\))[\ ]?(\w+)')
        groups = defaultdict(list)
        key = None
        for element in tag:
            is_heading = False
            for text in element.stripped_strings:
                found = heading.findall(text)
                if found:
                    key = found[0][1]
                    is_heading = True
            if key and not is_heading:
                groups[key].append(element)
        return dict(groups)

    def get_menuDetail_personalInformation(self, tag):
        """Group tables under the bold heading that precedes them.

        An element without a ``table`` is treated as a heading row (its ``b``
        text becomes the current key); any other element contributes its
        ``table`` to the current key. Tables seen before any heading are
        stored under ``None``.

        :return: ``defaultdict(list)`` mapping heading text to tables.
        """
        groups = defaultdict(list)
        key = None
        for element in tag:
            if element.table is None:
                if element.b:
                    key = element.b.get_text()
            else:
                groups[key].append(element.table)
        return groups

    def get_menuDetail_queryInformation(self, tag):
        """Same grouping as ``get_menuDetail_personalInformation``."""
        return self.get_menuDetail_personalInformation(tag)

    def get_table(self, tag):
        """Return the element children of ``tag.tbody`` as a list.

        Text nodes between the rows are skipped. An element without a
        ``tbody`` yields an empty list.
        """
        body = tag.tbody
        if body is None:
            return []
        return [child for child in body if isinstance(child, bs4.element.Tag)]

    def get_json(self, tag):
        """Convert alternating header/value rows into a dict.

        Rows at even positions supply the headers for the row that follows.
        When a value row has as many cells as there are headers, the cells are
        paired with the headers one by one; otherwise all cell texts are
        stored as a list under the first header.

        :param tag: sequence of row elements (as returned by ``get_table``).
        """
        result = {}
        headers = []
        for position, row in enumerate(tag):
            cells = row.find_all('td')
            if position % 2 == 0:
                headers = [''.join(td.stripped_strings)
                           for td in cells if isinstance(td, bs4.element.Tag)]
                continue
            if len(headers) == len(cells):
                for column, td in enumerate(cells):
                    if isinstance(td, bs4.element.Tag):
                        result[headers[column]] = ''.join(td.stripped_strings)
            else:
                result[headers[0]] = [td.get_text().strip() for td in cells]
        return result
# def get_json_tableList(self,tag):
# """支持 table 列表形式 存储
# ::return list
# """
# _result = []
# tag = tag.tbody
# for i in tag:
# td = tag.findAll('td')
# _v = []
# for t in td:
# if t.get('style'):
# for v in t.stripped_strings:
# _v.append(v)
# _result.append(_v)
# return _result
\ No newline at end of file
import bs4
from service.Base_Service import Base,Result
class InformationSummary(Base):
"""信息概要"""
def __init__(self, filepath):
    """Parse the "信息概要" section and publish it through ``Result``.

    :param filepath: forwarded to ``Base.__init__``, which stores it as
        ``self.menu_dict``.
    """
    Base.__init__(self, filepath)
    if '信息概要' not in self.menu_dict:
        # Section absent from this report: publish an explicit placeholder.
        Result.set_result({'信息概要': None})
        return
    self.menu_informationSummary = self.get_menuDetail(self.menu_dict['信息概要'])
    Result.set_result(self.get_Logic(self.menu_informationSummary))
def get_Logic(self, tag):
    """Run the parsers of every known sub-section present in ``tag``.

    :param tag: mapping of sub-section title to its elements.
    :return: dict merging the output of each parser that ran.
    """
    dispatch = (
        ('信用提示', (self.get_information_creditSummary, self.get_numberUnscramble)),
        ('逾期及违约信息概要', (self.get_information_overdueAndDefault,)),
        ('授信及负债信息概要', (self.get_information_uncLoanSummary,)),
    )
    merged = {}
    for title, parsers in dispatch:
        if title not in tag:
            continue
        for parse in parsers:
            merged.update(parse(tag, title))
    return merged
def get_information_creditSummary(self, tag, name):
    """Parse the first table of the ``name`` sub-section as the credit summary.

    The table is taken from the arguments rather than from instance state, so
    the method works for whatever mapping and title the caller supplies.

    :param tag: mapping of sub-section title to its elements.
    :param name: title of the sub-section to read.
    :return: ``{'信用汇总': <parsed table>}``.
    """
    first_table = tag[name][0]
    rows = self.get_table(first_table)
    return {'信用汇总': dict(self.get_json(rows))}
def get_numberUnscramble(self,tag,name):
"""信用提示 - 个人信用报告 “数字解读”"""
result = {}
tag = tag[name]
istrue = False
key = ''
if tag:
for i,t in enumerate(tag):
if isinstance(t,bs4.element.Tag):
if '个人信用报告 “数字解读”' in t.td.get_text():
istrue = True
key = t.td.get_text()
table = t.next_sibling.next_sibling
if istrue:
tag = table
if tag:
if tag.find_all('tbody'):
for i,t in enumerate(tag):
if isinstance(t,bs4.element.Tag):
table_tag = self.get_table(t)
_json = self.get_json(table_tag)
result[key] = _json
return result
def get_information_uncLoanSummary(self, tag, name):
    """Parse the summary tables of the credit/debt sub-section.

    A row whose first cell has an ``align`` attribute names the table that
    follows; a row containing a ``tbody`` is parsed and stored under the most
    recent name. A table that appears before any name row is skipped.

    :param tag: mapping of sub-section title to its elements.
    :param name: title of the sub-section to read.
    :return: dict mapping table name to parsed table.
    """
    result = {}
    key = None
    for row in tag[name] or ():
        if not isinstance(row, bs4.element.Tag):
            continue
        first_cell = row.td
        if first_cell is not None and first_cell.get('align'):
            key = first_cell.get_text()
        elif row.tbody and key is not None:
            result[key] = self.get_json(self.get_table(row))
    return result
def get_information_overdueAndDefault(self, tag, name):
    """Parse the overdue/default summary tables.

    A row whose first cell has an ``align`` attribute names the table that
    follows; a row containing a ``tbody`` is parsed and stored under the most
    recent name. The '逾期(透支)信息汇总' entry, when present, is renamed to
    '逾期信息汇总'; its absence is not an error.

    :param tag: mapping of sub-section title to its elements.
    :param name: title of the sub-section to read.
    :return: dict mapping table name to parsed table.
    """
    result = {}
    key = None
    for row in tag[name] or ():
        if not isinstance(row, bs4.element.Tag):
            continue
        first_cell = row.td
        if first_cell is not None and first_cell.get('align'):
            key = first_cell.get_text()
        elif row.tbody and key is not None:
            result[key] = self.get_json_overdueAndDefault(self.get_table(row))
    if '逾期(透支)信息汇总' in result:
        result['逾期信息汇总'] = result.pop('逾期(透支)信息汇总')
    return result
# Parse a table whose group titles are cells carrying a ``colspan``: every
# other cell is collected into chunks, and the i-th title ends up mapped to
# dict(zip(i-th key chunk, i-th body chunk)).
# NOTE(review): indentation is not visible in this view; confirm the nesting
# of the counters against the repository before restructuring this method.
def get_json_overdueAndDefault(self,tag):
"""解析table"""
# (docstring above: "parse the table")
column = [] # colspan value of each title cell
head = []  # text of each title cell
key = []  # chunks closed while line_id <= 1
body = []  # chunks closed afterwards
line_id = 0 # row counter
row_id = 0 # number of non-colspan cells seen so far
values = []  # chunk currently being filled
result = {}
for i,tds in enumerate(tag):
tds = tds.children
for ii ,td in enumerate(tds):
if isinstance(td,bs4.element.Tag):
if td.get('colspan'):
# title cell: remember its width and its text
column.append(int(td.get('colspan')))
tm = ''
for k in td.stripped_strings:
tm+=k
head.append(tm)
else:
# ordinary cell: append its text to the current chunk
tm = ''
for k in td.stripped_strings:
tm+=k
values.append(tm)
row_id += 1
# A chunk is closed whenever row_id is a multiple of column[line_id];
# presumably key chunks are the sub-header row and body chunks the value
# row — confirm against a real report.
if line_id<=1:
if row_id % column[line_id] == 0:
key.append(values)
values = []
else:
if row_id % column[line_id] == 0:
body.append(values)
values = []
line_id +=1
# Raises IndexError when fewer key/body chunks than titles were collected.
for i,h in enumerate(head):
result[h] = dict(zip(key[i],body[i]))
return result
if __name__ == '__main__':
    # The constructor requires the section mapping; calling it with no
    # argument can only raise TypeError. An empty mapping gives a minimal
    # smoke run through the "section missing" branch.
    InformationSummary({})
\ No newline at end of file
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup,Comment
import bs4,re
from collections import defaultdict
class InitHtml:
    """Load a report HTML document and split it by top-level headings."""

    def __init__(self, html):
        """Parse ``html`` and build ``self.menu_dict``.

        Comments and ``<style>`` elements are removed first. The rows of the
        ``tbody`` of the third non-newline child of ``<body>`` are then
        grouped by heading.

        :param html: markup (bytes or str) handed to BeautifulSoup.
        """
        soup = BeautifulSoup(html, 'lxml')
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        for style in soup("style"):
            style.extract()
        children = self.get_children(soup)
        # NOTE(review): index 2 assumes a fixed report layout — confirm.
        menu_detail = children[2].tbody.children
        self.menu_dict = self.get_menu(menu_detail)

    def get_children(self, soup):
        """Return the children of ``soup.body`` except bare newline strings."""
        return [child for child in soup.body.children if child != '\n']

    def get_menu(self, menu):
        """Group row elements under the top-level heading that precedes them.

        A row whose second child contains text matching "<CJK char> <word>"
        is a heading: the word becomes the current key and the row itself is
        not stored. Non-element nodes and rows before the first heading are
        skipped. Rows with fewer than two children cannot be headings and are
        stored as content.

        :param menu: iterable of nodes (the rows of the report table).
        :return: dict mapping heading title to the list of rows below it.
        """
        heading = re.compile(r'([\u4E00-\u9FA5])[\ ]{1,1}(\w+)')
        menu_dict = defaultdict(list)
        key = None
        for row in menu:
            if not isinstance(row, bs4.element.Tag):
                continue
            is_heading = False
            contents = row.contents
            if len(contents) > 1:
                for text in contents[1].stripped_strings:
                    found = heading.findall(text)
                    if found:
                        key = found[0][1]
                        is_heading = True
            if key and not is_heading:
                menu_dict[key].append(row)
        return dict(menu_dict)
# -*- coding:utf-8 -*-
import json
from collections import defaultdict
import bs4
import pandas as pd
from service.Base_Service import Base,Result
class PersonalInformation(Base):
    """Parser for the "个人基本信息" (personal basic information) section."""

    def __init__(self, html):
        """Parse the section and publish it through ``Result``.

        :param html: forwarded to ``Base.__init__``, which stores it as
            ``self.menu_dict``.
        """
        Base.__init__(self, html)
        if '个人基本信息' in self.menu_dict:
            self.menu_personalInformation = self.get_menuDetail_personalInformation(
                self.menu_dict['个人基本信息'])
            Result.set_result(self.get_Logic(self.menu_personalInformation))
        else:
            Result.set_result({'个人基本信息': None})

    def get_Logic(self, tag):
        """Run the parser of each known sub-section present in ``tag``."""
        result = {}
        if '身份信息' in tag:
            result.update(self.get_information_Identity(tag, '身份信息'))
        if '居住信息' in tag:
            result.update(self.get_information_Residential(tag, '居住信息'))
        if '职业信息' in tag:
            result.update(self.get_information_Occupation(tag, '职业信息'))
        return result

    def get_information_Identity(self, tag, name):
        """Identity information: merge every parsed record into one dict."""
        result = {name: {}}
        msg = self.get_json(tag[name][0].tbody.children)
        for records in msg.values():
            for record in records:
                result[name].update(record)
        return result

    def get_information_Residential(self, tag, name):
        """Residence information: the records of the first header group."""
        msg = self.get_json(tag[name][0].tbody.children)
        return {name: msg[0]}

    def get_information_Occupation(self, tag, name):
        """Occupation information: left-join the two record groups on '编号'.

        The JSON text is wrapped in ``StringIO`` because passing a literal
        JSON string to ``pandas.read_json`` is deprecated.
        """
        from io import StringIO

        result = {name: {}}
        msg = self.get_json(tag[name][0].tbody.children)
        if msg:
            basic = pd.read_json(StringIO(json.dumps(msg[0])))
            details = pd.read_json(StringIO(json.dumps(msg[1])))
            merged = pd.merge(basic, details, on='编号', how='left')
            result[name] = merged.to_dict(orient='records')
        return result

    def get_json(self, tag):
        """Turn table rows into lists of records keyed by header group.

        Cells without a ``style`` attribute contribute header texts, cells
        with one contribute values. When the accumulated headers no longer
        match the cell count of the current row, only the trailing headers
        are kept and the group number advances. Each row with values becomes
        ``dict(zip(headers, values))`` in the current group.

        :param tag: iterable of nodes; non-element nodes are skipped.
        :return: ``defaultdict(list)`` mapping group number to records.
        """
        result = defaultdict(list)
        key = []
        group = 0
        for tr in tag:
            if not isinstance(tr, bs4.element.Tag):
                continue
            tds = tr.find_all('td')
            value = []
            for td in tds:
                target = key if td.get('style') is None else value
                target.extend(td.stripped_strings)
            td_num = len(tds)
            if len(key) != td_num:
                key = key[len(key) - td_num:]
                group += 1
            if value:
                result[group].append(dict(zip(key, value)))
        return result
if __name__ == '__main__':
    # The constructor requires the section mapping; calling it with no
    # argument can only raise TypeError. An empty mapping gives a minimal
    # smoke run through the "section missing" branch.
    PersonalInformation({})
\ No newline at end of file
from collections import defaultdict
import bs4
from service.Base_Service import Base,Result
class QueryInfomation(Base):
    """Parser for the "查询记录" (query records) section."""

    def __init__(self, filepath):
        """Parse the section and publish it through ``Result``.

        :param filepath: forwarded to ``Base.__init__``, which stores it as
            ``self.menu_dict``.
        """
        Base.__init__(self, filepath)
        if '查询记录' in self.menu_dict:
            self.menu_queryInformation = self.get_menuDetail_personalInformation(
                self.menu_dict['查询记录'])
            Result.set_result(self.get_Logic(self.menu_queryInformation))
        else:
            Result.set_result({'查询记录': None})

    def get_Logic(self, tag):
        """Run the parser of each known sub-section present in ``tag``."""
        result = {}
        if '查询记录汇总' in tag:
            result.update(self.get_querySummary(tag, '查询记录汇总'))
        if '信贷审批查询记录明细' in tag:
            result.update(self.get_queryCreditApprovalDetail(tag, '信贷审批查询记录明细'))
        return result

    def get_querySummary(self, tag, name):
        """Query summary: label the cells of the third table row.

        Labels are built as ``<column label> + <group label>`` in the
        insertion order of the template below, and the cell texts are assigned
        to them positionally.
        """
        template = {'最近1个月内的查询机构数': {'贷款审批': None, '信用卡审批': None},
                    '最近1个月内的查询次数': {'贷款审批': None, '信用卡审批': None, '本人查询': None},
                    '最近2年内的查询次数': {'贷后管理': None, '担保资格审查': None, '特约商户实名审查': None}
                    }
        result = {name: {}}
        rows = self.get_table(tag[name][0])
        values = [''.join(td.stripped_strings)
                  for td in rows[2] if isinstance(td, bs4.element.Tag)]
        index = 0
        for group_label, columns in template.items():
            for column_label in columns:
                result[name][column_label + group_label] = values[index]
                index += 1
        return result

    def get_queryCreditApprovalDetail(self, tag, name):
        """Credit-approval query detail: records of the first header group.

        A table without any value row yields an empty list under ``name``.
        """
        result = {}
        result.update(self.get_json(tag[name][0].tbody.children))
        result[name] = result.pop(0, [])
        return result

    def get_json(self, tag):
        """Turn table rows into lists of records keyed by header group.

        Cells without a ``style`` attribute contribute header texts, cells
        with one contribute values. When the accumulated headers no longer
        match the cell count of the current row, only the trailing headers
        are kept and the group number advances. Each row with values becomes
        ``dict(zip(headers, values))`` in the current group.

        :param tag: iterable of nodes; non-element nodes are skipped.
        :return: ``defaultdict(list)`` mapping group number to records.
        """
        result = defaultdict(list)
        key = []
        group = 0
        for tr in tag:
            if not isinstance(tr, bs4.element.Tag):
                continue
            tds = tr.find_all('td')
            value = []
            for td in tds:
                target = key if td.get('style') is None else value
                target.extend(td.stripped_strings)
            td_num = len(tds)
            if len(key) != td_num:
                key = key[len(key) - td_num:]
                group += 1
            if value:
                result[group].append(dict(zip(key, value)))
        return result
if __name__ == '__main__':
    # The constructor requires the section mapping; calling it with no
    # argument can only raise TypeError. An empty mapping gives a minimal
    # smoke run through the "section missing" branch.
    QueryInfomation({})
\ No newline at end of file
import re
from collections import defaultdict
import bs4
import pandas as pd
from service.Base_Service import Base,Result
class TransactionDetails(Base):
"""信贷交易明细"""
def __init__(self, filepath):
    """Parse the "信贷交易信息明细" section and publish it through ``Result``.

    :param filepath: forwarded to ``Base.__init__``, which stores it as
        ``self.menu_dict``.
    """
    Base.__init__(self, filepath)
    if '信贷交易信息明细' not in self.menu_dict:
        Result.set_result({'信贷交易信息明细': None})
        return
    self.menu_transactionDetails = self.get_menuDetail(self.menu_dict['信贷交易信息明细'])
    # One list of per-account entries for every account type in the section.
    self.menu_transactionDetailsType = {}
    for account_type, rows in self.menu_transactionDetails.items():
        self.menu_transactionDetailsType[account_type] = self.get_menuDetailsList(rows)
    Result.set_result(self.get_Logic(self.menu_transactionDetailsType))
def get_Logic(self, tag):
    """Parse each account type present in ``tag`` and merge the outputs.

    ``get_DebitCard_Details`` always returns its rows under '贷记卡'. The
    rows of the '准贷记卡' section are therefore re-keyed here so that they
    no longer overwrite the '贷记卡' rows.

    :param tag: mapping of account type to its list of entries.
    :return: dict with one key per account type that was present.
    """
    result = {}
    if '贷款' in tag:
        result.update(self.get_Loan_Details(tag['贷款']))
    if '贷记卡' in tag:
        result.update(self.get_DebitCard_Details(tag['贷记卡']))
    if '准贷记卡' in tag:
        semi_credit = self.get_DebitCard_Details(tag['准贷记卡'])
        result['准贷记卡'] = semi_credit['贷记卡']
    return result
# Split the rows of one account type into per-account entries. Each entry is
# a list collected in _temp and appended to _result.
# NOTE(review): indentation is not visible in this view; confirm where the
# flush of _temp is nested before restructuring.
def get_menuDetailsList(self,tag):
"""交易信息明细小标题拆分"""
# (docstring above: "split transaction details by sub-heading")
_result = []
_temp = []
for i in tag:
if i.td.attrs:
try :
if i.div['align'] == 'left':
# left-aligned text of the form "<1-3 digits>.<text>" contributes two
# items: the number and the text after the dot
_temp.extend(list( re.findall('([0-9]\d{0,2})[\.]{1}(\S+)',i.div.get_text())[0]))
elif i.div['align'] == 'center':
_temp.append(i)
# NOTE(review): bare except — any failure above (no div, no align, no regex
# match) is recorded as a None placeholder.
except:
_temp.append(None)
else:
if i.table:
_temp.append(i) # row that carries the detail table
if _temp:
_result.append(_temp)
_temp = []
return _result
# Build one dict per loan entry and return them as {'贷款': [...]}.
# Each entry v is indexed as v[0] (serial number), v[1] (description text)
# and v[2] (row holding the detail table, or None).
# NOTE(review): indentation is not visible in this view; confirm the pairing
# of the if/else branches against the repository before restructuring.
# NOTE(review): the regex literals are not raw strings, so "\d", "\S" and
# "\(" rely on Python keeping unknown escapes — consider r"..." literals.
def get_Loan_Details(self,tag):
"""贷款明细拆分拆分"""
# (docstring above: "split loan details")
result = {'贷款':{}}
_result = []
for v in tag:
_json = {}
_json['序号'] =v[0]
_json['描述'] =v[0]+'.'+v[1]
# r1 tells whether the description mentions an account status
r1 = re.findall("账户状态为“(\S+)”",v[1])
if r1:
# groups: 0 issue date, 1 issuer, 2 amount, 3 text in parentheses,
# 4 loan type, 5 business number, 6 and 7 the two following comma-separated
# parts, 8 text after "截至", 9 account status
r = re.findall("(\d{4}年\d{1,2}月\d{1,2}日)([\S+]*)发放的(.*?)元\(([\S+]*)\)(.*?),业务号(.*?),(.*?),(.*?)。截至(.*?),账户状态为“(.*?)”",v[1])
if r:
r = r[0]
_json['银行名称'] = r[1]
_json['贷款金额'] = r[2]
_json['贷款类型'] = r[4]
# number of terms is the text before "期" in the first comma-separated part
terms = r[7].split(',')
if len(terms)>1:
term = re.findall('(.*?)期',terms[0])[0]
_json['贷款期数'] = term
else:
_json['贷款期数'] = None
_json['是否抵押'] = r[6]
_json['起始时间'] = r[0]
_json['终止时间'] = r[8]
_json['账户状态'] = r[9]
else:
# same leading groups; 8 is the date before "到期", 9 the date after "截至"
r = re.findall("(\d{4}年\d{1,2}月\d{1,2}日)([\S+]*)发放的(.*?)元\(([\S+]*)\)(.*?),业务号(.*?),(.*?),(.*?),(\d{4}年\d{1,2}月\d{1,2}日)到期。截至(\d{4}年\d{1,2}月\d{1,2}日)",v[1])
if r:
r = r[0]
_json['银行名称'] = r[1]
_json['贷款金额'] = r[2]
_json['贷款类型'] = r[4]
terms = r[7].split(',')
if len(terms)>1:
term = re.findall('(.*?)期',terms[0])[0]
_json['贷款期数'] = term
else:
_json['贷款期数'] = None
_json['是否抵押'] = r[6]
_json['起始时间'] = r[0]
# NOTE(review): stores the "截至" date (group 9), not the "到期" date
# (group 8) — confirm this is intended.
_json['终止时间'] = r[9]
if v[2] != None:
table_tag = self.get_table(v[2])
table_json = self.get_json_LoanDetails(table_tag)
key_name = table_json.keys()
plan = ''
overdue = ''
temp_json = {}
# find the "<YYYY年M月>-<YYYY年M月>的还款记录/逾期记录" titles among the keys
for name in key_name:
r = re.findall("(\d{4}年\d{1,2}月)-(\d{4}年\d{1,2}月)的([\S+]*)",name)
if r :
if r[0][2] == '还款记录':
temp_json['近24个月的还款记录-开始时间'] =r[0][0]
temp_json['近24个月的还款记录-截至时间'] =r[0][1]
plan = name
if r[0][2] == '逾期记录':
overdue = name
# the repayment-record title is renamed, the overdue-record title is dropped
if plan:
table_json['近24个月的还款记录-各期还款记录'] = table_json.pop(plan)
if overdue:
del table_json[overdue]
if temp_json:
table_json.update(temp_json)
if '逾期记录' in table_json.keys():
# drop placeholder rows ('--') and convert the amount to float
df = pd.DataFrame(table_json['逾期记录'])
df = df.loc[(df['逾期持续月数']!='--')&(df['逾期月份']!='--')&(df['逾期金额']!='--')]
df['逾期金额'] = df['逾期金额'].apply(lambda x : float(x.replace(',','')))
table_json['逾期记录'] = df.to_dict(orient='records')
table_json['近5年逾期记录'] = table_json.pop('逾期记录')
_json.update(table_json)
_result.append(_json)
result['贷款'] = _result
return result
# Build one dict per card entry and return them as {'贷记卡': [...]}.
# Each entry v is indexed as v[0] (serial number), v[1] (description text)
# and v[2] (row holding the detail table, or None).
# NOTE(review): the result key is always '贷记卡', whatever section the
# entries came from.
# NOTE(review): indentation is not visible in this view; confirm the nesting
# against the repository before restructuring.
def get_DebitCard_Details(self,tag):
"""贷记卡明细拆分"""
# (docstring above: "split credit-card details")
result = {'贷记卡':{}}
_result = []
for v in tag:
_json = {}
_json['序号'] =v[0]
_json['描述'] =v[0]+'.'+v[1]
# groups: 0 issue date, 1 issuer, 2 text before ",业务号", 3 business
# number, 4 credit limit, 5 shared credit limit, 6 following part,
# 7 date after "截至", 8 optional account-status clause
r = re.findall("(\d{4}年\d{1,2}月\d{1,2}日)(.*?)发放的(.*?),业务号(.*?),授信额度[折合人民币]{0,5}(.*?)元,共享授信额度[折合人民币]{0,5}(.*?)元,(.*?)[,|。]{1}截至(\d{4}年\d{1,2}月\d{1,2}日),(账户状态为“[\u4E00-\u9FA5]{1,2}”)*",v[1])
if r :
r = r[0]
_json['银行名称'] = r[1]
_json['授信金额'] = r[4]
_json['共享授信金额'] = r[5]
_json['授信时间'] = r[0]
_json['截止时间'] = r[7]
if r[8]:
r_temp = re.findall('账户状态为“(.*?)”',r[8])[0]
# NOTE(review): this key ends with a space, unlike '账户状态' used for
# loans — confirm whether consumers depend on it.
_json['账户状态 '] = r_temp
if v[2] != None:
table_tag = self.get_table(v[2])
table_json = self.get_json_DebitCard(table_tag)
key_name = table_json.keys()
plan = ''
overdue = ''
temp_json = {}
# find the "<YYYY年M月>-<YYYY年M月>的还款记录/逾期记录" titles among the keys
for name in key_name:
r = re.findall("(\d{4}年\d{1,2}月)-(\d{4}年\d{1,2}月)的([\S+]*)",name)
if r :
if r[0][2] == '还款记录':
temp_json['近24个月的还款记录-开始时间'] =r[0][0]
temp_json['近24个月的还款记录-截至时间'] =r[0][1]
plan = name
if r[0][2] == '逾期记录':
overdue = name
# the repayment-record title is renamed, the overdue-record title is dropped
if plan:
table_json['近24个月的还款记录-各期还款记录'] = table_json.pop(plan)
if overdue:
del table_json[overdue]
if temp_json:
table_json.update(temp_json)
if '逾期记录' in table_json.keys():
# drop placeholder rows ('--') and convert the amount to float
df = pd.DataFrame(table_json['逾期记录'])
df = df.loc[(df['逾期持续月数']!='--')&(df['逾期月份']!='--')&(df['逾期金额']!='--')]
df['逾期金额'] = df['逾期金额'].apply(lambda x : float(x.replace(',','')))
table_json['逾期记录'] = df.to_dict(orient='records')
table_json['近5年逾期记录'] = table_json.pop('逾期记录')
_json.update(table_json)
_result.append(_json)
result['贷记卡'] = _result
return result
# Convert the rows of a card detail table into a dict.
# Cells are split into "key" cells (the row has an ``align`` attribute, or the
# cell has colspan == '24') and "value" cells (everything else).
# NOTE(review): indentation is not visible in this view; where key/value are
# reset and where temp_key is set must be confirmed against the repository
# before this method is restructured.
# NOTE(review): get_json_LoanDetails is identical except that it tests
# ``td.get('align')`` instead of ``val.get('align')``.
def get_json_DebitCard(self,tag):
result = defaultdict(dict)
key = []
value = []
value_list = []  # never reset within this method
temp_key = None  # last key[0] that was stored with a list of values
for i,val in enumerate(tag):
tds = val.find_all('td')
for ii ,td in enumerate(tds):
if isinstance(td,bs4.element.Tag):
if val.get('align') or td.get('colspan') == '24':
tm = ''
for k in td.stripped_strings:
tm+= k
key.append(tm)
else:
tm = ''
for k in td.stripped_strings:
tm+= k
value.append(tm)
# value cell seen while no key is pending and a list-valued title exists:
# append it under result[<suffix of the title>][<ii-th stored value>]
if len(key) == 0 and temp_key:
value_list.append(tm)
r = re.findall('\d{4}年\d{1,2}月-\d{4}年\d{1,2}月的([\S+]*)',temp_key)
if r:
r = r[0]
if result[temp_key][ii] not in result[r].keys():
result[r][result[temp_key][ii]] = []
# NOTE(review): indexes value_list by ii although value_list keeps growing
# across rows — confirm this is intended.
result[r][result[temp_key][ii]].append(value_list[ii])
if key and value:
if len(key) == len(value):
# one value per key: store them pairwise
result.update(dict(zip(key,value)))
else:
# counts differ: store the whole value list under the first key
result.update(dict({key[0]:value}))
temp_key = key[0]
key=[]
value = []
return dict(result)
# Convert the rows of a loan detail table into a dict.
# Cells are split into "key" cells (the cell has an ``align`` attribute or
# colspan == '24') and "value" cells (everything else).
# NOTE(review): indentation is not visible in this view; where key/value are
# reset and where temp_key is set must be confirmed against the repository
# before this method is restructured.
# NOTE(review): get_json_DebitCard is identical except that it tests
# ``val.get('align')`` instead of ``td.get('align')``.
def get_json_LoanDetails(self,tag):
result = defaultdict(dict)
key = []
value = []
value_list = []  # never reset within this method
temp_key = None  # last key[0] that was stored with a list of values
for i,val in enumerate(tag):
tds = val.find_all('td')
for ii ,td in enumerate(tds):
if isinstance(td,bs4.element.Tag):
if td.get('align') or td.get('colspan') == '24':
tm = ''
for k in td.stripped_strings:
tm+= k
key.append(tm)
else:
tm = ''
for k in td.stripped_strings:
tm+= k
value.append(tm)
# value cell seen while no key is pending and a list-valued title exists:
# append it under result[<suffix of the title>][<ii-th stored value>]
if len(key) == 0 and temp_key:
value_list.append(tm)
r = re.findall('\d{4}年\d{1,2}月-\d{4}年\d{1,2}月的([\S+]*)',temp_key)
if r:
r = r[0]
if result[temp_key][ii] not in result[r].keys():
result[r][result[temp_key][ii]] = []
# NOTE(review): indexes value_list by ii although value_list keeps growing
# across rows — confirm this is intended.
result[r][result[temp_key][ii]].append(value_list[ii])
if key and value:
if len(key) == len(value):
# one value per key: store them pairwise
result.update(dict(zip(key,value)))
else:
# counts differ: store the whole value list under the first key
result.update(dict({key[0]:value}))
temp_key = key[0]
key=[]
value = []
return dict(result)
if __name__ == '__main__':
    # The constructor requires the section mapping; calling it with no
    # argument can only raise TypeError. An empty mapping gives a minimal
    # smoke run through the "section missing" branch.
    TransactionDetails({})
\ No newline at end of file
# -*- coding:utf-8 -*-
'''
服务启动接口
'''
from tornado.web import Application
from tornado.httpserver import HTTPServer
from tornado.ioloop import IOLoop
import tornado.log
import logging
import tornado.options
from config import settings as URL
# from handler import LoanDueRateHandler
from tornado.options import define, options
from handler import ReportAnalysis_Handler
from tornado.options import define, options
# Command-line options read by tornado.options.parse_command_line().
define("port", type=int, default=23010, help="run on the given port ")
define("log_path", type=str, default='/tmp', help="log path ")
class LogFormatter(tornado.log.LogFormatter):
    """Log formatter whose prefix carries time, file, function, line and level."""

    _FMT = '%(color)s[%(asctime)s %(filename)s:%(funcName)s:%(lineno)d %(levelname)s]%(end_color)s %(message)s'
    _DATEFMT = '%Y-%m-%d %H:%M:%S'

    def __init__(self):
        """Configure the parent formatter; nothing is written to stdout."""
        super(LogFormatter, self).__init__(fmt=self._FMT, datefmt=self._DATEFMT)
def apps():
    """Build the Tornado application with the report-analysis route."""
    routes = [
        (URL.url_reportanalysis, ReportAnalysis_Handler.ReportAnalysis),
    ]
    return Application(routes)
if __name__ == "__main__":
    # Read --port / --log_path, then serve the application until stopped.
    tornado.options.parse_command_line()
    app = apps()
    http_server = HTTPServer(app)
    http_server.bind(options.port)
    http_server.start()
    IOLoop.instance().start()
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup,Comment
from abc import ABCMeta,abstractmethod
import re
from lxml import etree
# path = "/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/432325197803211379.html"
# path = "/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/32052219780226051X.htm"
# Sample report parsed at import time by this scratch module.
path = "/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/test.html"
# The context manager closes the handle once the bytes are read; ``htmlfile``
# stays bound (closed) for any later reference.
with open(path, 'rb') as htmlfile:
    htmlhandle = htmlfile.read()
soup = BeautifulSoup(htmlhandle, 'lxml')
class Base(object):
    """Load the sample report and expose the children of its ``<body>``."""

    def __init__(self):
        """Read ``self.path``, strip HTML comments and store ``self.children``."""
        self.path = "/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/test.html"
        # Open the instance's own path and close the handle deterministically.
        with open(self.path, 'rb') as htmlfile:
            htmlhandle = htmlfile.read()
        soup = BeautifulSoup(htmlhandle, 'lxml')
        for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        self.children = self.get_children(soup)

    def get_children(self, soup):
        """Return the children of ``soup.body`` except bare newline strings."""
        return [child for child in soup.body.children if child != '\n']

    # Without an ABCMeta metaclass this decorator does not prevent
    # instantiation; subclasses that do not override the method inherit this
    # pass-through.
    @abstractmethod
    def get_json(self, detail, msgJson):
        """Default implementation: return ``msgJson`` unchanged."""
        return msgJson
class ReportDetail(Base):
    """Report details: report number, query request time and report time."""

    def __init__(self):
        """Collect the ``<b>`` elements of the second body child and parse them."""
        Base.__init__(self)
        detail = self.children[1].find_all('b')
        msgJson = {'报告编号': None, '查询请求时间': None, '报告时间': None}
        self.result = {'报告详情': self.get_json(detail, msgJson)}

    def get_json(self, detail, msgJson):
        """Fill ``msgJson`` with the text that follows "<label>:" in each element.

        A label that occurs in the text without the colon separator is left
        untouched instead of raising ``IndexError``.

        :param detail: iterable of elements exposing ``get_text()``.
        :param msgJson: dict whose keys are the labels to look for; updated in
            place and returned.
        """
        for element in detail:
            text = element.get_text()
            for label in list(msgJson):
                if label not in text:
                    continue
                parts = text.split(label + ':')
                if len(parts) > 1:
                    msgJson[label] = parts[1]
        return msgJson
class SelectDetail(Base):
    """Query information taken from the table of the third body child."""

    def __init__(self):
        """Collect the table rows and pass them through ``get_json``."""
        Base.__init__(self)
        rows = self.children[2].table.find_all('tr')
        parsed = self.get_json(rows, {})
        self.result = {'查询信息': parsed}
# r = ReportDetail()
# s = SelectDetail()
# Exploratory module-level script working on the ``soup`` parsed above.
# NOTE(review): ``AllDatail`` is not defined anywhere in the visible code, so
# this line raises NameError and nothing below it runs — confirm the intended
# class name.
a = AllDatail()
# children of <body> without the bare newline strings
children = []
for child in soup.body.children:
if child != '\n':
children.append(child)
d_identity = {}
table = soup.table
tbody = table.find_all('tbody')
tr_arr = table.find_all("tr")
# one list of <td> elements per <tr> of the first table
tds = []
for tr in tr_arr:
tds.append(tr.find_all('td'))
name = []
value = []
# result of this call is discarded
tds[0][0].div.get_text()
# Rows of the nested table: a row without a "WORD-BREAK: break-all" styled
# element contributes its <div> texts to ``name``, any other row to ``value``.
for i in range(len(tds[1][0].find_all('tr'))):
if tds[1][0].find_all('tr')[i].find(style="WORD-BREAK: break-all") == None:
span = tds[1][0].find_all('tr')[i].find_all('div')
for s in span:
name.append(s.get_text())
else:
span = tds[1][0].find_all('tr')[i].find_all('div')
for s in span:
value.append(s.get_text())
# pair the collected names and values positionally
d_identity = dict(zip(name,value))
# for child in children:
#
# # print(type(child))
# tr_arr = child.find_all("tr")
# for tr in tr_arr:
# print(tr)
#
# # print(child.find_all(text="性别"))
# print('----------')
# # print('--------')
# from lxml import etree
# html=etree.HTML(htmlhandle,etree.HTMLParser())
# print(html.text())
# print(html.xpath("//b[contains(text(),'一 个人基本信息')]"))
import urllib3
import sys
import bs4
import chardet
# reload(sys)
# sys.setdefaultencoding('utf-8')
# Copy a local sample report to a second file, re-decoding it when the
# detected and the declared encodings differ.
# NOTE(review): ``url`` is never used; the source is a hard-coded local file.
# NOTE(review): ``result`` comes from open(); file objects provide no info()
# method, so the try block presumably always ends in the except branch —
# confirm what the intended source object is.
# NOTE(review): the output file is opened in text mode ('w') while ``content``
# is bytes; writing it unchanged would raise TypeError on Python 3.
def download(url):
htmlfile = open('/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/432325197803211379_1.html','w')
try:
result = open('/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/432325197803211379.html', 'rb')
content = result.read()
info = result.info()
result.close()
except Exception as e:
print ('download error!!!')
print (e)
else:
if content != None:
charset1 = (chardet.detect(content))['encoding'] # encoding detected from the bytes
charset2 = info.getparam('charset') # encoding declared by the source
print (charset1,' ', charset2)
# case 1: both encodings are known and they differ
if charset1 != None and charset2 != None and charset1.lower() != charset2.lower():
newcont = bs4.BeautifulSoup(content, from_encoding='GB18030') # decode as GB18030
for cont in newcont:
htmlfile.write('%s\n'%cont)
# case 2: an encoding is unknown, or both are the same
else:
htmlfile.write(content) # written as read
htmlfile.close()
if __name__ == "__main__":
    # Script entry point. download() has its network fetch commented out and
    # reads a local file, so this URL is only a placeholder argument.
    download('//www.jb51.net')
\ No newline at end of file
{
"cells": [
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup,Comment\n",
"import bs4\n",
"from abc import ABCMeta,abstractmethod\n",
"from collections import defaultdict\n",
"import re\n",
"from lxml import etree"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"def get_children(soup):\n",
" children = []\n",
" for child in soup.body.children:\n",
" if child != '\\n':\n",
" children.append(child)\n",
" return children\n",
"\n",
"def get_menu(menu):\n",
" \"\"\"按报告标题大标题拆分\"\"\"\n",
" menu_dict = defaultdict(list)\n",
" index = []\n",
" key = None\n",
" for ii ,i in enumerate(menu):\n",
" if isinstance(i,bs4.element.Tag):\n",
" con = i.contents\n",
" if con :\n",
" menu = con[1]\n",
" # menu = i.contents[1]\n",
" for mm,m in enumerate(menu.stripped_strings):\n",
" # ret = re.findall(r\"[一|二|三|四|五] .*\", m)\n",
" ret=re.findall('([\\u4E00-\\u9FA5])[\\ ]{1,1}(\\w+)',m)\n",
" if ret:\n",
" key = ret[0][1]\n",
" index.append(ii)\n",
"\n",
" if key:\n",
" if ii not in index:\n",
" menu_dict[key].append(i)\n",
" return dict(menu_dict)\n",
" \n",
"def get_menuDetail(tag):\n",
" \"\"\"按报告内部小标题拆分\"\"\"\n",
" _dict = defaultdict(list)\n",
" key = None\n",
" for i in range(len(tag)):\n",
" index = []\n",
" for m in tag[i].stripped_strings:\n",
" # ret=re.findall(r'(\\(一\\)|\\(二\\))[\\ ]?(\\w+)',m)\n",
" # ret=re.findall(r'(\\(\\S\\))[\\ ]?(\\w+)',m)\n",
" ret=re.findall(r'(\\([\\u4E00-\\u9FA5]\\))[\\ ]?(\\w+)',m)\n",
" if ret:\n",
" key = ret[0][1]\n",
" index.append(i)\n",
" if key:\n",
" if i not in index:\n",
" _dict[key].append(tag[i])\n",
" return dict(_dict)\n",
"\n",
"def get_menuDetail_personalInformation(tag):\n",
" \"\"\"按报告内部小标题拆分 - 个人基本信息\"\"\"\n",
" _dict = defaultdict(list)\n",
" key = None\n",
" for i in tag:\n",
" if i.table==None:\n",
" if i.b:\n",
" key = i.b.get_text()\n",
" else:\n",
" _dict[key].append( i.table)\n",
" return _dict"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"path = r\"/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/test.html\"\n",
"# path = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/440421198210078001.html'\n",
"# path = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/432325197803211379.html'\n",
"# path = r'/Users/zhangpengcheng/Documents/量化派代码管理/credit_report/html/一代征信报告/330682198805122815.htm'\n",
"htmlfile = open(path, 'rb')\n",
"htmlhandle = htmlfile.read()\n",
"htmlfile.close()\n",
"soup = BeautifulSoup(htmlhandle,'lxml')\n",
"comments = soup.findAll(text=lambda text: isinstance(text, Comment))\n",
"[comment.extract() for comment in comments] # 去除注释\n",
"[s.extract() for s in soup(\"style\")] # 去除指定标签\n",
"children = get_children(soup)\n",
"# detail = self.children[2].tbody.find_all('tr')\n",
"# detail = self.children[2].tbody.find_all('tr')\n",
"# menu_detail = detail[4:]\n",
"menu_detail = children[2].tbody.children\n",
"menu_dict = get_menu(menu_detail) # 获取报告标题信息\n",
"menu_list = list(menu_dict.keys()) # 获取报告标题列表\n",
"\n",
"menu_personalInformation = get_menuDetail_personalInformation(menu_dict[menu_list[0]]) # 个人基本信息\n",
"menu_informationSummary = get_menuDetail(menu_dict[menu_list[1]]) # 信息概要\n",
"menu_transactionDetails = get_menuDetail(menu_dict[menu_list[2]]) # 交易信息明细\n",
"# menu_queryInformation = get_menuDetail_personalInformation(menu_dict[menu_list[4]])"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"import bs4\n",
"def get_detailsList(tag): # 交易信息明细拆分\n",
" \"\"\"交易信息明细小标题拆分\"\"\"\n",
" _result = []\n",
" _temp = []\n",
" for i in tag:\n",
" if i.td.attrs:\n",
" try :\n",
" if i.div['align'] == 'left':\n",
" # print(list( re.findall('([0-9]\\d{0,2})[\\.]{1}(\\S+)',i.div.get_text())[0]))\n",
" _temp.extend(list( re.findall('([0-9]\\d{0,2})[\\.]{1}(\\S+)',i.div.get_text())[0]))\n",
" elif i.div['align'] == 'center':\n",
" _temp.append(i)\n",
" except:\n",
" _temp.append(None)\n",
" else:\n",
" if i.table:\n",
" _temp.append(i)\n",
" if _temp:\n",
" _result.append(_temp)\n",
" _temp = []\n",
" return _result\n",
"menu_transactionDetailsType = {}\n",
"for k,value in menu_transactionDetails.items():\n",
" menu_transactionDetailsType[k] = get_detailsList(value)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"def get_LoanDetails(self,tag):\n",
" \"\"\"贷款明细拆分拆分\"\"\"\n",
" result = {'贷款':{}}\n",
" _result = []\n",
" for v in tag:\n",
" _json = {}\n",
" _json['序号'] =v[0]\n",
" r1 = re.findall(\"账户状态为“(\\S+)”\",v[1])\n",
" if r1:\n",
" # r = re.findall(\"(\\d{4}年\\d{1,2}月\\d{1,2}日)([\\S+]*)发放的([\\S+]*)元\\(([\\S+]*)\\)([\\S+]*),业务号([\\S+]{0,1}),([\\S+]*),([\\d]*)期,([\\S+]*)。截至(\\d{4}年\\d{1,2}月\\d{1,2}日)[\\S+]*账户状态为“(\\S+)”\",v[1])\n",
" r = re.findall(\"(\\d{4}年\\d{1,2}月\\d{1,2}日)([\\S+]*)发放的(.*?)元\\(([\\S+]*)\\)(.*?),业务号(.*?),(.*?),(.*?)。截至(.*?),账户状态为“(.*?)”\",v[1])\n",
" if r:\n",
" r = r[0]\n",
" _json['银行名称'] = r[1]\n",
" _json['贷款金额'] = r[2]\n",
" _json['贷款类型'] = r[4]\n",
" terms = r[7].split(',')\n",
" if len(terms)>1:\n",
" term = re.findall('(.*?)期',terms[0])[0]\n",
" _json['贷款期数'] = term\n",
" else:\n",
" _json['贷款期数'] = None\n",
"\n",
" _json['起始时间'] = r[0]\n",
" _json['终止时间'] = r[8]\n",
" _json['账户状态'] = r[9]\n",
" else:\n",
" # r = re.findall(\"(\\d{4}年\\d{1,2}月\\d{1,2}日)([\\S+]*)发放的([\\S+]*)元\\(([\\S+]*)\\)([\\S+]*),业务号([\\S+]{0,1}),([\\S+]*),([\\d]*)期,([\\S+]*),(\\d{4}年\\d{1,2}月\\d{1,2}日)[\\S+]*[\\S+]*(\\d{4}年\\d{1,2}月\\d{1,2}日)\",v[1])\n",
" r = re.findall(\"(\\d{4}年\\d{1,2}月\\d{1,2}日)([\\S+]*)发放的(.*?)元\\(([\\S+]*)\\)(.*?),业务号(.*?),(.*?),(.*?),(\\d{4}年\\d{1,2}月\\d{1,2}日)到期。截至(\\d{4}年\\d{1,2}月\\d{1,2}日)\",v[1])\n",
" if r:\n",
" r = r[0]\n",
" _json['银行名称'] = r[1]\n",
" _json['贷款金额'] = r[2]\n",
" _json['贷款类型'] = r[4]\n",
" # _json['贷款期数'] = r[7]\n",
" terms = r[7].split(',')\n",
" if len(terms)>1:\n",
" term = re.findall('(.*?)期',terms[0])[0]\n",
" _json['贷款期数'] = term\n",
" else:\n",
" _json['贷款期数'] = None\n",
" _json['起始时间'] = r[0]\n",
" _json['终止时间'] = r[9]\n",
" if v[2] != None:\n",
" table_tag = get_table(v[2])\n",
" table_json = get_json(table_tag)\n",
" key_name = table_json.keys()\n",
" rename = ''\n",
" temp_json = {}\n",
" for name in key_name:\n",
" r = re.findall(\"(\\d{4}年\\d{1,2}月)-(\\d{4}年\\d{1,2}月)的([\\S+]*)\",name)\n",
" if r :\n",
" temp_json['近24个月的还款记录-开始时间'] =r[0][0]\n",
" temp_json['近24个月的还款记录-截至时间'] =r[0][1]\n",
" rename = name\n",
" if rename:\n",
" table_json['近24个月的还款记录-各期还款记录'] = table_json.pop(rename)\n",
" if temp_json:\n",
" table_json.update(temp_json)\n",
"\n",
" _json.update(table_json)\n",
" _result.append(_json)\n",
" result['贷款'] = _result\n",
" return result"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"def get_table(tag):\n",
" tag_list = []\n",
" table = tag.tbody\n",
" for i,t in enumerate(table):\n",
" if isinstance(t,bs4.element.Tag):\n",
" tag_list.append(t)\n",
" return tag_list\n",
"\n",
"def get_json(table):\n",
" result = defaultdict(list)\n",
" _key = []\n",
"# _colspan = 0\n",
" index = 0\n",
" for i,val in enumerate(table):\n",
" tds = val.find_all('td')\n",
" if i %2 == 0:\n",
" key = []\n",
" for ii ,td in enumerate(tds):\n",
" if isinstance(td,bs4.element.Tag):\n",
"# _colspan = td.get('colspan')\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" key.append(tm)\n",
" _key.append(key)\n",
" index+=1\n",
" else:\n",
" if len(_key[index-1]) == len(tds):\n",
" for ii ,td in enumerate(tds):\n",
" if isinstance(td,bs4.element.Tag):\n",
" value = ''\n",
" for k in td.stripped_strings:\n",
" value+=k\n",
" result[_key[index-1][ii]] = value\n",
" else:\n",
" value = []\n",
" for ii ,td in enumerate(tds):\n",
" value.append(td.get_text().strip())\n",
" result[_key[index-1][0]] = value\n",
" return dict(result)"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2017年05月07日小额信贷公司“BF”发放的139,934元(人民币)个人消费贷款,业务号X,信用/免担保,34期,按月归还,2020年03月05日到期。截至2019年03月31日,'"
]
},
"execution_count": 87,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"tag = menu_transactionDetailsType['贷款'][1][2]\n",
"table_tag = get_table(tag)\n",
"_json = get_json(table_tag)\n",
"menu_transactionDetailsType['贷款'][1][1]\n",
"# table_tag"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<tr>\n",
"<td>\n",
"<div align=\"center\">\n",
"<table align=\"center\" cellpadding=\"2\" cellspacing=\"0\" class=\"tableStyle\" width=\"620\">\n",
"<tbody>\n",
"<tr>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">个人住房贷款笔数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">个人商用房(包括商住两用)贷款笔数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">其他贷款笔数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">首笔贷款发放月份</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">贷记卡账户数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">首张贷记卡发卡月份</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">准贷记卡账户数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">首张准贷记卡发卡月份</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">本人声明数目</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">异议标注数目</span></font></b></div></td></tr>\n",
"<tr>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">97</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">2016.03</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">10</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">2007.07</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">--</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td></tr></tbody></table></div></td></tr>"
]
},
"execution_count": 88,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"def get_json(table):\n",
" result = defaultdict(dict)\n",
" key = []\n",
" value = []\n",
" value_list = []\n",
" \n",
" temp_key = None\n",
" for i,val in enumerate(table):\n",
" tds = val.find_all('td') \n",
" tds_len = len(tds)\n",
" next_sibling = val.next_sibling\n",
" tds_next_len = tds_len\n",
" if (next_sibling == ' ' or next_sibling =='\\n' ) and next_sibling!=None:\n",
" next_sibling = val.next_sibling.next_sibling\n",
" tds_next = next_sibling.find_all('td')\n",
" tds_next_len = len(next_sibling.find_all('td'))\n",
" \n",
" for ii ,td in enumerate(tds):\n",
"# print(tds_len,tds_next_len,'---1---')\n",
" if isinstance(td,bs4.element.Tag):\n",
" if val.get('align') or td.get('colspan') == '24':\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" key.append(tm)\n",
" else:\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" value.append(tm)\n",
" if len(key) == 0 and temp_key:\n",
" value_list.append(tm)\n",
" r = re.findall('\\d{4}年\\d{1,2}月-\\d{4}年\\d{1,2}月的([\\S+]*)',temp_key)\n",
" if r:\n",
" r = r[0]\n",
" if result[temp_key][ii] not in result[r].keys():\n",
" result[r][result[temp_key][ii]] = []\n",
" result[r][result[temp_key][ii]].append(value_list[ii])\n",
" \n",
"# result[r].append({result[temp_key][ii]:value_list[ii]})\n",
" \n",
" if key and value:\n",
" if len(key) == len(value):\n",
" print(key,value)\n",
" result.update(dict(zip(key,value)))\n",
" else:\n",
" result.update(dict({key[0]:value}))\n",
" temp_key = key[0]\n",
"# print(key,value)\n",
" key=[]\n",
" value = []\n",
" return dict(result)\n",
"\n",
"# table_tag = get_table(menu_transactionDetailsType['贷记卡'][0][2])\n",
"# get_json(table_tag)\n",
"\n",
"tag = menu_informationSummary['信用提示'][0]\n",
"# table_tag = get_table(tag)\n",
"# _json = get_json(table_tag)\n",
"# _json\n",
"\n",
"tag"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'个人信用报告 “数字解读”': {'余额': '381,170',\n",
" '合同总额': '455,783',\n",
" '最近6个月平均应还款': '16,911',\n",
" '笔数': '21',\n",
" '贷款机构数': '12',\n",
" '贷款法人机构数': '12'}}"
]
},
"execution_count": 103,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"def get_json(tag):\n",
" \"\"\"获取table解析后的json , 支持表头与值循环\"\"\"\n",
" result = defaultdict(list)\n",
" _key = []\n",
" # _colspan=0\n",
" index = 0\n",
" for i,val in enumerate(tag):\n",
" tds = val.find_all('td')\n",
" if i %2 == 0:\n",
" key = []\n",
" for ii ,td in enumerate(tds):\n",
" if isinstance(td,bs4.element.Tag):\n",
" # _colspan = td.get('colspan')\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" key.append(tm)\n",
" _key.append(key)\n",
" index+=1\n",
" else:\n",
" if len(_key[index-1]) == len(tds):\n",
" for ii ,td in enumerate(tds):\n",
" if isinstance(td,bs4.element.Tag):\n",
" value = ''\n",
" for k in td.stripped_strings:\n",
" value+=k\n",
" result[_key[index-1][ii]] = value\n",
" else:\n",
" value = []\n",
" for ii ,td in enumerate(tds):\n",
" value.append(td.get_text().strip())\n",
" result[_key[index-1][0]] = value\n",
" return dict(result)\n",
"def get_numberUnscramble(tag,name):\n",
" result = {}\n",
" tag = tag[name]\n",
" istrue = False\n",
" key = ''\n",
" if tag:\n",
" for i,t in enumerate(tag):\n",
" if isinstance(t,bs4.element.Tag):\n",
" if '个人信用报告 “数字解读”' in t.td.get_text():\n",
" istrue = True\n",
" key = t.td.get_text()\n",
" table = t.next_sibling.next_sibling\n",
" if istrue:\n",
" tag = table\n",
" if tag:\n",
" if tag.find_all('tbody'):\n",
" for i,t in enumerate(tag):\n",
" if isinstance(t,bs4.element.Tag):\n",
" table_tag = get_table(t)\n",
" _json = get_json(table_tag)\n",
" result[key] = _json\n",
" return result\n",
"get_numberUnscramble(menu_informationSummary,'信用提示')"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<tr>\n",
"<td>\n",
"<div align=\"center\">\n",
"<table align=\"center\" cellpadding=\"2\" cellspacing=\"0\" class=\"tableStyle\" width=\"620\">\n",
"<tbody>\n",
"<tr>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">个人住房贷款笔数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">个人商用房(包括商住两用)贷款笔数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">其他贷款笔数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">首笔贷款发放月份</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">贷记卡账户数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">首张贷记卡发卡月份</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">准贷记卡账户数</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">首张准贷记卡发卡月份</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">本人声明数目</span></font></b></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">异议标注数目</span></font></b></div></td></tr>\n",
"<tr>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">97</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">2016.03</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">10</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">2007.07</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">--</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td>\n",
"<td class=\"tdStyle\" width=\"10%\">\n",
"<div align=\"center\" class=\"high\"><font color=\"#0066cc\">0</font></div></td></tr></tbody></table></div></td></tr>"
]
},
"execution_count": 57,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"menu_informationSummary['信用提示'][0]"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>逾期持续月数</th>\n",
" <th>逾期月份</th>\n",
" <th>逾期金额</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2016.07</td>\n",
" <td>755</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2014.12</td>\n",
" <td>497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>2016.07</td>\n",
" <td>755</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2014.12</td>\n",
" <td>497</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"import pandas as pd\n",
"aa = {'逾期持续月数': ['1', '1', '1', '1'],\n",
" '逾期月份': ['2016.07', '2014.12', '2016.07', '2014.12'],\n",
" '逾期金额': ['755', '497', '755', '497']}\n",
"pd.DataFrame(aa)"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{}"
]
},
"execution_count": 105,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"tag = menu_informationSummary['授信及负债信息概要'][1]\n",
"def get_table(tag):\n",
" \"\"\"把table 转list\"\"\"\n",
" tag_list = []\n",
" table = tag.tbody\n",
" for i,t in enumerate(table):\n",
" if isinstance(t,bs4.element.Tag):\n",
" tag_list.append(t)\n",
" return tag_list\n",
"\n",
"table_tag = get_table(tag)\n",
"_json = get_json(table_tag)\n",
"_json"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'未结清贷款信息汇总': {}, '未销户贷记卡信息汇总': {}}\n"
]
}
],
"source": [
"result = {}\n",
"for i,tag in enumerate(menu_informationSummary['授信及负债信息概要']):\n",
" if isinstance(tag,bs4.element.Tag):\n",
" if tag.td.get('align'):\n",
" key = tag.td.get_text()\n",
" else:\n",
" if tag.tbody:\n",
" table_tag = get_table(tag)\n",
" _json = get_json(table_tag)\n",
" result[key] = _json\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"# _key = menu_personalInformation.keys()\n",
"# for k in _key:\n",
" \n",
"# table_tagmsg_tag = menu_personalInformation[k][0]\n",
"# table_tagmsg = get_table(table_tagmsg_tag)\n",
"# _json = get_json(table_tagmsg)\n",
"# print(_json)\n",
"\n",
"table_tag_live = menu_personalInformation['居住信息'][0]\n",
"# table_tagmsg = get_table(table_tagmsg_tag)\n",
"# _json = get_json(table_tagmsg)\n",
"# _json"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"tag = table_tag_live.tbody"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"_result = []\n",
"for i in tag:\n",
" td = tag.findAll('td')\n",
" _v = []\n",
" for t in td:\n",
" if t.get('style'):\n",
" for v in t.stripped_strings:\n",
" _v.append(v)\n",
" _result.append(_v)\n",
" \n",
" \n",
"# if isinstance(i,bs4.element.Tag):\n",
"# if i.td.get('colspan') == None:\n",
"# print(i)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['1', '浙xxxxxxxx司', '浙xxxxxxxxx9号'], ['2', '浙xxxxxxxxxx中心', '--'], ['3', '浙xxx司', '浙xxxxxxxx9号'], ['4', '杭xxxxxx络', '浙xxxxxxxxxxx902'], ['5', '杭xxxxxxxx公司', '浙xxxxxxxxxxxx际'], ['1', '--'], ['--', '--', '--'], ['1900', '2019.02.20', '2'], ['--', '租赁和商务服务业', '--'], ['--', '--', '2018.09.28'], ['3', '--', '--'], ['--', '--', '2017'], ['2018.03.05', '4', '--'], ['信息传输、计算机服务和软件业', '--', '无'], ['--', '2018.02.24', '5'], ['--', '信息传输、计算机服务和软件业', '一般员工'], ['--', '--', '2018.01.16']]\n"
]
}
],
"source": [
"table = menu_personalInformation['职业信息']\n",
"result = []\n",
"bodys= []\n",
"heads = {}\n",
"\n",
"def get_head(tag):\n",
" num = 0\n",
" for i,k in enumerate(table[0].tr.children):\n",
" if isinstance(k,bs4.element.Tag):\n",
" num+=1\n",
" return num\n",
"result_2 = defaultdict(list)\n",
"\n",
"for i,val in enumerate(table):\n",
" tds = val.find_all('td')\n",
" index = 0\n",
" _i = 0\n",
" for ii,td in enumerate(tds):\n",
" if isinstance(td,bs4.element.Tag):\n",
"# print(td)\n",
" td_num = get_head(val)\n",
" \n",
" if td.get('style') == None:\n",
" for h in td.stripped_strings:\n",
" heads[h] = []\n",
"\n",
" else:\n",
" for v in td.stripped_strings:\n",
" bodys.append(v)\n",
" index +=1\n",
" if index%td_num == 0 :\n",
"# print(td)\n",
" if bodys:\n",
" result.append(bodys)\n",
" bodys = []\n",
"# for mm, m in enumerate(td.stripped_strings):\n",
"# if td.get('style') == None:\n",
"# key.append(m)\n",
"# else:\n",
"# result_2[key[index%td_num]].append(m)\n",
"# index +=1\n",
"\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"# table = menu_personalInformation['身份信息']\n",
"table = menu_transactionDetailsType['贷款'][1][2]"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"table = menu_personalInformation['职业信息']\n",
"tag = table[0].tbody.children\n",
"def get_json(tag):\n",
" result = defaultdict(list)\n",
" key = []\n",
" _num = 0\n",
" for tr in tag:\n",
" _re = []\n",
" index = 0\n",
" if isinstance(tr,bs4.element.Tag):\n",
" tds = tr.find_all('td')\n",
" value = []\n",
" # key = []\n",
" td_num = len(tds)\n",
" for td in tds:\n",
" for m in td.stripped_strings:\n",
" if td.get('style') == None:\n",
" key.append(m)\n",
" else:\n",
" value.append(m)\n",
" index+=1\n",
"\n",
" if len(key)!=td_num:\n",
" key = key[len(key)-td_num:]\n",
" _num += 1\n",
" if value:\n",
" result[_num].append(dict(zip(key,value)))\n",
" return dict(result)\n",
"_identity = get_json(tag)"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"# 11.查询记录汇总\n",
"querySummary = menu_queryInformation['查询记录汇总'][0]\n",
"query_Detail = get_table(querySummary)\n",
"query_Detail[1]\n",
"_result = {'最近1个月内的查询机构数':{'贷款审批':None,'信用卡审批':None},\n",
" '最近1个月内的查询次数':{'贷款审批':None,'信用卡审批':None,'本人查询':None},\n",
" '最近2年内的查询次数':{'贷后管理':None,'担保资格审查':None,'特约商户实名审查':None}\n",
" }\n",
"values = []\n",
"\n",
"for i ,td in enumerate(query_Detail[2]):\n",
" if isinstance(td,bs4.element.Tag):\n",
" tm = ''\n",
" for msg in td.stripped_strings:\n",
" tm+=msg\n",
" values.append(tm)\n",
"result = {'查询记录汇总':{}}\n",
"index = 0\n",
"for k ,v in _result.items():\n",
" for kk,vv in v.items():\n",
" result['查询记录汇总'][kk+k] = values[index]\n",
" index+=1"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'查询记录汇总': {'信用卡审批最近1个月内的查询机构数': '0',\n",
" '信用卡审批最近1个月内的查询次数': '0',\n",
" '担保资格审查最近2年内的查询次数': '0',\n",
" '本人查询最近1个月内的查询次数': '0',\n",
" '特约商户实名审查最近2年内的查询次数': '0',\n",
" '贷后管理最近2年内的查询次数': '24',\n",
" '贷款审批最近1个月内的查询机构数': '2',\n",
" '贷款审批最近1个月内的查询次数': '2'}}"
]
},
"execution_count": 114,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"result"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: [{'查询原因': '贷款审批', '查询操作员': 'VB', '查询日期': '2019.05.07', '编号': '1'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'XZ', '查询日期': '2019.04.21', '编号': '2'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ZV', '查询日期': '2019.04.08', '编号': '3'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'OX', '查询日期': '2019.04.01', '编号': '4'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YM', '查询日期': '2019.03.05', '编号': '5'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YP', '查询日期': '2019.03.05', '编号': '6'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'SC', '查询日期': '2019.03.04', '编号': '7'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'VB', '查询日期': '2019.03.01', '编号': '8'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'OX', '查询日期': '2019.02.26', '编号': '9'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'RZ', '查询日期': '2019.02.20', '编号': '10'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'JJ', '查询日期': '2019.02.18', '编号': '11'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YP', '查询日期': '2019.01.08', '编号': '12'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'OX', '查询日期': '2018.12.05', '编号': '13'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'DG', '查询日期': '2018.11.14', '编号': '14'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'RZ', '查询日期': '2018.11.14', '编号': '15'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YM', '查询日期': '2018.11.07', '编号': '16'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YP', '查询日期': '2018.11.07', '编号': '17'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'TX', '查询日期': '2018.11.07', '编号': '18'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'XZ', '查询日期': '2018.11.07', '编号': '19'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'OO', '查询日期': '2018.10.29', '编号': '20'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'HR', '查询日期': '2018.10.12', '编号': '21'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'NP', '查询日期': '2018.10.08', '编号': '22'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'DG', '查询日期': '2018.10.08', '编号': '23'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'OX', '查询日期': '2018.10.01', '编号': '24'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'XZ', '查询日期': '2018.09.28', '编号': '25'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'ZH', '查询日期': '2018.09.27', '编号': '26'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ZV', '查询日期': '2018.09.09', '编号': '27'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'ZH', '查询日期': '2018.09.05', '编号': '28'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ZZ', '查询日期': '2018.09.04', '编号': '29'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'XZ', '查询日期': '2018.08.17', '编号': '30'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'OO', '查询日期': '2018.08.14', '编号': '31'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'PP', '查询日期': '2018.08.08', '编号': '32'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'QC', '查询日期': '2018.08.07', '编号': '33'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ZV', '查询日期': '2018.07.31', '编号': '34'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'GH', '查询日期': '2018.07.12', '编号': '35'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ZV', '查询日期': '2018.06.25', '编号': '36'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'GH', '查询日期': '2018.06.25', '编号': '37'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'GZ', '查询日期': '2018.06.25', '编号': '38'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'XZ', '查询日期': '2018.06.20', '编号': '39'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'PK', '查询日期': '2018.06.07', '编号': '40'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'ZH', '查询日期': '2018.06.05', '编号': '41'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'BM', '查询日期': '2018.05.26', '编号': '42'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'XZ', '查询日期': '2018.05.14', '编号': '43'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ZF', '查询日期': '2018.05.12', '编号': '44'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ZV', '查询日期': '2018.05.10', '编号': '45'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'RZ', '查询日期': '2018.05.09', '编号': '46'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'RF', '查询日期': '2018.03.09', '编号': '47'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'XZ', '查询日期': '2018.03.07', '编号': '48'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'BU', '查询日期': '2018.02.28', '编号': '49'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'XZ', '查询日期': '2018.01.14', '编号': '50'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YM', '查询日期': '2017.12.14', '编号': '51'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'XZ', '查询日期': '2017.11.05', '编号': '52'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YM', '查询日期': '2017.11.01', '编号': '53'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ZN', '查询日期': '2017.11.01', '编号': '54'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'CB', '查询日期': '2017.10.18', '编号': '55'},\n",
" {'查询原因': '信用卡审批', '查询操作员': 'ZH', '查询日期': '2017.10.17', '编号': '56'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'PZ', '查询日期': '2017.10.14', '编号': '57'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ST', '查询日期': '2017.09.28', '编号': '58'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'ZN', '查询日期': '2017.09.25', '编号': '59'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YM', '查询日期': '2017.09.24', '编号': '60'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'JJ', '查询日期': '2017.09.12', '编号': '61'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'RC', '查询日期': '2017.09.10', '编号': '62'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'GH', '查询日期': '2017.08.21', '编号': '63'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YM', '查询日期': '2017.08.21', '编号': '64'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'RC', '查询日期': '2017.07.13', '编号': '65'},\n",
" {'查询原因': '贷款审批', '查询操作员': 'YM', '查询日期': '2017.05.18', '编号': '66'}]}"
]
},
"execution_count": 116,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"# 12.信贷审批查询记录明细\n",
"queryCreditApprovalDetail = menu_queryInformation['信贷审批查询记录明细'][0]\n",
"\n",
"tag = queryCreditApprovalDetail.tbody.children\n",
"def get_json(tag):\n",
" result = defaultdict(list)\n",
" key = []\n",
" _num = 0\n",
" for tr in tag:\n",
" _re = []\n",
" index = 0\n",
" if isinstance(tr,bs4.element.Tag):\n",
" tds = tr.find_all('td')\n",
" value = []\n",
" # key = []\n",
" td_num = len(tds)\n",
" for td in tds:\n",
" for m in td.stripped_strings:\n",
" if td.get('style') == None:\n",
" key.append(m)\n",
" else:\n",
" value.append(m)\n",
" index+=1\n",
"\n",
" if len(key)!=td_num:\n",
" key = key[len(key)-td_num:]\n",
" _num += 1\n",
" if value:\n",
" result[_num].append(dict(zip(key,value)))\n",
" return dict(result)\n",
"get_json(tag)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'逾期(透支)信息汇总': {'准贷记卡60天以上透支': {'单月最高透支余额': '0',\n",
" '最长透支月数': '0',\n",
" '月份数': '0',\n",
" '账户数': '0'},\n",
" '贷款逾期': {'单月最高逾期总额': '3,119', '最长逾期月数': '1', '月份数': '1', '笔数': '1'},\n",
" '贷记卡逾期': {'单月最高逾期总额': '0', '最长逾期月数': '1', '月份数': '2', '账户数': '2'}}}"
]
},
"execution_count": 117,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"# 逾期及违约信息概要\n",
"def get_information_overdueAndDefault(tag,name):\n",
" \"\"\"逾期及违约信息概要\"\"\"\n",
" result = {}\n",
" tags = tag[name]\n",
" if tags:\n",
" for i,tag in enumerate(tags):\n",
" if isinstance(tag,bs4.element.Tag):\n",
" if tag.td.get('align'):\n",
" key = tag.td.get_text()\n",
" else:\n",
" if tag.tbody:\n",
" table_tag = get_table(tag)\n",
" _json = get_json_overdueAndDefault(table_tag)\n",
" result[key] = _json\n",
"\n",
" return result\n",
"def get_json_overdueAndDefault(tag):\n",
" column = [] #一共几列\n",
" head = []\n",
" key = []\n",
" body = []\n",
" line_id = 0 #行\n",
" row_id = 0 #列\n",
" values = []\n",
" result = {}\n",
" for i,tds in enumerate(tag):\n",
" tds = tds.children\n",
" for ii ,td in enumerate(tds):\n",
" if isinstance(td,bs4.element.Tag):\n",
" if td.get('colspan'):\n",
" column.append(int(td.get('colspan')))\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+=k\n",
" head.append(tm)\n",
" else:\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+=k\n",
" values.append(tm)\n",
" row_id += 1\n",
" if line_id<=1:\n",
" if row_id % column[line_id] == 0:\n",
" key.append(values)\n",
" values = []\n",
"# print(row_id,line_id,column[line_id])\n",
" else:\n",
" if row_id % column[line_id] == 0:\n",
" body.append(values)\n",
" values = []\n",
" line_id +=1\n",
" for i,h in enumerate(head):\n",
" result[h] = dict(zip(key[i],body[i]))\n",
" return result\n",
"get_information_overdueAndDefault(menu_informationSummary,'逾期及违约信息概要')"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"def get_DebitCard_Details(tag):\n",
" \"\"\"贷记卡明细拆分\"\"\"\n",
" result = {'贷记卡':{}}\n",
" _result = []\n",
" for v in tag:\n",
" _json = {}\n",
" _json['序号'] =v[0]\n",
" _json['描述'] =v[0]+'.'+v[1]\n",
" r = re.findall(\"(\\d{4}年\\d{1,2}月\\d{1,2}日)(.*?)发放的(.*?),业务号(.*?),授信额度[折合人民币]{0,5}(.*?)元,共享授信额度[折合人民币]{0,5}(.*?)元,(.*?)[,|。]{1}截至(\\d{4}年\\d{1,2}月\\d{1,2}日),(账户状态为“[\\u4E00-\\u9FA5]{1,2}”)*\",v[1])\n",
" if r :\n",
" r = r[0]\n",
" _json['银行名称'] = r[1]\n",
" _json['授信金额'] = r[4]\n",
" _json['共享授信金额'] = r[5]\n",
" _json['授信时间'] = r[0]\n",
" _json['截止时间'] = r[7]\n",
" if r[8]:\n",
" r_temp = re.findall('账户状态为“(.*?)”',r[8])[0]\n",
" _json['账户状态 '] = r_temp\n",
"\n",
" if v[2] != None:\n",
" table_tag = get_table(v[2])\n",
" \n",
" table_json = get_json_DebitCard(table_tag)\n",
" key_name = table_json.keys()\n",
" plan = ''\n",
" overdue = ''\n",
" temp_json = {}\n",
" for name in key_name:\n",
" r = re.findall(\"(\\d{4}年\\d{1,2}月)-(\\d{4}年\\d{1,2}月)的([\\S+]*)\",name)\n",
" if r :\n",
" if r[0][2] == '还款记录':\n",
" temp_json['近24个月的还款记录-开始时间'] =r[0][0]\n",
" temp_json['近24个月的还款记录-截至时间'] =r[0][1]\n",
" plan = name\n",
" if r[0][2] == '逾期记录':\n",
" overdue = name\n",
"\n",
" if plan:\n",
" table_json['近24个月的还款记录-各期还款记录'] = table_json.pop(plan)\n",
" if overdue:\n",
" del table_json[overdue]\n",
"\n",
" if temp_json:\n",
" table_json.update(temp_json)\n",
"\n",
" if '逾期记录' in table_json.keys():\n",
" table_json['近5年逾期记录'] = table_json.pop('逾期记录')\n",
"\n",
" _json.update(table_json)\n",
" _result.append(_json)\n",
" result['贷记卡'] = _result\n",
" return result\n",
"\n",
"table_tag_new = None\n",
"def get_json_DebitCard(tag):\n",
" result = defaultdict(dict)\n",
" key = []\n",
" value = []\n",
" value_list = []\n",
"\n",
" temp_key = None\n",
" for i,val in enumerate(tag):\n",
" tds = val.find_all('td')\n",
" # tds_len = len(tds)\n",
" next_sibling = val.next_sibling\n",
" if (next_sibling == ' ' or next_sibling =='\\n' ) and next_sibling!=None:\n",
" next_sibling = val.next_sibling.next_sibling\n",
" # tds_next = next_sibling.find_all('td')\n",
" # tds_next_len = len(next_sibling.find_all('td'))\n",
"\n",
" for ii ,td in enumerate(tds):\n",
" # print(tds_len,tds_next_len,'---1---')\n",
" if isinstance(td,bs4.element.Tag):\n",
" if val.get('align') or td.get('colspan') == '24':\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" key.append(tm)\n",
" else:\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" value.append(tm)\n",
" if len(key) == 0 and temp_key:\n",
" value_list.append(tm)\n",
" r = re.findall('\\d{4}年\\d{1,2}月-\\d{4}年\\d{1,2}月的([\\S+]*)',temp_key)\n",
" if r:\n",
" r = r[0]\n",
" if result[temp_key][ii] not in result[r].keys():\n",
" result[r][result[temp_key][ii]] = []\n",
" result[r][result[temp_key][ii]].append(value_list[ii])\n",
"\n",
" if key and value:\n",
" if len(key) == len(value):\n",
" result.update(dict(zip(key,value)))\n",
" else:\n",
" result.update(dict({key[0]:value}))\n",
" temp_key = key[0]\n",
" key=[]\n",
" value = []\n",
" return dict(result)"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"menu_transactionDetailsType = {}\n",
"for k,value in menu_transactionDetails.items():\n",
" menu_transactionDetailsType[k] = get_detailsList(value)\n",
"# get_DebitCard_Details(menu_transactionDetailsType['贷款'])\n",
"tag = None\n",
"for t in menu_transactionDetailsType['贷款']:\n",
" if t[0] == '1':\n",
" tag = t\n",
"table_tag = get_table(tag[2])"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[<tr>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"3\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">账户状态</span></font></b></div></td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"3\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">五级分类</span></font></b></div></td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"3\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">本金余额</span></font></b></div></td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"3\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">剩余还<br/>款期数</span></font></b></div></td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"3\"><b><font color=\"#0066cc\"><span class=\"high\">本月应还款</span></font></b> </td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"3\"><b><font color=\"#0066cc\"><span class=\"high\">应还款日</span></font></b> </td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"3\"><b><font color=\"#0066cc\"><span class=\"high\">本月实还款</span></font></b> </td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"3\" style=\"BORDER-RIGHT-WIDTH: 1px\"><b><font color=\"#0066cc\"><span class=\"high\">最近一次<br/>还款日期</span></font></b> </td></tr>,\n",
" <tr>\n",
" <td class=\"tdStyle\" colspan=\"3\" style=\"WORD-BREAK: break-all\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">正常</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"3\" style=\"WORD-BREAK: break-all\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">正常</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"3\" style=\"WORD-BREAK: break-all\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">327,097</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"3\" style=\"WORD-BREAK: break-all\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">136</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"3\" style=\"WORD-BREAK: break-all\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">3,119</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"3\" style=\"WORD-BREAK: break-all\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">2019.04.14</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"3\" style=\"WORD-BREAK: break-all\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">3,119</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"3\" style=\"BORDER-RIGHT-WIDTH: 1px; WORD-BREAK: break-all\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">2019.04.14</span></font></div></td></tr>,\n",
" <tr>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">当前逾期期数</span></font></b></div></td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">当前逾期金额</span></font></b></div></td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期31-60天<br/>未还本金</span></font></b></div></td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期61-90天<bt>未还本金</bt></span></font></b></div></td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期91-180天<br/>未还本金</span></font></b></div></td>\n",
" <td align=\"center\" class=\"tdStyle\" colspan=\"4\" style=\"BORDER-RIGHT-WIDTH: 1px\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期180天以<br/>上未还本金</span></font></b></div></td></tr>,\n",
" <tr>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">0</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">0</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">0</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">0</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">0</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\" style=\"BORDER-RIGHT-WIDTH: 1px\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">0</span></font></div></td></tr>,\n",
" <tr>\n",
" <td class=\"tdStyle\" colspan=\"24\" style=\"BORDER-RIGHT-WIDTH: 1px\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><strong><b>2017年05月-2019年04月的还款记录</b></strong></font></div></td></tr>,\n",
" <tr>\n",
" <td class=\"tdStyle\" width=\"37\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"37\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" width=\"38\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td>\n",
" <td class=\"tdStyle\" style=\"BORDER-RIGHT-WIDTH: 1px\" width=\"28\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">N</span></font></div></td></tr>,\n",
" <tr>\n",
" <td class=\"tdStyle\" colspan=\"24\" style=\"BORDER-RIGHT-WIDTH: 1px\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">2015年08月-2017年04月的逾期记录 </span></font></b></div></td></tr>,\n",
" <tr>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期月份</span></font></b></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期持续月数</span></font></b></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期金额</span></font></b></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期月份</span></font></b></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期持续月数</span></font></b></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\" style=\"BORDER-RIGHT-WIDTH: 1px\">\n",
" <div align=\"center\" class=\"high\"><b><font color=\"#0066cc\"><span class=\"high\">逾期金额</span></font></b></div></td></tr>,\n",
" <tr>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">2016.12</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">1</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">3,119</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">--</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">--</span></font></div></td>\n",
" <td class=\"tdStyle\" colspan=\"4\" style=\"BORDER-RIGHT-WIDTH: 1px\">\n",
" <div align=\"center\" class=\"high\"><font color=\"#0066cc\"><span class=\"high\">--</span></font></div></td></tr>]"
]
},
"execution_count": 123,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"table_tag"
]
},
{
"cell_type": "code",
"execution_count": 310,
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "list index out of range",
"output_type": "error",
"traceback": [
"---------------------------------------------------------------------------",
"IndexError Traceback (most recent call last)",
"<ipython-input-310-6f3d66682be4> in <module>()\n 65 # _k = []\n 66 return dict(result)\n---> 67 get_json_DebitCard(table_tag)\n",
"<ipython-input-310-6f3d66682be4> in get_json_DebitCard(tag)\n 35 if r:\n 36 r = r[0]\n---> 37 if result[temp_key][ii] not in result[r].keys():\n 38 result[r][result[temp_key][ii]] = []\n 39 result[r][result[temp_key][ii]].append(value_list[ii])\n",
"IndexError: list index out of range"
]
}
],
"source": [
"def get_json_DebitCard(tag):\n",
" result = defaultdict(dict)\n",
" key = []\n",
" value = []\n",
" value_list = []\n",
" _k = []\n",
" _v = []\n",
"\n",
" temp_key = None\n",
" for i,val in enumerate(tag):\n",
" tds = val.find_all('td')\n",
" # tds_len = len(tds)\n",
"# next_sibling = val.next_sibling\n",
"# if (next_sibling == ' ' or next_sibling =='\\n' ) and next_sibling!=None:\n",
"# next_sibling = val.next_sibling.next_sibling\n",
"# # tds_next = next_sibling.find_all('td')\n",
"# # tds_next_len = len(next_sibling.find_all('td'))\n",
"\n",
" for ii ,td in enumerate(tds):\n",
" # print(tds_len,tds_next_len,'---1---')\n",
" if isinstance(td,bs4.element.Tag):\n",
" if td.get('align') or td.get('colspan') == '24':\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" key.append(tm)\n",
" else:\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" value.append(tm)\n",
" if len(key) == 0 and temp_key:\n",
" value_list.append(tm)\n",
" r = re.findall('\\d{4}年\\d{1,2}月-\\d{4}年\\d{1,2}月的([\\S+]*)',temp_key)\n",
" if r:\n",
" r = r[0]\n",
" if result[temp_key][ii] not in result[r].keys():\n",
" result[r][result[temp_key][ii]] = []\n",
" result[r][result[temp_key][ii]].append(value_list[ii])\n",
"\n",
" if key and value:\n",
" if len(key) == len(value):\n",
" result.update(dict(zip(key,value)))\n",
" else:\n",
" result.update(dict({key[0]:value}))\n",
" temp_key = key[0]\n",
" key=[]\n",
" value = []\n",
" \n",
" \n",
" \n",
"# if key and value_list:\n",
"# print(len(next_sibling.find_all('td')))\n",
"# print(next_sibling.find_all('td'))\n",
"# key = []\n",
"# value_list = []\n",
"# if len(key)==0 and len(value_list)==0:\n",
"# if value and len(value)<= len(tds):\n",
"# _k=value\n",
"# else :\n",
"# _v = value[len(tds):]\n",
"# result.update(dict(zip(_k,_v)))\n",
" \n",
"# value = []\n",
"# _k = []\n",
"# return dict(result)\n",
"get_json_DebitCard(table_tag)"
]
},
{
"cell_type": "code",
"execution_count": 312,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'2015年08月-2017年04月的逾期记录': ['逾期月份',\n",
" '逾期持续月数',\n",
" '逾期金额',\n",
" '逾期月份',\n",
" '逾期持续月数',\n",
" '逾期金额'],\n",
" '2017年05月-2019年04月的还款记录': ['N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N',\n",
" 'N'],\n",
" '五级分类': '正常',\n",
" '剩余还款期数': '136',\n",
" '应还款日': '2019.04.14',\n",
" '当前逾期期数': '0',\n",
" '当前逾期金额': '0',\n",
" '最近一次还款日期': '2019.04.14',\n",
" '本月实还款': '3,119',\n",
" '本月应还款': '3,119',\n",
" '本金余额': '327,097',\n",
" '账户状态': '正常',\n",
" '逾期180天以上未还本金': '0',\n",
" '逾期31-60天未还本金': '0',\n",
" '逾期61-90天未还本金': '0',\n",
" '逾期91-180天未还本金': '0',\n",
" '逾期记录': {'逾期持续月数': ['1', '--'],\n",
" '逾期月份': ['2016.12', '--'],\n",
" '逾期金额': ['3,119', '--']}}"
]
},
"execution_count": 312,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"def get_json_DebitCard(tag):\n",
" result = defaultdict(dict)\n",
" key = []\n",
" value = []\n",
" value_list = []\n",
"\n",
" temp_key = None\n",
" for i,val in enumerate(tag):\n",
" tds = val.find_all('td')\n",
" # tds_len = len(tds)\n",
" # next_sibling = val.next_sibling\n",
" # if (next_sibling == ' ' or next_sibling =='\\n' ) and next_sibling!=None:\n",
" # next_sibling = val.next_sibling.next_sibling\n",
" # tds_next = next_sibling.find_all('td')\n",
" # tds_next_len = len(next_sibling.find_all('td'))\n",
"\n",
" for ii ,td in enumerate(tds):\n",
" # print(tds_len,tds_next_len,'---1---')\n",
" if isinstance(td,bs4.element.Tag):\n",
" if td.get('align') or td.get('colspan') == '24':\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" key.append(tm)\n",
" else:\n",
" tm = ''\n",
" for k in td.stripped_strings:\n",
" tm+= k\n",
" value.append(tm)\n",
" if len(key) == 0 and temp_key:\n",
" value_list.append(tm)\n",
" r = re.findall('\\d{4}年\\d{1,2}月-\\d{4}年\\d{1,2}月的([\\S+]*)',temp_key)\n",
" if r:\n",
" r = r[0]\n",
" if result[temp_key][ii] not in result[r].keys():\n",
" result[r][result[temp_key][ii]] = []\n",
" result[r][result[temp_key][ii]].append(value_list[ii])\n",
"\n",
" if key and value:\n",
" if len(key) == len(value):\n",
" result.update(dict(zip(key,value)))\n",
" else:\n",
" result.update(dict({key[0]:value}))\n",
" temp_key = key[0]\n",
" key=[]\n",
" value = []\n",
" return dict(result)\n",
"get_json_DebitCard(table_tag)"
]
},
{
"cell_type": "code",
"execution_count": 315,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>逾期持续月数</th>\n",
" <th>逾期月份</th>\n",
" <th>逾期金额</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2016.12</td>\n",
" <td>3,119</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
]
},
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"aa = [{'逾期持续月数': '1', '逾期月份': '2016.12', '逾期金额': '3,119'},{'逾期持续月数':2}]\n",
"\n",
"pd.DataFrame("
]
},
{
"cell_type": "code",
"execution_count": 317,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<filter at 0x10b612048>"
]
},
"execution_count": 317,
"output_type": "execute_result",
"metadata": {}
}
],
"source": [
"aa = '3,119'\n",
"filter (lambda x:x,aa)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3.0
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
# -*- coding:utf-8 -*-
'''
json 组装
'''
import json
## 常数变量定义
## json 返回中的 错误码定义
class Constants:
    """Status codes and their messages for the JSON responses built in this module.

    Each ``Code_*`` value has a matching ``Msg_*`` text. The message strings
    are emitted at runtime and are therefore kept in their original language.
    """

    # Success.
    Code_Success = 200
    Msg_Success = 'SUCCESS'
    # Generic failure (message reads "failure").
    Code_Fail = 1
    Msg_Fail = '失败'
    # Parameter error with a fixed message ("parameter error").
    Code_Param_Error = -1
    Msg_Param_Error = '参数错误'
    # Token login authentication error.
    Code_LoginToken_Auth_Error = -2
    Msg_LoginToken_Auth_Error = 'token登陆认证错误'
    # Data error (message reads "data exception").
    Code_Error = -3
    Msg_Error = '数据异常'
    # Parameter error with a str.format template:
    # {0} is the parameter name, {1} is the value that was received.
    Code_Params_Error = -4
    Msg_Params_Error = '{0} is error,{0} is {1}'
    # File read error.
    Code_File_Error = -5
    Msg_File_Error = '文件读取错误'
    # HTML parsing error.
    Code_Analysis_Error = -6
    Msg_Analysis_Error = 'html 解析错误'
    # No return value found for the specified parameter.
    Code_Error_Value = 506
    Msg_Error_Value = '没找到指定参数的返回值'
def build_json(code=None, report=None, mssage=None, **kwargs):
    """Serialize a response envelope to a JSON string.

    The envelope always contains ``code`` and ``mssage``. The misspelled
    ``mssage`` name is both a keyword argument used by callers and a key in
    the emitted JSON, so it is kept unchanged.

    Args:
        code: Status code placed under ``"code"``.
        report: Payload placed under ``"result"`` when no extra keyword
            arguments are given. ``None`` is emitted as an empty list.
        mssage: Message text placed under ``"mssage"``.
        **kwargs: Extra fields. When at least one is supplied they are
            emitted together under ``"body"`` and ``report`` is left out of
            the output.

    Returns:
        str: The JSON text, with non-ASCII characters written unescaped.

    Raises:
        TypeError: If a value cannot be serialized by ``json.dumps``.
    """
    if kwargs:
        envelope = {'code': code, 'mssage': mssage, 'body': dict(kwargs)}
    else:
        # Identity check: ``== None`` would call a custom ``__eq__`` on the
        # payload, which can raise or wrongly report equality.
        result = [] if report is None else report
        envelope = {'code': code, 'mssage': mssage, 'result': result}
    return json.dumps(envelope, ensure_ascii=False)
# businessType=int(_business_type), orderId=_orderId, loanId=_loanId,uuid = _uuid,
# data=_result, flag=True)
#
# def build_json_feature (features=None, code=Constants.Code_Success, msg=Constants.Msg_Success):
#
# if data == None:
# data = []
# ddata = dict(code=code, businessCode=code, msg=msg, features=features)
#
# def build_json_feature_two(data):
# return json.dumps(data,ensure_ascii=False)
#
# """
# 只返回 错误码 + 错误提示
# """
#
#
# def build_json(code, msg):
# ddata = dict(code=code, msg=msg, businessCode=code)
# # return json.dumps(ddata,encoding='UTF-8',ensure_ascii=False) #python2
# return json.dumps(ddata, ensure_ascii=False) # python3
#
#
# """
# 错误码 默认为 成功;
# data may be dict array
# """
#
#
# def build_json_with_data(data=None, code=Constants.Code_Success, msg=Constants.Msg_Success):
# if data == None:
# data = []
# ddata = dict(code=code, businessCode=code, msg=msg, data=data)
# # return json.dumps(ddata,encoding='UTF-8',ensure_ascii=False) # python2
# return json.dumps(ddata, ensure_ascii=False) # python3
#
#
# def build_json_with_data_page(data=None, total=0, page_no=1, page_size=20, code=Constants.Code_Success,
# msg=Constants.Msg_Success):
# page_no = int(page_no)
# page_size = int(page_size)
# total = int(total)
# if total % page_size == 0:
# _page_total = total / page_size
# else:
# _page_total = total / page_size + 1
# # ==没有查询到数据,则data 为空
# if total == 0:
# data = []
# _data = dict(data=data, page_no=int(page_no), page_size=int(page_size), page_total=int(_page_total),
# total=int(total))
# ddata = dict(code=code, businessCode=code, msg=msg, data=_data)
# # return json.dumps(ddata,encoding='UTF-8',ensure_ascii=False) # python2
# return json.dumps(ddata, ensure_ascii=False) # python3
#
#
# def build_json_with_block(data=None, code=Constants.Code_Success, msg=Constants.Msg_Success):
# ddata = dict(code=code, businessCode=code, msg=msg, data=data)
# # return json.dumps(ddata,encoding='UTF-8',ensure_ascii=False) # python2
# return json.dumps(ddata, ensure_ascii=False) # python3
#
#
# """
# 转换为 dict 对象; 直接 json.key 获取数据
# 如果data 为空,返回一个空的data数据
# """
#
#
# def str_parse_json(data):
# if any(data):
# return json.loads(data, encoding='UTF-8')
# return dict()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment