# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup,Comment
import bs4,re
from collections import defaultdict

class InitHtml:
    """Parse an HTML report and group its table rows by section heading.

    After construction, ``menu_dict`` maps each section title to the list of
    row tags that follow that heading (the heading rows themselves are not
    included).
    """

    # A heading is one CJK ideograph (U+4E00-U+9FA5), exactly one space, then
    # the title, e.g. "<ideograph> Title"; group 2 (the title) becomes the
    # dictionary key.  Kept as a raw string so that ``\ `` and ``\w`` are not
    # invalid string escapes; ``re`` resolves ``\uXXXX`` itself.
    _HEADING_RE = re.compile(r'([\u4E00-\u9FA5])[\ ]{1,1}(\w+)')

    def __init__(self, html):
        """Build ``self.menu_dict`` from *html*.

        Args:
            html: Markup (or file handle) accepted by ``BeautifulSoup``; it is
                parsed with the ``lxml`` parser.

        Note:
            The rows are read from ``children[2].tbody``, i.e. the third
            non-newline child of ``<body>`` -- presumably the report table;
            verify against the actual report layout.  A document with fewer
            than three such children raises ``IndexError``.
        """
        soup = BeautifulSoup(html, 'lxml')

        # Strip HTML comments.
        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()
        # Strip <style> tags.
        for style_tag in soup("style"):
            style_tag.extract()

        children = self.get_children(soup)
        menu_detail = children[2].tbody.children
        # Section title -> rows belonging to that section.
        self.menu_dict = self.get_menu(menu_detail)

    def get_children(self, soup):
        """Return the direct children of ``soup.body``, skipping ``'\\n'`` nodes.

        Only children that compare equal to a single newline are dropped;
        every other node (tags and other strings) is kept in document order.
        """
        return [child for child in soup.body.children if child != '\n']

    def get_menu(self, menu):
        """Split report rows into sections keyed by their heading title.

        Args:
            menu: Iterable of nodes (the children of the report ``tbody``).
                Nodes that are not ``bs4.element.Tag`` and tags without any
                contents are ignored.

        Returns:
            dict: heading title -> list of row tags seen after that heading
            and before the next one.  Heading rows are not stored, and rows
            that appear before the first heading are dropped.

        Note:
            The heading text is looked up in ``row.contents[1]`` (the second
            child node of the row), so a row with exactly one child node
            raises ``IndexError``.
        """
        heading_re = self._HEADING_RE
        menu_dict = defaultdict(list)
        key = None
        for row in menu:
            if not isinstance(row, bs4.element.Tag):
                continue
            cells = row.contents
            if not cells:
                continue

            is_heading = False
            for text in cells[1].stripped_strings:
                match = heading_re.search(text)
                if match:
                    # When several strings match, the last one wins.
                    key = match.group(2)
                    is_heading = True

            if key and not is_heading:
                menu_dict[key].append(row)
        return dict(menu_dict)

