當前位置: 首頁>>代碼示例>>Python>>正文


Python html.HtmlElement方法代碼示例

本文整理匯總了Python中lxml.html.HtmlElement方法的典型用法代碼示例。如果您正苦於以下問題:Python html.HtmlElement方法的具體用法?Python html.HtmlElement怎麼用?Python html.HtmlElement使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在lxml.html的用法示例。


在下文中一共展示了html.HtmlElement方法的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: _parse_node

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def _parse_node(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Iterator[Sentence]:
        """Run one lxml node through every per-node parsing stage.

        The section, figure, (optionally) table and caption stages each take
        the current state and hand back the state used by the next stage; the
        paragraph stage is then delegated to for the actual output.

        :param node: The lxml HTML node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: a *generator* of Sentences
        """
        # Stages executed on entry of the node, in their fixed order.
        after_section = self._parse_section(node, state)
        after_figure = self._parse_figure(node, after_section)

        # The table stage is skipped unless tabular parsing is enabled.
        after_table = (
            self._parse_table(node, after_figure) if self.tabular else after_figure
        )

        after_caption = self._parse_caption(node, after_table)

        yield from self._parse_paragraph(node, after_caption)
開發者ID:HazyResearch,項目名稱:fonduer,代碼行數:23,代碼來源:parser.py

示例2: _fragments_from_string

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def _fragments_from_string(html_string):
    """Parse an HTML string into a list of top-level fragments.

    Bare text that appears before the first tag is wrapped in a ``<p>``
    element, and processing instructions found in the parsed document are
    dropped.

    :param html_string: HTML markup to parse.
    :return: list of parsed fragments, or ``[]`` when nothing usable was
        parsed.
    """
    # Function-scope import so the block does not depend on a new file-level
    # import.
    from xml.sax.saxutils import escape

    fragments = html.fragments_fromstring(html_string)
    if not fragments:
        return []
    # convert and append text node before starting tag
    if not isinstance(fragments[0], html.HtmlElement):
        leading_text = fragments[0]
        if leading_text.strip():
            if len(fragments) == 1:
                # The parser hands back *decoded* text, so ``&lt;b&gt;`` has
                # already become ``<b>``. Escape it again before embedding it
                # in new markup, otherwise plain text is re-parsed as tags.
                return html.fragments_fromstring(
                    '<p>%s</p>' % escape(leading_text))
            paragraph = _create_element('p')
            paragraph.text = leading_text
            # Attach the paragraph to the tree as well as to the list, so the
            # document-wide XPath below still sees one consistent document.
            fragments[1].addprevious(paragraph)
            fragments.insert(1, paragraph)

        # The raw text item is either wrapped above or whitespace-only.
        fragments.pop(0)
        if not fragments:
            return []

    # remove xml instructions (if cleaning is disabled)
    for instruction in fragments[0].xpath('//processing-instruction()'):
        instruction.drop_tag()

    return fragments
開發者ID:mercuree,項目名稱:html-telegraph-poster,代碼行數:26,代碼來源:html_to_telegraph.py

示例3: test_scores

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def test_scores(self):
        """Return the test scores as a list of dictionaries.

        Each dictionary has the keys ``name``, ``score``, ``description`` and
        ``date``. An empty list is returned when ``self.xp_test_scores`` is
        not an ``html.HtmlElement``.
        """
        if not isinstance(self.xp_test_scores, html.HtmlElement):
            return []

        entry_count = int(self.get_clean_xpath(
            'count(//div[@id="background-test-scores"]/div[contains(@id, "scores-")])'))

        # NOTE(review): the loop counter is never used and every pass queries
        # the same ``self.xp_test_scores`` node, so all entries come out
        # identical -- confirm whether per-entry nodes were intended.
        collected = []
        for _ in range(entry_count):
            collected.append({
                'name': extract_one(
                    self.get_xp(self.xp_test_scores, './/h4//text()')),
                'score': extract_one(
                    self.get_xp(self.xp_test_scores, './/h5//text()')),
                'description': ' '.join(self.get_xp(
                    self.xp_test_scores, './/p[contains(@class,"description")]//text()')),
                'date': extract_one(self.get_xp(
                    self.xp_test_scores, './/span[@class = "date-range"]/time[1]/text()')),
            })
        return collected
開發者ID:ericfourrier,項目名稱:scrape-linkedin,代碼行數:22,代碼來源:scraper.py

示例4: get_resp

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def get_resp(url):
    """Fetch ``url`` and return the page parsed as an lxml.html.HtmlElement.

    A random entry of ``USER_AGENTS`` is sent as the User-Agent header. When
    the request fails with ``MissingSchema`` the URL is passed through
    ``add_protocol`` and the request is retried once. Any failure is reported
    on stderr and then re-raised unchanged.
    """
    try:
        request_headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            response = requests.get(
                url, headers=request_headers, proxies=get_proxies()
            )
        except MissingSchema:
            # Rebind ``url`` so the error message below shows the URL that was
            # actually requested.
            url = add_protocol(url)
            response = requests.get(
                url, headers=request_headers, proxies=get_proxies()
            )

        if PY2:
            markup = response.text.encode("utf-8")
        else:
            markup = response.text
        return lh.fromstring(markup)
    except Exception:
        sys.stderr.write("Failed to retrieve {0}.\n".format(url))
        raise
開發者ID:huntrar,項目名稱:scrape,代碼行數:15,代碼來源:utils.py

示例5: parse_html

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def parse_html(infile, xpath):
    """Apply an XPath expression to HTML and return the result.

    :param infile: either an ``lh.HtmlElement`` or something that
        ``lh.fromstring`` can parse.
    :param xpath: XPath expression to evaluate.
    :return: the (truthy) result of the XPath evaluation.
    :raises ValueError: when the XPath result is empty/falsy.
    """
    tree = infile if isinstance(infile, lh.HtmlElement) else lh.fromstring(infile)
    matches = tree.xpath(xpath)
    if matches:
        return matches
    raise ValueError("XPath {0} returned no results.".format(xpath))


# URL processing functions
# 
開發者ID:huntrar,項目名稱:scrape,代碼行數:14,代碼來源:utils.py

示例6: extractor

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
        """Return the publish time, trying three sources in priority order.

        :param element: lxml element to search.
        :param publish_time_xpath: optional XPath; when empty, the ``xpath``
            entry under ``publish_time`` in ``config`` is used instead.
        :return: the first truthy result of the strategies below, otherwise
            whatever ``extract_from_text`` returns.
        """
        if not publish_time_xpath:
            publish_time_xpath = config.get('publish_time', {}).get('xpath')

        # First priority: the XPath specified by the user.
        from_user_xpath = self.extract_from_user_xpath(publish_time_xpath, element)
        if from_user_xpath:
            return from_user_xpath

        # Second priority: extract from META.
        from_meta = self.extract_from_meta(element)
        if from_meta:
            return from_meta

        # Worst case: extract from the body text.
        return self.extract_from_text(element)
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:8,代碼來源:TimeExtractor.py

示例7: extract_from_user_xpath

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def extract_from_user_xpath(self, publish_time_xpath: str, element: HtmlElement) -> str:
        """Evaluate a caller-supplied XPath and join its results into a string.

        :param publish_time_xpath: XPath to evaluate; a falsy value skips the
            lookup.
        :param element: lxml element the XPath is evaluated against.
        :return: the concatenated XPath results, or ``''`` when no XPath was
            given.
        """
        if not publish_time_xpath:
            return ''
        return ''.join(element.xpath(publish_time_xpath))
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:7,代碼來源:TimeExtractor.py

示例8: extract_from_text

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def extract_from_text(self, element: HtmlElement) -> str:
        """Search the text under ``element`` with each ``self.time_pattern`` regex.

        :param element: node whose descendant text nodes are joined and
            searched.
        :return: group 1 of the first pattern that matches, or ``''`` when
            none does.
        """
        full_text = ''.join(element.xpath('.//text()'))
        # Lazy generator: patterns after the first hit are never evaluated.
        hits = (re.search(pattern, full_text) for pattern in self.time_pattern)
        return next((hit.group(1) for hit in hits if hit), '')
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:10,代碼來源:TimeExtractor.py

示例9: extract_from_meta

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def extract_from_meta(self, element: HtmlElement) -> str:
        """
        Some well-structured news sites put the publish time of an article in
        META, so the META data should be checked first.

        :param element: DOM tree corresponding to the page source
        :return: str -- the joined result of the first ``PUBLISH_TIME_META``
            XPath that yields anything, or ``''``
        """
        # Lazy generator: XPaths after the first non-empty result are skipped.
        results = (element.xpath(meta_xpath) for meta_xpath in PUBLISH_TIME_META)
        first_hit = next((result for result in results if result), None)
        return ''.join(first_hit) if first_hit else ''
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:13,代碼來源:TimeExtractor.py

示例10: extractor

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def extractor(self, element: HtmlElement, author_xpath=''):
        """Return the author, from an XPath if one is available, else by regex.

        :param element: lxml element to search.
        :param author_xpath: optional XPath; when empty, the ``xpath`` entry
            under ``author`` in ``config`` is used instead.
        :return: the joined XPath results when an XPath is available;
            otherwise group 1 of the first ``self.author_pattern`` regex that
            matches the element's text, or ``''`` when none matches.
        """
        xpath_to_use = author_xpath or config.get('author', {}).get('xpath')
        if xpath_to_use:
            return ''.join(element.xpath(xpath_to_use))

        page_text = ''.join(element.xpath('.//text()'))
        for author_regex in self.author_pattern:
            found = re.search(author_regex, page_text)
            if found is None:
                continue
            return found.group(1)
        return ''
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:13,代碼來源:AuthorExtractor.py

示例11: extract

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
        """Return the stripped title, trying each strategy until one succeeds.

        Strategies are tried in this order: XPath, h-tag combined with title,
        title, h-tag. Later strategies run only when every earlier one
        returned a falsy value.

        :param element: lxml element to search.
        :param title_xpath: optional XPath; when empty, the ``xpath`` entry
            under ``title`` in ``config`` is used instead.
        :return: the chosen title with surrounding whitespace removed.
        """
        if not title_xpath:
            title_xpath = config.get('title', {}).get('xpath')

        title = self.extract_by_xpath(element, title_xpath)
        if not title:
            title = self.extract_by_htag_and_title(element)
        if not title:
            title = self.extract_by_title(element)
        if not title:
            title = self.extract_by_htag(element)
        return title.strip()
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:10,代碼來源:TitleExtractor.py

示例12: normalize_node

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def normalize_node(element: HtmlElement) -> None:
    """Clean up the tree rooted at ``element`` in place.

    Elements whose tag is listed in ``USELESS_TAG`` are stripped first. Then
    every node produced by ``iter_node`` goes through the checks below, in
    order. ``node.tag`` is re-read before each check, so a ``div``/``span``
    that was just retagged to ``p`` is also subject to the empty-``p`` check.

    :param element: root of the subtree to normalize.
    :return: None; the tree is modified directly.
    """
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        # NOTE(review): there is no ``continue`` after this removal, so the
        # remaining checks below still run on the node that was just removed.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # merge text in span or strong to parent p tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # if a div tag does not contain any sub node, it could be converted to p node.
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'

        # same conversion for a span that has no child elements
        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'

        # remove empty p tag; a p that contains an img is kept, and emptiness
        # is judged on the node's own leading text (node.text) only
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        # remove the node when its class attribute contains any entry of
        # USELESS_ATTR (``in`` on the attribute value, i.e. a substring match)
        class_name = node.get('class')
        if class_name:
            for attribute in USELESS_ATTR:
                if attribute in class_name:
                    remove_node(node)
                    break
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:32,代碼來源:utils.py

示例13: iter_node

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def iter_node(element: HtmlElement):
    """Yield ``element`` and then, depth-first, its ``HtmlElement`` descendants.

    Children that are not ``HtmlElement`` instances are skipped together with
    everything below them. Iteration is lazy: the children of a node are only
    visited after that node has been yielded and the consumer resumes.

    :param element: root of the subtree to walk.
    """
    yield element
    for child in element:
        if not isinstance(child, HtmlElement):
            continue
        yield from iter_node(child)
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:7,代碼來源:utils.py

示例14: remove_node

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def remove_node(node: HtmlElement):
    """
    Detach ``node`` from its parent. The tree is modified in place, so
    nothing is returned; a node without a parent is left untouched.

    :param node: the element to remove
    :return: None
    """
    owner = node.getparent()
    if owner is None:
        return
    owner.remove(node)
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:11,代碼來源:utils.py

示例15: drop_tag

# 需要導入模塊: from lxml import html [as 別名]
# 或者: from lxml.html import HtmlElement [as 別名]
def drop_tag(node: HtmlElement):
    """
    Call ``node.drop_tag()`` -- which deletes only the tag and merges its
    text into the parent -- but only when the node has a parent. A node
    without a parent is left untouched.

    :param node: the element whose tag should be dropped
    :return: None
    """
    if node.getparent() is None:
        return
    node.drop_tag()
開發者ID:kingname,項目名稱:GeneralNewsExtractor,代碼行數:11,代碼來源:utils.py


注:本文中的lxml.html.HtmlElement方法示例由純淨天空整理自GitHub/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。