当前位置: 首页>>代码示例>>Python>>正文


Python html.HtmlElement方法代码示例

本文整理汇总了 Python 中 lxml.html.HtmlElement 方法的典型用法代码示例。如果您正苦于以下问题:Python html.HtmlElement 方法的具体用法?Python html.HtmlElement 怎么用?Python html.HtmlElement 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 lxml.html 的用法示例。


下文一共展示了 html.HtmlElement 方法的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。

示例1: _parse_node

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def _parse_node(
        self, node: HtmlElement, state: Dict[str, Any]
    ) -> Iterator[Sentence]:
        """Run one HTML node through every per-type parsing stage.

        The section, figure, (optional) table and caption stages each receive
        the node plus the current state and hand back the state used by the
        next stage; the paragraph stage finally produces the sentences.

        :param node: The lxml HTML node to parse
        :param state: The global state necessary to place the node in context
            of the document as a whole.
        :return: a *generator* of Sentences
        """
        # Stages that run on entry of the node, each feeding the next one.
        after_section = self._parse_section(node, state)
        after_figure = self._parse_figure(node, after_section)

        # Table handling only takes part when tabular parsing is switched on.
        after_table = (
            self._parse_table(node, after_figure) if self.tabular else after_figure
        )

        after_caption = self._parse_caption(node, after_table)

        yield from self._parse_paragraph(node, after_caption)
开发者ID:HazyResearch,项目名称:fonduer,代码行数:23,代码来源:parser.py

示例2: _fragments_from_string

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def _fragments_from_string(html_string):
    """Parse ``html_string`` into a list of top-level lxml elements.

    ``html.fragments_fromstring`` may return text that precedes the first tag
    as a plain string in position 0.  Non-blank leading text is wrapped in a
    new ``<p>`` element; blank leading text is discarded.

    The paragraph is always built with ``_create_element`` and its ``text``
    attribute, never by formatting the text into markup and re-parsing it, so
    characters such as ``<`` or ``&`` in the text stay literal text.

    :param html_string: HTML source to split into fragments
    :return: list of fragments (empty when nothing remains)
    """
    fragments = html.fragments_fromstring(html_string)
    if not fragments:
        return []

    # convert and append text node before starting tag
    leading = fragments[0]
    if not isinstance(leading, html.HtmlElement):
        if leading.strip():
            paragraph = _create_element('p')
            paragraph.text = leading
            if len(fragments) == 1:
                # The text was the only fragment: the paragraph is the result.
                return [paragraph]
            # Put the paragraph in front of the first real element, both in
            # the tree and in the list being returned.
            fragments[1].addprevious(paragraph)
            fragments.insert(1, paragraph)

        fragments.pop(0)
        if not fragments:
            return []

    # remove xml instructions (if cleaning is disabled)
    for instruction in fragments[0].xpath('//processing-instruction()'):
        instruction.drop_tag()

    return fragments
开发者ID:mercuree,项目名称:html-telegraph-poster,代码行数:26,代码来源:html_to_telegraph.py

示例3: test_scores

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def test_scores(self):
        """Return a list of dictionaries with test scores.

        An empty list is returned when ``self.xp_test_scores`` is not an
        ``html.HtmlElement``.  Otherwise one dictionary with the keys
        ``name``, ``score``, ``description`` and ``date`` is built per counted
        score block.

        NOTE(review): every dictionary is extracted from
        ``self.xp_test_scores`` with the same XPath expressions and the loop
        position is never used, so the entries appear to be identical --
        confirm whether per-entry scoping was intended.
        """
        if not isinstance(self.xp_test_scores, html.HtmlElement):
            return []

        entry_count = int(self.get_clean_xpath(
            'count(//div[@id="background-test-scores"]/div[contains(@id, "scores-")])'))

        scores = []
        for _ in range(entry_count):
            scores.append({
                'name': extract_one(
                    self.get_xp(self.xp_test_scores, './/h4//text()')),
                'score': extract_one(
                    self.get_xp(self.xp_test_scores, './/h5//text()')),
                'description': ' '.join(self.get_xp(
                    self.xp_test_scores, './/p[contains(@class,"description")]//text()')),
                'date': extract_one(self.get_xp(
                    self.xp_test_scores, './/span[@class = "date-range"]/time[1]/text()')),
            })
        return scores
开发者ID:ericfourrier,项目名称:scrape-linkedin,代码行数:22,代码来源:scraper.py

示例4: get_resp

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def get_resp(url):
    """Fetch ``url`` and return the page parsed by ``lh.fromstring``.

    A randomly chosen entry of ``USER_AGENTS`` is sent as the User-Agent
    header and ``get_proxies()`` supplies the proxies for each attempt.  When
    ``requests`` raises ``MissingSchema`` the URL is passed through
    ``add_protocol`` and requested once more.

    Any exception is reported on stderr and then re-raised unchanged.
    """
    try:
        user_agent = random.choice(USER_AGENTS)
        headers = {"User-Agent": user_agent}
        try:
            response = requests.get(url, headers=headers, proxies=get_proxies())
        except MissingSchema:
            # Retry once with the URL rewritten by add_protocol.
            url = add_protocol(url)
            response = requests.get(url, headers=headers, proxies=get_proxies())

        # Under PY2 the parser is handed UTF-8 bytes, otherwise the text as is.
        if PY2:
            markup = response.text.encode("utf-8")
        else:
            markup = response.text
        return lh.fromstring(markup)
    except Exception:
        sys.stderr.write("Failed to retrieve {0}.\n".format(url))
        raise
开发者ID:huntrar,项目名称:scrape,代码行数:15,代码来源:utils.py

示例5: parse_html

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def parse_html(infile, xpath):
    """Evaluate ``xpath`` against ``infile`` and return what it selects.

    :param infile: an ``lh.HtmlElement``; any other value is first parsed
        with ``lh.fromstring``
    :param xpath: the XPath expression to evaluate
    :return: the (truthy) result of the XPath evaluation
    :raises ValueError: if the XPath evaluation yields a falsy result
    """
    if isinstance(infile, lh.HtmlElement):
        tree = infile
    else:
        tree = lh.fromstring(infile)

    matches = tree.xpath(xpath)
    if matches:
        return matches
    raise ValueError("XPath {0} returned no results.".format(xpath))


# URL processing functions
# 
开发者ID:huntrar,项目名称:scrape,代码行数:14,代码来源:utils.py

示例6: extractor

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def extractor(self, element: HtmlElement, publish_time_xpath: str = '') -> str:
        """Return the publish time found for ``element``.

        Sources are tried in order and the first truthy result wins:
        the XPath given by the caller (or, failing that, the ``xpath`` entry
        of the ``publish_time`` config section), then the META data, then the
        page text.

        :param element: DOM tree of the page
        :param publish_time_xpath: optional XPath overriding the configured one
        :return: the result of the first source that produced a truthy value,
            otherwise whatever the text source returned
        """
        xpath_to_use = publish_time_xpath or config.get('publish_time', {}).get('xpath')

        # First priority: the XPath specified by the user.
        publish_time = self.extract_from_user_xpath(xpath_to_use, element)
        if not publish_time:
            # Second priority: extract from META.
            publish_time = self.extract_from_meta(element)
        if not publish_time:
            # Worst case: extract from the body text.
            publish_time = self.extract_from_text(element)
        return publish_time
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:8,代码来源:TimeExtractor.py

示例7: extract_from_user_xpath

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def extract_from_user_xpath(self, publish_time_xpath: str, element: HtmlElement) -> str:
        """Join everything ``publish_time_xpath`` selects in ``element``.

        :param publish_time_xpath: XPath to evaluate; a falsy value disables
            the lookup
        :param element: DOM tree the XPath is evaluated against
        :return: the concatenated XPath results, or ``''`` when no XPath was
            supplied
        """
        if not publish_time_xpath:
            return ''
        return ''.join(element.xpath(publish_time_xpath))
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:7,代码来源:TimeExtractor.py

示例8: extract_from_text

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def extract_from_text(self, element: HtmlElement) -> str:
        """Search the text of ``element`` with each of ``self.time_pattern``.

        :param element: DOM tree whose text nodes are concatenated and searched
        :return: group 1 of the first pattern that matches, or ``''`` when
            none of the patterns match
        """
        page_text = ''.join(element.xpath('.//text()'))
        for pattern in self.time_pattern:
            match = re.search(pattern, page_text)
            if not match:
                continue
            return match.group(1)
        return ''
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:10,代码来源:TimeExtractor.py

示例9: extract_from_meta

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def extract_from_meta(self, element: HtmlElement) -> str:
        """Look the publish time up through the ``PUBLISH_TIME_META`` XPaths.

        Well-structured news sites put the publish time of an article into
        META, so the META data should be checked with priority.

        :param element: DOM tree of the page source
        :return: the joined result of the first XPath that selects anything,
            or ``''`` when none of them does
        """
        for meta_xpath in PUBLISH_TIME_META:
            matches = element.xpath(meta_xpath)
            if not matches:
                continue
            return ''.join(matches)
        return ''
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:13,代码来源:TimeExtractor.py

示例10: extractor

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def extractor(self, element: HtmlElement, author_xpath=''):
        """Return the author found for ``element``.

        When an XPath is available -- the ``author_xpath`` argument or,
        failing that, the ``xpath`` entry of the ``author`` config section --
        its joined result is returned as is, even when it is empty.  Only
        without an XPath is the page text searched with ``self.author_pattern``.

        :param element: DOM tree of the page
        :param author_xpath: optional XPath overriding the configured one
        :return: the joined XPath result, or group 1 of the first matching
            pattern, or ``''`` when no pattern matches
        """
        xpath_to_use = author_xpath or config.get('author', {}).get('xpath')
        if xpath_to_use:
            return ''.join(element.xpath(xpath_to_use))

        page_text = ''.join(element.xpath('.//text()'))
        for candidate in self.author_pattern:
            match = re.search(candidate, page_text)
            if not match:
                continue
            return match.group(1)
        return ''
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:13,代码来源:AuthorExtractor.py

示例11: extract

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def extract(self, element: HtmlElement, title_xpath: str = '') -> str:
        """Return the stripped title found for ``element``.

        Strategies are tried in order and the first truthy result is used:
        ``extract_by_xpath`` (with ``title_xpath`` or, failing that, the
        ``xpath`` entry of the ``title`` config section),
        ``extract_by_htag_and_title``, ``extract_by_title`` and finally
        ``extract_by_htag``.

        :param element: DOM tree of the page
        :param title_xpath: optional XPath overriding the configured one
        :return: the selected title with surrounding whitespace removed
        """
        xpath_to_use = title_xpath or config.get('title', {}).get('xpath')

        title = self.extract_by_xpath(element, xpath_to_use)
        if not title:
            title = self.extract_by_htag_and_title(element)
        if not title:
            title = self.extract_by_title(element)
        if not title:
            title = self.extract_by_htag(element)
        return title.strip()
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:10,代码来源:TitleExtractor.py

示例12: normalize_node

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def normalize_node(element: HtmlElement) -> None:
    """Clean up the tree rooted at ``element`` in place.

    Elements whose tag is listed in ``USELESS_TAG`` are stripped first; then
    every node produced by ``iter_node(element)`` goes through the checks
    below, in the order they are written.  Nothing is returned.

    NOTE(review): nodes are removed or dropped while the tree is still being
    iterated, and a node that has just been removed still flows through the
    remaining checks of the same iteration -- confirm this is intended before
    reordering anything here.

    :param element: root of the tree to normalise
    """
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability: removable tags are taken out when
        # is_empty_element() reports them as empty.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # merge text in span or strong to parent p tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # if a div tag does not contain any sub node, it could be converted to p node.
        if node.tag.lower() == 'div' and not node.getchildren():
            node.tag = 'p'

        # the same conversion is applied to a span tag without sub nodes
        if node.tag.lower() == 'span' and not node.getchildren():
            node.tag = 'p'

        # remove empty p tag: no <img> below it and no non-blank leading text
        # (this also covers nodes that were just renamed to 'p' above)
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        # remove the node when its class attribute contains any USELESS_ATTR
        # entry as a substring; one hit is enough, hence the break
        class_name = node.get('class')
        if class_name:
            for attribute in USELESS_ATTR:
                if attribute in class_name:
                    remove_node(node)
                    break
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:32,代码来源:utils.py

示例13: iter_node

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def iter_node(element: HtmlElement):
    """Yield ``element`` and then, depth first, its ``HtmlElement`` descendants.

    A child that is not an ``HtmlElement`` instance is skipped together with
    everything below it.  The traversal is lazy: children are read from the
    tree only when the generator is advanced.

    :param element: root of the subtree to walk
    :return: a generator of elements, parents before their children
    """
    yield element
    for child in element:
        if not isinstance(child, HtmlElement):
            continue
        yield from iter_node(child)
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:7,代码来源:utils.py

示例14: remove_node

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def remove_node(node: HtmlElement):
    """
    Detach ``node`` from its parent.

    This is an in-place operation, so nothing is returned.  A node without a
    parent is left untouched.

    :param node: the element to remove
    :return: None
    """
    owner = node.getparent()
    if owner is None:
        return
    owner.remove(node)
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:11,代码来源:utils.py

示例15: drop_tag

# 需要导入模块: from lxml import html [as 别名]
# 或者: from lxml.html import HtmlElement [as 别名]
def drop_tag(node: HtmlElement):
    """
    Call ``node.drop_tag()`` for a node that has a parent.

    Only the tag is deleted; its text is merged into the parent.  A node
    without a parent is left untouched.

    :param node: the element whose tag is dropped
    :return: None
    """
    if node.getparent() is None:
        return
    node.drop_tag()
开发者ID:kingname,项目名称:GeneralNewsExtractor,代码行数:11,代码来源:utils.py


注:本文中的lxml.html.HtmlElement方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。