Python lxml.html方法代码示例

本文整理汇总了Python中lxml.html方法的典型用法代码示例。如果您正苦于以下问题：Python lxml.html方法的具体用法？Python lxml.html怎么用？Python lxml.html使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块lxml的用法示例。

在下文中一共展示了lxml.html方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: get_tags

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def get_tags(doc):
    """
    Flatten a parsed document into the ordered list of its node labels.

    Every node yielded by ``doc.getroot().iter()`` is visited in order.
    Element nodes contribute their tag name; comment nodes contribute the
    literal string ``'comment'``.

    :param doc: lxml parsed object exposing ``getroot()``
    :return: list of labels, one per visited node
    :raises ValueError: when a node is neither an ``HtmlElement`` nor an
        ``HtmlComment``
    """
    def label_for(node):
        # Elements are checked first, then comments; anything else is rejected.
        if isinstance(node, lxml.html.HtmlElement):
            return node.tag
        if isinstance(node, lxml.html.HtmlComment):
            return 'comment'
        raise ValueError('Don\'t know what to do with element: {}'.format(node))

    return [label_for(node) for node in doc.getroot().iter()]

开发者ID:matiskay，项目名称:html-similarity，代码行数:20，代码来源:structural_similarity.py

示例2: structural_similarity

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def structural_similarity(document_1, document_2):
    """
    Score how alike two HTML documents are by comparing their tag sequences.

    Both strings are parsed with ``lxml.html.parse``, reduced to label lists
    with ``get_tags`` and compared using ``difflib.SequenceMatcher``.

    :param document_1: html string
    :param document_2: html string
    :return: the ``SequenceMatcher.ratio()`` of the two label lists, or ``0``
        when parsing either document raises (the exception is printed)
    """
    try:
        tree_1 = lxml.html.parse(StringIO(document_1))
        tree_2 = lxml.html.parse(StringIO(document_2))
    except Exception as e:
        # Best-effort: report the parse problem and treat it as "no similarity".
        print(e)
        return 0

    labels_1 = get_tags(tree_1)
    labels_2 = get_tags(tree_2)
    matcher = difflib.SequenceMatcher(None, labels_1, labels_2)

    return matcher.ratio()

开发者ID:matiskay，项目名称:html-similarity，代码行数:23，代码来源:structural_similarity.py

示例3: testParseCurrencyListAll

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def testParseCurrencyListAll(self):
        """Test parseCurrencyListAll.

        Reads ``example/currencylist.html`` from the directory of this file,
        feeds it to ``parseCurrencyListAll`` and checks the number of parsed
        entries as well as the first and last entry.
        """
        fixture_path = "{0}/example/currencylist.html".format(
            os.path.dirname(os.path.abspath(__file__)))
        # The context manager closes the fixture even if read() raises,
        # which the previous manual open/close pair did not guarantee.
        with codecs.open(fixture_path, 'r', 'utf-8') as f:
            html = f.read()
        data = parseCurrencyListAll(html)
        self.assertEqual(len(data), 452)
        expectedFirst = {
            'name': 'Bitcoin',
            'slug': 'bitcoin',
            'symbol': 'BTC',
            'explorer_link': 'http://blockchain.info'
        }
        self.assertEqual(data[0], expectedFirst)
        expectedLast = {
            'name': 'Marscoin',
            'slug': 'marscoin',
            'symbol': 'MRS',
            'explorer_link': 'http://explore.marscoin.org/chain/Marscoin/'
        }
        self.assertEqual(data[-1], expectedLast)

开发者ID:gogogoutham，项目名称:coinmarketcap-scraper，代码行数:24，代码来源:coinmarketcap.py

示例4: remove_html_encode_errors

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def remove_html_encode_errors(self, headers, error):
        """
        Strip an unwanted substring from the text of every header cell.

        Intended for leftovers such as html special characters, encoding
        errors or other unicode text. The cells are modified in place.

        :param headers: list of header rows; each row is an iterable of cells
            whose text is stored under the 'th' key
        :param error: unicode string to delete from each cell's text
        :return: nothing
        """
        for cells in headers:
            for cell in cells:
                # Rewrite the cell text with every occurrence of 'error' removed
                original_text = cell['th']
                cell['th'] = original_text.replace(error, u'')

开发者ID:dbpedia，项目名称:table-extractor，代码行数:18，代码来源:HtmlTableParser.py

示例5: url_composer

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def url_composer(self, query, service):
        """
        This function is used to compose a url to call some web services, such as sparql endpoints.

        :param query: is the string used in some rest calls.
        :param service: type of service you request ('dbpedia' for the dbpedia sparql endpoint,
            'html' for the html service)
        :return url: the url composed, or the string "ERROR" when the service is not recognised
        """
        # quote_plus moved to urllib.parse in Python 3; fall back to the
        # Python 2 location so the method works on both interpreters.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus

        # encode special characters (required by the web services)
        query = quote_plus(query)

        # The branches below differ by the service requested (Eg. 'dbpedia'),
        # but in every case the url is a pre formatted string combined with the query.
        if service == 'dbpedia':
            url = self.dbpedia_sparql_url + query + self.call_format_sparql

        elif service == 'html':
            url = self.html_format + query

        else:
            url = "ERROR"
        return url

开发者ID:dbpedia，项目名称:table-extractor，代码行数:26，代码来源:Utilities.py

示例6: rss_item_to_relevant_data

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def rss_item_to_relevant_data(self, item):
        """
        Reduce a single RSS item to the data relevant for the collector.

        This implementation is only a placeholder: it always raises
        NotImplementedError, so a concrete subclass of BaseRSSCollector
        has to provide the real extraction.

        Args:
            `item`:
                One item of the RSS feed, i.e. one element of the
                list produced by a
                `<lxml etree/html document>.xpath(...)` call
                (the _process_rss() method is where to look for
                that call).

        Returns:
            A hashable object -- for instance a tuple or a
            string; the concrete type is up to the subclass
            implementation.

        Raises:
            NotImplementedError: always, in this implementation.
        """
        raise NotImplementedError()

开发者ID:CERT-Polska，项目名称:n6，代码行数:21，代码来源:generic.py

示例7: test_terms_of_service

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_terms_of_service(self) -> None:
        # For each stored tos_version (unset, '1.1', '2.0.3.4') the home page,
        # fetched while TOS_VERSION is overridden to '99.99', must contain the
        # 'Accept the new Terms of Service' text.
        hamlet = self.example_user('hamlet')
        self.login_user(hamlet)

        for stored_version in (None, '1.1', '2.0.3.4'):
            hamlet.tos_version = stored_version
            hamlet.save()

            # Both overrides are only active while the page is requested.
            with self.settings(TERMS_OF_SERVICE='whatever'):
                with self.settings(TOS_VERSION='99.99'):
                    response = self.client_get('/', {'stream': 'Denmark'})

            page = response.content.decode('utf-8')
            self.assertIn('Accept the new Terms of Service', page)

开发者ID:zulip，项目名称:zulip，代码行数:18，代码来源:test_home.py

示例8: test_invites_by_admins_only

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_invites_by_admins_only(self) -> None:
        # With invite_by_admins_only enabled on the realm, the home page must
        # hide 'Invite more users' from a non-admin and show it once the same
        # user has the realm administrator role.
        hamlet = self.example_user('hamlet')

        hamlet_realm = hamlet.realm
        hamlet_realm.invite_by_admins_only = True
        hamlet_realm.save()

        self.login_user(hamlet)
        self.assertFalse(hamlet.is_realm_admin)
        page_as_member = self._get_home_page().content.decode('utf-8')
        self.assertNotIn('Invite more users', page_as_member)

        hamlet.role = UserProfile.ROLE_REALM_ADMINISTRATOR
        hamlet.save()
        page_as_admin = self._get_home_page().content.decode('utf-8')
        self.assertIn('Invite more users', page_as_admin)

开发者ID:zulip，项目名称:zulip，代码行数:20，代码来源:test_home.py

示例9: test_quotation_splitter_inside_blockquote

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_quotation_splitter_inside_blockquote():
    # The "On ... wrote:" line sits inside the blockquote itself; after
    # extraction (and removal of RE_WHITESPACE matches) only "Reply" is
    # expected to remain in the body.
    html_message = """Reply
<blockquote>

  <div>
    On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
  </div>

  <div>
    Test
  </div>

</blockquote>"""

    extracted = quotations.extract_from_html(html_message)
    compact = RE_WHITESPACE.sub('', extracted)
    eq_("<html><head></head><body>Reply</body></html>", compact)

开发者ID:mailgun，项目名称:talon，代码行数:18，代码来源:html_quotations_test.py

示例10: test_regular_blockquote

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_regular_blockquote():
    # A blockquote that precedes the "On ... wrote:" line is expected to
    # survive extraction, while the div with that line and the blockquote
    # after it are expected to be dropped.
    html_message = """Reply
<blockquote>Regular</blockquote>

<div>
  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:
</div>

<blockquote>
  <div>
    <blockquote>Nested</blockquote>
  </div>
</blockquote>
"""
    extracted = quotations.extract_from_html(html_message)
    compact = RE_WHITESPACE.sub('', extracted)
    eq_("<html><head></head><body>Reply<blockquote>Regular</blockquote></body></html>",
        compact)

开发者ID:mailgun，项目名称:talon，代码行数:18，代码来源:html_quotations_test.py

示例11: test_validate_output_html

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_validate_output_html():
    # The extracted output must be wrapped in <html>...</html> and must not
    # contain the self-closing <div/> that is present in the input.
    html_message = """Reply
<div>
  On 11-Apr-2011, at 6:54 PM, Bob &lt;bob@example.com&gt; wrote:

    <blockquote>
      <div>
        Test
      </div>
    </blockquote>
</div>

<div/>
"""
    result = quotations.extract_from_html(html_message)

    has_html_wrapper = '<html>' in result and '</html>' in result
    ok_(has_html_wrapper,
        'Invalid HTML - <html>/</html> tag not present')

    self_closing_div_absent = '<div/>' not in result
    ok_(self_closing_div_absent,
        'Invalid HTML output - <div/> element is not valid')

开发者ID:mailgun，项目名称:talon，代码行数:21，代码来源:html_quotations_test.py

示例12: test_date_block

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_date_block():
    # The inner div starting with <hr> and the Date/To/From/Subject lines is
    # expected to be removed, leaving only "message<br>" in the outer div.
    html_message = """
<div>
  message<br>
  <div>
    <hr>
    Date: Fri, 23 Mar 2012 12:35:31 -0600<br>
    To: <a href="mailto:bob@example.com">bob@example.com</a><br>
    From: <a href="mailto:rob@example.com">rob@example.com</a><br>
    Subject: You Have New Mail From Mary!<br><br>

    text
  </div>
</div>
"""
    extracted = quotations.extract_from_html(html_message)
    compact = RE_WHITESPACE.sub('', extracted)
    eq_('<html><head></head><body><div>message<br></div></body></html>',
        compact)

开发者ID:mailgun，项目名称:talon，代码行数:19，代码来源:html_quotations_test.py

示例13: test_from_block_and_quotations_in_separate_divs

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_from_block_and_quotations_in_separate_divs():
    # The From/Date block and the quoted message live in sibling divs; both
    # are expected to be removed, leaving "Reply" and the div with the <hr>.
    html_message = '''
Reply
<div>
  <hr/>
  <div>
    <font>
      <b>From: bob@example.com</b>
      <b>Date: Thu, 24 Mar 2016 08:07:12 -0700</b>
    </font>
  </div>
  <div>
    Quoted message
  </div>
</div>
'''
    extracted = quotations.extract_from_html(html_message)
    compact = RE_WHITESPACE.sub('', extracted)
    eq_('<html><head></head><body>Reply<div><hr></div></body></html>',
        compact)

开发者ID:mailgun，项目名称:talon，代码行数:20，代码来源:html_quotations_test.py

示例14: test_remove_namespaces

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def test_remove_namespaces():
    # Input uses the Office "o:" prefix on paragraphs; the rendered output
    # is checked for plain <p> tags and for the absence of the prefixed form.
    html_message = """
    <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns="http://www.w3.org/TR/REC-html40">
        <body>
            <o:p>Dear Sir,</o:p>
            <o:p>Thank you for the email.</o:p>
            <blockquote>thing</blockquote>
        </body>
    </html>
    """

    rendered = quotations.extract_from_html(html_message)

    # Fragments that must be present, checked in this order.
    for expected_fragment in ("<p>", "xmlns"):
        assert_true(expected_fragment in rendered)

    # Fragments that must be absent, checked in this order.
    for forbidden_fragment in ("<o:p>", "<xmlns:o>"):
        assert_true(forbidden_fragment not in rendered)

开发者ID:mailgun，项目名称:talon，代码行数:20，代码来源:html_quotations_test.py

示例15: scrape

# 需要导入模块: import lxml [as 别名]
# 或者: from lxml import html [as 别名]
def scrape():
    """
    Download the page at URL and turn its first table into a DataFrame.

    For every table row the text of the 1st, 2nd and 4th cells is passed
    through ``parse_cell``. The first row is skipped, as are empty rows and
    rows whose first value is "", "States" or "Territories". The columns
    named in INT_COLS are converted to int after removing commas and
    leading/trailing asterisks.

    :return: DataFrame with the columns given by COLS
    """
    # A random "_" query parameter is sent with every request.
    response = requests.get(URL, params={"_": random.random()})
    dom = lxml.html.fromstring(response.content)

    first_table = dom.cssselect("table")[0]
    cell_selector = "td:nth-child(1), td:nth-child(2), td:nth-child(4)"

    rows = []
    for tr in first_table.cssselect("tr"):
        cells = [parse_cell(td.text_content()) for td in tr.cssselect(cell_selector)]
        rows.append(cells)

    skipped_labels = ["", "States", "Territories"]
    data = []
    for row in rows[1:]:
        if len(row) and row[0] not in skipped_labels:
            data.append(row)

    df = pd.DataFrame(data, columns=COLS)
    for c in INT_COLS:
        without_commas = df[c].str.replace(",", "")
        df[c] = without_commas.str.strip("*").astype(int)

    return df

开发者ID:BuzzFeedNews，项目名称:zika-data，代码行数:23，代码来源:scrape-cdc-state-case-counts.py

注：本文中的lxml.html方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。