當前位置: 首頁>>代碼示例>>Python>>正文


Python UnicodeDammit.replace方法代碼示例

本文整理匯總了Python中bs4.UnicodeDammit.replace方法的典型用法代碼示例。如果您正苦於以下問題:Python UnicodeDammit.replace方法的具體用法?Python UnicodeDammit.replace怎麼用?Python UnicodeDammit.replace使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在bs4.UnicodeDammit的用法示例。


在下文中一共展示了UnicodeDammit.replace方法的3個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: remove_evernote_link

# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import replace [as 別名]
def remove_evernote_link(link, html):
    """Remove one link from ``html`` along with the markup it leaves behind.

    Both ``html`` and the link text are first normalised to unicode through
    ``UnicodeDammit`` (trying UTF-8 first), so the removal compares text that
    went through the same conversion.

    After the link text is removed, three regex passes tidy up:

    1. ``<li>`` elements that no longer contain any nested tag are dropped.
    2. A tag-free ``<span>`` followed (with no intervening tag) by a separator
       is dropped together with that separator.
    3. Two separators with no tag between them collapse into one.

    :param link: object exposing ``WholeRegexMatch``, the exact link markup to
        remove (presumably the full text of a regex match -- confirm with caller).
    :param html: the HTML document (bytes or text) to clean.
    :return: the cleaned HTML as a unicode string.
    """
    html = UnicodeDammit(html, ["utf-8"], is_html=True).unicode_markup
    link_converted = UnicodeDammit(link.WholeRegexMatch, ["utf-8"], is_html=True).unicode_markup
    sep = u'<span style="color: rgb(105, 170, 53);"> | </span>'
    # escape_regex is defined elsewhere in the file; it is assumed to escape
    # the regex metacharacters in ``sep`` so it can be matched literally.
    sep_regex = escape_regex(sep)
    no_start_tag_regex = r"[^<]*"
    regex_replace = r"<{0}[^>]*>[^<]*{1}[^<]*</{0}>"
    # Remove link. Use the converted text: ``html`` was converted the same way,
    # and mixing raw bytes with unicode here can fail or silently not match.
    html = html.replace(link_converted, "")
    # Remove empty li
    html = re.sub(regex_replace.format("li", no_start_tag_regex), "", html)
    # Remove dangling separator
    regex_span = regex_replace.format("span", no_start_tag_regex) + no_start_tag_regex + sep_regex
    html = re.sub(regex_span, "", html)
    # Remove double separator. The replacement must be the literal separator,
    # not its escaped regex form, otherwise the escape backslashes would be
    # written into the output. A callable replacement bypasses re's own
    # backslash processing of the replacement text.
    html = re.sub(sep_regex + no_start_tag_regex + sep_regex, lambda _match: sep, html)
    return html
開發者ID:holycrepe,項目名稱:anknotes,代碼行數:21,代碼來源:shared.py

示例2: getContent

# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import replace [as 別名]
def getContent(soup, source=''):
    """Pull the main text out of a parsed document tree.

    The tree is first cleaned in place by ``genericCleaning`` and
    ``sourceSpecificcleaning``. Every tag is then scored:

    * +3 if it is a ``p``/``h1``..``h7`` tag whose parent is the document root;
    * +0.5 for each direct ``br`` child, +1 for each direct ``p`` child;
    * +1 once, for the first direct heading child.

    Tags scoring at least 3 contribute their text when it holds more than 20
    characters per score point. If no tag contributes, the longest text among
    tags scoring between 1 and 3 is used instead.

    :param soup: parsed document; must support ``findAll(True)`` and iteration
        over children.
    :param source: identifier forwarded to ``sourceSpecificcleaning``.
    :return: the collected text joined by newlines, passed through
        ``UnicodeDammit`` with smart quotes mapped to ASCII, and with each
        ``'\\n\\n'`` pair replaced by a single newline.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7']
    top_level_tags = ['p'] + heading_tags

    # Cleaning phase
    genericCleaning(soup)
    sourceSpecificcleaning(soup, source)

    # Finding content in the tree
    collected = []
    fallback_elem = None
    fallback_text = ''
    for node in soup.findAll(True):
        points = 0.0
        title_counted = False
        if node.name in top_level_tags and node.parent.name == '[document]':
            points += 3
        for child in node:
            if child.name == 'br':  # business insider style
                points += 0.5
            if child.name == 'p':
                points += 1.0
            if not title_counted and child.name in heading_tags:
                points += 1.0
                title_counted = True

        if points < 1.0:
            continue

        if points >= 3.0:
            text = getText(node)
            # Require more than 20 characters per score point.
            if float(len(text)) / points > 20.0:
                collected.append(text)
            continue

        # 1 <= points < 3: remember the candidate with the longest text.
        candidate = getText(node, False)
        if fallback_elem is None or len(candidate) > len(fallback_text):
            fallback_elem = node
            fallback_text = candidate

    # Nothing reached a score of 3, but something scored 1 or more.
    if not collected and fallback_elem is not None:
        collected.append(fallback_text)

    joined = u'\n'.join(collected)
    finalText = UnicodeDammit(joined, smart_quotes_to='ascii').unicode_markup
    return finalText.replace('\n\n', '\n')
開發者ID:gt-big-data,項目名稱:QDoc,代碼行數:39,代碼來源:articleParser.py

示例3: urlparse

# 需要導入模塊: from bs4 import UnicodeDammit [as 別名]
# 或者: from bs4.UnicodeDammit import replace [as 別名]
# Parse the command line: an optional path to the configuration file
# (or default to 'config.ini')
parser = argparse.ArgumentParser(
    description='Create players for past DF Game Club sessions',
)
parser.add_argument(
    'config',
    nargs='?',
    default='config.ini',
    help='configuration file with past session data',
)
namespace = parser.parse_args()

# Load the selected configuration file; interpolation=None turns off
# configparser's value interpolation.
conf = configparser.ConfigParser(interpolation=None)
conf.read(namespace.config)

for section in conf.sections():
    pastebin_url = PASTEBIN_RAW_PREFIX + \
                    urlparse(conf[section]["pastebin_url"]).path.strip('/')
    log = UnicodeDammit(urlopen(pastebin_url).read()).unicode_markup
    log = log.replace('\r', '').split('\n')

    regex = re.compile(conf[section]['regex'])
    timestamp_format = conf[section]['timestamp_format']
    video_timestamp = conf[section]['video_timestamp']

    twitch_url = urlparse(conf[section]['twitch_url'])\
        .path.strip('/').split('/')
    channel, archive_id = twitch_url[0], twitch_url[2]

    ignore_lines = [
        int(num) for num in [
            item for item in
                conf[section]['ignore_lines'].replace(' ','').split(',')
            if item != ''
        ]
開發者ID:Double-Fine-Game-Club,項目名稱:game-club-player,代碼行數:33,代碼來源:generate.py


注:本文中的bs4.UnicodeDammit.replace方法示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。