This article collects typical usage examples of the Python method readability.readability.Document.replace. If you have been wondering how Document.replace is used in practice, how to call it, or what real example code looks like, the curated examples here may help. You can also explore further usage examples of the class the method belongs to, readability.readability.Document. Note that in the examples below, replace is in fact the built-in str.replace applied to the HTML string returned by Document.summary() or Document.short_title(), not a method defined on Document itself.
The following presents 14 code examples of Document.replace, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
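All of the examples below follow the same core pattern: build a Document from raw HTML, take the string returned by summary() (or short_title() / title()), and clean it up with str.replace. The following is a minimal, self-contained sketch of that pattern, assuming the readability-lxml and requests packages are installed; the URL and variable names are placeholders, not code taken from any of the examples.

import requests
from readability.readability import Document

url = "https://example.com/article"   # placeholder URL, not taken from the examples
html = requests.get(url).text
doc = Document(html)
summary_html = doc.summary()           # the extracted article as an HTML string
title = doc.short_title()              # the cleaned-up page title
# "Document.replace" in practice: plain str.replace on those strings
clean_text = summary_html.replace('\t', '').replace('\n', ' ')
print(title)
print(clean_text[:200])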
Example 1: fetch_article_contents
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def fetch_article_contents(self):
    """
    Uses Readability.js + BS4 methods to parse raw html list and
    outputs list of text in an article
    """
    for article in self.raw_html:
        article = Document(article).summary()
        article = BeautifulSoup(article)
        [tag.extract() for tag in article.find_all('img')]
        [tag.extract() for tag in article.find_all('embed')]
        article = article.get_text()
        article = unicode(article)
        article = article.replace('\t', '')
        article = article.replace('\n', ' ')
        self.article_html.append(article)
    return self.article_html
Example 2: getTextFromHTML
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def getTextFromHTML(self, url_id):
    """ Runs Readability (Document) on the HTML text
    """
    html_row = get_html(self.pg_conn, url_id)
    if not html_row or 'html' not in html_row:
        return False
    if html_row['readabletext'] and html_row['readabletext'] != '':
        return html_row['readabletext']
    html = html_row['html']
    try:
        html_summary = Document(html).summary(html_partial=True)
        html_summary = html_summary.replace('\n', '').replace('\t', '')
        if len(html_summary) < 150 or "Something's wrong here..." in html_summary or "<h1>Not Found</h1><p>The requested URL" in html_summary or html_summary == "<html><head/></html>" or "403 Forbidden" in html_summary:
            return False
        raw_text = lxml.html.document_fromstring(html_summary).text_content()
    except:
        raw_text = False
    if raw_text:
        save_readabletext(self.pg_conn, url_id, raw_text, 'meta')
    else:
        save_readabletext(self.pg_conn, url_id, '', 'meta')
    return raw_text
Example 3: getText
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def getText():
    dataList = []
    for f in os.listdir('unsupervised\\documents'):
        filePath = 'unsupervised\\documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text) #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
            dataList.append(final)
            #print '*** TITLE *** \n\"' + title + '\"\n'
            #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undectected document type'
        print '' #"-------------------------------"
    return dataList
Example 4: main
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def main():
    #print 'Hello there'
    # Command line args are in sys.argv[1], sys.argv[2] ...
    # sys.argv[0] is the script name itself and can be ignored
    dataList = []
    for f in os.listdir('documents'):
        filePath = 'documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text) #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            # with open(filePath) as f:
            #     doc = slate.PDF(f)
            #     print doc[1]
            #     exit()
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
            dataList.append(final)
            #print '*** TITLE *** \n\"' + title + '\"\n'
            #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undectected document type'
        print '' #"-------------------------------"
    #print dataList
    #for i in dataList:
    #    print i
    cachedStopWords = stopwords.words("english")
    combined = ' '.join(dataList)
    #print combined
    bloblist = [tb(combined)]
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, blob, bloblist) for word in blob.words if word not in nltk.corpus.stopwords.words('english')}
        #print scores
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        #print sorted_words
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Example 5: get_article
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def get_article(url, referrer=None):
    """Fetch the html found at url and use the readability algorithm
    to return just the text content"""
    html = load_url(url, referrer)
    if html is not None:
        doc_html = Document(html).summary(html_partial=True)
        # NOTE: the entity literals below ('&amp;', '&#13;', u'\xa0') are reconstructed
        # best guesses; the original literals were rendered away in this listing
        clean_html = doc_html.replace('&amp;', u'&').replace(u'&#13;', u'\n')
        return BeautifulSoup(clean_html).getText(separator=u' ').replace(u'\xa0', u' ')
Example 6: url_matcher
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def url_matcher(event, url, *args, **kwargs):
    r = requests.head(url)
    # files that are too big cause trouble. Let's just ignore them.
    if 'content-length' in r.headers and \
            int(r.headers['content-length']) > 5e6:
        return
    html = requests.get(url).text
    readable_article = Document(html).summary().encode("utf-8")
    readable_article = TAG_RE.sub('', readable_article)
    readable_article = WHITESPACE_RE.sub(' ', readable_article)
    readable_article = readable_article.replace('\n', ' ')
    # drop leftover non-breaking-space entities (reconstructed literal)
    readable_article = readable_article.replace('&nbsp;', '')
    if len(readable_article) > 75:
        readable_article = readable_article[:75] + '...'
    readable_title = Document(html).short_title().encode("utf-8")
    return "> " + url + " > " + readable_title + " > " + readable_article
Example 7: get_main_text
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def get_main_text(html):
    main_text = Document(html).summary()
    main_text = BeautifulSoup(main_text).getText()
    # collapse runs of blank lines
    r = re.compile(r'\n+', re.M | re.S)
    main_text = r.sub('\n', main_text)
    # strip a leading newline, if any
    if main_text.find('\n') == 0:
        main_text = main_text.replace('\n', '', 1)
    return main_text
Example 8: getTextFromHTML
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def getTextFromHTML(self, html):
    """ Runs Readability (Document) on the HTML text
    """
    try:
        html_summary = Document(html).summary(html_partial=True)
        html_summary = html_summary.replace('\n', '').replace('\t', '')
        if "Something's wrong here..." in html_summary or "<h1>Not Found</h1><p>The requested URL" in html_summary or html_summary == "<html><head/></html>" or "403 Forbidden" in html_summary:
            return False
        raw_text = lxml.html.document_fromstring(html_summary).text_content()
    except:
        raw_text = False
    return raw_text
Example 9: extrat_html_document
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def extrat_html_document(url):
    try:
        print "extrat_html_document"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = urllib2.Request(url, headers=headers)
        socket = urllib2.urlopen(r, timeout=1)
        url = socket.geturl()
        html = socket.read()
        # block_url pass
        for bl_url in block_url:
            if len(url.split(bl_url)) > 1:
                summary = "block"
                return summary
        for ext_url in exception_url:
            if len(url.split(ext_url)) > 1:
                readable_title = Document(html).short_title()
                summary = readable_title.encode('utf-8')
                _file.write(summary + '\n')
                return summary
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        summary = readable_title.encode('utf-8') + readable_title.encode('utf-8')
        print "soup start"
        soup = BeautifulSoup(readable_article.replace("br/", "p"), "html.parser")
        print "summary:"
        for s in soup("p"):
            summary += str(s.encode('utf-8'))
        # summary += readable_article.encode('utf-8')
    except Exception:
        _file.write('extrat_html_document Failed URL : ' + url + '\n')
        summary = "Failed Get data"
    return summary
Example 10: parser_content
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def parser_content(url):
    rt_result = []
    dr = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary().encode('utf8')
    #print readable_article
    # drop leftover non-breaking-space entities (reconstructed literal)
    readable_article = readable_article.replace('&nbsp;', '')
    cur_list = readable_article.split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            #print item.split('src=')[1].split('"')[1]
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    rt_result.append(['0', img.get('src')])
        else:
            use_item = dr.sub('', item).replace(' ', '')
            if len(use_item) > 10:
                rt_result.append(['1', use_item])
    return rt_result
Example 11: get_cleaned_html_from_url
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def get_cleaned_html_from_url(url):
    readable_article = Document(get_html(url)).summary()
    readable_article = readable_article.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c", "\"").replace(u"\u201d", "\"")
    string_out = "<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" /></head>"
    string_out += readable_article[6:]
    return string_out
Example 12: write_readable_text_from_url
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def write_readable_text_from_url(url, out_file):
    readable_article = Document(get_html(url)).summary()
    readable_article = readable_article.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c", "\"").replace(u"\u201d", "\"")
    out_file.write("<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" /></head>")
    out_file.write(readable_article[6:])
Example 13: update
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def update(offset):
    offset = int(offset)
    if offset == 0:
        db.session.query(Entry).delete()
        db.session.commit()
        return ''
    # Obtain bearer token from Twitter
    url = "https://api.twitter.com/oauth2/token"
    consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
    consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
    auth = base64.b64encode(consumer_key + ':' + consumer_secret)
    request = urllib2.Request(url, "grant_type=client_credentials", {"Authorization": "Basic " + auth})
    response = urllib2.urlopen(request).read()
    json_response = json.loads(response)
    access_token = json_response['access_token']
    # Obtain HN posts >100 pts
    url = "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=newsyc100&count=40"
    request = urllib2.Request(url, headers={"Authorization": "Bearer " + access_token})
    response = urllib2.urlopen(request).read()
    tweets = json.loads(response)
    increment = 2
    start_at = (offset - 1) * increment
    tweets = tweets[start_at:start_at + increment]
    for tweet in tweets:
        title = tweet['text']
        start_link = title.rfind("(http")
        end_link = title.find(")", start_link)
        comment_link = title[start_link+1:end_link]
        title = title[0:start_link]
        start_link = title.rfind("http")
        end_link = title.find(" ", start_link)
        link = title[start_link:end_link]
        title = title[0:start_link]
        try:
            response = urllib2.urlopen(link)
        except urllib2.HTTPError:
            continue
        encoding = response.headers['content-type'].split('charset=')[-1]
        if encoding == 'text/html':
            encoding = 'utf-8'
        if encoding == 'application/pdf':
            continue
        html = response.read().decode(encoding, 'ignore')
        if sys.modules.has_key('readability.readability'):
            body = Document(html).summary()
        else:
            body = html
        body = body.replace('<html><body>', '<html><body><a href="' + comment_link + '">HN Comments</a><br>')
        body = body.replace('<body id="readabilityBody">', '')
        entry = Entry(link, title, body)
        db.session.add(entry)
    db.session.commit()
    return ''
示例14: print
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import replace [as 别名]
import requests
from readability.readability import Document

url_in = ""  # put url here
r = requests.get(url_in)
# print(r.status_code)
html = r.text
with open('out.html', 'w') as out_file:
    readable_article = Document(html).summary()
    readable_article = readable_article.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c", "\"").replace(u"\u201d", "\"")
    out_file.write("<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" /></head>")
    out_file.write(readable_article[6:])