This article collects typical usage examples of the Python method readability.readability.Document.replace. If you have been wondering how Document.replace is used in practice, how to call it, or what real example code looks like, the curated examples here may help. You can also explore further usage examples of the class the method belongs to, readability.readability.Document. Note that in the examples below, replace is in fact the built-in str.replace applied to the HTML string returned by Document.summary() or Document.short_title(), not a method defined on Document itself.
The following presents 14 code examples of Document.replace, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
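All of the examples below follow the same core pattern: build a Document from raw HTML, take the string returned by summary() (or short_title() / title()), and clean it up with str.replace. The following is a minimal, self-contained sketch of that pattern, assuming the readability-lxml and requests packages are installed; the URL and variable names are placeholders, not code taken from any of the examples.

import requests
from readability.readability import Document

url = "https://example.com/article"   # placeholder URL, not taken from the examples
html = requests.get(url).text
doc = Document(html)
summary_html = doc.summary()           # the extracted article as an HTML string
title = doc.short_title()              # the cleaned-up page title
# "Document.replace" in practice: plain str.replace on those strings
clean_text = summary_html.replace('\t', '').replace('\n', ' ')
print(title)
print(clean_text[:200])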
Example 1: fetch_article_contents
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def fetch_article_contents(self):
    """
    Uses Readability.js + BS4 methods to parse raw html list and
    outputs list of text in an article
    """
    for article in self.raw_html:
        article = Document(article).summary()
        article = BeautifulSoup(article)
        [tag.extract() for tag in article.find_all('img')]
        [tag.extract() for tag in article.find_all('embed')]
        article = article.get_text()
        article = unicode(article)
        article = article.replace('\t', '')
        article = article.replace('\n', ' ')
        self.article_html.append(article)
    return self.article_html
Example 2: getTextFromHTML
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def getTextFromHTML(self, url_id):
    """ Runs Readability (Document) on the HTML text
    """
    html_row = get_html(self.pg_conn, url_id)
    if not html_row or 'html' not in html_row:
        return False
    if html_row['readabletext'] and html_row['readabletext'] != '':
        return html_row['readabletext']
    html = html_row['html']
    try:
        html_summary = Document(html).summary(html_partial=True)
        html_summary = html_summary.replace('\n', '').replace('\t', '')
        if len(html_summary) < 150 or "Something's wrong here..." in html_summary or "<h1>Not Found</h1><p>The requested URL" in html_summary or html_summary == "<html><head/></html>" or "403 Forbidden" in html_summary:
            return False
        raw_text = lxml.html.document_fromstring(html_summary).text_content()
    except:
        raw_text = False
    if raw_text:
        save_readabletext(self.pg_conn, url_id, raw_text, 'meta')
    else:
        save_readabletext(self.pg_conn, url_id, '', 'meta')
    return raw_text
Example 3: getText
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def getText():
    dataList = []
    for f in os.listdir('unsupervised\\documents'):
        filePath = 'unsupervised\\documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text) #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
            dataList.append(final)
            #print '*** TITLE *** \n\"' + title + '\"\n'
            #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undectected document type'
        print '' #"-------------------------------"
    return dataList
Example 4: main
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def main():
    #print 'Hello there'
    # Command line args are in sys.argv[1], sys.argv[2] ...
    # sys.argv[0] is the script name itself and can be ignored
    dataList = []
    for f in os.listdir('documents'):
        filePath = 'documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text) #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            # with open(filePath) as f:
            #     doc = slate.PDF(f)
            #     print doc[1]
            #     exit()
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
            dataList.append(final)
            #print '*** TITLE *** \n\"' + title + '\"\n'
            #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undectected document type'
        print '' #"-------------------------------"
    #print dataList
    #for i in dataList:
    #    print i
    cachedStopWords = stopwords.words("english")
    combined = ' '.join(dataList)
    #print combined
    bloblist = [tb(combined)]
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, blob, bloblist) for word in blob.words if word not in nltk.corpus.stopwords.words('english')}
        #print scores
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        #print sorted_words
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))
Example 5: get_article
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def get_article(url, referrer=None):
    """Fetch the html found at url and use the readability algorithm
    to return just the text content"""
    html = load_url(url, referrer)
    if html is not None:
        doc_html = Document(html).summary(html_partial=True)
        # NOTE: the entity literals below ('&amp;', '&#13;', u'\xa0') are reconstructed
        # best guesses; the original literals were rendered away in this listing
        clean_html = doc_html.replace('&amp;', u'&').replace(u'&#13;', u'\n')
        return BeautifulSoup(clean_html).getText(separator=u' ').replace(u'\xa0', u' ')
Example 6: url_matcher
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def url_matcher(event, url, *args, **kwargs):
    r = requests.head(url)
    # files that are too big cause trouble. Let's just ignore them.
    if 'content-length' in r.headers and \
            int(r.headers['content-length']) > 5e6:
        return
    html = requests.get(url).text
    readable_article = Document(html).summary().encode("utf-8")
    readable_article = TAG_RE.sub('', readable_article)
    readable_article = WHITESPACE_RE.sub(' ', readable_article)
    readable_article = readable_article.replace('\n', ' ')
    # drop leftover non-breaking-space entities (reconstructed literal)
    readable_article = readable_article.replace('&nbsp;', '')
    if len(readable_article) > 75:
        readable_article = readable_article[:75] + '...'
    readable_title = Document(html).short_title().encode("utf-8")
    return "> " + url + " > " + readable_title + " > " + readable_article
Example 7: get_main_text
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def get_main_text(html):
    main_text = Document(html).summary()
    main_text = BeautifulSoup(main_text).getText()
    # collapse runs of blank lines
    r = re.compile(r'\n+', re.M | re.S)
    main_text = r.sub('\n', main_text)
    # strip a leading newline, if any
    if main_text.find('\n') == 0:
        main_text = main_text.replace('\n', '', 1)
    return main_text
Example 8: getTextFromHTML
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def getTextFromHTML(self, html):
    """ Runs Readability (Document) on the HTML text
    """
    try:
        html_summary = Document(html).summary(html_partial=True)
        html_summary = html_summary.replace('\n', '').replace('\t', '')
        if "Something's wrong here..." in html_summary or "<h1>Not Found</h1><p>The requested URL" in html_summary or html_summary == "<html><head/></html>" or "403 Forbidden" in html_summary:
            return False
        raw_text = lxml.html.document_fromstring(html_summary).text_content()
    except:
        raw_text = False
    return raw_text
Example 9: extrat_html_document
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def extrat_html_document(url):
    try:
        print "extrat_html_document"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = urllib2.Request(url, headers=headers)
        socket = urllib2.urlopen(r, timeout=1)
        url = socket.geturl()
        html = socket.read()
        # block_url pass
        for bl_url in block_url:
            if len(url.split(bl_url)) > 1:
                summary = "block"
                return summary
        for ext_url in exception_url:
            if len(url.split(ext_url)) > 1:
                readable_title = Document(html).short_title()
                summary = readable_title.encode('utf-8')
                _file.write(summary + '\n')
                return summary
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        summary = readable_title.encode('utf-8') + readable_title.encode('utf-8')
        print "soup start"
        soup = BeautifulSoup(readable_article.replace("br/", "p"), "html.parser")
        print "summary:"
        for s in soup("p"):
            summary += str(s.encode('utf-8'))
        # summary += readable_article.encode('utf-8')
    except Exception:
        _file.write('extrat_html_document Failed URL : ' + url + '\n')
        summary = "Failed Get data"
    return summary
Example 10: parser_content
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def parser_content(url):
    rt_result = []
    dr = re.compile(r'<[^>]+>', re.S)
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary().encode('utf8')
    #print readable_article
    # drop leftover non-breaking-space entities (reconstructed literal)
    readable_article = readable_article.replace('&nbsp;', '')
    cur_list = readable_article.split('\n')
    for item in cur_list:
        if '<img' in item and 'src=' in item:
            #print item.split('src=')[1].split('"')[1]
            dom = soupparser.fromstring(item)
            if len(dom) > 0:
                img_path = dom[0].xpath('.//img')
                for img in img_path:
                    rt_result.append(['0', img.get('src')])
        else:
            use_item = dr.sub('', item).replace(' ', '')
            if len(use_item) > 10:
                rt_result.append(['1', use_item])
    return rt_result
Example 11: get_cleaned_html_from_url
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def get_cleaned_html_from_url(url):
    readable_article = Document(get_html(url)).summary()
    readable_article = readable_article.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c", "\"").replace(u"\u201d", "\"")
    string_out = "<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" /></head>"
    string_out += readable_article[6:]
    return string_out
Example 12: write_readable_text_from_url
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def write_readable_text_from_url(url, out_file):
    readable_article = Document(get_html(url)).summary()
    readable_article = readable_article.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c", "\"").replace(u"\u201d", "\"")
    out_file.write("<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" /></head>")
    out_file.write(readable_article[6:])
Example 13: update
# Module required: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import replace [as alias]
def update(offset):
    offset = int(offset)
    if offset == 0:
        db.session.query(Entry).delete()
        db.session.commit()
        return ''
    # Obtain bearer token from Twitter
    url = "https://api.twitter.com/oauth2/token"
    consumer_key = os.environ.get('TWITTER_CONSUMER_KEY')
    consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET')
    auth = base64.b64encode(consumer_key + ':' + consumer_secret)
    request = urllib2.Request(url, "grant_type=client_credentials", {"Authorization": "Basic " + auth})
    response = urllib2.urlopen(request).read()
    json_response = json.loads(response)
    access_token = json_response['access_token']
    # Obtain HN posts >100 pts
    url = "https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=newsyc100&count=40"
    request = urllib2.Request(url, headers={"Authorization": "Bearer " + access_token})
    response = urllib2.urlopen(request).read()
    tweets = json.loads(response)
    increment = 2
    start_at = (offset - 1) * increment
    tweets = tweets[start_at:start_at + increment]
    for tweet in tweets:
        title = tweet['text']
        start_link = title.rfind("(http")
        end_link = title.find(")", start_link)
        comment_link = title[start_link+1:end_link]
        title = title[0:start_link]
        start_link = title.rfind("http")
        end_link = title.find(" ", start_link)
        link = title[start_link:end_link]
        title = title[0:start_link]
        try:
            response = urllib2.urlopen(link)
        except urllib2.HTTPError:
            continue
        encoding = response.headers['content-type'].split('charset=')[-1]
        if encoding == 'text/html':
            encoding = 'utf-8'
        if encoding == 'application/pdf':
            continue
        html = response.read().decode(encoding, 'ignore')
        if sys.modules.has_key('readability.readability'):
            body = Document(html).summary()
        else:
            body = html
        body = body.replace('<html><body>', '<html><body><a href="' + comment_link + '">HN Comments</a><br>')
        body = body.replace('<body id="readabilityBody">', '')
        entry = Entry(link, title, body)
        db.session.add(entry)
    db.session.commit()
    return ''
示例14: print
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import replace [as 别名]
import requests
from readability.readability import Document

url_in = ""  # put url here
r = requests.get(url_in)
# print(r.status_code)
html = r.text
with open('out.html', 'w') as out_file:
    readable_article = Document(html).summary()
    readable_article = readable_article.replace(u"\u2018", "'").replace(u"\u2019", "'").replace(u"\u201c", "\"").replace(u"\u201d", "\"")
    out_file.write("<!DOCTYPE html>\n<html><head><meta http-equiv=\"Content-Type\" content=\"text/html;charset=utf-8\" /></head>")
    out_file.write(readable_article[6:])