本文整理汇总了Python中readability.readability.Document.content方法的典型用法代码示例。如果您正苦于以下问题：Python Document.content方法的具体用法？Python Document.content怎么用？Python Document.content使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类readability.readability.Document的用法示例。
在下文中一共展示了Document.content方法的3个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: recommend_by_url
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import content [as 别名]
def recommend_by_url(url):
    """Return up to 15 entries of ``ARTICLES`` ranked by similarity to ``url``.

    The page at ``url`` is downloaded, its readable content is flattened to
    plain text and split into bigrams, and the result is run through the
    module-level ``dictionary``, ``lsi`` and ``index`` objects to obtain one
    score per entry of ``ARTICLES``.

    Entries are visited from highest to lowest score. An entry is skipped
    when it has no ``url`` key, when its URL was already seen, or when its
    hostname ends with the hostname of ``url`` (same site or a subdomain).

    Each returned entry gets a float ``score``, loses its ``content`` and
    ``html`` keys, and has a non-empty ``summary`` stripped of surrounding
    whitespace.

    NOTE(review): entries are modified in place, so these changes are also
    visible through ``ARTICLES`` after the call -- confirm this is intended.
    """
    # ``hostname`` is None for URLs without a network location; fall back to
    # an empty string so the comparison below cannot raise.
    source_host = urlparse(url).hostname or ''
    page = Document(requests.get(url).content)
    text = html.fromstring(page.content()).xpath('string()')

    vec_bow = dictionary.doc2bow(make_bigrams(text))
    sims = index[lsi[vec_bow]]
    # Negated key keeps the original descending, tie-stable ordering.
    ranked = sorted(enumerate(sims), key=lambda item: -item[1])

    results = []
    seen = set()
    for position, score in ranked:
        res = ARTICLES[position]
        if 'url' not in res or res['url'] in seen:
            continue
        seen.add(res['url'])

        candidate_host = urlparse(res['url']).hostname or ''
        # An empty source host would match every candidate, so only filter
        # when there is a real hostname to compare against.
        if source_host and candidate_host.endswith(source_host):
            continue

        res['score'] = float(score)
        if 'content' in res:
            del res['content']
        if 'html' in res:
            del res['html']
        # The summary may be absent or empty; only strip real text.
        if 'summary' in res and res['summary']:
            res['summary'] = res['summary'].strip()

        results.append(res)
        if len(results) > 14:
            break
    return results
示例2: get_article
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import content [as 别名]
def get_article(d):
    """Download the page at ``d['url']`` and store extracted fields on ``d``.

    Returns immediately when ``table.find_one(url=...)`` reports an existing
    row for the URL. Otherwise the raw response body is stored under
    ``d['html']`` and, if extraction succeeds, ``d['summary']`` and
    ``d['content']`` (both flattened to plain text) and ``d['title']`` are
    filled in.

    Extraction is best effort: any exception raised while parsing is printed
    and swallowed, so ``d`` may end up with only some of those keys. The
    function always returns ``None``; results are communicated by mutating
    ``d``.
    """
    url = d['url']
    if table.find_one(url=url):
        return
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("fetching stuff for %s" % url)
    d['html'] = requests.get(url).content
    try:
        doc = Document(d['html'])
        d['summary'] = html.fromstring(doc.summary()).xpath('string()')
        d['content'] = html.fromstring(doc.content()).xpath('string()')
        d['title'] = doc.title()
    except Exception as e:
        # Deliberately broad: a page that cannot be parsed is reported and
        # skipped rather than aborting the caller.
        print(e)
示例3: make_readable
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import content [as 别名]
def make_readable(url):
    """Fetch ``url`` and return its readability-extracted parts.

    Returns a dict with the keys ``title``, ``summary``, ``content`` and
    ``short_title``, each taken from the same-named ``Document`` method, or
    ``None`` when fetching the page raises ``urllib2.URLError``.
    """
    try:
        response = urllib2.urlopen(url)
        try:
            raw_html = response.read()
        finally:
            # Release the connection even if reading fails part-way.
            response.close()
    except urllib2.URLError:
        return None

    document = Document(raw_html)
    return {
        'title': document.title(),
        'summary': document.summary(),
        'content': document.content(),
        'short_title': document.short_title(),
    }