本文整理汇总了Python中readability.readability.Document.short_title方法的典型用法代码示例。如果您正苦于以下问题:Python Document.short_title方法的具体用法?Python Document.short_title怎么用?Python Document.short_title使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类readability.readability.Document
的用法示例。
在下文中一共展示了Document.short_title方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
# Required import: from readability.readability import Document
def main():
    """Parse a saved sample page and exercise the readability Document API.

    Reads the sample HTML file, then extracts the publish date, short
    title and main text content via readability.
    """
    # Context manager closes the handle even on error; the original
    # leaked it via open(...).read().
    with open('./samples/21853124_0.shtml') as f:
        html = f.read()
    doc = Document(html)
    doc.transform()
    doc.get_publish_date()
    doc.short_title()
    doc.text_content()
示例2: read_command
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def read_command(api, args):
from readability.readability import Document
import html2text
h = html2text.HTML2Text()
h.inline_links = False
h.ignore_images = True
h.ignore_emphasis = True
res = requests.get(args.url)
if res.ok:
article = Document(res.content)
print article.short_title()
print h.handle(article.summary())
else:
print res.headers['status']
示例3: strip_chapter
# Required import: from readability.readability import Document
def strip_chapter(self, html):
    """Extract (title, content_html) for a chapter using readability.

    Falls back to the <div> named by self.main_content_div when the
    readability summary is too short to be useful.

    :param html: str
    :return: tuple of (short title, content HTML)
    """
    doc = Document(html)
    summary = doc.summary()
    title = doc.short_title()
    if len(summary) > 20:
        body = str(summary).replace('<html>', '<html><head><meta charset="utf-8"></head>')
        return title, body
    # Readability produced almost nothing: grab the main content div directly.
    main_div = BeautifulSoup(html, 'html.parser').find_all('div', class_=self.main_content_div)[0]
    content = '<html><head><meta charset="utf-8"></head>' + str(main_div) + '</html>'
    return title, content
示例4: extract_article
# Required import: from readability.readability import Document
def extract_article(url):
    """Download *url* and return readability-extracted article data.

    Returns a dict with 'title', 'html', 'content' and 'url' keys, or
    an empty dict when the page could not be fetched (non-200 status).
    """
    response = requests.get(url)
    if response.status_code != 200:
        # Fetch failed: signal with an empty dict.
        return {}
    # Parse the final (possibly redirected) URL.
    final_url = parse_url(response.url)
    page = response.content.decode('utf-8', errors='ignore')
    # Run readability over the decoded page.
    extractor = Document(page)
    article_html = extractor.summary()
    return {
        'title': extractor.short_title(),
        'html': article_html,
        'content': strip_tags(article_html).encode('utf-8', errors='ignore'),
        'url': final_url,
    }
示例5: markdownify
# Required import: from readability.readability import Document
def markdownify(url_list, **options):
    """Fetch each URL, extract the readable article, join as Markdown (Python 2).

    Required options: 'paragraph_links', 'wrap_text', 'preamble'.
    Returns UTF-8 encoded bytes.

    NOTE(review): when 'preamble' is set, the header uses the title of
    the *last* fetched page together with the *first* URL — presumably
    intended for single-URL lists; confirm for multi-URL input.
    """
    paragraph_links = options['paragraph_links']
    wrap_text = options['wrap_text']
    preamble = options['preamble']
    articles = []
    images = []
    for url in url_list:
        # The first URL doubles as the Referer for every request.
        request = urllib2.Request(url, None, {'Referer': url_list[0]})
        html = urllib2.urlopen(request).read()
        document = Document(html, url=url)
        readable_title = document.short_title()
        summary_doc = build_doc(document.summary())
        images.extend(img.get('src') for img in summary_doc.findall('.//img'))
        articles.append(document.summary())
    markdown_articles = []
    for article, url in zip(articles, url_list):
        converter = html2text.HTML2Text(baseurl=url)
        converter.inline_links = False
        converter.links_each_paragraph = 1 if paragraph_links else 0
        converter.body_width = 78 if wrap_text else 0
        markdown_articles.append(converter.handle(article))
    combined = u"\n\n----\n\n".join(markdown_articles)
    if preamble:
        combined = (u"Title: %s \nOriginal URL: %s\n\n" % (readable_title, url_list[0])) + combined
    return combined.encode("utf-8")
示例6: set
# Required import: from readability.readability import Document
class Gist:
    """Readable-article wrapper exposing a title, plain text and keywords."""

    # Keywords must contain no digits at all.
    keyword_pattern = re.compile(r'^[^\d]+$')
    stop_words = set(get_stop_words('en'))

    def __init__(self, html):
        self.html = html
        self.document = Document(html)

    @property
    def title(self):
        """Short title extracted by readability."""
        return self.document.short_title()

    @cached_property
    def text(self):
        """Plain text of the article: readability summary with markup stripped."""
        body = self.document.summary()
        body = re.sub('<br[^>]+>', '\n', body)
        body = re.sub('</?p[^>]+>', '\n\n', body)
        body = re.sub('<[^>]+>', '', body)
        body = re.sub('^[ \t]+$', '', body)
        body = re.sub('\n{3,}', '\n\n', body, flags=re.MULTILINE)
        return body

    @staticmethod
    def _common_prefix(one, two):
        """Length of the common leading run of *one* and *two*."""
        matches = [x == y for x, y in zip(one, two)]
        matches.append(False)  # sentinel guarantees index() succeeds
        return matches.index(False)

    @classmethod
    def _find_representative(cls, stem, text):
        """Pick the token of *text* that best represents *stem*.

        Best = longest shared prefix with the stem, ties broken by
        shortest token.
        """
        tokens = text.split()
        prefixes = {token: cls._common_prefix(token, stem) for token in tokens}
        return min(prefixes.items(), key=lambda item: (-item[1], len(item[0])))[0]

    @classmethod
    def _is_good_keyword(cls, word):
        """Truthy for words that are not stop words and contain no digits."""
        return (word not in cls.stop_words) and cls.keyword_pattern.match(word)

    @classmethod
    def find_keywords(cls, text):
        """Suggest up to 10 keywords for *text* via the Whoosh search backend."""
        whoosh_backend = SearchForm().searchqueryset.query.backend
        if not whoosh_backend.setup_complete:
            whoosh_backend.setup()
        with whoosh_backend.index.searcher() as searcher:
            keywords = searcher.key_terms_from_text(
                'text', text, numterms=10, normalize=False)
            keywords = list(zip(*keywords))[0] if keywords else []
        keywords = [cls._find_representative(word, text) for word in keywords]
        keywords = [word for word in keywords if cls._is_good_keyword(word)]
        # Deduplicate, then drop punctuation characters from each keyword.
        keywords = list(set(keywords))
        keywords = [''.join(c for c in word if c not in string.punctuation) for word in keywords]
        return keywords

    @property
    def keywords(self):
        return self.find_keywords(self.text)
示例7: extract_article
# Required import: from readability.readability import Document
def extract_article(url):
    """Fetch *url* and return extracted title and content.

    Uses boilerpipe for the plain-text body and readability for the
    title.  Returns {} when the page could not be fetched (non-200).
    """
    r = requests.get(url)
    if r.status_code != 200:
        # Fetch failed: signal with an empty dict.
        return {}
    html = r.content.decode('utf-8', errors='ignore')
    # Boilerpipe extracts the article's plain text.
    bp = Extractor(html=html)
    # Readability supplies the title.  (The original also computed
    # parse_url(r.url) and Rdb.summary() but never used either result;
    # those dead stores were removed.)
    rdb = Document(html)
    return {
        'extracted_title': rdb.short_title().strip(),
        'extracted_content': strip_tags(bp.getText()),
    }
示例8: main
# Required import: from readability.readability import Document
def main():
    """Download a Habrahabr post and save its readable article as HTML (Python 2)."""
    html = urllib.urlopen("http://habrahabr.ru/post/150756/").read()
    doc = Document(html)
    short_title = doc.short_title()
    readable_article = doc.summary()
    # Context manager closes the file even if the write raises; the
    # original's explicit open()/close() leaked the handle on error.
    with open("C:\\users\\mykola\\documents\\%s.html" % short_title, "wb") as f:
        f.write(readable_article.encode("utf-8"))
示例9: get_article_from_item
# Required import: from readability.readability import Document
def get_article_from_item(self, item):
    """Build an Article from a feed *item* by fetching and parsing its link.

    Falls back to 'n/a' when the item carries no author.
    """
    url = item['link']
    logging.debug(url)
    author = 'n/a'
    # `in` replaces dict.has_key(), which was removed in Python 3.
    if 'author' in item:
        author = item.author
    html = urllib.urlopen(url).read()
    doc = Document(html)
    return Article(doc.title(), doc.short_title(), author, doc.summary())
示例10: extract_by_readability
# Required import: from readability.readability import Document
def extract_by_readability(html):
    """Return {'title', 'body'} extracted from *html*, with tags stripped."""
    document = Document(html)

    def remove_tags(markup):
        # Drop anything that looks like an HTML tag.
        return re.sub(r'<[^<]+?>', '', markup)

    title = ensure_unicode(document.short_title())
    body = remove_tags(ensure_unicode(document.summary()))
    return {'title': title, 'body': body}
示例11: extract_data
# Required import: from readability.readability import Document
def extract_data(self, patchurl):
    """Fetch *patchurl* and return (title, summary) as byte strings.

    Best-effort: any network or parse failure yields (None, None).
    """
    try:
        response = requests.get(patchurl)
        doc = Document(response.content)
        title = doc.short_title()
        summary = doc.summary()
        return smart_str(title), smart_str(summary)
    except Exception:
        # `except Exception` (not a bare `except`) keeps the deliberate
        # best-effort contract while letting KeyboardInterrupt and
        # SystemExit propagate.
        return None, None
示例12: decode_doc
# Required import: from readability.readability import Document
def decode_doc(doc, url):
    """Decode *doc* (an iterable of byte lines) and extract article data.

    The charset and keywords are sniffed from <meta> tags, then
    readability extracts the title and main content.

    Returns a dict with 'url', 'keywords', 'title' and 'content' keys,
    or None when readability fails.  NOTE(review): *doc* is iterated
    twice, so it must be a re-iterable sequence, not a generator.
    """
    cs = re.compile(b'^<(meta|META).*charset=("|\')?([^ "\']*)')
    pkey = re.compile(b'^<(meta|META).*keywords.*content=("|\')?([^ "\']*)')
    codec = None
    keywords = None
    # Pass 1: sniff charset and keywords from <meta> tags.
    for l in doc:
        if l.startswith(b'<meta') or l.startswith(b'<META'):
            if codec is None and b'charset' in l:
                m = cs.match(l)
                # Guard against lines that mention charset but do not
                # match the pattern — the original crashed here with
                # AttributeError on m.group(3) when m was None.
                if m:
                    codec = m.group(3).decode()
            if keywords is None and b'keywords' in l:
                m = pkey.match(l)
                if m:
                    keywords = m.group(3)
    # Pass 2: decode each line; undecodable lines become empty.
    sdoc = []
    for l in doc:
        try:
            # codec may still be None -> TypeError -> blank line.
            l = l.decode(codec)
        except Exception:
            l = ''
        sdoc.append(l)
    try:
        if keywords:
            keywords = keywords.decode(codec)
        else:
            keywords = ''
        keywords = re.split(r'[ ,;\|]', keywords)
    except Exception:
        # Best-effort: keep whatever keywords value we have.
        pass
    doc = '\n'.join(sdoc)
    try:
        doc = Document(doc)
        title = doc.short_title()
        content = doc.summary()
    except Exception:
        # Readability failed: signal by returning None.
        return
    return {"url": url,
            'keywords': keywords,
            'title': title,
            'content': content}
示例13: parse_news_content
# Required import: from readability.readability import Document
def parse_news_content(self, response):
    """Scrapy callback: follow full-article links, then emit the parsed item.

    Yields a follow-up Request for every full-article link found in
    *response*, plus one populated item when extraction succeeds.
    """
    for link in self.full_article_link_extractor.extract_links(response):
        yield response.request.replace(url=link.url)
    item = self._create_item(response)
    if item is None:
        return
    doc = Document(response.body)
    item['title'] = doc.short_title()
    item['content'] = html2text.html2text(doc.summary())
    yield item
示例14: extract_url_content
# Required import: from readability.readability import Document
def extract_url_content(self, url=None):
    """Fetch *url* (defaults to self.url) and populate extraction fields.

    Side effects on self: content_type, status_code, content, url,
    url_parse, image, summary, title, and — for oembed hosts — tags
    and tagstring.  Raises UrlExtractException on HTTP status >= 400.
    (Python 2 code: uses print statements.)
    """
    if not url:
        url = self.url
    url_parse = urlparse(url)
    headers = {}
    # t.co (Twitter's shortener) gets no User-Agent header — presumably
    # it behaves differently with a desktop UA; confirm with callers.
    if url_parse.netloc != "t.co":
        user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1 Iceweasel/9.0.1"
        headers['User-Agent'] = user_agent
    content = requests.get(url, headers=headers)
    self.content_type = content.headers.get('content-type')
    self.status_code = content.status_code
    self.content = content.text
    # Normalize our stored URL, then adopt the final post-redirect URL.
    self.url = self.clean_url(self.url)
    self.url = self.url_morph(content.url)
    self.image = self.find_taller_image(self.content)
    if self.image:
        self.logger.info("found image : %s"%self.image)
    self.url_parse = urlparse(self.url)
    # Known oembed providers are rendered via their widget instead of
    # readability extraction.
    if url_parse.netloc in oembed.keys():
        print "found oembed"
        mod = oembed[url_parse.netloc]
        self.content = mod.get_widget(url)
        self.summary = self.content
        self.title = os.path.basename(url_parse.path)
        self.content_type = "collectr/parsed"
        self.tags = [mod.get_tag()]
        self.tagstring = mod.get_tag()
        return
    if self.status_code >= 400:
        raise UrlExtractException("Can't extract content for %s (http<%d>)" % (url, content.status_code))
    elif "image" in self.content_type:
        print "log: content type : image"
        # Direct image link: embed the image itself as the summary.
        self.summary = """<img src="%s" />""" % self.url
        self.title = self.url
    elif "html" in self.content_type:
        doc = Document(self.content)
        self.summary = doc.summary()
        try:
            self.title = doc.short_title()
        except AttributeError:
            self.title = u"No title"
    else:
        # Unknown content type: no summary, title falls back to the
        # last path component.
        self.summary = None
        self.title = os.path.basename(url_parse.path)
示例15: extract
# Required import: from readability.readability import Document
def extract(html):
    """Extract title, article text and full text from *html* via readability.

    Returns a dict with 'title', 'article' and 'full_text' keys, or {}
    when extraction fails (the failure is logged with traceback).
    """
    try:
        doc = Document(html)
        article = doc.summary()
        title = doc.short_title()
        return {
            'title': title,
            'article': html_to_text(article),
            'full_text': html_to_text(html)
        }
    except Exception:
        # `except Exception` instead of a bare `except` so SystemExit
        # and KeyboardInterrupt are not swallowed; the best-effort
        # log-and-return-{} contract is preserved.
        logging.exception('extract html')
        return {}