本文整理汇总了Python中readability.readability.Document.title方法的典型用法代码示例。如果您正苦于以下问题:Python Document.title方法的具体用法?Python Document.title怎么用?Python Document.title使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类readability.readability.Document
的用法示例。
在下文中一共展示了Document.title方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
class Article:
    """Fetch a web page, extract its readable content via readability,
    and localize its images into a temporary directory."""

    def __init__(self, url):
        """Download *url* and build the cleaned article tree.

        Side effects: performs HTTP requests and writes image files to a
        temporary directory.
        """
        print('Saving page: {}'.format(url))
        res = requests.get(url)
        self.url = url
        self.article = Document(res.content)
        self._add_title()
        self._save_images()

    def _add_title(self):
        # Parse the readability summary and prepend the article title
        # as an <h2> at the top of the body.
        self.root = etree.fromstring(self.article.summary())
        body = self.root.find('body')
        title = self.article.title()
        # On Python 2 a ``unicode`` title is transliterated to ASCII.
        # BUG FIX: on Python 3 the name ``unicode`` does not exist, so
        # the original ``type(title) == unicode`` raised NameError;
        # there every str is Unicode, so always transliterate.
        try:
            is_unicode = isinstance(title, unicode)  # Python 2 only
        except NameError:
            is_unicode = True
        ascii_title = unidecode(title) if is_unicode else title
        title_header = etree.HTML('<h2>{}</h2>'.format(ascii_title))
        body.insert(0, title_header)

    def _save_images(self):
        # Download every referenced image into a temp dir and rewrite
        # the <img src> attributes to point at the local copies.
        tmppath = tempfile.mkdtemp()
        for img in self.root.xpath('//img'):
            imgsrc = img.get('src')
            # handle scheme-agnostic URLs ("//host/path")
            if 'http' not in imgsrc and '//' in imgsrc:
                imgsrc = 'http:{}'.format(imgsrc)
            # handle relative file paths
            elif 'http' not in imgsrc:
                parsed = urlparse(self.url)
                imgsrc = '{}://{}{}'.format(parsed.scheme, parsed.netloc, imgsrc)
            filename = os.path.basename(imgsrc)
            dest = os.path.join(tmppath, filename)
            try:
                res = requests.get(imgsrc)
            except Exception as e:
                print('Could not fetch image ({}) from "{}"'.format(str(e), imgsrc))
                # BUG FIX: the original ``return`` aborted the whole
                # loop on the first failed image; skip just this one.
                continue
            if res.status_code == 404:
                print('Could not fetch image (HTTP 404), attempted fetch: "{}", source URL: {}'.format(imgsrc, img.get('src')))
                continue
            with open(dest, 'wb') as f:
                f.write(res.content)
            img.set('src', dest)

    @property
    def title(self):
        """The article title extracted by readability."""
        return self.article.title()

    @property
    def html(self):
        """Serialized HTML of the cleaned article tree."""
        return etree.tostring(self.root)
示例2: run
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
def run(index):
print "Index %d" % index
dirname = "data/%04d" % index
# url of english article
url = open(dirname + "/url_en.txt").read()
# download html
html = urllib.urlopen(url).read().decode('latin-1')
# apply readability
document = Document(html)
article = document.summary()
article = nltk.clean_html(article)
# replace latin characters
article = re.sub(u' ', u'\n', article)
article = re.sub(u'\x92', u'`', article)
article = re.sub(u'\x96', u'-', article)
# article_en.txt
output = codecs.open(dirname + "/article_en.txt", 'w', encoding='ascii', errors='ignore')
output.write(article)
output.close()
# title.txt
output = codecs.open(dirname + "/title.txt", 'w', encoding='ascii', errors='ignore')
output.write(document.title())
output.close()
示例3: _getResponseText
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
def _getResponseText(self, response):
    '''
    (response) -> Text
    Returns text within the body of an HttpResponse object.
    '''
    document = Document(response.body)
    return document.title() + document.summary()
示例4: crawl_url
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
def crawl_url(url):
    """Download *url* and return its readability-extracted fields.

    Returns a dict with UTF-8 encoded 'title' and 'content' entries.
    """
    response = requests.get(url)
    document = Document(response.content)
    return {
        'content': document.summary().encode('utf-8'),
        'title': document.title().encode('utf-8'),
    }
示例5: get_article_from_item
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
def get_article_from_item(self, item):
    """Build an Article from a feed *item* (a feedparser-style entry).

    Downloads the page behind item['link'] and extracts its readable
    content with readability. Falls back to 'n/a' when the entry has
    no author.
    """
    url = item['link']
    logging.debug(url)
    # BUG FIX: ``dict.has_key`` was removed in Python 3; the ``in``
    # operator is equivalent and works on both major versions.
    author = item.author if 'author' in item else 'n/a'
    html = urllib.urlopen(url).read()
    doc = Document(html)
    return Article(doc.title(), doc.short_title(), author, doc.summary())
示例6: extract_article
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
def extract_article(self):
    """Returns only readable content
    Returns:
        data - {
            'title': 'Title of the article',
            'content': 'HTML body of the article'
        }
    """
    document = Document(self._html)
    data = {}
    data['title'] = document.title()
    data['content'] = document.summary()
    return data
示例7: get_article
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
def get_article(d):
url = d['url']
if table.find_one(url=url):
return
print "fetching stuff for %s" % url
d['html'] = requests.get(url).content
try:
doc = Document(d['html'])
d['summary'] = html.fromstring(doc.summary()).xpath('string()')
d['content'] = html.fromstring(doc.content()).xpath('string()')
d['title'] = doc.title()
except Exception, e:
print e
示例8: make_readable
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
def make_readable(url):
    """Fetch *url* and return its readability fields as a dict, or
    None when the URL cannot be retrieved."""
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return None
    document = Document(html)
    return {
        'title': document.title(),
        'summary': document.summary(),
        'content': document.content(),
        'short_title': document.short_title(),
    }
示例9: parseURL_pr
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
def parseURL_pr(url):
parsed = urlparse(url)
if ( "youtube" in parsed.hostname ):
print url, 'has youtube and we dont parse that'
return None
try:
response = urlopen(url)
except IOError:
return None
if ( response.getcode() > 400 ):
print url , ' is not accessible any more', response.getcode()
return None
html = response.read()
doc = Document(html)
content = {}
#content['content'] = doc.summary()
html = doc.summary(True)
soup = BeautifulSoup(html)
content['content'] = soup.get_text()
content['title'] = doc.title()
content['word_count'] = len(content['content'])
return content
示例10: textgetter
# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import title [as 别名]
def textgetter(url):
"""Scrapes web news and returns the content
Parameters
----------
url : str
web address to news report
Returns
-------
answer : dict
Python dictionary with key/value pairs for:
text (str) - Full text of article
url (str) - url to article
title (str) - extracted title of article
author (str) - name of extracted author(s)
base (str) - base url of where article was located
provider (str) - string of the news provider from url
published_date (str,isoformat) - extracted date of article
top_image (str) - extracted url of the top image for article
"""
global done
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']
# regex for url check
s = re.compile('(http://|https://)([A-Za-z0-9_\.-]+)')
u = re.compile("(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
if s.search(url):
site = u.search(s.search(url).group()).group(3)
else:
site = None
answer = {}
# check that its an url
if s.search(url):
if url in done.keys():
return done[url]
pass
try:
r = requests.get(url, verify=False, timeout=1)
except:
done[url] = "Unable to reach website."
answer['author'] = None
answer['base'] = s.search(url).group()
answer['provider']=site
answer['published_date']=None
answer['text'] = "Unable to reach website."
answer['title'] = None
answer['top_image'] = None
answer['url'] = url
yield answer
if r.status_code != 200:
done[url] = "Unable to reach website."
answer['author'] = None
answer['base'] = s.search(url).group()
answer['provider']=site
answer['published_date']=None
answer['text'] = "Unable to reach website."
answer['title'] = None
answer['top_image'] = None
answer['url'] = url
if len(r.content)>500:
article = Article(url)
article.download(input_html=r.content)
article.parse()
if len(article.text) >= 200:
answer['author'] = ", ".join(article.authors)
answer['base'] = s.search(url).group()
answer['provider']=site
answer['published_date'] = article.publish_date
if isinstance(article.publish_date,datetime.datetime):
answer['published_date']=article.publish_date.astimezone(pytz.utc).isoformat()
answer['text'] = article.text
answer['title'] = article.title
answer['top_image'] = article.top_image
answer['url'] = url
else:
doc = Paper(r.content)
data = doc.summary()
title = doc.title()
soup = BeautifulSoup(data, 'lxml')
newstext = " ".join([l.text for l in soup.find_all(TAGS)])
if len(newstext) > 200:
answer['author'] = None
answer['base'] = s.search(url).group()
answer['provider']=site
answer['published_date']=None
answer['text'] = newstext
answer['title'] = title
answer['top_image'] = None
#.........这里部分代码省略.........