

Python Document.title Method Code Examples

This article collects typical usage examples of the Python method readability.readability.Document.title. If you are wondering what Document.title does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples for the containing class, readability.readability.Document.


The following presents 10 code examples of the Document.title method, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
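
Before the examples, here is a minimal sketch of the typical call pattern, assuming the readability-lxml package is installed (pip install readability-lxml) along with requests; the URL below is a placeholder, not one taken from the examples:

import requests
from readability.readability import Document

# fetch a page and hand its HTML to readability (placeholder URL)
response = requests.get('https://example.com/article')
doc = Document(response.text)

print(doc.title())        # extracted page title
print(doc.short_title())  # title with the site-name suffix stripped
article_html = doc.summary()  # cleaned article body as an HTML string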

Example 1: __init__

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
import os
import tempfile
from urllib.parse import urlparse  # urlparse moved here in Python 3

import requests
from lxml import etree
from unidecode import unidecode


class Article:

    def __init__(self, url):
        print('Saving page: {}'.format(url))
        res = requests.get(url)
        self.url = url
        self.article = Document(res.content)  # run readability over the raw page
        self._add_title()
        self._save_images()

    def _add_title(self):
        # parse the cleaned article HTML and prepend the title as an <h2>
        self.root = etree.fromstring(self.article.summary())
        body = self.root.find('body')

        title = self.article.title()
        # In Python 3 every str is unicode, so the old
        # `if type(title) == unicode` check is gone; transliterate unconditionally.
        ascii_title = unidecode(title)

        title_header = etree.HTML('<h2>{}</h2>'.format(ascii_title))
        body.insert(0, title_header)

    def _save_images(self):
        tmppath = tempfile.mkdtemp()
        images = self.root.xpath('//img')
        for img in images:
            imgsrc = img.get('src')

            # handle scheme-agnostic URLs
            if 'http' not in imgsrc and '//' in imgsrc:
                imgsrc = 'http:{}'.format(imgsrc)

            # handle relative file paths
            elif 'http' not in imgsrc:
                parsed = urlparse(self.url)
                imgsrc = '{}://{}{}'.format(parsed.scheme, parsed.netloc, imgsrc)

            filename = os.path.basename(imgsrc)
            dest = os.path.join(tmppath, filename)

            try:
                res = requests.get(imgsrc)
            except Exception as e:
                print('Could not fetch image ({}) from "{}"'.format(str(e), imgsrc))
                return

            if res.status_code == 404:
                print('Could not fetch image (HTTP 404), attempted fetch: "{}", source URL: {}'.format(imgsrc, img.get('src')))
                continue

            with open(dest, 'wb') as f:
                f.write(res.content)

            img.set('src', dest)

    @property
    def title(self):
        return self.article.title()

    @property
    def html(self):
        return etree.tostring(self.root)
Developer: cjpetrus, Project: lambda-epubify, Lines: 62, Source: worker.py

Example 2: run

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
import codecs
import re
import urllib.request  # Python 2's urllib.urlopen lives here in Python 3

from bs4 import BeautifulSoup


def run(index):
    print("Index %d" % index)
    dirname = "data/%04d" % index

    # url of the English article
    url = open(dirname + "/url_en.txt").read()

    # download html
    html = urllib.request.urlopen(url).read().decode('latin-1')

    # apply readability
    document = Document(html)
    article = document.summary()
    # nltk.clean_html() was removed in NLTK 3.x; strip the tags with
    # BeautifulSoup instead
    article = BeautifulSoup(article, 'lxml').get_text()

    # replace latin characters
    article = re.sub(u'&#13;', u'\n', article)
    article = re.sub(u'\x92', u'`', article)
    article = re.sub(u'\x96', u'-', article)

    # article_en.txt
    output = codecs.open(dirname + "/article_en.txt", 'w', encoding='ascii', errors='ignore')
    output.write(article)
    output.close()

    # title.txt
    output = codecs.open(dirname + "/title.txt", 'w', encoding='ascii', errors='ignore')
    output.write(document.title())
    output.close()
Developer: moon6pence, Project: DailyCode, Lines: 31, Source: article_en.py

Example 3: _getResponseText

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
def _getResponseText(self, response):
    '''
    (response) -> Text
    Returns the text within the body of an HttpResponse object.
    '''
    readability = Document(response.body)
    content = readability.title() + readability.summary()
    return content
Developer: jasonliw93, Project: recon, Lines: 10, Source: reconspider.py

Example 4: crawl_url

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
import requests


def crawl_url(url):
    html = requests.get(url)
    doc = Document(html.content)
    content = doc.summary().encode('utf-8')
    title = doc.title().encode('utf-8')
    return {
        'content': content,
        'title': title
    }
Developer: jungledrum, Project: bo, Lines: 11, Source: crawl_article.py

Example 5: get_article_from_item

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
def get_article_from_item(self, item):
    url = item['link']
    logging.debug(url)
    author = 'n/a'
    if 'author' in item:  # dict.has_key() was removed in Python 3
        author = item.author
    html = urllib.request.urlopen(url).read()  # Python 2's urllib.urlopen
    doc = Document(html)
    return Article(doc.title(), doc.short_title(), author, doc.summary())
Developer: andrebask, Project: rsstoebook, Lines: 11, Source: ArticleData.py

Example 6: extract_article

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
def extract_article(self):
    """Returns only readable content

    Returns:
        data - {
            'title': 'Title of the article',
            'content': 'HTML body of the article'
        }
    """
    doc = Document(self._html)
    return {'title': doc.title(), 'content': doc.summary()}
Developer: katakumpo, Project: django-scraper, Lines: 12, Source: extractor.py

Example 7: get_article

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
def get_article(d):
    # `table` is a database table handle defined elsewhere in the source
    # module; `html` is lxml.html
    url = d['url']
    if table.find_one(url=url):
        return
    print("fetching stuff for %s" % url)
    d['html'] = requests.get(url).content
    try:
        doc = Document(d['html'])
        # xpath('string()') flattens the HTML tree to plain text
        d['summary'] = html.fromstring(doc.summary()).xpath('string()')
        d['content'] = html.fromstring(doc.content()).xpath('string()')
        d['title'] = doc.title()
    except Exception as e:  # Python 2's `except Exception, e` syntax
        print(e)
Developer: pudo-attic, Project: newshacks, Lines: 15, Source: scraper.py

Example 8: make_readable

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
def make_readable(url):
    # urllib2 is Python 2 only; its pieces live in urllib.request and
    # urllib.error in Python 3
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError:
        return None

    document = Document(html)

    document_dict = {
        'title': document.title(),
        'summary': document.summary(),
        'content': document.content(),
        'short_title': document.short_title()
    }

    return document_dict
Developer: joshgoss, Project: reader-server, Lines: 18, Source: utils.py

Example 9: parseURL_pr

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
def parseURL_pr(url):
    # `urlopen` and `urlparse` come from urllib.request and urllib.parse
    parsed = urlparse(url)
    if "youtube" in parsed.hostname:
        print(url, "has youtube and we don't parse that")
        return None
    try:
        response = urlopen(url)
    except IOError:
        return None

    if response.getcode() > 400:
        print(url, 'is not accessible any more', response.getcode())
        return None
    html = response.read()
    doc = Document(html)
    content = {}
    #content['content'] = doc.summary()
    html = doc.summary(html_partial=True)  # body content only, no <html> wrapper
    soup = BeautifulSoup(html, 'lxml')
    content['content'] = soup.get_text()
    content['title'] = doc.title()
    # note: a character count, despite the key name
    content['word_count'] = len(content['content'])
    return content
Developer: nyakosuta, Project: HN_rank, Lines: 25, Source: parse_article.py

Example 10: textgetter

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import title [as alias]
def textgetter(url):
    """Scrapes web news and returns the content

    Parameters
    ----------
    url : str
        web address to news report

    Returns
    -------
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str, isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regexes for the url check (raw strings avoid invalid-escape warnings)
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile(r"(http://|https://)(www.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that it is a url
    if s.search(url):
        if url in done:
            # a plain `return value` inside a generator discards the value,
            # so yield the cached answer instead
            yield done[url]
            return
        try:
            r = requests.get(url, verify=False, timeout=1)
        except Exception:  # a bare except would also swallow KeyboardInterrupt
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer
            return  # without this, `r` below would be unbound

        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url

        if len(r.content) > 500:
            article = Article(url)
            article.download(input_html=r.content)
            article.parse()
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                if isinstance(article.publish_date, datetime.datetime):
                    answer['published_date'] = article.publish_date.astimezone(pytz.utc).isoformat()

                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url

            else:
                # fall back to readability (imported as Paper) when newspaper
                # extracts too little text
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])

                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
#......... remainder of the code omitted .........
Developer: DistrictDataLabs, Project: blog-files, Lines: 103, Source: utilities.py


Note: The readability.readability.Document.title examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets are drawn from open-source projects contributed by their respective authors, who retain copyright over the source code; consult each project's license before distributing or reusing it. Do not repost without permission.