当前位置: 首页>>代码示例>>Python>>正文


Python Document.short_title方法代码示例

本文整理汇总了Python中readability.readability.Document.short_title方法的典型用法代码示例。如果您正苦于以下问题:Python Document.short_title方法的具体用法?Python Document.short_title怎么用?Python Document.short_title使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在readability.readability.Document的用法示例。


在下文中一共展示了Document.short_title方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: main

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def main():
    """Benchmark readability extraction on a bundled sample HTML file."""
    # Use a context manager so the file handle is closed promptly
    # (the original leaked the handle returned by open()).
    with open('./samples/21853124_0.shtml') as fh:
        html = fh.read()
    doc = Document(html)
    doc.transform()
    doc.get_publish_date()
    doc.short_title()
    doc.text_content()
开发者ID:actberw,项目名称:python-readability,代码行数:9,代码来源:benchmark.py

示例2: read_command

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def read_command(api, args):
    # Fetch args.url, extract the main article with readability, and print
    # a plain-text rendering of it (Python 2 print statements).
    from readability.readability import Document
    import html2text
    h = html2text.HTML2Text()
    h.inline_links = False  # emit reference-style links, not inline ones
    h.ignore_images = True
    h.ignore_emphasis = True
    res = requests.get(args.url)
    if res.ok:
        article = Document(res.content)
        print article.short_title()
        print h.handle(article.summary())
    else:
        # NOTE(review): 'status' is not a standard HTTP response header;
        # res.status_code may be the intended value -- confirm.
        print res.headers['status']
开发者ID:zeekay,项目名称:readitlater,代码行数:16,代码来源:readitlater.py

示例3: strip_chapter

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
    def strip_chapter(self, html):
        """
        Strip a chapter page down to its relevant HTML using Readability.

        Falls back to the configured main-content <div> when Readability
        produces a near-empty summary.
        :param html: str
        :return: (title, content_html) tuple
        """
        document = Document(html)
        summary = document.summary()
        if len(summary) > 20:
            # Readability found a usable article body; inject a UTF-8 head.
            body = str(summary).replace(
                '<html>', '<html><head><meta charset="utf-8"></head>')
            return (document.short_title(), body)
        # Summary too short -- fall back to the site-specific content div.
        soup = BeautifulSoup(html, 'html.parser')
        main_div = soup.find_all('div', class_=self.main_content_div)[0]
        fallback = '<html><head><meta charset="utf-8"></head>' + str(main_div) + '</html>'
        return document.short_title(), fallback
开发者ID:exp0nge,项目名称:light-novel-scraper,代码行数:16,代码来源:light_scrapper_web_api.py

示例4: extract_article

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def extract_article(url):
    """Fetch *url* and return its title, article HTML, stripped text and
    final URL via readability; returns {} when the page cannot be fetched."""
    response = requests.get(url)

    # If the URL could not be fetched, return an empty dict.
    if response.status_code != 200:
        return {}

    # Parse the (possibly redirected) final URL.
    final_url = parse_url(response.url)

    # Decode the payload, dropping undecodable bytes.
    raw_html = response.content.decode('utf-8', errors='ignore')

    # run boilerpipe (disabled)
    # boilerpipe_extractor = Extractor(html=raw_html)

    # Run readability to isolate the article body.
    readability_extractor = Document(raw_html)
    article_html = readability_extractor.summary()

    return {
        'title': readability_extractor.short_title(),
        'html': article_html,
        'content': strip_tags(article_html).encode('utf-8', errors='ignore'),
        'url': final_url,
    }
开发者ID:abelsonlive,项目名称:complicity,代码行数:32,代码来源:article_extractor.py

示例5: markdownify

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def markdownify(url_list, **options):
    """Fetch each URL, extract the readable article, and convert to Markdown.

    Returns a single UTF-8-encoded byte string; multiple articles are joined
    with a horizontal rule.  ``options`` must contain ``paragraph_links``,
    ``wrap_text`` and ``preamble``.
    """
    articles = []
    images = []
    paragraph_links = options['paragraph_links']
    wrap_text = options['wrap_text']
    preamble = options['preamble']
    readable_title = None  # title of the last fetched article (used in preamble)
    for url in url_list:
        # Send the first URL as the Referer (some hosts require one).
        req = urllib2.Request(url, None, {'Referer': url_list[0]})
        html = urllib2.urlopen(req).read()
        document = Document(html, url=url)
        readable_title = document.short_title()
        summary = document.summary()
        summary_doc = build_doc(summary)
        images.extend([a.get('src') for a in summary_doc.findall('.//img')])
        # Reuse the summary computed above instead of re-running readability
        # (the original called document.summary() a second time here).
        articles.append(summary)

    markdown_articles = []
    for (article, url) in zip(articles, url_list):
        h = html2text.HTML2Text(baseurl=url)
        h.inline_links = False
        h.links_each_paragraph = (paragraph_links and 1) or 0
        h.body_width = (wrap_text and 78) or 0
        markdown_articles.append(h.handle(article))
    combined_article = u"\n\n----\n\n".join(markdown_articles)
    if preamble:
        # NOTE: only the last article's title appears in the preamble.
        combined_article = (u"Title:        %s  \nOriginal URL: %s\n\n" % (readable_title, url_list[0])) + combined_article
    return combined_article.encode("utf-8")
开发者ID:evandeaubl,项目名称:markability,代码行数:29,代码来源:markability.py

示例6: set

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
class Gist:
    """Wraps an HTML document and exposes its title, plain text, and
    suggested keywords (via the readability and Whoosh libraries)."""

    # Keywords must contain no digits at all.
    keyword_pattern = re.compile(r'^[^\d]+$')
    # English stop words, filtered out of keyword suggestions.
    stop_words = set(get_stop_words('en'))

    def __init__(self, html):
        self.html = html
        self.document = Document(html)

    @property
    def title(self):
        # Readability's shortened title for the document.
        return self.document.short_title()

    @cached_property
    def text(self):
        """Plain-text rendering of the readability summary: <br>/<p> tags
        become line/paragraph breaks, other markup is stripped, and runs
        of 3+ newlines are collapsed."""
        text = self.document.summary()
        text = re.sub('<br[^>]+>', '\n', text)
        text = re.sub('</?p[^>]+>', '\n\n', text)
        text = re.sub('<[^>]+>', '', text)
        # NOTE(review): without re.MULTILINE this only blanks a whitespace
        # run when it is the entire string -- confirm intent.
        text = re.sub('^[ \t]+$', '', text)
        text = re.sub('\n{3,}', '\n\n', text, flags=re.MULTILINE)
        return text

    @staticmethod
    def _common_prefix(one, two):
        # Length of the shared character prefix of *one* and *two*.
        parallelity = [x == y for x, y in zip(one, two)] + [False]
        return parallelity.index(False)

    @classmethod
    def _find_representative(cls, stem, text):
        # Pick the token from *text* that best represents *stem*:
        # longest common prefix wins; shorter tokens break ties.
        tokens = text.split()
        prefixes = {token: cls._common_prefix(token, stem) for token in tokens}
        best = lambda token: (-token[1], len(token[0]))
        return sorted(prefixes.items(), key=best)[0][0]

    @classmethod
    def _is_good_keyword(cls, word):
        # Acceptable keywords are non-stop-words matching the digit-free pattern.
        return (word not in cls.stop_words) and \
                cls.keyword_pattern.match(word)

    @classmethod
    def find_keywords(cls, text):
        """Suggest up to 10 keywords for *text* using the Whoosh backend."""
        whoosh_backend = SearchForm().searchqueryset.query.backend
        if not whoosh_backend.setup_complete:
            whoosh_backend.setup()
        with whoosh_backend.index.searcher() as searcher:
            keywords = searcher.key_terms_from_text(
                'text', text, numterms=10, normalize=False)
        keywords = list(zip(*keywords))[0] if keywords else []
        # Replace each stemmed term with a representative surface form.
        keywords = [cls._find_representative(keyword, text) for keyword in keywords]
        keywords = [keyword for keyword in keywords if cls._is_good_keyword(keyword)]
        # no double keywords in list
        keywords = list(set(keywords))
        # no punctuation in suggested keywords
        keywords = [''.join(c for c in s if c not in string.punctuation) for s in keywords]
        return keywords

    @property
    def keywords(self):
        return self.find_keywords(self.text)

示例7: extract_article

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def extract_article(url):
    """Fetch *url* and extract its title (readability) and text content
    (boilerpipe); returns {} when the page cannot be fetched."""
    resp = requests.get(url)

    # If the URL could not be fetched, return an empty dict.
    if resp.status_code != 200:
        return {}

    # Normalise the final (post-redirect) URL.
    url = parse_url(resp.url)

    # Decode the payload, dropping undecodable bytes.
    page_html = resp.content.decode('utf-8', errors='ignore')

    # Boilerpipe extraction supplies the plain-text body.
    boilerpipe = Extractor(html=page_html)

    # Readability extraction supplies the title.
    readability_doc = Document(page_html)
    article_html = readability_doc.summary()

    return {
        'extracted_title': readability_doc.short_title().strip(),
        'extracted_content': strip_tags(boilerpipe.getText()),
    }
开发者ID:voidfiles,项目名称:particle,代码行数:30,代码来源:article_extractor.py

示例8: main

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def main():
    """Download a Habrahabr post and save its readable version to disk."""
    html = urllib.urlopen("http://habrahabr.ru/post/150756/").read()
    doc = Document(html)
    short_title = doc.short_title()
    readable_article = doc.summary()
    # Context manager ensures the file is flushed and closed even if
    # write() raises (the original left the handle open in that case).
    with open("C:\\users\\mykola\\documents\\%s.html" % short_title, "wb") as f:
        f.write(readable_article.encode("utf-8"))
开发者ID:mykolad,项目名称:python-readability,代码行数:10,代码来源:TestReadability.py

示例9: get_article_from_item

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
 def get_article_from_item(self, item):
     # Build an Article from a feed entry: fetch the linked page and run
     # readability over it.  Python 2 code (has_key, urllib.urlopen).
     url = item['link']
     logging.debug(url)
     author = 'n/a'
     # Feed entries may lack an author; item supports both dict-style
     # key lookup and attribute-style (item.author) access.
     if item.has_key('author'):
         author = item.author
     html = urllib.urlopen(url).read()
     doc = Document(html)
     return Article(doc.title(), doc.short_title(), author, doc.summary())
开发者ID:andrebask,项目名称:rsstoebook,代码行数:11,代码来源:ArticleData.py

示例10: extract_by_readability

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def extract_by_readability(html):
    """Extract the title and plain-text body from *html* via readability."""
    doc = Document(html)

    def remove_markup(markup):
        # Drop anything that looks like an HTML tag.
        return re.sub(r'<[^<]+?>', '', markup)

    title = ensure_unicode(doc.short_title())
    body = remove_markup(ensure_unicode(doc.summary()))
    return {'title': title, 'body': body}
开发者ID:yuzroz,项目名称:content_extraction,代码行数:12,代码来源:extract_py3.py

示例11: extract_data

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
 def extract_data(self, patchurl):
     """Fetch *patchurl* and return (title, summary) extracted with
     readability, or (None, None) if anything goes wrong."""
     try:
         f = requests.get(patchurl)
         html = f.content
         doc = Document(html)
         title = doc.short_title()
         summary = doc.summary()
         return smart_str(title), smart_str(summary)
     except Exception:
         # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
         # are no longer swallowed; any extraction failure still yields
         # the (None, None) sentinel callers expect.
         return None, None
开发者ID:treeship,项目名称:treestump,代码行数:12,代码来源:patch.py

示例12: decode_doc

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def decode_doc(doc, url):
    """Decode a raw HTML document (an iterable of byte-string lines) and
    extract its keywords, title, and readable content.

    Returns a dict with keys ``url``, ``keywords``, ``title`` and
    ``content``, or ``None`` when readability extraction fails.
    """
    #print('doc')
    # Patterns for sniffing the charset and the keywords meta tag (bytes).
    cs = re.compile(b'^<(meta|META).*charset=("|\')?([^ "\']*)')
    pkey = re.compile(b'^<(meta|META).*keywords.*content=("|\')?([^ "\']*)')
    codec = None
    keywords = None
    #print(*doc)
    # First pass: find the declared charset and the raw keywords bytes.
    for l in doc :
        if (l.startswith(b'<meta') or l.startswith(b'<META')) :
            if codec is None and (b'charset' in l) :
                m = cs.match(l)
                # NOTE(review): cs.match can return None here, which would
                # raise AttributeError -- confirm inputs always match.
                codec = m.group(3).decode()
            if keywords is None and b'keywords' in l :
                m = pkey.match(l)
                if m :
                    keywords = m.group(3)


    # Second pass: decode each line; undecodable lines become ''.
    sdoc = []
    for l in doc :
        try :
            l = l.decode(codec)
        except :
            l = ''
        sdoc.append(l)

    try :
        if keywords :
            keywords = keywords.decode(codec)
        else :
            #print(*sdoc, sep = '\n')
            keywords = ''
        # Split on spaces, commas, semicolons, or pipes.
        keywords = re.split(r'[ ,;\|]',keywords)
        #print(keywords.encode('utf8'))
    except :
        pass

    #if sum(len(x) for x in sdoc) < 1000 : return
    doc = '\n'.join(sdoc)
    #if len(doc) < 1000 :return
    try :
        # Run readability over the reassembled document.
        doc = Document(doc)
        title = doc.short_title()
        content = doc.summary()
    except :
        return
    #print(doc.summary().encode('utf8'))
    #print(doc.short_title().encode('utf8'))


    data = {"url":url, 
            'keywords':keywords,
            'title': title,
            'content':content}
    return data
开发者ID:zhangkaixu,项目名称:juicy,代码行数:57,代码来源:wash_news_html.py

示例13: parse_news_content

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
    def parse_news_content(self, response):
        """Follow full-article links on this page, then emit the parsed item."""
        # Queue a request for every "full article" link found on the page.
        for article_link in self.full_article_link_extractor.extract_links(response):
            yield response.request.replace(url=article_link.url)

        item = self._create_item(response)
        if item is None:
            return
        # Run readability over the raw body to isolate title and content.
        readable = Document(response.body)
        item['title'] = readable.short_title()
        item['content'] = html2text.html2text(readable.summary())
        yield item
开发者ID:LightKool,项目名称:scraper,代码行数:13,代码来源:wdzj.py

示例14: extract_url_content

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
    def extract_url_content(self, url=None):
        """Fetch *url* (default: self.url) and populate content, title,
        summary, tags and image fields on this object.

        Handles three cases: known oembed providers, direct image links,
        and regular HTML pages (parsed with readability).  Raises
        UrlExtractException for HTTP status >= 400.
        """
        if not url:
            url = self.url
        url_parse = urlparse(url)
        headers = {}
        # Spoof a desktop user agent for every host except t.co.
        if url_parse.netloc != "t.co":
            user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1 Iceweasel/9.0.1"
            headers['User-Agent'] = user_agent

        content = requests.get(url, headers=headers)
        self.content_type = content.headers.get('content-type')
        self.status_code = content.status_code
        self.content = content.text
        self.url = self.clean_url(self.url)
        self.url = self.url_morph(content.url)
        self.image = self.find_taller_image(self.content)
        if self.image:
            self.logger.info("found image : %s"%self.image)
        self.url_parse = urlparse(self.url)

        # Known oembed provider: delegate widget/tag extraction to the
        # provider-specific module and return early.
        if url_parse.netloc in oembed.keys():
            print "found oembed"
            mod = oembed[url_parse.netloc]
            self.content = mod.get_widget(url)
            self.summary = self.content
            self.title = os.path.basename(url_parse.path)
            self.content_type = "collectr/parsed"
            self.tags = [mod.get_tag()]
            self.tagstring = mod.get_tag()
            return



        if self.status_code >= 400:
            raise UrlExtractException("Can't extract content for %s (http<%d>)" % (url, content.status_code))

        elif "image" in self.content_type:
            # Direct image link: wrap it in an <img> tag.
            print "log: content type : image"
            self.summary = """<img src="%s" />""" % self.url
            self.title = self.url

        elif "html" in self.content_type:
            # Regular page: let readability pick out the article body.
            doc = Document(self.content)
            self.summary = doc.summary()
            try:
                self.title = doc.short_title()
            except AttributeError:
                self.title = u"No title"


        else:
            # Unknown content type: no summary; title comes from the path.
            self.summary = None
            self.title = os.path.basename(url_parse.path)
开发者ID:Daroth,项目名称:collectr,代码行数:55,代码来源:tasks.py

示例15: extract

# 需要导入模块: from readability.readability import Document [as 别名]
# 或者: from readability.readability.Document import short_title [as 别名]
def extract(html):
    """Extract the title and readable text from raw *html*.

    Returns a dict with 'title', 'article' (readable body as text) and
    'full_text' (whole page as text), or {} if extraction fails.
    """
    try:
        doc = Document(html)
        article = doc.summary()
        title = doc.short_title()
        return {
            'title': title,
            'article': html_to_text(article),
            'full_text': html_to_text(html)
        }
    except Exception:
        # Narrowed from a bare `except:` (which also caught
        # KeyboardInterrupt/SystemExit); still logs the traceback and
        # degrades to an empty result.
        logging.exception('extract html')
        return {}
开发者ID:AllSundays,项目名称:yummy,代码行数:15,代码来源:extract.py


注:本文中的readability.readability.Document.short_title方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。