This article collects typical usage examples of the set_html method of Python's newspaper.Article. If you are wondering what exactly Article.set_html does, how to call it, or what it looks like in real code, the hand-picked examples below may help. You can also read further about the enclosing class, newspaper.Article.
The section below presents 12 code examples of Article.set_html, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
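Before the project-specific examples, here is a minimal sketch (not taken from any of the projects below) of the pattern they all share: construct an Article without downloading, inject the HTML with set_html(), then call parse(). The HTML string here is a placeholder for illustration.

from newspaper import Article

# Placeholder input: any HTML string you already have in memory.
html = "<html><body><h1>Headline</h1><p>Some body text.</p></body></html>"

article = Article(url='')   # no URL needed when the HTML is supplied directly
article.set_html(html)      # skip download(); use the pre-fetched HTML
article.parse()             # populates title, text, authors, etc.
print(article.title)
print(article.text)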
Example 1: extract
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def extract(self, item):
    """Creates an instance of Article without downloading and returns an ArticleCandidate with the results of
    parsing the HTML code.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name()

    article = Article('')
    article.set_html(item['spider_response'].body)
    article.parse()
    article_candidate.title = article.title
    article_candidate.description = article.meta_description
    article_candidate.text = article.text
    article_candidate.topimage = article.top_image
    article_candidate.author = article.authors
    if article.publish_date is not None:
        try:
            article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
        except ValueError as exception:
            self.log.debug('%s: Newspaper failed to extract the date in the supported format. '
                           'Publishing date set to None' % item['url'])
    article_candidate.language = article.meta_lang

    return article_candidate
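The extract method above expects an item whose 'spider_response' carries the downloaded body. A rough, hypothetical illustration of that input shape follows; the FakeResponse class, the extractor class name, and the URL are stand-ins for this sketch, not code from the project.

# Hypothetical stand-ins to illustrate the expected input shape.
class FakeResponse:
    def __init__(self, body):
        self.body = body

item = {
    'url': 'https://example.com/article',   # placeholder URL
    'spider_response': FakeResponse('<html><body><p>Body text.</p></body></html>'),
}
candidate = SomeNewspaperExtractor().extract(item)   # assumed extractor class name
print(candidate.title, candidate.text)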
Example 2: main
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            #raw.append(sentences])

            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
Example 3: extract_with_newspaper
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def extract_with_newspaper(self, html):
    '''Parses HTML using Newspaper.'''
    article = Article(self.url)
    article.set_html(html)
    filterwarnings('ignore', category=DeprecationWarning)
    with catch_warnings():
        article.parse()
    return article.__dict__
Example 4: parse_article_page
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def parse_article_page(response):
    article = Article(url=response.request.url)
    article.set_html(response.text)
    article.parse()
    if article.title and article.text:
        item = NewsArticle()
        item['title'] = article.title
        item['text'] = article.text
        yield item
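Example 4 is written as a Scrapy callback. Below is a hedged sketch of how such a callback might be wired into a spider; the spider class, its name, and the start URL are illustrative assumptions, and NewsArticle is assumed to be a scrapy.Item with 'title' and 'text' fields defined elsewhere in the project.

import scrapy

class ExampleNewsSpider(scrapy.Spider):
    # Hypothetical spider; name and start_urls are placeholders.
    name = 'example_news'
    start_urls = ['https://example.com/some-article']

    def parse(self, response):
        # Delegate extraction to the parse_article_page callback above.
        yield from parse_article_page(response)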
Example 5: enrich
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
async def enrich(self, result):
    # none of the following lines will work if we couldn't make soup
    if not self.soup:
        return result

    sanitized = sanitize_html(self.response.body)
    if not sanitized:
        return result

    article = Article(self.url, config=FixedArticleConfig())
    article.config.fetch_images = False
    article.set_html(sanitized)
    article.parse()

    result.set('title', article.title, 2, 'textlength')
    if len(article.meta_description) > 0:
        result.set('subtitle', article.meta_description, 2, 'textlength')

    if len(article.article_html) > 0:
        sanitized = sanitize_html(article.article_html)
        result.set('content', sanitized, 0, 'textlength')
    elif article.top_node is not None:
        sanitized = sanitize_html(tostring(article.top_node))
        result.set('content', sanitized, 2)

    if article.authors:
        result.set('authors', article.authors, 2)
    if article.publish_date and len(str(article.publish_date)) > 0:
        result.set('published_at', article.publish_date, 2)

    result.add('keywords', list(article.keywords))
    result.add('keywords', list(article.tags))
    result.add('_candidate_images', list(article.imgs))
    # Primary image guess is actually pretty crappy
    if article.top_image:
        result.add('_candidate_images', [article.top_img])

    text = ""
    for paragraph in article.text.split("\n"):
        paragraph = paragraph.strip()
        # this is done to get rid of cases where a stray heading
        # like "Photographs" ends up as a paragraph
        if Summarizer.has_sentence(paragraph):
            text += " " + paragraph
    if len(text) > 0:
        result.set('_text', text, 2)
    return result
Example 6: _parse_article
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def _parse_article(self, key, url):
    a = Article('')
    html = Google().cache(url)
    a.set_html(html)
    a.parse()
    a.nlp()
    article = {"summary": a.summary,
               "publish_date": a.publish_date,
               "images": a.images,
               "top_image": a.top_image,
               "title": a.title,
               "authors": a.authors,
               "keywords": a.keywords,
               "text": a.text}
    # update
    #conn = r.connect(db="clearspark")
    conn = r.connect(**rethink_conn.conn())
Example 7: clean_source
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def clean_source(url, source):
    """Parse a pre-downloaded article using newspaper.

    Args:
        url (str): The url where the article was sourced (necessary for the
            newspaper API).
        source (str): Html source of the article page.

    Returns:
        Dictionary providing cleaned article and extracted content
        (see `construct_result`), or `None` if newspaper could not extract
        the article.
    """
    article = Article(url)
    article.set_html(source)
    article.parse()

    if article.top_node is None:
        return None

    return construct_result(article)
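A possible way to call clean_source, assuming the page has already been fetched; the requests download and the URL below are illustrative additions, not part of the original snippet.

import requests

url = 'https://example.com/article'              # placeholder URL
source = requests.get(url, timeout=10).text      # pre-downloaded HTML
result = clean_source(url, source)
if result is None:
    print('newspaper could not locate an article body')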
Example 8: parser_nlp
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def parser_nlp(fname, html):
    Ts = timeit.default_timer()
    raw_html = html

    # basic info
    fid = int(fname.split('_')[0].split('/')[1])
    pm = parse_machine()
    html = pm.fix_html(html)
    link_stats = pm.parse_links(html)
    link_factors = [t for t in list(set(" ".join(link_stats.keys()).lower().split())) if (len(t) > 3)]
    doc = db.articles(
        fid = fid,
        html = html,
        html_cnt = len(html),
        link_stats = link_stats,
        link_factors = link_factors,
        rand = random.random(),
        # extra
        lines = raw_html.count('\n'),
        spaces = raw_html.count(' '),
        tabs = raw_html.count('\t'),
        braces = raw_html.count('{'),
        brackets = raw_html.count('['),
        quesmarks = raw_html.count('?'),
        exclamarks = raw_html.count('!'),
        words = len(re.split('\s+', raw_html)),
    )

    # check empty
    if ((doc.html == None) | (len(doc.html.replace(r'\s', '')) < 10)):
        doc.empty = True
        return doc

    try:
    # if True:
        pd = Article('', fetch_images=False)
        pd.set_html(doc.html)
        pd.parse()
        pd.nlp()
    except Exception as e:
        print("-"*60)
        print("[parser_nlp %s]: %s" % (doc.fid, e))
        print(doc.html[:500])
        print("-"*60)
        return doc  # "%s: %s" % (e, doc.id)

    # select cleaned_text
    cleaned_text = " ".join(pd.text.lower().split())
    if (len(cleaned_text) < 140):
        soup = bs(doc.html)
        if soup.body:
            cleaned_text = soup.body.text
        if (len(cleaned_text) < 140):
            cleaned_text = soup.text
    cleaned_text = sanitize_txt(cleaned_text, lower=True)
    bow = nlp.nlp().txt2words(cleaned_text or '', False)

    # save results
    try:
        opengraph = pd.meta_data.get('og', {}) if pd.meta_data else {}
        top_image = opengraph.get('image') or (pd.top_image if pd.top_image else None)
        if isinstance(top_image, dict): top_image = top_image.get('identifier')
        if isinstance(opengraph.get('locale'), dict): opengraph['locale'] = opengraph.get('locale').get('identifier')
        publish_date = pm.process_date(opengraph.get('updated_time') or pd.publish_date)

        # canonical_link & domain
        domain = canonical_link = str(opengraph.get('url') or pd.canonical_link)
        if '//' in domain: domain = domain.split('//')[1]
        if '?' in domain: domain = domain.split('?')[0]
        domain = '/'.join(domain.split('/')[0:1])

        # update
        # doc.update(
        doc = db.articles(
            fid = doc.fid,
            html = doc.html,
            link_stats = doc.link_stats,
            link_factors = doc.link_factors,
            rand = doc.rand,
            html_cnt = doc.html_cnt,
            #
            lines = doc.lines,
            spaces = doc.spaces,
            tabs = doc.tabs,
            braces = doc.braces,
            brackets = doc.brackets,
            quesmarks = doc.quesmarks,
            exclamarks = doc.exclamarks,
            words = doc.words,
            #
            title = str(opengraph.get('title') or pd.title)[:500],
            # cleaned_text = str(cleaned_text),
            bow = bow,
            tags = [t.lower() for t in pd.tags],
            # opengraph = {sanitize_txt(k): sanitize_txt(v) for k, v in opengraph.items()},
            # summary = str(pd.summary),
            keywords = pd.keywords,
            top_image = str(top_image),
            movies = pd.movies,
            publish_date = publish_date,
            meta_site_name = str(opengraph.get('site_name')),
            meta_lang = str(opengraph.get('locale') or pd.meta_lang),
            meta_description = str(opengraph.get('description') or pd.meta_description),
            meta_keywords = pd.meta_keywords,
            canonical_link = canonical_link,
            domain = domain,
            authors = [n.lower().replace(' ', '_') for n in pd.authors],
#......... part of the code omitted here .........
Example 9: extract_data
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def extract_data(fname, loadp, savep):
    ######################
    # initialize process #
    ######################
    stream = GzipFile(loadp + fname)
    protocol = TBinaryProtocol.TBinaryProtocol(TTransport.TBufferedTransport(stream))
    data = {'data': []}
    count = 0

    ####################
    # begin extraction #
    ####################
    while True:
        page = WikiLinkItem()
        try:
            page.read(protocol)
            count += 1
        except:
            stream.close()
            break

        print '- processing FILE {0} ENTRY # {1}'.format(fname, count)
        print '\t $ URL: {0}'.format(page.url)

        #####################
        # initial filtering #
        #####################
        if page.url[:3] == 'ftp':
            print '\t\t ###### Ftp prefix detected (ignore) ###### \n'
            continue
        if page.url[len(page.url) - 4:] != 'html':
            print '\t\t ###### Non-html suffix detected (ignore) ###### \n'
            continue
        if page.content.dom == None:
            print '\t\t ###### Empty dom detected (ignore) ###### \n'
            continue

        #######################
        # secondary filtering #
        #######################
        entities = extract_entities(page.mentions)
        if len(entities) < 2:
            print '\t\t ###### Single entity found (discard) ###### \n'
            continue
        print '\t $ # Entities:', len(entities)

        #########################
        # alignment and parsing #
        #########################
        html = mark_dom(page.content.dom, entities)
        news = Article(page.url, language='en')
        try:
            news.set_html(html)
            news.parse()
        except:
            print '\t\t ###### Parsing failed (discard) ###### \n'
            continue

        ################
        # tokenization #
        ################
        text = None
        try:
            text = ftfy.fix_text(news.text)
            text = text.encode('ascii', 'ignore')
            text = seperate_delimiter(word_tokenize(text))
        except:
            print '\t\t ###### Tokenization failed (discard) ###### \n'
            continue

        #######################
        # save processed data #
        #######################
        print '\t $ Entry # {0} Saved \n'.format(count)
        data['data'].append({'text': text, 'dict': entities})

    #####################
    # save as json file #
    #####################
    print '****** {0}.json saved ******\n'.format(fname[:3])
    f = open(savep + '{0}.json'.format(fname[:3]), 'w')
    json.dump(data, f, indent=4)
    f.close()
Example 10: Article
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
# -*- coding: utf-8 -*-
from newspaper import Article
from goose import Goose
import requests
import json
import sys

article = Article(sys.argv[1])
article.download()

if not article.html:
    r = requests.get(sys.argv[1], verify=False, headers={'User-Agent': 'Mozilla/5.0'})
    article.set_html(r.text)

article.parse()
article.nlp()

published = ''
if article.publish_date:
    published = article.publish_date.strftime("%Y-%m-%d %H:%M:%S")

# Get body with goose
g = Goose()
goose_article = g.extract(raw_html=article.html)
body = goose_article.cleaned_text
summary = goose_article.meta_description

# Maybe use https://github.com/xiaoxu193/PyTeaser
if not summary:
    summary = article.summary
Example 11: prepare
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def prepare(self, response):
    article = Article(url=response.url)
    article.set_html(response.text)
    article.parse()
    return article
Example 12: Article
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
import sys, json
from newspaper import Article

htmlStr = ""
for line in sys.stdin:
    htmlStr = htmlStr + line

#obj = json.loads(jsonStr)

article = Article('')
article.set_html(htmlStr)
article.parse()
article.nlp()

ret = json.dumps(article.keywords)
print ret