This article collects typical usage examples of the Python method newspaper.Article.download. If you have been wondering exactly how Article.download works or how to use it in your own code, the hand-picked examples below should help. You can also read further about the class it belongs to, newspaper.Article.
The following shows 15 code examples of Article.download, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
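Before the examples, here is a minimal sketch of the usual call order for newspaper.Article: download() first, then parse(), then optionally nlp(). The URL is only a placeholder and error handling is omitted for brevity.
# Minimal usage sketch (placeholder URL, no error handling)
from newspaper import Article

article = Article('http://example.com/some-news-story')  # placeholder URL
article.download()   # fetch the raw HTML
article.parse()      # extract title, text, authors, top_image, ...
article.nlp()        # optional: compute keywords and summary
print(article.title)
print(article.keywords)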
Example 1: get_image
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def get_image():
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        return redirect(url)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    if not top_image == '':
        return redirect(top_image)
    else:
        return '', 422
Example 2: test_arabic_fulltext_extract
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def test_arabic_fulltext_extract(self):
    url = "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html"
    article = Article(url=url, language="ar")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "arabic_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
Example 3: test_pre_parse_nlp
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def test_pre_parse_nlp(self):
    """Test running NLP algos before parsing the article"""
    new_article = Article(self.article.url)
    resp = mock_response_with(new_article.url, 'cnn_article')
    new_article.download(resp)
    self.assertRaises(ArticleException, new_article.nlp)
Example 4: main
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def main():
    try:
        headlines = requests.get(headline_url)
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()
            response = requests.post(calais_url, files={'file': article.text}, headers=headers, timeout=80)
            rdf = json.loads(response.text)
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
                            print('--------------------')
            # print(rdf)
    except Exception as e:
        print('Error in connect ', e)
Example 5: get_nlp_data
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def get_nlp_data(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    return json.dumps(article.keywords)
Example 6: scrapeURLS
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def scrapeURLS(inFilPath):
    texts = []
    cache = loadCache()
    toDelURLs = []
    with open(inFilPath) as f:
        urls = f.readlines()
    for url in urls:
        if filter(urlFilters, url):
            toDelURLs.append(url)
        if url in cache:
            txt = cache[url]
        else:
            print "Scraping URL %s" % url
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text.replace("\n", " ").replace("  ", " ").strip()
            if txt == "" or filter(txtFilter, txt):
                toDelURLs.append(url)
                continue
            cacheURL(url, txt)
        texts.append(txt)
    deleteURLs(inFilPath, toDelURLs)
    return texts
Example 7: post_new
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def post_new(request):
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            post.author = request.user
            post.published_date = timezone.now()
            post.save()
            return redirect('blog.views.post_detail', pk=post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
        if len(url) > 5:
            article = Article(url, language='en')
            article.download()
            article.parse()
            article.nlp()
            image = article.top_image
            summary = article.summary.replace('\n', ' ').replace(u'\u2019', "\'")
            title = article.title.replace(u'\u2019', "\'")
            source = url.split('//')[1].split('/')[0].replace('www.', '')
            status = 'UD'
            form = PostForm({'title': title, 'summary': summary, 'image': image, 'link': url, 'source': source, 'status': status})
        else:
            form = PostForm()
    return render(request, 'blog/post_edit.html', {'form': form})
Example 8: get_article
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def get_article(url):
    a = Article(url)
    a.download()
    a.parse()
    article = dict()
    article['title'] = a.title
    article['publish_date'] = a.publish_date  # newspaper exposes this as publish_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)
    # Summarization via nlp() is more likely to fail, so it is left disabled:
    # try:
    #     article.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     print Exception
    #     article['summary'] = a.summary
    return article
Example 9: parse_news
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def parse_news(self, response):
    item = ScrapyGooglenewsItem()
    # only log the warning info from requests
    logging.getLogger("requests").setLevel(logging.WARNING)
    for href in response.xpath('//h2[@class="title"]/a/@href').extract():
        item['link'] = href
        # use newspaper-0.0.8 to scrape the webpage, then get clean text.
        article = Article(item['link'])
        article.download()
        article.parse()
        item['title'] = article.title
        item['text'] = article.text
        # item['authors'] = article.authors
        # item['date'] = article.publish_date
        if response.url.split('&')[-1] == 'topic=w':
            item['domain'] = 'World'
        if response.url.split('&')[-1] == 'topic=n':
            item['domain'] = 'U.S.'
        if response.url.split('&')[-1] == 'topic=b':
            item['domain'] = 'Business'
        if response.url.split('&')[-1] == 'topic=tc':
            item['domain'] = 'Technology'
        if response.url.split('&')[-1] == 'topic=e':
            item['domain'] = 'Entertainment'
        if response.url.split('&')[-1] == 'topic=s':
            item['domain'] = 'Sports'
        if response.url.split('&')[-1] == 'topic=snc':
            item['domain'] = 'Science'
        if response.url.split('&')[-1] == 'topic=m':
            item['domain'] = 'Health'
        yield item
Example 10: insert_url
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()
    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text
    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]
    print(item['category'])
    print(item['datetime'])
    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")
    conn.commit()
    conn.close()
Example 11: makeDocs
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs=True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
Example 12: is_valid_article
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def is_valid_article(link):
    print("Checking valid:\n" + link)
    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords
    matched = False
    for key in keywords:
        if key in nc_set:
            matched = True
    for key in keywords:
        if key in contorversial_set:
            matched = False
    if matched & (len(article.authors) > 0) & (article.publish_date < datetime.datetime(2007, 12, 30, 0, 0)):
        # keywords is a list, so convert it to a string before concatenating
        main_file.write(article.title + "\t\t" + str(article.keywords) + "\t\t" + link + "\t\t" + article.text + "\n")
        visited_articles.write(link + "\n")
        return True
    return False
Example 13: test2
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def test2(self):
    articles = [
        'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
        'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
        'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
    ]
    articles = [
        'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
        'http://www.bbc.co.uk/news/uk-wales-35954982',
        'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
        'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
        'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
        'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
        'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
        'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
        'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not'
    ]
    with open("./Output2.txt", "w") as text_file:
        for url in articles:
            print(url)
            a = Article(url)
            a.download()
            a.parse()
            text_file.write(a.text.encode('utf-8'))
            text_file.write('\n')
Example 14: runTest
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def runTest(self):
    # The "correct" fulltext needs to be manually checked
    # we have 50 so far
    FULLTEXT_PREPARED = 50
    domain_counters = {}
    with open(URLS_FILE, 'r') as f:
        urls = [d.strip() for d in f.readlines() if d.strip()]
    for url in urls[:FULLTEXT_PREPARED]:
        domain = get_base_domain(url)
        if domain in domain_counters:
            domain_counters[domain] += 1
        else:
            domain_counters[domain] = 1
        res_filename = domain + str(domain_counters[domain])
        html = mock_resource_with(res_filename, 'html')
        try:
            a = Article(url)
            a.download(html)
            a.parse()
        except Exception:
            print('<< URL: %s parse ERROR >>' % url)
            traceback.print_exc()
            continue
        correct_text = mock_resource_with(res_filename, 'txt')
        condensed_url = url[:30] + ' ...'
        print('%s -- fulltext status: %s' %
              (condensed_url, a.text == correct_text))
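Examples 3 and 14 pass pre-fetched content directly to download() instead of letting the library issue the HTTP request itself, which is handy for tests and caching. A minimal sketch of that pattern, assuming a newspaper version whose download() accepts cached HTML as its first argument (the URL is a placeholder):
# Sketch: reuse already-fetched HTML with Article.download()
import requests
from newspaper import Article

url = 'http://example.com/some-news-story'   # placeholder URL
html = requests.get(url).text                # fetch the page ourselves (or load a fixture)
article = Article(url)
article.download(html)   # reuse the cached HTML; no second HTTP request
article.parse()
print(article.title)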
Example 15: show_article
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def show_article():
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))
    article = Article(url_to_clean)
    article.download()
    article.parse()
    try:
        html_string = ElementTree.tostring(article.clean_top_node)
    except:
        html_string = "Error converting html to string."
    try:
        article.nlp()
    except:
        log.error("Couldn't process with NLP")
    a = {
        'html': html_string,
        'authors': str(', '.join(article.authors)),
        'title': article.title,
        'text': article.text,
        'top_image': article.top_image,
        'videos': str(', '.join(article.movies)),
        'keywords': str(', '.join(article.keywords)),
        'summary': article.summary
    }
    return render_template('article/index.html', article=a, url=url_to_clean)