This article collects typical usage examples of the Python method newspaper.Article.download. If you have been wondering exactly how Article.download works or how to use it in your own code, the hand-picked examples below should help. You can also read further about the class it belongs to, newspaper.Article.
The following shows 15 code examples of Article.download, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
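Before the examples, here is a minimal sketch of the usual call order for newspaper.Article: download() first, then parse(), then optionally nlp(). The URL is only a placeholder and error handling is omitted for brevity.
# Minimal usage sketch (placeholder URL, no error handling)
from newspaper import Article

article = Article('http://example.com/some-news-story')  # placeholder URL
article.download()   # fetch the raw HTML
article.parse()      # extract title, text, authors, top_image, ...
article.nlp()        # optional: compute keywords and summary
print(article.title)
print(article.keywords)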
Example 1: get_image
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def get_image():
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        return redirect(url)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    if not top_image == '':
        return redirect(top_image)
    else:
        return '', 422
Example 2: test_arabic_fulltext_extract
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def test_arabic_fulltext_extract(self):
    url = "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html"
    article = Article(url=url, language="ar")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "arabic_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
Example 3: test_pre_parse_nlp
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def test_pre_parse_nlp(self):
    """Test running NLP algos before parsing the article"""
    new_article = Article(self.article.url)
    resp = mock_response_with(new_article.url, 'cnn_article')
    new_article.download(resp)
    self.assertRaises(ArticleException, new_article.nlp)
Example 4: main
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def main():
    try:
        headlines = requests.get(headline_url)
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()
            response = requests.post(calais_url, files={'file': article.text}, headers=headers, timeout=80)
            rdf = json.loads(response.text)
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
                            print('--------------------')
            # print(rdf)
    except Exception as e:
        print('Error in connect ', e)
Example 5: get_nlp_data
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def get_nlp_data(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    return json.dumps(article.keywords)
Example 6: scrapeURLS
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def scrapeURLS(inFilPath):
    texts = []
    cache = loadCache()
    toDelURLs = []
    with open(inFilPath) as f:
        urls = f.readlines()
    for url in urls:
        if filter(urlFilters, url):
            toDelURLs.append(url)
        if url in cache:
            txt = cache[url]
        else:
            print "Scraping URL %s" % url
            article = Article(url)
            article.download()
            article.parse()
            txt = article.text.replace("\n", " ").replace("  ", " ").strip()
            if txt == "" or filter(txtFilter, txt):
                toDelURLs.append(url)
                continue
            cacheURL(url, txt)
        texts.append(txt)
    deleteURLs(inFilPath, toDelURLs)
    return texts
Example 7: post_new
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def post_new(request):
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            post.author = request.user
            post.published_date = timezone.now()
            post.save()
            return redirect('blog.views.post_detail', pk=post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
        if len(url) > 5:
            article = Article(url, language='en')
            article.download()
            article.parse()
            article.nlp()
            image = article.top_image
            summary = article.summary.replace('\n', ' ').replace(u'\u2019', "\'")
            title = article.title.replace(u'\u2019', "\'")
            source = url.split('//')[1].split('/')[0].replace('www.', '')
            status = 'UD'
            form = PostForm({'title': title, 'summary': summary, 'image': image, 'link': url, 'source': source, 'status': status})
        else:
            form = PostForm()
    return render(request, 'blog/post_edit.html', {'form': form})
Example 8: get_article
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def get_article(url):
    a = Article(url)
    a.download()
    a.parse()
    article = dict()
    article['title'] = a.title
    article['publish_date'] = a.publish_date  # newspaper exposes this as publish_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)
    # Summarization via nlp() is more likely to fail, so it is left disabled:
    # try:
    #     article.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     print Exception
    #     article['summary'] = a.summary
    return article
Example 9: parse_news
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def parse_news(self, response):
    item = ScrapyGooglenewsItem()
    # only log the warning info from requests
    logging.getLogger("requests").setLevel(logging.WARNING)
    for href in response.xpath('//h2[@class="title"]/a/@href').extract():
        item['link'] = href
        # use newspaper-0.0.8 to scrape the webpage, then get clean text.
        article = Article(item['link'])
        article.download()
        article.parse()
        item['title'] = article.title
        item['text'] = article.text
        # item['authors'] = article.authors
        # item['date'] = article.publish_date
        if response.url.split('&')[-1] == 'topic=w':
            item['domain'] = 'World'
        if response.url.split('&')[-1] == 'topic=n':
            item['domain'] = 'U.S.'
        if response.url.split('&')[-1] == 'topic=b':
            item['domain'] = 'Business'
        if response.url.split('&')[-1] == 'topic=tc':
            item['domain'] = 'Technology'
        if response.url.split('&')[-1] == 'topic=e':
            item['domain'] = 'Entertainment'
        if response.url.split('&')[-1] == 'topic=s':
            item['domain'] = 'Sports'
        if response.url.split('&')[-1] == 'topic=snc':
            item['domain'] = 'Science'
        if response.url.split('&')[-1] == 'topic=m':
            item['domain'] = 'Health'
        yield item
Example 10: insert_url
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()
    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text
    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]
    print(item['category'])
    print(item['datetime'])
    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")
    conn.commit()
    conn.close()
Example 11: makeDocs
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs=True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
Example 12: is_valid_article
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def is_valid_article(link):
    print("Checking valid:\n" + link)
    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords
    matched = False
    for key in keywords:
        if key in nc_set:
            matched = True
    for key in keywords:
        if key in contorversial_set:
            matched = False
    if matched & (len(article.authors) > 0) & (article.publish_date < datetime.datetime(2007, 12, 30, 0, 0)):
        # keywords is a list, so convert it to a string before concatenating
        main_file.write(article.title + "\t\t" + str(article.keywords) + "\t\t" + link + "\t\t" + article.text + "\n")
        visited_articles.write(link + "\n")
        return True
    return False
Example 13: test2
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def test2(self):
    articles = [
        'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
        'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
        'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
    ]
    articles = [
        'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
        'http://www.bbc.co.uk/news/uk-wales-35954982',
        'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
        'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
        'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
        'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
        'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
        'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
        'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not'
    ]
    with open("./Output2.txt", "w") as text_file:
        for url in articles:
            print(url)
            a = Article(url)
            a.download()
            a.parse()
            text_file.write(a.text.encode('utf-8'))
            text_file.write('\n')
Example 14: runTest
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def runTest(self):
    # The "correct" fulltext needs to be manually checked
    # we have 50 so far
    FULLTEXT_PREPARED = 50
    domain_counters = {}
    with open(URLS_FILE, 'r') as f:
        urls = [d.strip() for d in f.readlines() if d.strip()]
    for url in urls[:FULLTEXT_PREPARED]:
        domain = get_base_domain(url)
        if domain in domain_counters:
            domain_counters[domain] += 1
        else:
            domain_counters[domain] = 1
        res_filename = domain + str(domain_counters[domain])
        html = mock_resource_with(res_filename, 'html')
        try:
            a = Article(url)
            a.download(html)
            a.parse()
        except Exception:
            print('<< URL: %s parse ERROR >>' % url)
            traceback.print_exc()
            continue
        correct_text = mock_resource_with(res_filename, 'txt')
        condensed_url = url[:30] + ' ...'
        print('%s -- fulltext status: %s' %
              (condensed_url, a.text == correct_text))
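Examples 3 and 14 pass pre-fetched content directly to download() instead of letting the library issue the HTTP request itself, which is handy for tests and caching. A minimal sketch of that pattern, assuming a newspaper version whose download() accepts cached HTML as its first argument (the URL is a placeholder):
# Sketch: reuse already-fetched HTML with Article.download()
import requests
from newspaper import Article

url = 'http://example.com/some-news-story'   # placeholder URL
html = requests.get(url).text                # fetch the page ourselves (or load a fixture)
article = Article(url)
article.download(html)   # reuse the cached HTML; no second HTTP request
article.parse()
print(article.title)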
Example 15: show_article
# Module to import: from newspaper import Article [as alias]
# Or: from newspaper.Article import download [as alias]
def show_article():
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))
    article = Article(url_to_clean)
    article.download()
    article.parse()
    try:
        html_string = ElementTree.tostring(article.clean_top_node)
    except:
        html_string = "Error converting html to string."
    try:
        article.nlp()
    except:
        log.error("Couldn't process with NLP")
    a = {
        'html': html_string,
        'authors': str(', '.join(article.authors)),
        'title': article.title,
        'text': article.text,
        'top_image': article.top_image,
        'videos': str(', '.join(article.movies)),
        'keywords': str(', '.join(article.keywords)),
        'summary': article.summary
    }
    return render_template('article/index.html', article=a, url=url_to_clean)