This article collects typical usage examples of the Python newspaper.Article class, drawn from open-source projects. If you are unsure what the Article class does, how to call it, or want to see it in real code, the curated examples below may help.
Fifteen code examples of the Article class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
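Before the examples, here is a minimal sketch of the download/parse workflow that most of the examples below follow. The URL is only a placeholder, and the keyword/summary step via nlp() is optional and assumes newspaper's NLTK data is available.

from newspaper import Article

# Minimal sketch of typical Article usage (placeholder URL)
article = Article('https://example.com/some-news-story', language='en')
article.download()   # fetch the HTML
article.parse()      # extract title, text, authors, publish date, top image, ...

print(article.title)
print(article.authors, article.publish_date)
print(article.text)

# Optional: keyword and summary extraction (requires NLTK data to be installed)
article.nlp()
print(article.keywords)
print(article.summary)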
Example 1: get_details
def get_details():
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        result = {
            "url": url,
            "top_image": url,
            "text": "",
        }
        return jsonify(result)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    result = {
        "url": url,
        "top_image": top_image,
        "text": article.text,
    }
    return jsonify(result)
Example 2: test_spanish_fulltext_extract
def test_spanish_fulltext_extract(self):
    url = "http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html"
    article = Article(url=url, language="es")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "spanish_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
Example 3: get_text
def get_text(url):
    article = Article(url)
    article.download()
    article.parse()
    authors = article.authors
    publish_date = article.publish_date  # TODO: Slice publish date
    body_text = article.text
    body_text = body_text.replace('"', '')  # strip double quotes
    #nlp = article.nlp()
    keywords = article.keywords
    summary = article.summary
    title = article.title
    tags = article.tags
    #print(body_text)
    title = strip_non_ascii(title)
    summary = strip_non_ascii(summary)
    body_text = strip_non_ascii(body_text)
    keywords = ' '.join(keywords)
    keywords = strip_non_ascii(keywords)
    #print(title, summary, authors, publish_date, body_text, keywords)
    return (title, summary, authors, publish_date, body_text, keywords, tags)
Example 4: main
def main():
    try:
        headlines = requests.get(headline_url)
        headlines = json.loads(headlines.text)
        for headline in headlines['Headlines']:
            print("Processing Article %s" % headline['Url'])
            article = Article(headline['Url'])
            article.download()
            article.parse()
            response = requests.post(calais_url, files={'file': article.text}, headers=headers, timeout=80)
            rdf = json.loads(response.text)
            for x in rdf:
                if '_type' in rdf[x] and 'name' in rdf[x]:
                    print("Output for %s %s" % (rdf[x]['_type'], rdf[x]['name']))
                    for instance in rdf[x]['instances']:
                        text = instance['prefix'] + instance['suffix']
                        blob = TextBlob(text)
                        for sentence in blob.sentences:
                            print(sentence)
                            print(sentence.sentiment.polarity)
            print('--------------------')
            #print(rdf)
    except Exception as e:
        print('Error in connect ', e)
Example 5: test_chinese_fulltext_extract
def test_chinese_fulltext_extract(self):
    url = "http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml"
    article = Article(url=url, language="zh")
    article.download()
    article.parse()
    with codecs.open(os.path.join(TEXT_FN, "chinese_text_1.txt"), "r", "utf8") as f:
        assert article.text == f.read()
Example 6: test_chinese_fulltext_extract
def test_chinese_fulltext_extract(self):
    url = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'
    article = Article(url=url, language='zh')
    article.build()
    # assert isinstance(article.stopwords_class, StopWordsChinese)
    with codecs.open(os.path.join(TEXT_FN, 'chinese_text_1.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()
Example 7: main
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []
    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()
            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            #raw.append(sentences)
            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
Example 8: extract
def extract(self, item):
    """Creates an instance of Article without a download and returns an ArticleCandidate with the results of
    parsing the HTML code.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name()

    article = Article('')
    article.set_html(item['spider_response'].body)
    article.parse()
    article_candidate.title = article.title
    article_candidate.description = article.meta_description
    article_candidate.text = article.text
    article_candidate.topimage = article.top_image
    article_candidate.author = article.authors
    if article.publish_date is not None:
        try:
            article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
        except ValueError as exception:
            self.log.debug('%s: Newspaper failed to extract the date in the supported format, '
                           'publishing date set to None' % item['url'])
    article_candidate.language = article.meta_lang

    return article_candidate
Example 9: get_article
def get_article(url):
    a = Article(url)
    a.download()
    a.parse()

    article = dict()
    article['title'] = a.title
    article['publish_date'] = a.publish_date
    article['authors'] = a.authors
    article['lead_image'] = a.top_image
    article['movies'] = a.movies
    article['text'] = a.text
    article['keywords'] = get_keywords(a.text)

    # This is more likely to fail.
    # try:
    #     article.nlp()
    #     article['summary'] = 'This summary is generated: \n ' + a.summary
    # except Exception:
    #     print(Exception)
    #     article['summary'] = a.summary

    return article
Example 10: insert_url
def insert_url(url):
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()

    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    date = article.publish_date
    title = article.title
    text = article.text

    item = dict()
    item['datetime'] = date
    item['title'] = title
    item['text'] = text
    item['category'] = sys.argv[1].split('/')[6]
    item['link'] = sys.argv[1]
    item['origLink'] = sys.argv[1]

    print(item['category'])
    print(item['datetime'])

    if not duplicate(item, item['category'], cursor):
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    else:
        print(url, "already in BD")

    conn.commit()
    conn.close()
Example 11: makeDocs
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs=True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print "Created"

    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()

    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
Example 12: test2
def test2(self):
    articles = [
        'http://www.radionz.co.nz/news/national/281869/seven-arrests-over-police-manhunt',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11491573',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580358',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11580350',
        'http://www.stuff.co.nz/national/crime/75978990/whanganui-woman-accused-of-leaving-child-in-car-overnight.html',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11574608',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11577923',
        'http://www.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11591401',
        'http://m.nzherald.co.nz/wanganui-chronicle/news/article.cfm?c_id=1503426&objectid=11566180'
    ]
    articles = [
        'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
        'http://www.bbc.co.uk/news/uk-wales-35954982',
        'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
        'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
        'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
        'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
        'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
        'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
        'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not'
    ]
    with open("./Output2.txt", "w") as text_file:
        for url in articles:
            print(url)
            a = Article(url)
            a.download()
            a.parse()
            text_file.write(a.text.encode('utf-8'))
            text_file.write('\n')
Example 13: runTest
def runTest(self):
    # The "correct" fulltext needs to be manually checked
    # we have 50 so far
    FULLTEXT_PREPARED = 50
    domain_counters = {}

    with open(URLS_FILE, 'r') as f:
        urls = [d.strip() for d in f.readlines() if d.strip()]

    for url in urls[:FULLTEXT_PREPARED]:
        domain = get_base_domain(url)
        if domain in domain_counters:
            domain_counters[domain] += 1
        else:
            domain_counters[domain] = 1

        res_filename = domain + str(domain_counters[domain])
        html = mock_resource_with(res_filename, 'html')
        try:
            a = Article(url)
            a.download(html)
            a.parse()
        except Exception:
            print('<< URL: %s parse ERROR >>' % url)
            traceback.print_exc()
            continue

        correct_text = mock_resource_with(res_filename, 'txt')
        condensed_url = url[:30] + ' ...'
        print('%s -- fulltext status: %s' %
              (condensed_url, a.text == correct_text))
Example 14: get_image
def get_image():
    url = request.args.get('url', '')
    if not url:
        abort(400)
    if is_image(url):
        return redirect(url)
    article = Article(url)
    article.download()
    try:
        article.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        top_image = article.top_image.rsplit('?', 1)[0]
    except AttributeError:
        top_image = ''
    if top_image:
        return redirect(top_image)
    else:
        return '', 422
Example 15: f
def f(url):
    url_urls = url.text
    try:
        response = urllib2.urlopen(url_urls)
        status = response.code
        #print "detected webpage code:", status
        if status == 404:
            pass
        else:
            a_zh = Article(url_urls, language='zh')
            a_zh.download()
            a_zh.parse()
            # content_urls = a_zh.text
            # if content_urls == '':
            #     a_en = Article(url_urls, language='en')
            #     a_en.download()
            #     a_en.parse()
            #     content_urls = content_urls + a_en.text
            # if content_urls != '':
            #     pass
            #     # compare_article(url_urls, content_urls)
    except:
        pass