本文整理汇总了Python中newspaper.Article.parse方法的典型用法代码示例。如果您正苦于以下问题:Python Article.parse方法的具体用法?Python Article.parse怎么用?Python Article.parse使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类newspaper.Article的用法示例。
在下文中一共展示了Article.parse方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_article
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def parse_article(url, lang, featured=0, db=None):
    """Fetch, parse, and cache a news article keyed by URL.

    If the URL is already in the ``articles`` table, return the cached row
    as a dict. Otherwise download and parse it with newspaper, insert a new
    row, and return it. Returns None when parsing fails.

    Args:
        url: article URL; also the cache key.
        lang: language code stored with the row.
        featured: truthy to flag the article as featured (only honored when
            the extracted text is at least 50 characters long).
        db: open database connection; a fresh one is opened when omitted.
            (The original default ``db=connect_db()`` was evaluated once at
            definition time, sharing a single connection across all calls.)
    """
    if db is None:
        db = connect_db()
    cur = db.execute("select * from articles where url=?", (url,))
    row = cur.fetchone()
    if row is not None:
        # Cache hit: rebuild the dict from the stored columns.
        return dict(id=row[0], url=row[1], title=row[2], image=row[3],
                    text=row[4], authors=row[5], date=row[6],
                    featured=row[7], language=row[8])
    article = Article(url)
    article.download()
    try:
        article.parse()
    except Exception:  # narrowed from bare except; parsing is best-effort
        return None
    title = article.title
    image = article.top_image
    text = article.text
    authors = ",".join(article.authors)
    # Only a real datetime yields a usable timestamp; otherwise store 0.
    date = (int(time.mktime(article.publish_date.timetuple()))
            if isinstance(article.publish_date, datetime.datetime) else 0)
    db.execute(
        "insert into articles (url, title, image, text, authors, date, featured, language) "
        "values (?, ?, ?, ?, ?, ?, ?, ?)",
        (url, title, image, text, authors, date,
         featured and len(text) >= 50, lang))
    db.commit()
    idquery = db.execute("select (id) from articles where url=?", (url,))
    row_id = idquery.fetchone()[0]  # renamed: `id` shadowed the builtin
    return {"id": row_id, "url": url, "title": title, "image": image,
            "text": text, "authors": authors, "date": date, "language": lang}
示例2: test_chinese_fulltext_extract
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def test_chinese_fulltext_extract(self):
    """Extracted Chinese full text must match the stored reference file."""
    page = Article(
        url="http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml",
        language="zh")
    page.download()
    page.parse()
    reference_path = os.path.join(TEXT_FN, "chinese_text_1.txt")
    with codecs.open(reference_path, "r", "utf8") as reference:
        expected = reference.read()
    assert page.text == expected
示例3: test_spanish_fulltext_extract
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def test_spanish_fulltext_extract(self):
    """Extracted Spanish full text must match the stored reference file."""
    page = Article(
        url="http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html",
        language="es")
    page.download()
    page.parse()
    reference_path = os.path.join(TEXT_FN, "spanish_text_1.txt")
    with codecs.open(reference_path, "r", "utf8") as reference:
        expected = reference.read()
    assert page.text == expected
示例4: post_new
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def post_new(request):
    """Django view: create a post from a submitted form, or (on GET with a
    ``url`` parameter) prefill the form by scraping the given article URL."""
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            new_post = form.save(commit=False)
            new_post.author = request.user
            new_post.published_date = timezone.now()
            new_post.save()
            return redirect('blog.views.post_detail', pk=new_post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
        if len(url) > 5:
            # Scrape the article and seed the form with its metadata.
            scraped = Article(url, language='en')
            scraped.download()
            scraped.parse()
            scraped.nlp()
            form = PostForm({
                'title': scraped.title.replace(u'\u2019', "\'"),
                'summary': scraped.summary.replace('\n', ' ').replace(u'\u2019', "\'"),
                'image': scraped.top_image,
                'link': url,
                # host portion of the URL, without a leading "www."
                'source': url.split('//')[1].split('/')[0].replace('www.', ''),
                'status': 'UD',
            })
        else:
            form = PostForm()
    return render(request, 'blog/post_edit.html', {'form': form})
示例5: run
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def run(self):
    """Fetch each note page listed in NOTES_LIST with newspaper, parse it,
    and print its extracted text. (Python 2: uses print statements.)"""
    logging.debug("run() - [WAIT]")
    from newspaper import Article
    '''
    Library documentation: http://newspaper.readthedocs.org/en/latest/user_guide/quickstart.htm
    '''
    # IDs of the notes to fetch; commented entries are temporarily disabled.
    NOTES_LIST = [
        '118',
        '117',
        # '116',
        # '115',
    ]
    for note_id in NOTES_LIST:
        note = Article(url="http://site.tiagoprnl.in/core/visitor_home/nota/%s/" % note_id)
        note.download()
        # Visual separator between notes in the console output.
        print '*' * 100
        # print 'H T M L'
        # print note.html
        #print '*' * 100
        # print 'T E X T'
        note.parse()
        print note.text
    logging.debug("run() - [DONE]")
示例6: f
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def f(url):
    """Best-effort download-and-parse of the Chinese article linked by *url*.

    Args:
        url: a node whose ``.text`` attribute holds the target URL
            (presumably an lxml/BeautifulSoup element — TODO confirm).

    All failures are swallowed deliberately (the caller treats this as
    best-effort), but note the original dead branch: ``urllib2.urlopen``
    *raises* HTTPError for a 404 response rather than returning a 404
    code, so ``if status == 404`` could never trigger. The 404 case is
    now handled by the exception handler like every other failure.
    """
    url_urls = url.text
    try:
        urllib2.urlopen(url_urls)  # probe the URL; HTTPError/URLError on failure
        a_zh = Article(url_urls, language='zh')
        a_zh.download()
        a_zh.parse()
    except Exception:
        # Best-effort: skip pages that fail to download or parse.
        pass
示例7: check_url
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def check_url(args):
    """
    :param (basestr, basestr) url, res_filename:
    :return: (pubdate_failed, fulltext_failed)
    """
    url, res_filename = args
    html = mock_resource_with(res_filename, 'html')
    try:
        article = Article(url)
        article.download(html)
        article.parse()
    except Exception:
        print('<< URL: %s parse ERROR >>' % url)
        traceback.print_exc()
        # A parse error counts as both a pubdate and a fulltext failure.
        return True, True
    pubdate_failed = article.publish_date is None
    fulltext_failed = False
    correct_text = mock_resource_with(res_filename, 'txt')
    if article.text != correct_text:
        # print('Diff: ', simplediff.diff(correct_text, a.text))
        # `correct_text` holds the reason of failure if failure
        print('%s -- %s -- %s' %
              ('Fulltext failed',
               res_filename, correct_text.strip()))
        fulltext_failed = True
    # TODO: assert statements are commented out for full-text
    # extraction tests because we are constantly tweaking the
    # algorithm and improving
    # assert a.text == correct_text
    return pubdate_failed, fulltext_failed
示例8: insert_url
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def insert_url(url):
    """Download the article at *url* and store it in the publico SQLite DB,
    unless an identical item is already present."""
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()
    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    item = {
        'datetime': article.publish_date,
        'title': article.title,
        'text': article.text,
        # NOTE(review): category/link are taken from sys.argv[1], not from
        # the `url` argument — confirm this asymmetry is intentional.
        'category': sys.argv[1].split('/')[6],
        'link': sys.argv[1],
        'origLink': sys.argv[1],
    }
    print(item['category'])
    print(item['datetime'])
    if duplicate(item, item['category'], cursor):
        print(url, "already in BD")
    else:
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    conn.commit()
    conn.close()
示例9: makeDocs
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def makeDocs():
    """Rebuild the 'news' Elasticsearch index from the first 10 articles
    discovered on cnn.com. (Python 2: uses print statements.)"""
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs= True)
    # Drop and recreate the index so each run starts from a clean slate;
    # ignore "missing index" (404) / "already exists" (400) errors.
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)  # NOTE(review): appears unused — candidate for removal
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        # Only index articles that have both a publish date and text.
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
示例10: test2
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def test2(self):
    """Download a fixed list of UK news articles and append each one's
    extracted full text to ./Output2.txt.

    NOTE(review): the original assigned a first (NZ-news) list to
    ``articles`` and immediately overwrote it with the UK list below;
    that dead assignment has been removed.
    """
    articles = [
        'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
        'http://www.bbc.co.uk/news/uk-wales-35954982',
        'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
        'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
        'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
        'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
        'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
        'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
        'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not']
    with open("./Output2.txt", "w") as text_file:
        for url in articles:
            print(url)
            a = Article(url)
            a.download()
            a.parse()
            text_file.write(a.text.encode('utf-8'))
            text_file.write('\n')
示例11: is_valid_article
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def is_valid_article(link):
    """Return True (and record the article) if *link* is a CNN HTML page
    whose keywords hit ``nc_set`` without hitting ``contorversial_set``.

    Side effects: appends accepted articles to ``main_file`` and logs the
    link to ``visited_articles``.
    """
    print("Checking valid:\n" + link)
    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords
    # Accepted when at least one keyword is on-topic...
    matched = any(key in nc_set for key in keywords)
    # ...and none is controversial (controversial overrides a match).
    if any(key in contorversial_set for key in keywords):
        matched = False
    # Short-circuiting `and` replaces the original bitwise `&` chain.
    if matched and len(article.authors) > 0 and article.publish_date < datetime.datetime(2007, 12, 30, 0, 0):
        # BUG FIX: `article.keywords` is a list; the original concatenated it
        # directly into a str (TypeError at runtime). Join it first.
        main_file.write(article.title + "\t\t" + ",".join(keywords) + "\t\t"
                        + link + "\t\t" + article.text + "\n")
        visited_articles.write(link + "\n")
        return True
    return False
示例12: runTest
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def runTest(self):
    """Compare extracted full text against the manually verified fixtures
    and print a pass/fail status per URL.

    The "correct" fulltext needs to be manually checked; we have 50 so far.
    """
    FULLTEXT_PREPARED = 50
    domain_counters = {}
    with open(URLS_FILE, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
    for url in urls[:FULLTEXT_PREPARED]:
        # Fixture names are "<domain><running count for that domain>".
        domain = get_base_domain(url)
        domain_counters[domain] = domain_counters.get(domain, 0) + 1
        res_filename = domain + str(domain_counters[domain])
        html = mock_resource_with(res_filename, 'html')
        try:
            a = Article(url)
            a.download(html)
            a.parse()
        except Exception:
            print('<< URL: %s parse ERROR >>' % url)
            traceback.print_exc()
            continue
        correct_text = mock_resource_with(res_filename, 'txt')
        condensed_url = url[:30] + ' ...'
        print('%s -- fulltext status: %s' %
              (condensed_url, a.text == correct_text))
示例13: get_image
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def get_image():
    """Flask endpoint: resolve ?url= to its lead image and redirect there.

    Responses: 400 when no url is given; a redirect when the url itself is
    an image or a lead image is found; 422 when parsing fails or no image.
    """
    target = request.args.get('url', '')
    if not target:
        abort(400)
    if is_image(target):
        # The URL already points at an image; no scraping needed.
        return redirect(target)
    art = Article(target)
    art.download()
    try:
        art.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        # Strip any query string from the image URL.
        lead_image = art.top_image.rsplit('?',1)[0]
    except AttributeError:
        lead_image = ''
    return redirect(lead_image) if lead_image else ('', 422)
示例14: get_details
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def get_details():
    """Flask endpoint: return JSON {url, top_image, text} for ?url=.

    Responses: 400 when no url is given; 422 when parsing fails. A url that
    is itself an image is returned as its own top_image with empty text.
    """
    target = request.args.get('url', '')
    if not target:
        abort(400)
    if is_image(target):
        return jsonify({
            "url": target,
            "top_image": target,
            "text": "",
        })
    art = Article(target)
    art.download()
    try:
        art.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        # Strip any query string from the image URL.
        lead_image = art.top_image.rsplit('?',1)[0]
    except AttributeError:
        lead_image = ''
    return jsonify({
        "url": target,
        "top_image": lead_image,
        "text": art.text,
    })
示例15: show_article
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def show_article():
    """Flask endpoint: scrape the article at ?url_to_clean= and render it.

    Redirects to the index when no URL is supplied. HTML serialization and
    NLP are best-effort: failures fall back to a placeholder string / empty
    NLP fields rather than aborting the request.
    """
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))
    article = Article(url_to_clean)
    article.download()
    article.parse()
    try:
        html_string = ElementTree.tostring(article.clean_top_node)
    except Exception:  # narrowed from bare except; serialization is best-effort
        html_string = "Error converting html to string."
    try:
        article.nlp()
    except Exception:  # narrowed from bare except; NLP failure is non-fatal
        log.error("Couldn't process with NLP")
    a = {
        'html': html_string,
        'authors': str(', '.join(article.authors)),
        'title': article.title,
        'text': article.text,
        'top_image': article.top_image,
        'videos': str(', '.join(article.movies)),
        'keywords': str(', '.join(article.keywords)),
        'summary': article.summary
    }
    return render_template('article/index.html', article=a, url=url_to_clean)