This page collects typical usage examples of the Python method newspaper.Article.build. If you have been wondering how exactly Article.build works, how to call it, or what real-world usage looks like, the hand-picked code examples below should help. You can also explore further usage examples of the enclosing class, newspaper.Article.
Below are 13 code examples of Article.build, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
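For orientation before the examples, here is a minimal, self-contained sketch of the typical workflow. In newspaper, build() effectively wraps the download(), parse(), and nlp() steps into a single call; the URL below is only a placeholder, and the printed attributes (title, text, top_image) are standard Article fields.

from newspaper import Article

# Placeholder URL; substitute any article page you want to extract.
url = 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'
article = Article(url=url, language='en')
article.build()  # fetches, parses, and runs NLP in one call

print(article.title)       # extracted headline
print(article.text[:200])  # beginning of the extracted body text
print(article.top_image)   # URL of the lead image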
Example 1: test_chinese_fulltext_extract
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test_chinese_fulltext_extract(self):
    url = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'
    article = Article(url=url, language='zh')
    article.build()
    # assert isinstance(article.stopwords_class, StopWordsChinese)
    with codecs.open(os.path.join(TEXT_FN, 'chinese_text_1.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()
Example 2: test_chinese_fulltext_extract
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test_chinese_fulltext_extract(self):
    url = 'http://news.sohu.com/20050601/n225789219.shtml'
    mock_response_with(url, 'chinese_article')
    article = Article(url=url, language='zh')
    article.build()
    with codecs.open(os.path.join(TEXT_FN, 'chinese.txt'),
                     'r', 'utf8') as f:
        assert article.text == f.read()
Example 3: test_arabic_fulltext_extract
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test_arabic_fulltext_extract(self):
    url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html'
    article = Article(url=url)
    article.build()
    assert article.meta_lang == 'ar'
    # assert isinstance(article.stopwords_class, StopWordsArabic)
    with codecs.open(os.path.join(TEXT_FN, 'arabic_text_1.txt'), 'r', 'utf8') as f:
        assert article.text == f.read()
Example 4: test_spanish_fulltext_extract
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test_spanish_fulltext_extract(self):
    url = 'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'\
          'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
    mock_response_with(url, 'spanish_article')
    article = Article(url=url, language='es')
    article.build()
    with codecs.open(os.path.join(TEXT_FN, 'spanish.txt'),
                     'r', 'utf8') as f:
        assert article.text == f.read()
Example 5: test
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test():
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    try:
        a = Article(url)
        a.build()
        process_and_save_article(a, 'bbc')
    except Exception as e:
        # Avoid a bare except and report what actually went wrong.
        print("error detected:", e)
Example 6: test_arabic_fulltext_extract
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test_arabic_fulltext_extract(self):
    url = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'\
          'index.html'
    mock_response_with(url, 'arabic_article')
    article = Article(url=url)
    article.build()
    assert article.meta_lang == 'ar'
    with codecs.open(os.path.join(TEXT_FN, 'arabic.txt'),
                     'r', 'utf8') as f:
        assert article.text == f.read()
Example 7: _retrive_content
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def _retrive_content(url):
    article = Article(url)
    success = False
    try:
        article.build()
        success = True
    except ArticleException as e:
        sys.stderr.write(str(e))
    finally:
        return article, success
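A hypothetical call site for this helper (the URL is a placeholder); the two-value return lets callers skip URLs whose build failed:

article, success = _retrive_content('http://www.bbc.com/news/world-europe-35828810')
if success:
    print(article.title)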
Example 8: test
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test():
    #url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    url = 'http://www.bbc.com/news/world-europe-35828810'
    a = Article(url)
    a.build()
    loc = get_news_location(a, num_of_location=3)
    print(loc)
    try:
        print(detect(a.text))
    except lang_detect_exception.LangDetectException:
        print("Not English")
Example 9: test_save_article_function
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test_save_article_function():
    from newspaper import Article
    today = time.time()
    today = datetime.datetime.fromtimestamp(today)
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    a = Article(url)
    a.build()
    #print(a.title, a.publish_date)
    # if the news has no publish_date, set it to today
    if a.publish_date is None:
        a.publish_date = today
    path_to_save = get_path_to_save(a)
    data_a = get_serialized_article_obj(a)
    create_file(path_to_save, data=data_a)
Example 10: test
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test():
    #url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    #url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    url = 'http://www.nytimes.com/2016/03/19/world/europe/dubai-airliner-crashes-while-trying-to-land-at-russian-airport.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news&_r=1'
    print("building:", url)
    a = Article(url)
    a.build()
    process_and_save_article(a)
    print("first paragraph")
    print(a.text.split('\n')[0])
    print("Summary:")
    print(a.summary)
    try:
        print(detect(a.text))
    except lang_detect_exception.LangDetectException:
        print("Not English")
Example 11: aggregate
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def aggregate():
    # Drop articles older than a week before pulling new ones.
    ArticleRec.objects.filter(
        article_published__lte=datetime.datetime.today() - datetime.timedelta(days=7)
    ).delete()
    # random.shuffle works in place and returns None, so shuffle a list copy first.
    feeds = list(FeedRec.objects.all())
    shuffle(feeds)
    for f in feeds:
        u = f.feed_url
        print(u)
        article_list = grab_rss(f)
        x = 0
        for a in article_list:
            x += 1
            print("Checking article: " + str(x))
            article = Article(url=a.url)
            try:
                article.build()
            except (ArticleException, UnicodeDecodeError, ValueError) as e:
                print("Error: " + type(e).__name__)
                continue
            a.content = parser.parse(article.text)['text']
            print(len(a.content))
            if len(a.content) < 50:
                print("Error: Too short")
                continue
            a.tag = clf.predict([article.text])[0]
            width, height = get_image_size(article.top_image)
            if width > 100 or height > 100:
                a.img = article.top_image
            add_article(a)
Example 12: test
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
def test():
    #url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    #url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    urls = []
    #urls.append('')
    urls.append('http://www.bbc.com/news/world-australia-35800175')
    #urls.append('http://edition.cnn.com/2016/03/21/politics/bernie-sanders-wins-democrats-abroad/index.html')
    #urls.append('http://www.huffingtonpost.com/jonathan-greenberg/three-reasons-bernie-sand_b_9538508.html')
    #urls.append('http://ewn.co.za/2016/03/25/Nigeria-targets-300-army-officers-and-firms-in-widening-corruption-probe')
    for url in urls:
        print("building:", url)
        a = Article(url)
        a.build()
        process_and_save_article(a)
        try:
            print(detect(a.text))
        except lang_detect_exception.LangDetectException:
            print("Not English")
Example 13: ArticleTestCase
# Required module import: from newspaper import Article [as alias]
# Or: from newspaper.Article import build [as alias]
class ArticleTestCase(unittest.TestCase):
    def runTest(self):
        print('testing article unit')
        self.test_url()
        self.test_download_html()
        self.test_pre_download_parse()
        self.test_parse_html()
        self.test_meta_type_extraction()
        self.test_meta_extraction()
        self.test_pre_parse_nlp()
        self.test_nlp_body()

    def setUp(self):
        """Called before the first test case of this unit begins."""
        self.article = Article(
            url='http://www.cnn.com/2013/11/27/travel/weather-'
                'thanksgiving/index.html?iref=allsearch')

    def tearDown(self):
        """Called after all test cases of this unit finish."""
        pass

    @print_test
    def test_url(self):
        assert self.article.url == (
            u'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html')

    @print_test
    @responses.activate
    def test_download_html(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.download()
        assert len(self.article.html) == 75244

    @print_test
    def test_pre_download_parse(self):
        """Parsing before the article has been downloaded must fail."""
        article = Article(self.article.url)
        def failfunc():
            article.parse()
        self.assertRaises(ArticleException, failfunc)

    @print_test
    @responses.activate
    def test_parse_html(self):
        TOP_IMG = ('http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
                   '01-weather-1128-story-top.jpg')
        DOMAIN = 'www.cnn.com'
        SCHEME = 'http'
        AUTHORS = ['Dana Ford', 'Tom Watkins']
        TITLE = 'After storm, forecasters see smooth sailing for Thanksgiving'
        LEN_IMGS = 46
        META_LANG = 'en'
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        with open(os.path.join(TEST_DIR, 'data/cnn.txt'), 'r') as f:
            assert self.article.text == f.read()
        assert self.article.top_img == TOP_IMG
        assert self.article.authors == AUTHORS
        assert self.article.title == TITLE
        assert len(self.article.imgs) == LEN_IMGS
        assert self.article.meta_lang == META_LANG

    @print_test
    @responses.activate
    def test_meta_type_extraction(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        meta_type = self.article.extractor.get_meta_type(
            self.article.clean_doc)
        assert 'article' == meta_type

    @print_test
    @responses.activate
    def test_meta_extraction(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()
        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {'site_name': 'CNN',
                   'description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.',
                   'title': 'After storm, forecasters see smooth sailing for Thanksgiving',
                   'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html',
                   'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg',
                   'type': 'article'},
            'section': 'travel',
            'author': 'Dana Ford and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'},
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {'publisher': 'https://www.facebook.com/cnninternational'},
#......... the rest of the code is omitted here .........