当前位置: 首页>>代码示例>>Python>>正文


Python Article.build方法代码示例

本文整理汇总了Python中newspaper.Article.build方法的典型用法代码示例。如果您正苦于以下问题:Python Article.build方法的具体用法?Python Article.build怎么用?Python Article.build使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在newspaper.Article的用法示例。


在下文中一共展示了Article.build方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_chinese_fulltext_extract

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
 def test_chinese_fulltext_extract(self):
     """Build a Chinese-language article and check that the extracted
     body text matches the stored fixture file."""
     target = 'http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml'
     zh_article = Article(url=target, language='zh')
     zh_article.build()
     # assert isinstance(zh_article.stopwords_class, StopWordsChinese)
     fixture_path = os.path.join(TEXT_FN, 'chinese_text_1.txt')
     with codecs.open(fixture_path, 'r', 'utf8') as fixture:
         assert zh_article.text == fixture.read()
开发者ID:Geekking,项目名称:newspaper,代码行数:9,代码来源:unit_tests.py

示例2: test_chinese_fulltext_extract

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
 def test_chinese_fulltext_extract(self):
     """Build a Chinese article against a mocked HTTP response and
     compare its text with the on-disk fixture."""
     target = 'http://news.sohu.com/20050601/n225789219.shtml'
     mock_response_with(target, 'chinese_article')
     zh_article = Article(url=target, language='zh')
     zh_article.build()
     fixture_path = os.path.join(TEXT_FN, 'chinese.txt')
     with codecs.open(fixture_path, 'r', 'utf8') as fixture:
         assert zh_article.text == fixture.read()
开发者ID:Jamonek,项目名称:newspaper,代码行数:10,代码来源:unit_tests.py

示例3: test_arabic_fulltext_extract

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
    def test_arabic_fulltext_extract(self):
        """Build an Arabic CNN article, verify the detected meta language
        and compare the extracted text with the fixture."""
        target = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html'

        ar_article = Article(url=target)
        ar_article.build()
        assert ar_article.meta_lang == 'ar'
        # assert isinstance(ar_article.stopwords_class, StopWordsArabic)
        fixture_path = os.path.join(TEXT_FN, 'arabic_text_1.txt')
        with codecs.open(fixture_path, 'r', 'utf8') as fixture:
            assert ar_article.text == fixture.read()
开发者ID:Geekking,项目名称:newspaper,代码行数:11,代码来源:unit_tests.py

示例4: test_spanish_fulltext_extract

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
 def test_spanish_fulltext_extract(self):
     """Build a Spanish article against a mocked response and compare
     its text with the stored fixture."""
     target = 'http://ultimahora.es/mallorca/noticia/noticias/local/fiscal'\
           'ia-anticorrupcion-estudia-recurre-imputacion-infanta.html'
     mock_response_with(target, 'spanish_article')
     es_article = Article(url=target, language='es')
     es_article.build()
     fixture_path = os.path.join(TEXT_FN, 'spanish.txt')
     with codecs.open(fixture_path, 'r', 'utf8') as fixture:
         assert es_article.text == fixture.read()
开发者ID:Jamonek,项目名称:newspaper,代码行数:11,代码来源:unit_tests.py

示例5: test

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
def test():
    """Smoke test: build one BBC article and pass it to the processing
    pipeline, printing a notice instead of crashing on failure.

    FIX: the original used a bare ``except:``, which also swallows
    SystemExit and KeyboardInterrupt; narrowed to ``Exception``.
    """
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    try:
        a = Article(url)
        a.build()
        process_and_save_article(a, 'bbc')
    except Exception:
        print ("error detected")
开发者ID:what7newssay,项目名称:webpage,代码行数:11,代码来源:articleProcessor.py

示例6: test_arabic_fulltext_extract

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
 def test_arabic_fulltext_extract(self):
     """Build an Arabic article against a mocked response, verify the
     detected meta language and compare the text with the fixture."""
     target = 'http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/'\
           'index.html'
     mock_response_with(target, 'arabic_article')
     ar_article = Article(url=target)
     ar_article.build()
     assert ar_article.meta_lang == 'ar'
     fixture_path = os.path.join(TEXT_FN, 'arabic.txt')
     with codecs.open(fixture_path, 'r', 'utf8') as fixture:
         assert ar_article.text == fixture.read()
开发者ID:Jamonek,项目名称:newspaper,代码行数:12,代码来源:unit_tests.py

示例7: _retrive_content

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
def _retrive_content(url):
    """Download and parse *url* with newspaper.

    Returns:
        (article, success): the (possibly partially built) Article and a
        bool indicating whether ``build()`` completed without error.

    FIXES vs. original:
    - ``sterr`` was an undefined name, so the error path itself raised
      NameError; now writes to ``sys.stderr``.
    - ``write(e)`` passed the exception object where a string is
      required; now uses ``str(e)``.
    - ``return`` inside ``finally`` silently swallowed every exception,
      not just ArticleException; unexpected errors now propagate.
    """
    import sys  # local import: file's top-level imports are not visible here

    article = Article(url)
    success = False
    try:
        article.build()
        success = True
    except ArticleException as e:
        sys.stderr.write(str(e))
    return article, success
开发者ID:CzarSimon,项目名称:mimir,代码行数:12,代码来源:scraper.py

示例8: test

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
def test():
    """Build a sample BBC article, print its top three locations, then
    print the detected language (or a notice when detection fails)."""
    # Alternative sample URLs kept for manual experimentation:
    #url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    story = Article(url)
    story.build()

    locations = get_news_location(story, num_of_location=3)
    print (locations)

    try:
        print (detect(story.text))
    except lang_detect_exception.LangDetectException:
        print ("Not English")
开发者ID:what7newssay,项目名称:webpage,代码行数:18,代码来源:news_processing.py

示例9: test_save_article_function

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
def test_save_article_function():
    """Build a sample article and exercise the serialize-and-save path.

    Articles without a publish date are stamped with the current time so
    the save path always has a date to work with.

    FIX: the original computed "now" via ``time.time()`` followed by
    ``datetime.fromtimestamp`` — ``datetime.datetime.now()`` is the
    direct equivalent.
    """
    from newspaper import Article
    today = datetime.datetime.now()
    url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    a = Article(url)
    a.build()
    #print (a.title, a.publish_date)

    # If the news has no publish_date, fall back to "now".
    if a.publish_date is None:
        a.publish_date = today

    path_to_save = get_path_to_save(a)
    data_a = get_serialized_article_obj(a)
    create_file(path_to_save, data = data_a)
开发者ID:what7newssay,项目名称:webpage,代码行数:19,代码来源:news_processing.py

示例10: test

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
def test():
    """Build one NYT article, run the save pipeline, then print the first
    paragraph, the summary, and the detected language."""
    # Alternative sample URLs kept for manual experimentation:
    #url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    #url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    url = 'http://www.nytimes.com/2016/03/19/world/europe/dubai-airliner-crashes-while-trying-to-land-at-russian-airport.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news&_r=1'
    print ("building:", url)
    story = Article(url)
    story.build()
    process_and_save_article(story)

    print ("first paragraph")
    first_paragraph = story.text.split('\n')[0]
    print (first_paragraph)
    print ("Summary:")
    print (story.summary)

    try:
        print (detect(story.text))
    except lang_detect_exception.LangDetectException:
        print ("Not English")
开发者ID:what7newssay,项目名称:webpage,代码行数:22,代码来源:news_processing+-+Copy.py

示例11: aggregate

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
def aggregate():
    """Refresh stored articles from every registered RSS feed.

    Deletes articles published more than a week ago, then for each feed:
    downloads every linked article, skips download/parse failures and
    bodies shorter than 50 characters, classifies the text with ``clf``,
    attaches a top image when it is large enough, and saves the record.
    """
    # Purge anything older than seven days.
    ArticleRec.objects.filter(article_published__lte= datetime.datetime.today()-datetime.timedelta(days=7)).delete()

    # NOTE(review): if ``shuffle`` here is random.shuffle it returns None
    # (it shuffles in place) and this loop would raise TypeError.
    # Assuming it is a helper that RETURNS a shuffled sequence — confirm.
    for f in shuffle(FeedRec.objects.all()):

        u = f.feed_url

        print(u)
        article_list = grab_rss(f)

        # enumerate replaces the original hand-rolled counter.
        for x, a in enumerate(article_list, start=1):
            print("Checking article: " + str(x))

            article = Article(url=a.url)

            try:
                article.build()
            except (ArticleException, UnicodeDecodeError, ValueError):
                print("Error: ArticleException")
                continue

            a.content = parser.parse(article.text)['text']
            print(len(a.content))
            if len(a.content) < 50:
                print("Error: Too short")
                continue

            a.tag = clf.predict([article.text])[0]

            width, height = get_image_size(article.top_image)

            # Only keep images that are plausibly real photos.
            if width > 100 or height > 100:
                a.img = article.top_image
            add_article(a)
开发者ID:liamcreagh,项目名称:Anthus-News,代码行数:39,代码来源:feed_aggregator.py

示例12: test

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
def test():
    """Build and process each URL in a small sample list, then print the
    detected language of the last article built."""
    # Alternative sample URLs kept for manual experimentation:
    #url = 'http://money.cnn.com/2016/02/26/investing/warren-buffett-berkshire-hathaway-annual-shareholder-letter/index.html?section=money_topstories'
    #url = 'http://www.bbc.com/hindi/sport/2016/02/160227_heart_change_for_kohli_fan_dil'
    #url = 'http://www.bbc.com/news/world-europe-35828810'
    #url = 'http://fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/'
    sample_urls = []
    #sample_urls.append('')
    sample_urls.append('http://www.bbc.com/news/world-australia-35800175')
    #sample_urls.append('http://edition.cnn.com/2016/03/21/politics/bernie-sanders-wins-democrats-abroad/index.html')
    #sample_urls.append('http://www.huffingtonpost.com/jonathan-greenberg/three-reasons-bernie-sand_b_9538508.html')
    #sample_urls.append('http://ewn.co.za/2016/03/25/Nigeria-targets-300-army-officers-and-firms-in-widening-corruption-probe')
    for url in sample_urls:
        print ("building:", url)
        a = Article(url)
        a.build()
        process_and_save_article(a)

    # ``a`` is the article built in the final loop iteration.
    try:
        print (detect(a.text))
    except lang_detect_exception.LangDetectException:
        print ("Not English")
开发者ID:what7newssay,项目名称:webpage,代码行数:26,代码来源:np_special.py

示例13: ArticleTestCase

# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import build [as 别名]
class ArticleTestCase(unittest.TestCase):
    def runTest(self):
        """Run every article unit check in a fixed order.

        FIX: the original used the Python 2 ``print`` statement; the
        parenthesized form below behaves identically on Python 2 and is
        required on Python 3 (consistent with the rest of this file).
        """
        print('testing article unit')
        self.test_url()
        self.test_download_html()
        self.test_pre_download_parse()
        self.test_parse_html()
        self.test_meta_type_extraction()
        self.test_meta_extraction()
        self.test_pre_parse_nlp()
        self.test_nlp_body()

    def setUp(self):
        """Create the CNN fixture article before each test case runs."""
        cnn_url = ('http://www.cnn.com/2013/11/27/travel/weather-'
                   'thanksgiving/index.html?iref=allsearch')
        self.article = Article(url=cnn_url)

    def tearDown(self):
        """Run after every test case of this unit; nothing to clean up."""
        pass

    @print_test
    def test_url(self):
        """The tracking query string should be stripped from the URL."""
        expected = (
            u'http://www.cnn.com/2013/11/27/travel/weather-'
            'thanksgiving/index.html')
        assert self.article.url == expected

    @print_test
    @responses.activate
    def test_download_html(self):
        """Downloading the mocked CNN page yields HTML of the known size."""
        mock_response_with(self.article.url, 'cnn_article')
        self.article.download()
        page_html = self.article.html
        assert len(page_html) == 75244

    @print_test
    def test_pre_download_parse(self):
        """Calling parse() before download() must raise ArticleException."""
        undownloaded = Article(self.article.url)
        # Bound method passed directly instead of the original wrapper func.
        self.assertRaises(ArticleException, undownloaded.parse)

    @print_test
    @responses.activate
    def test_parse_html(self):
        """A full build() of the mocked CNN page yields the known text,
        image, authors, title, image count and meta language."""
        expected_top_img = (
            'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-'
            '01-weather-1128-story-top.jpg')
        expected_authors = ['Dana Ford', 'Tom Watkins']
        expected_title = 'After storm, forecasters see smooth sailing for Thanksgiving'
        expected_img_count = 46

        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()

        with open(os.path.join(TEST_DIR, 'data/cnn.txt'), 'r') as fixture:
            assert self.article.text == fixture.read()
        assert self.article.top_img == expected_top_img
        assert self.article.authors == expected_authors
        assert self.article.title == expected_title
        assert len(self.article.imgs) == expected_img_count
        assert self.article.meta_lang == 'en'

    @print_test
    @responses.activate
    def test_meta_type_extraction(self):
        """The og meta type extracted from the CNN fixture is 'article'."""
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()

        doc = self.article.clean_doc
        assert self.article.extractor.get_meta_type(doc) == 'article'

    @print_test
    @responses.activate
    def test_meta_extraction(self):
        mock_response_with(self.article.url, 'cnn_article')
        self.article.build()

        meta = self.article.extractor.get_meta_data(self.article.clean_doc)
        META_DATA = defaultdict(dict, {
            'medium': 'news',
            'googlebot': 'noarchive',
            'pubdate': '2013-11-27T08:36:32Z',
            'title': 'After storm, forecasters see smooth sailing for Thanksgiving - CNN.com',
            'og': {'site_name': 'CNN','description': 'A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.', 'title': 'After storm, forecasters see smooth sailing for Thanksgiving', 'url': 'http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html', 'image': 'http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg', 'type': 'article'},
            'section': 'travel',
            'author': 'Dana Ford and Tom Watkins, CNN',
            'robots': 'index,follow',
            'vr': {'canonical': 'http://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html'},
            'source': 'CNN',
            'fb': {'page_id': 18793419640, 'app_id': 80401312489},
            'keywords': 'winter storm,holiday travel,Thanksgiving storm,Thanksgiving winter storm',
            'article': {'publisher': 'https://www.facebook.com/cnninternational'},
#.........这里部分代码省略.........
开发者ID:Geekking,项目名称:newspaper,代码行数:103,代码来源:unit_tests.py


注:本文中的newspaper.Article.build方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。