本文整理汇总了Python中newspaper.Source类的典型用法代码示例。如果您正苦于以下问题:Python Source类的具体用法?Python Source怎么用?Python Source使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Source类的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: main
def main():
    """Scrape The Guardian's world section and store each article.

    Builds a ``Source`` for the world section, then for every article whose
    URL matches ``.*/world/.*`` downloads, parses and runs NLP on it, derives
    a publication datetime from ``publish_date`` plus the dateline time found
    in the page HTML, and passes the result to ``newspaper_article``.

    Articles that fail to parse, have no publish date, or have no usable
    dateline time are reported and skipped rather than aborting the run.
    """
    source = "The Guardian"
    guardian = Source("http://www.theguardian.com/world", memoize_articles=False)
    guardian.build()

    for candidate in guardian.articles:
        url = candidate.url
        if re.match(".*/world/.*", url) is None:
            continue

        a = Article(url, language='en')
        a.download()
        # Retry the download up to ten more times until it reports success.
        for _ in range(10):
            if a.is_downloaded:
                break
            a.download()

        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        keywords = a.keywords
        title = a.title
        text = a.text

        if a.publish_date is None:
            print("Error: No publish date for %s" % url)
            continue

        dateline = re.search(r'<span class="content__dateline-time">(.*)</span>', html)
        if dateline is None:
            print("Error: No dateline time for %s" % url)
            continue

        # The dateline is written like "14.32 GMT"; keep only the clock part
        # and normalise it to "14:32" so it fits the %H:%M format below.
        tokens = dateline.group(1).replace(".", ":").split()
        clock = tokens[0] if tokens else ""

        try:
            # str(publish_date) starts with "YYYY-MM-DD"; reorder to MM/DD/YYYY.
            year, month, day = str(a.publish_date).split()[0].split("-")
            date_time = "/".join((month, day, year)) + " " + clock
            date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M')
        except ValueError:
            print("Error: Could not build a datetime for %s" % url)
            continue

        try:
            record = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article(source, record, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
示例2: test_source_build
def test_source_build(self):
    """Build a CNN ``Source`` and check its brand and description.

    After the assertions, the article count and the category URLs are
    printed so they can be inspected by eye.
    """
    expected_description = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
    expected_brand = "cnn"

    settings = Configuration()
    settings.verbose = False

    cnn = Source("http://cnn.com", config=settings)
    cnn.clean_memo_cache()
    cnn.build()

    assert cnn.brand == expected_brand
    assert cnn.description == expected_description

    # Some of the remaining values change constantly, so exact equality
    # cannot be asserted for them. They are printed instead, which makes it
    # easy to glance at the output and see whether it looks OK.
    print("\t\tWe have %d articles currently!" % cnn.size())
    # print("") emits a single blank line on both Python 2 and Python 3.
    print("")
    print("\t\t%s categories are: %s" % (cnn.url, str(cnn.category_urls())))
示例3: test_source_build
def test_source_build(self):
    """Build a CNN ``Source`` with memoization disabled.

    The expected description, category URLs, feeds and brand are defined
    below but are not asserted anywhere in the visible code.
    NOTE(review): the assertions may have been cut off from this snippet;
    confirm against the full test.
    """
    expected_description = (
        'CNN.com International delivers breaking news from across '
        'the globe and information on the latest top stories, '
        'business, sports and entertainment headlines. Follow the '
        'news as it happens through: special reports, videos, '
        'audio, photo galleries plus interactive maps and timelines.'
    )
    expected_category_urls = [
        'http://cnn.com/ASIA',
        'http://connecttheworld.blogs.cnn.com',
        'http://cnn.com/HLN',
        'http://cnn.com/MIDDLEEAST',
        'http://cnn.com',
        'http://ireport.cnn.com',
        'http://cnn.com/video',
        'http://transcripts.cnn.com',
        'http://cnn.com/espanol',
        'http://partners.cnn.com',
        'http://www.cnn.com',
        'http://cnn.com/US',
        'http://cnn.com/EUROPE',
        'http://cnn.com/TRAVEL',
        'http://cnn.com/cnni',
        'http://cnn.com/SPORT',
        'http://cnn.com/mostpopular',
        'http://arabic.cnn.com',
        'http://cnn.com/WORLD',
        'http://cnn.com/LATINAMERICA',
        'http://us.cnn.com',
        'http://travel.cnn.com',
        'http://mexico.cnn.com',
        'http://cnn.com/SHOWBIZ',
        'http://edition.cnn.com',
        'http://amanpour.blogs.cnn.com',
        'http://money.cnn.com',
        'http://cnn.com/tools/index.html',
        'http://cnnespanol.cnn.com',
        'http://cnn.com/CNNI',
        'http://business.blogs.cnn.com',
        'http://cnn.com/AFRICA',
        'http://cnn.com/TECH',
        'http://cnn.com/BUSINESS',
    ]
    expected_feeds = ['http://rss.cnn.com/rss/edition.rss']
    expected_brand = 'cnn'

    cnn = Source('http://cnn.com', verbose=False, memoize_articles=False)
    # A mocked response for the CNN main page is not installed here, so the
    # build below is not backed by a fixture in this version of the test.
    cnn.clean_memo_cache()
    cnn.build()
示例4: test_source_build
def test_source_build(self):
    """Build a CNN ``Source`` against a mocked response and validate it.

    ``mock_response_with`` is given a pattern matching any ``cnn.com`` URL
    and the ``cnn_main_site`` fixture name. The source is then built and its
    brand, description, article count, category URLs and feed URLs are
    compared with the expected values below.
    """
    DESC = ('CNN.com International delivers breaking news from across '
            'the globe and information on the latest top stories, '
            'business, sports and entertainment headlines. Follow the '
            'news as it happens through: special reports, videos, '
            'audio, photo galleries plus interactive maps and timelines.')
    CATEGORY_URLS = [
        u'http://cnn.com/ASIA', u'http://connecttheworld.blogs.cnn.com',
        u'http://cnn.com/HLN', u'http://cnn.com/MIDDLEEAST',
        u'http://cnn.com', u'http://ireport.cnn.com',
        u'http://cnn.com/video', u'http://transcripts.cnn.com',
        u'http://cnn.com/espanol',
        u'http://partners.cnn.com', u'http://www.cnn.com',
        u'http://cnn.com/US', u'http://cnn.com/EUROPE',
        u'http://cnn.com/TRAVEL', u'http://cnn.com/cnni',
        u'http://cnn.com/SPORT', u'http://cnn.com/mostpopular',
        u'http://arabic.cnn.com', u'http://cnn.com/WORLD',
        u'http://cnn.com/LATINAMERICA', u'http://us.cnn.com',
        u'http://travel.cnn.com', u'http://mexico.cnn.com',
        u'http://cnn.com/SHOWBIZ', u'http://edition.cnn.com',
        u'http://amanpour.blogs.cnn.com', u'http://money.cnn.com',
        u'http://cnn.com/tools/index.html', u'http://cnnespanol.cnn.com',
        u'http://cnn.com/CNNI', u'http://business.blogs.cnn.com',
        u'http://cnn.com/AFRICA', u'http://cnn.com/TECH',
        u'http://cnn.com/BUSINESS']
    FEEDS = [u'http://rss.cnn.com/rss/edition.rss']
    BRAND = 'cnn'

    s = Source('http://cnn.com', verbose=False, memoize_articles=False)
    # Raw string: "\." in a normal string literal is an invalid escape
    # sequence. The compiled pattern is unchanged.
    url_re = re.compile(r".*cnn\.com")
    mock_response_with(url_re, 'cnn_main_site')
    s.clean_memo_cache()
    s.build()

    assert s.brand == BRAND
    assert s.description == DESC
    assert s.size() == 241
    assert s.category_urls() == CATEGORY_URLS
    # TODO: A lot of the feed extraction is NOT being tested because feeds
    # are primarily extracted from the HTML of category URLs. We lose this
    # effect by just mocking CNN's main page HTML. Warning: tedious fix.
    assert s.feed_urls() == FEEDS
示例5: test_source_build
def test_source_build(self):
    """Build a CNN ``Source`` and check its brand and description.

    The current article count is printed afterwards for manual inspection.
    """
    expected_description = """CNN.com delivers the latest breaking news and information on the latest top stories, weather, business, entertainment, politics, and more. For in-depth coverage, CNN.com provides special reports, video, audio, photo galleries, and interactive guides."""
    expected_brand = 'cnn'

    settings = Configuration()
    settings.verbose = False

    cnn = Source('http://cnn.com', configs=settings)
    cnn.clean_memo_cache()
    cnn.build()

    assert cnn.brand == expected_brand
    assert cnn.description == expected_description

    article_total = cnn.size()
    print('\t\tWe have %d articles currently!' % article_total)
示例6: test_cache_categories
def test_cache_categories(self):
    """Check that recomputing categories yields the same category URLs.

    Categories are set on a mocked Yahoo UK source, cleared, and set again.
    The two sets of category URLs must match regardless of order.
    """
    site = 'http://uk.yahoo.com'
    mock_response_with(site, 'yahoo_main_site')

    source = Source(site)
    source.download()
    source.parse()

    # First pass.
    source.set_categories()
    first_pass = source.category_urls()

    # Second pass after wiping the categories.
    source.categories = []
    source.set_categories()
    second_pass = source.category_urls()

    assert sorted(second_pass) == sorted(first_pass)
示例7: test_cache_categories
def test_cache_categories(self):
    """
    Sets categories twice on one Yahoo source and checks that both passes
    report the same category URLs, ignoring order.
    """
    yahoo = Source("http://yahoo.com")
    yahoo.download()
    yahoo.parse()

    yahoo.set_categories()
    urls_before_reset = yahoo.category_urls()

    # Reset and try again with caching.
    yahoo.categories = []
    yahoo.set_categories()
    urls_after_reset = yahoo.category_urls()

    assert sorted(urls_after_reset) == sorted(urls_before_reset)
示例8: test_cache_categories
def test_cache_categories(self):
    """Check that category URLs are stable across two ``set_categories`` calls.

    The categories of a Yahoo UK source are computed, cleared and computed
    again; both results must contain the same URLs (order is ignored).
    """
    site = 'http://uk.yahoo.com'
    # NOTE(review): the loaded fixture is never passed to the source below;
    # confirm whether download() is expected to use it.
    html = mock_resource_with('yahoo_main_site', 'html')

    source = Source(site)
    source.download()
    source.parse()

    source.set_categories()
    expected_urls = source.category_urls()

    source.categories = []
    source.set_categories()
    actual_urls = source.category_urls()

    self.assertCountEqual(expected_urls, actual_urls)
示例9: main
def main():
    """Scrape The Huffington Post's WorldPost page and store each article.

    Steps through the ``Source`` pipeline by hand, keeping only the first
    category and pointing it at the WorldPost URL, then generates articles.
    Each article whose URL matches ``.*html.*world.*`` is downloaded, parsed
    and run through NLP. Its publication datetime is read from the
    ``<time datetime="...">`` tag that follows "Posted" in the page HTML,
    and the result is passed to ``newspaper_article``.

    Articles that fail to parse, or whose timestamp is missing or malformed,
    are reported and skipped rather than aborting the run.
    """
    source = "The Huffington Post"
    hpost = Source("http://huffingtonpost.com/theworldpost", memoize_articles=False)
    hpost.download()
    hpost.parse()
    hpost.set_categories()
    # Keep only the first category and point it at the WorldPost page.
    hpost.categories = [hpost.categories[0]]
    hpost.categories[0].url = "http://huffingtonpost.com/theworldpost"
    hpost.download_categories()
    hpost.parse_categories()
    hpost.set_feeds()
    hpost.download_feeds()
    hpost.generate_articles()

    for candidate in hpost.articles:
        url = candidate.url
        if re.match(".*html.*world.*", url) is None:
            continue

        a = Article(url, language='en')
        a.download()
        # Retry the download up to ten more times until it reports success.
        for _ in range(10):
            if a.is_downloaded:
                break
            a.download()

        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        keywords = a.keywords
        title = a.title
        text = a.text

        # A title that contains the publication name is stored as None.
        if source in title:
            title = None

        findtime = re.search(r'Posted.*<time datetime="(.*?)">', html)
        if findtime is None:
            print("Error: No timestamp for %s" % url)
            continue

        try:
            # The attribute looks like "YYYY-MM-DDTHH:MM:SS-05:00".
            date_part, time_part = findtime.group(1).split("T")
            year, month, day = date_part.split("-")
            # Drop a trailing "-HH:MM" offset and the seconds, keeping HH:MM.
            clock = ":".join(time_part.split("-")[0].split(":")[0:2])
            date_time = "/".join((month, day, year)) + " " + clock
            date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M')
        except ValueError:
            print("Error: Could not build a datetime for %s" % url)
            continue

        try:
            record = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article('Huffington Post', record, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
示例10: main
def main():
    """Scrape The Washington Post's world section and store each article.

    Steps through the ``Source`` pipeline by hand, keeping only the first
    category and pointing it at the world-section URL, then generates
    articles. Each article whose URL matches ``.*com/world/.*`` and is not a
    ``gallery.html`` page is downloaded, parsed and run through NLP. Its
    publication datetime combines ``publish_date`` with the clock time found
    in the ``pb-timestamp`` span, falling back to ``delivery_time`` when the
    span has no clock time. The result is passed to ``newspaper_article``.

    Articles that fail to parse, have no publish date, or have a missing or
    malformed timestamp are reported and skipped rather than aborting the run.
    """
    source = "The Washington Post"
    delivery_time = "6:00"
    wpost = Source("http://washingtonpost.com/world", memoize_articles=False)
    wpost.download()
    wpost.parse()
    wpost.set_categories()
    # Keep only the first category and point it at the world section.
    wpost.categories = [wpost.categories[0]]
    wpost.categories[0].url = "http://washingtonpost.com/world"
    wpost.download_categories()
    wpost.parse_categories()
    wpost.set_feeds()
    wpost.download_feeds()
    wpost.generate_articles()

    for candidate in wpost.articles:
        url = candidate.url
        if re.match(".*com/world/.*", url) is None:
            continue
        if re.match(".*gallery.html", url) is not None:
            continue

        a = Article(url, language='en')
        a.download()
        # Retry the download up to ten more times until it reports success.
        for _ in range(10):
            if a.is_downloaded:
                break
            a.download()

        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        html = a.html
        keywords = a.keywords
        title = a.title
        text = a.text

        # A title that contains the publication name is stored as None.
        if source in title:
            title = None

        if a.publish_date is None:
            print("Error: No publish date for %s" % url)
            continue

        stamp = re.search(r'<span class="pb-timestamp">(.*?)</span>', html)
        if stamp is None:
            print(url)
            continue
        stamp_text = stamp.group(1)

        try:
            # str(publish_date) starts with "YYYY-MM-DD"; reorder to MM/DD/YYYY.
            year, month, day = str(a.publish_date).split()[0].split("-")
            if ":" not in stamp_text:
                # No clock time in the span: use the default delivery time.
                clock = delivery_time
            else:
                # Take the text after " at " and convert 12-hour to 24-hour.
                twelve_hour = stamp_text.split(" at ")[1]
                clock = datetime.datetime.strptime(twelve_hour, '%I:%M %p').strftime('%H:%M')
            date_time = "/".join((month, day, year)) + " " + clock
            date_obj = datetime.datetime.strptime(date_time, '%m/%d/%Y %H:%M')
        except (IndexError, ValueError):
            print("Error: Could not build a datetime for %s" % url)
            continue

        try:
            record = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article(source, record, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)