本文整理匯總了Python中newspaper.Source.download_categories方法的典型用法代碼示例。如果您正苦於以下問題:Python Source.download_categories方法的具體用法?Python Source.download_categories怎麼用?Python Source.download_categories使用的例子?那麼, 這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類newspaper.Source的用法示例。
在下文中一共展示了Source.download_categories方法的2個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: main
# 需要導入模塊: from newspaper import Source [as 別名]
# 或者: from newspaper.Source import download_categories [as 別名]
def main():
    """Collect Huffington Post WorldPost articles and hand them to storage.

    Builds a newspaper ``Source`` for the WorldPost section, restricts it to
    a single category pointing at that section, and generates its articles.
    Every article whose URL matches ``.*html.*world.*`` is downloaded,
    parsed and run through ``nlp()``; its posting time is read from the
    ``<time datetime="...">`` tag in the page HTML and the result is passed
    to ``newspaper_article``.

    Articles that cannot be parsed, or whose posting time is missing or
    malformed, are reported on stdout and skipped instead of aborting the
    whole run.
    """
    source = "The Huffington Post"
    section_url = "http://huffingtonpost.com/theworldpost"

    hpost = Source(section_url, memoize_articles=False)
    hpost.download()
    hpost.parse()
    hpost.set_categories()
    if not hpost.categories:
        print("Error: No categories found for " + section_url)
        return
    # Keep only the first detected category and force it onto the section
    # URL so that only this section is crawled.
    hpost.categories = [hpost.categories[0]]
    hpost.categories[0].url = section_url
    hpost.download_categories()
    hpost.parse_categories()
    hpost.set_feeds()
    hpost.download_feeds()
    hpost.generate_articles()

    # Compile once; both patterns are applied to every article.
    url_pattern = re.compile(".*html.*world.*")
    posted_pattern = re.compile(r'Posted.*<time datetime="(.*?)">')

    for candidate in hpost.articles:
        url = candidate.url
        if url_pattern.match(url) is None:
            continue

        a = Article(url, language='en')
        a.download()
        # Retry the download up to 10 more times until it is flagged done.
        for _ in range(10):
            if a.is_downloaded:
                break
            a.download()
        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        keywords = a.keywords
        text = a.text
        title = a.title
        # A title containing the outlet name is stored as "no headline".
        if source in title:
            title = None

        found = posted_pattern.search(a.html)
        if found is None:
            print("Error: No posting time found for " + url)
            continue
        try:
            # The captured value is split into "<date>T<time>"; only the
            # hours and minutes of the time part are kept, and anything
            # after a "-" (e.g. a UTC offset) is dropped.
            date_part, time_part = found.group(1).split("T")
            hours_minutes = ":".join(
                time_part.split("-")[0].split(":")[0:2])
            date_obj = datetime.datetime.strptime(
                date_part + " " + hours_minutes, '%Y-%m-%d %H:%M')
        except ValueError:
            print("Error: Could not parse posting time for " + url)
            continue

        try:
            record = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article('Huffington Post', record, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)
示例2: main
# 需要導入模塊: from newspaper import Source [as 別名]
# 或者: from newspaper.Source import download_categories [as 別名]
def main():
    """Collect Washington Post world-section articles and hand them to storage.

    Builds a newspaper ``Source`` for the world section, restricts it to a
    single category pointing at that section, and generates its articles.
    Every article whose URL matches ``.*com/world/.*`` and does not match
    ``.*gallery.html`` is downloaded, parsed and run through ``nlp()``.
    The date comes from ``publish_date`` and the time of day from the
    ``pb-timestamp`` span in the page HTML; the result is passed to
    ``newspaper_article``.

    Articles that cannot be parsed, or whose date or time is missing or
    malformed, are reported on stdout and skipped instead of aborting the
    whole run.
    """
    source = "The Washington Post"
    # Time of day used when the page timestamp carries no clock time.
    delivery_time = "6:00"
    section_url = "http://washingtonpost.com/world"

    wpost = Source(section_url, memoize_articles=False)
    wpost.download()
    wpost.parse()
    wpost.set_categories()
    if not wpost.categories:
        print("Error: No categories found for " + section_url)
        return
    # Keep only the first detected category and force it onto the section
    # URL so that only this section is crawled.
    wpost.categories = [wpost.categories[0]]
    wpost.categories[0].url = section_url
    wpost.download_categories()
    wpost.parse_categories()
    wpost.set_feeds()
    wpost.download_feeds()
    wpost.generate_articles()

    # Compile once; all patterns are applied to every article.
    world_pattern = re.compile(".*com/world/.*")
    gallery_pattern = re.compile(".*gallery.html")
    stamp_pattern = re.compile(r'<span class="pb-timestamp">(.*?)</span>')

    for candidate in wpost.articles:
        url = candidate.url
        if world_pattern.match(url) is None:
            continue
        if gallery_pattern.match(url) is not None:
            continue

        a = Article(url, language='en')
        a.download()
        # Retry the download up to 10 more times until it is flagged done.
        for _ in range(10):
            if a.is_downloaded:
                break
            a.download()
        try:
            a.parse()
            a.nlp()
        except Exception:
            print("Error: Not parsed/downloaded correctly.")
            continue

        keywords = a.keywords
        text = a.text
        title = a.title
        # A title containing the outlet name is stored as "no headline".
        if source in title:
            title = None

        if a.publish_date is None:
            print("Error: No publish date found for %s" % url)
            continue
        found = stamp_pattern.search(a.html)
        if found is None:
            print("Error: No timestamp found for %s" % url)
            continue

        # First whitespace-separated token of the publish date, parsed
        # below as YYYY-MM-DD.
        date_part = str(a.publish_date).split()[0]
        stamp = found.group(1)
        try:
            if ":" not in stamp:
                hours_minutes = delivery_time
            else:
                # The clock time is whatever follows " at ", in 12-hour
                # form; convert it to 24-hour HH:MM.
                clock = stamp.split(" at ")[1]
                hours_minutes = datetime.datetime.strptime(
                    clock, '%I:%M %p').strftime('%H:%M')
            date_obj = datetime.datetime.strptime(
                date_part + " " + hours_minutes, '%Y-%m-%d %H:%M')
        except (IndexError, ValueError):
            print("Error: Could not parse date/time for %s" % url)
            continue

        try:
            record = {
                'headline': title,
                'url': url,
                'text': text,
                'date': date_obj
            }
            newspaper_article(source, record, keywords=keywords)
        except Exception as ex:
            print('Article could not be created due to following error')
            print(ex)