This article collects typical usage examples of the newspaper.build function in Python. If you are wondering exactly how to use build, how to call it, or what real-world usage looks like, the hand-picked code examples below should help.
A total of 15 code examples of the build function are shown below, sorted by popularity by default.
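Before the collected examples, here is a minimal sketch of the typical newspaper.build workflow; the site URL and the slice of five articles are only illustrative placeholders:

import newspaper

# Build a Source object by scanning the site's category and feed pages.
# memoize_articles=False forces a full re-scan instead of skipping URLs seen before.
paper = newspaper.build('http://cnn.com', memoize_articles=False, language='en')

print(paper.size())               # number of article URLs discovered
for article in paper.articles[:5]:
    article.download()            # fetch the raw HTML
    article.parse()               # extract title, text, authors, publish date
    print(article.title, article.url)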
Example 1: get_articles
def get_articles():
    # get Chinese articles from domain
    for url in open("list_ch.txt", 'r'):
        try:
            paper = newspaper.build(url, memoize_articles=True, language='zh')
            match_object = re.search('http\:\/\/([^\/]+)\/', url)
            domain = match_object.group(1)
            for article in paper.articles:
                get_meta(article, domain)
        except:
            pass
    # get English articles from domain
    for url in open("list_en.txt", 'r'):
        try:
            paper = newspaper.build(url, memoize_articles=True, language='en')
            match_object = re.search('http\:\/\/([^\/]+)\/', url)
            domain = match_object.group(1)
            for article in paper.articles:
                get_meta(article, domain)
        except:
            pass
    # get articles from RSS
    for url in open("list_rss_ch.txt", 'r'):
        try:
            feed = feedparser.parse(url)
            match_object = re.search('http\:\/\/([^\/]+)\/', url)
            domain = match_object.group(1)
            chinese = True
            for post in feed.entries:
                link = post.link
                get_meta_rss(link, domain, chinese)
        except:
            pass
    for url in open("list_rss_en.txt", 'r'):
        try:
            feed = feedparser.parse(url)
            match_object = re.search('http\:\/\/([^\/]+)\/', url)
            domain = match_object.group(1)
            chinese = False
            for post in feed.entries:
                link = post.link
                get_meta_rss(link, domain, chinese)
        except:
            pass
    print "success!"
    return
Example 2: CheckForMoreArticles
def CheckForMoreArticles():
    print 'Checking for more articles from CNN'
    cnn = newspaper.build(u'http://us.cnn.com/')
    print 'Found ' + str(cnn.size()) + ' new articles from CNN'
    print 'Checking for more articles from SMH'
    smh = newspaper.build(u'http://smh.com.au/')
    print 'Found ' + str(smh.size()) + ' new articles from SMH'
    print 'Checking for more articles from Slashdot'
    slashdot = newspaper.build(u'http://slashdot.org/')
    print 'Found ' + str(slashdot.size()) + ' new articles from SlashDot'
    print 'Checking for more articles from BBC'
    bbc = newspaper.build(u'http://www.bbc.com/')
    print 'Found ' + str(bbc.size()) + ' new articles from BBC'
    return cnn.articles + smh.articles + slashdot.articles + bbc.articles
Example 3: get_article_urls
def get_article_urls(self, rclient, source_url):
    paper = newspaper.build(
        source_url, memoize_articles=False, fetch_images=False,
        request_timeout=self.timeout, number_threads=self.threads,
        language=self.language, browser_user_agent=self.user_agent)
    urls = ((a.url, a.title) for a in paper.articles[:self.max_articles])
    return ifilterfalse(lambda x: rclient.exists(x[0]), urls)
Example 4: populate_sites
def populate_sites(sites):
    """ (list of str) -> list of [str, newspaper.source.Source]
    Parses through the sites using newspaper library and
    returns list of sites with available articles populated

    Keyword arguments:
    sites -- List of [name, url] of each site
    """
    new_sites = []
    for s in range(len(sites)):
        # Check for any new command on communication stream
        check_command()
        # Duplicate the name of the sites
        new_sites.append([sites[s][0]])
        # Use the url and populate the site with articles
        new_sites[s].append(
            (
                newspaper.build(
                    sites[s][1],
                    memoize_articles=False,
                    keep_article_html=True,
                    fetch_images=False,
                    language="en",
                    number_threads=1,
                )
            )
        )
        # Append site url
        new_sites[s].append(sites[s][1])
    return new_sites
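For reference, a plausible way to call populate_sites; the site list below is made up, and check_command is assumed to be provided by the surrounding module:

sites = [
    ["CNN", "http://cnn.com"],
    ["BBC", "http://www.bbc.com"],
]
populated = populate_sites(sites)
# each entry is now [name, newspaper.source.Source, url]
for name, source, url in populated:
    print(name, url, source.size())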
Example 5: readArticleCollectionFile
def readArticleCollectionFile(site, filename, c):
    f = open(filename, 'w')
    paper = newspaper.build(site, memoize_articles=False)
    print len(paper.articles)
    i = 0
    for article in paper.articles:
        article.download()
        article.parse()
        title = article.title.encode('ascii', 'ignore')
        text = article.text.encode('ascii', 'ignore')
        #article.nlp()
        #keywords = article.keywords
        #summary = article.summary.encode('ascii', 'ignore')
        f.write('<article>\n')
        f.write("<class>" + str(c) + "</class>\n")
        f.write('<title>' + title + '</title>\n')
        f.write('<text>\n' + text + '</text>\n')
        #f.write('<keywords>' + str(keywords) + '</keywords>\n')
        #f.write('<summary>' + summary + '</summary>\n')
        f.write("</article>\n")
        i = i + 1
        if i > 40:
            break
    f.close()
Example 6: fetch_article_url
def fetch_article_url(self, memoize=False):
    paper = newspaper.build(self.url, memoize_articles=memoize) or []
    self.narticles = paper.size()
    print 'article count:%s' % self.narticles
    pipe = redis.pipeline()
    date_fmt = r'\d{4}[-/]\d{2}[-/]\d{2}'
    for article in paper.articles:
        url = article.url
        print url
        date_keys = re.findall(date_fmt, url)
        if not date_keys:
            continue
        date_key = date_keys[0]
        key = self.key(date_key)
        pipe.sadd(key, url)
        if self.save and date_key in self.get_valid_days():
            print 'processing....'
            try:
                article.download()
                article.parse()
                key = self.key(date_key, article.title)
                pipe.set(key, article.text)
            except:
                pass
    pipe.execute()
Example 7: _get_articles
def _get_articles(url):
    url = url.strip()
    for file in os.listdir(newspaper.settings.ANCHOR_DIRECTORY):  # clearing newspaper categories cache
        os.unlink(os.path.join(newspaper.settings.ANCHOR_DIRECTORY, file))
    articles = newspaper.build(url).articles
    if url.split('.')[1] == 'jetbrains':  # at least for now. Newspaper is a bit buggy on JetBrains site
        articles = []
        for page in range(10):
            soup = BeautifulSoup(requests.get(url + '/page/' + str(page)).content, 'html.parser')
            for title in soup.find_all('h2', {'class': 'entry-title'}):
                articles.append(NewspaperArticle(title.find('a').get('href')))
    for article in articles:
        article.download()
        if not article.is_downloaded:
            print("Failed to download article:", article.url)
            continue
        article.parse()
        article.nlp()
        publish_date = article.publish_date
        if publish_date is None and url.split('.')[1] == 'jetbrains':
            soup = BeautifulSoup(requests.get(article.url).content, 'html.parser')
            publish_date = soup.find('span', {'class': 'entry-date'}).getText()
            # actually, newspaper is very buggy on JetBrains blog and often cannot parse publish date
        print(publish_date)
        yield DataMiningArticle(article.html, article.title, article.summary, article.text,
                                "", article.canonical_link, "", publish_date)
Example 8: validate_site
def validate_site(site):
    try:
        s = newspaper.build(site, memoize_articles=False, keep_article_html=True, fetch_images=False, language="en")
        if s.size() == 0:
            raise ValidationError("%s is not a valid Referring Site!" % site)
    except:
        raise ValidationError("%s is not a valid Referring Site!" % site)
Example 9: Calculate
def Calculate():
    try:
        news = request.form['inputNews'].lower()
        topic = request.form['inputTopic']
        category = request.form['inputCategory']
        print news + "\t" + topic + "\t" + category
        from havenondemand.hodindex import HODClient
        client = HODClient(apikey='6b1f8438-56c7-45e0-98a6-6742c1be0d65', apiversiondefault=1)

        """def get_bias(url):
            print "Hello"
            data = {'url': url}
            r = client.post('analyzesentiment', data)
            sentiment = r.json()['aggregate']['sentiment']
            score = r.json()['aggregate']['score']
            print url + " | " + sentiment + " | " + str(score)
            return score"""

        paper = newspaper.build("http://" + news + ".com", language='en', memoize_articles=False)
        url = []
        for article in paper.articles:
            url.append(article.url)
        cumulative_score = 0.0
        countNegative = 0
        countPositive = 0
        countNeutral = 0

        """import multiprocessing as mp
        p = mp.Pool(3)
        res = p.map(get_bias, url)"""

        print newspaper.category
        for u in url:
            data = {'url': u}
            r = client.post('analyzesentiment', data)
            sentiment = r.json()['aggregate']['sentiment']
            score = r.json()['aggregate']['score']
            print u + " | " + sentiment + " | " + str(score)
            cumulative_score += score
            if sentiment == 'positive':
                countPositive += 1
            elif sentiment == 'negative':
                countNegative += 1
            elif sentiment == 'neutral':
                countNeutral += 1
        print cumulative_score
        print cumulative_score / len(url)
    except Exception as e:
        return json.dumps({'error': str(e)})
    return news + topic + category
Example 10: discover_feeds_urls
def discover_feeds_urls(feed_url):
    """ Try to discover more feed URLs in one. """

    LOGGER.info(u'Trying to discover new RSS/Atom feeds from %s…', feed_url)

    try:
        site = newspaper.build(feed_url)
        urls_to_try = set(site.feed_urls())
    except:
        LOGGER.exception(u'Newspaper did not help finding feeds '
                         u'from “%s”', feed_url)
        return

    created = []
    known = []

    for url in urls_to_try:
        result = create_feeds_from_url(url, recurse=False)
        if result:
            # keep feeds if they have been created
            created.extend(x[0] for x in result if x[1])
            known.extend(x[0] for x in result if not x[1])

    LOGGER.info(u'Done discovering %s: %s feeds created, %s already known.',
                feed_url, len(created), len(known))
Example 11: get_news_data
def get_news_data():
    # Get list of settings
    urllist: SettingsList = get_safe_settingslist('CryptoNewsUrls', urls)
    keylist: SettingsList = get_safe_settingslist('CrytoNewsKeywords', keywords)
    logger_name = 'main_scraper.' + "bitcoin_news"
    logger = logging.getLogger(logger_name)
    for url in urllist.list:
        paper = newspaper.build(url, language='en')
        for article in paper.articles:
            try:
                article.download()
                article.parse()
                keys = [key for key in keylist.list if key in article.title.lower()]
                if len(keys) > 0:
                    # check if article already exists
                    obj = CryptoNews.objects(title=article.title).first()
                    if obj is None:
                        news = CryptoNews()
                        news.title = article.title
                        news.description = article.meta_description
                        news.text = article.text
                        news.tags = keys
                        news.url = article.url
                        news.save()
                        logger.info(article.title)
            except BaseException as e:
                logger.error('Cryptonews error{0}'.format(e))
                pass
Example 12: build_newspaper
def build_newspaper(self):
    '''
    Build a newspaper Source for each site URL using the newspaper library.
    '''
    for site_url in self.site_urls:
        self.built_newspapers.append(newspaper.build(site_url,
                                                     memoize_articles=False))
Example 13: makeDocs
def makeDocs():
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs=True)
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print "Created"

    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()

    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
Example 14: getArticlesFromSource
def getArticlesFromSource(self, source, external=False):
    paper = newspaper.build(source, memoize_articles=True, browser_user_agent='BEEVA/Emojinews crawler 1.0')
    # Filtering articles out of domain
    news = filter((lambda x: x.url.startswith(source) or external), paper.articles)
    news = map(self.cleanNames, news)
    news = filter((lambda x: x.title), news)
    news = map(lambda x: {'url': x.url, 'title': x.title}, news)
    return news
Example 15: main
def main():
    source_url, rabbit_url = parse_config()
    paper = newspaper.build(source_url)

    publisher = Publisher(
        rabbit_url=rabbit_url,
        publish_interval=0.25,
        article_urls=paper.article_urls())

    publisher.run()