This article collects typical usage examples of the set_html method of Python's newspaper.Article. If you are wondering what exactly Article.set_html does, how to call it, or what it looks like in real code, the hand-picked examples below may help. You can also read further about the enclosing class, newspaper.Article.
The section below presents 12 code examples of Article.set_html, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
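Before the project-specific examples, here is a minimal sketch (not taken from any of the projects below) of the pattern they all share: construct an Article without downloading, inject the HTML with set_html(), then call parse(). The HTML string here is a placeholder for illustration.

from newspaper import Article

# Placeholder input: any HTML string you already have in memory.
html = "<html><body><h1>Headline</h1><p>Some body text.</p></body></html>"

article = Article(url='')   # no URL needed when the HTML is supplied directly
article.set_html(html)      # skip download(); use the pre-fetched HTML
article.parse()             # populates title, text, authors, etc.
print(article.title)
print(article.text)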
Example 1: extract
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def extract(self, item):
    """Creates an instance of Article without downloading and returns an ArticleCandidate with the results of
    parsing the HTML code.

    :param item: A NewscrawlerItem to parse.
    :return: ArticleCandidate containing the recovered article data.
    """
    article_candidate = ArticleCandidate()
    article_candidate.extractor = self._name()

    article = Article('')
    article.set_html(item['spider_response'].body)
    article.parse()
    article_candidate.title = article.title
    article_candidate.description = article.meta_description
    article_candidate.text = article.text
    article_candidate.topimage = article.top_image
    article_candidate.author = article.authors
    if article.publish_date is not None:
        try:
            article_candidate.publish_date = article.publish_date.strftime('%Y-%m-%d %H:%M:%S')
        except ValueError as exception:
            self.log.debug('%s: Newspaper failed to extract the date in the supported format. '
                           'Publishing date set to None' % item['url'])
    article_candidate.language = article.meta_lang

    return article_candidate
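The extract method above expects an item whose 'spider_response' carries the downloaded body. A rough, hypothetical illustration of that input shape follows; the FakeResponse class, the extractor class name, and the URL are stand-ins for this sketch, not code from the project.

# Hypothetical stand-ins to illustrate the expected input shape.
class FakeResponse:
    def __init__(self, body):
        self.body = body

item = {
    'url': 'https://example.com/article',   # placeholder URL
    'spider_response': FakeResponse('<html><body><p>Body text.</p></body></html>'),
}
candidate = SomeNewspaperExtractor().extract(item)   # assumed extractor class name
print(candidate.title, candidate.text)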
Example 2: main
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def main(argv):
    if len(argv) > 1:
        htmlist = argv[1]
    else:
        htmlist = 'htmlist'

    # Our permanent config for html cleaning
    config = Config()
    config.language = 'id'
    config.MIN_SENT_COUNT = 20
    config.memoize = False
    config.fetch_images = False
    config.verbose = True

    cleaner = Article(url='', config=config)

    with open(htmlist, 'r') as f:
        htmfile = f.read().split('\n')

    raw = []

    for htm in htmfile:
        print(htm)
        if not htm.endswith("rss.html"):
            with open(htm, 'r') as f:
                h = f.read()

            cleaner.set_html(h)
            cleaner.parse()
            sentences = nlp.split_sentences(cleaner.text)
            #raw.append(sentences])

            with open('htm-out', 'a') as f:
                [f.write(r + '\n') for r in sentences]
Example 3: extract_with_newspaper
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def extract_with_newspaper(self, html):
    '''Parses HTML using Newspaper.'''
    article = Article(self.url)
    article.set_html(html)
    filterwarnings('ignore', category=DeprecationWarning)
    with catch_warnings():
        article.parse()
    return article.__dict__
Example 4: parse_article_page
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def parse_article_page(response):
    article = Article(url=response.request.url)
    article.set_html(response.text)
    article.parse()
    if article.title and article.text:
        item = NewsArticle()
        item['title'] = article.title
        item['text'] = article.text
        yield item
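Example 4 is written as a Scrapy callback. Below is a hedged sketch of how such a callback might be wired into a spider; the spider class, its name, and the start URL are illustrative assumptions, and NewsArticle is assumed to be a scrapy.Item with 'title' and 'text' fields defined elsewhere in the project.

import scrapy

class ExampleNewsSpider(scrapy.Spider):
    # Hypothetical spider; name and start_urls are placeholders.
    name = 'example_news'
    start_urls = ['https://example.com/some-article']

    def parse(self, response):
        # Delegate extraction to the parse_article_page callback above.
        yield from parse_article_page(response)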
Example 5: enrich
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
async def enrich(self, result):
    # none of the following lines will work if we couldn't make soup
    if not self.soup:
        return result

    sanitized = sanitize_html(self.response.body)
    if not sanitized:
        return result

    article = Article(self.url, config=FixedArticleConfig())
    article.config.fetch_images = False
    article.set_html(sanitized)
    article.parse()

    result.set('title', article.title, 2, 'textlength')
    if len(article.meta_description) > 0:
        result.set('subtitle', article.meta_description, 2, 'textlength')

    if len(article.article_html) > 0:
        sanitized = sanitize_html(article.article_html)
        result.set('content', sanitized, 0, 'textlength')
    elif article.top_node is not None:
        sanitized = sanitize_html(tostring(article.top_node))
        result.set('content', sanitized, 2)

    if article.authors:
        result.set('authors', article.authors, 2)
    if article.publish_date and len(str(article.publish_date)) > 0:
        result.set('published_at', article.publish_date, 2)

    result.add('keywords', list(article.keywords))
    result.add('keywords', list(article.tags))
    result.add('_candidate_images', list(article.imgs))
    # Primary image guess is actually pretty crappy
    if article.top_image:
        result.add('_candidate_images', [article.top_img])

    text = ""
    for paragraph in article.text.split("\n"):
        paragraph = paragraph.strip()
        # this is done to get rid of cases where a stray heading
        # like "Photographs" ends up as a paragraph
        if Summarizer.has_sentence(paragraph):
            text += " " + paragraph
    if len(text) > 0:
        result.set('_text', text, 2)
    return result
Example 6: _parse_article
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def _parse_article(self, key, url):
    a = Article('')
    html = Google().cache(url)
    a.set_html(html)
    a.parse()
    a.nlp()
    article = {"summary": a.summary,
               "publish_date": a.publish_date,
               "images": a.images,
               "top_image": a.top_image,
               "title": a.title,
               "authors": a.authors,
               "keywords": a.keywords,
               "text": a.text}
    # update
    #conn = r.connect(db="clearspark")
    conn = r.connect(**rethink_conn.conn())
Example 7: clean_source
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def clean_source(url, source):
    """Parse a pre-downloaded article using newspaper.

    Args:
        url (str): The url where the article was sourced (necessary for the
            newspaper API).
        source (str): Html source of the article page.

    Returns:
        Dictionary providing cleaned article and extracted content
        (see `construct_result`), or `None` if newspaper could not extract
        the article.
    """
    article = Article(url)
    article.set_html(source)
    article.parse()

    if article.top_node is None:
        return None

    return construct_result(article)
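A possible way to call clean_source, assuming the page has already been fetched; the requests download and the URL below are illustrative additions, not part of the original snippet.

import requests

url = 'https://example.com/article'              # placeholder URL
source = requests.get(url, timeout=10).text      # pre-downloaded HTML
result = clean_source(url, source)
if result is None:
    print('newspaper could not locate an article body')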
Example 8: parser_nlp
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def parser_nlp(fname, html):
    Ts = timeit.default_timer()
    raw_html = html

    # basic info
    fid = int(fname.split('_')[0].split('/')[1])
    pm = parse_machine()
    html = pm.fix_html(html)
    link_stats = pm.parse_links(html)
    link_factors = [t for t in list(set(" ".join(link_stats.keys()).lower().split())) if (len(t) > 3)]
    doc = db.articles(
        fid = fid,
        html = html,
        html_cnt = len(html),
        link_stats = link_stats,
        link_factors = link_factors,
        rand = random.random(),
        # extra
        lines = raw_html.count('\n'),
        spaces = raw_html.count(' '),
        tabs = raw_html.count('\t'),
        braces = raw_html.count('{'),
        brackets = raw_html.count('['),
        quesmarks = raw_html.count('?'),
        exclamarks = raw_html.count('!'),
        words = len(re.split('\s+', raw_html)),
    )

    # check empty
    if ((doc.html == None) | (len(doc.html.replace(r'\s', '')) < 10)):
        doc.empty = True
        return doc

    try:
    # if True:
        pd = Article('', fetch_images=False)
        pd.set_html(doc.html)
        pd.parse()
        pd.nlp()
    except Exception as e:
        print("-"*60)
        print("[parser_nlp %s]: %s" % (doc.fid, e))
        print(doc.html[:500])
        print("-"*60)
        return doc  # "%s: %s" % (e, doc.id)

    # select cleaned_text
    cleaned_text = " ".join(pd.text.lower().split())
    if (len(cleaned_text) < 140):
        soup = bs(doc.html)
        if soup.body:
            cleaned_text = soup.body.text
        if (len(cleaned_text) < 140):
            cleaned_text = soup.text
    cleaned_text = sanitize_txt(cleaned_text, lower=True)
    bow = nlp.nlp().txt2words(cleaned_text or '', False)

    # save results
    try:
        opengraph = pd.meta_data.get('og', {}) if pd.meta_data else {}
        top_image = opengraph.get('image') or (pd.top_image if pd.top_image else None)
        if isinstance(top_image, dict): top_image = top_image.get('identifier')
        if isinstance(opengraph.get('locale'), dict): opengraph['locale'] = opengraph.get('locale').get('identifier')
        publish_date = pm.process_date(opengraph.get('updated_time') or pd.publish_date)

        # canonical_link & domain
        domain = canonical_link = str(opengraph.get('url') or pd.canonical_link)
        if '//' in domain: domain = domain.split('//')[1]
        if '?' in domain: domain = domain.split('?')[0]
        domain = '/'.join(domain.split('/')[0:1])

        # update
        # doc.update(
        doc = db.articles(
            fid = doc.fid,
            html = doc.html,
            link_stats = doc.link_stats,
            link_factors = doc.link_factors,
            rand = doc.rand,
            html_cnt = doc.html_cnt,
            #
            lines = doc.lines,
            spaces = doc.spaces,
            tabs = doc.tabs,
            braces = doc.braces,
            brackets = doc.brackets,
            quesmarks = doc.quesmarks,
            exclamarks = doc.exclamarks,
            words = doc.words,
            #
            title = str(opengraph.get('title') or pd.title)[:500],
            # cleaned_text = str(cleaned_text),
            bow = bow,
            tags = [t.lower() for t in pd.tags],
            # opengraph = {sanitize_txt(k): sanitize_txt(v) for k, v in opengraph.items()},
            # summary = str(pd.summary),
            keywords = pd.keywords,
            top_image = str(top_image),
            movies = pd.movies,
            publish_date = publish_date,
            meta_site_name = str(opengraph.get('site_name')),
            meta_lang = str(opengraph.get('locale') or pd.meta_lang),
            meta_description = str(opengraph.get('description') or pd.meta_description),
            meta_keywords = pd.meta_keywords,
            canonical_link = canonical_link,
            domain = domain,
            authors = [n.lower().replace(' ', '_') for n in pd.authors],
#......... part of the code omitted here .........
Example 9: extract_data
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def extract_data(fname, loadp, savep):
    ######################
    # initialize process #
    ######################
    stream = GzipFile(loadp + fname)
    protocol = TBinaryProtocol.TBinaryProtocol(TTransport.TBufferedTransport(stream))
    data = {'data': []}
    count = 0

    ####################
    # begin extraction #
    ####################
    while True:
        page = WikiLinkItem()
        try:
            page.read(protocol)
            count += 1
        except:
            stream.close()
            break

        print '- processing FILE {0} ENTRY # {1}'.format(fname, count)
        print '\t $ URL: {0}'.format(page.url)

        #####################
        # initial filtering #
        #####################
        if page.url[:3] == 'ftp':
            print '\t\t ###### Ftp prefix detected (ignore) ###### \n'
            continue
        if page.url[len(page.url) - 4:] != 'html':
            print '\t\t ###### Non-html suffix detected (ignore) ###### \n'
            continue
        if page.content.dom == None:
            print '\t\t ###### Empty dom detected (ignore) ###### \n'
            continue

        #######################
        # secondary filtering #
        #######################
        entities = extract_entities(page.mentions)
        if len(entities) < 2:
            print '\t\t ###### Single entity found (discard) ###### \n'
            continue
        print '\t $ # Entities:', len(entities)

        #########################
        # alignment and parsing #
        #########################
        html = mark_dom(page.content.dom, entities)
        news = Article(page.url, language='en')
        try:
            news.set_html(html)
            news.parse()
        except:
            print '\t\t ###### Parsing failed (discard) ###### \n'
            continue

        ################
        # tokenization #
        ################
        text = None
        try:
            text = ftfy.fix_text(news.text)
            text = text.encode('ascii', 'ignore')
            text = seperate_delimiter(word_tokenize(text))
        except:
            print '\t\t ###### Tokenization failed (discard) ###### \n'
            continue

        #######################
        # save processed data #
        #######################
        print '\t $ Entry # {0} Saved \n'.format(count)
        data['data'].append({'text': text, 'dict': entities})

    #####################
    # save as json file #
    #####################
    print '****** {0}.json saved ******\n'.format(fname[:3])
    f = open(savep + '{0}.json'.format(fname[:3]), 'w')
    json.dump(data, f, indent=4)
    f.close()
Example 10: Article
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
# -*- coding: utf-8 -*-
from newspaper import Article
from goose import Goose
import requests
import json
import sys

article = Article(sys.argv[1])
article.download()

if not article.html:
    r = requests.get(sys.argv[1], verify=False, headers={'User-Agent': 'Mozilla/5.0'})
    article.set_html(r.text)

article.parse()
article.nlp()

published = ''
if article.publish_date:
    published = article.publish_date.strftime("%Y-%m-%d %H:%M:%S")

# Get body with goose
g = Goose()
goose_article = g.extract(raw_html=article.html)
body = goose_article.cleaned_text
summary = goose_article.meta_description

# Maybe use https://github.com/xiaoxu193/PyTeaser
if not summary:
    summary = article.summary
Example 11: prepare
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
def prepare(self, response):
    article = Article(url=response.url)
    article.set_html(response.text)
    article.parse()
    return article
Example 12: Article
# Required import: from newspaper import Article [as alias]
# Or: from newspaper.Article import set_html [as alias]
import sys, json
from newspaper import Article

htmlStr = ""
for line in sys.stdin:
    htmlStr = htmlStr + line

#obj = json.loads(jsonStr)

article = Article('')
article.set_html(htmlStr)
article.parse()
article.nlp()

ret = json.dumps(article.keywords)
print ret