This page collects typical usage examples of Python's newspaper.Article. If you have been wondering what newspaper.Article does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples from the enclosing newspaper module.
The following presents 15 code examples of newspaper.Article, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
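Before the individual examples, here is a minimal sketch of the basic Article workflow that most of them build on (the URL is only a placeholder, and the nlp() step additionally requires NLTK's punkt tokenizer to be installed):

from newspaper import Article

article = Article('https://example.com/some-article', language='en')  # placeholder URL
article.download()   # fetch the HTML
article.parse()      # extract title, text, authors, publish date, ...
article.nlp()        # compute keywords and a summary
print(article.title)
print(article.keywords)
print(article.summary)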
Example 1: run
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def run(param):
    (article_dir, title_dir, html_path) = param
    # Read the raw HTML, falling back to a detected encoding if ASCII fails.
    try:
        raw_html = open(html_path, encoding="ascii", errors="surrogateescape").read().strip()
    except:
        raw_html = open(html_path, encoding=encoding_detector(html_path), errors="surrogateescape").read().strip()
    id = html_path.split('/')[-1].split('.')[0]
    # Parse the pre-fetched HTML by passing it to download() instead of fetching the dummy URL.
    a = Article('http://www.dummy.com', language='en')
    a.download(input_html=raw_html)
    a.parse()
    title = a.title
    text = a.text
    title = remove_non_ascii(title)
    text = remove_non_ascii(text)
    fw = open('{}/{}'.format(article_dir, id), 'w', encoding='utf-8')
    fw.write(text)
    fw.close()
    fw = open('{}/{}'.format(title_dir, id), 'w', encoding='utf-8')
    fw.write(title)
    fw.close()
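A hypothetical invocation of run follows; the directory names, the HTML file path, and the helpers encoding_detector and remove_non_ascii are assumptions, since they are not part of the excerpt:

# Hypothetical call: write the body of pages/0001.html to articles/0001
# and its title to titles/0001 (paths are placeholders).
run(("articles", "titles", "pages/0001.html"))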
Example 2: summarizeArticles
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def summarizeArticles(articles, length, firstlast=False):
    summedArticles = []
    for a in articles:
        try:
            A = Article(a.link)
            A.download()
            A.parse()
            text = ""
            # Keep only substantial paragraphs (longer than 100 characters).
            paragraphs = A.text.split('\n')
            for p in paragraphs:
                if len(p) > 100:
                    a.body.append(p)
                    text += p + ' '
            sentences = summarize(text, length, firstlast)
            for s in sentences:
                a.summary.append(s)
            summedArticles.append(a)
        except: pass
    return summedArticles
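The summarize helper is not included in the excerpt. Purely as an assumption about its shape, a naive extractive stand-in could return the first length sentences and optionally tack on the last one:

import re

# Assumed stand-in for the undefined summarize() helper, not the original code:
# return the first `length` sentences, plus the final sentence when firstlast is set.
def summarize(text, length, firstlast=False):
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    summary = sentences[:length]
    if firstlast and len(sentences) > length:
        summary.append(sentences[-1])
    return summary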
Example 3: get_article
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def get_article(link, news, date):
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    lang = 'eng'
    # Fall back to Indonesian parsing when the English pass finds almost nothing.
    if len(article.title) < 5 or len(article.text) < 5:
        print('found BM/ID article')
        article = Article(link, language='id')
        article.download()
        article.parse()
        article.nlp()
        lang = 'id'
    return {
        'title': article.title,
        'url': link,
        'authors': article.authors,
        'top-image': article.top_image,
        'text': article.text,
        'keyword': article.keywords,
        'summary': article.summary,
        'news': news,
        'date': date,
        'language': lang,
    }
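Because get_article calls article.nlp(), NLTK's punkt sentence tokenizer must be available; a one-time setup step looks like this:

import nltk
nltk.download('punkt')  # required once before Article.nlp() can build keywords and summaries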
Example 4: raw_scraper
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def raw_scraper(url, memoize):
    t1 = time.time()
    try:
        # Cleaner and minify are imported elsewhere in the project
        # (Cleaner matches lxml.html.clean.Cleaner); strip scripts and styles.
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except:
        return None, None
    if article.text == "":
        return None, None
    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata
Example 5: newspaper_scraper
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def newspaper_scraper(url, memoize):
    t1 = time.time()
    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        article.parse()
        text = article.text
        count = len(text.split())
    except:
        return None, None
    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "newspaper",
    }
    return text, metadata
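A hypothetical call to newspaper_scraper (the URL is a placeholder; memoize simply toggles newspaper's article caching):

text, meta = newspaper_scraper("https://example.com/news/story", memoize=False)
if text is not None:
    print(meta["word_count"], "words scraped in", round(meta["elapsed"], 2), "seconds")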
Example 6: bs4_scraper
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def bs4_scraper(url, memoize):
    t1 = time.time()
    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = article.html
        soup = bs4.BeautifulSoup(html, "lxml")
        text, count = find_and_filter_tag("p", soup)
        # DDB: keep text as a single string for consistency with
        # newspaper_scraper
        text = " ".join(text)
    except:
        return None, None
    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "bs4",
    }
    return text, metadata
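find_and_filter_tag is defined elsewhere in that project; offered only as an assumption, a minimal version could collect the text of every matching tag and count the words:

# Assumed stand-in for the undefined find_and_filter_tag() helper.
def find_and_filter_tag(tag, soup):
    texts = [node.get_text(strip=True) for node in soup.find_all(tag)]
    texts = [t for t in texts if t]
    count = sum(len(t.split()) for t in texts)
    return texts, count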
Example 7: __init__
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def __init__(self, html):
    self.html = html if html is not None else ""
    # Parse pre-fetched HTML through a dummy Article (empty URL, no image fetching).
    self.dummy_article = newspaper.Article(url='', fetch_images=False, verbose=True)
    self.dummy_article.set_html(html)
    self.dummy_article.parse()
    self.text = _filter_excessive_newlines(self.dummy_article.text)
    self.authors = self.dummy_article.authors
    self.authors = [x for x in self.authors if len(x.split(' ')) < 10]
    self.title = self.dummy_article.title
    # sometimes the text starts with the title... that's bad
    if self.text.startswith(self.title + '\n'):
        self.text = self.text[len(self.title):].lstrip('\n')
    if self.dummy_article.publish_date and not isinstance(self.dummy_article.publish_date, str):
        try:
            self.publish_date = self.dummy_article.publish_date.date().strftime("%m-%d-%Y")
        except AttributeError:
            self.publish_date = None
    else:
        self.publish_date = None
    self._extract_summary()
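_filter_excessive_newlines and _extract_summary belong to the surrounding class and are not shown. A plausible sketch of the former (an assumption, not the original) collapses runs of blank lines:

import re

# Assumed helper: collapse three or more consecutive newlines down to two.
def _filter_excessive_newlines(text):
    return re.sub(r'\n{3,}', '\n\n', text)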
Example 8: download_text
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def download_text(self):
    """Download the text behind self.url and strip the HTML tags."""
    if not self.text and self.url:
        a = Article(self.url)
        a.download()
        a.parse()
        self.text = a.text
Example 9: analyse_web_page_article
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def analyse_web_page_article(self, url):
    article = Article(url)
    article.download()
    article.parse()
    # Return the parsed article together with the analysis of its text.
    return article, self.analyse(article.text)

# returns a list of highly influential websites
Example 10: parse_file
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def parse_file(file_entry):
    file_name, html = file_entry
    # Use the MD5 of the raw HTML as a stand-in URL and parse without downloading.
    url_hash = md5(html).hexdigest()
    article = newspaper.Article(url=url_hash, fetch_images=False)
    article.set_html(html)
    article.parse()
    return (file_name, article.text)
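Since parse_file takes a (file_name, html) pair and returns a (file_name, text) pair, it maps naturally over a worker pool; a hypothetical driver, with placeholder file names, might look like this:

from multiprocessing import Pool

# Hypothetical driver: extract text from locally saved HTML snapshots in parallel.
entries = [(name, open(name, 'rb').read()) for name in ('page1.html', 'page2.html')]
with Pool(processes=4) as pool:
    results = pool.map(parse_file, entries)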
Example 11: handle_non_premium
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def handle_non_premium(cls):
    """Handle a non-premium article."""
    article = Article(cls.url)
    article.download()
    article.parse()
    title = article.title
    body = article.text
    return Comment(title, body)
Example 12: make_comment
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def make_comment(cls, best_candidate):
    url = f"https://www.pressreader.com{best_candidate}"
    article = Article(url, browser_user_agent="Googlebot-News", keep_article_html=True)
    article.download()
    try:
        article.parse()
    except:
        return Comment('', '')
    title = article.title.replace("\xad", "")  # strip soft hyphens from the text
    body = article.text.replace("\xad", "")  # strip soft hyphens from the text
    print(f"checking the article in this url: {url} with title {title}")
    return Comment(title, body)
Example 13: handle
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def handle(cls, url):
    article = Article(url)
    article.download()
    article.parse()
    title = article.title
    body = article.text
    return Comment(title, body)
Example 14: crawl
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def crawl(self, doc):
    """Crawl this document."""
    # instantiate and download the article
    article = Article(url=doc.url, language='en', fetch_images=False, request_timeout=10)
    article.download()
    # extract content
    self.extract(doc, article)
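self.extract(doc, article) is implemented elsewhere in that crawler. As an assumption about its shape only, it would presumably parse the downloaded article and copy the extracted fields onto the document:

# Assumed shape of the extract() step, not the original implementation.
def extract(self, doc, article):
    article.parse()
    doc.title = article.title
    doc.text = article.text
    doc.publish_date = article.publish_date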
Example 15: parse_input
# Required module: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def parse_input(text, extractor='newspaper'):
    # Note: the unicode check below implies this example targets Python 2.
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
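unicode_to_ascii is not shown in the excerpt. A plausible Python 3 stand-in (an assumption, not the original helper) normalizes the input and drops any non-ASCII characters:

import unicodedata

# Assumed stand-in for the undefined unicode_to_ascii() helper.
def unicode_to_ascii(text):
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='ignore')
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')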