本文整理匯總了Python中goose.Goose方法的典型用法代碼示例。如果您正苦於以下問題:Python goose.Goose方法的具體用法?Python goose.Goose怎麽用?Python goose.Goose使用的例子?那麽,這裏精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類 goose 的用法示例。
在下文中一共展示了goose.Goose方法的5個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於係統推薦出更棒的Python代碼示例。
示例1: __init__
# 需要導入模塊: import goose [as 別名]
# 或者: from goose import Goose [as 別名]
def __init__(self, corpus_dir, datastore_type='file', db_name='corpus.db'):
    '''
    Configure a Goose-based article extractor and the datastore the
    generated corpus will be written to.

    Args:
        corpus_dir (str): The directory to save the generated corpus.
        datastore_type (Optional[str]): Format to save generated corpus.
            Specify either 'file' or 'sqlite'.
        db_name (Optional[str]): Name of database if 'sqlite' is selected.
    '''
    # 'soup' parser class — presumably chosen for tolerance of malformed
    # HTML over the default parser; TODO confirm against goose docs.
    self.g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
    self.corpus_dir = corpus_dir
    self.datastore_type = datastore_type
    self.db_name = db_name
    # Per-key counters accumulated while the corpus is built.
    self.stats = defaultdict(int)
    self._create_corpus_dir(self.corpus_dir)
    self.db = None
    if self.datastore_type == 'sqlite':
        self.db = self.corpus_dir + '/' + self.db_name
        self._set_up_db(self.db)
示例2: goose_extractor
# 需要導入模塊: import goose [as 別名]
# 或者: from goose import Goose [as 別名]
def goose_extractor(url):
    '''Fetch *url* with the Goose library and return its title,
    meta description, and cleaned body text as a 3-tuple.'''
    extracted = Goose().extract(url=url)
    return (extracted.title,
            extracted.meta_description,
            extracted.cleaned_text)
示例3: parse_input
# 需要導入模塊: import goose [as 別名]
# 或者: from goose import Goose [as 別名]
def parse_input(text, extractor='newspaper'):
    """
    Normalize *text* into plain ASCII article text.

    Accepts a URL (the article is scraped with the chosen extractor),
    a path ending in '.txt' (the file is read), or a raw text string.

    Args:
        text: URL, '.txt' file path, or raw text (str/unicode).
        extractor: 'newspaper' (default) or 'goose' — which library
            scrapes the page when *text* is a URL.

    Returns:
        ASCII-converted article text via unicode_to_ascii().

    Raises:
        ValueError: if *text* is not a str or unicode instance.
    """
    # Guard clause replaces the original deeply nested if/else pyramid.
    if not isinstance(text, (str, unicode)):
        raise ValueError('Input text must be of type str or unicode.')
    if text.startswith(('http://', 'https://')):
        # Input is a link - need to extract the text from html.
        if extractor.lower() == 'goose':
            from goose import Goose
            article = Goose().extract(url=text)
            return unicode_to_ascii(article.cleaned_text)
        from newspaper import Article
        article = Article(text)
        article.download()
        article.parse()
        return unicode_to_ascii(article.text)
    if text.endswith('.txt'):
        # Input is a file; `with` guarantees the handle is closed even if
        # read() raises (the original leaked it on error).
        with open(text, 'rb') as textfile:
            return unicode_to_ascii(textfile.read())
    # Input is a string containing the raw text.
    return unicode_to_ascii(text)
示例4: get_parser
# 需要導入模塊: import goose [as 別名]
# 或者: from goose import Goose [as 別名]
def get_parser(url, tokenizer):
    """
    Scrape *url* with both sumy's HtmlParser and Goose and return the
    parser that extracted more words, plus Goose's article metadata.

    Args:
        url: Address of the web page to scrape.
        tokenizer: Tokenizer forwarded to both sumy parsers.

    Returns:
        (parser, meta): the better of the two parsers, and a dict of
        Goose article info with the bulky text/link fields removed.
    """
    useragent = ' '.join([
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/52.0.2743.116 Safari/537.36"])
    # Sites that serve full articles to traffic referred from Twitter —
    # presumably a paywall bypass; verify it still works for wsj.com.
    twitter_bypass = ['wsj.com']
    extra_headers = {}
    if any(domain in url for domain in twitter_bypass):
        extra_headers['Referer'] = r'https://t.co/T1323aaaa'
    # Scrape Web Page With HTMLParser and Goose and select the best scrape
    html_parser = HtmlParser.from_url(url, tokenizer, **extra_headers)
    article = Goose({'browser_user_agent': useragent})
    try:
        extract = article.extract(url=url)
    except Exception:
        # Goose raises IndexError when requesting unfamiliar sites; fall
        # back to fetching the raw HTML ourselves. (Narrowed from a bare
        # `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
        extract = article.extract(raw_html=requests.get(url).text)
    goose_parser = PlaintextParser(extract, tokenizer)
    # Aggregate site metadata, dropping the heavyweight fields.
    meta = {
        k: v for (k, v) in extract.infos.items()
        if k not in ('cleaned_text', 'links', 'tweets', 'movies')
    }
    # Select the parser whose document contains more words.
    if len(goose_parser.document.words) < len(html_parser.document.words):
        parser = html_parser
    else:
        parser = goose_parser
    return parser, meta
示例5: parse_results
# 需要導入模塊: import goose [as 別名]
# 或者: from goose import Goose [as 別名]
def parse_results(rss_results, website, lang, db_collection):
    """
    Function to parse the links drawn from an RSS feed.

    Parameters
    ----------
    rss_results: pattern.web.Results.
        Object containing data on the parsed RSS feed. Each item
        represents a unique entry in the RSS feed and contains
        relevant information such as the URL and title of the
        story.
    website: String.
        Nickname for the RSS feed being scraped.
    lang: String.
        Language of the feed. 'english' and 'arabic' are supported.
    db_collection: pymongo Collection.
        Collection within MongoDB that in which results are
        stored.

    Raises
    ------
    ValueError
        If *lang* is not a supported language.
    """
    if lang == 'english':
        goose_extractor = Goose({'use_meta_language': False,
                                 'target_language': 'en',
                                 'enable_image_fetching': False})
    elif lang == 'arabic':
        from goose.text import StopWordsArabic
        goose_extractor = Goose({'stopwords_class': StopWordsArabic,
                                 'enable_image_fetching': False})
    else:
        # The original only printed the language here and then crashed
        # later with a NameError on the undefined `goose_extractor`;
        # fail loudly and clearly instead.
        raise ValueError('Unsupported language for extraction: {}'.format(lang))
    for result in rss_results:
        page_url = _convert_url(result.url, website)
        in_database = _check_mongo(page_url, db_collection)
        if not in_database:
            try:
                text, meta = pages_scrape.scrape(page_url, goose_extractor)
                text = text.encode('utf-8')
            except TypeError:
                logger.warning('Problem obtaining text from URL: {}'.format(page_url))
                text = ''
        else:
            logger.debug('Result from {} already in database'.format(page_url))
            text = ''
        if text:
            cleaned_text = _clean_text(text, website)
            entry_id = mongo_connection.add_entry(db_collection, cleaned_text,
                                                  result.title, result.url,
                                                  result.date, website, lang)
            if entry_id:
                try:
                    logger.info('Added entry from {} with id {}'.format(page_url,
                                                                        entry_id))
                except UnicodeDecodeError:
                    logger.info('Added entry from {}. Unicode error for id'.format(result.url))