

Python goose.Goose Usage Examples

This article collects typical usage examples of goose.Goose in Python. If you are wondering what goose.Goose does, how to call it, and what working code looks like, the curated examples below should help. They also show how the surrounding goose library is used.


The following presents 5 code examples of goose.Goose, sorted by popularity by default.
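
Before diving in, here is a minimal sketch of the basic Goose API that all five examples build on (the URL is a placeholder): construct a Goose instance, optionally passing a configuration dict, call extract(), and read fields off the returned article object.

from goose import Goose

g = Goose()  # optionally: Goose({'browser_user_agent': 'Mozilla'})
article = g.extract(url='http://example.com/article')  # placeholder URL
print(article.title)             # extracted headline
print(article.meta_description)  # <meta name="description"> content
print(article.cleaned_text)      # boilerplate-free article body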

Example 1: __init__

# Required import: import goose
# Or: from goose import Goose
def __init__(self, corpus_dir, datastore_type='file', db_name='corpus.db'):
        '''
        Read links and associated categories for the specified articles
        from a text file, separated by a space.

        Args:
            corpus_dir (str): The directory to save the generated corpus.
            datastore_type (Optional[str]): Format to save the generated corpus.
                                            Specify either 'file' or 'sqlite'.
            db_name (Optional[str]): Name of the database if 'sqlite' is selected.
        '''

        # Use BeautifulSoup as the HTML parser; also requires
        # `from collections import defaultdict` at module level.
        self.g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
        # self.g = Goose({'browser_user_agent': 'Mozilla'})  # default lxml parser
        self.corpus_dir = corpus_dir
        self.datastore_type = datastore_type
        self.db_name = db_name
        self.stats = defaultdict(int)

        self._create_corpus_dir(self.corpus_dir)

        self.db = None
        if self.datastore_type == 'sqlite':
            self.db = self.corpus_dir + '/' + self.db_name
            self._set_up_db(self.db)
Author: skillachie, Project: news-corpus-builder, Lines: 27, Source file: news_corpus_generator.py
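
For context, the constructor above might be driven as in the sketch below; the class name NewsCorpusGenerator is inferred from the source file name and is an assumption here.

# Hypothetical usage; the class name is inferred from news_corpus_generator.py.
gen = NewsCorpusGenerator('my_corpus', datastore_type='sqlite', db_name='corpus.db')
print(gen.stats)  # per-category article counts accumulate here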

Example 2: goose_extractor

# Required import: import goose
# Or: from goose import Goose
def goose_extractor(url):
    '''Extract a web page's title, meta description,
       and cleaned body text using the Goose library.'''

    article = Goose().extract(url=url)
    return article.title, article.meta_description, article.cleaned_text
Author: lekhakpadmanabh, Project: Summarizer, Lines: 9, Source file: core.py
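
A quick usage illustration (the URL is a placeholder):

title, description, body = goose_extractor('http://example.com/article')  # placeholder URL
print(title)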

Example 3: parse_input

# Required import: import goose
# Or: from goose import Goose
def parse_input(text, extractor='newspaper'):
    # Python 2 code: `unicode` and the module-level helper
    # unicode_to_ascii() are assumed to be available.
    if isinstance(text, (str, unicode)):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from the HTML
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            with open(text, 'rb') as textfile:
                article = textfile.read()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Author: jaijuneja, Project: PyTLDR, Lines: 28, Source file: preprocess.py
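
Usage might look like the following sketch; both inputs are placeholders:

# Extract article text from a URL with Goose instead of the default newspaper extractor.
text = parse_input('http://example.com/news-story', extractor='goose')  # placeholder URL
# Or read raw text from a local file.
text = parse_input('document.txt')  # placeholder filename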

Example 4: get_parser

# Required import: import goose
# Or: from goose import Goose
def get_parser(url, tokenizer):
    # HtmlParser/PlaintextParser are sumy-style parsers, and `requests`
    # must be imported at module level in the source project.
    useragent = ' '.join([
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/52.0.2743.116 Safari/537.36"])

    twitter_bypass = ['wsj.com']
    extra_headers = {}

    if any(i in url for i in twitter_bypass):
        extra_headers['Referer'] = r'https://t.co/T1323aaaa'
    # Scrape the web page with both HtmlParser and Goose, then keep the better scrape
    html_parser = HtmlParser.from_url(url, tokenizer, **extra_headers)
    article = Goose({'browser_user_agent': useragent})

    # Goose raises IndexError when requesting unfamiliar sites,
    # so fall back to fetching the raw HTML ourselves.
    try:
        extract = article.extract(url=url)
    except IndexError:
        extract = article.extract(raw_html=requests.get(url).text)

    goose_parser = PlaintextParser(extract, tokenizer)

    # Aggregate site metadata, dropping the bulky content fields
    meta = {
        k: v for (k, v) in extract.infos.items()
        if k not in ('cleaned_text', 'links', 'tweets', 'movies')
    }
    # Select the parser that recovered more words
    parser = (
        html_parser
        if len(goose_parser.document.words) < len(html_parser.document.words) else  # noqa
        goose_parser)

    return parser, meta
Author: jjangsangy, Project: ExplainToMe, Lines: 37, Source file: textrank.py
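
Calling this might look like the following sketch, assuming the sumy tokenizer these parsers conventionally expect; the URL is a placeholder:

from sumy.nlp.tokenizers import Tokenizer  # assumed dependency

parser, meta = get_parser('http://example.com/story', Tokenizer('english'))  # placeholder URL
print(meta.get('title'))  # site metadata gathered from Goose's extract.infos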

Example 5: parse_results

# Required import: import goose
# Or: from goose import Goose
def parse_results(rss_results, website, lang, db_collection):
    """
    Parse the links drawn from an RSS feed.

    Parameters
    ----------

    rss_results: pattern.web.Results.
                    Object containing data on the parsed RSS feed. Each item
                    represents a unique entry in the RSS feed and contains
                    relevant information such as the URL and title of the
                    story.

    website: String.
                Nickname for the RSS feed being scraped.

    lang: String.
                Language of the feed's content; 'english' and 'arabic'
                are handled explicitly.

    db_collection: pymongo Collection.
                        Collection within MongoDB in which results are
                        stored.
    """
    if lang == 'english':
        goose_extractor = Goose({'use_meta_language': False,
                                 'target_language': 'en',
                                 'enable_image_fetching': False})
    elif lang == 'arabic':
        from goose.text import StopWordsArabic

        goose_extractor = Goose({'stopwords_class': StopWordsArabic,
                                 'enable_image_fetching': False})
    else:
        # No extractor is configured for this language; failing fast here
        # avoids a NameError on goose_extractor in the loop below.
        raise ValueError('Unsupported language: {}'.format(lang))

    for result in rss_results:

        page_url = _convert_url(result.url, website)

        in_database = _check_mongo(page_url, db_collection)

        if not in_database:
            try:
                text, meta = pages_scrape.scrape(page_url, goose_extractor)
                text = text.encode('utf-8')
            except TypeError:
                logger.warning('Problem obtaining text from URL: {}'.format(page_url))
                text = ''
        else:
            logger.debug('Result from {} already in database'.format(page_url))
            text = ''

        if text:
            cleaned_text = _clean_text(text, website)

            entry_id = mongo_connection.add_entry(db_collection, cleaned_text,
                                                  result.title, result.url,
                                                  result.date, website, lang)
            if entry_id:
                try:
                    logger.info('Added entry from {} with id {}'.format(page_url,
                                                                        entry_id))
                except UnicodeDecodeError:
                    logger.info('Added entry from {}. Unicode error for id'.format(result.url)) 
Author: openeventdata, Project: scraper, Lines: 63, Source file: scraper.py
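
A rough sketch of how this function might be driven, assuming pattern.web for the RSS fetch and a pymongo collection; the feed URL and database names are placeholders:

from pattern.web import Newsfeed
from pymongo import MongoClient

results = Newsfeed().search('http://example.com/rss')  # placeholder feed URL
collection = MongoClient()['scraper']['stories']       # assumed database/collection names
parse_results(results, 'example_site', 'english', collection)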


Note: the goose.Goose examples on this page were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their original authors; copyright remains with those authors, and redistribution or use should follow each project's license. Please do not republish without permission.