

Python newspaper.Article Method Code Examples

This article collects typical usage examples of the newspaper.Article method in Python. If you are wondering what newspaper.Article does, how to call it, or what real-world usage looks like, the curated examples below should help. You can also explore further usage examples from the newspaper package.


Below are 15 code examples of the newspaper.Article method, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps the system recommend better Python code examples.
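
For orientation before the examples: the typical workflow with newspaper (newspaper3k) is to construct an Article from a URL, call download() to fetch the HTML, parse() to extract content, and optionally nlp() for keywords and a summary. A minimal sketch, assuming newspaper3k is installed and using a placeholder URL (nlp() additionally requires the NLTK punkt tokenizer data):

from newspaper import Article

url = 'https://example.com/some-news-story'  # placeholder URL for illustration

article = Article(url, language='en')
article.download()   # fetch the raw HTML
article.parse()      # extract title, text, authors, publish date, ...
article.nlp()        # optional: keywords and summary (requires NLTK data)

print(article.title)
print(article.keywords)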

Example 1: run

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def run(param):
    (article_dir, title_dir, html_path) = param
    try:
        raw_html = open(html_path, encoding="ascii", errors="surrogateescape").read().strip()
    except:
        # fall back to a detected encoding if the ASCII read fails
        raw_html = open(html_path, encoding=encoding_detector(html_path), errors="surrogateescape").read().strip()

    id = html_path.split('/')[-1].split('.')[0]
    a = Article('http://www.dummy.com', language='en')  # dummy URL; the real HTML is injected below
    a.download(input_html=raw_html)
    a.parse()
    title = a.title
    text = a.text
    title = remove_non_ascii(title)
    text = remove_non_ascii(text)
    fw = open('{}/{}'.format(article_dir, id),'w',encoding='utf-8')
    fw.write(text)
    fw.close()
    fw = open('{}/{}'.format(title_dir, id),'w',encoding='utf-8')
    fw.write(title)
    fw.close() 
Author: yaserkl, Project: TransferRL, Lines: 23, Source: cnn_dm_downloader.py

Example 2: summarizeArticles

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def summarizeArticles(articles, length, firstlast=False):
    summedArticles = []
    for a in articles:
        try: 
            A = Article(a.link)
            A.download()
            A.parse()
            text = ""
            # keep only substantial paragraphs (>100 chars) as the article body
            paragraphs = A.text.split('\n')
            for p in paragraphs:
                if len(p) > 100:
                    a.body.append(p)
                    text += p + ' '
            sentences = summarize(text, length, firstlast)
            for s in sentences:
                a.summary.append(s) 
            summedArticles.append(a)    
        except:
            pass  # skip articles that fail to download or parse
    return summedArticles 
Author: anfederico, Project: Stockeye, Lines: 21, Source: watch.py

Example 3: get_article

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def get_article(link, news, date):
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    lang = 'eng'
    # fall back to Indonesian ('id') if English extraction yields no usable title/text
    if len(article.title) < 5 or len(article.text) < 5:
        print('found BM/ID article')
        article = Article(link, language='id')
        article.download()
        article.parse()
        article.nlp()
        lang = 'id'
    return {
        'title': article.title,
        'url': link,
        'authors': article.authors,
        'top-image': article.top_image,
        'text': article.text,
        'keyword': article.keywords,
        'summary': article.summary,
        'news': news,
        'date': date,
        'language': lang,
    } 
Author: huseinzol05, Project: Python-DevOps, Lines: 27, Source: core.py

Example 4: raw_scraper

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def raw_scraper(url, memoize):
    t1 = time.time()

    try:
        # strip <script> and <style> elements with lxml's Cleaner
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = minify(article.html)
        html = cleaner.clean_html(html)
        article.parse()
    except:
        return None, None
    if article.text == "":
        return None, None

    metadata = {"url": url, "elapsed": time.time() - t1, "scraper": "raw"}
    return html, metadata 
Author: jcpeterson, Project: openwebtext, Lines: 21, Source: scrapers.py

Example 5: newspaper_scraper

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def newspaper_scraper(url, memoize):
    t1 = time.time()

    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        article.parse()
        text = article.text
        count = len(text.split())
    except:
        return None, None

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "newspaper",
    }
    return text, metadata 
Author: jcpeterson, Project: openwebtext, Lines: 21, Source: scrapers.py

Example 6: bs4_scraper

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def bs4_scraper(url, memoize):
    t1 = time.time()

    try:
        article = newspaper.Article(url, fetch_images=False, memoize_articles=memoize)
        article.download()
        html = article.html
        soup = bs4.BeautifulSoup(html, "lxml")
        text, count = find_and_filter_tag("p", soup)
        # DDB: keep text as a single string for consistency with
        # newspaper_scraper
        text = " ".join(text)
    except:
        return None, None

    metadata = {
        "url": url,
        "word_count": count,
        "elapsed": time.time() - t1,
        "scraper": "bs4",
    }
    return text, metadata 
Author: jcpeterson, Project: openwebtext, Lines: 24, Source: scrapers.py
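
Examples 4-6 come from the same openwebtext scrapers.py and share one return contract: (content, metadata) on success, (None, None) on failure. A minimal hedged driver under that assumption (the URL list is hypothetical):

def scrape_all(urls, memoize=False):
    results = []
    for url in urls:
        text, metadata = newspaper_scraper(url, memoize)
        if text is None:
            continue  # (None, None) signals that the scraper failed
        results.append((text, metadata))
    return results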

Example 7: __init__

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def __init__(self, html):
        self.html = html if html is not None else ""

        self.dummy_article = newspaper.Article(url='', fetch_images=False, verbose=True)
        self.dummy_article.set_html(html)
        self.dummy_article.parse()

        self.text = _filter_excessive_newlines(self.dummy_article.text)
        self.authors = self.dummy_article.authors
        self.authors = [x for x in self.authors if len(x.split(' ')) < 10]
        self.title = self.dummy_article.title

        # sometimes the extracted text starts with the title; strip it below
        if self.text.startswith(self.title + '\n'):
            self.text = self.text[len(self.title):].lstrip('\n')

        if self.dummy_article.publish_date and not isinstance(self.dummy_article.publish_date, str):
            try:
                self.publish_date = self.dummy_article.publish_date.date().strftime(
                    "%m-%d-%Y")
            except AttributeError:
                self.publish_date = None
        else:
            self.publish_date = None

        self._extract_summary() 
Author: rowanz, Project: grover, Lines: 28, Source: process_ccrawl.py

Example 8: download_text

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def download_text(self):
        """Downloads text from self.url and strip HTML tags.
        """
        if not self.text and self.url:
            a = Article(self.url)
            a.download()
            a.parse()
            self.text = a.text 
Author: Corollarium, Project: geograpy2, Lines: 10, Source: extraction.py

Example 9: analyse_web_page_article

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def analyse_web_page_article(self, url):
        article = Article(url)
        article.download()
        article.parse()
        return article, self.analyse(article.text)

Author: Drakkar-Software, Project: OctoBot-Tentacles, Lines: 9, Source: text_analysis.py

Example 10: parse_file

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def parse_file(file_entry):
    file_name, html = file_entry
    # the md5 of the HTML stands in for a URL; the HTML itself is injected via set_html
    url_hash = md5(html).hexdigest()
    article = newspaper.Article(url=url_hash, fetch_images=False)
    article.set_html(html)
    article.parse()
    return (file_name, article.text) 
Author: ConnorJL, Project: GPT2, Lines: 9, Source: extract_text.py
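
Example 10 parses HTML that is already available, so set_html() plus parse() replaces the usual download() step and no network request is made. A sketch of feeding it from local files (the glob pattern and the multiprocessing fan-out are assumptions for illustration; files are read as bytes because md5() above requires bytes):

import glob
import multiprocessing

def parse_directory(pattern='pages/*.html'):
    entries = []
    for path in glob.glob(pattern):
        with open(path, 'rb') as f:
            entries.append((path, f.read()))
    # parse_file is a top-level function, so it can be mapped across worker processes
    with multiprocessing.Pool() as pool:
        return pool.map(parse_file, entries)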

Example 11: handle_non_premium

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def handle_non_premium(cls):
        """Handle a non-premium article."""
        article = Article(cls.url)
        article.download()
        article.parse()

        title = article.title
        body = article.text

        return Comment(title, body) 
Author: fterh, Project: sneakpeek, Lines: 12, Source: __init__.py

Example 12: make_comment

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def make_comment(cls, best_candidate):
        url = f"https://www.pressreader.com{best_candidate}"
        article = Article(url, browser_user_agent="Googlebot-News", keep_article_html=True)
        article.download()
        try:
            article.parse()
        except:
            # parsing failed; fall back to an empty comment
            return Comment('', '')

        title = article.title.replace("\xad", "")  # clean the text
        body = article.text.replace("\xad", "")  # clean the text

        print(f"checking the article in this url: {url} with title {title}")
        return Comment(title, body) 
Author: fterh, Project: sneakpeek, Lines: 16, Source: __init__.py

Example 13: handle

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def handle(cls, url):
        article = Article(url)
        article.download()
        article.parse()

        title = article.title
        body = article.text

        return Comment(title, body) 
Author: fterh, Project: sneakpeek, Lines: 11, Source: __init__.py

Example 14: crawl

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def crawl(self, doc):
        """ Crawl this document. """

        # instantiate and download article
        article = Article(url=doc.url, language='en', fetch_images=False, request_timeout=10)
        article.download()

        # extract content
        self.extract(doc, article) 
Author: Code4SA, Project: mma-dexter, Lines: 11, Source: generic.py

Example 15: parse_input

# Required import: import newspaper [as alias]
# Or: from newspaper import Article [as alias]
def parse_input(text, extractor='newspaper'):
    if isinstance(text, (str, unicode)):  # Python 2: accept both str and unicode
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.') 
Author: jaijuneja, Project: PyTLDR, Lines: 28, Source: preprocess.py
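
parse_input dispatches on the shape of its argument: a URL is fetched and extracted, a path ending in .txt is read from disk, and anything else is treated as raw text. Usage, with placeholder inputs and under the snippet's Python 2 assumptions:

from_url = parse_input('https://example.com/article')    # fetch and extract with newspaper
from_file = parse_input('notes.txt')                     # read a local text file
from_text = parse_input('Some raw text to summarize.')   # passed through as-is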


Note: The newspaper.Article method examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by the community; copyright remains with the original authors. Please consult the corresponding project's License before distributing or using the code. Do not reproduce without permission.