本文整理汇总了Python中newspaper.Article.parse方法的典型用法代码示例。如果您正苦于以下问题:Python Article.parse方法的具体用法?Python Article.parse怎么用?Python Article.parse使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类newspaper.Article的用法示例。
在下文中一共展示了Article.parse方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse_article
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def parse_article(url, lang, featured=0, db=None):
    """Fetch, parse, and cache a news article keyed by URL.

    If the URL is already in the ``articles`` table, return the cached row
    as a dict. Otherwise download and parse it with newspaper, insert a new
    row, and return it. Returns None when parsing fails.

    Args:
        url: article URL; also the cache key.
        lang: language code stored with the row.
        featured: truthy to flag the article as featured (only honored when
            the extracted text is at least 50 characters long).
        db: open database connection; a fresh one is opened when omitted.
            (The original default ``db=connect_db()`` was evaluated once at
            definition time, sharing a single connection across all calls.)
    """
    if db is None:
        db = connect_db()
    cur = db.execute("select * from articles where url=?", (url,))
    row = cur.fetchone()
    if row is not None:
        # Cache hit: rebuild the dict from the stored columns.
        return dict(id=row[0], url=row[1], title=row[2], image=row[3],
                    text=row[4], authors=row[5], date=row[6],
                    featured=row[7], language=row[8])
    article = Article(url)
    article.download()
    try:
        article.parse()
    except Exception:  # narrowed from bare except; parsing is best-effort
        return None
    title = article.title
    image = article.top_image
    text = article.text
    authors = ",".join(article.authors)
    # Only a real datetime yields a usable timestamp; otherwise store 0.
    date = (int(time.mktime(article.publish_date.timetuple()))
            if isinstance(article.publish_date, datetime.datetime) else 0)
    db.execute(
        "insert into articles (url, title, image, text, authors, date, featured, language) "
        "values (?, ?, ?, ?, ?, ?, ?, ?)",
        (url, title, image, text, authors, date,
         featured and len(text) >= 50, lang))
    db.commit()
    idquery = db.execute("select (id) from articles where url=?", (url,))
    row_id = idquery.fetchone()[0]  # renamed: `id` shadowed the builtin
    return {"id": row_id, "url": url, "title": title, "image": image,
            "text": text, "authors": authors, "date": date, "language": lang}
示例2: test_chinese_fulltext_extract
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def test_chinese_fulltext_extract(self):
    """Extracted Chinese full text must match the stored reference file."""
    page = Article(
        url="http://www.bbc.co.uk/zhongwen/simp/chinese_news/2012/12/121210_hongkong_politics.shtml",
        language="zh")
    page.download()
    page.parse()
    reference_path = os.path.join(TEXT_FN, "chinese_text_1.txt")
    with codecs.open(reference_path, "r", "utf8") as reference:
        expected = reference.read()
    assert page.text == expected
示例3: test_spanish_fulltext_extract
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def test_spanish_fulltext_extract(self):
    """Extracted Spanish full text must match the stored reference file."""
    page = Article(
        url="http://ultimahora.es/mallorca/noticia/noticias/local/fiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html",
        language="es")
    page.download()
    page.parse()
    reference_path = os.path.join(TEXT_FN, "spanish_text_1.txt")
    with codecs.open(reference_path, "r", "utf8") as reference:
        expected = reference.read()
    assert page.text == expected
示例4: post_new
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def post_new(request):
    """Django view: create a post from a submitted form, or (on GET with a
    ``url`` parameter) prefill the form by scraping the given article URL."""
    if request.method == "POST":
        form = PostForm(request.POST)
        if form.is_valid():
            new_post = form.save(commit=False)
            new_post.author = request.user
            new_post.published_date = timezone.now()
            new_post.save()
            return redirect('blog.views.post_detail', pk=new_post.pk)
    elif request.method == 'GET':
        url = request.GET.get('url', '')
        if len(url) > 5:
            # Scrape the article and seed the form with its metadata.
            scraped = Article(url, language='en')
            scraped.download()
            scraped.parse()
            scraped.nlp()
            form = PostForm({
                'title': scraped.title.replace(u'\u2019', "\'"),
                'summary': scraped.summary.replace('\n', ' ').replace(u'\u2019', "\'"),
                'image': scraped.top_image,
                'link': url,
                # host portion of the URL, without a leading "www."
                'source': url.split('//')[1].split('/')[0].replace('www.', ''),
                'status': 'UD',
            })
        else:
            form = PostForm()
    return render(request, 'blog/post_edit.html', {'form': form})
示例5: run
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def run(self):
    """Fetch each note page listed in NOTES_LIST with newspaper, parse it,
    and print its extracted text. (Python 2: uses print statements.)"""
    logging.debug("run() - [WAIT]")
    from newspaper import Article
    '''
    Library documentation: http://newspaper.readthedocs.org/en/latest/user_guide/quickstart.htm
    '''
    # IDs of the notes to fetch; commented entries are temporarily disabled.
    NOTES_LIST = [
        '118',
        '117',
        # '116',
        # '115',
    ]
    for note_id in NOTES_LIST:
        note = Article(url="http://site.tiagoprnl.in/core/visitor_home/nota/%s/" % note_id)
        note.download()
        # Visual separator between notes in the console output.
        print '*' * 100
        # print 'H T M L'
        # print note.html
        #print '*' * 100
        # print 'T E X T'
        note.parse()
        print note.text
    logging.debug("run() - [DONE]")
示例6: f
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def f(url):
    """Best-effort download-and-parse of the Chinese article linked by *url*.

    Args:
        url: a node whose ``.text`` attribute holds the target URL
            (presumably an lxml/BeautifulSoup element — TODO confirm).

    All failures are swallowed deliberately (the caller treats this as
    best-effort), but note the original dead branch: ``urllib2.urlopen``
    *raises* HTTPError for a 404 response rather than returning a 404
    code, so ``if status == 404`` could never trigger. The 404 case is
    now handled by the exception handler like every other failure.
    """
    url_urls = url.text
    try:
        urllib2.urlopen(url_urls)  # probe the URL; HTTPError/URLError on failure
        a_zh = Article(url_urls, language='zh')
        a_zh.download()
        a_zh.parse()
    except Exception:
        # Best-effort: skip pages that fail to download or parse.
        pass
示例7: check_url
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def check_url(args):
    """
    :param (basestr, basestr) url, res_filename:
    :return: (pubdate_failed, fulltext_failed)
    """
    url, res_filename = args
    html = mock_resource_with(res_filename, 'html')
    try:
        article = Article(url)
        article.download(html)
        article.parse()
    except Exception:
        print('<< URL: %s parse ERROR >>' % url)
        traceback.print_exc()
        # A parse error counts as both a pubdate and a fulltext failure.
        return True, True
    pubdate_failed = article.publish_date is None
    fulltext_failed = False
    correct_text = mock_resource_with(res_filename, 'txt')
    if article.text != correct_text:
        # print('Diff: ', simplediff.diff(correct_text, a.text))
        # `correct_text` holds the reason of failure if failure
        print('%s -- %s -- %s' %
              ('Fulltext failed',
               res_filename, correct_text.strip()))
        fulltext_failed = True
    # TODO: assert statements are commented out for full-text
    # extraction tests because we are constantly tweaking the
    # algorithm and improving
    # assert a.text == correct_text
    return pubdate_failed, fulltext_failed
示例8: insert_url
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def insert_url(url):
    """Download the article at *url* and store it in the publico SQLite DB,
    unless an identical item is already present."""
    conn = sqlite3.connect('publico_news_sqllite3.db')
    cursor = conn.cursor()
    # get the article in plain text
    article = Article(url)
    article.download()
    article.parse()
    item = {
        'datetime': article.publish_date,
        'title': article.title,
        'text': article.text,
        # NOTE(review): category/link are taken from sys.argv[1], not from
        # the `url` argument — confirm this asymmetry is intentional.
        'category': sys.argv[1].split('/')[6],
        'link': sys.argv[1],
        'origLink': sys.argv[1],
    }
    print(item['category'])
    print(item['datetime'])
    if duplicate(item, item['category'], cursor):
        print(url, "already in BD")
    else:
        status = insert_db(item, item['category'], cursor)
        if status == 1:
            print(sys.argv[1], "inserted")
        else:
            print("Error", status)
    conn.commit()
    conn.close()
示例9: makeDocs
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def makeDocs():
    """Rebuild the 'news' Elasticsearch index from the first 10 articles
    discovered on cnn.com. (Python 2: uses print statements.)"""
    utc = pytz.utc
    es = Elasticsearch(BONSAI_URL, verify_certs= True)
    # Drop and recreate the index so each run starts from a clean slate;
    # ignore "missing index" (404) / "already exists" (400) errors.
    es.indices.delete(index='news', ignore=[400, 404])
    es.indices.create(index='news', ignore=400)
    print "Created"
    cnn_paper = newspaper.build(u'http://cnn.com', memoize_articles=False)
    a = defaultdict(int)  # NOTE(review): appears unused — candidate for removal
    cnn_articles = cnn_paper.articles
    print cnn_paper.size()
    for i in range(10):
        article = cnn_articles[i]
        url = article.url
        art = Article(url)
        art.download()
        art.parse()
        print art.publish_date
        print art.text
        print "Article" + str(i)
        print art.publish_date is not None
        print art.text is not None
        # Only index articles that have both a publish date and text.
        if (art.publish_date is not None) and (art.text is not None):
            try:
                doc = {
                    'domain': 'CNN',
                    'date': utc.localize(art.publish_date),
                    'text': art.text
                }
                res = es.index(index="news", doc_type='article', id=i, body=doc)
                print "Doc" + str(i)
            except:
                print "Doc not accepted"
示例10: test2
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def test2(self):
    """Download a fixed list of UK news articles and append each one's
    extracted full text to ./Output2.txt.

    NOTE(review): the original assigned a first (NZ-news) list to
    ``articles`` and immediately overwrote it with the UK list below;
    that dead assignment has been removed.
    """
    articles = [
        'http://www.express.co.uk/news/uk/657926/New-Zealand-John-Key-slams-David-Cameron-Britain-forgetting-history-European-Union-EU',
        'http://www.bbc.co.uk/news/uk-wales-35954982',
        'http://www.telegraph.co.uk/news/2016/04/04/david-cameron-will-be-an-excellent-former-prime-minister/',
        'http://www.pressandjournal.co.uk/fp/news/aberdeenshire/880519/david-camerons-father-named-panamanian-company-aberdeenshire-home/',
        'http://www.theguardian.com/politics/2016/apr/01/senior-tories-brexit-vote-leave-attacks-david-cameron-letter-nhs-staff',
        'http://www.dailymail.co.uk/news/article-3519908/Nuclear-drones-threat-British-cities-Cameron-Obama-hold-war-game-session-respond-attack-kill-thousands-people.html',
        'http://www.telegraph.co.uk/news/2016/03/31/if-david-cameron-cant-stop-the-tory-fighting-hell-clear-jeremy-c/',
        'http://www.manchestereveningnews.co.uk/news/greater-manchester-news/gmp-boost-number-armed-officers-11125178',
        'http://www.theguardian.com/commentisfree/2016/apr/03/cameron-headphones-what-is-cool-what-is-not']
    with open("./Output2.txt", "w") as text_file:
        for url in articles:
            print(url)
            a = Article(url)
            a.download()
            a.parse()
            text_file.write(a.text.encode('utf-8'))
            text_file.write('\n')
示例11: is_valid_article
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def is_valid_article(link):
    """Return True (and record the article) if *link* is a CNN HTML page
    whose keywords hit ``nc_set`` without hitting ``contorversial_set``.

    Side effects: appends accepted articles to ``main_file`` and logs the
    link to ``visited_articles``.
    """
    print("Checking valid:\n" + link)
    if "cnn.com" not in link:
        return False
    if "html" not in link:
        return False
    article = Article(link)
    article.download()
    article.parse()
    article.nlp()
    keywords = article.keywords
    # Accepted when at least one keyword is on-topic...
    matched = any(key in nc_set for key in keywords)
    # ...and none is controversial (controversial overrides a match).
    if any(key in contorversial_set for key in keywords):
        matched = False
    # Short-circuiting `and` replaces the original bitwise `&` chain.
    if matched and len(article.authors) > 0 and article.publish_date < datetime.datetime(2007, 12, 30, 0, 0):
        # BUG FIX: `article.keywords` is a list; the original concatenated it
        # directly into a str (TypeError at runtime). Join it first.
        main_file.write(article.title + "\t\t" + ",".join(keywords) + "\t\t"
                        + link + "\t\t" + article.text + "\n")
        visited_articles.write(link + "\n")
        return True
    return False
示例12: runTest
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def runTest(self):
    """Compare extracted full text against the manually verified fixtures
    and print a pass/fail status per URL.

    The "correct" fulltext needs to be manually checked; we have 50 so far.
    """
    FULLTEXT_PREPARED = 50
    domain_counters = {}
    with open(URLS_FILE, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
    for url in urls[:FULLTEXT_PREPARED]:
        # Fixture names are "<domain><running count for that domain>".
        domain = get_base_domain(url)
        domain_counters[domain] = domain_counters.get(domain, 0) + 1
        res_filename = domain + str(domain_counters[domain])
        html = mock_resource_with(res_filename, 'html')
        try:
            a = Article(url)
            a.download(html)
            a.parse()
        except Exception:
            print('<< URL: %s parse ERROR >>' % url)
            traceback.print_exc()
            continue
        correct_text = mock_resource_with(res_filename, 'txt')
        condensed_url = url[:30] + ' ...'
        print('%s -- fulltext status: %s' %
              (condensed_url, a.text == correct_text))
示例13: get_image
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def get_image():
    """Flask endpoint: resolve ?url= to its lead image and redirect there.

    Responses: 400 when no url is given; a redirect when the url itself is
    an image or a lead image is found; 422 when parsing fails or no image.
    """
    target = request.args.get('url', '')
    if not target:
        abort(400)
    if is_image(target):
        # The URL already points at an image; no scraping needed.
        return redirect(target)
    art = Article(target)
    art.download()
    try:
        art.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        # Strip any query string from the image URL.
        lead_image = art.top_image.rsplit('?',1)[0]
    except AttributeError:
        lead_image = ''
    return redirect(lead_image) if lead_image else ('', 422)
示例14: get_details
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def get_details():
    """Flask endpoint: return JSON {url, top_image, text} for ?url=.

    Responses: 400 when no url is given; 422 when parsing fails. A url that
    is itself an image is returned as its own top_image with empty text.
    """
    target = request.args.get('url', '')
    if not target:
        abort(400)
    if is_image(target):
        return jsonify({
            "url": target,
            "top_image": target,
            "text": "",
        })
    art = Article(target)
    art.download()
    try:
        art.parse()
    except (IOError, UnicodeDecodeError):
        return '', 422
    try:
        # Strip any query string from the image URL.
        lead_image = art.top_image.rsplit('?',1)[0]
    except AttributeError:
        lead_image = ''
    return jsonify({
        "url": target,
        "top_image": lead_image,
        "text": art.text,
    })
示例15: show_article
# 需要导入模块: from newspaper import Article [as 别名]
# 或者: from newspaper.Article import parse [as 别名]
def show_article():
    """Flask endpoint: scrape the article at ?url_to_clean= and render it.

    Redirects to the index when no URL is supplied. HTML serialization and
    NLP are best-effort: failures fall back to a placeholder string / empty
    NLP fields rather than aborting the request.
    """
    url_to_clean = request.args.get('url_to_clean')
    if not url_to_clean:
        return redirect(url_for('index'))
    article = Article(url_to_clean)
    article.download()
    article.parse()
    try:
        html_string = ElementTree.tostring(article.clean_top_node)
    except Exception:  # narrowed from bare except; serialization is best-effort
        html_string = "Error converting html to string."
    try:
        article.nlp()
    except Exception:  # narrowed from bare except; NLP failure is non-fatal
        log.error("Couldn't process with NLP")
    a = {
        'html': html_string,
        'authors': str(', '.join(article.authors)),
        'title': article.title,
        'text': article.text,
        'top_image': article.top_image,
        'videos': str(', '.join(article.movies)),
        'keywords': str(', '.join(article.keywords)),
        'summary': article.summary
    }
    return render_template('article/index.html', article=a, url=url_to_clean)