

Python Document.encode Method Code Examples

This article collects typical usage examples of the Python method readability.readability.Document.encode. If you have been wondering what Document.encode looks like in practice, the hand-picked examples below should help. (In each example, encode() is actually called on the string returned by a Document method such as summary(), short_title() or title().) You can also browse further usage examples of readability.readability.Document.


Ten code examples of the Document.encode pattern are shown below, ordered roughly by popularity.
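All of the examples share one core pattern: Document(html) parses a downloaded page, summary(), short_title() or title() return the extracted article and title as strings, and .encode() then turns those strings into bytes for writing, hashing, or printing. As a minimal sketch of that pattern (assuming Python 3, the readability-lxml package, and a reachable URL; the URL and output filename here are illustrative, not taken from any example below):

import urllib.request

from readability.readability import Document

url = "https://example.com/article"  # hypothetical URL
html = urllib.request.urlopen(url).read()

doc = Document(html)
title = doc.short_title()   # extracted title as a str
article = doc.summary()     # main article content as an HTML str

# .encode() produces bytes; a file opened in binary mode expects bytes.
with open("article.html", "wb") as f:
    f.write(article.encode("utf-8"))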

Example 1: checkerFunction

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
def checkerFunction(myInput):
	# This snippet also relies on datetime, hashlib, os, requests and urllib,
	# imported at module level in the source file.
	today = datetime.date.today()
	try:
		# Use Google's "I'm Feeling Lucky" redirect to locate the privacy policy page.
		google1 = 'http://www.google.com/search?hl=en&q='
		google2 = '%20privacy%20policy&btnI=1'
		keyword = myInput

		url = google1 + keyword + google2
		r = requests.get(url, allow_redirects=False)
		url = r.headers['location']
	except Exception:
		return

	myFullPath = "./sandbox/db/" + keyword

	if not os.path.exists("./sandbox"):
		os.makedirs("./sandbox")

	if not os.path.exists("./sandbox/db/"):
		os.makedirs("./sandbox/db/")

	if not os.path.exists(myFullPath):
		os.makedirs(myFullPath)

	filename = keyword + "." + str(today)
	filetowrite = myFullPath + "/" + filename

	fileExist = os.path.isfile(filetowrite)

	if url is None:
		return
	html = urllib.urlopen(url).read()
	readable_article = Document(html).summary()
	tempFileMade = False
	originalFileMade = False
	if fileExist:
		# A copy already exists for today; write to a temporary file instead.
		filetowrite = filetowrite + ".tmp."
		f = open(filetowrite, 'w')
		writeThis = str(readable_article.encode('ascii', 'ignore'))
		f.write(writeThis)
		f.close()
		tempFileMade = True
	else:
		f = open(filetowrite, 'w')
		writeThis = str(readable_article.encode('ascii', 'ignore'))
		f.write(writeThis)
		f.close()
		originalFileMade = True

	# Hash the extracted article so later changes to the policy can be detected.
	hashedmd5 = hashlib.md5(readable_article.encode('ascii', 'ignore'))
	hashedArticle = hashedmd5.hexdigest()
	return hashedArticle
Author: joubin | Project: PrivacyPolicyChecker | Lines: 59 | Source: checker.py

Example 2: get_announcement_body

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
def get_announcement_body(url):
    # `br` is a browser object (e.g. a mechanize.Browser) created elsewhere in the
    # source file; datetime, urlparse and BeautifulSoup are module-level imports.
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()

    readable_announcement = Document(html).summary()
    readable_title = Document(html).title()
    soup = BeautifulSoup(readable_announcement, "lxml")
    final_announcement = soup.text
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        images.append(li)

    # ASCII-encode text and title, dropping characters that cannot be represented.
    resp[0] = str(final_announcement.encode("ascii", "ignore"))
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = str(now.month)+" "+str(now.day)+" "+str(now.year)+"-"+str(now.hour)+":"+str(now.minute)+":"+str(now.second)
    resp[3] = url
    resp[4] = url
    resp[5] = ""
    #insertDB(resp)
    #print "inserted resp"

    title_article = []
    title_article.append(final_announcement)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
Author: lukharri | Project: Web-Scraping | Lines: 32 | Source: getAnnouncement.py

Example 3: extrat_html_document

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
def extrat_html_document(url):
    # block_url, exception_url and _file are module-level globals in the source file.
    try:
        print "extrat_html_document"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = urllib2.Request(url, headers=headers)
        socket = urllib2.urlopen(r, timeout=1)
        url = socket.geturl()
        html = socket.read()

        # Skip blocked URLs entirely.
        for bl_url in block_url:
            if len(url.split(bl_url)) > 1:
                summary = "block"
                return summary

        # For exception URLs, return only the short title.
        for ext_url in exception_url:
            if len(url.split(ext_url)) > 1:
                readable_title = Document(html).short_title()
                summary = readable_title.encode('utf-8')
                _file.write(summary + '\n')
                return summary

        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        # Start the summary with the encoded title. (The original concatenated the
        # encoded title with itself here, which looks like a copy-paste bug.)
        summary = readable_title.encode('utf-8')
        print "soup start"
        soup = BeautifulSoup(readable_article.replace("br/", "p"), "html.parser")
        print "summary:"

        # Append each paragraph of the extracted article.
        for s in soup("p"):
            summary += str(s.encode('utf-8'))

#        summary += readable_article.encode('utf-8')

    except Exception:
        _file.write('extrat_html_document Failed URL : ' + url + '\n')
        summary = "Failed Get data"

    return summary
Author: yoonwonsang | Project: redcabinet | Lines: 43 | Source: views.py

Example 4: download_html_as_text

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
def download_html_as_text(url, filename=None, format_to='rst'):
    """Download HTML content from url and convert it to plain text."""
    # patch_image_alt() and download_images() are helpers defined elsewhere in the
    # source file; urllib2, re and pypandoc are module-level imports.
    # Construct internet connection
    headers = {'User-Agent': 'Mozilla Firefox for Ubuntu canonical - 1.0'}
    req = urllib2.Request(url, headers=headers)
    con = urllib2.urlopen(req)
    html = con.read()

    # Fetch and convert main contents
    article = Document(html).summary()
    if len(article) < 1024:
        # Extraction yielded too little text; fall back to the raw page.
        article = html

    article = patch_image_alt(article)
    title = Document(html).short_title()
    text = pypandoc.convert(article, format_to, format='html')

    title_utf8 = title.encode('utf-8')
    lines_insert = [u'\n\n',
                    u'=' * len(title_utf8), u'\n',
                    title_utf8, u'\n',
                    u'=' * len(title_utf8), u'\n\n',
                    u':URL: ' + url, u'\n\n']
    # Keep only the part of the title before a separator character.
    # (The original called title.split('|,-'), which splits on the literal
    # three-character string rather than on any of '|', ',' or '-'.)
    title = re.split(r'[|,-]', title)[0]

    # Search for urls of images
    imgurl_pattern = r'\.\.\s+\|([^|]+)\|\s+image::\s+(https?://\S+)'
    imgurl_re = re.compile(imgurl_pattern, re.I)
    image_urls = imgurl_re.findall(text)

    if filename is None:
        filename = title.split('-')[0].strip().replace(' ', '-')

    txtfile = open(filename + '-bak.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()

    # Replace online image URLs with local paths.
    images = download_images(image_urls, filename + '-images')
    for img, link in images:
        text = text.replace(link, img)

    txtfile = open(filename + '.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
Author: fredqi | Project: notes | Lines: 49 | Source: plain-notes.py

Example 5: getReadableArticle

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
def getReadableArticle(url):
    # `br` is a browser object (e.g. a mechanize.Browser) and insertDB() is a
    # helper, both defined elsewhere in the source file.
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()

    readable_article = Document(html).summary()
    #print readable_article
    readable_title = Document(html).title()
    #print readable_title
    soup = BeautifulSoup(readable_article)
    final_article = soup.text
    #print final_article
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        #print li
        images.append(li)

    resp[0] = str(final_article.encode("ascii", "ignore"))
    #print resp[0]
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = str(now.month)+" "+str(now.day)+" "+str(now.year)+"-"+str(now.hour)+":"+str(now.minute)+":"+str(now.second)
    resp[3] = url
    resp[4] = url
    #if len(images) > 0:
    #    resp[5] = images[0]
    #else:
    resp[5] = ""
    insertDB(resp)
    print "inserted resp"

    title_article = []
    title_article.append(final_article)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
Author: rrmckeever0319 | Project: Python-Code | Lines: 43 | Source: webscraper.py

Example 6: parse

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
    def parse(self, response):
        sel = Selector(response)
        item = PostItem()

        # fill page url
        item['url'] = response.url
        # extract page title
        def match_title(title):
            if title is None: return False
            for keyword in self.title_keywords:
                regex = re.compile(".*%s.*" %keyword)
                if not regex.match(title):
                    return False
            return True
        for tag in ("h1", "h2", "h3", "h4", "title", "strong", "b", "p", "span"):
            for heads in sel.xpath("//%s/text()" %tag).extract():
                #for head in heads.strip().encode('utf-8').split(" - "):
                for head in filter(None, self.head_seps.split(heads.encode('utf-8'))):
                    if match_title(head):
                        item['title'] = head.strip()
                        break
        # clean page content
        html = sel.xpath("//html").extract()
        if html:
            content = Document(html[0]).summary()
            item['page_content'] = content.encode('utf-8')
            #print item['page_content']
        if item.get('title') is None:
            print "title not found in this page"
            return
        if item.get('page_content') is None:
            print "content not found in this page"
            return

        #text = HtmlTool.text(html[0]).encode('utf-8')
        text = HtmlTool.text(content).encode('utf-8')
        lines = filter(None, self.line_seps.split(text))
        # try to extract project name from title
        res = self.project_name_exp.match(item['title'])
        if res:
            item["project_name"] = res.groups()[1]
        # project pollutions
        item["pollutions"] = {}
        # extract other fields from page content
        post_lapse_time = None
        self.hinting_results = {}
        # dates occurring in page content
        self.dates = []
        for line in lines:
            def extract_field(field):
                exps = self.field_regexps[field].get("extract", [])
                for exp in exps:
                    result = exp[0].match(line)
                    if result:
                        try:
                            return result.groups()[exp[1]]
                        except:
                            pass
            def hintextract_field(field):
                if field in self.hintings:
                    exps = self.field_regexps[field].get("hintextract", [])
                    for exp in exps:
                        result = exp[0].match(line)
                        if result:
                            try:
                                return result.groups()[exp[1]]
                            except:
                                pass
            def set_field(field):
                def set_extract_field(field):
                    extract_res = extract_field(field)
                    if extract_res:
                        item[field] = extract_res
                        self.hinting_results[field] = False
                        return True
                def set_hintextract_field(field):
                    hintextract_res = hintextract_field(field)
                    if hintextract_res:
                        item[field] = hintextract_res
                        self.hinting_results[field] = True
                        return True
                if not item.get(field):
                    if set_extract_field(field):
                        return True
                    else:
                        return set_hintextract_field(field)
                elif self.hinting_results.get(field):
                    set_extract_field(field)
                    return True
                else:
                    return True
            def append_field(field):
                exps = self.field_regexps[field].get("appending", [])
                for exp in exps:
                    if exp[0].match(line):
                        item[field][exp[1].encode('utf-8')] = 1
            def hinting_fields(fields):
                for field in fields:
                    exps = self.field_regexps[field].get("hinting", [])
                    for exp in exps:
#......... remainder of this method omitted .........
Author: chrox | Project: ecolect | Lines: 103 | Source: post_spider.py

Example 7: Document

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
#!/usr/bin/env python

import urllib.request, urllib.parse, urllib.error
import sys

from readability.readability import Document

url = sys.argv[1]

#url = "http://www.space.com/29740-mice-of-mars-rodents-pave-way-to-red-planet.html"

html = urllib.request.urlopen(url).read()
readable_article = Document(html).summary()
readable_title = Document(html).short_title()

print(readable_title)

print(readable_article.encode('utf-8').strip())  # note: under Python 3 this prints a bytes literal; print(readable_article.strip()) would print the text itself
Author: KyubiSystems | Project: Wisewolf | Lines: 20 | Source: readability_test.py

Example 8: print

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
# Here is my code.
from readability.readability import Document
import urllib2

# Get the URL to process.
URL = "http://arstechnica.com/science/2017/01/texas-slams-fda-with-lawsuit-for-holding-up-imported-execution-drugs/"
print ("url is - " + URL)
fURL = urllib2.urlopen(URL)

# Create the output HTML file.
htmlName = "decrufted.html"
htmlThing = open(htmlName, 'w')

# Extract the main article content into a string.
strHtmlStuff = Document(fURL.read()).summary()

# Write the extracted HTML, UTF-8 encoded.
htmlThing.write(strHtmlStuff.encode('utf8') + '\n')
htmlThing.close()
print "The file name is: " + htmlName
Author: kshitij108 | Project: project_p | Lines: 28 | Source: Final_submited.py

Example 9: Document

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
import os, re
import requests
import pdfkit
import json
from bs4 import BeautifulSoup, NavigableString, Tag
from apiclient.discovery import build
from readability.readability import Document
import urllib

blogURL = 'http://katelynnow.com/riding-solo/'
r = requests.get(blogURL)
html = r.text
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
# Write the extracted article UTF-8 encoded (the file is opened in binary mode).
with open('test.html', 'wb') as f:
    f.write(readable_article.encode('utf8'))

pdfkit.from_string(readable_title + readable_article,
                   'out.pdf')

os.chdir('/Users/mrswhitneybell/Documents/Jason/J4')
def blog2pdf(blogURL):
    service = build('blogger', 'v3',
                    developerKey='AIzaSyAMtRVlEQPjdxvESWqjocPE42D9s1eFlRM')
    blogs = service.blogs()
    request = blogs.getByUrl(url=blogURL, view='READER')
    blogInfo = request.execute()
    blogId = blogInfo['id']
    posts = service.posts()
    request = posts.list(blogId=blogId, status='live',
                         orderBy='published', fetchImages=True, view='READER')
    # (the remainder of blog2pdf is omitted in this excerpt)
Author: j4group | Project: wolf_is_lame | Lines: 33 | Source: blogPrint-dev.py

Example 10: startanalyse

# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
def startanalyse(region, company, keyword, count):
    print '\nModule 3 - analyse html pages to judge whether each page is keyword-related.'

    searchkey = '%s+%s'%(company,keyword)

    #file for saving analyzing results
    txtfilename = region+os.sep+company+os.sep+'%s_result.txt'%searchkey
    txtfile = open(txtfilename,'r')
    txtcont = txtfile.readlines()
    txtfile.close()

    #meta html page file name
    _htmlfilename = region+os.sep+company+os.sep+searchkey+'_%d.html'

    yes = 0
    no = 0

    #pattern: description, keywords, title
    pattern_title = '<title>(.*?)</title>'
    pattern_key = r'<meta\s(name=["]?keywords["]?\scontent=\"(.*?)\"|content=\"(.*?)\"\sname=["]?keywords["]?).*?>'   #.*?>: not always end symbol & space character
    pattern_des = r'<meta\s(name=["]?description["]?\scontent=\"(.*?)\"|content=\"(.*?)\"\sname=["]?description["]?).*?>'   #.*?>: not always end symbol & space character

    txtlist = []
    tmpfilename = 'tmp.txt' #temp usage
    for i in range(count):
        tmp = i + 1
        htmlfilename = _htmlfilename%tmp

        company_flag = False
        keyword_flag = False
            
        #judge html file is NULL or not
        file_size = os.stat(htmlfilename).st_size
        
        if file_size != 0:
            htmlfile = open(htmlfilename, 'r')
            htmlcontent = htmlfile.read()
            htmlfile.close()

            #1 - head content: description, keywords, title
            head_title = re.search(pattern_title,htmlcontent,re.I | re.S)
            head_key = re.search(pattern_key,htmlcontent,re.I | re.S)
            head_des = re.search(pattern_des,htmlcontent,re.I | re.S)
            #2 - body content: readability
            body_content = Document(htmlcontent).summary()
            # Round-trip the summary through a temp file so the substring
            # tests below operate on a plain byte string.
            tmpfile = open(tmpfilename,'w')
            tmpfile.write(body_content.encode('utf-8'))
            tmpfile.close()
            tmpfile = open(tmpfilename,'r')
            body_content = tmpfile.read()
            tmpfile.close()

            #is company related or not?
            if (head_title!=None and (company in head_title.group(1))) or (head_key!=None and (company in head_key.group(1))) or (head_des!=None and (company in head_des.group(1))):
                company_flag = True
            else:
                _company = unicode(company,'mbcs')
                if _company in body_content:
                    company_flag = True
            #if company not, stop judging
            if company_flag:
                #is keyword related or not?
                if (head_title!=None and (keyword in head_title.group(1))) or (head_key!=None and (keyword in head_key.group(1))) or (head_des!=None and (keyword in head_des.group(1))):
                    keyword_flag = True
                else:
                    _keyword = unicode(keyword, 'mbcs')
                    if _keyword in body_content:
                        keyword_flag = True
        #show results
        print tmp,' company related:',company_flag,' keyword related:',keyword_flag
    
        #store results
        if company_flag and keyword_flag:
            txtlist.append('yes')
        else:
            txtlist.append('no')

    #write back to analyzing result file
    for j in range(len(txtcont)):
        newcont = '*'+txtlist[j]+'\n'
        oldcont = txtcont[j]
        txtcont[j] = oldcont.replace('\n', newcont)
    txtfile = open(txtfilename,'w')
    txtfile.writelines(txtcont)
    txtfile.close()

    if os.path.exists(tmpfilename):
        os.remove(tmpfilename)
Author: holybin | Project: htmlparser | Lines: 91 | Source: analyse.py


Note: The readability.readability.Document.encode examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub/MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. For distribution and use, refer to each project's license; do not republish without permission.