This article collects typical usage examples of the Document.encode method from Python's readability.readability module, drawn from real open-source projects. If you have been wondering what Document.encode is for and how to call it, the examples below should help; see also the documentation of the containing class, readability.readability.Document.
The following shows 10 code examples involving Document.encode, sorted by popularity by default.
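Note that in every example below, encode is the ordinary Python string method: it is called on the HTML string returned by Document.summary() (or the title returned by Document.short_title()), not on the Document object itself. As a quick orientation, here is a minimal sketch of the shared pattern; the URL is a placeholder and the requests package is assumed to be available:

import requests
from readability.readability import Document

# Fetch a page; "https://example.com" is a placeholder URL.
html = requests.get("https://example.com").text
doc = Document(html)
title = doc.short_title()   # plain-text page title
article = doc.summary()     # cleaned article HTML, returned as a string
# encode() is the standard string method, used here to produce bytes for writing:
with open("article.html", "wb") as f:
    f.write(article.encode("utf-8"))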
Example 1: checkerFunction
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
import datetime
import hashlib
import os
import urllib  # Python 2
import requests
from readability.readability import Document

def checkerFunction(myInput):
    today = datetime.date.today()
    try:
        # Resolve the site's privacy policy via Google "I'm Feeling Lucky".
        google1 = 'http://www.google.com/search?hl=en&q='
        google2 = '%20privacy%20policy&btnI=1'
        keyword = myInput
        url = google1 + keyword + google2
        r = requests.get(url, allow_redirects=False)
        url = r.headers['location']
    except Exception:
        return
    myFullPath = "./sandbox/db/" + keyword
    if not os.path.exists("./sandbox"):
        os.makedirs("./sandbox")
    if not os.path.exists("./sandbox/db/"):
        os.makedirs("./sandbox/db/")
    if not os.path.exists(myFullPath):
        os.makedirs(myFullPath)
    filename = keyword + "." + str(today)
    filetowrite = myFullPath + "/" + filename
    fileExist = os.path.isfile(filetowrite)
    if url is None:
        return
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary()
    tempFileMade = False
    originalFileMade = False
    if fileExist:
        filetowrite = filetowrite + ".tmp."
        f = open(filetowrite, 'w')
        writeThis = str(readable_article.encode('ascii', 'ignore'))
        f.write(writeThis)
        f.close()
        tempFileMade = True
    else:
        f = open(filetowrite, 'w')
        writeThis = str(readable_article.encode('ascii', 'ignore'))
        f.write(writeThis)
        f.close()
        originalFileMade = True
    hashedmd5 = hashlib.md5(readable_article.encode('ascii', 'ignore'))
    hashedArticle = hashedmd5.hexdigest()
    return hashedArticle
Example 2: get_announcement_body
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
import datetime
import urlparse  # Python 2
from bs4 import BeautifulSoup
from readability.readability import Document

def get_announcement_body(url):
    # `br` is assumed to be a mechanize-style browser created elsewhere.
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()
    readable_announcement = Document(html).summary()
    readable_title = Document(html).title()
    soup = BeautifulSoup(readable_announcement, "lxml")
    final_announcement = soup.text
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        images.append(li)
    resp[0] = str(final_announcement.encode("ascii", "ignore"))
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = (str(now.month) + " " + str(now.day) + " " + str(now.year) +
               "-" + str(now.hour) + ":" + str(now.minute) + ":" + str(now.second))
    resp[3] = url
    resp[4] = url
    resp[5] = ""
    #insertDB(resp)
    #print "inserted resp"
    title_article = []
    title_article.append(final_announcement)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
Example 3: extrat_html_document
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
import urllib2  # Python 2
from bs4 import BeautifulSoup
from readability.readability import Document

def extrat_html_document(url):
    # `block_url`, `exception_url`, and `_file` are assumed to be defined at module level.
    try:
        print "extrat_html_document"
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = urllib2.Request(url, headers=headers)
        socket = urllib2.urlopen(r, timeout=1)
        url = socket.geturl()
        html = socket.read()
        # Skip blocked URLs.
        for bl_url in block_url:
            if len(url.split(bl_url)) > 1:
                summary = "block"
                return summary
        # For exception URLs, return only the title.
        for ext_url in exception_url:
            if len(url.split(ext_url)) > 1:
                readable_title = Document(html).short_title()
                summary = readable_title.encode('utf-8')
                _file.write(summary + '\n')
                return summary
        readable_article = Document(html).summary()
        readable_title = Document(html).short_title()
        summary = readable_title.encode('utf-8')
        print "soup start"
        soup = BeautifulSoup(readable_article.replace("br/", "p"), "html.parser")
        print "summary:"
        for s in soup("p"):
            summary += str(s.encode('utf-8'))
        # summary += readable_article.encode('utf-8')
    except Exception:
        _file.write('extrat_html_document Failed URL : ' + url + '\n')
        summary = "Failed Get data"
    return summary
Example 4: download_html_as_text
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
import re
import urllib2  # Python 2
import pypandoc
from readability.readability import Document

def download_html_as_text(url, filename=None, format_to='rst'):
    """Download HTML content from url and convert it to plain text."""
    # Construct the request and fetch the page.
    headers = {'User-Agent': 'Mozilla Firefox for Ubuntu canonical - 1.0'}
    req = urllib2.Request(url, headers=headers)
    con = urllib2.urlopen(req)
    html = con.read()
    # Extract the main content; fall back to the raw page when the
    # extracted article looks too short.
    article = Document(html).summary()
    if len(article) < 1024:
        article = html
    article = patch_image_alt(article)  # helper defined elsewhere
    title = Document(html).short_title()
    text = pypandoc.convert(article, format_to, format='html')
    title_utf8 = title.encode('utf-8')
    lines_insert = [u'\n\n',
                    u'=' * len(title_utf8), u'\n',
                    title_utf8, u'\n',
                    u'=' * len(title_utf8), u'\n\n',
                    u':URL: ' + url, u'\n\n']
    title = title.split('|,-')[0]
    # Search for URLs of images.
    imgurl_pattern = r'\.\.\s+\|([^|]+)\|\s+image::\s+(https?://\S+)'
    imgurl_re = re.compile(imgurl_pattern, re.I)
    image_urls = imgurl_re.findall(text)
    if filename is None:
        filename = title.split('-')[0].strip().replace(' ', '-')
    txtfile = open(filename + '-bak.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
    # Replace online image URLs with local paths.
    images = download_images(image_urls, filename + '-images')  # helper defined elsewhere
    for img, link in images:
        text = text.replace(link, img)
    txtfile = open(filename + '.' + format_to, 'w')
    txtfile.writelines(lines_insert)
    txtfile.write(text.encode('utf-8'))
    txtfile.close()
Example 5: getReadableArticle
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
import datetime
import urlparse  # Python 2
from bs4 import BeautifulSoup
from readability.readability import Document

def getReadableArticle(url):
    # `br` (a mechanize-style browser) and `insertDB` are assumed to be defined elsewhere.
    now = datetime.datetime.now()
    resp = ["", "", "", "", "", ""]
    images = []
    html = br.open(url).read()
    readable_article = Document(html).summary()
    #print readable_article
    readable_title = Document(html).title()
    #print readable_title
    soup = BeautifulSoup(readable_article)
    final_article = soup.text
    #print final_article
    links = soup.findAll('img', src=True)
    for lin in links:
        li = urlparse.urljoin(url, lin['src'])
        #print li
        images.append(li)
    resp[0] = str(final_article.encode("ascii", "ignore"))
    resp[1] = str(readable_title.encode("ascii", "ignore"))
    resp[2] = (str(now.month) + " " + str(now.day) + " " + str(now.year) +
               "-" + str(now.hour) + ":" + str(now.minute) + ":" + str(now.second))
    resp[3] = url
    resp[4] = url
    #if len(images) > 0:
    #    resp[5] = images[0]
    #else:
    resp[5] = ""
    insertDB(resp)
    print "inserted resp"
    title_article = []
    title_article.append(final_article)
    title_article.append(readable_title)
    title_article.append(images)
    return title_article
Example 6: parse
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
# Assumes scrapy's Selector, the spider's PostItem item class, HtmlTool,
# and `re` are imported at module level.
def parse(self, response):
    sel = Selector(response)
    item = PostItem()
    # Fill page URL.
    item['url'] = response.url

    # Extract the page title: a candidate heading counts only if it
    # matches every configured title keyword.
    def match_title(title):
        if title is None:
            return False
        for keyword in self.title_keywords:
            regex = re.compile(".*%s.*" % keyword)
            if not regex.match(title):
                return False
        return True

    for tag in ("h1", "h2", "h3", "h4", "title", "strong", "b", "p", "span"):
        for heads in sel.xpath("//%s/text()" % tag).extract():
            #for head in heads.strip().encode('utf-8').split(" - "):
            for head in filter(None, self.head_seps.split(heads.encode('utf-8'))):
                if match_title(head):
                    item['title'] = head.strip()
                    break
    # Clean page content.
    html = sel.xpath("//html").extract()
    if html:
        content = Document(html[0]).summary()
        item['page_content'] = content.encode('utf-8')
        #print item['page_content']
    if item.get('title') is None:
        print "title not found in this page"
        return
    if item.get('page_content') is None:
        print "content not found in this page"
        return
    #text = HtmlTool.text(html[0]).encode('utf-8')
    text = HtmlTool.text(content).encode('utf-8')
    lines = filter(None, self.line_seps.split(text))
    # Try to extract the project name from the title.
    res = self.project_name_exp.match(item['title'])
    if res:
        item["project_name"] = res.groups()[1]
    # Project pollutions.
    item["pollutions"] = {}
    # Extract other fields from page content.
    post_lapse_time = None
    self.hinting_results = {}
    # Dates occurring in page content.
    self.dates = []
    for line in lines:
        def extract_field(field):
            exps = self.field_regexps[field].get("extract", [])
            for exp in exps:
                result = exp[0].match(line)
                if result:
                    try:
                        return result.groups()[exp[1]]
                    except:
                        pass

        def hintextract_field(field):
            if field in self.hintings:
                exps = self.field_regexps[field].get("hintextract", [])
                for exp in exps:
                    result = exp[0].match(line)
                    if result:
                        try:
                            return result.groups()[exp[1]]
                        except:
                            pass

        def set_field(field):
            def set_extract_field(field):
                extract_res = extract_field(field)
                if extract_res:
                    item[field] = extract_res
                    self.hinting_results[field] = False
                    return True

            def set_hintextract_field(field):
                hintextract_res = hintextract_field(field)
                if hintextract_res:
                    item[field] = hintextract_res
                    self.hinting_results[field] = True
                    return True

            if not item.get(field):
                if set_extract_field(field):
                    return True
                else:
                    return set_hintextract_field(field)
            elif self.hinting_results.get(field):
                set_extract_field(field)
                return True
            else:
                return True

        def append_field(field):
            exps = self.field_regexps[field].get("appending", [])
            for exp in exps:
                if exp[0].match(line):
                    item[field][exp[1].encode('utf-8')] = 1

        def hinting_fields(fields):
            for field in fields:
                exps = self.field_regexps[field].get("hinting", [])
                for exp in exps:
#......... the rest of this code is omitted .........
Example 7: Document
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
#!/usr/bin/env python
# Python 3
import sys
import urllib.request
from readability.readability import Document

url = sys.argv[1]
#url = "http://www.space.com/29740-mice-of-mars-rodents-pave-way-to-red-planet.html"
html = urllib.request.urlopen(url).read()
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
print(readable_title)
print(readable_article.encode('utf-8').strip())
Example 8: print
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
# Python 2
from readability.readability import Document
import urllib2

# The URL to de-cruft.
URL = "http://arstechnica.com/science/2017/01/texas-slams-fda-with-lawsuit-for-holding-up-imported-execution-drugs/"
print "url is - " + URL
fURL = urllib2.urlopen(URL)

# Store the extracted article in a string, then write it to an HTML file.
htmlName = "decrufted.html"
htmlThing = open(htmlName, 'w')
strHtmlStuff = Document(fURL.read()).summary()
htmlThing.write(strHtmlStuff.encode('utf8') + '\n')
htmlThing.close()
print "The file name is: " + htmlName
Example 9: Document
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
import os, re
import requests
import pdfkit
import json
from bs4 import BeautifulSoup, NavigableString, Tag
from apiclient.discovery import build
from readability.readability import Document
import urllib

blogURL = 'http://katelynnow.com/riding-solo/'
r = requests.get(blogURL)
html = r.text
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
with open('test.html', 'wb') as f:
    f.write(readable_article.encode('utf8'))
pdfkit.from_string(readable_title + readable_article,
                   'out.pdf')

os.chdir('/Users/mrswhitneybell/Documents/Jason/J4')

def blog2pdf(blogURL):
    service = build('blogger', 'v3',
                    developerKey='AIzaSyAMtRVlEQPjdxvESWqjocPE42D9s1eFlRM')
    blogs = service.blogs()
    request = blogs.getByUrl(url=blogURL, view='READER')
    blogInfo = request.execute()
    blogId = blogInfo['id']
    posts = service.posts()
    request = posts.list(blogId=blogId, status='live',
                         orderBy='published', fetchImages=True, view='READER')
#......... the rest of this code is omitted .........
Example 10: startanalyse
# Required import: from readability.readability import Document [as alias]
# Or: from readability.readability.Document import encode [as alias]
import os
import re
from readability.readability import Document

def startanalyse(region, company, keyword, count):
    print '\nModule 3 - analyse html pages to judge keyword related or not.'
    searchkey = '%s+%s' % (company, keyword)
    # File for saving the analysis results.
    txtfilename = region + os.sep + company + os.sep + '%s_result.txt' % searchkey
    txtfile = open(txtfilename, 'r')
    txtcont = txtfile.readlines()
    txtfile.close()
    # Filename template for the saved HTML pages.
    _htmlfilename = region + os.sep + company + os.sep + searchkey + '_%d.html'
    yes = 0
    no = 0
    # Patterns for title, keywords, and description; the trailing .*?>
    # allows extra attributes and whitespace before the closing bracket.
    pattern_title = r'<title>(.*?)</title>'
    pattern_key = r'<meta\s(name=["]?keywords["]?\scontent=\"(.*?)\"|content=\"(.*?)\"\sname=["]?keywords["]?).*?>'
    pattern_des = r'<meta\s(name=["]?description["]?\scontent=\"(.*?)\"|content=\"(.*?)\"\sname=["]?description["]?).*?>'
    txtlist = []
    tmpfilename = 'tmp.txt'  # temporary file
    for i in range(count):
        tmp = i + 1
        htmlfilename = _htmlfilename % tmp
        company_flag = False
        keyword_flag = False
        # Skip HTML files that are empty.
        file_size = os.stat(htmlfilename).st_size
        if file_size != 0:
            htmlfile = open(htmlfilename, 'r')
            htmlcontent = htmlfile.read()
            htmlfile.close()
            # 1 - head content: description, keywords, title.
            head_title = re.search(pattern_title, htmlcontent, re.I | re.S)
            head_key = re.search(pattern_key, htmlcontent, re.I | re.S)
            head_des = re.search(pattern_des, htmlcontent, re.I | re.S)
            # 2 - body content extracted with readability.
            body_content = Document(htmlcontent).summary()
            tmpfile = open(tmpfilename, 'w')
            tmpfile.write(body_content.encode('utf-8'))
            tmpfile.close()
            tmpfile = open(tmpfilename, 'r')
            body_content = tmpfile.read()
            tmpfile.close()
            # Is the page related to the company?
            if ((head_title is not None and company in head_title.group(1)) or
                    (head_key is not None and company in head_key.group(1)) or
                    (head_des is not None and company in head_des.group(1))):
                company_flag = True
            else:
                _company = unicode(company, 'mbcs')
                if _company in body_content:
                    company_flag = True
            # If the company did not match, skip the keyword check.
            if company_flag:
                # Is the page related to the keyword?
                if ((head_title is not None and keyword in head_title.group(1)) or
                        (head_key is not None and keyword in head_key.group(1)) or
                        (head_des is not None and keyword in head_des.group(1))):
                    keyword_flag = True
                else:
                    _keyword = unicode(keyword, 'mbcs')
                    if _keyword in body_content:
                        keyword_flag = True
        # Show results.
        print tmp, ' company related:', company_flag, ' keyword related:', keyword_flag
        # Store results.
        if company_flag and keyword_flag:
            txtlist.append('yes')
        else:
            txtlist.append('no')
    # Write the results back into the analysis result file.
    for j in range(len(txtcont)):
        newcont = '*' + txtlist[j] + '\n'
        oldcont = txtcont[j]
        txtcont[j] = oldcont.replace('\n', newcont)
    txtfile = open(txtfilename, 'w')
    txtfile.writelines(txtcont)
    txtfile.close()
    if os.path.exists(tmpfilename):
        os.remove(tmpfilename)