This article collects typical usage examples of the tidy_document function from Python's tidylib module. If you are wondering what tidy_document does, how to call it, and what real projects use it for, the curated examples here should help.
Below are 15 code examples of tidy_document, drawn from open-source projects and ordered by popularity by default.
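All 15 snippets revolve around the same small API: tidy_document() takes a string of markup plus an optional dict of HTML Tidy configuration options and returns a (document, errors) tuple. As a minimal sketch for orientation (the input string is made up):

from tidylib import tidy_document

document, errors = tidy_document(
    "<p>Hello &amp; goodbye",
    options={'numeric-entities': 1},  # keys are ordinary HTML Tidy option names
)
print(document)  # the cleaned-up, well-formed markup
print(errors)    # newline-separated warnings and errors reported by Tidy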
Example 1: test_doc_with_entity
def test_doc_with_entity(self):
    h = "&eacute;"
    expected = DOC % "&eacute;"
    doc, err = tidy_document(h)
    self.assertEqual(doc, expected)
    # With numeric-entities enabled, Tidy emits the numeric form instead
    expected = DOC % "&#233;"
    doc, err = tidy_document(h, {'numeric-entities': 1})
    self.assertEqual(doc, expected)
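The DOC constant is defined elsewhere in the pytidylib test suite: it is a template of the full document Tidy is expected to emit, with a %s placeholder where the body content lands. A hypothetical stand-in so the test reads on its own:

# Hypothetical stand-in for DOC; the real template in the test suite may differ.
DOC = '''<!DOCTYPE html>
<html>
<head>
<title></title>
</head>
<body>
%s
</body>
</html>
'''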
Example 2: _massage_diff_content
def _massage_diff_content(content):
    tidy_options = {
        'output-xhtml': 0,
        'force-output': 1,
    }
    try:
        content = tidy_document(content, options=tidy_options)
    except UnicodeDecodeError:
        # In case something happens in pytidylib we'll try again with
        # a proper encoding
        content = tidy_document(content.encode('utf-8'), options=tidy_options)
    tidied, errors = content
    content = tidied.decode('utf-8'), errors
    return content
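A caveat when reusing this snippet: older pytidylib releases could hand back encoded bytes (hence the decode step above), while recent releases generally return the same text type they were given, so under Python 3 the tidied.decode('utf-8') call may be unnecessary.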
Example 3: marklogic_put_xml
def marklogic_put_xml(self, item, spider_name):
    # Set the uri and collection
    if self.ml_transform == '':
        params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name}
    else:
        params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name,
                  'transform': self.ml_transform}
    # Set up the XML payload
    payload = dicttoxml(dict(item), attr_type=False, custom_root='webcontent')
    # Decode the XML-escaped characters back again
    payload = payload.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"')
    # Run tidy in order to get well-formed XML
    payload, errors = tidy_document(payload, options={'input-xml': 1})
    # Set up the header
    headers = {'Content-Type': 'application/xml'}
    ml_uri = ('ml_uri' in item and item['ml_uri']) or self.ml_uri
    logging.info("PUTting XML in " + ml_uri + " as " + item['uri'])
    # Call the MarkLogic REST endpoint
    ml_user = ('ml_user' in item and item['ml_user']) or self.ml_user
    ml_pwd = ('ml_pwd' in item and item['ml_pwd']) or self.ml_pwd
    r = requests.put(ml_uri,
                     params=params,
                     auth=HTTPDigestAuth(ml_user, ml_pwd),
                     data=payload,
                     headers=headers)
    logging.info("PUT response: " + str(r.status_code) + ", " + r.text)
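The input-xml option is what makes this work on dicttoxml output: it switches Tidy from HTML parsing to treating the input as generic XML. A minimal sketch (element names are illustrative):

from tidylib import tidy_document

payload = "<webcontent><title>Example</title><body>text &amp; more</body></webcontent>"
fixed, errors = tidy_document(payload, options={'input-xml': 1})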
Example 4: fetch_data
def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(),
                                               u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: no next game on the webpage.
        sys.exit(1)
    except Exception as e:
        # print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
Example 5: call
def call():
    if world.results:
        return
    data = urllib.urlencode(world.params)
    req = urllib2.Request(url="%s/%s?%s" % (world.base_url, world.requesttype, data),
                          headers=world.header)
    fd = urllib2.urlopen(req)
    page = fd.read()
    fmt = world.params.get('format')
    if fmt not in ('html', 'xml', 'json', 'jsonv2'):
        fmt = 'xml' if world.requesttype == 'reverse' else 'html'
    pageinfo = fd.info()
    assert_equal('utf-8', pageinfo.getparam('charset').lower())
    pagetype = pageinfo.gettype()
    if fmt == 'html':
        assert_equals('text/html', pagetype)
        document, errors = tidy_document(page,
                                         options={'char-encoding': 'utf8'})
        assert len(errors) == 0, "Errors found in HTML document:\n%s" % errors
        world.results = document
    elif fmt == 'xml':
        assert_equals('text/xml', pagetype)
        world.results = parseString(page).documentElement
    else:
        if 'json_callback' in world.params:
            func = world.params['json_callback']
            assert page.startswith(func + '(')
            assert page.endswith(')')
            page = page[(len(func) + 1):-1]
            assert_equals('application/javascript', pagetype)
        else:
            assert_equals('application/json', pagetype)
        world.results = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(page)
Example 6: test_xmlns_large_document_xml_corner_case
def test_xmlns_large_document_xml_corner_case(self):
    # Test for a super weird edge case in Tidy that can cause it to return
    # the wrong required buffer size.
    body = '<span><span>A</span></span>' + 'A' * 7937
    html = '<html xmlns="http://www.w3.org/1999/xhtml">' + body
    doc, err = tidy_document(html, {'output-xml': 1})
    self.assertEqual(doc.strip()[-7:], "</html>")
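pytidylib first asks Tidy how large an output buffer the document needs; the 27-character span wrapper plus 7937 filler characters pins an input length at which Tidy has been observed to misreport that size, so the assertion simply checks that the document was not truncated.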
Example 7: nofoutofplacefeatures
def nofoutofplacefeatures(url):
    try:
        if url[:4] == "http":
            r = requests.get(url)
        else:
            url = "http://" + url
            r = requests.get(url)
        data = r.text
        document, errors = tidy_document(data,
                                         options={'numeric-entities': 1})
        # Number of elements out of place: errors is a newline-separated
        # string, so count the messages rather than the characters
        return len(errors.splitlines())
    except:
        pass
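Since errors comes back as a single newline-separated string, it helps to see what individual messages look like. A rough sketch (the exact wording varies by Tidy version):

from tidylib import tidy_document

document, errors = tidy_document("<p>unclosed")
for line in errors.splitlines():
    print(line)  # e.g. "line 1 column 1 - Warning: missing <!DOCTYPE> declaration"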
Example 8: convert_to_html
def convert_to_html(filename):
    # Do the conversion with pandoc
    output = pypandoc.convert(filename, 'html')
    # Clean up with tidy...
    output, errors = tidy_document(output, options={
        'numeric-entities': 1,
        'wrap': 80,
    })
    print(errors)
    # Replace smart quotes with their named entities.
    output = output.replace(u"\u2018", '&lsquo;').replace(u"\u2019", '&rsquo;')
    output = output.replace(u"\u201c", "&ldquo;").replace(u"\u201d", "&rdquo;")
    # Write the output
    filename, ext = os.path.splitext(filename)
    filename = "{0}.html".format(filename)
    with open(filename, 'w') as f:
        # Python 2 "fix". If this isn't a string, encode it.
        if type(output) is not str:
            output = output.encode('utf-8')
        f.write(output)
    print("Done! Output written to: {}\n".format(filename))
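A hypothetical invocation, assuming pandoc is installed and the file is in a format it can read:

convert_to_html('notes.md')  # would write notes.html next to the source file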
Example 9: html2enml
def html2enml(html):
    doc, err = tidy_document(
        html,
        options={
            "output-xhtml": 1,
            "drop-proprietary-attributes": 1,
            "merge-divs": 1,
            "clean": 1,
        }
    )
    root = fromstring(doc)
    # XXX dirty hack to circumvent a bug in the lxml parser
    root = fromstring(etree.tostring(root))
    logging.debug(etree.tostring(root))
    # tidy_document returns a valid HTML document, which means it usually
    # contains an html tag and a proper body element
    root = root.find('body')
    if root is None:
        logging.warn("No body on this document")
        logging.warn(html)
        return "<div></div>"
    root.tag = 'div'
    root = remove_prohibited_elements(root)
    root = remove_prohibited_attributes(root)
    # FIXME: skipping DTD validation because of slow DTD creation speed
    # validate_dtd(html, f)
    return etree.tostring(root)
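The body-to-div conversion reflects the target format: ENML, Evernote's note markup, accepts only a restricted fragment of XHTML rather than a full document, so the tidied page's body is unwrapped into a plain div before the prohibited elements and attributes are stripped.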
Example 10: scrape
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()
    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            # 'indent': 1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        print errs
    doc = html5lib.parse(doc, treebuilder="lxml")
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    td = jQuery("td#content")
    assert len(td) == 1
    for img in td("img"):
        img = PyQuery(img)
        src = img.attr("src")
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt
    # need to fix links here
    content = PyQuery(td[0])
    content = no_namespaces(content.html())
    print slug, content[:60]
    if dbteeth:
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
            ),
        )
Example 11: __trading_years
def __trading_years(self, instrument):
    response = urllib2.urlopen('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/%s.phtml' % (instrument))
    document, errors = tidy_document(response.read())
    soup = BeautifulSoup(document)
    node = soup.find('select', attrs={'name': 'year'})
    for option in node.findAll('option'):
        yield option.getText()
Example 12: process_response
def process_response(self, request, response):
    if 'text/html' in response['Content-Type'] and response.content:
        document, errors = tidy_document(response.content)
        if errors:
            raise HTMLValidationError(errors)
    return response
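A sketch of wiring this into a Django project; the module path is made up, and on the older Django versions that use plain process_response classes the middleware is listed in settings like so:

# settings.py (hypothetical module path)
MIDDLEWARE_CLASSES = (
    'myproject.middleware.TidyValidationMiddleware',
)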
Example 13: _tidysrc
def _tidysrc(self, data, srccode):
    """Tidy the scraped HTML source."""
    try:
        from tidylib import tidy_document
        BASE_OPTIONS = {
            "output-xhtml": 1,        # XHTML instead of HTML4
            "indent": 1,              # Pretty; not too much of a performance hit
            "tidy-mark": 0,           # No tidy meta tag in output
            "wrap": 0,                # No wrapping
            "alt-text": "",           # Help ensure validation
            "doctype": 'strict',      # Little sense in transitional for tool-generated markup...
            "force-output": 1,        # May not get what you expect but you will get something
            "char-encoding": 'utf-8',
            "input-encoding": srccode,
            "output-encoding": 'utf-8',
            "numeric-entities": 1,
        }
        if not isinstance(data, unicode):
            try:
                data = data.decode(srccode)
            except (UnicodeDecodeError, LookupError):
                pass
        doc, errors = tidy_document(data, options=BASE_OPTIONS)
        return doc
    except Exception:
        return data
Example 14: dynamic_test_method
def dynamic_test_method(self):
    """This function name doesn't matter much; it can start with `test`,
    but we're going to rename it dynamically below."""
    reportURLstring = '/report?reportname=' + reportItem.metadata['action']
    response = self._my_app.get(reportURLstring)
    code, error = tidylib.tidy_document(response.body,
                                        options={'show-errors': 1, 'show-warnings': 0})
    self.assertFalse(error, '%s did not return valid html page' % reportURLstring)
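For reference, show-errors in HTML Tidy is a count (how many errors to report) rather than an on/off switch, so show-errors: 1 with show-warnings: 0 keeps the error string minimal while still letting assertFalse fail on the first invalid page.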
Example 15: getMenu
def getMenu():
    storeFile = open("list.txt", "r")
    txt = storeFile.read()
    storeFile.close()
    stores = txt.split('\n\n\n')
    for store in stores:
        rest = store.split('\n')
        if len(rest) != 3:
            break
        try:
            url = baseUrl + rest[2] + 'menu'
            print url
            res = urlopen(url)
            html = res.read()
            options = {'output-encoding': 'utf8', 'output-xhtml': 1}
            document, errors = tidy_document(html, options)
            filepath = dataDir + (rest[2].split('/'))[2] + ".html"
            saveFile = open(filepath, "w")
            saveFile.write(document)
            saveFile.close()
            print filepath
        except:
            print "skip:" + url