This article collects typical usage examples of the tidy_document function from Python's tidylib module. If you are wondering what tidy_document does, how to call it, and what real projects use it for, the curated examples here should help.
Below are 15 code examples of tidy_document, drawn from open-source projects and ordered by popularity by default.
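All 15 snippets revolve around the same small API: tidy_document() takes a string of markup plus an optional dict of HTML Tidy configuration options and returns a (document, errors) tuple. As a minimal sketch for orientation (the input string is made up):

from tidylib import tidy_document

document, errors = tidy_document(
    "<p>Hello &amp; goodbye",
    options={'numeric-entities': 1},  # keys are ordinary HTML Tidy option names
)
print(document)  # the cleaned-up, well-formed markup
print(errors)    # newline-separated warnings and errors reported by Tidy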
Example 1: test_doc_with_entity
def test_doc_with_entity(self):
    h = "&eacute;"
    expected = DOC % "&eacute;"
    doc, err = tidy_document(h)
    self.assertEqual(doc, expected)
    # With numeric-entities enabled, Tidy emits the numeric form instead
    expected = DOC % "&#233;"
    doc, err = tidy_document(h, {'numeric-entities': 1})
    self.assertEqual(doc, expected)
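The DOC constant is defined elsewhere in the pytidylib test suite: it is a template of the full document Tidy is expected to emit, with a %s placeholder where the body content lands. A hypothetical stand-in so the test reads on its own:

# Hypothetical stand-in for DOC; the real template in the test suite may differ.
DOC = '''<!DOCTYPE html>
<html>
<head>
<title></title>
</head>
<body>
%s
</body>
</html>
'''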
Example 2: _massage_diff_content
def _massage_diff_content(content):
    tidy_options = {
        'output-xhtml': 0,
        'force-output': 1,
    }
    try:
        content = tidy_document(content, options=tidy_options)
    except UnicodeDecodeError:
        # In case something happens in pytidylib we'll try again with
        # a proper encoding
        content = tidy_document(content.encode('utf-8'), options=tidy_options)
    tidied, errors = content
    content = tidied.decode('utf-8'), errors
    return content
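A caveat when reusing this snippet: older pytidylib releases could hand back encoded bytes (hence the decode step above), while recent releases generally return the same text type they were given, so under Python 3 the tidied.decode('utf-8') call may be unnecessary.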
Example 3: marklogic_put_xml
def marklogic_put_xml(self, item, spider_name):
    # Set the uri and collection
    if self.ml_transform == '':
        params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name}
    else:
        params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name,
                  'transform': self.ml_transform}
    # Set up the XML payload
    payload = dicttoxml(dict(item), attr_type=False, custom_root='webcontent')
    # Decode the XML-escaped characters back again
    payload = payload.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"')
    # Run tidy in order to get well-formed XML
    payload, errors = tidy_document(payload, options={'input-xml': 1})
    # Set up the header
    headers = {'Content-Type': 'application/xml'}
    ml_uri = ('ml_uri' in item and item['ml_uri']) or self.ml_uri
    logging.info("PUTting XML in " + ml_uri + " as " + item['uri'])
    # Call the MarkLogic REST endpoint
    ml_user = ('ml_user' in item and item['ml_user']) or self.ml_user
    ml_pwd = ('ml_pwd' in item and item['ml_pwd']) or self.ml_pwd
    r = requests.put(ml_uri,
                     params=params,
                     auth=HTTPDigestAuth(ml_user, ml_pwd),
                     data=payload,
                     headers=headers)
    logging.info("PUT response: " + str(r.status_code) + ", " + r.text)
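The input-xml option is what makes this work on dicttoxml output: it switches Tidy from HTML parsing to treating the input as generic XML. A minimal sketch (element names are illustrative):

from tidylib import tidy_document

payload = "<webcontent><title>Example</title><body>text &amp; more</body></webcontent>"
fixed, errors = tidy_document(payload, options={'input-xml': 1})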
Example 4: fetch_data
def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(),
                                               u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: no next game on the webpage.
        sys.exit(1)
    except Exception as e:
        # print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
Example 5: call
def call():
    if world.results:
        return
    data = urllib.urlencode(world.params)
    req = urllib2.Request(url="%s/%s?%s" % (world.base_url, world.requesttype, data),
                          headers=world.header)
    fd = urllib2.urlopen(req)
    page = fd.read()
    fmt = world.params.get('format')
    if fmt not in ('html', 'xml', 'json', 'jsonv2'):
        fmt = 'xml' if world.requesttype == 'reverse' else 'html'
    pageinfo = fd.info()
    assert_equal('utf-8', pageinfo.getparam('charset').lower())
    pagetype = pageinfo.gettype()
    if fmt == 'html':
        assert_equals('text/html', pagetype)
        document, errors = tidy_document(page,
                                         options={'char-encoding': 'utf8'})
        assert len(errors) == 0, "Errors found in HTML document:\n%s" % errors
        world.results = document
    elif fmt == 'xml':
        assert_equals('text/xml', pagetype)
        world.results = parseString(page).documentElement
    else:
        if 'json_callback' in world.params:
            func = world.params['json_callback']
            assert page.startswith(func + '(')
            assert page.endswith(')')
            page = page[(len(func) + 1):-1]
            assert_equals('application/javascript', pagetype)
        else:
            assert_equals('application/json', pagetype)
        world.results = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(page)
Example 6: test_xmlns_large_document_xml_corner_case
def test_xmlns_large_document_xml_corner_case(self):
    # Test for a super weird edge case in Tidy that can cause it to return
    # the wrong required buffer size.
    body = '<span><span>A</span></span>' + 'A' * 7937
    html = '<html xmlns="http://www.w3.org/1999/xhtml">' + body
    doc, err = tidy_document(html, {'output-xml': 1})
    self.assertEqual(doc.strip()[-7:], "</html>")
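pytidylib first asks Tidy how large an output buffer the document needs; the 27-character span wrapper plus 7937 filler characters pins an input length at which Tidy has been observed to misreport that size, so the assertion simply checks that the document was not truncated.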
Example 7: nofoutofplacefeatures
def nofoutofplacefeatures(url):
    try:
        if url[:4] == "http":
            r = requests.get(url)
        else:
            url = "http://" + url
            r = requests.get(url)
        data = r.text
        document, errors = tidy_document(data,
                                         options={'numeric-entities': 1})
        # Number of elements out of place: errors is a newline-separated
        # string, so count the messages rather than the characters
        return len(errors.splitlines())
    except:
        pass
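Since errors comes back as a single newline-separated string, it helps to see what individual messages look like. A rough sketch (the exact wording varies by Tidy version):

from tidylib import tidy_document

document, errors = tidy_document("<p>unclosed")
for line in errors.splitlines():
    print(line)  # e.g. "line 1 column 1 - Warning: missing <!DOCTYPE> declaration"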
Example 8: convert_to_html
def convert_to_html(filename):
    # Do the conversion with pandoc
    output = pypandoc.convert(filename, 'html')
    # Clean up with tidy...
    output, errors = tidy_document(output, options={
        'numeric-entities': 1,
        'wrap': 80,
    })
    print(errors)
    # Replace smart quotes with their named entities.
    output = output.replace(u"\u2018", '&lsquo;').replace(u"\u2019", '&rsquo;')
    output = output.replace(u"\u201c", "&ldquo;").replace(u"\u201d", "&rdquo;")
    # Write the output
    filename, ext = os.path.splitext(filename)
    filename = "{0}.html".format(filename)
    with open(filename, 'w') as f:
        # Python 2 "fix". If this isn't a string, encode it.
        if type(output) is not str:
            output = output.encode('utf-8')
        f.write(output)
    print("Done! Output written to: {}\n".format(filename))
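A hypothetical invocation, assuming pandoc is installed and the file is in a format it can read:

convert_to_html('notes.md')  # would write notes.html next to the source file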
Example 9: html2enml
def html2enml(html):
    doc, err = tidy_document(
        html,
        options={
            "output-xhtml": 1,
            "drop-proprietary-attributes": 1,
            "merge-divs": 1,
            "clean": 1,
        }
    )
    root = fromstring(doc)
    # XXX dirty hack to circumvent a bug in the lxml parser
    root = fromstring(etree.tostring(root))
    logging.debug(etree.tostring(root))
    # tidy_document returns a valid HTML document, which means it usually
    # contains an html tag and a proper body element
    root = root.find('body')
    if root is None:
        logging.warn("No body on this document")
        logging.warn(html)
        return "<div></div>"
    root.tag = 'div'
    root = remove_prohibited_elements(root)
    root = remove_prohibited_attributes(root)
    # FIXME: skipping DTD validation because of slow DTD creation speed
    # validate_dtd(html, f)
    return etree.tostring(root)
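The body-to-div conversion reflects the target format: ENML, Evernote's note markup, accepts only a restricted fragment of XHTML rather than a full document, so the tidied page's body is unwrapped into a plain div before the prohibited elements and attributes are stripped.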
Example 10: scrape
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()
    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            # 'indent': 1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        print errs
    doc = html5lib.parse(doc, treebuilder="lxml")
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])
    td = jQuery("td#content")
    assert len(td) == 1
    for img in td("img"):
        img = PyQuery(img)
        src = img.attr("src")
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt
    # need to fix links here
    content = PyQuery(td[0])
    content = no_namespaces(content.html())
    print slug, content[:60]
    if dbteeth:
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
            ),
        )
Example 11: __trading_years
def __trading_years(self, instrument):
    response = urllib2.urlopen('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/%s.phtml' % (instrument))
    document, errors = tidy_document(response.read())
    soup = BeautifulSoup(document)
    node = soup.find('select', attrs={'name': 'year'})
    for option in node.findAll('option'):
        yield option.getText()
Example 12: process_response
def process_response(self, request, response):
    if 'text/html' in response['Content-Type'] and response.content:
        document, errors = tidy_document(response.content)
        if errors:
            raise HTMLValidationError(errors)
    return response
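A sketch of wiring this into a Django project; the module path is made up, and on the older Django versions that use plain process_response classes the middleware is listed in settings like so:

# settings.py (hypothetical module path)
MIDDLEWARE_CLASSES = (
    'myproject.middleware.TidyValidationMiddleware',
)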
Example 13: _tidysrc
def _tidysrc(self, data, srccode):
    """Tidy the scraped HTML source."""
    try:
        from tidylib import tidy_document
        BASE_OPTIONS = {
            "output-xhtml": 1,        # XHTML instead of HTML4
            "indent": 1,              # Pretty; not too much of a performance hit
            "tidy-mark": 0,           # No tidy meta tag in output
            "wrap": 0,                # No wrapping
            "alt-text": "",           # Help ensure validation
            "doctype": 'strict',      # Little sense in transitional for tool-generated markup...
            "force-output": 1,        # May not get what you expect but you will get something
            "char-encoding": 'utf-8',
            "input-encoding": srccode,
            "output-encoding": 'utf-8',
            "numeric-entities": 1,
        }
        if not isinstance(data, unicode):
            try:
                data = data.decode(srccode)
            except (UnicodeDecodeError, LookupError):
                pass
        doc, errors = tidy_document(data, options=BASE_OPTIONS)
        return doc
    except Exception:
        return data
Example 14: dynamic_test_method
def dynamic_test_method(self):
    """This function name doesn't matter much; it can start with `test`,
    but we're going to rename it dynamically below."""
    reportURLstring = '/report?reportname=' + reportItem.metadata['action']
    response = self._my_app.get(reportURLstring)
    code, error = tidylib.tidy_document(response.body,
                                        options={'show-errors': 1, 'show-warnings': 0})
    self.assertFalse(error, '%s did not return valid html page' % reportURLstring)
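For reference, show-errors in HTML Tidy is a count (how many errors to report) rather than an on/off switch, so show-errors: 1 with show-warnings: 0 keeps the error string minimal while still letting assertFalse fail on the first invalid page.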
Example 15: getMenu
def getMenu():
    storeFile = open("list.txt", "r")
    txt = storeFile.read()
    storeFile.close()
    stores = txt.split('\n\n\n')
    for store in stores:
        rest = store.split('\n')
        if len(rest) != 3:
            break
        try:
            url = baseUrl + rest[2] + 'menu'
            print url
            res = urlopen(url)
            html = res.read()
            options = {'output-encoding': 'utf8', 'output-xhtml': 1}
            document, errors = tidy_document(html, options)
            filepath = dataDir + (rest[2].split('/'))[2] + ".html"
            saveFile = open(filepath, "w")
            saveFile.write(document)
            saveFile.close()
            print filepath
        except:
            print "skip:" + url