

Python UnicodeDammit.detwingle Method Code Examples

This article collects typical usage examples of the bs4.UnicodeDammit.detwingle method in Python. If you are wondering what exactly UnicodeDammit.detwingle does, how to call it, or what real-world uses look like, the curated code examples below may help. You can also explore further usage examples of the class it belongs to, bs4.UnicodeDammit.


The following presents 15 code examples of the UnicodeDammit.detwingle method, sorted by popularity by default.
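Before the project-specific examples, here is a minimal, self-contained sketch of what the method does (the same mixed-encoding scenario that Example 10 below walks through; the variable names are purely illustrative): UnicodeDammit.detwingle() takes a bytestring that is mostly UTF-8 but contains embedded Windows-1252 bytes, and returns a bytestring that is pure UTF-8, so it can be decoded without errors.

# Minimal sketch (Python 3); variable names are illustrative, the calls are the documented bs4 API.
from bs4 import UnicodeDammit

snowmen = "\N{SNOWMAN}" * 3
quote = "\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}"

# A single bytes object containing two encodings: UTF-8 followed by Windows-1252.
doc = snowmen.encode("utf8") + quote.encode("windows-1252")
# doc.decode("utf8") would raise UnicodeDecodeError at this point.

# detwingle() rewrites the embedded Windows-1252 bytes as UTF-8,
# so the whole document now decodes cleanly.
fixed = UnicodeDammit.detwingle(doc)
print(fixed.decode("utf8"))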

Example 1: learn

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
    def learn(self, name, phrase, channel):
        name = self.aliases.resolve(name)
        if name not in self.users:
            self.users[name] = True

        if "password" in phrase:
            return
        phrase = phrase.split(" ")
        phrase = filter(lambda x: x and "http" not in x and "ftp:" not in x and x[0] != ".", phrase)
        now = datetime.datetime.utcnow()
        documents = []

        for i in range(len(phrase) + 1):
            seed = UnicodeDammit.detwingle(phrase[i-1] if i > 0 else "")
            answer = UnicodeDammit.detwingle(phrase[i] if i < len(phrase) else "")

            documents.append({
                "name": name,
                "seed": seed,
                "answer": answer,
                "added": now,
                "random": random.random()
            })

        yield self.db.insert(documents, safe=True)
Developer ID: jdpls, Project: Servrhe, Lines: 27, Source: markov.py

Example 2: ramble

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
    def ramble(self, name=None, seed=""):
        if name:
            name = self.aliases.resolve(name)
            if name not in self.users:
                returnValue("")

        message = []

        if seed:
            seed = UnicodeDammit.detwingle(seed)
            chunk = seed
            while chunk and len(" ".join(message)) < 300:
                message.append(chunk)
                chunk = yield self.prev(name, chunk)
            message.reverse()

        chunk = yield self.next(name, seed)
        while chunk and len(" ".join(message)) < 300:
            message.append(chunk)
            chunk = yield self.next(name, chunk)
            if not chunk and len(" ".join(message)) < 30:
                chunk = yield self.next(name, chunk)

        response = (" ".join(message)).decode("utf8")
        if seed and response == seed.decode("utf8"):
            response = yield self.ramble(name)
        returnValue(response)
Developer ID: jdpls, Project: Servrhe, Lines: 29, Source: markov.py

Example 3: __init__

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
 def __init__(self,url):# logs info,warning,error,critical,debug events.
     '''
     Description: This is the class constructor and is going to get a simple url as input and parse it based on RFC1738.
     Status: In Progress.
     Usage: This is going to be used by the connection manager and the active/passive scanner to extract url variables.
     '''
     self.url = UnicodeDammit.detwingle(url, 'UTF-8')        
     self.defaultHttpsPort = 443
     self.defaultHttpPort = 80
     urlLogger.logInfo("--- Package: UrlManager - Module: UrlHandler Class: urlHandler Initiated ---")
Developer ID: rekcahemal, Project: CapCake, Lines: 12, Source: URLAnalyzer.py

Example 4: selectdir

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
   def selectdir(geturl):
      r = scraper.get(geturl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      rt = UnicodeDammit.detwingle(r.text)
      html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
      if debug == 1:
         orenc = str(html.original_encoding)
         print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
      findlinks = html.findAll('a')
      dirlist = []
      for link in findlinks:
         b = link.get('href')
         if not re.match(r'^((\.\.)?\/)$', str(b)):
            if re.search(r'^(.*)(\/)$', str(b)):
               dirlist.append(b)

      p = urlparse(geturl)
      part = p.path.split('/')[-1]
      path = p.path.rstrip(part)
      if '/' not in path[:1]:
         path = '/' + path
      urlfqdn = p.scheme + '://' + p.netloc
      parent = urlfqdn + path

      i = 0
      dirtotal = len(dirlist)
      if dirtotal > 0:
         print('\nFOUND %d DIRECTORIES: \n' % dirtotal)
         while i < dirtotal:
            sel = i + 1
            print(str(sel) + ' - ' + str(dirlist[i]))
            i += 1
         print('')
         lim = dirtotal + 1
         matchtop = r'^(%s)(\/)?$' % urlfqdn
         if not re.match(matchtop,geturl):
            print('0 - BACK TO PARENT DIRECTORY \n')
            startsel = '0-%d' % dirtotal
         else:
            startsel = '1-%d' % dirtotal
         selectdir = raw_input('make a selection [%s] --> ' % startsel)
         if not int(selectdir) in range(0, lim):
            selectdir = raw_input('invalid entry. please enter a selection %s --> ' % startsel)
         if selectdir == '0':
            geturl = parent
            subcont = 0
         else:
            n = int(selectdir) - 1
            usedir = dirlist[n]
            geturl = parent + usedir
            subcont = 1
      else:
         print('\nNO DIRECTORIES FOUND. using current directory.. \n')
         subcont = 0
         geturl = parent + part
      return geturl, subcont, parent
Developer ID: johnjohnsp1, Project: cloudget, Lines: 57, Source: cloudget.py

Example 5: formatForReddit

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
    def formatForReddit(self, feedEntry, postType, subreddit, raw):
        if 'content' in feedEntry:
          content = feedEntry['content'][0]['value']
        elif 'description' in feedEntry:
          content = feedEntry.description
        else:
          content = ''
        logging.debug(content)
        parser = EveRssHtmlParser()
        
        title = feedEntry['title']

        # some feeds like Twitter are raw so the parser hates it.
        if (raw):
          regex_of_url = '(https?:\/\/[\dA-z\.-]+\.[A-z\.]{2,6}[\/\w&=#\.\-\?]*)'
          title = re.sub(regex_of_url, '', title)
          clean_content = content.replace(' pic.twitter.com', ' http://pic.twitter.com')
          clean_content = re.sub(regex_of_url, '<a href="\\1">link</a>', clean_content)
          clean_content = UnicodeDammit.detwingle(clean_content)
          #logging.info(clean_content)
          u = UnicodeDammit(clean_content, 
                      smart_quotes_to='html', 
                      is_html = False )
          # fix twitter putting ellipses on the end
          content = u.unicode_markup.replace(unichr(8230),' ...')
          logging.debug('.....')
        
        if "tumblr.com" in content:
          # Replace with larger images (hopefully such images exist)
          content = content.replace('_500.', '_1280.')
        
        # Added the .replace because the parser does something funny to them and 
        # removes them before I can handle them
        content = content.replace('&nbsp;', ' ')
        content = content.replace('&bull;', '*').replace('&middot;','*')
        content = content.replace('&ldquo;','\'').replace('&rdquo;','\'')
        content = re.sub('( [ ]+)', ' ', content)
        parser.feed(content)
        parser.comments[0] = '%s\n\n%s' %(feedEntry['link'], parser.comments[0])
        parser.comments[-1] += self.config['signature']
        
        if 'author' in feedEntry:
          author = '~' + feedEntry['author'].replace('@', ' at ')
        else:
          author = ''

        return {'comments': parser.comments,
                'link':     feedEntry['link'],
                'subreddit': subreddit,
                'title':    '[%s] %s %s' %(postType, title, author)}
Developer ID: eveRedditBot, Project: eve_reddit_bot, Lines: 52, Source: main.py

Example 6: normalize

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def normalize(s):
    if isinstance(s, unicode):
        return s

    try:
        u = s.decode("utf8")
    except:
        try:
            u = (s[:-1]).decode("utf8")
        except:
            try:
                u = UnicodeDammit.detwingle(s).decode("utf8")
            except:
                u = UnicodeDammit(s, ["utf8", "windows-1252"]).unicode_markup

    return u
Developer ID: skiddiks, Project: Servrhe, Lines: 18, Source: irc.py

Example 7: getpage

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
 def getpage(cfurl):      
    r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    if 'text' in r.headers.get('Content-Type'):
       rt = UnicodeDammit.detwingle(r.text)
       html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
       print('\r\n--------------------------------------------------------\r\n')
       if debug == 1:
          orenc = str(html.original_encoding)
          print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
       bs = html.prettify(formatter=None)
       print(bs)
       print('\r\n--------------------------------------------------------\r\n')
    else:
       found = -1
    
    if debug == 1:
       print('\n\033[34mDEBUG: finished list length: \033[37;1m%d \033[0m\n' % len(finished))
Developer ID: johnjohnsp1, Project: cloudget, Lines: 19, Source: cloudget.py

Example 8: format

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
    def format(self, script):
        dammit = UnicodeDammit.detwingle(script)
        soup = BeautifulSoup(dammit, from_encoding="utf8")
        header = soup.find('subtitle_script')
        header = "[Script Info]\nTitle: "+header['title']+"\nScriptType: v4.00+\nWrapStyle: "+header['wrap_style']+"\nPlayResX: 624\nPlayResY: 366\nScaledBorderAndShadow: yes\nYCbCr Matrix: TV.709\n\n";
        styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n";
        events = "\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n";
        stylelist = soup.findAll('style')
        eventlist = soup.findAll('event')
        
        for style in stylelist:
            styles += "Style: " + style['name'] + "," + style['font_name'] + "," + style['font_size'] + "," + style['primary_colour'] + "," + style['secondary_colour'] + "," + style['outline_colour'] + "," + style['back_colour'] + "," + style['bold'] + "," + style['italic'] + "," + style['underline'] + "," + style['strikeout'] + "," + style['scale_x'] + "," + style['scale_y'] + "," + style['spacing'] + "," + style['angle'] + "," + style['border_style'] + "," + style['outline'] + "," + style['shadow'] + "," + style['alignment'] + "," + style['margin_l'] + "," + style['margin_r'] + "," + style['margin_v'] + "," + style['encoding'] + "\n"

        for event in eventlist:
            events += "Dialogue: 0,"+event['start']+","+event['end']+","+event['style']+","+event['name']+","+event['margin_l']+","+event['margin_r']+","+event['margin_v']+","+event['effect']+","+event['text']+"\n"

        formattedSubs = header+styles+events
        return formattedSubs
Developer ID: RHExcelion, Project: Servrhe, Lines: 20, Source: crunchy.py

Example 9: to_unicode

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def to_unicode(data, is_html=False, detwingle=False, verbose=True,
               lang=None):
    " converts everything to unicode"
    dammit = UnicodeDammit(data, is_html=is_html)
    if detwingle and dammit.original_encoding == 'windows-1252':
        new_data = UnicodeDammit.detwingle(data)
        dammit = UnicodeDammit(new_data, is_html=is_html)

    if verbose:
        sys.stderr.write("Original encoding (via BS): %s\n" %
                         (dammit.original_encoding))

    if lang is None:
        return dammit.unicode_markup

    if lang == 'auto':
        lang = _guess_lang_from_data(dammit.unicode_markup, is_html=is_html)
        if verbose:
            sys.stderr.write("Detected language: %s\n" % (lang))

    return _to_unicode_chared(data, lang, verbose=verbose)
Developer ID: christianbuck, Project: CorpusMining, Lines: 23, Source: encoding.py

Example 10: unicode_dammit_example

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
def unicode_dammit_example():
    # Install the 'chardet' or 'cchardet' Python libraries for better guesses

    ### Take a string with unknown encoding and make the string Unicode
    weirdass_string = "Sacr\xc3\xa9 bleu!"
    dammit = UnicodeDammit(weirdass_string)
    print "Original Word with weird encoding:", weirdass_string
    print "Dammit Print:", (dammit.unicode_markup)
    print "Dammit Type:", (dammit.original_encoding)

    ### Take a doc with mostly UTF-8 encoding (and misc encodings due to mult
    # data sources) and convert to UTF-8 Unicode with .Dammit.detwingle()
    snowmen = (u"\N{SNOWMAN}" * 3)
    quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
    doc = snowmen.encode("utf8") + quote.encode("windows-1252")
    # So now we have one doc with two encodings in it, printing is a mess
    #print "Weird Decoding doc with utf8:", doc # messed up, won't print
    #print (doc.decode("windows-1252")) # So messed up it doesn't even print

    # Decode using UnicodeDammit.detwingle() converts the string to pure UTF-8
    new_doc = UnicodeDammit.detwingle(doc)
    print new_doc.decode("utf8")
Developer ID: HunterAllman, Project: Python, Lines: 24, Source: beautifulsoup_example.py

Example 11: to_unicode

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
    def to_unicode(data, is_html=False, detwingle=False, verbose=False,
                   lang=None):
        """ Produce unicode from text of unknown encoding.
        Input: bytestring """
        dammit = UnicodeDammit(data, is_html=is_html)
        if detwingle and dammit.original_encoding == 'windows-1252':
            new_data = UnicodeDammit.detwingle(data)
            dammit = UnicodeDammit(new_data, is_html=is_html)

        if verbose:
            sys.stderr.write("Original encoding (via BS): %s\n" %
                             (dammit.original_encoding))

        if lang is None:
            return dammit.unicode_markup

        if lang == 'auto':
            lang = TextSanitizer.guess_lang_from_data(
                dammit.unicode_markup, is_html=is_html)
            if verbose:
                sys.stderr.write("Detected language: %s\n" % (lang))

        return TextSanitizer._to_unicode_chared(data, lang, verbose=verbose)
Developer ID: christianbuck, Project: CorpusMining, Lines: 25, Source: textsanitzer.py

Example 12: getlinks

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
 def getlinks(cfurl):
    r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    rt = UnicodeDammit.detwingle(r.text)
    html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
    if debug == 1:
       orenc = str(html.original_encoding)
       print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
    bs = html.prettify(formatter=None)
    linkresult = html.findAll('a')
    if len(linkresult) > 0:
       foundlinks = len(linkresult)
       print('\nFOUND %s LINKS AT %s:\n' % (str(foundlinks), cfurl))
       for link in linkresult:
          b = link.get('href')
          b = str(b)
          if b not in cfurl and not re.match(r'^(\.\.)?\/$', b):
             print(b)
       print('')
    else:
       print('\nNO LINKS FOUND.\n')
       foundlinks = 0
    time.sleep(4)
    return foundlinks
Developer ID: johnjohnsp1, Project: cloudget, Lines: 25, Source: cloudget.py

Example 13: slim_html

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
 def slim_html(self, raw_html):
     doc = UnicodeDammit.detwingle(raw_html)
     soup = BeautifulSoup(doc, "html5lib", from_encoding="utf-8")
     return soup.prettify().encode("utf-8");
Developer ID: lite, Project: MyTestBox, Lines: 6, Source: s.py

Example 14: followlinks

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
   def followlinks(bx):
      p = urlparse(bx)
      if '/' not in p.path[-1:]:
         part = p.path.split('/')[-1]
         path = p.path.rstrip(part)
      else:
         path = p.path
      if '/' not in path[:1]:
         path = '/' + path
      urlfqdn = p.scheme + '://' + p.netloc
      parent = urlfqdn + path + '/'
      s = scraper.get(bx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      print('\n----------------------------------------------------------- \n')
      print(s)
      print('\n')
      scr = UnicodeDammit.detwingle(s.text)
      shtml = BeautifulSoup(scr, "html.parser")
      if debug == 1:
         orenc = str(shtml.original_encoding)
         print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
      print('\n----------------------------------------------------------- \n')
      sfindlinks = shtml.findAll('a')
      slen = len(sfindlinks)
      sdirs = []
      si = 0
      while si < slen:
         for slink in sfindlinks:
            if debug == 1:
               print('\n\033[34;1mSLINK LOOP\r\n\033[32;21m* si = %d, si < %d\033[0m\n' % (si, slen))
            sl = slink.get('href')
            si += 1
            if sl:
               if not re.search(r'^((\.\.)?\/)$', str(sl)):
                  if '/' in bx[-1:]:
                     if 'http' not in sl[:4]:
                        sl = sl.lstrip('/')
                        sx = bx + sl
                     else:
                        sx = sl
                     print(sx)
                     getCF(sx, 0)
                     ss = scraper.get(sx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
                     bs = BeautifulSoup(ss.text, "html.parser")
                     if bs is not None:                        
                        if debug == 1:
                           orenc = str(bs.original_encoding)
                           print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
                        pagehead = bs.html.head.contents
                        pagehead = str(pagehead)
                        if pagehead:
                           pagetitle = re.search(r'<title>(.*)<\/title>', pagehead)
                           pagetitle = str(pagetitle.group(1))
                           bigtitle = pagetitle.upper()
                           titlestars = lambda a: '*' * (len(str(a)) + 4)
                           pagestars = titlestars(pagetitle)
                           print('\n\033[40m\033[33m%s\n\033[34;1m* %s * \n\033[40m\033[33;21m%s\n\033[0m' % (pagestars, bigtitle, pagestars)) 
                     sb = bs.find_all('a', href = re.compile(r'.+$'))
                     #sb = bs.findAll('a')
                     sblen = len(sb)
                     if sblen > 0:
                        n = 0
                        while n < sblen:
                           for sbl in sb:
                              if debug == 1:
                                 print('\n\033[35;1mSBL LOOP\r\n\033[37;21m* n = %d, n < %d \033[0m\n' % (n, sblen))
                              if sbl is not None:
                                 sr = sbl.get('href').strip()
                                 sr = str(sr)
                                 print('\n* %s \n') % sr
                                 if not re.search('http', sr[:4]):
                                    parent = getparent(sx)
                                    srs = sr.lstrip('/')
                                    sr = parent + srs
                                 if re.match(r'([^.]+\/)$', str(sr)):
                                    followlinks(sr)
                                    sdirs.append(sr)
                                 else:
                                    if '/' not in sr[-1:]:
                                       getCF(sr, 0)
                                       sdirs.append(sr)
                                 n += 1
                              else:
                                 n += 1
                                 continue

                  elif 'Error-222' in bx:
                     print('\nuh-oh. might have triggered a flag with cloudflare.\n')
                     for i in xrange(10,0,-1):
                        time.sleep(1)        
                        print('delaying request for %d seconds.. \r' % i)
                        sys.stdout.flush()
                     break
                  else:
                     if not re.search('http', str(sl[:4])):
                        parent = getparent(bx)
                        sl = sl.lstrip('/')
                        sx = parent + sl
                     else:
                        sx = str(sl)

#......... part of the code omitted here .........
Developer ID: johnjohnsp1, Project: cloudget, Lines: 103, Source: cloudget.py

Example 15: UnicodeDammit

# Required import: from bs4 import UnicodeDammit [as alias]
# Or: from bs4.UnicodeDammit import detwingle [as alias]
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<b><!-- i am a comment --></b>

<p class="story">...</p>
"""

# Encoding detection; a list of candidate encodings can be supplied
dammit = UnicodeDammit(html_doc, ["utf8", "gbk"])
if 0:
    print dammit.original_encoding

# Convert embedded Windows-1252 characters in the string
html_doc = UnicodeDammit.detwingle(html_doc)

# Document parsing diagnostics
# diagnose(html_doc)


# Parse only part of the document, for efficiency
only_a_tag = SoupStrainer("a")

# html_doc can be a file object or a string
soup = BeautifulSoup(html_doc, features=["lxml"], from_encoding='utf8')
'''
:param: features=[] list of parsers to use
:param: from_encoding='utf8' input encoding
:param: parse_only a SoupStrainer instance
'''
Developer ID: duanyifei, Project: python_modules_test, Lines: 33, Source: test_BeautifulSoup.py


Note: The bs4.UnicodeDammit.detwingle examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Please refer to each project's License before redistributing or using the code, and do not reproduce this article without permission.