

Python unidecode.unidecode Function Code Examples

This article collects typical usage examples of the unidecode function from the Python module unidecode. If you are unsure what the unidecode function does, how to call it, or what real-world usage looks like, the curated examples below should help.


The sections below show 15 code examples of the unidecode function, sorted by popularity by default.
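Before turning to the community examples, here is a minimal sketch of basic usage: unidecode takes a Unicode string and returns the closest plain-ASCII approximation it can produce. The sample strings below are illustrative and not taken from the projects that follow.

from unidecode import unidecode

# accented Latin, Greek and CJK input all map to an ASCII approximation
print(unidecode(u'déjà vu'))     # deja vu
print(unidecode(u'Ελληνικά'))    # Ellenika
print(unidecode(u'北京'))         # 'Bei Jing ' (note the trailing space)

# equivalently, via the qualified module name (as Examples 5 and 9 do):
#   import unidecode
#   unidecode.unidecode(u'São Paulo')  # -> 'Sao Paulo'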

Example 1: crawl_video_urls

def crawl_video_urls(url='http://documentaryheaven.com/category/space/'):
    # Python 2 excerpt: MyOpener (a URL opener class) and the module-level
    # `results` list are defined elsewhere in crawl.py.
    myopener = MyOpener()
    page = myopener.open(url)
    page = page.read()

    html = BeautifulSoup(page, "lxml")

    # find all class=post
    posts = html.find_all('div', class_="post")

    # for each class=post:
    for p in posts:
        obj = {}
        #class=post-title --> a (href, string)
        title = p.find('h2').find('a')
        obj['url'] = title['href']
        obj['title'] = unidecode(title.string)
        abstract = p.find('div', class_='browse-description').find('p')
        obj['abstract'] = unidecode(abstract.string).replace('\n', '').replace('\r\r', ' ').strip()
        #class=browse-description --> p (string)

        results.append(obj)
    # next page: class=next --> (href)
    next_page = html.find('a', class_="next page-numbers")

    if not next_page:
        return None
    print results
    print next_page['href']

    return crawl_video_urls(url=next_page['href'])
Developer: Mec-iS, Project: hypermedia-bulkclient, Lines: 31, Source: crawl.py

Example 2: test_ascii

	def test_ascii(self):
		# Python 2 test: xrange, unichr and the byte-string RuntimeWarning
		# captured below do not exist under Python 3.

		log = []
		def showwarning_new(message, category, *args):
			if ("not an unicode object" in str(message)) and \
					(category is RuntimeWarning):
				log.append((message, category))
			else:
				showwarning_old(message, category, *args)

		showwarning_old = warnings.showwarning
		warnings.showwarning = showwarning_new
		warnings.filterwarnings("always")

		for n in xrange(0,128):
			t = chr(n)
			self.assertEqual(unidecode(t), t)

		# Passing string objects to unidecode should raise a warning
		self.assertEqual(128, len(log))
		log = []

		for n in xrange(0,128):
			t = unichr(n)
			self.assertEqual(unidecode(t), t)

		# unicode objects shouldn't raise warnings
		self.assertEqual(0, len(log))

		warnings.showwarning = showwarning_old
Developer: AmyMalone, Project: datascience-fall14, Lines: 30, Source: basic_2.py
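Note that test_ascii above exercises unidecode's Python 2 behavior, where passing a byte string triggered a RuntimeWarning. Under Python 3 the function expects str input, so bytes should be decoded first; a minimal sketch (the sample string is ours):

from unidecode import unidecode

raw = 'Ünïcode'.encode('utf-8')        # bytes, e.g. read from a file or socket
print(unidecode(raw.decode('utf-8')))  # Unicode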

Example 3: extract_info

def extract_info(article):
    '''
    INPUT: dict object with output from the api
    OUTPUT: bool if extraction was successful or not,
            dict object to insert into mongodb
    '''
    headline = unidecode(article['title']['$text'])
    date_published = str(article['pubDate']['$text'])
    try:
        author = [str(a['name']['$text']) for a in article['byline']]
    except:
        author = None  # article has no byline
    try:
        url = str(article['link'][0]['$text'])
    except:
        return False, ''  # no usable link; skip this article
    try:
        article_text = unidecode(' '.join([line.get('$text', '\n') for line in article['text']['paragraph']]))
    except:
        return False, ''  # body paragraphs missing or malformed
    insert = {'url': url,
              'source': 'npr',
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert
Developer: ewellinger, Project: election_analysis, Lines: 27, Source: npr_scraper.py

Example 4: do_the_trick

def do_the_trick():
    # Python 2 excerpt: do_fetch, kws, M, OSC, oscClient, oscClient_ari,
    # categorias_emotivas and the gesto_to_* tables are defined elsewhere
    # in teleText.py.
    print "\n\t-->> [Collecting]"
    global dot
    global samples
    dot, samples = do_fetch()
    if len(samples)>0:
        print "\n\t-->> [Playing]:"
    for ind_s, s in enumerate(samples):
        print "\n<.%s.>" % s['text']
        # treat msg for spacing and tokenizing
        for j, k in enumerate(kws):
            if unidecode(k).lower() in unidecode(s['text']).lower():
                newTweet = M.tweetmetanalyze(unidecode(s['text']))
                ste = newTweet
                print "U:", ste

                #here, send osc
                try:
                    cmnd = OSC.OSCMessage("/tweet")
                    cmnd.append(ste)
                    cmnd.append(ind_s)
                    oscClient.send(cmnd)

                    cmnd = OSC.OSCMessage("/palabra")
                    cmnd.append(categorias_emotivas[gesto_to_class[k]])
                    cmnd.append(gesto_to_inten[k])
                    oscClient_ari.send(cmnd)

                except:
                    print '\n\tAquí le falló\n\t'
        sleep(randint(1,5))
Developer: KernelPanicCode, Project: teleText, Lines: 32, Source: teleText.py

Example 5: path

def path(start, end):
  # Flask route excerpt: find_path and jsonify are imported elsewhere in main.py
  start = unidecode.unidecode(start)
  end = unidecode.unidecode(end)
  print start
  print end
  path = find_path(start, end)
  return jsonify(path=path)
Developer: makinj, Project: wikidist, Lines: 7, Source: main.py

Example 6: getIndividualSubject

def getIndividualSubject(roster_semester, subject):
  # Python 2 excerpt: COURSE_ROSTER_API_CLASSES, COURSE_DICT and the Course
  # class are defined elsewhere in parse_data.py
  url = COURSE_ROSTER_API_CLASSES + roster_semester + '&subject=' + subject
  soup = BeautifulSoup(requests.get(url).text)

  classes = soup.find_all('class')
  for c in classes:
    listing = subject + c.find('catalognbr').text
    if listing not in COURSE_DICT:
      name = unidecode(c.find('titlelong').text.replace('\n', ' '))
      units_min = c.find('unitsminimum').text
      units_max = c.find('unitsmaximum').text
      if units_min == units_max:
        credits = units_min
      else:
        credits = units_min + "-" + units_max
      course_obj = Course(listing,name,credits)
      course_obj.description   = unidecode(c.find('description').text.replace('\n', ' '))
      course_obj.offered       = unidecode(c.find('catalogwhenoffered').text.replace('\n', ' '))
      course_obj.prerequisites = unidecode(c.find('catalogprereqcoreq').text.replace('\n', ' '))
      course_obj.arts_tags     = unidecode(c.find('catalogdistr').text.replace('\n', ' '))
      crosslists = []
      for combination in c.find_all('combination'):
        crosslists.append(combination.find('subject').text + combination.find('catalognbr').text)
      course_obj.crosslisted_classes = ";".join(crosslists)
      COURSE_DICT[listing] = course_obj
      print str(course_obj)
      print '-' * 50
Developer: eric-chahin, Project: CS5150, Lines: 27, Source: parse_data.py

Example 7: parseCISI

def parseCISI(title, tmdb_title=None):
    # Python 2 excerpt: search() and getStreams() wrap the canistream.it API
    # and are defined elsewhere in chooseMovie.py
    movs = search(title)
    mov = None
    mov_id = None
    imdb_id = None
    year = None
    ss = []
    sel = 'n'
    if movs is not None and len(movs) > 0:
        for m in movs:
            cisi_title = unidecode(m['title']).replace(',', '')
            if cisi_title.lower() == title.lower():
                sel = 'y'
                break
            elif title.lower() in cisi_title.lower() or cisi_title.lower() in title.lower():
                sel = raw_input(
                    "Matching '{}' with canistream.it '{}' ({})... OK? [y or n] ".format(
                        title
                        , cisi_title
                        , m['_id']
                    )
                ).lower()
                if sel == 'y':
                    break
                else:
                    print("Trying again...")
    elif tmdb_title is not None:
        movs = search(tmdb_title)
        sel = 'n'
        if movs is not None and len(movs) > 0:
            for m in movs:
                cisi_title = unidecode(m['title'].decode('utf-8'))
                if cisi_title.lower() == tmdb_title.lower():
                    sel = 'y'
                    break
                elif tmdb_title.lower() in cisi_title.lower() or cisi_title.lower() in tmdb_title.lower():
                    sel = raw_input(
                        "Matching TMDB '{}' with canistream.it '{}' ({})... OK? [y or n] ".format(
                            tmdb_title
                            , cisi_title
                            , m['_id']
                        )
                    ).lower()
                    if sel == 'y':
                        break
                    else:
                        print("Trying again...")
    if sel == 'y':
        mov = m
        mov_id = str(m['_id'])
        year = int(m['year'])
        if 'imdb' in m['links'].keys():
            imdb_id = str(m['links']['imdb'].split("/")[-2])
    else:
        print("Unable to find match in canistream.it for '{}'".format(title))
    if mov is not None:
        ss = getStreams(mov_id)
        print("* MATCHED canistream.it")
    elif tmdb_title is not None:
        print("Streaming availability won't be available.")
    return mov_id, year, ss, imdb_id
Developer: jkroening, Project: netflix-queue-sorter, Lines: 60, Source: chooseMovie.py

Example 8: _alignBySplittingToken

    def _alignBySplittingToken(self, tag, word, t_iter):
        # alignment helper: splits a tag whose word spans several tokens
        # (Token and self.tokenizer come from the enclosing class)
        self.logger.debug('tag %s exceeds word %s', repr(tag.word), repr(word))
        tmp = list(tag)
        words = [word]
        asciis = [unidecode(word).replace('-', '')]
        tag_word = ''.join(self.tokenizer.split(tag.word))
        aligned = lambda: ''.join(asciis) == tag_word
        max_len = len(tag_word)
        aligned_tags = []

        while not aligned() and sum(map(len, asciis)) < max_len:
            words.append(next(t_iter))
            asciis.append(unidecode(words[-1]).replace('-', ''))

        if aligned():
            self.logger.debug('dropping tag %s [%s] for words "%s"',
                              repr(tag.word), tag[-1], ' '.join(words))

            for w, a in zip(words, asciis):
                tmp[0] = w
                tmp[1] = a
                self.logger.debug('adding tag %s [%s]', repr(w), tmp[-1])
                aligned_tags.append(Token(*tmp))

                for p in (3, 4):
                    if tmp[p].startswith('B-'):
                        tmp[p] = 'I' + tmp[p][1:]
        else:
            raise RuntimeError('alignment of words %s as %s to token "%s" as "%s" failed' % (
                repr(words), repr(asciis), tag.word, tag_word
            ))

        return aligned_tags
Developer: fnl, Project: libfnl, Lines: 34, Source: analysis.py

Example 9: parse_authors

    def parse_authors(self):
        # Create authors. Python 2 excerpt: data_io, nlp and author are
        # sibling modules in the Kdd2013AuthorPaperIdentification project.
        print "Parsing Authors..."
        f = open(data_io.get_paths()["author_processed_path"], "r")
        titles = f.readline()  # skip the CSV header line
        for l in f.readlines():
            res = l.strip().split(",")
            # Titles
            raw_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
            (name, surname) = nlp.filter_title(raw_title)
            try:
                self.surnames[surname] = self.surnames[surname] + 1
            except:
                self.surnames[surname] = 1

            #Affiliations
            raw_affiliation = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
            affiliation = nlp.filter_affiliation(raw_affiliation)
            try:
                self.affiliations[affiliation] = self.affiliations[affiliation] + 1
            except:
                self.affiliations[affiliation] = 1
            self.authors[int(res[0])] = author.Author(int(res[0]), name, surname, affiliation)

        print "Done"
        f.close()
Developer: sjuvekar, Project: Kdd2013AuthorPaperIdentification, Lines: 26, Source: parser.py

Example 10: _sort_glossary

def _sort_glossary(qresult, lang):
    """
    Group the glossary terms returned by the backend search engine into
    alphabetical sections
    """
    glossary_content = []
    letters = []
    letters_found = OrderedDict()
    field = "glossary_term_lang_" + lang

    for i in string.ascii_uppercase:
        letters.append(i)
        letters_found[i] = 0

    if len(qresult) > 0:
        # Process results
        from itertools import groupby

        items = [o.get_stored_fields() for o in qresult]
        items = sorted(items, key=lambda x: unidecode(x[field]))
        for k, g in groupby(items, key=lambda x: unidecode(x[field])[0]):
            letters_found[k] = 1
            glossary_content.append(
                {
                    "letter": k,
                    "terms": [
                        {"term": item[field], "description": item["glossary_description_lang_" + lang]} for item in g
                    ],
                }
            )

    return letters, letters_found, glossary_content
Developer: emory-libraries, Project: voyages, Lines: 31, Source: views.py
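The key trick in Example 10 is using unidecode as the sort and group key, so accented terms file under their plain-ASCII initial regardless of locale. A self-contained sketch of the same idea, with made-up sample terms:

from itertools import groupby
from unidecode import unidecode

terms = ['Éclair', 'Apple', 'Ångström', 'banana']
terms.sort(key=lambda t: unidecode(t).upper())
for letter, group in groupby(terms, key=lambda t: unidecode(t)[0].upper()):
    print(letter, list(group))
# A ['Ångström', 'Apple']
# B ['banana']
# E ['Éclair']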

Example 11: fast_iter

def fast_iter(context, func, *args, **kwargs):
	# context is an lxml etree.iterparse iterator over a DBLP XML dump
	collaborations = [u'www', u'phdthesis', u'inproceedings', u'incollection', u'proceedings', u'book', u'mastersthesis', u'article']
	# xml record categories that close an author/title group
	author_array = []
	title = ''

	#read chunk line by line
	#we focus author and title
	for event, elem in context:
		if elem.tag == 'author':
			author_array.append(unidecode(elem.text))

		if elem.tag == 'title':
			if elem.text:
				title = unidecode(elem.text)

		if elem.tag in collaborations:
			if author_array and title:
				# a rejected paper has no author or title, so it is skipped

				for a in author_array:
					func(a+"||"+title, *args, **kwargs)
					#write into kv file

				title = ''
				del author_array[:]

		elem.clear()
		while elem.getprevious() is not None:
			del elem.getparent()[0]
	del context
Developer: songmw90, Project: dblp-parser, Lines: 32, Source: dblp-parse-python3.py

Example 12: get_forms

def get_forms(c):
    # c is a cursor over a pokedex SQLite database; `forms` is a module-level
    # list that this function appends to
    global forms

    c.execute("SELECT DISTINCT species_id FROM pokemon WHERE id IN (SELECT pokemon_id FROM pokemon_forms WHERE form_identifier != 'NULL' ORDER BY pokemon_id) ORDER BY species_id")
    species_ids = c.fetchall()

    for i in range(len(species_ids)):
        c.execute("SELECT name FROM pokemon_species_names WHERE pokemon_species_id=%d AND local_language_id=9" % species_ids[i][0])
        species_name = str(unidecode(c.fetchone()[0])).replace("-","_").replace(" ","_").replace(".","").replace("'","")

        c.execute("SELECT pokemon_form_id,form_name FROM pokemon_form_names WHERE pokemon_form_id IN (SELECT id FROM pokemon_forms WHERE pokemon_id IN (SELECT id FROM pokemon WHERE species_id=%s)) AND local_language_id=9" % species_ids[i][0])
        species_forms = c.fetchall()

        form_index = []
        form_index += [species_name]
        for j in range(len(species_forms)):
            form_name = "STANDARD" if species_forms[j][1] == None else str(unidecode(species_forms[j][1])).replace("-","_").replace(" ","_").replace(".","").replace("'","").upper()
            form_name = form_name.replace("_FORME","").replace("_FORM","").replace("_TYPE","").replace("_ROTOM","").replace("???","QUESTION_MARK").replace("!","EXCLAMATION_MARK")
            form_name = form_name.replace("?","QUESTION_MARK").replace("_PATTERN","").replace("_KYUREM","").replace("_MODE","")

            if "MEGA" in form_name and "_X" in form_name:
                form_name = "MEGA_X"
            elif "MEGA" in form_name and "_Y" in form_name:
                form_name = "MEGA_Y"
            elif "MEGA" in form_name:
                form_name = "MEGA"

            form_index += [(species_forms[j][0], form_name)]

        forms += [form_index]
Developer: codemonkey85, Project: LibPKMN, Lines: 30, Source: generate_cpp_enums.py
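Examples 6 and 12 share a pattern worth naming: transliterate with unidecode first, then strip or replace punctuation to produce a safe identifier. A compact sketch of that pattern (the helper name to_identifier is ours, not from either project):

import re
from unidecode import unidecode

def to_identifier(name):
    # transliterate to ASCII, then collapse non-alphanumeric runs to '_'
    ascii_name = unidecode(name)
    return re.sub(r'[^A-Za-z0-9]+', '_', ascii_name).strip('_').upper()

print(to_identifier("Flabébé"))   # FLABEBE
print(to_identifier("Mr. Mime"))  # MR_MIME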

Example 13: scrape_wikitables

def scrape_wikitables():
    """Scrapes wikipedia for the list of current top boxers"""

    champURL = "https://en.wikipedia.org/wiki/List_of_current_boxing_rankings"
    page = urllib.request.urlopen(champURL)
    soup = bs4.BeautifulSoup(page, "html5lib")

    tables = soup.find_all("table", {"class": "wikitable"})
    unique_boxers = []

    for table_number in range(1, 6):
        table = tables[table_number]
        rows = table.find_all("tr")
        for row in rows:
            data = row.find_all("td")
            text = [i.text for i in data]
            for cell in text:
                if len(cell) > 3:
                    matches = re.findall(r"\S{3,}\ .[^\ \(]+", cell.rstrip('\n'))
                    if matches and unidecode(matches[0]) not in unique_boxers:
                        unique_boxers.append(unidecode(matches[0]))

    unique_boxers.sort()
    return unique_boxers
Developer: WnndGws, Project: OneOffCodes, Lines: 26, Source: boxing.py

Example 14: main

def main():
    # Python 2 Hadoop-streaming mapper; urllib2, HTTPError, URLError and
    # parseResponse are imported/defined elsewhere in the script. The appid
    # query parameter (an email address) was redacted by the source page.
    URL_SENTIMENT140 = 'http://www.sentiment140.com/api/[email protected]'
    tweets = []
    for line in sys.stdin:
        try:
            tweetData = json.loads(line.decode('utf-8'))
            location = tweetData['user']['location'].strip()
            if location is None or bool(re.search(r'\d',location)):
                location = 'unknown'
            tempDataDict = {'text': unidecode(tweetData['text']), 'location':\
            unidecode(location.upper())}
            tweets.append(tempDataDict)
        except:
            continue  # skip malformed or partial tweet records
    dataToSend = {'data': tweets}
    try:
        response = urllib2.urlopen(URL_SENTIMENT140, str(dataToSend))
        sentimentJsonResponse = json.loads(response.read())
        parsedDataDict = parseResponse(sentimentJsonResponse)
        for key, value in parsedDataDict.items():
            print "{0}\t{1}".format(key, value)
    except HTTPError as e:
        print 'The server couldn\'t fulfill the request.'
        print 'Error code: ', e.code
    except URLError as e:
        print 'We failed to reach a server.'
        print 'Reason: ', e.reason
    except:
        print 'response from server is null or some error has occured'
Developer: Nikhil-Saxena, Project: Tweets_analysis_in_hadoop_using_flume, Lines: 29, Source: locationSentimentMapper.py

Example 15: plug_in

    def plug_in(self, out=sys.stdout, hadoop=False,
                filter_id=None, subtree=True):
        """Generates a basic javascript implementation of local predictions

        `out` is the file descriptor the javascript code is written to.

        """
        # build the camelCase variable names, respecting the JS_KEYWORDS restrictions
        objective_field = self.tree.fields[self.tree.objective_id]
        camelcase = to_camel_js(unidecode(objective_field['name']), False)
        objective_field['CamelCase'] = camelcase
        for field in [(key, val) for key, val in
                      sort_fields(self.tree.fields)]:
            field_obj = self.tree.fields[field[0]]
            field_obj['camelCase'] = to_camel_js(unidecode(field_obj['name']))

        body, term_analysis_predicates, item_analysis_predicates = \
            self.tree.plug_in_body()
        terms_body = ""
        items_body = ""
        if term_analysis_predicates:
            terms_body = self.js_term_analysis_body(term_analysis_predicates)
        if item_analysis_predicates:
            items_body = self.js_item_analysis_body(item_analysis_predicates)
        output = self.js_pre_body()
        output += terms_body + items_body + body
        output += u"%sreturn null;\n}\n" % INDENT
        if not PY3:
            output = output.encode("utf8")
        out.write(output)
        out.flush()
Developer: shantanusharma, Project: bigmler, Lines: 31, Source: jsmodel.py


Note: The unidecode.unidecode examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's license before redistributing or reusing the code. Please do not reproduce this article without permission.