This article collects typical usage examples of the unidecode.unidecode function in Python. If you have been wondering what the Python unidecode function does, how to call it, or where to find usage examples, the curated code samples below should help.
The following section presents 15 code examples of the unidecode function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
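Before the collected examples, here is a minimal sketch of basic usage (not taken from any of the projects below; the sample strings are illustrative and the outputs shown in comments are approximate). unidecode takes a Unicode string and returns its closest ASCII transliteration:

# -*- coding: utf-8 -*-
from unidecode import unidecode

# Accented Latin text is stripped down to plain ASCII letters.
print(unidecode(u"Río de Janeiro"))  # -> 'Rio de Janeiro'
# CJK text is transliterated to a romanized approximation.
print(unidecode(u"北京"))            # -> 'Bei Jing '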
Example 1: crawl_video_urls
def crawl_video_urls(url='http://documentaryheaven.com/category/space/'):
    myopener = MyOpener()
    page = myopener.open(url)
    page = page.read()
    html = BeautifulSoup(page, "lxml")
    # find all class=post
    posts = html.find_all('div', class_="post")
    # for each class=post:
    for p in posts:
        obj = {}
        # class=post-title --> a (href, string)
        title = p.find('h2').find('a')
        obj['url'] = title['href']
        obj['title'] = unidecode(title.string)
        abstract = p.find('div', class_='browse-description').find('p')
        obj['abstract'] = unidecode(abstract.string).replace('\n', '').replace('\r\r', ' ').strip()
        # class=browse-description --> p (string)
        results.append(obj)
    # next page: class=next --> (href)
    next_page = html.find('a', class_="next page-numbers")
    if not next_page:
        return None
    print results
    print next_page['href']
    return crawl_video_urls(url=next_page['href'])
Example 2: test_ascii
def test_ascii(self):
    log = []

    def showwarning_new(message, category, *args):
        if ("not an unicode object" in str(message)) and \
                (category is RuntimeWarning):
            log.append((message, category))
        else:
            showwarning_old(message, category, *args)

    showwarning_old = warnings.showwarning
    warnings.showwarning = showwarning_new
    warnings.filterwarnings("always")

    for n in xrange(0, 128):
        t = chr(n)
        self.assertEqual(unidecode(t), t)

    # Passing string objects to unidecode should raise a warning
    self.assertEqual(128, len(log))
    log = []

    for n in xrange(0, 128):
        t = unichr(n)
        self.assertEqual(unidecode(t), t)

    # unicode objects shouldn't raise warnings
    self.assertEqual(0, len(log))

    warnings.showwarning = showwarning_old
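The test above relies on Python 2 built-ins (xrange, unichr). A rough Python 3 adaptation of the same ASCII round-trip check, where every str is already Unicode, might look like this sketch (not part of the original test suite):

from unidecode import unidecode

# In Python 3, chr() returns a one-character text string, so unidecode
# is expected to pass every ASCII code point through unchanged.
for n in range(128):
    assert unidecode(chr(n)) == chr(n)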
Example 3: extract_info
def extract_info(article):
    '''
    INPUT: dict object with output from the api
    OUTPUT: bool if extraction was successful or not,
            dict object to insert into mongodb
    '''
    headline = unidecode(article['title']['$text'])
    date_published = str(article['pubDate']['$text'])
    try:
        author = [str(author['name']['$text']) for author in article['byline']]
    except:
        author = None
    try:
        url = str(article['link'][0]['$text'])
    except:
        return False, ''
    try:
        article_text = unidecode(' '.join([line.get('$text', '\n') for line in article['text']['paragraph']]))
    except:
        return False, ''
    insert = {'url': url,
              'source': 'npr',
              'headline': headline,
              'date_published': date_published,
              'author': author,
              'article_text': article_text}
    return True, insert
Example 4: do_the_trick
def do_the_trick():
    print "\n\t-->> [Collecting]"
    global dot
    global samples
    dot, samples = do_fetch()
    if len(samples) > 0:
        print "\n\t-->> [Playing]:"
        for ind_s, s in enumerate(samples):
            print "\n<.%s.>" % s['text']
            # treat the msg for spacing and tokenizing
            for j, k in enumerate(kws):
                if unidecode(k).lower() in unidecode(s['text']).lower():
                    newTweet = M.tweetmetanalyze(unidecode(s['text']))
                    ste = newTweet
                    print "U:", ste
                    # here, send osc
                    try:
                        cmnd = OSC.OSCMessage("/tweet")
                        cmnd.append(ste)
                        cmnd.append(ind_s)
                        oscClient.send(cmnd)
                        cmnd = OSC.OSCMessage("/palabra")
                        cmnd.append(categorias_emotivas[gesto_to_class[k]])
                        cmnd.append(gesto_to_inten[k])
                        oscClient_ari.send(cmnd)
                    except:
                        print '\n\tAquí le falló\n\t'
    sleep(randint(1, 5))
Example 5: path
def path(start, end):
    start = unidecode.unidecode(start)
    end = unidecode.unidecode(end)
    print start
    print end
    path = find_path(start, end)
    return jsonify(path=path)
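This example imports the module itself and calls unidecode.unidecode(...), while most other examples on this page use `from unidecode import unidecode` and call the function directly; the two forms are equivalent. A minimal illustrative sketch of both styles (the alias avoids shadowing the module name):

# -*- coding: utf-8 -*-
import unidecode
from unidecode import unidecode as to_ascii

# Both call styles produce the same ASCII transliteration.
print(unidecode.unidecode(u"Café"))  # -> 'Cafe'
print(to_ascii(u"Café"))             # -> 'Cafe'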
Example 6: getIndividualSubject
def getIndividualSubject(roster_semester, subject):
    url = COURSE_ROSTER_API_CLASSES + roster_semester + '&subject=' + subject
    soup = BeautifulSoup(requests.get(url).text)
    classes = soup.find_all('class')
    for c in classes:
        listing = subject + c.find('catalognbr').text
        if listing not in COURSE_DICT:
            name = unidecode(c.find('titlelong').text.replace('\n', ' '))
            units_min = c.find('unitsminimum').text
            units_max = c.find('unitsmaximum').text
            if units_min == units_max:
                credits = units_min
            else:
                credits = units_min + "-" + units_max
            course_obj = Course(listing, name, credits)
            course_obj.description = unidecode(c.find('description').text.replace('\n', ' '))
            course_obj.offered = unidecode(c.find('catalogwhenoffered').text.replace('\n', ' '))
            course_obj.prerequisites = unidecode(c.find('catalogprereqcoreq').text.replace('\n', ' '))
            course_obj.arts_tags = unidecode(c.find('catalogdistr').text.replace('\n', ' '))
            crosslists = []
            for combination in c.find_all('combination'):
                crosslists.append(combination.find('subject').text + combination.find('catalognbr').text)
            course_obj.crosslisted_classes = ";".join(crosslists)
            COURSE_DICT[listing] = course_obj
            print str(course_obj)
            print '-' * 50
Example 7: parseCISI
def parseCISI(title, tmdb_title=None):
    movs = search(title)
    mov = None
    mov_id = None
    imdb_id = None
    year = None
    ss = []
    sel = 'n'
    if movs is not None and len(movs) > 0:
        for m in movs:
            cisi_title = unidecode(m['title']).replace(',', '')
            if cisi_title.lower() == title.lower():
                sel = 'y'
                break
            elif title.lower() in cisi_title.lower() or cisi_title.lower() in title.lower():
                sel = raw_input(
                    "Matching '{}' with canistream.it '{}' ({})... OK? [y or n] ".format(
                        title
                        , cisi_title
                        , m['_id']
                    )
                ).lower()
                if sel == 'y':
                    break
                print("Trying again...")
    elif tmdb_title is not None:
        movs = search(tmdb_title)
        sel = 'n'
        if movs is not None and len(movs) > 0:
            for m in movs:
                cisi_title = unidecode(m['title'].decode('utf-8'))
                if cisi_title.lower() == tmdb_title.lower():
                    sel = 'y'
                    break
                elif tmdb_title.lower() in cisi_title.lower() or cisi_title.lower() in tmdb_title.lower():
                    sel = raw_input(
                        "Matching TMDB '{}' with canistream.it '{}' ({})... OK? [y or n] ".format(
                            tmdb_title
                            , cisi_title
                            , m['_id']
                        )
                    ).lower()
                    if sel == 'y':
                        break
                else:
                    print("Trying again...")
    if sel == 'y':
        mov = m
        mov_id = str(m['_id'])
        year = int(m['year'])
        if 'imdb' in m['links'].keys():
            imdb_id = str(m['links']['imdb'].split("/")[-2])
    else:
        print("Unable to find match in canistream.it for '{}'".format(title))
    if mov is not None:
        ss = getStreams(mov_id)
        print("* MATCHED canistream.it")
    elif tmdb_title is not None:
        print("Streaming availability won't be available.")
    return mov_id, year, ss, imdb_id
Example 8: _alignBySplittingToken
def _alignBySplittingToken(self, tag, word, t_iter):
    # alignment helper
    self.logger.debug('tag %s exceeds word %s', repr(tag.word), repr(word))
    tmp = list(tag)
    words = [word]
    asciis = [unidecode(word).replace('-', '')]
    tag_word = ''.join(self.tokenizer.split(tag.word))
    aligned = lambda: ''.join(asciis) == tag_word
    max_len = len(tag_word)
    aligned_tags = []
    while not aligned() and sum(map(len, asciis)) < max_len:
        words.append(next(t_iter))
        asciis.append(unidecode(words[-1]).replace('-', ''))
    if aligned():
        self.logger.debug('dropping tag %s [%s] for words "%s"',
                          repr(tag.word), tag[-1], ' '.join(words))
        for w, a in zip(words, asciis):
            tmp[0] = w
            tmp[1] = a
            self.logger.debug('adding tag %s [%s]', repr(w), tmp[-1])
            aligned_tags.append(Token(*tmp))
            for p in (3, 4):
                if tmp[p].startswith('B-'):
                    tmp[p] = 'I' + tmp[p][1:]
    else:
        raise RuntimeError('alignment of words %s as %s to token "%s" as "%s" failed' % (
            repr(words), repr(asciis), tag.word, tag_word
        ))
    return aligned_tags
Example 9: parse_authors
def parse_authors(self):
    # Create authors
    print "Parsing Authors..."
    f = open(data_io.get_paths()["author_processed_path"], "r")
    titles = f.readline()
    for l in f.readlines():
        res = l.strip().split(",")
        # Titles
        raw_title = unidecode.unidecode(unicode(res[1], encoding="utf-8"))
        (name, surname) = nlp.filter_title(raw_title)
        try:
            self.surnames[surname] = self.surnames[surname] + 1
        except:
            self.surnames[surname] = 1
        # Affiliations
        raw_affiliation = unidecode.unidecode(unicode(res[2], encoding="utf-8"))
        affiliation = nlp.filter_affiliation(raw_affiliation)
        try:
            self.affiliations[affiliation] = self.affiliations[affiliation] + 1
        except:
            self.affiliations[affiliation] = 1
        self.authors[int(res[0])] = author.Author(int(res[0]), name, surname, affiliation)
    print "Done"
    f.close()
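Example 9 decodes raw byte strings with unicode(..., encoding="utf-8") before handing them to unidecode, since unidecode expects text rather than bytes. Under Python 3 the equivalent step would be bytes.decode("utf-8"); a minimal sketch, assuming UTF-8 encoded input (the variable name and sample bytes are illustrative):

from unidecode import unidecode

raw = b"Fran\xc3\xa7ois"               # UTF-8 encoded bytes
print(unidecode(raw.decode("utf-8")))  # -> 'Francois'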
Example 10: _sort_glossary
def _sort_glossary(qresult, lang):
    """
    Sort the result into categories and questions from response returned by the backend engine
    """
    glossary_content = []
    letters = []
    letters_found = OrderedDict()
    field = "glossary_term_lang_" + lang
    for i in string.ascii_uppercase:
        letters.append(i)
        letters_found[i] = 0
    if len(qresult) > 0:
        # Process results
        from itertools import groupby
        items = [o.get_stored_fields() for o in qresult]
        items = sorted(items, key=lambda x: unidecode(x[field]))
        for k, g in groupby(items, key=lambda x: unidecode(x[field])[0]):
            letters_found[k] = 1
            glossary_content.append(
                {
                    "letter": k,
                    "terms": [
                        {"term": item[field], "description": item["glossary_description_lang_" + lang]} for item in g
                    ],
                }
            )
    return letters, letters_found, glossary_content
Example 11: fast_iter
def fast_iter(context, func, *args, **kwargs):
    # xml categories
    collaborations = [u'www', u'phdthesis', u'inproceedings', u'incollection', u'proceedings', u'book', u'mastersthesis', u'article']
    author_array = []
    title = ''
    # read the chunk line by line; we focus on author and title
    for event, elem in context:
        if elem.tag == 'author':
            author_array.append(unidecode(elem.text))
        if elem.tag == 'title':
            if elem.text:
                title = unidecode(elem.text)
        if elem.tag in collaborations:
            if len(author_array) != 0 and title != '':
                # a rejected paper has no author or title, so it should be checked
                for a in author_array:
                    func(a + "||" + title, *args, **kwargs)
                    # write into kv file
                title = ''
                del author_array[:]
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context
Example 12: get_forms
def get_forms(c):
    global forms
    c.execute("SELECT DISTINCT species_id FROM pokemon WHERE id IN (SELECT pokemon_id FROM pokemon_forms WHERE form_identifier != 'NULL' ORDER BY pokemon_id) ORDER BY species_id")
    species_ids = c.fetchall()
    for i in range(len(species_ids)):
        c.execute("SELECT name FROM pokemon_species_names WHERE pokemon_species_id=%d AND local_language_id=9" % species_ids[i][0])
        species_name = str(unidecode(c.fetchone()[0])).replace("-","_").replace(" ","_").replace(".","").replace("'","")
        c.execute("SELECT pokemon_form_id,form_name FROM pokemon_form_names WHERE pokemon_form_id IN (SELECT id FROM pokemon_forms WHERE pokemon_id IN (SELECT id FROM pokemon WHERE species_id=%s)) AND local_language_id=9" % species_ids[i][0])
        species_forms = c.fetchall()
        form_index = []
        form_index += [species_name]
        for j in range(len(species_forms)):
            form_name = "STANDARD" if species_forms[j][1] == None else str(unidecode(species_forms[j][1])).replace("-","_").replace(" ","_").replace(".","").replace("'","").upper()
            form_name = form_name.replace("_FORME","").replace("_FORM","").replace("_TYPE","").replace("_ROTOM","").replace("???","QUESTION_MARK").replace("!","EXCLAMATION_MARK")
            form_name = form_name.replace("?","QUESTION_MARK").replace("_PATTERN","").replace("_KYUREM","").replace("_MODE","")
            if "MEGA" in form_name and "_X" in form_name:
                form_name = "MEGA_X"
            elif "MEGA" in form_name and "_Y" in form_name:
                form_name = "MEGA_Y"
            elif "MEGA" in form_name:
                form_name = "MEGA"
            form_index += [(species_forms[j][0], form_name)]
        forms += [form_index]
Example 13: scrape_wikitables
def scrape_wikitables():
    """Scrapes wikipedia for the list of current top boxers"""
    champURL = "https://en.wikipedia.org/wiki/List_of_current_boxing_rankings"
    page = urllib.request.urlopen(champURL)
    soup = bs4.BeautifulSoup(page, "html5lib")
    tables = soup.find_all("table", {"class": "wikitable"})
    unique_boxers = []
    for table_number in range(1, 6):
        table = tables[table_number]
        rows = table.find_all("tr")
        for row in rows:
            data = row.find_all("td")
            text = [i.text for i in data]
            for boxer_name in range(len(text)):
                if len(text[boxer_name]) > 3:
                    boxer_name = text[boxer_name].rstrip('\n')
                    boxer_name = re.findall(r"\S{3,}\ .[^\ \(]+", boxer_name)
                    if len(boxer_name) > 0:
                        if unidecode(boxer_name[0]) not in unique_boxers:
                            unique_boxers.append(unidecode(boxer_name[0]))
    unique_boxers.sort()
    return unique_boxers
Example 14: main
def main():
    URL_SENTIMENT140 = 'http://www.sentiment140.com/api/[email protected]'
    tweets = []
    for line in sys.stdin:
        try:
            tweetData = json.loads(line.decode('utf-8'))
            location = tweetData['user']['location'].strip()
            if location is None or bool(re.search(r'\d', location)):
                location = 'unknown'
            tempDataDict = {'text': unidecode(tweetData['text']), 'location':
                            unidecode(location.upper())}
            tweets.append(tempDataDict)
        except:
            continue
    dataToSend = {'data': tweets}
    try:
        response = urllib2.urlopen(URL_SENTIMENT140, str(dataToSend))
        sentimentJsonResponse = json.loads(response.read())
        parsedDataDict = parseResponse(sentimentJsonResponse)
        for key, value in parsedDataDict.items():
            print "{0}\t{1}".format(key, value)
    except HTTPError as e:
        print 'The server couldn\'t fulfill the request.'
        print 'Error code: ', e.code
    except URLError as e:
        print 'We failed to reach a server.'
        print 'Reason: ', e.reason
    except:
        print 'response from server is null or some error has occured'
Author: Nikhil-Saxena, Project: Tweets_analysis_in_hadoop_using_flume, Lines of code: 29, Source file: locationSentimentMapper.py
Example 15: plug_in
def plug_in(self, out=sys.stdout, hadoop=False,
            filter_id=None, subtree=True):
    """Generates a basic javascript implementation of local predictions

    `out` is file descriptor to write the javascript code.

    """
    # fill the camelcase variable names with the JS_KEYWORDS restrictions
    objective_field = self.tree.fields[self.tree.objective_id]
    camelcase = to_camel_js(unidecode(objective_field['name']), False)
    objective_field['CamelCase'] = camelcase
    for field in [(key, val) for key, val in
                  sort_fields(self.tree.fields)]:
        field_obj = self.tree.fields[field[0]]
        field_obj['camelCase'] = to_camel_js(unidecode(field_obj['name']))
    body, term_analysis_predicates, item_analysis_predicates = \
        self.tree.plug_in_body()
    terms_body = ""
    items_body = ""
    if term_analysis_predicates:
        terms_body = self.js_term_analysis_body(term_analysis_predicates)
    if item_analysis_predicates:
        items_body = self.js_item_analysis_body(item_analysis_predicates)
    output = self.js_pre_body()
    output += terms_body + items_body + body
    output += u"%sreturn null;\n}\n" % INDENT
    if not PY3:
        output = output.encode("utf8")
    out.write(output)
    out.flush()