This article collects typical usage examples of the Python function utils.to_unicode. If you have been wondering what exactly to_unicode does and how to use it, the hand-picked examples below may help. 15 code examples of to_unicode are shown, sorted by popularity by default.
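The examples come from several different projects, so the exact signature of to_unicode varies from snippet to snippet (Example 15, for instance, calls it with a (bytes, charset) pair). As a rough mental model, a minimal Python 2 sketch of a typical implementation might look like this (an illustration of the common pattern, not any single project's actual code):

# Minimal sketch (assumed): decode byte strings, pass unicode through unchanged.
def to_unicode(text, encoding='utf8', errors='strict'):
    if isinstance(text, unicode):
        return text
    return unicode(text, encoding, errors=errors)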
Example 1: __init__
def __init__(self, input, transposed=True):
    """
    Initialize the matrix reader.

    The `input` refers to a file on the local filesystem, which is expected to
    be in the sparse (coordinate) Matrix Market format. Documents are assumed
    to be rows of the matrix (and document features are columns).

    `input` is either a string (file path) or a file-like object that supports
    `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
    """
    logger.info("initializing corpus reader from %s" % input)
    self.input, self.transposed = input, transposed
    with utils.file_or_filename(self.input) as lines:
        try:
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError(
                    "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                    (self.input, header))
        except StopIteration:
            pass

        self.num_docs = self.num_terms = self.num_nnz = 0
        for lineno, line in enumerate(lines):
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                if not self.transposed:
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break

    logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" %
                (self.num_docs, self.num_terms, self.num_nnz))
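For context, a minimal input file this reader would accept might look as follows (a made-up corpus with 3 documents, 4 features and 5 non-zero entries; the size line is the one parsed into num_docs, num_terms and num_nnz):

%%MatrixMarket matrix coordinate real general
% comment lines start with '%'
3 4 5
1 1 0.5
1 3 1.0
2 2 2.0
3 1 0.25
3 4 1.5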
Example 2: single_picky
def single_picky(slug='test'):
    try:
        f = open(PICKY_DIR + slug + '.md')
    except IOError:
        abort(404)
    picky = f.read()
    f.close()
    meta_regex = re.compile(
        r"^\s*(?:-|=){3,}\s*\n((?:.|\n)+?)\n\s*(?:-|=){3,}\s*\n*",
        re.MULTILINE
    )
    match = re.match(meta_regex, picky)
    if not match:
        abort(404)
    metas = match.group(1)
    title = None
    date = None
    meta = metas.split("\n")
    try:
        title = meta[0].split("=>")[1]
    except IndexError:
        title = meta[0].split("=>")[0]
    try:
        date = meta[1].split("=>")[1]
    except IndexError:
        date = meta[1].split("=>")[0]
    cont = to_unicode(picky[match.end():])
    content = to_markdown(cont)
    return template('picky.html', content=content, title=to_unicode(title),
                    date=to_unicode(date), slug=slug)
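The regex captures a metadata block at the top of the markdown file, delimited by runs of three or more dashes or equals signs, with one "key => value" line per field. A hypothetical test.md that this function would parse:

---
title => Hello World
date => 2013-05-01
---
The post body, rendered by to_markdown() ...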
Example 3: load_cat2vec_format
def load_cat2vec_format(cls, cat_model=None, sent_model=None, word_model=None):
    """
    Load sentence vectors
    """
    model = Category2Vec(None)
    count = 0
    if cat_model:
        logger.info("loading %s object(cat) from %s" % (cls.__name__, cat_model))
        for line in open(cat_model, "r"):
            line = line.rstrip()
            if count == 0:
                info = line.split()
                model.cat_len = int(info[0])
                model.layer1_size = int(info[1])
                model.sg = int(info[2])
                model.hs = int(info[3])
                model.negative = int(info[4])
                model.cbow_mean = int(info[5])
                model.cats = empty((model.cat_len, model.layer1_size), dtype=REAL)
                model.cat_no_hash = {}
                model.cat_id_list = []
            else:
                idx = count - 1
                row = line.split("\t")
                cat_id = utils.to_unicode(row[0])
                model.cat_no_hash[cat_id] = idx
                model.cat_id_list.append(cat_id)
                vals = row[1].split()
                for j in xrange(model.layer1_size):
                    model.cats[idx][j] = float(vals[j])
            count += 1
    count = 0
    if sent_model:
        logger.info("loading %s object(sentence) from %s" % (cls.__name__, sent_model))
        for line in open(sent_model, "r"):
            line = line.rstrip()
            if count == 0:
                info = line.split()
                model.sents_len = int(info[0])
                model.sents = empty((model.sents_len, model.layer1_size), dtype=REAL)
                model.sent_no_hash = {}
                model.sent_id_list = []
            else:
                idx = count - 1
                row = line.split("\t")
                sent_id = utils.to_unicode(row[0])
                model.sent_no_hash[sent_id] = idx
                model.sent_id_list.append(sent_id)
                vals = row[1].split()
                for j in xrange(model.layer1_size):
                    model.sents[idx][j] = float(vals[j])
            count += 1
    if word_model:
        logger.info("loading word2vec from %s" % word_model)
        model.w2v = Word2Vec.load(word_model)
        model.vocab = model.w2v.vocab
    return model
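Reading the first loop, the category file appears to be a plain-text format: a header line of six integers (cat_len, layer1_size, sg, hs, negative, cbow_mean), then one line per category holding the category id, a tab, and layer1_size space-separated floats. A hypothetical two-category file with 4-dimensional vectors (the ids are made up):

2 4 1 0 5 0
cat_news	0.12 -0.40 0.88 0.05
cat_sports	0.33 0.21 -0.75 0.10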
Example 4: __init__
def __init__(self, id, uri, name, type):
    if id is None:
        self.id = DBRepository.id_counter
        DBRepository.id_counter += 1
    else:
        self.id = id

    self.uri = to_unicode(uri)
    self.name = to_unicode(name)
    self.type = to_unicode(type)
Example 5: __init__
def __init__(self, id, commit):
    if id is None:
        self.id = DBLog.id_counter
        DBLog.id_counter += 1
    else:
        self.id = id

    self.rev = to_unicode(commit.revision)
    self.committer = None
    self.author = None
    self.date = commit.date
    self.message = to_unicode(commit.message)
    self.composed_rev = commit.composed_rev
Example 6: __iter__
def __iter__(self):
    """Iterate through the lines in the source."""
    try:
        # Assume it is a file-like object and try treating it as such;
        # things that don't have seek will trigger an exception.
        self.source.seek(0)
        for line in self.source:
            yield utils.to_unicode(line).split()
    except AttributeError:
        # If it didn't work like a file, use it as a string filename.
        with utils.smart_open(self.source) as fin:
            for line in fin:
                yield utils.to_unicode(line).split()
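This is the usual pattern for corpus classes that accept either a file path or an already-open file handle. A usage sketch (the class name LineCorpus is made up for illustration):

# Hypothetical usage: behaves the same for a path or an open file object.
sentences = LineCorpus('corpus.txt')      # or LineCorpus(open('corpus.txt'))
for tokens in sentences:
    print tokens                          # one list of unicode tokens per line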
Example 7: response
def response(self, msg, **kwargs):
    ## msg is your parsed, already-handled data; in practice it is a dict.
    ## You can force a response type with the `type` kwarg, e.g. response(msg, type='music').
    '''
    ex: response(message, type='yourType')
    optional kwargs:
        type='legal_type', content='yourContent', handler=foo, count=1
    ps: when type is 'news', the count kwarg is necessary
    supported types:
        text, image, voice, video, music, news
    '''
    # Swap receiver and sender so the reply goes back to the original sender.
    msg['receiver'], msg['sender'] = msg['sender'], msg['receiver']
    legal_types = ['text', 'music', 'image', 'voice', 'video', 'news']
    ## get some kwargs ##
    # keyword `type` ---- overrides the type of the incoming message
    if kwargs.get('type'):
        type = kwargs.get('type')
    else:
        type = msg['type']
    if type == 'music':
        if not msg['hq_musurl']:
            msg['hq_musurl'] = msg['musurl']
    # keyword `content` ---- forces the type to text and returns a static string
    if kwargs.get('content'):
        msg['type'] = type = 'text'
        msg['content'] = to_unicode(kwargs.get('content'))
    if not type in legal_types:
        raise Exception("Illegal type! You can only choose one type from legal_types!")
    else:
        msg['type'] = type
    # keyword `handler` ---- a function object that accepts the msg dict and returns a modified dict
    if kwargs.get('handler'):
        msg = kwargs.get('handler')(msg)
    ## more kwargs ##
    if not type == 'news':
        template = to_unicode(getattr(Template(), type))
    else:
        count = kwargs.get('count')
        if count:
            temp = Template()
            template = to_unicode(temp.news(count))
        else:
            raise Exception('When type is set to news, the count kwarg is necessary!')
    logging.info(template.format(**msg))
    try:
        retdata = template.format(**msg)
    except:
        raise Exception("You didn't pass enough args or passed wrong args; "
                        "check which args the template needs. Reading template.py may help.")
    return retdata
Example 8: add_header
def add_header(self):
    if self.file is not None:
        dis = ""
        dis += "Script file : %s\n" % sys.argv[0]
        dis += "Date : %s\n" % time.strftime("%d/%m/%Y %H:%M:%S", self.gtime.start_date)
        dis += "\n%s\n" % self.format("Time(s)", "Scope", "Info")
        self.file.write(utils.to_unicode(dis))
Example 9: __init__
def __init__(self, unique_name, base_filepath, parameters):
    """
    Arguments
    ---------
    unique_name : unique feature name
    base_filepath : filepath of the feature config
    parameters : lexicon parameters, presented as a dictionary
    """
    self.unique_name = unique_name
    self.parameters = parameters
    filepath = os.path.join(
        base_filepath,
        parameters[BagOfClustersFeature.PARAM_CLUSTERED_WORDS_FILEPATH])
    if parameters[BagOfClustersFeature.PARAM_ENABLED] == 'false':
        return
    print "Loading file with clusters of words: {}".format(filepath)
    with io.open(filepath, 'r', encoding='utf-8') as f:
        self.clustered_words = json.load(f, encoding='utf-8')
    print "Creating dictionary with all clusters, accessed by cluster_id ..."
    self.clusters = {}
    for word in self.clustered_words.iterkeys():
        cluster_id = self.clustered_words[word]
        if cluster_id not in self.clusters:
            self.clusters[cluster_id] = []
        self.clusters[cluster_id].append(utils.to_unicode(word))
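The JSON file loaded here evidently maps each word to its cluster id, and the loop inverts that mapping. A hypothetical input:

{"dog": 3, "cat": 3, "car": 7, "bus": 7}

After the constructor runs, self.clusters would be {3: [u'dog', u'cat'], 7: [u'car', u'bus']} (list order depends on Python 2 dict iteration order).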
Example 10: _create_des_
def _create_des_(msg):
    if msg is None:
        return {}
    elif not isinstance(msg, types.StringTypes):
        raise pexception.PytestembError("Msg must be a string")
    else:
        return dict({"msg": "%s" % utils.to_unicode(msg)})
Example 11: get_cluster_id
def get_cluster_id(self, word):
    """
    Returns
    -------
    The id of the cluster that contains 'word'
    """
    return self.clustered_words[utils.to_unicode(word)]
Example 12: get_local_features
def get_local_features(token, word_freq=None):
    assert len(token) >= 1
    features = []
    ntoken = normalize(token, lowercase=False)
    if token.isalpha():
        if 'UpperCase' in features_on:
            if first_upper_case(ntoken):
                features += ['IsUpperCase']
        if 'AllUpperCase' in features_on:
            if all_upper_case(ntoken):
                features += ['IsAllUpperCase']
        if 'AllLowerCase' in features_on:
            if all_lower_case(ntoken):
                features += ['IsAllLowerCase']
        if 'Freq' in features_on:
            features += ['Freq:%s' % str(word_freq[ntoken])]
        if 'Rare' in features_on:
            if word_freq[ntoken] <= rare_thr:
                features += ['IsRare']
        if 'IsWord' in features_on:
            features += ['IsWord']
    elif token.isdigit():
        if 'Number' in features_on:
            features += ['IsNumber']
    elif token.isalnum():
        if 'AlphaNum' in features_on:
            features += ['IsAlphaNum']
    elif len(to_unicode(token)) == 1:
        if is_punct(token):
            if 'Separator' in features_on:
                features += ['IsSeparator']
        else:
            if 'NonAlphanum' in features_on:
                features += ['IsNonAlphanum']
    if 'Word' in features_on:
        if not any(x in features for x in ['IsNumber', 'IsAlphaNum']):
            features += ['W=%s' % ntoken]
    if 'Length' in features_on:
        features += ['Length:%s' % str(len(ntoken))]
    return features
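A hypothetical call, assuming module-level switches features_on = {'UpperCase', 'Word', 'Length'}, and assuming normalize() leaves a plain ASCII token unchanged (both assumptions; those globals are not shown in the snippet):

print get_local_features('Hello', word_freq={'Hello': 42})
# -> ['IsUpperCase', 'W=Hello', 'Length:5']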
Example 13: get_terms_info
def get_terms_info(self, term):
    """
    returns: dict
        the number of documents that contain 'term', for each sentiment
        class and overall (DocVocabulary.ALL)
    """
    uterm = to_unicode(term)
    return self.terms_info[uterm]
Example 14: _check_same_origin
def _check_same_origin(self, current_url):
    """
    Check whether `current_url` has the same origin as this object.
    """
    current_url = to_unicode(current_url)
    url_part = urlparse.urlparse(current_url)
    url_origin = (url_part.scheme, url_part.netloc)
    return url_origin == self.origin
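Presumably self.origin was stored as a (scheme, netloc) tuple when the crawl started (an assumption; the constructor is not shown). A sketch of the expected behaviour:

# Assuming self.origin == ('https', 'example.com'):
self._check_same_origin('https://example.com/a/b')   # True
self._check_same_origin('http://example.com/a/b')    # False (scheme differs)
self._check_same_origin('https://other.com/a/b')     # False (host differs)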
Example 15: __getitem__
def __getitem__(self, name):
    '''Get a header value from the message, decoded and returned as a
    unicode string.

    If the header does not exist, None is returned.'''
    value = self._msg[name]
    if value is None:
        return None
    return u''.join(to_unicode(*tupl) for tupl in decode_header(value))
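Note that decode_header (from the standard library's email.header module) yields (value, charset) pairs, so the to_unicode used here must accept an optional charset argument, unlike the single-argument variants in earlier examples. A hypothetical lookup, assuming `headers` is an instance of this wrapper class:

# Assuming the underlying message contains:
#   Subject: =?utf-8?q?Gr=C3=BC=C3=9Fe?=
headers['Subject']   # -> u'Gr\xfc\xdfe' (u'Grüße')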