This article collects typical usage examples of the Python method gensim.utils.to_unicode. If you have been wondering what exactly utils.to_unicode does, how to use it, or what it looks like in real code, the curated examples below may help. You can also explore the broader usage of the module it lives in, gensim.utils.
The following presents 15 code examples of utils.to_unicode, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python examples.
Example 1: load_from_text
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def load_from_text(fname):
    """
    Load a previously stored Dictionary from a text file.
    Mirror function to `save_as_text`.
    """
    result = Dictionary()
    with utils.smart_open(fname) as f:
        for lineno, line in enumerate(f):
            line = utils.to_unicode(line)
            try:
                wordid, word, docfreq = line[:-1].split('\t')
            except Exception:
                raise ValueError("invalid line in dictionary file %s: %s"
                                 % (fname, line.strip()))
            wordid = int(wordid)
            if word in result.token2id:
                raise KeyError('token %s is defined as ID %d and as ID %d'
                               % (word, wordid, result.token2id[word]))
            result.token2id[word] = wordid
            result.dfs[wordid] = int(docfreq)
    return result
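A quick round-trip sketch through the public gensim API (the /tmp path is a hypothetical writable location): save a small Dictionary with the mirror function `save_as_text`, then load it back.

from gensim.corpora import Dictionary

d = Dictionary([["human", "interface"], ["computer", "interface"]])
d.save_as_text('/tmp/dict.txt')  # writes id<TAB>word<TAB>docfreq lines
loaded = Dictionary.load_from_text('/tmp/dict.txt')
print(loaded.token2id)  # same token -> id mapping as `d`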
Example 2: load_word_topics
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def load_word_topics(self):
    logger.info("loading assigned topics from %s" % self.fstate())
    wordtopics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split()
            tokenid = self.id2word.token2id[token] if hasattr(self.id2word, 'token2id') else int(token)
            wordtopics[int(topic), tokenid] += 1
    logger.info("loaded assigned topics for %i tokens" % wordtopics.sum())
    self.wordtopics = wordtopics
    self.print_topics(15)
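The per-token loop above expects six whitespace-separated fields per MALLET state-file line. A minimal standalone sketch of that parsing step, with a made-up sample line:

from gensim import utils

sample = b"0 NA 0 12 apple 3\n"  # doc source pos typeindex token topic
doc, source, pos, typeindex, token, topic = utils.to_unicode(sample).split()
print(token, int(topic))  # -> apple 3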
Example 3: setUp
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def setUp(self):
    """setup lee test corpora"""
    global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2
    pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
    bg_corpus_file = 'lee_background.cor'
    corpus_file = 'lee.cor'
    sim_file = 'similarities0-1.txt'

    # read in the corpora
    latin1 = lambda line: utils.to_unicode(line, encoding='latin1')
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus = preprocess_documents(latin1(line) for line in f)
    with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
        bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]
    with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
        corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]

    # read the human similarity data
    sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
    sim_m_size = np.shape(sim_matrix)[0]
    human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
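The `latin1` helper above exercises the optional `encoding` argument of `to_unicode`. A minimal demonstration:

from gensim import utils

line = b"caf\xe9"  # 'café' encoded as latin-1; not valid utf-8
print(utils.to_unicode(line, encoding='latin1'))  # -> café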
Example 4: tokenize
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def tokenize(content, token_min_len=2, token_max_len=100, lower=True):
    content = re.sub(EMAIL_PATTERN, ' ', content)  # remove email pattern
    content = re.sub(URL_PATTERN, ' ', content)  # remove url pattern
    content = re.sub(WIKI_REMOVE_CHARS, ' ', content)  # remove unnecessary chars
    content = re.sub(WIKI_SPACE_CHARS, ' ', content)
    content = re.sub(MULTIPLE_SPACES, ' ', content)
    tokens = content.replace(", )", "").split(" ")
    result = []
    for token in tokens:
        if not token.startswith('_'):
            token_candidate = to_unicode(re.sub(WIKI_REMOVE_TOKEN_CHARS, '', token))
        else:
            token_candidate = ""
        if len(token_candidate) > 0:
            result.append(token_candidate)
    return result
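A self-contained sketch of calling `tokenize`. The regex constants live elsewhere in the source project; the simplified stand-ins below are assumptions, so expect the real patterns to behave differently. Assumes the `tokenize` definition above is in scope.

import re
from gensim.utils import to_unicode

# simplified stand-ins for the module-level patterns (assumptions)
EMAIL_PATTERN = re.compile(r'\S+@\S+')
URL_PATTERN = re.compile(r'https?://\S+')
WIKI_REMOVE_CHARS = re.compile(r"[\[\]{}|=*#']")
WIKI_SPACE_CHARS = re.compile(r'[\n\t]')
MULTIPLE_SPACES = re.compile(r' +')
WIKI_REMOVE_TOKEN_CHARS = re.compile(r'[,.;!?"]')

print(tokenize("Mail foo@bar.com or visit https://example.org today"))
# -> ['Mail', 'or', 'visit', 'today'] (with these simplified patterns)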
Example 5: __init__
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def __init__(self, input, transposed=True):
    """
    Initialize the matrix reader.

    The `input` refers to a file on the local filesystem, which is expected
    to be in the sparse (coordinate) Matrix Market format. Documents are
    assumed to be rows of the matrix (and document features are columns).

    `input` is either a string (file path) or a file-like object that
    supports `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
    """
    logger.info("initializing corpus reader from %s" % input)
    self.input, self.transposed = input, transposed
    with utils.file_or_filename(self.input) as lines:
        try:
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                                 (self.input, header))
        except StopIteration:
            pass

        self.num_docs = self.num_terms = self.num_nnz = 0
        for lineno, line in enumerate(lines):
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                if not self.transposed:
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break

    logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" %
                (self.num_docs, self.num_terms, self.num_nnz))
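The header handshake above can be exercised standalone with an in-memory byte stream, which also satisfies the file-like requirement:

import io
from gensim import utils

stream = io.BytesIO(b"%%MatrixMarket matrix coordinate real general\n"
                    b"% a comment line\n"
                    b"3 4 5\n")
header = utils.to_unicode(next(stream)).strip()
assert header.lower().startswith('%%matrixmarket matrix coordinate real general')
for line in stream:
    line = utils.to_unicode(line)
    if not line.startswith('%'):
        num_docs, num_terms, num_nnz = map(int, line.split())
        break
print(num_docs, num_terms, num_nnz)  # -> 3 4 5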
Example 6: line2doc
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def line2doc(self, line):
    """
    Create a document from a single line (string) in SVMlight format.
    """
    line = utils.to_unicode(line)
    line = line[: line.find('#')].strip()
    if not line:
        return None  # ignore comments and empty lines
    parts = line.split()
    if not parts:
        raise ValueError('invalid line format in %s' % self.fname)
    target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
    # ignore 'qid' features, convert 1-based feature ids to 0-based
    doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid']
    return doc, target
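A standalone walk-through of the same parsing with a made-up SVMlight line:

from gensim import utils

line = b"1 qid:7 1:0.5 3:2.0 # trailing comment\n"
text = utils.to_unicode(line)
text = text[: text.find('#')].strip()
parts = text.split()
target, fields = parts[0], [p.rsplit(':', 1) for p in parts[1:]]
doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid']
print(target, doc)  # -> 1 [(0, 0.5), (2, 2.0)]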
Example 7: filter_wiki
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def filter_wiki(raw):
    """
    Filter out wiki markup from `raw`, leaving only text. `raw` is either
    unicode or a utf-8 encoded string.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
    return remove_markup(text)
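`filter_wiki` is importable from `gensim.corpora.wikicorpus`; a quick call (the exact output depends on the markup-removal rules in your gensim version):

from gensim.corpora.wikicorpus import filter_wiki

print(filter_wiki("'''Anarchism''' is a [[political philosophy]]."))
# roughly: Anarchism is a political philosophy.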
Example 8: line2doc
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def line2doc(self, line):
    l = [word for word in utils.to_unicode(line).strip().split(' ') if word]
    docid, doclang, words = l[0], l[1], l[2:]
    doc = super(MalletCorpus, self).line2doc(' '.join(words))
    if self.metadata:
        return doc, (docid, doclang)
    else:
        return doc
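A sketch of the expected Mallet line layout (docid, then language, then tokens), run standalone:

from gensim import utils

line = b"doc42 en human interface computer\n"
l = [w for w in utils.to_unicode(line).strip().split(' ') if w]
docid, doclang, words = l[0], l[1], l[2:]
print(docid, doclang, words)  # -> doc42 en ['human', 'interface', 'computer']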
Example 9: split_on_space
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def split_on_space(s):
    return [word for word in utils.to_unicode(s).strip().split(' ') if word]
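Quick check, assuming the definition above and `from gensim import utils` are in scope; runs of spaces yield empty strings, which the `if word` filter drops:

print(split_on_space(b'  hello   world '))  # -> ['hello', 'world']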
Example 10: save_corpus
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save a corpus in the List-of-words format.

    This function is automatically called by `LowCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    logger.info("storing corpus in List-Of-Words format into %s" % fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%i\n' % len(corpus)))
        for doc in corpus:
            words = []
            for wordid, value in doc:
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

    if truncated:
        logger.warning("List-of-words format can only save vectors with "
                       "integer elements; %i float entries were truncated to integer value" %
                       truncated)
    return offsets
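A hedged usage sketch via the `serialize` entry point named in the docstring (the output path is hypothetical):

from gensim.corpora import LowCorpus

corpus = [[(0, 1), (1, 2)], [(1, 1)]]  # two bag-of-words documents
id2word = {0: 'human', 1: 'interface'}
LowCorpus.serialize('/tmp/corpus.low', corpus, id2word=id2word)
# the file starts with '2' (document count), then one line of
# space-separated, count-repeated words per document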
Example 11: from_corpus
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def from_corpus(corpus, id2word=None):
    """
    Create Dictionary from an existing corpus. This can be useful if you only
    have a term-document BOW matrix (represented by `corpus`), but not the
    original text corpus.

    This will scan the term-document count matrix for all word ids that
    appear in it, then construct and return Dictionary which maps each
    `word_id -> id2word[word_id]`.

    `id2word` is an optional dictionary that maps the `word_id` to a token.
    In case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)`
    will be used.
    """
    result = Dictionary()
    max_id = -1
    for docno, document in enumerate(corpus):
        if docno % 10000 == 0:
            logger.info("adding document #%i to %s" % (docno, result))
        result.num_docs += 1
        result.num_nnz += len(document)
        for wordid, word_freq in document:
            max_id = max(wordid, max_id)
            result.num_pos += word_freq
            result.dfs[wordid] = result.dfs.get(wordid, 0) + 1

    if id2word is None:
        # make sure length(result) == get_max_id(corpus) + 1
        result.token2id = dict((unicode(i), i) for i in xrange(max_id + 1))
    else:
        # id=>word mapping given: simply copy it
        result.token2id = dict((utils.to_unicode(token), id) for id, token in iteritems(id2word))
    for id in itervalues(result.token2id):
        # make sure all token ids have a valid `dfs` entry
        result.dfs[id] = result.dfs.get(id, 0)

    logger.info("built %s from %i documents (total %i corpus positions)" %
                (result, result.num_docs, result.num_pos))
    return result
# endclass Dictionary
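Usage through the public API; without `id2word`, tokens are just stringified ids:

from gensim.corpora import Dictionary

corpus = [[(0, 2), (1, 1)], [(1, 3), (2, 1)]]
d = Dictionary.from_corpus(corpus)
print(d.token2id)  # -> {'0': 0, '1': 1, '2': 2}
print(d.dfs)       # -> {0: 1, 1: 2, 2: 1} (document frequencies)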
Example 12: line2doc
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def line2doc(self, line):
    parts = utils.to_unicode(line).split()
    if int(parts[0]) != len(parts) - 1:
        raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
    doc = [part.rsplit(':', 1) for part in parts[1:]]
    doc = [(int(p1), float(p2)) for p1, p2 in doc]
    return doc
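A standalone sketch of this count-prefixed `id:value` line format, using a made-up line:

from gensim import utils

line = b"3 0:1.0 4:2.0 7:1.0\n"
parts = utils.to_unicode(line).split()
assert int(parts[0]) == len(parts) - 1  # leading field counts the entries
doc = [(int(p1), float(p2)) for p1, p2 in (p.rsplit(':', 1) for p in parts[1:])]
print(doc)  # -> [(0, 1.0), (4, 2.0), (7, 1.0)]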
Example 13: remove_stopwords
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def remove_stopwords(s):
    s = utils.to_unicode(s)
    return " ".join(w for w in s.split() if w not in STOPWORDS)
Example 14: strip_punctuation
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def strip_punctuation(s):
    s = utils.to_unicode(s)
    return RE_PUNCT.sub(" ", s)

# unicode.translate cannot delete characters like str can
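A quick call via the public module (punctuation is replaced with spaces, not deleted):

from gensim.parsing.preprocessing import strip_punctuation

print(strip_punctuation('Hello, world!'))  # -> 'Hello  world '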
Example 15: strip_tags
# Required imports: from gensim import utils [as alias]
# Or alternatively: from gensim.utils import to_unicode [as alias]
def strip_tags(s):
    s = utils.to_unicode(s)
    return RE_TAGS.sub("", s)
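And the tag stripper, likewise importable; the regex removes `<...>` spans:

from gensim.parsing.preprocessing import strip_tags

print(strip_tags('<b>bold</b> text'))  # -> 'bold text'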