本文整理汇总了Python中orangecontrib.text.corpus.Corpus类的典型用法代码示例。如果您正苦于以下问题：Python Corpus类的具体用法？Python Corpus怎么用？Python Corpus使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Corpus类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: open_file
def open_file(self, path=None, data=None):
    """Load a corpus from ``data`` or ``path`` and rebuild the attribute models.

    The open context is closed, errors are cleared and both attribute
    models are emptied before anything is loaded.  ``data`` takes
    precedence over ``path``; when neither is truthy the method returns
    right after that reset.

    Args:
        path: Location of a corpus file passed to ``Corpus.from_file``.
            The corpus name is set to the file's base name without its
            extension.
        data: A table converted with ``Corpus.from_table``.
            NOTE(review): the check is ``if data:``, so a falsy table
            (presumably an empty one) falls through to ``path`` -- confirm
            this is intended.

    A failure while reading ``path`` is reported through
    ``self.Error.read_file`` instead of being raised.
    """
    self.closeContext()
    self.Error.clear()
    self.unused_attrs_model[:] = []
    self.used_attrs_model[:] = []
    if data:
        self.corpus = Corpus.from_table(data.domain, data)
    elif path:
        try:
            self.corpus = Corpus.from_file(path)
            self.corpus.name = os.path.splitext(os.path.basename(path))[0]
        except Exception as err:
            # Only ordinary errors are turned into a widget error message.
            # KeyboardInterrupt / SystemExit must keep propagating, which
            # the previous ``except BaseException`` prevented.
            self.Error.read_file(path, str(err))
            # NOTE(review): execution deliberately continues (as before)
            # with whatever ``self.corpus`` held prior to this call --
            # confirm that it can never be unset/None at this point.
    else:
        return

    self.update_info()
    self.used_attrs = list(self.corpus.text_features)
    if not self.corpus.text_features:
        # Without text features there is nothing to output.
        self.Error.corpus_without_text_features()
        self.Outputs.corpus.send(None)
        return
    self.openContext(self.corpus)
    self.used_attrs_model.extend(self.used_attrs)
    # Every remaining string meta is offered as an unused attribute.
    self.unused_attrs_model.extend(
        [f for f in self.corpus.domain.metas
         if f.is_string and f not in self.used_attrs_model])
示例2: test_compute_values
def test_compute_values(self):
    """Converting the raw corpus into the bag-of-words domain reproduces it.

    The corpus built via ``Corpus.from_table`` with the vectorized domain
    must share that domain, and the element-wise comparison of both ``X``
    matrices must contain no non-zero entries.
    """
    documents = Corpus.from_file('deerwester')
    vectorizer = BowVectorizer()
    vectorized = vectorizer.transform(documents)
    recomputed = Corpus.from_table(vectorized.domain, documents)

    self.assertEqual(vectorized.domain, recomputed.domain)
    mismatches = (vectorized.X != recomputed.X).nnz
    self.assertEqual(mismatches, 0)
示例3: test_infer_text_features
def test_infer_text_features(self):
    """Each listed corpus exposes exactly one text feature with a known name."""
    expectations = (
        ('friends-transcripts', 'Quote'),
        ('deerwester', 'Text'),
    )
    for dataset, feature_name in expectations:
        features = Corpus.from_file(dataset).text_features
        self.assertEqual(len(features), 1)
        self.assertEqual(features[0].name, feature_name)
示例4: test_compute_values_to_different_domain
def test_compute_values_to_different_domain(self):
    """A bag-of-words domain built on one corpus can be applied to another.

    Both corpora start without attributes; after transforming the second
    corpus into the first one's bag-of-words domain, the attributes match.
    """
    src = Corpus.from_file('deerwester')
    dst = Corpus.from_file('book-excerpts')
    for corpus in (src, dst):
        self.assertFalse(corpus.domain.attributes)

    bow_domain = BowVectorizer().transform(src).domain
    converted = dst.transform(bow_domain)
    self.assertEqual(bow_domain.attributes, converted.domain.attributes)
示例5: test_corpus_from_file
def test_corpus_from_file(self):
    """Loading a corpus by name yields the expected size and domain layout."""
    expected_sizes = (
        ('book-excerpts', 140),
        ('deerwester', 9),
    )
    for dataset, n_docs in expected_sizes:
        loaded = Corpus.from_file(dataset)
        self.assertEqual(len(loaded), n_docs)
        self.assertEqual(len(loaded.domain), 1)
        self.assertEqual(len(loaded.domain.metas), 1)
        self.assertEqual(loaded.metas.shape, (n_docs, 1))
示例6: test_corpus_from_file_just_text
def test_corpus_from_file_just_text(self):
    """A corpus loaded from 'deerwester.tab' has 9 rows and a single meta."""
    tab_path = os.path.join(DATASET_PATH, 'deerwester.tab')
    loaded = Corpus.from_file(tab_path)

    n_docs = 9
    self.assertEqual(len(loaded), n_docs)
    self.assertEqual(len(loaded.domain), 0)
    self.assertEqual(len(loaded.domain.metas), 1)
    self.assertEqual(loaded.metas.shape, (n_docs, 1))
示例7: test_corpus_from_file
def test_corpus_from_file(self):
    """A corpus loaded from 'bookexcerpts.txt' has 140 rows and two metas."""
    txt_path = os.path.join(DATASET_PATH, 'bookexcerpts.txt')
    loaded = Corpus.from_file(txt_path)

    n_docs, n_metas = 140, 2
    self.assertEqual(len(loaded), n_docs)
    self.assertEqual(len(loaded.domain), 0)
    self.assertEqual(len(loaded.domain.metas), n_metas)
    self.assertEqual(loaded.metas.shape, (n_docs, n_metas))
示例8: main
def main():
    """Show an ``OWWordCloud`` widget fed with a small hard-coded example.

    A topic table (words paired with random weights) and a one-document
    corpus made of the same words are pushed into the widget, which is
    then shown inside a Qt application event loop.
    """
    from Orange.data import Table, Domain, ContinuousVariable, StringVariable

    # '~' marks a space inside a single multi-word entry.
    phrase = 'hey~mr. tallyman tally~me banana daylight come and me wanna go home'
    entries = [entry.replace('~', ' ') for entry in phrase.split()]
    words = np.array(entries, dtype=object, ndmin=2).T
    n_words = len(words)
    weights = np.random.random((n_words, 1))

    # Append one (word, weight) column pair per weight column.
    topic_metas = np.zeros((n_words, 0))
    meta_vars = []
    for index, column in enumerate(weights.T):
        topic_metas = np.column_stack((topic_metas, words, column))
        meta_vars.extend([StringVariable('Topic' + str(index)),
                          ContinuousVariable('weights')])
    topic_domain = Domain([], metas=meta_vars)
    topics = Table.from_numpy(topic_domain,
                              X=np.zeros((n_words, 0)),
                              metas=topic_metas)

    app = QtGui.QApplication([''])
    widget = OWWordCloud()
    widget.on_topics_change(topics)

    # A single document containing every word, separated by spaces.
    text_domain = Domain([], metas=[StringVariable('text')])
    document = ' '.join(words.flat)
    corpus = Corpus.from_numpy(text_domain,
                               X=np.zeros((1, 0)),
                               metas=np.array([[document]]))
    widget.on_corpus_change(corpus)
    widget.show()
    app.exec()
示例9: test_transform
def test_transform(self):
    """Vectorizing 'deerwester' returns a ``Corpus`` whose domain has length 43."""
    vectorizer = BowVectorizer()
    bow = vectorizer.transform(Corpus.from_file('deerwester'))

    self.assertIsInstance(bow, Corpus)
    domain_size = len(bow.domain)
    self.assertEqual(domain_size, 43)
示例10: test_documents
def test_documents(self):
    """``documents`` holds one entry per corpus row and every entry is a ``str``."""
    corpus = Corpus.from_file('book-excerpts')
    documents = corpus.documents
    seen_types = {type(document) for document in documents}

    self.assertEqual(len(documents), len(corpus))
    self.assertEqual(len(seen_types), 1)
    self.assertIn(str, seen_types)
示例11: test_corpus_not_eq
def test_corpus_not_eq(self):
    """A corpus differs from any variant with exactly one component altered."""
    reference = Corpus.from_file('book-excerpts')
    n_rows = reference.X.shape[0]

    # Text features replaced by an empty list.
    variant = Corpus(reference.domain, reference.X, reference.Y,
                     reference.metas, reference.W, [])
    self.assertNotEqual(reference, variant)

    # X replaced by a column of ones.
    variant = Corpus(reference.domain, np.ones((n_rows, 1)), reference.Y,
                     reference.metas, reference.W, reference.text_features)
    self.assertNotEqual(reference, variant)

    # Y replaced by a column of ones.
    variant = Corpus(reference.domain, reference.X, np.ones((n_rows, 1)),
                     reference.metas, reference.W, reference.text_features)
    self.assertNotEqual(reference, variant)

    # First meta value blanked out (on a copy, the reference is untouched).
    altered_metas = np.copy(reference.metas)
    altered_metas[0, 0] = ''
    variant = Corpus(reference.domain, reference.X, reference.Y,
                     altered_metas, reference.W, reference.text_features)
    self.assertNotEqual(reference, variant)

    # Domain whose metas consist of a different string variable.
    replacement_meta = [StringVariable('text2')]
    altered_domain = Domain(reference.domain.attributes,
                            reference.domain.class_var,
                            replacement_meta)
    variant = Corpus(altered_domain, reference.X, reference.Y,
                     reference.metas, reference.W, replacement_meta)
    self.assertNotEqual(reference, variant)

    # Copy with a different n-gram range.
    variant = reference.copy()
    variant.ngram_range = (2, 4)
    self.assertNotEqual(reference, variant)
示例12: set_data
def set_data(self, data=None):
    """Store the incoming data as the widget's corpus and commit.

    The widget is reset first.  Input that is not already a ``Corpus`` is
    converted with ``Corpus.from_table``.  Features and documents are
    reloaded only when data is present; ``commit`` is always called.
    """
    self.reset_widget()
    self.corpus = data
    has_data = data is not None
    if has_data and not isinstance(data, Corpus):
        self.corpus = Corpus.from_table(data.domain, data)
    if has_data:
        self.load_features()
        self.regenerate_docs()
    self.commit()
示例13: test_create_bow
def test_create_bow(self):
    """Building a tf-idf bag of words gives a 9 x 42 matrix and clean callbacks."""
    deerwester = Corpus.from_file('deerwester')
    bow = self.bow(deerwester, use_tfidf=True)

    self.assertIsNotNone(bow.X)
    self.assertEqual(9, bow.X.shape[0])
    self.assertEqual(42, bow.X.shape[1])

    # Callback counters are read from the test case itself.
    self.assertEqual(self.progress_callbacks, 4)
    self.assertEqual(self.error_callbacks, 0)
示例14: test_empty_corpus
def test_empty_corpus(self):
    """Vectorizing a corpus sliced down to zero rows returns an equal corpus.

    Regression test for GH-247 (empty data).
    """
    empty = Corpus.from_file("deerwester")[:0]
    vectorizer = BowVectorizer(norm=BowVectorizer.L1)
    transformed = vectorizer.transform(empty)
    self.assertEqual(transformed, empty)
示例15: test_POSTagger
def test_POSTagger(self):
    """Tagging a corpus adds ``pos_tags`` with one tag per token in each document."""
    deerwester = Corpus.from_file('deerwester')
    tagger = tag.AveragedPerceptronTagger()
    tagged = tagger.tag_corpus(deerwester)

    self.assertTrue(hasattr(tagged, 'pos_tags'))
    # NOTE: a stricter per-token check against '[a-z]+_[A-Z]+' was
    # commented out in the original and is not performed here either.
    for doc_tokens, doc_tags in zip(tagged.tokens, tagged.pos_tags):
        self.assertEqual(len(doc_tokens), len(doc_tags))