本文整理汇总了Python中orangecontrib.text.corpus.Corpus.from_file方法的典型用法代码示例。如果您正苦于以下问题：Python Corpus.from_file方法的具体用法？Python Corpus.from_file怎么用？Python Corpus.from_file使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类orangecontrib.text.corpus.Corpus的用法示例。
在下文中一共展示了Corpus.from_file方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_infer_text_features
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_infer_text_features(self):
    """Each corpus loaded by name reports exactly one text feature.

    'friends-transcripts' is expected to expose a feature named 'Quote'
    and 'deerwester' a feature named 'Text'.
    """
    expected_feature = (
        ('friends-transcripts', 'Quote'),
        ('deerwester', 'Text'),
    )
    for corpus_name, feature_name in expected_feature:
        features = Corpus.from_file(corpus_name).text_features
        self.assertEqual(len(features), 1)
        self.assertEqual(features[0].name, feature_name)
示例2: test_compute_values_to_different_domain
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_compute_values_to_different_domain(self):
    """Transforming a second corpus into a bag-of-words domain keeps its attributes.

    The bag-of-words domain is built from 'deerwester' and then applied
    to 'book-excerpts'; the resulting attributes must match the domain's.
    """
    src = Corpus.from_file('deerwester')
    dst = Corpus.from_file('book-excerpts')
    # Both corpora are checked to have no attributes before vectorization.
    for corpus in (src, dst):
        self.assertFalse(corpus.domain.attributes)
    bow_domain = BowVectorizer().transform(src).domain
    projected = dst.transform(bow_domain)
    self.assertEqual(bow_domain.attributes, projected.domain.attributes)
示例3: test_corpus_from_file
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_corpus_from_file(self):
    """Corpora loaded by name have the expected length, domain size and metas shape."""
    expected_lengths = (('book-excerpts', 140), ('deerwester', 9))
    for corpus_name, n_docs in expected_lengths:
        corpus = Corpus.from_file(corpus_name)
        self.assertEqual(len(corpus), n_docs)
        self.assertEqual(len(corpus.domain), 1)
        self.assertEqual(len(corpus.domain.metas), 1)
        self.assertEqual(corpus.metas.shape, (n_docs, 1))
示例4: open_file
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def open_file(self, path=None, data=None):
    """Load a corpus from ``data`` or ``path`` and refresh the attribute models.

    The current context is closed, errors are cleared and both attribute
    models are emptied before anything is loaded.

    Args:
        path: File path handed to ``Corpus.from_file``; used only when
            ``data`` is falsy. The corpus name is set to the file's base
            name without its extension.
        data: Table-like object handed to ``Corpus.from_table`` together
            with its ``domain``; takes precedence over ``path``.

    Returns:
        None. Returns early when neither argument is given, when reading
        ``path`` fails (the failure is reported via ``Error.read_file``),
        or when the loaded corpus has no text features (in which case
        ``None`` is sent on the corpus output).
    """
    self.closeContext()
    self.Error.clear()
    self.unused_attrs_model[:] = []
    self.used_attrs_model[:] = []

    if data:
        self.corpus = Corpus.from_table(data.domain, data)
    elif path:
        try:
            self.corpus = Corpus.from_file(path)
            self.corpus.name = os.path.splitext(os.path.basename(path))[0]
        except Exception as err:
            # Catch Exception rather than BaseException so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            self.Error.read_file(path, str(err))
            # Stop here: continuing would operate on a stale or unset
            # self.corpus while the read error is being displayed.
            return
    else:
        return

    self.update_info()
    self.used_attrs = list(self.corpus.text_features)
    if not self.corpus.text_features:
        self.Error.corpus_without_text_features()
        self.Outputs.corpus.send(None)
        return

    self.openContext(self.corpus)
    self.used_attrs_model.extend(self.used_attrs)
    # Remaining string metas that are not already used are offered as unused.
    self.unused_attrs_model.extend(
        [f for f in self.corpus.domain.metas
         if f.is_string and f not in self.used_attrs_model])
示例5: test_corpus_from_file_just_text
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_corpus_from_file_just_text(self):
    """'deerwester.tab' loads as 9 documents with an empty domain and one meta column."""
    tab_path = os.path.join(DATASET_PATH, 'deerwester.tab')
    corpus = Corpus.from_file(tab_path)
    domain = corpus.domain
    self.assertEqual(len(corpus), 9)
    self.assertEqual(len(domain), 0)
    self.assertEqual(len(domain.metas), 1)
    self.assertEqual(corpus.metas.shape, (9, 1))
示例6: test_corpus_from_file
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_corpus_from_file(self):
    """'bookexcerpts.txt' loads as 140 documents with an empty domain and two meta columns."""
    txt_path = os.path.join(DATASET_PATH, 'bookexcerpts.txt')
    corpus = Corpus.from_file(txt_path)
    domain = corpus.domain
    self.assertEqual(len(corpus), 140)
    self.assertEqual(len(domain), 0)
    self.assertEqual(len(domain.metas), 2)
    self.assertEqual(corpus.metas.shape, (140, 2))
示例7: test_transform
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_transform(self):
    """Vectorizing 'deerwester' yields a Corpus whose domain has length 43."""
    vectorizer = BowVectorizer()
    bow = vectorizer.transform(Corpus.from_file('deerwester'))
    self.assertIsInstance(bow, Corpus)
    self.assertEqual(len(bow.domain), 43)
示例8: test_corpus_not_eq
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_corpus_not_eq(self):
    """A corpus compares unequal to variants that differ in a single component.

    The variants differ, one at a time, in: text features, X, Y, metas,
    domain, and ``ngram_range``.
    """
    original = Corpus.from_file('book-excerpts')
    n_rows = original.X.shape[0]
    ones_column = (n_rows, 1)

    # Variant with an empty list of text features.
    altered = Corpus(original.domain, original.X, original.Y,
                     original.metas, original.W, [])
    self.assertNotEqual(original, altered)

    # Variant with X replaced by a column of ones.
    altered = Corpus(original.domain, np.ones(ones_column), original.Y,
                     original.metas, original.W, original.text_features)
    self.assertNotEqual(original, altered)

    # Variant with Y replaced by a column of ones.
    altered = Corpus(original.domain, original.X, np.ones(ones_column),
                     original.metas, original.W, original.text_features)
    self.assertNotEqual(original, altered)

    # Variant whose first meta cell is blanked out.
    tampered_metas = np.copy(original.metas)
    tampered_metas[0, 0] = ''
    altered = Corpus(original.domain, original.X, original.Y,
                     tampered_metas, original.W, original.text_features)
    self.assertNotEqual(original, altered)

    # Variant whose domain carries a different meta variable.
    replacement_meta = [StringVariable('text2')]
    tampered_domain = Domain(original.domain.attributes,
                             original.domain.class_var,
                             replacement_meta)
    altered = Corpus(tampered_domain, original.X, original.Y,
                     original.metas, original.W, replacement_meta)
    self.assertNotEqual(original, altered)

    # Copy with a different n-gram range.
    altered = original.copy()
    altered.ngram_range = (2, 4)
    self.assertNotEqual(original, altered)
示例9: test_documents
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_documents(self):
    """``documents`` has one entry per corpus row and every entry is a ``str``."""
    corpus = Corpus.from_file('book-excerpts')
    documents = corpus.documents
    doc_types = {type(doc) for doc in documents}
    self.assertEqual(len(documents), len(corpus))
    self.assertEqual(len(doc_types), 1)
    self.assertIn(str, doc_types)
示例10: test_POSTagger
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_POSTagger(self):
    """Tagging a corpus adds ``pos_tags`` aligned one-to-one with the tokens."""
    source = Corpus.from_file('deerwester')
    perceptron = tag.AveragedPerceptronTagger()
    tagged = perceptron.tag_corpus(source)
    self.assertTrue(hasattr(tagged, 'pos_tags'))
    # NOTE: a per-token check against the pattern '[a-z]+_[A-Z]+' was
    # previously disabled here; only the length alignment is verified.
    for doc_tokens, doc_tags in zip(tagged.tokens, tagged.pos_tags):
        self.assertEqual(len(doc_tokens), len(doc_tags))
示例11: test_compute_values
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_compute_values(self):
    """Rebuilding the corpus in the bag-of-words domain reproduces the same X."""
    source = Corpus.from_file('deerwester')
    vectorizer = BowVectorizer()
    bow = vectorizer.transform(source)
    recomputed = Corpus.from_table(bow.domain, source)
    self.assertEqual(bow.domain, recomputed.domain)
    # Element-wise comparison; no stored entries means no differing cells.
    mismatches = bow.X != recomputed.X
    self.assertEqual(mismatches.nnz, 0)
示例12: test_create_bow
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_create_bow(self):
    """``self.bow`` with tf-idf returns a 9 x 42 matrix and reports 4 progress callbacks."""
    source = Corpus.from_file('deerwester')
    result = self.bow(source, use_tfidf=True)
    self.assertIsNotNone(result.X)
    shape = result.X.shape
    self.assertEqual(9, shape[0])
    self.assertEqual(42, shape[1])
    self.assertEqual(self.progress_callbacks, 4)
    self.assertEqual(self.error_callbacks, 0)
示例13: test_empty_corpus
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_empty_corpus(self):
    """
    Transforming a zero-row corpus gives a result equal to the input.
    GH-247
    """
    empty = Corpus.from_file("deerwester")[:0]
    vectorizer = BowVectorizer(norm=BowVectorizer.L1)
    transformed = vectorizer.transform(empty)
    self.assertEqual(transformed, empty)
示例14: test_init_preserve_shape_of_empty_x
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_init_preserve_shape_of_empty_x(self):
    """Constructing a Corpus from an all-zero sparse X keeps X's shape."""
    source = Corpus.from_file('book-excerpts')
    source_domain = source.domain
    single_attr_domain = Domain(
        (ContinuousVariable('c1'),),
        source_domain.class_vars,
        source_domain.metas,
    )
    # Sparse matrix with one column and no stored values.
    blank_x = csr_matrix((len(source), 1))
    rebuilt = Corpus(single_attr_domain, X=blank_x, Y=source.Y,
                     metas=source.metas)
    self.assertEqual(blank_x.nnz, 0)
    self.assertEqual(rebuilt.X.shape, blank_x.shape)
示例15: test_ngrams
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_file [as 别名]
def test_ngrams(self):
    """Bag-of-words attributes include the 1-, 2- and 3-grams of the first document.

    The corpus is preprocessed in place with a regexp tokenizer and an
    n-gram range of (1, 3) before being vectorized.
    """
    vect = BowVectorizer()
    corpus = Corpus.from_file('deerwester')
    # Raw string: '\w' in a normal literal is an invalid escape sequence
    # and triggers a DeprecationWarning/SyntaxWarning on modern Python.
    pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                                 ngrams_range=(1, 3))
    pr(corpus, inplace=True)
    result = vect.transform(corpus)
    attrs = [attr.name for attr in result.domain.attributes]
    first_doc_tokens = corpus.tokens[0]
    self.assertIn(first_doc_tokens[1], attrs)
    self.assertIn(' '.join(first_doc_tokens[:2]), attrs)
    self.assertIn(' '.join(first_doc_tokens[:3]), attrs)