本文整理汇总了Python中orangecontrib.text.corpus.Corpus.from_documents方法的典型用法代码示例。如果您正苦于以下问题：Python Corpus.from_documents方法的具体用法？Python Corpus.from_documents怎么用？Python Corpus.from_documents使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类orangecontrib.text.corpus.Corpus的用法示例。
在下文中一共展示了Corpus.from_documents方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: search
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_documents [as 别名]
def search(self, query, date_from=None, date_to=None, max_docs=None,
           on_progress=None, should_break=None):
    """
    Download the documents matching a query, page by page, as a Corpus.

    Args:
        query (str): Search query.
        date_from (date): Start date limit.
        date_to (date): End date limit.
        max_docs (int): Maximal number of documents returned. Capped at
            MAX_DOCS and at the number of hits reported by the first page.
        on_progress (callback): Called after every downloaded page with
            the number of records collected so far and the document limit.
        should_break (callback): Checked before each page after the first.
            If it evaluates to True, downloading stops and the documents
            downloaded so far are returned in a Corpus.

    Returns:
        Corpus: Search results, or None when the first page yields no data.

    Raises:
        RuntimeError: If the API key is not valid.
    """
    if not self.api_key_valid():
        raise RuntimeError('The API key is not valid.')

    limit = MAX_DOCS if max_docs is None or max_docs > MAX_DOCS else max_docs

    # TODO create corpus on the fly and extend, so it stops faster.
    first_page, _ = self._fetch_page(query, date_from, date_to, 0)
    if first_page is None:
        return None

    response = first_page['response']
    documents = list(response['docs'])
    # Never ask for more documents than the service reports as available.
    limit = min(response['meta']['hits'], limit)

    def report_progress():
        # Progress reporting is optional; non-callables are ignored.
        if callable(on_progress):
            on_progress(len(documents), limit)

    report_progress()

    # Page 0 is already downloaded; fetch the remaining pages.
    page_count = math.ceil(limit / BATCH_SIZE)
    for page_no in range(1, page_count):
        if callable(should_break) and should_break():
            break

        payload, from_cache = self._fetch_page(query, date_from, date_to,
                                               page_no)
        if payload is None:
            break

        documents += payload['response']['docs']
        report_progress()

        # Pause only after a page that was not served from cache
        # (presumably to throttle real requests -- confirm against
        # _fetch_page).
        if not from_cache:
            sleep(SLEEP)

    # The last page may overshoot the limit; drop the surplus records.
    if len(documents) > limit:
        del documents[limit:]

    return Corpus.from_documents(documents, 'NY Times', self.attributes,
                                 self.class_vars, self.metas,
                                 title_indices=[-1])
示例2: test_from_documents
# 需要导入模块: from orangecontrib.text.corpus import Corpus [as 别名]
# 或者: from orangecontrib.text.corpus.Corpus import from_documents [as 别名]
def test_from_documents(self):
    """Check the corpus built by Corpus.from_documents from plain dicts.

    Verifies the corpus length and name, the number of attributes, class
    variables and metas in its domain, and that the first attribute (the
    engine) carries the values found in the source documents.
    """
    raw_docs = [
        {'wheels': 4, 'engine': 'w4', 'type': 'car',
         'desc': 'A new car.'},
        {'wheels': 8., 'engine': 'w8', 'type': 'truck',
         'desc': 'An old truck.'},
        {'wheels': 12., 'engine': 'w12', 'type': 'truck',
         'desc': 'An new truck.'},
    ]

    # Each entry pairs a variable with a getter that extracts its value
    # from a single document.
    feature_spec = [
        (DiscreteVariable('Engine'), lambda d: d.get('engine')),
        (ContinuousVariable('Wheels'), lambda d: d.get('wheels')),
    ]
    target_spec = [(DiscreteVariable('Type'), lambda d: d.get('type'))]
    meta_spec = [(StringVariable('Description'), lambda d: d.get('desc'))]

    name = 'TruckData'
    corpus = Corpus.from_documents(raw_docs, name, feature_spec,
                                   target_spec, meta_spec)

    # Size and name follow the inputs.
    self.assertEqual(len(corpus), len(raw_docs))
    self.assertEqual(corpus.name, name)

    # The domain has one variable per supplied specification entry.
    domain = corpus.domain
    self.assertEqual(len(domain.attributes), len(feature_spec))
    self.assertEqual(len(domain.class_vars), len(target_spec))
    self.assertEqual(len(domain.metas), len(meta_spec))

    # The engine variable knows every engine value, and column 0 of X
    # maps back to the per-document engines in their original order.
    expected_engines = [d['engine'] for d in raw_docs]
    engine_var = domain.attributes[0]
    self.assertEqual(sorted(engine_var.values), sorted(expected_engines))
    self.assertEqual([engine_var.repr_val(v) for v in corpus.X[:, 0]],
                     expected_engines)