本文整理汇总了Python中allennlp.data.tokenizers.WordTokenizer类的典型用法代码示例。如果您正苦于以下问题:Python tokenizers.WordTokenizer类的具体用法?Python tokenizers.WordTokenizer怎么用?Python tokenizers.WordTokenizer使用的例子?那么恭喜您,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该类所在模块allennlp.data.tokenizers的用法示例。
在下文中一共展示了tokenizers.WordTokenizer类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             db: FeverDocDB,
             sentence_level = False,
             wiki_tokenizer: Tokenizer = None,
             claim_tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             filtering: str = None) -> None:
    """Configure the reader with a document database, tokenizers and indexers.

    Args:
        db: Document database; its ``get_doc_ids()`` result seeds the formatter.
        sentence_level: Stored as ``_sentence_level``.
        wiki_tokenizer: Stored as ``_wiki_tokenizer``; a fresh ``WordTokenizer``
            is used when the argument is falsy.
        claim_tokenizer: Stored as ``_claim_tokenizer``; a fresh
            ``WordTokenizer`` is used when the argument is falsy.
        token_indexers: Stored as ``_token_indexers``; falls back to a single
            ``SingleIdTokenIndexer`` under the key ``'tokens'`` when falsy.
        filtering: Passed through to ``FEVERGoldFormatter`` unchanged.
    """
    self._sentence_level = sentence_level

    # Substitute defaults for any collaborator the caller left out.
    if not wiki_tokenizer:
        wiki_tokenizer = WordTokenizer()
    self._wiki_tokenizer = wiki_tokenizer

    if not claim_tokenizer:
        claim_tokenizer = WordTokenizer()
    self._claim_tokenizer = claim_tokenizer

    if not token_indexers:
        token_indexers = {'tokens': SingleIdTokenIndexer()}
    self._token_indexers = token_indexers

    self.db = db
    # The formatter receives the set of document ids available in the database.
    known_doc_ids = set(self.db.get_doc_ids())
    self.formatter = FEVERGoldFormatter(known_doc_ids, FEVERLabelSchema(), filtering=filtering)
    self.reader = JSONLineReader()
示例2: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             lazy = False,
             tokenizer = None,
             sentence_token_indexers = None,
             nonterminal_indexers = None,
             terminal_indexers = None,
             output_agendas = True) :
    """Store the tokenizer and the three indexer mappings used by this reader.

    Args:
        lazy: Forwarded to the parent constructor.
        tokenizer: Stored as ``_tokenizer``; defaults to ``WordTokenizer()``
            when falsy.
        sentence_token_indexers: Defaults to ``{"tokens": SingleIdTokenIndexer()}``
            when falsy.
        nonterminal_indexers: Defaults to a ``SingleIdTokenIndexer`` built with
            ``"rule_labels"`` under the key ``"tokens"`` when falsy.
        terminal_indexers: Same default shape as ``nonterminal_indexers``.
        output_agendas: Stored as ``_output_agendas``.
    """
    super(NlvrDatasetReader, self).__init__(lazy)

    self._tokenizer = tokenizer if tokenizer else WordTokenizer()

    if not sentence_token_indexers:
        sentence_token_indexers = {u"tokens": SingleIdTokenIndexer()}
    self._sentence_token_indexers = sentence_token_indexers

    # Non-terminals and terminals each get their own indexer instance,
    # both constructed with the "rule_labels" argument.
    if not nonterminal_indexers:
        nonterminal_indexers = {u"tokens": SingleIdTokenIndexer(u"rule_labels")}
    self._nonterminal_indexers = nonterminal_indexers

    if not terminal_indexers:
        terminal_indexers = {u"tokens": SingleIdTokenIndexer(u"rule_labels")}
    self._terminal_indexers = terminal_indexers

    self._output_agendas = output_agendas
#overrides
示例3: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             tokens_per_instance = None,
             tokenizer = None,
             token_indexers = None,
             lazy = False) :
    """Set up the tokenizer, input indexers and the single-id output indexer.

    Args:
        tokens_per_instance: Stored as ``_tokens_per_instance``.
        tokenizer: Stored as ``_tokenizer``; defaults to ``WordTokenizer()``
            when falsy.
        token_indexers: Stored as ``_token_indexers``; defaults to
            ``{"tokens": SingleIdTokenIndexer()}`` when falsy.
        lazy: Forwarded to the parent constructor.
    """
    super(LanguageModelingReader, self).__init__(lazy)

    if not tokenizer:
        tokenizer = WordTokenizer()
    self._tokenizer = tokenizer

    if not token_indexers:
        token_indexers = {u"tokens": SingleIdTokenIndexer()}
    self._token_indexers = token_indexers

    self._tokens_per_instance = tokens_per_instance

    # Whatever the input representation, the output is always a single token
    # id, so a language model can combine word embeddings with character-level
    # encoders on the input side while still predicting the next word token.
    # Reuse the first SingleIdTokenIndexer among the input indexers if there
    # is one; otherwise create a dedicated one.
    first_single_id = next(
        ((name, indexer)
         for name, indexer in self._token_indexers.items()
         if isinstance(indexer, SingleIdTokenIndexer)),
        None,
    )
    if first_single_id is None:
        self._output_indexer = {u"tokens": SingleIdTokenIndexer()}
    else:
        name, indexer = first_single_id
        self._output_indexer = {name: indexer}
#overrides
示例4: test_char_span_to_token_span_handles_easy_cases
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def test_char_span_to_token_span_handles_easy_cases(self):
    """Check character-span to token-span conversion on three plain substrings."""
    passage = (
        u"On January 7, 2012, Beyoncé gave birth to her first child, a daughter, Blue Ivy "
        u"Carter, at Lenox Hill Hospital in New York. Five months later, she performed for four "
        u"nights at Revel Atlantic City's Ovation Hall to celebrate the resort's opening, her "
        u"first performances since giving birth to Blue Ivy."
    )
    # Character (start, end) offsets of every token in the passage.
    offsets = [(token.idx, token.idx + len(token.text))
               for token in WordTokenizer().tokenize(passage)]

    # (character span, expected token span). The original note says the spans
    # are inclusive on both sides.
    cases = [
        ((3, 18), (1, 4)),      # "January 7, 2012"
        ((91, 110), (22, 24)),  # "Lenox Hill Hospital"
        ((91, 123), (22, 28)),  # "Lenox Hill Hospital in New York."
    ]
    for char_span, expected_token_span in cases:
        # Only the first element of the helper's result is the token span.
        token_span = util.char_span_to_token_span(offsets, char_span)[0]
        assert token_span == expected_token_span
示例5: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             debug: bool = False,
             tokenizer: Tokenizer = None,
             include_more_numbers: bool = False,
             skip_when_all_empty: List[str] = None,
             max_number_of_answer: int = 8,
             max_number_count: int = 10,
             logger = None) -> None:
    """Store reader options and validate the requested skip types.

    Args:
        debug: Stored as ``debug``.
        tokenizer: Stored as ``_tokenizer``; defaults to ``WordTokenizer()``
            when falsy.
        include_more_numbers: Stored as ``include_more_numbers``.
        skip_when_all_empty: Answer-type names; ``None`` becomes an empty list.
            Each entry must be one of ``"passage_span"``, ``"question_span"``,
            ``"addition_subtraction"``, ``"counting"`` or ``"negation"``.
        max_number_of_answer: Stored as ``max_number_of_answer``.
        max_number_count: Stored as ``max_number_count``.
        logger: Stored as ``logger``.

    Raises:
        AssertionError: If ``skip_when_all_empty`` has an unsupported entry.
    """
    super().__init__()
    self.debug = debug

    if not tokenizer:
        tokenizer = WordTokenizer()
    self._tokenizer = tokenizer

    self.include_more_numbers = include_more_numbers
    self.max_number_of_answer = max_number_of_answer
    self.max_number_count = max_number_count

    if skip_when_all_empty is None:
        skip_when_all_empty = []
    self.skip_when_all_empty = skip_when_all_empty

    supported_skip_types = (
        "passage_span", "question_span", "addition_subtraction", "counting", "negation",
    )
    for item in self.skip_when_all_empty:
        assert item in supported_skip_types, \
            f"Unsupported skip type: {item}"

    self.logger = logger
示例6: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             words_per_instance: int = 35
             ) -> None:
    """Set up tokenizer, indexers and the instance length for this reader.

    Args:
        lazy: Forwarded to the parent constructor.
        tokenizer: Stored as ``_tokenizer``. When falsy, a ``WordTokenizer``
            configured with ``START_SYMBOL`` / ``END_SYMBOL`` as start and end
            tokens is created.
        token_indexers: Stored as ``_token_indexers``. When falsy, a single
            lower-casing ``SingleIdTokenIndexer`` in the ``"tokens"`` namespace
            is used under the key ``"tokens"``.
        words_per_instance: Stored as ``_words_per_instance``.
    """
    super().__init__(lazy)

    if not tokenizer:
        tokenizer = WordTokenizer(start_tokens=[START_SYMBOL],
                                  end_tokens=[END_SYMBOL])
    self._tokenizer = tokenizer

    if not token_indexers:
        default_indexer = SingleIdTokenIndexer(namespace="tokens", lowercase_tokens=True)
        token_indexers = {"tokens": default_indexer}
    self._token_indexers = token_indexers

    self._words_per_instance = words_per_instance
示例7: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             ) -> None:
    """Configure the reader's tokenizer and its token indexers.

    Args:
        lazy: Forwarded to the parent constructor.
        tokenizer: Stored as ``_tokenizer``; defaults to ``WordTokenizer()``
            when falsy.
        token_indexers: Accepted but not read by this constructor.
        clean_citation: Stored as ``_clean_citation``.
        with_elmo: When true, an ``ELMoTokenCharactersIndexer`` is registered
            under ``"elmo"`` alongside the ``"tokens"`` indexer.
    """
    super().__init__(lazy)
    self._clean_citation = clean_citation

    if not tokenizer:
        tokenizer = WordTokenizer()
    self._tokenizer = tokenizer

    # NOTE(review): ``token_indexers`` is never used; the indexers are chosen
    # solely from ``with_elmo``. Confirm whether that is intentional.
    indexers = {}
    if with_elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer()
    indexers["tokens"] = SingleIdTokenIndexer()
    self._token_indexers = indexers
示例8: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             use_lexicon_features: bool=False,
             use_sparse_lexicon_features: bool = False,
             multilabel: bool = False,
             with_elmo: bool = False,
             reader_format: str = 'flat') -> None:
    """Configure tokenizer, indexers, lexicon features and the reader format.

    Args:
        lazy: Forwarded to the parent constructor.
        tokenizer: Stored as ``_tokenizer``; defaults to ``WordTokenizer()``
            when falsy.
        use_lexicon_features: Stored as ``use_lexicon_features``.
        use_sparse_lexicon_features: Stored as ``use_sparse_lexicon_features``.
        multilabel: Stored as ``multilabel``.
        with_elmo: When true, an ``ELMoTokenCharactersIndexer`` is registered
            under ``"elmo"`` alongside the ``"tokens"`` indexer.
        reader_format: Stored as ``reader_format``.
    """
    super().__init__(lazy)

    if not tokenizer:
        tokenizer = WordTokenizer()
    self._tokenizer = tokenizer

    # "tokens" is always present; "elmo" is added before it when requested.
    indexers = {}
    if with_elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer()
    indexers["tokens"] = SingleIdTokenIndexer()
    self._token_indexers = indexers

    self.use_lexicon_features = use_lexicon_features
    self.use_sparse_lexicon_features = use_sparse_lexicon_features
    wants_lexicons = self.use_lexicon_features or self.use_sparse_lexicon_features
    if wants_lexicons:
        # ``lexicons`` is only defined when one of the lexicon flags is set;
        # concept lexicons win on key collisions.
        self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS}

    self.multilabel = multilabel
    self.reader_format = reader_format
示例9: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             use_lexicon_features: bool = False,
             use_sparse_lexicon_features: bool = False,
             with_elmo: bool = False
             ) -> None:
    """Configure tokenizer, token indexers and optional lexicon features.

    Args:
        lazy: Forwarded to the parent constructor.
        tokenizer: Stored as ``_tokenizer``; defaults to ``WordTokenizer()``
            when falsy.
        token_indexers: Accepted but not read by this constructor.
        use_lexicon_features: Stored as ``use_lexicon_features``.
        use_sparse_lexicon_features: Stored as ``use_sparse_lexicon_features``.
        with_elmo: When true, an ``ELMoTokenCharactersIndexer`` is registered
            under ``"elmo"`` alongside the ``"tokens"`` indexer.
    """
    super().__init__(lazy)

    if not tokenizer:
        tokenizer = WordTokenizer()
    self._tokenizer = tokenizer

    # NOTE(review): ``token_indexers`` is never used; the indexers are chosen
    # solely from ``with_elmo``. Confirm whether that is intentional.
    indexers = {}
    if with_elmo:
        indexers["elmo"] = ELMoTokenCharactersIndexer()
    indexers["tokens"] = SingleIdTokenIndexer()
    self._token_indexers = indexers

    self.use_lexicon_features = use_lexicon_features
    self.use_sparse_lexicon_features = use_sparse_lexicon_features
    wants_lexicons = self.use_lexicon_features or self.use_sparse_lexicon_features
    if wants_lexicons:
        # ``lexicons`` is only defined when one of the lexicon flags is set;
        # concept lexicons win on key collisions.
        self.lexicons = {**ALL_ACTION_LEXICONS, **ALL_CONCEPT_LEXICONS}
示例10: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             clean_citation: bool = True,
             with_elmo: bool = False
             ) -> None:
    """Configure the reader's tokenizer and its token indexers.

    Previously the value derived from ``token_indexers`` was assigned and then
    unconditionally overwritten by the ``with_elmo`` branch, so a
    caller-supplied mapping was silently dropped. An explicit
    ``token_indexers`` is now honoured; when it is omitted the behaviour is
    unchanged.

    Args:
        lazy: Forwarded to the parent constructor.
        tokenizer: Stored as ``_tokenizer``; defaults to ``WordTokenizer()``
            when falsy.
        token_indexers: Stored as ``_token_indexers`` when truthy and takes
            precedence over ``with_elmo``.
        clean_citation: Stored as ``_clean_citation``.
        with_elmo: Used only when ``token_indexers`` is not supplied. When
            true, an ``ELMoTokenCharactersIndexer`` is registered under
            ``"elmo"`` alongside the ``"tokens"`` indexer; otherwise only the
            ``"tokens"`` indexer is created.
    """
    super().__init__(lazy)
    self._clean_citation = clean_citation
    self._tokenizer = tokenizer or WordTokenizer()

    if token_indexers:
        # The caller's explicit choice wins over the built-in defaults.
        self._token_indexers = token_indexers
    elif with_elmo:
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
示例11: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             source_add_start_token: bool = True,
             max_doc_length:int = -1,
             max_query_length:int = -1,
             min_doc_length:int = -1,
             min_query_length:int = -1,
             lazy: bool = False) -> None:
    """Store tokenizer, indexers and the document/query length limits.

    Args:
        tokenizer: Stored as ``_tokenizer``; defaults to ``WordTokenizer()``
            when falsy.
        token_indexers: Stored as ``_token_indexers``; defaults to a
            lower-casing ``SingleIdTokenIndexer`` under ``"tokens"`` when falsy.
        source_add_start_token: Stored as ``_source_add_start_token``.
        max_doc_length: Stored as ``max_doc_length``.
        max_query_length: Stored as ``max_query_length``.
        min_doc_length: Stored as ``min_doc_length``.
        min_query_length: Stored as ``min_query_length``.
        lazy: Forwarded to the parent constructor.
    """
    super().__init__(lazy)

    # Author's hint: WordTokenizer(word_splitter=SimpleWordSplitter()) is a
    # little bit faster, which is useful for multi-core processing.
    if not tokenizer:
        tokenizer = WordTokenizer()
    self._tokenizer = tokenizer

    if not token_indexers:
        token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
    self._token_indexers = token_indexers

    self._source_add_start_token = source_add_start_token

    self.max_doc_length = max_doc_length
    self.max_query_length = max_query_length
    self.min_doc_length = min_doc_length
    self.min_query_length = min_query_length

    # Padding token built with text id 0.
    self.padding_value = Token(text="@@PADDING@@", text_id=0)
示例12: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             tokenizer: Tokenizer = None,
             max_sequence_length: int = None,
             ignore_labels: bool = False,
             sample: int = None,
             skip_label_indexing: bool = False,
             lazy: bool = False) -> None:
    """Initialise the parent reader, then store this reader's own options.

    Args:
        token_indexers: Forwarded to the parent and stored as
            ``_token_indexers``; defaults to ``{'tokens': SingleIdTokenIndexer()}``
            when falsy.
        tokenizer: Forwarded to the parent and stored as ``_tokenizer``;
            defaults to ``WordTokenizer()`` when falsy.
        max_sequence_length: Forwarded to the parent and stored as
            ``_max_sequence_length``.
        ignore_labels: Stored as ``_ignore_labels``.
        sample: Stored as ``_sample``.
        skip_label_indexing: Forwarded to the parent and stored as
            ``_skip_label_indexing``.
        lazy: Forwarded to the parent constructor.
    """
    parent_kwargs = dict(lazy=lazy,
                         token_indexers=token_indexers,
                         tokenizer=tokenizer,
                         max_sequence_length=max_sequence_length,
                         skip_label_indexing=skip_label_indexing)
    super().__init__(**parent_kwargs)

    if not tokenizer:
        tokenizer = WordTokenizer()
    self._tokenizer = tokenizer

    self._sample = sample
    self._max_sequence_length = max_sequence_length
    self._ignore_labels = ignore_labels
    self._skip_label_indexing = skip_label_indexing

    if not token_indexers:
        token_indexers = {'tokens': SingleIdTokenIndexer()}
    self._token_indexers = token_indexers

    # ``_segment_sentences`` is not assigned in this constructor; presumably
    # the parent class sets it (TODO confirm).
    if self._segment_sentences:
        self._sentence_segmenter = SpacySentenceSplitter()
示例13: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             ) -> None:
    """Store the tokenizer and token indexers, with defaults for omitted ones.

    Args:
        lazy: Forwarded to the parent constructor.
        tokenizer: Stored as ``_tokenizer``; defaults to ``WordTokenizer()``
            when falsy.
        token_indexers: Stored as ``_token_indexers``; defaults to
            ``{"tokens": SingleIdTokenIndexer()}`` when falsy.
    """
    super().__init__(lazy)

    if not tokenizer:
        tokenizer = WordTokenizer()
    self._tokenizer = tokenizer

    if not token_indexers:
        token_indexers = {"tokens": SingleIdTokenIndexer()}
    self._token_indexers = token_indexers
示例14: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self,
             db: FeverDocDB,
             wiki_tokenizer: Tokenizer = None,
             claim_tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    """Configure the reader with a document database, tokenizers and indexers.

    Args:
        db: Document database; its ``get_doc_ids()`` result seeds the formatter.
        wiki_tokenizer: Stored as ``_wiki_tokenizer``; a fresh ``WordTokenizer``
            is used when the argument is falsy.
        claim_tokenizer: Stored as ``_claim_tokenizer``; a fresh
            ``WordTokenizer`` is used when the argument is falsy.
        token_indexers: Stored as ``_token_indexers``; falls back to a single
            ``SingleIdTokenIndexer`` under the key ``'tokens'`` when falsy.
    """
    # Substitute defaults for any collaborator the caller left out.
    if not wiki_tokenizer:
        wiki_tokenizer = WordTokenizer()
    self._wiki_tokenizer = wiki_tokenizer

    if not claim_tokenizer:
        claim_tokenizer = WordTokenizer()
    self._claim_tokenizer = claim_tokenizer

    if not token_indexers:
        token_indexers = {'tokens': SingleIdTokenIndexer()}
    self._token_indexers = token_indexers

    self.db = db
    # The formatter receives the set of document ids available in the database.
    known_doc_ids = set(self.db.get_doc_ids())
    self.formatter = FEVERSentenceFormatter(known_doc_ids, FEVERLabelSchema())
    self.reader = JSONLineReader()
示例15: __init__
# 需要导入模块: from allennlp.data import tokenizers [as 别名]
# 或者: from allennlp.data.tokenizers import WordTokenizer [as 别名]
def __init__(self, lazy: bool = False,
             max_bag_size: int = 25,
             negative_exampels_percentage: int = 100,
             with_direct_supervision: bool = True) -> None:
    """Set reader options and reset the bookkeeping counters.

    Args:
        lazy: Lazy reading of the dataset; forwarded to the parent constructor.
        max_bag_size: Maximum number of sentences per bag.
        negative_exampels_percentage: Percentage of negative examples to keep.
        with_direct_supervision: Keep or ignore direct-supervision examples.
    """
    super().__init__(lazy=lazy)

    # User-facing options.
    self.max_bag_size = max_bag_size
    self.negative_exampels_percentage = negative_exampels_percentage
    self.with_direct_supervision = with_direct_supervision

    # Fixed text pipeline: a WordTokenizer using JustSpacesWordSplitter and a
    # single id indexer under "tokens".
    space_splitter = JustSpacesWordSplitter()
    self._tokenizer = WordTokenizer(word_splitter=space_splitter)
    self._token_indexers = {"tokens": SingleIdTokenIndexer()}

    # Bookkeeping used for logging and input validation.
    self._inst_counts: Dict = defaultdict(int)  # instances per relation type
    self._pairs: Set = set()  # entity pairs seen so far
    self._bag_sizes: Dict = defaultdict(int)  # relation types per bag
    # Original comment repeated "relation types per bag"; the name suggests
    # relation co-occurrence counts (TODO confirm).
    self._relation_coocur: Dict = defaultdict(int)
    self._failed_mentions_count: int = 0  # mentions with wrong formatting
    self._count_direct_supervised_inst: int = 0
    self._count_bag_labels: Dict = defaultdict(int)