本文整理汇总了 Python 中 spacy.tokens 模块的典型用法代码示例。如果您正苦于以下问题:Python spacy.tokens 的具体用法是什么?Python spacy.tokens 怎么用?有哪些 Python spacy.tokens 的使用例子?那么,这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该模块所在的包 spacy 的用法示例。
在下文中一共展示了 spacy.tokens 模块的 15 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的 Python 代码示例。
示例1: __new__
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def __new__(
    cls,
    span,
    mention_index,
    utterance_index,
    utterance_start_sent,
    speaker=None,
    gold_label=None,
    *args,
    **kwargs,
):
    """Allocate the instance through ``spacy.tokens.Span.__new__``.

    ``__new__`` has to be overridden for this subclass; see
    http://cython.readthedocs.io/en/latest/src/userguide/special_methods.html

    Only ``span.doc``, ``span.start`` and ``span.end`` (plus any extra
    ``*args`` / ``**kwargs``) are forwarded to ``Span.__new__``.
    ``mention_index``, ``utterance_index``, ``utterance_start_sent``,
    ``speaker`` and ``gold_label`` are accepted here but not used by this
    method.
    """
    source_doc, first, last = span.doc, span.start, span.end
    return spacy.tokens.Span.__new__(cls, source_doc, first, last, *args, **kwargs)
示例2: __init__
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def __init__(
    self,
    language: str = "en_core_web_sm",
    pos_tags: bool = False,
    parse: bool = False,
    ner: bool = False,
    keep_spacy_tokens: bool = False,
    split_on_spaces: bool = False,
    start_tokens: Optional[List[str]] = None,
    end_tokens: Optional[List[str]] = None,
) -> None:
    """Configure the spaCy-backed tokenizer.

    Parameters
    ----------
    language, pos_tags, parse, ner
        Forwarded unchanged to ``get_spacy_model`` to obtain the pipeline.
    keep_spacy_tokens : bool
        Stored as ``self._keep_spacy_tokens``.
    split_on_spaces : bool
        When true, the pipeline's tokenizer is replaced by a
        ``_WhitespaceSpacyTokenizer`` built on the pipeline's vocab.
    start_tokens, end_tokens : Optional[List[str]]
        Extra token strings; ``None`` is treated as an empty list. The
        caller's lists are copied and never modified.
    """
    self.spacy = get_spacy_model(language, pos_tags, parse, ner)
    if split_on_spaces:
        self.spacy.tokenizer = _WhitespaceSpacyTokenizer(self.spacy.vocab)
    self._keep_spacy_tokens = keep_spacy_tokens
    # We reverse the tokens here because we're going to insert them with `insert(0)` later;
    # this makes sure they show up in the right order.
    # The reversal is done on a private copy: reversing `start_tokens` in place
    # would silently reorder the list object owned by the caller.
    self._start_tokens = list(reversed(start_tokens or []))
    # Copied as well so later changes to the caller's list cannot leak in.
    self._end_tokens = list(end_tokens or [])
示例3: _sanitize
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def _sanitize(self, tokens: List[spacy.tokens.Token]) -> List[Token]:
    """
    Converts spaCy tokens to allennlp tokens, then adds the configured
    start and end tokens.

    When ``self._keep_spacy_tokens`` is set, the conversion is skipped and
    the incoming list itself is extended and returned.
    """
    if self._keep_spacy_tokens:
        sanitized = tokens
    else:
        sanitized = []
        for spacy_token in tokens:
            begin = spacy_token.idx
            sanitized.append(
                Token(
                    spacy_token.text,
                    begin,
                    begin + len(spacy_token.text),
                    spacy_token.lemma_,
                    spacy_token.pos_,
                    spacy_token.tag_,
                    spacy_token.dep_,
                    spacy_token.ent_type_,
                )
            )
    # Each start token is pushed onto the front in turn, so the last one
    # in `self._start_tokens` ends up first in the result.
    for start_token in self._start_tokens:
        sanitized.insert(0, Token(start_token, 0))
    sanitized.extend(Token(end_token, -1) for end_token in self._end_tokens)
    return sanitized
示例4: __init__
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
    """Load the WordNet corpus reader and a spaCy pipeline.

    Args:
        skip_download_check: When false (the default), ``nltk.download("wordnet")``
            is called before the spaCy model is loaded.
        spacy_model: Name passed to ``spacy.load``.

    Raises:
        ImportError: If nltk, or spaCy, cannot be imported. The original
            import failure is attached as the exception's cause.
    """
    try:
        import nltk
        from nltk.corpus import wordnet
    except ImportError as err:
        raise ImportError(
            "WordNet-based data augmentation requires nltk to be installed."
        ) from err
    self.wn = wordnet
    try:
        import spacy
        from spacy.tokens import Token
    except ImportError as err:
        raise ImportError(
            "WordNet-based data augmentation requires spaCy and a language "
            "model to be installed (for part of speech tagging)."
        ) from err
    if not skip_download_check:
        nltk.download("wordnet")
    # NOTE(review): `parser=` / `tagger=` / `entity=` look like legacy spaCy 1.x
    # switches; newer spaCy releases select components via `disable=[...]`.
    # Left untouched here - confirm against the pinned spaCy version.
    self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
    # `force=True` lets the extension be registered again without an error
    # when this constructor runs more than once in the same process.
    Token.set_extension("replacement", default=None, force=True)
示例5: get_sentence_tokens
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def get_sentence_tokens(texts, charoffsets):
    """Cut tokens out of the concatenated text and record per-sentence token ranges.

    ``texts`` is joined into one string. ``charoffsets`` holds one list of
    ``(start, end)`` character pairs per sentence; each pair is sliced out of
    the joined string and kept only if the slice is non-empty.

    Returns ``(tokens, sentence_offsets)`` where ``sentence_offsets[i]`` is the
    half-open ``(first, last)`` range of indices in ``tokens`` that belong to
    sentence ``i``.
    """
    joined = "".join(texts)
    tokens = []
    sentence_offsets = []
    for offset_list in charoffsets:
        # The running token count doubles as the sentence's first token index.
        first = len(tokens)
        pieces = (joined[start:end] for start, end in offset_list)
        tokens.extend(piece for piece in pieces if piece)
        sentence_offsets.append((first, len(tokens)))
    return tokens, sentence_offsets
示例6: remove_stopwords
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def remove_stopwords(self, sentence_str: str=None, tokens: List[Token]=None, use_lemma: bool=True) -> str:
    """Return the sentence with punctuation and stop words dropped.

    Keyword Arguments:
        sentence_str {str} -- input sentence; run through ``self.model`` only
            when no tokens are supplied (default: {None})
        tokens {List[Token]} -- pre-computed tokens; take precedence over
            ``sentence_str`` when non-empty (default: {None})
        use_lemma {bool} -- join lemmas when true, surface texts otherwise
            (default: {True})

    Returns:
        str -- the kept tokens joined by single spaces; empty when neither
            tokens nor a sentence are given
    """
    if not tokens:
        tokens = self.model(sentence_str) if sentence_str else []

    attr = 'lemma_' if use_lemma else 'text'  # what to merge
    kept = []
    for token in tokens:
        if token.is_punct:
            continue
        # A token is dropped when either its surface form or its lemma is a stop word.
        if token.text in STOP_WORDS or token.lemma_ in STOP_WORDS:
            continue
        kept.append(getattr(token, attr))
    return ' '.join(kept)
示例7: _generate_partly_censored_word
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def _generate_partly_censored_word(self, word: Union[str, spacy.tokens.Token], profane_word: str) -> str:
    """Censor only the part of ``word`` that lines up with ``profane_word``.

    ``word`` may be a plain string or an object exposing ``.text``. The
    matching part is located on the lower-cased word, then every
    case-insensitive occurrence of it in ``word`` is replaced by the output
    of ``self._generate_fully_censored_word``.
    """
    edge_ops = ('delete', 'insert')

    # noinspection PyShadowingNames
    def find_word_part(word: str, word_part: str) -> str:
        # NOTE(review): presumably opcodes are difflib-style
        # (tag, src_start, src_end, dst_start, dst_end) tuples - confirm
        # against the Levenshtein package in use.
        opcodes = Levenshtein.opcodes(word, word_part)
        first_op, last_op = opcodes[0], opcodes[-1]
        # Trim a leading / trailing delete-or-insert edit off the word.
        begin = first_op[2] if first_op[0] in edge_ops else 0
        finish = last_op[1] if last_op[0] in edge_ops else len(word)
        return word[begin:finish]

    try:
        word = word.text
    except AttributeError:
        # Already a plain string.
        pass

    matched_part = find_word_part(word.lower(), profane_word)
    censored_part = self._generate_fully_censored_word(word=matched_part)
    return regex.sub(
        pattern=re.escape(matched_part),
        repl=censored_part,
        string=word,
        flags=regex.IGNORECASE,
    )
示例8: test_spacy_training_sample_alignment
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def test_spacy_training_sample_alignment(spacy_nlp_component):
    """Docs built for training data stay aligned with the examples.

    Three messages are used, the middle one with empty text; every message
    must yield a ``Doc`` at its own position, with lower-cased token texts.
    """
    from spacy.tokens import Doc

    texts = ("I have a feeling", "", "I am the last message")
    examples = [Message.build(text=text, intent="feeling") for text in texts]
    td = TrainingData(training_examples=examples)

    text_docs = spacy_nlp_component.docs_for_training_data(td)["text"]

    for position in range(3):
        assert isinstance(text_docs[position], Doc)

    expected_tokens = (
        ["i", "have", "a", "feeling"],
        [],
        ["i", "am", "the", "last", "message"],
    )
    for position, words in enumerate(expected_tokens):
        assert [t.text for t in text_docs[position]] == words
示例9: create_nlp_instance
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def create_nlp_instance():
    """Load the ``'en'`` spaCy pipeline and attach emoji and hashtag components.

    A spacymoji ``Emoji`` component is added with ``first=True`` and
    ``hashtag_pipe`` is added with the default placement. The configured
    pipeline is returned.
    """
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('en')
    nlp.add_pipe(Emoji(nlp), first=True)

    def merge_next_hashtag(doc):
        """Try to merge one '#' token; return True as soon as a merge succeeds."""
        for token in doc:
            if token.text != '#' or token.head is None:
                continue
            # Span runs from the '#' character over len(head text) + 1 characters.
            start_index = token.idx
            end_index = start_index + len(token.head.text) + 1
            if doc.merge(start_index, end_index) is not None:
                return True
        return False

    # Merge hashtag tokens which were split by spacy
    def hashtag_pipe(doc):
        # The scan restarts from the beginning after every successful merge
        # and stops after the first full pass that merges nothing.
        while merge_next_hashtag(doc):
            pass
        return doc

    nlp.add_pipe(hashtag_pipe)
    return nlp
示例10: _form_ann_line
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def _form_ann_line(
idx: str,
char_offset: Tuple[int, int, str],
tag_name: str,
doc: spacy.tokens.doc.Doc,
):
""" Forms a ann line that can be used to write the ANN files for CoNLL format
Parameters
----------
idx : int
The index for the entity being written
char_offset : int
THe start, end, tag for the line
tag_name : str
The tag to be used and is one of ``[Task, Process, Material]``
doc : str
Spacy doc to query the appropriate characters
Returns
-------
str
An ANN line that is formed.
"""
start_offset, end_offset, entity_type = char_offset
surface_form = doc.char_span(start_offset, end_offset).text
start_offset = str(start_offset)
end_offset = str(end_offset)
ann_line = " ".join([start_offset, end_offset])
ann_line = "\t".join([ann_line, surface_form])
ann_line = " ".join([tag_name, ann_line])
ann_line = "\t".join([f"T{idx}", ann_line])
return ann_line
示例11: batch_tokenize
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def batch_tokenize(self, texts: List[str]) -> List[List[Token]]:
    """Tokenize every text with ``self.spacy.pipe`` and sanitize each result.

    For each document yielded by the pipeline, whitespace tokens are removed
    with ``_remove_spaces`` and the remainder is passed through
    ``self._sanitize``. One token list is returned per yielded document.
    """
    batches = []
    for doc in self.spacy.pipe(texts, n_threads=-1):
        without_spaces = _remove_spaces(doc)
        batches.append(self._sanitize(without_spaces))
    return batches
示例12: _remove_spaces
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def _remove_spaces(tokens: List[spacy.tokens.Token]) -> List[spacy.tokens.Token]:
return [token for token in tokens if not token.is_space]
示例13: __init__
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def __init__(self, add_unk=True):
# init dictionaries
self.item2idx: Dict[str, int] = {}
self.idx2item: List[str] = []
# in order to deal with unknown tokens, add <unk>
if add_unk:
self.add_item('<unk>')
示例14: spacy_get_pos
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def spacy_get_pos(tokens):
    """Return one ``pos_`` value per entry of ``tokens``.

    A ``Doc`` is built directly from the given words on the module-level
    ``nlp`` vocab, then every component in ``nlp.pipeline`` is called on it
    in order. The components' return values are ignored.
    """
    doc = spacy.tokens.doc.Doc(nlp.vocab, words=tokens)
    for _name, component in nlp.pipeline:
        component(doc)
    pos_tags = []
    for word in doc:
        pos_tags.append(word.pos_)
    return pos_tags
示例15: iterative_abs
# 需要导入模块: import spacy [as 别名]
# 或者: from spacy import tokens [as 别名]
def iterative_abs(debug_num=None):
    """Walk the abstract file line by line, tokenizing and POS-tagging each record.

    Every line of ``config.ABS_WIKI_FILE`` is parsed as JSON. Its ``'text'``
    and ``'charoffset'`` fields go through ``get_sentence_tokens``, the tokens
    through ``spacy_get_pos``, and the tokens and sentence offsets are printed.

    In the code visible here ``debug_num`` only replaces the progress-bar
    total (``init_inspect.TOTAL_NUM_DOC`` otherwise); it does not limit how
    many lines are read.
    """
    if debug_num is None:
        total_doc_num = init_inspect.TOTAL_NUM_DOC
    else:
        total_doc_num = debug_num
    # NOTE(review): never read or updated in the visible code; kept in case
    # the snippet is truncated and later lines rely on it.
    cur_count = 0
    with open(config.ABS_WIKI_FILE, 'rb') as abs_file:
        for raw_line in tqdm(abs_file, total=total_doc_num):
            record = json.loads(raw_line)
            tokens, sent_offset = get_sentence_tokens(record['text'], record['charoffset'])
            poss = spacy_get_pos(tokens)
            # One POS tag is expected per token.
            assert len(tokens) == len(poss)
            print(tokens)
            print(sent_offset)