This page collects typical usage examples of nltk.tokenize.TreebankWordTokenizer in Python (TreebankWordTokenizer is a tokenizer class in the nltk.tokenize module). If you are wondering what tokenize.TreebankWordTokenizer does, how to use it, or what real code built on it looks like, the curated examples below should help. You can also explore further usage examples from the containing module, nltk.tokenize.

A total of 15 code examples of tokenize.TreebankWordTokenizer are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
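Before the examples, here is a minimal standalone sketch of the class in isolation (the sample sentence is made up; tokenize is the standard method of TreebankWordTokenizer):

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
# Penn Treebank conventions: contractions and sentence-final punctuation become separate tokens
print(tokenizer.tokenize("Don't hesitate to ask questions."))
# expected: ['Do', "n't", 'hesitate', 'to', 'ask', 'questions', '.']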
Example 1: __init__

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    #return pkgutil.get_data('scattertext',
    #                        'data/viz/semiotic_new.html').decode('utf-8')
    path = os.path.dirname(sys.modules['scattertext'].__file__) + '/data/'
    tokenizer_fn = path + 'punkt.english.pickle'
    tagger_fn = path + 'averaged_perceptron_tagger.pickle'
    #tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
    #tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # Note: nltk.word_tokenize also calls the TreebankWordTokenizer, but goes through the downloader.
    # Instantiating TreebankWordTokenizer directly like this skips the downloader:
    # it implements PTB tokenization with regexes, so no downloads are needed.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html

Example 2: __init__

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def __init__(self):
    import nltk
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # Note: nltk.word_tokenize also calls the TreebankWordTokenizer, but goes through the downloader.
    # Instantiating TreebankWordTokenizer directly like this skips the downloader:
    # it implements PTB tokenization with regexes, so no downloads are needed.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html

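The two constructors above only load the components. A hedged sketch of how such an object would typically be used afterwards; the tag_text helper and the text it receives are illustrative and not part of scattertext or phrasemachine, but sent_detector.tokenize, self.tokenize, and PerceptronTagger.tag are the standard calls:

def tag_text(self, text):
    # Sketch: split into sentences with the Punkt detector, tokenize each sentence with
    # the Treebank tokenizer, then POS-tag the tokens with the loaded PerceptronTagger.
    tagged_sentences = []
    for sent in self.sent_detector.tokenize(text):
        tokens = self.tokenize(sent)
        tagged_sentences.append(self.tagger.tag(tokens))
    return tagged_sentences
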
Example 3: main

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def main(tag_gt, word2idx, zipname):
    with zf.ZipFile(zipname) as myzip:
        namelist = myzip.namelist()
        print('namelist:', namelist)
        datainfo = myzip.open(namelist[-1], 'r')
        info_dict = json.load(datainfo)
        sentences = info_dict['sentences']
    tokenizer = TreebankWordTokenizer()
    for sentence in sentences:
        video_id = sentence['video_id']
        video_idx = int(video_id[5:])
        caption = sentence['caption']
        words = tokenizer.tokenize(caption)
        for word in words:
            if word in word2idx:
                tag_gt[video_idx, word2idx[word]] = 1

Example 4: tokenise

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def tokenise(text, asbytes=True, append_eos=False):
    text = text.decode() if asbytes else text
    if use_nltk:
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
    else:
        for char in string.punctuation + '()-–':
            text = text.replace(char, ' ' + char + ' ')
        tokens = text.lower().split(' ')
    tokens = [w.encode() if asbytes else w for w in tokens if w.strip() != '']
    if append_eos:
        tokens.append(EOS.encode() if asbytes else EOS)
    # tokens = np.asarray(tokens)
    # return np.asarray(tokens)
    return tokens

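A short, hedged usage sketch for tokenise. It assumes the module-level flag use_nltk is True, that sent_tokenize is imported from nltk.tokenize, and that EOS is a string constant such as '<eos>'; those names come from the surrounding project, but the values here are guesses:

use_nltk = True
EOS = '<eos>'  # assumed end-of-sequence marker

# bytes in, bytes out (asbytes=True), with the EOS marker appended
print(tokenise(b"What's the answer?", asbytes=True, append_eos=True))
# roughly: [b'what', b"'s", b'the', b'answer', b'?', b'<eos>']
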
Example 5: stem_and_tokenize_text

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)

Example 6: tokenize

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def tokenize(sentence):
    "Tokenize a sentence the way the parser expects."
    tokenizer = TreebankWordTokenizer()
    s = tokenizer.tokenize(sentence)
    s = ' '.join(s)
    # character replacements
    s = ''.join(REPLACEMENTS_R.get(x, x) for x in s)
    return s

Example 7: fresh

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def fresh(self, s, tokenized=False):
    """UD-parse and POS-tag sentence `s`. Returns (UDParse, PTB-parse-string).
    Pass in `tokenized=True` if `s` has already been tokenized; otherwise we
    apply `nltk.tokenize.TreebankWordTokenizer`.
    """
    if self.process is None:
        self._start_subprocess()
    s = str(s.strip())
    if not tokenized:
        s = tokenize(s)
    s = s.strip()
    assert '\n' not in s, "No newline characters allowed %r" % s
    try:
        self.process.stdin.write(s.encode('utf-8'))
    except IOError as e:
        #if e.errno == 32:  # broken pipe
        #    self.process = None
        #    return self(s)  # retry will restart the process
        raise e
    self.process.stdin.write(b'\n')
    self.process.stdin.flush()
    out = self.process.stdout.readline()
    if sys.version_info[0] == 3:
        out = out.decode()
    return self.to_ud(out)

Example 8: parse

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def parse(self, sentence):
    s_toks = TreebankWordTokenizer().tokenize(sentence)
    sentence = ' '.join(s_toks).lower()
    return sentence

Example 9: get

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def get(self, text=["medical"]):
    if type(text) == str:
        text = text.lower()
        text = TreebankWordTokenizer().tokenize(text)
    try:
        # note: under Python 3, map() returns an iterator, so np.array would need list(map(...))
        data = np.array(map(self.vocab.get, text))
        return self.onehot(data), data
    except:
        unknowns = []
        for word in text:
            if self.vocab.get(word) == None:
                unknowns.append(word)
        raise Exception(" [!] unknown words: %s" % ",".join(unknowns))

Example 10: char_pos_to_word

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def char_pos_to_word(text, tokens, char_pos, asbytes=True):
    ix = 0
    text = text.decode() if asbytes else text
    if use_nltk:
        sents = [s for s in sent_tokenize(text)]
        spans = [[s for s in TreebankWordTokenizer().span_tokenize(sent)] for sent in sents]
        # lens = [len(sent)+1 for sent in sents]
        offsets = []
        for i, sent in enumerate(sents):
            offsets.append(text.find(sent, offsets[i-1] + len(sents[i-1]) if i > 0 else 0))  # can we do this faster?
        spans = [(span[0] + offsets[i], span[1] + offsets[i]) for i, sent in enumerate(spans) for span in sent]
        # print(char_pos)
        for ix, s in enumerate(spans):
            # print(s, tokens[ix])
            if s[1] > char_pos:
                return ix
        print("couldn't find the char pos via nltk")
        print(text, char_pos, len(text))
    else:
        tokens = [t.decode() for t in tokens]
        if char_pos > len(text):
            print("Char pos doesn't fall within size of text!")
        for t, token in enumerate(tokens):
            for char in token:
                ix = text.find(char, ix)
                ix += 1
                if ix >= char_pos:
                    return t
        print("couldn't find the char pos")
        print(text, tokens, char_pos, len(text))

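The offset arithmetic above relies on span_tokenize, which yields character (start, end) pairs rather than token strings; a small standalone sketch (sample text made up):

from nltk.tokenize import TreebankWordTokenizer

text = "The answer is 42."
for start, end in TreebankWordTokenizer().span_tokenize(text):
    print((start, end), repr(text[start:end]))
# roughly: (0, 3) 'The'  (4, 10) 'answer'  (11, 13) 'is'  (14, 16) '42'  (16, 17) '.'
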
# Filter a complete context down to the sentence containing the start of the answer span
Example 11: filter_context

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def filter_context(ctxt, char_pos, window_size_before=0, window_size_after=0, max_tokens=-1):
    sents = [s for s in sent_tokenize(ctxt)]
    spans = [[s for s in TreebankWordTokenizer().span_tokenize(sent)] for sent in sents]
    # lens = [len(sent)+1 for sent in sents]
    offsets = []
    for i, sent in enumerate(sents):
        # print(ctxt.find(sent, offsets[i-1]+len(sents[i-1]) if i>0 else 0))
        # print(len(sents[i-1]) if i>0 else 0)
        # print(offsets[i-1] if i>0 else 0)
        # print(offsets[i-1]+len(sents[i-1]) if i>0 else 0)
        offsets.append(ctxt.find(sent, offsets[i-1] + len(sents[i-1]) if i > 0 else 0))  # can we do this faster?
    spans = [[(span[0] + offsets[i], span[1] + offsets[i]) for span in sent] for i, sent in enumerate(spans)]
    for ix, sent in enumerate(spans):
        # print(sent[0][0], sent[-1][1], char_pos)
        if char_pos >= sent[0][0] and char_pos < sent[-1][1]:
            start = max(0, ix - window_size_before)
            end = min(len(sents) - 1, ix + window_size_after)
            # print(start, end, start, offsets[start])
            # new_ix = char_pos - offsets[start]
            # print(new_ix)
            # print(" ".join(sents[start:end+1])[new_ix:new_ix+10])
            flat_spans = [span for sen in spans for span in sen]
            if max_tokens > -1 and len([span for sen in spans[start:end+1] for span in sen]) > max_tokens:
                for i, span in enumerate(flat_spans):
                    if char_pos < span[1]:
                        tok_ix = i
                        # print(span, char_pos)
                        break
                start_ix = max(spans[start][0][0], flat_spans[max(tok_ix - max_tokens, 0)][0])
                end_ix = min(spans[end][-1][1], flat_spans[min(tok_ix + max_tokens, len(flat_spans) - 1)][1])
                # if len(flat_spans[start_tok:end_tok+1]) > 21:
                #     print(start_tok, end_tok, tok_ix)
                #     print(flat_spans[tok_ix])
                #     print(flat_spans[start_tok:end_tok])
                #     print(ctxt[flat_spans[start_tok][0]:flat_spans[end_tok][1]])
                return ctxt[start_ix:end_ix], char_pos - start_ix
            else:
                return " ".join(sents[start:end+1]), char_pos - offsets[start]
    print("couldn't find the char pos")
    print(ctxt, char_pos, len(ctxt))

Example 12: get_vocab

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def get_vocab(corpus, vocab_size=2000):
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens
    vocab = {PAD: 0, OOV: 1, SOS: 2, EOS: 3}
    word_count = defaultdict(float)
    for l in corpus:
        # for w in l.lower().split():
        for w in tokenise(l):
            word_count[w] += 1
    vocab_list = sorted(word_count, key=word_count.__getitem__, reverse=True)[:min(vocab_size, len(word_count))]
    for w in vocab_list:
        vocab[w] = len(vocab)
    return vocab

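A hedged usage sketch for get_vocab; the special-token constants and the tiny corpus are assumptions for illustration (the real project defines PAD/OOV/SOS/EOS elsewhere, and sent_tokenize/defaultdict must already be imported):

PAD, OOV, SOS, EOS = '<pad>', '<oov>', '<sos>', '<eos>'  # assumed values

corpus = ["The cat sat on the mat.", "The dog sat too."]
vocab = get_vocab(corpus, vocab_size=10)
# ids 0-3 are reserved for PAD/OOV/SOS/EOS; the most frequent lower-cased tokens follow
print(vocab[PAD], vocab.get('the'), vocab.get('sat'))
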
Example 13: get_glove_vocab

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def get_glove_vocab(path, size=2000, d=200, variant='6B', filter_to_squad=False):
    # this is a copy of the function in preprocessing.py - but we can't use it as we'd get a circular import!
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens
    vocab = {PAD: 0, OOV: 1, SOS: 2, EOS: 3}
    if filter_to_squad:
        squad_words = set()
        squad_train = load_squad_triples(path, dev=False)
        squad_dev = load_squad_triples(path, dev=True)
        for triple in squad_train + squad_dev:
            squad_words |= set(tokenise(triple[0]))
            squad_words |= set(tokenise(triple[1]))
            squad_words |= set(tokenise(triple[2]))
    with open(path + 'glove.' + variant + '/glove.' + variant + '.' + str(d) + 'd.txt') as fp:
        entries = fp.readlines()
    for i, row in enumerate(entries):
        if len(vocab) - 4 >= size and size > 0:
            break
        cols = row.strip().split(' ')
        if len(cols) < d + 1:
            print(row)
        if (filter_to_squad and cols[0] in squad_words) or not filter_to_squad:
            vocab[cols[0]] = len(vocab)
    return vocab

# def get_vocab(corpus, vocab_size=1000):
#     lines = [re.sub(r'([\,\?\!\.]+)', r' \1 ', line).lower() for line in corpus]
#     # lines = re.split('[\n]+', raw_data.lower())
#     vocab = {PAD: 0, OOV: 1, SOS: 2, EOS: 3}
#     word_count = defaultdict(float)
#     for l in lines:
#         for w in l.split():
#             word_count[w] += 1
#     vocab_list = sorted(word_count, key=word_count.__getitem__, reverse=True)[:min(vocab_size, len(word_count))]
#     for w in vocab_list:
#         vocab[w] = len(vocab)
#     return vocab

Example 14: tokenizeit

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def tokenizeit(store):
    # NOTE: how to tokenize stuff with &, like AT&T, S&L or S&P? Note this
    # seems to be done differently in different corpora.
    tokenizer = TreebankWordTokenizer()
    do_not_tokenize = ['Mr.','Dr.','Mrs.','Ms.','Prof.','Jr.','Sr.','Rep.',
                       'Sen.','Rev.','St.','Lt.','Gov.','Gen.','Brig.','Maj.','Col.','Capt.',
                       'Sgt.',
                       'U.S.','U.K.','U.N.','L.A.','U.S.S.R.','U.S.A.','B.C.',
                       'N.V.','G.m.b.H.','S.p.A.','B.V.','N.A.',
                       'Pty.','S.A.','Ltd.','Inc.','Bros.','Corp.','Co.','CORP.','L.P.','A.G.',
                       'Ltda.','E.U.','I.B.M.','D.T.',
                       'Nov.','Dec.','Jan.','Feb.','Aug.','Sept.','Sep.','Oct.','a.m.','p.m.',
                       'Mass.','Calif.','N.J.','N.M.','N.Y.','N.C.','N.H.','R.I.','Ky.','Va.',
                       'S.C.','Neb.',
                       'Wash.','Mich.','Conn.','D.C.','Ark.','Pa.','Ind.','Ariz.','Miss.','Fla.',
                       'Del.','Nev.','Ore.','Tenn.','Mont.','Ill.','Ala.','Wis.','Ga.','La.',
                       'Mo.','Vt.',
                       'Blvd.','Ave.','Ln.','Rd.',
                       'No.']
    pat = re.compile(r'[0-9][.,]{0,1}[0-9]*')
    for i, x in enumerate(store):
        if x[0] == '\n':
            store[i] = ([x[0]], store[i][1])
        #elif any([i in x[0] for i in do_not_tokenize]) and
        #elif '$' not in x[0] and '%' not in x[0]:  #x[0] in do_not_tokenize: #{'Mr.','Dr.'}:
        elif x[0] in do_not_tokenize:
            toks = [x[0]]
            store[i] = (toks, store[i][1])
        elif shall_use_split(x[0], do_not_tokenize):
            #x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            toks = x[0].split(' ')
            #print 'Plain split on: ', x[0]
            store[i] = (toks, store[i][1])
        else:
            toks = tokenizer.tokenize(x[0])
            # if '$' not in x[0] and '%' not in x[0] and "'" not in x[0] and "`" not in x[0] and x[0][-1] != '.' and not pat.match(x[0]):
            #     toks = regtok(x[0])
            # elif x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            #     toks = x[0].split(' ')
            # elif x[0][0:4] in do_not_tokenize:
            #     toks = [x[0][0:4]]
            #     toks.extend(x[0][4:].split(' '))
            #     toks = [i for i in toks if i != '']
            #     print toks
            # else:
            #     toks = word_tokenize(x[0])
            store[i] = (toks, store[i][1])
    return store

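Both versions of tokenizeit call a shall_use_split helper that is not shown on this page. Judging only from the commented-out condition next to the call, a plausible reconstruction might look like the sketch below; this is an assumption, not the original implementation:

def shall_use_split(token, do_not_tokenize):
    # Assumed behaviour: strings ending in a protected company-style abbreviation are
    # split on spaces so the Treebank tokenizer does not detach the trailing period.
    return any(token.endswith(abbrev) for abbrev in ('Inc.', 'N.V.', 'Ltd.', 'Corp.', 'S.p.A.', 'Co.'))
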
Example 15: tokenizeit

# Required import: from nltk import tokenize [as alias]
# Alternatively: from nltk.tokenize import TreebankWordTokenizer [as alias]
def tokenizeit(store):
    # NOTE: how to tokenize stuff with &, like AT&T, S&L or S&P? Note this
    # seems to be done differently in different corpora.
    tokenizer = TreebankWordTokenizer()
    do_not_tokenize = ['Mr.','Dr.','Mrs.','Ms.','Prof.','Jr.','Sr.','Rep.',
                       'Sen.','Rev.','St.','Lt.','Gov.','Gen.','Brig.','Maj.','Col.','Capt.',
                       'Sgt.','M.D.',
                       'U.S.','U.K.','U.N.','L.A.','U.S.S.R.','U.S.A.','B.C.',
                       'N.V.','G.m.b.H.','S.p.A.','B.V.','N.A.',
                       'Pty.','S.A.','Ltd.','Inc.','Bros.','Corp.','Co.','CORP.','L.P.','A.G.',
                       'Ltda.','E.U.','I.B.M.','D.T.',
                       'Nov.','Dec.','Jan.','Feb.','Aug.','Sept.','Sep.','Oct.','a.m.','p.m.',
                       'Mass.','Calif.','N.J.','N.M.','N.Y.','N.C.','N.H.','R.I.','Ky.','Va.',
                       'S.C.','Neb.',
                       'Wash.','Mich.','Conn.','D.C.','Ark.','Pa.','Ind.','Ariz.','Miss.','Fla.',
                       'Del.','Nev.','Ore.','Tenn.','Mont.','Ill.','Ala.','Wis.','Ga.','La.',
                       'Mo.','Vt.',
                       'Blvd.','Ave.','Ln.','Rd.',
                       'No.']
    pat = re.compile(r'[0-9][.,]{0,1}[0-9]*')
    for i, x in enumerate(store):
        if x[0] == '\n':
            store[i] = ([x[0]], store[i][1])
        #elif any([i in x[0] for i in do_not_tokenize]) and
        #elif '$' not in x[0] and '%' not in x[0]:  #x[0] in do_not_tokenize: #{'Mr.','Dr.'}:
        elif x[0] in do_not_tokenize:
            toks = [x[0]]
            store[i] = (toks, store[i][1])
        elif shall_use_split(x[0], do_not_tokenize):
            #x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            toks = x[0].split(' ')
            #print 'Plain split on: ', x[0]
            store[i] = (toks, store[i][1])
        elif '%' in x[0]:
            toks = tokenizer.tokenize(x[0])
            store[i] = (toks, store[i][1])
        else:
            # NOTE: It seems like this is already tokenized inline in the xml,
            # so this way (just splitting on spaces) may be best here.
            toks = x[0].split(' ')
            store[i] = (toks, store[i][1])
    return store