This article collects and summarizes typical usage examples of the TreebankWordTokenizer.tokenize method from Python's nltk.tokenize module. If you are wondering what exactly TreebankWordTokenizer.tokenize does, how to use it, or where to find examples of it, the curated code samples below should help. You can also explore further usage examples of the class it belongs to, nltk.tokenize.TreebankWordTokenizer.
The following shows 15 code examples of TreebankWordTokenizer.tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
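Before the numbered examples, here is a minimal standalone sketch of the method itself; the sample sentence is made up and the comment only hints at the expected shape of the output.

# Minimal usage sketch of TreebankWordTokenizer.tokenize (illustrative input).
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize("Good muffins cost $3.88 in New York, don't they?")
# Penn Treebank conventions split punctuation and contractions,
# e.g. '$', '3.88', ',', 'do', "n't", '?'
print(tokens)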
Example 1: find_ml
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def find_ml(self, td):
    f_tokenizer = TreebankWordTokenizer()
    query_words = f_tokenizer.tokenize(td)
    genres = self.sentiment_analysis(query_words)
    weighted_genres = []
    genre_weights = {}
    for x in genres:
        if x[1] is not None:
            weighted_genres.append(x[0])
            genre_weights[x[0]] = x[1]
    d_score_updates = {}
    for movie in self.movies:
        g = self.genre_dict[movie][0]
        total_genre_score = 0
        if u'Comedy' in g and 'comedy' in weighted_genres:
            total_genre_score += genre_weights['comedy']
        if u'Action' in g and 'action' in weighted_genres:
            total_genre_score += genre_weights['action']
        if u'Crime' in g and 'crime' in weighted_genres:
            total_genre_score += genre_weights['crime']
        if u'Drama' in g and 'drama' in weighted_genres:
            total_genre_score += genre_weights['drama']
        d_score_updates[self.movies.index(movie)] = total_genre_score * .1
    return d_score_updates
Example 2: __init__
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
class TreebankWordTokenizerWrapper:
    """ Seriously I don't know why we need this class - this makes no sense """
    PAT_NLTK_BUG = re.compile(r"^(?:(.+)(,|'s))$")

    def __init__(self):
        self.word_tokenizer = TreebankWordTokenizer()

    def tokenize(self, s):
        temp = self.word_tokenizer.tokenize(s)
        if temp:
            it = []
            for t0 in temp:
                t = [t0]
                while True:
                    m = self.PAT_NLTK_BUG.search(t[0])
                    if m:
                        t.insert(0, m.group(1))
                        t[1] = m.group(2)
                    else:
                        break
                it += t
                #sys.stderr.write('DEBUG: t=%s => %s\n' % (t0, t))
        else:
            it = temp
        return it
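A hypothetical driver for the wrapper above, assuming the class (and its re / TreebankWordTokenizer imports) is defined in the current module; the input string is made up.

# Hypothetical usage of TreebankWordTokenizerWrapper.
wrapper = TreebankWordTokenizerWrapper()
# Any token the underlying tokenizer leaves with a trailing "," or "'s"
# is split by PAT_NLTK_BUG into the bare word plus that suffix.
print(wrapper.tokenize("the dog's bone, buried deep"))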
Example 3: CRCleaner
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
class CRCleaner(Cleaner):
    def __init__(self, input_dir, output_dir):
        super(CRCleaner, self).__init__(input_dir, output_dir, u"-\n'", punctuation + digits)
        self.t = TreebankWordTokenizer()

    def cleaned_text(self, text):
        if len(text) == 0:
            return u""
        sans_xml = self.xml_to_txt(text)
        arr = self.t.tokenize(sans_xml)
        return self.reconstruct_arr(arr)

    def xml_to_txt(self, xml):
        arr = []
        dom = parseString(xml)
        for node in (dom.firstChild.getElementsByTagName('speaking') + dom.firstChild.getElementsByTagName('speaking-unknown-id')):
            paragraphs = node.getElementsByTagName('paragraph')
            if len(paragraphs) > 0:
                for node2 in paragraphs:
                    if node2.hasChildNodes():
                        child = node2.firstChild
                        if child.nodeType == child.TEXT_NODE:
                            arr += [child.data.replace(u'\xa0', ' ')]  # collapse non-breaking spaces
        return ' '.join(arr)

    def new_filename(self, old_filename):
        return old_filename.replace('.xml', '.txt')
Example 4: pos_titles_from
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def pos_titles_from(input_path, output_path=None, options=None):
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
Example 5: transformTweetData
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def transformTweetData(tweet):
    content = unicode(tweet.sentence.lower(), errors='ignore')
    words = content.strip().split()
    tokenizer = TreebankWordTokenizer()
    extra_features = []
    content = " ".join(words + extra_features)
    tokens = tokenizer.tokenize(content)
    tokens = [t for t in tokens if t not in stopwords]
    return tokens
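A hypothetical driver for transformTweetData; the namedtuple and stopword set below stand in for the real tweet object and the module-level stopwords the example relies on.

# Hypothetical usage (Python 2, to match the unicode() call above).
from collections import namedtuple

Tweet = namedtuple('Tweet', ['sentence'])
stopwords = set(['a', 'the', 'is'])  # stand-in for the module's stopword collection
tokens = transformTweetData(Tweet(sentence="The movie is GREAT fun"))
# roughly: ['movie', 'great', 'fun']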
Example 6: tokenize_en
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def tokenize_en(text):
    """
    Return a list of lists of the tokens in text, separated by sentences.
    """
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer = TreebankWordTokenizer()
    sentences = [tokenizer.tokenize(sentence)
                 for sentence in sent_tokenizer.tokenize(text)]
    return sentences
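A quick check of tokenize_en on a two-sentence string; the text is made up and the NLTK 'punkt' model must already be downloaded.

# Hypothetical call, assuming tokenize_en from the example above.
text = "I bought a book. It was not cheap."
for sent_tokens in tokenize_en(text):
    print(sent_tokens)
# expected shape: one token list per sentence, roughly
# ['I', 'bought', 'a', 'book', '.']
# ['It', 'was', 'not', 'cheap', '.']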
Example 7: DssgUnigramExtractor
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
class DssgUnigramExtractor(object):
    """
    An instance of this is used to obtain a list of unigrams, given a text.
    Usage:
    unigramExtractor = DssgUnigramExtractor()
    tokenList = unigramExtractor.extract("here is a text as a string")  # ['text', 'string']
    """
    _cache = {}

    def __init__(self):
        self._tokenizer = TreebankWordTokenizer()
        self._stopwordSet = set(stopwords.words("english"))
        self._stemmer = PorterStemmer()

    def __repr__(self):
        return self.__class__.__name__ + "()"

    def extract(self, text):
        """
        Given a text, return a list of unigram tokens.
        """
        if text not in DssgUnigramExtractor._cache:
            text = (
                text.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", '"')
                .replace("&amp;", "&")
                .replace("&nbsp;", " ")
            )
            text = nltk.clean_html(text)
            tokens = self._tokenizer.tokenize(text)
            newTokens = []
            for tok in tokens:
                # - lowercase and strip surrounding punctuation
                tok = tok.lower().strip("`'.,-_*/:;\\?!@#$%^&*()=\"")
                # - drop stopwords, one-character words, and number-only tokens
                if tok in self._stopwordSet or len(tok) <= 1 or isAllNumbers(tok):
                    continue
                # - apply stemming
                # oldTok = copy.deepcopy(tok)  # for debug
                tok = self._stemmer.stem(tok)
                # a token like 'theres' can become a stopword after stemming
                if tok in self._stopwordSet:
                    continue
                newTokens.append(tok)
            DssgUnigramExtractor._cache[text] = newTokens
        return DssgUnigramExtractor._cache[text]
Example 8: getNoun
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def getNoun(self, parser, sentence):
    #mysent = sentence.encode('ascii','ignore')
    #sent = mysent.decode()
    penn = TreebankWordTokenizer()
    tags = parser.tag(penn.tokenize(sentence))
    the_tags = []
    nouns = []
    for t in tags:
        if t[1].startswith('NN'):
            nouns.append(t[0])
    return ' '.join(nouns)
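getNoun receives its tagger from the caller; the sketch below reproduces the same noun-extraction logic standalone, using NLTK's PerceptronTagger as a stand-in for the injected parser (an assumption, since the original tagger is not shown).

# Stand-in for the injected parser: NLTK's averaged perceptron tagger.
# Requires nltk.download('averaged_perceptron_tagger').
from nltk.tag.perceptron import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

tagger = PerceptronTagger()
tokens = TreebankWordTokenizer().tokenize("The quick brown fox jumps over the lazy dog")
nouns = [word for word, tag in tagger.tag(tokens) if tag.startswith('NN')]
print(' '.join(nouns))  # roughly: "fox dog"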
Example 9: pos_per_line
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def pos_per_line(text_file):
    try:
        tokenizer = Tokenizer()
        # pos
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            #print " ".join([" ".join(token) for token in tagger.tag(tokens)])
            print " ".join([token[1] for token in tagger.tag(tokens)])
    except:
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
Example 10: genLexicon
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def genLexicon(data):
    tok = TreebankWordTokenizer()
    texts = []
    for doc in data:
        for sent in doc:
            texts.append(tok.tokenize(sent[1].lower()))
    dictionary = corpora.Dictionary(texts)
    pickle.dump(dictionary, open("lex/toy.lex", "w"))
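genLexicon appears to expect data shaped as documents of (id, sentence) pairs; a hedged sketch of how the pickled gensim Dictionary could be used afterwards (the query sentence is made up).

# Hypothetical follow-up: load the pickled lexicon and map a tokenized sentence to bag-of-words.
import pickle
from nltk.tokenize import TreebankWordTokenizer

with open("lex/toy.lex", "rb") as f:  # path taken from the example above
    dictionary = pickle.load(f)
tokens = TreebankWordTokenizer().tokenize("some new sentence to score")
print(dictionary.doc2bow(tokens))  # [(token_id, count), ...] for tokens already in the lexicon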
Example 11: __init__
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
class MorphyStemmer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def __call__(self, doc):
        stemmed_doc = []
        for t in self.tokenizer.tokenize(doc):
            stem = wordnet.morphy(t)
            if stem:
                stemmed_doc.append(stem.lower())
            else:
                stemmed_doc.append(t.lower())
        return stemmed_doc
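Because a MorphyStemmer instance is callable on a raw document string, it can plug into a vectorizer's tokenizer hook; a sketch assuming scikit-learn is installed and the NLTK 'wordnet' corpus is downloaded (the toy documents are made up).

# Hypothetical usage as a custom tokenizer for scikit-learn's CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=MorphyStemmer())  # assumes the class above is defined
X = vectorizer.fit_transform(["The cats were running", "A cat runs"])
print(sorted(vectorizer.vocabulary_))  # morphy-normalized forms such as 'cat' and 'run' (roughly)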
Example 12: crear_dicc_doc_term
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def crear_dicc_doc_term(path):
    result = []
    result_aux = []
    file = open(path)
    for f in file:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer("[\w']+")
        temp = tokenizer.tokenize(s)
        words = temp
        result_aux += eiminar_stopwords(words)
    return result_aux
Example 13: section_02_02
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def section_02_02(datDIR):
    print("\n### ~~~~~ Section 02.02 ~~~~~~~~")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    textfile = os.path.join(datDIR, "the-great-gatsby.txt")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    with open(file=textfile, mode='r') as inF:
        sentences = []
        for i, tempLine in enumerate(inF):
            if i > 100:
                break
            tempLine = tempLine.strip()
            sentences.append(tempLine)
            print("%5d: %s" % (i, tempLine))
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentence = sentences[20] + " " + sentences[21]
    print("\nmySentence:")
    print(mySentence)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    #tokens = mySentence.split("([-\s.,;!?])+")
    tokens = re.split("([-\s.,;!?])+", mySentence)
    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None, tokens))
    print("\ntemp")
    print(temp)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myPattern = re.compile("([-\s.,;!?])+")
    tokens = myPattern.split(mySentence)
    print("\ntokens[-10:]")
    print(tokens[-10:])
    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None, tokens))
    print("\ntemp")
    print(temp)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myRegexpTokenizer = RegexpTokenizer("\w+|$[0-9.]+|\S+")
    print("\nmyRegexpTokenizer.tokenize(mySentence):")
    print(myRegexpTokenizer.tokenize(mySentence))
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myTreebankWordTokenizer = TreebankWordTokenizer()
    print("\nmyTreebankWordTokenizer.tokenize(mySentence):")
    print(myTreebankWordTokenizer.tokenize(mySentence))
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
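The same regex-versus-tokenizer comparison works without the Gatsby file; a self-contained sketch on a made-up sentence.

# Self-contained comparison of the three approaches from the example above.
import re
from nltk.tokenize import RegexpTokenizer, TreebankWordTokenizer

mySentence = "Mr. Gatsby smiled understandingly-much more than understandingly."
tokens = [t for t in re.split(r"([-\s.,;!?])+", mySentence) if t not in '- \t\n.,;!?']
print(tokens)                                                      # plain re.split, separators filtered out
print(RegexpTokenizer(r"\w+|\$[0-9.]+|\S+").tokenize(mySentence))  # regex tokenizer
print(TreebankWordTokenizer().tokenize(mySentence))                # Penn Treebank conventions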
Example 14: word_tokenizePT
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def word_tokenizePT(self, text, tokenizer):
    """ Tokenize a Portuguese sentence into words.
    @param text: a sentence or phrase
    @param tokenizer: "TB" for TreebankWordTokenizer,
                      "WP" for WordPunctTokenizer
    @returns: a list of words, or an error message """
    if tokenizer == "TB":
        tokenizerTB = TreebankWordTokenizer()
        return tokenizerTB.tokenize(text)
    elif tokenizer == "WP":
        tokenizerWP = WordPunctTokenizer()
        return tokenizerWP.tokenize(text)
    else:
        return "tokenizer error: not found"
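To see how the two branches differ, here is what each underlying tokenizer returns for a Portuguese sentence (the sentence is made up).

# What the "TB" and "WP" branches produce, respectively.
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

frase = "Não posso sair hoje, disse o João."
print(TreebankWordTokenizer().tokenize(frase))  # "TB": Treebank rules, final period split off
print(WordPunctTokenizer().tokenize(frase))     # "WP": splits at every word/punctuation boundary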
Example 15: tf_normalized
# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def tf_normalized(full_texts):
    tokenizer = Tokenizer()
    tf = {}
    max_value = 0
    for text in full_texts:
        text_tokens = tokenizer.tokenize(text)
        text_tokens = escape_not_abbreviations(text_tokens)
        for token in text_tokens:
            token = token.lower()
            tf.setdefault(token, 0.0)
            tf[token] += 1.0
            if tf[token] > max_value:
                max_value = tf[token]
    for t in tf:
        tf[t] = tf[t] / max_value
    return tf
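A hedged driver for tf_normalized, assuming Tokenizer is the TreebankWordTokenizer alias from the import note above and that escape_not_abbreviations leaves these simple tokens unchanged.

# Hypothetical usage: term frequencies normalized by the most frequent token.
docs = ["the cat sat on the mat", "the dog chased the cat"]
tf = tf_normalized(docs)
print(tf["the"])  # 1.0  -- "the" is the most frequent token (4 occurrences)
print(tf["dog"])  # 0.25 -- raw count 1 divided by the maximum count 4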