

Python TreebankWordTokenizer.tokenize Method Code Examples

This article collects typical usage examples of the nltk.tokenize.TreebankWordTokenizer.tokenize method in Python. If you want to know what TreebankWordTokenizer.tokenize does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of the containing class, nltk.tokenize.TreebankWordTokenizer.


The following sections show 15 code examples of the TreebankWordTokenizer.tokenize method, sorted by popularity by default.
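
Before the project-specific examples, here is a minimal standalone sketch of the method itself (the tokens in the comment are what NLTK typically returns for this sentence, shown only as an illustration):

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize("Monticello wasn't designated as UNESCO World Heritage until 1987."))
# e.g. ['Monticello', 'was', "n't", 'designated', 'as', 'UNESCO',
#       'World', 'Heritage', 'until', '1987', '.']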

Example 1: find_ml

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
    def find_ml(self, td):
        f_tokenizer = TreebankWordTokenizer()
        query_words = f_tokenizer.tokenize(td)
        genres = self.sentiment_analysis(query_words)
        weighted_genres = []
        genre_weights = {}
        for x in genres:
            if x[1] is not None:
                weighted_genres.append(x[0])
                genre_weights[x[0]] = x[1]

        d_score_updates = {}
        for movie in self.movies:
            g = self.genre_dict[movie][0]
            total_genre_score = 0
            if u'Comedy' in g and 'comedy' in weighted_genres:
                total_genre_score += genre_weights['comedy']
            if u'Action' in g and 'action' in weighted_genres:
                total_genre_score += genre_weights['action']
            if u'Crime' in g and 'crime' in weighted_genres:
                total_genre_score += genre_weights['crime']
            if u'Drama' in g and 'drama' in weighted_genres:
                total_genre_score += genre_weights['drama']
            d_score_updates[self.movies.index(movie)] = total_genre_score * .1

        return d_score_updates
Developer: nporwal, Project: cs4300sp2016-moviequotes, Lines: 28, Source: find.py

Example 2: __init__

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
class TreebankWordTokenizerWrapper:
  """ Seriously I don't know why we need this class - this makes no sense """

  PAT_NLTK_BUG = re.compile(r"^(?:(.+)(,|'s))$")

  def __init__(self):
    self.word_tokenizer = TreebankWordTokenizer()

  def tokenize(self, s):
    temp = self.word_tokenizer.tokenize(s)
    if temp:
      it = []
      for t0 in temp:
        t = [t0]
        while True:
          m = self.PAT_NLTK_BUG.search(t[0])
          if m:
            t.insert(0, m.group(1))
            t[1] = m.group(2)
          else:
            break
        it += t
        #sys.stderr.write('DEBUG: t=%s => %s\n' % (t0, t))
    else:
      it = temp
    return it
Developer: acli, Project: Coursera-subtitles, Lines: 28, Source: reformat-extracted-text.py
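
A possible usage sketch for the wrapper above (not from the original project): it behaves like TreebankWordTokenizer.tokenize, except that any token returned with a trailing "," or "'s" still attached is split into separate tokens by PAT_NLTK_BUG.

wrapper = TreebankWordTokenizerWrapper()
tokens = wrapper.tokenize("The company's report arrived, eventually.")
# Tokens that come back ending in "," or "'s" are peeled apart;
# otherwise the result matches the underlying TreebankWordTokenizer.
print(tokens)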

Example 3: CRCleaner

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
class CRCleaner(Cleaner):
    def __init__(self, input_dir, output_dir):
        super(CRCleaner,self).__init__(input_dir, output_dir, u"-\n'", punctuation+digits)
        self.t = TreebankWordTokenizer()
    
    def cleaned_text(self, text):
        if len(text) == 0:
            return u""
        sans_xml = self.xml_to_txt(text)
        arr = self.t.tokenize(sans_xml)
        return self.reconstruct_arr(arr)
    
    def xml_to_txt(self, xml):
        arr = []
        dom = parseString(xml)
        for node in (dom.firstChild.getElementsByTagName('speaking')+dom.firstChild.getElementsByTagName('speaking-unknown-id')):
            paragraphs = node.getElementsByTagName('paragraph')
            if len(paragraphs) > 0:
                for node2 in paragraphs:
                    if node2.hasChildNodes():
                        child = node2.firstChild
                        if child.nodeType == child.TEXT_NODE:
                            arr += [child.data.replace(u'\xa0', u' ')]  # normalize non-breaking spaces
        return ' '.join(arr)
    
    def new_filename(self, old_filename):
        return old_filename.replace('.xml', '.txt')
Developer: jergason, Project: topicalguide, Lines: 29, Source: clean_cr.py

Example 4: pos_titles_from

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def pos_titles_from(input_path, output_path = None, options = None):
    finput, foutput = get_streams(input_path, output_path)
    skip, end = get_options(options)
    tokenizer = Tokenizer()
    tagger = PerceptronTagger()
    line_counter = 0
    skipped_lines = 0
    for line in finput:
        log_advance(1000000, line_counter)
        line_counter += 1
        if line_counter <= skip:
            continue
        if end and line_counter > end:
            break
        try:
            paper_id, title = get_fields(line)
            if is_english(title):
                print >> foutput, paper_id
                tokens = tokenizer.tokenize(title)
                for token in tagger.tag(tokens):
                    print >> foutput, token[0], token[1]
                print >> foutput
            else:
                skipped_lines += 1
        except:
            print >> sys.stderr, "Error:", line, sys.exc_info()
    log_nlines(line_counter, skipped_lines)
Developer: snovd, Project: term-extraction-tests, Lines: 29, Source: mag-titles-pos.py

Example 5: transformTweetData

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def transformTweetData(tweet):
    content = unicode(tweet.sentence.lower(), errors='ignore')
    words = content.strip().split()
    tokenizer = TreebankWordTokenizer()
    extra_features = []
    content = " ".join(words + extra_features)
    tokens = tokenizer.tokenize(content)
    tokens = [t for t in tokens if t not in stopwords]
    return tokens
Developer: prashant-r, Project: StanceClassification, Lines: 11, Source: preprocess.py

Example 6: tokenize_en

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def tokenize_en(text):
    """
    Return a list of lists of the tokens in text, separated by sentences.
    """
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokenizer = TreebankWordTokenizer()
    sentences = [tokenizer.tokenize(sentence) 
                 for sentence in sent_tokenizer.tokenize(text)]
    return sentences
Developer: DSam1991, Project: nlpnet, Lines: 11, Source: utils.py
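
A possible call (assuming the punkt sentence model has been downloaded via nltk.download('punkt')):

text = "Dr. Smith arrived late. The meeting had already started."
for sentence_tokens in tokenize_en(text):
    print(sentence_tokens)  # one list of word tokens per sentence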

Example 7: DssgUnigramExtractor

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
class DssgUnigramExtractor(object):

    """
    An instance of this is used to obtain a list of unigrams, given a text.
    Usages:
    unigramExtractor = DssgUnigramExtractor()
    tokenList = unigramExtractor.extract("here is a text as a string") # ['text', 'string']
    """

    _cache = {}

    def __init__(self):
        self._tokenizer = TreebankWordTokenizer()
        self._stopwordSet = set(stopwords.words("english"))
        self._stemmer = PorterStemmer()

    def __repr__(self):
        return self.__class__.__name__ + "()"

    def extract(self, text):
        """
        Given a text, return a list of unigram tokens.
        """
        if text not in DssgUnigramExtractor._cache:
            text = (
                text.replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", '"')
                .replace("&amp;", "&")
                .replace("&nbsp;", " ")
            )
            text = nltk.clean_html(text)
            tokens = self._tokenizer.tokenize(text)

            newTokens = []
            for tok in tokens:
                # - lowercase, remove '
                tok = tok.lower().strip("`'.,-_*/:;\\!?@#$%^&*()=\"")

                # - remove stopwords, one character word, only numbers
                # - remove one character word
                # - remove only numbers
                if tok in self._stopwordSet or len(tok) <= 1 or isAllNumbers(tok):
                    continue

                # - apply stemming
                # oldTok = copy.deepcopy(tok); # for debug
                tok = self._stemmer.stem(tok)
                # sometimes a token is like 'theres' and becomes stopword after
                # stemming
                if tok in self._stopwordSet:
                    continue

                newTokens.append(tok)
            DssgUnigramExtractor._cache[text] = newTokens
        return DssgUnigramExtractor._cache[text]
Developer: pombredanne, Project: ushine-learning, Lines: 58, Source: vectorizer.py
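
Note that nltk.clean_html was removed in NLTK 3 (calling it raises NotImplementedError), so this extractor only runs unchanged on old NLTK releases. One possible substitute, assuming BeautifulSoup is acceptable as a dependency, is to strip the markup yourself and feed the result to self._tokenizer.tokenize in place of the nltk.clean_html output:

from bs4 import BeautifulSoup

def strip_html(html_text):
    # Return only the visible text of an HTML fragment.
    return BeautifulSoup(html_text, "html.parser").get_text(separator=" ")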

Example 8: getNoun

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
	def getNoun(self, parser, sentence):
		#mysent = sentence.encode('ascii','ignore')
		#sent = mysent.decode()
		penn = TreebankWordTokenizer()
		tags = parser.tag(penn.tokenize(sentence))
		the_tags = []
		nouns = []
		for t in tags:
			if t[1].startswith('NN'):
				nouns.append(t[0])
		return ' '.join(nouns)
Developer: sidiksoleman, Project: CiteCheck, Lines: 13, Source: nounextraction.py
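
For comparison, a self-contained variant that needs no external parser object, using nltk.pos_tag on the Treebank tokens (a sketch that assumes the averaged_perceptron_tagger data has been downloaded; it is not the original author's setup):

import nltk
from nltk.tokenize import TreebankWordTokenizer

def get_nouns(sentence):
    # Keep the tokens whose POS tag starts with 'NN' (nouns) and join them.
    tokens = TreebankWordTokenizer().tokenize(sentence)
    return ' '.join(tok for tok, tag in nltk.pos_tag(tokens) if tag.startswith('NN'))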

Example 9: pos_per_line

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def pos_per_line(text_file):
    try:
        tokenizer = Tokenizer()
        #pos
        tagger = PerceptronTagger()
        for s in text_file:
            tokens = tokenizer.tokenize(s)
            #print " ".join([" ".join(token)  for token in tagger.tag(tokens)])
            print " ".join([token[1]  for token in tagger.tag(tokens)])
    except:
        print >> sys.stderr, "Error pos_per_line(text_file): ", sys.exc_info()
Developer: snovd, Project: term-extraction-tests, Lines: 13, Source: pos_per_line.py

Example 10: genLexicon

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def genLexicon(data):

	tok = TreebankWordTokenizer()

	texts = []
	for doc in data:
		for sent in doc:
			texts.append(tok.tokenize( sent[1].lower() ))

	dictionary = corpora.Dictionary(texts)

	pickle.dump(dictionary, open("lex/toy.lex", "w"))
Developer: LEONOB2014, Project: NLP-final-project, Lines: 14, Source: genFeature.py
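
To see what the lexicon contains before pickling it, one possibility (using the same gensim corpora.Dictionary the function relies on) is:

from gensim import corpora
from nltk.tokenize import TreebankWordTokenizer

tok = TreebankWordTokenizer()
texts = [tok.tokenize(s.lower()) for s in ["A first sentence.", "Another sentence here."]]
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)  # mapping from token to integer id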

Example 11: __init__

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
class MorphyStemmer:
    def __init__(self):
        self.tokenizer = TreebankWordTokenizer()

    def __call__(self, doc):
        stemmed_doc = []
        for t in self.tokenizer.tokenize(doc):
            stem = wordnet.morphy(t)
            if stem:
                stemmed_doc.append(stem.lower())
            else:
                stemmed_doc.append(t.lower())
        return stemmed_doc
Developer: sangheestyle, Project: nlp2014, Lines: 15, Source: quiz_king.py
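
Because the class is a callable that maps a document string to a list of stems, it can be plugged in wherever such a tokenizer callable is expected. A hedged sketch with scikit-learn's CountVectorizer follows (an assumption about how it might be wired up; wordnet.morphy needs the WordNet corpus, nltk.download('wordnet')):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=MorphyStemmer())
X = vectorizer.fit_transform(["The mice were running", "A mouse ran away"])
print(vectorizer.get_feature_names_out())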

Example 12: crear_dicc_doc_term

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def crear_dicc_doc_term(path):
    result = []
    result_aux = []
    file = open(path)
    for f in file:
        result.append(f)
    tokenizer = TreebankWordTokenizer()
    for s in result:
        tokenizer = RegexpTokenizer(r"[\w']+")
        temp = tokenizer.tokenize(s)
        words = temp
        result_aux += eiminar_stopwords(words)
    return result_aux
Developer: YanetFrancisco, Project: NaiveBayesClassifier, Lines: 15, Source: pre_procesing_text.py

Example 13: section_02_02

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def section_02_02( datDIR ):

    print("\n### ~~~~~ Section 02.02 ~~~~~~~~");

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    textfile = os.path.join( datDIR , "the-great-gatsby.txt" )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    with open(file = textfile, mode = 'r') as inF:
        sentences = []
        for i, tempLine in enumerate(inF):
            if i > 100:
                break
            tempLine = tempLine.strip()
            sentences.append(tempLine)
            print( "%5d: %s" % (i,tempLine) )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    mySentence = sentences[20] + " " + sentences[21]
    print("\nmySentence:")
    print(   mySentence  )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    #tokens = mySentence.split("([-\s.,;!?])+")
    tokens = re.split(r"([-\s.,;!?])+", mySentence)
    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print(   temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myPattern = re.compile(r"([-\s.,;!?])+")
    tokens = myPattern.split(mySentence)
    print("\ntokens[-10:]")
    print(   tokens[-10:] )

    temp = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
    print("\ntemp")
    print(   temp )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myRegexpTokenizer = RegexpTokenizer(r"\w+|$[0-9.]+|\S+")
    print("\nmyRegexpTokenizer.tokenize(mySentence):")
    print(   myRegexpTokenizer.tokenize(mySentence)  )

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myTreebankWordTokenizer = TreebankWordTokenizer()
    print("\nmyTreebankWordTokenizer.tokenize(mySentence):")
    print(   myTreebankWordTokenizer.tokenize(mySentence)  )
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( None )
Developer: paradisepilot, Project: statistics, Lines: 52, Source: Section_02_02.py

Example 14: word_tokenizePT

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
 def word_tokenizePT(self,  text, tokenizer):
     """ tokenize a portuguese sentence in words
     @input params: sentence - a sentence, a phrase (self)
                    tokenizer - "TB" for TreebankWordTokenizer
                                "WP" for WordPunctTokenizer
     @returns word's list or error """
     if tokenizer == "TB":
         tokenizerTB = TreebankWordTokenizer()
         return tokenizerTB.tokenize(text)
     elif tokenizer == "WP":
         tokenizerWP = WordPunctTokenizer()
         return tokenizerWP.tokenize(text)
     else:
         return "tokenizer error: not found" 
Developer: fabiodomingos, Project: EADW, Lines: 16, Source: NLP_PT.py
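
A quick side-by-side of the two tokenizers this helper dispatches to (outputs indicative only):

from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

frase = "Isto é, provavelmente, um teste."
print(TreebankWordTokenizer().tokenize(frase))  # "TB" branch
print(WordPunctTokenizer().tokenize(frase))     # "WP" branch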

Example 15: tf_normalized

# Required import: from nltk.tokenize import TreebankWordTokenizer [as alias]
# Or: from nltk.tokenize.TreebankWordTokenizer import tokenize [as alias]
def tf_normalized(full_texts):
    tokenizer = Tokenizer()
    tf = {}
    max_value = 0
    for text in full_texts:
        text_tokens = tokenizer.tokenize(text)
        text_tokens = escape_not_abbreviations(text_tokens)
        for token in text_tokens:
            token = token.lower()
            tf.setdefault(token, 0.0)
            tf[token] += 1.0
            if tf[token] > max_value:
                max_value = tf[token]
    for t in tf:
        tf[t] = tf[t]/max_value
    return tf
Developer: snovd, Project: term-extraction-tests, Lines: 18, Source: kpcommon.py
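
A possible call, assuming the module-level Tokenizer alias (TreebankWordTokenizer, per the import comment above) and the project's escape_not_abbreviations helper are importable:

docs = ["the cat sat", "the dog sat on the cat"]
term_freqs = tf_normalized(docs)
print(term_freqs)  # term -> frequency scaled so the most frequent term is 1.0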


Note: The nltk.tokenize.TreebankWordTokenizer.tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's license before distributing or reusing the code. Do not republish this article without permission.