

Python nltk.word_tokenize Function Code Examples

This article collects typical usage examples of the nltk.word_tokenize function in Python. If you are wondering what word_tokenize is for, how to call it, or what real-world uses look like, the curated examples below should help.


Fifteen code examples of the word_tokenize function are shown below, ordered by popularity.
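
Before the project examples, here is a minimal, self-contained sketch of what nltk.word_tokenize does on its own. It assumes the Punkt tokenizer models have already been downloaded (the sample sentence is made up for illustration):

import nltk

# nltk.download('punkt')  # one-time download of the Punkt tokenizer models

sentence = "NLTK makes tokenization easy, doesn't it?"
tokens = nltk.word_tokenize(sentence)
print(tokens)
# -> ['NLTK', 'makes', 'tokenization', 'easy', ',', 'does', "n't", 'it', '?']

Note that punctuation becomes its own token and contractions such as "doesn't" are split, which explains why several examples below filter tokens against string.punctuation.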

Example 1: load_file_without_frequency

    def load_file_without_frequency(self, positif, negatif):
        tab = []
        maxs = self.nbFeatures
        phrases = []
        y = []
        with codecs.open(positif, "r", encoding='latin-1') as my_file:
            for line in my_file:
                line = line.strip().lower()  # remove the \n
                phrases.append(line)
                y.append(1)
                for mot in word_tokenize(line):
                    tab.append(mot)
        with codecs.open(negatif, "r", encoding='latin-1') as my_file:
            for line in my_file:
                line = line.strip().lower()  # remove the \n
                phrases.append(line)
                y.append(0)
                for mot in word_tokenize(line):
                    tab.append(mot)
        word_fd = FreqDist(tab)
        print(word_fd)
        for i in range(len(phrases)):
            mots = word_tokenize(phrases[i])
            tmp = []
            for element in mots:
                tmp.append(word_fd[element])
            if len(tmp) < maxs:
                for j in range(maxs - len(tmp)):
                    tmp.append(0)
            elif len(tmp) > maxs:
                tmp = tmp[:maxs]
            phrases[i] = tmp
        return (np.array(phrases), np.array(list(set(tab))), np.array(y))
Author: Nicolas99-9, Project: TERApprentissage, Lines: 33, Source: neural.py

Example 2: __init__

    def __init__(self, title, full_text, sentence):
        self.title = title
        self.sentence = sentence
        # map of word -> number of times it appears in the full article text
        self.full_text_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(full_text))
        # map of word -> number of times it appears in the given sentence
        self.sentence_word_frequencies = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(sentence))
Author: jeevnayak, Project: gapfill, Lines: 7, Source: keyword_chooser.py
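
Example 2 builds word-frequency maps by feeding lowercased tokens into nltk.FreqDist. A minimal standalone illustration of that pattern, with a made-up text string, looks like this:

import nltk

text = "The cat sat on the mat. The cat slept."
freq = nltk.FreqDist(word.lower() for word in nltk.word_tokenize(text))
print(freq['the'])  # -> 3
print(freq['cat'])  # -> 2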

Example 3: vectorize

def vectorize(data, s):
    '''
    :param data: list of instances for a given lexelt with the following structure:
        {
			[(instance_id, left_context, head, right_context, sense_id), ...]
        }
    :param s: list of words (features) for a given lexelt: [w1,w2,w3, ...]
    :return: vectors: A dictionary with the following structure
            { instance_id: [w_1 count, w_2 count, ...],
            ...
            }
            labels: A dictionary with the following structure
            { instance_id : sense_id }

    '''

    vectors = {}
    labels = {}
    for (instance_id, left_context, head, right_context, sense_id) in data:
        labels[instance_id] = sense_id
        left_tokens = nltk.word_tokenize(left_context)
        right_tokens = nltk.word_tokenize(right_context)
        words = k_nearest_words_vector_from_tokens(left_tokens, right_tokens, window_size)
        vectors[instance_id] = frequency_vector_from_near_words(s, words)

    return vectors, labels
Author: williamFalcon, Project: NLP_HW3, Lines: 26, Source: A.py

Example 4: colocation

def colocation(windowSize, pos, context, dictionary):
    if windowSize <= 0:
        return dictionary
    # going forward
    forward = context[:pos]
    f = forward[(-windowSize/2):]
    # going backward
    backward = context[pos+1:]
    b = backward[:windowSize/2]
    for item in f:
        key = "pre" + str(len(f) - f.index(item)) + "-word"
        value = item
        dictionary[key] = value
        key = "pre" + str(len(f) - f.index(item)) + "-pos"
        text = nltk.word_tokenize(item)
        value = nltk.pos_tag(text)[0][1]
        dictionary[key] = value
    for item in b:
        key = "fol" + str(b.index(item)+1) + "-word"
        value = item
        dictionary[key] = value
        key = "fol" + str(b.index(item)+1) + "-pos"
        text = nltk.word_tokenize(item)
        value = nltk.pos_tag(text)[0][1]
        dictionary[key] = value
    return dictionary
Author: ansuabraham, Project: cs4740_3, Lines: 26, Source: colocation.py

Example 5: __tokenize

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
Author: ishalyminov, Project: dstc5, Lines: 30, Source: baseline_slu.py

Example 6: reading_level

def reading_level(full_text):
    #Clean the full_text
    full_text_clean = ""
    for char in full_text:
        if char == ".":
            full_text_clean += ". "
        else:
            full_text_clean += char

    #Language features
    import nltk
    words = nltk.word_tokenize(full_text_clean)

    n_sents = len(nltk.sent_tokenize(full_text_clean))
    n_words = len(nltk.word_tokenize(full_text_clean))

    #Count the syllables
    n_syll = 0
    for word in words:
        n_syll += syllable_count(word)

    #Calculate the reading level
    #https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

    grade_level = -15.59 + 0.39*(n_words/n_sents) + 11.8*(n_syll/n_words)
    return round(grade_level,1)
Author: ECohen16, Project: rapid_reader, Lines: 26, Source: views.py
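
Example 6 computes the Flesch-Kincaid grade level from sentence, word, and syllable counts, but it relies on a syllable_count helper that is not part of the snippet. A rough sketch of such a helper, using a simple vowel-group heuristic (an assumption for illustration, not the project's actual implementation), could be:

def syllable_count(word):
    # Rough heuristic: count groups of consecutive vowels,
    # with a small adjustment for a trailing silent 'e'.
    word = word.lower()
    vowels = "aeiouy"
    count = 0
    previous_was_vowel = False
    for char in word:
        is_vowel = char in vowels
        if is_vowel and not previous_was_vowel:
            count += 1
        previous_was_vowel = is_vowel
    if word.endswith("e") and count > 1:
        count -= 1
    return max(count, 1)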

Example 7: update

    def update(self, other):
        """Adds counts for elements in other"""
        if isinstance(other, self.__class__):
            self.n_sents += other.n_sents
            for x, n in other.items():
                self[x] += n
        else:
            for sent in other:
                self.n_sents += 1

                # import pdb;pdb.set_trace()
                if self.poscache is not None:
                    if sent in self.poscache:
                        tags = self.poscache[sent]
                    else:
                        self.poscache[sent] = tags = nltk.pos_tag(
                            nltk.word_tokenize(sent))
                else:
                    tags = nltk.pos_tag(nltk.word_tokenize(sent))

                for x in tags:
                    tok, tag = x
                    self[tag] += 1

            if self.normalize:
                for x, n in self.items():
                    self[x] /= float(self.n_sents)
Author: Axighi, Project: Scripts, Lines: 27, Source: PosTagFreqVectorizer.py

Example 8: main

def main(question, article):
  ddict = {}
  counts = get_counts()
  for tok in nltk.word_tokenize(article):
    ddict[tok] = ddict.get(tok, 0) + 1

  vec = []
  for tok in nltk.word_tokenize(question):

    # count in article
    tf = ddict.get(tok, 0) 

    # total articles is 108 / number that have current token
    idf = math.log(float(108)/len(filter(lambda x:tok in x.keys(),counts)) + 1)
    vec.append(tf*idf)

  largest = max(vec)
  normalized = map(lambda y: y/largest, vec)

  finDic = {}
  for i, word in enumerate(nltk.word_tokenize(question)):
    finDic[word] = normalized[i]

  print finDic
  return finDic
Author: NLP-Project, Project: NLP-project, Lines: 25, Source: tdIDF.py
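
Example 8 weights each question token by tf-idf against the article and then normalizes by the largest weight. The tf-idf arithmetic on its own, with made-up token and document counts in place of the project's get_counts data, is just:

import math

# Hypothetical counts: the token appears 4 times in the article,
# and 12 of the 108 articles in the collection contain it.
tf = 4
n_articles = 108
n_articles_with_token = 12

idf = math.log(float(n_articles) / n_articles_with_token + 1)
weight = tf * idf
print(round(weight, 3))  # tf-idf weight for this token, before normalization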

Example 9: next_note

def next_note(tokenizer):
    print 'SemEval data'
    for semeval_file in semeval_files:
        print 'File', semeval_file
        with open(semeval_file, 'r') as f:
            st = []
            for line in f:
                st += [line.strip()]
            text = read_visit_sem(st)
            text = tokenizer.tokenize(text)
            for sent in text:
                yield nltk.word_tokenize(sent.lower())
    print 'MIMIC data'
    for notes_file in subset(notes_files, 15): # 15 random MIMIC files
        print 'File', notes_file
        try:
            with open(notes_file, 'r') as f:
                ct = 0
                st = []
                for line in f:
                    ct += 1
                    if ct % 50000 == 0:
                        print ct
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        text = tokenizer.tokenize(text)
                        for sent in text:
                            yield nltk.word_tokenize(sent.lower())
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            pass
Author: ankitkv, Project: MIMICTools, Lines: 33, Source: PhraseDetect.py

Example 10: PushDataPair

def PushDataPair(data, database):
        last = len(database['Q'].keys())
        for pair in data:
                database['Q'][last] = nltk.word_tokenize(pair['question'])
                database['A'][last] = nltk.word_tokenize(pair['answer'])
                last += 1
        return database
Author: echoyuzhou, Project: ticktock_text_api, Lines: 7, Source: Loader.py
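
PushDataPair appends tokenized question/answer pairs to a numerically keyed store. A usage sketch, assuming the definition above is in scope (the sample pair below is made up), might look like:

import nltk

database = {'Q': {}, 'A': {}}
data = [{'question': "What time is it?", 'answer': "It is noon."}]

database = PushDataPair(data, database)
print(database['Q'][0])  # -> ['What', 'time', 'is', 'it', '?']
print(database['A'][0])  # -> ['It', 'is', 'noon', '.']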

Example 11: build_s

def build_s(data):
    '''
    Compute the context vector for each lexelt
    :param data: dict with the following structure:
        {
			lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
			...
        }
    :return: dict s with the following structure:
        {
			lexelt: [w1,w2,w3, ...],
			...
        }

    '''
    s = {}

    # implement your code here
    for key,value in data.items():
      for i in value:
        tokens_left = nltk.word_tokenize(i[1])
        tokens_right = nltk.word_tokenize(i[3])
        left = [w for w in tokens_left if w not in string.punctuation][-window_size:]
        right = [w for w in tokens_right if w not in string.punctuation][:window_size]
        context = left + right
        if key not in s:
          s[key]=[]
        for word in context:
          if word not in s[key]:
            s[key].append(word)            
          
    return s
Author: jubimishra, Project: Natural-Language-Processing, Lines: 32, Source: A.py

Example 12: paragraph_features

def paragraph_features(paragraph_sents):
    global count
    count += 1
    print '\r', count,

    if FEATURE == FEAT_CONTAINS:
        paragraph_words = set(
            sents_to_words(paragraph_sents)
        )
    elif FEATURE == FEAT_LINKED_TITLES:
        paragraph_words = ' '.join(paragraph_sents)
    elif FEATURE == FEAT_FIRST_SENT:
        paragraph_words = nltk.word_tokenize(
            paragraph_sents[0]
        )
    elif FEATURE == FEAT_BEGIN_SENT:
        paragraph_words = {
            nltk.word_tokenize(sent)[0]
            for sent in paragraph_sents
        }
    else:
        paragraph_words = None
        print 'FEATURE NOT SUPPORTED'
        exit()

    features = dict()
    for word in word_features:
        features[word_features[word]] = (
            word in paragraph_words
        )

    return features
Author: mikeholler, Project: thesis-undergrad, Lines: 32, Source: classifier.py

Example 13: synsym

def synsym(s1,s2):
    ts0 = nltk.pos_tag(nltk.word_tokenize(s1))
    ts1 = nltk.pos_tag(nltk.word_tokenize(s2))
    # adj  
    jj0 = [x for x,y in ts0 if y=='JJ' or y=='JJR' or y=='JJS']
    jj1 = [x for x,y in ts1 if y=='JJ' or y=='JJR' or y=='JJS']
    if len(jj0) == 0 or len(jj1) ==0:
      jjps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      jjps = np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))
    # noun
    jj0 = [x for x,y in ts0 if y=='NN' or y=='NNS' or y=='NNP' or y=='NNPS' or y=='DT']
    jj1 = [x for x,y in ts1 if y=='NN' or y=='NNS' or y=='NNP' or y=='NNPS' or y=='DT']
    if len(jj0) == 0 or len(jj1) ==0:
      nps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      nps =  np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))
    # verb
    jj0 = [x for x,y in ts0 if y=='VB' or y=='VBD' or y=='VBG' or y=='VBN' or y=='VBP' or y=='VBZ']
    jj1 = [x for x,y in ts1 if y=='VB' or y=='VBD' or y=='VBG' or y=='VBN' or y=='VBP' or y=='VBZ']
    if len(jj0) == 0 or len(jj1) ==0:
      vps = 0
    else: 
      v1 = makeFeatureVec(jj0,model,300)
      v2 = makeFeatureVec(jj1,model,300)
      vps =  np.inner(v1,v2)/(LA.norm(v1)*LA.norm(v2))    
    return [jjps,nps,vps]
Author: gtesei, Project: fast-furious, Lines: 31, Source: gensin_1.py
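
Example 13 compares the adjective, noun, and verb word vectors of two sentences by cosine similarity. The cosine step on its own, with small made-up vectors standing in for the makeFeatureVec output, reduces to:

import numpy as np
from numpy import linalg as LA

v1 = np.array([0.2, 0.1, 0.7])
v2 = np.array([0.3, 0.0, 0.5])
cosine = np.inner(v1, v2) / (LA.norm(v1) * LA.norm(v2))
print(cosine)  # dot product divided by the product of the vector norms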

Example 14: build_s

def build_s(data):
    """
    Compute the context vector for each lexelt
    :param data: dic with the following structure:
        {
			lexelt: [(instance_id, left_context, head, right_context, sense_id), ...],
			...
        }
    :return: dic s with the following structure:
        {
			lexelt: [w1,w2,w3, ...],
			...
        }

    """
    s = {}

    # implement your code here

    for lexelt in data:
        words = set()
        for instance in data[lexelt]:

            left_context = word_tokenize(instance[1].strip())
            for token in left_context[-window_size:]:
                if token not in puncts:
                    words.add(token)

            right_context = word_tokenize(instance[3].strip())
            for token in right_context[:window_size]:
                if token not in puncts:
                    words.add(token)
        s[lexelt] = list(words)

    return s
Author: keyu-lai, Project: NLP, Lines: 35, Source: A.py
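
Example 14 collects, for each lexelt, the vocabulary found within window_size tokens on either side of the head word. A usage sketch, assuming the build_s definition above is in scope and supplying assumed values for the module-level window_size and puncts (the sample instance is made up), could be:

import string
from nltk import word_tokenize

# Assumed module-level settings; the originals live elsewhere in the assignment code.
window_size = 10
puncts = set(string.punctuation)

data = {
    'begin.v': [
        ('begin.v.bnc.00000001',
         'It was Saturday morning and we were about to',  # left context
         'begin',                                          # head
         'our first lesson of the day .',                  # right context
         'begin%2:30:00::'),
    ],
}

s = build_s(data)
print(s['begin.v'])  # unique, non-punctuation context words around the head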

Example 15: parseFile

def parseFile(file):
	""" Parse the header and source files for the class, and return the bindings dictionary, which contains tag data (and other pertinent 
		information about the file)
	"""
	#print file
	
	bindings 	= []
	
	
	# Load header file
	tokens 		= []
	if (file['header'] != ''):
		with open(file['header'], 'r') as f:
			# Tokenize
			for line in f.readlines():
				tokens += nltk.word_tokenize(line)
	
	# Parse tokens
	bindings += parseTokens( tokens, file, 'header' )

	
	# Load source file
	tokens 		= []
	if (file['source'] != ''):
		with open(file['source'], 'r') as f:
			# Tokenize
			for line in f.readlines():
				tokens += nltk.word_tokenize(line)
	
	# Parse tokens
	bindings += parseTokens( tokens, file, 'source' )	
	
	return bindings
Author: jarrettchisholm, Project: pyliteserializer, Lines: 33, Source: pyliteserializer.py


Note: The nltk.word_tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.