

Python WordPunctTokenizer.tokenize Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.WordPunctTokenizer.tokenize. If you are unsure what WordPunctTokenizer.tokenize does or how to call it, the selected code examples below may help. You can also explore further usage examples of the containing class, nltk.tokenize.WordPunctTokenizer.


Below are 15 code examples of WordPunctTokenizer.tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
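Before the project examples, here is a minimal usage sketch (illustrative only, not taken from any of the projects below). WordPunctTokenizer splits text into alphabetic and non-alphabetic sequences with the regular expression \w+|[^\w\s]+, so punctuation comes back as separate tokens:

# Minimal usage sketch (illustrative only, not from the projects below).
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize("Good muffins cost $3.88 in New York. Please don't bite."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
#  'Please', 'don', "'", 't', 'bite', '.']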

Example 1: tfIdf

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def tfIdf():
	TFIDF_MIN_SCORE = 100
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	tokenizer = WordPunctTokenizer()		
	collection = initialize_collection('documents')

	tfidf = []
	idfMap = create_idf_map()
	docs = collection.find()
	for d in docs:
		# Count term frequencies over the full token list; counting over the
		# set of tokens would leave every frequency at 1.
		tokens = tokenizer.tokenize(d['content'].lower())
		tfMap = {}
		for word in tokens:
			if word not in tfMap:
				tfMap[word] = 1
			else:
				tfMap[word] += 1
		tfIdfValues = []
		for word in set(tokens):
			if (tfMap[word] * 1000 / idfMap[word]) > TFIDF_MIN_SCORE:
				tfIdfValues.append((word, tfMap[word] * 1000 / idfMap[word]))
		tfIdfValues = sorted(tfIdfValues, key = lambda x : x[1], reverse = True)
		d['tfidf'] = tfIdfValues
		tfidf.append({'d' : d,
					  'tfidf' : tfIdfValues})
		collection.save(d)


	genFreq = generaral_frequency(idfMap)
	return render_template("tfidf.html", documents = tfidf)
Author: tempflip, Project: szakdoga, Lines: 33, Source: flask1.py

Example 2: class1

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def class1():
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	docId = request.args.get('d')
	tokenizer = WordPunctTokenizer()		
	collection = initialize_collection('documents')

	featuresets = []
	tagSet = set()
	for d in collection.find():	
		bagOfWords = bag_of_words(tokenizer.tokenize(d['content']))
		if 'tags' not in d: continue
		for tag in d['tags']:
			featuresets.append((bagOfWords, tag))
			tagSet.add(tag)
	classifier = nltk.NaiveBayesClassifier.train(featuresets)

	d = collection.find_one({'_id' : ObjectId(docId)})

	#classifier.show_most_informative_features(100)
	cl = classifier.prob_classify(bag_of_words(tokenizer.tokenize(d['content'])))
	probs = []
	for tag in tagSet:
		probs.append((tag, round(cl.prob(tag)*100) ))
	classifier.show_most_informative_features(n=20)
	probs = sorted(probs, key = lambda x : x[1],  reverse = True)
	return render_template('class1.html', probs = probs, d=d)
Author: tempflip, Project: szakdoga, Lines: 29, Source: flask1.py

Example 3: build_word_dictionary

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    tokenizer = WordPunctTokenizer()
    with open(input_file_name) as input_file:
        for record in json.loads(input_file.read()):
            dictionary.update(tokenizer.tokenize(record['content']))
            dictionary.update(tokenizer.tokenize(record['abstract']))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']

    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
Author: BKJackson, Project: txtnets, Lines: 14, Source: prepare_nips.py

Example 4: tokenize_words

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def tokenize_words(sentence):
    """
    :param sentence: sentence string to tokenize
    :return: list of words in sentence
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
Author: paulzin, Project: NltkTokenizerDemo, Lines: 9, Source: nltk_tokenizer.py

Example 5: message_to_wordlist

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def message_to_wordlist(message, lemmas_bool, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    #review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove messages numbers
    message_text = re.sub(">>\d+","", message)
    message_text = message_text.lower()
    # Pass re.UNICODE via the flags keyword; the fourth positional argument of re.sub is count.
    message_text = re.sub(u"ё", 'e', message_text, flags=re.UNICODE)
    message_text = clean_str(message_text)
    tokenizer = WordPunctTokenizer()
    # 3. Convert words to lower case and split them
    words = tokenizer.tokenize(message_text)
    lemmas = []
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if lemmas_bool == 'l':
        for word in words:
            word_parsed = morph.parse(word)
            if len(word_parsed) > 0:
                lemmas.append(word_parsed[0].normal_form)
    elif lemmas_bool == 's':
        for word in words:
            word = stemmer.stem(word)
            if len(word) > 0:
                lemmas.append(word)
    else:
        lemmas = words
    # 5. Return a list of words
    return(lemmas)
Author: denis-gordeev, Project: CNN-aggression-RU, Lines: 36, Source: train_tensorflow.py

Example 6: clean_data

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.encode('ascii', 'ignore')
        word = word.lower()
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)

        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)', word) is not None:
            word = 'GENERIC_HTTP'

        return word

    tokenizer = WordPunctTokenizer()
    data = []
    with open(input_file_name) as input_file:
        for sentences, label in json.load(input_file):
            cleaned_sentences = []
            for sentence in sentences:
                cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                cleaned_sentence = tokenizer.tokenize(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)

            data.append([cleaned_sentences, label])

    with codecs.open(output_file_name, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file)
Author: BKJackson, Project: txtnets, Lines: 28, Source: prepare_amazon_sentiment.py

Example 7: clean_data

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.lower()
        word = word.replace('&amp;','&').replace('&lt;','<').replace('&gt;','>').replace('&quot;','"').replace('&#39;',"'")
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)

        word = word.encode('ascii', 'ignore')

        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)', word) is not None:
            word = 'GENERIC_HTTP'

        return word.encode('ascii', 'ignore')

    tokenizer = WordPunctTokenizer()

    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as output_file:
            for line in input_file:
                sentences, score = json.loads(line)
                cleaned_sentences = []
                for sentence in sentences:
                    cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                    cleaned_sentences.append(tokenizer.tokenize(cleaned_sentence))

                json.dump([cleaned_sentences, score], output_file)
                output_file.write("\n")
Author: BKJackson, Project: txtnets, Lines: 29, Source: prepare_amazon_reviews.py

Example 8: extract_nl_text

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def extract_nl_text(ms):
    """
    Extracts and tokenizes text from malware sample object

    :param ms: MalwareSample object
    :return: list of tokenized strings found in malware sample object's internal strings list
    """
    wpt = WordPunctTokenizer()
    all_tokenized_strings_in_ms = []
    inside_xml_privileges = False
    for s in ms.strings:
        if 'requestedPrivileges' in s or 'This program cannot be run in DOS mode' in s:
            continue
        elif inside_xml_privileges:
            continue
        elif '<assembly xmlns' in s:
            inside_xml_privileges = True
            continue
        elif '</assembly>' in s:
            inside_xml_privileges = False
            continue

        tokenized_string = []
        tokens = wpt.tokenize(s)
        if tokens:
            for t in tokens:
                if wordnet.synsets(t) and len(t) > 3:  # had to use length to eliminate false positives
                    tokenized_string.extend(tokens)
                    break
        if tokenized_string:
            all_tokenized_strings_in_ms.append(tokenized_string)
    return all_tokenized_strings_in_ms
Author: danzek, Project: nlhbi-malware-extractor, Lines: 34, Source: getNLindicators.py

Example 9: fred_language_analyser

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
class fred_language_analyser(language_analyser):
	''' A custom analyser based on nltk, using a brute-force algorithm
	'''
	def __init__(self, language = 'french'):
		'''Initialisation
			language	:	'french'
		'''
		self.tokenizer = WordPunctTokenizer()		
		self.stopwords = set(stopwords.words(language))
		self.stopwords.add(u"'")
	
	def text_to_vector(self, text):
		tokens = self.tokenizer.tokenize(text)
		tokens = [token for token in tokens if token.lower() not in self.stopwords]
		return tokens
	
	def distance(self, text1, text2):
		v1 = self.text_to_vector(text1)
		v2 = self.text_to_vector(text2)
		# Pending optimisation, limit to 6 words
		v1 = v1[0:6]
		v2 = v2[0:6]
		n = max(len(v1),len(v2))
		if len(v1)>len(v2):
			v1,v2 = v2,v1
		v1_1 = v1 + [None]*(n-len(v1))
		distance = 99
		for v1_2 in itertools.permutations(v1_1):  # a bit crude: the None padding gets permuted along with the words
			# Distance between the words
			d_mot=0
			for i in range(n):
				try:
					d_mot += (6-min(6,edit_distance(v1_2[i],v2[i])))**2
				except:
					d_mot += 1  # if None
			d_mot = 6*(n**0.5)-d_mot**0.5
			# Distance of the permutation
			# Number of inserted Nones = Nones that are neither at the start nor at the end
			v1_3 = []
			debut = True
			for m in v1_2:
				if m or not debut:
					debut = False
					v1_3.append(m)
			v1_4 = []
			debut = True
			for i in range(len(v1_3)-1,-1,-1):
				if v1_3[i] or not debut:
					debut = False
					v1_4.append(v1_3[i])
			d_perm = len(v1_4)-len(v1)
			# Word permutations: 3 points per permutation
			l=[]
			for m in list(filter(lambda x:x,v1_4)):
				l.append(v1.index(m))
			for i in range(len(l)-1):
				if l[i]<l[i+1]:
					d_perm +=3
			distance = min(distance, (d_mot**2+d_perm**2)**0.5)
		return distance
Author: FredThx, Project: FSTA, Lines: 62, Source: fred_language_analyser.py

Example 10: number_of_different_words

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
    def number_of_different_words(self):
        # TODO: Stemming, then move to language specific classes
        tokenizer = WordPunctTokenizer()
        words = tokenizer.tokenize(self.text.strip())
        only_textual_words = filter(unicode.isalpha, words)

        return len(set(only_textual_words))
Author: aufziehvogel, Project: sprakit, Lines: 9, Source: text_statistics.py

Example 11: TextProcessor

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def TextProcessor(src, tgt, low=True, num=True):

    print "processing "+src
    if low==True:
        print "lowercasing.."
    if num==True:
        print "removing numeric.."

    srcfile = codecs.open(src,"r","utf-8")
    tgtfile = codecs.open(tgt,"w","utf-8")

    word_punct_tokenizer = WordPunctTokenizer()

    linecount=0
    for line in srcfile:
        linecount+=1
        line = word_punct_tokenizer.tokenize(line)
        if low==True:
            for i in range(0,len(line)):
                line[i] = line[i].lower()
        if num==True:
            for i in range(0,len(line)):
                if line[i].isnumeric()==True:
                    line[i] = "<number>"

        tgtfile.write(listtostring(line))

    srcfile.close()
    tgtfile.close()
    print "done processing "+str(linecount)+" lines!!"
Author: apsarath, Project: pyNN, Lines: 32, Source: TextProcessor.py

Example 12: extract_words

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def extract_words(text):
    stemmer = PorterStemmer()

    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)

    result = [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
    return result
Author: teonghan, Project: crimefeeder, Lines: 10, Source: crimeclassifier_v2.py

Example 13: get_similarity_score

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
    def get_similarity_score(a, b):
        """Return the Jaccard similarity between a and b."""
        stopwords = nltk.corpus.stopwords.words('english')
        stopwords.extend(string.punctuation)
        stopwords.append('')
        tokenizer = WordPunctTokenizer()
        tokens_a = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a) \
                    if token.lower().strip(string.punctuation) not in stopwords]

        tokens_b = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b) \
                    if token.lower().strip(string.punctuation) not in stopwords]

        # Calculate Jaccard similarity
        ratio = 0
        if len(set(tokens_a).union(tokens_b)) > 0:
            ratio = len(set(tokens_a).intersection(tokens_b)) / float(len(set(tokens_a).union(tokens_b)))
        return (ratio)
Author: elangovana, Project: Aristo, Lines: 19, Source: text_analyser.py

Example 14: get_words_without_stopwords

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
 def get_words_without_stopwords(self, text):
     stopwords = nltk.corpus.stopwords.words('english')
     stopwords.extend(string.punctuation)
     stopwords.append('')
     tokenizer = WordPunctTokenizer()
     tokens = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(text) \
               if token.lower().strip(string.punctuation) not in stopwords]
     return tokens
Author: elangovana, Project: Aristo, Lines: 10, Source: text_analyser.py

Example 15: get_tokens

# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or alternatively: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def get_tokens(sentence):
    """
    Tokenizes a sentence
    :param sentence: sentence string to tokenize
    :return: list of tokens in the sentence
    """

    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
Author: zweiss, Project: RC_Readability_Calculator, Lines: 11, Source: nlp.py


Note: The nltk.tokenize.WordPunctTokenizer.tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors. For distribution and use, please refer to the license of the corresponding project; do not republish without permission.