This article collects typical usage examples of the nltk.tokenize.WordPunctTokenizer.tokenize method in Python. If you are wondering how WordPunctTokenizer.tokenize is used, what it does, or where to find examples of it, the curated code samples below should help. You can also look further into usage examples for the class this method belongs to, nltk.tokenize.WordPunctTokenizer.
A total of 15 code examples of the WordPunctTokenizer.tokenize method are shown below, sorted by popularity by default.
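WordPunctTokenizer splits text into alphabetic and non-alphabetic character sequences (its underlying regular expression is \w+|[^\w\s]+), so punctuation comes back as separate tokens. A minimal illustration of the behaviour every example below relies on:

from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
tokenizer.tokenize("Good muffins cost $3.88 in New York.")
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']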
Example 1: tfIdf
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def tfIdf():
    TFIDF_MIN_SCORE = 100
    import nltk
    from nltk.tokenize import WordPunctTokenizer
    tokenizer = WordPunctTokenizer()
    collection = initialize_collection('documents')
    docs = collection.find()
    tfidf = []
    idfMap = create_idf_map()
    docs = collection.find()
    for d in docs:
        tfMap = {}
        for word in set(tokenizer.tokenize(d['content'].lower())):
            if word not in tfMap:
                tfMap[word] = 1
            else:
                tfMap[word] += 1
        tfIdfValues = []
        for word in set(tokenizer.tokenize(d['content'].lower())):
            if (tfMap[word] * 1000 / idfMap[word]) > TFIDF_MIN_SCORE:
                tfIdfValues.append((word, tfMap[word] * 1000 / idfMap[word]))
        tfIdfValues = sorted(tfIdfValues, key=lambda x: x[1], reverse=True)
        d['tfidf'] = tfIdfValues
        tfidf.append({'d': d,
                      'tfidf': tfIdfValues})
        collection.save(d)
    genFreq = generaral_frequency(idfMap)
    return render_template("tfidf.html", documents=tfidf)
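Here initialize_collection, create_idf_map, generaral_frequency, and render_template are helpers from the surrounding (apparently Flask/MongoDB) app and are not shown. Note also that because the counting loop iterates over set(tokenizer.tokenize(...)), every entry in tfMap ends up as 1, i.e. presence rather than frequency. If raw term frequencies were wanted, a small alternative sketch (not the original code) with collections.Counter would be:

from collections import Counter
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
content = "A document. A short document."
# Counter tallies every occurrence of each token, not just its presence
tf_map = Counter(tokenizer.tokenize(content.lower()))
# Counter({'a': 2, 'document': 2, '.': 2, 'short': 1})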
Example 2: class1
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def class1():
    import nltk
    from nltk.tokenize import WordPunctTokenizer
    docId = request.args.get('d')
    tokenizer = WordPunctTokenizer()
    collection = initialize_collection('documents')
    featuresets = []
    tagSet = set()
    for d in collection.find():
        bagOfWords = bag_of_words(tokenizer.tokenize(d['content']))
        if 'tags' not in d:
            continue
        for tag in d['tags']:
            featuresets.append((bagOfWords, tag))
            tagSet.add(tag)
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    d = collection.find_one({'_id': ObjectId(docId)})
    #classifier.show_most_informative_features(100)
    cl = classifier.prob_classify(bag_of_words(tokenizer.tokenize(d['content'])))
    probs = []
    for tag in tagSet:
        probs.append((tag, round(cl.prob(tag) * 100)))
    classifier.show_most_informative_features(n=20)
    probs = sorted(probs, key=lambda x: x[1], reverse=True)
    return render_template('class1.html', probs=probs, d=d)
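bag_of_words, initialize_collection, request, ObjectId, and render_template are defined elsewhere in this Flask/pymongo app. nltk.NaiveBayesClassifier.train expects (feature_dict, label) pairs, so a plausible minimal bag_of_words (an assumption, not the original helper) would look like this:

# Hypothetical helper: maps each token to a boolean feature, the usual
# "bag of words" shape consumed by nltk.NaiveBayesClassifier.
def bag_of_words(tokens):
    return {token: True for token in tokens}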
Example 3: build_word_dictionary
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    tokenizer = WordPunctTokenizer()
    with open(input_file_name) as input_file:
        for record in json.loads(input_file.read()):
            dictionary.update(tokenizer.tokenize(record['content']))
            dictionary.update(tokenizer.tokenize(record['abstract']))
    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']
    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
Example 4: tokenize_words
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def tokenize_words(sentence):
    """
    :param sentence: sentence to tokenize
    :return: list of words in sentence
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
Example 5: message_to_wordlist
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def message_to_wordlist(message, lemmas_bool, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    #
    # 1. Remove HTML
    #review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove message numbers (references such as ">>12345")
    message_text = re.sub(r">>\d+", "", message)
    message_text = message_text.lower()
    message_text = re.sub(u"ё", 'e', message_text, flags=re.UNICODE)  # pass re.UNICODE as flags, not as the count argument
    message_text = clean_str(message_text)
    tokenizer = WordPunctTokenizer()
    # 3. Convert words to lower case and split them
    words = tokenizer.tokenize(message_text)
    lemmas = []
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    if lemmas_bool == 'l':
        # lemmatize with the external morphological analyser
        for word in words:
            word_parsed = morph.parse(word)
            if len(word_parsed) > 0:
                lemmas.append(word_parsed[0].normal_form)
    elif lemmas_bool == 's':
        # stem with the external stemmer
        for word in words:
            word = stemmer.stem(word)
            if len(word) > 0:
                lemmas.append(word)
    else:
        lemmas = words
    # 5. Return a list of words
    return lemmas
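clean_str, morph, and stemmer are module-level objects that the example does not show. Given the Cyrillic 'ё' handling, a plausible setup (an assumption, not the original source) would be pymorphy2 for lemmatization and an NLTK Snowball stemmer:

# Assumed module-level setup (hypothetical): pymorphy2's MorphAnalyzer exposes
# .parse(word)[0].normal_form and SnowballStemmer exposes .stem(word), which
# matches how morph and stemmer are used above.
import pymorphy2
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

morph = pymorphy2.MorphAnalyzer()
stemmer = SnowballStemmer("russian")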
Example 6: clean_data
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.encode('ascii', 'ignore')
        word = word.lower()
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)
        # replace anything that looks like a URL with a placeholder token
        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)', word) is not None:
            word = 'GENERIC_HTTP'
        return word

    tokenizer = WordPunctTokenizer()
    data = []
    with open(input_file_name) as input_file:
        for sentences, label in json.load(input_file):
            cleaned_sentences = []
            for sentence in sentences:
                cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                cleaned_sentence = tokenizer.tokenize(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)
            data.append([cleaned_sentences, label])
    with codecs.open(output_file_name, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file)
Example 7: clean_data
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.lower()
        # decode common HTML entities back to their literal characters
        word = word.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&#39;', "'")
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)
        word = word.encode('ascii', 'ignore')
        # replace anything that looks like a URL with a placeholder token
        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)', word) is not None:
            word = 'GENERIC_HTTP'
        return word.encode('ascii', 'ignore')

    tokenizer = WordPunctTokenizer()
    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as output_file:
            for line in input_file:
                sentences, score = json.loads(line)
                cleaned_sentences = []
                for sentence in sentences:
                    cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                    cleaned_sentences.append(tokenizer.tokenize(cleaned_sentence))
                json.dump([cleaned_sentences, score], output_file)
                output_file.write("\n")
Example 8: extract_nl_text
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def extract_nl_text(ms):
    """
    Extracts and tokenizes text from malware sample object
    :param ms: MalwareSample object
    :return: list of tokenized strings found in malware sample object's internal strings list
    """
    wpt = WordPunctTokenizer()
    all_tokenized_strings_in_ms = []
    inside_xml_privileges = False
    for s in ms.strings:
        if 'requestedPrivileges' in s or 'This program cannot be run in DOS mode' in s:
            continue
        elif inside_xml_privileges:
            continue
        elif '<assembly xmlns' in s:
            inside_xml_privileges = True
            continue
        elif '</assembly>' in s:
            inside_xml_privileges = False
            continue
        tokenized_string = []
        tokens = wpt.tokenize(s)
        if tokens:
            for t in tokens:
                if wordnet.synsets(t) and len(t) > 3:  # had to use length to eliminate false positives
                    tokenized_string.extend(tokens)
                    break
        if tokenized_string:
            all_tokenized_strings_in_ms.append(tokenized_string)
    return all_tokenized_strings_in_ms
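The wordnet gate in the inner loop comes from nltk.corpus; wordnet.synsets returns an empty list for tokens it does not recognise, which is what lets the loop skip strings that do not look like natural language. A quick illustration (requires the NLTK wordnet corpus to be downloaded):

from nltk.corpus import wordnet

wordnet.synsets("kernel")   # non-empty list of Synset objects -> string is kept
wordnet.synsets("qxzv")     # [] -> token contributes nothing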
Example 9: fred_language_analyser
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
class fred_language_analyser(language_analyser):
    '''A home-grown analyser based on nltk, using a rather crude algorithm.
    '''
    def __init__(self, language='french'):
        '''Initialisation
        language: 'french'
        '''
        self.tokenizer = WordPunctTokenizer()
        self.stopwords = set(stopwords.words(language))
        self.stopwords.add(u"'")

    def text_to_vector(self, text):
        tokens = self.tokenizer.tokenize(text)
        tokens = [token for token in tokens if token.lower() not in self.stopwords]
        return tokens

    def distance(self, text1, text2):
        v1 = self.text_to_vector(text1)
        v2 = self.text_to_vector(text2)
        # Until this is optimised, limit to 6 words
        v1 = v1[0:6]
        v2 = v2[0:6]
        n = max(len(v1), len(v2))
        if len(v1) > len(v2):
            v1, v2 = v2, v1
        v1_1 = v1 + [None] * (n - len(v1))
        distance = 99
        for v1_2 in itertools.permutations(v1_1):  # somewhat wasteful: the None values are permuted among themselves too
            # Distance between the words
            d_mot = 0
            for i in range(n):
                try:
                    d_mot += (6 - min(6, edit_distance(v1_2[i], v2[i]))) ** 2
                except:
                    d_mot += 1  # if None
            d_mot = 6 * (n ** 0.5) - d_mot ** 0.5
            # Distance contributed by the permutation
            # Number of Nones inserted = number of Nones that are neither at the start nor at the end
            v1_3 = []
            debut = True
            for m in v1_2:
                if m or not debut:
                    debut = False
                    v1_3.append(m)
            v1_4 = []
            debut = True
            for i in range(len(v1_3) - 1, -1, -1):
                if v1_3[i] or not debut:
                    debut = False
                    v1_4.append(v1_3[i])
            d_perm = len(v1_4) - len(v1)
            # Word swaps: 3 points per swap
            l = []
            for m in list(filter(lambda x: x, v1_4)):
                l.append(v1.index(m))
            for i in range(len(l) - 1):
                if l[i] < l[i + 1]:
                    d_perm += 3
            distance = min(distance, (d_mot ** 2 + d_perm ** 2) ** 0.5)
        return distance
Example 10: number_of_different_words
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def number_of_different_words(self):
    # TODO: Stemming, then move to language specific classes
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(self.text.strip())
    only_textual_words = filter(unicode.isalpha, words)
    return len(set(only_textual_words))
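This example targets Python 2 (unicode.isalpha is used as an unbound method). A rough Python 3 equivalent of the same counting logic, written as a sketch rather than the original code, would be:

from nltk.tokenize import WordPunctTokenizer

def number_of_different_words(text):
    # str.isalpha replaces unicode.isalpha; the filtered tokens are collected
    # into a set before counting distinct alphabetic words.
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(text.strip())
    return len({w for w in words if w.isalpha()})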
Example 11: TextProcessor
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def TextProcessor(src, tgt, low=True, num=True):
    print "processing " + src
    if low == True:
        print "lowercasing.."
    if num == True:
        print "removing numeric.."
    srcfile = codecs.open(src, "r", "utf-8")
    tgtfile = codecs.open(tgt, "w", "utf-8")
    word_punct_tokenizer = WordPunctTokenizer()
    linecount = 0
    for line in srcfile:
        linecount += 1
        line = word_punct_tokenizer.tokenize(line)
        if low == True:
            for i in range(0, len(line)):
                line[i] = line[i].lower()
        if num == True:
            for i in range(0, len(line)):
                if line[i].isnumeric() == True:
                    line[i] = "<number>"
        tgtfile.write(listtostring(line))
    srcfile.close()
    tgtfile.close()
    print "done processing " + str(linecount) + " lines!!"
Example 12: extract_words
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def extract_words(text):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
    return result
Example 13: get_similarity_score
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def get_similarity_score(a, b):
    """Check if a and b are matches."""
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    tokenizer = WordPunctTokenizer()
    tokens_a = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a)
                if token.lower().strip(string.punctuation) not in stopwords]
    tokens_b = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b)
                if token.lower().strip(string.punctuation) not in stopwords]
    # Calculate Jaccard similarity
    ratio = 0
    if len(set(tokens_a).union(tokens_b)) > 0:
        ratio = len(set(tokens_a).intersection(tokens_b)) / float(len(set(tokens_a).union(tokens_b)))
    return ratio
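As a worked illustration (not from the original project): stop words and punctuation are discarded first, then the Jaccard ratio |A ∩ B| / |A ∪ B| is computed over the remaining token sets.

# Assumes the same imports the example relies on: nltk, string, WordPunctTokenizer.
get_similarity_score("The cat sat on the mat.", "A cat sat.")
# tokens_a -> ['cat', 'sat', 'mat'], tokens_b -> ['cat', 'sat']
# ratio = |{'cat', 'sat'}| / |{'cat', 'sat', 'mat'}| = 2 / 3 ≈ 0.67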
Example 14: get_words_without_stopwords
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def get_words_without_stopwords(self, text):
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    tokenizer = WordPunctTokenizer()
    tokens = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(text)
              if token.lower().strip(string.punctuation) not in stopwords]
    return tokens
Example 15: get_tokens
# Required import: from nltk.tokenize import WordPunctTokenizer [as alias]
# Or: from nltk.tokenize.WordPunctTokenizer import tokenize [as alias]
def get_tokens(sentence):
    """
    Tokenizes a sentence
    :param sentence: sentence string
    :return: list of tokens
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)