This article collects typical usage examples of the Python method nltk.probability.FreqDist.get. If you have been wondering what FreqDist.get does, how to call it, or where to find it used in real code, the curated examples below may help. You can also explore further usage examples of the class it belongs to, nltk.probability.FreqDist.
The following shows 9 code examples of FreqDist.get, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
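Before the examples, a minimal sketch of the method itself: FreqDist.get behaves exactly like dict.get, returning the count of a sample, or None (or a supplied default) when the sample has never been seen.
from nltk.probability import FreqDist

fdist = FreqDist(['the', 'cat', 'sat', 'on', 'the', 'mat'])
print(fdist.get('the'))     # 2
print(fdist.get('dog'))     # None -- unseen samples have no entry
print(fdist.get('dog', 0))  # 0 -- an explicit default can be supplied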
Example 1: simhash
# Required import: from nltk.probability import FreqDist
# Or: from nltk.probability.FreqDist import get
import hashlib

from bitarray import bitarray
from nltk.probability import FreqDist
from nltk.tokenize import regexp_tokenize

def simhash(raw_text):
    """Compute the 128-bit simhash value for a string."""
    fdist = FreqDist()
    for word in regexp_tokenize(raw_text, pattern=r'\w+([.,]\w+)*|\S+'):
        fdist[word.lower()] += 1  # FreqDist.inc() was removed in NLTK 3
    v = [0] * 128
    for word in fdist:
        # Project each word onto 128 bits via its MD5 digest.
        projection = bitarray()
        projection.frombytes(hashlib.md5(word.encode('utf-8')).digest())
        for i in range(128):
            if projection[i]:
                v[i] += fdist.get(word)
            else:
                v[i] -= fdist.get(word)
    hash_val = bitarray(128)
    hash_val.setall(False)
    for i in range(128):
        if v[i] > 0:
            hash_val[i] = True
    return hash_val
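A hypothetical usage sketch (assuming the bitarray package and the function above are available): near-duplicate texts should yield hashes that differ in only a few of the 128 bits.
h1 = simhash("the quick brown fox jumps over the lazy dog")
h2 = simhash("the quick brown fox jumped over the lazy dog")
hamming_distance = (h1 ^ h2).count()  # number of differing bits out of 128
print(hamming_distance)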
Example 2: get_term_freq_dict
# Required import: from nltk.probability import FreqDist
# Or: from nltk.probability.FreqDist import get
def get_term_freq_dict(data):
    # Convert to lower case
    lower_data = data.lower()
    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)
    # Lemmatize each term, merging the counts of terms that share a lemma
    word_freq = {}
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)
        # If the lemma already exists in word_freq, add to its count
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
        # Otherwise, assign the value
        else:
            word_freq[lemmatize_term] = val
    return word_freq
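The function relies on names defined elsewhere in its module; a minimal setup sketch, assuming `wordnet` is simply a WordNetLemmatizer instance:
from nltk import word_tokenize
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer

wordnet = WordNetLemmatizer()

# Plural and singular forms are merged under a single lemma key.
print(get_term_freq_dict("The cats watched the cat"))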
Example 3: featureList
# Required import: from nltk.probability import FreqDist
# Or: from nltk.probability.FreqDist import get
def featureList(corpus):
    featList = []
    for post in corpus:
        # One bag-of-words count vector per post, indexed by trainKeys
        listItem = [0] * noFeat
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
        i = 0
        for key in trainKeys:
            if key in fileFreqDist:  # FreqDist.has_key() no longer exists in Python 3
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
        featList.append(listItem)
    return featList
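This example (and the nearly identical Example 4 below) reads the module-level names `trainKeys` and `noFeat` from its surrounding script; a hedged sketch of what they might look like:
import nltk

# Hypothetical feature vocabulary, e.g. the most frequent training tokens.
trainKeys = ['good', 'bad', 'movie']
noFeat = len(trainKeys)

print(featureList(["a good movie, a really good one", "bad acting, bad script"]))
# expected: [[2, 0, 1], [0, 2, 0]]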
Example 4: featureList
# Required import: from nltk.probability import FreqDist
# Or: from nltk.probability.FreqDist import get
def featureList(corpus):
    featList = []
    for trFile in corpus.fileids():
        # One bag-of-words count vector per corpus file, indexed by trainKeys
        listItem = [0] * noFeat
        fileFreqDist = nltk.FreqDist(corpus.words(trFile))
        i = 0
        for key in trainKeys:
            if key in fileFreqDist:  # FreqDist.has_key() no longer exists in Python 3
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
        featList.append(listItem)
    return featList
Example 5: createFeatures
# Required import: from nltk.probability import FreqDist
# Or: from nltk.probability.FreqDist import get
def createFeatures(sentVect, ordList):
    noFeat = len(ordList)
    featList = []
    for post in sentVect:
        # One bag-of-words count vector per sentence, indexed by ordList
        listItem = [0] * noFeat
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))
        i = 0
        for key in ordList:
            if key in fileFreqDist:  # FreqDist.has_key() no longer exists in Python 3
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
        featList.append(listItem)
    return featList
Example 6: createProbDist
# Required import: from nltk.probability import FreqDist
# Or: from nltk.probability.FreqDist import get
def createProbDist(readerWordlist, writerUniqueWordlist):
    ### Relative frequency of every writer word within the reader's document
    prob_dist = []
    ### Use NLTK to calculate the frequency of each word
    unigramWordList = FreqDist(readerWordlist)
    datalen = len(readerWordlist)  ### total number of words in the document
    for word in writerUniqueWordlist:
        if word in unigramWordList:
            prob_dist.append(unigramWordList.get(word) / float(datalen))
        else:
            prob_dist.append(0)
    return prob_dist
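A hypothetical usage sketch: for each word in the writer's vocabulary, the function returns its relative frequency in the reader's document, or zero if it never occurs.
reader_words = "the cat sat on the mat".split()
writer_vocab = ["the", "cat", "dog"]
print(createProbDist(reader_words, writer_vocab))
# -> [0.3333333333333333, 0.16666666666666666, 0]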
Example 7: main
# Required import: from nltk.probability import FreqDist
# Or: from nltk.probability.FreqDist import get
import argparse
import sys
from operator import itemgetter

import nltk
from nltk.probability import FreqDist
from tabulate import tabulate

def main():
    parser = argparse.ArgumentParser(description='Categorize (nouns, verbs, etc.) all words in a text')
    parser.add_argument('file', type=argparse.FileType('r'),  # 'rU' mode is no longer accepted by current Python
                        help='text file to categorize')
    parser.add_argument('--include-stopwords', action='store_true', default=DEFAULT_STOPWORDS,
                        help='include very common words in results (default: {})'
                        .format(DEFAULT_STOPWORDS))
    parser.add_argument('--sort', type=str, default=DEFAULT_SORT,
                        choices=[SORT_ALPHA, SORT_OCCURRENCES, SORT_LENGTH, SORT_CLASS],
                        help='''how to sort output; by word alphabetically,
                        number of occurrences, word length, or word class (default: {})'''
                        .format(DEFAULT_SORT))
    args = parser.parse_args()
    if not args.file:
        sys.exit('You must specify a text file to categorize')
    # Download the required nltk data if needed.
    nltk.download('maxent_treebank_pos_tagger')
    if not args.include_stopwords:
        nltk.download('stopwords')
    # Create a tokenized list of words from the text.
    text = args.file.read()
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    # Only keep words consisting entirely of letters (i.e. remove punctuation,
    # numbers, etc.)
    tokens = [word for word in tokens if word.isalpha()]
    # Save the number of occurrences of each word for later reference; a
    # collections.Counter would also work, but FreqDist covers more cases if
    # this is expanded on later.
    fdist = FreqDist(tokens)
    # Remove stopwords, i.e. very common ones.
    tokens = set(tokens)
    if not args.include_stopwords:
        tokens -= STOPWORDS
    print('Found {} unique words'.format(len(tokens)))
    # Make a list of (word, occurrences, length, word_class) tuples for the
    # final output. `word` is the word itself, `occurrences` the number of
    # times it occurs in the source text, `length` the length of the word, and
    # `word_class` what type of word it is, e.g. noun, verb, etc.
    matches = [(word, fdist.get(word), len(word), word_class(word)) for word in tokens]
    headers = ['word', 'occurrences', 'length', 'word_class']
    # Sort the output accordingly.
    if args.sort == SORT_ALPHA:
        matches = sorted(matches, key=itemgetter(0))
    elif args.sort == SORT_OCCURRENCES:
        matches = sorted(matches, key=itemgetter(1, 0), reverse=True)
    elif args.sort == SORT_LENGTH:
        matches = sorted(matches, key=itemgetter(2, 0), reverse=True)
    elif args.sort == SORT_CLASS:
        matches = sorted(matches, key=itemgetter(3, 0))
    # Finally, print the results in a nice table.
    print(tabulate(matches, headers=headers))
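The script also references several module-level names it does not define here; a hedged sketch of plausible definitions (all names and values below are assumptions, not the original author's code):
import nltk
from nltk.corpus import stopwords

SORT_ALPHA, SORT_OCCURRENCES, SORT_LENGTH, SORT_CLASS = 'alpha', 'occurrences', 'length', 'class'
DEFAULT_SORT = SORT_ALPHA
DEFAULT_STOPWORDS = False
STOPWORDS = set(stopwords.words('english'))

def word_class(word):
    """Return a coarse word class (POS tag) for a single word."""
    return nltk.pos_tag([word])[0][1]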
Example 8: defaultdict
# Required import: from nltk.probability import FreqDist
# Or: from nltk.probability.FreqDist import get
for (token, is_gene) in goldstandard_words:
    dist = gene_substrings if is_gene else notgene_substrings
    dist.update(substrings(token))
difference = defaultdict(float)
size1 = float(gene_substrings.N())
size2 = float(notgene_substrings.N())
# `difference` is a hash table whose keys are substrings and whose values are
# differences in their relative frequencies. Positive means: frequent in gene
# names; negative means: rare in gene names.
for substr in (gene_substrings + notgene_substrings).keys():  # iterkeys() is Python 2 only
    v1 = gene_substrings.get(substr) or 0
    v2 = notgene_substrings.get(substr) or 0
    difference[substr] = v1 / size1 - v2 / size2
r = range(-10, 10 + 1)
A = array([array(r), zeros_like(r)])
A.dtype = dtype('float32')
# for (i, e) in enumerate(range(-10, 10 + 1)):
#     if e != 0:
#         epsilon = 0.2 / e
#         print("epsilon = ", epsilon)
#         klassi = OrClassifier([
#             DictionaryClassifier(given_genes, stopwords),
#             DifferenceClassifier(difference),
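The fragment above depends on names defined elsewhere in its module (a substring helper, two FreqDist accumulators, and the gold-standard data); a hedged sketch of what they might look like:
from collections import defaultdict

from nltk.probability import FreqDist
from numpy import array, dtype, zeros_like

def substrings(token, min_len=2, max_len=4):
    """All character n-grams of the token with lengths between min_len and max_len."""
    return [token[i:i + n]
            for n in range(min_len, max_len + 1)
            for i in range(len(token) - n + 1)]

gene_substrings = FreqDist()
notgene_substrings = FreqDist()
# Hypothetical (token, is_gene) pairs from an annotated gold standard.
goldstandard_words = [("BRCA1", True), ("p53", True), ("protein", False), ("cell", False)]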
Example 9: ConllCorpusReader
# Required import: from nltk.probability import FreqDist
# Or: from nltk.probability.FreqDist import get
from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # read a tagged training corpus from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # the 12 universal POS tags
sentslen = len(conllreader.tagged_sents())  # number of sentences
tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())  # frequency of each tag
firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())  # frequency of sentence-initial tags

# Initial-state probabilities: P(tag at sentence start)
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})  # tuple-unpacking lambdas are Python 2 only
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# Transition counts: pairs of adjacent tags
TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words) - 1):
    TagPair.append((words[i][1], words[i + 1][1]))
TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist.get(k[0]) for k, x in TagPairfdist.items()})  # P(next tag | previous tag)
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

# Emission counts: (word, tag) pairs
TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: x / tagfdist.get(k[1]) for k, x in TagWordfdist.items()})  # P(word | tag)
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
BiwMLE = MLEProbDist(TagWordfdist)
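A hypothetical usage sketch of the distributions estimated above (assuming the corpus file de-train.tt exists; 'Haus' stands in for any word that actually occurs in it):
print(A0jLap.prob('NOUN'))          # smoothed probability of a sentence starting with NOUN
print(Aij.prob(('DET', 'NOUN')))    # relative-frequency estimate of P(NOUN | previous tag DET)
print(Biw.prob(('Haus', 'NOUN')))   # relative-frequency estimate of P('Haus' | NOUN)
print(AijGT.prob(('DET', 'NOUN')))  # Good-Turing-smoothed probability of the (DET, NOUN) tag pair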