

Python FreqDist.get Method Code Examples

This article collects typical usage examples of the Python method nltk.probability.FreqDist.get. If you have been wondering what FreqDist.get does, how to use it, or where to find working examples, the hand-picked code samples below may help. You can also explore further usage examples of the containing class, nltk.probability.FreqDist.


Below are 9 code examples of the FreqDist.get method, ordered by popularity by default.
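
Before the examples, a minimal sketch of the method itself (assuming NLTK 3, where FreqDist is a dictionary-like frequency counter): get returns a token's count, or None / a supplied default when the token never occurred.

# A minimal sketch of FreqDist.get, assuming NLTK 3 is installed.
from nltk.probability import FreqDist

fdist = FreqDist(["the", "cat", "sat", "on", "the", "mat"])
print(fdist.get("the"))      # 2
print(fdist.get("dog"))      # None -- "dog" never occurred
print(fdist.get("dog", 0))   # 0 -- a safer default for arithmetic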

Example 1: simhash

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import get [as alias]
# This snippet also needs: import hashlib; from bitarray import bitarray;
# from nltk.tokenize import regexp_tokenize
    def simhash(raw_text):
        """Compute the 128-bit simhash value for a string."""
        fdist = FreqDist()
        for word in regexp_tokenize(raw_text, pattern=r'\w+([.,]\w+)*|\S+'):
            fdist[word.lower()] += 1  # FreqDist.inc() was removed in NLTK 3

        v = [0] * 128

        for word in fdist:
            # Project each word onto 128 bits via its MD5 digest.
            projection = bitarray()
            projection.frombytes(hashlib.md5(word.encode('utf-8')).digest())

            for i in range(128):
                if projection[i]:
                    v[i] += fdist.get(word)
                else:
                    v[i] -= fdist.get(word)

        # Each output bit is the sign of the corresponding component sum.
        hash_val = bitarray(128)
        hash_val.setall(False)

        for i in range(128):
            if v[i] > 0:
                hash_val[i] = True
        return hash_val
Author: TPNguyen, Project: neardups, Lines: 30, Source: test_deal.py
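
A hypothetical usage sketch (the sample strings and the 10-bit threshold are illustrative, not from the neardups project): near-duplicate detection compares two simhashes by Hamming distance, which bitarray gives you via XOR plus a count of set bits.

# Hypothetical usage, assuming simhash is callable as a plain function.
h1 = simhash("the quick brown fox jumps over the lazy dog")
h2 = simhash("the quick brown fox jumped over the lazy dog")
hamming = (h1 ^ h2).count()  # number of differing bits out of 128
print("near-duplicate" if hamming <= 10 else "different", hamming)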

Example 2: get_term_freq_dict

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import get [as alias]
# This snippet also assumes: from nltk.tokenize import word_tokenize, and a
# module-level lemmatizer such as wordnet = nltk.stem.WordNetLemmatizer()
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()

    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)

    # Lemmatize each token and merge counts of tokens sharing a lemma
    word_freq = {}

    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)

        # If the lemma already exists in word_freq, add the value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val

        # Else, assign the value
        else:
            word_freq[lemmatize_term] = val

    return word_freq
Author: Maverickwarrior, Project: Search-Engine, Lines: 28, Source: tokenize_docs.py
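
A hypothetical call (the input sentence is illustrative): singular and plural forms collapse onto one lemma, so their counts merge.

# Hypothetical usage of get_term_freq_dict; output shown for illustration.
print(get_term_freq_dict("The cat chased the cats"))
# {'the': 2, 'cat': 2, 'chased': 1}  (key order may vary)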

Example 3: featureList

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import get [as alias]
# This snippet also assumes: import nltk, plus module-level noFeat and
# trainKeys (the ordered feature vocabulary) defined elsewhere in SVM.py
def featureList(corpus):
    featList = []
    for post in corpus:
        listItem = [0] * noFeat
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))

        # Build one fixed-length count vector per post, ordered by trainKeys.
        i = 0
        for key in trainKeys:
            if key in fileFreqDist:  # has_key() was removed in Python 3
                listItem[i] = fileFreqDist.get(key)
            i = i + 1

        featList.append(listItem)

    return featList
Author: abhinavmishra590, Project: NLPbased-Automated_Tagging_System, Lines: 18, Source: SVM.py
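
A hypothetical driver for the snippet above (noFeat and trainKeys live elsewhere in the original SVM.py; the values here are illustrative):

# Hypothetical setup and call; values chosen for illustration only.
trainKeys = ["good", "bad", "movie"]
noFeat = len(trainKeys)
print(featureList(["good good movie", "bad movie"]))
# [[2, 0, 1], [0, 1, 1]]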

Example 4: featureList

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import get [as alias]
# This snippet likewise assumes: import nltk, plus module-level noFeat and
# trainKeys defined elsewhere in SVM.py
def featureList(corpus):
    featList = []
    for trFile in corpus.fileids():
        listItem = [0] * noFeat
        fileFreqDist = nltk.FreqDist(corpus.words(trFile))

        # One count vector per corpus file, ordered by trainKeys.
        i = 0
        for key in trainKeys:
            if key in fileFreqDist:  # has_key() was removed in Python 3
                listItem[i] = fileFreqDist.get(key)
            i = i + 1

        featList.append(listItem)

    return featList
Author: ashish0038, Project: sentimentAnalysis, Lines: 18, Source: SVM.py

Example 5: createFeatures

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import get [as alias]
# This snippet also assumes: import nltk
def createFeatures(sentVect, ordList):

    noFeat = len(ordList)

    featList = []
    for post in sentVect:
        listItem = [0] * noFeat
        fileFreqDist = nltk.FreqDist(nltk.word_tokenize(post))

        # One count vector per sentence, ordered by ordList.
        i = 0
        for key in ordList:
            if key in fileFreqDist:  # has_key() was removed in Python 3
                listItem[i] = fileFreqDist.get(key)
            i = i + 1

        featList.append(listItem)

    return featList
Author: abhinavmishra590, Project: NLPbased-Automated_Tagging_System, Lines: 21, Source: SVM.py

Example 6: createProbDist

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import get [as alias]
def createProbDist(readerWordlist, writerUniqueWordlist):

    ### build a list holding the relative frequency of each term
    prob_dist = []

    ### use nltk to calculate the frequency of each word
    unigramWordList = FreqDist(readerWordlist)
    datalen = len(readerWordlist)  ### total words in that document

    for word in writerUniqueWordlist:
        if word in unigramWordList:
            prob_dist.append(unigramWordList.get(word) / float(datalen))
        else:
            prob_dist.append(0)

    return prob_dist
Author: UW-INFX575, Project: Kirtika_dhathathri, Lines: 24, Source: updated-jargonDist.py
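
A hypothetical call (the word lists are illustrative): each output entry is the word's relative frequency in the reader's text, or 0 when it never occurs.

# Hypothetical usage of createProbDist; lists chosen for illustration.
reader_words = ["the", "gene", "the", "cell"]
writer_vocab = ["gene", "protein"]
print(createProbDist(reader_words, writer_vocab))  # [0.25, 0]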

Example 7: main

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import get [as alias]
# This script also relies on: import argparse, sys, nltk; from operator
# import itemgetter; from tabulate import tabulate; plus the DEFAULT_*,
# SORT_*, STOPWORDS and word_class() names defined elsewhere in the file
def main():
    parser = argparse.ArgumentParser(description='Categorize (nouns, verbs, etc.) all words in a text')

    parser.add_argument('file', type=argparse.FileType('r'),  # 'rU' mode was removed in Python 3.11
                        help='text file to categorize')

    parser.add_argument('--include-stopwords', action='store_true', default=DEFAULT_STOPWORDS,
                        help='include very common words in results (default: {})'
                        .format(DEFAULT_STOPWORDS))

    parser.add_argument('--sort', type=str, default=DEFAULT_SORT,
                        choices=[SORT_ALPHA, SORT_OCCURRENCES, SORT_LENGTH, SORT_CLASS],
                        help='''how to sort output; by word alphabetically,
                        number of occurrences, word length, or word class (default: {})'''
                        .format(DEFAULT_SORT))

    args = parser.parse_args()

    if not args.file:
        sys.exit('You must specify a text file to categorize')

    # Download required nltk data if needed.
    nltk.download('maxent_treebank_pos_tagger')
    if not args.include_stopwords:
        nltk.download('stopwords')

    # Create tokenized lists of words from the text.
    text = args.file.read()
    text = text.lower()
    tokens = nltk.word_tokenize(text)

    # Only keep words consisting entirely of letters (i.e. remove punctuation,
    # numbers, etc.)
    tokens = [word for word in tokens if word.isalpha()]

    # Save the number of occurrences of each word for later reference. A
    # collections.Counter would also work, but FreqDist covers more cases
    # if this is expanded on later.
    fdist = FreqDist(tokens)

    # Remove stopwords, i.e. very common ones.
    tokens = set(tokens)
    if not args.include_stopwords:
        tokens -= STOPWORDS

    print('Found {} unique words'.format(len(tokens)))

    # Make a list of (word, occurrences, length, word_class) tuples for final
    # output. `word` is the word itself, `occurrences` the number of times it
    # occurs in its source text, `length` the length of the word, and
    # `word_class` what type of word it is, e.g. noun, verb, etc.
    matches = [(word, fdist.get(word), len(word), word_class(word)) for word in tokens]
    headers = ['word', 'occurrences', 'length', 'word_class']

    # Sort output accordingly.
    if args.sort == SORT_ALPHA:
        matches = sorted(matches, key=itemgetter(0))
    elif args.sort == SORT_OCCURRENCES:
        matches = sorted(matches, key=itemgetter(1, 0), reverse=True)
    elif args.sort == SORT_LENGTH:
        matches = sorted(matches, key=itemgetter(2, 0), reverse=True)
    elif args.sort == SORT_CLASS:
        matches = sorted(matches, key=itemgetter(3, 0))

    # Finally, print results in a nice table.
    print(tabulate(matches, headers=headers))
Author: alimony, Project: text-comparison-tools, Lines: 68, Source: word-classes.py

Example 8: defaultdict

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import get [as alias]
# This fragment also assumes: from collections import defaultdict; from
# numpy import array, zeros_like, dtype; two FreqDist instances
# gene_substrings / notgene_substrings; and substrings() plus
# goldstandard_words defined earlier in the file
    for (token, is_gene) in goldstandard_words:
        dist = gene_substrings if is_gene else notgene_substrings
        dist.update(substrings(token))

    difference = defaultdict(float)
    size1 = float(gene_substrings.N())
    size2 = float(notgene_substrings.N())

    # difference is a hash table mapping substrings to the difference in
    # their relative frequencies. Positive means: common in gene names;
    # negative means: rare in gene names.

    for substr in (gene_substrings + notgene_substrings).keys():  # iterkeys() is Python 2 only
        v1 = gene_substrings.get(substr) or 0
        v2 = notgene_substrings.get(substr) or 0

        difference[substr] = v1 / size1 - v2 / size2

    r = range(-10, 10 + 1)  # xrange() is Python 2 only
    A = array([array(r), zeros_like(r)])
    A.dtype = dtype('float32')

    # for (i, e) in enumerate(xrange(-10, 10 + 1)):
    #     if e != 0:
    #         epsilon = 0.2 / e
    #         print "epsilon = ", epsilon
    #         klassi = OrClassifier([
    #             DictionaryClassifier(given_genes, stopwords),
    #             DifferenceClassifier(difference),
Author: kedorlaomer, Project: ta3, Lines: 32, Source: useless_classifiers.py

Example 9: ConllCorpusReader

# Required import: from nltk.probability import FreqDist [as alias]
# Or: from nltk.probability.FreqDist import get [as alias]
from nltk.corpus.reader import ConllCorpusReader  # needed by the snippet below
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # load a training corpus from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # list of 12 POS tags
sentslen = len(conllreader.tagged_sents())  # number of sentences

tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())  # frequency of each tag

firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())  # frequency of sentence-initial tags
# Dict comprehensions replace the Python 2 lambda-tuple/iteritems() idiom.
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})  # initial-state probabilities
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

# Collect every adjacent tag pair for the transition counts.
TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words) - 1):
    TagPair.append((words[i][1], words[i + 1][1]))

TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist.get(k[0]) for k, x in TagPairfdist.items()})  # transition probabilities
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: x / tagfdist.get(k[1]) for k, x in TagWordfdist.items()})  # emission probabilities
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
BiwMLE = MLEProbDist(TagWordfdist)
Author: andrey2111, Project: ANLP_viterbi, Lines: 32, Source: Train.py
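
A hypothetical query sketch against the distributions built above (the word 'Haus' is an illustrative German token; de-train.tt must exist for the training code to run). All ProbDist variants share the same .prob() lookup:

# Hypothetical queries; tag names come from the states tuple above.
print(A0jLap.prob('NOUN'))           # P(sentence starts with NOUN), Laplace-smoothed
print(AijMLE.prob(('DET', 'NOUN')))  # P(NOUN follows DET), maximum likelihood
print(BiwGT.prob(('Haus', 'NOUN')))  # smoothed score of the (word, tag) pair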


Note: the nltk.probability.FreqDist.get examples on this page were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are excerpted from open-source projects contributed by their respective authors, who retain copyright; consult each project's license before redistributing or using the code, and do not republish without permission.