This article collects typical usage examples of the nltk.bigrams function in Python. If you are wondering how exactly to use the Python bigrams function, how to call it, or where to find examples of it in use, the hand-picked code samples here may help.
Below are 15 code examples of the bigrams function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
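Before the examples, a minimal sketch of what nltk.bigrams itself returns may be useful; with NLTK 3.x it yields a generator of adjacent token pairs:

import nltk

tokens = ["the", "quick", "brown", "fox"]
# nltk.bigrams returns a lazy generator of adjacent token pairs
print(list(nltk.bigrams(tokens)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]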
Example 1: freq_dst
def freq_dst(self, posCorpus, negCorpus):
    # Create frequency distributions for the words in each corpus
    posFreqDist = FreqDist()
    for word in posCorpus.words():
        posFreqDist.inc(word)
    negFreqDist = FreqDist()
    for word in negCorpus.words():
        negFreqDist.inc(word)
    # Frequency distributions with Laplace smoothing
    global posLapFreq
    posLapFreq = nltk.probability.LaplaceProbDist(posFreqDist)
    global negLapFreq
    negLapFreq = nltk.probability.LaplaceProbDist(negFreqDist)
    # Get bigrams
    posBigrams = nltk.bigrams(posCorpus.words())
    negBigrams = nltk.bigrams(negCorpus.words())
    # Get number of words per corpus
    posWordLen = len(posCorpus.words())
    negWordLen = len(negCorpus.words())
    # Laplace-smoothed frequency distributions for the bigrams
    global posBiFreq
    posBiFreq = nltk.probability.LaplaceProbDist(nltk.FreqDist(posBigrams))
    global negBiFreq
    negBiFreq = nltk.probability.LaplaceProbDist(nltk.FreqDist(negBigrams))
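The global Laplace-smoothed distributions built here can then be queried with prob(); a tiny illustrative use, assuming freq_dst has already been run on two PlaintextCorpusReader corpora (the word and bigram below are made-up inputs):

p_word = posLapFreq.prob("excellent")        # smoothed unigram probability
p_bigram = posBiFreq.prob(("very", "good"))  # smoothed bigram probability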
Example 2: find_colloc
def find_colloc(data):  # find the most common collocations
    def check(wb, tb):
        if len(wb[0]) <= 1 or len(wb[1]) <= 2:
            return False
        try:
            if detect(wb[0]) != "ar" or detect(wb[1]) != "ar":
                return False
        except:
            return False
        if tb in [("NN", "NN"), ("NN", "DTNN"), ("NNP", "NNP")]:
            return True
        return False

    bigrams = FreqDist()
    for d in data:
        tokens = d["tokens"]
        words_bigrams = nltk.bigrams([t[0] for t in tokens])
        tags_bigrams = nltk.bigrams([t[1] for t in tokens])
        for wb, tb in zip(words_bigrams, tags_bigrams):
            if check(wb, tb):
                bigrams[wb] += 1
    return bigrams
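A hedged sketch of how find_colloc might be called: the data layout below is an assumption inferred from the code (each item carries POS-tagged tokens under "tokens"), and detect is presumably langdetect.detect, imported by the surrounding module.

from langdetect import detect   # assumed source of detect()
from nltk import FreqDist
import nltk

data = [{"tokens": [(u"مدينة", "NN"), (u"القدس", "DTNN")]}]   # hypothetical tagged document
top = find_colloc(data)
print(top.most_common(10))      # most frequent Arabic noun-noun collocations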
Example 3: similarity
def similarity(paper1, paper2):
    score = []
    stops = nltk.corpus.stopwords.words('english')  # stopwords to weed out
    ## compare the titles and score word cosine similarity
    title1 = paper1[1]
    title2 = paper2[1]
    tokens1 = [w for w in nltk.word_tokenize(title1) if w not in stops]
    tokens2 = [w for w in nltk.word_tokenize(title2) if w not in stops]
    fd1 = nltk.FreqDist(tokens1)
    fd2 = nltk.FreqDist(tokens2)
    keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    scoretemp = 0
    for key in keys:
        scoretemp += fd1[key] * fd2[key]
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
        score.append(1 - scoretemp / a)
    else:
        score.append(0)
    ## compare the abstracts and score single-word cosine similarity
    abstract1 = paper1[3]
    abstract2 = paper2[3]
    tokens1 = [w for w in nltk.word_tokenize(abstract1) if w not in stops]
    tokens2 = [w for w in nltk.word_tokenize(abstract2) if w not in stops]
    fd1 = nltk.FreqDist(tokens1)
    fd2 = nltk.FreqDist(tokens2)
    keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    scoretemp = 0
    for key in keys:
        scoretemp += fd1[key] * fd2[key]
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
        score.append(1 - scoretemp / a)
    else:
        score.append(0)
    ## compare the abstracts and score bigram cosine similarity
    tokens1 = nltk.word_tokenize(abstract1)
    tokens2 = nltk.word_tokenize(abstract2)
    bgsall1 = nltk.bigrams(tokens1)
    bgsall2 = nltk.bigrams(tokens2)
    bgs1 = [bg for bg in bgsall1 if bg[0] not in stops and bg[1] not in stops]
    bgs2 = [bg for bg in bgsall2 if bg[0] not in stops and bg[1] not in stops]
    fd1 = nltk.FreqDist(bgs1)
    fd2 = nltk.FreqDist(bgs2)
    keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    scoretemp = 0
    for key in keys:
        scoretemp += fd1[key] * fd2[key]
    # print(fd1.values())
    a = numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))
    if a:
        score.append(1 - scoretemp / a)
    else:
        score.append(0)
    ## total score is the sum of the three scores
    return sum(score)
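The three blocks above repeat the same cosine computation; a small helper along these lines (a hypothetical refactoring, not part of the original project) expresses it once:

import numpy
import nltk

def cosine_distance(fd1, fd2):
    # 1 minus the cosine similarity of two nltk.FreqDist objects viewed as sparse vectors
    dot = sum(fd1[key] * fd2[key] for key in set(fd1) | set(fd2))
    norm = numpy.linalg.norm(list(fd1.values())) * numpy.linalg.norm(list(fd2.values()))
    return 1 - dot / norm if norm else 0

Each per-field score in similarity would then reduce to score.append(cosine_distance(fd1, fd2)).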
Example 4: test
def test():
    uniDictList = [{} for x in range(6)]
    biDictList = [{} for x in range(6)]
    vocabSize = [0 for x in range(6)]
    totalSize = [0 for x in range(6)]
    biVocabSize = [0 for x in range(6)]
    bitotalSize = [0 for x in range(6)]
    numList = [0 for x in range(6)]
    numCorrect = total = 0
    # randomly split the data set
    for entry in entryList:
        if random.random() > 0.10:
            entry.test = 0
        else:
            entry.test = 1
    # compute training dictionaries
    for entry in entryList:
        if entry.test == 0:
            for word in entry.review.split():
                uniDictList[entry.rating][word] = uniDictList[entry.rating].get(word, 0) + 1
            for bigram in bigrams(entry.review.split()):
                biDictList[entry.rating][bigram] = biDictList[entry.rating].get(bigram, 0) + 1
            numList[entry.rating] += 1
    print numList
    totalCount = reduce(lambda x, y: x + y, numList)
    # compute dictionary stats
    for x in xrange(1, 6):
        vocabSize[x] = len(uniDictList[x].keys())
        totalSize[x] = reduce(lambda x, y: x + y, uniDictList[x].values())
        biVocabSize[x] = len(biDictList[x].keys())
        bitotalSize[x] = reduce(lambda x, y: x + y, biDictList[x].values())
    # testing
    for entry in entryList:
        if entry.test == 1:
            rankProb = [0 for x in range(6)]
            for x in range(1, 6):
                for word in entry.review.split():
                    rankProb[x] += math.log(uniDictList[x].get(word, 1)) - math.log(vocabSize[x] + totalSize[x])
                for bigram in bigrams(entry.review.split()):
                    rankProb[x] += math.log(biDictList[x].get(bigram, 1)) - math.log(biVocabSize[x] + bitotalSize[x])
            # note: the result of map() is not assigned, so this line does not change rankProb
            map(lambda x: x * numList[entry.rating] / totalCount, rankProb)
            entry.pRating = rankProb.index(max(rankProb[1:6]))
            if entry.pRating == entry.rating:
                numCorrect += 1
            total += 1
    print bigrams(entry.review.split())
    return [numCorrect, total]
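The function assumes a module-level entryList of objects with review and rating attributes; a hypothetical minimal setup (the Entry class below is an assumption, not part of the original) might look like:

import random, math
from nltk import bigrams

class Entry(object):
    def __init__(self, review, rating):
        self.review = review    # raw review text
        self.rating = rating    # integer star rating, 1-5
        self.test = 0
        self.pRating = 0

entryList = [Entry("great movie , loved it", 5), Entry("boring and slow", 2)]
# with a realistically sized entryList: numCorrect, total = test()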
Example 5: estimateLikelihood
def estimateLikelihood(self):
    uniqBigrams = set()
    uniqCount = 0
    for tweet in self._focusTweets['aae']:
        tweet = tweet.split('\t')
        for bigram in nltk.bigrams(tweet):
            try:
                dummy = self._biDict[bigram]
                self._likelihood['aae'][bigram] += 1
                self._likelihood['aae']['__BITOTAL__'] += 1
                if bigram not in uniqBigrams:
                    uniqBigrams.add(bigram)
                    uniqCount += 1
            except:
                continue
    self._likelihood['aae']['__BITOTAL__'] += uniqCount  ## add vocabulary size to the total for add-one smoothing
    sys.stderr.write("Likelihood Bigram Entries AAE:" + str(len(self._likelihood['aae'])) + "\n")

    uniqBigrams = set()
    uniqCount = 0
    for tweet in self._focusTweets['mse']:
        tweet = tweet.split('\t')
        for bigram in nltk.bigrams(tweet):
            try:
                dummy = self._biDict[bigram]
                self._likelihood['mse'][bigram] += 1
                self._likelihood['mse']['__BITOTAL__'] += 1
                if bigram not in uniqBigrams:
                    uniqBigrams.add(bigram)
                    uniqCount += 1
            except:
                continue
    self._likelihood['mse']['__BITOTAL__'] += uniqCount
    sys.stderr.write("Likelihood Bigram Entries MSE:" + str(len(self._likelihood['mse'])) + "\n")
Example 6: to_bigram
def to_bigram(self, termpos):
    words = [elem[0] for elem in termpos]
    pos_tags = [elem[1] for elem in termpos]
    b_words = nltk.bigrams(words)
    b_pos = nltk.bigrams(pos_tags)
    return (b_words, b_pos)
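A quick hypothetical call (obj stands for whatever class defines to_bigram; under NLTK 3 both return values are generators):

termpos = [("the", "DT"), ("cat", "NN"), ("sat", "VBD")]
b_words, b_pos = obj.to_bigram(termpos)
print(list(b_words))   # [('the', 'cat'), ('cat', 'sat')]
print(list(b_pos))     # [('DT', 'NN'), ('NN', 'VBD')]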
Example 7: main
def main():
    text = open('holmes.txt').read()
    tokens = nltk.wordpunct_tokenize(text)
    charList = []
    for word in tokens:
        for char in word:
            charList.append(char)
    fDistChars = nltk.FreqDist(charList)
    fDistWords = nltk.FreqDist(tokens)
    print("Answer to 1A, there are {} character types in the book, namely: \n{}".format(len(fDistChars), sorted(fDistChars)))
    print("\nAnswer to 1B, there are {} word types in the book, namely: \n{}".format(len(fDistWords), sorted(fDistWords)))
    bigramChars = nltk.bigrams(charList)
    trigramChars = nltk.trigrams(charList)
    print("\nAnswer to 1C, the 20 most common characters are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(
        most_common(charList), most_common(bigramChars), most_common(trigramChars)))
    bigramWords = nltk.bigrams(tokens)
    trigramWords = nltk.trigrams(tokens)
    print("\nAnswer to 1D, the 20 most common words are: \nUnigrams: \n{}\nBigrams: \n{}\nTrigrams: \n{}".format(
        most_common(tokens), most_common(bigramWords), most_common(trigramWords)))
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    scoredPMI = finder.score_ngrams(bigram_measures.pmi)
    scoredCHI = finder.score_ngrams(bigram_measures.chi_sq)
    print("\nAnswer to 2, the 20 most likely collocations are:\nPMI:\n{}\nChi-square:\n{}".format(scoredPMI[:20], scoredCHI[:20]))
    print("\nSpearman's correlation = {}".format(nltk.metrics.spearman.spearman_correlation(scoredPMI, scoredCHI)))
Example 8: textsimilarity
def textsimilarity(text1, text2):
    score = []
    stops = nltk.corpus.stopwords.words('english')  # stopwords to weed out
    stops = stops + ['we', ',', '.', '(', ')', 'using', 'new', 'propose', 'investigate']
    stops = stops + ['-', 'show', 'infer', 'novel', 'method']
    # get tokens and bigrams from the text, either a string or a list of keywords
    if type(text1) is not list:
        alltokens = nltk.word_tokenize(text1.lower())
        allpairs = [list(pair) for pair in nltk.bigrams(alltokens)]
        tokens1 = [token for token in alltokens if token not in stops]
        pairs1 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    else:
        alltokens = []
        allpairs = []
        for el in text1:
            atokens = nltk.word_tokenize(el.lower())
            alltokens += atokens
            apairs = [list(pair) for pair in nltk.bigrams(atokens)]
            allpairs += apairs
        tokens1 = [token for token in alltokens if token not in stops]
        pairs1 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    if type(text2) is not list:
        tokens = nltk.word_tokenize(text2.lower())
        allpairs = [list(pair) for pair in nltk.bigrams(tokens)]
        tokens2 = [token for token in tokens if token not in stops]
        pairs2 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    else:
        alltokens = []
        allpairs = []
        for el in text2:
            atokens = nltk.word_tokenize(el.lower())
            alltokens += atokens
            apairs = [list(pair) for pair in nltk.bigrams(atokens)]
            allpairs += apairs
        tokens2 = [token for token in alltokens if token not in stops]
        pairs2 = [" ".join(bg) for bg in allpairs if bg[0] not in stops and bg[1] not in stops]
    ## score single-word cosine similarity (disabled)
    # fd1 = nltk.FreqDist(tokens1)
    # fd2 = nltk.FreqDist(tokens2)
    # keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    # scoretemp = 0
    # for key in keys:
    #     scoretemp += fd1[key] * fd2[key]
    # score.append(1 - scoretemp / (numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))))
    ## score bigram cosine similarity (disabled)
    # fd1 = nltk.FreqDist(pairs1)
    # fd2 = nltk.FreqDist(pairs2)
    # keys = list(set(list(fd1.keys()) + list(fd2.keys())))
    # scoretemp = 0
    # for key in keys:
    #     scoretemp += fd1[key] * fd2[key]
    # score.append(1 - scoretemp / (numpy.linalg.norm(numpy.asarray(list(fd1.values()))) * numpy.linalg.norm(numpy.asarray(list(fd2.values())))))
    # count shared unigrams and shared bigrams instead
    score.append(sum(1 for token in tokens1 if token in tokens2))
    score.append(sum(1 for pair in pairs1 if pair in pairs2))
    print('done')
    ## total score is the sum of the scores
    return sum(score)
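A quick hypothetical call, mixing the two accepted input forms (a raw string and a keyword list):

s = textsimilarity("novel graph neural networks for molecules",
                   ["graph neural networks", "molecular properties"])
print(s)   # number of shared unigrams plus shared bigrams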
Example 9: main
def main():
    # Corpus locations
    # training data
    posTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_train'
    negTrainCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_train'
    # test data
    posTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/pos_test'
    negTestCorpus = 'C:/Users/Abhinav/Desktop/Course work/NLP/txt_sentoken/neg_test'
    # Create plain text corpora for the training data
    posCorpus = PlaintextCorpusReader(posTrainCorpus, '.*')
    negCorpus = PlaintextCorpusReader(negTrainCorpus, '.*')
    # Create plain text corpora for the test data
    posTstCorpus = PlaintextCorpusReader(posTestCorpus, '.*')
    negTstCorpus = PlaintextCorpusReader(negTestCorpus, '.*')
    # Get bigrams
    posBigrams = nltk.bigrams(posCorpus.words())
    negBigrams = nltk.bigrams(negCorpus.words())
    # Get number of words per corpus
    posWordLen = len(posCorpus.words())
    negWordLen = len(negCorpus.words())
    # Create a Lang_Model_Classifier object and train its distributions
    obj1 = Lang_Model_Classifier()
    obj1.freq_dst(posCorpus, negCorpus)
    # For negative test data
    for filename in os.listdir(negTestCorpus):
        wordSet = negTstCorpus.words(filename)
        print '**Unigram**'
        unigr = obj1.perp(wordSet)
        print unigr
        print '**Bigram**'
        bigr = obj1.perpBi(nltk.bigrams(wordSet))
        print bigr
    # For positive test data
    for filename in os.listdir(posTestCorpus):
        wordSet2 = posTstCorpus.words(filename)
        print '**Unigram**'
        posunigr = obj1.perp(wordSet2)
        print posunigr
        print '**Bigram**'
        posbigr = obj1.perpBi(nltk.bigrams(wordSet2))
        print posbigr
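The perp and perpBi methods of Lang_Model_Classifier are not shown on this page; one plausible reading of perp, reusing the Laplace unigram model built in Example 1, is the following sketch (an assumption, not the project's actual code):

import math

def perp(self, words):
    # hypothetical: perplexity of a word sequence under the positive Laplace unigram model
    log_prob = sum(math.log(posLapFreq.prob(w), 2) for w in words)
    return 2 ** (-log_prob / len(words))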
Example 10: hybrid_cfdist
def hybrid_cfdist():
    sherlock_corpus = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8')
    sherlock_bigrams = nltk.bigrams(sherlock_corpus.words())
    pokemon_corpus = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8')
    pokemon_bigrams = nltk.bigrams(pokemon_corpus.words())
    # NB: the '+' concatenation assumes nltk.bigrams returns lists (NLTK 2.x); NLTK 3 returns generators
    return nltk.ConditionalFreqDist(sherlock_bigrams + pokemon_bigrams)
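Under NLTK 3, nltk.bigrams returns generators, so the '+' concatenation above fails; a version for that API (a sketch, not the original code) chains the two bigram streams instead:

import itertools
import nltk
from nltk.corpus import PlaintextCorpusReader

def hybrid_cfdist_nltk3():
    # same idea, but chain the two bigram generators rather than concatenating lists
    sherlock = PlaintextCorpusReader(CORPUS_ROOT_SHERLOCK, '.*', encoding='utf-8')
    pokemon = PlaintextCorpusReader(CORPUS_ROOT_POKEMON, '.*', encoding='utf-8')
    return nltk.ConditionalFreqDist(
        itertools.chain(nltk.bigrams(sherlock.words()), nltk.bigrams(pokemon.words())))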
Example 11: how_is_often_used_in_text
def how_is_often_used_in_text():
    from nltk.corpus import brown
    brown_learned_text = brown.words(categories="learned")
    print sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == "often"))
    # or use the tagged words to get the actual POS tags
    brown_learned_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    fd = nltk.FreqDist([b[1] for (a, b) in nltk.bigrams(brown_learned_tagged) if a[0] == "often"])
    fd.tabulate()
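The simplify_tags keyword was removed in NLTK 3; the equivalent call there uses the universal tagset, roughly as follows:

import nltk
from nltk.corpus import brown

tagged = brown.tagged_words(categories="learned", tagset="universal")
fd = nltk.FreqDist(b[1] for (a, b) in nltk.bigrams(tagged) if a[0] == "often")
fd.tabulate()   # distribution of POS tags that follow "often"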
Example 12: wordlistfun
def wordlistfun(filename):
    minlength = 2
    lmtzr = nltk.stem.wordnet.WordNetLemmatizer()
    wordlist = []
    wordfreq = []
    hashlist = []
    hashfreq = []
    with open(filename, "r") as f:
        count_all = Counter()
        count_hash = Counter()
        count_only = Counter()
        count_bi = Counter()
        count_only2 = Counter()
        count_bigramonly = Counter()
        count_bigramstop = Counter()
        for line in f:
            try:
                tweet = json.loads(line)
                # Create lists with the terms of interest, then update the counters
                terms_stop = [
                    term for term in preprocess(tweet["text"]) if term.lower() not in stop
                ]
                terms_hash = [term for term in preprocess(tweet["text"]) if term.lower().startswith("#")]
                terms_only = [
                    term
                    for term in preprocess(tweet["text"])
                    if term.lower() not in stop and not term.lower().startswith(("#", "@"))
                ]
                # mind the ((double brackets)): startswith() takes a tuple (not a list)
                # when several prefixes are passed
                terms_only2 = [
                    term.encode("unicode-escape")
                    for term in preprocess(tweet["text"])
                    if term.lower() not in stop
                    and not term.lower().startswith(("#", "@"))
                    and not term.lower().startswith(("htt", "\\u"))
                    and term.lower() not in [r"(?:(?:\d+,?)+(?:\.?\d+)?)"]
                    and len(term) > minlength
                ]
                terms_bigramstop = bigrams(terms_stop)
                terms_bigramonly = bigrams(terms_only2)
                count_all.update(terms_stop)
                count_hash.update(terms_hash)
                count_only.update(terms_only)
                count_only2.update(terms_only2)
                count_bigramonly.update(terms_bigramonly)
                count_bigramstop.update(terms_bigramstop)
            except:
                pass
    wordlist, wordfreq = zip(*count_only2.most_common())
    hashlist, hashfreq = zip(*count_hash.most_common())
    return wordlist, wordfreq, hashlist, hashfreq
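The example relies on a preprocess tokenizer and a stop set defined elsewhere in its module; plausible stand-ins (assumptions, not the original definitions) would be:

import json
import string
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import bigrams
import nltk

_tokenizer = TweetTokenizer()
stop = set(stopwords.words('english')) | set(string.punctuation)

def preprocess(text):
    # hypothetical stand-in for the tweet tokenizer the example uses
    return _tokenizer.tokenize(text)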
Example 13: do_ir2
def do_ir2(db, param):
    print 'Computing IR2', db, param, '...'

    def words(text):
        stopwords = set(nltk.corpus.stopwords.words('english'))
        return [w for w in nltk.word_tokenize(text.lower()) if w not in string.punctuation and w not in stopwords]

    class BigramsCorpus:
        def __init__(self, db, collection):
            self.client = MongoClient()[db][collection]

        def __iter__(self):
            for doc in self.client.find():
                yield [doc['_id']]

        def __len__(self):
            return self.client.count()

    bigram_corpus = BigramsCorpus('cordis', 'bi_grams')
    bigrams = Dictionary(bigram_corpus)
    project = {'$project': {'_id': 0, 'title': 1, 'reference': 1}}
    a = [project]
    project_corpus = MongoCorpus('cordis', 'projects', aggregate=a)
    n = max(bigrams.keys())
    dataset = []
    for doc in project_corpus:
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0] * (n + 1)
        for bi, _ in temp:
            x[bi] = 1
        dataset.append(x)
    alg = KMeans(n_clusters=int(param))
    alg.fit(dataset)
    clusters = defaultdict(list)
    for i, doc in enumerate(project_corpus):
        temp = bigrams.doc2bow([' '.join(x) for x in nltk.bigrams(words(doc['title']))])
        x = [0] * (n + 1)
        for bi, _ in temp:
            x[bi] = 1
        p = alg.predict([x])
        clusters[p[0]].append(doc['reference'])
    mongo_clusters = []
    for k, v in clusters.items():
        mongo_clusters.append({'cluster': k, 'projects': v})
    # Mongo raises this error here: InvalidDocument: Cannot encode object: 0
    print mongo_clusters
    # Save to a Mongo collection
    mongo = MongoClient()['g8']['ir2']
    mongo.insert_many(mongo_clusters)
    print 'Done!'
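The InvalidDocument error mentioned in the comment typically comes from numpy integer cluster labels returned by KMeans.predict, which BSON cannot encode; converting them to plain int before insertion is one likely fix (a sketch, assuming that diagnosis):

mongo_clusters = [{'cluster': int(k), 'projects': v} for k, v in clusters.items()]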
Example 14: extract_bigrams
def extract_bigrams(articleList, commentCount):
    featureMatrix = np.zeros([commentCount, 100])
    index = 0
    stemmer = SnowballStemmer("english", ignore_stopwords=True)
    bagOfWords = []
    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove stopwords
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bagOfWords += stemmed_words
            bagOfWords.append("\n")
    tempVector = dict()
    # Create the bigrams and keep the 100 most frequent as features
    bgs = nltk.bigrams(bagOfWords)
    fdist = nltk.FreqDist(bgs)
    for k in fdist.keys()[:100]:
        tempVector[k] = 0
    theKeys = tempVector.keys()
    for art in articleList.items():
        for comm in art[1]:
            mywords = words(comm.body)
            mywords = known_words(mywords)
            # Remove stopwords
            filtered_words = [w for w in mywords if not w in stopwords.words('english')]
            # Stemming
            stemmed_words = [stemmer.stem(w) for w in filtered_words]
            bgs = nltk.bigrams(stemmed_words)
            for word in (w for w in bgs if tempVector.has_key(w)):
                keyInd = theKeys.index(word)
                featureMatrix[index][keyInd] += 1
            index += 1
            if index % 100 == 0:
                print "extracted", index, "features"
            if index >= commentCount:
                break
    print "non-zero", np.count_nonzero(featureMatrix)
    print "Percentage filled:%.2f" % (float(np.count_nonzero(featureMatrix)) / (featureMatrix.shape[0] * featureMatrix.shape[1]))
    return featureMatrix
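Note that fdist.keys()[:100] relies on NLTK 2.x, where FreqDist.keys() came back sorted by frequency; under NLTK 3 (where FreqDist is a Counter) the same selection would be written as:

top_bigrams = [bg for bg, _ in fdist.most_common(100)]   # NLTK 3: keys() is no longer frequency-sorted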
Example 15: featureSets
def featureSets(data):  # data accepted as (rating, list of words) pairs
    fs = []
    for (r, words) in data:
        nicewords = [word.lower() for word in words if not isStopWord(word) and not isPunctuation(word)]
        for bigram in nltk.bigrams(nicewords):
            fs.append((BigramClassifier.features(bigram), r))
    return fs
    # unreachable leftover from an earlier one-line version:
    # return [(BigramClassifier.features(bigram), r) for bigram in nltk.bigrams(words)]
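A hypothetical call, assuming the isStopWord, isPunctuation and BigramClassifier.features helpers referenced above exist in the surrounding module:

data = [(5, ["An", "absolutely", "wonderful", "film"]),
        (1, ["Dull", "and", "predictable"])]
train_set = featureSets(data)   # list of (feature_dict, rating) pairs, ready for an NLTK classifier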