This article collects typical usage examples of the Python method nltk.bigrams. If you have been wondering how exactly nltk.bigrams is used, or are looking for working examples to learn from, the curated code samples below may help. You can also explore further usage examples from the nltk module in which this method is defined.
The following 15 code examples of nltk.bigrams are listed, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
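Before the examples, here is a minimal, self-contained sketch of what nltk.bigrams itself does: it takes any sequence of tokens and lazily yields contiguous pairs, so you normally wrap it in list() to inspect the result.

import nltk

tokens = "ice is melting due to global warming".split()
# nltk.bigrams returns a generator of contiguous (w1, w2) pairs
print(list(nltk.bigrams(tokens)))
# [('ice', 'is'), ('is', 'melting'), ('melting', 'due'), ('due', 'to'),
#  ('to', 'global'), ('global', 'warming')]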
Example 1: bigram_counts
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
# Also requires: from collections import Counter
def bigram_counts(word_list):
    # Count every contiguous word pair in the token list.
    bgs = nltk.bigrams(word_list)
    fdist = nltk.FreqDist(bgs)
    d = Counter()
    for k, v in fdist.items():
        d[k] = v
    return d
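A short usage sketch (assuming the imports noted above): the function returns a Counter keyed by bigram tuples, so frequent pairs can be read off with most_common().

from collections import Counter
import nltk

words = "to be or not to be".split()
counts = bigram_counts(words)
print(counts[("to", "be")])      # 2
print(counts.most_common(1))     # [(('to', 'be'), 2)]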
Example 2: extract_bigram_feats
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def extract_bigram_feats(document, bigrams):
    """
    Populate a dictionary of bigram features, reflecting the presence/absence in
    the document of each of the tokens in `bigrams`. This extractor function only
    considers contiguous bigrams obtained by `nltk.bigrams`.

    :param document: a list of words/tokens.
    :param bigrams: a list of bigrams whose presence/absence has to be
        checked in `document`.
    :return: a dictionary of bigram features {bigram : boolean}.

    >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')]
    >>> document = 'ice is melting due to global warming'.split()
    >>> sorted(extract_bigram_feats(document, bigrams).items())
    [('contains(global - warming)', True), ('contains(love - you)', False),
    ('contains(police - prevented)', False)]
    """
    features = {}
    for bigr in bigrams:
        features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document)
    return features
#////////////////////////////////////////////////////////////
#{ Helper Functions
#////////////////////////////////////////////////////////////
Example 3: __init__
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def __init__(self, data_paths, batch_size, unroll, level):
    self.batch_size = batch_size
    self.unroll = unroll
    # load_text_data, bigram_counts, trigram_counts, _reshape_data,
    # build_continuations and estimate_modkn_discounts are helpers defined
    # elsewhere in the same project.
    train_data, valid_data, test_data, token_to_id, frequencies, hist_freqs, train_tokens = load_text_data(
        data_paths, level)
    self.bg_counts = bigram_counts(train_tokens)
    self.tg_counts = trigram_counts(train_tokens)
    self.token_to_id = token_to_id
    # NOTE extends the vocabulary
    self.token_to_id['<_>'] = len(self.token_to_id)
    self.id_to_token = dict((v, k) for k, v in self.token_to_id.items())
    train_data = _reshape_data(train_data, batch_size, unroll)
    valid_data = _reshape_data(valid_data, batch_size, unroll)
    test_data = _reshape_data(test_data, batch_size, unroll)
    self.split_data = {"train": train_data, "valid": valid_data,
                       "test": test_data}
    self.frequencies = frequencies
    self.frequencies_cumsum = np.cumsum(frequencies)
    self.hist_freqs = hist_freqs
    self.hist_freqs_cumsum = np.cumsum(hist_freqs)
    self.continuations = build_continuations(self.bg_counts)
    bgs = nltk.bigrams(train_tokens)
    if level == "word":
        self.D1, self.D2, self.D3p, self.N1_lookup, self.N2_lookup, self.N3p_lookup = estimate_modkn_discounts(
            bgs)
Example 4: get_strings_from_utterance
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)

    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)

    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])

    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores
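The function above depends on AllenNLP's Token class and the ATIS trigger tables, so it is not runnable on its own. Below is a minimal, hypothetical sketch of the same bigram-linking idea, with an invented toy trigger dictionary and plain strings standing in for Token objects.

from collections import defaultdict
import nltk

# hypothetical stand-in for atis_tables.ATIS_TRIGGER_DICT
TOY_TRIGGER_DICT = {"new york": ["NYC"]}

def toy_linking_scores(tokens):
    scores = defaultdict(list)
    # link every database string triggered by a contiguous bigram to the
    # indices of the two tokens that form that bigram
    for index, bigram in enumerate(nltk.bigrams(tokens)):
        for string in TOY_TRIGGER_DICT.get(" ".join(bigram).lower(), []):
            scores[string].extend([index, index + 1])
    return dict(scores)

print(toy_linking_scores("show flights from New York".split()))
# {'NYC': [3, 4]}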
Example 5: get_data_from_file
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def get_data_from_file(file_name, isTrain=True):
    # USE_BIGRAMS is a module-level flag defined elsewhere in the project.
    data = []
    with open(file_name, 'r') as csv_file:
        lines = csv_file.readlines()
        total = len(lines)
        for i, line in enumerate(lines):
            if isTrain:
                tag = line.split(',')[1]
                bag_of_words = line.split(',')[2].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[2].split()))
                    bag_of_words = bag_of_words + bag_of_words_bigram
            else:
                tag = '5'
                bag_of_words = line.split(',')[1].split()
                if USE_BIGRAMS:
                    bag_of_words_bigram = list(nltk.bigrams(line.split(',')[1].split()))
                    bag_of_words = bag_of_words + bag_of_words_bigram
            data.append((bag_of_words, tag))
    return data
Example 6: create_qb_tokenizer
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            # regex_pattern is a module-level pattern defined elsewhere in the project
            text = re.sub(
                r'\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])
            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams
    return tokenizer
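A brief usage sketch: because regex_pattern is only defined in the original project, the call below passes strip_qb_patterns=False so the closure never touches it; NLTK's punkt tokenizer data must be available.

tokenize = create_qb_tokenizer(unigrams=True, bigrams=True, strip_qb_patterns=False)
print(tokenize("The capital of France is Paris"))
# ['The', 'capital', 'of', 'France', 'is', 'Paris',
#  'The++capital', 'capital++of', 'of++France', 'France++is', 'is++Paris']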
Example 7: train_bigram
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def train_bigram(lst):
    model = defaultdict(lambda: defaultdict(lambda: 0))
    for sent in lst:
        sent = sent.split()
        for w1, w2 in bigrams(sent, pad_right=True, pad_left=True):
            model[w1][w2] += 1
    total_count = 0
    for w1 in model:
        total_count = float(sum(model[w1].values()))
        for w2 in model[w1]:
            model[w1][w2] /= total_count
    return model

# Total sum of bigram probability of a sentence [returns float]:
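The trailing comment refers to a follow-up function that is not shown here. As a hedged illustration of how the trained model might be queried, the sketch below builds a model from two toy sentences and multiplies the conditional probabilities of one sentence's padded bigrams (None is the padding symbol inserted by pad_left/pad_right).

from collections import defaultdict
from nltk import bigrams

corpus = ["the cat sat", "the cat ran"]
model = train_bigram(corpus)
print(model["the"]["cat"])   # 1.0, since "the" is always followed by "cat"
print(model["cat"]["sat"])   # 0.5

sentence = "the cat sat"
prob = 1.0
for w1, w2 in bigrams(sentence.split(), pad_left=True, pad_right=True):
    prob *= model[w1][w2]
print(prob)                  # 0.5 for this toy corpus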
Example 8: estimate_modkn_discounts
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def estimate_modkn_discounts(ngrams):
    # Get counts
    counts = Counter(ngrams)
    N1 = float(len([k for k in counts if counts[k] == 1]))
    N2 = float(len([k for k in counts if counts[k] == 2]))
    N3 = float(len([k for k in counts if counts[k] == 3]))
    N4 = float(len([k for k in counts if counts[k] == 4]))
    N3p = float(len([k for k in counts if counts[k] >= 3]))

    # Estimate discounting parameters
    Y = N1 / (N1 + 2 * N2)
    D1 = 1 - 2 * Y * (N2 / N1)
    D2 = 2 - 3 * Y * (N3 / N2)
    D3p = 3 - 4 * Y * (N4 / N3)

    # FIXME(zxie) Assumes bigrams for now
    # Also compute N1/N2/N3p lookups (context -> n-grams with count 1/2/3+)
    N1_lookup = Counter()
    N2_lookup = Counter()
    N3p_lookup = Counter()
    for bg in counts:
        if counts[bg] == 1:
            N1_lookup[bg[0]] += 1
        elif counts[bg] == 2:
            N2_lookup[bg[0]] += 1
        else:
            N3p_lookup[bg[0]] += 1
    return D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup
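A usage sketch under an important assumption: the count-of-count statistics N1 through N4 must all be nonzero or the discount formulas divide by zero, so the toy token stream below is deliberately constructed to contain bigrams occurring exactly 1, 2, 3 and 4 times; on real data you would simply pass the bigrams of the training corpus.

import nltk
from collections import Counter

tokens = ["x", "a", "b"] * 4 + ["x", "c", "d"] * 3 + ["x", "e", "f"] * 2 + ["x", "g", "h"]
bgs = list(nltk.bigrams(tokens))
D1, D2, D3p, N1_lookup, N2_lookup, N3p_lookup = estimate_modkn_discounts(bgs)
print(D1, D2, D3p)     # 0.25 1.25 2.0
print(N1_lookup["x"])  # bigrams starting with "x" that were seen exactly once -> 1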
Example 9: get_valid_bigram_words
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def get_valid_bigram_words(self, words):
    _words = []
    for i in nltk.bigrams(words):
        if (len(i[0]) >= self.min_len) and (len(i[1]) >= self.min_len):
            if (not self.exclude_stopwords) or ((i[0] not in config.STOP_WORDS) and (i[1] not in config.STOP_WORDS)):
                if (not self.skip_digit) or ((len(re.findall(re.compile(r"\d+"), i[0])) == 0) and (len(re.findall(re.compile(r"\d+"), i[1])) == 0)):
                    _words.append(" ".join(i))
    return _words
Example 10: words2bigrams
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams]
Example 11: count_bigrams
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def count_bigrams(corpus):
    text = corpus.map(itemgetter(1))
    sents = text.flatMap(nltk.sent_tokenize)
    sents = sents.map(lambda s: list(nltk.word_tokenize(s)))
    bigrams = sents.flatMap(lambda s: list(nltk.bigrams(s)))

    unique_bigrams = bigrams.distinct().count()
    print("unique bigrams: {}".format(unique_bigrams))

    bigram_counts = bigrams.map(lambda g: (g, 1)).reduceByKey(add).toDF()
    print(bigram_counts.head())

## Main functionality
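A hedged usage sketch: the function expects a Spark RDD of (key, text) pairs (itemgetter(1) extracts the text), plus `from operator import add` and NLTK's punkt data on the workers; the local SparkSession below is an assumption for illustration only.

from operator import add, itemgetter
import nltk
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("bigram-counts").getOrCreate()
corpus = spark.sparkContext.parallelize([
    ("doc1", "Ice is melting. Global warming is real."),
    ("doc2", "The police prevented the protest."),
])
count_bigrams(corpus)
spark.stop()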
Example 12: tokenize
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def tokenize(text):
    # text = NB.remove_punctuation(text)
    try:
        # bytes input: decode, then force ASCII with replacement characters
        text = text.decode('utf-8').encode('ascii', 'replace').decode('ascii').strip().lower()
    except AttributeError:
        # str input (no .decode method)
        text = text.encode('ascii', 'replace').decode('ascii').strip().lower()
    # split punctuation into separate tokens, but don't split single quotes
    # in contractions such as "don't"; porter is a module-level PorterStemmer
    word = [porter.stem(w) for w in re.findall(r"[\w'-]+|[^\s\w]", text)]
    biword = [b for b in nltk.bigrams(word)]
    triword = [t for t in nltk.trigrams(word)]
    # word = [w for w in word if w not in stopwords.words('english')]
    return word  # triword
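A small usage sketch; `porter` is assumed to be a module-level Porter stemmer, as the snippet implies, and only the unigram list is returned.

import re
import nltk

porter = nltk.PorterStemmer()
print(tokenize("Don't panic: global warming is real!"))
# ["don't", 'panic', ':', 'global', 'warm', 'is', 'real', '!']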
Example 13: update_terms_stats
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def update_terms_stats(terms_fd, json_tweet, lex):
    # terms_fd is an NLTK 2.x FreqDist: the .inc() method was removed in NLTK 3,
    # where fd[term] += 1 is used instead.
    tweet = utils.extract_tweet_from_json(json_tweet)
    tweet_terms = []
    if tweet is None:
        return False
    tokenizer = nltk.RegexpTokenizer(r'\#?[\w\d]+')
    doc = tokenizer.tokenize(tweet)
    for w_raw in doc:
        w = w_raw.strip('\"\'.,;?!:)(@/*&')
        if not (w.strip('#')).isalpha():
            w_aux = ''
            # ignore non-ascii characters
            for s in w:
                if ord(s) < 128:
                    w_aux += s
                else:
                    break
            w = w_aux
        w = w.lower()
        if (w not in stopwords.words('english') and w not in set(['rt', 'http', 'amp'])) and len(w) in range(3, 16):
            if w in lex:
                continue
            tweet_terms.append(w)
            terms_fd.inc(w)
    bigrams = nltk.bigrams(tweet_terms)
    for b in bigrams:
        if b[1] + " " + b[0] in lex or b[0] + " " + b[1] in lex:
            continue
        if b[1] + " " + b[0] in terms_fd:
            terms_fd.inc(b[1] + " " + b[0])
        else:
            terms_fd.inc(b[0] + " " + b[1])
    return True
Example 14: get_stemmed_terms_list
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def get_stemmed_terms_list(doc, stem_words_map=None, stem_bigrams_map=None):
    ps = PorterStemmer()
    local_map = dict()
    word_list = []

    clean_doc = [(w.strip()).lower() for w in doc.split() if len(w) in range(3, 16)]
    filtered_words = [w.strip('.,;?!:)(#') for w in clean_doc if not w.strip('.,;?!:)(#') in stopwords.words('english')]

    for w in filtered_words:
        if w.isalpha():
            w_temp = ps.stem(w)  # ps.stem_word(w) in older NLTK versions
            if stem_words_map is not None:
                if w_temp not in stem_words_map:
                    stem_words_map[w_temp] = dict()
                stem_words_map[w_temp][w] = stem_words_map[w_temp].get(w, 0) + 1
            local_map[w_temp] = w
            word_list.append(w_temp)

    bigrams = list(nltk.bigrams(word_list))
    for b in bigrams:
        bigram_org = (local_map[b[0]], local_map[b[1]])
        if stem_bigrams_map is not None:
            if b not in stem_bigrams_map:
                stem_bigrams_map[b] = dict()
            stem_bigrams_map[b][bigram_org] = stem_bigrams_map[b].get(bigram_org, 0) + 1
    return word_list, bigrams

# keeps track of the exact form of the stemmed bigrams, not only that of the single words
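A usage sketch (the NLTK stopwords corpus must be downloaded): the optional maps record which surface forms produced each stem, which is what the comment above alludes to.

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

stem_map, bigram_map = {}, {}
words, word_bigrams = get_stemmed_terms_list(
    "melting glaciers accelerate warming", stem_map, bigram_map)
print(words)      # e.g. ['melt', 'glacier', 'acceler', 'warm']
print(stem_map)   # e.g. {'melt': {'melting': 1}, 'glacier': {'glaciers': 1}, ...}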
Example 15: get_tweet_terms
# Required module: import nltk [as alias]
# Or: from nltk import bigrams [as alias]
def get_tweet_terms(tweet, stem_map=None, bigrams_map=None):
    words, bigrams = get_stemmed_terms_list(tweet, stem_map, bigrams_map)
    filtered_words = [w for w in words if not w in stopwords.words('english')]
    bigrams = nltk.bigrams(filtered_words)
    words_set = set(filtered_words)
    terms_dict = {}
    for w in words_set:
        terms_dict['%s' % w] = 'y'
    for b in bigrams:
        terms_dict['%s %s' % (b[0], b[1])] = 'y'
    return terms_dict
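A closing usage sketch, relying on get_stemmed_terms_list from Example 14 and the NLTK stopwords corpus: the result maps each stemmed term and each stemmed bigram to 'y'.

terms = get_tweet_terms("melting glaciers accelerate global warming")
print(sorted(terms))
# e.g. ['acceler', 'acceler global', 'glacier', 'glacier acceler',
#       'global', 'global warm', 'melt', 'melt glacier', 'warm']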