This article collects and summarizes typical usage examples of the Python method nltk.ngrams. If you have been wondering what exactly nltk.ngrams does, how to use it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples of the nltk module that the method belongs to.
The following shows 15 code examples of nltk.ngrams, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
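Before the examples, here is a minimal sketch of what nltk.ngrams itself returns, for orientation:

import nltk

tokens = ['the', 'quick', 'brown', 'fox']
# nltk.ngrams yields n-length tuples over the token sequence
print(list(nltk.ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]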
Example 1: parse
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
import argparse

def parse():
    parser = argparse.ArgumentParser()
    parser.add_argument('dataset', help='pol or main', type=str)
    parser.add_argument('-n', '--n', default=1, help='Number of grams', type=int)
    parser.add_argument('--min_count', default=1, help='Min count', type=int)
    parser.add_argument('--embedding', default=CCGLOVE,
                        help='embedding file', type=str)
    parser.add_argument('--weights', default=None,
                        help='weights to use for ngrams (e.g. sif, None)', type=str)
    parser.add_argument('-norm', '--normalize', action='store_true',
                        help='Normalize vectors')
    parser.add_argument('-l', '--lower', action='store_true',
                        help='Whether or not to lowercase text')
    parser.add_argument('-e', '--embed', action='store_true',
                        help='Use embeddings instead of bong')
    return parser.parse_args()
Example 2: ngram_context
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
from itertools import chain

def ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=5, unkgram=None):
    '''sliding window around n-grams in a document
    Args:
        strdoc: list of tokens (as strings)
        intdoc: list of indices (as ints); len(intdoc) == len(strdoc)
        vocabulary: n-gram vocabulary (set of n-grams, or dict with n-grams as keys)
        n: the n in n-gram
        wndo2: half the window size
        unkgram: map n-grams not in the vocabulary to this n-gram; if None, such n-grams are not yielded
    Returns:
        generator over (n-gram, context window) pairs, where each context window is an iterator of ints
    '''
    wndo2pn = wndo2 + n
    unk = unkgram is not None
    for i, ngram in enumerate(nltk.ngrams(strdoc, n)):
        if ngram in vocabulary:
            yield ngram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
        elif unk:
            yield unkgram, chain(intdoc[max(i-wndo2, 0):i], intdoc[i+n:i+wndo2pn])
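A possible way to call this generator, using made-up toy values for strdoc, intdoc and vocabulary (the outputs shown follow from the code above):

strdoc = ['the', 'cat', 'sat', 'on', 'the', 'mat']
intdoc = [0, 1, 2, 3, 0, 4]          # toy integer ids aligned with strdoc
vocabulary = {('cat',), ('mat',)}    # unigram vocabulary of interest
for ngram, window in ngram_context(strdoc, intdoc, vocabulary, n=1, wndo2=2):
    print(ngram, list(window))
# ('cat',) [0, 2, 3]
# ('mat',) [3, 0]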
Example 3: alabong
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def alabong(A, word_embeddings, lists, coocs, counts):
    n = len(lists)
    def represent(documents):
        output = []
        docs = tokenize(doc.lower() for doc in documents)
        for k, kgramlist, kgramcooc, kgramcount in zip(range(1, n+1), lists, coocs, counts):
            kgrams = [list(nltk.ngrams(doc, k)) for doc in docs]
            vocab = {kgram for doc in kgrams for kgram in doc}
            where = np.array([i for i, kgram in enumerate(kgramlist) if kgram in vocab and kgramcount[i]])
            bong = docs2bofs(kgrams, vocabulary=kgramlist, format='csc')
            output.append(np.zeros((len(documents), word_embeddings.shape[1]), dtype=FLOAT))
            for offset in range(0, where.shape[0], MAXSLICE):
                indices = where[offset:offset+MAXSLICE]
                if k > 1:
                    vecs = normalize(A.predict(kgramcooc[indices].dot(word_embeddings) / kgramcount[indices, None])) / k
                else:
                    vecs = normalize(word_embeddings[indices])
                output[-1] += bong[:, indices].dot(vecs)
        return np.hstack(output)
    return represent, None, True
Example 4: get_word_skipgram_distribution
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def get_word_skipgram_distribution(input_buffer, n=2, k=2, encoding="utf-8",
                                   tokenize_method=nltk.word_tokenize):
    """
    Get the distribution of skipgrams with the given n and k values from input_buffer.
    :param input_buffer:
    :param n:
    :param k:
    :param encoding:
    :param tokenize_method:
    :return:
    """
    # Ensure we have a decoded string
    if isinstance(input_buffer, bytes):
        input_buffer = input_buffer.decode(encoding)
    ngrams = nltk.ngrams(tokenize_method(input_buffer), n=n)
    return nltk.util.skipgrams(ngrams, n, k)
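For reference, nltk.util.skipgrams can also be applied directly to a plain token list; a small self-contained sketch:

import nltk

tokens = ['insurgents', 'killed', 'in', 'ongoing', 'fighting']
# 2-skip bigrams: token pairs that may skip up to k=2 intervening tokens
print(list(nltk.util.skipgrams(tokens, 2, 2)))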
Example 5: extract_ngrams
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def extract_ngrams(text, stemmer, N):
    '''
    Parameter Arguments:
    text: 'New York is a city. It has a huge population.'
    N: length of the n-grams, e.g. 1, 2
    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    ngram_items = list(ngrams(sent2stokens(text, stemmer), N))
    for ngram in ngram_items:
        ngrams_list.append(' '.join(ngram))
    return ngrams_list
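A hypothetical call, assuming the project's sent2stokens helper (not shown here) tokenizes and stems the text with the supplied stemmer:

from nltk.stem import PorterStemmer

# illustrative only; relies on the repository's sent2stokens helper
print(extract_ngrams('New York is a city. It has a huge population.', PorterStemmer(), 2))
# roughly: ['new york', 'york is', 'is a', ...] (tokens are stemmed)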
Example 6: parse
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def parse(self, tagged_text, ngram_len=-1):
    ngrams = []
    if len(tagged_text) == 0:
        return ngrams
    if tagged_text[0]['pos'] in self._exclude_if_first:
        tagged_text = tagged_text[1:]
    if ngram_len == -1:
        for l in range(len(tagged_text), 0, -1):
            ngrams += list(nltk.ngrams(tagged_text, l))
    else:
        ngrams += list(nltk.ngrams(tagged_text, ngram_len))
    ngrams += [n[:-1] for n in ngrams if len(n) > 1 and n[-1]['pos'] in {"NN", "NNS"}]
    ngrams += [n[1:] for n in ngrams if len(n) > 1 and n[0]['pos'] in {"NN", "NNS"}]
    ngrams = [n for n in ngrams
              if len({el[i] for el in n for i in {'pos', 'ner'}} & self._exclude_pos) == 0
              and (len(n) == 1 or (n[0]['pos'] not in self._exclude_prefix
                                   and n[0]['word'].lower() not in utils.stop_words_en
                                   and n[-1]['pos'] not in self._exclude_suffix
                                   and n[-1]['word'].lower() not in utils.stop_words_en))
              and not (len(n) == 1 and (n[0]['pos'] in self._exclude_alone
                                        or n[0]['word'].lower() in utils.stop_words_en))]
    return ngrams
Example 7: get_strings_from_utterance
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def get_strings_from_utterance(tokenized_utterance: List[Token]) -> Dict[str, List[int]]:
    """
    Based on the current utterance, return a dictionary where the keys are the strings in
    the database that map to lists of the token indices that they are linked to.
    """
    string_linking_scores: Dict[str, List[int]] = defaultdict(list)
    for index, token in enumerate(tokenized_utterance):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(token.text.lower(), []):
            string_linking_scores[string].append(index)
    token_bigrams = bigrams([token.text for token in tokenized_utterance])
    for index, token_bigram in enumerate(token_bigrams):
        for string in atis_tables.ATIS_TRIGGER_DICT.get(" ".join(token_bigram).lower(), []):
            string_linking_scores[string].extend([index, index + 1])
    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for index, trigram in enumerate(trigrams):
        if trigram[0] == "st":
            natural_language_key = f"st. {trigram[2]}".lower()
        else:
            natural_language_key = " ".join(trigram).lower()
        for string in atis_tables.ATIS_TRIGGER_DICT.get(natural_language_key, []):
            string_linking_scores[string].extend([index, index + 1, index + 2])
    return string_linking_scores
Example 8: get_time_range_start_from_utterance
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def get_time_range_start_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    late_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "late"
    }
    time_range_start_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_START_DICT.get(token.text, []):
            if token_index - 1 not in late_indices:
                time_range_start_linking_dict[str(time)].append(token_index)
    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_START_DICT.get(" ".join(bigram), []):
            time_range_start_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])
    return time_range_start_linking_dict
Example 9: get_time_range_end_from_utterance
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def get_time_range_end_from_utterance(
    utterance: str, tokenized_utterance: List[Token]
) -> Dict[str, List[int]]:
    early_indices = {
        index for index, token in enumerate(tokenized_utterance) if token.text == "early"
    }
    time_range_end_linking_dict: Dict[str, List[int]] = defaultdict(list)
    for token_index, token in enumerate(tokenized_utterance):
        for time in TIME_RANGE_END_DICT.get(token.text, []):
            if token_index - 1 not in early_indices:
                time_range_end_linking_dict[str(time)].append(token_index)
    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for bigram_index, bigram in enumerate(bigrams):
        for time in TIME_RANGE_END_DICT.get(" ".join(bigram), []):
            time_range_end_linking_dict[str(time)].extend([bigram_index, bigram_index + 1])
    return time_range_end_linking_dict
Example 10: __init__
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def __init__(self, ngrams: Union[int, List[int]] = 1,
             exclude_stopwords: bool = False,
             stop_words: Optional[List] = None) -> None:
    """Initialize the NGramsTokenizer.
    Parameters
    ----------
    ngrams : Union[int, List[int]], optional
        Size (or list of sizes) of the n-grams to extract, by default 1
    exclude_stopwords: bool
        Whether to remove stop words before building n-grams, by default False
    stop_words: Optional[List]
        Custom stop word list; if None, NLTK's English stop words are used, by default None
    """
    self.ngrams = ngrams
    self.exclude_stopwords = exclude_stopwords
    if self.exclude_stopwords:
        self.stop_words = stop_words
        if self.stop_words is None:
            nltk.download('stopwords', quiet=True)
            self.stop_words = stopwords.words('english')
    nltk.download('punkt', quiet=True)
Example 11: tokenize
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def tokenize(self, example: str) -> List[str]:
    """Tokenize an input example.
    Parameters
    ----------
    example : str
        The input example, as a string.
    Returns
    -------
    List[str]
        The output word tokens, as a list of strings.
    """
    if self.exclude_stopwords and self.stop_words:
        example = ' '.join([word for word in word_tokenize(example)
                            if word not in self.stop_words])
    if isinstance(self.ngrams, List):
        ret: List[str] = []
        for i in self.ngrams:
            ret.extend(self._tokenize(example, i))
        return ret
    else:
        return NGramsTokenizer._tokenize(example, self.ngrams)
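A usage sketch for the two methods above, assuming the complete NGramsTokenizer class (including its _tokenize helper) from the source project is importable:

# illustrative only; NGramsTokenizer is the class these methods belong to
tokenizer = NGramsTokenizer(ngrams=[1, 2])
print(tokenizer.tokenize('the quick brown fox'))
# expected to mix unigrams and space-joined bigrams, e.g. 'the', ..., 'the quick', ...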
Example 12: extract_ngrams2
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def extract_ngrams2(sentences, stemmer, language, N=2):
    '''
    Parameter Arguments:
    sentences: list of sentences
        ['New York is a city.', 'It has a huge population.']
    N: length of the n-grams, e.g. 1, 2
    return: a list of n-grams
    [('new', 'york'), ('york', 'is'), ('is', 'a'), ('a', 'city'), ('city', '.'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    ngrams_list = []
    for sent in sentences:
        # handle cases such as 'magister-' where a trailing hyphen has to be dropped
        sent = re.sub(r'[-](,?\s)', r'\1', sent)
        ngram_items = list(ngrams(sent2stokens(sent, stemmer, language), N))
        for ngram in ngram_items:
            ngrams_list.append(' '.join(ngram))
    return ngrams_list
Example 13: extract_nuggets
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def extract_nuggets(sentences, nugget_type, language):
    '''
    Parameter Arguments:
    sentences: list of sentences
        ['New York is a city.', 'It has a huge population.']
    return: a list of noun phrases, events, named_entities
    [('new', 'york'), ('york', 'is'), ('a', 'city'),
     ('it', 'has'), ('has', 'a'), ('a', 'huge'), ('huge', 'population'), ('population', '.')]
    '''
    nugget_list = []
    for sent in sentences:
        if nugget_type == 'n-grams':
            nugget_items = list(ngrams(sent2stokens(sent, language), 2))
        if nugget_type == 'NP':
            nugget_items = get_phrases(sent, 'NP')
        if nugget_type == 'Phrases':
            nugget_items = get_phrases(sent, 'Phrases')
        if nugget_type == 'NE':
            nugget_items = get_phrases(sent, 'NE')
        for nugget in nugget_items:
            nugget_list.append(' '.join(nugget))
    return nugget_list
Example 14: add_sentences
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def add_sentences(self, sentences):
    """
    @type sentences: list[Sentence]
    """
    counter = self.counter
    G = self.G
    for sent in sentences:
        counter.update(ngrams(sent.tokens, self.N))
        G.add_nodes_from(sent.tokens)
    updated_edges = []
    for v in counter.elements():
        s = v[0]
        t = v[1]
        c = counter[v]
        updated_edges.append((s, t, c))
    G.add_weighted_edges_from(updated_edges)
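The same counting idea in a self-contained form, assuming self.G is a networkx graph (as the add_weighted_edges_from call suggests) and using plain token lists in place of Sentence objects, with N=2:

import collections
import networkx as nx
import nltk

sentence_tokens = [['new', 'york', 'is', 'a', 'city'],
                   ['it', 'has', 'a', 'huge', 'population']]
counter = collections.Counter()
G = nx.Graph()
for tokens in sentence_tokens:
    counter.update(nltk.ngrams(tokens, 2))   # count bigram co-occurrences
    G.add_nodes_from(tokens)
# one weighted edge per observed bigram
G.add_weighted_edges_from((s, t, c) for (s, t), c in counter.items())
print(G['a'])  # neighbours of 'a' with their edge weights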
Example 15: __init__
# Required import: import nltk [as alias]
# Or: from nltk import ngrams [as alias]
def __init__(self, order, alpha, sentences):
    self.order = order
    self.alpha = alpha
    if order > 1:
        self.backoff = LangModel(order - 1, alpha, sentences)
        self.lexicon = None
    else:
        self.backoff = None
        self.n = 0
    self.ngramFD = nltk.FreqDist()
    lexicon = set()
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        wordNGrams = nltk.ngrams(words, order)
        for wordNGram in wordNGrams:
            self.ngramFD[wordNGram] += 1
            # self.ngramFD.inc(wordNGram)
            if order == 1:
                lexicon.add(wordNGram)
                self.n += 1
    self.v = len(lexicon)
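The counting step at the heart of this constructor can also be reproduced directly with nltk, independent of the rest of the LangModel class (a minimal sketch; word_tokenize needs the punkt data downloaded):

import nltk

sentence = 'New York is a city.'
words = nltk.word_tokenize(sentence)
fd = nltk.FreqDist(nltk.ngrams(words, 2))
print(fd.most_common(3))
# e.g. [(('New', 'York'), 1), (('York', 'is'), 1), (('is', 'a'), 1)]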