This article collects typical usage examples of the nltk.tokenize.RegexpTokenizer class in Python. If you are unsure what RegexpTokenizer is for, how to use it, or what its usage looks like in real projects, the hand-picked class code examples below should help.
The following sections show 15 code examples of the RegexpTokenizer class, ordered by popularity by default.
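Before the examples, a minimal sketch of the class itself may help: you hand the constructor a regular expression and call tokenize() on a string, and every match of the pattern becomes a token. (The pattern and sample sentence below are illustrative, not taken from any of the projects quoted later.)

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')            # one token per run of word characters
print(tokenizer.tokenize("Hello, world! It's 2016."))
# ['Hello', 'world', 'It', 's', '2016']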
Example 1: preprocess
def preprocess(TWEETS, typeTweet):
    wordlist = []
    tokenizer = RegexpTokenizer(r'#?\w+')
    # normalize text -- tokenize using the regexp tokenizer
    cnt = 0
    for item in TWEETS:
        text = TWEETS[cnt]
        tweet = ''.join(text)
        tweet = tweet.lower().strip('\n')
        tweet = re.sub(r'[0-9]+', "", tweet)
        tweet = re.sub(r'@[^\s]+', "", tweet)
        tweet = re.sub(r'#\w+primary', "", tweet)
        wordlist.extend(tokenizer.tokenize(tweet))
        cnt += 1
    # remove stopwords
    stop = stopwords.words('english') + ['rt', 'via', 'u', 'r', 'b', '2', 'http',
                                         'https', 'co', 'live', 'hall', 'town', 'watch',
                                         'tune', 'time', 'tonight', 'today', 'campaign',
                                         'debate', 'wants', 'without', 'dont',
                                         '#hillaryclinton', '#berniesanders', '#donaldtrump',
                                         '#tedcruz', '#johnkasich', '#politics']
    filtered = [term for term in wordlist if term not in stop]
    filtered_final = [term for term in filtered if len(term) > 3]
    print 'Preprocessed %s tweets' % (typeTweet)
    return filtered_final
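A note on the pattern: r'#?\w+' lets Example 1 keep hashtags as single tokens while still splitting ordinary words. As a rough, hypothetical illustration (not part of the original project):

tok = RegexpTokenizer(r'#?\w+')
print(tok.tokenize("rt #iowa caucus results"))
# ['rt', '#iowa', 'caucus', 'results']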
Example 2: lda
def lda(data):
    data = get_only_text(data)
    only_tweet = data
    length = len(only_tweet)
    length = min(20, length)
    for i in xrange(0, length):
        print i
        print only_tweet[i]
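    # NOTE: this early return stops the function after printing the first tweets,
    # so the LDA pipeline below never runs; delete it to actually fit the model.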
    return
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    length = len(only_tweet)
    length = min(20, length)
    total_texts = []
    for i in xrange(0, length):
        print only_tweet[i]
        print
        to_lower = only_tweet[i].lower()
        tokens = tokenizer.tokenize(to_lower)
        stopped_tokens = [k for k in tokens if not k in en_stop]
        texts = [p_stemmer.stem(k) for k in stopped_tokens]
        total_texts.append(texts)
    dictionary = corpora.Dictionary(total_texts)
    corpus = [dictionary.doc2bow(text) for text in total_texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    result = ldamodel.print_topics(num_topics=2, num_words=1)
    for i in result:
        print i
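For reference, the gensim half of Example 2 (Dictionary, doc2bow, LdaModel) can be exercised on its own with a couple of toy token lists. A hedged sketch using made-up documents, not the tweets the function expects:

from gensim import corpora, models

docs = [['cat', 'sit', 'mat'], ['dog', 'bark', 'loud']]   # toy tokenized documents
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(d) for d in docs]
lda = models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
print(lda.print_topics(num_topics=2, num_words=1))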
Example 3: textToWordList
def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword = [
        'дом',
        'город',
        'дорог',
        'час',
        'ноч',
        'слов',
        'утр',
        'стран',
        'пут',
        'путешеств',
        'мест',
        'нов',
        'друз',
        'добр'
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if i not in stop_w and r.match(i) and i not in badword]
    return tokens
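In Example 3 the check r.match(i) with the pattern ^[а-я]+$ keeps only tokens made up entirely of lowercase Cyrillic letters, so numbers, Latin words and leftover fragments are dropped after stemming. A small illustrative check (not from the original code):

import re

r = re.compile('^[а-я]+$')
print([w for w in ['привет', 'hello', '2016'] if r.match(w)])
# ['привет']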
Example 4: Tokenize
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()
    # create English stop words list
    en_stop = get_stop_words('en')
    # create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()
    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens
    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
Example 5: tokenize
def tokenize(self, doc):
    '''
    use the NLTK RegexpTokenizer
    '''
    tokenizer = RegexpTokenizer(r"\w{3,}")
    return [self.stemmer.stem(x) for x in tokenizer.tokenize(doc)]
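The pattern \w{3,} in Example 5 folds a length filter into the tokenizer itself: anything shorter than three word characters never becomes a token. Roughly:

print(RegexpTokenizer(r'\w{3,}').tokenize("a an the cat sat"))
# ['the', 'cat', 'sat']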
Example 6: text_process
def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Tokenizes and removes punctuation
    2. Removes stopwords
    3. Stems
    4. Returns the cleaned tokens re-joined into a single string
    '''
    if pd.isnull(text):
        return []
    # Tokenize
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    # Remove any stopwords
    text_processed = [word.lower() for word in text_processed if word.lower() not in stopwords.words('english')]
    # Stemming
    porterStemmer = PorterStemmer()
    text_processed = [porterStemmer.stem(word) for word in text_processed]
    try:
        text_processed.remove('b')
    except ValueError:
        pass
    return " ".join(text_processed)
Example 7: trainMarkovChain
def trainMarkovChain(self, n=1):
    self.ngram_degree = n
    self.markov_model = defaultdict(lambda: defaultdict(int))
    sentences = self.corpus_sentences
    if sentences is None:
        sentences = self.sentenceTokenizeCorpus()
    print("Training markov model on corpus.")
    word_tokenizer = RegexpTokenizer(r"\w+")
    for sentence in sentences:
        words = word_tokenizer.tokenize(sentence)
        last_word_list = ["#"] * n
        for word in words:
            last_token = " ".join(last_word_list)
            self.markov_model[last_token][word] += 1
            last_word_list.append(word)
            last_word_list = last_word_list[1:]
        last_token = " ".join(last_word_list)
        self.markov_model[last_token]["#"] += 1
Example 8: __init__
def __init__(self, oldid, newid, data, general):
    self.newid = newid
    self.oldid = oldid
    self.data = data
    self.tfidfatt = []
    self.tfidfval = []
    self.freatt = []
    self.freval = []
    self.text = ''
    self.ntlk = []
    self.idfvalue = []
    self.general = general
    tokenizer = RegexpTokenizer(r'\w+')
    #stemmer = SnowballStemmer("english")
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    for r in tokenizer.tokenize(data):
        a = 0
        if r not in stop:
            if not any(i.isdigit() for i in r):
                r = stemmer.stem(r)
                if r not in self.ntlk:
                    self.ntlk.append(r)
                    self.text = self.text + ' ' + r
Example 9: mean_stdDeviation
def mean_stdDeviation(self, query, stopWordInstruction):
    list_count_postTitles = []
    list_postTitles = self.data[:][query].tolist()
    tokenizer = RegexpTokenizer(r'\w+')
    stopwords_mine = []
    #a.encode('ascii','ignore')
    stopwords_mine += (word.encode('ascii', 'ignore') for word in stopwords.words('english'))
    tokenized_list = []
    new_list_tokenized = []
    for item in list_postTitles:
        tokenized_list.append(tokenizer.tokenize(item))
    if stopWordInstruction == True:
        for item in tokenized_list:
            temp = []
            temp += (word for word in item if word.lower() not in stopwords_mine)
            #print temp
            #raw_input()
            new_list_tokenized.append(temp)
    else:
        new_list_tokenized = copy.deepcopy(tokenized_list)
    for x in new_list_tokenized:
        list_count_postTitles.append(len(x))
    #print list_count_postTitles
    npArray = np.asarray(list_count_postTitles)
    print npArray.mean()
    print npArray.std()
    return [npArray.mean(), npArray.std(), list_postTitles, list_count_postTitles]
Example 10: issue_analysis
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)
    Issue_List = []
    for i in range(0, 50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_index(by='count', ascending=False).ix[i].name)
    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')    # set tokenize regex
    en_stop = get_stop_words('en')                    # create English stop words list
    p_stemmer = PorterStemmer()                       # create p_stemmer of class PorterStemmer
    texts = []                                        # list for tokenized documents in loop
    text_view = ''
    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens and add them to the list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
        #print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '
    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    fig1 = fig.add_subplot(1, 1, 1)
    fig1.set_title("Top issued words", fontdict={'fontsize': 25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    #plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')
    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]
    # generate the LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    #print "\n Topic analysis result for top 25 issues with LDA"
    #print(LDAText)
    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    #pyLDAvis.show(vis_data)
    #pyLDAvis.save_html(vis_data, "issue_lda.html")
    #pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")
    return 0
Example 11: stripped_words
def stripped_words(self, original_sentence):
    _sentence = filter(self.printable_char_filter, original_sentence)
    _sentence = _sentence.replace(u'\u2013', ' ')
    _sentence = _sentence.replace(u'\u2014', ' ')
    tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    tokens = tokenizer.tokenize(_sentence)
    return [word.lower() for word in tokens if word.lower() not in stop_words]
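The pattern \w+|\$[\d\.]+|\S+ in Example 11 is essentially the one shown in the NLTK documentation: it keeps runs of word characters, keeps currency amounts such as $3.88 as single tokens, and falls back to any other run of non-whitespace, so trailing punctuation survives as its own token. For instance:

print(RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']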
Example 12: relevance_features
def relevance_features(doc):
    print "relfeatures"
    print doc[:10]
    features = {}
    #print doc
    # Test 1: has synonyms of NIT Warangal
    features['contains synonym'] = 'false'
    for word in synonyms:
        if word in doc:
            features['contains synonym'] = 'true'
            break
    # Test 2: has a person name that appears in Almabase's DB
    count = 0
    names = ner.get_names(data)
    count = ner.query_db(names)
    print 'count is {}'.format(count)
    # if count==0:
    #     features['hasAlumnus']='none'
    # elif count<=3:
    #     features['hasAlumnus']='medium'
    # elif count>3:
    #     features['hasAlumnus']='high'
    # print count
    # Test 3: bag of words approach
    tokenizer = RegexpTokenizer(r'\w+')
    document_words = tokenizer.tokenize(doc)
    for word in word_features:
        if word.lower() in document_words:
            print "{} is present".format(word)
        features['contains({})'.format(word.lower())] = (word in document_words)
    return features
Example 13: preprocess_wikidata
def preprocess_wikidata(raw):
    # initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # initialize lemmatizer
    lemma = WordNetLemmatizer()
    # create English stop words list
    en_stop = get_stop_words('en')
    # decode Wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)
    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]
    # lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return (tokens, text)
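Two details in Example 13 are easy to miss: filter_wiki comes from gensim.corpora.wikicorpus (filter_more is a project-level regex that is not shown here), and the re.sub(r'[^a-z]', '', i) step can leave empty or shortened strings behind, which the final len(i) > 2 filter then cleans up. A small illustration with made-up tokens:

import re

tokens = ['2016', 'véhicule', 'data']
tokens = [re.sub(r'[^a-z]', '', t) for t in tokens]   # -> ['', 'vhicule', 'data']
tokens = [t for t in tokens if len(t) > 2]            # -> ['vhicule', 'data']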
Example 14: get_product_vocab
def get_product_vocab(dict_queries):
    tok = RegexpTokenizer(r'\w+')
    vocab = {}
    for query, v in dict_queries.items():
        words = defaultdict(int)
        for prod in v:
            w_prod = tok.tokenize(prod[1])
            for w in w_prod:
                #wt = stem(wt)
                if not re.match(r'\d+$', w) and \
                   len(w) > 1 and \
                   w not in stop_words:
                    words[w] += 1
        vocab[query] = words.keys()
        #vocab[query] = [k for (k, v) in words.iteritems() if v > 1]
        """
        print "Query: " + query
        sorted_w = sorted(words.items(), key=lambda x:x[1], reverse=True)
        print sorted_w
        """
    return vocab
Example 15: write_summary
def write_summary(texts, ofile):
    word_tokenizer = RegexpTokenizer(r"\w+")
    with codecs.open(ofile, u"w", u"utf-8") as f:
        for text in texts:
            f.write(u" ".join([w.lower() for w in word_tokenizer.tokenize(text)]))
            f.write(u"\n")
        f.flush()