This article collects typical usage examples of the Python function nltk.tokenize.regexp_tokenize. If you are wondering what regexp_tokenize does, how to call it, or what real uses of it look like, the hand-picked examples below should help.
The following shows 15 code examples of the regexp_tokenize function, sorted by popularity by default.
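
As a quick reference before the examples: regexp_tokenize splits a string according to a regular-expression pattern. A minimal, self-contained call looks like this (the sample sentence is made up for illustration):

from nltk.tokenize import regexp_tokenize

text = "NLTK costs $0, version 3.8 is current."
# Keep runs of word characters together; everything else separates tokens
print(regexp_tokenize(text, pattern=r"\w+"))
# ['NLTK', 'costs', '0', 'version', '3', '8', 'is', 'current']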
Example 1: get_score

def get_score(self, document, lang):
    # Extract unigram and bigram tokens
    unigrams = regexp_tokenize(document, pattern_unigrams)
    bigrams = regexp_tokenize(document, pattern_bigrams)
    # Create a frequency distribution over both n-gram sets
    doc_fdist = FreqDist(unigrams + bigrams)
    sim = cosineOnDicts(self._prototypes[lang], doc_fdist, self._union)
    return sim
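
This example (and Example 8 below) relies on helpers that are not part of the excerpt: pattern_unigrams, pattern_bigrams, and cosineOnDicts. As an assumption about the missing code, a cosineOnDicts helper could be a plain cosine similarity between two frequency distributions over a shared vocabulary, sketched like this:

import math

def cosineOnDicts(a, b, union):
    # Hypothetical helper: cosine similarity between two dict-like
    # frequency distributions, compared over the shared vocabulary `union`.
    va = [a.get(t, 0) for t in union]
    vb = [b.get(t, 0) for t in union]
    dot = sum(x * y for x, y in zip(va, vb))
    norm_a = math.sqrt(sum(x * x for x in va))
    norm_b = math.sqrt(sum(x * x for x in vb))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0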
Example 2: find_version

def find_version(text):
    digit_pattern = r"(?:(\d+)\.)?(?:(\d+)\.)?(\*|\d+)"
    pattern = r"\s?[vV]ersion\s?" + digit_pattern
    pattern += r"| [vV]er\s?\.?\s?" + digit_pattern
    pattern += r"| [vV]\s?\.?\s?" + digit_pattern
    version_matches = regexp_tokenize(text, pattern)
    pattern = digit_pattern + "$"
    versions = []
    for version in version_matches:
        matches = regexp_tokenize(version, pattern)
        for match in matches:
            versions.append(match)
    return versions
Example 3: are_duplicates

def are_duplicates(doc1, doc2):
    # Cheap early exit: if the first 50 characters already differ too much,
    # the full documents cannot be duplicates.
    if len(doc1) > 50 and len(doc2) > 50 and not are_duplicates(doc1[:50], doc2[:50]):
        return False
    txt_tokens_1 = regexp_tokenize(doc1, pattern_words)
    txt_tokens_2 = regexp_tokenize(doc2, pattern_words)
    ngrams_1 = txt_tokens_1 + generate_ngrams(txt_tokens_1, 2)
    ngrams_2 = txt_tokens_2 + generate_ngrams(txt_tokens_2, 2)
    overlap = len([w for w in ngrams_1 if w in ngrams_2])
    # Dice-style overlap score; the +1 avoids division by zero
    score = (2 * overlap) / (len(ngrams_1) + len(ngrams_2) + 1)
    return score > 0.8
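
are_duplicates depends on pattern_words and generate_ngrams, which are not shown in the excerpt. A minimal sketch of what they might look like (both definitions here are assumptions, not the original project's code):

from nltk.tokenize import regexp_tokenize

pattern_words = r"\w+"  # assumed word pattern

def generate_ngrams(tokens, n):
    # Assumed helper: contiguous n-grams joined into single strings.
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

With these stand-ins, are_duplicates("the cat sat on the mat", "the cat sat on the mat!") scores roughly 0.96 and returns True.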
Example 4: __init__

def __init__(self, sentence):
    self.sentence = sentence
    self.forms = []
    for s in tuple(open(FORMS, "r")):  # read the user forms from file
        self.forms.append([w for w in regexp_tokenize(s, r"[-\w]+") if w.isalnum()])
    if self.is_valid():
        self.tokens = regexp_tokenize(self.sentence, r"(\$)|[-\w]+")  # tokenize with a regex
        self.stop_words = set(stop.words("english"))  # words to filter out of the tokens
        self.filtered = [w.lower() for w in self.tokens if w not in self.stop_words]  # remove stop words
        self.spell_checked = self.spell_check()
        self.tags = pos_tag(self.spell_checked, tagset="universal")  # part-of-speech tagging
        print(self.tags)
        self.digits = self.get_digits()
        self.user_form = self.get_user_form()
Example 5: word_split

def word_split(text):
    """
    Split a text into words. Returns a list of lowercased word tokens.
    """
    a = regexp_tokenize(text.lower().strip(), pattern=r'\w+')
    return a
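
Calling it on a simple string illustrates the effect of the \w+ pattern:

>>> word_split("Hello, World!  ")
['hello', 'world']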
Example 6: main

def main():
    args = argument_parser.main()
    global sql
    sql = SQLConnector(host=args.host, port=args.port, user=args.user, passwd=args.password, db=args.db)
    global bing
    bing = BingSearch()
    global new_software
    new_software = NewSoftware()
    global possible_tags
    possible_tags = []
    mongo = MongoConnector(host=args.H, db=args.db)
    for page in range(1):
        res = sql.load_data(page)
        rows = res.num_rows()
        if not rows:
            print "No tweets left to analyse"
            break
        for _i_ in range(1):  # rows):
            for tweet in res.fetch_row():
                tweet_id = str(tweet[0])
                text = tweet[1].lower()
                # text = "Version 2 Microsoft just released MS Office ver 3.20.2 for 99 cent 100c 10ps 13pence 10 pence"
                urls = find_url(text)
                for url in urls:
                    text = text.replace(url, "").strip()
                versions = find_version(text)
                words = regexp_tokenize(text, pattern=r"\w+([.,]\w+)*|\S+")
                # print words
                prices = find_price(words)
                pos_ = pos(words)
                ngram = ngrams(words, 5)
                try:
                    tagged_tweet = tag_tweets(ngram, tweet_id)
                    tagged_tweet.add("tweet_text", text)
                    tagged_tweet.add("sentiment", tweet[2])
                    tagged_tweet.add("url", urls)
                    tagged_tweet.add("version", versions)
                    tagged_tweet.add("price", prices)
                    if tweet_id in possible_tags:
                        print tweet_id
                    else:
                        if tagged_tweet.contains("software_id") or tagged_tweet.contains("operating_system_id"):
                            print tweet
                            print tagged_tweet
                            print
                            # mongo.insert(tagged_tweet)
                        else:
                            print tweet, "No software"
                        # sql.setTagged(tagged_tweet.get('tweet_db_id'))
                except IncompleteTaggingError, e:
                    # Tagging failed part-way; leave the tweet untagged so it can be retried later
                    print tweet_id + ":", e
                    print tweet
                    print
Example 7: simhash

def simhash(raw_text):
    """Compute the simhash value for a string."""
    fdist = FreqDist()
    for word in regexp_tokenize(raw_text, pattern=r'\w+([.,]\w+)*|\S+'):
        fdist.inc(word.lower())
    v = [0] * 128
    for word in fdist:
        projection = bitarray()
        projection.fromstring(hashlib.md5(word).digest())
        # print "\tw:%s, %d" % (word, fdist[word])
        # print "\t\t 128 bit hash: " + str(b)
        for i in xrange(128):
            if projection[i]:
                v[i] += fdist.get(word)
            else:
                v[i] -= fdist.get(word)
    hash_val = bitarray(128)
    hash_val.setall(False)
    for i in xrange(128):
        if v[i] > 0:
            hash_val[i] = True
    return hash_val
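
Simhash fingerprints like the one returned above are usually compared by Hamming distance. A small sketch of such a comparison, assuming both arguments are 128-bit bitarray values as produced by this function:

def simhash_distance(h1, h2):
    # Hamming distance: XOR the two fingerprints and count the differing bits.
    return (h1 ^ h2).count()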
Example 8: identify_language

def identify_language(self, document, default_lang=None):
    # Extract unigram and bigram tokens
    unigrams = regexp_tokenize(document, pattern_unigrams)
    bigrams = regexp_tokenize(document, pattern_bigrams)
    # Create a frequency distribution over both n-gram sets
    doc_fdist = FreqDist(unigrams + bigrams)
    predicted_lang = default_lang
    max_sim = 0.5
    for k, v in self._prototypes.items():
        sim = cosineOnDicts(v, doc_fdist, self._union)
        if sim > max_sim:
            max_sim = sim
            predicted_lang = k
    return predicted_lang
Example 9: getTokenizedQueries

def getTokenizedQueries():
    queriesFileName = "../cacm.query"
    f = open(queriesFileName, 'r')
    i = 0
    queriesList = {}
    isText = False
    for lineWithEnter in f:
        line = lineWithEnter[:-1]
        if len(line) == 0:
            continue
        elif line[0] == '<' or (line[0] == ' ' and len(line) == 1):
            isText = False
            continue
        else:
            if not isText:
                isText = True
                queriesList[i] = ""
                queriesList[i] += line
                i += 1
            else:
                queriesList[i - 1] += " "
                queriesList[i - 1] += line
                # print line
    tokenizedQueriesList = {}
    for q in queriesList:
        tokenizedQueriesList[q] = regexp_tokenize(queriesList[q], pattern=r'[\d]+[\.\,\d]*[\d]+\%?|\[\d+\]|[\w\-]+')
    return tokenizedQueriesList
Example 10: tokenizeList

def tokenizeList(tokenList):
    # remove stop words and punctuation, and stem words, to create tokens out of phrases and names
    tokenized_list = []
    for item in tokenList:
        tokenized = regexp_tokenize(item.lower(), r"[\w']+")
        for word in tokenized:
            if word not in english_stops:
                stemmed = stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None, string.punctuation)
                if not stemmed.isalpha():
                    if stemmed.isdigit():
                        stemmed = 'NUMBER'
                        tokenized_list.append(stemmed)
                    elif stemmed.isalnum():
                        stemmed = 'ALPHANUM'
                        tokenized_list.append(stemmed)
                else:
                    tokenized_list.append(stemmed)
    '''
    filtered = [word for word in tokenized if word not in english_stops]
    stemmed = [stemmer.stem(word).encode('ascii', 'ignore').lstrip().lower().translate(None, string.punctuation) for word in filtered]
    stemmed = [word for word in stemmed if word != '']
    tokenized_list.extend(stemmed)
    '''
    return tokenized_list
Example 11: tag_and_tokenize

def tag_and_tokenize(self, file):
    '''Tokenize, chunk and tag the document text; the bulk of the script's work (time) is done here.'''
    self.text = get_docx_text(file)
    self.sentences = ""
    print("Tokenize and tagging...")
    self.sentences = regexp_tokenize(self.text, pattern=r'\w+|\$[\d\.]+|\S+')
    self.sentences = [st.tag(self.sentences)]
    print("Tagging done")
Example 12: words

def words(text, splitContractions=False, contractChars=["'"]):
    r'''Uses a regexp tokenizer to tokenize text into words. If splitContractions is True,
    the regex pattern is [\w]+ so that contractions are split, e.g. "I can't" -> ['I', 'can', 't'];
    otherwise the regex pattern is [\w']+ so that contractions are kept whole, i.e. "I can't" -> ['I', "can't"].
    Additional contraction characters, e.g. a hyphen, can be added by overriding the contractChars arg.'''
    if splitContractions:
        pat = r"[\w]+"
    else:
        pat = "[\\w{0}]+".format(reduce(lambda x, y: x + y, contractChars, ""))
    return regexp_tokenize(text, pat, discard_empty=True)
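
Assuming reduce is available (functools.reduce in Python 3), the two modes behave like this:

>>> words("I can't go")
['I', "can't", 'go']
>>> words("I can't go", splitContractions=True)
['I', 'can', 't', 'go']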
Example 13: index

def index(request):
    if request.method == "POST":
        if request.POST.get("tokens"):
            with open(settings.BASE_DIR + "/data/corpus.pkl", 'rb') as handle:
                corpus = pickle.load(handle)
            tokens = ast.literal_eval(request.POST.get("tokens"))
            tagged = []
            i = 1
            for item in tokens:
                tagged.append((item, request.POST.get("token_" + str(i))))
                i += 1
            if tagged not in corpus:
                corpus.append(tagged)
                with open(settings.BASE_DIR + "/data/corpus.pkl", 'wb') as handle:
                    pickle.dump(corpus, handle)
                tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
                symbols = unique_list(word for sent in corpus for (word, tag) in sent)
                trainer = HiddenMarkovModelTrainer(tag_set, symbols)
                hmm = trainer.train_supervised(corpus, estimator=LaplaceProbDist)
                with open(settings.BASE_DIR + "/data/hmm.pkl", 'wb') as handle:
                    pickle.dump(hmm, handle)
            return render(request, 'tagger/index.html', {'corpus': corpus})
        else:
            if request.POST.get("random") == 'true':
                address = get_random_address()
                if not address:
                    return render(request, 'tagger/index.html', {'error_message': 'No random addresses left'})
            else:
                address = request.POST.get("address")
            tokens = regexp_tokenize(address, pattern=r'\d+|[^\r\n\t\f 0-9,]+|,')
            if tokens:
                pkl_file = open(settings.BASE_DIR + "/data/hmm.pkl", 'rb')
                hmm = pickle.load(pkl_file)
                pkl_file.close()
                tagged = hmm.tag(tokens)
                tags_file = open(settings.BASE_DIR + "/data/tags.json", 'rb')
                reader = codecs.getreader("utf-8")
                tags = json.load(reader(tags_file))
                tags_file.close()
                return render(request, 'tagger/index.html', {'address': address,
                                                             'tokens': tokens,
                                                             'tagged': tagged,
                                                             'tags': sorted(tags.items(), key=operator.itemgetter(1))})
    return render(request, 'tagger/index.html', {})
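
The pattern passed to regexp_tokenize above keeps digit runs and commas as separate tokens, which suits address tagging. For example, on a made-up address:

>>> from nltk.tokenize import regexp_tokenize
>>> regexp_tokenize("12 Main Street, Dublin 4", pattern=r'\d+|[^\r\n\t\f 0-9,]+|,')
['12', 'Main', 'Street', ',', 'Dublin', '4']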
Example 14: getReviews

def getReviews(rootdir):
    reviews = []
    unique = []
    for folder, subs, files in os.walk(rootdir):
        for filename in files:
            with open(os.path.join(folder, filename), 'r') as src:
                review = src.read()
                words = regexp_tokenize(review, r"\w+")
                for word in words:
                    unique.append(word)  # collected here but not returned
                reviews.append(review)
    return reviews
Example 15: _tokenize_content

def _tokenize_content(self):
    tokenized_content = []
    raw_content = self._clean_content()
    content_sents = sent_tokenize(raw_content)
    content_words_by_sents = map(lambda sent: word_tokenize(sent), content_sents)
    stopwords = regexp_tokenize(STOPWORDS, r"[\w']+")
    extra_puncts = ['),', ').', '%),', '%).', '):', '()', '://', '>.', '.;', '...', '/>.']
    puncts = list(punctuation) + extra_puncts
    stopwords.extend(puncts)
    for sent in content_words_by_sents:
        clean_sent = [word for word in sent if word not in stopwords]
        tokenized_content.append(clean_sent)
    return tokenized_content