This article collects typical usage examples of the nltk.corpus.stopwords.fileids function in Python. If you have been wondering how exactly to use the fileids function, how it works, or what it looks like in real code, the hand-picked examples below may help.
The following shows 15 code examples of the fileids function, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
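The snippets were collected from different projects and omit their imports. As a rough common setup (inferred from the calls in the code, not part of the originals), they assume something like the following; Example 2 additionally relies on Flask's request object, zipfile, and requests:

import sys
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize, word_tokenize

nltk.download('stopwords')  # the stopword lists must be downloaded once before use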
Example 1: split_count
def split_count(sentence):
    """Split the sentence and record which language each stopword comes from."""
    languages_ratios = {}
    lang_dict = {}
    tokens = wordpunct_tokenize(sentence)  # tokenize the input
    words = [word.lower() for word in tokens]  # lowercase every token
    words_set = set(words)  # the unique words in the sentence
    vocab_list = words
    for language in stopwords.fileids():  # iterate over the languages built into NLTK
        stopwords_set = set(stopwords.words(language))
        common_element = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_element)  # this determines the score
        lang_dict[language] = common_element
    # Rank languages by score: the best match is the main language, the runner-up second.
    ranked = sorted(languages_ratios, key=languages_ratios.get, reverse=True)
    main_language, secondary_lang = ranked[0], ranked[1]
    print("this is the set for main lang:", lang_dict.get(main_language), "\n")
    print("this is the set for second lang:", lang_dict.get(secondary_lang), "\n")
    print("this is vocab_list:", vocab_list, "\n")
    print("this is DICT:", lang_dict)
    print("ORIGINAL SENTENCE:", sentence)
Example 2: hello_world
def hello_world():
    if request.method == 'POST':
        print("Request:", request)
        print("Form:", request.form)
        print("Files:", request.files)
        archive = zipfile.ZipFile(request.files.get("solution"))
        with archive.open("extra.txt") as solution:
            languages_ratios = {}
            tokens = nltk.wordpunct_tokenize(solution.read().decode('utf-8'))
            words_list = [word.lower() for word in tokens]
            words_set = set(words_list)
            print("Words_set:", words_set)
            for language in stopwords.fileids():
                stopwords_set = set(stopwords.words(language))
                common_elements = words_set.intersection(stopwords_set)
                if common_elements:
                    languages_ratios[language] = len(common_elements)
            print("Language ratios:", languages_ratios)
            # 50% of the mark: the dominant language must be English
            mark = 50 if max(languages_ratios, key=languages_ratios.get) == 'english' else 0
            print("Mark for lang:", mark)
            # the other 50%: scales with word count, capped at 200 words
            words_count = len(words_list)
            print("Words count:", words_count)
            mark += (float(words_count) / 200) * 50 if words_count < 200 else 50
            print("Total Mark:", mark)
            req = requests.post(request.form["url"], data={"mark": int(mark)})
    return ''
Example 3: calculate_language_scores
def calculate_language_scores(text):
    """
    Calculate the likelihood that the given text is written in each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary mapping each language to the number of unique stopwords
        seen in the analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, str):
        raise TypeError("Expected str, got '%s' instead" % type(text))
    if not text:
        return {}
    languages_ratios = {}
    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]
    words_set = set(tokenized_words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    return languages_ratios
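A quick usage sketch for the function above (hypothetical call, not from the original project; the exact counts depend on your installed NLTK stopword data):

scores = calculate_language_scores("The quick brown fox jumps over the lazy dog")
# scores maps every NLTK language to a stopword-overlap count, e.g. scores['english'] > 0
print(max(scores, key=scores.get))  # expected to print 'english'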
Example 4: _calculate_languages_ratios
def _calculate_languages_ratios(text):
    """
    Calculate the likelihood that the given text is written in each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary with languages and unique stopwords seen in the analyzed text
    @rtype: dict
    """
    languages_ratios = {}
    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    # For each language included in NLTK, count the unique stopwords appearing in the text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    return languages_ratios
Example 5: detect_language
def detect_language(comment):
    """
    To detect the language we compare a comment to stopword lists from each language.
    The language that has the most stopwords in common with the comment is likely the
    language the comment is written in. This is obviously not foolproof; however, a
    well-written comment works far better than one written in slang or with poor
    grammar. Ultimately, this tends to favor comments that are more valuable because
    of their structure. In addition, languages that are easily distinguished from
    English can be detected, which makes it possible to compare the language of a
    comment to the actual content annotated in Hypothes.is, since most users won't
    understand comments in a different language anyway.
    """
    # first we tokenize the comment
    tokens = wordpunct_tokenize(comment)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    languages_ratios = {}
    # then we compare the words to the most frequent stopwords per language
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        # calculate the language score
        languages_ratios[language] = len(common_elements)
    # get the key with the highest value
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    return most_rated_language
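A minimal demo of detect_language (illustrative sentences, assuming the stopword corpus is installed):

print(detect_language("this is a comment that we would like to classify"))  # likely 'english'
print(detect_language("ceci est une phrase que nous voulons classifier"))   # likely 'french'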
Example 6: calcularValoresDeIdioma
def calcularValoresDeIdioma(contenido):
    languages_ratios = {}
    tokens = wordpunct_tokenize(contenido)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)
    return languages_ratios
Example 7: calculate_languages_ratios
def calculate_languages_ratios(text):
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    return languages_ratios
Example 8: detectLanguage
def detectLanguage(self, text):
    languages_scores = {}
    tokens = word_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    # For each language included in NLTK, count the unique stopwords
    # appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_scores[language] = len(common_elements)  # language "score"
    return max(languages_scores, key=languages_scores.get)
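Unlike most of the other examples, this one tokenizes with word_tokenize, which splits contractions differently from wordpunct_tokenize and can therefore change the stopword overlap slightly (it also needs the 'punkt' models downloaded):

nltk.download('punkt')  # word_tokenize depends on the punkt tokenizer models
print(wordpunct_tokenize("Don't stop"))  # ['Don', "'", 't', 'stop']
print(word_tokenize("Don't stop"))       # ['Do', "n't", 'stop']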
Example 9: check_language
def check_language(self, word_list):
    """source: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/"""
    languages_ratios = {}
    words_set = set(word_list)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        # check similarity
        common_elements = words_set.intersection(stopwords_set)
        # save as ratio
        languages_ratios[language] = len(common_elements)
    # get the language with the most similarities
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    return most_rated_language
Example 10: cal
def cal():
    text = sys.stdin.read()
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)
    most = max(languages_ratios, key=languages_ratios.get)
    print(most)
Example 11: _calculate_languages_ratios
def _calculate_languages_ratios(text):
    text = str(text)  # make sure we are working with a string
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    # For each language included in NLTK, count the unique stopwords appearing in the text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    return languages_ratios
Example 12: _calculate_languages_ratios
def _calculate_languages_ratios(self, text):
    # Estimate how likely the text is to be written in each language and
    # return a dictionary that looks like {'french': 2, 'english': 4, 'dutch': 0}
    languages_ratios = {}
    tokens = self.getWords(text)
    words_set = set(tokens)
    # Count, per language, the number of stopwords that appear.
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # stopwords seen per language
    return languages_ratios
Example 13: language_detector
def language_detector(string):
    tokens = wordpunct_tokenize(string)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    # compute language scores
    languages_ratios = {}
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    most_rated_language = max(languages_ratios, key=languages_ratios.get)
    return most_rated_language
Example 14: lang_likelihood
def lang_likelihood(self, document):
    '''This method computes the language likelihood using an algorithm
    and tokenizer from NLTK.
    '''
    languages_likelihood = {}
    tokens = wordpunct_tokenize(document)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_likelihood[language] = len(common_elements)  # language "score"
    return languages_likelihood
Example 15: _calculate_languages_ratios
def _calculate_languages_ratios(text):
    """
    Calculate the likelihood that the given text is written in each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary with languages and unique stopwords seen in the analyzed text
    @rtype: dict
    """
    languages_ratios = {}
    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens:
    # >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    # ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    words_set = set(words)
    # For each language included in NLTK, count the unique stopwords appearing in the text.
    # Portuguese gets a few extra domain-specific terms on top of the built-in list.
    for language in stopwords.fileids():
        if language == "portuguese":
            lista = stopwords.words(language)
            lista.extend(['Fatec', 'fatec', 'Palmeiras', 'palmeiras',
                          'Dilma', 'dilma', 'Copa', 'copa'])
            stopwords_set = set(lista)
        else:
            stopwords_set = set(stopwords.words(language))
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"
    return languages_ratios
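A usage sketch for this Portuguese-augmented variant (made-up sentence, purely for illustration; the added lowercase terms match because the words are lowercased before comparison):

ratios = _calculate_languages_ratios("A Fatec venceu a Copa com o Palmeiras")
print(max(ratios, key=ratios.get))  # likely 'portuguese', helped by the extra terms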