This article collects typical usage examples of the helper.Helper.read_file method in Python. If you are wondering how Helper.read_file is used, or what a call to it looks like in practice, the curated method examples here may help. You can also explore further usage examples of the enclosing class, helper.Helper.
One code example of the Helper.read_file method is shown below; examples are sorted by popularity by default.
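The example below depends on a project-local helper.Helper class whose source is not shown on this page. To make the example easier to follow, here is a minimal sketch of what that class might look like, reconstructed only from the calls the example makes (read_file, write_file, file_exists) and the directory attributes it reads. All directory paths, the filtered_directory attribute, and the mapping from the one-letter identifiers "f"/"s"/"l" to directories are assumptions, not the original implementation.

# Hypothetical sketch of helper.Helper; the real implementation is not
# shown on this page, so paths and the identifier mapping are assumptions.
import os

class Helper:
    def __init__(self):
        self.directory_original_books = "books/original/"   # assumed path
        self.filtered_directory = "books/filtered/"         # assumed path
        self.stemming_directory = "books/stemmed/"          # assumed path
        self.lemmatization_directory = "books/lemmatized/"  # assumed path

    def read_file(self, filename):
        # read a book from the original-books directory
        with open(os.path.join(self.directory_original_books, filename), encoding="utf-8") as f:
            return f.read()

    def write_file(self, text, filename, identifier):
        # map the one-letter identifier to a target directory (assumed mapping)
        directories = {"f": self.filtered_directory,
                       "s": self.stemming_directory,
                       "l": self.lemmatization_directory}
        with open(os.path.join(directories[identifier], filename), "w", encoding="utf-8") as f:
            f.write(text)

    def file_exists(self, directory, filename):
        return os.path.isfile(os.path.join(directory, filename))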
Example 1: __init__
# Required module import: from helper import Helper [as alias]
# Or: from helper.Helper import read_file [as alias]
import os

import nltk
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from helper import Helper
class TextAnalyzer:
    def __init__(self):
        self.Helper = Helper()
        self.Stemmer = SnowballStemmer("english")
        self.Lemmatizer = WordNetLemmatizer()
        self.filtered_books = []
        self.stemmed_books = []
        self.lemmatized_books = []
        self.all_tokens_counts = []
        self.distinct_tokens_counts = []
        self.distinct_tokens_counts_with_stemming = []
        self.distinct_tokens_counts_with_lemmatization = []

    def process_text(self, specific_id=-1, perform_stemming=True, perform_lemmatization=True):
        for filename in os.listdir(self.Helper.directory_original_books):
            if filename.endswith(".txt"):
                # only analyze the book with the specific ID, if one was given
                if specific_id >= 0 and filename != str(specific_id) + ".txt":
                    continue
                book = self.Helper.read_file(filename)
                # analyze books without stemming
                tokens = self.tokenize(book)
                # join the tokens again to obtain the filtered text for stemming
                book = " ".join(tokens)
                self.Helper.write_file(book, filename, "f")
                if perform_stemming and not self.Helper.file_exists(self.Helper.stemming_directory, filename):
                    stemmed_book = self.perform_stemming(book, filename)
                    self.stemmed_books.append(stemmed_book)
                    self.Helper.write_file(stemmed_book, filename, "s")
                else:
                    print(self.Helper.stemming_directory + filename, "already exists.")
                if perform_lemmatization and not self.Helper.file_exists(self.Helper.lemmatization_directory, filename):
                    lemmatized_book = self.perform_lemmatization(book, filename)
                    self.lemmatized_books.append(lemmatized_book)
                    self.Helper.write_file(lemmatized_book, filename, "l")
                else:
                    print(self.Helper.lemmatization_directory + filename, "already exists.")

    # tokenization, currently without stopword removal
    def tokenize(self, text):
        book_lower_case = text.lower()
        tokens = nltk.wordpunct_tokenize(book_lower_case)
        # keep purely alphabetic tokens only (drops numbers and punctuation)
        tokens_filtered = [t for t in tokens if t.isalpha()]
        return tokens_filtered

    def perform_stemming(self, book, filename):
        print("INFO: STEMMING", filename)
        stemmed_book = ""
        for word in book.split():
            stemmed_word = self.Stemmer.stem(word)
            stemmed_book += stemmed_word + " "
        return stemmed_book

    def perform_lemmatization(self, book, filename):
        print("INFO: LEMMATIZING", filename)
        lemmatized_book = ""
        for word in book.split():
            lemmatized_word = self.Lemmatizer.lemmatize(word)
            lemmatized_book += lemmatized_word + " "
        return lemmatized_book

    def count_distinct_tokens(self, tokens):
        # return the number of distinct tokens, not the set itself,
        # so the per-book counts can be summed below
        return len(set(tokens))

    def count_documents(self):
        return len(self.filtered_books)

    def count_all_tokens_of_all_books(self):
        return sum(self.all_tokens_counts)

    def count_distinct_tokens_of_all_books(self):
        return sum(self.distinct_tokens_counts)

    def count_distinct_tokens_of_all_books_with_stemming(self):
        return sum(self.distinct_tokens_counts_with_stemming)

    def count_distinct_tokens_of_all_books_with_lemmatization(self):
        return sum(self.distinct_tokens_counts_with_lemmatization)

    def calculate_word_frequencies(self, identifier):
        combined_books = ""
        if identifier == "f":
            for book in self.filtered_books:
                combined_books += book + "\n"
        elif identifier == "s":
            for book in self.stemmed_books:
                combined_books += book + "\n"
        # ......... part of the code omitted here .........
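For orientation, a short usage sketch follows; it is not part of the original example. It assumes the directory layout from the hypothetical Helper above, that the original-books directory contains numbered files such as 3.txt, and that the NLTK WordNet data has been downloaded.

# Usage sketch under the assumptions stated above.
import nltk

# the lemmatizer requires the WordNet corpus (one-time download)
nltk.download("wordnet")

analyzer = TextAnalyzer()
analyzer.process_text(specific_id=3)           # process only 3.txt (assumed filename)
analyzer.process_text(perform_stemming=False)  # lemmatize every book, skip stemming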