本文整理汇总了Python中nltk.corpus.reader.xmldocs.XMLCorpusReader.fileids方法的典型用法代码示例。如果您正苦于以下问题：Python XMLCorpusReader.fileids方法的具体用法？Python XMLCorpusReader.fileids怎么用？Python XMLCorpusReader.fileids使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.corpus.reader.xmldocs.XMLCorpusReader的用法示例。
在下文中一共展示了XMLCorpusReader.fileids方法的2个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: feature_apply
# Required import: from nltk.corpus.reader.xmldocs import XMLCorpusReader [as alias]
# Or: from nltk.corpus.reader.xmldocs.XMLCorpusReader import fileids [as alias]
def feature_apply(feature_extractor, feature_vector, attribute, number_of_file,
                  corpus_root=None):
    """
    Extract a labelled feature set from the first documents of an XML corpus.

    Each file is parsed with ``XMLCorpusReader.xml``. The ``text`` of the
    children of the root's first child is joined with single spaces; children
    whose ``text`` is ``None`` are ignored. Documents for which
    ``textstat.sentence_count`` reports 0 are skipped.

    :param feature_extractor: callable invoked as
        ``feature_extractor(txt, feature_vector)``
    :param feature_vector: passed through unchanged as the second argument of
        ``feature_extractor``
    :param attribute: name of the root-element XML attribute used as the
        label (per the original author's note: gender or age)
    :param number_of_file: maximum number of corpus files to process
    :param corpus_root: directory of the corpus; when ``None`` the original
        hard-coded training-corpus path is used
    :return: list of ``(features, label)`` tuples, one per retained document
    :raises KeyError: if the "count" attribute or ``attribute`` is missing
    :raises ValueError: if the "count" attribute is not an integer
    """
    if corpus_root is None:
        corpus_root = '/root/Downloads/TextMining/pan13-author-profiling-training-corpus-2013-01-09/en'
    newcorpus = XMLCorpusReader(corpus_root, '.*')
    doc_list = newcorpus.fileids()
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(len(doc_list))

    feature_set = []
    for processed, fileid in enumerate(doc_list[:number_of_file], 1):
        # Progress indicator every 50 documents.
        if processed % 50 == 0:
            print(processed)
        root = newcorpus.xml(fileid)
        conversations = root[0]
        declared_count = int(conversations.attrib["count"])
        # zip() bounds the iteration by both the declared count and the real
        # number of children, so an overstated "count" no longer raises
        # IndexError while an understated one still truncates as before.
        texts = [
            conversation.text
            for _, conversation in zip(range(declared_count), conversations)
            if conversation.text is not None
        ]
        txt = " ".join(texts)
        if textstat.sentence_count(txt) != 0:
            feature_set.append(
                (feature_extractor(txt, feature_vector), root.attrib[attribute])
            )
    return feature_set
示例2: test_set
# Required import: from nltk.corpus.reader.xmldocs import XMLCorpusReader [as alias]
# Or: from nltk.corpus.reader.xmldocs.XMLCorpusReader import fileids [as alias]
def test_set(corpus_dir, feature_extrator, vect_path, i):
    """
    Build the labelled feature set for every document of a test corpus.

    :param corpus_dir: path of the test corpus; the truth file path is derived
        from it by dropping its last two characters and appending
        "truth-en.txt"
    :param feature_extrator: callable invoked as ``feature_extrator(txt, vect)``
    :param vect_path: argument handed to ``create_feature_vect`` to obtain the
        feature vector
    :param i: index into each truth entry (per the original author's note:
        0 refers to the gender, any other value to the age)
    :return: list of ``(features, true_label)`` tuples, one per retained
        document
    """
    feature_vect = create_feature_vect(vect_path)
    reader = XMLCorpusReader(corpus_dir, '.*')
    file_names = reader.fileids()
    labelled_features = []
    truth_path = corpus_dir[:-2] + "truth-en.txt"
    truth_by_file = extract_true_pred(truth_path)

    for xml_name in file_names:
        root = reader.xml(xml_name)
        print(root[0].attrib["count"])
        txt = fetch_text(root)
        # Skip documents with no detected sentence or with empty text.
        if textstat.sentence_count(txt) == 0 or txt == "":
            continue
        features = feature_extrator(txt, feature_vect)
        labelled_features.append((features, truth_by_file[xml_name][i]))
    return labelled_features