This article collects typical usage examples of the corpus.Corpus class in Python. If you have been wondering what the Python Corpus class is for, how to use it, or what real code that uses it looks like, the selected examples below may help.
The following presents 15 code examples of the Corpus class, drawn from different projects and sorted by popularity by default.
Example 1: output_json_format

def output_json_format(self, parse_path, rel_path):
    preds = [it.strip().split()[-1] for it in open(self.predicted_file)]
    rel_dict = Corpus.read_relations(rel_path)
    idx = 0
    for art in Corpus.read_parses(parse_path, rel_dict):
        for rel in art.relations:
            if rel.rel_type == 'Explicit':
                continue
            pred_sense = preds[idx]
            json_dict = {}
            json_dict['DocID'] = rel.doc_id
            if pred_sense == 'EntRel':
                r_type = 'EntRel'
            elif pred_sense == 'NoRel':
                r_type = 'NoRel'
            else:
                r_type = 'Implicit'
            json_dict['Type'] = r_type
            json_dict['Sense'] = [pred_sense.replace('_', ' ')]
            json_dict['Connective'] = {}
            json_dict['Connective']['TokenList'] = []
            json_dict['Arg1'] = {}
            json_dict['Arg1']['TokenList'] = []
            json_dict['Arg2'] = {}
            json_dict['Arg2']['TokenList'] = []
            print(json.dumps(json_dict))
            idx += 1
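For reference, each line printed by this method is a JSON object with the fields built above. The record below is only an illustration of that shape; the DocID and Sense values are placeholders, not output from any actual corpus.

import json

# Illustrative record only; real DocID and Sense values depend on the data.
example = {
    "DocID": "doc_0001",
    "Type": "Implicit",
    "Sense": ["some sense label"],
    "Connective": {"TokenList": []},
    "Arg1": {"TokenList": []},
    "Arg2": {"TokenList": []},
}
print(json.dumps(example))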
Example 2: contruir_corpus_experimento

def contruir_corpus_experimento(self):
    '''Builds the experiment dataset.'''
    c = Corpus()
    if self.tamanio == 'BI':
        busquedaInicial = leer_archivo(open(self.directorio + 'bi.csv', 'r'), eliminar_primero=True)
        clasificados = leer_archivo(open(self.directorio + 'clasificados.csv', 'r'), eliminar_primero=True)
    elif self.tamanio == 'Univ':
        busquedaInicial = leer_archivo(open(self.directorio + 'dataPapers.csv', 'r'), eliminar_primero=True)
        clasificados = leer_archivo(open(self.directorio + 'validacion.csv', 'r'), eliminar_primero=True)
    conjuntoA = leer_archivo(open(self.directorio + 'a.csv', 'r'), eliminar_primero=True)
    conjuntoS = leer_archivo(open(self.directorio + 's.csv', 'r'), eliminar_primero=True)
    conjuntoJ = leer_archivo(open(self.directorio + 'j.csv', 'r'), eliminar_primero=True)
    conjuntoO = leer_archivo(open(self.directorio + 'o.csv', 'r'), eliminar_primero=True)
    xmls = self.obtener_xmls()
    # File with the EIDs of the papers that will make up the network
    ##archivo_papers_red = dividir_archivo_fecha(open(self.directorio+'relevantes.csv'), open(self.directorio+'relevantesFecha.csv'), 2013)
    archivo_papers_red = open(self.directorio + 'bi.csv')
    # List of the EIDs of the papers that will make up the network
    lista_papers_red = leer_archivo(archivo_papers_red, eliminar_primero=True)
    # Author-paper pairs of the network
    dicci_contruir_red = obtener_autores(xmls, lista_papers_red)
    # This should contain every author-paper pair in the corpus
    dicci_todos_autores_papers = obtener_autores(xmls, leer_archivo(open(self.directorio + 'bi.csv'), eliminar_primero=True))
    #c.construir_corpus(self.nombre_corpus, busquedaInicial, conjuntoA, conjuntoS, conjuntoJ, conjuntoO, clasificados,
    #                   conjuntos_red=dicci_contruir_red, diccionario_todos_autores=dicci_todos_autores_papers)
    c.construir_corpus(self.nombre_corpus, busquedaInicial, conjuntoA, conjuntoS, conjuntoJ, conjuntoO, clasificados)
Example 3: preprocess

def preprocess(filename, stopword_filename=None, extra_stopwords=None):
    """
    Preprocesses a CSV file and returns a Corpus.

    Arguments:
    filename -- name of CSV file

    Keyword arguments:
    stopword_filename -- name of file containing stopwords
    extra_stopwords -- list of additional stopwords
    """
    stopwords = create_stopword_list(stopword_filename)
    stopwords.update(create_stopword_list(extra_stopwords))

    corpus = Corpus()
    for fields in reader(open(filename), delimiter=','):
        corpus.add(fields[0], tokenize(fields[-1], stopwords))
    corpus.freeze()
    return corpus
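A minimal call sketch for this function is shown below. The file names are hypothetical, and create_stopword_list and tokenize are assumed to be importable from the same module; the CSV is expected to carry an identifier in its first column and the text in its last column, as implied by fields[0] and fields[-1] above.

# Hypothetical file names, for illustration only.
corpus = preprocess('documents.csv',
                    stopword_filename='stopwords.txt',
                    extra_stopwords=['via', 'rt'])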
Example 4: test

def test(self, path):
    corp = Corpus(path)
    bs = Bayesian()
    count = 0
    sender_bl = load_pickle('sender_bl.pickle')
    # scan each email and decide whether the message is SPAM or HAM:
    # first check whether the sender occurs in the sender blacklist,
    # then compute the spamicity of each word using the Bayes approach
    for fname, body in corp.emails():
        sender = find_sender(body)
        if sender in sender_bl:
            self.tag_it(path, fname, 'SPAM')
            continue
        spamicity_list = []
        count += 1
        tokens = tokenize(body)
        # compute spamicity for each word and collect the values
        for el in tokens:
            word_spamicity = [el, bs.word_spamicity(el)]
            spamicity_list.append(word_spamicity)
        # prepare the list for Bayes
        spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]  # remove duplicates from the list
        spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
        prediction = bs.bayes_pred(spamicity_list[:15])  # consider only the 15 most decisive words
        if prediction > 0.9 or sender in sender_bl:
            self.tag_it(path, fname, 'SPAM')
        else:
            self.tag_it(path, fname, 'OK')
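The snippet does not show how bayes_pred combines the per-word spamicities. A common choice for this kind of filter is the Graham-style naive Bayes combination; the sketch below is an assumption about what such a combination might look like, not the actual Bayesian class from this project.

def bayes_pred(spamicity_list):
    """Combine per-word spamicities into one spam probability (hypothetical sketch)."""
    # spamicity_list holds [token, p(spam | token)] pairs, most decisive first.
    prod_spam = 1.0
    prod_ham = 1.0
    for _token, p in spamicity_list:
        prod_spam *= p
        prod_ham *= (1.0 - p)
    # Graham-style combination: P(spam) = s / (s + h)
    return prod_spam / (prod_spam + prod_ham)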
Example 5: test_interro4

def test_interro4():
    print('Testing interrogation 4')
    corp = Corpus('data/test-stripped-tokenised')
    data = corp.interrogate({'n': 'any'})
    d = {'and interrogating': {'first': 0, 'second': 2},
         'concordancing and': {'first': 0, 'second': 2}}
    assert_equals(data.results.to_dict(), d)
Example 6: main

def main():
    logging.basicConfig(format=DefaultSetting.FORMAT_LOG, level=logging.INFO)
    start_time = datetime.now()
    input_file = 'data/content.with.categories.seg.vni'
    stopwords_file = 'data/stopwords.txt'
    num_topics = 100
    prefix_name = 'demo'
    directory = 'tmp'
    query = 'data/query.txt'

    corpus = Corpus()
    corpus.build_corpus(input_file, stopwords_file, directory, prefix_name)

    LDA = LDAModel()
    LDA.train(corpus.corpus, corpus.dictionary, num_topics, directory, prefix_name)
    LDA.show()

    docsim = DocSim()
    docsim.set_model(LDA.model)
    docsim.set_doc(corpus)
    docsim.vectorized(num_topics)
    # docsim.save(directory, prefix_name)
    print('Training time: ', datetime.now() - start_time)

    start_time = datetime.now()
    reader = codecs.open(query, 'r', 'utf8')
    documents = []
    for line in reader.readlines():
        documents.append(line.replace('\n', ''))
    docsim.query(documents, True, directory, prefix_name)
    docsim.query(documents, False, directory, prefix_name)
    print('Query time: ', datetime.now() - start_time)
Example 7: _process_nonexp_sense

def _process_nonexp_sense(self, articles, which):
    nonexp_feat_name = FILE_PATH + '/../tmp/nonexp.feat'
    nonexp_sense_file = codecs.open(nonexp_feat_name, 'w', 'utf-8')
    nonexpParser = NonExplicit()  # change name later
    for art in articles:
        self.generate_nonexp_relations(art)
        for rel in art.nonexp_relations:
            nonexpParser.print_features(rel, ['xxxxx'], nonexp_sense_file)
    nonexp_sense_file.close()
    nonexp_pred_name = FILE_PATH + '/../tmp/nonexp.pred'
    Corpus.test_with_opennlp(nonexp_feat_name, nonexpParser.model_file, nonexp_pred_name)
    nonexp_res = [l.strip().split()[-1] for l in codecs.open(nonexp_pred_name, 'r', 'utf-8')]
    rid = 0
    for art in articles:
        for rel in art.nonexp_relations:
            pred_sense = nonexp_res[rid]
            if pred_sense == 'EntRel':
                r_type = 'EntRel'
            elif pred_sense == 'NoRel':
                r_type = 'NoRel'
            else:
                r_type = 'Implicit'
            rel.rel_type = r_type
            rel.sense = [pred_sense]
            rid += 1
    assert len(nonexp_res) == rid, 'nonexp relations size not match'
Example 8: _process_parsed_conn

def _process_parsed_conn(self, articles, which='test'):
    """
    Generate an explicit relation for each true discourse connective.
    """
    connParser = Connective()
    conn_feat_name = FILE_PATH + '/../tmp/conn.feat'
    conn_feat_file = codecs.open(conn_feat_name, 'w', 'utf-8')
    checked_conns = []
    for art in articles:
        checked_conns.append(connParser.print_features(art, which, conn_feat_file))
    conn_feat_file.close()
    conn_pred_name = FILE_PATH + '/../tmp/conn.pred'
    Corpus.test_with_opennlp(conn_feat_name, connParser.model_file, conn_pred_name)
    conn_res = [l.strip().split()[-1] for l in codecs.open(conn_pred_name, 'r', 'utf-8')]
    assert len(checked_conns) == len(articles), 'article size not match'
    s = 0
    for art, cand_conns in zip(articles, checked_conns):
        length = len(cand_conns)
        cand_res = conn_res[s:s+length]
        s += length
        for conn, label in zip(cand_conns, cand_res):
            if label == '1':
                rel = Relation()
                rel.doc_id = art.id
                rel.rel_type = 'Explicit'
                rel.article = art
                rel.conn_leaves = conn
                rel.conn_addr = [n.leaf_id for n in conn]
                art.exp_relations.append(rel)
    assert s == len(conn_res), 'conn size not match'
Example 9: prepare_data

def prepare_data(self, parse_path, rel_path, which, to_file):
    rel_dict = Corpus.read_relations(rel_path)
    for art in Corpus.read_parses(parse_path, rel_dict):
        for rel in art.relations:
            if rel.rel_type != 'Explicit':
                continue
            rel.article = art
            rel.get_conn_leaves()
        self.print_features(art, which, to_file)
Example 10: generateData

def generateData():
    rep = Representor(None, 'citybeat', 'next_week_candidate_event_25by25_merged')
    corpus = Corpus()
    corpus.buildCorpusOnDB('citybeat', 'next_week_candidate_event_25by25_merged')
    true_event_list, false_event_list = loadNextWeekData()
    EventFeatureTwitter(None).GenerateArffFileHeader()
    for event in true_event_list + false_event_list:
        EventFeatureTwitter(event, corpus, rep).printFeatures()
Example 11: test_bb_target_state_halfed

def test_bb_target_state_halfed(self):
    feature_table = FeatureTable.load(get_feature_table_fixture("a_b_and_cons_feature_table.json"))
    constraint_set = ConstraintSet.load(get_constraint_set_fixture("bb_target_constraint_set.json"),
                                        feature_table)
    target_lexicon_words = Corpus.load(get_corpus_fixture("bb_target_lexicon_halfed.txt")).get_words()
    lexicon = Lexicon(target_lexicon_words, feature_table)
    grammar = Grammar(feature_table, constraint_set, lexicon)
    corpus = Corpus.load(get_corpus_fixture("bb_corpus.txt"))
    traversable_hypothesis = TraversableGrammarHypothesis(grammar, corpus)
    self.assertEqual(traversable_hypothesis.get_energy(), 407430)
Example 12: test_parse

def test_parse():
    import shutil
    print('Testing parser')
    unparsed = Corpus(unparsed_path)
    try:
        shutil.rmtree('data/test-parsed')
    except:
        pass
    parsed = unparsed.parse()
    assert_equals(list([i.name for i in parsed.files]), ['intro.txt.xml', 'body.txt.xml'])
Example 13: test_parse_speakseg

def test_parse_speakseg(skipassert=False):
    print('Testing parser with speaker segmentation')
    unparsed = Corpus(unparsed_path)
    import shutil
    try:
        shutil.rmtree(parsed_path)
    except:
        pass
    parsed = unparsed.parse(speaker_segmentation=True)
    if not skipassert:
        assert_equals(list([i.name for i in parsed.files]), ['intro.txt.xml', 'body.txt.xml'])
Example 14: _get_corpus

def _get_corpus(self):
    self.training_corpus = Corpus()
    self.training_corpus.load_from_file(self.training_corpus_f)
    self.unlabeled_corpus = Corpus()
    self.unlabeled_corpus.load_from_file(self.u_corpus_f)
    self.test_corpus = Corpus()
    self.test_corpus.load_from_file(self.test_corpus_f)
    self.user_corpus = Corpus()
Example 15: scoring

def scoring(self, method='zagibolov'):
    # Supply arguments to Corpus to connect to the database: user, password and db.
    corpus = Corpus(password='', db='project_major')
    corpus.getTweets()
    dataset = corpus.dataSet
    preprocess = Preprocess('zagibolov', self.lexicons, self.negatives, self.stopWords)
    scoring = Scoring(method, self.lexicons, self.negatives, self.stopWords, self.seeds)
    j = 0
    for data in dataset:
        preprocess.preprocessScoring(data)
    processed = preprocess.processed_data
    for data in processed:
        scoring.count(data['tweet'])
    ## print self.seeds
    preprocess.seeds = scoring.lexicon_count
    preprocess.processLexicon()
    scoring.lexicons = preprocess.lexicons
    ## print scoring.lexicon_count
    last_score = {}
    i = 0
    for i in range(0, 3):
        total = 0
        j = 0
        negative = 0
        positive = 0
        scoring.resetLexiconCount()
        ## print self.lexicons
        for data in processed:
            if j == 50:
                break
            j += 1
            score = scoring.score(data)
            if score != 0:
                total += 1
                if score < 0:
                    negative += 1
                else:
                    positive += 1
        scoring.adjustScoring()
        if last_score == {}:
            last_score = scoring.lexicons
            this_score = last_score
        else:
            this_score = scoring.lexicons
            if this_score == last_score:
                break
            else:
                last_score = this_score
        print(this_score)
        print("Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive)
    print(this_score)
    print("Total scored: " + str(total), "Negative: ", negative, "Positive: ", positive)