

Python corpus.Corpus Class Code Examples

This article collects typical usages and code examples of the Python class corpus.Corpus. If you are wondering what corpus.Corpus does, how to use it, or want to see it in real projects, the curated class code examples below should help.


The following presents 15 code examples of the Corpus class, sorted by popularity by default.
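Note that corpus.Corpus is not a standard-library class: every project below ships its own corpus module, so constructors and methods differ from example to example. Two recurring usage patterns are sketched here with illustrative arguments:

from corpus import Corpus  # project-local module in each repository below

# Pattern 1: wrap a data source in an instance, then call its methods
corp = Corpus('data/my-corpus')  # path-style constructor, as in Examples 4, 5 and 12
# result = corp.interrogate({'n': 'any'})  # instance methods vary per project

# Pattern 2: call class-level helpers directly, without an instance
rel_dict = Corpus.read_relations('relations.json')  # as in Examples 1 and 9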

Example 1: output_json_format

    def output_json_format(self, parse_path, rel_path):
        preds = [it.strip().split()[-1] for it in open(self.predicted_file)]
        rel_dict = Corpus.read_relations(rel_path)
        idx = 0
        for art in Corpus.read_parses(parse_path, rel_dict):
            for rel in art.relations:
                if rel.rel_type == 'Explicit':
                    continue
                pred_sense = preds[idx]
                json_dict = {}
                json_dict['DocID'] = rel.doc_id
                if pred_sense == 'EntRel':
                    r_type = 'EntRel'
                elif pred_sense == 'NoRel':
                    r_type = 'NoRel'
                else:
                    r_type = 'Implicit'

                json_dict['Type'] = r_type
                json_dict['Sense'] = [pred_sense.replace('_', ' ')]
                json_dict['Connective'] = {}
                json_dict['Connective']['TokenList'] = []
                json_dict['Arg1'] = {}
                json_dict['Arg1']['TokenList'] = []
                json_dict['Arg2'] = {}
                json_dict['Arg2']['TokenList'] = []
                print(json.dumps(json_dict))  # json must be imported at module level
                idx += 1
Developer: qcl6355 | Project: conll2016 | Lines: 28 | Source: nonexp.py
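For reference, each line printed above is one JSON object describing a non-explicit relation. With the fields populated as in the loop, a line takes the following shape (the DocID and sense values here are invented for illustration):

{"DocID": "wsj_2200", "Type": "Implicit", "Sense": ["Expansion.Conjunction"], "Connective": {"TokenList": []}, "Arg1": {"TokenList": []}, "Arg2": {"TokenList": []}}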

Example 2: contruir_corpus_experimento

    def contruir_corpus_experimento(self):
        '''Builds the experiment dataset.'''
        c = Corpus()
        if self.tamanio == 'BI':
            busquedaInicial = leer_archivo(open(self.directorio + 'bi.csv', 'r'), eliminar_primero=True)
            clasificados = leer_archivo(open(self.directorio + 'clasificados.csv', 'r'), eliminar_primero=True)
        elif self.tamanio == 'Univ':
            busquedaInicial = leer_archivo(open(self.directorio + 'dataPapers.csv', 'r'), eliminar_primero=True)
            clasificados = leer_archivo(open(self.directorio + 'validacion.csv', 'r'), eliminar_primero=True)
        conjuntoA = leer_archivo(open(self.directorio + 'a.csv', 'r'), eliminar_primero=True)
        conjuntoS = leer_archivo(open(self.directorio + 's.csv', 'r'), eliminar_primero=True)
        conjuntoJ = leer_archivo(open(self.directorio + 'j.csv', 'r'), eliminar_primero=True)
        conjuntoO = leer_archivo(open(self.directorio + 'o.csv', 'r'), eliminar_primero=True)

        xmls = self.obtener_xmls()

        # Files with the EIDs of the papers that will form the network
        ##archivo_papers_red = dividir_archivo_fecha(open(self.directorio+'relevantes.csv'), open(self.directorio+'relevantesFecha.csv'), 2013)
        archivo_papers_red = open(self.directorio + 'bi.csv')
        # List of the EIDs of the papers that will form the network
        lista_papers_red = leer_archivo(archivo_papers_red, eliminar_primero=True)
        # Author-paper pairs of the network
        dicci_contruir_red = obtener_autores(xmls, lista_papers_red)
        # This should contain every author-paper pair in the corpus
        dicci_todos_autores_papers = obtener_autores(xmls, leer_archivo(open(self.directorio + 'bi.csv'), eliminar_primero=True))
        #c.construir_corpus(self.nombre_corpus, busquedaInicial, conjuntoA, conjuntoS, conjuntoJ, conjuntoO, clasificados,
        #                   conjuntos_red=dicci_contruir_red, diccionario_todos_autores=dicci_todos_autores_papers)
        c.construir_corpus(self.nombre_corpus, busquedaInicial, conjuntoA, conjuntoS, conjuntoJ, conjuntoO, clasificados)
Developer: cesardlq | Project: Tesis | Lines: 28 | Source: experimento.py

Example 3: preprocess

def preprocess(filename, stopword_filename=None, extra_stopwords=None):
    """
    Preprocesses a CSV file and returns a Corpus built from it.

    Arguments:

    filename -- name of CSV file

    Keyword arguments:

    stopword_filename -- name of file containing stopwords
    extra_stopwords -- list of additional stopwords
    """

    stopwords = create_stopword_list(stopword_filename)
    stopwords.update(create_stopword_list(extra_stopwords))

    corpus = Corpus()

    for fields in reader(open(filename), delimiter=','):
        corpus.add(fields[0], tokenize(fields[-1], stopwords))

    corpus.freeze()

    return corpus
Developer: kuntu | Project: networkdate | Lines: 25 | Source: preprocess.py
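A typical call, assuming each CSV row holds a document identifier in its first field and the document text in its last field (the file names and extra stopwords here are placeholders):

corpus = preprocess('documents.csv',
                    stopword_filename='stopwords.txt',
                    extra_stopwords=['http', 'rt'])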

Example 4: test

    def test(self, path):
        corp = Corpus(path)
        bs = Bayesian()
        count = 0
        sender_bl = load_pickle('sender_bl.pickle')
        # Scan each email and decide whether it is SPAM or HAM:
        # first check whether the sender occurs in the sender blacklist,
        # then compute the spamicity of each word using the Bayes approach.
        for fname, body in corp.emails():
            sender = find_sender(body)
            if sender in sender_bl:
                self.tag_it(path, fname, 'SPAM')
                continue

            spamicity_list = []
            count += 1
            tokens = tokenize(body)
            # compute spamicity for each word and create list of the values
            for el in tokens:
                word_spamicity = [el, bs.word_spamicity(el)]
                spamicity_list.append(word_spamicity)
            # prepare list for Bayes
            spamicity_list = [list(i) for i in set(map(tuple, spamicity_list))]  # remove duplicates from list
            spamicity_list.sort(key=lambda x: abs(0.5 - x[1]), reverse=True)
            prediction = bs.bayes_pred(spamicity_list[:15])  # Consider only 15 'words'
            if prediction > 0.9:  # blacklisted senders were already tagged above
                self.tag_it(path, fname, 'SPAM')
            else:
                self.tag_it(path, fname, 'OK')
Developer: unacau | Project: bayesian-spam-filtering | Lines: 29 | Source: filter.py
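Example 4 ranks tokens by how far their spamicity lies from the neutral 0.5 and passes the 15 most decisive ones to bs.bayes_pred. That method is not shown; a minimal sketch, assuming the standard naive-Bayes combining rule, could look like this:

    def bayes_pred(self, spamicity_list):
        # Combine per-word spam probabilities into one message-level score.
        # spamicity_list holds [token, probability] pairs as built in test().
        prod_spam, prod_ham = 1.0, 1.0
        for _token, p in spamicity_list:
            prod_spam *= p
            prod_ham *= 1.0 - p
        return prod_spam / (prod_spam + prod_ham)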

Example 5: test_interro4

def test_interro4():
    print('Testing interrogation 4')
    corp = Corpus('data/test-stripped-tokenised')
    data = corp.interrogate({'n': 'any'})
    d = {'and interrogating': {'first': 0, 'second': 2},
         'concordancing and': {'first': 0, 'second': 2}}
    assert_equals(data.results.to_dict(), d)
Developer: maxdesp | Project: corpkit | Lines: 7 | Source: nosetests.py

Example 6: main

import codecs
import logging
from datetime import datetime

# Corpus, LDAModel, DocSim and DefaultSetting are project-local modules.

def main():
    logging.basicConfig(format=DefaultSetting.FORMAT_LOG, level=logging.INFO)

    start_time = datetime.now()

    input_file = 'data/content.with.categories.seg.vni'
    stopwords_file = 'data/stopwords.txt'
    num_topics = 100
    prefix_name = 'demo'
    directory = 'tmp'
    query = 'data/query.txt'

    corpus = Corpus()
    corpus.build_corpus(input_file, stopwords_file, directory, prefix_name)
    LDA = LDAModel()
    LDA.train(corpus.corpus, corpus.dictionary, num_topics, directory, prefix_name)
    LDA.show()

    docsim = DocSim()
    docsim.set_model(LDA.model)
    docsim.set_doc(corpus)
    docsim.vectorized(num_topics)
    # docsim.save(directory, prefix_name)

    print('Training time:', datetime.now() - start_time)

    start_time = datetime.now()
    reader = codecs.open(query, 'r', 'utf8')
    documents = []
    for line in reader.readlines():
        documents.append(line.replace('\n', ''))
    docsim.query(documents, True, directory, prefix_name)
    docsim.query(documents, False, directory, prefix_name)
    print('Query time:', datetime.now() - start_time)
Developer: khoaipx | Project: Document-Similarity | Lines: 34 | Source: main.py
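The corpus.dictionary and corpus.corpus attributes handed to LDA.train suggest the LDAModel wrapper builds on gensim. Under that assumption (the wrapper itself is not shown), the core of build_corpus and train reduces to roughly:

from gensim import corpora, models

# texts would come from input_file after word segmentation and stopword removal
texts = [['sample', 'document', 'tokens'], ['another', 'document']]

dictionary = corpora.Dictionary(texts)               # -> corpus.dictionary
bow_corpus = [dictionary.doc2bow(t) for t in texts]  # -> corpus.corpus
lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=100)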

Example 7: _process_nonexp_sense

    def _process_nonexp_sense(self, articles, which):
        nonexp_feat_name = FILE_PATH + '/../tmp/nonexp.feat'
        nonexp_sense_file = codecs.open(nonexp_feat_name, 'w', 'utf-8')
        nonexpParser = NonExplicit()  # change name later
        for art in articles:
            self.generate_nonexp_relations(art)
            for rel in art.nonexp_relations:
                nonexpParser.print_features(rel, ['xxxxx'], nonexp_sense_file)
        nonexp_sense_file.close()
        nonexp_pred_name = FILE_PATH + '/../tmp/nonexp.pred'
        Corpus.test_with_opennlp(nonexp_feat_name, nonexpParser.model_file, nonexp_pred_name)
        nonexp_res = [l.strip().split()[-1] for l in codecs.open(nonexp_pred_name, 'r', 'utf-8')]

        rid = 0
        for art in articles:
            for rel in art.nonexp_relations:
                pred_sense = nonexp_res[rid]
                if pred_sense == 'EntRel':
                    r_type = 'EntRel'
                elif pred_sense == 'NoRel':
                    r_type = 'NoRel'
                else:
                    r_type = 'Implicit'
                rel.rel_type = r_type
                rel.sense = [pred_sense]
                rid += 1

        assert len(nonexp_res) == rid, 'nonexp relations size not match'
Developer: qcl6355 | Project: conll2016 | Lines: 28 | Source: end2end.py
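The pipeline design is worth noting: features are written to a temporary file, classification is delegated to an external OpenNLP model through Corpus.test_with_opennlp, and the predictions are read back and paired with the relations in the same traversal order. The final assert on rid guards against the two sequences drifting out of alignment.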

Example 8: _process_parsed_conn

    def _process_parsed_conn(self, articles, which='test'):
        """
        generate an explicit relation for each true discourse connective
        """
        connParser = Connective()
        conn_feat_name = FILE_PATH + '/../tmp/conn.feat'
        conn_feat_file = codecs.open(conn_feat_name, 'w', 'utf-8')
        checked_conns = []
        for art in articles:
            checked_conns.append(connParser.print_features(art, which, conn_feat_file))
        conn_feat_file.close()
        conn_pred_name = FILE_PATH + '/../tmp/conn.pred'
        Corpus.test_with_opennlp(conn_feat_name, connParser.model_file, conn_pred_name)
        conn_res = [l.strip().split()[-1] for l in codecs.open(conn_pred_name, 'r', 'utf-8')]
        assert len(checked_conns) == len(articles), 'article size not match'
        s = 0
        for art, cand_conns in zip(articles, checked_conns):
            length = len(cand_conns)
            cand_res = conn_res[s:s+length]
            s += length
            for conn, label in zip(cand_conns, cand_res):
                if label == '1':
                    rel = Relation()
                    rel.doc_id = art.id
                    rel.rel_type = 'Explicit'
                    rel.article = art
                    rel.conn_leaves = conn
                    rel.conn_addr = [n.leaf_id for n in conn]
                    art.exp_relations.append(rel)
        assert s == len(conn_res), 'conn size not match'
Developer: qcl6355 | Project: conll2016 | Lines: 30 | Source: end2end.py

Example 9: prepare_data

    def prepare_data(self, parse_path, rel_path, which, to_file):
        rel_dict = Corpus.read_relations(rel_path)
        for art in Corpus.read_parses(parse_path, rel_dict):
            for rel in art.relations:
                if rel.rel_type != 'Explicit':
                    continue
                rel.article = art
                rel.get_conn_leaves()
            self.print_features(art, which, to_file)
Developer: qcl6355 | Project: conll2016 | Lines: 9 | Source: connective.py

Example 10: generateData

def generateData():
    rep = Representor(None, 'citybeat', 'next_week_candidate_event_25by25_merged')
    corpus = Corpus()
    corpus.buildCorpusOnDB('citybeat', 'next_week_candidate_event_25by25_merged')
    true_event_list, false_event_list = loadNextWeekData()
    EventFeatureTwitter(None).GenerateArffFileHeader()

    for event in true_event_list + false_event_list:
        EventFeatureTwitter(event, corpus, rep).printFeatures()
Developer: daifanxiang | Project: CityBeat | Lines: 9 | Source: process_data_next_week.py

Example 11: test_bb_target_state_halfed

    def test_bb_target_state_halfed(self):
        feature_table = FeatureTable.load(get_feature_table_fixture("a_b_and_cons_feature_table.json"))
        constraint_set = ConstraintSet.load(get_constraint_set_fixture("bb_target_constraint_set.json"),
                                            feature_table)
        target_lexicon_words = Corpus.load(get_corpus_fixture("bb_target_lexicon_halfed.txt")).get_words()
        lexicon = Lexicon(target_lexicon_words, feature_table)
        grammar = Grammar(feature_table, constraint_set, lexicon)
        corpus = Corpus.load(get_corpus_fixture("bb_corpus.txt"))
        traversable_hypothesis = TraversableGrammarHypothesis(grammar, corpus)
        self.assertEqual(traversable_hypothesis.get_energy(), 407430)
Developer: taucompling | Project: otml | Lines: 10 | Source: test_traversable_grammar_hypothesis.py
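Pinning get_energy() to an exact integer (407430) makes this a regression test: any change to the feature table, constraint set, or corpus encoding shifts the hypothesis energy and fails the comparison, even if the learner still behaves sensibly.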

Example 12: test_parse

def test_parse():
    import shutil
    print('Testing parser')
    unparsed = Corpus(unparsed_path)
    try:
        shutil.rmtree('data/test-parsed')
    except OSError:  # the directory may not exist yet
        pass
    parsed = unparsed.parse()
    assert_equals(list([i.name for i in parsed.files]), ['intro.txt.xml', 'body.txt.xml'])
Developer: xsongx | Project: corpkit | Lines: 10 | Source: nosetests.py

Example 13: test_parse_speakseg

def test_parse_speakseg(skipassert=False):
    print('Testing parser with speaker segmentation')
    unparsed = Corpus(unparsed_path)
    import shutil
    try:
        shutil.rmtree(parsed_path)
    except OSError:  # the directory may not exist yet
        pass
    parsed = unparsed.parse(speaker_segmentation=True)
    if not skipassert:
        assert_equals(list([i.name for i in parsed.files]), ['intro.txt.xml', 'body.txt.xml'])
Developer: xsongx | Project: corpkit | Lines: 11 | Source: nosetests.py

Example 14: _get_corpus

    def _get_corpus(self):
        self.training_corpus = Corpus()
        self.training_corpus.load_from_file(self.training_corpus_f)

        self.unlabeled_corpus = Corpus()
        self.unlabeled_corpus.load_from_file(self.u_corpus_f)

        self.test_corpus = Corpus()
        self.test_corpus.load_from_file(self.test_corpus_f)

        self.user_corpus = Corpus()
Developer: mit0110 | Project: tesis | Lines: 11 | Source: activepipe.py
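The four corpora loaded here (training, unlabeled, test, and user) match the usual split of an active-learning pipeline, as the source file name activepipe.py also suggests: a model is fitted on the labeled training set, queries are drawn from the unlabeled pool, user answers accumulate in the user corpus, and the held-out test set measures progress.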

Example 15: scoring

    def scoring(self, method='zagibolov'):
        # Pass arguments to Corpus to connect to the database: user, password and db.
        corpus = Corpus(password='', db='project_major')
        corpus.getTweets()
        dataset = corpus.dataSet
        preprocess = Preprocess('zagibolov', self.lexicons, self.negatives, self.stopWords)
        scoring = Scoring(method, self.lexicons, self.negatives, self.stopWords, self.seeds)
        j = 0
        for data in dataset:
            preprocess.preprocessScoring(data)
            processed = preprocess.processed_data

        for data in processed:
            scoring.count(data['tweet'])
        ## print self.seeds
        preprocess.seeds = scoring.lexicon_count
        preprocess.processLexicon()
        scoring.lexicons = preprocess.lexicons
        ## print scoring.lexicon_count
        last_score = {}
        for i in range(0, 3):
            total = 0
            j = 0
            negative = 0
            positive = 0
            scoring.resetLexiconCount()
            ## print self.lexicons
            for data in processed:
                if j == 50:
                    break
                j += 1
                score = scoring.score(data)
                if score != 0:
                    total += 1
                    if score < 0:
                        negative += 1
                    else:
                        positive += 1
            scoring.adjustScoring()
            if last_score == {}:
                last_score = scoring.lexicons
                this_score = last_score
            else:
                this_score = scoring.lexicons
                if this_score == last_score:
                    break
                else:
                    last_score = this_score
            print(this_score)
            print("Total scored:", total, "Negative:", negative, "Positive:", positive)
        print(this_score)
        print("Total scored:", total, "Negative:", negative, "Positive:", positive)
Developer: neerajkhandelwal | Project: Movie-ter | Lines: 53 | Source: main.py
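Structurally this is iterative self-training: each pass scores at most 50 tweets (the j == 50 guard), adjusts the lexicon weights with adjustScoring(), and stops early once the lexicon no longer changes between passes (this_score == last_score); the for i in range(0, 3) loop caps the process at three iterations regardless.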


Note: the corpus.Corpus class examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other open-source code and documentation platforms; the snippets were selected from open-source projects contributed by their respective developers. Copyright in the source code remains with the original authors; for terms of distribution and use, consult the corresponding project's License. Do not reproduce without permission.