Python stanford.POSTagger类代码示例

本文整理汇总了Python中nltk.tag.stanford.POSTagger类的典型用法代码示例。如果您正苦于以下问题：Python POSTagger类的具体用法？Python POSTagger怎么用？Python POSTagger使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。

在下文中一共展示了POSTagger类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: nltk_stanfordpos

def nltk_stanfordpos(inpath, outfolder):
    """POS-tag French text files with the Stanford POS tagger via NLTK.

    Every file matching the glob pattern ``inpath`` is read as UTF-8, split
    on whitespace and tagged with the ``french.tagger`` model. The result is
    written as UTF-8 to ``outfolder`` under the input file's base name, one
    ``token<TAB>tag`` pair per line.

    Args:
        inpath: Glob pattern selecting the input text files.
        outfolder: Directory receiving the tagged files; created if missing.
    """
    print("\nLaunched nltk_stanfordpos.")

    import os
    import glob
    from nltk.tag.stanford import POSTagger

    paths = glob.glob(inpath)
    if paths:
        # The tagger configuration does not depend on the file, so it is
        # built once instead of once per input file. It is still only built
        # (and the output folder only created) when there is work to do.
        st = POSTagger(
            '/home/christof/Programs/stanfordpos/models/french.tagger',
            '/home/christof/Programs/stanfordpos/stanford-postagger.jar',
            encoding="utf8",
        )
        os.makedirs(outfolder, exist_ok=True)

        for path in paths:
            with open(path, "r", encoding="utf-8") as infile:
                untagged = infile.read()
            tagged = st.tag(untagged.split())

            # One "token<TAB>tag" line per tagged item.
            taggedstring = "".join("\t".join(item) + "\n" for item in tagged)

            outpath = os.path.join(outfolder, os.path.basename(path))
            # Write with the same encoding the input was read with rather
            # than the platform default.
            with open(outpath, "w", encoding="utf-8") as output:
                output.write(taggedstring)
    print("Done.")

开发者ID:daschloer，项目名称:tmw，代码行数:27，代码来源:tmw.py

示例2: main

def main():
    """Tag the text file named by ``sys.argv[1]`` and print it sentence-wise.

    The file content is split on newlines, periods and question marks. Each
    non-empty piece is whitespace-tokenized and tagged with the
    ``german-dewac`` model; the tagged pairs are handed to ``stripPieces``
    and whatever it returns is printed joined by single spaces.
    """
    # An alternative model, german-fast.tagger, lives in the same models
    # directory of this Stanford distribution.
    st = POSTagger(
        "/home/shaun/stanford-postagger-full-2013-11-12/models/german-dewac.tagger",
        "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar",
    )

    with open(sys.argv[1], "r") as f:
        content = f.read()

    # Raw string: "\." and "\?" are regex escapes, not string escapes.
    sentences = re.split(r"\n|\.|\?", content)

    for s in sentences:
        if not s:
            continue
        pieces = st.tag(s.split())
        strippedPieces = stripPieces(pieces)

        # Single-argument parenthesized form prints identically under
        # Python 2 and Python 3.
        print(" ".join(strippedPieces))

开发者ID:spattersongt，项目名称:lingq，代码行数:27，代码来源:case_trainer.py

示例3: cleanTokens

def cleanTokens(tokens):
    """Return the longer words of ``tokens`` whose tag starts with NE or NN.

    The tokens are tagged with the ``german-fast`` model. A word is kept
    when its tag begins with ``NE`` or ``NN`` (presumably the noun tags of
    the model's tagset -- confirm) and the word has more than three
    characters.

    Args:
        tokens: List of token strings to tag.

    Returns:
        list: The kept words, in their original order.
    """
    st = POSTagger('/models/german-fast.tagger')
    tags = st.tag(tokens)

    # Each tagged item is indexed as (word, tag); re.match anchors at the
    # start of the tag, so this is a prefix test.
    return [
        pair[0]
        for pair in tags
        if re.match("NE|NN", pair[1]) and len(pair[0]) > 3
    ]

开发者ID:jbrissier，项目名称:gccheck，代码行数:29，代码来源:extract_text.py

示例4: stanford_corenlp_filter

def stanford_corenlp_filter(sent):
  """Reduce a two-block sentence to its stemmed, POS-filtered words.

  ``sent`` is split into exactly two blocks on ``blockSeparator``. Each
  block is lower-cased, tokenized and POS-tagged; words whose tag is in
  ``filterList`` are stemmed and prefixed with the block number ('1' or
  '2') so the same word in different blocks stays distinguishable.

  The original code reset the accumulator before processing the second
  block, silently discarding every word collected from the first block;
  both blocks now contribute to the result.

  Args:
    sent: String containing exactly one ``blockSeparator``.

  Returns:
    str: A leading space followed by the prefixed stems, each followed by
    a single space.

  Raises:
    ValueError: If ``sent`` does not split into exactly two blocks.
  """
  from nltk.tag.stanford import POSTagger
  posTagger = POSTagger('/Users/gt/Downloads/'
                        'stanford-postagger-2013-06-20/models/'
                        'wsj-0-18-bidirectional-nodistsim.tagger',
                        '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                        '/stanford-postagger-3.2.0.jar', encoding=encoding)

  b1, b2 = sent.split(blockSeparator)
  # Only the second block has its trailing whitespace stripped.
  b2 = b2.rstrip()

  pieces = [' ']
  for prefix, block in (('1', b1), ('2', b2)):
    tokens = word_tokenize(block.lower())
    for pos_t in posTagger.tag(tokens):
      if pos_t[1] in filterList:
        # The prefix is attached after stemming, i.e.
        # prefix + stem(word), not stem(prefix + word).
        pieces.append(prefix + stemmer.stem(pos_t[0]) + ' ')

  return ''.join(pieces)

开发者ID:gthandavam，项目名称:Recipes，代码行数:32，代码来源:builder.py

示例5: vectorizer

def vectorizer(tokens, w2v_db):
    """Weight selected tokens by POS tag and fetch their stored vectors.

    Tokens are tagged with the ``english-left3words-distsim`` model. Tokens
    tagged NNP, NNPS or FW add 1.5 to the weight of their lower-cased form,
    tokens tagged NN or NNS add 1; all other tokens are ignored. For every
    weighted word a vector is then read through ``SQLCon``.

    Args:
        tokens: List of token strings.
        w2v_db: Value passed to ``SQLCon`` to open the vector store.

    Returns:
        tuple: ``(unsorted_kw, token_vecs)`` -- two ``OrderedDict`` objects
        in first-occurrence order. ``unsorted_kw`` maps word -> accumulated
        weight; ``token_vecs`` maps word -> vector as a list, containing
        only the words for which ``conn.read`` returned something other
        than ``None``.
    """
    db_path = w2v_db
    # POS tagging
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    tagged_tokens = tagger.tag(tokens)

    unsorted_kw = OrderedDict()
    for (w, t) in tagged_tokens:
        if t in ('NNP', 'NNPS', 'FW'):
            label = 1.5
        elif t in ('NN', 'NNS'):
            label = 1
        else:
            continue
        w = w.lower()
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors of words. Maintain order as in document.
    # The keys of unsorted_kw are already lower-cased and unique, so no
    # further normalisation or duplicate check is needed here.
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    try:
        for word in unsorted_kw:
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
        # Output for debugging; weighted words vs. words with a vector.
        print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    finally:
        # Release the connection even when a lookup fails.
        conn.close()
    return unsorted_kw, token_vecs

开发者ID:suraj813，项目名称:SOMClassifier，代码行数:33，代码来源:pp_v4.py

示例6: postext_st

def postext_st(filename):
    """Sentence-split, tokenize and POS-tag one raw text file.

    The file ``<path_to_raw><filename>.txt`` is read as UTF-8 line by line.
    Each stripped line is split into sentences with the punkt English
    model, each sentence is tokenized with ``StanfordTokenizer`` and each
    token list is tagged with the ``english-bidirectional-distsim`` model.

    Args:
        filename: File name without the ``.txt`` extension.

    Returns:
        list: One entry per sentence, each being the result of
        ``st.tag`` for that sentence's tokens.

    Raises:
        IOError: If ``filename`` is not a string.
    """
    path_to_raw = '/home/cyneo/Work/Scans/Text Version/'

    # IOError (rather than TypeError) is kept so existing callers that
    # catch it keep working.
    if not isinstance(filename, str):
        raise IOError('Filename must be a string')

    # Sentence splitting
    with open(osp.abspath(path_to_raw + filename + '.txt'),
              'r', encoding='utf8') as raw:
        # Initialize the punkt module
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = []
        for line in raw:
            sents.extend(sent_detector.tokenize(line.strip()))

    # Tokenizing: one tokenizer instance is reused for all sentences
    # instead of constructing a new one per sentence.
    from nltk.tokenize.stanford import StanfordTokenizer
    tokenizer = StanfordTokenizer()
    tokenedsents = [tokenizer.tokenize(sentence) for sentence in sents]

    # Parts of speech tagging
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')

    # One tagger result per tokenized sentence.
    posSents = [st.tag(tokens) for tokens in tokenedsents]

    return posSents

开发者ID:cyneo，项目名称:feminism，代码行数:34，代码来源:adjective+extract.py

示例7: createModel

def _train_and_pickle(training_set, algorithm, pickle_path):
    """Train a maxent classifier on ``training_set`` and pickle it to ``pickle_path``."""
    classifier = maxent.MaxentClassifier.train(training_set, algorithm)
    # The context manager closes the file even if pickling fails.
    with open(pickle_path, 'wb') as f:
        pickle.dump(classifier, f)
    return classifier


def createModel():
    """Train and pickle the five word-specific maxent classifiers.

    Every sentence yielded by ``brown.sents()`` is tagged with the
    ``english-bidirectional-distsim`` model and passed to the five
    ``pos_*features`` extractors; non-empty results are collected into one
    training set per target word. A classifier is then trained per set,
    stored in the matching module-level global (``classifierit``,
    ``classifierloose``, ``classifieryou``, ``classifierto``,
    ``classifiertheir``) and pickled to ``<global name>.pickle`` in the
    current working directory.
    """
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet = []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        # Progress output; the single-argument parenthesized form prints
        # identically under Python 2 and Python 3.
        print(line)
        tagSent = st.tag(line)
        print(tagSent)
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)

    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]
    classifierit = _train_and_pickle(trainingitSet, algorithm, 'classifierit.pickle')
    classifierloose = _train_and_pickle(traininglooseSet, algorithm, 'classifierloose.pickle')
    classifieryou = _train_and_pickle(trainingyouSet, algorithm, 'classifieryou.pickle')
    classifierto = _train_and_pickle(trainingtoSet, algorithm, 'classifierto.pickle')
    classifiertheir = _train_and_pickle(trainingtheirSet, algorithm, 'classifiertheir.pickle')

开发者ID:siddharthasandhu，项目名称:NLPProjects，代码行数:59，代码来源:stanLearn.py

示例8: stanford_tag

def stanford_tag(sentence):
    """Tag a single tokenized sentence with the Stanford tagger.

    The model and jar locations come from ``src.experiment.path``; the JVM
    is started with the options ``-Xmx16g -XX:MaxPermSize=256m``.

    Returns whatever ``POSTagger.tag`` returns for ``sentence``.
    """
    import src.experiment.path as path
    model_file = path.stanford_tagger_model_path()
    jar_file = path.stanford_tagger_path()
    st = POSTagger(model_file, jar_file,
                   java_options='-Xmx16g -XX:MaxPermSize=256m')
    result = st.tag(sentence)
    return result

开发者ID:fashandge，项目名称:deja，代码行数:8，代码来源:utilities.py

示例9: tag

def tag(segments):
    """Return one tagged string per segment.

    Each segment is tokenized with ``word_tokenize`` and tagged with the
    ``english-left3words-distsim`` model found under ``stanford_path``.
    The tagged pairs are rendered with ``nltk.tag.tuple2str``, joined by
    single spaces and decoded from UTF-8.
    """
    model_file = os.path.join(stanford_path, 'models/english-left3words-distsim.tagger')
    jar_file = os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar')
    st = POSTagger(model_file, jar_file)

    def render(segment):
        # NOTE(review): the .decode call implies the joined value is a
        # byte string (Python 2 str) -- confirm before porting.
        pairs = st.tag(word_tokenize(segment))
        joined = ' '.join(nltk.tag.tuple2str(pair) for pair in pairs)
        return joined.decode('utf-8')

    return [render(segment) for segment in segments]

开发者ID:bwallace，项目名称:irony-redux，代码行数:9，代码来源:extract_and_tag.py

示例10: spanish_pos

def spanish_pos(text):
    """Tag whitespace-separated Spanish text with the Stanford tagger.

    ``text`` is encoded to UTF-8, split on whitespace and tagged with the
    ``spanish-distsim`` model. Returns the result of ``POSTagger.tag``.
    """
    model_file = '/Users/Lena/src/context/stanford-postagger/models/spanish-distsim.tagger'
    jar_file = '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar'

    encoded = text.encode('utf8')
    # 'utf8' is the third positional argument (presumably the tagger's
    # encoding parameter -- confirm against the installed NLTK version).
    st = POSTagger(model_file, jar_file, 'utf8')
    return st.tag(encoded.split())

开发者ID:lenazun，项目名称:context，代码行数:11，代码来源:spanish_processing.py

示例11: german_pos

def german_pos(text):
    """Tag whitespace-separated German text with the Stanford tagger.

    ``text`` is encoded to UTF-8, split on whitespace and tagged with the
    ``german-fast`` model. Returns the result of ``POSTagger.tag``.
    """
    model_file = '/Users/Lena/src/context/stanford-postagger/models/german-fast.tagger'
    jar_file = '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar'

    encoded = text.encode('utf8')
    # 'utf8' is the third positional argument (presumably the tagger's
    # encoding parameter -- confirm against the installed NLTK version).
    st = POSTagger(model_file, jar_file, 'utf8')
    return st.tag(encoded.split())

开发者ID:lenazun，项目名称:context，代码行数:11，代码来源:german_processing.py

示例12: stanford_batch_tag

def stanford_batch_tag(sentences):
    """Batch-tag a list of tokenized sentences with the Stanford tagger.

    Returns whatever ``POSTagger.batch_tag`` returns for ``sentences``.
    """
    import src.experiment.path as path
    # The model and tagger-jar paths must point at the Stanford tagger
    # installed on your machine. They are obtained from two helper
    # functions here; hard-coding the paths works just as well.
    model_file = path.stanford_tagger_model_path()
    jar_file = path.stanford_tagger_path()
    st = POSTagger(model_file, jar_file)
    return st.batch_tag(sentences)

开发者ID:fashandge，项目名称:deja，代码行数:11，代码来源:utilities.py

示例13: pos_tag

def pos_tag(texts):
    """Tag a collection of tokenized texts for the configured language.

    The model is chosen from the module-level ``language`` value; the jar
    and model files are located below ``config.mainpath``.

    Args:
        texts: Passed unchanged to ``POSTagger.tag_sents``.

    Returns:
        The result of ``POSTagger.tag_sents(texts)``.

    Raises:
        ValueError: If ``language`` is neither ``"german"`` nor
            ``"english"``. Previously this case surfaced as an
            UnboundLocalError on ``model``.
    """
    from nltk.tag.stanford import POSTagger

    model_files = {
        "german": "german-fast.tagger",
        "english": "english-bidirectional-distsim.tagger",
    }
    if language not in model_files:
        raise ValueError(
            "Unsupported language for POS tagging: %r" % (language,))

    jar = config.mainpath + "analyze/SPOS/stanford-postagger.jar"
    model = config.mainpath + "analyze/SPOS/models/" + model_files[language]
    tagger = POSTagger(model, path_to_jar=jar, encoding="UTF-8")

    return tagger.tag_sents(texts)

开发者ID:chreman，项目名称:output_BA，代码行数:12，代码来源:parallel_preprocessing.py

示例14: main

def main():
    """Tag every tweet of the input CSV and write the enriched rows out.

    Rows of ``tweets_a_procesar_v2.csv`` are wrapped in ``Tweet`` objects.
    The ``spanish_text`` of each tweet is split on whitespace and tagged
    with the ``spanish-distsim`` model. Words whose tag starts with 'a',
    'r', 'n' or 'v' are collected as ``important_words``; the tagged pairs
    and that list are stored on the tweet, and ``tweet.to_CSV()`` is
    written to ``output_tagged_v2.csv``. The running count is printed
    every 100 tweets.
    """
    # Tag initials that mark a word as important (presumably adjective,
    # adverb, noun and verb in the Spanish model's tagset -- confirm).
    important_initials = ('a', 'r', 'n', 'v')

    # Single-argument parenthesized prints behave identically under
    # Python 2 and Python 3.
    print("Inicio...")
    # NOTE(review): the binary file modes match Python 2's csv module;
    # they need revisiting before running under Python 3.
    with open("tweets_a_procesar_v2.csv", 'rb') as csvfile:
        lines = csv.reader(csvfile, delimiter=DELIMITER, quotechar="'")
        # All tweets to process.
        tweets = [Tweet(line) for line in lines]

    # Stanford NLP tagger with the Spanish model.
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish-distsim.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')

    # Output file; the context manager closes it even if tagging fails.
    with open("output_tagged_v2.csv", 'wb') as output:
        filewriter = csv.writer(output, delimiter=DELIMITER, quotechar="'")

        for n, tweet in enumerate(tweets, 1):
            print(tweet.spanish_text)
            # Example: st.tag('What is the airspeed of an unladen swallow ?'.split())
            # gives [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ...]
            tweet_tagged = st.tag((tweet.spanish_text).split())

            # tag[1][:1] is '' for an empty tag, which matches nothing.
            important_words = [
                tag[0] for tag in tweet_tagged
                if tag[1][:1] in important_initials
            ]

            tweet.tweet_tagged = tweet_tagged
            tweet.important_words = important_words
            filewriter.writerow(tweet.to_CSV())
            if n % 100 == 0:
                print(n)
    print("Done")

开发者ID:wilchess26，项目名称:WebMining，代码行数:51，代码来源:unParseStream.py

示例15: pos_tag_stanford

def pos_tag_stanford(toked_sentence):
    """
    INPUT: list of strings
    OUTPUT: list of tuples

    Tag a tokenized sentence with the Stanford tagger using the
    english-bidirectional-distsim model and return the
    (token, POS) tuples, POS being the part of speech of token.
    """
    from nltk.tag.stanford import POSTagger

    model_file = '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger'
    jar_file = '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar'

    tagger = POSTagger(model_file, jar_file)
    tagged = tagger.tag(toked_sentence)
    return tagged

开发者ID:Jewelryland，项目名称:Opinion-Mining-Project，代码行数:15，代码来源:extract_aspects.py

注：本文中的nltk.tag.stanford.POSTagger类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。