

Python tokenizer.Tokenizer Class Code Examples

This article collects typical usage examples of the tokenizer.Tokenizer class in Python. If you have been wondering what the Tokenizer class does, how to use it, or what real-world code using it looks like, the curated class examples below may help.


A total of 15 code examples of the Tokenizer class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
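
Before the individual examples, here is a minimal, self-contained sketch of the pattern they share: construct a Tokenizer, then call a tokenize-style method on a piece of text. Note that each project below ships its own tokenizer.Tokenizer with its own constructor and method set, so the stand-in class here only illustrates the common shape and is not the API of any particular project.

import re

class Tokenizer:
    """Stand-in for illustration only; real projects differ in constructor and methods."""

    def __init__(self, stopwords=None):
        # Optional stopword filtering, similar in spirit to Example 1's Tokenizer(opts.stop).
        self.stopwords = set(stopwords or [])

    def tokenize(self, text):
        # Lowercase, split on runs of letters/digits, drop stopwords.
        tokens = re.findall(r"[a-z0-9]+", text.lower())
        return [t for t in tokens if t not in self.stopwords]

tk = Tokenizer(stopwords=["the", "a"])
print(tk.tokenize("The quick brown fox"))  # ['quick', 'brown', 'fox']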

Example 1: main

def main():
    ## args
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reviews', required=True, help='Review data file')
    parser.add_argument('-o', '--out', required=True, help='Inverted index output file')
    parser.add_argument('-s', '--stop', required=True, help='Stopword list')
    opts = parser.parse_args()

    ## Output file
    csv_writer = csv.writer(open(opts.out, 'w'), delimiter="\t")
    csv_writer.writerow(['token', 'business_id', 'review_id', 'position', '...'])

    ## Tokenizer
    tk = Tokenizer(opts.stop)
    token_map = defaultdict(list)

    ## Tokenize review texts
    # for each word in the vocabulary (in this case all words found in all reviews):
    # business id, review id, and position of each term occurrence
    # instead of using the review id, uses the line on which the review occurs as a unique identifier
    reviews = open(opts.reviews)
    for review_num, line in enumerate(reviews):
        review = json.loads(line)
        business_id = review['business_id'].encode('utf-8')
        tokens = tk.tokenize(review['text'])
        for position, word in enumerate(tokens):
            token_map[word].append((business_id, review_num, position))

    ## Print sorted inverted index
    for token in sorted(token_map):
        row = [token]
        row.extend(token_map[token])
        csv_writer.writerow(row)
Developer: jschear, Project: cs1951a-final, Lines: 33, Source: create_index.py
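
To make the inverted-index structure concrete, here is a hedged, self-contained sketch of the same defaultdict pattern with the file I/O and stopword handling stripped out; the review texts and business IDs are made up.

from collections import defaultdict

# Toy stand-ins for the JSON review lines read in Example 1.
reviews = [
    {"business_id": "b1", "text": "great coffee great staff"},
    {"business_id": "b2", "text": "coffee was cold"},
]

token_map = defaultdict(list)
for review_num, review in enumerate(reviews):
    for position, word in enumerate(review["text"].split()):
        token_map[word].append((review["business_id"], review_num, position))

# Each token maps to a posting list of (business_id, review_num, position) tuples:
print(token_map["coffee"])  # [('b1', 0, 1), ('b2', 1, 0)]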

Example 2: ext_json

def ext_json():
    rdfUrl = ''
    tok = Tokenizer()
    if request.method == 'POST':
        rdf = request.form['data']
        status_test = "0"#request.form['status']
        filters = ""#request.form['exculdeurls']
        #rdf = "http://jpp.no-ip.org/MAD_J.rdf"
        try:
            #r = requests.get(rdf)
            gg = Graph()
            #g.load(rdfUrl)
            rdf_content = StringIO.StringIO(rdf.encode('utf-8'))
            #print rdf_content.readline()
            gg.parse(rdf_content,  format="xml")
            ext = Extractor(gg)
            uris = ext.getUris()
            mapping = MapFactory()
            for uri in uris:
                term = tok.tokenized_url(uri)
                uri_status = ""
                if status_test == "1":
                    uri_status = ext.testUri(uri)
                else:
                    uri_status = "N/A"  
                uri_lookup = str(uri)+"\"" 
                lnum = ext.get_lines(rdf_content, uri_lookup)          
                ent = MapEntry(uri, term, "", lnum, uri_status)
                mapping.add(ent)
            jsonized_result = json.dumps(mapping.get())              
            return Response(jsonized_result, mimetype='application/json')
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
Developer: FengPu, Project: x2r-me, Lines: 34, Source: x2r-me.py
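
The StringIO.StringIO/encode dance above is a Python 2 idiom for feeding an in-memory string to rdflib. As a hedged Python 3 sketch of just the parsing step (assuming rdflib is installed), the RDF/XML can be passed directly via the data= argument; the two-triple document below is made up for illustration.

from rdflib import Graph

rdf = """<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:ex="http://example.org/">
  <rdf:Description rdf:about="http://example.org/thing">
    <ex:label>thing</ex:label>
  </rdf:Description>
</rdf:RDF>"""

gg = Graph()
gg.parse(data=rdf, format="xml")
print(len(gg))  # number of parsed triples (1 here)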

Example 3: main

def main(args):
    try:
        (opts, args) = getopt(args, "o:TPX")
    except GetoptError:
        usage()
    if len(args) != 1:
        usage()

    from tokenizer import Tokenizer
    from parser import Parser
    from error import JtError
    import context
    from os.path import abspath

    filename = abspath(args[0])
    stdin = file(filename, "r")
    target = "P"
    stdout = sys.stdout
    for (ok, ov) in opts:
        if ok in ("-T", "-P", "-X"):
            target = ok[1]
        elif ok == "-o":
            stdout = file(ov, "w")
    contents = stdin.read()
    tokenizer = Tokenizer()
    tokenizer.build()
    tokenizer.input(contents)
    parser = Parser(tokenizer)
    result_tree = None
    try:
        result_tree = parser.parse()
    except JtError, error:
        failure(error)
Developer: jwilk, Project: jtc, Lines: 33, Source: cli.py
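
The build()/input() pair suggests a lexer that compiles its rules once and is then fed the source text. The project's actual Tokenizer is not shown in the excerpt, so the stand-in below only illustrates that calling convention, not jtc's implementation.

import re

class StandInTokenizer:
    """Illustrative only: mirrors the build()/input() calling convention."""

    def build(self):
        # Compile the token rules once, up front.
        self.pattern = re.compile(r"\d+|[A-Za-z_]\w*|\S")

    def input(self, text):
        self.tokens = self.pattern.findall(text)

    def __iter__(self):
        return iter(self.tokens)

tok = StandInTokenizer()
tok.build()
tok.input("x = 42")
print(list(tok))  # ['x', '=', '42']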

Example 4: execute

 def execute(self):
     if len(self.proj_paths) > 0:
         logging.info('Starting tokenizer. Producibles (logs, output, etc) can be found under the name '+self.target_folders)
         tokenizer = Tokenizer(self.proj_paths, self.DB_user, self.DB_pass, self.DB_name, logging, self.logs_folder, self.output_folder, self.N_PROCESSES, self.BATCH_SIZE, self.PROJECTS_CONFIGURATION)
         tokenizer.execute()
     else:
         logging.warning('The list of new projects is empty (or these are already on the DB).')
Developer: Mondego, Project: SourcererCC, Lines: 7, Source: tokenizerController.py

Example 5: _tokenize_tweet

 def _tokenize_tweet(self, tweet):
     """
     Input: tweet (String)
     Output: List of tokens
     """
     tok = Tokenizer(preserve_case=False)
     return tok.tokenize(tweet)
Developer: Chouffe, Project: senti-tweet, Lines: 7, Source: tools.py

Example 6: tokenize

    def tokenize(self, **kwargs):
        """
        Returns the tokenized string using a parser.
        """

        string_tokenizer = Tokenizer()

        return string_tokenizer.tokenize(kwargs.get("text"), kwargs.get("parser"))
Developer: DarkmatterVale, Project: regex4dummies, Lines: 8, Source: toolkit.py

Example 7: interpret_line

    def interpret_line(self, line):
        tokenizer = Tokenizer()
        tokenizer.parse(line)

        first_token = tokenizer.getNextToken()
        if first_token.type == Token.NUMBER:
            self.lines[int(first_token.value)] = tokenizer.prog[tokenizer.pos:]
            self.sort_lines()
        else:
            self.run_line(line)
Developer: tonyedgecombe, Project: pytinybasic, Lines: 10, Source: interpreter.py
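
The dispatch rule here is the classic tiny-BASIC one: a line whose first token is a number gets stored under that line number, anything else runs immediately. A hedged, self-contained illustration of just that rule, using string splitting instead of the project's Tokenizer/Token classes:

def interpret_line(program, line):
    # Stand-in for Tokenizer/Token: peek at the first whitespace-separated token.
    first, _, rest = line.strip().partition(" ")
    if first.isdigit():
        # A leading line number stores the statement for later execution.
        program[int(first)] = rest
    else:
        print("run immediately:", line)

program = {}
interpret_line(program, "10 PRINT X")
interpret_line(program, "LIST")
print(program)  # {10: 'PRINT X'}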

Example 8: main

def main():
    tok = Tokenizer()
    mapping = MapFactory()
    uris = ["http://abc.ee.ntu/alf_123", "http://sc.e.ncli.ABCdefGU"]
    for uri in uris:
        term = tok.tokenized_url(uri)
        ent = MapEntry(uri, term, "", "", "")
        mapping.add(ent)
    jsonized_result = json.dumps(mapping.get())   
    print jsonized_result   
Developer: FengPu, Project: x2r-me, Lines: 10, Source: x2r-me.py

Example 9: testExecutionTreeWithItemAssignment

    def testExecutionTreeWithItemAssignment(self):

        c = ExpressionCompiler()
        tokenizer = Tokenizer()

        tokenizer.tokenize("A[B]= 1 + R")

        tokenizer.next()

        expr = c.compile(tokenizer)

        exec_tree = expr.get_execution_tree()

        print "Expression Tree %s\n" % (exec_tree)

        self.assertEqual(
            "( = ( item_assign ( literal A ) ( index ( literal B ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )", exec_tree
        )

        # a little bit more complex
        tokenizer.tokenize("A[B+(C*3)+1]= 1 + R")

        tokenizer.next()

        expr = c.compile(tokenizer)

        exec_tree = expr.get_execution_tree()

        print "Expression Tree %s\n" % (exec_tree)

        self.assertEqual(
            "( = ( item_assign ( literal A ) ( index ( + ( + ( literal B ) ( * ( literal C ) ( literal 3.0 ) ) ) ( literal 1.0 ) ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )",
            exec_tree,
        )
Developer: pombredanne, Project: java-balivernes, Lines: 34, Source: expr_compiler.py

Example 10: testEvaluateFactors

    def testEvaluateFactors(self):

        c = ExpressionCompiler()

        tokenizer = Tokenizer()
        tokenizer.tokenize("7*7")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(49.0, result)

        tokenizer.tokenize("7*7/7")
        tokenizer.next()

        expr = c.compile(tokenizer)

        result = expr.evaluate()

        print "result = %s\n" % (result)

        self.assertEqual(7.0, result)
Developer: pombredanne, Project: java-balivernes, Lines: 26, Source: expr_compiler.py

Example 11: main

def main():
	# first read in the inverted index file
	parser = argparse.ArgumentParser()
	parser.add_argument('-index', required=True, help='Path to inverted index file')
	parser.add_argument('-business', required=False, help='Path to yelp business data json file', default="/course/cs1951a/pub/final/data/extracted/yelp_academic_dataset_business.json")
	opts = parser.parse_args()

	# Pre-processing
	f_index = open(opts.index,'r')
	print "loading index file..."
	wordsmap = {}
	# count = 0
	# for line in f_index:
	# 	count += 1
	# 	j_obj = json.load(line)
	# 	for k, v in j_obj.items():
	# 		wordsmap[k] = v
	# 	j_obj = None
	# 	if count % 100 == 0:
	# 		print count
	wordsmap = json.load(f_index)
	print "done"
	f_index.close()
	b_map = {}
	print "loading business file..."
	f_b = open(opts.business, 'r')
	line_num = 0
	for line in f_b:
		b_json = json.loads(line)
		b_map[str(line_num)]={"business_id":b_json['business_id'],"review_count":int(b_json['review_count']), "stars":float(b_json['stars'])}
		line_num += 1
	print "done"


	tokenizer = Tokenizer()
	# TODO: need to check error input  
	# Bug: c-d exit situation
	
	for line in sys.stdin:
		result = []
		line = line.strip('\n')
		if len(line)==0:
			continue
		elif line[0]=='"':
			line = line.strip('"')
			words = tokenizer.process_review(line)
			result = phrase_query(words, wordsmap)
		elif len(line.split())==1:
			words = tokenizer.process_review(line)
			result = one_word_query(words[0], wordsmap)
		else:
			words = tokenizer.process_review(line)
			result = free_text_query(words, wordsmap)
		rank_res = rank(words,result,b_map,wordsmap)
		print rank_res
Developer: cc26, Project: data-science-yelp, Lines: 55, Source: query_index.py
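
The query dispatch above hinges on the shape of the input line: a leading double quote means a phrase query, a single token means a one-word lookup, and anything else is treated as free text. A self-contained sketch of just that classification (phrase_query, one_word_query and free_text_query themselves are not reproduced here):

def classify_query(line):
    # Mirrors the branches in Example 11's main loop.
    line = line.strip("\n")
    if len(line) == 0:
        return None
    if line[0] == '"':
        return "phrase"
    if len(line.split()) == 1:
        return "one_word"
    return "free_text"

print(classify_query('"cheap thai food"'))  # phrase
print(classify_query("pizza"))              # one_word
print(classify_query("best brunch spot"))   # free_text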

Example 12: __init__

 def __init__( self, string_to_tokenize = '', prefix_chars = '-=<>!+*&|/%^', suffix_chars = '=<>&|' ):
     Tokenizer.__init__( self, string_to_tokenize )
     self.prefix     =   prefix_chars
     self.suffix     =   suffix_chars
 ### Setup JavaScriptTokenizer-specific regexen
     self.PREFIX             =   re.compile( "[%s]" % self.prefix )
     self.SUFFIX             =   re.compile( "[%s]" % self.suffix )
     self.BEGIN_IDENTIFIER   =   self.CHARACTER
     self.MULTILINE_COMMENT  =   re.compile("[\*]")
     self.END_COMMENT        =   re.compile("[/]")
     self.ESCAPE             =   re.compile("[\\\\]")
Developer: mikewest, Project: topdown, Lines: 11, Source: javascripttokenizer.py
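
A small design note on the character classes built above: "[%s]" % self.prefix happens to work for '-=<>!+*&|/%^' because the '-' sits first and the '^' is not first, but wrapping the string in re.escape makes the intent explicit and robust to reordering the operator characters. A quick sketch of that safer form:

import re

prefix_chars = "-=<>!+*&|/%^"
PREFIX = re.compile("[%s]" % re.escape(prefix_chars))

print(bool(PREFIX.match("+")))  # True
print(bool(PREFIX.match("a")))  # False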

Example 13: correct_macro_syntax_test

def correct_macro_syntax_test():
    macro_string = """
!config {
output: pdf, html
table_of_contents: true
}"""
    tokenizer = Tokenizer(macro_string)
    for token in tokenizer:
        if token[0] == "!":
            open_brackets = tokenizer.next()
            if open_brackets != "{":
                raise DMLSyntaxError(open_brackets, "{")
Developer: Ed-von-Schleck, Project: dml, Lines: 12, Source: test-function-syntax.py

Example 14: test_ast_opts

    def test_ast_opts(self):
        a = AST()
        t = Tokenizer()
        opts = {}
        opts['get-me'] = 'I am superman'

        a.parse(t.parse('{{ opts.get("get-me") }}'))
        c = a.traverse(opts=opts)
        self.assertEqual(c.buffer, 'I am superman')

        a.parse(t.parse('{@ if opts.get("get-me"): @}I am superman{@ end @}'))
        c = a.traverse(opts=opts)
        self.assertEqual(c.buffer, 'I am superman')
Developer: narupo, Project: cap, Lines: 13, Source: tests.py

Example 15: __init__

	def __init__(self, _what, _who, _when, _where, _why, _how, _text):
		self.what = Tokenizer.removeNonAscii(_what).replace(".\"",". \"")
		self.who = Tokenizer.removeNonAscii(_who).replace(".\"",". \"")
		self.when = Tokenizer.removeNonAscii(_when).replace(".\"",". \"")
		self.where = Tokenizer.removeNonAscii(_where).replace(".\"",". \"")
		self.why = Tokenizer.removeNonAscii(_why).replace(".\"",". \"")
		self.how = Tokenizer.removeNonAscii(_how).replace(".\"",". \"")
		self.text = Tokenizer.removeNonAscii(_text).replace(".\"",". \"")
		self.sentences = Tokenizer.getSentences(self.text)
		self.tokenized_sentences = [Tokenizer.getTokens(sentence) for sentence in self.sentences]
Developer: anpandu, Project: 5w1h_extractor, Lines: 10, Source: Info5W1H.py


Note: The tokenizer.Tokenizer class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The snippets were selected from open-source projects contributed by various developers, and copyright remains with their original authors. Please consult each project's license before redistributing or reusing the code, and do not reproduce this article without permission.