This article collects typical usage examples of the Python tokenizer.Tokenizer class. If you are wondering what exactly the Tokenizer class does, how to use it, or what real-world Tokenizer code looks like, the curated class examples below should help.
The following shows 15 code examples of the Tokenizer class, sorted by popularity by default.
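Most of the examples share the same basic pattern: construct a Tokenizer, then hand it text and consume the resulting tokens. As a minimal sketch of that pattern only, assuming a Tokenizer whose tokenize() method returns an iterable of strings (the real constructors and method names vary from project to project below):

from tokenizer import Tokenizer

def count_tokens(text):
    """Tokenize a string and count how often each token occurs."""
    tok = Tokenizer()                    # several examples pass options here, e.g. a stopword file
    counts = {}
    for word in tok.tokenize(text):      # assumption: tokenize() yields string tokens
        counts[word] = counts.get(word, 0) + 1
    return counts

print(count_tokens("to be or not to be"))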
Example 1: main
def main():
    ## args
    parser = argparse.ArgumentParser()
    parser.add_argument('-r', '--reviews', required=True, help='Review data file')
    parser.add_argument('-o', '--out', required=True, help='Inverted index output file')
    parser.add_argument('-s', '--stop', required=True, help='Stopword list')
    opts = parser.parse_args()
    ## Output file
    csv_writer = csv.writer(open(opts.out, 'w'), delimiter="\t")
    csv_writer.writerow(['token', 'business_id', 'review_id', 'position', '...'])
    ## Tokenizer
    tk = Tokenizer(opts.stop)
    token_map = defaultdict(list)
    ## Tokenize review texts
    # For each word in the vocabulary (here, every word found in any review), record the
    # business id, review id, and position of each term occurrence.
    # Instead of the review id itself, the line number on which the review occurs is used
    # as a unique identifier.
    reviews = open(opts.reviews)
    for review_num, line in enumerate(reviews):
        review = json.loads(line)
        business_id = review['business_id'].encode('utf-8')
        tokens = tk.tokenize(review['text'])
        for position, word in enumerate(tokens):
            token_map[word].append((business_id, review_num, position))
    ## Print sorted inverted index
    for token in sorted(token_map):
        row = [token]
        row.extend(token_map[token])
        csv_writer.writerow(row)
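The inverted index built above maps each token to a posting list of (business_id, review_num, position) tuples, one per occurrence. As a purely illustrative sketch with made-up data (these ids and reviews are not from the Yelp dataset, and it assumes none of the words are stopwords), two one-line reviews would produce a token_map like:

# Hypothetical token_map contents after indexing two made-up reviews:
#   line 0: {"business_id": "b1", "text": "good pizza"}
#   line 1: {"business_id": "b2", "text": "pizza was cold"}
token_map = {
    "good":  [("b1", 0, 0)],
    "pizza": [("b1", 0, 1), ("b2", 1, 0)],
    "was":   [("b2", 1, 1)],
    "cold":  [("b2", 1, 2)],
}

Each row of the output file then starts with the token, followed by its posting list.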
Example 2: ext_json
def ext_json():
    rdfUrl = ''
    tok = Tokenizer()
    if request.method == 'POST':
        rdf = request.form['data']
        status_test = "0"  # request.form['status']
        filters = ""  # request.form['exculdeurls']
        # rdf = "http://jpp.no-ip.org/MAD_J.rdf"
        try:
            # r = requests.get(rdf)
            gg = Graph()
            # g.load(rdfUrl)
            rdf_content = StringIO.StringIO(rdf.encode('utf-8'))
            # print rdf_content.readline()
            gg.parse(rdf_content, format="xml")
            ext = Extractor(gg)
            uris = ext.getUris()
            mapping = MapFactory()
            for uri in uris:
                term = tok.tokenized_url(uri)
                uri_status = ""
                if status_test == "1":
                    uri_status = ext.testUri(uri)
                else:
                    uri_status = "N/A"
                uri_lookup = str(uri) + "\""
                lnum = ext.get_lines(rdf_content, uri_lookup)
                ent = MapEntry(uri, term, "", lnum, uri_status)
                mapping.add(ent)
            jsonized_result = json.dumps(mapping.get())
            return Response(jsonized_result, mimetype='application/json')
        except requests.exceptions.ConnectionError:
            X2Rwarning = 'X2R Warning: The requested URL raises ConnectionError~!!!'
            return X2Rwarning
Example 3: main
def main(args):
    try:
        (opts, args) = getopt(args, "o:TPX")
    except GetoptError:
        usage()
    if len(args) != 1:
        usage()
    from tokenizer import Tokenizer
    from parser import Parser
    from error import JtError
    import context
    from os.path import abspath
    filename = abspath(args[0])
    stdin = file(filename, "r")
    target = "P"
    stdout = sys.stdout
    for (ok, ov) in opts:
        if ok in ("-T", "-P", "-X"):
            target = ok[1]
        elif ok == "-o":
            stdout = file(ov, "w")
    contents = stdin.read()
    tokenizer = Tokenizer()
    tokenizer.build()
    tokenizer.input(contents)
    parser = Parser(tokenizer)
    result_tree = None
    try:
        result_tree = parser.parse()
    except JtError, error:
        failure(error)
Example 4: execute
def execute(self):
    if len(self.proj_paths) > 0:
        logging.info('Starting tokenizer. Producibles (logs, output, etc) can be found under the name ' + self.target_folders)
        tokenizer = Tokenizer(self.proj_paths, self.DB_user, self.DB_pass, self.DB_name, logging, self.logs_folder, self.output_folder, self.N_PROCESSES, self.BATCH_SIZE, self.PROJECTS_CONFIGURATION)
        tokenizer.execute()
    else:
        logging.warning('The list of new projects is empty (or these are already on the DB).')
Example 5: _tokenize_tweet
def _tokenize_tweet(self, tweet):
    """
    Input: tweet (String)
    Output: List of tokens
    """
    tok = Tokenizer(preserve_case=False)
    return tok.tokenize(tweet)
Example 6: tokenize
def tokenize(self, **kwargs):
    """
    Returns the tokenized string using a parser.
    """
    string_tokenizer = Tokenizer()
    return string_tokenizer.tokenize(kwargs.get("text"), kwargs.get("parser"))
Example 7: interpret_line
def interpret_line(self, line):
    tokenizer = Tokenizer()
    tokenizer.parse(line)
    first_token = tokenizer.getNextToken()
    if first_token.type == Token.NUMBER:
        self.lines[int(first_token.value)] = tokenizer.prog[tokenizer.pos:]
        self.sort_lines()
    else:
        self.run_line(line)
Example 8: main
def main():
    tok = Tokenizer()
    mapping = MapFactory()
    uris = ["http://abc.ee.ntu/alf_123", "http://sc.e.ncli.ABCdefGU"]
    for uri in uris:
        term = tok.tokenized_url(uri)
        ent = MapEntry(uri, term, "", "", "")
        mapping.add(ent)
    jsonized_result = json.dumps(mapping.get())
    print jsonized_result
Example 9: testExecutionTreeWithItemAssignment
def testExecutionTreeWithItemAssignment(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("A[B]= 1 + R")
    tokenizer.next()
    expr = c.compile(tokenizer)
    exec_tree = expr.get_execution_tree()
    print "Expression Tree %s\n" % (exec_tree)
    self.assertEqual(
        "( = ( item_assign ( literal A ) ( index ( literal B ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )", exec_tree
    )
    # a little bit more complex
    tokenizer.tokenize("A[B+(C*3)+1]= 1 + R")
    tokenizer.next()
    expr = c.compile(tokenizer)
    exec_tree = expr.get_execution_tree()
    print "Expression Tree %s\n" % (exec_tree)
    self.assertEqual(
        "( = ( item_assign ( literal A ) ( index ( + ( + ( literal B ) ( * ( literal C ) ( literal 3.0 ) ) ) ( literal 1.0 ) ) ) ) ( + ( literal 1.0 ) ( literal R ) ) )",
        exec_tree,
    )
Example 10: testEvaluateFactors
def testEvaluateFactors(self):
    c = ExpressionCompiler()
    tokenizer = Tokenizer()
    tokenizer.tokenize("7*7")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(49.0, result)
    tokenizer.tokenize("7*7/7")
    tokenizer.next()
    expr = c.compile(tokenizer)
    result = expr.evaluate()
    print "result = %s\n" % (result)
    self.assertEqual(7.0, result)
Example 11: main
def main():
    # first read in the inverted index file
    parser = argparse.ArgumentParser()
    parser.add_argument('-index', required=True, help='Path to inverted index file')
    parser.add_argument('-business', required=False, help='Path to yelp business data json file', default="/course/cs1951a/pub/final/data/extracted/yelp_academic_dataset_business.json")
    opts = parser.parse_args()
    # Pre-processing
    f_index = open(opts.index, 'r')
    print "loading index file..."
    wordsmap = {}
    # count = 0
    # for line in f_index:
    #     count += 1
    #     j_obj = json.load(line)
    #     for k, v in j_obj.items():
    #         wordsmap[k] = v
    #     j_obj = None
    #     if count % 100 == 0:
    #         print count
    wordsmap = json.load(f_index)
    print "done"
    f_index.close()
    b_map = {}
    print "loading business file..."
    f_b = open(opts.business, 'r')
    line_num = 0
    for line in f_b:
        b_json = json.loads(line)
        b_map[str(line_num)] = {"business_id": b_json['business_id'], "review_count": int(b_json['review_count']), "stars": float(b_json['stars'])}
        line_num += 1
    print "done"
    tokenizer = Tokenizer()
    # TODO: need to check error input
    # Bug: Ctrl-D exit situation
    for line in sys.stdin:
        result = []
        line = line.strip('\n')
        if len(line) == 0:
            continue
        elif line[0] == '"':
            line = line.strip('"')
            words = tokenizer.process_review(line)
            result = phrase_query(words, wordsmap)
        elif len(line.split()) == 1:
            words = tokenizer.process_review(line)
            result = one_word_query(words[0], wordsmap)
        else:
            words = tokenizer.process_review(line)
            result = free_text_query(words, wordsmap)
        rank_res = rank(words, result, b_map, wordsmap)
        print rank_res
Example 12: __init__
def __init__(self, string_to_tokenize='', prefix_chars='-=<>!+*&|/%^', suffix_chars='=<>&|'):
    Tokenizer.__init__(self, string_to_tokenize)
    self.prefix = prefix_chars
    self.suffix = suffix_chars
    ### Setup JavaScriptTokenizer-specific regexen
    self.PREFIX = re.compile("[%s]" % self.prefix)
    self.SUFFIX = re.compile("[%s]" % self.suffix)
    self.BEGIN_IDENTIFIER = self.CHARACTER
    self.MULTILINE_COMMENT = re.compile("[\*]")
    self.END_COMMENT = re.compile("[/]")
    self.ESCAPE = re.compile("[\\\\]")
Example 13: correct_macro_syntax_test
def correct_macro_syntax_test():
    macro_string = """
!config {
output: pdf, html
table_of_contents: true
}"""
    tokenizer = Tokenizer(macro_string)
    for token in tokenizer:
        if token[0] == "!":
            open_brackets = tokenizer.next()
            if open_brackets != "{":
                raise DMLSyntaxError(open_brackets, "{")
Example 14: test_ast_opts
def test_ast_opts(self):
    a = AST()
    t = Tokenizer()
    opts = {}
    opts['get-me'] = 'I am superman'
    a.parse(t.parse('{{ opts.get("get-me") }}'))
    c = a.traverse(opts=opts)
    self.assertEqual(c.buffer, 'I am superman')
    a.parse(t.parse('{@ if opts.get("get-me"): @}I am superman{@ end @}'))
    c = a.traverse(opts=opts)
    self.assertEqual(c.buffer, 'I am superman')
Example 15: __init__
def __init__(self, _what, _who, _when, _where, _why, _how, _text):
    self.what = Tokenizer.removeNonAscii(_what).replace(".\"", ". \"")
    self.who = Tokenizer.removeNonAscii(_who).replace(".\"", ". \"")
    self.when = Tokenizer.removeNonAscii(_when).replace(".\"", ". \"")
    self.where = Tokenizer.removeNonAscii(_where).replace(".\"", ". \"")
    self.why = Tokenizer.removeNonAscii(_why).replace(".\"", ". \"")
    self.how = Tokenizer.removeNonAscii(_how).replace(".\"", ". \"")
    self.text = Tokenizer.removeNonAscii(_text).replace(".\"", ". \"")
    self.sentences = Tokenizer.getSentences(self.text)
    self.tokenized_sentences = [Tokenizer.getTokens(sentence) for sentence in self.sentences]