当前位置: 首页>>代码示例>>Python>>正文


Python transcoder.transcoder_processString函数代码示例

本文整理汇总了Python中transcoder.transcoder_processString函数的典型用法代码示例。如果您正苦于以下问题:Python transcoder_processString函数的具体用法?Python transcoder_processString怎么用?Python transcoder_processString使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了transcoder_processString函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: adjust_wx

def adjust_wx(x):
    """Convert one line of wx-encoded text to slp1, leaving markup untouched.

    Two cases are handled:

    * The whole line is a single angle-bracket tag (``<name>`` or
      ``</name>``).  The text between the brackets is transcoded and the
      tag is rebuilt.  Per the original author's note these are the
      ``<wx-headword>`` / ``</wx-headword>`` delimiter lines, which are
      meant to become ``<slp-headword>`` / ``</slp-headword>``.
    * Any other line is split on XML tags.  Tags and chunks of the form
      ``[Page...]`` are copied verbatim; every other chunk is assumed to
      be wx text and is transcoded to slp1.

    Returns the converted line as a string.
    """
    whole_tag = re.search(r'^<(/?)(.*?)>$', x)
    if whole_tag:
        slash = whole_tag.group(1)
        converted_name = transcoder.transcoder_processString(
            whole_tag.group(2), 'wx', 'slp1')
        return "<%s%s>" % (slash, converted_name)

    # Presumably not a headword line: transcode the text but not the tags.
    pieces = []
    for chunk in re.split(r'(<[^>]+>)', x):
        if not chunk:
            # re.split yields empty strings next to leading/trailing tags.
            continue
        is_tag = chunk.startswith('<') and chunk.endswith('>')
        is_page_marker = chunk.startswith('[Page') and chunk.endswith(']')
        if is_tag or is_page_marker:
            pieces.append(chunk)
        else:
            pieces.append(
                transcoder.transcoder_processString(chunk, 'wx', 'slp1'))
    return ''.join(pieces)
开发者ID:sanskrit-lexicon,项目名称:VCP,代码行数:29,代码来源:wx_to_slp1.py

示例2: r

def r(text):
    """Look up a Devanagari word in the INRIA lemmatizer and format the result.

    The word category comes from ``wtd(text)``; the word itself is
    transcoded from Devanagari to SLP1, stripped of surrounding '.' and
    sent to the ``sktlemmatizer`` CGI endpoint.  The reply is parsed with
    BeautifulSoup and the text after the first ``<br>`` of the
    ``latin12`` span is analysed.

    Returns:
        For categories "Part" and "Piic", whatever
        ``kridantaattributes`` returns for the extracted data.
        Otherwise one ``verb.attribute`` string per item returned by
        ``tosm``, joined with '|'.
    """
    wordtype = wtd(text)
    text = transcoder.transcoder_processString(text, 'deva', 'slp1').strip('.')
    url = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + text + '&t=SL&c=' + wordtype
    html_doc = urllib2.urlopen(url).read()

    # Drill down to the span that carries the analysis.
    soup = BeautifulSoup(html_doc, 'html.parser')
    center_div = soup.find("div", {"class": "center"})
    result_table = center_div.find("table", {"class": "yellow_cent"})
    span = result_table.tr.th.find("span", {"class": "latin12"})
    data = unicode(span).split('<br>\n')[1]

    if wordtype in ["Part", "Piic"]:
        return kridantaattributes(data)

    # The attribute text precedes '}[', the link to the verb follows it.
    halves = unicode(data).split('}[')
    attributes = halves[0]
    verb = BeautifulSoup(halves[1], 'html.parser').a.text
    verb = re.sub("[0-9_]+", "", verb)  # drop digits and underscores
    verb = transcoder.transcoder_processString(verb, 'roman', 'slp1')

    tags = tosm(attributes)
    if len(tags) > 1:
        return '|'.join([verb + '.' + tag for tag in tags])
    return verb + '.' + tags[0]
开发者ID:drdhaval2785,项目名称:inriawrapper,代码行数:35,代码来源:inriawrapper.py

示例3: key_transcode

def key_transcode(m,fromcode,tocode):
    """Rebuild an ``<H1>`` record with both of its keys transcoded.

    ``m`` is a match object with five groups: text before the first key,
    the first key, text between the keys, the second key, and the body.
    Only the two keys are converted from ``fromcode`` to ``tocode``; the
    other three groups are copied unchanged.

    Returns the string ``<H1>{g1}{{key1}}{g3}{{key2}}{body}``, i.e. each
    converted key is wrapped in curly braces.
    """
    lead, first_key, between, second_key, body = [
        m.group(index) for index in range(1, 6)
    ]
    first_key = transcoder.transcoder_processString(first_key, fromcode, tocode)
    second_key = transcoder.transcoder_processString(second_key, fromcode, tocode)
    return "<H1>%s{%s}%s{%s}%s" % (lead, first_key, between, second_key, body)
开发者ID:sanskrit-lexicon,项目名称:PWG,代码行数:10,代码来源:transcode.py

示例4: alterations

def alterations(filein,fileout):
	fin = codecs.open(filein,'r','utf-8')
	data = fin.read()
	fin.close()
	data = data.strip()
	print 'making preprocess changes'
	data = changelist(data)
	print "Debugging and writing to log.txt"
	log = codecs.open('log.txt','a','utf-8')
	log.write('#'+filein+"#\n")
	words = data.split(' ')
	counter=1
	out = []
	for i in xrange(len(words)):
		word = words[i]
		word = snchanges(word)
		# Creating log for श ङ issue. See https://github.com/drdhaval2785/padamanjari/issues/1
		"""
		if re.search(r'\s["][sn]',word):
			changed = snchanges(word)
			#log.write(str(counter)+":"+word+"\n")
			counter = counter+1
			if not changed == word:
				out.append(changed)
			else:
				out.append(word)
		# Creating log for ङ issue. See https://github.com/drdhaval2785/padamanjari/issues/2
		if re.search(r'"n[^aAiIuUfFxXeEoOykglnm]',word):
			out.append(word)
			rep = word.replace('\n',' ')
			log.write(str(counter)+":"+rep+"\n")
			counter = counter+1
		else:
			out.append(word)
		"""
		out.append(word)
	data = ' '.join(out)
	log.close()
	print 'changing to slp1'
	output = transcoder.transcoder_processString(data,'vel','slp1')
	#fout1 = codecs.open(fileout,'w','utf-8')
	#fout1.write(output)
	#fout1.close()
	output = slpchanges(output)
	print 'changing to Devanagari'
	output = transcoder.transcoder_processString(output,'slp1','deva')
	output = output.replace('#','')
	#output = output.replace('\n','<br/>')
	print 'putting the data in output folder'
	fout1 = codecs.open(fileout,'w','utf-8')
	fout1.write(output)
	fout1.close()
开发者ID:drdhaval2785,项目名称:padamanjari,代码行数:52,代码来源:preprocess.py

示例5: unused_convertrecs

def unused_convertrecs(recs,tranin,tranout):
 "Modifies recs"
 n=0
 for rec in recs:
  n=n+1
  try:
   rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
   rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
   m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode )
   if m:
    print "TRANSCODER WARNING: ",m.group(0).encode('utf-8')
  except:
   print "convertrecs problem",n,rec.line.encode('utf-8')
开发者ID:sanskrit-lexicon,项目名称:PWG,代码行数:13,代码来源:as_roman.py

示例6: linking

def linking(fin,fout):
	infile = codecs.open(fin,'r','utf-8')
	input = infile.readlines()
	input = triming(input)
	outfile = codecs.open(fout,'w','utf-8')
	#acc:akzoByatantre,41695:akzoByatantre:n:oBy -> acc:अक्षोभ्यतन्त्रे,41695:अक्षोभ्यतन्त्रे:n:oBy
	for line in input:
		[dict,headword,replica,errcode,note] = line.split(':')
		[hw,lnum] = headword.split(',')
		hw = transcoder.transcoder_processString(hw,'slp1','deva')
		note = transcoder.transcoder_processString(note,'slp1','deva')
		outfile.write(dict+':'+hw+','+lnum+':'+hw+':'+errcode+':'+note+'\n')
	outfile.close()
	print "Check", fout, "for testing"
开发者ID:sanskrit-lexicon,项目名称:CORRECTIONS,代码行数:14,代码来源:usha.py

示例7: convertrecs

def convertrecs(recs,tranin,tranout):
 "Modifies recs"
 n=0
 for rec in recs:
  n=n+1
  try:
   rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
   rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
   m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode )
   if m:
    print "TRANSCODER WARNING: ",m.group(0).encode('utf-8')
   # Undo some transcodings
   rec.titleunicode = re.sub(r'YOLLY','JOLLY',rec.titleunicode)  # JOLLY is an author
  except:
   print "convertrecs problem",n,rec.line.encode('utf-8')
开发者ID:sanskrit-lexicon,项目名称:PWK,代码行数:15,代码来源:pwbib1.py

示例8: disp_md

def disp_md(dictcode,icase,L,hw0,url,page0,datalines):
    """Format one case as a list of lines for GitHub Markdown.

    The result consists of:
      * a title line with the zero-padded case number, the headword and a
        Markdown link ``[page <page0>](<url>)``;
      * a fenced code block holding a candidate headword-change record
        ``dictcode:hw0,L:hw0:n:``, a blank line, and up to ten data lines
        (after ``adjust_datalines``), each with '|' removed and transcoded
        from 'as' to 'roman'; lines that are blank after conversion are
        skipped;
      * a ``[and N more lines]`` note inside the fence when more than ten
        data lines exist;
      * a dashed separator line and an empty line.

    Returns the list of output lines.
    """
    page_link = "[page %s](%s)" % (page0, url)
    lines = [' Case %04d: %s  %s ' % (icase, hw0, page_link)]
    datalines = adjust_datalines(dictcode, datalines)
    shown = datalines[0:10]  # at most 10 data lines are displayed
    lines.append('```')
    # Potential headword change record.
    lines.append("%s:%s,%s:%s:n:" % (dictcode, hw0, L, hw0))
    lines.append('')
    for raw in shown:
        # '|' is a line separator in CAE; drop it before transcoding.
        without_bar = re.sub(r'[|]', '', raw)
        text = transcoder.transcoder_processString(without_bar, 'as', 'roman')
        if text.strip() != '':
            lines.append('%s' % text)
    if len(datalines) > 10:
        lines.append('  [and %s more lines]' % (len(datalines) - 10))
    lines.extend([
        '```',
        '------------------------------------------',
        '',
    ])
    return lines
开发者ID:sanskrit-lexicon,项目名称:CORRECTIONS,代码行数:27,代码来源:prep1.py

示例9: abbrv_transcode

def abbrv_transcode(p):
    """Transcode ``p`` from 'as' to 'roman1' and repair a known mis-conversion.

    After transcoding, every 'Yourn' in the result is replaced by 'Journ'.
    Returns the corrected string.
    """
    converted = transcoder.transcoder_processString(p, 'as', 'roman1')
    # Correct a known transcoding error.
    return converted.replace('Yourn', 'Journ')
开发者ID:sanskrit-lexicon,项目名称:PWG,代码行数:7,代码来源:abbrv3.py

示例10: adv

def adv(text):
    """Return the Devanagari form of an adverb confirmed by the lemmatizer.

    ``text`` is expected to look like ``<word>.<tag>``.  Only when the tag
    (second '.'-separated field) is 'adv' is the INRIA lemmatizer queried
    with category ``Advb``.  If the reply does not contain the phrase
    'not found as a', the word is transcoded from SLP1 to Devanagari and
    returned.

    Returns None when the tag is not 'adv' or when the lemmatizer reports
    the word as not found.
    """
    fields = text.split('.')
    if fields[1] != 'adv':
        return None
    word = fields[0]
    url = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + word + '&t=SL&c=Advb'
    page = urllib2.urlopen(url).read()
    if 'not found as a' in page:
        return None
    return transcoder.transcoder_processString(word, 'slp1', 'deva')
开发者ID:drdhaval2785,项目名称:inriawrapper,代码行数:8,代码来源:inriawrapper.py

示例11: iter

def iter(wordxml, strength="Full"):
    """Turn '|'-separated XML word entries into ``stem-tag-tag...`` strings.

    Each entry is parsed as XML.  The last child of the root carries the
    stem in its ``stem`` attribute; every other child describes one
    possible analysis.  For each analysis the child's own tag is followed
    by the tags of all of its descendants, and the pieces are joined with
    '-' behind the stem.

    Args:
        wordxml: the XML data, or the marker "????" which is returned
            unchanged.
        strength: when equal to "deva", the stem is transcoded from SLP1
            to Devanagari and the tag lists are passed through
            ``converttodevanagari``; any other value leaves both as-is.

    Returns:
        All analyses of all entries joined with '|'.
    """
    if wordxml == "????":
        return "????"  # error marker is passed through untouched

    want_deva = strength == "deva"
    tagged_forms = []
    for entry in unicode(wordxml).split("|"):
        root = etree.parse(StringIO(entry)).getroot()
        nodes = root.getchildren()
        analysis_nodes = nodes[:-1]  # every child except the last
        stem_node = nodes[-1]  # the last child holds the stem

        stem = stem_node.get("stem").strip()  # stem in SLP1
        if want_deva:
            stem = transcoder.transcoder_processString(stem, "slp1", "deva")

        # One tag list per analysis: the child's tag, then all descendants.
        # NOTE: the 'gn' attribute of prs/aor/inj elements (gaNa
        # information) is not extracted here.
        attributes = [
            [node.tag] + [element.tag for element in node.xpath(".//*")]
            for node in analysis_nodes
        ]

        if want_deva:
            rendered = converttodevanagari(attributes)
        else:
            rendered = attributes
        tagged_forms.extend(
            stem + "-" + "-".join(member) for member in rendered
        )
    return "|".join(tagged_forms)
开发者ID:drdhaval2785,项目名称:inriaxmlwrapper,代码行数:58,代码来源:sanskritmark.py

示例12: convertfromfile

def convertfromfile(inputfile, outputfile):
    f = codecs.open(inputfile, "r", "utf-8")  # Opened inputfile with UTF-8 encoding.
    data = f.readlines()  # Read the lines into a list.
    f.close()  # Closed the inputfile.
    g = codecs.open(outputfile, "w", "utf-8")  # Opened the outputfile with UTF-8 encoding.
    for datum1 in data:  # For each member of data,
        datum1 = datum1.strip()  # Removed unnecessary whitespaces.
        datum1 = transcoder.transcoder_processString(datum1, "deva", "slp1")  # Converted from Devanagari to SLP1.
        dat = re.split("(\W+)", datum1)  # Created a word list by exploding the sentence at word boundaries.
        for i in xrange(len(dat)):
            datum = dat[i].strip()  # Clean whitespaces.
            if i % 2 == 0 and i != len(
                dat
            ):  # Even members of datum are the words and odd members are word boundaries. Therefore, processing only even members.
                # print "analysis of word started", printtimestamp()
                x = devanagaridisplay(datum)  # Analysed the even members.
                # print "analysis of word ended", printtimestamp()
                g.write(
                    transcoder.transcoder_processString(datum, "slp1", "deva") + "(" + x + ")"
                )  # Wrote to the outputfile.
                print transcoder.transcoder_processString(
                    datum, "slp1", "deva"
                ) + "(" + x + ")"  # printed to the screen for the user.
                # print "wrote to the file", printtimestamp()
            else:
                g.write(
                    transcoder.transcoder_processString(dat[i], "slp1", "deva")
                )  # For odd members, converted the word boundaries to their Devanagari counterparts.
                print transcoder.transcoder_processString(
                    dat[i], "slp1", "deva"
                )  # For odd members, converted the word boundaries to their Devanagari counterparts.
        g.write("\n")  # Newline character added
        print  # Newline character printed on terminal.
    g.close()  # Closed outputfile.
开发者ID:drdhaval2785,项目名称:inriaxmlwrapper,代码行数:34,代码来源:sanskritmark.py

示例13: convertline

def convertline(line,tranfrom,tranto):
    """Transcode the field at index 4 of an '@'-separated line.

    The line is split on '@'; only the field at index 4 (the fifth field)
    is converted from ``tranfrom`` to ``tranto``.  When ``tranfrom`` is
    'roman2' that field is lower-cased before conversion.  All other
    fields are left untouched.

    Returns the fields re-joined with '@'.  Raises ``IndexError`` if the
    line has fewer than five fields.
    """
    fields = line.split('@')
    target = fields[4]
    if tranfrom == 'roman2':
        target = target.lower()
    fields[4] = transcoder.transcoder_processString(target, tranfrom, tranto)
    return '@'.join(fields)
开发者ID:sanskrit-lexicon,项目名称:MW72,代码行数:9,代码来源:adjtxt3_italics.py

示例14: dev

def dev(file):
    """Transcode a whole SLP1 file to Devanagari and save it as skd_deva.txt.

    The input is read as 'utf-8-sig', transcoded from 'slp1' to 'deva',
    every occurrence of the sequence u'ळ्ह्' is replaced by '|', and the
    result is written to ``skd_deva.txt`` in the current directory
    (also 'utf-8-sig', overwriting any existing file).

    The earlier version also opened ``hindidevanagariverbform.txt`` in
    'w+' mode and immediately discarded the handle, which truncated that
    unrelated file and leaked the handle; that stray open is removed.
    Both remaining files are closed via ``with`` even on errors.

    Args:
        file: path of the input file.
    """
    with codecs.open(file, 'r+', 'utf-8-sig') as f:
        data = f.read()
    data = transcoder.transcoder_processString(data,'slp1','deva')
    data = re.sub(u'ळ्ह्', '|', data)
    with codecs.open("skd_deva.txt", "w+", "utf-8-sig") as g:
        g.write(data)
开发者ID:drdhaval2785,项目名称:inriaxmlwrapper,代码行数:10,代码来源:misc.py

示例15: convertline

def convertline(line,tranfrom,tranto):
    """Transcode the text before the first '[Page' marker of a line.

    The line is split on '[Page'; only the first piece is converted from
    ``tranfrom`` to ``tranto``, so page references are never transcoded.

    Returns:
        A tuple ``(unconverted, newline)`` where ``unconverted`` is True
        when the converted piece still contains a letter immediately
        followed by a digit, and ``newline`` is the line reassembled with
        its '[Page' markers.
    """
    pieces = line.split('[Page')
    head = transcoder.transcoder_processString(pieces[0], tranfrom, tranto)
    unconverted = re.search(r'[a-zA-Z][0-9]', head) is not None
    return (unconverted, '[Page'.join([head] + pieces[1:]))
开发者ID:sanskrit-lexicon,项目名称:MW72,代码行数:10,代码来源:adjtxt2.py


注:本文中的transcoder.transcoder_processString函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。