This article collects typical usage examples of the transcoder.transcoder_processString function in Python. If you are wondering how to call transcoder_processString, what it does, or what real-world uses of it look like, the curated code examples below should help.
Fifteen code examples of the transcoder_processString function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
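Before diving into the examples, here is a minimal sketch of the basic call pattern they all share. It assumes the project's transcoder module (with its transliteration tables such as slp1, deva, wx) is importable; the sample string is purely illustrative.

import transcoder

# transcoder_processString(text, fromcode, tocode) returns the text
# transliterated from one scheme to another, e.g. SLP1 -> Devanagari.
slp1_text = 'rAma'  # illustrative input in SLP1
deva_text = transcoder.transcoder_processString(slp1_text, 'slp1', 'deva')
print(deva_text)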
Example 1: adjust_wx
def adjust_wx(x):
    # modified to return both slp1
# headword entries start with a <wx-headword> line and
# end with a </wx-headword> line.
# convert these to <slp-headword>
# and </slp-headword>
m = re.search(r'^<(/?)(.*?)>$',x)
if m:
x1 = m.group(1)
x2 = m.group(2)
y2 = transcoder.transcoder_processString(x2,'wx','slp1')
ans = "<%s%s>" %(x1,y2)
return ans
# presumably, not a headword. Don't transcode xml tags
outarr = [] # slp1
parts = re.split(r'(<[^>]+>)',x) # xml tags
for part in parts:
        if not part:  # re.split with a capturing group can yield empty strings; skip them
pass
elif part.startswith('<') and part.endswith('>'):
outarr.append(part)
elif part.startswith('[Page') and part.endswith(']'):
outarr.append(part)
else:
# assume text in wx. Convert to slp1. Use specialized wx_slp1.xml
y = transcoder.transcoder_processString(part,'wx','slp1')
outarr.append(y)
ans = ''.join(outarr)
return ans
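Example 1 has two branches: a line that is itself a single XML tag is treated as a headword and its tag name is transcoded, while any other line has only the text outside XML tags (and outside [Page...] markers) converted from WX to SLP1. A minimal, hypothetical illustration of calling it (the sample strings are invented, and the output depends on the project's wx -> slp1 tables):

print(adjust_wx('<rAma>'))             # whole line is a tag: headword branch
print(adjust_wx('rAma <s>text</s>'))   # mixed line: only text outside tags is transcoded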
Example 2: r
def r(text):
#text1 = transcoder.transcoder_processString(text.decode('utf-8'),'deva','slp1')
wordtype = wtd(text)
text = transcoder.transcoder_processString(text,'deva','slp1')
text = text.strip('.')
url = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + text + '&t=SL&c=' + wordtype
response = urllib2.urlopen(url)
#print "webpage downloaded at ",
#timestamp()
html_doc = response.read()
soup = BeautifulSoup(html_doc, 'html.parser')
#print "soup made at ",
#timestamp()
interestingdiv = soup.find("div", { "class" : "center" })
table = interestingdiv.find("table", { "class" : "yellow_cent" })
span = table.tr.th.find("span", { "class" : "latin12" })
data = unicode(span).split('<br>\n')[1]
if wordtype not in ["Part", "Piic" ]:
verbattr_separator = unicode(data).split('}[')
attributes = verbattr_separator[0]
verbsoup = BeautifulSoup(verbattr_separator[1], 'html.parser')
verb = verbsoup.a.text
verb = re.sub("[0-9_]+", "", verb)
verb = transcoder.transcoder_processString(verb,'roman','slp1')
data = tosm(attributes)
m = []
if len(data) > 1:
for datum in data:
m.append(verb + '.' + datum)
output = '|'.join(m)
else:
output = verb + '.' + data[0]
elif wordtype in ["Part", "Piic" ]:
output = kridantaattributes(data)
return output
Example 3: key_transcode
def key_transcode(m,fromcode,tocode):
x1 = m.group(1)
key1=m.group(2)
x2 = m.group(3)
key2=m.group(4)
body=m.group(5)
key1a = transcoder.transcoder_processString(key1,fromcode,tocode)
key2a = transcoder.transcoder_processString(key2,fromcode,tocode)
out = "<H1>%s{%s}%s{%s}%s" %(x1,key1a,x2,key2a,body)
return out
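key_transcode is written as a callback for re.sub: the match object supplies the pieces surrounding two {...} key fields, and only the keys themselves are transcoded. A hedged sketch of how it might be wired up; the regular expression and sample line below are assumptions for illustration, not the project's actual pattern:

import re

line = '<H1>1{agni}2{agnO}rest of the entry'  # hypothetical dictionary line
pattern = r'<H1>(.*?)\{(.*?)\}(.*?)\{(.*?)\}(.*)'
converted = re.sub(pattern, lambda m: key_transcode(m, 'slp1', 'deva'), line)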
Example 4: alterations
def alterations(filein,fileout):
fin = codecs.open(filein,'r','utf-8')
data = fin.read()
fin.close()
data = data.strip()
print 'making preprocess changes'
data = changelist(data)
print "Debugging and writing to log.txt"
log = codecs.open('log.txt','a','utf-8')
log.write('#'+filein+"#\n")
words = data.split(' ')
counter=1
out = []
for i in xrange(len(words)):
word = words[i]
word = snchanges(word)
# Creating log for श ङ issue. See https://github.com/drdhaval2785/padamanjari/issues/1
"""
if re.search(r'\s["][sn]',word):
changed = snchanges(word)
#log.write(str(counter)+":"+word+"\n")
counter = counter+1
if not changed == word:
out.append(changed)
else:
out.append(word)
# Creating log for ङ issue. See https://github.com/drdhaval2785/padamanjari/issues/2
if re.search(r'"n[^aAiIuUfFxXeEoOykglnm]',word):
out.append(word)
rep = word.replace('\n',' ')
log.write(str(counter)+":"+rep+"\n")
counter = counter+1
else:
out.append(word)
"""
out.append(word)
data = ' '.join(out)
log.close()
print 'changing to slp1'
output = transcoder.transcoder_processString(data,'vel','slp1')
#fout1 = codecs.open(fileout,'w','utf-8')
#fout1.write(output)
#fout1.close()
output = slpchanges(output)
print 'changing to Devanagari'
output = transcoder.transcoder_processString(output,'slp1','deva')
output = output.replace('#','')
#output = output.replace('\n','<br/>')
print 'putting the data in output folder'
fout1 = codecs.open(fileout,'w','utf-8')
fout1.write(output)
fout1.close()
Example 5: unused_convertrecs
def unused_convertrecs(recs,tranin,tranout):
"Modifies recs"
n=0
for rec in recs:
n=n+1
try:
rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode )
if m:
print "TRANSCODER WARNING: ",m.group(0).encode('utf-8')
except:
print "convertrecs problem",n,rec.line.encode('utf-8')
Example 6: linking
def linking(fin,fout):
infile = codecs.open(fin,'r','utf-8')
input = infile.readlines()
input = triming(input)
outfile = codecs.open(fout,'w','utf-8')
#acc:akzoByatantre,41695:akzoByatantre:n:oBy -> acc:अक्षोभ्यतन्त्रे,41695:अक्षोभ्यतन्त्रे:n:oBy
for line in input:
[dict,headword,replica,errcode,note] = line.split(':')
[hw,lnum] = headword.split(',')
hw = transcoder.transcoder_processString(hw,'slp1','deva')
note = transcoder.transcoder_processString(note,'slp1','deva')
outfile.write(dict+':'+hw+','+lnum+':'+hw+':'+errcode+':'+note+'\n')
outfile.close()
print "Check", fout, "for testing"
Example 7: convertrecs
def convertrecs(recs,tranin,tranout):
"Modifies recs"
n=0
for rec in recs:
n=n+1
try:
rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode )
if m:
print "TRANSCODER WARNING: ",m.group(0).encode('utf-8')
# Undo some transcodings
rec.titleunicode = re.sub(r'YOLLY','JOLLY',rec.titleunicode) # JOLLY is an author
except:
print "convertrecs problem",n,rec.line.encode('utf-8')
Example 8: disp_md
def disp_md(dictcode,icase,L,hw0,url,page0,datalines):
""" return array of lines, formatted for details of GitHub Markdown
"""
outarr=[]
pageref = "[page %s](%s)" %(page0,url)
outarr.append(' Case %04d: %s %s ' % (icase,hw0,pageref))
datalines = adjust_datalines(dictcode,datalines)
# output up to 10 lines of datalines
outlines = datalines[0:10]
outarr.append('```')
# construct potential headword change record
out = "%s:%s,%s:%s:n:" %(dictcode,hw0,L,hw0)
outarr.append(out)
outarr.append('')
for x in outlines:
# Remove '|', which is a line-separator in CAE
x = re.sub(r'[|]','',x)
y = transcoder.transcoder_processString(x,'as','roman')
if (y.strip() != ''):
outarr.append('%s' % y)
if len(datalines)>10:
ndiff = len(datalines) - 10
outarr.append(' [and %s more lines]' % ndiff)
outarr.append('```')
outarr.append('------------------------------------------')
outarr.append('')
return outarr
Example 9: abbrv_transcode
def abbrv_transcode(p):
tranin = 'as'
tranout = 'roman1'
proman = transcoder.transcoder_processString(p,tranin,tranout)
# correct some errors:
proman = proman.replace('Yourn','Journ')
return proman
Example 10: adv
def adv(text):
input = text.split('.')
errormessage = 'not found as a'
if input[1] == 'adv':
url = 'http://sanskrit.inria.fr/cgi-bin/SKT/sktlemmatizer?lex=MW&q=' + input[0] + '&t=SL&c=Advb'
response = urllib2.urlopen(url).read()
if errormessage not in response:
return transcoder.transcoder_processString(input[0],'slp1','deva')
Example 11: iter
def iter(wordxml, strength="Full"):
if wordxml == "????":
return "????" # Error message
else:
wordxml = unicode(wordxml) # Converted the word to unicode
wordwithtags = [] # Empty list
individualentries = wordxml.split("|")
for individualentry in individualentries:
tree = StringIO(individualentry) # Created XML from the worddata
# print "parsing of iter started at", printtimestamp()
context = etree.parse(tree) # Parsed the element tree.
# print "parsing of iter ended at", printtimestamp()
root = context.getroot() # got the root of element tree e.g. 'f'
            # The next two steps need some explanation. In Gerard's XML files, all possible attributes are given as children of 'f'. The last child is always 's', which stores the stem. All other children are the various possible word attributes, given as 'na', 'v', etc.
children = root.getchildren()[:-1] # attributes
basedata = root.getchildren()[-1] # 's' stem
basewordslp = basedata.get("stem").strip() # Base word in SLP1 encoding.
if strength == "deva":
baseword = transcoder.transcoder_processString(
basewordslp, "slp1", "deva"
) # If the user wants output in Devanagari rather than SLP1, this code converts it to Devanagari.
else:
baseword = basewordslp # Otherwise in SLP1.
attributes = [] # An empty list to store attributes.
for child in children:
taglist = child.xpath(
".//*"
) # Fetches all elements (abbreviations) of a particular verb / word characteristics.
output = [child.tag] # The first member of output list is the tag of element 'v', 'na' etc.
output = output + [
tagitem.tag for tagitem in taglist
] # Other tags (abbreviations) and add it to output list.
            # The following section is commented out right now, but it would be needed for situations where we need to know the gaNa of a verb or the 7 kinds of aorist derivation.
"""if len(child.xpath('.//prs[@gn]')) > 0:
prsgana = child.xpath('.//prs')[0].get('gn')
output.append('verbgana')
output.append(prsgana)
elif len(child.xpath('.//aor[@gn]')) > 0:
aorgana = child.xpath('.//aor')[0].get('gn')
output.append('aoristgana')
output.append(aorgana)
elif len(child.xpath('.//inj[@gn]')) > 0:
injgana = child.xpath('.//inj')[0].get('gn')
output.append('injunctivegana')
output.append(injgana)"""
attributes.append(output) # output list is appended to attributes list.
if strength == "deva":
outputlist = converttodevanagari(attributes) # Devanagari
else:
outputlist = attributes # SLP1
for member in outputlist:
wordwithtags.append(
baseword + "-" + "-".join(member)
) # Created a list wordwithtags where the first member is baseword and the rest of the members are attributes separated by '-'
# print "postprocessing of iter ended at", printtimestamp()
return "|".join(
wordwithtags
) # If there are more than one possible verb characteristics for a given form, they are shown separated by a '|'
Example 12: convertfromfile
def convertfromfile(inputfile, outputfile):
f = codecs.open(inputfile, "r", "utf-8") # Opened inputfile with UTF-8 encoding.
data = f.readlines() # Read the lines into a list.
f.close() # Closed the inputfile.
g = codecs.open(outputfile, "w", "utf-8") # Opened the outputfile with UTF-8 encoding.
for datum1 in data: # For each member of data,
datum1 = datum1.strip() # Removed unnecessary whitespaces.
datum1 = transcoder.transcoder_processString(datum1, "deva", "slp1") # Converted from Devanagari to SLP1.
dat = re.split("(\W+)", datum1) # Created a word list by exploding the sentence at word boundaries.
for i in xrange(len(dat)):
datum = dat[i].strip() # Clean whitespaces.
if i % 2 == 0 and i != len(
dat
): # Even members of datum are the words and odd members are word boundaries. Therefore, processing only even members.
# print "analysis of word started", printtimestamp()
x = devanagaridisplay(datum) # Analysed the even members.
# print "analysis of word ended", printtimestamp()
g.write(
transcoder.transcoder_processString(datum, "slp1", "deva") + "(" + x + ")"
) # Wrote to the outputfile.
print transcoder.transcoder_processString(
datum, "slp1", "deva"
) + "(" + x + ")" # printed to the screen for the user.
# print "wrote to the file", printtimestamp()
else:
g.write(
transcoder.transcoder_processString(dat[i], "slp1", "deva")
) # For odd members, converted the word boundaries to their Devanagari counterparts.
print transcoder.transcoder_processString(
dat[i], "slp1", "deva"
) # For odd members, converted the word boundaries to their Devanagari counterparts.
g.write("\n") # Newline character added
print # Newline character printed on terminal.
g.close() # Closed outputfile.
Example 13: convertline
def convertline(line,tranfrom,tranto):
"""
"""
parts=line.split('@')
# 4th part is the part to convert
if tranfrom == 'roman2':
parts[4] = parts[4].lower()
parts[4] = transcoder.transcoder_processString(parts[4],tranfrom,tranto)
return '@'.join(parts)
Example 14: dev
def dev(file):
f = codecs.open(file, 'r+', 'utf-8-sig')
data = f.read()
data = transcoder.transcoder_processString(data,'slp1','deva')
data = re.sub(u'ळ्ह्', '|', data)
f.close()
g = codecs.open("hindidevanagariverbform.txt", "w+", "utf-8-sig")
g = codecs.open("skd_deva.txt", "w+", "utf-8-sig")
g.write(data)
g.close()
Example 15: convertline
def convertline(line,tranfrom,tranto):
""" do transcoder, but don't convert [Page...]
"""
parts=line.split('[Page')
parts[0] = transcoder.transcoder_processString(parts[0],tranfrom,tranto)
if re.search(r'[a-zA-Z][0-9]',parts[0]):
unconverted=True
else:
unconverted=False
return (unconverted,'[Page'.join(parts))
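A hedged sketch of driving Example 15's convertline over a whole file, reporting the lines where the transcoder left Latin letter/digit residue behind. The file name and the 'as' -> 'roman' codes are placeholders, not taken from the source:

import codecs

with codecs.open('input.txt', 'r', 'utf-8') as f:
    for lineno, line in enumerate(f, 1):
        unconverted, converted = convertline(line.rstrip('\n'), 'as', 'roman')
        if unconverted:
            print('check line %d: %s' % (lineno, converted))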