本文整理汇总了Python中Utils.Utils.readSentences方法的典型用法代码示例。如果您正苦于以下问题:Python Utils.readSentences方法的具体用法?Python Utils.readSentences怎么用?Python Utils.readSentences使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类Utils.Utils
的用法示例。
在下文中一共展示了Utils.readSentences方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: Utils
# 需要导入模块: from Utils import Utils [as 别名]
# 或者: from Utils.Utils import readSentences [as 别名]
## Both the phrases are same, for dual structure principle
if l1PhraseTag == l2PhraseTag or l2PhraseTag in self.__phraseMap[l1PhraseTag]:
# Debugging !!
#sys.stderr.write("Alignment: "+str(align)+"\n")
#sys.stderr.write("L2Sequence: "+str(l2Sequence)+" Same Const: "+l1PhraseTag+"\n")
##############
break
return l1Sequence
if __name__ == "__main__":
    # Smoke test: load the Hinglish parallel corpora, alignments and the
    # monolingual pools, then generate one code-switched sentence.
    # NOTE(review): all paths are hard-coded to one specific machine — adjust
    # before running elsewhere.  Code is Python 2 (print statements).
    l2Data = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/engParse.wx"
    l1Data = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/hinParse.wx"
    l2Aligns = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/engAlign.wx"
    l1Aligns = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/Parallel/Good2.0/trainSingle/hinAlign.wx"
    pureL1 = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/HinPOS/hindiTrain.wx"
    pureL2 = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/EngPOS/train.0-18.tsv"
    U = Utils()
    parL1 = U.readSentences(l1Data)   # parsed L1 (Hindi) side of the parallel data
    parL2 = U.readSentences(l2Data)   # parsed L2 (English) side
    align = U.readAligns(l1Aligns, l2Aligns)   # word alignments between the two sides
    # NOTE: pureL1/pureL2 are rebound from path strings to sentence lists here.
    pureL1 = U.readSentencesPlain(pureL1)
    pureL2 = U.readSentencesPlain(pureL2)
    # Debugging aids:
    # print parL1[0]; print parL2[0]; print pureL1[0]; print pureL2[0]
    print align[0]
    CS = CSHandler()
    CS.updateHandler(parL1[0], parL2[0], align[0], 0)
    print CS.csSentence(4)
示例2: __init__
# 需要导入模块: from Utils import Utils [as 别名]
# 或者: from Utils.Utils import readSentences [as 别名]
#.........这里部分代码省略.........
self.align = []
self.pureL1 = []
self.pureL2 = []
self.L1Tags = set()
self.L2Tags = set()
self.commonTags = set()
self.posMap = {}
## Pre-processing
self.genPosMap()
## Others
self.__utils = Utils()
def addLangTags(self, WordTags, lTag):
    """Return a copy of every [word, tag, ...] row with *lTag* appended.

    The input rows are left untouched; each output row is a fresh list.
    """
    tagged = []
    for wt in WordTags:
        row = list(wt)      # shallow copy so the caller's row is not mutated
        row.append(lTag)
        tagged.append(row)
    return tagged
def makeLD(self, wordsTagsLangs):
    """Fuse each row's POS tag with its language tag as 'tag_lang'.

    Works on deep copies, so the caller's [word, tag, lang] rows are
    left unchanged; the fused rows are returned as a new list.
    """
    fused = []
    for row in wordsTagsLangs:
        clone = copy.deepcopy(row)
        clone[1] = '_'.join([clone[1], clone[2]])
        fused.append(clone)
    return fused
def genPosMap(self):
    """Load the L1 and L2 tag-mapping files and build the tag lookup tables.

    Each map file contains whitespace-separated "srcTag uniTag" pairs, one
    per line.  Populates:
      - self.posMap:     source tag -> universal tag (L2 entries overwrite
                         L1 entries on collision, matching the original
                         l1-then-l2 read order),
      - self.L1Tags / self.L2Tags: the source tagsets of each language,
      - self.commonTags: tags present in both tagsets.

    Fixes over the previous version: each file is read once instead of
    twice, handles are closed via ``with`` (they were leaked before), and
    blank/short lines are skipped instead of raising IndexError.
    """
    # Reset so repeated calls do not accumulate stale tags.
    self.L1Tags = set()
    self.L2Tags = set()
    for mapFile, tagSet in ((self.l1MapFile, self.L1Tags),
                            (self.l2MapFile, self.L2Tags)):
        with open(mapFile) as fh:
            for line in fh:
                parts = line.split()
                if len(parts) < 2:   # tolerate blank or malformed lines
                    continue
                self.posMap[parts[0]] = parts[1]
                tagSet.add(parts[0])
    self.commonTags = self.L1Tags & self.L2Tags
def map2Uni(self, wordTagsLangs):
    """Replace each row's POS tag with its universal tag from self.posMap.

    Unknown tags collapse to 'X'.  NOTE: the rows are shared, not copied —
    the caller's data is updated in place, and the same row objects are
    also returned as a new list (original behavior, preserved).

    Fix: the bare ``except:`` (which swallowed *every* exception, not just
    the missing-key case) is replaced with ``dict.get``.
    """
    newLine = []
    for row in wordTagsLangs:
        row[1] = self.posMap.get(row[1], 'X')
        newLine.append(row)
    return newLine
def mapLD2Uni(self, wordTagsLangs):
    """Map 'tag_lang' fused tags to their universal form 'uniTag_lang'.

    The source tag is the part before the first underscore; unknown tags
    map to 'X'.  Input rows are deep-copied, so the caller's data is left
    unchanged (original behavior, preserved).

    Fix: the bare ``except:`` (which swallowed *every* exception, not just
    the missing-key case) is replaced with ``dict.get``.
    """
    newLine = []
    for row in wordTagsLangs:
        rowCopy = copy.deepcopy(row)
        tag = rowCopy[1].split("_")[0]
        lang = rowCopy[2]
        rowCopy[1] = self.posMap.get(tag, 'X') + "_" + lang
        newLine.append(rowCopy)
    return newLine
def map2UniControl(self, wordTagsLangs):
    """Replace each row's POS tag with 'uniTag_lang' (universal tag fused
    with the row's language tag).

    Unknown tags map to 'X'.  NOTE: like map2Uni, the rows are shared, not
    copied — the caller's data is updated in place and the same row objects
    are returned (original behavior, preserved).

    Fix: the bare ``except:`` (which swallowed *every* exception, not just
    the missing-key case) is replaced with ``dict.get``.
    """
    newLine = []
    for row in wordTagsLangs:
        row[1] = self.posMap.get(row[1], 'X') + '_' + row[2]
        newLine.append(row)
    return newLine
def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    """Read the parallel corpora, their alignments and the monolingual
    pools into the instance containers, then report each container's size
    on stderr."""
    self.parL1 = self.__utils.readSentences(l1Data)
    self.parL2 = self.__utils.readSentences(l2Data)
    self.align = self.__utils.readAligns(l1Aligns, l2Aligns)
    self.pureL1 = self.__utils.readSentencesPlain(pureL1Data)
    self.pureL2 = self.__utils.readSentencesPlain(pureL2Data)
    loaded = [("parL1", self.parL1), ("parL2", self.parL2),
              ("align", self.align), ("pureL1", self.pureL1),
              ("pureL2", self.pureL2)]
    for label, rows in loaded:
        sys.stderr.write(label + ":" + str(len(rows)) + "\n")
示例3: __init__
# 需要导入模块: from Utils import Utils [as 别名]
# 或者: from Utils.Utils import readSentences [as 别名]
class DataGenerator:
def __init__(self, outDir):
    """Set up containers, mapping-file paths and generation settings for
    producing code-switched POS training data under *outDir*."""
    sys.stderr.write("DataGenerator: Constructor\n")
    ## Languages and Order
    self.__LID = ["HI","EN"]  # language-ID tags: index 0 = L1, index 1 = L2
    # NOTE(review): hard-coded, machine-specific map-file paths.
    self.__l2MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/en-ptb.map"
    self.__l1MapFile = "/usr0/home/pgadde/Work/CodeSwitching/Hinglish/Data/UniMaps/hi-hyd.map"
    ## Data containers (filled later by loadData)
    self.__parL1 = []   # parallel L1 sentences
    self.__parL2 = []   # parallel L2 sentences
    self.__align = []   # word alignments between the parallel sides
    self.__pureL1 = []  # monolingual L1 sentences
    self.__pureL2 = []  # monolingual L2 sentences
    self.__outputDir = outDir
    self.__posMap = {}           # source POS tag -> universal tag
    self.__phraseMap = dd(list)  # phrase-tag correspondences (filled by __genPhraseMap)
    self.__csInstance = CSHandler()
    self.__utils = Utils()
    self.__Tree = Dependencytree()
    ## Generation Variants
    self.__csVariants = [0,1,2,3,4]      # code-switching strategies to generate
    self.__tagsetVariants = ["",".uni"]  # "" = original tagset, ".uni" = universal
    self.__dataRange = range(50,900,50)  # training-set sizes to sweep
    ##self.__dataRange = [200]
    self.__splits = [(50,50),(60,40),(70,30),(80,20),(90,10)]  # (pure, CS) proportions
    self.__csHash = set()  # de-duplicates generated CS sentences
    ##LID stuff
    self.__L1Tags = set()
    self.__L2Tags = set()
    self.__commonTags = set()
    ## Pre processing
    self.__genPosMap()
    self.__genPhraseMap()
    self.__csInstance.updatePhraseMap(self.__phraseMap)
    self.__csInstance.updateLIDTags(self.__LID[0], self.__LID[1])
    ## Real test overwrites: the assignments below deliberately override the
    ## sweep defaults configured above for the current experiment.
    #self.__csVariants = [1,2,3,4]
    self.__tagsetVariants = [""]
    self.__dataRange = [400]
    self.__dataRanges = {0:range(40,601,40), 1:range(40,601,40), 2:range(35,540,35), 3:range(30,451,30), 4:range(15,231,15)}  # per-CS-type size sweeps
    #self.__dataRanges = {0:[880], 1:[880], 2:[800], 3:[630], 4:[330]}
    #self.__dataRanges = {0:[60], 1:[60], 2:[60], 3:[60], 4:[60]}
    #self.__splits = [(50,50)]
    #for i in range(0,51,5):
    #    split = (100-i, i)
    #    self.__splits.append(split)
    self.__fileSuffix = ""  # appended to every generated file name
def loadData(self, l1Data, l2Data, l1Aligns, l2Aligns, pureL1Data, pureL2Data):
    """Load the parallel corpora, their alignments and the monolingual
    pools into the private containers, then report each container's size
    on stderr."""
    self.__parL1 = self.__utils.readSentences(l1Data)
    self.__parL2 = self.__utils.readSentences(l2Data)
    self.__align = self.__utils.readAligns(l1Aligns, l2Aligns)
    self.__pureL1 = self.__utils.readSentencesPlain(pureL1Data)
    self.__pureL2 = self.__utils.readSentencesPlain(pureL2Data)
    loaded = [("parL1", self.__parL1), ("parL2", self.__parL2),
              ("align", self.__align), ("pureL1", self.__pureL1),
              ("pureL2", self.__pureL2)]
    for label, rows in loaded:
        sys.stderr.write(label + ":" + str(len(rows)) + "\n")
def __genTrainData(self):
statusCount = 0
for data in self.__dataRange:
#control = 0
#while 1:
for Split in self.__splits:
#for control in range(3):
#if control == 3:
# break
#pr= int(control*1.0/2 * data)
pr= int((Split[0]*1.0/(Split[0]+Split[1])) * data)
tr = data - pr
pr = pr/2
print pr
random.seed()
pIndicesL1 = random.sample(range(len(self.__pureL1)),pr)
pIndicesL2 = random.sample(range(len(self.__pureL2)),pr)
for csType in self.__csVariants:
self.__csHash = set()
##sys.stderr.write("csType:"+str(csType)+'\n')
# Debugging !!
#switch = ""
#############
#for tag in self.__tagsetVariants:
# Debugging !!
#if switch == "yes":
# break
###################
#sys.stderr.write(outputDir+"Train"+cs+str(len(trainVariants[tr]))+"Pure"+str(len(pureVariants[pr]))+tag+"\n")
sys.stderr.write(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix+'\n')
dataFile = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+self.__fileSuffix,'w')
##dataFileUni = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni",'w')
##dataFileUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uniq",'w')
##dataFileUniUniq = open(self.__outputDir+"TrainCSType"+str(csType)+"CS"+str(Split[1])+"Pure"+str(Split[0])+"Total"+str(data)+".uni.uniq",'w')
#.........这里部分代码省略.........