This page collects typical usage examples of the Python method Utils.ProgressCounter.ProgressCounter.showMilliseconds. If you are unsure what ProgressCounter.showMilliseconds does or how to use it, the curated example code below may help. You can also read more about the class the method belongs to, Utils.ProgressCounter.ProgressCounter.
The following presents 5 code examples of the ProgressCounter.showMilliseconds method, ordered by popularity by default.
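All five examples follow the same pattern: construct a ProgressCounter with the number of items to process (or None when the total is unknown), enable millisecond-resolution timing by setting showMilliseconds, call update() as items are processed, and optionally call markFinished() at the end. Below is a minimal sketch of that pattern, using only the calls that appear in the examples on this page; the item list and the processing step are hypothetical placeholders.

from Utils.ProgressCounter import ProgressCounter

items = ["doc1", "doc2", "doc3"]  # hypothetical work items
counter = ProgressCounter(len(items), "Example")  # total count and counter name
counter.showMilliseconds = True  # report elapsed time with millisecond precision
for item in items:
    # ... process the item here ...
    counter.update(1, "Processing (" + item + "): ")  # advance the counter by one item
counter.markFinished()  # mark the counter as completed (as in Example 3)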
Example 1: loadCorpus
# Required import: from Utils.ProgressCounter import ProgressCounter [as alias]
# Or: from Utils.ProgressCounter.ProgressCounter import showMilliseconds [as alias]
def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
"""
Load an entire corpus through CorpusElements and add SentenceGraph-objects
to its SentenceElements-objects.
"""
import cElementTreeUtils as ETUtils
import sys
sys.path.append("..")
from Utils.ProgressCounter import ProgressCounter
from InteractionXML.CorpusElements import CorpusElements
# The corpus may be given as a file name or as an already-loaded object
if type(corpus) == types.StringType:
print >> sys.stderr, "Loading corpus file", corpus
corpusTree = ETUtils.ETFromObj(corpus)
corpusRoot = corpusTree.getroot()
# Use CorpusElements-class to access xml-tree
corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"
# Make sentence graphs
duplicateInteractionEdgesRemoved = 0
sentences = []
counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
counter.showMilliseconds = True
for sentence in corpusElements.sentences[:]:
counter.update(1, "Making sentence graphs ("+sentence.sentence.get("id")+"): ")
# No tokens, no sentence. Likewise, no dependencies = no sentence.
# Let's not remove them though, so that we don't lose sentences from input.
if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0:
#corpusElements.sentences.remove(sentence)
sentence.sentenceGraph = None
continue
for pair in sentence.pairs:
# gif-xml defines two closely related element types, interactions and
# pairs. Pairs are like interactions, but they can also be negative (if
# interaction-attribute == False). Sometimes pair-elements have been
# (incorrectly) used without this attribute. To work around these issues
# we take all pair-elements that define interaction and add them to
# the interaction-element list.
isInteraction = pair.get("interaction")
if isInteraction == "True" or isInteraction == None:
sentence.interactions.append(pair) # add to interaction-elements
if pair.get("type") == None: # type-attribute must be explicitly defined
pair.set("type", "undefined")
# Construct the basic SentenceGraph (only syntactic information)
graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
# Add semantic information, i.e. the interactions
graph.mapInteractions(sentence.entities, sentence.interactions)
graph.interSentenceInteractions = sentence.interSentenceInteractions
duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
sentence.sentenceGraph = graph
graph.parseElement = sentence.parseElement
#graph.mapEntityHints()
print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
return corpusElements
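A hypothetical invocation of loadCorpus; the corpus file name is a placeholder, and "McCC" is simply one plausible parse name (it appears as the default in Example 5):

# Load an Interaction XML corpus and build a SentenceGraph for each sentence
corpusElements = loadCorpus("corpus.xml", "McCC")
for sentence in corpusElements.sentences:
    if sentence.sentenceGraph is not None:
        pass  # work with the syntactic/semantic graph here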
Example 2: findHeads
# Required import: from Utils.ProgressCounter import ProgressCounter [as alias]
# Or: from Utils.ProgressCounter.ProgressCounter import showMilliseconds [as alias]
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
if iterate:
from Utils.ProgressCounter import ProgressCounter
import InteractionXML.SentenceElements as SentenceElements
print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
print >> sys.stderr, "Removing existing head offsets"
removeCount = 0
counter = ProgressCounter(None, "Find heads")
counter.showMilliseconds = True
for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
for sentence in sentences:
if removeExisting:
for e in sentence.sentence.findall("entity"):
if e.get("headOffset") != None:
removeCount += 1
del e.attrib["headOffset"]
graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
graph.mapInteractions(sentence.entities, sentence.interactions)
# Make sure every parse gets head scores
#if graph.tokenHeadScores == None:
# graph.getTokenHeadScores()
counter.update(len(sentences), "Finding heads ("+sentences[-1].sentence.get("id")+"): ")
print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
else:
xml = ETUtils.ETFromObj(input)
if removeExisting:
print >> sys.stderr, "Removing existing head offsets"
removeCount = 0
xml = ETUtils.ETFromObj(input)
for d in xml.getroot().findall("document"):
for s in d.findall("sentence"):
for e in s.findall("entity"):
if e.get("headOffset") != None:
removeCount += 1
del e.attrib["headOffset"]
print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
# SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
# Make sure every parse gets head scores
for sentence in corpusElements.sentences:
if sentence.sentenceGraph == None:
continue
if sentence.sentenceGraph.tokenHeadScores == None:
sentence.sentenceGraph.getTokenHeadScores()
if output != None:
print >> sys.stderr, "Writing output to", output
ETUtils.write(corpusElements.rootElement, output)
return xml
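A hypothetical invocation of findHeads; the file names are placeholders, and iterate=True selects the streaming SentenceElements.getCorpusIterator branch shown above:

# Recompute head offsets for all entities, processing the corpus sentence by sentence
findHeads("corpus.xml", "McCC", output="corpus-with-heads.xml", removeExisting=True, iterate=True)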
Example 3: waitForProcess
# Required import: from Utils.ProgressCounter import ProgressCounter [as alias]
# Or: from Utils.ProgressCounter.ProgressCounter import showMilliseconds [as alias]
def waitForProcess(process, numCorpusSentences, measureByGap, outputFile, counterName, updateMessage, timeout=None):
"""
Waits for a process to finish, and tracks the number of sentences it writes
to its output file. If writing a sentence takes longer than the timeout,
the process is considered stalled and is killed.
"""
maxStartupTime = 600 # Give the process extra time to start up (even if it immediately creates an empty output file)
counter = ProgressCounter(numCorpusSentences, counterName)
counter.showMilliseconds = True
prevNumSentences = 0 # Number of output sentences on previous check
finalCheckLeft = True # Make one final check to update counters
processStatus = None # When None, process not finished
prevTime = time.time()
startTime = time.time()
# Wait until the process is finished and periodically check its progress.
while processStatus == None or finalCheckLeft:
if processStatus != None: # Extra loop to let counters finish
finalCheckLeft = False # Done only once
if os.path.exists(outputFile[0]): # Output file has already appeared on disk
# Measure number of sentences in output file
numSentences = 0
f = codecs.open(outputFile[0], "rt", **outputFile[1])
for line in f:
if measureByGap:
if line.strip() == "":
numSentences += 1
else:
numSentences += 1
f.close()
# Update status
if numSentences - prevNumSentences != 0: # Process has progressed
counter.update(numSentences - prevNumSentences, updateMessage + ": ")
if finalCheckLeft: # This is a normal loop, not the final check
# Startup time hasn't passed yet or the process has made progress
if time.time() - startTime < maxStartupTime or numSentences - prevNumSentences != 0:
#if prevNumSentences == 0 or numSentences - prevNumSentences != 0:
prevTime = time.time() # reset timeout
else: # Nothing happened on this update, check whether process hung
elapsedTime = time.time() - prevTime
if timeout != None and elapsedTime > timeout:
print >> sys.stderr, "Process timed out (" + str(elapsedTime) + " vs. " + str(timeout) + ")"
print >> sys.stderr, "Killing process"
process.kill()
prevNumSentences = numSentences
time.sleep(1)
else: # Output file doesn't exist yet
prevTime = time.time() # reset counter if output file hasn't been created
processStatus = process.poll() # Get process status, None == still running
counter.markFinished() # If we get this far, don't show the error message even if process didn't finish
return (numSentences, numCorpusSentences)
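A hypothetical invocation of waitForProcess, inferred from how outputFile is opened above (a tuple of a path and codecs.open keyword arguments); the command and file names are placeholders:

import subprocess
# Launch an external tool (placeholder command) and monitor the sentences it writes
proc = subprocess.Popen(["some-parser", "input.txt", "parsed.txt"])
numParsed, numTotal = waitForProcess(proc, numCorpusSentences=1000, measureByGap=True,
                                     outputFile=("parsed.txt", {"encoding": "utf-8"}),
                                     counterName="Parse", updateMessage="Parsing", timeout=600)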
Example 4: makeSentences
# Required import: from Utils.ProgressCounter import ProgressCounter [as alias]
# Or: from Utils.ProgressCounter.ProgressCounter import showMilliseconds [as alias]
def makeSentences(input, output=None, removeText=False, postProcess=True, debug=False):
"""
Run GENIA Sentence Splitter
Divide text in the "text" attributes of document and section
elements into sentence elements. These sentence elements are
inserted into their respective parent elements.
"""
global sentenceSplitterDir
print >> sys.stderr, "Loading corpus", input
corpusTree = ETUtils.ETFromObj(input)
print >> sys.stderr, "Corpus file loaded"
corpusRoot = corpusTree.getroot()
print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
if postProcess:
print >> sys.stderr, "(Using post-processing)"
else:
print >> sys.stderr, "(No post-processing)"
docCount = 0
sentencesCreated = 0
redivideCount = 0
sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
counter.showMilliseconds = True
# Create working directory
workdir = tempfile.mkdtemp()
for document in sourceElements:
counter.update(1, "Splitting Documents ("+document.get("id")+"): ")
docId = document.get("id")
if docId == None:
docId = "CORPUS.d" + str(docCount)
docTag = "-" + str(docCount)
assert document.find("sentence") == None
text = document.get("text")
if text == None or text.strip() == "":
continue
#print type(text)
# Write text to workfile
#workdir = tempfile.mkdtemp()
workfile = codecs.open(os.path.join(workdir, "sentence-splitter-input.txt"+docTag), "wt", "utf-8")
# From http://themoritzfamily.com/python-encodings-and-unicode.html
# "You have to be careful with the codecs module. Whatever you pass to it must be a Unicode
# object otherwise it will try to automatically decode the byte stream as ASCII"
# However, the unicode errors here were simply due to STTools reading unicode ST-format as ASCII,
# thus creating an ASCII interaction XML, which then triggered here the unicode error. So, at this
# point we should be able to safely write(text), as the output file is unicode, and reading with
# the correct codec is taken care of earlier in the pipeline.
workfile.write(text) #.encode("utf-8"))
workfile.close()
# Run sentence splitter
assert os.path.exists(Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
args = [Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh", os.path.join(workdir, "sentence-splitter-input.txt"+docTag), os.path.join(workdir, "sentence-splitter-output.txt"+docTag), Settings.RUBY_PATH]
#p = subprocess.call(args)
p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
if stdout != "":
print >> sys.stderr, stdout
if stderr != 'Extracting events.roading model file.\nstart classification.\n':
print >> sys.stderr, stderr
#print "stdout<", p.stdout.readlines(), ">"
#print "stderr<", p.stderr.readlines(), ">"
if postProcess:
ppIn = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")
ppOut = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "wt", "utf-8")
subprocess.call(os.path.join(Settings.GENIA_SENTENCE_SPLITTER_DIR, "geniass-postproc.pl"), stdin=ppIn, stdout=ppOut)
ppIn.close()
ppOut.close()
# Read split sentences
workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt"+docTag), "rt", "utf-8")
else:
workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt"+docTag), "rt", "utf-8")
start = 0 # sentences are consecutively aligned to the text for charOffsets
sentenceCount = 0
#text = text.replace("\n", " ") # should stop sentence splitter from crashing.
#text = text.replace(" ", " ") # should stop sentence splitter from crashing.
#alignmentText = text.replace("\n", " ").replace("\r", " ")
#docTokens = reWhiteSpace.split(text)
docIndex = 0
sentenceBeginIndex = -1
prevSentence = None
prevEndIndex = None
emptySentenceCount = 0
prevText = None
for sText in workfile.readlines():
sText = sText.strip() # The text of the sentence
if sText == "":
emptySentenceCount += 1
continue
for i in range(len(sText)):
if sText[i].isspace():
assert sText[i] not in ["\n", "\r"]
continue
while text[docIndex].isspace():
if text[docIndex] in ["\n", "\r"] and sentenceBeginIndex != -1:
redivideCount += 1
prevSentence = makeSentence(text, sentenceBeginIndex, docIndex-1, prevSentence, prevEndIndex)
prevSentence.set("id", docId + ".s" + str(sentenceCount))
#......... (rest of the code omitted) .........
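A hypothetical invocation of makeSentences; the input and output paths are placeholders, and Settings.GENIA_SENTENCE_SPLITTER_DIR and Settings.RUBY_PATH must already be configured, as the code above requires:

# Split the text of document/section elements into sentence elements
makeSentences("corpus.xml", output="corpus-sentences.xml", postProcess=True)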
Example 5: mainFunc
# Required import: from Utils.ProgressCounter import ProgressCounter [as alias]
# Or: from Utils.ProgressCounter.ProgressCounter import showMilliseconds [as alias]
def mainFunc(input, output=None, parseName="McCC", tokenizationName=None, newParseName=None, newTokenizationName=None, logFileName=None, removeOld=True):
print >> sys.stderr, "Protein Name Splitter"
if logFileName != None:
print >> sys.stderr, "Writing log to", logFileName
logFile = open(logFileName, "wt")
else:
logFile = None
#if input.endswith(".gz"):
# inFile = gzip.GzipFile(input)
#else:
# inFile = open(input)
tree = ETUtils.ETFromObj(input)
if tokenizationName == None:
tokenizationName = parseName
#tree = ElementTree.parse(inFile)
root = tree.getroot()
sentences = [x for x in root.getiterator("sentence")]
counter = ProgressCounter(len(sentences), "Split Protein Names")
counter.showMilliseconds = True
missingTokCount = 0
for sentence in sentences:
sId = sentence.get("id")
counter.update(1, "Splitting names ("+sId+"): ")
tok = getTokenization(tokenizationName, sentence, sId, remove=removeOld)
if tok == None:
missingTokCount += 1
continue
assert tok is not None, "Missing tokenization '%s' in sentence %s!" % (tokenizationName, sId)
parse = getParse(parseName, tokenizationName, sentence, sId, remove=removeOld)
assert parse is not None, "Missing parse '%s' in sentence %s!" % (parseName, sId)
split = splitTokens(tok, sentence, logFile)
# Default names
if removeOld:
if newTokenizationName == None:
newTokenizationName = tok.get("tokenizer")
if newParseName == None:
newParseName = parse.get("parser")
else:
if newTokenizationName == None:
newTokenizationName = "split-" + tok.get("tokenizer")
if newParseName == None:
newParseName = "split-" + parse.get("parser")
# add a new tokenization with the split tokens.
splittok = addTokenization(newTokenizationName, sentence, sId)
addTokensToTree(split, splittok)
for a in tok.attrib:
if splittok.get(a) == None:
splittok.set(a, tok.get(a))
#splittok.set("split-")
# make a mapping from original to split token ids. Store the
# head token when given.
tokenIdMap = {}
for t in split:
if t.head:
head = t.head
# traverse
while head.head is not None:
assert head.head != t, "Cyclic heads"
head = head.head
# should match (nah, punctuation problems)
# assert t.origId not in tokenIdMap or tokenIdMap[t.origId] == head.id, "Head conflict"
tokenIdMap[t.origId] = head.id
else:
# only allow overwrite of existing entry if the current token
# is not punctuation.
if t.origId not in tokenIdMap or not t.isPunct():
tokenIdMap[t.origId] = t.id
# make a copy of the specified parse that refers to the split tokens
# instead of the originals.
newparse = addParse(newParseName, newTokenizationName, sentence, sId)
for a in parse.attrib:
if newparse.get(a) == None:
newparse.set(a, parse.get(a))
newparse.set("ProteinNameSplitter", "True")
splittok.set("ProteinNameSplitter", "True")
depSeqId = 0 #1
for d in parse.getiterator("dependency"):
t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"
dep = ElementTree.SubElement(newparse, "dependency")
dep.set("t1", tokenIdMap[t1])
dep.set("t2", tokenIdMap[t2])
dep.set("type", dType)
dep.set("id", "sd_%d" % depSeqId)
depSeqId += 1
#......... (rest of the code omitted) .........
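A hypothetical invocation of mainFunc (the Protein Name Splitter); the file names are placeholders, and "McCC" is the default parse name from the signature above:

# Split multi-token protein names and add the resulting tokenization and parse to the corpus
mainFunc("corpus.xml", output="corpus-split.xml", parseName="McCC", logFileName="protein-name-splitter.log")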