当前位置: 首页>>代码示例>>Python>>正文


Python textract.process函数代码示例

本文整理汇总了Python中textract.process函数的典型用法代码示例。如果您正苦于以下问题:Python process函数的具体用法?Python process怎么用?Python process使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了process函数的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_missing_filename_python

 def test_missing_filename_python(self):
     """A path that does not exist must raise textract's MissingFileError."""
     import textract
     from textract.exceptions import MissingFileError

     path = self.get_temp_filename()
     os.remove(path)  # guarantee the file is gone before processing
     with self.assertRaises(MissingFileError):
         textract.process(path)
开发者ID:deanmalmgren,项目名称:textract,代码行数:8,代码来源:test_exceptions.py

示例2: test_unsupported_extension_python

 def test_unsupported_extension_python(self):
     """A file with an unrecognized extension must raise ExtensionNotSupported."""
     import textract
     from textract.exceptions import ExtensionNotSupported

     path = self.get_temp_filename(extension="extension")
     with self.assertRaises(ExtensionNotSupported):
         textract.process(path)
     os.remove(path)
开发者ID:deanmalmgren,项目名称:textract,代码行数:8,代码来源:test_exceptions.py

示例3: annotate_doc

def annotate_doc(pdf_file_path, ontologies):
    if pdf_file_path.endswith('pdf') or pdf_file_path.endswith('PDF'):
        text = textract.process(pdf_file_path, method="pdfminer")
    elif pdf_file_path.endswith('html') or pdf_file_path.endswith('htm'):
        text = textract.process(pdf_file_path, method="beautifulsoup4")
    elif pdf_file_path.endswith('txt'):
            with open(pdf_file_path, 'r') as file:
                text = file.read()
    db = DBConnect()
    if text.isspace():
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Failed PDF to text transformation in annotation process',
            'exception': '',
            'data': ''
        }
        db.insert_log(log)
        return
    ontologies = ",".join(ontologies)
    annotations = []
    text = unidecode(text.decode('utf8'))
    text = ' '.join(text.split())
    # post_data = dict(apikey=settings.BIOPORTAL_API_KEY, text=text,
    #                  display_links='true', display_context='false', minimum_match_length='3',
    #                  exclude_numbers='true', longest_only='true', ontologies=ontologies, exclude_synonyms='true')
    post_data = dict(apikey=settings.BIOPORTAL_API_KEY, text=text,
                     display_links='true', display_context='false', minimum_match_length='3',
                     exclude_numbers='true', longest_only='true', ontologies=ontologies, exclude_synonyms='true')
    try:
        response = requests.post(settings.ANNOTATOR_URL, post_data)
        json_results = json.loads(response.text)
        for result in json_results:
            for annotation in result['annotations']:
                context_begin = annotation['from']  if annotation['from'] - 40 < 1 else annotation['from'] - 40
                context_end = annotation['to'] if annotation['to'] + 40 > len(text) else annotation['to'] + 40
                record = {
                    'file_name': pdf_file_path.encode('utf-8'),
                    'bio_class_id': result['annotatedClass']['@id'],
                    'bio_ontology_id': result['annotatedClass']['links']['ontology'],
                    'text': u'' + annotation['text'].encode('utf-8'),
                    'match_type': annotation['matchType'],
                    'context': u''+text[context_begin:context_end]
                }
                annotations.append(record)
        db.insert_annotations(annotations)
        return
    except (ValueError, IndexError, KeyError) as e:
        print e
        log = {
            'file_name': pdf_file_path.encode('utf-8'),
            'error': 'Bad response from Bioportal Annotator',
            'exception': str(e),
            'data': ''
        }
        db.insert_log(log)
        return
开发者ID:ficolo,项目名称:corpora-char-cli,代码行数:56,代码来源:bioportal_annotator.py

示例4: pdftotext_any

def pdftotext_any(myfile):
    """Extract text from raw PDF bytes.

    Writes *myfile* to a temporary .pdf file, runs pdftotext on it, and
    falls back to tesseract OCR when almost no text comes back (a strong
    hint that the PDF is a scanned image).

    :param myfile: the PDF document as a byte string
    :return: the extracted text as returned by textract
    """
    import os
    import tempfile

    # A NamedTemporaryFile replaces the old fixed /tmp/infile.pdf path,
    # which was racy under concurrent calls.  delete=False because the file
    # must be closed before textract can reopen it by name; we remove it
    # ourselves in the finally block so it is never leaked.
    tmp = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
    try:
        tmp.write(myfile)
        tmp.close()
        text = textract.process(tmp.name, method='pdftotext')
        if len(text) < 5:  # no real text: probably an image scan, do an OCR
            text = textract.process(tmp.name, method='tesseract')
        return text
    finally:
        os.remove(tmp.name)
开发者ID:rsandstroem,项目名称:DocumentStore,代码行数:11,代码来源:docstore.py

示例5: process_text_file

def process_text_file(file_path):
    file_name, extension = os.path.splitext(file_path)
    print file_name, extension
    if (extension == ".txt"):
        return file_path
    elif (extension == '.epub'):
        print "Trying epub"
        try:
            text = textract.process(file_path)
            print "Processed epub: ", file_path
            output_path = file_name+'.txt'
            output_file = open(output_path, 'w')
            output_file.write(text)
            print "Converted epub: ", output_path
            return output_path
        except Exception as error:
            # TODO: textract raises own error so none isn't returned on try failure
            print error
            print 'Failed to convert epub: ', file_path
            return None
    elif (extension == ""):
        text_content = None
        try:
            with open(file_path) as input_file:
                text_content = input_file.read()
                if text_content:
                    print "Managed to read file: ", file_path
                    return file_path
        except IOError:
            print "Failed to read file: ", file_path
            return None
    else:
        print 'Unsupported file type: ', file_path
        return None
开发者ID:EilidhHendry,项目名称:author-similarity,代码行数:34,代码来源:util.py

示例6: get_text_from_files

def get_text_from_files(files_to_process):
    """Map each file name in *files_to_process* to its extracted text."""
    return {name: textract.process(name) for name in files_to_process}
开发者ID:aggerdom,项目名称:textprocessingfuncs,代码行数:7,代码来源:dumptext_dialog.py

示例7: get_text_from_file

	def get_text_from_file(self, file):
		"""Fetch a remote file locally, run textract over it, and remove the copy."""
		local_name = file['id'] + '.pdf'
		self._download_file(file, local_name)
		extracted = textract.process(local_name)
		os.remove(local_name)
		return extracted
开发者ID:mlchow,项目名称:duethat,代码行数:7,代码来源:GoogleDoc.py

示例8: extract_text_from_lectureDocuments

    def extract_text_from_lectureDocuments(self):
        """Extract text for every lectureDocuments row not yet processed.

        For each document with a file attached: pull its plain text with
        textract, rebuild the row's tag set from the noun phrases TextBlob
        finds, store the text on the model, and mark the row as extracted.
        """
        # pull files from database
        lectureDocumentsObjects = lectureDocuments.objects.filter(extracted=False)

        # loop through modules and pull all text
        for lectureDocumentsObject in lectureDocumentsObjects:
            if lectureDocumentsObject.document:
                print lectureDocumentsObject.document
                path_to_file = MEDIA_ROOT + '/' + str(lectureDocumentsObject.document)
                # NOTE(review): ascii encoding presumably keeps the text safe
                # for TextBlob/tagging downstream -- confirm.
                document_contents = textract.process(path_to_file, encoding='ascii')

                # create tags from noun_phrases
                # only add tags if none exist
                blobbed = TextBlob(document_contents)
                np = blobbed.noun_phrases
                np = list(set(np))  # de-duplicate the phrases
                np = [s for s in np if s]  # drop empty strings
                lectureDocumentsObject.tags.clear()
                for item in np:
                    # strip every character found in the module-level `exclude`
                    # set -- presumably punctuation; TODO confirm its contents
                    s = ''.join(ch for ch in item if ch not in exclude)
                    print s
                    lectureDocumentsObject.tags.add(s)

                # save this string
                lectureDocumentsObject.document_contents = document_contents
                lectureDocumentsObject.extracted = True
                lectureDocumentsObject.save()
开发者ID:NiJeLorg,项目名称:GAHTC,代码行数:27,代码来源:import_text_docs.py

示例9: save

 def save(self, *args, **kwargs):
     """Persist the document, then compute and store word-count statistics.

     The first save() runs before textract so the uploaded source_file is
     written and its URL usable; the second save() persists the computed
     counts.  The double save is therefore intentional.
     """
     super(Document, self).save(*args, **kwargs)
     text = textract.process(self.source_file.url)
     filtered_stems = self.get_filtered_stems(text)
     self.total_word_count = len(filtered_stems)
     # NOTE(review): count_target_words presumably mutates fields on self
     # that the second save() persists -- confirm against the model.
     self.count_target_words(filtered_stems)
     super(Document, self).save(*args, **kwargs)
开发者ID:pixelrust,项目名称:devjargon,代码行数:7,代码来源:models.py

示例10: extract

def extract(path):
    '''
    Extract the full text from a PDF.

    :param path: [String] Path to a pdf file downloaded via {fetch}, or another way.

    :return: [str] a string of text

    Usage::

        from pyminer import miner

        # a pdf
        url = "http://www.banglajol.info/index.php/AJMBR/article/viewFile/25509/17126"
        out = miner.fetch(url)
        out.parse()

        # search first, then pass links to fetch
        res = miner.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
        # url = res.links_pdf()[0]
        url = 'http://www.nepjol.info/index.php/JSAN/article/viewFile/13527/10928'
        x = miner.fetch(url)
        miner.extract(x.path)
    '''
    return textract.process(path)
开发者ID:sckott,项目名称:pyminer,代码行数:26,代码来源:extract.py

示例11: build_indexes

def build_indexes(files_list, index_file):
    """Build an inverted token index over *files_list* and dump it as JSON.

    Each file's text is extracted with textract and split on separator
    characters; the resulting mapping {token: {file: occurrence_count}} is
    written to *index_file*.  A simple '-' progress bar is drawn on stdout.

    :param files_list: paths of the documents to index
    :param index_file: path of the JSON file to write the index to
    """
    toolbar_width = len(files_list)
    print(toolbar_width)
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['
    hash_index = {}
    for item in files_list:
        text = textract.process(item)
        tokens = re.split("[, \t\n\\t:;]", text)
        for token in tokens:
            if token == "":
                continue
            # setdefault/get replace the old has_key() bookkeeping, one
            # branch of which was unreachable: a freshly created inner dict
            # can never already contain `item`.  (has_key is also Python 2
            # only; .get works on both.)
            per_file = hash_index.setdefault(token, {})
            per_file[item] = per_file.get(item, 0) + 1

        # update the bar
        sys.stdout.write("-")
        sys.stdout.flush()

    sys.stdout.write("\n")
    # `with` closes the index file even if json.dump raises.
    with open(index_file, "w") as fp:
        json.dump(hash_index, fp)

示例12: get_path_details

 def get_path_details(cls, temp_path, image_path):
     """Return the byte sequence and the full text for a given path.

     :param temp_path: local path of the file to characterize
     :param image_path: original path inside the image (used for logging only)
     :return: (ByteSequence, full_text) tuple; full_text is "" when the MIME
              type maps to no extension and "N/A" when extraction fails
     """
     byte_sequence = ByteSequence.from_path(temp_path)
     extension = map_mime_to_ext(byte_sequence.mime_type, cls.mime_map)
     logging.debug("Assessing MIME: %s EXTENSION %s SHA1:%s", byte_sequence.mime_type,
                   extension, byte_sequence.sha1)
     full_text = ""
     if extension is not None:
         try:
             logging.debug("Textract for SHA1 %s, extension map val %s",
                           byte_sequence.sha1, extension)
             full_text = process(temp_path, extension=extension, encoding='ascii',
                                 preserveLineBreaks=True)
         except ExtensionNotSupported:
             logging.exception("Textract extension not supported for ext %s", extension)
             logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
             full_text = "N/A"
         except LookupError:
             logging.exception("Lookup error for encoding.")
             logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
             full_text = "N/A"
         except UnicodeDecodeError:
             logging.exception("UnicodeDecodeError, problem with file encoding")
             logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
             full_text = "N/A"
         except Exception:
             # `except Exception` instead of the old bare `except:` so that
             # KeyboardInterrupt/SystemExit still propagate.
             logging.exception("Textract UNEXPECTEDLY failed for temp_file.")
             logging.debug("Image path for file is %s, temp file at %s", image_path, temp_path)
             full_text = "N/A"
     return byte_sequence, full_text
开发者ID:BitCurator,项目名称:bca-webtools,代码行数:30,代码来源:text_indexer.py

示例13: indexing

def indexing():
    """Download each document listed in Final_Links/doc_links.txt and index it.

    For every URL: fetch it (skipping responses larger than ~2.4 MB), save
    the payload under data/pdf_data/mytemp/, extract its text with textract,
    append the text to doc_content.txt, and add it to a Whoosh index under
    data/pdf_data.  Any per-URL failure is printed and skipped.
    """
    ana = analysis.StemmingAnalyzer()
    schema = Schema(title=TEXT(analyzer=ana, spelling=True), path=ID(stored=True), content=TEXT)
    ix = create_in("data/pdf_data", schema)
    writer = ix.writer()
    count = 0

    with open('Final_Links/doc_links.txt') as fp, open('data/pdf_data/mytemp/doc_content.txt', 'w+') as f:
        for line in fp:
            count += 1
            url = line
            # take everything after the last '/' as the local file name
            doc_name = re.search('.*/(.*)', url).group(1)

            try:
                response = urllib2.urlopen(url, timeout=3)
                # skip oversized downloads
                if int(response.headers['content-length']) > 2475248:
                    continue
                fil = open("data/pdf_data/mytemp/" + doc_name, 'w+')
                fil.write(response.read())
                fil.close()

                content_text = textract.process('data/pdf_data/mytemp/' + doc_name, encoding='ascii')
                f.write(content_text)
                writer.add_document(title=unicode(url, "utf-8"), path=unicode(url, "utf-8"),
                                    content=unicode(content_text))
                # NOTE(review): the same URL is added a second time with the
                # URL itself as content -- presumably so the URL text is
                # searchable too; confirm this duplication is intended.
                writer.add_document(title=unicode(url, "utf-8"), path=unicode(url, "utf-8"),
                                    content=unicode(url))
            except Exception as e:
                print "Caught exception e at " + '' + str(e)
                continue
            print str(count) + " in " + " URL:" + url

    writer.commit()
    print "Indexing Completed !"
开发者ID:theawless,项目名称:IITG-Search,代码行数:34,代码来源:doc_indexing_main_thread.py

示例14: parse_sentences

def parse_sentences(pdf):
	"""Split a PDF's extracted text into sentences, dropping escape-garbage ones."""
	content = textract.process(pdf)
	pieces = re.split("[.?!]", content)
	return [sentence for sentence in pieces if "\\x" not in sentence]
开发者ID:denalirao,项目名称:thesis-Visualizations,代码行数:8,代码来源:thesis_parser.py

示例15: compare_python_output

    def compare_python_output(self, filename, expected_filename=None, **kwargs):
        """Assert textract.process(filename) matches the expected-output file."""
        if expected_filename is None:
            expected_filename = self.get_expected_filename(filename, **kwargs)

        import textract
        actual = textract.process(filename, **kwargs)
        with open(expected_filename) as stream:
            expected = stream.read()
        self.assertEqual(actual, expected)
开发者ID:AlinaKay,项目名称:textract,代码行数:8,代码来源:base.py


注:本文中的textract.process函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。