Python textract.process方法代码示例

本文整理汇总了Python中textract.process方法的典型用法代码示例。如果您正苦于以下问题：Python textract.process方法的具体用法？Python textract.process怎么用？Python textract.process使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类textract的用法示例。

在下文中一共展示了textract.process方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: add_text

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def add_text(self, my_dir = None):
        """Load the text of every regular file in a directory into ``self.everything['input']``.

        Each file is passed to ``textract.process`` (UTF-8 output), stripped
        down to plain ASCII, and stored under its bare file name. An existing
        entry with the same name is overwritten. Sub-directories are skipped
        and the scan is not recursive.

        :param my_dir: directory to scan; falls back to ``self.my_data_dir``
            when omitted or falsy.
        """
        if not my_dir:
            my_dir = self.my_data_dir

        ## may not have to check here but seems pragmatic
        self._check_defaults()

        # A single pre-formatted argument makes print() emit the same text
        # on Python 2 and Python 3 (the bare print statement is 2-only).
        print("Extracting text from: %s" % (my_dir,))

        file_list = [f for f in listdir(my_dir) if isfile(join(my_dir, f))]

        for f in file_list: # Will overwrite text for any existing files
            print("\tProcessing file: %s" % (f,))
            txt = textract.process(join(my_dir, f), encoding="utf-8")
            # The extractor output is a byte string, so the replacement uses
            # bytes literals (str literals would raise TypeError on Python 3).
            txt = txt.replace(b"\xa0", b" ")
            # Drop every remaining non-ASCII byte, then go back to bytes so
            # the stored value has the same type the original code stored.
            txt = txt.decode('ascii', errors="ignore")
            self.everything['input'][f] = txt.encode("ascii")

开发者ID:18F，项目名称:markov_bot，代码行数:21，代码来源:popular_phrases.py

示例2: convert_to_text_and_and_move_field

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def convert_to_text_and_and_move_field():
        """Extract, language-classify and copy every file named in ``lists[2]``.

        For each index the file ``_dir + lists[2][i]`` is run through
        ``textract.process``; the lower-cased UTF-8 text is stored in
        ``_text_lists_convert2[i]`` and classified with ``langid.classify``.
        Files classified "vi" are copied into ``_dir_vi`` and files classified
        "en" into ``_dir_en``.

        Relies on the module-level names ``lists``, ``_dir``, ``_dir_vi``,
        ``_dir_en`` and ``_text_lists_convert2`` (presumably ``lists[2]`` holds
        the .doc file names, per the original inline comment -- confirm).
        """
        for _i in range(len(lists[2])):                   # .doc
                file_name = lists[2][_i]
                _text_lists_convert2[_i] = textract.process(_dir + file_name).decode('utf-8').lower()
                lan, _score = langid.classify(_text_lists_convert2[_i])

                if lan == "vi":
                        shutil.copy(_dir + file_name, _dir_vi + file_name)
                # NOTE(review): this is a separate `if`, so the `else` below
                # also fires for "vi" files. Kept as in the original because
                # the message starts with "done or"; confirm whether `elif`
                # was intended.
                if lan == "en":
                        shutil.copy(_dir + file_name, _dir_en + file_name)
                else:
                        # Parenthesised single-string print works on Python 2 and 3.
                        print('done or the language not of english , vietnames with doc^^')
      #  print '\tHave %s doc file ' %(str(_i))

开发者ID:hieuxinhe94，项目名称:CVProject，代码行数:19，代码来源:nhan_biet_ngonngu.py

示例3: get_features

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def get_features(**kwargs):
        """Lazily yield one feature mapping per file found by ``RTFAdapter``.

        :param kwargs: must contain ``directory``, which is forwarded to
            ``RTFAdapter.get_file_list``.
        :return: generator of the objects returned by
            ``RTFAdapter.get_meta_features``, each with a ``'text'`` entry
            holding the ``textract.process`` output for that file.
        :raises KeyError: when ``directory`` is missing (raised on first iteration).
        """
        source_dir = kwargs['directory']

        # NOTE(review): the extension requested is 'txt' although this is the
        # RTF adapter -- confirm that is intended.
        for path in RTFAdapter.get_file_list(source_dir, 'txt'):
            record = RTFAdapter.get_meta_features(file_path=path)
            record['text'] = textract.process(path)
            yield record

开发者ID:texta-tk，项目名称:texta，代码行数:11，代码来源:rtf_adapter.py

示例4: extract

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def extract(path):
    """
    Extract the full text of a PDF.

    :param path: [String] Path to a PDF file downloaded via {fetch}, or obtained another way.

    :return: the text returned by ``textract.process`` for that file

    Usage::

        import pyminer

        # a pdf
        url = "http://www.banglajol.info/index.php/AJMBR/article/viewFile/25509/17126"
        out = pyminer.fetch(url)
        out.parse()

        # search first, then pass links to fetch
        res = pyminer.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
        # url = res.links_pdf()[0]
        url = 'http://www.nepjol.info/index.php/JSAN/article/viewFile/13527/10928'
        x = pyminer.fetch(url)
        pyminer.extract(x.path)
    """
    # All the work is delegated to textract; nothing is post-processed here.
    return textract.process(path)

开发者ID:sckott，项目名称:pyminer，代码行数:28，代码来源:extract.py

示例5: extract

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def extract(path):
    """
    Extract the full text of a PDF.

    :param path: [String] Path to a PDF file downloaded via {fetch}, or obtained another way.

    :return: the text returned by ``textract.process`` for that file

    Usage::

        from pyminer import miner

        # a pdf
        url = "http://www.banglajol.info/index.php/AJMBR/article/viewFile/25509/17126"
        out = miner.fetch(url)
        out.parse()

        # search first, then pass links to fetch
        res = miner.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
        # url = res.links_pdf()[0]
        url = 'http://www.nepjol.info/index.php/JSAN/article/viewFile/13527/10928'
        x = miner.fetch(url)
        miner.extract(x.path)
    """
    # All the work is delegated to textract; nothing is post-processed here.
    return textract.process(path)

开发者ID:sckott，项目名称:pyminer，代码行数:28，代码来源:extract.py

示例6: get_features

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def get_features(**kwargs):
        """Lazily yield one feature mapping per file found by ``DocAdapter``.

        :param kwargs: must contain ``directory``, which is forwarded to
            ``DocAdapter.get_file_list``.
        :return: generator of the objects returned by
            ``DocAdapter.get_meta_features``, each with a ``'text'`` entry
            holding the ``textract.process`` output for that file.
        :raises KeyError: when ``directory`` is missing (raised on first iteration).
        """
        root = kwargs['directory']

        # NOTE(review): the extension requested is 'txt' although this is the
        # Doc adapter -- confirm that is intended.
        for doc_path in DocAdapter.get_file_list(root, 'txt'):
            meta = DocAdapter.get_meta_features(file_path=doc_path)
            meta['text'] = textract.process(doc_path)
            yield meta

开发者ID:texta-tk，项目名称:texta，代码行数:11，代码来源:doc_adapter.py

示例7: get_features

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def get_features(**kwargs):
        """Lazily yield one feature mapping per file found by ``PDFAdapter``.

        :param kwargs: must contain ``directory``, which is forwarded to
            ``PDFAdapter.get_file_list``.
        :return: generator of the objects returned by
            ``PDFAdapter.get_meta_features``, each with a ``'text'`` entry
            holding the ``textract.process`` output for that file.
        :raises KeyError: when ``directory`` is missing (raised on first iteration).
        """
        base_dir = kwargs['directory']

        # NOTE(review): the extension requested is 'txt' although this is the
        # PDF adapter -- confirm that is intended.
        for pdf_path in PDFAdapter.get_file_list(base_dir, 'txt'):
            entry = PDFAdapter.get_meta_features(file_path=pdf_path)
            entry['text'] = textract.process(pdf_path)
            yield entry

开发者ID:texta-tk，项目名称:texta，代码行数:11，代码来源:pdf_adapter.py

示例8: parse_pdfs

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def parse_pdfs():
        """Fill in the ``text`` field of stored articles that have a PDF but no text yet.

        Selects ``ScienceArticle`` rows with ``got_pdf=True`` and ``text=None``,
        runs ``textract.process`` on the matching file under
        ``<BASE_DIR>/uploads/research/`` and saves the result. A failure on one
        article is printed and does not stop the remaining ones.
        """
        pending = ScienceArticle.objects.filter(got_pdf=True).filter(text=None)

        for article in pending:
            # Only the last path component of the stored file name is used.
            base_name = str(article.file).split('/')[-1]
            filename = join(settings.BASE_DIR, 'uploads', 'research', base_name)
            try:
                article.text = smart_text(textract.process(filename))
                article.save()
                print(colored.green("Successfully saved PDF text to db."))
            except Exception as e:
                # Deliberately broad: this is a best-effort batch job.
                print(colored.red("[ERROR] At PDF text parse: {0}".format(e)))

开发者ID:quant-trade，项目名称:QProb，代码行数:15，代码来源:science_articles.py

示例9: add_files

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def add_files(self, my_dir = None):
        """Load the text of every regular file in a directory into ``self.everything['input']``.

        Each file is passed to ``textract.process`` with ``encoding="ascii"``
        and the result is stored under its bare file name. An existing entry
        with the same name is overwritten. Sub-directories are skipped and the
        scan is not recursive. (The original docstring suggests calling
        ``train()`` afterwards.)

        :param my_dir: directory to scan; falls back to ``self.my_data_dir``
            when omitted or falsy.
        """
        if not my_dir:
            my_dir = self.my_data_dir

        ## still have to check here vs. __init__() in case file is corrupt
        self._check_defaults()

        # A single pre-formatted argument makes print() emit the same text
        # on Python 2 and Python 3 (the bare print statement is 2-only).
        print("Extracting text from: %s" % (my_dir,))

        file_list = [f for f in listdir(my_dir) if isfile(join(my_dir, f))]

        for f in file_list: # Will overwrite text for any existing files
            print("\tProcessing file: %s" % (f,))
            self.everything['input'][f] = textract.process(join(my_dir, f), encoding="ascii")

开发者ID:18F，项目名称:markov_bot，代码行数:17，代码来源:markov2.py

示例10: convert_to_text

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def convert_to_text(self, path=None, content=None, converter=None):
        """Return the input as bytes with ``self.remove_newline`` matches replaced by a space.

        :param path: file handed to ``textract.process``; used only when
            ``converter == "textract"``.
        :param content: raw bytes; used only when ``converter == "text"``.
        :param converter: ``"text"`` or ``"textract"``.
        :return: the cleaned bytes, or ``None`` for any other converter value.
        """
        if converter == "text":
            raw = content
        elif converter == "textract":
            raw = textract.process(path)
        else:
            # Unknown converter: nothing to convert.
            return None

        return self.remove_newline.sub(b' ', raw)

开发者ID:cahya-wirawan，项目名称:opentc，代码行数:12，代码来源:icap-server-opentc.py

示例11: convert_to_text_to_process

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def convert_to_text_to_process():
        """Replace each entry of ``lists[2]`` in place with the text of the file it names.

        Every entry is treated as a file name under ``_dir``; the file is run
        through ``textract.process`` and the UTF-8 decoded result is written
        back to the same index. Relies on the module-level names ``lists`` and
        ``_dir`` (presumably ``lists[2]`` holds the .doc file names, per the
        original inline comment -- confirm).
        """
        for _i, doc_name in enumerate(lists[2]):                   # .doc
                extracted = textract.process(_dir + doc_name)
                # The original has a trailing .lower() commented out here.
                lists[2][_i] = extracted.decode('utf-8')

        # Disabled second pass over lists[3] (.pdf), kept for reference only.
        # It was an inert string literal in the original:
        #   _j=i
        #   for _j in range(_i,len(lists[3])+_i,1) :
        #           lists[2][_j] = (textract.process(_dir+lists[3][_j-_i])).decode('utf-8').lower()
        #           _j+=1
        #   print _j

开发者ID:hieuxinhe94，项目名称:CVProject，代码行数:17，代码来源:function_en.py

示例12: convert_to_text_to_process

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def convert_to_text_to_process():
        """Replace each entry of ``lists[2]`` in place with the text of the file it names, then print the count.

        Every entry is treated as a file name under ``_dir``; the file is run
        through ``textract.process`` and the UTF-8 decoded result is written
        back to the same index. Relies on the module-level names ``lists`` and
        ``_dir`` (presumably ``lists[2]`` holds the .doc file names, per the
        original inline comment -- confirm).
        """
        for _i in range(len(lists[2])):                   # .doc
                lists[2][_i] = textract.process(_dir + lists[2][_i]).decode('utf-8')

        # The original printed the loop counter after a final `_i += 1`, which
        # equals the number of entries. Printing the length gives the same
        # output, also works for an empty list (where `_i` was never bound),
        # and the parenthesised form runs on Python 2 and 3.
        print(len(lists[2]))

开发者ID:hieuxinhe94，项目名称:CVProject，代码行数:9，代码来源:function_vi.py

示例13: build_indexes

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def build_indexes(files_list, index_file):
    """Build a token -> {file: occurrence count} index and write it as JSON.

    Each file is run through ``textract.process``. The text is split on
    commas, spaces, tabs, newlines, colons and semicolons, and every
    non-empty token is counted per source file. A simple text progress bar
    is drawn on stdout, one ``-`` per processed file.

    :param files_list: sequence of file paths to index (its length sizes
        the progress bar).
    :param index_file: path of the JSON file to write (overwritten).
    """
    toolbar_width = len(files_list)
    print(toolbar_width)
    sys.stdout.write("[%s]" % (" " * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width+1)) # return to start of line, after '['

    hash_index = {}
    for item in files_list:
        text = textract.process(item)
        # The extractor hands back a byte string; decode it so the str
        # pattern below applies on Python 3 and the tokens can be JSON keys.
        if isinstance(text, bytes):
            text = text.decode("utf-8")

        for token in re.split("[, \t\n\\t:;]", text):
            if token == "":
                continue
            # setdefault/get replace dict.has_key(), which Python 3 removed.
            per_file = hash_index.setdefault(token, {})
            per_file[item] = per_file.get(item, 0) + 1

        # update the bar
        sys.stdout.write("-")
        sys.stdout.flush()

    sys.stdout.write("\n")
    # The context manager closes the file even if serialisation fails.
    with open(index_file, "w") as fp:
        json.dump(hash_index, fp)

#build_indexes(path_helper.get_files_list(), 'index')

开发者ID:zainulabidin302，项目名称:highlight，代码行数:37，代码来源:main.py

示例14: extract_text

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def extract_text(file_extraction, file_pdf):
    """Extract the text of ``file_pdf`` with textract and save it as UTF-8.

    :param file_extraction: destination; must provide a ``pathlib``-style
        ``open(mode, encoding=...)`` method.
    :param file_pdf: source document; converted with ``str()`` before being
        handed to ``textract.process``.
    """
    logging.info(f"Extracting text from {file_pdf} ... ( This may take a minute or two.)")
    # The destination is opened before extraction starts, as in the original.
    with file_extraction.open("w", encoding="utf-8") as sink:
        raw_bytes = textract.process(str(file_pdf))
        sink.write(raw_bytes.decode('utf-8'))
    logging.info(f"Saved extraction to {file_extraction}")

开发者ID:DeastinY，项目名称:srpdfcrawler，代码行数:7，代码来源:pdf_parser.py

示例15: test_text_ssns

# 需要导入模块: import textract [as 别名]
# 或者: from textract import process [as 别名]
def test_text_ssns(self):
		"""Check that SSN-shaped numbers in the fixture PDF are replaced by ``XXX-XX-XXXX``.

		Two content filters are installed: the first rewrites a set of
		dash-like characters to ``-``, the second masks anything shaped like
		a social security number, including look-alike letters
		(``O``, ``o``, ``I``, ``l``, ``i``) in place of digits.
		"""
		# NOTE(review): the two '?' entries in this character class may be
		# dash characters lost to a mis-encoding -- confirm against upstream.
		dash_like = re.compile(u"[?–—~?]")
		ssn_like = re.compile(r"(?<!\d)(?!666|000|9\d{2})([OoIli0-9]{3})([\s-]?)(?!00)([OoIli0-9]{2})\2(?!0{4})([OoIli0-9]{4})(?!\d)")

		options = pdf_redactor.RedactorOptions()
		options.content_filters = [
			(dash_like, lambda m: "-"),
			(ssn_like, lambda m: "XXX-XX-XXXX"),
		]

		expected = b"Here are some fake SSNs\n\nXXX-XX-XXXX\n--\n\nXXX-XX-XXXX XXX-XX-XXXX\n\nAnd some more with common OCR character substitutions:\nXXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX XXX-XX-XXXX"
		with RedactFixture(FIXTURE_PATH, options) as redacted_path:
			extracted = textract.process(redacted_path)
			self.assertIn(expected, extracted)

开发者ID:JoshData，项目名称:pdf-redactor，代码行数:17，代码来源:test_redactor.py

注：本文中的textract.process方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。