Python FeatureExtractor.extract_features方法代码示例

本文整理汇总了Python中feature_extractor.FeatureExtractor.extract_features方法的典型用法代码示例。如果您正苦于以下问题：Python FeatureExtractor.extract_features方法的具体用法？Python FeatureExtractor.extract_features怎么用？Python FeatureExtractor.extract_features使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类feature_extractor.FeatureExtractor的用法示例。

在下文中一共展示了FeatureExtractor.extract_features方法的2个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: TestFeatureExtractor

# 需要导入模块: from feature_extractor import FeatureExtractor [as 别名]
# 或者: 先执行 from feature_extractor import FeatureExtractor，再通过实例调用 FeatureExtractor().extract_features(...)（方法不能直接从类中 import）
class TestFeatureExtractor(unittest.TestCase):
    """Unit tests for the FeatureExtractor class.

    These are simple sanity checks to ensure that the feature vector returned
    for a known protein sequence has the expected length and that its
    frequency data adds up sensibly. More tests should be added.
    """

    def setUp(self):
        """Build one protein record and extract the feature vector under test."""
        protein = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", IUPAC.protein)
        self.record1 = SeqRecord(
            protein,
            id="YP_025292.1",
            name="HokC",
            description="toxic membrane protein, small",
        )
        self.seq1 = self.record1.seq

        extractor = FeatureExtractor()
        self.feature_extractor = extractor
        self.feature_vector1 = extractor.extract_features(self.seq1)

    def test_feature_vector_length(self):
        """The extracted feature vector must hold exactly 400 elements."""
        vector_size = len(self.feature_vector1)
        self.assertEqual(vector_size, 400, msg="Feature vector not 400 long")

    def test_dipeptide_frequency_sum(self):
        """The first 400 dipeptide frequencies must sum to 1 (to 5 places)."""
        vector = self.feature_vector1
        # Fold left-to-right from 0.0 so the floating-point accumulation order
        # is the same as a plain running total.
        total = sum((vector[idx] for idx in range(400)), 0.0)
        self.assertAlmostEqual(total, 1.0, places=5, msg="Frequencies don't sum to 1")

开发者ID:jrlawson，项目名称:profunc，代码行数:28，代码来源:test_feature_extractor.py

示例2: MultiReader

# 需要导入模块: from feature_extractor import FeatureExtractor [as 别名]
# 或者: 先执行 from feature_extractor import FeatureExtractor，再通过实例调用 FeatureExtractor().extract_features(...)（方法不能直接从类中 import）
class MultiReader(DataLoader):
    def __init__(self, output_width=11, training_frac=70.0, validation_frac=15.0, debug=False):
        """Store the reader configuration and create the feature extractor.

        The data directory is not kept on the instance; ``load_data`` receives
        it through its ``source`` argument.

        :param output_width:    Number of trailing class cells that
                                ``load_data`` appends to every feature vector.
        :param training_frac:   Stored as given (presumably the percentage of
                                examples used for training -- TODO confirm).
        :param validation_frac: Stored as given (presumably the percentage of
                                examples used for validation -- TODO confirm).
        :param debug:           When true, ``load_data`` prints progress output.
        """
        # Width of the input (feature) part of each data row.
        self.input_width = 400

        # Caller-supplied settings, stored unchanged.
        self.output_width = output_width
        self.training_frac = training_frac
        self.validation_frac = validation_frac
        self.debug = debug

        # Base names of the input files. ``load_data`` appends ".faa" to each
        # one and uses the name's position in this list as the class index.
        file_stems = [
            "baseplate_3370",
            "collar_1385",
            "htj_2258_nofg",
            "major_tail_1512",
            "mcp_3589",
            "minor_capsid_1500_nofg",
            "minor_tail_2033",
            "portal_2141",
            "tail_fiber_3007",
            "tail_sheath_2350",
        ]
        self.names = file_stems

        self.feature_extractor = FeatureExtractor()

    def load_data(self, source):
        """Load the data from a directory with a collection of source files,
        one file for each kind of protein. 
        
        Returns an array of pairs in the form:
        
        [(train_set_in, train_set_out), (validation_set_in, validation_set_out), (test_set_in, test_set_out)]

        :type source:   String
        :param source:  The directory where the source files are located.
        """
        dir = source
        raw_data = list()
        unsupporteds = list()
        for i in range(0, len(self.names)):
            num_in_file = 0
            if self.debug:
                print (dir + self.names[i] + ".faa")
            handle = open(dir + self.names[i] + ".faa", "rU")  # Open a file.
            for record in SeqIO.parse(handle, "fasta"):
                num_in_file += 1
                try:
                    # print "      " + record.id
                    feature_vector = self.feature_extractor.extract_features(record)
                    # Now we have to augment the feature vector with the output
                    # vector. So we:
                    #   1) Make a new array a bit longer than the feature vector,
                    #   2) Copy the feature vector into the first cells of the new array,
                    #   3) Find the appropriate cell in the tail of the new array
                    #      and set that one equal to 1.
                    prepared_data_record = numpy.zeros(len(feature_vector) + self.output_width)
                    for col in range(0, len(feature_vector)):  # This surely could be done more efficiently.
                        prepared_data_record[col] = feature_vector[col]  # Doesn't matter for now.
                    prepared_data_record[
                        len(feature_vector) + i
                    ] = 1  # The class of the protein is taken from the order of the files in the list "names"
                    raw_data.append(prepared_data_record)
                except KeyError:
                    if self.debug:
                        print "   Unsupported sequence: " + record.id + "   " + str(record.annotations)
                    unsupporteds.append(record)
                pass
            handle.close()
            if self.debug:
                print "Total in file " + self.names[i] + " = " + str(num_in_file)

        # Now we are done reading all of the data in. In debug mode, print some
        # overall summary information.
        if self.debug:
            print "Supported Sequences = " + str(len(raw_data))
            print "Unsupported Sequences = " + str(len(unsupporteds))

        num_examples = len(raw_data)

        # But the labeled data we have is not randomly ordered. It is sorted
        # by class. We need to shuffle it up or we will only train on the first
        # classes.
        if self.debug:
            print "Shuffling data to randomize for training"
        shuffle = self.rand_perm(num_examples)

        data = numpy.ndarray((num_examples, self.input_width + self.output_width), float)
        for n in range(0, num_examples):
            for w in range(0, self.input_width + self.output_width):
                s = raw_data[shuffle[n]][w]
                data[n, w] = float(s)
        if self.debug:
            print "Finished shuffling data"
            print "Processing data to cull outliers"
        data = self.preprocess(self.cull(data))
        num_examples = len(data)
        print "Data shape = ", data.shape, "   num_examples=", num_examples
        inputs = numpy.array(data)[:, 0 : self.input_width]
        outputs_full = numpy.array(data)[:, self.input_width : self.input_width + self.output_width]
        if self.debug:
            print "Finished culling outliers"
            print inputs.shape
#.........这里部分代码省略.........

开发者ID:jrlawson，项目名称:profunc，代码行数:103，代码来源:multi_reader.py

注：本文中的feature_extractor.FeatureExtractor.extract_features方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。