本文整理汇总了 Python 中 feature_extractor.FeatureExtractor.extract_features 方法的典型用法代码示例。如果您正苦于以下问题：Python FeatureExtractor.extract_features 方法的具体用法？Python FeatureExtractor.extract_features 怎么用？Python FeatureExtractor.extract_features 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 feature_extractor.FeatureExtractor 的用法示例。
在下文中一共展示了FeatureExtractor.extract_features方法的2个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: TestFeatureExtractor
# 需要导入模块: from feature_extractor import FeatureExtractor [as 别名]
# 或者: from feature_extractor.FeatureExtractor import extract_features [as 别名]
class TestFeatureExtractor(unittest.TestCase):
    """Sanity checks for FeatureExtractor.extract_features.

    A single protein sequence (HokC) is run through the extractor, and the
    resulting feature vector is checked for its length and for the sum of
    its entries. Coverage is deliberately minimal; more tests should be
    added.
    """

    def setUp(self):
        """Build the sample record and extract its feature vector."""
        hokc_residues = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", IUPAC.protein)
        self.record1 = SeqRecord(
            hokc_residues,
            id="YP_025292.1",
            name="HokC",
            description="toxic membrane protein, small"
        )
        self.seq1 = self.record1.seq
        self.feature_extractor = FeatureExtractor()
        # Only the bare sequence (not the whole record) is handed to the extractor.
        self.feature_vector1 = self.feature_extractor.extract_features(self.seq1)

    def test_feature_vector_length(self):
        """The extracted vector must contain exactly 400 entries."""
        actual_length = len(self.feature_vector1)
        self.assertEqual(actual_length, 400, msg="Feature vector not 400 long")

    def test_dipeptide_frequency_sum(self):
        """The 400 dipeptide frequencies must add up to 1 (to 5 places)."""
        total = 0.0
        # Accumulate by index over the first 400 cells, one entry at a time.
        for position in range(400):
            total = total + self.feature_vector1[position]
        self.assertAlmostEqual(total, 1.0, places=5, msg="Frequencies don't sum to 1")
示例2: MultiReader
# 需要导入模块: from feature_extractor import FeatureExtractor [as 别名]
# 或者: from feature_extractor.FeatureExtractor import extract_features [as 别名]
class MultiReader(DataLoader):
def __init__(self, output_width=11, training_frac=70.0, validation_frac=15.0, debug=False):
    """Record the reader's configuration and create its feature extractor.

    :type output_width: int
    :param output_width: Number of cells load_data appends after each
        feature vector to hold the class label.
    :type training_frac: float
    :param training_frac: Stored unchanged on the instance; presumably the
        percentage of examples used for training -- TODO confirm.
    :type validation_frac: float
    :param validation_frac: Stored unchanged on the instance; presumably
        the percentage of examples used for validation -- TODO confirm.
    :type debug: bool
    :param debug: When true, load_data prints progress information.
    """
    # Base names of the source files; load_data opens "<name>.faa" for each.
    # A file's position in this list is the class index that load_data sets
    # to 1 in the label cells of every record read from that file.
    # NOTE(review): ten files are listed while output_width defaults to 11;
    # confirm whether the extra label cell is intentional.
    protein_files = [
        "baseplate_3370",
        "collar_1385",
        "htj_2258_nofg",
        "major_tail_1512",
        "mcp_3589",
        "minor_capsid_1500_nofg",
        "minor_tail_2033",
        "portal_2141",
        "tail_fiber_3007",
        "tail_sheath_2350",
    ]

    self.debug = debug
    self.validation_frac = validation_frac
    self.training_frac = training_frac
    self.output_width = output_width
    # Fixed width of the input part of each row; presumably the length of
    # the vector returned by extract_features -- TODO confirm.
    self.input_width = 400
    # The data directory is not stored here: load_data receives it as its
    # `source` argument.
    self.names = protein_files
    self.feature_extractor = FeatureExtractor()
def load_data(self, source):
"""Load the data from a directory with a collection of source files,
one file for each kind of protein.
Returns an array of pairs in the form:
[(train_set_in, train_set_out), (validation_set_in, validation_set_out), (test_set_in, test_set_out)]
:type source: String
:param source: The directory where the source files are located.
"""
dir = source
raw_data = list()
unsupporteds = list()
for i in range(0, len(self.names)):
num_in_file = 0
if self.debug:
print (dir + self.names[i] + ".faa")
handle = open(dir + self.names[i] + ".faa", "rU") # Open a file.
for record in SeqIO.parse(handle, "fasta"):
num_in_file += 1
try:
# print " " + record.id
feature_vector = self.feature_extractor.extract_features(record)
# Now we have to augment the feature vector with the output
# vector. So we:
# 1) Make a new array a bit longer than the feature vector,
# 2) Copy the feature vector into the first cells of the new array,
# 3) Find the appropriate cell in the tail of the new array
# and set that one equal to 1.
prepared_data_record = numpy.zeros(len(feature_vector) + self.output_width)
for col in range(0, len(feature_vector)): # This surely could be done more efficiently.
prepared_data_record[col] = feature_vector[col] # Doesn't matter for now.
prepared_data_record[
len(feature_vector) + i
] = 1 # The class of the protein is taken from the order of the files in the list "names"
raw_data.append(prepared_data_record)
except KeyError:
if self.debug:
print " Unsupported sequence: " + record.id + " " + str(record.annotations)
unsupporteds.append(record)
pass
handle.close()
if self.debug:
print "Total in file " + self.names[i] + " = " + str(num_in_file)
# Now we are done reading all of the data in. In debug mode, print some
# overall summary information.
if self.debug:
print "Supported Sequences = " + str(len(raw_data))
print "Unsupported Sequences = " + str(len(unsupporteds))
num_examples = len(raw_data)
# But the labeled data we have is not randomly ordered. It is sorted
# by class. We need to shuffle it up or we will only train on the first
# classes.
if self.debug:
print "Shuffling data to randomize for training"
shuffle = self.rand_perm(num_examples)
data = numpy.ndarray((num_examples, self.input_width + self.output_width), float)
for n in range(0, num_examples):
for w in range(0, self.input_width + self.output_width):
s = raw_data[shuffle[n]][w]
data[n, w] = float(s)
if self.debug:
print "Finished shuffling data"
print "Processing data to cull outliers"
data = self.preprocess(self.cull(data))
num_examples = len(data)
print "Data shape = ", data.shape, " num_examples=", num_examples
inputs = numpy.array(data)[:, 0 : self.input_width]
outputs_full = numpy.array(data)[:, self.input_width : self.input_width + self.output_width]
if self.debug:
print "Finished culling outliers"
print inputs.shape
#.........这里部分代码省略.........