本文整理汇总了 Python 中 pandas.factorize 方法的典型用法代码示例。如果您正苦于以下问题：pandas.factorize 方法的具体用法是什么？pandas.factorize 怎么用？有哪些 pandas.factorize 的使用例子？那么，这里精选的代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在模块 pandas 的用法示例。
在下文中一共展示了pandas.factorize方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_mixed
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_mixed(self):
# doc example reshaping.rst
x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
labels, uniques = algos.factorize(x)
exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
exp = Index(['A', 'B', 3.14, np.inf])
tm.assert_index_equal(uniques, exp)
labels, uniques = algos.factorize(x, sort=True)
exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
exp = Index([3.14, np.inf, 'A', 'B'])
tm.assert_index_equal(uniques, exp)
示例2: group_sums
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def group_sums(x, group):
    """Sum every column of ``x`` within the groups labelled by ``group``.

    Simple ``np.bincount`` version that loops over the columns of ``x``.

    Parameters
    ----------
    x : array
        Indexed as ``x[:, col]``; one bincount is computed per column.
    group : array, integer
        Group label of each row, assumed to be consecutive integers.
        No dtype checking is done on purpose, so that an unsuitable
        ``group`` raises.

    Returns
    -------
    ndarray
        One row per column of ``x``; entry ``g`` of a row is the sum of
        that column over the rows labelled ``g``.
    """
    # TODO: remove this, already copied to tools/grouputils
    # TODO: transpose return in group_sum, need test coverage first

    # re-label groups or bincount takes too much memory
    labels_are_sparse = np.max(group) > 2 * x.shape[0]
    if labels_are_sparse:
        group = pd.factorize(group)[0]

    n_columns = x.shape[1]
    column_sums = [np.bincount(group, weights=x[:, j])
                   for j in range(n_columns)]
    return np.array(column_sums)
示例3: test_factorize_nan
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_factorize_nan(self):
    """Exercise ``Factorizer.factorize`` on object arrays containing NaN.

    The factorizer is called with two different ``na_sentinel`` values
    and ``sort=True``, then once more with ``sort=False``. The assertions
    compare the NaN mask of the input with the sentinel positions of a
    hand-written ``expected`` array.
    """
    # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
    # rizer.factorize should not raise an exception if na_sentinel indexes
    # outside of reverse_indexer
    values = np.array([1, 2, 1, np.nan], dtype='O')
    factorizer = ht.Factorizer(len(values))
    for sentinel in (-1, 20):
        # NOTE(review): ``ids`` is never compared with ``expected``; the
        # call only has to complete without raising.
        ids = factorizer.factorize(values, sort=True, na_sentinel=sentinel)
        expected = np.array([0, 1, 0, sentinel], dtype='int32')
        assert len(set(values)) == len(set(expected))
        nan_mask = pd.isna(values)
        tm.assert_numpy_array_equal(nan_mask, expected == sentinel)

    # nan still maps to na_sentinel when sort=False
    values = np.array([0, np.nan, 1], dtype='O')
    sentinel = -1

    # TODO(wesm): unused?
    ids = factorizer.factorize(values, sort=False, na_sentinel=sentinel)  # noqa

    expected = np.array([2, -1, 0], dtype='int32')
    assert len(set(values)) == len(set(expected))
    nan_mask = pd.isna(values)
    tm.assert_numpy_array_equal(nan_mask, expected == sentinel)
示例4: test_uint64_factorize
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_uint64_factorize(self):
data = np.array([2**63, 1, 2**63], dtype=np.uint64)
exp_labels = np.array([0, 1, 0], dtype=np.intp)
exp_uniques = np.array([2**63, 1], dtype=np.uint64)
labels, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(labels, exp_labels)
tm.assert_numpy_array_equal(uniques, exp_uniques)
data = np.array([2**63, -1, 2**63], dtype=object)
exp_labels = np.array([0, 1, 0], dtype=np.intp)
exp_uniques = np.array([2**63, -1], dtype=object)
labels, uniques = algos.factorize(data)
tm.assert_numpy_array_equal(labels, exp_labels)
tm.assert_numpy_array_equal(uniques, exp_uniques)
示例5: c_discrete
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def c_discrete(self):
    """Return ``c`` encoded as integer codes, computing it on first use.

    The result is cached in ``self._c_discrete``; the matching labels are
    stored in ``self._labels``.

    * When ``self._cmap`` is a dict, the labels are its keys that occur in
      ``self.c_unique`` (in dict order) and each element of ``self._c`` is
      replaced by the position of its label.
    * Otherwise the masked values ``self._c_masked`` are factorized with
      ``sort=True`` and written at the positions selected by
      ``self._mask``; every other position stays 0.
    """
    if self._c_discrete is not None:
        return self._c_discrete

    if not isinstance(self._cmap, dict):
        self._c_discrete = np.zeros_like(self._c, dtype=int)
        codes, uniques = pd.factorize(self._c_masked, sort=True)
        self._c_discrete[self._mask] = codes
        self._labels = uniques
        return self._c_discrete

    known_labels = [k for k in self._cmap.keys() if k in self.c_unique]
    self._labels = np.array(known_labels)
    self._c_discrete = np.zeros_like(self._c, dtype=int)
    for position, label in enumerate(self._labels):
        matches = self._c == label
        self._c_discrete[matches] = position
    return self._c_discrete
示例6: process_embarked
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def process_embarked():
    """Impute, integer-encode and optionally one-hot encode ``Embarked``.

    Works on the module-level ``df_titanic_data`` frame (which may be
    rebound) and reads the module-level ``keep_binary`` flag.

    Steps:
      1. Missing ``Embarked`` values are replaced by the most common value.
      2. The column is replaced by the integer codes from ``pd.factorize``.
      3. If ``keep_binary`` is truthy, one indicator column per code is
         appended, named ``Embarked_<code>``.
    """
    global df_titanic_data

    # Replace the missing values with the most common value of the column.
    # ``fillna`` on the column is used instead of chained indexing
    # (``df.Embarked[mask] = ...``), which is not guaranteed to write back
    # to the frame. Only the first mode is used so that ties between
    # several modes cannot cause a length mismatch.
    modes = df_titanic_data['Embarked'].dropna().mode()
    if not modes.empty:
        df_titanic_data['Embarked'] = df_titanic_data['Embarked'].fillna(
            modes.iloc[0])

    # Convert the values into integer codes.
    df_titanic_data['Embarked'] = pd.factorize(df_titanic_data['Embarked'])[0]

    # Binarize the encoded feature.
    if keep_binary:
        indicators = pd.get_dummies(df_titanic_data['Embarked']).rename(
            columns=lambda x: 'Embarked_' + str(x))
        df_titanic_data = pd.concat([df_titanic_data, indicators], axis=1)
# Define a helper function that can use RandomForestClassifier for handling the missing values of the age variable
示例7: process_cabin
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def process_cabin():
    """Derive letter and number features from the ``Cabin`` column.

    Works on the module-level ``df_titanic_data`` frame (which may be
    rebound) and reads the module-level ``keep_binary`` and ``keep_scaled``
    flags. The helpers ``get_cabin_letter`` and ``get_cabin_num`` are
    defined outside this function.

    Steps:
      1. Missing ``Cabin`` values are replaced by ``'U0'``.
      2. ``CabinLetter`` holds the factorized result of
         ``get_cabin_letter``; with ``keep_binary`` one indicator column
         per code is appended, named ``CabinLetter_<code>``.
      3. ``CabinNumber`` holds ``int(get_cabin_num(cabin)) + 1``; with
         ``keep_scaled`` a standardized copy is stored in
         ``CabinNumber_scaled``.
    """
    # Refer to the global variable that contains the Titanic examples.
    global df_titanic_data

    # Replace the missing values in the cabin variable with "U0".
    # ``fillna`` is used instead of chained indexing
    # (``df['Cabin'][mask] = ...``), which is not guaranteed to write back
    # to the frame.
    df_titanic_data['Cabin'] = df_titanic_data['Cabin'].fillna('U0')

    # The cabin number is a sequence of alphanumerical characters, so we
    # create one feature from its alphabetical part...
    cabin_letters = df_titanic_data['Cabin'].map(get_cabin_letter)
    df_titanic_data['CabinLetter'] = pd.factorize(cabin_letters)[0]

    # Binarize the cabin letter feature.
    if keep_binary:
        cletters = pd.get_dummies(df_titanic_data['CabinLetter']).rename(
            columns=lambda x: 'CabinLetter_' + str(x))
        df_titanic_data = pd.concat([df_titanic_data, cletters], axis=1)

    # ...and another feature from its numerical part.
    df_titanic_data['CabinNumber'] = (
        df_titanic_data['Cabin'].map(get_cabin_num).astype(int) + 1)

    # Scale the numerical feature.
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        # ``Series`` has no ``reshape``; go through the underlying array to
        # build the single-column 2-D input passed to the scaler, then
        # flatten the result back into one column.
        cabin_numbers = df_titanic_data['CabinNumber'].values.reshape(-1, 1)
        df_titanic_data['CabinNumber_scaled'] = (
            scaler_processing.fit_transform(cabin_numbers).ravel())
示例8: test_factorize
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_factorize(self, data_for_grouping, na_sentinel):
    """Check ``pd.factorize`` codes and uniques for the grouping fixture.

    The expected codes are ``[0, 0, NA, NA, 1, 1, 0, 2]`` with
    ``na_sentinel`` at the two NA positions, and the expected uniques are
    the fixture elements at positions 0, 4 and 7.
    """
    codes, uniques = pd.factorize(
        data_for_grouping, na_sentinel=na_sentinel
    )

    raw_expected = [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2]
    expected_codes = np.array(raw_expected, dtype=np.intp)
    first_occurrences = [0, 4, 7]
    expected_uniques = data_for_grouping.take(first_occurrences)

    tm.assert_numpy_array_equal(codes, expected_codes)
    self.assert_extension_array_equal(uniques, expected_uniques)
示例9: test_factorize_equivalence
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
    """``pd.factorize(arr)`` and ``arr.factorize()`` must give equal results.

    Both are called with the same ``na_sentinel``; their codes and their
    uniques are compared pairwise.
    """
    func_codes, func_uniques = pd.factorize(
        data_for_grouping, na_sentinel=na_sentinel
    )
    method_codes, method_uniques = data_for_grouping.factorize(
        na_sentinel=na_sentinel
    )

    tm.assert_numpy_array_equal(func_codes, method_codes)
    self.assert_extension_array_equal(func_uniques, method_uniques)
示例10: test_factorize_empty
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_factorize_empty(self, data):
    """Factorizing an empty slice yields empty codes and empty uniques.

    The codes must be an empty ``intp`` array and the uniques an empty
    array of the same type and dtype as the sliced input.
    """
    codes, uniques = pd.factorize(data[:0])

    array_type = type(data)
    expected_codes = np.array([], dtype=np.intp)
    expected_uniques = array_type._from_sequence([], dtype=data[:0].dtype)

    tm.assert_numpy_array_equal(codes, expected_codes)
    self.assert_extension_array_equal(uniques, expected_uniques)
示例11: test_groupby_extension_agg
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_groupby_extension_agg(self, as_index, data_for_grouping):
    """Group by an extension-array column and take the mean of ``A``.

    The expected index is built from the sorted uniques returned by
    ``pd.factorize``. With ``as_index`` truthy the result is compared as
    a Series; otherwise the expectation is turned into a frame with
    ``reset_index`` first.
    """
    frame = pd.DataFrame({
        "A": [1, 1, 2, 2, 3, 3, 1, 4],
        "B": data_for_grouping,
    })
    result = frame.groupby("B", as_index=as_index).A.mean()

    _, sorted_uniques = pd.factorize(data_for_grouping, sort=True)
    group_index = pd.Index(sorted_uniques, name="B")
    expected = pd.Series([3, 1, 4], index=group_index, name="A")

    if not as_index:
        expected = expected.reset_index()
        self.assert_frame_equal(result, expected)
    else:
        self.assert_series_equal(result, expected)
示例12: test_groupby_extension_no_sort
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_groupby_extension_no_sort(self, data_for_grouping):
    """Group by an extension-array column with ``sort=False``.

    The expected index is built from the unsorted uniques returned by
    ``pd.factorize``, and the expected means are listed in that order.
    """
    frame = pd.DataFrame({
        "A": [1, 1, 2, 2, 3, 3, 1, 4],
        "B": data_for_grouping,
    })
    result = frame.groupby("B", sort=False).A.mean()

    _, unsorted_uniques = pd.factorize(data_for_grouping, sort=False)
    group_index = pd.Index(unsorted_uniques, name="B")
    expected = pd.Series([1, 3, 4], index=group_index, name="A")

    self.assert_series_equal(result, expected)
示例13: test_factorized_sort
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_factorized_sort():
    """``pd.factorize(..., sort=True)`` on an unordered Categorical.

    For the values ``['b', 'b', None, 'a']`` the expected uniques are
    ``['a', 'b']``, the expected codes index into them, and the missing
    value is expected to receive the code -1.
    """
    values = pd.Categorical(['b', 'b', None, 'a'])
    codes, uniques = pd.factorize(values, sort=True)

    tm.assert_numpy_array_equal(
        codes, np.array([1, 1, -1, 0], dtype=np.intp)
    )
    tm.assert_categorical_equal(uniques, pd.Categorical(['a', 'b']))
示例14: test_factorized_sort_ordered
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_factorized_sort_ordered():
    """``pd.factorize(..., sort=True)`` on an ordered Categorical.

    The categories are declared in the order ``c, b, a``. The expected
    uniques are ``['b', 'a']``, i.e. they follow the declared category
    order rather than alphabetical order, and keep the same categories
    and orderedness as the input.
    """
    # Shared constructor options for the input and the expected uniques.
    category_spec = dict(categories=['c', 'b', 'a'], ordered=True)

    values = pd.Categorical(['b', 'b', None, 'a'], **category_spec)
    codes, uniques = pd.factorize(values, sort=True)

    tm.assert_numpy_array_equal(
        codes, np.array([0, 0, -1, 1], dtype=np.intp)
    )
    tm.assert_categorical_equal(
        uniques, pd.Categorical(['b', 'a'], **category_spec)
    )
示例15: test_basic
# 需要导入模块: import pandas [as 别名]
# 或者: from pandas import factorize [as 别名]
def test_basic(self):
labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c',
'c'])
tm.assert_numpy_array_equal(
uniques, np.array(['a', 'b', 'c'], dtype=object))
labels, uniques = algos.factorize(['a', 'b', 'b', 'a',
'a', 'c', 'c', 'c'], sort=True)
exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
exp = np.array(['a', 'b', 'c'], dtype=object)
tm.assert_numpy_array_equal(uniques, exp)
labels, uniques = algos.factorize(list(reversed(range(5))))
exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
exp = np.array([4, 3, 2, 1, 0], dtype=np.int64)
tm.assert_numpy_array_equal(uniques, exp)
labels, uniques = algos.factorize(list(reversed(range(5))), sort=True)
exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
exp = np.array([0, 1, 2, 3, 4], dtype=np.int64)
tm.assert_numpy_array_equal(uniques, exp)
labels, uniques = algos.factorize(list(reversed(np.arange(5.))))
exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64)
tm.assert_numpy_array_equal(uniques, exp)
labels, uniques = algos.factorize(list(reversed(np.arange(5.))),
sort=True)
exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
tm.assert_numpy_array_equal(labels, exp)
exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64)
tm.assert_numpy_array_equal(uniques, exp)