0.033 6.652 6.681 0.194 0.874 3.177
0.034 9.039 6.224 0.194 1.137 3.4
0.035 10.936 10.304 1.015 0.911 4.9
0.022 10.11 9.603 1.374 0.848 4.566
0.035 2.963 17.156 0.599 0.823 9.406
0.033 10.872 10.244 1.015 0.574 4.871
0.035 21.694 22.389 1.015 0.859 9.259
0.035 10.936 10.304 1.015 0.911 4.5
def correlation(dataset, threshold):
    """Delete, in place, columns that are highly correlated with an earlier column.

    Columns are scanned left to right in the order of ``dataset.corr()``.
    A column is deleted when its correlation with any earlier column that
    has *not* itself been deleted is at least ``threshold``.

    Only the signed correlation is compared, so strongly negative
    correlations are not treated as "high".

    Args:
        dataset: pandas DataFrame; it is modified in place.
        threshold: correlation cutoff; values ``>= threshold`` trigger a drop.

    Returns:
        None. The caller's ``dataset`` object loses the deleted columns.
    """
    col_corr = set()  # names of the columns deleted so far
    corr_matrix = dataset.corr()
    columns = corr_matrix.columns
    for i in range(len(columns)):
        for j in range(i):
            # Ignore partners that were already deleted: being correlated
            # with a removed column is no reason to remove this one too.
            if corr_matrix.iloc[i, j] >= threshold and columns[j] not in col_corr:
                colname = columns[i]
                col_corr.add(colname)  # record the deletion for later checks
                if colname in dataset.columns:
                    del dataset[colname]
                break  # column i is gone; no need to test further partners
ここでの方法は私にとってはうまくいきました。ほんの数行のコード: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/
import numpy as np
# Create the matrix of absolute pairwise correlations between df's columns.
corr_matrix = df.corr().abs()

# Select the strict upper triangle of the correlation matrix so that each
# pair is looked at once and the diagonal (always 1.0) is ignored.
# The builtin ``bool`` is used because the ``np.bool`` alias no longer
# exists in current NumPy releases.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Find features whose correlation with an earlier feature exceeds 0.95.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Drop those features. ``to_drop`` already holds column labels, and
# ``DataFrame.drop`` returns a new frame, so the result is bound back to df.
df = df.drop(columns=to_drop)
# Absolute pairwise correlations between df's columns.
corr_matrix = df.corr().abs()

# Positions (row indices, column indices) of the highly correlated cells.
# NOTE(review): the statement that originally populated ``high_corr_var``
# is missing from this file; the 0.8 cutoff is an assumption - confirm.
high_corr_var = np.where(corr_matrix > 0.8)

# Turn the positions into (column, column) label pairs. ``x < y`` keeps each
# pair once and also excludes the diagonal.
high_corr_var = [
    (corr_matrix.columns[x], corr_matrix.columns[y])
    for x, y in zip(*high_corr_var)
    if x < y
]
def corr_df(x, corr_val):
    """Drop features that are strongly correlated with an earlier feature.

    This lowers model complexity and aids in generalizing the model.

    Every pair of columns of ``x.corr()`` is compared once. When the
    absolute correlation of a pair is at least ``corr_val``, the later
    (right-hand) column of the pair is marked for removal and the pair is
    printed as ``dropped column | partner column | correlation``.

    Args:
        x: features DataFrame. It is not modified.
        corr_val: cutoff on the absolute correlation (e.g. 0.8).

    Returns:
        A new DataFrame without the marked columns.
    """
    # Creates the correlation matrix and the bookkeeping list.
    corr_matrix = x.corr()
    labels = corr_matrix.columns
    drop_cols = []

    # Iterates over the strict upper triangle: row j against column i, j < i.
    # Starting i at 1 and letting j run up to i - 1 includes adjacent pairs
    # such as (0, 1).
    for i in range(1, len(labels)):
        for j in range(i):
            val = corr_matrix.iloc[j, i]
            if abs(val) >= corr_val:
                # Prints the correlated feature set and the corr val
                print(labels[i], "|", labels[j], "|", round(val, 2))
                drop_cols.append(labels[i])

    # Drops the correlated columns by label, in their original order.
    marked = set(drop_cols)
    return x.drop(columns=[label for label in labels if label in marked])
def corr_df(x, corr_val):
    """Return a copy of ``x`` without features strongly correlated to others.

    Removing such features lowers model complexity and aids in
    generalizing the model.

    Args:
        x: features DataFrame. It is left untouched.
        corr_val: columns are dropped relative to this cutoff (e.g. 0.8);
            a pair counts as correlated when the absolute value of its
            correlation is greater than or equal to ``corr_val``.

    Returns:
        A DataFrame that only includes the features that were not flagged.
        When no pair reaches the cutoff, all columns are returned.
    """
    # Creates the correlation matrix.
    corr_matrix = x.corr()
    names = list(corr_matrix.columns)
    flagged = set()

    # Walks every (earlier, later) pair of columns exactly once, adjacent
    # pairs included.
    for later, later_name in enumerate(names):
        for earlier in range(later):
            val = corr_matrix.iloc[earlier, later]
            if abs(val) < corr_val:
                continue
            # Prints the correlated feature set and the corr val
            print(later_name, "|", names[earlier], "|", round(val, 2))
            # The later column of the pair is the one that gets removed.
            flagged.add(later_name)

    # Drops all flagged columns in a single call so that every one of them
    # is removed, and so that a result exists even when none is flagged.
    return x.drop(columns=[name for name in names if name in flagged])
def filter_df_corr(inp_data, corr_val):
    """Drop columns that are highly correlated with one another.

    Returns an array or a DataFrame, matching the type of ``inp_data``.
    Each pair of columns is compared once; when the absolute correlation
    of a pair is at least ``corr_val`` the pair is printed and the earlier
    (left-hand) column of the pair is dropped.

    Parameters
    ----------
    inp_data : np.array, pd.DataFrame
        Values to consider. Not modified.
    corr_val : float
        Value [0, 1] on which to base the correlation cutoff.

    Returns
    -------
    np.ndarray or pd.DataFrame
        The input without the dropped columns, as an ndarray when an
        ndarray was passed in and as a DataFrame otherwise.
    """
    # Remember the input type so the same type can be handed back.
    array_flag = isinstance(inp_data, np.ndarray)
    if array_flag:
        inp_data = pd.DataFrame(data=inp_data)

    # Creates Correlation Matrix
    corr_matrix = inp_data.corr()

    # Iterates through Correlation Matrix Table to find correlated columns
    drop_cols = set()
    n_cols = len(corr_matrix.columns)
    for i in range(n_cols):
        col = corr_matrix.columns[i]
        for k in range(i + 1, n_cols):
            val = corr_matrix.iloc[k, i]
            row = corr_matrix.index[k]
            if abs(val) >= corr_val:
                # Prints the correlated feature set and the corr val
                print(col, "|", row, "|", round(val, 2))
                # NOTE(review): the statement recording the column to drop
                # is missing from this file; dropping the first-printed
                # (earlier) column is an assumption - confirm.
                drop_cols.add(col)

    # Drops the correlated columns, keeping the original column order.
    to_drop = [name for name in inp_data.columns if name in drop_cols]
    inp_data = inp_data.drop(columns=to_drop)

    # Return same type as inp
    if array_flag:
        return inp_data.values
    return inp_data