一种异常值检测方法（Python）

摘要：基于 pandas、NumPy、matplotlib、SciPy 与 scikit-learn，先按类别绘制各列的箱线图，再分别用四分位距（IQR）法、肖维勒（Chauvenet）准则和局部离群因子（LOF）标记并可视化各列的异常值。

import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.special  # make sure the ``scipy.special`` submodule is loaded
from sklearn.neighbors import LocalOutlierFactor

# Load the pre-processed dataset.
# NOTE(security): unpickling can execute arbitrary code. Only load this file if
# it comes from a source you trust.
df = pd.read_pickle('data/processed/01_data_processed.pkl')

# Every column except the last one is checked for outliers
# (presumably the last column is "Category" -- confirm against the data).
outlier_columns = list(df.columns[:-1])

# Global plot settings.
plt.style.use("fivethirtyeight")
plt.rcParams["figure.figsize"] = (20, 5)
plt.rcParams["figure.dpi"] = 100

# Boxplots for each category: one figure per column, grouped by "Category".
for outlier_column in outlier_columns:
    df[[outlier_column] + ["Category"]].boxplot(by="Category", figsize=(20, 10))
    plt.show()

def plot_binary_outliers(dataset, col, outlier_col, reset_index):
    """Plot one data column, highlighting the points flagged as outliers.

    ``col`` names the real data column and ``outlier_col`` names the column
    holding a binary value (outlier or not).

    Args:
        dataset (pd.DataFrame): The dataset.
        col (string): Column that you want to plot.
        outlier_col (string): Outlier column marked with true/false.
        reset_index (bool): Whether to reset the index for plotting.
    """
    # Rows with a missing value or a missing flag cannot be plotted. The copy
    # keeps the assignment below from touching (or warning about) the caller's
    # frame.
    dataset = dataset.dropna(axis=0, subset=[col, outlier_col]).copy()
    dataset[outlier_col] = dataset[outlier_col].astype("bool")

    if reset_index:
        dataset = dataset.reset_index()

    _, ax = plt.subplots()

    plt.xlabel("samples")
    plt.ylabel("value")

    is_outlier = dataset[outlier_col]

    # Plot non outliers in default color.
    ax.plot(
        dataset.index[~is_outlier],
        dataset[col][~is_outlier],
        "+",
        label="no outlier " + col,
    )
    # Plot data points that are outliers in red.
    ax.plot(
        dataset.index[is_outlier],
        dataset[col][is_outlier],
        "r+",
        label="outlier " + col,
    )

    # Labels are attached to the plot calls above so that each legend entry
    # matches the series it describes.
    plt.legend(
        loc="upper center",
        ncol=2,
        fancybox=True,
        shadow=True,
    )
    plt.show()


def mark_outliers_iqr(dataset, col):
    """Mark values as outliers using the IQR method.

    A value is flagged when it lies more than 1.5 * IQR below the first
    quartile or more than 1.5 * IQR above the third quartile.

    Args:
        dataset (pd.DataFrame): The dataset.
        col (string): The column you want to apply outlier detection to.

    Returns:
        pd.DataFrame: A copy of the dataframe with an extra boolean column
        ``col + "_outlier"`` indicating whether the value is an outlier.
    """
    dataset = dataset.copy()

    Q1 = dataset[col].quantile(0.25)
    Q3 = dataset[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    dataset[col + "_outlier"] = (dataset[col] < lower_bound) | (
        dataset[col] > upper_bound
    )

    return dataset


# Plot a single column.
col = "Ib (Amp)"
dataset = mark_outliers_iqr(df, col)
plot_binary_outliers(dataset=dataset, col=col, outlier_col=col+"_outlier", reset_index=True)

# Loop over all columns.
for col in outlier_columns:
    dataset = mark_outliers_iqr(df, col)
    plot_binary_outliers(dataset=dataset, col=col, outlier_col=col+"_outlier", reset_index=True)

# Histograms of the outlier columns, grouped by "Category".
df[outlier_columns + ["Category"]].hist(by="Category", figsize=(20, 10), layout=(3, 3))
# Notebook output of the cell above (the element reprs were lost when the page
# was extracted):
# array([[, ,],[,,],[, , ]], dtype=object)

def mark_outliers_chauvenet(dataset, col, C=2):
    """Mark outliers in one column using Chauvenet's criterion.

    Finds outliers in the specified column of the dataset and adds a binary
    column with the same name extended with '_outlier' that expresses the
    result per data point.

    Args:
        dataset (pd.DataFrame): The dataset.
        col (string): The column you want to apply outlier detection to.
        C (int, optional): Degree of certainty for the identification of
            outliers given the assumption of a normal distribution, typically
            between 1 - 10. Defaults to 2.

    Returns:
        pd.DataFrame: A copy of the dataframe with an extra boolean column
        indicating whether the value is an outlier or not.
    """
    # Function-scope import: on some SciPy versions a bare ``import scipy``
    # does not load the ``scipy.special`` submodule.
    from scipy.special import erf

    dataset = dataset.copy()

    # Compute the mean and standard deviation.
    mean = dataset[col].mean()
    std = dataset[col].std()
    N = len(dataset.index)
    criterion = 1.0 / (C * N)

    # Consider the deviation for the data points.
    deviation = abs(dataset[col] - mean) / std

    # Express the upper and lower bounds.
    high = (deviation / math.sqrt(C)).to_numpy()
    low = -high

    # Probability of observing each point, computed for all rows at once
    # instead of one row at a time.
    prob = 1.0 - 0.5 * (erf(high) - erf(low))

    # Mark as an outlier when the probability is below our criterion.
    # (NaN probabilities compare False, so missing values are never flagged.)
    dataset[col + "_outlier"] = prob < criterion
    return dataset


for col in outlier_columns:
    dataset = mark_outliers_chauvenet(df, col)
    plot_binary_outliers(dataset=dataset, col=col, outlier_col=col+"_outlier", reset_index=True)

def mark_outliers_lof(dataset, columns, n=20):
    """Mark rows as outliers using the Local Outlier Factor (LOF).

    Args:
        dataset (pd.DataFrame): The dataset.
        columns (list): The columns used as features for the LOF model.
        n (int, optional): n_neighbors. Defaults to 20.

    Returns:
        tuple: ``(dataset, outliers, X_scores)`` where ``dataset`` is a copy of
        the dataframe with an extra boolean column ``"outlier_lof"``,
        ``outliers`` is the raw result of ``fit_predict`` (a value of -1 is
        treated as an outlier), and ``X_scores`` is the fitted model's
        ``negative_outlier_factor_``.
    """
    dataset = dataset.copy()

    lof = LocalOutlierFactor(n_neighbors=n)
    data = dataset[columns]
    outliers = lof.fit_predict(data)
    X_scores = lof.negative_outlier_factor_

    dataset["outlier_lof"] = outliers == -1
    return dataset, outliers, X_scores


# Drop rows with missing values before fitting
# (presumably because LOF cannot be fitted on NaN -- confirm).
df = df.dropna()
dataset, outliers, X_scores = mark_outliers_lof(df, outlier_columns)
for col in outlier_columns:
    plot_binary_outliers(dataset=dataset, col=col, outlier_col="outlier_lof", reset_index=True)

# Selected method: Chauvenet's criterion.
for col in outlier_columns:
    dataset = mark_outliers_chauvenet(df, col)
    plot_binary_outliers(dataset=dataset, col=col, outlier_col=col+"_outlier", reset_index=True)

通过网盘分享的文件：01_data_processed.pkl

知乎学术咨询：https://www.zhihu.com/consult/people/792359672131756032?isMe=1

担任《Mechanical Systems and Signal Processing》《中国电机工程学报》等期刊审稿专家，擅长领域：信号滤波/降噪，机器学习/深度学习，时间序列预分析/预测，设备故障诊断/缺陷检测/异常检测。

——————————分割线——————————

非平稳信号的一种维格纳-维尔分布（WV分布）中交叉项的消除方法（基于滑动模式奇异谱分析）（MATLAB）

完整代码可通过知乎付费咨询获得：https://www.zhihu.com/consult/people/792359672131756032

一维时间序列信号的稀疏度度量方法（MATLAB R2018A）

算法运行环境为 MATLAB R2018A，执行一维信号的稀疏度量方法，包括峰度（Kurt）、负熵（NE）、d-范数（DN）、2-范数与1-范数之比（L2/L1）、基尼指数（GI）、修正平滑指数（MSI）、基尼指数2（GI2）、基尼指数3（GI3）、广义基尼指数（GGI）、完全广义基尼指数等。算法可迁移至金融时间序列、地震信号、机械振动信号、语音信号、声信号、生理信号（EEG、EMG）等一维时间序列信号。

完整代码可通过知乎付费咨询获得：https://www.zhihu.com/consult/people/792359672131756032

基于稀疏学习的转子断条故障诊断（MATLAB）