{"id":1165398,"date":"2025-01-15T15:19:53","date_gmt":"2025-01-15T07:19:53","guid":{"rendered":"https:\/\/docs.pingcode.com\/ask\/ask-ask\/1165398.html"},"modified":"2025-01-15T15:19:56","modified_gmt":"2025-01-15T07:19:56","slug":"python%e5%a6%82%e4%bd%95%e5%8e%bb%e9%99%a4%e9%94%99%e8%af%af%e6%a0%b7%e6%9c%ac","status":"publish","type":"post","link":"https:\/\/docs.pingcode.com\/ask\/1165398.html","title":{"rendered":"python\u5982\u4f55\u53bb\u9664\u9519\u8bef\u6837\u672c"},"content":{"rendered":"<p style=\"text-align:center;\" ><img decoding=\"async\" src=\"https:\/\/cdn-kb.worktile.com\/kb\/wp-content\/uploads\/2024\/04\/25205722\/c32ae21b-ffff-4aac-bbab-bdf4aac3129e.webp\" alt=\"python\u5982\u4f55\u53bb\u9664\u9519\u8bef\u6837\u672c\" \/><\/p>\n<p><p> \u5728Python\u4e2d\u53bb\u9664\u9519\u8bef\u6837\u672c\u53ef\u4ee5\u901a\u8fc7<strong>\u6570\u636e\u6e05\u6d17\u3001\u5f02\u5e38\u68c0\u6d4b\u3001\u7edf\u8ba1\u65b9\u6cd5<\/strong>\u7b49\u591a\u79cd\u624b\u6bb5\u6765\u5b9e\u73b0\u3002\u4ee5\u6570\u636e\u6e05\u6d17\u4e3a\u4f8b\uff0c\u53ef\u4ee5\u5229\u7528Pandas\u5e93\u4e2d\u7684\u51fd\u6570\u5bf9\u6570\u636e\u8fdb\u884c\u7b5b\u9009\u548c\u8fc7\u6ee4\uff0c\u786e\u4fdd\u6570\u636e\u7684\u5b8c\u6574\u6027\u548c\u51c6\u786e\u6027\u3002<strong>\u6570\u636e\u6e05\u6d17<\/strong>\u662f\u5904\u7406\u548c\u5206\u6790\u6570\u636e\u524d\u7684\u5173\u952e\u6b65\u9aa4\uff0c\u901a\u8fc7\u8bc6\u522b\u548c\u5220\u9664\u5305\u542b\u7f3a\u5931\u503c\u6216\u5f02\u5e38\u503c\u7684\u6837\u672c\uff0c\u53ef\u4ee5\u5927\u5e45\u63d0\u5347\u6570\u636e\u8d28\u91cf\uff0c\u8fdb\u800c\u63d0\u9ad8\u6a21\u578b\u7684\u6027\u80fd\u548c\u51c6\u786e\u6027\u3002<\/p>\n<\/p>\n<p><h3>\u4e00\u3001\u6570\u636e\u6e05\u6d17<\/h3>\n<\/p>\n<p><p>\u6570\u636e\u6e05\u6d17\u662f\u6570\u636e\u9884\u5904\u7406\u4e2d\u7684\u5173\u952e\u6b65\u9aa4\u4e4b\u4e00\uff0c\u901a\u5e38\u5305\u62ec\u5904\u7406\u7f3a\u5931\u503c\u3001\u91cd\u590d\u503c\u3001\u5f02\u5e38\u503c\u4ee5\u53ca\u6570\u636e\u8f6c\u6362\u7b49\u3002\u4ee
5\u4e0b\u662f\u4e00\u4e9b\u5e38\u89c1\u7684\u6570\u636e\u6e05\u6d17\u65b9\u6cd5\uff1a<\/p>\n<\/p>\n<p><h4>1.1 \u5904\u7406\u7f3a\u5931\u503c<\/h4>\n<\/p>\n<p><p>\u7f3a\u5931\u503c\u662f\u6570\u636e\u96c6\u4e2d\u5e38\u89c1\u7684\u95ee\u9898\u3002\u5904\u7406\u7f3a\u5931\u503c\u7684\u5e38\u7528\u65b9\u6cd5\u5305\u62ec\u5220\u9664\u542b\u7f3a\u5931\u503c\u7684\u6837\u672c\u3001\u7528\u5747\u503c\u6216\u4e2d\u4f4d\u6570\u586b\u8865\u7f3a\u5931\u503c\u3001\u6216\u8005\u4f7f\u7528\u63d2\u503c\u65b9\u6cd5\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import pandas as pd<\/p>\n<h2><strong>\u8bfb\u53d6\u6570\u636e<\/strong><\/h2>\n<p>data = pd.read_csv(&#39;data.csv&#39;)<\/p>\n<h2><strong>\u5220\u9664\u542b\u6709\u4efb\u4f55\u7f3a\u5931\u503c\u7684\u884c<\/strong><\/h2>\n<p>data_cleaned = data.dropna()<\/p>\n<h2><strong>\u7528\u5747\u503c\u586b\u8865\u7f3a\u5931\u503c<\/strong><\/h2>\n<p>data_filled = data.fillna(data.mean())<\/p>\n<h2><strong>\u7528\u63d2\u503c\u65b9\u6cd5\u586b\u8865\u7f3a\u5931\u503c<\/strong><\/h2>\n<p>data_interpolated = data.interpolate()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>1.2 \u5904\u7406\u91cd\u590d\u503c<\/h4>\n<\/p>\n<p><p>\u91cd\u590d\u503c\u53ef\u80fd\u4f1a\u5f71\u54cd\u6a21\u578b\u7684\u6027\u80fd\u3002\u53ef\u4ee5\u4f7f\u7528Pandas\u5e93\u4e2d\u7684<code>drop_duplicates<\/code>\u51fd\u6570\u6765\u5220\u9664\u91cd\u590d\u503c\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\"># \u5220\u9664\u91cd\u590d\u503c<\/p>\n<p>data_no_duplicates = data.drop_duplicates()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>1.3 
\u5904\u7406\u5f02\u5e38\u503c<\/h4>\n<\/p>\n<p><p>\u5f02\u5e38\u503c\u53ef\u80fd\u4f1a\u5bf9\u6a21\u578b\u4ea7\u751f\u4e0d\u5229\u5f71\u54cd\u3002\u5e38\u89c1\u7684\u5904\u7406\u65b9\u6cd5\u5305\u62ec\u5220\u9664\u5f02\u5e38\u503c\u3001\u4f7f\u7528\u7edf\u8ba1\u65b9\u6cd5\u8bc6\u522b\u5f02\u5e38\u503c\uff08\u5982Z-score\u3001IQR\uff09\u7b49\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">import numpy as np<\/p>\n<h2><strong>\u4f7f\u7528Z-score\u65b9\u6cd5\u68c0\u6d4b\u548c\u5220\u9664\u5f02\u5e38\u503c<\/strong><\/h2>\n<p>from scipy import stats<\/p>\n<p>z_scores = np.abs(stats.zscore(data))<\/p>\n<p>data_no_outliers = data[(z_scores &lt; 3).all(axis=1)]<\/p>\n<h2><strong>\u4f7f\u7528IQR\u65b9\u6cd5\u68c0\u6d4b\u548c\u5220\u9664\u5f02\u5e38\u503c<\/strong><\/h2>\n<p>Q1 = data.quantile(0.25)<\/p>\n<p>Q3 = data.quantile(0.75)<\/p>\n<p>IQR = Q3 - Q1<\/p>\n<p>data_no_outliers = data[~((data &lt; (Q1 - 1.5 * IQR)) |(data &gt; (Q3 + 1.5 * IQR))).any(axis=1)]<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e8c\u3001\u5f02\u5e38\u68c0\u6d4b<\/h3>\n<\/p>\n<p><p>\u5f02\u5e38\u68c0\u6d4b\u662f\u4e00\u79cd\u7528\u4e8e\u8bc6\u522b\u6570\u636e\u96c6\u4e2d\u5f02\u5e38\u503c\u6216\u79bb\u7fa4\u70b9\u7684\u6280\u672f\u3002\u5e38\u89c1\u7684\u65b9\u6cd5\u5305\u62ec\u57fa\u4e8e\u7edf\u8ba1\u7684\u5f02\u5e38\u68c0\u6d4b\u3001\u57fa\u4e8e<a href=\"https:\/\/docs.pingcode.com\/ask\/59192.html\" target=\"_blank\">\u673a\u5668\u5b66\u4e60<\/a>\u7684\u5f02\u5e38\u68c0\u6d4b\u7b49\u3002<\/p>\n<\/p>\n<p><h4>2.1 \u57fa\u4e8e\u7edf\u8ba1\u7684\u5f02\u5e38\u68c0\u6d4b<\/h4>\n<\/p>\n<p><p>\u57fa\u4e8e\u7edf\u8ba1\u7684\u65b9\u6cd5\u901a\u5e38\u5229\u7528\u6570\u636e\u7684\u7edf\u8ba1\u7279\u6027\u6765\u8bc6\u522b\u5f02\u5e38\u503c\u3002\u5e38\u89c1\u7684\u65b9\u6cd5\u5305\u62ecZ-score\u3001IQR\u7b49\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\"># 
\u4f7f\u7528Z-score\u65b9\u6cd5\u68c0\u6d4b\u5f02\u5e38\u503c<\/p>\n<p>z_scores = np.abs(stats.zscore(data))<\/p>\n<p>outliers = data[(z_scores &gt;= 3).any(axis=1)]<\/p>\n<h2><strong>\u4f7f\u7528IQR\u65b9\u6cd5\u68c0\u6d4b\u5f02\u5e38\u503c<\/strong><\/h2>\n<p>Q1 = data.quantile(0.25)<\/p>\n<p>Q3 = data.quantile(0.75)<\/p>\n<p>IQR = Q3 - Q1<\/p>\n<p>outliers = data[((data &lt; (Q1 - 1.5 * IQR)) |(data &gt; (Q3 + 1.5 * IQR))).any(axis=1)]<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>2.2 \u57fa\u4e8e\u673a\u5668\u5b66\u4e60\u7684\u5f02\u5e38\u68c0\u6d4b<\/h4>\n<\/p>\n<p><p>\u57fa\u4e8e\u673a\u5668\u5b66\u4e60\u7684\u65b9\u6cd5\u5229\u7528\u8bad\u7ec3\u597d\u7684\u6a21\u578b\u6765\u8bc6\u522b\u5f02\u5e38\u503c\u3002\u5e38\u89c1\u7684\u65b9\u6cd5\u5305\u62ec\u4f7f\u7528Isolation Forest\u3001One-Class SVM\u7b49\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.ensemble import IsolationForest<\/p>\n<p>from sklearn.svm import OneClassSVM<\/p>\n<h2><strong>\u4f7f\u7528Isolation Forest\u65b9\u6cd5\u68c0\u6d4b\u5f02\u5e38\u503c<\/strong><\/h2>\n<p>clf = IsolationForest(contamination=0.1)<\/p>\n<p>clf.fit(data)<\/p>\n<p>outliers = data[clf.predict(data) == -1]<\/p>\n<h2><strong>\u4f7f\u7528One-Class SVM\u65b9\u6cd5\u68c0\u6d4b\u5f02\u5e38\u503c<\/strong><\/h2>\n<p>clf = OneClassSVM(nu=0.1, kernel=&quot;rbf&quot;, gamma=0.1)<\/p>\n<p>clf.fit(data)<\/p>\n<p>outliers = data[clf.predict(data) == -1]<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e09\u3001\u6570\u636e\u8f6c\u6362<\/h3>\n<\/p>\n<p><p>\u6570\u636e\u8f6c\u6362\u662f\u5c06\u6570\u636e\u4ece\u4e00\u79cd\u683c\u5f0f\u6216\u7ed3\u6784\u8f6c\u6362\u4e3a\u53e6\u4e00\u79cd\u683c\u5f0f\u6216\u7ed3\u6784\u7684\u8fc7\u7a0b\u3002\u5e38\u89c1\u7684\u6570\u636e\u8f6c\u6362\u65b9\u6cd5\u5305\u62ec\u6807\u51c6\u5316\u3001\u5f52\u4e00\u5316\u3001\u7f16\u7801\u7b49\u3002<\/p>\n<\/p>\n<p><h4>3.1 
\u6807\u51c6\u5316\u548c\u5f52\u4e00\u5316<\/h4>\n<\/p>\n<p><p>\u6807\u51c6\u5316\u548c\u5f52\u4e00\u5316\u662f\u6570\u636e\u8f6c\u6362\u7684\u5e38\u7528\u65b9\u6cd5\uff0c\u901a\u5e38\u7528\u4e8e\u5c06\u6570\u636e\u7f29\u653e\u5230\u7279\u5b9a\u8303\u56f4\u5185\uff0c\u4ece\u800c\u4f7f\u6a21\u578b\u66f4\u5bb9\u6613\u8bad\u7ec3\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.preprocessing import StandardScaler, MinMaxScaler<\/p>\n<h2><strong>\u6807\u51c6\u5316\u6570\u636e<\/strong><\/h2>\n<p>scaler = StandardScaler()<\/p>\n<p>data_standardized = scaler.fit_transform(data)<\/p>\n<h2><strong>\u5f52\u4e00\u5316\u6570\u636e<\/strong><\/h2>\n<p>scaler = MinMaxScaler()<\/p>\n<p>data_normalized = scaler.fit_transform(data)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>3.2 \u7f16\u7801<\/h4>\n<\/p>\n<p><p>\u7f16\u7801\u662f\u5c06\u5206\u7c7b\u53d8\u91cf\u8f6c\u6362\u4e3a\u6570\u503c\u53d8\u91cf\u7684\u8fc7\u7a0b\u3002\u5e38\u89c1\u7684\u7f16\u7801\u65b9\u6cd5\u5305\u62ec\u6807\u7b7e\u7f16\u7801\u3001\u72ec\u70ed\u7f16\u7801\u7b49\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.preprocessing import LabelEncoder, OneHotEncoder<\/p>\n<h2><strong>\u6807\u7b7e\u7f16\u7801<\/strong><\/h2>\n<p>label_encoder = LabelEncoder()<\/p>\n<p>data[&#39;category&#39;] = label_encoder.fit_transform(data[&#39;category&#39;])<\/p>\n<h2><strong>\u72ec\u70ed\u7f16\u7801<\/strong><\/h2>\n<p>one_hot_encoder = OneHotEncoder()<\/p>\n<p>data_encoded = 
one_hot_encoder.fit_transform(data[[&#39;category&#39;]])<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u56db\u3001\u7279\u5f81\u9009\u62e9<\/h3>\n<\/p>\n<p><p>\u7279\u5f81\u9009\u62e9\u662f\u4ece\u6570\u636e\u96c6\u4e2d\u9009\u62e9\u6700\u5177\u4ee3\u8868\u6027\u7684\u7279\u5f81\uff0c\u4ee5\u63d0\u9ad8\u6a21\u578b\u7684\u6027\u80fd\u548c\u53ef\u89e3\u91ca\u6027\u3002\u5e38\u89c1\u7684\u7279\u5f81\u9009\u62e9\u65b9\u6cd5\u5305\u62ec\u8fc7\u6ee4\u65b9\u6cd5\u3001\u5d4c\u5165\u65b9\u6cd5\u548c\u5305\u88c5\u65b9\u6cd5\u3002<\/p>\n<\/p>\n<p><h4>4.1 \u8fc7\u6ee4\u65b9\u6cd5<\/h4>\n<\/p>\n<p><p>\u8fc7\u6ee4\u65b9\u6cd5\u5229\u7528\u7edf\u8ba1\u68c0\u9a8c\u6216\u76f8\u5173\u6027\u5206\u6790\u6765\u9009\u62e9\u7279\u5f81\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.feature_selection import SelectKBest, chi2<\/p>\n<h2><strong>\u4f7f\u7528\u5361\u65b9\u68c0\u9a8c\u9009\u62e9\u7279\u5f81<\/strong><\/h2>\n<p>selector = SelectKBest(chi2, k=10)<\/p>\n<p>data_selected = selector.fit_transform(data, labels)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>4.2 \u5d4c\u5165\u65b9\u6cd5<\/h4>\n<\/p>\n<p><p>\u5d4c\u5165\u65b9\u6cd5\u901a\u8fc7\u6a21\u578b\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\u7684\u7279\u5f81\u91cd\u8981\u6027\u6765\u9009\u62e9\u7279\u5f81\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.ensemble import RandomForestClassifier<\/p>\n<h2><strong>\u4f7f\u7528\u968f\u673a\u68ee\u6797\u9009\u62e9\u7279\u5f81<\/strong><\/h2>\n<p>clf = RandomForestClassifier()<\/p>\n<p>clf.fit(data, labels)<\/p>\n<p>importances = clf.feature_importances_<\/p>\n<p>indices = np.argsort(importances)[::-1]<\/p>\n<p>data_selected = data[:, indices[:10]]<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>4.3 
\u5305\u88c5\u65b9\u6cd5<\/h4>\n<\/p>\n<p><p>\u5305\u88c5\u65b9\u6cd5\u901a\u8fc7\u8fed\u4ee3\u8bad\u7ec3\u6a21\u578b\u548c\u9009\u62e9\u7279\u5f81\u6765\u4f18\u5316\u7279\u5f81\u9009\u62e9\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.feature_selection import RFE<\/p>\n<p>from sklearn.linear_model import LogisticRegression<\/p>\n<h2><strong>\u4f7f\u7528\u9012\u5f52\u7279\u5f81\u6d88\u9664\u9009\u62e9\u7279\u5f81<\/strong><\/h2>\n<p>clf = LogisticRegression()<\/p>\n<p>selector = RFE(clf, n_features_to_select=10, step=1)<\/p>\n<p>data_selected = selector.fit_transform(data, labels)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e94\u3001\u6570\u636e\u589e\u5f3a<\/h3>\n<\/p>\n<p><p>\u6570\u636e\u589e\u5f3a\u662f\u901a\u8fc7\u751f\u6210\u65b0\u7684\u6837\u672c\u6765\u6269\u5145\u6570\u636e\u96c6\u7684\u65b9\u6cd5\uff0c\u901a\u5e38\u7528\u4e8e\u5904\u7406\u4e0d\u5e73\u8861\u6570\u636e\u96c6\u3002\u5e38\u89c1\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5\u5305\u62ec\u8fc7\u91c7\u6837\u3001\u6b20\u91c7\u6837\u3001\u6570\u636e\u5408\u6210\u7b49\u3002<\/p>\n<\/p>\n<p><h4>5.1 \u8fc7\u91c7\u6837<\/h4>\n<\/p>\n<p><p>\u8fc7\u91c7\u6837\u662f\u901a\u8fc7\u590d\u5236\u5c11\u6570\u7c7b\u6837\u672c\u6765\u589e\u52a0\u5176\u6570\u91cf\uff0c\u4ece\u800c\u5e73\u8861\u6570\u636e\u96c6\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from imblearn.over_sampling import SMOTE<\/p>\n<h2><strong>\u4f7f\u7528SMOTE\u65b9\u6cd5\u8fc7\u91c7\u6837<\/strong><\/h2>\n<p>smote = SMOTE()<\/p>\n<p>data_resampled, labels_resampled = smote.fit_resample(data, labels)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>5.2 \u6b20\u91c7\u6837<\/h4>\n<\/p>\n<p><p>\u6b20\u91c7\u6837\u662f\u901a\u8fc7\u51cf\u5c11\u591a\u6570\u7c7b\u6837\u672c\u6765\u5e73\u8861\u6570\u636e\u96c6\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code 
class=\"language-python\">from imblearn.under_sampling import RandomUnderSampler<\/p>\n<h2><strong>\u4f7f\u7528\u968f\u673a\u6b20\u91c7\u6837\u65b9\u6cd5<\/strong><\/h2>\n<p>rus = RandomUnderSampler()<\/p>\n<p>data_resampled, labels_resampled = rus.fit_resample(data, labels)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>5.3 \u6570\u636e\u5408\u6210<\/h4>\n<\/p>\n<p><p>\u6570\u636e\u5408\u6210\u662f\u901a\u8fc7\u751f\u6210\u65b0\u7684\u6837\u672c\u6765\u6269\u5145\u6570\u636e\u96c6\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.datasets import make_classification<\/p>\n<h2><strong>\u4f7f\u7528make_classification\u65b9\u6cd5\u751f\u6210\u65b0\u7684\u6837\u672c<\/strong><\/h2>\n<p>data_synthetic, labels_synthetic = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, n_classes=2)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u516d\u3001\u6a21\u578b\u8bc4\u4f30\u548c\u9a8c\u8bc1<\/h3>\n<\/p>\n<p><p>\u6a21\u578b\u8bc4\u4f30\u548c\u9a8c\u8bc1\u662f\u786e\u4fdd\u6a21\u578b\u6027\u80fd\u548c\u53ef\u9760\u6027\u7684\u5173\u952e\u6b65\u9aa4\u3002\u5e38\u89c1\u7684\u65b9\u6cd5\u5305\u62ec\u4ea4\u53c9\u9a8c\u8bc1\u3001\u6df7\u6dc6\u77e9\u9635\u3001ROC\u66f2\u7ebf\u7b49\u3002<\/p>\n<\/p>\n<p><h4>6.1 \u4ea4\u53c9\u9a8c\u8bc1<\/h4>\n<\/p>\n<p><p>\u4ea4\u53c9\u9a8c\u8bc1\u662f\u4e00\u79cd\u8bc4\u4f30\u6a21\u578b\u6027\u80fd\u7684\u5e38\u7528\u65b9\u6cd5\uff0c\u901a\u8fc7\u5c06\u6570\u636e\u96c6\u5212\u5206\u4e3a\u591a\u4e2a\u5b50\u96c6\uff0c\u4f9d\u6b21\u4f7f\u7528\u6bcf\u4e2a\u5b50\u96c6\u4f5c\u4e3a\u9a8c\u8bc1\u96c6\uff0c\u5176\u4f59\u5b50\u96c6\u4f5c\u4e3a\u8bad\u7ec3\u96c6\uff0c\u4ece\u800c\u8bc4\u4f30\u6a21\u578b\u7684\u7a33\u5b9a\u6027\u548c\u6cdb\u5316\u80fd\u529b\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.model_selection import cross_val_score<\/p>\n<p>from sklearn.ensemble import 
RandomForestClassifier<\/p>\n<h2><strong>\u4f7f\u7528\u4ea4\u53c9\u9a8c\u8bc1\u8bc4\u4f30\u6a21\u578b\u6027\u80fd<\/strong><\/h2>\n<p>clf = RandomForestClassifier()<\/p>\n<p>scores = cross_val_score(clf, data, labels, cv=5)<\/p>\n<p>print(f&#39;Cross-Validation Accuracy: {scores.mean():.2f} \u00b1 {scores.std():.2f}&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>6.2 \u6df7\u6dc6\u77e9\u9635<\/h4>\n<\/p>\n<p><p>\u6df7\u6dc6\u77e9\u9635\u662f\u4e00\u79cd\u7528\u4e8e\u8bc4\u4f30\u5206\u7c7b\u6a21\u578b\u6027\u80fd\u7684\u5de5\u5177\uff0c\u901a\u8fc7\u663e\u793a\u6a21\u578b\u9884\u6d4b\u7ed3\u679c\u7684\u6b63\u786e\u548c\u9519\u8bef\u5206\u7c7b\u60c5\u51b5\uff0c\u5e2e\u52a9\u6211\u4eec\u4e86\u89e3\u6a21\u578b\u7684\u5206\u7c7b\u80fd\u529b\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.metrics import confusion_matrix, classification_report<\/p>\n<p>from sklearn.model_selection import train_test_split<\/p>\n<h2><strong>\u5212\u5206\u8bad\u7ec3\u96c6\u548c\u6d4b\u8bd5\u96c6<\/strong><\/h2>\n<p>X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)<\/p>\n<h2><strong>\u8bad\u7ec3\u6a21\u578b\u5e76\u9884\u6d4b<\/strong><\/h2>\n<p>clf = RandomForestClassifier()<\/p>\n<p>clf.fit(X_train, y_train)<\/p>\n<p>y_pred = clf.predict(X_test)<\/p>\n<h2><strong>\u8ba1\u7b97\u6df7\u6dc6\u77e9\u9635<\/strong><\/h2>\n<p>cm = confusion_matrix(y_test, y_pred)<\/p>\n<p>print(f&#39;Confusion Matrix:\\n{cm}&#39;)<\/p>\n<h2><strong>\u751f\u6210\u5206\u7c7b\u62a5\u544a<\/strong><\/h2>\n<p>report = classification_report(y_test, y_pred)<\/p>\n<p>print(f&#39;Classification Report:\\n{report}&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>6.3 
ROC\u66f2\u7ebf<\/h4>\n<\/p>\n<p><p>ROC\u66f2\u7ebf\u662f\u4e00\u79cd\u7528\u4e8e\u8bc4\u4f30\u5206\u7c7b\u6a21\u578b\u6027\u80fd\u7684\u5de5\u5177\uff0c\u901a\u8fc7\u7ed8\u5236\u771f\u9633\u6027\u7387\uff08TPR\uff09\u548c\u5047\u9633\u6027\u7387\uff08FPR\uff09\u4e4b\u95f4\u7684\u5173\u7cfb\uff0c\u5e2e\u52a9\u6211\u4eec\u4e86\u89e3\u6a21\u578b\u7684\u533a\u5206\u80fd\u529b\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.metrics import roc_curve, auc<\/p>\n<p>import matplotlib.pyplot as plt<\/p>\n<h2><strong>\u8ba1\u7b97ROC\u66f2\u7ebf<\/strong><\/h2>\n<p>fpr, tpr, thresholds = roc_curve(y_test, clf.predict_proba(X_test)[:, 1])<\/p>\n<p>roc_auc = auc(fpr, tpr)<\/p>\n<h2><strong>\u7ed8\u5236ROC\u66f2\u7ebf<\/strong><\/h2>\n<p>plt.figure()<\/p>\n<p>plt.plot(fpr, tpr, color=&#39;darkorange&#39;, lw=2, label=f&#39;ROC curve (area = {roc_auc:.2f})&#39;)<\/p>\n<p>plt.plot([0, 1], [0, 1], color=&#39;navy&#39;, lw=2, linestyle=&#39;--&#39;)<\/p>\n<p>plt.xlim([0.0, 1.0])<\/p>\n<p>plt.ylim([0.0, 1.05])<\/p>\n<p>plt.xlabel(&#39;False Positive Rate&#39;)<\/p>\n<p>plt.ylabel(&#39;True Positive Rate&#39;)<\/p>\n<p>plt.title(&#39;Receiver Operating Characteristic&#39;)<\/p>\n<p>plt.legend(loc=&#39;lower right&#39;)<\/p>\n<p>plt.show()<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u4e03\u3001\u6a21\u578b\u4f18\u5316<\/h3>\n<\/p>\n<p><p>\u6a21\u578b\u4f18\u5316\u662f\u63d0\u9ad8\u6a21\u578b\u6027\u80fd\u548c\u7cbe\u5ea6\u7684\u8fc7\u7a0b\uff0c\u5e38\u89c1\u7684\u65b9\u6cd5\u5305\u62ec\u8d85\u53c2\u6570\u8c03\u4f18\u3001\u7279\u5f81\u5de5\u7a0b\u3001\u6a21\u578b\u96c6\u6210\u7b49\u3002<\/p>\n<\/p>\n<p><h4>7.1 
\u8d85\u53c2\u6570\u8c03\u4f18<\/h4>\n<\/p>\n<p><p>\u8d85\u53c2\u6570\u8c03\u4f18\u662f\u901a\u8fc7\u8c03\u6574\u6a21\u578b\u7684\u8d85\u53c2\u6570\u6765\u4f18\u5316\u6a21\u578b\u6027\u80fd\u7684\u8fc7\u7a0b\u3002\u5e38\u89c1\u7684\u65b9\u6cd5\u5305\u62ec\u7f51\u683c\u641c\u7d22\u3001\u968f\u673a\u641c\u7d22\u7b49\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.model_selection import GridSearchCV, RandomizedSearchCV<\/p>\n<h2><strong>\u4f7f\u7528\u7f51\u683c\u641c\u7d22\u8fdb\u884c\u8d85\u53c2\u6570\u8c03\u4f18<\/strong><\/h2>\n<p>param_grid = {&#39;n_estimators&#39;: [50, 100, 200], &#39;max_depth&#39;: [None, 10, 20, 30]}<\/p>\n<p>grid_search = GridSearchCV(clf, param_grid, cv=5, scoring=&#39;accuracy&#39;)<\/p>\n<p>grid_search.fit(data, labels)<\/p>\n<p>print(f&#39;Best Parameters: {grid_search.best_params_}&#39;)<\/p>\n<p>print(f&#39;Best Score: {grid_search.best_score_:.2f}&#39;)<\/p>\n<h2><strong>\u4f7f\u7528\u968f\u673a\u641c\u7d22\u8fdb\u884c\u8d85\u53c2\u6570\u8c03\u4f18<\/strong><\/h2>\n<p>param_dist = {&#39;n_estimators&#39;: [50, 100, 200], &#39;max_depth&#39;: [None, 10, 20, 30]}<\/p>\n<p>random_search = RandomizedSearchCV(clf, param_dist, cv=5, scoring=&#39;accuracy&#39;, n_iter=10, random_state=42)<\/p>\n<p>random_search.fit(data, labels)<\/p>\n<p>print(f&#39;Best Parameters: {random_search.best_params_}&#39;)<\/p>\n<p>print(f&#39;Best Score: {random_search.best_score_:.2f}&#39;)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>7.2 
\u7279\u5f81\u5de5\u7a0b<\/h4>\n<\/p>\n<p><p>\u7279\u5f81\u5de5\u7a0b\u662f\u901a\u8fc7\u521b\u5efa\u65b0\u7684\u7279\u5f81\u6216\u8f6c\u6362\u73b0\u6709\u7279\u5f81\u6765\u63d0\u9ad8\u6a21\u578b\u6027\u80fd\u7684\u8fc7\u7a0b\u3002\u5e38\u89c1\u7684\u65b9\u6cd5\u5305\u62ec\u7279\u5f81\u7ec4\u5408\u3001\u7279\u5f81\u7f29\u653e\u3001\u7279\u5f81\u9009\u62e9\u7b49\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\"># \u7279\u5f81\u7ec4\u5408<\/p>\n<p>data[&#39;new_feature&#39;] = data[&#39;feature1&#39;] * data[&#39;feature2&#39;]<\/p>\n<h2><strong>\u7279\u5f81\u7f29\u653e<\/strong><\/h2>\n<p>scaler = StandardScaler()<\/p>\n<p>data_scaled = scaler.fit_transform(data)<\/p>\n<h2><strong>\u7279\u5f81\u9009\u62e9<\/strong><\/h2>\n<p>selector = SelectKBest(chi2, k=10)<\/p>\n<p>data_selected = selector.fit_transform(data, labels)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h4>7.3 \u6a21\u578b\u96c6\u6210<\/h4>\n<\/p>\n<p><p>\u6a21\u578b\u96c6\u6210\u662f\u901a\u8fc7\u7ed3\u5408\u591a\u4e2a\u6a21\u578b\u7684\u9884\u6d4b\u7ed3\u679c\u6765\u63d0\u9ad8\u6a21\u578b\u6027\u80fd\u7684\u8fc7\u7a0b\u3002\u5e38\u89c1\u7684\u65b9\u6cd5\u5305\u62ecBagging\u3001Boosting\u3001Stacking\u7b49\u3002\u4ee5\u4e0b\u662f\u4e00\u4e9b\u4ee3\u7801\u793a\u4f8b\uff1a<\/p>\n<\/p>\n<p><pre><code class=\"language-python\">from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, StackingClassifier<\/p>\n<h2><strong>\u4f7f\u7528Bagging\u96c6\u6210\u65b9\u6cd5<\/strong><\/h2>\n<p>bagging_clf = BaggingClassifier(base_estimator=clf, n_estimators=10, random_state=42)<\/p>\n<p>bagging_clf.fit(data, labels)<\/p>\n<h2><strong>\u4f7f\u7528Boosting\u96c6\u6210\u65b9\u6cd5<\/strong><\/h2>\n<p>boosting_clf = GradientBoostingClassifier(n_estimators=100, random_state=42)<\/p>\n<p>boosting_clf.fit(data, labels)<\/p>\n<h2><strong>\u4f7f\u7528Stacking\u96c6\u6210\u65b9\u6cd5<\/strong><\/h2>\n<p>estimators = [(&#39;rf&#39;, 
RandomForestClassifier(n_estimators=50, random_state=42)),<\/p>\n<p>              (&#39;gb&#39;, GradientBoostingClassifier(n_estimators=50, random_state=42))]<\/p>\n<p>stacking_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())<\/p>\n<p>stacking_clf.fit(data, labels)<\/p>\n<p><\/code><\/pre>\n<\/p>\n<p><h3>\u516b\u3001\u603b\u7ed3<\/h3>\n<\/p>\n<p><p>\u5728Python\u4e2d\u53bb\u9664\u9519\u8bef\u6837\u672c\u662f\u6570\u636e\u9884\u5904\u7406\u548c\u6a21\u578b\u8bad\u7ec3\u4e2d\u7684\u91cd\u8981\u6b65\u9aa4\u3002\u901a\u8fc7<strong>\u6570\u636e\u6e05\u6d17\u3001\u5f02\u5e38\u68c0\u6d4b\u3001\u6570\u636e\u8f6c\u6362\u3001\u7279\u5f81\u9009\u62e9\u3001\u6570\u636e\u589e\u5f3a\u3001\u6a21\u578b\u8bc4\u4f30\u548c\u9a8c\u8bc1\u3001\u6a21\u578b\u4f18\u5316<\/strong>\u7b49\u65b9\u6cd5\uff0c\u53ef\u4ee5\u6709\u6548\u8bc6\u522b\u548c\u5904\u7406\u9519\u8bef\u6837\u672c\uff0c\u63d0\u9ad8\u6570\u636e\u8d28\u91cf\u548c\u6a21\u578b\u6027\u80fd\u3002\u6bcf\u4e2a\u65b9\u6cd5\u90fd\u6709\u5176\u7279\u5b9a\u7684\u5e94\u7528\u573a\u666f\u548c\u64cd\u4f5c\u6b65\u9aa4\uff0c\u9700\u8981\u6839\u636e\u5177\u4f53\u95ee\u9898\u9009\u62e9\u5408\u9002\u7684\u65b9\u6cd5\u8fdb\u884c\u5904\u7406\u3002\u901a\u8fc7\u7cfb\u7edf\u5730\u5e94\u7528\u8fd9\u4e9b\u65b9\u6cd5\uff0c\u53ef\u4ee5\u6784\u5efa\u51fa\u66f4\u52a0\u51c6\u786e\u548c\u9c81\u68d2\u7684\u673a\u5668\u5b66\u4e60\u6a21\u578b\uff0c\u4ece\u800c\u63d0\u5347\u6570\u636e\u5206\u6790\u548c\u51b3\u7b56\u7684\u79d1\u5b66\u6027\u548c\u53ef\u9760\u6027\u3002<\/p>\n<\/p>\n<h2><strong>\u76f8\u5173\u95ee\u7b54FAQs\uff1a<\/strong><\/h2>\n<p> <strong>\u5982\u4f55\u5728Python\u4e2d\u8bc6\u522b\u9519\u8bef\u6837\u672c\uff1f<\/strong><br 
\/>\u8bc6\u522b\u9519\u8bef\u6837\u672c\u901a\u5e38\u53ef\u4ee5\u901a\u8fc7\u6570\u636e\u9a8c\u8bc1\u548c\u6e05\u7406\u6280\u672f\u6765\u5b9e\u73b0\u3002\u4f7f\u7528\u6570\u636e\u5206\u6790\u5e93\u5982Pandas\uff0c\u53ef\u4ee5\u5bf9\u6570\u636e\u96c6\u8fdb\u884c\u57fa\u672c\u7684\u7edf\u8ba1\u5206\u6790\uff0c\u67e5\u627e\u7f3a\u5931\u503c\u3001\u5f02\u5e38\u503c\u6216\u4e0d\u7b26\u5408\u9884\u671f\u683c\u5f0f\u7684\u6570\u636e\u3002\u5e38\u7528\u7684\u6280\u672f\u5305\u62ec\u53ef\u89c6\u5316\u5de5\u5177\uff08\u5982\u76f4\u65b9\u56fe\u548c\u7bb1\u7ebf\u56fe\uff09\u6765\u53d1\u73b0\u5f02\u5e38\u70b9\uff0c\u4ee5\u53ca\u4f7f\u7528\u6761\u4ef6\u7b5b\u9009\u6765\u6807\u8bc6\u4e0d\u7b26\u5408\u6807\u51c6\u7684\u6570\u636e\u884c\u3002<\/p>\n<p><strong>\u5728\u5904\u7406\u6570\u636e\u65f6\uff0c\u5982\u4f55\u6709\u6548\u5730\u53bb\u9664\u7f3a\u5931\u503c\uff1f<\/strong><br \/>\u53bb\u9664\u7f3a\u5931\u503c\u53ef\u4ee5\u901a\u8fc7Pandas\u5e93\u4e2d\u7684<code>dropna()<\/code>\u51fd\u6570\u5b9e\u73b0\u3002\u8be5\u51fd\u6570\u5141\u8bb8\u7528\u6237\u6839\u636e\u7279\u5b9a\u6761\u4ef6\uff08\u5982\u5220\u9664\u6240\u6709\u7f3a\u5931\u503c\u7684\u884c\u6216\u5217\uff09\u6765\u6e05\u7406\u6570\u636e\u3002\u5bf9\u4e8e\u8f83\u5927\u7684\u6570\u636e\u96c6\uff0c\u53ef\u4ee5\u8003\u8651\u586b\u8865\u7f3a\u5931\u503c\uff08\u5982\u4f7f\u7528\u5747\u503c\u6216\u4e2d\u4f4d\u6570\u586b\u8865\uff09\uff0c\u4ee5\u907f\u514d\u4e22\u5931\u8fc7\u591a\u4fe1\u606f\uff0c\u540c\u65f6\u4fdd\u6301\u6570\u636e\u7684\u5b8c\u6574\u6027\u3002<\/p>\n<p><strong>\u6709\u6ca1\u6709\u63a8\u8350\u7684\u5e93\u6216\u5de5\u5177\u53ef\u4ee5\u5e2e\u52a9\u68c0\u6d4b\u548c\u53bb\u9664\u9519\u8bef\u6837\u672c\uff1f<\/strong><br 
\/>\u9664\u4e86Pandas\uff0c\u8fd8\u6709\u5176\u4ed6\u5e93\u53ef\u4ee5\u5e2e\u52a9\u5904\u7406\u9519\u8bef\u6837\u672c\uff0c\u5982NumPy\u7528\u4e8e\u9ad8\u6548\u7684\u6570\u503c\u8ba1\u7b97\uff0cScikit-learn\u63d0\u4f9b\u7684\u6570\u636e\u9884\u5904\u7406\u529f\u80fd\uff0c\u4ee5\u53caOpenCV\u7528\u4e8e\u5904\u7406\u56fe\u50cf\u6570\u636e\u3002\u7ed3\u5408\u4f7f\u7528\u8fd9\u4e9b\u5de5\u5177\uff0c\u53ef\u4ee5\u5b9e\u73b0\u66f4\u590d\u6742\u7684\u9519\u8bef\u68c0\u6d4b\u548c\u6570\u636e\u6e05\u7406\u6d41\u7a0b\uff0c\u786e\u4fdd\u6570\u636e\u7684\u8d28\u91cf\u4e0e\u51c6\u786e\u6027\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"\u5728Python\u4e2d\u53bb\u9664\u9519\u8bef\u6837\u672c\u53ef\u4ee5\u901a\u8fc7\u6570\u636e\u6e05\u6d17\u3001\u5f02\u5e38\u68c0\u6d4b\u3001\u7edf\u8ba1\u65b9\u6cd5\u7b49\u591a\u79cd\u624b\u6bb5\u6765\u5b9e\u73b0\u3002\u4ee5\u6570\u636e\u6e05\u6d17\u4e3a\u4f8b\uff0c\u53ef\u4ee5\u5229\u7528Pa [&hellip;]","protected":false},"author":3,"featured_media":1165402,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"_acf_changed":false,"footnotes":""},"categories":[37],"tags":[],"acf":[],"_links":{"self":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1165398"}],"collection":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/users\/3"}],"replies":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/comments?post=1165398"}],"version-history":[{"count":"1","href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1165398\/revisions"}],"predecessor-version":[{"id":1165404,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/posts\/1165398\/revisions\/1165404"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media\/1165402"}],"wp:attachment":[{"h
ref":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/media?parent=1165398"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/categories?post=1165398"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/docs.pingcode.com\/wp-json\/wp\/v2\/tags?post=1165398"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}