日韩av无码中文字幕,情侣作爱免费网站

這里的原始數據刨除因變量 price 后只有 7 個特征，將在這個基礎上，在想象力范圍內組合出盡可能多的特征

from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.transformation import LogTransformer, PowerTransformer

# Independent variables: every column of df except the target 'price'.
X = df.drop(['price'], axis=1)

# Column names as a plain list, the format the transformers below are given.
variables = X.columns.tolist()

# Equal-frequency binning with q=4 applied to every feature column.
discretizer = EqualFrequencyDiscretiser(q=4, variables=variables)
X_1 = discretizer.fit_transform(X)

# Tag each binned column with a suffix so it stays distinguishable from the
# original column of the same name.
new_column_names = {col: f"{col}_分箱" for col in X_1.columns}
X_1 = X_1.rename(columns=new_column_names)

# Bare expression: displays the frame when run as a notebook cell.
X_1

等頻分箱是一種將連續變量轉化為離散變量的技術，其原理是先將數據排序，再將排序后的數據等頻率地劃分為指定數量的區間（分箱），使每個分箱內包含的數據點數量大致相同

# The logarithm is only defined for strictly positive values, so verify that
# up front and report exactly which columns violate the requirement.
non_positive_columns = [col for col in variables if (X[col] <= 0).any()]
if non_positive_columns:
    raise ValueError(
        "Log transform requires strictly positive values; "
        f"offending columns: {non_positive_columns}"
    )

# Log-transform every feature column.
log_transformer = LogTransformer(variables=variables)
X_2 = log_transformer.fit_transform(X)

# Tag each transformed column with a suffix. The suffix literal had been
# corrupted by stray parenthesised fragments; it is restored to plain text.
new_column_names = {col: f"{col}_對數" for col in X_2.columns}
X_2.rename(columns=new_column_names, inplace=True)

# Bare expression: displays the frame when run as a notebook cell.
X_2

對數變換將特征值拉伸或壓縮，從而減少數據的偏度，平滑數據分布，適合于具有正偏態或長尾分布的數據

# Power transform with exponent 0.5 applied to every feature column.
power_transformer = PowerTransformer(variables=variables, exp=0.5)
X_3 = power_transformer.fit_transform(X)

# Tag each power-transformed column with a suffix so it stays distinguishable
# from the original column of the same name.
new_column_names = {col: f"{col}_冪變換" for col in X_3.columns}
X_3 = X_3.rename(columns=new_column_names)

# Bare expression: displays the frame when run as a notebook cell.
X_3

冪變換通過對數據進行冪函數變換來減小偏度并改善數據的正態性

# Every ordered pair of distinct feature columns.
# NOTE: addition and multiplication are symmetric, so a_plus_b / b_plus_a
# (and a_times_b / b_times_a) hold identical values under two names.
ordered_pairs = [
    (feature, other_feature)
    for feature in variables
    for other_feature in variables
    if feature != other_feature
]

# Collect the derived columns in a dict first, then build the frame once.
new_features = {}
for feature, other_feature in ordered_pairs:
    left = df[feature]
    right = df[other_feature]
    # Sum of the two features.
    new_features[f'{feature}_plus_{other_feature}'] = left + right
    # Difference of the two features.
    new_features[f'{feature}_minus_{other_feature}'] = left - right
    # Product of the two features.
    new_features[f'{feature}_times_{other_feature}'] = left * right
    # Quotient of the two features; for numeric columns pandas does not raise
    # on a zero divisor, so such cells become inf/NaN.
    new_features[f'{feature}_div_by_{other_feature}'] = left / right

# Turn the collected columns into a DataFrame.
X_4 = pd.DataFrame(new_features)

# Bare expression: displays the frame when run as a notebook cell.
X_4

合并特征字典

通過上述特征工程步驟，特征數量從最初的 7 個擴展到 196 個。這種暴力特征工程方法有助于提供更多的信息和特征組合，從而提高模型的預測精度。當然，這里只是作者的暴力特征工程，讀者可拓寬自己對特征工程的想象力，以此得到更多的特征

from sklearn.model_selection import train_test_split
import xgboost as xgb
import matplotlib.pyplot as plt

# First split: hold out 20% of the rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    data, df['price'], test_size=0.2, random_state=42
)

# Second split: carve the validation set out of the remaining 80%.
# 0.125 x 0.8 = 0.1, i.e. 10% of all rows end up in validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.125, random_state=42
)

# XGBoost hyper-parameters.
# NOTE(review): n_estimators is not set here; if the library default number of
# boosting rounds is not larger than the 100-round early-stopping patience,
# early stopping can never trigger — confirm this is intended.
params_xgb = dict(
    learning_rate=0.02,
    booster='gbtree',
    objective='reg:squarederror',
    max_leaves=127,
    verbosity=1,
    seed=42,
    nthread=-1,
    colsample_bytree=0.6,
    subsample=0.7,
    early_stopping_rounds=100,
    eval_metric='rmse',
)

# Build the regressor and fit it, monitoring RMSE on the validation split.
model_xgb = xgb.XGBRegressor(**params_xgb)
model_xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Pair each training column with its importance score, highest first.
feature_importances = model_xgb.feature_importances_
importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances,
}).sort_values(by='Importance', ascending=False)

# Keep the 20 highest-ranked features for plotting.
top_20_features = importance_df.head(20)

# Font settings: SimHei as the sans-serif face (presumably so CJK characters
# in feature names render), and ASCII minus signs on the axes.
plt.rcParams.update({
    'font.sans-serif': 'SimHei',
    'axes.unicode_minus': False,
})

# Horizontal bar chart of the top-20 importances.
plt.figure(figsize=(15, 5), dpi=600)
plt.barh(
    top_20_features['Feature'],
    top_20_features['Importance'],
    color='skyblue',
)
plt.xlabel('Feature Importance')
plt.ylabel('Feature Name')
plt.title('Top 20 Feature Importances')
# Flip the y axis so the most important feature is drawn at the top.
plt.gca().invert_yaxis()
plt.show()