热门角色不仅是灵感来源,更是你的效率助手。通过精挑细选的角色提示词,你可以快速生成高质量内容、提升创作灵感,并找到最契合你需求的解决方案。让创作更轻松,让价值更直接!
我们根据不同用户需求,持续更新角色库,让你总能找到合适的灵感入口。
通过分步骤参数优化方法和评估策略,帮助用户提升模型预测精度,提供可操作的优化流程、结果解读及持续改进建议,确保机器学习模型性能稳健提升。
领域知识提示:
计算资源考虑:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, recall_score, roc_auc_score
from lightgbm import LGBMClassifier
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings("ignore")
# Single seed reused by the split, the CV splitter and the models below.
RANDOM_STATE = 2025

# X, y = ...  # load the data here; y must be encoded as {0, 1}

# Stratified hold-out: 20% of the rows are set aside as the final test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    stratify=y,
    test_size=0.2,
)

# Shuffled, stratified 5-fold splitter shared by the tuning loops.
skf = StratifiedKFold(random_state=RANDOM_STATE, shuffle=True, n_splits=5)
def find_best_threshold(y_true, y_prob, metric='f1'):
    """Scan a fixed threshold grid and return the cut-off with the highest F1.

    The grid is 91 evenly spaced values from 0.05 to 0.95 (step 0.01). Ties
    keep the lowest threshold because only strict improvements replace the
    current best.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_prob : array-like
        Predicted positive-class scores; converted with ``np.asarray`` so
        plain lists work as well as arrays.
    metric : str, default 'f1'
        Selection criterion. Only ``'f1'`` is implemented; any other value is
        rejected instead of being silently ignored.

    Returns
    -------
    tuple
        ``(best_th, best_f1, best_rec)``: the chosen threshold, the F1 at
        that threshold and the recall at that threshold.

    Raises
    ------
    ValueError
        If ``metric`` is not ``'f1'``.
    """
    if metric != 'f1':
        raise ValueError(
            f"find_best_threshold only supports metric='f1', got {metric!r}"
        )

    y_prob = np.asarray(y_prob)
    best_th, best_f1, best_rec = 0.5, -1, 0
    for th in np.linspace(0.05, 0.95, 91):
        y_pred = (y_prob >= th).astype(int)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            # Recall is only needed for the current winner, so compute it lazily.
            best_rec = recall_score(y_true, y_pred, zero_division=0)
            best_th = th
    return best_th, best_f1, best_rec
def _take_rows(data, idx):
    """Select rows by position from a pandas object (``.iloc``) or an array."""
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


def objective(trial):
    """Optuna objective: 5-fold CV score of an LGBMClassifier configuration.

    For every fold the model is trained with early stopping on the validation
    fold, the F1-maximising threshold is searched on that same fold, and F1,
    recall and ROC-AUC are collected. The returned value is
    ``0.5 * mean F1 + 0.3 * mean recall + 0.2 * mean AUC``.

    NOTE(review): the validation fold drives early stopping, threshold
    selection and scoring at once, so the CV numbers are optimistic.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyper-parameters and to store the per-metric
        means and the median threshold as user attributes.

    Returns
    -------
    float
        Weighted CV score (higher is better).
    """
    # Local import: the file header only imports LGBMClassifier from lightgbm.
    from lightgbm import early_stopping

    # Sample hyper-parameters from the given ranges.
    max_depth = trial.suggest_int("max_depth", 4, 12)
    # 2 ** max_depth is below 31 for max_depth == 4, which would give
    # low > high and make Optuna raise; clamp the lower bound accordingly.
    num_leaves_high = min(255, 2 ** max_depth)
    num_leaves_low = min(31, num_leaves_high)
    num_leaves = trial.suggest_int("num_leaves", num_leaves_low, num_leaves_high)
    learning_rate = trial.suggest_float("learning_rate", 0.03, 0.2, log=True)
    n_estimators = trial.suggest_int("n_estimators", 300, 900)
    feature_fraction = trial.suggest_float("feature_fraction", 0.6, 1.0)
    bagging_fraction = trial.suggest_float("bagging_fraction", 0.6, 1.0)
    lambda_l1 = trial.suggest_float("lambda_l1", 0.0, 1.0)
    lambda_l2 = trial.suggest_float("lambda_l2", 0.0, 3.0)
    scale_pos_weight = trial.suggest_int("scale_pos_weight", 3, 7)
    min_data_in_leaf = trial.suggest_int("min_data_in_leaf", 20, 100)
    early_stopping_rounds = trial.suggest_int("early_stopping_rounds", 50, 100)

    # 5-fold cross-validation.
    f1s, recs, aucs, ths = [], [], [], []
    for train_idx, valid_idx in skf.split(X_train, y_train):
        # Positional selection that works for DataFrames/Series and arrays;
        # plain X_train[idx] would select columns on a DataFrame.
        X_tr, X_val = _take_rows(X_train, train_idx), _take_rows(X_train, valid_idx)
        y_tr, y_val = _take_rows(y_train, train_idx), _take_rows(y_train, valid_idx)

        model = LGBMClassifier(
            objective="binary",
            boosting_type="gbdt",
            max_depth=max_depth,
            num_leaves=num_leaves,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            feature_fraction=feature_fraction,
            bagging_fraction=bagging_fraction,
            bagging_freq=1,
            lambda_l1=lambda_l1,
            lambda_l2=lambda_l2,
            scale_pos_weight=scale_pos_weight,
            min_data_in_leaf=min_data_in_leaf,
            # The study runs 16 trials in parallel on a 32-core budget, so each
            # model gets 2 threads; n_jobs=-1 here would oversubscribe the CPU.
            n_jobs=2,
            random_state=RANDOM_STATE,
            verbosity=-1,
        )
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric="auc",
            # fit(early_stopping_rounds=...) was removed in LightGBM 4.0;
            # the callback form works on both 3.x and 4.x.
            callbacks=[early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)],
        )

        y_prob = model.predict_proba(X_val)[:, 1]
        th, f1_cv, rec_cv = find_best_threshold(y_val, y_prob, metric='f1')
        auc_cv = roc_auc_score(y_val, y_prob)
        f1s.append(f1_cv)
        recs.append(rec_cv)
        aucs.append(auc_cv)
        ths.append(th)

    # Weighted objective (adjust the weights to the business need).
    f1_mean = float(np.mean(f1s))
    rec_mean = float(np.mean(recs))
    auc_mean = float(np.mean(aucs))
    score = 0.5 * f1_mean + 0.3 * rec_mean + 0.2 * auc_mean

    # Store the metric means and the median threshold on the trial so the
    # best trial can be reported and reproduced later.
    trial.set_user_attr("f1_mean", f1_mean)
    trial.set_user_attr("rec_mean", rec_mean)
    trial.set_user_attr("auc_mean", auc_mean)
    trial.set_user_attr("th_median", float(np.median(ths)))
    return score
# Run the Bayesian (TPE) optimisation with a seeded sampler.
sampler = TPESampler(seed=RANDOM_STATE)
study = optuna.create_study(sampler=sampler, direction="maximize")

# Parallel trials, sized for a 32-core machine; stops after 100 trials or
# roughly 90 minutes, whichever comes first.
study.optimize(
    objective,
    n_trials=100,
    n_jobs=16,
    timeout=90*60,
)

# Report the winning trial together with the CV metrics stored on it.
best_trial = study.best_trial
print("Best score:", best_trial.value)
print("Best params:", best_trial.params)
print(
    "CV F1:", best_trial.user_attrs["f1_mean"],
    "Recall:", best_trial.user_attrs["rec_mean"],
    "AUC:", best_trial.user_attrs["auc_mean"],
    "Threshold(median):", best_trial.user_attrs["th_median"],
)
建议:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
def f1_by_best_th_scorer(estimator, X_val, y_val):
    """Scorer callable ``(estimator, X, y)`` returning the best achievable F1.

    The positive-class probabilities of ``estimator`` on ``X_val`` are passed
    to ``find_best_threshold`` and the F1 at the winning threshold is
    returned. The threshold is chosen on the very fold being scored, so the
    value is an upper bound for that fold rather than an unbiased estimate.
    """
    positive_scores = estimator.predict_proba(X_val)[:, 1]
    # find_best_threshold already returns the F1 at the selected threshold,
    # which is exactly the score this function reports.
    _, fold_f1, _ = find_best_threshold(y_val, positive_scores)
    return fold_f1
def _unique_sorted(candidates):
    """Return the distinct candidate values in ascending order.

    Clamping a neighbourhood to the search bounds can yield the same value
    twice; duplicates would make GridSearchCV fit identical models repeatedly.
    """
    return sorted(set(candidates))


# Local grid around the best Optuna trial (at most 3 values per parameter,
# i.e. up to 3**6 = 729 combinations x 3 folds).
best_params = best_trial.params
param_grid = {
    "num_leaves": _unique_sorted([
        max(31, best_params["num_leaves"]-20),
        best_params["num_leaves"],
        min(255, best_params["num_leaves"]+20),
    ]),
    "max_depth": _unique_sorted([
        max(4, best_params["max_depth"]-1),
        best_params["max_depth"],
        min(12, best_params["max_depth"]+1),
    ]),
    "min_data_in_leaf": _unique_sorted([
        max(20, best_params["min_data_in_leaf"]-10),
        best_params["min_data_in_leaf"],
        min(100, best_params["min_data_in_leaf"]+10),
    ]),
    "learning_rate": _unique_sorted([
        max(0.03, best_params["learning_rate"]*0.8),
        best_params["learning_rate"],
        min(0.2, best_params["learning_rate"]*1.2),
    ]),
    "feature_fraction": _unique_sorted([
        max(0.6, best_params["feature_fraction"]-0.1),
        best_params["feature_fraction"],
        min(1.0, best_params["feature_fraction"]+0.1),
    ]),
    "bagging_fraction": _unique_sorted([
        max(0.6, best_params["bagging_fraction"]-0.1),
        best_params["bagging_fraction"],
        min(1.0, best_params["bagging_fraction"]+0.1),
    ]),
}

# Parameters that are not part of the grid stay at their Optuna optimum.
# early_stopping_rounds is deliberately NOT set here: GridSearchCV calls
# fit(X, y) without an eval_set, and LightGBM refuses to early-stop without
# a validation set.
base_model = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=best_params["n_estimators"],
    lambda_l1=best_params["lambda_l1"],
    lambda_l2=best_params["lambda_l2"],
    scale_pos_weight=best_params["scale_pos_weight"],
    bagging_freq=1,
    # 8 parallel grid workers x 4 threads matches the 32-core budget used for
    # the Optuna stage; n_jobs=-1 per model would oversubscribe the CPU.
    n_jobs=4,
    random_state=RANDOM_STATE,
    verbosity=-1,
)

grid = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    # f1_by_best_th_scorer already has the (estimator, X, y) scorer signature.
    # Wrapping it in make_scorer would call it as (y_true, y_pred) and fail.
    scoring=f1_by_best_th_scorer,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE),
    n_jobs=8,
    verbose=1,
)
grid.fit(X_train, y_train)
print("Grid best params:", grid.best_params_)
print("Grid best F1:", grid.best_score_)
# Local import: the file header only imports LGBMClassifier from lightgbm.
from lightgbm import early_stopping


def _fold_rows(data, idx):
    """Select rows by position from a pandas object (``.iloc``) or an array."""
    return data.iloc[idx] if hasattr(data, "iloc") else data[idx]


# Grid-search winners override the Optuna values where both define a key.
final_params = {**best_params, **grid.best_params_}
final_model = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=final_params["n_estimators"],
    learning_rate=final_params["learning_rate"],
    max_depth=final_params["max_depth"],
    num_leaves=final_params["num_leaves"],
    feature_fraction=final_params["feature_fraction"],
    bagging_fraction=final_params["bagging_fraction"],
    bagging_freq=1,
    lambda_l1=final_params["lambda_l1"],
    lambda_l2=final_params["lambda_l2"],
    scale_pos_weight=final_params["scale_pos_weight"],
    min_data_in_leaf=final_params["min_data_in_leaf"],
    n_jobs=-1,
    random_state=RANDOM_STATE,
    verbosity=-1,
)

# One CV pass over the training set to pick the final decision threshold
# (median of the per-fold F1-optimal thresholds).
ths = []
for tr_idx, val_idx in skf.split(X_train, y_train):
    X_fold_tr, y_fold_tr = _fold_rows(X_train, tr_idx), _fold_rows(y_train, tr_idx)
    X_fold_val, y_fold_val = _fold_rows(X_train, val_idx), _fold_rows(y_train, val_idx)
    final_model.fit(
        X_fold_tr, y_fold_tr,
        eval_set=[(X_fold_val, y_fold_val)],
        eval_metric="auc",
        # fit(early_stopping_rounds=...) was removed in LightGBM 4.0; a fresh
        # callback per fold keeps each fold's stopping state independent.
        callbacks=[early_stopping(stopping_rounds=final_params["early_stopping_rounds"], verbose=False)],
    )
    y_prob_val = final_model.predict_proba(X_fold_val)[:, 1]
    th, _, _ = find_best_threshold(y_fold_val, y_prob_val)
    ths.append(th)
final_th = float(np.median(ths))

# Refit on the whole training set without early stopping (no eval_set and no
# callback are passed), then evaluate once on the held-out test set.
final_model.set_params(n_estimators=final_params["n_estimators"])
final_model.fit(X_train, y_train)
y_prob_test = final_model.predict_proba(X_test)[:, 1]
y_pred_test = (y_prob_test >= final_th).astype(int)
print("Test F1:", f1_score(y_test, y_pred_test, zero_division=0))
print("Test Recall:", recall_score(y_test, y_pred_test, zero_division=0))
print("Test AUC:", roc_auc_score(y_test, y_prob_test))
print("Final threshold:", final_th)
优先级建议:threshold ≈ class_weight ≈ C > penalty ≈ k >> max_iter(前五者对P/R/F1影响最大)。
资源预算建议(16核、32G、5折、1小时):
# 安装:pip install scikit-learn scikit-optimize
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
from scipy.stats import loguniform, randint, uniform
# Reproducibility: seed reused by the estimator, the CV splitter and the search objects below.
RANDOM_STATE = 42
class ThresholdWrapper(ClassifierMixin, BaseEstimator):
    """Binary classifier wrapper that exposes the decision threshold as a hyper-parameter.

    ``fit`` trains a clone of ``base_estimator``; ``predict`` labels a sample
    as the second class whenever its positive-class probability is at least
    ``threshold``.

    ``get_params`` / ``set_params`` are inherited from ``BaseEstimator``. It
    already exposes nested ``base_estimator__<name>`` parameters and, unlike a
    hand-written override, raises on unknown parameter names instead of
    silently ignoring them. The mixin is listed before ``BaseEstimator``, as
    scikit-learn recommends for a correct MRO.

    Parameters
    ----------
    base_estimator : estimator
        Unfitted probabilistic binary classifier providing ``predict_proba``.
    threshold : float, default 0.5
        Probability cut-off applied in ``predict``.

    Attributes
    ----------
    base_estimator_ : estimator
        Fitted clone of ``base_estimator``.
    classes_ : ndarray
        Class labels taken from the fitted estimator (``[0, 1]`` if it does
        not expose ``classes_``).
    """

    def __init__(self, base_estimator=None, threshold=0.5):
        self.base_estimator = base_estimator
        self.threshold = threshold

    def fit(self, X, y):
        """Fit a fresh clone of ``base_estimator`` on ``(X, y)`` and return ``self``."""
        self.base_estimator_ = clone(self.base_estimator)
        self.base_estimator_.fit(X, y)
        self.classes_ = getattr(self.base_estimator_, "classes_", np.array([0, 1]))
        return self

    def predict(self, X):
        """Return class labels using ``threshold`` on the positive-class probability."""
        proba = self.base_estimator_.predict_proba(X)[:, 1]
        # Index into classes_ so the output uses the original label values;
        # for {0, 1} labels this equals the raw 0/1 indicator.
        return self.classes_[(proba >= self.threshold).astype(int)]

    def predict_proba(self, X):
        """Return the fitted estimator's class probabilities unchanged."""
        return self.base_estimator_.predict_proba(X)
# Pipeline: standardise -> univariate feature selection -> logistic
# regression wrapped so that its decision threshold can be tuned.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('select', SelectKBest(score_func=f_classif)),
    ('clf', ThresholdWrapper(
        threshold=0.5,
        base_estimator=LogisticRegression(
            C=1.0,
            penalty='l2',              # starting value, overridden by the search
            class_weight='balanced',   # starting value, overridden by the search
            solver='saga',
            max_iter=500,
            random_state=RANDOM_STATE,  # for reproducibility
        ),
    )),
])

# Stratified 5-fold cross-validation.
cv = StratifiedKFold(random_state=RANDOM_STATE, shuffle=True, n_splits=5)

# Multi-metric scoring; the refit below uses F1.
scoring = dict(
    F1=make_scorer(f1_score),
    Precision=make_scorer(precision_score),
    Recall=make_scorer(recall_score),
)

# Coarse random-search space.
param_dist = {
    'select__k': randint(low=50, high=201),
    'clf__threshold': uniform(loc=0.3, scale=0.4),  # i.e. [0.3, 0.7]
    'clf__base_estimator__penalty': ['l1', 'l2'],
    'clf__base_estimator__C': loguniform(a=0.1, b=10.0),
    'clf__base_estimator__class_weight': [
        'balanced',
        *({0: 1, 1: positive_weight} for positive_weight in (1, 2, 4, 6, 8, 10)),
    ],
    'clf__base_estimator__max_iter': randint(low=200, high=1001),
}

rs = RandomizedSearchCV(
    pipe,
    param_dist,
    n_iter=60,  # tune to the time budget: 50-70
    scoring=scoring,
    refit='F1',
    cv=cv,
    n_jobs=16,
    random_state=RANDOM_STATE,
    return_train_score=False,
    verbose=1,
)
# Split off and keep the test set X_test, y_test before running this.
rs.fit(X_train, y_train)

print("Random Search Best F1 (CV):", rs.best_score_)
print("Best Params (Random):", rs.best_params_)
# Optional: Bayesian refinement around the random-search optimum.
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV

# class_weight is NOT part of the Bayesian space: skopt's Categorical cannot
# encode dict-valued categories (they are unhashable and not orderable against
# 'balanced'), so the value picked by the random search is fixed on a clone of
# the pipeline instead.
_CLASS_WEIGHT_KEY = 'clf__base_estimator__class_weight'
bo_pipe = clone(pipe).set_params(**{_CLASS_WEIGHT_KEY: rs.best_params_[_CLASS_WEIGHT_KEY]})

search_spaces = {
    # k and threshold are narrowed to a window around the random-search
    # winner, clamped to the original bounds.
    'select__k': Integer(max(50, rs.best_params_['select__k'] - 50),
                         min(200, rs.best_params_['select__k'] + 50)),
    'clf__threshold': Real(max(0.3, rs.best_params_['clf__threshold'] - 0.1),
                           min(0.7, rs.best_params_['clf__threshold'] + 0.1)),
    'clf__base_estimator__penalty': Categorical(['l1', 'l2']),
    'clf__base_estimator__C': Real(0.1, 10.0, prior='log-uniform'),
    'clf__base_estimator__max_iter': Integer(200, 1000),
}

bo = BayesSearchCV(
    estimator=bo_pipe,
    search_spaces=search_spaces,
    n_iter=25,  # time budget: 20-30
    cv=cv,
    scoring='f1',  # single metric here; the other metrics are evaluated afterwards
    n_jobs=16,
    verbose=1,
    random_state=RANDOM_STATE,
)
bo.fit(X_train, y_train)
best_model = bo.best_estimator_
print("Bayes Opt Best F1 (CV):", bo.best_score_)
print("Best Params (Bayes):", bo.best_params_)
# Final evaluation on the held-out test set (run once only).
from sklearn.metrics import classification_report
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

# Optional: inspect how stable the chosen model is across the CV folds.
from sklearn.model_selection import cross_validate
cv_res = cross_validate(
    best_model,
    X_train,
    y_train,
    scoring={'f1': 'f1', 'precision': 'precision', 'recall': 'recall'},
    cv=cv,
    n_jobs=16,
    return_estimator=False,
)
f1_folds = cv_res['test_f1']
precision_folds = cv_res['test_precision']
recall_folds = cv_res['test_recall']
print("CV F1 mean±std:", np.mean(f1_folds), np.std(f1_folds))
print("CV Precision mean±std:", np.mean(precision_folds), np.std(precision_folds))
print("CV Recall mean±std:", np.mean(recall_folds), np.std(recall_folds))
以上流程在您的资源与时间约束下,优先优化C、class_weight、threshold与k,结合随机搜索与贝叶斯优化,能在1小时内达到稳定、可复现的精确率/召回率/F1的综合提升。
参数优化是将模型潜在能力转化为稳健预测质量的关键过程。通过系统化地调校高影响参数、选择合适的搜索策略与严谨的评估方法,可以显著降低均方误差(MSE),同时控制模型复杂度与过拟合风险。在时间序列回归场景中,合理处理时序交叉验证与特征工程(如滞后特征、季节性)尤为重要。
建议先优化对MSE影响最大的组合:{num_leaves, max_depth, min_data_in_leaf, learning_rate, n_estimators},随后微调{feature_fraction, lambda_l2},最后联合搜索滞后窗口(lag_windows),以避免在未稳固模型结构前过度扩展特征空间。
推荐策略(两阶段混合):
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import optuna
# 假设df包含列:['date', 'y'] + 80个原始特征,date为月度时间戳或月序整数
# 若为多实体面板数据,需在构造滞后时分组(例如'entity_id')
def build_lag_features(df, lags=6, seasonal_period=12, group_col=None, date_col='date', target_col='y'):
    """Return a date-sorted copy of ``df`` with lag/calendar features added.

    The frame is sorted by ``date_col`` and handed to ``add_lags`` either as
    a whole or, when ``group_col`` is given, separately per group so lags
    never cross entities. Every row that contains a missing value in any
    column is then dropped, which removes the leading rows whose lags are
    undefined.
    """
    ordered = df.sort_values(date_col).copy()

    if not group_col:
        enriched = add_lags(ordered, lags, seasonal_period, date_col, target_col)
    else:
        def _lag_one_group(g):
            return add_lags(g, lags, seasonal_period, date_col, target_col)

        enriched = ordered.groupby(group_col, group_keys=False).apply(_lag_one_group)

    # Drops rows with NA in any column, not just in the lag columns.
    return enriched.dropna()
def add_lags(g, lags, seasonal_period, date_col, target_col):
    """Return a copy of ``g`` with target lags and month features appended.

    Works on a copy so the caller's frame (or a ``groupby`` slice) is never
    modified in place. Rows are assumed to be already ordered in time; lags
    are plain row shifts.

    Parameters
    ----------
    g : pandas.DataFrame
        Time-ordered frame containing ``date_col`` and ``target_col``.
    lags : int
        Adds ``<target>_lag_1`` ... ``<target>_lag_<lags>``.
    seasonal_period : int or None
        When truthy, adds ``<target>_lag_<seasonal_period>`` (same column as
        the regular lag if ``lags >= seasonal_period``).
    date_col : str
        Column parsed with ``pd.to_datetime`` to derive the month.
    target_col : str
        Column whose past values become lag features.

    Returns
    -------
    pandas.DataFrame
        Copy of ``g`` with the lag columns plus ``month``, ``month_sin`` and
        ``month_cos``.
    """
    g = g.copy()
    target = g[target_col]

    for lag in range(1, lags + 1):
        g[f'{target_col}_lag_{lag}'] = target.shift(lag)

    # Seasonal lag (e.g. the same month one year earlier).
    if seasonal_period:
        g[f'{target_col}_lag_{seasonal_period}'] = target.shift(seasonal_period)

    # Month number plus a cyclical sin/cos encoding so December and January
    # end up close to each other.
    # NOTE(review): pd.to_datetime treats plain integers as nanosecond
    # timestamps, so an integer month counter needs converting first.
    month = pd.to_datetime(g[date_col]).dt.month
    g['month'] = month
    g['month_sin'] = np.sin(2 * np.pi * month / 12)
    g['month_cos'] = np.cos(2 * np.pi * month / 12)
    return g
def time_series_cv_splits(df, date_col='date', n_splits=5):
    """Build expanding-window CV folds with one validation period per fold.

    Validation anchors are every second distinct value of ``date_col`` taken
    from the last ``2 * n_splits`` periods. For each anchor the training part
    is every row strictly before it and the validation part is every row at
    it. Anchors without training or validation rows are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_col``.
    date_col : str, default 'date'
        Column holding the period of each row.
    n_splits : int, default 5
        Maximum number of folds.

    Returns
    -------
    list of (ndarray, ndarray)
        ``(train_idx, val_idx)`` pairs of **positional** row indices, so they
        can index NumPy arrays derived from ``df`` whatever its index labels
        are (e.g. after ``dropna``).
    """
    dates = df[date_col]
    months = np.sort(dates.unique())

    # An open-ended slice is required here: a stop of ``-0`` equals ``0`` and
    # would always yield an empty selection.
    val_points = months[-2 * n_splits::2]

    splits = []
    for m in val_points[:n_splits]:
        train_idx = np.flatnonzero((dates < m).to_numpy())
        val_idx = np.flatnonzero((dates == m).to_numpy())
        if len(val_idx) == 0 or len(train_idx) == 0:
            continue
        splits.append((train_idx, val_idx))
    return splits
def objective(trial):
    """Optuna objective: mean time-series CV MSE of a LightGBM regressor.

    Samples the model hyper-parameters and the number of target lags,
    rebuilds the lag features from the module-level ``df``, trains one
    booster per fold with early stopping and returns the mean validation
    MSE. After every fold the running mean is reported to the trial so a
    pruner can stop unpromising configurations.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used for sampling, intermediate reporting and pruning.

    Returns
    -------
    float
        Mean validation MSE over the folds (lower is better).

    Raises
    ------
    ValueError
        If no CV fold could be built from the lagged data.
    optuna.TrialPruned
        If the pruner stops the trial after a fold.
    """
    # Sample hyper-parameters.
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1, log=True)
    max_depth = trial.suggest_categorical("max_depth", [-1, 6, 8, 10, 12])
    num_leaves_max = 127 if max_depth == -1 else min(255, 2 ** max_depth)
    num_leaves = trial.suggest_int("num_leaves", 31, num_leaves_max)
    min_data_in_leaf = trial.suggest_int("min_data_in_leaf", 30, 100)
    n_estimators = trial.suggest_int("n_estimators", 600 if learning_rate <= 0.05 else 400, 1200)
    feature_fraction = trial.suggest_float("feature_fraction", 0.7, 0.95)
    lambda_l2 = trial.suggest_float("lambda_l2", 0.5, 5.0)
    lag_windows = trial.suggest_int("lag_windows", 3, 12)
    early_stopping_rounds = 80  # fixed; or trial.suggest_int("early_stopping_rounds", 50, 100)

    # Build the lag features (cache this in a real setup to save time).
    # Resetting the index makes row labels equal row positions, so the fold
    # indices line up with the NumPy arrays below even though dropna removed
    # the leading rows.
    df_lag = build_lag_features(
        df, lags=lag_windows, seasonal_period=12, group_col=None,
        date_col='date', target_col='y',
    ).reset_index(drop=True)
    target = df_lag['y'].values
    features = df_lag.drop(columns=['y']).select_dtypes(include=[np.number]).values

    splits = time_series_cv_splits(df_lag, date_col='date', n_splits=5)
    if not splits:
        # Fail loudly instead of returning the NaN mean of an empty list.
        raise ValueError("time_series_cv_splits returned no folds; check the date range of df")

    # Identical for every fold, so built once.
    params = {
        'objective': 'regression',
        'metric': 'l2',  # MSE
        'learning_rate': learning_rate,
        'num_leaves': num_leaves,
        'max_depth': max_depth,
        'min_data_in_leaf': min_data_in_leaf,
        'feature_fraction': feature_fraction,
        'lambda_l2': lambda_l2,
        'verbosity': -1,
        'num_threads': 4,  # sized to work alongside the parallel trials
        'force_col_wise': True,  # faster with many features
    }

    cv_mse = []
    for tr_idx, va_idx in splits:
        X_tr, y_tr = features[tr_idx], target[tr_idx]
        X_va, y_va = features[va_idx], target[va_idx]
        dtrain = lgb.Dataset(X_tr, label=y_tr)
        dvalid = lgb.Dataset(X_va, label=y_va, reference=dtrain)

        # lgb.train(early_stopping_rounds=...) was removed in LightGBM 4.0;
        # the callback form works on 3.x and 4.x. No per-iteration pruning
        # callback is attached: its reports would use boosting iterations as
        # steps and collide with the fold-level steps reported below (Optuna
        # keeps only the first value reported for a given step).
        model = lgb.train(
            params,
            dtrain,
            num_boost_round=n_estimators,
            valid_sets=[dvalid],
            valid_names=['valid'],
            callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)],
        )
        y_pred = model.predict(X_va, num_iteration=model.best_iteration)
        cv_mse.append(mean_squared_error(y_va, y_pred))

        # Report the running mean (step = number of finished folds) for pruning.
        trial.report(float(np.mean(cv_mse)), len(cv_mse))
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(cv_mse))
# ==== Main flow (example) ====
# Load df first; `date` must be a monthly index or timestamp, and df must not
# contain any test-set rows.
# df_test holds the fully held-out future months.
study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    sampler=optuna.samplers.TPESampler(seed=42),
    direction='minimize',
)
study.optimize(
    objective,
    n_trials=70,
    n_jobs=6,
    show_progress_bar=True,
)

best_params = study.best_params
print("Best params:", best_params, "CV-MSE:", study.best_value)

# Final evaluation: fit with best_params on the training data (all months used
# during CV), then compute the MSE on the held-out test set.
# Rebuild the lag features for that step and make sure the test-set lags come
# from past data only, so no future information leaks in.
进化算法阶段可使用DEAP等库,基于study.best_params初始化种群,并限制参数微调范围与评估预算。由于代码较长,此处给出伪代码框架:
# Pseudocode (helper functions and `small` / `local_bounds` are placeholders).
population = init_population_from_best(study.best_params, size=10, noise_scale=small)
for gen in range(2):  # 2-4 generations
    # Reuses the CV evaluation of objective() as the fitness function.
    fitness = [cv_mse(individual) for individual in population]
    parents = selection_tournament(population, fitness)
    offspring = mutate_and_crossover(parents, bounds=local_bounds)
    population = elitism_merge(parents, offspring, fitness, top_k=3)

# The loop replaces `population` after `fitness` was computed, so the scores
# must be refreshed before picking the winner; otherwise they belong to the
# previous generation's individuals.
fitness = [cv_mse(individual) for individual in population]
best_individual = select_best(population, fitness)
以上方案在2.5小时预算与24核CPU/64G内存环境下可稳健执行。建议先运行阶段A(贝叶斯优化),若时间允许,再进行阶段B(进化算法)的小步精修,并最终在留出测试集上给出一次性MSE评估与报告。
帮助用户通过调整关键参数和优化策略,全面提升机器学习预测模型的性能和准确性,从而在实际应用中实现更优质的数据分析与决策支持。
需要快速调整模型参数以提升任务精度的工程师,通过提示词获取高效实用的优化方案。
希望精确解读模型预测机制并优化参数设置,以实现数据驱动商业决策的科学家。
负责AI模型落地的产品经理,利用提示词降低技术壁垒,为开发团队输出具体优化指南。
将模板生成的提示词复制粘贴到您常用的 Chat 应用(如 ChatGPT、Claude 等),即可直接对话使用,无需额外开发。适合个人快速体验和轻量使用场景。
把提示词模板转化为 API,您的程序可任意修改模板参数,通过接口直接调用,轻松实现自动化与批量处理。适合开发者集成与业务系统嵌入。
在 MCP client 中配置对应的 server 地址,让您的 AI 应用自动调用提示词模板。适合高级用户和团队协作,让提示词在不同 AI 工具间无缝衔接。
半价获取高级提示词-优惠即将到期