This prompt is designed for data cleaning and preprocessing, helping business data operators handle missing values in their datasets efficiently. By systematically analyzing data characteristics, missingness patterns, and business context, it recommends the most suitable imputation strategy and safeguards data completeness and quality. Combining statistical methods with business logic, it guides the full workflow from missing-pattern diagnosis to strategy selection, supports numeric, categorical, and time-series data, and outputs a concrete, actionable imputation plan with implementation advice, improving the accuracy and reliability of downstream analysis and modeling.
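To make the missing-pattern diagnosis step concrete, here is a minimal sketch (the DataFrame name df and column layout are placeholders, not part of the prompt itself) that summarizes per-column missing rates and checks whether missingness in different columns co-occurs:

import pandas as pd

def diagnose_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Per-column missing counts/rates plus a co-missingness matrix."""
    summary = pd.DataFrame({
        'n_missing': df.isna().sum(),
        'missing_rate': df.isna().mean().round(3),
        'dtype': df.dtypes.astype(str),
    })
    # Correlation between missing indicators hints at MAR structure:
    # values near 0 suggest independent (MCAR-like) missingness.
    co_missing = df.isna().astype(int).corr()
    print(co_missing.round(2))
    return summary.sort_values('missing_rate', ascending=False)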
General Principles
signup_date (date)
gender (categorical)
city (categorical)
premium_user (boolean)
income_bracket (ordinal)
age (numeric)
monthly_spend (currency)
satisfaction_score (1-5 rating)
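These type declarations can be enforced at load time. A minimal sketch (the file name users.csv and the specific converter choices are assumptions, not part of the original spec):

import pandas as pd

# One way to enforce the declared field types when loading the CSV.
dtype_spec = {
    'user_id': 'string', 'gender': 'category', 'city': 'category',
    'income_bracket': 'category',                  # ordinal L1-L5
    'age': 'float64', 'monthly_spend': 'float64',  # float so NaN is representable
    'satisfaction_score': 'float64',               # 1-5 rating, NaN when unanswered
}
df = pd.read_csv('users.csv', dtype=dtype_spec, parse_dates=['signup_date'])
df['premium_user'] = df['premium_user'].map({'TRUE': True, 'FALSE': False})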
Expected Outcomes
Data Preprocessing Requirements
Imputation Steps
Validation Methods and Metrics
Common Issues and Handling
Notes:
import numpy as np
import pandas as pd
from io import StringIO
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
# ---------- 1) Load the data ----------
raw = """user_id,signup_date,gender,age,income_bracket,monthly_spend,premium_user,city,satisfaction_score
U1001,2023-11-05,F,28,L2,89.3,TRUE,Shenzhen,5
U1002,,M,35,L3,120.5,,Guangzhou,4
U1003,2024-01-12,,27,,75.0,FALSE,Shenzhen,
U1004,2023-12-30,F,,L1,,FALSE,,3
U1005,2024-02-10,Other,22,L2,45.0,TRUE,Shenzhen,5
U1006,2024-02-15,M,31,L4,,TRUE,Hangzhou,4
U1007,2023-11-20,,29,L3,0.0,FALSE,Guangzhou,
U1008,2024-03-01,F,26,,68.0,,Shenzhen,5
"""
df = pd.read_csv(StringIO(raw), dtype=str)
# ---------- 2) Basic cleaning and type conversion ----------
def to_bool(x):
    if pd.isna(x):
        return np.nan
    x = str(x).strip().lower()
    if x in ['true', '1', 'yes', 'y', 't']:
        return 1
    if x in ['false', '0', 'no', 'n', 'f']:
        return 0
    return np.nan

def parse_date(s):
    if pd.isna(s) or s == '':
        return pd.NaT
    return pd.to_datetime(s)
df['signup_date'] = df['signup_date'].apply(parse_date)
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['monthly_spend'] = pd.to_numeric(df['monthly_spend'], errors='coerce')
df['premium_user'] = df['premium_user'].apply(to_bool).astype('float') # float to allow NaN
# Map income brackets to ordinal levels
lvl_map = {'L1':1,'L2':2,'L3':3,'L4':4,'L5':5}
df['income_level'] = df['income_bracket'].map(lvl_map).astype('float')
# Missing-value indicator columns
for col in ['signup_date','gender','age','income_level','monthly_spend','premium_user','city','satisfaction_score']:
    df[f'is_{col}_missing'] = df[col].isna().astype(int)
# Signup date as ordinal days since the earliest observed date
min_date = df['signup_date'].min()
if pd.isna(min_date):
    min_date = pd.Timestamp('2023-01-01')
df['signup_ordinal'] = (df['signup_date'] - min_date).dt.days
# ---------- 3) Feature construction (shared X for KNN) ----------
def make_features(df_local):
    # Temporarily fill categoricals with 'UNK' and numerics with the median;
    # these are prediction features only, never the final imputed values.
    tmp = df_local.copy()
    cat_cols = ['gender', 'city']
    for c in cat_cols:
        tmp[c] = tmp[c].fillna('UNK')
    # Temporary numeric fill
    num_cols = ['age', 'income_level', 'monthly_spend', 'premium_user', 'signup_ordinal']
    for c in num_cols:
        med = tmp[c].median(skipna=True)
        tmp[c] = tmp[c].fillna(med)
    X = pd.get_dummies(tmp[cat_cols], dummy_na=False)
    X = pd.concat([X, tmp[num_cols],
                   tmp[['is_signup_date_missing', 'is_gender_missing', 'is_age_missing',
                        'is_income_level_missing', 'is_monthly_spend_missing',
                        'is_premium_user_missing', 'is_city_missing', 'is_satisfaction_score_missing']]], axis=1)
    return X
# Scaler (numeric columns only; one-hot columns are left unscaled)
def scale_fit_transform(X):
    # get_dummies emits bool (pandas >= 2.0) or uint8 columns; treat everything else as numeric
    num_mask = ~X.dtypes.isin([np.dtype(bool), np.dtype('uint8')])
    X = X.astype(float)  # cast up front so scaled values write back cleanly
    scaler = StandardScaler()
    X.loc[:, num_mask] = scaler.fit_transform(X.loc[:, num_mask])
    return X, scaler, num_mask

def scale_transform(X, scaler, num_mask):
    X = X.astype(float)
    X.loc[:, num_mask] = scaler.transform(X.loc[:, num_mask])
    return X
# ---------- 4) Marginal-distribution-preserving assigner ----------
def marginal_preserving_assign(prob_mat, classes, target_counts, random_state=42):
    # prob_mat: n_missing x n_classes matrix of predicted class probabilities.
    # random_state is reserved for a stochastic variant; the greedy assignment
    # below is deterministic.
    n_missing, n_classes = prob_mat.shape
    # Greedy: repeatedly take the highest-probability (sample, class) pair
    # whose class still has remaining quota
    remaining = target_counts.copy()
    assigned = -np.ones(n_missing, dtype=int)
    # Enumerate and sort all pairs by probability
    pairs = []
    for i in range(n_missing):
        for j in range(n_classes):
            pairs.append((i, j, prob_mat[i, j]))
    pairs.sort(key=lambda x: x[2], reverse=True)
    used_rows = set()
    for i, j, p in pairs:
        if assigned[i] == -1 and remaining[j] > 0:
            assigned[i] = j
            remaining[j] -= 1
            used_rows.add(i)
            if len(used_rows) == n_missing:
                break
    # If quotas ran out before every row was assigned, fall back to the argmax class
    for i in range(n_missing):
        if assigned[i] == -1:
            assigned[i] = int(np.argmax(prob_mat[i]))
    return np.array([classes[k] for k in assigned])
def build_targets_for_missing(obs_series, missing_cnt, classes):
    # Goal: after filling, the overall distribution ≈ the observed distribution
    obs = obs_series.dropna()
    if len(obs) == 0:
        # No observations at all: spread evenly
        base = np.array([missing_cnt // len(classes)] * len(classes))
        base[:missing_cnt - base.sum()] += 1
        return base
    counts = obs.value_counts().reindex(classes).fillna(0).values.astype(int)
    p = counts / counts.sum()
    target = np.round(p * missing_cnt).astype(int)
    # Correct any rounding drift
    diff = missing_cnt - target.sum()
    if diff != 0:
        order = np.argsort(-(p - target / missing_cnt))  # favor under-allocated classes
        for k in order[:abs(diff)]:
            target[k] += int(np.sign(diff))
    return target
# ---------- 5) Per-field imputation ----------
X_base = make_features(df)
X_scaled, scaler, num_mask = scale_fit_transform(X_base)

def make_aligned(df_local):
    # Rebuild features for a row subset and align columns with the training
    # design matrix, so get_dummies cannot silently drop or reorder levels
    return make_features(df_local).reindex(columns=X_base.columns, fill_value=0)
# 5.1 signup_date -> KNN regression on the date ordinal
mask_sd = df['signup_ordinal'].notna()
if mask_sd.sum() >= 2 and (~mask_sd).sum() > 0:
    knn_sd = KNeighborsRegressor(n_neighbors=3, weights='distance')
    knn_sd.fit(X_scaled[mask_sd], df.loc[mask_sd, 'signup_ordinal'])
    X_miss = scale_transform(make_aligned(df[~mask_sd]), scaler, num_mask)
    pred = knn_sd.predict(X_miss)
    pred = np.round(np.clip(pred, df['signup_ordinal'].min(), df['signup_ordinal'].max())).astype(int)
    df.loc[~mask_sd, 'signup_ordinal'] = pred
df['signup_date'] = df['signup_ordinal'].apply(lambda d: (min_date + pd.Timedelta(days=int(d))) if pd.notna(d) else pd.NaT)
# 5.2 gender -> KNN classification + marginal-distribution preservation
classes_gender = ['F', 'M', 'Other']
mask_g = df['gender'].isna()
if mask_g.any():
    mask_g_obs = ~mask_g
    knn_g = KNeighborsClassifier(n_neighbors=3, weights='distance')
    knn_g.fit(X_scaled[mask_g_obs], df.loc[mask_g_obs, 'gender'])
    X_miss = scale_transform(make_aligned(df[mask_g]), scaler, num_mask)
    prob = knn_g.predict_proba(X_miss)
    # Re-order columns into the fixed class order
    proba_mat = np.zeros((prob.shape[0], len(classes_gender)))
    for j, cls in enumerate(knn_g.classes_):
        proba_mat[:, classes_gender.index(cls)] = prob[:, j]
    # Target counts come from the *observed* distribution
    tgt_counts = build_targets_for_missing(df.loc[mask_g_obs, 'gender'], mask_g.sum(), classes_gender)
    fill_vals = marginal_preserving_assign(proba_mat, classes_gender, tgt_counts, random_state=42)
    df.loc[mask_g, 'gender'] = fill_vals
# 5.3 city -> KNN classification + marginal-distribution preservation
known_cities = ['Shenzhen', 'Guangzhou', 'Hangzhou']
classes_city = sorted({c for c in df['city'].dropna().tolist() + known_cities
                       if isinstance(c, str) and c != ''})
mask_c = df['city'].isna()
if mask_c.any():
    mask_c_obs = ~mask_c
    knn_c = KNeighborsClassifier(n_neighbors=3, weights='distance')
    knn_c.fit(X_scaled[mask_c_obs], df.loc[mask_c_obs, 'city'])
    X_miss = scale_transform(make_aligned(df[mask_c]), scaler, num_mask)
    prob = knn_c.predict_proba(X_miss)
    # Align class order
    proba_mat = np.zeros((prob.shape[0], len(classes_city)))
    for j, cls in enumerate(knn_c.classes_):
        proba_mat[:, classes_city.index(cls)] = prob[:, j]
    tgt_counts = build_targets_for_missing(df.loc[mask_c_obs, 'city'], mask_c.sum(), classes_city)
    fill_vals = marginal_preserving_assign(proba_mat, classes_city, tgt_counts, random_state=42)
    df.loc[mask_c, 'city'] = fill_vals
# 5.4 premium_user -> KNN classification (with optional marginal preservation)
mask_p = df['premium_user'].isna()
if mask_p.any():
    mask_p_obs = ~mask_p
    knn_p = KNeighborsClassifier(n_neighbors=3, weights='distance')
    knn_p.fit(X_scaled[mask_p_obs], df.loc[mask_p_obs, 'premium_user'].astype(int))
    X_miss = scale_transform(make_aligned(df[mask_p]), scaler, num_mask)
    prob = knn_p.predict_proba(X_miss)
    # Align to the fixed class order [0, 1]
    classes_p = [0, 1]
    proba_mat = np.zeros((prob.shape[0], len(classes_p)))
    for j, cls in enumerate(knn_p.classes_):
        proba_mat[:, classes_p.index(cls)] = prob[:, j]
    # Target counts from the observed share
    tgt_counts = build_targets_for_missing(df.loc[mask_p_obs, 'premium_user'].astype(int), mask_p.sum(), classes_p)
    fill_vals = marginal_preserving_assign(proba_mat, classes_p, tgt_counts, random_state=42)
    df.loc[mask_p, 'premium_user'] = fill_vals
# 5.5 income_bracket (ordinal) -> KNN classification over levels 1-5
mask_inc = df['income_level'].isna()
if mask_inc.any():
    mask_inc_obs = ~mask_inc
    knn_inc = KNeighborsClassifier(n_neighbors=5, weights='distance')
    knn_inc.fit(X_scaled[mask_inc_obs], df.loc[mask_inc_obs, 'income_level'].astype(int))
    X_miss = scale_transform(make_aligned(df[mask_inc]), scaler, num_mask)
    pred = knn_inc.predict(X_miss)
    df.loc[mask_inc, 'income_level'] = pred
df['income_bracket'] = df['income_level'].round().clip(1, 5).map({v: k for k, v in lvl_map.items()})
# 5.6 age -> KNN regression
mask_age = df['age'].isna()
if mask_age.any():
    mask_age_obs = ~mask_age
    knn_age = KNeighborsRegressor(n_neighbors=5, weights='distance')
    knn_age.fit(X_scaled[mask_age_obs], df.loc[mask_age_obs, 'age'])
    X_miss = scale_transform(make_aligned(df[mask_age]), scaler, num_mask)
    pred = knn_age.predict(X_miss)
    df.loc[mask_age, 'age'] = np.clip(pred, 18, 60)
# 5.7 monthly_spend -> KNN regression on log1p scale
mask_sp = df['monthly_spend'].isna()
if mask_sp.any():
    mask_sp_obs = ~mask_sp
    y = np.log1p(df.loc[mask_sp_obs, 'monthly_spend'].astype(float))
    knn_sp = KNeighborsRegressor(n_neighbors=5, weights='distance')
    knn_sp.fit(X_scaled[mask_sp_obs], y)
    X_miss = scale_transform(make_aligned(df[mask_sp]), scaler, num_mask)
    pred = knn_sp.predict(X_miss)
    df.loc[mask_sp, 'monthly_spend'] = np.maximum(np.expm1(pred), 0.0)
# 5.8 satisfaction_score -> KNN-weighted quantile (tau=0.4) + missingness flag
def knn_weighted_quantile_impute(df_local, target_col, tau=0.4, n_neighbors=7, eps=1e-6):
    mask_miss = df_local[target_col].isna()
    if not mask_miss.any():
        return
    mask_obs = ~mask_miss
    y = df_local.loc[mask_obs, target_col].astype(float).values
    X_obs = scale_transform(make_aligned(df_local[mask_obs]), scaler, num_mask).values
    X_miss = scale_transform(make_aligned(df_local[mask_miss]), scaler, num_mask).values
    # Pairwise distances between missing and observed rows
    from sklearn.metrics import pairwise_distances
    D = pairwise_distances(X_miss, X_obs, metric='euclidean')
    # Nearest neighbors for each missing row (capped at the observed count)
    nn_idx = np.argsort(D, axis=1)[:, :min(n_neighbors, X_obs.shape[0])]
    filled = []
    for i in range(nn_idx.shape[0]):
        idx = nn_idx[i]
        d = D[i, idx]
        w = 1.0 / (d + eps)
        w = w / w.sum()
        yy = y[idx]
        # Weighted quantile: sort neighbors by value, walk the weight CDF to tau
        ord_idx = np.argsort(yy)
        yy_sorted = yy[ord_idx]
        w_sorted = w[ord_idx]
        cdf = np.cumsum(w_sorted)
        pos = min(np.searchsorted(cdf, tau, side='right'), len(yy_sorted) - 1)
        filled.append(int(np.clip(np.round(yy_sorted[pos]), 1, 5)))
    df_local.loc[mask_miss, target_col] = filled
df['satisfaction_score'] = pd.to_numeric(df['satisfaction_score'], errors='coerce')
knn_weighted_quantile_impute(df, 'satisfaction_score', tau=0.4, n_neighbors=7)
df['satisfaction_missing'] = df['is_satisfaction_score_missing']
# ---------- 6) Results and a brief evaluation ----------
# Gender and city distributions
gender_dist_before = (df.loc[df['is_gender_missing'] == 0, 'gender'].value_counts(normalize=True) * 100).round(1)
gender_dist_after = (df['gender'].value_counts(normalize=True) * 100).round(1)
city_dist_before = (df.loc[df['is_city_missing'] == 0, 'city'].value_counts(normalize=True) * 100).round(1)
city_dist_after = (df['city'].value_counts(normalize=True) * 100).round(1)
# Correlation (illustrative only)
corr_before = df.loc[(df['is_age_missing'] == 0) & (df['is_monthly_spend_missing'] == 0), ['age', 'monthly_spend']].corr().iloc[0, 1]
corr_after = df[['age', 'monthly_spend']].corr().iloc[0, 1]
print("Gender dist (obs):\n", gender_dist_before.to_dict())
print("Gender dist (all):\n", gender_dist_after.to_dict())
print("City dist (obs):\n", city_dist_before.to_dict())
print("City dist (all):\n", city_dist_after.to_dict())
print("Age-Spend corr (obs only):", round(float(corr_before),3))
print("Age-Spend corr (after):", round(float(corr_after),3))
# Final output
out_cols = ['user_id','signup_date','gender','age','income_bracket','monthly_spend','premium_user','city','satisfaction_score','satisfaction_missing']
print(df[out_cols])
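Beyond the distribution and correlation printouts above, a mask-and-reimpute check gives a rough accuracy estimate for the KNN strategy. A minimal sketch (it reuses df, X_scaled, and the helper names from the script above and masks observed age values; with only eight rows the numbers are illustrative at best):

def masked_knn_check(df, X_scaled, col='age', n_mask=2, seed=0):
    """Hide a few observed values, re-impute with KNN, report MAE."""
    rng = np.random.default_rng(seed)
    obs_idx = df.index[df[f'is_{col}_missing'] == 0]
    hidden = rng.choice(obs_idx, size=min(n_mask, len(obs_idx) - 2), replace=False)
    truth = df.loc[hidden, col]
    train_idx = obs_idx.difference(hidden)
    knn = KNeighborsRegressor(n_neighbors=3, weights='distance')
    knn.fit(X_scaled.loc[train_idx], df.loc[train_idx, col])
    pred = knn.predict(X_scaled.loc[hidden])
    return float(np.abs(pred - truth.values).mean())

print('Masked-age MAE:', round(masked_knn_check(df, X_scaled), 2))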
Runtime Notes and Caveats
The scheme follows these steps:
Dataset Overview
Missingness Pattern Analysis
Business Impact Assessment
Data Preprocessing Requirements
Concrete Imputation Steps (Python example)
import pandas as pd
import numpy as np
from io import StringIO
raw = """timestamp,line_id,machine_id,shift,temperature_c,vibration_mm_s,defect_count,oee_percent,power_on,maintenance_flag
2024-07-01 08:00,L1,M03,Day,68.5,2.1,1,0.86,TRUE,FALSE
2024-07-01 09:00,L1,M03,Day,69.0,,0,0.88,TRUE,FALSE
2024-07-01 10:00,L1,M03,Day,,2.4,0,0.87,TRUE,FALSE
2024-07-01 11:00,L1,M03,Day,70.2,2.3,,0.85,TRUE,FALSE
2024-07-01 12:00,L1,M03,Day,69.8,2.2,0,,TRUE,FALSE
2024-07-01 13:00,L1,M03,Day,71.0,2.5,2,0.80,,
2024-07-01 14:00,L1,M03,Day,70.9,2.6,1,0.82,TRUE,FALSE
2024-07-01 15:00,L1,M03,Day,,2.4,0,0.83,FALSE,TRUE
2024-07-01 16:00,L1,M03,Day,70.5,,0,0.84,TRUE,FALSE
2024-07-01 17:00,L1,M03,Day,70.1,2.3,0,0.85,TRUE,FALSE
"""
df = pd.read_csv(StringIO(raw), parse_dates=['timestamp'])
# Normalize booleans
for c in ['power_on', 'maintenance_flag']:
    df[c] = df[c].map({'TRUE': True, 'FALSE': False})
df = df.sort_values(['line_id','machine_id','shift','timestamp'])
# Parameters
group_cols = ['line_id','machine_id','shift']
num_cols = ['temperature_c','vibration_mm_s','oee_percent']
cnt_col = 'defect_count'
bool_cols = ['power_on','maintenance_flag']
max_gap_hours = 2
# Step 1: fill boolean gaps with a within-shift neighborhood mode (never across shifts/machines)
def fill_bool_mode(g):
    g = g.copy()
    for bc in bool_cols:
        s = g[bc].astype(float)  # True/False/NaN -> 1.0/0.0/NaN so rolling() works
        # One step of forward and backward fill; accept only where both directions agree
        f = s.ffill(limit=1)
        b = s.bfill(limit=1)
        agree = f.eq(b) & f.notna()
        s2 = s.copy()
        s2[agree & s2.isna()] = f[agree & s2.isna()]
        # Fall back to a centred rolling mode over a 3-hour window
        def _mode(x):
            x = x[~np.isnan(x)]
            if len(x) == 0:
                return np.nan
            m = pd.Series(x).mode()
            return m.iloc[0] if len(m) > 0 else np.nan
        s3 = s2.combine_first(s2.rolling(window=3, center=True, min_periods=2).apply(_mode, raw=True))
        # Last resort: one more one-step ffill/bfill
        s3 = s3.ffill(limit=1).bfill(limit=1)
        g[bc] = s3.map({1.0: True, 0.0: False})
    return g
df = df.groupby(group_cols, group_keys=False).apply(fill_bool_mode)
# Step 2: mark running segments (power_on=True and maintenance_flag=False)
df['is_run'] = (df['power_on'] == True) & (df['maintenance_flag'] == False)

# Number the run segments (any non-running row breaks the segment)
def assign_run_segment(g):
    g = g.sort_values('timestamp').copy()
    seg_id = 0
    seg_ids = []
    prev_run = False
    for run in g['is_run']:
        if run and not prev_run:
            seg_id += 1
        seg_ids.append(seg_id if run else 0)  # 0 marks non-running rows
        prev_run = run
    g['run_seg_id'] = seg_ids
    return g
df = df.groupby(group_cols, group_keys=False).apply(assign_run_segment)
# Step 3: interpolate numerics inside run segments (linear, within segment, never across)
def interp_numeric_in_segment(g):
    g = g.sort_values('timestamp').copy()
    orig_index = g.index  # preserve row labels across the set_index round-trip
    g = g.set_index('timestamp')
    for c in num_cols:
        # Interpolate run-segment rows only; everything else keeps its original value
        s = g[c].copy()
        s_run = s[g['run_seg_id'] > 0]
        s_new = s.copy()
        for seg_id, idx in s_run.groupby(g['run_seg_id']).groups.items():
            seg = s.loc[idx].copy()
            # Linear interpolation for interior gaps only
            seg_interp = seg.interpolate(method='linear', limit_area='inside')
            # Allow one nearest-neighbour step at segment edges
            seg_interp = seg_interp.ffill(limit=1).bfill(limit=1)
            # Note: gaps longer than max_gap_hours should be restored to NaN in a
            # production version; omitted here because this short sample has no long gaps
            s_new.loc[idx] = seg_interp
        # Clip to physical bounds
        if c == 'temperature_c':
            s_new = s_new.clip(lower=60, upper=90)
        if c == 'oee_percent':
            s_new = s_new.clip(lower=0, upper=1)
        if c == 'vibration_mm_s':
            s_new = s_new.clip(lower=0)
        g[c] = s_new
    g = g.reset_index()
    g.index = orig_index
    return g
df = df.groupby(group_cols, group_keys=False).apply(interp_numeric_in_segment)
# Step 4: counts use a centred rolling median (window=3) within segments,
# with one-step ffill/bfill as fallback, then rounding
def fill_counts(g):
    g = g.sort_values('timestamp').copy()
    orig_index = g.index
    g = g.set_index('timestamp')
    s = g[cnt_col]
    # Fill inside run segments only; non-running rows keep their original value
    # (during downtime counts are typically 0 or legitimately missing)
    mask = g['run_seg_id'] > 0
    s_run = s.where(mask)
    # Centred rolling median
    med = s_run.rolling(window=3, center=True, min_periods=1).median()
    filled = s_run.fillna(med)
    filled = filled.ffill(limit=1).bfill(limit=1)
    # Round and force non-negative
    filled = np.rint(filled).clip(lower=0)
    # Write back (run segments only)
    s_final = s.copy()
    s_final[mask] = filled[mask]
    g[cnt_col] = s_final
    g = g.reset_index()
    g.index = orig_index
    return g
df = df.groupby(group_cols, group_keys=False).apply(fill_counts)
# Step 5: build imputation flags from the original missing mask and keep structural missingness.
# Reload the raw data as a backup of the original missingness pattern:
raw_df = pd.read_csv(StringIO(raw), parse_dates=['timestamp'])
for c in ['power_on', 'maintenance_flag']:
    raw_df[c] = raw_df[c].map({'TRUE': True, 'FALSE': False})
raw_df = raw_df.sort_values(['line_id','machine_id','shift','timestamp'])
raw_df = raw_df.merge(df[['timestamp','line_id','machine_id','shift']],
                      on=['timestamp','line_id','machine_id','shift'], how='right')
for c in num_cols + [cnt_col] + bool_cols:
    # Imputed = originally missing and now filled
    df[f'{c}_was_imputed'] = raw_df[c].isna().values & df[c].notna().values
# Structural missingness caused by downtime (numeric fields)
df['is_downtime'] = df['power_on'].eq(False) | df['maintenance_flag'].eq(True)
for c in num_cols + [cnt_col]:
    df[f'{c}_downtime_nan'] = df['is_downtime'] & df[c].isna()
# Inspect the result
print(df[['timestamp','temperature_c','vibration_mm_s','defect_count','oee_percent','power_on','maintenance_flag']])
Validation Methods and Metrics
Common Issues and Remedies
# Simple mask-based validation (randomly mask 20% of values inside run segments)
def mask_and_eval(df_in, col, frac=0.2, seed=42):
    gcols = ['line_id','machine_id','shift']
    df_eval = df_in.copy()
    run_mask = df_eval['is_run'] & df_eval[col].notna()
    idx = df_eval[run_mask].sample(frac=frac, random_state=seed).index
    truth = df_eval.loc[idx, col].copy()
    df_eval.loc[idx, col] = np.nan
    # Re-run the within-segment interpolation for this column only (matching the main flow)
    def _interp_one(g):
        g = g.sort_values('timestamp').copy()
        orig_index = g.index  # keep labels so the masked rows stay addressable
        g = g.set_index('timestamp')
        s = g[col]
        mask = g['run_seg_id'] > 0
        for seg_id, ids in s[mask].groupby(g['run_seg_id']).groups.items():
            seg = s.loc[ids]
            seg_interp = seg.interpolate(method='linear', limit_area='inside').ffill(limit=1).bfill(limit=1)
            s.loc[ids] = seg_interp
        g[col] = s
        g = g.reset_index()
        g.index = orig_index
        return g
    df_eval = df_eval.groupby(gcols, group_keys=False).apply(_interp_one)
    pred = df_eval.loc[idx, col]
    mae = (pred - truth).abs().mean()
    rmse = np.sqrt(((pred - truth) ** 2).mean())
    return mae, rmse

mae_t, rmse_t = mask_and_eval(df, 'temperature_c')
print('Temperature MAE/RMSE:', mae_t, rmse_t)
This scheme follows statistically verifiable methods: it never interpolates across structural boundaries and it does not distort distributions, matching your stated preference for forward/backward fill, time-series interpolation, and median imputation, as well as the production line's business logic.
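To back the "does not distort distributions" claim with a number, one option (an addition of mine, not part of the original recipe) is a two-sample Kolmogorov-Smirnov test between the originally observed values and the values the pipeline filled in; a large p-value means no detectable distortion. On this tiny sample there may be too few imputed points, in which case the check returns None:

from scipy.stats import ks_2samp

def distribution_shift_check(df, col):
    """Compare observed values against imputed values for one column."""
    imputed_mask = df[f'{col}_was_imputed']
    observed = df.loc[~imputed_mask, col].dropna()
    imputed = df.loc[imputed_mask, col].dropna()
    if len(imputed) < 2:  # KS needs at least a couple of points
        return None
    stat, p = ks_2samp(observed, imputed)
    return {'ks_stat': round(stat, 3), 'p_value': round(p, 3)}

print(distribution_shift_check(df, 'temperature_c'))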
With one focused conversation you get a professional-grade missing-value treatment plan: from diagnosing missingness through strategy selection, parameter choice, implementation, and acceptance testing, in a single pass, across sales, user profiling, finance, manufacturing, and survey scenarios. The goal is to shorten cleaning cycles, improve analysis and modeling accuracy, and reduce the risk of ad-hoc guesswork imputation while keeping business logic and data distributions intact. The output doubles as a report and implementation guide ready for sharing and collaboration, making each strategy reusable, measurable, and monitorable, so data quality turns into visible business results and decision confidence.
Quickly identify which fields and reports a missing column affects, design and validate an imputation plan, and stabilize model training and reporting.
Complete key tags and behavior sequences to reduce audience-identification bias and improve conversion in segmentation, outreach, and ad targeting.
Repair missing items and anomalous blanks in monthly statements while keeping definitions consistent, supporting budget forecasting and compliance audits.
Copy the generated prompt into your favorite chat app (ChatGPT, Claude, etc.) and start the conversation directly, with no extra development. Suited to personal, lightweight use.
Turn the prompt template into an API: your program sets the template parameters and calls the endpoint directly, enabling automation and batch processing. Suited to developer integration and embedding in business systems (see the sketch after this list).
Configure the corresponding server address in an MCP client so your AI application can invoke the prompt template automatically. Suited to advanced users and teams who want prompts to work seamlessly across AI tools.
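For the API route, a minimal sketch of what a call might look like. The endpoint URL, token placeholder, and payload field names here are hypothetical; substitute whatever the platform's API documentation actually specifies:

import requests

# Hypothetical endpoint and payload -- consult the platform's API docs for real names.
resp = requests.post(
    'https://api.example.com/v1/prompts/missing-value-imputation',
    headers={'Authorization': 'Bearer <YOUR_TOKEN>'},
    json={'template_params': {'dataset_description': '...', 'columns': ['age', 'city']}},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())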