Popular roles are not just a source of inspiration; they are also your productivity assistants. With carefully curated role prompts you can quickly generate high-quality content, spark creative ideas, and find the solution that best fits your needs. Creation becomes easier and the value more direct!
We keep the role library updated for different user needs, so you can always find the right entry point for inspiration.
Generates an automatic data classification algorithm from the dataset and target categories you provide, with detailed steps, best practices, and example code, to improve data organization and analysis efficiency in data science and machine learning applications.
Applicable scenario: your data is medium-sized (10k to 1M records), there are 12 business labels, the data contains text, categorical, and time-series features, and you prefer a neural network model. Below is a solution that balances accuracy with engineering feasibility.
Goal: improve data quality, reduce noise and leakage, and make the data easier for the model to learn.
Best practices:
Goal: improve class separability through informative features.
Best practices:
Goal: balance accuracy, speed, and maintenance cost. A neural network is preferred.
The following is an end-to-end skeleton covering data preprocessing, feature engineering, the fusion model, and training/evaluation. In a real project, adapt the column names and feature-engineering details to your data schema.
pip install pandas scikit-learn torch torchvision torchaudio transformers einops tqdm imbalanced-learn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
# Assumed data schema (example column names; adjust to your data)
# df columns: ["text", "label", "channel", "region", "created_at", "last_event_ts", "event_count_24h", "avg_gap_minutes", ...]
df = pd.read_csv("tickets.csv")
# Normalize labels (an identity map in this example; map any raw label variants to the 12 canonical categories)
label_map = {
"支付问题": "支付问题",
"物流延迟": "物流延迟",
"退款申请": "退款申请",
"账号登录": "账号登录",
"密码重置": "密码重置",
"功能故障": "功能故障",
"产品咨询": "产品咨询",
"发票开具": "发票开具",
"投诉建议": "投诉建议",
"售后维修": "售后维修",
"订单取消": "订单取消",
"紧急升级": "紧急升级",
}
df["label"] = df["label"].map(label_map)
# Simple text-cleaning example
def clean_text(s):
    if not isinstance(s, str):
        return ""
    s = s.strip()
    s = s.replace("\n", " ")
    # Optionally strip URLs/emails/phone numbers, etc.; omitted here
    return s
df["text"] = df["text"].apply(clean_text)
# Time feature example: hour / weekday
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
df["hour"] = df["created_at"].dt.hour
df["weekday"] = df["created_at"].dt.weekday
# Missing-value handling
for col in ["channel", "region"]:
    df[col] = df[col].fillna("Missing")
for col in ["event_count_24h", "avg_gap_minutes"]:
    df[col] = df[col].fillna(0)
# Stratified train/test split
X = df.drop(columns=["label"])
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.15, random_state=42, stratify=y
)
If each record contains an event sequence (a list of timestamps), aggregate it into statistical features. A sketch is shown below; it takes the record's created_at as the reference time (ref_time):
# Assume a column 'events' where each row is [{"ts": "...", "type": "..."} ...]
def build_time_agg_features(events, ref_time):
    # Count interactions in the last 24h, the average gap between events, and the gap since the
    # last event, all relative to ref_time (typically the record's created_at)
    if not isinstance(events, list) or len(events) == 0:
        return {"event_count_24h": 0, "avg_gap_minutes": 0.0, "last_gap_minutes": 0.0}
    ts = pd.to_datetime(pd.Series([e["ts"] for e in events])).sort_values()
    gaps = ts.diff().dropna().dt.total_seconds() / 60.0
    return {
        "event_count_24h": int((ref_time - ts).between(pd.Timedelta(0), pd.Timedelta(hours=24)).sum()),
        "avg_gap_minutes": float(gaps.mean()) if len(gaps) else 0.0,
        "last_gap_minutes": float((ref_time - ts.iloc[-1]).total_seconds() / 60.0),
    }
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
labels = sorted(y_train.unique())
label2id = {l:i for i,l in enumerate(labels)}
id2label = {i:l for l,i in label2id.items()}
# Choose a lightweight pretrained model (multilingual / Chinese)
PRETRAINED = "distilbert-base-multilingual-cased"  # or a Chinese-specific model
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)
# Tabular feature columns (adjust to your data)
cat_cols = ["channel", "region"]
num_cols = ["hour", "weekday", "event_count_24h", "avg_gap_minutes"]
# Fit the one-hot encoder and scaler on the training set only
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
scaler = StandardScaler()
ohe.fit(X_train[cat_cols])
scaler.fit(X_train[num_cols])
class TicketDataset(Dataset):
def __init__(self, df, y=None):
self.df = df
self.y = y
def __len__(self):
return len(self.df)
def __getitem__(self, idx):
row = self.df.iloc[idx]
text = row["text"]
enc = tokenizer(
text,
truncation=True,
padding="max_length",
max_length=192,
return_tensors="pt"
)
        cat = ohe.transform(row[cat_cols].to_frame().T)    # shape (1, C)
        num = scaler.transform(row[num_cols].to_frame().T)  # shape (1, N)
cat = torch.tensor(cat, dtype=torch.float32).squeeze(0)
num = torch.tensor(num, dtype=torch.float32).squeeze(0)
x_tab = torch.cat([cat, num], dim=-1)
if self.y is not None:
label_id = label2id[self.y.iloc[idx]]
return enc["input_ids"].squeeze(0), enc["attention_mask"].squeeze(0), x_tab, torch.tensor(label_id)
else:
return enc["input_ids"].squeeze(0), enc["attention_mask"].squeeze(0), x_tab
class TextTabModel(nn.Module):
def __init__(self, tab_in_dim, num_classes):
super().__init__()
self.text_model = AutoModel.from_pretrained(PRETRAINED)
        # Freeze the encoder layers first and train only the top layers
for p in self.text_model.parameters():
p.requires_grad = False
        # Text pooling / projection
self.text_proj = nn.Sequential(
nn.Linear(self.text_model.config.hidden_size, 256),
nn.ReLU(),
nn.Dropout(0.2)
)
        # MLP for the tabular features
self.tab_mlp = nn.Sequential(
nn.Linear(tab_in_dim, 128),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(128, 64),
nn.ReLU()
)
        # Fusion classification head
self.classifier = nn.Sequential(
nn.Linear(256 + 64, 128),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(128, num_classes)
)
def forward(self, input_ids, attention_mask, x_tab):
out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use [CLS] or mean pooling; DistilBERT has no pooler_output, so mean-pool the last hidden state
if "last_hidden_state" in out:
x_text = out.last_hidden_state.mean(dim=1) # mean pooling
else:
x_text = out.pooler_output
x_text = self.text_proj(x_text)
x_tab = self.tab_mlp(x_tab)
x = torch.cat([x_text, x_tab], dim=-1)
logits = self.classifier(x)
return logits
# DataLoaders
train_ds = TicketDataset(X_train, y_train)
test_ds = TicketDataset(X_test, y_test)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)
# Class weights for the imbalanced label distribution
y_int = y_train.map(label2id).values
class_weights = compute_class_weight(class_weight="balanced", classes=np.arange(len(labels)), y=y_int)
class_weights_t = torch.tensor(class_weights, dtype=torch.float32)
# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tab_in_dim = ohe.transform(X_train[cat_cols][:1]).shape[1] + len(num_cols)
model = TextTabModel(tab_in_dim=tab_in_dim, num_classes=len(labels)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
criterion = nn.CrossEntropyLoss(weight=class_weights_t.to(device))
def evaluate(loader):
model.eval()
all_preds, all_trues = [], []
with torch.no_grad():
for input_ids, attention_mask, x_tab, labels_t in loader:
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
x_tab = x_tab.to(device)
labels_t = labels_t.to(device)
logits = model(input_ids, attention_mask, x_tab)
preds = logits.argmax(dim=-1).cpu().numpy()
all_preds.extend(list(preds))
all_trues.extend(list(labels_t.cpu().numpy()))
return np.array(all_preds), np.array(all_trues)
best_f1 = 0.0
for epoch in range(8):
model.train()
for input_ids, attention_mask, x_tab, labels_t in train_loader:
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
x_tab = x_tab.to(device)
labels_t = labels_t.to(device)
logits = model(input_ids, attention_mask, x_tab)
loss = criterion(logits, labels_t)
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
scheduler.step()
preds, trues = evaluate(test_loader)
from sklearn.metrics import f1_score
f1_macro = f1_score(trues, preds, average="macro")
print(f"Epoch {epoch} Macro-F1: {f1_macro:.4f}")
    # Keep the best checkpoint (simple early-stopping-style selection)
if f1_macro > best_f1:
best_f1 = f1_macro
torch.save(model.state_dict(), "best_model.pt")
# Detailed report
preds, trues = evaluate(test_loader)
print(classification_report(trues, preds, target_names=labels))
print(confusion_matrix(trues, preds))
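# Baseline for comparison: TF-IDF + logistic regression (quick sanity check against the neural model)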
from sklearn.linear_model import LogisticRegression
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2))
Xtr_tfidf = tfidf.fit_transform(X_train["text"])
Xte_tfidf = tfidf.transform(X_test["text"])
lr = LogisticRegression(max_iter=1000, class_weight="balanced")
lr.fit(Xtr_tfidf, y_train)
preds_lr = lr.predict(Xte_tfidf)
print(classification_report(y_test, preds_lr))
Automatic data classification can significantly improve data organization and business response efficiency. For a medium-sized dataset with multimodal features, fusing a Transformer text encoder with an MLP over tabular and time-aggregated features is a practical and maintainable choice. Build on solid data quality and preprocessing, combine it with imbalance handling and rigorous evaluation, and set up continuous monitoring and iteration so the model reliably assigns data to your 12 categories in production, with dependable detection of critical classes such as "紧急升级" (urgent escalation).
Automatic data classification means using machine learning models to automatically assign raw data to predefined categories (such as "domestic politics" or "tech and internet"). In data organization and analysis it can:
For your scenario (over 1 million samples, Chinese news text plus several categorical features, 15 topic categories, a Scala/Spark environment, and a preference for SVM), we recommend a distributed Spark ML Pipeline with One-vs-Rest LinearSVC (linear-kernel SVM), combined with efficient Chinese tokenization and robust feature engineering.
Each step comes with both the theoretical key points and Scala (Spark ML) code snippets you can use directly.
Best practices:
Options:
Best practices:
A common and efficient approach (well suited to large-scale text):
Best practices:
Alternative models (for trade-off consideration):
Metrics and methods:
Notes:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification.{LinearSVC, OneVsRest}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
// 1) Spark Session
val spark = SparkSession.builder()
.appName("NewsMultiClassSVM")
.getOrCreate()
import spark.implicits._
// 2) Load data: assume columns id, title, content, category (string), sourceType (string categorical feature)
val raw = spark.read
  .option("header", "true")
  .option("multiLine", "true") // handle long, multi-line text
  .csv("hdfs:///path/to/news.csv")
  .select(
    $"id",
    coalesce($"title", lit("")).as("title"),
    coalesce($"content", lit("")).as("content"),
    $"category".as("label_str"),
    coalesce($"sourceType", lit("UNK")).as("sourceType")
  ).dropDuplicates("title","content") // simple de-duplication
// 3) Clean and concatenate text
val cleanTextUdf = udf { (title: String, content: String) =>
  val t = Option(title).getOrElse("")
  val c = Option(content).getOrElse("")
  val txt = (t + " " + c)
    .replaceAll("<[^>]*>", " ")      // strip HTML
    .replaceAll("\\s+", " ")         // normalize whitespace
    .replaceAll("[\\p{Cntrl}]", " ") // control characters
    .trim
  txt
}
val df = raw.withColumn("text", cleanTextUdf($"title", $"content"))
  .filter(length($"text") >= 10) // drop extremely short texts
// 4) Split the data
val Array(train0, test) = df.randomSplit(Array(0.9, 0.1), seed = 42L)
// Carve a validation set out of the training data for reporting
// (TrainValidationSplit below performs its own internal split during tuning)
val Array(train, valid) = train0.randomSplit(Array(0.9, 0.1), seed = 7L)
// 5) Label encoding
val labelIndexer = new StringIndexer()
  .setInputCol("label_str")
  .setOutputCol("label")
  .setHandleInvalid("skip")
  .fit(train) // fit on the training set only, to avoid leakage
// 6) Chinese tokenization (two options, choose one)
// 6A) Simple UDF tokenizer (replace with a Jieba/HanLP call in practice; shown for illustration only)
val simpleCut = udf { (s: String) =>
  // Split on anything that is not a CJK character, Latin letter, or digit, then drop empty tokens
  s.split("[^\\p{IsHan}\\p{IsAlphabetic}\\p{IsDigit}]+")
    .filter(_.nonEmpty)
    .toSeq
}
// 6B) With Spark NLP, use its DocumentAssembler + Tokenizer + Normalizer pipeline instead
// Stopwords (replace with your own Chinese stopword list file)
val stopwords = spark.read.textFile("hdfs:///path/to/stopwords_zh.txt").collect()
val remover = new StopWordsRemover()
.setStopWords(stopwords)
.setInputCol("tokens")
.setOutputCol("tokens_clean")
val tokenizer = new RegexTokenizer()
  .setInputCol("text")
  .setOutputCol("tokens")
  .setPattern("[^\\p{IsHan}\\p{IsAlphabetic}\\p{IsDigit}]+") // same delimiters as simpleCut above
  .setMinTokenLength(1)
// Optional n-grams
val bigram = new NGram()
.setN(2)
.setInputCol("tokens_clean")
.setOutputCol("tokens_bi")
val hasherUni = new HashingTF()
.setInputCol("tokens_clean")
.setOutputCol("tf_uni")
.setNumFeatures(1 << 18)
val hasherBi = new HashingTF()
.setInputCol("tokens_bi")
.setOutputCol("tf_bi")
.setNumFeatures(1 << 18)
val assemblerTF = new VectorAssembler()
.setInputCols(Array("tf_uni", "tf_bi"))
.setOutputCol("tf_all")
val idf = new IDF()
.setInputCol("tf_all")
.setOutputCol("tfidf")
.setMinDocFreq(5)
// Categorical feature example: sourceType
val srcIndexer = new StringIndexer()
.setInputCol("sourceType")
.setOutputCol("sourceType_idx")
.setHandleInvalid("keep")
val srcOHE = new OneHotEncoder()
.setInputCols(Array("sourceType_idx"))
.setOutputCols(Array("sourceType_vec"))
// Assemble features and standardize (SVM is scale-sensitive)
val featAssembler = new VectorAssembler()
.setInputCols(Array("tfidf", "sourceType_vec"))
.setOutputCol("features_raw")
val scaler = new StandardScaler()
  .setInputCol("features_raw")
  .setOutputCol("features")
  .setWithMean(false) // keep false for sparse vectors (centering would densify them)
  .setWithStd(true)
// 7) Handle class imbalance: compute a weight column
val labelCounts = labelIndexer.transform(train).groupBy("label").count().collect()
val total = labelCounts.map(_.getLong(1)).sum.toDouble
val labelToFreq = labelCounts.map(r => (r.getDouble(0), r.getLong(1).toDouble / total)).toMap
val bcMap = spark.sparkContext.broadcast(labelToFreq)
val weightUdf = udf { (label: Double) =>
  // Smoothed inverse-sqrt-frequency weight
  val freq = bcMap.value.getOrElse(label, 1e-9)
  1.0 / math.sqrt(freq + 1e-9)
}
val trainLabeled = labelIndexer.transform(train)
.withColumn("classWeight", weightUdf($"label"))
val validLabeled = labelIndexer.transform(valid)
val testLabeled = labelIndexer.transform(test)
// 8) Model (LinearSVC + OneVsRest)
val lsvc = new LinearSVC()
.setFeaturesCol("features")
.setLabelCol("label")
.setMaxIter(50)
.setRegParam(0.1)
val ovr = new OneVsRest()
.setClassifier(lsvc)
.setFeaturesCol("features")
.setLabelCol("label")
  .setWeightCol("classWeight") // pass the instance weights
// 9) Optional: chi-square feature selection (better placed before the scaler, since it operates on discrete counts/TF)
val selector = new ChiSqSelector()
.setFeaturesCol("tfidf")
.setLabelCol("label")
.setOutputCol("tfidf_sel")
  .setNumTopFeatures(200000) // tune according to available cluster resources
val featAssembler2 = new VectorAssembler()
.setInputCols(Array("tfidf_sel", "sourceType_vec"))
.setOutputCol("features_raw")
// 10) Assemble the pipeline. labelIndexer is applied to the dataframes beforehand (the training
//     data already carries the indexed label and classWeight columns), so it is not a pipeline stage;
//     swap featAssembler for featAssembler2 + selector if you enable chi-square selection.
val pipeline = new Pipeline().setStages(Array(
  tokenizer, remover, bigram,
  hasherUni, hasherBi, assemblerTF, idf,
  srcIndexer, srcOHE,
  featAssembler, // or featAssembler2 + selector
  scaler,
  ovr
))
// 11) Hyperparameter search (TrainValidationSplit to save resources)
val paramGrid = new ParamGridBuilder()
.addGrid(hasherUni.numFeatures, Array(1<<18, 1<<19))
.addGrid(hasherBi.numFeatures, Array(1<<18))
.addGrid(idf.minDocFreq, Array(3, 5))
.addGrid(lsvc.regParam, Array(0.05, 0.1, 0.2))
.addGrid(lsvc.maxIter, Array(50, 100))
.build()
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("label")
.setPredictionCol("prediction")
.setMetricName("f1") // Weighted-F1
val tvs = new TrainValidationSplit()
.setEstimator(pipeline)
.setEvaluator(evaluator)
.setEstimatorParamMaps(paramGrid)
.setTrainRatio(0.8)
.setParallelism(2)
// 12) Train (note: use the training data that carries the weight column)
val model = tvs.fit(trainLabeled)
// 13) Validation and test
val validPred = model.transform(validLabeled)
val testPred = model.transform(testLabeled)
val f1Valid = evaluator.evaluate(validPred)
val f1Test = evaluator.evaluate(testPred)
println(s"Valid Weighted-F1 = $f1Valid")
println(s"Test Weighted-F1 = $f1Test")
// Macro-F1: average the per-class F1 scores
import org.apache.spark.mllib.evaluation.MulticlassMetrics
val rdd = testPred.select($"prediction", $"label").as[(Double, Double)].rdd
val metrics = new MulticlassMetrics(rdd)
val macroF1 = metrics.labels.map(l => metrics.fMeasure(l)).sum / metrics.labels.length
println(s"Test Macro-F1 = $macroF1")
// 14) Save the model
val bestPipelineModel = model.bestModel.asInstanceOf[PipelineModel]
bestPipelineModel.write.overwrite().save("hdfs:///path/to/news_svm_ovr_pipeline")
Inference service (load and predict):
val loaded = PipelineModel.load("hdfs:///path/to/news_svm_ovr_pipeline")
val toPredict = spark.read.parquet("hdfs:///path/to/new_items.parquet") // must contain the text/sourceType etc. fields
val pred = loaded.transform(toPredict)
.select($"id", $"prediction")
pred.write.mode("overwrite").parquet("hdfs:///path/to/pred_result")
Automatic data classification means using algorithms to automatically assign data samples to predefined categories (for example: positive, negative, sarcastic). In data organization and analysis it can:
In this scenario the dataset has fewer than 10,000 records, there are 8 sentiment/emotion labels, the features include text and numeric fields, and the preferred model is random forest. Below is a systematic solution in R, from data to deployment, balancing implementability and maintainability.
Each step provides actionable techniques, best practices, and R code snippets or pseudocode.
Goal: produce a clean, structured, learnable data matrix and avoid information leakage.
Data audit and consistency
Text cleaning (Chinese)
Numeric feature handling
Data splitting
Example: basic cleaning and extraction of numeric "emotion cue" features (R)
library(dplyr); library(stringr); library(purrr)
# df: data.frame(text, label, <numerical cols...>)
# Basic cleaning
clean_text <- function(x) {
x %>%
str_replace_all("[\\p{Cc}\\p{Cf}]+", " ") %>% # 控制符
str_replace_all("[\\s]+", " ") %>%
str_trim()
}
# Compute emotion/sarcasm cue features
sentiment_clues <- function(x) {
tibble(
exclam_cnt = str_count(x, fixed("!")),
ques_cnt = str_count(x, fixed("?")),
ellipsis_cnt = str_count(x, "…|\\.\\.\\."),
quote_cnt = str_count(x, "[“”\"']"),
negation_cnt = str_count(x, "不|没|无|别|非|别想|从不|毫无"),
haha_cnt = str_count(x, "呵呵|哈哈|lol|🙂|😅|😉"),
angry_cnt = str_count(x, "气死|生气|愤怒|怒|😡"),
    emoji_cnt = str_count(x, "[\\p{So}]") # symbol class; approximate emoji count
)
}
df <- df %>%
mutate(
text = clean_text(text)
) %>%
bind_cols(sentiment_clues(.$text))
A robust approach for small samples of Chinese text:
Building TF-IDF + LSA with text2vec (interface example that avoids data leakage)
library(text2vec)
library(jiebaR)
library(Matrix)
# Tokenizer (Chinese)
wk <- jiebaR::worker()
tokenize_cn <- function(x) {
lapply(x, function(doc) jiebaR::segment(doc, wk))
}
# Split the data
set.seed(42)
idx <- caret::createDataPartition(df$label, p = 0.8, list = FALSE)
tr <- df[idx, ]
te <- df[-idx, ]
# Fit the vocabulary / TF-IDF / LSA on the training set only
it_tr <- itoken(tr$text, tokenizer = tokenize_cn, progressbar = FALSE)
vocab <- create_vocabulary(it_tr, ngram = c(1L, 2L)) %>%
prune_vocabulary(term_count_min = 2, doc_proportion_max = 0.5, vocab_term_max = 40000)
vectorizer <- vocab_vectorizer(vocab)
dtm_tr <- create_dtm(it_tr, vectorizer)
tfidf <- TfIdf$new()
dtm_tr_tfidf <- tfidf$fit_transform(dtm_tr)
lsa <- LSA$new(n_topics = 200)
emb_tr <- lsa$fit_transform(dtm_tr_tfidf) # low-dimensional representation of the training documents (n_train x 200)
# Only transform (no fitting) on the test set
it_te <- itoken(te$text, tokenizer = tokenize_cn, progressbar = FALSE)
dtm_te <- create_dtm(it_te, vectorizer)
dtm_te_tfidf <- tfidf$transform(dtm_te)
emb_te <- lsa$transform(dtm_te_tfidf)
# Numeric features (numeric columns other than text/label, plus the sentiment clues built above);
# impute and robust-scale using statistics fitted on the training set only, to avoid leakage
library(tidyr)  # replace_na
num_cols <- df %>% select(where(is.numeric)) %>% colnames()
num_med <- sapply(tr[num_cols], median, na.rm = TRUE)
num_mad <- sapply(tr[num_cols], mad, constant = 1, na.rm = TRUE)
prep_num <- function(d) as.data.frame(mapply(
  function(x, m, s) (replace_na(x, m) - m) / (s + 1e-9),
  d[num_cols], num_med, num_mad))
Xnum_tr <- prep_num(tr)
Xnum_te <- prep_num(te)
# Fuse the features
X_tr <- cbind(as.data.frame(emb_tr), Xnum_tr)
X_te <- cbind(as.data.frame(emb_te), Xnum_te)
y_tr <- factor(tr$label)
y_te <- factor(te$label, levels = levels(y_tr))
Training the random forest (ranger, with class-imbalance weights and a tuning grid)
library(ranger)
library(yardstick)
library(dplyr)
# Class weights: inverse of class frequency
w <- table(y_tr)
class_w <- as.numeric(max(w) / w)
names(class_w) <- names(w)
# Simple grid search
set.seed(42)
grid <- expand.grid(
mtry = c(floor(sqrt(ncol(X_tr))), floor(ncol(X_tr)/4), floor(ncol(X_tr)/2)),
min_node_size = c(1, 5, 10),
num_trees = c(500, 1000)
)
best <- NULL; best_f1 <- -Inf
for(i in seq_len(nrow(grid))) {
g <- grid[i, ]
fit <- ranger(
dependent.variable.name = "label",
data = cbind(label = y_tr, X_tr),
probability = TRUE,
num.trees = g$num_trees,
mtry = g$mtry,
min.node.size = g$min_node_size,
class.weights = class_w,
importance = "permutation",
seed = 42
)
pred <- predict(fit, data = X_te)$predictions
pred_class <- colnames(pred)[max.col(pred, ties.method = "first")]
res <- yardstick::f_meas_vec(truth = y_te, estimate = factor(pred_class, levels = levels(y_te)),
estimator = "macro", event_level = "first")
if(res > best_f1) { best_f1 <- res; best <- fit }
}
cat(sprintf("Best macro-F1 on holdout: %.4f\n", best_f1))
# Confusion matrix and additional metrics
best_pred <- predict(best, data = X_te)$predictions
best_class <- factor(colnames(best_pred)[max.col(best_pred, ties.method = "first")],
                     levels = levels(y_te))
eval_df <- tibble(truth = y_te, estimate = best_class)
multi_metrics <- yardstick::metric_set(accuracy, kap, f_meas)
print(multi_metrics(eval_df, truth = truth, estimate = estimate))
print(table(truth = y_te, prediction = best_class))
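As a linear reference baseline (the summary at the end of this section also suggests comparing against a linear SVM or logistic regression), here is a minimal sketch using multinomial logistic regression from glmnet on the TF-IDF matrices built above; the package choice and settings are assumptions to adapt to your environment.
library(glmnet)
set.seed(42)
# cv.glmnet accepts the sparse TF-IDF matrix directly; type.measure = "class" tunes lambda by error rate
cv_lin <- cv.glmnet(dtm_tr_tfidf, y_tr, family = "multinomial", type.measure = "class", nfolds = 5)
pred_lin <- as.vector(predict(cv_lin, newx = dtm_te_tfidf, s = "lambda.min", type = "class"))
f1_lin <- yardstick::f_meas_vec(truth = y_te, estimate = factor(pred_lin, levels = levels(y_te)))
cat(sprintf("Linear baseline macro-F1: %.4f\n", f1_lin))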
Tips:
Cross-validation pseudocode (avoiding feature leakage)
cv_eval <- function(df, K = 5) {
folds <- rsample::vfold_cv(df, v = K, strata = "label")
out <- purrr::map_dfr(folds$splits, function(s) {
tr <- rsample::analysis(s); va <- rsample::assessment(s)
    # 1) Fit on tr: tokenizer -> vocab -> tfidf -> lsa; transform va only
    # 2) Process numeric features (fit median/MAD on tr, then apply to va)
    # 3) Train ranger (with class.weights)
    # 4) Predict on va and compute macro-F1 etc.
tibble(f1_macro = f1_value_here)
})
summarize(out, f1_macro_mean = mean(f1_macro), f1_macro_sd = sd(f1_macro))
}
Saving and loading
# Save the artifacts after training
saveRDS(list(vocab = vocab, tfidf = tfidf, lsa = lsa), "text_pipeline.rds")
saveRDS(best, "rf_model.rds")
saveRDS(list(num_cols = num_cols,
num_median = tr %>% summarise(across(all_of(num_cols), ~median(., na.rm=TRUE))),
num_mad = tr %>% summarise(across(all_of(num_cols), ~mad(., constant=1, na.rm=TRUE))))
, "num_stats.rds")
Online/batch inference (simplified example)
predict_classes <- function(new_df) {
pipe <- readRDS("text_pipeline.rds")
model <- readRDS("rf_model.rds")
num_stats <- readRDS("num_stats.rds")
new_df <- new_df %>% mutate(text = clean_text(text)) %>% bind_cols(sentiment_clues(.$text))
it_new <- text2vec::itoken(new_df$text, tokenizer = tokenize_cn, progressbar = FALSE)
dtm_new <- text2vec::create_dtm(it_new, text2vec::vocab_vectorizer(pipe$vocab))
dtm_new_tfidf <- pipe$tfidf$transform(dtm_new)
emb_new <- pipe$lsa$transform(dtm_new_tfidf)
Xnum <- new_df %>% select(all_of(names(num_stats$num_median)))
Xnum <- Xnum %>% mutate(across(everything(), ~replace_na(., as.numeric(num_stats$num_median[[cur_column()]]))))
robust <- function(x, med, md) (x - med) / (md + 1e-9)
for (col in names(Xnum)) {
med <- as.numeric(num_stats$num_median[[col]])
md <- as.numeric(num_stats$num_mad[[col]])
Xnum[[col]] <- robust(Xnum[[col]], med, md)
}
X <- cbind(as.data.frame(emb_new), Xnum)
pred <- predict(model, data = X)$predictions
probs <- as.data.frame(pred)
cls <- colnames(pred)[max.col(pred, ties.method = "first")]
tibble(class = cls, prob = do.call(pmax, as.list(probs)))
}
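A hypothetical usage example follows; the text values and the numeric column name (reply_count) are made up for illustration and must match the columns your model was actually trained on.
new_df <- tibble(
  text = c("物流太慢了,等了十天还没到", "客服态度真是好得很呢"),
  reply_count = c(2, 0)  # hypothetical numeric column; supply the same numeric columns as in training
)
predict_classes(new_df)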
For a small dataset with multiple Chinese emotion classes, the pipeline of text TF-IDF, LSA dimensionality reduction, and numeric features feeding a random forest is both robust and easy to implement. Combined with stratified CV, class weights, careful engineering of textual emotion cues, and error analysis, it can deliver a reliable Macro-F1. Maintaining data quality and continuous evaluation is the key to sustained performance. Keep a linear SVM or logistic regression as a comparison baseline (a glmnet sketch is shown above), and introduce lightweight sentence embeddings if classes such as sarcasm need better separability.
Helps users design algorithms for automatic data classification; through structured guidance and code examples it improves data organization efficiency and analytical capability while lowering the technical barrier, so the results can be applied directly to real scenarios.
Data scientists who want to quickly develop and validate classification models can save time on repetitive work and rapidly build automated analysis pipelines.
Staff responsible for classifying and organizing internal company data can use this prompt to easily improve data organization efficiency and analytical depth, supporting business growth.
Developers who need to design efficient data-processing modules for application systems can quickly obtain classification algorithms and integrate them with existing development requirements.
Copy the prompt generated from the template into your usual chat application (such as ChatGPT or Claude) and use it directly in conversation, with no extra development. Suitable for quick personal trials and lightweight use.
Turn the prompt template into an API: your program can modify the template parameters freely and call it directly through the interface, enabling automation and batch processing. Suitable for developer integration and embedding in business systems.
Configure the corresponding server address in your MCP client so your AI application can invoke the prompt template automatically. Suitable for advanced users and team collaboration, letting prompts work seamlessly across different AI tools.