Popular roles are not just a source of inspiration; they are also your productivity assistant. With carefully curated role prompts you can quickly generate high-quality content, spark creative ideas, and find the solution that best fits your needs. Creation becomes easier and the value more direct.
We keep the role library updated for different user needs, so you can always find a suitable entry point for inspiration.
This prompt provides professional NLP guidance covering the full sentiment analysis workflow: data collection, text preprocessing, feature extraction, model selection and training, model evaluation, and deployment. Based on the data sources, system specifications, and programming language you provide, you can build a robust model, obtain accurate insight into customer sentiment, and keep the model optimized and continuously updated. It suits research, enterprise customer analytics, and data-driven decision-making.
The following solution targets sentiment analysis of Chinese e-commerce reviews and after-sales conversations. It is tailored to your hardware (RTX 3060 12GB, Python 3.10, CUDA 12.1) and tool stack (Transformers, PyTorch, spaCy, Scikit-learn), uses BERT as the main model, and adds a practical baseline plus engineering recommendations.
import pandas as pd
from pathlib import Path

def load_data(paths):
    dfs = []
    for p in paths:
        p = Path(p)
        if p.suffix.lower() == ".csv":
            dfs.append(pd.read_csv(p))
        elif p.suffix.lower() == ".json":
            dfs.append(pd.read_json(p, lines=True))
    df = pd.concat(dfs, ignore_index=True)
    # Basic cleaning: drop duplicates and rows missing key fields, parse timestamps
    df = df.drop_duplicates(subset=["comment", "user_id", "timestamp"])
    df = df.dropna(subset=["comment", "rating", "timestamp"])
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    return df

df = load_data(["data/reviews_2023.csv", "data/reviews_2024.json"])
print(df.sample(3).to_dict(orient="records"))
import re
import spacy
import unicodedata

# After installing spaCy, run: python -m spacy download zh_core_web_sm
nlp = spacy.load("zh_core_web_sm")

negations = set(["不", "没", "無", "未", "别", "毋", "甭", "非", "否", "难以", "并非"])  # extend with your own lexicon
syn_map = {"非常好": "很好", "挺棒": "很好"}  # example only; substitute your synonym dictionary

url_re = re.compile(r'https?://\S+|www\.\S+')
html_re = re.compile(r'<.*?>')
phone_email_re = re.compile(r'(\+?\d[\d -]{7,}\d)|([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})')

def normalize_text(s: str) -> str:
    s = str(s)
    s = unicodedata.normalize("NFKC", s)  # unify full-width/half-width characters
    s = url_re.sub(" ", s)
    s = html_re.sub(" ", s)
    s = phone_email_re.sub(" ", s)
    for k, v in syn_map.items():
        s = s.replace(k, v)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def tokenize_with_negation(s: str):
    doc = nlp(s)
    tokens = []
    negate = False
    for t in doc:
        if t.text in negations:
            tokens.append(t.text)
            negate = True
            continue
        if t.is_punct:  # punctuation ends the negation scope
            negate = False
            continue
        tok = t.text
        if negate:
            tok = "NOT_" + tok
        tokens.append(tok)
    return tokens

df["comment_clean"] = df["comment"].apply(normalize_text)
pip install -U torch torchvision --index-url https://download.pytorch.org/whl/cu121
pip install -U transformers datasets accelerate scikit-learn spacy
python -m spacy download zh_core_web_sm
# Optional, for deployment/optimization
pip install -U onnxruntime-gpu optimum fastapi uvicorn[standard] mlflow
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Labels (3-class example; swap the mapping if you want 5 classes)
def map_label(r):
    if r <= 2:
        return 0
    elif r == 3:
        return 1
    else:
        return 2

df["label"] = df["rating"].apply(map_label)
train_df, test_df = train_test_split(df, test_size=0.15, shuffle=False)

# Combine word-level and character-level features (useful for Chinese)
def spacy_tokenizer(s):
    return tokenize_with_negation(s)

vec_word = TfidfVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 2), min_df=3, max_features=200000)
vec_char = TfidfVectorizer(analyzer='char', ngram_range=(2, 5), min_df=3, max_features=100000)
Xw = vec_word.fit_transform(train_df["comment_clean"])
Xc = vec_char.fit_transform(train_df["comment_clean"])

from scipy.sparse import hstack
X_train = hstack([Xw, Xc])
y_train = train_df["label"]

# Note: if you adopt the time-based split used for BERT below, refit the vectorizers on that training subset.
clf = LogisticRegression(max_iter=2000, class_weight='balanced', n_jobs=-1)
clf.fit(X_train, y_train)
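A minimal sketch of scoring new comments with this baseline, reusing the fitted vectorizers and the negation-aware tokenizer; the example comments are illustrative.

new_texts = [normalize_text(t) for t in ["物流很快,包装完好", "客服态度差,不会再买了"]]  # illustrative comments
X_new = hstack([vec_word.transform(new_texts), vec_char.transform(new_texts)])
print(clf.predict(X_new))        # class ids under the 3-class mapping (0=neg, 1=neu, 2=pos)
print(clf.predict_proba(X_new))  # per-class probabilities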
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Chronological 70/15/15 split to avoid temporal leakage; labels use the raw 1-5 ratings (5 classes, finer-grained than the 3-class baseline)
df = df.sort_values("timestamp").reset_index(drop=True)
n = len(df)
test_df = df.iloc[int(n * 0.85):]
val_df = df.iloc[int(n * 0.70):int(n * 0.85)]
train_df = df.iloc[:int(n * 0.70)]

df_train = Dataset.from_pandas(train_df[["comment_clean", "rating"]].rename(columns={"comment_clean": "text"}), preserve_index=False)
df_val = Dataset.from_pandas(val_df[["comment_clean", "rating"]].rename(columns={"comment_clean": "text"}), preserve_index=False)
df_test = Dataset.from_pandas(test_df[["comment_clean", "rating"]].rename(columns={"comment_clean": "text"}), preserve_index=False)

def map_rating_to_label(ex):
    ex["labels"] = int(ex["rating"] - 1)  # ratings 1-5 -> labels 0-4
    return ex

tokenizer = BertTokenizerFast.from_pretrained("hfl/chinese-macbert-base")

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding=False, max_length=128)

ds = DatasetDict({
    "train": df_train.map(map_rating_to_label).map(tokenize, batched=True),
    "val": df_val.map(map_rating_to_label).map(tokenize, batched=True),
    "test": df_test.map(map_rating_to_label).map(tokenize, batched=True),
}).remove_columns(["text", "rating"])

num_labels = 5
model = BertForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=num_labels)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average='macro', zero_division=0)
    return {"accuracy": acc, "precision_macro": p, "recall_macro": r, "f1_macro": f1}

training_args = TrainingArguments(
    output_dir="chkpt/macbert_sa",
    learning_rate=2e-5,
    per_device_train_batch_size=24,  # fits in 12GB on an RTX 3060; drop to 16 if you hit OOM
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    lr_scheduler_type="linear",
    warmup_ratio=0.06,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,  # mixed-precision (AMP) speedup
    gradient_accumulation_steps=1,
    logging_steps=50,
    report_to=["none"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
metrics = trainer.evaluate(ds["test"])
print(metrics)

trainer.save_model("artifacts/macbert_sa")
tokenizer.save_pretrained("artifacts/macbert_sa")
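If you installed the optional optimum/onnxruntime packages listed earlier, a minimal sketch of exporting the fine-tuned checkpoint to ONNX is shown below; the output directory name is an assumption and this step is an optional optimization, not part of the core pipeline.

# Optional: export the fine-tuned model to ONNX for onnxruntime inference.
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

ort_model = ORTModelForSequenceClassification.from_pretrained("artifacts/macbert_sa", export=True)
ort_model.save_pretrained("artifacts/macbert_sa_onnx")
AutoTokenizer.from_pretrained("artifacts/macbert_sa").save_pretrained("artifacts/macbert_sa_onnx")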
from sklearn.metrics import classification_report, confusion_matrix

# Baseline evaluation on the chronological validation split (3-class mapping from above)
X_val = hstack([vec_word.transform(val_df["comment_clean"]), vec_char.transform(val_df["comment_clean"])])
y_val = val_df["label"]
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred, digits=4))
print(confusion_matrix(y_val, y_pred))
from fastapi import FastAPI
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification
import uvicorn

app = FastAPI()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizerFast.from_pretrained("artifacts/macbert_sa")
model = BertForSequenceClassification.from_pretrained("artifacts/macbert_sa").to(device).eval()

@torch.inference_mode()
def predict_batch(texts):
    enc = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.cuda.amp.autocast(enabled=device.type == "cuda"):
        logits = model(**enc).logits
    probs = torch.softmax(logits, dim=-1).detach().cpu().numpy()
    preds = probs.argmax(axis=1).tolist()
    return preds, probs

@app.post("/predict")
def predict(payload: dict):
    texts = payload.get("texts", [])
    preds, probs = predict_batch(texts)
    return {"preds": preds, "probs": probs.tolist()}  # convert the numpy array for JSON serialization

# Run: uvicorn app:app --host 0.0.0.0 --port 8080 --workers 1
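An example client call against this service; it assumes the API is running locally on port 8080 as in the uvicorn command above, and the sample texts are illustrative.

import requests

resp = requests.post(
    "http://localhost:8080/predict",
    json={"texts": ["物流很快,包装完好", "客服态度差,不会再买了"]},
)
print(resp.json())  # {"preds": [...], "probs": [[...], ...]}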
Addendum: end-to-end scaffolding (from the TF-IDF baseline to BERT)
If needed, I can use the actual distribution of your sample (class proportions, text lengths, channel differences) to propose a more precise split strategy, tuned hyperparameters, and a concrete configuration for your latency target.
The following solution targets your business and technical environment (English/Spanish customer-service text, Kafka/Parquet, Java 17 inference, Kubernetes without GPUs, TensorFlow/Keras/OpenNLP/FastText). It focuses on practical end-to-end steps and includes code examples that can be run directly.
Data collection
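The training script further below reads Parquet files; a minimal sketch of draining customer-service messages from Kafka into Parquet is shown here. The topic name, broker address, and message fields are assumptions, and kafka-python is used purely for illustration.

# Minimal sketch (assumed topic/broker/field names): drain messages from Kafka and append them to Parquet.
import json
import pandas as pd
from kafka import KafkaConsumer  # pip install kafka-python

consumer = KafkaConsumer(
    "support-tickets",                         # assumed topic name
    bootstrap_servers=["kafka:9092"],          # assumed broker address
    value_deserializer=lambda v: json.loads(v.decode("utf-8")),
    auto_offset_reset="earliest",
    enable_auto_commit=True,
    consumer_timeout_ms=10000,                 # stop polling after 10s of silence (batch-style export)
)

records = [msg.value for msg in consumer]      # each value assumed to carry ticket_id/lang/text/csat/ts
if records:
    pd.DataFrame(records).to_parquet("landing/tickets_batch.parquet", index=False)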
Data preprocessing
Tool selection
Feature extraction
import tensorflow as tf
from tensorflow.keras import layers, models

max_tokens = 30000
tfidf_vec = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="tf-idf",
    standardize="lower_and_strip_punctuation"  # can be replaced with a custom function
)
# Build the vocabulary before training (call adapt before fit)
# tfidf_vec.adapt(tf.data.Dataset.from_tensor_slices(texts_train).batch(1024))

inp = layers.Input(shape=(1,), dtype=tf.string, name="text")
x = tfidf_vec(inp)
x = layers.Dense(256, activation="relu")(x)
x = layers.Dropout(0.3)(x)
out = layers.Dense(3, activation="softmax")(x)
tfidf_model = models.Model(inp, out)
tfidf_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
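A small usage sketch with toy data to illustrate the required order of operations, adapt first and then fit; the texts and labels below are made up, and in practice you would feed the tf.data pipelines built from your Parquet data.

import numpy as np

texts_train = np.array(["the agent solved my issue quickly",
                        "nunca más compro aquí, pésimo servicio",
                        "it was ok, nothing special"])  # toy examples
labels_train = np.array([2, 0, 1])                       # 0=neg, 1=neu, 2=pos

tfidf_vec.adapt(texts_train)                             # build vocabulary and IDF weights first
tfidf_model.fit(texts_train[:, None], labels_train, epochs=1, batch_size=2)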
- LSTM (TextVectorization token ids → Embedding → BiLSTM), implemented below
import tensorflow as tf
from tensorflow.keras import layers, models

max_tokens = 30000
seq_len = 220
emb_dim = 100
lstm_units = 64

def custom_standardize(x):
    # keep accents; replace urls/emails/numbers; lowercase; collapse whitespace
    x = tf.strings.lower(x)
    x = tf.strings.regex_replace(x, r'https?://\S+|www\.\S+', ' <url> ')
    x = tf.strings.regex_replace(x, r'\S+@\S+', ' <email> ')
    x = tf.strings.regex_replace(x, r'\d+', ' <num> ')
    x = tf.strings.regex_replace(x, r'\s+', ' ')
    return x

vec = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=seq_len,
    standardize=custom_standardize
)
# vec.adapt(ds_text_train)  # ds_text_train: a tf.data.Dataset of strings

inp = layers.Input(shape=(1,), dtype=tf.string, name="text")
x = vec(inp)
x = layers.Embedding(max_tokens, emb_dim, name="embed")(x)
x = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=False))(x)
x = layers.Dropout(0.3)(x)
out = layers.Dense(3, activation="softmax")(x)
lstm_model = models.Model(inp, out)
# Precision/recall/macro-F1 are computed with scikit-learn in the evaluation step;
# Keras Precision/Recall metrics expect binary or one-hot labels.
lstm_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
Model selection
Model training
# train_lstm_sentiment.py
import os
import pandas as pd
import pyarrow.dataset as ds
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

PARQUET_PATH = "/path/to/parquet"        # directory or glob
EXPORT_DIR = "export/sentiment_lstm/1"   # version suffix used by TF Serving
BATCH = 128
EPOCHS = 8
MAX_TOKENS = 30000
SEQ_LEN = 220
EMB = 100
LSTM_UNITS = 64
LANG_TOK = {"en": "<lang_en>", "es": "<lang_es>"}
LABEL2ID = {"neg": 0, "neu": 1, "pos": 2}

def map_label(csat):
    if pd.isna(csat): return None
    if csat <= 2: return "neg"
    if csat == 3: return "neu"
    return "pos"

def load_df():
    dataset = ds.dataset(PARQUET_PATH, format="parquet")
    table = dataset.to_table(columns=['ticket_id', 'lang', 'text', 'csat', 'ts'])
    df = table.to_pandas()
    df = df[df['lang'].isin(['en', 'es'])]
    df = df.dropna(subset=['text'])
    df['label'] = df['csat'].apply(map_label)
    df = df.dropna(subset=['label'])
    df['ts'] = pd.to_datetime(df['ts'])
    # Deduplicate by ticket, keeping the latest record
    df = df.sort_values('ts').drop_duplicates('ticket_id', keep='last')
    # Prepend a language token
    df['text'] = df.apply(lambda r: f"{LANG_TOK.get(r['lang'], '')} {r['text']}", axis=1)
    return df

def stratified_split(df, val_ratio=0.1, test_start="2024-11-01"):
    test = df[df['ts'] >= test_start]
    trainval = df[df['ts'] < test_start].sample(frac=1, random_state=42)
    train_parts, val_parts = [], []
    for (lab, lang), g in trainval.groupby(['label', 'lang']):
        n_val = int(len(g) * val_ratio)
        val_parts.append(g.iloc[:n_val])
        train_parts.append(g.iloc[n_val:])
    train = pd.concat(train_parts); val = pd.concat(val_parts)
    return train, val, test

def make_ds(texts, labels, batch=BATCH, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices((texts.values, labels.values))
    if shuffle: ds = ds.shuffle(10000, seed=42)
    return ds.batch(batch).prefetch(tf.data.AUTOTUNE)

def custom_standardize(x):
    x = tf.strings.lower(x)
    x = tf.strings.regex_replace(x, r'https?://\S+|www\.\S+', ' <url> ')
    x = tf.strings.regex_replace(x, r'\S+@\S+', ' <email> ')
    x = tf.strings.regex_replace(x, r'\d+', ' <num> ')
    x = tf.strings.regex_replace(x, r'\s+', ' ')
    return x

def build_model():
    vec = layers.TextVectorization(
        max_tokens=MAX_TOKENS, output_mode="int",
        output_sequence_length=SEQ_LEN, standardize=custom_standardize, name="vectorize"
    )
    inp = layers.Input(shape=(1,), dtype=tf.string, name="text")
    x = vec(inp)
    x = layers.Embedding(MAX_TOKENS, EMB, name="embed")(x)
    x = layers.Bidirectional(layers.LSTM(LSTM_UNITS))(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(3, activation="softmax")(x)
    model = models.Model(inp, out)
    model.vec = vec
    # Precision/recall/macro-F1 are computed with scikit-learn in the evaluation step;
    # Keras Precision/Recall metrics expect binary or one-hot labels.
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

def main():
    df = load_df()
    train, val, test = stratified_split(df)
    y_map = lambda s: s.map(LABEL2ID).astype('int32')
    ds_text_train = tf.data.Dataset.from_tensor_slices(train['text'].values).batch(1024)
    model = build_model()
    model.vec.adapt(ds_text_train)
    # Class weights to counter class imbalance
    counts = train['label'].value_counts()
    total = len(train)
    class_weight = {LABEL2ID[k]: total / (3 * counts[k]) for k in LABEL2ID}
    ds_tr = make_ds(train['text'], y_map(train['label']), shuffle=True)
    ds_va = make_ds(val['text'], y_map(val['label']))
    ds_te = make_ds(test['text'], y_map(test['label']))
    es = callbacks.EarlyStopping(monitor="val_accuracy", patience=2, restore_best_weights=True)
    model.fit(ds_tr, validation_data=ds_va, epochs=EPOCHS, class_weight=class_weight, callbacks=[es])
    # Evaluation
    metrics = model.evaluate(ds_te, return_dict=True)
    print("Test metrics:", metrics)
    # Export a SavedModel (includes the TextVectorization layer)
    os.makedirs(EXPORT_DIR, exist_ok=True)
    model.save(EXPORT_DIR)  # the directory name doubles as the version for TF Serving
    # On Keras 3 (TF >= 2.16), use model.export(EXPORT_DIR) to produce a SavedModel instead
    print("Saved to", EXPORT_DIR)

if __name__ == "__main__":
    main()
# Generate train/validation/test txt files (see the data-preparation script below)
# One example per line: __label__pos <lang_en> your text...
./fasttext supervised -input train.txt -output ft_model -lr 0.5 -epoch 10 -wordNgrams 2 -dim 100 -loss hs
./fasttext test ft_model.bin valid.txt
./fasttext predict-prob ft_model.bin test.txt > preds.txt
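For spot checks from Python, the trained model can also be loaded with the fasttext pip package; the sample sentence below is illustrative.

import fasttext

ft = fasttext.load_model("ft_model.bin")
labels, probs = ft.predict("<lang_es> excelente servicio, gracias!", k=3)
print(labels, probs)  # labels like ('__label__pos', ...), with probabilities sorted in descending order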
import numpy as np
from sklearn.metrics import classification_report, f1_score  # implement F1 manually if you prefer not to use sklearn

y_true = test['label'].map(LABEL2ID).values
y_prob = model.predict(make_ds(test['text'], y_map(test['label'])))  # shape (N, 3)
y_pred = y_prob.argmax(axis=1)
print("Macro-F1:", f1_score(y_true, y_pred, average='macro'))
print(classification_report(y_true, y_pred, target_names=['neg', 'neu', 'pos']))

# Per-language evaluation
for lg in ['en', 'es']:
    mask = (test['lang'] == lg).values
    print(lg, f1_score(y_true[mask], y_pred[mask], average='macro'))
- Thresholding / cost sensitivity: if the business cares more about recall on the negative class, lower the decision threshold for predicting negative, or use class weights / focal loss; a sketch follows below.
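A minimal sketch of such a threshold adjustment on the softmax outputs; the 0.35 cutoff is illustrative and should be tuned on the validation set, and indices follow LABEL2ID with 0 = neg.

import numpy as np

def decide(probs, neg_threshold=0.35):
    # Predict "neg" whenever its probability clears the (lowered) threshold, otherwise take the argmax.
    probs = np.asarray(probs)
    preds = probs.argmax(axis=1)
    preds[probs[:, 0] >= neg_threshold] = 0
    return preds

y_pred_costaware = decide(y_prob)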
// Send a request to TF Serving's REST API with OkHttp (the serving_default signature accepts string inputs)
OkHttpClient client = new OkHttpClient();
String[] texts = new String[] {
    "<lang_en> package arrived late, very disappointed",
    "<lang_es> excelente servicio, gracias!"
};
String payload = "{\"instances\":" + new com.fasterxml.jackson.databind.ObjectMapper().writeValueAsString(texts) + "}";
Request req = new Request.Builder()
    .url("http://tf-serving:8501/v1/models/sentiment_lstm:predict")
    .post(RequestBody.create(payload, MediaType.parse("application/json")))
    .build();
try (Response resp = client.newCall(req).execute()) {
    String body = resp.body().string();
    // Parse {"predictions":[[p_neg,p_neu,p_pos], ...]}
    // The argmax index is the class; the maximum value is the confidence
}
CREATE TABLE IF NOT EXISTS sentiment_predictions (
ticket_id TEXT PRIMARY KEY,
ts TIMESTAMPTZ NOT NULL,
lang TEXT NOT NULL,
sentiment TEXT NOT NULL, -- neg/neu/pos
prob REAL NOT NULL,
model_version TEXT NOT NULL,
created_at TIMESTAMPTZ DEFAULT now()
);
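A sketch of writing one prediction row into this table from Python; the connection string and sample values are assumptions, and the Java service could do the equivalent with JDBC.

import psycopg2

conn = psycopg2.connect("postgresql://user:pass@postgres:5432/analytics")  # assumed DSN
with conn, conn.cursor() as cur:
    cur.execute(
        """
        INSERT INTO sentiment_predictions (ticket_id, ts, lang, sentiment, prob, model_version)
        VALUES (%s, %s, %s, %s, %s, %s)
        ON CONFLICT (ticket_id) DO UPDATE
          SET sentiment = EXCLUDED.sentiment, prob = EXCLUDED.prob, model_version = EXCLUDED.model_version
        """,
        ("T-1001", "2024-11-15T10:02:00Z", "en", "neg", 0.91, "sentiment_lstm/1"),
    )
conn.close()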
Addendum: sample data preparation (Python, generating FastText training files from Parquet)
import pandas as pd, pyarrow.dataset as ds

def to_label(csat):
    if pd.isna(csat): return None
    if csat <= 2: return "__label__neg"
    if csat == 3: return "__label__neu"
    return "__label__pos"

dataset = ds.dataset("/path/to/parquet", format="parquet")
df = dataset.to_table(columns=['ticket_id', 'lang', 'text', 'csat', 'ts']).to_pandas()
df = df[df['lang'].isin(['en', 'es'])].dropna(subset=['text'])
df['label'] = df['csat'].apply(to_label)
df = df.dropna(subset=['label'])
df['ts'] = pd.to_datetime(df['ts'])
df = df.sort_values('ts').drop_duplicates('ticket_id', keep='last')
df['text'] = df.apply(lambda r: f"{'<lang_en>' if r['lang'] == 'en' else '<lang_es>'} {r['text']}", axis=1)
train = df[df['ts'] < '2024-11-01']
test = df[df['ts'] >= '2024-11-01']

def dump(df, path):
    with open(path, 'w', encoding='utf8') as f:
        for _, r in df.iterrows():
            text = r['text'].replace('\n', ' ')  # keep one example per line (f-strings cannot contain backslashes before Python 3.12)
            f.write(f"{r['label']} {text}\n")

dump(train, "train.txt")
dump(test, "test.txt")
Summary of key practices
The following is an end-to-end sentiment analysis plan for short social-media and app-store reviews (mixed Chinese and English plus emoji). The implementation is primarily in R, with interoperability notes for the Python ecosystem (NLTK/spaCy/TensorFlow, Scikit-learn, FastText), and it fits your current servers, MinIO storage, Airflow scheduling, and REST gateway deployment. The main model is a linear SVM on TF-IDF features (bag-of-words or character n-grams), covering both batch and streaming prediction as well as reproducible experiment reports.
# Install: install.packages(c("arrow","dplyr"))
library(arrow); library(dplyr)

# Connect to MinIO via arrow (S3 protocol)
fs <- S3FileSystem$create(
  access_key = Sys.getenv("MINIO_ACCESS_KEY"),
  secret_key = Sys.getenv("MINIO_SECRET_KEY"),
  endpoint_override = Sys.getenv("MINIO_ENDPOINT"),  # e.g. "http://minio:9000"
  scheme = "http"
)

# Read the partitioned dataset (example bucket/prefix; pass the path relative to the filesystem)
ds <- open_dataset("sentiment/raw/2024-06_2025-02/", filesystem = fs, format = "parquet")
data <- ds %>% select(post_id, platform, locale, content, rating, opt_in, created_at) %>% collect()

# Keep consented, non-empty texts
data <- data %>% filter(opt_in == TRUE, !is.na(content), nchar(content) > 0)
# Install: install.packages(c("quanteda","stringi","dplyr"))
library(quanteda); library(stringi); library(dplyr)

clean_text <- function(x) {
  x <- stri_trans_nfkc(x)
  x <- stri_replace_all_regex(x, "https?://\\S+|www\\.\\S+", " ")              # URLs
  x <- stri_replace_all_regex(x, "@[A-Za-z0-9_]+", "@user ")                   # @mentions
  x <- stri_replace_all_regex(x, "#(\\p{L}[\\p{L}\\p{N}_]+)", "hashtag_\\1 ")  # hashtags
  x <- stri_replace_all_charclass(x, "\\p{Control}", " ")
  x <- stri_replace_all_regex(x, "([!\\?\\.])\\1{2,}", "\\1\\1")               # repeated punctuation
  x <- stri_replace_all_regex(x, "([A-Za-z])\\1{2,}", "\\1\\1")                # elongated words
  x <- stri_trim_both(stri_replace_all_regex(x, "\\s+", " "))
  x
}
data$clean <- clean_text(data$content)

# Character n-gram tokens and DFM
toks <- tokens(data$clean, what = "character", remove_punct = FALSE, remove_numbers = FALSE)
toks <- tokens_ngrams(toks, n = 2:5, concatenator = "")
dfm_all <- dfm(toks, tolower = FALSE)
docnames(dfm_all) <- data$post_id  # keep post ids as document names for later alignment
dfm_all <- dfm_trim(dfm_all, min_docfreq = 5, docfreq_type = "count")
dfm_all <- dfm_trim(dfm_all, max_docfreq = 0.9, docfreq_type = "prop")
dfm_all_tfidf <- dfm_tfidf(dfm_all, scheme_tf = "logcount", scheme_df = "inverse")
# R
install.packages(c("quanteda","jiebaR","stringi","dplyr","rsample","yardstick",
                   "LiblineaR","plumber","arrow","mlflow","Matrix","renv","withr"))
# Python (install via reticulate from R, or in a system environment)
# install.packages("reticulate")
reticulate::py_install(c("spacy","scikit-learn","nltk","gensim","fasttext"), envname = "r-nlp", method = "conda")
# Install the spaCy models (command line)
# python -m spacy download en_core_web_sm
# python -m spacy download zh_core_web_sm
# dfm_all_tfidf is available from above (sparse, dgCMatrix-compatible)
# Label construction (weak supervision from the rating field)
lab_map <- function(r) ifelse(r <= 2, "neg", ifelse(r == 3, "neu", ifelse(r >= 4, "pos", NA)))
data$label <- lab_map(data$rating)
labeled <- data %>% filter(!is.na(label))

# Align the DFM and labels on the same document subset (docnames are post ids)
dfm_lbl <- dfm_all_tfidf[docnames(dfm_all_tfidf) %in% labeled$post_id, ]
labeled <- labeled %>% filter(post_id %in% docnames(dfm_lbl))
y <- factor(labeled$label, levels = c("neg","neu","pos"))
# Install: install.packages(c("rsample","LiblineaR","Matrix","dplyr","yardstick","purrr"))
library(rsample); library(LiblineaR); library(Matrix); library(purrr); library(yardstick)

# Chronological split (e.g. 80/20); keep the DFM, labels and metadata in the same row order
labeled$created_at <- as.POSIXct(labeled$created_at, tz = "UTC")
labeled <- labeled %>% arrange(created_at)
dfm_lbl <- dfm_lbl[as.character(labeled$post_id), ]        # reorder rows to match `labeled`
y <- factor(labeled$label, levels = c("neg","neu","pos"))  # recompute labels in the same order

split_idx <- floor(0.8 * nrow(labeled))
train_ids <- labeled$post_id[1:split_idx]
test_ids <- labeled$post_id[(split_idx + 1):nrow(labeled)]
X_train <- dfm_lbl[docnames(dfm_lbl) %in% train_ids, ]
X_test <- dfm_lbl[docnames(dfm_lbl) %in% test_ids, ]
y_train <- y[labeled$post_id %in% train_ids]
y_test <- y[labeled$post_id %in% test_ids]

# Tuning and training (type = 1: L2-regularized L2-loss SVC, primal);
# convert with as(X_train, "dgCMatrix") if LiblineaR rejects the dfm class
cost_grid <- 10 ^ seq(-3, 2, by = 1)
set.seed(42)
cv_results <- map_dfr(cost_grid, function(C) {
  m <- LiblineaR(data = X_train, target = y_train, type = 1, cost = C,
                 cross = 5, bias = TRUE, verbose = FALSE)
  tibble(cost = C, cv_accuracy = m)  # with cross > 0, LiblineaR returns the CV accuracy
})
bestC <- cv_results$cost[which.max(cv_results$cv_accuracy)]
final_model <- LiblineaR(data = X_train, target = y_train, type = 1, cost = bestC, bias = TRUE, verbose = TRUE)

# Prediction
pred_test <- predict(final_model, X_test)$predictions
library(yardstick); library(tibble); library(dplyr)

results <- tibble(
  truth = factor(y_test, levels = levels(y_train)),
  pred = factor(pred_test, levels = levels(y_train))
)

# Overall metrics (use distinct object names so the yardstick functions are not masked)
acc <- accuracy(results, truth, pred)
macro_f1 <- f_meas(results, truth, pred, estimator = "macro")
prec_macro <- precision(results, truth, pred, estimator = "macro")
rec_macro <- recall(results, truth, pred, estimator = "macro")

# Confusion matrix
cm <- conf_mat(results, truth, pred)
print(acc); print(macro_f1); print(prec_macro); print(rec_macro); print(cm)
saveRDS(list(model = final_model,
             feat_names = colnames(X_train),
             dfm_settings = list(ngram = "char2-5", tfidf = TRUE, trim = list(min_df = 5, max_df = 0.9))),
        "artifacts/sentiment_svm_charngram.rds")
# Install: install.packages("plumber")
library(plumber); library(quanteda); library(LiblineaR); library(Matrix); library(stringi)

bundle <- readRDS("artifacts/sentiment_svm_charngram.rds")
final_model <- bundle$model; feat_names <- bundle$feat_names
# Reuse the same clean_text() function as in training

#* @post /predict
function(content = "") {
  txt <- clean_text(content)
  toks <- tokens(txt, what = "character", remove_punct = FALSE, remove_numbers = FALSE)
  toks <- tokens_ngrams(toks, n = 2:5, concatenator = "")
  dfm_new <- dfm(toks, tolower = FALSE)
  dfm_new <- dfm_match(dfm_new, feat_names)  # align with the training feature space
  # Note: dfm_tfidf below recomputes weights from the incoming document only; for strict parity
  # with training, persist the training idf weights and reuse them here.
  dfm_new <- dfm_tfidf(dfm_new, scheme_tf = "logcount", scheme_df = "inverse")
  pred <- predict(final_model, dfm_new)$predictions[1]
  list(prediction = as.character(pred))
}
# Run: pr("api.R") %>% pr_run(port = 8000, host = "0.0.0.0")
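An example request against this endpoint from Python; it assumes plumber's default JSON serialization and the service running locally on port 8000 as above, and the review text is illustrative.

import requests

r = requests.post("http://localhost:8000/predict", json={"content": "app keeps crashing 太气了 😡"})
print(r.json())  # e.g. {"prediction": ["neg"]}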
Optional: interoperability with the Python ecosystem (covering the NLTK/spaCy/TensorFlow and Scikit-learn/FastText preferences)
# install.packages("spacyr"); spacyr::spacy_initialize(model = "en_core_web_sm")
library(spacyr)
# English/Chinese tokenization (download the corresponding models first)
# txt_tokens <- spacy_tokenize(texts, lemma = FALSE, tag = FALSE)

# scikit-learn LinearSVC (via reticulate)
library(reticulate)
sk <- import("sklearn.svm")
calib <- import("sklearn.calibration")
sp <- import("scipy.sparse")

# Convert a dgCMatrix (column-compressed) to a scipy CSC matrix
to_csc <- function(m) {
  i <- as.integer(m@i)
  p <- as.integer(m@p)
  x <- as.numeric(m@x)
  dims <- as.integer(dim(m))
  sp$csc_matrix(tuple(x, i, p), shape = tuple(dims[1], dims[2]))
}

Xtr_py <- to_csc(as(X_train, "dgCMatrix"))
Xte_py <- to_csc(as(X_test, "dgCMatrix"))
ytr_py <- as.character(y_train)

clf <- sk$LinearSVC(C = bestC)
# For probabilities, wrap it in CalibratedClassifierCV
# (use estimator= instead of base_estimator= on scikit-learn >= 1.2)
# clf <- calib$CalibratedClassifierCV(estimator = sk$LinearSVC(C = bestC), method = "sigmoid", cv = 5L)
clf$fit(Xtr_py, ytr_py)
pred_py <- clf$predict(Xte_py)
Experimentation and compliance notes
Summary
Help users build their own sentiment analysis model step by step, from data collection to deployment, so they can parse customer feedback accurately and ultimately improve decision-making efficiency and customer satisfaction.
Quickly develop a sentiment analysis model from scratch to accurately interpret market opinion and customer feedback, providing a scientific basis for product iteration.
Use a sentiment analysis model to efficiently process data from social media or customer reviews and quickly surface user sentiment trends and market feedback.
Build a robust sentiment analysis foundation that supports automated analysis of customer opinions and real-time response needs in the enterprise.
Copy the prompt generated by the template into your usual chat application (such as ChatGPT or Claude) and start the conversation directly, with no extra development. Suitable for quick personal trials and lightweight use.
Turn the prompt template into an API: your program can modify the template parameters freely and call it through the interface, making automation and batch processing easy. Suitable for developer integration and embedding into business systems.
Configure the corresponding server address in an MCP client so that your AI application can call the prompt template automatically. Suitable for advanced users and team collaboration, letting prompts move seamlessly between AI tools.