5.9 KiB
5.9 KiB
Hugging Face Transformers 指南
文本分类
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
# Load the tokenizer and the sequence-classification model from one checkpoint.
model_name = 'bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): `num_classes` is not defined anywhere in this snippet; it
# has to be set to the number of target labels before this line runs.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)
# 数据预处理
def preprocess(examples):
    """Tokenize the ``'text'`` field of a batch for the classifier.

    The tokenizer is asked to truncate and to pad to ``max_length`` with
    ``max_length=512``, so every row should come out with the same length.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw input
            text(s) -- presumably a list of strings, since the caller maps
            this function with ``batched=True``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that input.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(examples['text'], **tokenizer_options)
# Wrap the DataFrame in a Dataset and tokenize it batch by batch.
# NOTE(review): `df` is not defined in this snippet; it is expected to be a
# pandas DataFrame with a 'text' column (the field `preprocess` reads).
dataset = Dataset.from_pandas(df).map(preprocess, batched=True)
# Training configuration.
# NOTE(review): `eval_strategy` is the spelling used by recent transformers
# releases; confirm it matches the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    # Optimization schedule.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # Per-device batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Logging, evaluation and checkpointing cadence.
    logging_steps=100,
    eval_strategy='epoch',
    save_strategy='epoch',
    # Reload the best checkpoint at the end, ranked by the 'f1' metric that
    # `compute_metrics` reports.
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)
# 评估指标
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with the class axis last. When the model
            returns additional outputs, ``predictions`` arrives as a tuple;
            its first element is then used (presumably the logits --
            confirm against the model's output order).

    Returns:
        dict: ``{'accuracy': ..., 'f1': ...}`` where ``'f1'`` is computed
        with ``average='macro'``.
    """
    predictions, labels = eval_pred
    # A tuple has no `.argmax`; unwrap it so models that emit extra outputs
    # do not crash the evaluation loop.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = predictions.argmax(axis=-1)
    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1_score(labels, predicted_classes, average='macro'),
    }
# Train.
# NOTE(review): `train_dataset` and `eval_dataset` are not created in this
# snippet (only `dataset` is); they must be defined, e.g. by splitting
# `dataset`, before this runs.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()
命名实体识别 (NER)
from transformers import AutoModelForTokenClassification
# Label vocabulary and the two lookup tables derived from it.
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

# Token-classification head sized to the label set; the mappings are passed
# along with the model so they are available from its configuration.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
# 数据预处理(处理子词对齐)
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER tags to the tokens.

    Only the first token of each word receives that word's tag. Every other
    position -- positions with no word id (presumably special tokens) and
    the remaining sub-word pieces of a word -- is labelled ``-100``
    (presumably the value the loss ignores; confirm against the training
    setup).

    Args:
        examples: Batch mapping with ``'tokens'`` (one list of words per
            example) and ``'ner_tags'`` (one list of tag ids per example,
            indexed by word position).

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding one
        aligned label list per example.
    """
    encoding = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
    )

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples['ner_tags']):
        row = []
        last_word = None
        for word in encoding.word_ids(batch_index=batch_idx):
            starts_new_word = word is not None and word != last_word
            # Tag only the first piece of each word; mask everything else.
            row.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned_labels.append(row)

    encoding['labels'] = aligned_labels
    return encoding
文本生成
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch  # needed for `torch.float16` below; not imported elsewhere in this file

# Load the chat model and its tokenizer. The weights are requested in
# float16 and `device_map='auto'` leaves device placement to the library.
model_name = 'Qwen/Qwen2-7B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map='auto',
)
# Generate a reply.
messages = [
    {"role": "system", "content": "你是一个有帮助的助手。"},
    {"role": "user", "content": "写一首关于春天的诗"},
]
# Render the conversation to a prompt string (not token ids) that ends with
# the generation prompt.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(text, return_tensors='pt').to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    # Sampling settings.
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    # Reuse the end-of-sequence token as the padding token.
    pad_token_id=tokenizer.eos_token_id,
)
# Skip the first `input_ids` length positions of the output (presumably the
# echoed prompt) so only the continuation is decoded.
response = tokenizer.decode(
    outputs[0][inputs['input_ids'].shape[1]:],
    skip_special_tokens=True,
)
Embedding 模型
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model.
model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
# Encode the sentences; normalized embeddings are requested from the encoder.
sentences = ['这是一个句子', '这是另一个句子']
embeddings = model.encode(sentences, normalize_embeddings=True)
# 相似度计算
from sklearn.metrics.pairwise import cosine_similarity
# Compare the first two embeddings as 1-row matrices and read the single
# entry of the resulting 1x1 similarity matrix.
# (assumes `embeddings` is a 2-D array, one row per sentence -- TODO confirm)
similarity = cosine_similarity(embeddings[:1], embeddings[1:2])[0, 0]
LoRA 微调
from peft import LoraConfig, get_peft_model, TaskType
# LoRA configuration.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj'],  # modules to adapt
    r=8,  # LoRA rank
    lora_alpha=32,  # scaling factor
    lora_dropout=0.1,
)

# Apply LoRA.
# NOTE(review): `model` is whatever was bound last before this snippet; it
# presumably needs to be a causal LM exposing the target modules above.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Save after training.
model.save_pretrained('lora_model')
# 加载
from peft import PeftModel
# NOTE(review): `base_model_name` is not defined in this snippet; it is
# presumably the checkpoint the adapter was trained on -- set it before running.
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
# Attach the adapter saved under 'lora_model' to the freshly loaded base model.
model = PeftModel.from_pretrained(base_model, 'lora_model')
Pipeline 快速推理
from transformers import pipeline
# Text classification.
# NOTE(review): 'bert-base-chinese' is the same base checkpoint this guide
# loads for fine-tuning; confirm it carries a trained classification head,
# or substitute a fine-tuned checkpoint.
classifier = pipeline(task='text-classification', model='bert-base-chinese')
result = classifier('这是一个测试句子')

# Named-entity recognition.
# NOTE(review): verify that 'bert-base-chinese-ner' resolves to a real
# checkpoint before relying on it.
ner = pipeline(
    task='ner',
    model='bert-base-chinese-ner',
    aggregation_strategy='simple',
)
entities = ner('张三在北京工作')

# Text generation.
generator = pipeline(task='text-generation', model='gpt2')
text = generator('Once upon a time', max_length=50)

# Question answering.
qa = pipeline(
    task='question-answering',
    model='bert-large-uncased-whole-word-masking-finetuned-squad',
)
answer = qa(question='What is AI?', context='AI is artificial intelligence...')

# Zero-shot classification. This rebinds `classifier` and `result` from the
# text-classification example above.
# NOTE(review): no model is given, so the pipeline picks its default
# checkpoint; confirm that default handles Chinese text and labels.
classifier = pipeline(task='zero-shot-classification')
result = classifier('这是一篇体育新闻', candidate_labels=['体育', '科技', '娱乐'])
量化推理
from transformers import BitsAndBytesConfig
import torch  # needed for `torch.float16` below; not imported elsewhere in this file

# 4-bit quantization settings: NF4 quantization type, float16 compute dtype,
# double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the model with the quantization settings applied.
# Relies on `model_name` and `AutoModelForCausalLM` from the text-generation
# snippet earlier in this file.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
)