# Hugging Face Transformers 指南
|
|
|
|
## 文本分类
|
|
|
|
```python
|
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
from transformers import Trainer, TrainingArguments
|
|
from datasets import Dataset
|
|
|
|
# Load the tokenizer and the sequence-classification model from one checkpoint.
model_name = 'bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE: `num_classes` is not defined in this snippet; set it to the number of
# target categories before running.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)
|
|
|
|
# 数据预处理
|
|
def preprocess(examples):
    """Tokenize the ``text`` field of a batch.

    Inputs are truncated and padded to a fixed length of 512 tokens.
    """
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(examples['text'], **encode_options)
|
|
|
|
# Build a Dataset from the DataFrame and tokenize it in batches.
# NOTE: `df` is supplied by the reader; `preprocess` reads its 'text' column.
dataset = Dataset.from_pandas(df)
dataset = dataset.map(preprocess, batched=True)

# Hold out 20% of the rows for evaluation. These two names are the splits
# passed to Trainer as `train_dataset` / `eval_dataset`.
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']
|
|
|
|
# Training configuration
training_args = TrainingArguments(
    output_dir='./results',
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    # Per-device batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Logging, evaluation and checkpointing cadence
    logging_steps=100,
    eval_strategy='epoch',
    save_strategy='epoch',
    # Best-checkpoint selection is keyed on the 'f1' metric
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)
|
|
|
|
# 评估指标
|
|
from sklearn.metrics import accuracy_score, f1_score
|
|
|
|
def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for a ``(predictions, labels)`` pair.

    The predicted class is the argmax over the last axis of the predictions.
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(axis=-1)
    accuracy = accuracy_score(references, predicted_ids)
    macro_f1 = f1_score(references, predicted_ids, average='macro')
    return {'accuracy': accuracy, 'f1': macro_f1}
|
|
|
|
# Training: bundle the model, arguments, datasets and metric function,
# then run the training loop.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)
trainer.train()
|
|
```
|
|
|
|
## 命名实体识别 (NER)
|
|
|
|
```python
|
|
from transformers import AutoModelForTokenClassification
|
|
|
|
# Label mappings: tag string <-> integer id, following the order of label_list.
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}
|
|
|
|
# Token-classification head sized to the tag set, with both label mappings
# passed along to the loaded model.
label_config = {
    'num_labels': len(label_list),
    'id2label': id2label,
    'label2id': label2id,
}
model = AutoModelForTokenClassification.from_pretrained(model_name, **label_config)
|
|
|
|
# 数据预处理(处理子词对齐)
|
|
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level NER tags to the tokens.

    For each example, the first token of every word receives that word's tag.
    Tokens with no source word, and every later sub-token of a word, receive
    the label -100. The aligned labels are stored under ``'labels'`` on the
    tokenizer output, which is returned.
    """
    tokenized = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
    )

    def align(batch_index, word_tags):
        # Map one example's word-level tags onto its token positions.
        aligned = []
        last_word = None
        for word in tokenized.word_ids(batch_index=batch_index):
            if word is None or word == last_word:
                # No source word, or a continuation sub-token of the same word.
                aligned.append(-100)
            else:
                aligned.append(word_tags[word])
            last_word = word
        return aligned

    tokenized['labels'] = [
        align(index, tags) for index, tags in enumerate(examples['ner_tags'])
    ]
    return tokenized
|
|
```
|
|
|
|
## 文本生成
|
|
|
|
```python
|
|
import torch  # required for torch.float16 below
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = 'Qwen/Qwen2-7B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights as float16; device_map='auto' leaves device placement
# to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map='auto',
)
|
|
|
|
# Generation: render the chat prompt, sample a reply, decode only the new tokens.
messages = [
    {"role": "system", "content": "你是一个有帮助的助手。"},
    {"role": "user", "content": "写一首关于春天的诗"},
]

text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(text, return_tensors='pt').to(model.device)

generation_kwargs = dict(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(**inputs, **generation_kwargs)

# Skip the prompt tokens so that only the generated continuation is decoded.
prompt_length = inputs['input_ids'].shape[1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
|
|
```
|
|
|
|
## Embedding 模型
|
|
|
|
```python
|
|
from sentence_transformers import SentenceTransformer
|
|
|
|
# Load the sentence-embedding model.
model = SentenceTransformer('BAAI/bge-large-zh-v1.5')

# Encode the sentences with normalize_embeddings=True.
sentences = ['这是一个句子', '这是另一个句子']
embeddings = model.encode(sentences, normalize_embeddings=True)

# Cosine similarity between the first and second sentence.
from sklearn.metrics.pairwise import cosine_similarity

first_vector, second_vector = embeddings[0], embeddings[1]
similarity_matrix = cosine_similarity([first_vector], [second_vector])
similarity = similarity_matrix[0][0]
|
|
```
|
|
|
|
## LoRA 微调
|
|
|
|
```python
|
|
from peft import LoraConfig, get_peft_model, TaskType
|
|
|
|
# LoRA configuration
attention_projections = ['q_proj', 'v_proj', 'k_proj', 'o_proj']
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # LoRA rank
    lora_alpha=32,  # scaling factor
    lora_dropout=0.1,
    target_modules=attention_projections,  # modules that receive adapters
)

# Wrap the model with LoRA adapters and report the trainable parameters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# After training, save to the 'lora_model' directory.
model.save_pretrained('lora_model')
|
|
|
|
# Reload: load the base checkpoint, then attach the saved LoRA adapter.
from peft import PeftModel

# `model_name` is the checkpoint name defined in the text-generation section;
# NOTE(review): confirm it is the same base model the adapter was trained on.
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, 'lora_model')
|
|
```
|
|
|
|
## Pipeline 快速推理
|
|
|
|
```python
|
|
from transformers import pipeline
|
|
|
|
# Text classification
classifier = pipeline(task='text-classification', model='bert-base-chinese')
result = classifier('这是一个测试句子')

# Named entity recognition
ner = pipeline(
    task='ner',
    model='bert-base-chinese-ner',
    aggregation_strategy='simple',
)
entities = ner('张三在北京工作')

# Text generation
generator = pipeline(task='text-generation', model='gpt2')
text = generator('Once upon a time', max_length=50)

# Question answering
qa = pipeline(
    task='question-answering',
    model='bert-large-uncased-whole-word-masking-finetuned-squad',
)
answer = qa(question='What is AI?', context='AI is artificial intelligence...')

# Zero-shot classification (no model given, so the pipeline's default is used)
classifier = pipeline(task='zero-shot-classification')
candidate_labels = ['体育', '科技', '娱乐']
result = classifier('这是一篇体育新闻', candidate_labels=candidate_labels)
|
|
```
|
|
|
|
## 量化推理
|
|
|
|
```python
|
|
import torch  # required for torch.float16 below
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit quantization settings: NF4 quantization type, float16 compute dtype,
# double quantization enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# `model_name` is the checkpoint name defined earlier in this guide.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map='auto',
)
|
|
```
|