Key point 1: Converting the input into the model-specific chat template noticeably improves the quality of the model's responses.
Key point 2: In multi-turn dialogue the model remembers earlier turns and shifts its output token probabilities accordingly; we can observe the probabilities it assigns to different tokens in each round.
Key point 3: How the model splits a sentence into tokens.
Key point 4: How the sampling strategy (temperature / top-k / top-p) affects output diversity: the stricter the sampling, the less diverse the output.
Key point 5: Similar tokens end up closer together in embedding space (visualized after reducing the embeddings to 2D).
Key point 6: While generating a sentence, how to inspect the attention between tokens at a given layer (attention map).
0. Prerequisites: load the libraries and the model
# Make sure we are running on a GPU
!nvidia-smi
# Install transformers
!pip install transformers==4.47.0
######################## TODO (Pre-requisites) ########################
# Replace the placeholder string below with your Hugging Face token
from huggingface_hub import login
login("YOUR HUGGINGFACE TOKEN")
#######################################################################
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = "google/gemma-2-2b-it"
dtype = torch.float16
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="eager",
    device_map="cuda",
    torch_dtype=dtype,
)
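Optional sanity check (a minimal sketch, not part of the original notebook): printing a few config fields confirms the model loaded and shows how many layers and attention heads it has, which is useful later when choosing `layer_idx` and `head_idx` for the attention map.
# Optional: inspect the loaded model (illustrative only)
print("Device:", model.device)
print("Hidden layers:", model.config.num_hidden_layers)
print("Attention heads:", model.config.num_attention_heads)
print("Vocab size:", model.config.vocab_size)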
1. Comparing answers with vs. without the chat template
1) The model used to score the quality of the generated answers: https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
SCORING_MODEL = AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-MiniLM-L-6-v2')
SCORING_TOKENIZER = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-MiniLM-L-6-v2')
def calculate_coherence(question, answer, scoring_model=SCORING_MODEL, tokenizer=SCORING_TOKENIZER):
    features = tokenizer([question], [answer], padding=True, truncation=True, return_tensors="pt")
    scoring_model.eval()
    with torch.no_grad():
        scores = scoring_model(**features).logits.squeeze().item()
    return scores
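A quick illustrative call (the question/answer pair here is made up, so the score is only for demonstration):
# Illustrative usage of the coherence scorer (hypothetical inputs)
example_q = "What is supervised learning?"
example_a = "Supervised learning trains a model on labeled input-output pairs."
print(f"Coherence: {calculate_coherence(example_q, example_a):.4f}")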
2) Getting the model's response
def generate_text_from_prompt(prompt, tokenizer, model):
    """
    generate the output from the prompt.
    param:
        prompt (str): the prompt inputted to the model
        tokenizer : the tokenizer that is used to encode / decode the input / output
        model : the model that is used to generate the output
    return:
        the response of the model
    """
    print("========== Prompt inputted to the model ==========\n", prompt)
    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
    ######################## TODO (Q1.1 ~ 1.4) ########################
    ### You can refer to https://huggingface.co/google/gemma-2-2b-it for basic usage
    ### Make sure to use 'do_sample=False' to get a deterministic response
    ### Otherwise the coherence score may be different from the sample answer
    # Generate response
    output_ids = model.generate(
        input_ids,
        do_sample=False,
        max_new_tokens=128,
        pad_token_id=tokenizer.eos_token_id
    )
    ###################################################################
    if output_ids is not None and len(output_ids) > 0:
        return tokenizer.decode(output_ids[0], skip_special_tokens=True)
    else:
        return "Empty Response"
3) Answer with the chat template
# With chat template
question = "Please tell me about the key differences between supervised learning and unsupervised learning. Answer in 200 words."
# Build the messages in chat format
chat = [
    {"role": "user", "content": question},
]
prompt_with_template = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
# Get the answer using the chat-formatted prompt
response_with_template = generate_text_from_prompt(prompt_with_template, tokenizer, model)
# extract the real output from the model
response_with_template = response_with_template.split('model\n')[-1].strip('\n').strip()
print("========== Output ==========\n", response_with_template)
score = calculate_coherence(question, response_with_template)
print(f"========== Coherence Score : {score:.4f} ==========")
Result: coherence score 6.0589.
4) Answer without the chat template
# Without chat template (directly using plain text)
response_without_template = generate_text_from_prompt(question, tokenizer, model)
# extract the real output from the model
response_without_template = response_without_template.split(question.split(' ')[-1])[-1].strip('\n').strip()
print("========== Output ==========\n", response_without_template)
score = calculate_coherence(question, response_without_template)
print(f"========== Coherence Score : {score:.4f} ==========")
Result: the coherence score is only 4.0062, lower than the answer generated with the template.
2. Multi-turn dialogue: outputs across different rounds
import matplotlib.pyplot as plt
import seaborn as sns
chat_history = []
round = 0
print("Chatbot: Hello! How can I assist you today? (Type 'exit' to quit)")
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    round += 1
    chat_history.append({"role": "user", "content": user_input})
    chat_template_format_prompt = tokenizer.apply_chat_template(chat_history, tokenize=False, add_generation_prompt=True)
    ######################## (Q2.1 ~ 2.3) ########################
    # Observe the prompt with chat template format that was inputted to the model in the current round to answer Q2.1 ~ Q2.3.
    print(f"=== Prompt with chat template format inputted to the model on round {round} ===\n{chat_template_format_prompt}")
    print(f"===============================================")
    ###################################################################
    inputs = tokenizer(chat_template_format_prompt, return_tensors="pt").to("cuda")
    # Get logits instead of directly generating
    with torch.no_grad():
        outputs_p = model(**inputs)
    logits = outputs_p.logits  # Logits of the model (raw scores before softmax)
    last_token_logits = logits[:, -1, :]  # Take the logits at the last position (scores for the next token)
    # Apply softmax to get probabilities
    probs = torch.nn.functional.softmax(last_token_logits, dim=-1)
    # Get top-k tokens (e.g., 10)
    top_k = 10
    top_probs, top_indices = torch.topk(probs, top_k)
    # Convert to numpy for plotting
    top_probs = top_probs.cpu().squeeze().numpy()
    top_indices = top_indices.cpu().squeeze().numpy()
    top_tokens = [tokenizer.decode([idx]) for idx in top_indices]
    # Plot probability distribution
    plt.figure(figsize=(10, 5))
    sns.barplot(x=top_probs, y=top_tokens, palette="coolwarm")
    plt.xlabel("Probability")
    plt.ylabel("Token")
    plt.title("Top Token Probabilities for Next Word")
    plt.show()
    # Generate response
    outputs = model.generate(**inputs, max_new_tokens=200, pad_token_id=tokenizer.eos_token_id, do_sample=False)
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    print(f"Chatbot: {response}")
    chat_history.append({"role": "assistant", "content": response})
Round 1:
* Question: name one of the seven colors of the rainbow; answer with only one color.
* The token with the highest output probability is "red".
Round 2:
* Question: name one of the seven colors of the rainbow, excluding the color already mentioned; answer with only one color.
* The token with the highest output probability is "orange".
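To make this shift concrete, one can look up the probabilities of specific candidate tokens in the `probs` tensor computed inside the loop above. A minimal sketch (the candidate words and the leading space are assumptions that may need adjusting for Gemma's tokenizer):
# Sketch: run this right after `probs` is computed in a given round
for word in ["red", "orange", "yellow"]:
    ids = tokenizer.encode(" " + word, add_special_tokens=False)  # leading space is an assumption
    print(f"P(next token starts '{word}'): {probs[0, ids[0]].item():.4f}")  # probability of the word's first sub-token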
3. How a sentence is split into tokens
sentence = "I love taking a Machine Learning course by Professor Hung-yi Lee, What about you?" #@param {type:"string"}
######################## TODO (Q3.1 ~ 3.4) ########################
### You can refer to https://huggingface.co/learn/nlp-course/en/chapter2/4?fw=pt for basic tokenizer usage
### and https://huggingface.co/docs/transformers/en/main_classes/tokenizer for full tokenizer usage
# Encode the sentence into token IDs without adding special tokens
token_ids = tokenizer.encode(sentence, add_special_tokens=False)
# Convert the token IDs back to their corresponding tokens (words or subwords)
tokens = tokenizer.convert_ids_to_tokens(token_ids)
###################################################################
# Iterate through the tokens and their corresponding token IDs
for t, t_id in zip(tokens, token_ids):
    # Print the token and its index (ID)
    print(f"Token: {t}, token index: {t_id}")
Sentence: I love taking a Machine Learning course by Professor Hung-yi Lee, What about you?
See the screenshot for the full output; the first token is "I", which maps to token index 235285.
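A round-trip check (illustrative; 235285 is simply the ID observed above for this tokenizer):
# Round-trip sketch: a single ID back to its token, and all IDs back to the sentence
print(tokenizer.convert_ids_to_tokens([235285]))  # expected ['I'] per the observation above
print(tokenizer.decode(token_ids))                # should reconstruct the original sentence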
4. Output diversity under different sampling strategies
Generate results under different sampling parameters and compare their diversity:
* top_k: sample only from the K highest-probability tokens.
* top_p (nucleus sampling): sample only from the smallest set of tokens whose cumulative probability reaches p (see the toy sketch after this list).
* Diversity metric: self-BLEU; a lower score means more diverse outputs.
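To see the difference on a toy distribution (a standalone sketch with made-up probabilities, independent of the model above):
# Toy next-token distribution (made-up numbers) over six candidate tokens
import torch
toy_probs = torch.tensor([0.40, 0.25, 0.15, 0.10, 0.06, 0.04])
# Top-k keeps the k highest-probability candidates
k = 3
topk_probs, topk_idx = torch.topk(toy_probs, k)
print("top-k keeps candidate indices:", topk_idx.tolist())
# Top-p keeps the smallest high-probability prefix whose cumulative probability reaches p
p = 0.8
sorted_probs, sorted_idx = torch.sort(toy_probs, descending=True)
cum = torch.cumsum(sorted_probs, dim=0)
keep = int((cum < p).sum().item()) + 1
print("top-p keeps candidate indices:", sorted_idx[:keep].tolist())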
from tqdm import trange
from transformers import HybridCache
max_generation_tokens = 30
######################## TODO (Q4.3 ~ 4.6) ########################
# Modify the value of k and p accordingly
top_k = 100 # Set K for top-k sampling
top_p = 0.999 # Set P for nucleus sampling
###################################################################
# Input prompt
prompt = f"Generate a paraphrase of the sentence 'Professor Hung-yi Lee is one of the best teachers in the domain of machine learning'. Just response with one sentence."
input_ids = tokenizer(prompt, return_tensors="pt")
# Initialize KV Cache
kv_cache = HybridCache(config=model.config, max_batch_size=1, max_cache_len=max_generation_tokens, device="cuda", dtype=torch.float16)
next_token_id = input_ids.input_ids.to("cuda")
attention_mask = input_ids.attention_mask.to("cuda")
cache_position = torch.arange(attention_mask.shape[1], device="cuda")
generated_sentences_top_k = []
generated_sentences_top_p = []
# Define the generation parameters
generation_params = {
    "do_sample": True,  # Enable sampling
    "max_length": max_generation_tokens + len(input_ids.input_ids[0]),  # Total length including prompt
    "pad_token_id": tokenizer.pad_token_id,  # Ensure padding token is set
    "eos_token_id": tokenizer.eos_token_id,  # Ensure EOS token is set
    "bos_token_id": tokenizer.bos_token_id,  # Ensure BOS token is set
    "attention_mask": input_ids.attention_mask.to("cuda"),  # Move attention mask to GPU
    "use_cache": True,  # Enable caching
    "return_dict_in_generate": True,  # Return generation outputs
    "output_scores": False,  # Disable outputting scores
}
for method in ["top-k", "top-p"]:
    for _ in trange(20):
        if method == "top-k":
            # Generate text using the model with top_k
            generated_output = model.generate(
                input_ids=input_ids.input_ids.to("cuda"),
                top_k=top_k,
                **generation_params
            )
        elif method == "top-p":
            # Generate text using the model with top_p
            ######################## TODO (Q4.3 ~ 4.6) ########################
            # Generate output from the model based on the input_ids and specified generation parameters
            # You can refer to this documentation: https://huggingface.co/docs/transformers/en/main_classes/text_generation
            # Hint: You can check how we generate the text with top_k
            generated_output = model.generate(
                input_ids=input_ids.input_ids.to("cuda"),
                top_p=top_p,
                **generation_params
            )
            ###################################################################
        else:
            raise NotImplementedError()
        # Decode the generated tokens
        generated_tokens = generated_output.sequences[0, len(input_ids.input_ids[0]):]
        decoded_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
        # Clean up spacing artifacts in the generated text
        sentence = decoded_text.replace(" ,", ",").replace(" 's", "'s").replace(" .", ".").strip()
        # Append the generated sentence to the appropriate list
        if method == "top-k":
            generated_sentences_top_k.append(sentence)
        else:
            generated_sentences_top_p.append(sentence)
# Print results
print("===== Top-K Sampling Output =====")
print()
for idx, sentence in enumerate(generated_sentences_top_k):
    print(f"{idx}. {sentence}")
print()
print("===== Top-P Sampling Output =====")
print()
for idx, sentence in enumerate(generated_sentences_top_p):
    print(f"{idx}. {sentence}")
print()
# Evaluate diversity with self-BLEU
from nltk.translate.bleu_score import sentence_bleu
def compute_self_bleu(generated_sentences):
    total_bleu_score = 0
    num_sentences = len(generated_sentences)
    for i, hypothesis in enumerate(generated_sentences):
        references = [generated_sentences[j] for j in range(num_sentences) if j != i]
        bleu_scores = [sentence_bleu([ref.split()], hypothesis.split()) for ref in references]
        total_bleu_score += sum(bleu_scores) / len(bleu_scores)
    return total_bleu_score / num_sentences
# Calculate BLEU score
bleu_score = compute_self_bleu(generated_sentences_top_k)
print(f"self-BLEU Score for top_k (k={top_k}): {bleu_score:.4f}")
# Calculate BLEU score
bleu_score = compute_self_bleu(generated_sentences_top_p)
print(f"self-BLEU Score for top_p (p={top_p}): {bleu_score:.4f}")
Result: with p = 0.999 (top-p sampling), the generated outputs show better diversity, i.e., a lower self-BLEU score.
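As a sanity check on the metric itself (toy sentences, illustrative only): identical outputs push self-BLEU toward 1, while outputs with no n-gram overlap push it toward 0 (NLTK may warn about zero overlaps).
# Toy sanity check for self-BLEU (illustrative sentences)
same = ["the cat sat on the mat"] * 3
varied = ["the cat sat on the mat",
          "a dog ran across the yard",
          "machine learning models predict tokens"]
print(f"self-BLEU (identical): {compute_self_bleu(same):.4f}")   # close to 1
print(f"self-BLEU (varied):    {compute_self_bleu(varied):.4f}")  # close to 0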
5. Similar embeddings sit closer together
Plot the 2D positions of the embeddings of different phrases.
After embedding the phrases, those with similar meanings end up closer together.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
######################## (Q5.2 ~ 5.3) ########################
# Sentences with different meanings of words
sentences = [
    "I ate a fresh apple.",                     # Apple (fruit)
    "Apple released the new iPhone.",           # Apple (company)
    "I peeled an orange and ate it.",           # Orange (fruit)
    "The Orange network has great coverage.",   # Orange (telecom)
    "Microsoft announced a new update.",        # Microsoft (company)
    "Banana is my favorite fruit.",             # Banana (fruit)
]
# Tokenize and move to device
inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
inputs = inputs.to(device)
# Get hidden states
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states[-1]  # Extract last layer embeddings
# Compute sentence-level embeddings (mean pooling)
sentence_embeddings = hidden_states.mean(dim=1).cpu().numpy()
# Words to visualize
word_labels = [
    "Apple (fruit)", "Apple (company)",
    "Orange (fruit)", "Orange (telecom)",
    "Microsoft (company)", "Banana (fruit)"
]
# Reduce to 2D using t-SNE
tsne = TSNE(n_components=2, perplexity=2, random_state=42)
embeddings_2d = tsne.fit_transform(sentence_embeddings)
# Plot the embeddings
plt.figure(figsize=(8, 6))
colors = ["red", "blue", "orange", "purple", "green", "brown"]
for i, label in enumerate(word_labels):
    plt.scatter(embeddings_2d[i, 0], embeddings_2d[i, 1], color=colors[i], s=100)
    plt.text(embeddings_2d[i, 0] + 0.1, embeddings_2d[i, 1] + 0.1, label, fontsize=12, color=colors[i])
plt.xlabel("t-SNE Dim 1")
plt.ylabel("t-SNE Dim 2")
plt.title("t-SNE Visualization of Word Embeddings")
plt.show()
##################################################
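t-SNE distances can be distorted by the 2D projection, so a complementary check (a sketch, not part of the original notebook) is to compare the pooled sentence embeddings directly with cosine similarity; higher values should correspond to the pairs that cluster together in the plot.
# Complementary check: pairwise cosine similarity of the pooled embeddings
import torch.nn.functional as F
emb = F.normalize(torch.tensor(sentence_embeddings, dtype=torch.float32), dim=-1)
sim = emb @ emb.T  # (6, 6) cosine-similarity matrix
for i in range(len(word_labels)):
    for j in range(i + 1, len(word_labels)):
        print(f"{word_labels[i]} vs {word_labels[j]}: {sim[i, j]:.3f}")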
6. Inspecting the attention matrix of a single layer
Starting from the single word "Google", let the model generate 20 tokens and inspect attention head index 7 of layer index 10 (as set in the code below).
The lighter the color, the stronger the attention: when the token "company" is generated (look at the "company" row), the brightest column, i.e., the key token receiving the most attention, is "multinational".
# Import necessary libraries
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import trange
from sklearn.decomposition import PCA
# Input prompt for text generation
prompt = "Google "
input_ids = tokenizer(prompt, return_tensors="pt") # Tokenize the input prompt
next_token_id = input_ids.input_ids.to("cuda") # Move input token ids to GPU
attention_mask = input_ids.attention_mask.to("cuda") # Move attention mask to GPU
cache_position = torch.arange(attention_mask.shape[1], device="cuda") # Position for the KV cache
# Set the number of tokens to generate and other parameters
generation_tokens = 20 # Limit for visualization (number of tokens to generate)
total_tokens = generation_tokens + next_token_id.size(1) - 1 # Total tokens to handle
layer_idx = 10 # Specify the layer index for attention visualization
head_idx = 7 # Specify the attention head index to visualize
# KV cache setup for caching key/values across time steps
from transformers.cache_utils import HybridCache
kv_cache = HybridCache(config=model.config, max_batch_size=1, max_cache_len=total_tokens, device="cuda", dtype=torch.float16)
generated_tokens = [] # List to store generated tokens
attentions = None # Placeholder to store attention weights
num_new_tokens = 0 # Counter for the number of new tokens generated
model.eval() # Set the model to evaluation mode
# Generate tokens and collect attention weights for visualization
for num_new_tokens in range(generation_tokens):
    with torch.no_grad():  # Disable gradients during inference for efficiency
        # Pass the input through the model to get the next token prediction and attention weights
        outputs = model(
            next_token_id,
            attention_mask=attention_mask,
            cache_position=cache_position,
            use_cache=True,               # Use the KV cache for efficiency
            past_key_values=kv_cache,     # Provide the cached key-value pairs for fast inference
            output_attentions=True        # Enable the extraction of attention weights
        )
    ######################## TODO (Q6.1 ~ 6.4) ########################
    ### You can refer to https://huggingface.co/docs/transformers/en/main_classes/output#transformers.modeling_outputs.BaseModelOutput.attentions to see the structure of model output attentions
    # Get the logits for the last generated token from outputs
    logits = outputs.logits[:, -1, :]
    # Extract the attention scores from the model's outputs
    attention_scores = outputs.attentions
    ###################################################################
    # Extract attention weights for the specified layer and head
    last_layer_attention = attention_scores[layer_idx][0][head_idx].detach().cpu().numpy()
    # If it's the first generated token, initialize the attentions array
    if num_new_tokens == 0:
        attentions = last_layer_attention
    else:
        # Append the current attention weights to the existing array
        attentions = np.append(attentions, last_layer_attention, axis=0)
    # Choose the next token to generate based on the highest probability (logits)
    next_token_id = logits.argmax(dim=-1)
    generated_tokens.append(next_token_id.item())  # Add the token ID to the generated tokens list
    # Update the attention mask and next token ID for the next iteration
    attention_mask = torch.cat([attention_mask, torch.ones(1, 1, device="cuda")], dim=-1)  # Add a new attention mask for the generated token
    next_token_id = next_token_id.unsqueeze(0)  # Convert the token ID to the required shape
    # Update the KV cache with the new past key-values
    kv_cache = outputs.past_key_values
    cache_position = cache_position[-1:] + 1  # Update the cache position for the next iteration
# Decode the generated tokens into human-readable text
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
full_text = prompt + generated_text # Combine the prompt with the generated text
# Tokenize all the generated text (prompt + generated)
tokens = tokenizer.tokenize(full_text)
# Function to plot a heatmap of attention weights
def plot_attention(attn_matrix, tokens, title="Attention Heatmap"):
    plt.figure(figsize=(10, 8))  # Set the figure size
    sns.heatmap(attn_matrix, xticklabels=tokens, yticklabels=tokens, cmap="viridis", annot=False)  # Plot the attention matrix as a heatmap
    plt.xlabel("Key Tokens")
    plt.ylabel("Query Tokens")
    plt.title(title)
    plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility
    plt.yticks(rotation=0)   # Rotate y-axis labels
    plt.show()
# Plot the attention heatmap for the last generated token
plot_attention(attentions, tokens, title=f"Attention Weights for Generated Token of Layer {layer_idx}")
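To double-check the visual reading (e.g., that the "company" row attends most strongly to "multinational"), the strongest key token per query row can be printed directly. This is a sketch; the row/token alignment is approximate, since the prompt's BOS token can shift it by one position.
# Sketch: strongest-attended key token for each query position (alignment is approximate)
n = min(len(tokens), attentions.shape[0])
for q in range(n):
    k = int(np.argmax(attentions[q, :n]))
    print(f"query '{tokens[q]}' -> strongest attention on '{tokens[k]}' ({attentions[q, k]:.3f})")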