Wagtail推荐引擎:个性化内容推送
引言:内容过载时代的精准推送挑战
在信息爆炸的时代,用户每天面对海量内容,如何让用户快速找到他们真正感兴趣的信息成为内容管理系统的核心挑战。Wagtail作为基于Django的现代化CMS(内容管理系统),虽然原生提供了强大的搜索功能,但要实现真正的个性化推荐,需要开发者进行深度定制。
本文将深入探讨如何在Wagtail中构建个性化推荐引擎,从基础概念到高级实现,为您提供完整的解决方案。
Wagtail搜索系统架构解析
核心搜索组件
Wagtail的搜索系统建立在灵活的架构之上,支持多种后端引擎:
# Example Wagtail search backend configuration
WAGTAILSEARCH_BACKENDS = {
    'default': {
        'BACKEND': 'wagtail.search.backends.elasticsearch8',
        # Plain node address; the previous value had a proxy host spliced
        # into the URL and did not point at the local Elasticsearch node.
        'URLS': ['http://localhost:9200'],
        'INDEX': 'wagtail',
        'TIMEOUT': 5,
        'OPTIONS': {},
        'INDEX_SETTINGS': {},
        'AUTO_UPDATE': True,
    },
}
索引机制深度解析
Wagtail使用`index.Indexed`类来定义可搜索的模型字段:
from wagtail.search import index
class ArticlePage(Page):
    """Article page that exposes its body, tags, date and author to search."""

    body = RichTextField()
    tags = ClusterTaggableManager(through=ArticleTag, blank=True)
    publish_date = models.DateTimeField()
    author = models.ForeignKey(Author, on_delete=models.SET_NULL, null=True)

    # Start from the base Page index definition and add this model's fields:
    # full-text search on the body and on tag names, filtering on the
    # publish date and the author.
    search_fields = [
        *Page.search_fields,
        index.SearchField('body'),
        index.FilterField('publish_date'),
        index.FilterField('author'),
        index.RelatedFields('tags', [index.SearchField('name')]),
    ]
个性化推荐引擎设计
用户行为数据收集
构建推荐系统的第一步是收集用户行为数据:
class UserBehavior(models.Model):
    """One tracked interaction between a user and a page.

    Rows are written once (``timestamp`` is set on creation) and are indexed
    for "latest behaviour of a user" and "latest behaviour on a page" lookups.
    """

    user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
    page = models.ForeignKey('wagtailcore.Page', on_delete=models.CASCADE)
    behavior_type = models.CharField(max_length=20, choices=[
        ('view', '浏览'),
        ('click', '点击'),
        ('share', '分享'),
        ('bookmark', '收藏'),
        # Needed by the metrics view, which counts impressions as the
        # denominator of the click-through rate.
        ('impression', '曝光'),
    ])
    # Where the interaction originated (e.g. 'recommendation'). Empty when
    # unknown, so existing rows and existing writers keep working.
    source = models.CharField(max_length=50, blank=True, default='')
    timestamp = models.DateTimeField(auto_now_add=True)
    # Viewing time in seconds; NULL when it was not measured.
    duration = models.IntegerField(null=True, blank=True)

    class Meta:
        indexes = [
            models.Index(fields=['user', '-timestamp']),
            models.Index(fields=['page', '-timestamp']),
        ]
推荐算法实现
基于内容的推荐
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
class ContentBasedRecommender:
    """Recommend pages whose text is similar, using TF-IDF and cosine similarity.

    Call :meth:`build_content_matrix` first; until then
    :meth:`recommend_similar` returns an empty list.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words='english',
            ngram_range=(1, 2),
        )
        # Row i of content_matrix is the TF-IDF vector of page_ids[i].
        self.content_matrix = None
        self.page_ids = []

    def build_content_matrix(self, pages):
        """Fit the vectorizer on ``pages`` and store the feature matrix.

        Args:
            pages: iterable of page objects exposing ``id`` and, optionally,
                ``title``, ``body`` and ``tags``.

        Returns:
            The fitted feature matrix, or ``None`` when ``pages`` is empty.
        """
        contents = []
        page_ids = []
        for page in pages:
            contents.append(self.extract_page_content(page))
            page_ids.append(page.id)

        if not contents:
            # Fitting on an empty corpus raises; reset to the "not built" state.
            self.page_ids = []
            self.content_matrix = None
            return None

        # Assign page_ids only after a successful fit so the two attributes
        # can never describe different corpora.
        self.content_matrix = self.vectorizer.fit_transform(contents)
        self.page_ids = page_ids
        return self.content_matrix

    def extract_page_content(self, page):
        """Return the page's title, body and tag names joined by spaces."""
        content_parts = []

        title = getattr(page, 'title', None)
        if title:
            content_parts.append(str(title))

        if hasattr(page, 'body'):
            # str() gives the rendered form of a StreamField / RichTextField value.
            content_parts.append(str(page.body))

        tags = getattr(page, 'tags', None)
        if tags is not None:
            # Evaluate the tag queryset once instead of once for the
            # truthiness check and once for the iteration.
            content_parts.extend(tag.name for tag in tags.all())

        return ' '.join(content_parts)

    def recommend_similar(self, target_page_id, n_recommendations=5):
        """Return up to ``n_recommendations`` pages most similar to the target.

        Args:
            target_page_id: id of a page included in the last matrix build.
            n_recommendations: maximum number of results.

        Returns:
            List of ``(page_id, similarity)`` tuples, best first. Only pages
            with similarity above 0.1 are returned, and the target itself is
            never included. Empty when the target is unknown or no matrix
            has been built.
        """
        if self.content_matrix is None or n_recommendations <= 0:
            return []
        try:
            target_idx = self.page_ids.index(target_page_id)
        except ValueError:
            return []

        target_vector = self.content_matrix[target_idx]
        similarities = cosine_similarity(target_vector, self.content_matrix).flatten()

        # Exclude the target by index rather than by assuming it sorts first:
        # a page with identical content ties at 1.0 and could take that slot.
        ranked = [
            i for i in np.argsort(-similarities, kind='stable')
            if i != target_idx
        ]
        return [
            (self.page_ids[i], float(similarities[i]))
            for i in ranked[:n_recommendations]
            if similarities[i] > 0.1  # similarity threshold
        ]
协同过滤推荐
from collections import defaultdict
import math
class CollaborativeFilteringRecommender:
    """Item-based collaborative filtering over accumulated behaviour weights."""

    def __init__(self):
        # user_id -> page_id -> accumulated behaviour weight
        self.user_item_matrix = defaultdict(lambda: defaultdict(int))
        # page_id -> page_id -> similarity, filled by compute_item_similarity()
        self.item_similarity = {}

    def add_user_behavior(self, user_id, page_id, weight=1):
        """Add ``weight`` to the user's accumulated interest in ``page_id``."""
        self.user_item_matrix[user_id][page_id] += weight

    def compute_item_similarity(self):
        """Rebuild ``item_similarity`` as the cosine similarity between items.

        Each item is treated as a vector of per-user weights. The dot product
        is divided by the product of the vector norms, so the result stays in
        ``[0, 1]`` for non-negative weights. When every weight is 1 this is
        the classic co-occurrence formula ``|A∩B| / sqrt(|A|·|B|)``.
        """
        dot_products = defaultdict(lambda: defaultdict(float))
        squared_norms = defaultdict(float)

        for user_items in self.user_item_matrix.values():
            weighted_items = list(user_items.items())
            for idx, (item1, weight1) in enumerate(weighted_items):
                squared_norms[item1] += weight1 * weight1
                # Visit each unordered pair once and record it both ways.
                for item2, weight2 in weighted_items[idx + 1:]:
                    product = weight1 * weight2
                    dot_products[item1][item2] += product
                    dot_products[item2][item1] += product

        similarity = {}
        for item1, related in dot_products.items():
            row = {}
            for item2, dot in related.items():
                norm = math.sqrt(squared_norms[item1] * squared_norms[item2])
                # A zero norm only occurs when all weights of an item are 0.
                row[item2] = dot / norm if norm else 0.0
            similarity[item1] = row
        self.item_similarity = similarity

    def recommend_for_user(self, user_id, n_recommendations=5):
        """Return the best-scoring items the user has not interacted with.

        Args:
            user_id: key previously passed to :meth:`add_user_behavior`.
            n_recommendations: maximum number of results.

        Returns:
            List of ``(page_id, score)`` tuples sorted by descending score;
            empty for unknown users or a non-positive ``n_recommendations``.
        """
        # Membership test first: indexing the defaultdict would create an
        # empty entry for an unknown user.
        if n_recommendations <= 0 or user_id not in self.user_item_matrix:
            return []

        user_items = self.user_item_matrix[user_id]
        scores = defaultdict(float)
        for seen_item, weight in user_items.items():
            for candidate, similarity in self.item_similarity.get(seen_item, {}).items():
                if candidate not in user_items:  # skip what the user already saw
                    scores[candidate] += weight * similarity

        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        return ranked[:n_recommendations]
实时推荐系统集成
Django信号机制实现实时更新
from django.db.models.signals import post_save, post_delete
from django.dispatch import receiver
from wagtail.signals import page_published, page_unpublished
@receiver(page_published)
def handle_page_published(sender, instance, **kwargs):
    """Queue a recommendation-index refresh when a page is published.

    The task is dispatched through ``transaction.on_commit`` so that a worker
    cannot pick it up before the publishing transaction is committed and then
    fail to find the page. Outside a transaction the callback runs at once.
    """
    from django.db import transaction

    from .tasks import update_recommendation_index

    # Capture the id now; the lambda runs after this function has returned.
    page_id = instance.id
    transaction.on_commit(lambda: update_recommendation_index.delay(page_id))
@receiver(post_save, sender=UserBehavior)
def handle_user_behavior(sender, instance, created, **kwargs):
    """Queue a recommendation refresh for the user after a new behaviour row.

    Updates to existing rows are ignored. The task is dispatched through
    ``transaction.on_commit`` so the worker only runs once the new row is
    actually visible in the database.
    """
    if not created:
        return

    from django.db import transaction

    from .tasks import update_user_recommendations

    # Capture the id now; the lambda runs after this function has returned.
    user_id = instance.user_id
    transaction.on_commit(lambda: update_user_recommendations.delay(user_id))
# Example Celery task
@app.task
def update_recommendation_index(page_id):
    """Rebuild the content matrix for all live pages of the given page's type.

    Args:
        page_id: primary key of the page that triggered the refresh.

    A page that no longer exists, or whose model class can no longer be
    resolved, is logged and skipped instead of being silently ignored.
    """
    import logging

    logger = logging.getLogger(__name__)

    page = Page.objects.filter(id=page_id).first()
    if page is None:
        logger.warning('Recommendation index: page %s does not exist', page_id)
        return

    page_model = page.specific_class
    if page_model is None:
        logger.warning('Recommendation index: no model class for page %s', page_id)
        return

    # .specific() yields instances of the concrete page model; plain Page
    # rows carry no `body`/`tags`, so only titles would have been indexed.
    related_pages = Page.objects.live().type(page_model).specific()

    content_recommender = ContentBasedRecommender()
    content_recommender.build_content_matrix(related_pages)
    # NOTE(review): the fitted recommender is local to this call and is not
    # stored anywhere here - confirm where the matrix is meant to be kept.
REST API接口设计
from rest_framework import viewsets, status
from rest_framework.decorators import action
from rest_framework.response import Response
class RecommendationViewSet(viewsets.ViewSet):
    """API endpoints returning personalised and similar-page recommendations."""

    DEFAULT_LIMIT = 5
    MAX_LIMIT = 50

    def _parse_limit(self, request):
        """Read ``?limit=``; fall back to the default and clamp to 1..MAX_LIMIT."""
        try:
            limit = int(request.GET.get('limit', self.DEFAULT_LIMIT))
        except (TypeError, ValueError):
            return self.DEFAULT_LIMIT
        return max(1, min(limit, self.MAX_LIMIT))

    def _build_user_recommender(self):
        """Return a collaborative recommender loaded with recorded behaviour.

        A freshly constructed recommender holds no data and recommends
        nothing, so it is populated here. This reads the whole behaviour
        table; override to return a cached or precomputed model when that
        table is large.
        """
        recommender = CollaborativeFilteringRecommender()
        rows = (
            UserBehavior.objects
            .exclude(behavior_type='impression')  # being shown is not interest
            .values_list('user_id', 'page_id')
            .iterator()
        )
        for user_id, page_id in rows:
            recommender.add_user_behavior(user_id, page_id)
        recommender.compute_item_similarity()
        return recommender

    def _build_content_recommender(self, target_page):
        """Return a content recommender fitted on live pages of the target's type.

        Override to return a cached instance; fitting per request is costly.
        """
        candidates = Page.objects.live()
        page_model = target_page.specific_class
        if page_model is not None:
            candidates = candidates.type(page_model)

        recommender = ContentBasedRecommender()
        try:
            recommender.build_content_matrix(candidates.specific())
        except ValueError:
            # Raised when the corpus yields no usable terms; the recommender
            # then simply has nothing to recommend.
            pass
        return recommender

    def _serialize(self, scored_ids, score_key):
        """Turn ``(page_id, score)`` pairs into response dicts, keeping order.

        Pages are loaded in one query. Ids that no longer resolve to a live
        page are dropped.
        """
        scored_ids = list(scored_ids)
        pages_by_id = {
            page.id: page
            for page in Page.objects.live()
            .filter(id__in=[page_id for page_id, _ in scored_ids])
            .specific()
        }
        payload = []
        for page_id, score in scored_ids:
            page = pages_by_id.get(page_id)
            if page is None:
                continue
            payload.append({
                'id': page.id,
                'title': page.title,
                'url': page.url,
                score_key: round(float(score), 3),
                'type': page.__class__.__name__,
            })
        return payload

    @action(detail=False, methods=['get'])
    def for_user(self, request):
        """Return personalised recommendations for the authenticated user.

        Anonymous users get an empty list. ``?limit=`` caps the result count.
        """
        user = request.user
        if not user.is_authenticated:
            return Response([], status=status.HTTP_200_OK)

        n_recommendations = self._parse_limit(request)
        recommender = self._build_user_recommender()
        recommendations = recommender.recommend_for_user(user.id, n_recommendations)
        return Response(self._serialize(recommendations, 'score'))

    @action(detail=True, methods=['get'])
    def similar_to_page(self, request, pk=None):
        """Return pages similar to page ``pk``; 404 when the page is unknown."""
        not_found = Response(
            {'error': 'Page not found'},
            status=status.HTTP_404_NOT_FOUND
        )
        try:
            page_id = int(pk)
        except (TypeError, ValueError):
            # A non-numeric pk cannot match any page.
            return not_found

        target_page = Page.objects.filter(id=page_id).first()
        if target_page is None:
            return not_found

        n_recommendations = self._parse_limit(request)
        content_recommender = self._build_content_recommender(target_page)
        similar_pages = content_recommender.recommend_similar(
            target_page.id, n_recommendations
        )
        return Response(self._serialize(similar_pages, 'similarity'))
性能优化与缓存策略
Redis缓存实现
import redis
import json
from django.core.cache import caches
class CachedRecommender:
    """Serve collaborative-filtering recommendations through a Redis cache."""

    def __init__(self, host='localhost', port=6379, db=0,
                 cache_timeout=3600, redis_client=None):
        """Create the recommender.

        Args:
            host, port, db: Redis connection settings, used only when no
                ``redis_client`` is supplied.
            cache_timeout: lifetime of a cached entry in seconds (default 1 hour).
            redis_client: an existing client exposing ``get``, ``setex`` and
                ``delete``; lets callers share a connection pool.
        """
        if redis_client is None:
            redis_client = redis.Redis(host=host, port=port, db=db)
        self.redis_client = redis_client
        self.cache_timeout = cache_timeout

    @staticmethod
    def _cache_key(user_id):
        """Return the Redis key holding this user's recommendations."""
        return f'user_recommendations:{user_id}'

    def get_user_recommendations(self, user_id, force_refresh=False):
        """Return the user's recommendations, from cache when possible.

        Args:
            user_id: user to recommend for.
            force_refresh: skip the cache lookup and recompute.

        Returns:
            List of ``(page_id, score)`` tuples. The shape is the same on a
            cache hit and on a miss; JSON stores tuples as lists, so cached
            entries are converted back.
        """
        cache_key = self._cache_key(user_id)

        if not force_refresh:
            cached = self.redis_client.get(cache_key)
            if cached is not None:
                return [tuple(entry) for entry in json.loads(cached)]

        recommender = CollaborativeFilteringRecommender()
        recommendations = [
            tuple(entry) for entry in recommender.recommend_for_user(user_id, 10)
        ]

        self.redis_client.setex(
            cache_key,
            self.cache_timeout,
            json.dumps(recommendations)
        )
        return recommendations

    def invalidate_user_cache(self, user_id):
        """Drop the cached recommendations of ``user_id``."""
        self.redis_client.delete(self._cache_key(user_id))
数据库查询优化
# Fetch the data the recommender needs with as few queries as possible
def get_recommendation_data(user_id, behavior_weights=None):
    """Return a user's recent behaviour and the most interacted-with pages.

    Args:
        user_id: user whose behaviour is loaded.
        behavior_weights: optional mapping of behaviour type to weight. The
            defaults are illustrative; types missing from the mapping get 1.0.

    Returns:
        Dict with two lazy querysets:
        ``user_behaviors`` - the user's latest 1000 behaviour rows, each
        annotated with ``weighted_score`` (duration x behaviour weight, a
        missing duration counting as 0);
        ``popular_pages`` - the 20 live pages with the most behaviour rows,
        annotated with ``view_count``, for cold-start recommendations.
    """
    from django.db.models import (
        Case, Count, ExpressionWrapper, F, FloatField, Value, When,
    )
    from django.db.models.functions import Coalesce

    if behavior_weights is None:
        behavior_weights = {
            'view': 1.0,
            'click': 2.0,
            'share': 3.0,
            'bookmark': 4.0,
        }

    # The behaviour model has no weight column, so the weight is derived
    # from the behaviour type.
    weight = Case(
        *(
            When(behavior_type=name, then=Value(float(value)))
            for name, value in behavior_weights.items()
        ),
        default=Value(1.0),
        output_field=FloatField(),
    )
    weighted_score = ExpressionWrapper(
        # Coalesce keeps a NULL duration from turning the whole score NULL.
        Coalesce(F('duration'), Value(0)) * weight,
        output_field=FloatField(),
    )

    # Base Page rows have no `tags` relation, so it cannot be prefetched
    # here; resolve tags on the specific page instances where needed.
    user_behaviors = (
        UserBehavior.objects
        .filter(user_id=user_id)
        .select_related('page')
        .annotate(weighted_score=weighted_score)
        .order_by('-timestamp')[:1000]
    )

    popular_pages = (
        Page.objects.live()
        .annotate(view_count=Count('userbehavior'))
        .select_related('content_type')
        .order_by('-view_count')[:20]
    )

    return {
        'user_behaviors': user_behaviors,
        'popular_pages': popular_pages
    }
评估与监控体系
推荐质量评估
class RecommendationEvaluator:
    """Compute precision, recall and F1 for a list of recommendations."""

    DEFAULT_PERIOD_DAYS = 7

    @staticmethod
    def _period_to_days(time_period, default=DEFAULT_PERIOD_DAYS):
        """Convert a period such as ``'30d'`` to a day count.

        Anything that is not a positive integer followed by ``d`` yields
        ``default``, which keeps callers that passed arbitrary values working.
        """
        if isinstance(time_period, str) and time_period.endswith('d'):
            try:
                days = int(time_period[:-1])
            except ValueError:
                return default
            if days > 0:
                return days
        return default

    def calculate_precision(self, recommendations, actual_interactions):
        """Return the share of recommendations the user interacted with.

        Args:
            recommendations: sequence of dicts with an ``'id'`` key.
            actual_interactions: container of interacted page ids.
        """
        if not recommendations:
            return 0.0
        relevant_count = sum(
            1 for rec in recommendations if rec['id'] in actual_interactions
        )
        return relevant_count / len(recommendations)

    def calculate_recall(self, recommendations, actual_interactions):
        """Return the share of interacted pages that were recommended.

        Each page id is counted once, so duplicated recommendations cannot
        push the result above 1.0.
        """
        if not actual_interactions:
            return 0.0
        recommended_ids = {rec['id'] for rec in recommendations}
        relevant_count = sum(
            1 for page_id in recommended_ids if page_id in actual_interactions
        )
        return relevant_count / len(actual_interactions)

    def calculate_f1_score(self, precision, recall):
        """Return the harmonic mean of precision and recall (0.0 if both are 0)."""
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

    def evaluate_recommendations(self, user_id, recommendations, time_period='7d'):
        """Score ``recommendations`` against the user's recent interactions.

        Args:
            user_id: user whose click / view / bookmark rows are the ground truth.
            recommendations: sequence of dicts with an ``'id'`` key.
            time_period: look-back window such as ``'7d'``; see
                :meth:`_period_to_days` for the accepted format.

        Returns:
            Dict with ``precision``, ``recall``, ``f1_score``,
            ``recommendation_count`` and ``interaction_count``.
        """
        from datetime import timedelta

        from django.utils import timezone

        # timezone.now() matches the project's USE_TZ setting, unlike a
        # naive datetime.now().
        start_date = timezone.now() - timedelta(
            days=self._period_to_days(time_period)
        )
        actual_interactions = set(
            UserBehavior.objects.filter(
                user_id=user_id,
                timestamp__gte=start_date,
                behavior_type__in=['click', 'view', 'bookmark']
            ).values_list('page_id', flat=True)
        )

        precision = self.calculate_precision(recommendations, actual_interactions)
        recall = self.calculate_recall(recommendations, actual_interactions)
        f1_score = self.calculate_f1_score(precision, recall)

        return {
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'recommendation_count': len(recommendations),
            'interaction_count': len(actual_interactions)
        }
监控仪表板
from django.http import JsonResponse
from django.views import View
from collections import defaultdict
from datetime import datetime, timedelta
class RecommendationMetricsView(View):
    """JSON endpoint exposing recommendation click-through and engagement metrics."""

    @staticmethod
    def _parse_period_days(time_period):
        """Return the day count of a period such as ``'7d'``, or ``None`` if invalid."""
        if not time_period.endswith('d'):
            return None
        try:
            days = int(time_period[:-1])
        except ValueError:
            return None
        return days if days > 0 else None

    def get(self, request):
        """Return metrics for the window given by ``?period=Nd`` (default ``7d``).

        Responds with HTTP 400 when ``period`` is malformed or too large to
        represent.
        """
        from django.utils import timezone

        invalid_period = JsonResponse(
            {'error': "Invalid 'period'; expected a value such as '7d'"},
            status=400,
        )
        days = self._parse_period_days(request.GET.get('period', '7d'))
        if days is None:
            return invalid_period

        # timezone.now() matches the project's USE_TZ setting, unlike a
        # naive datetime.now().
        end_date = timezone.now()
        try:
            start_date = end_date - timedelta(days=days)
        except OverflowError:
            return invalid_period

        metrics = self.calculate_metrics(start_date, end_date)
        return JsonResponse(metrics)

    def calculate_metrics(self, start_date, end_date):
        """Compute click-through rate and per-user engagement for the window.

        Args:
            start_date, end_date: inclusive bounds applied to ``timestamp``.

        Returns:
            Dict of JSON-serialisable metrics; averages are 0 when there is
            no data.
        """
        from django.db.models import Avg, Count

        # NOTE(review): the two filters below rely on UserBehavior having a
        # `source` field and an 'impression' behaviour type - confirm the
        # model defines both.
        recommendation_clicks = UserBehavior.objects.filter(
            timestamp__range=(start_date, end_date),
            behavior_type='click',
            source='recommendation'
        ).count()
        total_recommendations = UserBehavior.objects.filter(
            timestamp__range=(start_date, end_date),
            behavior_type='impression',
            source='recommendation'
        ).count()
        ctr = (recommendation_clicks / total_recommendations * 100) if total_recommendations else 0

        # One row per user, then averaged across users in a single query.
        user_engagement = UserBehavior.objects.filter(
            timestamp__range=(start_date, end_date)
        ).values('user').annotate(
            total_actions=Count('id'),
            avg_duration=Avg('duration')
        )
        engagement = user_engagement.aggregate(
            actions=Avg('total_actions'),
            duration=Avg('avg_duration'),
        )

        return {
            'click_through_rate': round(ctr, 2),
            'total_recommendations': total_recommendations,
            'recommendation_clicks': recommendation_clicks,
            'avg_user_actions': engagement['actions'] or 0,
            'avg_engagement_duration': engagement['duration'] or 0,
            'time_period': {
                'start': start_date.isoformat(),
                'end': end_date.isoformat()
            }
        }
部署与扩展策略
Docker容器化部署
# Dockerfile for Recommendation Service
FROM python:3.11-slim
WORKDIR /app
# Install system build dependencies; skip recommended packages and drop the
# apt lists in the same layer to keep the image small
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
libpq-dev \
&& rm -rf /var/lib/apt/lists/*
# Copy the dependency list first so this layer is reused while it is unchanged
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image
RUN pip install --no-cache-dir -r requirements.txt
# Copy the application code
COPY . .
# Create a non-root user and run as it
RUN useradd --create-home --shell /bin/bash appuser
USER appuser
# Start the service
CMD ["gunicorn", "recommendation_service.wsgi:application", "--bind", "0.0.0.0:8000"]
Kubernetes部署配置
# recommendation-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: recommendation-service
spec:
replicas: 3
selector:
matchLabels:
app: recommendation-service
template:
metadata:
创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考