222 lines
8.8 KiB
Python
222 lines
8.8 KiB
Python
|
|
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
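'''
Recommender system using content-based filtering.

Products are turned into TF-IDF vectors; each user's click history is blended
into a profile vector and products are ranked by cosine similarity to it.
'''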
|
|
class Recommender:
    """Content-based product recommender driven by TF-IDF text similarity.

    Each product is vectorised from its category, name and description. A
    user's clicks are blended into a single profile vector, and products the
    user has not clicked yet are ranked by cosine similarity to that profile.
    """

    # Categories every new user profile is seeded with (weight 0). A category
    # outside this tuple is added to the profile the first time it is clicked.
    DEFAULT_CATEGORIES = ('electronics', 'school supplies', 'rental place', 'furniture')

    # How strongly a category's click total boosts the weight of a clicked
    # product when the user profile vector is built.
    CATEGORY_BOOST = 0.5

    def __init__(self):
        """Create an empty recommender; call load_products() before using it."""
        self.products_df = None        # product table, one row per product
        self.user_profiles = {}        # user_id -> {'clicks': {...}, 'category_weights': {...}}
        self.tfidf_matrix = None       # TF-IDF rows aligned with products_df rows
        self.tfidf_vectorizer = None   # fitted TfidfVectorizer
        self.product_indices = None    # Series mapping product_id -> row position

    @staticmethod
    def _clean_text(column):
        """Return the column as strings with missing values replaced by ''."""
        return column.astype(object).where(column.notna(), '').astype(str)

    def _require_products(self):
        """Raise RuntimeError unless load_products() has populated the catalogue."""
        if (
            self.products_df is None
            or self.tfidf_matrix is None
            or self.product_indices is None
        ):
            raise RuntimeError('No products loaded; call load_products() first.')

    def _product_position(self, product_id):
        """Return the row position of product_id, or None when it is unknown."""
        if product_id not in self.product_indices.index:
            return None
        return int(self.product_indices.loc[product_id])

    def load_products(self, products_data):
        """
        Load product data into the recommender system.

        products_data: list of dictionaries with product info; every entry
            needs 'product_id', 'name', 'description' and 'category'.

        When a product_id occurs more than once only its first row is kept, so
        every id maps to exactly one row. Missing text fields are treated as
        empty strings. Instance state is only replaced once vectorisation has
        succeeded.

        Raises KeyError when one of the required columns is absent.
        """
        products = pd.DataFrame(products_data)
        # De-duplicate on the id itself; row positions must then be 0..n-1 so
        # they line up with the rows of the TF-IDF matrix.
        products = products.drop_duplicates(subset='product_id', keep='first')
        products = products.reset_index(drop=True)

        # Text representation of each product (category + name + description).
        products['content'] = (
            self._clean_text(products['category']) + ' ' +
            self._clean_text(products['name']) + ' ' +
            self._clean_text(products['description'])
        )

        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=5000,   # cap the vocabulary size
            ngram_range=(1, 2),  # use both unigrams and bigrams
        )
        matrix = vectorizer.fit_transform(products['content'])

        self.products_df = products
        self.tfidf_vectorizer = vectorizer
        self.tfidf_matrix = matrix
        self.product_indices = pd.Series(products.index, index=products['product_id'])

    def track_user_click(self, user_id, product_id):
        """
        Track a user click on a product to build the user's profile.

        A profile is created for user_id on first use. Clicks on a product_id
        that is not in the catalogue are ignored. The product's category
        counter is incremented even when the category is not one of
        DEFAULT_CATEGORIES.

        Raises RuntimeError when no products have been loaded.
        """
        self._require_products()

        profile = self.user_profiles.get(user_id)
        if profile is None:
            profile = {
                'clicks': {},
                'category_weights': dict.fromkeys(self.DEFAULT_CATEGORIES, 0),
            }
            self.user_profiles[user_id] = profile

        position = self._product_position(product_id)
        if position is None:
            return

        category = self.products_df['category'].iat[position]

        clicks = profile['clicks']
        clicks[product_id] = clicks.get(product_id, 0) + 1

        # .get() so categories outside the seeded set do not raise KeyError.
        category_weights = profile['category_weights']
        category_weights[category] = category_weights.get(category, 0) + 1

    def get_user_profile_vector(self, user_id):
        """
        Generate a user profile vector based on the user's click history.

        Returns a dense array of shape (1, n_features): the weighted average
        of the TF-IDF rows of the clicked products, where each weight is
        click_count * (1 + CATEGORY_BOOST * clicks in that product's category).
        A zero vector is returned when the user is unknown, has no clicks, or
        none of the clicked products is in the current catalogue.

        Raises RuntimeError when no products have been loaded.
        """
        self._require_products()
        n_features = self.tfidf_matrix.shape[1]

        profile = self.user_profiles.get(user_id)
        if not profile or not profile['clicks']:
            return np.zeros((1, n_features))

        category_weights = profile['category_weights']
        categories = self.products_df['category']

        positions = []
        weights = []
        for product_id, click_count in profile['clicks'].items():
            position = self._product_position(product_id)
            if position is None:
                # Clicked product is no longer part of the catalogue.
                continue
            category_weight = category_weights.get(categories.iat[position], 0)
            positions.append(position)
            weights.append(click_count * (1 + self.CATEGORY_BOOST * category_weight))

        if not positions:
            return np.zeros((1, n_features))

        weights = np.asarray(weights, dtype=float)
        weights /= weights.sum()

        # Weighted sum of the clicked rows as one sparse product instead of
        # densifying each row in a Python loop.
        blended = self.tfidf_matrix[positions].T.dot(weights)
        return np.asarray(blended, dtype=float).reshape(1, n_features)

    def recommend_products(self, user_id, n=5, category_filter=None):
        """
        Recommend products to a user based on their profile.

        user_id: ID of the user
        n: Number of recommendations to return (n <= 0 yields an empty list)
        category_filter: Optional filter to limit recommendations to a specific category

        Returns a list of at most n product IDs the user has not clicked,
        ordered by decreasing cosine similarity to the user's profile; ties
        keep catalogue order. Users whose profile vector is all zeros get the
        result of _get_popular_products() instead.

        Raises RuntimeError when no products have been loaded.
        """
        self._require_products()
        if n <= 0:
            return []

        user_profile = self.get_user_profile_vector(user_id)
        if not user_profile.any():
            return self._get_popular_products(n, category_filter)

        scores = cosine_similarity(user_profile, self.tfidf_matrix).ravel()
        candidates = pd.DataFrame({
            'product_id': self.products_df['product_id'],
            'score': scores,
            'category': self.products_df['category'],
        })

        # Never recommend something the user already clicked.
        clicked = self.user_profiles.get(user_id, {}).get('clicks', {})
        keep = ~candidates['product_id'].isin(list(clicked))
        if category_filter:
            keep &= candidates['category'] == category_filter

        # Stable sort keeps the order of tied scores deterministic.
        ranked = candidates[keep].sort_values('score', ascending=False, kind='mergesort')
        return ranked['product_id'].head(n).tolist()

    def _get_popular_products(self, n=5, category_filter=None):
        """
        Return fallback products when a user has no usable profile.

        No popularity metric exists yet: up to n product IDs are sampled at
        random, optionally restricted to category_filter. When fewer than n
        products match, all of them are returned in catalogue order.

        Raises RuntimeError when no products have been loaded.
        """
        self._require_products()
        if n <= 0:
            return []

        pool = self.products_df
        if category_filter:
            pool = pool[pool['category'] == category_filter]

        if len(pool) >= n:
            pool = pool.sample(n)
        return pool['product_id'].tolist()

    def recommend_by_category_preference(self, user_id, n=5):
        """
        Recommend products based primarily on the user's category preferences.

        Slots are shared between the clicked categories in proportion to their
        click counts (at least one slot each while slots remain), most-clicked
        category first. Slots left over are filled with the best general
        recommendations that are not already in the list. Users without any
        category clicks get the result of _get_popular_products().

        Returns a list of at most n product IDs (empty when n <= 0).
        """
        if n <= 0:
            return []

        profile = self.user_profiles.get(user_id)
        category_weights = profile['category_weights'] if profile else {}
        total_weight = sum(category_weights.values())
        if total_weight == 0:
            return self._get_popular_products(n)

        ranked_categories = sorted(
            ((category, weight) for category, weight in category_weights.items() if weight > 0),
            key=lambda item: item[1],
            reverse=True,
        )

        recommendations = []
        for category, weight in ranked_categories:
            remaining = n - len(recommendations)
            if remaining <= 0:
                break
            # Share is taken from n (not from the shrinking remainder) so the
            # split is actually proportional to the category weights.
            share = max(1, int(n * weight / total_weight))
            allocation = min(share, remaining)
            recommendations.extend(self.recommend_products(user_id, allocation, category))

        remaining = n - len(recommendations)
        if remaining > 0:
            # Ask for n items so that `remaining` of them are still left after
            # dropping those already recommended.
            already_chosen = set(recommendations)
            general = self.recommend_products(user_id, n)
            fresh = [pid for pid in general if pid not in already_chosen]
            recommendations.extend(fresh[:remaining])

        return recommendations
|
|
|
|
|
|
# Module-level Recommender instance created at import time; it starts empty
# (no products loaded, no user profiles) until load_products() is called on it.
exported = Recommender()
|