Cosine Sim Calc now Working properly

2025-04-03 11:59:25 -06:00
parent e7580c36f5
commit 99f12319d5
5 changed files with 152 additions and 887 deletions
--- a/recommondation-engine/example1.py
+++ b/recommondation-engine/example1.py
@@ -1,221 +1,113 @@
+# pip install mysql.connector

-import pandas as pd
-import numpy as np
+import mysql.connector
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import logging

-'''
-Recommender system using content-based filtering
-'''
-class Recommender:
-    def __init__(self):
-        # Initialize data structures
-        self.products_df = None
-        self.user_profiles = {}
-        self.tfidf_matrix = None
-        self.tfidf_vectorizer = None
-        self.product_indices = None
-        
-    def load_products(self, products_data):
+def database():
+    db_connection = mysql.connector.connect(
+        host = "localhost",
+        port = "3306",
+        user = "root",
+        database = "Marketplace"
+    )
+    return db_connection
+
+
+def get_all_products():
+
+    db_con = database()
+    cursor = db_con.cursor()
+
+    cursor.execute("SELECT CategoryID FROM Category")
+    categories = cursor.fetchall()
+
+    select_clause = "SELECT p.ProductID"
+    for category in categories:
+        category_id = category[0]
+        select_clause += f", MAX(CASE WHEN pc.CategoryID = {category_id} THEN 1 ELSE 0 END) AS `Cat_{category_id}`"
+
+    final_query = f"""
+        {select_clause}
+        FROM Product p
+        LEFT JOIN Product_Category pc ON p.ProductID = pc.ProductID
+        LEFT JOIN Category c ON pc.CategoryID = c.CategoryID
+        GROUP BY p.ProductID;
        """
-        Load product data into the recommender system
-        
-        products_data: list of dictionaries with product info (id, name, description, category, etc.)
+
+    cursor.execute(final_query)
+    results = cursor.fetchall()
+
+    final = []
+    for row in results:
+        text_list = list(row)
+        text_list.pop(0)
+        final.append(text_list)
+
+    cursor.close()
+    db_con.close()
+    return final
+
+def get_user_history(user_id):
+    db_con = database()
+    cursor = db_con.cursor()
+
+    cursor.execute("SELECT CategoryID FROM Category")
+    categories = cursor.fetchall()
+
+    select_clause = "SELECT p.ProductID"
+    for category in categories:
+        category_id = category[0] # get the uid of the catefory and then append that to the new column
+        select_clause += f", MAX(CASE WHEN pc.CategoryID = {category_id} THEN 1 ELSE 0 END) AS `Cat_{category_id}`"
+
+    final_query = f"""
+        {select_clause}
+        FROM Product p
+        LEFT JOIN Product_Category pc ON p.ProductID = pc.ProductID
+        LEFT JOIN Category c ON pc.CategoryID = c.CategoryID
+        where p.ProductID in (select ProductID from History where UserID = {user_id})
+        GROUP BY p.ProductID;
        """
-        self.products_df = pd.DataFrame(products_data)
-        
-        # Create a text representation for each product (combining various features)
-        self.products_df['content'] = (
-            self.products_df['category'] + ' ' + 
-            self.products_df['name'] + ' ' + 
-            self.products_df['description']
-        )
-        
-        # Initialize TF-IDF vectorizer to convert text to vectors
-        self.tfidf_vectorizer = TfidfVectorizer(
-            stop_words='english',
-            max_features=5000,  # Limit features to avoid sparse matrices
-            ngram_range=(1, 2)  # Use both unigrams and bigrams
-        )
-        
-        # Compute TF-IDF matrix
-        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.products_df['content'])
-        
-        # Create a mapping from product_id to index
-        self.product_indices = pd.Series(
-            self.products_df.index, 
-            index=self.products_df['product_id']
-        ).drop_duplicates()
-        
-    def track_user_click(self, user_id, product_id):
-        """
-        Track user clicks on products to build user profiles
-        """
-        if user_id not in self.user_profiles:
-            self.user_profiles[user_id] = {
-                'clicks': {},
-                'category_weights': {
-                    'electronics': 0,
-                    'school supplies': 0,
-                    'rental place': 0,
-                    'furniture': 0
-                }
-            }
-        
-        # Get the clicked product's index and details
-        if product_id in self.product_indices:
-            product_idx = self.product_indices[product_id]
-            product_category = self.products_df.iloc[product_idx]['category']
-            
-            # Update click count
-            if product_id in self.user_profiles[user_id]['clicks']:
-                self.user_profiles[user_id]['clicks'][product_id] += 1
-            else:
-                self.user_profiles[user_id]['clicks'][product_id] = 1
-            
-            # Update category weight
-            self.user_profiles[user_id]['category_weights'][product_category] += 1
-    
-    def get_user_profile_vector(self, user_id):
-        """
-        Generate a user profile vector based on their click history
-        """
-        if user_id not in self.user_profiles or not self.user_profiles[user_id]['clicks']:
-            # Return a zero vector if no click history
-            return np.zeros((1, self.tfidf_matrix.shape[1]))
-        
-        # Create a weighted average of all clicked products' TF-IDF vectors
-        clicked_product_vectors = []
-        weights = []
-        
-        for product_id, click_count in self.user_profiles[user_id]['clicks'].items():
-            if product_id in self.product_indices:
-                product_idx = self.product_indices[product_id]
-                product_category = self.products_df.iloc[product_idx]['category']
-                category_weight = self.user_profiles[user_id]['category_weights'][product_category]
-                
-                # Weight is based on both click count and category preference
-                weight = click_count * (1 + 0.5 * category_weight)
-                weights.append(weight)
-                clicked_product_vectors.append(self.tfidf_matrix[product_idx])
-        
-        # Normalize weights
-        weights = np.array(weights) / np.sum(weights)
-        
-        # Compute weighted average
-        user_profile = np.zeros((1, self.tfidf_matrix.shape[1]))
-        for i, vector in enumerate(clicked_product_vectors):
-            user_profile += weights[i] * vector.toarray()
-            
-        return user_profile
-    
-    def recommend_products(self, user_id, n=5, category_filter=None):
-        """
-        Recommend products to a user based on their profile
-        
-        user_id: ID of the user
-        n: Number of recommendations to return
-        category_filter: Optional filter to limit recommendations to a specific category
-        """
-        # Get user profile vector
-        user_profile = self.get_user_profile_vector(user_id)
-        
-        # If user has no profile, recommend popular products (not implemented)
-        if np.sum(user_profile) == 0:
-            return self._get_popular_products(n, category_filter)
-        
-        # Calculate similarity scores
-        sim_scores = cosine_similarity(user_profile, self.tfidf_matrix)
-        sim_scores = sim_scores.flatten()
-        
-        # Create a DataFrame for easier filtering
-        recommendations_df = pd.DataFrame({
-            'product_id': self.products_df['product_id'],
-            'score': sim_scores,
-            'category': self.products_df['category']
-        })
-        
-        # Filter out products that the user has already clicked on
-        if user_id in self.user_profiles and self.user_profiles[user_id]['clicks']:
-            clicked_products = list(self.user_profiles[user_id]['clicks'].keys())
-            recommendations_df = recommendations_df[~recommendations_df['product_id'].isin(clicked_products)]
-        
-        # Apply category filter if provided
-        if category_filter:
-            recommendations_df = recommendations_df[recommendations_df['category'] == category_filter]
-        
-        # Sort by similarity score and get top n recommendations
-        recommendations_df = recommendations_df.sort_values('score', ascending=False).head(n)
-        
+
+    cursor.execute(final_query)
+    results = cursor.fetchall()
+    final = []
+    for row in results:
+        text_list = list(row)
+        text_list.pop(0)
+        final.append(text_list)
+
+    cursor.close()
+    db_con.close()
+    return final
+
+
+def get_recommendations(user_id, top_n=40):
+    try:
+        # Get all products and user history with their category vectors
+        all_products = get_all_products()
+        user_history = get_user_history(user_id)
+
+        # if not user_history:
+        #     # Cold start: return popular products
+        #     return get_popular_products(top_n)
+
+        # Calculate similarity between all products and user history
+        user_profile = np.mean(user_history, axis=0)  # Average user preferences
+        similarities = cosine_similarity([user_profile], all_products)
+
+        # finds the indices of the top N products that have the highest
+        # cosine similarity with the user's profile and sorted from most similar to least similar.
+        product_indices = similarities[0].argsort()[-top_n:][::-1]
+        print("product", product_indices)
+
        # Return recommended product IDs
-        return recommendations_df['product_id'].tolist()
-    
-    def _get_popular_products(self, n=5, category_filter=None):
-        """
-        Return popular products when a user has no profile
-        (This would typically be implemented with actual popularity metrics)
-        """
-        filtered_df = self.products_df
-        
-        if category_filter:
-            filtered_df = filtered_df[filtered_df['category'] == category_filter]
-        
-        # Just return random products for now (in a real system you'd use popularity metrics)
-        if len(filtered_df) >= n:
-            return filtered_df.sample(n)['product_id'].tolist()
-        else:
-            return filtered_df['product_id'].tolist()
-
-    def recommend_by_category_preference(self, user_id, n=5):
-        """
-        Recommend products based primarily on the user's category preferences
-        """
-        if user_id not in self.user_profiles:
-            return self._get_popular_products(n)
-        
-        # Get the user's most clicked category
-        category_weights = self.user_profiles[user_id]['category_weights']
-        
-        # If no category has been clicked, return popular products
-        if sum(category_weights.values()) == 0:
-            return self._get_popular_products(n)
-        
-        # Sort categories by number of clicks
-        sorted_categories = sorted(
-            category_weights.items(),
-            key=lambda x: x[1],
-            reverse=True
-        )
-        
-        recommendations = []
-        remaining = n
-        
-        # Allocate recommendations proportionally across categories
-        for category, weight in sorted_categories:
-            if weight > 0:
-                # Allocate recommendations proportionally to category weight
-                category_allocation = max(1, int(remaining * (weight / sum(category_weights.values()))))
-                if category_allocation > remaining:
-                    category_allocation = remaining
-                
-                # Get recommendations for this category
-                category_recs = self.recommend_products(user_id, category_allocation, category)
-                recommendations.extend(category_recs)
-                
-                # Update remaining slots
-                remaining -= len(category_recs)
-                
-                if remaining <= 0:
-                    break
-        
-        # If we still have slots to fill, add general recommendations
-        if remaining > 0:
-            general_recs = self.recommend_products(user_id, remaining)
-            # Filter out duplicates
-            general_recs = [rec for rec in general_recs if rec not in recommendations]
-            recommendations.extend(general_recs[:remaining])
-        
-        return recommendations
+        return [all_products[i][0] for i in product_indices]  # Product IDs
+    except Exception as e:
+        logging.error(f"Recommendation error for user {user_id}: {str(e)}")
+        # return get_popular_products(top_n)  # Fallback to popular products


-exported = Recommender()
+get_recommendations(1)